Coverage for astrocyte/pipeline/token

1"""Token-budgeted recall packing (M35).

3Replaces item-count `top_k` caps with token-count `max_tokens` caps on the

4recall output. Matches Hindsight's `max_tokens=8192` default (see

5``hindsight-api-slim/hindsight_api/engine/memory_engine.py`` and their

6benchmark runner).

8Why tokens, not items

9---------------------

11The LLM answerer consumes a fixed-size token window, not a fixed number of

12items. Item-count caps (our pre-M35 `top_k`) over-count short facts and

13under-count long sections, leading to either wasted context (lots of short

14hits) or truncated context (one long fact eats most of the window).

16The v015l bench made this concrete: with `final_top_n=50`, LME dropped to

1763.3% top_50 because the rank-21-50 candidates added noise. The same

18retrieval at `final_top_n=20` (`top_20`) achieved 74.4% — the bigger pool

19of items diluted answerer attention. A token-budget cap behaves

20proportionally: short, dense facts pack more in; long sections fill the

21budget faster.

23Tokenizer

24---------

26Uses ``tiktoken``'s ``cl100k_base`` encoding by default — the tokenizer

27shared by ``gpt-4o``, ``gpt-4o-mini``, ``gpt-4-turbo``, and

28``text-embedding-3-*``. This matches what our bench answerer (``gpt-4o-mini``)

29actually uses, so the budget is measured in the units that matter.

31Tiktoken's first-load cost is ~50ms; the encoding is then cached for the

32process lifetime. Subsequent calls are sub-millisecond per text.

34References:

36- Hindsight: ``hindsight_api/engine/memory_engine.py`` uses

37 ``_get_tiktoken_encoding()`` for the same purpose.

38- OpenAI tiktoken: https://github.com/openai/tiktoken

39"""

41from __future__ import annotations

43from threading import Lock

44from typing import Iterable, TypeVar

46#: Default encoding shared by all current OpenAI models we care about.

47#: Override per-call if a non-OpenAI model is being targeted.

48_DEFAULT_ENCODING_NAME: str = "cl100k_base"

50#: Lazy-loaded encoding handle. ``tiktoken.get_encoding`` does a small

51#: amount of work on first call (~50ms) and caches internally, but we add

52#: our own module-level cache so the lookup is sub-microsecond after the

53#: first ``count_tokens`` call.

54_encoding_cache: dict[str, object] = {}

55_encoding_lock = Lock()

58def _get_encoding(encoding_name: str = _DEFAULT_ENCODING_NAME):

59 """Return a cached tiktoken encoding handle.

61 Threadsafe. Lazy-loaded so a process that never calls

62 :func:`count_tokens` doesn't pay the tiktoken import cost.

63 """

64 with _encoding_lock:

65 cached = _encoding_cache.get(encoding_name)

66 if cached is None:

67 import tiktoken # noqa: PLC0415

69 cached = tiktoken.get_encoding(encoding_name)

70 _encoding_cache[encoding_name] = cached

71 return cached

74def count_tokens(text: str, *, encoding_name: str = _DEFAULT_ENCODING_NAME) -> int:

75 """Return the number of tokens in ``text`` per the chosen encoding.

77 Defaults to ``cl100k_base`` (gpt-4o family). Empty / None text counts

78 as zero — callers don't need to guard.

79 """

80 if not text:

81 return 0

82 enc = _get_encoding(encoding_name)

83 return len(enc.encode(text))

86_T = TypeVar("_T")

89def pack_to_budget(

90 items: Iterable[_T],

91 *,

92 max_tokens: int,

93 text_of: callable, # type: ignore[type-arg] # callable[[_T], str]

94 encoding_name: str = _DEFAULT_ENCODING_NAME,

95) -> list[_T]:

96 """Pack items into a token-budgeted output list.

98 Iterates ``items`` in their input order (caller should pre-sort by

99 relevance), tokenizes each via ``text_of(item)``, and stops accepting

100 items once the cumulative token count would exceed ``max_tokens``.

101

102 The first item is always included even if it exceeds the budget on

103 its own — otherwise a single long fact would produce an empty

104 result, which is the worst possible failure mode for recall.

105 Subsequent oversize items are skipped (the budget is a soft cap,

106 not a hard truncation of mid-item text).

107

108 Args:

109 items: Ranked candidate iterable. Order matters — items are

110 packed greedily, so put the most relevant first.

111 max_tokens: Token budget. Must be ``> 0``. When ``<= 0``, returns

112 an empty list (consistent with "no budget = no output").

113 text_of: Callable mapping each item to its token-counted text.

114 For ``PageIndexFact`` this is typically

115 ``lambda f: f.text or ""``.

116 encoding_name: Override the tokenizer (default cl100k_base).

117

118 Returns:

119 List of items whose cumulative ``text_of(item)`` token count is

120 at most ``max_tokens`` (with the always-include-first rule).

121 """

122 if max_tokens <= 0:

123 return []

124

125 out: list[_T] = []

126 used = 0

127 for i, item in enumerate(items):

128 text = text_of(item) or ""

129 cost = count_tokens(text, encoding_name=encoding_name)

130 # First item always in (avoid empty result on oversized single fact).

131 if i == 0:

132 out.append(item)

133 used += cost

134 continue

135 if used + cost > max_tokens:

136 continue

137 out.append(item)

138 used += cost

139 return out

140

141

142__all__ = ["count_tokens", "pack_to_budget"]

Coverage for astrocyte/pipeline/token_budget.py: 100%

38 statements