Coverage for astrocyte/pipeline/token_budget.py: 100%

38 statements  

« prev     ^ index     » next       coverage.py v7.15.0, created at 2026-07-04 05:24 +0000

1"""Token-budgeted recall packing (M35). 

2 

3Replaces item-count `top_k` caps with token-count `max_tokens` caps on the 

4recall output. Matches Hindsight's `max_tokens=8192` default (see 

5``hindsight-api-slim/hindsight_api/engine/memory_engine.py`` and their 

6benchmark runner). 

7 

8Why tokens, not items 

9--------------------- 

10 

11The LLM answerer consumes a fixed-size token window, not a fixed number of 

12items. Item-count caps (our pre-M35 `top_k`) over-count short facts and 

13under-count long sections, leading to either wasted context (lots of short 

14hits) or truncated context (one long fact eats most of the window). 

15 

16The v015l bench made this concrete: with `final_top_n=50`, LME dropped to 

1763.3% top_50 because the rank-21-50 candidates added noise. The same 

18retrieval at `final_top_n=20` (`top_20`) achieved 74.4% — the bigger pool 

19of items diluted answerer attention. A token-budget cap behaves 

20proportionally: short, dense facts pack more in; long sections fill the 

21budget faster. 

22 

23Tokenizer 

24--------- 

25 

26Uses ``tiktoken``'s ``cl100k_base`` encoding by default — the tokenizer 

27shared by ``gpt-4o``, ``gpt-4o-mini``, ``gpt-4-turbo``, and 

28``text-embedding-3-*``. This matches what our bench answerer (``gpt-4o-mini``) 

29actually uses, so the budget is measured in the units that matter. 

30 

31Tiktoken's first-load cost is ~50ms; the encoding is then cached for the 

32process lifetime. Subsequent calls are sub-millisecond per text. 

33 

34References: 

35 

36- Hindsight: ``hindsight_api/engine/memory_engine.py`` uses 

37 ``_get_tiktoken_encoding()`` for the same purpose. 

38- OpenAI tiktoken: https://github.com/openai/tiktoken 

39""" 

40 

41from __future__ import annotations 

42 

43from threading import Lock 

44from typing import Iterable, TypeVar 

45 

46#: Default encoding shared by all current OpenAI models we care about. 

47#: Override per-call if a non-OpenAI model is being targeted. 

48_DEFAULT_ENCODING_NAME: str = "cl100k_base" 

49 

50#: Lazy-loaded encoding handle. ``tiktoken.get_encoding`` does a small 

51#: amount of work on first call (~50ms) and caches internally, but we add 

52#: our own module-level cache so the lookup is sub-microsecond after the 

53#: first ``count_tokens`` call. 

54_encoding_cache: dict[str, object] = {} 

55_encoding_lock = Lock() 

56 

57 

58def _get_encoding(encoding_name: str = _DEFAULT_ENCODING_NAME): 

59 """Return a cached tiktoken encoding handle. 

60 

61 Threadsafe. Lazy-loaded so a process that never calls 

62 :func:`count_tokens` doesn't pay the tiktoken import cost. 

63 """ 

64 with _encoding_lock: 

65 cached = _encoding_cache.get(encoding_name) 

66 if cached is None: 

67 import tiktoken # noqa: PLC0415 

68 

69 cached = tiktoken.get_encoding(encoding_name) 

70 _encoding_cache[encoding_name] = cached 

71 return cached 

72 

73 

74def count_tokens(text: str, *, encoding_name: str = _DEFAULT_ENCODING_NAME) -> int: 

75 """Return the number of tokens in ``text`` per the chosen encoding. 

76 

77 Defaults to ``cl100k_base`` (gpt-4o family). Empty / None text counts 

78 as zero — callers don't need to guard. 

79 """ 

80 if not text: 

81 return 0 

82 enc = _get_encoding(encoding_name) 

83 return len(enc.encode(text)) 

84 

85 

86_T = TypeVar("_T") 

87 

88 

89def pack_to_budget( 

90 items: Iterable[_T], 

91 *, 

92 max_tokens: int, 

93 text_of: callable, # type: ignore[type-arg] # callable[[_T], str] 

94 encoding_name: str = _DEFAULT_ENCODING_NAME, 

95) -> list[_T]: 

96 """Pack items into a token-budgeted output list. 

97 

98 Iterates ``items`` in their input order (caller should pre-sort by 

99 relevance), tokenizes each via ``text_of(item)``, and stops accepting 

100 items once the cumulative token count would exceed ``max_tokens``. 

101 

102 The first item is always included even if it exceeds the budget on 

103 its own — otherwise a single long fact would produce an empty 

104 result, which is the worst possible failure mode for recall. 

105 Subsequent oversize items are skipped (the budget is a soft cap, 

106 not a hard truncation of mid-item text). 

107 

108 Args: 

109 items: Ranked candidate iterable. Order matters — items are 

110 packed greedily, so put the most relevant first. 

111 max_tokens: Token budget. Must be ``> 0``. When ``<= 0``, returns 

112 an empty list (consistent with "no budget = no output"). 

113 text_of: Callable mapping each item to its token-counted text. 

114 For ``PageIndexFact`` this is typically 

115 ``lambda f: f.text or ""``. 

116 encoding_name: Override the tokenizer (default cl100k_base). 

117 

118 Returns: 

119 List of items whose cumulative ``text_of(item)`` token count is 

120 at most ``max_tokens`` (with the always-include-first rule). 

121 """ 

122 if max_tokens <= 0: 

123 return [] 

124 

125 out: list[_T] = [] 

126 used = 0 

127 for i, item in enumerate(items): 

128 text = text_of(item) or "" 

129 cost = count_tokens(text, encoding_name=encoding_name) 

130 # First item always in (avoid empty result on oversized single fact). 

131 if i == 0: 

132 out.append(item) 

133 used += cost 

134 continue 

135 if used + cost > max_tokens: 

136 continue 

137 out.append(item) 

138 used += cost 

139 return out 

140 

141 

142__all__ = ["count_tokens", "pack_to_budget"]