Coverage for astrocyte/pipeline/token_budget.py: 100%
38 statements
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
1"""Token-budgeted recall packing (M35).
3Replaces item-count `top_k` caps with token-count `max_tokens` caps on the
4recall output. Matches Hindsight's `max_tokens=8192` default (see
5``hindsight-api-slim/hindsight_api/engine/memory_engine.py`` and their
6benchmark runner).
8Why tokens, not items
9---------------------
11The LLM answerer consumes a fixed-size token window, not a fixed number of
12items. Item-count caps (our pre-M35 `top_k`) over-count short facts and
13under-count long sections, leading to either wasted context (lots of short
14hits) or truncated context (one long fact eats most of the window).
16The v015l bench made this concrete: with `final_top_n=50`, LME dropped to
1763.3% top_50 because the rank-21-50 candidates added noise. The same
18retrieval at `final_top_n=20` (`top_20`) achieved 74.4% — the bigger pool
19of items diluted answerer attention. A token-budget cap behaves
20proportionally: short, dense facts pack more in; long sections fill the
21budget faster.
23Tokenizer
24---------
26Uses ``tiktoken``'s ``cl100k_base`` encoding by default — the tokenizer
27shared by ``gpt-4o``, ``gpt-4o-mini``, ``gpt-4-turbo``, and
28``text-embedding-3-*``. This matches what our bench answerer (``gpt-4o-mini``)
29actually uses, so the budget is measured in the units that matter.
31Tiktoken's first-load cost is ~50ms; the encoding is then cached for the
32process lifetime. Subsequent calls are sub-millisecond per text.
34References:
36- Hindsight: ``hindsight_api/engine/memory_engine.py`` uses
37 ``_get_tiktoken_encoding()`` for the same purpose.
38- OpenAI tiktoken: https://github.com/openai/tiktoken
39"""
41from __future__ import annotations
43from threading import Lock
44from typing import Iterable, TypeVar
46#: Default encoding shared by all current OpenAI models we care about.
47#: Override per-call if a non-OpenAI model is being targeted.
48_DEFAULT_ENCODING_NAME: str = "cl100k_base"
50#: Lazy-loaded encoding handle. ``tiktoken.get_encoding`` does a small
51#: amount of work on first call (~50ms) and caches internally, but we add
52#: our own module-level cache so the lookup is sub-microsecond after the
53#: first ``count_tokens`` call.
54_encoding_cache: dict[str, object] = {}
55_encoding_lock = Lock()
58def _get_encoding(encoding_name: str = _DEFAULT_ENCODING_NAME):
59 """Return a cached tiktoken encoding handle.
61 Threadsafe. Lazy-loaded so a process that never calls
62 :func:`count_tokens` doesn't pay the tiktoken import cost.
63 """
64 with _encoding_lock:
65 cached = _encoding_cache.get(encoding_name)
66 if cached is None:
67 import tiktoken # noqa: PLC0415
69 cached = tiktoken.get_encoding(encoding_name)
70 _encoding_cache[encoding_name] = cached
71 return cached
74def count_tokens(text: str, *, encoding_name: str = _DEFAULT_ENCODING_NAME) -> int:
75 """Return the number of tokens in ``text`` per the chosen encoding.
77 Defaults to ``cl100k_base`` (gpt-4o family). Empty / None text counts
78 as zero — callers don't need to guard.
79 """
80 if not text:
81 return 0
82 enc = _get_encoding(encoding_name)
83 return len(enc.encode(text))
86_T = TypeVar("_T")
89def pack_to_budget(
90 items: Iterable[_T],
91 *,
92 max_tokens: int,
93 text_of: callable, # type: ignore[type-arg] # callable[[_T], str]
94 encoding_name: str = _DEFAULT_ENCODING_NAME,
95) -> list[_T]:
96 """Pack items into a token-budgeted output list.
98 Iterates ``items`` in their input order (caller should pre-sort by
99 relevance), tokenizes each via ``text_of(item)``, and stops accepting
100 items once the cumulative token count would exceed ``max_tokens``.
102 The first item is always included even if it exceeds the budget on
103 its own — otherwise a single long fact would produce an empty
104 result, which is the worst possible failure mode for recall.
105 Subsequent oversize items are skipped (the budget is a soft cap,
106 not a hard truncation of mid-item text).
108 Args:
109 items: Ranked candidate iterable. Order matters — items are
110 packed greedily, so put the most relevant first.
111 max_tokens: Token budget. Must be ``> 0``. When ``<= 0``, returns
112 an empty list (consistent with "no budget = no output").
113 text_of: Callable mapping each item to its token-counted text.
114 For ``PageIndexFact`` this is typically
115 ``lambda f: f.text or ""``.
116 encoding_name: Override the tokenizer (default cl100k_base).
118 Returns:
119 List of items whose cumulative ``text_of(item)`` token count is
120 at most ``max_tokens`` (with the always-include-first rule).
121 """
122 if max_tokens <= 0:
123 return []
125 out: list[_T] = []
126 used = 0
127 for i, item in enumerate(items):
128 text = text_of(item) or ""
129 cost = count_tokens(text, encoding_name=encoding_name)
130 # First item always in (avoid empty result on oversized single fact).
131 if i == 0:
132 out.append(item)
133 used += cost
134 continue
135 if used + cost > max_tokens:
136 continue
137 out.append(item)
138 used += cost
139 return out
142__all__ = ["count_tokens", "pack_to_budget"]