Coverage for astrocyte/pipeline/agentic_reflect.py: 74%
228 statements
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
1"""Agentic reflect loop (Hindsight parity, native function-calling).
3Closely mirrors Hindsight's ``engine/reflect/agent.py``:
5- **Native function calling** (OpenAI/Anthropic ``tools=`` API), not a
6 JSON-in-prose protocol. Provider adapters return parsed
7 :class:`ToolCall` instances; the loop dispatches by tool name.
8- **Hierarchical tools** in priority order:
9 1. ``recall`` — raw fact retrieval (always available).
10 2. ``search_observations`` — consolidated knowledge layer (when
11 observations are configured for the bank).
12 3. ``expand`` — fetch source memories cited by a compiled fact /
13 wiki page (their "include_chunks" semantics on demand).
14 4. ``done`` — terminal action; the model commits an answer with
15 ``cited_ids``.
16- **Default 10 iterations** (Hindsight default).
17- **Tool-name normalization** for misbehaving LLMs that emit prefixed
18 or decorated tool names (``call=functions.done``,
19 ``done<|channel|>commentary``, ``functions.recall``).
20- **Token tracking** via ``tiktoken`` when available; degrades to None
21 totals when the package isn't installed.
22- **Citation validation** — ``cited_ids`` not seen in the running
23 evidence pool are stripped from the response's ``sources`` list.
24- **Feature detection** — when the provider doesn't support
25 ``tools`` (legacy ``MockLLMProvider`` paths), the loop falls
26 through to forced single-shot synthesis.
28The loop ALWAYS round-trips tool results back to the model as
29``role="tool"`` messages so the next iteration sees its prior tool's
30output, matching OpenAI's tool-use protocol.
31"""
33from __future__ import annotations
35import json
36import logging
37from dataclasses import dataclass, field
38from typing import Any, Awaitable, Callable
40from astrocyte.types import (
41 MemoryHit,
42 Message,
43 ReflectResult,
44 ToolDefinition,
45)
47_logger = logging.getLogger("astrocyte.agentic_reflect")
50# ---------------------------------------------------------------------------
51# Loop configuration
52# ---------------------------------------------------------------------------
@dataclass
class AgenticReflectParams:
    """Tunable knobs for the reflect loop.

    Defaults match Hindsight where stated:
    - ``max_iterations=10`` matches their ``DEFAULT_MAX_ITERATIONS``.
    - ``recall_step_max_results=10`` matches the bench's outer recall.
    - ``observations_step_max_results=5`` matches their default.
    - ``expand_step_max_sources=5`` matches their default.
    - ``max_evidence_pool_size=30`` is our cap; their token-budget cap
      is computed dynamically from ``max_tokens`` instead.
    """

    #: Upper bound on loop turns; each turn issues one
    #: ``llm_provider.complete`` call.
    max_iterations: int = 10
    #: Result limit for the ``recall`` tool. NOTE(review): presumably the
    #: value used when the model omits ``max_results`` — confirm in the
    #: tool-dispatch code.
    recall_step_max_results: int = 10
    #: Result limit for the ``search_observations`` tool (same caveat as
    #: above — confirm how it combines with a model-supplied value).
    observations_step_max_results: int = 5
    #: Source-memory limit for the ``expand`` tool (same caveat as above).
    expand_step_max_sources: int = 5
    #: Cap on the running evidence pool; the initial hits handed to the
    #: loop are truncated to this many entries before the first turn.
    max_evidence_pool_size: int = 30
    #: When the provider lacks tool-calling support OR doesn't emit any
    #: tool call on a turn, the loop falls through to forced synthesis
    #: rather than thrashing. Set ``False`` only for diagnostic runs.
    fallback_on_no_tool_call: bool = True
    #: Append adversarial-defense rules to the system prompt
    #: (premise check, negative-existence handling, time-shift trap,
    #: explicit "insufficient evidence is always a valid answer").
    #: Targets the LoCoMo adversarial category.
    adversarial_defense: bool = False
84# ---------------------------------------------------------------------------
85# Tool schemas (OpenAI-compatible JSON Schema)
86# ---------------------------------------------------------------------------
# Schema for the ``search_mental_models`` tool: a free-text ``query`` plus an
# optional ``scope`` filter (``reason`` and ``query`` are required).
# ``agentic_reflect`` only offers this tool when a ``mental_models_fn`` is
# supplied, and lists it first so it leads the priority order.
_TOOL_SEARCH_MENTAL_MODELS = ToolDefinition(
    name="search_mental_models",
    description=(
        "Search the bank's curated mental models — durable, refreshable summaries "
        "of stable preferences / personas / concepts (e.g. 'Alice prefers async "
        "communication' or 'Project X status: blocked on review'). Highest-quality "
        "tier when the question is about a stable summary or persona; cite the "
        "model_id like any other memory_id. If `search_mental_models` returns "
        "nothing relevant, fall through to `search_observations` and `recall`."
    ),
    parameters={
        "type": "object",
        "properties": {
            "reason": {
                "type": "string",
                "description": "Brief explanation of why a mental-model lookup fits the question.",
            },
            "query": {
                "type": "string",
                "description": "Free-text query — used for ranking when the store doesn't already filter by scope.",
            },
            "scope": {
                "type": "string",
                "description": "Optional scope filter (e.g. 'bank' for bank-wide models, 'person:alice' for a person-specific scope). Omit to list all models.",
            },
        },
        "required": ["reason", "query"],
    },
)
# Schema for the ``recall`` tool (raw-memory retrieval). ``agentic_reflect``
# always includes it in the tool list, regardless of optional capabilities.
# NOTE(review): the "default 10, max 20" limits are only advertised to the
# model here — confirm the dispatch code actually enforces them.
_TOOL_RECALL = ToolDefinition(
    name="recall",
    description=(
        "Retrieve raw memories from the bank by semantic similarity to your query. "
        "Use this for ground-truth facts. Each result has an `id` you must cite "
        "from in the final `done` call."
    ),
    parameters={
        "type": "object",
        "properties": {
            "reason": {
                "type": "string",
                "description": "Brief explanation of why you're calling recall (for debug/tracing).",
            },
            "query": {
                "type": "string",
                "description": "Search query — refine it with names/dates/specific terms when broadening doesn't help.",
            },
            "max_results": {
                "type": "integer",
                "description": "Maximum number of memories to return (default 10, max 20).",
            },
        },
        "required": ["reason", "query"],
    },
)
# Schema for the ``search_observations`` tool (consolidated-knowledge layer).
# ``agentic_reflect`` only offers it when an ``observations_fn`` is supplied.
# Unlike the other tool schemas, ``reason`` and ``query`` carry no per-field
# description here.
_TOOL_SEARCH_OBSERVATIONS = ToolDefinition(
    name="search_observations",
    description=(
        "Search consolidated observations — auto-synthesized knowledge with "
        "freshness info. Higher precision than raw recall for stable persona / "
        "preference questions. Stale observations should be cross-checked with "
        "`recall` before citing."
    ),
    parameters={
        "type": "object",
        "properties": {
            "reason": {"type": "string"},
            "query": {"type": "string"},
            "max_results": {
                "type": "integer",
                "description": "Maximum number of observations (default 5).",
            },
        },
        "required": ["reason", "query"],
    },
)
# Schema for the ``expand`` tool: given the ``memory_id`` of a compiled item,
# fetch the raw source memories underneath it. ``agentic_reflect`` only offers
# it when an ``expand_fn`` is supplied.
_TOOL_EXPAND = ToolDefinition(
    name="expand",
    description=(
        "Fetch the source memories cited by a compiled fact / observation / wiki "
        "page that you've already seen. Pass the `id` of the compiled item; "
        "you'll receive the raw memories underneath, useful when the distilled "
        "fact loses important verbatim context."
    ),
    parameters={
        "type": "object",
        "properties": {
            "reason": {"type": "string"},
            "memory_id": {
                "type": "string",
                "description": "ID of the compiled memory whose sources to expand.",
            },
            "max_sources": {
                "type": "integer",
                "description": "Maximum source memories to fetch (default 5).",
            },
        },
        "required": ["reason", "memory_id"],
    },
)
# Schema for the ``list_entities`` counting tool. ``agentic_reflect`` only
# offers it when a ``list_entities_fn`` is supplied; only ``reason`` is
# required, so the model may omit ``pattern`` and ``limit``.
# NOTE(review): this description promises ``[(entity_name, count), ...]``
# while the system prompt talks about ``total_distinct`` /
# ``section_mentions`` — confirm which shape the tool response really has.
_TOOL_LIST_ENTITIES = ToolDefinition(
    name="list_entities",
    description=(
        "List distinct entity names in the current document with their per-section "
        "mention counts — use this for COUNTING / AGGREGATION questions where you "
        "need the exact count of distinct items mentioned across the conversation. "
        'Pass a substring `pattern` to filter (e.g. `"doctor"`, `"kit"`, '
        '`"trip"`); pass no pattern to see the top-mentioned entities. Returns '
        "`[(entity_name, count), ...]` ordered by count desc. The agent counts the "
        "list — this is the deterministic counting primitive the recall+done loop "
        "lacks. If `list_entities` returns the right list, your answer is just "
        "`len(list)` or a sum of relevant counts."
    ),
    parameters={
        "type": "object",
        "properties": {
            "reason": {
                "type": "string",
                "description": "Why you're listing (e.g. 'count distinct doctors mentioned').",
            },
            "pattern": {
                "type": "string",
                "description": (
                    "Case-insensitive substring filter on entity_name. Use a "
                    "broad term (`'doctor'` not `'Dr. Smith'`). Omit to see the "
                    "top-mentioned entities globally."
                ),
            },
            "limit": {
                "type": "integer",
                "description": "Max entries to return (default 50, max 200).",
            },
        },
        "required": ["reason"],
    },
)
# Schema for the terminal ``done`` tool: the model commits its ``answer``
# together with the ``cited_ids`` backing it (both required).
# ``agentic_reflect`` always appends it as the last entry of the tool list.
_TOOL_DONE = ToolDefinition(
    name="done",
    description=(
        "Commit your final answer. ALL `cited_ids` must be IDs you have seen in "
        "an earlier tool result; you cannot invent IDs. If you cannot answer "
        "with evidence, return an honest 'insufficient evidence' answer."
    ),
    parameters={
        "type": "object",
        "properties": {
            "answer": {
                "type": "string",
                "description": "Final answer to the user's question, evidence-grounded.",
            },
            "cited_ids": {
                "type": "array",
                "items": {"type": "string"},
                "description": "IDs of memories supporting the answer; must come from prior tool results.",
            },
        },
        "required": ["answer", "cited_ids"],
    },
)
255# ---------------------------------------------------------------------------
256# Tool-name normalization (Hindsight-parity)
257# ---------------------------------------------------------------------------
def _normalize_tool_name(name: str) -> str:
    """Normalize tool names from misbehaving LLMs.

    Mirrors Hindsight's :func:`_normalize_tool_name`:
    - ``call=functions.done`` → ``done``
    - ``functions.recall`` → ``recall``
    - ``done<|channel|>commentary`` → ``done``

    Args:
        name: Tool name exactly as emitted by the model.

    Returns:
        The bare tool name with surrounding whitespace, the ``call=`` /
        ``functions.`` prefixes and any ``<|...`` special-token suffix
        removed. Case is preserved.
    """
    # Strip BEFORE prefix matching: a stray leading space or newline would
    # otherwise defeat both ``startswith`` checks and leave the decorated
    # name intact.
    cleaned = name.strip()
    cleaned = cleaned.removeprefix("call=").removeprefix("functions.")
    # Everything from the first special-token marker onwards is channel noise.
    cleaned = cleaned.partition("<|")[0]
    return cleaned.strip()
def _is_done_tool(name: str) -> bool:
    """Return ``True`` when *name* normalizes, case-insensitively, to ``done``."""
    canonical = _normalize_tool_name(name)
    return canonical.lower() == "done"
281# ---------------------------------------------------------------------------
282# System prompt
283# ---------------------------------------------------------------------------
286_SYSTEM_PROMPT = """\
287You are an evidence-gathering memory agent. Answer the user's question using \
288ONLY tool results — never invent facts.
290Available tools (priority order — try the higher-quality tiers first):
291- `search_mental_models`: curated, refreshable summaries of stable \
292preferences / personas / concepts (highest quality — try first when the \
293question asks about a person's stable beliefs / preferences / status).
294- `search_observations`: consolidated knowledge auto-synthesized from raw \
295memories (try second for stable patterns when no mental model fits).
296- `recall`: raw memories for ground-truth facts (always available, fall back \
297here when the higher tiers return nothing relevant).
298- `expand`: fetch source memories under a compiled fact / mental model / \
299wiki page you've seen — useful when the distilled summary loses verbatim \
300context.
301- `list_entities`: list distinct entity names with their mention counts. \
302USE THIS FOR COUNTING / AGGREGATION QUESTIONS ("how many doctors / projects \
303/ items"). Pass a substring `pattern` (e.g. `"doctor"`, `"kit"`, `"trip"`); \
304the tool returns the distinct entities matching it with per-section \
305mention counts. The answer is `total_distinct` (or a sum / a filtered \
306subcount). This is far more reliable than counting from raw text — the \
307LLM is bad at distinct-counting across many chunks.
308- `done`: commit final answer with citations.
310COUNTING DISCIPLINE — when the question is "how many X" or "how much Y":
312A. Reach for `list_entities(pattern=...)` BEFORE `recall`. Entities are \
313extracted in two shapes:
315 1. PROPER NOUNS — "Dr. Patel", "Nordstrom", "MoMA". Use these when \
316 the question names a specific person or place.
318 2. STRUCTURED LABELS — `key:value` strings the extractor emits for \
319 COUNTABLE CATEGORIES. Five key prefixes:
320 - `role:<noun>` — occupational categories (`role:doctor`, \
321 `role:dermatologist`, `role:lawyer`)
322 - `category:<noun>` — countable kinds of things \
323 (`category:model_kit`, `category:plant`, `category:restaurant`, \
324 `category:trip`, `category:bike`)
325 - `event:<noun>` — distinct occurrences \
326 (`event:wedding`, `event:sale`, `event:road_trip`)
327 - `expense:<currency_amount>` — money spent \
328 (`expense:$45`, `expense:$185`)
329 - `prefers:<aspect>=<value>` — stable user preference / taste \
330 (`prefers:camera_brand=Sony`, \
331 `prefers:movie_genre=stand-up_comedy`)
333 STRONGLY PREFER querying labels for category counts. Examples:
334 - "How many doctors?" → `list_entities(pattern="role:doctor")` \
335 OR `list_entities(pattern="role:")` to see all roles.
336 - "How many model kits?" → `list_entities(pattern="category:kit")` \
337 OR `pattern="category:model_kit"`.
338 - "Total spent on bikes?" → `list_entities(pattern="expense:")` and \
339 sum the dollar amounts in the entity names.
340 - "How many trips?" → `list_entities(pattern="category:trip")`.
342B. The returned `total_distinct` is the count of DISTINCT label-or-name \
343strings; `section_mentions` is how many sections each appeared in. For \
344"how many doctors" you want `total_distinct` of `role:doctor` rows \
345(or sum the section_mentions if each role mention = one visit).
347C. After `list_entities`, you may still want one `recall` to verify \
348context — but the COUNT is whatever `list_entities` returned.
350D. If `pattern="category:doctor_visit"` returns nothing, try the role: \
351form (`role:doctor`) and counting role mentions instead. Different docs \
352may have used different label conventions; try ~2 patterns before \
353giving up on labels and falling back to recall+count.
355Core rules:
3561. Gather evidence with the tools above before answering.
3572. Each `cited_ids` entry MUST be an ID you saw in a prior tool result.
3583. Refine your query with specific names / dates / sub-topics when initial \
359recall doesn't surface the answer.
361TOOL-RESULT FORMAT — each hit is a JSON entry with these fields:
362- ``id`` — the memory_id (use for citations)
363- ``text`` — the atomic fact (the searchable summary)
364- ``source_chunk`` (when present) — the raw conversation excerpt the fact was \
365extracted from. **This is your primary source for verbatim wording, implicit \
366cues, and details the extracted fact may have lost.** When a question asks \
367for the user's exact phrasing or the assistant's specific recommendation, \
368prefer the source_chunk's wording over the fact's summary text.
369- ``resolved_temporal`` (when present) — pre-resolved absolute dates for \
370relative phrases (see rule 9 below).
372EXTRACTION DISCIPLINE — read this carefully before saying "insufficient evidence":
3744. When tool results CONTAIN the answer (verbatim or near-verbatim, even \
375embedded in larger text), you MUST extract the specific fact. Examples:
376 - Q: "What's James's favorite game?" — recall has "James: my favourite \
377game is Apex Legends" → answer "Apex Legends" and cite the hit.
378 - Q: "What's Joanna's favorite movie?" — recall has "Joanna: I love \
379Eternal Sunshine of the Spotless Mind" → answer with the title.
380 - Q: "What's Jon's favorite style of dance?" — recall has "Jon: \
381contemporary feels right to me" → answer "contemporary".
3835. Wiki / persona pages in recall ARE evidence. If a person's wiki page is \
384returned but doesn't directly list the specific fact, call `expand` on the \
385page id OR run another `recall` with the specific term (e.g. "Joanna favorite \
386movie", "Jon dance style") BEFORE concluding insufficient evidence.
3886. "Insufficient evidence" is ONLY valid when:
389 - No retrieved memory mentions the queried fact at all, OR
390 - The question presupposes something not attested in memory (e.g. \
391"Why did X quit Y?" when X never worked at Y), OR
392 - You've spent the recall budget refining and still nothing matches.
3947. NOT valid reasons to say insufficient evidence:
395 - "The wording in evidence differs from the question's wording"
396 - "I see partial information but not the exact phrasing"
397 - "I'd need more context"
398 - "Multiple options are mentioned, I can't pick one" — pick the most \
399specific match and cite it.
4018. When in doubt between abstaining and extracting: extract, cite the hit, \
402and let the answer be evidence-grounded. A confident extraction beats an \
403overcautious abstain.
405TEMPORAL RESOLUTION — read this when the question asks "when did X happen?":
4079. If a memory in the tool result has a `resolved_temporal` field, USE THE \
408DATES FROM THAT FIELD VERBATIM. Each entry maps a relative phrase \
409(`"yesterday"`, `"last week"`) to the canonical `resolved_date` computed \
410at retain time from the session's authoritative timestamp. DO NOT redo the \
411arithmetic from the raw text — you will get off-by-one errors when the \
412session anchor differs from your assumed `now()`.
414 Example: memory text "Caroline: yesterday I went to the support group" \
415with `resolved_temporal.items = [{"phrase": "yesterday", "resolved_date": \
416"2023-05-07", "granularity": "day"}]` → answer "May 7, 2023". Do not \
417recompute "yesterday" against any other reference date.
41910. Only fall back to raw-text arithmetic when `resolved_temporal` is \
420absent. If both an absolute date in the text AND a `resolved_temporal` \
421entry exist, prefer the absolute date — `resolved_temporal` is for \
422relative phrases only.
424STOP-EARLY DISCIPLINE — `done` is your default, not your last resort:
426The loop has a fixed iteration budget. Empirically, agents that run out \
427the clock produce WORSE answers than agents that commit early on \
428sufficient evidence. After EVERY tool result, ask yourself:
430 "Do I have ≥2 retrieved memories that DIRECTLY contain the answer \
431 (or its components)?"
433If YES → call `done` IMMEDIATELY. Do not issue more `recall`/`expand`/etc. \
434calls "just to be safe" — additional tool calls dilute your synthesis \
435focus and rarely add new information once you have the answer in hand.
437If NO → issue ONE more refining tool call (different query terms, \
438different tool tier, or `expand` on a partial hit). Then re-check the \
439stop condition.
441The ONLY reasons to take more than 2 tool calls:
442- Multi-hop questions where you have evidence for half the bridge but \
443 not the other half (e.g. you found "user works at X" but need "X was \
444 acquired by Y" to answer "who is the user's parent company now").
445- Counting / aggregation questions where `list_entities` returned \
446 ambiguous patterns and you need to disambiguate.
447- Knowledge-update questions where you found a value but not its date, \
448 and need one more recall to confirm "is this the latest?"
450In all OTHER cases, 2 successful tool calls is enough — commit. The \
451`done` tool is your DEFAULT exit, not a fallback. Aim to call `done` \
452within iterations 2-3 of typical questions.
454QUESTION-TYPE ROUTING — read this BEFORE calling `done`:
456Classify the question into ONE of the four shapes below and apply that \
457block's rules when composing your final answer. (Ported from M19a's \
458per-Q-type answerer prompt — the same routing that shipped +12q on \
459multi-hop and +2.25pp overall in the recall+answerer pipeline. See \
460``docs/_design/m19-prompt-routing.md`` and \
461``scripts/mem0_harness/_hindsight_prompt.py``.)
463**For Recommendation / Preference Questions** ("can you recommend...", \
464"any tips...", "what would the user prefer..."):
465- DO NOT invent specific recommendations (no made-up product names, course \
466 titles, paper titles, channel names).
467- DO mention specific brands / products / providers the user ALREADY uses \
468 from the recalled evidence — by name, in the recommendations themselves.
469- Describe WHAT KIND of recommendation the user would prefer, referencing \
470 their existing tools / brands EXPLICITLY in the answer structure.
471- First scan ALL recalled evidence for user's existing tools, brands, \
472 stated preferences. Structure the answer around those, not around \
473 generic categories. If a mental model or wiki page contains a stated \
474 preference ("user prefers X brand"), the answer MUST center on that \
475 preference — not mention it as an aside.
476- Keep answers concise. Focus on key preferences (brand, quality, specific \
477 interests), not exhaustive category lists.
479**For Multi-hop / Synthesis Questions** (requires combining evidence from \
480multiple sessions / facts; "how did X relate to Y", "what happened after Z \
481that led to W"):
482- Synthesize across MULTIPLE retrieved facts — do not over-index on any \
483 single hit.
484- BEFORE calling `done`, verify you have evidence from ≥2 distinct \
485 sources / sessions. If not, issue another `recall` with a refined query \
486 that targets the missing link. (Multi-hop questions are the strongest \
487 case for spending iterations on additional retrieval.)
488- Do NOT structure the answer around a single brand or preference (that's \
489 for recommendation questions only).
490- List the supporting facts inline (1, 2, 3...) to make the synthesis \
491 traceable. Each cited `memory_id` should correspond to one of the \
492 numbered supporting facts.
493- If facts conflict, prefer the more recent / more specific one and note \
494 the conflict explicitly in the answer.
496**For Temporal-Reasoning Questions** ("how long ago", "before/after", \
497"when did X", date arithmetic):
498- Find the ORIGINAL mention date for each event — older facts are often \
499 the right ones (NOT the most recent). The agentic loop's iterative \
500 recall is built for this: when the first recall returns recent hits, \
501 issue another with date-narrowing terms ("first time", "originally", \
502 the specific year if you can infer it).
503- Convert relative dates to absolute first; do the arithmetic explicitly; \
504 show your work in the answer.
505- Use the `resolved_temporal` field on hits when present (see rule 9).
506- Do NOT apply preference framing to date-arithmetic answers.
508**For Knowledge-Update Questions** (the user's current state / latest \
509value of a changing field; "what does the user do now", "where do they \
510live"):
511- Use the MOST RECENT fact for any field that can change over time (job, \
512 location, status, relationship). Earlier facts about the same field are \
513 HISTORICAL, not current.
514- If recall returns multiple facts about the same field, pick the one with \
515 the latest `occurred_start` or session date. Mention the prior value(s) \
516 only if the question explicitly asks about change history.
517"""
520_ADVERSARIAL_DEFENSE_RULES = """\
522ADVERSARIAL DEFENSE (read this carefully):
524A. Premise check before answering. If the question PRESUPPOSES a fact \
525("Why did Alice quit Google?" presupposes Alice worked at Google AND quit), \
526your evidence must directly support each presupposition. If ANY \
527presupposition lacks supporting evidence in the tool results, return:
528 "insufficient evidence — '<the unsupported presupposition>' is not \
529attested in memory."
531B. Entity / date / location verification. If the question names a specific \
532entity, date, or place, verify that token actually appears in the retrieved \
533evidence with the right context before incorporating it into your answer.
535C. Negative-existence questions. If asked "Did X ever do Y?" and recall \
536returns no memory in which X is associated with Y, the correct answer is \
537"No" or "no evidence in memory" — NOT a fabricated yes-answer from \
538loosely-related hits.
540D. Time-shift traps. Dates and time periods in the question must match \
541dates in the retrieved evidence. If the question asks about year N but \
542all evidence is year M, abstain — do not silently shift the date.
544E. "I don't know" is ALWAYS a valid answer. Speculation is never valid. \
545A confident "insufficient evidence" beats a plausible-but-unsupported answer.
546"""
def _build_system_prompt(*, adversarial_defense: bool = False) -> str:
    """Return the system prompt for the agentic loop.

    The base prompt is always present. The adversarial-defense rules are
    concatenated directly after it only when ``adversarial_defense`` is
    true, so workloads without adversarial questions do not pay for the
    extra prompt text.
    """
    sections = [_SYSTEM_PROMPT]
    if adversarial_defense:
        sections.append(_ADVERSARIAL_DEFENSE_RULES)
    return "".join(sections)
562# ---------------------------------------------------------------------------
563# Loop entry point
564# ---------------------------------------------------------------------------
# (query, max_results) -> list[MemoryHit]. Backs the always-available
# ``recall`` tool.
RecallFn = Callable[[str, int], Awaitable[list[MemoryHit]]]
# (query, max_results) -> list[MemoryHit]. ``None`` means the
# ``search_observations`` tool is not offered to the model.
ObservationsFn = Callable[[str, int], Awaitable[list[MemoryHit]]] | None
# (memory_id, max_sources) -> list[MemoryHit]. ``None`` means the ``expand``
# tool is not offered to the model.
ExpandFn = Callable[[str, int], Awaitable[list[MemoryHit]]] | None
# (query, scope_or_None) -> list[MemoryHit]. Scope is the optional second
# argument from the tool call; the orchestrator forwards it to
# ``MentalModelStore.list(bank_id, scope=scope)``. No max_results: mental
# models are usually a handful per bank — return all in scope and trust the
# LLM to pick.
MentalModelsFn = Callable[[str, str | None], Awaitable[list[MemoryHit]]] | None
# (pattern_or_None, limit) -> [(entity_name, mention_count), ...].
# Counting primitive — agent uses this for "how many doctors / projects /
# items" questions where the recall+done loop produces unreliable counts
# from raw text. PR2.6 addition; Hindsight does not have this tool.
ListEntitiesFn = Callable[[str | None, int], Awaitable[list[tuple[str, int]]]] | None
@dataclass
class _AgentState:
    """Running-state container for the loop."""

    # Evidence pool gathered so far; seeded from the (truncated) initial hits.
    evidence: list[MemoryHit] = field(default_factory=list)
    # Memory IDs already accounted for; ``_add_hits`` uses it to de-duplicate.
    seen_ids: set[str] = field(default_factory=set)
    # Full message history passed to ``llm_provider.complete`` on every turn.
    conversation: list[Message] = field(default_factory=list)
    # Token totals accumulated from ``completion.usage`` when the provider
    # reports usage.
    total_input_tokens: int = 0
    total_output_tokens: int = 0
def _add_hits(state: _AgentState, hits: list[MemoryHit], cap: int) -> int:
    """Append unseen hits to the evidence pool without exceeding ``cap``.

    Args:
        state: Loop state whose ``evidence`` list and ``seen_ids`` set are
            updated in place.
        hits: Candidate hits, in the order they should be considered.
        cap: Maximum size of ``state.evidence``.

    Returns:
        The number of hits actually appended. Hits with a falsy
        ``memory_id``, hits already in ``seen_ids``, and hits arriving once
        the pool is full are skipped (and are not recorded in ``seen_ids``).
    """
    added = 0
    for hit in hits:
        # Check the cap BEFORE appending: checking afterwards let a pool
        # that was already full grow by one hit on every call.
        if len(state.evidence) >= cap:
            break
        hit_id = hit.memory_id
        if not hit_id or hit_id in state.seen_ids:
            continue
        state.evidence.append(hit)
        state.seen_ids.add(hit_id)
        added += 1
    return added
def _resolved_temporal_for_hit(
    meta: dict | None,
) -> dict[str, list[dict[str, str]] | str] | None:
    """Pull the pre-computed temporal facts from a MemoryHit's metadata.

    ``temporal_metadata()`` writes ``temporal_phrase`` / ``resolved_date`` /
    ``date_granularity`` as ``|``-joined strings at retain time, anchored to
    the session's authoritative ``date_time``. Without surfacing these to
    the agent, the LLM does its own arithmetic from the raw text (e.g.
    seeing "yesterday" and subtracting one day from a misread anchor) and
    drifts off-by-one.

    Args:
        meta: The hit's metadata mapping, or ``None``.

    Returns:
        ``{"items": [...]}`` where each item is
        ``{"phrase": ..., "resolved_date": ...}`` plus ``"granularity"`` when
        a non-empty granularity exists at the same position. An
        ``"anchor_date"`` string is added when ``temporal_anchor`` is a
        non-empty string. Returns ``None`` when the metadata is missing,
        the phrase / date fields are not strings, their segment counts
        differ, or no pair has both a phrase and a date.
    """
    if not meta:
        return None
    phrases = meta.get("temporal_phrase")
    resolved = meta.get("resolved_date")
    if not isinstance(phrases, str) or not isinstance(resolved, str):
        return None

    phrase_parts = phrases.split("|")
    date_parts = resolved.split("|")
    # Misaligned lists cannot be paired reliably, so surface nothing.
    if len(phrase_parts) != len(date_parts):
        return None

    granularities = meta.get("date_granularity")
    granularity_parts = granularities.split("|") if isinstance(granularities, str) else []

    items: list[dict[str, str]] = []
    for index, (phrase, date) in enumerate(zip(phrase_parts, date_parts)):
        # ``"".split("|")`` yields ``[""]``, never ``[]``, so blank segments
        # must be filtered here or the agent receives an empty phrase/date.
        if not phrase.strip() or not date.strip():
            continue
        item = {"phrase": phrase, "resolved_date": date}
        # Granularity is looked up by original position to stay aligned even
        # when earlier blank pairs were skipped.
        if index < len(granularity_parts) and granularity_parts[index]:
            item["granularity"] = granularity_parts[index]
        items.append(item)
    if not items:
        return None

    out: dict[str, list[dict[str, str]] | str] = {"items": items}
    anchor = meta.get("temporal_anchor")
    if isinstance(anchor, str) and anchor:
        out["anchor_date"] = anchor
    return out
def _format_hits_for_tool_response(hits: list[MemoryHit]) -> str:
    """Serialize hits into the compact JSON array shown to the model.

    Every entry carries ``id``, ``text`` (trimmed, at most 600 characters)
    and ``score`` (rounded to four decimals, ``0.0`` when falsy).

    A ``resolved_temporal`` block is attached whenever the hit's metadata
    holds pre-computed temporal facts, so the agent can quote the canonical
    date rather than redoing "yesterday" / "last week" arithmetic itself.

    M26 Phase 3 — when the metadata carries ``source_chunk`` (populated by
    the bench harness's ``_to_hit`` bridge from the fact-grain candidate's
    M24 inline pairing), it is surfaced as a ``source_chunk`` field. That
    gives the reflect agent the same fact↔chunk cross-reference Hindsight's
    bench answerer relies on.
    """

    def _clip(value: str, limit: int) -> str:
        # Keep the result within ``limit`` characters, ellipsis included.
        return value if len(value) <= limit else value[: limit - 3] + "..."

    entries: list[dict[str, object]] = []
    for hit in hits:
        record: dict[str, object] = {
            "id": hit.memory_id,
            "text": _clip((hit.text or "").strip(), 600),
            "score": round(hit.score, 4) if hit.score else 0.0,
        }

        temporal = _resolved_temporal_for_hit(hit.metadata)
        if temporal is not None:
            record["resolved_temporal"] = temporal

        metadata = hit.metadata or {}
        chunk = metadata.get("source_chunk") if isinstance(metadata, dict) else None
        if chunk:
            # Agent context budget is tight; the bench's
            # _slice_section_around_line already caps chunks at 1200 chars,
            # and the same bound is applied here for any other caller.
            record["source_chunk"] = _clip(str(chunk).strip(), 1200)

        entries.append(record)
    return json.dumps(entries, ensure_ascii=False)
def _clamped_int_arg(raw: Any, default: int, *, low: int = 1, high: int = 20) -> int:
    """Coerce an LLM-supplied integer tool argument and clamp it to ``[low, high]``.

    Falsy or unparseable values fall back to ``default`` (which is clamped
    as well). ``OverflowError`` is handled because ``int(float("inf"))``
    raises it and ``json.loads`` accepts ``Infinity`` in tool arguments.
    """
    try:
        value = int(raw or default)
    except (TypeError, ValueError, OverflowError):
        value = default
    return max(low, min(value, high))


def _optional_str_arg(raw: Any) -> str | None:
    """Return ``raw`` stripped when it is a non-blank string, else ``None``."""
    if not isinstance(raw, str):
        return None
    return str(raw).strip() or None


async def agentic_reflect(
    query: str,
    *,
    initial_hits: list[MemoryHit],
    recall_fn: RecallFn,
    llm_provider,
    observations_fn: ObservationsFn = None,
    expand_fn: ExpandFn = None,
    mental_models_fn: MentalModelsFn = None,
    list_entities_fn: ListEntitiesFn = None,
    params: AgenticReflectParams | None = None,
    final_synthesize_fn: Callable[..., Awaitable[ReflectResult]] | None = None,
    final_synthesize_kwargs: dict[str, Any] | None = None,
) -> ReflectResult:
    """Run the agentic loop with native function calling.

    Args:
        query: The user's question.
        initial_hits: Evidence already gathered by the outer recall step.
            Only the first ``params.max_evidence_pool_size`` hits seed the
            evidence pool and the opening user message.
        recall_fn: Async callable ``(query, max_results) -> [MemoryHit]``.
        llm_provider: Must support the extended ``complete`` signature
            with ``tools=`` and emit parsed ``tool_calls`` on the
            returned :class:`Completion`.
        observations_fn: Optional. When provided, the ``search_observations``
            tool is enabled. Callable ``(query, max_results)``.
        expand_fn: Optional. When provided, the ``expand`` tool is
            enabled. Callable ``(memory_id, max_sources)``.
        mental_models_fn: Optional. When provided, the
            ``search_mental_models`` tool is enabled. Callable
            ``(query, scope) -> [MemoryHit]`` — orchestrator typically
            forwards to ``MentalModelStore.list(bank_id, scope=scope)``
            and converts each :class:`MentalModel` to a ``MemoryHit``.
            The mental-model tier sits ABOVE ``search_observations`` in
            the agent's priority order; see the system prompt.
        list_entities_fn: Optional. When provided, the ``list_entities``
            tool is enabled. Awaited as ``(pattern, limit)``; its result
            is iterated as ``(name, count)`` pairs.
        params: Loop tuning. Defaults match Hindsight.
        final_synthesize_fn / final_synthesize_kwargs: Fallback path
            when the loop hits its iteration cap or the provider lacks
            tool-call support. Typically the orchestrator's ``synthesize``.

    Returns:
        When the model calls ``done``: a :class:`ReflectResult` carrying its
        answer and the pooled hits whose ``memory_id`` it cited (the whole
        pool when none of the cited ids match). Otherwise the result of
        ``final_synthesize_fn`` over the pooled evidence, or an
        ``"insufficient evidence"`` result when no fallback was supplied.
    """
    p = params or AgenticReflectParams()
    pool_cap = p.max_evidence_pool_size

    # Build the tool list from optional capabilities; recall + done are
    # always present. Order: mental_models → observations → recall →
    # expand → list_entities → done.
    tools: list[ToolDefinition] = []
    if mental_models_fn is not None:
        tools.append(_TOOL_SEARCH_MENTAL_MODELS)
    if observations_fn is not None:
        tools.append(_TOOL_SEARCH_OBSERVATIONS)
    tools.append(_TOOL_RECALL)
    if expand_fn is not None:
        tools.append(_TOOL_EXPAND)
    if list_entities_fn is not None:
        tools.append(_TOOL_LIST_ENTITIES)
    tools.append(_TOOL_DONE)
    # Loop-invariant; only used in the unknown-tool error payload.
    valid_tool_names = [t.name for t in tools]

    # Slice once so the evidence pool, the dedup set and the prompt all
    # describe the same hits.
    seed_hits = initial_hits[:pool_cap]
    state = _AgentState(
        evidence=list(seed_hits),
        seen_ids={h.memory_id for h in seed_hits if h.memory_id},
        conversation=[
            Message(
                role="system",
                content=_build_system_prompt(
                    adversarial_defense=p.adversarial_defense,
                ),
            ),
            Message(
                role="user",
                content=(
                    f"Question: {query}\n\n"
                    f"Initial evidence (from outer recall):\n"
                    f"{_format_hits_for_tool_response(seed_hits)}"
                ),
            ),
        ],
    )

    iteration = 0
    while iteration < p.max_iterations:
        iteration += 1
        try:
            completion = await llm_provider.complete(
                state.conversation,
                max_tokens=1024,
                temperature=0.0,
                tools=tools,
                tool_choice="auto",
            )
        except Exception as exc:
            # Deliberate best-effort: any provider failure ends the loop and
            # hands over to the forced-synthesis fallback below.
            _logger.warning(
                "agentic_reflect: LLM call failed at iter %d (%s); falling back to forced synthesis.",
                iteration,
                exc,
            )
            break

        usage = completion.usage
        if usage:
            # ``or 0`` keeps accounting alive when a provider reports None counts.
            state.total_input_tokens += usage.input_tokens or 0
            state.total_output_tokens += usage.output_tokens or 0

        tool_calls = completion.tool_calls or []

        # Feature-detect: the provider returned no parsed tool_calls. Either
        # stop and fall through to forced synthesis, or record the prose
        # turn and ask again.
        if not tool_calls:
            if p.fallback_on_no_tool_call:
                _logger.info(
                    "agentic_reflect: no tool_calls at iter %d "
                    "(provider may lack tool support); falling through "
                    "to forced synthesis.",
                    iteration,
                )
                break
            # Otherwise loop again — the model may have emitted prose
            # this turn but tool calls next turn.
            state.conversation.append(Message(role="assistant", content=completion.text or ""))
            continue

        # Append the assistant's tool-calling turn so the model sees
        # its own prior calls in the next iteration's context.
        state.conversation.append(
            Message(
                role="assistant",
                content=completion.text or "",
                tool_calls=tool_calls,
            )
        )

        # Execute each tool call (typically one per turn, but providers
        # can emit several). The first ``done`` call short-circuits.
        for tc in tool_calls:
            name = _normalize_tool_name(tc.name)
            args = tc.arguments or {}

            if name == "done":
                answer = str(args.get("answer") or "").strip()
                cited_ids_raw = args.get("cited_ids") or []
                if not isinstance(cited_ids_raw, list):
                    cited_ids_raw = []
                cited_set = {str(c) for c in cited_ids_raw if c}
                # Keep only citations that are actually in the pool; when
                # none match, return the whole pool as sources.
                cited_hits = [h for h in state.evidence if h.memory_id in cited_set]
                return ReflectResult(answer=answer, sources=cited_hits or state.evidence)

            if name == "list_entities" and list_entities_fn is not None:
                pattern = _optional_str_arg(args.get("pattern"))
                limit = _clamped_int_arg(args.get("limit"), 50, high=200)
                try:
                    entities = await list_entities_fn(pattern, limit)
                except Exception as exc:  # noqa: BLE001
                    _logger.warning("agentic_reflect.list_entities failed (%s)", exc)
                    entities = []
                # Format the response so the agent sees both the count
                # AND the named items — so it can distinguish over- and
                # under-counting from extraction errors.
                payload = {
                    "total_distinct": len(entities),
                    "entities": [
                        {"name": entity_name, "section_mentions": mention_count}
                        for entity_name, mention_count in entities
                    ],
                }
                state.conversation.append(
                    Message(
                        role="tool",
                        tool_call_id=tc.id,
                        name=name,
                        content=json.dumps(payload, ensure_ascii=False),
                    )
                )
                continue

            # The remaining tools all produce hits; each branch only computes
            # ``new_hits`` and the shared tail below pools them and replies.
            new_hits: list[MemoryHit] = []
            if name == "recall":
                sub_q = str(args.get("query") or query).strip() or query
                max_results = _clamped_int_arg(args.get("max_results"), p.recall_step_max_results)
                try:
                    new_hits = await recall_fn(sub_q, max_results)
                except Exception as exc:
                    _logger.warning("agentic_reflect.recall failed (%s)", exc)
                    new_hits = []
            elif name == "search_observations" and observations_fn is not None:
                sub_q = str(args.get("query") or query).strip() or query
                max_results = _clamped_int_arg(args.get("max_results"), p.observations_step_max_results)
                try:
                    new_hits = await observations_fn(sub_q, max_results)
                except Exception as exc:
                    _logger.warning("agentic_reflect.observations failed (%s)", exc)
                    new_hits = []
            elif name == "search_mental_models" and mental_models_fn is not None:
                # ``query`` is forwarded for traceability; the underlying
                # store doesn't currently rank by query (mental models are
                # usually a handful per bank — return all in scope and let
                # the LLM pick). ``scope`` filters at the SPI level.
                sub_q = str(args.get("query") or query).strip() or query
                scope_arg = _optional_str_arg(args.get("scope"))
                try:
                    new_hits = await mental_models_fn(sub_q, scope_arg)
                except Exception as exc:
                    _logger.warning("agentic_reflect.mental_models failed (%s)", exc)
                    new_hits = []
            elif name == "expand" and expand_fn is not None:
                memory_id = str(args.get("memory_id") or "").strip()
                max_sources = _clamped_int_arg(args.get("max_sources"), p.expand_step_max_sources)
                # A blank memory_id skips the call but still gets an
                # (empty) tool reply below.
                if memory_id:
                    try:
                        new_hits = await expand_fn(memory_id, max_sources)
                    except Exception as exc:
                        _logger.warning("agentic_reflect.expand failed (%s)", exc)
                        new_hits = []
            else:
                # Unknown tool (or a tool whose backing callable was not
                # supplied) — feed a structured error back so the model
                # can recover rather than treating the response as terminal.
                _logger.info(
                    "agentic_reflect: unknown tool %r at iter %d; informing model and continuing.",
                    name,
                    iteration,
                )
                state.conversation.append(
                    Message(
                        role="tool",
                        tool_call_id=tc.id,
                        name=name,
                        content=json.dumps({"error": f"unknown tool: {tc.name!r}; valid: {valid_tool_names}"}),
                    )
                )
                continue

            _add_hits(state, new_hits, pool_cap)
            state.conversation.append(
                Message(
                    role="tool",
                    tool_call_id=tc.id,
                    name=name,
                    content=_format_hits_for_tool_response(new_hits),
                )
            )

    # Loop exited without ``done``. Forced single-shot synthesis over
    # whatever evidence accumulated.
    if final_synthesize_fn is not None:
        return await final_synthesize_fn(
            query=query,
            hits=state.evidence,
            llm_provider=llm_provider,
            **(final_synthesize_kwargs or {}),
        )
    return ReflectResult(
        answer="insufficient evidence",
        sources=state.evidence,
    )