Coverage for astrocyte/pipeline/agentic

1"""Agentic reflect loop (Hindsight parity, native function-calling).

3Closely mirrors Hindsight's ``engine/reflect/agent.py``:

5- **Native function calling** (OpenAI/Anthropic ``tools=`` API), not a

6 JSON-in-prose protocol. Provider adapters return parsed

7 :class:`ToolCall` instances; the loop dispatches by tool name.

8- **Hierarchical tools** in priority order:

9 1. ``recall`` — raw fact retrieval (always available).

10 2. ``search_observations`` — consolidated knowledge layer (when

11 observations are configured for the bank).

12 3. ``expand`` — fetch source memories cited by a compiled fact /

13 wiki page (their "include_chunks" semantics on demand).

14 4. ``done`` — terminal action; the model commits an answer with

15 ``cited_ids``.

16- **Default 10 iterations** (Hindsight default).

17- **Tool-name normalization** for misbehaving LLMs that emit prefixed

18 or decorated tool names (``call=functions.done``,

19 ``done<|channel|>commentary``, ``functions.recall``).

20- **Token tracking** via ``tiktoken`` when available; degrades to None

21 totals when the package isn't installed.

22- **Citation validation** — ``cited_ids`` not seen in the running

23 evidence pool are stripped from the response's ``sources`` list.

24- **Feature detection** — when the provider doesn't support

25 ``tools`` (legacy ``MockLLMProvider`` paths), the loop falls

26 through to forced single-shot synthesis.

28The loop ALWAYS round-trips tool results back to the model as

29``role="tool"`` messages so the next iteration sees its prior tool's

30output, matching OpenAI's tool-use protocol.

31"""

33from __future__ import annotations

35import json

36import logging

37from dataclasses import dataclass, field

38from typing import Any, Awaitable, Callable

40from astrocyte.types import (

41 MemoryHit,

42 Message,

43 ReflectResult,

44 ToolDefinition,

45)

47_logger = logging.getLogger("astrocyte.agentic_reflect")

50# ---------------------------------------------------------------------------

51# Loop configuration

52# ---------------------------------------------------------------------------

@dataclass
class AgenticReflectParams:
    """Tunable knobs for the reflect loop.

    Defaults match Hindsight where stated:
    - ``max_iterations=10`` matches their ``DEFAULT_MAX_ITERATIONS``.
    - ``recall_step_max_results=10`` matches the bench's outer recall.
    - ``observations_step_max_results=5`` matches their default.
    - ``expand_step_max_sources=5`` matches their default.
    - ``max_evidence_pool_size=30`` is our cap; their token-budget cap
      is computed dynamically from ``max_tokens`` instead.

    No validation is performed on these values; callers are expected to
    pass sensible positive integers.
    """

    #: Iteration cap for the loop; when it is hit the caller-supplied
    #: final-synthesis fallback is used (see ``agentic_reflect`` docstring).
    max_iterations: int = 10
    #: Presumably the default ``max_results`` for the ``recall`` tool when
    #: the model omits it — the use site is not visible here; confirm.
    recall_step_max_results: int = 10
    #: Presumably the default ``max_results`` for ``search_observations``
    #: — use site not visible here; confirm.
    observations_step_max_results: int = 5
    #: Presumably the default ``max_sources`` for ``expand`` — use site
    #: not visible here; confirm.
    expand_step_max_sources: int = 5
    #: Upper bound on the running evidence pool. ``agentic_reflect``
    #: truncates ``initial_hits`` to this size when seeding the state.
    max_evidence_pool_size: int = 30
    #: When the provider lacks tool-calling support OR doesn't emit any
    #: tool call on a turn, the loop falls through to forced synthesis
    #: rather than thrashing. Set ``False`` only for diagnostic runs.
    fallback_on_no_tool_call: bool = True
    #: Append adversarial-defense rules to the system prompt
    #: (premise check, negative-existence handling, time-shift trap,
    #: explicit "insufficient evidence is always a valid answer").
    #: Targets the LoCoMo adversarial category.
    adversarial_defense: bool = False

84# ---------------------------------------------------------------------------

85# Tool schemas (OpenAI-compatible JSON Schema)

86# ---------------------------------------------------------------------------

# JSON-Schema tool definition for ``search_mental_models``.
# Registered with the model only when ``agentic_reflect`` receives a
# ``mental_models_fn``; it is placed first in the tool list.
# ``scope`` is optional — only ``reason`` and ``query`` are required.
_TOOL_SEARCH_MENTAL_MODELS = ToolDefinition(
    name="search_mental_models",
    description=(
        "Search the bank's curated mental models — durable, refreshable summaries "
        "of stable preferences / personas / concepts (e.g. 'Alice prefers async "
        "communication' or 'Project X status: blocked on review'). Highest-quality "
        "tier when the question is about a stable summary or persona; cite the "
        "model_id like any other memory_id. If `search_mental_models` returns "
        "nothing relevant, fall through to `search_observations` and `recall`."
    ),
    parameters={
        "type": "object",
        "properties": {
            "reason": {
                "type": "string",
                "description": "Brief explanation of why a mental-model lookup fits the question.",
            },
            "query": {
                "type": "string",
                "description": "Free-text query — used for ranking when the store doesn't already filter by scope.",
            },
            "scope": {
                "type": "string",
                "description": "Optional scope filter (e.g. 'bank' for bank-wide models, 'person:alice' for a person-specific scope). Omit to list all models.",
            },
        },
        "required": ["reason", "query"],
    },
)

118

# JSON-Schema tool definition for ``recall`` (raw memory retrieval).
# Always registered by ``agentic_reflect`` regardless of optional callbacks.
# NOTE(review): the "default 10, max 20" wording below is advertised to the
# model; where (or whether) the max is enforced is not visible here.
_TOOL_RECALL = ToolDefinition(
    name="recall",
    description=(
        "Retrieve raw memories from the bank by semantic similarity to your query. "
        "Use this for ground-truth facts. Each result has an `id` you must cite "
        "from in the final `done` call."
    ),
    parameters={
        "type": "object",
        "properties": {
            "reason": {
                "type": "string",
                "description": "Brief explanation of why you're calling recall (for debug/tracing).",
            },
            "query": {
                "type": "string",
                "description": "Search query — refine it with names/dates/specific terms when broadening doesn't help.",
            },
            "max_results": {
                "type": "integer",
                "description": "Maximum number of memories to return (default 10, max 20).",
            },
        },
        "required": ["reason", "query"],
    },
)

145

# JSON-Schema tool definition for ``search_observations`` (consolidated
# knowledge layer). Registered only when ``agentic_reflect`` receives an
# ``observations_fn``. ``max_results`` is optional.
_TOOL_SEARCH_OBSERVATIONS = ToolDefinition(
    name="search_observations",
    description=(
        "Search consolidated observations — auto-synthesized knowledge with "
        "freshness info. Higher precision than raw recall for stable persona / "
        "preference questions. Stale observations should be cross-checked with "
        "`recall` before citing."
    ),
    parameters={
        "type": "object",
        "properties": {
            "reason": {"type": "string"},
            "query": {"type": "string"},
            "max_results": {
                "type": "integer",
                "description": "Maximum number of observations (default 5).",
            },
        },
        "required": ["reason", "query"],
    },
)

167

# JSON-Schema tool definition for ``expand`` (fetch the source memories
# behind a compiled item). Registered only when ``agentic_reflect`` receives
# an ``expand_fn``. ``max_sources`` is optional.
_TOOL_EXPAND = ToolDefinition(
    name="expand",
    description=(
        "Fetch the source memories cited by a compiled fact / observation / wiki "
        "page that you've already seen. Pass the `id` of the compiled item; "
        "you'll receive the raw memories underneath, useful when the distilled "
        "fact loses important verbatim context."
    ),
    parameters={
        "type": "object",
        "properties": {
            "reason": {"type": "string"},
            "memory_id": {
                "type": "string",
                "description": "ID of the compiled memory whose sources to expand.",
            },
            "max_sources": {
                "type": "integer",
                "description": "Maximum source memories to fetch (default 5).",
            },
        },
        "required": ["reason", "memory_id"],
    },
)

192

# JSON-Schema tool definition for ``list_entities`` (deterministic counting
# primitive). Registered only when ``agentic_reflect`` receives a
# ``list_entities_fn``. Only ``reason`` is required; ``pattern`` and ``limit``
# are optional.
# NOTE(review): the "default 50, max 200" limit is advertised to the model;
# its enforcement site is not visible here.
_TOOL_LIST_ENTITIES = ToolDefinition(
    name="list_entities",
    description=(
        "List distinct entity names in the current document with their per-section "
        "mention counts — use this for COUNTING / AGGREGATION questions where you "
        "need the exact count of distinct items mentioned across the conversation. "
        'Pass a substring `pattern` to filter (e.g. `"doctor"`, `"kit"`, '
        '`"trip"`); pass no pattern to see the top-mentioned entities. Returns '
        "`[(entity_name, count), ...]` ordered by count desc. The agent counts the "
        "list — this is the deterministic counting primitive the recall+done loop "
        "lacks. If `list_entities` returns the right list, your answer is just "
        "`len(list)` or a sum of relevant counts."
    ),
    parameters={
        "type": "object",
        "properties": {
            "reason": {
                "type": "string",
                "description": "Why you're listing (e.g. 'count distinct doctors mentioned').",
            },
            "pattern": {
                "type": "string",
                "description": (
                    "Case-insensitive substring filter on entity_name. Use a "
                    "broad term (`'doctor'` not `'Dr. Smith'`). Omit to see the "
                    "top-mentioned entities globally."
                ),
            },
            "limit": {
                "type": "integer",
                "description": "Max entries to return (default 50, max 200).",
            },
        },
        "required": ["reason"],
    },
)

229

# JSON-Schema tool definition for ``done`` — the terminal action in which the
# model commits its answer plus the IDs it cites. Always registered by
# ``agentic_reflect`` and appended last in the tool list. Both ``answer`` and
# ``cited_ids`` are required.
_TOOL_DONE = ToolDefinition(
    name="done",
    description=(
        "Commit your final answer. ALL `cited_ids` must be IDs you have seen in "
        "an earlier tool result; you cannot invent IDs. If you cannot answer "
        "with evidence, return an honest 'insufficient evidence' answer."
    ),
    parameters={
        "type": "object",
        "properties": {
            "answer": {
                "type": "string",
                "description": "Final answer to the user's question, evidence-grounded.",
            },
            "cited_ids": {
                "type": "array",
                "items": {"type": "string"},
                "description": "IDs of memories supporting the answer; must come from prior tool results.",
            },
        },
        "required": ["answer", "cited_ids"],
    },
)

253

254

255# ---------------------------------------------------------------------------

256# Tool-name normalization (Hindsight-parity)

257# ---------------------------------------------------------------------------

258

259

def _normalize_tool_name(name: str) -> str:
    """Normalize tool names from misbehaving LLMs.

    Mirrors Hindsight's :func:`_normalize_tool_name`:
    - ``call=functions.done`` → ``done``
    - ``functions.recall`` → ``recall``
    - ``done<|channel|>commentary`` → ``done``

    Surrounding whitespace is removed *before* the prefix checks so that a
    padded name such as ``" functions.recall"`` is still recognised.

    Args:
        name: Tool name exactly as emitted by the provider.

    Returns:
        The bare tool name with decorations and surrounding whitespace
        removed. Case is preserved.
    """
    # Strip first: a leading space or newline would otherwise defeat the
    # prefix matches below and leave the decorated name untouched.
    cleaned = name.strip()
    # Order matters — ``call=`` wraps ``functions.``, never the reverse.
    cleaned = cleaned.removeprefix("call=")
    cleaned = cleaned.removeprefix("functions.")
    # Drop any special-token suffix (``<|channel|>commentary`` etc.).
    cleaned = cleaned.partition("<|")[0]
    return cleaned.strip()

275

276

def _is_done_tool(name: str) -> bool:
    """Return ``True`` when *name* normalizes (case-insensitively) to ``done``."""
    canonical = _normalize_tool_name(name)
    return canonical.lower() == "done"

279

280

281# ---------------------------------------------------------------------------

282# System prompt

283# ---------------------------------------------------------------------------

284

285

286_SYSTEM_PROMPT = """\

287You are an evidence-gathering memory agent. Answer the user's question using \

288ONLY tool results — never invent facts.

289

290Available tools (priority order — try the higher-quality tiers first):

291- `search_mental_models`: curated, refreshable summaries of stable \

292preferences / personas / concepts (highest quality — try first when the \

293question asks about a person's stable beliefs / preferences / status).

294- `search_observations`: consolidated knowledge auto-synthesized from raw \

295memories (try second for stable patterns when no mental model fits).

296- `recall`: raw memories for ground-truth facts (always available, fall back \

297here when the higher tiers return nothing relevant).

298- `expand`: fetch source memories under a compiled fact / mental model / \

299wiki page you've seen — useful when the distilled summary loses verbatim \

300context.

301- `list_entities`: list distinct entity names with their mention counts. \

302USE THIS FOR COUNTING / AGGREGATION QUESTIONS ("how many doctors / projects \

303/ items"). Pass a substring `pattern` (e.g. `"doctor"`, `"kit"`, `"trip"`); \

304the tool returns the distinct entities matching it with per-section \

305mention counts. The answer is `total_distinct` (or a sum / a filtered \

306subcount). This is far more reliable than counting from raw text — the \

307LLM is bad at distinct-counting across many chunks.

308- `done`: commit final answer with citations.

309

310COUNTING DISCIPLINE — when the question is "how many X" or "how much Y":

311

312A. Reach for `list_entities(pattern=...)` BEFORE `recall`. Entities are \

313extracted in two shapes:

314

315 1. PROPER NOUNS — "Dr. Patel", "Nordstrom", "MoMA". Use these when \

316 the question names a specific person or place.

317

318 2. STRUCTURED LABELS — `key:value` strings the extractor emits for \

319 COUNTABLE CATEGORIES. Five key prefixes:

320 - `role:<noun>` — occupational categories (`role:doctor`, \

321 `role:dermatologist`, `role:lawyer`)

322 - `category:<noun>` — countable kinds of things \

323 (`category:model_kit`, `category:plant`, `category:restaurant`, \

324 `category:trip`, `category:bike`)

325 - `event:<noun>` — distinct occurrences \

326 (`event:wedding`, `event:sale`, `event:road_trip`)

327 - `expense:<currency_amount>` — money spent \

328 (`expense:$45`, `expense:$185`)

329 - `prefers:<aspect>=<value>` — stable user preference / taste \

330 (`prefers:camera_brand=Sony`, \

331 `prefers:movie_genre=stand-up_comedy`)

332

333 STRONGLY PREFER querying labels for category counts. Examples:

334 - "How many doctors?" → `list_entities(pattern="role:doctor")` \

335 OR `list_entities(pattern="role:")` to see all roles.

336 - "How many model kits?" → `list_entities(pattern="category:kit")` \

337 OR `pattern="category:model_kit"`.

338 - "Total spent on bikes?" → `list_entities(pattern="expense:")` and \

339 sum the dollar amounts in the entity names.

340 - "How many trips?" → `list_entities(pattern="category:trip")`.

341

342B. The returned `total_distinct` is the count of DISTINCT label-or-name \

343strings; `section_mentions` is how many sections each appeared in. For \

344"how many doctors" you want `total_distinct` of `role:doctor` rows \

345(or sum the section_mentions if each role mention = one visit).

346

347C. After `list_entities`, you may still want one `recall` to verify \

348context — but the COUNT is whatever `list_entities` returned.

349

350D. If `pattern="category:doctor_visit"` returns nothing, try the role: \

351form (`role:doctor`) and counting role mentions instead. Different docs \

352may have used different label conventions; try ~2 patterns before \

353giving up on labels and falling back to recall+count.

354

355Core rules:

3561. Gather evidence with the tools above before answering.

3572. Each `cited_ids` entry MUST be an ID you saw in a prior tool result.

3583. Refine your query with specific names / dates / sub-topics when initial \

359recall doesn't surface the answer.

360

361TOOL-RESULT FORMAT — each hit is a JSON entry with these fields:

362- ``id`` — the memory_id (use for citations)

363- ``text`` — the atomic fact (the searchable summary)

364- ``source_chunk`` (when present) — the raw conversation excerpt the fact was \

365extracted from. **This is your primary source for verbatim wording, implicit \

366cues, and details the extracted fact may have lost.** When a question asks \

367for the user's exact phrasing or the assistant's specific recommendation, \

368prefer the source_chunk's wording over the fact's summary text.

369- ``resolved_temporal`` (when present) — pre-resolved absolute dates for \

370relative phrases (see rule 9 below).

371

372EXTRACTION DISCIPLINE — read this carefully before saying "insufficient evidence":

373

3744. When tool results CONTAIN the answer (verbatim or near-verbatim, even \

375embedded in larger text), you MUST extract the specific fact. Examples:

376 - Q: "What's James's favorite game?" — recall has "James: my favourite \

377game is Apex Legends" → answer "Apex Legends" and cite the hit.

378 - Q: "What's Joanna's favorite movie?" — recall has "Joanna: I love \

379Eternal Sunshine of the Spotless Mind" → answer with the title.

380 - Q: "What's Jon's favorite style of dance?" — recall has "Jon: \

381contemporary feels right to me" → answer "contemporary".

382

3835. Wiki / persona pages in recall ARE evidence. If a person's wiki page is \

384returned but doesn't directly list the specific fact, call `expand` on the \

385page id OR run another `recall` with the specific term (e.g. "Joanna favorite \

386movie", "Jon dance style") BEFORE concluding insufficient evidence.

387

3886. "Insufficient evidence" is ONLY valid when:

389 - No retrieved memory mentions the queried fact at all, OR

390 - The question presupposes something not attested in memory (e.g. \

391"Why did X quit Y?" when X never worked at Y), OR

392 - You've spent the recall budget refining and still nothing matches.

393

3947. NOT valid reasons to say insufficient evidence:

395 - "The wording in evidence differs from the question's wording"

396 - "I see partial information but not the exact phrasing"

397 - "I'd need more context"

398 - "Multiple options are mentioned, I can't pick one" — pick the most \

399specific match and cite it.

400

4018. When in doubt between abstaining and extracting: extract, cite the hit, \

402and let the answer be evidence-grounded. A confident extraction beats an \

403overcautious abstain.

404

405TEMPORAL RESOLUTION — read this when the question asks "when did X happen?":

406

4079. If a memory in the tool result has a `resolved_temporal` field, USE THE \

408DATES FROM THAT FIELD VERBATIM. Each entry maps a relative phrase \

409(`"yesterday"`, `"last week"`) to the canonical `resolved_date` computed \

410at retain time from the session's authoritative timestamp. DO NOT redo the \

411arithmetic from the raw text — you will get off-by-one errors when the \

412session anchor differs from your assumed `now()`.

413

414 Example: memory text "Caroline: yesterday I went to the support group" \

415with `resolved_temporal.items = [{"phrase": "yesterday", "resolved_date": \

416"2023-05-07", "granularity": "day"}]` → answer "May 7, 2023". Do not \

417recompute "yesterday" against any other reference date.

418

41910. Only fall back to raw-text arithmetic when `resolved_temporal` is \

420absent. If both an absolute date in the text AND a `resolved_temporal` \

421entry exist, prefer the absolute date — `resolved_temporal` is for \

422relative phrases only.

423

424STOP-EARLY DISCIPLINE — `done` is your default, not your last resort:

425

426The loop has a fixed iteration budget. Empirically, agents that run out \

427the clock produce WORSE answers than agents that commit early on \

428sufficient evidence. After EVERY tool result, ask yourself:

429

430 "Do I have ≥2 retrieved memories that DIRECTLY contain the answer \

431 (or its components)?"

432

433If YES → call `done` IMMEDIATELY. Do not issue more `recall`/`expand`/etc. \

434calls "just to be safe" — additional tool calls dilute your synthesis \

435focus and rarely add new information once you have the answer in hand.

436

437If NO → issue ONE more refining tool call (different query terms, \

438different tool tier, or `expand` on a partial hit). Then re-check the \

439stop condition.

440

441The ONLY reasons to take more than 2 tool calls:

442- Multi-hop questions where you have evidence for half the bridge but \

443 not the other half (e.g. you found "user works at X" but need "X was \

444 acquired by Y" to answer "who is the user's parent company now").

445- Counting / aggregation questions where `list_entities` returned \

446 ambiguous patterns and you need to disambiguate.

447- Knowledge-update questions where you found a value but not its date, \

448 and need one more recall to confirm "is this the latest?"

449

450In all OTHER cases, 2 successful tool calls is enough — commit. The \

451`done` tool is your DEFAULT exit, not a fallback. Aim to call `done` \

452within iterations 2-3 of typical questions.

453

454QUESTION-TYPE ROUTING — read this BEFORE calling `done`:

455

456Classify the question into ONE of the four shapes below and apply that \

457block's rules when composing your final answer. (Ported from M19a's \

458per-Q-type answerer prompt — the same routing that shipped +12q on \

459multi-hop and +2.25pp overall in the recall+answerer pipeline. See \

460``docs/_design/m19-prompt-routing.md`` and \

461``scripts/mem0_harness/_hindsight_prompt.py``.)

462

463**For Recommendation / Preference Questions** ("can you recommend...", \

464"any tips...", "what would the user prefer..."):

465- DO NOT invent specific recommendations (no made-up product names, course \

466 titles, paper titles, channel names).

467- DO mention specific brands / products / providers the user ALREADY uses \

468 from the recalled evidence — by name, in the recommendations themselves.

469- Describe WHAT KIND of recommendation the user would prefer, referencing \

470 their existing tools / brands EXPLICITLY in the answer structure.

471- First scan ALL recalled evidence for user's existing tools, brands, \

472 stated preferences. Structure the answer around those, not around \

473 generic categories. If a mental model or wiki page contains a stated \

474 preference ("user prefers X brand"), the answer MUST center on that \

475 preference — not mention it as an aside.

476- Keep answers concise. Focus on key preferences (brand, quality, specific \

477 interests), not exhaustive category lists.

478

479**For Multi-hop / Synthesis Questions** (requires combining evidence from \

480multiple sessions / facts; "how did X relate to Y", "what happened after Z \

481that led to W"):

482- Synthesize across MULTIPLE retrieved facts — do not over-index on any \

483 single hit.

484- BEFORE calling `done`, verify you have evidence from ≥2 distinct \

485 sources / sessions. If not, issue another `recall` with a refined query \

486 that targets the missing link. (Multi-hop questions are the strongest \

487 case for spending iterations on additional retrieval.)

488- Do NOT structure the answer around a single brand or preference (that's \

489 for recommendation questions only).

490- List the supporting facts inline (1, 2, 3...) to make the synthesis \

491 traceable. Each cited `memory_id` should correspond to one of the \

492 numbered supporting facts.

493- If facts conflict, prefer the more recent / more specific one and note \

494 the conflict explicitly in the answer.

495

496**For Temporal-Reasoning Questions** ("how long ago", "before/after", \

497"when did X", date arithmetic):

498- Find the ORIGINAL mention date for each event — older facts are often \

499 the right ones (NOT the most recent). The agentic loop's iterative \

500 recall is built for this: when the first recall returns recent hits, \

501 issue another with date-narrowing terms ("first time", "originally", \

502 the specific year if you can infer it).

503- Convert relative dates to absolute first; do the arithmetic explicitly; \

504 show your work in the answer.

505- Use the `resolved_temporal` field on hits when present (see rule 9).

506- Do NOT apply preference framing to date-arithmetic answers.

507

508**For Knowledge-Update Questions** (the user's current state / latest \

509value of a changing field; "what does the user do now", "where do they \

510live"):

511- Use the MOST RECENT fact for any field that can change over time (job, \

512 location, status, relationship). Earlier facts about the same field are \

513 HISTORICAL, not current.

514- If recall returns multiple facts about the same field, pick the one with \

515 the latest `occurred_start` or session date. Mention the prior value(s) \

516 only if the question explicitly asks about change history.

517"""

518

519

520_ADVERSARIAL_DEFENSE_RULES = """\

521

522ADVERSARIAL DEFENSE (read this carefully):

523

524A. Premise check before answering. If the question PRESUPPOSES a fact \

525("Why did Alice quit Google?" presupposes Alice worked at Google AND quit), \

526your evidence must directly support each presupposition. If ANY \

527presupposition lacks supporting evidence in the tool results, return:

528 "insufficient evidence — '<the unsupported presupposition>' is not \

529attested in memory."

530

531B. Entity / date / location verification. If the question names a specific \

532entity, date, or place, verify that token actually appears in the retrieved \

533evidence with the right context before incorporating it into your answer.

534

535C. Negative-existence questions. If asked "Did X ever do Y?" and recall \

536returns no memory in which X is associated with Y, the correct answer is \

537"No" or "no evidence in memory" — NOT a fabricated yes-answer from \

538loosely-related hits.

539

540D. Time-shift traps. Dates and time periods in the question must match \

541dates in the retrieved evidence. If the question asks about year N but \

542all evidence is year M, abstain — do not silently shift the date.

543

544E. "I don't know" is ALWAYS a valid answer. Speculation is never valid. \

545A confident "insufficient evidence" beats a plausible-but-unsupported answer.

546"""

547

548

def _build_system_prompt(*, adversarial_defense: bool = False) -> str:
    """Assemble the system prompt used by the agentic loop.

    The base prompt is always present. The adversarial-defense rules are
    appended only when ``adversarial_defense`` is true, so workloads that
    do not need them avoid the extra prompt text.

    Args:
        adversarial_defense: Whether to append the adversarial-defense rules.

    Returns:
        The complete system prompt string.
    """
    suffix = _ADVERSARIAL_DEFENSE_RULES if adversarial_defense else ""
    return _SYSTEM_PROMPT + suffix

560

561

562# ---------------------------------------------------------------------------

563# Loop entry point

564# ---------------------------------------------------------------------------

565

566

# (query, max_results) -> list[MemoryHit]. Required by ``agentic_reflect``;
# backs the always-available ``recall`` tool.
RecallFn = Callable[[str, int], Awaitable[list[MemoryHit]]]
# (query, max_results) -> list[MemoryHit]. ``None`` means the
# ``search_observations`` tool is not offered to the model.
ObservationsFn = Callable[[str, int], Awaitable[list[MemoryHit]]] | None
# (memory_id, max_sources) -> list[MemoryHit]. ``None`` means the ``expand``
# tool is not offered to the model.
ExpandFn = Callable[[str, int], Awaitable[list[MemoryHit]]] | None
# (query, scope_or_None) -> list[MemoryHit]. Scope is the optional second
# argument from the tool call; the orchestrator forwards it to
# ``MentalModelStore.list(bank_id, scope=scope)``. No max_results: mental
# models are usually a handful per bank — return all in scope and trust the
# LLM to pick.
MentalModelsFn = Callable[[str, str | None], Awaitable[list[MemoryHit]]] | None
# (pattern_or_None, limit) -> [(entity_name, mention_count), ...].
# Counting primitive — agent uses this for "how many doctors / projects /
# items" questions where the recall+done loop produces unreliable counts
# from raw text. PR2.6 addition; Hindsight does not have this tool.
ListEntitiesFn = Callable[[str | None, int], Awaitable[list[tuple[str, int]]]] | None

581

582

@dataclass
class _AgentState:
    """Running-state container for the loop.

    One instance is created per ``agentic_reflect`` call and mutated in
    place as tool results arrive.
    """

    # Evidence pool; seeded from the (capped) initial hits and extended by
    # ``_add_hits``.
    evidence: list[MemoryHit] = field(default_factory=list)
    # ``memory_id`` values already encountered — used by ``_add_hits`` to
    # de-duplicate incoming hits.
    seen_ids: set[str] = field(default_factory=set)
    # Message transcript sent to the provider; seeded with the system prompt
    # and the initial user message.
    conversation: list[Message] = field(default_factory=list)
    # Running token totals. NOTE(review): the update site is outside this
    # view — presumably accumulated per LLM call; confirm.
    total_input_tokens: int = 0
    total_output_tokens: int = 0

592

593

def _add_hits(state: _AgentState, hits: list[MemoryHit], cap: int) -> int:
    """Append unseen hits to the evidence pool without exceeding ``cap``.

    A hit is added only when it has a truthy ``memory_id`` that is not yet
    in ``state.seen_ids``. Both ``state.evidence`` and ``state.seen_ids``
    are mutated in place.

    Args:
        state: Loop state whose evidence pool is extended.
        hits: Candidate hits, in the order they should be considered.
        cap: Maximum size of ``state.evidence``. Nothing is added once the
            pool has reached this size, including when it is already full
            on entry.

    Returns:
        The count of newly-added hits.
    """
    added = 0
    for hit in hits:
        # Check the cap BEFORE appending: the pool may already be full on
        # entry (it is seeded with up to ``cap`` initial hits), and checking
        # afterwards would let one extra hit through on every call.
        if len(state.evidence) >= cap:
            break
        hid = hit.memory_id
        if not hid or hid in state.seen_ids:
            continue
        state.evidence.append(hit)
        state.seen_ids.add(hid)
        added += 1
    return added

606

607

def _resolved_temporal_for_hit(meta: dict | None) -> dict[str, Any] | None:
    """Pull the pre-computed temporal facts from a MemoryHit's metadata.

    ``temporal_metadata()`` writes ``temporal_phrase`` / ``resolved_date`` /
    ``date_granularity`` as ``|``-joined strings at retain time, anchored to
    the session's authoritative ``date_time``. Without surfacing these to
    the agent, the LLM does its own arithmetic from the raw text (e.g.
    seeing "yesterday" and subtracting one day from a misread anchor) and
    drifts off-by-one.

    Args:
        meta: The hit's metadata mapping, or ``None``.

    Returns:
        ``{"items": [...]}`` where each item is
        ``{"phrase": ..., "resolved_date": ...}`` plus ``"granularity"``
        when one is recorded at the same position. An ``"anchor_date"`` key
        is added when ``temporal_anchor`` is a non-empty string. Returns
        ``None`` when the metadata is missing, the phrase / date fields are
        not strings, their ``|``-separated lengths disagree, or no complete
        phrase/date pair remains.
    """
    if not meta:
        return None
    phrases = meta.get("temporal_phrase")
    resolved = meta.get("resolved_date")
    if not isinstance(phrases, str) or not isinstance(resolved, str):
        return None

    phrase_parts = phrases.split("|")
    date_parts = resolved.split("|")
    # Misaligned lists cannot be paired reliably — surface nothing rather
    # than risk attaching a date to the wrong phrase.
    if len(phrase_parts) != len(date_parts):
        return None

    granularities = meta.get("date_granularity")
    gran_parts = granularities.split("|") if isinstance(granularities, str) else []

    items: list[dict[str, str]] = []
    for idx, (phrase, date) in enumerate(zip(phrase_parts, date_parts)):
        # ``"".split("|")`` yields ``[""]``, so empty metadata strings reach
        # this loop as blank entries. A pair lacking either side carries no
        # usable information for the agent, so drop it.
        if not phrase.strip() or not date.strip():
            continue
        item = {"phrase": phrase, "resolved_date": date}
        # Granularity is indexed by the ORIGINAL position so it stays
        # aligned even when earlier pairs were dropped.
        if idx < len(gran_parts) and gran_parts[idx]:
            item["granularity"] = gran_parts[idx]
        items.append(item)

    if not items:
        return None

    out: dict[str, Any] = {"items": items}
    anchor = meta.get("temporal_anchor")
    if isinstance(anchor, str) and anchor:
        out["anchor_date"] = anchor
    return out

645

646

def _format_hits_for_tool_response(hits: list[MemoryHit]) -> str:
    """Serialize hits into a compact JSON array for a tool response.

    Every entry carries ``id``, ``text`` (whitespace-trimmed, capped at 600
    characters with a trailing ``...``) and ``score`` (rounded to four
    decimals, ``0.0`` when falsy).

    Two optional fields are attached when the hit's metadata provides them:

    - ``resolved_temporal`` — the pre-computed temporal facts, so the agent
      can quote the canonical resolved date rather than re-deriving it from
      relative phrases like "yesterday" / "last week".
    - ``source_chunk`` (M26 Phase 3) — the raw excerpt paired with the fact,
      giving the reflect agent the fact↔chunk cross-reference. Capped at
      1200 characters, mirroring the bench's own section-slice cap, so a
      caller passing raw chunks cannot blow the agent's context budget.
    """
    rendered: list[dict[str, object]] = []
    for hit in hits:
        body = (hit.text or "").strip()
        if len(body) > 600:
            body = f"{body[:597]}..."

        if hit.score:
            score = round(hit.score, 4)
        else:
            score = 0.0

        row: dict[str, object] = {"id": hit.memory_id, "text": body, "score": score}

        temporal = _resolved_temporal_for_hit(hit.metadata)
        if temporal is not None:
            row["resolved_temporal"] = temporal

        metadata = hit.metadata or {}
        chunk = None
        if isinstance(metadata, dict):
            chunk = metadata.get("source_chunk")
        if chunk:
            chunk_text = str(chunk).strip()
            if len(chunk_text) > 1200:
                chunk_text = f"{chunk_text[:1197]}..."
            row["source_chunk"] = chunk_text

        rendered.append(row)
    return json.dumps(rendered, ensure_ascii=False)

690

691

def _coerce_bounded_int(raw: object, default: int, upper: int) -> int:
    """Coerce a model-supplied integer tool argument into ``[1, upper]``.

    Falsy values (``None``, ``0``, ``""``) and values ``int()`` cannot
    convert fall back to ``default``. The result, including the default,
    is then clamped so a single tool call cannot request an unbounded
    amount of data.
    """
    try:
        value = int(raw or default)
    except (TypeError, ValueError, OverflowError):
        # OverflowError covers ``int(float("inf"))``.
        value = default
    return max(1, min(value, upper))


def _coerce_optional_str(raw: object) -> str | None:
    """Return ``raw`` stripped, or ``None`` when it is not a non-blank string."""
    if not isinstance(raw, str):
        return None
    return raw.strip() or None


async def _await_tool_or_empty(
    label: str,
    tool_fn: Callable[..., Awaitable[Any]],
    *call_args: Any,
) -> list[Any]:
    """Await ``tool_fn(*call_args)``; log and return ``[]`` if it raises.

    A failing tool must not abort the reflect loop: the model is shown an
    empty result and may try a different tool. A ``None`` result is
    normalised to ``[]`` so callers can always iterate the return value.
    """
    try:
        result = await tool_fn(*call_args)
    except Exception as exc:  # noqa: BLE001 - deliberate best-effort boundary
        _logger.warning("agentic_reflect.%s failed (%s)", label, exc)
        return []
    return result if result is not None else []


async def agentic_reflect(
    query: str,
    *,
    initial_hits: list[MemoryHit],
    recall_fn: RecallFn,
    llm_provider,
    observations_fn: ObservationsFn = None,
    expand_fn: ExpandFn = None,
    mental_models_fn: MentalModelsFn = None,
    list_entities_fn: ListEntitiesFn = None,
    params: AgenticReflectParams | None = None,
    final_synthesize_fn: Callable[..., Awaitable[ReflectResult]] | None = None,
    final_synthesize_kwargs: dict[str, Any] | None = None,
) -> ReflectResult:
    """Run the agentic loop with native function calling.

    Each iteration asks the provider for a completion with the enabled
    tools, appends the assistant's tool-calling turn to the conversation,
    executes every returned tool call, and appends one ``role="tool"``
    message per call so the next iteration sees the results.

    Args:
        query: The user's question.
        initial_hits: Evidence already gathered by the outer recall step.
            Only the first ``params.max_evidence_pool_size`` hits seed the
            evidence pool and the first user message.
        recall_fn: Async callable ``(query, max_results) -> [MemoryHit]``.
        llm_provider: Must support the extended ``complete`` signature
            with ``tools=`` and emit parsed ``tool_calls`` on the
            returned :class:`Completion`.
        observations_fn: Optional. When provided, the ``search_observations``
            tool is enabled. Callable ``(query, max_results)``.
        expand_fn: Optional. When provided, the ``expand`` tool is
            enabled. Callable ``(memory_id, max_sources)``.
        mental_models_fn: Optional. When provided, the
            ``search_mental_models`` tool is enabled. Callable
            ``(query, scope) -> [MemoryHit]``; ``scope`` is ``None`` when
            the model supplied no non-blank string.
        list_entities_fn: Optional. When provided, the ``list_entities``
            tool is enabled. Callable ``(pattern, limit)`` whose result is
            iterated as ``(name, count)`` pairs; ``pattern`` is ``None``
            when the model supplied no non-blank string and ``limit`` is
            clamped to ``[1, 200]`` (default 50).
        params: Loop tuning. Defaults to ``AgenticReflectParams()``.
        final_synthesize_fn / final_synthesize_kwargs: Fallback path used
            when the loop ends without a ``done`` call (iteration cap,
            LLM failure, or no tool calls with
            ``params.fallback_on_no_tool_call`` set). Awaited as
            ``final_synthesize_fn(query=..., hits=..., llm_provider=...,
            **final_synthesize_kwargs)``.

    Returns:
        On ``done``: a :class:`ReflectResult` carrying the model's answer
        and the evidence hits whose ids appear in ``cited_ids`` (the whole
        evidence pool when none match). Otherwise the result of
        ``final_synthesize_fn``, or an ``"insufficient evidence"`` result
        over the accumulated evidence when no fallback is supplied.
    """
    p = params or AgenticReflectParams()

    # Optional tools are advertised only when their backing callable was
    # supplied; recall and done are always present. Resulting order:
    # mental_models -> observations -> recall -> expand -> list_entities -> done.
    tools: list[ToolDefinition] = []
    if mental_models_fn is not None:
        tools.append(_TOOL_SEARCH_MENTAL_MODELS)
    if observations_fn is not None:
        tools.append(_TOOL_SEARCH_OBSERVATIONS)
    tools.append(_TOOL_RECALL)
    if expand_fn is not None:
        tools.append(_TOOL_EXPAND)
    if list_entities_fn is not None:
        tools.append(_TOOL_LIST_ENTITIES)
    tools.append(_TOOL_DONE)

    # Seed the pool once and derive ``seen_ids`` from the SAME truncated
    # list, so an id is only marked as seen when its hit is actually in
    # the evidence pool.
    initial_pool = list(initial_hits[: p.max_evidence_pool_size])
    state = _AgentState(
        evidence=initial_pool,
        seen_ids={h.memory_id for h in initial_pool if h.memory_id},
        conversation=[
            Message(
                role="system",
                content=_build_system_prompt(
                    adversarial_defense=p.adversarial_defense,
                ),
            ),
            Message(
                role="user",
                content=(
                    f"Question: {query}\n\n"
                    f"Initial evidence (from outer recall):\n"
                    f"{_format_hits_for_tool_response(initial_pool)}"
                ),
            ),
        ],
    )

    for iteration in range(1, p.max_iterations + 1):
        try:
            completion = await llm_provider.complete(
                state.conversation,
                max_tokens=1024,
                temperature=0.0,
                tools=tools,
                tool_choice="auto",
            )
        except Exception as exc:  # noqa: BLE001 - degrade to forced synthesis
            _logger.warning(
                "agentic_reflect: LLM call failed at iter %d (%s); falling back to forced synthesis.",
                iteration,
                exc,
            )
            break

        if completion.usage:
            # Providers may report a usage object with missing counts;
            # treat those as zero instead of raising on ``int += None``.
            state.total_input_tokens += completion.usage.input_tokens or 0
            state.total_output_tokens += completion.usage.output_tokens or 0

        tool_calls = completion.tool_calls or []

        # Feature-detect: the provider returned no parsed tool_calls.
        # Either fall through to forced synthesis (the provider may lack
        # tool support) or record the prose turn and ask again.
        if not tool_calls:
            if p.fallback_on_no_tool_call:
                _logger.info(
                    "agentic_reflect: no tool_calls at iter %d "
                    "(provider may lack tool support); falling through "
                    "to forced synthesis.",
                    iteration,
                )
                break
            # Otherwise loop again — the model may have emitted prose
            # this turn but tool calls next turn.
            state.conversation.append(Message(role="assistant", content=completion.text or ""))
            continue

        # Append the assistant's tool-calling turn so the model sees
        # its own prior calls in the next iteration's context.
        state.conversation.append(
            Message(
                role="assistant",
                content=completion.text or "",
                tool_calls=tool_calls,
            )
        )

        # Execute each tool call (typically one per turn, but providers
        # can emit several). The first ``done`` call short-circuits.
        for tc in tool_calls:
            name = _normalize_tool_name(tc.name)
            args = tc.arguments or {}

            if not hasattr(args, "get"):
                # Arguments arrived as something other than a mapping
                # (e.g. an unparsed string). Report it to the model
                # instead of crashing on ``args.get``.
                _logger.info(
                    "agentic_reflect: non-object arguments for tool %r at iter %d; informing model and continuing.",
                    name,
                    iteration,
                )
                state.conversation.append(
                    Message(
                        role="tool",
                        tool_call_id=tc.id,
                        name=name,
                        content=json.dumps({"error": "tool arguments must be a JSON object"}),
                    )
                )
                continue

            if name == "done":
                answer = str(args.get("answer") or "").strip()
                cited_ids_raw = args.get("cited_ids") or []
                if not isinstance(cited_ids_raw, list):
                    cited_ids_raw = []
                cited_set = {str(c) for c in cited_ids_raw if c}
                # Citation validation: only ids present in the evidence
                # pool survive; with no valid citation, return the pool.
                cited_hits = [h for h in state.evidence if h.memory_id in cited_set]
                return ReflectResult(answer=answer, sources=cited_hits or state.evidence)

            # Exactly one of ``new_hits`` / ``content`` is set by the
            # dispatch below: hit-returning tools set ``new_hits``; the
            # others build their tool-message content directly.
            new_hits: list[MemoryHit] | None = None
            content: str = ""

            if name == "recall":
                sub_q = str(args.get("query") or query).strip() or query
                max_results = _coerce_bounded_int(args.get("max_results"), p.recall_step_max_results, 20)
                new_hits = await _await_tool_or_empty("recall", recall_fn, sub_q, max_results)

            elif name == "search_observations" and observations_fn is not None:
                sub_q = str(args.get("query") or query).strip() or query
                max_results = _coerce_bounded_int(
                    args.get("max_results"), p.observations_step_max_results, 20
                )
                new_hits = await _await_tool_or_empty("observations", observations_fn, sub_q, max_results)

            elif name == "search_mental_models" and mental_models_fn is not None:
                # The sub-query and optional scope are passed straight
                # through to the callable; no result limit is applied here.
                sub_q = str(args.get("query") or query).strip() or query
                scope_arg = _coerce_optional_str(args.get("scope"))
                new_hits = await _await_tool_or_empty("mental_models", mental_models_fn, sub_q, scope_arg)

            elif name == "expand" and expand_fn is not None:
                memory_id = str(args.get("memory_id") or "").strip()
                max_sources = _coerce_bounded_int(args.get("max_sources"), p.expand_step_max_sources, 20)
                # Without a memory id there is nothing to expand; report
                # an empty result rather than calling the backend.
                if memory_id:
                    new_hits = await _await_tool_or_empty("expand", expand_fn, memory_id, max_sources)
                else:
                    new_hits = []

            elif name == "list_entities" and list_entities_fn is not None:
                pattern = _coerce_optional_str(args.get("pattern"))
                limit = _coerce_bounded_int(args.get("limit"), 50, 200)
                entities = await _await_tool_or_empty("list_entities", list_entities_fn, pattern, limit)
                # Format the response so the agent sees both the count
                # AND the named items — so it can distinguish over- and
                # under-counting from extraction errors.
                payload = {
                    "total_distinct": len(entities),
                    "entities": [
                        {"name": entity_name, "section_mentions": mentions}
                        for entity_name, mentions in entities
                    ],
                }
                content = json.dumps(payload, ensure_ascii=False)

            else:
                # Unknown (or disabled) tool — feed a structured error back
                # so the model can recover rather than treating the
                # response as terminal.
                _logger.info(
                    "agentic_reflect: unknown tool %r at iter %d; informing model and continuing.",
                    name,
                    iteration,
                )
                content = json.dumps({"error": f"unknown tool: {tc.name!r}; valid: {[t.name for t in tools]}"})

            if new_hits is not None:
                _add_hits(state, new_hits, p.max_evidence_pool_size)
                content = _format_hits_for_tool_response(new_hits)

            # Every tool call gets exactly one ``role="tool"`` reply keyed
            # by its id.
            state.conversation.append(
                Message(
                    role="tool",
                    tool_call_id=tc.id,
                    name=name,
                    content=content,
                )
            )

    # Loop exited without ``done``. Forced single-shot synthesis over
    # whatever evidence accumulated.
    if final_synthesize_fn is not None:
        return await final_synthesize_fn(
            query=query,
            hits=state.evidence,
            llm_provider=llm_provider,
            **(final_synthesize_kwargs or {}),
        )
    return ReflectResult(
        answer="insufficient evidence",
        sources=state.evidence,
    )

Coverage for astrocyte/pipeline/agentic_reflect.py: 74%

228 statements