Coverage for astrocyte/pipeline/section_fact_extraction.py: 91%

94 statements  

« prev     ^ index     » next       coverage.py v7.15.0, created at 2026-07-04 05:24 +0000

1"""M12.1 — per-section fact extraction. 

2 

3Each section's raw text is exploded into a list of atomic facts via 

4one LLM call. Each fact carries: 

5 

6- ``text``: self-contained statement ("User visited Dr. Patel on May 5") 

7- ``fact_type``: ``experience | preference | world | plan | opinion`` 

8- ``speaker``: ``user | assistant`` 

9- ``occurred_start`` / ``occurred_end``: anchored to ``session_date`` for 

10 relative phrases ("yesterday" → session - 1) 

11- ``entities``: proper nouns + key:value labels from the M10.2 vocab 

12 (``role:doctor``, ``category:trip``, ``event:wedding``, ``expense:$N``) 

13 

14Sections remain the picker's navigation primitive; facts are the 

15precision grain queried by reflect tools (counting, temporal, entity 

16lookups). Mirrors Hindsight's ``memory_units`` schema on top of the 

17PageIndex tree. 

18 

19See: 

20- ``docs/_design/recall.md`` §14 (M12 plan) 

21- ``hindsight-api-slim/hindsight_api/engine/retain/fact_extraction.py`` 

22 for the canonical Hindsight pattern. 

23""" 

24 

25from __future__ import annotations 

26 

27import logging 

28import uuid 

29from datetime import datetime 

30from typing import TYPE_CHECKING 

31 

32from astrocyte.pipeline._json_tolerant import looks_truncated, tolerant_json_loads 

33from astrocyte.types import Message, PageIndexFact 

34 

35if TYPE_CHECKING: 

36 from astrocyte.provider import LLMProvider 

37 from astrocyte.types import PageIndexSection 

38 

39_logger = logging.getLogger("astrocyte.pipeline.section_fact_extraction") 

40 

41 

42# M25 — Hindsight-parity fact_type taxonomy. 

43# 

44# Previously (M14.1+) had `assistant_statement` as a 6th fact_type to 

45# preserve assistant phrasing for LME's single-session-assistant 

46# category. M24 bench showed −2q SSA regression: the inline source- 

47# chunk pairing made the `assistant_statement` fact text + chunk text 

48# render redundantly (both contain the assistant utterance), confusing 

49# the answerer on "what did the assistant say" extraction. 

50# 

51# Hindsight's solve (engine/retain/fact_extraction.py lines 150-345 + 

52# benchmark line 85): 

53# 1. Binary classification: fact_type ∈ {world, assistant} → mapped 

54# to {world, experience} at storage time. Speaker perspective is 

55# carried by the speaker field + per-conversation context tag, 

56# NOT by a special fact_type bucket. 

57# 2. Per-conversation perspective tag at extraction time: the prompt 

58# tells the LLM "you are the assistant in this conversation" so 

59# the extractor uses the right reference frame when classifying 

60# first-person utterances. 

61# 

62# M25 adopts this pattern: 

63# - Drop `assistant_statement` from valid fact_types. Assistant 

64# utterances are extracted as `experience` (with speaker='assistant'). 

65# - Add a perspective-tag preamble to the extraction prompt so the 

66# LLM treats the transcript as "a conversation between a user and 

67# an AI assistant". 

68# - The `speaker` field on MemoryFact preserves the perspective 

69# signal for downstream consumers; the answerer renders facts by 

70# speaker rather than by fact_type. 

71# 

72# Backward compat: if the extractor LLM still emits the pre-M25

73# fact_type 'assistant_statement', the M25 shim in

74# extract_facts_for_section remaps it to 'experience'.

# Canonical fact_type vocabulary. Any extracted fact whose (remapped)
# fact_type is not in this set is discarded by the extractor.
_VALID_FACT_TYPES = {"experience", "preference", "world", "plan", "opinion"}

# Pre-M25 tags that are still tolerated when they show up in extractor
# output, keyed by legacy tag → canonical fact_type. The only legacy tag is
# `assistant_statement`, which collapses into `experience`.
_LEGACY_FACT_TYPE_REMAP = {"assistant_statement": "experience"}

# Upper bound on facts kept per section; keeps retain cost bounded.
_MAX_FACTS_PER_SECTION = 12

90 

91 

# Prompt template for the single per-section extraction call. It is rendered
# with ``str.format``; the placeholders are ``{session_date}``,
# ``{max_facts}`` and ``{section_text}``, so every literal JSON brace in the
# template is doubled.
#
# The worked example carries a ``confidence`` value on every fact: the model
# tends to copy the example's shape, and an example without the field teaches
# it to omit the field. The confidence bands are disjoint ("0.4-0.5" tentative,
# "below 0.4" speculative) so a single score maps to a single band. The example
# transcript includes the assistant's "How long ...?" question so that the
# user's "About 6 months" answer is actually derivable from the section.
_EXTRACT_PROMPT = """\
You are extracting ATOMIC FACTS from one section of a conversation \
transcript. The transcript is a conversation between a USER and an AI \
ASSISTANT — the 'assistant' role IS the AI, the 'user' role is the \
human being talked to. The reader will query these facts directly for \
"how many X", "when did Y", "what does the user prefer for Z", "what \
did the assistant say about W" type questions.

The section's conversation date is ``{session_date}``. Anchor relative \
time phrases ("yesterday", "last week", "3 days ago") against this \
date and output absolute ISO-8601 timestamps.

Output a JSON object with one key, ``facts``, containing an array of \
fact objects. Each fact has:

- "text": SELF-CONTAINED statement that captures WHAT happened AND WHY \
 it matters / context / nuance. (Hindsight `why` parity — the answerer \
 needs the original framing, not just the bare fact.) \
 Include: subject + verb + entities + the REASON / STRENGTH / SCOPE / \
 CONDITIONS. For preferences especially: capture HOW STRONG the \
 preference is, WHY the user prefers it, and any conditions ("for X \
 use case", "compared to Y"). \
 GOOD (preference): "User strongly prefers Sony cameras for product \
 photography because they already own a Sony 24-70mm lens for their \
 candle business; would not consider switching to Canon or Nikon." \
 GOOD (experience): "User visited Dr. Patel for nasal spray prescription \
 on May 5, 2023; this was their third visit after recurring sinus \
 issues from spring allergies." \
 BAD: "Yesterday I went to the doctor" (missing date anchor, no subject) \
 BAD: "User prefers Sony" (missing reason, scope, strength — answerer \
 cannot structure recommendations around bare preference)
- "fact_type": one of:
 - "experience" — something the user did or that happened to them, \
 OR something the assistant said / recommended / explained \
 (use the speaker field to distinguish). Hindsight-parity binary \
 taxonomy: assistant utterances are NOT a separate type; they're \
 experience-typed facts whose speaker is "assistant".
 - "preference" — stable taste, opinion, or choice the user holds
 - "world" — external fact the user mentioned about a non-user entity
 - "plan" — intention, future action, goal
 - "opinion" — value judgment or stance the user expressed
- "speaker": "user" or "assistant" — who stated / did the thing the \
 fact describes. This is the PRIMARY perspective signal. Use \
 speaker="assistant" for any fact that captures what the AI said, \
 recommended, explained, or did in the conversation; speaker="user" \
 for everything the human said / did.
- "occurred_start": ISO-8601 date of when the event happened, or null \
 for non-event facts (preferences, plans, opinions, and most \
 assistant utterances that lack a specific event date)
- "occurred_end": ISO-8601 date for multi-day events, else null
- "entities": array of entity strings. Mix proper nouns ("Dr. Patel", \
 "Nordstrom", "MoMA") and key:value labels for countable categories \
 (``role:doctor``, ``category:trip``, ``event:wedding``, ``expense:$185``).
- "confidence": M27 — float 0.0-1.0 indicating how confident you are \
 in this fact. Use 1.0 for facts explicitly stated by the speaker; \
 0.6-0.8 for facts you inferred from context; 0.4-0.5 for tentative \
 / hedged claims ("might", "maybe"); below 0.4 for facts that are \
 highly speculative. The reader uses this to hedge / abstain on \
 low-confidence facts. Default 0.7 if you're unsure how to score.

Rules:
- Cap at {max_facts} facts per section. Prefer the most-specific facts.
- DO NOT emit "user mentioned X" / "they discussed Y" meta-facts — \
 only the actual atomic facts being discussed.
- DO emit facts for substantive ASSISTANT utterances: recommendations, \
 explanations, answers, advice. Use fact_type="experience" + \
 speaker="assistant". Preserve the assistant's specific substantive \
 content (the recommendation given, the answer provided, the \
 explanation offered) so the reader can quote it back when asked \
 "what did the assistant say about X" / "what did the agent recommend \
 for Y". Skip pure question-asking by the assistant (no extractable \
 content).
- If a fact says "user visited 3 doctors", emit 3 SEPARATE fact rows \
 (one per doctor), not one aggregated fact.
- Skip greetings, small talk, agentic confirmations.
- Generic chit-chat with no specific facts → ``{{"facts": []}}``

COREFERENCE + ALIASING RULES (M29):

These rules make entity strings canonical across sections so cross- \
session link expansion (M27) can stitch "Dr. Patel" in session A to \
"the ENT specialist" in session B without depending on bare-string \
equality. The same person/place/thing should produce the same entity \
token regardless of how the speaker referred to them.

1. PRONOUN RESOLUTION WITHIN SECTION: pronouns ("she", "he", "they", \
 "him", "her", "it") resolve to the most recently named entity in \
 the section. When writing the fact ``text``, substitute the \
 resolved name. \
 GOOD: "Emily said she'd be home late" → fact text: "Emily said she \
 would be home late" (with entities=["Emily"], not entities=["she"]). \
 GOOD: "Dr. Patel called. He confirmed the appointment." → fact \
 text: "Dr. Patel confirmed the appointment" (entities=["Dr. Patel", \
 "role:doctor"]). \
 BAD: emitting "she confirmed..." with no antecedent in the fact \
 text — the fact reads in isolation and the reader has no way to \
 know who "she" is.

2. ALIAS CANONICALIZATION when BOTH a generic reference AND a name \
 appear for the same referent in the section, use the canonical \
 form ``"Name (descriptor)"`` in the entities array (and prefer the \
 name in the fact text). \
 GOOD: "My roommate Emily came by. Emily then left." → \
 entities=["Emily (user's roommate)"]. \
 GOOD: "Dr. Patel walked in. The doctor checked the chart." → \
 entities=["Dr. Patel (role:doctor)", "role:doctor"]. \
 The parenthetical descriptor is what lets a future section that \
 only says "my roommate" or "the doctor" link back to the same \
 referent via the role/relation label.

3. ROLE-BASED ALIASES when only the role appears (no name in this \
 section), still include the role label in entities so cross-section \
 links can find them by role: "the doctor" → include \
 ``"role:doctor"``; "my manager" → include ``"role:manager"``; \
 "my roommate" → include ``"role:roommate"``. The fact text uses the \
 generic reference verbatim. Cross-section link expansion can then \
 join on ``role:doctor`` to surface the named "Dr. Patel" fact from \
 another session.

4. STABLE ENTITY-STRING CONVENTION: \
 - bare ``"Name"`` when the name is unambiguous in the bank \
 ("Dr. Patel", "Emily", "Nordstrom"). \
 - ``"Name (descriptor)"`` when the descriptor disambiguates two \
 referents with the same name OR when both a generic reference \
 and the name appeared in the section (per rule 2). \
 - The descriptor is a short, durable label — a relation \
 ("user's roommate"), a role ("role:doctor"), or a \
 distinguishing attribute ("Emily from Stanford") — not a \
 transient state ("Emily who was tired").

Examples:

Section: "[user] Yesterday I went to Dr. Patel for a nasal spray. \
[assistant] Have you tried the saline rinse I mentioned last visit? \
It clears post-nasal drip too. How long have you been seeing him? \
[user] About 6 months. I prefer his \
clinic over the previous one. The doctor also suggested an antihistamine."
session_date=2023-05-08

Output (note: assistant utterance is fact_type=experience + speaker=assistant; \
"the doctor" in the last user turn refers to Dr. Patel, so its fact uses \
the canonical ``"Dr. Patel (role:doctor)"`` form):
{{"facts": [
 {{"text": "User visited Dr. Patel on May 7, 2023 to get a prescribed nasal spray for ongoing sinus issues.", \
"fact_type": "experience", "speaker": "user", \
"occurred_start": "2023-05-07", "occurred_end": null, \
"entities": ["Dr. Patel (role:doctor)", "nasal spray", "role:doctor"], "confidence": 1.0}},
 {{"text": "User has been seeing Dr. Patel for about 6 months — indicates an established care relationship.", \
"fact_type": "experience", "speaker": "user", \
"occurred_start": null, "occurred_end": null, \
"entities": ["Dr. Patel (role:doctor)", "role:doctor"], "confidence": 1.0}},
 {{"text": "User prefers Dr. Patel's clinic over their previous one — preference is comparative (Patel > previous), implying dissatisfaction with the prior provider.", \
"fact_type": "preference", "speaker": "user", \
"occurred_start": null, "occurred_end": null, \
"entities": ["Dr. Patel (role:doctor)", "role:doctor"], "confidence": 0.8}},
 {{"text": "Dr. Patel also suggested an antihistamine alongside the nasal spray.", \
"fact_type": "experience", "speaker": "user", \
"occurred_start": null, "occurred_end": null, \
"entities": ["Dr. Patel (role:doctor)", "antihistamine", "role:doctor"], "confidence": 1.0}},
 {{"text": "Assistant recommended a saline rinse alongside the nasal spray because it also clears post-nasal drip; offered as complementary, not alternative, treatment.", \
"fact_type": "experience", "speaker": "assistant", \
"occurred_start": null, "occurred_end": null, \
"entities": ["saline rinse", "post-nasal drip"], "confidence": 1.0}}
]}}

OUTPUT MUST BE VALID JSON. No prose around it.

Section content:
{section_text}
"""

261 

262 

def _parse_iso_date(s: str | None) -> datetime | None:
    """Parse an LLM-emitted ISO-8601 date or datetime string, best-effort.

    Args:
        s: Candidate value taken from the model's JSON output. Anything
            that is not a non-blank string is treated as "no date".

    Returns:
        The parsed ``datetime`` (timezone-aware only when the input carried
        an offset or a ``Z`` suffix), or ``None`` when the value is missing,
        blank, not a string, or not parseable.
    """
    if not isinstance(s, str):
        return None
    # Model output frequently carries stray whitespace or a trailing newline;
    # ``fromisoformat`` rejects those, so trim before parsing.
    candidate = s.strip()
    if not candidate:
        return None
    if "T" in candidate or ":" in candidate:
        # Spell the "Z" (UTC) designator as an explicit offset so parsing
        # does not depend on the interpreter accepting "Z" directly.
        candidate = candidate.replace("Z", "+00:00")
    try:
        return datetime.fromisoformat(candidate)
    except (ValueError, TypeError):
        return None

272 

273 

def _normalize_speaker(raw_speaker: object) -> str | None:
    """Return ``"user"`` / ``"assistant"`` for a raw speaker value, else ``None``."""
    if not isinstance(raw_speaker, str):
        return None
    # Case-insensitive so "User" / "Assistant" from the model are not lost.
    speaker = raw_speaker.strip().lower()
    return speaker if speaker in {"user", "assistant"} else None


def _normalize_entities(raw_entities: object) -> list[str]:
    """Strip, drop blanks/non-strings, and dedupe case-insensitively (first casing wins)."""
    if not isinstance(raw_entities, list):
        return []
    seen: set[str] = set()
    cleaned: list[str] = []
    for item in raw_entities:
        if not isinstance(item, str):
            continue
        name = item.strip()
        key = name.casefold()
        if not name or key in seen:
            continue
        seen.add(key)
        cleaned.append(name)
    return cleaned


def _coerce_confidence(raw_conf: object, default: float = 0.7) -> float:
    """Coerce a raw confidence value into ``[0.0, 1.0]``, falling back to ``default``.

    The default is returned when the value is missing, not numeric, too large
    to convert, or NaN, so every fact carries a usable confidence score.
    """
    if raw_conf is None:
        return default
    try:
        value = float(raw_conf)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: JSON may carry an integer too large for a float.
        return default
    if value > 1.0:
        return 1.0
    if value < 0.0:
        return 0.0
    # NaN fails every comparison, so it lands on the default here.
    return value if 0.0 <= value <= 1.0 else default


async def _request_json_completion(provider: "LLMProvider", messages: list, model: str | None):
    """Issue one deterministic, JSON-mode completion with the extraction token budget."""
    return await provider.complete(
        messages=messages,
        model=model,
        max_tokens=2000,
        temperature=0.0,
        response_format={"type": "json_object"},
    )


async def extract_facts_for_section(
    provider: "LLMProvider",
    section: "PageIndexSection",
    section_text: str,
    *,
    bank_id: str,
    model: str | None = None,
) -> list[PageIndexFact]:
    """One LLM call → up to ``_MAX_FACTS_PER_SECTION`` atomic facts.

    The section text (first 6000 characters) is rendered into
    ``_EXTRACT_PROMPT`` and sent as a single JSON-mode completion. If the
    reply does not parse and does not look budget-truncated, the call is
    retried once with a stricter system reminder.

    Args:
        provider: LLM provider; only its async ``complete`` method is used.
        section: Section being processed. Its ``session_date``,
            ``document_id`` and ``line_num`` are read.
        section_text: Raw text of the section.
        bank_id: Bank identifier stamped on every returned fact.
        model: Optional model override passed through to the provider.

    Returns:
        Validated facts in model order, capped at
        ``_MAX_FACTS_PER_SECTION``. Returns ``[]`` when:
        - the section text is blank
        - the LLM call raises
        - LLM output fails to parse (after the optional retry)
        - all candidate facts violated schema (bad fact_type, missing text)

    Caller persists the returned facts via
    :meth:`PageIndexStore.save_facts`.
    """
    text = section_text.strip()
    if not text:
        return []

    session_date = section.session_date
    sess_iso = session_date.strftime("%Y-%m-%d") if session_date is not None else "unknown"
    user_message = Message(
        role="user",
        content=_EXTRACT_PROMPT.format(
            session_date=sess_iso,
            section_text=text[:6000],
            max_facts=_MAX_FACTS_PER_SECTION,
        ),
    )

    try:
        completion = await _request_json_completion(provider, [user_message], model)
    except Exception as exc:  # noqa: BLE001 - extraction is best-effort
        _logger.warning(
            "section_fact_extraction: LLM call failed doc=%s line=%d: %s",
            section.document_id,
            section.line_num,
            exc,
        )
        return []

    # Tolerant parse: handle markdown-fence wrapping / leading-prose noise
    # before giving up. On parse failure, retry once with a stricter system
    # reminder — but skip the retry when the response looks budget-truncated
    # (a retry under the same cap won't help).
    data = tolerant_json_loads(completion.text)
    if data is None and not looks_truncated(completion.text):
        strict_reminder = Message(
            role="system",
            content="Return ONLY a valid JSON object. No markdown fences. No prose.",
        )
        try:
            retry = await _request_json_completion(
                provider, [strict_reminder, user_message], model
            )
        except Exception as exc:  # noqa: BLE001 - extraction is best-effort
            _logger.warning(
                "section_fact_extraction: retry LLM call failed doc=%s line=%d: %s",
                section.document_id,
                section.line_num,
                exc,
            )
        else:
            data = tolerant_json_loads(retry.text)

    if data is None:
        _logger.warning(
            "section_fact_extraction: JSON parse failed doc=%s line=%d",
            section.document_id,
            section.line_num,
        )
        return []
    if not isinstance(data, dict):
        return []
    raw = data.get("facts") or []
    if not isinstance(raw, list) or not raw:
        return []

    # Function-scope import kept from the original code (presumably to avoid
    # an import cycle — confirm); done once here instead of once per fact.
    from astrocyte.pipeline.temporal_resolution import (  # noqa: PLC0415
        resolve_event_date,
    )

    out: list[PageIndexFact] = []
    for entry in raw:
        # The cap counts VALID facts, so malformed entries early in the
        # array cannot crowd out well-formed ones that follow.
        if len(out) >= _MAX_FACTS_PER_SECTION:
            break
        if not isinstance(entry, dict):
            continue

        # Require a real string: str(None) would otherwise produce the
        # literal fact text "None" for a JSON null.
        text_raw = entry.get("text")
        if not isinstance(text_raw, str):
            continue
        fact_text = text_raw.strip()
        fact_type = str(entry.get("fact_type", "")).strip().lower()
        speaker = _normalize_speaker(entry.get("speaker"))

        # M25 legacy compat: if the LLM emits the pre-M25
        # `assistant_statement` tag, remap to the canonical type and FORCE
        # speaker='assistant' — the perspective signal must survive the
        # type collapse even when the model sent a null or conflicting
        # speaker alongside the legacy tag.
        if fact_type in _LEGACY_FACT_TYPE_REMAP:
            fact_type = _LEGACY_FACT_TYPE_REMAP[fact_type]
            speaker = "assistant"

        if not fact_text or fact_type not in _VALID_FACT_TYPES:
            continue

        out.append(
            PageIndexFact(
                id=str(uuid.uuid4()),
                bank_id=bank_id,
                document_id=section.document_id,
                line_num=section.line_num,
                text=fact_text,
                fact_type=fact_type,
                speaker=speaker,
                occurred_start=_parse_iso_date(entry.get("occurred_start")),
                occurred_end=_parse_iso_date(entry.get("occurred_end")),
                entities=_normalize_entities(entry.get("entities")),
                embedding=None,  # embeddings batched separately at retain time
                # M27 / M31b — always a float; defaults to 0.7 when the model
                # omits or garbles the field, out-of-range values clamp.
                confidence_score=_coerce_confidence(entry.get("confidence")),
                # M27 — `mentioned_at` is the session's date (when the
                # conversation happened), distinct from `occurred_start`
                # (when the event happened).
                mentioned_at=session_date,
                # M31 Fix 4 — relative date phrases in the fact text are
                # resolved against the section's session_date at retain time.
                event_date=resolve_event_date(fact_text, session_date),
            )
        )
    return out