Coverage for astrocyte/pipeline/section_fact_extraction.py: 91%
94 statements
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
1"""M12.1 — per-section fact extraction.
3Each section's raw text is exploded into a list of atomic facts via
4one LLM call. Each fact carries:
6- ``text``: self-contained statement ("User visited Dr. Patel on May 5")
7- ``fact_type``: ``experience | preference | world | plan | opinion``
8- ``speaker``: ``user | assistant``
9- ``occurred_start`` / ``occurred_end``: anchored to ``session_date`` for
10 relative phrases ("yesterday" → session - 1)
11- ``entities``: proper nouns + key:value labels from the M10.2 vocab
12 (``role:doctor``, ``category:trip``, ``event:wedding``, ``expense:$N``)
14Sections remain the picker's navigation primitive; facts are the
15precision grain queried by reflect tools (counting, temporal, entity
16lookups). Mirrors Hindsight's ``memory_units`` schema on top of the
17PageIndex tree.
19See:
20- ``docs/_design/recall.md`` §14 (M12 plan)
21- ``hindsight-api-slim/hindsight_api/engine/retain/fact_extraction.py``
22 for the canonical Hindsight pattern.
23"""
25from __future__ import annotations
27import logging
28import uuid
29from datetime import datetime
30from typing import TYPE_CHECKING
32from astrocyte.pipeline._json_tolerant import looks_truncated, tolerant_json_loads
33from astrocyte.types import Message, PageIndexFact
35if TYPE_CHECKING:
36 from astrocyte.provider import LLMProvider
37 from astrocyte.types import PageIndexSection
39_logger = logging.getLogger("astrocyte.pipeline.section_fact_extraction")
42# M25 — Hindsight-parity fact_type taxonomy.
43#
44# Previously (M14.1+) had `assistant_statement` as a 6th fact_type to
45# preserve assistant phrasing for LME's single-session-assistant
46# category. M24 bench showed −2q SSA regression: the inline source-
47# chunk pairing made the `assistant_statement` fact text + chunk text
48# render redundantly (both contain the assistant utterance), confusing
49# the answerer on "what did the assistant say" extraction.
50#
51# Hindsight's solve (engine/retain/fact_extraction.py lines 150-345 +
52# benchmark line 85):
53# 1. Binary classification: fact_type ∈ {world, assistant} → mapped
54# to {world, experience} at storage time. Speaker perspective is
55# carried by the speaker field + per-conversation context tag,
56# NOT by a special fact_type bucket.
57# 2. Per-conversation perspective tag at extraction time: the prompt
58# tells the LLM "you are the assistant in this conversation" so
59# the extractor uses the right reference frame when classifying
60# first-person utterances.
61#
62# M25 adopts this pattern:
63# - Drop `assistant_statement` from valid fact_types. Assistant
64# utterances are extracted as `experience` (with speaker='assistant').
65# - Add a perspective-tag preamble to the extraction prompt so the
66# LLM treats the transcript as "a conversation between a user and
67# an AI assistant".
68# - The `speaker` field on MemoryFact preserves the perspective
69# signal for downstream consumers; the answerer renders facts by
70# speaker rather than by fact_type.
71#
72# Backward compat: legacy rows with fact_type='assistant_statement'
73# are accepted on read via the M25 shim in extract_facts_for_section
74# (mapped to 'experience' if the LLM emits the legacy tag).
75_VALID_FACT_TYPES = {
76 "experience",
77 "preference",
78 "world",
79 "plan",
80 "opinion",
81}
83# Legacy fact_types accepted on read but remapped to canonical
84# values. Maps the pre-M25 `assistant_statement` to `experience` —
85# matches Hindsight's storage-time mapping.
86_LEGACY_FACT_TYPE_REMAP = {
87 "assistant_statement": "experience",
88}
89_MAX_FACTS_PER_SECTION = 12 # cap to keep retain cost bounded
92_EXTRACT_PROMPT = """\
93You are extracting ATOMIC FACTS from one section of a conversation \
94transcript. The transcript is a conversation between a USER and an AI \
95ASSISTANT — the 'assistant' role IS the AI, the 'user' role is the \
96human being talked to. The reader will query these facts directly for \
97"how many X", "when did Y", "what does the user prefer for Z", "what \
98did the assistant say about W" type questions.
100The section's conversation date is ``{session_date}``. Anchor relative \
101time phrases ("yesterday", "last week", "3 days ago") against this \
102date and output absolute ISO-8601 timestamps.
104Output a JSON object with one key, ``facts``, containing an array of \
105fact objects. Each fact has:
107- "text": SELF-CONTAINED statement that captures WHAT happened AND WHY \
108 it matters / context / nuance. (Hindsight `why` parity — the answerer \
109 needs the original framing, not just the bare fact.) \
110 Include: subject + verb + entities + the REASON / STRENGTH / SCOPE / \
111 CONDITIONS. For preferences especially: capture HOW STRONG the \
112 preference is, WHY the user prefers it, and any conditions ("for X \
113 use case", "compared to Y"). \
114 GOOD (preference): "User strongly prefers Sony cameras for product \
115 photography because they already own a Sony 24-70mm lens for their \
116 candle business; would not consider switching to Canon or Nikon." \
117 GOOD (experience): "User visited Dr. Patel for nasal spray prescription \
118 on May 5, 2023; this was their third visit after recurring sinus \
119 issues from spring allergies." \
120 BAD: "Yesterday I went to the doctor" (missing date anchor, no subject) \
121 BAD: "User prefers Sony" (missing reason, scope, strength — answerer \
122 cannot structure recommendations around bare preference)
123- "fact_type": one of:
124 - "experience" — something the user did or that happened to them, \
125 OR something the assistant said / recommended / explained \
126 (use the speaker field to distinguish). Hindsight-parity binary \
127 taxonomy: assistant utterances are NOT a separate type; they're \
128 experience-typed facts whose speaker is "assistant".
129 - "preference" — stable taste, opinion, or choice the user holds
130 - "world" — external fact the user mentioned about a non-user entity
131 - "plan" — intention, future action, goal
132 - "opinion" — value judgment or stance the user expressed
133- "speaker": "user" or "assistant" — who stated / did the thing the \
134 fact describes. This is the PRIMARY perspective signal. Use \
135 speaker="assistant" for any fact that captures what the AI said, \
136 recommended, explained, or did in the conversation; speaker="user" \
137 for everything the human said / did.
138- "occurred_start": ISO-8601 date of when the event happened, or null \
139 for non-event facts (preferences, plans, opinions, and most \
140 assistant utterances that lack a specific event date)
141- "occurred_end": ISO-8601 date for multi-day events, else null
142- "entities": array of entity strings. Mix proper nouns ("Dr. Patel", \
143 "Nordstrom", "MoMA") and key:value labels for countable categories \
144 (``role:doctor``, ``category:trip``, ``event:wedding``, ``expense:$185``).
145- "confidence": M27 — float 0.0-1.0 indicating how confident you are \
146 in this fact. Use 1.0 for facts explicitly stated by the speaker; \
147 0.6-0.8 for facts you inferred from context; 0.4-0.5 for tentative \
148 / hedged claims ("might", "maybe"); below 0.5 for facts that are \
149 highly speculative. The reader uses this to hedge / abstain on \
150 low-confidence facts. Default 0.7 if you're unsure how to score.
152Rules:
153- Cap at {max_facts} facts per section. Prefer the most-specific facts.
154- DO NOT emit "user mentioned X" / "they discussed Y" meta-facts — \
155 only the actual atomic facts being discussed.
156- DO emit facts for substantive ASSISTANT utterances: recommendations, \
157 explanations, answers, advice. Use fact_type="experience" + \
158 speaker="assistant". Preserve the assistant's specific substantive \
159 content (the recommendation given, the answer provided, the \
160 explanation offered) so the reader can quote it back when asked \
161 "what did the assistant say about X" / "what did the agent recommend \
162 for Y". Skip pure question-asking by the assistant (no extractable \
163 content).
164- If a fact says "user visited 3 doctors", emit 3 SEPARATE fact rows \
165 (one per doctor), not one aggregated fact.
166- Skip greetings, small talk, agentic confirmations.
167- Generic chit-chat with no specific facts → ``{{"facts": []}}``
169COREFERENCE + ALIASING RULES (M29):
171These rules make entity strings canonical across sections so cross- \
172session link expansion (M27) can stitch "Dr. Patel" in session A to \
173"the ENT specialist" in session B without depending on bare-string \
174equality. The same person/place/thing should produce the same entity \
175token regardless of how the speaker referred to them.
1771. PRONOUN RESOLUTION WITHIN SECTION: pronouns ("she", "he", "they", \
178 "him", "her", "it") resolve to the most recently named entity in \
179 the section. When writing the fact ``text``, substitute the \
180 resolved name. \
181 GOOD: "Emily said she'd be home late" → fact text: "Emily said she \
182 would be home late" (with entities=["Emily"], not entities=["she"]). \
183 GOOD: "Dr. Patel called. He confirmed the appointment." → fact \
184 text: "Dr. Patel confirmed the appointment" (entities=["Dr. Patel", \
185 "role:doctor"]). \
186 BAD: emitting "she confirmed..." with no antecedent in the fact \
187 text — the fact reads in isolation and the reader has no way to \
188 know who "she" is.
1902. ALIAS CANONICALIZATION when BOTH a generic reference AND a name \
191 appear for the same referent in the section, use the canonical \
192 form ``"Name (descriptor)"`` in the entities array (and prefer the \
193 name in the fact text). \
194 GOOD: "My roommate Emily came by. Emily then left." → \
195 entities=["Emily (user's roommate)"]. \
196 GOOD: "Dr. Patel walked in. The doctor checked the chart." → \
197 entities=["Dr. Patel (role:doctor)", "role:doctor"]. \
198 The parenthetical descriptor is what lets a future section that \
199 only says "my roommate" or "the doctor" link back to the same \
200 referent via the role/relation label.
2023. ROLE-BASED ALIASES when only the role appears (no name in this \
203 section), still include the role label in entities so cross-section \
204 links can find them by role: "the doctor" → include \
205 ``"role:doctor"``; "my manager" → include ``"role:manager"``; \
206 "my roommate" → include ``"role:roommate"``. The fact text uses the \
207 generic reference verbatim. Cross-section link expansion can then \
208 join on ``role:doctor`` to surface the named "Dr. Patel" fact from \
209 another session.
2114. STABLE ENTITY-STRING CONVENTION: \
212 - bare ``"Name"`` when the name is unambiguous in the bank \
213 ("Dr. Patel", "Emily", "Nordstrom"). \
214 - ``"Name (descriptor)"`` when the descriptor disambiguates two \
215 referents with the same name OR when both a generic reference \
216 and the name appeared in the section (per rule 2). \
217 - The descriptor is a short, durable label — a relation \
218 ("user's roommate"), a role ("role:doctor"), or a \
219 distinguishing attribute ("Emily from Stanford") — not a \
220 transient state ("Emily who was tired").
222Examples:
224Section: "[user] Yesterday I went to Dr. Patel for a nasal spray. \
225[assistant] Have you tried the saline rinse I mentioned last visit? \
226It clears post-nasal drip too. [user] About 6 months. I prefer his \
227clinic over the previous one. The doctor also suggested an antihistamine."
228session_date=2023-05-08
230Output (note: assistant utterance is fact_type=experience + speaker=assistant; \
231"the doctor" in the last user turn refers to Dr. Patel, so its fact uses \
232the canonical ``"Dr. Patel (role:doctor)"`` form):
233{{"facts": [
234 {{"text": "User visited Dr. Patel on May 7, 2023 to get a prescribed nasal spray for ongoing sinus issues.", \
235"fact_type": "experience", "speaker": "user", \
236"occurred_start": "2023-05-07", "occurred_end": null, \
237"entities": ["Dr. Patel (role:doctor)", "nasal spray", "role:doctor"]}},
238 {{"text": "User has been seeing Dr. Patel for about 6 months — indicates an established care relationship.", \
239"fact_type": "experience", "speaker": "user", \
240"occurred_start": null, "occurred_end": null, \
241"entities": ["Dr. Patel (role:doctor)", "role:doctor"]}},
242 {{"text": "User prefers Dr. Patel's clinic over their previous one — preference is comparative (Patel > previous), implying dissatisfaction with the prior provider.", \
243"fact_type": "preference", "speaker": "user", \
244"occurred_start": null, "occurred_end": null, \
245"entities": ["Dr. Patel (role:doctor)", "role:doctor"]}},
246 {{"text": "Dr. Patel also suggested an antihistamine alongside the nasal spray.", \
247"fact_type": "experience", "speaker": "user", \
248"occurred_start": null, "occurred_end": null, \
249"entities": ["Dr. Patel (role:doctor)", "antihistamine", "role:doctor"]}},
250 {{"text": "Assistant recommended a saline rinse alongside the nasal spray because it also clears post-nasal drip; offered as complementary, not alternative, treatment.", \
251"fact_type": "experience", "speaker": "assistant", \
252"occurred_start": null, "occurred_end": null, \
253"entities": ["saline rinse", "post-nasal drip"]}}
254]}}
256OUTPUT MUST BE VALID JSON. No prose around it.
258Section content:
259{section_text}
260"""
263def _parse_iso_date(s: str | None) -> datetime | None:
264 if not s or not isinstance(s, str):
265 return None
266 try:
267 if "T" in s or ":" in s:
268 return datetime.fromisoformat(s.replace("Z", "+00:00"))
269 return datetime.fromisoformat(s)
270 except (ValueError, TypeError):
271 return None
def _normalize_speaker(raw: object) -> str | None:
    """Return ``"user"`` / ``"assistant"`` for a recognised speaker value, else ``None``."""
    if not isinstance(raw, str):
        return None
    speaker = raw.strip().lower()
    return speaker if speaker in {"user", "assistant"} else None


def _clean_entities(raw: object) -> list[str]:
    """Strip entity labels, drop blanks / non-strings, dedupe case-insensitively.

    The first-seen casing of each label is the one that is kept.
    """
    if not isinstance(raw, list):
        return []
    seen: set[str] = set()
    cleaned: list[str] = []
    for item in raw:
        if not isinstance(item, str):
            continue
        label = item.strip()
        key = label.casefold()
        if not label or key in seen:
            continue
        seen.add(key)
        cleaned.append(label)
    return cleaned


def _coerce_confidence(raw: object, default: float = 0.7) -> float:
    """Convert the model's ``confidence`` value to a float clamped to [0.0, 1.0].

    Missing, non-numeric and NaN values fall back to ``default`` so every
    fact carries a usable score.
    """
    if raw is None:
        return default
    try:
        value = float(raw)
    except (TypeError, ValueError):
        return default
    if value != value:  # NaN is the only float that is unequal to itself
        return default
    return min(1.0, max(0.0, value))


async def extract_facts_for_section(
    provider: "LLMProvider",
    section: "PageIndexSection",
    section_text: str,
    *,
    bank_id: str,
    model: str | None = None,
) -> list[PageIndexFact]:
    """One LLM call → up to ``_MAX_FACTS_PER_SECTION`` atomic facts.

    The section text is rendered into ``_EXTRACT_PROMPT`` and sent to
    ``provider.complete``. If the reply cannot be parsed as JSON and does
    not look truncated, the call is retried once with a stricter system
    message. Each entry of the returned ``facts`` array is then validated
    and converted to a :class:`PageIndexFact`.

    Args:
        provider: LLM provider; only its awaitable ``complete`` method is used.
        section: Section the text belongs to. ``document_id``, ``line_num``
            and ``session_date`` are read from it.
        section_text: Raw text of the section. Only the first 6000
            characters (after stripping) are sent to the model.
        bank_id: Bank identifier stamped on every returned fact.
        model: Optional model name forwarded to the provider.

    Returns:
        The validated facts, at most ``_MAX_FACTS_PER_SECTION`` of them.
        Returns ``[]`` when:

        - the section text is blank
        - the LLM call raises (logged as a warning)
        - LLM output fails to parse (logged as a warning)
        - the payload is not an object with a ``facts`` list
        - section is generic chit-chat with no specific facts
        - all candidate facts violated schema (bad fact_type, missing text)

    Caller persists the returned facts via
    :meth:`PageIndexStore.save_facts`.
    """
    text = section_text.strip()
    if not text:
        return []
    session_date = section.session_date
    sess_iso = session_date.strftime("%Y-%m-%d") if session_date is not None else "unknown"
    msg = _EXTRACT_PROMPT.format(
        session_date=sess_iso,
        section_text=text[:6000],
        max_facts=_MAX_FACTS_PER_SECTION,
    )
    try:
        completion = await provider.complete(
            messages=[Message(role="user", content=msg)],
            model=model,
            max_tokens=2000,
            temperature=0.0,
            response_format={"type": "json_object"},
        )
    except Exception as exc:  # noqa: BLE001
        # Best-effort extraction: a provider failure costs this section's
        # facts but must not abort the caller.
        _logger.warning(
            "section_fact_extraction: LLM call failed doc=%s line=%d: %s",
            section.document_id,
            section.line_num,
            exc,
        )
        return []

    # Tolerant parse: handle markdown-fence wrapping / leading-prose noise
    # before giving up. On parse failure, retry once with a stricter
    # system reminder — but skip the retry when the response looks
    # budget-truncated (a retry under the same cap won't help).
    data = tolerant_json_loads(completion.text)
    if data is None and not looks_truncated(completion.text):
        try:
            retry = await provider.complete(
                messages=[
                    Message(
                        role="system",
                        content=(
                            "Return ONLY a valid JSON object. "
                            "No markdown fences. No prose."
                        ),
                    ),
                    Message(role="user", content=msg),
                ],
                model=model,
                max_tokens=2000,
                temperature=0.0,
                response_format={"type": "json_object"},
            )
        except Exception as exc:  # noqa: BLE001
            _logger.warning(
                "section_fact_extraction: retry LLM call failed doc=%s line=%d: %s",
                section.document_id,
                section.line_num,
                exc,
            )
            retry = None
        if retry is not None:
            data = tolerant_json_loads(retry.text)
    if data is None:
        _logger.warning(
            "section_fact_extraction: JSON parse failed doc=%s line=%d",
            section.document_id,
            section.line_num,
        )
        return []
    if not isinstance(data, dict):
        return []
    raw = data.get("facts") or []
    if not isinstance(raw, list) or not raw:
        return []

    # Function-local import, as in the original code, but resolved once
    # per call instead of once per candidate fact.
    from astrocyte.pipeline.temporal_resolution import (  # noqa: PLC0415
        resolve_event_date,
    )

    out: list[PageIndexFact] = []
    for entry in raw:
        # The cap counts ACCEPTED facts, so malformed leading entries do
        # not crowd out valid ones further down the array.
        if len(out) >= _MAX_FACTS_PER_SECTION:
            break
        if not isinstance(entry, dict):
            continue
        raw_text = entry.get("text")
        raw_type = entry.get("fact_type")
        # Require real strings: ``str(None)`` would otherwise produce a
        # fact whose text is the literal word "None".
        if not isinstance(raw_text, str) or not isinstance(raw_type, str):
            continue
        fact_text = raw_text.strip()
        fact_type = raw_type.strip().lower()
        speaker = _normalize_speaker(entry.get("speaker"))
        # M25 legacy compat: if the LLM emits the pre-M25
        # `assistant_statement` tag (model trained on old prompt OR
        # legacy ingest replay), remap to canonical (`experience` +
        # speaker='assistant') to match Hindsight's storage shape.
        if fact_type in _LEGACY_FACT_TYPE_REMAP:
            fact_type = _LEGACY_FACT_TYPE_REMAP[fact_type]
            # Force speaker='assistant' — the perspective signal must
            # survive the type collapse even when the model sent
            # ``"speaker": null`` or a contradictory value.
            speaker = "assistant"
        if not fact_text or fact_type not in _VALID_FACT_TYPES:
            continue

        # M27 / M31b Fix C — every fact gets a confidence score; a missing
        # or malformed value defaults to 0.7 (the prompt's own default) and
        # out-of-bounds values clamp, so the answerer's confidence-aware
        # abstention rule always has a value to act on.
        confidence_score = _coerce_confidence(entry.get("confidence"))

        # M31 Fix 4 — resolve relative date phrases in the fact's text
        # to an absolute datetime at retain time, anchored on the
        # section's ``session_date``. Distinct from ``occurred_start``
        # (LLM-emitted explicit range) and ``mentioned_at``
        # (session-level discussion date).
        event_date = resolve_event_date(fact_text, session_date)

        out.append(
            PageIndexFact(
                id=str(uuid.uuid4()),
                bank_id=bank_id,
                document_id=section.document_id,
                line_num=section.line_num,
                text=fact_text,
                fact_type=fact_type,
                speaker=speaker,
                occurred_start=_parse_iso_date(entry.get("occurred_start")),
                occurred_end=_parse_iso_date(entry.get("occurred_end")),
                entities=_clean_entities(entry.get("entities")),
                embedding=None,  # embeddings batched separately at retain time
                confidence_score=confidence_score,
                # M27 — `mentioned_at` is the session's date (when the
                # conversation happened), distinct from `occurred_start`
                # (when the event happened).
                mentioned_at=session_date,
                event_date=event_date,  # M31 Fix 4
            )
        )
    return out