Coverage for astrocyte/pipeline/section_fact

1"""M12.1 — per-section fact extraction.

3Each section's raw text is exploded into a list of atomic facts via

4one LLM call. Each fact carries:

6- ``text``: self-contained statement ("User visited Dr. Patel on May 5")

7- ``fact_type``: ``experience | preference | world | plan | opinion``

8- ``speaker``: ``user | assistant``

9- ``occurred_start`` / ``occurred_end``: anchored to ``session_date`` for

10 relative phrases ("yesterday" → session - 1)

11- ``entities``: proper nouns + key:value labels from the M10.2 vocab

12 (``role:doctor``, ``category:trip``, ``event:wedding``, ``expense:$N``)

14Sections remain the picker's navigation primitive; facts are the

15precision grain queried by reflect tools (counting, temporal, entity

16lookups). Mirrors Hindsight's ``memory_units`` schema on top of the

17PageIndex tree.

19See:

20- ``docs/_design/recall.md`` §14 (M12 plan)

21- ``hindsight-api-slim/hindsight_api/engine/retain/fact_extraction.py``

22 for the canonical Hindsight pattern.

23"""

25from __future__ import annotations

27import logging

28import uuid

29from datetime import datetime

30from typing import TYPE_CHECKING

32from astrocyte.pipeline._json_tolerant import looks_truncated, tolerant_json_loads

33from astrocyte.types import Message, PageIndexFact

35if TYPE_CHECKING:

36 from astrocyte.provider import LLMProvider

37 from astrocyte.types import PageIndexSection

39_logger = logging.getLogger("astrocyte.pipeline.section_fact_extraction")

42# M25 — Hindsight-parity fact_type taxonomy.

43#

44# Previously (M14.1+) had `assistant_statement` as a 6th fact_type to

45# preserve assistant phrasing for LME's single-session-assistant

46# category. M24 bench showed −2q SSA regression: the inline source-

47# chunk pairing made the `assistant_statement` fact text + chunk text

48# render redundantly (both contain the assistant utterance), confusing

49# the answerer on "what did the assistant say" extraction.

50#

51# Hindsight's solve (engine/retain/fact_extraction.py lines 150-345 +

52# benchmark line 85):

53# 1. Binary classification: fact_type ∈ {world, assistant} → mapped

54# to {world, experience} at storage time. Speaker perspective is

55# carried by the speaker field + per-conversation context tag,

56# NOT by a special fact_type bucket.

57# 2. Per-conversation perspective tag at extraction time: the prompt

58# tells the LLM "you are the assistant in this conversation" so

59# the extractor uses the right reference frame when classifying

60# first-person utterances.

61#

62# M25 adopts this pattern:

63# - Drop `assistant_statement` from valid fact_types. Assistant

64# utterances are extracted as `experience` (with speaker='assistant').

65# - Add a perspective-tag preamble to the extraction prompt so the

66# LLM treats the transcript as "a conversation between a user and

67# an AI assistant".

68# - The `speaker` field on MemoryFact preserves the perspective

69# signal for downstream consumers; the answerer renders facts by

70# speaker rather than by fact_type.

71#

72# Backward compat: legacy rows with fact_type='assistant_statement'

73# are accepted on read via the M25 shim in extract_facts_for_section

74# (mapped to 'experience' if the LLM emits the legacy tag).

75_VALID_FACT_TYPES = {

76 "experience",

77 "preference",

78 "world",

79 "plan",

80 "opinion",

81}

83# Legacy fact_types accepted on read but remapped to canonical

84# values. Maps the pre-M25 `assistant_statement` to `experience` —

85# matches Hindsight's storage-time mapping.

86_LEGACY_FACT_TYPE_REMAP = {

87 "assistant_statement": "experience",

88}

89_MAX_FACTS_PER_SECTION = 12 # cap to keep retain cost bounded

92_EXTRACT_PROMPT = """\

93You are extracting ATOMIC FACTS from one section of a conversation \

94transcript. The transcript is a conversation between a USER and an AI \

95ASSISTANT — the 'assistant' role IS the AI, the 'user' role is the \

96human being talked to. The reader will query these facts directly for \

97"how many X", "when did Y", "what does the user prefer for Z", "what \

98did the assistant say about W" type questions.

100The section's conversation date is ``{session_date}``. Anchor relative \

101time phrases ("yesterday", "last week", "3 days ago") against this \

102date and output absolute ISO-8601 timestamps.

103

104Output a JSON object with one key, ``facts``, containing an array of \

105fact objects. Each fact has:

106

107- "text": SELF-CONTAINED statement that captures WHAT happened AND WHY \

108 it matters / context / nuance. (Hindsight `why` parity — the answerer \

109 needs the original framing, not just the bare fact.) \

110 Include: subject + verb + entities + the REASON / STRENGTH / SCOPE / \

111 CONDITIONS. For preferences especially: capture HOW STRONG the \

112 preference is, WHY the user prefers it, and any conditions ("for X \

113 use case", "compared to Y"). \

114 GOOD (preference): "User strongly prefers Sony cameras for product \

115 photography because they already own a Sony 24-70mm lens for their \

116 candle business; would not consider switching to Canon or Nikon." \

117 GOOD (experience): "User visited Dr. Patel for nasal spray prescription \

118 on May 5, 2023; this was their third visit after recurring sinus \

119 issues from spring allergies." \

120 BAD: "Yesterday I went to the doctor" (missing date anchor, no subject) \

121 BAD: "User prefers Sony" (missing reason, scope, strength — answerer \

122 cannot structure recommendations around bare preference)

123- "fact_type": one of:

124 - "experience" — something the user did or that happened to them, \

125 OR something the assistant said / recommended / explained \

126 (use the speaker field to distinguish). Hindsight-parity binary \

127 taxonomy: assistant utterances are NOT a separate type; they're \

128 experience-typed facts whose speaker is "assistant".

129 - "preference" — stable taste, opinion, or choice the user holds

130 - "world" — external fact the user mentioned about a non-user entity

131 - "plan" — intention, future action, goal

132 - "opinion" — value judgment or stance the user expressed

133- "speaker": "user" or "assistant" — who stated / did the thing the \

134 fact describes. This is the PRIMARY perspective signal. Use \

135 speaker="assistant" for any fact that captures what the AI said, \

136 recommended, explained, or did in the conversation; speaker="user" \

137 for everything the human said / did.

138- "occurred_start": ISO-8601 date of when the event happened, or null \

139 for non-event facts (preferences, plans, opinions, and most \

140 assistant utterances that lack a specific event date)

141- "occurred_end": ISO-8601 date for multi-day events, else null

142- "entities": array of entity strings. Mix proper nouns ("Dr. Patel", \

143 "Nordstrom", "MoMA") and key:value labels for countable categories \

144 (``role:doctor``, ``category:trip``, ``event:wedding``, ``expense:$185``).

145- "confidence": M27 — float 0.0-1.0 indicating how confident you are \

146 in this fact. Use 1.0 for facts explicitly stated by the speaker; \

147 0.6-0.8 for facts you inferred from context; 0.4-0.5 for tentative \

148 / hedged claims ("might", "maybe"); below 0.5 for facts that are \

149 highly speculative. The reader uses this to hedge / abstain on \

150 low-confidence facts. Default 0.7 if you're unsure how to score.

151

152Rules:

153- Cap at {max_facts} facts per section. Prefer the most-specific facts.

154- DO NOT emit "user mentioned X" / "they discussed Y" meta-facts — \

155 only the actual atomic facts being discussed.

156- DO emit facts for substantive ASSISTANT utterances: recommendations, \

157 explanations, answers, advice. Use fact_type="experience" + \

158 speaker="assistant". Preserve the assistant's specific substantive \

159 content (the recommendation given, the answer provided, the \

160 explanation offered) so the reader can quote it back when asked \

161 "what did the assistant say about X" / "what did the agent recommend \

162 for Y". Skip pure question-asking by the assistant (no extractable \

163 content).

164- If a fact says "user visited 3 doctors", emit 3 SEPARATE fact rows \

165 (one per doctor), not one aggregated fact.

166- Skip greetings, small talk, agentic confirmations.

167- Generic chit-chat with no specific facts → ``{{"facts": []}}``

168

169COREFERENCE + ALIASING RULES (M29):

170

171These rules make entity strings canonical across sections so cross- \

172session link expansion (M27) can stitch "Dr. Patel" in session A to \

173"the ENT specialist" in session B without depending on bare-string \

174equality. The same person/place/thing should produce the same entity \

175token regardless of how the speaker referred to them.

176

1771. PRONOUN RESOLUTION WITHIN SECTION: pronouns ("she", "he", "they", \

178 "him", "her", "it") resolve to the most recently named entity in \

179 the section. When writing the fact ``text``, substitute the \

180 resolved name. \

181 GOOD: "Emily said she'd be home late" → fact text: "Emily said she \

182 would be home late" (with entities=["Emily"], not entities=["she"]). \

183 GOOD: "Dr. Patel called. He confirmed the appointment." → fact \

184 text: "Dr. Patel confirmed the appointment" (entities=["Dr. Patel", \

185 "role:doctor"]). \

186 BAD: emitting "she confirmed..." with no antecedent in the fact \

187 text — the fact reads in isolation and the reader has no way to \

188 know who "she" is.

189

1902. ALIAS CANONICALIZATION when BOTH a generic reference AND a name \

191 appear for the same referent in the section, use the canonical \

192 form ``"Name (descriptor)"`` in the entities array (and prefer the \

193 name in the fact text). \

194 GOOD: "My roommate Emily came by. Emily then left." → \

195 entities=["Emily (user's roommate)"]. \

196 GOOD: "Dr. Patel walked in. The doctor checked the chart." → \

197 entities=["Dr. Patel (role:doctor)", "role:doctor"]. \

198 The parenthetical descriptor is what lets a future section that \

199 only says "my roommate" or "the doctor" link back to the same \

200 referent via the role/relation label.

201

2023. ROLE-BASED ALIASES when only the role appears (no name in this \

203 section), still include the role label in entities so cross-section \

204 links can find them by role: "the doctor" → include \

205 ``"role:doctor"``; "my manager" → include ``"role:manager"``; \

206 "my roommate" → include ``"role:roommate"``. The fact text uses the \

207 generic reference verbatim. Cross-section link expansion can then \

208 join on ``role:doctor`` to surface the named "Dr. Patel" fact from \

209 another session.

210

2114. STABLE ENTITY-STRING CONVENTION: \

212 - bare ``"Name"`` when the name is unambiguous in the bank \

213 ("Dr. Patel", "Emily", "Nordstrom"). \

214 - ``"Name (descriptor)"`` when the descriptor disambiguates two \

215 referents with the same name OR when both a generic reference \

216 and the name appeared in the section (per rule 2). \

217 - The descriptor is a short, durable label — a relation \

218 ("user's roommate"), a role ("role:doctor"), or a \

219 distinguishing attribute ("Emily from Stanford") — not a \

220 transient state ("Emily who was tired").

221

222Examples:

223

224Section: "[user] Yesterday I went to Dr. Patel for a nasal spray. \

225[assistant] Have you tried the saline rinse I mentioned last visit? \

226It clears post-nasal drip too. [user] About 6 months. I prefer his \

227clinic over the previous one. The doctor also suggested an antihistamine."

228session_date=2023-05-08

229

230Output (note: assistant utterance is fact_type=experience + speaker=assistant; \

231"the doctor" in the last user turn refers to Dr. Patel, so its fact uses \

232the canonical ``"Dr. Patel (role:doctor)"`` form):

233{{"facts": [

234 {{"text": "User visited Dr. Patel on May 7, 2023 to get a prescribed nasal spray for ongoing sinus issues.", \

235"fact_type": "experience", "speaker": "user", \

236"occurred_start": "2023-05-07", "occurred_end": null, \

237"entities": ["Dr. Patel (role:doctor)", "nasal spray", "role:doctor"]}},

238 {{"text": "User has been seeing Dr. Patel for about 6 months — indicates an established care relationship.", \

239"fact_type": "experience", "speaker": "user", \

240"occurred_start": null, "occurred_end": null, \

241"entities": ["Dr. Patel (role:doctor)", "role:doctor"]}},

242 {{"text": "User prefers Dr. Patel's clinic over their previous one — preference is comparative (Patel > previous), implying dissatisfaction with the prior provider.", \

243"fact_type": "preference", "speaker": "user", \

244"occurred_start": null, "occurred_end": null, \

245"entities": ["Dr. Patel (role:doctor)", "role:doctor"]}},

246 {{"text": "Dr. Patel also suggested an antihistamine alongside the nasal spray.", \

247"fact_type": "experience", "speaker": "user", \

248"occurred_start": null, "occurred_end": null, \

249"entities": ["Dr. Patel (role:doctor)", "antihistamine", "role:doctor"]}},

250 {{"text": "Assistant recommended a saline rinse alongside the nasal spray because it also clears post-nasal drip; offered as complementary, not alternative, treatment.", \

251"fact_type": "experience", "speaker": "assistant", \

252"occurred_start": null, "occurred_end": null, \

253"entities": ["saline rinse", "post-nasal drip"]}}

254]}}

255

256OUTPUT MUST BE VALID JSON. No prose around it.

257

258Section content:

259{section_text}

260"""

261

262

263def _parse_iso_date(s: str | None) -> datetime | None:

264 if not s or not isinstance(s, str):

265 return None

266 try:

267 if "T" in s or ":" in s:

268 return datetime.fromisoformat(s.replace("Z", "+00:00"))

269 return datetime.fromisoformat(s)

270 except (ValueError, TypeError):

271 return None

272

273

def _coerce_confidence(raw_conf: object, default: float = 0.7) -> float:
    """Turn an LLM-emitted confidence into a float in ``[0.0, 1.0]``.

    Missing, malformed, non-finite-NaN, or unconvertible values fall back
    to *default* so every fact carries a confidence the answerer can hedge
    against; out-of-range values are clamped.
    """
    if raw_conf is None:
        return default
    try:
        value = float(raw_conf)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: a huge JSON integer that cannot be represented
        # as a float must not abort extraction for the whole section.
        return default
    if value != value:  # NaN compares unequal to itself
        return default
    return min(1.0, max(0.0, value))


def _normalize_speaker(raw_speaker: object) -> str | None:
    """Return ``"user"`` / ``"assistant"`` or ``None`` for anything else."""
    if not isinstance(raw_speaker, str):
        return None
    speaker = raw_speaker.strip()
    return speaker if speaker in {"user", "assistant"} else None


def _clean_entities(raw_entities: object) -> list[str]:
    """Strip, drop blanks/non-strings, and dedupe entities case-insensitively.

    The first-seen casing of each entity is preserved.
    """
    if not isinstance(raw_entities, list):
        return []
    seen: set[str] = set()
    unique: list[str] = []
    for item in raw_entities:
        if not isinstance(item, str):
            continue
        name = item.strip()
        key = name.casefold()
        if not name or key in seen:
            continue
        seen.add(key)
        unique.append(name)
    return unique


async def extract_facts_for_section(
    provider: "LLMProvider",
    section: "PageIndexSection",
    section_text: str,
    *,
    bank_id: str,
    model: str | None = None,
) -> list[PageIndexFact]:
    """One LLM call → up to ``_MAX_FACTS_PER_SECTION`` atomic facts.

    Args:
        provider: LLM provider whose ``complete`` coroutine runs the
            extraction prompt.
        section: Section the text belongs to; supplies ``document_id``,
            ``line_num`` and ``session_date`` (the anchor for relative
            dates, rendered as ``"unknown"`` in the prompt when ``None``).
        section_text: Raw section text. Only the first 6000 characters of
            the stripped text are sent to the LLM.
        bank_id: Bank identifier stamped on every returned fact.
        model: Optional model override forwarded to the provider.

    Returns ``[]`` when:
    - section text is empty or whitespace-only
    - section is generic chit-chat with no specific facts
    - the LLM call raises (logged as a warning, never propagated)
    - LLM output fails to parse (after at most one retry, which is
      skipped when the first response looks budget-truncated)
    - all candidate facts violated schema (bad fact_type, missing or
      non-string text)

    Caller persists the returned facts via
    :meth:`PageIndexStore.save_facts`.
    """
    text = section_text.strip()
    if not text:
        return []
    if section.session_date is not None:
        sess_iso = section.session_date.strftime("%Y-%m-%d")
    else:
        sess_iso = "unknown"
    msg = _EXTRACT_PROMPT.format(
        session_date=sess_iso,
        section_text=text[:6000],
        max_facts=_MAX_FACTS_PER_SECTION,
    )
    try:
        completion = await provider.complete(
            messages=[Message(role="user", content=msg)],
            model=model,
            max_tokens=2000,
            temperature=0.0,
            response_format={"type": "json_object"},
        )
    except Exception as exc:  # noqa: BLE001
        _logger.warning(
            "section_fact_extraction: LLM call failed doc=%s line=%d: %s",
            section.document_id,
            section.line_num,
            exc,
        )
        return []

    # Tolerant parse: handle markdown-fence wrapping / leading-prose noise
    # before giving up. On parse failure, retry once with a stricter
    # system reminder — but skip the retry when the response looks
    # budget-truncated (a retry under the same cap won't help).
    data = tolerant_json_loads(completion.text)
    if data is None and not looks_truncated(completion.text):
        try:
            retry = await provider.complete(
                messages=[
                    Message(
                        role="system",
                        content=(
                            "Return ONLY a valid JSON object. "
                            "No markdown fences. No prose."
                        ),
                    ),
                    Message(role="user", content=msg),
                ],
                model=model,
                max_tokens=2000,
                temperature=0.0,
                response_format={"type": "json_object"},
            )
        except Exception as exc:  # noqa: BLE001
            _logger.warning(
                "section_fact_extraction: retry LLM call failed doc=%s line=%d: %s",
                section.document_id,
                section.line_num,
                exc,
            )
        else:
            data = tolerant_json_loads(retry.text)
    if data is None:
        _logger.warning(
            "section_fact_extraction: JSON parse failed doc=%s line=%d",
            section.document_id,
            section.line_num,
        )
        return []
    if not isinstance(data, dict):
        return []
    raw = data.get("facts") or []
    if not isinstance(raw, list) or not raw:
        return []

    # Function-scope import, executed once per call rather than once per
    # fact; still skipped entirely when there is nothing to process.
    from astrocyte.pipeline.temporal_resolution import (  # noqa: PLC0415
        resolve_event_date,
    )

    out: list[PageIndexFact] = []
    for entry in raw[:_MAX_FACTS_PER_SECTION]:
        if not isinstance(entry, dict):
            continue
        # Require a real string: str(None) would otherwise produce the
        # bogus fact text "None" for a JSON null.
        raw_text = entry.get("text")
        if not isinstance(raw_text, str):
            continue
        fact_text = raw_text.strip()
        fact_type = str(entry.get("fact_type", "")).strip()
        # M25 legacy compat: if the LLM emits the pre-M25
        # `assistant_statement` tag (model trained on old prompt OR
        # legacy ingest replay), remap to canonical (`experience` +
        # speaker='assistant') to match Hindsight's storage shape.
        remapped_type = _LEGACY_FACT_TYPE_REMAP.get(fact_type)
        from_legacy = remapped_type is not None
        if remapped_type is not None:
            fact_type = remapped_type
        if not fact_text or fact_type not in _VALID_FACT_TYPES:
            continue
        # Force speaker='assistant' when remapping from the legacy tag —
        # the perspective signal must survive the type collapse even if
        # the LLM emitted a null or contradictory speaker. The parsed
        # entry itself is left unmodified.
        if from_legacy:
            speaker: str | None = "assistant"
        else:
            speaker = _normalize_speaker(entry.get("speaker"))
        entities = _clean_entities(entry.get("entities"))
        # M27 / M31b Fix C — every fact gets a confidence value (default
        # 0.7 when the LLM omits or mangles it; out-of-bounds values
        # clamp) so the answerer's confidence-aware abstention can fire.
        confidence_score = _coerce_confidence(entry.get("confidence"))
        # M31 Fix 4 — resolve relative date phrases in the fact's text
        # against the section's ``session_date`` at retain time. Distinct
        # from ``occurred_start`` (LLM-emitted explicit range) and
        # ``mentioned_at`` (session-level discussion date).
        event_date = resolve_event_date(fact_text, section.session_date)

        out.append(
            PageIndexFact(
                id=str(uuid.uuid4()),
                bank_id=bank_id,
                document_id=section.document_id,
                line_num=section.line_num,
                text=fact_text,
                fact_type=fact_type,
                speaker=speaker,
                occurred_start=_parse_iso_date(entry.get("occurred_start")),
                occurred_end=_parse_iso_date(entry.get("occurred_end")),
                entities=entities,
                embedding=None,  # embeddings batched separately at retain time
                confidence_score=confidence_score,
                # M27 — `mentioned_at` is the session's date (when the
                # conversation happened), distinct from `occurred_start`
                # (when the event happened).
                mentioned_at=section.session_date,
                event_date=event_date,  # M31 Fix 4
            )
        )
    return out

Coverage for astrocyte/pipeline/section_fact_extraction.py: 91%

94 statements