Coverage for astrocyte/pipeline/section_event_extraction.py: 0%
37 statements
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
"""M11.1 — structured event-time extraction per section.

Mirrors Hindsight's per-fact ``occurred_start`` / ``occurred_end``
columns on ``memory_units``. At retain time, for each section, ask
the LLM to identify the most-prominent event the section describes
and emit ISO-8601 start (and optional end) timestamps.

Why this differs from ``session_date`` (already on every section):

- ``session_date`` is when the conversation session HAPPENED
  (e.g. May 8, 2023 — when the user typed the message)
- ``occurred_start`` is when the discussed EVENT happened
  (e.g. May 7 — "yesterday I went to the doctor")

LME temporal-reasoning failures all share the same shape: the picker
finds the right SESSION but the synth uses the SESSION date instead
of the EVENT date. With ``occurred_start`` populated,
:func:`~astrocyte.pipeline.temporal_arithmetic.find_event_date` can
return the canonical event date directly.

Relative phrases ("yesterday", "last week", "3 days ago") are anchored
against the section's ``session_date`` at extraction time so the
output is always an absolute ISO timestamp.

See:
- ``docs/_design/recall.md`` §13 (M10 close-out) + §14 (M11 plan)
- ``hindsight/hindsight-api-slim/hindsight_api/engine/retain/fact_extraction.py``
  for the canonical Hindsight pattern at memory_unit grain.
"""
31from __future__ import annotations
33import json
34import logging
35from datetime import datetime
36from typing import TYPE_CHECKING
38from astrocyte.types import Message
40if TYPE_CHECKING:
41 from astrocyte.provider import LLMProvider
42 from astrocyte.types import PageIndexSection
44_logger = logging.getLogger("astrocyte.pipeline.section_event_extraction")
# Prompt template for the single per-section event-time extraction call.
#
# Filled in with ``str.format`` using two placeholders:
#   - ``{session_date}``: the section's conversation date (or "unknown"),
#     used as the anchor for relative phrases such as "yesterday".
#   - ``{section_text}``: the (truncated) section content to analyse.
# Doubled braces (``{{`` / ``}}``) are literal braces in the rendered prompt;
# they are required because the JSON examples would otherwise be read as
# format fields. A trailing backslash joins the next source line onto the
# same prompt line, so paragraphs are wrapped in source but not in the prompt.
_EXTRACT_PROMPT = """\
You are extracting the structured EVENT TIME from one section of a \
conversation transcript. Your output drives query-time temporal \
arithmetic — questions like "how many weeks ago did I visit the \
doctor?" need the canonical date of the doctor visit, NOT the date \
the user mentioned it.

The section's conversation date is ``{session_date}``. Anchor any \
relative time phrase ("yesterday", "last week", "3 days ago", "last \
month") against THIS date. Output absolute ISO-8601 timestamps.

Output a JSON object with EXACTLY these fields:
- "occurred_start": ISO-8601 timestamp of when the most-prominent \
discussed event began. ``null`` if the section is generic chit-chat \
with no specific event.
- "occurred_end": ISO-8601 timestamp of when that event ended. \
``null`` if it's a single-day or instantaneous event.
- "event_description": 3-6 word description of the event (for \
provenance / debugging). ``null`` if no specific event.

Rules:
- "Yesterday I went to the doctor" with session_date=2023-05-08 → \
 ``{{"occurred_start": "2023-05-07", "occurred_end": null, \
"event_description": "doctor visit"}}``
- "We had a wedding two Saturdays ago" with session_date=2023-05-15 \
 → ``{{"occurred_start": "2023-05-06", "occurred_end": null, \
"event_description": "wedding"}}``
- "Spent last weekend camping" with session_date=2023-05-22 \
 (Monday) → ``{{"occurred_start": "2023-05-20", "occurred_end": \
"2023-05-21", "event_description": "weekend camping"}}``
- "Trip from May 3-15" → ``{{"occurred_start": "2023-05-03", \
"occurred_end": "2023-05-15", "event_description": "trip"}}``
- Generic chit-chat ("How are you today?", recipe discussion with \
no specific past event) → ``{{"occurred_start": null, "occurred_end": \
null, "event_description": null}}``

When the section discusses MULTIPLE events, pick the most-prominent \
one (the one the user is asking about / spent the most time on). Do \
NOT try to list multiple events — one per section.

If a relative phrase is ambiguous ("recently", "a while back"), \
return ``null`` rather than guessing.

OUTPUT MUST BE VALID JSON. No prose around it.

Section content:
{section_text}
"""
def _parse_iso_date(s: str | None) -> datetime | None:
    """Parse an ISO-8601 date or datetime string taken from LLM output.

    Accepts both the date-only form (``"2023-05-07"``) and the datetime
    form (``"2023-05-07T12:00:00"``). Leading/trailing whitespace is
    ignored, and a trailing ``Z`` / ``z`` UTC designator is rewritten to
    ``+00:00`` so the value parses on Python versions whose
    ``datetime.fromisoformat`` does not accept the ``Z`` suffix.

    Args:
        s: Candidate timestamp; may be ``None`` or a non-string JSON value.

    Returns:
        The parsed ``datetime``, or ``None`` when ``s`` is ``None``, not a
        string, blank, or not parseable. Inputs that carry a UTC offset
        yield timezone-aware values; all others yield naive values.
    """
    if not isinstance(s, str):
        return None
    candidate = s.strip()
    if not candidate:
        return None
    # Only a *trailing* Z is a UTC designator; leave any other text alone so
    # malformed input still fails loudly in fromisoformat below.
    if candidate[-1] in "Zz":
        candidate = candidate[:-1] + "+00:00"
    try:
        return datetime.fromisoformat(candidate)
    except (ValueError, TypeError) as exc:
        # Log the caller's original value, not the normalized one.
        _logger.debug("section_event_extraction: bad ISO date %r: %s", s, exc)
        return None
110async def extract_event_date_for_section(
111 provider: "LLMProvider",
112 section: "PageIndexSection",
113 section_text: str,
114 *,
115 model: str | None = None,
116) -> tuple[datetime | None, datetime | None]:
117 """One LLM call → ``(occurred_start, occurred_end)`` for this section.
119 Returns ``(None, None)`` when:
120 - the section is generic chit-chat with no specific event
121 - the LLM output fails to parse
122 - the LLM declines to commit a date (ambiguous phrase)
124 Caller persists the returned dates via
125 :meth:`PageIndexStore.save_section_event_dates`.
126 """
127 text = section_text.strip()
128 if not text:
129 return None, None
130 sess_iso = section.session_date.strftime("%Y-%m-%d") if section.session_date is not None else "unknown"
131 msg = _EXTRACT_PROMPT.format(
132 session_date=sess_iso,
133 section_text=text[:6000], # ~1500 tokens cap
134 )
135 try:
136 completion = await provider.complete(
137 messages=[Message(role="user", content=msg)],
138 model=model,
139 max_tokens=200,
140 temperature=0.0,
141 response_format={"type": "json_object"},
142 )
143 except Exception as exc: # noqa: BLE001
144 _logger.warning(
145 "section_event_extraction: LLM call failed doc=%s line=%d: %s",
146 section.document_id,
147 section.line_num,
148 exc,
149 )
150 return None, None
151 try:
152 data = json.loads(completion.text)
153 except json.JSONDecodeError:
154 _logger.warning(
155 "section_event_extraction: JSON parse failed doc=%s line=%d text=%r",
156 section.document_id,
157 section.line_num,
158 completion.text[:200],
159 )
160 return None, None
161 start = _parse_iso_date(data.get("occurred_start"))
162 end = _parse_iso_date(data.get("occurred_end"))
163 return start, end