Coverage for astrocyte/pipeline/section_event_extraction.py: 0%

37 statements  

« prev     ^ index     » next       coverage.py v7.15.0, created at 2026-07-04 05:24 +0000

1"""M11.1 — structured event-time extraction per section. 

2 

3Mirrors Hindsight's per-fact ``occurred_start`` / ``occurred_end`` 

4columns on ``memory_units``. At retain time, for each section, ask 

5the LLM to identify the most-prominent event the section describes 

6and emit ISO-8601 start (and optional end) timestamps. 

7 

8Why this differs from ``session_date`` (already on every section): 

9 

10- ``session_date`` is when the conversation session HAPPENED 

11 (e.g. May 8, 2023 — when the user typed the message) 

12- ``occurred_start`` is when the discussed EVENT happened 

13 (e.g. May 7 — "yesterday I went to the doctor") 

14 

15LME temporal-reasoning failures all share the same shape: the picker 

16finds the right SESSION but the synth uses the SESSION date instead 

17of the EVENT date. With ``occurred_start`` populated, 

18:func:`~astrocyte.pipeline.temporal_arithmetic.find_event_date` can 

19return the canonical event date directly. 

20 

21Relative phrases ("yesterday", "last week", "3 days ago") are anchored 

22against the section's ``session_date`` at extraction time so the 

23output is always an absolute ISO timestamp. 

24 

25See: 

26- ``docs/_design/recall.md`` §13 (M10 close-out) + §14 (M11 plan) 

27- ``hindsight/hindsight-api-slim/hindsight_api/engine/retain/fact_extraction.py`` 

28 for the canonical Hindsight pattern at memory_unit grain. 

29""" 

30 

31from __future__ import annotations 

32 

33import json 

34import logging 

35from datetime import datetime 

36from typing import TYPE_CHECKING 

37 

38from astrocyte.types import Message 

39 

40if TYPE_CHECKING: 

41 from astrocyte.provider import LLMProvider 

42 from astrocyte.types import PageIndexSection 

43 

# Module-level logger, named explicitly after this module's import path.
_logger = logging.getLogger("astrocyte.pipeline.section_event_extraction")


# Prompt template for the per-section event-time extraction call.
#
# Rendered with ``str.format`` using two placeholders: ``{session_date}``
# (the anchor for relative phrases) and ``{section_text}`` (the section
# body). The doubled braces in the JSON examples are ``str.format`` escapes
# that render as literal ``{`` / ``}``. A trailing backslash joins a line
# with the next one, so each wrapped paragraph below becomes a single line
# in the rendered prompt.
_EXTRACT_PROMPT = """\
You are extracting the structured EVENT TIME from one section of a \
conversation transcript. Your output drives query-time temporal \
arithmetic — questions like "how many weeks ago did I visit the \
doctor?" need the canonical date of the doctor visit, NOT the date \
the user mentioned it.

The section's conversation date is ``{session_date}``. Anchor any \
relative time phrase ("yesterday", "last week", "3 days ago", "last \
month") against THIS date. Output absolute ISO-8601 timestamps.

Output a JSON object with EXACTLY these fields:
- "occurred_start": ISO-8601 timestamp of when the most-prominent \
discussed event began. ``null`` if the section is generic chit-chat \
with no specific event.
- "occurred_end": ISO-8601 timestamp of when that event ended. \
``null`` if it's a single-day or instantaneous event.
- "event_description": 3-6 word description of the event (for \
provenance / debugging). ``null`` if no specific event.

Rules:
- "Yesterday I went to the doctor" with session_date=2023-05-08 → \
 ``{{"occurred_start": "2023-05-07", "occurred_end": null, \
"event_description": "doctor visit"}}``
- "We had a wedding two Saturdays ago" with session_date=2023-05-15 \
 → ``{{"occurred_start": "2023-05-06", "occurred_end": null, \
"event_description": "wedding"}}``
- "Spent last weekend camping" with session_date=2023-05-22 \
 (Monday) → ``{{"occurred_start": "2023-05-20", "occurred_end": \
"2023-05-21", "event_description": "weekend camping"}}``
- "Trip from May 3-15" → ``{{"occurred_start": "2023-05-03", \
"occurred_end": "2023-05-15", "event_description": "trip"}}``
- Generic chit-chat ("How are you today?", recipe discussion with \
no specific past event) → ``{{"occurred_start": null, "occurred_end": \
null, "event_description": null}}``

When the section discusses MULTIPLE events, pick the most-prominent \
one (the one the user is asking about / spent the most time on). Do \
NOT try to list multiple events — one per section.

If a relative phrase is ambiguous ("recently", "a while back"), \
return ``null`` rather than guessing.

OUTPUT MUST BE VALID JSON. No prose around it.

Section content:
{section_text}
"""

95 

96 

def _parse_iso_date(s: str | None) -> datetime | None:
    """Parse an ISO-8601 date or timestamp string into a ``datetime``.

    Accepts date-only values (``"2023-05-07"``) and full timestamps
    (``"2023-05-07T12:00:00"``). Surrounding whitespace is ignored, and a
    trailing UTC designator (``"Z"`` or ``"z"``) is rewritten to ``"+00:00"``
    before parsing so the result does not depend on whether the running
    interpreter's ``datetime.fromisoformat`` understands the designator.

    Args:
        s: The raw value taken from the model's JSON output. Anything that
            is not a non-blank string is treated as "no date".

    Returns:
        The parsed ``datetime``, or ``None`` when the value is missing,
        blank, not a string, or not parseable. Values carrying a UTC
        designator or offset come back timezone-aware; all others are naive.
    """
    if not isinstance(s, str):
        return None
    candidate = s.strip()
    if not candidate:
        return None
    # Only a *trailing* designator means UTC, so rewrite just the suffix
    # rather than every "Z" that happens to appear in the string.
    if candidate[-1] in "Zz":
        candidate = f"{candidate[:-1]}+00:00"
    try:
        return datetime.fromisoformat(candidate)
    except (ValueError, TypeError) as exc:
        _logger.debug("section_event_extraction: bad ISO date %r: %s", s, exc)
        return None

108 

109 

async def extract_event_date_for_section(
    provider: "LLMProvider",
    section: "PageIndexSection",
    section_text: str,
    *,
    model: str | None = None,
) -> tuple[datetime | None, datetime | None]:
    """One LLM call → ``(occurred_start, occurred_end)`` for this section.

    The section text is stripped, truncated, and substituted into
    ``_EXTRACT_PROMPT`` together with the section's ``session_date``
    (formatted ``YYYY-MM-DD``, or the literal ``"unknown"`` when the section
    has none). The model is asked for a JSON object, and its
    ``occurred_start`` / ``occurred_end`` fields are parsed with
    ``_parse_iso_date``.

    Args:
        provider: LLM provider whose ``complete`` coroutine is awaited.
        section: Section being processed; ``session_date``, ``document_id``
            and ``line_num`` are read from it.
        section_text: Raw text of the section.
        model: Optional model override passed through to the provider.

    Returns:
        ``(occurred_start, occurred_end)``. Each element is ``None`` when
        the corresponding field is absent, ``null``, or unparseable. The
        result is ``(None, None)`` when:

        - ``section_text`` is empty or whitespace-only (no LLM call is made)
        - the provider call raises (logged, never propagated)
        - the LLM output is not valid JSON, or is valid JSON but not an
          object
        - the section is generic chit-chat / the LLM declines to commit a
          date

    Caller persists the returned dates via
    :meth:`PageIndexStore.save_section_event_dates`.
    """
    text = section_text.strip()
    if not text:
        return None, None
    sess_iso = (
        section.session_date.strftime("%Y-%m-%d")
        if section.session_date is not None
        else "unknown"
    )
    msg = _EXTRACT_PROMPT.format(
        session_date=sess_iso,
        section_text=text[:6000],  # ~1500 tokens cap
    )
    try:
        completion = await provider.complete(
            messages=[Message(role="user", content=msg)],
            model=model,
            max_tokens=200,
            temperature=0.0,
            response_format={"type": "json_object"},
        )
    except Exception as exc:  # noqa: BLE001
        # Best-effort enrichment: a provider failure must not abort the
        # caller, so log it and report "no date".
        _logger.warning(
            "section_event_extraction: LLM call failed doc=%s line=%d: %s",
            section.document_id,
            section.line_num,
            exc,
        )
        return None, None
    raw = completion.text
    try:
        data = json.loads(raw)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers a provider that returned no text at all (None).
        data = None
    # Valid JSON is not necessarily an object: a bare list, string, number
    # or ``null`` has no ``.get`` and is treated as a parse failure too.
    if not isinstance(data, dict):
        _logger.warning(
            "section_event_extraction: JSON parse failed doc=%s line=%d text=%r",
            section.document_id,
            section.line_num,
            raw[:200] if isinstance(raw, (str, bytes, bytearray)) else raw,
        )
        return None, None
    start = _parse_iso_date(data.get("occurred_start"))
    end = _parse_iso_date(data.get("occurred_end"))
    return start, end