Coverage for astrocyte/pipeline/temporal_expressions.py: 69%
68 statements
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
1"""Query-time relative-temporal expansion.
3Maps relative temporal expressions in a question (``"a few weeks ago"``,
4``"last month"``, ``"3 days ago"``) to absolute ISO date ranges using a
5reference anchor date. Recall code can then feed the range to the
6fact-grain temporal-search SPI without depending on every ingested
7section having a structured ``occurred_start`` populated.
9Two prior attempts at INGEST-TIME structured-date extraction (M14.x,
10M15.x) were reverted because per-fact temporal metadata is sparse —
11many preference/opinion facts have no specific event time and stamping
12one was net-negative across categories. Doing the expansion at QUERY
13time avoids that problem: we widen recall when the question itself
14asks about a time window, leaving ingest extraction untouched.
Anchor date selection (caller-supplied): use the latest session
timestamp known for the document. "A few weeks ago" asked about a
conversation that ran in May 2023 should map to mid-April 2023, not
to a window measured back from wall-clock today.
21Conservative behaviour: if no temporal cue is found in the query,
22return ``None``. Callers should fall back to non-temporal recall.
Supported expressions (case-insensitive, allowed anywhere in the
query string; ``unit`` is 1d / 7d / 30d / 365d for day / week /
month / year):

  - ``yesterday`` → (anchor − 2d, anchor)
  - ``today`` → (anchor − 1d, anchor + 1d)
  - ``last week`` / ``a week ago`` → (anchor − 14.7d, anchor − 4.9d)
  - ``this week`` → (anchor − 7d, anchor + 1d)
  - ``last month`` / ``a month ago`` → (anchor − 63d, anchor − 21d)
  - ``last year`` / ``a year ago`` → (anchor − 766.5d, anchor − 255.5d)
  - ``a few <unit>s ago`` → (anchor − 5×unit, anchor − 2×unit)
  - ``couple <unit>s ago`` → (anchor − 3.5×unit, anchor − 1.5×unit)
  - ``<N> <unit>s ago`` → (anchor − (N+1)×unit, anchor − max(N−1,0)×unit)
  - ``earlier this <unit>`` → (anchor − 1×unit, anchor)
  - ``recently`` / ``lately`` / ``just now`` → (anchor − 14d, anchor + 1d)
39Each "match" widens the window slightly so the recall SPI returns a
40useful neighbourhood instead of an exact day-match (the user said "a
41few weeks ago"; they mean roughly 2-5 weeks).
42"""
44from __future__ import annotations
46import logging
47import re
48from datetime import datetime, timedelta
# Module-level logger, registered under this module's dotted path.
logger = logging.getLogger("astrocyte.pipeline.temporal_expressions")

# Alias for a ``(start, end)`` pair of datetimes, as returned by the
# range helpers in this module.
DateRange = tuple[datetime, datetime]
56_UNIT_TO_DAYS: dict[str, int] = {
57 "day": 1,
58 "days": 1,
59 "week": 7,
60 "weeks": 7,
61 "month": 30,
62 "months": 30,
63 "year": 365,
64 "years": 365,
65}
68_NUMBER_WORD: dict[str, int] = {
69 "one": 1,
70 "two": 2,
71 "three": 3,
72 "four": 4,
73 "five": 5,
74 "six": 6,
75 "seven": 7,
76 "eight": 8,
77 "nine": 9,
78 "ten": 10,
79}
82def _coerce_n(token: str) -> int | None:
83 token = token.strip().lower()
84 if token.isdigit():
85 try:
86 return int(token)
87 except ValueError:
88 return None
89 return _NUMBER_WORD.get(token)
92def _range_centred_on(anchor: datetime, days_ago: float, half_width_days: float) -> DateRange:
93 """Window: ``[anchor − (days_ago + half_width), anchor − (days_ago − half_width)]``.
95 Clamps the lower bound to never go past 5 years before anchor (any
96 document older than that is well outside the LME/LoCoMo bench scope
97 and likely a parse error).
98 """
99 start = anchor - timedelta(days=days_ago + half_width_days)
100 end = anchor - timedelta(days=max(days_ago - half_width_days, 0))
101 floor = anchor - timedelta(days=365 * 5)
102 if start < floor:
103 start = floor
104 return (start, end)
def expand_temporal_expression(query: str, anchor: datetime) -> DateRange | None:
    """Resolve a relative-time expression in ``query`` to a date range.

    Patterns are tried in a fixed priority order: quantified
    ``... ago`` forms first, then ``last <unit>`` / ``a <unit> ago`` /
    ``the other <unit>``, then ``earlier this <unit>``, ``this <unit>``
    and finally the single-word cues. The first *pattern* that matches
    wins, regardless of where it sits in the query text. Returns
    ``None`` when no cue is found (including an empty query).

    ``anchor`` is the reference "now" for relative expressions —
    typically the latest session timestamp for the document.
    """
    if not query:
        return None
    q = query.lower()
    any_unit = r"(day|days|week|weeks|month|months|year|years)"

    # "<N> <unit>s ago" — digit or word number, plural or singular unit.
    m = re.search(
        r"\b(\d+|one|two|three|four|five|six|seven|eight|nine|ten)\s+" + any_unit + r"\s+ago\b",
        q,
    )
    if m:
        n = _coerce_n(m.group(1))
        unit_days = _UNIT_TO_DAYS.get(m.group(2))
        if n is not None and unit_days is not None:
            # One unit of slack either side of the stated distance.
            return _range_centred_on(anchor, n * unit_days, max(unit_days, 1))

    # "a few <unit>s ago" — vague but pinned to roughly 2-5 units.
    m = re.search(r"\b(a\s+few|few)\s+" + any_unit + r"\s+ago\b", q)
    if m:
        unit_days = _UNIT_TO_DAYS.get(m.group(2))
        if unit_days is not None:
            return _range_centred_on(anchor, 3.5 * unit_days, 1.5 * unit_days)

    # "couple <unit>s ago" — 1.5-3.5 units.
    m = re.search(
        r"\b(a\s+couple\s+of|couple\s+of|a\s+couple|couple)\s+" + any_unit + r"\s+ago\b",
        q,
    )
    if m:
        unit_days = _UNIT_TO_DAYS.get(m.group(2))
        if unit_days is not None:
            return _range_centred_on(anchor, 2.5 * unit_days, 1.0 * unit_days)

    # "last <unit>", "a <unit> ago" or "the other <unit>" (no quantifier).
    # The "a <unit>" form requires a trailing "ago": a bare "a week" /
    # "a year" ("twice a week", "spent a week in Paris") is not a
    # relative-time cue and must not narrow recall.
    m = re.search(
        r"\blast\s+(day|week|month|year)\b|"
        r"\ba\s+(day|week|month|year)\s+ago\b|"
        r"\bthe\s+other\s+(day|week|month)\b",
        q,
    )
    if m:
        unit = m.group(1) or m.group(2) or m.group(3) or ""
        unit_days = _UNIT_TO_DAYS.get(unit)
        if unit_days is not None:
            # Window spans 0.7-2.1 units back: "last week" → ~5-15 days ago,
            # "last month" → 21-63d, "last year" → ~255-767d.
            return _range_centred_on(anchor, 1.4 * unit_days, 0.7 * unit_days)

    # "earlier this <unit>" — must be tested before the plain "this <unit>"
    # pattern, which would otherwise always match first and shadow it.
    m = re.search(r"\bearlier\s+this\s+(week|month|year)\b", q)
    if m:
        unit_days = _UNIT_TO_DAYS.get(m.group(1))
        if unit_days is not None:
            return (anchor - timedelta(days=unit_days), anchor)

    # "this <unit>" — current period, with a day of forward slack.
    m = re.search(r"\bthis\s+(week|month|year)\b", q)
    if m:
        unit_days = _UNIT_TO_DAYS.get(m.group(1))
        if unit_days is not None:
            return (anchor - timedelta(days=unit_days), anchor + timedelta(days=1))

    # Single-word time anchors.
    if re.search(r"\byesterday\b", q):
        return (anchor - timedelta(days=2), anchor)
    if re.search(r"\btoday\b", q):
        return (anchor - timedelta(days=1), anchor + timedelta(days=1))
    if re.search(r"\brecently\b|\bjust\s+now\b|\blately\b", q):
        return (anchor - timedelta(days=14), anchor + timedelta(days=1))

    return None