Coverage for astrocyte/pipeline/query_analyzer.py: 81%
204 statements
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
1"""Query analyzer — structured extraction of temporal constraints.
3Many recall queries embed temporal scoping ("what happened last spring?",
4"who did Alice meet in March 2024?", "events before the launch"). The
5default semantic-similarity recall treats time-words as just more text;
6without parsing them out, the system can't filter or boost evidence by
7date range, and temporal-category questions on benchmarks like LoCoMo
8suffer.
10This module exposes :func:`analyze_query`, which returns a
11:class:`QueryAnalysis` describing the structured constraints embedded
12in a query. Currently the only constraint type is
13:class:`TemporalConstraint`; the API is shaped to accept additional
14constraint types (location, entity, fact_type) without breaking
15callers.
17Two-tier extraction:
191. **Regex pre-pass** (no LLM cost): catches the high-volume common
20 patterns — explicit ISO dates, year-only mentions, ``last <unit>``,
21 ``in <month> [<year>]``, ``yesterday`` / ``today``. Bounded to ~15
22 patterns to keep the path predictable.
232. **LLM fallback**: when the regex pass finds no match AND the query
24 contains a temporal-marker token (configurable list), defer to an
25 LLM call with a structured-JSON prompt. Skipped entirely when no
26 temporal-marker is present — most queries don't mention time.
28The fallback is opt-in via ``allow_llm_fallback=True``. Keeping the
29default off makes ``analyze_query`` a fast, deterministic path that
30recall can call on every request without budget concerns.
31"""
33from __future__ import annotations
35import json
36import logging
37import re
38from dataclasses import dataclass
39from datetime import datetime, timedelta, timezone
41from astrocyte.types import Message
# Module-level logger; the LLM-fallback failure path in ``analyze_query``
# reports through it at WARNING level.
_logger = logging.getLogger("astrocyte.query_analyzer")
46# ---------------------------------------------------------------------------
47# Data shapes
48# ---------------------------------------------------------------------------
51@dataclass
52class TemporalConstraint:
53 """A time range extracted from a query.
55 Both endpoints are inclusive. Either may be ``None`` to express
56 unbounded ("before March 2024" → ``end_date`` set, ``start_date``
57 None; "since Q1 2024" → ``start_date`` set, ``end_date`` None).
58 """
60 start_date: datetime | None = None
61 end_date: datetime | None = None
63 def __str__(self) -> str:
64 s = self.start_date.strftime("%Y-%m-%d") if self.start_date else "any"
65 e = self.end_date.strftime("%Y-%m-%d") if self.end_date else "any"
66 return f"{s} to {e}"
68 def is_bounded(self) -> bool:
69 return self.start_date is not None or self.end_date is not None
@dataclass
class QueryAnalysis:
    """Outcome of running structured analysis over a single query."""

    temporal_constraint: TemporalConstraint | None = None
    #: Short human-readable reason the constraint was produced, kept for
    #: debugging / observability. Set on both regex and LLM hits.
    rationale: str = ""

    def has_constraints(self) -> bool:
        """Return True only when a temporal constraint exists and is bounded."""
        constraint = self.temporal_constraint
        if constraint is None:
            return False
        return constraint.is_bounded()
85# ---------------------------------------------------------------------------
86# Regex pre-pass
87# ---------------------------------------------------------------------------
90# Patterns that are clear enough to extract without an LLM call.
91# Each entry returns ``(start, end, rationale)`` when matched. ``None``
92# for either endpoint means open-ended.
# Lower-cased month name / abbreviation -> month number (1-12).
# ``_try_month_year`` lower-cases the matched word before looking it up
# here; "sep" and "sept" are both accepted for September.
_MONTHS = {
    "january": 1,
    "jan": 1,
    "february": 2,
    "feb": 2,
    "march": 3,
    "mar": 3,
    "april": 4,
    "apr": 4,
    "may": 5,
    "june": 6,
    "jun": 6,
    "july": 7,
    "jul": 7,
    "august": 8,
    "aug": 8,
    "september": 9,
    "sep": 9,
    "sept": 9,
    "october": 10,
    "oct": 10,
    "november": 11,
    "nov": 11,
    "december": 12,
    "dec": 12,
}
122def _utc(year: int, month: int = 1, day: int = 1) -> datetime:
123 return datetime(year, month, day, tzinfo=timezone.utc)
def _month_end(year: int, month: int) -> datetime:
    """Return the last second (23:59:59 UTC) of the given calendar month.

    Computed as one second before the first instant of the following
    month, rolling over to January of the next year after December.
    """
    if month == 12:
        next_year, next_month = year + 1, 1
    else:
        next_year, next_month = year, month + 1
    first_of_next = _utc(next_year, next_month, 1)
    return first_of_next - timedelta(seconds=1)
132def _try_iso_date(query: str) -> tuple[datetime | None, datetime | None, str] | None:
133 """Match explicit ISO dates: ``2024-03-15``, ``2024-03``, ``2024``."""
134 # YYYY-MM-DD
135 m = re.search(r"\b(\d{4})-(\d{2})-(\d{2})\b", query)
136 if m:
137 try:
138 d = _utc(int(m.group(1)), int(m.group(2)), int(m.group(3)))
139 return d, d + timedelta(days=1) - timedelta(seconds=1), f"explicit date {m.group(0)}"
140 except ValueError:
141 pass
142 # YYYY-MM
143 m = re.search(r"\b(\d{4})-(\d{2})\b", query)
144 if m:
145 try:
146 year, month = int(m.group(1)), int(m.group(2))
147 return _utc(year, month), _month_end(year, month), f"explicit month {m.group(0)}"
148 except ValueError:
149 pass
150 return None
153def _try_year(query: str) -> tuple[datetime | None, datetime | None, str] | None:
154 """Match standalone 4-digit year (``in 2024``, ``during 2023``)."""
155 m = re.search(r"\b(?:in|during|from)\s+(\d{4})\b", query, re.IGNORECASE)
156 if m:
157 year = int(m.group(1))
158 if 1900 <= year <= 2100:
159 return _utc(year), _utc(year + 1) - timedelta(seconds=1), f"year-only {year}"
160 # Bare year token at start/end of clause.
161 m = re.search(r"\b(\d{4})\b", query)
162 if m:
163 year = int(m.group(1))
164 if 1900 <= year <= 2100:
165 return _utc(year), _utc(year + 1) - timedelta(seconds=1), f"bare year {year}"
166 return None
169def _try_month_year(query: str) -> tuple[datetime | None, datetime | None, str] | None:
170 """Match ``in March 2024``, ``March 2024``, ``March`` (with reference year)."""
171 pattern = r"\b(jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|sept|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)(?:\s+(\d{4}))?\b"
172 m = re.search(pattern, query, re.IGNORECASE)
173 if not m:
174 return None
175 month = _MONTHS.get(m.group(1).lower())
176 if month is None:
177 return None
178 year_str = m.group(2)
179 if year_str:
180 year = int(year_str)
181 return _utc(year, month), _month_end(year, month), f"{m.group(1)} {year}"
182 # Without an explicit year, we can't resolve — skip the regex hit.
183 # The LLM fallback can attempt this with a reference_date.
184 return None
187def _try_relative(
188 query: str,
189 *,
190 reference: datetime,
191) -> tuple[datetime | None, datetime | None, str] | None:
192 """Resolve relative expressions (yesterday, last week, X days ago)."""
193 q = query.lower()
194 # Yesterday / today
195 if re.search(r"\byesterday\b", q):
196 d = reference - timedelta(days=1)
197 d = d.replace(hour=0, minute=0, second=0, microsecond=0)
198 return d, d + timedelta(days=1) - timedelta(seconds=1), "yesterday"
199 if re.search(r"\btoday\b", q):
200 d = reference.replace(hour=0, minute=0, second=0, microsecond=0)
201 return d, d + timedelta(days=1) - timedelta(seconds=1), "today"
202 # Last <unit>
203 m = re.search(r"\blast\s+(week|month|year)\b", q)
204 if m:
205 unit = m.group(1)
206 if unit == "week":
207 end = reference - timedelta(days=reference.weekday())
208 start = end - timedelta(days=7)
209 return start, end - timedelta(seconds=1), "last week"
210 if unit == "month":
211 year, month = reference.year, reference.month
212 month -= 1
213 if month == 0:
214 month, year = 12, year - 1
215 return _utc(year, month), _month_end(year, month), "last month"
216 if unit == "year":
217 year = reference.year - 1
218 return _utc(year), _utc(year + 1) - timedelta(seconds=1), "last year"
219 # N units ago
220 m = re.search(r"\b(\d+)\s+(day|week|month|year)s?\s+ago\b", q)
221 if m:
222 n = int(m.group(1))
223 unit = m.group(2)
224 if unit == "day":
225 d = reference - timedelta(days=n)
226 return d.replace(hour=0, minute=0, second=0, microsecond=0), reference, f"{n} day(s) ago"
227 if unit == "week":
228 d = reference - timedelta(weeks=n)
229 return d, reference, f"{n} week(s) ago"
230 # month/year — approximate; LLM fallback handles precision.
231 if unit == "month":
232 d = reference - timedelta(days=30 * n)
233 return d, reference, f"~{n} month(s) ago"
234 if unit == "year":
235 d = reference - timedelta(days=365 * n)
236 return d, reference, f"~{n} year(s) ago"
237 return None
def _try_temporal_expansion(
    query: str,
    *,
    reference: datetime,
) -> tuple[datetime | None, datetime | None, str] | None:
    """M18a-1 Pass A: extended relative-time patterns from ``temporal_expressions``.

    Delegates to ``expand_temporal_expression`` for phrasings the core
    ``_try_relative`` does not handle:
      - word-numbers ("one/two/.../ten X ago")
      - "a few / few X ago" — vague, pinned to roughly 2-5 units
      - "couple of / a couple X ago" — ~2-3 units
      - "the other day/week/month"
      - "this / earlier this <unit>"
      - "recently / just now / lately"

    The helper is imported inside the function so that loading this
    module does not pay for it when the expansion flag is off.

    Returns:
        ``(start, end, "temporal-expansion match")`` when the helper
        yields a range, otherwise ``None``.
    """
    from astrocyte.pipeline.temporal_expressions import (  # noqa: PLC0415
        expand_temporal_expression,
    )

    window = expand_temporal_expression(query, reference)
    if window is not None:
        lower, upper = window
        return lower, upper, "temporal-expansion match"
    return None
def _try_dateparser(
    query: str,
    *,
    reference: datetime,
) -> tuple[datetime | None, datetime | None, str] | None:
    """M18a-1 Pass B: Hindsight-parity wide-net date extraction via the
    ``dateparser`` library.

    Intended as the catch-all behind Pass A's curated regex set:
      - Named dates ("March 15", "the 3rd of June")
      - ISO ("2024-06-01") and weekday ("Tuesday", "last Friday") refs
      - Ordinals ("the 5th of last month")
      - Implicit multi-language ("ayer", "letztes Jahr") — when the
        dataset is multilingual

    The range found by the helper is widened by one day on each side,
    because fact-grain dates can be a day off the date mentioned in the
    question.

    Both helpers are imported inside the function so that module load
    does not pay the ``dateparser`` startup cost when the flag is off.
    Per the helper module's contract it returns ``None`` when
    ``dateparser`` is not installed — see ``pyproject.toml`` ``[bench]``
    extras.

    Returns:
        ``(start, end, "dateparser match (±1d)")`` or ``None``.
    """
    from astrocyte.pipeline.temporal_dateparser import (  # noqa: PLC0415
        extract_temporal_range_via_dateparser,
        widen_to_neighbourhood,
    )

    found = extract_temporal_range_via_dateparser(query, reference)
    if found is not None:
        lower, upper = widen_to_neighbourhood(found, pad_days=1)
        return lower, upper, "dateparser match (±1d)"
    return None
def _regex_temporal_pass(
    query: str,
    *,
    reference: datetime,
    allow_temporal_expansion: bool = False,
) -> TemporalConstraint | None:
    """Run the extractors in priority order and return the first hit.

    Legacy order (flag off): ISO date, relative, month+year, year.

    With ``allow_temporal_expansion=True`` (M18a-1 flag) two more
    extractors join the chain:
      - Pass A (``_try_temporal_expansion``) sits right after
        ``_try_relative``, so the narrow exact matches (yesterday / today /
        last week / N units ago) still win when they apply.
      - Pass B (``_try_dateparser``) goes last, as the wide-net catch-all
        once every precise pattern has had its chance.

    Returns:
        A ``TemporalConstraint`` built from the first extractor that
        matched, or ``None`` when none did. The extractor's own rationale
        string is discarded here.
    """
    # Extractors that need the reference date passed as a keyword argument.
    reference_aware = (_try_relative, _try_temporal_expansion, _try_dateparser)

    extractors = [_try_iso_date, _try_relative]
    if allow_temporal_expansion:
        extractors.append(_try_temporal_expansion)
    extractors += [_try_month_year, _try_year]
    if allow_temporal_expansion:
        extractors.append(_try_dateparser)

    for extractor in extractors:
        extra = {"reference": reference} if extractor in reference_aware else {}
        outcome = extractor(query, **extra)
        if outcome is None:
            continue
        start, end, _ = outcome
        return TemporalConstraint(start_date=start, end_date=end)
    return None
342# ---------------------------------------------------------------------------
343# Temporal-marker detection (cheap gate before LLM fallback)
344# ---------------------------------------------------------------------------
# Lower-case words whose presence marks a query as possibly time-scoped.
# ``_has_temporal_marker`` tests each alphabetic token of the query for
# membership in this set.
_TEMPORAL_MARKERS = {
    "yesterday",
    "today",
    "tomorrow",
    "last",
    "this",
    "next",
    "ago",
    "before",
    "after",
    "since",
    "until",
    "during",
    "when",
    "while",
    "then",
    "now",
    "recently",
    "earlier",
    "later",
    "previous",
    "previously",
    "ever",
    "never",
    "always",
    "year",
    "month",
    "week",
    "day",
    "morning",
    "evening",
    "spring",
    "summer",
    "fall",
    "autumn",
    "winter",
}
386def _has_temporal_marker(query: str) -> bool:
387 """Cheap word-level test for temporal markers in the query."""
388 tokens = re.findall(r"[a-z]+", query.lower())
389 if not tokens:
390 return False
391 if any(t in _TEMPORAL_MARKERS for t in tokens):
392 return True
393 # Year mentions (4-digit) count as temporal too.
394 return bool(re.search(r"\b\d{4}\b", query))
397# ---------------------------------------------------------------------------
398# LLM fallback
399# ---------------------------------------------------------------------------
# System prompt for the LLM fallback in ``analyze_query``. It asks for a
# JSON object with "start_date", "end_date" and "rationale" keys, which is
# the shape ``_parse_llm_response`` reads. Trailing backslashes join long
# lines so the model sees them unbroken.
_LLM_SYSTEM_PROMPT = """\
You extract a TIME RANGE from a query when one is implied.

Output a JSON object: {"start_date": "<ISO date or null>", "end_date": \
"<ISO date or null>", "rationale": "<1 sentence>"}.

Rules:
1. If the query has no temporal scope, return {"start_date": null, \
"end_date": null, "rationale": "no temporal scope"}.
2. Both dates are inclusive. Use null for open-ended ranges \
("before March 2024" → end_date set, start_date null).
3. ISO 8601 with timezone: "2024-03-01T00:00:00Z".
4. Use the supplied reference date to resolve relative expressions \
("last spring" relative to a 2025-01-15 reference is "2024-03-01" to \
"2024-05-31").
5. Output JSON only. No prose.
"""
421def _build_llm_user_prompt(query: str, reference: datetime) -> str:
422 return f"Reference date: {reference.isoformat()}\nQuery: {query.strip()}\n\nTime range (JSON):"
425def _parse_llm_response(raw: str) -> TemporalConstraint | None:
426 text = raw.strip()
427 if text.startswith("```"):
428 text = re.sub(r"^```(?:json)?\s*", "", text)
429 text = re.sub(r"\s*```$", "", text)
430 match = re.search(r"\{.*\}", text, re.DOTALL)
431 if match is None:
432 return None
433 try:
434 parsed = json.loads(match.group(0))
435 except json.JSONDecodeError:
436 return None
437 if not isinstance(parsed, dict):
438 return None
440 def _parse_iso(value) -> datetime | None:
441 if not value or not isinstance(value, str):
442 return None
443 s = value.strip()
444 if s.endswith("Z"):
445 s = s[:-1] + "+00:00"
446 try:
447 dt = datetime.fromisoformat(s)
448 except ValueError:
449 return None
450 return dt if dt.tzinfo else dt.replace(tzinfo=timezone.utc)
452 start = _parse_iso(parsed.get("start_date"))
453 end = _parse_iso(parsed.get("end_date"))
454 if start is None and end is None:
455 return None
456 return TemporalConstraint(start_date=start, end_date=end)
459# ---------------------------------------------------------------------------
460# Entry point
461# ---------------------------------------------------------------------------
async def analyze_query(
    query: str,
    *,
    reference_date: datetime | None = None,
    llm_provider=None,
    allow_llm_fallback: bool = False,
    allow_temporal_expansion: bool = False,
) -> QueryAnalysis:
    """Extract structured constraints from a query.

    Args:
        query: User question / recall query.
        reference_date: Used to resolve relative expressions ("last
            week", "yesterday"). Defaults to ``datetime.now(UTC)``. A
            naive value is treated as UTC.
        llm_provider: Required when ``allow_llm_fallback=True``.
            Passed to the LLM call when the regex pre-pass misses.
        allow_llm_fallback: When True, falls back to an LLM call after
            the regex pass when (a) no regex matched AND (b) the query
            contains a temporal-marker token. When False, only the
            regex path runs (deterministic, no LLM cost). Default
            False so callers explicitly opt in.
        allow_temporal_expansion: M18a-1 flag. When True, the regex
            pre-pass includes the extended pattern set from
            ``astrocyte.pipeline.temporal_expressions`` (word-numbers,
            "a few X ago", "the other day", "this/earlier this <unit>",
            "recently/lately"). Default False — preserves legacy
            behavior. Promotion to default = M18c step after bench gate.

    Returns:
        :class:`QueryAnalysis` whose ``temporal_constraint`` is set
        when extraction succeeded. ``has_constraints()`` is the
        idiomatic check. An empty analysis is returned for a blank
        query, when nothing is found, and when the LLM call fails
        (the failure is logged at WARNING and not re-raised).
    """
    if not query or not query.strip():
        return QueryAnalysis()

    ref = reference_date or datetime.now(timezone.utc)
    if ref.tzinfo is None:
        ref = ref.replace(tzinfo=timezone.utc)

    # Regex pre-pass (with optional M18a-1 expansion).
    regex_hit = _regex_temporal_pass(
        query,
        reference=ref,
        allow_temporal_expansion=allow_temporal_expansion,
    )
    if regex_hit is not None:
        return QueryAnalysis(
            temporal_constraint=regex_hit,
            rationale="regex match",
        )

    # LLM fallback (gated).
    if not allow_llm_fallback or llm_provider is None:
        return QueryAnalysis()
    if not _has_temporal_marker(query):
        # No temporal-marker token — the LLM is unlikely to find a
        # constraint that the regex missed. Save the call.
        return QueryAnalysis()

    try:
        completion = await llm_provider.complete(
            [
                Message(role="system", content=_LLM_SYSTEM_PROMPT),
                Message(role="user", content=_build_llm_user_prompt(query, ref)),
            ],
            max_tokens=256,
            temperature=0.0,
        )
    except Exception as exc:
        # Best-effort boundary: a provider failure must not break recall.
        _logger.warning("query_analyzer LLM fallback failed (%s)", exc)
        return QueryAnalysis()

    constraint = _parse_llm_response(completion.text)
    if constraint is None or not constraint.is_bounded():
        return QueryAnalysis()
    return QueryAnalysis(
        temporal_constraint=constraint,
        rationale="llm fallback",
    )