Coverage for astrocyte/pipeline/query_analyzer.py: 81%

204 statements  

« prev     ^ index     » next       coverage.py v7.15.0, created at 2026-07-04 05:24 +0000

1"""Query analyzer — structured extraction of temporal constraints. 

2 

3Many recall queries embed temporal scoping ("what happened last spring?", 

4"who did Alice meet in March 2024?", "events before the launch"). The 

5default semantic-similarity recall treats time-words as just more text; 

6without parsing them out, the system can't filter or boost evidence by 

7date range, and temporal-category questions on benchmarks like LoCoMo 

8suffer. 

9 

10This module exposes :func:`analyze_query`, which returns a 

11:class:`QueryAnalysis` describing the structured constraints embedded 

12in a query. Currently the only constraint type is 

13:class:`TemporalConstraint`; the API is shaped to accept additional 

14constraint types (location, entity, fact_type) without breaking 

15callers. 

16 

17Two-tier extraction: 

18 

191. **Regex pre-pass** (no LLM cost): catches the high-volume common 

20 patterns — explicit ISO dates, year-only mentions, ``last <unit>``, 

21 ``in <month> [<year>]``, ``yesterday`` / ``today``. Bounded to ~15 

22 patterns to keep the path predictable. 

232. **LLM fallback**: when the regex pass finds no match AND the query 

24 contains a temporal-marker token (configurable list), defer to an 

25 LLM call with a structured-JSON prompt. Skipped entirely when no 

26 temporal-marker is present — most queries don't mention time. 

27 

28The fallback only runs when an ``llm_provider`` is passed and 

29``allow_llm_fallback`` (default ``True``) is left on; otherwise 

30``analyze_query`` is a fast, deterministic path recall can call on every request without budget concerns. 

31""" 

32 

33from __future__ import annotations 

34 

35import json 

36import logging 

37import re 

38from dataclasses import dataclass 

39from datetime import datetime, timedelta, timezone 

40 

41from astrocyte.types import Message 

42 

43_logger = logging.getLogger("astrocyte.query_analyzer") 

44 

45 

46# --------------------------------------------------------------------------- 

47# Data shapes 

48# --------------------------------------------------------------------------- 

49 

50 

51@dataclass 

52class TemporalConstraint: 

53 """A time range extracted from a query. 

54 

55 Both endpoints are inclusive. Either may be ``None`` to express 

56 unbounded ("before March 2024" → ``end_date`` set, ``start_date`` 

57 None; "since Q1 2024" → ``start_date`` set, ``end_date`` None). 

58 """ 

59 

60 start_date: datetime | None = None 

61 end_date: datetime | None = None 

62 

63 def __str__(self) -> str: 

64 s = self.start_date.strftime("%Y-%m-%d") if self.start_date else "any" 

65 e = self.end_date.strftime("%Y-%m-%d") if self.end_date else "any" 

66 return f"{s} to {e}" 

67 

68 def is_bounded(self) -> bool: 

69 return self.start_date is not None or self.end_date is not None 

70 

71 

@dataclass
class QueryAnalysis:
    """Outcome of analysing one query for structured constraints."""

    temporal_constraint: TemporalConstraint | None = None
    #: Short note on why the analyzer flagged this constraint — meant for
    #: debugging / observability. Set on both regex and LLM hits.
    rationale: str = ""

    def has_constraints(self) -> bool:
        """Return True when a temporal constraint exists and has an endpoint."""
        constraint = self.temporal_constraint
        if constraint is None:
            return False
        return constraint.is_bounded()

83 

84 

85# --------------------------------------------------------------------------- 

86# Regex pre-pass 

87# --------------------------------------------------------------------------- 

88 

89 

90# Patterns that are clear enough to extract without an LLM call. 

91# Each entry returns ``(start, end, rationale)`` when matched. ``None`` 

92# for either endpoint means open-ended. 

93 

# Month-name lookup: every accepted spelling (full name plus abbreviations)
# maps to its 1-based month number. Each tuple below lists the aliases for
# one month, in calendar order.
_MONTHS = {
    alias: number
    for number, aliases in enumerate(
        (
            ("january", "jan"),
            ("february", "feb"),
            ("march", "mar"),
            ("april", "apr"),
            ("may",),
            ("june", "jun"),
            ("july", "jul"),
            ("august", "aug"),
            ("september", "sep", "sept"),
            ("october", "oct"),
            ("november", "nov"),
            ("december", "dec"),
        ),
        start=1,
    )
    for alias in aliases
}

120 

121 

def _utc(year: int, month: int = 1, day: int = 1) -> datetime:
    """Build a midnight UTC datetime; ``month`` and ``day`` default to 1."""
    return datetime(year=year, month=month, day=day, tzinfo=timezone.utc)

124 

125 

def _month_end(year: int, month: int) -> datetime:
    """Return the last second (23:59:59 UTC) of the given month."""
    # First day of the following month, rolling December over into January.
    following = (year + 1, 1) if month == 12 else (year, month + 1)
    return _utc(*following, 1) - timedelta(seconds=1)

130 

131 

132def _try_iso_date(query: str) -> tuple[datetime | None, datetime | None, str] | None: 

133 """Match explicit ISO dates: ``2024-03-15``, ``2024-03``, ``2024``.""" 

134 # YYYY-MM-DD 

135 m = re.search(r"\b(\d{4})-(\d{2})-(\d{2})\b", query) 

136 if m: 

137 try: 

138 d = _utc(int(m.group(1)), int(m.group(2)), int(m.group(3))) 

139 return d, d + timedelta(days=1) - timedelta(seconds=1), f"explicit date {m.group(0)}" 

140 except ValueError: 

141 pass 

142 # YYYY-MM 

143 m = re.search(r"\b(\d{4})-(\d{2})\b", query) 

144 if m: 

145 try: 

146 year, month = int(m.group(1)), int(m.group(2)) 

147 return _utc(year, month), _month_end(year, month), f"explicit month {m.group(0)}" 

148 except ValueError: 

149 pass 

150 return None 

151 

152 

153def _try_year(query: str) -> tuple[datetime | None, datetime | None, str] | None: 

154 """Match standalone 4-digit year (``in 2024``, ``during 2023``).""" 

155 m = re.search(r"\b(?:in|during|from)\s+(\d{4})\b", query, re.IGNORECASE) 

156 if m: 

157 year = int(m.group(1)) 

158 if 1900 <= year <= 2100: 

159 return _utc(year), _utc(year + 1) - timedelta(seconds=1), f"year-only {year}" 

160 # Bare year token at start/end of clause. 

161 m = re.search(r"\b(\d{4})\b", query) 

162 if m: 

163 year = int(m.group(1)) 

164 if 1900 <= year <= 2100: 

165 return _utc(year), _utc(year + 1) - timedelta(seconds=1), f"bare year {year}" 

166 return None 

167 

168 

169def _try_month_year(query: str) -> tuple[datetime | None, datetime | None, str] | None: 

170 """Match ``in March 2024``, ``March 2024``, ``March`` (with reference year).""" 

171 pattern = r"\b(jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|sept|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)(?:\s+(\d{4}))?\b" 

172 m = re.search(pattern, query, re.IGNORECASE) 

173 if not m: 

174 return None 

175 month = _MONTHS.get(m.group(1).lower()) 

176 if month is None: 

177 return None 

178 year_str = m.group(2) 

179 if year_str: 

180 year = int(year_str) 

181 return _utc(year, month), _month_end(year, month), f"{m.group(1)} {year}" 

182 # Without an explicit year, we can't resolve — skip the regex hit. 

183 # The LLM fallback can attempt this with a reference_date. 

184 return None 

185 

186 

187def _try_relative( 

188 query: str, 

189 *, 

190 reference: datetime, 

191) -> tuple[datetime | None, datetime | None, str] | None: 

192 """Resolve relative expressions (yesterday, last week, X days ago).""" 

193 q = query.lower() 

194 # Yesterday / today 

195 if re.search(r"\byesterday\b", q): 

196 d = reference - timedelta(days=1) 

197 d = d.replace(hour=0, minute=0, second=0, microsecond=0) 

198 return d, d + timedelta(days=1) - timedelta(seconds=1), "yesterday" 

199 if re.search(r"\btoday\b", q): 

200 d = reference.replace(hour=0, minute=0, second=0, microsecond=0) 

201 return d, d + timedelta(days=1) - timedelta(seconds=1), "today" 

202 # Last <unit> 

203 m = re.search(r"\blast\s+(week|month|year)\b", q) 

204 if m: 

205 unit = m.group(1) 

206 if unit == "week": 

207 end = reference - timedelta(days=reference.weekday()) 

208 start = end - timedelta(days=7) 

209 return start, end - timedelta(seconds=1), "last week" 

210 if unit == "month": 

211 year, month = reference.year, reference.month 

212 month -= 1 

213 if month == 0: 

214 month, year = 12, year - 1 

215 return _utc(year, month), _month_end(year, month), "last month" 

216 if unit == "year": 

217 year = reference.year - 1 

218 return _utc(year), _utc(year + 1) - timedelta(seconds=1), "last year" 

219 # N units ago 

220 m = re.search(r"\b(\d+)\s+(day|week|month|year)s?\s+ago\b", q) 

221 if m: 

222 n = int(m.group(1)) 

223 unit = m.group(2) 

224 if unit == "day": 

225 d = reference - timedelta(days=n) 

226 return d.replace(hour=0, minute=0, second=0, microsecond=0), reference, f"{n} day(s) ago" 

227 if unit == "week": 

228 d = reference - timedelta(weeks=n) 

229 return d, reference, f"{n} week(s) ago" 

230 # month/year — approximate; LLM fallback handles precision. 

231 if unit == "month": 

232 d = reference - timedelta(days=30 * n) 

233 return d, reference, f"~{n} month(s) ago" 

234 if unit == "year": 

235 d = reference - timedelta(days=365 * n) 

236 return d, reference, f"~{n} year(s) ago" 

237 return None 

238 

239 

def _try_temporal_expansion(
    query: str,
    *,
    reference: datetime,
) -> tuple[datetime | None, datetime | None, str] | None:
    """M18a-1 Pass A: delegate to the extended relative-time matcher.

    Hands the query to ``temporal_expressions.expand_temporal_expression``.
    Per the notes kept with this pass, that helper targets phrasings the
    core ``_try_relative`` misses:
      - word-numbers ("one/two/.../ten X ago")
      - "a few / few X ago" — vague, pinned to roughly 2-5 units
      - "couple of / a couple X ago" — ~2-3 units
      - "the other day/week/month"
      - "this / earlier this <unit>"
      - "recently / just now / lately"

    Returns:
        ``(start, end, "temporal-expansion match")`` when the helper yields
        a range, otherwise ``None``.
    """
    # Deferred import: the helper module is only loaded when this pass runs,
    # so the broader module load doesn't pay for it when the flag is off.
    from astrocyte.pipeline.temporal_expressions import (  # noqa: PLC0415
        expand_temporal_expression,
    )

    window = expand_temporal_expression(query, reference)
    if window is not None:
        window_start, window_end = window
        return window_start, window_end, "temporal-expansion match"
    return None

267 

268 

def _try_dateparser(
    query: str,
    *,
    reference: datetime,
) -> tuple[datetime | None, datetime | None, str] | None:
    """M18a-1 Pass B: wide-net date extraction via ``temporal_dateparser``.

    Per the notes kept with this pass, it is meant for Hindsight parity and
    to catch what the curated regex set misses:
      - Named dates ("March 15", "the 3rd of June")
      - ISO ("2024-06-01") and weekday ("Tuesday", "last Friday") refs
      - Ordinals ("the 5th of last month")
      - Implicit multi-language ("ayer", "letztes Jahr") — when the
        dataset is multilingual

    The extracted range is widened by one day on each side, since
    fact-grain dates can be one day off the question's mention date.

    NOTE(review): the original notes say this no-ops (returns None) when the
    ``dateparser`` package is missing — see ``pyproject.toml`` ``[bench]``
    extras. That handling is presumably inside ``temporal_dateparser``;
    confirm there.

    Returns:
        ``(start, end, "dateparser match (±1d)")`` or ``None``.
    """
    # Deferred import so the module load doesn't pay the ``dateparser``
    # startup cost when the flag is off.
    from astrocyte.pipeline.temporal_dateparser import (  # noqa: PLC0415
        extract_temporal_range_via_dateparser,
        widen_to_neighbourhood,
    )

    exact = extract_temporal_range_via_dateparser(query, reference)
    if exact is None:
        return None
    padded_start, padded_end = widen_to_neighbourhood(exact, pad_days=1)
    return padded_start, padded_end, "dateparser match (±1d)"

303 

304 

def _regex_temporal_pass(
    query: str,
    *,
    reference: datetime,
    allow_temporal_expansion: bool = False,
) -> TemporalConstraint | None:
    """Run the extractors in priority order and return the first hit.

    Default order: ISO date, relative expression, month + year, year.
    With ``allow_temporal_expansion=True`` (M18a-1 flag) two more passes
    join the chain; with it off, behavior matches the legacy pre-M18 path.

    Returns:
        A :class:`TemporalConstraint` built from the first extractor that
        matches (its rationale string is dropped), or ``None``.
    """
    # Extractors that resolve against a reference date take it as a keyword.
    reference_aware = (_try_relative, _try_temporal_expansion, _try_dateparser)

    extractors = [_try_iso_date, _try_relative]
    if allow_temporal_expansion:
        # Pass A sits right after _try_relative: the narrow exact matches
        # (yesterday/today/last week/N units ago) still win when they apply,
        # and the expansion patterns pick up fuzzy quantifiers, word-numbers
        # and "lately".
        extractors.append(_try_temporal_expansion)
    extractors.extend((_try_month_year, _try_year))
    if allow_temporal_expansion:
        # Pass B goes LAST so every precise pass gets first crack; it is the
        # wide-net catchall for named dates ("March 15"), weekdays, ISO refs.
        extractors.append(_try_dateparser)

    for extractor in extractors:
        if extractor in reference_aware:
            outcome = extractor(query, reference=reference)  # type: ignore[arg-type]
        else:
            outcome = extractor(query)  # type: ignore[arg-type]
        if outcome is None:
            continue
        lower, upper, _rationale = outcome
        return TemporalConstraint(start_date=lower, end_date=upper)
    return None

340 

341 

342# --------------------------------------------------------------------------- 

343# Temporal-marker detection (cheap gate before LLM fallback) 

344# --------------------------------------------------------------------------- 

345 

346 

# Lower-case words that hint a query may carry temporal scope. Used as a
# cheap gate before the LLM fallback.
_TEMPORAL_MARKERS = set(
    (
        # deictic days
        "yesterday today tomorrow "
        # relative / ordering words
        "last this next ago before after since until during when while then "
        "now recently earlier later previous previously "
        # frequency words
        "ever never always "
        # units and parts of the day
        "year month week day morning evening "
        # seasons
        "spring summer fall autumn winter"
    ).split()
)

384 

385 

def _has_temporal_marker(query: str) -> bool:
    """Cheap word-level test for temporal markers in the query.

    Returns True when any lower-cased alphabetic token is in
    ``_TEMPORAL_MARKERS``, or when the query contains a standalone 4-digit
    number. The number check also runs for queries with no letters at all
    (e.g. ``"1850?"``).
    """
    words = re.findall(r"[a-z]+", query.lower())
    if any(word in _TEMPORAL_MARKERS for word in words):
        return True
    # Year mentions (4-digit) count as temporal too.
    return bool(re.search(r"\b\d{4}\b", query))

395 

396 

397# --------------------------------------------------------------------------- 

398# LLM fallback 

399# --------------------------------------------------------------------------- 

400 

401 

# System prompt for the LLM fallback. The example in rule 4 uses the same
# timezone-qualified format rule 3 demands, with the end date at the last
# second of the day so the inclusive range keeps its final day.
_LLM_SYSTEM_PROMPT = """\
You extract a TIME RANGE from a query when one is implied.

Output a JSON object: {"start_date": "<ISO date or null>", "end_date": \
"<ISO date or null>", "rationale": "<1 sentence>"}.

Rules:
1. If the query has no temporal scope, return {"start_date": null, \
"end_date": null, "rationale": "no temporal scope"}.
2. Both dates are inclusive. Use null for open-ended ranges \
("before March 2024" → end_date set, start_date null).
3. ISO 8601 with timezone: "2024-03-01T00:00:00Z".
4. Use the supplied reference date to resolve relative expressions \
("last spring" relative to a 2025-01-15 reference is \
"2024-03-01T00:00:00Z" to "2024-05-31T23:59:59Z").
5. Output JSON only. No prose.
"""

419 

420 

def _build_llm_user_prompt(query: str, reference: datetime) -> str:
    """Compose the user message: reference date, stripped query, answer cue."""
    reference_line = f"Reference date: {reference.isoformat()}"
    query_line = f"Query: {query.strip()}"
    # The empty entry yields the blank line before the answer cue.
    return "\n".join((reference_line, query_line, "", "Time range (JSON):"))

423 

424 

def _parse_llm_response(raw: str) -> TemporalConstraint | None:
    """Turn the LLM's reply into a :class:`TemporalConstraint`.

    The first JSON object found in the reply is used, so Markdown code
    fences, surrounding prose, or a second brace group after the object do
    not break parsing.

    Args:
        raw: Text returned by the LLM. Anything that is not a string
            (e.g. ``None`` from an empty completion) yields ``None``.

    Returns:
        A constraint with at least one endpoint, or ``None`` when the reply
        holds no JSON object or neither ``start_date`` nor ``end_date``
        parses as an ISO date.
    """
    if not isinstance(raw, str):
        return None

    # Decode from each "{" in turn; raw_decode stops at the end of the first
    # complete object and ignores whatever follows it.
    decoder = json.JSONDecoder()
    parsed = None
    position = raw.find("{")
    while position != -1:
        try:
            candidate, _end = decoder.raw_decode(raw, position)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            parsed = candidate
            break
        position = raw.find("{", position + 1)
    if parsed is None:
        return None

    def _parse_iso(value) -> datetime | None:
        # Accept ISO strings only; a trailing "Z" is rewritten to an explicit
        # UTC offset, and naive results are treated as UTC.
        if not value or not isinstance(value, str):
            return None
        s = value.strip()
        if s.endswith("Z"):
            s = s[:-1] + "+00:00"
        try:
            dt = datetime.fromisoformat(s)
        except ValueError:
            return None
        return dt if dt.tzinfo else dt.replace(tzinfo=timezone.utc)

    start = _parse_iso(parsed.get("start_date"))
    end = _parse_iso(parsed.get("end_date"))
    if start is None and end is None:
        return None
    return TemporalConstraint(start_date=start, end_date=end)

457 

458 

459# --------------------------------------------------------------------------- 

460# Entry point 

461# --------------------------------------------------------------------------- 

462 

463 

async def analyze_query(
    query: str,
    *,
    reference_date: datetime | None = None,
    llm_provider=None,
    allow_llm_fallback: bool = True,
    allow_temporal_expansion: bool = False,
) -> QueryAnalysis:
    """Extract structured constraints from a query.

    Args:
        query: User question / recall query. Empty or whitespace-only
            input yields an empty :class:`QueryAnalysis`.
        reference_date: Anchor for relative expressions ("last week",
            "yesterday"). Defaults to ``datetime.now(UTC)``; a naive
            value is treated as UTC.
        llm_provider: Object whose awaitable ``complete(messages, ...)``
            is used for the fallback. When ``None`` the fallback never
            runs, whatever ``allow_llm_fallback`` says.
        allow_llm_fallback: Defaults to True. When True and a provider is
            supplied, an LLM call is made after the regex pass when
            (a) no regex matched AND (b) the query contains a
            temporal-marker token. When False, only the regex path runs
            (deterministic, no LLM cost).
        allow_temporal_expansion: M18a-1 flag. When True, the regex
            pre-pass also runs the extended passes
            (``astrocyte.pipeline.temporal_expressions`` and the
            dateparser pass). Default False — preserves legacy behavior.

    Returns:
        :class:`QueryAnalysis` whose ``temporal_constraint`` is set when
        extraction succeeded, with ``rationale`` of ``"regex match"`` or
        ``"llm fallback"``. ``has_constraints()`` is the idiomatic check.
        A failed LLM call is logged as a warning and yields an empty
        analysis rather than raising.
    """
    if not (query and query.strip()):
        return QueryAnalysis()

    anchor = reference_date or datetime.now(timezone.utc)
    if anchor.tzinfo is None:
        anchor = anchor.replace(tzinfo=timezone.utc)

    # Regex pre-pass (with optional M18a-1 expansion).
    from_regex = _regex_temporal_pass(
        query,
        reference=anchor,
        allow_temporal_expansion=allow_temporal_expansion,
    )
    if from_regex is not None:
        return QueryAnalysis(temporal_constraint=from_regex, rationale="regex match")

    # LLM fallback is gated three ways: opted in, provider present, and a
    # temporal-marker token in the query (otherwise the call is unlikely to
    # find anything the regex missed).
    if not allow_llm_fallback or llm_provider is None or not _has_temporal_marker(query):
        return QueryAnalysis()

    try:
        completion = await llm_provider.complete(
            [
                Message(role="system", content=_LLM_SYSTEM_PROMPT),
                Message(role="user", content=_build_llm_user_prompt(query, anchor)),
            ],
            max_tokens=256,
            temperature=0.0,
        )
    except Exception as exc:
        # Best-effort: a provider failure must not break recall.
        _logger.warning("query_analyzer LLM fallback failed (%s)", exc)
        return QueryAnalysis()

    constraint = _parse_llm_response(completion.text)
    if constraint is not None and constraint.is_bounded():
        return QueryAnalysis(temporal_constraint=constraint, rationale="llm fallback")
    return QueryAnalysis()