Coverage for astrocyte/pipeline/temporal_dateparser.py: 46%
52 statements
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
1"""Dateparser-based temporal extraction (Hindsight-parity Pass B).
3The hand-rolled regex passes in ``temporal_expressions.py`` (Pass A)
4cover fuzzy/range expressions like "a few weeks ago" and "couple of
5months ago" that a general date parser would mishandle as single
6points. But they miss the **vast majority** of temporal expressions
7that real benchmark questions contain:
9 - Named dates: "March 15", "the 3rd of June"
10 - ISO dates: "2024-06-01"
11 - Weekdays: "Tuesday", "last Friday"
12 - Ordinals: "the 5th"
13 - Implicit relative: "2 weeks ago" (covered by Pass A regex too)
14 - Multi-language: "ayer" (Spanish), "letztes Jahr" (German)
16The ``dateparser`` library covers all of the above with a single API
17(``dateparser.search.search_dates``). Hindsight uses this as their
18default temporal analyzer; their codebase shows two production lessons
19we copy verbatim:
21 1. **Defensive try/except** — dateparser has been observed to crash
22 with internal errors (IndexError from locale.translate_search and
23 similar) on certain inputs. A parser bug must NOT bring down the
24 retrieval pipeline. Treat any failure as "no constraint found"
25 and fall back to non-temporal recall.
27 2. **False-positive filter** for short common words that
28 dateparser misparses as dates: ``{"do", "may", "march", "will",
29 "can", "sat", "sun", "mon", ...}``. Without this filter, the
30 question "What can I do?" extracts "do" as a date.
32This is Pass B in the chain: it runs AFTER the precise regex passes
33(``_try_iso_date``, ``_try_relative``, ``_try_temporal_expansion``,
34``_try_month_year``, ``_try_year``) so narrower exact matches still
35win. Pass B is the **wide-net catchall** for everything else.
Public API:
    extract_temporal_range_via_dateparser(query, anchor)
        -> tuple[datetime, datetime] | None
    widen_to_neighbourhood(range_, *, pad_days=1)
        -> tuple[datetime, datetime]
40"""
42from __future__ import annotations
44import logging
45from datetime import datetime, timedelta
_logger = logging.getLogger("astrocyte.pipeline.temporal_dateparser")

# Tri-state cache of the lazy-import outcome: ``None`` until the first
# load attempt, then ``True`` (import succeeded) or ``False`` (import
# failed). Caching the result means the missing-dependency message is
# logged once instead of on every recall.
_DATEPARSER_AVAILABLE: bool | None = None
# Rebound to ``dateparser.search.search_dates`` by ``_lazy_load`` when
# the import succeeds; stays ``None`` otherwise.
_search_dates = None  # type: ignore[var-annotated]

# Short tokens that dateparser frequently misparses as dates. The
# filter in ``extract_temporal_range_via_dateparser`` drops a match only
# when its stripped, lower-cased text is in this set AND is at most 4
# characters long, so "march" (5 chars) is listed here but is not
# dropped by that check — a hit such as "march" can still be a real cue
# when surrounded by clear date context; dateparser's own context
# handling decides.
# NOTE(review): confirm that letting "march" through is intended.
# Mirrors Hindsight's set; kept English-only because our benches are
# English-only (LME, LoCoMo).
_FALSE_POSITIVES: frozenset[str] = frozenset(
    {
        "do", "may", "march", "will", "can",
        "sat", "sun", "mon", "tue", "wed", "thu", "fri",
        "i", "a", "an", "the", "is", "it",
    }
)
def _lazy_load() -> bool:
    """Import dateparser on first use and remember the outcome.

    Returns:
        ``True`` when ``dateparser.search.search_dates`` is importable
        (and has been bound to the module-level ``_search_dates``),
        ``False`` when the package is missing. The result is cached in
        ``_DATEPARSER_AVAILABLE`` so the import is attempted, and the
        missing-dependency message logged, only once.
    """
    global _DATEPARSER_AVAILABLE, _search_dates

    # Fast path: a previous call already settled the question.
    if _DATEPARSER_AVAILABLE is None:
        try:
            from dateparser.search import search_dates  # noqa: PLC0415
        except ImportError:
            _logger.info(
                "temporal_dateparser: 'dateparser' not installed; "
                "Pass B disabled. Install with `pip install dateparser` "
                "or via the `bench` extra to enable.",
            )
            _DATEPARSER_AVAILABLE = False
        else:
            _search_dates = search_dates
            # Warm-up call: forces dateparser to build its lazily
            # loaded regex tables / locale data now, so the first real
            # recall doesn't pay the cold-start cost. A failure here is
            # deliberately ignored — the warm-up is best-effort only.
            try:
                search_dates("today")
            except Exception:  # noqa: BLE001
                pass
            _DATEPARSER_AVAILABLE = True

    return _DATEPARSER_AVAILABLE
96def extract_temporal_range_via_dateparser(
97 query: str,
98 anchor: datetime,
99) -> tuple[datetime, datetime] | None:
100 """Extract a date range from ``query`` using the dateparser library.
102 Returns ``(start, end)`` for a single-day window centered on the
103 first valid date found, or ``None`` when no date is found, the
104 dependency is missing, or the only matches are filtered false
105 positives.
107 The returned range is a single day [00:00:00, 23:59:59.999999] —
108 callers that want a wider window should widen it themselves. This
109 mirrors Hindsight's contract.
111 Args:
112 query: The user's question.
113 anchor: Reference "now" for relative expressions (e.g., the
114 document's latest session timestamp).
115 """
116 if not query:
117 return None
118 if not _lazy_load():
119 return None
121 settings = {
122 "RELATIVE_BASE": anchor,
123 "PREFER_DATES_FROM": "past",
124 "RETURN_AS_TIMEZONE_AWARE": False,
125 }
127 # Wrap the parser call in try/except — dateparser has known bugs
128 # (IndexError from locale.translate_search, KeyError from broken
129 # internal tables) that must not crash the recall pipeline.
130 try:
131 results = _search_dates(query, settings=settings) # type: ignore[misc]
132 except Exception as exc: # noqa: BLE001
133 _logger.warning(
134 "temporal_dateparser: dateparser raised %s; "
135 "treating as no temporal constraint. query=%r",
136 type(exc).__name__, query[:80],
137 )
138 return None
140 if not results:
141 return None
143 # Filter false positives. Hindsight's rule: a short token (≤3 chars
144 # OR in the false-positive set with length≤4) gets dropped because
145 # dateparser misparses common words as dates.
146 valid: list[tuple[str, datetime]] = []
147 for text, parsed in results:
148 t = text.strip().lower()
149 if t in _FALSE_POSITIVES and len(t) <= 4:
150 continue
151 if len(t) <= 2:
152 # Two-letter tokens are almost never legitimate dates.
153 continue
154 valid.append((text, parsed))
156 if not valid:
157 return None
159 # Use the first valid date. Hindsight does the same — multi-date
160 # disambiguation is the LLM-fallback path's job, not the cheap
161 # extractor's.
162 _, parsed_date = valid[0]
163 start = parsed_date.replace(hour=0, minute=0, second=0, microsecond=0)
164 end = parsed_date.replace(hour=23, minute=59, second=59, microsecond=999999)
165 return (start, end)
def widen_to_neighbourhood(
    range_: tuple[datetime, datetime],
    *,
    pad_days: int = 1,
) -> tuple[datetime, datetime]:
    """Pad a ``(start, end)`` range by ``pad_days`` on both sides.

    A single-day dateparser hit is often too tight for fact-grain
    retrieval: a question about "what happened on June 5th" may be
    answered by a fact dated June 4th or June 6th. Callers should widen
    the dateparser range before handing it to ``search_facts_temporal``.

    Args:
        range_: The ``(start, end)`` pair to widen.
        pad_days: Days subtracted from the start and added to the end.
            The default of 1 turns a one-day range into a 3-day window;
            use larger pads for narrower-resolution fact corpora.

    Returns:
        A new ``(start - pad, end + pad)`` tuple.
    """
    lower, upper = range_
    padding = timedelta(days=pad_days)
    return (lower - padding, upper + padding)