Coverage for astrocyte/pipeline/temporal_dateparser.py: 46%

52 statements  

« prev     ^ index     » next       coverage.py v7.15.0, created at 2026-07-04 05:24 +0000

1"""Dateparser-based temporal extraction (Hindsight-parity Pass B). 

2 

3The hand-rolled regex passes in ``temporal_expressions.py`` (Pass A) 

4cover fuzzy/range expressions like "a few weeks ago" and "couple of 

5months ago" that a general date parser would mishandle as single 

6points. But they miss the **vast majority** of temporal expressions 

7that real benchmark questions contain: 

8 

9 - Named dates: "March 15", "the 3rd of June" 

10 - ISO dates: "2024-06-01" 

11 - Weekdays: "Tuesday", "last Friday" 

12 - Ordinals: "the 5th" 

13 - Implicit relative: "2 weeks ago" (covered by Pass A regex too) 

14 - Multi-language: "ayer" (Spanish), "letztes Jahr" (German) 

15 

16The ``dateparser`` library covers all of the above with a single API 

17(``dateparser.search.search_dates``). Hindsight uses this as their 

18default temporal analyzer; their codebase shows two production lessons 

19we copy verbatim: 

20 

21 1. **Defensive try/except** — dateparser has been observed to crash 

22 with internal errors (IndexError from locale.translate_search and 

23 similar) on certain inputs. A parser bug must NOT bring down the 

24 retrieval pipeline. Treat any failure as "no constraint found" 

25 and fall back to non-temporal recall. 

26 

27 2. **False-positive filter** for short common words that 

28 dateparser misparses as dates: ``{"do", "may", "march", "will", 

29 "can", "sat", "sun", "mon", ...}``. Without this filter, the 

30 question "What can I do?" extracts "do" as a date. 

31 

32This is Pass B in the chain: it runs AFTER the precise regex passes 

33(``_try_iso_date``, ``_try_relative``, ``_try_temporal_expansion``, 

34``_try_month_year``, ``_try_year``) so narrower exact matches still 

35win. Pass B is the **wide-net catchall** for everything else. 

36 

37Public API: 

38 extract_temporal_range_via_dateparser(query, anchor) 

39 -> tuple[datetime, datetime] | None 

40""" 

41 

42from __future__ import annotations 

43 

44import logging 

45from datetime import datetime, timedelta 

46 

# Module-level logger shared by every function in this file.
_logger = logging.getLogger("astrocyte.pipeline.temporal_dateparser")

# Tri-state import cache consulted by ``_lazy_load``:
#   None  -> the import has not been attempted yet
#   True  -> dateparser was imported successfully
#   False -> the import raised ImportError
# It is set on the first attempt (success or failure), so the
# missing-dependency message is logged once instead of on every recall.
_DATEPARSER_AVAILABLE: bool | None = None
# Rebound to ``dateparser.search.search_dates`` by ``_lazy_load`` on success;
# stays None until then.
_search_dates = None  # type: ignore[var-annotated]

52 

53# Short tokens that dateparser frequently misparses as dates. Anything 

54# of length ≤ 3 in this set is filtered out (longer hits like 

55# "march" inside "Marching band" can still be a real cue when surrounded 

56# by clear date context; dateparser's own context handling decides). 

57# Mirrors Hindsight's set; kept English-only because our benches are 

58# English-only (LME, LoCoMo). 

59_FALSE_POSITIVES: frozenset[str] = frozenset( 

60 { 

61 "do", "may", "march", "will", "can", 

62 "sat", "sun", "mon", "tue", "wed", "thu", "fri", 

63 "i", "a", "an", "the", "is", "it", 

64 } 

65) 

66 

67 

def _lazy_load() -> bool:
    """Import dateparser on first use and remember the outcome.

    The result is cached in ``_DATEPARSER_AVAILABLE``; every call after
    the first returns the cached flag without retrying the import, so
    the missing-dependency message is logged at most once.

    Returns:
        True when ``dateparser.search.search_dates`` was imported (it is
        then bound to the module-level ``_search_dates``), False when
        the import raised ``ImportError``.
    """
    global _DATEPARSER_AVAILABLE, _search_dates

    if _DATEPARSER_AVAILABLE is None:
        try:
            from dateparser.search import search_dates  # noqa: PLC0415
        except ImportError:
            _logger.info(
                "temporal_dateparser: 'dateparser' not installed; "
                "Pass B disabled. Install with `pip install dateparser` "
                "or via the `bench` extra to enable.",
            )
            _DATEPARSER_AVAILABLE = False
        else:
            _search_dates = search_dates
            # Best-effort warm-up: one throwaway parse so dateparser's
            # lazily built regex tables / locale data are loaded before
            # the first real recall. A failure here is deliberately
            # ignored — the import itself succeeded.
            try:
                search_dates("today")
            except Exception:  # noqa: BLE001
                pass
            _DATEPARSER_AVAILABLE = True

    return _DATEPARSER_AVAILABLE

94 

95 

96def extract_temporal_range_via_dateparser( 

97 query: str, 

98 anchor: datetime, 

99) -> tuple[datetime, datetime] | None: 

100 """Extract a date range from ``query`` using the dateparser library. 

101 

102 Returns ``(start, end)`` for a single-day window centered on the 

103 first valid date found, or ``None`` when no date is found, the 

104 dependency is missing, or the only matches are filtered false 

105 positives. 

106 

107 The returned range is a single day [00:00:00, 23:59:59.999999] — 

108 callers that want a wider window should widen it themselves. This 

109 mirrors Hindsight's contract. 

110 

111 Args: 

112 query: The user's question. 

113 anchor: Reference "now" for relative expressions (e.g., the 

114 document's latest session timestamp). 

115 """ 

116 if not query: 

117 return None 

118 if not _lazy_load(): 

119 return None 

120 

121 settings = { 

122 "RELATIVE_BASE": anchor, 

123 "PREFER_DATES_FROM": "past", 

124 "RETURN_AS_TIMEZONE_AWARE": False, 

125 } 

126 

127 # Wrap the parser call in try/except — dateparser has known bugs 

128 # (IndexError from locale.translate_search, KeyError from broken 

129 # internal tables) that must not crash the recall pipeline. 

130 try: 

131 results = _search_dates(query, settings=settings) # type: ignore[misc] 

132 except Exception as exc: # noqa: BLE001 

133 _logger.warning( 

134 "temporal_dateparser: dateparser raised %s; " 

135 "treating as no temporal constraint. query=%r", 

136 type(exc).__name__, query[:80], 

137 ) 

138 return None 

139 

140 if not results: 

141 return None 

142 

143 # Filter false positives. Hindsight's rule: a short token (≤3 chars 

144 # OR in the false-positive set with length≤4) gets dropped because 

145 # dateparser misparses common words as dates. 

146 valid: list[tuple[str, datetime]] = [] 

147 for text, parsed in results: 

148 t = text.strip().lower() 

149 if t in _FALSE_POSITIVES and len(t) <= 4: 

150 continue 

151 if len(t) <= 2: 

152 # Two-letter tokens are almost never legitimate dates. 

153 continue 

154 valid.append((text, parsed)) 

155 

156 if not valid: 

157 return None 

158 

159 # Use the first valid date. Hindsight does the same — multi-date 

160 # disambiguation is the LLM-fallback path's job, not the cheap 

161 # extractor's. 

162 _, parsed_date = valid[0] 

163 start = parsed_date.replace(hour=0, minute=0, second=0, microsecond=0) 

164 end = parsed_date.replace(hour=23, minute=59, second=59, microsecond=999999) 

165 return (start, end) 

166 

167 

def widen_to_neighbourhood(
    range_: tuple[datetime, datetime],
    *,
    pad_days: int = 1,
) -> tuple[datetime, datetime]:
    """Pad a date range by ``pad_days`` whole days on both ends.

    The exact-day hit from dateparser is often too tight for fact-grain
    retrieval: a question about "what happened on June 5th" may be
    answered by a fact dated June 4th or June 6th. Callers should widen
    the dateparser range before handing it to ``search_facts_temporal``.

    Args:
        range_: ``(start, end)`` pair to widen.
        pad_days: Days subtracted from ``start`` and added to ``end``.
            The default of 1 turns a single-day range into a 3-day
            window; use larger pads for narrower-resolution fact
            corpora.

    Returns:
        A new ``(start, end)`` tuple; the input tuple is not modified.
    """
    lower, upper = range_
    padding = timedelta(days=pad_days)
    return (lower - padding, upper + padding)
184 return (start - timedelta(days=pad_days), end + timedelta(days=pad_days))