Coverage for astrocyte/pipeline/temporal_expressions.py: 69%

68 statements  

« prev     ^ index     » next       coverage.py v7.15.0, created at 2026-07-04 05:24 +0000

1"""Query-time relative-temporal expansion. 

2 

3Maps relative temporal expressions in a question (``"a few weeks ago"``, 

4``"last month"``, ``"3 days ago"``) to absolute ISO date ranges using a 

5reference anchor date. Recall code can then feed the range to the 

6fact-grain temporal-search SPI without depending on every ingested 

7section having a structured ``occurred_start`` populated. 

8 

9Two prior attempts at INGEST-TIME structured-date extraction (M14.x, 

10M15.x) were reverted because per-fact temporal metadata is sparse — 

11many preference/opinion facts have no specific event time and stamping 

12one was net-negative across categories. Doing the expansion at QUERY 

13time avoids that problem: we widen recall when the question itself 

14asks about a time window, leaving ingest extraction untouched. 

15 

16Anchor date selection (caller-supplied): use the latest session 

17timestamp known for the document. "A few weeks ago" relative to a 

18conversation that ran in May 2023 should map to mid-April 2023, not 

19relative to wall-clock today. 

20 

21Conservative behaviour: if no temporal cue is found in the query, 

22return ``None``. Callers should fall back to non-temporal recall. 

23 

24Supported expressions (case-insensitive, allowed anywhere in the 

25query string): 

26 

27 - ``yesterday`` → (anchor − 2d, anchor) 

28 - ``today`` → (anchor − 1d, anchor + 1d) 

29 - ``last week`` / ``a week ago`` → (anchor − 14d, anchor − 5d) 

30 - ``this week`` → (anchor − 7d, anchor + 1d) 

31 - ``last month`` / ``a month ago`` → (anchor − 60d, anchor − 20d) 

32 - ``last year`` / ``a year ago`` → (anchor − 766.5d, anchor − 255.5d) 

33 - ``a few <unit>s ago`` → (anchor − 5×unit, anchor − 2×unit) 

34 - ``couple <unit>s ago`` → (anchor − 3.5×unit, anchor − 1.5×unit) 

35 - ``<N> <unit>s ago`` → (anchor − (N+1)×unit, anchor − max(N−1,0)×unit) 

36 - ``earlier this <unit>`` → (anchor − 1×unit, anchor) 

37 - ``recently`` / ``lately`` / ``just now`` → (anchor − 14d, anchor + 1d) 

38 

39Each "match" widens the window slightly so the recall SPI returns a 

40useful neighbourhood instead of an exact day-match (the user said "a 

41few weeks ago"; they mean roughly 2-5 weeks). 

42""" 

43 

44from __future__ import annotations 

45 

46import logging 

47import re 

48from datetime import datetime, timedelta 

49 

50logger = logging.getLogger("astrocyte.pipeline.temporal_expressions") 

51 

52 

53DateRange = tuple[datetime, datetime] 

54 

55 

56_UNIT_TO_DAYS: dict[str, int] = { 

57 "day": 1, 

58 "days": 1, 

59 "week": 7, 

60 "weeks": 7, 

61 "month": 30, 

62 "months": 30, 

63 "year": 365, 

64 "years": 365, 

65} 

66 

67 

68_NUMBER_WORD: dict[str, int] = { 

69 "one": 1, 

70 "two": 2, 

71 "three": 3, 

72 "four": 4, 

73 "five": 5, 

74 "six": 6, 

75 "seven": 7, 

76 "eight": 8, 

77 "nine": 9, 

78 "ten": 10, 

79} 

80 

81 

82def _coerce_n(token: str) -> int | None: 

83 token = token.strip().lower() 

84 if token.isdigit(): 

85 try: 

86 return int(token) 

87 except ValueError: 

88 return None 

89 return _NUMBER_WORD.get(token) 

90 

91 

92def _range_centred_on(anchor: datetime, days_ago: float, half_width_days: float) -> DateRange: 

93 """Window: ``[anchor − (days_ago + half_width), anchor − (days_ago − half_width)]``. 

94 

95 Clamps the lower bound to never go past 5 years before anchor (any 

96 document older than that is well outside the LME/LoCoMo bench scope 

97 and likely a parse error). 

98 """ 

99 start = anchor - timedelta(days=days_ago + half_width_days) 

100 end = anchor - timedelta(days=max(days_ago - half_width_days, 0)) 

101 floor = anchor - timedelta(days=365 * 5) 

102 if start < floor: 

103 start = floor 

104 return (start, end) 

105 

106 

def expand_temporal_expression(query: str, anchor: datetime) -> DateRange | None:
    """Map a relative-time expression in ``query`` to an absolute date range.

    Patterns are tried in a fixed priority order, not by position in the
    string; the first pattern that matches anywhere in the lower-cased
    query wins:

    1. ``<N> <unit>(s) ago`` (digits or ``one`` .. ``ten``)
    2. ``(a) few <unit>(s) ago``
    3. ``(a) couple (of) <unit>(s) ago``
    4. ``last <unit>`` / ``a <unit> ago`` / ``the other <unit>``
    5. ``earlier this <unit>``
    6. ``this <unit>``
    7. ``yesterday``, then ``today``, then ``recently`` / ``just now`` / ``lately``

    Args:
        query: Free-text question; matched case-insensitively.
        anchor: Reference "now" for relative expressions — typically the
            latest session timestamp for the document.

    Returns:
        A ``(start, end)`` pair of datetimes derived from ``anchor``, or
        ``None`` when ``query`` is empty or holds no supported cue.
    """
    if not query:
        return None
    q = query.lower()

    # "<N> <unit>s ago" — digit or word number, plural or singular unit.
    m = re.search(
        r"\b(\d+|one|two|three|four|five|six|seven|eight|nine|ten)\s+"
        r"(day|days|week|weeks|month|months|year|years)\s+ago\b",
        q,
    )
    if m:
        n = _coerce_n(m.group(1))
        unit_days = _UNIT_TO_DAYS.get(m.group(2))
        if n is not None and unit_days is not None:
            # Centre on N units back with one unit of slack either side.
            days_ago = n * unit_days
            half = max(unit_days, 1)
            return _range_centred_on(anchor, days_ago, half)

    # "a few <unit>s ago" — vague but pinned to roughly 2-5 units.
    m = re.search(
        r"\b(a\s+few|few)\s+(day|days|week|weeks|month|months|year|years)\s+ago\b",
        q,
    )
    if m:
        unit_days = _UNIT_TO_DAYS.get(m.group(2))
        if unit_days is not None:
            return _range_centred_on(anchor, 3.5 * unit_days, 1.5 * unit_days)

    # "couple <unit>s ago" — 1.5 to 3.5 units back.
    m = re.search(
        r"\b(a\s+couple\s+of|couple\s+of|a\s+couple|couple)\s+"
        r"(day|days|week|weeks|month|months|year|years)\s+ago\b",
        q,
    )
    if m:
        unit_days = _UNIT_TO_DAYS.get(m.group(2))
        if unit_days is not None:
            return _range_centred_on(anchor, 2.5 * unit_days, 1.0 * unit_days)

    # "last <unit>", "a <unit> ago" or "the other <unit>". The "a <unit>"
    # form must be followed by "ago": a bare "a week" ("three times a
    # week", "it took a day") is not a temporal cue and must not narrow
    # recall to a date window.
    m = re.search(
        r"\blast\s+(day|week|month|year)\b|"
        r"\ba\s+(day|week|month|year)\s+ago\b|"
        r"\bthe\s+other\s+(day|week|month)\b",
        q,
    )
    if m:
        # Exactly one alternative matched, so exactly one group is set.
        unit = next((g for g in m.groups() if g), "")
        unit_days = _UNIT_TO_DAYS.get(unit)
        if unit_days is not None:
            # Centre 1.4 units back, ±0.7 units: "last week" → ~5-15 days
            # ago; "last month" → 21-63d; "last year" → ~255-767d.
            return _range_centred_on(anchor, 1.4 * unit_days, 0.7 * unit_days)

    # "earlier this <unit>" — must be tested before the plain
    # "this <unit>" pattern below, which would otherwise match the same
    # text first and make this branch unreachable. The window stops at
    # the anchor because "earlier" excludes the future.
    m = re.search(r"\bearlier\s+this\s+(week|month|year)\b", q)
    if m:
        unit_days = _UNIT_TO_DAYS.get(m.group(1))
        if unit_days is not None:
            return (anchor - timedelta(days=unit_days), anchor)

    # "this <unit>" — current period, with a day of slack past the anchor.
    m = re.search(r"\bthis\s+(week|month|year)\b", q)
    if m:
        unit_days = _UNIT_TO_DAYS.get(m.group(1))
        if unit_days is not None:
            return (anchor - timedelta(days=unit_days), anchor + timedelta(days=1))

    # Single-word time anchors.
    if re.search(r"\byesterday\b", q):
        return (anchor - timedelta(days=2), anchor)
    if re.search(r"\btoday\b", q):
        return (anchor - timedelta(days=1), anchor + timedelta(days=1))
    if re.search(r"\brecently\b|\bjust\s+now\b|\blately\b", q):
        return (anchor - timedelta(days=14), anchor + timedelta(days=1))

    return None