Coverage for astrocyte/pipeline/episodic_extract.py: 83%

41 statements  

« prev     ^ index     » next       coverage.py v7.15.0, created at 2026-07-04 05:24 +0000

1"""Fix 4 (conv-run-4) — episodic event index. 

2 

3Detects facts of the form "User experienced / attended / met X at / 

4during Y" and indexes them separately from the generic 'experience' 

5fact_type. The store-level schema has a CHECK constraint on 

6``fact_type`` (migration 020), so we don't introduce a new SQL value; 

7instead we mark each detected fact by appending a namespaced marker 

8``episodic:event`` to its ``entities`` array. The existing 

9``search_facts_by_entity`` SPI then becomes the read path: querying for 

10the marker returns just episodic facts. 

11 

12Why a separate index: the Brandon-Flowers-encounter LME failure shape 

13is "user mentions a location/event in passing, then asks about who they 

14met there". The generic semantic / keyword strategies miss it because 

15the encounter is buried in a long session and not entity-typed. By 

16tagging episodic facts at retain and explicitly querying for them when 

17the question carries a location / event cue, we give the answerer a 

18direct path to the episodic memory. 

19 

20Detection is regex-based today — the cost of an LLM classifier per fact 

21would dominate ingest. The regex is intentionally narrow (verb stems 

22+ ≥1 noun-phrase content) so false positives stay low; missing an 

23edge case is preferable to spraying the marker. 

24 

25Detection rule (conjunction): 

261. The fact text matches an EPISODIC_VERB pattern: ``attended``, 

27 ``visited``, ``met``, ``went to``, ``saw``, ``encountered``, 

28 ``ran into``, ``bumped into``, ``experienced``, ``watched``. 

292. The fact has at least one entity (so there's something to anchor 

30 the encounter to) AND the entity is not purely a role-marker 

31 (``role:doctor``-style). 

32 

33The marker is appended in-place to ``fact.entities`` so the caller's 

34existing ``save_facts`` call writes the updated array — no extra DB 

35round-trip. 

36""" 

37 

38from __future__ import annotations 

39 

40import logging 

41import re 

42from typing import TYPE_CHECKING 

43 

44if TYPE_CHECKING: 

45 from astrocyte.types import PageIndexFact 

46 

_logger = logging.getLogger("astrocyte.pipeline.episodic_extract")


# Namespaced marker appended to a fact's ``entities`` to flag it as episodic.
EPISODIC_MARKER = "episodic:event"

# Verbs (and phrasal verbs) that signal an episodic encounter. Base forms
# are expanded to their -ed / -s (-es) / -ing inflections when the regex is
# built; irregular or already-inflected forms are listed explicitly.
# "experienc" is deliberately a stem: it matches "experienced" and
# "experiencing" but not "experience" / "experiences".
_EPISODIC_VERBS = (
    "attend",
    "visit",
    "meet",
    "met",  # irregular past
    "saw",  # irregular past
    "see",
    "encounter",
    "experienc",
    "went to",
    "go to",
    "going to",
    "ran into",
    "run into",
    "running into",
    "bumped into",
    "bump into",
    "watched",
    "watch",
)

# First words of entries that are already inflected. They are matched
# verbatim: adding a suffix to them only produces false positives such as
# "Mets" or "sawed".
_FIXED_FORM_HEADS = frozenset(
    {"met", "saw", "went", "going", "ran", "running", "bumped", "watched"}
)

# Base verbs with these endings take "-es" rather than "-s" in the third
# person ("watches", "goes").
_ES_ENDINGS = ("ch", "sh", "s", "x", "z", "o")


def _verb_pattern(verb: str) -> str:
    """Return the regex fragment matching ``verb`` and its inflections.

    The optional inflection suffix is attached to the *first* word of the
    entry, so a phrasal verb such as ``"bump into"`` matches "bumps into"
    and "bumping into" instead of the meaningless "bump intoed". Words of a
    phrase may be separated by any run of whitespace.
    """
    head, *rest = verb.split()
    pattern = re.escape(head)
    if head not in _FIXED_FORM_HEADS:
        third_person = "es" if head.endswith(_ES_ENDINGS) else "s"
        pattern += f"(?:ed|{third_person}|ing)?"
    for word in rest:
        pattern += r"\s+" + re.escape(word)
    return pattern


# Word-boundary anchored and case-insensitive.
_EPISODIC_RE = re.compile(
    r"\b(" + "|".join(_verb_pattern(v) for v in _EPISODIC_VERBS) + r")\b",
    re.IGNORECASE,
)

# Question-side cues that should trigger episodic recall. Used by the
# query-time matcher; the indexing path doesn't read this.
# NOTE(review): the bare "at" / "in" alternatives match very common words,
# so this cue fires on most questions — confirm that is intended.
_LOCATION_CUE_RE = re.compile(
    r"\b(at|in|during|when (?:I|we) (?:were|was)|where|while at)\b",
    re.IGNORECASE,
)
_EVENT_CUE_RE = re.compile(
    r"\b("
    r"concert|show|gig|festival|game|match|conference|meetup|party|"
    r"event|trip|vacation|wedding|funeral|reunion|ceremony|"
    r"premiere|tournament|exhibit|exhibition|expo|tour"
    r")\b",
    re.IGNORECASE,
)


def is_episodic_fact_text(text: str) -> bool:
    """Return True when the fact text reads like an episodic encounter.

    The text qualifies when it contains any verb from ``_EPISODIC_VERBS``
    (or an inflected form of one) as a whole word, case-insensitively.
    Empty or ``None`` text is never episodic.

    Exposed as a module-level helper so callers can probe a candidate
    without mutating it (used by tests).
    """
    if not text:
        return False
    return bool(_EPISODIC_RE.search(text))

106 

107 

def tag_episodic_facts(facts: list[PageIndexFact]) -> int:
    """Mark episodic facts by adding ``episodic:event`` to their entities.

    A fact is tagged when its text passes ``is_episodic_fact_text`` and it
    has at least one entity that is neither empty nor a ``role:`` /
    ``episodic:`` marker. Tagging assigns a new list (the existing entities
    plus the marker) to ``fact.entities``.

    Facts that already carry the marker are skipped and not counted, so
    calling this repeatedly on the same facts is a no-op after the first
    pass.

    Returns:
        The number of facts tagged by this call.
    """
    if not facts:
        return 0

    marker_prefixes = ("role:", "episodic:")
    tagged = 0
    for fact in facts:
        if not is_episodic_fact_text(fact.text or ""):
            continue
        entities = list(fact.entities or [])
        # Something real must anchor the encounter; a bare "user attended"
        # with only marker entities is not worth indexing.
        anchors = [
            ent for ent in entities
            if ent and not ent.startswith(marker_prefixes)
        ]
        if not anchors or EPISODIC_MARKER in entities:
            continue
        fact.entities = [*entities, EPISODIC_MARKER]
        tagged += 1

    if tagged:
        _logger.info(
            "episodic_extract: tagged %d/%d facts as episodic",
            tagged,
            len(facts),
        )
    return tagged

141 

142 

def question_has_episodic_cue(question: str) -> bool:
    """Tell whether a question carries a location or event cue.

    The question is checked against the location-cue pattern first and the
    event-cue pattern second; a hit on either returns True. An empty or
    ``None`` question returns False.

    The query path (``astrocyte_client.search``) uses this to decide
    whether to add an episodic-marker fact query alongside the
    semantic/temporal fact strategies. False is the safe default: duplicate
    facts in the answerer prompt can cause cross-category regression, so
    episodic candidates are only injected when a cue is present.
    """
    if not question:
        return False
    cue_patterns = (_LOCATION_CUE_RE, _EVENT_CUE_RE)
    return any(pattern.search(question) for pattern in cue_patterns)