Coverage for astrocyte/pipeline/episodic_extract.py: 83%
41 statements
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
1"""Fix 4 (conv-run-4) — episodic event index.
3Detects facts of the form "User experienced / attended / met X at /
4during Y" and indexes them separately from the generic 'experience'
5fact_type. The store-level schema has a CHECK constraint on
6``fact_type`` (migration 020), so we don't introduce a new SQL value;
7instead we mark each detected fact by appending a namespaced marker
8``episodic:event`` to its ``entities`` array. The existing
9``search_facts_by_entity`` SPI then becomes the read path: querying for
10the marker returns just episodic facts.
12Why a separate index: the Brandon-Flowers-encounter LME failure shape
13is "user mentions a location/event in passing, then asks about who they
14met there". The generic semantic / keyword strategies miss it because
15the encounter is buried in a long session and not entity-typed. By
16tagging episodic facts at retain and explicitly querying for them when
17the question carries a location / event cue, we give the answerer a
18direct path to the episodic memory.
20Detection is regex-based today — the cost of an LLM classifier per fact
21would dominate ingest. The regex is intentionally narrow (verb stems
22+ ≥1 noun-phrase content) so false positives stay low; missing an
23edge case is preferable to spraying the marker.
25Detection rule (conjunction):
261. The fact text matches an EPISODIC_VERB pattern: ``attended``,
27 ``visited``, ``met``, ``went to``, ``saw``, ``encountered``,
28 ``ran into``, ``bumped into``, ``experienced``.
292. The fact has at least one entity (so there's something to anchor
30 the encounter to) AND the entity is not purely a role-marker
31 (``role:doctor``-style).
33The marker is appended in-place to ``fact.entities`` so the caller's
34existing ``save_facts`` call writes the updated array — no extra DB
35round-trip.
36"""
38from __future__ import annotations
40import logging
41import re
42from typing import TYPE_CHECKING
44if TYPE_CHECKING:
45 from astrocyte.types import PageIndexFact
47_logger = logging.getLogger("astrocyte.pipeline.episodic_extract")
50EPISODIC_MARKER = "episodic:event"
52# Verb stems that signal an episodic encounter. Word-boundary anchored
53# and case-insensitive; matches both base and -ed/-s/-ing forms via
54# `(?:ed|s|ing)?`.
55_EPISODIC_VERBS = (
56 "attend",
57 "visit",
58 "meet",
59 "met", # irregular past
60 "saw", # irregular past
61 "see",
62 "encounter",
63 "experienc",
64 "went to",
65 "go to",
66 "going to",
67 "ran into",
68 "run into",
69 "running into",
70 "bumped into",
71 "bump into",
72 "watched",
73 "watch",
74)
76_EPISODIC_RE = re.compile(
77 r"\b(" + "|".join(re.escape(v) for v in _EPISODIC_VERBS) + r")(?:ed|s|ing)?\b",
78 re.IGNORECASE,
79)
81# Question-side cues that should trigger episodic recall. Used by the
82# query-time matcher; the indexing path doesn't read this.
83_LOCATION_CUE_RE = re.compile(
84 r"\b(at|in|during|when (?:I|we) (?:were|was)|where|while at)\b",
85 re.IGNORECASE,
86)
87_EVENT_CUE_RE = re.compile(
88 r"\b("
89 r"concert|show|gig|festival|game|match|conference|meetup|party|"
90 r"event|trip|vacation|wedding|funeral|reunion|ceremony|"
91 r"premiere|tournament|exhibit|exhibition|expo|tour"
92 r")\b",
93 re.IGNORECASE,
94)
def is_episodic_fact_text(text: str) -> bool:
    """Report whether ``text`` contains an episodic-verb match.

    A pure probe: the argument is not modified, so callers (tests
    included) can check a candidate before deciding to tag it. Empty or
    ``None`` text is never episodic.
    """
    # Short-circuit on falsy text so the regex only ever sees real input.
    return bool(text) and _EPISODIC_RE.search(text) is not None
def tag_episodic_facts(facts: list[PageIndexFact]) -> int:
    """Tag episodic facts by adding ``episodic:event`` to their entities.

    A fact is tagged when its text matches the episodic pattern and its
    ``entities`` hold at least one anchor, i.e. a non-empty entry that is
    not a ``role:`` or ``episodic:`` marker. Each tagged fact gets a new
    ``entities`` list with the marker at the end; the fact objects in
    ``facts`` are mutated in place.

    Facts that already carry the marker are skipped and not counted, so
    calling this again on the same facts changes nothing.

    Returns the number of facts tagged by this call.
    """
    if not facts:
        return 0

    marker_prefixes = ("role:", "episodic:")
    tagged = 0

    for fact in facts:
        if not is_episodic_fact_text(fact.text or ""):
            continue

        entities = list(fact.entities or [])

        # A bare "user attended" with nothing to anchor the encounter to
        # is not worth indexing; namespaced markers do not count as anchors.
        has_anchor = any(
            ent and not ent.startswith(marker_prefixes) for ent in entities
        )
        if not has_anchor or EPISODIC_MARKER in entities:
            continue

        fact.entities = [*entities, EPISODIC_MARKER]
        tagged += 1

    if tagged:
        _logger.info(
            "episodic_extract: tagged %d/%d facts as episodic",
            tagged,
            len(facts),
        )
    return tagged
def question_has_episodic_cue(question: str) -> bool:
    """Decide whether ``question`` should trigger episodic recall.

    True when the question matches the location-cue pattern or the
    event-cue pattern. An empty or ``None`` question yields False.

    Per the original notes, the query path (``astrocyte_client.search``)
    uses this to decide whether to add an episodic-marker fact query
    next to the semantic/temporal strategies. False is the conservative
    answer: injecting episodic candidates risks duplicate facts in the
    answerer prompt, which can cause cross-category regression.
    """
    if not question:
        return False
    cue_patterns = (_LOCATION_CUE_RE, _EVENT_CUE_RE)
    return any(pattern.search(question) for pattern in cue_patterns)