Coverage for astrocyte/pipeline/section_entity_extraction.py: 93%
42 statements
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
1"""PR2 commit A: per-section entity extraction for section recall.
3Extracts named entities (people, places, organisations, products,
4notable concepts) from a PageIndex tree section's text. Output rows go
5into ``astrocyte_pi_section_entities`` and become the entity-lookup
6strategy's index in PR2 commit B (Hindsight's CTE pattern at section
7grain).
9Why per-section: the picker can't route on "Caroline" if the tree
10summary says "they discussed personal experiences". Stamping
11``entities`` rows means a question that mentions Caroline gets routed
12to *every* section mentioning her in <100ms, deterministically.
14Cost: one LLM call per section. For LoCoMo (~30 sections per
15conversation × 10 conversations) that's ~300 calls one-time at retain.
16At gpt-4o-mini prices, ~$0.001 per section → ~$3 to build entity index
17across the full LoCoMo dataset.
19Design notes:
20- We ask the LLM for a JSON object whose ``entities`` key is an array of
21 strings (no schema enforcement at PR2-A scope; the picker is robust to noisy entity rows).
22- De-duplicate case-insensitively on the way out (``Caroline`` and
23 ``caroline`` collapse to one row).
24- Cap at 20 entities per section — pathological extractions (lyric
25 quotations, recipe ingredients) shouldn't blow up the index. The cap
26 is loose; ``ix_pi_section_entities_name`` handles fanout via
27 Hindsight's LATERAL pattern in PR2 commit B.
28- We DO NOT try to canonicalise across sections at PR2-A. "Jon" and
29 "Jonathan Smith" stay separate rows. PR2 commit D adds a per-bank
30 entity-resolution pass if the bench shows it matters.
32See:
33 - docs/_design/recall.md §5 (retain pipeline) and §8.1
34 - docs/_design/adr/adr-007-pageindex-tree-as-section-primitive.md
35"""
37from __future__ import annotations
39import logging
40from typing import TYPE_CHECKING
42if TYPE_CHECKING:
43 from astrocyte.provider import LLMProvider
44 from astrocyte.types import PageIndexSection
46from astrocyte.pipeline._json_tolerant import looks_truncated, tolerant_json_loads
47from astrocyte.types import Message, PageIndexSectionEntity
logger = logging.getLogger("astrocyte.pipeline.section_entity_extraction")


# Prompt template for the per-section extraction call. The only
# ``str.format`` placeholder is ``{text}`` (the section excerpt); the template
# must not contain any other brace pair or ``.format`` will raise.
# Trailing backslashes are line continuations inside the literal: they join
# the wrapped source lines into one prompt line.
_EXTRACT_PROMPT = """Extract two kinds of entities from the conversation excerpt below.

(A) NAMED ENTITIES — proper nouns the user mentioned:
- People (first names, last names, full names, nicknames)
- Places (cities, countries, neighbourhoods, named buildings)
- Organisations (companies, schools, sports teams, clubs)
- Products (named brands, books, movies, songs, games, foods)
- Notable concepts (named events, named projects, named conditions)

ALIAS CAPTURE for (A): for each PERSON mentioned, emit BOTH the
form used in the excerpt AND any common short-form / nickname /
formal-name variant they are likely to be referred to by. Use general
Western-naming knowledge for the alias mapping:
- "Joanna" → also emit "Jo", "Joey", "Jojo"
- "Robert" → also emit "Rob", "Bob", "Bobby"
- "Elizabeth" → also emit "Liz", "Beth", "Eliza", "Lizzie"
- "Michael" → also emit "Mike", "Mickey"
- "Catherine" / "Katherine" → also emit "Kate", "Cathy", "Katie"
- "William" → also emit "Will", "Bill", "Billy"
- "Jonathan" → also emit "Jon", "Jonny"
- "Christopher" → also emit "Chris"
- "Alexander" → also emit "Alex", "Sandy"
Emit each alias as its OWN entry. Only emit aliases that are PLAUSIBLE
for the person named (don't invent aliases when the name doesn't have
a standard short form). Cap aliases per person at 3.

Do NOT extract for (A):
- Common nouns ("dog", "car", "school" without a name)
- Pronouns
- Dates / times (the temporal index handles those separately)

(B) STRUCTURED LABELS — `key:value` strings that classify what the user \
DID, ENCOUNTERED, or HAS. Use these vocabularies:

- `role:<noun>` — occupational / functional category. Use when the user \
visited / spoke to someone in a role. Examples: `role:doctor`, \
`role:dermatologist`, `role:lawyer`, `role:teacher`, `role:therapist`.
- `category:<noun>` — countable kind of THING the user owns / acquired / \
worked on / consumed. Examples: `category:model_kit`, `category:plant`, \
`category:restaurant`, `category:book`, `category:movie`, \
`category:trip`, `category:doctor_visit`, `category:project`.
- `event:<noun>` — distinct occurrence the user attended / experienced. \
Examples: `event:wedding`, `event:engagement_party`, `event:sale`, \
`event:concert`, `event:road_trip`, `event:job_interview`.
- `expense:<currency_amount>` — money the user spent (when a number is \
mentioned). Examples: `expense:$45`, `expense:$185`, `expense:$2400`.

Rules for (B):
- Use snake_case for the noun. Lowercase.
- Emit ONE label per distinct mention (e.g. user visited 3 doctors → \
emit `role:doctor` 3 times across the relevant sections).
- Match the COUNTABLE category in user questions: "how many doctors?" \
→ `role:doctor`. "How many bikes did I buy?" → `category:bike`. \
"Total spent on bikes?" → `expense:$N`.
- DO NOT invent labels outside the four prefixes above.
- It's fine to emit nothing in (B) if the section is generic chit-chat.

Return ONLY a JSON object with one key, ``entities``, containing an \
array of strings (mixed (A) named entities + (A) aliases + (B) `key:value` \
labels). Cap at 20 entries total; prefer (B) labels when the section \
discusses a countable category, since those drive the wiki recall layer.

Excerpt:
{text}

Output (JSON only):
"""


# Hard cap on rows emitted per section. Keep in sync with the
# "Cap at 20 entries total" instruction inside ``_EXTRACT_PROMPT``.
_MAX_ENTITIES_PER_SECTION = 20
async def extract_entities_for_section(
    provider: "LLMProvider",
    document_id: str,
    section: "PageIndexSection",
    section_text: str,
    *,
    model: str | None = None,
) -> list[PageIndexSectionEntity]:
    """One LLM call → up to ``_MAX_ENTITIES_PER_SECTION`` entity rows.

    ``section_text`` is the sliced markdown for the section (caller
    extracts via ``_slice_section_around_line`` from the bench). We pass
    it rather than re-slicing here so the bench can batch the slicing
    logic in one place. Only the first 6000 characters are sent.

    ``provider`` is awaited via ``provider.complete(...)``; ``document_id``
    and ``section.line_num`` are stamped onto every returned row and used
    in log messages; ``model`` is forwarded to the provider unchanged.

    Returns an empty list — without calling the provider — when
    ``section_text`` is blank. Also returns an empty list (logged) when
    the reply cannot be parsed into a JSON object, or when its
    ``entities`` value is not a list; the picker degrades gracefully when
    entity rows are missing for a section. Otherwise returns the
    de-duplicated (case-insensitive, first-seen casing kept) string
    entries, in reply order, truncated to the cap.

    An exception from the first ``provider.complete`` call propagates to
    the caller; an exception from the retry call is logged and treated as
    a parse failure.
    """
    if not section_text.strip():
        return []

    msg = _EXTRACT_PROMPT.format(text=section_text[:6000])  # 6K char cap = ~1500 tokens
    # Shared by the first attempt and the retry so the two cannot drift.
    call_options = {
        "model": model,
        "max_tokens": 750,
        "temperature": 0.0,
        "response_format": {"type": "json_object"},
    }
    completion = await provider.complete(
        messages=[Message(role="user", content=msg)],
        **call_options,
    )

    # Tolerant parse handles markdown-fence wrapping / leading-prose noise
    # before we give up. On parse failure, retry once with a stricter
    # system reminder unless the response looks budget-truncated (a retry
    # under the same cap won't help).
    parsed = tolerant_json_loads(completion.text)
    if parsed is None and not looks_truncated(completion.text):
        try:
            retry = await provider.complete(
                messages=[
                    Message(
                        role="system",
                        content=(
                            "Return ONLY a valid JSON object. "
                            "No markdown fences. No prose."
                        ),
                    ),
                    Message(role="user", content=msg),
                ],
                **call_options,
            )
        except Exception as exc:  # noqa: BLE001
            # Best-effort retry: a provider error here must not fail retain.
            logger.warning(
                "section_entity_extraction: retry LLM call failed doc=%s line=%d: %s",
                document_id,
                section.line_num,
                exc,
            )
            retry = None
        if retry is not None:
            parsed = tolerant_json_loads(retry.text)

    if not isinstance(parsed, dict):
        logger.warning(
            "section_entity_extraction: JSON parse failed for doc=%s line=%d",
            document_id,
            section.line_num,
        )
        return []

    raw = parsed.get("entities") or []
    if not isinstance(raw, list):
        # A bare string would be iterated character by character (one bogus
        # row per letter), a dict would yield its keys, and a number would
        # raise TypeError — treat any non-list payload as malformed.
        logger.warning(
            "section_entity_extraction: 'entities' is not a list for doc=%s line=%d (got %s)",
            document_id,
            section.line_num,
            type(raw).__name__,
        )
        return []

    # Dedupe case-insensitively, preserve first-seen casing, cap.
    seen: set[str] = set()
    out: list[PageIndexSectionEntity] = []
    for raw_name in raw:
        if not isinstance(raw_name, str):
            continue
        name = raw_name.strip()
        if not name:
            continue
        key = name.casefold()
        if key in seen:
            continue
        seen.add(key)
        out.append(
            PageIndexSectionEntity(
                document_id=document_id,
                line_num=section.line_num,
                entity_name=name,
            )
        )
        if len(out) >= _MAX_ENTITIES_PER_SECTION:
            break
    return out