Coverage for astrocyte/pipeline/section_link_extraction.py: 0%
59 statements
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
1"""PR2 D.7: per-section causal / supersedes / elaborates link extraction.
3Why this exists: the ``graph_expand`` strategy (PR2 commit B) reads
4``section_links`` to do 1-hop expansion from semantic / entity seeds.
5Without this module, ``section_links`` is empty and graph_expand
6returns no hits — which is why **LME multi-session sat at 11% (1/9)**
7across PR2-D's iterations. PR2-D.7 populates the table at retain.
9Three link types we extract:
11| link_type | Meaning | Weight semantics |
12|---------------|--------------------------------------------------|------------------|
13| **causal** | from_section caused (or enabled) to_section | Higher = more direct |
14| **supersedes** | to_section corrects / updates from_section | Time-ordered (newer wins) — LME knowledge-update primitive |
15| **elaborates** | to_section extends / adds detail to from_section | Topical continuity |
Weights default to 0.7 unless the LLM emits one. We don't tune weights
at PR2-D.7; the graph_expand strategy sums per-edge weights, so a flat
0.7 makes RRF behaviour predictable.
21## Why per-section, not per-pair
23Pair-wise extraction (look at every (i, j) section pair) is O(N²) at
24retain and dominates LME cost. Per-section is O(N): for each section,
25the LLM looks at the section's content + a window of prior section
26summaries and emits any links it sees. Resulting graph is sparse but
27covers the dominant retrieval cases.
29## Cost shape
31One LLM call per section. For LoCoMo ~30 sections × 10 convs = 300
32calls (~$0.05 first-time, $0 on cache). For LME ~50 sections × 50
33samples = 2500 calls (~$0.40 first-time, $0 on cache).
35Failures degrade silently — link extraction is enrichment, not
36correctness. The picker still works without it; graph_expand just
37returns empty.
39See:
40- docs/_design/recall.md §5 (retain pipeline)
41- docs/_design/adr/adr-006-three-layer-recall-stack.md (graph layer)
42- ``astrocyte_pi_section_links`` schema in 015_tier2_recall.sql
43"""
45from __future__ import annotations
47import json
48import logging
49from typing import TYPE_CHECKING
51if TYPE_CHECKING:
52 from astrocyte.provider import LLMProvider
53 from astrocyte.types import PageIndexSection
55from astrocyte.types import Message, PageIndexSectionLink
logger = logging.getLogger("astrocyte.pipeline.section_link_extraction")


#: Link types accepted from the LLM; entries with any other ``type`` are
#: dropped during validation.
_VALID_LINK_TYPES = frozenset({"causal", "supersedes", "elaborates"})

#: Cap how many prior-section summaries we show the LLM per call. Bounds
#: prompt size so retain wallclock stays bounded; LME's 50-section docs
#: would otherwise blow past gpt-4o-mini's context window.
_PRIOR_WINDOW = 8

#: Cap on emitted links per section. Pathological extractions (every
#: prior section linked) inflate the index without lifting recall —
#: graph_expand only needs the strongest few.
_MAX_LINKS_PER_SECTION = 5


#: Prompt template, filled via ``str.format`` with ``current_line``,
#: ``current_text``, ``prior_summary_block`` and ``max_links``. Literal
#: braces in the JSON example are doubled so ``format`` leaves them alone.
_EXTRACT_PROMPT = """You are analysing a long conversation transcript. The transcript has been chunked into sessions. For the CURRENT session, identify links to PRIOR sessions in this conversation.

Three link types:

- "causal": the current session was caused or enabled by the prior session (e.g., prior session sets up an event that the current session reports the outcome of).
- "supersedes": the current session corrects or updates information from the prior session (e.g., user previously said X, now says actually Y).
- "elaborates": the current session adds detail or follow-up to a topic the prior session introduced.

CURRENT session (line {current_line}):
{current_text}

PRIOR sessions (you can link to any of these):
{prior_summary_block}

Return ONLY a JSON object: {{"links": [{{"to_line": <int>, "type": "causal|supersedes|elaborates", "weight": <0.0-1.0>, "reason": "<one short clause>"}}, ...]}}.

Emit at most {max_links} links. Return {{"links": []}} when no clear link exists.

Constraint: ``to_line`` MUST be one of the prior-session line numbers shown above.

Output (JSON only):
"""
def _coerce_link_weight(raw_weight: object, default: float = 0.7) -> float:
    """Turn an LLM-supplied weight into a float clamped to ``[0.0, 1.0]``.

    Falls back to ``default`` when the value cannot be converted
    (``None``, non-numeric text, an integer too large for a float) or
    is NaN. Infinite values are clamped like any other out-of-range
    number.
    """
    try:
        weight = float(raw_weight)  # type: ignore[arg-type]
    except (TypeError, ValueError, OverflowError):
        return default
    # NaN compares unequal to itself; without this guard the min/max
    # clamp below would quietly turn NaN into 1.0.
    if weight != weight:
        return default
    return max(0.0, min(1.0, weight))


async def extract_links_for_section(
    provider: "LLMProvider",
    document_id: str,
    current: "PageIndexSection",
    current_text: str,
    prior_sections: list["PageIndexSection"],
    *,
    model: str | None = None,
) -> list[PageIndexSectionLink]:
    """One LLM call → up to ``_MAX_LINKS_PER_SECTION`` ``PageIndexSectionLink`` rows.

    ``prior_sections`` is a window of the most-recent prior sections
    (caller passes them; usually the previous N session-grain
    sections in the same document). Only the last ``_PRIOR_WINDOW``
    entries are shown to the LLM, and entries with neither a title nor
    a summary are skipped. The LLM sees their summaries and line_nums;
    it can emit links to any subset.

    Args:
        provider: LLM client; ``provider.complete(...)`` is awaited once.
        document_id: Used as both ``from_doc`` and ``to_doc`` of every
            emitted link, and in log messages.
        current: Section being linked *from*; only ``line_num`` is read.
        current_text: Text of the current section; truncated to 4000
            characters in the prompt.
        prior_sections: Candidate link targets (``title``, ``summary``
            and ``line_num`` are read).
        model: Optional model override forwarded to the provider.

    Returns:
        Validated, de-duplicated links in the order the LLM emitted
        them. An empty list is returned when the current text is blank,
        there is no usable prior window, the response is not parseable
        JSON, the JSON does not have the expected ``{"links": [...]}``
        shape, or no entry survives validation.

    Exceptions raised by ``provider.complete`` itself are not caught
    here and propagate to the caller.
    """
    if not current_text.strip() or not prior_sections:
        return []

    # Build the prior-summary block for the prompt.
    prior_lines: list[str] = []
    valid_to_lines: set[int] = set()
    for p in prior_sections[-_PRIOR_WINDOW:]:
        title = (p.title or "").strip()
        summary = (p.summary or "").strip()
        if not title and not summary:
            continue
        prior_lines.append(f" - line {p.line_num}: {title}\n summary: {summary[:240]}")
        valid_to_lines.add(p.line_num)
    if not prior_lines:
        return []

    msg = _EXTRACT_PROMPT.format(
        current_line=current.line_num,
        current_text=current_text[:4000],  # cap; matches entity-extract sizing
        prior_summary_block="\n".join(prior_lines),
        max_links=_MAX_LINKS_PER_SECTION,
    )
    completion = await provider.complete(
        messages=[Message(role="user", content=msg)],
        model=model,
        max_tokens=400,
        temperature=0.0,
        response_format={"type": "json_object"},
    )

    try:
        parsed = json.loads(completion.text)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers a provider returning ``None`` (or another
        # non-text value) as the completion text.
        logger.warning(
            "section_link_extraction: JSON parse failed for doc=%s line=%d",
            document_id,
            current.line_num,
        )
        return []

    # Valid JSON is not necessarily the shape we asked for: the payload
    # may be a bare list/scalar, and "links" may be something other than
    # a list. Treat both as "no links" rather than raising.
    if not isinstance(parsed, dict):
        logger.warning(
            "section_link_extraction: unexpected JSON shape for doc=%s line=%d",
            document_id,
            current.line_num,
        )
        return []
    raw = parsed.get("links")
    if not isinstance(raw, list):
        return []

    out: list[PageIndexSectionLink] = []
    seen_to: set[tuple[int, str]] = set()  # (to_line, type) dedupe
    for entry in raw:
        if not isinstance(entry, dict):
            continue
        to_line = entry.get("to_line")
        link_type = entry.get("type")
        # bool is a subclass of int; without the explicit exclusion a
        # JSON ``true`` would be accepted as line 1.
        if isinstance(to_line, bool) or not isinstance(to_line, int):
            continue
        if to_line not in valid_to_lines:
            continue
        if link_type not in _VALID_LINK_TYPES:
            continue
        # Self-loops are nonsensical; skip.
        if to_line == current.line_num:
            continue
        key = (to_line, link_type)
        if key in seen_to:
            continue
        seen_to.add(key)
        out.append(
            PageIndexSectionLink(
                from_doc=document_id,
                from_line=current.line_num,
                to_doc=document_id,
                to_line=to_line,
                link_type=link_type,
                # Missing or unusable weights fall back to 0.7; the
                # result is clamped to [0, 1] to match the schema's
                # expected range.
                weight=_coerce_link_weight(entry.get("weight", 0.7)),
            )
        )
        if len(out) >= _MAX_LINKS_PER_SECTION:
            break
    return out