Coverage for astrocyte/pipeline/section_link_extraction.py: 0%

59 statements  

« prev     ^ index     » next       coverage.py v7.15.0, created at 2026-07-04 05:24 +0000

1"""PR2 D.7: per-section causal / supersedes / elaborates link extraction. 

2 

3Why this exists: the ``graph_expand`` strategy (PR2 commit B) reads 

4``section_links`` to do 1-hop expansion from semantic / entity seeds. 

5Without this module, ``section_links`` is empty and graph_expand 

6returns no hits — which is why **LME multi-session sat at 11% (1/9)** 

7across PR2-D's iterations. PR2-D.7 populates the table at retain. 

8 

9Three link types we extract: 

10 

11| link_type | Meaning | Weight semantics | 

12|---------------|--------------------------------------------------|------------------| 

13| **causal** | from_section caused (or enabled) to_section | Higher = more direct | 

14| **supersedes** | to_section corrects / updates from_section | Time-ordered (newer wins) — LME knowledge-update primitive | 

15| **elaborates** | to_section extends / adds detail to from_section | Topical continuity | 

16 

17Weights default to 0.7 unless the LLM emits one. We don't tune weights 

18at PR2-D.7; the graph_expand strategy sums per-edge weights, so a flat 

190.7 makes RRF behaviour predictable. 

20 

21## Why per-section, not per-pair 

22 

23Pair-wise extraction (look at every (i, j) section pair) is O(N²) at 

24retain and dominates LME cost. Per-section is O(N): for each section, 

25the LLM looks at the section's content + a window of prior section 

26summaries and emits any links it sees. Resulting graph is sparse but 

27covers the dominant retrieval cases. 

28 

29## Cost shape 

30 

31One LLM call per section. For LoCoMo ~30 sections × 10 convs = 300 

32calls (~$0.05 first-time, $0 on cache). For LME ~50 sections × 50 

33samples = 2500 calls (~$0.40 first-time, $0 on cache). 

34 

35Failures degrade silently — link extraction is enrichment, not 

36correctness. The picker still works without it; graph_expand just 

37returns empty. 

38 

39See: 

40- docs/_design/recall.md §5 (retain pipeline) 

41- docs/_design/adr/adr-006-three-layer-recall-stack.md (graph layer) 

42- ``astrocyte_pi_section_links`` schema in 015_tier2_recall.sql 

43""" 

44 

45from __future__ import annotations 

46 

47import json 

48import logging 

49from typing import TYPE_CHECKING 

50 

51if TYPE_CHECKING: 

52 from astrocyte.provider import LLMProvider 

53 from astrocyte.types import PageIndexSection 

54 

55from astrocyte.types import Message, PageIndexSectionLink 

56 

# Module-scoped logger; used below to warn when the LLM response cannot be
# parsed as JSON.
57logger = logging.getLogger("astrocyte.pipeline.section_link_extraction") 

58 

59 

#: Closed set of link types accepted from the LLM. Entries whose ``type``
#: is not a member are skipped by ``extract_links_for_section``.
60_VALID_LINK_TYPES = frozenset({"causal", "supersedes", "elaborates"}) 

61 

62#: Cap how many prior-section summaries we show the LLM per call. Bounds 

63#: prompt size so retain wallclock stays bounded; LME's 50-section docs 

64#: would otherwise blow past gpt-4o-mini's context window. 

# Applied as a tail slice: only the last ``_PRIOR_WINDOW`` entries of the
# caller-supplied prior-section list are shown to the LLM.
65_PRIOR_WINDOW = 8 

66 

67#: Cap on emitted links per section. Pathological extractions (every 

68#: prior section linked) inflate the index without lifting recall — 

69#: graph_expand only needs the strongest few. 

# Used twice: stated to the LLM in the prompt ("Emit at most N links") and
# enforced again when collecting the parsed entries.
70_MAX_LINKS_PER_SECTION = 5 

71 

72 

#: Prompt template for the per-section link-extraction call. Rendered with
#: ``str.format``; the placeholders are ``{current_line}``, ``{current_text}``,
#: ``{prior_summary_block}`` and ``{max_links}``. Literal JSON braces in the
#: example output are doubled (``{{`` / ``}}``) so ``format`` leaves them
#: intact. The ``type`` values named here must stay in sync with
#: ``_VALID_LINK_TYPES``, otherwise the parser drops the LLM's entries.
73_EXTRACT_PROMPT = """You are analysing a long conversation transcript. The transcript has been chunked into sessions. For the CURRENT session, identify links to PRIOR sessions in this conversation. 

74 

75Three link types: 

76 

77- "causal": the current session was caused or enabled by the prior session (e.g., prior session sets up an event that the current session reports the outcome of). 

78- "supersedes": the current session corrects or updates information from the prior session (e.g., user previously said X, now says actually Y). 

79- "elaborates": the current session adds detail or follow-up to a topic the prior session introduced. 

80 

81CURRENT session (line {current_line}): 

82{current_text} 

83 

84PRIOR sessions (you can link to any of these): 

85{prior_summary_block} 

86 

87Return ONLY a JSON object: {{"links": [{{"to_line": <int>, "type": "causal|supersedes|elaborates", "weight": <0.0-1.0>, "reason": "<one short clause>"}}, ...]}}. 

88 

89Emit at most {max_links} links. Return {{"links": []}} when no clear link exists. 

90 

91Constraint: ``to_line`` MUST be one of the prior-session line numbers shown above. 

92 

93Output (JSON only): 

94""" 

95 

96 

async def extract_links_for_section(
    provider: "LLMProvider",
    document_id: str,
    current: "PageIndexSection",
    current_text: str,
    prior_sections: list["PageIndexSection"],
    *,
    model: str | None = None,
) -> list[PageIndexSectionLink]:
    """One LLM call → up to ``_MAX_LINKS_PER_SECTION`` ``PageIndexSectionLink`` rows.

    ``prior_sections`` is a window of the most-recent prior sections
    (caller passes them; usually the previous N session-grain sections in
    the same document). Only the last ``_PRIOR_WINDOW`` of them are shown
    to the LLM, and only those with a non-empty title or summary. The LLM
    sees their summaries and line_nums; it can emit links to any subset.

    Args:
        provider: LLM backend; ``provider.complete(...)`` is awaited once.
        document_id: Used as both ``from_doc`` and ``to_doc`` on every
            emitted link (links never cross documents here).
        current: The section being linked *from*; only ``line_num`` is read.
        current_text: Raw text of ``current``; truncated to 4000 chars in
            the prompt.
        prior_sections: Candidate link targets, oldest first.
        model: Optional model override forwarded to the provider.

    Returns:
        Validated, de-duplicated links (by ``(to_line, type)``), capped at
        ``_MAX_LINKS_PER_SECTION``. An empty list when there is nothing to
        link against, when the response is not parseable JSON, when it is
        not a JSON object, or when ``"links"`` is not a list.

    Raises:
        Whatever ``provider.complete`` raises — the call itself is not
        guarded; only response *parsing* degrades to ``[]``.
    """
    if not current_text.strip() or not prior_sections:
        return []

    # Build the prior-summary block for the prompt and remember which line
    # numbers the LLM is allowed to point at.
    prior_lines: list[str] = []
    valid_to_lines: set[int] = set()
    for p in prior_sections[-_PRIOR_WINDOW:]:
        title = (p.title or "").strip()
        summary = (p.summary or "").strip()
        if not title and not summary:
            continue
        prior_lines.append(f" - line {p.line_num}: {title}\n summary: {summary[:240]}")
        valid_to_lines.add(p.line_num)
    if not prior_lines:
        return []

    msg = _EXTRACT_PROMPT.format(
        current_line=current.line_num,
        current_text=current_text[:4000],  # cap; matches entity-extract sizing
        prior_summary_block="\n".join(prior_lines),
        max_links=_MAX_LINKS_PER_SECTION,
    )
    completion = await provider.complete(
        messages=[Message(role="user", content=msg)],
        model=model,
        max_tokens=400,
        temperature=0.0,
        response_format={"type": "json_object"},
    )

    try:
        parsed = json.loads(completion.text)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers a provider that returns ``text=None``.
        logger.warning(
            "section_link_extraction: JSON parse failed for doc=%s line=%d",
            document_id,
            current.line_num,
        )
        return []

    # ``response_format=json_object`` is only a request; a provider may still
    # hand back a bare list/string/number, or a non-list "links" value.
    # Treat those like a parse failure instead of raising.
    raw = parsed.get("links") or [] if isinstance(parsed, dict) else None
    if not isinstance(raw, list):
        logger.warning(
            "section_link_extraction: unexpected JSON shape for doc=%s line=%d",
            document_id,
            current.line_num,
        )
        return []

    default_weight = 0.7
    out: list[PageIndexSectionLink] = []
    seen_to: set[tuple[int, str]] = set()  # (to_line, type) dedupe
    for entry in raw:
        if not isinstance(entry, dict):
            continue
        to_line = entry.get("to_line")
        link_type = entry.get("type")
        # ``bool`` is an ``int`` subclass: JSON ``true`` would otherwise pass
        # as line 1 and be stored as ``True``.
        if isinstance(to_line, bool) or not isinstance(to_line, int):
            continue
        if to_line not in valid_to_lines:
            continue
        if link_type not in _VALID_LINK_TYPES:
            continue
        # Self-loops are nonsensical; skip.
        if to_line == current.line_num:
            continue
        key = (to_line, link_type)
        if key in seen_to:
            continue
        seen_to.add(key)

        try:
            weight = float(entry.get("weight", default_weight))
        except (TypeError, ValueError, OverflowError):
            # Non-numeric, null, or an integer too large for a float.
            weight = default_weight
        if weight != weight:
            # NaN (json.loads accepts the bare ``NaN`` token). Clamping would
            # silently turn it into 1.0, so fall back to the default instead.
            weight = default_weight
        # Clamp to [0, 1] to match the schema's expected range.
        weight = max(0.0, min(1.0, weight))

        out.append(
            PageIndexSectionLink(
                from_doc=document_id,
                from_line=current.line_num,
                to_doc=document_id,
                to_line=to_line,
                link_type=link_type,
                weight=weight,
            )
        )
        if len(out) >= _MAX_LINKS_PER_SECTION:
            break
    return out

193 break 

194 return out