Coverage for astrocyte/pipeline/structured_doc.py: 96%

166 statements  

« prev     ^ index     » next       coverage.py v7.15.0, created at 2026-07-04 05:24 +0000

1"""Structured representation of a mental model document (M21). 

2 

3Storing mental models as raw markdown forces every refresh to round-trip 

4prose through an LLM, which then drifts on stylistic details (numbered 

5vs bulleted lists, casing, separator lines, paraphrasing) even when 

6instructed to preserve content byte-for-byte. The intrinsic mechanism 

7of an LLM is to *generate* the next token from a gestalt of the input 

8— not to copy tokens verbatim — so any "preserve unchanged content" 

9instruction is fundamentally a soft constraint. 

10 

11The fix is to give the LLM no opportunity to drift on unchanged 

12content. We keep an authoritative structured representation of the 

13document; the markdown shown to users / callers is a deterministic 

14render of that structure. Delta refreshes emit *operations* against 

15the structure (see :mod:`astrocyte.pipeline.delta_ops`); sections and 

16blocks not mentioned by any operation are physically untouched. 

17 

18Ported from Hindsight ``hindsight_api/engine/reflect/structured_doc.py`` 

19under the project's MIT licence; adapted to live alongside Astrocyte's 

20existing mental-model storage in 

21:class:`astrocyte.types.MentalModel`. Old mental-model rows with a 

22plain ``content`` markdown string remain readable — :func:`parse_markdown` 

23gives a best-effort conversion to the structured shape on first 

24refresh (lazy migration). 

25 

26Schema (v1) 

27----------- 

28A document is an ordered list of :class:`Section`. Each section has: 

29 

30- ``id`` — stable slug derived from ``heading`` (used as the 

31 operation target across refreshes; renames are an explicit op). 

32- ``heading`` — markdown heading text (no ``#`` prefix). 

33- ``level`` — 1 (``#``) … 6 (``######``). Default 2. 

34- ``blocks`` — ordered list of typed blocks: paragraph, bullet_list, 

35 ordered_list, code. 

36 

37The schema is intentionally narrow: it covers what real mental-model 

38documents actually contain. Tables, images, and raw HTML are out of 

39scope until a concrete use case lands. New block types should be added 

40by extending the :data:`Block` union and updating 

41:func:`render_block` + :func:`_parse_block`. 

42""" 

43 

44from __future__ import annotations 

45 

46import re 

47from typing import Annotated, Literal, Union 

48 

49from pydantic import BaseModel, ConfigDict, Field 

50 

51# Blocks --------------------------------------------------------------------- 

52 

53 

# Free-form prose block. ``render_block`` emits ``text`` with trailing
# whitespace stripped.
class ParagraphBlock(BaseModel):
    model_config = ConfigDict(extra="forbid")  # unknown keys fail validation
    type: Literal["paragraph"] = "paragraph"  # discriminator tag used by the ``Block`` union
    text: str  # paragraph body as markdown text

58 

59 

# Unordered list block. ``render_block`` emits one ``- item`` line per entry.
class BulletListBlock(BaseModel):
    model_config = ConfigDict(extra="forbid")  # unknown keys fail validation
    type: Literal["bullet_list"] = "bullet_list"  # discriminator tag used by the ``Block`` union
    items: list[str] = Field(default_factory=list)  # item texts, without the bullet marker

64 

65 

# Numbered list block. ``render_block`` renumbers entries from 1 on every
# render, so the numbers themselves are not stored.
class OrderedListBlock(BaseModel):
    model_config = ConfigDict(extra="forbid")  # unknown keys fail validation
    type: Literal["ordered_list"] = "ordered_list"  # discriminator tag used by the ``Block`` union
    items: list[str] = Field(default_factory=list)  # item texts, without the ``N.`` prefix

70 

71 

# Fenced code block. ``render_block`` wraps ``text`` in triple-backtick
# fences and puts ``language`` directly after the opening fence.
class CodeBlock(BaseModel):
    model_config = ConfigDict(extra="forbid")  # unknown keys fail validation
    type: Literal["code"] = "code"  # discriminator tag used by the ``Block`` union
    language: str = ""  # fence info string; empty means an untagged fence
    text: str  # code body, without the surrounding fence lines

77 

78 

# Tagged union of every supported block model. Pydantic selects the concrete
# class from the ``type`` field of the incoming data.
Block = Annotated[
    Union[ParagraphBlock, BulletListBlock, OrderedListBlock, CodeBlock],
    Field(discriminator="type"),
]

83 

84 

85# Section / Document --------------------------------------------------------- 

86 

87 

# One heading plus the blocks that follow it, up to the next heading.
class Section(BaseModel):
    model_config = ConfigDict(extra="forbid")  # unknown keys fail validation
    id: str  # identifier matched by ``section_by_id`` / ``section_index``
    heading: str  # heading text; ``render_section`` adds the ``#`` prefix
    level: int = Field(default=2, ge=1, le=6)  # number of ``#`` characters, validated to 1..6
    blocks: list[Block] = Field(default_factory=list)  # section content, in document order

94 

95 

class StructuredDocument(BaseModel):
    """Top-level structured representation of a mental-model document.

    Pydantic-serialisable so it can be persisted as JSONB on Postgres
    via :class:`astrocyte.provider.MentalModelStore` implementations.
    """

    model_config = ConfigDict(extra="forbid")
    version: Literal[1] = 1
    sections: list[Section] = Field(default_factory=list)

    def section_by_id(self, section_id: str) -> Section | None:
        """Return the first section whose ``id`` equals *section_id*, else ``None``."""
        matching = (sec for sec in self.sections if sec.id == section_id)
        return next(matching, None)

    def section_index(self, section_id: str) -> int | None:
        """Return the position of the first section with *section_id*, else ``None``."""
        positions = (
            pos for pos, sec in enumerate(self.sections) if sec.id == section_id
        )
        return next(positions, None)

118 

119 

120# Slug helpers --------------------------------------------------------------- 

121 

# Any run of characters outside ``a-z0-9`` collapses to a single hyphen.
_SLUG_RX = re.compile(r"[^a-z0-9]+")


def slugify_heading(heading: str) -> str:
    """Derive a deterministic, lowercase, hyphen-separated slug from a heading.

        "Stop Conditions"    -> "stop-conditions"
        "Inputs and Context" -> "inputs-and-context"

    A heading with no ASCII letters or digits yields the fallback ``"section"``.
    """
    normalised = heading.strip().lower()
    hyphenated = _SLUG_RX.sub("-", normalised)
    trimmed = hyphenated.strip("-")
    if not trimmed:
        return "section"
    return trimmed

133 

134 

def make_unique_id(base: str, existing: set[str]) -> str:
    """Return *base*, or the first of ``base-2``, ``base-3``, … not in *existing*.

    *existing* is only read; the caller records the returned id itself.
    """
    candidate = base
    suffix = 1
    while candidate in existing:
        suffix += 1
        candidate = f"{base}-{suffix}"
    return candidate

143 

144 

145# Renderer ------------------------------------------------------------------- 

146 

147 

def render_block(block: Block) -> str:
    """Return the markdown for one block, without a trailing newline.

    Raises:
        TypeError: if *block* is not one of the known block models.
    """
    if isinstance(block, ParagraphBlock):
        return block.text.rstrip()

    if isinstance(block, BulletListBlock):
        bullet_lines = ["- " + entry.rstrip() for entry in block.items]
        return "\n".join(bullet_lines)

    if isinstance(block, OrderedListBlock):
        # Numbers are regenerated from position, starting at 1.
        numbered_lines = [
            f"{number}. {entry.rstrip()}"
            for number, entry in enumerate(block.items, start=1)
        ]
        return "\n".join(numbered_lines)

    if isinstance(block, CodeBlock):
        opening_fence = "```" + (block.language or "")
        return "\n".join((opening_fence, block.text, "```"))

    raise TypeError(f"Unknown block type: {type(block)!r}")

160 

161 

def render_section(section: Section) -> str:
    """Return the heading line followed by each block, every block preceded by a blank line."""
    heading_line = f"{'#' * section.level} {section.heading.strip()}"
    output_lines = [heading_line]
    for blk in section.blocks:
        # An empty entry becomes the blank separator line once joined.
        output_lines.extend(("", render_block(blk)))
    return "\n".join(output_lines)

169 

170 

def render_document(doc: StructuredDocument) -> str:
    """Return the markdown for the whole document.

    Rendered sections are joined with one blank line between them and
    the result ends with a single newline. A document with no sections
    renders as the empty string. Rendering is deterministic: the same
    structure always yields the same markdown.
    """
    rendered_sections = [render_section(sec) for sec in doc.sections]
    if not rendered_sections:
        return ""
    return "\n\n".join(rendered_sections) + "\n"

181 

182 

183# Parser --------------------------------------------------------------------- 

184# 

185# The parser is intentionally lenient: it accepts the markdown produced 

186# by our own renderer (round-trip-safe) and the markdown an LLM tends to 

187# produce for mental-model documents. It is *not* a general CommonMark 

188# parser — it does not need to be. When it cannot classify a block it 

189# falls back to a paragraph so that no content is silently dropped. 

190 

# ATX heading: group 1 is the run of 1-6 ``#`` characters, group 2 the
# heading text with surrounding whitespace excluded.
_HEADING_RX = re.compile(r"^(#{1,6})\s+(.+?)\s*$")
# Bullet item: optional indent, one of ``-`` / ``*`` / ``+``, whitespace,
# then the item text (group 1).
_BULLET_RX = re.compile(r"^\s*[-*+]\s+(.*)$")
# Ordered item: optional indent, digits followed by ``.`` or ``)``,
# whitespace, then the item text (group 1).
_ORDERED_RX = re.compile(r"^\s*\d+[.)]\s+(.*)$")
# Code fence at column 0: three backticks plus an optional language tag
# (group 1). A bare closing fence matches too, with an empty group 1.
_FENCE_RX = re.compile(r"^```([A-Za-z0-9_+-]*)\s*$")

195 

196 

197def _strip_separators(lines: list[str]) -> list[str]: 

198 """Drop horizontal-rule lines (``---``, ``***``) used as section separators. 

199 

200 Our renderer never emits these, but LLM output frequently includes 

201 them between sections; treating them as blank lines avoids parsing 

202 them as paragraphs. 

203 """ 

204 return ["" if re.fullmatch(r"\s*([-*_])\1{2,}\s*", line) else line for line in lines] 

205 

206 

def _split_blocks(lines: list[str]) -> list[list[str]]:
    """Group lines into block chunks, one chunk per future block.

    Outside code fences, a blank line ends the current chunk. A fenced
    code block always forms its own chunk: the opening fence closes any
    chunk in progress, and the closing fence ends the code chunk. Fences
    may therefore sit directly against paragraph text with no blank line
    between them, which LLM-written markdown often does, without the
    code being merged into the neighbouring paragraph. Blank lines
    inside a fence are kept. An unclosed fence runs to the end of
    *lines*.

    Returns:
        The non-empty chunks in document order.
    """
    chunks: list[list[str]] = []
    current: list[str] = []
    in_fence = False

    def flush() -> None:
        """Close the chunk in progress, if it holds any lines."""
        nonlocal current
        if current:
            chunks.append(current)
            current = []

    for line in lines:
        if _FENCE_RX.match(line):
            if in_fence:
                # Closing fence: it belongs to the code chunk, which ends here.
                current.append(line)
                flush()
            else:
                # Opening fence: whatever came before is a separate block.
                flush()
                current.append(line)
            in_fence = not in_fence
        elif in_fence or line.strip():
            current.append(line)
        else:
            flush()

    flush()
    return chunks

229 

230 

def _parse_block(chunk: list[str]) -> Block:
    """Classify one chunk of lines and build the matching block.

    Tried in order: fenced code (first line is a fence), bullet list
    (every line is a bullet item), ordered list (every line is a
    numbered item). Anything else becomes a paragraph whose lines are
    stripped and joined with single spaces, so no content is dropped.
    """
    opener = _FENCE_RX.match(chunk[0]) if chunk else None
    if opener:
        code_lines = chunk[1:]
        # Drop the closing fence when present; an unclosed fence keeps every line.
        if code_lines and _FENCE_RX.match(code_lines[-1]):
            code_lines = code_lines[:-1]
        return CodeBlock(language=opener.group(1), text="\n".join(code_lines))

    list_kinds = (
        (_BULLET_RX, BulletListBlock),
        (_ORDERED_RX, OrderedListBlock),
    )
    for pattern, block_cls in list_kinds:
        hits = [pattern.match(row) for row in chunk]
        if all(hits):
            # ``all`` guarantees every entry is a real match object here.
            return block_cls(items=[hit.group(1).strip() for hit in hits])

    return ParagraphBlock(text=" ".join(row.strip() for row in chunk).strip())

258 

259 

def parse_markdown(markdown: str) -> StructuredDocument:
    """Best-effort parse of a markdown document into the structured schema.

    Sections are introduced by ATX headings (``#``..``######``). Content
    before the first heading is wrapped into an implicit level-2
    "Overview" section so user content is never silently dropped. The
    implicit section is only created when that content produces at least
    one block; leading blank lines or separator lines alone do not
    create an empty "Overview". Section ids are unique slugs of their
    headings.

    Lines inside fenced code blocks are passed through verbatim. They
    are neither treated as headings nor blanked as horizontal rules, so
    code containing ``# comment`` or ``---`` lines survives a
    render/parse round trip.

    Used for lazy migration of legacy mental-model rows whose ``content``
    column is raw markdown: the first refresh that touches such a row
    parses it once and stores the structured representation from then on.

    Args:
        markdown: Source text. ``None`` or an empty string yields a
            document with no sections.

    Returns:
        The parsed document, with sections in source order.
    """
    sections: list[Section] = []
    used_ids: set[str] = set()
    pending: list[str] = []
    current: Section | None = None

    def new_section(heading: str, level: int, base_id: str) -> Section:
        """Create a section with an id that is unique within this document."""
        section_id = make_unique_id(base_id, used_ids)
        used_ids.add(section_id)
        return Section(id=section_id, heading=heading, level=level)

    def close_current() -> None:
        """Turn the pending lines into blocks and emit the section they belong to."""
        nonlocal current
        blocks = [_parse_block(chunk) for chunk in _split_blocks(pending)]
        pending.clear()
        if current is None:
            if not blocks:
                # Only blank/separator lines so far: nothing worth a section.
                return
            current = new_section("Overview", 2, "overview")
        current.blocks.extend(blocks)
        sections.append(current)
        current = None

    in_fence = False
    for raw in (markdown or "").splitlines():
        # Track fences with the same toggle rule as ``_split_blocks`` so both
        # agree on where code starts and ends.
        if _FENCE_RX.match(raw):
            in_fence = not in_fence
            pending.append(raw)
            continue
        if in_fence:
            pending.append(raw)
            continue

        # Outside code, horizontal rules are treated as blank lines.
        (line,) = _strip_separators([raw])
        heading_match = _HEADING_RX.match(line)
        if heading_match is None:
            pending.append(line)
            continue

        close_current()
        heading = heading_match.group(2).strip()
        current = new_section(
            heading, len(heading_match.group(1)), slugify_heading(heading)
        )

    close_current()
    return StructuredDocument(sections=sections)