Coverage for astrocyte/pipeline/structured_doc.py: 96%
166 statements
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
1"""Structured representation of a mental model document (M21).
3Storing mental models as raw markdown forces every refresh to round-trip
4prose through an LLM, which then drifts on stylistic details (numbered
5vs bulleted lists, casing, separator lines, paraphrasing) even when
6instructed to preserve content byte-for-byte. The intrinsic mechanism
7of an LLM is to *generate* the next token from a gestalt of the input
8— not to copy tokens verbatim — so any "preserve unchanged content"
9instruction is fundamentally a soft constraint.
11The fix is to give the LLM no opportunity to drift on unchanged
12content. We keep an authoritative structured representation of the
13document; the markdown shown to users / callers is a deterministic
14render of that structure. Delta refreshes emit *operations* against
15the structure (see :mod:`astrocyte.pipeline.delta_ops`); sections and
16blocks not mentioned by any operation are physically untouched.
18Ported from Hindsight ``hindsight_api/engine/reflect/structured_doc.py``
19under the project's MIT licence; adapted to live alongside Astrocyte's
20existing mental-model storage in
21:class:`astrocyte.types.MentalModel`. Old mental-model rows with a
22plain ``content`` markdown string remain readable — :func:`parse_markdown`
23gives a best-effort conversion to the structured shape on first
24refresh (lazy migration).
26Schema (v1)
27-----------
28A document is an ordered list of :class:`Section`. Each section has:
30- ``id`` — stable slug derived from ``heading`` (used as the
31 operation target across refreshes; renames are an explicit op).
32- ``heading`` — markdown heading text (no ``#`` prefix).
33- ``level`` — 1 (``#``) … 6 (``######``). Default 2.
34- ``blocks`` — ordered list of typed blocks: paragraph, bullet_list,
35 ordered_list, code.
37The schema is intentionally narrow: it covers what real mental-model
38documents actually contain. Tables, images, and raw HTML are out of
39scope until a concrete use case lands. New block types should be added
40by extending the :data:`Block` union and updating
41:func:`render_block` + :func:`_parse_block`.
42"""
44from __future__ import annotations
46import re
47from typing import Annotated, Literal, Union
49from pydantic import BaseModel, ConfigDict, Field
51# Blocks ---------------------------------------------------------------------
# Free-form prose block. ``render_block`` emits ``text`` with trailing
# whitespace removed.
class ParagraphBlock(BaseModel):
    # Reject any key that is not declared below instead of ignoring it.
    model_config = ConfigDict(extra="forbid")
    # Tag that identifies this block kind inside the ``Block`` union.
    type: Literal["paragraph"] = "paragraph"
    # The paragraph's text.
    text: str
# Unordered list block. ``render_block`` emits one ``- item`` line per entry.
class BulletListBlock(BaseModel):
    # Reject any key that is not declared below instead of ignoring it.
    model_config = ConfigDict(extra="forbid")
    # Tag that identifies this block kind inside the ``Block`` union.
    type: Literal["bullet_list"] = "bullet_list"
    # List entries in display order, without the leading ``- `` marker.
    items: list[str] = Field(default_factory=list)
# Numbered list block. ``render_block`` numbers the entries 1, 2, 3, … from
# their position, so no numbering is stored here.
class OrderedListBlock(BaseModel):
    # Reject any key that is not declared below instead of ignoring it.
    model_config = ConfigDict(extra="forbid")
    # Tag that identifies this block kind inside the ``Block`` union.
    type: Literal["ordered_list"] = "ordered_list"
    # List entries in display order, without the leading ``N. `` marker.
    items: list[str] = Field(default_factory=list)
# Fenced code block. ``render_block`` wraps ``text`` in a triple-backtick fence.
class CodeBlock(BaseModel):
    # Reject any key that is not declared below instead of ignoring it.
    model_config = ConfigDict(extra="forbid")
    # Tag that identifies this block kind inside the ``Block`` union.
    type: Literal["code"] = "code"
    # Info string written right after the opening fence; empty means none.
    language: str = ""
    # Code body, emitted verbatim between the fences.
    text: str
# Tagged union of every supported block kind. The ``type`` field is the
# discriminator, so validation picks the concrete model from its value.
Block = Annotated[
    Union[ParagraphBlock, BulletListBlock, OrderedListBlock, CodeBlock],
    Field(discriminator="type"),
]
85# Section / Document ---------------------------------------------------------
# One heading plus the ordered blocks that follow it.
class Section(BaseModel):
    # Reject any key that is not declared below instead of ignoring it.
    model_config = ConfigDict(extra="forbid")
    # Identifier used to look the section up (see
    # ``StructuredDocument.section_by_id``); ``parse_markdown`` derives it
    # from the heading via ``slugify_heading`` + ``make_unique_id``.
    id: str
    # Heading text without the leading ``#`` characters.
    heading: str
    # Heading depth: 1 (``#``) through 6 (``######``); validated to that range.
    level: int = Field(default=2, ge=1, le=6)
    # Blocks in display order.
    blocks: list[Block] = Field(default_factory=list)
class StructuredDocument(BaseModel):
    """Top-level structured representation of a mental-model document.

    Pydantic-serialisable so it can be persisted as JSONB on Postgres
    via :class:`astrocyte.provider.MentalModelStore` implementations.
    """

    model_config = ConfigDict(extra="forbid")
    version: Literal[1] = 1
    sections: list[Section] = Field(default_factory=list)

    def section_by_id(self, section_id: str) -> Section | None:
        """Return the first section whose ``id`` equals *section_id*, else ``None``."""
        matches = (sec for sec in self.sections if sec.id == section_id)
        return next(matches, None)

    def section_index(self, section_id: str) -> int | None:
        """Return the position of the first section with that ``id``, else ``None``."""
        positions = (
            pos for pos, sec in enumerate(self.sections) if sec.id == section_id
        )
        return next(positions, None)
120# Slug helpers ---------------------------------------------------------------
# Any run of characters outside ``a-z`` / ``0-9`` collapses to one hyphen.
_SLUG_RX = re.compile(r"[^a-z0-9]+")


def slugify_heading(heading: str) -> str:
    """Turn a heading into a deterministic lowercase, hyphen-separated slug.

    "Stop Conditions" -> "stop-conditions"
    "Inputs and Context" -> "inputs-and-context"

    A heading that contains no ASCII letter or digit yields ``"section"``.
    """
    lowered = heading.strip().lower()
    hyphenated = _SLUG_RX.sub("-", lowered)
    trimmed = hyphenated.strip("-")
    if trimmed:
        return trimmed
    return "section"
def make_unique_id(base: str, existing: set[str]) -> str:
    """Return *base* if it is free, otherwise ``base-2``, ``base-3``, … (first free)."""
    candidate = base
    suffix = 2
    # Keep bumping the numeric suffix until the candidate is not taken.
    while candidate in existing:
        candidate = f"{base}-{suffix}"
        suffix += 1
    return candidate
145# Renderer -------------------------------------------------------------------
def render_block(block: Block) -> str:
    """Return the markdown for one block, without a trailing newline.

    Raises:
        TypeError: if *block* is not one of the known block models.
    """
    if isinstance(block, CodeBlock):
        info = block.language or ""
        return "\n".join([f"```{info}", block.text, "```"])
    if isinstance(block, OrderedListBlock):
        numbered = [
            f"{number}. {entry.rstrip()}"
            for number, entry in enumerate(block.items, start=1)
        ]
        return "\n".join(numbered)
    if isinstance(block, BulletListBlock):
        bulleted = [f"- {entry.rstrip()}" for entry in block.items]
        return "\n".join(bulleted)
    if isinstance(block, ParagraphBlock):
        return block.text.rstrip()
    raise TypeError(f"Unknown block type: {type(block)!r}")
def render_section(section: Section) -> str:
    """Render the heading line, then every block preceded by one blank line."""
    hashes = "#" * section.level
    out_lines = [f"{hashes} {section.heading.strip()}"]
    for blk in section.blocks:
        # The empty string becomes the blank separator line once joined.
        out_lines.extend(("", render_block(blk)))
    return "\n".join(out_lines)
def render_document(doc: StructuredDocument) -> str:
    """Render every section of *doc* to one markdown string.

    Rendered sections are joined with one blank line between them and the
    result ends with a single newline. A document without sections
    renders as the empty string. The same structured input always
    produces the same output.
    """
    rendered = [render_section(sec) for sec in doc.sections]
    if not rendered:
        return ""
    body = "\n\n".join(rendered)
    return body + "\n"
183# Parser ---------------------------------------------------------------------
184#
185# The parser is intentionally lenient: it accepts the markdown produced
186# by our own renderer (round-trip-safe) and the markdown an LLM tends to
187# produce for mental-model documents. It is *not* a general CommonMark
188# parser — it does not need to be. When it cannot classify a block it
189# falls back to a paragraph so that no content is silently dropped.
# ATX heading: 1-6 ``#`` characters, whitespace, then the heading text.
_HEADING_RX = re.compile(r"^(#{1,6})\s+(.+?)\s*$")
# Bullet item: optional indent, one of ``-`` ``*`` ``+``, whitespace, text.
_BULLET_RX = re.compile(r"^\s*[-*+]\s+(.*)$")
# Ordered item: optional indent, digits, ``.`` or ``)``, whitespace, text.
_ORDERED_RX = re.compile(r"^\s*\d+[.)]\s+(.*)$")
# Code fence: exactly three backticks plus an optional language tag.
_FENCE_RX = re.compile(r"^```([A-Za-z0-9_+-]*)\s*$")
# Horizontal rule: a line made only of 3+ identical ``-``, ``*`` or ``_``.
_SEPARATOR_RX = re.compile(r"\s*([-*_])\1{2,}\s*")


def _strip_separators(lines: list[str]) -> list[str]:
    """Blank out horizontal-rule lines (``---``, ``***``, ``___``).

    Our renderer never emits these, but LLM output frequently includes
    them between sections; treating them as blank lines avoids parsing
    them as paragraphs.

    Lines between a paired opening and closing code fence are left
    untouched, so a ``---`` that is part of a code sample (for example a
    YAML document marker) is preserved. A fence that is never closed
    protects nothing, which keeps the handling of malformed input lenient.

    Args:
        lines: The document split into lines, without line terminators.

    Returns:
        A new list of the same length with separator lines replaced by ``""``.
    """
    fence_rows = [row for row, line in enumerate(lines) if _FENCE_RX.match(line)]
    protected: set[int] = set()
    # Consecutive fence lines pair up as (opener, closer); ``zip`` drops a
    # trailing opener that has no closer.
    for opener, closer in zip(fence_rows[0::2], fence_rows[1::2]):
        protected.update(range(opener + 1, closer))
    return [
        "" if row not in protected and _SEPARATOR_RX.fullmatch(line) else line
        for row, line in enumerate(lines)
    ]
def _split_blocks(lines: list[str]) -> list[list[str]]:
    """Group lines into block chunks.

    Outside a code fence, a chunk is a run of consecutive non-blank
    lines. A fenced code block always forms a chunk of its own: the
    opening fence closes whatever chunk was being built, and the closing
    fence ends the code chunk. That way a fence written directly under a
    paragraph (no blank line between them) is not glued to the paragraph,
    and text written directly under a closing fence is not absorbed into
    the code. Blank lines inside a fence are kept. A fence that is never
    closed runs to the end of *lines*.

    Args:
        lines: Lines without terminators.

    Returns:
        The chunks in input order; no chunk is empty.
    """
    chunks: list[list[str]] = []
    current: list[str] = []
    in_fence = False

    def flush() -> None:
        # Emit the chunk being built, if any, and start a fresh one.
        nonlocal current
        if current:
            chunks.append(current)
            current = []

    for line in lines:
        if _FENCE_RX.match(line):
            if in_fence:
                # Closing fence: it belongs to the code chunk, which ends here.
                current.append(line)
                flush()
            else:
                # Opening fence: finish any preceding paragraph/list first.
                flush()
                current.append(line)
            in_fence = not in_fence
        elif in_fence or line.strip():
            current.append(line)
        else:
            flush()
    flush()
    return chunks
def _parse_block(chunk: list[str]) -> Block:
    """Classify one chunk of lines as a code, bullet, ordered or paragraph block.

    A chunk whose first line is a fence becomes a code block (a trailing
    fence line, if present, is dropped from the body). A chunk in which
    every line is a bullet item, or every line is an ordered item,
    becomes the matching list block. Anything else is joined with single
    spaces into a paragraph.
    """
    opening = _FENCE_RX.match(chunk[0]) if chunk else None
    if opening is not None:
        body = chunk[1:]
        if body and _FENCE_RX.match(body[-1]):
            del body[-1]
        return CodeBlock(language=opening.group(1), text="\n".join(body))

    bullet_hits = [_BULLET_RX.match(line) for line in chunk]
    if all(bullet_hits):
        return BulletListBlock(
            items=[hit.group(1).strip() for hit in bullet_hits if hit is not None]
        )

    ordered_hits = [_ORDERED_RX.match(line) for line in chunk]
    if all(ordered_hits):
        return OrderedListBlock(
            items=[hit.group(1).strip() for hit in ordered_hits if hit is not None]
        )

    flattened = " ".join(map(str.strip, chunk))
    return ParagraphBlock(text=flattened.strip())
def parse_markdown(markdown: str) -> StructuredDocument:
    """Best-effort parse of a markdown document into the structured schema.

    Sections are introduced by ATX headings (``#``..``######``). Content
    before the first heading is wrapped into an implicit "Overview"
    section so we never silently drop user content; a preamble made only
    of blank lines produces no section. Section IDs are unique slugs of
    their headings.

    Lines inside a fenced code block that has both an opening and a
    closing fence are never treated as headings, so ``# comment`` lines in
    code samples stay inside their code block. An unclosed fence gets no
    such protection, which keeps one missing fence from swallowing the
    rest of the document.

    Used for lazy migration of legacy mental-model rows whose ``content``
    column is raw markdown — the first refresh that touches such a row
    parses it once and stores the structured representation going
    forward.

    Args:
        markdown: Raw markdown; ``None`` or ``""`` yields an empty document.

    Returns:
        The parsed document, with sections in source order.
    """
    lines = _strip_separators((markdown or "").splitlines())

    # Rows covered by a closed fence. Consecutive fence lines pair up as
    # (opener, closer); ``zip`` drops a trailing opener without a closer.
    fence_rows = [row for row, line in enumerate(lines) if _FENCE_RX.match(line)]
    fenced: set[int] = set()
    for opener, closer in zip(fence_rows[0::2], fence_rows[1::2]):
        fenced.update(range(opener, closer + 1))

    sections: list[Section] = []
    used_ids: set[str] = set()
    pending: list[str] = []

    def new_section(heading: str, level: int, base_id: str) -> Section:
        # Reserve a unique id and create an empty section.
        section_id = make_unique_id(base_id, used_ids)
        used_ids.add(section_id)
        return Section(id=section_id, heading=heading, level=level)

    def close_section(section: Section | None) -> None:
        # Move the buffered lines into ``section`` and record it. With no
        # open section, the buffer is pre-heading content: wrap it in an
        # implicit "Overview" section, unless it holds nothing but blanks.
        if section is None:
            if not any(line.strip() for line in pending):
                pending.clear()
                return
            section = new_section("Overview", 2, "overview")
        for chunk in _split_blocks(pending):
            section.blocks.append(_parse_block(chunk))
        pending.clear()
        sections.append(section)

    current: Section | None = None
    for row, line in enumerate(lines):
        heading_match = None if row in fenced else _HEADING_RX.match(line)
        if heading_match is None:
            pending.append(line)
            continue
        close_section(current)
        heading = heading_match.group(2).strip()
        current = new_section(
            heading, len(heading_match.group(1)), slugify_heading(heading)
        )

    close_section(current)
    return StructuredDocument(sections=sections)