Coverage for astrocyte/pipeline/structured

1"""Structured representation of a mental model document (M21).

Storing mental models as raw markdown forces every refresh to round-trip
prose through an LLM, which then drifts on stylistic details (numbered
vs bulleted lists, casing, separator lines, paraphrasing) even when
instructed to preserve content byte-for-byte. The intrinsic mechanism
of an LLM is to *generate* the next token from a gestalt of the input
— not to copy tokens verbatim — so any "preserve unchanged content"
instruction is fundamentally a soft constraint.

The fix is to give the LLM no opportunity to drift on unchanged
content. We keep an authoritative structured representation of the
document; the markdown shown to users / callers is a deterministic
render of that structure. Delta refreshes emit *operations* against
the structure (see :mod:`astrocyte.pipeline.delta_ops`); sections and
blocks not mentioned by any operation are physically untouched.

Ported from Hindsight ``hindsight_api/engine/reflect/structured_doc.py``
under the project's MIT licence; adapted to live alongside Astrocyte's
existing mental-model storage in
:class:`astrocyte.types.MentalModel`. Old mental-model rows with a
plain ``content`` markdown string remain readable — :func:`parse_markdown`
gives a best-effort conversion to the structured shape on first
refresh (lazy migration).

Schema (v1)
-----------
A document is an ordered list of :class:`Section`. Each section has:

- ``id`` — stable slug derived from ``heading`` (used as the
  operation target across refreshes; renames are an explicit op).
- ``heading`` — markdown heading text (no ``#`` prefix).
- ``level`` — 1 (``#``) … 6 (``######``). Default 2.
- ``blocks`` — ordered list of typed blocks: paragraph, bullet_list,
  ordered_list, code.

The schema is intentionally narrow: it covers what real mental-model
documents actually contain. Tables, images, and raw HTML are out of
scope until a concrete use case lands. New block types should be added
by extending the :data:`Block` union and updating
:func:`render_block` + :func:`_parse_block`.

42"""

44from __future__ import annotations

46import re

47from typing import Annotated, Literal, Union

49from pydantic import BaseModel, ConfigDict, Field

51# Blocks ---------------------------------------------------------------------

class ParagraphBlock(BaseModel):
    # Free-form prose block. The renderer emits ``text`` with trailing
    # whitespace stripped, and the parser falls back to this type for any
    # chunk it cannot classify as a list or a fenced code block.

    # Unknown keys are rejected at validation time.
    model_config = ConfigDict(extra="forbid")
    # Discriminator value used by the ``Block`` tagged union.
    type: Literal["paragraph"] = "paragraph"
    # Paragraph body as markdown inline text.
    text: str

class BulletListBlock(BaseModel):
    # Unordered list block. The renderer emits one ``- <item>`` line per
    # entry, in list order.

    # Unknown keys are rejected at validation time.
    model_config = ConfigDict(extra="forbid")
    # Discriminator value used by the ``Block`` tagged union.
    type: Literal["bullet_list"] = "bullet_list"
    # Item texts without the leading bullet marker; defaults to a fresh
    # empty list per instance.
    items: list[str] = Field(default_factory=list)

class OrderedListBlock(BaseModel):
    # Numbered list block. The renderer regenerates the numbering from each
    # item's position (``1.``, ``2.``, ...), so stored items carry no number.

    # Unknown keys are rejected at validation time.
    model_config = ConfigDict(extra="forbid")
    # Discriminator value used by the ``Block`` tagged union.
    type: Literal["ordered_list"] = "ordered_list"
    # Item texts without the leading ``N.`` marker; defaults to a fresh
    # empty list per instance.
    items: list[str] = Field(default_factory=list)

class CodeBlock(BaseModel):
    # Fenced code block. Rendered as a triple-backtick fence with the
    # optional language tag on the opening line.

    # Unknown keys are rejected at validation time.
    model_config = ConfigDict(extra="forbid")
    # Discriminator value used by the ``Block`` tagged union.
    type: Literal["code"] = "code"
    # Info string placed after the opening fence; empty means no language tag.
    language: str = ""
    # Code body exactly as it appears between the fence lines (the fence
    # lines themselves are not part of ``text``).
    text: str

# Tagged union of every supported block model. Pydantic selects the concrete
# model from the ``type`` field, so serialised documents deserialise back to
# the right block class.
Block = Annotated[
    Union[ParagraphBlock, BulletListBlock, OrderedListBlock, CodeBlock],
    Field(discriminator="type"),
]

85# Section / Document ---------------------------------------------------------

class Section(BaseModel):
    # One headed section of a document: a heading plus an ordered list of
    # typed blocks.

    # Unknown keys are rejected at validation time.
    model_config = ConfigDict(extra="forbid")
    # Identifier used to look the section up (see
    # ``StructuredDocument.section_by_id``); the parser derives it from the
    # heading via ``slugify_heading`` + ``make_unique_id``.
    id: str
    # Heading text without the leading ``#`` markers.
    heading: str
    # Heading depth: 1 (``#``) through 6 (``######``); validated to that range.
    level: int = Field(default=2, ge=1, le=6)
    # Blocks in render order; defaults to a fresh empty list per instance.
    blocks: list[Block] = Field(default_factory=list)

class StructuredDocument(BaseModel):
    """Top-level structured representation of a mental-model document.

    Pydantic-serialisable so it can be persisted as JSONB on Postgres
    via :class:`astrocyte.provider.MentalModelStore` implementations.
    """

    model_config = ConfigDict(extra="forbid")
    version: Literal[1] = 1
    sections: list[Section] = Field(default_factory=list)

    def section_by_id(self, section_id: str) -> Section | None:
        """Return the first section whose ``id`` equals *section_id*, else ``None``."""
        return next(
            (candidate for candidate in self.sections if candidate.id == section_id),
            None,
        )

    def section_index(self, section_id: str) -> int | None:
        """Return the position of the first section with *section_id*, else ``None``."""
        return next(
            (
                position
                for position, candidate in enumerate(self.sections)
                if candidate.id == section_id
            ),
            None,
        )

118

119

120# Slug helpers ---------------------------------------------------------------

121

# Any run of characters outside ``a-z0-9`` collapses to a single hyphen.
_SLUG_RX = re.compile(r"[^a-z0-9]+")


def slugify_heading(heading: str) -> str:
    """Derive a deterministic, lowercase, hyphen-separated slug from a heading.

    Examples::

        "Stop Conditions"    -> "stop-conditions"
        "Inputs and Context" -> "inputs-and-context"

    A heading with no ASCII letters or digits yields the fallback ``"section"``.
    """
    normalised = heading.strip().lower()
    hyphenated = _SLUG_RX.sub("-", normalised)
    trimmed = hyphenated.strip("-")
    if trimmed:
        return trimmed
    return "section"

133

134

def make_unique_id(base: str, existing: set[str]) -> str:
    """Return *base*, or *base* with the lowest free ``-N`` suffix (N >= 2).

    ``existing`` is only read; the caller is responsible for recording the
    returned id.
    """
    candidate = base
    suffix = 1
    while candidate in existing:
        suffix += 1
        candidate = f"{base}-{suffix}"
    return candidate

143

144

145# Renderer -------------------------------------------------------------------

146

147

def render_block(block: Block) -> str:
    """Turn one block into its markdown text (without a trailing newline).

    Raises:
        TypeError: if *block* is not one of the known block models.
    """
    if isinstance(block, CodeBlock):
        fence_lang = block.language or ""
        return f"```{fence_lang}\n{block.text}\n```"

    if isinstance(block, OrderedListBlock):
        # Numbering is regenerated from position, starting at 1.
        numbered = []
        for position, entry in enumerate(block.items, start=1):
            numbered.append(f"{position}. {entry.rstrip()}")
        return "\n".join(numbered)

    if isinstance(block, BulletListBlock):
        bulleted = [f"- {entry.rstrip()}" for entry in block.items]
        return "\n".join(bulleted)

    if isinstance(block, ParagraphBlock):
        return block.text.rstrip()

    raise TypeError(f"Unknown block type: {type(block)!r}")

160

161

def render_section(section: Section) -> str:
    """Render a heading line followed by each block, preceded by a blank line."""
    heading_line = "#" * section.level + " " + section.heading.strip()
    rendered = [heading_line]
    for block in section.blocks:
        # The empty string becomes the blank separator line once joined.
        rendered.extend(("", render_block(block)))
    return "\n".join(rendered)

169

170

def render_document(doc: StructuredDocument) -> str:
    """Render every section of *doc* to a single markdown string.

    Consecutive sections are separated by one blank line and the result ends
    with a newline. An empty document renders as the empty string. The same
    structured input always yields the same text.
    """
    rendered_sections = [render_section(section) for section in doc.sections]
    if not rendered_sections:
        return ""
    body = "\n\n".join(rendered_sections)
    return body + "\n"

181

182

183# Parser ---------------------------------------------------------------------

184#

185# The parser is intentionally lenient: it accepts the markdown produced

186# by our own renderer (round-trip-safe) and the markdown an LLM tends to

187# produce for mental-model documents. It is *not* a general CommonMark

188# parser — it does not need to be. When it cannot classify a block it

189# falls back to a paragraph so that no content is silently dropped.

190

191_HEADING_RX = re.compile(r"^(#{1,6})\s+(.+?)\s*$")

192_BULLET_RX = re.compile(r"^\s*[-*+]\s+(.*)$")

193_ORDERED_RX = re.compile(r"^\s*\d+[.)]\s+(.*)$")

194_FENCE_RX = re.compile(r"^```([A-Za-z0-9_+-]*)\s*$")

195

196

197def _strip_separators(lines: list[str]) -> list[str]:

198 """Drop horizontal-rule lines (``---``, ``***``) used as section separators.

199

200 Our renderer never emits these, but LLM output frequently includes

201 them between sections; treating them as blank lines avoids parsing

202 them as paragraphs.

203 """

204 return ["" if re.fullmatch(r"\s*([-*_])\1{2,}\s*", line) else line for line in lines]

205

206

def _split_blocks(lines: list[str]) -> list[list[str]]:
    """Group lines into block chunks.

    Outside a fence, a chunk is a run of consecutive non-blank lines. A
    fenced code block always forms its own chunk, from the opening fence to
    the closing fence inclusive, and blank lines inside it are kept. The
    fence therefore ends any chunk in progress and closes its own chunk,
    even when the author left no blank line around it. Without that, prose
    directly above a fence would absorb the code into a paragraph, and
    prose directly below would be swallowed into the code block.

    An unterminated fence runs to the end of *lines*.
    """
    chunks: list[list[str]] = []
    current: list[str] = []
    in_fence = False

    def flush() -> None:
        # Emit the chunk in progress, if any, and start a new one.
        nonlocal current
        if current:
            chunks.append(current)
            current = []

    for line in lines:
        if _FENCE_RX.match(line):
            if in_fence:
                # Closing fence: it belongs to the code chunk, which ends here.
                current.append(line)
                flush()
            else:
                # Opening fence: terminate any paragraph/list it interrupts.
                flush()
                current.append(line)
            in_fence = not in_fence
        elif in_fence:
            current.append(line)
        elif line.strip():
            current.append(line)
        else:
            flush()

    flush()
    return chunks

229

230

def _parse_block(chunk: list[str]) -> Block:
    """Classify one chunk of lines and build the matching block model.

    Checked in order: fenced code (first line is a fence), bullet list
    (every line is a bullet), ordered list (every line is numbered). Anything
    else becomes a paragraph whose lines are stripped and joined with single
    spaces, so no content is dropped.
    """
    opener = _FENCE_RX.match(chunk[0]) if chunk else None
    if opener is not None:
        body = chunk[1:]
        # Drop the closing fence when present; an unterminated fence keeps
        # every remaining line as code.
        if body and _FENCE_RX.match(body[-1]):
            body = body[:-1]
        return CodeBlock(language=opener.group(1), text="\n".join(body))

    bullet_hits = [_BULLET_RX.match(line) for line in chunk]
    if all(bullet_hits):
        return BulletListBlock(
            items=[hit.group(1).strip() for hit in bullet_hits if hit is not None]
        )

    ordered_hits = [_ORDERED_RX.match(line) for line in chunk]
    if all(ordered_hits):
        return OrderedListBlock(
            items=[hit.group(1).strip() for hit in ordered_hits if hit is not None]
        )

    flattened = " ".join(line.strip() for line in chunk)
    return ParagraphBlock(text=flattened.strip())

258

259

def parse_markdown(markdown: str) -> StructuredDocument:
    """Best-effort parse of a markdown document into the structured schema.

    Sections are introduced by ATX headings (``#``..``######``). Anything
    before the first heading is wrapped into an implicit "Overview"
    section so we never silently drop user content. Section IDs are
    unique slugs of their headings.

    Lines inside a fenced code block are never treated as headings, so a
    ``# comment`` in a code snippet stays part of the code instead of
    starting a new section. The implicit "Overview" section is created only
    when the text before the first heading yields at least one block;
    leading blank lines or separators alone do not produce an empty section,
    and an empty or whitespace-only input yields a document with no sections.

    Used for lazy migration of legacy mental-model rows whose ``content``
    column is raw markdown — the first refresh that touches such a row
    parses it once and stores the structured representation going
    forward.

    Args:
        markdown: Raw markdown text; ``None`` or ``""`` is treated as empty.

    Returns:
        A :class:`StructuredDocument` with sections in source order.
    """
    lines = _strip_separators((markdown or "").splitlines())

    sections: list[Section] = []
    used_ids: set[str] = set()
    pending: list[str] = []
    current: Section | None = None
    in_fence = False

    def close_current() -> None:
        # Turn the buffered lines into blocks and attach them to the open
        # section, or to an implicit "Overview" when no heading was seen yet.
        blocks = [_parse_block(chunk) for chunk in _split_blocks(pending)]
        pending.clear()
        if current is not None:
            current.blocks.extend(blocks)
            sections.append(current)
        elif blocks:
            overview_id = make_unique_id("overview", used_ids)
            used_ids.add(overview_id)
            implicit = Section(id=overview_id, heading="Overview", level=2)
            implicit.blocks.extend(blocks)
            sections.append(implicit)

    for line in lines:
        if _FENCE_RX.match(line):
            # Track fence state with the same test the block splitter uses.
            in_fence = not in_fence
            pending.append(line)
            continue

        m = None if in_fence else _HEADING_RX.match(line)
        if m is None:
            pending.append(line)
            continue

        # A new heading closes whatever came before it.
        close_current()
        heading = m.group(2).strip()
        section_id = make_unique_id(slugify_heading(heading), used_ids)
        used_ids.add(section_id)
        current = Section(id=section_id, heading=heading, level=len(m.group(1)))

    close_current()
    return StructuredDocument(sections=sections)

Coverage for astrocyte/pipeline/structured_doc.py: 96%

166 statements