Coverage for astrocyte/documents/builders/md_builder.py: 100%
52 statements
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
1r"""Markdown → DocumentTree builder.
3PageIndex parity (see ``/Users/calvin/AstrocyteAI/PageIndex/pageindex/page_index_md.py``):
5- Header regex: ``^(#{1,6})\s+(.+)$`` — all six heading levels become
6 nodes.
7- Code-block aware: triple-backtick fences are tracked so headings
8 inside code samples don't accidentally split sections.
9- Each node owns the text from its heading line up to (but not
10 including) the next heading at the same-or-shallower level.
11- Children attach to the most-recent ancestor with strictly shallower
12 depth — the standard markdown nesting rule.
14This module ONLY constructs the tree structure. Per-node summarization
15is the summarizer's job (Phase 2). The tree returned here has
16``summary=None`` on every node.
18Public API:
19 build_markdown_tree(md_text, document_id) -> DocumentTree
20"""
22from __future__ import annotations
24import re
26from astrocyte.documents.types import DocumentTree, TreeNode
28_HEADER_RE = re.compile(r"^(#{1,6})\s+(.+)$")
29_CODE_FENCE_RE = re.compile(r"^```")
32def _extract_node_headers(md_text: str) -> list[tuple[int, int, str]]:
33 """Pass 1 of PageIndex's md→tree: locate every header line.
35 Returns a list of ``(line_num, depth, title)`` triples in document
36 order. ``line_num`` is 1-indexed. Code blocks are excluded so
37 headers inside ```fenced``` regions don't appear.
38 """
39 headers: list[tuple[int, int, str]] = []
40 in_code = False
41 for i, line in enumerate(md_text.splitlines(), start=1):
42 stripped = line.strip()
43 if _CODE_FENCE_RE.match(stripped):
44 in_code = not in_code
45 continue
46 if in_code:
47 continue
48 if not stripped:
49 continue
50 m = _HEADER_RE.match(stripped)
51 if m:
52 depth = len(m.group(1))
53 title = m.group(2).strip()
54 headers.append((i, depth, title))
55 return headers
58def _build_node_text(
59 headers: list[tuple[int, int, str]],
60 md_lines: list[str],
61 idx: int,
62) -> tuple[str, int]:
63 """Compute the text body for header at ``headers[idx]``.
65 Returns ``(text, end_line)``. The text spans from the header line
66 (inclusive, 1-indexed) up to but not including the next header
67 line of any depth (PageIndex behavior — each leaf "owns" its body
68 until any next header). ``end_line`` is the exclusive end (next
69 header's line_num, or total_lines+1 if last).
70 """
71 start_line = headers[idx][0] # 1-indexed
72 if idx + 1 < len(headers):
73 end_line = headers[idx + 1][0]
74 else:
75 end_line = len(md_lines) + 1
76 # md_lines is 0-indexed; convert
77 body = "\n".join(md_lines[start_line - 1 : end_line - 1]).strip()
78 return body, end_line
81def build_markdown_tree(md_text: str, document_id: str) -> DocumentTree:
82 """Build a DocumentTree from markdown text.
84 Implementation mirrors PageIndex's two-pass approach:
86 Pass 1: scan lines, locate every header (depth + title + line_num).
87 Pass 2: stack-based tree construction — each header attaches under
88 the most-recent ancestor with strictly shallower depth.
90 Each node's text body is the lines from its header through the next
91 header of any depth, stripped. Roots are nodes whose parent is None
92 (i.e., the document started with that depth, no shallower header
93 preceded it).
95 Edge cases:
96 - Markdown with NO headers → returns DocumentTree with one
97 synthetic root holding the full text. (Most documents have at
98 least one heading; the bench harness for LME/LoCoMo synthesizes
99 ``## Session N`` headers, so this fallback rarely fires in
100 practice but keeps the contract well-defined.)
101 - Empty input → DocumentTree with empty ``roots`` list.
102 """
103 if not md_text or not md_text.strip():
104 return DocumentTree(document_id=document_id, roots=[])
106 md_lines = md_text.splitlines()
107 headers = _extract_node_headers(md_text)
109 if not headers:
110 # Headerless input — produce one synthetic root
111 synthetic = TreeNode.new(
112 parent_id=None,
113 depth=1,
114 title="(untitled document)",
115 text=md_text.strip(),
116 line_start=1,
117 line_end=len(md_lines),
118 )
119 return DocumentTree(document_id=document_id, roots=[synthetic])
121 # Build all nodes first (no parent assignments yet)
122 nodes: list[TreeNode] = []
123 for idx, (line_num, depth, title) in enumerate(headers):
124 text, end_line = _build_node_text(headers, md_lines, idx)
125 nodes.append(
126 TreeNode.new(
127 parent_id=None, # filled in by stack pass below
128 depth=depth,
129 title=title,
130 text=text,
131 line_start=line_num,
132 line_end=end_line - 1,
133 ),
134 )
136 # Stack-based parent assignment. Stack holds (depth, node) pairs of
137 # candidate ancestors. For each new node, pop until top has strictly
138 # shallower depth; that's the parent. If stack empties → new root.
139 roots: list[TreeNode] = []
140 stack: list[TreeNode] = []
141 for node in nodes:
142 while stack and stack[-1].depth >= node.depth:
143 stack.pop()
144 if stack:
145 stack[-1].add_child(node)
146 else:
147 roots.append(node)
148 stack.append(node)
150 return DocumentTree(document_id=document_id, roots=roots)