Coverage for astrocyte/documents/builders/md_builder.py: 100%

52 statements  

« prev     ^ index     » next       coverage.py v7.15.0, created at 2026-07-04 05:24 +0000

1r"""Markdown → DocumentTree builder. 

2 

3PageIndex parity (see ``/Users/calvin/AstrocyteAI/PageIndex/pageindex/page_index_md.py``): 

4 

5- Header regex: ``^(#{1,6})\s+(.+)$`` — all six heading levels become 

6 nodes. 

7- Code-block aware: triple-backtick fences are tracked so headings 

8 inside code samples don't accidentally split sections. 

9- Each node owns the text from its heading line up to (but not 

10 including) the next heading at the same-or-shallower level. 

11- Children attach to the most-recent ancestor with strictly shallower 

12 depth — the standard markdown nesting rule. 

13 

14This module ONLY constructs the tree structure. Per-node summarization 

15is the summarizer's job (Phase 2). The tree returned here has 

16``summary=None`` on every node. 

17 

18Public API: 

19 build_markdown_tree(md_text, document_id) -> DocumentTree 

20""" 

21 

22from __future__ import annotations 

23 

24import re 

25 

26from astrocyte.documents.types import DocumentTree, TreeNode 

27 

28_HEADER_RE = re.compile(r"^(#{1,6})\s+(.+)$") 

29_CODE_FENCE_RE = re.compile(r"^```") 

30 

31 

32def _extract_node_headers(md_text: str) -> list[tuple[int, int, str]]: 

33 """Pass 1 of PageIndex's md→tree: locate every header line. 

34 

35 Returns a list of ``(line_num, depth, title)`` triples in document 

36 order. ``line_num`` is 1-indexed. Code blocks are excluded so 

37 headers inside ```fenced``` regions don't appear. 

38 """ 

39 headers: list[tuple[int, int, str]] = [] 

40 in_code = False 

41 for i, line in enumerate(md_text.splitlines(), start=1): 

42 stripped = line.strip() 

43 if _CODE_FENCE_RE.match(stripped): 

44 in_code = not in_code 

45 continue 

46 if in_code: 

47 continue 

48 if not stripped: 

49 continue 

50 m = _HEADER_RE.match(stripped) 

51 if m: 

52 depth = len(m.group(1)) 

53 title = m.group(2).strip() 

54 headers.append((i, depth, title)) 

55 return headers 

56 

57 

58def _build_node_text( 

59 headers: list[tuple[int, int, str]], 

60 md_lines: list[str], 

61 idx: int, 

62) -> tuple[str, int]: 

63 """Compute the text body for header at ``headers[idx]``. 

64 

65 Returns ``(text, end_line)``. The text spans from the header line 

66 (inclusive, 1-indexed) up to but not including the next header 

67 line of any depth (PageIndex behavior — each leaf "owns" its body 

68 until any next header). ``end_line`` is the exclusive end (next 

69 header's line_num, or total_lines+1 if last). 

70 """ 

71 start_line = headers[idx][0] # 1-indexed 

72 if idx + 1 < len(headers): 

73 end_line = headers[idx + 1][0] 

74 else: 

75 end_line = len(md_lines) + 1 

76 # md_lines is 0-indexed; convert 

77 body = "\n".join(md_lines[start_line - 1 : end_line - 1]).strip() 

78 return body, end_line 

79 

80 

81def build_markdown_tree(md_text: str, document_id: str) -> DocumentTree: 

82 """Build a DocumentTree from markdown text. 

83 

84 Implementation mirrors PageIndex's two-pass approach: 

85 

86 Pass 1: scan lines, locate every header (depth + title + line_num). 

87 Pass 2: stack-based tree construction — each header attaches under 

88 the most-recent ancestor with strictly shallower depth. 

89 

90 Each node's text body is the lines from its header through the next 

91 header of any depth, stripped. Roots are nodes whose parent is None 

92 (i.e., the document started with that depth, no shallower header 

93 preceded it). 

94 

95 Edge cases: 

96 - Markdown with NO headers → returns DocumentTree with one 

97 synthetic root holding the full text. (Most documents have at 

98 least one heading; the bench harness for LME/LoCoMo synthesizes 

99 ``## Session N`` headers, so this fallback rarely fires in 

100 practice but keeps the contract well-defined.) 

101 - Empty input → DocumentTree with empty ``roots`` list. 

102 """ 

103 if not md_text or not md_text.strip(): 

104 return DocumentTree(document_id=document_id, roots=[]) 

105 

106 md_lines = md_text.splitlines() 

107 headers = _extract_node_headers(md_text) 

108 

109 if not headers: 

110 # Headerless input — produce one synthetic root 

111 synthetic = TreeNode.new( 

112 parent_id=None, 

113 depth=1, 

114 title="(untitled document)", 

115 text=md_text.strip(), 

116 line_start=1, 

117 line_end=len(md_lines), 

118 ) 

119 return DocumentTree(document_id=document_id, roots=[synthetic]) 

120 

121 # Build all nodes first (no parent assignments yet) 

122 nodes: list[TreeNode] = [] 

123 for idx, (line_num, depth, title) in enumerate(headers): 

124 text, end_line = _build_node_text(headers, md_lines, idx) 

125 nodes.append( 

126 TreeNode.new( 

127 parent_id=None, # filled in by stack pass below 

128 depth=depth, 

129 title=title, 

130 text=text, 

131 line_start=line_num, 

132 line_end=end_line - 1, 

133 ), 

134 ) 

135 

136 # Stack-based parent assignment. Stack holds (depth, node) pairs of 

137 # candidate ancestors. For each new node, pop until top has strictly 

138 # shallower depth; that's the parent. If stack empties → new root. 

139 roots: list[TreeNode] = [] 

140 stack: list[TreeNode] = [] 

141 for node in nodes: 

142 while stack and stack[-1].depth >= node.depth: 

143 stack.pop() 

144 if stack: 

145 stack[-1].add_child(node) 

146 else: 

147 roots.append(node) 

148 stack.append(node) 

149 

150 return DocumentTree(document_id=document_id, roots=roots)