Coverage for astrocyte/documents/builders/md

1r"""Markdown → DocumentTree builder.

3PageIndex parity (see ``/Users/calvin/AstrocyteAI/PageIndex/pageindex/page_index_md.py``):

5- Header regex: ``^(#{1,6})\s+(.+)$`` — all six heading levels become

6 nodes.

7- Code-block aware: triple-backtick fences are tracked so headings

8 inside code samples don't accidentally split sections.

9- Each node owns the text from its heading line up to (but not

10 including) the next heading at the same-or-shallower level.

11- Children attach to the most-recent ancestor with strictly shallower

12 depth — the standard markdown nesting rule.

14This module ONLY constructs the tree structure. Per-node summarization

15is the summarizer's job (Phase 2). The tree returned here has

16``summary=None`` on every node.

18Public API:

19 build_markdown_tree(md_text, document_id) -> DocumentTree

20"""

22from __future__ import annotations

24import re

26from astrocyte.documents.types import DocumentTree, TreeNode

28_HEADER_RE = re.compile(r"^(#{1,6})\s+(.+)$")

29_CODE_FENCE_RE = re.compile(r"^```")

32def _extract_node_headers(md_text: str) -> list[tuple[int, int, str]]:

33 """Pass 1 of PageIndex's md→tree: locate every header line.

35 Returns a list of ``(line_num, depth, title)`` triples in document

36 order. ``line_num`` is 1-indexed. Code blocks are excluded so

37 headers inside ```fenced``` regions don't appear.

38 """

39 headers: list[tuple[int, int, str]] = []

40 in_code = False

41 for i, line in enumerate(md_text.splitlines(), start=1):

42 stripped = line.strip()

43 if _CODE_FENCE_RE.match(stripped):

44 in_code = not in_code

45 continue

46 if in_code:

47 continue

48 if not stripped:

49 continue

50 m = _HEADER_RE.match(stripped)

51 if m:

52 depth = len(m.group(1))

53 title = m.group(2).strip()

54 headers.append((i, depth, title))

55 return headers

58def _build_node_text(

59 headers: list[tuple[int, int, str]],

60 md_lines: list[str],

61 idx: int,

62) -> tuple[str, int]:

63 """Compute the text body for header at ``headers[idx]``.

65 Returns ``(text, end_line)``. The text spans from the header line

66 (inclusive, 1-indexed) up to but not including the next header

67 line of any depth (PageIndex behavior — each leaf "owns" its body

68 until any next header). ``end_line`` is the exclusive end (next

69 header's line_num, or total_lines+1 if last).

70 """

71 start_line = headers[idx][0] # 1-indexed

72 if idx + 1 < len(headers):

73 end_line = headers[idx + 1][0]

74 else:

75 end_line = len(md_lines) + 1

76 # md_lines is 0-indexed; convert

77 body = "\n".join(md_lines[start_line - 1 : end_line - 1]).strip()

78 return body, end_line

81def build_markdown_tree(md_text: str, document_id: str) -> DocumentTree:

82 """Build a DocumentTree from markdown text.

84 Implementation mirrors PageIndex's two-pass approach:

86 Pass 1: scan lines, locate every header (depth + title + line_num).

87 Pass 2: stack-based tree construction — each header attaches under

88 the most-recent ancestor with strictly shallower depth.

90 Each node's text body is the lines from its header through the next

91 header of any depth, stripped. Roots are nodes whose parent is None

92 (i.e., the document started with that depth, no shallower header

93 preceded it).

95 Edge cases:

96 - Markdown with NO headers → returns DocumentTree with one

97 synthetic root holding the full text. (Most documents have at

98 least one heading; the bench harness for LME/LoCoMo synthesizes

99 ``## Session N`` headers, so this fallback rarely fires in

100 practice but keeps the contract well-defined.)

101 - Empty input → DocumentTree with empty ``roots`` list.

102 """

103 if not md_text or not md_text.strip():

104 return DocumentTree(document_id=document_id, roots=[])

105

106 md_lines = md_text.splitlines()

107 headers = _extract_node_headers(md_text)

108

109 if not headers:

110 # Headerless input — produce one synthetic root

111 synthetic = TreeNode.new(

112 parent_id=None,

113 depth=1,

114 title="(untitled document)",

115 text=md_text.strip(),

116 line_start=1,

117 line_end=len(md_lines),

118 )

119 return DocumentTree(document_id=document_id, roots=[synthetic])

120

121 # Build all nodes first (no parent assignments yet)

122 nodes: list[TreeNode] = []

123 for idx, (line_num, depth, title) in enumerate(headers):

124 text, end_line = _build_node_text(headers, md_lines, idx)

125 nodes.append(

126 TreeNode.new(

127 parent_id=None, # filled in by stack pass below

128 depth=depth,

129 title=title,

130 text=text,

131 line_start=line_num,

132 line_end=end_line - 1,

133 ),

134 )

135

136 # Stack-based parent assignment. Stack holds (depth, node) pairs of

137 # candidate ancestors. For each new node, pop until top has strictly

138 # shallower depth; that's the parent. If stack empties → new root.

139 roots: list[TreeNode] = []

140 stack: list[TreeNode] = []

141 for node in nodes:

142 while stack and stack[-1].depth >= node.depth:

143 stack.pop()

144 if stack:

145 stack[-1].add_child(node)

146 else:

147 roots.append(node)

148 stack.append(node)

149

150 return DocumentTree(document_id=document_id, roots=roots)

Coverage for astrocyte/documents/builders/md_builder.py: 100%

52 statements