Coverage for astrocyte/documents/types.py: 100%

71 statements  

« prev     ^ index     » next       coverage.py v7.15.0, created at 2026-07-04 05:24 +0000

1"""Document Engine data types — DocumentTree, TreeNode, NodeSummary, Document. 

2 

3PageIndex-inspired hierarchical document representation. Plain dataclasses; 

4no DB coupling, no Memory Engine knowledge. Persisted via the 

5``DocumentStore`` SPI in Phase 2. 

6 

7A ``Document`` is the top-level container. It has metadata (id, source 

8URI, content hash) and optionally a parsed ``DocumentTree``. The tree 

9contains nodes with parent/child relationships derived from document 

10structure (heading levels for markdown; TOC for PDFs). 

11 

12Each ``TreeNode`` carries: 

13 - identity (id, parent_id, depth) 

14 - title (the heading text) 

15 - text (the body of the node, including the heading line) 

16 - optional summary + summary_kind (raw text vs LLM-generated) 

17 - children (list of TreeNode references — convenience, may be empty 

18 if loaded flat from storage) 

19 

20Summary kinds (PageIndex parity): 

21 - "raw" → summary IS the node text (small node, no LLM call) 

22 - "llm" → summary was generated by an LLM (large node) 

23 - "prefix" → internal-node prefix-summary (covers its descendants) 

24""" 

25 

26from __future__ import annotations 

27 

28import hashlib 

29import uuid 

30from dataclasses import dataclass, field 

31from datetime import datetime, timezone 

32from typing import Literal 

33 

# Closed set of labels recording how a node summary was produced:
#   "raw"    - the summary is the node text itself (no LLM call)
#   "llm"    - the summary was generated by an LLM
#   "prefix" - internal-node prefix-summary covering its descendants
SummaryKind = Literal["raw", "llm", "prefix"]

35 

36 

37# ─── NodeSummary ────────────────────────────────────────────────────── 

38 

39 

@dataclass
class NodeSummary:
    """Summary text for a node, tagged with the way it was produced.

    This is its own type so that a summarizer can hand back ``None``
    for nodes that need no summarization while ``TreeNode.summary``
    stays precisely typed.

    Note: the class defines ``__len__`` and no ``__bool__``, so an
    instance whose ``text`` is empty evaluates as falsy. Use
    ``summary is not None`` when testing for presence.
    """

    # The summary itself.
    text: str
    # How the summary was produced; defaults to "raw".
    kind: SummaryKind = "raw"
    # Optional token count for ``text``; ``None`` when not recorded.
    token_count: int | None = None

    def __len__(self) -> int:
        """Return the number of characters in the summary text."""
        char_count = len(self.text)
        return char_count

55 

56 

57# ─── TreeNode ───────────────────────────────────────────────────────── 

58 

59 

@dataclass
class TreeNode:
    """One node of a DocumentTree.

    ``id`` and ``parent_id`` hold UUIDs in string form, which lets a
    tree be serialized to JSON and read back without losing the
    references between nodes. ``depth`` counts from 1 (a root has
    depth=1, mirroring a markdown ``# heading``).
    """

    id: str
    parent_id: str | None
    depth: int  # 1..6 for markdown (h1..h6)
    title: str
    text: str = ""
    summary: NodeSummary | None = None
    children: list[TreeNode] = field(default_factory=list)
    # Optional source-location hints (line numbers in the original doc).
    line_start: int | None = None
    line_end: int | None = None

    @classmethod
    def new(cls, *, parent_id: str | None, depth: int, title: str, **kwargs: object) -> TreeNode:
        """Build a TreeNode whose ``id`` is a freshly generated UUID4 string.

        Any extra keyword arguments are forwarded unchanged to the
        dataclass constructor (e.g. ``text``, ``summary``).

        Raises:
            ValueError: if ``depth`` lies outside the range 1..6.
        """
        if depth < 1 or depth > 6:
            raise ValueError(f"depth must be in 1..6, got {depth}")
        # The id is generated only after validation has passed.
        identity = {
            "id": str(uuid.uuid4()),
            "parent_id": parent_id,
            "depth": depth,
            "title": title,
        }
        return cls(**identity, **kwargs)  # type: ignore[arg-type]

    def add_child(self, child: TreeNode) -> None:
        """Attach ``child`` under this node.

        The child's ``parent_id`` is overwritten with this node's id so
        the two stay consistent; the child's ``depth`` is not touched.
        """
        child.parent_id = self.id
        self.children.append(child)

    def traverse_pre(self) -> list[TreeNode]:
        """Return this node and all its descendants as a flat pre-order list.

        Each node appears before its children, and children are visited
        in list order.
        """
        descendants = [
            node
            for child in self.children
            for node in child.traverse_pre()
        ]
        return [self, *descendants]

    def is_leaf(self) -> bool:
        """Return True when this node has no children."""
        has_children = bool(self.children)
        return not has_children

107 

108 

109# ─── DocumentTree ───────────────────────────────────────────────────── 

110 

111 

@dataclass
class DocumentTree:
    """Hierarchical structure belonging to a single document.

    A tree holds one or more root nodes (markdown may have several
    top-level headings; a PDF typically has one). ``document_id``
    links the tree back to its source Document in storage.
    """

    document_id: str
    roots: list[TreeNode] = field(default_factory=list)

    def all_nodes(self) -> list[TreeNode]:
        """Return every node in pre-order, root by root, as one flat list."""
        return [
            node
            for root in self.roots
            for node in root.traverse_pre()
        ]

    def node_count(self) -> int:
        """Return the total number of nodes across all roots."""
        flattened = self.all_nodes()
        return len(flattened)

    def find(self, node_id: str) -> TreeNode | None:
        """Return the first node (in pre-order) whose id equals ``node_id``.

        Returns ``None`` when no node matches.
        """
        matches = (node for node in self.all_nodes() if node.id == node_id)
        return next(matches, None)

139 

140 

141# ─── Document ───────────────────────────────────────────────────────── 

142 

143 

@dataclass
class Document:
    """Top-level container: document metadata plus an optional parsed tree.

    A Document may exist with no tree (the raw source is held but has
    not been parsed yet); a tree always belongs to a Document.

    ``content_hash`` is a SHA-256 of the raw bytes/text. It is meant
    for storage to dedupe identical re-uploads and for builders to
    cache summarization results.
    """

    id: str
    source_uri: str = ""  # file path, URL, "inline://", etc.
    content_hash: str = ""
    mime_type: str = "text/markdown"
    title: str = ""
    # Defaults to the timezone-aware (UTC) moment of construction.
    created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    tree: DocumentTree | None = None

    @classmethod
    def new(
        cls,
        *,
        source_uri: str = "",
        content: str | bytes = "",
        mime_type: str = "text/markdown",
        title: str = "",
    ) -> Document:
        """Build a Document with a fresh UUID4 id and a computed content hash.

        ``content`` given as ``str`` is UTF-8 encoded before hashing;
        anything else is hashed as-is. The content itself is not stored
        on the returned Document, only its SHA-256 hex digest. ``tree``
        is left as ``None``.
        """
        raw = content.encode("utf-8") if isinstance(content, str) else content
        digest = hashlib.sha256(raw).hexdigest()
        return cls(
            id=str(uuid.uuid4()),
            source_uri=source_uri,
            content_hash=digest,
            mime_type=mime_type,
            title=title,
        )