Coverage for astrocyte/documents/types.py: 100%

1"""Document Engine data types — DocumentTree, TreeNode, NodeSummary, Document.

3PageIndex-inspired hierarchical document representation. Plain dataclasses;

4no DB coupling, no Memory Engine knowledge. Persisted via the

5``DocumentStore`` SPI in Phase 2.

7A ``Document`` is the top-level container. It has metadata (id, source

8URI, content hash) and optionally a parsed ``DocumentTree``. The tree

9contains nodes with parent/child relationships derived from document

10structure (heading levels for markdown; TOC for PDFs).

12Each ``TreeNode`` carries:

13 - identity (id, parent_id, depth)

14 - title (the heading text)

15 - text (the body of the node, including the heading line)

16 - optional summary + summary_kind (raw text vs LLM-generated)

17 - children (list of TreeNode references — convenience, may be empty

18 if loaded flat from storage)

20Summary kinds (PageIndex parity):

21 - "raw" → summary IS the node text (small node, no LLM call)

22 - "llm" → summary was generated by an LLM (large node)

23 - "prefix" → internal-node prefix-summary (covers its descendants)

24"""

26from __future__ import annotations

28import hashlib

29import uuid

30from dataclasses import dataclass, field

31from datetime import datetime, timezone

32from typing import Literal

34SummaryKind = Literal["raw", "llm", "prefix"]

37# ─── NodeSummary ──────────────────────────────────────────────────────

@dataclass
class NodeSummary:
    """Summary text for a node, tagged with how that text was produced.

    This is its own type so a summarizer can hand back ``None`` for nodes
    that need no summary, while ``TreeNode.summary`` stays precisely typed.

    ``len(summary)`` is the character length of ``text``. Because only
    ``__len__`` is defined (no ``__bool__``), an instance whose ``text`` is
    empty evaluates as falsy; compare against ``None`` explicitly when the
    question is "is there a summary object at all?".
    """

    # The summary body.
    text: str
    # How the summary was produced: "raw", "llm" or "prefix".
    kind: SummaryKind = "raw"
    # Optional token count; ``None`` when not supplied.
    token_count: int | None = None

    def __len__(self) -> int:
        """Return the number of characters in ``text``."""
        summary_text = self.text
        return len(summary_text)

57# ─── TreeNode ─────────────────────────────────────────────────────────

@dataclass
class TreeNode:
    """One node of a DocumentTree.

    ``id`` and ``parent_id`` are UUIDs kept as strings, so a tree can be
    round-tripped through JSON without losing its references. ``depth``
    counts from 1 (a root has depth 1, matching a markdown ``# heading``).
    """

    id: str
    parent_id: str | None
    depth: int  # 1..6 for markdown (h1..h6)
    title: str
    text: str = ""
    summary: NodeSummary | None = None
    children: list[TreeNode] = field(default_factory=list)
    # Optional source-location hints (line numbers in the original doc).
    line_start: int | None = None
    line_end: int | None = None

    @classmethod
    def new(cls, *, parent_id: str | None, depth: int, title: str, **kwargs: object) -> TreeNode:
        """Build a TreeNode whose ``id`` is a freshly generated UUID4 string.

        Any extra keyword arguments are forwarded unchanged to the dataclass
        constructor (e.g. ``text``, ``summary``, ``line_start``).

        Raises:
            ValueError: if ``depth`` is below 1 or above 6.
        """
        out_of_range = depth < 1 or depth > 6
        if out_of_range:
            raise ValueError(f"depth must be in 1..6, got {depth}")
        fresh_id = str(uuid.uuid4())
        return cls(  # type: ignore[arg-type]
            id=fresh_id,
            parent_id=parent_id,
            depth=depth,
            title=title,
            **kwargs,
        )

    def add_child(self, child: TreeNode) -> None:
        """Attach ``child`` under this node.

        The child's ``parent_id`` is overwritten with this node's ``id`` so
        the two stay consistent, then the child is appended to ``children``.
        """
        owner_id = self.id
        child.parent_id = owner_id
        self.children.append(child)

    def traverse_pre(self) -> list[TreeNode]:
        """Return this node followed by all descendants, in pre-order."""
        descendants = [
            node
            for kid in self.children
            for node in kid.traverse_pre()
        ]
        return [self] + descendants

    def is_leaf(self) -> bool:
        """Return True when this node has no children."""
        has_children = bool(self.children)
        return not has_children

107

108

109# ─── DocumentTree ─────────────────────────────────────────────────────

110

111

@dataclass
class DocumentTree:
    """Hierarchical structure of a single document.

    A tree holds one or more root nodes (markdown can have several top-level
    headings; a PDF typically has one). ``document_id`` links the tree back
    to its source Document in storage.
    """

    document_id: str
    roots: list[TreeNode] = field(default_factory=list)

    def all_nodes(self) -> list[TreeNode]:
        """Return every node, pre-order, walking the roots in list order."""
        return [
            node
            for root in self.roots
            for node in root.traverse_pre()
        ]

    def node_count(self) -> int:
        """Return the total number of nodes across all roots."""
        nodes = self.all_nodes()
        return len(nodes)

    def find(self, node_id: str) -> TreeNode | None:
        """Return the first node (pre-order) whose ``id`` equals ``node_id``.

        Returns ``None`` when no node matches.
        """
        matches = (node for node in self.all_nodes() if node.id == node_id)
        return next(matches, None)

139

140

141# ─── Document ─────────────────────────────────────────────────────────

142

143

@dataclass
class Document:
    """Top-level container: document metadata plus an optional parsed tree.

    A Document may exist with no tree (raw source held, not yet parsed),
    whereas a tree always belongs to some Document.

    ``content_hash`` is the SHA-256 of the raw bytes/text. Storage uses it
    to dedupe identical re-uploads and builders use it to cache
    summarization results.
    """

    id: str
    source_uri: str = ""  # file path, URL, "inline://", etc.
    content_hash: str = ""
    mime_type: str = "text/markdown"
    title: str = ""
    # Timezone-aware UTC timestamp taken when the instance is constructed.
    created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    tree: DocumentTree | None = None

    @classmethod
    def new(
        cls,
        *,
        source_uri: str = "",
        content: str | bytes = "",
        mime_type: str = "text/markdown",
        title: str = "",
    ) -> Document:
        """Build a Document with a fresh UUID4 ``id`` and a computed hash.

        ``content`` is hashed with SHA-256; text is encoded as UTF-8 first,
        bytes are hashed as given. The content itself is not stored on the
        returned Document, and ``tree`` is left as ``None``.
        """
        # Normalize to bytes so str and bytes inputs hash identically.
        raw = content.encode("utf-8") if isinstance(content, str) else content
        doc_id = str(uuid.uuid4())
        digest = hashlib.sha256(raw).hexdigest()
        return cls(
            id=doc_id,
            source_uri=source_uri,
            content_hash=digest,
            mime_type=mime_type,
            title=title,
        )