Coverage for astrocyte/documents/types.py: 100%
71 statements
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
1"""Document Engine data types — DocumentTree, TreeNode, NodeSummary, Document.
3PageIndex-inspired hierarchical document representation. Plain dataclasses;
4no DB coupling, no Memory Engine knowledge. Persisted via the
5``DocumentStore`` SPI in Phase 2.
7A ``Document`` is the top-level container. It has metadata (id, source
8URI, content hash) and optionally a parsed ``DocumentTree``. The tree
9contains nodes with parent/child relationships derived from document
10structure (heading levels for markdown; TOC for PDFs).
12Each ``TreeNode`` carries:
13 - identity (id, parent_id, depth)
14 - title (the heading text)
15 - text (the body of the node, including the heading line)
16 - optional summary + summary_kind (raw text vs LLM-generated)
17 - children (list of TreeNode references — convenience, may be empty
18 if loaded flat from storage)
20Summary kinds (PageIndex parity):
21 - "raw" → summary IS the node text (small node, no LLM call)
22 - "llm" → summary was generated by an LLM (large node)
23 - "prefix" → internal-node prefix-summary (covers its descendants)
24"""
26from __future__ import annotations
28import hashlib
29import uuid
30from dataclasses import dataclass, field
31from datetime import datetime, timezone
32from typing import Literal
34SummaryKind = Literal["raw", "llm", "prefix"]
37# ─── NodeSummary ──────────────────────────────────────────────────────
@dataclass
class NodeSummary:
    """Summary text for a node, tagged with the way it was produced.

    This is its own type (rather than a bare string) so that a
    summarizer may return ``None`` when a node needs no summary, while
    ``TreeNode.summary`` stays precisely typed.

    Note: because ``__len__`` is defined, an instance whose ``text`` is
    empty is falsy. Use ``summary is not None`` to test for presence.
    """

    # The summary itself.
    text: str
    # How the summary was obtained: "raw", "llm" or "prefix".
    kind: SummaryKind = "raw"
    # Token count of ``text`` when known; ``None`` otherwise.
    token_count: int | None = None

    def __len__(self) -> int:
        """Return the number of characters in the summary text."""
        character_count = len(self.text)
        return character_count
57# ─── TreeNode ─────────────────────────────────────────────────────────
@dataclass
class TreeNode:
    """A single node in a DocumentTree.

    ``id`` and ``parent_id`` are strings (``new`` assigns UUID4 values)
    so trees can be serialized / round-tripped through JSON without
    losing references. ``depth`` is 1-indexed (root depth=1, like
    markdown ``# heading``).
    """

    id: str
    parent_id: str | None
    depth: int  # 1..6 for markdown (h1..h6)
    title: str
    text: str = ""
    summary: NodeSummary | None = None
    children: list[TreeNode] = field(default_factory=list)
    # Optional source-location hints (line numbers in the original doc).
    line_start: int | None = None
    line_end: int | None = None

    @classmethod
    def new(cls, *, parent_id: str | None, depth: int, title: str, **kwargs: object) -> TreeNode:
        """Construct a TreeNode with a fresh UUID id.

        Args:
            parent_id: Id of the parent node, or ``None`` for a root.
            depth: Heading level; must be within 1..6.
            title: Heading text of the node.
            **kwargs: Any remaining dataclass fields (``text``,
                ``summary``, ``children``, ``line_start``, ``line_end``),
                forwarded unchanged to the constructor.

        Returns:
            The newly created node.

        Raises:
            ValueError: If ``depth`` is outside 1..6.
        """
        if depth < 1 or depth > 6:
            raise ValueError(f"depth must be in 1..6, got {depth}")
        return cls(
            id=str(uuid.uuid4()),
            parent_id=parent_id,
            depth=depth,
            title=title,
            **kwargs,  # type: ignore[arg-type]
        )

    def add_child(self, child: TreeNode) -> None:
        """Append a child node, updating its parent_id for consistency."""
        child.parent_id = self.id
        self.children.append(child)

    def traverse_pre(self) -> list[TreeNode]:
        """Pre-order traversal — self before children. Returns flat list.

        Implemented with an explicit stack rather than recursion: the
        plain constructor does not bound ``depth``, so a deeply nested
        tree must not be able to exhaust the interpreter's recursion
        limit, and each node is appended exactly once instead of being
        re-copied at every ancestor level.
        """
        out: list[TreeNode] = []
        pending: list[TreeNode] = [self]
        while pending:
            node = pending.pop()
            out.append(node)
            # Push children reversed so the first child is popped next,
            # which preserves left-to-right pre-order.
            pending.extend(reversed(node.children))
        return out

    def is_leaf(self) -> bool:
        """Return True when the node has no children."""
        return not self.children
109# ─── DocumentTree ─────────────────────────────────────────────────────
@dataclass
class DocumentTree:
    """Hierarchical outline belonging to a single document.

    There may be several roots (markdown can have more than one
    top-level heading; a PDF typically has one). ``document_id`` links
    the tree back to its source Document in storage.
    """

    document_id: str
    roots: list[TreeNode] = field(default_factory=list)

    def all_nodes(self) -> list[TreeNode]:
        """Return every node, pre-order, root by root in ``roots`` order."""
        return [node for root in self.roots for node in root.traverse_pre()]

    def node_count(self) -> int:
        """Return the total number of nodes across all roots."""
        flattened = self.all_nodes()
        return len(flattened)

    def find(self, node_id: str) -> TreeNode | None:
        """Return the first node whose ``id`` equals ``node_id``, else ``None``."""
        matches = (node for node in self.all_nodes() if node.id == node_id)
        return next(matches, None)
141# ─── Document ─────────────────────────────────────────────────────────
@dataclass
class Document:
    """Top-level record: document metadata plus an optional parsed tree.

    A Document may exist with ``tree`` unset (source held but not yet
    parsed); a tree always belongs to some Document.

    ``content_hash`` is the SHA-256 hex digest of the raw bytes/text. It
    is used by storage to dedupe identical re-uploads and by builders
    to cache summarization results.
    """

    id: str
    source_uri: str = ""  # file path, URL, "inline://", etc.
    content_hash: str = ""
    mime_type: str = "text/markdown"
    title: str = ""
    created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    tree: DocumentTree | None = None

    @classmethod
    def new(
        cls,
        *,
        source_uri: str = "",
        content: str | bytes = "",
        mime_type: str = "text/markdown",
        title: str = "",
    ) -> Document:
        """Build a Document with a freshly generated id and content hash.

        ``content`` given as ``str`` is encoded as UTF-8 before hashing;
        any other value is passed to the hash function as-is. The
        content itself is not stored on the returned Document.
        """
        raw = content.encode("utf-8") if isinstance(content, str) else content
        digest = hashlib.sha256(raw).hexdigest()
        fresh_id = str(uuid.uuid4())
        return cls(
            id=fresh_id,
            source_uri=source_uri,
            content_hash=digest,
            mime_type=mime_type,
            title=title,
        )