Coverage for astrocyte/documents/retrieval/retriever.py: 28%
39 statements
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
1"""DocumentRetriever — low-level read access to the document tree.
3Three PageIndex-parity tools:
4 get_document_info(doc_id) → DocumentInfo
5 get_document_structure(doc_id) → TreeSkeleton (no text)
6 get_node_content(doc_id, node_id) → NodeContent (text + children)
8No LLM. No agent loop. Directly wraps DocumentStore.
9Fully testable without any external service or LLM mock.
10"""
12from __future__ import annotations
14from astrocyte.documents.retrieval.types import (
15 DocumentInfo,
16 NodeContent,
17 SkeletonNode,
18 TreeSkeleton,
19)
20from astrocyte.documents.storage import DocumentNotFoundError, DocumentStore
21from astrocyte.documents.types import TreeNode
def _to_skeleton_node(node: TreeNode) -> SkeletonNode:
    """Build the text-free ``SkeletonNode`` view of a single tree node.

    Copies the node's identity, position and title, flattens the optional
    summary object down to its ``text``, and reports the children only as a
    flag plus a count — the children themselves are not converted here.

    Args:
        node: The tree node to project.

    Returns:
        A ``SkeletonNode`` describing ``node`` without its body text.
    """
    summary = node.summary
    children = node.children

    # Truthiness (not an ``is None`` test) decides whether a summary is used.
    summary_text = summary.text if summary else None

    fields = {
        "node_id": node.id,
        "parent_id": node.parent_id,
        "depth": node.depth,
        "title": node.title,
        "summary": summary_text,
        "has_children": bool(children),
        "child_count": len(children),
        # ``page_start`` is read defensively and falls back to None when the
        # node carries no such attribute; ``line_start`` is read directly.
        "page_start": getattr(node, "page_start", None),
        "line_start": node.line_start,
    }
    return SkeletonNode(**fields)
class DocumentRetriever:
    """Read-only access to stored documents and their node trees.

    Exposes the three PageIndex-parity tools (``get_document_info``,
    ``get_document_structure``, ``get_node_content``) plus a depth filter
    (``get_nodes_at_depth``). This is the low-level layer beneath
    DocumentNavigator; consumers can also wire these tools directly into
    their own agent loop via make_retrieval_tools().

    Every method is a thin async wrapper over the ``DocumentStore`` passed
    to the constructor.
    """

    def __init__(self, store: DocumentStore) -> None:
        """Keep a reference to the store that all lookups are delegated to."""
        self._store = store

    async def _require_document(self, doc_id: str):
        """Fetch the document record, raising if the store returns None."""
        record = await self._store.get_document(doc_id)
        if record is None:
            raise DocumentNotFoundError(doc_id)
        return record

    async def _require_tree(self, doc_id: str):
        """Fetch the document tree, raising if the store returns None."""
        tree = await self._store.get_tree(doc_id)
        if tree is None:
            raise DocumentNotFoundError(doc_id)
        return tree

    async def get_document_info(self, doc_id: str) -> DocumentInfo:
        """Return document metadata together with node-count and depth range.

        No node text is returned. The tree is still fetched from the store,
        because the node count and the min/max depth are computed from it.
        A missing or falsy tree yields ``node_count=0`` and a depth range
        of ``0..0``.

        Raises:
            DocumentNotFoundError: If the store has no document for
                ``doc_id``.
        """
        record = await self._require_document(doc_id)
        tree = await self._store.get_tree(doc_id)

        # Truthiness check on the tree, matching the empty-list fallback.
        if tree:
            entries = tree.all_nodes()
        else:
            entries = []

        if entries:
            depth_values = [entry.depth for entry in entries]
        else:
            # Keeps min()/max() below well-defined for an empty tree.
            depth_values = [0]

        return DocumentInfo(
            document_id=record.id,
            title=record.title,
            source_uri=record.source_uri,
            node_count=len(entries),
            depth_min=min(depth_values),
            depth_max=max(depth_values),
            created_at=record.created_at,
        )

    async def get_document_structure(self, doc_id: str) -> TreeSkeleton:
        """Return the whole tree as skeleton nodes, without node text.

        This is the surface an LLM reasons over: it reads the skeleton
        titles and summaries to decide which nodes to retrieve. Nodes appear
        in the order produced by ``tree.all_nodes()``. When the document
        exists but the store has no tree for it, a skeleton with
        ``node_count=0`` is returned.

        Raises:
            DocumentNotFoundError: If the store has no document for
                ``doc_id``.
        """
        record = await self._require_document(doc_id)
        tree = await self._store.get_tree(doc_id)
        if tree is None:
            return TreeSkeleton(document_id=doc_id, title=record.title, node_count=0)

        entries = tree.all_nodes()
        skeleton_nodes = [_to_skeleton_node(entry) for entry in entries]
        return TreeSkeleton(
            document_id=doc_id,
            title=record.title,
            node_count=len(entries),
            nodes=skeleton_nodes,
        )

    async def get_node_content(self, doc_id: str, node_id: str) -> NodeContent:
        """Return one node's full text plus skeletons of its direct children.

        Only the tree is requested from the store, so a ``doc_id`` for which
        the store returns no tree is reported as not found.

        Raises:
            DocumentNotFoundError: If the store returns no tree for
                ``doc_id``.
            KeyError: If ``node_id`` is not found in the tree.
        """
        tree = await self._require_tree(doc_id)
        target = tree.find(node_id)
        if target is None:
            raise KeyError(f"node_id={node_id!r} not found in document {doc_id!r}")

        summary = target.summary
        if summary:
            summary_text, summary_kind = summary.text, summary.kind
        else:
            summary_text = summary_kind = None

        child_skeletons = [_to_skeleton_node(child) for child in target.children]
        return NodeContent(
            node_id=target.id,
            document_id=doc_id,
            title=target.title,
            depth=target.depth,
            parent_id=target.parent_id,
            text=target.text,
            summary=summary_text,
            summary_kind=summary_kind,
            children=child_skeletons,
            # Page bounds are read defensively and default to None when the
            # node has no such attributes; line bounds are read directly.
            page_start=getattr(target, "page_start", None),
            page_end=getattr(target, "page_end", None),
            line_start=target.line_start,
            line_end=target.line_end,
        )

    async def get_nodes_at_depth(self, doc_id: str, depth: int) -> list[SkeletonNode]:
        """Return skeletons of every node whose depth equals ``depth``.

        For example, ``depth=2`` selects all H2 sections.

        Raises:
            DocumentNotFoundError: If the store returns no tree for
                ``doc_id``.
        """
        tree = await self._require_tree(doc_id)
        at_level = (entry for entry in tree.all_nodes() if entry.depth == depth)
        return [_to_skeleton_node(entry) for entry in at_level]