Coverage for astrocyte/documents/retrieval/retriever.py: 28%

39 statements  

« prev     ^ index     » next       coverage.py v7.15.0, created at 2026-07-04 05:24 +0000

1"""DocumentRetriever — low-level read access to the document tree. 

2 

3Three PageIndex-parity tools: 

4 get_document_info(doc_id) → DocumentInfo 

5 get_document_structure(doc_id) → TreeSkeleton (no text) 

6 get_node_content(doc_id, node_id) → NodeContent (text + children) 

7 

8No LLM. No agent loop. Directly wraps DocumentStore. 

9Fully testable without any external service or LLM mock. 

10""" 

11 

12from __future__ import annotations 

13 

14from astrocyte.documents.retrieval.types import ( 

15 DocumentInfo, 

16 NodeContent, 

17 SkeletonNode, 

18 TreeSkeleton, 

19) 

20from astrocyte.documents.storage import DocumentNotFoundError, DocumentStore 

21from astrocyte.documents.types import TreeNode 

22 

23 

def _to_skeleton_node(node: TreeNode) -> SkeletonNode:
    """Build the text-free SkeletonNode view of a single tree node.

    Copies the node's identity, position and title, the summary text when a
    summary is present (otherwise None), and child statistics derived from
    ``node.children``. The node's own text is not copied.
    """
    digest = node.summary
    offspring = node.children

    if digest:
        summary_text = digest.text
    else:
        summary_text = None

    return SkeletonNode(
        node_id=node.id,
        parent_id=node.parent_id,
        depth=node.depth,
        title=node.title,
        summary=summary_text,
        has_children=bool(offspring),
        child_count=len(offspring),
        # page_start is read defensively: nodes lacking the attribute get None.
        page_start=getattr(node, "page_start", None),
        line_start=node.line_start,
    )

36 

37 

class DocumentRetriever:
    """Read-only access to stored documents and their node trees.

    Every method is a thin async wrapper over the DocumentStore passed to the
    constructor; only ``get_document`` and ``get_tree`` are called on it.
    Per the module's own description this is the low-level layer beneath
    DocumentNavigator, and its tools can also be wired into a caller's agent
    loop via make_retrieval_tools().
    """

    def __init__(self, store: DocumentStore) -> None:
        """Keep a reference to the backing store; no I/O happens here."""
        self._store = store

    async def get_document_info(self, doc_id: str) -> DocumentInfo:
        """Return document metadata plus node count and depth range.

        The tree is fetched from the store to compute ``node_count``,
        ``depth_min`` and ``depth_max``; node text is not included in the
        result. When there is no tree (or it has no nodes) the count is 0
        and both depth bounds are 0.

        Raises DocumentNotFoundError if the store has no such document.
        """
        record = await self._store.get_document(doc_id)
        if record is None:
            raise DocumentNotFoundError(doc_id)

        tree = await self._store.get_tree(doc_id)
        if tree:
            members = tree.all_nodes()
        else:
            members = []

        # Fall back to a single 0 so min()/max() are defined for empty trees.
        if members:
            levels = [member.depth for member in members]
        else:
            levels = [0]

        return DocumentInfo(
            document_id=record.id,
            title=record.title,
            source_uri=record.source_uri,
            node_count=len(members),
            depth_min=min(levels),
            depth_max=max(levels),
            created_at=record.created_at,
        )

    async def get_document_structure(self, doc_id: str) -> TreeSkeleton:
        """Return the whole tree as skeleton nodes, without node text.

        Each node is reduced to its SkeletonNode form (title, summary text,
        child statistics, position). A document that exists but has no tree
        yields a skeleton with ``node_count=0``.

        Raises DocumentNotFoundError if the store has no such document.
        """
        record = await self._store.get_document(doc_id)
        if record is None:
            raise DocumentNotFoundError(doc_id)

        tree = await self._store.get_tree(doc_id)
        if tree is None:
            return TreeSkeleton(document_id=doc_id, title=record.title, node_count=0)

        members = tree.all_nodes()
        skeleton_nodes = [_to_skeleton_node(member) for member in members]
        return TreeSkeleton(
            document_id=doc_id,
            title=record.title,
            node_count=len(members),
            nodes=skeleton_nodes,
        )

    async def get_node_content(self, doc_id: str, node_id: str) -> NodeContent:
        """Return one node's full text together with its direct children.

        Children are returned in skeleton form only (no text).

        Raises DocumentNotFoundError if the store returns no tree for doc_id.
        Raises KeyError if node_id is not found in the tree.
        """
        tree = await self._store.get_tree(doc_id)
        if tree is None:
            raise DocumentNotFoundError(doc_id)

        target = tree.find(node_id)
        if target is None:
            raise KeyError(f"node_id={node_id!r} not found in document {doc_id!r}")

        digest = target.summary
        if digest:
            summary_text = digest.text
            summary_kind = digest.kind
        else:
            summary_text = None
            summary_kind = None

        child_skeletons = [_to_skeleton_node(child) for child in target.children]
        return NodeContent(
            node_id=target.id,
            document_id=doc_id,
            title=target.title,
            depth=target.depth,
            parent_id=target.parent_id,
            text=target.text,
            summary=summary_text,
            summary_kind=summary_kind,
            children=child_skeletons,
            # Page bounds are read defensively: absent attributes become None.
            page_start=getattr(target, "page_start", None),
            page_end=getattr(target, "page_end", None),
            line_start=target.line_start,
            line_end=target.line_end,
        )

    async def get_nodes_at_depth(self, doc_id: str, depth: int) -> list[SkeletonNode]:
        """Return skeleton nodes for every node whose depth equals ``depth``.

        Results follow the order produced by the tree's ``all_nodes()``.

        Raises DocumentNotFoundError if the store returns no tree for doc_id.
        """
        tree = await self._store.get_tree(doc_id)
        if tree is None:
            raise DocumentNotFoundError(doc_id)

        matches = []
        for member in tree.all_nodes():
            if member.depth == depth:
                matches.append(_to_skeleton_node(member))
        return matches