Coverage for astrocyte/documents/retrieval/types.py: 100%

61 statements  

« prev     ^ index     » next       coverage.py v7.15.0, created at 2026-07-04 05:24 +0000

"""Retrieval types for the Document Engine tree-search path.

These types form the contract between DocumentRetriever (low-level reads)
and DocumentNavigator (agent loop). They are distinct from DocumentTree /
TreeNode — they are query-time views, not storage representations.

Key design decisions:
- TreeSkeleton never contains node text — prevents accidental large-text
  dumps to the LLM reasoning step.
- SectionHit carries a breadcrumb so callers have structural context
  without having to re-traverse the tree.
- DocumentSearchResult is strategy-tagged so callers can log which
  path produced each result.
"""

15 

16from __future__ import annotations 

17 

18from dataclasses import dataclass, field 

19from datetime import datetime 

20from typing import Literal 

21 

22 

@dataclass
class DocumentInfo:
    """Lightweight document metadata — no tree, no text.

    All fields are required except ``created_at``, which defaults to
    ``None`` when the creation time is not supplied.
    """

    document_id: str
    title: str
    source_uri: str
    node_count: int
    depth_min: int
    depth_max: int
    created_at: datetime | None = None

34 

35 

@dataclass
class SkeletonNode:
    """A tree node stripped of text — for efficient LLM tree reasoning.

    Token budget: ~30-50 tokens per node (title + summary fragment).
    A 100-node document → ~3-5k tokens to transmit the full skeleton.

    ``page_start`` and ``line_start`` are optional and default to ``None``;
    all other fields must be provided.
    """

    node_id: str
    parent_id: str | None
    depth: int
    title: str
    summary: str | None  # node.summary.text if available, else None
    has_children: bool
    child_count: int
    page_start: int | None = None  # for PDFs (1-indexed); None for markdown
    line_start: int | None = None  # for markdown (1-indexed); None for PDFs

53 

54 

@dataclass
class TreeSkeleton:
    """Full tree structure without node text. Pre-order flat list.

    The reasoning surface: the LLM reads titles + summaries to decide
    which nodes to fetch via get_node_content().

    ``nodes`` defaults to a new empty list per instance (via
    ``default_factory``), so instances never share the same list object.
    """

    document_id: str
    title: str
    node_count: int
    nodes: list[SkeletonNode] = field(default_factory=list)

67 

68 

@dataclass
class NodeContent:
    """A single node's full content + its immediate children (skeleton form).

    ``children`` are skeleton-only (no text) — the LLM can decide whether
    to drill deeper without paying the cost of fetching all child text.

    ``children`` defaults to a new empty list per instance; the four
    page/line position fields default to ``None``.
    """

    node_id: str
    document_id: str
    title: str
    depth: int
    parent_id: str | None
    text: str
    summary: str | None
    summary_kind: str | None  # "raw" | "llm" | "prefix"
    children: list[SkeletonNode] = field(default_factory=list)
    page_start: int | None = None
    page_end: int | None = None
    line_start: int | None = None
    line_end: int | None = None

90 

91 

@dataclass
class SectionHit:
    """One relevant section found by the DocumentNavigator.

    Every field is required; none has a default value.
    """

    document_id: str
    node_id: str
    node_title: str
    node_depth: int
    breadcrumb: list[str]  # ancestor titles root→parent, e.g. ["Chapter 3", "3.2"]
    text: str
    relevance_reasoning: str  # LLM's explanation of why this section is relevant

103 

104 

@dataclass
class DocumentSearchResult:
    """Aggregated result from DocumentNavigator.search().

    ``strategy`` tags the result with the path that produced it; the only
    value declared here is ``"tree_search"``, which is also the default.
    """

    query: str
    sections: list[SectionHit]
    documents_searched: int
    iterations_used: int
    strategy: Literal["tree_search"] = "tree_search"

112 iterations_used: int 

113 strategy: Literal["tree_search"] = "tree_search"