Coverage for astrocyte/documents/retrieval/types.py: 100%
61 statements
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
"""Retrieval types for the Document Engine tree-search path.

These types form the contract between DocumentRetriever (low-level reads)
and DocumentNavigator (agent loop). They are distinct from DocumentTree /
TreeNode — they are query-time views, not storage representations.

Key design decisions:
- TreeSkeleton never contains node text — prevents accidental large-text
  dumps to the LLM reasoning step.
- SectionHit carries a breadcrumb so callers have structural context
  without having to re-traverse the tree.
- DocumentSearchResult is strategy-tagged so callers can log which
  path produced each result.
"""
16from __future__ import annotations
18from dataclasses import dataclass, field
19from datetime import datetime
20from typing import Literal
@dataclass
class DocumentInfo:
    """Document-level metadata only: carries neither the tree nor any text.

    Field order is the positional constructor order. Only ``created_at``
    has a default (``None``); every other field must be supplied.
    """

    # Identity and provenance.
    document_id: str
    title: str
    source_uri: str

    # Size and shape of the document (presumably of its node tree —
    # confirm against the producer of this record).
    node_count: int
    depth_min: int
    depth_max: int

    # Optional timestamp; stays None when the caller does not provide one.
    created_at: datetime | None = None
@dataclass
class SkeletonNode:
    """Text-free view of a tree node, used for efficient LLM tree reasoning.

    Token budget: roughly 30-50 tokens per node (title plus a summary
    fragment), so a 100-node document costs about 3-5k tokens to send as
    a full skeleton.

    Attributes:
        node_id: Identifier of this node.
        parent_id: Identifier of the parent node, or ``None``.
        depth: Depth of the node.
        title: Node title.
        summary: ``node.summary.text`` if available, else ``None``.
        has_children: Whether the node has children.
        child_count: Number of children.
        page_start: For PDFs (1-indexed); ``None`` for markdown.
        line_start: For markdown (1-indexed); ``None`` for PDFs.
    """

    node_id: str
    parent_id: str | None
    depth: int
    title: str
    summary: str | None
    has_children: bool
    child_count: int
    page_start: int | None = None
    line_start: int | None = None
@dataclass
class TreeSkeleton:
    """Whole-document tree structure with node text left out.

    ``nodes`` is a flat list in pre-order. This is the reasoning surface:
    the LLM reads titles and summaries here to decide which nodes to
    fetch via get_node_content().

    Attributes:
        document_id: Identifier of the document.
        title: Document title.
        node_count: Node count supplied by the caller; it is stored as
            given and not derived from ``nodes``.
        nodes: Skeleton nodes; each instance gets its own empty list when
            none is passed.
    """

    document_id: str
    title: str
    node_count: int
    nodes: list[SkeletonNode] = field(default_factory=list)
@dataclass
class NodeContent:
    """Full content of one node together with its immediate children.

    ``children`` hold skeleton entries only (no text), so the LLM can
    decide whether to drill deeper without paying the cost of fetching
    all child text.

    Attributes:
        node_id: Identifier of this node.
        document_id: Identifier of the owning document.
        title: Node title.
        depth: Depth of the node.
        parent_id: Identifier of the parent node, or ``None``.
        text: The node's text.
        summary: Node summary, or ``None``.
        summary_kind: ``"raw"`` | ``"llm"`` | ``"prefix"``, or ``None``.
        children: Skeleton-form children; a fresh empty list by default.
        page_start, page_end: Optional page span; default ``None``.
        line_start, line_end: Optional line span; default ``None``.
    """

    # Required fields, in positional constructor order.
    node_id: str
    document_id: str
    title: str
    depth: int
    parent_id: str | None
    text: str
    summary: str | None
    summary_kind: str | None

    # Optional fields.
    children: list[SkeletonNode] = field(default_factory=list)
    page_start: int | None = None
    page_end: int | None = None
    line_start: int | None = None
    line_end: int | None = None
@dataclass
class SectionHit:
    """A single relevant section located by the DocumentNavigator.

    Every field is required; none has a default.

    Attributes:
        document_id: Identifier of the document containing the section.
        node_id: Identifier of the matching node.
        node_title: Title of the matching node.
        node_depth: Depth of the matching node.
        breadcrumb: Ancestor titles ordered root→parent,
            e.g. ``["Chapter 3", "3.2"]``.
        text: Text of the section.
        relevance_reasoning: The LLM's explanation of why this section
            is relevant.
    """

    document_id: str
    node_id: str
    node_title: str
    node_depth: int
    breadcrumb: list[str]
    text: str
    relevance_reasoning: str
@dataclass
class DocumentSearchResult:
    """Combined outcome of a DocumentNavigator.search() call.

    Attributes:
        query: The query that was searched for.
        sections: The section hits that were found.
        documents_searched: Count of documents searched.
        iterations_used: Count of iterations used.
        strategy: Tag naming the path that produced this result, so
            callers can log it; ``"tree_search"`` is the only declared
            value and also the default.
    """

    query: str
    sections: list[SectionHit]
    documents_searched: int
    iterations_used: int
    strategy: Literal["tree_search"] = "tree_search"