Coverage for astrocyte/documents/retrieval/types.py: 100%

"""Retrieval types for the Document Engine tree-search path.

These types form the contract between DocumentRetriever (low-level reads)
and DocumentNavigator (agent loop). They are distinct from DocumentTree /
TreeNode — they are query-time views, not storage representations.

Key design decisions:
- TreeSkeleton never contains node text — prevents accidental large-text
  dumps to the LLM reasoning step.
- SectionHit carries a breadcrumb so callers have structural context
  without having to re-traverse the tree.
- DocumentSearchResult is strategy-tagged so callers can log which
  path produced each result.
"""

16from __future__ import annotations

18from dataclasses import dataclass, field

19from datetime import datetime

20from typing import Literal

@dataclass
class DocumentInfo:
    """Lightweight document metadata — no tree, no text.

    Holds only scalar descriptors of a document; ``created_at`` is the
    single optional field and defaults to ``None``.
    """

    document_id: str
    title: str
    source_uri: str
    node_count: int
    depth_min: int  # presumably the shallowest node depth in the tree — confirm
    depth_max: int  # presumably the deepest node depth in the tree — confirm
    created_at: datetime | None = None

@dataclass
class SkeletonNode:
    """A tree node stripped of text — for efficient LLM tree reasoning.

    Token budget: ~30-50 tokens per node (title + summary fragment).
    A 100-node document → ~3-5k tokens to transmit the full skeleton.

    Only one of ``page_start`` / ``line_start`` is expected to be set,
    depending on the source format; both default to ``None``.
    """

    node_id: str
    parent_id: str | None
    depth: int
    title: str
    summary: str | None  # node.summary.text if available, else None
    has_children: bool
    child_count: int
    page_start: int | None = None  # for PDFs (1-indexed); None for markdown
    line_start: int | None = None  # for markdown (1-indexed); None for PDFs

@dataclass
class TreeSkeleton:
    """Full tree structure without node text. Pre-order flat list.

    The reasoning surface: the LLM reads titles + summaries to decide
    which nodes to fetch via get_node_content().
    """

    document_id: str
    title: str
    node_count: int
    # default_factory gives every instance its own list rather than a
    # shared mutable default.
    nodes: list[SkeletonNode] = field(default_factory=list)

@dataclass
class NodeContent:
    """A single node's full content + its immediate children (skeleton form).

    ``children`` are skeleton-only (no text) — the LLM can decide whether
    to drill deeper without paying the cost of fetching all child text.

    The page/line range fields are all optional and default to ``None``.
    """

    node_id: str
    document_id: str
    title: str
    depth: int
    parent_id: str | None
    text: str
    summary: str | None
    summary_kind: str | None  # "raw" | "llm" | "prefix"
    # default_factory gives every instance its own list rather than a
    # shared mutable default.
    children: list[SkeletonNode] = field(default_factory=list)
    page_start: int | None = None
    page_end: int | None = None
    line_start: int | None = None
    line_end: int | None = None

@dataclass
class SectionHit:
    """One relevant section found by the DocumentNavigator.

    All fields are required; there are no defaults.
    """

    document_id: str
    node_id: str
    node_title: str
    node_depth: int
    breadcrumb: list[str]  # ancestor titles root→parent, e.g. ["Chapter 3", "3.2"]
    text: str
    relevance_reasoning: str  # LLM's explanation of why this section is relevant

103

104

@dataclass
class DocumentSearchResult:
    """Aggregated result from DocumentNavigator.search().

    ``strategy`` tags which retrieval path produced the result; the only
    value declared here is ``"tree_search"``, which is also the default.
    """

    query: str
    sections: list[SectionHit]
    documents_searched: int
    iterations_used: int
    strategy: Literal["tree_search"] = "tree_search"