Coverage for astrocyte/documents/storage.py: 95%

37 statements  

« prev     ^ index     » next       coverage.py v7.15.0, created at 2026-07-04 05:24 +0000

1"""DocumentStore SPI — persist documents + trees. 

2 

3Abstract base + an in-memory implementation for tests / embedded use. 

4Postgres impl lives in ``adapters-storage-py/astrocyte-postgres/`` so 

5the Document Engine doesn't depend on Postgres directly. 

6 

7The SPI is intentionally narrow: 

8 - ``save_document(doc, tree=None)`` — upsert document + optionally its tree 

9 - ``get_document(doc_id)`` — fetch document metadata 

10 - ``get_tree(doc_id)`` — reconstruct the DocumentTree from stored nodes 

11 - ``list_documents(limit=...)`` — newest-first listing capped at ``limit`` (no offset); for control-plane / debugging

12 - ``delete_document(doc_id)`` — drop document AND its tree 

13 

14Trees are stored as flat node rows with parent_id FKs (adjacency-list

15style) rather than nested JSON. Reasons:

16 - Allows queries like "fetch all nodes at depth 2" without parsing JSON 

17 - FK constraints catch orphaned nodes at write time 

18 - Future tree-aware queries (sibling expansion, depth filtering) are SQL 

19""" 

20 

21from __future__ import annotations 

22 

23from abc import ABC, abstractmethod 

24from typing import Iterable 

25 

26from astrocyte.documents.types import Document, DocumentTree, TreeNode 

27 

28 

class DocumentNotFoundError(Exception):
    """Error signalling that no document with the requested ``document_id`` is stored."""

31 

32 

class DocumentStore(ABC):
    """Abstract persistence interface (SPI) for documents and their trees.

    Every method is an abstract coroutine; a concrete backend must
    implement all five before it can be instantiated.
    """

    @abstractmethod
    async def save_document(
        self,
        document: Document,
        tree: DocumentTree | None = None,
    ) -> None:
        """Insert or update ``document``; when ``tree`` is given, store its nodes too.

        The operation is idempotent: saving again under the same
        ``document.id`` updates the existing record instead of creating
        a duplicate. When a tree is supplied and one is already stored
        for this document, the stored tree is replaced wholesale — old
        nodes are deleted and the new nodes inserted.
        """

    @abstractmethod
    async def get_document(self, document_id: str) -> Document | None:
        """Return the document's metadata, or ``None`` when it is not found.

        The returned ``Document.tree`` is always ``None``, even when a
        tree exists in storage; call ``get_tree`` to load it. Splitting
        the read in two keeps metadata-only lookups cheap.
        """

    @abstractmethod
    async def get_tree(self, document_id: str) -> DocumentTree | None:
        """Rebuild and return the document's ``DocumentTree`` from its stored nodes.

        Returns ``None`` when no tree is stored for the document (for
        example, it was saved without one). Returns an empty tree (no
        roots) when rows exist but are malformed; that case is logged.
        """

    @abstractmethod
    async def list_documents(self, *, limit: int = 100) -> list[Document]:
        """Return documents ordered by ``created_at``, newest first.

        Meant for control-plane and debugging use. There is no offset
        parameter (kept simple for Phase 2), so callers must not rely
        on a stable order between pages.
        """

    @abstractmethod
    async def delete_document(self, document_id: str) -> None:
        """Remove a document together with all of its tree nodes.

        Does nothing when ``document_id`` is unknown. Tree nodes are
        removed through FK cascade (Postgres) or by explicit removal
        (InMemory).
        """

84 

85 

86# ─── in-memory impl (tests, embedded use) ───────────────────────────── 

87 

88 

class InMemoryDocumentStore(DocumentStore):
    """Dict-backed DocumentStore that lives entirely in process memory.

    Not thread-safe. Intended for:
      - unit tests (no database setup needed)
      - embedded / CLI use where persistence isn't required
      - smoke tests of the Document Engine before wiring to Postgres
    """

    def __init__(self) -> None:
        # Metadata records keyed by document id; each is stored with tree=None.
        self._docs: dict[str, Document] = {}
        # Trees keyed by document id. The DocumentTree passed to
        # save_document is kept as the same object — it is neither copied
        # nor flattened into node rows.
        self._trees: dict[str, DocumentTree] = {}

    async def save_document(
        self,
        document: Document,
        tree: DocumentTree | None = None,
    ) -> None:
        """Store a metadata-only copy of ``document`` and, if given, its ``tree``.

        An existing record under the same ``document.id`` is overwritten.
        When ``tree`` is ``None`` any previously stored tree is left as is.
        """
        # Rebuild the record field by field so the stored metadata never
        # carries an inline tree, whatever the caller's Document holds.
        metadata = Document(
            id=document.id,
            source_uri=document.source_uri,
            content_hash=document.content_hash,
            mime_type=document.mime_type,
            title=document.title,
            created_at=document.created_at,
            tree=None,
        )
        self._docs[document.id] = metadata

        if tree is None:
            return
        # A supplied tree supersedes whatever was stored for this id.
        self._trees[document.id] = tree

    async def get_document(self, document_id: str) -> Document | None:
        """Return the stored metadata record, or ``None`` if the id is unknown."""
        return self._docs.get(document_id)

    async def get_tree(self, document_id: str) -> DocumentTree | None:
        """Return the stored tree object, or ``None`` if none was saved for the id."""
        return self._trees.get(document_id)

    async def list_documents(self, *, limit: int = 100) -> list[Document]:
        """Return at most ``limit`` documents, newest ``created_at`` first."""
        newest_first = list(self._docs.values())
        newest_first.sort(key=lambda doc: doc.created_at, reverse=True)
        # NOTE(review): a negative ``limit`` slices from the end (drops the
        # last |limit| documents) instead of failing — confirm no caller
        # passes one.
        return newest_first[:limit]

    async def delete_document(self, document_id: str) -> None:
        """Drop the document's metadata and tree; unknown ids are ignored."""
        for table in (self._docs, self._trees):
            table.pop(document_id, None)

136 

137 

138# ─── helper: flatten tree to node rows (parent before child) ──────────

139 

140 

def flatten_tree_rows(tree: DocumentTree) -> Iterable[TreeNode]:
    """Lazily yield every node produced by ``tree.all_nodes()``, in that order.

    Intended for the Postgres implementation, which has to insert a
    parent row before its children to satisfy the FK constraint.

    This helper does no reordering and no ``parent_id`` fix-up of its
    own: pre-order traversal and populated ``parent_id`` values are
    expected to come from ``DocumentTree.all_nodes()`` — presumably
    guaranteed there; verify against that method.
    """
    yield from tree.all_nodes()