Coverage for astrocyte/documents/storage.py: 95%
37 statements
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
1"""DocumentStore SPI — persist documents + trees.
3Abstract base + an in-memory implementation for tests / embedded use.
4Postgres impl lives in ``adapters-storage-py/astrocyte-postgres/`` so
5the Document Engine doesn't depend on Postgres directly.
7The SPI is intentionally narrow:
8 - ``save_document(doc, tree=None)`` — upsert document + optionally its tree
9 - ``get_document(doc_id)`` — fetch document metadata
10 - ``get_tree(doc_id)`` — reconstruct the DocumentTree from stored nodes
11 - ``list_documents(limit)`` — paginate; for control-plane / debugging
12 - ``delete_document(doc_id)`` — drop document AND its tree
14Trees are stored as flat node rows with parent_id FKs (adjacency-list
15style) rather than nested JSON. Reasons:
16 - Allows queries like "fetch all nodes at depth 2" without parsing JSON
17 - FK constraints catch orphaned nodes at write time
18 - Future tree-aware queries (sibling expansion, depth filtering) are SQL
19"""
21from __future__ import annotations
23from abc import ABC, abstractmethod
24from typing import Iterable
26from astrocyte.documents.types import Document, DocumentTree, TreeNode
class DocumentNotFoundError(Exception):
    """Raised when a requested document_id doesn't exist in the store.

    NOTE(review): none of the store methods visible in this module raise
    it — lookups return ``None`` and deletes are no-ops for unknown ids.
    Presumably intended for callers that require the document to exist;
    confirm against usage elsewhere.
    """
class DocumentStore(ABC):
    """Persistence SPI for documents + trees.

    All five operations are abstract coroutines; a concrete backend must
    implement every one of them before it can be instantiated.
    """

    @abstractmethod
    async def save_document(
        self,
        document: Document,
        tree: DocumentTree | None = None,
    ) -> None:
        """Upsert a document. If ``tree`` is provided, also upsert all its nodes.

        Idempotent — calling twice with the same ``document.id`` updates
        rather than duplicates. If a tree is provided AND the document
        already has a stored tree, the existing tree is replaced (all
        old nodes deleted, new nodes inserted).

        Args:
            document: The document record to insert or update.
            tree: Optional tree to store alongside the document.
        """

    @abstractmethod
    async def get_document(self, document_id: str) -> Document | None:
        """Fetch document metadata. Returns None if not found.

        Note: returned ``Document.tree`` is None even if a tree exists
        in storage — use ``get_tree`` to fetch separately. Two-call
        pattern keeps reads cheap when only metadata is needed.

        Args:
            document_id: Identifier of the document to look up.
        """

    @abstractmethod
    async def get_tree(self, document_id: str) -> DocumentTree | None:
        """Reconstruct the DocumentTree from stored nodes.

        Returns None if the document has no stored tree (e.g., Document
        saved without one). Returns an empty tree (no roots) if rows
        exist but are malformed (logged).

        Args:
            document_id: Identifier of the document whose tree is wanted.
        """

    @abstractmethod
    async def list_documents(self, *, limit: int = 100) -> list[Document]:
        """List documents in descending created_at order.

        For control-plane and debugging. Pagination is offset-less for
        Phase 2 simplicity — callers should not rely on stable order
        between pages.

        Args:
            limit: Maximum number of documents to return (keyword-only).
        """

    @abstractmethod
    async def delete_document(self, document_id: str) -> None:
        """Delete a document and all its tree nodes.

        No-op if document_id doesn't exist. Tree-node deletion happens
        via FK cascade (Postgres) or explicit removal (InMemory).

        Args:
            document_id: Identifier of the document to remove.
        """
86# ─── in-memory impl (tests, embedded use) ─────────────────────────────
class InMemoryDocumentStore(DocumentStore):
    """Pure-Python DocumentStore backed by dicts. Not thread-safe.

    Useful for:
      - Unit tests (no DB setup needed)
      - Embedded / CLI use where persistence isn't required
      - Smoke tests of the Document Engine before wiring to Postgres
    """

    def __init__(self) -> None:
        # Metadata records keyed by document id; always stored with tree=None.
        self._docs: dict[str, Document] = {}
        # Trees keyed by document id. The caller's DocumentTree object is
        # kept verbatim — it is neither flattened into node rows nor copied.
        self._trees: dict[str, DocumentTree] = {}

    async def save_document(
        self,
        document: Document,
        tree: DocumentTree | None = None,
    ) -> None:
        """Insert or overwrite the metadata record for ``document.id``.

        A fresh ``Document`` is built from the listed fields with
        ``tree=None``, so the stored record never carries an inline tree.
        When ``tree`` is given it replaces any tree previously stored for
        this id; when it is ``None`` a previously stored tree is left alone.
        """
        # Store metadata only (tree on Document is detached).
        self._docs[document.id] = Document(
            id=document.id,
            source_uri=document.source_uri,
            content_hash=document.content_hash,
            mime_type=document.mime_type,
            title=document.title,
            created_at=document.created_at,
            tree=None,  # never inline on the metadata record
        )
        if tree is not None:
            # Replace any previous tree.
            self._trees[document.id] = tree

    async def get_document(self, document_id: str) -> Document | None:
        """Return the stored metadata record, or ``None`` if the id is unknown."""
        return self._docs.get(document_id)

    async def get_tree(self, document_id: str) -> DocumentTree | None:
        """Return the stored tree object, or ``None`` if none was saved."""
        return self._trees.get(document_id)

    async def list_documents(self, *, limit: int = 100) -> list[Document]:
        """Return up to ``limit`` documents, newest ``created_at`` first.

        A zero or negative ``limit`` yields an empty list.
        """
        newest_first = sorted(
            self._docs.values(),
            key=lambda d: d.created_at,
            reverse=True,
        )
        # Clamp at zero: a negative slice stop would count from the end and
        # silently drop the oldest documents instead of limiting the result.
        return newest_first[: max(limit, 0)]

    async def delete_document(self, document_id: str) -> None:
        """Remove the document and its tree; unknown ids are a no-op."""
        self._docs.pop(document_id, None)
        self._trees.pop(document_id, None)
138# ─── helper: flatten tree to (parent_id, node) rows ───────────────────
def flatten_tree_rows(tree: DocumentTree) -> Iterable[TreeNode]:
    """Yield every node of ``tree`` lazily, as produced by ``tree.all_nodes()``.

    The Postgres impl uses this to insert nodes in parent-before-child
    order (required for the FK constraint).

    NOTE(review): this helper does no ordering or ``parent_id`` assignment
    of its own — it relies on ``DocumentTree.all_nodes()`` to emit nodes in
    pre-order with ``parent_id`` already populated. Confirm against that
    method before depending on it for FK-safe inserts.
    """
    yield from tree.all_nodes()