Coverage for astrocyte/pipeline/retain_fsm/context.py: 98%

44 statements  

« prev     ^ index     » next       coverage.py v7.15.0, created at 2026-07-04 05:24 +0000

1"""M14.0: shared context + services for the retain FSM. 

2 

3The FSM engine drives stateful transitions over a single ``RetainContext`` 

4per source. Every state mutates the context in place — appends to lists, 

5sets output fields — never replaces it. The context is also the unit of 

6checkpoint persistence: serialise it between transitions, deserialise to 

7resume. 

8 

9``RetainServices`` bundles the dependencies states need (store, provider, 

10config). Passed alongside ctx into every state coroutine. Mirrors the 

11pattern Pydantic / FastAPI uses for dependency-injected request handlers. 

12 

13See ``docs/_design/m13-m14-roadmap.md`` §4 for the architectural shape. 

14""" 

15 

16from __future__ import annotations 

17 

18from dataclasses import dataclass, field 

19from datetime import datetime, timezone 

20from typing import TYPE_CHECKING, Any 

21 

22if TYPE_CHECKING: 

23 from astrocyte.provider import LLMProvider, MentalModelStore, PageIndexStore 

24 from astrocyte.types import PageIndexFact, PageIndexSection 

25 

26 

@dataclass
class StepLogEntry:
    """A single record in a source's audit trail.

    Per the module's design notes, the M14.5 ``log.md`` output is built
    from these records; from M14.0 onwards they serve as an in-memory
    debugging aid and feed ``RetainContext.duration_ms_by_state``.
    """

    # Identity of the step: which state it was and when it began.
    state: str
    started_at: datetime

    # Left at ``None`` until a value is assigned.
    completed_at: datetime | None = None
    duration_ms: float | None = None
    error: str | None = None

    # Free-form extra data; each entry gets its own dict instance.
    notes: dict[str, Any] = field(default_factory=dict)

40 

41 

@dataclass
class RetainContext:
    """Everything the retain FSM tracks for a single source.

    The caller fills in the input fields before ``RetainFSM.run`` begins;
    states fill in the output fields as the pipeline advances.
    """

    # -- Inputs (supplied by the caller) ---------------------------------
    bank_id: str
    source_id: str
    md_text: str
    reference_date: datetime | None = None

    # -- Outputs (written by states during the run) ----------------------
    document_id: str | None = None
    sections: list[PageIndexSection] = field(default_factory=list)
    facts: list[PageIndexFact] = field(default_factory=list)
    entities: list[str] = field(default_factory=list)
    wikis_created: list[str] = field(default_factory=list)
    wikis_updated: list[str] = field(default_factory=list)
    supersedes_edges: list[tuple[str, str]] = field(default_factory=list)

    # -- Control / observability -----------------------------------------
    last_state: str = "INIT"
    step_log: list[StepLogEntry] = field(default_factory=list)
    errors: list[str] = field(default_factory=list)
    # Timezone-aware UTC timestamp taken when the context is constructed.
    started_at: datetime = field(
        default_factory=lambda: datetime.now(timezone.utc),
    )
    completed_at: datetime | None = None

    def duration_ms_by_state(self) -> dict[str, float]:
        """Sum the recorded wall time per state across the step log.

        Entries whose ``duration_ms`` is ``None`` are skipped, which makes
        this safe to call while a run is still in progress. Used by tests
        and the M14.5 log.md emitter.

        Returns:
            Mapping of state name to its total duration in milliseconds;
            states with no timed entry are absent.
        """
        totals: dict[str, float] = {}
        for step in self.step_log:
            elapsed = step.duration_ms
            if elapsed is not None:
                totals[step.state] = totals.get(step.state, 0.0) + elapsed
        return totals

83 

84 

@dataclass
class RetainServices:
    """Bundle of dependencies handed to the retain FSM's states.

    Built once when the FSM is initialised and passed into every state
    coroutine together with the context.

    Only ``store`` and ``provider`` are required. ``mental_model_store``
    defaults to ``None`` so tests and smoke runs can supply a partial
    bundle; a state that needs a service that is missing should raise a
    clear, explicit error instead of hitting an ``AttributeError`` deep
    in the call stack.
    """

    # Required services.
    store: PageIndexStore
    provider: LLMProvider

    # Optional service; ``None`` means it was not supplied.
    mental_model_store: MentalModelStore | None = None

    # Model names as plain strings, overridable per bundle.
    embedding_model: str = "text-embedding-3-small"
    extraction_model: str = "gpt-4o-mini"