Coverage for astrocyte/mip/presets.py: 94%

47 statements  

« prev     ^ index     » next       coverage.py v7.15.0, created at 2026-07-04 05:24 +0000

1"""MIP pipeline presets — named bundles of chunker/dedup/rerank/reflect overrides. 

2 

3Presets are the **primary** authoring interface for pipeline shaping (P1). 

4Authors write `pipeline: { preset: conversational }` rather than picking 

5individual knobs. Raw overrides are supported but documented as advanced. 

6 

7Expansion happens once at load time (in loader._parse_pipeline). Downstream 

8code only ever sees fully-resolved PipelineSpec instances — never preset names. 

9 

10To add a preset: add an entry to PRESETS. Update docs/_plugins/mip-developer-guide.md. 

11""" 

12 

13from __future__ import annotations 

14 

15from dataclasses import replace 

16 

17from astrocyte.mip.schema import ( 

18 ChunkerSpec, 

19 DedupSpec, 

20 ForgetSpec, 

21 PipelineSpec, 

22 ReflectSpec, 

23 RerankSpec, 

24) 

25 

# Named pipeline presets, keyed by the name authors write in
# ``pipeline: { preset: <name> }``. Each value supplies the defaults that
# ``expand_preset`` merges explicit overrides onto.
PRESETS: dict[str, PipelineSpec] = {
    # Dialogue chunking with no overlap; promotes speaker / occurred_at.
    "conversational": PipelineSpec(
        chunker=ChunkerSpec(
            strategy="dialogue",
            max_size=800,
            overlap=0,
        ),
        dedup=DedupSpec(
            threshold=0.92,
            action="skip_chunk",
        ),
        rerank=RerankSpec(
            keyword_weight=0.08,
            proper_noun_weight=0.15,
        ),
        reflect=ReflectSpec(
            prompt="temporal_aware",
            promote_metadata=["speaker", "occurred_at"],
        ),
    ),
    # Paragraph chunking with a 100-unit overlap; default reflect prompt.
    "document": PipelineSpec(
        chunker=ChunkerSpec(
            strategy="paragraph",
            max_size=1200,
            overlap=100,
        ),
        dedup=DedupSpec(
            threshold=0.95,
            action="skip",
        ),
        rerank=RerankSpec(
            keyword_weight=0.10,
            proper_noun_weight=0.05,
        ),
        reflect=ReflectSpec(
            prompt="default",
            promote_metadata=None,
        ),
    ),
    # Fixed-size chunking; proper-noun weight is zeroed out for reranking.
    "code": PipelineSpec(
        chunker=ChunkerSpec(
            strategy="fixed",
            max_size=1500,
            overlap=200,
        ),
        dedup=DedupSpec(
            threshold=0.98,
            action="skip",
        ),
        rerank=RerankSpec(
            keyword_weight=0.12,
            proper_noun_weight=0.0,
        ),
        reflect=ReflectSpec(
            prompt="evidence_strict",
            promote_metadata=None,
        ),
    ),
    "evidence_strict": PipelineSpec(
        # No chunker defaults here: the caller's chunker (if any) is used as-is.
        chunker=None,
        dedup=DedupSpec(
            threshold=0.98,
            action="skip",
        ),
        rerank=RerankSpec(
            keyword_weight=0.10,
            proper_noun_weight=0.05,
        ),
        reflect=ReflectSpec(
            prompt="evidence_strict",
            promote_metadata=["source", "occurred_at"],
        ),
    ),
}

53 

54 

def is_known_preset(name: str) -> bool:
    """Return True if ``name`` is the name of a pipeline preset in ``PRESETS``.

    Values that are not strings (for example a list or mapping coming from a
    malformed config) are reported as unknown rather than raising
    ``TypeError`` on an unhashable dict lookup, so the caller can raise its
    own validation error.
    """
    # The isinstance guard runs first so unhashable inputs never reach the
    # dict membership test.
    return isinstance(name, str) and name in PRESETS

57 

58 

def list_presets() -> list[str]:
    """Return the names of all pipeline presets, sorted alphabetically."""
    # Iterating a dict yields its keys, so no explicit ``.keys()`` is needed.
    return sorted(PRESETS)

61 

62 

63# --------------------------------------------------------------------------- 

64# Forget presets (Phase 4) 

65# --------------------------------------------------------------------------- 

66 

# Named forget presets; ``expand_forget_preset`` uses these as the defaults
# for any field the author leaves unset.
FORGET_PRESETS: dict[str, ForgetSpec] = {
    # "gdpr" — right-to-erasure. Hard delete with a required audit trail,
    # derived data cascaded, no minimum age. Legal hold MUST be respected
    # (compliance-mandated).
    "gdpr": ForgetSpec(
        mode="hard",
        audit="required",
        cascade=True,
        min_age_days=0,
        respect_legal_hold=True,
    ),
    # "student" — FERPA-style student records. Soft delete with a grace
    # period and auditing on; records younger than 7 days are refused to
    # guard against accidental deletion.
    "student": ForgetSpec(
        mode="soft",
        audit="recommended",
        cascade=True,
        min_age_days=7,
        respect_legal_hold=True,
    ),
    # "audit-strict" — tombstone replacement, which preserves the
    # cryptographic chain. Audit is required and cascade is off, so each
    # tombstone is tracked individually.
    "audit-strict": ForgetSpec(
        mode="tombstone",
        audit="required",
        cascade=False,
        min_age_days=0,
        respect_legal_hold=True,
    ),
}

96 

97 

def is_known_forget_preset(name: str) -> bool:
    """Return True if ``name`` is the name of a forget preset in ``FORGET_PRESETS``.

    Values that are not strings (for example a list or mapping coming from a
    malformed config) are reported as unknown rather than raising
    ``TypeError`` on an unhashable dict lookup, so the caller can raise its
    own validation error.
    """
    # The isinstance guard runs first so unhashable inputs never reach the
    # dict membership test.
    return isinstance(name, str) and name in FORGET_PRESETS

100 

101 

def list_forget_presets() -> list[str]:
    """Return the names of all forget presets, sorted alphabetically."""
    # Iterating a dict yields its keys, so no explicit ``.keys()`` is needed.
    return sorted(FORGET_PRESETS)

104 

105 

def expand_forget_preset(spec: ForgetSpec) -> ForgetSpec:
    """Resolve a named forget preset into a fully populated :class:`ForgetSpec`.

    A field explicitly set on ``spec`` (anything other than ``None``) wins;
    a field left as ``None`` takes the value from the named preset. The
    returned spec has ``preset`` cleared.

    When ``spec.preset`` is ``None`` there is nothing to expand and ``spec``
    itself is returned. Unknown preset names are not handled here (the lookup
    raises ``KeyError``); the caller is expected to validate the name first.
    """
    preset_name = spec.preset
    if preset_name is None:
        return spec

    defaults = FORGET_PRESETS[preset_name]

    def pick(field_name):
        # Explicit value if one was given, otherwise the preset's default.
        explicit = getattr(spec, field_name)
        return getattr(defaults, field_name) if explicit is None else explicit

    return ForgetSpec(
        version=spec.version,
        preset=None,
        mode=pick("mode"),
        audit=pick("audit"),
        cascade=pick("cascade"),
        respect_legal_hold=pick("respect_legal_hold"),
        min_age_days=pick("min_age_days"),
        max_per_call=pick("max_per_call"),
    )

128 

129 

def expand_preset(spec: PipelineSpec) -> PipelineSpec:
    """Resolve a named pipeline preset into a fully populated PipelineSpec.

    Each sub-block (chunker, dedup, rerank, reflect) is merged field by
    field: explicit values on `spec` win, unset values fall back to the
    preset. The returned spec has `preset` cleared.

    When `spec.preset` is None there is nothing to expand and `spec` itself
    is returned (raw overrides only). Unknown preset names are not handled
    here (the lookup raises KeyError) — validate with is_known_preset in the
    loader so the error can mention the rule name.
    """
    preset_name = spec.preset
    if preset_name is None:
        return spec

    defaults = PRESETS[preset_name]

    # An explicit half-life wins over the preset's. No current preset sets
    # one, but the fallback keeps this forward-compatible if one ever does.
    half_life = spec.temporal_half_life_days
    if half_life is None:
        half_life = defaults.temporal_half_life_days

    return PipelineSpec(
        version=spec.version,
        preset=None,  # cleared post-expansion
        chunker=_merge_chunker(defaults.chunker, spec.chunker),
        dedup=_merge_dedup(defaults.dedup, spec.dedup),
        rerank=_merge_rerank(defaults.rerank, spec.rerank),
        reflect=_merge_reflect(defaults.reflect, spec.reflect),
        temporal_half_life_days=half_life,
    )

159 

160 

def _merge_chunker(base: ChunkerSpec | None, override: ChunkerSpec | None) -> ChunkerSpec | None:
    """Overlay ``override`` onto ``base`` for the chunker sub-block.

    If either side is ``None`` the other is returned as-is (``None`` when
    both are). Otherwise a copy of ``base`` is returned in which every
    ``override`` field that is not ``None`` replaces the preset value.
    """
    if base is None or override is None:
        return base if override is None else override

    # ``is None`` (not truthiness) so explicit falsy values like 0 still win.
    strategy = base.strategy if override.strategy is None else override.strategy
    max_size = base.max_size if override.max_size is None else override.max_size
    overlap = base.overlap if override.overlap is None else override.overlap
    return replace(base, strategy=strategy, max_size=max_size, overlap=overlap)

172 

173 

def _merge_dedup(base: DedupSpec | None, override: DedupSpec | None) -> DedupSpec | None:
    """Overlay ``override`` onto ``base`` for the dedup sub-block.

    If either side is ``None`` the other is returned as-is (``None`` when
    both are). Otherwise a copy of ``base`` is returned in which every
    ``override`` field that is not ``None`` replaces the preset value.
    """
    if base is None or override is None:
        return base if override is None else override

    # ``is None`` (not truthiness) so explicit falsy values like 0.0 still win.
    threshold = base.threshold if override.threshold is None else override.threshold
    action = base.action if override.action is None else override.action
    return replace(base, threshold=threshold, action=action)

184 

185 

def _merge_rerank(base: RerankSpec | None, override: RerankSpec | None) -> RerankSpec | None:
    """Overlay ``override`` onto ``base`` for the rerank sub-block.

    If either side is ``None`` the other is returned as-is (``None`` when
    both are). Otherwise a copy of ``base`` is returned in which every
    ``override`` field that is not ``None`` replaces the preset value.
    """
    if base is None or override is None:
        return base if override is None else override

    # ``is None`` (not truthiness) so an explicit weight of 0.0 still wins.
    keyword_weight = base.keyword_weight if override.keyword_weight is None else override.keyword_weight
    proper_noun_weight = (
        base.proper_noun_weight if override.proper_noun_weight is None else override.proper_noun_weight
    )
    return replace(base, keyword_weight=keyword_weight, proper_noun_weight=proper_noun_weight)

198 

199 

def _merge_reflect(base: ReflectSpec | None, override: ReflectSpec | None) -> ReflectSpec | None:
    """Overlay ``override`` onto ``base`` for the reflect sub-block.

    If either side is ``None`` the other is returned as-is (``None`` when
    both are). Otherwise a copy of ``base`` is returned in which every
    ``override`` field that is not ``None`` replaces the preset value.
    """
    if base is None or override is None:
        return base if override is None else override

    # ``is None`` (not truthiness) so an explicit empty list still replaces
    # the preset's promote_metadata.
    prompt = base.prompt if override.prompt is None else override.prompt
    promote_metadata = (
        base.promote_metadata if override.promote_metadata is None else override.promote_metadata
    )
    return replace(base, prompt=prompt, promote_metadata=promote_metadata)