Coverage for astrocyte/pipeline/section_embedding.py: 100%
19 statements
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
1"""PR2 commit A: per-section summary embeddings for the section recall semantic strategy.
Embeds the ``summary`` text of each PageIndex tree section using the
configured embedding provider (typically ``text-embedding-3-small`` at
1536 dims). The vector lands in ``astrocyte_pi_sections.summary_embedding``
and powers the semantic strategy's ``<=>`` queries in PR2 commit B.
Why summary (not body):
- Body slices are 100-2000 tokens — embedding them is costly and the
  signal is noisy (full-turn dialogue contains a lot of social filler
  that dilutes the topical signal).
- The summary was generated by md_to_tree at retain time specifically
  to capture the section's topical content. It's the cleanest input
  we have for similarity ranking.
Why not body chunks too:
- Chunking the body into multiple embedded rows would push us back
  toward atomized retrieval (the M1-M8 shape). Section-grain is
  deliberate (ADR-007). Keep it.
Cost: one embedding input per section. ``text-embedding-3-small``
costs ~$0.02 per 1M tokens; a typical section summary is ~50 tokens, so
~$0.000001 per section. Trivial vs the entity-extraction LLM call.
The embed call is batched per document (one network round-trip for all
sections in a tree) so retain wall-time stays bounded.
27"""
29from __future__ import annotations
import logging
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Needed only for the annotations on ``embed_sections``; guarding the
    # imports keeps them out of the runtime import graph.
    from astrocyte.providers.openai import OpenAIProvider
    from astrocyte.types import PageIndexSection

# Module-level logger used for the best-effort failure warnings below.
logger = logging.getLogger("astrocyte.pipeline.section_embedding")
41async def embed_sections(
42 provider: "OpenAIProvider",
43 sections: list["PageIndexSection"],
44 *,
45 model: str | None = None,
46) -> list[tuple[int, list[float]]]:
47 """Batch-embed every section's summary in a single API call.
49 Returns ``(line_num, vector)`` tuples for the subset of sections
50 that have a non-empty summary; sections without a summary are
51 silently skipped (their ``summary_embedding`` stays NULL — the
52 semantic strategy excludes NULL rows naturally).
54 ``model`` overrides the provider's default embedding model when
55 set; otherwise the provider's configured model wins.
56 """
57 targets: list[tuple[int, str]] = [
58 (s.line_num, (s.summary or "").strip()) for s in sections if (s.summary or "").strip()
59 ]
60 if not targets:
61 return []
63 line_nums = [ln for ln, _ in targets]
64 texts = [text for _, text in targets]
66 try:
67 vectors = await provider.embed(texts, model=model)
68 except Exception as exc: # noqa: BLE001 — embedding failure shouldn't tank retain
69 logger.warning(
70 "section_embedding: embed failed for %d sections — %s: %s",
71 len(targets),
72 type(exc).__name__,
73 exc,
74 )
75 return []
77 if len(vectors) != len(targets):
78 logger.warning(
79 "section_embedding: embed returned %d vectors for %d inputs",
80 len(vectors),
81 len(targets),
82 )
83 return []
85 return list(zip(line_nums, vectors))