Coverage for astrocyte/eval/judges/__init__.py: 100%
3 statements
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
1"""Canonical benchmark judges — ported from published eval scripts.
3The previous Astrocyte benchmark adapters used ``word_overlap_score > 0.3``
4as a coarse proxy for correctness. That is looser than the judges that published
5comparison points (LoCoMo paper, LongMemEval paper, Mem0, Zep, Hindsight)
6use, so our numbers could not be compared directly with theirs.
8This package ports each benchmark's canonical judge exactly so our
9scores become cross-comparable with published work:
11- :mod:`astrocyte.eval.judges.locomo_judge` — stemmed token-F1 with
12 category-specific logic. Pure Python, no LLM. Ported from
13 ``datasets/locomo/task_eval/evaluation.py``.
15- :mod:`astrocyte.eval.judges.longmemeval_judge` — LLM-judge with
16 task-specific yes/no prompts. Ported from
17 ``datasets/longmemeval/src/evaluation/evaluate_qa.py``.
19Each judge is self-contained; the adapter selects which judge to use
20based on the benchmark it's running. Adapters can also be configured to
21run BOTH their legacy scorer and the canonical judge for delta analysis.
22"""
24from astrocyte.eval.judges.locomo_judge import (
25 LOCOMO_CATEGORY_IDS,
26 locomo_category_id,
27 locomo_score_qa,
28)
29from astrocyte.eval.judges.longmemeval_judge import (
30 LONGMEMEVAL_ABSTENTION_SUFFIX,
31 LongMemEvalJudge,
32 build_longmemeval_judge_prompt,
33)
35__all__ = [
36 "LOCOMO_CATEGORY_IDS",
37 "LONGMEMEVAL_ABSTENTION_SUFFIX",
38 "LongMemEvalJudge",
39 "build_longmemeval_judge_prompt",
40 "locomo_category_id",
41 "locomo_score_qa",
42]