Coverage for astrocyte/eval/judges/longmemeval

1"""Canonical LongMemEval judge — ported from the paper's reference evaluation.

3Upstream: ``datasets/longmemeval/src/evaluation/evaluate_qa.py`` from

4https://github.com/xiaowu0162/LongMemEval. LongMemEval's canonical judge

5is an **LLM-judge** (unlike LoCoMo's deterministic F1): each prediction

6is sent to an LLM with a task-specific prompt asking "Is the model

7response correct? Answer yes or no only." The yes-rate across all

8questions is the accuracy.

10## Task-specific prompts

12Five templates, each tuned to the category's success criteria:

14- **single-session-user / single-session-assistant / multi-session**:

15 pass if the response contains the correct answer, or contains all the

16 intermediate steps. Reject subsets.

17- **temporal-reasoning**: same, plus do not penalize off-by-one errors

18 on day/week/month counts.

19- **knowledge-update**: pass if the response contains the *updated*

20 answer, even if it also mentions previous information.

21- **single-session-preference**: pass if the response satisfies the

22 rubric; does not need to reflect every point.

23- **abstention** (task suffix ``_abs``): pass if the response correctly

24 identifies the question as unanswerable.

26All prompts ask for a single-token "yes" or "no" reply. This module

27never parses free-form LLM output — just the yes/no head.

29## What this module does

31- Builds the right prompt for (task, question, answer, response).

32- Calls an :class:`astrocyte.provider.LLMProvider` to get the judgment.

33- Returns 1.0 for yes and 0.0 for no; ambiguous responses are logged with a warning and scored as 0.0.

35## What this module does NOT do

37- Does not generate the model's response (that's the adapter's reflect

38 call).

39- Does not batch multiple questions into one LLM call (the paper's

40 reference does one-at-a-time; we match to stay comparable).

41- Does not retry on rate-limit; the caller's LLM provider should

42 handle backoff.

44Cost note: real-provider judge calls are cheap (short prompt, 1-token

45response). At gpt-4o-mini prices that's about $0.0001 per question —

46500 LongMemEval questions = ~$0.05. We log the total in the result for

47transparency.

48"""

50from __future__ import annotations

52import logging

53from typing import TYPE_CHECKING, Final

55from astrocyte.types import Message

57if TYPE_CHECKING:

58 from astrocyte.provider import LLMProvider

# Module-level logger; used to warn when a judge reply is neither yes nor no.
_logger = logging.getLogger(__name__)

#: LongMemEval's abstention suffix convention — any question_type ending
#: with ``_abs`` triggers the abstention prompt, taking precedence over the
#: category/alias lookup.
LONGMEMEVAL_ABSTENTION_SUFFIX: Final[str] = "_abs"

#: Category → prompt template. The upstream script uses Python str.format
#: positional substitution; we keep the same template strings verbatim
#: so future scoring runs remain byte-for-byte comparable. Order of
#: substitution is (question, answer, response) for non-abstention,
#: (question, explanation, response) for abstention. For the preference
#: template the second slot is rendered under the label "Rubric".
#:
#: Each template has exactly three ``{}`` placeholders. Trailing spaces
#: before ``\n\n`` are part of the prompt text and must not be "tidied".
_TEMPLATES: Final[dict[str, str]] = {
    # Default template; also reached through the aliases for
    # "single-session-assistant" and "multi-session".
    "single-session-user": (
        "I will give you a question, a correct answer, and a response from a "
        "model. Please answer yes if the response contains the correct answer. "
        "Otherwise, answer no. If the response is equivalent to the correct "
        "answer or contains all the intermediate steps to get the correct "
        "answer, you should also answer yes. If the response only contains a "
        "subset of the information required by the answer, answer no. "
        "\n\nQuestion: {}\n\nCorrect Answer: {}\n\nModel Response: {}\n\n"
        "Is the model response correct? Answer yes or no only."
    ),
    # Same as the default, plus leniency for off-by-one day/week/month counts.
    "temporal-reasoning": (
        "I will give you a question, a correct answer, and a response from a "
        "model. Please answer yes if the response contains the correct answer. "
        "Otherwise, answer no. If the response is equivalent to the correct "
        "answer or contains all the intermediate steps to get the correct "
        "answer, you should also answer yes. If the response only contains a "
        "subset of the information required by the answer, answer no. In "
        "addition, do not penalize off-by-one errors for the number of days. "
        "If the question asks for the number of days/weeks/months, etc., and "
        "the model makes off-by-one errors (e.g., predicting 19 days when the "
        "answer is 18), the model's response is still correct. "
        "\n\nQuestion: {}\n\nCorrect Answer: {}\n\nModel Response: {}\n\n"
        "Is the model response correct? Answer yes or no only."
    ),
    # Accepts responses that mention stale information as long as the
    # updated answer is the required one.
    "knowledge-update": (
        "I will give you a question, a correct answer, and a response from a "
        "model. Please answer yes if the response contains the correct answer. "
        "Otherwise, answer no. If the response contains some previous "
        "information along with an updated answer, the response should be "
        "considered as correct as long as the updated answer is the required "
        "answer."
        "\n\nQuestion: {}\n\nCorrect Answer: {}\n\nModel Response: {}\n\n"
        "Is the model response correct? Answer yes or no only."
    ),
    # Rubric-based: the response need not cover every rubric point.
    "single-session-preference": (
        "I will give you a question, a rubric for desired personalized "
        "response, and a response from a model. Please answer yes if the "
        "response satisfies the desired response. Otherwise, answer no. The "
        "model does not need to reflect all the points in the rubric. The "
        "response is correct as long as it recalls and utilizes the user's "
        "personal information correctly."
        "\n\nQuestion: {}\n\nRubric: {}\n\nModel Response: {}\n\n"
        "Is the model response correct? Answer yes or no only."
    ),
    # Internal key (leading underscore): selected by the ``_abs`` suffix on
    # the question_type rather than by a category name.
    "_abstention": (
        "I will give you an unanswerable question, an explanation, and a "
        "response from a model. Please answer yes if the model correctly "
        "identifies the question as unanswerable. The model could say that "
        "the information is incomplete, or some other information is given "
        "but the asked information is not."
        "\n\nQuestion: {}\n\nExplanation: {}\n\nModel Response: {}\n\n"
        "Does the model correctly identify the question as unanswerable? "
        "Answer yes or no only."
    ),
}

127

128# Aliases — the upstream script treats multiple non-abstention tasks the

129# same way ("single-session-assistant" and "multi-session" share the

130# first template). We flatten that mapping here so the caller always

131# passes its own ``question_type`` string and we pick the right template.

132_ALIASES: Final[dict[str, str]] = {

133 "single-session-assistant": "single-session-user",

134 "multi-session": "single-session-user",

135}

136

137

def _resolve_template(question_type: str) -> str:
    """Return the judge prompt template that applies to ``question_type``.

    Abstention variants (any type ending in ``LONGMEMEVAL_ABSTENTION_SUFFIX``)
    are checked first and always get the abstention template. Otherwise the
    type is mapped through ``_ALIASES`` and looked up in ``_TEMPLATES``.

    Raises:
        ValueError: if the type is not an abstention variant, a template
            key, or a known alias.
    """
    if question_type.endswith(LONGMEMEVAL_ABSTENTION_SUFFIX):
        return _TEMPLATES["_abstention"]

    canonical = _ALIASES.get(question_type, question_type)
    template = _TEMPLATES.get(canonical)
    if template is not None:
        return template

    known_types = sorted(_TEMPLATES.keys())
    known_aliases = sorted(_ALIASES.keys())
    raise ValueError(
        f"Unknown LongMemEval question_type: {question_type!r}. "
        f"Known: {known_types} + aliases {known_aliases}",
    )

149

150

def build_longmemeval_judge_prompt(
    question_type: str,
    question: str,
    answer: str,
    response: str,
) -> str:
    """Fill the judge template for ``question_type`` with (q, a, r).

    The three values are substituted positionally, in the order question,
    answer, response. This is a public helper so tests can compare the
    rendered prompt with the upstream reference;
    :meth:`LongMemEvalJudge.score` calls it internally.

    Raises:
        ValueError: propagated from template resolution when
            ``question_type`` is not recognised.
    """
    slots = (question, answer, response)
    return _resolve_template(question_type).format(*slots)

165

166

167# ---------------------------------------------------------------------------

168# Judge — async, LLM-backed

169# ---------------------------------------------------------------------------

170

171

class LongMemEvalJudge:
    """Yes/no grader for LongMemEval predictions, backed by an LLM.

    Create one instance per benchmark run and give it the provider that
    should act as judge. Using the same provider that produced the
    predictions keeps things consistent, although the paper judges with
    ``gpt-4o`` regardless of the prediction model.
    """

    def __init__(
        self,
        llm_provider: LLMProvider,
        *,
        model: str | None = None,
        max_tokens: int = 4,
        temperature: float = 0.0,
    ) -> None:
        # A bare "yes"/"no" fits in one token; the default of 4 is headroom.
        self._llm, self._model = llm_provider, model
        self._max_tokens, self._temperature = max_tokens, temperature

    async def score(
        self,
        question_type: str,
        question: str,
        answer: str,
        response: str,
    ) -> float:
        """Ask the judge LLM about one prediction and return 1.0 or 0.0.

        1.0 means the judge answered yes; anything else yields 0.0.

        Raises:
            ValueError: if ``question_type`` is not recognised.

        Errors from the LLM provider are not caught here; the caller
        decides how to aggregate them (e.g. count as 0 and log).
        """
        judge_prompt = build_longmemeval_judge_prompt(
            question_type, question, answer, response
        )
        request = [Message(role="user", content=judge_prompt)]
        verdict = await self._llm.complete(
            messages=request,
            model=self._model,
            max_tokens=self._max_tokens,
            temperature=self._temperature,
        )
        return parse_yes_no(verdict.text)

220

221

def parse_yes_no(raw: str | None) -> float:
    """Interpret an LLM judgment string as 1.0 (yes) or 0.0 (no).

    The reply is lower-cased and every leading non-alphanumeric character
    is skipped (whitespace, punctuation, quotes, markdown markers, ...).
    The remainder is then prefix-matched, following the
    ``ans.lower().startswith('yes')`` pattern of the upstream loop. Replies
    such as ``"Yes"``, ``**Yes**`` and ``(yes).`` therefore count as yes
    instead of being scored as ambiguous.

    Args:
        raw: The judge's reply text, or ``None`` if there was no text.

    Returns:
        1.0 if the reply starts with "yes"; 0.0 if it starts with "no", is
        ``None``, or is ambiguous. Ambiguous replies are logged as a
        warning — treating "I don't know" as a negative judgment is the
        safe default for accuracy scoring.
    """
    if raw is None:
        return 0.0

    lowered = raw.lower()
    # Index of the first letter/digit; everything before it is wrapper noise.
    # Trailing characters need no stripping because only the prefix matters.
    start = next(
        (idx for idx, char in enumerate(lowered) if char.isalnum()),
        len(lowered),
    )
    head = lowered[start:]

    if head.startswith("yes"):
        return 1.0
    if head.startswith("no"):
        return 0.0

    _logger.warning(
        "LongMemEval judge returned ambiguous response %r; scored as no",
        raw[:200],
    )
    return 0.0

Coverage for astrocyte/eval/judges/longmemeval_judge.py: 100%

38 statements