Coverage for astrocyte/pipeline/_json_tolerant.py: 91%
47 statements
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-04 05:24 +0000
1"""Tolerant JSON parsing for LLM extraction outputs.
3gpt-4o-mini occasionally wraps its JSON in markdown fences or sprinkles
4prose around the object despite ``response_format={"type": "json_object"}``
5being set. The strict ``json.loads`` path then fails and the caller
6silently drops the section's facts / entities. This module gives the
7caller a layered fallback before giving up:
91. ``json.loads`` straight up — fast path, the common case.
102. Strip a ``` ```json ``` / ``` ``` ``` markdown fence wrapper.
113. Slice from the first ``{`` to the last ``}`` (drops surrounding prose).
13Returns the parsed object on success or ``None`` on total failure. The
14caller keeps its existing warn-and-return-empty fallback for ``None``.
16``looks_truncated`` is a sibling heuristic: when the LLM hit its
17``max_tokens`` budget mid-output, the JSON is unrecoverable and a retry
18with the same budget won't help either, so the caller should skip the
19retry path to avoid the latency cost.
20"""
22from __future__ import annotations
24import json
25from typing import Any
28def tolerant_json_loads(text: str) -> Any | None:
29 """Best-effort JSON parse. Returns ``None`` if all strategies fail."""
30 if not text:
31 return None
32 try:
33 return json.loads(text)
34 except json.JSONDecodeError:
35 pass
37 stripped = text.strip()
38 if stripped.startswith("```"):
39 # Drop the opening fence (possibly ```json) up to first newline.
40 newline = stripped.find("\n")
41 inner = stripped[newline + 1 :] if newline != -1 else stripped[3:]
42 # Drop trailing fence.
43 inner = inner.rstrip()
44 if inner.endswith("```"):
45 inner = inner[:-3].rstrip()
46 try:
47 return json.loads(inner)
48 except json.JSONDecodeError:
49 pass
51 start = text.find("{")
52 end = text.rfind("}")
53 if start != -1 and end > start:
54 try:
55 return json.loads(text[start : end + 1])
56 except json.JSONDecodeError:
57 pass
59 return None
def looks_truncated(text: str) -> bool:
    """Heuristic: did the LLM run out of budget mid-output?

    Used to short-circuit the parse-failure retry path. Retrying when
    the original response was budget-truncated wastes a round-trip — the
    retry will hit the same cap. Counts are naive (don't track quoting),
    which is fine: false positives just skip an occasional retry, false
    negatives just spend an extra round-trip.

    A trailing markdown fence is removed before the checks run. A closing
    fence is wrapper, not payload; without this step every fenced
    response would end in a backtick and be reported as truncated even
    when its brackets are balanced.

    Returns ``True`` when the text contains at least one opening bracket
    and either ends on a dangling ``,`` / ``:``, has more opening than
    closing brackets, or ends on a character that cannot close a JSON
    value. Returns ``False`` for empty, whitespace-only, or bracket-free
    text.
    """
    if not text:
        return False
    s = text.rstrip()
    # Ignore one closing fence so the last-character test below inspects
    # the payload rather than the wrapper.
    if s.endswith("```"):
        s = s[:-3].rstrip()
    if not s:
        return False
    opens = s.count("{") + s.count("[")
    # No braces at all → not a truncated JSON object. Probably a refusal
    # or a prose answer; the retry path may recover with a stricter
    # reminder, so don't short-circuit here.
    if opens == 0:
        return False
    last = s[-1]
    # A dangling separator means a key or value was still to come.
    if last in (",", ":"):
        return True
    closes = s.count("}") + s.count("]")
    if opens > closes:
        return True
    # Balanced counts, but the text ends on something other than a
    # closing bracket or quote.
    return last not in "}]\"'"