Coverage for astrocyte/pipeline/_json_tolerant.py: 91%

47 statements  

« prev     ^ index     » next       coverage.py v7.15.0, created at 2026-07-04 05:24 +0000

1"""Tolerant JSON parsing for LLM extraction outputs. 

2 

3gpt-4o-mini occasionally wraps its JSON in markdown fences or sprinkles 

4prose around the object despite ``response_format={"type": "json_object"}`` 

5being set. The strict ``json.loads`` path then fails and the caller 

6silently drops the section's facts / entities. This module gives the 

7caller a layered fallback before giving up: 

8 

91. ``json.loads`` straight up — fast path, the common case. 

102. Strip a ``` ```json ``` / ``` ``` ``` markdown fence wrapper. 

113. Slice from the first ``{`` to the last ``}`` (drops surrounding prose). 

12 

13Returns the parsed object on success or ``None`` on total failure. The 

14caller keeps its existing warn-and-return-empty fallback for ``None``. 

15 

16``looks_truncated`` is a sibling heuristic: when the LLM hit its 

17``max_tokens`` budget mid-output, the JSON is unrecoverable and a retry 

18with the same budget won't help either, so the caller should skip the 

19retry path to avoid the latency cost. 

20""" 

21 

22from __future__ import annotations 

23 

24import json 

25from typing import Any 

26 

27 

def tolerant_json_loads(text: str) -> Any | None:
    """Best-effort JSON parse. Returns ``None`` if all strategies fail.

    Strategies, tried in order:

    1. ``json.loads`` on the raw text.
    2. If the stripped text starts with a markdown fence, drop the opening
       fence line and a trailing fence, then parse what is left.
    3. Decode the JSON value that begins at the first ``{`` and ignore
       anything that follows it (surrounding prose, stray braces).

    Note that a response which is literally JSON ``null`` also comes back
    as ``None``; the two cases are not distinguished.
    """
    if not text:
        return None
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    stripped = text.strip()
    if stripped.startswith("```"):
        # Drop the opening fence (possibly ```json) up to the first newline.
        newline = stripped.find("\n")
        inner = stripped[newline + 1 :] if newline != -1 else stripped[3:]
        # Drop the trailing fence, if the model emitted one.
        inner = inner.rstrip()
        if inner.endswith("```"):
            inner = inner[:-3].rstrip()
        try:
            return json.loads(inner)
        except json.JSONDecodeError:
            pass

    start = text.find("{")
    if start == -1:
        return None
    try:
        # raw_decode stops at the end of the first complete value, so trailing
        # prose is ignored even when it contains a "}" of its own. Slicing up
        # to the last "}" would hand that prose to the parser and fail.
        # Only the first "{" is tried: on truncated output this fails rather
        # than returning some nested inner object.
        parsed, _end = json.JSONDecoder().raw_decode(text, start)
    except json.JSONDecodeError:
        return None
    return parsed

60 

61 

def looks_truncated(text: str) -> bool:
    """Heuristic: did the LLM run out of budget mid-output?

    Used to short-circuit the parse-failure retry path. Retrying when
    the original response was budget-truncated wastes a round-trip — the
    retry will hit the same cap. Counts are naive (don't track quoting),
    which is fine: false positives just skip an occasional retry, false
    negatives just spend an extra round-trip.

    A trailing markdown fence is removed before the checks, so a fenced
    response that was closed properly is judged by the JSON inside it and
    not by the backticks at its end.

    Returns ``True`` when the text contains at least one ``{`` or ``[`` and
    either ends in ``,`` / ``:``, has more openers than closers, or ends in
    a character other than ``}``, ``]``, ``"`` or ``'``.
    """
    if not text:
        return False
    s = text.rstrip()
    # A closing fence would otherwise reach the last-character test below and
    # always look like a cut-off tail, even though the model finished.
    if s.endswith("```"):
        s = s[:-3].rstrip()
    if not s:
        return False
    opens = s.count("{") + s.count("[")
    # No braces at all → not a truncated JSON object. Probably a refusal
    # or a prose answer; the retry path may recover with a stricter
    # reminder, so don't short-circuit here.
    if opens == 0:
        return False
    last = s[-1]
    # A dangling separator means a value was still expected.
    if last in (",", ":"):
        return True
    closes = s.count("}") + s.count("]")
    if opens > closes:
        return True
    # Balanced counts, but the text does not end on a closer or a quote.
    return last not in ("}", "]", '"', "'")

90 return False