fix(MAT-174): Robust LLM filter JSON extraction with regex

The LLM sometimes returns extra text around the JSON array. Use a regex to extract the array pattern instead of parsing the full response.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-16 14:07:47 +02:00
parent ec46c37bc5
commit 9ec45339e9
1 changed files with 7 additions and 2 deletions
--- a/cron/brave_search.py
+++ b/cron/brave_search.py
@@ -59,8 +59,13 @@ async def _llm_filter(results: list[dict], criteria: str) -> list[dict]:
            data = resp.json()

        reply = data["choices"][0]["message"]["content"].strip()
-        # Parse the JSON array of indices
-        indices = json.loads(reply)
+        # Extract JSON array from response (LLM may include extra text)
+        import re
+        match = re.search(r"\[[\d,\s]*\]", reply)
+        if not match:
+            logger.warning("LLM filter returned no array: %s", reply)
+            return results
+        indices = json.loads(match.group())
        if not isinstance(indices, list):
            logger.warning("LLM filter returned non-list: %s", reply)
            return results