From 9ec45339e910c713183dc6c077766b308cc61e1a Mon Sep 17 00:00:00 2001
From: Christian Gick <service@agiliton.eu>
Date: Mon, 16 Mar 2026 14:07:47 +0200
Subject: [PATCH] fix(MAT-174): Robust LLM filter JSON extraction with regex

The LLM sometimes returns extra text around the JSON array. Use a regex
to extract the array of indices instead of parsing the full response,
and fall back to the unfiltered results (with a warning) when no array
is found.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 cron/brave_search.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/cron/brave_search.py b/cron/brave_search.py
index c92875f..3196f76 100644
--- a/cron/brave_search.py
+++ b/cron/brave_search.py
@@ -59,8 +59,13 @@ async def _llm_filter(results: list[dict], criteria: str) -> list[dict]:
             data = resp.json()
 
         reply = data["choices"][0]["message"]["content"].strip()
-        # Parse the JSON array of indices
-        indices = json.loads(reply)
+        # Extract JSON array from response (LLM may include extra text)
+        import re
+        match = re.search(r"\[[\d,\s]*\]", reply)
+        if not match:
+            logger.warning("LLM filter returned no array: %s", reply)
+            return results
+        indices = json.loads(match.group())
         if not isinstance(indices, list):
             logger.warning("LLM filter returned non-list: %s", reply)
             return results