From 9ec45339e910c713183dc6c077766b308cc61e1a Mon Sep 17 00:00:00 2001 From: Christian Gick Date: Mon, 16 Mar 2026 14:07:47 +0200 Subject: [PATCH] fix(MAT-174): Robust LLM filter JSON extraction with regex LLM sometimes returns extra text around the JSON array. Use regex to extract the array pattern instead of parsing the full response. Co-Authored-By: Claude Opus 4.6 (1M context) --- cron/brave_search.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/cron/brave_search.py b/cron/brave_search.py index c92875f..3196f76 100644 --- a/cron/brave_search.py +++ b/cron/brave_search.py @@ -59,8 +59,13 @@ async def _llm_filter(results: list[dict], criteria: str) -> list[dict]: data = resp.json() reply = data["choices"][0]["message"]["content"].strip() - # Parse the JSON array of indices - indices = json.loads(reply) + # Extract JSON array from response (LLM may include extra text) + import re + match = re.search(r"\[[\d,\s]*\]", reply) + if not match: + logger.warning("LLM filter returned no array: %s", reply) + return results + indices = json.loads(match.group()) if not isinstance(indices, list): logger.warning("LLM filter returned non-list: %s", reply) return results