fix(MAT-174): Robust LLM filter JSON extraction with regex

The LLM sometimes returns extra text around the JSON array. Use a regex to
extract the array itself instead of parsing the full response.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Christian Gick
2026-03-16 14:07:47 +02:00
parent ec46c37bc5
commit 9ec45339e9

View File

@@ -59,8 +59,13 @@ async def _llm_filter(results: list[dict], criteria: str) -> list[dict]:
data = resp.json()
reply = data["choices"][0]["message"]["content"].strip()
# Parse the JSON array of indices
indices = json.loads(reply)
# Extract JSON array from response (LLM may include extra text)
import re
match = re.search(r"\[[\d,\s]*\]", reply)
if not match:
logger.warning("LLM filter returned no array: %s", reply)
return results
indices = json.loads(match.group())
if not isinstance(indices, list):
logger.warning("LLM filter returned non-list: %s", reply)
return results