fix(noise): expand STT artifact filter to catch subtitle metadata leaks

ElevenLabs scribe_v2_realtime also produces non-asterisk artifacts like "Untertitel: ARD Text im Auftrag von Funk (2017)" from TV/radio audio. Add pattern matching for subtitle metadata, copyright notices, and parenthetical/bracketed annotations.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-22 19:43:22 +02:00
parent 5984132f60
commit 7b7079352f
1 changed files with 22 additions and 6 deletions
--- a/voice.py
+++ b/voice.py
@@ -57,13 +57,29 @@ def _build_voice_prompt() -> str:
        datetime=now.strftime("%A, %d. %B %Y %H:%M %Z")
    )

-# ElevenLabs scribe_v2_realtime annotates non-speech audio as *Geräusch* etc.
-# Filter these via on_user_turn_completed (downstream of VAD+STT, no pipeline impact).
+# ElevenLabs scribe_v2_realtime produces two kinds of artifacts:
+# 1. Noise annotations: *Störgeräusche*, *Schlechte Qualität*, etc.
+# 2. Subtitle/metadata leaks: "Untertitel: ARD Text im Auftrag von Funk (2017)"
+# Filter both via on_user_turn_completed (downstream of VAD+STT, no pipeline impact).
 _NOISE_ANNOTATION_RE = re.compile(r'^\s*\*[^*]+\*\s*$')
+_STT_ARTIFACT_PATTERNS = [
+    re.compile(r'(?i)^untertitel\b'),           # subtitle metadata
+    re.compile(r'(?i)^copyright\b'),            # copyright notices
+    re.compile(r'(?i)^musik\s*$'),              # bare "Musik" annotation
+    re.compile(r'(?i)^\(.*\)\s*$'),             # parenthetical annotations like (Applaus)
+    re.compile(r'(?i)^\[.*\]\s*$'),             # bracketed annotations like [Musik]
+]
+
+
+def _is_stt_artifact(text: str) -> bool:
+    """Check if text is an STT artifact (noise annotation or metadata leak)."""
+    if _NOISE_ANNOTATION_RE.match(text):
+        return True
+    return any(p.match(text) for p in _STT_ARTIFACT_PATTERNS)


 class _NoiseFilterAgent(Agent):
-    """Agent that suppresses ElevenLabs noise annotations before LLM sees them.
+    """Agent that suppresses ElevenLabs STT artifacts before LLM sees them.

    Uses on_user_turn_completed() which runs after VAD+STT, so no backpressure
    risk to the audio pipeline. Raises StopResponse to silently discard noise.
@@ -73,9 +89,9 @@ class _NoiseFilterAgent(Agent):
        self, turn_ctx: llm.ChatContext, new_message: llm.ChatMessage
    ) -> None:
        text = (new_message.text_content or "").strip()
-        if text and _NOISE_ANNOTATION_RE.match(text):
-            logger.info("Noise annotation suppressed: %s", text)
-            # Remove the noise message from context so it doesn't accumulate
+        if text and _is_stt_artifact(text):
+            logger.info("STT artifact suppressed: %s", text)
+            # Remove the artifact from context so it doesn't accumulate
            if turn_ctx.items and turn_ctx.items[-1] is new_message:
                turn_ctx.items.pop()
            raise StopResponse()