fix(noise): expand STT artifact filter to catch subtitle metadata leaks

ElevenLabs scribe_v2_realtime also produces non-asterisk artifacts like "Untertitel: ARD Text im Auftrag von Funk (2017)" from TV/radio audio. Add pattern matching for subtitle metadata, copyright notices, and parenthetical/bracketed annotations.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-22 19:43:22 +02:00
parent 5984132f60
commit 7b7079352f
1 changed files with 22 additions and 6 deletions
--- a/voice.py
+++ b/voice.py
@@ -57,13 +57,29 @@ def _build_voice_prompt() -> str:
        datetime=now.strftime("%A, %d. %B %Y %H:%M %Z")
    )
-# ElevenLabs scribe_v2_realtime annotates non-speech audio as *Geräusch* etc.
+# ElevenLabs scribe_v2_realtime produces two kinds of artifacts:
-# Filter these via on_user_turn_completed (downstream of VAD+STT, no pipeline impact).
+# 1. Noise annotations: *Störgeräusche*, *Schlechte Qualität*, etc.
 # 2. Subtitle/metadata leaks: "Untertitel: ARD Text im Auftrag von Funk (2017)"
 # Filter both via on_user_turn_completed (downstream of VAD+STT, no pipeline impact).
 _NOISE_ANNOTATION_RE = re.compile(r'^\s*\*[^*]+\*\s*$')
 _STT_ARTIFACT_PATTERNS = [
    re.compile(r'(?i)^untertitel\b'),           # subtitle metadata
    re.compile(r'(?i)^copyright\b'),            # copyright notices
    re.compile(r'(?i)^musik\s*$'),              # bare "Musik" annotation
    re.compile(r'(?i)^\(.*\)\s*$'),             # parenthetical annotations like (Applaus)
    re.compile(r'(?i)^\[.*\]\s*$'),             # bracketed annotations like [Musik]
 ]
 def _is_stt_artifact(text: str) -> bool:
    """Check if text is an STT artifact (noise annotation or metadata leak)."""
    if _NOISE_ANNOTATION_RE.match(text):
        return True
    return any(p.match(text) for p in _STT_ARTIFACT_PATTERNS)
 class _NoiseFilterAgent(Agent):
-    """Agent that suppresses ElevenLabs noise annotations before LLM sees them.
+    """Agent that suppresses ElevenLabs STT artifacts before LLM sees them.
    Uses on_user_turn_completed() which runs after VAD+STT, so no backpressure
    risk to the audio pipeline. Raises StopResponse to silently discard noise.
@@ -73,9 +89,9 @@ class _NoiseFilterAgent(Agent):
        self, turn_ctx: llm.ChatContext, new_message: llm.ChatMessage
    ) -> None:
        text = (new_message.text_content or "").strip()
-        if text and _NOISE_ANNOTATION_RE.match(text):
+        if text and _is_stt_artifact(text):
-            logger.info("Noise annotation suppressed: %s", text)
+            logger.info("STT artifact suppressed: %s", text)
-            # Remove the noise message from context so it doesn't accumulate
+            # Remove the artifact from context so it doesn't accumulate
            if turn_ctx.items and turn_ctx.items[-1] is new_message:
                turn_ctx.items.pop()
            raise StopResponse()