fix(noise): expand STT artifact filter to catch subtitle metadata leaks

ElevenLabs scribe_v2_realtime also produces non-asterisk artifacts like
"Untertitel: ARD Text im Auftrag von Funk (2017)" from TV/radio audio.
Add pattern matching for subtitle metadata, copyright notices, and
parenthetical/bracketed annotations.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Christian Gick
2026-02-22 19:43:22 +02:00
parent 5984132f60
commit 7b7079352f

View File

@@ -57,13 +57,29 @@ def _build_voice_prompt() -> str:
datetime=now.strftime("%A, %d. %B %Y %H:%M %Z") datetime=now.strftime("%A, %d. %B %Y %H:%M %Z")
) )
# ElevenLabs scribe_v2_realtime annotates non-speech audio as *Geräusch* etc. # ElevenLabs scribe_v2_realtime produces two kinds of artifacts:
# Filter these via on_user_turn_completed (downstream of VAD+STT, no pipeline impact). # 1. Noise annotations: *Störgeräusche*, *Schlechte Qualität*, etc.
# 2. Subtitle/metadata leaks: "Untertitel: ARD Text im Auftrag von Funk (2017)"
# Filter both via on_user_turn_completed (downstream of VAD+STT, no pipeline impact).
_NOISE_ANNOTATION_RE = re.compile(r'^\s*\*[^*]+\*\s*$') _NOISE_ANNOTATION_RE = re.compile(r'^\s*\*[^*]+\*\s*$')
# Patterns for non-asterisk STT artifacts. Each one is applied with .match(),
# i.e. anchored at the start of the transcript text.
_STT_ARTIFACT_PATTERNS = [
    re.compile(r'(?i)^untertitel\b'),  # subtitle metadata
    re.compile(r'(?i)^copyright\b'),  # copyright notices
    re.compile(r'(?i)^musik\s*$'),  # bare "Musik" annotation
    # One or more parenthetical annotations and nothing else, e.g. "(Applaus)"
    # or "(Applaus) (Musik)". [^()]* is used instead of a greedy .* so that an
    # utterance which merely starts with "(" and ends with ")" -- with real
    # speech in between -- is not discarded. Nested parentheses do not match.
    re.compile(r'^(?:\([^()]*\)\s*)+$'),
    # Same rule for bracketed annotations such as "[Musik]".
    re.compile(r'^(?:\[[^\[\]]*\]\s*)+$'),
]
def _is_stt_artifact(text: str) -> bool:
    """Return True if *text* is an STT artifact rather than real speech.

    An artifact is either an asterisk-wrapped noise annotation (matched by
    ``_NOISE_ANNOTATION_RE``) or a metadata leak matched by one of the
    ``_STT_ARTIFACT_PATTERNS``.

    Args:
        text: Transcript text of a single user turn.

    Returns:
        True when the text matches an artifact pattern, False otherwise
        (including for empty or whitespace-only text).
    """
    # The artifact patterns are anchored with a bare ^ and, unlike the noise
    # regex, do not skip leading whitespace. Normalise here so the result does
    # not depend on every caller stripping the text first.
    candidate = text.strip()
    if _NOISE_ANNOTATION_RE.match(candidate):
        return True
    return any(p.match(candidate) for p in _STT_ARTIFACT_PATTERNS)
class _NoiseFilterAgent(Agent): class _NoiseFilterAgent(Agent):
"""Agent that suppresses ElevenLabs noise annotations before LLM sees them. """Agent that suppresses ElevenLabs STT artifacts before LLM sees them.
Uses on_user_turn_completed() which runs after VAD+STT, so no backpressure Uses on_user_turn_completed() which runs after VAD+STT, so no backpressure
risk to the audio pipeline. Raises StopResponse to silently discard noise. risk to the audio pipeline. Raises StopResponse to silently discard noise.
@@ -73,9 +89,9 @@ class _NoiseFilterAgent(Agent):
self, turn_ctx: llm.ChatContext, new_message: llm.ChatMessage self, turn_ctx: llm.ChatContext, new_message: llm.ChatMessage
) -> None: ) -> None:
text = (new_message.text_content or "").strip() text = (new_message.text_content or "").strip()
if text and _NOISE_ANNOTATION_RE.match(text): if text and _is_stt_artifact(text):
logger.info("Noise annotation suppressed: %s", text) logger.info("STT artifact suppressed: %s", text)
# Remove the noise message from context so it doesn't accumulate # Remove the artifact from context so it doesn't accumulate
if turn_ctx.items and turn_ctx.items[-1] is new_message: if turn_ctx.items and turn_ctx.items[-1] is new_message:
turn_ctx.items.pop() turn_ctx.items.pop()
raise StopResponse() raise StopResponse()