diff --git a/voice.py b/voice.py index 3003ac3..bdef964 100644 --- a/voice.py +++ b/voice.py @@ -57,13 +57,29 @@ def _build_voice_prompt() -> str: datetime=now.strftime("%A, %d. %B %Y %H:%M %Z") ) -# ElevenLabs scribe_v2_realtime annotates non-speech audio as *Geräusch* etc. -# Filter these via on_user_turn_completed (downstream of VAD+STT, no pipeline impact). +# ElevenLabs scribe_v2_realtime produces two kinds of artifacts: +# 1. Noise annotations: *Störgeräusche*, *Schlechte Qualität*, etc. +# 2. Subtitle/metadata leaks: "Untertitel: ARD Text im Auftrag von Funk (2017)" +# Filter both via on_user_turn_completed (downstream of VAD+STT, no pipeline impact). _NOISE_ANNOTATION_RE = re.compile(r'^\s*\*[^*]+\*\s*$') +_STT_ARTIFACT_PATTERNS = [ + re.compile(r'(?i)^untertitel\b'), # subtitle metadata + re.compile(r'(?i)^copyright\b'), # copyright notices + re.compile(r'(?i)^musik\s*$'), # bare "Musik" annotation + re.compile(r'(?i)^\(.*\)\s*$'), # parenthetical annotations like (Applaus) + re.compile(r'(?i)^\[.*\]\s*$'), # bracketed annotations like [Musik] +] + + +def _is_stt_artifact(text: str) -> bool: + """Check if text is an STT artifact (noise annotation or metadata leak).""" + if _NOISE_ANNOTATION_RE.match(text): + return True + return any(p.match(text) for p in _STT_ARTIFACT_PATTERNS) class _NoiseFilterAgent(Agent): - """Agent that suppresses ElevenLabs noise annotations before LLM sees them. + """Agent that suppresses ElevenLabs STT artifacts before LLM sees them. Uses on_user_turn_completed() which runs after VAD+STT, so no backpressure risk to the audio pipeline. Raises StopResponse to silently discard noise. @@ -73,9 +89,9 @@ class _NoiseFilterAgent(Agent): self, turn_ctx: llm.ChatContext, new_message: llm.ChatMessage ) -> None: text = (new_message.text_content or "").strip() - if text and _NOISE_ANNOTATION_RE.match(text): - logger.info("Noise annotation suppressed: %s", text) - # Remove the noise message from context so it doesn't accumulate + if text and _is_stt_artifact(text): + logger.info("STT artifact suppressed: %s", text) + # Remove the artifact from context so it doesn't accumulate if turn_ctx.items and turn_ctx.items[-1] is new_message: turn_ctx.items.pop() raise StopResponse()