fix(noise): expand STT artifact filter to catch subtitle metadata leaks
ElevenLabs scribe_v2_realtime also produces non-asterisk artifacts like "Untertitel: ARD Text im Auftrag von Funk (2017)" from TV/radio audio. Add pattern matching for subtitle metadata, copyright notices, and parenthetical/bracketed annotations. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
28
voice.py
28
voice.py
@@ -57,13 +57,29 @@ def _build_voice_prompt() -> str:
|
||||
datetime=now.strftime("%A, %d. %B %Y %H:%M %Z")
|
||||
)
|
||||
|
||||
# ElevenLabs scribe_v2_realtime annotates non-speech audio as *Geräusch* etc.
|
||||
# Filter these via on_user_turn_completed (downstream of VAD+STT, no pipeline impact).
|
||||
# ElevenLabs scribe_v2_realtime produces two kinds of artifacts:
|
||||
# 1. Noise annotations: *Störgeräusche*, *Schlechte Qualität*, etc.
|
||||
# 2. Subtitle/metadata leaks: "Untertitel: ARD Text im Auftrag von Funk (2017)"
|
||||
# Filter both via on_user_turn_completed (downstream of VAD+STT, no pipeline impact).
|
||||
# A turn consisting solely of one asterisk-wrapped annotation, e.g.
# "*Störgeräusche*". Surrounding whitespace is tolerated.
_NOISE_ANNOTATION_RE = re.compile(r'^\s*\*[^*]+\*\s*$')

# Non-asterisk artifacts. Every pattern is anchored at the start of the text.
_STT_ARTIFACT_PATTERNS = [
    re.compile(r'(?i)^untertitel\b'),  # subtitle metadata
    re.compile(r'(?i)^copyright\b'),  # copyright notices
    re.compile(r'(?i)^musik\s*$'),  # bare "Musik" annotation
    # One or more parenthetical groups and nothing else, e.g. "(Applaus)".
    # [^()]* rather than .* so that an utterance which merely starts and ends
    # with a parenthetical -- "(ähm) real speech (lacht)" -- is NOT discarded.
    re.compile(r'^(?:\([^()]*\)\s*)+$'),
    # Same rule for bracketed annotations like "[Musik]".
    re.compile(r'^(?:\[[^\[\]]*\]\s*)+$'),
]
|
||||
|
||||
|
||||
def _is_stt_artifact(text: str) -> bool:
    """Return True if *text* is an STT artifact rather than real speech.

    An artifact is either an asterisk-wrapped noise annotation (matched by
    ``_NOISE_ANNOTATION_RE``) or anything matched by one of
    ``_STT_ARTIFACT_PATTERNS`` (subtitle metadata and similar leaks).

    Args:
        text: Transcript of a single user turn. Leading and trailing
            whitespace is ignored.

    Returns:
        True when the turn should be discarded, False otherwise. Empty or
        whitespace-only input is never classified as an artifact.
    """
    # The list patterns are anchored at ^ with no whitespace allowance, unlike
    # _NOISE_ANNOTATION_RE; strip here so both kinds treat padding the same.
    candidate = (text or "").strip()
    if not candidate:
        return False
    if _NOISE_ANNOTATION_RE.match(candidate):
        return True
    return any(pattern.match(candidate) for pattern in _STT_ARTIFACT_PATTERNS)
|
||||
|
||||
|
||||
class _NoiseFilterAgent(Agent):
|
||||
"""Agent that suppresses ElevenLabs noise annotations before LLM sees them.
|
||||
"""Agent that suppresses ElevenLabs STT artifacts before LLM sees them.
|
||||
|
||||
Uses on_user_turn_completed() which runs after VAD+STT, so no backpressure
|
||||
risk to the audio pipeline. Raises StopResponse to silently discard noise.
|
||||
@@ -73,9 +89,9 @@ class _NoiseFilterAgent(Agent):
|
||||
self, turn_ctx: llm.ChatContext, new_message: llm.ChatMessage
|
||||
) -> None:
|
||||
text = (new_message.text_content or "").strip()
|
||||
if text and _NOISE_ANNOTATION_RE.match(text):
|
||||
logger.info("Noise annotation suppressed: %s", text)
|
||||
# Remove the noise message from context so it doesn't accumulate
|
||||
if text and _is_stt_artifact(text):
|
||||
logger.info("STT artifact suppressed: %s", text)
|
||||
# Remove the artifact from context so it doesn't accumulate
|
||||
if turn_ctx.items and turn_ctx.items[-1] is new_message:
|
||||
turn_ctx.items.pop()
|
||||
raise StopResponse()
|
||||
|
||||
Reference in New Issue
Block a user