fix(noise): expand STT artifact filter to catch subtitle metadata leaks
ElevenLabs scribe_v2_realtime also produces non-asterisk artifacts like "Untertitel: ARD Text im Auftrag von Funk (2017)" from TV/radio audio. Add pattern matching for subtitle metadata, copyright notices, and parenthetical/bracketed annotations.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
28
voice.py
28
voice.py
@@ -57,13 +57,29 @@ def _build_voice_prompt() -> str:
|
|||||||
datetime=now.strftime("%A, %d. %B %Y %H:%M %Z")
|
datetime=now.strftime("%A, %d. %B %Y %H:%M %Z")
|
||||||
)
|
)
|
||||||
|
|
||||||
# ElevenLabs scribe_v2_realtime annotates non-speech audio as *Geräusch* etc.
|
# ElevenLabs scribe_v2_realtime produces two kinds of artifacts:
|
||||||
# Filter these via on_user_turn_completed (downstream of VAD+STT, no pipeline impact).
|
# 1. Noise annotations: *Störgeräusche*, *Schlechte Qualität*, etc.
|
||||||
|
# 2. Subtitle/metadata leaks: "Untertitel: ARD Text im Auftrag von Funk (2017)"
|
||||||
|
# Filter both via on_user_turn_completed (downstream of VAD+STT, no pipeline impact).
|
||||||
_NOISE_ANNOTATION_RE = re.compile(r'^\s*\*[^*]+\*\s*$')
|
_NOISE_ANNOTATION_RE = re.compile(r'^\s*\*[^*]+\*\s*$')
|
||||||
|
_STT_ARTIFACT_PATTERNS = [
|
||||||
|
re.compile(r'(?i)^untertitel\b'), # subtitle metadata
|
||||||
|
re.compile(r'(?i)^copyright\b'), # copyright notices
|
||||||
|
re.compile(r'(?i)^musik\s*$'), # bare "Musik" annotation
|
||||||
|
re.compile(r'(?i)^\(.*\)\s*$'), # parenthetical annotations like (Applaus)
|
||||||
|
re.compile(r'(?i)^\[.*\]\s*$'), # bracketed annotations like [Musik]
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def _is_stt_artifact(text: str) -> bool:
|
||||||
|
"""Check if text is an STT artifact (noise annotation or metadata leak)."""
|
||||||
|
if _NOISE_ANNOTATION_RE.match(text):
|
||||||
|
return True
|
||||||
|
return any(p.match(text) for p in _STT_ARTIFACT_PATTERNS)
|
||||||
|
|
||||||
|
|
||||||
class _NoiseFilterAgent(Agent):
|
class _NoiseFilterAgent(Agent):
|
||||||
"""Agent that suppresses ElevenLabs noise annotations before LLM sees them.
|
"""Agent that suppresses ElevenLabs STT artifacts before LLM sees them.
|
||||||
|
|
||||||
Uses on_user_turn_completed() which runs after VAD+STT, so no backpressure
|
Uses on_user_turn_completed() which runs after VAD+STT, so no backpressure
|
||||||
risk to the audio pipeline. Raises StopResponse to silently discard noise.
|
risk to the audio pipeline. Raises StopResponse to silently discard noise.
|
||||||
@@ -73,9 +89,9 @@ class _NoiseFilterAgent(Agent):
|
|||||||
self, turn_ctx: llm.ChatContext, new_message: llm.ChatMessage
|
self, turn_ctx: llm.ChatContext, new_message: llm.ChatMessage
|
||||||
) -> None:
|
) -> None:
|
||||||
text = (new_message.text_content or "").strip()
|
text = (new_message.text_content or "").strip()
|
||||||
if text and _NOISE_ANNOTATION_RE.match(text):
|
if text and _is_stt_artifact(text):
|
||||||
logger.info("Noise annotation suppressed: %s", text)
|
logger.info("STT artifact suppressed: %s", text)
|
||||||
# Remove the noise message from context so it doesn't accumulate
|
# Remove the artifact from context so it doesn't accumulate
|
||||||
if turn_ctx.items and turn_ctx.items[-1] is new_message:
|
if turn_ctx.items and turn_ctx.items[-1] is new_message:
|
||||||
turn_ctx.items.pop()
|
turn_ctx.items.pop()
|
||||||
raise StopResponse()
|
raise StopResponse()
|
||||||
|
|||||||
Reference in New Issue
Block a user