feat: Blinkist-style audio summary bot (MAT-74)
Add interactive article summary feature: user pastes URL → bot asks language/duration/topics → generates audio summary via LLM + ElevenLabs TTS → posts MP3 inline with transcript and follow-up Q&A. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
146
article_summary/extractor.py
Normal file
146
article_summary/extractor.py
Normal file
@@ -0,0 +1,146 @@
|
||||
"""Article content extraction via Firecrawl with BeautifulSoup fallback."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Module logger, namespaced under the article-summary feature.
logger = logging.getLogger("article-summary.extractor")

# Hard cap (in characters) on extracted article text handed downstream
# (keeps LLM prompt sizes bounded).
MAX_CONTENT_CHARS = 15_000
|
||||
|
||||
# Domains that are not articles (social media, file hosts, etc.)
NON_ARTICLE_DOMAINS = {
    "youtube.com", "youtu.be", "twitter.com", "x.com", "instagram.com",
    "facebook.com", "tiktok.com", "reddit.com", "discord.com",
    "drive.google.com", "docs.google.com", "github.com",
}


def is_article_url(url: str) -> bool:
    """Check if URL is likely an article (not social media, files, etc.).

    A URL is rejected when it has no hostname, or when its hostname is one
    of (or a subdomain of) the entries in NON_ARTICLE_DOMAINS — e.g. both
    "youtube.com" and "m.youtube.com" are rejected.

    Returns:
        True if the URL looks like an article page; False otherwise,
        including on any parse failure (fail closed).
    """
    try:
        from urllib.parse import urlparse

        host = urlparse(url).hostname or ""
        host = host.removeprefix("www.")
        if not host:
            # Strings without a hostname (empty input, relative paths,
            # arbitrary text) cannot be article URLs.
            return False
        # Match the blocked domain itself and any of its subdomains
        # (exact matching alone would let m.youtube.com etc. through).
        return not any(
            host == blocked or host.endswith("." + blocked)
            for blocked in NON_ARTICLE_DOMAINS
        )
    except Exception:
        # Fail closed: anything unparseable is treated as non-article.
        return False
|
||||
|
||||
|
||||
async def extract_article(url: str, firecrawl_url: str | None = None) -> dict | None:
    """Extract article content from URL.

    Tries Firecrawl first (when *firecrawl_url* is configured), then falls
    back to a plain httpx fetch parsed with BeautifulSoup. Content is
    truncated to MAX_CONTENT_CHARS before being returned.

    Returns:
        Dict with keys: title, content, word_count.
        None if both extraction strategies fail to produce content.
    """
    title, content = "", ""

    # Preferred path: Firecrawl (handles JS-heavy pages, returns markdown).
    if firecrawl_url:
        try:
            extracted = await _firecrawl_extract(url, firecrawl_url)
            if extracted:
                title, content = extracted
        except Exception:
            logger.warning("Firecrawl extraction failed for %s", url, exc_info=True)

    # Fallback path: raw HTML fetch + BeautifulSoup text extraction.
    if not content:
        try:
            extracted = await _bs4_extract(url)
            if extracted:
                title, content = extracted
        except Exception:
            logger.warning("BS4 extraction failed for %s", url, exc_info=True)

    if not content:
        return None

    clipped = content[:MAX_CONTENT_CHARS]
    return {
        # Fall back to the URL itself when no title could be extracted.
        "title": title or url,
        "content": clipped,
        # Word count of the (possibly truncated) content actually returned.
        "word_count": len(clipped.split()),
    }
|
||||
|
||||
|
||||
async def _firecrawl_extract(url: str, firecrawl_url: str) -> tuple[str, str] | None:
    """Extract via Firecrawl API.

    POSTs to the self-hosted Firecrawl /v1/scrape endpoint and returns a
    (title, markdown) tuple, or None when no markdown content came back.
    Raises httpx errors (connect/timeout/HTTP status) to the caller.
    """
    payload = {"url": url, "formats": ["markdown"]}
    async with httpx.AsyncClient(timeout=30.0) as client:
        resp = await client.post(f"{firecrawl_url}/v1/scrape", json=payload)
        resp.raise_for_status()
        data = resp.json()

    doc = data.get("data", {})
    markdown = doc.get("markdown", "")
    if not markdown:
        return None
    return doc.get("metadata", {}).get("title", ""), markdown
|
||||
|
||||
|
||||
async def _bs4_extract(url: str) -> tuple[str, str] | None:
    """Fallback extraction via httpx + BeautifulSoup.

    Fetches the page, strips boilerplate tags, and pulls text from the
    most article-like container. Returns (title, text), or None when no
    container is found or the text is too short to be a real article.
    Raises httpx errors (connect/timeout/HTTP status) to the caller.
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (compatible; ArticleSummaryBot/1.0)",
        "Accept": "text/html",
    }
    async with httpx.AsyncClient(timeout=20.0, follow_redirects=True) as client:
        resp = await client.get(url, headers=request_headers)
        resp.raise_for_status()

    soup = BeautifulSoup(resp.text, "html.parser")

    # Grab the <title> before boilerplate removal mutates the tree.
    title = soup.title.get_text(strip=True) if soup.title else ""

    # Drop scripts, styles and chrome elements that pollute extracted text.
    for tag in soup(["script", "style", "nav", "header", "footer", "aside", "form"]):
        tag.decompose()

    # Prefer semantic containers, falling back to the whole body.
    root = soup.find("article") or soup.find("main") or soup.find("body")
    if not root:
        return None

    # Collapse runs of 3+ newlines left behind by block-level elements.
    text = re.sub(r"\n{3,}", "\n\n", root.get_text(separator="\n", strip=True))

    # Too little text means we most likely hit a paywall/error/empty shell.
    if len(text) < 100:
        return None

    return title, text
|
||||
|
||||
|
||||
async def detect_topics(content: str, llm_client, model: str) -> list[str]:
    """Use LLM to detect 3-5 key topics from article content.

    Only the first 2000 characters of *content* are sent to the model.
    Best-effort: any failure is logged and an empty list is returned.

    Args:
        content: Extracted article text.
        llm_client: OpenAI-compatible async client (chat.completions API).
        model: Model identifier to use for the completion.

    Returns:
        Up to five short topic labels, possibly empty on failure.
    """
    instructions = (
        "Extract 3-5 key topics from this article. Return ONLY a comma-separated "
        "list of short topic labels (2-4 words each). No numbering, no explanation."
    )
    try:
        response = await llm_client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": instructions},
                {"role": "user", "content": content[:2000]},
            ],
            max_tokens=100,
            temperature=0.3,
        )
        labels = response.choices[0].message.content.strip()
        # Split on commas, discarding empty fragments and stray whitespace.
        parsed = [part.strip() for part in labels.split(",") if part.strip()]
        return parsed[:5]
    except Exception:
        logger.warning("Topic detection failed", exc_info=True)
        return []
|
||||
Reference in New Issue
Block a user