feat: Blinkist-style audio summary bot (MAT-74)
Add interactive article summary feature: user pastes URL → bot asks language/duration/topics → generates audio summary via LLM + ElevenLabs TTS → posts MP3 inline with transcript and follow-up Q&A. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
146
article_summary/extractor.py
Normal file
146
article_summary/extractor.py
Normal file
@@ -0,0 +1,146 @@
|
||||
"""Article content extraction via Firecrawl with BeautifulSoup fallback."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Module logger, namespaced under the article-summary feature.
logger = logging.getLogger("article-summary.extractor")

# Hard cap (in characters) on extracted article text handed downstream
# (keeps LLM prompt sizes bounded).
MAX_CONTENT_CHARS = 15_000
|
||||
|
||||
# Domains that are not articles (social media, file hosts, etc.)
NON_ARTICLE_DOMAINS = {
    "youtube.com", "youtu.be", "twitter.com", "x.com", "instagram.com",
    "facebook.com", "tiktok.com", "reddit.com", "discord.com",
    "drive.google.com", "docs.google.com", "github.com",
}


def is_article_url(url: str) -> bool:
    """Check if URL is likely an article (not social media, files, etc.).

    A URL is rejected when it has no hostname, or when its hostname is one
    of (or a subdomain of) the entries in NON_ARTICLE_DOMAINS — e.g. both
    "youtube.com" and "m.youtube.com" are rejected.

    Returns:
        True if the URL looks like an article page; False otherwise,
        including on any parse failure (fail closed).
    """
    try:
        from urllib.parse import urlparse

        host = urlparse(url).hostname or ""
        host = host.removeprefix("www.")
        if not host:
            # Strings without a hostname (empty input, relative paths,
            # arbitrary text) cannot be article URLs.
            return False
        # Match the blocked domain itself and any of its subdomains
        # (exact matching alone would let m.youtube.com etc. through).
        return not any(
            host == blocked or host.endswith("." + blocked)
            for blocked in NON_ARTICLE_DOMAINS
        )
    except Exception:
        # Fail closed: anything unparseable is treated as non-article.
        return False
|
||||
|
||||
|
||||
async def extract_article(url: str, firecrawl_url: str | None = None) -> dict | None:
    """Extract article content from URL.

    Tries Firecrawl first (when *firecrawl_url* is configured), then falls
    back to a plain httpx fetch parsed with BeautifulSoup. Content is
    truncated to MAX_CONTENT_CHARS before being returned.

    Returns:
        Dict with keys: title, content, word_count.
        None if both extraction strategies fail to produce content.
    """
    title, content = "", ""

    # Preferred path: Firecrawl (handles JS-heavy pages, returns markdown).
    if firecrawl_url:
        try:
            extracted = await _firecrawl_extract(url, firecrawl_url)
            if extracted:
                title, content = extracted
        except Exception:
            logger.warning("Firecrawl extraction failed for %s", url, exc_info=True)

    # Fallback path: raw HTML fetch + BeautifulSoup text extraction.
    if not content:
        try:
            extracted = await _bs4_extract(url)
            if extracted:
                title, content = extracted
        except Exception:
            logger.warning("BS4 extraction failed for %s", url, exc_info=True)

    if not content:
        return None

    clipped = content[:MAX_CONTENT_CHARS]
    return {
        # Fall back to the URL itself when no title could be extracted.
        "title": title or url,
        "content": clipped,
        # Word count of the (possibly truncated) content actually returned.
        "word_count": len(clipped.split()),
    }
|
||||
|
||||
|
||||
async def _firecrawl_extract(url: str, firecrawl_url: str) -> tuple[str, str] | None:
    """Extract via Firecrawl API.

    POSTs to the self-hosted Firecrawl /v1/scrape endpoint and returns a
    (title, markdown) tuple, or None when no markdown content came back.
    Raises httpx errors (connect/timeout/HTTP status) to the caller.
    """
    payload = {"url": url, "formats": ["markdown"]}
    async with httpx.AsyncClient(timeout=30.0) as client:
        resp = await client.post(f"{firecrawl_url}/v1/scrape", json=payload)
        resp.raise_for_status()
        data = resp.json()

    doc = data.get("data", {})
    markdown = doc.get("markdown", "")
    if not markdown:
        return None
    return doc.get("metadata", {}).get("title", ""), markdown
|
||||
|
||||
|
||||
async def _bs4_extract(url: str) -> tuple[str, str] | None:
    """Fallback extraction via httpx + BeautifulSoup.

    Fetches the page, strips boilerplate tags, and pulls text from the
    most article-like container. Returns (title, text), or None when no
    container is found or the text is too short to be a real article.
    Raises httpx errors (connect/timeout/HTTP status) to the caller.
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (compatible; ArticleSummaryBot/1.0)",
        "Accept": "text/html",
    }
    async with httpx.AsyncClient(timeout=20.0, follow_redirects=True) as client:
        resp = await client.get(url, headers=request_headers)
        resp.raise_for_status()

    soup = BeautifulSoup(resp.text, "html.parser")

    # Grab the <title> before boilerplate removal mutates the tree.
    title = soup.title.get_text(strip=True) if soup.title else ""

    # Drop scripts, styles and chrome elements that pollute extracted text.
    for tag in soup(["script", "style", "nav", "header", "footer", "aside", "form"]):
        tag.decompose()

    # Prefer semantic containers, falling back to the whole body.
    root = soup.find("article") or soup.find("main") or soup.find("body")
    if not root:
        return None

    # Collapse runs of 3+ newlines left behind by block-level elements.
    text = re.sub(r"\n{3,}", "\n\n", root.get_text(separator="\n", strip=True))

    # Too little text means we most likely hit a paywall/error/empty shell.
    if len(text) < 100:
        return None

    return title, text
|
||||
|
||||
|
||||
async def detect_topics(content: str, llm_client, model: str) -> list[str]:
    """Use LLM to detect 3-5 key topics from article content.

    Only the first 2000 characters of *content* are sent to the model.
    Best-effort: any failure is logged and an empty list is returned.

    Args:
        content: Extracted article text.
        llm_client: OpenAI-compatible async client (chat.completions API).
        model: Model identifier to use for the completion.

    Returns:
        Up to five short topic labels, possibly empty on failure.
    """
    instructions = (
        "Extract 3-5 key topics from this article. Return ONLY a comma-separated "
        "list of short topic labels (2-4 words each). No numbering, no explanation."
    )
    try:
        response = await llm_client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": instructions},
                {"role": "user", "content": content[:2000]},
            ],
            max_tokens=100,
            temperature=0.3,
        )
        labels = response.choices[0].message.content.strip()
        # Split on commas, discarding empty fragments and stray whitespace.
        parsed = [part.strip() for part in labels.split(",") if part.strip()]
        return parsed[:5]
    except Exception:
        logger.warning("Topic detection failed", exc_info=True)
        return []
|
||||
Reference in New Issue
Block a user