"""Article content extraction via Firecrawl with BeautifulSoup fallback.""" from __future__ import annotations import logging import re import httpx from bs4 import BeautifulSoup logger = logging.getLogger("article-summary.extractor") MAX_CONTENT_CHARS = 15_000 # Domains that are not articles (social media, file hosts, etc.) NON_ARTICLE_DOMAINS = { "youtube.com", "youtu.be", "twitter.com", "x.com", "instagram.com", "facebook.com", "tiktok.com", "reddit.com", "discord.com", "drive.google.com", "docs.google.com", "github.com", } def is_article_url(url: str) -> bool: """Check if URL is likely an article (not social media, files, etc.).""" try: from urllib.parse import urlparse host = urlparse(url).hostname or "" host = host.removeprefix("www.") return host not in NON_ARTICLE_DOMAINS except Exception: return False async def extract_article(url: str, firecrawl_url: str | None = None) -> dict | None: """Extract article content from URL. Returns dict with: title, content, word_count, detected_topics, language_hint Returns None if extraction fails. """ title = "" content = "" # Try Firecrawl first if firecrawl_url: try: result = await _firecrawl_extract(url, firecrawl_url) if result: title, content = result except Exception: logger.warning("Firecrawl extraction failed for %s", url, exc_info=True) # Fallback to BeautifulSoup if not content: try: result = await _bs4_extract(url) if result: title, content = result except Exception: logger.warning("BS4 extraction failed for %s", url, exc_info=True) if not content: return None content = content[:MAX_CONTENT_CHARS] word_count = len(content.split()) return { "title": title or url, "content": content, "word_count": word_count, } async def _firecrawl_extract(url: str, firecrawl_url: str) -> tuple[str, str] | None: """Extract via Firecrawl API.""" async with httpx.AsyncClient(timeout=30.0) as client: resp = await client.post( f"{firecrawl_url}/v1/scrape", json={"url": url, "formats": ["markdown"]}, ) resp.raise_for_status() data = resp.json() doc = data.get("data", {}) title = doc.get("metadata", {}).get("title", "") content = doc.get("markdown", "") if not content: return None return title, content async def _bs4_extract(url: str) -> tuple[str, str] | None: """Fallback extraction via httpx + BeautifulSoup.""" headers = { "User-Agent": "Mozilla/5.0 (compatible; ArticleSummaryBot/1.0)", "Accept": "text/html", } async with httpx.AsyncClient(timeout=20.0, follow_redirects=True) as client: resp = await client.get(url, headers=headers) resp.raise_for_status() soup = BeautifulSoup(resp.text, "html.parser") # Extract title title = "" if soup.title: title = soup.title.get_text(strip=True) # Remove script/style/nav elements for tag in soup(["script", "style", "nav", "header", "footer", "aside", "form"]): tag.decompose() # Try
tag first, then
, then body article = soup.find("article") or soup.find("main") or soup.find("body") if not article: return None # Get text, clean up whitespace text = article.get_text(separator="\n", strip=True) text = re.sub(r"\n{3,}", "\n\n", text) if len(text) < 100: return None return title, text async def detect_topics(content: str, llm_client, model: str) -> list[str]: """Use LLM to detect 3-5 key topics from article content.""" snippet = content[:2000] try: resp = await llm_client.chat.completions.create( model=model, messages=[ {"role": "system", "content": "Extract 3-5 key topics from this article. Return ONLY a comma-separated list of short topic labels (2-4 words each). No numbering, no explanation."}, {"role": "user", "content": snippet}, ], max_tokens=100, temperature=0.3, ) raw = resp.choices[0].message.content.strip() topics = [t.strip() for t in raw.split(",") if t.strip()] return topics[:5] except Exception: logger.warning("Topic detection failed", exc_info=True) return []