Add interactive article summary feature: user pastes URL → bot asks language/duration/topics → generates audio summary via LLM + ElevenLabs TTS → posts MP3 inline with transcript and follow-up Q&A. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
147 lines
4.4 KiB
Python
147 lines
4.4 KiB
Python
"""Article content extraction via Firecrawl with BeautifulSoup fallback."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import re
|
|
|
|
import httpx
|
|
from bs4 import BeautifulSoup
|
|
|
|
# Module-level logger, namespaced under the bot's feature name.
logger = logging.getLogger("article-summary.extractor")

# Hard cap on extracted article text (characters) to bound LLM prompt size.
MAX_CONTENT_CHARS = 15_000

# Domains that are not articles (social media, file hosts, etc.)
NON_ARTICLE_DOMAINS = {
    "youtube.com", "youtu.be", "twitter.com", "x.com", "instagram.com",
    "facebook.com", "tiktok.com", "reddit.com", "discord.com",
    "drive.google.com", "docs.google.com", "github.com",
}
|
|
|
|
|
|
def is_article_url(url: str) -> bool:
|
|
"""Check if URL is likely an article (not social media, files, etc.)."""
|
|
try:
|
|
from urllib.parse import urlparse
|
|
host = urlparse(url).hostname or ""
|
|
host = host.removeprefix("www.")
|
|
return host not in NON_ARTICLE_DOMAINS
|
|
except Exception:
|
|
return False
|
|
|
|
|
|
async def extract_article(url: str, firecrawl_url: str | None = None) -> dict | None:
    """Extract article content from a URL.

    Tries the Firecrawl scrape API first (when ``firecrawl_url`` is
    configured), then falls back to a direct httpx fetch parsed with
    BeautifulSoup. Each strategy is best-effort: failures are logged
    and the next strategy is attempted.

    Args:
        url: The article URL to fetch.
        firecrawl_url: Base URL of a Firecrawl instance, or None to use
            only the BeautifulSoup fallback.

    Returns:
        Dict with keys ``title`` (falls back to the URL itself),
        ``content`` (truncated to MAX_CONTENT_CHARS), and
        ``word_count`` (computed on the truncated content).
        Returns None if both strategies fail or yield no content.
    """
    title = ""
    content = ""

    # Firecrawl generally produces cleaner markdown; try it first.
    if firecrawl_url:
        try:
            result = await _firecrawl_extract(url, firecrawl_url)
            if result:
                title, content = result
        except Exception:
            # Best-effort: log and fall through to the BS4 path.
            logger.warning("Firecrawl extraction failed for %s", url, exc_info=True)

    # Fallback to BeautifulSoup
    if not content:
        try:
            result = await _bs4_extract(url)
            if result:
                title, content = result
        except Exception:
            logger.warning("BS4 extraction failed for %s", url, exc_info=True)

    if not content:
        return None

    # Bound downstream prompt size; word count reflects what is kept.
    content = content[:MAX_CONTENT_CHARS]
    word_count = len(content.split())

    return {
        "title": title or url,
        "content": content,
        "word_count": word_count,
    }
|
|
|
|
|
|
async def _firecrawl_extract(url: str, firecrawl_url: str) -> tuple[str, str] | None:
    """Scrape *url* via a Firecrawl instance.

    Returns a (title, markdown) pair, or None when the response carries
    no markdown content. Raises httpx errors on transport/HTTP failure.
    """
    endpoint = f"{firecrawl_url}/v1/scrape"
    payload = {"url": url, "formats": ["markdown"]}

    async with httpx.AsyncClient(timeout=30.0) as client:
        resp = await client.post(endpoint, json=payload)
        resp.raise_for_status()
        data = resp.json()

    doc = data.get("data", {})
    markdown = doc.get("markdown", "")
    if not markdown:
        return None
    page_title = doc.get("metadata", {}).get("title", "")
    return page_title, markdown
|
|
|
|
|
|
async def _bs4_extract(url: str) -> tuple[str, str] | None:
    """Fallback extraction: fetch with httpx, parse with BeautifulSoup.

    Returns a (title, text) pair, or None when no content container is
    found or the extracted text is too short to be a real article.
    Raises httpx errors on transport/HTTP failure.
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (compatible; ArticleSummaryBot/1.0)",
        "Accept": "text/html",
    }
    async with httpx.AsyncClient(timeout=20.0, follow_redirects=True) as client:
        resp = await client.get(url, headers=request_headers)
        resp.raise_for_status()

    soup = BeautifulSoup(resp.text, "html.parser")

    # Page title, if present.
    page_title = soup.title.get_text(strip=True) if soup.title else ""

    # Strip scripts, styling, and page chrome before text extraction.
    for junk in soup(["script", "style", "nav", "header", "footer", "aside", "form"]):
        junk.decompose()

    # Prefer semantic containers, falling back to the whole body.
    container = soup.find("article") or soup.find("main") or soup.find("body")
    if not container:
        return None

    # Flatten to text and collapse excessive blank lines.
    body_text = container.get_text(separator="\n", strip=True)
    body_text = re.sub(r"\n{3,}", "\n\n", body_text)

    # Too short to plausibly be an article body.
    if len(body_text) < 100:
        return None

    return page_title, body_text
|
|
|
|
|
|
async def detect_topics(content: str, llm_client, model: str) -> list[str]:
    """Use an LLM to detect up to 5 key topics from article content.

    Args:
        content: Full article text; only the first 2000 characters are
            sent to keep the call cheap.
        llm_client: Async OpenAI-compatible client exposing
            ``chat.completions.create``.
        model: Model name passed through to the completion call.

    Returns:
        Up to 5 short topic-label strings; empty list on any failure
        (logged, never raised).
    """
    snippet = content[:2000]
    try:
        resp = await llm_client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "Extract 3-5 key topics from this article. Return ONLY a comma-separated list of short topic labels (2-4 words each). No numbering, no explanation."},
                {"role": "user", "content": snippet},
            ],
            max_tokens=100,
            temperature=0.3,
        )
        # message.content may be None (e.g. refusal/tool responses);
        # treat that as an empty reply instead of raising AttributeError.
        raw = (resp.choices[0].message.content or "").strip()
        topics = [t.strip() for t in raw.split(",") if t.strip()]
        return topics[:5]
    except Exception:
        logger.warning("Topic detection failed", exc_info=True)
        return []
|