# scraper.py
from newspaper import Article, Config

# Fake a real browser → fixes 403 Forbidden on some sites (Politico, WSJ, etc.)
config = Config()
config.browser_user_agent = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/130.0.0.0 Safari/537.36"
)
config.request_timeout = 15
config.fetch_images = False  # faster

# Extracted text shorter than this many characters is reported as a failed
# extraction rather than returned to the caller.
_MIN_TEXT_LENGTH = 100


def extract_article_text(url: str) -> str:
    """Download the article at *url* and return its extracted body text.

    Failures are reported in-band rather than raised: every failure path
    returns a human-readable message that starts with ``"Error:"``.

    Args:
        url: Address of the article to fetch.

    Returns:
        The stripped article text on success, or an ``"Error: ..."`` message
        when the URL is blank, the download yields no HTML, fetching/parsing
        raises, or fewer than ``_MIN_TEXT_LENGTH`` characters are extracted.
    """
    # Reject blank input up front so the caller gets a clear message instead
    # of whatever the library does with an empty URL.
    if not url or not url.strip():
        return "Error: No URL provided. Please paste the article text directly."

    try:
        article = Article(url, config=config)
        article.download()

        # Check if download succeeded
        if not article.html:
            return "Error: Download failed (site may block bots). Please paste the article text directly."

        article.parse()
        text = article.text.strip()
    except Exception as e:
        # Deliberate catch-all: this function is the boundary that converts
        # any fetch/parse failure into a user-facing message.
        return (
            f"Error: Failed to fetch article — {e}. Try pasting the text directly."
        )

    if len(text) < _MIN_TEXT_LENGTH:
        return "Error: Very little text extracted — try pasting the article text manually."

    return text