From dd61ba794ff4817c71be187d75fa9b2017ab9b4e Mon Sep 17 00:00:00 2001
From: ljluestc
Date: Sun, 22 Jun 2025 09:47:18 -0700
Subject: [PATCH 1/2] Add restrict_to_homepage_urls option to limit scraping to homepage links (#134)

---
 README_HOMEPAGE_RESTRICTION.md | 65 +++++++++++++++++++++++
 download_nltk_data.py          | 10 ++++
 newspaper/api.py               | 10 +++-
 test_homepage_restriction.py   | 95 ++++++++++++++++++++++++++++++++++
 tests/test_reuters.py          | 90 ++++++++++++++++++++++++++++++++
 5 files changed, 268 insertions(+), 2 deletions(-)
 create mode 100644 README_HOMEPAGE_RESTRICTION.md
 create mode 100644 download_nltk_data.py
 create mode 100644 test_homepage_restriction.py
 create mode 100644 tests/test_reuters.py

diff --git a/README_HOMEPAGE_RESTRICTION.md b/README_HOMEPAGE_RESTRICTION.md
new file mode 100644
index 000000000..44723b55e
--- /dev/null
+++ b/README_HOMEPAGE_RESTRICTION.md
@@ -0,0 +1,65 @@
+# Homepage URL Restriction Feature
+
+## Overview
+
+This feature allows you to limit article scraping to only URLs that appear directly on a news source's homepage, rather than crawling the entire site structure. This is useful for sites like Reuters where you only want to extract articles currently featured on the homepage.
+
+## Usage
+
+```python
+import newspaper
+
+# Normal usage (crawls entire site structure)
+reuters = newspaper.build('https://www.reuters.com')
+
+# Restricted to only homepage URLs
+reuters_homepage = newspaper.build(
+    'https://www.reuters.com',
+    restrict_to_homepage_urls=True
+)
+
+print(f"All articles: {len(reuters.articles)}")
+print(f"Homepage articles: {len(reuters_homepage.articles)}")
+```
+
+## How It Works
+
+1. The `build()` function accepts a new `restrict_to_homepage_urls` parameter (default: False)
+2. When set to True, the Source object extracts all URLs from `<a>` tags on the homepage
+3. After article generation, the articles list is filtered to include only those with URLs matching the homepage links
+4. This significantly reduces the number of articles processed, focusing only on currently featured content
+
+## Example Results
+
+When scraping Reuters:
+- Normal mode: ~1000+ articles (crawls archives, categories, etc.)
+- Homepage restricted: ~200-300 articles (only what's visible on the homepage)
+
+## Performance Benefits
+
+- Faster processing (fewer articles to download and parse)
+- More focused results (only current/featured articles)
+- Reduced server load (fewer requests)
+- Better control over what content is scraped
+
+## Running the Demo
+
+A demonstration script is included to show the difference between normal and restricted modes:
+
+```
+python test_homepage_restriction.py [optional_url]
+```
+
+The script will show article counts for both methods and process a sample of the homepage articles.
+
+## Testing
+
+A test case for this feature is included in `tests/test_reuters.py`. Run it with:
+
+```
+python -m unittest tests/test_reuters.py
+```
+
+## Acknowledgments
+
+This feature was developed in response to [GitHub issue #455](https://github.com/codelucas/newspaper/issues/455) to provide better control over article scraping scope.
diff --git a/download_nltk_data.py b/download_nltk_data.py
new file mode 100644
index 000000000..17de9e94e
--- /dev/null
+++ b/download_nltk_data.py
@@ -0,0 +1,10 @@
+import nltk
+
+def download_nltk_data():
+    """Download required NLTK data for the newspaper library"""
+    print("Downloading NLTK data for newspaper library...")
+    nltk.download('punkt')
+
+if __name__ == "__main__":
+    download_nltk_data()
+    print("\nNLTK data download complete. Now you can run your tests.")
diff --git a/newspaper/api.py b/newspaper/api.py
index fb98e81da..428948073 100644
--- a/newspaper/api.py
+++ b/newspaper/api.py
@@ -18,14 +18,20 @@
 from .utils import extend_config, print_available_languages
 
 
-def build(url='', dry=False, config=None, **kwargs) -> Source:
+def build(url='', dry=False, config=None, restrict_to_homepage_urls=False, **kwargs) -> Source:
     """Returns a constructed source object without downloading or parsing
     the articles
+
+    :param url: URL of the source (homepage)
+    :param dry: If True, don't build the source (download and parse)
+    :param config: Configuration object
+    :param restrict_to_homepage_urls: If True, only articles linked directly from the homepage will be processed
+    :param kwargs: Additional keyword arguments to pass to the Source constructor
     """
     config = config or Configuration()
     config = extend_config(config, kwargs)
     url = url or ''
-    s = Source(url, config=config)
+    s = Source(url, config=config, restrict_to_homepage_urls=restrict_to_homepage_urls)
     if not dry:
         s.build()
     return s
diff --git a/test_homepage_restriction.py b/test_homepage_restriction.py
new file mode 100644
index 000000000..00cd6c7b3
--- /dev/null
+++ b/test_homepage_restriction.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Demonstration script for the restrict_to_homepage_urls feature.
+
+This script shows how to use the new feature to scrape only articles
+listed on a news site's homepage rather than crawling the entire site.
+"""
+
+import os
+import sys
+import time
+import newspaper
+from newspaper import Article
+
+
+def print_article_info(article, index):
+    """Print basic information about an article"""
+    print(f"\n[{index}] {article.title}")
+    print(f"URL: {article.url}")
+    print(f"Published: {article.publish_date}")
+    print(f"Summary: {article.summary[:150]}..."
+          if article.summary else "No summary available")
+
+
+def save_to_file(articles, filename):
+    """Save article information to a file"""
+    with open(filename, 'w', encoding='utf-8') as f:
+        f.write(f"Total articles: {len(articles)}\n\n")
+        for i, article in enumerate(articles, 1):
+            f.write(f"[{i}] {article.title}\n")
+            f.write(f"URL: {article.url}\n")
+            f.write(f"Published: {article.publish_date}\n")
+            f.write(f"Summary: {article.summary[:200]}...\n" if article.summary else "No summary available\n")
+            f.write("-" * 80 + "\n\n")
+    print(f"Saved {len(articles)} articles to {filename}")
+
+
+def main():
+    # Set up output directory
+    output_dir = "reuters_articles"
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+
+    # Get the URL from command line or use default
+    url = sys.argv[1] if len(sys.argv) > 1 else "https://www.reuters.com"
+
+    print(f"Scraping articles from {url}...")
+
+    # First, demonstrate normal behavior (crawls entire site)
+    start_time = time.time()
+    print("\nBuilding source WITHOUT homepage restriction...")
+    news_unrestricted = newspaper.build(url, memoize_articles=False, fetch_images=False, number_threads=1)
+    print(f"Found {len(news_unrestricted.articles)} articles without restriction")
+    print(f"Time taken: {time.time() - start_time:.2f} seconds")
+
+    # Now demonstrate the new feature
+    start_time = time.time()
+    print("\nBuilding source WITH homepage restriction...")
+    news_restricted = newspaper.build(
+        url,
+        restrict_to_homepage_urls=True,
+        memoize_articles=False,
+        fetch_images=False,
+        number_threads=1
+    )
+    print(f"Found {len(news_restricted.articles)} articles with homepage restriction")
+    print(f"Time taken: {time.time() - start_time:.2f} seconds")
+
+    # Download and process restricted articles
+    print("\nDownloading and processing homepage articles...")
+    processed_count = 0
+    successful_articles = []
+
+    for i, article in enumerate(news_restricted.articles[:20], 1):  # Process up to 20 articles
+        try:
+            print(f"Processing article {i}/{min(20, len(news_restricted.articles))}...")
+            article.download()
+            article.parse()
+            article.nlp()
+            processed_count += 1
+            successful_articles.append(article)
+            print_article_info(article, i)
+        except Exception as e:
+            print(f"Error processing article {i}: {e}")
+
+    print(f"\nSuccessfully processed {processed_count} articles")
+
+    # Save results to file
+    if successful_articles:
+        save_to_file(successful_articles, os.path.join(output_dir, "homepage_articles.txt"))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/test_reuters.py b/tests/test_reuters.py
new file mode 100644
index 000000000..72c1bca88
--- /dev/null
+++ b/tests/test_reuters.py
@@ -0,0 +1,90 @@
+# -*- coding: utf-8 -*-
+
+"""
+Test the homepage URL restriction feature with the Reuters website.
+""" + +import unittest +import re +import requests +from bs4 import BeautifulSoup +from newspaper import build +from newspaper.article import Article + + +class TestReutersScraper(unittest.TestCase): + def test_restrict_to_homepage_urls(self): + """Test that only URLs from the Reuters homepage are processed when restrict_to_homepage_urls=True""" + # Skip this test if Reuters is not accessible + try: + requests.get("https://www.reuters.com", timeout=5) + except (requests.exceptions.RequestException, requests.exceptions.Timeout): + self.skipTest("Reuters website not accessible") + + # Build the source with restricted URLs + news = build("https://www.reuters.com", + restrict_to_homepage_urls=True, + memoize_articles=False, + fetch_images=False, + number_threads=1) + + # Verify we have a reasonable number of articles (not too many, not too few) + # Count may vary based on Reuters homepage changes + self.assertLessEqual(news.size(), 500, "Too many articles scraped") + self.assertGreater(news.size(), 50, "Too few articles scraped") + + # Check if article URLs look like Reuters article URLs + article_pattern = re.compile(r'^https://www\.reuters\.com/.*') + for article in news.articles[:10]: # Check first 10 articles + self.assertTrue( + article_pattern.match(article.url), + f"Invalid article URL: {article.url}" + ) + + def test_manual_homepage_extraction(self): + """Test a manual process to extract and process homepage URLs""" + # Skip this test if Reuters is not accessible + try: + resp = requests.get("https://www.reuters.com", timeout=5) + except (requests.exceptions.RequestException, requests.exceptions.Timeout): + self.skipTest("Reuters website not accessible") + + # Parse homepage HTML to extract article URLs + soup = BeautifulSoup(resp.text, 'html.parser') + homepage_urls = set() + + # Extract and normalize article URLs from tags + for a_tag in soup.find_all('a', href=True): + href = a_tag['href'] + if href.startswith('/'): + href = "https://www.reuters.com" + href + if re.match(r'^https://www\.reuters\.com/.*', href) and \ + not re.search(r'/(video|gallery|slideshow)/', href): + homepage_urls.add(href) + + # Verify we found a reasonable number of URLs + self.assertGreater(len(homepage_urls), 50, "Too few URLs found on homepage") + self.assertLess(len(homepage_urls), 500, "Too many URLs found on homepage") + + # Process a small sample of URLs + sample_size = min(5, len(homepage_urls)) + processed = 0 + + for url in list(homepage_urls)[:sample_size]: + try: + article = Article(url, language='en', fetch_images=False) + article.download() + article.parse() + article.nlp() + self.assertTrue(article.title, f"No title for {url}") + self.assertTrue(article.text.strip(), f"No text for {url}") + processed += 1 + except Exception as e: + print(f"Error processing {url}: {e}") + + # Verify we processed the expected number of articles + self.assertEqual(processed, sample_size, "Failed to process all sample articles") + + +if __name__ == '__main__': + unittest.main() From 60529ea64a11136c7e84d80e471701fc0b37b18a Mon Sep 17 00:00:00 2001 From: ljluestc Date: Sun, 22 Jun 2025 09:53:02 -0700 Subject: [PATCH 2/2] remove irrelevant commits --- README_HOMEPAGE_RESTRICTION.md | 65 ---------------------------------- download_nltk_data.py | 10 ------ 2 files changed, 75 deletions(-) delete mode 100644 README_HOMEPAGE_RESTRICTION.md delete mode 100644 download_nltk_data.py diff --git a/README_HOMEPAGE_RESTRICTION.md b/README_HOMEPAGE_RESTRICTION.md deleted file mode 100644 index 44723b55e..000000000 --- 
+++ /dev/null
@@ -1,65 +0,0 @@
-# Homepage URL Restriction Feature
-
-## Overview
-
-This feature allows you to limit article scraping to only URLs that appear directly on a news source's homepage, rather than crawling the entire site structure. This is useful for sites like Reuters where you only want to extract articles currently featured on the homepage.
-
-## Usage
-
-```python
-import newspaper
-
-# Normal usage (crawls entire site structure)
-reuters = newspaper.build('https://www.reuters.com')
-
-# Restricted to only homepage URLs
-reuters_homepage = newspaper.build(
-    'https://www.reuters.com',
-    restrict_to_homepage_urls=True
-)
-
-print(f"All articles: {len(reuters.articles)}")
-print(f"Homepage articles: {len(reuters_homepage.articles)}")
-```
-
-## How It Works
-
-1. The `build()` function accepts a new `restrict_to_homepage_urls` parameter (default: False)
-2. When set to True, the Source object extracts all URLs from `<a>` tags on the homepage
-3. After article generation, the articles list is filtered to include only those with URLs matching the homepage links
-4. This significantly reduces the number of articles processed, focusing only on currently featured content
-
-## Example Results
-
-When scraping Reuters:
-- Normal mode: ~1000+ articles (crawls archives, categories, etc.)
-- Homepage restricted: ~200-300 articles (only what's visible on the homepage)
-
-## Performance Benefits
-
-- Faster processing (fewer articles to download and parse)
-- More focused results (only current/featured articles)
-- Reduced server load (fewer requests)
-- Better control over what content is scraped
-
-## Running the Demo
-
-A demonstration script is included to show the difference between normal and restricted modes:
-
-```
-python test_homepage_restriction.py [optional_url]
-```
-
-The script will show article counts for both methods and process a sample of the homepage articles.
-
-## Testing
-
-A test case for this feature is included in `tests/test_reuters.py`. Run it with:
-
-```
-python -m unittest tests/test_reuters.py
-```
-
-## Acknowledgments
-
-This feature was developed in response to [GitHub issue #455](https://github.com/codelucas/newspaper/issues/455) to provide better control over article scraping scope.
diff --git a/download_nltk_data.py b/download_nltk_data.py
deleted file mode 100644
index 17de9e94e..000000000
--- a/download_nltk_data.py
+++ /dev/null
@@ -1,10 +0,0 @@
-import nltk
-
-def download_nltk_data():
-    """Download required NLTK data for the newspaper library"""
-    print("Downloading NLTK data for newspaper library...")
-    nltk.download('punkt')
-
-if __name__ == "__main__":
-    download_nltk_data()
-    print("\nNLTK data download complete. Now you can run your tests.")
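
Note on the Source-side behavior: the patch threads `restrict_to_homepage_urls` from `build()` into the `Source` constructor, but the extraction and filtering inside `Source` is not shown in this excerpt. The snippet below is a minimal sketch of how the same filtering can be done from user code today, using only public APIs that already appear in the patch's tests (`newspaper.build`, `requests`, `BeautifulSoup`); the helper names `extract_homepage_urls` and `filter_to_homepage` are illustrative and are not part of the newspaper API.

```python
from urllib.parse import urljoin

import newspaper
import requests
from bs4 import BeautifulSoup


def extract_homepage_urls(homepage_url, timeout=10):
    """Collect absolute URLs from <a> tags on the homepage (sketch)."""
    resp = requests.get(homepage_url, timeout=timeout)
    soup = BeautifulSoup(resp.text, "html.parser")
    # Resolve relative hrefs against the homepage URL
    return {urljoin(homepage_url, a["href"]) for a in soup.find_all("a", href=True)}


def filter_to_homepage(source, homepage_urls):
    """Keep only articles whose URL appears in the homepage link set (sketch)."""
    source.articles = [a for a in source.articles if a.url in homepage_urls]
    return source


if __name__ == "__main__":
    url = "https://www.reuters.com"
    src = newspaper.build(url, memoize_articles=False)
    links = extract_homepage_urls(url)
    filter_to_homepage(src, links)
    print(f"Articles restricted to homepage links: {len(src.articles)}")
```

Filtering after `build()` mirrors step 3 of the README's "How It Works" list: the homepage request stays separate from newspaper's own category crawling, and only the final article list is narrowed.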