From bf467acfaa6853c68546264cb7b51bee2ac38373 Mon Sep 17 00:00:00 2001 From: Paul Tremberth Date: Mon, 27 Jun 2016 13:01:43 +0200 Subject: [PATCH 1/5] Drop deprecated bsddb support and Python 2.6 support --- .bumpversion.cfg | 8 + .travis.yml | 31 ++++ requirements.txt | 2 + scrapy_deltafetch/__init__.py | 1 + scrapy_deltafetch/middleware.py | 106 ++++++++++++ setup.py | 24 +++ tests/test_deltafetch.py | 286 ++++++++++++++++++++++++++++++++ tox.ini | 17 ++ 8 files changed, 475 insertions(+) create mode 100644 .bumpversion.cfg create mode 100644 .travis.yml create mode 100644 requirements.txt create mode 100644 scrapy_deltafetch/__init__.py create mode 100644 scrapy_deltafetch/middleware.py create mode 100644 setup.py create mode 100644 tests/test_deltafetch.py create mode 100644 tox.ini diff --git a/.bumpversion.cfg b/.bumpversion.cfg new file mode 100644 index 0000000..3bd1ff1 --- /dev/null +++ b/.bumpversion.cfg @@ -0,0 +1,8 @@ +[bumpversion] +current_version = 1.7.0 +commit = True +tag = True +tag_name = {new_version} + +[bumpversion:file:setup.py] + diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..a821c7f --- /dev/null +++ b/.travis.yml @@ -0,0 +1,31 @@ +language: python +python: 3.5 + +sudo: false + +env: + matrix: + - TOXENV=py27 + - TOXENV=py35 + +#addons: + #apt: + #packages: + #- language-pack-fr + #- libdb-dev + +install: pip install -U tox + +script: tox + +#deploy: + #provider: pypi + #user: scrapy-plugins + #distributions: sdist bdist_wheel + #password: + #secure: iKVlMlKSr+LOuCCMMOqL65aYjNRy3k1Zb4d7NRN0JpWS5DGau8G8cEhJ1dY4uyc/DNKVJmd939OiLBsUqqCmz09+ozen/YrRNjEZS5lOwBNfhpiCESkbOjcInV1PQgx2XfuHGp8O/9vxtXjjH9WE9CabQ+8Zg5/rMMvXizT4/O4= + #on: + #tags: true + #all_branches: true + #repo: scrapy-plugins/scrapy-deltafetch + #condition: $TOXENV = py27 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..29e546d --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +scrapy>=1.0 +bsddb3 diff --git a/scrapy_deltafetch/__init__.py b/scrapy_deltafetch/__init__.py new file mode 100644 index 0000000..f88d4de --- /dev/null +++ b/scrapy_deltafetch/__init__.py @@ -0,0 +1 @@ +from .middleware import DeltaFetch diff --git a/scrapy_deltafetch/middleware.py b/scrapy_deltafetch/middleware.py new file mode 100644 index 0000000..ed353d4 --- /dev/null +++ b/scrapy_deltafetch/middleware.py @@ -0,0 +1,106 @@ +import os, time + +from scrapy.http import Request +from scrapy.item import BaseItem +from scrapy.utils.request import request_fingerprint +from scrapy.utils.project import data_path +from scrapy.utils.python import to_bytes +from scrapy.exceptions import NotConfigured +from scrapy import log, signals + + +class DeltaFetch(object): + """This is a spider middleware to ignore requests to pages containing items + seen in previous crawls of the same spider, thus producing a "delta crawl" + containing only new items. + + This also speeds up the crawl, by reducing the number of requests that need + to be crawled, and processed (typically, item requests are the most cpu + intensive). + + Supported settings: + + * DELTAFETCH_ENABLED - to enable (or disable) this extension + * DELTAFETCH_DIR - directory where to store state + * DELTAFETCH_RESET - reset the state, clearing out all seen requests + + Supported spider arguments: + + * deltafetch_reset - same effect as DELTAFETCH_RESET setting + + Supported request meta keys: + + * deltafetch_key - used to define the lookup key for that request. 
by + default it's the fingerprint, but it can be changed to contain an item + id, for example. This requires support from the spider, but makes the + extension more efficient for sites that many URLs for the same item. + + """ + + def __init__(self, dir, reset=False, stats=None): + dbmodule = None + try: + dbmodule = __import__('bsddb3').db + except ImportError: + raise NotConfigured('bsddb or bsddb3 is required') + self.dbmodule = dbmodule + self.dir = dir + self.reset = reset + self.stats = stats + + @classmethod + def from_crawler(cls, crawler): + s = crawler.settings + if not s.getbool('DELTAFETCH_ENABLED'): + raise NotConfigured + dir = data_path(s.get('DELTAFETCH_DIR', 'deltafetch')) + reset = s.getbool('DELTAFETCH_RESET') + o = cls(dir, reset, crawler.stats) + crawler.signals.connect(o.spider_opened, signal=signals.spider_opened) + crawler.signals.connect(o.spider_closed, signal=signals.spider_closed) + return o + + def spider_opened(self, spider): + if not os.path.exists(self.dir): + os.makedirs(self.dir) + dbpath = os.path.join(self.dir, '%s.db' % spider.name) + reset = self.reset or getattr(spider, 'deltafetch_reset', False) + flag = self.dbmodule.DB_TRUNCATE if reset else self.dbmodule.DB_CREATE + try: + self.db = self.dbmodule.DB() + self.db.open(filename=dbpath, + dbtype=self.dbmodule.DB_HASH, + flags=flag) + except Exception: + spider.log("Failed to open DeltaFetch database at %s, " + "trying to recreate it" % dbpath) + if os.path.exists(dbpath): + os.remove(dbpath) + self.db = self.dbmodule.DB() + self.db.open(filename=dbpath, + dbtype=self.dbmodule.DB_HASH, + flags=self.dbmodule.DB_CREATE) + + def spider_closed(self, spider): + self.db.close() + + def process_spider_output(self, response, result, spider): + for r in result: + if isinstance(r, Request): + key = self._get_key(r) + if self.db.has_key(key): + spider.log("Ignoring already visited: %s" % r, level=log.INFO) + if self.stats: + self.stats.inc_value('deltafetch/skipped', spider=spider) + continue + elif isinstance(r, BaseItem): + key = self._get_key(response.request) + self.db[key] = str(time.time()) + if self.stats: + self.stats.inc_value('deltafetch/stored', spider=spider) + yield r + + def _get_key(self, request): + key = request.meta.get('deltafetch_key') or request_fingerprint(request) + # request_fingerprint() returns `hashlib.sha1().hexdigest()`, is a string + return to_bytes(key) diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..311511a --- /dev/null +++ b/setup.py @@ -0,0 +1,24 @@ +from setuptools import setup + +setup( + name='scrapy-deltafetch', + version='1.7.0', + license='BSD', + description='Scrapy middleware to ignore previously crawled pages', + author='Scrapinghub', + author_email='info@scrapinghub.com', + url='http://github.com/scrapy-deltafetch/scrapy-deltafetch', + packages=['scrapy_deltafetch'], + platforms=['Any'], + classifiers=[ + 'Development Status :: 4 - Beta', + 'License :: OSI Approved :: BSD License', + 'Operating System :: OS Independent', + 'Programming Language :: Python', + 'Programming Language :: Python :: 2', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.5', + ], + install_requires=['Scrapy>=1.0', 'bsddb3'] +) diff --git a/tests/test_deltafetch.py b/tests/test_deltafetch.py new file mode 100644 index 0000000..ab5510f --- /dev/null +++ b/tests/test_deltafetch.py @@ -0,0 +1,286 @@ +from unittest import TestCase, skipIf + +import os +import mock +import tempfile +from scrapy 
import Request +from scrapy.item import BaseItem +from scrapy.spiders import Spider +from scrapy.settings import Settings +from scrapy.exceptions import NotConfigured +from scrapy.utils.request import request_fingerprint +from scrapy.utils.python import to_bytes +from scrapy.statscollectors import StatsCollector +from scrapy.utils.test import get_crawler + +from scrapy_deltafetch.middleware import DeltaFetch + + +dbmodule = None +try: + dbmodule = __import__('bsddb3') +except ImportError: + pass + + +@skipIf(not dbmodule, "bsddb3 is not found on the system") +class DeltaFetchTestCase(TestCase): + + mwcls = DeltaFetch + + def setUp(self): + self.spider = Spider('df_tests') + self.temp_dir = tempfile.gettempdir() + self.db_path = os.path.join(self.temp_dir, 'df_tests.db') + crawler = get_crawler(Spider) + self.stats = StatsCollector(crawler) + + def test_init(self): + # path format is any, the folder is not created + instance = self.mwcls('/any/dir', True, stats=self.stats) + assert isinstance(instance, self.mwcls) + self.assertEqual(instance.dir, '/any/dir') + self.assertEqual(self.stats.get_stats(), {}) + self.assertEqual(instance.reset, True) + + def test_init_from_crawler(self): + crawler = mock.Mock() + # void settings + crawler.settings = Settings({}) + self.assertRaises(NotConfigured, self.mwcls.from_crawler, crawler) + with mock.patch('scrapy.utils.project.project_data_dir') as data_dir: + data_dir.return_value = self.temp_dir + + # simple project_data_dir mock with based settings + crawler.settings = Settings({'DELTAFETCH_ENABLED': True}) + instance = self.mwcls.from_crawler(crawler) + assert isinstance(instance, self.mwcls) + self.assertEqual( + instance.dir, os.path.join(self.temp_dir, 'deltafetch')) + self.assertEqual(instance.reset, False) + + # project_data_dir mock with advanced settings + crawler.settings = Settings({'DELTAFETCH_ENABLED': True, + 'DELTAFETCH_DIR': 'other', + 'DELTAFETCH_RESET': True}) + instance = self.mwcls.from_crawler(crawler) + assert isinstance(instance, self.mwcls) + self.assertEqual( + instance.dir, os.path.join(self.temp_dir, 'other')) + self.assertEqual(instance.reset, True) + + def test_spider_opened_new(self): + if os.path.exists(self.db_path): + os.remove(self.db_path) + mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats) + assert not hasattr(self.mwcls, 'db') + mw.spider_opened(self.spider) + assert os.path.isdir(self.temp_dir) + assert os.path.exists(self.db_path) + assert hasattr(mw, 'db') + assert isinstance(mw.db, type(dbmodule.db.DB())) + assert mw.db.items() == [] + assert mw.db.get_type() == dbmodule.db.DB_HASH + assert mw.db.get_open_flags() == dbmodule.db.DB_CREATE + + def test_spider_opened_existing(self): + self._create_test_db() + mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats) + assert not hasattr(self.mwcls, 'db') + mw.spider_opened(self.spider) + assert hasattr(mw, 'db') + assert isinstance(mw.db, type(dbmodule.db.DB())) + assert mw.db.items() == [(b'test_key_1', b'test_v_1'), + (b'test_key_2', b'test_v_2')] + assert mw.db.get_type() == dbmodule.db.DB_HASH + assert mw.db.get_open_flags() == dbmodule.db.DB_CREATE + + def test_spider_opened_existing_spider_reset(self): + self._create_test_db() + mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats) + assert not hasattr(self.mwcls, 'db') + self.spider.deltafetch_reset = True + mw.spider_opened(self.spider) + assert mw.db.get_open_flags() == dbmodule.db.DB_TRUNCATE + + def test_spider_opened_reset_non_existing_db(self): + mw = self.mwcls(self.temp_dir, 
reset=True, stats=self.stats) + assert not hasattr(self.mwcls, 'db') + self.spider.deltafetch_reset = True + mw.spider_opened(self.spider) + assert mw.db.fd() + # there's different logic for different bdb versions: + # it can fail when opening a non-existing db with truncate flag, + # then it should be caught and retried with rm & create flag + assert (mw.db.get_open_flags() == dbmodule.db.DB_CREATE or + mw.db.get_open_flags() == dbmodule.db.DB_TRUNCATE) + + def test_spider_opened_recreate(self): + self._create_test_db() + mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats) + assert not hasattr(self.mwcls, 'db') + mw.spider_opened(self.spider) + assert hasattr(mw, 'db') + assert isinstance(mw.db, type(dbmodule.db.DB())) + assert mw.db.items() == [] + assert mw.db.get_type() == dbmodule.db.DB_HASH + assert mw.db.get_open_flags() == dbmodule.db.DB_TRUNCATE + + def test_spider_closed(self): + self._create_test_db() + mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats) + mw.spider_opened(self.spider) + assert mw.db.fd() + mw.spider_closed(self.spider) + self.assertRaises(dbmodule.db.DBError, mw.db.fd) + + def test_process_spider_output(self): + self._create_test_db() + mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats) + mw.spider_opened(self.spider) + response = mock.Mock() + response.request = Request('http://url', + meta={'deltafetch_key': 'key'}) + result = [] + self.assertEqual(list(mw.process_spider_output( + response, result, self.spider)), []) + result = [ + # same URL but with new key --> it should be processed + Request('http://url', meta={'deltafetch_key': 'key1'}), + + # 'test_key_1' is already in the test db --> it should be skipped + Request('http://url1', meta={'deltafetch_key': 'test_key_1'}) + ] + # so only the 1 request should go through + self.assertEqual(list(mw.process_spider_output( + response, result, self.spider)), [result[0]]) + + # the skipped "http://url1" should be counted in stats + self.assertEqual(self.stats.get_stats(), {'deltafetch/skipped': 1}) + + # b'key' should not be in the db yet as no item was collected yet + self.assertEqual(set(mw.db.keys()), + set([b'test_key_1', + b'test_key_2'])) + + # if the spider returns items, the request's key is added in db + result = [BaseItem(), "not a base item"] + self.assertEqual(list(mw.process_spider_output( + response, result, self.spider)), result) + self.assertEqual(set(mw.db.keys()), + set([b'key', + b'test_key_1', + b'test_key_2'])) + assert mw.db[b'key'] + + def test_process_spider_output_stats(self): + self._create_test_db() + mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats) + mw.spider_opened(self.spider) + response = mock.Mock() + response.request = Request('http://url', + meta={'deltafetch_key': 'key'}) + result = [] + self.assertEqual(list(mw.process_spider_output( + response, result, self.spider)), []) + self.assertEqual(self.stats.get_stats(), {}) + result = [ + Request('http://url', meta={'deltafetch_key': 'key'}), + Request('http://url1', meta={'deltafetch_key': 'test_key_1'}) + ] + self.assertEqual(list(mw.process_spider_output( + response, result, self.spider)), [result[0]]) + self.assertEqual(self.stats.get_value('deltafetch/skipped'), 1) + result = [BaseItem(), "not a base item"] + self.assertEqual(list(mw.process_spider_output( + response, result, self.spider)), result) + self.assertEqual(self.stats.get_value('deltafetch/stored'), 1) + + def test_init_from_crawler_legacy(self): + # test with subclass not handling passed stats + class 
LegacyDeltaFetchSubClass(self.mwcls): + + def __init__(self, dir, reset=False, *args, **kwargs): + super(LegacyDeltaFetchSubClass, self).__init__(dir=dir, reset=reset) + self.something = True + + crawler = mock.Mock() + # void settings + crawler.settings = Settings({}) + self.assertRaises(NotConfigured, self.mwcls.from_crawler, crawler) + + with mock.patch('scrapy.utils.project.project_data_dir') as data_dir: + data_dir.return_value = self.temp_dir + + # simple project_data_dir mock with based settings + crawler.settings = Settings({'DELTAFETCH_ENABLED': True}) + instance = LegacyDeltaFetchSubClass.from_crawler(crawler) + assert isinstance(instance, self.mwcls) + self.assertEqual( + instance.dir, os.path.join(self.temp_dir, 'deltafetch')) + self.assertEqual(instance.reset, False) + + # project_data_dir mock with advanced settings + crawler.settings = Settings({'DELTAFETCH_ENABLED': True, + 'DELTAFETCH_DIR': 'other', + 'DELTAFETCH_RESET': True}) + instance = LegacyDeltaFetchSubClass.from_crawler(crawler) + assert isinstance(instance, self.mwcls) + self.assertEqual( + instance.dir, os.path.join(self.temp_dir, 'other')) + self.assertEqual(instance.reset, True) + + def test_process_spider_output_stats_legacy(self): + # testing the subclass not handling stats works at runtime + # (i.e. that trying to update stats does not trigger exception) + class LegacyDeltaFetchSubClass(self.mwcls): + + def __init__(self, dir, reset=False, *args, **kwargs): + super(LegacyDeltaFetchSubClass, self).__init__(dir=dir, reset=reset) + self.something = True + + self._create_test_db() + mw = LegacyDeltaFetchSubClass(self.temp_dir, reset=False) + mw.spider_opened(self.spider) + response = mock.Mock() + response.request = Request('http://url', + meta={'deltafetch_key': 'key'}) + result = [] + self.assertEqual(list(mw.process_spider_output( + response, result, self.spider)), []) + self.assertEqual(self.stats.get_stats(), {}) + result = [ + Request('http://url', meta={'deltafetch_key': 'key'}), + Request('http://url1', meta={'deltafetch_key': 'test_key_1'}) + ] + + # stats should not be updated + self.assertEqual(list(mw.process_spider_output( + response, result, self.spider)), [result[0]]) + self.assertEqual(self.stats.get_value('deltafetch/skipped'), None) + + result = [BaseItem(), "not a base item"] + self.assertEqual(list(mw.process_spider_output( + response, result, self.spider)), result) + self.assertEqual(self.stats.get_value('deltafetch/stored'), None) + + def test_get_key(self): + mw = self.mwcls(self.temp_dir, reset=True) + test_req1 = Request('http://url1') + self.assertEqual(mw._get_key(test_req1), + to_bytes(request_fingerprint(test_req1))) + test_req2 = Request('http://url2', meta={'deltafetch_key': b'dfkey1'}) + self.assertEqual(mw._get_key(test_req2), b'dfkey1') + + test_req3 = Request('http://url2', meta={'deltafetch_key': u'dfkey1'}) + # key will be converted to bytes + self.assertEqual(mw._get_key(test_req3), b'dfkey1') + + def _create_test_db(self): + db = dbmodule.db.DB() + # truncate test db if there were failed tests + db.open(self.db_path, dbmodule.db.DB_HASH, + dbmodule.db.DB_CREATE | dbmodule.db.DB_TRUNCATE) + db[b'test_key_1'] = b'test_v_1' + db[b'test_key_2'] = b'test_v_2' + db.close() diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..c72785a --- /dev/null +++ b/tox.ini @@ -0,0 +1,17 @@ +# Tox (http://tox.testrun.org/) is a tool for running tests +# in multiple virtualenvs. This configuration file will run the +# test suite on all supported python versions. 
To use it, "pip install tox" +# and then run "tox" from this directory. + +[tox] +envlist = py27, py35 + +[testenv] +setenv = + BERKELEYDB_DIR = /usr +deps = + -rrequirements.txt + mock + nose + bsddb3 +commands = nosetests --with-doctest [] From fdb6e37038cbe12396c9a5c95dc2ae01f51bcde8 Mon Sep 17 00:00:00 2001 From: Paul Tremberth Date: Mon, 27 Jun 2016 13:06:47 +0200 Subject: [PATCH 2/5] Support universal wheels --- setup.cfg | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 setup.cfg diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..3c6e79c --- /dev/null +++ b/setup.cfg @@ -0,0 +1,2 @@ +[bdist_wheel] +universal=1 From c49d554b32664470f426633e3536adf9677936c2 Mon Sep 17 00:00:00 2001 From: Paul Tremberth Date: Mon, 27 Jun 2016 13:11:10 +0200 Subject: [PATCH 3/5] Add libdev dependency for Travis CI --- .travis.yml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index a821c7f..6614361 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,11 +8,10 @@ env: - TOXENV=py27 - TOXENV=py35 -#addons: - #apt: - #packages: - #- language-pack-fr - #- libdb-dev +addons: + apt: + packages: + - libdb-dev install: pip install -U tox From eaa62688ddb9fb84601f47cf65e3957d740119a4 Mon Sep 17 00:00:00 2001 From: Paul Tremberth Date: Mon, 27 Jun 2016 13:26:41 +0200 Subject: [PATCH 4/5] Update NotConfigured() message about requirements --- scrapy_deltafetch/middleware.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapy_deltafetch/middleware.py b/scrapy_deltafetch/middleware.py index ed353d4..bd07386 100644 --- a/scrapy_deltafetch/middleware.py +++ b/scrapy_deltafetch/middleware.py @@ -42,7 +42,7 @@ def __init__(self, dir, reset=False, stats=None): try: dbmodule = __import__('bsddb3').db except ImportError: - raise NotConfigured('bsddb or bsddb3 is required') + raise NotConfigured('bsddb3 is required') self.dbmodule = dbmodule self.dir = dir self.reset = reset From 977158beb7ccc2cec8da9dae14518001ea805679 Mon Sep 17 00:00:00 2001 From: Paul Tremberth Date: Mon, 27 Jun 2016 13:34:00 +0200 Subject: [PATCH 5/5] Remove redundant bsddb3 requirement from tox config --- tox.ini | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index c72785a..9d05278 100644 --- a/tox.ini +++ b/tox.ini @@ -9,9 +9,10 @@ envlist = py27, py35 [testenv] setenv = BERKELEYDB_DIR = /usr + deps = -rrequirements.txt mock nose - bsddb3 + commands = nosetests --with-doctest []
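
Usage note: a minimal sketch of how the middleware introduced in PATCH 1/5 is typically wired into a Scrapy project. The DELTAFETCH_* settings and their behaviour come from the class docstring and from_crawler() above; the middleware order value (100) and the comments are only illustrative, not prescribed by the patches.

    # settings.py -- enabling the DeltaFetch spider middleware (order value 100 is arbitrary)
    SPIDER_MIDDLEWARES = {
        'scrapy_deltafetch.DeltaFetch': 100,
    }

    DELTAFETCH_ENABLED = True      # required; otherwise from_crawler() raises NotConfigured
    DELTAFETCH_DIR = 'deltafetch'  # optional; resolved with data_path(), e.g. .scrapy/deltafetch/
    DELTAFETCH_RESET = False       # optional; True truncates the per-spider .db file on open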
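
A hedged sketch of the deltafetch_key request meta key described in the docstring. The spider, item fields, selectors and URL below are hypothetical; only the meta key name, and the fact that the middleware records the request's key when a BaseItem is yielded from its response, come from the patch.

    # products_spider.py -- hypothetical spider keying the "seen" lookup by item id
    # instead of the request fingerprint, so several URLs of one item dedupe together
    import scrapy

    class ProductItem(scrapy.Item):
        id = scrapy.Field()
        title = scrapy.Field()

    class ProductsSpider(scrapy.Spider):
        name = 'products'
        start_urls = ['http://example.com/catalog']

        def parse(self, response):
            for href in response.css('a.product::attr(href)').extract():
                item_id = href.rstrip('/').split('/')[-1]
                yield scrapy.Request(
                    response.urljoin(href),
                    callback=self.parse_product,
                    meta={'deltafetch_key': item_id},
                )

        def parse_product(self, response):
            # yielding an item makes the middleware store this request's key,
            # so the same product page is skipped on the next run
            yield ProductItem(id=response.url.rstrip('/').split('/')[-1],
                              title=response.css('h1::text').extract_first())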
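
For debugging, the recorded state can be inspected directly with bsddb3. This helper is an assumption, as is the path (default DELTAFETCH_DIR under the project's .scrapy data dir, spider named 'products'); only the per-spider file naming ('<spider name>.db'), the DB_HASH type and the str(time.time()) values come from the middleware code.

    # dump_deltafetch.py -- hypothetical helper to list what has been recorded
    from bsddb3 import db as bdb

    state = bdb.DB()
    state.open('.scrapy/deltafetch/products.db', dbtype=bdb.DB_HASH, flags=bdb.DB_RDONLY)
    for key, value in state.items():
        # key: request fingerprint or deltafetch_key (bytes); value: str(time.time()) as bytes
        print(key, value)
    state.close()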