diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 9f51801..25fabf7 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -12,7 +12,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.5]
+        python-version: [3.5, 3.6, 3.7, 3.8, 3.9]
 
     steps:
     - uses: actions/checkout@v2
diff --git a/README.rst b/README.rst
index 65045b3..b9274b7 100644
--- a/README.rst
+++ b/README.rst
@@ -9,21 +9,16 @@ scrapy-deltafetch
    :target: https://codecov.io/gh/scrapy-plugins/scrapy-deltafetch
 
 This is a Scrapy spider middleware to ignore requests
-to pages containing items seen in previous crawls of the same spider,
-thus producing a "delta crawl" containing only new items.
+to pages seen in previous crawls of the same spider,
+thus producing a "delta crawl" containing only new requests.
 This also speeds up the crawl, by reducing the number of requests that
 need to be crawled, and processed (typically, item requests are the most
 CPU intensive).
 
-Requirements
-============
-
-DeltaFetch middleware depends on Python's bsddb3_ package.
-
-On Ubuntu/Debian, you may need to install ``libdb-dev`` if it's not installed already.
+DeltaFetch middleware uses Python's dbm_ package to store request fingerprints.
 
-.. _bsddb3: https://pypi.python.org/pypi/bsddb3
+.. _dbm: https://docs.python.org/3/library/dbm.html
 
 
 Installation
diff --git a/requirements.txt b/requirements.txt
index 2c309f4..0b9465e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1 @@
 scrapy>=1.1.0
-bsddb3
diff --git a/scrapy_deltafetch/middleware.py b/scrapy_deltafetch/middleware.py
index 86311c5..63c9ff2 100644
--- a/scrapy_deltafetch/middleware.py
+++ b/scrapy_deltafetch/middleware.py
@@ -1,6 +1,7 @@
 import logging
 import os
 import time
+import dbm
 
 from scrapy.http import Request
 from scrapy.item import Item
@@ -26,12 +27,6 @@ class DeltaFetch(object):
     """
 
     def __init__(self, dir, reset=False, stats=None):
-        dbmodule = None
-        try:
-            dbmodule = __import__('bsddb3').db
-        except ImportError:
-            raise NotConfigured('bsddb3 is required')
-        self.dbmodule = dbmodule
         self.dir = dir
         self.reset = reset
         self.stats = stats
@@ -45,29 +40,25 @@ def from_crawler(cls, crawler):
         reset = s.getbool('DELTAFETCH_RESET')
         o = cls(dir, reset, crawler.stats)
         crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
+        # request_fingerprint() returns `hashlib.sha1().hexdigest()`, is a string
         crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
         return o
 
     def spider_opened(self, spider):
         if not os.path.exists(self.dir):
             os.makedirs(self.dir)
+        # TODO may be tricky, as there may be different paths on systems
         dbpath = os.path.join(self.dir, '%s.db' % spider.name)
         reset = self.reset or getattr(spider, 'deltafetch_reset', False)
-        flag = self.dbmodule.DB_TRUNCATE if reset else self.dbmodule.DB_CREATE
+        flag = 'n' if reset else 'c'
         try:
-            self.db = self.dbmodule.DB()
-            self.db.open(filename=dbpath,
-                         dbtype=self.dbmodule.DB_HASH,
-                         flags=flag)
+            self.db = dbm.open(dbpath, flag=flag)
         except Exception:
             logger.warning("Failed to open DeltaFetch database at %s, "
                            "trying to recreate it" % dbpath)
             if os.path.exists(dbpath):
                 os.remove(dbpath)
-            self.db = self.dbmodule.DB()
-            self.db.open(filename=dbpath,
-                         dbtype=self.dbmodule.DB_HASH,
-                         flags=self.dbmodule.DB_CREATE)
+            self.db = dbm.open(dbpath, 'c')
 
     def spider_closed(self, spider):
         self.db.close()
@@ -90,5 +81,4 @@ def process_spider_output(self, response, result, spider):
 
     def _get_key(self, request):
         key = request.meta.get('deltafetch_key') or request_fingerprint(request)
-        # request_fingerprint() returns `hashlib.sha1().hexdigest()`, is a string
         return to_bytes(key)
diff --git a/setup.py b/setup.py
index 9ba1ae9..0e2819a 100644
--- a/setup.py
+++ b/setup.py
@@ -15,10 +15,12 @@
         'License :: OSI Approved :: BSD License',
         'Operating System :: OS Independent',
         'Programming Language :: Python',
-        'Programming Language :: Python :: 2',
-        'Programming Language :: Python :: 2.7',
         'Programming Language :: Python :: 3',
         'Programming Language :: Python :: 3.5',
+        'Programming Language :: Python :: 3.6',
+        'Programming Language :: Python :: 3.7',
+        'Programming Language :: Python :: 3.8',
+        'Programming Language :: Python :: 3.9',
     ],
-    install_requires=['Scrapy>=1.1.0', 'bsddb3']
+    install_requires=['Scrapy>=1.1.0']
 )
diff --git a/tests/benchmark.py b/tests/benchmark.py
new file mode 100644
index 0000000..a8edee5
--- /dev/null
+++ b/tests/benchmark.py
@@ -0,0 +1,31 @@
+import tempfile
+
+import mock
+from scrapy import Request, Spider
+from scrapy.statscollectors import StatsCollector
+from scrapy.utils.test import get_crawler
+
+from scrapy_deltafetch import DeltaFetch
+
+
+def benchmark_middleware(result):
+    spider_name = 'df_tests'
+    spider = Spider(spider_name)
+    temp_dir = tempfile.gettempdir()
+    crawler = get_crawler(Spider)
+    stats = StatsCollector(crawler)
+    mw = DeltaFetch(temp_dir, reset=False, stats=stats)
+    mw.spider_opened(spider)
+    response = mock.Mock()
+    response.request = Request('http://url',
+                               meta={'deltafetch_key': 'key'})
+
+    for x in mw.process_spider_output(response, result, spider):
+        pass
+
+def test_middleware(benchmark):
+    result = []
+    for x in range(50000):
+        request = Request(f'https://{x}')
+        result.append(request)
+    result = benchmark(benchmark_middleware, result)
diff --git a/tests/requirements-test.txt b/tests/requirements-test.txt
index 5fb3a28..bf2c733 100644
--- a/tests/requirements-test.txt
+++ b/tests/requirements-test.txt
@@ -1,3 +1,5 @@
 -r ../requirements.txt
 mock
-pytest
\ No newline at end of file
+pytest
+pytest-benchmark
+
diff --git a/tests/test_deltafetch.py b/tests/test_deltafetch.py
index 89654b9..0f82c3f 100644
--- a/tests/test_deltafetch.py
+++ b/tests/test_deltafetch.py
@@ -1,6 +1,7 @@
 from unittest import TestCase, skipIf
 
 import os
+import dbm
 import mock
 import tempfile
 from scrapy import Request
@@ -16,14 +17,6 @@
 
 from scrapy_deltafetch.middleware import DeltaFetch
 
-dbmodule = None
-try:
-    dbmodule = __import__('bsddb3')
-except ImportError:
-    pass
-
-
-@skipIf(not dbmodule, "bsddb3 is not found on the system")
 class DeltaFetchTestCase(TestCase):
 
     mwcls = DeltaFetch
@@ -85,10 +78,7 @@ def test_spider_opened_new(self):
         assert os.path.isdir(self.temp_dir)
         assert os.path.exists(self.db_path)
         assert hasattr(mw, 'db')
-        assert isinstance(mw.db, type(dbmodule.db.DB()))
-        assert mw.db.items() == []
-        assert mw.db.get_type() == dbmodule.db.DB_HASH
-        assert mw.db.get_open_flags() == dbmodule.db.DB_CREATE
+        assert mw.db.keys() == []
 
     def test_spider_opened_existing(self):
         """Middleware should open and use existing and valid .db files."""
@@ -97,11 +87,11 @@
         self._create_test_db()
         mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
         assert not hasattr(self.mwcls, 'db')
         mw.spider_opened(self.spider)
         assert hasattr(mw, 'db')
-        assert isinstance(mw.db, type(dbmodule.db.DB()))
-        assert mw.db.items() == [(b'test_key_1', b'test_v_1'),
-                                 (b'test_key_2', b'test_v_2')]
-        assert mw.db.get_type() == dbmodule.db.DB_HASH
-        assert mw.db.get_open_flags() == dbmodule.db.DB_CREATE
+        for k, v in [
+            (b'test_key_1', b'test_v_1'),
+            (b'test_key_2', b'test_v_2')
+        ]:
+            assert mw.db.get(k) == v
 
     def test_spider_opened_corrupt_dbfile(self):
         """Middleware should create a new .db if it cannot open it."""
@@ -116,12 +106,9 @@
         assert os.path.isdir(self.temp_dir)
         assert os.path.exists(self.db_path)
         assert hasattr(mw, 'db')
-        assert isinstance(mw.db, type(dbmodule.db.DB()))
         # and db should be empty (it was re-created)
-        assert mw.db.items() == []
-        assert mw.db.get_type() == dbmodule.db.DB_HASH
-        assert mw.db.get_open_flags() == dbmodule.db.DB_CREATE
+        assert mw.db.keys() == []
 
     def test_spider_opened_existing_spider_reset(self):
         self._create_test_db()
@@ -129,38 +116,33 @@
         mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
         assert not hasattr(self.mwcls, 'db')
         self.spider.deltafetch_reset = True
         mw.spider_opened(self.spider)
-        assert mw.db.get_open_flags() == dbmodule.db.DB_TRUNCATE
+        assert mw.db.keys() == []
 
     def test_spider_opened_reset_non_existing_db(self):
         mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats)
         assert not hasattr(self.mwcls, 'db')
         self.spider.deltafetch_reset = True
         mw.spider_opened(self.spider)
-        assert mw.db.fd()
-        # there's different logic for different bdb versions:
-        # it can fail when opening a non-existing db with truncate flag,
-        # then it should be caught and retried with rm & create flag
-        assert (mw.db.get_open_flags() == dbmodule.db.DB_CREATE or
-                mw.db.get_open_flags() == dbmodule.db.DB_TRUNCATE)
-
+        assert mw.db.get(b'random') is None
+
     def test_spider_opened_recreate(self):
         self._create_test_db()
         mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats)
         assert not hasattr(self.mwcls, 'db')
         mw.spider_opened(self.spider)
         assert hasattr(mw, 'db')
-        assert isinstance(mw.db, type(dbmodule.db.DB()))
-        assert mw.db.items() == []
-        assert mw.db.get_type() == dbmodule.db.DB_HASH
-        assert mw.db.get_open_flags() == dbmodule.db.DB_TRUNCATE
+        assert mw.db.keys() == []
 
     def test_spider_closed(self):
         self._create_test_db()
         mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats)
         mw.spider_opened(self.spider)
-        assert mw.db.fd()
+        assert mw.db.get('random') is None
         mw.spider_closed(self.spider)
-        self.assertRaises(dbmodule.db.DBError, mw.db.fd)
+        with self.assertRaises(Exception) as cm:
+            # should fail because database closed
+            mw.db.get('random')
+        # self.assertRaisesRegex(, mw.db.get('random'))
 
     def test_process_spider_output(self):
         self._create_test_db()
@@ -323,10 +305,8 @@ def test_get_key(self):
         self.assertEqual(mw._get_key(test_req3), b'dfkey1')
 
     def _create_test_db(self):
-        db = dbmodule.db.DB()
         # truncate test db if there were failed tests
-        db.open(self.db_path, dbmodule.db.DB_HASH,
-                dbmodule.db.DB_CREATE | dbmodule.db.DB_TRUNCATE)
+        db = dbm.open(self.db_path, 'n')
         db[b'test_key_1'] = b'test_v_1'
         db[b'test_key_2'] = b'test_v_2'
         db.close()
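
Note on the flag change in the middleware above: dbm.open()'s 'c' flag opens the database or creates it if missing (the role DB_CREATE played with bsddb3), while 'n' always starts from a new empty database (the role DB_TRUNCATE played), and keys and values are stored as bytes, which is why _get_key() runs the fingerprint through to_bytes(). Below is a minimal standalone sketch of that behaviour, separate from the middleware itself; the path and keys are illustrative only.

import dbm
import os
import tempfile

# Illustrative location; the middleware derives its own path from its
# configured directory and the spider name.
path = os.path.join(tempfile.gettempdir(), 'deltafetch_example.db')

# 'c' opens an existing database or creates an empty one (DB_CREATE equivalent).
db = dbm.open(path, 'c')
db[b'example-fingerprint'] = b'seen'   # keys and values are bytes
print(b'example-fingerprint' in db)    # True
db.close()

# 'n' always creates a fresh, empty database (DB_TRUNCATE equivalent); this is
# the branch taken when DELTAFETCH_RESET or spider.deltafetch_reset is set.
db = dbm.open(path, 'n')
print(list(db.keys()))                 # []
db.close()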