[core change!] replace bsddb3 with default Python dbm
pawelmhm committed Jun 11, 2021
commit c9dd29ed6318c0d42de13cdb6e8fb3fe0ac19624
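This commit swaps the third-party bsddb3 bindings for Python's standard-library `dbm` package. As orientation, a minimal sketch of the stdlib API the new code relies on (the path is illustrative):

```python
import dbm

# Flag 'c' opens the database, creating it if it does not exist.
# dbm picks whichever backend is available: dbm.gnu, dbm.ndbm, or dbm.dumb.
db = dbm.open('/tmp/deltafetch_demo', 'c')  # illustrative path
db[b'seen'] = b'1'           # keys and values are stored as bytes
print(db.get(b'seen'))       # b'1'
print(db.get(b'missing'))    # None
db.close()
```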
2 changes: 1 addition & 1 deletion .github/workflows/main.yml
@@ -12,7 +12,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.5]
python-version: [3.5, 3.6, 3.7, 3.8, 3.9]

steps:
- uses: actions/checkout@v2
1 change: 0 additions & 1 deletion requirements.txt
@@ -1,2 +1 @@
scrapy>=1.1.0
bsddb3
22 changes: 6 additions & 16 deletions scrapy_deltafetch/middleware.py
@@ -1,6 +1,7 @@
import logging
import os
import time
import dbm

from scrapy.http import Request
from scrapy.item import Item
@@ -26,12 +27,6 @@ class DeltaFetch(object):
"""

def __init__(self, dir, reset=False, stats=None):
dbmodule = None
try:
dbmodule = __import__('bsddb3').db
except ImportError:
raise NotConfigured('bsddb3 is required')
self.dbmodule = dbmodule
self.dir = dir
self.reset = reset
self.stats = stats
@@ -45,29 +40,25 @@ def from_crawler(cls, crawler):
reset = s.getbool('DELTAFETCH_RESET')
o = cls(dir, reset, crawler.stats)
crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
# request_fingerprint() returns `hashlib.sha1().hexdigest()`, which is a string
crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
return o

def spider_opened(self, spider):
if not os.path.exists(self.dir):
os.makedirs(self.dir)
# TODO: may be tricky, as dbm backends may store the data under different file paths/suffixes on different systems
dbpath = os.path.join(self.dir, '%s.db' % spider.name)
reset = self.reset or getattr(spider, 'deltafetch_reset', False)
flag = self.dbmodule.DB_TRUNCATE if reset else self.dbmodule.DB_CREATE
flag = 'n' if reset else 'c'
try:
self.db = self.dbmodule.DB()
self.db.open(filename=dbpath,
dbtype=self.dbmodule.DB_HASH,
flags=flag)
self.db = dbm.open(dbpath, flag=flag)
except Exception:
logger.warning("Failed to open DeltaFetch database at %s, "
"trying to recreate it" % dbpath)
if os.path.exists(dbpath):
os.remove(dbpath)
self.db = self.dbmodule.DB()
self.db.open(filename=dbpath,
dbtype=self.dbmodule.DB_HASH,
flags=self.dbmodule.DB_CREATE)
self.db = dbm.open(dbpath, 'c')

def spider_closed(self, spider):
self.db.close()
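In `dbm.open()`, flag `'n'` always creates a new empty database (the stand-in for bsddb3's `DB_TRUNCATE`), while `'c'` opens an existing database or creates it if missing (the stand-in for `DB_CREATE`). A sketch of the open-with-recovery logic in `spider_opened` above, in isolation (function name is illustrative):

```python
import dbm
import os

def open_deltafetch_db(dbpath, reset=False):
    """Truncate on reset, otherwise create-or-open; if the existing
    file cannot be opened (e.g. it is corrupt), recreate it."""
    flag = 'n' if reset else 'c'
    try:
        return dbm.open(dbpath, flag)
    except Exception:
        if os.path.exists(dbpath):
            os.remove(dbpath)
        return dbm.open(dbpath, 'c')
```

Note that some backends store data under a different name than the path given (dbm.ndbm may append `.db`; dbm.dumb writes `.dir`/`.dat` files), which is presumably what the TODO about per-system paths refers to: `os.remove(dbpath)` may not remove the actual database file on every backend.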
@@ -90,5 +81,4 @@ def process_spider_output(self, response, result, spider):

def _get_key(self, request):
key = request.meta.get('deltafetch_key') or request_fingerprint(request)
# request_fingerprint() returns `hashlib.sha1().hexdigest()`, is a string
return to_bytes(key)
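`request_fingerprint()` returns `hashlib.sha1(...).hexdigest()`, a `str`, while dbm stores keys as bytes, hence the `to_bytes()` normalization. A small illustration (the fingerprint input is made up):

```python
import hashlib

fingerprint = hashlib.sha1(b'GET https://example.com').hexdigest()  # a str
key = fingerprint.encode('utf-8')  # what scrapy's to_bytes() does by default
assert isinstance(key, bytes)
```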
56 changes: 18 additions & 38 deletions tests/test_deltafetch.py
@@ -1,6 +1,7 @@
from unittest import TestCase, skipIf

import os
import dbm
import mock
import tempfile
from scrapy import Request
@@ -16,14 +17,6 @@
from scrapy_deltafetch.middleware import DeltaFetch


dbmodule = None
try:
dbmodule = __import__('bsddb3')
except ImportError:
pass


@skipIf(not dbmodule, "bsddb3 is not found on the system")
class DeltaFetchTestCase(TestCase):

mwcls = DeltaFetch
@@ -85,10 +78,7 @@ def test_spider_opened_new(self):
assert os.path.isdir(self.temp_dir)
assert os.path.exists(self.db_path)
assert hasattr(mw, 'db')
assert isinstance(mw.db, type(dbmodule.db.DB()))
assert mw.db.items() == []
assert mw.db.get_type() == dbmodule.db.DB_HASH
assert mw.db.get_open_flags() == dbmodule.db.DB_CREATE
assert mw.db.keys() == []

def test_spider_opened_existing(self):
"""Middleware should open and use existing and valid .db files."""
@@ -97,11 +87,11 @@ def test_spider_opened_existing(self):
assert not hasattr(self.mwcls, 'db')
mw.spider_opened(self.spider)
assert hasattr(mw, 'db')
assert isinstance(mw.db, type(dbmodule.db.DB()))
assert mw.db.items() == [(b'test_key_1', b'test_v_1'),
(b'test_key_2', b'test_v_2')]
assert mw.db.get_type() == dbmodule.db.DB_HASH
assert mw.db.get_open_flags() == dbmodule.db.DB_CREATE
for k, v in [
(b'test_key_1', b'test_v_1'),
(b'test_key_2', b'test_v_2')
]:
assert mw.db.get(k) == v
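The per-key loop replaces the old `items()` comparison because the `dbm` interface only guarantees the dict-like `keys()`, `get()` and `setdefault()` methods; `items()` and `values()` are not available on all backends (dbm.gnu in particular lacks them). For example:

```python
import dbm, os, tempfile

path = os.path.join(tempfile.mkdtemp(), 'api_demo')  # illustrative path
db = dbm.open(path, 'c')
db[b'k'] = b'v'
assert db.keys() == [b'k']      # keys() is guaranteed by the dbm interface
assert db.get(b'k') == b'v'     # so is get(); items()/values() are not
db.close()
```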

def test_spider_opened_corrupt_dbfile(self):
"""Middleware should create a new .db if it cannot open it."""
@@ -116,51 +106,43 @@
assert os.path.isdir(self.temp_dir)
assert os.path.exists(self.db_path)
assert hasattr(mw, 'db')
assert isinstance(mw.db, type(dbmodule.db.DB()))

# and db should be empty (it was re-created)
assert mw.db.items() == []
assert mw.db.get_type() == dbmodule.db.DB_HASH
assert mw.db.get_open_flags() == dbmodule.db.DB_CREATE
assert mw.db.keys() == []

def test_spider_opened_existing_spider_reset(self):
self._create_test_db()
mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
assert not hasattr(self.mwcls, 'db')
self.spider.deltafetch_reset = True
mw.spider_opened(self.spider)
assert mw.db.get_open_flags() == dbmodule.db.DB_TRUNCATE
assert mw.db.keys() == []

def test_spider_opened_reset_non_existing_db(self):
mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats)
assert not hasattr(self.mwcls, 'db')
self.spider.deltafetch_reset = True
mw.spider_opened(self.spider)
assert mw.db.fd()
# there's different logic for different bdb versions:
# it can fail when opening a non-existing db with truncate flag,
# then it should be caught and retried with rm & create flag
assert (mw.db.get_open_flags() == dbmodule.db.DB_CREATE or
mw.db.get_open_flags() == dbmodule.db.DB_TRUNCATE)

assert mw.db.get(b'random') is None
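The old version-specific comment (Berkeley DB could fail when opening a non-existent database with the truncate flag, requiring a remove-and-retry) no longer applies: `dbm.open(path, 'n')` simply creates a fresh empty database whether or not the file already exists, so the test only needs to check that the store is empty. A sketch:

```python
import dbm, os, tempfile

path = os.path.join(tempfile.mkdtemp(), 'fresh_db')  # illustrative path
db = dbm.open(path, 'n')   # 'n': always a new, empty database
assert db.get(b'random') is None
db.close()
```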

def test_spider_opened_recreate(self):
self._create_test_db()
mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats)
assert not hasattr(self.mwcls, 'db')
mw.spider_opened(self.spider)
assert hasattr(mw, 'db')
assert isinstance(mw.db, type(dbmodule.db.DB()))
assert mw.db.items() == []
assert mw.db.get_type() == dbmodule.db.DB_HASH
assert mw.db.get_open_flags() == dbmodule.db.DB_TRUNCATE
assert mw.db.keys() == []

def test_spider_closed(self):
self._create_test_db()
mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats)
mw.spider_opened(self.spider)
assert mw.db.fd()
assert mw.db.get(b'random') is None
mw.spider_closed(self.spider)
self.assertRaises(dbmodule.db.DBError, mw.db.fd)
with self.assertRaises(Exception):
# should fail because the database is closed
mw.db.get(b'random')
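The exact exception raised when touching a closed handle differs per backend (dbm.gnu and dbm.ndbm each define their own `error`; dbm.dumb raises its own), which is presumably why the test catches a bare `Exception`. `dbm.error` is a tuple covering the error classes of the available backends, so a more precise assertion may be possible; a sketch under that assumption:

```python
import dbm, os, tempfile

path = os.path.join(tempfile.mkdtemp(), 'closed_demo')  # illustrative path
db = dbm.open(path, 'n')
db.close()
try:
    db.get(b'random')
except dbm.error:   # tuple of the backend-specific error classes
    print('access after close raised the backend error, as expected')
```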

def test_process_spider_output(self):
self._create_test_db()
@@ -323,10 +305,8 @@ def test_get_key(self):
self.assertEqual(mw._get_key(test_req3), b'dfkey1')

def _create_test_db(self):
db = dbmodule.db.DB()
# truncate test db if there were failed tests
db.open(self.db_path, dbmodule.db.DB_HASH,
dbmodule.db.DB_CREATE | dbmodule.db.DB_TRUNCATE)
db = dbm.open(self.db_path, 'n')
db[b'test_key_1'] = b'test_v_1'
db[b'test_key_2'] = b'test_v_2'
db.close()