Skip to content
This repository was archived by the owner on Jul 19, 2018. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
deltafetch bsddb3 changes
  • Loading branch information
nyov committed Jul 15, 2016
commit 6bfda7a030f9716183f6aa71c4934c2d6a558235
7 changes: 5 additions & 2 deletions scrapylib/deltafetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from scrapy.item import BaseItem
from scrapy.utils.request import request_fingerprint
from scrapy.utils.project import data_path
+ from scrapy.utils.python import to_bytes
from scrapy.exceptions import NotConfigured
from scrapy import log, signals

Expand Down Expand Up @@ -99,10 +100,12 @@ def process_spider_output(self, response, result, spider):
continue
elif isinstance(r, BaseItem):
key = self._get_key(response.request)
- self.db[key] = str(time.time())
+ self.db[key] = str(time.time()).encode('iso8859-1')
if self.stats:
self.stats.inc_value('deltafetch/stored', spider=spider)
yield r

def _get_key(self, request):
- return request.meta.get('deltafetch_key') or request_fingerprint(request)
+ key = request.meta.get('deltafetch_key') or request_fingerprint(request)
+ # request_fingerprint() returns `hashlib.sha1().hexdigest()`, is a string
+ return to_bytes(key)
17 changes: 9 additions & 8 deletions tests/test_deltafetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from scrapy.settings import Settings
from scrapy.exceptions import NotConfigured
from scrapy.utils.request import request_fingerprint
+ from scrapy.utils.python import to_bytes
from scrapylib.deltafetch import DeltaFetch
from scrapy.statscollectors import StatsCollector
from scrapy.utils.test import get_crawler
Expand Down Expand Up @@ -90,8 +91,8 @@ def test_spider_opened_existing(self):
mw.spider_opened(self.spider)
assert hasattr(mw, 'db')
assert isinstance(mw.db, type(dbmodule.db.DB()))
- assert mw.db.items() == [('test_key_1', 'test_v_1'),
- ('test_key_2', 'test_v_2')]
+ assert mw.db.items() == [(b'test_key_1', b'test_v_1'),
+ (b'test_key_2', b'test_v_2')]
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is strange: the test_v_1 values are returned as bytes, even though they were inserted as strings (db.put(b'test_key_1', 'test_v_1')). I'm not sure whether this is bsddb3 not casting back to int/string on fetch, or something else.

assert mw.db.get_type() == dbmodule.db.DB_HASH
assert mw.db.get_open_flags() == dbmodule.db.DB_CREATE

Expand Down Expand Up @@ -154,8 +155,8 @@ def test_process_spider_output(self):
result = [BaseItem(), "not a base item"]
self.assertEqual(list(mw.process_spider_output(
response, result, self.spider)), result)
- self.assertEqual(mw.db.keys(), ['test_key_1', 'key', 'test_key_2'])
- assert mw.db['key']
+ self.assertEqual(mw.db.keys(), [b'test_key_1', b'key', b'test_key_2'])
+ assert mw.db[b'key']

def test_process_spider_output_stats(self):
self._create_test_db()
Expand Down Expand Up @@ -252,15 +253,15 @@ def test_get_key(self):
mw = self.mwcls(self.temp_dir, reset=True)
test_req1 = Request('http://url1')
self.assertEqual(mw._get_key(test_req1),
- request_fingerprint(test_req1))
+ to_bytes(request_fingerprint(test_req1)))
test_req2 = Request('http://url2', meta={'deltafetch_key': 'dfkey1'})
- self.assertEqual(mw._get_key(test_req2), 'dfkey1')
+ self.assertEqual(mw._get_key(test_req2), b'dfkey1')

def _create_test_db(self):
db = dbmodule.db.DB()
# truncate test db if there were failed tests
db.open(self.db_path, dbmodule.db.DB_HASH,
dbmodule.db.DB_CREATE | dbmodule.db.DB_TRUNCATE)
- db.put('test_key_1', 'test_v_1')
- db.put('test_key_2', 'test_v_2')
+ db.put(b'test_key_1', b'test_v_1')
+ db.put(b'test_key_2', b'test_v_2')
db.close()