Move the logic to from_crawler to minimize backward incompatibility
Gallaecio committed Feb 25, 2025
commit 0bd04d8ad9bc2fb4455cd944d4896b54b8314e56
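
The point of this commit is that the constructor keeps its pre-existing signature, __init__(dir, reset=False, stats=None), so downstream code that instantiates or subclasses DeltaFetch directly is unaffected; only the from_crawler path gains the version-dependent fingerprint wiring. A hedged sketch of the kind of downstream subclass this protects (the subclass below is hypothetical, not from this repository):

    from scrapy_deltafetch.middleware import DeltaFetch


    class AuditedDeltaFetch(DeltaFetch):
        """Hypothetical downstream subclass written against the old signature.

        Because this commit keeps __init__(dir, reset, stats) unchanged, an
        override like this keeps working without having to pass a crawler
        through.
        """

        def __init__(self, dir, reset=False, stats=None):
            super().__init__(dir, reset=reset, stats=stats)
            self.seen_keys = []  # extra bookkeeping added by the subclass

Instances built through from_crawler additionally receive the fingerprint attribute, so only that construction path depends on the running Scrapy version.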
23 changes: 10 additions & 13 deletions scrapy_deltafetch/middleware.py
@@ -26,21 +26,10 @@ class DeltaFetch(object):
     intensive).
     """

-    def __init__(self, dir, reset=False, stats=None, crawler=None):
+    def __init__(self, dir, reset=False, stats=None):
         self.dir = dir
         self.reset = reset
         self.stats = stats
-        if crawler and hasattr(crawler, 'request_fingerprinter'):
-            self.fingerprint=crawler.request_fingerprinter.fingerprint
-        else:
-            try:
-                # compatibility with Scrapy <2.7.0
-                from scrapy.utils.request import request_fingerprint
-                self.fingerprint=request_fingerprint
-            except ImportError:
-                # use the new default
-                from scrapy.utils.request import fingerprint
-                self.fingerprint=fingerprint

     @classmethod
     def from_crawler(cls, crawler):
@@ -49,9 +38,17 @@ def from_crawler(cls, crawler):
             raise NotConfigured
         dir = data_path(s.get('DELTAFETCH_DIR', 'deltafetch'))
         reset = s.getbool('DELTAFETCH_RESET')
-        o = cls(dir, reset, crawler.stats, crawler)
+        o = cls(dir, reset, crawler.stats)
         crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
         crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
+
+        try:
+            o.fingerprint = crawler.request_fingerprinter.fingerprint
+        except AttributeError:
+            from scrapy.utils.request import request_fingerprint
+
+            o.fingerprint = request_fingerprint
+
         return o

     def spider_opened(self, spider):
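
Reading crawler.request_fingerprinter inside from_crawler also means that, on Scrapy 2.7+, DeltaFetch keys its database with whatever REQUEST_FINGERPRINTER_CLASS the project configures instead of always using the legacy helper. A minimal sketch of a custom fingerprinter the middleware would then pick up (the class and setting below are hypothetical, not part of this change):

    from hashlib import sha1

    from scrapy import Request
    from scrapy.utils.python import to_bytes


    class CaseInsensitiveFingerprinter:
        # Hypothetical fingerprinter: hashes the lower-cased URL, so
        # http://example.com/A and http://example.com/a produce the same key
        # and DeltaFetch would treat the second request as already seen.
        def fingerprint(self, request: Request) -> bytes:
            return sha1(to_bytes(request.url.lower())).digest()


    # settings.py (hypothetical project):
    # REQUEST_FINGERPRINTER_CLASS = "myproject.fingerprints.CaseInsensitiveFingerprinter"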
22 changes: 16 additions & 6 deletions tests/test_deltafetch.py
@@ -130,7 +130,7 @@ def test_spider_opened_reset_non_existing_db(self):
         self.spider.deltafetch_reset = True
         mw.spider_opened(self.spider)
         assert mw.db.get(b'random') is None
-
+
     def test_spider_opened_recreate(self):
         self._create_test_db()
         mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats)
@@ -191,7 +191,12 @@ def test_process_spider_output(self):

     def test_process_spider_output_with_ignored_request(self):
         self._create_test_db()
-        mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
+        settings = {
+            "DELTAFETCH_DIR": self.temp_dir,
+            "DELTAFETCH_ENABLED": True,
+        }
+        crawler = get_crawler(Spider, settings_dict=settings)
+        mw = self.mwcls.from_crawler(crawler)
         mw.spider_opened(self.spider)
         response = mock.Mock()
         response.request = Request('http://url')
@@ -322,13 +327,18 @@ def __init__(self, dir, reset=False, *args, **kwargs):
         self.assertEqual(self.stats.get_value('deltafetch/stored'), None)

     def test_get_key(self):
-        mw = self.mwcls(self.temp_dir, reset=True)
+        settings = {
+            "DELTAFETCH_DIR": self.temp_dir,
+            "DELTAFETCH_ENABLED": True,
+            "DELTAFETCH_RESET": True,
+        }
+        crawler = get_crawler(Spider, settings_dict=settings)
+        mw = self.mwcls.from_crawler(crawler)
         test_req1 = Request('http://url1')
-        crawler = get_crawler(Spider)
         if _legacy_fingerprint:
-            fingerprint=request_fingerprint
+            fingerprint = request_fingerprint
         else:
-            fingerprint=RequestFingerprinter(crawler).fingerprint
+            fingerprint = RequestFingerprinter.from_crawler(crawler).fingerprint
         self.assertEqual(mw._get_key(test_req1),
                          to_bytes(fingerprint(test_req1)))
         test_req2 = Request('http://url2', meta={'deltafetch_key': b'dfkey1'})
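
With the fingerprint selection living in from_crawler, the updated tests build the middleware the same way Scrapy would at runtime: a crawler from get_crawler carrying the DELTAFETCH_* settings, then from_crawler on that crawler. A standalone sketch of that construction path, assuming a recent Scrapy and a throwaway directory (not an excerpt from the test suite):

    from tempfile import mkdtemp

    from scrapy import Spider
    from scrapy.utils.test import get_crawler

    from scrapy_deltafetch.middleware import DeltaFetch

    settings = {
        "DELTAFETCH_DIR": mkdtemp(),  # throwaway directory for the key database
        "DELTAFETCH_ENABLED": True,
        "DELTAFETCH_RESET": True,
    }
    crawler = get_crawler(Spider, settings_dict=settings)

    # from_crawler reads the DELTAFETCH_* settings, connects the spider_opened /
    # spider_closed signals, and picks a fingerprint function that matches the
    # installed Scrapy version.
    mw = DeltaFetch.from_crawler(crawler)
    print(mw.dir, mw.reset, mw.fingerprint)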