Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion scrapy_deltafetch/middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def process_spider_output(self, response, result, spider):
for r in result:
if isinstance(r, Request):
key = self._get_key(r)
if key in self.db:
if key in self.db and not self._is_ignored(r):
logger.info("Ignoring already visited: %s" % r)
if self.stats:
self.stats.inc_value('deltafetch/skipped', spider=spider)
def _get_key(self, request):
    """Return the bytes key under which *request* is tracked in the db.

    An explicit ``deltafetch_key`` in the request meta wins; otherwise the
    request fingerprint is used.
    """
    explicit = request.meta.get('deltafetch_key')
    key = explicit if explicit else request_fingerprint(request)
    # request_fingerprint() yields hashlib.sha1().hexdigest(), i.e. a str,
    # so normalize to bytes for the db lookup.
    return to_bytes(key)

def _is_ignored(self, request):
return request.meta.get('deltafetch_ignore', False)
26 changes: 26 additions & 0 deletions tests/test_deltafetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,32 @@ def test_process_spider_output(self):
b'test_key_2']))
assert mw.db[b'key']

def test_process_spider_output_with_ignored_request(self):
    """Requests carrying ``deltafetch_ignore`` bypass the seen-keys filter."""
    self._create_test_db()
    mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
    mw.spider_opened(self.spider)
    response = mock.Mock()
    response.request = Request('http://url')
    # An empty result iterable passes through as an empty list.
    self.assertEqual(
        list(mw.process_spider_output(response, [], self.spider)), [])
    batch = [
        # same URL as the response, deltafetch_ignore set
        # --> it should be processed
        Request('http://url', meta={'deltafetch_ignore': True}),
        # 'test_key_1' is already in the test db, but deltafetch_ignore
        # flag is set --> it should be processed
        Request('http://url1',
                meta={
                    'deltafetch_key': 'test_key_1',
                    'deltafetch_ignore': True
                }),
    ]
    # Both requests survive filtering despite the pre-seeded db entry.
    self.assertEqual(
        list(mw.process_spider_output(response, batch, self.spider)),
        batch)

def test_process_spider_output_dict(self):
self._create_test_db()
mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
Expand Down