From e37f89767bda5052618733eb5546243d64770e82 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20=C5=81acek?=
Date: Tue, 22 Jun 2021 14:32:34 +0200
Subject: [PATCH] Allow user to ignore some requests

A request whose meta carries a truthy ``deltafetch_ignore`` value is never
skipped by process_spider_output(), even when its key is already in self.db.
---
Review note: _is_ignored() now tests truthiness instead of ``is not None``, so
``deltafetch_ignore: False`` no longer opts a request out. The diffstat and the
second hunk's line count were updated for the added docstring line; the blob
ids on the index line were not regenerated.

 scrapy_deltafetch/middleware.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/scrapy_deltafetch/middleware.py b/scrapy_deltafetch/middleware.py
index 86311c5..d82d1f3 100644
--- a/scrapy_deltafetch/middleware.py
+++ b/scrapy_deltafetch/middleware.py
@@ -76,7 +76,7 @@ def process_spider_output(self, response, result, spider):
         for r in result:
             if isinstance(r, Request):
                 key = self._get_key(r)
-                if key in self.db:
+                if key in self.db and not self._is_ignored(r):
                     logger.info("Ignoring already visited: %s" % r)
                     if self.stats:
                         self.stats.inc_value('deltafetch/skipped', spider=spider)
@@ -92,3 +92,7 @@ def _get_key(self, request):
         key = request.meta.get('deltafetch_key') or request_fingerprint(request)
         # request_fingerprint() returns `hashlib.sha1().hexdigest()`, is a string
         return to_bytes(key)
+
+    def _is_ignored(self, request):
+        """Return True when a truthy ``deltafetch_ignore`` meta value is set."""
+        return bool(request.meta.get('deltafetch_ignore'))