diff --git a/README.rst b/README.rst index 65045b3..d98b856 100644 --- a/README.rst +++ b/README.rst @@ -64,6 +64,7 @@ Supported Scrapy settings * ``DELTAFETCH_ENABLED`` — to enable (or disable) this extension * ``DELTAFETCH_DIR`` — directory where to store state * ``DELTAFETCH_RESET`` — reset the state, clearing out all seen requests +* ``DELTAFETCH_USE_REDIRECT_URL`` — for redirected requests, store the fingerprint of the first URL in ``redirect_urls`` (the originally requested URL) instead of the final one These usually go in your Scrapy project's ``settings.py``. diff --git a/scrapy_deltafetch/middleware.py b/scrapy_deltafetch/middleware.py index ded0843..6fb3c60 100644 --- a/scrapy_deltafetch/middleware.py +++ b/scrapy_deltafetch/middleware.py @@ -25,7 +25,7 @@ class DeltaFetch(object): intensive). """ - def __init__(self, dir, reset=False, stats=None): + def __init__(self, dir, reset=False, store_redirect_url=False, stats=None): dbmodule = None try: dbmodule = __import__('bsddb3').db @@ -34,6 +34,7 @@ def __init__(self, dir, reset=False, stats=None): self.dbmodule = dbmodule self.dir = dir self.reset = reset + self.store_redirect_url = store_redirect_url self.stats = stats @classmethod @@ -43,7 +44,8 @@ def from_crawler(cls, crawler): raise NotConfigured dir = data_path(s.get('DELTAFETCH_DIR', 'deltafetch')) reset = s.getbool('DELTAFETCH_RESET') - o = cls(dir, reset, crawler.stats) + store_redirect_url = s.getbool('DELTAFETCH_USE_REDIRECT_URL', False) + o = cls(dir, reset, store_redirect_url, crawler.stats) crawler.signals.connect(o.spider_opened, signal=signals.spider_opened) crawler.signals.connect(o.spider_closed, signal=signals.spider_closed) return o @@ -82,7 +84,12 @@ def process_spider_output(self, response, result, spider): self.stats.inc_value('deltafetch/skipped', spider=spider) continue elif isinstance(r, (BaseItem, dict)): - key = self._get_key(response.request) + req = response.request + redirect_urls = req.meta.get('redirect_urls', False) + + if self.store_redirect_url and redirect_urls: + req = req.replace(url=redirect_urls[0]) + key = 
self._get_key(req) self.db[key] = str(time.time()) if self.stats: self.stats.inc_value('deltafetch/stored', spider=spider)