Drop deprecated bsddb support and Python 2.6 support
redapple committed Jun 27, 2016
commit bf467acfaa6853c68546264cb7b51bee2ac38373
8 changes: 8 additions & 0 deletions .bumpversion.cfg
@@ -0,0 +1,8 @@
[bumpversion]
current_version = 1.7.0
commit = True
tag = True
tag_name = {new_version}

[bumpversion:file:setup.py]

31 changes: 31 additions & 0 deletions .travis.yml
@@ -0,0 +1,31 @@
language: python
python: 3.5

sudo: false

env:
  matrix:
    - TOXENV=py27
    - TOXENV=py35

#addons:
#  apt:
#    packages:
#    - language-pack-fr
#    - libdb-dev

install: pip install -U tox

script: tox

#deploy:
#  provider: pypi
#  user: scrapy-plugins
#  distributions: sdist bdist_wheel
#  password:
#    secure: iKVlMlKSr+LOuCCMMOqL65aYjNRy3k1Zb4d7NRN0JpWS5DGau8G8cEhJ1dY4uyc/DNKVJmd939OiLBsUqqCmz09+ozen/YrRNjEZS5lOwBNfhpiCESkbOjcInV1PQgx2XfuHGp8O/9vxtXjjH9WE9CabQ+8Zg5/rMMvXizT4/O4=
#  on:
#    tags: true
#    all_branches: true
#    repo: scrapy-plugins/scrapy-deltafetch
#    condition: $TOXENV = py27
2 changes: 2 additions & 0 deletions requirements.txt
@@ -0,0 +1,2 @@
scrapy>=1.0
bsddb3
1 change: 1 addition & 0 deletions scrapy_deltafetch/__init__.py
@@ -0,0 +1 @@
from .middleware import DeltaFetch
106 changes: 106 additions & 0 deletions scrapy_deltafetch/middleware.py
@@ -0,0 +1,106 @@
import os, time

from scrapy.http import Request
from scrapy.item import BaseItem
from scrapy.utils.request import request_fingerprint
from scrapy.utils.project import data_path
from scrapy.utils.python import to_bytes
from scrapy.exceptions import NotConfigured
from scrapy import log, signals


class DeltaFetch(object):
    """This is a spider middleware to ignore requests to pages containing items
    seen in previous crawls of the same spider, thus producing a "delta crawl"
    containing only new items.

    This also speeds up the crawl by reducing the number of requests that need
    to be crawled and processed (typically, item requests are the most CPU
    intensive).

    Supported settings:

    * DELTAFETCH_ENABLED - to enable (or disable) this extension
    * DELTAFETCH_DIR - directory where to store state
    * DELTAFETCH_RESET - reset the state, clearing out all seen requests

    Supported spider arguments:

    * deltafetch_reset - same effect as DELTAFETCH_RESET setting

    Supported request meta keys:

    * deltafetch_key - used to define the lookup key for that request. By
      default it's the request fingerprint, but it can be changed to contain
      an item id, for example. This requires support from the spider, but
      makes the extension more efficient for sites that have many URLs for the same item.

    """

    def __init__(self, dir, reset=False, stats=None):
        dbmodule = None
        try:
            dbmodule = __import__('bsddb3').db
        except ImportError:
            raise NotConfigured('bsddb3 is required')
        self.dbmodule = dbmodule
        self.dir = dir
        self.reset = reset
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        s = crawler.settings
        if not s.getbool('DELTAFETCH_ENABLED'):
            raise NotConfigured
        dir = data_path(s.get('DELTAFETCH_DIR', 'deltafetch'))
        reset = s.getbool('DELTAFETCH_RESET')
        o = cls(dir, reset, crawler.stats)
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
        return o

    def spider_opened(self, spider):
        if not os.path.exists(self.dir):
            os.makedirs(self.dir)
        dbpath = os.path.join(self.dir, '%s.db' % spider.name)
        reset = self.reset or getattr(spider, 'deltafetch_reset', False)
        flag = self.dbmodule.DB_TRUNCATE if reset else self.dbmodule.DB_CREATE
        try:
            self.db = self.dbmodule.DB()
            self.db.open(filename=dbpath,
                         dbtype=self.dbmodule.DB_HASH,
                         flags=flag)
        except Exception:
            spider.log("Failed to open DeltaFetch database at %s, "
                       "trying to recreate it" % dbpath)
            if os.path.exists(dbpath):
                os.remove(dbpath)
            self.db = self.dbmodule.DB()
            self.db.open(filename=dbpath,
                         dbtype=self.dbmodule.DB_HASH,
                         flags=self.dbmodule.DB_CREATE)

    def spider_closed(self, spider):
        self.db.close()

    def process_spider_output(self, response, result, spider):
        for r in result:
            if isinstance(r, Request):
                key = self._get_key(r)
                if self.db.has_key(key):
                    spider.log("Ignoring already visited: %s" % r, level=log.INFO)
                    if self.stats:
                        self.stats.inc_value('deltafetch/skipped', spider=spider)
                    continue
            elif isinstance(r, BaseItem):
                key = self._get_key(response.request)
                self.db[key] = to_bytes(str(time.time()))  # bsddb3 needs a bytes value on Python 3
                if self.stats:
                    self.stats.inc_value('deltafetch/stored', spider=spider)
            yield r

    def _get_key(self, request):
        key = request.meta.get('deltafetch_key') or request_fingerprint(request)
        # request_fingerprint() returns hashlib.sha1().hexdigest(), which is a string
        return to_bytes(key)
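For context, here is a minimal usage sketch for the middleware above, exercising only the settings and the deltafetch_key meta key documented in the class docstring. The project settings, spider, site, and item-id scheme are hypothetical and not part of this commit.

# settings.py (sketch): register and enable the middleware
SPIDER_MIDDLEWARES = {
    'scrapy_deltafetch.DeltaFetch': 100,
}
DELTAFETCH_ENABLED = True
DELTAFETCH_DIR = 'deltafetch'   # per-spider <name>.db files are stored here
# DELTAFETCH_RESET = True       # uncomment to clear the seen-requests state

# spider module (sketch): use an item id instead of the request fingerprint
# as the lookup key via the deltafetch_key meta key; the key is recorded only
# when the callback yields an item (a BaseItem subclass) for that response.
import scrapy


class ProductItem(scrapy.Item):
    id = scrapy.Field()
    title = scrapy.Field()


class ProductsSpider(scrapy.Spider):
    name = 'products'                            # hypothetical spider
    start_urls = ['http://example.com/catalog']  # hypothetical site

    def parse(self, response):
        for href in response.css('a.product::attr(href)').extract():
            item_id = href.rstrip('/').split('/')[-1]  # hypothetical id scheme
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse_product,
                                 meta={'deltafetch_key': item_id})

    def parse_product(self, response):
        yield ProductItem(id=response.url.rstrip('/').split('/')[-1],
                          title=response.css('h1::text').extract_first())

On a second run with the same DELTAFETCH_DIR, requests whose key is already present in the spider's .db file are dropped and counted under deltafetch/skipped, while keys for newly yielded items are recorded and counted under deltafetch/stored.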
24 changes: 24 additions & 0 deletions setup.py
@@ -0,0 +1,24 @@
from setuptools import setup

setup(
    name='scrapy-deltafetch',
    version='1.7.0',
    license='BSD',
    description='Scrapy middleware to ignore previously crawled pages',
    author='Scrapinghub',
    author_email='[email protected]',
    url='http://github.com/scrapy-plugins/scrapy-deltafetch',
    packages=['scrapy_deltafetch'],
    platforms=['Any'],
    classifiers=[
        'Development Status :: 4 - Beta',
        'License :: OSI Approved :: BSD License',
        'Operating System :: OS Independent',
        'Programming Language :: Python',
        'Programming Language :: Python :: 2',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.5',
    ],
    install_requires=['Scrapy>=1.0', 'bsddb3']
)