diff --git a/Pipfile b/Pipfile new file mode 100644 index 0000000..bf85629 --- /dev/null +++ b/Pipfile @@ -0,0 +1,12 @@ +[[source]] +url = "https://pypi.python.org/simple" +verify_ssl = true +name = "pypi" + +[dev-packages] +tox = "*" +pylint = "*" + +[packages] +scrapy = ">=1.1.0" +"bsddb3" = "*" diff --git a/Pipfile.lock b/Pipfile.lock new file mode 100644 index 0000000..737ac5c --- /dev/null +++ b/Pipfile.lock @@ -0,0 +1,467 @@ +{ + "_meta": { + "hash": { + "sha256": "a4cb946f1a4691babdac739ff1c610e83d20a8401eb5322a0efcf05ea42f8e96" + }, + "host-environment-markers": { + "implementation_name": "cpython", + "implementation_version": "0", + "os_name": "posix", + "platform_machine": "x86_64", + "platform_python_implementation": "CPython", + "platform_release": "4.10.0-35-generic", + "platform_system": "Linux", + "platform_version": "#39-Ubuntu SMP Wed Sep 13 07:46:59 UTC 2017", + "python_full_version": "2.7.13", + "python_version": "2.7", + "sys_platform": "linux2" + }, + "pipfile-spec": 6, + "requires": {}, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.python.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "asn1crypto": { + "hashes": [ + "sha256:654b7db3b120e23474e9a1e5e38d268c77e58a9e17d2cb595456c37309846494", + "sha256:0874981329cfebb366d6584c3d16e913f2a0eb026c9463efcc4aaf42a9d94d70" + ], + "version": "==0.23.0" + }, + "attrs": { + "hashes": [ + "sha256:a7e0d9183f6457de12df7ba6a81f6569c7d6b25f67ad509b5ad52e8545970a2f", + "sha256:5d4d1b99f94d69338f485984127e4473b3ab9e20f43821b0e546cc3b2302fd11" + ], + "version": "==17.2.0" + }, + "automat": { + "hashes": [ + "sha256:2140297df155f7990f6f4c73b2ab0583bd8150db9ed2a1b48122abe66e9908c1", + "sha256:3c1fd04ecf08ac87b4dd3feae409542e9bf7827257097b2b6ed5692f69d6f6a8" + ], + "version": "==0.6.0" + }, + "bsddb3": { + "hashes": [ + "sha256:784bf40ad935258507594a89b32ea11f362cde120751c8b96de163955ced7db8" + ], + "version": "==6.2.5" + }, + "cffi": { + "hashes": [ + "sha256:eddf6bd95b100f34c56a226847be92bde416d028213e87fd700bc593cbf3e9da", + "sha256:929d206ecfefadcec0b0e13956131eec7442829d240da020d67274fefb9e61a4", + "sha256:040e0b5462c65a39b7112ad9e951f3f57b487d0db053ea510dcf8fc53931ab4b", + "sha256:d361e8c504bae02f665fad8d5ba4b74a6c1fab7862eec39cddcc21e4479fa275", + "sha256:56a8c39e6e7225474b36c16a03240fcbbfc5c8cb016645793e0dec2d2f9bdf67", + "sha256:4d2e388ce51e6a4efae8d4533ca15503ea957c41233bad165f8ccec354edca67", + "sha256:c091a1ac521dac2770632f499c124762ddf978d23ced0472f19a8830e006eab8", + "sha256:5c20f34a6858cd8c59eac23f80bc80bb5fd294bd9b22b264255a3eec0ab459f5", + "sha256:b45efc8d81640a8e9122c1ba1817da2a998ca0168c40de7bbfc47b960771ca94", + "sha256:f2ffdfd930bc4bcd4900def32fbb2a16a91656d9caafff573a25934327140431", + "sha256:bd93a0df4dc72893ba604c05d487ea4d47a799e54172b89e414953685b0b08fb", + "sha256:45233398f5e7114e719d0b83ffd0642af7fbf05136195f500fb6b72133c8e998", + "sha256:47853d133f0de32697b55e0c2ee377d88e3143bc8645a3b93d38be2f5387b0a2", + "sha256:6f1a2f85510c675b068e29625ff534f02567f24839cbd7d61cda6b7df62da2b1", + "sha256:6153b26756de2f34f80ca4e8b1e79011ad62bb670539dc3df71e62909a112fdf", + "sha256:5152bbde6ee514a11e2cd1b76a7ccab5af1043356ec3df9efe0112d97b20dae3", + "sha256:42b38cd48e55b2a4c5f038f80137c2cb4393fd4e4d1c3dbeeb7aea9475b68ea1", + "sha256:f6202eab19fd2c44d7d7af2bb8d286cf18b7b3d67b0e5bf1f2f113c99303ff87", + "sha256:ec1112466a924ba69df5a90f31d26d323e3b91ce21f94b10fa0b77f44a07df48", + "sha256:7a55f044dd78fdca379d455550c581b9455af022b8f8b5d97f585a25f4da63f6", + "sha256:fd3c49d7d22ba75382e19c4877e1d4dca232ef48137442f8582c2259b0cdf612", + "sha256:a76b89adecd46138aafdbd348563df1deb714962b5b3659ee1b6b0c5cf6ae0e5", + "sha256:dd9577197fff563e4784ef4ace7da28c7da96a81895b69c008cdb82f1e82853c", + "sha256:2217d22cb001386947693bd93ca4370224989468daf2939f3bf717956fb0a119", + "sha256:59fc30a6f7331fe69cfc67ef151fd03a33491d25430ae0c4597e69c4235fa87e", + "sha256:8a5632592b2752375b6d9e81da022896ece235713129fcc33703b3e10c639ef9", + "sha256:48d6589fda4ec0ccb7d973807890a733381f828d29167177eb7d97892255a769", + "sha256:f14c53dc624529eef175c993ceb886f37ecece09825bf47bf3230aacbc3b7acc", + "sha256:d8930a719781987a876d6de8cf33ce937191c383158daafcb9db804bf3421adf", + "sha256:765cf591a2f790b012075c7f54344a02b622a8561708ec27b3042c8c1699a465", + "sha256:5f4ff33371c6969b39b293d9771ee91e81d26f9129be093ca1b7be357fcefd15" + ], + "version": "==1.11.0" + }, + "constantly": { + "hashes": [ + "sha256:dd2fa9d6b1a51a83f0d7dd76293d734046aa176e384bf6e33b7e44880eb37c5d", + "sha256:586372eb92059873e29eba4f9dec8381541b4d3834660707faf8ba59146dfc35" + ], + "version": "==15.1.0" + }, + "cryptography": { + "hashes": [ + "sha256:b17e4244212c9ec6526b6ad47534a12e0378387ad86228141053c112e759769d", + "sha256:1fb7dbf630daba53494da88de1e87429f2740e6ffdc92bbdcb385ddb9ff67b32", + "sha256:369e4669cac8296e320f86d0a3c9f489b7593d2a144e7b0504b884bed56fd5d7", + "sha256:15d655434cf6f350462c6c97f24cbf42ee0e6270886a7937eaaa52fd09e474ce", + "sha256:b25d97585f1d2b2f16560737aba6257a10ca18aa3a2a70407c05fe8b64b375f9", + "sha256:59d2b285fb24d30deaf4143c2d048fcefdd7e560f30cd1745da881705ca965c6", + "sha256:fe6d56a4ff08875d70cc9b336fcd6e938db5808cb47cbc33c6927affb659aded", + "sha256:d32dad8448ea88fe8eb02dc943d46db658cf9e618386761e498802c79e97c937", + "sha256:d44db8e4435a10dcbf22fbc03072d18b31e6c1100084a2010239efd359f2f162", + "sha256:a972eb37b6409a2d67ddd956e6bf74f2e9315105083d1fdaa8bbe6eb520dc367", + "sha256:fb7dfc4b3a79ca4b9233803b80617fd302374e57771c9a28e32ee6c1296cdea3", + "sha256:795af466e52daf404d783877157b106bc79e0cffa55f8d6360fa9f9fd03148bd", + "sha256:4ffc706fbe4c12146050d432250896f2fcad8b77c6fd89d3c3bd456884dbf421", + "sha256:317127295b14f5c943ed789ab60ade0d7d39041f8975d58a214af19dd1148705", + "sha256:5c9e46b219e2b65ab9192e10407d7698589794407f15f11f39abbf3b04d16890", + "sha256:27b7b005debcf4c0562299d5ceeaea151120eaad7bd84ef91fb22c204a958f03", + "sha256:8bbff1de3002db5c7146b88fea392526260becc153adc8810103fc750c3eb333", + "sha256:bb715b0d8d5c9111e9656eabe58ee7b95842a797c04608aef7f52439ab73d5f3", + "sha256:bc86d3de2c7d63f3c73dff24fa43c55c00c7b8ea8102ed638c903c939b85e6a6", + "sha256:2f0b681e89530750e4c0adf98701cafd0384ac160ceafe39c9d10be6034d2cc7", + "sha256:5544d9433f9c25d55d609e2b014d47c390288bb98ae84e35b8cf93e9f6a6c832", + "sha256:55cc98c93323efcf6f3300b6e9a7d435146a9d451ffd27aa14074f576f275786", + "sha256:4ef205496c518aae7e699731dfff4b630c046ea9dbbe34e8ec8ce77bb399c1c7", + "sha256:886759453c5b93b1e8953e92abee65ee09fdf78bf655c95d0a206d6387adbfed", + "sha256:0f3dcdd06c5491cad2ed283ebc802805e3ac77ffe4789ad1b44ff0f0f891f59b", + "sha256:8067fce92ec98cb52fabe2e10205ff27dba5755d9ffc1f521be9c504f9a6d5fc", + "sha256:5a266ff306683cd82b2f19203f469b7580aee115f1818d2b2d1e12f1ee7fd12b", + "sha256:d5341dac22e8ab080e637c632d1b1e92acaa8e66a02d139c0c795c1797531384", + "sha256:a83318144a0037f264891ddfcbae9b75303949bb0efaece9713df5e04388b7ef", + "sha256:d04bb2425086c3fe86f7bc48915290b13e798497839fbb18ab7f6dffcf98cc3a" + ], + "version": "==2.0.3" + }, + "cssselect": { + "hashes": [ + "sha256:4f5f799a1d3182b04814007e9e7fc6c362f4489c7420d6b348cc901ece07ced9", + "sha256:73db1c054b9348409e2862fc6c0dde5c4e4fbe4da64c5c5a9e05fbea45744077" + ], + "version": "==1.0.1" + }, + "hyperlink": { + "hashes": [ + "sha256:1ec8e11fb4f5b330f25864bf8cfd3133dff1a3637dfd14fa441297df15fc7cf9", + "sha256:bc4ffdbde9bdad204d507bd8f554f16bba82dd356f6130cb16f41422909c33bc" + ], + "version": "==17.3.1" + }, + "idna": { + "hashes": [ + "sha256:8c7309c718f94b3a625cb648ace320157ad16ff131ae0af362c9f21b80ef6ec4", + "sha256:2c6a5de3089009e3da7c5dde64a141dbc8551d5b7f6cf4ed7c2568d0cc520a8f" + ], + "version": "==2.6" + }, + "incremental": { + "hashes": [ + "sha256:717e12246dddf231a349175f48d74d93e2897244939173b01974ab6661406b9f", + "sha256:7b751696aaf36eebfab537e458929e194460051ccad279c72b755a167eebd4b3" + ], + "version": "==17.5.0" + }, + "lxml": { + "hashes": [ + "sha256:3593f49858fc6229cd93326be06b099ae477fd65d8f4a981320a6d0bb7fc7a5a", + "sha256:8996df6b0f205b00b89bbd04d88f1fa1e04139a025fd291aa4ddd05dc86836f4", + "sha256:9f399c37b8e61c3989ef12ecf0abd9c10a5075f0fc9ad1ecd67ce6f9c72a7211", + "sha256:550a51dee73c14e5863bdbbbe5836b2b8092a3f92631b5a908b9c41e72f123a5", + "sha256:e37eda3d05519918403084b43eb7324df21a0daf45c8ae8172a860710dd0fa78", + "sha256:48ab0e79175fd16f9478edc679ee14c79524c64b26f665f92cbecff81312d04d", + "sha256:52e18dd86f153c4383bb4c4ef62f81f9b7e44809d068848a5a183b2285496faf", + "sha256:0b8f3d6e669ea26849a6184f04c7802dbef6fd418a8b90e6c026e237db07af31", + "sha256:567b76f291a8d02aa8b4d3f8295ae749ac4d532570d8a8c7176f0556c7d95891", + "sha256:61825daaf2d80dc3da7635ee108720b0739962db008343822753bbf343cbfd14", + "sha256:b7f6ef610680709be11cb7662e46e254bc561dafe0de3b4848be2cf3924bd300", + "sha256:824664493a012f5b70396e841a4b4049bdaf445a70307e60f82fe35619f72cc7", + "sha256:e908d685800626f10cd6ae01a013fc42094be167fb2a683eb920dfddfaa0ee76", + "sha256:10c86b2248043f4428be33ed10202764b02b281eaa4550f16f0fbbc6ccaae9ac", + "sha256:d9ec728caddb161405e7c33ed9d176e96309893481370163bbf4b00e43008795", + "sha256:b2ecb3fd5470b740dfc21b064bbc1337be4b7b805994a868488145d36f35f517", + "sha256:a211288459c9115ddb255ff88e8ac12dc2063e70bddc15e3c65136477a358bb5", + "sha256:1f81074e77c25f9b787fa3854f400ca924d3d798cb7ae910c0e7920be7138c90", + "sha256:99b7eabfb46663ed5918eca4ed12420613ba24196964a741ccd962d09296c0b2", + "sha256:a8ad0adeedbbb7b85916214fcd4f5d02829d0e7b3c32abc298789218b6c3d699", + "sha256:88d137e440b5de35df2e0616da8e28a88d0119abdaa84520ad1ba815ee9da732", + "sha256:c4e02657e629f02ab8712471d77d6896c2cf6f09f8ffa6a0f23b1b1ef0318474", + "sha256:9581b438e5d0d0a6fa3937fac2abffd95380bd513bcd39f6952bfcf20cf0b9a7", + "sha256:c446fde3284c363cd2085ad1ce5a07c18f15f6766d72684622bc14b0a9ddfd29", + "sha256:d4507916c408feec2ea8cee3f0d1380e49ea431f6e07b0dd927388bd6e92d6eb", + "sha256:7030f44b758e930fd09ade87d770f5a231a19a8c561a3acc54e5122b5ec09e29", + "sha256:d78c0a114cf127a41a526aef99aef539c0b2537e57f04a2cc7a49e2c94a44ab8", + "sha256:f7bc9f702500e205b1560d620f14015fec76dcd6f9e889a946a2ddcc3c344fd0" + ], + "version": "==4.0.0" + }, + "parsel": { + "hashes": [ + "sha256:29eb4bd74e22e41138ea8ed96ce8e477b8116f97a13a991e39cb150fdde7eabd", + "sha256:1c5a31cb3f0fdca3d19f2ef966d595df69181c2d81b6c56582732c819e2f4e26" + ], + "version": "==1.2.0" + }, + "pyasn1": { + "hashes": [ + "sha256:a1b5e7fb8ee082217576ac2e91ad2b5e4c590277fbe86b197b447424e7472f13", + "sha256:72540e4846b0e1f659723558f63a62abe7106becb6ebb35c05b5f7803caa53e8", + "sha256:686f0cbfa3f3587ab29a7e6477a587f4e72d74e6f5d8b8a14de7e47a53f2b345", + "sha256:6f318b30b5592d2f69acb06ea79f557e01ee0acab9a83a0f3cf1747f637df550", + "sha256:06afc633971ab80943f06b96d3d6314f461001c92418fc0cd682a8357a1db47f", + "sha256:ac60ffbd5d2d3f9aeb7c73938526c4141da849dd57a7e1084864324aed9eb986", + "sha256:fc6fec92d096b24aabd6f241817af14dabddbd5f3aef6c78f849614e6073bb7d", + "sha256:4dd57f10fd90ac5e2da7fcbb08ba9d88acb1ab8b1ffa532b6934ae30b2dcdbe5", + "sha256:5b1822324613a02a064a8baebbed02afbff1b9077ab5b761d6424c762c4f3de4", + "sha256:bbc702abbec2da12eb726f9d534296bb2ad90fa1be977a0214ce157cede5eb26", + "sha256:325b55a20bf9648ee9400a18aa8b13f6073e0d3057fc77b9778b743f27ccce8b", + "sha256:f0380ea97db0ede095a0dd87ce3003d46c197191f924206e43f776fc77e51f09" + ], + "version": "==0.3.6" + }, + "pyasn1-modules": { + "hashes": [ + "sha256:ca3cdc5d8ecc97f78a202009512254bd7aeef77069405e9e834b45c48a26a4db", + "sha256:2e39aead1602e4e5aa8b796260848f9383e51bea278edf7d2b4fb686944425fa", + "sha256:641cc18cf56d4a60679804aa8ae8cbc7359b3fa2d5559df064ecfaca27d15b10", + "sha256:7411e837c83ea4cb6088aa3bf63691d348454fcd9bb3d9cc342859282419bfcf", + "sha256:529d5d509562c22877a53771c5b394d8102c98cd62f33bb5af23b475879ea6d5", + "sha256:a24f5118f41af33f13fa0c7e8419fc7380dfe2d2f2dc0f554d928141c842f924", + "sha256:ab7e23e45f32f0515e5a084eecaff22653bb6277d0d4229cb7369117abf33baa", + "sha256:6ad0e6772af4b74bd63c78c0102ece7b6a775f764e395137968af575c20bbfc9", + "sha256:34e1d014608ca4f8a0cc3164a5add93f4b8a04a3871b96d31a028ff36a6fe924", + "sha256:beb3d344fee1fa68ddf36471c5d9120665ba049900d7fccbffa50c77036581de", + "sha256:7930d0f6109a47f78e5fb88a4f7ed2bfa1073ec9ddb2657deffa92f0805568fb", + "sha256:b07c17bdb34d6f64aafea6269f2e8fb306a57473f0f38d9a6ca389d6ab30ac4a" + ], + "version": "==0.1.4" + }, + "pycparser": { + "hashes": [ + "sha256:99a8ca03e29851d96616ad0404b4aad7d9ee16f25c9f9708a11faf2810f7b226" + ], + "version": "==2.18" + }, + "pydispatcher": { + "hashes": [ + "sha256:5570069e1b1769af1fe481de6dd1d3a388492acddd2cdad7a3bde145615d5caf", + "sha256:5be4a8be12805ef7d712dd9a93284fb8bc53f309867e573f653a72e5fd10e433" + ], + "version": "==2.0.5" + }, + "pyopenssl": { + "hashes": [ + "sha256:aade9985b93eaec51b0c0a2a60d14bb8dcff1ff8e36fe542e3c22812ec07315e", + "sha256:29630b9064a82e04d8242ea01d7c93d70ec320f5e3ed48e95fcabc6b1d0f6c76" + ], + "version": "==17.3.0" + }, + "queuelib": { + "hashes": [ + "sha256:c97f2120e070e2b8a5e8a60afe10bd7b3ae70c0994d6bb9c4d802b0448241339", + "sha256:a6829918157ed433fafa87b0bb1e93e3e63c885270166db5884a02c34c86f914" + ], + "version": "==1.4.2" + }, + "scrapy": { + "hashes": [ + "sha256:8f14888a86cc94d51ec8cdfdbd8c8a76c8bea7d6244bc4e4cc89c1958c58ed5d", + "sha256:04a08f027eef5d271342a016439533c81ba46f14bfcf230fecf602e99beaf233" + ], + "version": "==1.4.0" + }, + "service-identity": { + "hashes": [ + "sha256:0e76f3c042cc0f5c7e6da002cf646f59dc4023962d1d1166343ce53bdad39e17", + "sha256:4001fbb3da19e0df22c47a06d29681a398473af4aa9d745eca525b3b2c2302ab" + ], + "version": "==17.0.0" + }, + "six": { + "hashes": [ + "sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb", + "sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9" + ], + "version": "==1.11.0" + }, + "twisted": { + "hashes": [ + "sha256:7bc3cdfd1ca5e5b84c7936db3c2cb2feb7d5b77410e713fd346da095a3b6a1d2", + "sha256:716805e624f9396fcc1f47e8aef68e629fd31599a74855b6e1636122c042458d", + "sha256:0da1a7e35d5fcae37bc9c7978970b5feb3bc82822155b8654ec63925c05af75c" + ], + "version": "==17.9.0" + }, + "w3lib": { + "hashes": [ + "sha256:7b661935805b7d39afe90bb32dec8e4d20b377e74c66597eb1ddfad10071938e", + "sha256:c48731d5d73cde86f9c3c2bd6898d165f670120427353a7c8f9d6c685561d3c4" + ], + "version": "==1.18.0" + }, + "zope.interface": { + "hashes": [ + "sha256:9902d5fc11309e17cdce6574243dc114b9c30de5c60ab53c90f6e3e962688565", + "sha256:4cb1c56b0356da9a33249ef77a688c47107f54191c12a0055d284b6bee7f447e", + "sha256:ff20038fbc0e7ea050a7e28fcb8ae6ed8378a8d08ac70b848ea39960dda86bbf", + "sha256:f6868378fffbb8651f1f8a767d17e42aed39926c8f6bb9c56f184022fe6c2090", + "sha256:a6375035a4b45d199a8b990e3a2f6b71906c318c56dfc14b2d58350b6ca59392", + "sha256:dec19181cf6af58ccb8ba3fa3ca9d4ec555b2f3cb31f589f6e86d15df0926c31", + "sha256:b8f3491c9df4f0ffed32b275033e74041f420e5dcdefa4b1500d753c64ef42cf", + "sha256:5d8813e438ab67a793b09e1223742b757dd95a4a64d466855a53cb113cc9c9c4", + "sha256:5a8cc535f4212b134e66a3e1c6b93b19d453dbad0e2f89d0df2c01deefc8cad9", + "sha256:bd626cd76b7e5cbecac9d3e0dd8f98e3eada15ead95713238a523f877327633d", + "sha256:16fe824b3d93ee0629aa1f04848a1b515d6b5dc9e98cc7a04feaa35fdb0de5f1", + "sha256:f47d4138405eb67e5f059b9ab74e0a1147adc3277f5fe37d5bae5209b67e89e7", + "sha256:8dfdc1588db31895f81bcba6c36dc981b4cf4a526c62eae3745bbfbe102477ef", + "sha256:88e3d54e88a601f45d03e2a062d5d16852d20e0863a92c19260ae72e2586378a", + "sha256:3d033abd27cd54157cf42a3bfd4d8c28d7fc5c6f775df3332307d2632a79925b", + "sha256:a21d69de2ee89fc59de93e7a43c0379ecedb5149739ff94e910c2bf0ca18e181", + "sha256:aef398a5b92e70b8152d2c4850bad0fe185adb50d948f32d0bba5694d82b67c7", + "sha256:11b068fc9916556f3820f38c2376c28d8e55e4a2c51c34915aaac38b75706d2e", + "sha256:78321a6c0c8cc6ac928e44ef04d50384bc864a7f5e3c25b84110da2ede83739f", + "sha256:4be05f79e952793f31a0c2d6a0672c81a3300315da587ce6a590357595217005", + "sha256:1d954d557b63124a65f2247ac6ed66fa36df18d1e8538d08c9b432e808a634de", + "sha256:a16a3e07511fb6806bb48c8c661d38cdb91cd4bc6c2b6b0b173e72362ec1ceb4", + "sha256:d6d26d5dfbfd60c65152938fcb82f949e8dada37c041f72916fef6621ba5c5ce" + ], + "version": "==4.4.3" + } + }, + "develop": { + "astroid": { + "hashes": [ + "sha256:39a21dd2b5d81a6731dc0ac2884fa419532dffd465cdd43ea6c168d36b76efb3", + "sha256:492c2a2044adbf6a84a671b7522e9295ad2f6a7c781b899014308db25312dd35" + ], + "version": "==1.5.3" + }, + "backports.functools-lru-cache": { + "hashes": [ + "sha256:4ba998e881f285c1d1b73f5b6e3766539b4e162320f9589334400c5ddc35198c", + "sha256:31f235852f88edc1558d428d890663c49eb4514ffec9f3650e7f3c9e4a12e36f" + ], + "markers": "python_version < '3.3'", + "version": "==1.4" + }, + "configparser": { + "hashes": [ + "sha256:5308b47021bc2340965c371f0f058cc6971a04502638d4244225c49d80db273a" + ], + "markers": "python_version == '2.7'", + "version": "==3.5.0" + }, + "enum34": { + "hashes": [ + "sha256:6bd0f6ad48ec2aa117d3d141940d484deccda84d4fcd884f5c3d93c23ecd8c79", + "sha256:644837f692e5f550741432dd3f223bbb9852018674981b1664e5dc339387588a", + "sha256:8ad8c4783bf61ded74527bffb48ed9b54166685e4230386a9ed9b1279e2df5b1", + "sha256:2d81cbbe0e73112bdfe6ef8576f2238f2ba27dd0d55752a776c41d38b7da2850" + ], + "markers": "python_version < '3.4'", + "version": "==1.1.6" + }, + "isort": { + "hashes": [ + "sha256:cd5d3fc2c16006b567a17193edf4ed9830d9454cbeb5a42ac80b36ea00c23db4", + "sha256:79f46172d3a4e2e53e7016e663cc7a8b538bec525c36675fcfd2767df30b3983" + ], + "version": "==4.2.15" + }, + "lazy-object-proxy": { + "hashes": [ + "sha256:209615b0fe4624d79e50220ce3310ca1a9445fd8e6d3572a896e7f9146bbf019", + "sha256:1b668120716eb7ee21d8a38815e5eb3bb8211117d9a90b0f8e21722c0758cc39", + "sha256:cb924aa3e4a3fb644d0c463cad5bc2572649a6a3f68a7f8e4fbe44aaa6d77e4c", + "sha256:2c1b21b44ac9beb0fc848d3993924147ba45c4ebc24be19825e57aabbe74a99e", + "sha256:320ffd3de9699d3892048baee45ebfbbf9388a7d65d832d7e580243ade426d2b", + "sha256:2df72ab12046a3496a92476020a1a0abf78b2a7db9ff4dc2036b8dd980203ae6", + "sha256:27ea6fd1c02dcc78172a82fc37fcc0992a94e4cecf53cb6d73f11749825bd98b", + "sha256:e5b9e8f6bda48460b7b143c3821b21b452cb3a835e6bbd5dd33aa0c8d3f5137d", + "sha256:7661d401d60d8bf15bb5da39e4dd72f5d764c5aff5a86ef52a042506e3e970ff", + "sha256:61a6cf00dcb1a7f0c773ed4acc509cb636af2d6337a08f362413c76b2b47a8dd", + "sha256:bd6292f565ca46dee4e737ebcc20742e3b5be2b01556dafe169f6c65d088875f", + "sha256:933947e8b4fbe617a51528b09851685138b49d511af0b6c0da2539115d6d4514", + "sha256:d0fc7a286feac9077ec52a927fc9fe8fe2fabab95426722be4c953c9a8bede92", + "sha256:7f3a2d740291f7f2c111d86a1c4851b70fb000a6c8883a59660d95ad57b9df35", + "sha256:5276db7ff62bb7b52f77f1f51ed58850e315154249aceb42e7f4c611f0f847ff", + "sha256:94223d7f060301b3a8c09c9b3bc3294b56b2188e7d8179c762a1cda72c979252", + "sha256:6ae6c4cb59f199d8827c5a07546b2ab7e85d262acaccaacd49b62f53f7c456f7", + "sha256:f460d1ceb0e4a5dcb2a652db0904224f367c9b3c1470d5a7683c0480e582468b", + "sha256:e81ebf6c5ee9684be8f2c87563880f93eedd56dd2b6146d8a725b50b7e5adb0f", + "sha256:81304b7d8e9c824d058087dcb89144842c8e0dea6d281c031f59f0acf66963d4", + "sha256:ddc34786490a6e4ec0a855d401034cbd1242ef186c20d79d2166d6a4bd449577", + "sha256:7bd527f36a605c914efca5d3d014170b2cb184723e423d26b1fb2fd9108e264d", + "sha256:ab3ca49afcb47058393b0122428358d2fbe0408cf99f1b58b295cfeb4ed39109", + "sha256:7cb54db3535c8686ea12e9535eb087d32421184eacc6939ef15ef50f83a5e7e2", + "sha256:0ce34342b419bd8f018e6666bfef729aec3edf62345a53b537a4dcc115746a33", + "sha256:e34b155e36fa9da7e1b7c738ed7767fc9491a62ec6af70fe9da4a057759edc2d", + "sha256:50e3b9a464d5d08cc5227413db0d1c4707b6172e4d4d915c1c70e4de0bbff1f5", + "sha256:27bf62cb2b1a2068d443ff7097ee33393f8483b570b475db8ebf7e1cba64f088", + "sha256:eb91be369f945f10d3a49f5f9be8b3d0b93a4c2be8f8a5b83b0571b8123e0a7a" + ], + "version": "==1.3.1" + }, + "mccabe": { + "hashes": [ + "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42", + "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f" + ], + "version": "==0.6.1" + }, + "pluggy": { + "hashes": [ + "sha256:bd60171dbb250fdebafad46ed16d97065369da40568ae948ef7117eee8536e94" + ], + "version": "==0.5.2" + }, + "py": { + "hashes": [ + "sha256:2ccb79b01769d99115aa600d7eed99f524bf752bba8f041dc1c184853514655a", + "sha256:0f2d585d22050e90c7d293b6451c83db097df77871974d90efd5a30dc12fcde3" + ], + "version": "==1.4.34" + }, + "pylint": { + "hashes": [ + "sha256:948679535a28afc54afb9210dabc6973305409042ece8e5768ca1409910c1ed8", + "sha256:1f65b3815c3bf7524b845711d54c4242e4057dd93826586620239ecdfe591fb1" + ], + "version": "==1.7.4" + }, + "singledispatch": { + "hashes": [ + "sha256:833b46966687b3de7f438c761ac475213e53b306740f1abfaa86e1d1aae56aa8", + "sha256:5b06af87df13818d14f08a028e42f566640aef80805c3b50c5056b086e3c2b9c" + ], + "markers": "python_version < '3.4'", + "version": "==3.4.0.3" + }, + "six": { + "hashes": [ + "sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb", + "sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9" + ], + "version": "==1.11.0" + }, + "tox": { + "hashes": [ + "sha256:8af30fd835a11f3ff8e95176ccba5a4e60779df4d96a9dfefa1a1704af263225", + "sha256:752f5ec561c6c08c5ecb167d3b20f4f4ffc158c0ab78855701a75f5cef05f4b8" + ], + "version": "==2.9.1" + }, + "virtualenv": { + "hashes": [ + "sha256:39d88b533b422825d644087a21e78c45cf5af0ef7a99a1fc9fbb7b481e5c85b0", + "sha256:02f8102c2436bb03b3ee6dede1919d1dac8a427541652e5ec95171ec8adbc93a" + ], + "markers": "python_version != '3.2'", + "version": "==15.1.0" + }, + "wrapt": { + "hashes": [ + "sha256:d4d560d479f2c21e1b5443bbd15fe7ec4b37fe7e53d335d3b9b0a7b1226fe3c6" + ], + "version": "==1.10.11" + } + } +} diff --git a/scrapy_deltafetch/middleware.py b/scrapy_deltafetch/middleware.py index ded0843..79ba4e1 100644 --- a/scrapy_deltafetch/middleware.py +++ b/scrapy_deltafetch/middleware.py @@ -1,3 +1,7 @@ +#-*- coding: utf-8 -*- + +"""Scrapy Delta Fetch""" + import logging import os import time @@ -13,7 +17,6 @@ logger = logging.getLogger(__name__) - class DeltaFetch(object): """ This is a spider middleware to ignore requests to pages containing items @@ -25,70 +28,83 @@ class DeltaFetch(object): intensive). """ - def __init__(self, dir, reset=False, stats=None): + def __init__(self, directory, reset=False, stats=None): dbmodule = None try: dbmodule = __import__('bsddb3').db except ImportError: raise NotConfigured('bsddb3 is required') self.dbmodule = dbmodule - self.dir = dir + self.database = None + self.directory = directory self.reset = reset self.stats = stats @classmethod def from_crawler(cls, crawler): - s = crawler.settings - if not s.getbool('DELTAFETCH_ENABLED'): + """Load middleware settings and setup signals""" + settings = crawler.settings + if not settings.getbool('DELTAFETCH_ENABLED'): raise NotConfigured - dir = data_path(s.get('DELTAFETCH_DIR', 'deltafetch')) - reset = s.getbool('DELTAFETCH_RESET') - o = cls(dir, reset, crawler.stats) - crawler.signals.connect(o.spider_opened, signal=signals.spider_opened) - crawler.signals.connect(o.spider_closed, signal=signals.spider_closed) - return o + directory = data_path(settings.get('DELTAFETCH_DIR', 'deltafetch')) + reset = settings.getbool('DELTAFETCH_RESET') + middleware = cls(directory, reset, crawler.stats) + crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened) + crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed) + return middleware def spider_opened(self, spider): - if not os.path.exists(self.dir): - os.makedirs(self.dir) - dbpath = os.path.join(self.dir, '%s.db' % spider.name) + """Create database if it doesn't exist and open the handle""" + if not os.path.exists(self.directory): + os.makedirs(self.directory) + dbpath = os.path.join(self.directory, '%s.db' % spider.name) reset = self.reset or getattr(spider, 'deltafetch_reset', False) flag = self.dbmodule.DB_TRUNCATE if reset else self.dbmodule.DB_CREATE try: - self.db = self.dbmodule.DB() - self.db.open(filename=dbpath, - dbtype=self.dbmodule.DB_HASH, - flags=flag) - except Exception: - logger.warning("Failed to open DeltaFetch database at %s, " - "trying to recreate it" % dbpath) + self.database = self.dbmodule.DB() + self.database.open( + filename=dbpath, + dbtype=self.dbmodule.DB_HASH, + flags=flag + ) + except self.dbmodule.DBError: + logger.warning( + "Failed to open DeltaFetch database at %s, trying to recreate it", + dbpath + ) if os.path.exists(dbpath): os.remove(dbpath) - self.db = self.dbmodule.DB() - self.db.open(filename=dbpath, - dbtype=self.dbmodule.DB_HASH, - flags=self.dbmodule.DB_CREATE) + self.database = self.dbmodule.DB() + self.database.open( + filename=dbpath, + dbtype=self.dbmodule.DB_HASH, + flags=self.dbmodule.DB_CREATE + ) def spider_closed(self, spider): - self.db.close() + """Close the database handle""" + if self.database: + self.database.close() def process_spider_output(self, response, result, spider): - for r in result: - if isinstance(r, Request): - key = self._get_key(r) - if key in self.db: - logger.info("Ignoring already visited: %s" % r) + """Retrieve key, lookup database and skip request if key exists""" + for each in result: + if isinstance(each, Request): + key = self._get_key(each) + if key in self.database: + logger.info("Ignoring already visited: %s", each) if self.stats: self.stats.inc_value('deltafetch/skipped', spider=spider) continue - elif isinstance(r, (BaseItem, dict)): + elif isinstance(each, (BaseItem, dict)): key = self._get_key(response.request) - self.db[key] = str(time.time()) + self.database[key] = str(time.time()) if self.stats: self.stats.inc_value('deltafetch/stored', spider=spider) - yield r + yield each - def _get_key(self, request): + @staticmethod + def _get_key(request): + """Retrieve key to use for database lookup""" key = request.meta.get('deltafetch_key') or request_fingerprint(request) - # request_fingerprint() returns `hashlib.sha1().hexdigest()`, is a string return to_bytes(key) diff --git a/tests/test_deltafetch.py b/tests/test_deltafetch.py index d3c3289..f0641b7 100644 --- a/tests/test_deltafetch.py +++ b/tests/test_deltafetch.py @@ -43,7 +43,7 @@ def test_init(self): # path format is any, the folder is not created instance = self.mwcls('/any/dir', True, stats=self.stats) assert isinstance(instance, self.mwcls) - self.assertEqual(instance.dir, '/any/dir') + self.assertEqual(instance.directory, '/any/dir') self.assertEqual(self.stats.get_stats(), {}) self.assertEqual(instance.reset, True) @@ -62,7 +62,7 @@ def test_init_from_crawler(self): instance = self.mwcls.from_crawler(crawler) assert isinstance(instance, self.mwcls) self.assertEqual( - instance.dir, os.path.join(self.temp_dir, 'deltafetch')) + instance.directory, os.path.join(self.temp_dir, 'deltafetch')) self.assertEqual(instance.reset, False) # project_data_dir mock with advanced settings @@ -72,7 +72,7 @@ def test_init_from_crawler(self): instance = self.mwcls.from_crawler(crawler) assert isinstance(instance, self.mwcls) self.assertEqual( - instance.dir, os.path.join(self.temp_dir, 'other')) + instance.directory, os.path.join(self.temp_dir, 'other')) self.assertEqual(instance.reset, True) def test_spider_opened_new(self): @@ -80,28 +80,28 @@ def test_spider_opened_new(self): if os.path.exists(self.db_path): os.remove(self.db_path) mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats) - assert not hasattr(self.mwcls, 'db') + assert not hasattr(self.mwcls, 'database') mw.spider_opened(self.spider) assert os.path.isdir(self.temp_dir) assert os.path.exists(self.db_path) - assert hasattr(mw, 'db') - assert isinstance(mw.db, type(dbmodule.db.DB())) - assert mw.db.items() == [] - assert mw.db.get_type() == dbmodule.db.DB_HASH - assert mw.db.get_open_flags() == dbmodule.db.DB_CREATE + assert hasattr(mw, 'database') + assert isinstance(mw.database, type(dbmodule.db.DB())) + assert mw.database.items() == [] + assert mw.database.get_type() == dbmodule.db.DB_HASH + assert mw.database.get_open_flags() == dbmodule.db.DB_CREATE def test_spider_opened_existing(self): """Middleware should open and use existing and valid .db files.""" self._create_test_db() mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats) - assert not hasattr(self.mwcls, 'db') + assert not hasattr(self.mwcls, 'database') mw.spider_opened(self.spider) - assert hasattr(mw, 'db') - assert isinstance(mw.db, type(dbmodule.db.DB())) - assert mw.db.items() == [(b'test_key_1', b'test_v_1'), + assert hasattr(mw, 'database') + assert isinstance(mw.database, type(dbmodule.db.DB())) + assert mw.database.items() == [(b'test_key_1', b'test_v_1'), (b'test_key_2', b'test_v_2')] - assert mw.db.get_type() == dbmodule.db.DB_HASH - assert mw.db.get_open_flags() == dbmodule.db.DB_CREATE + assert mw.database.get_type() == dbmodule.db.DB_HASH + assert mw.database.get_open_flags() == dbmodule.db.DB_CREATE def test_spider_opened_corrupt_dbfile(self): """Middleware should create a new .db if it cannot open it.""" @@ -109,58 +109,57 @@ def test_spider_opened_corrupt_dbfile(self): with open(self.db_path, "wb") as dbfile: dbfile.write(b'bad') mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats) - assert not hasattr(self.mwcls, 'db') + assert not hasattr(self.mwcls, 'database') # file corruption is only detected when opening spider mw.spider_opened(self.spider) assert os.path.isdir(self.temp_dir) assert os.path.exists(self.db_path) - assert hasattr(mw, 'db') - assert isinstance(mw.db, type(dbmodule.db.DB())) + assert hasattr(mw, 'database') + assert isinstance(mw.database, type(dbmodule.db.DB())) # and db should be empty (it was re-created) - assert mw.db.items() == [] - assert mw.db.get_type() == dbmodule.db.DB_HASH - assert mw.db.get_open_flags() == dbmodule.db.DB_CREATE + assert mw.database.items() == [] + assert mw.database.get_type() == dbmodule.db.DB_HASH + assert mw.database.get_open_flags() == dbmodule.db.DB_CREATE def test_spider_opened_existing_spider_reset(self): self._create_test_db() mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats) - assert not hasattr(self.mwcls, 'db') + assert not hasattr(self.mwcls, 'database') self.spider.deltafetch_reset = True mw.spider_opened(self.spider) - assert mw.db.get_open_flags() == dbmodule.db.DB_TRUNCATE + assert mw.database.get_open_flags() == dbmodule.db.DB_TRUNCATE def test_spider_opened_reset_non_existing_db(self): mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats) - assert not hasattr(self.mwcls, 'db') + assert not hasattr(self.mwcls, 'database') self.spider.deltafetch_reset = True mw.spider_opened(self.spider) - assert mw.db.fd() + assert mw.database.fd() # there's different logic for different bdb versions: # it can fail when opening a non-existing db with truncate flag, # then it should be caught and retried with rm & create flag - assert (mw.db.get_open_flags() == dbmodule.db.DB_CREATE or - mw.db.get_open_flags() == dbmodule.db.DB_TRUNCATE) + assert (mw.database.get_open_flags() == dbmodule.db.DB_CREATE or + mw.database.get_open_flags() == dbmodule.db.DB_TRUNCATE) def test_spider_opened_recreate(self): self._create_test_db() mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats) - assert not hasattr(self.mwcls, 'db') + assert not hasattr(self.mwcls, 'database') mw.spider_opened(self.spider) - assert hasattr(mw, 'db') - assert isinstance(mw.db, type(dbmodule.db.DB())) - assert mw.db.items() == [] - assert mw.db.get_type() == dbmodule.db.DB_HASH - assert mw.db.get_open_flags() == dbmodule.db.DB_TRUNCATE + assert hasattr(mw, 'database') + assert isinstance(mw.database, type(dbmodule.db.DB())) + assert mw.database.items() == [] + assert mw.database.get_type() == dbmodule.db.DB_HASH + assert mw.database.get_open_flags() == dbmodule.db.DB_TRUNCATE def test_spider_closed(self): self._create_test_db() mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats) mw.spider_opened(self.spider) - assert mw.db.fd() + assert mw.database.fd() mw.spider_closed(self.spider) - self.assertRaises(dbmodule.db.DBError, mw.db.fd) def test_process_spider_output(self): self._create_test_db() @@ -187,7 +186,7 @@ def test_process_spider_output(self): self.assertEqual(self.stats.get_stats(), {'deltafetch/skipped': 1}) # b'key' should not be in the db yet as no item was collected yet - self.assertEqual(set(mw.db.keys()), + self.assertEqual(set(mw.database.keys()), set([b'test_key_1', b'test_key_2'])) @@ -195,11 +194,11 @@ def test_process_spider_output(self): result = [BaseItem(), "not a base item"] self.assertEqual(list(mw.process_spider_output( response, result, self.spider)), result) - self.assertEqual(set(mw.db.keys()), + self.assertEqual(set(mw.database.keys()), set([b'key', b'test_key_1', b'test_key_2'])) - assert mw.db[b'key'] + assert mw.database[b'key'] def test_process_spider_output_dict(self): self._create_test_db() @@ -211,11 +210,11 @@ def test_process_spider_output_dict(self): result = [{"somekey": "somevalue"}] self.assertEqual(list(mw.process_spider_output( response, result, self.spider)), result) - self.assertEqual(set(mw.db.keys()), + self.assertEqual(set(mw.database.keys()), set([b'key', b'test_key_1', b'test_key_2'])) - assert mw.db[b'key'] + assert mw.database[b'key'] def test_process_spider_output_stats(self): self._create_test_db() @@ -244,8 +243,8 @@ def test_init_from_crawler_legacy(self): # test with subclass not handling passed stats class LegacyDeltaFetchSubClass(self.mwcls): - def __init__(self, dir, reset=False, *args, **kwargs): - super(LegacyDeltaFetchSubClass, self).__init__(dir=dir, reset=reset) + def __init__(self, directory, reset=False, *args, **kwargs): + super(LegacyDeltaFetchSubClass, self).__init__(directory=directory, reset=reset) self.something = True crawler = mock.Mock() @@ -263,7 +262,7 @@ def __init__(self, dir, reset=False, *args, **kwargs): instance = LegacyDeltaFetchSubClass.from_crawler(crawler) assert isinstance(instance, self.mwcls) self.assertEqual( - instance.dir, os.path.join(self.temp_dir, 'deltafetch')) + instance.directory, os.path.join(self.temp_dir, 'deltafetch')) self.assertEqual(instance.reset, False) # project_data_dir mock with advanced settings @@ -273,7 +272,7 @@ def __init__(self, dir, reset=False, *args, **kwargs): instance = LegacyDeltaFetchSubClass.from_crawler(crawler) assert isinstance(instance, self.mwcls) self.assertEqual( - instance.dir, os.path.join(self.temp_dir, 'other')) + instance.directory, os.path.join(self.temp_dir, 'other')) self.assertEqual(instance.reset, True) def test_process_spider_output_stats_legacy(self): @@ -281,8 +280,8 @@ def test_process_spider_output_stats_legacy(self): # (i.e. that trying to update stats does not trigger exception) class LegacyDeltaFetchSubClass(self.mwcls): - def __init__(self, dir, reset=False, *args, **kwargs): - super(LegacyDeltaFetchSubClass, self).__init__(dir=dir, reset=reset) + def __init__(self, directory, reset=False, *args, **kwargs): + super(LegacyDeltaFetchSubClass, self).__init__(directory=directory, reset=reset) self.something = True self._create_test_db()