This repository was archived by the owner on Jul 19, 2018. It is now read-only.
Merged
Add Python3 support
redapple authored and nyov committed Jul 15, 2016
commit f71f2e57ef8a19084d1cbe307f70d7053707cc07
4 changes: 2 additions & 2 deletions requirements.txt
@@ -1,5 +1,5 @@
boto
-hubstorage
+hubstorage>=0.23
python-dateutil
scrapinghub
-Scrapy>=0.22.0
+Scrapy>=1.1
9 changes: 5 additions & 4 deletions scrapylib/constraints/__init__.py
@@ -30,6 +30,7 @@ class Product(Item):

import re
from functools import partial
+from six import string_types, text_type


class RequiredFields(object):
@@ -71,8 +72,8 @@ def __call__(self, item):
assert isinstance(v, self.type), "field %r is not a %s: %r" % \
(f, self.type.__name__, v)

-IsString = partial(IsType, basestring)
-IsUnicode = partial(IsType, unicode)
+IsString = partial(IsType, string_types)
+IsUnicode = partial(IsType, text_type)
IsList = partial(IsType, list)
IsDict = partial(IsType, dict)

@@ -87,7 +88,7 @@ def __call__(self, item):
v = item.get(f)
if v is None:
continue
-assert isinstance(v, basestring), "field %r is not a string: %r" % (f, v)
+assert isinstance(v, string_types), "field %r is not a string: %r" % (f, v)
assert v.strip().isdigit(), "field %r contains non-numeric chars: %r" % (f, v)

class IsPrice(object):
@@ -101,7 +102,7 @@ def __call__(self, item):
for f in self.fields:
v = item.get(f)
if v:
-assert isinstance(v, basestring), "field %r is not a string: %r" % (f, v)
+assert isinstance(v, string_types), "field %r is not a string: %r" % (f, v)
assert self.price_re.search(v), "field %r is not a price: %r" % (f, v)

class MaxLen(object):
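A quick note on the six aliases introduced above, as a minimal sketch (assumes six is installed): six.string_types is a tuple of the native string classes and six.text_type is the unicode text class, so the isinstance() checks behave the same on Python 2 and 3.

    # Minimal sketch of the six aliases relied on in the constraints module.
    import six

    # six.string_types == (basestring,) on Python 2 and (str,) on Python 3,
    # so it can be passed straight to isinstance().
    assert isinstance("plain", six.string_types)

    # six.text_type is unicode on Python 2 and str on Python 3.
    assert isinstance(u"caf\xe9", six.text_type)
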
2 changes: 1 addition & 1 deletion scrapylib/constraints/pipeline.py
@@ -6,6 +6,6 @@ def process_item(self, item, spider):
try:
for c in item.constraints:
c(item)
-except AssertionError, e:
+except AssertionError as e:
raise DropItem(str(e))
return item
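The only change here is the exception syntax; this is standard Python rather than anything scrapylib-specific. The comma form was removed in Python 3, while the "as" form works from Python 2.6 onward. A tiny illustration:

    # "except AssertionError, e:" is a SyntaxError on Python 3; the "as" form
    # below is accepted by Python 2.6+ and Python 3 alike.
    try:
        assert False, "constraint violated"
    except AssertionError as e:
        print(str(e))  # prints: constraint violated
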
6 changes: 5 additions & 1 deletion scrapylib/guid.py
@@ -2,6 +2,10 @@

from scrapy import signals
from scrapy.exceptions import DropItem
+try:
+    from scrapy.utils.python import to_bytes
+except ImportError:
+    from scrapy.utils.python import unicode_to_str as to_bytes


def hash_values(*values):
@@ -16,7 +20,7 @@ def hash_values(*values):
if value is None:
message = "hash_values was passed None at argument index %d" % list(values).index(None)
raise ValueError(message)
-hash.update('%s' % value)
+hash.update(to_bytes('%s' % value))
return hash.hexdigest()


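Python 3 hashlib objects only accept bytes, so the value is encoded before hashing. A minimal sketch of the pattern; the md5 choice below is an assumption for illustration, since the hunk does not show which algorithm hash_values() actually uses:

    # scrapy.utils.python.to_bytes() encodes text to bytes (UTF-8 by default),
    # which is what hashlib requires on Python 3.
    import hashlib
    from scrapy.utils.python import to_bytes

    h = hashlib.md5()  # assumed algorithm, for illustration only
    h.update(to_bytes('%s' % 'some value'))
    print(h.hexdigest())
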
12 changes: 6 additions & 6 deletions scrapylib/magicfields.py
@@ -6,7 +6,7 @@

You can set project global magics with MAGIC_FIELDS, and tune them for a specific spider using MAGIC_FIELDS_OVERRIDE.

-In case there is more than one argument, they must come separated by ','. So, the generic magic format is
+In case there is more than one argument, they must come separated by ','. So, the generic magic format is

$<magic name>[:arg1,arg2,...]

@@ -78,7 +78,7 @@ def _extract_regex_group(regex, txt):
try:
compiled = re.compile(regex)
_REGEXES[regex] = compiled
-except Exception, e:
+except Exception as e:
errmessage = e.message
_REGEX_ERRORS[regex] = errmessage
if errmessage:
@@ -103,7 +103,7 @@ def _format(fmt, spider, response, item, fixed_values):
for m in _ENTITIES_RE.finditer(fmt):
val = None
entity, args, regex = m.groups()
-args = filter(None, (args or ':')[1:].split(','))
+args = list(filter(None, (args or ':')[1:].split(',')))
if entity == "$jobid":
val = os.environ.get('SCRAPY_JOB', '')
elif entity == "$spider":
@@ -143,13 +143,13 @@ def _format(fmt, spider, response, item, fixed_values):
if regex:
try:
out = _extract_regex_group(regex, out)
-except ValueError, e:
+except ValueError as e:
spider.log("Error at '%s': %s" % (m.group(), e.message))

return out

class MagicFieldsMiddleware(object):

@classmethod
def from_crawler(cls, crawler):
mfields = crawler.settings.getdict("MAGIC_FIELDS").copy()
Expand All @@ -170,5 +170,5 @@ def process_spider_output(self, response, result, spider):
if isinstance(_res, BaseItem):
for field, fmt in self.mfields.items():
_res.setdefault(field, _format(fmt, spider, response, _res, self.fixed_values))
-yield _res
+yield _res
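Besides the exception syntax, the notable change in this file is wrapping filter() in list(): on Python 3 filter() returns a lazy iterator rather than a list. A small sketch with a hypothetical argument string:

    # On Python 3 filter() yields a lazy iterator, so the result is materialized
    # with list() before it is indexed or iterated more than once.
    args_str = ':arg1,,arg2'  # hypothetical "$entity:args" argument part
    args = list(filter(None, (args_str or ':')[1:].split(',')))
    print(args)  # ['arg1', 'arg2'] on Python 2 and 3 alike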

9 changes: 5 additions & 4 deletions scrapylib/processors/__init__.py
@@ -2,7 +2,8 @@
import locale as localelib
import re
import time
-from urlparse import urljoin
+from six.moves.urllib.parse import urljoin


from scrapy.loader.processors import MapCompose, TakeFirst
from scrapy.utils.markup import (remove_tags, replace_escape_chars,
@@ -57,8 +58,8 @@ def to_datetime(value, format, locale=None):
current date.
"""
if locale:
-old_locale = localelib.getlocale(localelib.LC_ALL)
-localelib.setlocale(localelib.LC_ALL, locale)
+old_locale = localelib.getlocale(localelib.LC_TIME)
+localelib.setlocale(localelib.LC_TIME, locale)

time_s = time.strptime(value, format)
dt = datetime.datetime(*time_s[0:5])
@@ -67,7 +68,7 @@ def to_datetime(value, format, locale=None):
dt = dt.replace(year=datetime.datetime.utcnow().year)

if locale:
-localelib.setlocale(localelib.LC_ALL, old_locale)
+localelib.setlocale(localelib.LC_TIME, old_locale)

return dt

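The locale handling now saves and restores only LC_TIME, the one category that affects strptime()'s localized month and day names (locale.getlocale() does not accept LC_ALL). A sketch of the round-trip, assuming the fr_FR.UTF-8 locale is installed on the machine:

    # Save/set/restore dance around strptime(); fr_FR.UTF-8 is an assumed,
    # system-dependent locale name used only for illustration.
    import locale
    import time

    old_locale = locale.getlocale(locale.LC_TIME)    # remember current time locale
    locale.setlocale(locale.LC_TIME, 'fr_FR.UTF-8')  # switch month/day names to French
    parsed = time.strptime('11 janvier 2011', '%d %B %Y')
    locale.setlocale(locale.LC_TIME, old_locale)     # put the previous locale back
    print(parsed.tm_year, parsed.tm_mon, parsed.tm_mday)  # 2011 1 11
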
8 changes: 5 additions & 3 deletions scrapylib/proxy.py
@@ -1,7 +1,9 @@
import base64
-from urllib import unquote
-from urllib2 import _parse_proxy
-from urlparse import urlunparse
+from six.moves.urllib.parse import unquote, urlunparse
+try:
+    from urllib2 import _parse_proxy
+except ImportError:
+    from urllib.request import _parse_proxy


class SelectiveProxyMiddleware(object):
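_parse_proxy is a private stdlib helper that moved from urllib2 to urllib.request in Python 3, hence the try/except import. A sketch of what the fallback gives you, with a hypothetical proxy URL:

    # _parse_proxy splits a proxy URL into (scheme, user, password, host:port);
    # it lives in urllib2 on Python 2 and in urllib.request on Python 3.
    try:
        from urllib2 import _parse_proxy          # Python 2
    except ImportError:
        from urllib.request import _parse_proxy   # Python 3

    print(_parse_proxy('http://user:secret@proxy.example.com:8080'))
    # ('http', 'user', 'secret', 'proxy.example.com:8080')
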
5 changes: 3 additions & 2 deletions scrapylib/querycleaner.py
@@ -8,7 +8,8 @@
Remove patterns has precedence.
"""
import re
-from urllib import quote
+from six.moves.urllib.parse import quote
+from six import string_types

from scrapy.utils.httpobj import urlparse_cached
from scrapy.http import Request
@@ -51,7 +52,7 @@ def _filter_query(query, remove_re=None, keep_re=None):
continue
if keep_re is None or keep_re.search(k):
qarg = quote(k, _safe_chars)
-if isinstance(v, basestring):
+if isinstance(v, string_types):
qarg = qarg + '=' + quote(v, _safe_chars)
qargs.append(qarg.replace("%20", "+"))
return '&'.join(qargs)
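As in the other modules, the urllib imports go through six.moves, which points a single import path at whichever stdlib location exists on the running interpreter. A minimal sketch:

    # six.moves.urllib.parse resolves to urllib/urlparse on Python 2 and to
    # urllib.parse on Python 3, so one import line covers both.
    from six.moves.urllib.parse import quote, urljoin

    print(quote('a b/c'))                          # a%20b/c
    print(urljoin('http://example.com/x/', 'y'))   # http://example.com/x/y
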
6 changes: 4 additions & 2 deletions tests/test_constraints.py
@@ -1,4 +1,5 @@
import unittest
+import six

from scrapylib.constraints import RequiredFields, NonEmptyFields, IsType, IsNumber, IsPrice, MaxLen, MinLen

@@ -37,12 +38,13 @@ def setUp(self):
self.item = {'str': 'bar', 'list': ['one']}

def test_ok(self):
-IsType(basestring, 'str')(self.item)
+IsType(six.string_types, 'str')(self.item)
IsType(list, 'list')(self.item)
IsType(list, 'missing')(self.item)

def test_fail(self):
-self.assertRaises(AssertionError, IsType(basestring, 'list'), self.item)
+for t in six.string_types:
+    self.assertRaises(AssertionError, IsType(t, 'list'), self.item)
self.assertRaises(AssertionError, IsType(list, 'str'), self.item)


5 changes: 3 additions & 2 deletions tests/test_crawlera.py
@@ -5,6 +5,7 @@
from scrapy.spider import Spider
from scrapy.utils.test import get_crawler
from twisted.internet.error import ConnectionRefusedError
+from six.moves import xrange

from scrapylib.crawlera import CrawleraMiddleware
import os
@@ -185,7 +186,7 @@ def get_proxyauth(self, spider):
wascalled[:] = [] # reset
enabled = True
self.spider.crawlera_enabled = False
-proxyauth = 'Basic Foo'
+proxyauth = b'Basic Foo'
self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth)
self.assertEqual(wascalled, ['is_enabled', 'get_proxyauth'])

@@ -271,4 +272,4 @@ def test_jobid_header(self):
mw1.open_spider(self.spider)
req1 = Request('http://www.scrapytest.org')
self.assertEqual(mw1.process_request(req1, self.spider), None)
-self.assertEqual(req1.headers.get('X-Crawlera-Jobid'), '2816')
+self.assertEqual(req1.headers.get('X-Crawlera-Jobid'), b'2816')
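The expected header and proxy-auth values become bytes literals because Scrapy's Headers normalize values to bytes, so lookups return bytes on Python 3 (on Python 2, b'...' and '...' are the same type, so the assertions keep passing there). A short sketch:

    # Scrapy request/response headers are stored as bytes, so the lookup below
    # returns a bytes value.
    from scrapy.http import Request

    req = Request('http://www.scrapytest.org',
                  headers={'X-Crawlera-Jobid': '2816'})
    print(req.headers.get('X-Crawlera-Jobid'))  # b'2816'
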
4 changes: 2 additions & 2 deletions tests/test_deltafetch.py
@@ -261,6 +261,6 @@ def _create_test_db(self):
# truncate test db if there were failed tests
db.open(self.db_path, dbmodule.db.DB_HASH,
dbmodule.db.DB_CREATE | dbmodule.db.DB_TRUNCATE)
-db['test_key_1'] = 'test_v_1'
-db['test_key_2'] = 'test_v_2'
+db.put('test_key_1', 'test_v_1')
+db.put('test_key_2', 'test_v_2')
db.close()
3 changes: 2 additions & 1 deletion tests/test_hubproxy.py
@@ -1,4 +1,5 @@
from unittest import TestCase
+from six.moves import xrange

from w3lib.http import basic_auth_header
from scrapy.http import Request, Response
@@ -161,6 +162,6 @@ def get_proxyauth(self, spider):
wascalled[:] = [] # reset
enabled = True
self.spider.use_hubproxy = False
-proxyauth = 'Basic Foo'
+proxyauth = b'Basic Foo'
self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth)
self.assertEqual(wascalled, ['is_enabled', 'get_proxyauth'])
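The xrange import in these tests comes from six.moves, so they keep a lazy range on both interpreters:

    # six.moves.xrange is xrange on Python 2 and the built-in range on Python 3.
    from six.moves import xrange

    print(list(xrange(3)))  # [0, 1, 2] on either version
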
4 changes: 2 additions & 2 deletions tests/test_magicfields.py
@@ -26,10 +26,10 @@ def setUp(self):
self.spider = Spider('myspider', arg1='val1', start_urls = ["http://example.com"])

def _log(x):
-print x
+print(x)

self.spider.log = _log
-self.response = HtmlResponse(body="<html></html>", url="http://www.example.com/product/8798732")
+self.response = HtmlResponse(body=b"<html></html>", url="http://www.example.com/product/8798732")
self.item = TestItem({'nom': 'myitem', 'prix': "56.70 euros", "url": "http://www.example.com/product.html?item_no=345"})

def tearDown(self):
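Response bodies in Scrapy are bytes, and from Scrapy 1.1 on a text body is rejected unless an encoding is supplied, so the test fixtures pass bytes literals directly. A minimal sketch:

    # Scrapy stores response bodies as bytes; passing bytes avoids having to
    # declare an encoding for a text body on Scrapy 1.1+.
    from scrapy.http import HtmlResponse

    resp = HtmlResponse(url="http://www.example.com", body=b"<html></html>")
    print(resp.body)  # b'<html></html>'
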
8 changes: 4 additions & 4 deletions tests/test_processors.py
@@ -21,14 +21,14 @@ def test_to_datetime(self):
datetime.datetime(2011, 3, 4))

def test_localized_to_datetime(self):
-current_locale = locale.getlocale(locale.LC_ALL)
+current_locale = locale.getlocale(locale.LC_TIME)

self.assertEquals(
to_datetime('11 janvier 2011', '%d %B %Y', locale='fr_FR.UTF-8'),
datetime.datetime(2011, 1, 11)
)

-self.assertEquals(current_locale, locale.getlocale(locale.LC_ALL))
+self.assertEquals(current_locale, locale.getlocale(locale.LC_TIME))

def test_to_date(self):
self.assertEquals(to_date('March 4, 2011', '%B %d, %Y'),
@@ -39,14 +39,14 @@ def test_to_date(self):
self.assertEquals(test_date.year, datetime.datetime.utcnow().year)

def test_localized_to_date(self):
-current_locale = locale.getlocale(locale.LC_ALL)
+current_locale = locale.getlocale(locale.LC_TIME)

self.assertEquals(
to_date('11 janvier 2011', '%d %B %Y', locale='fr_FR.UTF-8'),
datetime.date(2011, 1, 11)
)

-self.assertEquals(current_locale, locale.getlocale(locale.LC_ALL))
+self.assertEquals(current_locale, locale.getlocale(locale.LC_TIME))

def test_default_input_processor(self):
self.assertEquals(default_input_processor(
2 changes: 1 addition & 1 deletion tests/test_splitvariants.py
@@ -27,7 +27,7 @@ class SplitVariantsTest(TestCase):
def setUp(self):
self.spider = Spider('myspider',
start_urls=["http://example.com"])
-self.response = HtmlResponse(body="<html></html>",
+self.response = HtmlResponse(body=b"<html></html>",
url="http://www.example.com")

def test_variants_splitted(self):