|
| 1 | +"""Get parameter cleaner for AS. |
| 2 | +
|
| 3 | +Add removed/kept pattern (regex) with |
| 4 | +
|
| 5 | +QUERYCLEANER_REMOVE |
| 6 | +QUERYCLEANER_KEEP |
| 7 | +
|
| 8 | +Remove patterns have precedence.
| 9 | +""" |
| 10 | +import re |
| 11 | +from six.moves.urllib.parse import quote |
| 12 | +from six import string_types |
| 13 | + |
| 14 | +from scrapy.utils.httpobj import urlparse_cached |
| 15 | +from scrapy.http import Request |
| 16 | +from scrapy.exceptions import NotConfigured |
| 17 | + |
| 18 | +from w3lib.url import _safe_chars |
| 19 | + |
| 20 | +def _parse_query_string(query): |
| 21 | + """Used for replacing cgi.parse_qsl. |
| 22 | + The cgi version returns the same pair for query 'key' |
| 23 | + and query 'key=', so reconstruction |
| 24 | + maps to the same string. But some sites does not handle both versions |
| 25 | + in the same way. |
| 26 | + This version returns (key, None) in the first case, and (key, '') in the |
| 27 | + second one, so correct reconstruction can be performed.""" |
| 28 | + |
| 29 | + params = query.split("&") |
| 30 | + keyvals = [] |
| 31 | + for param in params: |
| 32 | + kv = param.split("=") + [None] |
| 33 | + keyvals.append((kv[0], kv[1])) |
| 34 | + return keyvals |
| 35 | + |
def _filter_query(query, remove_re=None, keep_re=None):
    """
    Filter the parameters of a query string according to key patterns.
    Parameters matching remove_re are dropped; if keep_re is given, only
    parameters matching it survive.  Removal takes precedence.
    >>> _filter_query('as=3&bs=8&cs=9')
    'as=3&bs=8&cs=9'
    >>> _filter_query('as=3&bs=8&cs=9', None, re.compile("as|bs"))
    'as=3&bs=8'
    >>> _filter_query('as=3&bs=8&cs=9', re.compile("as|bs"))
    'cs=9'
    >>> _filter_query('as=3&bs=8&cs=9', re.compile("as|bs"), re.compile("as|cs"))
    'cs=9'
    """
    surviving = []
    for key, value in _parse_query_string(query):
        # Guard clauses: removal wins, then the keep filter applies.
        if remove_re is not None and remove_re.search(key):
            continue
        if keep_re is not None and not keep_re.search(key):
            continue
        piece = quote(key, _safe_chars)
        # A None value means the key had no '=' at all; only re-attach
        # '=' when the original parameter carried one.
        if isinstance(value, string_types):
            piece = piece + "=" + quote(value, _safe_chars)
        surviving.append(piece.replace("%20", "+"))
    return "&".join(surviving)
| 59 | + |
class QueryCleanerMiddleware(object):
    """Spider middleware that rewrites the query string of outgoing
    requests, dropping parameters that match QUERYCLEANER_REMOVE and/or
    keeping only those matching QUERYCLEANER_KEEP."""

    def __init__(self, settings):
        remove_pattern = settings.get("QUERYCLEANER_REMOVE")
        keep_pattern = settings.get("QUERYCLEANER_KEEP")
        # The middleware is pointless without at least one pattern, so
        # bail out of the middleware chain entirely.
        if not remove_pattern and not keep_pattern:
            raise NotConfigured
        self.remove = re.compile(remove_pattern) if remove_pattern else None
        self.keep = re.compile(keep_pattern) if keep_pattern else None

    @classmethod
    def from_crawler(cls, crawler):
        # Standard scrapy hook: build the middleware from crawler settings.
        return cls(crawler.settings)

    def process_spider_output(self, response, result, spider):
        for entry in result:
            if isinstance(entry, Request):
                parsed = urlparse_cached(entry)
                if parsed.query:
                    filtered = _filter_query(parsed.query, self.remove, self.keep)
                    new_url = parsed._replace(query=filtered).geturl()
                    entry = entry.replace(url=new_url)
            yield entry
| 81 | + |
0 commit comments