
Commit 7ee8e55

Added engine localization. Closes #88
1 parent 4ee25bc commit 7ee8e55

17 files changed: +48 -97 lines changed


README.md

Lines changed: 11 additions & 1 deletion
@@ -112,6 +112,12 @@ Query Results can be scraped from popular search engines as shown in the example
 print(bresults["descriptions"][5])
 ```
 
+For localization, you can pass the `url` keyword with a localized url. This uses that url for the query and parses it with the same engine's parser.
+```python
+# Use google.de instead of google.com
+results = gsearch.search(*search_args, url="google.de")
+```
+
 ### Command line
 
 Search engine parser comes with a CLI tool known as `pysearch` e.g
@@ -132,6 +138,8 @@ There is a needed argument for the CLI i.e `-e Engine` followed by either of two
 
 ```bash
 
+usage: pysearch [-h] [-u URL] [-e ENGINE] {search,summary} ...
+
 SearchEngineParser
 
 positional arguments:
@@ -141,9 +149,11 @@ positional arguments:
 
 optional arguments:
   -h, --help            show this help message and exit
+  -u URL, --url URL     A custom link to use as base url for search e.g
+                        google.de
   -e ENGINE, --engine ENGINE
                         Engine to use for parsing the query e.g google, yahoo,
-                        bing, duckduckgo (default: google)
+                        bing,duckduckgo (default: google)
 ```
 
 `summary` just shows the summary of each search engine added with descriptions on the return
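
From the command line, the same localization goes through the new `-u/--url` option of the `search` subcommand. A plausible invocation based on the options shown above (the query text is only an example):

```bash
# Ask the google engine to scrape google.de instead of google.com
pysearch -e google search -u google.de -q "hello world"
```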

search_engine_parser/core/base.py

Lines changed: 13 additions & 4 deletions
@@ -51,7 +51,10 @@ def keys(self):
         with suppress(IndexError):
             x = self.results[0]
             keys = x.keys()
-            return keys
+        return keys
+
+    def __len__(self):
+        return len(self.results)
 
 
 class BaseSearch:
@@ -79,7 +82,7 @@ def parse_soup(self, soup):
         raise NotImplementedError("subclasses must define method <parse_soup>")
 
     @abstractmethod
-    def parse_single_result(self, single_result):
+    def parse_single_result(self, single_result, **kwargs):
         """
         Every div/span containing a result is passed here to retrieve
         `title`, `link` and `descr`
@@ -152,8 +155,14 @@ def get_search_url(self, query=None, page=None, **kwargs):
         offset = (page * 10) - 9
         params = self.get_params(
             query=query, page=page, offset=offset, **kwargs)
-        url = self.search_url + urlencode(params)
-        self._parsed_url = urlparse(url)
+        url = urlparse(self.search_url)
+        # For localization purposes, custom urls can be parsed for the same engine
+        # such as google.de and google.com
+        if kwargs.get("url"):
+            new_url = urlparse(kwargs.pop("url"))
+            url = url._replace(netloc=new_url.netloc)
+        self._parsed_url = url._replace(query=urlencode(params))
+
         return self._parsed_url.geturl()
 
     def get_results(self, soup, **kwargs):
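
The netloc swap in `get_search_url` is the standard `urllib.parse` pattern: parse the engine's base url, replace only the network location, then attach the encoded query string. A minimal standalone sketch with made-up engine and query values, using a scheme-qualified localized url so `urlparse` can pick out the netloc:

```python
from urllib.parse import urlencode, urlparse

# Illustrative values only; not the library's actual defaults
search_url = "https://www.google.com/search?"
params = {"q": "hello world", "start": 0}

url = urlparse(search_url)
# Swap google.com for google.de while keeping scheme, path and params
localized = urlparse("https://www.google.de")
url = url._replace(netloc=localized.netloc)

print(url._replace(query=urlencode(params)).geturl())
# https://www.google.de/search?q=hello+world&start=0
```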

search_engine_parser/core/cli.py

Lines changed: 10 additions & 9 deletions
@@ -19,6 +19,7 @@ def display(results, term, **args):
     def print_one(kwargs):
         """ Print one result to the console """
         # Header
+        print(kwargs)
         if kwargs.get("titles"):
             print("\t{}".format(term.magenta(kwargs.pop("titles"))))
         if kwargs.get("links"):
@@ -41,15 +42,10 @@ def print_one(kwargs):
         # TODO Some more optimization might be need
         len_results = 0
         for i in results:
-            len_results = len(results[i])
-            break
-        for i in range(len_results):
-            result = {k: results[k][i] for k in results}
-            print_one(result)
+            print_one(i)
     else:
         rank = args["rank"]
-        result = {k: results[k][rank] for k in results}
-        print_one(result)
+        print_one(results[rank])
 
 
 
@@ -76,7 +72,7 @@ def main(args): # pylint: disable=too-many-branches
     engine = engine_class()
     try:
         # Display full details: Header, Link, Description
-        results = engine.search(args['query'], args['page'], return_type=ReturnType(args["type"]))
+        results = engine.search(args['query'], args['page'], return_type=ReturnType(args["type"]), url=args.get("url"))
         display(results, term, type=args.get('type'), rank=args.get('rank'))
     except NoResultsOrTrafficError as exc:
         print('\n', '{}'.format(term.red(str(exc))))
@@ -86,7 +82,7 @@ def runner():
     """
     runner that handles parsing logic
     """
-    parser = argparse.ArgumentParser(description='SearchEngineParser')
+    parser = argparse.ArgumentParser(description='SearchEngineParser', prog="pysearch")
     parser.add_argument(
         '-e', '--engine',
         help='Engine to use for parsing the query e.g google, yahoo, bing,'
@@ -97,6 +93,11 @@ def runner():
 
     parser_search = subparsers.add_parser('search', help='search help')
 
+    parser_search.add_argument(
+        '-u',
+        '--url',
+        help='A custom link to use as base url for search e.g google.de')
+
     parser_search.add_argument(
         '-q',
         '--query',
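
The rewritten `display` assumes that iterating the results object yields one dictionary per result and that it can be indexed by rank, which lines up with the `__len__` added to the results container in base.py. A rough sketch of that assumed shape, with hypothetical data:

```python
# Hypothetical stand-in for the results the CLI now iterates over:
# a sequence of per-result dicts keyed like "titles", "links", "descriptions"
results = [
    {"titles": "First title", "links": "https://example.com", "descriptions": "First description"},
    {"titles": "Second title", "links": "https://example.org", "descriptions": "Second description"},
]

for result in results:          # mirrors `for i in results: print_one(i)`
    print(result["titles"])

rank = 1
print(results[rank]["links"])   # mirrors `print_one(results[rank])`
print(len(results))             # mirrors the new __len__ on the results container
```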

search_engine_parser/core/engines/aol.py

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@ def parse_soup(self, soup):
         # find all divs
         return soup.find_all('div', class_='algo-sr')
 
-    def parse_single_result(self, single_result, return_type=ReturnType.FULL):
+    def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs):
         """
         Parses the source code to return
 
search_engine_parser/core/engines/ask.py

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ def parse_soup(self, soup):
         # find all class_='PartialSearchResults-item' => each result
         return soup.find_all('div', class_="PartialSearchResults-item")
 
-    def parse_single_result(self, single_result, return_type=ReturnType.FULL):
+    def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs):
         """
         Parses the source code to return
 
search_engine_parser/core/engines/baidu.py

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@ def parse_soup(self, soup):
 
         return soup.find_all('div', {'id': re.compile(r"^\d{1,2}")})
 
-    def parse_single_result(self, single_result, return_type=ReturnType.FULL):
+    def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs):
         """
         Parses the source code to return
 
search_engine_parser/core/engines/bing.py

Lines changed: 1 addition & 1 deletion
@@ -31,7 +31,7 @@ def parse_soup(self, soup):
         # find all li tags
         return soup.find_all('li', class_='b_algo')
 
-    def parse_single_result(self, single_result, return_type=ReturnType.FULL):
+    def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs):
         """
         Parses the source code to return
 
search_engine_parser/core/engines/duckduckgo.py

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@ def parse_soup(self, soup):
         # find all div tags
         return soup.find_all('div', class_='result')
 
-    def parse_single_result(self, single_result, return_type=ReturnType.FULL):
+    def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs):
         """
         Parses the source code to return
 
search_engine_parser/core/engines/github.py

Lines changed: 1 addition & 1 deletion
@@ -61,7 +61,7 @@ def parse_soup(self, soup):
         elif self.type == "Commits":
             return soup.find_all('div', class_='commits-list-item')
 
-    def parse_single_result(self, single_result, return_type=ReturnType.FULL):
+    def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs):
         """
         Parses the source code to return
 
search_engine_parser/core/engines/google.py

Lines changed: 1 addition & 70 deletions
@@ -32,76 +32,7 @@ def parse_soup(self, soup):
         # find all class_='g' => each result
         return soup.find_all('div', class_='g')
 
-    def parse_result(self, results, **kwargs):
-        """
-        Runs every entry on the page through parse_single_result
-
-        :param results: Result of main search to extract individual results
-        :type results: list[`bs4.element.ResultSet`]
-        :returns: dictionary. Containing lists of titles, links, descriptions, direct results and other possible\
-            returns.
-        :rtype: dict
-        """
-        search_results = dict()
-        for each in results:
-            try:
-                rdict = self.parse_single_result(each, **kwargs)
-                # Create a list for all keys in rdict if not exist, else
-                for key in rdict.keys():
-                    if key not in search_results.keys():
-                        search_results[key] = list([rdict[key]])
-                    else:
-                        search_results[key].append(rdict[key])
-            except Exception: #pylint: disable=invalid-name, broad-except
-                pass
-
-        direct_answer = self.parse_direct_answer(results[0])
-        if direct_answer is not None:
-            search_results['direct_answer'] = direct_answer
-
-        return search_results
-
-    def parse_direct_answer(self, single_result, return_type=ReturnType.FULL):
-        # returns empty string when there is no direct answer
-        if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
-            direct_answer = ''
-            if not single_result.find('span', class_='st'):
-                # example query: President of US
-                if single_result.find('div', class_='Z0LcW'):
-                    direct_answer = single_result.find('div', class_='Z0LcW').find('a').text
-
-                # example query: 5+5
-                elif single_result.find('span', class_='qv3Wpe'):
-                    direct_answer = single_result.find('span', class_='qv3Wpe').text
-
-                # example query: Weather in dallas
-                elif single_result.find('div', id='wob_wc'):
-                    weather_status = single_result.find('span', id='wob_dc').text
-                    temperature = single_result.find('span', id='wob_tm').text
-                    unit = single_result.find('div', class_='wob-unit').find('span', class_='wob_t').text
-                    direct_answer = weather_status + ', ' + temperature + unit
-
-                # example query: 100 euros in pounds
-                elif single_result.find('span', class_='DFlfde SwHCTb'):
-                    direct_answer = single_result.find('span', class_='DFlfde SwHCTb').text + ' ' +single_result.find('span', class_='MWvIVe').text
-
-                # example query: US time
-                elif single_result.find('div', class_="gsrt vk_bk dDoNo"):
-                    direct_answer = single_result.find('div', class_='gsrt vk_bk dDoNo').text
-
-                # Christmas
-                elif single_result.find('div', class_="zCubwf"):
-                    direct_answer = single_result.find('div', class_="zCubwf").text
-
-
-            elif not single_result.find('span', class_='st').text:
-                # example queris: How long shoud a car service take?, fastest animal
-                if single_result.find('div', class_='Z0LcW'):
-                    direct_answer = single_result.find('div', class_='Z0LcW').text
-
-            return direct_answer
-
-    def parse_single_result(self, single_result, return_type=ReturnType.FULL):
+    def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs):
         """
         Parses the source code to return
 
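
Every engine's `parse_single_result` now accepts `**kwargs`, so extra options such as the localization url can flow through without breaking subclasses. A minimal sketch of a conforming engine, assuming `BaseSearch` and `ReturnType` are importable from `search_engine_parser.core.base` as the files above suggest; the selectors and field extraction are illustrative, not any real engine's parser:

```python
from search_engine_parser.core.base import BaseSearch, ReturnType


class ExampleSearch(BaseSearch):
    """An illustrative engine using the updated parse_single_result signature."""

    search_url = "https://www.example.com/search?"

    def parse_soup(self, soup):
        # find all divs that wrap a single result (selector is made up)
        return soup.find_all('div', class_='result')

    def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs):
        # pull title, link and description out of one result block
        rdict = {}
        link_tag = single_result.find('a')
        if return_type == ReturnType.FULL:
            rdict["titles"] = link_tag.text
            rdict["links"] = link_tag.get('href')
        if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
            rdict["descriptions"] = single_result.find('p').text
        return rdict
```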