Skip to content

Commit 5704bde

Browse files
committed
Use urllib.parse (urlparse/urlencode) to build and parse search URLs
1 parent 56ec989 commit 5704bde

File tree

14 files changed

+129
-44
lines changed

14 files changed

+129
-44
lines changed

search_engine_parser/core/base.py

Lines changed: 21 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,15 @@
22
Base class inherited by every search engine
33
"""
44

5-
from abc import ABCMeta, abstractmethod
6-
import random
75
import asyncio
6+
import random
7+
from abc import ABCMeta, abstractmethod
88
from enum import Enum, unique
9-
import aiohttp
9+
from urllib.parse import urlencode, urlparse
10+
1011
from bs4 import BeautifulSoup
1112

13+
import aiohttp
1214
from search_engine_parser.core.exceptions import NoResultsOrTrafficError
1315

1416

@@ -34,9 +36,11 @@ class BaseSearch:
3436
name = None
3537
# Search Engine unformatted URL
3638
search_url = None
39+
# The url after all query params have been set
40+
_parsed_url = None
3741

3842
@abstractmethod
39-
def parse_soup(self, soup, **kwargs):
43+
def parse_soup(self, soup):
4044
"""
4145
Defines the results contained in a soup
4246
"""
@@ -75,16 +79,9 @@ def parse_result(self, results, **kwargs):
7579
pass
7680
return search_results
7781

78-
@staticmethod
79-
def parse_query(query):
80-
"""
81-
Replace spaces in query
82-
83-
:param query: query to be processed
84-
:type query: str
85-
:rtype: str
86-
"""
87-
return query.replace(" ", "%20").replace(":", "%3A")
82+
def get_params(self, query=None, page=None, offset=None, **kwargs):
83+
""" This function should be overwritten to return a dictionary of query params"""
84+
return {'q': query, 'page': page}
8885

8986
@staticmethod
9087
async def get_source(url):
@@ -134,20 +131,19 @@ async def get_soup(self, url):
134131
def get_search_url(self, query=None, page=None, **kwargs):
135132
"""
136133
Return a formatted search url
137-
"""
138-
# Some URLs use offsets
139-
offset = (page * 10) - 9
140-
141-
return self.search_url.format(
142-
query=query,
143-
page=page,
144-
offset=offset,
145-
)
134+
"""
135+
if not self._parsed_url:
136+
# Some URLs use offsets
137+
offset = (page * 10) - 9
138+
params = self.get_params(query=query, page=page, offset=offset, **kwargs)
139+
url = self.search_url + urlencode(params)
140+
self._parsed_url = urlparse(url)
141+
return self._parsed_url.geturl()
146142

147143
def get_results(self, soup, **kwargs):
148144
""" Get results from soup"""
149145

150-
results = self.parse_soup(soup, **kwargs)
146+
results = self.parse_soup(soup)
151147
# TODO Check if empty results is caused by traffic or answers to query
152148
# were not found
153149
if not results:
@@ -167,7 +163,6 @@ def search(self, query=None, page=None, **kwargs):
167163
:type page: int
168164
:return: dictionary. Containing titles, links, netlocs and descriptions.
169165
"""
170-
parsed_query = self.parse_query(query)
171166
# Get search Page Results
172167
loop = asyncio.get_event_loop()
173168
soup = loop.run_until_complete(
@@ -191,6 +186,5 @@ async def async_search(self, query=None, page=None, callback=None, **kwargs):
191186
# TODO callback should be called
192187
if callback:
193188
pass
194-
parsed_query = self.parse_query(query)
195-
soup = await self.get_soup(self.get_search_url(parsed_query, page))
189+
soup = await self.get_soup(self.get_search_url(query, page, **kwargs))
196190
return self.get_results(soup, **kwargs)

search_engine_parser/core/engines/aol.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ class AolSearch(BaseSearch):
99
Searches Aol for string
1010
"""
1111
name = "AOL"
12-
search_url = "https://search.aol.com/aol/search?q={query}&page={page}"
12+
search_url = "https://search.aol.com/aol/search?"
1313
summary = "\t According to netmarketshare, the old time famous AOL is still in the top 10 "\
1414
"search engines with a market share that is close to 0.06%. "\
1515
"The AOL network includes many popular web sites like engadget.com, techchrunch.com and "\

search_engine_parser/core/engines/ask.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,23 @@ class AskSearch(BaseSearch):
1010
"""
1111
name = "Ask"
1212

13-
search_url = "https://www.ask.com/web?o=0&l=dir&qo=pagination&q={query}&qsrc=998&page={page}"
13+
search_url = "https://www.ask.com/web?"
1414

1515
summary = "\t Formerly known as Ask Jeeves, Ask.com receives approximately 0.42% of the search"\
1616
" share. ASK is based on a question/answer format where most questions are answered by "\
1717
"other users or are in the form of polls.\nIt also has the general search functionality "\
1818
"but the results returned lack quality compared to Google or even Bing and Yahoo."
1919

20+
def get_params(self, query=None, page=None, offset=None, **kwargs):
21+
params = {}
22+
params["o"] = 0
23+
params["l"] = "dir"
24+
params["qo"] = "pagination"
25+
params["q"] = query
26+
params["qsrc"] = 998
27+
params["page"] = page
28+
return params
29+
2030
def parse_soup(self, soup):
2131
"""
2232
Parses Ask Search Soup for results

search_engine_parser/core/engines/baidu.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
"""
44

55
import re
6+
67
from search_engine_parser.core.base import BaseSearch, ReturnType
78

89

@@ -11,7 +12,7 @@ class BaiduSearch(BaseSearch):
1112
Searches Baidu for string
1213
"""
1314
name = "Baidu"
14-
search_url = "https://www.baidu.com/s?wd={query}&pn={offset}&oq={query}"
15+
search_url = "https://www.baidu.com/s?"
1516
summary = "\tBaidu, Inc. is a Chinese multinational technology company specializing in"\
1617
" Internet-related services and products and artificial intelligence (AI), headquartered"\
1718
" in Beijing's Haidian District.\n\tIt is one of the largest AI and internet"\
@@ -20,12 +21,18 @@ class BaiduSearch(BaseSearch):
2021

2122
"""Override get_search_url"""
2223

24+
def get_params(self, query=None, page=None, offset=None, **kwargs):
25+
params = {}
26+
params["wd"] = query
27+
params["pn"] = offset
28+
params["oq"] = query
29+
return params
30+
2331
def get_search_url(self, query=None, page=None):
2432
"""
2533
Return a formatted search url.
2634
Offsets are of form 0,10,20, etc. So if 1 is passed, we make it 0, for 2->(2-1)*10=10. etc.
27-
"""
28-
35+
"""
2936
offset = (page - 1) * 10
3037
return self.search_url.format(query=query, page=page, offset=offset)
3138

search_engine_parser/core/engines/bing.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,21 @@ class BingSearch(BaseSearch):
99
Searches Bing for string
1010
"""
1111
name = "Bing"
12-
search_url = "https://www.bing.com/search?q={query}&count=10&offset=0&first={offset}&FORM=PERE"
12+
search_url = "https://www.bing.com/search?"
1313
summary = "\tBing is Microsoft’s attempt to challenge Google in search, but despite their "\
1414
"efforts they still did not manage to convince users that their search engine can be"\
1515
" an alternative to Google.\n\tTheir search engine market share is constantly below "\
1616
"10%, even though Bing is the default search engine on Windows PCs."
1717

18+
def get_params(self, query=None, page=None, offset=None, **kwargs):
19+
params = {}
20+
params["q"] = query
21+
params["offset"] = 0
22+
params["first"] = offset
23+
params["count"] = 10
24+
params["FORM"] = "PERE"
25+
return params
26+
1827
def parse_soup(self, soup):
1928
"""
2029
Parses Bing for a search query.

search_engine_parser/core/engines/duckduckgo.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
Parser for DuckDuckGo search results
33
"""
44
import re
5+
56
from search_engine_parser.core.base import BaseSearch, ReturnType
67

78

@@ -11,13 +12,23 @@ class DuckDuckGoSearch(BaseSearch):
1112
"""
1213
name = "DuckDuckGo"
1314
base_url = "https://www.duckduckgo.com"
14-
search_url = "https://www.duckduckgo.com/html/?q={query}&s={start}&dc={offset}&v=l&o=json&api=/d.js"
15+
search_url = "https://www.duckduckgo.com/html/?"
1516
summary = "\tHas a number of advantages over the other search engines. \n\tIt has a clean "\
1617
"interface, it does not track users, it is not fully loaded with ads and has a number "\
1718
"of very nice features (only one page of results, you can search directly other web "\
1819
"sites etc).\n\tAccording to DuckDuckGo traffic stats [December, 2018], they are "\
1920
"currently serving more than 30 million searches per day."
2021

22+
def get_params(self, query=None, page=None, offset=None, **kwargs):
23+
params = {}
24+
params["q"] = query
25+
params["s"] = kwargs.get("start", 0)
26+
params["dc"] = offset
27+
params["v"] = "l"
28+
params["o"] = "json"
29+
params["api"] = "/d.js"
30+
return params
31+
2132
def parse_soup(self, soup):
2233
"""
2334
Parses DuckDuckGo Search Soup for a query results

search_engine_parser/core/engines/github.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ class GitHubSearch(BaseSearch):
1111
"""
1212
name = "GitHub"
1313
base_url = "https://github.com"
14-
search_url = base_url + "/search?q={query}&p={page}&type={type_}"
14+
search_url = base_url + "/search?"
1515
summary = "\tGitHub is an American company that provides hosting for software development "\
1616
"version control using Git. It is a subsidiary of Microsoft, which acquired the company "\
1717
"in 2018 for $7.5 billion.\n\tIt offers all of the distributed version control and source"\
@@ -20,7 +20,15 @@ class GitHubSearch(BaseSearch):
2020
" repositories (including at least 28 million public repositories), making it the largest "\
2121
"host of source code in the world."
2222

23-
def parse_soup(self, soup, **kwargs):
23+
def get_params(self, query=None, page=None, offset=None, **kwargs):
24+
params = {}
25+
params["q"] = query
26+
params["p"] = page
27+
params["type"] = kwargs.get("type_", None)
28+
self.type = params["type"]
29+
return params
30+
31+
def parse_soup(self, soup):
2432
"""
2533
Parses GitHub for a search query.
2634
"""
@@ -35,7 +43,6 @@ def parse_soup(self, soup, **kwargs):
3543
"Issues",
3644
"Commits",
3745
"Code")
38-
self.type = kwargs.get("type", None)
3946
if self.type not in allowed_types:
4047
raise IncorrectKeyWord("No type <{type_}> exists".format(type_=self.type))
4148
# find all li tags

search_engine_parser/core/engines/google.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,21 @@ class GoogleSearch(BaseSearch):
1010
Searches Google for string
1111
"""
1212
name = "Google"
13-
search_url = "https://www.google.com/search?client=ubuntu&q={query}&num=10&start={page}"
13+
search_url = "https://www.google.com/search?"
1414
summary = "\tNo need for further introductions. The search engine giant holds the first "\
1515
"place in search with a stunning difference of 65% from second in place Bing.\n"\
1616
"\tAccording to the latest netmarketshare report (November 2018) 73% of searches "\
1717
"were powered by Google and only 7.91% by Bing.\n\tGoogle is also dominating the "\
1818
"mobile/tablet search engine market share with 81%!"
1919

20+
def get_params(self, query=None, offset=None, page=None, **kwargs):
21+
params = {}
22+
params["num"] = 10
23+
params["start"] = page
24+
params["q"] = query
25+
params["client"] = "ubuntu"
26+
return params
27+
2028
def parse_soup(self, soup):
2129
"""
2230
Parses Google Search Soup for results

search_engine_parser/core/engines/googlescholar.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
"""
44

55
import re
6+
67
from search_engine_parser.core.base import BaseSearch, ReturnType
78

89

@@ -11,11 +12,18 @@ class GoogleScholarSearch(BaseSearch):
1112
Searches Google Scholar for string
1213
"""
1314
name = "GoogleScholar"
14-
search_url = "https://scholar.google.gr/scholar?start={page}&q={query}&hl=en"
15+
search_url = "https://scholar.google.gr/scholar?"
1516
summary = "\tGoogle Scholar is a freely accessible web search engine that indexes the full "\
1617
"text or metadata of scholarly literature across an array of publishing formats and "\
1718
"disciplines."
1819

20+
def get_params(self, query=None, offset=None, page=None, **kwargs):
21+
params = {}
22+
params["hl"] = "en"
23+
params["start"] = page
24+
params["q"] = query
25+
return params
26+
1927
def parse_soup(self, soup):
2028
"""
2129
Parses Google Scholar Search Soup for results

search_engine_parser/core/engines/myanimelist.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
"""
44

55
import math
6+
67
from search_engine_parser.core.base import BaseSearch, ReturnType
78

89

@@ -12,7 +13,7 @@ class MyAnimeListSearch(BaseSearch):
1213
"""
1314
name = "MyAnimeList"
1415

15-
search_url = "https://myanimelist.net/anime.php?q={query}&show={offset}"
16+
search_url = "https://myanimelist.net/anime.php?"
1617
summary = "\tMyAnimeList, often abbreviated as MAL, is an anime and manga social"\
1718
"networking and social cataloging application website."\
1819
"\n\tThe site provides its users with a list-like system to organize"\
@@ -21,6 +22,11 @@ class MyAnimeListSearch(BaseSearch):
2122
"site claims to have 4.4 million anime and 775,000 manga entries."\
2223
"\n\tIn 2015, the site received over 120 million visitors a month."
2324

25+
def get_params(self, query=None, offset=None, page=None, **kwargs):
26+
params = {}
27+
params["show"] = offset
28+
params["q"] = query
29+
return params
2430

2531
def get_search_url(self, query=None, page=None):
2632
"""

0 commit comments

Comments
 (0)