Skip to content

Commit 7bbaf54

Browse files
committed
Performance Improvement: Return based on request for other engines
1 parent 3bab38d commit 7bbaf54

File tree

11 files changed

+188
-176
lines changed

11 files changed

+188
-176
lines changed
Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""@desc
22
Parser for AOL search results
33
"""
4-
from search_engine_parser.core.base import BaseSearch
4+
from search_engine_parser.core.base import BaseSearch, ReturnType
55

66

77
class AolSearch(BaseSearch):
@@ -22,7 +22,7 @@ def parse_soup(self, soup):
2222
# find all divs
2323
return soup.find_all('div', class_='algo-sr')
2424

25-
def parse_single_result(self, single_result, return_type=ReturnType.FULL):
    """
    Parse a single AOL result into a dict of the requested fields.

    :param single_result: single result found in a div with class 'algo-sr'
    :type single_result: `bs4.element.Tag`
    :param return_type: which fields to extract; FULL returns all of them
    :type return_type: ReturnType
    :return: dict with any of "titles", "links", "descriptions"
    :rtype: dict
    """
    rdict = {}
    # Title and link both live under the result's h3 anchor.
    h3_tag = single_result.find('h3')
    link_tag = h3_tag.find('a')

    # NOTE: guards use the ReturnType class, not the passed-in member —
    # accessing one enum member through another (return_type.TITLE) is
    # deprecated enum behavior and reads as a bug.
    if return_type in (ReturnType.FULL, ReturnType.TITLE):
        rdict["titles"] = link_tag.text

    if return_type in (ReturnType.FULL, ReturnType.LINK):
        rdict["links"] = link_tag.get("href")

    if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
        # Description is only looked up when actually requested.
        caption = single_result.find('div', class_='compText aAbs')
        desc = caption.find('p', class_='lh-16')
        rdict["descriptions"] = desc.text

    return rdict

search_engine_parser/core/engines/ask.py

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""@desc
22
Parser for ask search results
33
"""
4-
from search_engine_parser.core.base import BaseSearch
4+
from search_engine_parser.core.base import BaseSearch, ReturnType
55

66

77
class AskSearch(BaseSearch):
@@ -24,7 +24,7 @@ def parse_soup(self, soup):
2424
# find all class_='PartialSearchResults-item' => each result
2525
return soup.find_all('div', class_="PartialSearchResults-item")
2626

27-
def parse_single_result(self, single_result, return_type=ReturnType.FULL):
    """
    Parse a single Ask result into a dict of the requested fields.

    :param single_result: single result found in a div with class
        'PartialSearchResults-item'
    :type single_result: `bs4.element.Tag`
    :param return_type: which fields to extract; FULL returns all of them
    :type return_type: ReturnType
    :return: dict with any of "titles", "links", "descriptions"
    :rtype: dict
    """
    rdict = {}
    # BUGFIX: the original tested return_type.TITLE in all three guards,
    # so LINK/DESCRIPTION requests silently returned nothing. Each guard
    # now checks its own member.
    if return_type in (ReturnType.FULL, ReturnType.TITLE):
        rdict["titles"] = single_result.find('a').text

    if return_type in (ReturnType.FULL, ReturnType.LINK):
        rdict["links"] = single_result.a["href"]

    if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
        rdict["descriptions"] = single_result.find(
            'p', class_="PartialSearchResults-item-abstract").text

    return rdict

search_engine_parser/core/engines/baidu.py

Lines changed: 19 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
"""
44

55
import re
6-
from search_engine_parser.core.base import BaseSearch
6+
from search_engine_parser.core.base import BaseSearch, ReturnType
77

88

99
class BaiduSearch(BaseSearch):
@@ -39,27 +39,25 @@ def parse_soup(self, soup):
3939

4040
return soup.find_all('div', {'id': re.compile(r"^\d{1,2}")})
4141

42-
def parse_single_result(self, single_result, return_type=ReturnType.FULL):
    """
    Parse a single Baidu result into a dict of the requested fields.

    :param single_result: single result found in div with a numeric id
    :type single_result: `bs4.element.Tag`
    :param return_type: which fields to extract; FULL returns all of them
    :type return_type: ReturnType
    :return: dict with any of "titles", "links", "descriptions"
    :rtype: dict
    """
    rdict = {}
    if return_type in (ReturnType.FULL, ReturnType.TITLE):
        h3_tag = single_result.find('h3')
        # BUGFIX: key was "title", inconsistent with every other engine's
        # "titles" key, which would break aggregated results.
        rdict["titles"] = h3_tag.text

    if return_type in (ReturnType.FULL, ReturnType.LINK):
        link_tag = single_result.find('a')
        rdict["links"] = link_tag.get('href')

    if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
        rdict["descriptions"] = single_result.find(
            'div', class_='c-abstract').text

    return rdict
Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""@desc
22
Parser for Bing search results
33
"""
4-
from search_engine_parser.core.base import BaseSearch
4+
from search_engine_parser.core.base import BaseSearch, ReturnType
55

66

77
class BingSearch(BaseSearch):
@@ -22,7 +22,7 @@ def parse_soup(self, soup):
2222
# find all li tags
2323
return soup.find_all('li', class_='b_algo')
2424

25-
def parse_single_result(self, single_result, return_type=ReturnType.FULL):
    """
    Parse a single Bing result into a dict of the requested fields.

    :param single_result: single result found in an li with class 'b_algo'
    :type single_result: `bs4.element.Tag`
    :param return_type: which fields to extract; FULL returns all of them
    :type return_type: ReturnType
    :return: dict with any of "titles", "links", "descriptions"
    :rtype: dict
    """
    rdict = {}
    # Title and link both live under the result's h2 anchor.
    h2_tag = single_result.find('h2')
    link_tag = h2_tag.find('a')

    if return_type in (ReturnType.FULL, ReturnType.TITLE):
        rdict["titles"] = link_tag.text

    if return_type in (ReturnType.FULL, ReturnType.LINK):
        rdict["links"] = link_tag.get('href')

    # BUGFIX: was ReturnType.DESCRIPTIONS (no such member) — the
    # description branch raised AttributeError for every request.
    if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
        caption = single_result.find('div', class_='b_caption')
        desc = caption.find('p')
        rdict["descriptions"] = desc.text

    return rdict

search_engine_parser/core/engines/duckduckgo.py

Lines changed: 22 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
Parser for DuckDuckGo search results
33
"""
44
import re
5-
from search_engine_parser.core.base import BaseSearch
5+
from search_engine_parser.core.base import BaseSearch, ReturnType
66

77

88
class DuckDuckGoSearch(BaseSearch):
@@ -25,7 +25,7 @@ def parse_soup(self, soup):
2525
# find all div tags
2626
return soup.find_all('div', class_='result')
2727

28-
def parse_single_result(self, single_result, return_type=ReturnType.FULL):
    """
    Parse a single DuckDuckGo result into a dict of the requested fields.

    :param single_result: single result found in a div with class 'result'
    :type single_result: `bs4.element.Tag`
    :param return_type: which fields to extract; FULL returns all of them
    :type return_type: ReturnType
    :return: dict with any of "titles", "links", "descriptions"
    :rtype: dict
    """
    rdict = {}

    # Consistency fix: use ReturnType.TITLE (class access) to match the
    # LINK/DESCRIPTION guards; member-via-member access is deprecated.
    if return_type in (ReturnType.FULL, ReturnType.TITLE):
        h2 = single_result.find('h2', class_="result__title")  # pylint: disable=invalid-name
        rdict["titles"] = h2.text.strip()

    if return_type in (ReturnType.FULL, ReturnType.LINK):
        link_tag = single_result.find('a', class_="result__url")
        # raw link is of format "/url?q=REAL-LINK&sa=..."; the real URL is
        # percent-encoded inside the uddg= parameter.
        raw_link = self.base_url + link_tag.get('href')
        re_str = re.findall("uddg=(.+)", raw_link)[0]
        re_str = re_str.replace("%3A", ":")
        link = re_str.replace("%2F", "/")
        link = link.replace("%2D", "-")
        rdict["links"] = link

    if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
        desc = single_result.find(class_='result__snippet')
        rdict["descriptions"] = desc.text

    return rdict
5960

6061
def get_search_url(self, query=None, page=None, **kwargs):
@@ -64,7 +65,7 @@ def get_search_url(self, query=None, page=None, **kwargs):
6465
# Start value for the page
6566
start = 0 if (page < 2) else (((page-1) * 50) - 20)
6667

67-
type_ = self.keywords.get("type", None)
68+
type_ = kwargs.get("type", None)
6869

6970
return self.search_url.format(
7071
query=query,

search_engine_parser/core/engines/github.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""@desc
22
Parser for GitHub search results
33
"""
4-
from search_engine_parser.core.base import BaseSearch
4+
from search_engine_parser.core.base import BaseSearch, ReturnType
55
from search_engine_parser.core.exceptions import IncorrectKeyWord
66

77

@@ -19,7 +19,8 @@ class GitHubSearch(BaseSearch):
1919
"\n\tAs of May 2019, GitHub reports having over 37 million users and more than 100 million"\
2020
" repositories (including at least 28 million public repositories), making it the largest "\
2121
"host of source code in the world."
22-
def parse_soup(self, soup):
22+
23+
def parse_soup(self, soup, **kwargs):
2324
"""
2425
Parses GitHub for a search query.
2526
"""
@@ -34,7 +35,7 @@ def parse_soup(self, soup):
3435
"Issues",
3536
"Commits",
3637
"Code")
37-
self.type = self.keywords.get("type", None)
38+
self.type = kwargs.get("type", None)
3839
if self.type not in allowed_types:
3940
raise IncorrectKeyWord("No type <{type_}> exists".format(type_=self.type))
4041
# find all li tags
@@ -52,7 +53,7 @@ def parse_soup(self, soup):
5253
elif self.type == "Commits":
5354
return soup.find_all('div', class_='commits-list-item')
5455

55-
def parse_single_result(self, single_result):
56+
def parse_single_result(self, single_result, return_type=ReturnType.FULL):
5657
"""
5758
Parses the source code to return
5859

search_engine_parser/core/engines/myanimelist.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,14 @@ class MyAnimeListSearch(BaseSearch):
1313
name = "MyAnimeList"
1414

1515
search_url = "https://myanimelist.net/anime.php?q={query}&show={offset}"
16-
summary = "\tMyAnimeList, often abbreviated as MAL, is an anime and manga social"\ "networking and social cataloging application website."\
16+
summary = "\tMyAnimeList, often abbreviated as MAL, is an anime and manga social"\
17+
"networking and social cataloging application website."\
1718
"\n\tThe site provides its users with a list-like system to organize"\
1819
"and score anime and manga.\n\tIt facilitates finding users who share"\
1920
"similar tastes and provides a large database on anime and manga.\n\tThe"\
2021
"site claims to have 4.4 million anime and 775,000 manga entries."\
2122
"\n\tIn 2015, the site received over 120 million visitors a month."
2223

23-
"""Override get_search_url"""
2424

2525
def get_search_url(self, query=None, page=None):
2626
"""

search_engine_parser/core/engines/stackoverflow.py

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""@desc
22
Parser for AOL search results
33
"""
4-
from search_engine_parser.core.base import BaseSearch
4+
from search_engine_parser.core.base import BaseSearch, ReturnType
55

66

77
class StackOverflowSearch(BaseSearch):
@@ -25,7 +25,7 @@ def parse_soup(self, soup):
2525
# find all divs
2626
return soup.find_all('div', class_='summary')
2727

28-
def parse_single_result(self, single_result):
28+
def parse_single_result(self, single_result, return_type=ReturnType.FULL):
2929
"""
3030
Parses the source code to return
3131
@@ -34,19 +34,19 @@ def parse_single_result(self, single_result):
3434
:return: parsed title, link and description of single result
3535
:rtype: dict
3636
"""
37+
rdict = {}
3738
h3 = single_result.find('h3') #pylint: disable=invalid-name
3839
link_tag = h3.find('a')
39-
caption = single_result.find('div', class_='excerpt')
40-
# Get the text and link
41-
title = link_tag.text
40+
if return_type in (ReturnType.FULL, return_type.TITLE):
41+
# Get the text and link
42+
rdict["titles"] = link_tag.text
4243

43-
ref_link = link_tag.get('href')
44-
link = self.base_url + ref_link
44+
if return_type in (ReturnType.FULL, return_type.LINK):
45+
ref_link = link_tag.get('href')
46+
link = self.base_url + ref_link
47+
rdict["links"] = link
4548

46-
desc = caption.text
47-
rdict = {
48-
"titles": title,
49-
"links": link,
50-
"descriptions": desc,
51-
}
49+
if return_type in (ReturnType.FULL, return_type.DESCRIPTIONS):
50+
caption = single_result.find('div', class_='excerpt')
51+
rdict["descriptions"] = caption.text
5252
return rdict

0 commit comments

Comments
 (0)