Skip to content

Commit 5704bde

Browse files
committed
Use urllib.parse (urlparse/urlencode) to build and parse search URLs
1 parent 56ec989 commit 5704bde

File tree

14 files changed

+129
-44
lines changed

14 files changed

+129
-44
lines changed

search_engine_parser/core/base.py

Lines changed: 21 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,15 @@
22
Base class inherited by every search engine
33
"""
44

5-
from abc import ABCMeta, abstractmethod
6-
import random
75
import asyncio
6+
import random
7+
from abc import ABCMeta, abstractmethod
88
from enum import Enum, unique
9-
import aiohttp
9+
from urllib.parse import urlencode, urlparse
10+
1011
from bs4 import BeautifulSoup
1112

13+
import aiohttp
1214
from search_engine_parser.core.exceptions import NoResultsOrTrafficError
1315

1416

@@ -34,9 +36,11 @@ class BaseSearch:
3436
name = None
3537
# Search Engine unformatted URL
3638
search_url = None
39+
# The url after all query params have been set
40+
_parsed_url = None
3741

3842
@abstractmethod
39-
def parse_soup(self, soup, **kwargs):
43+
def parse_soup(self, soup):
4044
"""
4145
Defines the results contained in a soup
4246
"""
@@ -75,16 +79,9 @@ def parse_result(self, results, **kwargs):
7579
pass
7680
return search_results
7781

78-
@staticmethod
79-
def parse_query(query):
80-
"""
81-
Replace spaces in query
82-
83-
:param query: query to be processed
84-
:type query: str
85-
:rtype: str
86-
"""
87-
return query.replace(" ", "%20").replace(":", "%3A")
82+
def get_params(self, query=None, page=None, offset=None, **kwargs):
83+
""" This function should be overwritten to return a dictionary of query params"""
84+
return {'q': query, 'page': page}
8885

8986
@staticmethod
9087
async def get_source(url):
@@ -134,20 +131,19 @@ async def get_soup(self, url):
134131
def get_search_url(self, query=None, page=None, **kwargs):
135132
"""
136133
Return a formatted search url
137-
"""
138-
# Some URLs use offsets
139-
offset = (page * 10) - 9
140-
141-
return self.search_url.format(
142-
query=query,
143-
page=page,
144-
offset=offset,
145-
)
134+
"""
135+
if not self._parsed_url:
136+
# Some URLs use offsets
137+
offset = (page * 10) - 9
138+
params = self.get_params(query=query, page=page, offset=offset, **kwargs)
139+
url = self.search_url + urlencode(params)
140+
self._parsed_url = urlparse(url)
141+
return self._parsed_url.geturl()
146142

147143
def get_results(self, soup, **kwargs):
148144
""" Get results from soup"""
149145

150-
results = self.parse_soup(soup, **kwargs)
146+
results = self.parse_soup(soup)
151147
# TODO Check if empty results is caused by traffic or answers to query
152148
# were not found
153149
if not results:
@@ -167,7 +163,6 @@ def search(self, query=None, page=None, **kwargs):
167163
:type page: int
168164
:return: dictionary. Containing titles, links, netlocs and descriptions.
169165
"""
170-
parsed_query = self.parse_query(query)
171166
# Get search Page Results
172167
loop = asyncio.get_event_loop()
173168
soup = loop.run_until_complete(
@@ -191,6 +186,5 @@ async def async_search(self, query=None, page=None, callback=None, **kwargs):
191186
# TODO callback should be called
192187
if callback:
193188
pass
194-
parsed_query = self.parse_query(query)
195-
soup = await self.get_soup(self.get_search_url(parsed_query, page))
189+
soup = await self.get_soup(self.get_search_url(query, page, **kwargs))
196190
return self.get_results(soup, **kwargs)

search_engine_parser/core/engines/aol.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ class AolSearch(BaseSearch):
99
Searches Aol for string
1010
"""
1111
name = "AOL"
12-
search_url = "https://search.aol.com/aol/search?q={query}&page={page}"
12+
search_url = "https://search.aol.com/aol/search?"
1313
summary = "\t According to netmarketshare, the old time famous AOL is still in the top 10 "\
1414
"search engines with a market share that is close to 0.06%. "\
1515
"The AOL network includes many popular web sites like engadget.com, techchrunch.com and "\

search_engine_parser/core/engines/ask.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,23 @@ class AskSearch(BaseSearch):
1010
"""
1111
name = "Ask"
1212

13-
search_url = "https://www.ask.com/web?o=0&l=dir&qo=pagination&q={query}&qsrc=998&page={page}"
13+
search_url = "https://www.ask.com/web?"
1414

1515
summary = "\t Formerly known as Ask Jeeves, Ask.com receives approximately 0.42% of the search"\
1616
" share. ASK is based on a question/answer format where most questions are answered by "\
1717
"other users or are in the form of polls.\nIt also has the general search functionality "\
1818
"but the results returned lack quality compared to Google or even Bing and Yahoo."
1919

20+
def get_params(self, query=None, page=None, offset=None, **kwargs):
21+
params = {}
22+
params["o"] = 0
23+
params["l"] = "dir"
24+
params["qo"] = "pagination"
25+
params["q"] = query
26+
params["qsrc"] = 998
27+
params["page"] = page
28+
return params
29+
2030
def parse_soup(self, soup):
2131
"""
2232
Parses Ask Search Soup for results

search_engine_parser/core/engines/baidu.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
"""
44

55
import re
6+
67
from search_engine_parser.core.base import BaseSearch, ReturnType
78

89

@@ -11,7 +12,7 @@ class BaiduSearch(BaseSearch):
1112
Searches Baidu for string
1213
"""
1314
name = "Baidu"
14-
search_url = "https://www.baidu.com/s?wd={query}&pn={offset}&oq={query}"
15+
search_url = "https://www.baidu.com/s?"
1516
summary = "\tBaidu, Inc. is a Chinese multinational technology company specializing in"\
1617
" Internet-related services and products and artificial intelligence (AI), headquartered"\
1718
" in Beijing's Haidian District.\n\tIt is one of the largest AI and internet"\
@@ -20,12 +21,18 @@ class BaiduSearch(BaseSearch):
2021

2122
"""Override get_search_url"""
2223

24+
def get_params(self, query=None, page=None, offset=None, **kwargs):
25+
params = {}
26+
params["wd"] = query
27+
params["pn"] = offset
28+
params["oq"] = query
29+
return params
30+
2331
def get_search_url(self, query=None, page=None):
2432
"""
2533
Return a formatted search url.
2634
Offsets are of form 0,10,20, etc. So if 1 is passed, we make it 0, for 2->(2-1)*10=10. etc.
27-
"""
28-
35+
"""
2936
offset = (page - 1) * 10
3037
return self.search_url.format(query=query, page=page, offset=offset)
3138

search_engine_parser/core/engines/bing.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,21 @@ class BingSearch(BaseSearch):
99
Searches Bing for string
1010
"""
1111
name = "Bing"
12-
search_url = "https://www.bing.com/search?q={query}&count=10&offset=0&first={offset}&FORM=PERE"
12+
search_url = "https://www.bing.com/search?"
1313
summary = "\tBing is Microsoft’s attempt to challenge Google in search, but despite their "\
1414
"efforts they still did not manage to convince users that their search engine can be"\
1515
" an alternative to Google.\n\tTheir search engine market share is constantly below "\
1616
"10%, even though Bing is the default search engine on Windows PCs."
1717

18+
def get_params(self, query=None, page=None, offset=None, **kwargs):
19+
params = {}
20+
params["q"] = query
21+
params["offset"] = 0
22+
params["first"] = offset
23+
params["count"] = 10
24+
params["FORM"] = "PERE"
25+
return params
26+
1827
def parse_soup(self, soup):
1928
"""
2029
Parses Bing for a search query.

search_engine_parser/core/engines/duckduckgo.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
Parser for DuckDuckGo search results
33
"""
44
import re
5+
56
from search_engine_parser.core.base import BaseSearch, ReturnType
67

78

@@ -11,13 +12,23 @@ class DuckDuckGoSearch(BaseSearch):
1112
"""
1213
name = "DuckDuckGo"
1314
base_url = "https://www.duckduckgo.com"
14-
search_url = "https://www.duckduckgo.com/html/?q={query}&s={start}&dc={offset}&v=l&o=json&api=/d.js"
15+
search_url = "https://www.duckduckgo.com/html/?"
1516
summary = "\tHas a number of advantages over the other search engines. \n\tIt has a clean "\
1617
"interface, it does not track users, it is not fully loaded with ads and has a number "\
1718
"of very nice features (only one page of results, you can search directly other web "\
1819
"sites etc).\n\tAccording to DuckDuckGo traffic stats [December, 2018], they are "\
1920
"currently serving more than 30 million searches per day."
2021

22+
def get_params(self, query=None, page=None, offset=None, **kwargs):
23+
params = {}
24+
params["q"] = query
25+
params["s"] = kwargs.get("start", 0)
26+
params["dc"] = offset
27+
params["v"] = "l"
28+
params["o"] = "json"
29+
params["api"] = "/d.js"
30+
return params
31+
2132
def parse_soup(self, soup):
2233
"""
2334
Parses DuckDuckGo Search Soup for a query results

search_engine_parser/core/engines/github.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ class GitHubSearch(BaseSearch):
1111
"""
1212
name = "GitHub"
1313
base_url = "https://github.com"
14-
search_url = base_url + "/search?q={query}&p={page}&type={type_}"
14+
search_url = base_url + "/search?"
1515
summary = "\tGitHub is an American company that provides hosting for software development "\
1616
"version control using Git. It is a subsidiary of Microsoft, which acquired the company "\
1717
"in 2018 for $7.5 billion.\n\tIt offers all of the distributed version control and source"\
@@ -20,7 +20,15 @@ class GitHubSearch(BaseSearch):
2020
" repositories (including at least 28 million public repositories), making it the largest "\
2121
"host of source code in the world."
2222

23-
def parse_soup(self, soup, **kwargs):
23+
def get_params(self, query=None, page=None, offset=None, **kwargs):
24+
params = {}
25+
params["q"] = query
26+
params["p"] = page
27+
params["type"] = kwargs.get("type_", None)
28+
self.type = params["type"]
29+
return params
30+
31+
def parse_soup(self, soup):
2432
"""
2533
Parses GitHub for a search query.
2634
"""
@@ -35,7 +43,6 @@ def parse_soup(self, soup, **kwargs):
3543
"Issues",
3644
"Commits",
3745
"Code")
38-
self.type = kwargs.get("type", None)
3946
if self.type not in allowed_types:
4047
raise IncorrectKeyWord("No type <{type_}> exists".format(type_=self.type))
4148
# find all li tags

search_engine_parser/core/engines/google.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,21 @@ class GoogleSearch(BaseSearch):
1010
Searches Google for string
1111
"""
1212
name = "Google"
13-
search_url = "https://www.google.com/search?client=ubuntu&q={query}&num=10&start={page}"
13+
search_url = "https://www.google.com/search?"
1414
summary = "\tNo need for further introductions. The search engine giant holds the first "\
1515
"place in search with a stunning difference of 65% from second in place Bing.\n"\
1616
"\tAccording to the latest netmarketshare report (November 2018) 73% of searches "\
1717
"were powered by Google and only 7.91% by Bing.\n\tGoogle is also dominating the "\
1818
"mobile/tablet search engine market share with 81%!"
1919

20+
def get_params(self, query=None, offset=None, page=None, **kwargs):
21+
params = {}
22+
params["num"] = 10
23+
params["start"] = page
24+
params["q"] = query
25+
params["client"] = "ubuntu"
26+
return params
27+
2028
def parse_soup(self, soup):
2129
"""
2230
Parses Google Search Soup for results

search_engine_parser/core/engines/googlescholar.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
"""
44

55
import re
6+
67
from search_engine_parser.core.base import BaseSearch, ReturnType
78

89

@@ -11,11 +12,18 @@ class GoogleScholarSearch(BaseSearch):
1112
Searches Google Scholar for string
1213
"""
1314
name = "GoogleScholar"
14-
search_url = "https://scholar.google.gr/scholar?start={page}&q={query}&hl=en"
15+
search_url = "https://scholar.google.gr/scholar?"
1516
summary = "\tGoogle Scholar is a freely accessible web search engine that indexes the full "\
1617
"text or metadata of scholarly literature across an array of publishing formats and "\
1718
"disciplines."
1819

20+
def get_params(self, query=None, offset=None, page=None, **kwargs):
21+
params = {}
22+
params["hl"] = "en"
23+
params["start"] = page
24+
params["q"] = query
25+
return params
26+
1927
def parse_soup(self, soup):
2028
"""
2129
Parses Google Scholar Search Soup for results

search_engine_parser/core/engines/myanimelist.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
"""
44

55
import math
6+
67
from search_engine_parser.core.base import BaseSearch, ReturnType
78

89

@@ -12,7 +13,7 @@ class MyAnimeListSearch(BaseSearch):
1213
"""
1314
name = "MyAnimeList"
1415

15-
search_url = "https://myanimelist.net/anime.php?q={query}&show={offset}"
16+
search_url = "https://myanimelist.net/anime.php?"
1617
summary = "\tMyAnimeList, often abbreviated as MAL, is an anime and manga social"\
1718
"networking and social cataloging application website."\
1819
"\n\tThe site provides its users with a list-like system to organize"\
@@ -21,6 +22,11 @@ class MyAnimeListSearch(BaseSearch):
2122
"site claims to have 4.4 million anime and 775,000 manga entries."\
2223
"\n\tIn 2015, the site received over 120 million visitors a month."
2324

25+
def get_params(self, query=None, offset=None, page=None, **kwargs):
26+
params = {}
27+
params["show"] = offset
28+
params["q"] = query
29+
return params
2430

2531
def get_search_url(self, query=None, page=None):
2632
"""

0 commit comments

Comments
 (0)