11from bs4 import BeautifulSoup
2- import requests
32from readability import Document
4- from websearch import search_google , get_tbs
5- import random
3+ from websearch_funcs import search_google , get_tbs
64import aiohttp
75import asyncio
6+ from utils import get_random_user_agent
87
9- # REQUEST_SUCCESS = 200
10-
11-
12- # def parse_webpage(url_link):
13- # try:
14- # response = requests.get(url_link)
15- # if response.status_code == REQUEST_SUCCESS:
16- # doc = Document(response.content)
17- # soup = BeautifulSoup(doc.summary(), 'html.parser')
18- # return soup.get_text(strip=True)
19- # return ''
20- # except Exception as e:
21- # print(f"Error: {e}")
22- # return ''
23-
24- # List of user agents for requests
25- USER_AGENTS_LIST = [
26- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36' ,
27- 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36' ,
28- 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36' ,
29- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0' ,
30- 'Mozilla/5.0 (iPhone; CPU iPhone OS 14_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1' ,
31- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.49' ,
32- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 OPR/76.0.4017.123'
33- ]
348
359async def fetch (session , url ):
36- headers = {'User-Agent' : random . choice ( USER_AGENTS_LIST )}
10+ headers = {'User-Agent' : get_random_user_agent ( )}
3711 try :
3812 timeout = aiohttp .ClientTimeout (total = 60 ) # Increasing the total timeout to 60 seconds
3913 await asyncio .sleep (0.5 )
# [diff hunk boundary: lines 40-45 of fetch() — the request call and its
#  error handling — are elided from this diff view]
4620 print ("The request timed out" )
4721 return None
4822
23+
4924async def parse_webpage (url_link ):
5025 async with aiohttp .ClientSession () as session :
5126 html_content = await fetch (session , url_link )