Skip to content

Commit 733b441

Browse files
author
KVignesh122
committed
Updated websearch file
1 parent 5066a27 commit 733b441

File tree

7 files changed

+298
-218
lines changed

7 files changed

+298
-218
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,4 @@ __pycache__/*
77
*.csv
88
oai.py
99
main-mine.py
10+
websearch copy.py

requirement.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
tiktoken
2+
readability-lxml

scrapper.py

Lines changed: 4 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,39 +1,13 @@
11
from bs4 import BeautifulSoup
2-
import requests
32
from readability import Document
4-
from websearch import search_google, get_tbs
5-
import random
3+
from websearch_funcs import search_google, get_tbs
64
import aiohttp
75
import asyncio
6+
from utils import get_random_user_agent
87

9-
# REQUEST_SUCCESS = 200
10-
11-
12-
# def parse_webpage(url_link):
13-
# try:
14-
# response = requests.get(url_link)
15-
# if response.status_code == REQUEST_SUCCESS:
16-
# doc = Document(response.content)
17-
# soup = BeautifulSoup(doc.summary(), 'html.parser')
18-
# return soup.get_text(strip=True)
19-
# return ''
20-
# except Exception as e:
21-
# print(f"Error: {e}")
22-
# return ''
23-
24-
# List of user agents for requests
25-
USER_AGENTS_LIST = [
26-
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
27-
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
28-
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
29-
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0',
30-
'Mozilla/5.0 (iPhone; CPU iPhone OS 14_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1',
31-
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.49',
32-
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 OPR/76.0.4017.123'
33-
]
348

359
async def fetch(session, url):
36-
headers = {'User-Agent': random.choice(USER_AGENTS_LIST)}
10+
headers = {'User-Agent': get_random_user_agent()}
3711
try:
3812
timeout = aiohttp.ClientTimeout(total=60) # Increasing the total timeout to 60 seconds
3913
await asyncio.sleep(0.5)
@@ -46,6 +20,7 @@ async def fetch(session, url):
4620
print("The request timed out")
4721
return None
4822

23+
4924
async def parse_webpage(url_link):
5025
async with aiohttp.ClientSession() as session:
5126
html_content = await fetch(session, url_link)

utils.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
from random import choice
2+
3+
# List of user agents for requests
4+
USER_AGENTS_LIST = [
5+
# Chrome
6+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
7+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
8+
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
9+
# Firefox
10+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0',
11+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:88.0) Gecko/20100101 Firefox/88.0',
12+
'Mozilla/5.0 (X11; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0',
13+
# Safari
14+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1 Safari/605.1.15',
15+
'Mozilla/5.0 (iPhone; CPU iPhone OS 14_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1',
16+
# Edge
17+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.49',
18+
# Opera
19+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 OPR/76.0.4017.123'
20+
]
21+
22+
# Google URL templates with %-style named placeholders (tld, lang, query, tbs, safe, country, tbm)
23+
GOOGLE_HOMEPAGE_URL = "https://www.google.%(tld)s/"
24+
GOOGLE_SEARCH_RESULTS_URL = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
25+
"btnG=Google+Search&tbs=%(tbs)s&safe=%(safe)s&" \
26+
"cr=%(country)s&tbm=%(tbm)s"
27+
28+
29+
def get_random_user_agent():
30+
"""Return a randomly chosen User-Agent string from USER_AGENTS_LIST."""
31+
return choice(USER_AGENTS_LIST)

websearch.py

Lines changed: 0 additions & 189 deletions
This file was deleted.

0 commit comments

Comments
 (0)