diff --git a/v2/chapter1/1-basicExample.py b/v2/chapter1/1-basicExample.py
new file mode 100644
index 0000000..28ea3ea
--- /dev/null
+++ b/v2/chapter1/1-basicExample.py
@@ -0,0 +1,4 @@
+from urllib.request import urlopen
+
+html = urlopen('http://pythonscraping.com/pages/page1.html')
+print(html.read())
diff --git a/v2/chapter1/2-beautifulSoup.py b/v2/chapter1/2-beautifulSoup.py
new file mode 100644
index 0000000..7cf6d5a
--- /dev/null
+++ b/v2/chapter1/2-beautifulSoup.py
@@ -0,0 +1,6 @@
+from urllib.request import urlopen
+from bs4 import BeautifulSoup
+
+html = urlopen('http://www.pythonscraping.com/pages/page1.html')
+bs = BeautifulSoup(html.read(), 'html.parser')
+print(bs.h1)
diff --git a/v2/chapter1/3-exceptionHandling_1.py b/v2/chapter1/3-exceptionHandling_1.py
new file mode 100644
index 0000000..66bb929
--- /dev/null
+++ b/v2/chapter1/3-exceptionHandling_1.py
@@ -0,0 +1,13 @@
+from urllib.request import urlopen
+from urllib.error import HTTPError
+from urllib.error import URLError
+
+
+try:
+    html = urlopen("https://pythonscrapingthisurldoesnotexist.com")
+except HTTPError as e:
+    print("The server returned an HTTP error")
+except URLError as e:
+    print("The server could not be found!")
+else:
+    print(html.read())
diff --git a/v2/chapter1/4-exceptionHandling_2.py b/v2/chapter1/4-exceptionHandling_2.py
new file mode 100644
index 0000000..9821f0f
--- /dev/null
+++ b/v2/chapter1/4-exceptionHandling_2.py
@@ -0,0 +1,23 @@
+from urllib.request import urlopen
+from urllib.error import HTTPError
+from bs4 import BeautifulSoup
+
+
+def getTitle(url):
+    try:
+        html = urlopen(url)
+    except HTTPError as e:
+        return None
+    try:
+        bsObj = BeautifulSoup(html.read(), "lxml")
+        title = bsObj.body.h1
+    except AttributeError as e:
+        return None
+    return title
+
+
+title = getTitle("http://www.pythonscraping.com/pages/page1.html")
+if title is None:
+    print("Title could not be found")
+else:
+    print(title)
diff --git a/Chapter01_BeginningToScrape.ipynb b/v2/chapter1/Chapter01_BeginningToScrape.ipynb
similarity index 100%
rename from Chapter01_BeginningToScrape.ipynb
rename to v2/chapter1/Chapter01_BeginningToScrape.ipynb
diff --git a/v2/chapter2/1-selectByClass.py b/v2/chapter2/1-selectByClass.py
new file mode 100644
index 0000000..b4543bd
--- /dev/null
+++ b/v2/chapter2/1-selectByClass.py
@@ -0,0 +1,9 @@
+from urllib.request import urlopen
+from bs4 import BeautifulSoup
+
+
+html = urlopen('http://www.pythonscraping.com/pages/warandpeace.html')
+bs = BeautifulSoup(html.read(), 'html.parser')
+nameList = bs.findAll('span', {'class':'green'})
+for name in nameList:
+    print(name.get_text())
diff --git a/v2/chapter2/2-selectByOtherArguments.py b/v2/chapter2/2-selectByOtherArguments.py
new file mode 100644
index 0000000..d8bf3d5
--- /dev/null
+++ b/v2/chapter2/2-selectByOtherArguments.py
@@ -0,0 +1,22 @@
+from urllib.request import urlopen
+from bs4 import BeautifulSoup
+
+
+html = urlopen('http://www.pythonscraping.com/pages/warandpeace.html')
+bs = BeautifulSoup(html, "html.parser")
+
+# Select by tags
+titles = bs.find_all(['h1', 'h2','h3','h4','h5','h6'])
+print([title for title in titles])
+
+# Select by tag attributes
+allText = bs.find_all('span', {'class':{'green', 'red'}})
+print([text for text in allText])
+
+# Select by text content of tags
+nameList = bs.find_all(text='the prince')
+print(len(nameList))
+
+# Select by tags that contain a particular attribute
+titles = bs.find_all(id='title', class_='text')
+print([title for title in titles])
diff --git a/v2/chapter2/3-findChildrens.py b/v2/chapter2/3-findChildrens.py
new file mode 100644
index 0000000..e2b4952
--- /dev/null
+++ b/v2/chapter2/3-findChildrens.py
@@ -0,0 +1,8 @@
+from urllib.request import urlopen
+from bs4 import BeautifulSoup
+
+
+html = urlopen('http://www.pythonscraping.com/pages/page3.html')
+bs = BeautifulSoup(html, 'html.parser')
+for child in bs.find('table',{'id':'giftList'}).children:
+    print(child)
diff --git a/v2/chapter2/4-findSiblings.py b/v2/chapter2/4-findSiblings.py
new file mode 100644
index 0000000..3f87a36
--- /dev/null
+++ b/v2/chapter2/4-findSiblings.py
@@ -0,0 +1,8 @@
+from urllib.request import urlopen
+from bs4 import BeautifulSoup
+
+
+html = urlopen('http://www.pythonscraping.com/pages/page3.html')
+bs = BeautifulSoup(html, 'html.parser')
+for sibling in bs.find('table', {'id':'giftList'}).tr.next_siblings:
+    print(sibling)
diff --git a/v2/chapter2/5-findParents.py b/v2/chapter2/5-findParents.py
new file mode 100644
index 0000000..0c826ca
--- /dev/null
+++ b/v2/chapter2/5-findParents.py
@@ -0,0 +1,9 @@
+from urllib.request import urlopen
+from bs4 import BeautifulSoup
+
+
+html = urlopen('http://www.pythonscraping.com/pages/page3.html')
+bs = BeautifulSoup(html, 'html.parser')
+print(bs.find(
+    'img',
+    {'src':'../img/gifts/img1.jpg'}).parent.previous_sibling.get_text())
diff --git a/v2/chapter2/6-regularExpressions.py b/v2/chapter2/6-regularExpressions.py
new file mode 100644
index 0000000..0bf4dd4
--- /dev/null
+++ b/v2/chapter2/6-regularExpressions.py
@@ -0,0 +1,12 @@
+from urllib.request import urlopen
+from bs4 import BeautifulSoup
+import re
+
+
+html = urlopen('http://www.pythonscraping.com/pages/page3.html')
+bs = BeautifulSoup(html, 'html.parser')
+images = bs.find_all(
+    'img',
+    {'src':re.compile(r'\.\./img/gifts/img.*\.jpg')})
+for image in images:
+    print(image['src'])
diff --git a/v2/chapter2/7-lambdaExpressions.py b/v2/chapter2/7-lambdaExpressions.py
new file mode 100644
index 0000000..b4d46b6
--- /dev/null
+++ b/v2/chapter2/7-lambdaExpressions.py
@@ -0,0 +1,10 @@
+
+from urllib.request import urlopen
+from bs4 import BeautifulSoup
+
+
+html = urlopen("http://www.pythonscraping.com/pages/page2.html")
+bsObj = BeautifulSoup(html, "html.parser")
+tags = bsObj.findAll(lambda tag: len(tag.attrs) == 2)
+for tag in tags:
+    print(tag)
diff --git a/Chapter02-AdvancedHTMLParsing.ipynb b/v2/chapter2/Chapter02-AdvancedHTMLParsing.ipynb
similarity index 100%
rename from Chapter02-AdvancedHTMLParsing.ipynb
rename to v2/chapter2/Chapter02-AdvancedHTMLParsing.ipynb
diff --git a/v2/chapter3/1-getPageLinks.py b/v2/chapter3/1-getPageLinks.py
new file mode 100644
index 0000000..740741a
--- /dev/null
+++ b/v2/chapter3/1-getPageLinks.py
@@ -0,0 +1,10 @@
+from urllib.request import urlopen
+from bs4 import BeautifulSoup
+
+
+html = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon')
+bs = BeautifulSoup(html, 'html.parser')
+
+for link in bs.find_all('a'):
+    if 'href' in link.attrs:
+        print(link.attrs['href'])
diff --git a/v2/chapter3/2-getPageArticleLinks.py b/v2/chapter3/2-getPageArticleLinks.py
new file mode 100644
index 0000000..cae42c3
--- /dev/null
+++ b/v2/chapter3/2-getPageArticleLinks.py
@@ -0,0 +1,11 @@
+from urllib.request import urlopen
+from bs4 import BeautifulSoup
+import re
+
+
+html = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon')
+bs = BeautifulSoup(html, 'html.parser')
+regex = re.compile('^(/wiki/)((?!:).)*$')
+for link in bs.find('div', {'id':'bodyContent'}).find_all('a', href=regex):
+    if 'href' in link.attrs:
+        print(link.attrs['href'])
diff --git a/v2/chapter3/3-getRandomLinks.py b/v2/chapter3/3-getRandomLinks.py
new file mode 100644
index 0000000..46188b8
--- /dev/null
+++ b/v2/chapter3/3-getRandomLinks.py
@@ -0,0 +1,20 @@
+from urllib.request import urlopen
+from bs4 import BeautifulSoup
+import datetime
+import random
+import re
+
+
+def getLinks(articleUrl):
+    html = urlopen('http://en.wikipedia.org{}'.format(articleUrl))
+    bs = BeautifulSoup(html, 'html.parser')
+    regex = re.compile('^(/wiki/)((?!:).)*$')
+    return bs.find('div', {'id':'bodyContent'}).find_all('a', href=regex)
+
+
+random.seed(datetime.datetime.now())
+links = getLinks('/wiki/Kevin_Bacon')
+while len(links) > 0:
+    newArticle = links[random.randint(0, len(links)-1)].attrs['href']
+    print(newArticle)
+    links = getLinks(newArticle)
diff --git a/v2/chapter3/4-crawlPage.py b/v2/chapter3/4-crawlPage.py
new file mode 100644
index 0000000..59bdfe6
--- /dev/null
+++ b/v2/chapter3/4-crawlPage.py
@@ -0,0 +1,22 @@
+from urllib.request import urlopen
+from bs4 import BeautifulSoup
+import re
+
+
+pages = set()
+def getLinks(pageUrl):
+    global pages
+    html = urlopen('http://en.wikipedia.org{}'.format(pageUrl))
+    bs = BeautifulSoup(html, 'html.parser')
+    regex = re.compile('^(/wiki/)')
+    for link in bs.find_all('a', href=regex):
+        if 'href' in link.attrs:
+            if link.attrs['href'] not in pages:
+                #We have encountered a new page
+                newPage = link.attrs['href']
+                print(newPage)
+                pages.add(newPage)
+                getLinks(newPage)
+
+
+getLinks('')
diff --git a/v2/chapter3/5-crawlAndScrapePage.py b/v2/chapter3/5-crawlAndScrapePage.py
new file mode 100644
index 0000000..cbd96a2
--- /dev/null
+++ b/v2/chapter3/5-crawlAndScrapePage.py
@@ -0,0 +1,31 @@
+from urllib.request import urlopen
+from bs4 import BeautifulSoup
+import re
+
+
+pages = set()
+def getLinks(pageUrl):
+    global pages
+    html = urlopen('http://en.wikipedia.org{}'.format(pageUrl))
+    bs = BeautifulSoup(html, 'html.parser')
+    try:
+        # Find the page's title
+        print(bs.h1.get_text())
+        # Find the page's 1st paragraph
+        print(bs.find(id='mw-content-text').find_all('p')[0])
+        # Find edit links (doesn't apply anymore)
+        print(bs.find(id='ca-edit').find('span').find('a').attrs['href'])
+    except AttributeError:
+        print('This page is missing something! Continuing.')
+    for link in bs.find_all('a', href=re.compile('^(/wiki/)')):
+        if 'href' in link.attrs:
+            if link.attrs['href'] not in pages:
+                #We have encountered a new page
+                newPage = link.attrs['href']
+                print('-'*20)
+                print(newPage)
+                pages.add(newPage)
+                getLinks(newPage)
+
+
+getLinks('')
diff --git a/v2/chapter3/6-getRandomExternalLinks.py b/v2/chapter3/6-getRandomExternalLinks.py
new file mode 100644
index 0000000..cb52401
--- /dev/null
+++ b/v2/chapter3/6-getRandomExternalLinks.py
@@ -0,0 +1,65 @@
+from urllib.request import urlopen
+from urllib.parse import urlparse
+from bs4 import BeautifulSoup
+import re
+import datetime
+import random
+
+
+pages = set()
+random.seed(datetime.datetime.now())
+
+
+#Retrieves a list of all internal links found on a page
+def getInternalLinks(bs, includeUrl):
+    includeUrl = '{}://{}'.format(
+        urlparse(includeUrl).scheme,
+        urlparse(includeUrl).netloc)
+    internalLinks = []
+    #Finds all links that begin with a "/"
+    regex = re.compile('^(/|.*'+includeUrl+')')
+    for link in bs.find_all('a', href=regex):
+        if link.attrs['href'] is not None:
+            if link.attrs['href'] not in internalLinks:
+                if(link.attrs['href'].startswith('/')):
+                    internalLinks.append(includeUrl+link.attrs['href'])
+                else:
+                    internalLinks.append(link.attrs['href'])
+    return internalLinks
+
+
+#Retrieves a list of all external links found on a page
+def getExternalLinks(bs, excludeUrl):
+    externalLinks = []
+    #Finds all links that start with "http" or "www" and don't contain the current URL
+    regex = re.compile('^(http|www)((?!'+excludeUrl+').)*$')
+    for link in bs.find_all('a', href=regex):
+        if link.attrs['href'] is not None:
+            if link.attrs['href'] not in externalLinks:
+                externalLinks.append(link.attrs['href'])
+    return externalLinks
+
+
+def getRandomExternalLink(startingPage):
+    html = urlopen(startingPage)
+    bs = BeautifulSoup(html, 'html.parser')
+    externalLinks = getExternalLinks(bs, urlparse(startingPage).netloc)
+    if len(externalLinks) == 0:
+        print('No external links, looking around the site for one')
+        domain = '{}://{}'.format(
+            urlparse(startingPage).scheme,
+            urlparse(startingPage).netloc)
+        internalLinks = getInternalLinks(bs, domain)
+        return getRandomExternalLink(
+            internalLinks[random.randint(0, len(internalLinks)-1)])
+    else:
+        return externalLinks[random.randint(0, len(externalLinks)-1)]
+
+
+def followExternalOnly(startingSite):
+    externalLink = getRandomExternalLink(startingSite)
+    print('Random external link is: {}'.format(externalLink))
+    followExternalOnly(externalLink)
+
+
+followExternalOnly('http://oreilly.com')
diff --git a/v2/chapter3/7-getAllExternalLinks.py b/v2/chapter3/7-getAllExternalLinks.py
new file mode 100644
index 0000000..0913530
--- /dev/null
+++ b/v2/chapter3/7-getAllExternalLinks.py
@@ -0,0 +1,61 @@
+import re
+from urllib.request import urlopen
+from urllib.parse import urlparse
+from bs4 import BeautifulSoup
+
+
+allExtLinks = set()
+allIntLinks = set()
+
+
+#Retrieves a list of all internal links found on a page
+def getInternalLinks(bs, includeUrl):
+    includeUrl = '{}://{}'.format(
+        urlparse(includeUrl).scheme,
+        urlparse(includeUrl).netloc)
+    internalLinks = []
+    #Finds all links that begin with a "/"
+    regex = re.compile('^(/|.*'+includeUrl+')')
+    for link in bs.find_all('a', href=regex):
+        if link.attrs['href'] is not None:
+            if link.attrs['href'] not in internalLinks:
+                if(link.attrs['href'].startswith('/')):
+                    internalLinks.append(includeUrl+link.attrs['href'])
+                else:
+                    internalLinks.append(link.attrs['href'])
+    return internalLinks
+
+
+#Retrieves a list of all external links found on a page
+def getExternalLinks(bs, excludeUrl):
+    externalLinks = []
+    #Finds all links that start with "http" or "www" and don't contain the current URL
+    regex = re.compile('^(http|www)((?!'+excludeUrl+').)*$')
+    for link in bs.find_all('a', href=regex):
+        if link.attrs['href'] is not None:
+            if link.attrs['href'] not in externalLinks:
+                externalLinks.append(link.attrs['href'])
+    return externalLinks
+
+
+# Collects a list of all external URLs found on the site
+def getAllExternalLinks(siteUrl):
+    html = urlopen(siteUrl)
+    domain = '{}://{}'.format(
+        urlparse(siteUrl).scheme,
+        urlparse(siteUrl).netloc)
+    bs = BeautifulSoup(html, 'html.parser')
+    internalLinks = getInternalLinks(bs, domain)
+    externalLinks = getExternalLinks(bs, domain)
+    for link in externalLinks:
+        if link not in allExtLinks:
+            allExtLinks.add(link)
+            print(link)
+    for link in internalLinks:
+        if link not in allIntLinks:
+            allIntLinks.add(link)
+            getAllExternalLinks(link)
+
+
+allIntLinks.add('http://oreilly.com')
+getAllExternalLinks('http://oreilly.com')
diff --git a/Chapter03-web-crawlers.ipynb b/v2/chapter3/Chapter03-web-crawlers.ipynb
similarity index 100%
rename from Chapter03-web-crawlers.ipynb
rename to v2/chapter3/Chapter03-web-crawlers.ipynb
diff --git a/Chapter04_CrawlingModels.ipynb b/v2/chapter4/Chapter04_CrawlingModels.ipynb
similarity index 100%
rename from Chapter04_CrawlingModels.ipynb
rename to v2/chapter4/Chapter04_CrawlingModels.ipynb