Commit 8889dea

Author: Mario Burgos (committed)

Adds html.parser to BeautifulSoup([markup], html.parser) calls.

1 parent: 2ef1be6

21 files changed: +24 -24 lines changed
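
The change in each file is essentially the same: pass an explicit parser name to the BeautifulSoup constructor instead of letting bs4 guess one. A minimal sketch of the pattern (not taken from the commit itself), using one of the book's sample pages that already appears in these files; "html.parser" ships with the Python standard library, so no extra install is needed:

    from urllib.request import urlopen
    from bs4 import BeautifulSoup

    # Fetch a sample page already used in the chapter 2 examples
    html = urlopen("http://www.pythonscraping.com/pages/page3.html")

    # Naming the parser explicitly avoids bs4's "no parser was explicitly
    # specified" warning and keeps parsing behavior consistent across
    # machines that may have different parsers (lxml, html5lib) installed.
    bsObj = BeautifulSoup(html, "html.parser")
    print(bsObj.title.get_text())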

chapter11/4-solveCaptcha.py (1 addition, 1 deletion)

@@ -13,7 +13,7 @@ def cleanImage(imagePath):
     borderImage.save(imagePath)
 
 html = urlopen("http://www.pythonscraping.com/humans-only")
-bsObj = BeautifulSoup(html)
+bsObj = BeautifulSoup(html, "html.parser")
 #Gather prepopulated form values
 imageLocation = bsObj.find("img", {"title": "Image CAPTCHA"})["src"]
 formBuildId = bsObj.find("input", {"name":"form_build_id"})["value"]

chapter2/1-selectByClass.py (1 addition, 1 deletion)

@@ -2,7 +2,7 @@
 from bs4 import BeautifulSoup
 
 html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
-bsObj = BeautifulSoup(html)
+bsObj = BeautifulSoup(html, "html.parser")
 nameList = bsObj.findAll("span", {"class":"green"})
 for name in nameList:
     print(name.get_text())

chapter2/2-selectByAttribute.py (1 addition, 1 deletion)

@@ -2,6 +2,6 @@
 from bs4 import BeautifulSoup
 
 html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
-bsObj = BeautifulSoup(html)
+bsObj = BeautifulSoup(html, "html.parser")
 allText = bsObj.findAll(id="text")
 print(allText[0].get_text())

chapter2/3-findDescendants.py (1 addition, 1 deletion)

@@ -2,7 +2,7 @@
 from bs4 import BeautifulSoup
 
 html = urlopen("http://www.pythonscraping.com/pages/page3.html")
-bsObj = BeautifulSoup(html)
+bsObj = BeautifulSoup(html, "html.parser")
 
 for child in bsObj.find("table",{"id":"giftList"}).children:
     print(child)

chapter2/4-findSiblings.py (1 addition, 1 deletion)

@@ -1,7 +1,7 @@
 from urllib.request import urlopen
 from bs4 import BeautifulSoup
 html = urlopen("http://www.pythonscraping.com/pages/page3.html")
-bsObj = BeautifulSoup(html)
+bsObj = BeautifulSoup(html, "html.parser")
 
 for sibling in bsObj.find("table",{"id":"giftList"}).tr.next_siblings:
     print(sibling)

chapter2/5-findParents.py (2 additions, 2 deletions)

@@ -2,5 +2,5 @@
 from bs4 import BeautifulSoup
 
 html = urlopen("http://www.pythonscraping.com/pages/page3.html")
-bsObj = BeautifulSoup(html)
-print(bsObj.find("img",{"src":"../img/gifts/img1.jpg"}).parent.previous_sibling.get_text())
+bsObj = BeautifulSoup(html, "html.parser")
+print(bsObj.find("img",{"src":"../img/gifts/img1.jpg"}).parent.previous_sibling.get_text())

chapter2/6-regularExpressions.py (1 addition, 1 deletion)

@@ -3,7 +3,7 @@
 import re
 
 html = urlopen("http://www.pythonscraping.com/pages/page3.html")
-bsObj = BeautifulSoup(html)
+bsObj = BeautifulSoup(html, "html.parser")
 images = bsObj.findAll("img", {"src":re.compile("\.\.\/img\/gifts/img.*\.jpg")})
 for image in images:
     print(image["src"])

chapter2/7-lambdaExpressions.py (1 addition, 1 deletion)

@@ -1,7 +1,7 @@
 from urllib.request import urlopen
 from bs4 import BeautifulSoup
 html = urlopen("http://www.pythonscraping.com/pages/page2.html")
-bsObj = BeautifulSoup(html)
+bsObj = BeautifulSoup(html, "html.parser")
 tags = bsObj.findAll(lambda tag: len(tag.attrs) == 2)
 for tag in tags:
     print(tag)

chapter3/1-getWikiLinks.py (1 addition, 1 deletion)

@@ -7,7 +7,7 @@
 random.seed(datetime.datetime.now())
 def getLinks(articleUrl):
     html = urlopen("http://en.wikipedia.org"+articleUrl)
-    bsObj = BeautifulSoup(html)
+    bsObj = BeautifulSoup(html, "html.parser")
     return bsObj.find("div", {"id":"bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))
 links = getLinks("/wiki/Kevin_Bacon")
 while len(links) > 0:

chapter3/2-crawlWikipedia.py (1 addition, 1 deletion)

@@ -6,7 +6,7 @@
 def getLinks(pageUrl):
     global pages
     html = urlopen("http://en.wikipedia.org"+pageUrl)
-    bsObj = BeautifulSoup(html)
+    bsObj = BeautifulSoup(html, "html.parser")
     try:
         print(bsObj.h1.get_text())
         print(bsObj.find(id ="mw-content-text").findAll("p")[0])
