
Commit 2ef1be6

Ryan Mitchell authored and committed
2 parents: e73888c + 6c3cdcc

8 files changed: +18 −19 lines


chapter1/2-beautifulSoup.py
Lines changed: 2 additions & 2 deletions

@@ -2,5 +2,5 @@
 from bs4 import BeautifulSoup
 
 html = urlopen("http://www.pythonscraping.com/exercises/exercise1.html")
-bsObj = BeautifulSoup(html.read());
-print(bsObj.h1)
+bsObj = BeautifulSoup(html.read())
+print(bsObj.h1)
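
A side note on the first changed line: current versions of BeautifulSoup emit a "no parser was explicitly specified" warning when called as above. A minimal sketch with an explicit parser (the parser choice is my assumption, not part of this commit):

    from urllib.request import urlopen
    from bs4 import BeautifulSoup

    html = urlopen("http://www.pythonscraping.com/exercises/exercise1.html")
    # Naming the parser explicitly avoids bs4's "no parser was
    # explicitly specified" warning
    bsObj = BeautifulSoup(html.read(), "html.parser")
    print(bsObj.h1)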

chapter11/3-readWebImages.py
Lines changed: 2 additions & 2 deletions

@@ -8,7 +8,7 @@
 driver.get("http://www.amazon.com/War-Peace-Leo-Nikolayevich-Tolstoy/dp/1427030200")
 time.sleep(2)
 
-driver.find_element_by_id("sitbLogoImg").click()
+driver.find_element_by_id("img-canvas").click()
 #The easiest way to get exactly one of every page
 imageList = set()
 
@@ -33,4 +33,4 @@
 p = subprocess.Popen(["tesseract", "page.jpg", "page"], stdout=subprocess.PIPE,stderr=subprocess.PIPE)
 p.wait()
 f = open("page.txt", "r")
-print(f.read())
+print(f.read())
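
The change above only swaps the element id being clicked. Unrelated to this commit, note that Selenium 4 later removed the find_element_by_* helper methods; a hedged sketch of the equivalent call under current Selenium (my adaptation, not what this commit uses):

    from selenium.webdriver.common.by import By

    # Selenium 4 replaced driver.find_element_by_id(...)
    # with driver.find_element(By.ID, ...)
    driver.find_element(By.ID, "img-canvas").click()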

chapter4/6-wikiHistories.py
Lines changed: 4 additions & 5 deletions

@@ -1,4 +1,5 @@
 from urllib.request import urlopen
+from urllib.request import HTTPError
 from bs4 import BeautifulSoup
 import datetime
 import json
@@ -9,8 +10,7 @@
 def getLinks(articleUrl):
     html = urlopen("http://en.wikipedia.org"+articleUrl)
     bsObj = BeautifulSoup(html)
-    return bsObj.find("div", {"id":"bodyContent"}).findAll("a",
-                      href=re.compile("^(/wiki/)((?!:).)*$"))
+    return bsObj.find("div", {"id":"bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))
 
 def getHistoryIPs(pageUrl):
     #Format of revision history pages is:
@@ -31,8 +31,7 @@ def getHistoryIPs(pageUrl):
 
 def getCountry(ipAddress):
     try:
-        response = urlopen("http://freegeoip.net/json/"
-                           +ipAddress).read().decode('utf-8')
+        response = urlopen("http://freegeoip.net/json/"+ipAddress).read().decode('utf-8')
     except HTTPError:
         return None
     responseJson = json.loads(response)
@@ -51,4 +50,4 @@ def getCountry(ipAddress):
                 print(historyIP+" is from "+country)
 
     newLink = links[random.randint(0, len(links)-1)].attrs["href"]
-    links = getLinks(newLink)
+    links = getLinks(newLink)
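
One caveat on the added import: it works only because urllib.request happens to re-export HTTPError; the class is actually defined in urllib.error. The more conventional import (my suggestion, not what this commit uses) would be:

    # HTTPError lives in urllib.error; urllib.request merely re-exports it
    from urllib.error import HTTPError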

chapter5/6-6DegreesCrawlWiki.py
Lines changed: 2 additions & 2 deletions

@@ -36,7 +36,7 @@ def insertLink(fromPageId, toPageId):
 def getLinks(pageUrl, recursionLevel):
     global pages
     if recursionLevel > 4:
-        return;
+        return
     pageId = insertPageIfNotExists(pageUrl)
     html = urlopen("http://en.wikipedia.org"+pageUrl)
     bsObj = BeautifulSoup(html)
@@ -51,4 +51,4 @@ def getLinks(pageUrl, recursionLevel):
             print("Skipping: "+str(link.attrs['href'])+" found on "+pageUrl)
 getLinks("/wiki/Kevin_Bacon", 0)
 cur.close()
-conn.close()
+conn.close()
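
A small robustness note on the closing lines: if getLinks raises partway through the crawl, cur.close() and conn.close() never run. A hedged variant using try/finally (my adaptation; conn and cur are the script's existing database handles):

    try:
        getLinks("/wiki/Kevin_Bacon", 0)
    finally:
        # Release the cursor and connection even if the crawl fails
        cur.close()
        conn.close()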

chapter6/5-readPdf.py
Lines changed: 2 additions & 2 deletions

@@ -18,7 +18,7 @@ def readPDF(pdfFile):
     retstr.close()
     return content
 
-pdfFile = urlopen("http://pythonscraping.com/pages/warandpeace/chapter1.pdf");
+pdfFile = urlopen("http://pythonscraping.com/pages/warandpeace/chapter1.pdf")
 outputString = readPDF(pdfFile)
 print(outputString)
-pdfFile.close()
+pdfFile.close()

chapter6/readPdf.py
Lines changed: 2 additions & 2 deletions

@@ -18,7 +18,7 @@ def readPDF(pdfFile):
     retstr.close()
     return content
 
-pdfFile = urlopen("http://pythonscraping.com/pages/warandpeace/chapter1.pdf");
+pdfFile = urlopen("http://pythonscraping.com/pages/warandpeace/chapter1.pdf")
 outputString = readPDF(pdfFile)
 print(outputString)
-pdfFile.close()
+pdfFile.close()
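
Both copies of readPdf.py receive the identical fix. An alternative that makes the trailing close() unnecessary is to treat the response as a context manager (a sketch reusing the readPDF helper these files define):

    # The object urlopen returns supports the context-manager protocol,
    # so the connection closes automatically at the end of the block
    with urlopen("http://pythonscraping.com/pages/warandpeace/chapter1.pdf") as pdfFile:
        print(readPDF(pdfFile))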

chapter8/3-markovGenerator.py
Lines changed: 3 additions & 3 deletions

@@ -17,14 +17,14 @@ def retrieveRandomWord(wordList):
 
 def buildWordDict(text):
     #Remove newlines and quotes
-    text = text.replace("\n", " ");
-    text = text.replace("\"", "");
+    text = text.replace("\n", " ")
+    text = text.replace("\"", "")
 
     #Make sure puncuation are treated as their own "word," so they will be included
     #in the Markov chain
     punctuation = [',','.',';',':']
     for symbol in punctuation:
-        text = text.replace(symbol, " "+symbol+" ");
+        text = text.replace(symbol, " "+symbol+" ")
 
     words = text.split(" ")
     #Filter out empty words
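
The padding added around each punctuation mark is what lets the later text.split(" ") treat punctuation as standalone tokens. A quick illustration with hypothetical input (not from the file):

    text = "He fell, hard."
    for symbol in [',', '.', ';', ':']:
        text = text.replace(symbol, " "+symbol+" ")
    # split(" ") now yields 'fell', ',', 'hard', '.' as separate tokens,
    # plus some empty strings that the script filters out afterward
    print(text.split(" "))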

chapter8/4-6DegreesFinder.py
Lines changed: 1 addition & 1 deletion

@@ -42,7 +42,7 @@ def searchBreadth(targetPageId, currentPageId, depth, nodes):
         print(found)
         for node in found:
             print(getUrl(node))
-        break;
+        break
 else:
     print("No path found")
 
4848
