Skip to content

Commit 6c3cdcc

Browse files
committed
2 parents 930cba3 + a76abcc commit 6c3cdcc

File tree

7 files changed

+14
-14
lines changed

7 files changed

+14
-14
lines changed

chapter1/2-beautifulSoup.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,5 @@
22
from bs4 import BeautifulSoup
33

44
html = urlopen("http://www.pythonscraping.com/exercises/exercise1.html")
5-
bsObj = BeautifulSoup(html.read());
6-
print(bsObj.h1)
5+
bsObj = BeautifulSoup(html.read())
6+
print(bsObj.h1)

chapter11/3-readWebImages.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
driver.get("http://www.amazon.com/War-Peace-Leo-Nikolayevich-Tolstoy/dp/1427030200")
99
time.sleep(2)
1010

11-
driver.find_element_by_id("sitbLogoImg").click()
11+
driver.find_element_by_id("img-canvas").click()
1212
#The easiest way to get exactly one of every page
1313
imageList = set()
1414

@@ -33,4 +33,4 @@
3333
p = subprocess.Popen(["tesseract", "page.jpg", "page"], stdout=subprocess.PIPE,stderr=subprocess.PIPE)
3434
p.wait()
3535
f = open("page.txt", "r")
36-
print(f.read())
36+
print(f.read())

chapter5/6-6DegreesCrawlWiki.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def insertLink(fromPageId, toPageId):
3636
def getLinks(pageUrl, recursionLevel):
3737
global pages
3838
if recursionLevel > 4:
39-
return;
39+
return
4040
pageId = insertPageIfNotExists(pageUrl)
4141
html = urlopen("http://en.wikipedia.org"+pageUrl)
4242
bsObj = BeautifulSoup(html)
@@ -51,4 +51,4 @@ def getLinks(pageUrl, recursionLevel):
5151
print("Skipping: "+str(link.attrs['href'])+" found on "+pageUrl)
5252
getLinks("/wiki/Kevin_Bacon", 0)
5353
cur.close()
54-
conn.close()
54+
conn.close()

chapter6/5-readPdf.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ def readPDF(pdfFile):
1818
retstr.close()
1919
return content
2020

21-
pdfFile = urlopen("http://pythonscraping.com/pages/warandpeace/chapter1.pdf");
21+
pdfFile = urlopen("http://pythonscraping.com/pages/warandpeace/chapter1.pdf")
2222
outputString = readPDF(pdfFile)
2323
print(outputString)
24-
pdfFile.close()
24+
pdfFile.close()

chapter6/readPdf.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ def readPDF(pdfFile):
1818
retstr.close()
1919
return content
2020

21-
pdfFile = urlopen("http://pythonscraping.com/pages/warandpeace/chapter1.pdf");
21+
pdfFile = urlopen("http://pythonscraping.com/pages/warandpeace/chapter1.pdf")
2222
outputString = readPDF(pdfFile)
2323
print(outputString)
24-
pdfFile.close()
24+
pdfFile.close()

chapter8/3-markovGenerator.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,14 @@ def retrieveRandomWord(wordList):
1717

1818
def buildWordDict(text):
1919
#Remove newlines and quotes
20-
text = text.replace("\n", " ");
21-
text = text.replace("\"", "");
20+
text = text.replace("\n", " ")
21+
text = text.replace("\"", "")
2222

2323
#Make sure punctuation is treated as its own "word," so it will be included
2424
#in the Markov chain
2525
punctuation = [',','.',';',':']
2626
for symbol in punctuation:
27-
text = text.replace(symbol, " "+symbol+" ");
27+
text = text.replace(symbol, " "+symbol+" ")
2828

2929
words = text.split(" ")
3030
#Filter out empty words

chapter8/4-6DegreesFinder.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def searchBreadth(targetPageId, currentPageId, depth, nodes):
4242
print(found)
4343
for node in found:
4444
print(getUrl(node))
45-
break;
45+
break
4646
else:
4747
print("No path found")
4848

0 commit comments

Comments (0)