
Commit eda827e

Adding v1 back in
1 parent 8452914 commit eda827e

75 files changed

+1575
-0
lines changed


v1/chapter1/1-basicExample.py

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
from urllib.request import urlopen
# Retrieve the HTML string from the URL
html = urlopen("http://www.pythonscraping.com/exercises/exercise1.html")
print(html.read())
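Note that urlopen returns a file-like object whose read() yields bytes, so the print call above shows a bytes literal rather than plain text. A minimal sketch of decoding it to a string, assuming the page is UTF-8 encoded:

html = urlopen("http://www.pythonscraping.com/exercises/exercise1.html")
print(html.read().decode("utf-8"))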

v1/chapter1/2-beautifulSoup.py

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("http://www.pythonscraping.com/exercises/exercise1.html")
bsObj = BeautifulSoup(html, "html.parser")
print(bsObj.h1)
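The bsObj.h1 attribute access returns the first h1 tag found anywhere in the parsed document. For comparison, two equivalent lookups (not part of the original file) that BeautifulSoup resolves to the same tag on this page:

print(bsObj.html.body.h1)
print(bsObj.find("h1"))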

v1/chapter1/3-exceptionHandling.py

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup


def getTitle(url):
    try:
        html = urlopen(url)
    except HTTPError as e:
        print(e)
        return None
    try:
        bsObj = BeautifulSoup(html, "html.parser")
        title = bsObj.body.h1
    except AttributeError:
        return None
    return title

title = getTitle("http://www.pythonscraping.com/exercises/exercise1.html")
if title is None:
    print("Title could not be found")
else:
    print(title)
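HTTPError covers error status codes returned by the server, but a connection that fails outright (server down, mistyped domain) raises urllib.error.URLError instead and would escape the handler above. A sketch of hardening the first try block to catch both, offered as an assumption about a sensible extension rather than part of the original commit:

from urllib.error import HTTPError, URLError

try:
    html = urlopen(url)
except HTTPError as e:
    print("The server returned an HTTP error:", e)
    return None
except URLError as e:
    print("The server could not be reached:", e)
    return None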

v1/chapter10/1-seleniumBasic.py

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
from selenium import webdriver
import time

# REPLACE WITH YOUR DRIVER PATH. EXAMPLES FOR CHROME AND PHANTOMJS
driver = webdriver.PhantomJS(executable_path='../phantomjs-2.1.1-macosx/bin/phantomjs')
#driver = webdriver.Chrome(executable_path='../chromedriver/chromedriver')
driver.get("http://pythonscraping.com/pages/javascript/ajaxDemo.html")
# Give the page's Ajax call time to finish before reading the result
time.sleep(5)
print(driver.find_element_by_id("content").text)
driver.close()
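PhantomJS has been unmaintained for years, and the find_element_by_* helpers were removed in Selenium 4, so this v1 script only runs against old Selenium releases. A rough modern equivalent, assuming Selenium 4+ and a local Chrome install (Selenium Manager fetches the driver automatically):

import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

options = Options()
options.add_argument('--headless=new')  # run without a visible browser window
driver = webdriver.Chrome(options=options)
driver.get('http://pythonscraping.com/pages/javascript/ajaxDemo.html')
time.sleep(5)
print(driver.find_element(By.ID, 'content').text)
driver.close()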

v1/chapter10/2-waitForLoad.py

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# REPLACE WITH YOUR DRIVER PATH. EXAMPLES FOR CHROME AND PHANTOMJS
driver = webdriver.PhantomJS(executable_path='../phantomjs-2.1.1-macosx/bin/phantomjs')
#driver = webdriver.Chrome(executable_path='../chromedriver/chromedriver')
driver.get("http://pythonscraping.com/pages/javascript/ajaxDemo.html")
try:
    # Block for up to 10 seconds until the Ajax-loaded button appears
    element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "loadedButton")))
finally:
    print(driver.find_element_by_id("content").text)
    driver.close()
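presence_of_element_located is only one of Selenium's expected conditions. As a variation not in the original file, you could wait directly on the Ajax-replaced text instead of the marker button, using the documented text_to_be_present_in_element condition; the substring below is a guess at the loaded copy and would need to match the actual page:

element = WebDriverWait(driver, 10).until(
    EC.text_to_be_present_in_element((By.ID, "content"), "important text"))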
Lines changed: 25 additions & 0 deletions
Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
from selenium import webdriver
import time
from selenium.common.exceptions import StaleElementReferenceException

def waitForLoad(driver):
    elem = driver.find_element_by_tag_name("html")
    count = 0
    while True:
        count += 1
        if count > 20:
            print("Timing out after 10 seconds and returning")
            return
        time.sleep(.5)
        try:
            # Comparing against a stale handle raises StaleElementReferenceException,
            # which signals that the old page has been replaced
            elem == driver.find_element_by_tag_name("html")
        except StaleElementReferenceException:
            return

# REPLACE WITH YOUR DRIVER PATH. EXAMPLES FOR CHROME AND PHANTOMJS
driver = webdriver.PhantomJS(executable_path='../phantomjs-2.1.1-macosx/bin/phantomjs')
#driver = webdriver.Chrome(executable_path='../chromedriver/chromedriver')
driver.get("http://pythonscraping.com/pages/javascript/redirectDemo1.html")
waitForLoad(driver)
print(driver.page_source)
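Selenium ships an expected condition that implements this same stale-handle trick, so the polling loop can be collapsed. A sketch under the same driver setup, using the documented EC.staleness_of condition:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

def waitForLoad(driver, timeout=10):
    elem = driver.find_element_by_tag_name("html")
    try:
        # Resolves as soon as the old <html> handle goes stale (a new page loaded)
        WebDriverWait(driver, timeout).until(EC.staleness_of(elem))
    except TimeoutException:
        print("Timing out after {} seconds and returning".format(timeout))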

v1/chapter11/1-basicImage.py

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
from PIL import Image, ImageFilter

kitten = Image.open("kitten.jpg")
blurryKitten = kitten.filter(ImageFilter.GaussianBlur)
blurryKitten.save("kitten_blurred.jpg")
blurryKitten.show()
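Passing the bare ImageFilter.GaussianBlur class applies Pillow's default radius of 2. The filter also accepts an explicit radius, so a heavier blur would look like:

blurrier = kitten.filter(ImageFilter.GaussianBlur(radius=10))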

v1/chapter11/2-cleanImage.py

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
from PIL import Image
import subprocess

def cleanFile(filePath, newFilePath):
    image = Image.open(filePath)

    # Set a threshold value for the image, and save
    image = image.point(lambda x: 0 if x < 143 else 255)
    image.save(newFilePath)

    # Call tesseract to do OCR on the newly created image
    subprocess.call(["tesseract", newFilePath, "output"])

    # Open and read the resulting data file
    outputFile = open("output.txt", 'r')
    print(outputFile.read())
    outputFile.close()

cleanFile("text_2.png", "text_2_clean.png")
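If you would rather avoid shelling out to the tesseract binary and reading output.txt back in, the third-party pytesseract wrapper drives the same engine in-process. A minimal sketch, assuming pytesseract is installed alongside the Tesseract binary:

from PIL import Image
import pytesseract

image = Image.open("text_2_clean.png")
print(pytesseract.image_to_string(image))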

v1/chapter11/3-readWebImages.py

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
import time
from urllib.request import urlretrieve
import subprocess
from selenium import webdriver

#driver = webdriver.PhantomJS(executable_path='/Users/ryan/Documents/pythonscraping/code/headless/phantomjs-1.9.8-macosx/bin/phantomjs')
driver = webdriver.Firefox()
driver.get("http://www.amazon.com/War-Peace-Leo-Nikolayevich-Tolstoy/dp/1427030200")
time.sleep(2)

driver.find_element_by_id("img-canvas").click()
# The easiest way to get exactly one of every page
imageList = set()

# Wait for the page to load
time.sleep(10)
print(driver.find_element_by_id("sitbReaderRightPageTurner").get_attribute("style"))
while "pointer" in driver.find_element_by_id("sitbReaderRightPageTurner").get_attribute("style"):
    # While we can click on the right arrow, move through the pages
    driver.find_element_by_id("sitbReaderRightPageTurner").click()
    time.sleep(2)
    # Get any new pages that have loaded (multiple pages can load at once)
    pages = driver.find_elements_by_xpath("//div[@class='pageImage']/div/img")
    for page in pages:
        image = page.get_attribute("src")
        imageList.add(image)

driver.quit()

# Start processing the images we've collected URLs for with Tesseract
for image in sorted(imageList):
    urlretrieve(image, "page.jpg")
    p = subprocess.Popen(["tesseract", "page.jpg", "page"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    p.wait()
    f = open("page.txt", "r")
    print(f.read())
    f.close()
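Popen followed by wait() predates the subprocess.run helper; on Python 3.5+ the OCR step can be a single call. A sketch of that substitution only, leaving the rest of the loop unchanged:

result = subprocess.run(["tesseract", "page.jpg", "page"],
                        stdout=subprocess.PIPE, stderr=subprocess.PIPE)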

v1/chapter11/4-solveCaptcha.py

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup
import subprocess
import requests
from PIL import Image
from PIL import ImageOps

def cleanImage(imagePath):
    image = Image.open(imagePath)
    image = image.point(lambda x: 0 if x < 143 else 255)
    borderImage = ImageOps.expand(image, border=20, fill='white')
    borderImage.save(imagePath)

html = urlopen("http://www.pythonscraping.com/humans-only")
bsObj = BeautifulSoup(html, "html.parser")
# Gather prepopulated form values
imageLocation = bsObj.find("img", {"title": "Image CAPTCHA"})["src"]
formBuildId = bsObj.find("input", {"name": "form_build_id"})["value"]
captchaSid = bsObj.find("input", {"name": "captcha_sid"})["value"]
captchaToken = bsObj.find("input", {"name": "captcha_token"})["value"]

captchaUrl = "http://pythonscraping.com" + imageLocation
urlretrieve(captchaUrl, "captcha.jpg")
cleanImage("captcha.jpg")
p = subprocess.Popen(["tesseract", "captcha.jpg", "captcha"],
                     stdout=subprocess.PIPE, stderr=subprocess.PIPE)
p.wait()
f = open("captcha.txt", "r")

# Clean any whitespace characters
captchaResponse = f.read().replace(" ", "").replace("\n", "")
f.close()
print("Captcha solution attempt: " + captchaResponse)

if len(captchaResponse) == 5:
    params = {"captcha_token": captchaToken, "captcha_sid": captchaSid,
              "form_id": "comment_node_page_form", "form_build_id": formBuildId,
              "captcha_response": captchaResponse, "name": "Ryan Mitchell",
              "subject": "I come to seek the Grail",
              "comment_body[und][0][value]":
                  "...and I am definitely not a bot"}
    r = requests.post("http://www.pythonscraping.com/comment/reply/10",
                      data=params)
    responseObj = BeautifulSoup(r.text, "html.parser")
    if responseObj.find("div", {"class": "messages"}) is not None:
        print(responseObj.find("div", {"class": "messages"}).get_text())
else:
    print("There was a problem reading the CAPTCHA correctly!")
