
Commit eda827e

Adding v1 back in
1 parent 8452914 commit eda827e

75 files changed

+1575
-0
lines changed


v1/chapter1/1-basicExample.py

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
from urllib.request import urlopen
# Retrieve the HTML string from the URL
html = urlopen("http://www.pythonscraping.com/exercises/exercise1.html")
print(html.read())
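Note that urlopen returns a file-like object whose read() yields bytes, so the print call above shows a bytes literal rather than plain text. A minimal sketch of decoding it to a string, assuming the page is UTF-8 encoded:

html = urlopen("http://www.pythonscraping.com/exercises/exercise1.html")
print(html.read().decode("utf-8"))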

v1/chapter1/2-beautifulSoup.py

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("http://www.pythonscraping.com/exercises/exercise1.html")
bsObj = BeautifulSoup(html, "html.parser")
print(bsObj.h1)
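The bsObj.h1 attribute access returns the first h1 tag found anywhere in the parsed document. For comparison, two equivalent lookups (not part of the original file) that BeautifulSoup resolves to the same tag on this page:

print(bsObj.html.body.h1)
print(bsObj.find("h1"))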

v1/chapter1/3-exceptionHandling.py

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup


def getTitle(url):
    try:
        html = urlopen(url)
    except HTTPError as e:
        print(e)
        return None
    try:
        bsObj = BeautifulSoup(html, "html.parser")
        title = bsObj.body.h1
    except AttributeError:
        return None
    return title

title = getTitle("http://www.pythonscraping.com/exercises/exercise1.html")
if title is None:
    print("Title could not be found")
else:
    print(title)
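HTTPError covers error status codes returned by the server, but a connection that fails outright (server down, mistyped domain) raises urllib.error.URLError instead and would escape the handler above. A sketch of hardening the first try block to catch both, offered as an assumption about a sensible extension rather than part of the original commit:

from urllib.error import HTTPError, URLError

try:
    html = urlopen(url)
except HTTPError as e:
    print("The server returned an HTTP error:", e)
    return None
except URLError as e:
    print("The server could not be reached:", e)
    return None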

v1/chapter10/1-seleniumBasic.py

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
from selenium import webdriver
import time

# REPLACE WITH YOUR DRIVER PATH. EXAMPLES FOR CHROME AND PHANTOMJS
driver = webdriver.PhantomJS(executable_path='../phantomjs-2.1.1-macosx/bin/phantomjs')
#driver = webdriver.Chrome(executable_path='../chromedriver/chromedriver')
driver.get("http://pythonscraping.com/pages/javascript/ajaxDemo.html")
# Give the page's Ajax call time to finish before reading the result
time.sleep(5)
print(driver.find_element_by_id("content").text)
driver.close()
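PhantomJS has been unmaintained for years, and the find_element_by_* helpers were removed in Selenium 4, so this v1 script only runs against old Selenium releases. A rough modern equivalent, assuming Selenium 4+ and a local Chrome install (Selenium Manager fetches the driver automatically):

import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

options = Options()
options.add_argument('--headless=new')  # run without a visible browser window
driver = webdriver.Chrome(options=options)
driver.get('http://pythonscraping.com/pages/javascript/ajaxDemo.html')
time.sleep(5)
print(driver.find_element(By.ID, 'content').text)
driver.close()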

v1/chapter10/2-waitForLoad.py

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# REPLACE WITH YOUR DRIVER PATH. EXAMPLES FOR CHROME AND PHANTOMJS
driver = webdriver.PhantomJS(executable_path='../phantomjs-2.1.1-macosx/bin/phantomjs')
#driver = webdriver.Chrome(executable_path='../chromedriver/chromedriver')
driver.get("http://pythonscraping.com/pages/javascript/ajaxDemo.html")
try:
    # Block for up to 10 seconds until the Ajax-loaded button appears
    element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "loadedButton")))
finally:
    print(driver.find_element_by_id("content").text)
    driver.close()
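presence_of_element_located is only one of Selenium's expected conditions. As a variation not in the original file, you could wait directly on the Ajax-replaced text instead of the marker button, using the documented text_to_be_present_in_element condition; the substring below is a guess at the loaded copy and would need to match the actual page:

element = WebDriverWait(driver, 10).until(
    EC.text_to_be_present_in_element((By.ID, "content"), "important text"))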
Lines changed: 25 additions & 0 deletions
Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
from selenium import webdriver
import time
from selenium.common.exceptions import StaleElementReferenceException

def waitForLoad(driver):
    elem = driver.find_element_by_tag_name("html")
    count = 0
    while True:
        count += 1
        if count > 20:
            print("Timing out after 10 seconds and returning")
            return
        time.sleep(.5)
        try:
            # Comparing against a stale handle raises StaleElementReferenceException,
            # which signals that the old page has been replaced
            elem == driver.find_element_by_tag_name("html")
        except StaleElementReferenceException:
            return

# REPLACE WITH YOUR DRIVER PATH. EXAMPLES FOR CHROME AND PHANTOMJS
driver = webdriver.PhantomJS(executable_path='../phantomjs-2.1.1-macosx/bin/phantomjs')
#driver = webdriver.Chrome(executable_path='../chromedriver/chromedriver')
driver.get("http://pythonscraping.com/pages/javascript/redirectDemo1.html")
waitForLoad(driver)
print(driver.page_source)
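Selenium ships an expected condition that implements this same stale-handle trick, so the polling loop can be collapsed. A sketch under the same driver setup, using the documented EC.staleness_of condition:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

def waitForLoad(driver, timeout=10):
    elem = driver.find_element_by_tag_name("html")
    try:
        # Resolves as soon as the old <html> handle goes stale (a new page loaded)
        WebDriverWait(driver, timeout).until(EC.staleness_of(elem))
    except TimeoutException:
        print("Timing out after {} seconds and returning".format(timeout))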

v1/chapter11/1-basicImage.py

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
from PIL import Image, ImageFilter

kitten = Image.open("kitten.jpg")
blurryKitten = kitten.filter(ImageFilter.GaussianBlur)
blurryKitten.save("kitten_blurred.jpg")
blurryKitten.show()
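Passing the bare ImageFilter.GaussianBlur class applies Pillow's default radius of 2. The filter also accepts an explicit radius, so a heavier blur would look like:

blurrier = kitten.filter(ImageFilter.GaussianBlur(radius=10))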

v1/chapter11/2-cleanImage.py

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
from PIL import Image
import subprocess

def cleanFile(filePath, newFilePath):
    image = Image.open(filePath)

    # Set a threshold value for the image, and save
    image = image.point(lambda x: 0 if x < 143 else 255)
    image.save(newFilePath)

    # Call tesseract to do OCR on the newly created image
    subprocess.call(["tesseract", newFilePath, "output"])

    # Open and read the resulting data file
    outputFile = open("output.txt", 'r')
    print(outputFile.read())
    outputFile.close()

cleanFile("text_2.png", "text_2_clean.png")
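If you would rather avoid shelling out to the tesseract binary and reading output.txt back in, the third-party pytesseract wrapper drives the same engine in-process. A minimal sketch, assuming pytesseract is installed alongside the Tesseract binary:

from PIL import Image
import pytesseract

image = Image.open("text_2_clean.png")
print(pytesseract.image_to_string(image))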

v1/chapter11/3-readWebImages.py

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
import time
from urllib.request import urlretrieve
import subprocess
from selenium import webdriver

#driver = webdriver.PhantomJS(executable_path='/Users/ryan/Documents/pythonscraping/code/headless/phantomjs-1.9.8-macosx/bin/phantomjs')
driver = webdriver.Firefox()
driver.get("http://www.amazon.com/War-Peace-Leo-Nikolayevich-Tolstoy/dp/1427030200")
time.sleep(2)

driver.find_element_by_id("img-canvas").click()
# The easiest way to get exactly one of every page
imageList = set()

# Wait for the page to load
time.sleep(10)
print(driver.find_element_by_id("sitbReaderRightPageTurner").get_attribute("style"))
while "pointer" in driver.find_element_by_id("sitbReaderRightPageTurner").get_attribute("style"):
    # While we can click on the right arrow, move through the pages
    driver.find_element_by_id("sitbReaderRightPageTurner").click()
    time.sleep(2)
    # Get any new pages that have loaded (multiple pages can load at once)
    pages = driver.find_elements_by_xpath("//div[@class='pageImage']/div/img")
    for page in pages:
        image = page.get_attribute("src")
        imageList.add(image)

driver.quit()

# Start processing the images we've collected URLs for with Tesseract
for image in sorted(imageList):
    urlretrieve(image, "page.jpg")
    p = subprocess.Popen(["tesseract", "page.jpg", "page"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    p.wait()
    f = open("page.txt", "r")
    print(f.read())
    f.close()
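Popen followed by wait() predates the subprocess.run helper; on Python 3.5+ the OCR step can be a single call. A sketch of that substitution only, leaving the rest of the loop unchanged:

result = subprocess.run(["tesseract", "page.jpg", "page"],
                        stdout=subprocess.PIPE, stderr=subprocess.PIPE)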

v1/chapter11/4-solveCaptcha.py

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup
import subprocess
import requests
from PIL import Image
from PIL import ImageOps

def cleanImage(imagePath):
    image = Image.open(imagePath)
    image = image.point(lambda x: 0 if x < 143 else 255)
    borderImage = ImageOps.expand(image, border=20, fill='white')
    borderImage.save(imagePath)

html = urlopen("http://www.pythonscraping.com/humans-only")
bsObj = BeautifulSoup(html, "html.parser")
# Gather prepopulated form values
imageLocation = bsObj.find("img", {"title": "Image CAPTCHA"})["src"]
formBuildId = bsObj.find("input", {"name": "form_build_id"})["value"]
captchaSid = bsObj.find("input", {"name": "captcha_sid"})["value"]
captchaToken = bsObj.find("input", {"name": "captcha_token"})["value"]

captchaUrl = "http://pythonscraping.com" + imageLocation
urlretrieve(captchaUrl, "captcha.jpg")
cleanImage("captcha.jpg")
p = subprocess.Popen(["tesseract", "captcha.jpg", "captcha"],
                     stdout=subprocess.PIPE, stderr=subprocess.PIPE)
p.wait()
f = open("captcha.txt", "r")

# Clean any whitespace characters
captchaResponse = f.read().replace(" ", "").replace("\n", "")
f.close()
print("Captcha solution attempt: " + captchaResponse)

if len(captchaResponse) == 5:
    params = {"captcha_token": captchaToken, "captcha_sid": captchaSid,
              "form_id": "comment_node_page_form", "form_build_id": formBuildId,
              "captcha_response": captchaResponse, "name": "Ryan Mitchell",
              "subject": "I come to seek the Grail",
              "comment_body[und][0][value]":
                  "...and I am definitely not a bot"}
    r = requests.post("http://www.pythonscraping.com/comment/reply/10",
                      data=params)
    responseObj = BeautifulSoup(r.text, "html.parser")
    if responseObj.find("div", {"class": "messages"}) is not None:
        print(responseObj.find("div", {"class": "messages"}).get_text())
else:
    print("There was a problem reading the CAPTCHA correctly!")
