
Commit 9bd45ba

Commit message: update 20160428
1 parent 3b5d845

4 files changed: 87 additions, 0 deletions


ANA flight 2.py

Lines changed: 1 addition & 0 deletions
@@ -75,6 +75,7 @@
         print("Expedia found: " + str(len(p.index)) + " lines")
         flight_database = flight_database.append(p)
         # print(flight_database)
+        # driver.delete_all_cookies()
         driver.close()
     except:
         continue
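
A note on the flight_database.append(p) call in the context lines above: pandas deprecated DataFrame.append in 1.4 and removed it in 2.0, so on current pandas the usual pattern is to collect the per-page frames and concatenate once after the loop. A minimal sketch, assuming each p is a DataFrame of rows scraped on one iteration:

import pandas as pd

frames = []  # collect one DataFrame per scraped results page
# inside the scraping loop, instead of flight_database.append(p):
#     frames.append(p)
# after the loop, build the combined table in a single call:
flight_database = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()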

Chapter 11.py

Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
# image processing
import os

os.chdir('C:/Users/jchen5/python/Web Scraping with Python/image')
from PIL import Image, ImageFilter

htt = Image.open('htt.jpg')
htt.show()
blurryhtt = htt.filter(ImageFilter.GaussianBlur)  # Pillow accepts the filter class and instantiates it with its default radius
blurryhtt.save('htt_blurred.jpg')
blurryhtt.show()
#
# #Scraping Text from Images on Websites -- bug
# import time
# from urllib.request import urlopen, urlretrieve
# import subprocess
# from selenium import webdriver
# import os
# os.chdir('C:/Users/jchen5/python/Web Scraping with Python/image')
# path = 'C:/Users/jchen5/Downloads/phantomjs-2.1.1-windows/bin/phantomjs.exe'
# driver = webdriver.PhantomJS(executable_path=path)
# driver.get("http://www.amazon.com/War-Peace-Leo-Nikolayevich-Tolstoy/dp/1427030200")
# time.sleep(2)
# driver.find_element_by_id('sitbLogoImg').click()
# imageList = set()
# time.sleep(5)
#
# #While the right arrow is available for clicking, turn through pages
# while "pointer" in driver.find_element_by_id('sitbReaderRightPageTurner').get_attribute('style'):
#     driver.find_element_by_id('sitbReaderRightPageTurner').click()
#     time.sleep(2)
#     pages = driver.find_elements_by_xpath("//div[@class='pageImage']/div/img")
#     for page in pages:
#         image = page.get_attribute('src')
#         imageList.add(image)
#
# driver.quit()
#
# #Start processing the images we've collected URLs for with Tesseract
# for image in sorted(imageList):
#     urlretrieve(image, 'page.jpg')
#     p = subprocess.Popen(['tesseract', 'page.jpg', 'page'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
#     p.wait()
#     f = open('page.txt', 'r')
#     print(f.read())


import requests
from bs4 import BeautifulSoup

session = requests.Session()
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit 537.36 (KHTML, like Gecko) Chrome",
           "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"}
url = "https://www.whatismybrowser.com/developers/what-http-headers-is-my-browser-sending"
req = session.get(url, headers=headers)
bsObj = BeautifulSoup(req.text, 'html.parser')  # name the parser explicitly to avoid bs4's guessed-parser warning
print(bsObj.find("table", {"class": "table-striped"}).get_text())  # get_text is a method: call it rather than printing the bound method
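
The commented-out Tesseract loop above shells out to the tesseract binary through subprocess, and the author has flagged it "-- bug". An alternative worth noting is the pytesseract wrapper, which drives the same OCR engine from Python. A minimal sketch, assuming pytesseract is installed (pip install pytesseract) alongside a local Tesseract binary; it is not used anywhere in this repo, and 'page.jpg' stands in for whichever image the scraper has downloaded:

from PIL import Image
import pytesseract

# OCR a local image file and return the recognized text as a string
text = pytesseract.image_to_string(Image.open('page.jpg'))
print(text)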

Chapter 12.py

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
from selenium import webdriver

path = 'C:/Users/jchen5/Downloads/phantomjs-2.1.1-windows/bin/phantomjs.exe'
driver = webdriver.PhantomJS(executable_path=path)
driver.get("http://pythonscraping.com/")
driver.implicitly_wait(1)
print(driver.get_cookies())

saveCookies = driver.get_cookies()

# Replay the first session's cookies in a fresh browser; the site must be
# loaded first, because add_cookie only accepts cookies for the current domain.
driver2 = webdriver.PhantomJS(executable_path=path)
driver2.get("http://pythonscraping.com/")
driver2.delete_all_cookies()
for cookie in saveCookies:
    driver2.add_cookie(cookie)

driver2.get("http://pythonscraping.com/")
driver2.implicitly_wait(1)
print(driver2.get_cookies())
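
Chapter 12.py hands cookies between two live drivers; to carry them across separate runs, the same cookie dicts can be written to disk. A minimal sketch, assuming the dicts returned by get_cookies() are JSON-serializable (they normally are, since the values are strings, numbers, and booleans):

import json

# save the first session's cookies
with open('cookies.json', 'w') as f:
    json.dump(driver.get_cookies(), f)

# restore them in a later run; the domain must already be loaded,
# because add_cookie only accepts cookies for the current domain
driver2.get("http://pythonscraping.com/")
with open('cookies.json') as f:
    for cookie in json.load(f):
        driver2.add_cookie(cookie)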

VISA.py

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('https://egov.uscis.gov/casestatus/mycasestatus.do?appReceiptNum=EAC1614153626')
soup = BeautifulSoup(html, 'html.parser')  # name the parser explicitly to avoid bs4's guessed-parser warning

status = soup.find('div', {'class': 'rows text-center'}).find('h1')
# text = soup.find('div', {'class': 'rows text-center'}).find('p')
print(status.text)
# print(text.text)
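
One fragility in VISA.py: soup.find returns None when USCIS changes its markup or serves an error page, and the chained .find('h1') then raises AttributeError. A minimal defensive sketch; status_div and the fallback message are illustrative, not part of the original script:

status_div = soup.find('div', {'class': 'rows text-center'})
if status_div is not None and status_div.find('h1') is not None:
    print(status_div.find('h1').text)  # case status headline
else:
    print('Status block not found; the page layout may have changed.')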
