# Image processing with Pillow
import os
from PIL import Image, ImageFilter

os.chdir('C:/Users/jchen5/python/Web Scraping with Python/image')

htt = Image.open('htt.jpg')
htt.show()
# filter() accepts the GaussianBlur filter class directly; it is applied
# with its default radius of 2 pixels
blurryhtt = htt.filter(ImageFilter.GaussianBlur)
blurryhtt.save('htt_blurred.jpg')
blurryhtt.show()
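
# A minimal OCR-preprocessing sketch along the same lines: convert to
# grayscale and apply a hard threshold so faint backgrounds drop out before
# an image is handed to Tesseract. The 143 cutoff and the 'htt_clean.jpg'
# filename are illustrative assumptions, not part of the original script.
gray = htt.convert('L')                              # 8-bit grayscale
clean = gray.point(lambda x: 0 if x < 143 else 255)  # hard black/white threshold
clean.save('htt_clean.jpg')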

# Scraping text from images on websites -- disabled ("bug"): this relies on
# PhantomJS, which is no longer maintained, and on the find_element_by_*
# helpers that current Selenium releases have removed, so the block stays
# commented out as written.
# import time
# from urllib.request import urlretrieve
# import subprocess
# import os
# from selenium import webdriver
#
# os.chdir('C:/Users/jchen5/python/Web Scraping with Python/image')
# path = 'C:/Users/jchen5/Downloads/phantomjs-2.1.1-windows/bin/phantomjs.exe'
# driver = webdriver.PhantomJS(executable_path=path)
# driver.get('http://www.amazon.com/War-Peace-Leo-Nikolayevich-Tolstoy/dp/1427030200')
# time.sleep(2)
# # Open Amazon's "Look Inside" reader
# driver.find_element_by_id('sitbLogoImg').click()
# imageList = set()
# time.sleep(5)
#
# # While the right arrow is still clickable, keep turning pages and collect
# # the URL of every page image
# while 'pointer' in driver.find_element_by_id('sitbReaderRightPageTurner').get_attribute('style'):
#     driver.find_element_by_id('sitbReaderRightPageTurner').click()
#     time.sleep(2)
#     pages = driver.find_elements_by_xpath("//div[@class='pageImage']/div/img")
#     for page in pages:
#         imageList.add(page.get_attribute('src'))
#
# driver.quit()
#
# # Run Tesseract on each collected page image and print the extracted text
# for image in sorted(imageList):
#     urlretrieve(image, 'page.jpg')
#     p = subprocess.Popen(['tesseract', 'page.jpg', 'page'],
#                          stdout=subprocess.PIPE, stderr=subprocess.PIPE)
#     p.wait()
#     with open('page.txt', 'r') as f:
#         print(f.read())
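#
# # A minimal sketch of the per-page OCR step using the pytesseract wrapper
# # instead of shelling out to the tesseract binary. Kept commented out like
# # the block above, since it assumes imageList was populated by the Selenium
# # loop and that pytesseract plus the Tesseract engine are installed.
# import pytesseract
# from PIL import Image
#
# for image in sorted(imageList):
#     urlretrieve(image, 'page.jpg')
#     print(pytesseract.image_to_string(Image.open('page.jpg')))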


import requests
from bs4 import BeautifulSoup

# Send browser-like headers so the test page reports back what it received
session = requests.Session()
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit 537.36 (KHTML, like Gecko) Chrome",
           "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"}
url = "https://www.whatismybrowser.com/developers/what-http-headers-is-my-browser-sending"
req = session.get(url, headers=headers)
bsObj = BeautifulSoup(req.text, 'html.parser')  # explicit parser avoids bs4's warning
# get_text() must be called -- without the parentheses this printed the bound method
print(bsObj.find("table", {"class": "table-striped"}).get_text())
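
# A quick sanity-check sketch (not in the original script): requests exposes
# the PreparedRequest it actually sent on the response object, so we can
# confirm which headers went over the wire alongside the page's own report.
for name, value in req.request.headers.items():
    print(name + ': ' + value)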