Skip to content

Commit 5d3e45f

Browse files
committed
update 20160427
1 parent 8fda089 commit 5d3e45f

File tree

3 files changed

+91
-0
lines changed

3 files changed

+91
-0
lines changed

Chapter 5.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
# imageLocation = soup.find("a",{"id":"logo"}).find("img")["src"]
88
# urlretrieve(imageLocation,"logo.jpg")
99

10+
11+
# download image
1012
import os
1113
from urllib.request import urlretrieve
1214
from urllib.request import urlopen
@@ -51,3 +53,25 @@ def getDownloadPath(baseUrl, absoluteUrl, downloadDirectory):
5153
print(fileUrl)
5254

5355
urlretrieve(fileUrl, getDownloadPath(baseUrl, fileUrl, downloadDirectory))
56+
57+
# store data to csv
# Scrape the first 'wikitable' from the Wikipedia text-editor comparison page
# and dump every row (header and data cells) into a local CSV file.
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('https://en.wikipedia.org/wiki/Comparison_of_text_editors')
# Name the parser explicitly: BeautifulSoup(html) picks whichever parser is
# installed, which is nondeterministic across machines and emits a warning.
soup = BeautifulSoup(html, 'html.parser')
table = soup.findAll('table', {'class': 'wikitable'})[0]
rows = table.findAll('tr')

# newline='' is required by the csv module when writing; without it every
# row is followed by a blank line on Windows (the original used 'wt').
# NOTE(review): path is hard-coded to one machine — consider parameterizing.
with open('C:/Users/jchen5/python/Web Scraping with Python/editors.csv',
          'w', newline='') as csvFile:
    writer = csv.writer(csvFile)
    for row in rows:
        # Both <th> (header) and <td> (data) cells go into the same row.
        csvRow = [cell.get_text() for cell in row.findAll(['td', 'th'])]
        writer.writerow(csvRow)

Chapter 6.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# from urllib.request import urlopen
# textPage = urlopen('http://www.pythonscraping.com/pages/warandpeace/chapter1.txt')
# print(textPage.read())

# Fetch a Cyrillic text page and print it decoded as UTF-8.
from urllib.request import urlopen

textPage = urlopen("http://www.pythonscraping.com/pages/warandpeace/chapter1-ru.txt")
# BUG FIX: the original `print(textPage.read(), 'utf-8')` printed the raw
# bytes object followed by the literal string 'utf-8' — it never decoded.
# str(bytes, 'utf-8') actually decodes the response body before printing.
print(str(textPage.read(), 'utf-8'))
9+
10+
# read CSV online -- use csv.reader
# Download a CSV over HTTP, wrap the decoded text in an in-memory file
# object, and iterate it row by row with csv.reader.
from urllib.request import urlopen
from io import StringIO
import csv

raw_text = (
    urlopen("http://pythonscraping.com/files/MontyPythonAlbums.csv")
    .read()
    .decode('ascii', 'ignore')
)
# csv.reader needs a file-like object, so feed it the string via StringIO.
reader = csv.reader(StringIO(raw_text))

for record in reader:
    print("The album \"" + record[0] + "\" was released in " + str(record[1]))
21+
22+
# read CSV online -- use csv.DictReader: handle variable name (take longer time)
# Same download as above, but csv.DictReader maps each row to the header
# names instead of positional indices.
from urllib.request import urlopen
from io import StringIO
import csv

csv_text = (
    urlopen("http://pythonscraping.com/files/MontyPythonAlbums.csv")
    .read()
    .decode('ascii', 'ignore')
)
buffer = StringIO(csv_text)
dict_rows = csv.DictReader(buffer)
# Printing the reader object itself shows only its repr, not the rows.
print(dict_rows)

for record in dict_rows:
    print(record)
34+
35+
# read PDF -- bug: io.UnsupportedOperation: seek
36+
# from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
37+
# from pdfminer.converter import TextConverter
38+
# from pdfminer.layout import LAParams
39+
# from pdfminer.pdfpage import PDFPage
40+
# from io import StringIO
41+
# from io import open
42+
#
43+
# def convert_pdf_to_txt(path, codec='utf-8'):
44+
# rsrcmgr = PDFResourceManager()
45+
# retstr = StringIO()
46+
# laparams = LAParams()
47+
# device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
48+
# fp = urlopen(path)
49+
# interpreter = PDFPageInterpreter(rsrcmgr, device)
50+
# password = ""
51+
# maxpages = 0
52+
# caching = True
53+
# pagenos=set()
54+
#
55+
# for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
56+
# interpreter.process_page(page)
57+
#
58+
# text = retstr.getvalue()
59+
#
60+
# fp.close()
61+
# device.close()
62+
# retstr.close()
63+
# return text
64+
#
65+
# path = "http://pythonscraping.com/pages/warandpeace/chapter1.pdf"
66+
# #path = urlopen("http://pythonscraping.com/pages/warandpeace/chapter1.pdf").read()
67+
# outputString = convert_pdf_to_txt(path)

Chapter 7.py

Whitespace-only changes.

0 commit comments

Comments
 (0)