|
| 1 | +# from urllib.request import urlopen |
| 2 | +# textPage = urlopen('http://www.pythonscraping.com/pages/warandpeace/chapter1.txt') |
| 3 | +# print(textPage.read()) |
| 4 | + |
| 5 | +from urllib.request import urlopen |
| 6 | + |
# Fetch the Russian-language chapter and print it as readable text.
# read() returns raw bytes, so they are decoded with the 'utf-8' codec the
# original call named. Passing 'utf-8' as a second argument to print() only
# printed the bytes repr followed by the literal string "utf-8".
# The with-block closes the HTTP response once the body has been read.
with urlopen("http://www.pythonscraping.com/pages/warandpeace/chapter1-ru.txt") as textPage:
    print(str(textPage.read(), 'utf-8'))
| 9 | + |
| 10 | +# read CSV online -- use csv.reader |
| 11 | +from urllib.request import urlopen |
| 12 | +from io import StringIO |
| 13 | +import csv |
| 14 | + |
# Download the CSV and parse it entirely in memory. csv.reader needs a text
# stream, so the downloaded bytes are decoded and wrapped in a StringIO.
# The with-block closes the HTTP response once the body has been read.
with urlopen("http://pythonscraping.com/files/MontyPythonAlbums.csv") as response:
    # errors='ignore' silently drops any byte that is not valid ASCII.
    data = response.read().decode('ascii', 'ignore')
dataFile = StringIO(data)
csvReader = csv.reader(dataFile)

# Every row with at least two fields is printed, including the first row of
# the file (csv.reader does not treat any row as a header).
for row in csvReader:
    if len(row) < 2:
        # Blank or short lines would raise IndexError on row[1]; skip them.
        continue
    print('The album "{}" was released in {}'.format(row[0], row[1]))
| 21 | + |
| 22 | +# read CSV online -- use csv.DictReader: each row is keyed by column name (slower than csv.reader) |
| 23 | +from urllib.request import urlopen |
| 24 | +from io import StringIO |
| 25 | +import csv |
| 26 | + |
# Download the CSV and parse it with csv.DictReader, which yields one
# mapping per row keyed by column name instead of a positional list.
# The with-block closes the HTTP response once the body has been read.
with urlopen("http://pythonscraping.com/files/MontyPythonAlbums.csv") as response:
    # errors='ignore' silently drops any byte that is not valid ASCII.
    data = response.read().decode('ascii', 'ignore')
dataFile = StringIO(data)
dictReader = csv.DictReader(dataFile)

# Show the column names. No fieldnames were passed, so DictReader takes them
# from the first row of the file. Printing the reader object itself only
# displayed its default object repr.
print(dictReader.fieldnames)

for row in dictReader:
    print(row)
| 34 | + |
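| 35 | +# read PDF -- bug: io.UnsupportedOperation: seek (presumably because the urlopen() response is not seekable; reading it into io.BytesIO first may fix this -- TODO confirm) |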
| 36 | +# from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter |
| 37 | +# from pdfminer.converter import TextConverter |
| 38 | +# from pdfminer.layout import LAParams |
| 39 | +# from pdfminer.pdfpage import PDFPage |
| 40 | +# from io import StringIO |
| 41 | +# from io import open |
| 42 | +# |
| 43 | +# def convert_pdf_to_txt(path, codec='utf-8'): |
| 44 | +# rsrcmgr = PDFResourceManager() |
| 45 | +# retstr = StringIO() |
| 46 | +# laparams = LAParams() |
| 47 | +# device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams) |
| 48 | +# fp = urlopen(path) |
| 49 | +# interpreter = PDFPageInterpreter(rsrcmgr, device) |
| 50 | +# password = "" |
| 51 | +# maxpages = 0 |
| 52 | +# caching = True |
| 53 | +# pagenos=set() |
| 54 | +# |
| 55 | +# for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True): |
| 56 | +# interpreter.process_page(page) |
| 57 | +# |
| 58 | +# text = retstr.getvalue() |
| 59 | +# |
| 60 | +# fp.close() |
| 61 | +# device.close() |
| 62 | +# retstr.close() |
| 63 | +# return text |
| 64 | +# |
| 65 | +# path = "http://pythonscraping.com/pages/warandpeace/chapter1.pdf" |
| 66 | +# #path = urlopen("http://pythonscraping.com/pages/warandpeace/chapter1.pdf").read() |
| 67 | +# outputString = convert_pdf_to_txt(path) |
0 commit comments