diff --git a/chapter5/1-getPageMedia.py b/chapter5/1-getPageMedia.py
index 0e5aaa5..635e5d8 100644
--- a/chapter5/1-getPageMedia.py
+++ b/chapter5/1-getPageMedia.py
@@ -39,5 +39,5 @@ def getDownloadPath(baseUrl, absoluteUrl, downloadDirectory):
     fileUrl = getAbsoluteURL(baseUrl, download["src"])
     if fileUrl is not None:
         print(fileUrl)
-
-urlretrieve(fileUrl, getDownloadPath(baseUrl, fileUrl, downloadDirectory))
\ No newline at end of file
+        fileUrl = fileUrl.split('?')[0]
+        urlretrieve(fileUrl, getDownloadPath(baseUrl, fileUrl, downloadDirectory))
\ No newline at end of file
diff --git a/chapter5/2-createCsv.py b/chapter5/2-createCsv.py
index 4fd02bb..bacc477 100644
--- a/chapter5/2-createCsv.py
+++ b/chapter5/2-createCsv.py
@@ -1,11 +1,13 @@
 import csv
-#from os import open
+import os
 
-csvFile = open("../files/test.csv", 'w+')
+filename = "../files/test.csv"
+os.makedirs(os.path.dirname(filename), exist_ok=True)
+csvFile = open(filename, 'w', newline='')
 try:
     writer = csv.writer(csvFile)
     writer.writerow(('number', 'number plus 2', 'number times 2'))
     for i in range(10):
-        writer.writerow( (i, i+2, i*2))
+        writer.writerow((i, i+2, i*2))
 finally:
     csvFile.close()
\ No newline at end of file
diff --git a/chapter5/3-scrapeCsv.py b/chapter5/3-scrapeCsv.py
index acb0b53..968e2ce 100644
--- a/chapter5/3-scrapeCsv.py
+++ b/chapter5/3-scrapeCsv.py
@@ -4,17 +4,17 @@
 
 html = urlopen("http://en.wikipedia.org/wiki/Comparison_of_text_editors")
 bsObj = BeautifulSoup(html)
-#The main comparison table is currently the first table on the page
-table = bsObj.findAll("table",{"class":"wikitable"})[0]
+# The main comparison table is currently the first table on the page
+table = bsObj.findAll("table", {"class": "wikitable"})[0]
 rows = table.findAll("tr")
 
-csvFile = open("files/editors.csv", 'wt')
+csvFile = open("../files/editors.csv", 'w', encoding='utf-8', newline='')
 writer = csv.writer(csvFile)
 try:
-	for row in rows:
-		csvRow = []
-		for cell in row.findAll(['td', 'th']):
-			csvRow.append(cell.get_text())
-		writer.writerow(csvRow)
+    for row in rows:
+        csvRow = []
+        for cell in row.findAll(['td', 'th']):
+            csvRow.append(cell.get_text())
+        writer.writerow(csvRow)
 finally:
-	csvFile.close()
+    csvFile.close()
\ No newline at end of file
diff --git a/chapter5/5-storeWikiLinks.py b/chapter5/5-storeWikiLinks.py
index 33cb77c..55440be 100644
--- a/chapter5/5-storeWikiLinks.py
+++ b/chapter5/5-storeWikiLinks.py
@@ -1,5 +1,6 @@
 from urllib.request import urlopen
 from bs4 import BeautifulSoup
+import re
 import datetime
 import random
 import pymysql
@@ -30,4 +31,4 @@ def getLinks(articleUrl):
         links = getLinks(newArticle)
     finally:
         cur.close()
-        conn.close()
\ No newline at end of file
+        conn.close()
diff --git a/files/test.csv b/files/test.csv
deleted file mode 100644
index a0efbe1..0000000
--- a/files/test.csv
+++ /dev/null
@@ -1,11 +0,0 @@
-number,number plus 2,number times 2
-0,2,0
-1,3,2
-2,4,4
-3,5,6
-4,6,8
-5,7,10
-6,8,12
-7,9,14
-8,10,16
-9,11,18