
Commit e8163f5

Deleted old chap2 notebook, code formatting
1 parent 89a1998 commit e8163f5

11 files changed: +95 -83 lines changed
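
The formatting edits in this commit appear to follow PEP 8 spacing: a space after the colon in dict literals, spaces around operators, two blank lines before top-level definitions, and long calls wrapped across lines. As a rough, hypothetical illustration of the post-commit style (not copied from any single notebook cell; the function name and the warandpeace.html demo URL are assumptions, not part of this diff), a small standalone cell would now look like:

    from urllib.request import urlopen
    from bs4 import BeautifulSoup


    def getGreenNames(url):
        # Fetch the page and collect the text of every <span class="green"> tag
        html = urlopen(url)
        bs = BeautifulSoup(html, 'html.parser')
        nameList = bs.findAll('span', {'class': 'green'})
        return [name.get_text() for name in nameList]


    # Demo URL assumed from the book's Chapter 2 examples
    names = getGreenNames('http://www.pythonscraping.com/pages/warandpeace.html')
    for name in names:
        print(name)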

v2/Chapter01_BeginningToScrape.ipynb

Lines changed: 2 additions & 0 deletions
@@ -88,6 +88,7 @@
 "from urllib.error import HTTPError\n",
 "from bs4 import BeautifulSoup\n",
 "\n",
+"\n",
 "def getTitle(url):\n",
 "    try:\n",
 "        html = urlopen(url)\n",
@@ -100,6 +101,7 @@
 "        return None\n",
 "    return title\n",
 "\n",
+"\n",
 "title = getTitle(\"http://www.pythonscraping.com/pages/page1.html\")\n",
 "if title == None:\n",
 "    print(\"Title could not be found\")\n",

v2/Chapter02-AdvancedHTMLParsing.ipynb

Lines changed: 1 addition & 1 deletion
@@ -82,7 +82,7 @@
 }
 ],
 "source": [
-"nameList = bs.findAll('span', {'class':'green'})\n",
+"nameList = bs.findAll('span', {'class': 'green'})\n",
 "for name in nameList:\n",
 "    print(name.get_text())"
 ]

v2/Chapter03-web-crawlers.ipynb

Lines changed: 5 additions & 2 deletions
@@ -1766,13 +1766,15 @@
 }
 ],
 "source": [
-"#Collects a list of all external URLs found on the site\n",
+"# Collects a list of all external URLs found on the site\n",
 "allExtLinks = set()\n",
 "allIntLinks = set()\n",
 "\n",
+"\n",
 "def getAllExternalLinks(siteUrl):\n",
 "    html = urlopen(siteUrl)\n",
-"    domain = '{}://{}'.format(urlparse(siteUrl).scheme, urlparse(siteUrl).netloc)\n",
+"    domain = '{}://{}'.format(urlparse(siteUrl).scheme,\n",
+"                              urlparse(siteUrl).netloc)\n",
 "    bs = BeautifulSoup(html, 'html.parser')\n",
 "    internalLinks = getInternalLinks(bs, domain)\n",
 "    externalLinks = getExternalLinks(bs, domain)\n",
@@ -1786,6 +1788,7 @@
 "            allIntLinks.add(link)\n",
 "            getAllExternalLinks(link)\n",
 "\n",
+"\n",
 "allIntLinks.add('http://oreilly.com')\n",
 "getAllExternalLinks('http://oreilly.com')"
 ]

v2/Chapter04_CrawlingModels.ipynb

Lines changed: 53 additions & 29 deletions
@@ -22,7 +22,8 @@
 "    \"\"\"\n",
 "\n",
 "    session = requests.Session()\n",
-"    headers = {\"User-Agent\":\"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36\", \"Accept\":\"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8\"}\n",
+"    headers = {\"User-Agent\": \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36\",\n",
+"               \"Accept\": \"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8\"}\n",
 "    try:\n",
 "        req = session.get(url, headers=headers)\n",
 "    except requests.exceptions.RequestException:\n",
@@ -183,36 +184,41 @@
 "source": [
 "import requests\n",
 "\n",
+"\n",
 "class Content:\n",
 "    def __init__(self, url, title, body):\n",
 "        self.url = url\n",
 "        self.title = title\n",
 "        self.body = body\n",
 "\n",
+"\n",
 "def getPage(url):\n",
 "    req = requests.get(url)\n",
 "    return BeautifulSoup(req.text, 'html.parser')\n",
 "\n",
+"\n",
 "def scrapeNYTimes(url):\n",
 "    bs = getPage(url)\n",
 "    title = bs.find(\"h1\").text\n",
-"    lines = bs.find_all(\"p\", {\"class\":\"story-content\"})\n",
+"    lines = bs.find_all(\"p\", {\"class\": \"story-content\"})\n",
 "    body = '\\n'.join([line.text for line in lines])\n",
 "    return Content(url, title, body)\n",
 "\n",
+"\n",
 "def scrapeBrookings(url):\n",
 "    bs = getPage(url)\n",
 "    title = bs.find(\"h1\").text\n",
-"    body = bs.find(\"div\",{\"class\",\"post-body\"}).text\n",
+"    body = bs.find(\"div\", {\"class\", \"post-body\"}).text\n",
 "    return Content(url, title, body)\n",
 "\n",
+"\n",
 "url = 'https://www.brookings.edu/blog/future-development/2018/01/26/delivering-inclusive-urban-access-3-uncomfortable-truths/'\n",
 "content = scrapeBrookings(url)\n",
 "print('Title: {}'.format(content.title))\n",
 "print('URL: {}\\n'.format(content.url))\n",
 "print(content.body)\n",
 "\n",
-"url = \"https://www.nytimes.com/2018/01/25/opinion/sunday/silicon-valley-immortality.html\"\n",
+"url = 'https://www.nytimes.com/2018/01/25/opinion/sunday/silicon-valley-immortality.html'\n",
 "content = scrapeNYTimes(url)\n",
 "print('Title: {}'.format(content.title))\n",
 "print('URL: {}\\n'.format(content.url))\n",
@@ -229,7 +235,7 @@
 "    \"\"\"\n",
 "    Common base class for all articles/pages\n",
 "    \"\"\"\n",
-"    \n",
+"\n",
 "    def __init__(self, url, title, body):\n",
 "        self.url = url\n",
 "        self.title = title\n",
@@ -242,7 +248,7 @@
 "        print(\"URL: {}\".format(self.url))\n",
 "        print(\"TITLE: {}\".format(self.title))\n",
 "        print(\"BODY:\\n{}\".format(self.body))\n",
-"    \n",
+"\n",
 "\n",
 "class Website:\n",
 "    \"\"\" \n",
@@ -265,13 +271,14 @@
 "import requests\n",
 "from bs4 import BeautifulSoup\n",
 "\n",
+"\n",
 "class Crawler:\n",
 "\n",
 "    def getPage(self, url):\n",
 "        try:\n",
 "            req = requests.get(url)\n",
 "        except requests.exceptions.RequestException:\n",
-"            return None \n",
+"            return None\n",
 "        return BeautifulSoup(req.text, 'html.parser')\n",
 "\n",
 "    def safeGet(self, pageObj, selector):\n",
@@ -800,9 +807,14 @@
 "    websites.append(Website(row[0], row[1], row[2], row[3]))\n",
 "\n",
 "crawler.parse(websites[0], 'http://shop.oreilly.com/product/0636920028154.do')\n",
-"crawler.parse(websites[1], 'http://www.reuters.com/article/us-usa-epa-pruitt-idUSKBN19W2D0')\n",
-"crawler.parse(websites[2], 'https://www.brookings.edu/blog/techtank/2016/03/01/idea-to-retire-old-methods-of-policy-education/')\n",
-"crawler.parse(websites[3], 'https://www.nytimes.com/2018/01/28/business/energy-environment/oil-boom.html')"
+"crawler.parse(\n",
+"    websites[1], 'http://www.reuters.com/article/us-usa-epa-pruitt-idUSKBN19W2D0')\n",
+"crawler.parse(\n",
+"    websites[2],\n",
+"    'https://www.brookings.edu/blog/techtank/2016/03/01/idea-to-retire-old-methods-of-policy-education/')\n",
+"crawler.parse(\n",
+"    websites[3], \n",
+"    'https://www.nytimes.com/2018/01/28/business/energy-environment/oil-boom.html')"
 ]
 },
 {
@@ -852,7 +864,7 @@
 "        self.searchUrl = searchUrl\n",
 "        self.resultListing = resultListing\n",
 "        self.resultUrl = resultUrl\n",
-"        self.absoluteUrl=absoluteUrl\n",
+"        self.absoluteUrl = absoluteUrl\n",
 "        self.titleTag = titleTag\n",
 "        self.bodyTag = bodyTag"
 ]
@@ -976,13 +988,14 @@
 "import requests\n",
 "from bs4 import BeautifulSoup\n",
 "\n",
+"\n",
 "class Crawler:\n",
 "\n",
 "    def getPage(self, url):\n",
 "        try:\n",
 "            req = requests.get(url)\n",
 "        except requests.exceptions.RequestException:\n",
-"            return None \n",
+"            return None\n",
 "        return BeautifulSoup(req.text, 'html.parser')\n",
 "\n",
 "    def safeGet(self, pageObj, selector):\n",
@@ -995,15 +1008,15 @@
 "        \"\"\"\n",
 "        Searches a given website for a given topic and records all pages found\n",
 "        \"\"\"\n",
-"        bs = self.getPage(site.searchUrl+topic)\n",
+"        bs = self.getPage(site.searchUrl + topic)\n",
 "        searchResults = bs.select(site.resultListing)\n",
 "        for result in searchResults:\n",
 "            url = result.select(site.resultUrl)[0].attrs[\"href\"]\n",
-"            #Check to see whether it's a relative or an absolute URL\n",
+"            # Check to see whether it's a relative or an absolute URL\n",
 "            if(site.absoluteUrl):\n",
 "                bs = self.getPage(url)\n",
 "            else:\n",
-"                bs = self.getPage(site.url+url)\n",
+"                bs = self.getPage(site.url + url)\n",
 "            if bs is None:\n",
 "                print(\"Something was wrong with that page or URL. Skipping!\")\n",
 "                return\n",
@@ -1017,17 +1030,21 @@
 "crawler = Crawler()\n",
 "\n",
 "siteData = [\n",
-"    ['O\\'Reilly Media', 'http://oreilly.com', 'https://ssearch.oreilly.com/?q=', 'article.product-result','p.title a', True, 'h1', 'section#product-description'],\n",
-"    ['Reuters', 'http://reuters.com', 'http://www.reuters.com/search/news?blob=', 'div.search-result-content', 'h3.search-result-title a', False, 'h1', 'div.StandardArticleBody_body_1gnLA'],\n",
-"    ['Brookings', 'http://www.brookings.edu', 'https://www.brookings.edu/search/?s=', 'div.list-content article', 'h4.title a', True, 'h1', 'div.post-body']\n",
-"    ]\n",
+"    ['O\\'Reilly Media', 'http://oreilly.com', 'https://ssearch.oreilly.com/?q=',\n",
+"     'article.product-result', 'p.title a', True, 'h1', 'section#product-description'],\n",
+"    ['Reuters', 'http://reuters.com', 'http://www.reuters.com/search/news?blob=', 'div.search-result-content',\n",
+"     'h3.search-result-title a', False, 'h1', 'div.StandardArticleBody_body_1gnLA'],\n",
+"    ['Brookings', 'http://www.brookings.edu', 'https://www.brookings.edu/search/?s=',\n",
+"     'div.list-content article', 'h4.title a', True, 'h1', 'div.post-body']\n",
+"]\n",
 "sites = []\n",
 "for row in siteData:\n",
-"    sites.append(Website(row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7]))\n",
+"    sites.append(Website(row[0], row[1], row[2],\n",
+"                         row[3], row[4], row[5], row[6], row[7]))\n",
 "\n",
-"topics = [\"python\",\"data science\"]\n",
+"topics = [\"python\", \"data science\"]\n",
 "for topic in topics:\n",
-"    print(\"GETTING INFO ABOUT: \"+topic)\n",
+"    print(\"GETTING INFO ABOUT: \" + topic)\n",
 "    for targetSite in sites:\n",
 "        crawler.search(topic, targetSite)"
 ]
@@ -1046,15 +1063,16 @@
 "outputs": [],
 "source": [
 "class Website:\n",
-"    \n",
+"\n",
 "    def __init__(self, name, url, targetPattern, absoluteUrl, titleTag, bodyTag):\n",
 "        self.name = name\n",
 "        self.url = url\n",
 "        self.targetPattern = targetPattern\n",
-"        self.absoluteUrl=absoluteUrl\n",
+"        self.absoluteUrl = absoluteUrl\n",
 "        self.titleTag = titleTag\n",
 "        self.bodyTag = bodyTag\n",
-"    \n",
+"\n",
+"\n",
 "class Content:\n",
 "\n",
 "    def __init__(self, url, title, body):\n",
@@ -1170,24 +1188,25 @@
 "source": [
 "import re\n",
 "\n",
+"\n",
 "class Crawler:\n",
 "    def __init__(self, site):\n",
 "        self.site = site\n",
 "        self.visited = []\n",
-"    \n",
+"\n",
 "    def getPage(self, url):\n",
 "        try:\n",
 "            req = requests.get(url)\n",
 "        except requests.exceptions.RequestException:\n",
-"            return None \n",
+"            return None\n",
 "        return BeautifulSoup(req.text, 'html.parser')\n",
 "\n",
 "    def safeGet(self, pageObj, selector):\n",
 "        selectedElems = pageObj.select(selector)\n",
 "        if selectedElems is not None and len(selectedElems) > 0:\n",
 "            return '\\n'.join([elem.get_text() for elem in selectedElems])\n",
 "        return ''\n",
-"    \n",
+"\n",
 "    def parse(self, url):\n",
 "        bs = self.getPage(url)\n",
 "        if bs is not None:\n",
@@ -1211,7 +1230,9 @@
 "                    targetPage = '{}{}'.format(self.site.url, targetPage)\n",
 "                self.parse(targetPage)\n",
 "\n",
-"reuters = Website('Reuters', 'https://www.reuters.com', '^(/article/)', False, 'h1', 'div.StandardArticleBody_body_1gnLA')\n",
+"\n",
+"reuters = Website('Reuters', 'https://www.reuters.com', '^(/article/)',\n",
+"                  False, 'h1', 'div.StandardArticleBody_body_1gnLA')\n",
 "crawler = Crawler(reuters)\n",
 "crawler.crawl()"
 ]
@@ -1248,13 +1269,16 @@
 "source": [
 "class Product(Website):\n",
 "    \"\"\"Contains information for scraping a product page\"\"\"\n",
+"\n",
 "    def __init__(self, name, url, titleTag, productNumber, price):\n",
 "        Website.__init__(self, name, url, TitleTag)\n",
 "        self.productNumberTag = productNumberTag\n",
 "        self.priceTag = priceTag\n",
 "\n",
+"\n",
 "class Article(Website):\n",
 "    \"\"\"Contains information for scraping an article page\"\"\"\n",
+"\n",
 "    def __init__(self, name, url, titleTag, bodyTag, dateTag):\n",
 "        Website.__init__(self, name, url, titleTag)\n",
 "        self.bodyTag = bodyTag\n",

v2/Chapter06_StoringData.ipynb

Lines changed: 3 additions & 1 deletion
@@ -37025,7 +37025,9 @@
 "execution_count": null,
 "metadata": {},
 "outputs": [],
-"source": []
+"source": [
+"'"
+]
 }
 ],
 "metadata": {

v2/Chapter09_NaturalLanguages.ipynb

Lines changed: 3 additions & 1 deletion
@@ -276,7 +276,9 @@
 "execution_count": null,
 "metadata": {},
 "outputs": [],
-"source": []
+"source": [
+"'"
+]
 },
 {
 "cell_type": "code",

v2/Chapter10-CrawlingThroughFormsAndLogins.ipynb

Lines changed: 2 additions & 2 deletions
@@ -517,8 +517,8 @@
 "from requests.auth import HTTPBasicAuth\n",
 "\n",
 "auth = HTTPBasicAuth('ryan', 'password')\n",
-"r = requests.post(url='http://pythonscraping.com/pages/auth/login.php', auth=\n",
-"                  auth)\n",
+"r = requests.post(\n",
+"    url='http://pythonscraping.com/pages/auth/login.php', auth=auth)\n",
 "print(r.text)"
 ]
 },

v2/Chapter11-JavaScript.ipynb

Lines changed: 5 additions & 3 deletions
@@ -66,13 +66,15 @@
 "from selenium.webdriver.support import expected_conditions as EC\n",
 "from selenium.common.exceptions import TimeoutException\n",
 "\n",
-"driver = webdriver.PhantomJS(executable_path='drivers/phantomjs/phantomjs-2.1.1-macosx/bin/phantomjs')\n",
+"driver = webdriver.PhantomJS(\n",
+"    executable_path='drivers/phantomjs/phantomjs-2.1.1-macosx/bin/phantomjs')\n",
 "driver.get('http://pythonscraping.com/pages/javascript/redirectDemo1.html')\n",
 "try:\n",
-"    bodyElement = WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.XPATH, '//body[contains(text(), \"This is the page you are looking for!\")]')))\n",
+"    bodyElement = WebDriverWait(driver, 15).until(EC.presence_of_element_located(\n",
+"        (By.XPATH, '//body[contains(text(), \"This is the page you are looking for!\")]')))\n",
 "    print(bodyElement.text)\n",
 "except TimeoutException:\n",
-"    print('Did not find the element')\n"
+"    print('Did not find the element')"
 ]
 },
 {
