Skip to content

Commit 235bbe7

Browse files
committed
Code cleanup
1 parent fb9cc07 commit 235bbe7

File tree

7 files changed

+481
-238
lines changed

7 files changed

+481
-238
lines changed

v2/Chapter02-AdvancedHTMLParsing.ipynb

Lines changed: 29 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -229,23 +229,21 @@
229229
"source": [
230230
"from urllib.request import urlopen\n",
231231
"from bs4 import BeautifulSoup\n",
232-
"html = urlopen(\"http://www.pythonscraping.com/pages/warandpeace.html\")\n",
233-
"bs = BeautifulSoup(html, \"html.parser\")\n",
234-
"print(bs)"
232+
"html = urlopen('http://www.pythonscraping.com/pages/warandpeace.html')\n",
233+
"bsObj = BeautifulSoup(html, 'html.parser')\n",
234+
"print(bsObj)"
235235
]
236236
},
237237
{
238238
"cell_type": "code",
239239
"execution_count": 6,
240-
"metadata": {
241-
"collapsed": true
242-
},
240+
"metadata": {},
243241
"outputs": [],
244242
"source": [
245243
"from urllib.request import urlopen\n",
246244
"from bs4 import BeautifulSoup\n",
247-
"html = urlopen(\"http://www.pythonscraping.com/pages/warandpeace.html\")\n",
248-
"bs = BeautifulSoup(html, \"html.parser\")"
245+
"html = urlopen('http://www.pythonscraping.com/pages/warandpeace.html')\n",
246+
"bsObj = BeautifulSoup(html, \"html.parser\")"
249247
]
250248
},
251249
{
@@ -307,14 +305,14 @@
307305
}
308306
],
309307
"source": [
310-
"nameList = bs.findAll(\"span\", {\"class\":\"green\"})\n",
308+
"nameList = bsObj.findAll('span', {'class':'green'})\n",
311309
"for name in nameList:\n",
312310
" print(name.get_text())"
313311
]
314312
},
315313
{
316314
"cell_type": "code",
317-
"execution_count": 18,
315+
"execution_count": 7,
318316
"metadata": {},
319317
"outputs": [
320318
{
@@ -326,13 +324,13 @@
326324
}
327325
],
328326
"source": [
329-
"titles = bs.find_all({\"h1\",\"h2\",\"h3\",\"h4\",\"h5\",\"h6\"})\n",
327+
"titles = bsObj.find_all(['h1', 'h2','h3','h4','h5','h6'])\n",
330328
"print([title for title in titles])\n"
331329
]
332330
},
333331
{
334332
"cell_type": "code",
335-
"execution_count": 20,
333+
"execution_count": 8,
336334
"metadata": {},
337335
"outputs": [
338336
{
@@ -416,7 +414,7 @@
416414
}
417415
],
418416
"source": [
419-
"allText = bs.find_all(\"span\", {\"class\":{\"green\", \"red\"}})\n",
417+
"allText = bsObj.find_all('span', {'class':{'green', 'red'}})\n",
420418
"print([text for text in allText])"
421419
]
422420
},
@@ -434,7 +432,7 @@
434432
}
435433
],
436434
"source": [
437-
"nameList = bs.find_all(text=\"the prince\")\n",
435+
"nameList = bsObj.find_all(text='the prince')\n",
438436
"print(len(nameList))"
439437
]
440438
},
@@ -452,7 +450,7 @@
452450
}
453451
],
454452
"source": [
455-
"allText = bs.find_all(id=\"title\", class_=\"text\")\n",
453+
"allText = bsObj.find_all(id='title', class_='text')\n",
456454
"print([text for text in allText])"
457455
]
458456
},
@@ -541,10 +539,10 @@
541539
"from urllib.request import urlopen\n",
542540
"from bs4 import BeautifulSoup\n",
543541
"\n",
544-
"html = urlopen(\"http://www.pythonscraping.com/pages/page3.html\")\n",
545-
"bs = BeautifulSoup(html, \"html.parser\")\n",
542+
"html = urlopen('http://www.pythonscraping.com/pages/page3.html')\n",
543+
"bsObj = BeautifulSoup(html, 'html.parser')\n",
546544
"\n",
547-
"for child in bs.find(\"table\",{\"id\":\"giftList\"}).children:\n",
545+
"for child in bsObj.find('table',{'id':'giftList'}).children:\n",
548546
" print(child)"
549547
]
550548
},
@@ -621,16 +619,16 @@
621619
"source": [
622620
"from urllib.request import urlopen\n",
623621
"from bs4 import BeautifulSoup\n",
624-
"html = urlopen(\"http://www.pythonscraping.com/pages/page3.html\")\n",
625-
"bs = BeautifulSoup(html, \"html.parser\")\n",
622+
"html = urlopen('http://www.pythonscraping.com/pages/page3.html')\n",
623+
"bsObj = BeautifulSoup(html, 'html.parser')\n",
626624
"\n",
627-
"for sibling in bs.find(\"table\",{\"id\":\"giftList\"}).tr.next_siblings:\n",
625+
"for sibling in bsObj.find('table', {'id':'giftList'}).tr.next_siblings:\n",
628626
" print(sibling) "
629627
]
630628
},
631629
{
632630
"cell_type": "code",
633-
"execution_count": 27,
631+
"execution_count": 1,
634632
"metadata": {},
635633
"outputs": [
636634
{
@@ -647,14 +645,14 @@
647645
"from urllib.request import urlopen\n",
648646
"from bs4 import BeautifulSoup\n",
649647
"\n",
650-
"html = urlopen(\"http://www.pythonscraping.com/pages/page3.html\")\n",
651-
"bs = BeautifulSoup(html, \"html.parser\")\n",
652-
"print(bs.find(\"img\",{\"src\":\"../img/gifts/img1.jpg\"}).parent.previous_sibling.get_text())"
648+
"html = urlopen('http://www.pythonscraping.com/pages/page3.html')\n",
649+
"bsObj = BeautifulSoup(html, 'html.parser')\n",
650+
"print(bsObj.find('img',{'src':'../img/gifts/img1.jpg'}).parent.previous_sibling.get_text())"
653651
]
654652
},
655653
{
656654
"cell_type": "code",
657-
"execution_count": 28,
655+
"execution_count": 3,
658656
"metadata": {},
659657
"outputs": [
660658
{
@@ -674,11 +672,11 @@
674672
"from bs4 import BeautifulSoup\n",
675673
"import re\n",
676674
"\n",
677-
"html = urlopen(\"http://www.pythonscraping.com/pages/page3.html\")\n",
678-
"bs = BeautifulSoup(html, \"html.parser\")\n",
679-
"images = bs.find_all(\"img\", {\"src\":re.compile(\"\\.\\.\\/img\\/gifts/img.*\\.jpg\")})\n",
675+
"html = urlopen('http://www.pythonscraping.com/pages/page3.html')\n",
676+
"bsObj = BeautifulSoup(html, 'html.parser')\n",
677+
"images = bsObj.find_all('img', {'src':re.compile('\\.\\.\\/img\\/gifts/img.*\\.jpg')})\n",
680678
"for image in images: \n",
681-
" print(image[\"src\"])"
679+
" print(image['src'])"
682680
]
683681
},
684682
{
@@ -744,7 +742,7 @@
744742
}
745743
],
746744
"source": [
747-
"bs.find_all(lambda tag: len(tag.attrs) == 2)"
745+
"bsObj.find_all(lambda tag: len(tag.attrs) == 2)"
748746
]
749747
},
750748
{

0 commit comments

Comments
 (0)