|
229 | 229 | "source": [ |
230 | 230 | "from urllib.request import urlopen\n", |
231 | 231 | "from bs4 import BeautifulSoup\n", |
232 | | - "html = urlopen(\"http://www.pythonscraping.com/pages/warandpeace.html\")\n", |
233 | | - "bs = BeautifulSoup(html, \"html.parser\")\n", |
234 | | - "print(bs)" |
| 232 | + "html = urlopen('http://www.pythonscraping.com/pages/warandpeace.html')\n", |
| 233 | + "bsObj = BeautifulSoup(html, 'html.parser')\n", |
| 234 | + "print(bsObj)" |
235 | 235 | ] |
236 | 236 | }, |
237 | 237 | { |
238 | 238 | "cell_type": "code", |
239 | 239 | "execution_count": 6, |
240 | | - "metadata": { |
241 | | - "collapsed": true |
242 | | - }, |
| 240 | + "metadata": {}, |
243 | 241 | "outputs": [], |
244 | 242 | "source": [ |
245 | 243 | "from urllib.request import urlopen\n", |
246 | 244 | "from bs4 import BeautifulSoup\n", |
247 | | - "html = urlopen(\"http://www.pythonscraping.com/pages/warandpeace.html\")\n", |
248 | | - "bs = BeautifulSoup(html, \"html.parser\")" |
| 245 | + "html = urlopen('http://www.pythonscraping.com/pages/warandpeace.html')\n", |
| 246 | + "bsObj = BeautifulSoup(html, \"html.parser\")" |
249 | 247 | ] |
250 | 248 | }, |
251 | 249 | { |
|
307 | 305 | } |
308 | 306 | ], |
309 | 307 | "source": [ |
310 | | - "nameList = bs.findAll(\"span\", {\"class\":\"green\"})\n", |
| 308 | + "nameList = bsObj.findAll('span', {'class':'green'})\n", |
311 | 309 | "for name in nameList:\n", |
312 | 310 | " print(name.get_text())" |
313 | 311 | ] |
314 | 312 | }, |
315 | 313 | { |
316 | 314 | "cell_type": "code", |
317 | | - "execution_count": 18, |
| 315 | + "execution_count": 7, |
318 | 316 | "metadata": {}, |
319 | 317 | "outputs": [ |
320 | 318 | { |
|
326 | 324 | } |
327 | 325 | ], |
328 | 326 | "source": [ |
329 | | - "titles = bs.find_all({\"h1\",\"h2\",\"h3\",\"h4\",\"h5\",\"h6\"})\n", |
| 327 | + "titles = bsObj.find_all(['h1', 'h2','h3','h4','h5','h6'])\n", |
330 | 328 | "print([title for title in titles])\n" |
331 | 329 | ] |
332 | 330 | }, |
333 | 331 | { |
334 | 332 | "cell_type": "code", |
335 | | - "execution_count": 20, |
| 333 | + "execution_count": 8, |
336 | 334 | "metadata": {}, |
337 | 335 | "outputs": [ |
338 | 336 | { |
|
416 | 414 | } |
417 | 415 | ], |
418 | 416 | "source": [ |
419 | | - "allText = bs.find_all(\"span\", {\"class\":{\"green\", \"red\"}})\n", |
| 417 | + "allText = bsObj.find_all('span', {'class':{'green', 'red'}})\n", |
420 | 418 | "print([text for text in allText])" |
421 | 419 | ] |
422 | 420 | }, |
|
434 | 432 | } |
435 | 433 | ], |
436 | 434 | "source": [ |
437 | | - "nameList = bs.find_all(text=\"the prince\")\n", |
| 435 | + "nameList = bsObj.find_all(text='the prince')\n", |
438 | 436 | "print(len(nameList))" |
439 | 437 | ] |
440 | 438 | }, |
|
452 | 450 | } |
453 | 451 | ], |
454 | 452 | "source": [ |
455 | | - "allText = bs.find_all(id=\"title\", class_=\"text\")\n", |
| 453 | + "allText = bsObj.find_all(id='title', class_='text')\n", |
456 | 454 | "print([text for text in allText])" |
457 | 455 | ] |
458 | 456 | }, |
|
541 | 539 | "from urllib.request import urlopen\n", |
542 | 540 | "from bs4 import BeautifulSoup\n", |
543 | 541 | "\n", |
544 | | - "html = urlopen(\"http://www.pythonscraping.com/pages/page3.html\")\n", |
545 | | - "bs = BeautifulSoup(html, \"html.parser\")\n", |
| 542 | + "html = urlopen('http://www.pythonscraping.com/pages/page3.html')\n", |
| 543 | + "bsObj = BeautifulSoup(html, 'html.parser')\n", |
546 | 544 | "\n", |
547 | | - "for child in bs.find(\"table\",{\"id\":\"giftList\"}).children:\n", |
| 545 | + "for child in bsObj.find('table',{'id':'giftList'}).children:\n", |
548 | 546 | " print(child)" |
549 | 547 | ] |
550 | 548 | }, |
|
621 | 619 | "source": [ |
622 | 620 | "from urllib.request import urlopen\n", |
623 | 621 | "from bs4 import BeautifulSoup\n", |
624 | | - "html = urlopen(\"http://www.pythonscraping.com/pages/page3.html\")\n", |
625 | | - "bs = BeautifulSoup(html, \"html.parser\")\n", |
| 622 | + "html = urlopen('http://www.pythonscraping.com/pages/page3.html')\n", |
| 623 | + "bsObj = BeautifulSoup(html, 'html.parser')\n", |
626 | 624 | "\n", |
627 | | - "for sibling in bs.find(\"table\",{\"id\":\"giftList\"}).tr.next_siblings:\n", |
| 625 | + "for sibling in bsObj.find('table', {'id':'giftList'}).tr.next_siblings:\n", |
628 | 626 | " print(sibling) " |
629 | 627 | ] |
630 | 628 | }, |
631 | 629 | { |
632 | 630 | "cell_type": "code", |
633 | | - "execution_count": 27, |
| 631 | + "execution_count": 1, |
634 | 632 | "metadata": {}, |
635 | 633 | "outputs": [ |
636 | 634 | { |
|
647 | 645 | "from urllib.request import urlopen\n", |
648 | 646 | "from bs4 import BeautifulSoup\n", |
649 | 647 | "\n", |
650 | | - "html = urlopen(\"http://www.pythonscraping.com/pages/page3.html\")\n", |
651 | | - "bs = BeautifulSoup(html, \"html.parser\")\n", |
652 | | - "print(bs.find(\"img\",{\"src\":\"../img/gifts/img1.jpg\"}).parent.previous_sibling.get_text())" |
| 648 | + "html = urlopen('http://www.pythonscraping.com/pages/page3.html')\n", |
| 649 | + "bsObj = BeautifulSoup(html, 'html.parser')\n", |
| 650 | + "print(bsObj.find('img',{'src':'../img/gifts/img1.jpg'}).parent.previous_sibling.get_text())" |
653 | 651 | ] |
654 | 652 | }, |
655 | 653 | { |
656 | 654 | "cell_type": "code", |
657 | | - "execution_count": 28, |
| 655 | + "execution_count": 3, |
658 | 656 | "metadata": {}, |
659 | 657 | "outputs": [ |
660 | 658 | { |
|
674 | 672 | "from bs4 import BeautifulSoup\n", |
675 | 673 | "import re\n", |
676 | 674 | "\n", |
677 | | - "html = urlopen(\"http://www.pythonscraping.com/pages/page3.html\")\n", |
678 | | - "bs = BeautifulSoup(html, \"html.parser\")\n", |
679 | | - "images = bs.find_all(\"img\", {\"src\":re.compile(\"\\.\\.\\/img\\/gifts/img.*\\.jpg\")})\n", |
| 675 | + "html = urlopen('http://www.pythonscraping.com/pages/page3.html')\n", |
| 676 | + "bsObj = BeautifulSoup(html, 'html.parser')\n", |
| 677 | + "images = bsObj.find_all('img', {'src':re.compile('\\.\\.\\/img\\/gifts/img.*\\.jpg')})\n", |
680 | 678 | "for image in images: \n", |
681 | | - " print(image[\"src\"])" |
| 679 | + " print(image['src'])" |
682 | 680 | ] |
683 | 681 | }, |
684 | 682 | { |
|
744 | 742 | } |
745 | 743 | ], |
746 | 744 | "source": [ |
747 | | - "bs.find_all(lambda tag: len(tag.attrs) == 2)" |
| 745 | + "bsObj.find_all(lambda tag: len(tag.attrs) == 2)" |
748 | 746 | ] |
749 | 747 | }, |
750 | 748 | { |
|