From cc1eb44188c854647e4988de420006aaf92cba74 Mon Sep 17 00:00:00 2001 From: William <68653853+will-i-amv@users.noreply.github.com> Date: Wed, 30 Jun 2021 09:41:43 -0400 Subject: [PATCH 01/27] Move notebook to chapter1 folder --- .../chapter1/Chapter01_BeginningToScrape.ipynb | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename Chapter01_BeginningToScrape.ipynb => v2/chapter1/Chapter01_BeginningToScrape.ipynb (100%) diff --git a/Chapter01_BeginningToScrape.ipynb b/v2/chapter1/Chapter01_BeginningToScrape.ipynb similarity index 100% rename from Chapter01_BeginningToScrape.ipynb rename to v2/chapter1/Chapter01_BeginningToScrape.ipynb From ba2dd8709ac6eba6d079124d90b27a82178ab1d6 Mon Sep 17 00:00:00 2001 From: William <68653853+will-i-amv@users.noreply.github.com> Date: Wed, 30 Jun 2021 09:53:08 -0400 Subject: [PATCH 02/27] Add 1st example to chapter1 --- v2/chapter1/1-basicExample.py | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 v2/chapter1/1-basicExample.py diff --git a/v2/chapter1/1-basicExample.py b/v2/chapter1/1-basicExample.py new file mode 100644 index 0000000..28ea3ea --- /dev/null +++ b/v2/chapter1/1-basicExample.py @@ -0,0 +1,4 @@ +from urllib.request import urlopen + +html = urlopen('http://pythonscraping.com/pages/page1.html') +print(html.read()) From 874d3bbea2a8dba10b1c65fbfa535eec698875cb Mon Sep 17 00:00:00 2001 From: William <68653853+will-i-amv@users.noreply.github.com> Date: Wed, 30 Jun 2021 10:07:14 -0400 Subject: [PATCH 03/27] Add 2nd example to chapter1 --- v2/chapter1/2-beautifulSoup.py | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 v2/chapter1/2-beautifulSoup.py diff --git a/v2/chapter1/2-beautifulSoup.py b/v2/chapter1/2-beautifulSoup.py new file mode 100644 index 0000000..7cf6d5a --- /dev/null +++ b/v2/chapter1/2-beautifulSoup.py @@ -0,0 +1,6 @@ +from urllib.request import urlopen +from bs4 import BeautifulSoup + +html = urlopen('http://www.pythonscraping.com/pages/page1.html') +bs = BeautifulSoup(html.read(), 'html.parser') +print(bs.h1) From 4121156431580747ece389dcd979657a1e93a9fe Mon Sep 17 00:00:00 2001 From: William <68653853+will-i-amv@users.noreply.github.com> Date: Wed, 30 Jun 2021 10:19:26 -0400 Subject: [PATCH 04/27] Add 3rd example to chapter1 --- v2/chapter1/3-exceptionHandling_1.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 v2/chapter1/3-exceptionHandling_1.py diff --git a/v2/chapter1/3-exceptionHandling_1.py b/v2/chapter1/3-exceptionHandling_1.py new file mode 100644 index 0000000..66bb929 --- /dev/null +++ b/v2/chapter1/3-exceptionHandling_1.py @@ -0,0 +1,13 @@ +from urllib.request import urlopen +from urllib.error import HTTPError +from urllib.error import URLError + + +try: + html = urlopen("https://pythonscrapingthisurldoesnotexist.com") +except HTTPError as e: + print("The server returned an HTTP error") +except URLError as e: + print("The server could not be found!") +else: + print(html.read()) From 09a93651552aee91f9830646f92e46186f8e6171 Mon Sep 17 00:00:00 2001 From: William <68653853+will-i-amv@users.noreply.github.com> Date: Wed, 30 Jun 2021 10:20:29 -0400 Subject: [PATCH 05/27] Add 4th example to chapter1 --- v2/chapter1/4-exceptionHandling_2.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 v2/chapter1/4-exceptionHandling_2.py diff --git a/v2/chapter1/4-exceptionHandling_2.py b/v2/chapter1/4-exceptionHandling_2.py new file mode 100644 index 0000000..9821f0f --- /dev/null +++ b/v2/chapter1/4-exceptionHandling_2.py @@ -0,0 +1,23 @@ +from urllib.request import urlopen +from urllib.error import HTTPError +from bs4 import BeautifulSoup + + +def getTitle(url): + try: + html = urlopen(url) + except HTTPError as e: + return None + try: + bsObj = BeautifulSoup(html.read(), "lxml") + title = bsObj.body.h1 + except AttributeError as e: + return None + return title + + +title = getTitle("http://www.pythonscraping.com/pages/page1.html") +if title == None: + print("Title could not be found") +else: + print(title) From 71ced46aadaca50f6f2b96735ee9218da7b76694 Mon Sep 17 00:00:00 2001 From: William <68653853+will-i-amv@users.noreply.github.com> Date: Wed, 30 Jun 2021 10:22:56 -0400 Subject: [PATCH 06/27] Move notebook to chapter2 folder --- .../chapter2/Chapter02-AdvancedHTMLParsing.ipynb | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename Chapter02-AdvancedHTMLParsing.ipynb => v2/chapter2/Chapter02-AdvancedHTMLParsing.ipynb (100%) diff --git a/Chapter02-AdvancedHTMLParsing.ipynb b/v2/chapter2/Chapter02-AdvancedHTMLParsing.ipynb similarity index 100% rename from Chapter02-AdvancedHTMLParsing.ipynb rename to v2/chapter2/Chapter02-AdvancedHTMLParsing.ipynb From 47fbd43a68da7e9b8d9bd982f2e4391d89c93a76 Mon Sep 17 00:00:00 2001 From: William <68653853+will-i-amv@users.noreply.github.com> Date: Wed, 30 Jun 2021 11:41:50 -0400 Subject: [PATCH 07/27] Add 1st example to chapter2 --- v2/chapter2/1-selectByClass.py | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 v2/chapter2/1-selectByClass.py diff --git a/v2/chapter2/1-selectByClass.py b/v2/chapter2/1-selectByClass.py new file mode 100644 index 0000000..b4543bd --- /dev/null +++ b/v2/chapter2/1-selectByClass.py @@ -0,0 +1,9 @@ +from urllib.request import urlopen +from bs4 import BeautifulSoup + + +html = urlopen('http://www.pythonscraping.com/pages/page1.html') +bs = BeautifulSoup(html.read(), 'html.parser') +nameList = bs.findAll('span', {'class':'green'}) +for name in nameList: + print(name.get_text()) From 06425480fd298701066b9bfc560e19bff944aa31 Mon Sep 17 00:00:00 2001 From: William <68653853+will-i-amv@users.noreply.github.com> Date: Wed, 30 Jun 2021 12:09:58 -0400 Subject: [PATCH 08/27] Add 2nd example to chapter2 --- v2/chapter2/2-selectByOtherArguments.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 v2/chapter2/2-selectByOtherArguments.py diff --git a/v2/chapter2/2-selectByOtherArguments.py b/v2/chapter2/2-selectByOtherArguments.py new file mode 100644 index 0000000..d8bf3d5 --- /dev/null +++ b/v2/chapter2/2-selectByOtherArguments.py @@ -0,0 +1,22 @@ +from urllib.request import urlopen +from bs4 import BeautifulSoup + + +html = urlopen('http://www.pythonscraping.com/pages/warandpeace.html') +bs = BeautifulSoup(html, "html.parser") + +# Select by tags +titles = bs.find_all(['h1', 'h2','h3','h4','h5','h6']) +print([title for title in titles]) + +# Select by tag attributes +allText = bs.find_all('span', {'class':{'green', 'red'}}) +print([text for text in allText]) + +# Select by text content of tags +nameList = bs.find_all(text='the prince') +print(len(nameList)) + +# Select by tags that contains a particular attribute +titles = bs.find_all(id='title', class_='text') +print([title for title in titles]) From d1a874b558b237575cfd3b18661f51cafa2bcfa9 Mon Sep 17 00:00:00 2001 From: William <68653853+will-i-amv@users.noreply.github.com> Date: Wed, 30 Jun 2021 14:09:10 -0400 Subject: [PATCH 09/27] Add 3rd example to chapter2 --- v2/chapter2/3-findChildrens.py | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 v2/chapter2/3-findChildrens.py diff --git a/v2/chapter2/3-findChildrens.py b/v2/chapter2/3-findChildrens.py new file mode 100644 index 0000000..e2b4952 --- /dev/null +++ b/v2/chapter2/3-findChildrens.py @@ -0,0 +1,8 @@ +from urllib.request import urlopen +from bs4 import BeautifulSoup + + +html = urlopen('http://www.pythonscraping.com/pages/page3.html') +bs = BeautifulSoup(html, 'html.parser') +for child in bs.find('table',{'id':'giftList'}).children: + print(child) From 5339a19e165c896e9be35e8da5a8ab30bfef0424 Mon Sep 17 00:00:00 2001 From: William <68653853+will-i-amv@users.noreply.github.com> Date: Wed, 30 Jun 2021 14:10:04 -0400 Subject: [PATCH 10/27] Add 4th example to chapter2 --- v2/chapter2/4-findSiblings.py | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 v2/chapter2/4-findSiblings.py diff --git a/v2/chapter2/4-findSiblings.py b/v2/chapter2/4-findSiblings.py new file mode 100644 index 0000000..3f87a36 --- /dev/null +++ b/v2/chapter2/4-findSiblings.py @@ -0,0 +1,8 @@ +from urllib.request import urlopen +from bs4 import BeautifulSoup + + +html = urlopen('http://www.pythonscraping.com/pages/page3.html') +bs = BeautifulSoup(html, 'html.parser') +for sibling in bs.find('table', {'id':'giftList'}).tr.next_siblings: + print(sibling) From 4d44423873df007ba95bae945be07a961b179759 Mon Sep 17 00:00:00 2001 From: William <68653853+will-i-amv@users.noreply.github.com> Date: Wed, 30 Jun 2021 14:15:47 -0400 Subject: [PATCH 11/27] Add 5th example to chapter2 --- v2/chapter2/5-findParents.py | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 v2/chapter2/5-findParents.py diff --git a/v2/chapter2/5-findParents.py b/v2/chapter2/5-findParents.py new file mode 100644 index 0000000..0c826ca --- /dev/null +++ b/v2/chapter2/5-findParents.py @@ -0,0 +1,9 @@ +from urllib.request import urlopen +from bs4 import BeautifulSoup + + +html = urlopen('http://www.pythonscraping.com/pages/page3.html') +bs = BeautifulSoup(html, 'html.parser') +print(bs.find( + 'img', + {'src':'../img/gifts/img1.jpg'}).parent.previous_sibling.get_text()) From e14730f6c403f95dc0e47a9e685adfa5fc640761 Mon Sep 17 00:00:00 2001 From: William <68653853+will-i-amv@users.noreply.github.com> Date: Wed, 30 Jun 2021 14:39:43 -0400 Subject: [PATCH 12/27] Add 6th example to chapter2 --- v2/chapter2/6-regularExpressions.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 v2/chapter2/6-regularExpressions.py diff --git a/v2/chapter2/6-regularExpressions.py b/v2/chapter2/6-regularExpressions.py new file mode 100644 index 0000000..0bf4dd4 --- /dev/null +++ b/v2/chapter2/6-regularExpressions.py @@ -0,0 +1,12 @@ +from urllib.request import urlopen +from bs4 import BeautifulSoup +import re + + +html = urlopen('http://www.pythonscraping.com/pages/page3.html') +bs = BeautifulSoup(html, 'html.parser') +images = bs.find_all( + 'img', + {'src':re.compile('\.\.\/img\/gifts/img.*\.jpg')}) +for image in images: + print(image['src']) From 1e8db8830f3a1826bf6163c0188d5df8cde0b0ab Mon Sep 17 00:00:00 2001 From: William <68653853+will-i-amv@users.noreply.github.com> Date: Wed, 30 Jun 2021 14:49:25 -0400 Subject: [PATCH 13/27] Add 7th example to chapter2 --- v2/chapter2/7-lambdaExpressions.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 v2/chapter2/7-lambdaExpressions.py diff --git a/v2/chapter2/7-lambdaExpressions.py b/v2/chapter2/7-lambdaExpressions.py new file mode 100644 index 0000000..b4d46b6 --- /dev/null +++ b/v2/chapter2/7-lambdaExpressions.py @@ -0,0 +1,10 @@ + +from urllib.request import urlopen +from bs4 import BeautifulSoup + + +html = urlopen("http://www.pythonscraping.com/pages/page2.html") +bsObj = BeautifulSoup(html, "html.parser") +tags = bsObj.findAll(lambda tag: len(tag.attrs) == 2) +for tag in tags: + print(tag) From aec4ac700ec7507be0bb2f81ef438ae8751f2cb1 Mon Sep 17 00:00:00 2001 From: William <68653853+will-i-amv@users.noreply.github.com> Date: Wed, 30 Jun 2021 14:52:06 -0400 Subject: [PATCH 14/27] Delete chapter2's notebook --- .../Chapter02-AdvancedHTMLParsing.ipynb | 597 ------------------ 1 file changed, 597 deletions(-) delete mode 100644 v2/chapter2/Chapter02-AdvancedHTMLParsing.ipynb diff --git a/v2/chapter2/Chapter02-AdvancedHTMLParsing.ipynb b/v2/chapter2/Chapter02-AdvancedHTMLParsing.ipynb deleted file mode 100644 index 6779c08..0000000 --- a/v2/chapter2/Chapter02-AdvancedHTMLParsing.ipynb +++ /dev/null @@ -1,597 +0,0 @@ -{ - "cells": [ - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "from urllib.request import urlopen\n", - "from bs4 import BeautifulSoup\n", - "html = urlopen('http://www.pythonscraping.com/pages/warandpeace.html')\n", - "bs = BeautifulSoup(html, 'html.parser')\n", - "print(bs)" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from urllib.request import urlopen\n", - "from bs4 import BeautifulSoup\n", - "html = urlopen('http://www.pythonscraping.com/pages/warandpeace.html')\n", - "bs = BeautifulSoup(html, \"html.parser\")" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Anna\n", - "Pavlovna Scherer\n", - "Empress Marya\n", - "Fedorovna\n", - "Prince Vasili Kuragin\n", - "Anna Pavlovna\n", - "St. Petersburg\n", - "the prince\n", - "Anna Pavlovna\n", - "Anna Pavlovna\n", - "the prince\n", - "the prince\n", - "the prince\n", - "Prince Vasili\n", - "Anna Pavlovna\n", - "Anna Pavlovna\n", - "the prince\n", - "Wintzingerode\n", - "King of Prussia\n", - "le Vicomte de Mortemart\n", - "Montmorencys\n", - "Rohans\n", - "Abbe Morio\n", - "the Emperor\n", - "the prince\n", - "Prince Vasili\n", - "Dowager Empress Marya Fedorovna\n", - "the baron\n", - "Anna Pavlovna\n", - "the Empress\n", - "the Empress\n", - "Anna Pavlovna's\n", - "Her Majesty\n", - "Baron\n", - "Funke\n", - "The prince\n", - "Anna\n", - "Pavlovna\n", - "the Empress\n", - "The prince\n", - "Anatole\n", - "the prince\n", - "The prince\n", - "Anna\n", - "Pavlovna\n", - "Anna Pavlovna\n" - ] - } - ], - "source": [ - "nameList = bs.findAll('span', {'class': 'green'})\n", - "for name in nameList:\n", - " print(name.get_text())" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[

War and Peace

,

Chapter 1

]\n" - ] - } - ], - "source": [ - "titles = bs.find_all(['h1', 'h2','h3','h4','h5','h6'])\n", - "print([title for title in titles])\n" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Well, Prince, so Genoa and Lucca are now just family estates of the\n", - "Buonapartes. But I warn you, if you don't tell me that this means war,\n", - "if you still try to defend the infamies and horrors perpetrated by\n", - "that Antichrist- I really believe he is Antichrist- I will have\n", - "nothing more to do with you and you are no longer my friend, no longer\n", - "my 'faithful slave,' as you call yourself! But how do you do? I see\n", - "I have frightened you- sit down and tell me all the news., Anna\n", - "Pavlovna Scherer, Empress Marya\n", - "Fedorovna, Prince Vasili Kuragin, Anna Pavlovna, St. Petersburg, If you have nothing better to do, Count [or Prince], and if the\n", - "prospect of spending an evening with a poor invalid is not too\n", - "terrible, I shall be very charmed to see you tonight between 7 and 10-\n", - "Annette Scherer., Heavens! what a virulent attack!, the prince, Anna Pavlovna, First of all, dear friend, tell me how you are. Set your friend's\n", - "mind at rest,, Can one be well while suffering morally? Can one be calm in times\n", - "like these if one has any feeling?, Anna Pavlovna, You are\n", - "staying the whole evening, I hope?, And the fete at the English ambassador's? Today is Wednesday. I\n", - "must put in an appearance there,, the prince, My daughter is\n", - "coming for me to take me there., I thought today's fete had been canceled. I confess all these\n", - "festivities and fireworks are becoming wearisome., If they had known that you wished it, the entertainment would\n", - "have been put off,, the prince, Don't tease! Well, and what has been decided about Novosiltsev's\n", - "dispatch? You know everything., What can one say about it?, the prince, What has been decided? They have decided that\n", - "Buonaparte has burnt his boats, and I believe that we are ready to\n", - "burn ours., Prince Vasili, Anna Pavlovna, Anna Pavlovna, Oh, don't speak to me of Austria. Perhaps I don't understand\n", - "things, but Austria never has wished, and does not wish, for war.\n", - "She is betraying us! Russia alone must save Europe. Our gracious\n", - "sovereign recognizes his high vocation and will be true to it. That is\n", - "the one thing I have faith in! Our good and wonderful sovereign has to\n", - "perform the noblest role on earth, and he is so virtuous and noble\n", - "that God will not forsake him. He will fulfill his vocation and\n", - "crush the hydra of revolution, which has become more terrible than\n", - "ever in the person of this murderer and villain! We alone must\n", - "avenge the blood of the just one.... Whom, I ask you, can we rely\n", - "on?... England with her commercial spirit will not and cannot\n", - "understand the Emperor Alexander's loftiness of soul. She has\n", - "refused to evacuate Malta. She wanted to find, and still seeks, some\n", - "secret motive in our actions. What answer did Novosiltsev get? None.\n", - "The English have not understood and cannot understand the\n", - "self-abnegation of our Emperor who wants nothing for himself, but only\n", - "desires the good of mankind. And what have they promised? Nothing! And\n", - "what little they have promised they will not perform! Prussia has\n", - "always declared that Buonaparte is invincible, and that all Europe\n", - "is powerless before him.... And I don't believe a word that Hardenburg\n", - "says, or Haugwitz either. This famous Prussian neutrality is just a\n", - "trap. I have faith only in God and the lofty destiny of our adored\n", - "monarch. He will save Europe!, I think,, the prince, that if you had been\n", - "sent instead of our dear Wintzingerode you would have captured the\n", - "King of Prussia's consent by assault. You are so eloquent. Will you\n", - "give me a cup of tea?, Wintzingerode, King of Prussia, In a moment. A propos,, I am\n", - "expecting two very interesting men tonight, le Vicomte de Mortemart,\n", - "who is connected with the Montmorencys through the Rohans, one of\n", - "the best French families. He is one of the genuine emigres, the good\n", - "ones. And also the Abbe Morio. Do you know that profound thinker? He\n", - "has been received by the Emperor. Had you heard?, le Vicomte de Mortemart, Montmorencys, Rohans, Abbe Morio, the Emperor, I shall be delighted to meet them,, the prince, But tell me,, is it true that the Dowager Empress wants Baron Funke\n", - "to be appointed first secretary at Vienna? The baron by all accounts\n", - "is a poor creature., Prince Vasili, Dowager Empress Marya Fedorovna, the baron, Anna Pavlovna, the Empress, Baron Funke has been recommended to the Dowager Empress by her\n", - "sister,, the Empress, Anna Pavlovna's, Her Majesty, Baron\n", - "Funke, The prince, Anna\n", - "Pavlovna, the Empress, Now about your family. Do you know that since your daughter came\n", - "out everyone has been enraptured by her? They say she is amazingly\n", - "beautiful., The prince, I often think,, I often think how unfairly sometimes the\n", - "joys of life are distributed. Why has fate given you two such splendid\n", - "children? I don't speak of Anatole, your youngest. I don't like\n", - "him,, Anatole, Two such charming children. And really you appreciate\n", - "them less than anyone, and so you don't deserve to have them., I can't help it,, the prince, Lavater would have said I\n", - "lack the bump of paternity., Don't joke; I mean to have a serious talk with you. Do you know I\n", - "am dissatisfied with your younger son? Between ourselves, he was mentioned at Her\n", - "Majesty's and you were pitied...., The prince, What would you have me do?, You know I did all\n", - "a father could for their education, and they have both turned out\n", - "fools. Hippolyte is at least a quiet fool, but Anatole is an active\n", - "one. That is the only difference between them., And why are children born to such men as you? If you were not a\n", - "father there would be nothing I could reproach you with,, Anna\n", - "Pavlovna, I am your faithful slave and to you alone I can confess that my\n", - "children are the bane of my life. It is the cross I have to bear. That\n", - "is how I explain it to myself. It can't be helped!, Anna Pavlovna]\n" - ] - } - ], - "source": [ - "allText = bs.find_all('span', {'class':{'green', 'red'}})\n", - "print([text for text in allText])" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "7\n" - ] - } - ], - "source": [ - "nameList = bs.find_all(text='the prince')\n", - "print(len(nameList))" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[]\n" - ] - } - ], - "source": [ - "title = bs.find_all(id='title', class_='text')\n", - "print([text for text in allText])" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\n", - "\n", - "Item Title\n", - "\n", - "Description\n", - "\n", - "Cost\n", - "\n", - "Image\n", - "\n", - "\n", - "\n", - "\n", - "Vegetable Basket\n", - "\n", - "This vegetable basket is the perfect gift for your health conscious (or overweight) friends!\n", - "Now with super-colorful bell peppers!\n", - "\n", - "$15.00\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Russian Nesting Dolls\n", - "\n", - "Hand-painted by trained monkeys, these exquisite dolls are priceless! And by \"priceless,\" we mean \"extremely expensive\"! 8 entire dolls per set! Octuple the presents!\n", - "\n", - "$10,000.52\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Fish Painting\n", - "\n", - "If something seems fishy about this painting, it's because it's a fish! Also hand-painted by trained monkeys!\n", - "\n", - "$10,005.00\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Dead Parrot\n", - "\n", - "This is an ex-parrot! Or maybe he's only resting?\n", - "\n", - "$0.50\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Mystery Box\n", - "\n", - "If you love suprises, this mystery box is for you! Do not place on light-colored surfaces. May cause oil staining. Keep your friends guessing!\n", - "\n", - "$1.50\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ] - } - ], - "source": [ - "from urllib.request import urlopen\n", - "from bs4 import BeautifulSoup\n", - "\n", - "html = urlopen('http://www.pythonscraping.com/pages/page3.html')\n", - "bs = BeautifulSoup(html, 'html.parser')\n", - "\n", - "for child in bs.find('table',{'id':'giftList'}).children:\n", - " print(child)" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\n", - "\n", - "Vegetable Basket\n", - "\n", - "This vegetable basket is the perfect gift for your health conscious (or overweight) friends!\n", - "Now with super-colorful bell peppers!\n", - "\n", - "$15.00\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Russian Nesting Dolls\n", - "\n", - "Hand-painted by trained monkeys, these exquisite dolls are priceless! And by \"priceless,\" we mean \"extremely expensive\"! 8 entire dolls per set! Octuple the presents!\n", - "\n", - "$10,000.52\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Fish Painting\n", - "\n", - "If something seems fishy about this painting, it's because it's a fish! Also hand-painted by trained monkeys!\n", - "\n", - "$10,005.00\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Dead Parrot\n", - "\n", - "This is an ex-parrot! Or maybe he's only resting?\n", - "\n", - "$0.50\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Mystery Box\n", - "\n", - "If you love suprises, this mystery box is for you! Do not place on light-colored surfaces. May cause oil staining. Keep your friends guessing!\n", - "\n", - "$1.50\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ] - } - ], - "source": [ - "from urllib.request import urlopen\n", - "from bs4 import BeautifulSoup\n", - "\n", - "html = urlopen('http://www.pythonscraping.com/pages/page3.html')\n", - "bs = BeautifulSoup(html, 'html.parser')\n", - "\n", - "for sibling in bs.find('table', {'id':'giftList'}).tr.next_siblings:\n", - " print(sibling) " - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "$15.00\n", - "\n" - ] - } - ], - "source": [ - "from urllib.request import urlopen\n", - "from bs4 import BeautifulSoup\n", - "\n", - "html = urlopen('http://www.pythonscraping.com/pages/page3.html')\n", - "bs = BeautifulSoup(html, 'html.parser')\n", - "print(bs.find('img',\n", - " {'src':'../img/gifts/img1.jpg'})\n", - " .parent.previous_sibling.get_text())" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "../img/gifts/img1.jpg\n", - "../img/gifts/img2.jpg\n", - "../img/gifts/img3.jpg\n", - "../img/gifts/img4.jpg\n", - "../img/gifts/img6.jpg\n" - ] - } - ], - "source": [ - "from urllib.request import urlopen\n", - "from bs4 import BeautifulSoup\n", - "import re\n", - "\n", - "html = urlopen('http://www.pythonscraping.com/pages/page3.html')\n", - "bs = BeautifulSoup(html, 'html.parser')\n", - "images = bs.find_all('img', {'src':re.compile('\\.\\.\\/img\\/gifts/img.*\\.jpg')})\n", - "for image in images: \n", - " print(image['src'])" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[,\n", - " \n", - " Vegetable Basket\n", - " \n", - " This vegetable basket is the perfect gift for your health conscious (or overweight) friends!\n", - " Now with super-colorful bell peppers!\n", - " \n", - " $15.00\n", - " \n", - " \n", - " ,\n", - " \n", - " Russian Nesting Dolls\n", - " \n", - " Hand-painted by trained monkeys, these exquisite dolls are priceless! And by \"priceless,\" we mean \"extremely expensive\"! 8 entire dolls per set! Octuple the presents!\n", - " \n", - " $10,000.52\n", - " \n", - " \n", - " ,\n", - " \n", - " Fish Painting\n", - " \n", - " If something seems fishy about this painting, it's because it's a fish! Also hand-painted by trained monkeys!\n", - " \n", - " $10,005.00\n", - " \n", - " \n", - " ,\n", - " \n", - " Dead Parrot\n", - " \n", - " This is an ex-parrot! Or maybe he's only resting?\n", - " \n", - " $0.50\n", - " \n", - " \n", - " ,\n", - " \n", - " Mystery Box\n", - " \n", - " If you love suprises, this mystery box is for you! Do not place on light-colored surfaces. May cause oil staining. Keep your friends guessing!\n", - " \n", - " $1.50\n", - " \n", - " \n", - " ]" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bs.find_all(lambda tag: len(tag.attrs) == 2)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[Or maybe he's only resting?]" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bs.find_all(lambda tag: tag.get_text() == 'Or maybe he\\'s only resting?')" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[\"Or maybe he's only resting?\"]" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bs.find_all('', text='Or maybe he\\'s only resting?')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.1" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From 59480472145b042ecafecffb1df68639c7c274aa Mon Sep 17 00:00:00 2001 From: William <68653853+will-i-amv@users.noreply.github.com> Date: Wed, 30 Jun 2021 14:52:58 -0400 Subject: [PATCH 15/27] Delete chapter1's notebook --- v2/chapter1/Chapter01_BeginningToScrape.ipynb | 143 ------------------ 1 file changed, 143 deletions(-) delete mode 100644 v2/chapter1/Chapter01_BeginningToScrape.ipynb diff --git a/v2/chapter1/Chapter01_BeginningToScrape.ipynb b/v2/chapter1/Chapter01_BeginningToScrape.ipynb deleted file mode 100644 index 8d7a74b..0000000 --- a/v2/chapter1/Chapter01_BeginningToScrape.ipynb +++ /dev/null @@ -1,143 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "b'\\n\\nA Useful Page\\n\\n\\n

An Interesting Title

\\n
\\nLorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\\n
\\n\\n\\n'\n" - ] - } - ], - "source": [ - "from urllib.request import urlopen\n", - "\n", - "html = urlopen('http://pythonscraping.com/pages/page1.html')\n", - "print(html.read())" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "

An Interesting Title

\n" - ] - } - ], - "source": [ - "from urllib.request import urlopen\n", - "from bs4 import BeautifulSoup\n", - "\n", - "html = urlopen('http://www.pythonscraping.com/pages/page1.html')\n", - "bs = BeautifulSoup(html.read(), 'html.parser')\n", - "print(bs.h1)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The server could not be found!\n" - ] - } - ], - "source": [ - "from urllib.request import urlopen\n", - "from urllib.error import HTTPError\n", - "from urllib.error import URLError\n", - "\n", - "try:\n", - " html = urlopen(\"https://pythonscrapingthisurldoesnotexist.com\")\n", - "except HTTPError as e:\n", - " print(\"The server returned an HTTP error\")\n", - "except URLError as e:\n", - " print(\"The server could not be found!\")\n", - "else:\n", - " print(html.read())" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "

An Interesting Title

\n" - ] - } - ], - "source": [ - "from urllib.request import urlopen\n", - "from urllib.error import HTTPError\n", - "from bs4 import BeautifulSoup\n", - "\n", - "\n", - "def getTitle(url):\n", - " try:\n", - " html = urlopen(url)\n", - " except HTTPError as e:\n", - " return None\n", - " try:\n", - " bsObj = BeautifulSoup(html.read(), \"lxml\")\n", - " title = bsObj.body.h1\n", - " except AttributeError as e:\n", - " return None\n", - " return title\n", - "\n", - "\n", - "title = getTitle(\"http://www.pythonscraping.com/pages/page1.html\")\n", - "if title == None:\n", - " print(\"Title could not be found\")\n", - "else:\n", - " print(title)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.1" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From 1360c6adb789b66f6ae73f4ee34ff4289fcc5292 Mon Sep 17 00:00:00 2001 From: William <68653853+will-i-amv@users.noreply.github.com> Date: Thu, 1 Jul 2021 14:44:08 -0400 Subject: [PATCH 16/27] Move notebook to v2/chapter3 Do this to have an organized folder structure --- .../chapter3/Chapter03-web-crawlers.ipynb | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename Chapter03-web-crawlers.ipynb => v2/chapter3/Chapter03-web-crawlers.ipynb (100%) diff --git a/Chapter03-web-crawlers.ipynb b/v2/chapter3/Chapter03-web-crawlers.ipynb similarity index 100% rename from Chapter03-web-crawlers.ipynb rename to v2/chapter3/Chapter03-web-crawlers.ipynb From f683f248564cbe551b0f258e247b2369f7365ffd Mon Sep 17 00:00:00 2001 From: William <68653853+will-i-amv@users.noreply.github.com> Date: Thu, 1 Jul 2021 14:56:10 -0400 Subject: [PATCH 17/27] Add 1st example to chapter3 --- v2/chapter3/1-getPageLinks.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 v2/chapter3/1-getPageLinks.py diff --git a/v2/chapter3/1-getPageLinks.py b/v2/chapter3/1-getPageLinks.py new file mode 100644 index 0000000..740741a --- /dev/null +++ b/v2/chapter3/1-getPageLinks.py @@ -0,0 +1,10 @@ +from urllib.request import urlopen +from bs4 import BeautifulSoup + + +html = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon') +bs = BeautifulSoup(html, 'html.parser') + +for link in bs.find_all('a'): + if 'href' in link.attrs: + print(link.attrs['href']) From 1c133a6b9ac78170a6c5b815553c0f4a8d6065ca Mon Sep 17 00:00:00 2001 From: William <68653853+will-i-amv@users.noreply.github.com> Date: Thu, 1 Jul 2021 14:57:00 -0400 Subject: [PATCH 18/27] Add 2nd example to chapter3 --- v2/chapter3/2-getPageArticleLinks.py | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 v2/chapter3/2-getPageArticleLinks.py diff --git a/v2/chapter3/2-getPageArticleLinks.py b/v2/chapter3/2-getPageArticleLinks.py new file mode 100644 index 0000000..cae42c3 --- /dev/null +++ b/v2/chapter3/2-getPageArticleLinks.py @@ -0,0 +1,11 @@ +from urllib.request import urlopen +from bs4 import BeautifulSoup +import re + + +html = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon') +bs = BeautifulSoup(html, 'html.parser') +regex = re.compile('^(/wiki/)((?!:).)*$') +for link in bs.find('div', {'id':'bodyContent'}).find_all('a', href=regex): + if 'href' in link.attrs: + print(link.attrs['href']) From 026140240e0e9103a1f83a8853742d89ceb295d1 Mon Sep 17 00:00:00 2001 From: William <68653853+will-i-amv@users.noreply.github.com> Date: Thu, 1 Jul 2021 15:25:18 -0400 Subject: [PATCH 19/27] Add 3rd example to chapter3 --- v2/chapter3/3-getRandomLinks.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 v2/chapter3/3-getRandomLinks.py diff --git a/v2/chapter3/3-getRandomLinks.py b/v2/chapter3/3-getRandomLinks.py new file mode 100644 index 0000000..46188b8 --- /dev/null +++ b/v2/chapter3/3-getRandomLinks.py @@ -0,0 +1,20 @@ +from urllib.request import urlopen +from bs4 import BeautifulSoup +import datetime +import random +import re + + +def getLinks(articleUrl): + html = urlopen('http://en.wikipedia.org{}'.format(articleUrl)) + bs = BeautifulSoup(html, 'html.parser') + regex = re.compile('^(/wiki/)((?!:).)*$') + return bs.find('div', {'id':'bodyContent'}).find_all('a', href=regex) + + +random.seed(datetime.datetime.now()) +links = getLinks('/wiki/Kevin_Bacon') +while len(links) > 0: + newArticle = links[random.randint(0, len(links)-1)].attrs['href'] + print(newArticle) + links = getLinks(newArticle) From 108b604048d6fcb877f44325fbc2457a1315bc90 Mon Sep 17 00:00:00 2001 From: William <68653853+will-i-amv@users.noreply.github.com> Date: Thu, 1 Jul 2021 16:05:41 -0400 Subject: [PATCH 20/27] Add 4th example to chapter3 --- v2/chapter3/4-crawlPage.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 v2/chapter3/4-crawlPage.py diff --git a/v2/chapter3/4-crawlPage.py b/v2/chapter3/4-crawlPage.py new file mode 100644 index 0000000..59bdfe6 --- /dev/null +++ b/v2/chapter3/4-crawlPage.py @@ -0,0 +1,22 @@ +from urllib.request import urlopen +from bs4 import BeautifulSoup +import re + + +pages = set() +def getLinks(pageUrl): + global pages + html = urlopen('http://en.wikipedia.org{}'.format(pageUrl)) + bs = BeautifulSoup(html, 'html.parser') + regex = re.compile('^(/wiki/)') + for link in bs.find_all('a', href=regex): + if 'href' in link.attrs: + if link.attrs['href'] not in pages: + #We have encountered a new page + newPage = link.attrs['href'] + print(newPage) + pages.add(newPage) + getLinks(newPage) + + +getLinks('') From 9389220f692592ef4a48dfab744186694ea0ad1a Mon Sep 17 00:00:00 2001 From: William <68653853+will-i-amv@users.noreply.github.com> Date: Thu, 1 Jul 2021 16:12:16 -0400 Subject: [PATCH 21/27] Add 5th example to chapter3 --- v2/chapter3/5-crawlAndScrapePage.py | 31 +++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 v2/chapter3/5-crawlAndScrapePage.py diff --git a/v2/chapter3/5-crawlAndScrapePage.py b/v2/chapter3/5-crawlAndScrapePage.py new file mode 100644 index 0000000..cbd96a2 --- /dev/null +++ b/v2/chapter3/5-crawlAndScrapePage.py @@ -0,0 +1,31 @@ +from urllib.request import urlopen +from bs4 import BeautifulSoup +import re + + +pages = set() +def getLinks(pageUrl): + global pages + html = urlopen('http://en.wikipedia.org{}'.format(pageUrl)) + bs = BeautifulSoup(html, 'html.parser') + try: + # Find the page's title + print(bs.h1.get_text()) + # Find the page's 1st paragraph + print(bs.find(id ='mw-content-text').find_all('p')[0]) + # Find edit links (doesn't aply anymore) + print(bs.find(id='ca-edit').find('span').find('a').attrs['href']) + except AttributeError: + print('This page is missing something! Continuing.') + for link in bs.find_all('a', href=re.compile('^(/wiki/)')): + if 'href' in link.attrs: + if link.attrs['href'] not in pages: + #We have encountered a new page + newPage = link.attrs['href'] + print('-'*20) + print(newPage) + pages.add(newPage) + getLinks(newPage) + + +getLinks('') From c01d6883709c2973c3c215ddb3d244d3a4849417 Mon Sep 17 00:00:00 2001 From: William <68653853+will-i-amv@users.noreply.github.com> Date: Thu, 1 Jul 2021 17:23:35 -0400 Subject: [PATCH 22/27] Add 6th example to chapter3 --- v1/chapter3/6-getRandomExternalLinks.py | 65 +++++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 v1/chapter3/6-getRandomExternalLinks.py diff --git a/v1/chapter3/6-getRandomExternalLinks.py b/v1/chapter3/6-getRandomExternalLinks.py new file mode 100644 index 0000000..cb52401 --- /dev/null +++ b/v1/chapter3/6-getRandomExternalLinks.py @@ -0,0 +1,65 @@ +from urllib.request import urlopen +from urllib.parse import urlparse +from bs4 import BeautifulSoup +import re +import datetime +import random + + +pages = set() +random.seed(datetime.datetime.now()) + + +#Retrieves a list of all Internal links found on a page +def getInternalLinks(bs, includeUrl): + includeUrl = '{}://{}'.format( + urlparse(includeUrl).scheme, + urlparse(includeUrl).netloc) + internalLinks = [] + #Finds all links that begin with a "/" + regex = re.compile('^(/|.*'+includeUrl+')') + for link in bs.find_all('a', href=regex): + if link.attrs['href'] is not None: + if link.attrs['href'] not in internalLinks: + if(link.attrs['href'].startswith('/')): + internalLinks.append(includeUrl+link.attrs['href']) + else: + internalLinks.append(link.attrs['href']) + return internalLinks + + +#Retrieves a list of all external links found on a page +def getExternalLinks(bs, excludeUrl): + externalLinks = [] + #Finds all links that start with "http" that don't contain the current URL + regex = re.compile('^(http|www)((?!'+excludeUrl+').)*$') + for link in bs.find_all('a', href=regex): + if link.attrs['href'] is not None: + if link.attrs['href'] not in externalLinks: + externalLinks.append(link.attrs['href']) + return externalLinks + + +def getRandomExternalLink(startingPage): + html = urlopen(startingPage) + bs = BeautifulSoup(html, 'html.parser') + externalLinks = getExternalLinks(bs, urlparse(startingPage).netloc) + if len(externalLinks) == 0: + print('No external links, looking around the site for one') + domain = '{}://{}'.format( + urlparse(startingPage).scheme, + urlparse(startingPage).netloc) + internalLinks = getInternalLinks(bs, domain) + return getRandomExternalLink( + internalLinks[random.randint(0, len(internalLinks)-1)]) + else: + return externalLinks[random.randint(0, len(externalLinks)-1)] + + +def followExternalOnly(startingSite): + externalLink = getRandomExternalLink(startingSite) + print('Random external link is: {}'.format(externalLink)) + followExternalOnly(externalLink) + + +followExternalOnly('http://oreilly.com') From 384171c116984a2276ba0794d37e779805e7c136 Mon Sep 17 00:00:00 2001 From: William <68653853+will-i-amv@users.noreply.github.com> Date: Thu, 1 Jul 2021 17:27:18 -0400 Subject: [PATCH 23/27] Move from v1 to v2 folder --- {v1 => v2}/chapter3/6-getRandomExternalLinks.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {v1 => v2}/chapter3/6-getRandomExternalLinks.py (100%) diff --git a/v1/chapter3/6-getRandomExternalLinks.py b/v2/chapter3/6-getRandomExternalLinks.py similarity index 100% rename from v1/chapter3/6-getRandomExternalLinks.py rename to v2/chapter3/6-getRandomExternalLinks.py From 2812eea510aa14bd548731fdf38a39e3ad68575d Mon Sep 17 00:00:00 2001 From: William <68653853+will-i-amv@users.noreply.github.com> Date: Thu, 1 Jul 2021 17:28:40 -0400 Subject: [PATCH 24/27] Add 7th example to chapter3 --- v2/chapter3/7-getAllExternalLinks.py | 61 ++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 v2/chapter3/7-getAllExternalLinks.py diff --git a/v2/chapter3/7-getAllExternalLinks.py b/v2/chapter3/7-getAllExternalLinks.py new file mode 100644 index 0000000..0913530 --- /dev/null +++ b/v2/chapter3/7-getAllExternalLinks.py @@ -0,0 +1,61 @@ +import re +from urllib.request import urlopen +from urllib.parse import urlparse +from bs4 import BeautifulSoup + + +allExtLinks = set() +allIntLinks = set() + + +#Retrieves a list of all Internal links found on a page +def getInternalLinks(bs, includeUrl): + includeUrl = '{}://{}'.format( + urlparse(includeUrl).scheme, + urlparse(includeUrl).netloc) + internalLinks = [] + #Finds all links that begin with a "/" + regex = re.compile('^(/|.*'+includeUrl+')') + for link in bs.find_all('a', href=regex): + if link.attrs['href'] is not None: + if link.attrs['href'] not in internalLinks: + if(link.attrs['href'].startswith('/')): + internalLinks.append(includeUrl+link.attrs['href']) + else: + internalLinks.append(link.attrs['href']) + return internalLinks + + +#Retrieves a list of all external links found on a page +def getExternalLinks(bs, excludeUrl): + externalLinks = [] + #Finds all links that start with "http" that don't contain the current URL + regex = re.compile('^(http|www)((?!'+excludeUrl+').)*$') + for link in bs.find_all('a', href=regex): + if link.attrs['href'] is not None: + if link.attrs['href'] not in externalLinks: + externalLinks.append(link.attrs['href']) + return externalLinks + + +# Collects a list of all external URLs found on the site +def getAllExternalLinks(siteUrl): + html = urlopen(siteUrl) + domain = '{}://{}'.format( + urlparse(siteUrl).scheme, + urlparse(siteUrl).netloc) + bs = BeautifulSoup(html, 'html.parser') + internalLinks = getInternalLinks(bs, domain) + externalLinks = getExternalLinks(bs, domain) + for link in externalLinks: + if link not in allExtLinks: + allExtLinks.add(link) + print(link) + for link in internalLinks: + if link not in allIntLinks: + allIntLinks.add(link) + getAllExternalLinks(link) + + +allIntLinks.add('http://oreilly.com') +getAllExternalLinks('http://oreilly.com') From 00715d810f8c1ac34e8353ed02a0205f24030c3a Mon Sep 17 00:00:00 2001 From: William <68653853+will-i-amv@users.noreply.github.com> Date: Fri, 2 Jul 2021 10:36:34 -0400 Subject: [PATCH 25/27] Move notebook to v2/chapter4 --- .../chapter4/Chapter04_CrawlingModels.ipynb | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename Chapter04_CrawlingModels.ipynb => v2/chapter4/Chapter04_CrawlingModels.ipynb (100%) diff --git a/Chapter04_CrawlingModels.ipynb b/v2/chapter4/Chapter04_CrawlingModels.ipynb similarity index 100% rename from Chapter04_CrawlingModels.ipynb rename to v2/chapter4/Chapter04_CrawlingModels.ipynb From 6e7e293d3702f58e97bcf140f1ebeede29f0f6fa Mon Sep 17 00:00:00 2001 From: William <68653853+will-i-amv@users.noreply.github.com> Date: Fri, 2 Jul 2021 10:45:27 -0400 Subject: [PATCH 26/27] Add back chapter1's notebook --- v2/chapter1/Chapter01_BeginningToScrape.ipynb | 143 ++++++++++++++++++ 1 file changed, 143 insertions(+) create mode 100644 v2/chapter1/Chapter01_BeginningToScrape.ipynb diff --git a/v2/chapter1/Chapter01_BeginningToScrape.ipynb b/v2/chapter1/Chapter01_BeginningToScrape.ipynb new file mode 100644 index 0000000..8d7a74b --- /dev/null +++ b/v2/chapter1/Chapter01_BeginningToScrape.ipynb @@ -0,0 +1,143 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "b'\\n\\nA Useful Page\\n\\n\\n

An Interesting Title

\\n
\\nLorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\\n
\\n\\n\\n'\n" + ] + } + ], + "source": [ + "from urllib.request import urlopen\n", + "\n", + "html = urlopen('http://pythonscraping.com/pages/page1.html')\n", + "print(html.read())" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "

An Interesting Title

\n" + ] + } + ], + "source": [ + "from urllib.request import urlopen\n", + "from bs4 import BeautifulSoup\n", + "\n", + "html = urlopen('http://www.pythonscraping.com/pages/page1.html')\n", + "bs = BeautifulSoup(html.read(), 'html.parser')\n", + "print(bs.h1)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The server could not be found!\n" + ] + } + ], + "source": [ + "from urllib.request import urlopen\n", + "from urllib.error import HTTPError\n", + "from urllib.error import URLError\n", + "\n", + "try:\n", + " html = urlopen(\"https://pythonscrapingthisurldoesnotexist.com\")\n", + "except HTTPError as e:\n", + " print(\"The server returned an HTTP error\")\n", + "except URLError as e:\n", + " print(\"The server could not be found!\")\n", + "else:\n", + " print(html.read())" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "

An Interesting Title

\n" + ] + } + ], + "source": [ + "from urllib.request import urlopen\n", + "from urllib.error import HTTPError\n", + "from bs4 import BeautifulSoup\n", + "\n", + "\n", + "def getTitle(url):\n", + " try:\n", + " html = urlopen(url)\n", + " except HTTPError as e:\n", + " return None\n", + " try:\n", + " bsObj = BeautifulSoup(html.read(), \"lxml\")\n", + " title = bsObj.body.h1\n", + " except AttributeError as e:\n", + " return None\n", + " return title\n", + "\n", + "\n", + "title = getTitle(\"http://www.pythonscraping.com/pages/page1.html\")\n", + "if title == None:\n", + " print(\"Title could not be found\")\n", + "else:\n", + " print(title)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 3738c318f5673f2ae8adbadb7fdb27be09235cab Mon Sep 17 00:00:00 2001 From: William <68653853+will-i-amv@users.noreply.github.com> Date: Fri, 2 Jul 2021 10:46:13 -0400 Subject: [PATCH 27/27] Add back chapter2's notebook --- .../Chapter02-AdvancedHTMLParsing.ipynb | 597 ++++++++++++++++++ 1 file changed, 597 insertions(+) create mode 100644 v2/chapter2/Chapter02-AdvancedHTMLParsing.ipynb diff --git a/v2/chapter2/Chapter02-AdvancedHTMLParsing.ipynb b/v2/chapter2/Chapter02-AdvancedHTMLParsing.ipynb new file mode 100644 index 0000000..6779c08 --- /dev/null +++ b/v2/chapter2/Chapter02-AdvancedHTMLParsing.ipynb @@ -0,0 +1,597 @@ +{ + "cells": [ + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "from urllib.request import urlopen\n", + "from bs4 import BeautifulSoup\n", + "html = urlopen('http://www.pythonscraping.com/pages/warandpeace.html')\n", + "bs = BeautifulSoup(html, 'html.parser')\n", + "print(bs)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from urllib.request import urlopen\n", + "from bs4 import BeautifulSoup\n", + "html = urlopen('http://www.pythonscraping.com/pages/warandpeace.html')\n", + "bs = BeautifulSoup(html, \"html.parser\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Anna\n", + "Pavlovna Scherer\n", + "Empress Marya\n", + "Fedorovna\n", + "Prince Vasili Kuragin\n", + "Anna Pavlovna\n", + "St. Petersburg\n", + "the prince\n", + "Anna Pavlovna\n", + "Anna Pavlovna\n", + "the prince\n", + "the prince\n", + "the prince\n", + "Prince Vasili\n", + "Anna Pavlovna\n", + "Anna Pavlovna\n", + "the prince\n", + "Wintzingerode\n", + "King of Prussia\n", + "le Vicomte de Mortemart\n", + "Montmorencys\n", + "Rohans\n", + "Abbe Morio\n", + "the Emperor\n", + "the prince\n", + "Prince Vasili\n", + "Dowager Empress Marya Fedorovna\n", + "the baron\n", + "Anna Pavlovna\n", + "the Empress\n", + "the Empress\n", + "Anna Pavlovna's\n", + "Her Majesty\n", + "Baron\n", + "Funke\n", + "The prince\n", + "Anna\n", + "Pavlovna\n", + "the Empress\n", + "The prince\n", + "Anatole\n", + "the prince\n", + "The prince\n", + "Anna\n", + "Pavlovna\n", + "Anna Pavlovna\n" + ] + } + ], + "source": [ + "nameList = bs.findAll('span', {'class': 'green'})\n", + "for name in nameList:\n", + " print(name.get_text())" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[

War and Peace

,

Chapter 1

]\n" + ] + } + ], + "source": [ + "titles = bs.find_all(['h1', 'h2','h3','h4','h5','h6'])\n", + "print([title for title in titles])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Well, Prince, so Genoa and Lucca are now just family estates of the\n", + "Buonapartes. But I warn you, if you don't tell me that this means war,\n", + "if you still try to defend the infamies and horrors perpetrated by\n", + "that Antichrist- I really believe he is Antichrist- I will have\n", + "nothing more to do with you and you are no longer my friend, no longer\n", + "my 'faithful slave,' as you call yourself! But how do you do? I see\n", + "I have frightened you- sit down and tell me all the news., Anna\n", + "Pavlovna Scherer, Empress Marya\n", + "Fedorovna, Prince Vasili Kuragin, Anna Pavlovna, St. Petersburg, If you have nothing better to do, Count [or Prince], and if the\n", + "prospect of spending an evening with a poor invalid is not too\n", + "terrible, I shall be very charmed to see you tonight between 7 and 10-\n", + "Annette Scherer., Heavens! what a virulent attack!, the prince, Anna Pavlovna, First of all, dear friend, tell me how you are. Set your friend's\n", + "mind at rest,, Can one be well while suffering morally? Can one be calm in times\n", + "like these if one has any feeling?, Anna Pavlovna, You are\n", + "staying the whole evening, I hope?, And the fete at the English ambassador's? Today is Wednesday. I\n", + "must put in an appearance there,, the prince, My daughter is\n", + "coming for me to take me there., I thought today's fete had been canceled. I confess all these\n", + "festivities and fireworks are becoming wearisome., If they had known that you wished it, the entertainment would\n", + "have been put off,, the prince, Don't tease! Well, and what has been decided about Novosiltsev's\n", + "dispatch? You know everything., What can one say about it?, the prince, What has been decided? They have decided that\n", + "Buonaparte has burnt his boats, and I believe that we are ready to\n", + "burn ours., Prince Vasili, Anna Pavlovna, Anna Pavlovna, Oh, don't speak to me of Austria. Perhaps I don't understand\n", + "things, but Austria never has wished, and does not wish, for war.\n", + "She is betraying us! Russia alone must save Europe. Our gracious\n", + "sovereign recognizes his high vocation and will be true to it. That is\n", + "the one thing I have faith in! Our good and wonderful sovereign has to\n", + "perform the noblest role on earth, and he is so virtuous and noble\n", + "that God will not forsake him. He will fulfill his vocation and\n", + "crush the hydra of revolution, which has become more terrible than\n", + "ever in the person of this murderer and villain! We alone must\n", + "avenge the blood of the just one.... Whom, I ask you, can we rely\n", + "on?... England with her commercial spirit will not and cannot\n", + "understand the Emperor Alexander's loftiness of soul. She has\n", + "refused to evacuate Malta. She wanted to find, and still seeks, some\n", + "secret motive in our actions. What answer did Novosiltsev get? None.\n", + "The English have not understood and cannot understand the\n", + "self-abnegation of our Emperor who wants nothing for himself, but only\n", + "desires the good of mankind. And what have they promised? Nothing! And\n", + "what little they have promised they will not perform! Prussia has\n", + "always declared that Buonaparte is invincible, and that all Europe\n", + "is powerless before him.... And I don't believe a word that Hardenburg\n", + "says, or Haugwitz either. This famous Prussian neutrality is just a\n", + "trap. I have faith only in God and the lofty destiny of our adored\n", + "monarch. He will save Europe!, I think,, the prince, that if you had been\n", + "sent instead of our dear Wintzingerode you would have captured the\n", + "King of Prussia's consent by assault. You are so eloquent. Will you\n", + "give me a cup of tea?, Wintzingerode, King of Prussia, In a moment. A propos,, I am\n", + "expecting two very interesting men tonight, le Vicomte de Mortemart,\n", + "who is connected with the Montmorencys through the Rohans, one of\n", + "the best French families. He is one of the genuine emigres, the good\n", + "ones. And also the Abbe Morio. Do you know that profound thinker? He\n", + "has been received by the Emperor. Had you heard?, le Vicomte de Mortemart, Montmorencys, Rohans, Abbe Morio, the Emperor, I shall be delighted to meet them,, the prince, But tell me,, is it true that the Dowager Empress wants Baron Funke\n", + "to be appointed first secretary at Vienna? The baron by all accounts\n", + "is a poor creature., Prince Vasili, Dowager Empress Marya Fedorovna, the baron, Anna Pavlovna, the Empress, Baron Funke has been recommended to the Dowager Empress by her\n", + "sister,, the Empress, Anna Pavlovna's, Her Majesty, Baron\n", + "Funke, The prince, Anna\n", + "Pavlovna, the Empress, Now about your family. Do you know that since your daughter came\n", + "out everyone has been enraptured by her? They say she is amazingly\n", + "beautiful., The prince, I often think,, I often think how unfairly sometimes the\n", + "joys of life are distributed. Why has fate given you two such splendid\n", + "children? I don't speak of Anatole, your youngest. I don't like\n", + "him,, Anatole, Two such charming children. And really you appreciate\n", + "them less than anyone, and so you don't deserve to have them., I can't help it,, the prince, Lavater would have said I\n", + "lack the bump of paternity., Don't joke; I mean to have a serious talk with you. Do you know I\n", + "am dissatisfied with your younger son? Between ourselves, he was mentioned at Her\n", + "Majesty's and you were pitied...., The prince, What would you have me do?, You know I did all\n", + "a father could for their education, and they have both turned out\n", + "fools. Hippolyte is at least a quiet fool, but Anatole is an active\n", + "one. That is the only difference between them., And why are children born to such men as you? If you were not a\n", + "father there would be nothing I could reproach you with,, Anna\n", + "Pavlovna, I am your faithful slave and to you alone I can confess that my\n", + "children are the bane of my life. It is the cross I have to bear. That\n", + "is how I explain it to myself. It can't be helped!, Anna Pavlovna]\n" + ] + } + ], + "source": [ + "allText = bs.find_all('span', {'class':{'green', 'red'}})\n", + "print([text for text in allText])" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "7\n" + ] + } + ], + "source": [ + "nameList = bs.find_all(text='the prince')\n", + "print(len(nameList))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[]\n" + ] + } + ], + "source": [ + "title = bs.find_all(id='title', class_='text')\n", + "print([text for text in allText])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n", + "Item Title\n", + "\n", + "Description\n", + "\n", + "Cost\n", + "\n", + "Image\n", + "\n", + "\n", + "\n", + "\n", + "Vegetable Basket\n", + "\n", + "This vegetable basket is the perfect gift for your health conscious (or overweight) friends!\n", + "Now with super-colorful bell peppers!\n", + "\n", + "$15.00\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "Russian Nesting Dolls\n", + "\n", + "Hand-painted by trained monkeys, these exquisite dolls are priceless! And by \"priceless,\" we mean \"extremely expensive\"! 8 entire dolls per set! Octuple the presents!\n", + "\n", + "$10,000.52\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "Fish Painting\n", + "\n", + "If something seems fishy about this painting, it's because it's a fish! Also hand-painted by trained monkeys!\n", + "\n", + "$10,005.00\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "Dead Parrot\n", + "\n", + "This is an ex-parrot! Or maybe he's only resting?\n", + "\n", + "$0.50\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "Mystery Box\n", + "\n", + "If you love suprises, this mystery box is for you! Do not place on light-colored surfaces. May cause oil staining. Keep your friends guessing!\n", + "\n", + "$1.50\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + } + ], + "source": [ + "from urllib.request import urlopen\n", + "from bs4 import BeautifulSoup\n", + "\n", + "html = urlopen('http://www.pythonscraping.com/pages/page3.html')\n", + "bs = BeautifulSoup(html, 'html.parser')\n", + "\n", + "for child in bs.find('table',{'id':'giftList'}).children:\n", + " print(child)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n", + "Vegetable Basket\n", + "\n", + "This vegetable basket is the perfect gift for your health conscious (or overweight) friends!\n", + "Now with super-colorful bell peppers!\n", + "\n", + "$15.00\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "Russian Nesting Dolls\n", + "\n", + "Hand-painted by trained monkeys, these exquisite dolls are priceless! And by \"priceless,\" we mean \"extremely expensive\"! 8 entire dolls per set! Octuple the presents!\n", + "\n", + "$10,000.52\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "Fish Painting\n", + "\n", + "If something seems fishy about this painting, it's because it's a fish! Also hand-painted by trained monkeys!\n", + "\n", + "$10,005.00\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "Dead Parrot\n", + "\n", + "This is an ex-parrot! Or maybe he's only resting?\n", + "\n", + "$0.50\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "Mystery Box\n", + "\n", + "If you love suprises, this mystery box is for you! Do not place on light-colored surfaces. May cause oil staining. Keep your friends guessing!\n", + "\n", + "$1.50\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + } + ], + "source": [ + "from urllib.request import urlopen\n", + "from bs4 import BeautifulSoup\n", + "\n", + "html = urlopen('http://www.pythonscraping.com/pages/page3.html')\n", + "bs = BeautifulSoup(html, 'html.parser')\n", + "\n", + "for sibling in bs.find('table', {'id':'giftList'}).tr.next_siblings:\n", + " print(sibling) " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "$15.00\n", + "\n" + ] + } + ], + "source": [ + "from urllib.request import urlopen\n", + "from bs4 import BeautifulSoup\n", + "\n", + "html = urlopen('http://www.pythonscraping.com/pages/page3.html')\n", + "bs = BeautifulSoup(html, 'html.parser')\n", + "print(bs.find('img',\n", + " {'src':'../img/gifts/img1.jpg'})\n", + " .parent.previous_sibling.get_text())" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "../img/gifts/img1.jpg\n", + "../img/gifts/img2.jpg\n", + "../img/gifts/img3.jpg\n", + "../img/gifts/img4.jpg\n", + "../img/gifts/img6.jpg\n" + ] + } + ], + "source": [ + "from urllib.request import urlopen\n", + "from bs4 import BeautifulSoup\n", + "import re\n", + "\n", + "html = urlopen('http://www.pythonscraping.com/pages/page3.html')\n", + "bs = BeautifulSoup(html, 'html.parser')\n", + "images = bs.find_all('img', {'src':re.compile('\\.\\.\\/img\\/gifts/img.*\\.jpg')})\n", + "for image in images: \n", + " print(image['src'])" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[,\n", + " \n", + " Vegetable Basket\n", + " \n", + " This vegetable basket is the perfect gift for your health conscious (or overweight) friends!\n", + " Now with super-colorful bell peppers!\n", + " \n", + " $15.00\n", + " \n", + " \n", + " ,\n", + " \n", + " Russian Nesting Dolls\n", + " \n", + " Hand-painted by trained monkeys, these exquisite dolls are priceless! And by \"priceless,\" we mean \"extremely expensive\"! 8 entire dolls per set! Octuple the presents!\n", + " \n", + " $10,000.52\n", + " \n", + " \n", + " ,\n", + " \n", + " Fish Painting\n", + " \n", + " If something seems fishy about this painting, it's because it's a fish! Also hand-painted by trained monkeys!\n", + " \n", + " $10,005.00\n", + " \n", + " \n", + " ,\n", + " \n", + " Dead Parrot\n", + " \n", + " This is an ex-parrot! Or maybe he's only resting?\n", + " \n", + " $0.50\n", + " \n", + " \n", + " ,\n", + " \n", + " Mystery Box\n", + " \n", + " If you love suprises, this mystery box is for you! Do not place on light-colored surfaces. May cause oil staining. Keep your friends guessing!\n", + " \n", + " $1.50\n", + " \n", + " \n", + " ]" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bs.find_all(lambda tag: len(tag.attrs) == 2)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Or maybe he's only resting?]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bs.find_all(lambda tag: tag.get_text() == 'Or maybe he\\'s only resting?')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[\"Or maybe he's only resting?\"]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bs.find_all('', text='Or maybe he\\'s only resting?')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}