From 6fd2f7e7c6043ded91537bc3cd1c2299036949d9 Mon Sep 17 00:00:00 2001
From: masamichiIto
Date: Wed, 16 Aug 2023 22:55:54 +0900
Subject: [PATCH 01/17] scrapetest.py created

---
 scrapetest.py | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 scrapetest.py

diff --git a/scrapetest.py b/scrapetest.py
new file mode 100644
index 0000000..66fdd21
--- /dev/null
+++ b/scrapetest.py
@@ -0,0 +1,4 @@
+from urllib.request import urlopen
+
+html = urlopen('http://pythonscraping.com/pages/page1.html')
+print(html.read())
\ No newline at end of file

From bd0f127a5fd17e88039e374e6a98672324d6ead7 Mon Sep 17 00:00:00 2001
From: masamichiIto
Date: Wed, 16 Aug 2023 22:57:51 +0900
Subject: [PATCH 02/17] scrapetest.py created

---
 scrapetest.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scrapetest.py b/scrapetest.py
index 66fdd21..28ea3ea 100644
--- a/scrapetest.py
+++ b/scrapetest.py
@@ -1,4 +1,4 @@
 from urllib.request import urlopen
 
 html = urlopen('http://pythonscraping.com/pages/page1.html')
-print(html.read())
\ No newline at end of file
+print(html.read())

From 5255b7a8e40d38a962e0bdab95f3fad32ccab18e Mon Sep 17 00:00:00 2001
From: masamichiIto
Date: Wed, 16 Aug 2023 23:06:57 +0900
Subject: [PATCH 03/17] scrapetest.py updated

---
 scrapetest.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/scrapetest.py b/scrapetest.py
index 28ea3ea..7076109 100644
--- a/scrapetest.py
+++ b/scrapetest.py
@@ -1,4 +1,7 @@
 from urllib.request import urlopen
+from bs4 import BeautifulSoup
 
 html = urlopen('http://pythonscraping.com/pages/page1.html')
-print(html.read())
+#print(html.read())
+bs = BeautifulSoup(html.read(), 'html.parser')
+print(bs.h1)
\ No newline at end of file

From 79daf92fb161659ec9bfee5ca59f4f15db653435 Mon Sep 17 00:00:00 2001
From: masamichiIto
Date: Mon, 21 Aug 2023 23:08:08 +0900
Subject: [PATCH 04/17] scrapetest.py updated

---
 scrapetest.py | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/scrapetest.py b/scrapetest.py
index 7076109..6916f2c 100644
--- a/scrapetest.py
+++ b/scrapetest.py
@@ -1,7 +1,18 @@
 from urllib.request import urlopen
+from urllib.error import HTTPError, URLError
 from bs4 import BeautifulSoup
 
-html = urlopen('http://pythonscraping.com/pages/page1.html')
+try:
+    html = urlopen('http://pythonscraping.com/pages/page1.html')
+except HTTPError as e:
+    print(e)
+    # return None, break, or run some other handling here
 #print(html.read())
-bs = BeautifulSoup(html.read(), 'html.parser')
-print(bs.h1)
\ No newline at end of file
+except URLError as e:
+    print("The server could not be found!")
+else:
+    # the program continues.
+    # NOTE: if the except clause returns or breaks, this else block is never executed, so it is not strictly needed.
+    print("It worked!")
+#bs = BeautifulSoup(html.read(), 'html.parser')
+#print(bs.h1) # fetches the first h1 tag from the top of the document; note that when there are multiple h1 tags, only the first one is returned.
\ No newline at end of file
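Note on PATCH 04: the order of the two except clauses matters here. HTTPError is a subclass of URLError, so the more specific handler has to come first; if URLError were caught first, it would swallow HTTP errors as well. A minimal sketch of the same pattern, assuming the pythonscraping.com example page is still reachable:

    from urllib.request import urlopen
    from urllib.error import HTTPError, URLError

    try:
        html = urlopen('http://pythonscraping.com/pages/page1.html')
    except HTTPError as e:
        # the server was reached but returned a 4xx/5xx status; e.code holds it
        print('HTTP error:', e.code)
    except URLError as e:
        # the server could not be reached at all; e.reason explains why
        print('Server unreachable:', e.reason)
    else:
        print(html.read()[:100])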
From b9ae315a2bebf4648ac3ae9271ea20bc6e5a7acd Mon Sep 17 00:00:00 2001
From: masamichiIto
Date: Mon, 21 Aug 2023 23:17:20 +0900
Subject: [PATCH 05/17] scrapetest2.py, which includes a try-except procedure,
 was created

---
 scrapetest2.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 scrapetest2.py

diff --git a/scrapetest2.py b/scrapetest2.py
new file mode 100644
index 0000000..5f6baee
--- /dev/null
+++ b/scrapetest2.py
@@ -0,0 +1,21 @@
+from urllib.request import urlopen
+from urllib.error import HTTPError, URLError
+from bs4 import BeautifulSoup
+
+def getTitle(url):
+    try:
+        html = urlopen(url)
+    except HTTPError as e:
+        return None
+    try:
+        bs = BeautifulSoup(html.read(), 'html.parser')
+        title = bs.body.h1
+    except AttributeError as e:
+        return None
+    return title
+
+title = getTitle('http://pythonscraping.com/pages/page1.html')
+if title == None:
+    print('Title could not be found')
+else:
+    print(title)

From d149571686f6b3d99c44b0063d9e81edbb05ed3e Mon Sep 17 00:00:00 2001
From: masamichiIto
Date: Tue, 22 Aug 2023 22:11:36 +0900
Subject: [PATCH 06/17] chap2-2 finished

---
 chap2_work.py | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)
 create mode 100644 chap2_work.py

diff --git a/chap2_work.py b/chap2_work.py
new file mode 100644
index 0000000..fb5f504
--- /dev/null
+++ b/chap2_work.py
@@ -0,0 +1,27 @@
+from urllib.request import urlopen
+from bs4 import BeautifulSoup
+
+html = urlopen('http://www.pythonscraping.com/pages/warandpeace.html')
+bs = BeautifulSoup(html.read(), 'html.parser')
+
+nameList = bs.find_all('span', {'class':'green'})
+for name in nameList:
+    print(name.get_text())
+
+# counting the number of times 'the prince' appears
+nameList = bs.find_all(string='the prince')
+print(len(nameList))
+
+## 2-2-3-1
+html = urlopen('http://www.pythonscraping.com/pages/page3.html')
+bs = BeautifulSoup(html.read(), 'html.parser')
+
+for child in bs.find('table', {'id': 'giftList'}).children:
+    print(child)
+
+## 2-2-3-2
+for sibling in bs.find('table', {'id':'giftList'}).tr.next_siblings:
+    print(sibling)
+
+## 2-2-3-3
+print(bs.find('img', {'src':'../img/gifts/img1.jpg'}).parent.previous_sibling.get_text())
\ No newline at end of file

From c4ec9a8b7b2dc08f95c8ceb23d2222897971e9c7 Mon Sep 17 00:00:00 2001
From: masamichiIto
Date: Tue, 22 Aug 2023 22:27:24 +0900
Subject: [PATCH 07/17] chap2-4 finished

---
 chap2_work.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/chap2_work.py b/chap2_work.py
index fb5f504..bba3996 100644
--- a/chap2_work.py
+++ b/chap2_work.py
@@ -1,5 +1,6 @@
 from urllib.request import urlopen
 from bs4 import BeautifulSoup
+import re
 
 html = urlopen('http://www.pythonscraping.com/pages/warandpeace.html')
 bs = BeautifulSoup(html.read(), 'html.parser')
@@ -24,4 +25,9 @@
     print(sibling)
 
 ## 2-2-3-3
-print(bs.find('img', {'src':'../img/gifts/img1.jpg'}).parent.previous_sibling.get_text())
\ No newline at end of file
+print(bs.find('img', {'src':'../img/gifts/img1.jpg'}).parent.previous_sibling.get_text())
+
+## chap2-4: regular expression
+images = bs.find_all('img', {'src':re.compile('..\/img\/gifts\/img.*.jpg')})
+for image in images:
+    print(image['src'])
\ No newline at end of file
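Note on PATCH 07: the pattern '..\/img\/gifts\/img.*.jpg' works, but only loosely: '\/' is an unneeded escape that newer Python versions warn about in plain string literals, and the unescaped dots match any character rather than a literal period. A tighter raw-string equivalent, checked offline against invented sample src values:

    import re

    # raw-string version of the pattern used in the patch
    pattern = re.compile(r'\.\./img/gifts/img.*\.jpg')

    for src in ['../img/gifts/img1.jpg', '../img/gifts/logo.png']:
        print(src, '->', bool(pattern.search(src)))
    # ../img/gifts/img1.jpg -> True
    # ../img/gifts/logo.png -> False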
From 90bfc7aa1d1110bab61f738132da153bada546e6 Mon Sep 17 00:00:00 2001
From: masamichiIto
Date: Wed, 23 Aug 2023 22:55:14 +0900
Subject: [PATCH 08/17] chap3 started

---
 chap2_work.py |  3 ++-
 chap3_work.py | 10 ++++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)
 create mode 100644 chap3_work.py

diff --git a/chap2_work.py b/chap2_work.py
index bba3996..c276dae 100644
--- a/chap2_work.py
+++ b/chap2_work.py
@@ -30,4 +30,5 @@
 ## chap2-4: regular expression
 images = bs.find_all('img', {'src':re.compile('..\/img\/gifts\/img.*.jpg')})
 for image in images:
-    print(image['src'])
\ No newline at end of file
+    print(image['src'])
+
diff --git a/chap3_work.py b/chap3_work.py
new file mode 100644
index 0000000..7e619bf
--- /dev/null
+++ b/chap3_work.py
@@ -0,0 +1,10 @@
+from urllib.request import urlopen
+from bs4 import BeautifulSoup
+import re
+
+html = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon')
+bs = BeautifulSoup(html, 'html.parser')
+for link in bs.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$')):
+    # Wrapping a pattern that starts with ?! in parentheses expresses "does not contain that pattern": (?!:). matches a single character that is not a colon, and ((?!:).)* matches a string of zero or more non-colon characters
+    if 'href' in link.attrs:
+        print(link.attrs['href'])
\ No newline at end of file

From d86e4b3caf9205dd62ed9733de1db1df24b4a7ce Mon Sep 17 00:00:00 2001
From: masamichiIto
Date: Wed, 23 Aug 2023 23:01:57 +0900
Subject: [PATCH 09/17] chap3 updated

---
 chap3_work.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/chap3_work.py b/chap3_work.py
index 7e619bf..6ec9d4d 100644
--- a/chap3_work.py
+++ b/chap3_work.py
@@ -7,4 +7,20 @@
 for link in bs.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$')):
     # Wrapping a pattern that starts with ?! in parentheses expresses "does not contain that pattern": (?!:). matches a single character that is not a colon, and ((?!:).)* matches a string of zero or more non-colon characters
     if 'href' in link.attrs:
-        print(link.attrs['href'])
\ No newline at end of file
+        print(link.attrs['href'])
+
+## around p. 35
+import datetime
+import random
+
+random.seed(datetime.datetime.now())
+def getLinks(articleUrl):
+    html = urlopen('http://en.wikipedia.org{}'.format(articleUrl))
+    bs = BeautifulSoup(html, 'html.parser')
+    return bs.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))
+
+links = getLinks('/wiki/Kevin_Bacon')
+while len(links) > 0:
+    newArticle = links[random.randint(0, len(links)-1)].attrs['href']
+    print(newArticle)
+    links = getLinks(newArticle)
\ No newline at end of file

From 7c01c06a193583130bcd97ae5ac66575d7bc6663 Mon Sep 17 00:00:00 2001
From: masamichiIto
Date: Wed, 23 Aug 2023 23:14:03 +0900
Subject: [PATCH 10/17] chap3 updated

---
 chap3_work.py | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/chap3_work.py b/chap3_work.py
index 6ec9d4d..5a6cab8 100644
--- a/chap3_work.py
+++ b/chap3_work.py
@@ -10,6 +10,8 @@
         print(link.attrs['href'])
 
 ## around p. 35
+"""
+# The stopping condition is rarely satisfied, so this has to be interrupted with Ctrl+C at some point
 import datetime
 import random
 
@@ -23,4 +25,23 @@ def getLinks(articleUrl):
 while len(links) > 0:
     newArticle = links[random.randint(0, len(links)-1)].attrs['href']
     print(newArticle)
-    links = getLinks(newArticle)
\ No newline at end of file
+    links = getLinks(newArticle)
+"""
+
+## 3-2
+# The stopping condition is rarely satisfied, so this has to be interrupted with Ctrl+C at some point
+pages = set()
+def getLinks(pageUrl):
+    global pages
+    html = urlopen('http://en.wikipedia.org{}'.format(pageUrl))
+    bs = BeautifulSoup(html, 'html.parser')
+    for link in bs.find_all('a', href=re.compile('^(/wiki/)')):
+        if 'href' in link.attrs:
+            if link.attrs['href'] not in pages:
+                # we have encountered a new page
+                newPage = link.attrs['href']
+                print(newPage)
+                pages.add(newPage)
+                getLinks(newPage)
+
+getLinks('')
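Note on PATCHES 08-10: the comment in PATCH 08 describes the negative-lookahead idiom; here is a small offline demonstration of what ^(/wiki/)((?!:).)*$ accepts and rejects (the sample hrefs are invented):

    import re

    pattern = re.compile('^(/wiki/)((?!:).)*$')
    for href in ['/wiki/Kevin_Bacon', '/wiki/Category:Actors', '/wiki/File:Kevin_Bacon.jpg']:
        # article links contain no colon; namespaced pages like Category: or File: do
        print(href, '->', bool(pattern.match(href)))
    # /wiki/Kevin_Bacon -> True
    # /wiki/Category:Actors -> False
    # /wiki/File:Kevin_Bacon.jpg -> False

Separately, random.seed(datetime.datetime.now()) in PATCH 09 seeds from a datetime object, which recent Python versions (3.11+) reject with a TypeError; passing datetime.datetime.now().timestamp() instead, or simply omitting the seed call, keeps it working.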
From 7cbed41405a278b03266488a04dbb1c94eb76a9a Mon Sep 17 00:00:00 2001
From: masamichiIto
Date: Wed, 23 Aug 2023 23:16:46 +0900
Subject: [PATCH 11/17] prepare for 3-2-1

---
 chap3_work.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/chap3_work.py b/chap3_work.py
index 5a6cab8..340e766 100644
--- a/chap3_work.py
+++ b/chap3_work.py
@@ -29,6 +29,7 @@ def getLinks(articleUrl):
 """
 
 ## 3-2
+"""
 # The stopping condition is rarely satisfied, so this has to be interrupted with Ctrl+C at some point
 pages = set()
 def getLinks(pageUrl):
@@ -45,3 +46,6 @@ def getLinks(pageUrl):
                 getLinks(newPage)
 
 getLinks('')
+"""
+
+## 3-2-1
\ No newline at end of file

From 9a835fde92507e0fde2968a07f0f5701840358b8 Mon Sep 17 00:00:00 2001
From: masamichiIto
Date: Thu, 5 Oct 2023 23:59:25 +0900
Subject: [PATCH 12/17] finished 3-2-1

---
 chap3_work.py | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/chap3_work.py b/chap3_work.py
index 340e766..e393566 100644
--- a/chap3_work.py
+++ b/chap3_work.py
@@ -48,4 +48,26 @@ def getLinks(pageUrl):
 getLinks('')
 """
 
-## 3-2-1
\ No newline at end of file
+## 3-2-1
+pages = set()
+def getLinks(pageUrl):
+    global pages
+    html = urlopen('http://en.wikipedia.org{}'.format(pageUrl))
+    bs = BeautifulSoup(html, 'html.parser')
+    try:
+        print(bs.h1.get_text())
+        print(bs.find(id='mw-content-text').find_all('p')[0])
+        print(bs.find(id='ca-edit').find('span').find('a').attrs['href'])
+    except AttributeError:
+        print('This page is missing something! Continuing.')
+
+    for link in bs.find_all('a', href=re.compile('^(/wiki/)')):
+        if 'href' in link.attrs:
+            if link.attrs['href'] not in pages:
+                # we have encountered a new page
+                newPage = link.attrs['href']
+                print('-'*20)
+                print(newPage)
+                pages.add(newPage)
+                getLinks(newPage)
+getLinks('')
\ No newline at end of file
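Note on PATCH 12 (and the 3-2 crawler in PATCH 10): getLinks() calls itself once for every newly discovered page, so the recursion depth grows with the size of the crawl and will hit Python's default recursion limit of 1000 frames long before English Wikipedia runs out of links, ending in a RecursionError. An iterative, breadth-first sketch of the same crawl that avoids the limit:

    import re
    from collections import deque
    from urllib.request import urlopen
    from bs4 import BeautifulSoup

    pages = set()
    queue = deque([''])  # start from the front page, as getLinks('') does
    while queue:
        pageUrl = queue.popleft()
        html = urlopen('http://en.wikipedia.org{}'.format(pageUrl))
        bs = BeautifulSoup(html, 'html.parser')
        for link in bs.find_all('a', href=re.compile('^(/wiki/)')):
            if 'href' in link.attrs and link.attrs['href'] not in pages:
                newPage = link.attrs['href']
                print(newPage)
                pages.add(newPage)
                queue.append(newPage)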
From a45bd5ca702d321727d5adc294c00c0a6ea86ea2 Mon Sep 17 00:00:00 2001
From: masamichiIto
Date: Fri, 6 Oct 2023 00:36:21 +0900
Subject: [PATCH 13/17] working chap 3-3

---
 chap3_work.py | 61 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 59 insertions(+), 2 deletions(-)

diff --git a/chap3_work.py b/chap3_work.py
index e393566..5353dc7 100644
--- a/chap3_work.py
+++ b/chap3_work.py
@@ -2,13 +2,15 @@
 from bs4 import BeautifulSoup
 import re
 
+"""
 html = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon')
 bs = BeautifulSoup(html, 'html.parser')
 for link in bs.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$')):
     # Wrapping a pattern that starts with ?! in parentheses expresses "does not contain that pattern": (?!:). matches a single character that is not a colon, and ((?!:).)* matches a string of zero or more non-colon characters
     if 'href' in link.attrs:
         print(link.attrs['href'])
-
+"""
+
 ## around p. 35
 """
 # The stopping condition is rarely satisfied, so this has to be interrupted with Ctrl+C at some point
@@ -70,4 +73,58 @@ def getLinks(pageUrl):
                 print(newPage)
                 pages.add(newPage)
                 getLinks(newPage)
-getLinks('')
\ No newline at end of file
+getLinks('')
+
+# 3-3
+from urllib.parse import urlparse
+import datetime
+import random
+
+pages = set()
+random.seed(datetime.datetime.now())
+
+# Retrieves a list of all internal links found on a page
+def getInternalLinks(bs, includeUrl):
+    includeUrl = '{}://{}'.format(urlparse(includeUrl).scheme, urlparse(includeUrl).netloc)
+    internalLinks = []
+    # Find all links that begin with "/"
+    for link in bs.find_all('a', href=re.compile('^(/|.*'+includeUrl+')')):
+        if link.attrs['href'] is not None:
+            if link.attrs['href'] not in internalLinks:
+                if (link.attrs['href'].startswith('/')):
+                    internalLinks.append(includeUrl+link.attrs['href'])
+                else:
+                    internalLinks.append(link.attrs['href'])
+    return internalLinks
+
+# Retrieves a list of all external links found on a page
+def getExternalLinks(bs, excludeUrl):
+    externalLinks = []
+    # Find all links that start with 'http' or 'www' and do not contain the current URL
+    for link in bs.find_all('a', href=re.compile('^(http|www)((?!'+excludeUrl+').)*$')):
+        if link.attrs['href'] is not None:
+            if link.attrs['href'] not in externalLinks:
+                externalLinks.append(link.attrs['href'])
+    return externalLinks
+
+def getRandomExternalLink(startingPage):
+    html = urlopen(startingPage)
+    bs = BeautifulSoup(html, 'html.parser')
+    externalLinks = getExternalLinks(bs, urlparse(startingPage).netloc)
+    if len(externalLinks) == 0:
+        print('No external links, looking around the site for one')
+        domain = '{}://{}'.format(urlparse(startingPage).scheme, urlparse(startingPage).netloc)
+        internalLinks = getInternalLinks(bs, domain)
+        print("internal links: \n", internalLinks, "\n ==========")
+        return getRandomExternalLink(internalLinks[random.randint(0, len(internalLinks)-1)])
+    else:
+        return externalLinks[random.randint(0, len(externalLinks)-1)]
+
+def followExternalOnly(startingSite):
+    externalLink = getRandomExternalLink(startingSite)
+    print('Random external link is: {}'.format(externalLink))
+    followExternalOnly(externalLink)
+
+followExternalOnly('http://oreilly.com')
\ No newline at end of file

From 72b35c9b0dbdf47508c2185d1ea9e7c5282bf903 Mon Sep 17 00:00:00 2001
From: masamichiIto
Date: Fri, 6 Oct 2023 00:42:31 +0900
Subject: [PATCH 14/17] finished chap 3-3

---
 chap3_work.py | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/chap3_work.py b/chap3_work.py
index 5353dc7..c24e245 100644
--- a/chap3_work.py
+++ b/chap3_work.py
@@ -127,4 +127,27 @@ def followExternalOnly(startingSite):
     print('Random external link is: {}'.format(externalLink))
     followExternalOnly(externalLink)
 
-followExternalOnly('http://oreilly.com')
\ No newline at end of file
+#followExternalOnly('http://oreilly.com')
+## Collect a list of all external URLs found on the site
+allExtLinks = set()
+allIntLinks = set()
+
+def getAllExternalLinks(siteUrl):
+    html = urlopen(siteUrl)
+    domain = '{}://{}'.format(urlparse(siteUrl).scheme, urlparse(siteUrl).netloc)
+    bs = BeautifulSoup(html, 'html.parser')
+    internalLinks = getInternalLinks(bs, domain)
+    externalLinks = getExternalLinks(bs, domain)
+
+    for link in externalLinks:
+        if link not in allExtLinks:
+            allExtLinks.add(link)
+            print(link)
+
+    for link in internalLinks:
+        if link not in allIntLinks:
+            allIntLinks.add(link)
+            getAllExternalLinks(link)
+
+allIntLinks.add('http://oreilly.com')
+getAllExternalLinks('http://oreilly.com')
\ No newline at end of file
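Note on PATCH 14: getAllExternalLinks() stops with an uncaught exception on the first dead, malformed, or non-HTTP link it meets. A more tolerant sketch, reusing getInternalLinks(), getExternalLinks(), and the allExtLinks/allIntLinks sets from the patch (the exception list is an assumption about what urlopen can raise here):

    from urllib.error import HTTPError, URLError

    def getAllExternalLinksSafe(siteUrl):
        try:
            html = urlopen(siteUrl)
        except (HTTPError, URLError, ValueError):
            return  # skip unreachable or malformed URLs and keep crawling
        domain = '{}://{}'.format(urlparse(siteUrl).scheme, urlparse(siteUrl).netloc)
        bs = BeautifulSoup(html, 'html.parser')
        for link in getExternalLinks(bs, domain):
            if link not in allExtLinks:
                allExtLinks.add(link)
                print(link)
        for link in getInternalLinks(bs, domain):
            if link not in allIntLinks:
                allIntLinks.add(link)
                getAllExternalLinksSafe(link)

It still recurses once per internal page, so the recursion-depth caveat noted after PATCH 12 applies here as well.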
From a10028d90aeb739eccf2d28d81e07d75c4ab47a6 Mon Sep 17 00:00:00 2001
From: masamichiIto
Date: Sat, 7 Oct 2023 13:40:55 +0900
Subject: [PATCH 15/17] finished chap 3-3

---
 chap3_work.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/chap3_work.py b/chap3_work.py
index c24e245..5591cc2 100644
--- a/chap3_work.py
+++ b/chap3_work.py
@@ -150,4 +150,4 @@ def getAllExternalLinks(siteUrl):
             getAllExternalLinks(link)
 
 allIntLinks.add('http://oreilly.com')
-getAllExternalLinks('http://oreilly.com')
\ No newline at end of file
+getAllExternalLinks('http://oreilly.com')

From 963bdd0dceb9044fa0ada001da83f006bb20d789 Mon Sep 17 00:00:00 2001
From: masamichiIto
Date: Sat, 7 Oct 2023 13:43:15 +0900
Subject: [PATCH 16/17] chap4_work.py created

---
 chap          | 0
 chap4_work.py | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 chap
 create mode 100644 chap4_work.py

diff --git a/chap b/chap
new file mode 100644
index 0000000..e69de29
diff --git a/chap4_work.py b/chap4_work.py
new file mode 100644
index 0000000..e69de29

From d35df292a111360c125b8b08d0ceb4981c9fa647 Mon Sep 17 00:00:00 2001
From: masamichiIto
Date: Sun, 8 Oct 2023 23:24:42 +0900
Subject: [PATCH 17/17] chap4_work.py was modified for the 4.2 exercise.

---
 chap4_work.py | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/chap4_work.py b/chap4_work.py
index e69de29..d8fbb40 100644
--- a/chap4_work.py
+++ b/chap4_work.py
@@ -0,0 +1,32 @@
+import requests
+from bs4 import BeautifulSoup
+
+class Content:
+    def __init__(self, url, title, body) -> None:
+        self.url = url
+        self.title = title
+        self.body = body
+
+def getPage(url):
+    req = requests.get(url)
+    return BeautifulSoup(req.text, 'html.parser')
+
+def scrapeNYTimes(url):
+    bs = getPage(url)
+    title = bs.find('h1').text
+    lines = bs.select('div.StoryBodyCompanionColumn div p')
+    body = '\n'.join([line.text for line in lines])
+    return Content(url, title, body)
+
+def scrapeBrookings(url):
+    bs = getPage(url)
+    title = bs.find('h1').text
+    body = bs.find('div', {'class','post-body'}).text
+    return Content(url, title, body)
+
+url = ('https://www.brookings.edu/blog/future-development/2018/01/26/delivering-inclusive-urban-access-3-uncomfortable-truths/')
+
+content = scrapeBrookings(url)
+print('title: {}'.format(content.title))
+print('URL: {}'.format(content.url))
+print(content.body)
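Note on PATCH 17: two details worth flagging. First, {'class','post-body'} in scrapeBrookings() is a set literal, not a dict; BeautifulSoup happens to tolerate it (a non-dict value is matched against the class attribute), but the intended spelling is the dict {'class': 'post-body'}. Second, getPage() assumes every request succeeds. A more defensive sketch using the requests API, reusing the Content class from the patch:

    import requests
    from bs4 import BeautifulSoup

    def getPage(url):
        try:
            req = requests.get(url, timeout=10)
            req.raise_for_status()  # raises requests.HTTPError on 4xx/5xx responses
        except requests.RequestException as e:
            print('Request failed:', e)
            return None
        return BeautifulSoup(req.text, 'html.parser')

    def scrapeBrookings(url):
        bs = getPage(url)
        if bs is None:
            return None
        title = bs.find('h1').text
        body = bs.find('div', {'class': 'post-body'}).text
        return Content(url, title, body)

Callers then need to handle a None return, e.g. check content = scrapeBrookings(url) against None before printing its fields.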