working chap 3-3
masamichiIto committed Oct 5, 2023
commit a45bd5ca702d321727d5adc294c00c0a6ea86ea2
61 changes: 59 additions & 2 deletions chap3_work.py
@@ -2,13 +2,15 @@
from bs4 import BeautifulSoup
import re

"""
html = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon')
bs = BeautifulSoup(html, 'html.parser')
for link in bs.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$')):
    # Wrapping a lookahead '(?!...)' in parentheses expresses "does not contain that string":
    # '(?!:).' matches a single character that is not a colon, so '((?!:).)*' matches a run of
    # zero or more characters containing no colon (a short demo follows this commented-out block).
    if 'href' in link.attrs:
        print(link.attrs['href'])

"""

## Around p.35
"""
# The stopping condition is rarely met, so this has to be interrupted with Ctrl+C at some point
@@ -48,6 +50,7 @@ def getLinks(pageUrl):
getLinks('')
"""

"""
## 3-2-1
pages = set()
def getLinks(pageUrl):
@@ -70,4 +73,58 @@ def getLinks(pageUrl):
                print(newPage)
                pages.add(newPage)
                getLinks(newPage)
getLinks('')

"""

# 3-3
from urllib.parse import urlparse
import datetime
import random

pages = set()
random.seed(datetime.datetime.now().timestamp())  # seed with a float; recent Python versions no longer accept a datetime object

# Retrieve a list of all internal links found on the page (a small usage sketch follows this function)
def getInternalLinks(bs, includeUrl):
    includeUrl = '{}://{}'.format(urlparse(includeUrl).scheme, urlparse(includeUrl).netloc)
    internalLinks = []
    # Find all links that begin with '/' or contain the site URL
    for link in bs.find_all('a', href=re.compile('^(/|.*'+includeUrl+')')):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                if link.attrs['href'].startswith('/'):
                    internalLinks.append(includeUrl+link.attrs['href'])
                else:
                    internalLinks.append(link.attrs['href'])
    return internalLinks
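# Hypothetical usage sketch (not part of the original commit), using a tiny in-memory page:
# root-relative hrefs are resolved against the site root, and absolute links that already
# contain the site URL are kept as-is. The names _demo_bs and example.com are made up.
_demo_bs = BeautifulSoup('<a href="/about">a</a><a href="http://example.com/x">b</a>', 'html.parser')
assert getInternalLinks(_demo_bs, 'http://example.com/start') == [
    'http://example.com/about', 'http://example.com/x']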

# Retrieve a list of all external links found on the page (a short demo of the pattern follows this function)
def getExternalLinks(bs, excludeUrl):
    externalLinks = []
    # Find all links that start with 'http' or 'www' and do not contain the current URL
    for link in bs.find_all('a', href=re.compile('^(http|www)((?!'+excludeUrl+').)*$')):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks
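# Minimal sketch (not part of the original commit) of the exclusion pattern used above,
# with 'oreilly.com' standing in for excludeUrl: any URL containing the excluded domain
# is rejected by the '(?!...)' lookahead.
_demo_pattern = '^(http|www)((?!'+'oreilly.com'+').)*$'
assert re.match(_demo_pattern, 'http://example.com/page') is not None
assert re.match(_demo_pattern, 'http://www.oreilly.com/') is None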

# Pick a random external link found on the page; if there is none, fall back to a random
# internal link on the same site and retry from there
def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    bs = BeautifulSoup(html, 'html.parser')
    externalLinks = getExternalLinks(bs, urlparse(startingPage).netloc)
    if len(externalLinks) == 0:
        print('No external links, looking around the site for one')
        domain = '{}://{}'.format(urlparse(startingPage).scheme, urlparse(startingPage).netloc)
        internalLinks = getInternalLinks(bs, domain)
        print("internal links: \n", internalLinks, "\n ==========")
        return getRandomExternalLink(internalLinks[random.randint(0, len(internalLinks)-1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks)-1)]

# Random walk across the web: hop from one random external link to the next.
# There is no stopping condition, so this runs until a request fails or the recursion limit is hit.
def followExternalOnly(startingSite):
    externalLink = getRandomExternalLink(startingSite)
    print('Random external link is: {}'.format(externalLink))
    followExternalOnly(externalLink)

followExternalOnly('http://oreilly.com')