
Commit 88f9755

Ryan Mitchell authored and committed

Cleaned up code, fixed external link finding

1 parent 8fab873 · commit 88f9755


chapter3/4-getExternalLinks.py

Lines changed: 31 additions & 16 deletions
```diff
@@ -1,11 +1,11 @@
 from urllib.request import urlopen
+from urllib.error import HTTPError
 from bs4 import BeautifulSoup
 import re
-import datetime
 import random
 
 pages = set()
-random.seed(datetime.datetime.now())
+
 
 #Retrieves a list of all Internal links found on a page
 def getInternalLinks(bsObj, includeUrl):
```
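
Two things change in the imports: `HTTPError` comes in (used further down to recover from dead links) and `datetime` goes away, since `random.seed(datetime.datetime.now())` is redundant; Python's `random` module seeds itself from an OS entropy source on first use. A minimal sketch of the idiom the new import enables (`safeOpen` is a hypothetical name, not part of the commit):

```python
from urllib.request import urlopen
from urllib.error import HTTPError

def safeOpen(url):
    # Hypothetical helper for illustration: return the response, or None
    # if the server answers with an HTTP error status (404, 500, ...)
    try:
        return urlopen(url)
    except HTTPError as e:
        print("Could not fetch "+url+": "+str(e.code))
        return None
```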
```diff
@@ -19,34 +19,40 @@ def getInternalLinks(bsObj, includeUrl):
 
 #Retrieves a list of all external links found on a page
 def getExternalLinks(bsObj, excludeUrl):
+    excludeUrl = splitAddress(excludeUrl)[0]
     externalLinks = []
     #Finds all links that start with "http" or "www" that do
     #not contain the current URL
     for link in bsObj.findAll("a", href=re.compile("^(http|www)((?!"+excludeUrl+").)*$")):
-        if link.attrs['href'] is not None:
+        if link.attrs['href'] is not None and len(link.attrs['href']) != 0:
             if link.attrs['href'] not in externalLinks:
                 externalLinks.append(link.attrs['href'])
     return externalLinks
 
 def splitAddress(address):
+    address = address.replace("www", "")
     addressParts = address.replace("http://", "").split("/")
     return addressParts
 
-def getRandomExternalLink(startingPage):
-    html = urlopen(startingPage)
-    bsObj = BeautifulSoup(html)
-    externalLinks = getExternalLinks(bsObj, splitAddress(startingPage)[0])
+
+def followExternalOnly(bsObj, url):
+    externalLinks = getExternalLinks(bsObj, splitAddress(url)[0])
     if len(externalLinks) == 0:
-        internalLinks = getInternalLinks(startingPage)
-        return getNextExternalLink(internalLinks[random.randint(0,
-                                   len(internalLinks)-1)])
+        #Only internal links here. Get another internal page and try again
+        internalLinks = getInternalLinks(bsObj, url)
+        bsObj = BeautifulSoup(urlopen(internalLinks[random.randint(0, len(internalLinks)-1)]))
+        return followExternalOnly(bsObj, url)
     else:
-        return externalLinks[random.randint(0, len(externalLinks)-1)]
+        randomExternal = externalLinks[random.randint(0, len(externalLinks)-1)]
+        try:
+            nextBsObj = BeautifulSoup(urlopen(randomExternal))
+            print(randomExternal)
+            return [nextBsObj, randomExternal]
+        except HTTPError:
+            #Try again
+            print("Encountered error at "+randomExternal+"! Trying again")
+            return followExternalOnly(bsObj, url)
 
-def followExternalOnly(startingSite):
-    externalLink = getRandomExternalLink(startingSite)
-    print("Random external link is: "+externalLink)
-    followExternalOnly(externalLink)
 
 
 #Collects a list of all external URLs found on the site
```
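
The rewritten `getExternalLinks` leans on a negative lookahead: the tempered pattern `((?!X).)*` matches an href that begins with "http" or "www" and never contains the excluded domain after that prefix. An illustrative check, not part of the commit (the example URLs are arbitrary):

```python
import re

excludeUrl = "oreilly.com"  # example domain, as splitAddress would produce
pattern = re.compile("^(http|www)((?!"+excludeUrl+").)*$")

# Matches only hrefs that start with "http"/"www" and avoid the domain
print(bool(pattern.match("http://example.com/page")))       # True  -> external
print(bool(pattern.match("http://www.oreilly.com/about")))  # False -> same site
print(bool(pattern.match("/catalog/local.html")))           # False -> relative link
```

One caveat the commit inherits: the dot in `excludeUrl` is not escaped, so it acts as a regex wildcard, which in practice only makes the exclusion slightly broader.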
```diff
@@ -66,5 +72,14 @@ def getAllExternalLinks(siteUrl):
             print("About to get link: "+link)
             allIntLinks.add(link)
             getAllExternalLinks(link)
-
+
+url = "http://oreilly.com"
+bsObj = BeautifulSoup(urlopen(url))
+#Following random external links for 10 steps
+for i in range(10):
+    bsObj, url = followExternalOnly(bsObj, url)
+
+#Get a collection of all external links on oreilly.com
 getAllExternalLinks("http://oreilly.com")
+
+
```
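The new driver unpacks the two-element list returned by `followExternalOnly`, so the soup object and the current URL advance together through the ten-step random walk. For reference, `getInternalLinks` is untouched by this commit and its body lies outside the diff; a plausible version, consistent with the signature in the hunk header and mirroring the structure of `getExternalLinks` (an assumption, not the committed code):

```python
import re

#Retrieves a list of all internal links found on a page (assumed body;
#only the signature appears in this diff's context lines)
def getInternalLinks(bsObj, includeUrl):
    internalLinks = []
    #Finds all links that begin with a "/" or contain the current URL
    for link in bsObj.findAll("a", href=re.compile("^(/|.*"+includeUrl+")")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                internalLinks.append(link.attrs['href'])
    return internalLinks
```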
