@@ -1,11 +1,11 @@
 from urllib.request import urlopen
+from urllib.error import HTTPError
 from bs4 import BeautifulSoup
 import re
-import datetime
 import random
 
 pages = set()
-random.seed(datetime.datetime.now())
+
 
 #Retrieves a list of all Internal links found on a page
 def getInternalLinks(bsObj, includeUrl):
@@ -19,34 +19,42 @@ def getInternalLinks(bsObj, includeUrl):
 
 #Retrieves a list of all external links found on a page
 def getExternalLinks(bsObj, excludeUrl):
+    excludeUrl = splitAddress(excludeUrl)[0]
     externalLinks = []
     #Finds all links that start with "http" or "www" that do
     #not contain the current URL
     for link in bsObj.findAll("a", href=re.compile("^(http|www)((?!"+excludeUrl+").)*$")):
-        if link.attrs['href'] is not None:
+        if link.attrs['href'] is not None and len(link.attrs['href']) != 0:
             if link.attrs['href'] not in externalLinks:
                 externalLinks.append(link.attrs['href'])
     return externalLinks
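The href filter above hinges on a negative lookahead: every character after the scheme must not begin the excluded domain, so any link containing it fails to match. A minimal illustration (my example, not part of the commit):

    import re
    pattern = re.compile("^(http|www)((?!oreilly.com).)*$")
    print(bool(pattern.match("http://example.com/page")))  #True: treated as external
    print(bool(pattern.match("http://www.oreilly.com/")))  #False: contains the excluded domain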
 
 def splitAddress(address):
+    address = address.replace("www.", "")
     addressParts = address.replace("http://", "").split("/")
     return addressParts
 
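For reference, what the helper yields with the "www." fix above (a sketch of expected output; note that https:// prefixes are not stripped by this helper):

    splitAddress("http://www.oreilly.com/ideas")  #-> ['oreilly.com', 'ideas']
    splitAddress("http://oreilly.com")            #-> ['oreilly.com']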
-def getRandomExternalLink(startingPage):
-    html = urlopen(startingPage)
-    bsObj = BeautifulSoup(html)
-    externalLinks = getExternalLinks(bsObj, splitAddress(startingPage)[0])
+
+def followExternalOnly(bsObj, url):
+    externalLinks = getExternalLinks(bsObj, splitAddress(url)[0])
     if len(externalLinks) == 0:
-        internalLinks = getInternalLinks(startingPage)
-        return getNextExternalLink(internalLinks[random.randint(0,
-                                    len(internalLinks)-1)])
+        #Only internal links here. Get another internal page and try again
+        internalLinks = getInternalLinks(bsObj, url)
+        #Parse the response so the recursive call gets a BeautifulSoup object, not a raw response
+        bsObj = BeautifulSoup(urlopen(internalLinks[random.randint(0, len(internalLinks)-1)]), "html.parser")
+        return followExternalOnly(bsObj, url)
     else:
-        return externalLinks[random.randint(0, len(externalLinks)-1)]
+        randomExternal = externalLinks[random.randint(0, len(externalLinks)-1)]
+        try:
+            nextBsObj = BeautifulSoup(urlopen(randomExternal), "html.parser")
+            print(randomExternal)
+            #Return the new page along with its URL so the caller can walk on from there
+            return [nextBsObj, randomExternal]
+        except HTTPError:
+            #Try again with another link from the current page
+            print("Encountered error at "+randomExternal+"! Trying again")
+            return followExternalOnly(bsObj, url)
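One caveat worth flagging: the internal-link branch passes hrefs straight to urlopen, so it assumes they are absolute URLs. A hedged guard for relative links, using urljoin from the standard library (my sketch, not in the commit):

    from urllib.parse import urljoin
    link = internalLinks[random.randint(0, len(internalLinks)-1)]
    link = urljoin(url, link)  #"/ideas" becomes "http://oreilly.com/ideas"
    bsObj = BeautifulSoup(urlopen(link), "html.parser")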
 
-def followExternalOnly(startingSite):
-    externalLink = getRandomExternalLink(startingSite)
-    print("Random external link is: "+externalLink)
-    followExternalOnly(externalLink)
 
 
 #Collects a list of all external URLs found on the site
@@ -66,5 +74,14 @@ def getAllExternalLinks(siteUrl):
             print("About to get link: "+link)
             allIntLinks.add(link)
             getAllExternalLinks(link)
-
+
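Since getAllExternalLinks calls itself once per unseen internal link, a large site can exceed Python's default recursion limit (about 1000 frames). An iterative version with an explicit to-visit list avoids that; a rough sketch, assuming the allIntLinks/allExtLinks sets this file keeps (the name getAllExternalLinksIter is mine):

    def getAllExternalLinksIter(siteUrl):
        toVisit = [siteUrl]
        while len(toVisit) > 0:
            pageUrl = toVisit.pop()
            bsObj = BeautifulSoup(urlopen(pageUrl), "html.parser")
            for external in getExternalLinks(bsObj, pageUrl):
                allExtLinks.add(external)
            for internal in getInternalLinks(bsObj, splitAddress(pageUrl)[0]):
                if internal not in allIntLinks:
                    allIntLinks.add(internal)
                    toVisit.append(internal)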
+url = "http://oreilly.com"
+bsObj = BeautifulSoup(urlopen(url), "html.parser")
+#Follow random external links for 10 steps
+for i in range(10):
+    bsObj, url = followExternalOnly(bsObj, url)
+
+#Get a collection of all external links on oreilly.com
 getAllExternalLinks("http://oreilly.com")
+
+
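A side note on the dropped random.seed(datetime.datetime.now()) line: the random module seeds itself from OS entropy (or the clock) on first use, so the explicit call was redundant. If a reproducible walk is ever wanted, seeding with a fixed value would do it (optional, my suggestion):

    random.seed(0)  #any fixed value replays the same sequence of random choices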