from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import re
import random

pages = set()


#Retrieves a list of all internal links found on a page
def getInternalLinks(bsObj, includeUrl):
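    #bsObj is an already-parsed BeautifulSoup page; includeUrl is the
    #address of the site being crawled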
    internalLinks = []
    #Finds all links that begin with a "/" or contain the site's URL
    for link in bsObj.findAll("a", href=re.compile("^(/|.*"+re.escape(includeUrl)+")")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                internalLinks.append(link.attrs['href'])
    return internalLinks

#Retrieves a list of all external links found on a page
def getExternalLinks(bsObj, excludeUrl):
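    #Reduce excludeUrl to its bare domain so it can be embedded in the
    #regular expression below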
    excludeUrl = splitAddress(excludeUrl)[0]
    externalLinks = []
    #Finds all links that start with "http" or "www" that do
    #not contain the current URL
    for link in bsObj.findAll("a", href=re.compile("^(http|www)((?!"+re.escape(excludeUrl)+").)*$")):
        if link.attrs['href'] is not None and len(link.attrs['href']) != 0:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks

def splitAddress(address):
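    #e.g. "http://www.oreilly.com/about" -> ["oreilly.com", "about"]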
    address = address.replace("www.", "")
    addressParts = address.replace("http://", "").split("/")
    return addressParts

def followExternalOnly(bsObj, url):
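    #One step of a random walk: picks a random external link on the current
    #page, opens it, and returns [new BeautifulSoup object, new URL]. If the
    #page has no external links, hops to a random internal page and retries.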
    externalLinks = getExternalLinks(bsObj, url)
    if len(externalLinks) == 0:
        #Only internal links here. Get a random internal page and try again
        internalLinks = getInternalLinks(bsObj, splitAddress(url)[0])
        internalLink = random.choice(internalLinks)
        #Make relative internal links absolute before opening them
        if internalLink.startswith("/"):
            internalLink = "http://"+splitAddress(url)[0]+internalLink
        bsObj = BeautifulSoup(urlopen(internalLink), "html.parser")
        return followExternalOnly(bsObj, internalLink)
    else:
        randomExternal = random.choice(externalLinks)
        try:
            nextBsObj = BeautifulSoup(urlopen(randomExternal), "html.parser")
            print(randomExternal)
            return [nextBsObj, randomExternal]
        except HTTPError:
            #Try again with a different random link
            print("Encountered error at "+randomExternal+"! Trying again")
            return followExternalOnly(bsObj, url)


#Collects a list of all external URLs found on the site
allExtLinks = set()
allIntLinks = set()

def getAllExternalLinks(siteUrl):
    bsObj = BeautifulSoup(urlopen(siteUrl), "html.parser")
    internalLinks = getInternalLinks(bsObj, splitAddress(siteUrl)[0])
    externalLinks = getExternalLinks(bsObj, siteUrl)
    for link in externalLinks:
        if link not in allExtLinks:
            allExtLinks.add(link)
            print(link)
    for link in internalLinks:
        if link not in allIntLinks:
            #Make relative internal links absolute before recursing
            if link.startswith("/"):
                link = "http://"+splitAddress(siteUrl)[0]+link
            print("About to get link: "+link)
            allIntLinks.add(link)
            getAllExternalLinks(link)
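#Demo: starting at oreilly.com, take a 10-step random walk across external
#links, then collect every external link reachable on the site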
url = "http://oreilly.com"
bsObj = BeautifulSoup(urlopen(url), "html.parser")
#Follow random external links for 10 steps
for i in range(10):
    bsObj, url = followExternalOnly(bsObj, url)

#Get a collection of all external links on oreilly.com
getAllExternalLinks("http://oreilly.com")