Skip to content

Commit 2dfe368

Browse files
Ryan Mitchell
authored and committed
Bug fixes
1 parent 16e38ac commit 2dfe368

File tree

13 files changed

+194
-134
lines changed

13 files changed

+194
-134
lines changed

chapter11/3-readWebImages.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33
import subprocess
44
from selenium import webdriver
55

6-
driver = webdriver.PhantomJS(executable_path='/Users/ryan/Documents/pythonscraping/code/headless/phantomjs-1.9.8-macosx/bin/phantomjs')
7-
#driver = webdriver.Firefox()
6+
#driver = webdriver.PhantomJS(executable_path='/Users/ryan/Documents/pythonscraping/code/headless/phantomjs-1.9.8-macosx/bin/phantomjs')
7+
driver = webdriver.Firefox()
88
driver.get("http://www.amazon.com/War-Peace-Leo-Nikolayevich-Tolstoy/dp/1427030200")
99
time.sleep(2)
1010

chapter11/ghostdriver.log

Lines changed: 35 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,36 @@
11
PhantomJS is launching GhostDriver...
2-
[INFO - 2015-06-15T00:08:45.592Z] GhostDriver - Main - running on port 51799
3-
[INFO - 2015-06-15T00:08:46.231Z] Session [b0ccb720-12f2-11e5-b3d6-0ba38f30e2f9] - page.settings - {"XSSAuditingEnabled":false,"javascriptCanCloseWindows":true,"javascriptCanOpenWindows":true,"javascriptEnabled":true,"loadImages":true,"localToRemoteUrlAccessEnabled":false,"userAgent":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) PhantomJS/1.9.8 Safari/534.34","webSecurityEnabled":true}
4-
[INFO - 2015-06-15T00:08:46.231Z] Session [b0ccb720-12f2-11e5-b3d6-0ba38f30e2f9] - page.customHeaders: - {}
5-
[INFO - 2015-06-15T00:08:46.231Z] Session [b0ccb720-12f2-11e5-b3d6-0ba38f30e2f9] - Session.negotiatedCapabilities - {"browserName":"phantomjs","version":"1.9.8","driverName":"ghostdriver","driverVersion":"1.1.0","platform":"mac-10.9 (Mavericks)-32bit","javascriptEnabled":true,"takesScreenshot":true,"handlesAlerts":false,"databaseEnabled":false,"locationContextEnabled":false,"applicationCacheEnabled":false,"browserConnectionEnabled":false,"cssSelectorsEnabled":true,"webStorageEnabled":false,"rotatable":false,"acceptSslCerts":false,"nativeEvents":true,"proxy":{"proxyType":"direct"}}
6-
[INFO - 2015-06-15T00:08:46.231Z] SessionManagerReqHand - _postNewSessionCommand - New Session Created: b0ccb720-12f2-11e5-b3d6-0ba38f30e2f9
7-
[ERROR - 2015-06-15T00:08:47.864Z] Session [b0ccb720-12f2-11e5-b3d6-0ba38f30e2f9] - page.onError - msg: ReferenceError: Can't find variable: ue
8-
[ERROR - 2015-06-15T00:08:47.864Z] Session [b0ccb720-12f2-11e5-b3d6-0ba38f30e2f9] - page.onError - stack:
9-
(anonymous function) (http://www.amazon.com/War-Peace-Leo-Nikolayevich-Tolstoy/dp/1427030200:10202)
10-
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:940)
11-
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:807)
12-
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:696)
13-
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:784)
14-
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:959)
15-
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:161)
16-
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:70)
17-
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:960)
18-
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:2615)
19-
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:2640)
20-
(anonymous function) (http://www.amazon.com/War-Peace-Leo-Nikolayevich-Tolstoy/dp/1427030200:10203)
21-
[ERROR - 2015-06-15T00:08:47.865Z] Session [b0ccb720-12f2-11e5-b3d6-0ba38f30e2f9] - page.onError - msg: TypeError: 'null' is not an object (evaluating 'old_error_handler.apply')
22-
[ERROR - 2015-06-15T00:08:47.865Z] Session [b0ccb720-12f2-11e5-b3d6-0ba38f30e2f9] - page.onError - stack:
23-
dpOnErrorOverride (http://www.amazon.com/War-Peace-Leo-Nikolayevich-Tolstoy/dp/1427030200:1186)
2+
[INFO - 2015-10-27T02:02:29.659Z] GhostDriver - Main - running on port 60643
3+
[INFO - 2015-10-27T02:02:29.699Z] Session [c741cde0-7c4e-11e5-9065-072e3a262f9e] - page.settings - {"XSSAuditingEnabled":false,"javascriptCanCloseWindows":true,"javascriptCanOpenWindows":true,"javascriptEnabled":true,"loadImages":true,"localToRemoteUrlAccessEnabled":false,"userAgent":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) PhantomJS/1.9.8 Safari/534.34","webSecurityEnabled":true}
4+
[INFO - 2015-10-27T02:02:29.699Z] Session [c741cde0-7c4e-11e5-9065-072e3a262f9e] - page.customHeaders: - {}
5+
[INFO - 2015-10-27T02:02:29.699Z] Session [c741cde0-7c4e-11e5-9065-072e3a262f9e] - Session.negotiatedCapabilities - {"browserName":"phantomjs","version":"1.9.8","driverName":"ghostdriver","driverVersion":"1.1.0","platform":"mac-10.10 (Yosemite)-32bit","javascriptEnabled":true,"takesScreenshot":true,"handlesAlerts":false,"databaseEnabled":false,"locationContextEnabled":false,"applicationCacheEnabled":false,"browserConnectionEnabled":false,"cssSelectorsEnabled":true,"webStorageEnabled":false,"rotatable":false,"acceptSslCerts":false,"nativeEvents":true,"proxy":{"proxyType":"direct"}}
6+
[INFO - 2015-10-27T02:02:29.699Z] SessionManagerReqHand - _postNewSessionCommand - New Session Created: c741cde0-7c4e-11e5-9065-072e3a262f9e
7+
[ERROR - 2015-10-27T02:02:33.292Z] Session [c741cde0-7c4e-11e5-9065-072e3a262f9e] - page.onError - msg: ReferenceError: Can't find variable: ue
8+
[ERROR - 2015-10-27T02:02:33.292Z] Session [c741cde0-7c4e-11e5-9065-072e3a262f9e] - page.onError - stack:
9+
(anonymous function) (http://www.amazon.com/War-Peace-Leo-Nikolayevich-Tolstoy/dp/1427030200:9638)
10+
(anonymous function) (http://www.amazon.com/War-Peace-Leo-Nikolayevich-Tolstoy/dp/1427030200:201)
11+
(anonymous function) (http://www.amazon.com/War-Peace-Leo-Nikolayevich-Tolstoy/dp/1427030200:202)
12+
(anonymous function) (http://www.amazon.com/War-Peace-Leo-Nikolayevich-Tolstoy/dp/1427030200:205)
13+
[ERROR - 2015-10-27T02:02:33.292Z] Session [c741cde0-7c4e-11e5-9065-072e3a262f9e] - page.onError - msg: TypeError: 'null' is not an object (evaluating 'old_error_handler.apply')
14+
[ERROR - 2015-10-27T02:02:33.292Z] Session [c741cde0-7c4e-11e5-9065-072e3a262f9e] - page.onError - stack:
15+
dpOnErrorOverride (http://www.amazon.com/War-Peace-Leo-Nikolayevich-Tolstoy/dp/1427030200:1345)
16+
[ERROR - 2015-10-27T02:02:33.794Z] Session [c741cde0-7c4e-11e5-9065-072e3a262f9e] - page.onError - msg: ReferenceError: Can't find variable: ue
17+
[ERROR - 2015-10-27T02:02:33.794Z] Session [c741cde0-7c4e-11e5-9065-072e3a262f9e] - page.onError - stack:
18+
(anonymous function) (http://www.amazon.com/War-Peace-Leo-Nikolayevich-Tolstoy/dp/1427030200:9632)
19+
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-7939884564._V1_.js:940)
20+
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-7939884564._V1_.js:807)
21+
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-7939884564._V1_.js:696)
22+
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-7939884564._V1_.js:784)
23+
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-7939884564._V1_.js:959)
24+
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-7939884564._V1_.js:161)
25+
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-7939884564._V1_.js:70)
26+
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-7939884564._V1_.js:960)
27+
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-7939884564._V1_.js:2608)
28+
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-7939884564._V1_.js:2646)
29+
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-7939884564._V1_.js:2772)
30+
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-7939884564._V1_.js:161)
31+
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-7939884564._V1_.js:2773)
32+
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-7939884564._V1_.js:2778)
33+
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-7939884564._V1_.js:2781)
34+
[ERROR - 2015-10-27T02:02:33.794Z] Session [c741cde0-7c4e-11e5-9065-072e3a262f9e] - page.onError - msg: TypeError: 'null' is not an object (evaluating 'old_error_handler.apply')
35+
[ERROR - 2015-10-27T02:02:33.794Z] Session [c741cde0-7c4e-11e5-9065-072e3a262f9e] - page.onError - stack:
36+
dpOnErrorOverride (http://www.amazon.com/War-Peace-Leo-Nikolayevich-Tolstoy/dp/1427030200:1345)

chapter13/4-dragAndDrop.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from selenium.webdriver.remote.webelement import WebElement
33
from selenium.webdriver import ActionChains
44

5-
driver = webdriver.PhantomJS(executable_path='<Path to Phantom JS>')
5+
driver = webdriver.PhantomJS(executable_path='phantomjs/bin/phantomjs')
66
driver.get('http://pythonscraping.com/pages/javascript/draggableDemo.html')
77

88
print(driver.find_element_by_id("message").text)

chapter13/ghostdriver.log

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
PhantomJS is launching GhostDriver...
2-
[INFO - 2015-06-15T00:50:30.713Z] GhostDriver - Main - running on port 53151
3-
[INFO - 2015-06-15T00:50:31.089Z] Session [85cfddd0-12f8-11e5-bb71-bf3b62d3747d] - page.settings - {"XSSAuditingEnabled":false,"javascriptCanCloseWindows":true,"javascriptCanOpenWindows":true,"javascriptEnabled":true,"loadImages":true,"localToRemoteUrlAccessEnabled":false,"userAgent":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) PhantomJS/1.9.8 Safari/534.34","webSecurityEnabled":true}
4-
[INFO - 2015-06-15T00:50:31.089Z] Session [85cfddd0-12f8-11e5-bb71-bf3b62d3747d] - page.customHeaders: - {}
5-
[INFO - 2015-06-15T00:50:31.089Z] Session [85cfddd0-12f8-11e5-bb71-bf3b62d3747d] - Session.negotiatedCapabilities - {"browserName":"phantomjs","version":"1.9.8","driverName":"ghostdriver","driverVersion":"1.1.0","platform":"mac-10.9 (Mavericks)-32bit","javascriptEnabled":true,"takesScreenshot":true,"handlesAlerts":false,"databaseEnabled":false,"locationContextEnabled":false,"applicationCacheEnabled":false,"browserConnectionEnabled":false,"cssSelectorsEnabled":true,"webStorageEnabled":false,"rotatable":false,"acceptSslCerts":false,"nativeEvents":true,"proxy":{"proxyType":"direct"}}
6-
[INFO - 2015-06-15T00:50:31.089Z] SessionManagerReqHand - _postNewSessionCommand - New Session Created: 85cfddd0-12f8-11e5-bb71-bf3b62d3747d
2+
[INFO - 2016-01-10T08:45:04.240Z] GhostDriver - Main - running on port 58176
3+
[INFO - 2016-01-10T08:45:05.042Z] Session [71f20ae0-b776-11e5-afb7-fbdb40be72f0] - page.settings - {"XSSAuditingEnabled":false,"javascriptCanCloseWindows":true,"javascriptCanOpenWindows":true,"javascriptEnabled":true,"loadImages":true,"localToRemoteUrlAccessEnabled":false,"userAgent":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) PhantomJS/1.9.8 Safari/534.34","webSecurityEnabled":true}
4+
[INFO - 2016-01-10T08:45:05.042Z] Session [71f20ae0-b776-11e5-afb7-fbdb40be72f0] - page.customHeaders: - {}
5+
[INFO - 2016-01-10T08:45:05.042Z] Session [71f20ae0-b776-11e5-afb7-fbdb40be72f0] - Session.negotiatedCapabilities - {"browserName":"phantomjs","version":"1.9.8","driverName":"ghostdriver","driverVersion":"1.1.0","platform":"mac-unknown-32bit","javascriptEnabled":true,"takesScreenshot":true,"handlesAlerts":false,"databaseEnabled":false,"locationContextEnabled":false,"applicationCacheEnabled":false,"browserConnectionEnabled":false,"cssSelectorsEnabled":true,"webStorageEnabled":false,"rotatable":false,"acceptSslCerts":false,"nativeEvents":true,"proxy":{"proxyType":"direct"}}
6+
[INFO - 2016-01-10T08:45:05.042Z] SessionManagerReqHand - _postNewSessionCommand - New Session Created: 71f20ae0-b776-11e5-afb7-fbdb40be72f0

chapter14/1-socks.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
import urllib2
21
import socks
32
import socket
3+
from urllib.request import urlopen
44

55
socks.set_default_proxy(socks.SOCKS5, "127.0.0.1", 9150)
66
socket.socket = socks.socksocket
7-
print(urllib2.urlopen('http://icanhazip.com').read())
7+
print(urlopen('http://icanhazip.com').read())

chapter2/.DS_Store

0 Bytes
Binary file not shown.

chapter3/4-getExternalLinks.py

Lines changed: 24 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,85 +1,55 @@
11
from urllib.request import urlopen
2-
from urllib.error import HTTPError
2+
from urllib.parse import urlparse
33
from bs4 import BeautifulSoup
44
import re
5+
import datetime
56
import random
67

78
pages = set()
8-
9+
random.seed(datetime.datetime.now())
910

1011
#Retrieves a list of all Internal links found on a page
1112
def getInternalLinks(bsObj, includeUrl):
13+
includeUrl = urlparse(includeUrl).scheme+"://"+urlparse(includeUrl).netloc
1214
internalLinks = []
1315
#Finds all links that begin with a "/"
1416
for link in bsObj.findAll("a", href=re.compile("^(/|.*"+includeUrl+")")):
1517
if link.attrs['href'] is not None:
1618
if link.attrs['href'] not in internalLinks:
17-
internalLinks.append(link.attrs['href'])
19+
if(link.attrs['href'].startswith("/")):
20+
internalLinks.append(includeUrl+link.attrs['href'])
21+
else:
22+
internalLinks.append(link.attrs['href'])
1823
return internalLinks
1924

2025
#Retrieves a list of all external links found on a page
2126
def getExternalLinks(bsObj, excludeUrl):
22-
excludeUrl = splitAddress(excludeUrl)[0]
2327
externalLinks = []
2428
#Finds all links that start with "http" or "www" that do
2529
#not contain the current URL
26-
for link in bsObj.findAll("a", href=re.compile("^(http|www)((?!"+excludeUrl+").)*$")):
27-
if link.attrs['href'] is not None and len(link.attrs['href']) != 0:
30+
for link in bsObj.findAll("a", href=re.compile(
31+
"^(http|www)((?!"+excludeUrl+").)*$")):
32+
if link.attrs['href'] is not None:
2833
if link.attrs['href'] not in externalLinks:
2934
externalLinks.append(link.attrs['href'])
3035
return externalLinks
3136

32-
def splitAddress(address):
33-
address = address.replace("www", "")
34-
addressParts = address.replace("http://", "").split("/")
35-
return addressParts
36-
37-
38-
def followExternalOnly(bsObj, url):
39-
externalLinks = getExternalLinks(bsObj, splitAddress(url)[0])
37+
def getRandomExternalLink(startingPage):
38+
html = urlopen(startingPage)
39+
bsObj = BeautifulSoup(html)
40+
externalLinks = getExternalLinks(bsObj, urlparse(startingPage).netloc)
4041
if len(externalLinks) == 0:
41-
#Only internal links here. Get another internal page and try again
42-
internalLinks = getInternalLinks(bsObj, url)
43-
bsObj = urlopen(internalLinks[random.randint(0, len(internalLinks)-1)])
44-
return followExternalOnly(bsObj, url)
42+
domain = domain = urlparse(startingPage).scheme+"://"+urlparse(startingPage).netloc
43+
internalLinks = getInternalLinks(bsObj, domain)
44+
return getNextExternalLink(internalLinks[random.randint(0,len(internalLinks)-1)])
4545
else:
46-
randomExternal = externalLinks[random.randint(0, len(externalLinks)-1)]
47-
try:
48-
nextBsObj = BeautifulSoup(urlopen(randomExternal))
49-
print(randomExternal)
50-
return [nextBsObj, url]
51-
except HTTPError:
52-
#Try again
53-
print("Encountered error at "+randomExternal+"! Trying again")
54-
return followExternalOnly(bsObj, url)
46+
return externalLinks[random.randint(0, len(externalLinks)-1)]
5547

48+
def followExternalOnly(startingSite):
49+
externalLink = getRandomExternalLink(startingSite)
50+
print("Random external link is: "+externalLink)
51+
followExternalOnly(externalLink)
5652

57-
58-
#Collects a list of all external URLs found on the site
59-
allExtLinks = set()
60-
allIntLinks = set()
61-
def getAllExternalLinks(siteUrl):
62-
html = urlopen(siteUrl)
63-
bsObj = BeautifulSoup(html)
64-
internalLinks = getInternalLinks(bsObj,splitAddress(siteUrl)[0])
65-
externalLinks = getExternalLinks(bsObj,splitAddress(siteUrl)[0])
66-
for link in externalLinks:
67-
if link not in allExtLinks:
68-
allExtLinks.add(link)
69-
print(link)
70-
for link in internalLinks:
71-
if link not in allIntLinks:
72-
print("About to get link: "+link)
73-
allIntLinks.add(link)
74-
getAllExternalLinks(link)
75-
76-
url = "http://oreilly.com"
77-
bsObj = BeautifulSoup(urlopen(url))
78-
#Following random external links for 10 steps
79-
for i in range(10):
80-
bsObj, url = followExternalOnly(bsObj, url)
81-
82-
#Get a collection of all external links on orielly.com
83-
getAllExternalLinks("http://oreilly.com")
53+
followExternalOnly("http://oreilly.com")
8454

8555

chapter3/5-getAllExternalLinks.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
from urllib.request import urlopen
2+
from urllib.parse import urlparse
3+
from bs4 import BeautifulSoup
4+
import re
5+
import datetime
6+
import random
7+
8+
pages = set()
# Seed from the operating system's entropy source (or the clock as a
# fallback). Passing a datetime object is rejected with TypeError on
# Python 3.11+, where only None/int/float/str/bytes/bytearray are accepted.
random.seed()
10+
11+
#Retrieves a list of all Internal links found on a page
12+
def getInternalLinks(bsObj, includeUrl):
    """Return the de-duplicated internal links found on a parsed page.

    Args:
        bsObj: Parsed page; must provide BeautifulSoup's ``findAll``.
        includeUrl: Absolute URL on the site being crawled. Only its scheme
            and host are used.

    Returns:
        A list of absolute URLs in document order. Site-relative hrefs
        ("/path") are prefixed with ``scheme://host``.
    """
    site = urlparse(includeUrl)
    includeUrl = site.scheme + "://" + site.netloc
    # Escape the site root: an unescaped "." matches any character, which
    # lets look-alike hosts pass as internal.
    pattern = re.compile("^(/|.*" + re.escape(includeUrl) + ")")
    internalLinks = []
    for link in bsObj.findAll("a", href=pattern):
        href = link.attrs['href']
        if href is None:
            continue
        if href.startswith("//"):
            # Scheme-relative URL: it names its own host, so it is internal
            # only when that host is this site.
            if urlparse(href).netloc != site.netloc:
                continue
            href = site.scheme + ":" + href
        elif href.startswith("/"):
            href = includeUrl + href
        # Compare the normalised form, so a relative link and its absolute
        # spelling (or a repeated relative link) are stored once.
        if href not in internalLinks:
            internalLinks.append(href)
    return internalLinks
24+
25+
#Retrieves a list of all external links found on a page
26+
def getExternalLinks(bsObj, excludeUrl):
    """Return the de-duplicated external links found on a parsed page.

    A link is external when its href starts with "http" or "www" and does
    not contain the excluded host anywhere in it.

    Args:
        bsObj: Parsed page; must provide BeautifulSoup's ``findAll``.
        excludeUrl: Host to treat as "this site". Either a bare host
            ("example.com") or a full URL ("http://example.com/page").

    Returns:
        A list of href strings in document order.
    """
    # Reduce a full URL to its host so "https://" and "http://" variants of
    # the same site are both excluded; a bare host has no netloc and is
    # used as given.
    host = urlparse(excludeUrl).netloc or excludeUrl
    # Escape the host: an unescaped "." matches any character and would
    # wrongly exclude unrelated hosts such as "axcom.org" for "a.com".
    pattern = re.compile("^(http|www)((?!" + re.escape(host) + ").)*$")
    externalLinks = []
    for link in bsObj.findAll("a", href=pattern):
        href = link.attrs['href']
        if href is not None and href not in externalLinks:
            externalLinks.append(href)
    return externalLinks
35+
36+
def getRandomExternalLink(startingPage):
    """Return one randomly chosen external link reachable from a page.

    Fetches ``startingPage`` and picks among its external links. When the
    page has none, a random internal link is followed and the search is
    repeated from there.

    Args:
        startingPage: Absolute URL of the page to start from.

    Returns:
        The href of an external link, as a string.

    Raises:
        ValueError: If the page has neither external nor internal links.
    """
    # Close the HTTP response as soon as the document has been parsed.
    with urlopen(startingPage) as html:
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        bsObj = BeautifulSoup(html, "html.parser")
    site = urlparse(startingPage)
    externalLinks = getExternalLinks(bsObj, site.netloc)
    if externalLinks:
        return random.choice(externalLinks)

    domain = site.scheme + "://" + site.netloc
    internalLinks = getInternalLinks(bsObj, domain)
    if not internalLinks:
        raise ValueError("No links to follow from " + startingPage)
    # Recurse into this function; the previously referenced
    # getNextExternalLink is not defined anywhere in this file.
    return getRandomExternalLink(random.choice(internalLinks))
46+
47+
def followExternalOnly(startingSite, maxHops=None):
    """Hop from site to site along randomly chosen external links.

    Each hop is printed. The walk is a loop rather than self-recursion, so
    a long walk cannot exhaust the interpreter's recursion limit.

    Args:
        startingSite: Absolute URL where the walk begins.
        maxHops: Optional cap on the number of hops. ``None`` (the default)
            keeps walking until an exception is raised.
    """
    currentSite = startingSite
    hops = 0
    while maxHops is None or hops < maxHops:
        currentSite = getRandomExternalLink(currentSite)
        print("Random external link is: "+currentSite)
        hops += 1
51+
52+
#Collects a list of all external URLs found on the site
allExtLinks = set()
allIntLinks = set()
def getAllExternalLinks(siteUrl):
    """Crawl a site from ``siteUrl`` and print each new external link.

    Every external link seen for the first time is added to the module-level
    ``allExtLinks`` set and printed. Every internal link seen for the first
    time is added to ``allIntLinks`` and crawled in turn. Pages are visited
    depth-first from an explicit stack, so the crawl depth is not limited by
    the interpreter's recursion limit.

    Args:
        siteUrl: Absolute URL of the first page to crawl.
    """
    pending = [siteUrl]
    while pending:
        pageUrl = pending.pop()
        site = urlparse(pageUrl)
        domain = site.scheme + "://" + site.netloc
        # Close the HTTP response as soon as the document has been parsed.
        with urlopen(pageUrl) as html:
            bsObj = BeautifulSoup(html, "html.parser")
        internalLinks = getInternalLinks(bsObj, domain)
        # Exclude by host only, so https:// and http:// links to this same
        # site are not reported as external.
        externalLinks = getExternalLinks(bsObj, site.netloc)

        for link in externalLinks:
            if link not in allExtLinks:
                allExtLinks.add(link)
                print(link)

        unseen = []
        for link in internalLinks:
            if link not in allIntLinks:
                allIntLinks.add(link)
                unseen.append(link)
        # Push in reverse so the first link on the page is crawled first.
        pending.extend(reversed(unseen))
70+
71+
# Run the demos only when executed as a script, so importing this module for
# its helper functions does not start a network crawl.
if __name__ == "__main__":
    # NOTE: followExternalOnly keeps following links and does not return
    # normally, so the full-site crawl below is reached only if that call
    # is bounded or removed.
    followExternalOnly("http://oreilly.com")

    # Mark the start page as seen so the crawl does not fetch it twice.
    allIntLinks.add("http://oreilly.com")
    getAllExternalLinks("http://oreilly.com")

chapter4/6-wikiHistories.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from urllib.request import urlopen
22
from bs4 import BeautifulSoup
33
import datetime
4+
import json
45
import random
56
import re
67

0 commit comments

Comments
 (0)