Skip to content

Commit d944b00

Browse files
author
remitchell
committed
Updated code to remove PhantomJS, add headless Chrome
1 parent 7e0e567 commit d944b00

File tree

4 files changed: +319 −117 lines changed

Chapter11-JavaScript.ipynb

Lines changed: 82 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -4,13 +4,78 @@
44
"cell_type": "code",
55
"execution_count": 1,
66
"metadata": {},
7+
"outputs": [
8+
{
9+
"name": "stdout",
10+
"output_type": "stream",
11+
"text": [
12+
"Here is some important text you want to retrieve!\n",
13+
"A button to click!\n"
14+
]
15+
}
16+
],
17+
"source": [
18+
"from selenium import webdriver\n",
19+
"from selenium.webdriver.chrome.options import Options\n",
20+
"import time\n",
21+
"\n",
22+
"chrome_options = Options()\n",
23+
"chrome_options.add_argument(\"--headless\")\n",
24+
"driver = webdriver.Chrome(\n",
25+
" executable_path='drivers/chromedriver', \n",
26+
" options=chrome_options)\n",
27+
"driver.get('http://pythonscraping.com/pages/javascript/ajaxDemo.html')\n",
28+
"time.sleep(3)\n",
29+
"print(driver.find_element_by_id('content').text)\n",
30+
"driver.close()"
31+
]
32+
},
33+
{
34+
"cell_type": "code",
35+
"execution_count": 2,
36+
"metadata": {},
37+
"outputs": [
38+
{
39+
"name": "stdout",
40+
"output_type": "stream",
41+
"text": [
42+
"Here is some important text you want to retrieve!\n",
43+
"A button to click!\n"
44+
]
45+
}
46+
],
47+
"source": [
48+
"from selenium import webdriver\n",
49+
"from selenium.webdriver.common.by import By\n",
50+
"from selenium.webdriver.support.ui import WebDriverWait\n",
51+
"from selenium.webdriver.support import expected_conditions as EC\n",
52+
"\n",
53+
"chrome_options = Options()\n",
54+
"chrome_options.add_argument(\"--headless\")\n",
55+
"driver = webdriver.Chrome(\n",
56+
" executable_path='drivers/chromedriver',\n",
57+
" options=chrome_options)\n",
58+
"\n",
59+
"driver.get('http://pythonscraping.com/pages/javascript/ajaxDemo.html')\n",
60+
"try:\n",
61+
" element = WebDriverWait(driver, 10).until(\n",
62+
" EC.presence_of_element_located((By.ID, 'loadedButton')))\n",
63+
"finally:\n",
64+
" print(driver.find_element_by_id('content').text)\n",
65+
" driver.close()"
66+
]
67+
},
68+
{
69+
"cell_type": "code",
70+
"execution_count": 3,
71+
"metadata": {},
772
"outputs": [
873
{
974
"name": "stdout",
1075
"output_type": "stream",
1176
"text": [
1277
"Timing out after 10 seconds and returning\n",
13-
"<html><head>\n",
78+
"<html xmlns=\"http://www.w3.org/1999/xhtml\"><head>\n",
1479
"<title>The Destination Page!</title>\n",
1580
"\n",
1681
"</head>\n",
@@ -23,9 +88,10 @@
2388
],
2489
"source": [
2590
"from selenium import webdriver\n",
26-
"import time\n",
91+
"from selenium.webdriver.chrome.options import Options\n",
2792
"from selenium.webdriver.remote.webelement import WebElement\n",
2893
"from selenium.common.exceptions import StaleElementReferenceException\n",
94+
"import time\n",
2995
"\n",
3096
"def waitForLoad(driver):\n",
3197
" elem = driver.find_element_by_tag_name(\"html\")\n",
@@ -40,16 +106,20 @@
40106
" elem == driver.find_element_by_tag_name(\"html\")\n",
41107
" except StaleElementReferenceException:\n",
42108
" return\n",
43-
"\n",
44-
"driver = webdriver.PhantomJS(executable_path='drivers/phantomjs/phantomjs-2.1.1-macosx/bin/phantomjs')\n",
109+
"chrome_options = Options()\n",
110+
"chrome_options.add_argument(\"--headless\")\n",
111+
"driver = webdriver.Chrome(\n",
112+
" executable_path='drivers/chromedriver',\n",
113+
" options=chrome_options)\n",
45114
"driver.get(\"http://pythonscraping.com/pages/javascript/redirectDemo1.html\")\n",
46115
"waitForLoad(driver)\n",
47-
"print(driver.page_source)"
116+
"print(driver.page_source)\n",
117+
"driver.close()"
48118
]
49119
},
50120
{
51121
"cell_type": "code",
52-
"execution_count": 6,
122+
"execution_count": 9,
53123
"metadata": {},
54124
"outputs": [
55125
{
@@ -63,11 +133,15 @@
63133
"source": [
64134
"from selenium.webdriver.common.by import By\n",
65135
"from selenium.webdriver.support.ui import WebDriverWait\n",
136+
"from selenium.webdriver.chrome.options import Options\n",
66137
"from selenium.webdriver.support import expected_conditions as EC\n",
67138
"from selenium.common.exceptions import TimeoutException\n",
68139
"\n",
69-
"driver = webdriver.PhantomJS(\n",
70-
" executable_path='drivers/phantomjs/phantomjs-2.1.1-macosx/bin/phantomjs')\n",
140+
"chrome_options = Options()\n",
141+
"chrome_options.add_argument(\"--headless\")\n",
142+
"driver = webdriver.Chrome(\n",
143+
" executable_path='drivers/chromedriver', \n",
144+
" options=chrome_options)\n",
71145
"driver.get('http://pythonscraping.com/pages/javascript/redirectDemo1.html')\n",
72146
"try:\n",
73147
" bodyElement = WebDriverWait(driver, 15).until(EC.presence_of_element_located(\n",

Chapter14-ScrapingTraps.ipynb

Lines changed: 27 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -53,55 +53,67 @@
5353
},
5454
{
5555
"cell_type": "code",
56-
"execution_count": 7,
56+
"execution_count": 1,
5757
"metadata": {},
5858
"outputs": [
5959
{
6060
"name": "stdout",
6161
"output_type": "stream",
6262
"text": [
63-
"[{'domain': '.pythonscraping.com', 'expires': 'Sat, 27 Jan 2018 21:46:48 GMT', 'expiry': 1517089608, 'httponly': False, 'name': '_gat', 'path': '/', 'secure': False, 'value': '1'}, {'domain': '.pythonscraping.com', 'expires': 'Sun, 28 Jan 2018 21:45:48 GMT', 'expiry': 1517175948, 'httponly': False, 'name': '_gid', 'path': '/', 'secure': False, 'value': 'GA1.2.1903552212.1517089549'}, {'domain': '.pythonscraping.com', 'expires': 'Mon, 27 Jan 2020 21:45:48 GMT', 'expiry': 1580161548, 'httponly': False, 'name': '_ga', 'path': '/', 'secure': False, 'value': 'GA1.2.1362930272.1517089549'}, {'domain': 'pythonscraping.com', 'httponly': False, 'name': 'has_js', 'path': '/', 'secure': False, 'value': '1'}]\n"
63+
"[{'domain': '.pythonscraping.com', 'expiry': 1540863803, 'httpOnly': False, 'name': '_gid', 'path': '/', 'secure': False, 'value': 'GA1.2.358315088.1540777403'}, {'domain': '.pythonscraping.com', 'expiry': 1603849403, 'httpOnly': False, 'name': '_ga', 'path': '/', 'secure': False, 'value': 'GA1.2.1302028449.1540777403'}, {'domain': '.pythonscraping.com', 'expiry': 1540777463, 'httpOnly': False, 'name': '_gat', 'path': '/', 'secure': False, 'value': '1'}, {'domain': 'pythonscraping.com', 'httpOnly': False, 'name': 'has_js', 'path': '/', 'secure': False, 'value': '1'}]\n"
6464
]
6565
}
6666
],
6767
"source": [
6868
"from selenium import webdriver\n",
69-
"driver = webdriver.PhantomJS(executable_path='drivers/phantomjs/phantomjs-2.1.1-macosx/bin/phantomjs')\n",
69+
"from selenium.webdriver.chrome.options import Options\n",
70+
"chrome_options = Options()\n",
71+
"chrome_options.add_argument(\"--headless\")\n",
72+
"driver = webdriver.Chrome(\n",
73+
" executable_path='drivers/chromedriver', \n",
74+
" chrome_options=chrome_options)\n",
7075
"driver.get('http://pythonscraping.com')\n",
7176
"driver.implicitly_wait(1)\n",
7277
"print(driver.get_cookies())"
7378
]
7479
},
7580
{
7681
"cell_type": "code",
77-
"execution_count": 12,
82+
"execution_count": 6,
7883
"metadata": {},
7984
"outputs": [
8085
{
8186
"name": "stdout",
8287
"output_type": "stream",
8388
"text": [
84-
"[{'domain': '.pythonscraping.com', 'expires': 'Sun, 28 Jan 2018 21:51:09 GMT', 'expiry': 1517176269, 'httponly': False, 'name': '_gid', 'path': '/', 'secure': False, 'value': 'GA1.2.2130634883.1517089865'}, {'domain': '.pythonscraping.com', 'expires': 'Mon, 27 Jan 2020 21:51:09 GMT', 'expiry': 1580161869, 'httponly': False, 'name': '_ga', 'path': '/', 'secure': False, 'value': 'GA1.2.1559626153.1517089865'}, {'domain': 'pythonscraping.com', 'httponly': False, 'name': 'has_js', 'path': '/', 'secure': False, 'value': '1'}, {'domain': '.pythonscraping.com', 'httponly': False, 'name': 'has_js', 'path': '/', 'secure': False, 'value': '1'}, {'domain': '.pythonscraping.com', 'expires': 'Sat, 27 Jan 2018 21:52:05 GMT', 'expiry': 1517107925, 'httponly': False, 'name': '_gat', 'path': '/', 'secure': False, 'value': '1'}]\n"
89+
"[{'domain': '.pythonscraping.com', 'expiry': 1540864964, 'httpOnly': False, 'name': '_gid', 'path': '/', 'secure': False, 'value': 'GA1.2.819982224.1540778565'}, {'domain': '.pythonscraping.com', 'expiry': 1603850564, 'httpOnly': False, 'name': '_ga', 'path': '/', 'secure': False, 'value': 'GA1.2.864755518.1540778565'}, {'domain': '.pythonscraping.com', 'expiry': 1540778624, 'httpOnly': False, 'name': '_gat', 'path': '/', 'secure': False, 'value': '1'}, {'domain': 'pythonscraping.com', 'httpOnly': False, 'name': 'has_js', 'path': '/', 'secure': False, 'value': '1'}]\n",
90+
"[{'domain': 'pythonscraping.com', 'httpOnly': False, 'name': 'has_js', 'path': '/', 'secure': False, 'value': '1'}, {'domain': '.pythonscraping.com', 'expiry': 1540778624, 'httpOnly': False, 'name': '_gat', 'path': '/', 'secure': False, 'value': '1'}, {'domain': '.pythonscraping.com', 'expiry': 1603850569, 'httpOnly': False, 'name': '_ga', 'path': '/', 'secure': False, 'value': 'GA1.2.864755518.1540778565'}, {'domain': '.pythonscraping.com', 'expiry': 1540864969, 'httpOnly': False, 'name': '_gid', 'path': '/', 'secure': False, 'value': 'GA1.2.819982224.1540778565'}]\n"
8591
]
8692
}
8793
],
8894
"source": [
8995
"from selenium import webdriver\n",
96+
"from selenium.webdriver.chrome.options import Options\n",
97+
"\n",
98+
"chrome_options = Options()\n",
99+
"chrome_options.add_argument(\"--headless\")\n",
90100
"\n",
91-
"phantomPath = 'drivers/phantomjs/phantomjs-2.1.1-macosx/bin/phantomjs'\n",
92-
"driver = webdriver.PhantomJS(executable_path=phantomPath)\n",
101+
"driver = webdriver.Chrome(\n",
102+
" executable_path='drivers/chromedriver', \n",
103+
" chrome_options=chrome_options)\n",
93104
"driver.get('http://pythonscraping.com')\n",
94105
"driver.implicitly_wait(1)\n",
95106
"\n",
96107
"savedCookies = driver.get_cookies()\n",
97108
"print(savedCookies)\n",
98109
"\n",
99-
"driver2 = webdriver.PhantomJS(executable_path=phantomPath)\n",
110+
"driver2 = webdriver.Chrome(\n",
111+
" executable_path='drivers/chromedriver',\n",
112+
" chrome_options=chrome_options)\n",
113+
"\n",
100114
"driver2.get('http://pythonscraping.com')\n",
101115
"driver2.delete_all_cookies()\n",
102116
"for cookie in savedCookies:\n",
103-
" if not cookie['domain'].startswith('.'):\n",
104-
" cookie['domain'] = '.{}'.format(cookie['domain'])\n",
105117
" driver2.add_cookie(cookie)\n",
106118
"\n",
107119
"driver2.get('http://pythonscraping.com')\n",
@@ -111,7 +123,7 @@
111123
},
112124
{
113125
"cell_type": "code",
114-
"execution_count": 13,
126+
"execution_count": 5,
115127
"metadata": {},
116128
"outputs": [
117129
{
@@ -127,9 +139,11 @@
127139
"source": [
128140
"from selenium import webdriver\n",
129141
"from selenium.webdriver.remote.webelement import WebElement\n",
142+
"from selenium.webdriver.chrome.options import Options\n",
130143
"\n",
131-
"driver = webdriver.PhantomJS(\n",
132-
" executable_path='drivers/phantomjs/phantomjs-2.1.1-macosx/bin/phantomjs')\n",
144+
"driver = webdriver.Chrome(\n",
145+
" executable_path='drivers/chromedriver',\n",
146+
" chrome_options=chrome_options)\n",
133147
"driver.get('http://pythonscraping.com/pages/itsatrap.html')\n",
134148
"links = driver.find_elements_by_tag_name('a')\n",
135149
"for link in links:\n",

0 commit comments

Comments
 (0)