Skip to content

Commit 638f1ee

Browse files
committed
Moving v2 to root directory
1 parent 1e29b28 commit 638f1ee

29 files changed

+69969
-0
lines changed

Chapter01_BeginningToScrape.ipynb

Lines changed: 143 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,143 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"metadata": {},
7+
"outputs": [
8+
{
9+
"name": "stdout",
10+
"output_type": "stream",
11+
"text": [
12+
"b'<html>\\n<head>\\n<title>A Useful Page</title>\\n</head>\\n<body>\\n<h1>An Interesting Title</h1>\\n<div>\\nLorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\\n</div>\\n</body>\\n</html>\\n'\n"
13+
]
14+
}
15+
],
16+
"source": [
17+
"from urllib.request import urlopen\n",
18+
"\n",
19+
"with urlopen('http://pythonscraping.com/pages/page1.html') as html:  # response is closed when the block exits\n",
20+
"    print(html.read())"
21+
]
22+
},
23+
{
24+
"cell_type": "code",
25+
"execution_count": 1,
26+
"metadata": {},
27+
"outputs": [
28+
{
29+
"name": "stdout",
30+
"output_type": "stream",
31+
"text": [
32+
"<h1>An Interesting Title</h1>\n"
33+
]
34+
}
35+
],
36+
"source": [
37+
"from urllib.request import urlopen\n",
38+
"from bs4 import BeautifulSoup\n",
39+
"\n",
40+
"with urlopen('http://www.pythonscraping.com/pages/page1.html') as html:  # response is closed when the block exits\n",
41+
"    bs = BeautifulSoup(html.read(), 'html.parser')\n",
42+
"print(bs.h1)"
43+
]
44+
},
45+
{
46+
"cell_type": "code",
47+
"execution_count": 10,
48+
"metadata": {},
49+
"outputs": [
50+
{
51+
"name": "stdout",
52+
"output_type": "stream",
53+
"text": [
54+
"The server could not be found!\n"
55+
]
56+
}
57+
],
58+
"source": [
59+
"from urllib.request import urlopen\n",
60+
"from urllib.error import HTTPError\n",
61+
"from urllib.error import URLError\n",
62+
"\n",
63+
"try:\n",
64+
"    html = urlopen('https://pythonscrapingthisurldoesnotexist.com')\n",
65+
"except HTTPError as err:  # listed first: HTTPError is a subclass of URLError\n",
66+
"    print('The server returned an HTTP error')\n",
67+
"except URLError as err:\n",
68+
"    print('The server could not be found!')\n",
69+
"else:\n",
70+
"    print(html.read())"
71+
]
72+
},
73+
{
74+
"cell_type": "code",
75+
"execution_count": 12,
76+
"metadata": {},
77+
"outputs": [
78+
{
79+
"name": "stdout",
80+
"output_type": "stream",
81+
"text": [
82+
"<h1>An Interesting Title</h1>\n"
83+
]
84+
}
85+
],
86+
"source": [
87+
"from urllib.request import urlopen\n",
88+
"from urllib.error import HTTPError\n",
89+
"from bs4 import BeautifulSoup\n",
90+
"\n",
91+
"\n",
92+
"def getTitle(url):\n",
93+
"    \"\"\"Return the first <h1> in the page's <body>, or None on an HTTP error or when no such tag exists.\"\"\"\n",
94+
"    try:\n",
95+
"        html = urlopen(url)\n",
96+
"    except HTTPError:\n",
97+
"        return None\n",
98+
"    try:\n",
99+
"        with html:  # close the response once it has been parsed\n",
100+
"            return BeautifulSoup(html.read(), \"lxml\").body.h1\n",
101+
"    except AttributeError:  # the document has no <body>\n",
102+
"        return None\n",
103+
"\n",
104+
"\n",
105+
"title = getTitle(\"http://www.pythonscraping.com/pages/page1.html\")\n",
106+
"if title is None:  # identity check is the idiomatic test for None (PEP 8)\n",
107+
"    print(\"Title could not be found\")\n",
108+
"else:\n",
109+
"    print(title)"
110+
]
111+
},
112+
{
113+
"cell_type": "code",
114+
"execution_count": null,
115+
"metadata": {
116+
"collapsed": true
117+
},
118+
"outputs": [],
119+
"source": []
120+
}
121+
],
122+
"metadata": {
123+
"kernelspec": {
124+
"display_name": "Python 3",
125+
"language": "python",
126+
"name": "python3"
127+
},
128+
"language_info": {
129+
"codemirror_mode": {
130+
"name": "ipython",
131+
"version": 3
132+
},
133+
"file_extension": ".py",
134+
"mimetype": "text/x-python",
135+
"name": "python",
136+
"nbconvert_exporter": "python",
137+
"pygments_lexer": "ipython3",
138+
"version": "3.6.1"
139+
}
140+
},
141+
"nbformat": 4,
142+
"nbformat_minor": 2
143+
}

0 commit comments

Comments
 (0)