Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
chapter 4
  • Loading branch information
november11th committed Jan 16, 2025
commit be070c3215a1c605b5c1cc34f76ae0af5cacba91
181 changes: 181 additions & 0 deletions .ipynb_checkpoints/Chapter04_FirstWebScraper-checkpoint.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Writing Your First Web Scraper"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"b'<html>\\n<head>\\n<title>A Useful Page</title>\\n</head>\\n<body>\\n<h1>An Interesting Title</h1>\\n<div>\\nLorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\\n</div>\\n</body>\\n</html>\\n'\n"
]
}
],
"source": [
"from urllib.request import urlopen\n",
"\n",
"html = urlopen('http://pythonscraping.com/pages/page1.html')\n",
"print(html.read())"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<h1>An Interesting Title</h1>\n"
]
}
],
"source": [
"from urllib.request import urlopen\n",
"from bs4 import BeautifulSoup\n",
"\n",
"html = urlopen('http://www.pythonscraping.com/pages/page1.html')\n",
"bs = BeautifulSoup(html.read(), 'html.parser')\n",
"print(bs.h1)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from urllib.request import urlopen\n",
"from bs4 import BeautifulSoup\n",
"\n",
"html = urlopen('https://en.wikipedia.org/wiki/Iron_Gwazi')\n",
"bs = BeautifulSoup(html.read(), 'html.parser')\n",
"# 'class':['mw-file-description']\n",
"#bs.find_all(attrs={'class': ['mw-ui-icon-wikimedia-listBullet', 'vector-icon']})\n",
"\n",
    "bs.find_all(class_='mw-ui-icon-wikimedia-listBullet')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The server could not be found!\n"
]
}
],
"source": [
"from urllib.request import urlopen\n",
"from urllib.error import HTTPError\n",
"from urllib.error import URLError\n",
"\n",
"try:\n",
" html = urlopen(\"https://pythonscrapingthisurldoesnotexist.com\")\n",
"except HTTPError as e:\n",
" print(\"The server returned an HTTP error\")\n",
"except URLError as e:\n",
" print(\"The server could not be found!\")\n",
"else:\n",
" print(html.read())"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<h1>An Interesting Title</h1>\n"
]
}
],
"source": [
"from urllib.request import urlopen\n",
"from urllib.error import HTTPError\n",
"from bs4 import BeautifulSoup\n",
"\n",
"\n",
"def getTitle(url):\n",
" try:\n",
" html = urlopen(url)\n",
" except HTTPError as e:\n",
" return None\n",
" try:\n",
    "        bsObj = BeautifulSoup(html.read(), \"html.parser\")\n",
" title = bsObj.body.h1\n",
" except AttributeError as e:\n",
" return None\n",
" return title\n",
"\n",
"\n",
"title = getTitle(\"http://www.pythonscraping.com/pages/page1.html\")\n",
"if title is None:\n",
" print(\"Title could not be found\")\n",
"else:\n",
" print(title)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
21 changes: 12 additions & 9 deletions Chapter04_FirstWebScraper.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 2,
"metadata": {},
"outputs": [
{
Expand All @@ -51,7 +51,7 @@
},
{
"cell_type": "code",
"execution_count": 34,
"execution_count": 3,
"metadata": {},
"outputs": [
{
Expand All @@ -60,7 +60,7 @@
"[]"
]
},
"execution_count": 34,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -79,7 +79,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 4,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -107,7 +107,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 7,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -138,7 +138,7 @@
"\n",
"\n",
"title = getTitle(\"http://www.pythonscraping.com/pages/page1.html\")\n",
"if title == None:\n",
"if title is None:\n",
" print(\"Title could not be found\")\n",
"else:\n",
" print(title)"
Expand All @@ -148,7 +148,10 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"outputs": [],
"source": []
Expand All @@ -170,9 +173,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}