Skip to content

Commit 71ad2c9

Browse files
committed
Added chapter 14 and 16 code, formatting changes
1 parent 81a3264 commit 71ad2c9

File tree

10 files changed

+25070
-43
lines changed

10 files changed

+25070
-43
lines changed

v2/Chapter04_CrawlingModels.ipynb

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1022,7 +1022,7 @@
10221022
" return\n",
10231023
" title = self.safeGet(bs, site.titleTag)\n",
10241024
" body = self.safeGet(bs, site.bodyTag)\n",
1025-
" if title != \"\" and body != \"\":\n",
1025+
" if title != '' and body != '':\n",
10261026
" content = Content(topic, title, body, url)\n",
10271027
" content.print()\n",
10281028
"\n",
@@ -1042,7 +1042,7 @@
10421042
" sites.append(Website(row[0], row[1], row[2],\n",
10431043
" row[3], row[4], row[5], row[6], row[7]))\n",
10441044
"\n",
1045-
"topics = [\"python\", \"data science\"]\n",
1045+
"topics = ['python', 'data science']\n",
10461046
"for topic in topics:\n",
10471047
" print(\"GETTING INFO ABOUT: \" + topic)\n",
10481048
" for targetSite in sites:\n",
@@ -1246,24 +1246,24 @@
12461246
},
12471247
{
12481248
"cell_type": "code",
1249-
"execution_count": null,
1249+
"execution_count": 1,
12501250
"metadata": {},
12511251
"outputs": [],
12521252
"source": [
12531253
"class Website:\n",
12541254
" \"\"\"Common base class for all articles/pages\"\"\"\n",
12551255
"\n",
1256-
" def __init__(self, type, name, url, searchUrl, resultListing, resultUrl, absoluteUrl, titleTag, bodyTag):\n",
1256+
" def __init__(self, name, url, titleTag, bodyTag):\n",
12571257
" self.name = name\n",
12581258
" self.url = url\n",
12591259
" self.titleTag = titleTag\n",
12601260
" self.bodyTag = bodyTag\n",
1261-
" self.pageType = pageType"
1261+
" "
12621262
]
12631263
},
12641264
{
12651265
"cell_type": "code",
1266-
"execution_count": null,
1266+
"execution_count": 2,
12671267
"metadata": {},
12681268
"outputs": [],
12691269
"source": [
@@ -1284,6 +1284,22 @@
12841284
" self.bodyTag = bodyTag\n",
12851285
" self.dateTag = dateTag"
12861286
]
1287+
},
1288+
{
1289+
"cell_type": "code",
1290+
"execution_count": null,
1291+
"metadata": {},
1292+
"outputs": [],
1293+
"source": [
1294+
"\n",
1295+
"\n",
1296+
"def parsePage(url):\n",
1297+
" \n",
1298+
" if '/ideas/' in url:\n",
1299+
" \n",
1300+
"\n",
1301+
"oreilly = Website('O\\'Reilly', 'https://oreilly.com', 'h1' '') "
1302+
]
12871303
}
12881304
],
12891305
"metadata": {

0 commit comments

Comments
 (0)