|
1022 | 1022 | " return\n", |
1023 | 1023 | " title = self.safeGet(bs, site.titleTag)\n", |
1024 | 1024 | " body = self.safeGet(bs, site.bodyTag)\n", |
1025 | | - " if title != \"\" and body != \"\":\n", |
| 1025 | + " if title != '' and body != '':\n", |
1026 | 1026 | " content = Content(topic, title, body, url)\n", |
1027 | 1027 | " content.print()\n", |
1028 | 1028 | "\n", |
|
1042 | 1042 | " sites.append(Website(row[0], row[1], row[2],\n", |
1043 | 1043 | " row[3], row[4], row[5], row[6], row[7]))\n", |
1044 | 1044 | "\n", |
1045 | | - "topics = [\"python\", \"data science\"]\n", |
| 1045 | + "topics = ['python', 'data science']\n", |
1046 | 1046 | "for topic in topics:\n", |
1047 | 1047 | " print(\"GETTING INFO ABOUT: \" + topic)\n", |
1048 | 1048 | " for targetSite in sites:\n", |
|
1246 | 1246 | }, |
1247 | 1247 | { |
1248 | 1248 | "cell_type": "code", |
1249 | | - "execution_count": null, |
| 1249 | + "execution_count": 1, |
1250 | 1250 | "metadata": {}, |
1251 | 1251 | "outputs": [], |
1252 | 1252 | "source": [ |
1253 | 1253 | "class Website:\n", |
1254 | 1254 | " \"\"\"Common base class for all articles/pages\"\"\"\n", |
1255 | 1255 | "\n", |
1256 | | - " def __init__(self, type, name, url, searchUrl, resultListing, resultUrl, absoluteUrl, titleTag, bodyTag):\n", |
| 1256 | + " def __init__(self, name, url, titleTag, bodyTag):\n", |
1257 | 1257 | " self.name = name\n", |
1258 | 1258 | " self.url = url\n", |
1259 | 1259 | " self.titleTag = titleTag\n", |
1260 | 1260 | " self.bodyTag = bodyTag\n", |
1261 | | - " self.pageType = pageType" |
| 1261 | + " " |
1262 | 1262 | ] |
1263 | 1263 | }, |
1264 | 1264 | { |
1265 | 1265 | "cell_type": "code", |
1266 | | - "execution_count": null, |
| 1266 | + "execution_count": 2, |
1267 | 1267 | "metadata": {}, |
1268 | 1268 | "outputs": [], |
1269 | 1269 | "source": [ |
|
1284 | 1284 | " self.bodyTag = bodyTag\n", |
1285 | 1285 | " self.dateTag = dateTag" |
1286 | 1286 | ] |
| 1287 | + }, |
| 1288 | + { |
| 1289 | + "cell_type": "code", |
| 1290 | + "execution_count": null, |
| 1291 | + "metadata": {}, |
| 1292 | + "outputs": [], |
| 1293 | + "source": [ |
| 1294 | + "\n", |
| 1295 | + "\n", |
| 1296 | + "def parsePage(url):\n", |
| 1297 | + " \n", |
| 1298 | + " if '/ideas/' in url:\n", |
| 1299 | + " \n", |
| 1300 | + "\n", |
 | 1301 | + "oreilly = Website('O\\'Reilly', 'https://oreilly.com', 'h1', '') " |
| 1302 | + ] |
1287 | 1303 | } |
1288 | 1304 | ], |
1289 | 1305 | "metadata": { |
|
0 commit comments