|
22 | 22 | " \"\"\"\n", |
23 | 23 | "\n", |
24 | 24 | " session = requests.Session()\n", |
25 | | - " headers = {\"User-Agent\":\"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36\", \"Accept\":\"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8\"}\n", |
| 25 | + " headers = {\"User-Agent\": \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36\",\n", |
| 26 | + " \"Accept\": \"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8\"}\n", |
26 | 27 | " try:\n", |
27 | 28 | " req = session.get(url, headers=headers)\n", |
28 | 29 | " except requests.exceptions.RequestException:\n", |
|
183 | 184 | "source": [ |
184 | 185 | "import requests\n", |
185 | 186 | "\n", |
| 187 | + "\n", |
186 | 188 | "class Content:\n", |
187 | 189 | " def __init__(self, url, title, body):\n", |
188 | 190 | " self.url = url\n", |
189 | 191 | " self.title = title\n", |
190 | 192 | " self.body = body\n", |
191 | 193 | "\n", |
| 194 | + "\n", |
192 | 195 | "def getPage(url):\n", |
193 | 196 | " req = requests.get(url)\n", |
194 | 197 | " return BeautifulSoup(req.text, 'html.parser')\n", |
195 | 198 | "\n", |
| 199 | + "\n", |
196 | 200 | "def scrapeNYTimes(url):\n", |
197 | 201 | " bs = getPage(url)\n", |
198 | 202 | " title = bs.find(\"h1\").text\n", |
199 | | - " lines = bs.find_all(\"p\", {\"class\":\"story-content\"})\n", |
| 203 | + " lines = bs.find_all(\"p\", {\"class\": \"story-content\"})\n", |
200 | 204 | " body = '\\n'.join([line.text for line in lines])\n", |
201 | 205 | " return Content(url, title, body)\n", |
202 | 206 | "\n", |
| 207 | + "\n", |
203 | 208 | "def scrapeBrookings(url):\n", |
204 | 209 | " bs = getPage(url)\n", |
205 | 210 | " title = bs.find(\"h1\").text\n", |
206 | | - " body = bs.find(\"div\",{\"class\",\"post-body\"}).text\n", |
| 211 | + " body = bs.find(\"div\", {\"class\", \"post-body\"}).text\n", |
207 | 212 | " return Content(url, title, body)\n", |
208 | 213 | "\n", |
| 214 | + "\n", |
209 | 215 | "url = 'https://www.brookings.edu/blog/future-development/2018/01/26/delivering-inclusive-urban-access-3-uncomfortable-truths/'\n", |
210 | 216 | "content = scrapeBrookings(url)\n", |
211 | 217 | "print('Title: {}'.format(content.title))\n", |
212 | 218 | "print('URL: {}\\n'.format(content.url))\n", |
213 | 219 | "print(content.body)\n", |
214 | 220 | "\n", |
215 | | - "url = \"https://www.nytimes.com/2018/01/25/opinion/sunday/silicon-valley-immortality.html\"\n", |
| 221 | + "url = 'https://www.nytimes.com/2018/01/25/opinion/sunday/silicon-valley-immortality.html'\n", |
216 | 222 | "content = scrapeNYTimes(url)\n", |
217 | 223 | "print('Title: {}'.format(content.title))\n", |
218 | 224 | "print('URL: {}\\n'.format(content.url))\n", |
|
229 | 235 | " \"\"\"\n", |
230 | 236 | " Common base class for all articles/pages\n", |
231 | 237 | " \"\"\"\n", |
232 | | - " \n", |
| 238 | + "\n", |
233 | 239 | " def __init__(self, url, title, body):\n", |
234 | 240 | " self.url = url\n", |
235 | 241 | " self.title = title\n", |
|
242 | 248 | " print(\"URL: {}\".format(self.url))\n", |
243 | 249 | " print(\"TITLE: {}\".format(self.title))\n", |
244 | 250 | " print(\"BODY:\\n{}\".format(self.body))\n", |
245 | | - " \n", |
| 251 | + "\n", |
246 | 252 | "\n", |
247 | 253 | "class Website:\n", |
248 | 254 | " \"\"\" \n", |
|
265 | 271 | "import requests\n", |
266 | 272 | "from bs4 import BeautifulSoup\n", |
267 | 273 | "\n", |
| 274 | + "\n", |
268 | 275 | "class Crawler:\n", |
269 | 276 | "\n", |
270 | 277 | " def getPage(self, url):\n", |
271 | 278 | " try:\n", |
272 | 279 | " req = requests.get(url)\n", |
273 | 280 | " except requests.exceptions.RequestException:\n", |
274 | | - " return None \n", |
| 281 | + " return None\n", |
275 | 282 | " return BeautifulSoup(req.text, 'html.parser')\n", |
276 | 283 | "\n", |
277 | 284 | " def safeGet(self, pageObj, selector):\n", |
|
800 | 807 | " websites.append(Website(row[0], row[1], row[2], row[3]))\n", |
801 | 808 | "\n", |
802 | 809 | "crawler.parse(websites[0], 'http://shop.oreilly.com/product/0636920028154.do')\n", |
803 | | - "crawler.parse(websites[1], 'http://www.reuters.com/article/us-usa-epa-pruitt-idUSKBN19W2D0')\n", |
804 | | - "crawler.parse(websites[2], 'https://www.brookings.edu/blog/techtank/2016/03/01/idea-to-retire-old-methods-of-policy-education/')\n", |
805 | | - "crawler.parse(websites[3], 'https://www.nytimes.com/2018/01/28/business/energy-environment/oil-boom.html')" |
| 810 | + "crawler.parse(\n", |
| 811 | + " websites[1], 'http://www.reuters.com/article/us-usa-epa-pruitt-idUSKBN19W2D0')\n", |
| 812 | + "crawler.parse(\n", |
| 813 | + " websites[2],\n", |
| 814 | + " 'https://www.brookings.edu/blog/techtank/2016/03/01/idea-to-retire-old-methods-of-policy-education/')\n", |
| 815 | + "crawler.parse(\n", |
| 816 | + " websites[3], \n", |
| 817 | + " 'https://www.nytimes.com/2018/01/28/business/energy-environment/oil-boom.html')" |
806 | 818 | ] |
807 | 819 | }, |
808 | 820 | { |
|
852 | 864 | " self.searchUrl = searchUrl\n", |
853 | 865 | " self.resultListing = resultListing\n", |
854 | 866 | " self.resultUrl = resultUrl\n", |
855 | | - " self.absoluteUrl=absoluteUrl\n", |
| 867 | + " self.absoluteUrl = absoluteUrl\n", |
856 | 868 | " self.titleTag = titleTag\n", |
857 | 869 | " self.bodyTag = bodyTag" |
858 | 870 | ] |
|
976 | 988 | "import requests\n", |
977 | 989 | "from bs4 import BeautifulSoup\n", |
978 | 990 | "\n", |
| 991 | + "\n", |
979 | 992 | "class Crawler:\n", |
980 | 993 | "\n", |
981 | 994 | " def getPage(self, url):\n", |
982 | 995 | " try:\n", |
983 | 996 | " req = requests.get(url)\n", |
984 | 997 | " except requests.exceptions.RequestException:\n", |
985 | | - " return None \n", |
| 998 | + " return None\n", |
986 | 999 | " return BeautifulSoup(req.text, 'html.parser')\n", |
987 | 1000 | "\n", |
988 | 1001 | " def safeGet(self, pageObj, selector):\n", |
|
995 | 1008 | " \"\"\"\n", |
996 | 1009 | " Searches a given website for a given topic and records all pages found\n", |
997 | 1010 | " \"\"\"\n", |
998 | | - " bs = self.getPage(site.searchUrl+topic)\n", |
| 1011 | + " bs = self.getPage(site.searchUrl + topic)\n", |
999 | 1012 | " searchResults = bs.select(site.resultListing)\n", |
1000 | 1013 | " for result in searchResults:\n", |
1001 | 1014 | " url = result.select(site.resultUrl)[0].attrs[\"href\"]\n", |
1002 | | - " #Check to see whether it's a relative or an absolute URL\n", |
| 1015 | + " # Check to see whether it's a relative or an absolute URL\n", |
1003 | 1016 | " if(site.absoluteUrl):\n", |
1004 | 1017 | " bs = self.getPage(url)\n", |
1005 | 1018 | " else:\n", |
1006 | | - " bs = self.getPage(site.url+url)\n", |
| 1019 | + " bs = self.getPage(site.url + url)\n", |
1007 | 1020 | " if bs is None:\n", |
1008 | 1021 | " print(\"Something was wrong with that page or URL. Skipping!\")\n", |
1009 | 1022 | " return\n", |
|
1017 | 1030 | "crawler = Crawler()\n", |
1018 | 1031 | "\n", |
1019 | 1032 | "siteData = [\n", |
1020 | | - " ['O\\'Reilly Media', 'http://oreilly.com', 'https://ssearch.oreilly.com/?q=', 'article.product-result','p.title a', True, 'h1', 'section#product-description'],\n", |
1021 | | - " ['Reuters', 'http://reuters.com', 'http://www.reuters.com/search/news?blob=', 'div.search-result-content', 'h3.search-result-title a', False, 'h1', 'div.StandardArticleBody_body_1gnLA'],\n", |
1022 | | - " ['Brookings', 'http://www.brookings.edu', 'https://www.brookings.edu/search/?s=', 'div.list-content article', 'h4.title a', True, 'h1', 'div.post-body']\n", |
1023 | | - " ]\n", |
| 1033 | + " ['O\\'Reilly Media', 'http://oreilly.com', 'https://ssearch.oreilly.com/?q=',\n", |
| 1034 | + " 'article.product-result', 'p.title a', True, 'h1', 'section#product-description'],\n", |
| 1035 | + " ['Reuters', 'http://reuters.com', 'http://www.reuters.com/search/news?blob=', 'div.search-result-content',\n", |
| 1036 | + " 'h3.search-result-title a', False, 'h1', 'div.StandardArticleBody_body_1gnLA'],\n", |
| 1037 | + " ['Brookings', 'http://www.brookings.edu', 'https://www.brookings.edu/search/?s=',\n", |
| 1038 | + " 'div.list-content article', 'h4.title a', True, 'h1', 'div.post-body']\n", |
| 1039 | + "]\n", |
1024 | 1040 | "sites = []\n", |
1025 | 1041 | "for row in siteData:\n", |
1026 | | - " sites.append(Website(row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7]))\n", |
| 1042 | + " sites.append(Website(row[0], row[1], row[2],\n", |
| 1043 | + " row[3], row[4], row[5], row[6], row[7]))\n", |
1027 | 1044 | "\n", |
1028 | | - "topics = [\"python\",\"data science\"]\n", |
| 1045 | + "topics = [\"python\", \"data science\"]\n", |
1029 | 1046 | "for topic in topics:\n", |
1030 | | - " print(\"GETTING INFO ABOUT: \"+topic)\n", |
| 1047 | + " print(\"GETTING INFO ABOUT: \" + topic)\n", |
1031 | 1048 | " for targetSite in sites:\n", |
1032 | 1049 | " crawler.search(topic, targetSite)" |
1033 | 1050 | ] |
|
1046 | 1063 | "outputs": [], |
1047 | 1064 | "source": [ |
1048 | 1065 | "class Website:\n", |
1049 | | - " \n", |
| 1066 | + "\n", |
1050 | 1067 | " def __init__(self, name, url, targetPattern, absoluteUrl, titleTag, bodyTag):\n", |
1051 | 1068 | " self.name = name\n", |
1052 | 1069 | " self.url = url\n", |
1053 | 1070 | " self.targetPattern = targetPattern\n", |
1054 | | - " self.absoluteUrl=absoluteUrl\n", |
| 1071 | + " self.absoluteUrl = absoluteUrl\n", |
1055 | 1072 | " self.titleTag = titleTag\n", |
1056 | 1073 | " self.bodyTag = bodyTag\n", |
1057 | | - " \n", |
| 1074 | + "\n", |
| 1075 | + "\n", |
1058 | 1076 | "class Content:\n", |
1059 | 1077 | "\n", |
1060 | 1078 | " def __init__(self, url, title, body):\n", |
|
1170 | 1188 | "source": [ |
1171 | 1189 | "import re\n", |
1172 | 1190 | "\n", |
| 1191 | + "\n", |
1173 | 1192 | "class Crawler:\n", |
1174 | 1193 | " def __init__(self, site):\n", |
1175 | 1194 | " self.site = site\n", |
1176 | 1195 | " self.visited = []\n", |
1177 | | - " \n", |
| 1196 | + "\n", |
1178 | 1197 | " def getPage(self, url):\n", |
1179 | 1198 | " try:\n", |
1180 | 1199 | " req = requests.get(url)\n", |
1181 | 1200 | " except requests.exceptions.RequestException:\n", |
1182 | | - " return None \n", |
| 1201 | + " return None\n", |
1183 | 1202 | " return BeautifulSoup(req.text, 'html.parser')\n", |
1184 | 1203 | "\n", |
1185 | 1204 | " def safeGet(self, pageObj, selector):\n", |
1186 | 1205 | " selectedElems = pageObj.select(selector)\n", |
1187 | 1206 | " if selectedElems is not None and len(selectedElems) > 0:\n", |
1188 | 1207 | " return '\\n'.join([elem.get_text() for elem in selectedElems])\n", |
1189 | 1208 | " return ''\n", |
1190 | | - " \n", |
| 1209 | + "\n", |
1191 | 1210 | " def parse(self, url):\n", |
1192 | 1211 | " bs = self.getPage(url)\n", |
1193 | 1212 | " if bs is not None:\n", |
|
1211 | 1230 | " targetPage = '{}{}'.format(self.site.url, targetPage)\n", |
1212 | 1231 | " self.parse(targetPage)\n", |
1213 | 1232 | "\n", |
1214 | | - "reuters = Website('Reuters', 'https://www.reuters.com', '^(/article/)', False, 'h1', 'div.StandardArticleBody_body_1gnLA')\n", |
| 1233 | + "\n", |
| 1234 | + "reuters = Website('Reuters', 'https://www.reuters.com', '^(/article/)',\n", |
| 1235 | + " False, 'h1', 'div.StandardArticleBody_body_1gnLA')\n", |
1215 | 1236 | "crawler = Crawler(reuters)\n", |
1216 | 1237 | "crawler.crawl()" |
1217 | 1238 | ] |
|
1248 | 1269 | "source": [ |
1249 | 1270 | "class Product(Website):\n", |
1250 | 1271 | " \"\"\"Contains information for scraping a product page\"\"\"\n", |
| 1272 | + "\n", |
1251 | 1273 | " def __init__(self, name, url, titleTag, productNumber, price):\n", |
1252 | 1274 | " Website.__init__(self, name, url, TitleTag)\n", |
1253 | 1275 | " self.productNumberTag = productNumberTag\n", |
1254 | 1276 | " self.priceTag = priceTag\n", |
1255 | 1277 | "\n", |
| 1278 | + "\n", |
1256 | 1279 | "class Article(Website):\n", |
1257 | 1280 | " \"\"\"Contains information for scraping an article page\"\"\"\n", |
| 1281 | + "\n", |
1258 | 1282 | " def __init__(self, name, url, titleTag, bodyTag, dateTag):\n", |
1259 | 1283 | " Website.__init__(self, name, url, titleTag)\n", |
1260 | 1284 | " self.bodyTag = bodyTag\n", |
|