|
22 | 22 | " \"\"\"\n", |
23 | 23 | "\n", |
24 | 24 | " session = requests.Session()\n", |
25 | | - " headers = {\"User-Agent\":\"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36\", \"Accept\":\"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8\"}\n", |
| 25 | + " headers = {\"User-Agent\": \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36\",\n", |
| 26 | + " \"Accept\": \"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8\"}\n", |
26 | 27 | " try:\n", |
27 | 28 | " req = session.get(url, headers=headers)\n", |
28 | 29 | " except requests.exceptions.RequestException:\n", |
|
183 | 184 | "source": [ |
184 | 185 | "import requests\n", |
185 | 186 | "\n", |
| 187 | + "\n", |
186 | 188 | "class Content:\n", |
187 | 189 | " def __init__(self, url, title, body):\n", |
188 | 190 | " self.url = url\n", |
189 | 191 | " self.title = title\n", |
190 | 192 | " self.body = body\n", |
191 | 193 | "\n", |
| 194 | + "\n", |
192 | 195 | "def getPage(url):\n", |
193 | 196 | " req = requests.get(url)\n", |
194 | 197 | " return BeautifulSoup(req.text, 'html.parser')\n", |
195 | 198 | "\n", |
| 199 | + "\n", |
196 | 200 | "def scrapeNYTimes(url):\n", |
197 | 201 | " bs = getPage(url)\n", |
198 | 202 | " title = bs.find(\"h1\").text\n", |
199 | | - " lines = bs.find_all(\"p\", {\"class\":\"story-content\"})\n", |
| 203 | + " lines = bs.find_all(\"p\", {\"class\": \"story-content\"})\n", |
200 | 204 | " body = '\\n'.join([line.text for line in lines])\n", |
201 | 205 | " return Content(url, title, body)\n", |
202 | 206 | "\n", |
| 207 | + "\n", |
203 | 208 | "def scrapeBrookings(url):\n", |
204 | 209 | " bs = getPage(url)\n", |
205 | 210 | " title = bs.find(\"h1\").text\n", |
206 | | - " body = bs.find(\"div\",{\"class\",\"post-body\"}).text\n", |
| 211 | + " body = bs.find(\"div\", {\"class\", \"post-body\"}).text\n", |
207 | 212 | " return Content(url, title, body)\n", |
208 | 213 | "\n", |
| 214 | + "\n", |
209 | 215 | "url = 'https://www.brookings.edu/blog/future-development/2018/01/26/delivering-inclusive-urban-access-3-uncomfortable-truths/'\n", |
210 | 216 | "content = scrapeBrookings(url)\n", |
211 | 217 | "print('Title: {}'.format(content.title))\n", |
212 | 218 | "print('URL: {}\\n'.format(content.url))\n", |
213 | 219 | "print(content.body)\n", |
214 | 220 | "\n", |
215 | | - "url = \"https://www.nytimes.com/2018/01/25/opinion/sunday/silicon-valley-immortality.html\"\n", |
| 221 | + "url = 'https://www.nytimes.com/2018/01/25/opinion/sunday/silicon-valley-immortality.html'\n", |
216 | 222 | "content = scrapeNYTimes(url)\n", |
217 | 223 | "print('Title: {}'.format(content.title))\n", |
218 | 224 | "print('URL: {}\\n'.format(content.url))\n", |
|
229 | 235 | " \"\"\"\n", |
230 | 236 | " Common base class for all articles/pages\n", |
231 | 237 | " \"\"\"\n", |
232 | | - " \n", |
| 238 | + "\n", |
233 | 239 | " def __init__(self, url, title, body):\n", |
234 | 240 | " self.url = url\n", |
235 | 241 | " self.title = title\n", |
|
242 | 248 | " print(\"URL: {}\".format(self.url))\n", |
243 | 249 | " print(\"TITLE: {}\".format(self.title))\n", |
244 | 250 | " print(\"BODY:\\n{}\".format(self.body))\n", |
245 | | - " \n", |
| 251 | + "\n", |
246 | 252 | "\n", |
247 | 253 | "class Website:\n", |
248 | 254 | " \"\"\" \n", |
|
265 | 271 | "import requests\n", |
266 | 272 | "from bs4 import BeautifulSoup\n", |
267 | 273 | "\n", |
| 274 | + "\n", |
268 | 275 | "class Crawler:\n", |
269 | 276 | "\n", |
270 | 277 | " def getPage(self, url):\n", |
271 | 278 | " try:\n", |
272 | 279 | " req = requests.get(url)\n", |
273 | 280 | " except requests.exceptions.RequestException:\n", |
274 | | - " return None \n", |
| 281 | + " return None\n", |
275 | 282 | " return BeautifulSoup(req.text, 'html.parser')\n", |
276 | 283 | "\n", |
277 | 284 | " def safeGet(self, pageObj, selector):\n", |
|
800 | 807 | " websites.append(Website(row[0], row[1], row[2], row[3]))\n", |
801 | 808 | "\n", |
802 | 809 | "crawler.parse(websites[0], 'http://shop.oreilly.com/product/0636920028154.do')\n", |
803 | | - "crawler.parse(websites[1], 'http://www.reuters.com/article/us-usa-epa-pruitt-idUSKBN19W2D0')\n", |
804 | | - "crawler.parse(websites[2], 'https://www.brookings.edu/blog/techtank/2016/03/01/idea-to-retire-old-methods-of-policy-education/')\n", |
805 | | - "crawler.parse(websites[3], 'https://www.nytimes.com/2018/01/28/business/energy-environment/oil-boom.html')" |
| 810 | + "crawler.parse(\n", |
| 811 | + " websites[1], 'http://www.reuters.com/article/us-usa-epa-pruitt-idUSKBN19W2D0')\n", |
| 812 | + "crawler.parse(\n", |
| 813 | + " websites[2],\n", |
| 814 | + " 'https://www.brookings.edu/blog/techtank/2016/03/01/idea-to-retire-old-methods-of-policy-education/')\n", |
| 815 | + "crawler.parse(\n", |
| 816 | + " websites[3], \n", |
| 817 | + " 'https://www.nytimes.com/2018/01/28/business/energy-environment/oil-boom.html')" |
806 | 818 | ] |
807 | 819 | }, |
808 | 820 | { |
|
852 | 864 | " self.searchUrl = searchUrl\n", |
853 | 865 | " self.resultListing = resultListing\n", |
854 | 866 | " self.resultUrl = resultUrl\n", |
855 | | - " self.absoluteUrl=absoluteUrl\n", |
| 867 | + " self.absoluteUrl = absoluteUrl\n", |
856 | 868 | " self.titleTag = titleTag\n", |
857 | 869 | " self.bodyTag = bodyTag" |
858 | 870 | ] |
|
976 | 988 | "import requests\n", |
977 | 989 | "from bs4 import BeautifulSoup\n", |
978 | 990 | "\n", |
| 991 | + "\n", |
979 | 992 | "class Crawler:\n", |
980 | 993 | "\n", |
981 | 994 | " def getPage(self, url):\n", |
982 | 995 | " try:\n", |
983 | 996 | " req = requests.get(url)\n", |
984 | 997 | " except requests.exceptions.RequestException:\n", |
985 | | - " return None \n", |
| 998 | + " return None\n", |
986 | 999 | " return BeautifulSoup(req.text, 'html.parser')\n", |
987 | 1000 | "\n", |
988 | 1001 | " def safeGet(self, pageObj, selector):\n", |
|
995 | 1008 | " \"\"\"\n", |
996 | 1009 | " Searches a given website for a given topic and records all pages found\n", |
997 | 1010 | " \"\"\"\n", |
998 | | - " bs = self.getPage(site.searchUrl+topic)\n", |
| 1011 | + " bs = self.getPage(site.searchUrl + topic)\n", |
999 | 1012 | " searchResults = bs.select(site.resultListing)\n", |
1000 | 1013 | " for result in searchResults:\n", |
1001 | 1014 | " url = result.select(site.resultUrl)[0].attrs[\"href\"]\n", |
1002 | | - " #Check to see whether it's a relative or an absolute URL\n", |
| 1015 | + " # Check to see whether it's a relative or an absolute URL\n", |
1003 | 1016 | " if(site.absoluteUrl):\n", |
1004 | 1017 | " bs = self.getPage(url)\n", |
1005 | 1018 | " else:\n", |
1006 | | - " bs = self.getPage(site.url+url)\n", |
| 1019 | + " bs = self.getPage(site.url + url)\n", |
1007 | 1020 | " if bs is None:\n", |
1008 | 1021 | " print(\"Something was wrong with that page or URL. Skipping!\")\n", |
1009 | 1022 | " return\n", |
|
1017 | 1030 | "crawler = Crawler()\n", |
1018 | 1031 | "\n", |
1019 | 1032 | "siteData = [\n", |
1020 | | - " ['O\\'Reilly Media', 'http://oreilly.com', 'https://ssearch.oreilly.com/?q=', 'article.product-result','p.title a', True, 'h1', 'section#product-description'],\n", |
1021 | | - " ['Reuters', 'http://reuters.com', 'http://www.reuters.com/search/news?blob=', 'div.search-result-content', 'h3.search-result-title a', False, 'h1', 'div.StandardArticleBody_body_1gnLA'],\n", |
1022 | | - " ['Brookings', 'http://www.brookings.edu', 'https://www.brookings.edu/search/?s=', 'div.list-content article', 'h4.title a', True, 'h1', 'div.post-body']\n", |
1023 | | - " ]\n", |
| 1033 | + " ['O\\'Reilly Media', 'http://oreilly.com', 'https://ssearch.oreilly.com/?q=',\n", |
| 1034 | + " 'article.product-result', 'p.title a', True, 'h1', 'section#product-description'],\n", |
| 1035 | + " ['Reuters', 'http://reuters.com', 'http://www.reuters.com/search/news?blob=', 'div.search-result-content',\n", |
| 1036 | + " 'h3.search-result-title a', False, 'h1', 'div.StandardArticleBody_body_1gnLA'],\n", |
| 1037 | + " ['Brookings', 'http://www.brookings.edu', 'https://www.brookings.edu/search/?s=',\n", |
| 1038 | + " 'div.list-content article', 'h4.title a', True, 'h1', 'div.post-body']\n", |
| 1039 | + "]\n", |
1024 | 1040 | "sites = []\n", |
1025 | 1041 | "for row in siteData:\n", |
1026 | | - " sites.append(Website(row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7]))\n", |
| 1042 | + " sites.append(Website(row[0], row[1], row[2],\n", |
| 1043 | + " row[3], row[4], row[5], row[6], row[7]))\n", |
1027 | 1044 | "\n", |
1028 | | - "topics = [\"python\",\"data science\"]\n", |
| 1045 | + "topics = [\"python\", \"data science\"]\n", |
1029 | 1046 | "for topic in topics:\n", |
1030 | | - " print(\"GETTING INFO ABOUT: \"+topic)\n", |
| 1047 | + " print(\"GETTING INFO ABOUT: \" + topic)\n", |
1031 | 1048 | " for targetSite in sites:\n", |
1032 | 1049 | " crawler.search(topic, targetSite)" |
1033 | 1050 | ] |
|
1046 | 1063 | "outputs": [], |
1047 | 1064 | "source": [ |
1048 | 1065 | "class Website:\n", |
1049 | | - " \n", |
| 1066 | + "\n", |
1050 | 1067 | " def __init__(self, name, url, targetPattern, absoluteUrl, titleTag, bodyTag):\n", |
1051 | 1068 | " self.name = name\n", |
1052 | 1069 | " self.url = url\n", |
1053 | 1070 | " self.targetPattern = targetPattern\n", |
1054 | | - " self.absoluteUrl=absoluteUrl\n", |
| 1071 | + " self.absoluteUrl = absoluteUrl\n", |
1055 | 1072 | " self.titleTag = titleTag\n", |
1056 | 1073 | " self.bodyTag = bodyTag\n", |
1057 | | - " \n", |
| 1074 | + "\n", |
| 1075 | + "\n", |
1058 | 1076 | "class Content:\n", |
1059 | 1077 | "\n", |
1060 | 1078 | " def __init__(self, url, title, body):\n", |
|
1170 | 1188 | "source": [ |
1171 | 1189 | "import re\n", |
1172 | 1190 | "\n", |
| 1191 | + "\n", |
1173 | 1192 | "class Crawler:\n", |
1174 | 1193 | " def __init__(self, site):\n", |
1175 | 1194 | " self.site = site\n", |
1176 | 1195 | " self.visited = []\n", |
1177 | | - " \n", |
| 1196 | + "\n", |
1178 | 1197 | " def getPage(self, url):\n", |
1179 | 1198 | " try:\n", |
1180 | 1199 | " req = requests.get(url)\n", |
1181 | 1200 | " except requests.exceptions.RequestException:\n", |
1182 | | - " return None \n", |
| 1201 | + " return None\n", |
1183 | 1202 | " return BeautifulSoup(req.text, 'html.parser')\n", |
1184 | 1203 | "\n", |
1185 | 1204 | " def safeGet(self, pageObj, selector):\n", |
1186 | 1205 | " selectedElems = pageObj.select(selector)\n", |
1187 | 1206 | " if selectedElems is not None and len(selectedElems) > 0:\n", |
1188 | 1207 | " return '\\n'.join([elem.get_text() for elem in selectedElems])\n", |
1189 | 1208 | " return ''\n", |
1190 | | - " \n", |
| 1209 | + "\n", |
1191 | 1210 | " def parse(self, url):\n", |
1192 | 1211 | " bs = self.getPage(url)\n", |
1193 | 1212 | " if bs is not None:\n", |
|
1211 | 1230 | " targetPage = '{}{}'.format(self.site.url, targetPage)\n", |
1212 | 1231 | " self.parse(targetPage)\n", |
1213 | 1232 | "\n", |
1214 | | - "reuters = Website('Reuters', 'https://www.reuters.com', '^(/article/)', False, 'h1', 'div.StandardArticleBody_body_1gnLA')\n", |
| 1233 | + "\n", |
| 1234 | + "reuters = Website('Reuters', 'https://www.reuters.com', '^(/article/)',\n", |
| 1235 | + " False, 'h1', 'div.StandardArticleBody_body_1gnLA')\n", |
1215 | 1236 | "crawler = Crawler(reuters)\n", |
1216 | 1237 | "crawler.crawl()" |
1217 | 1238 | ] |
|
1248 | 1269 | "source": [ |
1249 | 1270 | "class Product(Website):\n", |
1250 | 1271 | " \"\"\"Contains information for scraping a product page\"\"\"\n", |
| 1272 | + "\n", |
1251 | 1273 | " def __init__(self, name, url, titleTag, productNumber, price):\n", |
1252 | 1274 | " Website.__init__(self, name, url, TitleTag)\n", |
1253 | 1275 | " self.productNumberTag = productNumberTag\n", |
1254 | 1276 | " self.priceTag = priceTag\n", |
1255 | 1277 | "\n", |
| 1278 | + "\n", |
1256 | 1279 | "class Article(Website):\n", |
1257 | 1280 | " \"\"\"Contains information for scraping an article page\"\"\"\n", |
| 1281 | + "\n", |
1258 | 1282 | " def __init__(self, name, url, titleTag, bodyTag, dateTag):\n", |
1259 | 1283 | " Website.__init__(self, name, url, titleTag)\n", |
1260 | 1284 | " self.bodyTag = bodyTag\n", |
|