'''
CLASS: Web Scraping with Beautiful Soup

What is web scraping?
- Extracting information from websites (simulates a human copying and pasting)
- Based on finding patterns in website code (usually HTML)

What are best practices for web scraping?
- Scraping too many pages too fast can get your IP address blocked
- Pay attention to the robots exclusion standard (robots.txt)
- Let's look at http://www.imdb.com/robots.txt

What is HTML?
- Code interpreted by a web browser to produce ("render") a web page
- Let's look at example.html
- Tags are opened and closed
- Tags have optional attributes

How to view HTML code:
- To view the entire page: "View Source" or "View Page Source" or "Show Page Source"
- To view a specific part: "Inspect Element"
- Safari users: Safari menu, Preferences, Advanced, Show Develop menu in menu bar
- Let's inspect example.html
'''
| 25 | + |
# Read the HTML code for a web page and save it as a single string.
# Plain 'r' is used instead of 'rU': the 'U' flag was deprecated in
# Python 3 and removed in 3.11, where open() rejects it with a ValueError.
with open('example.html', 'r') as f:
    html = f.read()

# Convert the HTML into a structured Soup object.
# The parser is named explicitly so the parse tree does not depend on which
# optional parsers happen to be installed, and so Beautiful Soup does not
# warn that no parser was specified.
from bs4 import BeautifulSoup
b = BeautifulSoup(html, 'html.parser')

# Print out the object: first as-is, then indented for readability.
# print(...) with a single argument works on both Python 2 and Python 3.
print(b)
print(b.prettify())
| 37 | + |
# NOTE: the bare expressions in this section only display a value when they
# are run line by line in an interactive session; run as a script they are
# evaluated and discarded.

# 'find' method returns the first matching Tag (and everything inside of it)
b.find(name='body')
b.find(name='h1')

# Tags allow you to access the 'inside text'
b.find(name='h1').text

# Tags also allow you to access their attributes (dictionary-style indexing)
b.find(name='h1')['id']

# 'find_all' method is useful for finding all matching Tags
b.find(name='p')        # returns a Tag
b.find_all(name='p')    # returns a ResultSet (like a list of Tags)

# ResultSets can be indexed and sliced like lists
len(b.find_all(name='p'))
b.find_all(name='p')[0]
b.find_all(name='p')[0].text
b.find_all(name='p')[0]['id']
| 57 | + |
# Iterate over a ResultSet, printing the inside text of every 'p' Tag.
# print(...) with a single argument works on both Python 2 and Python 3.
results = b.find_all(name='p')
for tag in results:
    print(tag.text)
| 62 | + |
# limit search by Tag attribute: 'attrs' maps attribute names to required values
b.find(name='p', attrs={'id':'scraping'})
b.find_all(name='p', attrs={'class':'topic'})

# limit search to specific sections: calling find_all on a Tag only searches
# inside that Tag, so the second line returns just the 'li' Tags of one list
b.find_all(name='li')
b.find(name='ul', attrs={'id':'scraping'}).find_all(name='li')
| 70 | + |
'''
EXERCISE ONE
'''

# find the 'h2' tag and then print its text

# find the 'p' tag with an 'id' value of 'feedback' and then print its text

# find the first 'p' tag and then print the value of the 'id' attribute

# print the text of all four resources

# print the text of only the API resources
| 84 | + |
'''
Scraping the IMDb website
'''

# get the HTML from the Shawshank Redemption page

# convert HTML into Soup

# run this code if you have encoding errors

# get the title

# get the star rating

'''
EXERCISE TWO
'''

# get the description

# get the content rating

# get the duration in minutes (as an integer)
| 108 | + |
'''
OPTIONAL WEB SCRAPING HOMEWORK

First, define a function that accepts an IMDb ID and returns a dictionary of
movie information: title, star_rating, description, content_rating, duration.
The function should gather this information by scraping the IMDb website, not
by calling the OMDb API. (This is really just a wrapper of the web scraping
code we wrote above.)

For example, get_movie_info('tt0111161') should return:

{'content_rating': 'R',
 'description': u'Two imprisoned men bond over a number of years...',
 'duration': 142,
 'star_rating': 9.3,
 'title': u'The Shawshank Redemption'}

Then, open the file imdb_ids.txt using Python, and write a for loop that builds
a list in which each element is a dictionary of movie information.

Finally, convert that list into a DataFrame.
'''
| 131 | + |
| 132 | + |
| 133 | + |
'''
Another IMDb example: Getting the genres
'''

# read the Shawshank Redemption page again
# (requests is imported here because no import of it appears earlier in this
# file, so this section would otherwise fail with a NameError)
import requests
r = requests.get('http://www.imdb.com/title/tt0111161/')

# the parser is named explicitly so the result does not depend on which
# optional parsers are installed
b = BeautifulSoup(r.text, 'html.parser')

# only gets the first genre
b.find(name='span', attrs={'class':'itemprop', 'itemprop':'genre'})

# gets all of the genres
b.find_all(name='span', attrs={'class':'itemprop', 'itemprop':'genre'})

# builds a list of the genre names (assign it to a variable to keep it)
[tag.text for tag in b.find_all(name='span', attrs={'class':'itemprop', 'itemprop':'genre'})]
| 150 | + |
'''
Another IMDb example: Getting the writers
'''

# attempt to get the list of writers (too many results)
b.find_all(name='span', attrs={'itemprop':'name'})

# limit search to a smaller section to only get the writers:
# first find the 'div' marked as 'creator', then search only inside it
b.find(name='div', attrs={'itemprop':'creator'}).find_all(name='span', attrs={'itemprop':'name'})
| 160 | + |
'''
Another IMDb example: Getting the URLs of cast images
'''

# find the images by size
results = b.find_all(name='img', attrs={'height':'44', 'width':'32'})

# check that the number of results matches the number of cast images on the page
len(results)

# iterate over the results to get all URLs
# (the URL is read from each Tag's 'loadlate' attribute; print(...) with a
# single argument works on both Python 2 and Python 3)
for tag in results:
    print(tag['loadlate'])
| 174 | + |
'''
Useful to know: Alternative Beautiful Soup syntax
'''

# read the example web page again
# ('r' instead of 'rU': the 'U' flag was removed from open() in Python 3.11)
with open('example.html', 'r') as f:
    html = f.read()

# convert to Soup (parser named explicitly so results do not depend on which
# optional parsers are installed)
b = BeautifulSoup(html, 'html.parser')

# these are equivalent
b.find(name='p')    # normal way
b.find('p')         # 'name' is the first argument
b.p                 # can also be accessed as an attribute of the object

# these are equivalent
b.find(name='p', attrs={'id':'scraping'})   # normal way
b.find('p', {'id':'scraping'})              # 'name' and 'attrs' are the first two arguments
b.find('p', id='scraping')                  # can write the attributes as arguments

# these are equivalent
b.find(name='p', attrs={'class':'topic'})   # normal way
b.find('p', class_='topic')                 # 'class' is special, so it needs a trailing underscore
b.find('p', 'topic')                        # if you don't name it, it's assumed to be the class

# these are equivalent
b.find_all(name='p')    # normal way
b.findAll(name='p')     # old function name from Beautiful Soup 3
b('p')                  # if you don't name the method, it's assumed to be find_all
0 commit comments