Skip to content

Commit 59f7887

Browse files
committed
Minor changes to scrape.py
1 parent 7d070f2 commit 59f7887

File tree

2 files changed

+14
-3
lines changed

2 files changed

+14
-3
lines changed

Day4/scrape.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,11 @@
99
import random
1010
import time
1111
import os
12+
import re
13+
14+
#from nltk.util import clean_html
15+
#import urllib2
16+
1217

1318
# Open a web page
1419
web_address='https://polisci.wustl.edu/faculty/specialization'
@@ -25,6 +30,7 @@
2530

2631
# Get the attributes
2732
my_a_tag=soup.find_all('a')[2]
33+
my_a_tag = re.sub(r'<[^>]+>', '', str(my_a_tag)) #remove tags
2834
my_a_tag.attrs #Gives a dictionary with the attributes
2935
my_a_tag.attrs.keys()
3036
my_a_tag['alt']
@@ -36,6 +42,8 @@
3642
mysection=soup.find_all('div')[0]
3743
mysection.a #Gives the 'a' tag within the 'div' tag
3844
mysection.find_all('a') #Gives the list of all 'a' tags within the 'div' tag
45+
mysection.get_text()
46+
3947

4048
# Creating a tree of objects
4149

Day4/scrape.py~

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,11 @@ import urllib2
99
import random
1010
import time
1111
import os
12+
import re
13+
14+
#from nltk.util import clean_html
15+
#import urllib2
16+
1217

1318
# Open a web page
1419
web_address='https://polisci.wustl.edu/faculty/specialization'
@@ -22,12 +27,10 @@ soup.prettify()
2227
soup.find_all('h3')
2328
soup.find_all('a')
2429

25-
# Get the script of a certain tag
26-
mytitle=soup.find_all('h3')[0]
27-
mytitle.script
2830

2931
# Get the attributes
3032
my_a_tag=soup.find_all('a')[2]
33+
my_a_tag = re.sub(r'<[^>]+>', '', str(my_a_tag)) #remove tags
3134
my_a_tag.attrs #Gives a dictionary with the attributes
3235
my_a_tag.attrs.keys()
3336
my_a_tag['alt']

0 commit comments

Comments
 (0)