11from bs4 import BeautifulSoup
22import re
33import pymysql
4+ from urllib .request import urlopen
45
# Open a connection to the local MySQL server over TCP and select the
# `wikipedia` database that holds the pages/links tables.
# NOTE(review): credentials ('root'/'root') are hard-coded — fine for a
# book demo, but move them to config/environment for any real use.
conn = pymysql.connect(host='127.0.0.1',
                       port=3306,
                       user='root',
                       passwd='root',
                       db='mysql',
                       charset='utf8')
cur = conn.cursor()
cur.execute("USE wikipedia")
89
def getLinks(pageUrl, recursionLevel):
    """Crawl the Wikipedia article at *pageUrl*, record it and every internal
    article link it contains in the database, and recurse into links not yet
    seen in the module-level `pages` set.

    pageUrl        -- article path beginning with "/wiki/"
    recursionLevel -- current crawl depth, incremented on each recursive call
    """
    # NOTE(review): reconstructed from a diff hunk; the lines between the
    # function header and `pageId = ...` (presumably a depth guard that
    # checks recursionLevel) are not visible here — confirm against the
    # full file before relying on termination behavior.
    pageId = insertPageIfNotExists(pageUrl)
    html = urlopen("http://en.wikipedia.org" + pageUrl)
    bsObj = BeautifulSoup(html)
    # Internal article links only: href starts with /wiki/ and contains no
    # colon (colons mark namespace pages such as Talk:, File:, Category:).
    for link in bsObj.findAll("a", href=re.compile("^(/wiki/)((?!:).)*$")):
        insertLink(pageId, insertPageIfNotExists(link.attrs['href']))
        if link.attrs['href'] not in pages:
            # We have encountered a new page; add it and search it for links
            newPage = link.attrs['href']
            print(newPage)
            pages.add(newPage)
            getLinks(newPage, recursionLevel + 1)
getLinks("/wiki/Kevin_Bacon", 0)
# 0 commit comments (GitHub page residue — not part of the script)