 from pymysql import Error
 
 
-def decode_page(page_bytes, charsets=('utf-8', )):
+# Decode the page with the given charsets (not every site sets its charset to utf-8)
+def decode_page(page_bytes, charsets=('utf-8',)):
     page_html = None
     for charset in charsets:
         try:
@@ -20,7 +21,8 @@ def decode_page(page_bytes, charsets=('utf-8', )):
     return page_html
 
 
-def get_page_html(seed_url, *, retry_times=3, charsets=('utf-8', )):
+# Fetch the page's HTML (retry the specified number of times via recursion)
+def get_page_html(seed_url, *, retry_times=3, charsets=('utf-8',)):
     page_html = None
     try:
         page_html = decode_page(urlopen(seed_url).read(), charsets)
@@ -32,32 +34,38 @@ def get_page_html(seed_url, *, retry_times=3, charsets=('utf-8', )):
     return page_html
 
 
+# Extract the needed parts from the page (usually links, specified by a regular expression)
 def get_matched_parts(page_html, pattern_str, pattern_ignore_case=re.I):
     pattern_regex = re.compile(pattern_str, pattern_ignore_case)
     return pattern_regex.findall(page_html) if page_html else []
 
 
-def start_crawl(seed_url, match_pattern):
+# Run the crawler and persist the extracted data
+def start_crawl(seed_url, match_pattern, *, max_depth=-1):
     conn = pymysql.connect(host='localhost', port=3306,
                            database='crawler', user='root',
                            password='123456', charset='utf8')
     try:
         with conn.cursor() as cursor:
             url_list = [seed_url]
+            visited_url_list = {seed_url: 0}
             while url_list:
                 current_url = url_list.pop(0)
-                page_html = get_page_html(current_url, charsets=('utf-8', 'gbk', 'gb2312'))
-                links_list = get_matched_parts(page_html, match_pattern)
-                url_list += links_list
-                param_list = []
-                for link in links_list:
-                    page_html = get_page_html(link, charsets=('utf-8', 'gbk', 'gb2312'))
-                    headings = get_matched_parts(page_html, r'<h1>(.*)<span')
-                    if headings:
-                        param_list.append((headings[0], link))
-                cursor.executemany('insert into tb_result values (default, %s, %s)',
-                                   param_list)
-                conn.commit()
+                depth = visited_url_list[current_url]
+                if depth != max_depth:
+                    page_html = get_page_html(current_url, charsets=('utf-8', 'gbk', 'gb2312'))
+                    links_list = get_matched_parts(page_html, match_pattern)
+                    param_list = []
+                    for link in links_list:
+                        if link not in visited_url_list:
+                            visited_url_list[link] = depth + 1
+                            page_html = get_page_html(link, charsets=('utf-8', 'gbk', 'gb2312'))
+                            headings = get_matched_parts(page_html, r'<h1>(.*)<span')
+                            if headings:
+                                param_list.append((headings[0], link))
+                    cursor.executemany('insert into tb_result values (default, %s, %s)',
+                                       param_list)
+                    conn.commit()
     except Error:
         pass
         # logging.error('SQL:', error)
@@ -67,8 +75,9 @@ def start_crawl(seed_url, match_pattern):
 
 def main():
     ssl._create_default_https_context = ssl._create_unverified_context
-    start_crawl('http://sports.sohu.com/nba_a.shtml',
-                r'<a[^>]+test=a\s[^>]*href=["\'](.*?)["\']')
+    start_crawl('http://sports.sohu.com/nba_a.shtml',
+                r'<a[^>]+test=a\s[^>]*href=["\'](.*?)["\']',
+                max_depth=2)
 
 
 if __name__ == '__main__':
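
Note: the executemany call above assumes that a tb_result table already exists in the crawler database. The insert statement only implies three columns with a defaulted first one, so the column names and sizes below are assumptions, not part of this commit. A minimal setup sketch using the same pymysql connection settings:

import pymysql


# Hypothetical schema: the id column is auto-incremented; title/url sizes are guesses.
def create_result_table():
    conn = pymysql.connect(host='localhost', port=3306,
                           database='crawler', user='root',
                           password='123456', charset='utf8')
    try:
        with conn.cursor() as cursor:
            cursor.execute("""
                create table if not exists tb_result
                (
                    id    int auto_increment primary key,
                    title varchar(255) not null,
                    url   varchar(1024) not null
                )
            """)
        conn.commit()
    finally:
        conn.close()


if __name__ == '__main__':
    create_result_table()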