1 file changed: +5 -3 lines changed
@@ -17,20 +17,22 @@ def main():
1717 seed_url = urljoin (base_url , 'explore' )
1818 # 创建Redis客户端
1919 client = Redis (host = '1.2.3.4' , port = 6379 , password = '1qaz2wsx' )
20- # 设置用户代理
20+ # 设置用户代理(否则访问会被拒绝)
2121 headers = {'user-agent' : 'Baiduspider' }
2222 # 通过requests模块发送GET请求并指定用户代理
2323 resp = requests .get (seed_url , headers = headers )
2424 # 创建BeautifulSoup对象并指定使用lxml作为解析器
2525 soup = BeautifulSoup (resp .text , 'lxml' )
2626 href_regex = re .compile (r'^/question' )
27+ # 将URL处理成SHA1摘要(长度固定更简短)
28+ hasher_proto = sha1 ()
2729 # 查找所有href属性以/question打头的a标签
2830 for a_tag in soup .find_all ('a' , {'href' : href_regex }):
2931 # 获取a标签的href属性值并组装完整的URL
3032 href = a_tag .attrs ['href' ]
3133 full_url = urljoin (base_url , href )
32- # 将URL处理成SHA1摘要(长度固定更简短)
33- hasher = sha1 ()
34+ # 传入URL生成SHA1摘要
35+ hasher = hasher_proto . copy ()
3436 hasher .update (full_url .encode ('utf-8' ))
3537 field_key = hasher .hexdigest ()
3638 # 如果Redis的键'zhihu'对应的hash数据类型中没有URL的摘要就访问页面并缓存
You can’t perform that action at this time.
0 commit comments