Commit d058083

add - Example 2: Tieba spider
1 parent caa19ca commit d058083

File tree

3 files changed: +199 -0 lines changed


code_demo/Tieba.py

Lines changed: 83 additions & 0 deletions
@@ -0,0 +1,83 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import requests


class TiebaSpider:

    def __init__(self, kw, max_pn):
        self.max_pn = max_pn
        self.kw = kw
        self.base_url = "https://tieba.baidu.com/f?kw={}&ie=utf-8&pn={}"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
        }

    def get_url_list(self):
        '''
        Build the list of page URLs.
        :return: list of URLs, one per page
        '''
        # Variant 1
        '''
        url_list = []

        for pn in range(0, self.max_pn, 50):
            url = self.base_url.format(self.kw, pn)
            url_list.append(url)

        return url_list
        '''
        # Variant 2: the same thing as a list comprehension
        return [self.base_url.format(self.kw, pn) for pn in range(0, self.max_pn, 50)]

    def get_content(self, url):
        '''
        Send a request and return the response body.
        :param url: page URL to fetch
        :return: raw response bytes
        '''
        response = requests.get(
            url=url,
            headers=self.headers
        )

        return response.content

    def get_items(self, content, index):
        '''
        Extract data from the response content.
        For now the raw HTML is simply written to disk.
        :param content: raw response bytes
        :param index: 1-based page number used in the file name
        :return: None
        '''
        with open('tieba-{}.html'.format(index), 'wb') as f:
            f.write(content)
        return None

    def save_items(self, items):
        '''
        Persist the extracted items (not implemented yet).
        :param items: extracted data
        :return: None
        '''
        pass

    def run(self):
        # 1. Build the URL list
        url_list = self.get_url_list()

        # enumerate avoids the O(n) url_list.index(url) lookup on every iteration
        for index, url in enumerate(url_list, start=1):
            # 2. Send the request and get the response
            content = self.get_content(url)
            # 3. Extract data from the response
            items = self.get_items(content, index)
            # 4. Save the data
            self.save_items(items)


if __name__ == '__main__':
    spider = TiebaSpider("英雄联盟", 150)
    spider.run()
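
`get_content` as committed has no timeout and no status check, so a slow or failing page can hang the run or silently save an error page. A hardened variant might look like the sketch below; the 10-second timeout and the `raise_for_status()` call are editorial additions, not part of the original commit:

```python
import requests

def get_content(self, url):
    '''
    Fetch a page with a timeout and basic HTTP error handling.
    raise_for_status() turns 4xx/5xx responses into exceptions.
    '''
    response = requests.get(url, headers=self.headers, timeout=10)
    response.raise_for_status()
    return response.content
```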

images/百度贴吧分析.jpg

145 KB

百度贴吧爬虫.md

Lines changed: 116 additions & 0 deletions
@@ -0,0 +1,116 @@
# Baidu Tieba Spider

## Analysis

### Analysis flow chart

> When analyzing the ``url`` we usually start from the second page, since that is where the change in the ``url`` becomes apparent.

![](./images/百度贴吧分析.jpg)
### Analysis results

#### Summary

| Request aspect     | Finding                                                      |
|--------------------|--------------------------------------------------------------|
| Request method     | GET                                                          |
| Request parameters | pn increases by 50 per page; all other parameters stay fixed |
| Request headers    | Only a User-Agent needs to be added                          |
| Request URL        | https://tieba.baidu.com/f?kw=英雄联盟&ie=utf-8&pn=50          |
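
To make the pn pattern concrete, here is a quick sketch of how page numbers map to the pn parameter (the mapping is inferred from the step of 50 observed above):

```python
base_url = "https://tieba.baidu.com/f?kw={}&ie=utf-8&pn={}"

# Page n corresponds to pn = (n - 1) * 50.
for page in range(1, 4):
    pn = (page - 1) * 50
    print(page, base_url.format("英雄联盟", pn))
# 1 https://tieba.baidu.com/f?kw=英雄联盟&ie=utf-8&pn=0
# 2 https://tieba.baidu.com/f?kw=英雄联盟&ie=utf-8&pn=50
# 3 https://tieba.baidu.com/f?kw=英雄联盟&ie=utf-8&pn=100
```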
### Implementation flow

1. Build the spider as a class (object-oriented)
2. Run the four-step crawl loop (implemented in full below):
   1. Build the URL list
   2. Send requests and collect responses
   3. Extract data from the responses
   4. Save the data
## Implementation

```python
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import requests


class TiebaSpider:

    def __init__(self, kw, max_pn):
        self.max_pn = max_pn
        self.kw = kw
        self.base_url = "https://tieba.baidu.com/f?kw={}&ie=utf-8&pn={}"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
        }

    def get_url_list(self):
        '''
        Build the list of page URLs.
        :return: list of URLs, one per page
        '''
        # Variant 1
        '''
        url_list = []

        for pn in range(0, self.max_pn, 50):
            url = self.base_url.format(self.kw, pn)
            url_list.append(url)

        return url_list
        '''
        # Variant 2: the same thing as a list comprehension
        return [self.base_url.format(self.kw, pn) for pn in range(0, self.max_pn, 50)]

    def get_content(self, url):
        '''
        Send a request and return the response body.
        :param url: page URL to fetch
        :return: raw response bytes
        '''
        response = requests.get(
            url=url,
            headers=self.headers
        )

        return response.content

    def get_items(self, content, index):
        '''
        Extract data from the response content.
        For now the raw HTML is simply written to disk.
        :param content: raw response bytes
        :param index: 1-based page number used in the file name
        :return: None
        '''
        with open('tieba-{}.html'.format(index), 'wb') as f:
            f.write(content)
        return None

    def save_items(self, items):
        '''
        Persist the extracted items (not implemented yet).
        :param items: extracted data
        :return: None
        '''
        pass

    def run(self):
        # 1. Build the URL list
        url_list = self.get_url_list()

        # enumerate avoids the O(n) url_list.index(url) lookup on every iteration
        for index, url in enumerate(url_list, start=1):
            # 2. Send the request and get the response
            content = self.get_content(url)

            # 3. Extract data from the response
            items = self.get_items(content, index)

            # 4. Save the data
            self.save_items(items)


if __name__ == '__main__':
    spider = TiebaSpider("英雄联盟", 150)
    spider.run()
```
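
As written, `get_items` only dumps the raw HTML to disk and `save_items` is a stub. A minimal parsing sketch for the extraction step is shown below; it assumes `lxml` is installed and that thread titles live in `<a>` elements carrying the class `j_th_tit`, a selector that would need to be verified against the live page before use:

```python
from lxml import etree

def get_items(self, content, index):
    '''
    Parse thread titles out of one forum list page.
    NOTE: the j_th_tit class is an assumed selector, not verified here.
    '''
    html = etree.HTML(content)
    titles = html.xpath('//a[contains(@class, "j_th_tit")]/text()')
    return [{'page': index, 'title': t.strip()} for t in titles]
```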
