Skip to content

Commit 6994023

Browse files
authored
Merge pull request #371 from songzy12/album_parser
Add the feature to crawl avatar album.
2 parents 7c6a8c4 + 0b669fa commit 6994023

11 files changed

+113
-4
lines changed
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
from unittest.mock import patch
2+
3+
from .util import mock_request_get_content
4+
from weibo_spider.parser.album_parser import AlbumParser
5+
6+
7+
@patch('requests.get', mock_request_get_content)
def test_album_parser():
    """Check the picture URLs AlbumParser extracts from the avatar album page.

    ``requests.get`` is replaced by ``mock_request_get_content`` for the
    duration of the test, so the parser works on whatever content that helper
    returns for the album URL.
    """
    expected_pic_urls = [
        'http://wx1.sinaimg.cn/wap180/76102133ly8ga961tpte6j20u00u0q65.jpg',
        'http://wx2.sinaimg.cn/wap180/76102133ly8fwr33wpn8fj20v90v9tbw.jpg',
        'http://wx4.sinaimg.cn/wap180/76102133ly8fvlyn5n52gj20v90v949a.jpg',
        'http://wx2.sinaimg.cn/wap180/76102133ly8fk0btnrn5zj20dp0e8q3t.jpg',
    ]
    parser = AlbumParser(
        album_url="https://weibo.cn/album/166564740000001980768563?rl=1",
        cookie="")

    actual_pic_urls = parser.extract_pic_urls()

    # Count first, then exact content and order.
    assert len(actual_pic_urls) == 4
    assert actual_pic_urls == expected_pic_urls
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
from unittest.mock import patch
2+
3+
from weibo_spider.parser.photo_parser import PhotoParser
4+
5+
from .util import mock_request_get_content
6+
7+
8+
@patch('requests.get', mock_request_get_content)
def test_photo_parser():
    """Check that PhotoParser resolves the avatar album URL of user 1980768563.

    ``requests.get`` is replaced by ``mock_request_get_content`` for the
    duration of the test.
    """
    expected_url = "https://weibo.cn/album/166564740000001980768563?rl=1"

    parser = PhotoParser(user_id=1980768563, cookie="")
    actual_url = parser.extract_avatar_album_url()

    assert actual_url == expected_url
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"><html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /><meta http-equiv="Cache-Control" content="no-cache"/><meta id="viewport" name="viewport" content="width=device-width,initial-scale=1.0,minimum-scale=1.0, maximum-scale=2.0" /><link rel="shortcut icon" type="image/x-icon" href="https://weibo.cn/favicon.ico"><link rel="icon" sizes="any" mask href="https://h5.sinaimg.cn/upload/2015/05/15/28/WeiboLogoCh.svg" color="black"><meta name="MobileOptimized" content="240"/><title>专辑:头像相册</title><style type="text/css" id="internalStyle">html,body,p,form,div,table,textarea,input,span,select{font-size:12px;word-wrap:break-word;}body{background:#F8F9F9;color:#000;padding:1px;margin:1px;}table,tr,td{border-width:0px;margin:0px;padding:0px;}form{margin:0px;padding:0px;border:0px;}textarea{border:1px solid #96c1e6}textarea{width:95%;}a,.tl{color:#2a5492;text-decoration:underline;}/*a:link {color:#023298}*/.k{color:#2a5492;text-decoration:underline;}.kt{color:#F00;}.ib{border:1px solid #C1C1C1;}.pm,.pmy{clear:both;background:#ffffff;color:#676566;border:1px solid #b1cee7;padding:3px;margin:2px 1px;overflow:hidden;}.pms{clear:both;background:#c8d9f3;color:#666666;padding:3px;margin:0 1px;overflow:hidden;}.pmst{margin-top: 5px;}.pmsl{clear:both;padding:3px;margin:0 1px;overflow:hidden;}.pmy{background:#DADADA;border:1px solid #F8F8F8;}.t{padding:0px;margin:0px;height:35px;}.b{background:#e3efff;text-align:center;color:#2a5492;clear:both;padding:4px;}.bl{color:#2a5492;}.n{clear:both;background:#436193;color:#FFF;padding:4px; margin: 1px;}.nt{color:#b9e7ff;}.nl{color:#FFF;text-decoration:none;}.nfw{clear:both;border:1px solid #BACDEB;padding:3px;margin:2px 1px;}.s{border-bottom:1px dotted #666666;margin:3px;clear:both;}.tip{clear:both; background:#c8d9f3;color:#676566;border:1px 
solid #BACDEB;padding:3px;margin:2px 1px;}.tip2{color:#000000;padding:2px 3px;clear:both;}.ps{clear:both;background:#FFF;color:#676566;border:1px solid #BACDEB;padding:3px;margin:2px 1px;}.tm{background:#feffe5;border:1px solid #e6de8d;padding:4px;}.tm a{color:#ba8300;}.tmn{color:#f00}.tk{color:#ffffff}.tc{color:#63676A;}.c{padding:2px 5px;}.c div a img{border:1px solid #C1C1C1;}.ct{color:#9d9d9d;font-style:italic;}.cmt{color:#9d9d9d;}.ctt{color:#000;}.cc{color:#2a5492;}.nk{color:#2a5492;}.por {border: 1px solid #CCCCCC;height:50px;width:50px;}.me{color:#000000;background:#FEDFDF;padding:2px 5px;}.pa{padding:2px 4px;}.nm{margin:10px 5px;padding:2px;}.hm{padding:5px;background:#FFF;color:#63676A;}.u{margin:2px 1px;background:#ffffff;border:1px solid #b1cee7;}.ut{padding:2px 3px;}.cd{text-align:center;}.r{color:#F00;}.g{color:#0F0;}.bn{background: transparent;border: 0 none;text-align: left;padding-left: 0;}</style><script>if(top != self){top.location = self.location;}</script></head><body><div class="tm"><a href="https://weibo.cn/msg/comment/receive?unread=1"><span class="tmn">1</span>评论</a>&nbsp;&nbsp;<a href="https://weibo.cn/msg/clearAllUnread?type=dcm&amp;rl=11"><img src="https://h5.sinaimg.cn/upload/2016/12/30/125/5366.gif" alt="[X]" /></a><br/></div><div class="c" style="padding: 6px 4px;"><a href="/?tf=5_009">首页!</a>|<a href="/msg/?tf=5_010">消息</a>|<a href="/album/166564740000001980768563?rl=1&amp;rand=5759&amp;p=r">刷新</a></div><div style="background-color:#77BBE0;"></div><div style="margin:0px;" class="n"><a href="/album/updates?st=81505d" class="nl">好友</a>|<a href="/album/square" class="nl">美图</a>|<a href="/album/likelist" class="nl">喜欢</a>|<a href="/album/albumlist" class="nl">我的</a></div><div class="tip"><a href="https://weibo.cn/shuangye2012" class="nk">霜叶</a><img src="https://h5.sinaimg.cn/upload/2016/05/26/319/5338.gif" alt="V" /><img src="https://h5.sinaimg.cn/upload/2016/05/26/319/donate_btn_s.png" alt="M"/>&gt;<a 
href="/album/albumlist?fuid=1980768563">他的相册</a>&gt;浏览</div><div class="c">专辑:头像相册</div><div class="c"><a href="/album/166564740000001980768563/photo/44534610716263050000001980768563/detail?page=1&amp;rl=11"><img src="http://wx1.sinaimg.cn/wap180/76102133ly8ga961tpte6j20u00u0q65.jpg" alt='' class="c"/></a><a href="/album/166564740000001980768563/photo/43010955171725440000001980768563/detail?page=2&amp;rl=11"><img src="http://wx2.sinaimg.cn/wap180/76102133ly8fwr33wpn8fj20v90v9tbw.jpg" alt='' class="c"/></a><a href="/album/166564740000001980768563/photo/42882049152386420000001980768563/detail?page=3&amp;rl=11"><img src="http://wx4.sinaimg.cn/wap180/76102133ly8fvlyn5n52gj20v90v949a.jpg" alt='' class="c"/></a><a href="/album/166564740000001980768563/photo/41572947986015870000001980768563/detail?page=4&amp;rl=11"><img src="http://wx2.sinaimg.cn/wap180/76102133ly8fk0btnrn5zj20dp0e8q3t.jpg" alt='' class="c"/></a></div><div class="c"><a href="/album/166564740000001980768563/rt?rl=11">转发</a>&nbsp;<a href="/album/166564740000001980768563/comment?rl=11">评论</a>&nbsp;</div><div class="pm">照片墙|<a href="/album/166564740000001980768563/?DisplayMode=2&amp;rl=1">传统列表</a></div><div class="cd"><a href="#top"><img src="https://h5.sinaimg.cn/upload/2017/04/27/319/5e990ec2.gif" alt="TOP"/></a></div><div class="pms"><a href="https://weibo.cn">首页<span class="tk">!</span></a>.<a href="https://weibo.cn/topic/240489">反馈</a>.<a href="https://weibo.cn/page/91">帮助</a>.<a href="https://c.weibo.cn" >客户端</a>.<a href="https://weibo.cn/spam/?rl=11&amp;type=3&amp;fuid=3113276555" class="kt">举报</a>.<a href="https://weibo.cn/logout">退出</a></div><div class="b"><a href="https://beian.miit.gov.cn" target="_blank">京ICP备12002058号-1</a> [08-16 00:54]</div></body></html>
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"><html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /><meta http-equiv="Cache-Control" content="no-cache"/><meta id="viewport" name="viewport" content="width=device-width,initial-scale=1.0,minimum-scale=1.0, maximum-scale=2.0" /><link rel="shortcut icon" type="image/x-icon" href="https://weibo.cn/favicon.ico"><link rel="icon" sizes="any" mask href="https://h5.sinaimg.cn/upload/2015/05/15/28/WeiboLogoCh.svg" color="black"><meta name="MobileOptimized" content="240"/><title>微博</title><style type="text/css" id="internalStyle">html,body,p,form,div,table,textarea,input,span,select{font-size:12px;word-wrap:break-word;}body{background:#F8F9F9;color:#000;padding:1px;margin:1px;}table,tr,td{border-width:0px;margin:0px;padding:0px;}form{margin:0px;padding:0px;border:0px;}textarea{border:1px solid #96c1e6}textarea{width:95%;}a,.tl{color:#2a5492;text-decoration:underline;}/*a:link {color:#023298}*/.k{color:#2a5492;text-decoration:underline;}.kt{color:#F00;}.ib{border:1px solid #C1C1C1;}.pm,.pmy{clear:both;background:#ffffff;color:#676566;border:1px solid #b1cee7;padding:3px;margin:2px 1px;overflow:hidden;}.pms{clear:both;background:#c8d9f3;color:#666666;padding:3px;margin:0 1px;overflow:hidden;}.pmst{margin-top: 5px;}.pmsl{clear:both;padding:3px;margin:0 1px;overflow:hidden;}.pmy{background:#DADADA;border:1px solid #F8F8F8;}.t{padding:0px;margin:0px;height:35px;}.b{background:#e3efff;text-align:center;color:#2a5492;clear:both;padding:4px;}.bl{color:#2a5492;}.n{clear:both;background:#436193;color:#FFF;padding:4px; margin: 1px;}.nt{color:#b9e7ff;}.nl{color:#FFF;text-decoration:none;}.nfw{clear:both;border:1px solid #BACDEB;padding:3px;margin:2px 1px;}.s{border-bottom:1px dotted #666666;margin:3px;clear:both;}.tip{clear:both; background:#c8d9f3;color:#676566;border:1px 
solid #BACDEB;padding:3px;margin:2px 1px;}.tip2{color:#000000;padding:2px 3px;clear:both;}.ps{clear:both;background:#FFF;color:#676566;border:1px solid #BACDEB;padding:3px;margin:2px 1px;}.tm{background:#feffe5;border:1px solid #e6de8d;padding:4px;}.tm a{color:#ba8300;}.tmn{color:#f00}.tk{color:#ffffff}.tc{color:#63676A;}.c{padding:2px 5px;}.c div a img{border:1px solid #C1C1C1;}.ct{color:#9d9d9d;font-style:italic;}.cmt{color:#9d9d9d;}.ctt{color:#000;}.cc{color:#2a5492;}.nk{color:#2a5492;}.por {border: 1px solid #CCCCCC;height:50px;width:50px;}.me{color:#000000;background:#FEDFDF;padding:2px 5px;}.pa{padding:2px 4px;}.nm{margin:10px 5px;padding:2px;}.hm{padding:5px;background:#FFF;color:#63676A;}.u{margin:2px 1px;background:#ffffff;border:1px solid #b1cee7;}.ut{padding:2px 3px;}.cd{text-align:center;}.r{color:#F00;}.g{color:#0F0;}.bn{background: transparent;border: 0 none;text-align: left;padding-left: 0;}</style><script>if(top != self){top.location = self.location;}</script></head><body><div class="n" style="padding: 6px 4px;"><a href="https://weibo.cn/?tf=5_009" class="nl">首页<span class="tk">!</span></a>|<a href="https://weibo.cn/msg/?tf=5_010" class="nl">消息</a>|<a href="/1980768563/photo?tf=6_008&amp;rand=6508&amp;p=r" class="nl">刷新</a></div><div class="c tip"><a href="https://m.weibo.cn" id="top" class="tl">手机微博触屏版,点击前往>></a></div><div class="c">霜叶的相册</div><div class="s"></div><div class="pmst"><span class="pmsl">&nbsp;<a href="/shuangye2012">微博</a>&nbsp;</span><span class="pms">&nbsp;相册&nbsp;</span></div><div class="pms" style="margin: 0;padding: 0;line-height: 3px;">&nbsp;</div><div class="s"></div><div class="c"><table><tr><td><a href="/album/albummblog?fuid=1980768563"><img width="80" height="80" src="//img.t.sinajs.cn/t5/style/images/staticlogo/groups1_3.png?version=74d5a0aee49e3f11" alt="微博配图"/></a></td><td><div class="c"><a href="/album/albummblog?fuid=1980768563">微博配图</a></div></td></tr></table></div><div class="s"></div><div class="c"><table><tr><td><a 
href="/album/34589831934400230000001980768563?rl=1"><img width="80" height="80" src="http://ss1.sinaimg.cn/wap180/&690" alt='默认专辑'/></a></td><td><div class="c"><a href="/album/34589831934400230000001980768563?rl=1">默认专辑(3张)</a></div></td></tr></table></div><div class="s"></div><div class="c"><table><tr><td><a href="/album/166564740000001980768563?rl=1"><img width="80" height="80" src="https://tvax1.sinaimg.cn/crop.0.0.1080.1080.180/76102133ly8ga961tpte6j20u00u0q65.jpg?KID=imgbed,tva&Expires=1629140012&ssig=fJbL8N5deV" alt='头像相册'/></a></td><td><div class="c"><a href="/album/166564740000001980768563?rl=1">头像相册(4张)</a></div></td></tr></table></div><div class="cd"><a href="#top"><img src="https://h5.sinaimg.cn/upload/2017/04/27/319/5e990ec2.gif" alt="TOP"/></a></div><div class="pms"><a href="https://weibo.cn">首页<span class="tk">!</span></a>.<a href="https://weibo.cn/topic/240489">反馈</a>.<a href="https://weibo.cn/page/91">帮助</a>.<a href="https://c.weibo.cn" >客户端</a>.<a href="https://weibo.cn/spam/?rl=1&amp;type=3&amp;fuid=1980768563" class="kt">举报</a>.<a href="https://weibo.cn/logout">退出</a></div><div class="c">设置:<a href="https://weibo.cn/account/customize/skin?tf=7_005&amp;st=81505d">皮肤</a>.<a href="https://weibo.cn/account/customize/pic?tf=7_006&amp;st=81505d">图片</a>.<a href="https://weibo.cn/account/customize/pagesize?tf=7_007&amp;st=81505d">条数</a>.<a href="https://weibo.cn/account/privacy/?tf=7_008&amp;st=81505d">隐私</a></div><div class="b"><a href="https://beian.miit.gov.cn" target="_blank">京ICP备12002058号-1</a> [08-16 23:53]</div></body></html>

tests/testdata/url_map.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,5 +7,7 @@
77
"https://weibo.cn/1669879400?page=2": "tests/testdata/2f62165fa3ca1e85e0d398d385c377a068b76eb95765f7020ffffd3e.html",
88
"https://weibo.cn/1669879400?page=3": "tests/testdata/d486235d4a17dd0accb0f2cc77b3648abfa03580b9e0cdb61f1e618f.html",
99
"https://weibo.cn/mblog/picAll/J3xfm61AZ?rl=1": "tests/testdata/76233b3f90394581aac6f19cfa5d674a610e8b442b1f83de7673ab49.html",
10-
"https://weibo.cn/comment/J5cVGuUNq": "tests/testdata/4d5ed0a3ebd0303cb45edd544dbc0ab5e86d43e103405f0c60515884.html"
10+
"https://weibo.cn/comment/J5cVGuUNq": "tests/testdata/4d5ed0a3ebd0303cb45edd544dbc0ab5e86d43e103405f0c60515884.html",
11+
"https://weibo.cn/1980768563/photo?tf=6_008": "tests/testdata/e4d541ecb02253c14abc1d52605fc00d91279df9ac4c1465c85b91b3.html",
12+
"https://weibo.cn/album/166564740000001980768563?rl=1": "tests/testdata/b541fd1751117498b6d6f40d3321686ddf871651237c4ac854a5c3eb.html"
1113
}
Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
from .origin_picture_downloader import OriginPictureDownloader
22
from .retweet_picture_downloader import RetweetPictureDownloader
3+
from .avatar_picture_downloader import AvatarPictureDownloader
34
from .video_downloader import VideoDownloader
45

5-
__all__ = [OriginPictureDownloader, RetweetPictureDownloader, VideoDownloader]
6+
# Names exported by a star-import of this package. The entries must be
# strings: putting the class objects themselves here makes
# ``from <package> import *`` fail with a TypeError.
__all__ = [
    'OriginPictureDownloader',
    'RetweetPictureDownloader',
    'AvatarPictureDownloader',
    'VideoDownloader',
]
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import os
2+
3+
from .img_downloader import ImgDownloader
4+
5+
6+
class AvatarPictureDownloader(ImgDownloader):
    """Image downloader that stores avatar pictures in their own sub-directory."""

    def __init__(self, file_dir, file_download_timeout):
        """Initialise the downloader.

        Args:
            file_dir: base directory, forwarded to ``ImgDownloader``.
            file_download_timeout: timeout setting, forwarded to
                ``ImgDownloader``.
        """
        super().__init__(file_dir, file_download_timeout)
        # Used by handle_download as the sub-directory name under file_dir.
        self.describe = u'头像图片'
        self.key = 'avatar_pictures'

    def handle_download(self, urls):
        """Handle the download-related operations for avatar pictures.

        Creates ``<file_dir>/<describe>`` if needed and hands every URL to
        ``download_one_file`` together with a target path named after the
        last path segment of the URL.

        Args:
            urls: iterable of picture URL strings.
        """
        file_dir = self.file_dir + os.sep + self.describe
        os.makedirs(file_dir, exist_ok=True)

        for url in urls:
            # Keep only the text after the last '/'. Unlike slicing from
            # rfind('/'), this drops the leading slash (no doubled path
            # separator) and, for a URL without any '/', keeps the whole
            # string instead of just its final character.
            file_name = url.rsplit('/', 1)[-1]
            file_path = file_dir + os.sep + file_name
            # NOTE(review): 'xxx' looks like a placeholder for the third
            # argument of download_one_file — confirm against ImgDownloader.
            self.download_one_file(url, file_path, 'xxx')

weibo_spider/parser/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
from .index_parser import IndexParser
22
from .page_parser import PageParser
3+
from .photo_parser import PhotoParser
4+
from .album_parser import AlbumParser
35

4-
__all__ = [IndexParser, PageParser]
6+
# Names exported by a star-import of this package. The entries must be
# strings: class objects here make ``from <package> import *`` raise TypeError.
__all__ = ['IndexParser', 'PageParser', 'PhotoParser', 'AlbumParser']
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
from .util import handle_html
2+
from .parser import Parser
3+
4+
5+
class AlbumParser(Parser):
    """Parser for a single album page."""

    def __init__(self, cookie, album_url):
        """Load the album page into ``self.selector``.

        Args:
            cookie: cookie value passed to ``handle_html``.
            album_url: URL of the album page, passed to ``handle_html``.
        """
        self.url = album_url
        self.cookie = cookie
        # NOTE(review): handle_html presumably fetches and parses the page;
        # the result is only used through its .xpath() method here.
        self.selector = handle_html(cookie, album_url)

    def extract_pic_urls(self):
        """Return the ``src`` of every ``<img class="c">`` on the page."""
        # Example of a matching element:
        # <img src="http://wx2.sinaimg.cn/wap180/76102133ly8fwr33wpn8fj20v90v9tbw.jpg" alt="" class="c">
        pic_urls = self.selector.xpath('//img[@class="c"]/@src')
        return pic_urls
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
from .util import handle_html
2+
from .parser import Parser
3+
4+
5+
class PhotoParser(Parser):
    """Parser for a user's photo page (``https://weibo.cn/<user_id>/photo``)."""

    def __init__(self, cookie, user_id):
        """Build the photo page URL for ``user_id`` and load it.

        Args:
            cookie: cookie value passed to ``handle_html``.
            user_id: user identifier; converted with ``str()`` when building
                the URL.
        """
        self.cookie = cookie
        self.url = "https://weibo.cn/" + str(user_id) + "/photo?tf=6_008"
        # NOTE(review): handle_html presumably fetches and parses the page;
        # the result is only used through its .xpath() method here.
        self.selector = handle_html(self.cookie, self.url)

    def extract_avatar_album_url(self):
        """Return the absolute URL of the avatar album.

        Looks for the ``<img>`` whose ``alt`` is 头像相册 ("avatar album") and
        takes the ``href`` of its parent element, e.g.
        ``<a href="/album/166564740000001980768563?rl=1"><img ... alt="头像相册"></a>``.

        Returns:
            ``"https://weibo.cn"`` followed by the first matching href.

        Raises:
            IndexError: if the page contains no such element.
        """
        hrefs = self.selector.xpath('//img[@alt="头像相册"]/../@href')
        if not hrefs:
            # Same exception type as the bare [0] lookup would raise, but
            # with a message that says what was missing and on which page.
            raise IndexError(
                "no avatar album link (img alt=头像相册) found on " + self.url)
        return "https://weibo.cn" + hrefs[0]

0 commit comments

Comments
 (0)