爬虫实战(1)——批量爬取明星图片

爬取网站

美桌
该网站没有提供 robots 协议(robots.txt),因此没有对爬取范围作出明确限制;但爬取时仍应控制请求频率,并注意图片的版权。

技术路线

requests+re

除此之外,还用到了 pypinyin 库,用于将明星的名字转换为拼音。

代码

"""
Created on Mon Oct 16 20:32:27 2017

@Author: Jingwang Li
@Email: JingwangLi@foxmail.com
"""
import requests
import re
import os
from pypinyin import pinyin, lazy_pinyin
def getHTMLText(url):
    """Fetch *url* and return the decoded response body.

    Args:
        url: Address of the page to download.

    Returns:
        The response text, decoded with the encoding that requests detects
        from the body (``apparent_encoding``), or ``None`` when the request
        fails or the server answers with an error status.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.RequestException as err:
        # Catch only request failures (a bare ``except`` would also swallow
        # KeyboardInterrupt) and say what went wrong instead of printing an
        # empty line. Callers detect the failure through the None result.
        print("请求失败: {} ({})".format(url, err))
        return None
    # Decode with the encoding guessed from the content rather than the one
    # taken from the response headers.
    r.encoding = r.apparent_encoding
    return r.text

def getPageUrls(text, name):
    """Find the links on a listing page whose image ``alt`` text starts with *name*.

    Args:
        text: HTML of the listing page.
        name: Text the ``alt`` attribute must begin with; matched literally.

    Returns:
        A list of ``(href, img_src)`` tuples, one per match, in the order
        they appear in *text*. Empty when nothing matches.
    """
    # Escape the name so that characters such as "+", "." or "(" typed by the
    # user are matched literally instead of being read as regex syntax.
    re_pageUrl = r'href="(.+)">\s*<img src="(.+)" alt="' + re.escape(name)
    return re.findall(re_pageUrl, text)

def downPictures(text, root, name):
    """Download the albums linked from one listing page into folders under *root*.

    For every album whose ``alt`` text starts with *name*, a folder named
    after the rest of the ``alt`` text is created under *root*. Albums whose
    folder already contains files are skipped. Otherwise the album's pages
    are walked through the "下一张" link and each picture is saved as
    ``1.jpg``, ``2.jpg``, ...

    Args:
        text: HTML of the listing page.
        root: Directory prefix (ending in a separator) for the album folders.
        name: Name that the album ``alt`` texts start with; matched literally.

    Returns:
        None.
    """
    pageUrls = getPageUrls(text, name)
    # Escape the name so regex metacharacters in it are matched literally.
    titles = re.findall(r'alt="' + re.escape(name) + r'(.+)" ', text)
    # zip() stops at the shorter list, so a mismatch between the two regex
    # results can no longer raise IndexError.
    for (pageUrl, _thumbUrl), title in zip(pageUrls, titles):
        path = root + title + "//"
        if not os.path.exists(path):
            os.mkdir(path)
        # A folder that already holds files is treated as downloaded.
        if os.listdir(path):
            continue
        pageText = getHTMLText(pageUrl)
        if pageText is None:
            continue
        # The picture count is the number in "<em>N</em>)". The closing
        # parenthesis must be escaped: left bare it makes the pattern invalid
        # ("unbalanced parenthesis") and re.findall raises re.error.
        totals = re.findall(r'<em>(\d+)</em>\)', pageText)
        if not totals:
            continue
        totalPics = int(totals[0])
        for cnt in range(1, totalPics + 1):
            downUrls = re.findall(r'href="(.+?)" class="">下载图片', pageText)
            if not downUrls:
                break
            picPath = path + str(cnt) + ".jpg"
            try:
                r = requests.get(downUrls[0], timeout=30)
                r.raise_for_status()
            except requests.RequestException as err:
                # Do not write an error page to disk as a .jpg; give up on
                # the rest of this album.
                print('{} - 第{}张下载失败 ({})\n'.format(title, cnt, err))
                break
            with open(picPath, 'wb') as f:
                f.write(r.content)
            print('{} - 第{}张下载已完成\n'.format(title, cnt))
            if cnt == totalPics:
                # Last picture: no need to request a further page.
                break
            nextPageUrls = re.findall(r'href="(.+?)">下一张', pageText)
            if not nextPageUrls:
                break
            pageText = getHTMLText(nextPageUrls[0])
            if pageText is None:
                break

def main():
    """Ask for a star's name and download every listing page of their pictures.

    The name is converted to pinyin to build the listing URL on
    www.win4000.com. Pictures are stored under ``D://pics//<name>//``. The
    listing pages are followed through their ``next`` link until a page has
    none.
    """
    name = input("请输入你喜欢的明星的名字:")
    nameUrl = "http://www.win4000.com/mt/" + ''.join(lazy_pinyin(name)) + ".html"
    text = getHTMLText(nameUrl)
    # getHTMLText returns None when the request fails; the site's own
    # "暂无…!" notice likewise means there is nothing to download. Check both
    # explicitly instead of relying on a TypeError raised by re.findall(None).
    if text is None or re.findall(r'暂无(.+)!', text):
        print("不好意思,没有{}的照片".format(name))
        return
    root = "D://pics//" + name + "//"
    # makedirs also creates the parent "pics" folder when it is missing,
    # where os.mkdir would raise FileNotFoundError.
    os.makedirs(root, exist_ok=True)
    while True:
        downPictures(text, root, name)
        nextLinks = re.findall(r'next" href="(.+)"', text)
        if not nextLinks:
            # No "next" link on this page: every listing page was processed.
            print("已全部下载完毕")
            return
        text = getHTMLText(nextLinks[0])
        if text is None:
            # The next listing page could not be fetched; stop here rather
            # than pass None on to the regex helpers.
            return

# Start the downloader only when this file is run as a script, not on import.
if __name__ == "__main__":
    main()

Copyright © 2019 - 2024 Jingwang Li

Powered by Hexo | Theme Hiker