爬取网站:美桌(www.win4000.com)。由于该网站并没有 robots 协议,因此可以任意爬取。
技术路线:requests + re
除此之外,还用到了 pypinyin 库,用于将明星的名字转换为拼音。
"""
Created on Mon Oct 16 20:32:27 2017

@Author: Jingwang Li
@Email: JingwangLi@foxmail.com

Download a celebrity's photo albums from www.win4000.com.

The user types a name, the name is converted to pinyin to build the listing
URL, and every album linked from the listing pages is saved into its own
directory under ``D://pics//<name>//``.
"""
import os
import re

import requests
from pypinyin import pinyin, lazy_pinyin


def getHTMLText(url):
    """Fetch *url* and return the decoded page text.

    Returns ``None`` (after printing the reason) when the request fails or
    the server answers with an error status, so callers can test for it
    instead of relying on a later ``TypeError``.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # Let requests guess the charset from the body rather than trusting
        # the response headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException as exc:
        print("请求失败:{} ({})".format(url, exc))
        return None


def getPageUrls(text, name):
    """Return ``(album_url, thumbnail_url)`` tuples found in a listing page.

    Only links whose thumbnail ``alt`` text starts with *name* are returned.
    *name* is escaped so that regex metacharacters in user input are matched
    literally.
    """
    re_pageUrl = r'href="(.+)">\s*<img src="(.+)" alt="' + re.escape(name)
    return re.findall(re_pageUrl, text)


def _firstMatch(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*, else ``None``."""
    match = re.search(pattern, text)
    return match.group(1) if match else None


def _downloadAlbum(pageUrl, path, title):
    """Walk one album page by page and save each picture as ``<n>.jpg`` in *path*."""
    pageText = getHTMLText(pageUrl)
    if pageText is None:
        return

    # The original pattern ended in an unescaped ")" and did not compile.
    # The closing parenthesis is matched literally here.
    # NOTE(review): presumably the page renders the counter as "(1/<em>N</em>)"
    # -- confirm against the live markup.
    total = _firstMatch(r'<em>\s*(\d+)\s*</em>\)', pageText)
    if total is None:
        print('{} - 页面解析失败,已跳过\n'.format(title))
        return
    totalPics = int(total)

    for cnt in range(1, totalPics + 1):
        downUrl = _firstMatch(r'href="(.+?)" class="">下载图片', pageText)
        if downUrl is None:
            print('{} - 页面解析失败,已跳过\n'.format(title))
            return
        try:
            r = requests.get(downUrl, timeout=30)
            r.raise_for_status()
        except requests.RequestException as exc:
            print('{} - 第{}张下载失败:{}\n'.format(title, cnt, exc))
            return
        with open(os.path.join(path, str(cnt) + ".jpg"), 'wb') as f:
            f.write(r.content)
        print('{} - 第{}张下载已完成\n'.format(title, cnt))

        # Do not follow the "next picture" link after the last picture: the
        # page is not needed and the link may be missing.
        if cnt == totalPics:
            break
        nextPageUrl = _firstMatch(r'href="(.+?)">下一张', pageText)
        pageText = getHTMLText(nextPageUrl) if nextPageUrl else None
        if pageText is None:
            return


def downPictures(text, root, name):
    """Download every album listed in *text* into a sub-directory of *root*.

    *text* is the HTML of one listing page, *root* the destination directory
    and *name* the celebrity name used to recognise album links. An album
    whose directory already contains files is treated as downloaded and is
    skipped.
    """
    pageUrls = getPageUrls(text, name)
    titles = re.findall(r'alt="' + re.escape(name) + r'(.+)" ', text)
    # zip() stops at the shorter list, so a mismatch between the two regex
    # results cannot raise IndexError.
    for (pageUrl, _thumbUrl), title in zip(pageUrls, titles):
        path = os.path.join(root, title)
        os.makedirs(path, exist_ok=True)
        if os.listdir(path):
            continue
        _downloadAlbum(pageUrl, path, title)
    return


def main():
    """Prompt for a name and download all of that person's albums."""
    name = input("请输入你喜欢的明星的名字:")
    nameUrl = "http://www.win4000.com/mt/" + ''.join(lazy_pinyin(name)) + ".html"

    text = getHTMLText(nameUrl)
    # A failed request and a page containing the "暂无...!" placeholder both
    # mean there is nothing to download.
    if text is None or re.search(r'暂无(.+)!', text):
        print("不好意思,没有{}的照片".format(name))
        return

    root = "D://pics//" + name + "//"
    # makedirs also creates the parent "pics" directory when it is missing.
    os.makedirs(root, exist_ok=True)

    while True:
        downPictures(text, root, name)
        nextPage = _firstMatch(r'next" href="(.+)"', text)
        if nextPage is None:
            print("已全部下载完毕")
            return
        text = getHTMLText(nextPage)
        if text is None:
            # getHTMLText has already printed the failure reason.
            return


if __name__ == '__main__':
    main()