No matter the pain or the hardship, you should still feel the strength to live. ---- Noragami
Site address: www.xiaohuar.com/hua
Workflow:
- Get the total number of pages
- Get the album URLs of all the campus belles on the current page
- Get all the image URLs inside each album
- Download the images
Libraries used:
- requests
- re
- os
- bs4.BeautifulSoup
- urllib.request.urlretrieve
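Of these, re, os, and urllib.request ship with Python; requests, bs4, and the lxml parser that the script hands to BeautifulSoup are third-party and need installing first, e.g. with `pip install requests beautifulsoup4 lxml`.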
The site is fairly simple, so I'll just post the full source code ~
```python
import requests
import re
import os
from bs4 import BeautifulSoup
from urllib.request import urlretrieve


class Xiaohua():

    def __init__(self):
        self.url_list = []
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 "
                                      "(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}
        # Root folder for all downloads
        try:
            os.mkdir('xiaohua')
        except FileExistsError:
            pass

    # Get the total number of pages
    def get_page(self):
        url = "http://www.xiaohuar.com/hua/"
        get_request_page = requests.get(url, headers=self.headers).text
        Bsoup = BeautifulSoup(get_request_page, 'lxml')
        Bfind = Bsoup.find('div', class_="page_num").find_all('a')
        # 尾页 ("last page") is the final link in the pager; its URL carries the highest page index
        re_str = re.compile(r'<a href="http://www.xiaohuar.com/list-1-(\d+).html">尾页</a>')
        page = int(re_str.search(str(Bfind[-1])).groups()[0]) + 1
        return page

    # Get the album URLs of all the campus belles on the current page
    def get_image_url(self, page):
        print("Fetching the album URLs on page {} ...".format(page + 1))
        url = "http://www.xiaohuar.com/list-1-{}.html".format(page)
        request_url = requests.get(url, headers=self.headers).text
        img_url_soup = BeautifulSoup(request_url, 'lxml')
        img_find = img_url_soup.find('div', class_="demo clearfix").find_all('div', class_="img")
        re_name = re.compile(r'<a href="(.*?)" target="_blank"><img alt="(.*?)" src')
        for img in img_find:
            link = re_name.search(str(img)).groups()
            # 'p-' pages are profiles; swapping in 's-' points at the photo album instead
            self.url_list.append(link[0].replace('p-', 's-'))
            print("Got the album URL of '{}'".format(link[1]))
        print("Album URLs on page {} done!".format(page + 1))
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
        self.url_list.sort()  # list.sort() works in place and returns None, so sort first ...
        return self.url_list  # ... then return the list itself

    # Download a single image
    def download(self, image_site, name):
        print("Downloading `{}`".format(image_site.split('/')[-1]))
        url = "http://www.xiaohuar.com" + image_site
        try:
            os.mkdir('xiaohua/{}'.format(name))
        except FileExistsError:
            pass
        urlretrieve(url, 'xiaohua/{}/{}'.format(name, image_site.split('/')[-1]))

    # Get every image URL inside an album and download it
    def get_xiaohua_total_img(self):
        while self.url_list:
            url = self.url_list.pop()
            img_request = requests.get(url, headers=self.headers).text
            bs = BeautifulSoup(img_request, 'lxml')
            bfind_name = bs.find('div', class_="pic_con_box ad-gallery").find_all('h1')
            re_name = re.compile(r'<h1>(.*?)<span class')
            xiaohua_name = re_name.search(str(bfind_name[0])).groups()[0]
            bfind = bs.find('ul', class_="ad-thumb-list").find_all('a')
            re_img = re.compile(r'<a class="" href="(.*?)"')
            print("Start downloading `{}`".format(xiaohua_name))
            for img in bfind:
                imgs = re_img.search(str(img)).groups()[0]
                self.download(imgs, xiaohua_name)
            print("`{}` finished~".format(xiaohua_name))
            print("***************************************")

    def main(self):
        page = self.get_page()
        for num in range(page):
            self.get_image_url(num)
            self.get_xiaohua_total_img()
        print("Crawl finished!")


if __name__ == '__main__':
    x = Xiaohua()
    x.main()
```
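One caveat about the download step: urlretrieve opens its own connection, so it sends neither the custom User-Agent header that every other request in the script carries, nor does it take a timeout. If the site ever starts rejecting those bare requests, the download can go through requests instead. Below is a minimal sketch of that variant; download_with_requests is a hypothetical helper, not part of the original script, and it assumes the same headers dict and xiaohua/ folder layout used above:

```python
import os
import requests


def download_with_requests(image_site, name, headers):
    # Hypothetical drop-in for Xiaohua.download(), streaming via requests
    url = "http://www.xiaohuar.com" + image_site
    os.makedirs('xiaohua/{}'.format(name), exist_ok=True)
    resp = requests.get(url, headers=headers, stream=True, timeout=10)
    resp.raise_for_status()  # fail loudly on 403/404 instead of saving an error page
    with open('xiaohua/{}/{}'.format(name, image_site.split('/')[-1]), 'wb') as f:
        for chunk in resp.iter_content(chunk_size=8192):
            f.write(chunk)
```

Streaming with iter_content keeps memory usage flat no matter how large the image is, and raise_for_status stops an HTML error page from being saved with a .jpg name.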
Result: