requests+正则表达式 爬取 妹子图
感谢 崔庆才大神的 爬虫教学视频 和 gitbook:
#!/usr/bin/env python
# coding=utf-8
"""Crawl a gallery index page and download every image of each linked gallery.

Pages are fetched with requests and parsed with regular expressions. Each
gallery is saved into a directory named after the first word of its page
title; every image is stored under the MD5 hex digest of its content, so
identical images are written only once per directory.
"""
import os
import re
from hashlib import md5

import requests
from requests.exceptions import RequestException

# Browser-like User-Agent sent with every request.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36"
}

# Seconds to wait for a server before giving up; without a timeout a single
# stalled connection would block the whole crawl forever.
REQUEST_TIMEOUT = 10

# The pages are decoded as gb2312 instead of the encoding requests guesses.
PAGE_ENCODING = "gb2312"

TITLE_PATTERN = re.compile(r"<title>(.*?)</title>", re.S)
IMAGE_PATTERN = re.compile(r'<img alt=.*?src="(.*?)" /><br />')
DETAIL_LINK_PATTERN = re.compile(r"""<a href="(.*?)".*?target='_blank'>""", re.S)

# Characters that cannot (or should not) appear in a directory name.
UNSAFE_NAME_CHARS = re.compile(r'[\\/:*?"<>|]')


def _fetch(url, error_message):
    """Fetch url and return the response, or None on failure.

    :param url: address to request
    :param error_message: text printed when the request raises
    :return: the response if the request succeeded with status 200, else None
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    except RequestException:
        print(error_message)
        return None
    if response.status_code != 200:
        return None
    return response


def _safe_dir_name(name):
    """Return name with path separators and other unsafe characters replaced.

    The name comes from a remote page, so it must not be able to point
    outside the working directory or contain characters the OS rejects.
    """
    return UNSAFE_NAME_CHARS.sub("_", name).strip(".")


def download_from_detail(url):
    """Download all images found on the gallery page at url."""
    item = get_dict(url)
    save_images(item)


def get_dict(url):
    """Fetch a gallery page and extract its title and image URLs.

    :param url: address of the gallery (detail) page
    :return: {"title": str, "images_url": list of str}, or None when the
        page cannot be fetched or has no usable title
    """
    response = _fetch(url, "request error")
    if response is None:
        return None

    # Must be set before reading .text so the body is decoded correctly.
    response.encoding = PAGE_ENCODING
    html = response.text

    title_match = TITLE_PATTERN.search(html)
    if title_match is None:
        return None
    # Only the first whitespace-separated word of the title is used.
    words = title_match.group(1).split()
    if not words:
        return None
    title = _safe_dir_name(words[0])
    if not title:
        return None

    return {
        "title": title,
        "images_url": IMAGE_PATTERN.findall(html),
    }


def save_images(item):
    """Save every image of item into a directory named after its title.

    :param item: dict as returned by get_dict, or None (then nothing is done)
    :return: None
    """
    if not item:
        return

    # 1. make sure the target directory exists
    folder = item["title"]
    if not os.path.exists(folder):
        os.mkdir(folder)

    # 2. download each image into that directory
    for url in item["images_url"]:
        image_response = _fetch(url, "request image error")
        if image_response is None:
            # Failed or non-200 download: skip it rather than saving an
            # error page with a .jpeg extension.
            continue
        content = image_response.content
        file_name = "{0}/{1}.{2}".format(folder, md5(content).hexdigest(), "jpeg")
        with open(file_name, "wb") as image_file:
            image_file.write(content)
        print("{0} writing successfully".format(file_name))


def get_page_index(url):
    """Fetch the index page at url and download every gallery it links to.

    :param url: address of the index page
    :return: None
    """
    response = _fetch(url, "request index error")
    if response is None:
        return

    response.encoding = PAGE_ENCODING
    for detail_url in DETAIL_LINK_PATTERN.findall(response.text):
        download_from_detail(detail_url)


if __name__ == "__main__":
    # The index URL is blank in the source; set it before running.
    url = ""
    get_page_index(url)
① gb2312 转 utf-8
