福利爬虫妹子图之获取种子url

import os

import uuid

from lxml import html

import aiofiles

import logging

from ruia import Spider, Request

from ruia_ua import middleware

from aiohttp探究.db import MotorBase

import datetime

# URL template of a listing page; "{}" is filled with the page number
# when the launcher script builds the spider's start URLs.
demo = "https://www.mzitu.com/page/{}/"

class BaiduImgSpider(Spider):
    """Collect seed image URLs from mzitu.com listing pages.

    Flow:

    * ``parse`` receives a listing page and schedules one request per
      gallery linked from it.
    * ``next_page`` receives the first page of a gallery, derives the image
      URL for every page of that gallery and stores one record per image in
      the ``mzitu2`` collection of the ``img_data`` database.
    * ``save_img`` writes a downloaded image to disk.  Nothing in this class
      schedules it; it is meant as the callback of an image request whose
      metadata carries ``url``, ``name`` and ``id``.
    """

    # Listing-page URLs; assigned by the launcher script before ``start()``.
    start_urls = []

    # Root directory (with trailing slash) under which ``save_img`` writes.
    img_path = 'data/'

    async def parse(self, res):
        """Yield one gallery request per entry found on a listing page.

        :param res: response of a listing page; ``res.html`` holds its markup.
        :return: async generator of ``Request`` objects handled by
            ``next_page``; the gallery title travels in ``metadata["name"]``.
        """
        self.mongo_db = MotorBase().get_db('img_data')

        root = html.fromstring(res.html)
        url_list = root.xpath("//ul[@id='pins']/li/a/@href")
        name_list = root.xpath("//ul[@id='pins']/li/a/img/@alt")

        headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'max-age=0',
            'referer': 'https://www.mzitu.com/mm/',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
        }

        # zip() stops at the shorter list, so a link without a matching
        # title (or the reverse) is skipped rather than mis-paired at the end.
        for name, url in zip(name_list, url_list):
            yield Request(url, headers=headers, callback=self.next_page, metadata={"name": name}, res_type='text')

    @staticmethod
    def _seed_record(url, title, img_id, headers):
        """Build the document stored for a single image URL."""
        return {
            'url': url,
            "status": "0",
            'title': title,
            "img_id": img_id,
            "headers": headers,
            "crawler_date": datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        }

    async def next_page(self, res):
        """Store one seed record per image of the gallery behind ``res``.

        The first image URL is read from the page; the URLs of the remaining
        pages are derived from it by replacing ``"01."`` with the zero-padded
        page number.

        :param res: response of a gallery's first page; ``res.metadata["name"]``
            is the gallery title set by ``parse``.
        """
        root = html.fromstring(res.html)
        name = res.metadata.get("name")
        refere_url = res.url

        img_url_node = root.xpath("//div[@class='main-image']/p/a/img/@src")
        if not img_url_node:
            # Without the first image there is nothing to derive URLs from.
            logging.warning('No image found on %s, gallery skipped', refere_url)
            return
        img_url = img_url_node[0]

        # XPath of the last page number shown in the page navigation.
        max_page_list = "//div[@class='pagenavi']/a[last()-1]/span/text()"
        _max_page_num = root.xpath(max_page_list)
        try:
            max_page_num = int(_max_page_num[0])
        except (IndexError, ValueError):
            # Navigation missing or not numeric: keep only the first image.
            max_page_num = 1

        base_headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'max-age=0',
            'if-modified-since': 'Thu, 15 Nov 2018 04:24:11 GMT',
            'if-none-match': '"5becf4eb-1b7d4"',
            'referer': refere_url,
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
        }

        # Every record gets its OWN headers dict.  Sharing one dict and
        # rewriting its referer per page would leave all records pointing at
        # the referer of the last page.
        # NOTE(review): the first record uses the string "1" as img_id while
        # later records use the integer page number; kept as-is because
        # consumers of the collection are not visible here.
        datas = [self._seed_record(img_url, name, "1", dict(base_headers))]

        for page in range(2, max_page_num + 1):
            page_tag = str(page).zfill(2)
            page_headers = dict(base_headers, referer=f"{refere_url}{page_tag}")
            next_img_url = img_url.replace("01.", f"{page_tag}.")
            datas.append(self._seed_record(next_img_url, name, page, page_headers))

        await self.mongo_db.mzitu2.insert_many(datas)

    async def save_img(self, res):
        """Write the body of an image response to ``<img_path><name>/<id>.<ext>``.

        :param res: response whose ``metadata`` provides ``url`` (used for the
            file extension), ``name`` (sub-directory) and ``id`` (file stem);
            ``res.html`` is written as-is in binary mode, so it is presumably
            the raw image bytes.
        """
        url = res.metadata.get("url")
        name = res.metadata.get("name")
        img_id = res.metadata.get("id")

        # rsplit() always returns at least one element, so the extension is
        # only present when the split actually produced two parts.
        url_parts = url.rsplit(".", 1)
        img_type = url_parts[1] if len(url_parts) == 2 else None

        img_all_path = f"{self.img_path}{name}/"
        os.makedirs(img_all_path, exist_ok=True)

        # A URL without an extension yields a file name without one.
        img_name = f"{img_id}.{img_type}" if img_type else f"{img_id}"
        target = img_all_path + img_name

        async with aiofiles.open(target, 'wb') as fp:
            await fp.write(res.html)
            logging.info('Img downloaded successfully in %s', target)

if __name__ == '__main__':
    word = '妹子图'  # output directory name
    pages = 201  # exclusive upper bound of the listing page numbers

    BaiduImgSpider.img_path = word + "/"

    # Start at page 1: the range previously began at 0 and requested
    # ".../page/0/", which is not a distinct listing page.
    # NOTE(review): assumes the site's pagination is 1-based - confirm.
    BaiduImgSpider.start_urls = [demo.format(page) for page in range(1, pages)]

    BaiduImgSpider.start(middleware=middleware)

db.py（即上文 `from aiohttp探究.db import MotorBase` 所导入的模块）

import asyncio

from motor.motor_asyncio import AsyncIOMotorClient

class MotorBase:
    """Small factory for motor database handles on a local MongoDB.

    Handles are cached per database name in the class-level ``_db`` dict, so
    every instance shares them.

    Motor documentation: https://github.com/mongodb/motor
    """

    # Database name -> database handle, shared by all instances.
    _db = {}

    # Not used by any method of this class.
    _collection = {}

    def __init__(self, loop=None):
        """Remember the event loop handed to the clients created later.

        :param loop: event loop to use; when falsy, the result of
            ``asyncio.get_event_loop()`` is used instead.
        """
        self.motor_uri = ''
        if not loop:
            loop = asyncio.get_event_loop()
        self.loop = loop

    def client(self, db):
        """Create a new motor client for the database *db*.

        The connection URI is also kept on ``self.motor_uri``.

        :param db: database name appended to the URI
        :return: a new ``AsyncIOMotorClient``
        """
        uri = f"mongodb://localhost:27017/{db}"
        self.motor_uri = uri
        return AsyncIOMotorClient(uri, io_loop=self.loop)

    def get_db(self, db='test'):
        """Return the database handle for *db*, creating it on first use.

        :param db: database name
        :return: the cached motor database instance
        """
        cache = self._db
        if db in cache:
            return cache[db]

        handle = self.client(db)[db]
        cache[db] = handle
        return handle

福利爬虫妹子图之获取种子url的更多相关文章

关于如何爬虫妹子图网的源码分析 c#实现
网上也出现一些抓取妹子图的python 代码,今天我们用c#实现爬虫过程. 请看我的网站: www.di81.com private void www_94xmn_Com(string url, st ...
爬虫实战【5】送福利！Python获取妹子图上的内容
[插入图片,妹子图首页] 哈,只敢放到这个地步了. 今天给直男们送点福利,通过今天的代码,可以把你的硬盘装的满满的~ 下面就开始咯! 第一步:如何获取一张图片假如我们知道某张图片的url,如何获取到 ...
python妹子图爬虫5千张高清大图突破防盗链福利5千张福利高清大图
meizitu-spider python通用爬虫-绕过防盗链爬取妹子图这是一只小巧方便,强大的爬虫,由python编写所需的库有 requests BeautifulSoup os lxml 伪 ...
Python协程爬取妹子图(内有福利，你懂得~)
项目说明: 1.项目介绍本项目使用Python提供的协程+scrapy中的选择器的使用(相当好用)实现爬取妹子图的(福利图)图片,这个学会了,某榴什么的.pow(2, 10)是吧! 2.用到的知 ...
Scrapy框架实战-妹子图爬虫
Scrapy这个成熟的爬虫框架,用起来之后发现并没有想象中的那么难.即便是在一些小型的项目上,用scrapy甚至比用requests.urllib.urllib2更方便,简单,效率也更高.废话不多说, ...
Python爬虫入门教程 2-100 妹子图网站爬取
妹子图网站爬取---前言从今天开始就要撸起袖子,直接写Python爬虫了,学习语言最好的办法就是有目的的进行,所以,接下来我将用10+篇的博客,写爬图片这一件事情.希望可以做好. 为了写好爬虫,我们 ...
Python3爬虫系列：理论+实验+爬取妹子图实战
Github: https://github.com/wangy8961/python3-concurrency-pics-02 ,欢迎star 爬虫系列: (1) 理论 Python3爬虫系列01 ...
[Python爬虫]煎蛋网OOXX妹子图爬虫（1）——解密图片地址
之前在鱼C论坛的时候,看到很多人都在用Python写爬虫爬煎蛋网的妹子图,当时我也写过,爬了很多的妹子图片.后来煎蛋网把妹子图的网页改进了,对图片的地址进行了加密,所以论坛里面的人经常有人问怎么请求的 ...
Python Scrapy 爬取煎蛋网妹子图实例（一）
前面介绍了爬虫框架的一个实例,那个比较简单,这里在介绍一个实例爬取煎蛋网妹子图,遗憾的是上周煎蛋网还有妹子图了,但是这周妹子图变成了随手拍, 不过没关系,我们爬图的目的是为了加强实战应用,管 ...

随机推荐

【BZOJ3413】匹配（后缀自动机，线段树合并）
[BZOJ3413]匹配(后缀自动机,线段树合并) 题面 BZOJ 题解很好的一道题目. 做一个转化,匹配的次数显然就是在可以匹配的区间中,每个前缀的出现次数之和. 首先是空前缀的出现次数,意味着你 ...
【Luogu4512】多项式除法（FFT）
题面洛谷题解模板题... 我直接蒯我写的东西... 这个除法是带余除法,所以并不能直接求逆解决. 要求的就是给定两个多项式\(A(x),B(x)\),其项数为\(n,m\) 求解一个\(n-m\ ...
bzoj2599/luogu4149 [IOI2011]Race (点分治)
点分治.WA了一万年. 重点就是统计答案的方法做法一(洛谷AC bzojWA 自测WA): 做点x时记到x距离为k的边数最小值为dis[k],然后对每一对有值的dis[i]和dis[K-i],给an ...
luogu1979 华容道 (dijkstra+bfs)
我想动某个点的话,一定要先把空白点移动到这个点旁边,然后调换这个点和空白点,一直重复那么,我们就可以记一些状态(x,y,s) (s={0,1},{0,-1},{1,0},{-1,0}),表示我要动的 ...
手动实现property装饰器
首先,property装饰器是通过数据描述符实现的.用法很简单,大家应该都知道,这里就不细说了. 这里主要分析一下property是如何通过描述符实现的. class Property: def __ ...
P2569 股票交易
题目大意: 你初始时有∞ 元钱,并且每天持有的股票不超过 Maxp . 有 T 天,你知道每一天的买入价格( AP[i] ),卖出价格( Bp[i] ), 买入数量限制( AS[i] ),卖出数量限制 ...
（转） JVM——Java类加载机制总结
背景:对java类的加载机制,一直都是模糊的理解,这篇文章看下来清晰易懂. 转载:http://blog.csdn.net/seu_calvin/article/details/52301541 1. ...
c输出格式
#include <stdio.h> #include <stdlib.h> #include <math.h> int main() { //取整 printf( ...
NO.10: 在operator=中处理 "自我赋值"
1.确保当对象自我赋值时operator=有良好的行为,其中的技术包括 "来源对象" 和 "目标对象" 的地址,精心周到的语句顺序,以及“ copy and s ...
用webstorm来开发微信小程序之less的配置
1.安装less. 安装好node之后,打开运行-->cmd-->进入安装node的文件夹目录-->输入 npm install -g less. 然后自动就会在C:\Users\A ...

福利爬虫妹子图之获取种子url

福利爬虫妹子图之获取种子url的更多相关文章

随机推荐

热门专题