Python：使用 Scrapy 文件管道（FilesPipeline）下载图集

爬虫文件（spider）

# -*- coding: utf-8 -*-

import re

from time import sleep

import scrapy

from scrapy.linkextractors import LinkExtractor

from scrapy.spiders import CrawlSpider, Rule

class AngelSpider(CrawlSpider):
    """Crawl image galleries on angelimg.spbeen.com, one item per gallery.

    A gallery is spread over several pages, each showing one image. The
    spider walks the "next" links and accumulates every image URL into a
    single item, which is yielded once the last page has been reached.

    The yielded item is a plain dict with the keys:
        files      -- empty list created here (not filled by this spider)
        file_urls  -- absolute URLs of every image found in the gallery
        dir_name   -- sanitised gallery title, used as a directory name
    """

    name = 'angel'

    allowed_domains = ['angelimg.spbeen.com']

    start_urls = ['http://angelimg.spbeen.com/']

    # No longer used for URL building (response.urljoin is used instead);
    # kept so external code reading this attribute keeps working.
    base_url = "http://angelimg.spbeen.com"

    # Anything that is not a CJK ideograph, ASCII digit or ASCII letter is
    # stripped from the gallery title before it is used as a directory name.
    _UNSAFE_DIR_CHARS = re.compile(
        u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])"
    )

    rules = (
        # Dots are escaped so they only match a literal "." in the host name.
        Rule(
            LinkExtractor(allow=r'^http://angelimg\.spbeen\.com/ang/\d+$'),
            callback='parse_item',
            follow=False,
        ),
    )

    def _new_item(self, response):
        """Create the item for a gallery from the gallery's first page."""
        title = response.xpath(
            './/div[@class="article"]/h2/text()'
        ).extract_first() or ''

        # Keep only the part of the title before the first '【'.
        dir_name = self._UNSAFE_DIR_CHARS.sub("", title.split('【')[0])
        if not dir_name:
            # Title missing or made only of stripped characters: fall back
            # to the last path segment of the page URL.
            dir_name = response.url.rstrip('/').rsplit('/', 1)[-1]
            self.logger.warning(
                'No usable title on %s, using %r as directory name',
                response.url, dir_name,
            )

        return {'files': [], 'file_urls': [], 'dir_name': dir_name}

    def parse_item(self, response):
        """Collect the image of one gallery page and follow the next page.

        The partially built item travels between pages in
        ``response.meta['item']``. When it is absent this is the first page
        of a gallery and a new item is created.

        Yields either a ``scrapy.Request`` for the next page of the gallery
        or, on the last page, the completed item.
        """
        self.logger.debug('Parsing gallery page %s', response.url)

        item = response.meta.get('item')
        if not item:
            item = self._new_item(response)

        img_url = response.xpath(
            './/div[@id="content"]/a/img/@src'
        ).extract_first()
        if img_url:
            # urljoin leaves absolute URLs untouched and resolves relative ones.
            item['file_urls'].append(response.urljoin(img_url))
        else:
            # Never store None: the files pipeline cannot build a request for it.
            self.logger.warning('No image found on %s', response.url)

        # If there is a next page, request it; otherwise hand the item to
        # the pipelines.
        next_url = response.xpath(
            './/div[@class="page"]//a[contains(@class,"next")]/@href'
        ).extract_first()

        if next_url:
            yield scrapy.Request(
                response.urljoin(next_url),
                callback=self.parse_item,
                meta={'item': item},
            )
        else:
            yield item

管道文件（pipelines.py）：自定义管道继承 Scrapy 的文件管道 FilesPipeline

# -*- coding: utf-8 -*-

# Define your item pipelines here

#

# Don't forget to add your pipeline to the ITEM_PIPELINES setting

# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import hashlib

import os

from scrapy.pipelines.files import FilesPipeline

class AngelimgPipeline(object):
    """Pass-through pipeline stage: every item is handed on unchanged."""

    def process_item(self, item, spider):
        """Return ``item`` as received; ``spider`` is accepted but not used."""
        unchanged = item
        return unchanged

from scrapy.http import Request

from scrapy.utils.python import to_bytes

class DealFilePathPipeline(FilesPipeline):
    """Files pipeline that groups downloads into one directory per item.

    Each download request carries its originating item in ``meta`` so that
    ``file_path`` can read the item's ``dir_name`` and store the file under
    ``full2/<dir_name>/``. When no ``dir_name`` is available the file is
    stored under ``full/`` instead.
    """

    def get_media_requests(self, item, info):
        """Return one download request per URL in the item's file-URL field.

        The item is attached as ``meta['item']`` so ``file_path`` can reach it.
        """
        urls = item.get(self.files_urls_field, [])
        return [Request(url, meta={'item': item}) for url in urls]

    def file_path(self, request, response=None, info=None):
        """Return the storage path (relative to FILES_STORE) for a download.

        ``request`` is normally a ``Request``; a plain URL string is also
        accepted for the deprecated ``file_key(url)`` call path.

        The file name is the SHA-1 hex digest of the URL followed by the
        extension taken from the URL.
        """

        ## start of deprecation warning block (can be removed in the future)

        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings

            warnings.warn('FilesPipeline.file_key(url) method is deprecated, please use '
                          'file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() method has been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)

        ## end of deprecation warning block

        # A plain URL string has no ``meta``; treat it as "no item attached"
        # instead of failing with AttributeError.
        meta = getattr(request, 'meta', None) or {}
        item = meta.get('item') or {}
        dir_name = item.get('dir_name')

        media_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # change to request.url after deprecation
        media_ext = os.path.splitext(url)[1]  # change to request.url after deprecation

        if dir_name:
            return 'full2/{}/{}{}'.format(dir_name, media_guid, media_ext)

        # No directory name known: fall back to the flat layout rather than
        # raising KeyError.
        return 'full/%s%s' % (media_guid, media_ext)

    # deprecated
    def file_key(self, url):
        """Deprecated entry point: delegate to ``file_path`` with a URL string."""
        return self.file_path(url)

    # Marker checked in file_path to detect subclasses overriding file_key.
    file_key._base = True

settings.py（项目配置文件）

# -*- coding: utf-8 -*-

# Scrapy settings for angelImg project

#

# For simplicity, this file contains only settings considered important or

# commonly used. You can find more settings consulting the documentation:

#

#     https://doc.scrapy.org/en/latest/topics/settings.html

#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html

#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "angelImg"

# Both spider-module settings point at the same package, so the list is
# built from the single module path.
NEWSPIDER_MODULE = "angelImg.spiders"
SPIDER_MODULES = [NEWSPIDER_MODULE]

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'angelImg (+http://www.yourdomain.com)'

# robots.txt rules are NOT obeyed by this project (the flag is off).
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)

#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)

# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay

# See also autothrottle settings and docs

#DOWNLOAD_DELAY = 3

# The download delay setting will honor only one of:

#CONCURRENT_REQUESTS_PER_DOMAIN = 16

#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)

#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)

#TELNETCONSOLE_ENABLED = False

# Override the default request headers:

# Default headers: a desktop Chrome User-Agent string and the site's home
# page as Referer. The stock Accept / Accept-Language entries stay disabled.
DEFAULT_REQUEST_HEADERS = {
    # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/79.0.3945.88 Safari/537.36"
    ),
    "Referer": "http://angelimg.spbeen.com/",
}

# Enable or disable spider middlewares

# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html

#SPIDER_MIDDLEWARES = {

#    'angelImg.middlewares.AngelimgSpiderMiddleware': 543,

#}

# Enable or disable downloader middlewares

# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html

#DOWNLOADER_MIDDLEWARES = {

#    'angelImg.middlewares.AngelimgDownloaderMiddleware': 543,

#}

# Enable or disable extensions

# See https://doc.scrapy.org/en/latest/topics/extensions.html

#EXTENSIONS = {

#    'scrapy.extensions.telnet.TelnetConsole': None,

#}

# Configure item pipelines

# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html

# Only the custom files pipeline is enabled; the pass-through pipeline and
# the stock FilesPipeline are left disabled.
#   'angelImg.pipelines.AngelimgPipeline': 300
#   'scrapy.pipelines.files.FilesPipeline': 2
ITEM_PIPELINES = {'angelImg.pipelines.DealFilePathPipeline': 200}

# Value of the FILES_STORE setting for this project.
FILES_STORE = 'file_doload'

# Enable and configure the AutoThrottle extension (disabled by default)

# See https://doc.scrapy.org/en/latest/topics/autothrottle.html

#AUTOTHROTTLE_ENABLED = True

# The initial download delay

#AUTOTHROTTLE_START_DELAY = 5

# The maximum download delay to be set in case of high latencies

#AUTOTHROTTLE_MAX_DELAY = 60

# The average number of requests Scrapy should be sending in parallel to

# each remote server

#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0

# Enable showing throttling stats for every response received:

#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)

# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings

#HTTPCACHE_ENABLED = True

#HTTPCACHE_EXPIRATION_SECS = 0

#HTTPCACHE_DIR = 'httpcache'

#HTTPCACHE_IGNORE_HTTP_CODES = []

#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

python文件管道下载图集的更多相关文章

PYTHON文件多线程下载
其实,在一般的文件编程中,这有两个概念要说明: 第一是,下载一个大文件,将这个大文件多为多线程. 第二是,下载N多小文件,将每个线程指定下载多个小文件. 现在实现的是多线程下载一个大文件. 今天完成了 ...
2、Python djang 框架下的word Excel TXT Image 等文件的下载
2.python实现文件下载 (1)方法一.直接用a标签的href+数据库中文件地址,即可下载.缺点:word excel是直接弹框下载,对于image txt 等文件的下载方式是直接在新页面打开. ...
python爬虫之下载文件的方式总结以及程序实例
python爬虫之下载文件的方式以及下载实例目录第一种方法:urlretrieve方法下载第二种方法:request download 第三种方法:视频文件.大型文件下载实战演示第一种方法: ...
Python selenium 文件自动下载（自动下载器）
MyGithub:https://github.com/williamzxl 最新代码已经上传到Github,以下版本为stupid版本. 由于在下载过程中需要下载不同文件,所以可以把所有类型放在Va ...
Python 文件操作函数
这个博客是 Building powerful image classification models using very little data 的前期准备,用于把图片数据按照教程指示放到规定的文 ...
python文件打包格式，pip包管理
1..whl是python文件的一种打包格式, 在有些情况下,可以将文件的后缀名改为.zip并解压 2.cmd中,提示pip版本太低,先升级pip pip install --upgrade pi ...
Python文件系统功能：os模块
Python文件系统功能:os模块 1.os模块方法分类 (1)目录: chdir() 改变工作目录 chroot() 设定当前进程的根目录 listdir() 列出指定目录下的所有文件名 mkdir ...
利用pyinstaller 打包Python文件
1.下载安装pyinstaller模块 cmd 命令: pip install pyinstaller cmd命令: pip list 查看自己安装的模块 2.建议把要大包的Python文件单独放到新 ...
随手用python写一个下载jdk源码爬虫
最近在研读jdk源码,网上找了下资源,发现都不完整. 后来新发现了一个有完整源码的地方,主要包括了java,c,c++的东西,装逼需要,就想拿来玩玩.但是,找了好多种下载打开的方式,发现都不对.于是, ...

随机推荐

HTML页面的基本信息
1.python中生成的html页面,每一段的基本解释,以及header中的应用 2.body中的应用 2.1.a href链接点击baidu直接跳转百度网址,如果需要重新打开一个页面,详情看2.16 ...
python变量及简单数据类型
python 目录 python 1.变量 1.变量的定义 2.变量的命名 3. 关键字 4.变量的命名规则 5.变量的类型 5.不同类型变量之间的计算 6.变量的输入 7.变量的格式化输出 8.格式 ...
C语言汇总2
(10-15) 注释:1.单行注释可以嵌套单行注释 eg .//lalalal//lalalal(/后面都是注释完的) 2.多行注释可以嵌套单行注释 (两个**之间的都是注释的) 3.单行注释可以嵌套 ...
php Zookeeper使用踩坑
用的是Zookeeper扩展,Php版本为7.2.17,下载地址: https://pecl.php.net/package/zookeeper 用的是0.6.4版本创建节点官方给的示例如下: &l ...
CVE-2020-0796（Windows SMBv3） RCE漏洞复现
CVE-2020-0796 攻击机:win10:192.168.205.1 靶机win10:192.168.205.132 关闭defender防火墙 0x01 影响版本 Windows 10 190 ...
svn的使用学习
一:安装 1.svn安装包,语言包下载地址:https://pan.baidu.com/s/1PFM7ya_hNJM-v979KgCpgA 提取码:mpxq 2.运行下载的TortoiseSVN程序 ...
源码分析springboot自定义jackson序列化，默认null值个性化处理返回值
最近项目要实现一种需求,对于后端返回给前端的json格式的一种规范,不允许缺少字段和字段值都为null,所以琢磨了一下如何进行将springboot的Jackson序列化自定义一下,先看看如何实现,再 ...
oracle数据库外部连接无法访问
服务器出现的问题是运行的项目无法访问oracle数据库连接,用plsql输入用户名密码后卡死,无法连接.但是通过命令窗口对oracle数据库操作正常,对oracle服务进行查看并重启,并无异常,运行t ...
const pointers
1 指针 p对应的地址是常量,但是里面存放的data不是常量 2 地址里存放的data是常量,但是地址不是常量 3 地址和指针都是常量
GAN在seq2seq中的应用 Application to Sequence Generation
Improving Supervised Seq-to-seq Model 有监督的 seq2seq ,比如机器翻译.聊天机器人.语音辨识之类的 . 而 generator 其实就是典型的 seq2s ...

python文件管道 下载图集

python文件管道 下载图集的更多相关文章

随机推荐

热门专题

python文件管道下载图集

python文件管道下载图集的更多相关文章