
  1. 在爬虫文件中只需要解析提取出图片地址,然后将地址提交给管道
  2. 配置文件中:IMAGES_STORE = './imgsLib'
  3. 在管道文件中进行管道类的定制:
    • from scrapy.pipelines.images import ImagesPipeline
    • 将管道类的父类修改成ImagesPipeline
    • 重写父类的三个方法:get_media_requests、file_path、item_completed

# -*- coding: utf-8 -*-
import scrapy
from imgPro.items import ImgproItem


class ImgSpider(scrapy.Spider):
    """Spider that extracts image URLs from the listing page.

    The spider only locates the image addresses; each one is wrapped in an
    ``ImgproItem`` and yielded so that the item pipeline can download it.
    """

    name = 'img'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.521609.com/daxuemeinv/']

    # URL template and page counter for further listing pages; they are only
    # referenced by the disabled pagination code at the end of ``parse``.
    url = 'http://www.521609.com/daxuemeinv/list8%d.html'
    pageNum = 1

    def parse(self, response):
        """Yield one ``ImgproItem`` for every listing entry that has an image.

        Args:
            response: The downloaded listing page.

        Yields:
            ImgproItem: Item whose ``src`` field holds the absolute image URL.
        """
        for li in response.xpath('//*[@id="content"]/div[2]/div[2]/ul/li'):
            src = li.xpath('./a[1]/img/@src').extract_first()
            if not src:
                # extract_first() returns None when the <li> has no image;
                # building a URL from it would raise TypeError.
                continue

            item = ImgproItem()
            # urljoin resolves the (site-relative) src against the page URL
            # and leaves an already absolute src untouched.
            item['src'] = response.urljoin(src)
            yield item

        # Pagination is disabled; while it stays commented out only the
        # start page is crawled.
        # if self.pageNum < 3:
        #     self.pageNum += 1
        #     new_url = format(self.url % self.pageNum)
        #     yield scrapy.Request(new_url, callback=self.parse)


# -*- coding: utf-8 -*-

# Define here the models for your scraped items
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class ImgproItem(scrapy.Item):
    """Item describing a single image to download."""

    # Address of the image; the spider stores the image URL here and the
    # pipeline builds its download request from it.
    src = scrapy.Field()


# -*- coding: utf-8 -*-

# Define your item pipelines here
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

from scrapy.pipelines.images import ImagesPipeline
import scrapy


# class ImgproPipeline(object):
#     def process_item(self, item, spider):
#         return item


class ImgproPipeline(ImagesPipeline):
    """Image pipeline that downloads the URL stored in each item's ``src``."""

    def get_media_requests(self, item, info):
        """Send the request for one media resource.

        Args:
            item: The scraped item; its ``src`` field is the image URL.
            info: Pipeline bookkeeping object supplied by Scrapy (unused).

        Yields:
            scrapy.Request: The download request for the item's image.
        """
        yield scrapy.Request(item['src'])

    def file_path(self, request, response=None, info=None, *, item=None):
        """Choose the name under which the downloaded image is stored.

        The keyword-only ``item`` parameter mirrors the signature used by
        current Scrapy releases, which pass the item when calling this hook;
        callers that omit it keep working because it defaults to ``None``.

        Args:
            request: The request that fetched the image.
            response: Unused.
            info: Unused.
            item: Unused.

        Returns:
            str: The last ``/``-separated segment of the request URL.
        """
        return request.url.split('/')[-1]

    def item_completed(self, results, item, info):
        """Hand the item on to the next pipeline class to be executed.

        Args:
            results: Download results supplied by Scrapy (unused).
            item: The item whose media requests have finished.
            info: Unused.

        Returns:
            The unchanged ``item``.
        """
        return item


# -*- coding: utf-8 -*-

# Scrapy settings for imgPro project
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'imgPro'

SPIDER_MODULES = ['imgPro.spiders']
NEWSPIDER_MODULE = 'imgPro.spiders'

# Desktop-browser user agent sent with every request.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'imgPro (+http://www.yourdomain.com)'

# Obey robots.txt rules
# NOTE(review): no ROBOTSTXT_OBEY assignment is present here - confirm the
# intended value.
# LOG_FILE = './log.txt'

# Directory in which the image pipeline stores downloaded images.
IMAGES_STORE = './imgsLib'
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'imgPro.middlewares.ImgproSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'imgPro.middlewares.ImgproDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Enables the project's image pipeline; 300 is its order value.
ITEM_PIPELINES = {
    'imgPro.pipelines.ImgproPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# The initial download delay
# The maximum download delay to be set in case of high latencies
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'



