Pipeline Usage

Storing to MongoDB

Code in pipelines.py

import pymongo


class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Read the connection info from settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # The item class is expected to define a 'collection' attribute
        name = item.collection
        # Collection.insert() is removed in pymongo 4; use insert_one()
        self.db[name].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()

Settings configuration

MONGO_URI = 'localhost'  # Change this if MongoDB runs on a remote host
MONGO_DB = 'images360'
ITEM_PIPELINES = {
    'images360.pipelines.MongoPipeline': 301,
}
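Both the MongoDB pipeline above and the MySQL pipeline in the next section read the storage target from the item itself (item.collection and item.table respectively), so the item class must carry those attributes. A minimal sketch of such an item; the field names are illustrative assumptions, not taken from the original post:

# Hypothetical item definition: 'collection' names the MongoDB collection and
# 'table' the MySQL table; the fields below are illustrative.
from scrapy import Item, Field


class ImageItem(Item):
    collection = table = 'images'  # read as item.collection / item.table by the pipelines
    id = Field()
    url = Field()
    title = Field()
    thumb = Field()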

Storing to MySQL

Code in pipelines.py

import pymysql


class MysqlPipeline():
    def __init__(self, host, database, user, password, port):
        self.host = host
        self.database = database
        self.user = user
        self.password = password
        self.port = port

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            database=crawler.settings.get('MYSQL_DATABASE'),
            user=crawler.settings.get('MYSQL_USER'),
            password=crawler.settings.get('MYSQL_PASSWORD'),
            port=crawler.settings.get('MYSQL_PORT'),
        )

    def open_spider(self, spider):
        # Use keyword arguments: recent PyMySQL versions no longer accept positional ones
        self.db = pymysql.connect(host=self.host, user=self.user, password=self.password,
                                  database=self.database, charset='utf8', port=self.port)
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        self.db.close()

    def process_item(self, item, spider):
        print(item['title'])
        data = dict(item)
        keys = ', '.join(data.keys())
        values = ', '.join(['%s'] * len(data))
        # Build the INSERT dynamically from item.table and the item's fields;
        # the target table must already exist in MySQL.
        sql = 'insert into %s (%s) values (%s)' % (item.table, keys, values)
        self.cursor.execute(sql, tuple(data.values()))
        self.db.commit()
        return item

Settings configuration

MYSQL_HOST = 'localhost'  # For a remote connection, change this to the remote database host
MYSQL_DATABASE = 'images360'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'root'
MYSQL_PORT = 3306
ITEM_PIPELINES = {
    'images360.pipelines.MysqlPipeline': 302,
}
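Because the INSERT statement is built from item.table and whatever fields the item carries, the table must already exist with matching columns before the crawl starts. A one-off setup sketch under that assumption (table and column names follow the hypothetical ImageItem above; adjust credentials to your project):

import pymysql

# Hypothetical setup script: create the table the pipeline will insert into.
db = pymysql.connect(host='localhost', user='root', password='root',
                     port=3306, database='images360')
cursor = db.cursor()
cursor.execute(
    'CREATE TABLE IF NOT EXISTS images ('
    'id VARCHAR(255) NOT NULL PRIMARY KEY, '
    'url VARCHAR(255), title VARCHAR(255), thumb VARCHAR(255)'
    ') DEFAULT CHARSET=utf8'
)
db.close()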

Storing images

Code in pipelines.py

from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


class ImagePipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None):
        # Name the saved file after the last segment of the image URL
        url = request.url
        file_name = url.split('/')[-1]
        return file_name

    def item_completed(self, results, item, info):
        # Drop items whose image failed to download
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Image download failed')
        return item

    def get_media_requests(self, item, info):
        # Schedule one download request per item URL
        yield Request(item['url'])

Settings configuration

ITEM_PIPELINES = {
    'images360.pipelines.ImagePipeline': 300,
}
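One setting the snippet above leaves out: Scrapy's ImagesPipeline only runs when IMAGES_STORE points at a download directory (and it needs the Pillow library installed), for example:

IMAGES_STORE = './images'  # directory where downloaded images are saved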

Middleware Usage

Random request headers (User-Agent)

import random


class RandomUserAgentMiddleware():
    def __init__(self):
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
            'Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)',
            'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.2 (KHTML, like Gecko) Chrome/22.0.1216.0 Safari/537.2',
            'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:15.0) Gecko/20100101 Firefox/15.0.1',
            'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
            'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/60.0',
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
            "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
            "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
            "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
            "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
            "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
            "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
            "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
        ]

    def process_request(self, request, spider):
        # Set a randomly chosen User-Agent on every outgoing request
        request.headers['User-Agent'] = random.choice(self.user_agents)

Settings configuration

DOWNLOADER_MIDDLEWARES = {
    'fangtianxia.middlewares.RandomUserAgentMiddleware': 545,
}
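With priority 545 this middleware runs after Scrapy's built-in UserAgentMiddleware (priority 500 in the defaults), so the randomly chosen header overwrites the default User-Agent on every request.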

Setting up a random proxy pool

import logging

import requests


class ProxyMiddleware():
    def __init__(self, proxy_url):
        self.logger = logging.getLogger(__name__)
        self.proxy_url = proxy_url

    def get_random_proxy(self):
        # Fetch one proxy (a bare host:port string) from the proxy pool API
        try:
            response = requests.get(self.proxy_url)
            if response.status_code == 200:
                proxy = response.text
                return proxy
        except requests.ConnectionError:
            return False

    def process_request(self, request, spider):
        # Only attach a proxy once the request has already been retried
        if request.meta.get('retry_times'):
            proxy = self.get_random_proxy()
            if proxy:
                uri = 'https://{proxy}'.format(proxy=proxy)
                self.logger.debug('Using proxy ' + uri)
                request.meta['proxy'] = uri

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        return cls(
            proxy_url=settings.get('PROXY_URL')
        )

Downloader middleware configuration

DOWNLOADER_MIDDLEWARES = {
    'fangtianxia.middlewares.ProxyMiddleware': 543,
}

Proxy pool configuration (add at the end of settings.py)

PROXY_URL = 'http://localhost:5555/random'  # Change the address if the proxy pool runs on a remote server
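Note that get_random_proxy() uses the response body verbatim, so the proxy pool endpoint is expected to return a bare host:port string. Also, process_request() only attaches a proxy once request.meta['retry_times'] is set, i.e. the first attempt goes out without a proxy and only retried requests are proxied.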

Integrating Selenium

from logging import getLogger
import time

from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class SeleniumMiddleware():
    # def __init__(self, timeout=None, service_args=[]):
    def __init__(self, timeout=None):
        self.logger = getLogger(__name__)
        self.timeout = timeout
        # self.browser = webdriver.PhantomJS(service_args=service_args)
        self.browser = webdriver.Chrome()
        self.browser.set_window_size(1400, 700)
        self.browser.set_page_load_timeout(self.timeout)
        self.wait = WebDriverWait(self.browser, self.timeout)

    def __del__(self):
        self.browser.close()

    def process_request(self, request, spider):
        """
        Render the page with the Selenium-driven browser
        :param request: Request object
        :param spider: Spider object
        :return: HtmlResponse
        """
        self.logger.debug('Selenium is starting')
        page = request.meta.get('page', 1)
        try:
            self.browser.get(request.url)
            if page > 1:
                # Type the page number into the input box and click submit
                input = self.wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '#q')))
                submit = self.wait.until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))
                input.clear()
                input.send_keys(page)
                submit.click()
            # Wait until the active page number and the item list are both present
            self.wait.until(
                EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span'), str(page)))
            self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.m-itemlist .items .item')))
            return HtmlResponse(url=request.url, body=self.browser.page_source, request=request, encoding='utf-8',
                                status=200)
        except TimeoutException:
            return HtmlResponse(url=request.url, status=500, request=request)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(timeout=crawler.settings.get('SELENIUM_TIMEOUT'))
        # return cls(timeout=crawler.settings.get('SELENIUM_TIMEOUT'),
        #            service_args=crawler.settings.get('PHANTOMJS_SERVICE_ARGS'))

Settings configuration

To switch back to PhantomJS, swap the commented-out lines back in and add these settings to settings.py:

SELENIUM_TIMEOUT = 20

PHANTOMJS_SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']
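The middleware above decides which page to render from request.meta['page'], so the spider has to put that value into the request meta. A minimal spider sketch under that assumption; the URL, spider name, and page count are illustrative, not taken from the original post:

# Hypothetical spider: shows how 'page' ends up in request.meta for the middleware.
import scrapy


class ProductSpider(scrapy.Spider):
    name = 'product'
    base_url = 'https://s.taobao.com/search?q=ipad'  # illustrative search URL

    def start_requests(self):
        # One request per results page; the Selenium middleware reads meta['page']
        for page in range(1, 11):
            yield scrapy.Request(self.base_url, callback=self.parse,
                                 meta={'page': page}, dont_filter=True)

    def parse(self, response):
        # 'response' is the HtmlResponse built from the Selenium-rendered page source
        pass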

