Scarpy 起始url 自定义代理 自定义去重规则
- start_urls - 内部原理
"""
scrapy引擎来爬虫中去起始的URL:
1. 调用start_requests并获取返回值
2. v = iter(返回值)
3.
req1 = 执行 v.__next__()
req2 = 执行 v.__next__()
req3 = 执行 v.__next__()
...
4.req全部放到调度器中
""" - 编写
class Pc1Spider(scrapy.Spider):
name = 'pc1'
allowed_domains = ['chouti.com']
start_urls = ["https://dig.chouti.com"] #定制:可以去redis中获取 def start_requests(self):
# 添加内置代理
"""
import os
os.environ["HTTPS_PROXY"] = "https://root:password@192.168.1.3:9999/"
os.environ["HTTP_PROXY"] = "123.11.12.13" """
# 方式一:
# for url in self.start_urls:
# yield Request(url=url)
# 方式二:
# req_list = []
# for url in self.start_urls:
# req_list.append(Request(url=url))
# return req_list def parse(self,response):
pass
自定义代理
import base64
import random
from six.moves.urllib.parse import unquote, urlunparse try:
from urllib2 import _parse_proxy
except ImportError:
from urllib.request import _parse_proxy from scrapy.utils.python import to_bytes class AqlProxyMiddleware(object): def _basic_auth_header(self, username, password):
user_pass = to_bytes(
'%s:%s' % (unquote(username), unquote(password)),
encoding='latin-1')
return base64.b64encode(user_pass) def process_request(self, request, spider): PROXIES = [
"http://root:password@192.168.1.3:9991/",
"http://root:password@192.168.1.3:9991/",
"http://root:password@192.168.1.3:9991/",
"http://root:password@192.168.1.3:9991/",
"http://root:password@192.168.1.3:9991/", ] url = random.choice(PROXIES)
orig_type = ""
proxt_type, user, password, hostport = _parse_proxy(url)
proxt_url = urlunparse((proxt_type or orig_type, hostport,'','','','')) if user:
creds = self._basic_auth_header(user,password)
else:
creds = None request.meta['proxy'] = proxt_url
if creds :
request.headers['Proxy-Authorization'] = b'Basic ' + creds
settings
# -*- coding: utf- -*- # Scrapy settings for step8_king project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html # . 爬虫名称
BOT_NAME = 'step8_king' # . 爬虫应用路径
SPIDER_MODULES = ['step8_king.spiders']
NEWSPIDER_MODULE = 'step8_king.spiders' # Crawl responsibly by identifying yourself (and your website) on the user-agent
# . 客户端 user-agent请求头
# USER_AGENT = 'step8_king (+http://www.yourdomain.com)' # Obey robots.txt rules
# . 禁止爬虫配置
# ROBOTSTXT_OBEY = False # Configure maximum concurrent requests performed by Scrapy (default: )
# . 并发请求数
# CONCURRENT_REQUESTS = # Configure a delay for requests for the same website (default: )
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# . 延迟下载秒数
# DOWNLOAD_DELAY = # The download delay setting will honor only one of:
# . 单域名访问并发数,并且延迟下次秒数也应用在每个域名
# CONCURRENT_REQUESTS_PER_DOMAIN =
# 单IP访问并发数,如果有值则忽略:CONCURRENT_REQUESTS_PER_DOMAIN,并且延迟下次秒数也应用在每个IP
# CONCURRENT_REQUESTS_PER_IP = # Disable cookies (enabled by default)
# . 是否支持cookie,cookiejar进行操作cookie
# COOKIES_ENABLED = True
# COOKIES_DEBUG = True # Disable Telnet Console (enabled by default)
# . Telnet用于查看当前爬虫的信息,操作爬虫等...
# 使用telnet ip port ,然后通过命令操作
# TELNETCONSOLE_ENABLED = True
# TELNETCONSOLE_HOST = '127.0.0.1'
# TELNETCONSOLE_PORT = [,] # . 默认请求头
# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
# } # Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# . 定义pipeline处理请求
# ITEM_PIPELINES = {
# 'step8_king.pipelines.JsonPipeline': ,
# 'step8_king.pipelines.FilePipeline': ,
# } # . 自定义扩展,基于信号进行调用
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# # 'step8_king.extensions.MyExtension': ,
# } # . 爬虫允许的最大深度,可以通过meta查看当前深度;0表示无深度
# DEPTH_LIMIT = # . 爬取时,0表示深度优先Lifo(默认);1表示广度优先FiFo # 后进先出,深度优先
# DEPTH_PRIORITY =
# SCHEDULER_DISK_QUEUE = 'scrapy.squeue.PickleLifoDiskQueue'
# SCHEDULER_MEMORY_QUEUE = 'scrapy.squeue.LifoMemoryQueue'
# 先进先出,广度优先 # DEPTH_PRIORITY =
# SCHEDULER_DISK_QUEUE = 'scrapy.squeue.PickleFifoDiskQueue'
# SCHEDULER_MEMORY_QUEUE = 'scrapy.squeue.FifoMemoryQueue' # . 调度器队列
# SCHEDULER = 'scrapy.core.scheduler.Scheduler'
# from scrapy.core.scheduler import Scheduler # . 访问URL去重
# DUPEFILTER_CLASS = 'step8_king.duplication.RepeatUrl' # Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html """
. 自动限速算法
from scrapy.contrib.throttle import AutoThrottle
自动限速设置
. 获取最小延迟 DOWNLOAD_DELAY
. 获取最大延迟 AUTOTHROTTLE_MAX_DELAY
. 设置初始下载延迟 AUTOTHROTTLE_START_DELAY
. 当请求下载完成后,获取其"连接"时间 latency,即:请求连接到接受到响应头之间的时间
. 用于计算的... AUTOTHROTTLE_TARGET_CONCURRENCY
target_delay = latency / self.target_concurrency
new_delay = (slot.delay + target_delay) / 2.0 # 表示上一次的延迟时间
new_delay = max(target_delay, new_delay)
new_delay = min(max(self.mindelay, new_delay), self.maxdelay)
slot.delay = new_delay
""" # 开始自动限速
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# 初始下载延迟
# AUTOTHROTTLE_START_DELAY =
# The maximum download delay to be set in case of high latencies
# 最大下载延迟
# AUTOTHROTTLE_MAX_DELAY =
# The average number of requests Scrapy should be sending in parallel to each remote server
# 平均每秒并发数
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 # Enable showing throttling stats for every response received:
# 是否显示
# AUTOTHROTTLE_DEBUG = True # Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings """
. 启用缓存
目的用于将已经发送的请求或相应缓存下来,以便以后使用 from scrapy.downloadermiddlewares.httpcache import HttpCacheMiddleware
from scrapy.extensions.httpcache import DummyPolicy
from scrapy.extensions.httpcache import FilesystemCacheStorage
"""
# 是否启用缓存策略
# HTTPCACHE_ENABLED = True # 缓存策略:所有请求均缓存,下次在请求直接访问原来的缓存即可
# HTTPCACHE_POLICY = "scrapy.extensions.httpcache.DummyPolicy"
# 缓存策略:根据Http响应头:Cache-Control、Last-Modified 等进行缓存的策略
# HTTPCACHE_POLICY = "scrapy.extensions.httpcache.RFC2616Policy" # 缓存超时时间
# HTTPCACHE_EXPIRATION_SECS = # 缓存保存路径
# HTTPCACHE_DIR = 'httpcache' # 缓存忽略的Http状态码
# HTTPCACHE_IGNORE_HTTP_CODES = [] # 缓存存储的插件
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' """
. 代理,需要在环境变量中设置
from scrapy.contrib.downloadermiddleware.httpproxy import HttpProxyMiddleware 方式一:使用默认
os.environ
{
http_proxy:http://root:woshiniba@192.168.11.11:9999/
https_proxy:http://192.168.11.11:9999/
}
方式二:使用自定义下载中间件 def to_bytes(text, encoding=None, errors='strict'):
if isinstance(text, bytes):
return text
if not isinstance(text, six.string_types):
raise TypeError('to_bytes must receive a unicode, str or bytes '
'object, got %s' % type(text).__name__)
if encoding is None:
encoding = 'utf-8'
return text.encode(encoding, errors) class ProxyMiddleware(object):
def process_request(self, request, spider):
PROXIES = [
{'ip_port': '111.11.228.75:80', 'user_pass': ''},
{'ip_port': '120.198.243.22:80', 'user_pass': ''},
{'ip_port': '111.8.60.9:8123', 'user_pass': ''},
{'ip_port': '101.71.27.120:80', 'user_pass': ''},
{'ip_port': '122.96.59.104:80', 'user_pass': ''},
{'ip_port': '122.224.249.122:8088', 'user_pass': ''},
]
proxy = random.choice(PROXIES)
if proxy['user_pass'] is not None:
request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])
encoded_user_pass = base64.encodestring(to_bytes(proxy['user_pass']))
request.headers['Proxy-Authorization'] = to_bytes('Basic ' + encoded_user_pass)
print "**************ProxyMiddleware have pass************" + proxy['ip_port']
else:
print "**************ProxyMiddleware no pass************" + proxy['ip_port']
request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port']) DOWNLOADER_MIDDLEWARES = {
'step8_king.middlewares.ProxyMiddleware': ,
} """ """
. Https访问
Https访问时有两种情况:
. 要爬取网站使用的可信任证书(默认支持)
DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
DOWNLOADER_CLIENTCONTEXTFACTORY = "scrapy.core.downloader.contextfactory.ScrapyClientContextFactory" . 要爬取网站使用的自定义证书
DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
DOWNLOADER_CLIENTCONTEXTFACTORY = "step8_king.https.MySSLFactory" # https.py
from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory
from twisted.internet.ssl import (optionsForClientTLS, CertificateOptions, PrivateCertificate) class MySSLFactory(ScrapyClientContextFactory):
def getCertificateOptions(self):
from OpenSSL import crypto
v1 = crypto.load_privatekey(crypto.FILETYPE_PEM, open('/Users/wupeiqi/client.key.unsecure', mode='r').read())
v2 = crypto.load_certificate(crypto.FILETYPE_PEM, open('/Users/wupeiqi/client.pem', mode='r').read())
return CertificateOptions(
privateKey=v1, # pKey对象
certificate=v2, # X509对象
verify=False,
method=getattr(self, 'method', getattr(self, '_ssl_method', None))
)
其他:
相关类
scrapy.core.downloader.handlers.http.HttpDownloadHandler
scrapy.core.downloader.webclient.ScrapyHTTPClientFactory
scrapy.core.downloader.contextfactory.ScrapyClientContextFactory
相关配置
DOWNLOADER_HTTPCLIENTFACTORY
DOWNLOADER_CLIENTCONTEXTFACTORY """ """
. 爬虫中间件
class SpiderMiddleware(object): def process_spider_input(self,response, spider):
'''
下载完成,执行,然后交给parse处理
:param response:
:param spider:
:return:
'''
pass def process_spider_output(self,response, result, spider):
'''
spider处理完成,返回时调用
:param response:
:param result:
:param spider:
:return: 必须返回包含 Request 或 Item 对象的可迭代对象(iterable)
'''
return result def process_spider_exception(self,response, exception, spider):
'''
异常调用
:param response:
:param exception:
:param spider:
:return: None,继续交给后续中间件处理异常;含 Response 或 Item 的可迭代对象(iterable),交给调度器或pipeline
'''
return None def process_start_requests(self,start_requests, spider):
'''
爬虫启动时调用
:param start_requests:
:param spider:
:return: 包含 Request 对象的可迭代对象
'''
return start_requests 内置爬虫中间件:
'scrapy.contrib.spidermiddleware.httperror.HttpErrorMiddleware': ,
'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware': ,
'scrapy.contrib.spidermiddleware.referer.RefererMiddleware': ,
'scrapy.contrib.spidermiddleware.urllength.UrlLengthMiddleware': ,
'scrapy.contrib.spidermiddleware.depth.DepthMiddleware': , """
# from scrapy.contrib.spidermiddleware.referer import RefererMiddleware
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
# 'step8_king.middlewares.SpiderMiddleware': ,
} """
. 下载中间件
class DownMiddleware1(object):
def process_request(self, request, spider):
'''
请求需要被下载时,经过所有下载器中间件的process_request调用
:param request:
:param spider:
:return:
None,继续后续中间件去下载;
Response对象,停止process_request的执行,开始执行process_response
Request对象,停止中间件的执行,将Request重新调度器
raise IgnoreRequest异常,停止process_request的执行,开始执行process_exception
'''
pass def process_response(self, request, response, spider):
'''
spider处理完成,返回时调用
:param response:
:param result:
:param spider:
:return:
Response 对象:转交给其他中间件process_response
Request 对象:停止中间件,request会被重新调度下载
raise IgnoreRequest 异常:调用Request.errback
'''
print('response1')
return response def process_exception(self, request, exception, spider):
'''
当下载处理器(download handler)或 process_request() (下载中间件)抛出异常
:param response:
:param exception:
:param spider:
:return:
None:继续交给后续中间件处理异常;
Response对象:停止后续process_exception方法
Request对象:停止中间件,request将会被重新调用下载
'''
return None 默认下载中间件
{
'scrapy.contrib.downloadermiddleware.robotstxt.RobotsTxtMiddleware': ,
'scrapy.contrib.downloadermiddleware.httpauth.HttpAuthMiddleware': ,
'scrapy.contrib.downloadermiddleware.downloadtimeout.DownloadTimeoutMiddleware': ,
'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': ,
'scrapy.contrib.downloadermiddleware.retry.RetryMiddleware': ,
'scrapy.contrib.downloadermiddleware.defaultheaders.DefaultHeadersMiddleware': ,
'scrapy.contrib.downloadermiddleware.redirect.MetaRefreshMiddleware': ,
'scrapy.contrib.downloadermiddleware.httpcompression.HttpCompressionMiddleware': ,
'scrapy.contrib.downloadermiddleware.redirect.RedirectMiddleware': ,
'scrapy.contrib.downloadermiddleware.cookies.CookiesMiddleware': ,
'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': ,
'scrapy.contrib.downloadermiddleware.chunked.ChunkedTransferMiddleware': ,
'scrapy.contrib.downloadermiddleware.stats.DownloaderStats': ,
'scrapy.contrib.downloadermiddleware.httpcache.HttpCacheMiddleware': ,
} """
# from scrapy.contrib.downloadermiddleware.httpauth import HttpAuthMiddleware
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# 'step8_king.middlewares.DownMiddleware1': ,
# 'step8_king.middlewares.DownMiddleware2': ,
# }
scrapy-redis 自定义去重规则
############### xxx.py ###### from scrapy_redis.dupefilter import RFPDupeFilter
from scrapy_redis.connection import get_redis_from_settings
from scrapy_redis import defaults class RdisDupeFilter(RFPDupeFilter): @classmethod
def from_settings(cls, settings): server = get_redis_from_settings(settings) key = defaults.DUPEFILTER_KEY % {'timestamp':'myScrapy'}
debug = settings.getbool('DUPEFILTER_DEBUG')
return cls(server, key=key, debug=debug) ########################### settings.py ##########
# ######################### scrapy redis连接 ##############
REDIS_HOST = "129.28.96.43" #主机名
REDIS_PORT = 6379 #端口
REDIS_PARAMS = {'password':"beta"}
REDIS_ENCODEING = "utf-8" #redis编码类型 # REDIS_URL = 'redis://user:pwd@hostname:9001' #连接URL 优先上面配置 DUPEFILTER_KEY = 'dupefilter:%(timestamp)s' # DUPEFLITER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
DUPEFLITER_CLASS = 'myscrapy.xxx.RedisDupeFilter'
Scarpy 起始url 自定义代理 自定义去重规则的更多相关文章
- Scrapy框架——介绍、安装、命令行创建,启动、项目目录结构介绍、Spiders文件夹详解(包括去重规则)、Selectors解析页面、Items、pipelines(自定义pipeline)、下载中间件(Downloader Middleware)、爬虫中间件、信号
一 介绍 Scrapy一个开源和协作的框架,其最初是为了页面抓取 (更确切来说, 网络抓取 )所设计的,使用它可以以快速.简单.可扩展的方式从网站中提取所需的数据.但目前Scrapy的用途十分广泛,可 ...
- Springboot中以配置类方式自定义Mybatis的配置规则(如开启驼峰映射等)
什么是自定义Mybatis的配置规则? 答:即原来在mybatis配置文件中中我们配置到<settings>标签中的内容,如下第6-10行内容: 1 <?xml version=&q ...
- 客户端使用自定义代理类访问WCF服务 z
通常在客户端访问WCF服务时,都需要添加服务引用,然后在客户端app.config或 web.config文件中产生WCF服务的客户端配置信息.若是每添加一个服务都是这样做,这样势必会将比较麻烦,能否 ...
- python3 get/post/使用代理/自定义header/自定义Cookie
说明:urllib发送http请求并不是很人性化,更推荐使用在urllib基础上封装的.python2和python3都兼容的requests模块,移步查看. 一.get请求 get请求就是在构造Re ...
- SpringSecurity——基于Spring、SpringMVC和MyBatis自定义SpringSecurity权限认证规则
本文转自:https://www.cnblogs.com/weilu2/p/springsecurity_custom_decision_metadata.html 本文在SpringMVC和MyBa ...
- 客户端使用自定义代理类访问WCF服务
通常在客户端访问WCF服务时,都需要添加服务引用,然后在客户端app.config或web.config文件中产生WCF服务的客户端配置信息.若是每添加一个服务都是这样做,这样势必会将比较麻烦,能否简 ...
- dt二次开发之-url伪静态的自定义
dt内核的方便性在于代码内核完全开源,都可以根据自身需要进行优化整改,个人在这段时间的深入研究,发现这套内核的方便性,今天继续给大家分享下DT的url伪静态如何自定义函数. url自定义文件是在api ...
- atittit.表单验证的实现方式以及原理本质以及选型以及自定义兼容easyui dsl规则的表单验证
atittit.表单验证的实现方式以及原理本质以及选型以及自定义兼容easyui dsl规则的表单验证 1. 需求,表单验证需要弹框式,但目前easyui ms绑定死了tooltip式样 1 2. 表 ...
- nginx 自定义代理返回 404
在nginx的http段,加上一面的配置 proxy_intercept_errors on;//自定义代理返回的404错误提示
随机推荐
- 2018-06-21 中文代码示例视频演示Python入门教程第五章 数据结构
知乎原链 续前作: 中文代码示例视频演示Python入门教程第四章 控制流 对应在线文档: 5. Data Structures 这一章起初还是采取了尽量与原例程相近的汉化方式, 但有些语义较偏(如T ...
- python地理处理包——geopy使用之地理编码与反地理编码
由于专业需要,经常接触一些地理处理的工具包,文档都是英文的,自己看的同时将其翻译一下,一方面自己学习的同时有个记录,要是能同时给一起的学习的童鞋们一些帮助,想想也是极好的.以下的文档内容主要翻译自官方 ...
- 关于数据分析的4点心得:维度、指标、KPI
1.看数据看维度 在对某一项业务或者业务的某个模块进行分析时,可以从大小两个角度去切入分析. 首先站在广阔的视角去看待一些数据.比如对某个产品(消费品),就要分析在大环境下是一个什么样的数据,如市场排 ...
- spring boot 基础 2018年5月3日
主包下运行类@SpringBootApplication 此注解是核心注解,源码如下 @Target({ElementType.TYPE}) @Retention(RetentionPolicy.R ...
- MyBatis笔记----Mybatis3.4.2与spring4整合:增删查改
结构图 刚之前没什么区别,多了一个applicationContext.xml 包图 由于之前出了一点错误,有些包可能多加上了 数据库图 model User.java package com.ij3 ...
- Greenplum hostname和address不一致导致配置文件无法加载
最近又遇到了几个坑,逐一记录分析下. 1.主机名hostname和address不一致 在又一次部署压测环境交由测试组进行压测时,同事修改了pg_hba.conf文件重新加载配置文件时报错.(找不到l ...
- mac os 10.12 Sierra 连接 惠普 M1136 MFP 打印机,通过 samba 协议,安装驱动,连接打印机
参考链接: https://support.hp.com/hk-zh/product/hp-zbook-17-g3-mobile-workstation/8693765/document/c04530 ...
- CISCO 动态路由(RIP)
RIP(路由信息协议):是一种内部网关协议(IGP),是一种动态路由选择协议,基于距离矢量算法(DistanceVectorAlgorithms),使用“跳数”(即metric)来衡量到达目标地址的路 ...
- Spring缓存注解@Cacheable、@CacheEvict、@CachePut使用
从3.1开始,Spring引入了对Cache的支持.其使用方法和原理都类似于Spring对事务管理的支持.Spring Cache是作用在方法上的,其核心思想是这样的:当我们在调用一个缓存方法时会把该 ...
- python list和tuple
list列表简介:列表是python的基础数据类型之⼀ ,其他编程语⾔也有类似的数据类型. 比如JS中的数组, java中的数组等等. 它是以[ ]括起来, 每个元素⽤' , '隔开⽽且可以存放各种数 ...