scrapy-redis使redis不止保存url
先看scrapy-redis源码
class RedisMixin(object):
    """Mixin class to implement reading urls from a redis queue."""

    # Redis key to pop start messages from; resolved in ``setup_redis``.
    redis_key = None
    # Maximum number of requests produced per ``next_requests`` call.
    redis_batch_size = None
    # Encoding used to decode the raw messages popped from redis.
    redis_encoding = None

    # Redis client placeholder.
    server = None

    def start_requests(self):
        """Returns a batch of start requests from redis."""
        return self.next_requests()

    def setup_redis(self, crawler=None):
        """Setup redis connection and idle signal.

        This should be called after the spider has set its crawler object.
        Does nothing when ``self.server`` is already set.

        Parameters
        ----------
        crawler : optional
            Crawler whose settings and signals are used. When omitted,
            ``self.crawler`` is used instead.

        Raises
        ------
        ValueError
            If no crawler is available, if the resolved ``redis_key`` is
            blank, or if ``redis_batch_size`` cannot be converted to int.

        """
        if self.server is not None:
            return

        if crawler is None:
            # We allow optional crawler argument to keep backwards
            # compatibility.
            # XXX: Raise a deprecation warning.
            crawler = getattr(self, 'crawler', None)

        if crawler is None:
            raise ValueError("crawler is required")

        settings = crawler.settings

        if self.redis_key is None:
            self.redis_key = settings.get(
                'REDIS_START_URLS_KEY', defaults.START_URLS_KEY,
            )

        # The key may contain a ``%(name)s`` placeholder for the spider name.
        self.redis_key = self.redis_key % {'name': self.name}

        if not self.redis_key.strip():
            raise ValueError("redis_key must not be empty")

        if self.redis_batch_size is None:
            # TODO: Deprecate this setting (REDIS_START_URLS_BATCH_SIZE).
            self.redis_batch_size = settings.getint(
                'REDIS_START_URLS_BATCH_SIZE',
                settings.getint('CONCURRENT_REQUESTS'),
            )

        try:
            self.redis_batch_size = int(self.redis_batch_size)
        except (TypeError, ValueError):
            raise ValueError("redis_batch_size must be an integer")

        if self.redis_encoding is None:
            self.redis_encoding = settings.get('REDIS_ENCODING', defaults.REDIS_ENCODING)

        # The placeholders are filled from the instance attributes set above.
        self.logger.info("Reading start URLs from redis key '%(redis_key)s' "
                         "(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s)",
                         self.__dict__)

        self.server = connection.from_settings(crawler.settings)
        # The idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from redis queue
        crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)

    def next_requests(self):
        """Yield requests built from messages popped from ``redis_key``.

        Stops once ``redis_batch_size`` requests have been produced or as
        soon as the pop returns no data. Messages for which
        ``make_request_from_data`` returns a falsy value are logged and do
        not count towards the batch size.
        """
        use_set = self.settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET)
        fetch_one = self.server.spop if use_set else self.server.lpop
        # XXX: Do we need to use a timeout here?
        found = 0
        # TODO: Use redis pipeline execution.
        while found < self.redis_batch_size:
            data = fetch_one(self.redis_key)
            if not data:
                # Queue empty.
                break
            req = self.make_request_from_data(data)
            if req:
                yield req
                found += 1
            else:
                self.logger.debug("Request not made from data: %r", data)

        if found:
            self.logger.debug("Read %s requests from '%s'", found, self.redis_key)

    def make_request_from_data(self, data):
        """Returns a Request instance from data coming from Redis.

        By default, ``data`` is an encoded URL. You can override this method to
        provide your own message decoding.

        Parameters
        ----------
        data : bytes
            Message from redis.

        """
        url = bytes_to_str(data, self.redis_encoding)
        return self.make_requests_from_url(url)

    def schedule_next_requests(self):
        """Schedules a request if available"""
        # TODO: While there is capacity, schedule a batch of redis requests.
        for req in self.next_requests():
            self.crawler.engine.crawl(req, spider=self)

    def spider_idle(self):
        """Schedules a request if available, otherwise waits."""
        # XXX: Handle a sentinel to close the spider.
        self.schedule_next_requests()
        # Always raised, so the idle spider is kept open instead of closed.
        raise DontCloseSpider


class RedisSpider(RedisMixin, Spider):
    """Spider that reads urls from redis queue when idle.

    Attributes
    ----------
    redis_key : str (default: REDIS_START_URLS_KEY)
        Redis key where to fetch start URLs from.
    redis_batch_size : int (default: CONCURRENT_REQUESTS)
        Number of messages to fetch from redis on each attempt.
    redis_encoding : str (default: REDIS_ENCODING)
        Encoding to use when decoding messages from redis queue.

    Settings
    --------
    REDIS_START_URLS_KEY : str (default: "<spider.name>:start_urls")
        Default Redis key where to fetch start URLs from.
    REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)
        Default number of messages to fetch from redis on each attempt.
    REDIS_START_URLS_AS_SET : bool (default: False)
        Use SET operations to retrieve messages from the redis queue. If False,
        the messages are retrieved using the LPOP command.
    REDIS_ENCODING : str (default: "utf-8")
        Default encoding to use when decoding messages from redis queue.

    """

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider through the parent ``from_crawler`` and set up redis."""
        obj = super(RedisSpider, cls).from_crawler(crawler, *args, **kwargs)
        obj.setup_redis(crawler)
        return obj


class RedisCrawlSpider(RedisMixin, CrawlSpider):
    """Spider that reads urls from redis queue when idle.

    Attributes
    ----------
    redis_key : str (default: REDIS_START_URLS_KEY)
        Redis key where to fetch start URLs from.
    redis_batch_size : int (default: CONCURRENT_REQUESTS)
        Number of messages to fetch from redis on each attempt.
    redis_encoding : str (default: REDIS_ENCODING)
        Encoding to use when decoding messages from redis queue.

    Settings
    --------
    REDIS_START_URLS_KEY : str (default: "<spider.name>:start_urls")
        Default Redis key where to fetch start URLs from.
    REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)
        Default number of messages to fetch from redis on each attempt.
    REDIS_START_URLS_AS_SET : bool (default: True)
        Use SET operations to retrieve messages from the redis queue.
    REDIS_ENCODING : str (default: "utf-8")
        Default encoding to use when decoding messages from redis queue.

    """

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider through the parent ``from_crawler`` and set up redis."""
        obj = super(RedisCrawlSpider, cls).from_crawler(crawler, *args, **kwargs)
        obj.setup_redis(crawler)
        return obj
仔细看完的话会发现
make_request_from_data(self, data)
这个方法根据从redis中取出的数据返回一个请求实例,默认情况下这个数据是一个url
接下来重写一下这个方法,直接把一个json串传入到
self.make_requests_from_url
就好了
在这个方法里面可以把这个串解析了,再请求其中的url或者生成url
代码如下
def make_request_from_data(self, data):
    """Build a request from a raw message popped from the redis queue.

    :param data: bytes, message from redis; decoded with ``self.redis_encoding``
    :return: the value returned by ``make_requests_from_url`` for the decoded text
    """
    company = bytes_to_str(data, self.redis_encoding)
    return self.make_requests_from_url(company)


def make_requests_from_url(self, company):
    """Parse the decoded message and build the request for its ``url`` entry.

    The message is read as JSON first. A payload that is not strict JSON
    (for example a single-quoted Python dict repr) is parsed as a Python
    literal instead, so no code from the queue is ever executed.

    :param company: str, serialized mapping that must contain a ``"url"`` key
    :return: Request for ``data["url"]`` with callback ``self.parse``; the
        whole parsed mapping is passed along as ``meta["data"]``
    :raises ValueError, SyntaxError: if the payload is neither JSON nor a
        Python literal
    :raises KeyError: if the parsed mapping has no ``"url"`` key
    """
    # Local imports keep this snippet self-contained when pasted into a spider.
    import ast
    import json

    try:
        data = json.loads(company)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass. literal_eval only
        # accepts literals, unlike eval, which runs arbitrary expressions.
        data = ast.literal_eval(company)
    url = data["url"]
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
    }
    # dont_filter=True is passed so the request is not dropped as a duplicate.
    return Request(url, self.parse, meta={"data": data}, dont_filter=True, headers=headers)
值得注意的是
不能在make_request_from_data方法中直接使用Request(其他第三方的也不支持),会导致方法无法执行,也不抛出异常
但是同时重写make_request_from_data和make_requests_from_url方法则可以执行
scrapy-redis使redis不止保存url的更多相关文章
- Redis学习——Redis持久化之AOF备份方式保存数据
新技术的出现一定是在老技术的基础之上,并且完善了老技术的某一些不足的地方,新技术和老技术就如同JAVA中的继承关系.子类(新技术)比父类(老技术)更加的强大! 在前面介绍了Redis学习--Redis ...
- Redis学习——Redis持久化之RDB备份方式保存数据
从这一个介绍里面知道,redis比memcache作为缓存数据库强大的地方,一个是支持的数据类型比较多,另一个就是redis持久化功能. 下面就介绍Redis的持久化之RDB! 一:什么是redis的 ...
- 解决---MISCONF Redis被配置为保存RDB快照,但目前无法在磁盘上存留。可能修改数据集的命令被禁用。请检查Redis日志,了解有关错误的详细信息。
解决---MISCONF Redis被配置为保存RDB快照,但目前无法在磁盘上存留.可能修改数据集的命令被禁用.请检查Redis日志,了解有关错误的详细信息. 出现bug: 在学习celery,将数据 ...
- 8.1 k8s使用PV/PVC做数据持久化运行redis服务,数据保存至NFS
1.制作redis docker镜像 1.1 准备alpine基础镜像 # 下载 docker pull alpine:3.13 # 更改tag docker tag alpine:3.13 192. ...
- 搭建LNAMP环境(五)- PHP7源码安装Redis和Redis拓展
上一篇:搭建LNAMP环境(四)- 源码安装PHP7 一.安装Redis 1.创建redis用户组和用户 groupadd redis useradd -r -g redis -s /sbin/nol ...
- redis配置文件redis.conf中文版
转账自:http://www.jb51.net/article/50605.htm # Redis示例配置文件 # 注意单位问题:当需要设置内存大小的时候,可以使用类似1k.5GB.4M这样的常见格式 ...
- redis配置文件redis.conf中文版(基于2.4)
转载于:http://www.itxuexiwang.com/a/shujukujishu/redis/2016/0216/99.html?1455869981 代码如下: # Redis示例配置文件 ...
- vagrant系列教程(四):vagrant搭建redis与redis的监控程序redis-stat(转)
上一篇php7环境的搭建 真是火爆,仅仅两天时间,就破了我之前swagger系列的一片文章,看来,大家对搭建环境真是情有独钟. 为了访问量,我今天再来一篇Redis的搭建.当然不能仅仅是redis的搭 ...
- Redis配置文件redis.conf参数配置详解
########################################## 常规 ########################################## daemonize n ...
随机推荐
- [bzoj2783][JLOI2012]树_树的遍历
树 bzoj2783 JLOI2012 题目大意:给定一棵n个点的树.求满足条件的路径条数.说一个路径是满足条件的,当且仅当这条路径上每个节点深度依次递增且点权和为S. 注释:$1\le n\le 1 ...
- Window下UDP(socket)接和收数据案例
配置QT的环境变量,这台电脑à属性à高级系统设置à高级à环境变量à系统变量àpathàC:\Qt\Qt5.3.0\5.3\mingw482_32\bin;C:\Qt\Qt5.3.0\Tools\ ...
- pytest 失败用例重试
https://www.cnblogs.com/jinzhuduoduo/articles/7017405.html http://www.lxway.com/445949491.htm https: ...
- tlplayer 全部平台版本号支持水印叠加
tlplayer支持视频渲染前水印叠加.各个系统版本号相同支持. 联系方式:weinyzhou86@gmail.com QQ:514540005 版权全部,禁止转载. 公布自:http://blog. ...
- 简化bigdecimal计算的小工具类
简化bigdecimal计算的小工具类 如果我们要做一个加法运算,需要先将两个浮点数转为String,然后够造成BigDecimal,在其中一个上调用add方法,传入另一个作为参数,然后把运算的结果( ...
- Advapi32.dll 函数接口说明
Advapi32.dll 函数接口说明 函数原型 说明 AbortSystemShutDown ...
- codeforces 931E Logical Expression dp
time limit per test 3 seconds memory limit per test 256 megabytes input standard input output standa ...
- [SDOI2007]游戏
https://zybuluo.com/ysner/note/1184420 题面 题意简单,但不太好概括. 戳我 解析 不成熟想法 据题意可知,字符串字符的顺序无影响. 并且判断两个字符串能否接龙可 ...
- Palindrome(dp)
http://poj.org/problem?id=1159 题意:给定一个字符,问最少插入多少字符,使该字符串变成回文字符串. 思路:设原字符串序列为X,其逆字符串为Y,则最少插入的字符数=leng ...
- json用法
什么是JSON? JavaScript 对象表示法(JavaScript Object Notation). JSON是一种轻量级的数据交换格式,某个JSON格式的文件内部譬如可以长成这样: 1 2 ...