Install sqlite; scrapy needs this module, so install the sqlite development headers before building Python:

yum install sqlite-devel

Python 3.5

Download the source package and compile/install it yourself:

./configure

make

make install
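Once make install finishes, a quick sanity check (just a sketch; the interpreter path depends on your configure prefix) confirms the new interpreter picked up sqlite support:

# run with the freshly built interpreter, e.g. /usr/local/bin/python3.5
import sqlite3   # raises ImportError if sqlite-devel was missing at build time

print(sqlite3.sqlite_version)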

Python 3.5 ships with pip; upgrade it to the latest version:

pip3 install --upgrade pip

MySQL module for Python 3:

pip3 install pymysql
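A minimal connection test (just a sketch; host, user, password and database are placeholders for your own MySQL instance) confirms the module works:

import pymysql

# placeholder credentials, replace with your own
conn = pymysql.connect(host='localhost', user='root',
                       password='your_password', db='test', charset='utf8mb4')
try:
    with conn.cursor() as cur:
        cur.execute('SELECT VERSION()')
        print(cur.fetchone())   # e.g. ('5.6.35',)
finally:
    conn.close()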

Install Twisted, the asynchronous networking framework that scrapy runs on:

wget https://pypi.python.org/packages/6b/23/8dbe86fc83215015e221fbd861a545c6ec5c9e9cd7514af114d1f64084ab/Twisted-16.4.1.tar.bz2#md5=c6d09bdd681f538369659111f079c29d

Unpack it:

tar -jxvf Twisted-16.4.1.tar.bz2

Enter the directory:
cd Twisted-16.4.1

Install it:

python3 setup.py install
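A quick import check (just a sketch) confirms the build is visible to Python 3:

import twisted
from twisted.internet import reactor   # the event loop scrapy drives

print(twisted.version)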

Install scrapy:

pip3 install scrapy
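To verify the install end to end, a throwaway spider can be run directly with scrapy runspider (a sketch; the site and selectors are only examples):

# quotes_spider.py -- run with: scrapy runspider quotes_spider.py -o quotes.json
import scrapy


class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {'text': quote.css('span.text::text').extract_first()}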

Install redis:

yum install redis
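After starting the service (for example with service redis start), the server can be checked from Python with the redis-py client (pip3 install redis, which scrapy-redis also depends on). A sketch assuming the default localhost:6379:

import redis

r = redis.StrictRedis(host='localhost', port=6379, db=0)
print(r.ping())                                     # True if the server is reachable
r.lpush('test:start_urls', 'http://example.com')    # the kind of key scrapy-redis reads
print(r.lrange('test:start_urls', 0, -1))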

Install scrapy-redis:

git clone https://github.com/rolando/scrapy-redis.git

cd scrapy-redis/

python3 setup.py install
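To actually use scrapy-redis in a project, a few settings go into the project's settings.py. The snippet below is a sketch based on the scrapy-redis README of this version; the values are the documented defaults plus an optional item pipeline:

# settings.py (additions for scrapy-redis)
SCHEDULER = "scrapy_redis.scheduler.Scheduler"                # schedule requests through redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"    # dedupe fingerprints in redis
SCHEDULER_PERSIST = True                                      # keep queue/dupefilter between runs

ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300,              # optional: store items in redis
}

REDIS_HOST = 'localhost'
REDIS_PORT = 6379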

Because Python 2 and Python 3 handle strings differently (under Python 3 redis returns bytes, not str), scrapy-redis hits a bug; the temporary fix posted on GitHub is the following:

# utils.py
import six


def bytes_to_str(s, encoding='utf-8'):
    """Returns a str if a bytes object is given."""
    if six.PY3 and isinstance(s, bytes):
        return s.decode(encoding)
    return s

  

# spiders.py
import six

from scrapy import signals
from scrapy.exceptions import DontCloseSpider
from scrapy.spiders import Spider, CrawlSpider

from . import connection
from .utils import bytes_to_str

# Default batch size matches default concurrent requests setting.
DEFAULT_START_URLS_BATCH_SIZE = 16
DEFAULT_START_URLS_KEY = '%(name)s:start_urls'


class RedisMixin(object):
    """Mixin class to implement reading urls from a redis queue."""

    # Per spider redis key, default to DEFAULT_KEY.
    redis_key = None
    # Fetch this amount of start urls when idle. Default to DEFAULT_BATCH_SIZE.
    redis_batch_size = None
    redis_encoding = 'utf-8'
    # Redis client instance.
    server = None

    def start_requests(self):
        """Returns a batch of start requests from redis."""
        return self.next_requests()

    def setup_redis(self, crawler=None):
        """Setup redis connection and idle signal.

        This should be called after the spider has set its crawler object.
        """
        if self.server is not None:
            return

        if crawler is None:
            # We allow optional crawler argument to keep backwards
            # compatibility.
            # XXX: Raise a deprecation warning.
            crawler = getattr(self, 'crawler', None)

        if crawler is None:
            raise ValueError("crawler is required")

        settings = crawler.settings

        if self.redis_key is None:
            self.redis_key = settings.get(
                'REDIS_START_URLS_KEY', DEFAULT_START_URLS_KEY,
            )

        self.redis_key = self.redis_key % {'name': self.name}

        if not self.redis_key.strip():
            raise ValueError("redis_key must not be empty")

        if self.redis_batch_size is None:
            self.redis_batch_size = settings.getint(
                'REDIS_START_URLS_BATCH_SIZE', DEFAULT_START_URLS_BATCH_SIZE,
            )

        try:
            self.redis_batch_size = int(self.redis_batch_size)
        except (TypeError, ValueError):
            raise ValueError("redis_batch_size must be an integer")

        self.logger.info("Reading start URLs from redis key '%(redis_key)s' "
                         "(batch size: %(redis_batch_size)s)", self.__dict__)

        self.server = connection.from_settings(crawler.settings)
        # The idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from redis queue
        crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)

    def next_requests(self):
        """Returns a request to be scheduled or none."""
        use_set = self.settings.getbool('REDIS_START_URLS_AS_SET')
        fetch_one = self.server.spop if use_set else self.server.lpop
        # XXX: Do we need to use a timeout here?
        found = 0
        while found < self.redis_batch_size:
            data = fetch_one(self.redis_key)
            if not data:
                # Queue empty.
                break
            req = self.make_request_from_data(data)
            if req:
                yield req
                found += 1
            else:
                self.logger.debug("Request not made from data: %r", data)

        if found:
            self.logger.debug("Read %s requests from '%s'", found, self.redis_key)

    def make_request_from_data(self, data):
        # By default, data is an URL.
        if not isinstance(data, six.string_types):
            # XXX: Shall we log and continue?
            self.logger.error("Wrong type for data: %s" % type(data))
            url = bytes_to_str(data, self.redis_encoding)
        else:
            url = data
        # FIXME: This is a naive guard against using a wrong redis_key where
        # data are not string URLs.
        if '://' not in url:
            # XXX: Shall this be an exception?
            self.logger.error("Missing scheme in URL: '%s'", url)

        return self.make_requests_from_url(url)

    def schedule_next_requests(self):
        """Schedules a request if available"""
        for req in self.next_requests():
            self.crawler.engine.crawl(req, spider=self)

    def spider_idle(self):
        """Schedules a request if available, otherwise waits."""
        # XXX: Handle a sentinel to close the spider.
        self.schedule_next_requests()
        raise DontCloseSpider


class RedisSpider(RedisMixin, Spider):
    """Spider that reads urls from redis queue when idle."""

    @classmethod
    def from_crawler(self, crawler, *args, **kwargs):
        obj = super(RedisSpider, self).from_crawler(crawler, *args, **kwargs)
        obj.setup_redis(crawler)
        return obj


class RedisCrawlSpider(RedisMixin, CrawlSpider):
    """Spider that reads urls from redis queue when idle."""

    @classmethod
    def from_crawler(self, crawler, *args, **kwargs):
        obj = super(RedisCrawlSpider, self).from_crawler(crawler, *args, **kwargs)
        obj.setup_redis(crawler)
        return obj
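For reference, a minimal sketch of how these classes are used (the spider name, redis_key and selectors are placeholders):

# myspider_redis.py
from scrapy_redis.spiders import RedisSpider


class MySpider(RedisSpider):
    name = 'myspider'
    redis_key = 'myspider:start_urls'   # matches DEFAULT_START_URLS_KEY above

    def parse(self, response):
        yield {
            'url': response.url,
            'title': response.css('title::text').extract_first(),
        }

Start it with scrapy crawl myspider; because spider_idle raises DontCloseSpider, the crawl stays alive and waits until URLs are pushed into the key, e.g. redis-cli lpush myspider:start_urls http://example.com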

  
