scrapy 相关
Spider类的一些自定制
# Spider类 自定义 起始解析器
def start_requests(self):
for url in self.start_urls:
yield Request(url=url,callback=self.myparse) # 起始会先经过这个视图函数返回一个 列表或者 一个生成器
yield Request(url=page_url, callback=self.parse) #指定解析函数
parse函数的参数之response
# print(response.request) # 将请求对象也封装在了response中
# print(response.url) # 请求的url
# print(response.headers) # 响应的头
# print(response.headers['Set-Cookie']) # 原始cookies ['_auto_detect_fun', '_body', '_body_declared_encoding', '_body_inferred
_encoding', '_cached_benc', '_cached_selector', '_cached_ubody', '_declared_encoding', '_encoding', '_get_body', '_get_url', '_headers_encoding', '_se
t_body', '_set_url', '_url', 'body', 'body_as_unicode', 'copy', 'css', 'encoding', 'flags', 'follow', 'headers', 'meta', 'replace', 'request', 'select
or', 'status', 'text', 'url', 'urljoin', 'xpath']
参数
获取状态码 response.status
CookiesMiddleware
- class
scrapy.contrib.downloadermiddleware.cookies.
CookiesMiddleware
-
该中间件使得爬取需要cookie(例如使用session)的网站成为了可能。 其追踪了web server发送的cookie,并在之后的request中发送回去, 就如浏览器所做的那样。
以下设置可以用来配置cookie中间件:
单spider多cookie session
0.15 新版功能.
Scrapy通过使用 cookiejar
Request meta key来支持单spider追踪多cookie session。 默认情况下其使用一个cookie jar(session),不过您可以传递一个标示符来使用多个。
例如:
for i, url in enumerate(urls):
yield scrapy.Request("http://www.example.com", meta={'cookiejar': i},
callback=self.parse_page)
需要注意的是 cookiejar
meta key不是”黏性的(sticky)”。 您需要在之后的request请求中接着传递。例如:
def parse_page(self, response):
# do some processing
return scrapy.Request("http://www.example.com/otherpage",
meta={'cookiejar': response.meta['cookiejar']},
callback=self.parse_other_page)
# from scrapy.http.cookies import CookieJar
# cookiejar = CookieJar()
# cookiejar.extract_cookies(response, response.request)
# temp = {}
# for k, v in cookiejar._cookies.items():
# for i, j in v.items():
# for m, n in j.items():
# temp[m] = n.value
# self.cookie_dic.update(temp)
yeild Request(url=url,
cookies = self.cookie_dic
)
# 该方法是原始的手动操作cookie
COOKIES_ENABLED
默认: True
是否启用cookies middleware。如果关闭,cookies将不会发送给web server。
COOKIES_DEBUG
默认: False
如果启用,Scrapy将记录所有在request(Cookie
请求头)发送的cookies及response接收到的cookies(Set-Cookie
接收头)。
下边是启用 COOKIES_DEBUG
的记录的样例:
2011-04-06 14:35:10-0300 [diningcity] INFO: Spider opened
2011-04-06 14:35:10-0300 [diningcity] DEBUG: Sending cookies to: <GET http://www.diningcity.com/netherlands/index.html>
Cookie: clientlanguage_nl=en_EN
2011-04-06 14:35:14-0300 [diningcity] DEBUG: Received cookies from: <200 http://www.diningcity.com/netherlands/index.html>
Set-Cookie: JSESSIONID=B~FA4DC0C496C8762AE4F1A620EAB34F38; Path=/
Set-Cookie: ip_isocode=US
Set-Cookie: clientlanguage_nl=en_EN; Expires=Thu, 07-Apr-2011 21:21:34 GMT; Path=/
2011-04-06 14:49:50-0300 [diningcity] DEBUG: Crawled (200) <GET http://www.diningcity.com/netherlands/index.html> (referer: None)
[...]
pipelines
当根据配置文件:
ITEM_PIPELINES = {
'xiangmu.pipelines.FilePipeline': 300,
'xiangmu.pipelines.DBPipeline': 301,
}
找到相关的类:FilePipeline之后,会优先判断类中是否含有 from_crawler
如果有:
obj = FilePipeline.from_crawler()
没有则:
obj = FilePipeline() obj.open_spider(..)
ob.process_item(..)
obj.close_spider(..)
class FilePipeline(object):
def __init__(self,path):
self.path = path
self.f = None @classmethod
def from_crawler(cls, crawler):
"""
初始化时候,用于创建pipeline对象
:param crawler:
:return:
"""
# return cls() path = crawler.settings.get('XL_FILE_PATH')
return cls(path) def process_item(self, item, spider):
self.f.write(item['href']+'\n')
return item
# 表示将item丢弃,不会被后续pipeline处理
# raise DropItem() def open_spider(self, spider):
"""
爬虫开始执行时,调用
:param spider:
:return:
"""
self.f = open(self.path,'w') def close_spider(self, spider):
"""
爬虫关闭时,被调用
:param spider:
:return:
"""
self.f.close()
自定义文件pipline
class DBPipeline(): def __init__(self,conn):
self.conn = conn @classmethod
def from_crawler(cls,crawler):
import pymysql
conn = pymysql.connect(host='127.0.0.1',password='',port=3306,db='s10',user='root',charset='utf8')
# pool = crawler.settings.get('MYSQLPOOL')
return cls(conn) def process_item(self,item,spider):
print(item['href'],item['text'])
self.cursor.execute('insert into url (url,title) VALUES (%s,%s)',(item['href'],item['text'])) return item
def open_spider(self,spider):
self.cursor = self.conn.cursor()
def close_spider(self,spider):
self.conn.commit()
self.conn.close()
自定义数据库piplines
去重DupeFilter
在scrapy 中是有个默认去重的
如果想自定义去重需要在setting
DUPEFILTER_CLASS = 'pachong.dupefilter.MyDupeFilter'
from scrapy.dupefilter import BaseDupeFilter
from scrapy.utils.request import request_fingerprint
class MyDupeFilter(BaseDupeFilter): def __init__(self, path=None, debug=False):
self.record = set() @classmethod
def from_settings(cls, settings): return cls() def request_seen(self, request):
fingerprint = request_fingerprint(request)
if fingerprint in self.record:
print('已经访问过')
return True
else:
self.record.add(fingerprint) def request_fingerprint(self, request):
pass def close(self, reason): pass
自定义去重
中间件
class SpiderMiddleware(object): def process_spider_input(self,response, spider):
"""
下载完成,执行,然后交给parse处理
:param response:
:param spider:
:return:
"""
pass def process_spider_output(self,response, result, spider):
"""
spider处理完成,返回时调用
:param response:
:param result:
:param spider:
:return: 必须返回包含 Request 或 Item 对象的可迭代对象(iterable)
"""
return result def process_spider_exception(self,response, exception, spider):
"""
异常调用
:param response:
:param exception:
:param spider:
:return: None,继续交给后续中间件处理异常;含 Response 或 Item 的可迭代对象(iterable),交给调度器或pipeline
"""
return None def process_start_requests(self,start_requests, spider):
"""
爬虫启动时调用
:param start_requests:
:param spider:
:return: 包含 Request 对象的可迭代对象
"""
return start_requests 爬虫中间件
下载器中间件
class DownMiddleware1(object):
def process_request(self, request, spider):
"""
请求需要被下载时,经过所有下载器中间件的process_request调用
:param request:
:param spider:
:return:
None,继续后续中间件去下载;
Response对象,停止process_request的执行,开始执行process_response
Request对象,停止中间件的执行,将Request重新调度器
raise IgnoreRequest异常,停止process_request的执行,开始执行process_exception
"""
pass def process_response(self, request, response, spider):
"""
spider处理完成,返回时调用
:param response:
:param result:
:param spider:
:return:
Response 对象:转交给其他中间件process_response
Request 对象:停止中间件,request会被重新调度下载
raise IgnoreRequest 异常:调用Request.errback
"""
print('response1')
return response def process_exception(self, request, exception, spider):
"""
当下载处理器(download handler)或 process_request() (下载中间件)抛出异常
:param response:
:param exception:
:param spider:
:return:
None:继续交给后续中间件处理异常;
Response对象:停止后续process_exception方法
Request对象:停止中间件,request将会被重新调用下载
"""
return None
下载中间件
其他
# -*- coding: utf-8 -*- # Scrapy settings for step8_king project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html # 1. 爬虫名称
BOT_NAME = 'step8_king' # 2. 爬虫应用路径
SPIDER_MODULES = ['step8_king.spiders']
NEWSPIDER_MODULE = 'step8_king.spiders' # Crawl responsibly by identifying yourself (and your website) on the user-agent
# 3. 客户端 user-agent请求头
# USER_AGENT = 'step8_king (+http://www.yourdomain.com)' # Obey robots.txt rules
# 4. 禁止爬虫配置
# ROBOTSTXT_OBEY = False # Configure maximum concurrent requests performed by Scrapy (default: 16)
# 5. 并发请求数
# CONCURRENT_REQUESTS = 4 # Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# 6. 延迟下载秒数
# DOWNLOAD_DELAY = 2 # The download delay setting will honor only one of:
# 7. 单域名访问并发数,并且延迟下次秒数也应用在每个域名
# CONCURRENT_REQUESTS_PER_DOMAIN = 2
# 单IP访问并发数,如果有值则忽略:CONCURRENT_REQUESTS_PER_DOMAIN,并且延迟下次秒数也应用在每个IP
# CONCURRENT_REQUESTS_PER_IP = 3 # Disable cookies (enabled by default)
# 8. 是否支持cookie,cookiejar进行操作cookie
# COOKIES_ENABLED = True
# COOKIES_DEBUG = True # Disable Telnet Console (enabled by default)
# 9. Telnet用于查看当前爬虫的信息,操作爬虫等...
# 使用telnet ip port ,然后通过命令操作
# TELNETCONSOLE_ENABLED = True
# TELNETCONSOLE_HOST = '127.0.0.1'
# TELNETCONSOLE_PORT = [6023,] # 10. 默认请求头
# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
# } # Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# 11. 定义pipeline处理请求
# ITEM_PIPELINES = {
# 'step8_king.pipelines.JsonPipeline': 700,
# 'step8_king.pipelines.FilePipeline': 500,
# } # 12. 自定义扩展,基于信号进行调用
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# # 'step8_king.extensions.MyExtension': 500,
# } # 13. 爬虫允许的最大深度,可以通过meta查看当前深度;0表示无深度
# DEPTH_LIMIT = 3 # 14. 爬取时,0表示深度优先Lifo(默认);1表示广度优先FiFo # 后进先出,深度优先
# DEPTH_PRIORITY = 0
# SCHEDULER_DISK_QUEUE = 'scrapy.squeue.PickleLifoDiskQueue'
# SCHEDULER_MEMORY_QUEUE = 'scrapy.squeue.LifoMemoryQueue'
# 先进先出,广度优先 # DEPTH_PRIORITY = 1
# SCHEDULER_DISK_QUEUE = 'scrapy.squeue.PickleFifoDiskQueue'
# SCHEDULER_MEMORY_QUEUE = 'scrapy.squeue.FifoMemoryQueue' # 15. 调度器队列
# SCHEDULER = 'scrapy.core.scheduler.Scheduler'
# from scrapy.core.scheduler import Scheduler # 16. 访问URL去重
# DUPEFILTER_CLASS = 'step8_king.duplication.RepeatUrl' # Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html """
17. 自动限速算法
from scrapy.contrib.throttle import AutoThrottle
自动限速设置
1. 获取最小延迟 DOWNLOAD_DELAY
2. 获取最大延迟 AUTOTHROTTLE_MAX_DELAY
3. 设置初始下载延迟 AUTOTHROTTLE_START_DELAY
4. 当请求下载完成后,获取其"连接"时间 latency,即:请求连接到接受到响应头之间的时间
5. 用于计算的... AUTOTHROTTLE_TARGET_CONCURRENCY
target_delay = latency / self.target_concurrency
new_delay = (slot.delay + target_delay) / 2.0 # 表示上一次的延迟时间
new_delay = max(target_delay, new_delay)
new_delay = min(max(self.mindelay, new_delay), self.maxdelay)
slot.delay = new_delay
""" # 开始自动限速
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# 初始下载延迟
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# 最大下载延迟
# AUTOTHROTTLE_MAX_DELAY = 10
# The average number of requests Scrapy should be sending in parallel to each remote server
# 平均每秒并发数
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 # Enable showing throttling stats for every response received:
# 是否显示
# AUTOTHROTTLE_DEBUG = True # Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings """
18. 启用缓存
目的用于将已经发送的请求或相应缓存下来,以便以后使用 from scrapy.downloadermiddlewares.httpcache import HttpCacheMiddleware
from scrapy.extensions.httpcache import DummyPolicy
from scrapy.extensions.httpcache import FilesystemCacheStorage
"""
# 是否启用缓存策略
# HTTPCACHE_ENABLED = True # 缓存策略:所有请求均缓存,下次在请求直接访问原来的缓存即可
# HTTPCACHE_POLICY = "scrapy.extensions.httpcache.DummyPolicy"
# 缓存策略:根据Http响应头:Cache-Control、Last-Modified 等进行缓存的策略
# HTTPCACHE_POLICY = "scrapy.extensions.httpcache.RFC2616Policy" # 缓存超时时间
# HTTPCACHE_EXPIRATION_SECS = 0 # 缓存保存路径
# HTTPCACHE_DIR = 'httpcache' # 缓存忽略的Http状态码
# HTTPCACHE_IGNORE_HTTP_CODES = [] # 缓存存储的插件
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' """
19. 代理,需要在环境变量中设置
from scrapy.contrib.downloadermiddleware.httpproxy import HttpProxyMiddleware 方式一:使用默认
os.environ
{
http_proxy:http://root:woshiniba@192.168.11.11:9999/
https_proxy:http://192.168.11.11:9999/
}
方式二:使用自定义下载中间件 def to_bytes(text, encoding=None, errors='strict'):
if isinstance(text, bytes):
return text
if not isinstance(text, six.string_types):
raise TypeError('to_bytes must receive a unicode, str or bytes '
'object, got %s' % type(text).__name__)
if encoding is None:
encoding = 'utf-8'
return text.encode(encoding, errors) class ProxyMiddleware(object):
def process_request(self, request, spider):
PROXIES = [
{'ip_port': '111.11.228.75:80', 'user_pass': ''},
{'ip_port': '120.198.243.22:80', 'user_pass': ''},
{'ip_port': '111.8.60.9:8123', 'user_pass': ''},
{'ip_port': '101.71.27.120:80', 'user_pass': ''},
{'ip_port': '122.96.59.104:80', 'user_pass': ''},
{'ip_port': '122.224.249.122:8088', 'user_pass': ''},
]
proxy = random.choice(PROXIES)
if proxy['user_pass'] is not None:
request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])
encoded_user_pass = base64.encodestring(to_bytes(proxy['user_pass']))
request.headers['Proxy-Authorization'] = to_bytes('Basic ' + encoded_user_pass)
print "**************ProxyMiddleware have pass************" + proxy['ip_port']
else:
print "**************ProxyMiddleware no pass************" + proxy['ip_port']
request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port']) DOWNLOADER_MIDDLEWARES = {
'step8_king.middlewares.ProxyMiddleware': 500,
} """ """
20. Https访问
Https访问时有两种情况:
1. 要爬取网站使用的可信任证书(默认支持)
DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
DOWNLOADER_CLIENTCONTEXTFACTORY = "scrapy.core.downloader.contextfactory.ScrapyClientContextFactory" 2. 要爬取网站使用的自定义证书
DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
DOWNLOADER_CLIENTCONTEXTFACTORY = "step8_king.https.MySSLFactory" # https.py
from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory
from twisted.internet.ssl import (optionsForClientTLS, CertificateOptions, PrivateCertificate) class MySSLFactory(ScrapyClientContextFactory):
def getCertificateOptions(self):
from OpenSSL import crypto
v1 = crypto.load_privatekey(crypto.FILETYPE_PEM, open('/Users/wupeiqi/client.key.unsecure', mode='r').read())
v2 = crypto.load_certificate(crypto.FILETYPE_PEM, open('/Users/wupeiqi/client.pem', mode='r').read())
return CertificateOptions(
privateKey=v1, # pKey对象
certificate=v2, # X509对象
verify=False,
method=getattr(self, 'method', getattr(self, '_ssl_method', None))
)
其他:
相关类
scrapy.core.downloader.handlers.http.HttpDownloadHandler
scrapy.core.downloader.webclient.ScrapyHTTPClientFactory
scrapy.core.downloader.contextfactory.ScrapyClientContextFactory
相关配置
DOWNLOADER_HTTPCLIENTFACTORY
DOWNLOADER_CLIENTCONTEXTFACTORY """ """
21. 爬虫中间件
class SpiderMiddleware(object): def process_spider_input(self,response, spider):
'''
下载完成,执行,然后交给parse处理
:param response:
:param spider:
:return:
'''
pass def process_spider_output(self,response, result, spider):
'''
spider处理完成,返回时调用
:param response:
:param result:
:param spider:
:return: 必须返回包含 Request 或 Item 对象的可迭代对象(iterable)
'''
return result def process_spider_exception(self,response, exception, spider):
'''
异常调用
:param response:
:param exception:
:param spider:
:return: None,继续交给后续中间件处理异常;含 Response 或 Item 的可迭代对象(iterable),交给调度器或pipeline
'''
return None def process_start_requests(self,start_requests, spider):
'''
爬虫启动时调用
:param start_requests:
:param spider:
:return: 包含 Request 对象的可迭代对象
'''
return start_requests 内置爬虫中间件:
'scrapy.contrib.spidermiddleware.httperror.HttpErrorMiddleware': 50,
'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware': 500,
'scrapy.contrib.spidermiddleware.referer.RefererMiddleware': 700,
'scrapy.contrib.spidermiddleware.urllength.UrlLengthMiddleware': 800,
'scrapy.contrib.spidermiddleware.depth.DepthMiddleware': 900, """
# from scrapy.contrib.spidermiddleware.referer import RefererMiddleware
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
# 'step8_king.middlewares.SpiderMiddleware': 543,
} """
22. 下载中间件
class DownMiddleware1(object):
def process_request(self, request, spider):
'''
请求需要被下载时,经过所有下载器中间件的process_request调用
:param request:
:param spider:
:return:
None,继续后续中间件去下载;
Response对象,停止process_request的执行,开始执行process_response
Request对象,停止中间件的执行,将Request重新调度器
raise IgnoreRequest异常,停止process_request的执行,开始执行process_exception
'''
pass def process_response(self, request, response, spider):
'''
spider处理完成,返回时调用
:param response:
:param result:
:param spider:
:return:
Response 对象:转交给其他中间件process_response
Request 对象:停止中间件,request会被重新调度下载
raise IgnoreRequest 异常:调用Request.errback
'''
print('response1')
return response def process_exception(self, request, exception, spider):
'''
当下载处理器(download handler)或 process_request() (下载中间件)抛出异常
:param response:
:param exception:
:param spider:
:return:
None:继续交给后续中间件处理异常;
Response对象:停止后续process_exception方法
Request对象:停止中间件,request将会被重新调用下载
'''
return None 默认下载中间件
{
'scrapy.contrib.downloadermiddleware.robotstxt.RobotsTxtMiddleware': 100,
'scrapy.contrib.downloadermiddleware.httpauth.HttpAuthMiddleware': 300,
'scrapy.contrib.downloadermiddleware.downloadtimeout.DownloadTimeoutMiddleware': 350,
'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': 400,
'scrapy.contrib.downloadermiddleware.retry.RetryMiddleware': 500,
'scrapy.contrib.downloadermiddleware.defaultheaders.DefaultHeadersMiddleware': 550,
'scrapy.contrib.downloadermiddleware.redirect.MetaRefreshMiddleware': 580,
'scrapy.contrib.downloadermiddleware.httpcompression.HttpCompressionMiddleware': 590,
'scrapy.contrib.downloadermiddleware.redirect.RedirectMiddleware': 600,
'scrapy.contrib.downloadermiddleware.cookies.CookiesMiddleware': 700,
'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 750,
'scrapy.contrib.downloadermiddleware.chunked.ChunkedTransferMiddleware': 830,
'scrapy.contrib.downloadermiddleware.stats.DownloaderStats': 850,
'scrapy.contrib.downloadermiddleware.httpcache.HttpCacheMiddleware': 900,
} """
# from scrapy.contrib.downloadermiddleware.httpauth import HttpAuthMiddleware
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# 'step8_king.middlewares.DownMiddleware1': 100,
# 'step8_king.middlewares.DownMiddleware2': 500,
# } settings
其他参数
scrapy 相关的更多相关文章
- scrapy相关:splash 实践
0. 1.参考 https://github.com/scrapy-plugins/scrapy-splash#configuration 以此为准 scrapy相关:splash安装 A javas ...
- scrapy相关:splash安装 A javascript rendering service 渲染
0. splash: 美人鱼 溅,泼 1.参考 Splash使用初体验 docker在windows下的安装 https://blog.scrapinghub.com/2015/03/02/hand ...
- scrapy相关 通过设置 FEED_EXPORT_ENCODING 解决 unicode 中文写入json文件出现`\uXXXX`
0.问题现象 爬取 item: 2017-10-16 18:17:33 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.hu ...
- 【python】scrapy相关
目前scrapy还不支持python3,python2.7与python3.5共存时安装scrapy后,执行scrapy后报错 Traceback (most recent call last): F ...
- linux下scrapy环境搭建
最近使用scrapy做数据挖掘,使用scrapy定时抓取数据并存入MongoDB,本文记录环境搭建过程以作备忘 OS:ubuntu 14.04 python:2.7.6 scrapy:1.0.5 D ...
- pycharm创建scrapy项目教程及遇到的坑
最近学习scrapy爬虫框架,在使用pycharm安装scrapy类库及创建scrapy项目时花费了好长的时间,遇到各种坑,根据网上的各种教程,花费了一晚上的时间,终于成功,其中也踩了一些坑,现在整理 ...
- python-爬虫框架scrapy
一 介绍 Scrapy一个开源和协作的框架,其最初是为了页面抓取 (更确切来说, 网络抓取 )所设计的,使用它可以以快速.简单.可扩展的方式从网站中提取所需的数据.但目前Scrapy的用途十分广泛,可 ...
- 如何运行简单的scrapy
1.建scrapy工程 scrapy startproject python123demo 2.在工程中写一个爬虫文件 cd python123demo scrapy genspider demo p ...
- Scrapy框架——介绍、安装、命令行创建,启动、项目目录结构介绍、Spiders文件夹详解(包括去重规则)、Selectors解析页面、Items、pipelines(自定义pipeline)、下载中间件(Downloader Middleware)、爬虫中间件、信号
一 介绍 Scrapy一个开源和协作的框架,其最初是为了页面抓取 (更确切来说, 网络抓取 )所设计的,使用它可以以快速.简单.可扩展的方式从网站中提取所需的数据.但目前Scrapy的用途十分广泛,可 ...
随机推荐
- TI BSL in python
今天花了一下午在这个上面,被这个工具整没脾气了快.花点功夫记录一下. 代码是在linux下下过来的: bzr branch lp:python-msp430-tools 看了下,有我想要的器件.然后想 ...
- lodash merge mergeWith使用
1.作用 递归合并来源对象的自身和继承的可枚举属性到目标对象. 2.示例 <!DOCTYPE html> <html lang="zh"> <head ...
- SQLSERVER常用计数器
复制下面全部--开始--运行--profmon.exe--性能监视器--粘贴计数器列表 <OBJECT ID="DISystemMonitor1" WIDTH="1 ...
- Java中正数与负数操作>>、>>>的区别
以下为个人理解,有不对的地方请提出 Java中,>>.>>>都是在数字的二进制的补码中进行的 正数的补码为本身 如33的二进制表示为 00000000 00000000 ...
- html中的标签分类
单标签 <br> <hr> <img> <input> <param> <meta> <link> 双标签 < ...
- Xcode中利用git源代码版本号控制
git是一个版本号控制系统,能够通过命令行来调用,也有专门的桌面软件.这里主要介绍在Xcode中怎样利用git来进行版本号的控制. 一.创建git源 从Xcode5開始引入了使用git的一些新特性.将 ...
- SQLyog之MySQL客户端的下载、安装和使用(普通版)
本博文的主要内容有 .SQLyog的下载 .SQLyog的安装 .SQLyog的使用 前期,安装这个,不多说 MySQL Server类型之MySQL客户端工具的下载.安装和使用 1.SQLyog的下 ...
- mysql-ubuntu14.04彻底卸载mysql
删除mysql的数据文件 sudo rm /var/lib/mysql/ -R 删除mysql的配置文件 sudo rm /etc/mysql/ -R 自动卸载mysql(包括server和clien ...
- Android Studio公布到Jcenter
1.前言 拥抱开源.热爱开源,将我们觉得不错的代码开源到gihtub.将我们的库公布到jcenter\mevan等. 2.准备工作 2.1 准备 申请仓库账号 注意model为android libr ...
- 两个优秀的C标准库源代码
gnu的基本看不懂,因为套了一层又一层复杂的东西. 我现在能看懂的2个c标准库是openbsd的,还有一个嵌入c的库. https://github.com/openbsd/src/tree/mast ...