
python的scrapy框架的使用 和xpath的使用 && scrapy中request和response的函数参数 && parse()函数运行机制 



scrapy startproject js


scrapy genspider -t crawl jianshu jianshu.com


import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from js.items import JsItem


class JianshuSpider(CrawlSpider):
    """Crawl Jianshu article pages and emit one ``JsItem`` per readable article.

    Starting from the start URL, the single crawl rule follows every link
    that looks like an article (``/p/<12 lowercase letters or digits>``) and
    hands each response to :meth:`parse_detail`.
    """

    # Name used on the command line: ``scrapy crawl jianshu``.
    name = 'jianshu'
    # NOTE(review): both values were blank strings in the original listing
    # (the URLs appear to have been stripped); jianshu.com is inferred from
    # the spider name and the surrounding comments -- confirm.
    allowed_domains = ['jianshu.com']
    start_urls = ['https://www.jianshu.com/']
    rules = (
        # Recommended-article links have the form "/p/" followed by 12
        # letters/digits; follow them recursively so that the articles they
        # recommend are crawled as well.
        Rule(
            LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'),
            callback='parse_detail',
            follow=True,
        ),
    )

    def parse_detail(self, response):
        """Extract one article from *response*.

        Args:
            response: the downloaded article page.

        Yields:
            JsItem: the article fields. Nothing is yielded when the page has
            no author element, which is how a taken-down article appears.
        """
        author = response.xpath('//span[@class="FxYr8x"]/a/text()').get()
        # Some articles were removed for violating site rules: the link still
        # resolves but the page carries no article data, so skip it.
        if author is None:
            return

        title = response.xpath('//h1[@class="_1RuRku"]/text()').get()
        publish_time = response.xpath('//div[@class="s-dsoj"]/time/text()').get()
        body = response.xpath('//article[@class="_2rhmJa"]').get()
        # NOTE(review): <img> elements have no text nodes, so this selector
        # looks like it can only yield an empty list -- confirm against the
        # live page markup.
        categories = ','.join(
            response.xpath('//div[@class="_2Nttfz"]/a/img/text()').getall()
        )

        # The article id is the last path segment of the URL. Some links carry
        # a "?query" suffix, so cut that off before taking the last segment.
        url_without_query = response.url.split('?')[0]
        passage_id = url_without_query.split('/')[-1]

        # Any Item yielded from a spider callback is delivered to the
        # ``item`` argument of the pipelines' ``process_item``.
        yield JsItem(
            passage_id=passage_id,
            title=title,
            time=publish_time,
            author=author,
            body=body,
            type=categories,
        )


# Define here the models for your scraped items
# See documentation in:
import scrapy


class JsItem(scrapy.Item):
    """Container for the fields scraped from a single article page."""

    # Last path segment of the article URL.
    passage_id = scrapy.Field()
    title = scrapy.Field()
    time = scrapy.Field()
    author = scrapy.Field()
    # Raw HTML of the <article> element.
    body = scrapy.Field()
    # Comma-separated list of the collections the article belongs to.
    type = scrapy.Field()

三、 (启动程序)

from scrapy import cmdline

# "jianshu" is the name of the spider to run. ``execute`` expects an
# argv-style list, so the command line is split on whitespace first.
crawl_argv = "scrapy crawl jianshu".split()
cmdline.execute(crawl_argv)


# Define your item pipelines here
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: # useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from twisted.enterprise import adbapi
import pymysql # class JsPipeline:
# def process_item(self, item, spider):
#         return item


class JianShuSpiderPipeline(object):
    """Store every scraped item in the MySQL table ``jsw`` (synchronously)."""

    def __init__(self, **db_overrides):
        """Open the MySQL connection and create a cursor.

        Args:
            **db_overrides: optional ``pymysql.connect`` keyword arguments
                that replace the defaults below (for example ``password``).
        """
        # Connection parameters.
        # NOTE(review): only 'port' survived in the original listing. The
        # user and database names are taken from the commented-out Twisted
        # variant of this pipeline; host and password are placeholders and
        # must be supplied for a real run.
        dpparams = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': '',
            'database': 'jianshu',
            # PyMySQL expects MySQL charset names ('utf8'/'utf8mb4'),
            # not the Python codec spelling 'utf-8'.
            'charset': 'utf8mb4',
        }
        dpparams.update(db_overrides)
        # Connect to the database.
        self.conn = pymysql.connect(**dpparams)
        # Create the cursor used for every insert.
        self.cursor = self.conn.cursor()
        self._sql = None

    @property
    def sql(self):
        """Return the parameterised INSERT statement, building it on first use."""
        if self._sql is None:
            self._sql = '''
insert into jsw(passage_id,title,time,author,body,type) values(%s,%s,%s,%s,%s,%s)
'''
        return self._sql

    def process_item(self, item, spider):
        """Insert *item* into ``jsw`` and pass it on.

        Args:
            item: mapping with the keys ``passage_id``, ``title``, ``time``,
                ``author``, ``body`` and ``type``.
            spider: the spider that produced the item (unused).

        Returns:
            The same *item*; Scrapy needs it returned so that later pipeline
            stages and the next item keep being processed.
        """
        # Execute the statement with bound parameters (no string formatting,
        # so scraped text cannot break or inject into the SQL).
        self.cursor.execute(self.sql, (
            item['passage_id'],
            item['title'],
            item['time'],
            item['author'],
            item['body'],
            item['type'],
        ))
        # The insert is only persisted once the transaction is committed.
        self.conn.commit()
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection when the spider finishes."""
        self.cursor.close()
        self.conn.close()


# Asynchronous insertion into MySQL (Twisted adbapi variant, kept below for reference)
# class JianShuTwistedPipeline:
# def __init__(self):
# dpparams = {
# 'host':'',
# 'port': 3306,
# 'user':'root',
# 'password':'qu513712qu',
# 'database':'jianshu',
# 'charset':'utf-8'
# }
# self.dbpool = adbapi.ConnectionPool('pymysql',**dpparams)
# self._sql = None
# @property
# def sql(self):
# if self._sql == None:
# self._sql = '''
# insert into jsw values(%s,%s,%s,%s,%s,%s)
# '''
# return self._sql
# else:
# return self._sql
# def process_item(self,item,spider):
# # 把插入Insert函数放到runInteraction里面,插入数据就会变成异步的,如果直接执行函数就是同步执行了
# # runInteraction会返回一个游标给Insert函数
# defer = self.dbpool.runInteraction(self.Insert,item)
# defer.addErrback(self.error,item,spider)
# def Insert(self,cursor,item):
# cursor.execute(self.sql,(item['passage_id'],item['title'],item['time'],item['author'],item['body'],item['type']))
# def error(self,error,item,spider):
# print('-'*30)
# print('error')
# print('-'*30)


# Scrapy settings for js project
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
BOT_NAME = 'js'

SPIDER_MODULES = ['js.spiders']
NEWSPIDER_MODULE = 'js.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'js (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See also autothrottle settings and docs
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
}

# Enable or disable spider middlewares
#SPIDER_MIDDLEWARES = {
#    'js.middlewares.JsSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
DOWNLOADER_MIDDLEWARES = {
    # 'js.middlewares.JsDownloaderMiddleware': 543,
    # UserAgentMiddleWare only implements process_request(), which is a
    # downloader-middleware hook; registered as a spider middleware it would
    # never be called. It runs before the Selenium middleware (lower number).
    'js.middlewares.UserAgentMiddleWare': 542,
    'js.middlewares.SeleniumDownloadMiddleware': 543,
}

# Enable or disable extensions
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
ITEM_PIPELINES = {
    # The pipeline class used to store the scraped data.
    'js.pipelines.JianShuSpiderPipeline': 300,
    # 'js.pipelines.JianShuTwistedPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# The initial download delay
# The maximum download delay to be set in case of high latencies
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'


# Define here the models for your spider middleware
# See documentation in:
# from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
import random
from selenium import webdriver
import time
from scrapy import signals
from scrapy.http.response.html import HtmlResponse
from selenium.common.exceptions import WebDriverException


class JsSpiderMiddleware:
    """Pass-through spider middleware; every hook leaves its input unchanged."""

    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class JsDownloaderMiddleware:
    """Pass-through downloader middleware; every hook leaves its input unchanged."""

    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class UserAgentMiddleWare:
    """Downloader middleware that gives each request a random User-Agent."""

    # Pool of browser identification strings to choose from.
    User_Agent = [
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    ]

    def process_request(self, request, spider):
        """Overwrite the request's User-Agent header with a random pool entry.

        Returns None implicitly, so Scrapy keeps processing the request.
        """
        user_agent = random.choice(self.User_Agent)
        request.headers['User-Agent'] = user_agent


class SeleniumDownloadMiddleware(object):
    """Fetch pages with Selenium + Chrome so dynamically loaded data is present.

    The article page lists the collections an article belongs to behind a
    "show more" control; it has to be clicked until it disappears before the
    page source contains every collection.
    """

    # CSS selector for the "show more" control. The two class names must be
    # combined in a selector: a class-name lookup rejects compound names.
    _SHOW_MORE_SELECTOR = '.anticon.anticon-down'
    # Upper bound on clicks so a control that never disappears cannot hang
    # the crawl.
    _MAX_EXPANSIONS = 50

    def __init__(self):
        # Path of the chromedriver executable on this machine.
        self.driver = webdriver.Chrome(executable_path=r'D:\python-senium\chromedriver.exe')

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and arrange for the browser to be shut down."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def process_request(self, request, spider):
        """Load ``request.url`` in Chrome and return the rendered page.

        Args:
            request: the Scrapy request being downloaded.
            spider: the spider that issued it (unused).

        Returns:
            HtmlResponse built from the browser's current URL and page
            source; returning a response here stops Scrapy from downloading
            the page itself.
        """
        self.driver.get(request.url)
        # NOTE(review): the pause lengths are a guess; the original listing
        # imports ``time`` but the lines using it were lost -- tune as needed.
        time.sleep(1)

        for _ in range(self._MAX_EXPANSIONS):
            # find_elements() returns an empty list instead of raising when
            # nothing matches, which is the loop's normal exit condition.
            show_more = self.driver.find_elements('css selector', self._SHOW_MORE_SELECTOR)
            if not show_more:
                break
            try:
                show_more[0].click()
            except WebDriverException:
                # The control exists but cannot be clicked (hidden, covered,
                # stale); use the page as it is rather than failing the request.
                break
            time.sleep(0.3)

        # Page source after all dynamic content has been expanded.
        source = self.driver.page_source
        # Wrap it in a response object and hand it back to the engine.
        return HtmlResponse(
            url=self.driver.current_url,
            body=source,
            request=request,
            encoding='utf-8',
        )

    def spider_closed(self, spider):
        """Quit the browser so no Chrome process outlives the crawl."""
        self.driver.quit()

