About the spider: within this framework the spider is written the same way whether or not a database is used.

# -*- coding: utf-8 -*-
import scrapy
from yang_guan.items import YangGuanItem
from copy import deepcopy


class YgSpider(scrapy.Spider):
    name = 'yg'
    allowed_domains = ['huanqiu.com']
    start_urls = ['http://www.huanqiu.com/']

    def parse(self, response):
        # top-level page; the first callback must be named parse, it receives the responses for start_urls
        item = YangGuanItem()
        # item = {}
        class_news_urls_li = response.xpath(".//div[@class='navCon']/ul/li/a")
        print(class_news_urls_li)
        for class_news_url in class_news_urls_li:
            item["class_tittle"] = class_news_url.xpath("./text()").extract_first()
            print(item)
            new_url = class_news_url.xpath("./@href").extract_first()
            print(new_url)
            yield scrapy.Request(
                new_url,
                callback=self.second_class,
                # requests are handled concurrently, so pass a deep copy of item
                # to keep one category's data from overwriting another's
                meta={"item": deepcopy(item)},
            )

    def second_class(self, response):
        # second-level (category) page
        item = response.meta["item"]
        print(response.url)
        second_urls = response.xpath(".//div/h2/em")
        for second_url in second_urls:
            secoond_news_url = second_url.xpath("./a/@href").extract_first()
            yield scrapy.Request(
                secoond_news_url,
                callback=self.parse_detail_analyze,
                meta={"item": deepcopy(item)},
            )

    def parse_detail_analyze(self, response):
        # third level: the detail listing, e.g. http://china.huanqiu.com/leaders/
        item = response.meta["item"]
        li_list = response.xpath("//ul[@class='listPicBox']/li")
        for li in li_list:
            # item = YangGuanItem()
            item["title"] = li.xpath("./h3/a/text()").extract_first()
            item["img_url"] = li.xpath("./a/img/@src").extract_first()
            item["detail"] = li.xpath("./h5/text()").extract_first()
            yield item

        # pagination: request the next page with the same callback
        next_url = response.xpath(".//div[@class='pageBox']/div/a[last()]/@href").extract_first()
        if next_url:
            yield scrapy.Request(next_url, callback=self.parse_detail_analyze, meta={"item": response.meta["item"]})
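
To actually run the spider you use the normal Scrapy command line from the project root: scrapy crawl yg. The post does not show a launcher, but a small run script is a common convenience; the file name start.py below is my own choice, not something from the original project:

# start.py -- optional launcher at the project root (hypothetical, not in the original post)
from scrapy.cmdline import execute

# equivalent to typing `scrapy crawl yg` in a shell
execute(['scrapy', 'crawl', 'yg'])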

About pipelines: configuring the item pipeline

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo


class YangGuanPipeline(object):

    def __init__(self):
        # open a MongoDB connection
        client = pymongo.MongoClient('127.0.0.1', 27017)
        # select the database 'scrapy_huan_qiu'
        db = client['scrapy_huan_qiu']
        # select the collection the items will be stored in
        self.post = db['zong_huan_qiu']
        print("*" * 100)

    def process_item(self, item, spider):
        postItem = dict(item)
        self.post.insert(postItem)
        return item
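
Note that insert() is deprecated in newer pymongo (and removed in pymongo 4); insert_one() is the modern equivalent. After a crawl you can sanity-check what actually landed in MongoDB. This snippet is not from the original post, it just reuses the same connection parameters:

# quick check of the stored data (count_documents needs pymongo >= 3.7)
import pymongo

client = pymongo.MongoClient('127.0.0.1', 27017)
collection = client['scrapy_huan_qiu']['zong_huan_qiu']

print(collection.count_documents({}))   # how many items were stored
for doc in collection.find().limit(3):  # peek at a few documents
    print(doc)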

About the settings (settings.py)

# -*- coding: utf-8 -*-

# Scrapy settings for yang_guan project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

# IP proxy pool, if memory serves (it only takes effect via a downloader middleware; see the sketch after this settings file)
PROXIES = [
    {'ip_port': '111.11.228.75:80', 'user_pass': ''},
    {'ip_port': '120.198.243.22:80', 'user_pass': ''},
    {'ip_port': '111.8.60.9:8123', 'user_pass': ''},
    {'ip_port': '101.71.27.120:80', 'user_pass': ''},
    {'ip_port': '122.96.59.104:80', 'user_pass': ''},
    {'ip_port': '122.224.249.122:8088', 'user_pass': ''},
]

BOT_NAME = 'yang_guan'

SPIDER_MODULES = ['yang_guan.spiders']
NEWSPIDER_MODULE = 'yang_guan.spiders'

# LOG_LEVEL = "WARNING"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# identify as a real browser to get past basic anti-crawler checks
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'

# Obey robots.txt rules
# do not obey the robots.txt protocol
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'yang_guan.middlewares.YangGuanSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'yang_guan.middlewares.YangGuanDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# enable the pipeline (without the fields written in items.py, nothing could be saved to the database)
ITEM_PIPELINES = {
    'yang_guan.pipelines.YangGuanPipeline': 300,
}

# log level and writing a log file
# LOG_FILE = "dg.log"
# LOG_LEVEL = "DEBUG"

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
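
The PROXIES list defined at the top of the settings does nothing by itself: Scrapy only routes a request through a proxy when something sets request.meta['proxy']. The post never shows that middleware, so the following is only a sketch of what a matching downloader middleware might look like; the class name RandomProxyMiddleware and its placement in yang_guan/middlewares.py are my own assumptions:

# yang_guan/middlewares.py -- hypothetical proxy-rotation middleware, not part of the original post
import base64
import random

from yang_guan import settings


class RandomProxyMiddleware(object):
    """Pick a random entry from settings.PROXIES for each outgoing request."""

    def process_request(self, request, spider):
        proxy = random.choice(settings.PROXIES)
        # Scrapy's built-in HttpProxyMiddleware honours request.meta['proxy']
        request.meta['proxy'] = 'http://%s' % proxy['ip_port']
        if proxy['user_pass']:
            # proxies that require auth also need a Proxy-Authorization header
            creds = base64.b64encode(proxy['user_pass'].encode()).decode()
            request.headers['Proxy-Authorization'] = 'Basic ' + creds

To take effect, it would also have to be registered in the commented-out DOWNLOADER_MIDDLEWARES block above, e.g. {'yang_guan.middlewares.RandomProxyMiddleware': 543}.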

About items.py: this file is required; the spider uses yield to pass the item (a dict-like object) along.

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class YangGuanItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    class_tittle = scrapy.Field()
    img_url = scrapy.Field()
    detail = scrapy.Field()
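
A scrapy.Item only needs these field declarations; at runtime it behaves like a dict restricted to the declared keys, which is also why dict(item) in the pipeline works. A quick illustration (hypothetical, not from the post):

from yang_guan.items import YangGuanItem

item = YangGuanItem()
item['class_tittle'] = 'china'
item['title'] = 'some headline'

print(dict(item))        # {'class_tittle': 'china', 'title': 'some headline'}
# item['author'] = 'x'   # would raise KeyError: 'author' is not a declared Field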
