Writing items.py

    # -*- coding: utf-8 -*-

    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://doc.scrapy.org/en/latest/topics/items.html

    import scrapy


    class AutopjtItem(scrapy.Item):
        # define the fields for your item here like:
        # stores the product name
        name = scrapy.Field()
        # stores the product price
        price = scrapy.Field()
        # stores the product link
        link = scrapy.Field()
        # stores the number of reviews for the product
        comnum = scrapy.Field()
        # stores the link to the product's reviews
        comnum_link = scrapy.Field()
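A scrapy.Field() call only declares a field name; the item itself behaves like a dict restricted to those declared keys. A quick interactive sketch (hypothetical values, assuming the autopjt package is importable):

    from autopjt.items import AutopjtItem

    # items support dict-style access for their declared fields
    item = AutopjtItem()
    item["name"] = ["some product"]
    item["price"] = ["10.00"]
    print(dict(item))  # {'name': ['some product'], 'price': ['10.00']}
    # assigning to an undeclared field, e.g. item["stock"], raises KeyError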

Writing pipelines.py

    # -*- coding: utf-8 -*-
    import codecs
    import json
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


    class AutopjtPipeline(object):
        def __init__(self):
            self.file = codecs.open("D:/git/learn_scray/day11/1.json", "wb", encoding="utf-8")

        def process_item(self, item, spider):
            # walk through every product scraped from the current page
            for i in range(len(item["name"])):
                name = item["name"][i]
                price = item["price"][i]
                link = item["link"][i]
                comnum = item["comnum"][i]
                comnum_link = item["comnum_link"][i]
                current_content = {"name": name, "price": price, "link": link,
                                   "comnum": comnum, "comnum_link": comnum_link}
                j = json.dumps(current_content, ensure_ascii=False)
                # write each record on its own line
                line = j + '\n'
                print(line)
                self.file.write(line)
                # for key, value in current_content.items():
                #     print(key, value)
            return item

        def close_spider(self, spider):
            self.file.close()
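Since process_item writes one JSON object per line, the output is effectively in JSON Lines format. A minimal sketch for reading the file back, assuming the same path as above:

    import codecs
    import json

    # read the JSON Lines file produced by the pipeline, one record per line
    with codecs.open("D:/git/learn_scray/day11/1.json", "r", encoding="utf-8") as f:
        for line in f:
            record = json.loads(line)
            print(record["name"], record["price"])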

Writing the automated spider

    # -*- coding: utf-8 -*-
    import scrapy
    from autopjt.items import AutopjtItem
    from scrapy.http import Request


    class AutospdSpider(scrapy.Spider):
        name = 'autospd'
        allowed_domains = ['dangdang.com']
        # Dangdang local-specialty category
        start_urls = ['http://category.dangdang.com/pg1-cid10010056.html']

        def parse(self, response):
            item = AutopjtItem()
            print("building the item")
            # product titles
            item["name"] = response.xpath("//p[@class='name']/a/@title").extract()
            # prices
            item["price"] = response.xpath("//span[@class='price_n']/text()").extract()
            # product links
            item["link"] = response.xpath("//p[@class='name']/a/@href").extract()
            # number of reviews per product
            item["comnum"] = response.xpath("//a[@name='itemlist-review']/text()").extract()
            # links to each product's review page
            item["comnum_link"] = response.xpath("//a[@name='itemlist-review']/@href").extract()
            yield item
            # queue pages 1-78 of the category; Scrapy's built-in duplicate
            # filter drops the request for page 1, which was already crawled
            for i in range(1, 79):
                url = "http://category.dangdang.com/pg" + str(i) + "-cid10010056.html"
                yield Request(url, callback=self.parse)
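One caveat: the fields above are parallel lists covering the whole page, so the pipeline's index-based loop assumes every product exposes a title, price, link and review entry; if any one of them is missing, the columns drift out of alignment. A hedged alternative sketch that yields one item per product node instead (the container XPath is an assumption and needs to be checked against the actual Dangdang markup):

    def parse(self, response):
        # one container node per product keeps the fields aligned per product
        # NOTE: "//ul[@class='bigimg']/li" is an assumed selector, verify it on the page
        for product in response.xpath("//ul[@class='bigimg']/li"):
            item = AutopjtItem()
            item["name"] = product.xpath(".//p[@class='name']/a/@title").extract()
            item["price"] = product.xpath(".//span[@class='price_n']/text()").extract()
            item["link"] = product.xpath(".//p[@class='name']/a/@href").extract()
            item["comnum"] = product.xpath(".//a[@name='itemlist-review']/text()").extract()
            item["comnum_link"] = product.xpath(".//a[@name='itemlist-review']/@href").extract()
            yield item

Each field is then a list of length at most one, so the existing pipeline loop would still work unchanged.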

A closer look at yield:

 https://stackoverflow.com/questions/231767/what-does-the-yield-keyword-do
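In short, a function that contains yield returns a generator: each yield hands one value back to the caller and pauses the function until the next value is requested. That is exactly how Scrapy consumes parse() above, iterating over the items and Requests it yields. A tiny standalone sketch:

    def gen_page_urls(last_page):
        # pauses at each yield and resumes when the caller asks for the next value
        for i in range(1, last_page + 1):
            yield "http://category.dangdang.com/pg" + str(i) + "-cid10010056.html"

    # calling the function runs nothing yet; iterating over it drives the code
    for url in gen_page_urls(3):
        print(url)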

Configuring settings.py:

    # -*- coding: utf-8 -*-

    # Scrapy settings for autopjt project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     https://doc.scrapy.org/en/latest/topics/settings.html
    #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

    BOT_NAME = 'autopjt'

    SPIDER_MODULES = ['autopjt.spiders']
    NEWSPIDER_MODULE = 'autopjt.spiders'

    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    #USER_AGENT = 'autopjt (+http://www.yourdomain.com)'

    # Obey robots.txt rules
    # Defaults to True (obey robots.txt). Crawling worked even with it enabled,
    # but it is set to False here to be on the safe side.
    ROBOTSTXT_OBEY = False

    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 32

    # Configure a delay for requests for the same website (default: 0)
    # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    #DOWNLOAD_DELAY = 3
    # The download delay setting will honor only one of:
    #CONCURRENT_REQUESTS_PER_DOMAIN = 16
    #CONCURRENT_REQUESTS_PER_IP = 16

    # Disable cookies (enabled by default)
    COOKIES_ENABLED = False

    # Disable Telnet Console (enabled by default)
    #TELNETCONSOLE_ENABLED = False

    # Override the default request headers:
    #DEFAULT_REQUEST_HEADERS = {
    #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #   'Accept-Language': 'en',
    #}

    # Enable or disable spider middlewares
    # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    #SPIDER_MIDDLEWARES = {
    #    'autopjt.middlewares.AutopjtSpiderMiddleware': 543,
    #}

    # Enable or disable downloader middlewares
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    #DOWNLOADER_MIDDLEWARES = {
    #    'autopjt.middlewares.AutopjtDownloaderMiddleware': 543,
    #}

    # Enable or disable extensions
    # See https://doc.scrapy.org/en/latest/topics/extensions.html
    #EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    #}

    # Configure item pipelines
    # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    ITEM_PIPELINES = {
        'autopjt.pipelines.AutopjtPipeline': 300,
    }

    # Enable and configure the AutoThrottle extension (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
    #AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    #AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    #AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    #AUTOTHROTTLE_DEBUG = False

    # Enable and configure HTTP caching (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    #HTTPCACHE_ENABLED = True
    #HTTPCACHE_EXPIRATION_SECS = 0
    #HTTPCACHE_DIR = 'httpcache'
    #HTTPCACHE_IGNORE_HTTP_CODES = []
    #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
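With the items, pipeline, spider, and settings in place, the crawl is started from the project directory with the standard `scrapy crawl autospd` command, and the scraped records land in the 1.json file configured in the pipeline.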

The final result:
