1.tencentSpider.py

# -*- coding: utf-8 -*-
import scrapy
from Tencent.items import TencentItem


# Spider for the Tencent job-posting listing pages
class TencentspiderSpider(scrapy.Spider):
    name = 'tencentSpider'                 # spider name
    allowed_domains = ['tencent.com']      # domains the spider is allowed to crawl

    # The listing is paginated through the "start" query parameter
    offset = 0
    url = 'https://hr.tencent.com/position.php?&start='

    start_urls = [url + str(offset)]       # first URL requested by the spider

    def parse(self, response):
        # Each job posting is one table row with class "odd" or "even"
        rows = response.xpath("//tr[@class='odd'] | //tr[@class='even']")
        for each in rows:
            item = TencentItem()
            # Row-relative XPaths (".//" / "./") so every field comes from the current row
            item['zhiwei'] = each.xpath(".//td[@class='l square']/a/text()").extract()[0]   # job title
            item['lianjie'] = each.xpath(".//td[@class='l square']/a/@href").extract()[0]   # detail link
            item['leibie'] = each.xpath("./td[2]/text()").extract()[0]                      # job category
            item['renshu'] = each.xpath("./td[3]/text()").extract()[0]                      # number of openings
            item['didian'] = each.xpath("./td[4]/text()").extract()[0]                      # location
            item['shijian'] = each.xpath("./td[5]/text()").extract()[0]                     # publish date

            print(item)
            yield item

        if self.offset < 2840:
            # After finishing one page, increase the offset by 10 (one page of results),
            # build the next page's URL and send a new request handled by self.parse
            self.offset += 10
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
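The offset-based pagination above stops at a hard-coded cap (offset < 2840), which breaks as soon as the number of postings changes. A minimal sketch of link-based pagination is shown below; it is illustrative only, and the assumption that the pager's "next" anchor carries id="next" must be verified against the live page markup.

import scrapy
from Tencent.items import TencentItem


class TencentFollowSpider(scrapy.Spider):
    # Illustrative variant (hypothetical name): same row extraction, link-based pagination
    name = 'tencentFollowSpider'
    allowed_domains = ['tencent.com']
    start_urls = ['https://hr.tencent.com/position.php?&start=0']

    def parse(self, response):
        for each in response.xpath("//tr[@class='odd'] | //tr[@class='even']"):
            item = TencentItem()
            item['zhiwei'] = each.xpath(".//td[@class='l square']/a/text()").extract_first()
            item['lianjie'] = each.xpath(".//td[@class='l square']/a/@href").extract_first()
            yield item

        # Assumed markup: the "next page" link has id="next"; adjust the selector if it differs
        next_href = response.xpath("//a[@id='next']/@href").extract_first()
        if next_href and next_href != 'javascript:;':
            yield response.follow(next_href, callback=self.parse)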

2.items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class TencentItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    zhiwei = scrapy.Field()    # job title
    lianjie = scrapy.Field()   # detail link
    leibie = scrapy.Field()    # job category
    renshu = scrapy.Field()    # number of openings
    didian = scrapy.Field()    # location
    shijian = scrapy.Field()   # publish date

3.main.py

from scrapy import cmdline

# Launch the spider from an IDE, equivalent to running "scrapy crawl tencentSpider"
cmdline.execute("scrapy crawl tencentSpider".split())
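If a quick JSON dump of the items is wanted alongside the Excel file written by the pipeline, the same one-liner can pass Scrapy's built-in -o feed-export flag; the output file name below is only an example.

from scrapy import cmdline

# Same as running "scrapy crawl tencentSpider -o tencent.json" in the project root;
# -o asks Scrapy's feed exporter to also write the yielded items to a JSON file.
cmdline.execute("scrapy crawl tencentSpider -o tencent.json".split())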

4.middlewares.py

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class TencentSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class TencentDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
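Both classes above are the unmodified project template and are never enabled (SPIDER_MIDDLEWARES and DOWNLOADER_MIDDLEWARES stay commented out in settings.py). If the fixed USER_AGENT ever gets blocked, one common extension is a downloader middleware that rotates the User-Agent header per request. The sketch below is illustrative: the class name and UA pool are not part of this project, and the class would still have to be registered in DOWNLOADER_MIDDLEWARES.

import random


class RandomUserAgentMiddleware(object):
    # Illustrative pool of User-Agent strings; extend as needed
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    ]

    def process_request(self, request, spider):
        # Pick a random UA for every outgoing request; returning None lets the
        # request continue through the rest of the downloader middleware chain
        request.headers['User-Agent'] = random.choice(self.user_agents)
        return None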

5.pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

from openpyxl import Workbook


class TencentPipeline(object):
    wb = Workbook()
    ws = wb.active
    # Header row: job title, link, category, headcount, location, date
    ws.append(['职位', '链接', '类型', '人数', '地点', '时间'])

    def process_item(self, item, spider):
        # Append one row per item and save the workbook
        line = [item['zhiwei'], item['lianjie'], item['leibie'],
                item['renshu'], item['didian'], item['shijian']]
        self.ws.append(line)
        self.wb.save('tencentSpider.xlsx')
        return item
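Saving the workbook inside process_item rewrites tencentSpider.xlsx once per item, which gets slow over thousands of postings. A sketch of the usual alternative is shown below: create the workbook when the spider opens and save it once when it closes. It uses the standard Scrapy pipeline hooks and the same openpyxl API; the class name is illustrative and would replace TencentPipeline in ITEM_PIPELINES.

from openpyxl import Workbook


class TencentExcelPipeline(object):
    def open_spider(self, spider):
        # Create the workbook once, when the crawl starts
        self.wb = Workbook()
        self.ws = self.wb.active
        self.ws.append(['职位', '链接', '类型', '人数', '地点', '时间'])

    def process_item(self, item, spider):
        self.ws.append([item['zhiwei'], item['lianjie'], item['leibie'],
                        item['renshu'], item['didian'], item['shijian']])
        return item

    def close_spider(self, spider):
        # Write the file a single time, after the last item
        self.wb.save('tencentSpider.xlsx')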

6.settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for Tencent project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'Tencent'

SPIDER_MODULES = ['Tencent.spiders']
NEWSPIDER_MODULE = 'Tencent.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'

# Obey robots.txt rules
#ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'Tencent.middlewares.TencentSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'Tencent.middlewares.TencentDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'Tencent.pipelines.TencentPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
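As configured, DOWNLOAD_DELAY and the AutoThrottle block stay commented out, so the spider crawls at full speed. If the crawl needs to be throttled, a minimal variant of the relevant settings looks like the following; the values are illustrative, not part of the original project.

# Illustrative throttling settings; tune the values to the target site
DOWNLOAD_DELAY = 1
AUTOTHROTTLE_ENABLED = True
CONCURRENT_REQUESTS_PER_DOMAIN = 8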
