# -*- coding: utf-8 -*-

import scrapy

from jobscrawler_qianchengwuyou.items import JobscrawlerQianchengwuyouItem

class QianchengSpiderSpider(scrapy.Spider):
    """Crawl 51job search results and scrape every job detail page.

    The crawl has two stages:

    1. ``parse`` walks each search-result page listed in ``start_urls``,
       schedules one request per job detail link, and follows the
       "next page" link until there is none.
    2. ``detail_parse`` turns a job detail page into a
       ``JobscrawlerQianchengwuyouItem``.
    """

    name = 'qiancheng_spider'

    # allowed_domains = ['www.qq.com']

    # One search-result entry page per keyword. Each literal must be followed
    # by a comma: adjacent string literals are silently concatenated by
    # Python, which would collapse the whole list into one invalid URL.
    # NOTE(review): the first URL percent-encodes its keyword once, while the
    # others encode it twice (``%25..``) -- confirm which form the site expects.
    start_urls = [
        # Keyword: data analyst
        'https://search.51job.com/list/000000,000000,0000,00,9,99,%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B8%88,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
        # Keyword: data mining
        'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E6%258C%2596%25E6%258E%2598,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
        # Keyword: algorithm
        'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E7%25AE%2597%25E6%25B3%2595,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
        # Keyword: machine learning
        'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%259C%25BA%25E5%2599%25A8%25E5%25AD%25A6%25E4%25B9%25A0,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
        # Keyword: deep learning
        'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%25B7%25B1%25E5%25BA%25A6%25E5%25AD%25A6%25E4%25B9%25A0,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
        # Keyword: artificial intelligence
        'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E4%25BA%25BA%25E5%25B7%25A5%25E6%2599%25BA%25E8%2583%25BD,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
    ]

    @staticmethod
    def _first_text(response, query):
        """Return the first match of *query*, stripped, or "" when nothing matches."""
        return response.xpath(query).extract_first(default="").strip()

    @staticmethod
    def _joined_text(response, query):
        """Return all matches of *query* concatenated, with outer whitespace stripped."""
        return "".join(response.xpath(query).extract()).strip()

    def parse(self, response):
        """Yield a request per job detail link on a search page, then the next page.

        Args:
            response: The downloaded search-result page.

        Yields:
            ``scrapy.Request`` objects: one per detail link (handled by
            ``detail_parse``) and, when present, one for the following
            result page (handled by ``parse`` again).
        """
        rows = response.xpath("//div[@class='el']")
        self.logger.debug("Found %d 'el' rows on %s", len(rows), response.url)

        for row in rows:
            # Not every div.el is a job row. Only real rows carry a
            # <p class="t1 "> (note the trailing space in the class value)
            # wrapping the detail-page link, so rows without it yield no href
            # and are skipped.
            detail_url = row.xpath("./p[@class='t1 ']//a/@href").extract_first()
            if not detail_url:
                continue
            # urljoin leaves absolute URLs unchanged and resolves relative ones.
            yield scrapy.Request(response.urljoin(detail_url), callback=self.detail_parse)

        # Pagination: follow the "next page" anchor when there is one.
        next_page_url = response.xpath("//a[@id='rtNext']/@href").extract_first()
        if next_page_url:
            yield scrapy.Request(response.urljoin(next_page_url), callback=self.parse)

    def detail_parse(self, response):
        """Extract one job posting from a detail page.

        Missing nodes produce empty strings instead of raising, so a single
        page with unexpected markup does not abort the callback.

        Args:
            response: The downloaded job detail page.

        Yields:
            A populated ``JobscrawlerQianchengwuyouItem``.
        """
        item = JobscrawlerQianchengwuyouItem()

        # Job title.
        item["job_name"] = self._first_text(response, "//div[@class='cn']/h1/text()")

        # Job description: every text node under the description container.
        item["job_info"] = self._joined_text(
            response, "//div[@class='bmsg job_msg inbox']//text()")

        # Salary.
        # NOTE(review): this selects div.sp4 while the welfare field below
        # selects span.sp4 -- confirm the salary selector against the live page.
        item["job_salary"] = self._joined_text(response, '//div[@class="sp4"]/text()')

        # Benefits: joined into one comma-separated string rather than a list.
        item["job_welfare"] = ",".join(
            response.xpath("//span[@class='sp4']/text()").extract())

        # Experience and education requirements are the 2nd and 3rd text
        # fragments of the "msg ltype" paragraph; guard against shorter lists.
        requirements = response.xpath('//p[@class="msg ltype"]/text()').extract()
        item["job_exp_require"] = requirements[1].strip() if len(requirements) > 1 else ""
        item["job_edu_require"] = requirements[2].strip() if len(requirements) > 2 else ""

        # Company name.
        item["company_name"] = self._first_text(
            response, '//div[@class="com_msg"]//p/text()')

        # Company industry / nature / headcount. The value is a text node of
        # the icon span's parent, so select the span and step up with "..".
        # Only leading and trailing whitespace is stripped.
        item["company_industry"] = self._joined_text(
            response, '//span[@class="i_trade"]/../text()')
        item["company_nature"] = self._joined_text(
            response, '//span[@class="i_flag"]/../text()')
        item["company_people"] = self._joined_text(
            response, '//span[@class="i_people"]/../text()')

        # Company address: not extracted.
        item["company_location"] = ""

        # Company overview.
        item["company_overview"] = self._joined_text(
            response, '//div[@class="tmsg inbox"]//text()')

        # Company financing stage: not extracted.
        item["company_financing_stage"] = ""

        yield item

# -- coding: utf-8 --的更多相关文章

【转】关于Python脚本开头两行的：#!/usr/bin/python和# -*- coding: utf-8 -*-的作用 – 指定文件编码类型
原文网址:http://www.crifan.com/python_head_meaning_for_usr_bin_python_coding_utf-8/ #!/usr/bin/python 是用 ...
Python脚本开头两行：#!/usr/bin/python和# -*- coding: utf-8 -*-的作用
转于:https://www.crifan.com/python_head_meaning_for_usr_bin_python_coding_utf-8/ 出处:在路上一.基本功能 1)#!/us ...
关于Python脚本开头两行的：#!/usr/bin/python和# -*- coding: utf-8 -*-的作用 – 转
#!/usr/bin/python 是用来说明脚本语言是python的是要用/usr/bin下面的程序(工具)python,这个解释器,来解释python脚本,来运行python脚本的. # -*- ...
《简明python教程》笔记一
读<简明Python教程>笔记: 本书的官方网站是www.byteofpython.info 安装就不说了,网上很多,这里就记录下我在安装时的问题,首先到python官网下载,选好安装路 ...
RSA加密解密（python版）
RSA的算法涉及三个参数,n.e.d. 其中,n是两个大质数p.q的积,n的二进制表示时所占用的位数,就是所谓的密钥长度. e1和d是一对相关的值,e可以任意取,但要求e与(p-1)*(q-1)互质: ...
算法：冒泡排序（python版）
1.将n个元素从大到小排序后,选择第k个大的元素 #!/usr/bin/env python #coding -*- utf:8 -*- #冒泡排序选第k个元素 import random impor ...
算法：二分查找（python版）
#!/usr/bin/env python #coding -*- utf:8 -*- #二分查找#时间复杂度O(logn)#一个时间常量O(1)将问题的规模缩小一半,则O(logn) import ...
算法：欧几里得求最大公约数（python版）
#欧几里得求最大公约数 #!/usr/bin/env python #coding -*- utf:8 -*- #iteration def gcd(a,b): if b==0: return a e ...
算法：求幂（python版）
分别用迭代方法和递归方法实现求幂迭代方法的时间复杂度为O(n),空间复杂度为O(1)递归方法1的时间复杂度为O(logn),空间复杂度为O(logn)递归方法2的时间复杂度为O(n),空间复杂度为O( ...
python第一天
python 解释器执行代码有两种一种在解释器: win+R==>cmd 打开终端进行输入python 加路径另一种在文件里写完再到解释器执行:win+R==>cmd 打开终端进行 ...

随机推荐

【Django视图与网址进阶004】
一.在网页上做加减法 1. 采用 /add/?a=4&b=5 这样GET方法进行 django-admin.py startproject zqxt_views cd zqxt_views p ...
Mysql与Postgresql
在经过一段时间的数据库学习之后,我接触到了Mysql与Postgresql两种数据库管理系统,由于我对这两者的理解都停留在很浅的层面,就不在此比较两者的好坏,主要在这里比较一下两者的入门指令(当然两者 ...
Introduction to Servlet Filter（Servlet过滤器介绍）
本文章向大家介绍Servlet Filter,主要包括 Servlet Filter使用实例.应用技巧.基本知识点总结和需要注意事项,具有一定的参考价值,需要的朋友可以参考一下. 过滤器是一个可以转换 ...
Android5.0新特性之——控件移动动画（初级）
最近开发,UI大牛们设计了好多很炫酷吊炸天的动画,不由得重新学习了一下5.0的ObjectAnimator动画. ObjectAnimator动画的原理,通过反射控件的setXXX方法,改变控件的实际 ...
git遇到error: RPC failed; curl 18 transfer closed with outstanding read data remaining fatal: The remote end hung up unexpectedly fatal: early EOF fatal: index-pack failed failed怎么办?
答: 将clone地址中的https://替换成git://即可解决如: 将https://git.openwrt.org/project/luci.git修改为git://git.openwrt. ...
使用代理实现对C# list distinct操作
范型在c#编程中经常使用,而经常用list 去存放实体集,因此会设计到对list的各种操作,比较常见的有对list进行排序,查找,比较,去重复.而一般的如果要对list去重复如果使用linq dist ...
pycharm使用selenium之前
2.python安装好后,查看你的pycharm输出控制台,第一行就写了你所使用的python.exe的路径,如下图箭头处所示: 检查python使用的是不是你刚刚安装的,如果不是,换成你刚刚安装的p ...
JXOJ(基于UOJ)部署日志
JXOJ部署日志前些日子协助cyc.llf两位奆老部署了JXOJ,为方便日后维护我校OJ的同学,写篇日志做记录以日后查看. 一.准备: 在尝试了多个不同OJ之后,我们最终选择了Universal ...
IPTABLES使用总结（内网模拟银行网络）
iptables中有以下三种类型的表: FILTER表,默认的表,包含以下三种内建链: INPUT链,发给本地sockets的包 FORWARD链,经由系统发送的包 OUTPUT链,本地生成并发出的包 ...
自身使用的springboot项目中比较全的pom.xml
在学习的时候常建新的项目,mark下商用的jar <dependency> <groupId>org.mybatis</groupId> <artifactI ...

# -*- coding: utf-8 -*-

-- coding: utf-8 --

# -*- coding: utf-8 -*-的更多相关文章

随机推荐

热门专题

# -- coding: utf-8 --

# -- coding: utf-8 --的更多相关文章