Crawling job listings from the recruitment site (m.lagou.com):

    import json
    import random
    import time

    import pymongo
    import re
    import pandas as pd
    import requests
    from lxml import etree
    import datetime

    # values for the time-dependent fields of the cookie
    now = datetime.datetime.now()
    timeStamp = int(now.timestamp())
    geshi = "%Y%m%d%H%M%S"
    time1 = datetime.datetime.strftime(now, geshi)

    # MongoDB setup
    client = pymongo.MongoClient('localhost')
    # database name
    db = client['lagou']
    # collection names
    data_name = 'lagouData'
    detail = 'detailData'

    # constants
    CITY = '广州'
    # job title to search for
    POSITON_NAME = '数据挖掘'
    # total number of index pages to crawl (example value)
    PAGE_SUM = 10
    # number of positions returned per page
    PAGE_SIZE = 15

    # span[position()>3] matches from the third span under the p tag onward, e.g.
    # //dd[@class='job_request']/p/span[position()>3]

    # index_url = 'https://m.lagou.com/search.json?city={}&positionName={}&pageNo={}&pageSize=15'
    # URL of the index (search) page
    index_url = 'https://m.lagou.com/search.json?city={}&positionName={}&pageNo={}&pageSize={}'
    # URL of the detail page
    detail_url = 'https://m.lagou.com/jobs/{}.html'
    # pool of browser User-Agent strings
    user_agents = [
        "Mozilla/5.0 (iPhone 84; CPU iPhone OS 10_3_3 like Mac OS X) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.0 MQQBrowser/7.8.0 Mobile/14G60 Safari/8536.25 MttCustomUA/2 QBWebViewType/1 WKType/1",
        "Mozilla/5.0 (Linux; Android 7.0; STF-AL10 Build/HUAWEISTF-AL10; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043508 Safari/537.36 V1_AND_SQ_7.2.0_730_YYB_D QQ/7.2.0.3270 NetType/4G WebP/0.3.0 Pixel/1080",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_3 like Mac OS X) AppleWebKit/603.3.8 (KHTML, like Gecko) Mobile/14G60 MicroMessenger/6.5.18 NetType/WIFI Language/en",
        "Mozilla/5.0 (Linux; Android 5.1.1; vivo Xplay5A Build/LMY47V; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/48.0.2564.116 Mobile Safari/537.36 T7/9.3 baiduboxapp/9.3.0.10 (Baidu; P1 5.1.1)",
        "Mozilla/5.0 (Linux; U; Android 7.0; zh-cn; STF-AL00 Build/HUAWEISTF-AL00) AppleWebKit/537.36 (KHTML, like Gecko)Version/4.0 Chrome/37.0.0.0 MQQBrowser/7.9 Mobile Safari/537.36",
        "Mozilla/5.0 (Linux; Android 6.0; LEX626 Build/HEXCNFN5902606111S) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/35.0.1916.138 Mobile Safari/537.36 T7/7.4 baiduboxapp/8.3.1 (Baidu; P1 6.0)",
        "Mozilla/5.0 (iPhone 92; CPU iPhone OS 10_3_2 like Mac OS X) AppleWebKit/603.2.4 (KHTML, like Gecko) Version/10.0 MQQBrowser/7.7.2 Mobile/14F89 Safari/8536.25 MttCustomUA/2 QBWebViewType/1 WKType/1",
        "Mozilla/5.0 (Linux; U; Android 7.0; zh-CN; ZUK Z2121 Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/40.0.2214.89 UCBrowser/11.6.8.952 Mobile Safari/537.36"]

    # index page
    def index_fn():
        # pick a random User-Agent for this run
        user_agent = random.choice(user_agents)
        headers = {
            "Accept": "application/json",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "Host": "m.lagou.com",
            "Cookie": "_ga=GA1.2.841469794.1541152606; user_trace_token=20181102175657-a2701865-de85-11e8-8368-525400f775ce; LGUID=20181102175657-a2701fbd-de85-11e8-8368-525400f775ce; index_location_city=%E5%B9%BF%E5%B7%9E; _gid=GA1.2.311675459.1542615716; _ga=GA1.3.841469794.1541152606; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1542634073,1542634080,1542634122,1542634128; JSESSIONID=ABAAABAAAGCABCC1B87E5C12282CECED77A736D4CD7FA8A; X_HTTP_TOKEN=aae2d9e96d6a68f72d98ab409a933460; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221672c5c65c01c7-0e8e56366a6cce-3a3a5c0e-2073600-1672c5c65c3bf%22%2C%22%24device_id%22%3A%221672c5c65c01c7-0e8e56366a6cce-3a3a5c0e-2073600-1672c5c65c3bf%22%7D; sajssdk_2015_cross_new_user=1; _gat=1; LGSID=20181119231628-167f7db1-ec0e-11e8-a76a-525400f775ce; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fm.lagou.com%2Fsearch.html; PRE_LAND=https%3A%2F%2Fm.lagou.com%2Fjobs%2F5219979.html; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6={timeStamp}; LGRID={time}-1c458fde-ec0e-11e8-895f-5254005c3644".format(
                timeStamp=timeStamp, time=time1),
            "Referer": "https://m.lagou.com/search.html",
            "User-Agent": user_agent,
            "X-Requested-With": "XMLHttpRequest",
        }
        for i in range(PAGE_SUM):
            proxies = {'http': 'http://171.37.164.78:8123'}
            response = requests.get(index_url.format(CITY, POSITON_NAME, i, PAGE_SIZE),
                                    headers=headers, proxies=proxies).content.decode()
            content = json.loads(response)
            # print('content', content)
            if content:
                try:
                    result = content['content']['data']['page']['result']
                    for item in result:
                        # print(type(item), item)
                        # print(item['positionId'])
                        data = {
                            'positionId': item['positionId'],
                            'positionName': item['positionName'],
                            'city': item['city'],
                            'createTime': item['createTime'],
                            'companyId': item['companyId'],
                            'companyLogo': item['companyLogo'],
                            'companyName': item['companyName'],
                            'companyFullName': item['companyFullName'],
                        }
                        time.sleep(0.5)
                        # db['lagouData'].insert(data)
                        yield data
                except Exception as e:
                    print('Error while crawling the index page', e)
            else:
                time.sleep(5)  # wait before retrying (example value)
                print('Reloading')
            # except Exception as e:
            #     print('Error while crawling the index page', e)

    # detail page:
    def detail_d(positionId):
        # randomly pick a User-Agent
        user_agent = random.choice(user_agents)
        headers = {
            "Accept": "application/json",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "Host": "m.lagou.com",
            "Cookie": "_ga=GA1.2.841469794.1541152606; user_trace_token=20181102175657-a2701865-de85-11e8-8368-525400f775ce; LGUID=20181102175657-a2701fbd-de85-11e8-8368-525400f775ce; index_location_city=%E5%B9%BF%E5%B7%9E; _gid=GA1.2.311675459.1542615716; _ga=GA1.3.841469794.1541152606; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1542634073,1542634080,1542634122,1542634128; JSESSIONID=ABAAABAAAGCABCC1B87E5C12282CECED77A736D4CD7FA8A; X_HTTP_TOKEN=aae2d9e96d6a68f72d98ab409a933460; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221672c5c65c01c7-0e8e56366a6cce-3a3a5c0e-2073600-1672c5c65c3bf%22%2C%22%24device_id%22%3A%221672c5c65c01c7-0e8e56366a6cce-3a3a5c0e-2073600-1672c5c65c3bf%22%7D; sajssdk_2015_cross_new_user=1; _gat=1; LGSID=20181119231628-167f7db1-ec0e-11e8-a76a-525400f775ce; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fm.lagou.com%2Fsearch.html; PRE_LAND=https%3A%2F%2Fm.lagou.com%2Fjobs%2F5219979.html; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6={timeStamp}; LGRID={time}-1c458fde-ec0e-11e8-895f-5254005c3644".format(
                timeStamp=timeStamp, time=time1),
            "Referer": "https://m.lagou.com/search.html",
            "User-Agent": user_agent,
            "X-Requested-With": "XMLHttpRequest",
        }
        response = requests.get(detail_url.format(positionId), headers=headers).content.decode()

        xml = etree.HTML(response)
        title = xml.xpath('''//div[@class='postitle']/h2/text()''')
        job_details = xml.xpath('''//div[@class='detail']/div[1]//span/span/text()''')
        job_detail = str(job_details).replace(r'\n', '').replace(' ', '')
        work_detial = xml.xpath('''//div[@class='content']//p/text()''')
        company_img = xml.xpath('''//div[@class='content']//p/text()''')
        company_infors = xml.xpath(
            '''//div[@class='company activeable']/div/div/h2/text()|//div[@class='dleft']/p/text()''')
        company_infor = str(company_infors).strip().replace(r'\n', '').replace(' ', '')
        detail_datas = {
            'title': title,
            'job_detail': job_detail,
            'work_detial': work_detial,
            'company_img': company_img,
            'company_infor': company_infor
        }
        return detail_datas

    # save to MongoDB
    def save_to_mongodb(data, detail_datas, positionId):
        # if db[data_name].update({'positionId': positionId}, {'$set': data}, True):
        #     print('update to Mongo', data['positionId'])

        db[data_name].insert_one(data)
        db[detail].insert_one(detail_datas)
        print('Saved to MongoDB')

    # save to a csv file
    def save_to_csv():
        item_list = []
        for item in index_fn():
            item_list.append(item)
            print('', item)
        # print('detail list', item_list)
        # item_list is a list of dicts like {'positionId': ..., 'positionName': '数据挖掘工程师', 'city': '广州', ...}
        # note: 'salary' is listed as a column but is not collected by index_fn(), so it stays empty
        datas = pd.DataFrame(item_list, columns=["positionId", "positionName", "city", "createTime", "salary", "companyId",
                                                 "companyLogo", "companyName", "companyFullName"])

        datas.to_csv('./static/lagou.csv')
        print('Saved the csv file')

    def run():
        # save to a csv file
        # proxies = get_ip()
        # for i in proxies:
        data = index_fn()
        save_to_csv()
        for item in data:
            print('data', item)
            positionId = item['positionId']
            print(positionId)
            # call the detail-page function
            detail_datas = detail_d(positionId)
            # save the detail-page and index-page data to MongoDB
            save_to_mongodb(item, detail_datas, positionId)

    if __name__ == '__main__':
        run()
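After run() finishes, a quick sanity check of the scraped results might look like the following sketch; it assumes the script above has written ./static/lagou.csv and the lagou database on a local MongoDB instance:

    import pandas as pd
    import pymongo

    # load the CSV written by save_to_csv() and preview a few rows
    df = pd.read_csv('./static/lagou.csv')
    print(df[['positionName', 'companyName', 'city']].head())

    # count the documents stored in the two collections
    client = pymongo.MongoClient('localhost')
    db = client['lagou']
    print('index documents:', db['lagouData'].count_documents({}))
    print('detail documents:', db['detailData'].count_documents({}))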

Measures to avoid triggering anti-crawler defenses (a minimal sketch combining all three follows this list):

1. Rotate the User-Agent

2. Use different IP proxies

3. Vary the time-dependent values in the user's cookie
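To make the three measures concrete, here is a minimal sketch (not the full script above) that picks a random User-Agent and proxy for every request and regenerates the time-dependent cookie fields each time; the proxy_pool list and the Hm_lpvt_example/LGRID cookie values are placeholders, not the real site cookies:

    import datetime
    import random
    import requests

    # small pool of User-Agent strings (shortened examples)
    user_agents = [
        "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_3 like Mac OS X) AppleWebKit/603.3.8 (KHTML, like Gecko) Mobile/14G60",
        "Mozilla/5.0 (Linux; Android 7.0; STF-AL10) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0 Mobile Safari/537.36",
    ]
    # hypothetical proxy pool, e.g. filled from the get_ip() generator shown later
    proxy_pool = [{'http': 'http://171.37.164.78:8123'}]

    def fetch(url):
        now = datetime.datetime.now()
        # measure 3: rebuild the time-dependent cookie fields on every call
        cookie = 'Hm_lpvt_example={ts}; LGRID={t}-example'.format(
            ts=int(now.timestamp()), t=now.strftime('%Y%m%d%H%M%S'))
        headers = {
            'User-Agent': random.choice(user_agents),  # measure 1: rotate the User-Agent
            'Cookie': cookie,
        }
        proxies = random.choice(proxy_pool)            # measure 2: rotate the IP proxy
        return requests.get(url, headers=headers, proxies=proxies, timeout=10)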

Below is code that fetches free public IP proxies from the web and verifies whether each one is usable:

    import requests
    import re
    import telnetlib
    from lxml import etree
    import time

    def get_ip():
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
        }
        url = "http://www.xicidaili.com/nn/"
        res = requests.get(url, headers=headers)
        content = res.content.decode()

        # parse the html with lxml's etree so xpath can be used to pull out the text
        xml = etree.HTML(content)
        # xpath expressions used below:
        # //tr[@class='odd']//td[2]//text()  ip
        # //tr[@class='odd']//td[3]//text()  port
        # //tr[@class='odd']//td[6]//text()  type
        ip_list = xml.xpath("//tr[@class='odd']//td[2]//text()")
        port_list = xml.xpath("//tr[@class='odd']//td[3]//text()")
        type_list = xml.xpath("//tr[@class='odd']//td[6]//text()")
        if len(ip_list) != 0:
            for ip, port, type in zip(ip_list, port_list, type_list):
                proxies = {
                    type: "{}:{}".format(ip, port)
                }
                try:
                    # open a TCP connection to the proxy to check that it is alive
                    telnetlib.Telnet(ip, port=port, timeout=3)  # timeout in seconds (example value)
                except Exception:
                    print("Proxy {} is not usable".format(proxies))
                else:
                    print('Proxy {} is usable'.format(proxies))
                    yield proxies
            # keep producing proxies from the next crawl of the list
            yield from get_ip()
        else:
            time.sleep(5)  # wait before retrying (example value)
            yield from get_ip()

    # content = res.content.decode()
    # print(content)

    # if __name__ == '__main__':
    #     ip = get_ip()
    #     print(ip)
    #     for i in ip:
    #         pass
    #         yield i
    #         print('getip,getip', i)
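One possible way to consume this generator is sketched below; it reuses get_ip() from above and tests each verified proxy with a request to httpbin.org, which stands in here for the real target site:

    import requests

    # take a few verified proxies from the generator and try one request with each
    proxies_iter = get_ip()
    for _ in range(3):
        proxy = next(proxies_iter)
        # lowercase the scheme key so requests matches it against the URL scheme
        proxy = {k.lower(): v for k, v in proxy.items()}
        resp = requests.get('http://httpbin.org/ip', proxies=proxy, timeout=10)
        print(resp.status_code, proxy)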
