利用xpath爬取招聘网的招聘信息
爬取招聘网的招聘信息:
import json
import random
import time
import pymongo
import re
import pandas as pd
import requests
from lxml import etree
import datetime

# --- values that change per run, substituted into the cookie ----------------
now = datetime.datetime.now()
# Unix timestamp in seconds for the Hm_lpvt cookie field.
# NOTE(review): the source was garbled ("timestamp()*"); if the site expects
# milliseconds this should be int(now.timestamp() * 1000) -- confirm.
timeStamp = int(now.timestamp())
geshi = "%Y%m%d%H%M%S"
time1 = datetime.datetime.strftime(now, geshi)

# --- MongoDB setup ----------------------------------------------------------
client = pymongo.MongoClient('localhost')
# database name
db = client['lagou']
# collection names: listing records / detail-page records
data_name = 'lagouData'
detail = 'detailData'

# --- crawl constants --------------------------------------------------------
CITY = '广州'
# job keyword to search for (name keeps the source's POSITON typo for compat)
POSITON_NAME = '数据挖掘'
# total number of listing pages to crawl
# NOTE(review): value lost in the source -- 10 is a placeholder; confirm.
PAGE_SUN = 10
# jobs returned per page (the commented search URL in the source used 15)
PAGE_SIZE = 15
# XPath note: span[position()>3] matches the spans after the third one, e.g.
#   //dd[@class='job_request']/p/span[position()>3]
# listing (index) page API url: city, positionName, pageNo, pageSize
index_url = 'https://m.lagou.com/search.json?city={}&positionName={}&pageNo={}&pageSize={}'
# detail page url: positionId
detail_url = 'https://m.lagou.com/jobs/{}.html'
# pool of mobile User-Agent strings to rotate through
user_agents = [
    "Mozilla/5.0 (iPhone 84; CPU iPhone OS 10_3_3 like Mac OS X) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.0 MQQBrowser/7.8.0 Mobile/14G60 Safari/8536.25 MttCustomUA/2 QBWebViewType/1 WKType/1",
    "Mozilla/5.0 (Linux; Android 7.0; STF-AL10 Build/HUAWEISTF-AL10; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043508 Safari/537.36 V1_AND_SQ_7.2.0_730_YYB_D QQ/7.2.0.3270 NetType/4G WebP/0.3.0 Pixel/1080",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_3 like Mac OS X) AppleWebKit/603.3.8 (KHTML, like Gecko) Mobile/14G60 MicroMessenger/6.5.18 NetType/WIFI Language/en",
    "Mozilla/5.0 (Linux; Android 5.1.1; vivo Xplay5A Build/LMY47V; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/48.0.2564.116 Mobile Safari/537.36 T7/9.3 baiduboxapp/9.3.0.10 (Baidu; P1 5.1.1)",
    "Mozilla/5.0 (Linux; U; Android 7.0; zh-cn; STF-AL00 Build/HUAWEISTF-AL00) AppleWebKit/537.36 (KHTML, like Gecko)Version/4.0 Chrome/37.0.0.0 MQQBrowser/7.9 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 6.0; LEX626 Build/HEXCNFN5902606111S) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/35.0.1916.138 Mobile Safari/537.36 T7/7.4 baiduboxapp/8.3.1 (Baidu; P1 6.0)",
    "Mozilla/5.0 (iPhone 92; CPU iPhone OS 10_3_2 like Mac OS X) AppleWebKit/603.2.4 (KHTML, like Gecko) Version/10.0 MQQBrowser/7.7.2 Mobile/14F89 Safari/8536.25 MttCustomUA/2 QBWebViewType/1 WKType/1",
    "Mozilla/5.0 (Linux; U; Android 7.0; zh-CN; ZUK Z2121 Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/40.0.2214.89 UCBrowser/11.6.8.952 Mobile Safari/537.36",
]
def index_fn():
    """Crawl the mobile search API page by page; yield one dict per posting.

    Generator of dicts holding the position/company fields listed below.
    Failures while unpacking a page's JSON are caught and logged so the
    crawl continues with the next page.
    """
    # rotate the User-Agent per call to look less like a bot
    user_agent = random.choice(user_agents)
    headers = {
        "Accept": "application/json",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "Host": "m.lagou.com",
        # session cookie with the timestamp fields refreshed each run
        "Cookie": "_ga=GA1.2.841469794.1541152606; user_trace_token=20181102175657-a2701865-de85-11e8-8368-525400f775ce; LGUID=20181102175657-a2701fbd-de85-11e8-8368-525400f775ce; index_location_city=%E5%B9%BF%E5%B7%9E; _gid=GA1.2.311675459.1542615716; _ga=GA1.3.841469794.1541152606; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1542634073,1542634080,1542634122,1542634128; JSESSIONID=ABAAABAAAGCABCC1B87E5C12282CECED77A736D4CD7FA8A; X_HTTP_TOKEN=aae2d9e96d6a68f72d98ab409a933460; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221672c5c65c01c7-0e8e56366a6cce-3a3a5c0e-2073600-1672c5c65c3bf%22%2C%22%24device_id%22%3A%221672c5c65c01c7-0e8e56366a6cce-3a3a5c0e-2073600-1672c5c65c3bf%22%7D; sajssdk_2015_cross_new_user=1; _gat=1; LGSID=20181119231628-167f7db1-ec0e-11e8-a76a-525400f775ce; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fm.lagou.com%2Fsearch.html; PRE_LAND=https%3A%2F%2Fm.lagou.com%2Fjobs%2F5219979.html; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6={timeStamp}; LGRID={time}-1c458fde-ec0e-11e8-895f-5254005c3644".format(
            timeStamp=timeStamp, time=time1),
        "Referer": "https://m.lagou.com/search.html",
        # use the randomly chosen mobile UA -- the source computed it but then
        # hard-coded a desktop UA, which defeated the rotation
        "User-Agent": user_agent,
        "X-Requested-With": "XMLHttpRequest",
    }
    # iterate over PAGES (pageNo is 1-based); the source looped over
    # range(PAGE_SIZE), i.e. the per-page job count, which was a bug
    for page_no in range(1, PAGE_SUN + 1):
        proxies = {'HTTP': '171.37.164.78:8123'}
        response = requests.get(
            index_url.format(CITY, POSITON_NAME, page_no, PAGE_SIZE),
            headers=headers, proxies=proxies).content.decode()
        content = json.loads(response)
        if content:
            try:
                result = content['content']['data']['page']['result']
                for item in result:
                    data = {
                        'positionId': item['positionId'],
                        'positionName': item['positionName'],
                        'city': item['city'],
                        'createTime': item['createTime'],
                        'companyId': item['companyId'],
                        'companyLogo': item['companyLogo'],
                        'companyName': item['companyName'],
                        'companyFullName': item['companyFullName'],
                    }
                    # throttle between items to avoid tripping rate limits
                    time.sleep(0.5)
                    yield data
            except Exception as e:
                print('爬取index页出错', e)
        else:
            # empty response: back off briefly before the next page
            # NOTE(review): sleep value was lost in the source -- confirm
            time.sleep(2)
            print('重新加载')
def detail_d(positionId):
    """Fetch the detail page for *positionId* and scrape its fields.

    Returns a dict with title, job_detail, work_detial, company_img and
    company_infor (keys keep the source's spellings for compatibility).
    """
    # rotate the User-Agent per call
    user_agent = random.choice(user_agents)
    headers = {
        "Accept": "application/json",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "Host": "m.lagou.com",
        # session cookie with the timestamp fields refreshed each run
        "Cookie": "_ga=GA1.2.841469794.1541152606; user_trace_token=20181102175657-a2701865-de85-11e8-8368-525400f775ce; LGUID=20181102175657-a2701fbd-de85-11e8-8368-525400f775ce; index_location_city=%E5%B9%BF%E5%B7%9E; _gid=GA1.2.311675459.1542615716; _ga=GA1.3.841469794.1541152606; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1542634073,1542634080,1542634122,1542634128; JSESSIONID=ABAAABAAAGCABCC1B87E5C12282CECED77A736D4CD7FA8A; X_HTTP_TOKEN=aae2d9e96d6a68f72d98ab409a933460; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221672c5c65c01c7-0e8e56366a6cce-3a3a5c0e-2073600-1672c5c65c3bf%22%2C%22%24device_id%22%3A%221672c5c65c01c7-0e8e56366a6cce-3a3a5c0e-2073600-1672c5c65c3bf%22%7D; sajssdk_2015_cross_new_user=1; _gat=1; LGSID=20181119231628-167f7db1-ec0e-11e8-a76a-525400f775ce; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fm.lagou.com%2Fsearch.html; PRE_LAND=https%3A%2F%2Fm.lagou.com%2Fjobs%2F5219979.html; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6={timeStamp}; LGRID={time}-1c458fde-ec0e-11e8-895f-5254005c3644".format(
            timeStamp=timeStamp, time=time1),
        "Referer": "https://m.lagou.com/search.html",
        # use the randomly chosen mobile UA -- the source computed it but then
        # hard-coded a desktop UA, which defeated the rotation
        "User-Agent": user_agent,
        "X-Requested-With": "XMLHttpRequest",
    }
    response = requests.get(detail_url.format(positionId), headers=headers).content.decode()
    xml = etree.HTML(response)
    # job title
    title = xml.xpath('''//div[@class='postitle']/h2/text()''')
    # salary/requirements spans; flatten to one whitespace-free string
    job_details = xml.xpath('''//div[@class='detail']/div[1]//span/span/text()''')
    job_detail = str(job_details).replace(r'\n', '').replace(' ', '')
    # job description paragraphs
    work_detial = xml.xpath('''//div[@class='content']//p/text()''')
    # NOTE(review): identical XPath to work_detial -- this probably should be
    # an <img>/@src expression for the company logo; confirm against the page.
    company_img = xml.xpath('''//div[@class='content']//p/text()''')
    # company name plus the short facts in the left panel
    company_infors = xml.xpath(
        '''//div[@class='company activeable']/div/div/h2/text()|//div[@class='dleft']/p/text()''')
    company_infor = str(company_infors).strip().replace(r'\n', '').replace(' ', '')
    detail_datas = {
        'title': title,
        'job_detail': job_detail,
        'work_detial': work_detial,
        'company_img': company_img,
        'company_infor': company_infor
    }
    return detail_datas
def save_to_mongodb(data, detail_datas, positionId):
    """Insert one listing record and its detail record into MongoDB.

    *positionId* is accepted for the upsert variant kept below for reference,
    but the plain inserts do not use it.
    """
    # upsert variant from the source, kept for reference:
    # if db[data_name].update({'positionId': positionId}, {'$set': data}, True):
    #     print('update to Mongo', data['positionId'])
    db[data_name].insert(data)
    # use the declared collection constant; the source hard-coded the
    # misspelled 'detailDta' and silently wrote to the wrong collection
    db[detail].insert(detail_datas)
    print('成功存入mongodb')
def save_to_csv():
    """Crawl all listing pages and dump the records to ./static/lagou.csv.

    Returns the list of record dicts so callers can reuse the data without
    crawling the site a second time (backward-compatible: the source
    implicitly returned None, which no caller used).
    """
    item_list = []
    for item in index_fn():
        item_list.append(item)
        print('', item)
    # item_list holds dicts like
    # {'positionId': ..., 'positionName': '数据挖掘工程师', 'city': '广州', ...}
    datas = pd.DataFrame(item_list,
                         columns=["positionId", "positionName", "city", "createTime", "salary", "companyId",
                                  "companyLogo", "companyName", "companyFullName"])
    datas.to_csv('./static/lagou.csv')
    print('保存为csv文件成功')
    return item_list


def run():
    """Crawl listings once, save to CSV, then fetch and store each detail page."""
    # save_to_csv() already walks every index page; reuse its result instead
    # of calling index_fn() again (the source crawled the whole site twice)
    items = save_to_csv()
    for item in items:
        print('data', item)
        positionId = item['positionId']
        print(positionId)
        # fetch the detail page for this posting
        detail_datas = detail_d(positionId)
        # store listing + detail records; the source passed the generator
        # `data` instead of the single `item` -- fixed
        save_to_mongodb(item, detail_datas, positionId)


if __name__ == '__main__':
    run()
预防反爬虫措施:
1.用户代理变换设置
2.不同ip代理的设置
3.设置用户cookie变化的信息
列举一下获取网络免费ip代理,并验证其是否可用的代码:
import requests
import re
import telnetlib
from lxml import etree
import time
def get_ip():
    """Scrape free proxies from xicidaili and yield the ones Telnet can reach.

    Generator of dicts shaped like {protocol: 'ip:port'}, suitable for
    requests' ``proxies=`` argument. Retries the scrape if the page yields
    no rows.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
    }
    url = "http://www.xicidaili.com/nn/"
    res = requests.get(url, headers=headers)
    # decode and hand the HTML to lxml so we can query it with XPath
    content = res.content.decode()
    xml = etree.HTML(content)
    # table layout: td[2] = ip, td[3] = port, td[6] = protocol type
    ip_list = xml.xpath("//tr[@class='odd']//td[2]//text()")
    port_list = xml.xpath("//tr[@class='odd']//td[3]//text()")
    type_list = xml.xpath("//tr[@class='odd']//td[6]//text()")
    if ip_list:
        # proxy_type instead of shadowing the builtin `type`
        for ip, port, proxy_type in zip(ip_list, port_list, type_list):
            proxies = {
                proxy_type: "{}:{}".format(ip, port)
            }
            try:
                # quick reachability probe
                # NOTE(review): timeout value was lost in the source -- confirm
                telnetlib.Telnet(ip, port=port, timeout=2)
            except Exception:
                print("不能使用该{}".format(proxies))
            else:
                print('可以使用该{}'.format(proxies))
                yield proxies
                # (the source called get_ip() here, which only builds a new
                # generator and discards it -- a no-op, removed)
    else:
        # nothing scraped: back off, then retry; the source's bare get_ip()
        # call was a no-op on a generator function, so delegate properly
        time.sleep(2)
        yield from get_ip()
利用xpath爬取招聘网的招聘信息的更多相关文章
- python爬取当当网的书籍信息并保存到csv文件
python爬取当当网的书籍信息并保存到csv文件 依赖的库: requests #用来获取页面内容 BeautifulSoup #python3不能安装BeautifulSoup,但可以安装Bea ...
- 利用scrapy爬取腾讯的招聘信息
利用scrapy框架抓取腾讯的招聘信息,爬取地址为:https://hr.tencent.com/position.php 抓取字段包括:招聘岗位,人数,工作地点,发布时间,及具体的工作要求和工作任务 ...
- 如何利用Xpath抓取京东网商品信息
前几小编分别利用Python正则表达式和BeautifulSoup爬取了京东网商品信息,今天小编利用Xpath来为大家演示一下如何实现京东商品信息的精准匹配~~ HTML文件其实就是由一组尖括号构成的 ...
- 利用python爬取贝壳网租房信息
最近准备换房子,在网站上寻找各种房源信息,看得眼花缭乱,于是想着能否将基本信息汇总起来便于查找,便用python将基本信息爬下来放到excel,这样一来就容易搜索了. 1. 利用lxml中的xpath ...
- 利用jsoup爬取百度网盘资源分享连接(多线程)
突然有一天就想说能不能用某种方法把百度网盘上分享的资源连接抓取下来,于是就动手了.知乎上有人说过最好的方法就是http://pan.baidu.com/wap抓取,一看果然链接后面的uk值是一串数字, ...
- 利用selenium爬取京东商品信息存放到mongodb
利用selenium爬取京东商城的商品信息思路: 1.首先进入京东的搜索页面,分析搜索页面信息可以得到路由结构 2.根据页面信息可以看到京东在搜索页面使用了懒加载,所以为了解决这个问题,使用递归.等待 ...
- python 爬虫之爬取大街网(思路)
由于需要,本人需要对大街网招聘信息进行分析,故写了个爬虫进行爬取.这里我将记录一下,本人爬取大街网的思路. 附:爬取得数据仅供自己分析所用,并未用作其它用途. 附:本篇适合有一定 爬虫基础 crawl ...
- 利用Selenium爬取淘宝商品信息
一. Selenium和PhantomJS介绍 Selenium是一个用于Web应用程序测试的工具,Selenium直接运行在浏览器中,就像真正的用户在操作一样.由于这个性质,Selenium也是一 ...
- Python爬虫项目--爬取自如网房源信息
本次爬取自如网房源信息所用到的知识点: 1. requests get请求 2. lxml解析html 3. Xpath 4. MongoDB存储 正文 1.分析目标站点 1. url: http:/ ...
随机推荐
- chip
1.芯片(chip.pas/cpp) [问题描述] 企鹅集成电路公司生产了一种大小为 2×3的芯片.每块芯片是从一块大小为N×M的硅片上切下来的,但由于原材料纯度问题,因而有若干的单位正方形并不能作为 ...
- axios 使用
<!DOCTYPE html> <html lang="en"> <head> {#导入静态文件#} {% load static %} < ...
- sublime中编辑服务器上的文件
背景:公司项目需要进行构建编译,在服务器上速度比较快,所以需要将sublime和linux中的文件相关联. 参考资料:http://zyan.cc/samba_linux_windows/ 主要有两步 ...
- STM32F4 ------ RTC
如果只执行 HAL_RTC_GetTime(),读取完后时间不再跑 HAL_RTC_GetTime() 和 HAL_RTC_GetDate()
- rest_framework学习之路
目录 RESTful理解 APIView 序列化组件 视图组件 解析器 认证组件 权限组件 频率组件 分页器 响应器 URL控制器 版本控制器
- phpStudy 5.5n +zendstudio12.5+xDebugger的配置
1.之前一直安装zendDebugger都没装上去,用phpStudy版本转换器转到对应版本的ZendDebuger也没用,后来发现自己下载的zendstudio的php是5.5的,而且自带了zend ...
- 5款Mac极速下载工具推荐和下载
最近几年用到下载工具的情况其实很少了,比如几年前我们可能经常用下载工具下载视频.音乐.图书等资源,但今天的我们更多的在线看视频.在线听音乐了,偶尔用到下载的时候直接用浏览器自带的下载工具也完全够用了, ...
- 上这个资源网站,让你轻松无忧找mac软件资源
之前分享过好几篇关于mac软件相关的文章(想要看其他的mac软件专题文章,可以关注我,点击进入查看发表的文章),有网友表示,优质的软件推荐清单有了,想要下载和获取mac软件,买一个正版软件在APP s ...
- Go-day02
Go程序的基本结构 ***func init() 会在main函数之前执行 1.包中的函数调用 a.同一个包中函数,直接调用 b.不同包中函数,通过包名+点+函数名进行调用 2.包的访问控制规则 a. ...
- kettle连接mysql数据库并进行数据分析
1.数据库链接驱动 如果没有安装对应的数据库链接驱动,在数据库链接的过程中,可能会报某个数据库连接找不到的异常,因此需要下载对应驱动后(安装步骤可以参见“怎么在官网上下载java连接mysql的驱动j ...