爬虫(猫眼电影+校花网+github+今日头条+拉勾)
Requests+正则表达式爬取猫眼TOP100榜电影信息
要点:写入文件时指定 encoding='utf-8',并在 json.dumps 中设置 ensure_ascii=False,以解决中文乱码;使用进程池(multiprocessing.Pool)并行抓取各页,加快爬取速度。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
|
import json
import re
from multiprocessing import Pool

import requests
from requests.exceptions import RequestException

# Matches one <dd> entry of the board page and captures, in order: rank,
# poster URL, title, cast text, release text, and the integer and fraction
# parts of the score. Compiled once instead of on every parse call.
MOVIE_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
    r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
    r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
    re.S,
)


def get_one_page(url):
    """Return the body of ``url`` as text, or None on any failure.

    A non-200 status and any ``RequestException`` (including the timeout
    set here) are both reported as None.
    """
    try:
        response = requests.get(url, timeout=10)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_one_page(html):
    """Yield one dict per movie entry found in ``html``.

    Yields nothing when ``html`` is None or empty, so a failed fetch does
    not raise ``TypeError`` inside the regex search.
    """
    if not html:
        return
    for item in MOVIE_PATTERN.findall(html):
        yield {
            '排行': item[0],
            '图片': item[1],
            '电影': item[2],
            # Drop the 3-character label that prefixes the cast text.
            '演员': item[3].strip()[3:],
            # Drop the 5-character label that prefixes the release text.
            '上映信息': item[4].strip()[5:],
            '评分': item[5] + item[6],
        }


def write_to_file(content):
    """Append ``content`` to result.txt as one JSON document per line.

    UTF-8 output plus ``ensure_ascii=False`` keeps Chinese text readable
    in the file instead of being written as \\uXXXX escapes.
    """
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(offset):
    """Fetch the board page at ``offset``, then print and save each movie."""
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


if __name__ == '__main__':
    # Offsets 0, 10, ..., 90: one page per offset, fetched by worker
    # processes. The context manager releases the pool once map() returns.
    with Pool() as pool:
        pool.map(main, [i * 10 for i in range(10)])
Requests+正则表达式爬取校花网视频
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
|
import os
import re

import requests

# Link to a detail page inside an element with class="items".
ITEM_LINK_PATTERN = re.compile('class="items".*?href="(.*?)"', re.S)
# Source URL of the element with id="media" on a detail page.
MEDIA_SRC_PATTERN = re.compile('id="media".*?src="(.*?)"', re.S)


def get_page(url):
    """Return the decoded text of ``url``, or None if the request fails.

    The encoding is taken from ``apparent_encoding`` (detected from the
    body) rather than from the response headers.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException:
        print("爬取失败")
        return None


def get_url(html):
    """Yield absolute detail-page URLs found in a list page.

    Yields nothing when ``html`` is None or empty (failed fetch).
    """
    if not html:
        return
    for url in ITEM_LINK_PATTERN.findall(html):
        if not url.startswith('http'):
            # Site-relative link: prefix the site root.
            url = 'http://www.xiaohuar.com' + url
        yield url


def get_detail_url(detail_content):
    """Yield the .mp4 media URLs found in a detail page.

    Yields nothing when ``detail_content`` is None or empty.
    """
    if not detail_content:
        return
    for url in MEDIA_SRC_PATTERN.findall(detail_content):
        if url.endswith('.mp4'):
            yield url


def download(url):
    """Save the file at ``url`` under D://movie2//, named by its last path part.

    Files that already exist are skipped. Request and filesystem errors
    are reported with a message and otherwise ignored, so one bad video
    does not stop the crawl.
    """
    root = "D://movie2//"
    path = root + url.split('/')[-1]
    try:
        os.makedirs(root, exist_ok=True)
        if os.path.exists(path):
            print("文件已存在")
            return
        # Stream the body in 64 KiB chunks instead of holding the whole
        # video in memory and writing it one byte at a time.
        with requests.get(url, stream=True, timeout=30) as response:
            # Do not save an HTTP error page as if it were a video.
            response.raise_for_status()
            with open(path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    f.write(chunk)
        print("文件保存成功")
    except (requests.RequestException, OSError):
        print("下载失败")


def main(page_num):
    """Download every video linked from list page ``page_num``."""
    url = 'http://www.xiaohuar.com/list-3-{0}.html'.format(page_num)
    html = get_page(url)
    for detail_page_url in get_url(html):
        detail_content = get_page(detail_page_url)
        for video_url in get_detail_url(detail_content):
            download(video_url)


if __name__ == '__main__':
    for num in range(30):
        main(num)
Requests+PyQuery模拟登陆github
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
|
import requests
from pyquery import PyQuery

LOGIN_URL = 'https://github.com/login'
SESSION_URL = 'https://github.com/session'

# A single session keeps the cookies set by the login page for the POST.
session = requests.session()

# Step 1: load the login form and read the hidden anti-forgery token.
response = session.get(LOGIN_URL)
text = PyQuery(response.text)
# NOTE(review): this positional selector depends on the exact layout of the
# login form and will return None if the markup changes - confirm it still
# matches before relying on it.
authenticity_token = text(
    '#login > form > div:nth-child(1) > input[type="hidden"]:nth-child(2)'
).attr('value')

# Step 2: submit the form fields together with the token.
# NOTE(review): the credentials below are hard-coded in source; load them
# from the environment or a secrets store before any real use.
data = {
    'commit': 'Sign in',
    'utf8': '✓',
    'authenticity_token': authenticity_token,
    'login': 'lcgsmile@qq.com',
    'password': 'lcg@pwd.',
}
response = session.post(SESSION_URL, data=data)
print(response.status_code)  # the original author observed 200 here
分析Ajax请求并抓取今日头条街拍美图
配置文件config.py
1
2
3
4
5
6
7
|
# Settings read by the main crawler script.

# MongoDB host, database name and collection name.
MONGO_URL = 'localhost'
MONGO_DB = 'toutiao'
MONGO_TABLE = 'toutiao'

# First and last (inclusive) page-group index; the crawler multiplies each
# index by 20 to obtain a search offset.
GROUP_START = 1
GROUP_END = 20

# Search keyword sent to the index endpoint.
KEYWORD = '街拍'
主爬虫文件
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
|
import json
import os
import re
from hashlib import md5
from json.decoder import JSONDecodeError
from multiprocessing import Pool
from urllib.parse import urlencode

import pymongo
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError

from config import (
    GROUP_END,
    GROUP_START,
    KEYWORD,
    MONGO_DB,
    MONGO_TABLE,
    MONGO_URL,
)

# connect=False defers the connection until first use, which the original
# author chose because the client is shared with worker processes.
client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]

# Captures the string argument of the page's `gallery: JSON.parse("...")`.
IMAGES_PATTERN = re.compile(r'gallery: JSON.parse\("(.*)"\)', re.S)


def _get_text(url):
    """Return the body of ``url`` as text, or None on a non-200 or connection error."""
    try:
        response = requests.get(url)
    except ConnectionError:
        print('Error occurred')
        return None
    if response.status_code == 200:
        return response.text
    return None


def get_page_index(offset, keyword):
    """Fetch one page of search results as raw JSON text.

    Returns None when the request fails or the status is not 200.
    """
    data = {
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
        'format': 'json',
        'keyword': keyword,
        'offset': offset,
    }
    # urlencode turns the dict into a URL query string.
    url = 'http://www.toutiao.com/search_content/' + '?' + urlencode(data)
    return _get_text(url)


def download_image(url):
    """Download the image at ``url`` and store it via ``save_image``.

    Always returns None; connection errors are ignored.
    """
    print('Downloading', url)
    try:
        response = requests.get(url)
    except ConnectionError:
        return None
    if response.status_code == 200:
        save_image(response.content)
    return None


def save_image(content):
    """Write ``content`` to <cwd>/<md5 of content>.jpg unless it already exists.

    Naming the file after the MD5 of its bytes means identical images are
    stored only once.
    """
    file_path = '{0}/{1}.{2}'.format(
        os.getcwd(), md5(content).hexdigest(), 'jpg')
    print(file_path)
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)


def parse_page_index(text):
    """Yield the ``article_url`` of each entry in the index JSON.

    Yields nothing for missing, empty or malformed input, and skips
    entries that carry no ``article_url``.
    """
    if not text:
        return
    try:
        data = json.loads(text)
    except JSONDecodeError:
        return
    if not isinstance(data, dict):
        return
    # 'data' may be absent or null in the response.
    for item in data.get('data') or []:
        article_url = item.get('article_url')
        if article_url:
            yield article_url


def get_page_detail(url):
    """Fetch a detail page; returns its text, or None on failure."""
    if not url:
        return None
    return _get_text(url)


def parse_page_detail(html, url):
    """Extract the title and gallery images of a detail page.

    Every image found is downloaded as a side effect.

    Returns a dict with keys 'title', 'url' and 'images', or None when the
    page is missing or contains no parsable gallery.
    """
    if not html:
        return None
    soup = BeautifulSoup(html, 'lxml')
    title_nodes = soup.select('title')
    title = title_nodes[0].get_text() if title_nodes else ''
    match = IMAGES_PATTERN.search(html)
    if not match:
        return None
    try:
        # The captured text is a JSON document with its quotes escaped by
        # backslashes; the backslashes must be removed outright. Replacing
        # them with a space (as the mangled paste did) corrupts the keys.
        data = json.loads(match.group(1).replace('\\', ''))
    except JSONDecodeError:
        return None
    if not data or 'sub_images' not in data:
        return None
    images = [item.get('url') for item in data.get('sub_images')]
    for image in images:
        download_image(image)
    return {
        'title': title,
        'url': url,
        'images': images,
    }


def save_to_mongo(result):
    """Insert ``result`` into the configured collection.

    Returns True when the insert produced a document id, else False.
    """
    # insert_one replaces Collection.insert, which PyMongo 4 removed.
    if db[MONGO_TABLE].insert_one(result).inserted_id:
        print('Successfully Saved to Mongo', result)
        return True
    return False


def main(offset):
    """Crawl one index page and store every article that has a gallery."""
    text = get_page_index(offset, KEYWORD)
    for url in parse_page_index(text):
        html = get_page_detail(url)
        result = parse_page_detail(html, url)
        if result:
            save_to_mongo(result)


if __name__ == '__main__':
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    try:
        pool.map(main, groups)
    finally:
        pool.close()
        pool.join()
拉勾网自动投递简历
import re
from urllib.parse import urlencode

import requests

# Browser User-Agent sent with every request.
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
)
LOGIN_PAGE_URL = 'https://passport.lagou.com/login/login.html'

# NOTE(review): credentials are hard-coded in source; load them from the
# environment or a secrets store before any real use. The password value is
# sent exactly as written here.
USERNAME = ''
PASSWORD = '70621c64832c4d4d66a47be6150b4a8e'

# NOTE: the 'X-Anit-Forge-*' header names below are spelled exactly as in
# the original request notes; do not "correct" them to 'Anti'.

# 1. ============================== Authentication flow
session = requests.session()

# Step 1: GET the login page with only a User-Agent header, then read the
# anti-forgery token and code out of the returned page text.
r1 = session.get(LOGIN_PAGE_URL, headers={'User-Agent': USER_AGENT})
X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]

# Step 2: POST the credentials together with the token and code.
session.post(
    'https://passport.lagou.com/login/login.json',
    headers={
        'Referer': LOGIN_PAGE_URL,
        'User-Agent': USER_AGENT,
        'X-Anit-Forge-Code': X_Anti_Forge_Code,
        'X-Anit-Forge-Token': X_Anti_Forge_Token,
        'X-Requested-With': 'XMLHttpRequest',
    },
    data={
        'isValidate': True,
        'username': USERNAME,
        'password': PASSWORD,
    },
)

# Step 3: GET the service-ticket grant page, with the login page as Referer.
session.get(
    'https://passport.lagou.com/grantServiceTicket/grant.html',
    headers={
        'Referer': LOGIN_PAGE_URL,
        'User-Agent': USER_AGENT,
    },
)

# Verification: request the resume page. The original author checked the
# login by looking for the account's phone number in response.text.
response = session.get(
    'https://www.lagou.com/resume/myresume.html',
    headers={'User-Agent': USER_AGENT},
)

# 2. ============================== Search for positions
# The Referer is the list page for the URL-encoded search keyword.
params = {'kw': 'python数据分析'}
res = urlencode(params).split('=')[-1]
url = 'https://www.lagou.com/jobs/list_' + res

# Filters listed in the original notes but left disabled: gj, xl, jd, hy.
# NOTE(review): the original notes also list a form body (first, pn, kd)
# for this request, but no `data=` was passed; confirm whether it is needed.
response = session.post(
    'https://www.lagou.com/jobs/positionAjax.json',
    params={
        'px': 'default',
        'yx': '15k-25k',
        'city': '北京',
        'district': '海淀区',
    },
    headers={
        'User-Agent': USER_AGENT,
        'Referer': url,
    },
)

result = response.json()['content']['positionResult']['result']
# NOTE(review): the paste lost its indentation. The detail-page fetch and the
# resume submission are placed inside the loop because they use the
# per-position detail_url and positionId - confirm this is intended.
for comanpy_info in result:
    fullname = comanpy_info['companyFullName']
    emp_num = comanpy_info['companySize']
    salary = comanpy_info['salary']
    workyear = comanpy_info['workYear']
    positionName = comanpy_info['positionName']
    positionId = comanpy_info['positionId']
    detail_url = 'https://www.lagou.com/jobs/%s.html' % (positionId)

    print(detail_url)
    print(fullname)
    print(emp_num)
    print(salary)
    print(workyear)
    print(positionName)
    print(positionId)
    print()

    # 3. ========================== Submit the resume for this position
    # Step 1: GET the detail page to obtain a fresh token and code.
    r1 = session.get(detail_url, headers={'User-Agent': USER_AGENT})
    X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
    X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]

    # Step 2: POST the delivery request, with the detail page as Referer.
    session.post(
        'https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json',
        headers={
            'User-Agent': USER_AGENT,
            'Referer': detail_url,
            'X-Anit-Forge-Code': X_Anti_Forge_Code,
            'X-Anit-Forge-Token': X_Anti_Forge_Token,
            'X-Requested-With': 'XMLHttpRequest',
        },
        data={
            'positionId': positionId,
            'type': 1,
            'force': True,
        },
    )
    print('投递成功', detail_url)

import re
from urllib.parse import urlencode

import requests

# Browser User-Agent sent with every request.
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
)
LOGIN_PAGE_URL = 'https://passport.lagou.com/login/login.html'

# NOTE(review): credentials are hard-coded in source; load them from the
# environment or a secrets store before any real use. The password value is
# sent exactly as written here.
USERNAME = '18611453110'
PASSWORD = '70621c64832c4d4d66a47be6150b4a8e'

# NOTE: the 'X-Anit-Forge-*' header names below are spelled exactly as in
# the original request notes; do not "correct" them to 'Anti'.

# 1. ============================== Authentication flow
session = requests.session()

# Step 1: GET the login page with only a User-Agent header, then read the
# anti-forgery token and code out of the returned page text.
r1 = session.get(LOGIN_PAGE_URL, headers={'User-Agent': USER_AGENT})
X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]

# Step 2: POST the credentials together with the token and code.
session.post(
    'https://passport.lagou.com/login/login.json',
    headers={
        'Referer': LOGIN_PAGE_URL,
        'User-Agent': USER_AGENT,
        'X-Anit-Forge-Code': X_Anti_Forge_Code,
        'X-Anit-Forge-Token': X_Anti_Forge_Token,
        'X-Requested-With': 'XMLHttpRequest',
    },
    data={
        'isValidate': True,
        'username': USERNAME,
        'password': PASSWORD,
    },
)

# Step 3: GET the service-ticket grant page, with the login page as Referer.
session.get(
    'https://passport.lagou.com/grantServiceTicket/grant.html',
    headers={
        'Referer': LOGIN_PAGE_URL,
        'User-Agent': USER_AGENT,
    },
)

# Verification: request the resume page. The original author checked the
# login by looking for the account's phone number in response.text.
response = session.get(
    'https://www.lagou.com/resume/myresume.html',
    headers={'User-Agent': USER_AGENT},
)

# 2. ============================== Search for positions
# The Referer is the list page for the URL-encoded search keyword.
params = {'kw': 'python数据分析'}
res = urlencode(params).split('=')[-1]
url = 'https://www.lagou.com/jobs/list_' + res

# Filters listed in the original notes but left disabled: gj, xl, jd, hy.
# NOTE(review): the original notes also list a form body (first, pn, kd)
# for this request, but no `data=` was passed; confirm whether it is needed.
response = session.post(
    'https://www.lagou.com/jobs/positionAjax.json',
    params={
        'px': 'default',
        'yx': '15k-25k',
        'city': '北京',
        'district': '海淀区',
    },
    headers={
        'User-Agent': USER_AGENT,
        'Referer': url,
    },
)

result = response.json()['content']['positionResult']['result']
# NOTE(review): the paste lost its indentation. The detail-page fetch and the
# resume submission are placed inside the loop because they use the
# per-position detail_url and positionId - confirm this is intended.
for comanpy_info in result:
    fullname = comanpy_info['companyFullName']
    emp_num = comanpy_info['companySize']
    salary = comanpy_info['salary']
    workyear = comanpy_info['workYear']
    positionName = comanpy_info['positionName']
    positionId = comanpy_info['positionId']
    detail_url = 'https://www.lagou.com/jobs/%s.html' % (positionId)

    print(detail_url)
    print(fullname)
    print(emp_num)
    print(salary)
    print(workyear)
    print(positionName)
    print(positionId)
    print()

    # 3. ========================== Submit the resume for this position
    # Step 1: GET the detail page to obtain a fresh token and code.
    r1 = session.get(detail_url, headers={'User-Agent': USER_AGENT})
    X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
    X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]

    # Step 2: POST the delivery request, with the detail page as Referer.
    session.post(
        'https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json',
        headers={
            'User-Agent': USER_AGENT,
            'Referer': detail_url,
            'X-Anit-Forge-Code': X_Anti_Forge_Code,
            'X-Anit-Forge-Token': X_Anti_Forge_Token,
            'X-Requested-With': 'XMLHttpRequest',
        },
        data={
            'positionId': positionId,
            'type': 1,
            'force': True,
        },
    )
    print('投递成功', detail_url)
import re
from urllib.parse import urlencode

import requests

# Browser User-Agent sent with every request.
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
)
LOGIN_PAGE_URL = 'https://passport.lagou.com/login/login.html'

# NOTE(review): credentials are hard-coded in source; load them from the
# environment or a secrets store before any real use. The password value is
# sent exactly as written here.
USERNAME = ''
PASSWORD = '70621c64832c4d4d66a47be6150b4a8e'

# NOTE: the 'X-Anit-Forge-*' header names below are spelled exactly as in
# the original request notes; do not "correct" them to 'Anti'.

# 1. ============================== Authentication flow
session = requests.session()

# Step 1: GET the login page with only a User-Agent header, then read the
# anti-forgery token and code out of the returned page text.
r1 = session.get(LOGIN_PAGE_URL, headers={'User-Agent': USER_AGENT})
X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]

# Step 2: POST the credentials together with the token and code.
session.post(
    'https://passport.lagou.com/login/login.json',
    headers={
        'Referer': LOGIN_PAGE_URL,
        'User-Agent': USER_AGENT,
        'X-Anit-Forge-Code': X_Anti_Forge_Code,
        'X-Anit-Forge-Token': X_Anti_Forge_Token,
        'X-Requested-With': 'XMLHttpRequest',
    },
    data={
        'isValidate': True,
        'username': USERNAME,
        'password': PASSWORD,
    },
)

# Step 3: GET the service-ticket grant page, with the login page as Referer.
session.get(
    'https://passport.lagou.com/grantServiceTicket/grant.html',
    headers={
        'Referer': LOGIN_PAGE_URL,
        'User-Agent': USER_AGENT,
    },
)

# Verification: request the resume page. The original author checked the
# login by looking for the account's phone number in response.text.
response = session.get(
    'https://www.lagou.com/resume/myresume.html',
    headers={'User-Agent': USER_AGENT},
)

# 2. ============================== Search for positions
# The Referer is the list page for the URL-encoded search keyword.
params = {'kw': 'python数据分析'}
res = urlencode(params).split('=')[-1]
url = 'https://www.lagou.com/jobs/list_' + res

# Filters listed in the original notes but left disabled: gj, xl, jd, hy.
# NOTE(review): the original notes also list a form body (first, pn, kd)
# for this request, but no `data=` was passed; confirm whether it is needed.
response = session.post(
    'https://www.lagou.com/jobs/positionAjax.json',
    params={
        'px': 'default',
        'yx': '15k-25k',
        'city': '北京',
        'district': '海淀区',
    },
    headers={
        'User-Agent': USER_AGENT,
        'Referer': url,
    },
)

result = response.json()['content']['positionResult']['result']
# NOTE(review): the paste lost its indentation. The detail-page fetch and the
# resume submission are placed inside the loop because they use the
# per-position detail_url and positionId - confirm this is intended.
for comanpy_info in result:
    fullname = comanpy_info['companyFullName']
    emp_num = comanpy_info['companySize']
    salary = comanpy_info['salary']
    workyear = comanpy_info['workYear']
    positionName = comanpy_info['positionName']
    positionId = comanpy_info['positionId']
    detail_url = 'https://www.lagou.com/jobs/%s.html' % (positionId)

    print(detail_url)
    print(fullname)
    print(emp_num)
    print(salary)
    print(workyear)
    print(positionName)
    print(positionId)
    print()

    # 3. ========================== Submit the resume for this position
    # Step 1: GET the detail page to obtain a fresh token and code.
    r1 = session.get(detail_url, headers={'User-Agent': USER_AGENT})
    X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
    X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]

    # Step 2: POST the delivery request, with the detail page as Referer.
    session.post(
        'https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json',
        headers={
            'User-Agent': USER_AGENT,
            'Referer': detail_url,
            'X-Anit-Forge-Code': X_Anti_Forge_Code,
            'X-Anit-Forge-Token': X_Anti_Forge_Token,
            'X-Requested-With': 'XMLHttpRequest',
        },
        data={
            'positionId': positionId,
            'type': 1,
            'force': True,
        },
    )
    print('投递成功', detail_url)
爬虫(猫眼电影+校花网+github+今日头条+拉勾)的更多相关文章
- Python-爬取校花网视频(单线程和多线程版本)
一.参考文章 python爬虫爬取校花网视频,单线程爬取 爬虫----爬取校花网视频,包含多线程版本 上述两篇文章都是对校花网视频的爬取,由于时间相隔很久了,校花网上的一些视频已经不存在了,因此上述文 ...
- Python 爬虫 爬校花网!!
爬虫:是一种按照一定的规则,自动地抓取万维网信息的程序或者脚本 1.福利来了 校花网 ,首先说为什么要爬这个网站呢,第一这个网站简单爬起来容易不会受到打击,第二呢 你懂得... 1.第一步,需要下载 ...
- python爬虫基础应用----爬取校花网视频
一.爬虫简单介绍 爬虫是什么? 爬虫是首先使用模拟浏览器访问网站获取数据,然后通过解析过滤获得有价值的信息,最后保存到到自己库中的程序. 爬虫程序包括哪些模块? python中的爬虫程序主要包括,re ...
- Python之爬虫-校花网
Python之爬虫-校花网 #!/usr/bin/env python # -*- coding:utf-8 -*- import re import requests # 拿到校花网主页的内容 re ...
- Python 爬虫 校花网
爬虫:是一种按照一定的规则,自动地抓取万维网信息的程序或者脚本. 福利来了 校花网 ,首先说为什么要爬这个网站呢,第一这个网站简单爬起来容易,不会受到打击,第二呢 你懂得.... 1.第一步,需要下 ...
- Go语言实战-爬取校花网图片
一.目标网站分析 爬取校花网http://www.xiaohuar.com/大学校花所有图片. 经过分析,所有图片分为四个页面,http://www.xiaohuar.com/list-1-0.htm ...
- day1之校花网小试牛刀
一 利用生成器来完成爬去校花网视频 import requests import re import os import hashlib import time DOWLOAD_PATH=r'D:\D ...
- Python之爬虫-猫眼电影
Python之爬虫-猫眼电影 #!/usr/bin/env python # coding: utf-8 import json import requests import re import ti ...
- python实战项目 — 爬取 校花网图片
重点: 1. 指定路径创建文件夹,判断是否存在 2. 保存图片文件 # 获得校花网的地址,图片的链接 import re import requests import time import os ...
随机推荐
- shell编程 之 文件包含
解释:就是在一个脚本中引用或者运行其他脚本的文件. 常用格式:. filename 或者 source filename 实例:/hehe文件夹下有两个文件:t2.sh 和t3.sh t2.sh的内容 ...
- Django中的缓存基础知识
由于Django是动态网站,所有每次请求均会去数据进行相应的操作,当程序访问量大时,耗时必然会更加明显,最简单解决方式是使用:缓存,缓存将一个某个views的返回值保存至内存或者memcache中,5 ...
- RabbitMQ channel 参数详解
1.Channel 1.1 channel.exchangeDeclare(): type:有direct.fanout.topic三种durable:true.false true:服务器重启会保留 ...
- 管理并行SQL执行的进程
本节介绍的并行执行功能可用于Oracle数据库企业版 本节介绍如何管理SQL语句的并行处理.在此配置中,Oracle数据库可以将处理SQL语句的工作分为多个并行进程. 许多SQL语句的执行可以并行化. ...
- 围在栅栏中的爱WriteUp(附QWE密码加解密脚本)
题目的链接:http://www.shiyanbar.com/ctf/1917 1.首先题目给出的是摩尔斯电码: 在下面的网站上解密:https://www.cryptool.org/en/cto-c ...
- 20165231 预备作业二:学习基础和C语言基础调查
微信文章感想 读了娄老师微信公众号中的文章,老师给我们的启示首先就是要坚持,万事开头难,但是只要肯坚持就一定会有所成就,不管是学习还是生活方面.其中最有触动的就是减肥了,是我三四年来一直难以完成的目标 ...
- 【Linux】时间同步设置+防火墙设置+SELinux设置
时间同步设置 在大数据集群环境中,要求每台集群的时间必须是同步的,这样我们就会要求每台集群的时间必须和一台服务的时间是同步的.接下来介绍一下步骤: 1,设置ntp客户端 yum -y install ...
- dubbo源码分析4——SPI机制_ExtensionFactory类的作用
ExtensionFactory的源码: @SPI public interface ExtensionFactory { /** * Get extension. * * @param type o ...
- MongoDB与MySQL的插入、查询性能测试
1.1 MongoDB的简单介绍 在当今的数据库市场上,MySQL无疑是占有一席之地的.作为一个开源的关系型数据库,MySQL被大量应用在各大网站后台中,承担着信息存储的重要作用.2009年,甲骨文 ...
- 023_nginx跨域问题
什么是跨域? 使用js获取数据时,涉及到的两个url只要协议.域名.端口有任何一个不同,都被当作是不同的域,相互访问就会有跨域问题.例如客户端的域名是www.redis.com.cn,而请求的域名是w ...