Requests+正则表达式爬取猫眼TOP100榜电影信息

MARK:将信息写入文件解决乱码方法,开启进程池秒爬。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import requests
from requests.exceptions import RequestException
import re
import json
from multiprocessing import Pool
 
 
def get_one_page(url):
    """Fetch *url* and return its HTML text, or None on failure/non-200."""
    try:
        resp = requests.get(url)
    except RequestException:
        # network-level failure: signal "no page" to the caller
        return None
    return resp.text if resp.status_code == 200 else None
 
 
def parse_one_page(html):
    """Parse one Maoyan TOP100 board page and yield one dict per movie.

    Yielded keys: 排行 (rank), 图片 (cover URL), 电影 (title),
    演员 (actors, "主演:" prefix stripped), 上映信息 (release info,
    "上映时间:" prefix stripped), 评分 (integer + fraction score parts).
    """
    # re.S lets .*? span newlines; raw strings avoid invalid-escape warnings.
    pattern = re.compile(r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
                         r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
                         r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {
            '排行': item[0],
            '图片': item[1],
            '电影': item[2],
            '演员': item[3].strip()[3:],      # drop the "主演:" prefix
            '上映信息': item[4].strip()[5:],  # drop the "上映时间:" prefix
            '评分': item[5] + item[6],        # fix: join integer and fraction parts
        }
 
 
def write_to_file(content):
    """Append *content* as one JSON line to result.txt (UTF-8)."""
    # fix: missing comma between filename and mode; '+ "\n"' belonged outside dumps()
    with open('result.txt', 'a', encoding='utf-8') as f:
        # ensure_ascii=False keeps Chinese characters readable instead of \uXXXX escapes
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
 
 
def main(offset):
    """Crawl the board page at *offset*; print and persist every movie found."""
    page_url = 'http://maoyan.com/board/4?offset=' + str(offset)
    page_html = get_one_page(page_url)
    for movie in parse_one_page(page_html):
        print(movie)
        write_to_file(movie)
 
 
if __name__ == '__main__':
    # Serial alternative:
    # for i in range(10):
    #     main(i * 10)
    pool = Pool()  # process pool: crawl the 10 board pages in parallel
    # fix: the comprehension was missing its loop variable 'i'
    pool.map(main, [i * 10 for i in range(10)])
    pool.close()  # fix: release worker processes cleanly
    pool.join()

Requests+正则表达式爬取校花网视频

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import requests
import re
import os
 
 
def get_page(url):
    """Fetch *url* and return its decoded text, or None if the request fails."""
    try:
        response = requests.get(url)
        response.raise_for_status()
        # use the sniffed encoding so Chinese pages decode correctly
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException:  # fix: bare except hid real bugs (e.g. NameError)
        print("爬取失败")
        return None
 
 
def get_url(html):
    """Yield absolute detail-page URLs found on a list page."""
    link_re = re.compile('class="items".*?href="(.*?)"', re.S)
    for href in link_re.findall(html):
        # relative links on the site need the host prepended
        yield href if href.startswith('http') else 'http://www.xiaohuar.com' + href
 
 
def get_detail_url(detail_content):
    """Yield every .mp4 URL referenced by the page's #media element."""
    media_re = re.compile('id="media".*?src="(.*?)"', re.S)
    for candidate in media_re.findall(detail_content):
        # skip empty matches and non-video sources
        if candidate and candidate.endswith('.mp4'):
            yield candidate
 
 
def download(url, root="D://movie2//"):
    """Download *url* into *root* (created if missing), skipping existing files.

    The file name is the last path segment of the URL. *root* keeps its old
    hard-coded default, so existing callers are unaffected.
    """
    path = root + url.split('/')[-1]
    try:
        if not os.path.exists(root):
            os.mkdir(root)
        if not os.path.exists(path):
            # stream in sizeable chunks; fix: iter_content() with no chunk_size
            # yields one byte per iteration, which is extremely slow
            response = requests.get(url, stream=True)
            with open(path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=65536):
                    f.write(chunk)
                print("文件保存成功")
        else:
            print("文件已存在")
    except (requests.RequestException, OSError):  # fix: bare except hid real bugs
        print("下载失败")
 
 
def main(page_num):
    """Crawl list page *page_num*: collect detail pages, download their videos."""
    list_html = get_page('http://www.xiaohuar.com/list-3-{0}.html'.format(page_num))
    for detail_page in get_url(list_html):
        detail_html = get_page(detail_page)
        for video_url in get_detail_url(detail_html):
            download(video_url)
 
 
if __name__ == '__main__':
    # crawl list pages 0..29 sequentially
    for page in range(30):
        main(page)

Requests+PyQuery模拟登陆github

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
import requests
from pyquery import PyQuery
 
LOGIN_URL = 'https://github.com/login'
SESSION_URL = 'https://github.com/session'
# GET the login page first so the session picks up cookies and the CSRF token.
session = requests.session()
response = session.get(LOGIN_URL)
text = PyQuery(response.text)
authenticity_token = text('#login > form > div:nth-child(1) > input[type="hidden"]:nth-child(2)').attr('value')
# fix: the dict literal was missing its ':' key separators
data = {
    'commit': 'Sign in',
    'utf8': '✓',
    'authenticity_token': authenticity_token,
    'login': 'lcgsmile@qq.com',
    'password': 'lcg@pwd.'
}
response = session.post(SESSION_URL, data=data)
print(response.status_code)  # 200 on success

分析Ajax请求并抓取今日头条街拍美图

配置文件config.py

1
2
3
4
5
6
7
# MongoDB connection settings used by the Toutiao crawler.
MONGO_URL = 'localhost'
MONGO_DB = 'toutiao'
MONGO_TABLE = 'toutiao'
 
# Index-page range: offsets GROUP_START*20 .. GROUP_END*20 are crawled.
GROUP_START = 1
GROUP_END = 20
KEYWORD = '街拍'  # search keyword ("street snap")

主爬虫文件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import json
import os
from urllib.parse import urlencode
import pymongo
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError
import re
from multiprocessing import Pool
from hashlib import md5
from json.decoder import JSONDecodeError
from config import *
 
client = pymongo.MongoClient(MONGO_URL, connect=False)  # connect=False: lazy connect, required for multiprocess crawling
db = client[MONGO_DB]  # database handle shared by the worker processes
 
 
def get_page_index(offset, keyword):
    """Fetch one Toutiao search-index page; return its JSON text or None.

    offset: paging offset (multiple of 20); keyword: search term.
    """
    # fix: the dict literal was missing its ':' key separators
    data = {
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
        'format': 'json',
        'keyword': keyword,
        'offset': offset,
    }
    params = urlencode(data)  # dict -> URL query string
    base = 'http://www.toutiao.com/search_content/'
    url = base + '?' + params
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('Error occurred')
        return None
 
 
def download_image(url):
    """Fetch an image URL and hand the raw bytes to save_image; returns None."""
    print('Downloading', url)
    try:
        resp = requests.get(url)
    except ConnectionError:
        return None
    if resp.status_code == 200:
        save_image(resp.content)
    return None
 
 
def save_image(content):
    """Write image bytes to the cwd under an md5-derived name.

    Hashing the content makes the name deterministic, so identical
    images are written only once.
    """
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    print(file_path)
    if os.path.exists(file_path):
        return
    with open(file_path, 'wb') as out:
        out.write(content)
 
 
def parse_page_index(text):
    """Yield the article_url of every entry in an index-page JSON payload.

    Malformed JSON yields nothing.
    """
    try:
        payload = json.loads(text)  # JSON string -> dict
    except JSONDecodeError:
        return
    if payload and 'data' in payload.keys():
        for entry in payload.get('data'):
            yield entry.get('article_url')
 
 
def get_page_detail(url):
    """Fetch an article detail page; return its HTML text or None on failure."""
    try:
        resp = requests.get(url)
    except ConnectionError:
        print('Error occurred')
        return None
    return resp.text if resp.status_code == 200 else None
 
 
def parse_page_detail(html, url):
    """Extract the title and gallery image URLs from a detail page.

    Downloads every image as a side effect. Returns a dict with keys
    'title', 'url', 'images', or None when the page embeds no gallery.
    """
    soup = BeautifulSoup(html, 'lxml')
    result = soup.select('title')
    title = result[0].get_text() if result else ''
    # raw string: the pattern contains escaped parentheses
    images_pattern = re.compile(r'gallery: JSON.parse\("(.*)"\)', re.S)
    result = re.search(images_pattern, html)
    if result:
        # the embedded JSON is backslash-escaped inside the JS string literal
        data = json.loads(result.group(1).replace('\\', ''))
        if data and 'sub_images' in data.keys():
            sub_images = data.get('sub_images')
            # fix: the comprehension was missing its closing parenthesis
            images = [item.get('url') for item in sub_images]
            for image in images:
                download_image(image)
            return {
                'title': title,
                'url': url,
                'images': images,
            }
 
 
def save_to_mongo(result):
    """Insert one crawl result into MongoDB; return True on success."""
    # insert_one replaces the deprecated Collection.insert (removed in PyMongo 4)
    if db[MONGO_TABLE].insert_one(result):
        print('Successfully Saved to Mongo', result)
        return True
    return False
 
 
def main(offset):
    """Crawl one index page at *offset* and persist every article found."""
    index_text = get_page_index(offset, KEYWORD)
    for article_url in parse_page_index(index_text):
        detail_html = get_page_detail(article_url)
        record = parse_page_detail(detail_html, article_url)
        if record:
            save_to_mongo(record)
 
 
if __name__ == '__main__':
    pool = Pool()
    # fix: the comprehension was missing its loop variable 'x'
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool.map(main, groups)
    pool.close()
    pool.join()

拉勾网自动投递简历

  1. import requests
  2. import re
  3.  
  4. # 1、============================================认证流程
  5. session = requests.session()
  6. # 第一步:
  7. # 请求的URL:https://passport.lagou.com/login/login.html,
  8. # 请求的方法GET,
  9. # 请求头只包含User-agent
  10.  
  11. r1 = session.get('https://passport.lagou.com/login/login.html',
  12. headers={
  13. 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
  14. },
  15. )
  16.  
  17. X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
  18. X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]
  19. # print(X_Anti_Forge_Code)
  20. # print(X_Anti_Forge_Token)
  21.  
  22. # 第二步:
  23. # 1、请求的URL:https://passport.lagou.com/login/login.json,
  24. # 2、请求方法POST,
  25. # 3、请求头:
  26. # Referer:https://passport.lagou.com/login/login.html
  27. # User-Agent:
  28. # X-Anit-Forge-Code
  29. # X-Anit-Forge-Token
  30. # X-Requested-With
  31. # 4、请求体:
  32. # isValidate:true
  33. # username:1111111111
  34. # password:70621c64832c4d4d66a47be6150b4a8e #代表明文密码alex3714
  35. session.post('https://passport.lagou.com/login/login.json',
  36. headers={
  37. 'Referer': 'https://passport.lagou.com/login/login.html',
  38. 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
  39. 'X-Anit-Forge-Code': X_Anti_Forge_Code,
  40. 'X-Anit-Forge-Token': X_Anti_Forge_Token,
  41. 'X-Requested-With': 'XMLHttpRequest'
  42. },
  43. data={
  44. 'isValidate': True,
  45. 'username': '',
  46. 'password': '70621c64832c4d4d66a47be6150b4a8e'
  47. }
  48. )
  49.  
  50. # 第三:
  51. # 1、请求的URL:https://passport.lagou.com/grantServiceTicket/grant.html,
  52. # 2、请求方法GET,
  53. # 3、请求头:
  54. # Referer:https://passport.lagou.com/login/login.html
  55. # User-Agent:
  56.  
  57. session.get('https://passport.lagou.com/grantServiceTicket/grant.html',
  58. headers={
  59. 'Referer': 'https://passport.lagou.com/login/login.html',
  60. 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
  61. }
  62. )
  63.  
  64. # 验证
  65. response = session.get('https://www.lagou.com/resume/myresume.html',
  66. headers={
  67. 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
  68. }
  69. )
  70.  
  71. # print('18611453110' in response.text)
  72.  
  73. # 2、============================================爬取职位信息
  74. # 1、请求的url:https://www.lagou.com/jobs/positionAjax.json
  75. # 2、请求的方式:POST
  76. # 请求参数:
  77. # gj:3年及以下
  78. # xl:不要求
  79. # jd:不需要融资
  80. # hy:移动互联网
  81. # px:default
  82. # yx:15k-25k
  83. # city:全国
  84. # 3、请求头:
  85. # User-Agent
  86. # Referer:https://www.lagou.com/jobs/list_%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD
  87. # X-Anit-Forge-Code:0
  88. # X-Anit-Forge-Token:None
  89. # X-Requested-With:XMLHttpRequest
  90.  
  91. # 4、请求体:
  92. # first:true
  93. # pn:1
  94. # kd:python数据分析
  95.  
  96. from urllib.parse import urlencode
  97.  
  98. params = {'kw': 'python数据分析'}
  99. res = urlencode(params).split('=')[-1]
  100. url = 'https://www.lagou.com/jobs/list_' + res
  101. # print(url)
  102.  
  103. response = session.post('https://www.lagou.com/jobs/positionAjax.json',
  104. params={
  105. # 'gj': '3年及以下',
  106. # 'xl': '不要求',
  107. # 'jd': '不需要融资',
  108. # 'hy': '移动互联网',
  109. 'px': 'default',
  110. 'yx': '15k-25k',
  111. 'city': '北京',
  112. 'district': '海淀区',
  113.  
  114. },
  115. headers={
  116. 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
  117. 'Referer': url,
  118.  
  119. })
  120.  
  121. # print(response.status_code)
  122. result = response.json()['content']['positionResult']['result']
  123. for comanpy_info in result:
  124. fullname = comanpy_info['companyFullName']
  125. emp_num = comanpy_info['companySize']
  126. salary = comanpy_info['salary']
  127. workyear = comanpy_info['workYear']
  128. positionName = comanpy_info['positionName']
  129. positionId = comanpy_info['positionId']
  130. detail_url = 'https://www.lagou.com/jobs/%s.html' % (positionId)
  131.  
  132. print(detail_url)
  133. print(fullname)
  134. print(emp_num)
  135. print(salary)
  136. print(workyear)
  137. print(positionName)
  138. print(positionId)
  139. print()
  140.  
  141. # 3、============================================爬取职位信息
  142. # 第一步:请求详情页:
  143. # 1、请求的detail_url:https://www.lagou.com/jobs/3984845.html
  144. # 2、请求的方式:GET
  145. # 3、请求头:
  146. # User-Agent
  147. r1 = session.get(detail_url,
  148. headers={
  149. 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
  150. }
  151. )
  152.  
  153. X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
  154. X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]
  155.  
  156. # 第二步:投递简历
  157. # 1、请求的url:https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json
  158. # 2、请求的方式:POST
  159. # 3、请求头:
  160. # User-Agent
  161. # Referer:detail_url
  162. # X-Anit-Forge-Code:31832262
  163. # X-Anit-Forge-Token:9ee8b4bc-7107-49a0-a205-cedd7e77c2d7
  164. # X-Requested-With:XMLHttpRequest
  165.  
  166. # 4、请求体:
  167. # 'positionId':3984845
  168. # 'type':1
  169. # 'force':True
  170.  
  171. session.post('https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json',
  172. headers={
  173. 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
  174. 'Referer': detail_url,
  175. 'X-Anit-Forge-Code': X_Anti_Forge_Code,
  176. 'X-Anit-Forge-Token': X_Anti_Forge_Token,
  177. 'X-Requested-With': 'XMLHttpRequest'
  178. },
  179. data={
  180. 'positionId': positionId,
  181. 'type': 1,
  182. 'force': True
  183. }
  184.  
  185. )
  186.  
  187. print('投递成功',detail_url)
  188.  
  189. lagou
  1. import requests
  2. import re
  3.  
  4. # 1、============================================认证流程
  5. session = requests.session()
  6. # 第一步:
  7. # 请求的URL:https://passport.lagou.com/login/login.html,
  8. # 请求的方法GET,
  9. # 请求头只包含User-agent
  10.  
  11. r1 = session.get('https://passport.lagou.com/login/login.html',
  12. headers={
  13. 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
  14. },
  15. )
  16.  
  17. X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
  18. X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]
  19. # print(X_Anti_Forge_Code)
  20. # print(X_Anti_Forge_Token)
  21.  
  22. # 第二步:
  23. # 1、请求的URL:https://passport.lagou.com/login/login.json,
  24. # 2、请求方法POST,
  25. # 3、请求头:
  26. # Referer:https://passport.lagou.com/login/login.html
  27. # User-Agent:
  28. # X-Anit-Forge-Code
  29. # X-Anit-Forge-Token
  30. # X-Requested-With
  31. # 4、请求体:
  32. # isValidate:true
  33. # username:1111111111
  34. # password:70621c64832c4d4d66a47be6150b4a8e #代表明文密码alex3714
  35. session.post('https://passport.lagou.com/login/login.json',
  36. headers={
  37. 'Referer': 'https://passport.lagou.com/login/login.html',
  38. 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
  39. 'X-Anit-Forge-Code': X_Anti_Forge_Code,
  40. 'X-Anit-Forge-Token': X_Anti_Forge_Token,
  41. 'X-Requested-With': 'XMLHttpRequest'
  42. },
  43. data={
  44. 'isValidate': True,
  45. 'username': '18611453110',
  46. 'password': '70621c64832c4d4d66a47be6150b4a8e'
  47. }
  48. )
  49.  
  50. # 第三:
  51. # 1、请求的URL:https://passport.lagou.com/grantServiceTicket/grant.html,
  52. # 2、请求方法GET,
  53. # 3、请求头:
  54. # Referer:https://passport.lagou.com/login/login.html
  55. # User-Agent:
  56.  
  57. session.get('https://passport.lagou.com/grantServiceTicket/grant.html',
  58. headers={
  59. 'Referer': 'https://passport.lagou.com/login/login.html',
  60. 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
  61. }
  62. )
  63.  
  64. # 验证
  65. response = session.get('https://www.lagou.com/resume/myresume.html',
  66. headers={
  67. 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
  68. }
  69. )
  70.  
  71. # print('18611453110' in response.text)
  72.  
  73. # 2、============================================爬取职位信息
  74. # 1、请求的url:https://www.lagou.com/jobs/positionAjax.json
  75. # 2、请求的方式:POST
  76. # 请求参数:
  77. # gj:3年及以下
  78. # xl:不要求
  79. # jd:不需要融资
  80. # hy:移动互联网
  81. # px:default
  82. # yx:15k-25k
  83. # city:全国
  84. # 3、请求头:
  85. # User-Agent
  86. # Referer:https://www.lagou.com/jobs/list_%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD
  87. # X-Anit-Forge-Code:0
  88. # X-Anit-Forge-Token:None
  89. # X-Requested-With:XMLHttpRequest
  90.  
  91. # 4、请求体:
  92. # first:true
  93. # pn:1
  94. # kd:python数据分析
  95.  
  96. from urllib.parse import urlencode
  97.  
  98. params = {'kw': 'python数据分析'}
  99. res = urlencode(params).split('=')[-1]
  100. url = 'https://www.lagou.com/jobs/list_' + res
  101. # print(url)
  102.  
  103. response = session.post('https://www.lagou.com/jobs/positionAjax.json',
  104. params={
  105. # 'gj': '3年及以下',
  106. # 'xl': '不要求',
  107. # 'jd': '不需要融资',
  108. # 'hy': '移动互联网',
  109. 'px': 'default',
  110. 'yx': '15k-25k',
  111. 'city': '北京',
  112. 'district': '海淀区',
  113.  
  114. },
  115. headers={
  116. 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
  117. 'Referer': url,
  118.  
  119. })
  120.  
  121. # print(response.status_code)
  122. result = response.json()['content']['positionResult']['result']
  123. for comanpy_info in result:
  124. fullname = comanpy_info['companyFullName']
  125. emp_num = comanpy_info['companySize']
  126. salary = comanpy_info['salary']
  127. workyear = comanpy_info['workYear']
  128. positionName = comanpy_info['positionName']
  129. positionId = comanpy_info['positionId']
  130. detail_url = 'https://www.lagou.com/jobs/%s.html' % (positionId)
  131.  
  132. print(detail_url)
  133. print(fullname)
  134. print(emp_num)
  135. print(salary)
  136. print(workyear)
  137. print(positionName)
  138. print(positionId)
  139. print()
  140.  
  141. # 3、============================================爬取职位信息
  142. # 第一步:请求详情页:
  143. # 1、请求的detail_url:https://www.lagou.com/jobs/3984845.html
  144. # 2、请求的方式:GET
  145. # 3、请求头:
  146. # User-Agent
  147. r1 = session.get(detail_url,
  148. headers={
  149. 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
  150. }
  151. )
  152.  
  153. X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
  154. X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]
  155.  
  156. # 第二步:投递简历
  157. # 1、请求的url:https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json
  158. # 2、请求的方式:POST
  159. # 3、请求头:
  160. # User-Agent
  161. # Referer:detail_url
  162. # X-Anit-Forge-Code:31832262
  163. # X-Anit-Forge-Token:9ee8b4bc-7107-49a0-a205-cedd7e77c2d7
  164. # X-Requested-With:XMLHttpRequest
  165.  
  166. # 4、请求体:
  167. # 'positionId':3984845
  168. # 'type':1
  169. # 'force':True
  170.  
  171. session.post('https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json',
  172. headers={
  173. 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
  174. 'Referer': detail_url,
  175. 'X-Anit-Forge-Code': X_Anti_Forge_Code,
  176. 'X-Anit-Forge-Token': X_Anti_Forge_Token,
  177. 'X-Requested-With': 'XMLHttpRequest'
  178. },
  179. data={
  180. 'positionId': positionId,
  181. 'type': 1,
  182. 'force': True
  183. }
  184.  
  185. )
  186.  
  187. print('投递成功',detail_url)
  1. import requests
  2. import re
  3.  
  4. # 1、============================================认证流程
  5. session = requests.session()
  6. # 第一步:
  7. # 请求的URL:https://passport.lagou.com/login/login.html,
  8. # 请求的方法GET,
  9. # 请求头只包含User-agent
  10.  
  11. r1 = session.get('https://passport.lagou.com/login/login.html',
  12. headers={
  13. 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
  14. },
  15. )
  16.  
  17. X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
  18. X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]
  19. # print(X_Anti_Forge_Code)
  20. # print(X_Anti_Forge_Token)
  21.  
  22. # 第二步:
  23. # 1、请求的URL:https://passport.lagou.com/login/login.json,
  24. # 2、请求方法POST,
  25. # 3、请求头:
  26. # Referer:https://passport.lagou.com/login/login.html
  27. # User-Agent:
  28. # X-Anit-Forge-Code
  29. # X-Anit-Forge-Token
  30. # X-Requested-With
  31. # 4、请求体:
  32. # isValidate:true
  33. # username:1111111111
  34. # password:70621c64832c4d4d66a47be6150b4a8e #代表明文密码alex3714
  35. session.post('https://passport.lagou.com/login/login.json',
  36. headers={
  37. 'Referer': 'https://passport.lagou.com/login/login.html',
  38. 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
  39. 'X-Anit-Forge-Code': X_Anti_Forge_Code,
  40. 'X-Anit-Forge-Token': X_Anti_Forge_Token,
  41. 'X-Requested-With': 'XMLHttpRequest'
  42. },
  43. data={
  44. 'isValidate': True,
  45. 'username': '',
  46. 'password': '70621c64832c4d4d66a47be6150b4a8e'
  47. }
  48. )
  49.  
  50. # 第三:
  51. # 1、请求的URL:https://passport.lagou.com/grantServiceTicket/grant.html,
  52. # 2、请求方法GET,
  53. # 3、请求头:
  54. # Referer:https://passport.lagou.com/login/login.html
  55. # User-Agent:
  56.  
  57. session.get('https://passport.lagou.com/grantServiceTicket/grant.html',
  58. headers={
  59. 'Referer': 'https://passport.lagou.com/login/login.html',
  60. 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
  61. }
  62. )
  63.  
  64. # 验证
  65. response = session.get('https://www.lagou.com/resume/myresume.html',
  66. headers={
  67. 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
  68. }
  69. )
  70.  
  71. # print('18611453110' in response.text)
  72.  
  73. # 2、============================================爬取职位信息
  74. # 1、请求的url:https://www.lagou.com/jobs/positionAjax.json
  75. # 2、请求的方式:POST
  76. # 请求参数:
  77. # gj:3年及以下
  78. # xl:不要求
  79. # jd:不需要融资
  80. # hy:移动互联网
  81. # px:default
  82. # yx:15k-25k
  83. # city:全国
  84. # 3、请求头:
  85. # User-Agent
  86. # Referer:https://www.lagou.com/jobs/list_%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD
  87. # X-Anit-Forge-Code:0
  88. # X-Anit-Forge-Token:None
  89. # X-Requested-With:XMLHttpRequest
  90.  
  91. # 4、请求体:
  92. # first:true
  93. # pn:1
  94. # kd:python数据分析
  95.  
  96. from urllib.parse import urlencode
  97.  
  98. params = {'kw': 'python数据分析'}
  99. res = urlencode(params).split('=')[-1]
  100. url = 'https://www.lagou.com/jobs/list_' + res
  101. # print(url)
  102.  
  103. response = session.post('https://www.lagou.com/jobs/positionAjax.json',
  104. params={
  105. # 'gj': '3年及以下',
  106. # 'xl': '不要求',
  107. # 'jd': '不需要融资',
  108. # 'hy': '移动互联网',
  109. 'px': 'default',
  110. 'yx': '15k-25k',
  111. 'city': '北京',
  112. 'district': '海淀区',
  113.  
  114. },
  115. headers={
  116. 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
  117. 'Referer': url,
  118.  
  119. })
  120.  
  121. # print(response.status_code)
  122. result = response.json()['content']['positionResult']['result']
  123. for comanpy_info in result:
  124. fullname = comanpy_info['companyFullName']
  125. emp_num = comanpy_info['companySize']
  126. salary = comanpy_info['salary']
  127. workyear = comanpy_info['workYear']
  128. positionName = comanpy_info['positionName']
  129. positionId = comanpy_info['positionId']
  130. detail_url = 'https://www.lagou.com/jobs/%s.html' % (positionId)
  131.  
  132. print(detail_url)
  133. print(fullname)
  134. print(emp_num)
  135. print(salary)
  136. print(workyear)
  137. print(positionName)
  138. print(positionId)
  139. print()
  140.  
  141. # 3、============================================爬取职位信息
  142. # 第一步:请求详情页:
  143. # 1、请求的detail_url:https://www.lagou.com/jobs/3984845.html
  144. # 2、请求的方式:GET
  145. # 3、请求头:
  146. # User-Agent
  147. r1 = session.get(detail_url,
  148. headers={
  149. 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
  150. }
  151. )
  152.  
  153. X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
  154. X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]
  155.  
  156. # 第二步:投递简历
  157. # 1、请求的url:https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json
  158. # 2、请求的方式:POST
  159. # 3、请求头:
  160. # User-Agent
  161. # Referer:detail_url
  162. # X-Anit-Forge-Code:31832262
  163. # X-Anit-Forge-Token:9ee8b4bc-7107-49a0-a205-cedd7e77c2d7
  164. # X-Requested-With:XMLHttpRequest
  165.  
  166. # 4、请求体:
  167. # 'positionId':3984845
  168. # 'type':1
  169. # 'force':True
  170.  
  171. session.post('https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json',
  172. headers={
  173. 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
  174. 'Referer': detail_url,
  175. 'X-Anit-Forge-Code': X_Anti_Forge_Code,
  176. 'X-Anit-Forge-Token': X_Anti_Forge_Token,
  177. 'X-Requested-With': 'XMLHttpRequest'
  178. },
  179. data={
  180. 'positionId': positionId,
  181. 'type': 1,
  182. 'force': True
  183. }
  184.  
  185. )
  186.  
  187. print('投递成功',detail_url)
  188.  
  189. lagou

爬虫(猫眼电影+校花网+github+今日头条+拉钩)的更多相关文章

  1. Python-爬取校花网视频(单线程和多线程版本)

    一.参考文章 python爬虫爬取校花网视频,单线程爬取 爬虫----爬取校花网视频,包含多线程版本 上述两篇文章都是对校花网视频的爬取,由于时间相隔很久了,校花网上的一些视频已经不存在了,因此上述文 ...

  2. Python 爬虫 爬校花网!!

    爬虫:是一种按照一定的规则,自动地抓取万维网信息的程序或者脚本 1.福利来了  校花网 ,首先说为什么要爬这个网站呢,第一这个网站简单爬起来容易不会受到打击,第二呢 你懂得... 1.第一步,需要下载 ...

  3. python爬虫基础应用----爬取校花网视频

    一.爬虫简单介绍 爬虫是什么? 爬虫是首先使用模拟浏览器访问网站获取数据,然后通过解析过滤获得有价值的信息,最后保存到到自己库中的程序. 爬虫程序包括哪些模块? python中的爬虫程序主要包括,re ...

  4. Python之爬虫-校花网

    Python之爬虫-校花网 #!/usr/bin/env python # -*- coding:utf-8 -*- import re import requests # 拿到校花网主页的内容 re ...

  5. Python 爬虫 校花网

    爬虫:是一种按照一定的规则,自动地抓取万维网信息的程序或者脚本. 福利来了  校花网 ,首先说为什么要爬这个网站呢,第一这个网站简单爬起来容易,不会受到打击,第二呢 你懂得.... 1.第一步,需要下 ...

  6. Go语言实战-爬取校花网图片

    一.目标网站分析 爬取校花网http://www.xiaohuar.com/大学校花所有图片. 经过分析,所有图片分为四个页面,http://www.xiaohuar.com/list-1-0.htm ...

  7. day1之校花网小试牛刀

    一 利用生成器来完成爬去校花网视频 import requests import re import os import hashlib import time DOWLOAD_PATH=r'D:\D ...

  8. Python之爬虫-猫眼电影

    Python之爬虫-猫眼电影 #!/usr/bin/env python # coding: utf-8 import json import requests import re import ti ...

  9. python实战项目 — 爬取 校花网图片

    重点: 1.  指定路径创建文件夹,判断是否存在 2. 保存图片文件 # 获得校花网的地址,图片的链接 import re import requests import time import os ...

随机推荐

  1. Nginx系列3:用Nginx搭建一个具备缓存功能的反向代理服务

    反向代理的理解:https://www.cnblogs.com/zkfopen/p/10126105.html 我是在一台linux服务器上搭建了两个nginx服务器A和B,把静态资源文件甲放在A服务 ...

  2. zookeeperCli和Java操作zookeeperAPI

    推荐一个zookeeper可视化工具:zktools.exe eclipse集成的工具: http://www.massedynamic.org/eclipse/updates/ 1.zkCli客户端 ...

  3. 非极大值抑制(NMS)的几种实现

    因为之前对比了RoI pooling的几种实现,发现python.pytorch的自带工具函数速度确实很慢,所以这里再对Faster-RCNN中另一个速度瓶颈NMS做一个简单对比试验. 这里做了四组对 ...

  4. asyncio之asyncio.run

    asyncio.run(coro, *, debug=False) python3.7加入,只能作为asyncio的入口函数.

  5. Windows PowerShell 入門(5)-制御構文

    Windows PowerShellにおける制御構文について学びます.数ある制御構文の中でもSwitch文は.他の言語に比べ豊富な機能が用意されています. 対象読者 Windows PowerShel ...

  6. u3d常用代码小集合

    01.基本碰撞检测代码 function OnCollisionEnter(theCollision : Collision){ if(theCollision.gameObject.name == ...

  7. linux批量替换文件内容3种方法(perl,sed,shell)

    方法1:perl   这两天在构建一个应用的使用用到了maven,由于project很大,足足有700多个 pom.xml文件,更郁闷的是在很多pom.xml文件里都单独指定了资源库的url,我需要把 ...

  8. 日期控件datepicker的使用

    引入JS: <script type="text/javascript" src="static/my/js/bootstrap-datepicker.min.js ...

  9. 解决ssh登陆很慢的问题

    解决步骤: 先备份/etc/ssh/sshd_config,备份命令为 cp /etc/ssh/sshd_config /etc/ssh/sshd_config.bak 1.su (以root用户登录 ...

  10. LabVIEW---vi图标和符号的制作

    前言: 使用图形化设计语言进行开发时候,为VI添加说明的一个重要的方法是为其建立一个形象的图标,每个VI都在前面板后程序框图的右上角有一个图标,它是VI的图形化表示.如果VI当作子VI调用,该图标就会 ...