requests高级用法、代理池搭建

requests高级用法

1.自动携带cookie的session对象

# Session object: after logging in to a site through it, the session keeps
# the cookies and sends them automatically on later requests. The cookies
# can also be extracted separately if needed.
import requests

login_headers = {
    'Referer': 'http://www.aa7a.cn/user.php?&ref=http%3A%2F%2Fwww.aa7a.cn%2F',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
}

login_form = {
    'username': '用户名',
    'password': '密码',
    'captcha': 1111,
    'remember': 1,
    'ref': 'http://www.aa7a.cn/',
    'act': 'act_login',
}

# Send every request through one session so cookies are shared between them.
http_session = requests.Session()
login_resp = http_session.post(
    'http://www.aa7a.cn/user.php',
    data=login_form,
    headers=login_headers,
)

# Cookies set by the login response, converted to a plain dict.
cookie_dict = login_resp.cookies.get_dict()
print(type(cookie_dict))

# No need to pass the cookies explicitly: the session carries them.
home_resp = http_session.get('http://www.aa7a.cn/')
print('用户名' in home_resp.text)

2.响应response

# The HTTP response is the object returned by requests; everything that
# came back from the server is available on it.
response = requests.get('http://www.aa7a.cn/')
# print(type(response))  # requests.models.Response
from requests.models import Response

print(response.text)  # body decoded to str; may be garbled if the encoding is detected wrongly
print(response.content)  # body as raw bytes
print(response.status_code)  # HTTP status code
print(response.headers)  # response headers
print(response.cookies)  # cookies (cookie jar object)
print(response.cookies.get_dict())  # cookies converted to a dict
print(response.cookies.items())  # cookies as key/value pairs
print(response.url)  # URL of the request
# If the request was redirected, requests follows the redirect
# automatically; `history` lists the responses from before the redirect.
print(response.history)
print(response.encoding)  # encoding used to decode the page
# To close the connection: response.close()
response.iter_content()  # yields the body piece by piece; used for downloading images, videos, etc.

# --- Download an image / a video to a local file ---
# Small file: writing res.content in one go is fine.
# res = requests.get('https://pics0.baidu.com/feed/bd3eb13533fa828bf4e9977c327a1138950a5adc.jpeg')
# with open('1.png','wb') as f:
#     f.write(res.content)

# Large file: stream=True defers downloading the body until it is iterated,
# so iter_content really reads it chunk by chunk instead of loading the whole
# video into memory first. The `with` block releases the connection afterwards.
with requests.get(
    'https://vd2.bdstatic.com/mda-pg6acnyf44f4dkdg/720p/h264/1688715361322809563/mda-pg6acnyf44f4dkdg.mp4',
    stream=True,
) as res, open('xsp.mp4', 'wb') as f:
    for chunk in res.iter_content(chunk_size=1024):
        f.write(chunk)

# --- Encoding ---
# res.text is a str, but data arrives from the network as bytes, so decoding
# is involved. requests detects the page encoding automatically; it can be
# overridden by hand when the detection is wrong:
# res.encoding='gbk'  # set the encoding manually
# print(res.text)

3.解析json

# A response body may be HTML or JSON (front/back-end separated projects
# return JSON); .json() converts a JSON body into Python objects directly.
#
# Equivalent request with a pre-encoded form body:
# res=requests.post('http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword',data='cname=&pid=&keyword=%E5%91%A8%E6%B5%A6&pageIndex=1&pageSize=10',headers={
#     'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8'
# })
search_form = {
    'cname': '',
    'pid': '',
    'keyword': '周浦',
    'pageIndex': 1,
    'pageSize': 10,
}
store_resp = requests.post(
    'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword',
    data=search_form,
)

# 'Table1' holds the list of matching stores in the decoded JSON.
stores = store_resp.json()['Table1']
for store in stores:
    name = store['storeName']
    address = store['addressDetail']
    print('餐厅名字：%s，餐厅地址：%s' % (name, address))

4.发起https请求

# `verify` controls certificate validation for HTTPS requests; passing
# False skips the validation for this call.
article_url = 'https://www.cnblogs.com/XxMa/p/17515369.html'
page = requests.get(article_url, verify=False)
print(page.text)

5.使用代理

# Hitting a site too frequently gets the client IP banned; going through a
# proxy means the proxy's IP is the one that gets banned instead.
import requests

# requests picks the proxy by the scheme of the *target* URL, so an entry is
# needed for 'https' as well as 'http'; otherwise the https request below
# would bypass the proxy. Proxy URLs must include their own scheme.
proxies = {
    'http': 'http://104.193.88.77:80',
    'https': 'http://104.193.88.77:80',
}
response = requests.get('https://www.cnblogs.com/XxMa/p/17515369.html', proxies=proxies)
print(response)

6.超时设置

# Give up with a timeout exception if the server takes longer than one second.
timeout_seconds = 1
response = requests.get('https://www.baidu.com', timeout=timeout_seconds)
print(response)

7.异常处理

from requests.exceptions import * #可以查看requests.exceptions获取异常类型

# Handlers are ordered from most specific to most general.
try:
    # Absurdly small timeout so the request is guaranteed to time out.
    r = requests.get('http://www.baidu.com', timeout=0.00001)
except Timeout:
    # Timeout is the common base of ConnectTimeout and ReadTimeout. With a
    # timeout this small the failure usually happens while connecting, which
    # a ReadTimeout-only handler would not catch.
    print('===')
except RequestException:
    # Any other error raised by requests.
    print('Error')
except Exception as e:
    # Anything not originating from requests.
    print(e)

8.上传文件

# Upload a file as multipart/form-data under the field name 'myfile'.
# The `with` block makes sure the file handle is closed once the request has
# been sent, instead of leaking it.
with open('1.png', 'rb') as upload_file:
    files = {'myfile': upload_file}
    response = requests.post('http://127.0.0.1:8000/upload/', files=files)
print(response.status_code)

9.认证登录

# Authenticated request: `auth` takes a (username, password) tuple.
# 'xxx' is a placeholder for the real URL.
credentials = ('user', 'password')
response = requests.get('xxx', auth=credentials)
print(response.status_code)

代理池搭建

# 搭建一个代理池---》每次可以从池中随机取出一个代理---》发送请求

# 公司内部要用，会花钱买

# 咱们自己用，基于网上的开源软件，自己搭建

##  开源的代理池核心原理：https://github.com/jhao104/proxy_pool

	1 使用爬虫技术，爬取网上免费的代理

    2 爬完回来做验证，如果能用，存到redis中

    # 启动调度程序，爬代理，验证，存到redis中

	python proxyPool.py schedule

    3 使用flask启动服务，对外开放了几个接口，向某个接口发请求，就能随机获取一个代理

    # 启动webApi服务

    python proxyPool.py server

#搭建步骤:

	1 从git拉取开源代码

    	git clone https://github.com/jhao104/proxy_pool.git

    2 使用pycharm打开，创建虚拟环境

    	mkvirtualenv -p python3 pool

    3 配置项目使用虚拟环境

    4 修改项目配置文件

    	 DB_CONN = 'redis://127.0.0.1:6379/2'

		HTTP_URL = "http://www.baidu.com"

		HTTPS_URL = "https://www.baidu.com"

    5 启动调度程序---》爬取网站，验证，存到redis

    	python proxyPool.py schedule

    6 启动web程序（flask写的）

    	python proxyPool.py server

    7 向http://192.168.1.252:5010/get/?type=http 地址发送请求就可以随机获取代理ip

使用代理池

import requests

# Ask the proxy-pool web API for a random http proxy; the JSON reply carries
# the proxy address under the 'proxy' key.
pool_reply = requests.get('http://192.168.1.252:5010/get/?type=http')
proxy_addr = pool_reply.json()['proxy']

proxies = {'http': proxy_addr}
print(proxies)

# The target is plain http, so an http proxy is what is needed.
target_resp = requests.get('http://139.155.203.196:8080/', proxies=proxies)
print(target_resp.text)

爬取某视频网站

# Site: https://www.pearvideo.com/
# Endpoint that loads the next page of the video list:
# https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=1&start=0
import os
import re

import requests

res = requests.get('https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=1&start=0')

# Each match is the relative link of one video page, e.g. 'video_<id>'.
video_list = re.findall('<a href="(.*?)" class="vervideo-lilink actplay">', res.text)
print(video_list)

# Make sure the output directory exists before any file is written into it.
os.makedirs('./video', exist_ok=True)

for video in video_list:
    url = 'https://www.pearvideo.com/' + video
    # The status endpoint is called with the video page as Referer
    # (presumably it rejects requests without one -- confirm against the site).
    header = {
        'Referer': url
    }
    video_id = video.split('_')[-1]
    video_url = 'https://www.pearvideo.com/videoStatus.jsp?contId=%s&mrd=0.8273125965736401' % video_id
    # This request returns JSON that contains the video's source address.
    res1 = requests.get(video_url, headers=header).json()
    real_mp4_url = res1['videoInfo']['videos']['srcUrl']
    # The returned address is not playable as-is: the first '-'-separated part
    # of the file name has to be replaced with 'cont-<video_id>' (see the
    # example addresses at the end of this snippet).
    real_mp4_url = real_mp4_url.replace(real_mp4_url.split('/')[-1].split('-')[0], 'cont-%s' % video_id)
    print(real_mp4_url)

    # stream=True plus an explicit chunk size downloads the file in 64 KiB
    # pieces; the default iter_content() chunk size is a single byte, and
    # without stream=True the whole video is held in memory first.
    with requests.get(real_mp4_url, stream=True) as res2, \
            open('./video/%s.mp4' % video, 'wb') as f:
        for chunk in res2.iter_content(chunk_size=64 * 1024):
            f.write(chunk)

# Address returned by the ajax request vs. an address that actually plays:
# https://video.pearvideo.com/mp4/adshort/20181106/     1688703103822    -13189302_adpkg-ad_hd.mp4  # not playable
# https://video.pearvideo.com/mp4/adshort/20181106/      cont-1470647     -13189302_adpkg-ad_hd.mp4 # playable
# url = 'https://video.pearvideo.com/mp4/adshort/20181106/1688703103822-13189302_adpkg-ad_hd.mp4'

requests高级用法、代理池搭建的更多相关文章

爬虫—Requests高级用法
Requests高级用法 1.文件上传我们知道requests可以模拟提交一些数据.假如有的网站需要上传文件,我们也可以用requests来实现. import requests files = { ...
技术分享：Proxy-Pool代理池搭建IP代理
技术分享:Proxy-Pool代理池搭建IP代理前言本章内容仅供参考,不涉及实际使用,主要使用Python环境和Redis数据库进行环境搭建,工具网盘存储如下,有问题可以私聊我.网址:https:/ ...
python requests 高级用法
高级用法本篇文档涵盖了 Requests 的一些高级特性. 会话对象会话对象让你能够跨请求保持某些参数.它也会在同一个 Session 实例发出的所有请求之间保持 cookie, 期间使用 url ...
Proxypool代理池搭建
个人博客:点我前言项目地址 : https://github.com/jhao104/proxy_pool 这个项目是github上一个大佬基于python爬虫制作的定时获取免费可用代理并入池的代 ...
（转）python requests 高级用法 -- 包括SSL 证书错误的解决方案
我在使用requests访问某个https网站时出现错误 error::SSL routines:SSL3_GET_SERVER_CERTIFICATE:certificate verify fai ...
python requests 高级用法 -- 包括SSL 证书错误的解决方案
Session Objects会话对象 Session对象在请求时允许你坚持一定的参数.此外,还坚持由Session实例的所有请求的cookie. 让我们坚持在请求时使用 s = requests.S ...
requests高级用法
会话对象当你向同一主机发送多个请求时,session会重用底层的tcp连接,从而提升性能,同时session也会为所有请求保持 cookie. # _*_ coding: utf-8 _*_ imp ...
python爬虫redis-ip代理池搭建几十万的ip数据--可以使用
from bs4 import BeautifulSoupimport requests,os,sys,time,random,redisfrom lxml import etreeconn = re ...
爬虫之requests 高级用法
1. 文件上传 import requests files = {'file': open('favicon.ico', 'rb')} r = requests.post("http://h ...
进程线程协程补充、docker-compose一键部署项目、搭建代理池、requests超时设置、认证设置、异常处理、上传文件
今日内容概要补充:进程,线程,协程 docker-compose一键部署演示搭建代理池 requests超时设置 requests认证设置 requests异常处理 requests上传文件内容 ...

随机推荐

LeeCode数组问题（二）
LeeCode 977:有序数组的平方题目描述: 给你一个按非递减顺序排列的整数数组nums,返回每个数字的平方组成的新数组,要求也按非递减顺序排序. 标签:数组,首尾指针,最大值优先时间复杂度: ...
HTML、 input；、accept 属性-规定能够通过文件上传进行提交的文件类型
定义和用法文章地址: http://www.w3school.com.cn/tags/att_input_accept.asp accept 属性规定了可通过文件上传提交的服务器接受的文件类型. 注 ...
.NET Core反射获取带有自定义特性的类，通过依赖注入根据Attribute元数据信息调用对应的方法
前言前段时间有朋友问道一个这样的问题,.NET Core中如何通过Attribute的元数据信息来调用标记的对应方法.我第一时间想到的就是通过C#反射获取带有Custom Attribute标记的类 ...
Python 使用QQ 邮箱进行发送邮件及经验总结
今天我带大家实现下简单的发邮件功能.我们要使用到email和smtplib库,这两个库时python自带的,直接import 引用就好了,实现环境python3.6. 对QQ邮箱进行开启SMTP服务 ...
快速上手Linux核心命令（四）：文件内容相关命令
@ 目录前言 cat 合并文件或查看文件内容 more 分页显示文件内容 less 分页显示文件内容 head 显示文件内容头部 tail 显示文件内容尾部 tailf 跟踪日志文件 diff 比较 ...
error while loading shared libraries: libstdc++.so.6: cannot open shared object file: No such file o
error while loading shared libraries: libstdc++.so.6: cannot open shared object file: No such file o ...
高性能、快响应！火山引擎 ByteHouse 物化视图功能及入门介绍
更多技术交流.求职机会,欢迎关注字节跳动数据平台微信公众号,回复[1]进入官方交流群物化视图是指将视图的计算结果存储在数据库中的一种技术.当用户执行查询时,数据库会直接从已经预计算好的结果中获取数据 ...
Vue2到Vue3的改变
一.Vue2->Vue3 如果有Vue2的基础,并在此基础上学习Vue3,并不需要把完整的官网看完,我们只需要关注一下新功能和非兼容的变化即可进行开发. 二.Vue3变化统一元素上使用的v-i ...
2020-08-31：描述HTTP的版本之间的区别，主要是1.0/1.1/2.0三个版本的区别。
福哥答案2020-08-31: HTTP1.0与HTTP1.1的主要区别 1.长连接HTTP1.0:需要使用keep-alive参数来告知服务器端要建立一个长连接.HTTP1.1:默认支持长连接.2. ...
2020-12-13：用最少数量的线程，每个线程执行for的空循环，把cpu打满了。如果在for的空循环里添加打印输出函数，会把cpu打满吗？为什么？
福哥答案2020-12-13:不会.输出会进行io操作,相对于CPU的速度,这是一个非常缓慢的过程,所以CPU会有机会空闲下来.***[评论](https://user.qzone.qq.com/31 ...

requests高级用法、代理池搭建

requests高级用法

1.自动携带cookie的session对象

2.响应response

3.解析json

4.发起https请求

5.使用代理

6.超时设置

7.异常处理

8.上传文件

9.认证登录

代理池搭建

使用代理池

爬取某视频网站

requests高级用法、代理池搭建的更多相关文章

随机推荐

热门专题