一 requests模块
- 概念:
- python中原生的基于网络请求的模块,模拟浏览器进行请求发送,获取页面数据
- 安装: pip install requests
二 requests使用的步骤
- 1 指定url
- 2 基于requests模块请求发送
- 3 获取响应对象中的数据值(text)
- 4 持久化存储
三 反反爬
- 1 设置ip
- 2 设置UA
import requests

# Anti-anti-crawling example: search Sogou for a user-supplied word while
# sending a browser-like User-Agent and a proxy mapping, then save the page.

# The word typed by the user becomes the `query` parameter of the search URL.
word = input('请你输入你要查的词')
url = 'https://www.sogou.com/web?'
params = {
    'query': word,
}
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
}

# UA and IP: the request headers must be passed through the `headers`
# keyword; any other spelling is rejected by requests as an unexpected
# keyword argument.
# NOTE(review): the proxy address is an empty string here, presumably a
# placeholder to be replaced with a real proxy -- confirm.
response = requests.get(url=url, params=params, headers=headers, proxies={'https': ''})
page_tail = response.text

# Persist the result page under "<word>.html".
filename = word + '.html'
with open(filename, 'w', encoding='utf-8') as f:
    f.write(page_tail)
四 示例
import requests

# Minimal example of the four steps: fetch the Sogou home page and save it.

# 1. Specify the URL.
url = 'https://www.sogou.com/'
# 2. Send the request with the requests module.
response = requests.get(url=url)
# 3. Read the data carried by the response object.
page_text = response.text
# 4. Persist the data to disk.
with open('./sogou.html', 'w', encoding='utf-8') as f:
    f.write(page_text)
注意: 对于上面的代码
response.content 返回二进制的页面数据
response.headers 返回响应头信息
response.status_code 返回响应状态码(例如 200)
response.url 返回是地址
response.encoding 返回的是响应对象中存储数据的原始编码格式
import requests

# GET request with query parameters: search Sogou for a user-supplied word
# and save the result page as "<word>.html".

word = input('请你输入你要查的词')
url = 'https://www.sogou.com/web'
# requests encodes this mapping into the URL's query string.
param = {
    'query': word,
}

response = requests.get(url=url, params=param)
page_text = response.text

filename = word + '.html'
with open(filename, 'w', encoding='utf-8') as f:
    f.write(page_text)
# Follow the same steps described above, this time with a POST request.
import requests

url = 'https://www.douban.com/accounts/login'
# Form fields sent in the POST body; look them up in the browser.
data = {
    "source": "index_nav",
    "form_email": "xxxxxxxxx",
    "form_password": "xxxxxxxxx",
}

response = requests.post(url=url, data=data)
page_text = response.text

# Persist the returned page.
with open('douban.html', 'w', encoding='utf-8') as f:
    f.write(page_text)
基于requests模块ajax的get请求-------爬取豆瓣电影分类排行榜 https://movie.douban.com/中的电影详情数据
import requests

# Ajax GET request against the Douban movie chart endpoint.

url = 'https://movie.douban.com/j/chart/top_list?'
# Query parameters carried by the request.
# NOTE(review): every value except `interval_id` is left blank here;
# presumably they are meant to be filled in from the request captured in
# the browser -- confirm before running.
param = {
    'type': '',
    'interval_id': '100:90',
    'action': '',
    'start': '',
    'limit': '',
}

response = requests.get(url=url, params=param)
import requests

# POST request against the KFC store-list endpoint, searching by keyword.

url = ' http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
city = input('请输入你要查的城市')
# Form fields sent in the POST body; the user's input goes into `keyword`.
# NOTE(review): the remaining fields are left blank here; presumably they
# are meant to be filled in from the captured request -- confirm.
data = {
    'cname': '',
    'pid': '',
    'keyword': city,
    'pageIndex': '',
    'pageSize': '',
}

response = requests.post(url=url, data=data)
import os

import requests

# Download a range of cnblogs pages with GET requests and save each one as
# ./boke/<page>.html.

url = 'https://www.cnblogs.com/#p'

# Create the output directory if needed; exist_ok avoids the separate
# existence check.
os.makedirs('boke', exist_ok=True)

start_page = int(input('enter a start page:'))
end_page = int(input('enter a end page:'))

for page in range(start_page, end_page + 1):
    # Build each page URL from the unchanged base URL. Appending to `url`
    # itself would accumulate the page numbers across iterations.
    page_url = url + str(page)
    response = requests.get(url=page_url, proxies={'https': ''})
    page_text = response.text

    file_name = str(page) + '.html'
    file_path = './boke/' + file_name
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(page_text)
    print('第%s页打印' % page)

# In practice the HTML saved by this script is the same for every page (the
# content of the first page).
# Capturing the traffic in the browser shows that going to the second page
# actually sends a POST request.
import os

import requests

# Download a range of cnblogs post-list pages through the POST endpoint and
# save each one as ./boke/<page>.html.

url = "http://www.cnblogs.com/mvc/AggSite/PostList.aspx"  # url

# Create the output directory if needed; exist_ok avoids the separate
# existence check.
os.makedirs('boke', exist_ok=True)

start_page = int(input('enter a start page:'))
end_page = int(input('enter a end page:'))

for page in range(start_page, end_page + 1):
    # Form fields of the POST request; only PageIndex changes per iteration.
    data = {
        "CategoryType": "SiteHome",
        "ParentCategoryId": 0,
        "CategoryId": 808,
        "PageIndex": page,
        "TotalPostCount": 4000,
        "ItemListActionName": "PostList",
    }
    # NOTE(review): verify=False turns off TLS certificate verification;
    # confirm it is still needed for this endpoint.
    res = requests.post(url=url, data=data, verify=False)
    page_text = res.text

    file_name = str(page) + '.html'
    file_path = './boke/' + file_name
    # Write as UTF-8 so that any character in the decoded text can be saved;
    # a narrower codec raises UnicodeEncodeError on characters it lacks.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(page_text)
    print('第%s页打印' % page)
