Using the urllib module

urllib.request

urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)

import urllib.request

url = 'http://httpbin.org/ip'
response = urllib.request.urlopen(url)
html = response.read()  # read() returns bytes
print(html)

url = 'http://www.baidu.com'
response = urllib.request.urlopen(url)
html = response.read().decode('utf-8')  # decode() converts the bytes to a str
print(html)

Sending POST data

import urllib.request
import urllib.parse

url = 'http://httpbin.org/post'
data = {
    'name': "小明",
    'age': 30
}
# data = urllib.parse.urlencode(data)  # Error: POST data should be bytes, an iterable of bytes, or a file object. It cannot be of type str
# data = urllib.parse.urlencode(data).encode('utf-8')
data = bytes(urllib.parse.urlencode(data), encoding="utf-8")
response = urllib.request.urlopen(url, data=data)
html = response.read().decode('utf-8')
print(html)

Setting a timeout

import urllib.request

url = 'http://httpbin.org/get'
response = urllib.request.urlopen(url, timeout=1)
html = response.read().decode('utf-8')
print(html)
import socket
import urllib.request
import urllib.error

url = 'http://httpbin.org/get'
try:
    response = urllib.request.urlopen(url, timeout=0.1)
    html = response.read().decode('utf-8')
    print(html)
except urllib.error.URLError as e:
    print("caught an exception....")
    print(e.reason)
    if isinstance(e.reason, socket.timeout):
        print("request timed out")

Responses

Response type, status code, response headers, and the URL actually fetched

import urllib.request

url = 'http://www.python.org'
response = urllib.request.urlopen(url)
# response type
response_type = type(response)
print(response_type)  # <class 'http.client.HTTPResponse'>
# status code
status_code = response.getcode()
print(status_code)
# reason phrase for the status code
status = response.reason
print(status)  # e.g. 200 -> OK, 404 -> Not Found
# response headers
response_headers = response.getheaders()  # returns a list of (name, value) tuples
print(response_headers)
server_type = response.getheader('Server')  # getheader() fetches one specific header
print(server_type)
print(type(response.headers))  # <class 'http.client.HTTPMessage'>
content_type = response.headers['Content-Type']  # get the Content-Type
print(content_type)
# the URL actually fetched; useful for checking whether a redirect happened
actual_url = response.geturl()
print(actual_url)

class urllib.request.Request(url, data=None, headers={}, origin_req_host=None, unverifiable=False, method=None)

import urllib.request

url = 'http://httpbin.org/get'
request = urllib.request.Request(url)  # create a Request object
response = urllib.request.urlopen(request)  # send the request
html = response.read().decode('utf-8')
print(html)
# the default User-Agent is "Python-urllib/x.x", where x.x is the Python version number

Sending POST data with a Request object

import urllib.request
import urllib.parse

url = 'http://httpbin.org/post'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36',
}
data = {
    'name': 'peter',
    'age': 20
}
data = bytes(urllib.parse.urlencode(data), encoding="utf-8")  # POST data should be bytes, an iterable of bytes, or a file object. It cannot be of type str
request = urllib.request.Request(url, data=data, headers=headers)
response = urllib.request.urlopen(request)
html = response.read().decode('utf-8')
print(html)
# when POSTing form data, "Content-Type" is "application/x-www-form-urlencoded"

urllib.request.Request object methods

import urllib.request

url = 'http://httpbin.org/get'
request = urllib.request.Request(url)
# add_header(key, val) adds a request header
request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36')
response = urllib.request.urlopen(request)
html = response.read().decode('utf-8')
print(html)
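
Besides add_header(), a Request object also exposes a few inspection methods — has_header(), get_header(), get_full_url() and get_method(). A minimal sketch:

import urllib.request

url = 'http://httpbin.org/get'
request = urllib.request.Request(url)
request.add_header('User-Agent', 'Mozilla/5.0')
# add_header() capitalizes the header name internally, so query it as 'User-agent'
print(request.has_header('User-agent'))  # True
print(request.get_header('User-agent'))  # Mozilla/5.0
print(request.get_full_url())  # http://httpbin.org/get
print(request.get_method())  # GET (would be POST if data were supplied)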

Handlers

ProxyHandler (proxy support)

import urllib.request

# dict: key is the protocol, value is ip:port
proxy_dict = {
    'http': '127.0.0.1:6688',
    'https': '127.0.0.1:6688',
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36',
}
proxy_handler = urllib.request.ProxyHandler(proxy_dict)
opener = urllib.request.build_opener(proxy_handler)
urllib.request.install_opener(opener)
opener.addheaders = list(headers.items())  # set request headers
url = 'http://www.whatismyip.com.tw/'  # shows your IP; when a proxy is used it shows the proxy's IP
response = urllib.request.urlopen(url)
print(response.read().decode('utf-8'))
# Common error:
# HTTPError: HTTP Error 403: Forbidden -- the proxy server has likely restricted access and your IP is not on its allow list

Proxies that require authentication

# Error: HTTPError: HTTP Error 407: Proxy Authentication Required

# Method 1: embed the credentials in the proxy URL: http://user:password@ip:port
import urllib.request

# dict: key is the protocol, value is ip:port
proxy_dict = {
    'http': 'http://name:password@127.0.0.1:6688',
    'https': 'http://name:password@127.0.0.1:6688',
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36',
}
proxy_handler = urllib.request.ProxyHandler(proxy_dict)
opener = urllib.request.build_opener(proxy_handler)
urllib.request.install_opener(opener)
opener.addheaders = list(headers.items())  # set request headers
url = 'http://www.whatismyip.com.tw/'  # shows your IP; when a proxy is used it shows the proxy's IP
response = opener.open(url)
print(response.read().decode('utf-8'))

# Method 2: use ProxyBasicAuthHandler for proxy authentication (supply the username and password)
import urllib.request

# dict: key is the protocol, value is ip:port
proxy_dict = {
    'http': 'http://127.0.0.1:6688',
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36',
}
proxy_handler = urllib.request.ProxyHandler(proxy_dict)
password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
password_mgr.add_password(None, 'http://127.0.0.1:6688', 'name', 'password')  # the realm can simply be None
proxy_auth_handler = urllib.request.ProxyBasicAuthHandler(password_mgr)
opener = urllib.request.build_opener(proxy_handler, proxy_auth_handler)
urllib.request.install_opener(opener)
opener.addheaders = list(headers.items())  # set request headers
url = 'http://www.whatismyip.com.tw/'  # shows your IP; when a proxy is used it shows the proxy's IP
response = opener.open(url)
print(response.read().decode('utf-8'))

HTTPBasicAuthHandler

Used for HTTP basic authentication when accessing a web server.

import urllib.request

url = 'http://127.0.0.1/test/'
password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
password_mgr.add_password(None, url, 'admin', 'password')  # register the username and password for this url
http_auth_handler = urllib.request.HTTPBasicAuthHandler(password_mgr)
opener = urllib.request.build_opener(http_auth_handler)
response = opener.open(url)
print(response.read().decode('utf-8'))

FTPHandler

import urllib.request

username = 'ftp1.linuxidc.com'
password = 'www.linuxidc.com'
ftp_url = 'ftp://%s:%s@ftp1.linuxidc.com' % (username, password)
ftp_handler = urllib.request.FTPHandler()
opener = urllib.request.build_opener(ftp_handler)
response = opener.open(ftp_url)
print(response.read().decode('utf-8', 'ignore'))

HTTPHandler、HTTPSHandler

import urllib.request

url = 'http://www.baidu.com'
# setting debuglevel=1 turns on debug logging, printing the bytes sent and received to the screen -- handy for debugging
http_handler = urllib.request.HTTPHandler(debuglevel=1)
https_handler = urllib.request.HTTPSHandler(debuglevel=1)
opener = urllib.request.build_opener(http_handler, https_handler)
response = opener.open(url)
'''
Output:
send: b'GET / HTTP/1.1\r\nAccept-Encoding: identity\r\nHost: www.baidu.com\r\nUser-Agent: Python-urllib/3.6\r\nConnection: close\r\n\r\n'
reply: 'HTTP/1.1 200 OK\r\n'
header: Date
header: Content-Type
header: Transfer-Encoding
header: Connection
header: Vary
header: Set-Cookie
header: Set-Cookie
header: Set-Cookie
header: Set-Cookie
header: Set-Cookie
header: Set-Cookie
header: P3P
header: Cache-Control
header: Cxy_all
header: Expires
header: X-Powered-By
header: Server
header: X-UA-Compatible
header: BDPAGETYPE
header: BDQID
header: BDUSERID
'''

Cookie

CookieJar

import urllib.request
import http.cookiejar

url = 'http://www.baidu.com'
cookie = http.cookiejar.CookieJar()
cookie_handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(cookie_handler)
response = opener.open(url)
print(response.getcode())
for item in cookie:  # each item is an http.cookiejar.Cookie instance
    print(item.name, item.value, sep=" : ")

MozillaCookieJar

Creates a FileCookieJar instance compatible with Mozilla's cookies.txt file format.

import urllib.request
import http.cookiejar

url = 'https://www.zhihu.com/settings/profile'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36'
}
cookie = http.cookiejar.MozillaCookieJar("zhihu_cookie.txt")
cookie_handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(cookie_handler)
opener.addheaders = list(headers.items())
try:
    cookie.load()  # load the cookie data from the file into memory -- this step matters
except http.cookiejar.LoadError as e:
    print('failed to load the cookie file')
except IOError as e:
    print("cookie file does not exist")
response = opener.open(url)
print(response.geturl())  # compare geturl() with url to check whether the login succeeded; on failure you are redirected to zhihu's login page
html = response.read().decode('utf-8')
print(html)
# after a successful login, call the MozillaCookieJar object's save() method to write the cookies from memory to the file
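
A minimal sketch of that save step, assuming a request whose response sets cookies (the URL here is just a placeholder):

import urllib.request
import http.cookiejar

cookie = http.cookiejar.MozillaCookieJar("zhihu_cookie.txt")
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
response = opener.open('http://www.baidu.com')  # any request whose response sets cookies
# write the in-memory cookies to zhihu_cookie.txt in Mozilla cookies.txt format
cookie.save(ignore_discard=True, ignore_expires=True)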

LWPCookieJar

Creates a FileCookieJar instance compatible with the libwww-perl Set-Cookie3 file format.

import urllib.request
import http.cookiejar

url = 'http://www.baidu.com'
cookie = http.cookiejar.LWPCookieJar("cookies.txt")
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
response = opener.open(url)
# save() must be called to write the in-memory cookies to the local file; next time, just call load() to read them back into memory
cookie.save(ignore_discard=True, ignore_expires=True)
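
On a later run the saved cookies can be reloaded and reused; a brief sketch, assuming the same cookies.txt file exists:

import urllib.request
import http.cookiejar

cookie = http.cookiejar.LWPCookieJar()
cookie.load("cookies.txt", ignore_discard=True, ignore_expires=True)  # read the cookies back into memory
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
response = opener.open('http://www.baidu.com')  # the loaded cookies are sent with this request
print(response.getcode())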

Exception handling

URLError

URLError is usually caused by: no network connection (the local machine cannot get online), or the target server does not exist. In these cases, the exception object has a reason attribute (a tuple of an error code and an error message). Catch the exception as follows:

import urllib.request
import urllib.error

try:
    response = urllib.request.urlopen('http://www.hello_world.org')
except urllib.error.URLError as e:
    print(type(e.reason))  # <class 'socket.gaierror'>
    print(e.reason)  # [Errno 11001] getaddrinfo failed

HTTPError

HTTPError is a subclass of URLError. Every request sent with urlopen() produces a corresponding response on the server, which carries a numeric status code. Common codes include 200 (request succeeded), 302 (redirect), and 304 (the document has not changed since the last visit or under the request's conditions).
Some of these codes indicate that the server could not fulfill the request; when it cannot, urlopen raises HTTPError.
Typical errors include 404 (page not found), 403 (request forbidden), 401 (authentication required), 407 (proxy authentication required), and 500 (internal server error).
# Approach 1
import urllib.request
import urllib.error

url = 'http://www.hello_world.org'
# url = 'http://example.com/test.html'
try:
    response = urllib.request.urlopen(url)
# HTTPError is a subclass of URLError, so it must be handled first
except urllib.error.HTTPError as e:
    print("The server cannot fulfill the request...")
    print("Error code: ", e.code)
    print("Reason: ", e.reason)
except urllib.error.URLError as e:
    print("failed to reach the server...")
    print("Reason: ", e.reason)

# Approach 2
import urllib.request
import urllib.error

url = 'http://www.hello_world.org'
# url = 'http://example.com/test.html'
try:
    response = urllib.request.urlopen(url)
except urllib.error.URLError as e:
    if hasattr(e, 'code'):
        print("The server cannot fulfill the request...")
        print("Error code: ", e.code)
        print("Reason: ", e.reason)
    else:
        print("failed to reach the server...")
        print("Reason: ", e.reason)

urllib.parse

urllib.parse.urlparse(urlstring, scheme='', allow_fragments=True)

Parses a URL into its components.

from urllib.parse import urlparse

# def urlparse(url, scheme='', allow_fragments=True)
# splits a url into 6 parts: <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
# returns a 6-tuple: (scheme, netloc, path, params, query, fragment)
result = urlparse('http://www.baidu.com/index.html;user?id=100#comment')
print(type(result))  # <class 'urllib.parse.ParseResult'>
print(result)  # ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=100', fragment='comment')

# netloc is only recognized when the url starts with //
result = urlparse(
    '//www.baidu.com/index.html;user?id=100#comment',
    scheme="https")
print(result)  # ParseResult(scheme='https', netloc='www.baidu.com', path='/index.html', params='user', query='id=100', fragment='comment')

result = urlparse(
    'www.baidu.com/index.html;user?id=100#comment',
    scheme="https")
print(result)  # ParseResult(scheme='https', netloc='', path='www.baidu.com/index.html', params='user', query='id=100', fragment='comment')

# if the url already contains a scheme, the existing scheme wins
result = urlparse(
    'http://www.baidu.com/index.html;user?id=100#comment',
    scheme="https")
print(result)  # ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=100', fragment='comment')

result = urlparse(
    "http://www.baidu.com/index.html;user?id=100#comment",
    allow_fragments=False)
print(result)  # ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=100#comment', fragment='')

result = urlparse(
    "http://www.baidu.com/index.html#comment",
    allow_fragments=False)
print(result)  # ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html#comment', params='', query='', fragment='')
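
ParseResult is a named tuple, so the components can also be read as attributes; a quick sketch:

from urllib.parse import urlparse

result = urlparse('http://www.baidu.com:80/index.html;user?id=100#comment')
print(result.scheme)    # http
print(result.netloc)    # www.baidu.com:80
print(result.hostname)  # www.baidu.com
print(result.port)      # 80
print(result.query)     # id=100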

urllib.parse.urlunparse(parts)

from urllib.parse import urlunparse

data = ('http', 'www.baidu.com', 'index.html', 'user', 'id=100', 'comment')
url = urlunparse(data)
print(url)  # http://www.baidu.com/index.html;user?id=100#comment
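
urlunparse() is the inverse of urlparse(), so a parsed URL round-trips; a small sketch:

from urllib.parse import urlparse, urlunparse

url = 'http://www.baidu.com/index.html;user?id=100#comment'
assert urlunparse(urlparse(url)) == url  # parse then unparse returns the original url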

urllib.parse.urljoin(base, url, allow_fragments=True)

# the second (relative) url takes precedence; the base url only supplies the parts the relative url is missing -- scheme and netloc -- to build the complete url
from urllib.parse import urljoin

print(urljoin("http://www.baidu.com", "FAQ.html"))
print(urljoin("http://www.baidu.com/index.html", "FAQ.html"))
print(urljoin("http://www.baiud.com/index.html", "http://www.google.com/FAQ.html"))
print(urljoin("http://www.baidu.com/index.html", "http://www.google.com/FAQ.html?question=2"))
print(urljoin("http://www.baidu.com/index.html?wd=abc", "http://www.google.com/FAQ.html"))
print(urljoin("http://www.baidu.com/", "?category=5#comment"))
print(urljoin("http://www.baidu.com/#comment", "?category=5"))
'''
http://www.baidu.com/FAQ.html
http://www.baidu.com/FAQ.html
http://www.google.com/FAQ.html
http://www.google.com/FAQ.html?question=2
http://www.google.com/FAQ.html
http://www.baidu.com/?category=5#comment
http://www.baidu.com/?category=5
'''

urllib.parse.urlencode(query, doseq=False, safe='', encoding=None, errors=None, quote_via=quote_plus)

from urllib.parse import urlencode

basic_url = 'http://httpbin.org/get'
data = {
    "key": '天气',
}
data = urlencode(data)
full_url = '%s?%s' % (basic_url, data)
print(full_url)  # http://httpbin.org/get?key=%E5%A4%A9%E6%B0%94
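
The doseq parameter in the signature above controls how sequence values are encoded; a small sketch:

from urllib.parse import urlencode

data = {'tag': ['python', 'urllib']}
print(urlencode(data))              # tag=%5B%27python%27%2C+%27urllib%27%5D  (the whole list quoted as one string)
print(urlencode(data, doseq=True))  # tag=python&tag=urllib  (one key=value pair per element)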
