Python3.x：bs4解析html基础用法

代码：

import urllib.request

from bs4 import BeautifulSoup

import re

# Sample page that every example below parses.
url = r'http://fund.eastmoney.com/340007.html?spm=search'

# Request headers carrying a desktop Chrome User-Agent string.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}

req = urllib.request.Request(url=url, headers=headers)

# Use the response as a context manager so the underlying connection is
# closed even when reading or decoding raises.
with urllib.request.urlopen(req) as res:
    # The body is decoded as UTF-8; a page served in another encoding
    # would raise UnicodeDecodeError here.
    html = res.read().decode('utf-8')

# Build the parse tree with Python's built-in "html.parser" backend
# (no extra package needed).
# Alternative backend: soup = BeautifulSoup(html, "html5lib")
soup = BeautifulSoup(html, "html.parser")

# ---- First <title> tag ----
title_tag = soup.title
print(title_tag)              # the tag itself
print(title_tag.name)         # its tag name
print(title_tag.string)       # the text it contains
print(title_tag.parent.name)  # tag name of its parent

# ---- First <p> and first <a> tags ----
first_p = soup.p
first_a = soup.a
print(first_p)                # the first <p> tag
print(first_p['class'])       # value of its class attribute
print(first_a['href'])        # href attribute of the first <a> tag
print(first_p.contents)       # all child nodes of the first <p>, as a list
print(first_a)                # the first <a> tag

# ---- Searching the whole document ----
all_links = soup.find_all('a')
print(all_links)              # every <a> tag, shown as a list

gz_tag = soup.find(id='gz_gszze')
print(gz_tag)                 # first tag whose id equals "gz_gszze"
print(gz_tag.get_text())      # text content of that tag

print(soup.get_text())        # all text in the document
print(first_a.attrs)          # every attribute of the first <a> tag

# Print the href of each <a> tag (.get returns None when href is absent).
for link in all_links:
    print(link.get('href'))

# Print each direct child node of the first <p> tag.
for child in first_p.children:
    print(child)

# Regex search on tag names: a compiled pattern passed as the name filter
# selects every tag whose name contains "sp".
sp_in_name = re.compile("sp")
for tag in soup.find_all(sp_in_name):
    print(tag.name)

# Searching by CSS class is very handy, but `class` is a reserved word in
# Python and cannot be used as a keyword argument, so Beautiful Soup 4
# accepts the filter through the `class_` parameter instead.

# All <dl> tags whose class is "dataItem02", via the class_ keyword.
dl_by_keyword = soup.find_all("dl", class_="dataItem02")
for tag in dl_by_keyword:
    print(tag.name)

# The same search expressed through an attrs dictionary.
dl_by_attrs = soup.find_all('dl', attrs={'class': "dataItem02"})
for tag in dl_by_attrs:
    print(tag.name)

# Find every <span> tag whose class contains the text 'ui-font-'.
# The search starts from the document root (`soup`) rather than from a
# loop variable left over from an earlier loop, which may be a plain text
# node and would not cover the whole document anyway.
for tagspan in soup.find_all("span", class_=re.compile('ui-font-')):
    print(tagspan.get_text())

# List that collects the child nodes found below.
content_list = []

# `class` is a reserved word in Python, so the CSS class filter is passed
# through Beautiful Soup's `class_` parameter.
# Walk every <dl> tag whose class is "dataItem02".
for tag in soup.find_all("dl", class_="dataItem02"):
    # Print and store each direct child node of the <dl>.
    for child in tag.children:
        print(child)
        content_list.append(child)

# Show the text of the first stored node. The list is empty when the page
# has no matching <dl>, so guard the index instead of raising IndexError.
if content_list:
    print('content_list[0]：'+content_list[0].get_text())
else:
    print('content_list is empty: no <dl class="dataItem02"> child nodes found')

find与find_all一起用：

    # Goal: every <a target="_blank"> inside the first <div class="postlist">.
    # Types involved:
    #   BeautifulSoup(...) returns <class 'bs4.BeautifulSoup'>
    #   find()             returns <class 'bs4.element.Tag'> (or None when nothing matches)
    #   find_all()         returns <class 'bs4.element.ResultSet'>
    # A ResultSet cannot be searched again with find/find_all, so find() is
    # called first and find_all() is chained on the single Tag it returns.
    post_list = soup.find('div', class_='postlist')

    # find() yields None when the div is missing; fall back to an empty
    # list instead of raising AttributeError on the chained call.
    all_a = [] if post_list is None else post_list.find_all('a', target='_blank')

    for a in all_a:
        title = a.get_text()  # extract the link text
        if title:  # skip links without any text
            print("标题：" + title)

# The maximum page number is read from a <span> tag.
# NOTE(review): the original note says "the 10th span", but index 10 selects
# the 11th element of the result list — confirm which one is intended.
pic_max = soup.find_all('span')[10].text

# Title text: first <h2> tag whose class is "main-title".
title = soup.find('h2',class_='main-title').text

# The image is the <img> tag whose alt attribute equals '图书'.
# NOTE(review): `mess` is not defined in the visible code — presumably a
# parsed tag or BeautifulSoup object created elsewhere; confirm.
pic_url = mess.find('img',alt = '图书')

# pic_url['src'] is the image address; download it with the shared headers.
# NOTE(review): `requests` is not imported in the visible code; this snippet
# presumably relies on `import requests` elsewhere.
html = requests.get(pic_url['src'],headers = headers)

# An image is not text, so the raw bytes (html.content) are written in
# binary mode.
# open(path + file name, mode) — modes: r read-only, r+ read/write,
# w create (overwrites an existing file), a append, b binary.
# NOTE(review): `file_name` must be defined by the caller.
# The with-statement closes the file even if the write fails.
with open(file_name,'wb') as f:
    f.write(html.content)

# Basic usage of re.findall(pattern, string, flags=0): it returns a list
# with every non-overlapping match of `pattern` found in `string`.
# The r prefix marks a raw string, so backslashes reach the regex engine
# unchanged.

# Sample text shared by all examples below.
SAMPLE_URL = "http://www.cnblogs.com/lizm166/p/8143231.html"

# Example 1: a plain literal pattern.
str_1 = re.findall(r"com", SAMPLE_URL)
print(str_1)
# Output: ['com']

# Example 2: ^ anchors the match at the start of the string.
str_2 = re.findall(r"^http", SAMPLE_URL)
print(str_2)
# Output: ['http']

# Example 3: $ anchors the match at the end of the string.
str_3 = re.findall(r"html$", SAMPLE_URL)
print(str_3)
# Output: ['html']

# Example 4: [...] matches exactly one of the characters listed inside.
# Characters are listed without separators: a comma inside the brackets
# would itself be treated as a character to match.
str_4 = re.findall(r"[nw]b", SAMPLE_URL)
print(str_4)
# Output: ['nb']

# Example 5: \d matches a single digit; \d\d\d matches three digits in a
# row, and matches never overlap.
str_5 = re.findall(r"\d", SAMPLE_URL)
str_6 = re.findall(r"\d\d\d", SAMPLE_URL)
print(str_5)
# Output: ['1', '6', '6', '8', '1', '4', '3', '2', '3', '1']
print(str_6)
# Output: ['166', '814', '323']

# Example 6: \D is the opposite of \d — any character that is not a digit.
str_7 = re.findall(r"\D", SAMPLE_URL)
print(str_7)
# Output: ['h', 't', 't', 'p', ':', '/', '/', 'w', 'w', 'w', '.', 'c', 'n', 'b', 'l', 'o', 'g', 's', '.', 'c', 'o', 'm', '/', 'l', 'i', 'z', 'm', '/', 'p', '/', '.', 'h', 't', 'm', 'l']

# Example 7: \w matches a "word" character: letters, digits and the
# underscore (for str patterns this also covers non-ASCII letters).
str_8 = re.findall(r"\w", SAMPLE_URL)
print(str_8)
# Output: ['h', 't', 't', 'p', 'w', 'w', 'w', 'c', 'n', 'b', 'l', 'o', 'g', 's', 'c', 'o', 'm', 'l', 'i', 'z', 'm', '1', '6', '6', 'p', '8', '1', '4', '3', '2', '3', '1', 'h', 't', 'm', 'l']

# Example 8: \W is the opposite of \w — any character that is not a word
# character (punctuation, whitespace, ...).
str_9 = re.findall(r"\W", SAMPLE_URL)
print(str_9)
# Output: [':', '/', '/', '.', '.', '/', '/', '/', '.']

# Get every <a> tag under `tr` whose target attribute equals "_blank".
# NOTE(review): `tr` is not defined in the visible code — presumably a
# table-row tag obtained from an earlier search; confirm.
tr.find_all('a', attrs={'target': '_blank'})

作者：整合侠
链接：http://www.cnblogs.com/lizm166/p/8205085.html
来源：博客园
著作权归作者所有。商业转载请联系作者获得授权，非商业转载请注明出处。

Python3.x：bs4解析html基础用法的更多相关文章

bs4.BeautifulSoup的基础用法
导入模块 from bs4 import BeautifulSoup soup = BeautifulSoup(html_doc,"html.parser") 下面看下常见的用法 ...
bs4 解析以及用法
bs4解析 bs4: 环境安装: lxml bs4 bs4编码流程: 1.实例化一个bs4对象,且将页面源码数据加载到该对象中 2.bs相关的方法或者属性实现标签定位 3.取文本或者取属性 bs的属性 ...
logstash安装与基础用法
若是搭建elk,建议先安装好elasticsearch 来自官网,版本为2.3 wget -c https://download.elastic.co/logstash/logstash/packag ...
Smarty基础用法
一.Smarty基础用法: 1.基础用法如下 include './smarty/Smarty.class.php';//引入smarty类 $smarty = new Smarty();//实例化s ...
asyncio 基础用法
asyncio 基础用法 python也是在python 3.4中引入了协程的概念.也通过这次整理更加深刻理解这个模块的使用 asyncio 是干什么的? asyncio是Python 3.4版本引入 ...
【Python爬虫】selenium基础用法
selenium 基础用法阅读目录初识selenium 基本使用查找元素元素互交操作执行JavaScript 获取元素信息等待前进后退 Cookies 选项卡管理异常处理初识sele ...
爬虫的三种解析方式(正则解析, xpath解析, bs4解析)
一 : 正则解析 : 常用正则回顾: 单字符: . : 除换行符以外的所有字符 [] : [aoe] [a-w] 匹配集合中任意一个字符 \d : 数字 [0-9] \D : 非数字 \w : 非数字 ...
python bs4解析网页时 bs4.FeatureNotFound: Couldn't find a tree builder with the features you requested: lxml. Do you need to inst（转）
Python小白,学习时候用到bs4解析网站,报错 bs4.FeatureNotFound: Couldn't find a tree builder with the features you re ...
爬虫简介、requests 基础用法、urlretrieve()
1. 爬虫简介 2. requests 基础用法 3. urlretrieve() 1. 爬虫简介爬虫的定义网络爬虫(又被称为网页蜘蛛.网络机器人),是一种按照一定的规则,自动地抓取万维网信息的程 ...

随机推荐

[直观学习排序算法] 视觉直观感受若干常用排序算法以及 iOS 资料
http://www.zhfish.net/?s=点击范围 1 快速排序介绍: 快速排序是由东尼·霍尔所发展的一种排序算法.在平均状况下,排序 n 个项目要Ο(n log n)次比较.在最坏状况下则 ...
web 前端规范实例
<!DOCTYPE html> <html> <head> <title>tmall</title> <!-- 为了被搜索引擎作为流量 ...
ceph 存储安装部署
环境准备 1.三台服务器 cephnode01 192.168.254.83 cephnode02 192.168.254.84 cephnode03 192.168.254.85 2.基本环境配置 ...
解决instance中文命名导致nova list报错问题
当创建instance之后,如果使用英文命名,执行nova list的时候,无问题,但是,如果instance中出现中文,执行nova list的时候,会报以下错误: [root@controller ...
Linux下多任务间通信和同步-mmap共享内存
Linux下多任务间通信和同步-mmap共享内存嵌入式开发交流群280352802,欢迎加入! 1.简介共享内存可以说是最有用的进程间通信方式.两个不用的进程共享内存的意思是:同一块物理内存被映射 ...
记一次开发：Qt简单电话本程序
前言断断续续学习C++一年了,现在要做课设,觉得控制台界面实在太难看,于是用Qt做一个图形化的程序出来. 学习Qt也没有多久,只是了解了个大概,这次开发基本上是啃了2天的官方帮助文档,然后利用各种Q ...
JavaScript—文字自动变化为自定义颜色
效果: JS代码: var ColorTimer; var Colorforn = 0; //颜色代码 var ColorArray = new Array("#00CCCC", ...
微信小程序 --- 选择图片和拍照
wx.chooseImage 选择图片 / 进行拍照 //获取应用实例 const app = getApp() Page({ data: { onOff:true }, btnclick:funct ...
postgresql----时间类型
postgresql支持的时间类型如下图所示: 日期 date: 建议日期的输入格式为1997-01-01,虽然也支持19970101,1/1/1997,Jan-1-1997等多种格式. 时间戳 ti ...
《挑战程序设计竞赛》2.5 最小生成树 POJ3723 3169 1258 2377 2395 AOJ2224（1）
POJ3723 http://poj.org/problem?id=3723 题意 windy要组建一支军队,召集了N个女孩和M个男孩,每个人要付10000RMB,但是如果一个女孩和一个男孩有关系d的 ...

Python3.x：bs4解析html基础用法

Python3.x：bs4解析html基础用法

Python3.x：bs4解析html基础用法的更多相关文章

随机推荐

热门专题