# 字符串初始化
html = '''
<div>
<ul>
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
''' from pyquery import PyQuery as pq
doc = pq(html)
print(doc('li')) # url初始化
from pyquery import PyQuery as pq
doc = pq(url = "http://www.baidu.com")
print(doc("head")) # 文件初始化
from pyquery import PyQuery as pq
doc = pq(filename = "demo.html")
print(doc('li')) # 基本CSS选择器
html = '''
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
# 注意下面id 前面需要加上#,class 前面需要加上.
print(doc('#container .list li')) # 查找元素
# 子元素
html = '''
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list')
print(type(items))
print(items)
lis = items.find('li')
print(type(lis))
print(lis) lis = items.children()
print(type(lis))
print(lis) lis = items.children('.active')
print(lis) # 父元素
html = '''
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list')
container = items.parent()
print(type(container))
print(container) html = '''
<div class = "wrap">
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list')
parents = items.parents()
print(type(parents))
print(parents) parents = items.parents('.wrap')
print(parents)
 # 兄弟元素
html = '''
<div class = "wrap">
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
# 注意下面item-0后面直接是. 没有空格
li = doc('.list .item-0.active')
print(li.siblings()) print(li.siblings('.active')) # 遍历
# 单个元素
html = '''
<div class = "wrap">
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li) html = '''
<div class = "wrap">
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
lis = doc('li').items()
print(type(lis))
for li in lis:
print(li) # 获取信息
# 获取属性
html = '''
<div class = "wrap">
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
a = doc('.item-0.active a')
print(a)
# 获取属性的两种方法
print(a.attr('href'))
print(a.attr.href) # 获取文本
print(a.text()) # 获取html
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
# 得到<li>标签里面的代码
print(li.html()) # DOM操作
# addClass、removeClass
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
li.remove_class('active')
print(li)
li.add_class('active')
print(li) # attr CSS
li.attr('name', 'link')
print(li)
li.css('font-size', '14px')
print(li) # remove
html = '''
<div class = "wrap">
Hello,World
<p>This is a paragraph</p>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
wrap = doc('.wrap')
print(wrap.text())
wrap.find('p').remove()
print(wrap.text()) # 伪类选择器
html = '''
<div class = "wrap">
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
# 获取第一个元素
li = doc('li:first-child')
print(li)
# 获取最后一个元素
li = doc('li:last-child')
print(li)
# 获取第二个元素
li = doc('li:nth-child(2)')
print(li)
# 获取下标为2的元素后面的所有元素(下标从0开始)
li = doc('li:gt(2)')
print(li)
# 获取下标为偶数的元素
li = doc('li:nth-child(2n)')
print(li)
# 获取内容包含second 的元素
li = doc('li:contains(second)')
print(li)

Python爬虫之pyquery库的基本使用的更多相关文章

  1. Python爬虫之PyQuery使用(六)

    Python爬虫之PyQuery使用 PyQuery简介 pyquery能够通过选择器精确定位 DOM 树中的目标并进行操作.pyquery相当于jQuery的python实现,可以用于解析HTML网 ...

  2. python爬虫之urllib库(三)

    python爬虫之urllib库(三) urllib库 访问网页都是通过HTTP协议进行的,而HTTP协议是一种无状态的协议,即记不住来者何人.举个栗子,天猫上买东西,需要先登录天猫账号进入主页,再去 ...

  3. python爬虫之urllib库(二)

    python爬虫之urllib库(二) urllib库 超时设置 网页长时间无法响应的,系统会判断网页超时,无法打开网页.对于爬虫而言,我们作为网页的访问者,不能一直等着服务器给我们返回错误信息,耗费 ...

  4. python爬虫之urllib库(一)

    python爬虫之urllib库(一) urllib库 urllib库是python提供的一种用于操作URL的模块,python2中是urllib和urllib2两个库文件,python3中整合在了u ...

  5. Python爬虫之selenium库使用详解

    Python爬虫之selenium库使用详解 本章内容如下: 什么是Selenium selenium基本使用 声明浏览器对象 访问页面 查找元素 多个元素查找 元素交互操作 交互动作 执行JavaS ...

  6. Mac os 下 python爬虫相关的库和软件的安装

      由于最近正在放暑假,所以就自己开始学习python中有关爬虫的技术,因为发现其中需要安装许多库与软件所以就在这里记录一下以避免大家在安装时遇到一些不必要的坑. 一. 相关软件的安装:   1. h ...

  7. python爬虫(四)_urllib2库的基本使用

    本篇我们将开始学习如何进行网页抓取,更多内容请参考:python学习指南 urllib2库的基本使用 所谓网页抓取,就是把URL地址中指定的网络资源从网络流中读取出来,保存到本地.在Python中有很 ...

  8. python爬虫之PyQuery的基本使用

    PyQuery库也是一个非常强大又灵活的网页解析库,如果你有前端开发经验的,都应该接触过jQuery,那么PyQuery就是你非常绝佳的选择,PyQuery 是 Python 仿照 jQuery 的严 ...

  9. python爬虫之requests库

    在python爬虫中,要想获取url的原网页,就要用到众所周知的强大好用的requests库,在2018年python文档年度总结中,requests库使用率排行第一,接下来就开始简单的使用reque ...

随机推荐

  1. 【Redis篇】Redis集群安装与初始

    一.前述   本文将单台节点不同端口模拟集群方式. 二.具体搭建 前提是安装好redis具体可参考http://www.cnblogs.com/LHWorldBlog/p/8463269.html 1 ...

  2. Python内置函数(42)——memoryview

    英文文档: class memoryview(obj) memoryview objects allow Python code to access the internal data of an o ...

  3. java代码之美(3)---guava 复写Object常用方法

    guava 复写Object常用方法 Guava 是一个 Google 的基于java1.6的类库集合的扩展项目,这个库提供用于集合,缓存,支持原语,并发性,常见注解,字符串处理,I/O和验证的实用方 ...

  4. pdf生成库-libharu编译

    相关文章:libharu 源码编译 VS2010 1.首先下载libharu源码,libharu依赖于libpng和libzib,如果编译过libpng和libzib的话,直接拿过来用即可.否则需要自 ...

  5. asp.net core AuthenticationMiddleware 在WebApi中的的使用

    在.net framework 4.5架构下使用认证(Authentication)授权(Authorization). IIS使用HttpModule进行认证(Authentication),我们可 ...

  6. iPhone多次输入错误密码锁机后刷机恢复(原有内容会丢失)

    这个操作会完全丢失手机当前存储的资料,已经备份到iTunes的内容,将来可以通过iTunes恢复.已经被自动备份到iCloud的内容,比如通讯录,将来可以自动从iCloud恢复.以前没有备份过的资料, ...

  7. 微信小程序与AspNetCore SignalR聊天实例

    微信小程序与aspnetcore signalr实例 本文不对小程序与signalr做任何介绍,默认读者已经掌握 aspnetcore Signalr文档 小程序文档 写在之前 SignalR没有提供 ...

  8. npm用法

    查看包信息npm info mongodb 查看包的最新版本npm view mongodb version 安装npm install mongodb@2.2.33 已安装的包修改版本npm ins ...

  9. 为容器化的 Go 程序搭建 CI

    本文介绍如何使用 Jenkins 的声明式 pipeline 为一个简单的 Golang web 应用搭建 CI 环境.如果你还不太了解 Jenkins 及其声明式 pipeline,请先参考笔者的 ...

  10. Connection 对象简介 方法解读 JDBC简介(四)

    通过驱动管理器DriverManager的getConnection方法,可以创建到指定URL的连接     Connection conn = DriverManager.getConnection ...