# Initialize from a string
from pyquery import PyQuery as pq

html = '''
<div>
<ul>
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
'''
# Build a PyQuery document directly from the HTML text held in the string.
doc = pq(html)
# Select every <li> element with a CSS selector and print the matches.
print(doc('li'))

# Initialize from a URL
from pyquery import PyQuery as pq

# Hand PyQuery a URL instead of markup, then print the <head> selection.
page_url = "http://www.baidu.com"
doc = pq(url=page_url)
print(doc("head"))

# Initialize from a file
from pyquery import PyQuery as pq

# Build the document from the local file "demo.html" and print its <li> items.
source_file = "demo.html"
doc = pq(filename=source_file)
print(doc('li'))

# Basic CSS selectors
from pyquery import PyQuery as pq

html = '''
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
'''
# In a selector an id is written with a leading "#" and a class with a
# leading "."; the spaces chain the steps from ancestor to descendant.
nested_selector = '#container .list li'
doc = pq(html)
print(doc(nested_selector))

# Finding elements
# Child elements
from pyquery import PyQuery as pq

html = '''
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
'''
doc = pq(html)
items = doc('.list')
print(type(items))
print(items)

# find() searches all descendants of the selection for the given selector.
lis = items.find('li')
print(type(lis))
print(lis)

# children() returns only the direct children of the selection.
lis = items.children()
print(type(lis))
print(lis)

# children() also accepts a selector to filter those direct children.
lis = items.children('.active')
print(lis)

# Parent elements
from pyquery import PyQuery as pq

html = '''
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
'''
doc = pq(html)
items = doc('.list')
# parent() returns the direct parent of the selection.
container = items.parent()
print(type(container))
print(container)

# Same list, now nested one level deeper inside a ".wrap" element.
html = '''
<div class = "wrap">
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
</div>
'''
doc = pq(html)
items = doc('.list')
# parents() returns every ancestor of the selection, not just the nearest.
parents = items.parents()
print(type(parents))
print(parents)

# Passing a selector keeps only the ancestors that match it.
parents = items.parents('.wrap')
print(parents)

# Sibling elements
from pyquery import PyQuery as pq

html = '''
<div class = "wrap">
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
</div>
'''
doc = pq(html)
# Note: ".item-0.active" is written without a space, so it matches an
# element that carries both classes at once.
li = doc('.list .item-0.active')
# siblings() returns the other elements on the same level as the selection.
print(li.siblings())
# With a selector, only the siblings matching it are kept.
print(li.siblings('.active'))

# Traversal
# Single element
from pyquery import PyQuery as pq

html = '''
<div class = "wrap">
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
</div>
'''
doc = pq(html)
li = doc('.item-0.active')
print(li)

# Multiple elements
html = '''
<div class = "wrap">
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
</div>
'''
doc = pq(html)
# items() yields each matched element wrapped as its own PyQuery object,
# so the loop below can treat every <li> individually.
lis = doc('li').items()
print(type(lis))
for li in lis:
    print(li)

# Getting information
# Reading attributes
from pyquery import PyQuery as pq

html = '''
<div class = "wrap">
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
</div>
'''
doc = pq(html)
a = doc('.item-0.active a')
print(a)

# Two ways to read an attribute: call attr() with the attribute name ...
href_by_call = a.attr('href')
print(href_by_call)
# ... or use attribute-style access on attr.
href_by_name = a.attr.href
print(href_by_name)

# Reading text
print(a.text())

# Reading HTML
doc = pq(html)
li = doc('.item-0.active')
print(li)
# html() gives the markup found inside the <li> tag.
inner_markup = li.html()
print(inner_markup)

# DOM manipulation
# Adding and removing classes (addClass / removeClass)
from pyquery import PyQuery as pq

# NOTE: `html` is not defined in this snippet; it reuses the markup string
# assigned by the preceding snippet.
doc = pq(html)
li = doc('.item-0.active')
print(li)

# Drop the class, show the result, then put it back and show it again.
toggled_class = 'active'
li.remove_class(toggled_class)
print(li)
li.add_class(toggled_class)
print(li)

# attr / css
# Set a "name" attribute on the element.
li.attr('name', 'link')
print(li)
# Set a CSS property on the element.
li.css('font-size', '14px')
print(li)

# remove
from pyquery import PyQuery as pq

html = '''
<div class = "wrap">
Hello,World
<p>This is a paragraph</p>
</div>
'''
doc = pq(html)
wrap = doc('.wrap')
# Text of the wrapper before anything is removed.
print(wrap.text())

# Remove the <p> child, then print the wrapper's text again to compare.
paragraph = wrap.find('p')
paragraph.remove()
print(wrap.text())

# Pseudo-class selectors
from pyquery import PyQuery as pq

html = '''
<div class = "wrap">
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
</div>
'''
doc = pq(html)

# Each selector is applied to the same document, in this order, and the
# resulting selection is printed.
pseudo_selectors = (
    'li:first-child',       # the first element
    'li:last-child',        # the last element
    'li:nth-child(2)',      # the second element
    'li:gt(2)',             # every element after index 2 (indices start at 0)
    'li:nth-child(2n)',     # elements at even positions (1-based: 2nd, 4th, ...)
    'li:contains(second)',  # elements whose content contains "second"
)
for selector in pseudo_selectors:
    li = doc(selector)
    print(li)

Python爬虫之pyquery库的基本使用的更多相关文章

  1. Python爬虫之PyQuery使用(六)

    Python爬虫之PyQuery使用 PyQuery简介 pyquery能够通过选择器精确定位 DOM 树中的目标并进行操作.pyquery相当于jQuery的python实现,可以用于解析HTML网 ...

  2. python爬虫之urllib库(三)

    python爬虫之urllib库(三) urllib库 访问网页都是通过HTTP协议进行的,而HTTP协议是一种无状态的协议,即记不住来者何人.举个栗子,天猫上买东西,需要先登录天猫账号进入主页,再去 ...

  3. python爬虫之urllib库(二)

    python爬虫之urllib库(二) urllib库 超时设置 网页长时间无法响应的,系统会判断网页超时,无法打开网页.对于爬虫而言,我们作为网页的访问者,不能一直等着服务器给我们返回错误信息,耗费 ...

  4. python爬虫之urllib库(一)

    python爬虫之urllib库(一) urllib库 urllib库是python提供的一种用于操作URL的模块,python2中是urllib和urllib2两个库文件,python3中整合在了u ...

  5. Python爬虫之selenium库使用详解

    Python爬虫之selenium库使用详解 本章内容如下: 什么是Selenium selenium基本使用 声明浏览器对象 访问页面 查找元素 多个元素查找 元素交互操作 交互动作 执行JavaS ...

  6. Mac os 下 python爬虫相关的库和软件的安装

      由于最近正在放暑假,所以就自己开始学习python中有关爬虫的技术,因为发现其中需要安装许多库与软件所以就在这里记录一下以避免大家在安装时遇到一些不必要的坑. 一. 相关软件的安装:   1. h ...

  7. python爬虫(四)_urllib2库的基本使用

    本篇我们将开始学习如何进行网页抓取,更多内容请参考:python学习指南 urllib2库的基本使用 所谓网页抓取,就是把URL地址中指定的网络资源从网络流中读取出来,保存到本地.在Python中有很 ...

  8. python爬虫之PyQuery的基本使用

    PyQuery库也是一个非常强大又灵活的网页解析库,如果你有前端开发经验的,都应该接触过jQuery,那么PyQuery就是你非常绝佳的选择,PyQuery 是 Python 仿照 jQuery 的严 ...

  9. python爬虫之requests库

    在python爬虫中,要想获取url的原网页,就要用到众所周知的强大好用的requests库,在2018年python文档年度总结中,requests库使用率排行第一,接下来就开始简单的使用reque ...

随机推荐

  1. Agens层次聚类

    层次聚类是另一种主要的聚类方法,它具有一些十分必要的特性使得它成为广泛应用的聚类方法.它生成一系列嵌套的聚类树来完成聚类.单点聚类处在树的最底层,在树的顶层有一个根节点聚类.根节点聚类覆盖了全部的所有 ...

  2. 2.Django路由规则

    路由规则 1.基于正则的url 在templates目录下创建index.html.detail.html文件 (1)index.html <!DOCTYPE html> <html ...

  3. 智能压缩,摆脱用 Gzip 还是 Brotli 的纠结

    近日,又拍云上线了“智能压缩”功能,同时支持 Gzip 和 Brotli 压缩算法,在节约流量的同时,进一步减少用户的等待时间. CDN 流量问题一直以来是大家关注的重点,又拍云针对流量节约上线了一系 ...

  4. Elasticsearch索引原理

    转载 http://blog.csdn.net/endlu/article/details/51720299 最近在参与一个基于Elasticsearch作为底层数据框架提供大数据量(亿级)的实时统计 ...

  5. 华为oj之等差数列前n项和

    题目: 等差数列 热度指数:1010 时间限制:1秒 空间限制:32768K 题目描述 功能: 对于等差数列 2,5,8,11,14… 输入: 正整数N >0 输出: 求等差数列前N项和 返回: ...

  6. 【漫画】以后再有面试官问你平衡(AVL)树,你就把这篇文章扔给他。

    西天取经的路上,一样上演着编程的乐趣..... 1.若它的左子树不为空,则左子树上所有的节点值都小于它的根节点值. 2.若它的右子树不为空,则右子树上所有的节点值均大于它的根节点值. 3.它的左右子树 ...

  7. 冗余jar包识别神器 - loose.jar

    冗余jar包识别神器 - loose.jar 场景描述 项目迭代久了,会添加各类jar包,事实上很多jar包其实根本没用到.那如何快速识别冗余的jar,以方便从项目中清除掉呢? 比如: 该简单的测试工 ...

  8. 想晋级高级工程师只知道表面是不够的!Git内部原理介绍

    本文由云+社区发表 作者:腾讯工蜂用户:王二卫 从不一样的视角了解git,以便更好的使用git 一.git & git 版本库认识 git 是一个内容寻址的文件系统,其核心部分是一个简单的键值 ...

  9. c# Task 篇幅二

    上面一篇https://i.cnblogs.com/EditPosts.aspx?postid=10444773我们介绍了Task的启动,Task的一些方法以及应用,今天我们着重介绍一下Task其它概 ...

  10. (摘)linux下yum安装redis以及使用

    1.yum install redis      --查看是否有redis   yum 源 2.yum install epel-release    --下载fedora的epel仓库 3.yum ...