
pip3 install beautifulsoup4


解析器 使用方法 优势 劣势
Python标准库 BeautifulSoup(markup,'html.parser') Python的内置标准库、执行速度适中、文档容错能力强 Python 2.7.3 or 3.2.2前的版本中文档容错能力差
lxml HTML 解析库 BeautifulSoup(markup,'lxml') 速度快、文档容错能力强 需要安装C语言库
lxml XML 解析库 BeautifulSoup(markup,'xml') 速度快、唯一支持XML的解析器 需要安装C语言库
html5lib BeautifulSoup(markup,'html5lib') 最好的容错性、以浏览器的方式解析文档、生成HTML5格式的文档 速度慢、不依赖外部扩展


  1. html = """
  2. <html dir="ltr" lang="en"><head><meta charset="utf-8"/> <title>The Dormouse's story</title> </head><body><p class="title" name="dormouse"> <b>The Dormouse's story</b></p><p class="story">Once upon a time there were three little sisters;and their names were
  3. <a class="sister" href="http://example.com/elsie" id="link1"> <!-- Elsie --></a><a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>and<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>; and they lived at the bottom of a well
  4. </p> <p class="story"> ...story go on...</p>
  5. """
  6. from bs4 import BeautifulSoup
  7. soup = BeautifulSoup(html,'lxml')
  8. print(soup.prettify())


  1. <html dir="ltr" lang="en">
  2. <head>
  3. <meta charset="utf-8"/>
  4. <title>
  5. The Dormouse's story
  6. </title>
  7. </head>
  8. <body>
  9. <p class="title" name="dormouse">
  10. <b>
  11. The Dormouse's story
  12. </b>
  13. </p>
  14. <p class="story">
  15. Once upon a time there were three little sisters;and their names were
  16. <a class="sister" href="http://example.com/elsie" id="link1">
  17. <!-- Elsie -->
  18. </a>
  19. <a class="sister" href="http://example.com/lacie" id="link2">
  20. Lacie
  21. </a>
  22. and
  23. <a class="sister" href="http://example.com/tillie" id="link3">
  24. Tillie
  25. </a>
  26. ; and they lived at the bottom of a well
  27. </p>
  28. <p class="story">
  29. ...story go on...
  30. </p>
  31. </body>
  32. </html>



The Dormouse's story



  1. from bs4 import BeautifulSoup
  2. soup = BeautifulSoup(html,'lxml')
  3. print(soup.title)
  4. print(type(soup.title))
  5. print(soup.head)
  6. print(soup.p)


  1. <title>The Dormouse's story</title>
  2. <class 'bs4.element.Tag'>
  3. <head><meta charset="utf-8"/> <title>The Dormouse's story</title> </head>
  4. <p class="title" name="dormouse"> <b>The Dormouse's story</b></p> #只返回第一个p标签


  1. from bs4 import BeautifulSoup
  2. soup = BeautifulSoup(html,'lxml')
  3. print(soup.title.name)



  1. from bs4 import BeautifulSoup
  2. soup = BeautifulSoup(html,'lxml')
  3. print(soup.p.attrs['name'])
  4. print(soup.p['name'])





  1. from bs4 import BeautifulSoup
  2. soup = BeautifulSoup(html,'lxml')
  3. print(soup.b.string)

The Dormouse's story


  1. from bs4 import BeautifulSoup
  2. soup = BeautifulSoup(html,'lxml')
  3. print(soup.head.title.string)

The Dormouse's story


  1. html = '''<html dir="ltr" lang="en"><head><meta charset="utf-8"/> <title>The Dormouse\'s story</title> </head><body><p class="story">Once upon a time there were three little sisters;and their names were\n <a class="sister" href="http://example.com/elsie" id="link1"> <span>Elsie </span></a><a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>and<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>; and they lived at the bottom of a well\n </p> <p class="story"> ...story go on...</p>
  2. '''
  3. from bs4 import BeautifulSoup
  4. soup = BeautifulSoup(html,'lxml')
  5. print(soup.p.contents)
  1. ['Once upon a time there were three little sisters;and their names were\n ', <a class="sister" href="http://example.com/elsie" id="link1"> <span>Elsie </span></a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, 'and', <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>, '; and they lived at the bottom of a well\n ']


  1. from bs4 import BeautifulSoup
  2. soup = BeautifulSoup(html,'lxml')
  3. print(soup.p.children)
  4. for i,child in enumerate(soup.p.children):
  5.     print(i,child)

<list_iterator object at 0x7fe986ba07f0>

0 Once upon a time there were three little sisters;and their names were

1 <a class="sister" href="http://example.com/elsie" id="link1"> <span>Elsie </span></a>

2 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>

3 and

4 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

5 ; and they lived at the bottom of a well

  1. html = '''<html dir="ltr" lang="en"><head><meta charset="utf-8"/> <title>The Dormouse\'s story</title> </head><body><p class="story">Once upon a time there were three little sisters;and their names were\n <a class="sister" href="http://example.com/elsie" id="link1"> <span>Elsie </span></a><a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>and<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>; and they lived at the bottom of a well\n </p> <p class="story"> ...story go on...</p>
  2. '''
  3. from bs4 import BeautifulSoup
  4. soup = BeautifulSoup(html,'lxml')
  5. print(soup.p.descendants)
  6. for i,child in enumerate(soup.p.descendants):
  7.     print(i,child)


<generator object descendants at 0x7fe986c11468>

0 Once upon a time there were three little sisters;and their names were

1 <a class="sister" href="http://example.com/elsie" id="link1"> <span>Elsie </span></a>


3 <span>Elsie </span>

4 Elsie

5 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>

6 Lacie

7 and

8 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

9 Tillie

10 ; and they lived at the bottom of a well


  1. from bs4 import BeautifulSoup
  2. soup = BeautifulSoup(html,'lxml')
  3. print(soup.a.parent)


  1. <p class="story">Once upon a time there were three little sisters;and their names were
  2. <a class="sister" href="http://example.com/elsie" id="link1"> <span>Elsie </span></a><a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>and<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>; and they lived at the bottom of a well
  3. </p>
  1. from bs4 import BeautifulSoup
  2. soup = BeautifulSoup(html,'lxml')
  3. print(list(enumerate(soup.a.parent)))


  1. [(0, 'Once upon a time there were three little sisters;and their names were\n '), (1, <a class="sister" href="http://example.com/elsie" id="link1"> <span>Elsie </span></a>), (2, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>), (3, 'and'), (4, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>), (5, '; and they lived at the bottom of a well\n ')]



  1. [(0, <p class="story">Once upon a time there were three little sisters;and their names were
  2. <a class="sister" href="http://example.com/elsie" id="link1"> <span>Elsie </span></a><a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>and<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>; and they lived at the bottom of a well
  3. </p>), (1, <body><p class="story">Once upon a time there were three little sisters;and their names were
  4. <a class="sister" href="http://example.com/elsie" id="link1"> <span>Elsie </span></a><a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>and<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>; and they lived at the bottom of a well
  5. </p> <p class="story"> ...story go on...</p>
  6. </body>), (2, <html dir="ltr" lang="en"><head><meta charset="utf-8"/> <title>The Dormouse's story</title> </head><body><p class="story">Once upon a time there were three little sisters;and their names were
  7. <a class="sister" href="http://example.com/elsie" id="link1"> <span>Elsie </span></a><a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>and<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>; and they lived at the bottom of a well
  8. </p> <p class="story"> ...story go on...</p>
  9. </body></html>), (3, <html dir="ltr" lang="en"><head><meta charset="utf-8"/> <title>The Dormouse's story</title> </head><body><p class="story">Once upon a time there were three little sisters;and their names were
  10. <a class="sister" href="http://example.com/elsie" id="link1"> <span>Elsie </span></a><a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>and<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>; and they lived at the bottom of a well
  11. </p> <p class="story"> ...story go on...</p>
  12. </body></html>)]


  1. from bs4 import BeautifulSoup
  2. soup = BeautifulSoup(html,'lxml')
  3. print(list(enumerate(soup.a.next_siblings)))


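[(0, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>), (1, 'and'), (2, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>), (3, '; and they lived at the bottom of a well\n ')]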

  1. `print(list(enumerate(soup.a.previous_siblings)))`
  2. > `[(0, 'Once upon a time there were three little sisters;and their names were\n ')]`
  3. ## 标准选择器
  4. ### find_all(name,attrs,recursive,text,**kwargs)
  5. 可根据标签名、属性、内容查找文档
  6. #### name
  7. ```py
  8. html = """
  9. <div class="panel">
  10. <div class="panel-heading">
  11. <h4>Helllo</h4>
  12. </div>
  13. <div class="panel-body">
  14. <ul class="list" id="list-1">
  15. <li class="element">Foo</li>
  16. <li class="element">Bar</li>
  17. <li class="element">Jay</li>
  18. </ul>
  19. <ul class="list list-small" id="list-2">
  20. <li class="element">Foo</li>
  21. <li class="element">Bar</li>
  22. </ul>
  23. </div>
  24. </div>
  25. """
  26. from bs4 import BeautifulSoup
  27. soup = BeautifulSoup(html,'lxml')
  28. print(soup.find_all('ul'))
  29. print(type(soup.find_all('ul')[0]))



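  [<ul class="list" id="list-1">
  <li class="element">Foo</li>
  <li class="element">Bar</li>
  <li class="element">Jay</li>
  </ul>, <ul class="list list-small" id="list-2">
  <li class="element">Foo</li>
  <li class="element">Bar</li>
  </ul>]
  <class 'bs4.element.Tag'>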

from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')
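for ul in soup.find_all('ul'):
    print(ul.find_all('li'))

[<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>]
[<li class="element">Foo</li>, <li class="element">Bar</li>]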


    1. html = '''
    2. <div class="panel">\n <div class="panel-heading">\n <h4>Helllo</h4>\n </div>\n <div class="panel-body">\n <ul class="list" id="list-1" name=elements>\n <li class="element">Foo</li>\n <li class="element">Bar</li>\n <li class="element">Jay</li>\n </ul>\n <ul class="list list-small" id="list-2">\n <li class="element">Foo</li>\n <li class="element">Bar</li>\n </ul>\n </div>\n</div>
    3. '''
    4. from bs4 import BeautifulSoup
    5. soup = BeautifulSoup(html,'lxml')
    6. print(soup.find_all(attrs={'id':'list-1'}))
    7. print(soup.find_all(attrs={'name':'elements'}))


    1. [<ul class="list" id="list-1" name="elements">
    2. <li class="element">Foo</li>
    3. <li class="element">Bar</li>
    4. <li class="element">Jay</li>
    5. </ul>]
    1. [<ul class="list" id="list-1" name="elements">
    2. <li class="element">Foo</li>
    3. <li class="element">Bar</li>
    4. <li class="element">Jay</li>
    5. </ul>]


    1. from bs4 import BeautifulSoup
    2. soup = BeautifulSoup(html,'lxml')
    3. print(soup.find_all(id='list-1'))
    1. [<ul class="list" id="list-1" name="elements">
    2. <li class="element">Foo</li>
    3. <li class="element">Bar</li>
    4. <li class="element">Jay</li>
    5. </ul>]


    1. [<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>, <li class="element">Foo</li>, <li class="element">Bar</li>]


    1. from bs4 import BeautifulSoup
    2. soup = BeautifulSoup(html,'lxml')
    3. print(soup.find_all(text='Foo'))

    ['Foo', 'Foo']



    1. from bs4 import BeautifulSoup
    2. soup = BeautifulSoup(html,'lxml')
    3. print(soup.find('ul'))
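    <ul class="list" id="list-1" name="elements">
    <li class="element">Foo</li>
    <li class="element">Bar</li>
    <li class="element">Jay</li>
    </ul>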



    <class 'bs4.element.Tag'>


    <class 'NoneType'>



    1. from bs4 import BeautifulSoup
    2. soup = BeautifulSoup(html,'lxml')
    3. print(soup.select('.panel .panel-heading'))
    4. print(soup.select('ul li'))
    5. print(soup.select('#list-2 .element'))
    6. print(soup.select('ul')[0])





    1. [<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>, <li class="element">Foo</li>, <li class="element">Bar</li>]
    2. [<li class="element">Foo</li>, <li class="element">Bar</li>]
    1. <ul class="list" id="list-1" name="elements">
    2. <li class="element">Foo</li>
    3. <li class="element">Bar</li>
    4. <li class="element">Jay</li>
    5. </ul>


    1. from bs4 import BeautifulSoup
    2. soup = BeautifulSoup(html,'lxml')
    3. for ul in soup.select('ul'):
    4.     print(ul.select('li'))


    1. [<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>]
    2. [<li class="element">Foo</li>, <li class="element">Bar</li>]


    1. from bs4 import BeautifulSoup
    2. soup = BeautifulSoup(html,'lxml')
    3. for ul in soup.select('ul'):
    4.     print(ul['id'])
    5.     print(ul.attrs['id'])







    1. from bs4 import BeautifulSoup
    2. soup = BeautifulSoup(html,'lxml')
    3. for li in soup.select('li'):
    4.     print(li.get_text())








    • 推荐使用lxml解析库,必要时使用html.parser
    • 标签选择筛选功能弱但是速度快
    • 建议使用find()、find_all()查询匹配单个结果或多个结果
    • 如果对CSS选择器熟悉，建议使用select()
    • 记住常用的获取属性和文本值的方法


