

  Python标准库中提供了urllib、urllib2、httplib等模块用于发送HTTP请求,但使用起来较为麻烦。requests是基于Python开发的第三方HTTP库,在Python内置模块的基础上进行了高度的封装,使用起来更简单,代码量更少。官方文档:http://docs.python-requests.org/zh_CN/latest/user/quickstart.html

  requests的api 主要包括了八个方法:

  1. def get(url, params=None, **kwargs):
  2. def options(url, **kwargs):
  3. def head(url, **kwargs):
  4. def post(url, data=None, json=None, **kwargs):
  5. def put(url, data=None, **kwargs):
  6. def patch(url, data=None, **kwargs):
  7. def delete(url, **kwargs):
  9. #上面方法都是基于request方法实现的(method参数)
  10. def request(method, url, **kwargs):


  1. def get(url, params=None, **kwargs):
  2. """Sends a GET request.
  3. :param url: URL for the new :class:`Request` object.
  4. :param params: (optional) Dictionary or bytes to be sent in the query string for the :class:`Request`.
  5. :param \*\*kwargs: Optional arguments that ``request`` takes.
  6. :return: :class:`Response <Response>` object
  7. :rtype: requests.Response
  8. """
  9. kwargs.setdefault('allow_redirects', True)
  10. return request('get', url, params=params, **kwargs) # 发送get请求,基于request方法,method=‘get’
  12. def post(url, data=None, json=None, **kwargs):
  13. """Sends a POST request.
  15. :param url: URL for the new :class:`Request` object.
  16. :param data: (optional) Dictionary, bytes, or file-like object to send in the body of the :class:`Request`.
  17. :param json: (optional) json data to send in the body of the :class:`Request`.
  18. :param \*\*kwargs: Optional arguments that ``request`` takes.
  19. :return: :class:`Response <Response>` object
  20. :rtype: requests.Response
  21. """
  22. return request('post', url, data=data, json=json, **kwargs) # 发送post请求,基于request方法,method=‘post’


  1. def request(method, url, **kwargs):
  2. """Constructs and sends a :class:`Request <Request>`.
  4. :param method: method for the new :class:`Request` object. #method,对应‘get’,‘post’,‘put’,'delete'等。必须参数
  5. :param url: URL for the new :class:`Request` object.       # url,必须参数
  6. :param params: (optional) Dictionary or bytes to be sent in the query string for the :class:`Request`. # params,url中的查询字符串,字典或字节类型,urlencode方法
  7. :param data: (optional) Dictionary, bytes, or file-like object to send in the body of the :class:`Request`.  #data, 发送的数据,字典,字节,和类文件对象
  8. :param json: (optional) json data to send in the body of the :class:`Request`.                   #json, 发送的数据,json格式的
  9. :param headers: (optional) Dictionary of HTTP Headers to send with the :class:`Request`.              # headers,请求头,字典格式
  10. :param cookies: (optional) Dict or CookieJar object to send with the :class:`Request`.                # cookies,字典或CookieJar对象
  11. :param files: (optional) Dictionary of ``'name': file-like-objects`` (or ``{'name': file-tuple}``) for multipart encoding upload. #字典{‘name’:file-like obj}
  12. ``file-tuple`` can be a 2-tuple ``('filename', fileobj)``, 3-tuple ``('filename', fileobj, 'content_type')``           #或字典{‘name’:file-tuple} (嵌套元组)
  13. or a 4-tuple ``('filename', fileobj, 'content_type', custom_headers)``, where ``'content-type'`` is a string
  14. defining the content type of the given file and ``custom_headers`` a dict-like object containing additional headers
  15. to add for the file.
  16. :param auth: (optional) Auth tuple to enable Basic/Digest/Custom HTTP Auth. #auth,元组
  17. :param timeout: (optional) How long to wait for the server to send data        #超时时间,浮点数或元组
  18. before giving up, as a float, or a :ref:`(connect timeout, read
  19. timeout) <timeouts>` tuple.
  20.   :type timeout: float or tuple
  21. :param allow_redirects: (optional) Boolean. Set to True if POST/PUT/DELETE redirect following is allowed. #allow_redirects,是否允许重定向,
  22.   :type allow_redirects: bool
  23. :param proxies: (optional) Dictionary mapping protocol to the URL of the proxy. #代理服务器,协议和url字典 {'http':proxy_ip}
  24. :param verify: (optional) whether the SSL cert will be verified. A CA_BUNDLE path can also be provided. Defaults to ``True``. #verify,是否ssl认证,默认为True
  25. :param stream: (optional) if ``False``, the response content will be immediately downloaded. # stream,默认为false,会直接下载到内存,文件较大时应设置为True
  26. :param cert: (optional) if String, path to ssl client cert file (.pem). If Tuple, ('cert', 'key') pair.
  27. :return: :class:`Response <Response>` object
  28.   :rtype: requests.Response
  30. Usage::
  32. >>> import requests
  33. >>> req = requests.request('GET', 'http://httpbin.org/get')
  34. <Response [200]>
  35. """
  37. # By using the 'with' statement we are sure the session is closed, thus we
  38. # avoid leaving sockets open which can trigger a ResourceWarning in some
  39. # cases, and look like a memory leak in others.
  40. with sessions.Session() as session:
  41. return session.request(method=method, url=url, **kwargs)


  1. method
  2. # requests.request(method='get', url='')
  3. # requests.request(method='post', url='')
  4. params:
  5. # - 可以是字典
  6. # - 可以是字符串
  7. # - 可以是字节(ascii编码以内)
  9. # requests.request(method='get',
  10. # url='',
  11. # params={'k1': 'v1', 'k2': '水电费'})
  13. # requests.request(method='get',
  14. # url='',
  15. # params="k1=v1&k2=水电费&k3=v3&k3=vv3")
  17. # requests.request(method='get',
  18. # url='',
  19. # params=bytes("k1=v1&k2=k2&k3=v3&k3=vv3", encoding='utf8'))
  21. # 错误
  22. # requests.request(method='get',
  23. # url='',
  24. # params=bytes("k1=v1&k2=水电费&k3=v3&k3=vv3", encoding='utf8'))
  25. data:
  26. # 可以是字典
  27. # 可以是字符串
  28. # 可以是字节
  29. # 可以是文件对象
  31. # requests.request(method='POST',
  32. # url='',
  33. # data={'k1': 'v1', 'k2': '水电费'})
  35. # requests.request(method='POST',
  36. # url='',
  37. # data="k1=v1; k2=v2; k3=v3; k3=v4"
  38. # )
  40. # requests.request(method='POST',
  41. # url='',
  42. # data="k1=v1;k2=v2;k3=v3;k3=v4",
  43. # headers={'Content-Type': 'application/x-www-form-urlencoded'}
  44. # )
  46. # requests.request(method='POST',
  47. # url='',
  48. # data=open('data_file.py', mode='r', encoding='utf-8'), # 文件内容是:k1=v1;k2=v2;k3=v3;k3=v4
  49. # headers={'Content-Type': 'application/x-www-form-urlencoded'}
  50. # )
  51. json:
  52. # 将json中对应的数据进行序列化成一个字符串,json.dumps(...)
  53. # 然后发送到服务器端的body中,并且Content-Type是 {'Content-Type': 'application/json'}
  54. requests.request(method='POST',
  55. url='',
  56. json={'k1': 'v1', 'k2': '水电费'})
  58. headers:
  59. # 发送请求头到服务器端
  60. requests.request(method='POST',
  61. url='',
  62. json={'k1': 'v1', 'k2': '水电费'},
  63. headers={'Content-Type': 'application/x-www-form-urlencoded'}
  64. )
  65. cookies():
  66. # 发送Cookie到服务器端
  67. requests.request(method='POST',
  68. url='',
  69. data={'k1': 'v1', 'k2': 'v2'},
  70. cookies={'cook1': 'value1'},
  71. )
  72. # 也可以使用CookieJar(字典形式就是在此基础上封装)
  73. from http.cookiejar import CookieJar
  74. from http.cookiejar import Cookie
  75. obj = CookieJar()
  76. obj.set_cookie(Cookie(version=0, name='c1', value='v1', port=None, domain='', path='/', secure=False, expires=None,
  77. discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False,
  78. port_specified=False, domain_specified=False, domain_initial_dot=False, path_specified=False)
  79. )
  80. requests.request(method='POST',
  81. url='',
  82. data={'k1': 'v1', 'k2': 'v2'},
  83. cookies=obj)
  84. files:
  85. # 发送文件
  86. # file_dict = {
  87. # 'f1': open('readme', 'rb')
  88. # }
  89. # requests.request(method='POST',
  90. # url='',
  91. # files=file_dict)
  93. # 发送文件,定制文件名
  94. # file_dict = {
  95. # 'f1': ('test.txt', open('readme', 'rb'))
  96. # }
  97. # requests.request(method='POST',
  98. # url='',
  99. # files=file_dict)
  101. # 发送文件,定制文件名
  102. # file_dict = {
  103. # 'f1': ('test.txt', "hahsfaksfa9kasdjflaksdjf")
  104. # }
  105. # requests.request(method='POST',
  106. # url='',
  107. # files=file_dict)
  109. # 发送文件,定制文件名
  110. # file_dict = {
  111. # 'f1': ('test.txt', "hahsfaksfa9kasdjflaksdjf", 'application/text', {'k1': '0'})
  112. # }
  113. # requests.request(method='POST',
  114. # url='',
  115. # files=file_dict)
  117. auth: 认证方法
  118. from requests.auth import HTTPBasicAuth, HTTPDigestAuth
  119. ret = requests.get('https://api.github.com/user', auth=HTTPBasicAuth('wupeiqi', 'sdfasdfasdf'))
  120. print(ret.text)
  122. # ret = requests.get('',
  123. # auth=HTTPBasicAuth('admin', 'admin'))
  124. # ret.encoding = 'gbk'
  125. # print(ret.text)
  127. # ret = requests.get('http://httpbin.org/digest-auth/auth/user/pass', auth=HTTPDigestAuth('user', 'pass'))
  128. # print(ret)
  129. timeout: 超时时间
  130. # ret = requests.get('http://google.com/', timeout=1)
  131. # print(ret)
  133. # ret = requests.get('http://google.com/', timeout=(5, 1))
  134. # print(ret)
  136. allow_redirects:
  137. ret = requests.get('', allow_redirects=False)
  138. print(ret.text)
  140. proxies:
  141. # proxies = {
  142. # "http": "",
  143. # "https": "",
  144. # }
  146. # proxies = {'': ''}
  148. # ret = requests.get("http://www.proxy360.cn/Proxy", proxies=proxies)
  149. # print(ret.headers)
  151. # from requests.auth import HTTPProxyAuth
  152. #
  153. # proxyDict = {
  154. # 'http': '',
  155. # 'https': ''
  156. # }
  157. # auth = HTTPProxyAuth('username', 'mypassword')
  158. #
  159. # r = requests.get("http://www.google.com", proxies=proxyDict, auth=auth)
  160. # print(r.text)
  161. stream:
  162. ret = requests.get('', stream=True) #默认为false,会直接将文件下载到内存,文件过大时会撑满内存,
  163. print(ret.content)
  164. ret.close()
  166. # from contextlib import closing
  167. # with closing(requests.get('http://httpbin.org/get', stream=True)) as r:
  168. # # 在此处理响应。
  169. # for i in r.iter_content(): # 设置成True时,遍历内容时才开始下载
  170. # print(i)

  request方法的最后调用了Session 类,其内部也实现了request,get,post等方法,部分源码如下:

  1. class Session(SessionRedirectMixin):
  2. """A Requests session.
  3. Provides cookie persistence, connection-pooling, and configuration.
  4. Basic Usage::
  6. >>> import requests
  7. >>> s = requests.Session()
  8. >>> s.get('http://httpbin.org/get')
  9. <Response [200]>
  11. Or as a context manager::
  13. >>> with requests.Session() as s:
  14. >>> s.get('http://httpbin.org/get')
  15. <Response [200]>

 1.1 Session 对象

  下面两段代码的区别:requests.get相当于每次请求时都新建一个session对象;而requests.session()是先新建一个session对象,然后重复利用该session对象,从而使该session对象的cookie、参数等在不同请求之间保持持久化。(所以Session对象拥有requests的所有http method)



  1. #利用Session
    client = requests.session()
  2. resp = client.get(url='...')

  3. #利用requests
  4. resp = requests.get(url='...')

  不同session的cookie保持:如下面的代码,对于first_session每次请求都会带上{"cookies":{"cookieone":"111"}}, 而对于second_session,每次请求都会带上{"cookies":{"cookietwo":"222"}};

  1. import requests
  3. first_session = requests.Session()
  4. second_session = requests.Session()
  6. first_session.get('http://httpbin.org/cookies/set/cookieone/111')
  7. r = first_session.get('http://httpbin.org/cookies')
  8. print(r.text)
  10. second_session.get('http://httpbin.org/cookies/set/cookietwo/222')
  11. r = second_session.get('http://httpbin.org/cookies')
  12. print(r.text)
  14. r = first_session.get('http://httpbin.org/anything')
  15. print(r.text)


  1. {"cookies":{"cookieone":"111"}}
  3. {"cookies":{"cookietwo":"222"}}
  5. {"args":{},"data":"","files":{},"form":{},"headers":{"Accept":"*/*","Accept-Encoding":"gzip, deflate","Connection":"close","Cookie":"cookieone=111","Host":"httpbin.org","User-Agent":"python-requests/2.9.1"},"json":null,"method":"GET","origin":"","url":"http://httpbin.org/anything"}

  session的cookie更新: 如下面代码中,通过first_session.cookies更新的cookie会跟随每次请求,而first_session.get() 请求中cookies参数传入的cookie,只对该请求有效,不会被持久化。

  1. import requests
  3. first_session = requests.Session()
  5. first_session.cookies.update({'default_cookie': 'default'})
  7. r = first_session.get('http://httpbin.org/cookies', cookies={'first-cookie': ''})
  8. print(r.text)
  10. r = first_session.get('http://httpbin.org/cookies')
  11. print(r.text)


  1. {"cookies":{"default_cookie":"default","first-cookie":""}}
  3. {"cookies":{"default_cookie":"default"}}


  1. def requests_session():
  2. import requests
  4. session = requests.Session()
  6. ### 1、首先登陆任何页面,获取cookie
  8. i1 = session.get(url="http://dig.chouti.com/help/service")
  10. ### 2、用户登陆,携带上一次的cookie,后台对cookie中的 gpsd 进行授权
  11. i2 = session.post(
  12. url="http://dig.chouti.com/login",
  13. data={
  14. 'phone': "",
  15. 'password': "xxxxxx",
  16. 'oneMonth': ""
  17. }
  18. )
  19.   # 3,保持会话,自动带着授权的cookie进行访问
  20. i3 = session.post(
  21. url="http://dig.chouti.com/link/vote?linksId=8589623",
  22. )
  23. print(i3.text)

  1.2 Response


    通过response.cookies,response.headers,response.status_code,response.encoding可以拿到服务器返回的cookies、响应头、状态码、编码等信息。


  1. class Response(object):
  2. """The :class:`Response <Response>` object, which contains a
  3. server's response to an HTTP request.
  4. """
  5. __attrs__ = [
  6. '_content', 'status_code', 'headers', 'url', 'history',
  7. 'encoding', 'reason', 'cookies', 'elapsed', 'request'
  8. ]
      def content(self): """Content of the response, in bytes."""
      def text(self):   """Content of the response, in unicode."""


  1. import requests
  3. r = requests.get('https://cdn.pixabay.com/photo/2018/07/05/02/50/sun-hat-3517443_1280.jpg', stream=True)
  4. downloaded_file = open("sun-hat.jpg", "wb")
  5. for chunk in r.iter_content(chunk_size=1024):
  6. if chunk:
  7. downloaded_file.write(chunk)

  1. #下面方法能拿到原始的数据
    import requests
  2. r = requests.get("http://exampleurl.com", stream=True)
  3. r.raw


  BeautifulSoup模块是一个可以从HTML或XML文件中提取数据的Python第三方库。其接受一个html或xml字符串(或html、xml文档句柄),将文档转换成Unicode,并利用解析器来解析这段文档。BeautifulSoup支持几种不同的解析器:python标准库中的html.parser,以及第三方库lxml,lxml-xml和html5lib。Beautiful Soup最终将复杂HTML文档转换成一个复杂的树形结构,每个节点都是Python对象,所有对象可以归纳为4种: Tag , NavigableString , BeautifulSoup , Comment .



  1. class BeautifulSoup(Tag):
  2. ROOT_TAG_NAME = u'[document]'
  3. DEFAULT_BUILDER_FEATURES = ['html', 'fast']
  4. ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
  5. NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"
  6. def __init__(self, markup="", features=None, builder=None,
  7. parse_only=None, from_encoding=None, exclude_encodings=None,
  8. **kwargs):
  9. """The Soup object is initialized as the 'root tag', and the
  10. provided markup (which can be a string or a file-like object)
  11. is fed into the underlying parser."""


  1. html_doc = """
  2. <html><head><title>The Dormouse's story</title></head>
  3. <body>
  4. <p class="title"><b>The Dormouse's story</b></p>
  6. <p class="story">Once upon a time there were three little sisters; and their names were
  7. <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
  8. <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
  9. <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
  10. and they lived at the bottom of a well.</p>
  12. <p class="story">...</p>
  13. """
  14. from bs4 import BeautifulSoup
  15. soup = BeautifulSoup(html_doc, 'html.parser')


  1. name, 标签名字:
  2. # tag = soup.find('a')
  3. # name = tag.name # 获取
  4. # print(name)
  5. # tag.name = 'span' # 设置
  6. # print(soup)
  7. # soup.head #拿到head标签
  1. attrs, 标签属性
  2. # tag = soup.find('a')
  3. # attrs = tag.attrs # 获取
  4. # print(attrs)
  5. # tag.attrs = {'ik':123} # 设置
  6. # tag.attrs['id'] = 'iiiii' # 设置
  7. # print(soup)
    #tag['id'] #直接拿到属性
  1. children, 所有子标签,返回生成器
  2. contents,所有子标签,返回列表
    # body = soup.find('body')
    # v = body.children #
    v = body.contents[0]
  1. descendants, 所有的子孙节点
    # body = soup.find('body')
    # v = body.descendants
  1. string: 如果tag只有一个 NavigableString 类型子节点,那么这个tag可以使用 .string 得到该子节点 (NavigableString类似一个unicode字符串,通过string拿到文本)
  2. strings: 如果tag中包含多个字符串,可以使用 .strings 来循环获取
  3. stripped_strings: 输出的字符串中可能包含了很多空格或空行,使用 .stripped_strings 可以去除多余空白内容:
  1. # tag = soup.find('a')
    #for string in tag.strings:
  2. # print(repr(string))
  1. clear(),将标签的所有子标签全部清空(保留标签名)
  2. # tag = soup.find('body')
  3. # tag.clear()
  4. decompose(), 递归的删除所有的标签(不保留标签名)
  5. # body = soup.find('body')
  6. # body.decompose()
  7. extract(),递归的删除所有的标签,并获取删除的标签
  8. # body = soup.find('body')
  9. # v = body.extract()
  1. decode,转换数据为字符串(含当前标签);decode_contents(不含当前标签)
  2. # body = soup.find('body')
  3. # v = body.decode()
  4. # v = body.decode_contents()
  5. # print(v)
  1. def decode(self, indent_level=None,eventual_encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"):
  2. """Returns a Unicode representation of this tag and its contents.
  3. 默认encoding=‘utf-8’
  1. encode,转换为字节(含当前标签);encode_contents(不含当前标签)
  2. # body = soup.find('body')
  3. # v = body.encode()
  4. # v = body.encode_contents()
  5. # print(v)
  1. def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,indent_level=None, formatter="minimal",errors="xmlcharrefreplace"):
  2. 默认encoding=‘utf-8’
  1. find_all() :搜索当前tag的所有tag子节点,获取匹配的所有标签,以列表形式返回
  2. def find_all(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs):
  3. """Extracts a list of Tag objects that match the given
  4. criteria. You can specify the name of the Tag and any
  5. attributes you want the Tag to have.
  6. The value of a key-value pair in the 'attrs' map can be a
  7. string, a list of strings, a regular expression object, or a
  8. callable that takes a string and returns whether or not the
  9. string matches for some custom definition of 'matches'. The
  10. same is true of the tag name."""
  11. name:查找所有名字为 name 的 tag;name 可以为字符串,正则表达式,列表,方法,True #True匹配任意标签名
  12. # tags = soup.find_all('a')
  13. # print(tags)
  15. # tags = soup.find_all('a',limit=1) # limit,只匹配一次;类似于find()
  16. # print(tags)
  18. attrs参数:tag的属性值包含筛选条件
  19. # tags = soup.find_all(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
  20. # # tags = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
  21. soup.find_all("a", class_="sister")
  22. # print(tags)
  24. # ####### 列表 #######
  25. # v = soup.find_all(name=['a','div'])
  26. # print(v)
  27. # v = soup.find_all(class_=['sister0', 'sister']) #class 为python关键字,所以加下划线
  28. # print(v)
  29. # v = soup.find_all(text=['Tillie'])
  30. # print(v, type(v[0]))
  31. # v = soup.find_all(id=['link1','link2'])
  32. # print(v)
  33. # v = soup.find_all(href=['link1','link2'])
  34. # print(v)
  36. # ####### 正则 #######
  37. import re
  38. # rep = re.compile('p')
  39. # rep = re.compile('^p')
  40. # v = soup.find_all(name=rep)
  41. # print(v)
  43. # rep = re.compile('sister.*')
  44. # v = soup.find_all(class_=rep)
  45. # print(v)
  47. # rep = re.compile('http://www.oldboy.com/static/.*')
  48. # v = soup.find_all(href=rep)
  49. # print(v)
  51. # ####### 方法筛选 #######
  52. # def func(tag):
  53. # return tag.has_attr('class') and tag.has_attr('id')
  54. # v = soup.find_all(name=func)
  55. # print(v)
  1. find(),获取匹配的第一个标签
  2. # tag = soup.find('a')
  3. # print(tag)
  4. # tag = soup.find(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
  5. # tag = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
  6. # print(tag)
  1. get(),获取标签属性
    def get(self, key, default=None):
    return self.attrs.get(key, default)
  1. # tag = soup.find('a')
    # v = tag.get('id')
    #类似于tag.attrs['id'] # print(v)
  1. has_attr(),检查标签是否具有该属性
  2. # tag = soup.find('a')
  3. # v = tag.has_attr('id')
  4. # print(v)
  5. def has_attr(self, key):
  6. return key in self.attrs
  1. get_text(),获取标签内部文本内容 #类似string
  2. # tag = soup.find('a')
  3. # v = tag.get_text('id')
  4. # print(v)
  1. index(),检查标签在某标签中的索引位置
  2. def index(self, element):
  3. """
  4. Find the index of a child by identity, not value. Avoids issues with
  5. tag.contents.index(element) getting the index of equal elements.
  6. """
  7. for i, child in enumerate(self.contents):
  8. if child is element:
  9. return i
  10. raise ValueError("Tag.index: element not in tag")
  11. # tag = soup.find('body')
  12. # v = tag.index(tag.find('div'))
  13. # print(v)
  1. is_empty_element,属性(不是方法,使用时不加括号),判断是否是空标签(是否可以是空)或者自闭合标签,
  2. 判断是否是如下标签:'br' , 'hr', 'input', 'img', 'meta','spacer', 'link', 'frame', 'base'
  3. # tag = soup.find('br')
  4. # v = tag.is_empty_element
  5. # print(v)
  1. select,select_one, CSS选择器 (和css选择器一样)
  2. soup.select("title")
  3. soup.select("p:nth-of-type(3)") #父元素中第三个p标签
  4. soup.select("body a")
  5. soup.select("html head title")
  6. tag = soup.select("span,a")
  7. soup.select("head > title")
  8. soup.select("p > a")
  9. soup.select("p > a:nth-of-type(2)")
  10. soup.select("p > #link1")
  11. soup.select("body > a")
  12. soup.select("#link1 ~ .sister")
  13. soup.select("#link1 + .sister")
  14. soup.select(".sister")
  15. soup.select("[class~=sister]")
  16. soup.select("#link1")
  17. soup.select("a#link2")
  18. soup.select('a[href]')
  19. soup.select('a[href="http://example.com/elsie"]')
  20. soup.select('a[href^="http://example.com/"]')
  21. soup.select('a[href$="tillie"]')
  22. soup.select('a[href*=".com/el"]')
  24. from bs4.element import Tag
  25. def default_candidate_generator(tag):
  26. for child in tag.descendants:
  27. if not isinstance(child, Tag):
  28. continue
  29. if not child.has_attr('href'):
  30. continue
  31. yield child
  32. tags = soup.find('body').select("a", _candidate_generator=default_candidate_generator)
  33. print(type(tags), tags)

  34. from bs4.element import Tag
  35. def default_candidate_generator(tag):
  36. for child in tag.descendants:
  37. if not isinstance(child, Tag):
  38. continue
  39. if not child.has_attr('href'):
  40. continue
  41. yield child
  42. tags = soup.find('body').select("a", _candidate_generator=default_candidate_generator, limit=1)
  43. print(type(tags), tags)
  1. 修改文档树标签的内容
  2. # tag = soup.find('span')
  3. # print(tag.string) # 获取
  4. # tag.string = 'new content' # 设置
  5. # print(soup)
  7. # tag = soup.find('body')
  8. # print(tag.string)
  9. # tag.string = 'xxx'
  10. # print(soup)
  12. # tag = soup.find('body')
  13. # v = tag.stripped_strings # 递归内部获取所有标签的文本
  14. # print(v)
  16. append():在当前标签内部追加一个标签
  # tag = soup.find('body')
  # tag.append(soup.find('a'))
  # print(soup)
  # from bs4.element import Tag
  # obj = Tag(name='i',attrs={'id': 'it'})
  # obj.string = '我是一个新来的'
  # tag = soup.find('body')
  # tag.append(obj)
  # print(soup)

  # tag = soup.find('body')
  # tag.insert(2, obj)
  # print(soup)
insert_after(),insert_before() 在当前标签后面或前面插入
# tag = soup.find('div')
# a = soup.find('a')
# tag.setup(previous_sibling=a)
# print(tag.previous_sibling)
# tag = soup.find('a')
# v = tag.wrap(soup.find('p'))  #用p标签包裹a,返回包裹后的p标签

# print(soup)

# tag = soup.find('a')
# v = tag.unwrap()    # 移除a标签本身,保留其内部包裹的内容
# print(soup)


  1. #coding:utf-8
  2. import requests
  3. from bs4 import BeautifulSoup
  4. #下载当前网页html文件
  5. response = requests.get("https://www.cnblogs.com/silence-cho/p/9786069.html")
  6. print type(response.text)
  7. with open('python.html','w') as f:
  8. f.write(response.text.encode('utf-8'))
  9. with open('python.html','r') as f:
  10. html_file = f.read().decode('utf-8')
  12. #使用Beautiful模块
  13. soup = BeautifulSoup(html_file,'lxml')
  14. a_tags = soup.find_all('a')
  15. for a_tag in a_tags:
  16. if a_tag.has_attr('href'):
  17. print a_tag.attrs['href']
  19. text = soup.get_text().encode('gbk',errors='ignore') #使用get_text()方法,拿到所有文本
  20. with open('text1.txt','w') as f:
  21. f.write(text)
  23. strings = soup.strings #使用strings属性,拿到所有文本
  24. with open('string.txt','w') as f:
  25. for string in strings: #strings 为generator类型,包含拿到的所有文本
  26. f.write(string.encode('gbk',errors='ignore'))



  1. '''
  2. 自动登录抽屉热搜榜流程:先访问主页,获取cookie1,然后携带用户名,密码和cookie1访问登陆页面对cookie1授权,随后就能利用cookie1直接访问个人主页等。
  3. 注意真正起作用的是cookie1里面的 'gpsd': '2c805bc26ead2dfcc09ef738249abf65',第二次进行登陆时对这个值进行了认证,
  4. 随后就能利用cookie1进行访问了,进行登录时也会返回cookie2,但cookie2并不起作用
  5. '''
  7. import requests
  8. from bs4 import BeautifulSoup
  10. #访问首页
  11. response=requests.get(
  12. url="https://dig.chouti.com/",
  13. headers={"User-Agent":"Mozilla/5.0 (Windows NT 6.1; rv:62.0) Gecko/20100101 Firefox/62.0"}
  14. )
  15. cookie_dict = response.cookies.get_dict()
  16. print cookie_dict
  18. #登录页面,发送post
  19. response2= requests.post(
  20. url="https://dig.chouti.com/login",
  21. data={
  22. "oneMonth":"",
  23. "password":"你自己的密码",
  24. "phone":"",
  25. },
  26. headers={"User-Agent":"Mozilla/5.0 (Windows NT 6.1; rv:62.0) Gecko/20100101 Firefox/62.0"},
  27. cookies=cookie_dict,
  28. )
  30. #携带cookie,访问首页,显示为登录状态
  31. response3= requests.get(
  32. url="https://dig.chouti.com/",
  33. headers={"User-Agent":"Mozilla/5.0 (Windows NT 6.1; rv:62.0) Gecko/20100101 Firefox/62.0"},
  34. cookies = cookie_dict
  35. )
  37. #携带cookie,进行点赞,返回推送成功
  38. response4 = requests.post(
  39. url="https://dig.chouti.com/link/vote?linksId=22650731",
  40. headers={"User-Agent":"Mozilla/5.0 (Windows NT 6.1; rv:62.0) Gecko/20100101 Firefox/62.0"},
  41. cookies = cookie_dict
  42. )
  43. print response4.text
  44. #{"result":{"code":"9999", "message":"推荐成功", "data":{"jid":"cdu_53961215992","likedTime":"1539697099953000","lvCount":"13","nick":"silence624","uvCount":"1","voteTime":"小于1分钟前"}}}



  1. import requests
  2. from bs4 import BeautifulSoup
  3. response1 = requests.get(
  4. url="https://github.com/login", #url为https://github.com/时拿到的cookie不行
  5. headers={"User-Agent":"Mozilla/5.0 (Windows NT 6.1; rv:62.0) Gecko/20100101 Firefox/62.0"},
  7. )
  8. cookie_dict = response1.cookies.get_dict() #拿到cookie
  9. print cookie_dict
  10. soup = BeautifulSoup(response1.text,features='html.parser')
  11. tag = soup.find(name='input',attrs={"name":"authenticity_token"})
  12. authenticity_token = tag.attrs.get('value') # 从前端页面拿到防跨站请求伪造(CSRF)的token值
  13. print authenticity_token
  14. response = requests.post(
  15. url='https://github.com/session',
  16. data={
  17. "authenticity_token":authenticity_token,
  18. "commit":"Sign+in",
  19. "login":"xxx",
  20. "password":"xxx",
  21. "utf8":""
  22. },
  23. headers={"User-Agent":"Mozilla/5.0 (Windows NT 6.1; rv:62.0) Gecko/20100101 Firefox/62.0"},
  24. cookies = cookie_dict,
  25. )
  26. # print response.text
  27. c2=response.cookies.get_dict()
  28. cookie_dict.update(c2) #自动登录,对cookie值进行更新
  30. r = requests.get(url="https://github.com/settings/repositories",cookies=cookie_dict) #利用更新后的cookie保持会话,拿到仓库名
  31. soup2 = BeautifulSoup(r.text,features='html.parser')
  32. tags = soup2.find_all(name='a',attrs={'class':'mr-1'})
  33. for item in tags:
  34. print item.get_text()





