1. Basic crawler operations

Example: a public-opinion (舆情) monitoring system:
  Pull news from Autohome (汽车之家) into your own database, build your own app, and publish the content with the source credited, as the basis for your own product.

Fetching the content behind a URL:
- Send an HTTP request: http://www.autohome.com.cn/news/
- Extract the pieces you need, e.g. with regular expressions (a regex sketch follows the Python outline below) or with an HTML parser

Python outline:

    import requests
    from bs4 import BeautifulSoup

    response = requests.get('http://www.autohome.com.cn/news/')
    response.text
    obj = BeautifulSoup(response.text, ...)
    tag = obj.find('a')         # the first tag that matches
    tag.find(...)               # keep searching inside that tag
    tags = obj.find_all('a')    # all matching tags, as a list
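The outline above uses BeautifulSoup; for completeness, here is a minimal sketch of the regex-based approach mentioned earlier. The pattern is only illustrative and depends entirely on how the page happens to render its links:

    import re
    import requests

    response = requests.get('http://www.autohome.com.cn/news/')
    response.encoding = 'gbk'
    # pull article links out of the raw html with a (deliberately simple) regular expression
    links = re.findall(r'href="(//www\.autohome\.com\.cn/news/[^"]+)"', response.text)
    print(links[:5])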

Example 1: scraping Autohome news

requests

    obj = requests.get("url")
    obj.content
    obj.encoding = "gbk"
    obj.text

BeautifulSoup

    soup = BeautifulSoup(obj.text, 'html.parser')
    tag = soup.find(name='xx')
    tags = soup.find_all(...)
    tag.text
    tag.attrs
    tag.get(...)
import requests
from bs4 import BeautifulSoup

response = requests.get('http://www.autohome.com.cn/news/')  # the socket sends and receives bytes
# print(response.text)     # str; garbled if the encoding is not set correctly
# print(response.content)  # response.content is the raw bytes
response.encoding = 'gbk'
# print(response.text)     # response.text is the decoded text
# Python ships with the built-in 'html.parser'; it turns the <html lang='en'...></html> page into an object
soup = BeautifulSoup(response.text, 'html.parser')
tag = soup.find(id='auto-channel-lazyload-article')
# h3 = tag.find(name='h3', class_='c1')  # name is the tag name; class is a Python keyword, so write class_='c1'
#                                        # or attrs={'class': 'c1'} instead
# h3 = tag.find(name='h3', attrs={'class': 'c1'})
h3 = tag.find(name='h3')
print(h3)

Exercise 1: fetch a single news item

response = requests.get('http://www.autohome.com.cn/news/')
response.encoding = 'gbk'
soup = BeautifulSoup(response.text, 'html.parser')
li_list = soup.find(id='auto-channel-lazyload-article').find_all('li')  # find_all('li') is short for find_all(name='li')
for li in li_list:
    # print(li.find('h3'))  # li.find('h3') is sometimes None
    title = li.find('h3')
    if not title:
        continue
    # print(title, type(title))  # <h3>将于第四季度上市 云度π1正式下线</h3> <class 'bs4.element.Tag'>
    summary = li.find('p').text
    # url = li.find('a').attrs['href']  # attrs is a dict of all of the tag's attributes; get() works as well
    url = li.find('a').get('href')
    img = li.find('img').get('src')
    # to download the image:
    # res = requests.get(img)
    # file_name = '%s.jpg' % (title,)  # the title is not a valid file name as-is; adjust it
    # with open(file_name, 'wb') as f:
    #     f.write(res.content)
    print(title.text, summary, url, img)  # title text, summary, url, image
    print('=============')

Exercise 2: collect every news item, including title, summary, url and image.
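One possible solution, as a sketch that simply reuses the selectors from Exercise 1 (the element id comes from the page as it looked at the time and may have changed since):

    import requests
    from bs4 import BeautifulSoup

    response = requests.get('http://www.autohome.com.cn/news/')
    response.encoding = 'gbk'
    soup = BeautifulSoup(response.text, 'html.parser')

    news = []
    for li in soup.find(id='auto-channel-lazyload-article').find_all('li'):
        h3 = li.find('h3')
        if not h3:                      # skip <li> placeholders that carry no article
            continue
        news.append({
            'title': h3.text,
            'summary': li.find('p').text,
            'url': li.find('a').get('href'),
            'img': li.find('img').get('src'),
        })

    for item in news:
        print(item['title'], item['url'])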

Example 2: logging in to GitHub from Python

1. Send a GET request to the login page and read the csrf token out of it
2. Send a POST request:
   carry the username, password and csrf token
   the response sets cookies; with those cookies you do not need to log in again
requests

    obj = requests.get("url")
    obj.content
    obj.encoding = "gbk"
    obj.text
    obj.cookies.get_dict()
    requests.get("url", cookies={'k1': 'v1'})

BeautifulSoup

    soup = BeautifulSoup(obj.text, 'html.parser')
    tag = soup.find(name='xx')
    tags = soup.find_all(...)
    tag.text
    tag.attrs
    tag.get(...)
import requests
from bs4 import BeautifulSoup

# 1. fetch the token
r1 = requests.get('https://github.com/login')
s1 = BeautifulSoup(r1.text, 'html.parser')
# GitHub's login page does not call its token csrf_token; the hidden field is named authenticity_token
token = s1.find(name='input', attrs={'name': 'authenticity_token'}).get('value')
print(token)  # 4WLM4c+ilLUmmhsM8TEFiYXMX5evoTQaIxmhTc5FmUYetTseKP6Upx5jJkGOzjm3kCAg9sMv3ShMnz0UGzuGvA==
r1_token_dict = r1.cookies.get_dict()

# 2. POST the username, password and token to the server.
# To see exactly what to send, watch the login request in the browser's Network panel:
"""
utf8:?
authenticity_token:ollV+avLm6Fh3ZevegPO7gOH7xUzEBL0NWdA1aOQ1IO3YQspjOHbfnaXJOtVLQ95BtW9GZlaCIYd5M6v7FGUKg==
login:asdf
password:asdf
commit:Sign in
"""
r2 = requests.post(
    'https://github.com/session',  # the POST url is also taken from the Network panel
    data={
        'utf8': '?',
        'authenticity_token': token,
        'login': '317828332@qq.com',   # username
        'password': 'alex3714',        # password
        'commit': 'Sign in'
    },
    cookies=r1_token_dict
)
# print(r2.text)
r2_cookie_dict = r2.cookies.get_dict()
print(r1_token_dict)   # some pages set cookies on the initial GET request, some do not
# ---> {'logged_in': 'no', '_gh_sess': 'eyJzZXNzaW9uX2lkIjoiZmMwOTdlNGNlY2U2MmZlNGU4MzBkZmQ2NmYwMjQxNDQiLCJsYXN0X3JlYWRfZnJvbV9yZXBsaWNhcyI6MTUwNDAwNjM5MjE5MSwiX2NzcmZfdG9rZW4iOiJLYXJ5MXhpNnQ5SWdJQ3FKeDluamtjYnZ2NXNsWXQyQjBhSy9aQ3I2U1FBPSIsImZsYXNoIjp7ImRpc2NhcmQiOltdLCJmbGFzaGVzIjp7ImFuYWx5dGljc19sb2NhdGlvbl9xdWVyeV9zdHJpcCI6InRydWUifX19--a5df8578d625ae99c39b34c4163f684a1d8ad568'}
print(r2_cookie_dict)  # cookies set by the POST request
# ---> {'_gh_sess': 'eyJzZXNzaW9uX2lkIjoiZmMwOTdlNGNlY2U2MmZlNGU4MzBkZmQ2NmYwMjQxNDQiLCJsYXN0X3JlYWRfZnJvbV9yZXBsaWNhcyI6MTUwNDAwNjQwNzQwNywiX2NzcmZfdG9rZW4iOiJLYXJ5MXhpNnQ5SWdJQ3FKeDluamtjYnZ2NXNsWXQyQjBhSy9aQ3I2U1FBPSIsImZsYXNoIjp7ImRpc2NhcmQiOltdLCJmbGFzaGVzIjp7ImFuYWx5dGljc19sb2NhdGlvbl9xdWVyeV9zdHJpcCI6InRydWUifX19--db506f001c00ee91aefb55fad7c6cf9965ce3132'}

# merge the two cookie dicts
cookie_dict = {}
cookie_dict.update(r1_token_dict)
cookie_dict.update(r2_cookie_dict)

# 3. later requests just carry the merged cookies
r3 = requests.get(
    # url='xxxxxx',  # any page that requires being logged in
    url='https://github.com/settings/emails',
    cookies=cookie_dict
)
print(r3.text)

Code implementation

Example 3: upvoting a post on Chouti (dig.chouti.com)

# 1. Log in and collect the cookies.
# 2. Find the url the upvote goes to: watch the request the page sends when you click the upvote button.
#    The login is a POST to http://dig.chouti.com/login; the response is not a redirect page but a JSON dict.

import requests
from bs4 import BeautifulSoup

# 1. get the initial cookies
r0 = requests.get('http://dig.chouti.com/')
r0_cookie_dict = r0.cookies.get_dict()

# 2. send the phone number, password and cookies
r1 = requests.post(
    'http://dig.chouti.com/login',
    data={
        'phone': '',
        'password': 'woshiniba',
        'oneMonth': 1          # stay logged in for one month
    },
    cookies=r0_cookie_dict
)
r1_cookie_dict = r1.cookies.get_dict()
print(r1.text)
# ---> {"result":{"code":"8887", "message":"手机号格式不对", "data":""}}  # returned when the phone number is malformed
print(r1.cookies.get_dict())
# ---> {'gpsd': 'd3c9d0b3dfff883f4e86f0094cbfd9bc', 'route': '967b7c98a00b517a995a5a62d3abc65e'}

# merge both cookie dicts
cookie_dict = {}
cookie_dict.update(r0_cookie_dict)
cookie_dict.update(r1_cookie_dict)
# cookie_dict = {'gpsd': r0_cookie_dict['gpsd']}  # works the same here, but not recommended

# 3. upvote: the vote is a POST request; linksId=13911006 is the article id
r2 = requests.post('http://dig.chouti.com/link/vote?linksId=13911006', cookies=cookie_dict)
print(r2.text)

 

2. The requests module

Methods provided by the requests module

# requests.get()
# requests.post()
# requests.put()
# requests.request('post')
#
# requests.get(url, params=None, **kwargs)
# requests.post(url, data=None, json=None, **kwargs)
# requests.put(url, data=None, **kwargs)
# requests.head(url, **kwargs)
# requests.delete(url, **kwargs)
# requests.patch(url, data=None, **kwargs)
# requests.options(url, **kwargs)
#
# # all of the methods above are built on top of this one
# requests.request(method, url, **kwargs)

Call relationships
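To illustrate the relationship, a small sketch (httpbin.org is used here only as an echo service; it is not part of the original notes):

    import requests

    r1 = requests.get('http://httpbin.org/get', params={'k1': 'v1'})
    r2 = requests.request('get', 'http://httpbin.org/get', params={'k1': 'v1'})
    print(r1.json()['args'])                       # {'k1': 'v1'}
    print(r1.json()['args'] == r2.json()['args'])  # True: get() just calls request('get', ...)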

# requests.get(
#     url='xxx',
#     params={'k1': 'v1', 'nid': 888},   # GET parameters, appended to the url
#     cookies={},
#     headers={},
# )
# # with url='http://www.baidu.com' this requests http://www.baidu.com?k1=v1&nid=888

requests.post(
    url='xxx',
    params={'k1': 'v1', 'nid': 888},
    cookies={},
    headers={},
    data={},   # form body
    json={},   # JSON body
)

# Note: watch which request headers you send to the backend.
# requests.post(url='', data={})   # sends Content-Type: application/x-www-form-urlencoded by default
# requests.post(url='', data={}, headers={'content-type': 'application/json'})
#     # with this header Django cannot read the values from request.POST; you have to parse request.body yourself
# requests.post(url='', json={})   # sends Content-Type: application/json by default

Commonly used parameters
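The data= versus json= distinction above can be seen against an echo endpoint; a minimal sketch (httpbin.org is just a stand-in, not part of the original notes):

    import requests

    # form-encoded body, Content-Type: application/x-www-form-urlencoded
    r1 = requests.post('http://httpbin.org/post', data={'k1': 'v1'})
    print(r1.json()['form'])   # {'k1': 'v1'}

    # JSON body, Content-Type: application/json
    r2 = requests.post('http://httpbin.org/post', json={'k1': 'v1'})
    print(r2.json()['json'])   # {'k1': 'v1'}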

# auth
def param_auth():
    from requests.auth import HTTPBasicAuth, HTTPDigestAuth
    # simple, commonly used schemes; most home routers authenticate with HTTPBasicAuth
    ret = requests.get('https://api.github.com/user', auth=HTTPBasicAuth('wupeiqi', 'sdfasdfasdf'))   # basic auth
    ret = requests.get('https://api.github.com/user', auth=HTTPDigestAuth('wupeiqi', 'sdfasdfasdf'))  # digest auth
    # real sites rarely stop at these two schemes; anti-crawling checks are usually more involved
    print(ret.text)

    # ret = requests.get('http://192.168.1.1',
    #                    auth=HTTPBasicAuth('admin', 'admin'))
    # ret.encoding = 'gbk'
    # print(ret.text)

    # ret = requests.get('http://httpbin.org/digest-auth/auth/user/pass', auth=HTTPDigestAuth('user', 'pass'))
    # print(ret)

# timeout: how long to wait before giving up on the request

# allow_redirects: whether to follow redirects
# suppose http://www.abc.com redirects to http://www.baidu.com
response = requests.get('http://www.abc.com', allow_redirects=False)
print(response.text)  # redirects are not followed, so this is whatever http://www.abc.com itself returns
response = requests.get('http://www.abc.com', allow_redirects=True)
print(response.text)  # this is the content of http://www.baidu.com

# proxies: send the requests through a proxy so your own IP does not get banned while crawling;
#          proxies can be bought, or you can run your own proxy servers

# stream: fetch the response body lazily instead of downloading it all at once

# cert / verify: client certificate, e.g. the one 12306 uses; on zhihu the certificate is optional
requests.get('http://httpbin.org/get', stream=True, cert='xxxx.pem')

Other parameters
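A minimal sketch of the timeout and proxies parameters described above (the proxy address 1.1.1.1:8000 is a placeholder, not a real proxy):

    import requests

    proxies = {
        'http': 'http://1.1.1.1:8000',    # placeholder proxy for http:// urls
        'https': 'http://1.1.1.1:8000',   # placeholder proxy for https:// urls
    }
    try:
        r = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=5)  # give up after 5 seconds
        print(r.text)
    except requests.exceptions.RequestException as e:
        print('request failed:', e)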

3. BeautifulSoup

BeautifulSoup turns an HTML document into a tree of objects, so that elements can be pulled out of the page by working with that object.

# html_doc = """
# <html><head><title>The Dormouse's story</title></head>
# <body>
# asdf
# <div class="title">
#     <b>The Dormouse's story总共</b>
#     <h1>f</h1>
# </div>
# <div class="story">Once upon a time there were three little sisters; and their names were
#     <a class="sister0" id="link1">Els<span>f</span>ie</a>,
#     <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
#     <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
# and they lived at the bottom of a well.</div>
# ad<br/>sf
# <p class="story">...</p>
# </body>
# </html>
# """
# from bs4 import BeautifulSoup
# soup = BeautifulSoup(html_doc, features="lxml")  # same as BeautifulSoup(html_doc, 'html.parser') except for the parser:
#                                                  # lxml is faster but requires the lxml package; recommended
#
# tag = soup.find(class_='story')
# print(tag)
# print(tag.name)
# # ---> div
# # tag.name = 'span'  # set the tag name

The name attribute

# print(tag.attrs)
# #---> {'class': ['story']}
# tag.attrs['kkk'] = 'vvv'
# print(tag.attrs)
# #---> {'class': ['story'], 'kkk': 'vvv'}
# del tag.attrs['kkk']
# print(tag.attrs)
# #---> {'class': ['story']}

The attrs attribute

# print(tag.children)
# #---> <list_iterator object at 0x0000000002EA32B0>
# print(list(tag.children))
# #---> ['Once upon a time there were three little sisters; and their names were\n ', <a class="sister0" id="link1">Els<span>f</span>ie</a>, ',\n ', <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, ' and\n ', <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>, ';\nand they lived at the bottom of a well.']
# for item in tag.children:
# print(type(item),item)
# # ---> <class 'bs4.element.NavigableString'> Once upon a time there were three little sisters; and their names were
#
# # <class 'bs4.element.Tag'> <a class="sister0" id="link1">Els<span>f</span>ie</a>
# # <class 'bs4.element.NavigableString'> ,
# #
# # <class 'bs4.element.Tag'> <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
# # <class 'bs4.element.NavigableString'> and
# #
# # <class 'bs4.element.Tag'> <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
# # <class 'bs4.element.NavigableString'> ;
# # and they lived at the bottom of a well.

The children attribute

# print(tag)
# # ---> <div class="story">Once upon a time there were three little sisters; and their names were
# # <a class="sister0" id="link1">Els<span>f</span>ie</a>,
# # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
# # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
# # and they lived at the bottom of a well.</div>
# tag.clear()
# print(tag)
# ---> <div class="story"></div>

clear: empty the tag's contents but keep the tag itself

# tag.decompose()
# print(tag)
# #---> <None></None>

decompose: recursively remove the tag and everything inside it

# taga = tag.find(name='a')
# taga.extract()
# print(tag)

extract: remove the tag (and everything inside it) and return what was removed

# print(tag.decode())
# #---> <div class="story">Once upon a time there were three little sisters; and their names were
# # <a class="sister0" id="link1">Els<span>f</span>ie</a>,
# # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
# # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
# # and they lived at the bottom of a well.</div>
# print(type(tag.decode()))
# # ---> <class 'str'>
# print(tag.decode_contents(),type(tag.decode_contents()))
# #---> Once upon a time there were three little sisters; and their names were
# # <a class="sister0" id="link1">Els<span>f</span>ie</a>,
# # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
# # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
# # and they lived at the bottom of a well. <class 'str'>

decode: convert the tag object to a string; decode_contents does the same but without the enclosing tag


# print(type(tag.encode()))
# # ---> <class 'bytes'>
# print(tag.encode())
# #---> b'<div class="story">Once upon a time there were three little sisters; and their names were\n <a class="sister0" id="link1">Els<span>f</span>ie</a>,\n <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and\n <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;\nand they lived at the bottom of a well.</div>'
# print(tag.encode_contents(),type(tag.encode_contents()))

encode: convert to bytes (including the enclosing tag); encode_contents excludes the enclosing tag

# tag = soup.find('a')
# print(tag)
# tag = soup.find(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')  # recursive: search descendants; text: match text content (rarely used)
# tag = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tag)

find: return the first matching tag

# tags = soup.find_all('a')
# print(tags)

# tags = soup.find_all('a', limit=1)   # limit=1: stop after the first match
# print(tags)

# tags = soup.find_all(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
# # tags = soup.find_all(name='a', class_='sister', recursive=True, text='Lacie')
# print(tags)

find_all: return every matching tag

# v = soup.find_all(name=['a', 'div'])              # name=['a','div']: match either 'a' or 'div' tags
# print(v)

# v = soup.find_all(class_=['sister0', 'sister'])   # class_=[...]: match class='sister0' or class='sister'
# print(v)

# v = soup.find_all(text=['Tillie'])
# print(v, type(v[0]))

# v = soup.find_all(id=['link1', 'link2'])
# print(v)

# v = soup.find_all(href=['link1', 'link2'])
# print(v)

Lists as filter values

# import re
# rep = re.compile('p')
# rep = re.compile('^p')
# v = soup.find_all(name=rep)
# print(v)

# rep = re.compile('sister.*')
# v = soup.find_all(class_=rep)
# print(v)

# rep = re.compile('http://www.oldboy.com/static/.*')
# v = soup.find_all(href=rep)
# print(v)

Regular expressions as filter values

# def func(tag):
#     return tag.has_attr('class') and tag.has_attr('id')  # tags for which this returns True end up in v
# v = soup.find_all(name=func)  # name=func: the function is called once for every tag in the tree
# print(v)

Filtering with a function (rarely used)

# tag = soup.find('a')
# v = tag.get('id')
# print(v)

get: read a tag attribute

# tag = soup.find('a')
# v = tag.has_attr('id')
# print(v)

has_attr: check whether the tag has a given attribute

# tag = soup.find('a')
# v = tag.get_text()
# print(v)

get_text: get the text inside the tag

# tag = soup.find('body')
# v = tag.index(tag.find('div'))
# print(v)

# tag = soup.find('body')
# for i, v in enumerate(tag):
#     print(i, v)

index: the position of a child tag inside its parent

is_empty_element: whether the tag is an empty / self-closing element (e.g. <br/>, <img/>)

# soup.next                  # the next node in document order, whether tag or text
# soup.next_element          # likewise the next parsed node
# soup.next_elements
# soup.next_sibling          # siblings
# soup.next_siblings
# tag.previous
# tag.previous_element
# tag.previous_elements
# tag.previous_sibling
# tag.previous_siblings
# tag.parent
# tag.parents
Tags related to the current tag
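A small self-contained sketch of a few of these navigation attributes (a stripped-down document is used here just for illustration):

    from bs4 import BeautifulSoup

    html = '<div class="story"><a id="link1">Elsie</a>, <a id="link2">Lacie</a> and <a id="link3">Tillie</a></div>'
    soup = BeautifulSoup(html, 'html.parser')

    a = soup.find(id='link1')
    print(a.parent.name)              # div: the tag that contains link1
    print(repr(a.next_sibling))       # the text node right after link1: ', '
    print(a.find_next_sibling('a'))   # the next <a> among its siblings: link2
    for p in a.parents:               # walk upwards through the enclosing tags
        print(p.name)                 # div, then [document]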

# tag.find_next(...)
# tag.find_all_next(...)
# tag.find_next_sibling(...)
# tag.find_next_siblings(...)
# tag.find_previous(...)
# tag.find_all_previous(...)
# tag.find_previous_sibling(...)
# tag.find_previous_siblings(...)
# tag.find_parent(...)
# tag.find_parents(...)
# all of these take the same arguments as find_all

Searching among a tag's related tags

# soup.select("title")
#
# soup.select("p nth-of-type(3)")
#
# soup.select("body a")
#
# soup.select("html head title")
#
# tag = soup.select("span,a")
#
# soup.select("head > title")
#
# soup.select("p > a")
#
# soup.select("p > a:nth-of-type(2)")
#
# soup.select("p > #link1")
#
# soup.select("body > a")
#
# soup.select("#link1 ~ .sister")
#
# soup.select("#link1 + .sister")
#
# soup.select(".sister")
#
# soup.select("[class~=sister]")
#
# soup.select("#link1")
#
# soup.select("a#link2")
#
# soup.select('a[href]')
#
# soup.select('a[href="http://example.com/elsie"]')
#
# soup.select('a[href^="http://example.com/"]')
#
# soup.select('a[href$="tillie"]')
#
# soup.select('a[href*=".com/el"]')
#
# from bs4.element import Tag
#
#
# def default_candidate_generator(tag):
#     for child in tag.descendants:
#         if not isinstance(child, Tag):
#             continue
#         if not child.has_attr('href'):
#             continue
#         yield child
#
#
# tags = soup.find('body').select("a", _candidate_generator=default_candidate_generator)
# print(type(tags), tags)
#
# from bs4.element import Tag
#
#
# def default_candidate_generator(tag):
#     for child in tag.descendants:
#         if not isinstance(child, Tag):
#             continue
#         if not child.has_attr('href'):
#             continue
#         yield child
#
#
# tags = soup.find('body').select("a", _candidate_generator=default_candidate_generator, limit=1)
# print(type(tags), tags)

select / select_one: CSS selectors; select returns every match, select_one returns only the first, so their return values differ (a list vs. a single tag)
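select_one is not demonstrated above; a minimal self-contained sketch of the difference:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<p><a class="sister">Lacie</a><a class="sister">Tillie</a></p>', 'html.parser')
    print(soup.select('.sister'))      # a list containing both <a> tags
    print(soup.select_one('.sister'))  # only the first <a> tag (or None when nothing matches)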

# tag = soup.find('span')
# print(tag.string)            # read the text
# tag.string = 'new content'   # set the text
# print(soup)

# tag = soup.find('body')
# print(tag.string)
# tag.string = 'xxx'           # note: tag.text cannot be used to modify the content, tag.string can
# print(soup)

# tag = soup.find('body')
# v = tag.stripped_strings     # recursively yields the (stripped) text of everything inside body
# print(v)

Tag content

# tag = soup.find('body')
# tag.append(soup.find('a'))
# print(soup)

# to append a brand-new tag rather than move an existing one:
# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('body')
# tag.append(obj)
# print(soup)

append adds a tag at the end of the current tag's children; if the appended tag already exists elsewhere in the document, it is moved to the end rather than copied

# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('body')
# tag.insert(2, obj)
# print(soup)

insert: insert a tag at a given position inside the current tag

# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('body')
# # tag.insert_before(obj)
# tag.insert_after(obj)
# print(soup)

insert_after / insert_before: insert a tag after or before the current tag

# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('div')
# tag.replace_with(obj)
# print(soup)

replace_with: replace the current tag with the given tag

# tag = soup.find('div')
# a = soup.find('a')
# tag.setup(previous_sibling=a)
# print(tag.previous_sibling)

setup creates relationships between tags; it only records the relationship and does not actually move anything in the tree

# from bs4.element import Tag
# obj1 = Tag(name='div', attrs={'id': 'it'})
# obj1.string = '我是一个新来的'
#
# tag = soup.find('a')
# v = tag.wrap(obj1)
# print(soup)

# tag = soup.find('a')
# v = tag.wrap(soup.find('p'))
# print(soup)

wrap: wrap the current tag inside the given tag

# tag = soup.find('a')
# v = tag.unwrap()
# print(soup)

unwrap: remove the current tag itself but keep what it wrapped
