爬虫基本操作、requests和BeautifulSoup
1. 爬虫基本操作
例如舆情系统:
获取汽车之家新闻放到自己数据库里,创建自己的app,发布内容,注明来源,自己创业。
URL指定内容获取到
- 发送Http请求:http://www.autohome.com.cn/news/
- 基于正则表达式获取内容
Python实现:
import requests
from bs4 import BeautifulSoup

response = requests.get('http://www.autohome.com.cn/news/')
response.text

obj = BeautifulSoup(response.text,...)
标签对象 = obj.find('a') # 找到匹配成功的第一个标签
标签对象.find(...)
[标签对象,标签对象,]= obj.find_all('a') # 找到匹配成功的所有标签
示例一:爬取汽车之家新闻
requests
obj = requests.get("url")
obj.content
obj.encoding = "gbk"
obj.text

soup = BeautifulSoup(obj.text,'html.parser')
标签对象 = soup.find(name='xx')
[标签对象,标签对象,] = soup.find_all(...)

标签对象.text
标签对象.attrs
标签对象.get(...)
import requests
from bs4 import BeautifulSoup

# Fetch the news index page. The socket delivers bytes; requests decodes
# them lazily when .text is read.
response = requests.get('http://www.autohome.com.cn/news/')
# print(response.text)     # str; comes out garbled if the encoding is wrong
# print(response.content)  # response.content is the raw bytes
response.encoding = 'gbk'  # must be set before .text is read
# print(response.text)     # response.text is now the decoded text

# 'html.parser' is Python's built-in HTML parser; it turns the page markup
# (<html lang='en'...></html>) into a tree of objects.
soup = BeautifulSoup(response.text, 'html.parser')
tag = soup.find(id='auto-channel-lazyload-article')

# name= is the tag name. `class` cannot be passed directly as a keyword
# (class='c1' is an error); use class_='c1' or attrs={'class': 'c1'}.
# h3 = tag.find(name='h3', class_='c1')
# h3 = tag.find(name='h3', attrs={'class': 'c1'})
h3 = tag.find(name='h3')
print(h3)
练习一:获取一个新闻
# Print title, summary, link and image URL for every news item on the page.
response = requests.get('http://www.autohome.com.cn/news/')
response.encoding = 'gbk'
soup = BeautifulSoup(response.text, 'html.parser')

# find_all('li') is shorthand for find_all(name='li').
li_list = soup.find(id='auto-channel-lazyload-article').find_all('li')
for li in li_list:
    # li.find('h3') is sometimes None; skip those entries.
    title = li.find('h3')
    if not title:
        continue
    # print(title, type(title))  # <h3>将于第四季度上市 云度π1正式下线</h3> <class 'bs4.element.Tag'>
    summary = li.find('p').text
    # li.find('a').attrs is a dict of all attributes of the tag, so
    # li.find('a').attrs['href'] works as well as .get('href').
    url = li.find('a').get('href')
    img = li.find('img').get('src')

    # Download the image:
    # res = requests.get(img)
    # file_name = '%s.jpg' % (title,)  # a raw title is not a valid file name; needs changing
    # with open(file_name, 'wb') as f:
    #     f.write(res.content)

    print(title.text, summary, url, img)  # title: title.text, summary: summary
    print('=============')
练习二:找到所有新闻,其中包括标题,简介,url,图片
示例二:python代码登录github
1. 登录页面发送请求GET,获取csrftoken
2. 发送POST请求:
携带用户名、密码、csrftoken发送POST请求
产生cookie,拿到后下次就不需要登录了
requests
obj = requests.get("url")
obj.content
obj.encoding = "gbk"
obj.text
obj.cookies.get_dict()

requests.get("url",cookies={'k1':"v1"})

soup = BeautifulSoup(obj.text,'html.parser')
标签 = soup.find(name='xx')
[标签,] = soup.find_all(...)

标签.text
标签.attrs
标签.get(...)
import os

import requests
from bs4 import BeautifulSoup

# Step 1: GET the login page to obtain the form token and the initial cookies.
r1 = requests.get('https://github.com/login')
s1 = BeautifulSoup(r1.text, 'html.parser')
# The credential carried by GitHub's login page is not called csrf_token;
# it is the hidden input named authenticity_token.
token = s1.find(name='input', attrs={'name': 'authenticity_token'}).get('value')
print(token)  # 4WLM4c+ilLUmmhsM8TEFiYXMX5evoTQaIxmhTc5FmUYetTseKP6Upx5jJkGOzjm3kCAg9sMv3ShMnz0UGzuGvA==
r1_token_dict = r1.cookies.get_dict()

# Step 2: POST the user name, password and token to the server.
# Form fields seen in the browser's Network panel (Headers) when a test
# login is submitted:
#   utf8:?
#   authenticity_token:ollV+avLm6Fh3ZevegPO7gOH7xUzEBL0NWdA1aOQ1IO3YQspjOHbfnaXJOtVLQ95BtW9GZlaCIYd5M6v7FGUKg==
#   login:asdf
#   password:asdf
#   commit:Sign in
r2 = requests.post(
    # The POST target was read from the browser's Network panel. It is sent
    # over https so the password never travels in clear text.
    'https://github.com/session',
    data={
        'utf8': '?',
        'authenticity_token': token,
        # Credentials are read from the environment rather than hard-coded:
        # set GITHUB_LOGIN and GITHUB_PASSWORD before running.
        'login': os.environ['GITHUB_LOGIN'],
        'password': os.environ['GITHUB_PASSWORD'],
        'commit': 'Sign in',
    },
    cookies=r1_token_dict,
)
# print(r2.text)
r2_cookie_dict = r2.cookies.get_dict()

print(r1_token_dict)  # some sites set cookies on the GET request, some do not
#---> {'logged_in': 'no', '_gh_sess': 'eyJzZXNzaW9uX2lkIjoiZmMwOTdlNGNlY2U2MmZlNGU4MzBkZmQ2NmYwMjQxNDQiLCJsYXN0X3JlYWRfZnJvbV9yZXBsaWNhcyI6MTUwNDAwNjM5MjE5MSwiX2NzcmZfdG9rZW4iOiJLYXJ5MXhpNnQ5SWdJQ3FKeDluamtjYnZ2NXNsWXQyQjBhSy9aQ3I2U1FBPSIsImZsYXNoIjp7ImRpc2NhcmQiOltdLCJmbGFzaGVzIjp7ImFuYWx5dGljc19sb2NhdGlvbl9xdWVyeV9zdHJpcCI6InRydWUifX19--a5df8578d625ae99c39b34c4163f684a1d8ad568'}
print(r2_cookie_dict)  # cookies returned by the POST request
#---> {'_gh_sess': 'eyJzZXNzaW9uX2lkIjoiZmMwOTdlNGNlY2U2MmZlNGU4MzBkZmQ2NmYwMjQxNDQiLCJsYXN0X3JlYWRfZnJvbV9yZXBsaWNhcyI6MTUwNDAwNjQwNzQwNywiX2NzcmZfdG9rZW4iOiJLYXJ5MXhpNnQ5SWdJQ3FKeDluamtjYnZ2NXNsWXQyQjBhSy9aQ3I2U1FBPSIsImZsYXNoIjp7ImRpc2NhcmQiOltdLCJmbGFzaGVzIjp7ImFuYWx5dGljc19sb2NhdGlvbl9xdWVyeV9zdHJpcCI6InRydWUifX19--db506f001c00ee91aefb55fad7c6cf9965ce3132'}

# Merge the two cookie sets; values from the POST response win on conflict.
cookie_dict = {}
cookie_dict.update(r1_token_dict)
cookie_dict.update(r2_cookie_dict)

# Step 3: send a further request with the merged cookies.
r3 = requests.get(
    # url='xxxxxx',  # any GitHub page that requires being logged in
    url='https://github.com/settings/emails',
    cookies=cookie_dict,
)
print(r3.text)
代码实现
示例三:对抽屉新闻点赞
# 1. Log in and obtain the cookies.
# 2. Find the target URL: watch the request the Chouti page sends when a vote
#    is cast, and see which URL it goes to first.
#    The login request is a POST to http://dig.chouti.com/login. The response
#    does not redirect the browser; it returns a dict.
import requests
from bs4 import BeautifulSoup

# 1. Fetch the initial cookies.
r0 = requests.get('http://dig.chouti.com/')
r0_cookie_dict = r0.cookies.get_dict()

# 2. Send the phone number, password and cookies.
r1 = requests.post(
    'http://dig.chouti.com/login',
    data={
        'phone': '',
        'password': 'woshiniba',
        'oneMonth': 1,  # stay logged in for one month
    },
    cookies=r0_cookie_dict,
)
r1_cookie_dict = r1.cookies.get_dict()
print(r1.text)
#---> {"result":{"code":"8887", "message":"手机号格式不对", "data":""}}  # printed when the phone number is invalid
print(r1.cookies.get_dict())
#---> {'gpsd': 'd3c9d0b3dfff883f4e86f0094cbfd9bc', 'route': '967b7c98a00b517a995a5a62d3abc65e'}

# Merge the cookies from both responses.
cookie_dict = {}
cookie_dict.update(r0_cookie_dict)
cookie_dict.update(r1_cookie_dict)
# cookie_dict = {'gpsd': r0_cookie_dict['gpsd']}  # same effect as the merge above, but not recommended

# Cast the vote. Voting is a POST request; linksId=13911006 is the article id.
r2 = requests.post('http://dig.chouti.com/link/vote?linksId=13911006', cookies=cookie_dict)
print(r2.text)
2. requests模块
requests模块中提供的方法
# requests.get()
# requests.post()
# requests.put()
# requests.request('post')

# requests.get(url, params=None, **kwargs)
# requests.post(url, data=None, json=None, **kwargs)
# requests.put(url, data=None, **kwargs)
# requests.head(url, **kwargs)
# requests.delete(url, **kwargs)
# requests.patch(url, data=None, **kwargs)
# requests.options(url, **kwargs)
#
# # 以上方法均是在此方法的基础上构建
# requests.request(method, url, **kwargs)
调用关系
# url='xxx',
# params={'k1':'v1','nid':888}, #GET传参
# cookies={},
# headers={},
# data = {}, # data提供数据
# json = {} # json提供数据

# requests.get(
# url='xxx',
# params={'k1':'v1','nid':888},
# cookies={},
# headers={}
# )
# http://www.baidu.com?k1=v1&nid=888

requests.post(
url='xxx',
params={'k1':'v1','nid':888},
cookies={},
headers={},
json={}
)

# 注意:向后台发送请求时,注意请求头

# requests.post(url='',data={}) # 默认携带请求头application/x-www-form-urlencoded

requests.post(url='',data={},headers={'content-type':'application/json'}) # 这样写的话django通过request.POST拿不到值,只能通过request.body自己取

requests.post(url='',json={}) # 默认携带请求头headers={'content-type':'application/json'}
常用参数
# auth
def param_auth():
    """Demonstrate HTTP Basic and HTTP Digest authentication with requests.

    Sends two GET requests to https://api.github.com/user, the first with
    HTTPBasicAuth and the second with HTTPDigestAuth, then prints the body
    of the second response (the first response is overwritten).
    """
    # Most home routers authenticate their admin page with HTTP Basic auth.
    from requests.auth import HTTPBasicAuth, HTTPDigestAuth

    # Two simple, commonly used authentication schemes.
    ret = requests.get('https://api.github.com/user', auth=HTTPBasicAuth('wupeiqi', 'sdfasdfasdf'))  # Basic
    ret = requests.get('https://api.github.com/user', auth=HTTPDigestAuth('wupeiqi', 'sdfasdfasdf'))  # Digest
    # Sites with anti-scraping measures will not check accounts with
    # anything as simple as these two schemes.
    print(ret.text)

    # ret = requests.get('http://192.168.1.1',
    #                    auth=HTTPBasicAuth('admin', 'admin'))
    # ret.encoding = 'gbk'
    # print(ret.text)

    # ret = requests.get('http://httpbin.org/digest-auth/auth/user/pass', auth=HTTPDigestAuth('user', 'pass'))
    # print(ret)


# timeout: limit on how long to wait for the request

# allow_redirects: whether redirects are followed
# Suppose a visit to http://www.abc.com redirects to http://www.baidu.com.
response = requests.get('http://www.abc.com', allow_redirects=False)
print(response.text)  # redirect not followed: the response of http://www.abc.com itself

response = requests.get('http://www.abc.com', allow_redirects=True)
print(response.text)  # redirect followed: the content of http://www.baidu.com

# proxies: route requests through a proxy so the crawler's own IP does not
# get banned. Proxies can be bought, or you can run your own proxy server.

# stream: when True the response body is not downloaded until it is accessed.
# verify: whether the server's TLS certificate is checked (e.g. 12306 uses
#         its own certificate; for Zhihu it makes no difference).
# cert:   path to a client-side certificate file. It is independent of
#         stream; neither option requires the other.
requests.get('http://httpbin.org/get', stream=True, cert='xxxx.pem')
其他参数
3. BeautifulSoup
beautifulsoup:把html结构化成对象,通过对象的方式取html内部元素
#html_doc =
#"""
# <html><head><title>The Dormouse's story</title></head>
# <body>
# asdf
# <div class="title">
# <b>The Dormouse's story总共</b>
# <h1>f</h1>
# </div>
# <div class="story">Once upon a time there were three little sisters; and their names were
# <a class="sister0" id="link1">Els<span>f</span>ie</a>,
# <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
# <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
# and they lived at the bottom of a well.</div>
# ad<br/>sf
# <p class="story">...</p>
# </body>
# </html>
# """
#from bs4 import BeautifulSoup
#soup = BeautifulSoup(html_doc, features="lxml") # 与BeautifulSoup(html_doc,'html.parser')不同的是使用的解析器不同,lxml性能更好,不过要安装lxml模块,推荐使用
#tag = soup.find(class_='story')
# print(tag)
# print(tag.name)
# #---> div
# # tag.name = 'span' # 设置
name属性
# print(tag.attrs)
# #---> {'class': ['story']}
# tag.attrs['kkk'] = 'vvv'
# print(tag.attrs)
# #---> {'class': ['story'], 'kkk': 'vvv'}
# del tag.attrs['kkk']
# print(tag.attrs)
# #---> {'class': ['story']}
attrs属性
# print(tag.children)
# #---> <list_iterator object at 0x0000000002EA32B0>
# print(list(tag.children))
# #---> ['Once upon a time there were three little sisters; and their names were\n ', <a class="sister0" id="link1">Els<span>f</span>ie</a>, ',\n ', <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, ' and\n ', <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>, ';\nand they lived at the bottom of a well.']
# for item in tag.children:
# print(type(item),item)
# # ---> <class 'bs4.element.NavigableString'> Once upon a time there were three little sisters; and their names were
#
# # <class 'bs4.element.Tag'> <a class="sister0" id="link1">Els<span>f</span>ie</a>
# # <class 'bs4.element.NavigableString'> ,
# #
# # <class 'bs4.element.Tag'> <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
# # <class 'bs4.element.NavigableString'> and
# #
# # <class 'bs4.element.Tag'> <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
# # <class 'bs4.element.NavigableString'> ;
# # and they lived at the bottom of a well.
children属性
# print(tag)
# # ---> <div class="story">Once upon a time there were three little sisters; and their names were
# # <a class="sister0" id="link1">Els<span>f</span>ie</a>,
# # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
# # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
# # and they lived at the bottom of a well.</div>
# tag.clear()
# print(tag)
# ---> <div class="story"></div>
clear属性,清空,但保留标签名
# tag.decompose()
# print(tag)
# #---> <None></None>
decompose,递归的删除所有的标签
# taga = tag.find(name='a')
# taga.extract()
# print(tag)
extract方法,将当前标签(连同其内部的所有标签)从文档树中移除,并返回被移除的标签
# print(tag.decode())
# #---> <div class="story">Once upon a time there were three little sisters; and their names were
# # <a class="sister0" id="link1">Els<span>f</span>ie</a>,
# # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
# # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
# # and they lived at the bottom of a well.</div>
# print(type(tag.decode()))
# # ---> <class 'str'>
# print(tag.decode_contents(),type(tag.decode_contents()))
# #---> Once upon a time there were three little sisters; and their names were
# # <a class="sister0" id="link1">Els<span>f</span>ie</a>,
# # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
# # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
# # and they lived at the bottom of a well. <class 'str'>
decode 将标签对象转为字符串类型.但decode_contents(不含当前标签)
# print(tag.decode())
# #---> <div class="story">Once upon a time there were three little sisters; and their names were
# # <a class="sister0" id="link1">Els<span>f</span>ie</a>,
# # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
# # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
# # and they lived at the bottom of a well.</div>
# print(type(tag.decode()))
# # ---> <class 'str'>
# print(tag.decode_contents(),type(tag.decode_contents()))
# #---> Once upon a time there were three little sisters; and their names were
# # <a class="sister0" id="link1">Els<span>f</span>ie</a>,
# # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
# # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
# # and they lived at the bottom of a well. <class 'str'>
decode 将标签对象转为字符串类型.但decode_contents(不含当前标签)
# print(type(tag.encode()))
# # ---> <class 'bytes'>
# print(tag.encode())
# #---> b'<div class="story">Once upon a time there were three little sisters; and their names were\n <a class="sister0" id="link1">Els<span>f</span>ie</a>,\n <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and\n <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;\nand they lived at the bottom of a well.</div>'
# print(tag.encode_contents(),type(tag.encode_contents()))
encode,转换为字节(含当前标签);encode_contents(不含当前标签)
# tag = soup.find('a')
# print(tag)
# tag = soup.find(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie') # recursive递归找;text文本内容,很少用
# tag = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tag)
find,获取匹配的第一个标签
# tags = soup.find_all('a')
# print(tags)

# tags = soup.find_all('a',limit=1) # limit=1只找一个
# print(tags)

# tags = soup.find_all(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
# # tags = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tags)
find_all,获取匹配的所有标签
# v = soup.find_all(name=['a','div']) # name=['a','div'] 查找'a'标签和'div'标签
# print(v)

# v = soup.find_all(class_=['sister0', 'sister']) # class_=['sister0', 'sister']查找class='sister0'或者class='sister'
# print(v)

# v = soup.find_all(text=['Tillie'])
# print(v, type(v[0]))

# v = soup.find_all(id=['link1','link2'])
# print(v)

# v = soup.find_all(href=['link1','link2'])
# print(v)
列表
#import re
# rep = re.compile('p')
# rep = re.compile('^p')
# v = soup.find_all(name=rep)
# print(v)

# rep = re.compile('sister.*')
# v = soup.find_all(class_=rep)
# print(v)

# rep = re.compile('http://www.oldboy.com/static/.*')
# v = soup.find_all(href=rep)
# print(v)
正则
# def func(tag):
# return tag.has_attr('class') and tag.has_attr('id') # 返回结果为True,就把结果给v = soup.find_all()
# v = soup.find_all(name=func) # name=func把标签遍历一遍,每找到标签执行一次函数。
# print(v)
方法筛选,不常用
# tag = soup.find('a')
# v = tag.get('id')
# print(v)
get,获取标签属性
# tag = soup.find('a')
# v = tag.has_attr('id')
# print(v)
has_attr,检查标签是否具有该属性
# tag = soup.find('a')
# v = tag.get_text()
# print(v)
get_text,获取标签内部文本内容
# tag = soup.find('body')
# v = tag.index(tag.find('div'))
# print(v)
# tag = soup.find('body')
# for i,v in enumerate(tag):
# print(i,v)
index,检查标签在某标签中的索引位置
# soup.next # 找下一个,不管是标签还是文本
# soup.next_element # 按解析顺序找下一个节点(可能是标签,也可能是文本)
# soup.next_elements
# soup.next_sibling # 找兄弟姐妹
# soup.next_siblings

# tag.previous
# tag.previous_element
# tag.previous_elements
# tag.previous_sibling
# tag.previous_siblings

# tag.parent
# tag.parents
当前的关联标签
# tag.find_next(...)
# tag.find_all_next(...)
# tag.find_next_sibling(...)
# tag.find_next_siblings(...)

# tag.find_previous(...)
# tag.find_all_previous(...)
# tag.find_previous_sibling(...)
# tag.find_previous_siblings(...)

# tag.find_parent(...)
# tag.find_parents(...)
# 参数同find_all
查找某标签的关联标签
# soup.select("title")
#
# soup.select("p:nth-of-type(3)")
#
# soup.select("body a")
#
# soup.select("html head title")
#
# tag = soup.select("span,a")
#
# soup.select("head > title")
#
# soup.select("p > a")
#
# soup.select("p > a:nth-of-type(2)")
#
# soup.select("p > #link1")
#
# soup.select("body > a")
#
# soup.select("#link1 ~ .sister")
#
# soup.select("#link1 + .sister")
#
# soup.select(".sister")
#
# soup.select("[class~=sister]")
#
# soup.select("#link1")
#
# soup.select("a#link2")
#
# soup.select('a[href]')
#
# soup.select('a[href="http://example.com/elsie"]')
#
# soup.select('a[href^="http://example.com/"]')
#
# soup.select('a[href$="tillie"]')
#
# soup.select('a[href*=".com/el"]')
#
# from bs4.element import Tag
#
#
# def default_candidate_generator(tag):
# for child in tag.descendants:
# if not isinstance(child, Tag):
# continue
# if not child.has_attr('href'):
# continue
# yield child
#
#
# tags = soup.find('body').select("a", _candidate_generator=default_candidate_generator)
# print(type(tags), tags)
#
# from bs4.element import Tag
#
#
# def default_candidate_generator(tag):
# for child in tag.descendants:
# if not isinstance(child, Tag):
# continue
# if not child.has_attr('href'):
# continue
# yield child
#
#
# tags = soup.find('body').select("a", _candidate_generator=default_candidate_generator, limit=1)
# print(type(tags), tags)
select, select_one, CSS选择器,select查找多个,select_one查找一个,但是参数类型不一样
# tag = soup.find('span')
# print(tag.string) # 获取
# tag.string = 'new content' # 设置
# print(soup)

# tag = soup.find('body')
# print(tag.string)
# tag.string = 'xxx' # tag.text不能修改标签内容
# print(soup)

# tag = soup.find('body')
# v = tag.stripped_strings # 递归内部获取所有标签的文本
# print(v)
标签的内容
# tag = soup.find('body')
# tag.append(soup.find('a'))
# print(soup)
# 如果实在想追加当前标签已经存在的,方法如下
# from bs4.element import Tag
# obj = Tag(name='i',attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('body')
# tag.append(obj)
# print(soup)
append在当前标签内部追加一个标签,当当前内部标签有追加的这个标签时,只是把当前标签内部位置被追加的标签移动到最后
# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('body')
# tag.insert(2, obj)
# print(soup)
insert在当前标签内部指定位置插入一个标签
# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('body')
# # tag.insert_before(obj)
# tag.insert_after(obj)
# print(soup)
insert_after, insert_before在当前标签后面或前面插入
# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('div')
# tag.replace_with(obj)
# print(soup)
replace_with 在当前标签替换为指定标签
# tag = soup.find('div')
# a = soup.find('a')
# tag.setup(previous_sibling=a)
# print(tag.previous_sibling)
创建标签之间的关系,关系创建完后没什么用,不会改变标签间的位置
# from bs4.element import Tag
# obj1 = Tag(name='div', attrs={'id': 'it'})
# obj1.string = '我是一个新来的'
#
# tag = soup.find('a')
# v = tag.wrap(obj1)
# print(soup)

# tag = soup.find('a')
# v = tag.wrap(soup.find('p'))
# print(soup)
wrap,将指定标签把当前标签包裹起来
# tag = soup.find('a')
# v = tag.unwrap()
# print(soup)
unwrap,去掉当前标签,将保留其包裹的标签
# tag = soup.find('a')
# v = tag.unwrap()
# print(soup)
unwrap,去掉当前标签,将保留其包裹的标签
爬虫基本操作、requests和BeautifulSoup的更多相关文章
- python爬虫之requests+selenium+BeautifulSoup
前言: 环境配置:windows64.python3.4 requests库基本操作: 1.安装:pip install requests 2.功能:使用 requests 发送网络请求,可以实现跟浏 ...
- 爬虫 1 requests 、beautifulsoup
1.requests 1.method 提交方式:post.get.put.delete.options.head.patch 2.url 访问地址 3.params 在url中传递的参数,GET p ...
- 005 爬虫(requests与beautifulSoup库的使用)
一:知识点 1.安装requests库 2.Brautiful soup 可以提供一些简单的,python式的函数来处理导航,搜索,修改分析树等功能. 她是一个工具箱,通过解析文档为用户提供需要抓去的 ...
- python3 爬虫相关-requests和BeautifulSoup
前言 时间的关系,这篇文章只记录了相关库的使用,没有进行深入分析,各位看官请见谅(还是因为懒.....) requests使用 发送无参数的get请求 r = requests.get('http:/ ...
- $python爬虫系列(2)—— requests和BeautifulSoup库的基本用法
本文主要介绍python爬虫的两大利器:requests和BeautifulSoup库的基本用法. 1. 安装requests和BeautifulSoup库 可以通过3种方式安装: easy_inst ...
- python爬虫系列(2)—— requests和BeautifulSoup
本文主要介绍python爬虫的两大利器:requests和BeautifulSoup库的基本用法. 1. 安装requests和BeautifulSoup库 可以通过3种方式安装: easy_inst ...
- 【网络爬虫入门01】应用Requests和BeautifulSoup联手打造的第一条网络爬虫
[网络爬虫入门01]应用Requests和BeautifulSoup联手打造的第一条网络爬虫 广东职业技术学院 欧浩源 2017-10-14 1.引言 在数据量爆发式增长的大数据时代,网络与用户的沟 ...
- python爬虫----基本操作
一.爬虫基本操作 有些网站和其他网站是有关系(链接),全球的网站就相当于一个蜘蛛网,我们放一只蜘蛛在上面爬,一定能够把网爬个遍.那么如果我们要爬取互联网上内容我们就相当于放一只蜘蛛在上面. 爬虫分为 ...
- 孤荷凌寒自学python第六十七天初步了解Python爬虫初识requests模块
孤荷凌寒自学python第六十七天初步了解Python爬虫初识requests模块 (完整学习过程屏幕记录视频地址在文末) 从今天起开始正式学习Python的爬虫. 今天已经初步了解了两个主要的模块: ...
随机推荐
- [python学习篇][廖雪峰][2]函数式编程
函数名也是变量: >>> f = abs >>> f(-10) 10 然变量可以指向函数,函数的参数能接收变量,那么一个函数就可以接收另一个函数作为参数,这种函数就 ...
- Ajax、Comet、Websocket、SSE
从 http 协议说起 1996年IETF HTTP工作组发布了HTTP协议的1.0版本 ,到现在普遍使用的版本1.1,HTTP协议经历了17 年的发展.这种分布式.无状态.基于TCP的请求/响应式 ...
- C# 条件与&&与条件或||的使用总结
CSDN说明: 条件“或”运算符 (||) 执行 bool 操作数的逻辑“或”运算,但仅在必要时才计算第二个操作数. 件“与”运算符 (&&) 执行其 bool 操作数的逻辑“与”运算 ...
- hdu图论题目分类
=============================以下是最小生成树+并查集====================================== [HDU] 1213 How Many ...
- jQuery全屏滚动插件fullPage.js使用中遇到的问题(滑动轮播效果无效)
1.fullPage.js是什么? fullPage.js 是一个基于 jQuery ,用来制作全屏网站的插件. 2.兼容性: - jquery兼容:兼容 jQuery 1.7+ - 浏览器兼容: I ...
- matlab 中的删除文件
Matlab中有两种删除文件的方式: 一种是删除文件 delete()函数 //可以使用help delete命令查询delete()函数的使用方法 delete('p1.jpg' ...
- Python Base Three
//sixth day to study python(2016/8/7) 32. In python , there are have an special type dictionary , it ...
- [bzoj1095][ZJOI2007]Hide 捉迷藏 点分树,动态点分治
[bzoj1095][ZJOI2007]Hide 捉迷藏 2015年4月20日7,8876 Description 捉迷藏 Jiajia和Wind是一对恩爱的夫妻,并且他们有很多孩子.某天,Jiaji ...
- C#递归删除进程及其子进程
/// <summary> /// 结束进程和相关的子进程 /// </summary> /// <param name="pid">需要结束的 ...
- poj 4001 To Miss Our Children Time
To Miss Our Children Time Time Limit: 2000/1000 MS (Java/Others) Memory Limit: 65768/65768 K (Jav ...