import requests, re, json, os, time
from fake_useragent import UserAgent
from lxml import etree
from urllib import parse


class MyError(Exception):
    """Project error carrying a numeric status code plus a human-readable message."""

    def __init__(self, status, msg):
        self.status = status
        self.msg = msg


class WyRinking():
    """Scrape every NetEase Cloud Music toplist and dump each chart as a JSON-lines file.

    Entry point: collectRanking(). Requires the module-level constant
    Rink_BASE_PATH (output directory) to be defined before use.
    """

    def __init__(self):
        ua = UserAgent()
        # NOTE(review): 'stratUrl' ('start' typo) kept as-is for compatibility.
        self.stratUrl = "https://music.163.com/discover/toplist"
        self.headers = {
            "User-Agent": ua.random
        }
        self.timeout = 10
        self.allow_redirects = False
        self.nameList = []  # toplist display names
        self.urlList = []   # toplist relative URLs (e.g. /discover/toplist?id=...)

    def __getRinkNameUrl(self, response):
        '''Collect every toplist name and its relative URL from the landing page.'''
        html_selector = self.__etreeSelector(response)
        self.nameList = html_selector.xpath(
            "//div[contains(@class,'item') and contains(@class,'f-cb')]/p[@class='name']/a/text()") or []
        self.urlList = html_selector.xpath(
            "//div[contains(@class,'item') and contains(@class,'f-cb')]/p[@class='name']/a/@href") or []

    def __getPageHtml(self, url):
        '''GET *url* and return the Response; on timeout, retry recursively.'''
        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout,
                                    allow_redirects=self.allow_redirects)
            return response
        except requests.exceptions.Timeout as e:
            print("Timeout Error>>:", e)
            # BUG FIX: the retry's result must be returned, otherwise the
            # caller silently receives None after any timeout.
            return self.__getPageHtml(url=url)

    def __getRankHtml(self):
        '''Fetch each toplist page and hand it to __getRankInfo for extraction.'''
        if not self.nameList and not self.urlList:
            raise MyError(10000, "{},{} 数据不能为空".format(self.nameList, self.urlList))
        if len(self.nameList) != len(self.urlList):
            raise MyError(10001, "nameList,urlList数据不能一一对应")
        for i in range(len(self.urlList)):
            url = parse.urljoin(self.stratUrl, url=self.urlList[i])
            response = self.__getPageHtml(url=url)
            # Attach the chart name to the response so __getRankInfo can
            # use it as part of the output file name.
            response.customizeName = self.nameList[i]
            self.__getRankInfo(response)

    def __getRankInfo(self, response):
        '''Extract the embedded JSON song list from the page and write it to a file.'''
        html_selector = self.__etreeSelector(response)
        test = html_selector.xpath("//*[@id='song-list-pre-data']/text()")[0] or ""
        updateTime = html_selector.xpath("//span[contains(@class,'sep') and contains(@class,'s-fc3')]/text()")[0]
        try:
            data = json.loads(test)
        except json.decoder.JSONDecodeError:
            # The embedded JSON is sometimes truncated by the page template;
            # appending the closing tokens recovers a parseable document.
            data = json.loads(test + '"}}]')
        fileName = response.customizeName + '--' + updateTime + ".json"
        # BUG FIX: referencing an undefined Rink_BASE_PATH used to raise a bare
        # NameError; look it up safely so the intended MyError(10005) is raised.
        basePath = globals().get("Rink_BASE_PATH")
        if not basePath:
            raise MyError(10005, "需要在全局中配置该参数Rink_BASE_PATH,用于文件存放地址")
        if not os.path.exists(basePath):
            os.makedirs(basePath)
        path = os.path.join(basePath, fileName)
        self.__writeToFile(path, data)

    def __writeToFile(self, path, data):
        '''Write one JSON object per line: rank number, song id/name, artists, comment thread.'''
        print('正在写入文件{}.json'.format(path))
        with open(path, "w", encoding="utf-8") as f:
            for index, data_dic in enumerate(data, 1):
                dic = {}
                dic["rankNum"] = index
                dic["songId"] = data_dic.get("id")
                dic["songName"] = data_dic.get("name")
                dic["artistsInfo"] = data_dic.get("artists")
                dic["commentThreadId"] = data_dic.get("commentThreadId")
                f.write(json.dumps(dic, ensure_ascii=False) + "\n")

    def __reSongId(self, songurl: str):
        '''Extract the numeric song id from a URL shaped like "/song?id=1336871144".

        :raises MyError: code 10002 when no id parameter is present.
        '''
        pattern = r"id=(\d+)"
        try:
            id = re.findall(pattern, songurl)[0]
        except IndexError:
            raise MyError(10002, "歌曲id获取失败")
        return id

    def collectRanking(self):
        '''Public entry point: crawl all NetEase Cloud Music toplists.'''
        response = self.__getPageHtml(url=self.stratUrl)
        self.__getRinkNameUrl(response)
        self.__getRankHtml()

    def __etreeSelector(self, response):
        '''Parse the response body into an lxml element tree for XPath queries.'''
        return etree.HTML(response.text)


class WySinger():
    """Scrape every NetEase Cloud Music singer (all categories, A-Z) and save
    each singer's bio plus track list to a text file under Singer_BASE_PATH.

    Entry point: collectSinger().
    """

    # The A-Z "initial" ids only need to be scraped once (from the first
    # category page); this flag guards that one-time fetch.
    __isFirstStatus = True

    def __init__(self):
        ua = UserAgent()
        self.stratUrl = "https://music.163.com/discover/artist"
        self.headers = {
            "User-Agent": ua.random
        }
        self.timeout = 10
        self.allow_redirects = False
        self.sCategoryNameList = []  # singer category names
        self.sCategoryIdList = []    # singer category ids (data-cat)
        self.sCategoryUrlList = []   # absolute category URLs
        self.initialIdList = []      # "initial" query values (-1, 65..90, 0)
        self.markList = []           # display labels for the initials (热门, A-Z, 其他)

    def __getPageHtml(self, url):
        '''GET *url* and return the Response; on timeout, retry recursively.'''
        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout,
                                    allow_redirects=self.allow_redirects)
            return response
        except requests.exceptions.Timeout as e:
            print("Timeout Error>>:", e)
            # BUG FIX: return the retry's result (was dropped, yielding None).
            return self.__getPageHtml(url=url)

    def __getSingerCategory(self, response):
        '''Collect the singer category names/ids/URLs from the artist landing page.'''
        htmlSelector = self.__etreeSelector(response)
        sCategoryNameList = htmlSelector.xpath(
            "//*[@id='singer-cat-nav']/div[@class='blk']//li/a[@class='cat-flag']/text()")
        sCategoryIdList = htmlSelector.xpath(
            "//*[@id='singer-cat-nav']/div[@class='blk']//li/a[@class='cat-flag']/@data-cat")
        sCategoryUrlList = htmlSelector.xpath(
            "//*[@id='singer-cat-nav']/div[@class='blk']//li/a[@class='cat-flag']/@href")
        if sCategoryUrlList and len(sCategoryNameList) == len(sCategoryIdList) == len(sCategoryUrlList):
            self.sCategoryNameList = sCategoryNameList or []
            self.sCategoryIdList = sCategoryIdList or []
            self.sCategoryUrlList = [parse.urljoin(self.stratUrl, url) for url in sCategoryUrlList or []]

    def __getSingerListPage(self):
        '''Generator: yield the response of every category/initial singer-list page.'''
        if not self.sCategoryNameList and not self.sCategoryUrlList:
            raise MyError(10000, "{},{} 数据不能为空".format(self.sCategoryNameList, self.sCategoryUrlList))
        if len(self.sCategoryNameList) != len(self.sCategoryUrlList):
            raise MyError(10001, "nameList,urlList数据不能一一对应")
        for sCategoryUrl in self.sCategoryUrlList:
            response = self.__getPageHtml(sCategoryUrl)
            if self.__isFirstStatus:
                self.__getInitialId(response)
                self.__isFirstStatus = False
            for inintalId in self.initialIdList:
                if inintalId == "-1":
                    # "-1" is the "hot" tab, whose singers duplicate the
                    # per-initial pages, so skip it.
                    continue
                url = sCategoryUrl + "&initial=" + inintalId
                res = self.__getPageHtml(url)
                yield res

    def __getSingerIdUrl(self, response):
        '''Generator: yield a list of (singerUrl, singerName) tuples for one list page.'''
        htmlSelector = self.__etreeSelector(response)
        aSelector = htmlSelector.xpath(
            "//*[@id='m-artist-box']//a[@class='msk'] | //*[@id='m-artist-box']/li[@class='sml']/a[1]")
        singerUrlList = [parse.urljoin(self.stratUrl, selector.xpath("@href")[0]) for selector in aSelector]
        singerNameList = [selector.xpath("@title")[0].replace("的音乐", "") for selector in aSelector]
        if singerUrlList and len(singerUrlList) == len(singerNameList):
            yield list(zip(singerUrlList, singerNameList))
        else:
            yield []

    def __getInitialId(self, response):
        '''Scrape the A-Z "initial" ids and their display labels (one-time).'''
        htmlSelector = self.__etreeSelector(response)
        urlList = htmlSelector.xpath("//*[@id='initial-selector']/li/a/@href")
        initialIdList = [self.__reInitialId(url) for url in urlList]
        markList = htmlSelector.xpath("//*[@id='initial-selector']/li/a/text()")
        if len(initialIdList) == len(markList):
            self.initialIdList = initialIdList
            self.markList = markList

    def __reInitialId(self, url):
        '''Extract the "initial" value from a URL like "/discover/artist/cat?id=1001&initial=-1".'''
        pattern = r"initial=(.*)"
        initialId = re.findall(pattern, url, re.S)[0]
        return initialId

    def __getSingerDetails(self, response):
        '''Return (trackList, bio) parsed from a singer page, or (None, None) when absent.'''
        htmlSelector = self.__etreeSelector(response)
        try:
            data_json = htmlSelector.xpath("//*[@id='song-list-pre-data']/text()")[0]
            data_list = json.loads(data_json, strict=False)
            singerDetails_json = htmlSelector.xpath("//script[@type='application/ld+json']/text()")[0]
            singerDetails_dict = json.loads(singerDetails_json, strict=False)
            singerDetails_content = singerDetails_dict.get("description")
            return data_list, singerDetails_content
        except Exception as e:
            # Some artists have no published tracks, so the [0] index above
            # raises; swallow it and signal "nothing to write" to the caller.
            print(e)
            return None, None

    def __writeToFile(self, datalist, singerDetails_content, singerName):
        '''Write one singer's bio and track list to <Singer_BASE_PATH>/<singerName>.txt.'''
        # NOTE(review): assumes the module-level Singer_BASE_PATH is defined,
        # mirroring the original behavior (NameError otherwise).
        if not os.path.exists(Singer_BASE_PATH):
            os.makedirs(Singer_BASE_PATH)
        path = os.path.join(Singer_BASE_PATH, singerName)
        print("正在写入{}".format(singerName))
        with open(path + ".txt", 'w', encoding="utf-8") as f:
            f.write("歌手简介:{}".format(singerDetails_content) + "\n")
            for data in datalist:
                f.write("-" * 50 + "\n")
                f.write("歌曲名:{}".format(data.get("name")) + "\n")
                f.write("歌曲ID:{}".format(data.get("privilege").get("id")) + "\n")
                f.write("歌曲专辑:{}".format(data.get("album").get("name")) + "\n")
                f.write("歌曲别号:{}".format("无" if not data.get("alias") else data.get("alias")) + "\n")

    def __etreeSelector(self, response):
        '''Parse the response body into an lxml element tree for XPath queries.'''
        return etree.HTML(response.text)

    def collectSinger(self):
        '''Public entry point: crawl every singer in every category and save their details.'''
        response = self.__getPageHtml(url=self.stratUrl)
        self.__getSingerCategory(response)
        resGenerator = self.__getSingerListPage()
        for res in resGenerator:
            time.sleep(1)  # be polite: throttle one list page per second
            # Each yielded item is the list of (url, name) tuples for one page.
            for singerPairs in self.__getSingerIdUrl(res):
                for singerUrl, singerName in singerPairs:
                    singerResponse = self.__getPageHtml(singerUrl)
                    datalist, singerDetails_content = self.__getSingerDetails(singerResponse)
                    if not datalist and not singerDetails_content:
                        continue
                    self.__writeToFile(datalist, singerDetails_content, singerName)


if __name__ == '__main__':
    Rink_BASE_PATH = r"D:\spidersData\Rinking"
    Singer_BASE_PATH = r"D:\spidersData\SingerInfo"
    wangyiyun = WyRinking()
    wangyiyun.collectRanking()  # crawl the NetEase Cloud Music toplists
    wangyiyun = WySinger()
    wangyiyun.collectSinger()   # crawl every NetEase Cloud Music singer and their works

python3爬虫-网易云排行榜,网易云歌手及作品的更多相关文章

  1. python3爬虫应用--爬取网易云音乐(两种办法)

    一.需求 好久没有碰爬虫了,竟不知道从何入手.偶然看到一篇知乎的评论(https://www.zhihu.com/question/20799742/answer/99491808),一时兴起就也照葫 ...

  2. Python爬虫——request实例:爬取网易云音乐华语男歌手top10歌曲

    requests是python的一个HTTP客户端库,跟urllib,urllib2类似,但比那两个要简洁的多,至于request库的用法, 推荐一篇不错的博文:https://cuiqingcai. ...

  3. 爬虫综合大作业——网易云音乐爬虫 & 数据可视化分析

    作业要求来自于https://edu.cnblogs.com/campus/gzcc/GZCC-16SE2/homework/3075 爬虫综合大作业 选择一个热点或者你感兴趣的主题. 选择爬取的对象 ...

  4. 在Ubuntu18.04.2LTS上使用wine安装qq,微信,迅雷,百度网盘,网易云音乐等软件

    在Ubuntu18.04.2LTS上使用wine安装qq,微信,迅雷,百度网盘,网易云音乐等软件 一.前言 在Linux上办公有一点一直是大家的痛,那就是这些系统上没有我们常用的一些软件,比如QQ,微 ...

  5. Python实现简单的爬虫获取某刀网的更新数据

    昨天晚上无聊时,想着练习一下Python所以写了一个小爬虫获取小刀娱乐网里的更新数据 #!/usr/bin/python # coding: utf-8 import urllib.request i ...

  6. Python爬虫爬取全书网小说,程序源码+程序详细分析

    Python爬虫爬取全书网小说教程 第一步:打开谷歌浏览器,搜索全书网,然后再点击你想下载的小说,进入图一页面后点击F12选择Network,如果没有内容按F5刷新一下 点击Network之后出现如下 ...

  7. python3爬虫系列19之反爬随机 User-Agent 和 ip代理池的使用

    站长资讯平台:python3爬虫系列19之随机User-Agent 和ip代理池的使用我们前面几篇讲了爬虫增速多进程,进程池的用法之类的,爬虫速度加快呢,也会带来一些坏事. 1. 前言比如随着我们爬虫 ...

  8. [原]openstack-kilo--issue(七):虚拟机怎么通外网,外网怎么ping通虚拟机

    =====问题======= 虚拟机可以ping通外网,外网能ping通虚拟机但是收不到reply 这个问题本人遇到有两种情况: 1.安装完整openstack-kilo后,在route中和虚拟机中抓 ...

  9. C#获取内网和外网IP

    写了个小客户端,里面用到了获取内网和外网的IP地址,代码如下: // InnerIP var ipHost = Dns.Resolve(Dns.GetHostName()); ]; innerIP = ...

随机推荐

  1. 软件项目技术点(6)——结合鼠标操作绘制动态canvas画布

    AxeSlide软件项目梳理   canvas绘图系列知识点整理 我们创建一个类封装了所有鼠标需要处理的事件. export class MouseEventInfo { el: HTMLElemen ...

  2. Quick Easy FTP Server FTP工具文件传输使用

    1 工具配置 2 linux 下ftp命令上传和下载文件到FTP_DIR目录 1 ftp 172.16.18.292 输入用户名3 输入密码 4 ls或dir 查看目录及文件  lcd进入本地目录  ...

  3. 精通Groovy

    https://www.ibm.com/developerworks/cn/education/java/j-groovy/j-groovy.html https://juejin.im/entry/ ...

  4. 用两个栈实现队列(C++ 和 Python 实现)

    (说明:本博客中的题目.题目详细说明及参考代码均摘自 “何海涛<剑指Offer:名企面试官精讲典型编程题>2012年”) 题目 用两个栈实现一个队列.队列的声明如下,请实现它的两个函数 a ...

  5. “云中论道”之——使用开源技术和Azure公有云服务快速搭建云端IoT解决方案(上)

    “云中论道”技术课堂第一课开讲啦!微软各路技术咖们齐聚一堂,为大家带来干货不断!作为“云中论道“课堂的开课之作,我们首先邀请到了微软Azure专家级的架构师:槐长清,他为我们带来了关于“使用开源技术和 ...

  6. C++数组怎么复制

    C++数组怎么复制: #include <string.h>main(){int a[10]={34,56,4,10,77,51,93,30,5,52};int b[10];memcpy( ...

  7. 查询组成员(group)

    查询组成员 $groupname = "groupname" $members = (get-adgroup $groupname -properties member).memb ...

  8. js笔记 标签: javascript 2016-08-01 13:30 75人阅读 评论(0) 收藏

    typeof可以用来检测给定变量的数据类型,typeof是一个操作符而不是函数,所以圆括号可以省略. Undefined类型只有一个值,即特殊的undefined.在使用var声明变量但未对其加以初始 ...

  9. AutoHotkey使用Excel的Com对象可能导致进程残留问题的原因及解决方案

    在AutoHotkey脚本中,对Excel的应用体验很不错,xl := ComObjActive("Excel.Application")就和当前Excel表连接了, 通过xl变量 ...

  10. lua之m进制转换为n进制-任意进制转换算法

    够无聊的写这个,为防止需要的人也无聊一遍,写个吧 算法有n种,但是,咱们一种就够用了 --数组倒序排列 local function orderByDesc( input ) local output ...