• 1、初衷:想在网上批量下载点听书、脱口秀之类,资源匮乏,大家可以一试
  • 2、技术:wireshark、scrapy、json、MongoDB
  • 3、思路:用wireshark分析移动APP返回的各种链接(分类、列表、下载地址等,均为json格式)
  • 4、思路:scrapy解析json,并生成下载链接
  • 5、思路:存储到MongoDB
  • 6、难点:用wireshark分析各类地址;其余都是scrapy的基础用法,官网的说明文档里都有
  • 7、按照:tree /F生成的文件目录进行说明吧

1 items.py 字段设置,根据需要改变

'''

from scrapy import Item,Field

class QtscrapyItem(Item):
    """Record for one audio entry collected by the spider.

    The same item shape is shared by every source site; fields a site does
    not provide are stored as an empty string by the spider.
    """

    # Identifier of the entry on the source site, stored as a string.
    id = Field()

    # "<parent id>_<parent name>" of the album/book the entry belongs to.
    parent_info = Field()

    # Display title of the entry.
    title = Field()

    # Date part of the last-update timestamp ("" when the site gives none).
    update_time = Field()

    # URL the audio file can be fetched from.
    file_path = Field()

    # Short tag naming the site the entry came from.
    source = Field()

'''

2 pipelines.py 字段设置及相关处理,根据需要改变

'''

import pymongo as pymongo

from scrapy import signals

import json

import codecs

from scrapy.conf import settings

class QtscrapyPipeline(object):
    """Item pipeline that appends every item to a JSON-lines file.

    Each processed item becomes one UTF-8 encoded JSON object per line in
    ``qingting_209.json`` (created or truncated in the working directory).
    """

    def __init__(self):
        # The constructor must be spelled __init__; a method merely called
        # "init" is never invoked, which left self.file undefined.
        self.file = codecs.open('qingting_209.json', 'wb', encoding='utf-8')

    def process_item(self, item, spider):
        """Write *item* as one JSON line and pass it on unchanged.

        Args:
            item: mapping-like item produced by the spider.
            spider: the spider that produced the item (unused).

        Returns:
            The same *item*, so later pipelines can keep processing it.
        """
        # ensure_ascii=False keeps non-ASCII titles readable in the file.
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        """Flush and close the output file when the crawl ends."""
        self.file.close()

class QtscrapyMongoPipeline(object):
    """Item pipeline that stores every item as a document in MongoDB.

    Connection parameters are read from the project settings keys
    MONGODB_HOST, MONGODB_PORT, MONGODB_DBNAME and MONGODB_DOCNAME.
    """

    def __init__(self):
        # The constructor must be spelled __init__; a method merely called
        # "init" is never invoked, which left self.post undefined.
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        db_name = settings['MONGODB_DBNAME']
        # Keep a handle on the client so it can be closed with the spider.
        self.client = pymongo.MongoClient(host=host, port=port)
        database = self.client[db_name]
        self.post = database[settings['MONGODB_DOCNAME']]

    def process_item(self, item, spider):
        """Insert *item* into the configured collection and pass it on.

        Args:
            item: mapping-like item produced by the spider.
            spider: the spider that produced the item (unused).

        Returns:
            The same *item*, so later pipelines can keep processing it.
        """
        document = dict(item)
        # NOTE(review): Collection.insert is deprecated in PyMongo 3 and
        # removed in PyMongo 4 (insert_one replaces it) -- confirm the
        # installed driver version before switching.
        self.post.insert(document)
        return item

    def close_spider(self, spider):
        """Release the MongoDB connection when the crawl ends."""
        self.client.close()

'''

3 settings.py 基础配置 配置数据库存储相关 QtscrapyPipeline 来自pipelines.py中定义的类

'''

# Pipelines enabled for the crawl, mapped to their run order. Swap in
# 'qtscrapy.pipelines.QtscrapyPipeline' (same order value, 300) to write a
# JSON-lines file instead of storing items in MongoDB.
ITEM_PIPELINES = {'qtscrapy.pipelines.QtscrapyMongoPipeline': 300}

# MongoDB connection used by QtscrapyMongoPipeline.
MONGODB_HOST, MONGODB_PORT = '127.0.0.1', 12345

# Database and collection that receive the scraped items.
MONGODB_DBNAME, MONGODB_DOCNAME = 'qingtingDB', 'qingting'

'''

└─spiders

4 qingting.py 爬虫,各显神通

'''

from scrapy.spiders import BaseSpider

from scrapy.http import Request

import sys, json

from qtscrapy.items import QtscrapyItem

from scrapy_redis.spiders import RedisSpider

# Python 2 only: make implicit str/unicode conversions use UTF-8 so that
# Chinese titles survive str() and "%s" formatting. sys.setdefaultencoding
# is removed from the module at interpreter start-up, so sys must be
# reloaded to get it back. Python 3 has no reload() builtin and already
# defaults to UTF-8, so the hack is skipped there instead of raising
# NameError.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding("utf-8")

# 1 酷我听书地址分析(依次为:首页、分类列表、分类下的专辑、专辑详情)

# http://ts.kuwo.cn/service/gethome.php?act=new_home

# http://ts.kuwo.cn/service/getlist.v31.php?act=catlist&id=97

# http://ts.kuwo.cn/service/getlist.v31.php?act=cat&id=21&type=hot

# http://ts.kuwo.cn/service/getlist.v31.php?act=detail&id=100102396

# 2 如需配合Redis使用,请将下面的类声明改为 class qtscrapy(RedisSpider):

class qtscrapy(BaseSpider):
    """Collect audio entries from three mobile-app JSON APIs.

    One spider handles all three sites (api2.qingting.fm, ts.kuwo.cn and
    api.mting.info). ``parse`` receives the start/listing responses and
    picks the site from the response URL; the ``get_*`` callbacks walk
    down each site's category -> album -> entry hierarchy and finally
    yield ``QtscrapyItem`` objects. How many levels have to be followed is
    dictated by the structure of each app's API.
    """

    name = "qingting"

    # Only relevant when the spider is fed its start URLs from Redis.
    # redis_key = 'qingting:start_urls'

    base_url = "http://api2.qingting.fm/v6/media/recommends/guides/section/"

    start_urls = [
        "http://api2.qingting.fm/v6/media/recommends/guides/section/0",
        "http://ts.kuwo.cn/service/gethome.php?act=new_home",
        "http://api.mting.info/yyting/bookclient/ClientTypeResource.action?type=0&pageNum=0&pageSize=500&token=_4WfzpCah8ujgJZZzboaUGkJQvWGfEEL-zdukwv7lbY*&q=0&imei=ODY1MTY2MDIxNzMzNjI0",
    ]

    allowed_domains = ["api2.qingting.fm", "ts.kuwo.cn", "api.mting.info"]

    def parse(self, response):
        """Dispatch a start/listing response to the handler for its site.

        The site is recognised from a marker in the response URL. Whether
        to run one spider per site or keep this mixed dispatch was left as
        an open design question by the author.

        Yields:
            Request objects for the next level of the matching site.
        """
        dispatch = (
            ("qingting", self._parse_qt_sections),
            ("kuwo", self._parse_kw_home),
            ("mting", self._parse_lr_types),
        )
        # Independent checks (not elif), as in the original three "if"s.
        for marker, handler in dispatch:
            if marker in response.url:
                for result in handler(response):
                    yield result

    def _load_json(self, response):
        """Return the body parsed as a JSON object, or None if it is not one."""
        try:
            # No "encoding" argument: it was ignored for text input and
            # json.loads rejects it on Python 3.9+.
            payload = json.loads(response.body)
        except ValueError as exc:
            self.logger.warning("invalid JSON from %s: %s", response.url, exc)
            return None
        if not isinstance(payload, dict):
            self.logger.warning("unexpected JSON payload from %s", response.url)
            return None
        return payload

    @staticmethod
    def _build_item(item_id, parent_info, title, update_time, file_path, source):
        """Assemble a QtscrapyItem from already-extracted values."""
        item = QtscrapyItem()
        item["id"] = str(item_id)
        item["parent_info"] = parent_info
        item["title"] = title
        item["update_time"] = update_time
        item["file_path"] = file_path
        item["source"] = source
        return item

    def _parse_qt_sections(self, response):
        """Yield program-list requests for one Qingting section, plus more sections."""
        qt_json = self._load_json(response)
        if qt_json is None:
            return
        for section in qt_json.get("data") or []:
            if section is None:
                continue
            for de in section.get("recommends") or []:
                parent = de.get("parent_info")
                if parent is None:
                    continue
                try:
                    jm_url = "http://api2.qingting.fm/v6/media/channelondemands/%(parent_id)s/programs/curpage/1/pagesize/1000" % parent
                except (KeyError, TypeError) as exc:
                    self.logger.warning("skipping qingting recommend without parent_id: %r", exc)
                    continue
                yield Request(jm_url, callback=self.get_qt_jmlist, meta={"de": de})
        # NOTE(review): the original indentation was lost; this loop is
        # assumed to sit at branch level (once per response) -- confirm.
        for section_no in range(0, 250):
            yield Request(self.base_url + str(section_no), callback=self.parse)

    def _parse_kw_home(self, response):
        """Yield one category-list request per category on the Kuwo home page."""
        kw_json = self._load_json(response)
        if kw_json is None:
            return
        for category in kw_json.get("cats") or []:
            try:
                kw_url = "http://ts.kuwo.cn/service/getlist.v31.php?act=catlist&id=%s" % category["Id"]
            except (KeyError, TypeError) as exc:
                self.logger.warning("skipping kuwo category without Id: %r", exc)
                continue
            yield Request(kw_url, callback=self.get_kw_catlist)

    def _parse_lr_types(self, response):
        """Yield book-list requests for listed mting types, plus a sweep of type ids."""
        lr_json = self._load_json(response)
        if lr_json is None:
            return
        for entry in lr_json.get("list") or []:
            try:
                lr_url = "http://api.mting.info/yyting/bookclient/ClientTypeResource.action?type=%(id)s&pageNum=0&pageSize=1000&sort=2&token=_4WfzpCah8ujgJZZzboaUGkJQvWGfEEL-zdukwv7lbY*&imei=ODY1MTY2MDIxNzMzNjI0" % entry
            except (KeyError, TypeError) as exc:
                self.logger.warning("skipping mting type without id: %r", exc)
                continue
            yield Request(lr_url, callback=self.get_lr_booklist)
        # NOTE(review): the original indentation was lost; this loop is
        # assumed to sit at branch level (once per response) -- confirm.
        # The original formatted the URL with an undefined name ("t"),
        # which raised NameError; the loop variable is what was meant.
        for type_id in range(-10, 1000):
            lr_url = "http://api.mting.info/yyting/bookclient/ClientTypeResource.action?type=%s&pageNum=0&pageSize=1000&token=_4WfzpCah8ujgJZZzboaUGkJQvWGfEEL-zdukwv7lbY*&q=0&imei=ODY1MTY2MDIxNzMzNjI0" % type_id
            yield Request(lr_url, callback=self.parse)

    def get_qt_jmlist(self, response):
        """Yield one item per program in a Qingting channel's program list.

        Expects ``response.meta["de"]`` to hold the recommend entry whose
        ``parent_info`` identifies the channel.
        """
        jm_json = self._load_json(response)
        if jm_json is None:
            return
        de = response.meta["de"]
        for jm_data in jm_json.get("data") or []:
            if jm_data is None:
                continue
            try:
                # The first listed bitrate variant supplies the audio path.
                file_path = "http://upod.qingting.fm/%(file_path)s?deviceid=ffffffff-ebbe-fdec-ffff-ffffb1c8b222" % \
                    jm_data["mediainfo"]["bitrates_url"][0]
                item = self._build_item(
                    item_id=jm_data["id"],
                    parent_info="%(parent_id)s_%(parent_name)s" % de["parent_info"],
                    title=jm_data["title"],
                    # Keep only the part before the first space.
                    update_time=str(jm_data["update_time"]).partition(" ")[0],
                    file_path=file_path,
                    source="qingting",
                )
            except (KeyError, IndexError, TypeError) as exc:
                self.logger.warning("skipping malformed qingting program: %r", exc)
                continue
            yield item

    def get_kw_catlist(self, response):
        """Yield one album-list request per sub-category of a Kuwo category."""
        kw_json = self._load_json(response)
        if kw_json is None or kw_json.get("sign") is None:
            return
        for data in kw_json.get("list") or []:
            try:
                kw_p_url = "http://ts.kuwo.cn/service/getlist.v31.php?act=cat&id=%s&type=hot" % data["Id"]
            except (KeyError, TypeError) as exc:
                self.logger.warning("skipping kuwo sub-category without Id: %r", exc)
                continue
            yield Request(kw_p_url, callback=self.get_kw_cat)

    def get_kw_cat(self, response):
        """Yield one detail request per album listed in a Kuwo sub-category."""
        kw_json = self._load_json(response)
        if kw_json is None or kw_json.get("sign") is None:
            return
        for data in kw_json.get("list") or []:
            try:
                album_id = data["Id"]
                # A fresh dict per request: sharing one mutable dict made
                # every pending request see the last album's id and name.
                p_info = {"p_id": album_id, "p_name": data["Name"]}
            except (KeyError, TypeError) as exc:
                self.logger.warning("skipping malformed kuwo album: %r", exc)
                continue
            kw_pp_url = "http://ts.kuwo.cn/service/getlist.v31.php?act=detail&id=%s" % album_id
            yield Request(kw_pp_url, callback=self.get_kw_jmlist, meta={"p_info": p_info})

    def get_kw_jmlist(self, response):
        """Yield one item per chapter of a Kuwo album.

        Expects ``response.meta["p_info"]`` to hold the album's ``p_id``
        and ``p_name``.
        """
        jm_json = self._load_json(response)
        if jm_json is None:
            return
        p_info = response.meta["p_info"]
        for jm_data in jm_json.get("Chapters") or []:
            if jm_data is None:
                continue
            try:
                item = self._build_item(
                    item_id=jm_data["Id"],
                    parent_info="%(p_id)s_%(p_name)s" % p_info,
                    title=jm_data["Name"],
                    update_time="",
                    file_path="http://cxcnd.kuwo.cn/tingshu/res/WkdEWF5XS1BB/%s" % jm_data["Path"],
                    source="kuwo",
                )
            except (KeyError, TypeError) as exc:
                self.logger.warning("skipping malformed kuwo chapter: %r", exc)
                continue
            yield item

    def get_lr_booklist(self, response):
        """Yield one chapter-list request per book in an mting type listing."""
        s_lr_json = self._load_json(response)
        if s_lr_json is None:
            return
        for s_lr in s_lr_json.get("list") or []:
            try:
                s_lr_url = "http://api.mting.info/yyting/bookclient/ClientGetBookResource.action?bookId=%(id)s&pageNum=1&pageSize=2000&sortType=0&token=_4WfzpCah8ujgJZZzboaUGkJQvWGfEEL-zdukwv7lbY*&imei=ODY1MTY2MDIxNzMzNjI0" % s_lr
                book = {"id": s_lr["id"], "name": s_lr["name"]}
            except (KeyError, TypeError) as exc:
                self.logger.warning("skipping malformed mting book: %r", exc)
                continue
            yield Request(s_lr_url, callback=self.get_lr_kmlist, meta={"meta": book})

    def get_lr_kmlist(self, response):
        """Yield one item per chapter of an mting book.

        Expects ``response.meta["meta"]`` to hold the book's ``id`` and
        ``name``.
        """
        ss_lr_json = self._load_json(response)
        if ss_lr_json is None:
            return
        parent = response.meta["meta"]
        for ss_lr in ss_lr_json.get("list") or []:
            try:
                item = self._build_item(
                    item_id=ss_lr["id"],
                    parent_info="%(id)s_%(name)s" % parent,
                    title=ss_lr["name"],
                    update_time="",
                    file_path=ss_lr["path"],
                    source="lr",
                )
            except (KeyError, TypeError) as exc:
                self.logger.warning("skipping malformed mting chapter: %r", exc)
                continue
            yield item

'''

5 结果展示,爬取了大概40万记录

python scrapy+Mongodb爬取蜻蜓FM,酷我及懒人听书的更多相关文章

  1. 教程+资源,python scrapy实战爬取知乎最性感妹子的爆照合集(12G)!

    一.出发点: 之前在知乎看到一位大牛(二胖)写的一篇文章:python爬取知乎最受欢迎的妹子(大概题目是这个,具体记不清了),但是这位二胖哥没有给出源码,而我也没用过python,正好顺便学一学,所以 ...

  2. Python scrapy框架爬取瓜子二手车信息数据

    项目实施依赖: python,scrapy ,fiddler scrapy安装依赖的包: 可以到https://www.lfd.uci.edu/~gohlke/pythonlibs/  下载 pywi ...

  3. python scrapy框架爬取豆瓣

    刚刚学了一下,还不是很明白.随手记录. 在piplines.py文件中 将爬到的数据 放到json中 class DoubanmoviePipelin2json(object):#打开文件 open_ ...

  4. Python+Scrapy+Crawlspider 爬取数据且存入MySQL数据库

    1.Scrapy使用流程 1-1.使用Terminal终端创建工程,输入指令:scrapy startproject ProName 1-2.进入工程目录:cd ProName 1-3.创建爬虫文件( ...

  5. 如何利用Python网络爬虫爬取微信朋友圈动态--附代码(下)

    前天给大家分享了如何利用Python网络爬虫爬取微信朋友圈数据的上篇(理论篇),今天给大家分享一下代码实现(实战篇),接着上篇往下继续深入. 一.代码实现 1.修改Scrapy项目中的items.py ...

  6. python+selenium+PhantomJS爬取网页动态加载内容

    一般我们使用python的第三方库requests及框架scrapy来爬取网上的资源,但是设计javascript渲染的页面却不能抓取,此时,我们使用web自动化测试化工具Selenium+无界面浏览 ...

  7. 使用scrapy框架爬取自己的博文(2)

    之前写了一篇用scrapy框架爬取自己博文的博客,后来发现对于中文的处理一直有问题- - 显示的时候 [u'python\u4e0b\u722c\u67d0\u4e2a\u7f51\u9875\u76 ...

  8. Scrapy+selenium爬取简书全站

    Scrapy+selenium爬取简书全站 环境 Ubuntu 18.04 Python 3.8 Scrapy 2.1 爬取内容 文字标题 作者 作者头像 发布日期 内容 文章连接 文章ID 思路 分 ...

  9. [python] 常用正则表达式爬取网页信息及分析HTML标签总结【转】

    [python] 常用正则表达式爬取网页信息及分析HTML标签总结 转http://blog.csdn.net/Eastmount/article/details/51082253 标签: pytho ...

随机推荐

  1. GNOME on Arch Linux

    Arch Linux上Gnome桌面截图欣赏: 相比而言,Debian的壁纸一直好像格调不够啊:

  2. Java一步一步构建web系统 在IDEA下用Maven搭建多模块项目

    1.需求 做一个项目会有很多模块,主要是方便复用,通过各个模块之间聚合.模块也可以独立出来,如公用类库,也可以在做其它项目中使用.该文的实例会有两个模块:分别为dallin-web模块,dallin- ...

  3. 使用Apache的Base64类实现Base64加解密

    包名称:org.apache.commons.codec.binary 类名称:org.apache.commons.codec.binary.Base64 1.Base64加密 public sta ...

  4. GJM : 用JIRA管理你的项目(一)JIRA环境搭建 [转载]

    感谢您的阅读.喜欢的.有用的就请大哥大嫂们高抬贵手"推荐一下"吧!你的精神支持是博主强大的写作动力以及转载收藏动力.欢迎转载! 版权声明:本文原创发表于 [请点击连接前往] ,未经 ...

  5. Could not publish to the server. java.lang.NullPointerException

    右键单击tomcat服务器,找到Properties,点下switch location就好了.

  6. Sending e-mail

    E-mail functionality uses the Apache Commons Email library under the hood. You can use theplay.libs. ...

  7. CSDN数据库被爆 统计CSDN用户都喜欢哪些密码

    今天有黑客在网上公开了知名网站CSDN的用户数据库,这是一次严重的暴库泄密事件,涉及到的账户总量高达600万个.有人写了一个小程序,统计了这次公布的 6428632 个 CSDN 哪些密码出镜率较高? ...

  8. HTML5 Content Editable实践

    基于此开发文档:https://developer.mozilla.org/zh-CN/docs/Web/Guide/HTML/Content_Editable 问题:通过contenteditabl ...

  9. 【探讨】javascript事件机制底层实现原理

    前言 又到了扯淡时间了,我最近在思考javascript事件机制底层的实现,但是暂时没有勇气去看chrome源码,所以今天我来猜测一把 我们今天来猜一猜,探讨探讨,javascript底层事件机制是如 ...

  10. 为Sharepoint 2010 批量创建SharePoint测试用户

    无意搜到下面一篇文章,http://www.cnblogs.com/lambertqin/archive/2012/04/19/2457372.html,原作者写的太"高大上",可 ...