Crawling Kugou Music singers and song titles with Scrapy and storing them in MongoDB
Code comments are still to be written. The crawl covered more than 8,000 singers; at an average of about thirty songs per singer, that comes to roughly 200,000+ songs.
run.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'Zqf'
from kugoumusic.spiders.kugou import KugouSpiders
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings  # load this project's settings.py

settings = get_project_settings()
process = CrawlerProcess(settings=settings)  # more than one spider can be queued on the same process
process.crawl(KugouSpiders)
process.start()  # start crawling; blocks until the crawl finishes
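Since each crawl() call queues one more spider on the same CrawlerProcess, several spiders can share a process. A minimal sketch of that pattern (AnotherSpider is hypothetical, not part of this project):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from kugoumusic.spiders.kugou import KugouSpiders

process = CrawlerProcess(settings=get_project_settings())
process.crawl(KugouSpiders)
# process.crawl(AnotherSpider)  # hypothetical second spider, queued the same way
process.start()  # runs every queued spider; blocks until all of them finish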
kugou.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'Zqf'

import scrapy
from kugoumusic.items import KugoumusicItem
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule


class KugouSpiders(scrapy.spiders.CrawlSpider):
    name = 'kugou'
    start_urls = ['http://www.kugou.com/']

    rules = (
        # Follow the singer index page and its per-letter listing pages (no callback).
        Rule(LinkExtractor(allow=[r'http://www.kugou.com/yy/html/singer.html',
                                  r'http://www.kugou.com/yy/singer/index/\d-([a-z]|null)-1.html'])),
        # Hand every singer home page to parse_item.
        Rule(LinkExtractor(allow=[r'http://www.kugou.com/yy/singer/home/\d+.html']), callback='parse_item')
    )

    def parse_item(self, response):
        singer = response.xpath('//div/div[@class="clear_fix"]/strong/text()').extract_first()
        print(singer)
        songs = response.xpath('//ul[@id="song_container"]/li//span[@class="text"]/i/text()').extract()
        print(songs)

        item = KugoumusicItem()
        item['singer'] = singer
        item['songs'] = songs
        yield item
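The first Rule only follows links (the singer index and its per-letter listing pages), while the second sends every singer home page to parse_item. The two XPath expressions can be sanity-checked offline with a plain Selector; the sample markup below is a guess at the page structure for illustration, not a copy of the real Kugou HTML:

from scrapy.selector import Selector

sample = '''
<div><div class="clear_fix"><strong>Sample Singer</strong></div></div>
<ul id="song_container">
  <li><span class="text"><i>Song A</i></span></li>
  <li><span class="text"><i>Song B</i></span></li>
</ul>
'''

sel = Selector(text=sample)
print(sel.xpath('//div/div[@class="clear_fix"]/strong/text()').extract_first())
# -> Sample Singer
print(sel.xpath('//ul[@id="song_container"]/li//span[@class="text"]/i/text()').extract())
# -> ['Song A', 'Song B']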
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class KugoumusicItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    singer = scrapy.Field()
    songs = scrapy.Field()
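The pipeline below stores each item with dict(item), which works because scrapy.Item supports dict conversion. A quick illustration with made-up values:

from kugoumusic.items import KugoumusicItem

item = KugoumusicItem()
item['singer'] = 'Sample Singer'
item['songs'] = ['Song A', 'Song B']
print(dict(item))  # {'singer': 'Sample Singer', 'songs': ['Song A', 'Song B']}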
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

from pymongo import MongoClient


class KugoumusicPipeline(object):

    def open_spider(self, spider):
        # mongo_config = spider.settings['MONGO_CONFIG']
        # host = '127.0.0.1', port = 27017
        self.client = MongoClient(host='127.0.0.1', port=27017)
        self.coll = self.client['student_db']['kugou']
        self.li = []  # buffer: items are written to MongoDB in batches of 100

    def close_spider(self, spider):
        if self.li:  # flush the remainder; insert_many() rejects an empty list
            self.insert()
        self.client.close()

    def insert(self):
        self.coll.insert_many(self.li)

    def process_item(self, item, spider):
        # Append first, then flush once the batch is full, so no item is dropped.
        self.li.append(dict(item))
        if len(self.li) >= 100:
            self.insert()
            self.li = []
            print("Successfully inserted a batch of 100 records into MongoDB")
        return item
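After a run, the stored documents can be spot-checked directly with pymongo (assuming the same local MongoDB instance; count_documents() needs pymongo 3.7+):

from pymongo import MongoClient

client = MongoClient(host='127.0.0.1', port=27017)
coll = client['student_db']['kugou']
print(coll.count_documents({}))       # number of singer documents stored
print(coll.find_one({}, {'_id': 0}))  # one sample document, e.g. {'singer': ..., 'songs': [...]}
client.close()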
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for kugoumusic project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'kugoumusic'

SPIDER_MODULES = ['kugoumusic.spiders']
NEWSPIDER_MODULE = 'kugoumusic.spiders'

# MONGO_CONFIG = ['192.168.62.35:1806, '
# '192.168.62.240:1806, '
# '192.168.62.23:1806, '
# '192.168.62.32:1806, '
# '192.168.62.25:1806, '
# '192.168.62.28:1806, '
# '192.168.62.241:1806']

# MONGO_CONFIG = {
# 'host': '127.0.0.1',
# 'port': 27017,
# 'user': 'root',
# 'password': '123456',
# 'db': 's1806',
# 'charset': 'utf8'
# }
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'kugoumusic (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
'Connection': 'keep-alive',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'kugoumusic.middlewares.KugoumusicSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'kugoumusic.middlewares.KugoumusicDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'kugoumusic.pipelines.KugoumusicPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
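If the commented-out MONGO_CONFIG dict above were enabled, open_spider could read the connection details from the spider settings instead of hard-coding them. A sketch under that assumption (the key names follow the commented dict; nothing here is active in the project as posted):

from pymongo import MongoClient

class KugoumusicPipeline(object):
    def open_spider(self, spider):
        # Fall back to the hard-coded local instance when MONGO_CONFIG is absent.
        cfg = spider.settings.get('MONGO_CONFIG', {'host': '127.0.0.1', 'port': 27017})
        self.client = MongoClient(host=cfg['host'], port=cfg['port'])
        self.coll = self.client['student_db']['kugou']
        self.li = []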