Crawling Kugou Music singers and song titles with Scrapy and storing them in MongoDB
Code comments are still to be written. The crawl covered more than 8,000 singers; at an average of about thirty songs per singer, that comes to roughly 200,000+ songs.
run.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'Zqf'
from kugoumusic.spiders.kugou import KugouSpiders
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings  # load this project's settings.py

settings = get_project_settings()
process = CrawlerProcess(settings=settings)  # more than one spider can be queued on the same process
process.crawl(KugouSpiders)
process.start()  # start crawling; blocks until the crawl finishes
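Since each crawl() call queues one more spider on the same CrawlerProcess, several spiders can share a process. A minimal sketch of that pattern (AnotherSpider is hypothetical, not part of this project):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from kugoumusic.spiders.kugou import KugouSpiders

process = CrawlerProcess(settings=get_project_settings())
process.crawl(KugouSpiders)
# process.crawl(AnotherSpider)  # hypothetical second spider, queued the same way
process.start()  # runs every queued spider; blocks until all of them finish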
kugou.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'Zqf'

import scrapy
from kugoumusic.items import KugoumusicItem
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule


class KugouSpiders(scrapy.spiders.CrawlSpider):
    name = 'kugou'
    start_urls = ['http://www.kugou.com/']

    rules = (
        # Follow the singer index page and its per-letter listing pages (no callback).
        Rule(LinkExtractor(allow=[r'http://www.kugou.com/yy/html/singer.html',
                                  r'http://www.kugou.com/yy/singer/index/\d-([a-z]|null)-1.html'])),
        # Hand every singer home page to parse_item.
        Rule(LinkExtractor(allow=[r'http://www.kugou.com/yy/singer/home/\d+.html']), callback='parse_item')
    )

    def parse_item(self, response):
        singer = response.xpath('//div/div[@class="clear_fix"]/strong/text()').extract_first()
        print(singer)
        songs = response.xpath('//ul[@id="song_container"]/li//span[@class="text"]/i/text()').extract()
        print(songs)

        item = KugoumusicItem()
        item['singer'] = singer
        item['songs'] = songs
        yield item
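The first Rule only follows links (the singer index and its per-letter listing pages), while the second sends every singer home page to parse_item. The two XPath expressions can be sanity-checked offline with a plain Selector; the sample markup below is a guess at the page structure for illustration, not a copy of the real Kugou HTML:

from scrapy.selector import Selector

sample = '''
<div><div class="clear_fix"><strong>Sample Singer</strong></div></div>
<ul id="song_container">
  <li><span class="text"><i>Song A</i></span></li>
  <li><span class="text"><i>Song B</i></span></li>
</ul>
'''

sel = Selector(text=sample)
print(sel.xpath('//div/div[@class="clear_fix"]/strong/text()').extract_first())
# -> Sample Singer
print(sel.xpath('//ul[@id="song_container"]/li//span[@class="text"]/i/text()').extract())
# -> ['Song A', 'Song B']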
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class KugoumusicItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    singer = scrapy.Field()
    songs = scrapy.Field()
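The pipeline below stores each item with dict(item), which works because scrapy.Item supports dict conversion. A quick illustration with made-up values:

from kugoumusic.items import KugoumusicItem

item = KugoumusicItem()
item['singer'] = 'Sample Singer'
item['songs'] = ['Song A', 'Song B']
print(dict(item))  # {'singer': 'Sample Singer', 'songs': ['Song A', 'Song B']}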
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

from pymongo import MongoClient


class KugoumusicPipeline(object):

    def open_spider(self, spider):
        # mongo_config = spider.settings['MONGO_CONFIG']
        # host = '127.0.0.1', port = 27017
        self.client = MongoClient(host='127.0.0.1', port=27017)
        self.coll = self.client['student_db']['kugou']
        self.li = []  # buffer: items are written to MongoDB in batches of 100

    def close_spider(self, spider):
        if self.li:  # flush the remainder; insert_many() rejects an empty list
            self.insert()
        self.client.close()

    def insert(self):
        self.coll.insert_many(self.li)

    def process_item(self, item, spider):
        # Append first, then flush once the batch is full, so no item is dropped.
        self.li.append(dict(item))
        if len(self.li) >= 100:
            self.insert()
            self.li = []
            print("Successfully inserted a batch of 100 records into MongoDB")
        return item
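After a run, the stored documents can be spot-checked directly with pymongo (assuming the same local MongoDB instance; count_documents() needs pymongo 3.7+):

from pymongo import MongoClient

client = MongoClient(host='127.0.0.1', port=27017)
coll = client['student_db']['kugou']
print(coll.count_documents({}))       # number of singer documents stored
print(coll.find_one({}, {'_id': 0}))  # one sample document, e.g. {'singer': ..., 'songs': [...]}
client.close()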
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for kugoumusic project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'kugoumusic'

SPIDER_MODULES = ['kugoumusic.spiders']
NEWSPIDER_MODULE = 'kugoumusic.spiders'

# MONGO_CONFIG = ['192.168.62.35:1806, '
# '192.168.62.240:1806, '
# '192.168.62.23:1806, '
# '192.168.62.32:1806, '
# '192.168.62.25:1806, '
# '192.168.62.28:1806, '
# '192.168.62.241:1806']

# MONGO_CONFIG = {
# 'host': '127.0.0.1',
# 'port': 27017,
# 'user': 'root',
# 'password': '123456',
# 'db': 's1806',
# 'charset': 'utf8'
# }
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'kugoumusic (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
'Connection': 'keep-alive',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'kugoumusic.middlewares.KugoumusicSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'kugoumusic.middlewares.KugoumusicDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'kugoumusic.pipelines.KugoumusicPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
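If the commented-out MONGO_CONFIG dict above were enabled, open_spider could read the connection details from the spider settings instead of hard-coding them. A sketch under that assumption (the key names follow the commented dict; nothing here is active in the project as posted):

from pymongo import MongoClient

class KugoumusicPipeline(object):
    def open_spider(self, spider):
        # Fall back to the hard-coded local instance when MONGO_CONFIG is absent.
        cfg = spider.settings.get('MONGO_CONFIG', {'host': '127.0.0.1', 'port': 27017})
        self.client = MongoClient(host=cfg['host'], port=cfg['port'])
        self.coll = self.client['student_db']['kugou']
        self.li = []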