Scrapy in practice, part 1: distributed crawling of Youyuan.com (note: the site's interface went down on June 22):
Straight to the code:
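All of the files below belong to one Scrapy project. The post never shows the directory layout, so the tree below is an assumption based on a standard `scrapy startproject youyuanwang` skeleton, with the `models/` package and the two `process_item_*.py` export scripts added by hand:

youyuanwang/
├── scrapy.cfg
├── process_item_mongo.py
├── process_item_mysql.py
└── youyuanwang/
    ├── __init__.py
    ├── items.py
    ├── pipelines.py
    ├── settings.py
    ├── models/
    │   └── es_types.py
    └── spiders/
        └── youyuan.py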
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class YouyuanwangItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # profile picture URL
    header_url = scrapy.Field()
    # username
    username = scrapy.Field()
    # personal monologue
    monologue = scrapy.Field()
    # photo album image URLs
    pic_urls = scrapy.Field()
    # hometown
    place_from = scrapy.Field()
    # education
    education = scrapy.Field()
    # age
    age = scrapy.Field()
    # height
    height = scrapy.Field()
    # salary
    salary = scrapy.Field()
    # hobbies
    hobby = scrapy.Field()
    # source site: youyuan
    source = scrapy.Field()
    # source URL of the profile page
    source_url = scrapy.Field()
    # spider name
    spider = scrapy.Field()
spiders/youyuan.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy_redis.spiders import RedisCrawlSpider
from youyuanwang.items import YouyuanwangItem


# class YouyuanSpider(CrawlSpider):
class youyuan(RedisCrawlSpider):
    name = 'youyuan'
    # allowed_domains = ['www.youyuan.com']
    # Youyuan listing page
    # start_urls = ['http://www.youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p1/']
    redis_key = 'youyuan:start_urls'

    # Build the allowed-domains list dynamically (from the -a domain=... argument)
    def __init__(self, *args, **kwargs):
        # Dynamically define the allowed domains list.
        domain = kwargs.pop('domain', '')
        self.allowed_domains = list(filter(None, domain.split(',')))
        super(youyuan, self).__init__(*args, **kwargs)

    # Match the whole country
    # list_page = LinkExtractor(allow=(r'http://www.youyuan.com/find/.+'))
    # Only match the search pages for Beijing, female, age 18-25; links are extracted from the response
    page_links = LinkExtractor(allow=r"http://www.youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p\d+/")
    # Profile-page rule; links are extracted from the response
    profile_page = LinkExtractor(allow=r"http://www.youyuan.com/\d+-profile/")

    rules = (
        # Follow listing-page links; they are only stepping stones
        Rule(page_links),
        # Profile links become requests stored in Redis and wait to be scheduled;
        # once a response arrives it is handled by parse_profile_page(), with no further following
        Rule(profile_page, callback="parse_profile_page", follow=False),
    )

    # Parse the profile page and extract the data we need
    def parse_profile_page(self, response):
        item = YouyuanwangItem()
        # profile picture URL
        item['header_url'] = self.get_header_url(response)
        # username
        item['username'] = self.get_username(response)
        # hometown
        item['place_from'] = self.get_place_from(response)
        # education
        item['education'] = self.get_education(response)
        # age
        item['age'] = self.get_age(response)
        # height
        item['height'] = self.get_height(response)
        # salary
        item['salary'] = self.get_salary(response)
        # hobbies
        item['hobby'] = self.get_hobby(response)
        # photo album image URLs
        item['pic_urls'] = self.get_pic_urls(response)
        # personal monologue
        item['monologue'] = self.get_monologue(response)
        # source URL of the profile page
        item['source_url'] = response.url
        # source site: youyuan
        item['source'] = "youyuan"
        # spider name
        item['spider'] = "youyuan"
        yield item
    # Extract the profile picture URL
    def get_header_url(self, response):
        header = response.xpath('//dl[@class="personal_cen"]/dt/img/@src').extract()
        if len(header):
            header_url = header[0]
        else:
            header_url = ""
        return header_url.strip()

    # Extract the username
    def get_username(self, response):
        username = response.xpath('//dl[@class="personal_cen"]/dd//div[@class="main"]/strong/text()').extract()
        if len(username):
            username = username[0]
        else:
            username = ""
        return username.strip()

    # Extract the age
    def get_age(self, response):
        age = response.xpath('//dl[@class="personal_cen"]//p[@class="local"]/text()').extract()
        if len(age):
            age = age[0].split()[1]
        else:
            age = ""
        return age

    # Extract the height
    def get_height(self, response):
        height = response.xpath('//div[@class="pre_data"]/ul/li[2]/div/ol[2]/li[2]/span/text()').extract()
        if len(height):
            height = height[0]
        else:
            height = ""
        return height.strip()

    # Extract the salary
    def get_salary(self, response):
        salary = response.xpath('//div[@class="pre_data"]/ul/li[2]/div/ol[1]/li[4]/span/text()').extract()
        if len(salary):
            salary = salary[0]
        else:
            salary = ""
        return salary.strip()

    # Extract the hobbies
    def get_hobby(self, response):
        hobby = response.xpath('//dl[@class="personal_cen"]//ol[@class="hoby"]//li/text()').extract()
        if len(hobby):
            hobby = ",".join(hobby).replace(" ", "")
        else:
            hobby = ""
        return hobby.strip()

    # Extract the album photos (the URL list is joined into a single string)
    def get_pic_urls(self, response):
        pic_urls = response.xpath('//div[@class="ph_show"]/ul/li/a/img/@src').extract()
        if len(pic_urls):
            pic_urls = ",".join(pic_urls)
        else:
            pic_urls = ""
        return pic_urls

    # Extract the monologue
    def get_monologue(self, response):
        monologue = response.xpath('//div[@class="pre_data"]/ul/li/p/text()').extract()
        if len(monologue):
            monologue = monologue[0]
        else:
            monologue = ""
        return monologue.strip()

    # Extract the hometown
    def get_place_from(self, response):
        place_from = response.xpath('//div[@class="pre_data"]/ul/li[2]/div/ol[1]/li[1]/span/text()').extract()
        if len(place_from):
            place_from = place_from[0]
        else:
            place_from = ""
        return place_from.strip()

    # Extract the education
    def get_education(self, response):
        education = response.xpath('//div[@class="pre_data"]/ul/li[2]/div/ol[1]/li[3]/span/text()').extract()
        if len(education):
            education = education[0]
        else:
            education = ""
        return education.strip()
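How the distributed crawl is actually kicked off is not spelled out in the post. With scrapy_redis the usual workflow is: start the spider on every worker machine (each one blocks and waits for start URLs), then seed the Redis list named by redis_key. A minimal sketch of the seeding step, assuming Redis runs locally; the script name is hypothetical:

# seed_start_url.py -- hypothetical helper, not part of the original project.
# Pushes the first listing-page URL into the list the RedisCrawlSpider blocks on.
import redis

r = redis.StrictRedis(host="127.0.0.1", port=6379, db=0)
r.lpush("youyuan:start_urls",
        "http://www.youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p1/")

Each worker would be started with something like `scrapy crawl youyuan -a domain=www.youyuan.com`; the `-a domain=...` argument is what feeds the dynamic allowed_domains logic in `__init__`.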
For reference, here is the single-machine CrawlSpider version of the same spider (the scrapy_redis parts are commented out):

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule, CrawlSpider
# from scrapy_redis.spiders import RedisCrawlSpider
from youyuanwang.items import YouyuanwangItem


class YouyuanSpider(CrawlSpider):
# class YouyuanSpider(RedisCrawlSpider):
    name = 'youyuan'
    allowed_domains = ['www.youyuan.com']
    # Youyuan listing page
    start_urls = ['http://www.youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p1/']
    # redis_key = 'YouyuanSpider:start_urls'

    # Build the allowed-domains list dynamically (only needed for the distributed version)
    # def __init__(self, *args, **kwargs):
    #     # Dynamically define the allowed domains list.
    #     domain = kwargs.pop('domain', '')
    #     self.allowed_domains = filter(None, domain.split(','))
    #     super(YouyuanSpider, self).__init__(*args, **kwargs)

    # Match the whole country
    # list_page = LinkExtractor(allow=(r'http://www.youyuan.com/find/.+'))
    # Only match the search pages for Beijing, female, age 18-25; links are extracted from the response
    page_links = LinkExtractor(allow=r"http://www.youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p\d+/")
    # Profile-page rule; links are extracted from the response
    profile_page = LinkExtractor(allow=r"http://www.youyuan.com/\d+-profile/")

    rules = (
        # Follow listing-page links; they are only stepping stones
        Rule(page_links),
        # Profile links are scheduled as requests; responses are handled by parse_profile_page(), with no further following
        Rule(profile_page, callback="parse_profile_page", follow=False),
    )
    # parse_profile_page() and the get_*() extraction helpers are identical to the
    # distributed version above and are omitted here.
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

# import json
#
# class YouyuanwangPipeline(object):
#     def __init__(self):
#         self.filename = open("youyuanwang.json", "wb")
#
#     def process_item(self, item, spider):
#         jsontext = json.dumps(dict(item), ensure_ascii=False) + ",\n"
#         self.filename.write(jsontext.encode("utf-8"))
#         return item
#
#     def close_spider(self, spider):
#         self.filename.close()

import pymysql

from .models.es_types import YouyuanType


class XiciPipeline(object):
    def process_item(self, item, spider):
        # DBKWARGS = spider.settings.get('DBKWARGS')
        con = pymysql.connect(host='127.0.0.1', user='root', passwd='', db='yunyuan', charset='utf8')
        cur = con.cursor()
        sql = ("insert into youyuanwang(header_url,username,monologue,pic_urls,place_from,education,age,height,salary,hobby,source)"
               "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        lis = (item['header_url'], item['username'], item['monologue'], item['pic_urls'], item['place_from'],
               item['education'], item['age'], item['height'], item['salary'], item['hobby'], item['source'])
        cur.execute(sql, lis)
        con.commit()
        cur.close()
        con.close()
        return item


class ElasticsearchPipeline(object):
    def process_item(self, item, spider):
        youyuan = YouyuanType()
        youyuan.header_url = item["header_url"]
        youyuan.username = item["username"]
        youyuan.age = item["age"]
        youyuan.salary = item["salary"]
        youyuan.monologue = item["monologue"]
        youyuan.pic_urls = item["pic_urls"]
        youyuan.place_from = item["place_from"]
        youyuan.save()
        return item
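The `.models.es_types` module imported above is not included in the post. A minimal sketch of what `models/es_types.py` might look like, using the older elasticsearch-dsl (5.x-style) DocType API; the field names come from the pipeline, but the field types and the index/type names are assumptions:

# models/es_types.py -- not shown in the original post; a minimal sketch.
from elasticsearch_dsl import DocType, Keyword, Text
from elasticsearch_dsl.connections import connections

# Assumes Elasticsearch is reachable on localhost.
connections.create_connection(hosts=["127.0.0.1"])


class YouyuanType(DocType):
    header_url = Keyword()
    username = Keyword()
    age = Keyword()
    salary = Keyword()
    monologue = Text()
    pic_urls = Keyword()
    place_from = Keyword()

    class Meta:
        index = "youyuan"      # assumed index name
        doc_type = "profile"   # assumed type name


# Run YouyuanType.init() once before the crawl to create the index and mapping.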
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for youyuanwang project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'youyuanwang'

SPIDER_MODULES = ['youyuanwang.spiders']
NEWSPIDER_MODULE = 'youyuanwang.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'youyuanwang (+http://www.yourdomain.com)'

# Obey robots.txt rules
#ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'youyuanwang.middlewares.YouyuanwangSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'youyuanwang.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# scrapy_redis: shared dupefilter and scheduler, with persistent queues
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    #'youyuanwang.pipelines.XiciPipeline': 300,
    'youyuanwang.pipelines.ElasticsearchPipeline': 300,
    # 'scrapy_redis.pipelines.RedisPipeline': 400,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
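Two points worth adding for a real multi-machine run; the lines below are not in the original post and are assumptions based on how scrapy_redis is normally configured. Every worker has to point at the shared Redis instance, and the `scrapy_redis.pipelines.RedisPipeline` entry must be uncommented if you want items pushed into the `youyuan:items` list that the two export scripts below read from:

# Extra settings.py lines (assumed, not shown in the original post)

# Point every worker at the shared Redis instance (replace the host with the master's address).
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
# or equivalently:
# REDIS_URL = 'redis://127.0.0.1:6379'

# Needed by the Redis-to-MongoDB / Redis-to-MySQL scripts below, which blpop from "youyuan:items":
# ITEM_PIPELINES = {
#     'scrapy_redis.pipelines.RedisPipeline': 400,
# }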
Saving from Redis to MongoDB: create a file named process_item_mongo.py in the project directory (the name is arbitrary).
# coding=utf-8
import json

import pymongo
import redis


def process_item():
    Redis_conn = redis.StrictRedis(host='127.0.0.1', port=6379, db=0)
    Mongo_conn = pymongo.MongoClient(host='127.0.0.1', port=27017)
    db = Mongo_conn["youyuan"]
    table = db["beijing_18_25"]
    while True:
        source, data = Redis_conn.blpop(["youyuan:items"])
        data = json.loads(data.decode("utf-8"))
        # insert() is deprecated in newer pymongo; insert_one() does the same for a single document
        table.insert_one(data)


if __name__ == "__main__":
    process_item()
Saving from Redis to MySQL: create a file named process_item_mysql.py in the project directory (the name is arbitrary).
# coding=utf-8
import json

import pymysql
import redis


def process_item():
    Redis_conn = redis.StrictRedis(host='127.0.0.1', port=6379, db=0)
    MySql_conn = pymysql.connect(host='127.0.0.1', user='root', passwd='', port=3306, db='yunyuan', charset='utf8')
    while True:
        source, data = Redis_conn.blpop("youyuan:items")
        data = json.loads(data.decode("utf-8"))
        cur = MySql_conn.cursor()
        sql = ("insert into youyuanwang(header_url,username,monologue,pic_urls,place_from,education,age,height,salary,hobby,source)"
               "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        lis = (data['header_url'], data['username'], data['monologue'], data['pic_urls'], data['place_from'],
               data['education'], data['age'], data['height'], data['salary'], data['hobby'], data['source'])
        cur.execute(sql, lis)
        MySql_conn.commit()
        cur.close()
        # Note: the original closed MySql_conn inside the loop, which would break the next iteration;
        # keep the connection open for as long as the loop runs.


if __name__ == "__main__":
    process_item()
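The MySQL schema for the `youyuanwang` table is never shown in the post. The sketch below is a hypothetical helper that creates a table matching the INSERT statements above; the column types and lengths are guesses, so adjust them as needed:

# create_table.py -- hypothetical helper, not part of the original project.
import pymysql

DDL = """
CREATE TABLE IF NOT EXISTS youyuanwang (
    id          INT AUTO_INCREMENT PRIMARY KEY,
    header_url  VARCHAR(255),
    username    VARCHAR(64),
    monologue   TEXT,
    pic_urls    TEXT,
    place_from  VARCHAR(64),
    education   VARCHAR(64),
    age         VARCHAR(16),
    height      VARCHAR(16),
    salary      VARCHAR(32),
    hobby       VARCHAR(255),
    source      VARCHAR(32)
) DEFAULT CHARSET = utf8
"""

con = pymysql.connect(host='127.0.0.1', user='root', passwd='', port=3306, db='yunyuan', charset='utf8')
cur = con.cursor()
cur.execute(DDL)
con.commit()
cur.close()
con.close()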
Data: (the results screenshot from the original post is omitted here)
Disclaimer: the above is for reference, learning, and discussion only! More at: https://github.com/huwei86/Spideryouyuanwang