RandomForest&ROC
# -*- coding: utf-8 -*-
# __author__ = 'JieYao'
from biocluster.agent import Agent
from biocluster.tool import Tool
import os
import types
import subprocess
from biocluster.core.exceptions import OptionError


class RandomforestAgent(Agent):
    """
    Agent side of the random-forest / ROC analysis.

    Requires RandomForest.pl
    version v1.0
    author: JieYao
    last_modified:2016.07.18
    """

    def __init__(self, parent):
        """Declare the options of the analysis and register its single step."""
        super(RandomforestAgent, self).__init__(parent)
        options = [
            {"name": "otutable", "type": "infile", "format": "meta.otu.otu_table, meta.otu.tax_summary_dir"},
            {"name": "level", "type": "string", "default": "otu"},
            {"name": "envtable", "type": "infile", "format": "meta.otu.group_table"},
            {"name": "envlabs", "type": "string", "default": ""},
            {"name": "ntree", "type": "int", "default": 500},
            {"name": "problem_type", "type": "int", "default": 2},
            {"name": "top_number", "type": "int", "default": 50}
        ]
        self.add_option(options)
        self.step.add_steps('RandomforestAnalysis')
        self.on('start', self.step_start)
        self.on('end', self.step_end)

    def step_start(self):
        """Mark the RandomforestAnalysis step as started and push the update."""
        self.step.RandomforestAnalysis.start()
        self.step.update()

    def step_end(self):
        """Mark the RandomforestAnalysis step as finished and push the update."""
        self.step.RandomforestAnalysis.finish()
        self.step.update()

    def gettable(self):
        """
        Resolve the path of the OTU table to analyse.

        :return: the table for the requested ``level`` when the input is a
                 tax-summary directory, otherwise the path of the input file
        """
        if self.option('otutable').format == "meta.otu.tax_summary_dir":
            return self.option('otutable').get_table(self.option('level'))
        return self.option('otutable').prop['path']

    def check_options(self):
        """
        Validate the options (overrides the base-class check).

        :return: True when every check passes
        :raises OptionError: when the OTU table is missing or too small, when
                             ``envlabs`` names an unknown factor, or when the
                             group table and the OTU table disagree on samples
        """
        if not self.option('otutable').is_set:
            raise OptionError('必须提供otu表')
        self.option('otutable').get_info()
        if self.option('otutable').prop['sample_num'] < 2:
            raise OptionError('otu表的样本数目少于2,不可进行随机森林特征分析')
        if self.option('envtable').is_set:
            self.option('envtable').get_info()
            if self.option('envlabs'):
                for lab in self.option('envlabs').split(','):
                    if lab not in self.option('envtable').prop['group_scheme']:
                        raise OptionError('envlabs中有不在物种(环境因子)表中存在的因子:%s' % lab)
            if len(self.option('envtable').prop['sample']) < 2:
                raise OptionError('物种(环境因子)表的样本数目少于2,不可进行随机森林特征分析')
        # Read the table once inside a context manager so the handle is
        # released even when one of the checks below raises.
        with open(self.gettable()) as table:
            samplelist = table.readline().strip().split('\t')[1:]
            remaining_lines = sum(1 for _ in table)
        if self.option('envtable').is_set:
            self.option('envtable').get_info()
            env_samples = self.option('envtable').prop['sample']
            if len(env_samples) > len(samplelist):
                raise OptionError('OTU表中的样本数量:%s少于物种(环境因子)表中的样本数量:%s' % (len(samplelist),
                                  len(env_samples)))
            for sample in env_samples:
                if sample not in samplelist:
                    raise OptionError('物种(环境因子)表的样本中存在OTU表中未知的样本%s' % sample)
        # Header plus fewer than three further lines means fewer than four
        # lines in total, which is the limit the original check enforced.
        if remaining_lines < 3:
            raise OptionError('数据表信息少于3行')
        # Never ask for more top features than the table has OTUs.
        if self.option('top_number') > self.option('otutable').prop['otu_num']:
            self.option('top_number', self.option('otutable').prop['otu_num'])
        return True

    def set_resource(self):
        """Set the requested resources: two CPUs, memory left as an empty string."""
        self._cpu = 2
        self._memory = ''

    def end(self):
        """Register the upload rules for the output directory, then finish."""
        result_dir = self.add_upload_dir(self.output_dir)
        result_dir.add_relpath_rules([
            [".", "", "RandomForest分析结果和ROC计算数据输出目录"],
            ["./randomforest_confusion_table.xls", "xls", "RandomForest样本分组模拟结果"],
            ["./randomforest_mds_sites.xls", "xls", "样本点坐标表"],
            ["./randomforest_proximity_table.xls", "xls", "样本相似度临近矩阵"],
            ["./randomforest_topx_vimp.xls", "xls", "Top-X物种(环境因子)丰度表"],
            ["./randomforest_vimp_table.xls", "xls", "所有物种(环境因子)重要度表"],
            ["./randomforest_predicted_answer.xls", "xls", "随机森林预测分组结果表"],
            ["./randomforest_votes_probably.xls", "xls", "随机森林各样本分组投票预测概率表"],
            ["./roc_table.xls", "xls", "ROC数据标准化后数据表"],
            ["./roc_point.xls", "xls", "ROC作图坐标点数据表"],
            ["./auc.xls", "xls", "ROC折线下方面积值"],
        ])
        # Parenthesised single-argument form prints the same under Python 2.
        print(self.get_upload_files())
        super(RandomforestAgent, self).end()


class RandomforestTool(Tool):
    """Tool side: runs RandomForest_perl.pl, R and calc_roc.pl and links the results."""

    def __init__(self, config):
        """Resolve the script path and prepare the group table and OTU table."""
        super(RandomforestTool, self).__init__(config)
        self._version = '1.0.1'
        self.cmd_path = self.config.SOFTWARE_DIR + '/bioinfo/meta/scripts/RandomForest_perl.pl'
        self.env_table = self.get_new_env()
        self.otu_table = self.get_otu_table()

    def get_otu_table(self):
        """
        Build the OTU table for the requested ``level``.

        :return: path of the OTU table; when a group table is set the table is
                 restricted to the samples of that group table
        """
        if self.option('otutable').format == "meta.otu.tax_summary_dir":
            otu_path = self.option('otutable').get_table(self.option('level'))
        else:
            otu_path = self.option('otutable').prop['path']
        if not self.option('envtable').is_set:
            return otu_path
        return self.filter_otu_sample(otu_path, self.option('envtable').prop['sample'],
                                      os.path.join(self.work_dir, 'temp_filter.otutable'))

    def filter_otu_sample(self, otu_path, filter_samples, newfile):
        """
        Write a copy of the OTU table that keeps only the given sample columns.

        :param otu_path: tab-separated OTU table, first line holds sample names
        :param filter_samples: list of sample names to keep, in output order
        :param newfile: path of the filtered table to write
        :return: ``otu_path`` when the table already has exactly as many
                 samples as requested, otherwise ``newfile``
        :raises Exception: when ``filter_samples`` is not a list, names a
                           sample absent from the table, or a file cannot be
                           opened
        """
        # ``list`` is the same object as ``types.ListType`` on Python 2.
        if not isinstance(filter_samples, list):
            raise Exception('过滤otu表样本的样本名称应为列表')
        try:
            with open(otu_path, 'rb') as f, open(newfile, 'wb') as w:
                one_line = f.readline()
                all_samples = one_line.rstrip().split('\t')[1:]
                if not ((set(all_samples) & set(filter_samples)) == set(filter_samples)):
                    raise Exception('提供的过滤样本集合中存在otu表中不存在的样本all:%s,filter_samples:%s' % (all_samples, filter_samples))
                if len(all_samples) == len(filter_samples):
                    return otu_path
                # +1 skips the leading OTU-name column of every data row.
                samples_index = [all_samples.index(i) + 1 for i in filter_samples]
                w.write('OTU\t' + '\t'.join(filter_samples) + '\n')
                for line in f:
                    all_values = line.rstrip().split('\t')
                    new_values = [all_values[0]] + [all_values[i] for i in samples_index]
                    w.write('\t'.join(new_values) + '\n')
                return newfile
        except IOError:
            raise Exception('无法打开OTU相关文件或者文件不存在')

    def get_new_env(self):
        """
        Build the group table restricted to ``envlabs``.

        :return: path of a new sub-group table when ``envlabs`` is given,
                 otherwise the path of the original group table
        """
        if self.option('envlabs'):
            new_path = self.work_dir + '/temp_env_table.xls'
            self.option('envtable').sub_group(new_path, self.option('envlabs').split(','))
            return new_path
        return self.option('envtable').path

    def run(self):
        """Run the tool."""
        super(RandomforestTool, self).run()
        self.run_RandomForest_perl()

    def formattable(self, tablepath):
        """
        Strip a leading ``#`` from the table.

        :param tablepath: path of the table to inspect
        :return: path of a copy without the first character when the table
                 starts with ``#``, otherwise ``tablepath`` unchanged
        """
        with open(tablepath) as table:
            if table.read(1) == '#':
                newtable = os.path.join(self.work_dir, 'temp_format.table')
                with open(newtable, 'w') as w:
                    # The first character was already consumed by read(1).
                    w.write(table.read())
                return newtable
        return tablepath

    def run_RandomForest_perl(self):
        """
        Run RandomForest_perl.pl and R, link the results, then run the ROC step.

        The ROC step is only attempted when a group table is set and both the
        predicted-answer and vote-probability files were produced.
        """
        real_otu_path = self.formattable(self.otu_table)
        cmd = self.config.SOFTWARE_DIR + '/program/perl/perls/perl-5.24.0/bin/perl ' + self.cmd_path
        cmd += ' -i %s -o %s' % (real_otu_path, self.work_dir + '/RandomForest')
        if self.option('envtable').is_set:
            cmd += ' -g %s -m %s' % (self.env_table, self.env_table)
        cmd += ' -ntree %s' % (str(self.option('ntree')))
        cmd += ' -type %s' % (str(self.option('problem_type')))
        cmd += ' -top %s' % (str(self.option('top_number')))
        self.logger.info('运行RandomForest_perl.pl程序进行RandomForest计算')
        try:
            subprocess.check_output(cmd, shell=True)
            self.logger.info('生成 cmd.r 文件成功')
        except subprocess.CalledProcessError:
            self.logger.info('生成 cmd.r 文件失败')
            self.set_error('无法生成 cmd.r 文件')
        try:
            subprocess.check_output(self.config.SOFTWARE_DIR +
                                    '/program/R-3.3.1/bin/R --restore --no-save < %s/cmd.r' % (self.work_dir + '/RandomForest'), shell=True)
            self.logger.info('RandomForest计算成功')
        except subprocess.CalledProcessError:
            self.logger.info('RandomForest计算失败')
            self.set_error('R运行计算RandomForest失败')
        self.logger.info('运行RandomForest_perl.pl程序进行RandomForest计算完成')
        allfiles = self.get_filesname()
        self.linkfile(self.work_dir + '/RandomForest/' + allfiles[1], 'randomforest_mds_sites.xls')
        self.linkfile(self.work_dir + '/RandomForest/' + allfiles[2], 'randomforest_proximity_table.xls')
        self.linkfile(self.work_dir + '/RandomForest/' + allfiles[3], 'randomforest_topx_vimp.xls')
        self.linkfile(self.work_dir + '/RandomForest/' + allfiles[4], 'randomforest_vimp_table.xls')
        if self.option('envtable').is_set:
            if allfiles[0] and allfiles[5] and allfiles[6]:
                self.linkfile(self.work_dir + '/RandomForest/' + allfiles[0], 'randomforest_confusion_table.xls')
                self.linkfile(self.work_dir + '/RandomForest/' + allfiles[5], 'randomforest_predicted_answer.xls')
                self.linkfile(self.work_dir + '/RandomForest/' + allfiles[6], 'randomforest_votes_probably.xls')
            else:
                self.set_error('按分组计算的文件生成出错')
        # Without a group table (or without the prediction files) there is
        # nothing to feed the ROC step: finish here and stop, instead of
        # falling through and calling end() a second time.
        if not self.option('envtable').is_set or not (allfiles[5] and allfiles[6]):
            self.end()
            return
        cmd = self.config.SOFTWARE_DIR + '/program/perl/perls/perl-5.24.0/bin/perl ' + self.config.SOFTWARE_DIR + '/bioinfo/meta/scripts/calc_roc.pl'
        cmd += ' -i1 %s' % (self.work_dir + '/RandomForest/randomforest_votes_probably.xls')
        cmd += ' -i2 %s' % (self.work_dir + '/RandomForest/randomforest_predicted_answer.xls')
        cmd += ' -o %s' % (self.work_dir + '/ROC/')
        self.logger.info('开始运行calc_roc.pl计算ROC相关数据')
        try:
            subprocess.check_output(cmd, shell=True)
            self.logger.info('生成 roc_cmd.r 成功')
        except subprocess.CalledProcessError:
            self.logger.info('生成 roc_cmd.r 失败')
            self.set_error('无法生成 roc_cmd.r 文件')
        try:
            subprocess.check_output(self.config.SOFTWARE_DIR +
                                    '/program/R-3.3.1/bin/R --restore --no-save < %s/roc_cmd.r' % (self.work_dir + '/ROC'), shell=True)
            self.logger.info('ROC计算成功')
        except subprocess.CalledProcessError:
            self.logger.info('ROC计算失败')
            self.set_error('R运行计算ROC失败')
        self.logger.info('运行calc_roc.pl程序进行ROC计算完成')
        allfiles = self.get_roc_filesname()
        self.linkfile(self.work_dir + '/ROC/' + allfiles[0], 'roc_table.xls')
        self.linkfile(self.work_dir + '/ROC/' + allfiles[1], 'roc_point.xls')
        self.linkfile(self.work_dir + '/ROC/' + allfiles[2], 'auc.xls')
        self.end()

    def linkfile(self, oldfile, newname):
        """
        Hard-link a file into the output directory, replacing any existing one.

        :param oldfile: path of the source file
        :param newname: file name to use inside the output directory
        :return:
        """
        newpath = os.path.join(self.output_dir, newname)
        if os.path.exists(newpath):
            os.remove(newpath)
        os.link(oldfile, newpath)

    def get_roc_filesname(self):
        """
        Locate the three ROC result files in ``work_dir/ROC``.

        :return: ``[roc_table, roc_point, auc]`` file names; when any of them
                 is missing ``set_error`` is called and nothing is returned
        """
        wanted = ('roc_table.xls', 'roc_point.xls', 'auc.xls')
        found = dict.fromkeys(wanted)
        for name in os.listdir(self.work_dir + '/ROC'):
            # First matching key wins, mirroring an if/elif chain.
            for key in wanted:
                if key in name:
                    found[key] = name
                    break
        if all(found[key] for key in wanted):
            return [found[key] for key in wanted]
        self.set_error("未知原因,ROC计算结果丢失")

    def get_filesname(self):
        """
        Locate the random-forest result files in ``work_dir/RandomForest``.

        :return: ``[confusion, mds_sites, proximity, topx_vimp, vimp_table,
                 predicted_answer, votes_probably]`` file names (the group
                 related entries may be None); when one of the four always
                 expected files is missing ``set_error`` is called and nothing
                 is returned
        """
        wanted = (
            'randomforest_confusion_table.xls',
            'randomforest_mds_sites.xls',
            'randomforest_proximity_table.xls',
            'randomforest_topx_vimp.xls',
            'randomforest_vimp_table.xls',
            'randomforest_predicted_answer.xls',
            'randomforest_votes_probably.xls',
        )
        found = dict.fromkeys(wanted)
        for name in os.listdir(self.work_dir + '/RandomForest'):
            # First matching key wins, mirroring an if/elif chain.
            for key in wanted:
                if key in name:
                    found[key] = name
                    break
        # mds_sites, proximity, topx_vimp and vimp_table are produced in
        # every mode; the other three only exist when a group table is used.
        if not all(found[key] for key in wanted[1:5]):
            self.set_error('未知原因,数据计算结果丢失或者未生成')
            return None
        if self.option('envtable').is_set:
            if not found['randomforest_confusion_table.xls']:
                self.set_error('未知原因,样本分组模拟结果丢失或未生成')
            if not found['randomforest_predicted_answer.xls']:
                self.set_error('未知原因,样本分组预测结果文件丢失或未生成')
            if not found['randomforest_votes_probably.xls']:
                self.set_error('未知原因,样本分组预测概率表丢失或未生成')
        return [found[key] for key in wanted]
#!/mnt/ilustre/users/sanger-dev/app/program/perl/perls/perl-5.24.0/bin/perl
# Write an R script (<output dir>/cmd.r) that runs randomForest on an OTU
# table, optionally supervised by a group column of a mapping file, and then
# feed that script to R.
use strict;
use warnings;
use Getopt::Long;

my %opts;
my $VERSION = "V2.20160708";
GetOptions( \%opts,"i=s","m=s","o=s","g=s","ntree=i","top=i","type=s");
my $usage = <<"USAGE";
       Program : $0
       Discription: Program used to caculate randomforest,with mds plot and importance variables given .
       Version : $VERSION
       Contact : jie.yao\@majorbio.com
       Usage :perl $0 [options]
                -i * input otu table file
                -o * output dir
                -m input mapping file if you want set points\'s color and pch by groups. If omitted, randomForest will run in unsupervised mode.
                   Default:none
                -g group name in mapping file .Default:none
                -ntree Number of trees to grow. This should not be set to too small a number, to ensure that every input row gets predicted at least a few times.Default:500
                -top How many variables to show?
                -type either 1,2 or 3, specifying the type of importance measure (1=mean decrease in accuracy, 2=mean decrease in node impurity).
       Example:$0 -i otu_table.xls -o randomForest -m group -g group

USAGE

# -i and -o are mandatory; -m and -g must be given together or not at all.
die $usage if(!($opts{i}&&$opts{o}));
die $usage if($opts{m}&& !$opts{g});
die $usage if(!$opts{m}&& $opts{g});

# Fill in defaults for every option the caller left out.
my %defaults = (m => "none", g => "none", ntree => "500", type => "1", top => "50");
for my $key (keys %defaults) {
    $opts{$key} = $defaults{$key} unless defined $opts{$key};
}

# Create the output directory with the builtin instead of a shell command so
# the path is never interpreted by a shell and a failure is reported.
if(! -e $opts{o}){
    mkdir($opts{o}) or die "Cannot create output dir $opts{o}: $!\n";
}

# Checked three-argument open on a lexical handle: an unwritable output
# directory now stops the script instead of silently producing no cmd.r.
open(my $cmd_fh, '>', "$opts{o}/cmd.r") or die "Cannot write $opts{o}/cmd.r: $!\n";
print $cmd_fh "
library(sp,warn.conflicts = F)
library(randomForest,warn.conflicts = F)
library(maptools,warn.conflicts = F)
basename=\"randomforest\"

# if read otu data
otu <-read.table(\"$opts{i}\",sep=\"\\t\",head=T,check.names = F)
rownames(otu) <-as.factor(otu[,1])
otu <-otu[,-1]
rownames(otu) <-sapply(rownames(otu),function(x) gsub(\"_*{.+}\",\" \",x,perl = TRUE))
rownames(otu) <-sapply(rownames(otu),function(x) gsub(\"-\",\"_\",x,perl = TRUE))
rownames(otu) <-sapply(rownames(otu),function(x) gsub(\"\\\\[\",\"\",x,perl = TRUE))
rownames(otu) <-sapply(rownames(otu),function(x) gsub(\"\\\\]\",\"\",x,perl = TRUE))
rownames(otu) <-sapply(rownames(otu),function(x) gsub(\"\\\\(\",\"\",x,perl = TRUE))
rownames(otu) <-sapply(rownames(otu),function(x) gsub(\"\\\\)\",\"\",x,perl = TRUE))
rownames(otu) <-sapply(rownames(otu),function(x) gsub(\"^[0-9]\",\"X\\\\1\",x,perl = TRUE))
rownames(otu) <-sapply(rownames(otu),function(x) gsub(\"\/\",\"\",x,perl = TRUE))
otu <-as.data.frame(t(otu),stringsAsFactors=T)

map=\"$opts{m}\"
if(map !=\"none\"){
    sd <-read.table(\"$opts{m}\",head=T,sep=\"\\t\",comment.char = \"\",check.names = FALSE)
    rownames(sd) <- as.character(sd[,1])
    sd[,1] <-as.character(sd[,1])
    sd\$group <-as.factor(sd\$group )
    legend <- as.matrix(unique(sd\$group))
}

set.seed(1)
if(map != \"none\"){
    otu.rf <- randomForest(sd\$group ~ .,otu,importance=T,proximity=T,ntree=$opts{ntree})

    class_count <-as.matrix(table(sd\$group))
    class <-data.frame(count=class_count)

    ##randomforest votes probably
    votes_probably<- paste(\"$opts{o}/\",basename,\"_votes_probably.xls\",sep=\"\")
    write.table(otu.rf\$votes,votes_probably,sep=\"\\t\",quote=F)

    ##randomforest predicted answer
    predicted_answer <- paste(\"$opts{o}/\",basename,\"_predicted_answer.xls\",sep=\"\")
    write.table(otu.rf\$predicted,predicted_answer,sep=\"\\t\",quote=F)

    ##randomforest classification table
    rf_table <- paste(\"$opts{o}/\",basename,\"_confusion_table.xls\",sep=\"\")
    write.table(otu.rf\$confusion,rf_table,sep=\"\\t\",quote=F)
    mds <- cmdscale(1-otu.rf\$proximity)
}else{
    otu.rf <- randomForest(otu,importance=T,proximity=T,ntree=$opts{ntree})
    mds <- cmdscale(1-otu.rf\$proximity)
}

##mds points
mds_points <- paste(\"$opts{o}/\",basename,\"_mds_sites.xls\",sep=\"\")
write.table(mds,mds_points,sep=\"\\t\",quote=F)

##proximity table
proximity <- paste(\"$opts{o}/\",basename,\"_proximity_table.xls\",sep=\"\")
write.table(otu.rf\$proximity,proximity,sep=\"\\t\",quote=F)

## importance table
vimp_table <- paste(\"$opts{o}/\",basename,\"_vimp_table.xls\",sep=\"\")
write.table(otu.rf\$importance,vimp_table,sep=\"\\t\",quote=F)

## top importance species table
topx_vimp <- paste(\"$opts{o}/\",basename,\"_topx_vimp.xls\",sep=\"\")
imp <- importance(otu.rf)
if($opts{type} == 1){
    top <- imp[order(imp[,\"MeanDecreaseAccuracy\"],decreasing=T),][1:min($opts{top},length(imp[,1])),]
    write.table(t(otu)[rownames(top),],topx_vimp,sep=\"\\t\",quote=F)
}else if ($opts{type} == 2){
    top <- imp[order(imp[,\"MeanDecreaseGini\"],decreasing=T),][1:min($opts{top},length(imp[,1])),]
    write.table(t(otu)[rownames(top),],topx_vimp,sep=\"\\t\",quote=F)
}
";
# Close explicitly so a failed write is reported before R reads the script.
close($cmd_fh) or die "Cannot finish writing $opts{o}/cmd.r: $!\n";

`R --restore --no-save < $opts{o}/cmd.r`;
RandomForest&ROC的更多相关文章
- ROC曲线、PR曲线
在论文的结果分析中,ROC和PR曲线是经常用到的两个有力的展示图. 1.ROC曲线 ROC曲线(receiver operating characteristic)是一种对于灵敏度进行描述的功能图像. ...
- ROC & AUC笔记
易懂:http://alexkong.net/2013/06/introduction-to-auc-and-roc/ 分析全面但难懂:http://mlwiki.org/index.php/ROC_ ...
- 精确率与召回率,RoC曲线与PR曲线
在机器学习的算法评估中,尤其是分类算法评估中,我们经常听到精确率(precision)与召回率(recall),RoC曲线与PR曲线这些概念,那这些概念到底有什么用处呢? 首先,我们需要搞清楚几个拗口 ...
- 【数据挖掘】朴素贝叶斯算法计算ROC曲线的面积
题记: 近来关于数据挖掘学习过程中,学习到朴素贝叶斯运算ROC曲线.也是本节实验课题,roc曲线的计算原理以及如果统计TP.FP.TN.FN.TPR.FPR.ROC面积等等.往往运用 ...
- PR曲线,ROC曲线,AUC指标等,Accuracy vs Precision
作为机器学习重要的评价指标,标题中的三个内容,在下面读书笔记里面都有讲: http://www.cnblogs.com/charlesblc/p/6188562.html 但是讲的不细,不太懂.今天又 ...
- ID3、C4.5、CART、RandomForest的原理
决策树意义: 分类决策树模型是表示基于特征对实例进行分类的树形结构.决策树可以转换为一个if_then规则的集合,也可以看作是定义在特征空间划分上的类的条件概率分布. 它着眼于从一组无次序.无规则的样 ...
- C4.5,CART,randomforest的实践
#################################Weka-J48(C4.5)################################# ################### ...
- 如何利用Matlab进行ROC分析
ROC曲线基本知识: 判断分类器的工作效率需要使用召回率和准确率两个变量. 召回率:Recall,又称"查全率", 准确率:Precision,又称"精度". ...
- 机器学习之分类器性能指标之ROC曲线、AUC值
分类器性能指标之ROC曲线.AUC值 一 roc曲线 1.roc曲线:接收者操作特征(receiveroperating characteristic),roc曲线上每个点反映着对同一信号刺激的感受性 ...
随机推荐
- angularjs学习笔记1-angular总体简介及其特点
以前开发(web或者移动端)前端主要使用jQuery+原生js,如果使用某些前端UI框架的话,它自己还可能提供一些API可以使用.而且目前很多UI框架都是基于jQuery的,所以说一下由jQuery跨 ...
- [BZOJ2286][SDOI2011]消耗战(虚树DP)
2286: [Sdoi2011]消耗战 Time Limit: 20 Sec Memory Limit: 512 MBSubmit: 4998 Solved: 1867[Submit][Statu ...
- 20162325金立清 实验四 Android程序设计 实验报告
实验四 Android程序设计 实验报告 代码托管地址 码云链接 实验内容 安装使用Android Stuidio Activity测试 UI测试 布局测试 事件处理测试 Android程序设计-1 ...
- ZXing for Android 修改为竖屏模式
zxing github连接:https://github.com/zxing/zxing 以下为修改方法 Step 1: Add following lines to rotate data bef ...
- 零起点学算法08——简单的输入和计算(a+b)
#include <stdio.h> int main() { int a; int b; scanf("%d %d",&a,&b); printf(& ...
- php读取超大文件fseek
function readMaxFile($fp , $start = 0) { $tag = "\n"; $i = 0; $content = ''; while($i < ...
- 简单实现ToolStripMenuItem(菜单栏)的单选效果
来源:http://www.97world.com/archives/2194 这几天在写又拍云的客户端,老实说确实学到了不少东西!接下来的几天我会把一些技巧或者原来没有接触过的一些东西发上来,算是复 ...
- Bestreviewapp给iOS软件写评论赚钱
BestReviewApp 这是一个评论类的活动,网站上会提供App列表,在iTunes评论这些应用就能获得报酬.目前账号中的余额可通过PayPal或支付宝提取出来.BestReviewApp 开放的 ...
- [Java基础] Java enum的用法详解
用法一:常量 在JDK1.5 之前,我们定义常量都是: public static fianl.... .现在好了,有了枚举,可以把相关的常量分组到一个枚举类型里,而且枚举提供了比常量更多的方法. p ...
- python升级导致yum命令无法使用的解决办法?
yum是依赖特定的python版本的,不同的linux系统需要的python版本不同. 查看yum的启动脚本:which is yum 头一行指定使用的python版本,这个必须是系统需要的,而不要使 ...