我的代码-random forest
# coding: utf-8
# In[1]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import binarize
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score,recall_score,average_precision_score,auc
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import binarize
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score,recall_score,average_precision_score,auc
# In[137]:
from imblearn.over_sampling import SMOTE
# Load the raw measurement export into `data`.
data = pd.read_csv(r"D:\Users\sgg91044\Desktop\model_data_1.csv")
# In[138]:
# Bare expression: only displays output when run as a notebook cell.
data.head()
# In[50]:
# Drop five columns that are not used by the pivot below.
data= data.drop(columns=['Stg','RNK','parametername','ooc','oos'])
# Pivot so that every distinct Param_Name becomes its own column holding the
# summed `data1` values for each combination of the index columns.
p= pd.pivot_table(data, index=['eqpid','Chamber','lotid','slotid','stage','Recipie_Name','finishtime'],values='data1', columns='Param_Name', aggfunc=np.sum)
# In[54]:
p
# Write the pivoted table, including its index levels and header, to CSV.
p.to_csv(r'D:\Users\sgg91044\Desktop\more_parameter\more_parameter_pivot.csv', index=True, header=True)
# In[236]:
# NOTE(review): the columns of `p` are Param_Name values, so this drop only
# succeeds if "waferid" and "defect_count" are among them -- confirm.
p.drop(columns=["waferid","defect_count"],inplace=True)
# In[237]:
data.head()
# In[184]:
# Replace the working frame with the pivoted MEP export.
data = pd.read_csv(r"D:\Users\sgg91044\Desktop/MEP_data_pivot.csv")
# In[6]:
# Coerce the first 17 columns to numeric; cells that cannot be parsed become NaN.
data.iloc[:, 0:17] = data.iloc[:, 0:17].apply(pd.to_numeric, errors='coerce')
# In[239]:
# Fill the missing values of each of the first 18 columns with the median of
# that column's non-missing values.
# NOTE(review): the coercion above covers 17 columns (0-16) while this loop
# covers 18 (0-17) -- confirm which range is intended.
for i in range(18):
    column = data.iloc[:, i]
    med = np.median(column.dropna())
    data.iloc[:, i] = column.fillna(med)
# In[139]:
# Store the label column as a pandas categorical.
data.Target = data.Target.astype("category")
# In[140]:
# Y is the label column; X is every other column of `data`.
Y = data.Target
X = data.drop(columns='Target')
# In[195]:
# Shared encoder instances reused by the encoding cells that follow.
ohe = OneHotEncoder()
le = LabelEncoder()
# In[246]:
# Remove the identifier columns from the feature frame.
# NOTE(review): later cells still read X["eqpid"], X["Chamber"] and
# X["Recipie_Name"]; those lookups fail if this cell has run first -- confirm
# the intended cell order.
X=X.drop(columns=['eqpid','lotid','Chamber','Recipie_Name'])
X
# Fill the missing values of each of the first 18 columns of `data` with the
# median of that column's non-missing values.
# NOTE(review): this writes to `data`, not to `X`; X was produced earlier by
# data.drop(...), so confirm whether X was the intended target.
for i in range(18):
    column = data.iloc[:, i]
    med = np.median(column.dropna())
    data.iloc[:, i] = column.fillna(med)
# In[243]:
# Integer-encode three columns of X, selected by position, into new columns.
# The single `le` instance is refit on every call, so afterwards it only holds
# the classes of the last column it was fitted on.
# NOTE(review): positions 0/1/2 are presumably equipment, chamber and recipe --
# verify against the actual column order of X.
X["eqp_encoded"] = le.fit_transform(X.iloc[:,0])
X["chmbr_encoded"] = le.fit_transform(X.iloc[:,1])
# NOTE(review): 'Step' is dropped between the second and third encoding; if it
# sits at position 0-2 the column read by iloc[:,2] below shifts -- confirm.
X.drop(columns=['Step'],inplace=True)
X['recipe_encoded'] = le.fit_transform(X.iloc[:,2])
# In[135]:
# One-hot encode each integer-coded categorical column. `ohe` is refit for
# every column, and the encoder output is converted to a dense array.
X_eqp = ohe.fit_transform(X.eqp_encoded.values.reshape(-1, 1)).toarray()
X_chamber = ohe.fit_transform(X.chmbr_encoded.values.reshape(-1, 1)).toarray()
X_recipie = ohe.fit_transform(X.recipe_encoded.values.reshape(-1, 1)).toarray()
# Wrap every indicator matrix in a frame whose columns are named
# <prefix><category index> and append it to X column-wise.
for _prefix, _indicators in (
    ("Eqp_", X_eqp),
    ("Chamber_", X_chamber),
    ("Recipie_", X_recipie),
):
    _names = [_prefix + str(k) for k in range(_indicators.shape[1])]
    dfOneHot = pd.DataFrame(_indicators, columns=_names)
    X = pd.concat([X, dfOneHot], axis=1)
# In[136]:
# Keep the raw identifiers side by side with their integer codes so that an
# encoded value can be traced back to the original label.
# NOTE(review): this reads X["eqpid"], X["Chamber"] and X["Recipie_Name"],
# which an earlier cell drops from X -- confirm the intended cell order.
_raw_columns = ["eqpid","Chamber","Recipie_Name"]
_code_columns = ["eqp_encoded","chmbr_encoded","recipe_encoded"]
Trace_back = pd.concat([X[_raw_columns], X[_code_columns]], axis=1)
# In[137]:
# Remove from X every column that was copied into Trace_back.
X.drop(columns=Trace_back.columns.tolist(), inplace=True)
# In[197]:
# Run sklearn's Normalizer over the first 19 columns of X and write the result
# back into the same positions.
# NOTE(review): Normalizer rescales each sample (row) to unit norm rather than
# scaling each feature column -- confirm that row-wise scaling is intended.
nz = Normalizer()
X.iloc[:,0:19]=pd.DataFrame(nz.fit_transform(X.iloc[:,0:19]),columns=X.iloc[:,0:19].columns)
# In[150]:
# Rebuild Y and X directly from `data`; this rebinds X and so discards the
# encoded/normalised frame assembled by the preceding cells.
data.Target = data.Target.astype("category")
Y = data.Target
X = data.drop(columns='Target')
# In[124]:
# Oversample with SMOTE on the full X/Y (no train/test split has been made in
# this cell).
# NOTE(review): `ratio=` and `fit_sample` are the older imbalanced-learn
# spelling; newer releases use `sampling_strategy=` and `fit_resample` --
# confirm against the installed version.
sm = SMOTE(random_state=12, ratio = 1.0)
X_smote, Y_smote = sm.fit_sample(X, Y)
# In[237]:
# Start again from the model-ready CSV.
data = pd.read_csv(r"D:\Users\sgg91044\Desktop\model_data_1.csv")
# In[238]:
# Store the three identifier-like columns as pandas categoricals.
for _name in ("eqpid", "chamber", "wafer"):
    data[_name] = data[_name].astype("category")
# In[239]:
# The label is categorical as well; X is every column except the label.
data["Target"] = data["Target"].astype("category")
Y = data["Target"]
X = data.drop(columns=["Target"])
# In[240]:
# Hold out 20% of the rows for testing; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
X, Y, test_size=0.2, random_state=8)
# In[241]:
# Oversample the training split only; the test split is left untouched.
# NOTE(review): `ratio=` / `fit_sample` are the older imbalanced-learn
# spelling -- confirm against the installed version.
sm = SMOTE(random_state=12, ratio = 1.0)
x_train_smote, y_train_smote = sm.fit_sample(X_train, y_train)
# In[242]:
# Both printed values describe y_train (before oversampling).
# NOTE(review): a later cell prints np.bincount(y_train_smote) instead --
# confirm which of the two is wanted here.
print(y_train.value_counts(), np.bincount(y_train))
# In[243]:
from sklearn.ensemble import RandomForestClassifier
# Forest of 100 trees with a fixed seed, out-of-bag scoring enabled,
# progress output on and n_jobs=-1.
random_forest = RandomForestClassifier(
    n_estimators=100,
    random_state=50,
    oob_score=True,
    verbose=1,
    n_jobs=-1,
)
# In[244]:
# Fit on the SMOTE-resampled training split.
random_forest.fit(x_train_smote, y_train_smote)
# In[245]:
# Predict labels for the untouched test split.
y_pred = random_forest.predict(X_test)
# In[246]:
# Per-class precision / recall / F1 summary.
_rf_report = classification_report(y_test, y_pred)
print(_rf_report)
# In[247]:
# Confusion matrix of true labels against predictions.
_rf_confusion = confusion_matrix(y_test, y_pred)
print(_rf_confusion)
# In[235]:
# Older scikit-learn bundles joblib under sklearn.externals; releases that no
# longer ship it need the standalone `joblib` package, so fall back to that.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib
# Save the fitted forest to a file in the current working directory.
joblib_file = "model_RF.pkl"
joblib.dump(random_forest, joblib_file)
# In[229]:
# Load a separate data set (features and labels from two CSV files) that the
# fitted model is later scored on.
# NOTE(review): presumably November production data, judging by the file
# names -- confirm.
X_Nov=pd.read_csv(r'D:\Users\sgg91044\Desktop\sep_oct_data\Nov_good_imputed.csv')
Y_Nov=pd.read_csv(r'D:\Users\sgg91044\Desktop\sep_oct_data\Y_Nov.csv')
# In[230]:
def encode_eqpid(eqpid):
    """Return a zero-based integer code for an equipment id string.

    The last two characters of ``eqpid`` are parsed as an integer and one is
    subtracted, so an id ending in ``"01"`` maps to ``0``.

    Raises:
        ValueError: if the last two characters are not an integer literal.
    """
    return int(eqpid[-2:]) - 1
def encode_chamber(chamber):
    """Return ``0`` for chamber ``'A'`` and ``1`` for any other value."""
    return 0 if chamber == 'A' else 1
def encode_wafer(wafer):
    """Return the zero-based index for a positive wafer number.

    Returns ``wafer - 1`` when ``wafer`` is greater than zero. For any other
    value (zero, negative, or NaN, which fails the comparison) the result is
    ``None``.
    """
    if wafer > 0:
        return wafer - 1
    # Made explicit: non-positive input previously fell off the end of the
    # function and produced None implicitly.
    return None
# Rebinds `data` to a separate example CSV, replacing the frame loaded earlier.
data=pd.read_csv(r"D:\Users\sgg91044\Desktop\normalizing_example.csv")
# A fresh Normalizer; this `nz` instance is reused by the cell that
# preprocesses X_Nov.
nz = Normalizer()
# Normalise two positional column groups independently and write each result
# back in place: columns 8-9 first, then columns 0-2.
data.iloc[:,8:10]=pd.DataFrame(nz.fit_transform(data.iloc[:,8:10]),columns=data.iloc[:,8:10].columns)
data.iloc[:,0:3]=pd.DataFrame(nz.fit_transform(data.iloc[:,0:3]),columns=data.iloc[:,0:3].columns)
# In[231]:
# Turn the identifier columns of X_Nov into integer codes with the encode_*
# helpers defined above, then store them as categoricals.
X_Nov.eqpid = X_Nov.eqpid.apply(encode_eqpid)
X_Nov.chamber = X_Nov.chamber.apply(encode_chamber)
X_Nov.wafer = X_Nov.wafer.apply(encode_wafer)
X_Nov.eqpid = X_Nov.eqpid.astype("category")
X_Nov.chamber = X_Nov.chamber.astype("category")
X_Nov.wafer = X_Nov.wafer.astype("category")
# Apply the shared Normalizer `nz` to two positional column groups.
# NOTE(review): the example frame used columns 8-9 and 0-2, here it is 11-12
# and 3-5 -- presumably the same features at shifted positions; verify.
X_Nov.iloc[:,11:13]=nz.transform(X_Nov.iloc[:,11:13])
X_Nov.iloc[:,3:6]=nz.transform(X_Nov.iloc[:,3:6])
# Derived feature: element-wise sum of the three ETCM phase columns.
# NOTE(review): this runs after the normalisation above; confirm whether the
# sum should use raw or normalised values.
X_Nov["SUM_ETCM"]=np.array(X_Nov.ETCM_PHA4)+np.array(X_Nov.ETCM_PHB4)+np.array(X_Nov.ETCM_PHC4)
# In[232]:
# Remove the label column so that only features remain.
X_Nov=X_Nov.drop(columns="Target")
# In[233]:
# Predict labels for X_Nov with the fitted forest.
y_pred = random_forest.predict(X_Nov)
# In[234]:
# Score the X_Nov predictions against the labels loaded into Y_Nov.
print(classification_report(y_pred=y_pred,y_true=Y_Nov))
# In[129]:
# NOTE(review): in file order `y_pred` holds the X_Nov predictions at this
# point, yet it is compared with `y_test` from the earlier train/test split.
# The lower cell counters suggest these two cells were run at another time --
# confirm which labels are meant.
print("Accuracy of Random_forest:",round(accuracy_score(y_pred=y_pred,y_true=y_test) * 100,2),"%")
# In[130]:
print("Sensitivity of Random_forest:",round(recall_score(y_pred=y_pred,y_true=y_test)*100,2),"%")
# In[18]:
# Older scikit-learn bundles joblib under sklearn.externals; releases that no
# longer ship it need the standalone `joblib` package, so fall back to that.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib
# Write the fitted forest to the deployment folder.
joblib.dump(random_forest, r'D:\Users\sgg91044\Desktop\deployment\model_RF.pkl')
# In[217]:
# Three separate search grids for the SVC: RBF kernel (gamma x C), linear
# kernel (C only) and polynomial kernel (degree only).
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
'C': [1, 10, 100, 1000]},
{'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
{'kernel':['poly'],'degree':[2,3,5]}]
# 3-fold cross-validated search scored on recall, fitted on the
# SMOTE-resampled training split.
clf = GridSearchCV(SVC(),param_grid=tuned_parameters,cv=3,scoring='recall',verbose=True)
clf.fit(x_train_smote,y_train_smote)
# In[218]:
# Bare expression: shows the selected hyper-parameters when run in a notebook.
[clf.best_estimator_.kernel,clf.best_estimator_.C,clf.best_estimator_.gamma]
# Rebinds `y_pred` to the grid-search model's predictions on the test split.
y_pred = clf.predict(X_test)
# In[219]:
print(classification_report(y_pred=y_pred,y_true=y_test))
# In[165]:
# Save the predictions, with index and header, to CSV.
df=pd.DataFrame(y_pred)
df.to_csv(r'D:\Users\sgg91044\Desktop\df_pred.csv', index=True, header=True)
# In[223]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression model fitted on the original training split (X_train /
# y_train), not on the SMOTE-resampled one used for the other models.
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train,y_train)
# In[224]:
# Predictions are kept in `y_test_pred`; `y_pred` is left unchanged.
y_test_pred=classifier.predict(X_test)
# In[225]:
print(classification_report(y_pred=y_test_pred,y_true=y_test))
# In[120]:
# NOTE(review): the three metrics below read `y_pred`, not `y_test_pred`. In
# file order `y_pred` was last assigned by the SVC grid search, while the
# printed labels say Random_forest -- confirm which model these refer to.
f1_score(y_pred=y_pred,y_true=y_test)
# In[121]:
print("Accuracy of Random_forest:",round(accuracy_score(y_pred=y_pred,y_true=y_test) * 100,2),"%")
# In[122]:
print("Sensitivity of Random_forest:",round(recall_score(y_pred=y_pred,y_true=y_test)*100,2),"%")
# In[30]:
# Second split of the same X/Y: 30% held out this time, with a different seed
# from the earlier 80/20 split. Rebinds X_train, X_test, y_train and y_test.
X_train, X_test, y_train, y_test = train_test_split(
X, Y, test_size=0.3, random_state=0)
# In[31]:
# Oversample the new training split only.
# NOTE(review): `ratio=` / `fit_sample` are the older imbalanced-learn
# spelling -- confirm against the installed version.
sm = SMOTE(random_state=12, ratio = 1.0)
x_train_smote, y_train_smote = sm.fit_sample(X_train, y_train)
# In[32]:
# Class counts before oversampling (value_counts of y_train) next to the
# counts after it (bincount of y_train_smote).
print(y_train.value_counts(), np.bincount(y_train_smote))
# In[86]:
from sklearn.ensemble import RandomForestClassifier
# Same forest configuration as the earlier run, but without out-of-bag scoring.
random_forest = RandomForestClassifier(
    n_estimators=100,
    random_state=50,
    verbose=1,
    n_jobs=-1,
)
# In[89]:
# Fit on the SMOTE-resampled training split.
random_forest.fit(x_train_smote, y_train_smote)
# In[90]:
# Predict labels for the test split.
y_pred = random_forest.predict(X_test)
# In[91]:
_rf_report = classification_report(y_test, y_pred)
print(_rf_report)
# In[92]:
_rf_confusion = confusion_matrix(y_test, y_pred)
print(_rf_confusion)
# In[93]:
# Bare expression: the F1 score is only displayed when run as a notebook cell.
f1_score(y_test, y_pred)
# In[220]:
_rf_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of Random_forest:", round(_rf_accuracy * 100, 2), "%")
# In[221]:
_rf_recall = recall_score(y_test, y_pred)
print("Sensitivity of Random_forest:", round(_rf_recall * 100, 2), "%")
# In[96]:
# Class-membership probabilities for the test split, one column per class.
y_pred_rf = random_forest.predict_proba(X_test)
y_pred_rf
# In[99]:
# roc_curve is not among the file-level imports, so bring it into scope here.
from sklearn.metrics import roc_curve
# The random forest model by itself: keep only the second probability column
# as the score and compute the ROC points from it.
y_pred_rf = random_forest.predict_proba(X_test)[:, 1]
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf)
# In[83]:
import matplotlib.pyplot as plt
plt.figure(1)
# Dashed diagonal as a visual reference line.
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_rf, tpr_rf, label='RF')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()
# Compute micro-average ROC curve and ROC area.
# The three result containers are created here because nothing earlier in the
# file defines them.
fpr, tpr, roc_auc = {}, {}, {}
# np.ravel flattens both pandas and NumPy inputs without relying on
# Series.ravel.
fpr["micro"], tpr["micro"], _ = roc_curve(np.ravel(y_test), np.ravel(y_pred_rf))
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
print ("AUC of Random_forest:", roc_auc["micro"])
我的代码-random forest的更多相关文章
- 多分类问题中,实现不同分类区域颜色填充的MATLAB代码(demo:Random Forest)
之前建立了一个SVM-based Ordinal regression模型,一种特殊的多分类模型,就想通过可视化的方式展示模型分类的效果,对各个分类区域用不同颜色表示.可是,也看了很多代码,但基本都是 ...
- [Machine Learning & Algorithm] 随机森林(Random Forest)
1 什么是随机森林? 作为新兴起的.高度灵活的一种机器学习算法,随机森林(Random Forest,简称RF)拥有广泛的应用前景,从市场营销到医疗保健保险,既可以用来做市场营销模拟的建模,统计客户来 ...
- paper 56 :机器学习中的算法:决策树模型组合之随机森林(Random Forest)
周五的组会如约而至,讨论了一个比较感兴趣的话题,就是使用SVM和随机森林来训练图像,这样的目的就是 在图像特征之间建立内在的联系,这个model的训练,着实需要好好的研究一下,下面是我们需要准备的入门 ...
- sklearn_随机森林random forest原理_乳腺癌分类器建模(推荐AAA)
sklearn实战-乳腺癌细胞数据挖掘(博主亲自录制视频) https://study.163.com/course/introduction.htm?courseId=1005269003& ...
- 随机森林(Random Forest)
阅读目录 1 什么是随机森林? 2 随机森林的特点 3 随机森林的相关基础知识 4 随机森林的生成 5 袋外错误率(oob error) 6 随机森林工作原理解释的一个简单例子 7 随机森林的Pyth ...
- 随机森林(Random Forest),决策树,bagging, boosting(Adaptive Boosting,GBDT)
http://www.cnblogs.com/maybe2030/p/4585705.html 阅读目录 1 什么是随机森林? 2 随机森林的特点 3 随机森林的相关基础知识 4 随机森林的生成 5 ...
- [ML学习笔记] 决策树与随机森林(Decision Tree&Random Forest)
[ML学习笔记] 决策树与随机森林(Decision Tree&Random Forest) 决策树 决策树算法以树状结构表示数据分类的结果.每个决策点实现一个具有离散输出的测试函数,记为分支 ...
- [Machine Learning & Algorithm] 随机森林(Random Forest)-转载
作者:Poll的笔记 博客出处:http://www.cnblogs.com/maybe2030/ 阅读目录 1 什么是随机森林? 2 随机森林的特点 3 随机森林的相关基础知识 4 随机森林的生成 ...
- ML(4.3): R Random Forest
随机森林模型是一种数据挖掘模型,常用于进行分类预测.随机森林模型包含多个树形分类器,预测结果由多个分类器投票得出. 决策树相当于一个大师,通过自己在数据集中学到的知识对于新的数据进行分类.俗话说得好, ...
随机推荐
- RSA填充模式
1)RSA_PKCS1_PADDING 填充模式,最常用的模式。输入:RSA_size(rsa) – 11;输出:和 modulus 一样长。2)RSA_PKCS1_OAEP_PADDING:RSA_size(r ...
- Saiku Table展示数据合并bug修复(二十五)
Saiku Table展示数据合并bug修复 Saiku以table的形式展示数据,如果点击了 非空的字段 按钮,则会自动进行数据合并,为空的数据行以及数据列都会自动隐藏掉. 首先我们应该定位问题: ...
- Java 访问限制符 在同一包中或在不同包中:使用类创建对象的权限 & 对象访问成员变量与方法的权限 & 继承的权限 & 深入理解protected权限
一.实例成员与类成员 1. 当类的字节码被加载到内存, 类中类变量.类方法即被分配了相应内存空间.入口地址(所有对象共享). 2. 当该类创建对象后,类中实例变量被分配内存(不同对象的实例变量互不相同 ...
- 关于html的基础标签
html:超文本标记语言。h1 -- h6:标题标签;一级标题,一个页面中只能存在一个 h1;数字越大,标题的级别越小。br:换行。p:段落,自带换行,前后自带间隙。a:href 属性---指定将要跳 ...
- 等积投影(equal-area projection)
等积投影(equal-area projection)是地图投影的一种,是地图上任何图形面积经主比例尺放大以后与实地上相应图形面积保持大小不变的一类投影.即投影面积与实地面积相等的投影——面积比为1. ...
- Visual Studio+VAssistX自动添加注释,函数头注释,文件头注释
转载:http://blog.csdn.net/xzytl60937234/article/details/70455777 在VAssistX中为C++提供了比较规范注释模板,用这个注释模板为编写的 ...
- springmvc初始化失败问题跟踪
1.问题 访问路径http://10.118.30.52:8088/helloWorld/hello后会报404错误,原因是springmvc配置文件中的包扫描路径错误.修改配置如下: <con ...
- erlang并发编程(二)
补充-------erlang并发编程 Pid =spawn(fun()-> do_sth() end). 进程监视: Ref = monitor(process, Pid)靠抛异常来终结进程 ...
- log4j的简介和使用
一.log4j是什么 引用官网的介绍 Log4j is a fast and flexible framework for logging application debugging messages ...
- Devexpress GridControl 多选
以前dev的多选要自己处理,一般的处理方式就是在单元格中添加checkbox控件.后来的版本中dev增加了多选的支持,只需要设置一下属性就可以了,属性如下图: 然后效果设计页面就是这个样子: 运行以后 ...