RM

 # -*- coding: utf-8 -*-
 """
 RandomForestClassifier 예

 """
 import pandas as pd
 import numpy as np
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.model_selection import train_test_split
 from sklearn.datasets import load_wine
 from sklearn import metrics # model evaluation tools

 #1.dataset load
 wine=load_wine()
 wine_x=wine.data
 wine_y=wine.target # 3 classes

 # view the data set
 print(wine_x[:5,:])
 """
 [[1.423e+01 1.710e+00 2.430e+00 1.560e+01 1.270e+02 2.800e+00 3.060e+00
   2.800e-01 2.290e+00 5.640e+00 1.040e+00 3.920e+00 1.065e+03]
  [1.320e+01 1.780e+00 2.140e+00 1.120e+01 1.000e+02 2.650e+00 2.760e+00
   2.600e-01 1.280e+00 4.380e+00 1.050e+00 3.400e+00 1.050e+03]
  [1.316e+01 2.360e+00 2.670e+00 1.860e+01 1.010e+02 2.800e+00 3.240e+00
   3.000e-01 2.810e+00 5.680e+00 1.030e+00 3.170e+00 1.185e+03]
  [1.437e+01 1.950e+00 2.500e+00 1.680e+01 1.130e+02 3.850e+00 3.490e+00
   2.400e-01 2.180e+00 7.800e+00 8.600e-01 3.450e+00 1.480e+03]
  [1.324e+01 2.590e+00 2.870e+00 2.100e+01 1.180e+02 2.800e+00 2.690e+00
   3.900e-01 1.820e+00 4.320e+00 1.040e+00 2.930e+00 7.350e+02]]
 """
 print(wine_y[:5]) #[0 0 0 0 0]
 print(wine_y[170:175]) #[2 2 2 2 2]

 #2. train/test
 X_train,X_test,y_train,y_test=train_test_split(wine_x,wine_y,test_size=0.3) # no random_state: split (and scores below) vary per run

 #3. create RF model
 obj=RandomForestClassifier()
 model=obj.fit(X_train,y_train)
 print(model)
 """
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
             oob_score=False, random_state=None, verbose=0,
             warm_start=False)
 """

 #4. model evaluation
 pred=model.predict(X_test)
 Y=y_test

 # evaluation metrics
 acc=metrics.accuracy_score(Y,pred)
 print(acc) #0.9629629629629629

 report=metrics.classification_report(Y,pred)
 print(report)
 """
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       0.90      1.00      0.95        18
           2       1.00      0.91      0.95        22

 avg / total       0.97      0.96      0.96        54
 """

 ##############################################
 # RF model Tuning
 ##############################################
 """
 n_estimators=10 tree개수 (400~500) 제일 좋음
 min_samples_split=2 : 변수의 개수(sqrt(n))
 """
 #2. train/test
 X_train,X_test,y_train,y_test=train_test_split(wine_x,wine_y,test_size=0.3)
 print(wine_x.shape) # (178, 13); take the square root of 13
 print(np.sqrt(13)) #3.605551275463989=>4

 # 3. create RF model
 obj2=RandomForestClassifier(n_estimators=400,
                            min_samples_split=4) # 4 from the sqrt rule above
 model2=obj2.fit(X_train,y_train)
 print(model2)
 """
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=4,
             min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=1,
             oob_score=False, random_state=None, verbose=0,
             warm_start=False)
 """
 # 4. model evaluation
 pred2=model2.predict(X_test) # predict with the tuned model (model2, not model)
 Y=y_test

 # evaluation metrics
 acc=metrics.accuracy_score(Y,pred2)
 print(acc) #0.9814814814814815

 report=metrics.classification_report(Y,pred2)
 print(report)
 """
              precision    recall  f1-score   support
           0       1.00      1.00      1.00        14
           1       0.95      1.00      0.97        18
           2       1.00      0.95      0.98        22
 avg / total       0.98      0.98      0.98        54
 """

cross_validation

 # -*- coding: utf-8 -*-
 """
 교차 검정예
 """
 import pandas as pd
 from sklearn.model_selection import cross_validate # cross-validation
 from sklearn.ensemble import RandomForestClassifier # RM

 # 1. data set
 iris=pd.read_csv("../data/iris.csv")
 print(iris.info())
 """
 <class 'pandas.core.frame.DataFrame'>
 RangeIndex: 150 entries, 0 to 149
 Data columns (total 5 columns):
 Sepal.Length    150 non-null float64
 Sepal.Width     150 non-null float64
 Petal.Length    150 non-null float64
 Petal.Width     150 non-null float64
 Species         150 non-null object
 dtypes: float64(4), object(1)
 memory usage: 5.9+ KB
 None
 """

 cols=list(iris.columns)

 x_data=iris[cols[:4]] # first 4 columns (features)
 y_data=iris[cols[-1]] # Species (label)

 #2. create model
 obj=RandomForestClassifier()
 model=obj.fit(x_data,y_data)

 #3. cross-validation, cv=5 (5-fold); cross_validate clones and refits the estimator per fold
 score=cross_validate(model,x_data,y_data,cv=5)
 print(score)
 """
 {'fit_time': array([0.01000023, 0.01000023, 0.00900006, 0.00999999, 0.01000023]),
 'score_time': array([0.00099993, 0.00099993, 0.00099993, 0.00100017, 0.00099993]),
 'test_score': array([0.96666667, 0.96666667, 0.9       , 0.93333333, 1.        ]),
 'train_score': array([1., 1., 1., 1., 1.])}
 """

 test_score=list(score['test_score'])
 print(test_score) # [0.966..., 0.966..., 0.9, 0.933..., 1.0]

 import numpy as np
 score_arr=np.array(test_score) # (score['test_score'] is already an ndarray)
 print(score_arr.mean()) # 0.9533333333333334
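
 The list/array detour can be skipped: cross_val_score returns the fold scores as an ndarray directly, and it clones the estimator itself, so pre-fitting is unnecessary. A minimal sketch:

 from sklearn.model_selection import cross_val_score

 # (sketch) same 5-fold cross-validation in one call
 scores=cross_val_score(RandomForestClassifier(),x_data,y_data,cv=5)
 print(scores.mean())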

RM_regression

 # -*- coding: utf-8 -*-
 """
 RandomForestRegressor 예
 """

 import pandas as pd
 import numpy as np

 from sklearn.ensemble import RandomForestRegressor
 from sklearn.model_selection import train_test_split
 from sklearn.datasets import load_boston # data set (removed in scikit-learn 1.2; needs an older sklearn)
 from sklearn import metrics # model evaluation tools

 # 1. dataset load
 boston = load_boston()
 boston_x = boston.data
 boston_y = boston.target # continuous target

 # view the data set
 print(boston_x.shape) # (506, 13)
 print(boston_y.shape) # (506,) 

 # 2. train/test
 x_train, x_test, y_train, y_test = train_test_split(
         boston_x, boston_y, test_size=0.3, random_state=123)

 # 3. create RF model
 obj = RandomForestRegressor(random_state=234)
 model = obj.fit(x_train, y_train)
 print(model)
 """
 RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
            max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=234, verbose=0, warm_start=False)
 """

 # 4. model evaluation
 y_pred = model.predict(x_test)
 y_real_value = y_test 

 # evaluation metric: score() returns R^2 for a regressor
 model_score = model.score(x_test, y_test)
 print(model_score)
 # 0.7998066141697237
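
 score() gives only R^2; the metrics module imported above can report error-based measures on the same predictions. A minimal sketch:

 # (sketch) error metrics for the regressor
 mse = metrics.mean_squared_error(y_real_value, y_pred)
 mae = metrics.mean_absolute_error(y_real_value, y_pred)
 print('MSE =', mse)   # mean squared error
 print('MAE =', mae)   # mean absolute error
 print('RMSE =', np.sqrt(mse))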

xgboost_test

 # -*- coding: utf-8 -*-
 """
 xgboot분류분석
 """
 import pandas as pd
 from xgboost import XGBClassifier # model
 from xgboost import plot_importance # feature-importance plot
 from xgboost import plot_tree # tree plot
 from sklearn.model_selection import train_test_split

 #1.iris data set load
 iris=pd.read_csv("../data/iris.csv")

 cols=list(iris.columns)
 iris_x=iris[cols[:4]]
 iris_y=iris[cols[-1]]

 # 2. train/test set
 x_train,x_test,y_train,y_test=train_test_split(iris_x,iris_y,test_size=0.3,random_state=123)

 # 3. create model
 obj=XGBClassifier()
 model=obj.fit(x_train,y_train)
 print(model)
 """
 XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
        colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
        max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
        n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
        reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
        silent=True, subsample=1)

  objective = "binary:logistic": binary:logistic" : y변수 이항
 • max_depth = 2: tree 구조가 간단한 경우 : 2
 • nthread = 2 : cpu 사용 수 : 2
 • nrounds = 2 : 실제값과 예측값의 차이를 줄이기 위한 반복학습 횟수
 • eta = 1 : 학습률을 제어하는 변수(Default: 0.3), 오버 피팅을 방지

 """
 # 4. model evaluation
 y_pred=model.predict(x_test)
 print(y_pred)
 Y=y_test
 """
 ['versicolor' 'virginica' 'virginica' 'versicolor' 'setosa' 'versicolor'
  'versicolor' 'setosa' 'setosa' 'versicolor' 'virginica' 'setosa'
  'versicolor' 'virginica' 'virginica' 'virginica' 'setosa' 'setosa'
  'versicolor' 'setosa' 'setosa' 'versicolor' 'setosa' 'virginica' 'setosa'
  'setosa' 'setosa' 'virginica' 'virginica' 'setosa' 'virginica'
  'versicolor' 'setosa' 'setosa' 'versicolor' 'versicolor' 'virginica'
  'setosa' 'setosa' 'versicolor' 'versicolor' 'setosa' 'virginica'
  'virginica' 'virginica']
 """

 # plot feature importance
 import matplotlib.pyplot as plt
 plot_importance(model)
 plt.show()

 # check feature importance (fscore)
 score=model.get_booster().get_fscore()
 print('feature importance =',score)
 # feature importance = {'Petal.Length': 255, 'Petal.Width': 135, 'Sepal.Width': 64, 'Sepal.Length': 118}

 # model evaluation
 from sklearn import metrics
 acc=metrics.accuracy_score(Y,y_pred) # (y_true, y_pred) order
 print("acc=",acc) #acc= 0.9333333333333333

 report=metrics.classification_report(Y,y_pred)
 print(report)
 """
              precision    recall  f1-score   support
      setosa       1.00      1.00      1.00        18
  versicolor       0.77      1.00      0.87        10
   virginica       1.00      0.82      0.90        17
 avg / total       0.95      0.93      0.93        45
 """

 plot_tree(model)
 plt.show()
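
 A confusion matrix makes the per-class errors behind the report concrete. A minimal sketch with the same metrics module:

 # (sketch) rows = actual class, columns = predicted class
 cm=metrics.confusion_matrix(Y,y_pred)
 print(cm)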

xgboost_regression

 # -*- coding: utf-8 -*-
 """
 Created on Sun Feb 24 15:18:35 2019

 @author: 502-03
 """

 import pandas as pd
 from xgboost import XGBRegressor # model (regression)
 from xgboost import plot_importance # feature-importance plot
 from xgboost import plot_tree
 from sklearn.model_selection import train_test_split
 from sklearn.datasets import load_boston # dataset (removed in scikit-learn 1.2)
 import matplotlib.pyplot as plt

 # 1. dataset load
 boston=load_boston()
 x=boston.data
 y=boston.target

 print(x.shape) #(506, 13)
 print(y.shape) #(506,)

 # 2. train/test set
 x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=123)

 # 3. create model
 obj=XGBRegressor(n_estimators=400,max_depth=6)
 model=obj.fit(x_train,y_train)
 print(model)
 """
 XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
        colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
        max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
        n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
        reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
        silent=True, subsample=1)
 """
 # feature importance
 score=model.get_booster().get_fscore()
 print(score)
 """
 {'f5': 83, 'f12': 78, 'f0': 91, 'f4': 42, 'f7': 110, 'f10': 32,
 'f6': 46, 'f9': 38, 'f3': 1, 'f8': 16, 'f11': 51, 'f1': 2, 'f2': 15}
 """

 plot_importance(model)
 plt.show()

 plot_tree(model)
 plt.show()

 print(boston.feature_names)
 """
 ['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD'
 'TAX' 'PTRATIO' 'B' 'LSTAT']
 """

xgboost_freeze

 # -*- coding: utf-8 -*-
 """
 수도사업소 주관 - big 콘테스트 dataset
 동파유무(0 or  1) 분류하는 위한 dataset
 """

 import pandas as pd
 from xgboost import XGBClassifier # model (classification)
 from xgboost import plot_importance # feature-importance plot
 from sklearn.model_selection import train_test_split

 # register a Korean font (Malgun Gothic) so Hangul column names render in plots
 from matplotlib import font_manager, rc
 font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
 rc('font', family=font_name)

 import matplotlib.pyplot as plt # for the importance plot

 freeze = pd.read_csv("../data/freeze_dataset.csv",encoding="MS949")
 print(freeze.info())
 '''
 RangeIndex: 37089 entries, 0 to 37088
 Data columns (total 95 columns):
 '''

 print(freeze.head())

 # fix column names: replace spaces with '_'
 freeze.columns = freeze.columns.str.replace(' ', '_')
 print(freeze.info())

 # target: 동파유무 = freeze occurrence (0 or 1)
 print(freeze['동파유무'].value_counts())
 '''
 0.0    34130 : 90%
 1.0     2959 : 10%
 '''

 cols = list(freeze.columns) # 95 columns
 x_cols = cols[1:]
 y_cols = cols[0]

 print(y_cols) # 동파유무

 train_set, test_set = train_test_split(
         freeze, test_size=0.4)

 # model
 obj = XGBClassifier()
 model = obj.fit(train_set[x_cols], train_set[y_cols])

 # feature-importance score and plot
 score = model.get_booster().get_fscore()
 print(score)

 plot_importance(model)
 plt.show()
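
 The script trains the model but never scores it, and the 90:10 class ratio noted above makes plain accuracy easy to inflate. A hedged sketch of a test-set evaluation plus an imbalance-aware refit; the scale_pos_weight value is an assumption derived from the observed counts:

 from sklearn import metrics

 # (sketch) evaluate on the held-out 40%
 pred = model.predict(test_set[x_cols])
 print(metrics.accuracy_score(test_set[y_cols], pred))
 print(metrics.classification_report(test_set[y_cols], pred))

 # (sketch) weight the minority class: 34130/2959 ≈ 11.5 from the counts above
 obj_w = XGBClassifier(scale_pos_weight=11.5)
 model_w = obj_w.fit(train_set[x_cols], train_set[y_cols])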
