LightGBM 使用 leaf-wise(按叶子生长)的树生长策略。leaf-wise 生长的优点是收敛速度快,缺点是容易过拟合,因此调参时需要同时限制树的复杂度(如 num_leaves、max_depth、min_data_in_leaf)。

# lightgbm关键参数

# lightgbm调参方法cv

代码github地址

  # -*- coding: utf-8 -*-
  """LightGBM parameter-tuning example driven by lgb.cv.

  Author: wanglei5205
  Email:  wanglei5205@126.com
  Blog:   http://cnblogs.com/wanglei5205
  GitHub: http://github.com/wanglei5205
  """
  ### Imports
  import numpy as np
  import pandas as pd
  import lightgbm as lgb
  from sklearn import metrics
  from sklearn.model_selection import train_test_split

  ### Load data
  print('载入数据')
  DATA_DIR = 'G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data'
  datasets = [
      pd.read_csv('{}/7_train_data{}.csv'.format(DATA_DIR, idx))
      for idx in range(1, 6)
  ]

  print('数据去重')
  datasets = [frame.drop_duplicates() for frame in datasets]
  # Keep the original per-file names available for any later code.
  dataset1, dataset2, dataset3, dataset4, dataset5 = datasets

  print('数据合并')
  # Files 1-4 form the labelled training pool; file 5 is the online test set.
  trains = pd.concat([dataset1, dataset2, dataset3, dataset4], axis=0)
  online_test = dataset5

  ### Split data (train + validation + offline test)
  print('数据拆分')
  train_xy, offline_test = train_test_split(trains, test_size=0.2, random_state=21)
  train, val = train_test_split(train_xy, test_size=0.2, random_state=21)

  # Training set
  y_train = train.is_trade                                    # labels
  X_train = train.drop(['instance_id', 'is_trade'], axis=1)   # feature matrix

  # Validation set
  y_val = val.is_trade                                        # labels
  X_val = val.drop(['instance_id', 'is_trade'], axis=1)       # feature matrix

  # Test sets
  offline_test_X = offline_test.drop(['instance_id', 'is_trade'], axis=1)
  # errors='ignore': the online file is presumably unlabelled (TODO confirm);
  # if it does carry is_trade, drop it so the label never leaks into the
  # features and the column count matches the training matrix.
  online_test_X = online_test.drop(
      ['instance_id', 'is_trade'], axis=1, errors='ignore'
  )

  ### Convert to LightGBM datasets
  print('数据转换')
  # free_raw_data=False keeps the raw frames attached to the Dataset objects.
  lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False)
  lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train, free_raw_data=False)

  ### Initial parameters (the cross-validated ones are added later)
  print('设置参数')
  params = {
      'boosting_type': 'gbdt',
      'objective': 'binary',
      'metric': 'binary_logloss',
  }

  ### Cross-validation (parameter tuning)
  import itertools

  print('交叉验证')


  def _cv_min_error(trial_params, train_set, seed, stopping_rounds):
      """Run 3-fold lgb.cv and return the lowest mean binary_error it reached.

      Early stopping and per-iteration logging are passed as callbacks, which
      is accepted by both old and new LightGBM releases (the
      ``early_stopping_rounds`` / ``verbose_eval`` keyword arguments were
      removed in LightGBM 4.0).
      """
      # log_evaluation replaced print_evaluation in newer LightGBM releases.
      log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
      cv_results = lgb.cv(
          trial_params,
          train_set,
          seed=seed,
          nfold=3,
          metrics=['binary_error'],
          callbacks=[
              lgb.early_stopping(stopping_rounds, verbose=False),
              log_evaluation(),
          ],
      )
      # The result key is 'binary_error-mean' on older releases and
      # 'valid binary_error-mean' on LightGBM >= 4.0.
      mean_key = next(k for k in cv_results if k.endswith('binary_error-mean'))
      return min(cv_results[mean_key])


  def _grid_search(base_params, train_set, grid, seed, stopping_rounds):
      """Exhaustively search ``grid`` and return ``(best_values, best_error)``.

      ``grid`` is a list of ``(param_name, candidate_values)`` pairs; the first
      pair varies slowest. Ties keep the first combination encountered.
      ``base_params`` is not modified.
      """
      names = [name for name, _ in grid]
      best_values = {}
      best_error = float('Inf')
      for combo in itertools.product(*(values for _, values in grid)):
          trial_params = dict(base_params)
          trial_params.update(zip(names, combo))
          error = _cv_min_error(trial_params, train_set, seed, stopping_rounds)
          if error < best_error:
              best_error = error
              best_values = dict(zip(names, combo))
      return best_values, best_error


  # Fractions in (0, 1]; 0.0 is rejected by LightGBM for feature_fraction and
  # bagging_fraction, so it is not part of the fraction grid.
  _FRACTIONS = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
  _PENALTIES = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

  best_params = {}

  # Each stage starts from its own "infinite" error. Carrying the best error
  # over from the previous stage meant a stage that never improved on it
  # recorded no values at all, and the following assignment raised KeyError.

  # Accuracy
  print("调参1:提高准确率")
  # NOTE(review): num_leaves above 2 ** max_depth cannot be reached, so many
  # of these combinations are effectively duplicates — consider a tighter grid.
  stage_best, min_merror = _grid_search(
      params,
      lgb_train,
      [('num_leaves', range(20, 200, 5)), ('max_depth', range(3, 8, 1))],
      seed=2018,
      stopping_rounds=10,
  )
  best_params.update(stage_best)
  params.update(stage_best)

  # Over-fitting
  print("调参2:降低过拟合")
  # max_bin must be greater than 1, so the grid starts at 5 instead of 1.
  stage_best, min_merror = _grid_search(
      params,
      lgb_train,
      [('max_bin', range(5, 256, 5)), ('min_data_in_leaf', range(10, 200, 5))],
      seed=42,
      stopping_rounds=3,
  )
  best_params.update(stage_best)
  params.update(stage_best)

  print("调参3:降低过拟合")
  stage_best, min_merror = _grid_search(
      params,
      lgb_train,
      [
          ('feature_fraction', _FRACTIONS),
          ('bagging_fraction', _FRACTIONS),
          ('bagging_freq', range(0, 50, 5)),
      ],
      seed=42,
      stopping_rounds=3,
  )
  best_params.update(stage_best)
  params.update(stage_best)

  print("调参4:降低过拟合")
  stage_best, min_merror = _grid_search(
      params,
      lgb_train,
      [
          ('lambda_l1', _PENALTIES),
          ('lambda_l2', _PENALTIES),
          ('min_split_gain', _PENALTIES),
      ],
      seed=42,
      stopping_rounds=3,
  )
  best_params.update(stage_best)
  params.update(stage_best)

  print(best_params)

  ### Train
  import os

  try:
      import joblib
  except ImportError:
      # Older scikit-learn releases bundled joblib; newer ones removed it.
      from sklearn.externals import joblib

  params['learning_rate'] = 0.01
  # log_evaluation replaced print_evaluation in newer LightGBM releases.
  log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
  # Keep the returned Booster: every later step (predict, best_iteration,
  # feature_importance, dump) must use it, not the lightgbm module itself.
  gbm = lgb.train(
      params,                   # parameter dict
      lgb_train,                # training set
      valid_sets=[lgb_eval],    # validation set
      num_boost_round=2000,     # maximum number of boosting rounds
      callbacks=[
          lgb.early_stopping(50),   # stop after 50 rounds without improvement
          log_evaluation(),
      ],
  )

  # Both result files below are written into ./data.
  if not os.path.isdir('./data'):
      os.makedirs('./data')

  ### Offline prediction
  print ("线下预测")
  preds_offline = gbm.predict(offline_test_X, num_iteration=gbm.best_iteration)  # probabilities
  # .copy() so the new columns are written to an independent frame, not a slice.
  offline = offline_test[['instance_id', 'is_trade']].copy()
  offline['preds'] = preds_offline
  offline['is_trade'] = offline['is_trade'].astype(np.float64)
  print('log_loss', metrics.log_loss(offline.is_trade, offline.preds))

  ### Online prediction
  print("线上预测")
  preds_online = gbm.predict(online_test_X, num_iteration=gbm.best_iteration)  # probabilities
  online = online_test[['instance_id']].copy()
  online['predicted_score'] = preds_online
  online.to_csv("./data/20180405.txt", index=False, sep=' ')  # save submission

  ### Save the model
  joblib.dump(gbm, 'lgb.pkl')

  ### Feature selection
  df = pd.DataFrame(X_train.columns.tolist(), columns=['feature'])
  df['importance'] = list(gbm.feature_importance())          # feature scores
  df = df.sort_values(by='importance', ascending=False)      # rank features
  df.to_csv("./data/feature_score_20180331.csv", index=False, encoding='gbk')  # save scores

【集成学习】lightgbm调参案例的更多相关文章

  1. xgboost&lightgbm调参指南

    本文重点阐述了xgboost和lightgbm的主要参数和调参技巧,其理论部分可见集成学习,以下内容主要来自xgboost和LightGBM的官方文档. xgboost Xgboost参数主要分为三大 ...

  2. LightGBM 调参方法(具体操作)

     sklearn实战-乳腺癌细胞数据挖掘(博主亲自录制视频) https://study.163.com/course/introduction.htm?courseId=1005269003& ...

  3. LightGBM调参笔记

    本文链接:https://blog.csdn.net/u012735708/article/details/837497031. 概述在竞赛题中,我们知道XGBoost算法非常热门,是很多的比赛的大杀 ...

  4. 自动调参库hyperopt+lightgbm 调参demo

    在此之前,调参要么网格调参,要么随机调参,要么肉眼调参.虽然调参到一定程度,进步有限,但仍然很耗精力. 自动调参库hyperopt可用tpe算法自动调参,实测强于随机调参. hyperopt 需要自己 ...

  5. lightgbm调参方法

    gridsearchcv: https://www.cnblogs.com/bjwu/p/9307344.html gridsearchcv+lightgbm cv函数调参: https://www. ...

  6. LightGBM调参总结

    1. 参数速查 使用num_leaves,因为LightGBM使用的是leaf-wise的算法,因此在调节树的复杂程度时,使用的是num_leaves而不是max_depth. 大致换算关系:num_ ...

  7. XGBoost和LightGBM的参数以及调参

    一.XGBoost参数解释 XGBoost的参数一共分为三类: 通用参数:宏观函数控制. Booster参数:控制每一步的booster(tree/regression).booster参数一般可以调 ...

  8. 使用sklearn进行集成学习——实践

    系列 <使用sklearn进行集成学习——理论> <使用sklearn进行集成学习——实践> 目录 1 Random Forest和Gradient Tree Boosting ...

  9. [转]使用sklearn进行集成学习——实践

    转:http://www.cnblogs.com/jasonfreak/p/5720137.html 目录 1 Random Forest和Gradient Tree Boosting参数详解2 如何 ...

随机推荐

  1. 爬虫框架Scrapy之详解

    Scrapy 框架 Scrapy是用纯Python实现一个为了爬取网站数据.提取结构性数据而编写的应用框架,用途非常广泛. 框架的力量,用户只需要定制开发几个模块就可以轻松的实现一个爬虫,用来抓取网页 ...

  2. JDK 中的监控与故障处理工具-02 (jps)

    jps : JVM Process Status Tool jps 命令可以列出正在运行的虚拟机进程, 并显示虚拟机执行的 main class 的名称(main函数所在的类),以及这些进程的本地虚拟 ...

  3. 关于C# get set的简单用法

    关于C# get set的文章很多,但是笔者的这篇文章有它的特别之处,笔者用简单的语言把c# get set讲述的十分明了. C# get set释一:属性的访问器包含与获取(读取或计算)或设置(写) ...

  4. checkbox及css实现点击下拉菜单

    面试遇到的问题.用checkbox中的:checked伪类选择器实现. 通过label标签来触发checkbox的unchecked 和checked两种状态:用css普通同胞选择器 ~.另外补充一点 ...

  5. spring mvc:练习:表单验证(javaConfig配置和注解)

    使用Spring表单标签, 表单验证使用 JSR303 的验证注解,hibernate-validators,提供了使用MessageSource和访问静态资源(如CSS,JavaScript,图片) ...

  6. 如何在 Ubuntu 中安装 QGit 客户端

    QGit是一款由Marco Costalba用Qt和C++写的开源的图形界面 Git 客户端.它是一款可以在图形界面环境下更好地提供浏览版本历史.查看提交记录和文件补丁的客户端.它利用git命令行来执 ...

  7. java MongoDB查询(二)复杂查询

    前言 在上篇<java MongoDB查询(一)简单查询>中我们简单了解了下查询,但是仅仅有那些查询是不够用的,还需要复杂的查询,这篇就这点进行叙述. 1.数据结构 集合:firstCol ...

  8. Codeforces Round #378 (Div. 2)F - Drivers Dissatisfaction GNU

    http://codeforces.com/contest/733/problem/F 题意:给你一些城市和一些路,每条路有不满意程度和每减少一点不满意程度的花费,给出最大花费,要求找出花费小于s的最 ...

  9. 我的 VSCode 常用扩展

    Beautify (option+shift+F) Bookmarks (option+option+k,l,j) Debugger for Chrome Docker EditorConfig fo ...

  10. LabVIEW之生产者/消费者模式

    LabVIEW之生产者/消费者设计模式 彭会锋