Xgboost_lightgbm_catboost_gridsearch_randomsearch_bayes_opt
本文主要介绍 xgboost/lightgbm/catboost 等模型在参数调优方面的工程化实现,以及它们在 stacking 模型融合中的使用,内容包括: * xgboost调优 * lightgbm调优 * catboost调优 * stacking融合 * 网格搜索、随机搜索和贝叶斯优化 * LightGBM各种操作
0 工具类函数
from __future__ import print_function
from __future__ import division
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from itertools import chain
import time
import os
def timer(func):
    """Decorator that prints how long each call to ``func`` takes.

    Args:
        func: the callable to time.

    Returns:
        A wrapper that forwards all arguments to ``func``, prints the elapsed
        wall-clock seconds and returns whatever ``func`` returned.
    """
    from functools import wraps  # local import keeps the block self-contained

    @wraps(func)  # keep the wrapped function's __name__ / __doc__
    def wrapper(*args, **kwargs):
        t1 = time.time()
        result = func(*args, **kwargs)
        t2 = time.time()
        print("花费时间:{}秒".format(t2-t1))
        # Hand the result back; without this every decorated function returns None.
        return result
    return wrapper
def time_to_date(time_stamp):
    """Format a Unix timestamp as a local-time ``YYYY-MM-DD HH:MM:SS`` string.

    Args:
        time_stamp: seconds since the epoch, as accepted by ``time.localtime``.

    Returns:
        The formatted date string.
    """
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time_stamp))
def check_path(_path):
    """Make sure the parent directory of ``_path`` exists, creating it if missing.

    Nothing happens when ``_path`` has no directory component.

    Args:
        _path: path of a file that is about to be written.
    """
    parent_dir = os.path.dirname(_path)
    if not parent_dir:
        return
    if os.path.exists(parent_dir):
        return
    os.makedirs(parent_dir)
def get_no_used_features(all_features, used_features, no_used_features_path='features/no_used_features.csv'):
    """Report and save the features that are in ``all_features`` but not used.

    Args:
        all_features: iterable of every available feature name.
        used_features: iterable of the feature names actually used.
        no_used_features_path: CSV file the unused names are written to
            (single column ``no_used``, with the default pandas index).
    """
    unused = list(set(all_features).difference(set(used_features)))
    print('n_all_features={}, n_feature_used={}'.format(len(all_features), len(used_features)))
    print('n_no_used_feature={}'.format(len(unused)))
    pd.DataFrame({'no_used': unused}).to_csv(no_used_features_path)
def get_lgb_features(lgb_model, lgb_feature_path='features/lgb_features.csv'):
    """Save the feature importances of a lightgbm model, highest score first.

    Args:
        lgb_model: object exposing ``feature_name()`` and ``feature_importance()``.
        lgb_feature_path: CSV file to write (columns ``feature`` and ``scores``).
    """
    importance_table = pd.DataFrame({
        'feature': lgb_model.feature_name(),
        'scores': lgb_model.feature_importance(),
    })
    ranked = importance_table.sort_values('scores', ascending=False)
    ranked.to_csv(lgb_feature_path, index=False)
def get_grid_params(search_params):
    """Expand a grid-search specification into every parameter combination.

    The original version read the candidate lists by position and wrote them
    under four hard-coded names, so it silently produced wrong settings when
    the dict was ordered differently or held other keys. This version pairs
    each candidate with its own key and works for any number of parameters.

    Args:
        search_params: dict mapping a parameter name to its candidate values, e.g.
            {
                'learning_rate': [0.01, 0.025, 0.05],
                'max_depth': [5, 6, 7, 8],
                'subsample': [0.6, 0.8],
                'colsample_bytree': [0.5, 0.6, 0.8]
            }

    Returns:
        list of dicts, one per combination; the last key varies fastest, e.g.
        [{'learning_rate': 0.01, 'max_depth': 5, 'subsample': 0.6,
          'colsample_bytree': 0.5}, ...]
        An empty ``search_params`` yields a single empty dict.
    """
    from itertools import product  # only ``chain`` is imported at module level

    keys = list(search_params.keys())
    candidate_lists = [search_params[key] for key in keys]
    return [dict(zip(keys, combo)) for combo in product(*candidate_lists)]
def get_grid_params_lgb(search_params):
    """Expand a lightgbm grid-search specification into every combination.

    The original version assumed the dict held exactly ``num_leaves`` then
    ``learning_rate`` in that order; any other order swapped the values. This
    version pairs each candidate with its own key and accepts any parameters.

    Args:
        search_params: dict mapping a parameter name to its candidate values, e.g.
            {
                'num_leaves': [20, 30, 40, 50],
                'learning_rate': [0.025, 0.05, 0.1, 0.15, 0.20]
            }

    Returns:
        list of dicts, one per combination; the last key varies fastest, e.g.
        [{'num_leaves': 20, 'learning_rate': 0.025}, ...]
        An empty ``search_params`` yields a single empty dict.
    """
    from itertools import product  # only ``chain`` is imported at module level

    keys = list(search_params.keys())
    candidate_lists = [search_params[key] for key in keys]
    return [dict(zip(keys, combo)) for combo in product(*candidate_lists)]
def print_confusion_matrix(y_true, y_pred):
    """Print a tab-separated confusion matrix and return it.

    Rows are the true labels, columns the predicted labels. Labels are taken
    from ``y_true`` and sorted so that the row/column order is the same on
    every run (iterating a plain ``set`` is not stable for string labels).

    Args:
        y_true: true class labels.
        y_pred: predicted class labels.

    Returns:
        The matrix returned by ``confusion_matrix`` for the sorted labels.
    """
    labels = sorted(set(y_true))
    conf_mat = confusion_matrix(y_true, y_pred, labels=labels)
    print("confusion_matrix(left labels: y_true, up labels: y_pred):")
    # Header row: the word "labels" followed by every label, tab-terminated.
    print("labels\t" + "".join(str(label) + "\t" for label in labels))
    for label, row in zip(labels, conf_mat):
        print(str(label) + "\t" + "".join(str(count) + "\t" for count in row))
    return conf_mat
def get_auc(y_true, y_pred_pos_prob, plot_ROC=False):
    """Compute the AUC of a binary classifier and optionally draw the ROC curve.

    The original line unpacked ``(y_true, y_pred_pos_prob)`` into three names,
    which always raised ``ValueError``; the intended ``roc_curve`` call (it is
    imported at the top of the file) is restored here.

    Args:
        y_true: true labels, e.g. [0, 1, 1, 1, 0].
        y_pred_pos_prob: predicted probability of the positive class per sample.
        plot_ROC: when True, plot the ROC curve with matplotlib.

    Returns:
        roc_auc: the AUC value.
        fpr, tpr, thresholds: as returned by ``roc_curve``.
    """
    fpr, tpr, thresholds = roc_curve(y_true, y_pred_pos_prob)
    roc_auc = auc(fpr, tpr)
    if plot_ROC:
        plt.plot(fpr, tpr, '-*', lw=1, label='auc=%g' % roc_auc)
        # Small margins so points on the 0/1 borders stay visible.
        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver operating characteristic example')
        plt.legend(loc="lower right")
        plt.show()
    return roc_auc, fpr, tpr, thresholds
def evaluate(y_true, y_pred):
    """Evaluate binary predictions with respect to the positive class.

    Args:
        y_true: list of true labels, e.g. [1, 0, 0, 1].
        y_pred: list of predicted labels, e.g. [1, 1, 0, 1].

    Returns:
        p: precision of the positive class, tp / (tp + fp); defined as 1.0
            when nothing was predicted positive.
        r: recall of the positive class, tp / (tp + fn).
        f1: f1 score as computed by ``f1_score``.
    """
    # NOTE(review): indexing [1, 1] presumes both classes appear in the data
    # so that the matrix is 2x2 - confirm with callers.
    matrix = confusion_matrix(y_true, y_pred)
    predicted_positive = np.sum(matrix[:, 1])
    p = 1.0 if predicted_positive == 0 else matrix[1, 1] / predicted_positive
    actual_positive = np.sum(matrix[1, :])
    r = matrix[1, 1] / actual_positive
    return p, r, f1_score(y_true, y_pred)
def feature_analyze(model, to_print=False, to_plot=False, csv_path=None):
    """Rank the features of a trained xgboost model by their f-score.

    Args:
        model: trained model exposing ``get_fscore()``.
        to_print: bool, print every ``feature,score`` line.
        to_plot: bool, draw a horizontal bar chart of the scores.
        csv_path: str, when given the ranking is written to this CSV file.

    Returns:
        list of ``(feature, score)`` tuples sorted by score, highest first.
    """
    ranking = sorted(model.get_fscore().items(), key=lambda item: item[1], reverse=True)
    if to_plot:
        names = [name for name, _ in ranking]
        values = [value for _, value in ranking]
        plt.barh(range(len(values)), values)
        plt.yticks(range(len(values)), names)
        # Write each score next to the end of its bar.
        for row, value in enumerate(values):
            plt.text(value + 0.75, row - 0.25, value)
        plt.xlabel('feature socre')
        plt.title('feature score evaluate')
        plt.grid()
        plt.show()
    csv_rows = ["{0},{1}\n".format(name, value) for name, value in ranking]
    if to_print:
        print(''.join(csv_rows))
    if csv_path is not None:
        with open(csv_path, 'w') as f:
            f.write("feature,score\n")
            f.writelines(csv_rows)
    return ranking
if __name__ == '__main__':
    # Small demo: expand an xgboost search grid, show the first combination
    # and the total number of combinations.
    search_params = dict(
        learning_rate=[0.01, 0.025, 0.05],
        max_depth=[5, 6, 7, 8],
        subsample=[0.6, 0.8],
        colsample_bytree=[0.5, 0.6, 0.8],
    )
    grid_params = get_grid_params(search_params)
    print(grid_params[0])
    print(len(grid_params))
- 行为数据、评论数据、历史数据以及用户画像数据处理
- 数据预处理–特征提取
1 xgboost网格搜索
'''
Author: ydzhao
Description: xgboost 网格搜索最佳参数/保存最佳参数/使用最佳参数预测结果
Date: 2020-11-23 10:44:16
LastEditTime: 2020-11-23 13:35:17
FilePath: \project\code\DC-hi_guides-master\gridsearch_1_xgb.py
'''
from __future__ import print_function
from __future__ import division
from data_helper import get_action_rate_before_jp
from data_helper import get_pair_rate_before_jp
from data_helper import get_triple_rate_before_jp
from data_helper import get_gender_convert_rate
from data_helper import get_feat
from data_helper import load_feat
from my_utils import feature_analyze
from my_utils import get_grid_params
from my_utils import check_path
from m1_xgb import xgb_fit
from m1_xgb import xgb_predict
import joblib
import time
from tqdm import tqdm
########### 配置文件 ############
class Config(object):
    """Settings used while searching xgboost parameters."""

    def __init__(self):
        # Where models are written: nothing per fit, only the overall best one.
        self.save_model_path = None
        self.best_model_path = 'model/best_xgb_model.dat'
        # Cross-validation / boosting schedule.
        self.max_round = 5000
        self.cv_folds = 5
        self.early_stop_round = 30
        self.seed = 3
        # Booster parameters; the search overwrites entries of this dict.
        # NOTE(review): both 'learning_rate' and 'eta' are set, with different
        # values - confirm which one the installed xgboost honours.
        self.params = dict(
            learning_rate=0.025,
            eval_metric='auc',
            n_estimators=5000,
            max_depth=5,
            min_child_weight=7,
            gamma=0,
            subsample=0.8,
            colsample_bytree=0.6,
            eta=0.05,
            silent=1,
            objective='binary:logistic',
            scale_pos_weight=1,
        )
########### 日志处理 ############
import logging.handlers

LOG_FILE = 'log/xgb_grid_search.log'
# Pass the variable, not the literal string 'LOG_FILE': the literal has no
# directory part, so the log directory was never created before the handler
# tried to open the file.
check_path(LOG_FILE)
# RotatingFileHandler (from logging.handlers) rotates the log file: here at
# 1 MiB per file, keeping 5 backups.
handler = logging.handlers.RotatingFileHandler(LOG_FILE, maxBytes=1024 * 1024, backupCount=5)
fmt = '%(asctime)s - %(filename)s:%(lineno)s - %(name)s - %(message)s'
formatter = logging.Formatter(fmt)
handler.setFormatter(formatter)
logger = logging.getLogger('search')
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)
########### 逐个参数搜索 ############
def my_search(config, search_params, model_fit, X_train, y_train):
    """Tune parameters one at a time (coordinate-wise, not a full grid search).

    Parameters are visited in the iteration order of ``search_params``, so
    that order influences the result. ``config.params`` is modified in place.

    Args:
        config: object whose ``params`` dict is passed on to ``model_fit``.
        search_params: dict mapping a parameter name to its candidate values.
        model_fit: callable ``(config, X_train, y_train)`` returning a 4-tuple
            whose first three items are used as model, auc and round.
        X_train: training features, forwarded to ``model_fit``.
        y_train: training labels, forwarded to ``model_fit``.

    Returns:
        (best_model, config, best_auc)
    """
    best_auc, best_model = 0.0, None
    for name, candidates in tqdm(search_params.items()):
        # Falls back to the first candidate when no value of this parameter
        # beats the best auc seen so far.
        chosen = candidates[0]
        for candidate in candidates:
            config.params[name] = candidate
            fitted, score, n_round, _ = model_fit(config, X_train, y_train)
            if score > best_auc:
                chosen, best_auc, best_model = candidate, score, fitted
            message = 'Best_auc={}; {}={}, auc={}, round={}'.format(best_auc, name, candidate, score, n_round)
            logger.info(message)
            print(message)
        config.params[name] = chosen
    search_message = 'Finished Search! Best auc={}. Best params are \n{}'.format(best_auc, config.params)
    logger.info(search_message)
    print(search_message)
    return best_model, config, best_auc
########### 网格搜索 ############
def my_grid_search(config, search_params, model_fit, X_train, y_train):
    """Exhaustive grid search over every combination of ``search_params``.

    ``config.params`` is updated in place for each combination and is left at
    the last combination tried; the winning values are returned separately.

    Args:
        config: object whose ``params`` dict is passed on to ``model_fit``.
        search_params: dict mapping a parameter name to its candidate values.
        model_fit: callable ``(config, X_train, y_train)`` returning a 4-tuple
            whose first three items are used as model, auc and round.
        X_train: training features, forwarded to ``model_fit``.
        y_train: training labels, forwarded to ``model_fit``.

    Returns:
        (best_model, best_params, best_auc)
    """
    best_auc, best_model, best_params = 0.0, None, None
    grid_params = get_grid_params(search_params)
    total = len(grid_params)
    message = 'Begin grid searching. Params group number is {}'.format(total)
    print(message)
    logger.info(message)
    for position, combo in enumerate(tqdm(grid_params), start=1):
        print(combo)
        logger.info('Searching {}/{}'.format(position, total))
        config.params.update(combo)
        fitted, score, n_round, _ = model_fit(config, X_train, y_train)
        # ">=" means a tie is won by the later combination.
        if score >= best_auc:
            best_params, best_auc, best_model = combo.copy(), score, fitted
        message = 'Best_auc={}; auc={}, round={}, params={}'.format(best_auc, score, n_round, combo)
        logger.info(message)
        print(message)
    search_message = 'Finished Search! Best auc={}. Best params are \n{}'.format(best_auc, best_params)
    logger.info(search_message)
    print(search_message)
    return best_model, best_params, best_auc
if __name__ == '__main__':
    # ---- Load the cached features and build the train / test matrices.
    feature_path = 'features/'
    train_data, test_data = load_feat(re_get=False, feature_path=feature_path)
    train_feats, test_feats = train_data.columns, test_data.columns
    # Keep only the columns that the test frame also has.
    drop_columns = [col for col in train_feats if col not in test_feats]
    y_train = train_data['label']
    X_train = train_data.drop(drop_columns, axis=1)
    X_test = test_data
    data_message = 'X_train.shape={}, X_test.shape={}'.format(X_train.shape, X_test.shape)
    print(data_message)
    print(X_train.shape, X_test.shape)
    logger.info(data_message)

    # ---- Grid search for the best parameters.
    t1 = time.time()
    config = Config()
    search_params = dict(
        learning_rate=[0.01, 0.025, 0.05],
        max_depth=[5, 6, 7, 8],
        subsample=[0.6, 0.8],
        colsample_bytree=[0.5, 0.6, 0.8],
    )
    logger.info(search_params)
    # Alternative: the one-parameter-at-a-time search.
    # best_model, config, best_auc = my_search(config, search_params, xgb_fit, X_train, y_train)
    best_model, best_params, best_auc = my_grid_search(config, search_params, xgb_fit, X_train, y_train)
    print('Time cost {}s'.format(time.time() - t1))

    # ---- Persist the best model (create the directory first).
    check_path(config.best_model_path)
    joblib.dump(best_model, config.best_model_path)

    # ---- Predict with the best model.
    # best_model = joblib.load(config.best_model_path)  # reload a saved model instead
    now = time.strftime("%m%d-%H%M%S")
    result_path = 'result/result_xgb_search_{}.csv'.format(now)
    check_path(result_path)
    xgb_predict(best_model, X_test, result_path)

    # ---- Feature importance report.
    feature_score_path = 'model/xgb_search_feature_score.csv'
    check_path(feature_score_path)
    feature_analyze(best_model, csv_path=feature_score_path)
2 lightgbm网格搜索
'''
Author: ydzhao
Description: lightgbm 网格搜索最佳参数/保存最佳参数/使用最佳参数预测结果
Date: 2020-11-23 10:44:16
LastEditTime: 2020-11-23 13:35:17
FilePath: \project\code\DC-hi_guides-master\gridsearch_2_lgb.py
'''
from __future__ import print_function
from __future__ import division
from data_helper import get_action_rate_before_jp
from data_helper import get_pair_rate_before_jp
from data_helper import get_triple_rate_before_jp
from data_helper import get_gender_convert_rate
from data_helper import get_feat
from data_helper import load_feat
from my_utils import feature_analyze
from my_utils import get_grid_params_lgb
from my_utils import check_path
from m2_lgb import lgb_fit
from m2_lgb import lgb_predict
import joblib
import time
from tqdm import tqdm
########### 配置文件 ############
class Config(object):
    """Settings used while searching lightgbm parameters."""

    def __init__(self):
        # Where models are written: nothing per fit, only the overall best one.
        self.save_model_path = None
        self.best_model_path = 'model/best_lgb_model.txt'
        # Cross-validation / boosting schedule.
        self.max_round = 5000
        self.cv_folds = 5
        self.early_stop_round = 30
        self.seed = 3
        # Booster parameters; the search overwrites entries of this dict.
        self.params = dict(
            objective='binary',
            metric={'auc'},
            learning_rate=0.05,
            num_leaves=30,  # author's note: 50 leaves overfit badly offline
            min_sum_hessian_in_leaf=0.1,
            feature_fraction=0.3,  # counterpart of colsample_bytree
            bagging_fraction=0.5,  # counterpart of subsample
            lambda_l1=0,
            lambda_l2=5,
        )
########### 日志处理 ############
import logging.handlers

LOG_FILE = 'log/lgb_grid_search.log'
# Pass the variable, not the literal string 'LOG_FILE': the literal has no
# directory part, so the log directory was never created before the handler
# tried to open the file.
check_path(LOG_FILE)
# RotatingFileHandler (from logging.handlers) rotates the log file: here at
# 1 MiB per file, keeping 5 backups.
handler = logging.handlers.RotatingFileHandler(LOG_FILE, maxBytes=1024 * 1024, backupCount=5)
fmt = '%(asctime)s - %(filename)s:%(lineno)s - %(name)s - %(message)s'
formatter = logging.Formatter(fmt)
handler.setFormatter(formatter)
logger = logging.getLogger('search')
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)
########### 逐个参数搜索 ############
def my_search(config, search_params, model_fit, X_train, y_train):
    """Tune parameters one at a time (coordinate-wise, not a full grid search).

    Parameters are visited in the iteration order of ``search_params``, so
    that order influences the result. ``config.params`` is modified in place.

    Args:
        config: object whose ``params`` dict is passed on to ``model_fit``.
        search_params: dict mapping a parameter name to its candidate values.
        model_fit: callable ``(config, X_train, y_train)`` returning a 4-tuple
            whose first three items are used as model, auc and round.
        X_train: training features, forwarded to ``model_fit``.
        y_train: training labels, forwarded to ``model_fit``.

    Returns:
        (best_model, config, best_auc)
    """
    best_auc, best_model = 0.0, None
    for name, candidates in search_params.items():
        # Falls back to the first candidate when no value of this parameter
        # beats the best auc seen so far.
        chosen = candidates[0]
        for candidate in candidates:
            config.params[name] = candidate
            fitted, score, n_round, _ = model_fit(config, X_train, y_train)
            if score > best_auc:
                chosen, best_auc, best_model = candidate, score, fitted
            message = 'Best_auc={}; {}={}, auc={}, round={}'.format(best_auc, name, candidate, score, n_round)
            logger.info(message)
            print(message)
        config.params[name] = chosen
    search_message = 'Finished Search! Best auc={}. Best params are \n{}'.format(best_auc, config.params)
    logger.info(search_message)
    print(search_message)
    return best_model, config, best_auc
########### 网格搜索 ############
def my_grid_search(config, search_params, model_fit, X_train, y_train):
    """Exhaustive grid search over every combination of ``search_params``.

    ``config.params`` is updated in place for each combination and is left at
    the last combination tried; the winning values are returned separately.

    Args:
        config: object whose ``params`` dict is passed on to ``model_fit``.
        search_params: dict mapping a parameter name to its candidate values.
        model_fit: callable ``(config, X_train, y_train)`` returning a 4-tuple
            whose first three items are used as model, auc and round.
        X_train: training features, forwarded to ``model_fit``.
        y_train: training labels, forwarded to ``model_fit``.

    Returns:
        (best_model, best_params, best_auc)
    """
    best_auc, best_model, best_params = 0.0, None, None
    grid_params = get_grid_params_lgb(search_params)
    total = len(grid_params)
    message = 'Begin grid searching. Params group number is {}'.format(total)
    print(message)
    logger.info(message)
    for position, combo in enumerate(tqdm(grid_params), start=1):
        logger.info('Searching {}/{}'.format(position, total))
        config.params.update(combo)
        fitted, score, n_round, _ = model_fit(config, X_train, y_train)
        # ">=" means a tie is won by the later combination.
        if score >= best_auc:
            best_params, best_auc, best_model = combo.copy(), score, fitted
        message = 'Best_auc={}; auc={}, round={}, params={}'.format(best_auc, score, n_round, combo)
        logger.info(message)
        print(message)
    search_message = 'Finished Search! Best auc={}. Best params are \n{}'.format(best_auc, best_params)
    logger.info(search_message)
    print(search_message)
    return best_model, best_params, best_auc
if __name__ == '__main__':
    # ---- Load the cached features and build the train / test matrices.
    feature_path = 'features/'
    train_data, test_data = load_feat(re_get=False, feature_path=feature_path)
    train_feats, test_feats = train_data.columns, test_data.columns
    # Keep only the columns that the test frame also has.
    drop_columns = [col for col in train_feats if col not in test_feats]
    y_train = train_data['label']
    # Only the first 50 columns are used on both sides.
    X_train = train_data.drop(drop_columns, axis=1).iloc[:, :50]
    X_test = test_data.iloc[:, :50]
    data_message = 'X_train.shape={}, X_test.shape={}'.format(X_train.shape, X_test.shape)
    print(data_message)
    logger.info(data_message)

    # ---- Grid search for the best parameters.
    t1 = time.time()
    config = Config()
    search_params = dict(
        num_leaves=[20, 30, 40, 50],
        learning_rate=[0.025, 0.05, 0.1, 0.15, 0.20],
    )
    logger.info(search_params)
    # Alternative: the one-parameter-at-a-time search.
    # best_model, config, best_auc = my_search(config, search_params, lgb_fit, X_train, y_train)
    best_model, best_params, best_auc = my_grid_search(config, search_params, lgb_fit, X_train, y_train)
    print('Time cost {}s'.format(time.time() - t1))

    # ---- Persist the best model (create the directory first).
    check_path(config.best_model_path)
    best_model.save_model(config.best_model_path)

    # ---- Predict with the best model.
    # best_model = joblib.load(config.best_model_path)  # reload a saved model instead
    now = time.strftime("%m%d-%H%M%S")
    result_path = 'result/result_lgb_search_{}.csv'.format(now)
    check_path(result_path)
    lgb_predict(best_model, X_test, result_path)
3 xgboost模型训练,获得特征分数表
'''
Author: ydzhao
Description: xgboost模型训练,获得特征分数表
Date: 2020-11-23 10:44:16
LastEditTime: 2020-11-23 13:35:17
FilePath: \project\code\DC-hi_guides-master\m1_xgb.py
'''
from __future__ import print_function
from __future__ import division
from data_helper import get_action_rate_before_jp
from data_helper import get_pair_rate_before_jp
from data_helper import get_triple_rate_before_jp
from data_helper import get_gender_convert_rate
from data_helper import get_feat
from data_helper import load_feat
from my_utils import feature_analyze
from my_utils import check_path
import xgboost as xgb
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
import time
import logging.handlers
# File logging for xgboost training: a rotating log capped at 1 MiB with one backup.
LOG_FILE = 'log/xgb_train.log'
check_path(LOG_FILE)  # the log directory must exist before the handler opens the file
fmt = '%(asctime)s - %(filename)s:%(lineno)s - %(name)s - %(message)s'
formatter = logging.Formatter(fmt)
handler = logging.handlers.RotatingFileHandler(LOG_FILE, maxBytes=1024 * 1024, backupCount=1)
handler.setFormatter(formatter)
logger = logging.getLogger('train')
logger.setLevel(logging.DEBUG)
logger.addHandler(handler)
class Config(object):
    """Settings for plain xgboost training."""

    def __init__(self):
        self.save_model_path = 'model/xgb.dat'
        # Cross-validation / boosting schedule.
        self.max_round = 3000
        self.cv_folds = 10
        self.early_stop_round = 50
        self.seed = 3
        # Booster parameters. An 'nthread' entry was left disabled by the author.
        self.params = dict(
            learning_rate=0.05,
            eval_metric='auc',
            n_estimators=5000,
            max_depth=6,
            min_child_weight=7,
            gamma=0,
            subsample=0.8,
            colsample_bytree=0.6,
            # Same meaning as learning_rate (shrinkage): leaf values are scaled
            # by this factor after each round to weaken every single tree.
            eta=0.05,
            silent=1,
            objective='binary:logistic',
            scale_pos_weight=1,
        )
def xgb_fit(config, X_train, y_train):
    """Train an xgboost model, by cross-validation or with a hold-out split.

    Args:
        config: object providing params, max_round, cv_folds,
            early_stop_round, seed and save_model_path.
        X_train: array like, shape = n_sample * n_feature.
        y_train: shape = n_sample * 1.

    Returns:
        best_model: the trained booster.
        best_auc: last ``test-auc-mean`` of the cv table, or the booster's
            ``best_score`` in hold-out mode.
        best_round: number of rows of the cv table, or the booster's
            ``best_iteration`` in hold-out mode.
        cv_result: the table returned by ``xgb.cv``; None in hold-out mode.
    """
    params, max_round = config.params, config.max_round
    cv_folds, early_stop_round = config.cv_folds, config.early_stop_round
    seed, save_model_path = config.seed, config.save_model_path

    if cv_folds is None:
        # Hold-out mode: 80/20 split with early stopping on the watchlist.
        X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=100)
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dvalid = xgb.DMatrix(X_valid, label=y_valid)
        best_model = xgb.train(params, dtrain, max_round,
                               evals=[(dtrain, 'train'), (dvalid, 'valid')],
                               early_stopping_rounds=early_stop_round)
        best_round, best_auc = best_model.best_iteration, best_model.best_score
        cv_result = None
    else:
        # CV mode: find the round count first, then refit on all the data.
        dtrain = xgb.DMatrix(X_train, label=y_train)
        cv_result = xgb.cv(params, dtrain, max_round, nfold=cv_folds, seed=seed, verbose_eval=True,
                           metrics='auc', early_stopping_rounds=early_stop_round, show_stdv=False)
        best_round = cv_result.shape[0]
        best_auc = cv_result['test-auc-mean'].values[-1]
        best_model = xgb.train(params, dtrain, best_round)

    if save_model_path:
        check_path(save_model_path)
        joblib.dump(best_model, save_model_path)
    return best_model, best_auc, best_round, cv_result
def xgb_predict(model, X_test, save_result_path=None):
    """Predict with an xgboost booster and optionally save the result.

    The test columns are first re-ordered to ``model.feature_names``. This
    avoids ``ValueError: feature_names mismatch``, which (per the author's
    note) occurs when the column names match the training data but their
    order does not.

    Args:
        model: trained booster exposing ``feature_names`` and ``predict``.
        X_test: DataFrame containing at least the model's feature columns.
        save_result_path: when given, the features plus an ``orderType``
            column holding the predictions are written to this CSV file.

    Returns:
        The array returned by ``model.predict``.
    """
    X_test = X_test[model.feature_names]
    dtest = xgb.DMatrix(X_test)
    y_pred_prob = model.predict(dtest)
    print('y_pred_prob length:',len(y_pred_prob))
    if save_result_path:
        # Work on an explicit copy: adding a column to the column-sliced frame
        # is a chained assignment on a slice of the caller's data.
        df_result = X_test.copy()
        df_result['orderType'] = y_pred_prob
        df_result.to_csv(save_result_path, index=False)
        print('Save the result to {}'.format(save_result_path))
    return y_pred_prob
def run_cv(X_train, X_test, y_train):
    """Predict with the saved xgboost model and export its feature scores.

    The training step (``xgb_fit`` followed by ``joblib.dump``, with timing and
    logging) is disabled: the model is loaded from ``config.save_model_path``
    and the AUC in the result file name is a hard-coded value from an earlier
    run. ``X_train`` and ``y_train`` are therefore unused here.

    Args:
        X_train: training features (unused while training is disabled).
        X_test: test features to predict on.
        y_train: training labels (unused while training is disabled).
    """
    config = Config()

    # predict
    timestamp = time.strftime("%m%d-%H%M%S")
    # With training enabled the second value would be best_auc.
    result_path = 'result/result_xgb_{}-{:.4f}.csv'.format(timestamp, 0.9705906000000001)
    check_path(result_path)
    xgb_model = joblib.load(config.save_model_path)
    xgb_predict(model=xgb_model, X_test=X_test, save_result_path=result_path)

    # feature analyze
    feature_score_path = 'features/xgb_feature_score.csv'
    check_path(feature_score_path)
    feature_analyze(model=xgb_model, to_print=False, to_plot=False, csv_path=feature_score_path)
if __name__ == '__main__':
    # Execution order:
    # 1. run run_cv() first, which produces a feature_score file;
    # 2. then run get_no_used_features.py to build the feature ranking table.

    # get feature
    feature_path = 'features/'
    train_data, test_data = load_feat(re_get=False, feature_path=feature_path)
    train_feats, test_feats = train_data.columns.values, test_data.columns.values
    # Keep only the columns that the test frame also has.
    drop_columns = [col for col in train_feats if col not in test_feats]
    y_train = train_data['label']
    X_train = train_data.drop(drop_columns, axis=1)
    X_test = test_data
    data_message = 'X_train.shape={}, X_test.shape={}'.format(X_train.shape, X_test.shape)
    print(data_message)
    logger.info(data_message)

    # train / predict directly
    run_cv(X_train, X_test, y_train)
4 lightgbm模型训练,获得特征分数表
'''
Author: ydzhao
Description: lightgbm模型训练,获得特征分数表
Date: 2020-11-23 10:44:16
LastEditTime: 2020-11-23 13:35:17
FilePath: \project\code\DC-hi_guides-master\m2_lgb.py
'''
from __future__ import print_function
from __future__ import division
from data_helper import get_action_rate_before_jp
from data_helper import get_pair_rate_before_jp
from data_helper import get_triple_rate_before_jp
from data_helper import get_gender_convert_rate
from data_helper import get_feat
from data_helper import load_feat
from my_utils import feature_analyze
from my_utils import check_path
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import time
import joblib
from tqdm import tqdm
import logging.handlers
# File logging for lightgbm training: a rotating log capped at 1 MiB with one backup.
LOG_FILE = 'log/lgb_train.log'
check_path(LOG_FILE)  # the log directory must exist before the handler opens the file
fmt = '%(asctime)s - %(filename)s:%(lineno)s - %(name)s - %(message)s'
formatter = logging.Formatter(fmt)
handler = logging.handlers.RotatingFileHandler(LOG_FILE, maxBytes=1024 * 1024, backupCount=1)
handler.setFormatter(formatter)
logger = logging.getLogger('train')
logger.setLevel(logging.DEBUG)
logger.addHandler(handler)
class Config(object):
    """Settings for plain lightgbm training."""

    def __init__(self):
        self.save_model_path = 'model/lgb.dat'
        # Cross-validation / boosting schedule.
        self.max_round = 3000
        self.cv_folds = 5
        self.early_stop_round = 30
        self.seed = 3
        self.params = dict(
            objective='binary',
            metric={'auc'},
            learning_rate=0.05,
            num_leaves=30,  # author's note: 50 leaves overfit badly offline
            min_sum_hessian_in_leaf=0.1,
            feature_fraction=0.3,  # counterpart of colsample_bytree
            bagging_fraction=0.5,  # counterpart of subsample
            lambda_l1=0,
            lambda_l2=5,
            # Author's note: set to the number of physical cores; a 12-thread
            # machine usually has 6 of them.
            num_thread=6,
        )
def lgb_fit(config, X_train, y_train):
    """Train a lightgbm model, by cross-validation or with a hold-out split.

    Args:
        config: object providing params, max_round, cv_folds,
            early_stop_round, seed and save_model_path.
        X_train: array like, shape = n_sample * n_feature.
        y_train: shape = n_sample * 1.

    Returns:
        best_model: the trained booster.
        best_auc: last ``auc-mean`` of the cv result, or the booster's
            ``best_score`` in hold-out mode.
        best_round: length of ``auc-mean``, or the booster's
            ``best_iteration`` in hold-out mode.
        cv_result: the value returned by ``lgb.cv``; None in hold-out mode.
    """
    params, max_round = config.params, config.max_round
    cv_folds, early_stop_round = config.cv_folds, config.early_stop_round
    seed, save_model_path = config.seed, config.save_model_path

    if cv_folds is None:
        # Hold-out mode: 80/20 split with early stopping on the validation sets.
        X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=100)
        dtrain = lgb.Dataset(X_train, label=y_train)
        dvalid = lgb.Dataset(X_valid, label=y_valid)
        best_model = lgb.train(
            params,
            dtrain,
            max_round,
            valid_sets=[dtrain, dvalid],
            early_stopping_rounds=early_stop_round
        )
        best_round = best_model.best_iteration
        # NOTE(review): best_score may not be a plain float here, unlike the
        # value returned in cv mode - confirm against the lightgbm version used.
        best_auc = best_model.best_score
        cv_result = None
    else:
        # CV mode: find the round count first, then refit on all the data.
        dtrain = lgb.Dataset(X_train, label=y_train)
        cv_result = lgb.cv(
            params,
            dtrain,
            max_round,
            nfold=cv_folds,
            seed=seed,
            verbose_eval=True,
            metrics='auc',
            early_stopping_rounds=early_stop_round,
            show_stdv=False
        )
        auc_history = cv_result['auc-mean']
        best_round = len(auc_history)
        best_auc = auc_history[-1]
        best_model = lgb.train(params, dtrain, best_round)

    if save_model_path:
        check_path(save_model_path)
        best_model.save_model(save_model_path)
    return best_model, best_auc, best_round, cv_result
def lgb_predict(model, X_test, save_result_path=None):
    """Predict with a lightgbm model and optionally save the result.

    Args:
        model: trained model exposing ``predict``.
        X_test: DataFrame of test features; it is left unmodified.
        save_result_path: when given, the features plus an ``orderType``
            column holding the predictions are written to this CSV file.

    Returns:
        The array returned by ``model.predict``.
    """
    y_pred_prob = model.predict(X_test)
    if save_result_path:
        # Copy first: writing the column into X_test itself would hand the
        # caller a frame with an extra 'orderType' feature column.
        df_result = X_test.copy()
        df_result['orderType'] = y_pred_prob
        df_result.to_csv(save_result_path, index=False)
        print('Save the result to {}'.format(save_result_path))
    return y_pred_prob
def run_feat_search(X_train, X_test, y_train, feature_names):
"""Feature search: drop trailing entries of ``feature_names`` and retrain.

For i = 1, 4, 7, ... (below 250) the last ``i`` names of ``feature_names``
are dropped from both frames, a model is fitted with ``lgb_fit`` and its
predictions and auc are appended to ``y_pred_list`` / ``aucs``, which are
saved with ``np.savez``.

NOTE(review): the original docstring says features are removed according to
their importance and that the per-iteration predictions are averaged. This
function only collects and saves the predictions; ``feature_names`` is
presumably ordered from most to least important - confirm with the caller.
NOTE(review): the indentation of this listing was lost, so it cannot be told
from here whether the ``.npz`` save runs on every iteration or once after
the loop.
"""
config = Config()
# train model
t1 = time.time()
y_pred_list = list()
aucs = list()
for i in tqdm(range(1, 250, 3)):
# Drop the last i feature names from both the train and the test frame.
drop_cols = feature_names[-i:]
X_train_ = X_train.drop(drop_cols, axis=1)
X_test_ = X_test.drop(drop_cols, axis=1)
data_message = 'X_train.shape={}, X_test.shape={}'.format(X_train_.shape, X_test_.shape)
print(data_message)
logger.info(data_message)
lgb_model, best_auc, best_round, cv_result = lgb_fit(config, X_train_, y_train)
# Elapsed time is measured from the start of the whole search (t1).
t2 = time.time()
print('Time cost {}s'.format(t2 - t1))
result_message = 'best_round={}, best_auc={}'.format(best_round, best_auc)
logger.info(result_message)
print(result_message)
# predict
# lgb_model = lgb.Booster(model_file=config.save_model_path)
now = time.strftime("%m%d-%H%M%S")
result_path = 'result/result_lgb_{}-{:.4f}.csv'.format(now, best_auc)
check_path(result_path)
y_pred = lgb_predict(lgb_model, X_test_, result_path)
y_pred_list.append(y_pred)
aucs.append(best_auc)
# Save everything collected so far; the file name carries the current i.
y_preds_path = 'stack_preds/lgb_feat_search_pred_{}.npz'.format(i)
check_path(y_preds_path)
np.savez(y_preds_path, y_pred_list=y_pred_list, aucs=aucs)
message = 'Saved y_preds to {}. Best auc is {}'.format(y_preds_path, np.max(aucs))
logger.info(message)
print(message)
def run_cv(X_train, X_test, y_train):
    """Fit one lightgbm model with the default Config and save its predictions.

    Args:
        X_train: training features.
        X_test: test features to predict on.
        y_train: training labels.
    """
    config = Config()

    # train model
    started = time.time()
    data_message = 'X_train.shape={}, X_test.shape={}'.format(X_train.shape, X_test.shape)
    print(data_message)
    logger.info(data_message)
    lgb_model, best_auc, best_round, cv_result = lgb_fit(config, X_train, y_train)
    finished = time.time()
    print('Time cost {}s'.format(finished - started))
    result_message = 'best_round={}, best_auc={}'.format(best_round, best_auc)
    logger.info(result_message)
    print(result_message)

    # predict
    # lgb_model = lgb.Booster(model_file=config.save_model_path)  # reload a saved model instead
    timestamp = time.strftime("%m%d-%H%M%S")
    result_path = 'result/result_lgb_{}-{:.4f}.csv'.format(timestamp, best_auc)
    check_path(result_path)
    lgb_predict(lgb_model, X_test, result_path)
if __name__ == '__main__':
    # Execution order:
    # 1. run run_cv() first, which produces a feature_score file;
    # 2. run get_no_used_features.py to build the feature ranking table;
    # 3. run run_feat_search to search the features and save the predictions.

    # get feature
    feature_path = 'features/'
    # re_get=False: per the author, the data was already preprocessed in the xgboost part.
    train_data, test_data = load_feat(re_get=False, feature_path=feature_path)
    train_feats = train_data.columns.values
    test_feats = test_data.columns.values
    # Keep only the columns that the test frame also has.
    drop_columns = list(filter(lambda x: x not in test_feats, train_feats))
    X_train = train_data.drop(drop_columns, axis=1)
    y_train = train_data['label']
    X_test = test_data
    data_message = 'X_train.shape={}, X_test.shape={}'.format(X_train.shape, X_test.shape)
    print(data_message)
    logger.info(data_message)

    # Optional: drop some features based on the best feature-search result.
    # n_drop_col = 141
    # drop_cols = feature_names[-n_drop_col:]
    # X_train = X_train.drop(drop_cols, axis=1)
    # X_test = X_test.drop(drop_cols, axis=1)

    # Optional: train directly.
    # run_cv(X_train, X_test, y_train)

    # Feature search: needs the feature score table.
    try:
        df_lgb_feat_score = pd.read_csv('features/lgb_features.csv')
        feature_names = df_lgb_feat_score.feature.values
    except Exception:
        print('You should run the get_no_used_features.py first.')
        # Re-raise: continuing would only fail later with a NameError on
        # feature_names and hide the real cause.
        raise
    run_feat_search(X_train, X_test, y_train, feature_names)
5 catboost模型训练,获得特征分数表
'''
Author: ydzhao
Description: catboost 模型训练,获得特征分数表
Date: 2020-11-23 10:44:16
LastEditTime: 2020-11-23 13:35:17
FilePath: \project\code\DC-hi_guides-master\m2_cgb.py
'''
from __future__ import print_function
from __future__ import division
from data_helper import get_action_rate_before_jp
from data_helper import get_pair_rate_before_jp
from data_helper import get_triple_rate_before_jp
from data_helper import get_gender_convert_rate
from data_helper import get_feat
from data_helper import load_feat
from my_utils import feature_analyze
from my_utils import check_path
import catboost as cgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import time
import pickle
import logging.handlers
# File logging for catboost training: a rotating log capped at 1 MiB with one backup.
LOG_FILE = 'log/cgb_train.log'
check_path(LOG_FILE)  # the log directory must exist before the handler opens the file
fmt = '%(asctime)s - %(filename)s:%(lineno)s - %(name)s - %(message)s'
formatter = logging.Formatter(fmt)
handler = logging.handlers.RotatingFileHandler(LOG_FILE, maxBytes=1024 * 1024, backupCount=1)
handler.setFormatter(formatter)
logger = logging.getLogger('train')
logger.setLevel(logging.DEBUG)
logger.addHandler(handler)
class Config(object):
    """Settings for catboost training."""

    def __init__(self):
        self.params = dict(
            learning_rate=0.05,
            eval_metric='AUC',
            depth=8,
            logging_level='Info',
            loss_function='Logloss',
            train_dir='model/cgb_record/',
            thread_count=6,
        )
        # Create the training record directory up front.
        check_path(self.params['train_dir'])
        self.max_round = 1000
        self.cv_folds = 5
        self.seed = 3
        self.save_model_path = 'model/cgb.model'
def cgb_fit(config, X_train, y_train):
    """Train a catboost model, by cross-validation or with a hold-out split.

    Fixes over the original:
      * the hold-out branch passed ``(params, dtrain)`` to ``cgb.train`` while
        the cv branch passed ``(dtrain, params)``; both now pass the pool first;
      * ``np.argmax`` gives the zero-based position of the best cv round, so
        the number of rounds to train is that position plus one;
      * the pickle file is closed deterministically.

    Args:
        config: object providing params, max_round, cv_folds, seed and
            save_model_path.
        X_train: array like, shape = n_sample * n_feature.
        y_train: shape = n_sample * 1.

    Returns:
        best_model: the trained model.
        best_auc: best ``test-AUC-mean`` of the cv table, or the model's
            ``best_score`` in hold-out mode.
        best_round: number of boosting rounds of the best cv result, or the
            model's ``best_iteration`` in hold-out mode.
        cv_result: the table returned by ``cgb.cv``; None in hold-out mode.
    """
    params = config.params
    max_round = config.max_round
    cv_folds = config.cv_folds
    seed = config.seed
    save_model_path = config.save_model_path
    if cv_folds is not None:
        dtrain = cgb.Pool(X_train, label=y_train)
        cv_result = cgb.cv(dtrain, params, num_boost_round=max_round, nfold=cv_folds, seed=seed, logging_level='Verbose')
        print(type(cv_result))
        print(cv_result.columns)
        # Best model / best number of rounds.
        auc_test_avg = cv_result['test-AUC-mean']
        # Position of the best round is zero-based; the round count is one more.
        best_round = int(np.argmax(np.asarray(auc_test_avg))) + 1
        best_auc = np.max(auc_test_avg)
        best_model = cgb.train(dtrain, params, num_boost_round=best_round)
    else:
        X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=100)
        dtrain = cgb.Pool(X_train, label=y_train)
        dvalid = cgb.Pool(X_valid, label=y_valid)
        best_model = cgb.train(dtrain, params, num_boost_round=max_round, eval_set=dvalid)
        # NOTE(review): confirm that the installed catboost exposes these
        # attribute names (it may use best_iteration_ / best_score_ or
        # get_best_iteration() / get_best_score() instead).
        best_round = best_model.best_iteration
        best_auc = best_model.best_score
        cv_result = None
    if save_model_path:
        check_path(save_model_path)
        # Persist the trained model so later runs can load it instead of retraining.
        with open(save_model_path, 'wb') as model_file:
            pickle.dump(best_model, model_file)
    return best_model, best_auc, best_round, cv_result
def cgb_predict(model, X_test, save_result_path=None):
    """Predict positive-class probabilities and optionally save them as CSV.

    Args:
        model: object whose ``predict(X, prediction_type='Probability')``
            returns a 2-D array with one column per class.
        X_test: features to score. Must be a pandas DataFrame when
            ``save_result_path`` is given.
        save_result_path: optional CSV path. The file holds the columns of
            ``X_test`` plus an ``orderType`` column with the probabilities.

    Returns:
        1-D array with the probability of class 1 for every row.
    """
    y_pred_prob = model.predict(X_test, prediction_type='Probability')
    y_pred_prob = y_pred_prob[:, 1]  # get the probability of class 1
    if save_result_path:
        # Work on a copy so the caller's feature table does not silently gain
        # an 'orderType' column.
        df_result = X_test.copy()
        df_result['orderType'] = y_pred_prob
        df_result.to_csv(save_result_path, index=False)
        print('Save the result to {}'.format(save_result_path))
    return y_pred_prob
if __name__ == '__main__':
    # Load the cached feature tables.
    feature_path = 'features/'
    train_data, test_data = load_feat(re_get=False, feature_path=feature_path)
    train_feats = train_data.columns.values
    test_feats = test_data.columns.values
    # Columns present only in the training table are removed from the
    # training features.
    drop_columns = list(filter(lambda x: x not in test_feats, train_feats))
    X_train = train_data.drop(drop_columns, axis=1)
    y_train = train_data['label']
    X_test = test_data
    data_message = 'X_train.shape={}, X_test.shape={}'.format(X_train.shape, X_test.shape)
    print(data_message)
    logger.info(data_message)
    config = Config()
    # train model (disabled: the pickled model is loaded below instead)
    # t1 = time.time()
    # cgb_model, best_auc, best_round, cv_result = cgb_fit(config, X_train, y_train)
    # t2 = time.time()
    # print('Time cost {}s'.format(t2 - t1))
    # result_message = 'best_round={}, best_auc={}'.format(best_round, best_auc)
    # logger.info(result_message)
    # print(result_message)
    # predict
    # NOTE: unpickling executes arbitrary code; only load model files that
    # this project produced itself.
    with open(config.save_model_path, 'rb') as model_file:
        cgb_model = pickle.load(model_file)
    now = time.strftime("%m%d-%H%M%S")
    result_path = 'result/result_cgb_{}.csv'.format(now)
    check_path(result_path)
    cgb_predict(cgb_model, X_test, result_path)
6 stacking模型融合
1 XGB参数调优
XGBoost的参数分为三种:
通用参数:(两种类型的booster,因为tree的性能比线性回归好得多,因此我们很少用线性回归。)
- booster:使用哪个弱学习器训练,默认gbtree,可选gbtree,gblinear 或dart
- nthread:用于运行XGBoost的并行线程数,默认为最大可用线程数
verbosity:打印消息的详细程度。有效值为0(静默),1(警告),2(信息),3(调试)。
Tree Booster的参数:
- eta(learning_rate):learning_rate,在更新中使用步长收缩以防止过度拟合,默认= 0.3,范围:[0,1];典型值一般设置为:0.01-0.2
- gamma(min_split_loss):默认= 0,分裂节点时,损失函数减小值只有大于等于gamma节点才分裂,gamma值越大,算法越保守,越不容易过拟合,但性能就不一定能保证,需要平衡。范围:[0,∞]
- max_depth:默认= 6,一棵树的最大深度。增加此值将使模型更复杂,并且更可能过度拟合。范围:[0,∞]
- min_child_weight:默认值= 1,如果新分裂的节点的样本权重和小于min_child_weight则停止分裂 。这个可以用来减少过拟合,但是也不能太高,会导致欠拟合。范围:[0,∞]
- max_delta_step:默认= 0,允许每个叶子输出的最大增量步长。如果将该值设置为0,则表示没有约束。如果将其设置为正值,则可以帮助使更新步骤更加保守。通常不需要此参数,但是当类极度不平衡时,它可能有助于逻辑回归。将其设置为1-10的值可能有助于控制更新。范围:[0,∞]
- subsample:默认值= 1,构建每棵树对样本的采样率,如果设置成0.5,XGBoost会随机选择一半的样本作为训练集。范围:(0,1]
- **sampling_method**:默认= uniform,用于对训练实例进行采样的方法。
- uniform:每个训练实例的选择概率均等。通常将subsample> = 0.5 设置 为良好的效果。
- gradient_based:每个训练实例的选择概率与规则化的梯度绝对值成正比,具体来说就是$$\sqrt{g^2+\lambda h^2}$$,subsample可以设置为低至0.1,而不会损失模型精度。
colsample_bytree:默认= 1,列采样率,也就是特征采样率。范围为(0,1]
lambda(reg_lambda):默认=1,L2正则化权重项。增加此值将使模型更加保守。
alpha(reg_alpha):默认= 0,权重的L1正则化项。增加此值将使模型更加保守。
tree_method:默认=auto,XGBoost中使用的树构建算法。
- auto:使用启发式选择最快的方法。
- 对于小型数据集,将使用精确贪婪算法(exact)。
- 对于较大的数据集,将选择近似算法(approx)。数据量很大时,建议尝试hist和gpu_hist以获得更高的性能;gpu_hist支持外部存储器(external memory)。
- exact:精确的贪婪算法。枚举所有拆分的候选点。
- approx:使用分位数和梯度直方图的近似贪婪算法。
- hist:更快的直方图优化的近似贪婪算法。(LightGBM也是使用直方图算法)
- gpu_hist:GPU hist算法的实现。
scale_pos_weight:控制正负权重的平衡,这对于不平衡的类别很有用。Kaggle竞赛一般设置sum(negative instances) / sum(positive instances),在类别高度不平衡的情况下,将参数设置大于0,可以加快收敛。
num_parallel_tree:默认=1,每次迭代期间构造的并行树的数量。此选项用于支持增强型随机森林。
monotone_constraints:可变单调性的约束,在某些情况下,如果有非常强烈的先验信念认为真实的关系具有一定的质量,则可以使用约束条件来提高模型的预测性能。(例如params_constrained[‘monotone_constraints’] = “(1,-1)“,(1,-1)我们告诉XGBoost对第一个预测变量施加增加的约束,对第二个预测变量施加减小的约束。)
任务参数(这个参数用来控制理想的优化目标和每一步结果的度量方法。)
objective:默认=reg:squarederror,表示最小平方误差。
- reg:squarederror,最小平方误差。
- reg:squaredlogerror,对数平方损失。
- reg:logistic,逻辑回归
- reg:pseudohubererror,使用伪Huber损失进行回归,它是绝对损失的一个二阶可微的替代。
- binary:logistic,二元分类的逻辑回归,输出概率。
- binary:logitraw:用于二进制分类的逻辑回归,逻辑转换之前的输出得分。
- binary:hinge:二进制分类的铰链损失。这使预测为0或1,而不是产生概率。(SVM就是铰链损失函数)
- count:poisson –计数数据的泊松回归,泊松分布的输出平均值。
- survival:cox:针对右删失(right censored)的生存时间数据进行Cox回归(负值被视为右删失)。
- survival:aft:用于删失生存时间数据的加速失效时间(Accelerated Failure Time)模型。
- aft_loss_distribution:survival:aft和aft-nloglik度量标准使用的概率密度函数。
- multi:softmax:设置XGBoost以使用softmax目标进行多类分类,还需要设置num_class(类数)
- multi:softprob:与softmax相同,但输出向量,可以进一步重整为矩阵。结果包含属于每个类别的每个数据点的预测概率。
- rank:pairwise:使用LambdaMART进行成对排名,从而使成对损失最小化。
- rank:ndcg:使用LambdaMART进行列表式排名,使标准化折让累积收益(NDCG)最大化。
- rank:map:使用LambdaMART进行列表平均排名,使平均平均精度(MAP)最大化。
- reg:gamma:使用对数链接进行伽马回归。输出是伽马分布的平均值。
- reg:tweedie:使用对数链接进行Tweedie回归。
- 自定义损失函数和评价指标:https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html
eval_metric:验证数据的评估指标,将根据目标分配默认指标(回归均方根,分类误差,排名的平均平均精度),用户可以添加多个评估指标
- rmse,均方根误差;
- rmsle:均方根对数误差;
mae:平均绝对误差;
mphe:平均伪Huber错误;
logloss:负对数似然;
error:二进制分类错误率;
merror:多类分类错误率;
mlogloss:多类logloss;
auc:曲线下面积;
aucpr:PR曲线下的面积;
ndcg:归一化累计折扣;
map:平均精度;
mape: Mean Absolute Percentage Error平均绝对百分比误差
- rmse,均方根误差;
- seed :随机数种子,[默认= 0]。
XGB参数调优的一般步骤 1. 确定学习速率和提升参数调优的初始值 2. max_depth 和 min_child_weight 参数调优 3. gamma参数调优 4. subsample 和 colsample_bytree 参数调优 5. 正则化参数alpha调优 6. 降低学习速率和使用更多的决策树
# Regression example: native XGBoost parameter names passed to the sklearn
# wrapper.
params = {
    'booster': 'gbtree',
    'objective': 'reg:squarederror',
    'gamma': 0.1,
    'max_depth': 5,
    'lambda': 3,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'min_child_weight': 3,
    # 'silent' is deprecated; 'verbosity' replaces it (0 = silent).
    'verbosity': 0,
    'eta': 0.1,
    'seed': 1000,
    'nthread': 4,
}
model = xgb.XGBRegressor(**params)
# Classification example: three-class softmax.
params = {
    'booster': 'gbtree',
    'objective': 'multi:softmax',
    'num_class': 3,
    'gamma': 0.1,
    'max_depth': 6,
    'lambda': 2,
    'subsample': 0.7,
    'colsample_bytree': 0.75,
    'min_child_weight': 3,
    # 'silent': 0 meant "not silent"; verbosity 1 prints warnings.
    'verbosity': 1,
    'eta': 0.1,
    'seed': 1,
    'nthread': 4,
}
model = xgb.XGBClassifier(**params)
2 LGB参数调优
LightGBM也是像XGBoost一样,是一类集成算法,他跟XGBoost总体来说是一样的,算法本质上与Xgboost没有出入,只是在XGBoost的基础上进行了优化,因此就不对原理进行重复介绍,在这里我们来看看几种算法的差别:
优化速度和内存使用
- 降低了计算每个分割增益的成本。
- 使用直方图减法进一步提高速度。
- 减少内存使用。
- 减少并行学习的计算成本。
稀疏优化
- 用离散的bin替换连续的值。如果#bins较小,则可以使用较小的数据类型(例如uint8_t)来存储训练数据
- 无需存储其他信息即可对特征数值进行预排序 。
精度优化
- 使用叶子数为导向的决策树建立算法而不是树的深度导向。
- 分类特征的编码方式的优化
- 通信网络的优化
- 并行学习的优化
- GPU支持
1.核心参数:(括号内名称是别名)
objective(objective,app ,application):默认regression,用于设置损失函数
回归问题:
- L2损失:regression(regression_l2,l2,mean_squared_error,mse,l2_root,root_mean_squared_error,rmse)
- L1损失:regression_l1(l1, mean_absolute_error, mae)
- 其他损失:huber,fair,poisson,quantile,mape,gamma,tweedie
二分类问题:二进制对数损失分类(或逻辑回归):binary
多类别分类:
- softmax目标函数: multiclass(softmax)
- One-vs-All 目标函数:multiclassova(multiclass_ova,ova,ovr)
交叉熵:
- 用于交叉熵的目标函数(具有可选的线性权重):cross_entropy(xentropy)
- 交叉熵的替代参数化:cross_entropy_lambda(xentlambda)
boosting :默认gbdt,设置提升类型,选项有gbdt,rf,dart,goss,别名:boosting_type,boost
- gbdt(gbrt):传统的梯度提升决策树
- rf(random_forest):随机森林
- dart:多个加性回归树的DROPOUT方法 Dropouts meet Multiple Additive Regression Trees,参见:https://arxiv.org/abs/1505.01866
- goss:基于梯度的单边采样 Gradient-based One-Side Sampling
data(train,train_data,train_data_file,data_filename):用于训练的数据或数据file
valid (test,valid_data,valid_data_file,test_data,test_data_file,valid_filenames):验证/测试数据的路径,LightGBM将输出这些数据的指标
num_iterations:默认=100,类型= INT
n_estimators(num_iterations 的别名):提升迭代次数。注意:对于多类分类问题,LightGBM 内部会构造 num_class * num_iterations 棵树。
learning_rate(shrinkage_rate,eta) :收缩率,默认=0.1
num_leaves(num_leaf,max_leaves,max_leaf) :默认=31,一棵树上的最大叶子数
tree_learner (tree,tree_type,tree_learner_type):默认=serial,可选:serial,feature,data,voting
- serial:单台机器的 tree learner
- feature:特征并行的 tree learner
- data:数据并行的 tree learner
- voting:投票并行的 tree learner
num_threads(num_thread, nthread):LightGBM 的线程数,为了更快的速度, 将此设置为真正的 CPU 内核数, 而不是线程的数量 (大多数 CPU 使用超线程来使每个 CPU 内核生成 2 个线程),当你的数据集小的时候不要将它设置的过大 (比如, 当数据集有 10,000 行时不要使用 64 线程),对于并行学习, 不应该使用全部的 CPU 内核, 因为这会导致网络性能不佳。
device(device_type):默认cpu,为树学习选择设备, 你可以使用 GPU 来获得更快的学习速度,可选cpu, gpu。
seed (random_seed,random_state):与其他种子相比,该种子具有较低的优先级,这意味着如果您明确设置其他种子,它将被覆盖。
2.用于控制模型学习过程的参数:
- max_depth:限制树模型的最大深度. 这可以在 #data 小的情况下防止过拟合. 树仍然可以通过 leaf-wise 生长。
- min_data_in_leaf: 默认=20,一个叶子上数据的最小数量. 可以用来处理过拟合。
- min_sum_hessian_in_leaf(min_sum_hessian_per_leaf, min_sum_hessian, min_hessian):默认=1e-3,一个叶子上的最小 hessian 和. 类似于 min_data_in_leaf, 可以用来处理过拟合.
- feature_fraction:default=1.0,如果 feature_fraction 小于 1.0, LightGBM 将会在每次迭代中随机选择部分特征. 例如, 如果设置为 0.8, 将会在每棵树训练之前选择 80% 的特征,可以用来加速训练,可以用来处理过拟合。
- feature_fraction_seed:默认=2,feature_fraction 的随机数种子。
- bagging_fraction(sub_row, subsample):默认=1,不进行重采样的情况下随机选择部分数据
- bagging_freq(subsample_freq):bagging 的频率, 0 意味着禁用 bagging. k 意味着每 k 次迭代执行bagging
- bagging_seed(bagging_fraction_seed) :默认=3,bagging 随机数种子。
- early_stopping_round(early_stopping_rounds, early_stopping):默认=0,如果一个验证集的度量在 early_stopping_round 循环中没有提升, 将停止训练
- lambda_l1(reg_alpha):L1正则化系数
- lambda_l2(reg_lambda):L2正则化系数
- min_split_gain(min_gain_to_split):执行切分的最小增益,默认=0.
- cat_smooth:默认=10,用于分类特征,可以降低噪声在分类特征中的影响, 尤其是对数据很少的类别
3.度量参数:
metric:default={l2 for regression}, {binary_logloss for binary classification}, {ndcg for lambdarank}, type=multi-enum, options=l1, l2, ndcg, auc, binary_logloss, binary_error …
- l1, absolute loss, alias=mean_absolute_error, mae
- l2, square loss, alias=mean_squared_error, mse
- l2_root, root square loss, alias=root_mean_squared_error, rmse
- quantile, Quantile regression
- huber, Huber loss
- fair, Fair loss
- poisson, Poisson regression
- ndcg, NDCG
- map, MAP
- auc, AUC
- binary_logloss, log loss
- binary_error, 样本: 0 的正确分类, 1 错误分类
- multi_logloss, multi-class 对数损失(log loss)
- multi_error, multi-class 分类错误率(error rate)
- xentropy, cross-entropy (与可选的线性权重), alias=cross_entropy
- xentlambda, “intensity-weighted” 交叉熵, alias=cross_entropy_lambda
- kldiv, Kullback-Leibler divergence, alias=kullback_leibler
- 支持多指标, 使用 , 分隔
train_metric(training_metric, is_training_metric):默认=False,如果你需要输出训练的度量结果则设置 true
4.GPU 参数:
- gpu_device_id:default为-1, 这个default意味着选定平台上的设备。
3 集成学习
集成学习方法主要分成三种:bagging,boosting 和 Stacking。这里主要介绍Stacking。
Stacking严格来说并不是一种算法,而是精美而又复杂的,对模型集成的一种策略。
Stacking集成算法可以理解为一个两层的集成,第一层含有多个基础分类器,把预测的结果(元特征)提供给第二层, 而第二层的分类器通常是逻辑回归,他把一层分类器的结果当做特征做拟合输出预测结果。在介绍Stacking之前,我们先来对简化版的Stacking进行讨论,也叫做Blending,接着我们对Stacking进行更深入的讨论。
Blending集成学习算法
不知道大家小时候有没有过这种经历:老师上课提问到你,那时候你因为开小差而无法立刻得知问题的答案。就在你彷徨的时候,由于你平时人缘比较好,因此周围的同学向你伸出援手告诉了你他们脑中的正确答案,因此你对他们的答案加以总结和分析最终的得出正确答案。相信大家都有过这样的经历,说这个故事的目的是为了引出集成学习家族中的Blending方式,这种集成方式跟我们的故事是十分相像的。
下面我们来详细讨论下这个Blending集成学习方式:
- (1) 将数据划分为训练集和测试集(test_set),其中训练集需要再次划分为训练集(train_set)和验证集(val_set);
- (2) 创建第一层的多个模型,这些模型可以使同质的也可以是异质的;
- (3) 使用train_set训练步骤2中的多个模型,然后用训练好的模型预测val_set和test_set得到val_predict, test_predict1;
- (4) 创建第二层的模型,使用val_predict作为训练集训练第二层的模型;
- (5) 使用第二层训练好的模型对第二层测试集test_predict1进行预测,该结果为整个测试集的结果。
以上是Blending集成方式的过程,接下来我们来分析这个集成方式的优劣:
- 其中一个最重要的优点就是实现简单粗暴,没有太多的理论的分析。
- 但是这个方法的缺点也是显然的:blending只使用了一部分数据集作为留出集进行验证,也就是只能用上数据中的一部分,实际上这对数据来说是很奢侈浪费的。
Stacking集成学习算法
基于前面对Blending集成学习算法的讨论,我们知道:Blending在集成的过程中只会用到验证集的数据,对数据实际上是一个很大的浪费。为了解决这个问题,我们详细分析下Blending到底哪里出现问题并如何改进。在Blending中,我们产生验证集的方式是使用分割的方式,产生一组训练集和一组验证集,这让我们联想到交叉验证的方式。顺着这个思路,我们对Stacking进行建模(如下图):
- 首先将所有数据集生成测试集和训练集(假如训练集为10000,测试集为2500行),那么上层会进行5折交叉检验,使用训练集中的8000条作为训练集,剩余2000行作为验证集(橙色)。
- 每次验证相当于使用了蓝色的8000条数据训练出一个模型,使用模型对验证集进行验证得到2000条数据,并对测试集进行预测,得到2500条数据,这样经过5次交叉检验,可以得到中间的橙色的 5 × 2000 条验证集的结果(相当于每条数据的预测结果),以及 5 × 2500 条测试集的预测结果。
- 接下来会将验证集的 5 × 2000 条预测结果拼接成10000行长的矩阵(new train set for 第二层输出),标记为$$A_1$$,而对于 5 × 2500 行的测试集的预测结果进行加权平均,得到一个2500行一列的矩阵,标记为$$B_1$$。
- 上面得到一个基模型在数据集上的预测结果$$A_1$$、$$B_1$$,这样当我们对3个基模型进行集成的话,相于得到了$$A_1$$、$$A_2$$、$$A_3$$、$$B_1$$、$$B_2$$、$$B_3$$六个矩阵。
- 之后我们会将$$A_1$$、$$A_2$$、$$A_3$$并列在一起成10000行3列的矩阵作为training data,$$B_1$$、$$B_2$$、$$B_3$$合并在一起成2500行3列的矩阵作为testing data,让下层学习器基于这样的数据进行再训练。
- 再训练是基于每个基础模型的预测结果作为特征(三个特征),次学习器会学习如何给这些基学习器的预测结果赋予权重w,来使得最后的预测最为准确。
下面,我们来实际应用下Stacking是如何集成算法的
1、首先我们会得到两组数据:训练集和测试集。将训练集分成5份:train1,train2,train3,train4,train5。 2、选定基模型。这里假定我们选择了xgboost, lightgbm 和 randomforest 这三种作为基模型。比如xgboost模型部分:依次用train1,train2,train3,train4,train5作为验证集,其余4份作为训练集,进行5折交叉验证进行模型训练;再在测试集上进行预测。这样会得到在训练集上由xgboost模型训练出来的5份predictions,和在测试集上的1份预测值B1。将这五份纵向重叠合并起来得到A1。lightgbm和randomforest模型部分同理。 3、三个基模型训练完毕后,将三个模型在训练集上的预测值作为分别作为3个”特征”A1,A2,A3,使用LR模型进行训练,建立LR模型。 4、使用训练好的LR模型,在三个基模型之前在测试集上的预测值所构建的三个”特征”的值(B1,B2,B3)上,进行预测,得出最终的预测类别或概率。
由于sklearn并没有直接对Stacking的方法,因此我们需要下载mlxtend工具包(pip install mlxtend)
stacking主要有几种使用方法: 1、最基本的使用方法,即使用基分类器所产生的预测类别作为meta-classifier“特征”的输入数据
# Stacking example 1: the meta-classifier is trained on the class labels
# predicted by the base classifiers.
import numpy as np
import lightgbm as lgb
from sklearn import datasets
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost.sklearn import XGBClassifier
from mlxtend.classifier import StackingClassifier

# Two of the four iris features are used.
iris = datasets.load_iris()
X, y = iris.data[:, 1:3], iris.target

# First layer: three base classifiers; second layer: logistic regression.
basemodel1 = XGBClassifier()
basemodel2 = lgb.LGBMClassifier()
basemodel3 = RandomForestClassifier(random_state=1)
lr = LogisticRegression()
sclf = StackingClassifier(
    classifiers=[basemodel1, basemodel2, basemodel3],
    meta_classifier=lr,
)

print('5-fold cross validation:\n')
# Score every base model and the stacked model with the same 5-fold CV.
candidates = [
    (basemodel1, 'xgboost'),
    (basemodel2, 'lightgbm'),
    (basemodel3, 'Random Forest'),
    (sclf, 'StackingClassifier'),
]
for basemodel, label in candidates:
    scores = model_selection.cross_val_score(basemodel, X, y, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
2、这一种是使用第一层所有基分类器所产生的类别概率值作为meta-classfier的输入。需要在StackingClassifier 中增加一个参数设置:use_probas = True。
另外,还有一个参数设置average_probas = True,那么这些基分类器所产出的概率值将按照列被平均,否则会拼接。
例如: 基分类器1:predictions=[0.2,0.2,0.7] 基分类器2:predictions=[0.4,0.3,0.8] 基分类器3:predictions=[0.1,0.4,0.6]
1)若use_probas = True,average_probas = True, 则产生的meta-feature 为:[0.233, 0.3, 0.7]
2)若use_probas = True,average_probas = False, 则产生的meta-feature 为:[0.2,0.2,0.7,0.4,0.3,0.8,0.1,0.4,0.6]
# Stacking example 2: the meta-classifier is trained on the class
# probabilities of the base classifiers, concatenated rather than averaged.
import numpy as np
import lightgbm as lgb
from sklearn import datasets
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost.sklearn import XGBClassifier
from mlxtend.classifier import StackingClassifier

# Two of the four iris features are used.
iris = datasets.load_iris()
X = iris.data[:, 1:3]
y = iris.target

basemodel1 = XGBClassifier()
basemodel2 = lgb.LGBMClassifier()
basemodel3 = RandomForestClassifier(random_state=1)
lr = LogisticRegression()
stack_options = dict(use_probas=True, average_probas=False, meta_classifier=lr)
sclf = StackingClassifier(classifiers=[basemodel1, basemodel2, basemodel3], **stack_options)

print('5-fold cross validation:\n')
model_names = ['xgboost', 'lightgbm', 'Random Forest', 'StackingClassifier']
all_models = [basemodel1, basemodel2, basemodel3, sclf]
# Same 5-fold accuracy evaluation for each base model and for the stack.
for basemodel, label in zip(all_models, model_names):
    scores = model_selection.cross_val_score(
        basemodel, X, y, cv=5, scoring='accuracy')
    summary = (scores.mean(), scores.std(), label)
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % summary)
3、这一种方法是对基分类器训练的特征维度进行操作的,并不是给每一个基分类器全部的特征,而是赋予不同的基分类器不同的特征。比如:基分类器1训练前半部分的特征,基分类器2训练后半部分的特征。这部分的操作是通过sklearn中的pipelines实现。最终通过StackingClassifier组合起来。
# Stacking example 3: each base classifier is trained on its own subset of
# the feature columns, selected by a ColumnSelector inside a pipeline.
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from xgboost.sklearn import XGBClassifier
from mlxtend.classifier import StackingClassifier
from mlxtend.feature_selection import ColumnSelector

iris = load_iris()
X, y = iris.data, iris.target

# Base classifier 1: xgboost on columns 0 and 2.
pipe1 = make_pipeline(ColumnSelector(cols=(0, 2)), XGBClassifier())
# Base classifier 2: random forest on columns 1, 2 and 3.
pipe2 = make_pipeline(ColumnSelector(cols=(1, 2, 3)), RandomForestClassifier())

# Logistic regression combines the two pipelines.
sclf = StackingClassifier(
    classifiers=[pipe1, pipe2],
    meta_classifier=LogisticRegression(),
)
sclf.fit(X, y)
StackingClassifier使用API和参数说明:
StackingClassifier(classifiers, meta_classifier, use_probas=False, average_probas=False, verbose=0, use_features_in_secondary=False)
参数: classifiers: 基分类器,数组形式,[cl1, cl2, cl3]. 每个基分类器的属性被存储在类属性 self.clfs_.
meta_classifier: 目标分类器,即将前面分类器合起来的分类器
use_probas: bool (default: False) ,如果设置为True, 那么目标分类器的输入就是前面分类输出的类别概率值而不是类别标签
average_probas: bool (default: False),当上一个参数use_probas = True时需设置,average_probas=True表示所有基分类器输出的概率值需被平均,否则拼接。 verbose: int, optional (default=0)。用来控制使用过程中的日志输出:verbose = 0 时,什么也不输出;verbose = 1 时,输出正在拟合的基模型的序号和名字;verbose = 2 时,输出详细的参数信息;verbose > 2 时,会把底层基模型自身的 verbose 参数设置为 verbose - 2。 use_features_in_secondary: bool (default: False)。如果设置为True,那么最终的目标分类器就被基分类器产生的数据和最初的数据集同时训练。如果设置为False,最终的分类器只会使用基分类器产生的数据训练。
备注:Stacking一般多是两层就够了,多层也是可以的。
例如下图:
案例
'''
Author: ydzhao
Description: stacking模型融合
Date: 2020-11-23 10:44:16
LastEditTime: 2020-11-23 13:35:17
FilePath: \project\code\DC-hi_guides-master\stacking.py
'''
from __future__ import print_function
from __future__ import division
from m1_xgb import xgb_fit
from m1_xgb import xgb_predict
from m1_xgb import Config as XGB_Config
from m2_lgb import lgb_fit
from m2_lgb import lgb_predict
from m2_lgb import Config as LGB_Config
from m3_cgb import cgb_fit
from m3_cgb import cgb_predict
from m3_cgb import Config as CGB_Config
from data_helper import get_action_rate_before_jp
from data_helper import get_pair_rate_before_jp
from data_helper import get_triple_rate_before_jp
from data_helper import get_gender_convert_rate
from data_helper import get_feat
from data_helper import load_feat
from my_utils import feature_analyze
from my_utils import check_path
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVR
import logging.handlers
import time
"""Model stacking.
my_stacking: 每个单模型需要具备两个函数:
- 1.model_fit, 返回 best model;
- 2.返回的 best_model 具有 predict 预测类别概率。
- sklearn_stacking: 所有 base_model 都是 sklearn 中的模型,这样具有统一的 fit, perdict 接口。
"""
# Log file of the stacking script; check_path creates its parent directory
# when it is missing.
LOG_FILE = 'log/stacking.log'
check_path(LOG_FILE)
# Rotating file handler: 1 MiB per file, one backup file kept.
handler = logging.handlers.RotatingFileHandler(LOG_FILE, maxBytes=1024 * 1024, backupCount=1) # instantiate the handler
fmt = '%(asctime)s - %(filename)s:%(lineno)s - %(name)s - %(message)s'
formatter = logging.Formatter(fmt)
handler.setFormatter(formatter)
# Logger named 'stack', writing DEBUG and above to the handler.
logger = logging.getLogger('stack')
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)
# NOTE(review): SEED is not referenced by the functions visible in this
# script; confirm whether it was meant to seed the shuffles.
SEED = 3
def load_features(feature_path='features_lin/'):
    """Load the cached train/test feature tables and split off the label.

    Args:
        feature_path: directory handed to ``load_feat``.

    Returns:
        X_train: training table without the columns missing from the test table.
        y_train: the ``label`` column of the training table.
        X_test: the test table, unchanged.
    """
    train_data, test_data = load_feat(re_get=False, feature_path=feature_path)
    test_feats = test_data.columns.values
    # Columns that exist only in the training table cannot be used at
    # prediction time, so they are dropped.
    train_only = [col for col in train_data.columns.values if col not in test_feats]
    X_train = train_data.drop(train_only, axis=1)
    y_train = train_data['label']
    X_test = test_data
    data_message = 'X_train.shape={}, X_test.shape={}'.format(X_train.shape, X_test.shape)
    print(data_message)
    logger.info(data_message)
    return X_train, y_train, X_test
def my_stacking(fit_funcs, predict_funcs, configs, X_train, y_train, X_test, n_fold=5, save_path=None):
    """Build out-of-fold stacking features from custom fit/predict functions.

    For every model the training data is split into ``n_fold`` folds. The
    model is fitted on the other folds and predicts the held-out fold (giving
    one training-set column) and the whole test set (averaged over the folds,
    giving one test-set column). The stack built so far is saved after every
    model.

    Args:
        fit_funcs: list of callables ``fit(config, X, y)`` returning a
            4-tuple whose first two items are the fitted model and its AUC.
        predict_funcs: list of callables ``predict(model, X)`` returning the
            probability of the positive class.
        configs: the config for each model.
        X_train: shape=[n_sample_train, n_feats], feature for training data
        y_train: shape=[n_sample_train, 1], labels for training data.
        X_test: shape=[n_sample_test, n_feats], feature for testing data.
        n_fold: n_fold for cv.
        save_path: the path to save the final stack features. None (the
            default) uses a time-stamped file under ``features/``.

    Returns:
        X_train_stack: shape=[n_sample_train, n_model]
        y_train_stack: shape=[n_sample_train, ...], the labels in the same
            shuffled order as X_train_stack.
        X_test_stack: shape=[n_sample_test, n_model]
    """
    if isinstance(X_train, pd.DataFrame):
        X_train = X_train.values
    if isinstance(X_test, pd.DataFrame):
        X_test = X_test.values
    if isinstance(y_train, (pd.DataFrame, pd.Series)):
        y_train = y_train.values
    n_train = len(X_train)
    n_test = len(X_test)
    n_model = len(fit_funcs)
    # Shuffle once up front; the folds below are then contiguous slices of
    # the shuffled data.
    new_idx = np.random.permutation(n_train)
    y_train = y_train[new_idx]
    X_train = X_train[new_idx]
    print('X_train.shape={}, X_test.shape={}'.format(X_train.shape, X_test.shape))
    kf = KFold(n_splits=n_fold, shuffle=False)
    X_train_stack = None
    X_test_stack = None
    t1 = time.time()
    for k in range(n_model):
        message = 'Training model {}/{}, pass {}s.'.format(k + 1, n_model, time.time() - t1)
        print(message)
        logger.info(message)
        fit_func = fit_funcs[k]
        predict_func = predict_funcs[k]
        config = configs[k]
        oof_train = np.zeros((n_train,))
        oof_test_skf = np.zeros((n_test, n_fold))
        for i, (train_idx, test_idx) in enumerate(kf.split(X_train)):
            X_tr = X_train[train_idx]
            y_tr = y_train[train_idx]
            X_te = X_train[test_idx]
            best_model, best_auc, _, _ = fit_func(config, X_tr, y_tr)
            message = 'Fished fold {}/{}, auc={}'.format(i + 1, n_fold, best_auc)
            logger.info(message)
            # Out-of-fold prediction for the held-out rows ...
            y_pred_prob = predict_func(best_model, X_te)
            oof_train[test_idx] = y_pred_prob
            # ... and this fold's prediction for the whole test set.
            oof_test_skf[:, i] = predict_func(best_model, X_test)
        oof_train = oof_train.reshape(-1, 1)
        # Average the per-fold test predictions into a single column.
        oof_test = np.mean(oof_test_skf, axis=1).reshape(-1, 1)
        if X_train_stack is None:  # the first model
            X_train_stack = oof_train
            X_test_stack = oof_test
        else:
            X_train_stack = np.hstack((X_train_stack, oof_train))
            X_test_stack = np.hstack((X_test_stack, oof_test))
        # Checkpoint the stack built so far.
        stack_feats_path = 'features/stack_feats/round_{}.npz'.format(k + 1)
        check_path(stack_feats_path)
        np.savez(stack_feats_path, X_train=X_train_stack, y_train=y_train, X_test=X_test_stack)
    message = 'X_train_stack.shape={}, X_test_stack.shape={}'.format(X_train_stack.shape, X_test_stack.shape)
    print(message)
    logger.info(message)
    if save_path is None:
        save_path = 'features/stack_feat_{}.npz'.format(time.strftime("%m%d-%H%M%S"))
    check_path(save_path)
    np.savez(save_path, X_train=X_train_stack, y_train=y_train, X_test=X_test_stack)
    message = 'Finished stacking, saved the features to {}'.format(save_path)
    logger.info(message)
    print(message)
    return X_train_stack, y_train, X_test_stack
def final_fit_predict(X_train, y_train, X_test, save_result_path=None):
    """Fit the second-layer model on the stacking features and predict.

    A logistic regression with L2 penalty is tuned over ``C`` with a 5-fold
    grid search scored by ROC AUC; the refitted search then scores ``X_test``.

    Args:
        X_train: stacking features of the training set.
        y_train: training labels.
        X_test: stacking features of the test set (DataFrame or array).
        save_result_path: optional CSV path. The file holds the test
            features plus an ``orderType`` column with the probabilities.

    Returns:
        1-D array with the positive-class probability of every test row.
    """
    param_grids = {
        "C": list(np.linspace(0.0001, 10, 100))
    }
    print('Begin final fit with params:{}'.format(param_grids))
    grid = GridSearchCV(
        LogisticRegression(penalty='l2', max_iter=200),
        param_grid=param_grids,
        cv=5,
        scoring="roc_auc"
    )
    grid.fit(X_train, y_train)
    try:
        message = 'Final fit: param_grids is: {};\n best_param is {};\n best cv_score is {};\n best_estimator is {}'.format(param_grids, grid.best_params_, grid.best_score_, grid.best_estimator_)
        logger.info(message)
        print(message)
    except Exception as e:
        # Reporting is best effort. Exceptions have no `.message` attribute
        # on Python 3, so print the exception itself.
        print(e)
    y_pred_prob = grid.predict_proba(X_test)[:, 1]
    if save_result_path is not None:
        # The stacking functions return plain arrays, which cannot take a
        # named column; copy DataFrames so the caller's data is not modified.
        if isinstance(X_test, pd.DataFrame):
            df_result = X_test.copy()
        else:
            df_result = pd.DataFrame(np.asarray(X_test))
        df_result['orderType'] = y_pred_prob
        df_result.to_csv(save_result_path, index=False)
        print('Save the result to {}'.format(save_result_path))
    return y_pred_prob
def run_my_stack():
    """Run the custom stacking pipeline: seven LightGBM variants plus one CatBoost.

    Returns:
        Positive-class probabilities predicted by the second-layer model for
        the test set.
    """
    X_train, y_train, X_test = load_features()
    # Per-model config / fit function / predict function, kept in step.
    configs = list()
    fit_funcs = list()
    predict_funcs = list()
    # Only referenced by the commented-out `max_round` overrides below,
    # which shorten training for a quick smoke run.
    MAX_ROUND = 3
    # lgb
    num_leaves = [31, 41, 51, 61, 71, 81, 91]
    feature_fractions = [0.4, 0.4, 0.4, 0.3, 0.3, 0.3, 0.3]
    for n_leaves, feature_fraction in zip(num_leaves, feature_fractions):
        lgb_config = LGB_Config()
        lgb_config.params['num_leaves'] = n_leaves
        lgb_config.params['feature_fraction'] = feature_fraction
        lgb_config.seed = np.random.randint(0, 10000)
        lgb_config.save_model_path = None
        # lgb_config.max_round = MAX_ROUND
        configs.append(lgb_config)
        fit_funcs.append(lgb_fit)
        predict_funcs.append(lgb_predict)
    # xgb
    # max_depths = [6, 7]
    # colsample_bytrees = [0.7, 0.6]
    # for i in range(len(max_depths)):
    #     xgb_config = XGB_Config()
    #     xgb_config.params['max_depth'] = max_depths[i]
    #     xgb_config.params['colsample_bytree'] = colsample_bytrees[i]
    #     xgb_config.seed = np.random.randint(0, 10000)
    #     xgb_config.save_model_path = None
    #     # xgb_config.max_round = MAX_ROUND
    #
    #     configs.append(xgb_config)
    #     fit_funcs.append(xgb_fit)
    #     predict_funcs.append(xgb_predict)
    # cgb
    max_depths = [8]
    for depth in max_depths:
        cgb_config = CGB_Config()
        cgb_config.params['depth'] = depth
        cgb_config.seed = np.random.randint(0, 10000)
        cgb_config.save_model_path = None
        # cgb_config.max_round = MAX_ROUND
        configs.append(cgb_config)
        fit_funcs.append(cgb_fit)
        predict_funcs.append(cgb_predict)
    X_train_stack, y_train_stack, X_test_stack = my_stacking(
        fit_funcs,
        predict_funcs,
        configs,
        X_train,
        y_train,
        X_test
    )
    result_path = 'result/my_stack_result-{}.csv'.format(time.strftime("%m%d-%H%M%S"))
    # Create the result directory first, as run_sklearn_stack does;
    # otherwise writing the CSV fails on a fresh checkout.
    check_path(result_path)
    y_pred_prob = final_fit_predict(X_train_stack, y_train_stack, X_test_stack, save_result_path=result_path)
    return y_pred_prob
def sklearn_stacking(base_models, X_train, y_train, X_test, n_fold=5, save_path='features/sklearn_stack_feat.npz'):
    """Build out-of-fold stacking features from sklearn-style estimators.

    Every model must provide ``fit(X, y)`` and ``predict_proba(X)``. For each
    model and each of the ``n_fold`` folds the model is refitted on the other
    folds, predicts the held-out fold (one training-set column per model) and
    the whole test set (averaged over the folds, one test-set column per
    model).

    Args:
        base_models: list of estimators.
        X_train: training features, DataFrame or array.
        y_train: training labels, DataFrame, Series or array.
        X_test: test features, DataFrame or array.
        n_fold: number of cross-validation folds.
        save_path: where to save the final stack; a falsy value skips saving.

    Returns:
        X_train_stack: shape=[n_sample_train, n_model]
        y_train: the labels in the same shuffled order as X_train_stack.
        X_test_stack: shape=[n_sample_test, n_model]
    """
    if isinstance(X_train, pd.DataFrame):
        X_train = X_train.values
    if isinstance(X_test, pd.DataFrame):
        X_test = X_test.values
    if isinstance(y_train, (pd.DataFrame, pd.Series)):
        y_train = y_train.values
    n_train = len(X_train)
    n_test = len(X_test)
    n_model = len(base_models)
    # Shuffle once up front; the folds below are then contiguous slices of
    # the shuffled data.
    new_idx = np.random.permutation(n_train)
    X_train = X_train[new_idx]
    y_train = y_train[new_idx]
    print('X_train.shape={}, X_test.shape={}'.format(X_train.shape, X_test.shape))
    kf = KFold(n_splits=n_fold, shuffle=False)
    X_train_stack = None
    X_test_stack = None
    t1 = time.time()
    for k in range(n_model):
        message = 'Training model {}/{}, pass {}s.'.format(k + 1, n_model, time.time() - t1)
        print(message)
        logger.info(message)
        oof_train = np.zeros((n_train,))
        oof_test_skf = np.zeros((n_test, n_fold))
        model = base_models[k]
        for i, (train_idx, test_idx) in enumerate(kf.split(X_train)):
            X_tr = X_train[train_idx]
            # Estimators expect a 1-D label vector; a column vector triggers
            # conversion warnings in fit().
            y_tr = y_train[train_idx].ravel()
            X_te = X_train[test_idx]
            print(X_tr.shape)
            print(y_tr.shape)
            print(X_te.shape)
            model.fit(X_tr, y_tr)
            message = 'Fished fold {}/{}, pass {}s'.format(i + 1, n_fold, time.time() - t1)
            print(message)
            logger.info(message)
            # Out-of-fold prediction for the held-out rows ...
            y_pred_prob = model.predict_proba(X_te)[:, 1]
            oof_train[test_idx] = y_pred_prob
            # ... and this fold's prediction for the whole test set.
            oof_test_skf[:, i] = model.predict_proba(X_test)[:, 1]
        oof_train = oof_train.reshape(-1, 1)
        # Average the per-fold test predictions into a single column.
        oof_test = np.mean(oof_test_skf, axis=1).reshape(-1, 1)
        if X_train_stack is None:  # the first model
            X_train_stack = oof_train
            X_test_stack = oof_test
        else:
            X_train_stack = np.hstack((X_train_stack, oof_train))
            X_test_stack = np.hstack((X_test_stack, oof_test))
        # Checkpoint the stack built so far.
        stack_feats_path = 'features/stack_feats/round_{}.npz'.format(k + 1)
        check_path(stack_feats_path)
        np.savez(stack_feats_path, X_train=X_train_stack, y_train=y_train, X_test=X_test_stack)
    message = 'X_train_stack.shape={}, X_test_stack.shape={}'.format(X_train_stack.shape, X_test_stack.shape)
    print(message)
    logger.info(message)
    if save_path:
        check_path(save_path)
        np.savez(save_path, X_train=X_train_stack, y_train=y_train, X_test=X_test_stack)
        message = 'Finished stacking, saved the features to {}'.format(save_path)
        logger.info(message)
        print(message)
    return X_train_stack, y_train, X_test_stack
def run_sklearn_stack():
    """Run the stacking pipeline with sklearn-style XGBoost, LightGBM and CatBoost models.

    The small ``n_estimators`` / ``iterations`` values keep this a quick
    smoke run; the commented numbers are the previously used settings.

    Returns:
        Positive-class probabilities predicted by the second-layer model for
        the test set.
    """
    X_train, y_train, X_test = load_features()
    base_models = [
        XGBClassifier(learning_rate=0.05,  # `eta` is an alias of this; set once
                      eval_metric='auc',
                      # n_estimators=712, # 750
                      n_estimators=7,  # 750
                      max_depth=5,
                      min_child_weight=7,
                      gamma=0,
                      subsample=0.8,
                      colsample_bytree=0.6,
                      verbosity=0,  # replaces the deprecated `silent=1`
                      seed=3,
                      objective='binary:logistic',
                      scale_pos_weight=1),
        LGBMClassifier(num_leaves=31,
                       learning_rate=0.05,
                       # n_estimators=543, # 443
                       n_estimators=5,  # 443
                       objective='binary',
                       metric={'auc'},
                       seed=3,
                       colsample_bytree=0.8,
                       min_child_weight=7,
                       subsample=0.8,
                       verbose=-1),  # replaces the deprecated `silent=1`
        CatBoostClassifier(iterations=5,
                           learning_rate=0.05,
                           eval_metric='AUC',
                           depth=8
                           ),
    ]
    X_train_stack, y_train_stack, X_test_stack = sklearn_stacking(base_models, X_train, y_train, X_test, n_fold=5)
    result_path = 'result/sklearn_stack_result-{}.csv'.format(time.strftime("%m%d-%H%M%S"))
    check_path(result_path)
    y_pred_prob = final_fit_predict(X_train_stack, y_train_stack, X_test_stack, save_result_path=result_path)
    return y_pred_prob
if __name__ == '__main__':
    # Entry point: run the custom fit/predict-function stacking pipeline.
    run_my_stack()
    # Alternative pipeline built from sklearn-style estimators:
    # run_sklearn_stack()
7 ML模型超参数调节:网格搜索、随机搜索与贝叶斯优化
在进行机器学习的过程中,最为核心的一个概念就是参数,而参数又分为模型参数与超参数。 * 模型参数,顾名思义就是我们使用的模型根据训练数据学习到的参数,这一部分不需要我们人为的先验经验。 * 超参数是在开始学习过程之前设置值的参数,而不是通过训练得到的参数数据。 通常情况下,需要对超参数进行优化,给模型选择一组最优超参数,以提高学习的性能和效果。 通常情况下,常用的超参数调参的方法有:网格搜索,随机搜索与贝叶斯优化
(1)网格搜索
网格搜索是应用最广泛的超参数搜索算法,网格搜索通过查找搜索范围内的所有的点,来确定最优值。 一般通过给出较大的搜索范围以及较小的步长,网格搜索是一定可以找到全局最大值或最小值的。 但是,网格搜索一个比较大的问题是,它十分消耗计算资源,特别是需要调优的超参数比较多的时候。 在比赛中,需要调参的模型数量与对应的超参数比较多,而涉及的数据量又比较大,因此相当的耗费时间。 此外,由于给出的超参数组合比较多,因此一般都会固定多数参数,分步对1~2个超参数进行调节,这样能够减少时间但是却难以自动化进行,而且由于目标函数一般是非凸的,因此容易陷入局部最小值。 根据我们设定的超参数分布范围来看,对所有的参数组合进行一一尝试是不现实的,这可能会消耗数天甚至数星期的时间,尤其是在大样本训练集上。
(2)随机搜索
与网格搜索相比,随机搜索并未尝试所有参数值,而是从指定的分布中采样固定数量的参数设置。 它的理论依据是,如果随机样本点集足够大,那么也可以找到全局的最大或最小值,或它们的近似值。通过对搜索范围的随机取样,随机搜索一般会比网格搜索要快一些。但是和网格搜索的快速版(非自动版)相似,结果也是没法保证的。 随机搜索的过程如下,使用方法与网格搜索完全一致
(3)贝叶斯优化
贝叶斯优化用于机器学习调参由J. Snoek(2012)提出,主要思想是,给定优化的目标函数(广义的函数,只需指定输入和输出即可,无需知道内部结构以及数学性质), 通过不断地添加样本点来更新目标函数的后验分布(高斯过程),直到后验分布基本贴合于真实分布。
简单的说,就是考虑了上一次参数的信息,从而更好的调整当前的参数。
贝叶斯优化与常规的网格搜索或者随机搜索的区别是:
- 1.贝叶斯调参采用高斯过程,考虑之前的参数信息,不断地更新先验;网格搜索未考虑之前的参数信息。
- 2.贝叶斯调参迭代次数少,速度快;网格搜索速度慢,参数多时易导致维度爆炸。
- 3.贝叶斯调参针对非凸问题依然稳健;网格搜索针对非凸问题易得到局部最优。
贝叶斯优化调参的具体原理可以参考:拟合目标函数后验分布的调参利器:贝叶斯优化
我们使用BayesOpt包来进行贝叶斯优化调参,安装命令如下所示:
pip install bayesian-optimization
BayesOpt包主要使用BayesianOptimization函数来创建一个优化对象,该函数接受一个模型评估函数function, 这个function的输入应该是xgboost(或者其他ML模型)的超参数,输出是模型在测试集上的效果(可以是Accuracy,也可以是RMSE,取决于具体的任务,一般返回K-Fold的均值)。
基于5-Fold的LightGBM贝叶斯优化的
# -*- coding: utf-8 -*-#
# -------------------------------------------------------------------------------
# @Project :code
# @Author :ydzhao
# @Email :zydeve@163.com
# @Datetime :2020-11-26 16:56
# @Software :PyCharm
# @Name :hyper_parameters.py.py
# @Description :ML模型超参数调节:网格搜索、随机搜索与贝叶斯优化
# -------------------------------------------------------------------------------
from __future__ import print_function
from __future__ import division
import time
import sys
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import joblib
import lightgbm as lgb
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from bayes_opt import BayesianOptimization
from data_helper import load_feat
from my_utils import feature_analyze
from my_utils import check_path
import logging.handlers
# Log file of the hyper-parameter search script; check_path creates its
# parent directory when it is missing.
LOG_FILE = 'log/hyper_parameters.log'
check_path(LOG_FILE)
# Rotating file handler: 1 MiB per file, one backup file kept.
handler = logging.handlers.RotatingFileHandler(LOG_FILE, maxBytes=1024 * 1024, backupCount=1) # instantiate the handler
fmt = '%(asctime)s - %(filename)s:%(lineno)s - %(name)s - %(message)s'
formatter = logging.Formatter(fmt)
handler.setFormatter(formatter)
# Logger named 'train', writing DEBUG and above to the handler.
logger = logging.getLogger('train')
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)
class ConfigSearch(object):
    """Baseline LightGBM parameters and model path for the search script."""

    def __init__(self):
        # File that is to hold the best model found by the search.
        self.best_model_path = 'model/best_lgb_model_hyper_parameters.txt'
        # Binary classification scored by AUC.
        self.params = dict(
            objective='binary',
            metric='auc',
            learning_rate=0.05,
            num_leaves=30,  # 50 leaves overfitted badly in offline validation
            min_sum_hessian_in_leaf=0.1,
            feature_fraction=0.3,  # counterpart of colsample_bytree
            bagging_fraction=0.5,  # counterpart of subsample
            lambda_l1=0,
            lambda_l2=5
        )
def GridSearch(clf, params, X, y):
    """Run an exhaustive grid search and log its outcome.

    Args:
        clf: estimator to tune.
        params: parameter grid handed to ``GridSearchCV``.
        X: training features.
        y: training labels.

    The search uses ROC AUC scoring, 5-fold cross-validation and two
    parallel jobs. Nothing is returned; the best score and parameters, the
    full CV results and the best estimator are written to the log.
    """
    searcher = GridSearchCV(clf, params, scoring='roc_auc', n_jobs=2, cv=5)
    fitted = searcher.fit(X, y)
    report = (
        ("best_score: {} ,using best_params: {}", (fitted.best_score_, fitted.best_params_)),
        ("cv_result:{}", (fitted.cv_results_,)),
        ("best_estimator:{}", (fitted.best_estimator_,)),
    )
    for template, values in report:
        logger.info(template.format(*values))
    # To list the CV score of every parameter combination, iterate over
    # fitted.cv_results_['params'] together with
    # fitted.cv_results_['mean_test_score'].
def RandomSearch(clf, params, X, y):
    """Run a randomized parameter search and log its outcome.

    Args:
        clf: estimator to tune.
        params: parameter distributions handed to ``RandomizedSearchCV``.
        X: training features.
        y: training labels.

    The search uses ROC AUC scoring, 5-fold cross-validation and
    ``n_jobs=-1``. Nothing is returned; the results only go to the log.
    """
    outcome = RandomizedSearchCV(clf, params, scoring='roc_auc', n_jobs=-1, cv=5).fit(X, y)
    # Best score together with the parameters that produced it.
    logger.info("best_score: {} ,using best_params: {}".format(outcome.best_score_, outcome.best_params_))
    # Full cross-validation table.
    logger.info("cv_result:{}".format(outcome.cv_results_))
    # Estimator refitted with the best parameters.
    logger.info("best_estimator:{}".format(outcome.best_estimator_))
def GBM_evaluate_reg(min_child_samples, min_child_weight, colsample_bytree, max_depth, subsample, reg_alpha, reg_lambda):
    """Objective function for Bayesian optimisation of a LightGBM regressor.

    Every argument is a hyper-parameter value proposed by the optimizer; each
    one is cast to the type used below before the model is built. The model is
    scored with 5-fold cross validation on the module-level ``train_X`` /
    ``train_y``.

    Returns:
        Mean ``neg_mean_squared_error`` over the 5 folds. BayesianOptimization
        maximises the returned value, so a negated error is used for
        regression.
    """
    # Hyper-parameters that stay fixed during the search.
    param = {
        'objective': 'regression',
        'n_estimators': 275,
        'metric': 'rmse',
        'random_state': 2018
    }
    # Hyper-parameters proposed by the Bayesian optimizer.
    # The assignments must not end with a comma: a trailing comma would store
    # a 1-tuple such as (0.8,) instead of the scalar value.
    # NOTE(review): min_child_weight is truncated to an int here, as in the
    # original code — confirm that this truncation is intended.
    param['min_child_weight'] = int(min_child_weight)
    param['colsample_bytree'] = float(colsample_bytree)
    param['max_depth'] = int(max_depth)
    param['subsample'] = float(subsample)
    param['reg_lambda'] = float(reg_lambda)
    param['reg_alpha'] = float(reg_alpha)
    param['min_child_samples'] = int(min_child_samples)
    # 5-fold cross validation scored with the negated MSE.
    val = cross_val_score(
        lgb.LGBMRegressor(**param),
        train_X,
        train_y,
        scoring='neg_mean_squared_error',
        cv=5
    ).mean()
    return val
def GBM_evaluate_cls(num_leaves,learning_rate):
    """Objective function for Bayesian optimisation of a binary LightGBM classifier.

    ``num_leaves`` and ``learning_rate`` are the values proposed by the
    optimizer; the remaining parameters are fixed. Returns the mean ROC AUC of
    a 5-fold cross validation on the module-level ``train_X`` / ``train_y``.
    """
    param = {
        # fixed part
        'objective': 'binary',
        'metric': 'auc',
        'min_sum_hessian_in_leaf': 0.1,
        'feature_fraction': 0.3,  # counterpart of colsample_bytree
        'bagging_fraction': 0.5,  # counterpart of subsample
        'lambda_l1': 0,
        'lambda_l2': 5,
        # searched part, cast to the types used by the model
        'num_leaves': int(num_leaves),
        'learning_rate': float(learning_rate),
    }
    model = lgb.LGBMClassifier(**param)
    fold_scores = cross_val_score(model, train_X, train_y, scoring='roc_auc', cv=5)
    return fold_scores.mean()
def BayesianSearch(clf, params, num_iter=25, init_points=5):
    """Run Bayesian optimisation over ``params`` and log every evaluation.

    Args:
        clf: Objective function to maximise (despite the name it is a
            callable such as ``GBM_evaluate_cls``, not an estimator).
        params: Mapping of hyper-parameter name to its search range.
        num_iter: Value passed as ``n_iter`` to ``maximize``. Defaults to the
            previously hard-coded 25.
        init_points: Value passed as ``init_points`` to ``maximize``. Defaults
            to the previously hard-coded 5.

    Returns:
        The ``BayesianOptimization`` object after ``maximize`` has run, so
        callers can read ``.max`` / ``.res`` directly. (Earlier versions
        returned ``None``.)
    """
    # Build the optimizer from the objective function and the search ranges.
    bayes = BayesianOptimization(clf, params)
    # Run the optimisation.
    bayes.maximize(init_points=init_points, n_iter=num_iter)
    # Report the best result, then every single evaluation.
    bayes_max = "bayes_max: {}".format(bayes.max)
    print(bayes_max)
    logger.info(bayes_max)
    for i, res in enumerate(bayes.res):
        result = "Iteration {}: {}".format(i, res)
        print(result)
        logger.info(result)
    return bayes
if __name__ == '__main__':
    # ---- Load the training and test sets
    feature_path = 'features/'
    train_data, test_data = load_feat(re_get=False, feature_path=feature_path)
    train_feats = train_data.columns.values
    test_feats = test_data.columns.values
    # Columns that exist only in the training frame are removed from the
    # training features.
    drop_columns = [col for col in train_feats if col not in test_feats]
    train_X = train_data.drop(columns=drop_columns)  # pandas DataFrame
    train_y = train_data['label']  # pandas Series
    test_X = test_data  # pandas DataFrame
    data_message = 'X_train.shape={}, X_test.shape={}'.format(train_X.shape, test_X.shape)
    print(data_message)
    logger.info(data_message)

    # ---- Hyper-parameter tuning
    # config = ConfigSearch()
    # adj_params = {
    #     'num_leaves': [20, 30, 40, 50],
    #     'learning_rate': [0.025, 0.05, 0.1, 0.15, 0.20]
    # }
    # logger.info(adj_params)
    # print(config.params)
    # classer = lgb.LGBMClassifier(**config.params)
    # (1) grid search
    # GridSearch(classer,adj_params,train_X,train_y)
    # (2) random search
    # RandomSearch(classer,adj_params,train_X,train_y)

    # (3) Bayesian optimisation: each entry is the (low, high) range explored
    #     for that hyper-parameter.
    BayesianSearch(
        clf=GBM_evaluate_cls,
        params=dict(
            num_leaves=(20, 50),
            learning_rate=(0.025, 0.20),
        ),
    )
8 LightGBM各种操作
LightGBM是继XGBoost之后出现的一款可以快速并行的树模型框架,内部集成了多种集成学习思路,在代码实现上对XGBoost的节点划分方式进行了改进,内存占用更低、训练速度更快。
LightGBM官网: https://lightgbm.readthedocs.io/en/latest/
参数介绍: https://lightgbm.readthedocs.io/en/latest/Parameters.html
8.1 安装方法
LightGBM的安装非常简单,在Linux下很方便的就可以开启GPU训练。可以优先选用从pip安装,如果失败再从源码安装。
安装方法:从源码安装
# Build LightGBM from source: fetch the repository with its submodules,
# configure in a separate build directory, then compile with 4 parallel jobs.
git clone --recursive https://github.com/microsoft/LightGBM ;
cd LightGBM
mkdir build ; cd build
cmake ..
# Alternative configure step: enable MPI communication for faster training
# cmake -DUSE_MPI=ON ..
# Alternative configure step: GPU build for faster training
# cmake -DUSE_GPU=1 ..
make -j4
安装方法:pip安装
# NOTE(review): the --install-option flag may not be accepted by recent pip
# releases — confirm against the LightGBM installation guide for your version.
# Default build
pip install lightgbm
# MPI build
pip install lightgbm --install-option=--mpi
# GPU build
pip install lightgbm --install-option=--gpu
8.2 调用方法
在Python语言中LightGBM提供了两种调用方式,分别为原生的API和Scikit-learn API,两种方式都可以完成训练和验证。当然原生的API更加灵活,可以根据个人习惯来进行选择。
定义数据集
# Example binary-classification data (tab separated, no header row) together
# with one weight per row for the train and the test split.
df_train = pd.read_csv(
    'https://cdn.coggle.club/LightGBM/examples/binary_classification/binary.train',
    header=None,
    sep='\t',
)
df_test = pd.read_csv(
    'https://cdn.coggle.club/LightGBM/examples/binary_classification/binary.test',
    header=None,
    sep='\t',
)
W_train = pd.read_csv(
    'https://cdn.coggle.club/LightGBM/examples/binary_classification/binary.train.weight',
    header=None,
)[0]
W_test = pd.read_csv(
    'https://cdn.coggle.club/LightGBM/examples/binary_classification/binary.test.weight',
    header=None,
)[0]

# Column 0 is used as the label; every other column is a feature.
y_train, y_test = df_train[0], df_test[0]
X_train = df_train.drop(columns=0)
X_test = df_test.drop(columns=0)
num_train, num_feature = X_train.shape

# Wrap the frames as LightGBM datasets.
# free_raw_data=False keeps the raw data so the datasets can be re-used.
lgb_train = lgb.Dataset(X_train, label=y_train, weight=W_train, free_raw_data=False)
lgb_eval = lgb.Dataset(
    X_test,
    label=y_test,
    reference=lgb_train,
    weight=W_test,
    free_raw_data=False,
)
模型训练
# Training parameters shared by all following lgb.train calls.
params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='binary_logloss',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=0,
)

# One generated name per feature column: feature_0, feature_1, ...
feature_name = ['feature_{}'.format(col) for col in range(num_feature)]

# First 10 boosting rounds; the training data itself is used for evaluation
# and feature index 21 is declared categorical.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=10,
    valid_sets=lgb_train,
    feature_name=feature_name,
    categorical_feature=[21],
)
模型保存与加载
# Write the booster to a model file.
gbm.save_model('model.txt')

# Additionally export the dumped model as indented JSON.
print('Dumping model to JSON...')
model_json = gbm.dump_model()
with open('model.json', 'w+') as f:
    f.write(json.dumps(model_json, indent=4))
查看特征重要性
# Print the feature names reported by the trained booster.
print('Feature names:', gbm.feature_name())
# Print the booster's importance score for each feature, converted to a plain list.
print('Feature importances:', list(gbm.feature_importance()))
继续训练
# Continue training for 10 more rounds on top of the saved model.
# init_model accepts either
#   1. the file name of a saved model, or
#   2. a Booster() instance.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=10,
    valid_sets=lgb_eval,
    init_model='model.txt',
)
print('Finished 10 - 20 rounds with model file...')
动态调整模型超参数
# Decay the learning rate while training continues from the current booster.
# The schedule is applied through the reset_parameter callback (the same
# mechanism used for bagging_fraction below) instead of the `learning_rates`
# keyword of lgb.train. Each parameter given to reset_parameter accepts:
#   1. a list with length = num_boost_round
#   2. a function(curr_iter) returning the value for that round
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                valid_sets=lgb_eval,
                callbacks=[lgb.reset_parameter(
                    # named cur_iter so the builtin `iter` is not shadowed
                    learning_rate=lambda cur_iter: 0.05 * (0.99 ** cur_iter))])
print('Finished 20 - 30 rounds with decay learning rates...')

# Change another parameter during training: bagging_fraction is 0.7 for the
# first 5 rounds and 0.6 for the last 5.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                valid_sets=lgb_eval,
                callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)])
print('Finished 30 - 40 rounds with changing bagging_fraction...')
自定义损失函数
# Custom objective function.
# Signature: f(preds: array, train_data: Dataset) -> grad: array, hess: array
# Implements the log-likelihood loss.
def loglikelihood(preds, train_data):
    """Return gradient and hessian of the log-likelihood loss.

    ``preds`` are raw scores; they are passed through the logistic function
    before being compared with the labels from ``train_data.get_label()``.
    """
    prob = 1. / (1. + np.exp(-preds))
    grad = prob - train_data.get_label()
    hess = prob * (1. - prob)
    return grad, hess
# Custom evaluation metric.
# Signature: f(preds: array, train_data: Dataset) -> name: string, eval_result: float, is_higher_better: bool
# Implements the binary error rate.
# NOTE: with a customized loss function the default prediction value is the margin,
# which may make built-in evaluation metrics calculate wrong results.
# For example, with the log-likelihood loss the prediction is the score before
# the logistic transformation. Keep this in mind when using the customization.
def binary_error(preds, train_data):
    """Return ``('error', error_rate, False)`` for raw-score predictions.

    The raw scores are squashed with the logistic function, thresholded at
    0.5 and compared with the labels; ``False`` marks the metric as
    lower-is-better.
    """
    labels = train_data.get_label()
    prob = 1. / (1. + np.exp(-preds))
    predicted_positive = prob > 0.5
    mismatch = labels != predicted_positive
    return 'error', np.mean(mismatch), False
# Ten more rounds driven by the custom objective and the custom metric
# defined above.
# NOTE(review): support for the fobj / feval keywords depends on the installed
# LightGBM version — confirm against the lgb.train documentation.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=10,
    init_model=gbm,
    valid_sets=lgb_eval,
    fobj=loglikelihood,
    feval=binary_error,
)
print('Finished 40 - 50 rounds with self-defined objective function and eval metric...')
调参方法————网格搜索
# Tuning notes:
# (1) num_leaves
#     LightGBM grows trees leaf-wise, so tree complexity is tuned with
#     num_leaves rather than max_depth.
#     Rough conversion: num_leaves = 2 ^ (max_depth)
# (2) Imbalanced class distribution: param['is_unbalance'] = 'true' can be set
# (3) Bagging parameters: bagging_fraction + bagging_freq (must be set together), feature_fraction
# (4) min_data_in_leaf, min_sum_hessian_in_leaf
lg = lgb.LGBMClassifier(silent=False)
param_dist = {"max_depth": [4, 5, 7],
              "learning_rate": [0.01, 0.05, 0.1],
              "num_leaves": [300, 900, 1200],
              "n_estimators": [50, 100, 150]
              }
grid_search = GridSearchCV(lg, n_jobs=-1, param_grid=param_dist, cv=5, scoring="roc_auc", verbose=5)
# Fit on X_train, the feature frame that belongs to y_train in this section
# (the previous name `train` is not defined anywhere in this section).
grid_search.fit(X_train, y_train)
# Print the result: a bare expression is only displayed in an interactive
# session and is silently discarded when run as a script.
print(grid_search.best_estimator_, grid_search.best_score_)
调参方法————贝叶斯优化
import warnings
import time
warnings.filterwarnings("ignore")
from bayes_opt import BayesianOptimization
def lgb_eval(max_depth, learning_rate, num_leaves, n_estimators):
    """Objective function for Bayesian optimisation based on ``lgb.cv``.

    The four arguments are the values proposed by the optimizer. Integer
    parameters are floored at 1 and cast to ``int``; the learning rate is
    clipped to the interval [0, 1].

    Returns:
        The best mean AUC found by 5-fold ``lgb.cv``.

    NOTE(review): ``d_train`` is read from the enclosing scope and is not
    defined in this snippet — it has to be an ``lgb.Dataset`` built by the
    caller. Also note that this function shares its name with the validation
    dataset ``lgb_eval`` created earlier in the article — confirm that the
    rebinding is acceptable where both snippets are run together.
    """
    params = {
        "metric": 'auc'
    }
    params['max_depth'] = int(max(max_depth, 1))
    # np.clip takes (value, minimum, maximum); the value to clip comes first.
    params['learning_rate'] = np.clip(learning_rate, 0, 1)
    params['num_leaves'] = int(max(num_leaves, 1))
    params['n_estimators'] = int(max(n_estimators, 1))
    cv_result = lgb.cv(params, d_train, nfold=5, seed=0, verbose_eval=200, stratified=False)
    return 1.0 * np.array(cv_result['auc-mean']).max()
# Search ranges (low, high) for each hyper-parameter of lgb_eval; the fixed
# random_state is passed through to the optimizer.
lgbBO = BayesianOptimization(
    f=lgb_eval,
    pbounds={
        'max_depth': (4, 8),
        'learning_rate': (0.05, 0.2),
        'num_leaves': (20, 1500),
        'n_estimators': (5, 200),
    },
    random_state=0,
)
# 5 initial points followed by 50 optimisation steps with the 'ei' acquisition setting.
lgbBO.maximize(init_points=5, n_iter=50, acq='ei')
# Best target value and the parameters that produced it.
print(lgbBO.max)