LightGBM examples
https://dschloe.github.io/python/dacon/jeju2020/08_gbm_xgboost_lightgbm/
https://tpwkcorqhd.tistory.com/25
http://incredible.egloos.com/7478695
import lightgbm as lgb
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import numpy as np
# Objective for Bayesian optimization of an LGBMRegressor: builds a model
# from the sampled hyperparameters and returns the mean 5-fold CV score
# (MSE is negated by sklearn, so higher is still better).
def lgb_evaluate(numLeaves, maxDepth, scaleWeight, minChildWeight, subsample, colSam):
    reg = lgb.LGBMRegressor(
        num_leaves=int(numLeaves),
        max_depth=int(maxDepth),
        scale_pos_weight=scaleWeight,
        min_child_weight=minChildWeight,
        learning_rate=0.05,
        n_estimators=20,
        subsample=subsample,
        colsample_bytree=colSam,
    )
    # scores = cross_val_score(reg, train_x, train_y, cv=5, scoring='roc_auc')
    scores = cross_val_score(reg, train_x, train_y, cv=5, scoring='neg_mean_squared_error')
    return np.mean(scores)
# Classification variant: redefines lgb_evaluate with an LGBMClassifier
# scored by mean 5-fold CV ROC AUC.
def lgb_evaluate(numLeaves, maxDepth, scaleWeight, minChildWeight, subsample, colSam):
    clf = lgb.LGBMClassifier(
        objective='binary',
        metric='auc',
        reg_alpha=0,
        reg_lambda=2,
        # bagging_fraction=0.999,
        min_split_gain=0,
        min_child_samples=10,
        subsample_freq=3,
        # subsample_for_bin=50000,
        # n_estimators=9999999,
        n_estimators=99,
        num_leaves=int(numLeaves),
        max_depth=int(maxDepth),
        scale_pos_weight=scaleWeight,
        min_child_weight=minChildWeight,
        subsample=subsample,
        colsample_bytree=colSam,
        verbose=-1)
    scores = cross_val_score(clf, train_x, train_y, cv=5, scoring='roc_auc')
    return np.mean(scores)
def bayesOpt(train_x, train_y):
    lgbBO = BayesianOptimization(lgb_evaluate, {
        'numLeaves': (5, 90),
        'maxDepth': (2, 90),
        'scaleWeight': (1, 10000),
        'minChildWeight': (0.01, 70),
        'subsample': (0.4, 1),
        'colSam': (0.4, 1)
    })
    lgbBO.maximize(init_points=5, n_iter=50)
    print(lgbBO.res)
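print(lgbBO.res) dumps the whole trial history; the winning trial is also available directly via the optimizer's .max attribute (a dict with 'target' and 'params'). A minimal sketch of an alternative ending for bayesOpt, assuming the same lgbBO object:

    lgbBO.maximize(init_points=5, n_iter=50)
    best = lgbBO.max  # e.g. {'target': 0.93, 'params': {'numLeaves': 41.2, ...}}
    print('Best CV score:', best['target'])
    print('Best params:', best['params'])
    return best['params']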
from sklearn.datasets import load_boston

dataset = load_boston()
X, y = dataset.data, dataset.target
train_x, X_test, train_y, y_test = train_test_split(X, y, test_size=0.2)
bayesOpt(train_x, train_y)
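Note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so this snippet fails on a recent install. A minimal sketch of the same run on fetch_california_housing instead:

from sklearn.datasets import fetch_california_housing

dataset = fetch_california_housing()
X, y = dataset.data, dataset.target
train_x, X_test, train_y, y_test = train_test_split(X, y, test_size=0.2)
bayesOpt(train_x, train_y)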
from numpy import loadtxt

dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
X, y = dataset[:, 0:8], dataset[:, 8]
train_x, X_test, train_y, y_test = train_test_split(X, y, test_size=0.2)
bayesOpt(train_x, train_y)
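Neither example actually uses the accuracy_score / confusion_matrix imports from the top. A minimal sketch of fitting a final classifier on the tuned values, assuming bayesOpt was changed to return best['params'] as sketched above (key names match the lgb_evaluate signature):

params = bayesOpt(train_x, train_y)  # assumes bayesOpt returns best['params']
clf = lgb.LGBMClassifier(
    objective='binary',
    n_estimators=99,
    num_leaves=int(params['numLeaves']),
    max_depth=int(params['maxDepth']),
    scale_pos_weight=params['scaleWeight'],
    min_child_weight=params['minChildWeight'],
    subsample=params['subsample'],
    colsample_bytree=params['colSam'])
clf.fit(train_x, train_y)
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))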
Example: examples/python-guide/sklearn_example.py
https://github.com/microsoft/LightGBM/blob/master/examples/python-guide/sklearn_example.py
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb

print('Loading data...')
# load or create your dataset
df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')

y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)

print('Starting training...')
# train
gbm = lgb.LGBMRegressor(num_leaves=31,
                        learning_rate=0.05,
                        n_estimators=20)
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='l1',
        early_stopping_rounds=5)

print('Starting predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# eval
rmse_test = mean_squared_error(y_test, y_pred) ** 0.5
print(f'The RMSE of prediction is: {rmse_test}')

# feature importances
print(f'Feature importances: {list(gbm.feature_importances_)}')

# self-defined eval metric
# f(y_true: array, y_pred: array) -> name: string, eval_result: float, is_higher_better: bool
# Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y_true, y_pred):
    return 'RMSLE', np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False

print('Starting training with custom eval function...')
# train
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric=rmsle,
        early_stopping_rounds=5)

# another self-defined eval metric
# f(y_true: array, y_pred: array) -> name: string, eval_result: float, is_higher_better: bool
# Relative Absolute Error (RAE)
def rae(y_true, y_pred):
    return 'RAE', np.sum(np.abs(y_pred - y_true)) / np.sum(np.abs(np.mean(y_true) - y_true)), False

print('Starting training with multiple custom eval functions...')
# train
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric=[rmsle, rae],
        early_stopping_rounds=5)

print('Starting predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# eval
rmsle_test = rmsle(y_test, y_pred)[1]
rae_test = rae(y_test, y_pred)[1]
print(f'The RMSLE of prediction is: {rmsle_test}')
print(f'The RAE of prediction is: {rae_test}')

# other scikit-learn modules
estimator = lgb.LGBMRegressor(num_leaves=31)
param_grid = {
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [20, 40]
}
gbm = GridSearchCV(estimator, param_grid, cv=3)
gbm.fit(X_train, y_train)
print(f'Best parameters found by grid search are: {gbm.best_params_}')
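Note: newer LightGBM releases (4.0+) removed early_stopping_rounds from the sklearn-API fit(); early stopping moved to the callbacks argument. A minimal sketch of the first fit call under that API, assuming lightgbm >= 4.0:

gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='l1',
        callbacks=[lgb.early_stopping(stopping_rounds=5)])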