ligthgbm ex

Published by onesixx on

https://dschloe.github.io/python/dacon/jeju2020/08_gbm_xgboost_lightgbm/

https://tpwkcorqhd.tistory.com/25

http://incredible.egloos.com/7478695

import lightgbm as lgb
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix
import numpy as np
def lgb_evaluate(numLeaves, maxDepth, scaleWeight, minChildWeight, subsample, colSam):
    reg=lgb.LGBMRegressor(
        num_leaves=31,
        max_depth= 2,
\tscale_pos_weight= scaleWeight, 
\tmin_child_weight= minChildWeight, 
\t
    \tlearning_rate=0.05,   
\tn_estimators=20
\t
\tsubsample= 0.4, 
\tcolsample_bytree= 0.4, 

    )
    
    
    
    
    
#   scores = cross_val_score(reg, train_x, train_y, cv=5, scoring='roc_auc')
    scores = cross_val_score(reg, train_x, train_y, cv=5, scoring='neg_mean_squared_error')
    return np.mean(scores)
def lgb_evaluate(numLeaves, maxDepth, scaleWeight, minChildWeight, subsample, colSam):
    clf = lgb.LGBMClassifier(
        objective = 'binary',
        metric= 'auc',
        reg_alpha= 0,
        reg_lambda= 2,
#       bagging_fraction= 0.999,
        min_split_gain= 0,
        min_child_samples= 10,
        subsample_freq= 3,
#       subsample_for_bin= 50000,
#       n_estimators= 9999999,
        n_estimators= 99,
        num_leaves= int(numLeaves),
        max_depth= int(maxDepth),
        scale_pos_weight= scaleWeight,
        min_child_weight= minChildWeight,
        subsample= subsample,
        colsample_bytree= colSam,
        verbose =-1)
    scores = cross_val_score(clf, train_x, train_y, cv=5, scoring='roc_auc')
    return np.mean(scores)
def bayesOpt(train_x, train_y):
    lgbBO = BayesianOptimization(lgb_evaluate, {'numLeaves':(5, 90), 'maxDepth':(2, 90), 'scaleWeight':(1,10000), 'minChildWeight':(0.01, 70),  'subsample': (0.4, 1), 'colSam':(0.4, 1)})
    lgbBO.maximize(init_points=5, n_iter=50)
    print(lgbBO.res)
from sklearn.datasets import load_boston
dataset = load_boston()
X, y = dataset.data, dataset.target
train_x, X_test, train_y, y_test = train_test_split(X, y, test_size=0.2)
bayesOpt(train_x, train_y)
from numpy import loadtxt
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
X ,y = dataset[:,0:8], dataset[:,8]
train_x, X_test, train_y, y_test = train_test_split(X, y, test_size=0.2)
bayesOpt(train_x, train_y)

EXample …..sklearn_example.py

https://github.com/microsoft/LightGBM/blob/master/examples/python-guide/sklearn_example.py

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb

### print('Loading data...')
df_train = pd.read_csv('../regression/regression.train', header=None, sep='\\t')
df_test  = pd.read_csv('../regression/regression.test', header=None, sep='\\t')

y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)

print('Starting training...')
# train
gbm = lgb.LGBMRegressor(num_leaves=31,
                        learning_rate=0.05,
                        n_estimators=20)
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='l1',
        early_stopping_rounds=5)

print('Starting predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# eval
rmse_test = mean_squared_error(y_test, y_pred) ** 0.5
print(f'The RMSE of prediction is: {rmse_test}')

# feature importances
print(f'Feature importances: {list(gbm.feature_importances_)}')


# self-defined eval metric
# f(y_true: array, y_pred: array) -> name: string, eval_result: float, is_higher_better: bool
# Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y_true, y_pred):
    return 'RMSLE', np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False


print('Starting training with custom eval function...')
# train
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric=rmsle,
        early_stopping_rounds=5)


# another self-defined eval metric
# f(y_true: array, y_pred: array) -> name: string, eval_result: float, is_higher_better: bool
# Relative Absolute Error (RAE)
def rae(y_true, y_pred):
    return 'RAE', np.sum(np.abs(y_pred - y_true)) / np.sum(np.abs(np.mean(y_true) - y_true)), False


print('Starting training with multiple custom eval functions...')
# train
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric=[rmsle, rae],
        early_stopping_rounds=5)

print('Starting predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# eval
rmsle_test = rmsle(y_test, y_pred)[1]
rae_test = rae(y_test, y_pred)[1]
print(f'The RMSLE of prediction is: {rmsle_test}')
print(f'The RAE of prediction is: {rae_test}')

# other scikit-learn modules
estimator = lgb.LGBMRegressor(num_leaves=31)

param_grid = {
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [20, 40]
}

gbm = GridSearchCV(estimator, param_grid, cv=3)
gbm.fit(X_train, y_train)

print(f'Best parameters found by grid search are: {gbm.best_params_}')

https://www.kaggle.com/janged/xgb-lgb

Categories: DeepLearning

onesixx

Blog Owner

Subscribe
Notify of
guest

0 Comments
Oldest
Newest Most Voted
Inline Feedbacks
View all comments
0
Would love your thoughts, please comment.x
()
x