XGBoost :: classification example
https://deepsense.ai/wp-content/uploads/2015/11/xgboost.pdf
https://rstudio-pubs-static.s3.amazonaws.com/229885_4947089f114448e7836e25112bb05544.html#
An example that uses a matrix (not a data.frame) as the input data.
Agaricus (the mushroom dataset bundled with the xgboost package)
library(xgboost)

###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Load data ----
###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
data(agaricus.train, package='xgboost')
data(agaricus.test,  package='xgboost')

trnData  <- agaricus.train$data
trnLabel <- agaricus.train$label
tstData  <- agaricus.test$data
tstLabel <- agaricus.test$label

###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## Train the model ----
###### (Fitting, summary|history) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
modelXGB <- xgboost(data=trnData, label=trnLabel,
                    max_depth=2, eta=1,
                    objective="binary:logistic",
                    nthread=2, nrounds=2)

###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## Evaluation ----
###### (metric|pred) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
pred <- predict(modelXGB, tstData)

###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## Make predictions ----
###### (Model) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#pred <- modelXGB %>% predict(NEWDATA)
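predict() with objective="binary:logistic" returns probabilities, not class labels. A quick sketch of scoring the minimal model above (the fuller workflow below does the same and adds caret::confusionMatrix):

# threshold the predicted probabilities at 0.5 to get 0/1 class labels
pred_T <- as.numeric(pred > 0.5)
err <- mean(pred_T != tstLabel)  # misclassification rate on the test set
print(paste("test-error =", round(err, 4)))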
pacman::p_load(tidyverse, data.table)
library(xgboost)
library(caret)      # for confusionMatrix
library(DiagrammeR) # for xgb.plot.tree

###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Load data ----
###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
data(agaricus.train, package='xgboost')
data(agaricus.test,  package='xgboost')

trnData  <- agaricus.train$data
trnLabel <- agaricus.train$label
tstData  <- agaricus.test$data
tstLabel <- agaricus.test$label

###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## Train the model ----
###### (Fitting, summary|history) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
modelXGB <- xgboost(data=trnData, label=trnLabel,
                    max_depth=2, eta=1,
                    objective="binary:logistic",
                    nthread=2, nrounds=2)

# xgb.save(modelXGB, 'modelXGB.sixx')
# modelXGB <- xgb.load('modelXGB.sixx')

# ` improve model (recursive) --------------------------------------------------

###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## Evaluation ----
###### (metric|pred) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
pred <- predict(modelXGB, tstData)
#err <- mean((pred - tstTarget)^2) %>% round(2)

pred_T <- as.numeric(pred > 0.5)
err <- mean(pred_T != tstLabel)
confusionMatrix(as.factor(pred_T), as.factor(tstLabel),
                dnn=c("predicted", "actual"), positive="1")
print(str_c("test-error = ", round(err, 2)))

# evaluating and visualizing the performance of scoring classifiers
library(ROCR)  # Receiver Operating Characteristic graphs
pred_obj <- prediction(pred, tstLabel)  # ROCR::prediction(predictions, labels)
perf <- performance(pred_obj, measure="tpr", x.measure="fpr")
plot(perf, main="ROC Curve for Poisonous Mushrooms", col="blue", lwd=3)
abline(a=0, b=1, lwd=2, lty=2)

###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## Make predictions ----
###### (Model) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#pred <- modelXGB %>% predict(NEWDATA)

# ` explain Model --------------------------------------------------------------
# Feature importance ---
colNm <- colnames(tstData)
Imp <- xgb.importance(feature_names=colNm, model=modelXGB)
Imp %>% xgb.plot.importance()

# History Visualization - model tree
xgb.plot.tree(feature_names=colNm, model=modelXGB)
#modelTree <- xgb.model.dt.tree(feature_names=colNm, model=modelXGB)
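The fitted trees can also be inspected as plain text, which helps when DiagrammeR is not installed. A small sketch using xgb.dump() from the xgboost package:

# dump the boosted trees in text form; with_stats adds gain/cover statistics
xgb.dump(modelXGB, with_stats=TRUE)
# or write the dump to a file instead:
# xgb.dump(modelXGB, fname="modelXGB_dump.txt", with_stats=TRUE)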
Outbreaks of animal diseases
https://www.kaggle.com/rtatman/machine-learning-with-xgboost-in-r
https://www.kaggle.com/rtatman/machine-learning-with-xgboost-in-r-workbook
Machine Learning with XGBoost (in R) Workbook
Predict which outbreaks of animal diseases will lead to humans getting sick.
pacman::p_load(tidyverse, data.table, stringr, RCurl)
library(xgboost)
library(caret)      # for confusionMatrix
library(DiagrammeR) # for xgb.plot.tree

###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Load data ----
###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# information on various outbreaks of animal diseases
# URL <- "https://www.kaggle.com/tentotheminus9/empres-global-animal-disease-surveillance"
# Outbreak_240817.csv
diseaseInfo <- read_csv(file.choose())
diseaseInfo %>% str()

d0 <- setDT(diseaseInfo); # rm("diseaseInfo")
colnames(d0)
# "Id"               "source"       "latitude"        "longitude"       "region"
# "admin1"           "localityName" "localityQuality" "observationDate" "reportingDate" "status"
# "disease"          "serotypes"
# "humansGenderDesc" "humansAge"    "humansDeaths"
# "country"          "speciesDescription"
# "sumAtRisk"        "sumCases"     "sumDeaths"       "sumDestroyed"    "sumSlaughtered"
# "humansAffected"
d0 %>% head()

#* Preparing our data & selecting features ----
# target: did the outbreak affect any humans?
d0[ , Affected := !is.na(humansAffected)]

#` -- Convert categorical variables to numeric ----
# One-hot encoding takes each category and makes it its own column
# (see the toy model.matrix() sketch after this block).
d0[ , speciesDescription] %>% unique()
d0[ , is_domestic := speciesDescription %>% str_detect("domestic")]

# get a list of all the species by taking the last word of each description
d0[ , speciesDesc := speciesDescription %>%
      str_replace("[[:punct:]]", "") %>%  # remove punctuation (some rows have parentheses)
      str_extract("[a-z]*$") ]            # extract the last word in each row

options(na.action='na.pass')  # don't drop NA values!
speciesM  <- model.matrix(~ speciesDesc - 1, d0)  # sparse matrix

d0[ , country] %>% unique()
locationM <- model.matrix(~ country - 1, d0)

# add our one-hot encoded variables and convert the data into a matrix
d1 <- d0[ , c("sumAtRisk","sumCases","sumDeaths","sumDestroyed","sumSlaughtered",
              "is_domestic","Affected")]
# diseaseInfo_matrix <- data.matrix(diseaseInfo_numeric)  # leftover from the original
#   workbook (diseaseInfo_numeric is never defined here); replaced by the cbind() below
d2 <- cbind(locationM, speciesM, d1) %>% as.data.table()  # keep data.table for := below

trnIdx <- d2$Affected %>%
  createDataPartition(p=0.7, list=F) %>%
  as.vector()  # data.table's i does not accept a one-column matrix

trnData  <- d2[ trnIdx, ][ , Affected:=NULL]
trnLabel <- d2[ trnIdx, as.numeric(Affected)]
trnDM <- xgb.DMatrix(data=data.matrix(trnData), label=trnLabel)

tstData  <- d2[-trnIdx, ][ , Affected:=NULL]
tstLabel <- d2[-trnIdx, as.numeric(Affected)]
tstDM <- xgb.DMatrix(data=data.matrix(tstData), label=tstLabel)

#` -- Split dataset into testing and training subsets ----
# numberOfTrainingSamples <- round(length(diseaseLabels) * .7)
#
# train_data   <- diseaseInfo_matrix[1:numberOfTrainingSamples, ]
# train_labels <- diseaseLabels[1:numberOfTrainingSamples]
#
# test_data   <- diseaseInfo_matrix[-(1:numberOfTrainingSamples), ]
# test_labels <- diseaseLabels[-(1:numberOfTrainingSamples)]
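To make the one-hot encoding step above concrete, a toy illustration (made-up data, not the outbreak dataset) of what model.matrix(~ x - 1, ...) produces:

# each factor level becomes its own 0/1 indicator column;
# the "-1" drops the intercept so every level gets a column
toy <- data.frame(species = factor(c("cattle", "birds", "swine", "birds")))
model.matrix(~ species - 1, toy)
#   speciesbirds speciescattle speciesswine
# 1            0             1            0
# 2            1             0            0
# 3            0             0            1
# 4            1             0            0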
# #` -- Convert the cleaned dataframe to a DMatrix ----
# #    (training moves more quickly & can use multiple cores)
# dtrain <- xgb.DMatrix(data=train_data, label=train_labels)
# dtest  <- xgb.DMatrix(data=test_data,  label=test_labels)

###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## Train the model ----
###### (Fitting, summary|history) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# https://github.com/dmlc/xgboost/blob/master/doc/parameter.md
modelXGB <- xgboost(data=trnDM,
                    # max_depth=3,  # default: 6, the maximum depth of each decision tree
                    # eta=0.2,
                    objective="binary:logistic",
                    # nthread=3,
                    nrounds=2)

###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## Evaluation ----
###### (metric|pred) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
pred <- predict(modelXGB, tstDM)
#err <- mean((pred - tstTarget)^2) %>% round(2)

pred_T <- as.numeric(pred > 0.5)
err <- mean(pred_T != tstLabel)
confusionMatrix(as.factor(pred_T), as.factor(tstLabel),
                dnn=c("predicted", "actual"), positive="1")
print(str_c("test-error = ", round(err, 2)))

# evaluating and visualizing the performance of scoring classifiers
library(ROCR)  # Receiver Operating Characteristic graphs
pred_obj <- prediction(pred, tstLabel)  # ROCR::prediction(predictions, labels)
perf <- performance(pred_obj, measure="tpr", x.measure="fpr")
plot(perf, main="ROC Curve for Outbreaks Affecting Humans", col="blue", lwd=3)
abline(a=0, b=1, lwd=2, lty=2)

data.table(pred, pred_T) %>%
  ggplot(aes(pred)) +
  geom_histogram() +
  stat_bin(geom="text", aes(label=..count..))

if (min(modelXGB$evaluation_log$train_error) - err > 0) {
  print("Great")
} else {
  print("Over fitting")
}
# train-error:0.015  "test-error = 0.012"

###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## Improve the model ----
###### (Fitting, summary|history) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
params <- list("max_depth"=3, objective="binary:logistic")  # reg:squarederror for regression
cv.nfold  <- 3
cv.nround <- 500
modelXGB_cv <- xgb.cv(data=trnDM, params=params,
                      nfold=cv.nfold, nrounds=cv.nround,
                      early_stopping_rounds=200, verbose=1)

modelXGB_tuned <- xgboost(data=trnDM, params=params,
                          nrounds=modelXGB_cv$best_iteration)
pred <- predict(modelXGB_tuned, tstDM)

###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## Make predictions ----
###### (Model) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#pred <- modelXGB %>% predict(NEWDATA)

# ` explain Model --------------------------------------------------------------
# Feature importance ---
colNm <- colnames(tstDM)
Imp <- xgb.importance(feature_names=colNm, model=modelXGB)
Imp %>% xgb.plot.importance()

# History Visualization - model tree
xgb.plot.tree(feature_names=colNm, model=modelXGB)
#xgb.plot.multi.trees(feature_names=colNm, model=modelXGB)
#modelTree <- xgb.model.dt.tree(feature_names=colNm, model=modelXGB)

# convert log odds to probability
uF_odds_to_probs <- function(odds){
  exp(odds) / (1 + exp(odds))
}
# probability of the leaf above countryPortugal
uF_odds_to_probs(-0.599)  # ==> ~35% < threshold of 50%
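As a sanity check on uF_odds_to_probs(): base R's plogis() is the same logistic transform.

plogis(-0.599)  # ~0.355, matches uF_odds_to_probs(-0.599)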