xgBoost :: classification example

Published onesixx on

https://deepsense.ai/wp-content/uploads/2015/11/xgboost.pdf
https://rstudio-pubs-static.s3.amazonaws.com/229885_4947089f114448e7836e25112bb05544.html#

input data는 (data.frame이 아니라) matrix를 사용한 예.

Agaricus (아가리쿠스버섯)

library(xgboost)

###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Load data ----
###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
data(agaricus.train, package='xgboost')
data(agaricus.test,  package='xgboost')

trnData  <- agaricus.train$data
trnLabel <- agaricus.train$label

tstData  <- agaricus.test$data
tstLabel <- agaricus.test$label

###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## Train the model ----
###### (Fitting, summary|history) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
modelXGB <- xgboost(data=trnData, label=trnLabel, 
                    max_depth=2, eta=1, objective="binary:logistic", 
                    nthread=2, nrounds=2)

###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## Evaluation ----
###### (meric|pred) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
pred <- predict(modelXGB, tstData)

###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## Make predictions ----
###### (Model ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#pred <- bst %>% predict(NEWDATA)
#pred <- bst %>% predict(NEWDATA)
pacman::p_load(tidyverse, data.table)
library(xgboost)
library(caret)      # for confusionMatrix
library(DiagrammeR) # for xgb.plot.tree

###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Load data ----
###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
data(agaricus.train, package='xgboost')
data(agaricus.test,  package='xgboost')

trnData  <- agaricus.train$data
trnLabel <- agaricus.train$label

tstData  <- agaricus.test$data
tstLabel <- agaricus.test$label

###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## Train the model ----
###### (Fitting, summary|history) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
modelXGB <- xgboost(data=trnData, label=trnLabel, 
                    max_depth=2, eta=1, objective="binary:logistic", 
                    nthread=2, nrounds=2)
# xgb.save(modelXGB, 'modelXGB.sixx')
# modelXGB <- xgb.load('modelXGB.sixx')

# ` improve model (recursive) --------------------------------------------------

###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## Evaluation ----
###### (meric|pred) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
pred <- predict(modelXGB, tstDM)


#err <- mean((pred - tstTarget)^2) %>% round(2)
pred_T <- as.numeric(pred>0.5)
err <- mean(pred_T!=tstLabel)   
confusionMatrix(as.factor(pred_T), as.factor(tstLabel), dnn=c("pred", "actl"), positive="1")

print(str_c("test-error = ", round(err,2)))

# evaluating and visualizing the performance of scoring classifiers 
library(ROCR) # Receiver Operating Characteristic graphs
pred_obj <- prediction(tstLabel, pred_T)
perf <- performance(pred_obj, measure="tpr", x.measure="fpr")
 
plot(perf, main="ROC Curve for Poisonous Mushrooms", col="blue", lwd=3)
abline(a=0, b=1, lwd=2, lty=2)

###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## Make predictions ----
###### (Model ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#pred <- bst %>% predict(NEWDATA)

# ` explain Model --------------------------------------------------------------
# Feature importance ---
colNm <- colnames(tstData)
Imp <- xgb.importance(feature_names=colNm, model=modelXGB)
Imp %>% xgb.plot.importance()

# History Visualization - model tree 
xgb.plot.tree(feature_names=colNm, model=modelXGB)
#modelTree <- xgb.model.dt.tree(feature_names=colNm, model=modelXGB)

Outbreaks of animal diseases

https://www.kaggle.com/rtatman/machine-learning-with-xgboost-in-r
https://www.kaggle.com/rtatman/machine-learning-with-xgboost-in-r-workbook

Machine Learning with XGBoost (in R) Workbook
predict which outbreaks of animal diseases will lead to humans getting sick.

pacman::p_load(tidyverse, data.table, stringr, RCurl)
library(xgboost)
library(caret)      # for confusionMatrix
library(DiagrammeR) # for xgb.plot.tree


###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Load data ----
###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# information on various outbreaks of animal diseases
# URL <- "https://www.kaggle.com/tentotheminus9/empres-global-animal-disease-surveillance"
# Outbreak_240817.csv
diseaseInfo  <- read_csv(file.choose())
diseaseInfo %>% str()
d0 <- setDT(diseaseInfo); # rm("diseaseInfo")
colnames(d0)
# "Id"                 "source"             "latitude"           "longitude"          "region"
# "admin1"             "localityName"       "localityQuality"    "observationDate"    "reportingDate"      "status"
# "disease"            "serotypes"          
# "humansGenderDesc"   "humansAge"  "humansDeaths"             

# "country"            "speciesDescription"
# "sumAtRisk"          "sumCases"           "sumDeaths"          "sumDestroyed"       "sumSlaughtered"
# "humansAffected"     
d0 %>% head()

#* Preparing our data & selecting features ----
d0[ , Affected:=!is.na(humansAffected)]

#` -- Convert Categorical variable to a numeric ----
# One-hot encoding takes each category and makes it its own column.
d0[ , speciesDescription] %>% unique()

d0[ , is_domestic:=speciesDescription %>% str_detect("domestic")]

# get a list of all the species by getting the last
d0[ , speciesDesc:=speciesDescription %>%
                    str_replace("[[:punct:]]", "") %>%  # remove punctuation (some rows have parentheses)
                    str_extract("[a-z]*$") ]# extract the least word in each row

options(na.action='na.pass') # don't drop NA values!
speciesM <- model.matrix(~ speciesDesc-1, d0) # sparse matrix


d0[ , country] %>% unique()
locationM <- model.matrix(~country-1, d0) 

# add our one-hot encoded variable and convert the dataframe into a matrix
d1 <- d0[ , c("sumAtRisk","sumCases","sumDeaths","sumDestroyed","sumSlaughtered", "is_domestic","Affected")]
diseaseInfo_matrix <- data.matrix(diseaseInfo_numeric)
d2 <- cbind(locationM, speciesM, d1)

trnIdx <- d2$Affected %>% createDataPartition(p=0.7, list=F)

trnData  <- d2[ trnIdx, ][ , Affected:=NULL]
trnLabel<- d2[ trnIdx, as.numeric(Affected)]
trnDM <- xgb.DMatrix(data=data.matrix(trnData), label=trnLabel)

tstData  <- d2[-trnIdx, ][ , Affected:=NULL]
tstLabel <- d2[-trnIdx, as.numeric(Affected)]
tstDM    <- xgb.DMatrix(data=data.matrix(tstData),  label=tstLabel)

#` -- Split dataset into testing and training subsets ----
# numberOfTraingingSamples <- round(length(diseaseLabels)* .7)
# 
# train_data   <- diseaseInfo_matrix[1:numberOfTraingingSamples, ]
# train_labels <- diseaseLabels[1:numberOfTraingingSamples]
# 
# test_data    <- diseaseInfo_matrix[-(1:numberOfTraingingSamples), ]
# test_labels  <- diseaseLabels[-(1:numberOfTraingingSamples)]
# 
# #` -- Convert the cleaned dataframe to a Dmatrix ----
# # train move more quickly & to train a model on multiple cores.
# dtrain <- xgb.DMatrix(data=train_data, label=train_labels)
# dtest  <- xgb.DMatrix(data=test_data,  label=test_labels)

###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## Train the model ----
###### (Fitting, summary|history) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# https://github.com/dmlc/xgboost/blob/master/doc/parameter.md

modelXGB <- xgboost(data=trnDM, 
                    # max_depth=3, # default:6, the maximum depth of each decision tree
                    # eta=0.2, 
                    objective="binary:logistic", 
                    # nthread=3, 
                    nrounds=2)


###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## Evaluation ----
###### (meric|pred) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
pred <- predict(modelXGB, tstDM)

#err <- mean((pred - tstTarget)^2) %>% round(2)
pred_T <- as.numeric(pred>0.5)
err <- mean(pred_T!=tstLabel)   
confusionMatrix(as.factor(pred_T), as.factor(tstLabel), dnn=c("predicted", "actual"), positive="1")

print(str_c("test-error = ", round(err,2)))


# evaluating and visualizing the performance of scoring classifiers 
library(ROCR) # Receiver Operating Characteristic graphs
pred_obj <- prediction(tstLabel, pred_T)
perf <- performance(pred_obj, measure="tpr", x.measure="fpr")
 
plot(perf, main="ROC Curve for Poisonous Mushrooms", col="blue", lwd=3)
abline(a=0, b=1, lwd=2, lty=2)

data.table(pred, pred_T) %>% 
  ggplot(aes(pred))+ geom_histogram() + 
  stat_bin(geom="text", aes(label=..count..)) 

if (min(modelXGB$evaluation_log$train_error)-err >0){
  print("Great")
}else{
  print("Over fitting")
}
# train-error:0.015  "test-error= 0.012"

###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## Improve the model ----
###### (Fitting, summary|history) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
params <- list("max_depth"=3, objective="binary:logistic") # reg:squarederror
cv.nfold <- 3
cv.nround <- 500

modelXGB_cv <- xgb.cv(data=trnDM, 
                      param=params,
                      nfold=cv.nfold, nrounds=cv.nround,
                      early_stopping_rounds=200, verbose=1)  
modelXGB_tuned <- xgboost(data = trnDM,      
                       param=params,
                       nrounds=modelXGB_cv$best_iteration) 

pred <- predict(modelXGB_tuned, tstDM)

###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## Make predictions ----
###### (Model ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#pred <- modelXGB %>% predict(NEWDATA)


# ` explain Model --------------------------------------------------------------
# Feature importance ---
colNm <- colnames(tstDM)
Imp <- xgb.importance(feature_names=colNm, model=modelXGB)
Imp %>% xgb.plot.importance()

# History Visualization - model tree 
xgb.plot.tree(feature_names=colNm, model=modelXGB)
#xgb.plot.multi.trees(feature_names=colNm, model=modelXGB)
#modelTree <- xgb.model.dt.tree(feature_names=colNm, model=modelXGB)

# convert log odds to probability
uF_odds_to_probs <- function(odds){ 
  return( exp(odds)/ (1+exp(odds)) ) 
}
# probability of leaf above countryPortugul
uF_odds_to_probs(-0.599)   
# ==> 35% < threshold of 50% 
Categories: R Analysis

onesixx

Blog Owner

Subscribe
Notify of
guest

0 Comments
Inline Feedbacks
View all comments
0
Would love your thoughts, please comment.x
()
x