Reuters NN data (multiclass classification)

Published by onesixx on

DATA & EDA

# Run the shared setup script located one level above the working directory.
# The path is built from getwd(), so the result depends on where R was started.
# NOTE(review): presumably this script attaches the packages used below
# (keras, zeallot, data.table, ggplot2, magrittr) -- confirm in 00.global_dl.R.
source(file.path(getwd(),"../00.global_dl.R"))
### Title:  Reuters multi-class classification --- --- --- -- --- --- --- --- ----
# Deep Learning with R by François Chollet
# 3.5 Classifying newswires: a multiclass classification example

##1. Loading DATA -------------------------------------------------------
# Goal: classify Reuters newswires into 46 mutually exclusive topics.
# The output space is much larger than in a binary problem (46 classes, not 2).
# Alternative (kept for reference): fetch the data set directly through keras.
#reuters <- dataset_reuters(num_words = 10000)
# Read a locally cached copy instead.
# NOTE(review): readRDS() only reads files written by saveRDS(); the ".RData"
# extension suggests save() -- confirm how this file was produced.
reuters <- readRDS("~/DATA/DL/reuters.RData")
# Destructure the nested list into four objects with the %<-% operator:
# the first element supplies trnData/trnLabels, the second tstData/tstLabels.
c(c(trnData, trnLabels), c(tstData, tstLabels)) %<-% reuters
### EDA ------------------------------------------------------------------------
# Look at the first training sample (a sequence of word indices) and its label.
trnData[1]
trnLabels[1]

# Decode a sample back to words.
word_index <- dataset_reuters_word_index()      # word -> index list
reverse_word_index <- names(word_index)
names(reverse_word_index) <- word_index
reverse_word_index %>% head()                   # index -> word vector

# Stored indices are shifted by 3 relative to the word index; values below 3
# are not looked up (presumably reserved markers -- see the dataset_reuters
# documentation). Anything that cannot be resolved is shown as "?".
decoded_newswire <- vapply(trnData[[1]], function(index) {
  # Use `[` rather than `[[`: on a named character vector `[[` raises
  # "subscript out of bounds" for an unknown name, whereas `[` yields NA,
  # which lets the "?" fallback actually take effect.
  word <- if (index >= 3) {
    reverse_word_index[as.character(index - 3)]
  } else {
    NA_character_
  }
  if (is.na(word)) "?" else unname(word)
}, character(1), USE.NAMES = FALSE)
decoded_newswire

# Class frequencies of the training labels.
trnLabels %>% table()

# Same frequencies as a bar chart, with labels ordered 0..45.
tmpDT <- trnLabels %>% table() %>% as.data.table()
setnames(tmpDT, c("Label", "Cnt"))              # rename by reference
tmpDT[, Label := factor(Label, levels = 0:45)]
tmpDT %>% ggplot(aes(Label, Cnt)) + geom_bar(stat = "identity")
# Encoding --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ----
# Vectorize DATA (each variable is encoded as 0/1: value absent / present)
# Multi-hot encode a list of index sequences.
#
# txxdataList: list whose elements are vectors of column indices
# dimemsion:   number of columns of the result
#
# Returns a numeric matrix with one row per list element; within a row, the
# columns named by that element's indices are 1 and all others are 0.
uF_vectorize_sequences <- function(txxdataList, dimemsion=10000){
  n_samples <- length(txxdataList)
  encoded <- matrix(0, nrow = n_samples, ncol = dimemsion)
  for (row_id in seq_len(n_samples)) {
    active_cols <- txxdataList[[row_id]]
    encoded[row_id, active_cols] <- 1
  }
  encoded
}

# One-hot encode zero-based class labels.
#
# txxLabels: vector (or list) of labels; label k marks column k + 1
# dimemsion: number of columns of the result (number of classes)
#
# Returns a numeric matrix with length(txxLabels) rows and dimemsion columns,
# all 0 except for a single 1 per row at the column of that row's label.
uF_OneHot <- function(txxLabels, dimemsion=46){
  results <- matrix(0, nrow = length(txxLabels), ncol = dimemsion)
  # seq_along() instead of 1:length(): for empty input 1:0 would iterate over
  # c(1, 0) and fail on txxLabels[[1]]; now an empty input gives a 0-row matrix.
  for (i in seq_along(txxLabels)) {
    results[i, txxLabels[[i]] + 1] <- 1
  }
  results
}

# Smoke test for uF_OneHot: labels 1..4 encoded into 6 columns
test <- seq_len(4)
uF_OneHot(test, dimemsion = 6)
# Expected printout (label k marks column k + 1):
#      [,1] [,2] [,3] [,4] [,5] [,6]
# [1,]    0    1    0    0    0    0
# [2,]    0    0    1    0    0    0
# [3,]    0    0    0    1    0    0
# [4,]    0    0    0    0    1    0
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Multi-hot encode the word-index sequences of both splits.
trn_Data <- uF_vectorize_sequences(trnData)
tst_Data <- uF_vectorize_sequences(tstData)

# LABEL one hot
# num_classes is fixed at 46 (the number of topics) so the train and test
# label matrices always have the same number of columns; without it the width
# is inferred from the largest label present in each split.
trn_Labels <- to_categorical(trnLabels, num_classes = 46)
tst_Labels <- to_categorical(tstLabels, num_classes = 46)
# Hand-rolled equivalent, kept for reference:
# trn_Labels <- uF_OneHot(trnLabels)
# tst_Labels <- uF_OneHot(tstLabels)
Categories: Keras

onesixx

Blog Owner

Subscribe
Notify of
guest

0 Comments
Oldest
Newest Most Voted
Inline Feedbacks
View all comments
0
Would love your thoughts, please comment.x
()
x