imdb nn-data (binary classification)
predict a single discrete label
Loading DATA
source(file.path(getwd(),"../00.global_dl.R")) ### Title: IMDB binary classification --- --- --- -- --- --- --- --- --- --- ---- # Deep Learning with R by François Chollet :: 3.4 Classifying movie reviews ##1. Loading DATA ------------------------------------------------------- # imdb <- dataset_imdb(num_words=10000) imdb <- readRDS(file.path(DATA_PATH,"imdb.RData")) c(c(trnData, trnLabels), c(tstData, tstLabels)) %<-% imdb
Preprocessing
List -> Matrix
way1. pad_sequence
List를 Matrix로 reshape[sample, feature]할때, 각기 다른 행의 length를 맞추기위해
잘라내거나(truncating)나 특정value로 채워넣는(padding) 함수
MAX_LEN <- 20 trnData <- pad_sequences(trnData, maxlen=MAX_LEN) tstData <- pad_sequences(tstData, maxlen=MAX_LEN)
exList <- list( c( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), \t\t\t\tc( 2, 3, 4 ), \t\t\t\tc( 3, 4, 5, 6, 7, 8), \t\t\t\tc( 4 )) > exList %>% pad_sequences(maxlen=5) # [,1] [,2] [,3] [,4] [,5] # [1,] 6 7 8 9 10 # [2,] 0 0 2 3 4 # [3,] 4 5 6 7 8 # [4,] 0 0 0 0 4\t\t\t\t\t\t\t\t pad_sequences(sequences=exList, maxlen=NULL, dtype="int32", \t\t\t truncating="pre", padding="pre", value=0) # [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] # [1,] 1 2 3 4 5 6 7 8 9 10 # [2,] 0 0 0 0 0 0 0 2 3 4 # [3,] 0 0 0 0 3 4 5 6 7 8 # [4,] 0 0 0 0 0 0 0 0 0 4 pad_sequences(sequences=exList[2:4], maxlen=5, \t\t\ttruncating="pre", padding="pre", value=0) # [,1] [,2] [,3] [,4] [,5] # [1,] 0 0 2 3 4 # [2,] 4 5 6 7 8 # [3,] 0 0 0 0 4 pad_sequences(sequences=exList[2:4], maxlen=5, \t\t\ttruncating="post", padding="pre", value=0) # [,1] [,2] [,3] [,4] [,5] # [1,] 0 0 2 3 4 # [2,] 3 4 5 6 7 # [3,] 0 0 0 0 4 pad_sequences(sequences=exList[2:4], maxlen=5, \t\t\ttruncating="post", padding="post", value=0) # [,1] [,2] [,3] [,4] [,5] # [1,] 2 3 4 0 0 # [2,] 3 4 5 6 7 # [3,] 4 0 0 0 0 pad_sequences(sequences=exList[2:4], maxlen=5, \t\t\ttruncating="post", padding="post", value=99) # [,1] [,2] [,3] [,4] [,5] # [1,] 2 3 4 99 99 # [2,] 3 4 5 6 7 # [3,] 4 99 99 99 99
way2. uF_vectorize_sequences
uF_vectorize_sequences <- function(txxDataList, dimemsion=10000){ results <- matrix(0, nrow=length(txxDataList), ncol=dimemsion) # all 0 matrix [25000, 10000] for (sample in seq_along(txxDataList)){ results[sample, txxDataList[[sample]] ] <- 1 } return(results) }
exList <- list( c( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), \t\t\t\tc( 2, 3, 4 ), \t\t\t\tc( 3, 4, 5, 6, 7, 8), \t\t\t\tc( 4 )) uF_vectorize_sequences <- function(txxdataList, dimemsion=10000){ results <- matrix(0, nrow=length(txxdataList), ncol=dimemsion) # all 0 matrix for (i in seq_along(txxdataList)){ results[i, txxdataList[[i]] ] <- 1 } return(results) } uF_vectorize_sequences(exList, dimemsion=10) # [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] # [1,] 1 1 1 1 1 1 1 1 1 1 # [2,] 0 1 1 1 0 0 0 0 0 0 # [3,] 0 0 1 1 1 1 1 1 0 0 # [4,] 0 0 0 1 0 0 0 0 0 0
trn_Data <- uF_vectorize_sequences(trnData) tst_Data <- uF_vectorize_sequences(tstData) # trnData %>% length() # 25000 No. of samples # lapply(trnData, max) %>% unlist %>% max # 9999 No. of max element in all list # trnData[[1]] %>% unique() %>% length() # 120 # trn_Data[1,] %>% sum() # 120
## ` ` Ploting ----------------------------------------------------------------- mxA <- trn_Data[seq_len(20), seq_len(10)] dtA <- data.table(row=str_c("r", rep(seq_len(nrow(mxA)), times=ncol(mxA))), \t\t\t\t\t\t\t\t\tcol=str_c("c", rep(seq_len(ncol(mxA)), each=nrow(mxA))), \t\t\t\t\t\t\t\t\tval=c(mxA)) dtA[, row:=factor(row, levels=str_c("r", rep(seq_len(nrow(mxA)))) %>% rev())] dtA[, col:=factor(col, levels=str_c("c", rep(seq_len(ncol(mxA)))) )] dtA %>% ggplot(aes(x=col, y=row)) + geom_tile(aes(fill=val), color="white", size=.6) + \tscale_fill_gradient(low="white", high="red") + labs(x="sample", y="variable", title="Matrix ggplot")
code
source(file.path(getwd(),"../00.global_dl.R")) ### Title: IMDB binary classification --- --- --- -- --- --- --- --- --- --- ---- # Deep Learning with R by François Chollet :: 3.4 Classifying movie reviews ##1. Loading DATA ------------------------------------------------------- imdb <- readRDS(file.path(DATA_PATH,"imdb.RData")) c(c(trnData, trnLabels), c(tstData, tstLabels)) %<-% imdb ##2# Preprocess : rescale & reshape ------------------------------------------------------- ### way1 MAX_LEN <- 20 trnData <- pad_sequences(trnData, maxlen=MAX_LEN) tstData <- pad_sequences(tstData, maxlen=MAX_LEN) ### way2 uF_vectorize_sequences <- function(txxDataList, dimemsion=10000){ results <- matrix(0, nrow=length(txxDataList), ncol=dimemsion) # all 0 matrix [25000, 10000] for (sample in seq_along(txxDataList)){ results[sample, txxDataList[[sample]] ] <- 1 } return(results) } trnData <- uF_vectorize_sequences(trnData) tstData <- uF_vectorize_sequences(tstData)