imdb nn-data (binary classification)

Published onesixx on

predict a single discrete label

Loading DATA

# Run the shared setup script, located one directory above the current working
# directory (the path is built from getwd(), so the working directory matters).
# NOTE(review): presumably this script attaches the packages used below and
# defines DATA_PATH -- confirm against 00.global_dl.R.
source(file.path(getwd(),"../00.global_dl.R"))
### Title: IMDB binary classification --- --- --- -- --- --- --- --- --- --- ----
# Deep Learning with R by François Chollet :: 3.4 Classifying movie reviews

##1. Loading DATA -------------------------------------------------------
# Alternative kept for reference: obtain the dataset through keras directly.
# imdb <- dataset_imdb(num_words=10000) 
# Read the cached dataset object. Note the file carries an ".RData" extension
# but is read with readRDS(), so it must hold a single serialized R object.
imdb <- readRDS(file.path(DATA_PATH,"imdb.RData"))
# Destructure the nested object into four variables in one step:
# first element -> (trnData, trnLabels), second element -> (tstData, tstLabels).
c(c(trnData, trnLabels), c(tstData, tstLabels)) %<-% imdb

Preprocessing

List -> Matrix

way1. pad_sequence

List를 [sample, feature] 형태의 Matrix로 reshape할 때, 길이가 서로 다른 각 행의 length를 맞추기 위해
잘라내거나(truncating) 특정 value로 채워 넣는(padding) 함수

# Bring every sequence to exactly MAX_LEN entries so the data becomes a matrix.
MAX_LEN <- 20
trnData <- trnData %>% pad_sequences(maxlen=MAX_LEN)
tstData <- tstData %>% pad_sequences(maxlen=MAX_LEN)
# Toy input: four numeric sequences of different lengths (10, 3, 6 and 1).
exList <- list(c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
               c(2, 3, 4),
               c(3, 4, 5, 6, 7, 8),
               c(4))
# With only maxlen given, longer sequences lose their leading entries and
# shorter ones are filled with 0 on the left, as the output below shows.
# (The pasted console prompt "> " was removed so this line runs as a script.)
exList %>% pad_sequences(maxlen=5)
#      [,1] [,2] [,3] [,4] [,5]
# [1,]    6    7    8    9   10
# [2,]    0    0    2    3    4
# [3,]    4    5    6    7    8
# [4,]    0    0    0    0    4
# maxlen=NULL: every row is padded up to the longest sequence (10 entries here).
exList %>%
  pad_sequences(maxlen=NULL, dtype="int32",
                truncating="pre", padding="pre", value=0)
#      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
# [1,]    1    2    3    4    5    6    7    8    9    10
# [2,]    0    0    0    0    0    0    0    2    3     4
# [3,]    0    0    0    0    3    4    5    6    7     8
# [4,]    0    0    0    0    0    0    0    0    0     4

# truncating="pre", padding="pre": cut from the front, fill at the front.
exList[2:4] %>%
  pad_sequences(maxlen=5, truncating="pre", padding="pre", value=0)
#      [,1] [,2] [,3] [,4] [,5]
# [1,]    0    0    2    3    4
# [2,]    4    5    6    7    8
# [3,]    0    0    0    0    4

# truncating="post", padding="pre": cut from the back, fill at the front.
exList[2:4] %>%
  pad_sequences(maxlen=5, truncating="post", padding="pre", value=0)
#      [,1] [,2] [,3] [,4] [,5]
# [1,]    0    0    2    3    4
# [2,]    3    4    5    6    7
# [3,]    0    0    0    0    4

# truncating="post", padding="post": cut from the back, fill at the back.
exList[2:4] %>%
  pad_sequences(maxlen=5, truncating="post", padding="post", value=0)
#      [,1] [,2] [,3] [,4] [,5]
# [1,]    2    3    4    0    0
# [2,]    3    4    5    6    7
# [3,]    4    0    0    0    0


# Same as above, but the filler is 99 instead of 0.
exList[2:4] %>%
  pad_sequences(maxlen=5, truncating="post", padding="post", value=99)
#      [,1] [,2] [,3] [,4] [,5]
# [1,]    2    3    4   99   99
# [2,]    3    4    5    6    7
# [3,]    4   99   99   99   99

way2. uF_vectorize_sequences

# Turn a list of index vectors into a 0/1 indicator matrix.
#
# txxDataList : list with one vector of column indices per sample.
# dimemsion   : number of columns of the result (default 10000).
# Returns a numeric matrix [length(txxDataList), dimemsion] in which cell
# (i, j) is 1 when j occurs in the i-th element, and 0 otherwise.
uF_vectorize_sequences <- function(txxDataList, dimemsion=10000){
  n_samples <- length(txxDataList)
  onehot <- matrix(0, nrow=n_samples, ncol=dimemsion)
  for (row_idx in seq_len(n_samples)) {
    word_ids <- txxDataList[[row_idx]]
    # Repeated indices simply set the same cell to 1 again.
    onehot[row_idx, word_ids] <- 1
  }
  onehot
}
# Toy input: four numeric (double) sequences of lengths 10, 3, 6 and 1.
exList <- list(
  as.numeric(1:10),
  c(2, 3, 4),
  as.numeric(3:8),
  4
)

# Build a 0/1 indicator matrix from a list of index vectors.
#
# txxdataList : list holding, per sample, the column indices to switch on.
# dimemsion   : width of the resulting matrix (default 10000).
# Returns a numeric matrix with one row per list element.
uF_vectorize_sequences <- function(txxdataList, dimemsion=10000){
  n_rows <- length(txxdataList)
  encoded <- matrix(0, nrow=n_rows, ncol=dimemsion)   # starts as all zeros
  for (k in seq_len(n_rows)) {
    hit_cols <- txxdataList[[k]]
    encoded[k, hit_cols] <- 1
  }
  encoded
}

# Demo on the toy list: row i has a 1 in every column listed in exList[[i]].
exList %>% uF_vectorize_sequences(dimemsion=10)
#      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
# [1,]    1    1    1    1    1    1    1    1    1     1
# [2,]    0    1    1    1    0    0    0    0    0     0
# [3,]    0    0    1    1    1    1    1    1    0     0
# [4,]    0    0    0    1    0    0    0    0    0     0

# Encode the real data with the default width of 10000 columns.
# NOTE(review): the function walks its input with length() and [[i]], so
# trnData/tstData should still be the raw lists here, not a padded matrix.
trn_Data <- trnData %>% uF_vectorize_sequences()
tst_Data <- tstData %>% uF_vectorize_sequences()
# Sanity checks (values as recorded by the author):
# trnData %>% length()                     # 25000 No. of samples
# lapply(trnData, max) %>% unlist %>% max  # 9999  largest index in any sample
# trnData[[1]] %>% unique() %>% length()   # 120
# trn_Data[1,] %>% sum()                   # 120
## ` ` Plotting ----------------------------------------------------------------
# Draw the top-left 20 x 10 corner of the encoded matrix as a heat map.
mxA <- trn_Data[seq_len(20), seq_len(10)]
# Long format, one record per cell. c(mxA) unrolls the matrix column by column,
# so row ids cycle fastest (times=) and column ids change slowest (each=).
dtA <- data.table(row=str_c("r", rep(seq_len(nrow(mxA)), times=ncol(mxA))),
                  col=str_c("c", rep(seq_len(ncol(mxA)),  each=nrow(mxA))),
                  val=c(mxA))
# Fix the factor order; row levels are reversed so r1 is drawn at the top.
dtA[, row:=factor(row, levels=rev(str_c("r", seq_len(nrow(mxA)))))]
dtA[, col:=factor(col, levels=str_c("c", seq_len(ncol(mxA))))]

# x carries the matrix columns (variables) and y the matrix rows (samples);
# the axis labels were previously swapped.
dtA %>%
  ggplot(aes(x=col, y=row)) +
  geom_tile(aes(fill=val), color="white", size=.6) +
  scale_fill_gradient(low="white", high="red") +
  labs(x="variable", y="sample", title="Matrix ggplot")

code

# Run the shared setup script, located one directory above the current working
# directory (the path is built from getwd(), so the working directory matters).
# NOTE(review): presumably this script attaches the packages used below and
# defines DATA_PATH -- confirm against 00.global_dl.R.
source(file.path(getwd(),"../00.global_dl.R"))
### Title: IMDB binary classification --- --- --- -- --- --- --- --- --- --- ----
# Deep Learning with R by François Chollet :: 3.4 Classifying movie reviews

##1. Loading DATA  -------------------------------------------------------
# Read the cached dataset object. Note the file carries an ".RData" extension
# but is read with readRDS(), so it must hold a single serialized R object.
imdb <- readRDS(file.path(DATA_PATH,"imdb.RData"))
# Destructure the nested object into four variables in one step:
# first element -> (trnData, trnLabels), second element -> (tstData, tstLabels).
c(c(trnData, trnLabels), c(tstData, tstLabels)) %<-% imdb

##2# Preprocess : rescale & reshape -------------------------------------------------------
# way1 and way2 are ALTERNATIVE encodings of the same raw list of sequences.
# They must not be chained: after way1 the data is a padded matrix, and way2
# would then treat every single matrix cell as its own sample.
# Pick one here. "way2" is the default because it is the step the original
# script applied last.
PREPROCESS_WAY <- "way2"   # "way1" = pad_sequences, "way2" = 0/1 indicator matrix

# Target sequence length used by way1.
MAX_LEN <- 20

# Turn a list of index vectors into a 0/1 indicator matrix.
#
# txxDataList : list with one vector of column indices per sample.
# dimemsion   : number of columns of the result (default 10000).
# Returns a numeric matrix [length(txxDataList), dimemsion] in which cell
# (i, j) is 1 when j occurs in the i-th element, and 0 otherwise.
uF_vectorize_sequences <- function(txxDataList, dimemsion=10000){
  n_samples <- length(txxDataList)
  onehot <- matrix(0, nrow=n_samples, ncol=dimemsion)
  for (row_idx in seq_len(n_samples)) {
    onehot[row_idx, txxDataList[[row_idx]] ] <- 1
  }
  onehot
}

if (identical(PREPROCESS_WAY, "way1")) {
  ### way1: truncate / pad every sequence to MAX_LEN entries
  trnData <- pad_sequences(trnData, maxlen=MAX_LEN)
  tstData <- pad_sequences(tstData, maxlen=MAX_LEN)
} else if (identical(PREPROCESS_WAY, "way2")) {
  ### way2: one indicator column per possible index
  trnData <- uF_vectorize_sequences(trnData)
  tstData <- uF_vectorize_sequences(tstData)
} else {
  stop("PREPROCESS_WAY must be \"way1\" or \"way2\"", call.=FALSE)
}

Categories: Keras

onesixx

Blog Owner

Subscribe
Notify of
guest

0 Comments
Inline Feedbacks
View all comments
0
Would love your thoughts, please comment.x
()
x