imdb 로딩

# Vocabulary size: only the MAX_FEATURES most frequent words keep their own
# index when the dataset is loaded.
MAX_FEATURES <- 10000
imdb <- dataset_imdb(num_words = MAX_FEATURES)
# Cache the dataset in DATA_PATH, which is where the later readRDS() calls
# load it from. Saving to the working directory only worked when the two
# locations happened to coincide.
# NOTE(review): DATA_PATH is assumed to be defined before this runs -- confirm.
saveRDS(imdb, file.path(DATA_PATH, "imdb10000.rds"))


# Named list shipped with the dataset (recorded example entry: fawn 34701).
word_index <- dataset_imdb_word_index()

### word_index: element name = the word, value = its rank by frequency of use
# NOTE(review): the recorded values below (49, 84396, [1] 49) are kept verbatim
# from the original notes; presumably 49 is the rank of "good" and 84396 its
# position inside names(word_index) -- TODO confirm which is which.
# 49
which(str_detect(names(word_index), "^good$"))
# 84396
#[1] 49
# Word/rank table sorted by rank (V1 = word, V2 = rank).
arrange(data.table(names(word_index), unlist(word_index)), V2)
#               V1    V2
#     1:       the     1
#     2:       and     2
#     3:         a     3
#     4:        of     4
#     5:        to     5
# ---                
# 88580:    pipe's 88580
# 88581: copywrite 88581
# 88582:    artbox 88582
# 88583: voorhees' 88583
# 88584:       'l' 88584


# Effect of num_words on the encoded reviews: the recorded str() output
# shows how many tokens collapse to index 2 as the vocabulary shrinks.

# num_words = 1: every token is replaced by 2.
imdb <- dataset_imdb(num_words = 1)
str(imdb)
# List of 2
#  $ train:List of 2
#   ..$ x:List of 25000
#   .. ..$ : int [1:218] 2 2 2 2 2 2 2 2 2 2 ...
#   .. ..$ : int [1:189] 2 2 2 2 2 2 2 2 2 2 ...
#   .. ..$ : int [1:141] 2 2 2 2 2 2 2 2 2 2 ...
# ...
#   ..$ y: int [1:25000] 1 0 0 1 0 0 1 0 1 0 ...
#  $ test :List of 2
#   ..$ x:List of 25000
#   .. ..$ : int [1:68] 2 2 2 2 2 2 2 2 2 2 ...
#   .. ..$ : int [1:260] 2 2 2 2 2 2 2 2 2 2 ...
#   .. ..$ : int [1:603] 2 2 2 2 2 2 2 2 2 2 ...
# ...
#   ..$ y: int [1:25000] 0 1 1 0 1 1 1 0 0 1 ...

# num_words = MAX_FEATURES.
# NOTE(review): the output recorded here replaces indices such as 108 and 530
# with 2 while keeping 88, so it was presumably captured with a much smaller
# MAX_FEATURES (around 100) than the 10000 set in this file -- TODO confirm.
imdb <- dataset_imdb(num_words = MAX_FEATURES)
# List of 2
#  $ train:List of 2
#   ..$ x:List of 25000
#   .. ..$ : int [1:218] 1 14 22 16 43 2 2 2 2 65 ...
#   .. ..$ : int [1:189] 1 2 2 2 2 78 2 5 6 2 ...
#   .. ..$ : int [1:141] 1 14 47 8 30 31 7 4 2 2 ...
# ...
#   ..$ y: int [1:25000] 1 0 0 1 0 0 1 0 1 0 ...
#  $ test :List of 2
#   ..$ x:List of 25000
#   .. ..$ : int [1:68] 1 2 2 14 31 6 2 10 10 2 ...
#   .. ..$ : int [1:260] 1 14 22 2 6 2 7 2 88 12 ...
#   .. ..$ : int [1:603] 1 2 2 2 2 2 2 4 87 2 ...
# ...
#   ..$ y: int [1:25000] 0 1 1 0 1 1 1 0 0 1 ...

# num_words = 10: only indices below 10 survive; the rest become 2.
imdb <- dataset_imdb(num_words = 10)

# List of 2
#  $ train:List of 2
#   ..$ x:List of 25000
#   .. ..$ : int [1:218] 1 2 2 2 2 2 2 2 2 2 ...
#   .. ..$ : int [1:189] 1 2 2 2 2 2 2 5 6 2 ...
#   .. ..$ : int [1:141] 1 2 2 8 2 2 7 4 2 2 ...
# ...
#   ..$ y: int [1:25000] 1 0 0 1 0 0 1 0 1 0 ...
#  $ test :List of 2
#   ..$ x:List of 25000
#   .. ..$ : int [1:68] 1 2 2 2 2 6 2 2 2 2 ...
#   .. ..$ : int [1:260] 1 2 2 2 6 2 7 2 2 2 ...
#   .. ..$ : int [1:603] 1 2 2 2 2 2 2 4 2 2 ...
# ...
#   ..$ y: int [1:25000] 0 1 1 0 1 1 1 0 0 1 ...

# num_words = 10000: the setting used for the rest of these notes.
MAX_FEATURES <- 10000
imdb <- dataset_imdb(num_words = MAX_FEATURES)
# List of 2
#  $ train:List of 2
#   ..$ x:List of 25000
#   .. ..$ : int [1:218] 1 14 22 16 43 530 973 1622 1385 65 ...
#   .. ..$ : int [1:189] 1 194 1153 194 8255 78 228 5 6 1463 ...
#   .. ..$ : int [1:141] 1 14 47 8 30 31 7 4 249 108 ...
# ...
#   ..$ y: int [1:25000] 1 0 0 1 0 0 1 0 1 0 ...
#  $ test :List of 2
#   ..$ x:List of 25000
#   .. ..$ : int [1:68] 1 591 202 14 31 6 717 10 10 2 ...
#   .. ..$ : int [1:260] 1 14 22 3443 6 176 7 5063 88 12 ...
#   .. ..$ : int [1:603] 1 111 748 4368 1133 2 2 4 87 1551 ...
# ...
#   ..$ y: int [1:25000] 0 1 1 0 1 1 1 0 0 1 ...

정수벡터 decoding

> imdb <- dataset_imdb()
> imdb$train$x[[1]] 
  [1]     1    14    22    16    43   530   973  1622  1385    65   458  4468    66  3941     4   173    36   256     5    25   100
 [22]    43   838   112    50   670 22665     9    35   480   284     5   150     4   172   112   167 21631   336   385    39     4
 [43]   172  4536  1111    17   546    38    13   447     4   192    50    16     6   147  2025    19    14    22     4  1920  4613
 [64]   469     4    22    71    87    12    16    43   530    38    76    15    13  1247     4    22    17   515    17    12    16
 [85]   626    18 19193     5    62   386    12     8   316     8   106     5     4  2223  5244    16   480    66  3785    33     4
[106]   130    12    16    38   619     5    25   124    51    36   135    48    25  1415    33     6    22    12   215    28    77
[127]    52     5    14   407    16    82 10311     8     4   107   117  5952    15   256     4 31050     7  3766     5   723    36
[148]    71    43   530   476    26   400   317    46     7     4 12118  1029    13   104    88     4   381    15   297    98    32
[169]  2071    56    26   141     6   194  7486    18     4   226    22    21   134   476    26   480     5   144    30  5535    18
[190]    51    36    28   224    92    25   104     4   226    65    16    38  1334    88    12    16   283     5    16  4472   113
[211]   103    32    15    16  5345    19   178    32
> imdb <- dataset_imdb(num_words=50)
> imdb$train$x[[1]]
  [1]  1 14 22 16 43  2  2  2  2  2  2  2  2  2  4  2 36  2  5 25  2 43  2  2  2  2  2  9 35  2  2  5  2  4  2  2  2  2  2  2 39  4  2
 [44]  2  2 17  2 38 13  2  4  2  2 16  6  2  2 19 14 22  4  2  2  2  4 22  2  2 12 16 43  2 38  2 15 13  2  4 22 17  2 17 12 16  2 18
 [87]  2  5  2  2 12  8  2  8  2  5  4  2  2 16  2  2  2 33  4  2 12 16 38  2  5 25  2  2 36  2 48 25  2 33  6 22 12  2 28  2  2  5 14
[130]  2 16  2  2  8  4  2  2  2 15  2  4  2  7  2  5  2 36  2 43  2  2 26  2  2 46  7  4  2  2 13  2  2  4  2 15  2  2 32  2  2 26  2
[173]  6  2  2 18  4  2 22 21  2  2 26  2  5  2 30  2 18  2 36 28  2  2 25  2  4  2  2 16 38  2  2 12 16  2  5 16  2  2  2 32 15 16  2
[216] 19  2 32

1과 2는 예약된 인덱스(1: 문장 시작, 2: num_words 범위를 벗어난 unknown 단어)이므로 실제 단어를 나타내지 않음.

### reverse_word_index: vector names = rank, vector values = word
# Inverse of word_index (recorded example entry: 34701 fawn).
reverse_word_index <- names(word_index)
names(reverse_word_index) <- word_index

# Decode the first training review back into words.
# Indices 0, 1, 2 are reserved for "padding", "start of sequence", and
# "unknown", so a real word is stored as its rank + 3.
# The lookup is vectorized: a rank with no entry yields NA instead of the
# "subscript out of bounds" error that `[[` raises on a character vector, and
# both reserved and unknown indices are then shown as "?".
first_review <- imdb$train$x[[1]]
decoded_review <- unname(reverse_word_index[as.character(first_review - 3L)])
decoded_review[first_review < 3 | is.na(decoded_review)] <- "?"

> str_c(decoded_review, collapse = " ")
[1] "? this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert redford's is an amazing actor and now the same being director norman's father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for retail and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also congratulations to the two little boy's that played the part's of norman and paul they were just brilliant children are often left out of the praising list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for what they have done don't you think the whole story was so lovely because it was true and was someone's life after all that was shared with us all"

> str_c(decoded_review, collapse = " ")
[1] "? this film was just ? ? ? ? ? ? ? ? ? the ? they ? and you ? just ? ? ? ? ? is an ? ? and ? the ? ? ? ? ? ? from the ? ? ? as ? so i ? the ? ? was a ? ? with this film the ? ? ? the film ? ? it was just ? so ? that i ? the film as ? as it was ? for ? and ? ? it to ? to ? and the ? ? was ? ? ? at the ? it was so ? and you ? ? they ? if you ? at a film it ? have ? ? and this ? was ? ? to the ? ? ? that ? the ? of ? and ? they ? just ? ? are ? ? out of the ? ? i ? ? the ? that ? ? all ? ? are ? a ? ? for the ? film but ? ? are ? and ? be ? for ? they have ? ? you ? the ? ? was so ? ? it was ? and was ? ? ? all that was ? with ? all"

train / test dataset 나누기

# Reload the cached dataset and pull the data/label pairs out by name
# (imdb is a list of train and test, each holding x and y).
imdb <- readRDS(file.path(DATA_PATH, "imdb10000.rds"))
trnData   <- imdb$train$x
trnLabels <- imdb$train$y
tstData   <- imdb$test$x
tstLabels <- imdb$test$y

length(trnData)   # list holding 25,000 samples
length(tstData)   # list holding 25,000 samples
# [1] 25000
trnLabels %>% table()  # 0: Negative, 1: Positive
# .
# 0     1 
# 12500 12500


List -> Matrix

# Bring every review to exactly MAX_LEN tokens so the list of integer vectors
# becomes a 2D integer matrix with one row per review.
MAX_LEN <- 20
trnData <- trnData %>% pad_sequences(maxlen = MAX_LEN)
tstData <- tstData %>% pad_sequences(maxlen = MAX_LEN)

IMDB movie-review sentiment-prediction task

  • 일단 raw 데이터를 받아서,
  • 감상평에서 많이 나온 순서대로 10,000개(feature로 생각되는 words 개수)의 word만을 남겨둔다.
  • 뒤에서부터 20 words 만 남기고 잘라낸다.
    shape은 정수 List 에서 2D 정수 tensor 로 바뀐다.
  • Network은 10,000 words 대상으로 각각 8차원 embedding을 학습한다.
  • 입력 2d tensor 를 embedding하여 3d tensor로 만든다.
  • embedded 3d tensor를 flatten하여 2d tensor (sample, MAX_LEN*8)로 만든다.
  • 분류를 위해 single dense layer을 학습한다.
### Title: IMDB binary classification --- --- --- -- --- --- --- --- --- --- ----
# Deep Learning with R by François Chollet :: 3.4 Classifying movie reviews
# 3.4 Classifying movie reviews: a binary classification example
# Vocabulary size: only the MAX_FEATURES most frequent words keep their own
# index when the dataset is loaded.
MAX_FEATURES <- 10000
imdb <- dataset_imdb(num_words = MAX_FEATURES)
# Cache the dataset in DATA_PATH, the directory the "Loading DATA" step reads
# it back from. Writing to the working directory left the two out of sync
# whenever they differed.
saveRDS(imdb, file.path(DATA_PATH, "imdb10000.rds"))

# Word list shipped with the dataset (recorded example entry: fawn 34701).
word_index <- dataset_imdb_word_index()

# Inverse lookup: values are the words, names are their ranks
# (recorded example entry: 34701 fawn).
reverse_word_index <- names(word_index)
names(reverse_word_index) <- word_index

# Decode the first training review back into words.
# The review is read from imdb$train$x because trnData is not created until
# the "Loading DATA" step below (and is a padded matrix afterwards).
# Indices 0, 1, 2 are reserved for "padding", "start of sequence", and
# "unknown", so a real word is stored as its rank + 3. Reserved indices and
# ranks with no entry are shown as "?".
first_review <- imdb$train$x[[1]]
decoded_review <- unname(reverse_word_index[as.character(first_review - 3L)])
decoded_review[first_review < 3 | is.na(decoded_review)] <- "?"
##1. Loading DATA -------------------------------------------------------
# Read the cached dataset and pull the data/label pairs out by name
# (imdb is a list of train and test, each holding x and y).
imdb <- readRDS(file.path(DATA_PATH, "imdb10000.rds"))
trnData   <- imdb$train$x
trnLabels <- imdb$train$y
tstData   <- imdb$test$x
tstLabels <- imdb$test$y

# Bring every review to exactly MAX_LEN tokens so each dataset becomes a
# 2D integer matrix with one row per review.
MAX_LEN <- 20
trnData <- trnData %>% pad_sequences(maxlen = MAX_LEN)
tstData <- tstData %>% pad_sequences(maxlen = MAX_LEN)

# Network: an 8-dimensional embedding for each of the 10,000 word indices,
# flattened to (samples, MAX_LEN * 8) and fed to a single sigmoid unit.
model <- keras_model_sequential()
model <- layer_embedding(
  model,
  input_dim = 10000,
  output_dim = 8,
  input_length = MAX_LEN
)
model <- layer_flatten(model)
model <- layer_dense(model, units = 1, activation = "sigmoid")

# Binary classification setup: cross-entropy loss, accuracy as the metric.
compile(
  model,
  optimizer = "rmsprop",
  loss = "binary_crossentropy",
  metrics = c("acc")
)

# Train on the padded training reviews with their sentiment labels.
# The targets must be trnLabels (0/1 per review); the original passed tstData,
# the test feature matrix, as y. 20% of the training samples are held out for
# validation.
history <- model %>% fit(trnData, trnLabels,
                         epochs = 10,
                         batch_size = 32,
                         validation_split = 0.2)
