data.table reshape (dcast,melt)

Published by onesixx on

DT %>% melt.data.table(id.vars= c("col1"), measure.vars=c("col2"),
                       variable.name="var", value.name="val")
DT %>% dcast.data.table(formula= col1+col2 ~ co3, 
                        value.var=c("col4"))

https://github.com/rstudio/cheatsheets/blob/master/data-import.pdf

melt, dcast ==> gather, spread

#--- --- --- -----
table4a %>% as.data.table()
moltenDD <- as.data.table(table4a) %>% melt.data.table(id.vars="country", variable.name="year", value.name="case")
moltenDD %>% dcast.data.table(formula=country~year, value.var="case")

table4a
dcastDD <- table4a %>% gather(`1999`, `2000`, key="year", value="case")
dcastDD %>% spread(key=year, value=case)

#--- --- --- -----
table2 %>% as.data.table()
dcastDD <- as.data.table(table2) %>% dcast.data.table(country+year ~ type, value.var="count")
dcastDD %>% melt.data.table(id.vars=c('country','year'),\tvariable.name="cases", value.name="count") 

table2
moltenDD <- table2 %>% spread(key=type, value=count)
#table2 %>% spread(key=year, value=count)
moltenDD %>% gather(`cases`, `population`, key="type", value="count")

melt, dcast

DT0 <- data.table(x=1:20, y1=rnorm(20), y2=rnorm(20,.5))
#      x          y1          y2
# 1:   1 -0.58122437  0.58533709
# 2:   2 -1.01616426  0.73756413
# 3:   3 -0.36792958 -0.03372934
# ...
# 19: 19  0.56873638  1.41555847
# 20: 20  1.46933741 -0.63625173

moltenDT <- DT0 %>% melt.data.table(id.vars=c("x"), measure.vars=c("y1","y2"), 
\t\t\t\t\t\t\t\t  variable.name="Var", value.name="Val")
#      x      Var         Val
#  1:  1       y1 -0.58122437
#  2:  2       y1 -1.01616426
# ....
# 39: 19       y2  1.41555847
# 40: 20       y2 -0.63625173

dcastDT <- moltenDT %>% dcast.data.table(formula = x ~ Var,
\t\t\t\t\t\t\t\t\t   value.var = c('Val'))

dcast하면 key 가 생긴다.

all.equal(DT0, dcastDT)
# [1] "Datasets has different keys. 'target' has no key. 'current': x."
DT0 %>% key()         # NULL
dcastDT %>% key()     # [1] "x"

DT0 %>% setkey(x)
identical(DT0, dcastDT) # TRUE

Timeserise to data.table

passenger <- tsbox::ts_dt(AirPassengers)

melt.data.table() & dcast.data.table()

data.table에 맞게 reshape2 package의 함수를 수정한 dcast & melt

Convert DT to long form where each dob is a separate observation.

rs <- dcast.data.table(rs,"Step~variable")

data_melt <- melt.data.table(data, id.vars = c("A", "B", "C"))
data_cast <- dcast.data.table(result, id ~ Param)

DT <- data.table(
  i_1 = c(1:5, NA),
  i_2 = c(NA,6,7,8,9,10),
  f_1 = factor(sample(c(letters[1:3], NA), 6, TRUE)),
  f_2 = factor(c("z", "a", "x", "c", "x", "x"), ordered=TRUE),
  c_1 = sample(c(letters[1:3], NA), 6, TRUE),
  d_1 = as.Date(c(1:3,NA,4:5), origin="2013-09-01"),
  d_2 = as.Date(6:1, origin="2012-01-01") )
DT[, l_1 := DT[, list(c=list(rep(i_1, sample(5,1)))), by = i_1]$c]
DT[, l_2 := DT[, list(c=list(rep(c_1, sample(5,1)))), by = i_1]$c]
#    i_1 i_2  f_1 f_2  c_1        d_1        d_2            l_1       l_2
# 1:   1  NA    a   z    c 2013-09-02 2012-01-07      1,1,1,1,1   c,c,c,c
# 2:   2   6    c   a    c 2013-09-03 2012-01-06        2,2,2,2   c,c,c,c
# 3:   3   7    b   x  2013-09-04 2012-01-05        3,3,3,3  NA,NA,NA
# 4:   4   8    c    c        2012-01-04        4,4,4,4 c,c,c,c,c
# 5:   5   9    c   x  2013-09-05 2012-01-03              5        NA
# 6:  NA  10    b   x    c 2013-09-06 2012-01-02 NA,NA,NA,NA,NA       c,c

# id, measure ------------------------------------------------------------------
# as character/integer/numeric vectors
DT %>% melt.data.table(id.vars=1:2, measure.vars=c("f_1"))
DT %>% melt.data.table(id.vars=c("i_1","i_2"), measure.vars=3) # same as above
DT %>% melt.data.table(id.vars=1:2, measure.vars=3L,  value.factor=T) # same, but 'value' is factor
DT %>% melt.data.table(id.vars=1:2, measure.vars=3:4, value.factor=T) # 'value' is *ordered* factor

# preserves attribute when types are identical, ex: Date
DT %>% melt.data.table(id.vars=3:4, measure.vars=c("d_1","d_2"))
DT %>% melt.data.table(id.vars=3:4, measure.vars=c("i_1","d_1")) # attribute not preserved

# on list
DT %>% melt.data.table(id.vars=c("l_1", "l_2")) # value is a list
DT %>% melt.data.table(id.vars=c("c_1", "l_1")) # c1 coerced to list

# on character
DT %>% melt.data.table(id.vars=c("c_1", "f_1"), measure.vars=c("i_1","i_2","f_1","f_2","c_1","d_1","d_2")) # value is char
DT %>% melt.data.table(id.vars=c("c_1", "i_2")) # i2 coerced to char

# on na.rm=TRUE. NAs are removed efficiently, from within C
DT %>% melt.data.table(id.vars=c("c_1", "i_2"), na.rm=TRUE) # remove NA

# measure.vars can be also a list
# melt "f_1,f_2" and "d_1,d_2" simultaneously, retain 'factor' attribute
# convenient way using internal function patterns()
melt(DT, id=1:2, measure=patterns("^f_", "^d_"), value.factor=TRUE)
# same as above, but provide list of columns directly by column names or indices
melt(DT, id=1:2, measure=list(3:4, c("d_1", "d_2")), value.factor=TRUE)
# same as above, but provide names directly:
melt(DT, id=1:2, measure=patterns(f="^f_"), value.factor=TRUE)

# na.rm=TRUE removes rows with NAs in any 'value' columns
melt(DT, id=1:2, measure=patterns("f_", "d_"), value.factor=TRUE, na.rm=TRUE)

# return 'NA' for missing columns, 'na.rm=TRUE' ignored due to list column
melt(DT, id=1:2, measure=patterns("l_", "c_"), na.rm=TRUE)

dd
merged.df
melt.data.table(data.table(merged.df), id=1, measure=patterns("^name"), value.factor=F)
make.nam

Examples

Q1. Calculate total number of rows by month and then sort on descending order

mydata[, .N, by = month] [order(-N)]

The .N operator is used to find count.  

Q2. Find top 3 months with high mean arrival delay

mydata[, .(mean_arr_delay = mean(arr_delay, na.rm = TRUE)), by = month][order(-mean_arr_delay)][1:3]

Q3. Find origin of flights having average total delay is greater than 20 minutes

mydata[, lapply(.SD, mean, na.rm = TRUE), .SDcols = c("arr_delay", "dep_delay"), by = origin][(arr_delay + dep_delay) > 20]

Q4.  Extract average of arrival and departure delays for carrier == 'DL' by 'origin' and 'dest' variables

mydata[carrier == "DL",
        lapply(.SD, mean, na.rm = TRUE),
        by = .(origin, dest),
        .SDcols = c("arr_delay", "dep_delay")]

Q5. Pull first value of 'air_time' by 'origin' and then sum the returned values when it is greater than 300

mydata[, .SD[1], .SDcols="air_time", by=origin][air_time > 300, sum(air_time)]

Categories: Reshaping

onesixx

Blog Owner

Subscribe
Notify of
guest

0 Comments
Inline Feedbacks
View all comments
0
Would love your thoughts, please comment.x
()
x