data.table reshape (dcast,melt)
DT %>% melt.data.table(id.vars= c("col1"), measure.vars=c("col2"),
variable.name="var", value.name="val")
DT %>% dcast.data.table(formula= col1+col2 ~ co3,
value.var=c("col4"))
https://github.com/rstudio/cheatsheets/blob/master/data-import.pdf
melt, dcast ==> gather, spread
#--- --- --- -----
table4a %>% as.data.table()
moltenDD <- as.data.table(table4a) %>% melt.data.table(id.vars="country", variable.name="year", value.name="case")
moltenDD %>% dcast.data.table(formula=country~year, value.var="case")
table4a
dcastDD <- table4a %>% gather(`1999`, `2000`, key="year", value="case")
dcastDD %>% spread(key=year, value=case)
#--- --- --- -----
table2 %>% as.data.table()
dcastDD <- as.data.table(table2) %>% dcast.data.table(country+year ~ type, value.var="count")
dcastDD %>% melt.data.table(id.vars=c('country','year'),\tvariable.name="cases", value.name="count")
table2
moltenDD <- table2 %>% spread(key=type, value=count)
#table2 %>% spread(key=year, value=count)
moltenDD %>% gather(`cases`, `population`, key="type", value="count")
melt, dcast
DT0 <- data.table(x=1:20, y1=rnorm(20), y2=rnorm(20,.5))
# x y1 y2
# 1: 1 -0.58122437 0.58533709
# 2: 2 -1.01616426 0.73756413
# 3: 3 -0.36792958 -0.03372934
# ...
# 19: 19 0.56873638 1.41555847
# 20: 20 1.46933741 -0.63625173
moltenDT <- DT0 %>% melt.data.table(id.vars=c("x"), measure.vars=c("y1","y2"),
\t\t\t\t\t\t\t\t variable.name="Var", value.name="Val")
# x Var Val
# 1: 1 y1 -0.58122437
# 2: 2 y1 -1.01616426
# ....
# 39: 19 y2 1.41555847
# 40: 20 y2 -0.63625173
dcastDT <- moltenDT %>% dcast.data.table(formula = x ~ Var,
\t\t\t\t\t\t\t\t\t value.var = c('Val'))
dcast하면 key 가 생긴다.
all.equal(DT0, dcastDT) # [1] "Datasets has different keys. 'target' has no key. 'current': x." DT0 %>% key() # NULL dcastDT %>% key() # [1] "x" DT0 %>% setkey(x) identical(DT0, dcastDT) # TRUE
Timeserise to data.table
passenger <- tsbox::ts_dt(AirPassengers)
melt.data.table() & dcast.data.table()
data.table에 맞게 reshape2 package의 함수를 수정한 dcast & melt
Convert DT to long form where each dob is a separate observation.
rs <- dcast.data.table(rs,"Step~variable")
data_melt <- melt.data.table(data, id.vars = c("A", "B", "C"))
data_cast <- dcast.data.table(result, id ~ Param)
DT <- data.table(
i_1 = c(1:5, NA),
i_2 = c(NA,6,7,8,9,10),
f_1 = factor(sample(c(letters[1:3], NA), 6, TRUE)),
f_2 = factor(c("z", "a", "x", "c", "x", "x"), ordered=TRUE),
c_1 = sample(c(letters[1:3], NA), 6, TRUE),
d_1 = as.Date(c(1:3,NA,4:5), origin="2013-09-01"),
d_2 = as.Date(6:1, origin="2012-01-01") )
DT[, l_1 := DT[, list(c=list(rep(i_1, sample(5,1)))), by = i_1]$c]
DT[, l_2 := DT[, list(c=list(rep(c_1, sample(5,1)))), by = i_1]$c]
# i_1 i_2 f_1 f_2 c_1 d_1 d_2 l_1 l_2
# 1: 1 NA a z c 2013-09-02 2012-01-07 1,1,1,1,1 c,c,c,c
# 2: 2 6 c a c 2013-09-03 2012-01-06 2,2,2,2 c,c,c,c
# 3: 3 7 b x 2013-09-04 2012-01-05 3,3,3,3 NA,NA,NA
# 4: 4 8 c c 2012-01-04 4,4,4,4 c,c,c,c,c
# 5: 5 9 c x 2013-09-05 2012-01-03 5 NA
# 6: NA 10 b x c 2013-09-06 2012-01-02 NA,NA,NA,NA,NA c,c
# id, measure ------------------------------------------------------------------
# as character/integer/numeric vectors
DT %>% melt.data.table(id.vars=1:2, measure.vars=c("f_1"))
DT %>% melt.data.table(id.vars=c("i_1","i_2"), measure.vars=3) # same as above
DT %>% melt.data.table(id.vars=1:2, measure.vars=3L, value.factor=T) # same, but 'value' is factor
DT %>% melt.data.table(id.vars=1:2, measure.vars=3:4, value.factor=T) # 'value' is *ordered* factor
# preserves attribute when types are identical, ex: Date
DT %>% melt.data.table(id.vars=3:4, measure.vars=c("d_1","d_2"))
DT %>% melt.data.table(id.vars=3:4, measure.vars=c("i_1","d_1")) # attribute not preserved
# on list
DT %>% melt.data.table(id.vars=c("l_1", "l_2")) # value is a list
DT %>% melt.data.table(id.vars=c("c_1", "l_1")) # c1 coerced to list
# on character
DT %>% melt.data.table(id.vars=c("c_1", "f_1"), measure.vars=c("i_1","i_2","f_1","f_2","c_1","d_1","d_2")) # value is char
DT %>% melt.data.table(id.vars=c("c_1", "i_2")) # i2 coerced to char
# on na.rm=TRUE. NAs are removed efficiently, from within C
DT %>% melt.data.table(id.vars=c("c_1", "i_2"), na.rm=TRUE) # remove NA
# measure.vars can be also a list
# melt "f_1,f_2" and "d_1,d_2" simultaneously, retain 'factor' attribute
# convenient way using internal function patterns()
melt(DT, id=1:2, measure=patterns("^f_", "^d_"), value.factor=TRUE)
# same as above, but provide list of columns directly by column names or indices
melt(DT, id=1:2, measure=list(3:4, c("d_1", "d_2")), value.factor=TRUE)
# same as above, but provide names directly:
melt(DT, id=1:2, measure=patterns(f="^f_"), value.factor=TRUE)
# na.rm=TRUE removes rows with NAs in any 'value' columns
melt(DT, id=1:2, measure=patterns("f_", "d_"), value.factor=TRUE, na.rm=TRUE)
# return 'NA' for missing columns, 'na.rm=TRUE' ignored due to list column
melt(DT, id=1:2, measure=patterns("l_", "c_"), na.rm=TRUE)
dd
merged.df
melt.data.table(data.table(merged.df), id=1, measure=patterns("^name"), value.factor=F)
make.nam
Examples
Q1. Calculate total number of rows by month and then sort on descending order
mydata[, .N, by = month] [order(-N)]
The .N operator is used to find count.
Q2. Find top 3 months with high mean arrival delay
mydata[, .(mean_arr_delay = mean(arr_delay, na.rm = TRUE)), by = month][order(-mean_arr_delay)][1:3]
Q3. Find origin of flights having average total delay is greater than 20 minutes
mydata[, lapply(.SD, mean, na.rm = TRUE), .SDcols = c("arr_delay", "dep_delay"), by = origin][(arr_delay + dep_delay) > 20]
Q4. Extract average of arrival and departure delays for carrier == 'DL' by 'origin' and 'dest' variables
mydata[carrier == "DL",
lapply(.SD, mean, na.rm = TRUE),
by = .(origin, dest),
.SDcols = c("arr_delay", "dep_delay")]
Q5. Pull first value of 'air_time' by 'origin' and then sum the returned values when it is greater than 300
mydata[, .SD[1], .SDcols="air_time", by=origin][air_time > 300, sum(air_time)]