data.table reshape (dcast,melt)
DT %>% melt.data.table(id.vars= c("col1"), measure.vars=c("col2"), variable.name="var", value.name="val") DT %>% dcast.data.table(formula= col1+col2 ~ co3, value.var=c("col4"))
https://github.com/rstudio/cheatsheets/blob/master/data-import.pdf
melt, dcast ==> gather, spread
#--- --- --- ----- table4a %>% as.data.table() moltenDD <- as.data.table(table4a) %>% melt.data.table(id.vars="country", variable.name="year", value.name="case") moltenDD %>% dcast.data.table(formula=country~year, value.var="case") table4a dcastDD <- table4a %>% gather(`1999`, `2000`, key="year", value="case") dcastDD %>% spread(key=year, value=case) #--- --- --- ----- table2 %>% as.data.table() dcastDD <- as.data.table(table2) %>% dcast.data.table(country+year ~ type, value.var="count") dcastDD %>% melt.data.table(id.vars=c('country','year'),\tvariable.name="cases", value.name="count") table2 moltenDD <- table2 %>% spread(key=type, value=count) #table2 %>% spread(key=year, value=count) moltenDD %>% gather(`cases`, `population`, key="type", value="count")
melt, dcast
DT0 <- data.table(x=1:20, y1=rnorm(20), y2=rnorm(20,.5)) # x y1 y2 # 1: 1 -0.58122437 0.58533709 # 2: 2 -1.01616426 0.73756413 # 3: 3 -0.36792958 -0.03372934 # ... # 19: 19 0.56873638 1.41555847 # 20: 20 1.46933741 -0.63625173 moltenDT <- DT0 %>% melt.data.table(id.vars=c("x"), measure.vars=c("y1","y2"), \t\t\t\t\t\t\t\t variable.name="Var", value.name="Val") # x Var Val # 1: 1 y1 -0.58122437 # 2: 2 y1 -1.01616426 # .... # 39: 19 y2 1.41555847 # 40: 20 y2 -0.63625173 dcastDT <- moltenDT %>% dcast.data.table(formula = x ~ Var, \t\t\t\t\t\t\t\t\t value.var = c('Val'))
dcast하면 key 가 생긴다.
all.equal(DT0, dcastDT) # [1] "Datasets has different keys. 'target' has no key. 'current': x." DT0 %>% key() # NULL dcastDT %>% key() # [1] "x" DT0 %>% setkey(x) identical(DT0, dcastDT) # TRUE
Timeserise to data.table
passenger <- tsbox::ts_dt(AirPassengers)
melt.data.table() & dcast.data.table()
data.table에 맞게 reshape2 package의 함수를 수정한 dcast & melt
Convert DT
to long form where each dob
is a separate observation.
rs <- dcast.data.table(rs,"Step~variable")
data_melt <- melt.data.table(data, id.vars = c("A", "B", "C"))
data_cast <- dcast.data.table(result, id ~ Param)
DT <- data.table( i_1 = c(1:5, NA), i_2 = c(NA,6,7,8,9,10), f_1 = factor(sample(c(letters[1:3], NA), 6, TRUE)), f_2 = factor(c("z", "a", "x", "c", "x", "x"), ordered=TRUE), c_1 = sample(c(letters[1:3], NA), 6, TRUE), d_1 = as.Date(c(1:3,NA,4:5), origin="2013-09-01"), d_2 = as.Date(6:1, origin="2012-01-01") ) DT[, l_1 := DT[, list(c=list(rep(i_1, sample(5,1)))), by = i_1]$c] DT[, l_2 := DT[, list(c=list(rep(c_1, sample(5,1)))), by = i_1]$c] # i_1 i_2 f_1 f_2 c_1 d_1 d_2 l_1 l_2 # 1: 1 NA a z c 2013-09-02 2012-01-07 1,1,1,1,1 c,c,c,c # 2: 2 6 c a c 2013-09-03 2012-01-06 2,2,2,2 c,c,c,c # 3: 3 7 b x2013-09-04 2012-01-05 3,3,3,3 NA,NA,NA # 4: 4 8 c c 2012-01-04 4,4,4,4 c,c,c,c,c # 5: 5 9 c x 2013-09-05 2012-01-03 5 NA # 6: NA 10 b x c 2013-09-06 2012-01-02 NA,NA,NA,NA,NA c,c # id, measure ------------------------------------------------------------------ # as character/integer/numeric vectors DT %>% melt.data.table(id.vars=1:2, measure.vars=c("f_1")) DT %>% melt.data.table(id.vars=c("i_1","i_2"), measure.vars=3) # same as above DT %>% melt.data.table(id.vars=1:2, measure.vars=3L, value.factor=T) # same, but 'value' is factor DT %>% melt.data.table(id.vars=1:2, measure.vars=3:4, value.factor=T) # 'value' is *ordered* factor # preserves attribute when types are identical, ex: Date DT %>% melt.data.table(id.vars=3:4, measure.vars=c("d_1","d_2")) DT %>% melt.data.table(id.vars=3:4, measure.vars=c("i_1","d_1")) # attribute not preserved # on list DT %>% melt.data.table(id.vars=c("l_1", "l_2")) # value is a list DT %>% melt.data.table(id.vars=c("c_1", "l_1")) # c1 coerced to list # on character DT %>% melt.data.table(id.vars=c("c_1", "f_1"), measure.vars=c("i_1","i_2","f_1","f_2","c_1","d_1","d_2")) # value is char DT %>% melt.data.table(id.vars=c("c_1", "i_2")) # i2 coerced to char # on na.rm=TRUE. NAs are removed efficiently, from within C DT %>% melt.data.table(id.vars=c("c_1", "i_2"), na.rm=TRUE) # remove NA # measure.vars can be also a list # melt "f_1,f_2" and "d_1,d_2" simultaneously, retain 'factor' attribute # convenient way using internal function patterns() melt(DT, id=1:2, measure=patterns("^f_", "^d_"), value.factor=TRUE) # same as above, but provide list of columns directly by column names or indices melt(DT, id=1:2, measure=list(3:4, c("d_1", "d_2")), value.factor=TRUE) # same as above, but provide names directly: melt(DT, id=1:2, measure=patterns(f="^f_"), value.factor=TRUE) # na.rm=TRUE removes rows with NAs in any 'value' columns melt(DT, id=1:2, measure=patterns("f_", "d_"), value.factor=TRUE, na.rm=TRUE) # return 'NA' for missing columns, 'na.rm=TRUE' ignored due to list column melt(DT, id=1:2, measure=patterns("l_", "c_"), na.rm=TRUE) dd merged.df melt.data.table(data.table(merged.df), id=1, measure=patterns("^name"), value.factor=F) make.nam
Examples
Q1. Calculate total number of rows by month and then sort on descending order
mydata[, .N, by = month] [order(-N)]
The .N operator is used to find count.
Q2. Find top 3 months with high mean arrival delay
mydata[, .(mean_arr_delay = mean(arr_delay, na.rm = TRUE)), by = month][order(-mean_arr_delay)][1:3]
Q3. Find origin of flights having average total delay is greater than 20 minutes
mydata[, lapply(.SD, mean, na.rm = TRUE), .SDcols = c("arr_delay", "dep_delay"), by = origin][(arr_delay + dep_delay) > 20]
Q4. Extract average of arrival and departure delays for carrier == 'DL' by 'origin' and 'dest' variables
mydata[carrier == "DL",
lapply(.SD, mean, na.rm = TRUE),
by = .(origin, dest),
.SDcols = c("arr_delay", "dep_delay")]
Q5. Pull first value of 'air_time' by 'origin' and then sum the returned values when it is greater than 300
mydata[, .SD[1], .SDcols="air_time", by=origin][air_time > 300, sum(air_time)]