data.table Tip

Published by onesixx on

Using thread

?getDTthreads

생략된 rows 보기

많지 않은 데이터 전체를 Console창에서 보고 싶을때,  가장 쉽게  View(데이터.테이블)  도 있지만,
옵션의 max.print를 늘리는 방법이 있다.

options(max.print=6666) 
[ reached getOption("max.print") -- omitted 93 rows ]

# 현재 max.print값 확인 
getOption("max.print")    # [1] 1000

# options값 변경 
options(max.print=2000)

myData %>% print(nrow=666)

display entire row 

data[ , col1 , with=FALSE] %>% print(n=Inf)

Union 

AB <- rbind(A, B)
AB <- AB[!duplicated(AB),]

Shift

shift() example

DT <- data.table(A=1:5)
#    A
# 1: 1
# 2: 2
# 3: 3
# 4: 4
# 5: 5
DT[ , D:= shift(A)]
DT[ , D:= shift(A, n=1, type="lag", fill=NA)]
#    A  D
# 1: 1 NA
# 2: 2  1
# 3: 3  2
# 4: 4  3
# 5: 5  4

DT[ , D:= shift(A, 2, type="lead", fill=0)]
#    A D
# 1: 1 3
# 2: 2 4
# 3: 3 5
# 4: 4 0
# 5: 5 0
https://www.rdocumentation.org/packages/data.table/versions/1.12.8/topics/shift
DT = data.table(year=2010:2014, v1=runif(5), v2=1:5, v3=letters[1:5])

# with names automatically set
DT[, shift(.SD, n=1:2, fill=NA, type="lag", give.names=TRUE), .SDcols=2:4]

# lag columns 'v1,v2,v3' DT by 1 and fill with 0
cols = c("v1","v2","v3")
anscols = str_c("lead", cols, sep="_")
DT[           , (anscols) := shift(.SD, n=1, fill=NA, type="lead"), .SDcols=cols]

# lag/lead in the right order
DT = DT[sample(nrow(DT))]
DT[order(year), (anscols) := shift(.SD, n=1, fill=NA, type="lead"), .SDcols=cols]

shift() vs. diff()

shift(x, n=1L, fill=NA, type=c("lag", "lead", "shift"), give.names=FALSE)
set.seed(666)
DT <- data.table(A=sample(1:10, 6))
#--------------------------------------
DT[ , lagA:= shift(A, n=1, fill=0)]

DT[ , lagDiff1:= A - shift(A, fill=0)]

DT[ , lagDiff2:= A - shift(A)]
DT[is.na(lagDiff2), lagDiff2:=0]

DT[ , oriDiff:= c(0, diff(A))]

#     A lagA lagDiff1 lagDiff2 oriDiff
# 1:  8    0        8        0       0
# 2:  2    8       -6       -6      -6
# 3: 10    2        8        8       8
# 4:  9   10       -1       -1      -1
# 5:  3    9       -6       -6      -6
# 6:  4    3        1        1       1

lapply(.SD 활용

DT[3:5,]
DT[, .SD[3:5]]  #identical
   A B C D
1: b 3 5 4
2: b 1 3 2
3: c 2 4 1

data.table에서는 colSums보다는 lapply(.SD,...) 을 많이 사용한다. cf. rowSums은 rowSums

DT[ , lapply(.SD, sum), .SDcols=2:4]       
    B  C  D
1: 12 21 21
DT[ , colSums(.SD), .SDcols=!"A"]           
 B  C  D 
12 21 21 

.SD는 lapply에서 Column 기준으로 반복

DT[, .SD[2], by=A]                         # 2nd row of each group (group by)
DT[, lapply(.SD, sum), by=A, .SDcols=2:4]  # colSums of each group (group by)
DT[, lapply(.SD, function(x){is.numeric(x)})]
   # 2nd row of each group
      A B C D
   1: a 2 6 6
   2: b 1 3 2
   3: c 3 2 5
  # colSums of each group
      A B C D
   1: a 3 7 9
   2: b 4 8 6
   3: c 5 6 6
 # 
          A    B    C    D
   1: FALSE TRUE TRUE TRUE 

숫자인 열만 가져오기

data <- iris
setDT(data)[ , .SD, .SDcols=is.numeric]

NA 처리

df는 Logical(T/F) matrix를 이용한 Indexing이 가능하지만, data.table은 다르다. (http://onesixx.com/indexing/ )

set.seed(666)
df <- data.frame(
  a=sample(c(1,2,NA), 10, replace=T),
  b=sample(c(1,2,NA), 10, replace=T), 
  c=sample(c(1:5,NA), 10, replace=T))

df[is.na(df)]<-0

colNm <- c("a","b")
df[,colNm][is.na(df[,colNm])]<-0

colIdx <- c(1:2)
df[,colIdx][is.na(df[,colIdx])]<-0

colIdx <- c(1:2)
for(i in colIdx){
  df[,i][is.na(df[,i])]<-0
}","showLines":false,"wrapLines":false,"highlightStart":"9","highlightEnd":"18
set.seed(666) 
DT <- data.table(
  a=sample(c(1,2,NA), 10, replace=T),
  b=sample(c(1,2,NA), 10, replace=T), 
  c=sample(c(1:5,NA), 10, replace=T))

#     a  b  c
# 1: NA NA NA
# 2:  1  1  4
# 3: NA  1  1
# 4:  1  1  1
# 5:  2  1  5
# 6: NA NA  2
# 7: NA  1  1
# 8:  2 NA  4
# 9:  1  2  4
# 10:  1  2 NA
DT[is.na(DT)] <- 0    # data.table은 이것만 가능

remove NA rows

> na.omit(DT)
> DT[ is.finite(rowSums(DT))]
   a b c
1: 1 1 4
2: 1 1 1
3: 2 1 5
4: 1 2 4

https://stackoverflow.com/questions/7235657/fastest-way-to-replace-nas-in-a-large-data-table
colNm <- c("a","b")
colIdx <- c(1:2)
f_dowle3 = function(DT) {
  # by name :
  for (j in names(DT))
    set(DT,which(is.na(DT[[j]])),j,0)

  for (j in colNm)
    set(DT,which(is.na(DT[,..j])),j,0)  
    
  # by number : (slightly faster than by name) :
  for (j in seq_len(ncol(DT)))
    set(DT,which(is.na(DT[[j]])),j,0)
    
  for (j in colIdx)
    set(DT,which(is.na(DT[,..j])),j,0)
}

setDT() vs. as.data.table()

setDT()

typeof()가 list인 object를 data.table로 변환해 준다.
(vector나 column이 하나인 list, matrix, array는 에러난다. )
Copy없이 같은 list를 공유하면서 만들기 때문에 memory 주소가 같다. tracemem()

as.data.table()

Type 상관없이 data.table로 모두 변환해 준다
Copy해서 만들기 때문에 memory 주소가 다르다. tracemem()

Special symbols

?"special-symbols"

. .SD .SDcols .I .N .GRP .BY .EACHI
.SD.BY.N.I and .GRP are read only symbols for use in j. (.N can be used in i as well)

.I (row순번) .N (row 갯수)

DT[, .I]           # row number


DT[, .I[2], by=A]  # 2nd row number of each group
[1] 1 2 3 4 5 6

   A V1
1: a  2
2: b  4
3: c  6

DT[, .N]  # total number of rows in DT
nrow(DT)  # identical 

DT[, .N, by=A][order(-A)] # number of rows in each group
[1] 6

   A N
1: a 2
2: b 2
3: c 2

.GRP

DT[, grp := .GRP, by=x]                 # add a group counter column

.BY

X[, DT[.BY, y, on="x"], by=x]           # join within each group

 := 

 data.table's reference semantics vignette 

update, data의 불필요한 Copy를 방지

Subsetting의 인덱싱 결과는 또 다른 새로운 data.table이다.(즉 원래 DT에 재할당하지 않는 이상, DT는 변하지 않고 유지된다) 따라서, DT의 size가 big한 경우,  := 를 사용하여 컬럼을 참조하여 update함으로써  data의 불필요한 Copy를 방지한다.

Column Type변환

cName <- c('A')
DT[  ,  (cName):= lapply(.SD, as.factor), .SDcols=cName]

cIdx <- which(sapply(DT, is.character))
DT[ , (cIdx):=lapply(.SD, as.factor), .SDcols=cIdx]

여러 방법을 통해 out_idx를 정의하고, 해당 컬럼에 함수(as.factor)를 적용하여,  column type을 covert한다. 
data.table이 각 element가 column인 list로 여겨질수 있다는 것이다.  
(out_idx)를 괄호로 감싸는 이유는 컬럼명으로 인식하게 하기 위해. 

Data.Frame을 사용해서도 같은 결과를 얻을수 있다.

setDF(DT)                              # convert to data.frame for illustration 
sapply(DT[ ,out_idx], is.character)



Transpose

> fread(valuePath)
    V1    x
1: PER 16.7
2: PBR  1.4
3: PCR  7.9
4: PSR  1.6

> fread(valuePath) %>% transpose(make.names="V1")
    PER PBR PCR PSR
1: 16.7 1.4 7.9 1.6

> fread(valuePath) %>% transpose(keep.names="names")
   names   V1  V2  V3  V4
1:    V1  PER PBR PCR PSR
2:     x 16.7 1.4 7.9 1.6
https://github.com/Rdatatable/data.table/wiki/Getting-started
https://stackoverflow.com/questions/1299871/how-to-join-merge-data-frames-inner-outer-left-right
https://stackoverflow.com/questions/8508482/what-does-sd-stand-for-in-data-table-in-r
https://www.analyticsvidhya.com/blog/2016/05/data-table-data-frame-work-large-data-sets/
Use of lapply .SD in data.table R
HTML vignettes: https://github.com/Rdatatable/data.table/wiki/Getting-started

예제

#http://onesixx.com/data-nycflights13/
flights <- nycflights13::flights %>% as.data.table()

#Calculate total number of rows by month and then sort on descending order
flights[, .N, by = month] [order(-N)]
#Find top 3 months with high mean arrival delay
flights[, .(mean_delay=mean(arr_delay, na.rm=T)), by=month][order(-mean_delay)][1:3]
#Find 출발공항(origin)  having average total delay is greater than 20 minutes
flights[, lapply(.SD, mean, na.rm=T), .SDcols=c("arr_delay", "dep_delay"), by=origin][(arr_delay+dep_delay)>20]
#Extract average of arrival and departure delays for carrier == 'DL' by 'origin' and 'dest' variables
flights[carrier=='DL', lapply(.SD, mean, na.rm=T), by=.(origin, dest), .SDcols=c("arr_delay", "dep_delay")]
#Pull first value of 'air_time' by 'origin' and then sum the returned values when it is greater than 300
mydata[, .SD[1], .SDcols="air_time", by=origin][air_time > 300, sum(air_time)]
> example(data.table)
dt.tbl> DF = data.frame(x=rep(c("b","a","c"),each=3), y=c(1,3,6), v=1:9)
dt.tbl> DT = data.table(x=rep(c("b","a","c"),each=3), y=c(1,3,6), v=1:9)
# ---------------------------------------------------------
dt.tbl> DF
  x y v
1 b 1 1
2 b 3 2
3 b 6 3
4 a 1 4
5 a 3 5
6 a 6 6
7 c 1 7
8 c 3 8
9 c 6 9

dt.tbl> DT
   x y v
1: b 1 1
2: b 3 2
3: b 6 3
4: a 1 4
5: a 3 5
6: a 6 6
7: c 1 7
8: c 3 8
9: c 6 9

dt.tbl> identical(dim(DT), dim(DF))    # TRUE
dt.tbl> identical(DF$a, DT$a)          # TRUE
dt.tbl> is.list(DF)                    # TRUE
dt.tbl> is.list(DT)                    # TRUE
dt.tbl> is.data.frame(DT)              # TRUE
dt.tbl> tables()
   NAME NROW NCOL MB  COLS KEY
1:   DT    9    3  0 x,y,v    


# ---------------------------------------------------------
# basic row subset operations
dt.tbl> DT[2]                          # 2nd row
   x y v
1: b 3 2

dt.tbl> DT[3:2]                        # 3rd and 2nd row
   x y v
1: b 6 3
2: b 3 2

dt.tbl> DT[y>2]                        # all rows where DT$y > 2
   x y v
1: b 3 2
2: b 6 3
3: a 3 5
4: a 6 6
5: c 3 8
6: c 6 9

dt.tbl> DT[y>2 & v>5]                  # compound logical expressions
   x y v
1: a 6 6
2: c 3 8
3: c 6 9

dt.tbl> DT[!2:4]                       # all rows other than 2:4
dt.tbl> DT[-(2:4)]                     # same
   x y v
1: b 1 1
2: a 3 5
3: a 6 6
4: c 1 7
5: c 3 8
6: c 6 9

dt.tbl> DT[order(x)]                   # no need for order(DT$x)
dt.tbl> DT[order(x), ]                 # same as above. The ',' is optional
   x y v
1: a 1 4
2: a 3 5
3: a 6 6
4: b 1 1
5: b 3 2
6: b 6 3
7: c 1 7
8: c 3 8
9: c 6 9

# ---------------------------------------------------------
# select|compute columns data.table way
dt.tbl> DT[, v]                        # v column (as vector)
[1] 1 2 3 4 5 6 7 8 9

dt.tbl> DT[, list(v)]                  # v column (as data.table)
dt.tbl> DT[, .(v)]                     # same as above, .() is a shorthand alias to list()
dt.tbl> DT[, c("v")] 
dt.tbl> DT[, c(3)]
dt.tbl> DT[, c(FALSE,FALSE,TRUE)]
   v
1: 1
2: 2
3: 3
4: 4
5: 5
6: 6
7: 7
8: 8
9: 9

dt.tbl> DT[, sum(v)]                   # sum of column v, returned as vector
[1] 45

dt.tbl> DT[, .(sum(v))]                # same, but return data.table (column autonamed V1)
   V1
1: 45

dt.tbl> DT[, .(sv=sum(v))]             # same, but column named "sv"
   sv
1: 45

dt.tbl> DT[, .(v, v*2)]                # return two column data.table, v and v*2
   v V2
1: 1  2
2: 2  4
3: 3  6
4: 4  8
5: 5 10
6: 6 12
7: 7 14
8: 8 16
9: 9 18


# ---------------------------------------------------------
 # subset rows and select|compute data.table way
dt.tbl> DT[2:3, sum(v)]                # sum(v) over rows 2 and 3, return vector
[1] 5

dt.tbl> DT[2:3, .(sum(v))]             # same, but return data.table with column V1
   V1
1:  5

dt.tbl> DT[2:3, .(sv=sum(v))]          # same, but return data.table with column sv
   sv
1:  5

dt.tbl> DT[2:5, cat(v, "\
")]          # just for j's side effect
2 3 4 5 
NULL

# ---------------------------------------------------------
# select columns the data.frame way
dt.tbl> DT[, 2]                        # 2nd column, returns a data.table always
   y
1: 1
2: 3
3: 6
4: 1
5: 3
6: 6
7: 1
8: 3
9: 6

dt.tbl> colNum = 2                     # to refer vars in `j` from the outside of data use `..` prefix
dt.tbl> DT[, ..colNum]                 # same, equivalent to DT[, .SD, .SDcols=colNum]

dt.tbl> DT[["v"]]                      # same as DT[, v] but much faster
[1] 1 2 3 4 5 6 7 8 9



# ---------------------------------------------------------
# grouping operations - j and by
dt.tbl> DT[, sum(v), by=x]             # ad hoc by, order of groups preserved in result
   x V1
1: b  6
2: a 15
3: c 24

dt.tbl> DT[, sum(v), keyby=x]          # same, but order the result on by cols
   x V1
1: a 15
2: b  6
3: c 24

dt.tbl> DT[, sum(v), by=x][order(x)]   # same but by chaining expressions together
   x V1
1: a 15
2: b  6
3: c 24



# ---------------------------------------------------------
# fast ad hoc row subsets (subsets as joins)
dt.tbl> DT[x=="a"]                     # same, single "==" internally optimised to use binary search (fast)
   x y v
1: a 1 4
2: a 3 5
3: a 6 6

dt.tbl> DT["a", on="x"]             # same as x == "a" but uses binary search (fast)
dt.tbl> DT["a", on=.(x)]            # same, for convenience, no need to quote every column
dt.tbl> DT[.("a"), on="x"]          # same
   x y v
1: a 1 4
2: a 3 5
3: a 6 6

> DT
   x y v
1: b 1 1
2: b 3 2
3: b 6 3
4: a 1 4
5: a 3 5
6: a 6 6
7: c 1 7
8: c 3 8
9: c 6 9

> DT[.(1:2), on=.(y)]
      x y  v
1:    b 1  1
2:    a 1  4
3:    c 1  7
4:  2 NA

> DT[.(1:2), on=.(y), nomatch=NULL]
   x y v
1: b 1 1
2: a 1 4
3: c 1 7
> DT[.(1:2), on=.(y), roll=Inf]
   x y v
1: b 1 1
2: a 1 4
3: c 1 7
4: c 2 7
> DT[.(1:2), on=.(y), roll=-Inf]
   x y v
1: b 1 1
2: a 1 4
3: c 1 7
4: b 2 2


dt.tbl> DT[x!="b" | y!=3]              # not yet optimized, currently vector scan subset
   x y v
1: b 1 1
2: b 6 3
3: a 1 4
4: a 3 5
5: a 6 6
6: c 1 7
7: c 3 8
8: c 6 9
dt.tbl> DT[x=="b" & y==3, ]
dt.tbl> DT[.("b", 3), on=c("x", "y")]  # join on columns x,y of DT; uses binary search (fast)
dt.tbl> DT[.("b", 3), on=.(x, y)]      # same, but using on=.()
   x y v
1: b 3 2

dt.tbl> DT[.("b", 1:2), on=c("x", "y")]             # no match returns NA
   x y  v
1: b 1  1
2: b 2 NA

dt.tbl> DT[.("b", 1:2), on=.(x, y), nomatch=NULL]   # no match row is not returned
   x y v
1: b 1 1

dt.tbl> DT[.("b", 1:2), on=c("x", "y"), roll=Inf]   # locf, nomatch row gets rolled by previous row
   x y v
1: b 1 1
2: b 2 1

dt.tbl> DT[.("b", 1:2), on=.(x, y), roll=-Inf]      # nocb, nomatch row gets rolled by next row
   x y v
1: b 1 1
2: b 2 2

dt.tbl> DT["b", sum(v*y), on="x"]                   # on rows where DT$x=="b", calculate sum(v*y)
[1] 25

# ---------------------------------------------------------
# all together now
dt.tbl> DT[x!="a", sum(v), by=x]                    # get sum(v) by "x" for each i != "a"
   x V1
1: b  6
2: c 24

dt.tbl> DT[!"a", sum(v), by=.EACHI, on="x"]         # same, but using subsets-as-joins
   x V1
1: b  6
2: c 24

dt.tbl> DT[c("b","c"), sum(v), by=.EACHI, on="x"]   # same
   x V1
1: b  6
2: c 24

dt.tbl> DT[c("b","c"), sum(v), by=.EACHI, on=.(x)]  # same, using on=.()
   x V1
1: b  6
2: c 24

# ---------------------------------------------------------
# joins as subsets
dt.tbl> X = data.table(x=c("c","b"), v=8:7, foo=c(4,2))
   x v foo
1: c 8   4
2: b 7   2

dt.tbl> DT[X, on="x"]                         # right join
   x y v i.v foo
1: c 1 7   8   4
2: c 3 8   8   4
3: c 6 9   8   4
4: b 1 1   7   2
5: b 3 2   7   2
6: b 6 3   7   2

dt.tbl> X[DT, on="x"]                         # left join
   x  v foo y i.v
1: b  7   2 1   1
2: b  7   2 3   2
3: b  7   2 6   3
4: a NA  NA 1   4
5: a NA  NA 3   5
6: a NA  NA 6   6
7: c  8   4 1   7
8: c  8   4 3   8
9: c  8   4 6   9

dt.tbl> DT[X, on="x", nomatch=NULL]           # inner join
   x y v i.v foo
1: c 1 7   8   4
2: c 3 8   8   4
3: c 6 9   8   4
4: b 1 1   7   2
5: b 3 2   7   2
6: b 6 3   7   2

dt.tbl> DT[!X, on="x"]                        # not join
   x y v
1: a 1 4
2: a 3 5
3: a 6 6

dt.tbl> DT[X, on=c(y="v")]                    # join using column "y" of DT with column "v" of X
      x y  v i.x foo
1:  8 NA   c   4
2:  7 NA   b   2

dt.tbl> DT[X, on="y==v"]                      # same as above (v1.9.8+)
      x y  v i.x foo
1:  8 NA   c   4
2:  7 NA   b   2

dt.tbl> DT[X, on=.(y<=foo)]                   # NEW non-equi join (v1.9.8+)
   x y v i.x i.v
1: b 4 1   c   8
2: b 4 2   c   8
3: a 4 4   c   8
4: a 4 5   c   8
5: c 4 7   c   8
6: c 4 8   c   8
7: b 2 1   b   7
8: a 2 4   b   7
9: c 2 7   b   7

dt.tbl> DT[X, on="y<=foo"]                    # same as above
   x y v i.x i.v
1: b 4 1   c   8
2: b 4 2   c   8
3: a 4 4   c   8
4: a 4 5   c   8
5: c 4 7   c   8
6: c 4 8   c   8
7: b 2 1   b   7
8: a 2 4   b   7
9: c 2 7   b   7

dt.tbl> DT[X, on=c("y<=foo")]                 # same as above
   x y v i.x i.v
1: b 4 1   c   8
2: b 4 2   c   8
3: a 4 4   c   8
4: a 4 5   c   8
5: c 4 7   c   8
6: c 4 8   c   8
7: b 2 1   b   7
8: a 2 4   b   7
9: c 2 7   b   7

dt.tbl> DT[X, on=.(y>=foo)]                   # NEW non-equi join (v1.9.8+)
   x y v i.x i.v
1: b 4 3   c   8
2: a 4 6   c   8
3: c 4 9   c   8
4: b 2 2   b   7
5: b 2 3   b   7
6: a 2 5   b   7
7: a 2 6   b   7
8: c 2 8   b   7
9: c 2 9   b   7

dt.tbl> DT[X, on=.(x, y<=foo)]                # NEW non-equi join (v1.9.8+)
   x y v i.v
1: c 4 7   8
2: c 4 8   8
3: b 2 1   7

dt.tbl> DT[X, .(x,y,x.y,v), on=.(x, y>=foo)]  # Select x's join columns as well
   x y x.y v
1: c 4   6 9
2: b 2   3 2
3: b 2   6 3

dt.tbl> DT[X, on="x", mult="first"]           # first row of each group
   x y v i.v foo
1: c 1 7   8   4
2: b 1 1   7   2

dt.tbl> DT[X, on="x", mult="last"]            # last row of each group
   x y v i.v foo
1: c 6 9   8   4
2: b 6 3   7   2

dt.tbl> DT[X, sum(v), by=.EACHI, on="x"]      # join and eval j for each row in i
   x V1
1: c 24
2: b  6

dt.tbl> DT[X, sum(v)*foo, by=.EACHI, on="x"]  # join inherited scope
   x V1
1: c 96
2: b 12

dt.tbl> DT[X, sum(v)*i.v, by=.EACHI, on="x"]  # 'i,v' refers to X's v column
   x  V1
1: c 192
2: b  42

dt.tbl> DT[X, on=.(x, v>=v), sum(y)*foo, by=.EACHI] # NEW non-equi join with by=.EACHI (v1.9.8+)
   x v V1
1: c 8 36
2: b 7 NA

# ---------------------------------------------------------
# setting keys
dt.tbl> kDT = copy(DT)                        # (deep) copy DT to kDT to work with it.

dt.tbl> setkey(kDT,x)                         # set a 1-column key. No quotes, for convenience.

dt.tbl> setkeyv(kDT,"x")                      # same (v in setkeyv stands for vector)


dt.tbl> v="x"
dt.tbl> setkeyv(kDT,v)                        # same

dt.tbl> # key(kDT)<-"x"                       # copies whole table, please use set* functions instead
dt.tbl> haskey(kDT)                           # TRUE
[1] TRUE

dt.tbl> key(kDT)                              # "x"
[1] "x"

dt.tbl> # fast *keyed* subsets
dt.tbl> kDT["a"]                              # subset-as-join on *key* column 'x'
   x y v
1: a 1 4
2: a 3 5
3: a 6 6

dt.tbl> kDT["a", on="x"]                      # same, being explicit using 'on=' (preferred)
   x y v
1: a 1 4
2: a 3 5
3: a 6 6

dt.tbl> # all together
dt.tbl> kDT[!"a", sum(v), by=.EACHI]          # get sum(v) for each i != "a"
   x V1
1: b  6
2: c 24

dt.tbl> # multi-column key
dt.tbl> setkey(kDT,x,y)                       # 2-column key

dt.tbl> setkeyv(kDT,c("x","y"))               # same

dt.tbl> # fast *keyed* subsets on multi-column key
dt.tbl> kDT["a"]                              # join to 1st column of key
   x y v
1: a 1 4
2: a 3 5
3: a 6 6

dt.tbl> kDT["a", on="x"]                      # on= is optional, but is preferred
   x y v
1: a 1 4
2: a 3 5
3: a 6 6

dt.tbl> kDT[.("a")]                           # same, .() is an alias for list()
   x y v
1: a 1 4
2: a 3 5
3: a 6 6

dt.tbl> kDT[list("a")]                        # same
   x y v
1: a 1 4
2: a 3 5
3: a 6 6

dt.tbl> kDT[.("a", 3)]                        # join to 2 columns
   x y v
1: a 3 5

dt.tbl> kDT[.("a", 3:6)]                      # join 4 rows (2 missing)
   x y  v
1: a 3  5
2: a 4 NA
3: a 5 NA
4: a 6  6

dt.tbl> kDT[.("a", 3:6), nomatch=NULL]        # remove missing
   x y v
1: a 3 5
2: a 6 6

dt.tbl> kDT[.("a", 3:6), roll=TRUE]           # locf rolling join
   x y v
1: a 3 5
2: a 4 5
3: a 5 5
4: a 6 6

dt.tbl> kDT[.("a", 3:6), roll=Inf]            # same as above
   x y v
1: a 3 5
2: a 4 5
3: a 5 5
4: a 6 6

dt.tbl> kDT[.("a", 3:6), roll=-Inf]           # nocb rolling join
   x y v
1: a 3 5
2: a 4 6
3: a 5 6
4: a 6 6

dt.tbl> kDT[!.("a")]                          # not join
   x y v
1: b 1 1
2: b 3 2
3: b 6 3
4: c 1 7
5: c 3 8
6: c 6 9

dt.tbl> kDT[!"a"]                             # same
   x y v
1: b 1 1
2: b 3 2
3: b 6 3
4: c 1 7
5: c 3 8
6: c 6 9

dt.tbl> # more on special symbols, see also ?"special-symbols"
dt.tbl> DT[.N]                                  # last row
   x y v
1: c 6 9

dt.tbl> DT[, .N]                                # total number of rows in DT
[1] 9

dt.tbl> DT[, .N, by=x]                          # number of rows in each group
   x N
1: b 3
2: a 3
3: c 3

dt.tbl> DT[, .SD, .SDcols=x:y]                  # select columns 'x' through 'y'
   x y
1: b 1
2: b 3
3: b 6
4: a 1
5: a 3
6: a 6
7: c 1
8: c 3
9: c 6

dt.tbl> DT[ , .SD, .SDcols = !x:y]              # drop columns 'x' through 'y'
   v
1: 1
2: 2
3: 3
4: 4
5: 5
6: 6
7: 7
8: 8
9: 9

dt.tbl> DT[ , .SD, .SDcols = patterns('^[xv]')] # select columns matching '^x' or '^v'
   x v
1: b 1
2: b 2
3: b 3
4: a 4
5: a 5
6: a 6
7: c 7
8: c 8
9: c 9

dt.tbl> DT[, .SD[1]]                            # first row of all columns
   x y v
1: b 1 1

dt.tbl> DT[, .SD[1], by=x]                      # first row of 'y' and 'v' for each group in 'x'
   x y v
1: b 1 1
2: a 1 4
3: c 1 7

dt.tbl> DT[, c(.N, lapply(.SD, sum)), by=x]     # get rows *and* sum columns 'v' and 'y' by group
   x N  y  v
1: b 3 10  6
2: a 3 10 15
3: c 3 10 24

dt.tbl> DT[, .I[1], by=x]                       # row number in DT corresponding to each group
   x V1
1: b  1
2: a  4
3: c  7

dt.tbl> DT[, grp := .GRP, by=x]                 # add a group counter column
   x y v grp
1: b 1 1   1
2: b 3 2   1
3: b 6 3   1
4: a 1 4   2
5: a 3 5   2
6: a 6 6   2
7: c 1 7   3
8: c 3 8   3
9: c 6 9   3

dt.tbl> X[, DT[.BY, y, on="x"], by=x]           # join within each group
   x V1
1: c  1
2: c  3
3: c  6
4: b  1
5: b  3
6: b  6

dt.tbl> # add/update/delete by reference (see ?assign)
dt.tbl> print(DT[, z:=42L])                   # add new column by reference
   x y v grp  z
1: b 1 1   1 42
2: b 3 2   1 42
3: b 6 3   1 42
4: a 1 4   2 42
5: a 3 5   2 42
6: a 6 6   2 42
7: c 1 7   3 42
8: c 3 8   3 42
9: c 6 9   3 42

dt.tbl> print(DT[, z:=NULL])                  # remove column by reference
   x y v grp
1: b 1 1   1
2: b 3 2   1
3: b 6 3   1
4: a 1 4   2
5: a 3 5   2
6: a 6 6   2
7: c 1 7   3
8: c 3 8   3
9: c 6 9   3

dt.tbl> print(DT["a", v:=42L, on="x"])        # subassign to existing v column by reference
   x y  v grp
1: b 1  1   1
2: b 3  2   1
3: b 6  3   1
4: a 1 42   2
5: a 3 42   2
6: a 6 42   2
7: c 1  7   3
8: c 3  8   3
9: c 6  9   3

dt.tbl> print(DT["b", v2:=84L, on="x"])       # subassign to new column by reference (NA padded)
   x y  v grp v2
1: b 1  1   1 84
2: b 3  2   1 84
3: b 6  3   1 84
4: a 1 42   2 NA
5: a 3 42   2 NA
6: a 6 42   2 NA
7: c 1  7   3 NA
8: c 3  8   3 NA
9: c 6  9   3 NA

dt.tbl> DT[, m:=mean(v), by=x][]              # add new column by reference by group
   x y  v grp v2  m
1: b 1  1   1 84  2
2: b 3  2   1 84  2
3: b 6  3   1 84  2
4: a 1 42   2 NA 42
5: a 3 42   2 NA 42
6: a 6 42   2 NA 42
7: c 1  7   3 NA  8
8: c 3  8   3 NA  8
9: c 6  9   3 NA  8
dt.tbl>                                       # NB: postfix [] is shortcut to print()
dt.tbl> # advanced usage
dt.tbl> DT = data.table(x=rep(c("b","a","c"),each=3), v=c(1,1,1,2,2,1,1,2,2), y=c(1,3,6), a=1:9, b=9:1)

dt.tbl> DT[, sum(v), by=.(y%%2)]              # expressions in by
   y V1
1: 1  9
2: 0  4

dt.tbl> DT[, sum(v), by=.(bool = y%%2)]       # same, using a named list to change by column name
   bool V1
1:    1  9
2:    0  4

dt.tbl> DT[, .SD[2], by=x]                    # get 2nd row of each group
   x v y a b
1: b 1 3 2 8
2: a 2 3 5 5
3: c 2 3 8 2

dt.tbl> DT[, tail(.SD,2), by=x]               # last 2 rows of each group
   x v y a b
1: b 1 3 2 8
2: b 1 6 3 7
3: a 2 3 5 5
4: a 1 6 6 4
5: c 2 3 8 2
6: c 2 6 9 1

dt.tbl> DT[, lapply(.SD, sum), by=x]          # sum of all (other) columns for each group
   x v  y  a  b
1: b 3 10  6 24
2: a 5 10 15 15
3: c 5 10 24  6

dt.tbl> DT[, .SD[which.min(v)], by=x]         # nested query by group
   x v y a b
1: b 1 1 1 9
2: a 1 6 6 4
3: c 1 1 7 3

dt.tbl> DT[, list(MySum=sum(v),
dt.tbl+           MyMin=min(v),
dt.tbl+           MyMax=max(v)),
dt.tbl+     by=.(x, y%%2)]                    # by 2 expressions
   x y MySum MyMin MyMax
1: b 1     2     1     1
2: b 0     1     1     1
3: a 1     4     2     2
4: a 0     1     1     1
5: c 1     3     1     2
6: c 0     2     2     2

dt.tbl> DT[, .(a = .(a), b = .(b)), by=x]     # list columns
   x     a     b
1: b 1,2,3 9,8,7
2: a 4,5,6 6,5,4
3: c 7,8,9 3,2,1

dt.tbl> DT[, .(seq = min(a):max(b)), by=x]    # j is not limited to just aggregations
    x seq
 1: b   1
 2: b   2
 3: b   3
 4: b   4
 5: b   5
 6: b   6
 7: b   7
 8: b   8
 9: b   9
10: a   4
11: a   5
12: a   6
13: c   7
14: c   6
15: c   5
16: c   4
17: c   3

dt.tbl> DT[, sum(v), by=x][V1<20]             # compound query
   x V1
1: b  3
2: a  5
3: c  5

dt.tbl> DT[, sum(v), by=x][order(-V1)]        # ordering results
   x V1
1: a  5
2: c  5
3: b  3

dt.tbl> DT[, c(.N, lapply(.SD,sum)), by=x]    # get number of observations and sum per group
   x N v  y  a  b
1: b 3 3 10  6 24
2: a 3 5 10 15 15
3: c 3 5 10 24  6

dt.tbl> DT[, {tmp <- mean(y);
dt.tbl+       .(a = a-tmp, b = b-tmp)
dt.tbl+       }, by=x]                        # anonymous lambda in 'j', j accepts any valid
   x          a          b
1: b -2.3333333  5.6666667
2: b -1.3333333  4.6666667
3: b -0.3333333  3.6666667
4: a  0.6666667  2.6666667
5: a  1.6666667  1.6666667
6: a  2.6666667  0.6666667
7: c  3.6666667 -0.3333333
8: c  4.6666667 -1.3333333
9: c  5.6666667 -2.3333333

dt.tbl>                                       # expression. TO REMEMBER: every element of
dt.tbl>                                       # the list becomes a column in result.
dt.tbl> pdf("new.pdf")

dt.tbl> DT[, plot(a,b), by=x]                 # can also plot in 'j'
Empty data.table (0 rows) of 1 col: x

dt.tbl> dev.off()
pdf 
  2 

dt.tbl> # using rleid, get max(y) and min of all cols in .SDcols for each consecutive run of 'v'
dt.tbl> DT[, c(.(y=max(y)), lapply(.SD, min)), by=rleid(v), .SDcols=v:b]
   rleid y v y a b
1:     1 6 1 1 1 7
2:     2 3 2 1 4 5
3:     3 6 1 1 6 3
4:     4 6 2 3 8 1

dt.tbl> # Support guide and links:
dt.tbl> # https://github.com/Rdatatable/data.table/wiki/Support
dt.tbl> 
dt.tbl> ## Not run: 
dt.tbl> ##D if (interactive()) {
dt.tbl> ##D   vignette("datatable-intro")
dt.tbl> ##D   vignette("datatable-reference-semantics")
dt.tbl> ##D   vignette("datatable-keys-fast-subset")
dt.tbl> ##D   vignette("datatable-secondary-indices-and-auto-indexing")
dt.tbl> ##D   vignette("datatable-reshape")
dt.tbl> ##D   vignette("datatable-faq")
dt.tbl> ##D 
dt.tbl> ##D   test.data.table()          # over 6,000 low level tests
dt.tbl> ##D 
dt.tbl> ##D   # keep up to date with latest stable version on CRAN
dt.tbl> ##D   update.packages()
dt.tbl> ##D 
dt.tbl> ##D   # get the latest devel version
dt.tbl> ##D   update.dev.pkg()
dt.tbl> ##D   # compiled devel binary for Windows available -- no Rtools needed
dt.tbl> ##D   update.dev.pkg(repo="https://Rdatatable.github.io/data.table")
dt.tbl> ##D   # read more at:
dt.tbl> ##D   # https://github.com/Rdatatable/data.table/wiki/Installation
dt.tbl> ##D }
dt.tbl> ## End(Not run)
Categories: R Reshaping

onesixx

Blog Owner

Subscribe
Notify of
guest

0 Comments
Oldest
Newest Most Voted
Inline Feedbacks
View all comments
0
Would love your thoughts, please comment.x
()
x