fread
https://github.com/Rdatatable/data.table/wiki/Convenience-features-of-fread
- read.table() 대신 사용가능
- Text파일을 읽어들이는데 많이 사용됨.
Xlsx 파일은 readxl::read_xlsx()를 사용한다.
- 속도가 빠르고, character를 factor로 자동변환하지 않는다.
fread(input = "파일명 또는 URL",
      header = "auto", sep = "auto",
      stringsAsFactors = FALSE,
      drop = NULL, select = NULL, colClasses = NULL
)
# Argument synopsis for fread() with the defaults recorded in these notes.
# Reference only: this is a listing of the arguments, not a call meant to be run.
fread(
  input, file, text, cmd,
  sep = "auto",       # "auto", ",", or any other delimiter you want to set
  sep2 = "auto",
  dec = ".",
  quote = "\"",
  nrows = Inf,
  header = "auto",    # "auto", TRUE, or FALSE
  na.strings = getOption("datatable.na.strings", "NA"),  # due to change to ""; see NEWS
  stringsAsFactors = FALSE,
  verbose = getOption("datatable.verbose", FALSE),
  skip = "__auto__",
  select = NULL,
  drop = NULL,
  colClasses = NULL,  # e.g. "character", "numeric", "Date", "POSIXct", ...
  integer64 = getOption("datatable.integer64", "integer64"),
  col.names,
  check.names = FALSE,
  encoding = "unknown",
  strip.white = TRUE,
  fill = FALSE,
  blank.lines.skip = FALSE,
  key = NULL,
  index = NULL,
  showProgress = getOption("datatable.showProgress", interactive()),
  data.table = getOption("datatable.fread.datatable", TRUE),
  nThread = getDTthreads(verbose),
  logical01 = getOption("datatable.logical01", FALSE),  # due to change to TRUE; see NEWS
  keepLeadingZeros = getOption("datatable.keepLeadingZeros", FALSE),
  yaml = FALSE,
  autostart = NA,
  tmpdir = tempdir(),
  tz = ""
)
# Inline CSV text. The embedded "\n" line breaks matter: fread() only treats
# `input` as literal data when it contains a newline; otherwise the string is
# interpreted as a file name or a shell command.
data <- "A,B,C,D\n1,3,5,7\n2,4,6,8\n"

dd <- data %>% fread()
#    A B C D
# 1: 1 3 5 7
# 2: 2 4 6 8

dd %>% str()
# Classes ‘data.table’ and 'data.frame': 2 obs. of 4 variables:
#  $ A: int 1 2
#  $ B: int 3 4
#  $ C: int 5 6
#  $ D: int 7 8
#  - attr(*, ".internal.selfref")=<externalptr>
colClasses
character -> datetime 으로 변경하고 싶을 때: colClasses=c("날짜시간으로바꾸고싶은문자변수명"="POSIXct")
지정 가능한 타입 예: "Date", "integer", "numeric", "character"
# Force columns B, C and D to be read as character; A keeps its detected type.
dd <- data %>%
  fread(colClasses = c(B = "character", C = "character", D = "character"))
# Equivalent spellings:
# fread(data, colClasses = list(character = c("B", "C", "D")))
# fread(data, colClasses = list(character = 2:4))

dd %>% str()
# Classes ‘data.table’ and 'data.frame': 2 obs. of 4 variables:
#  $ A: int 1 2
#  $ B: chr "3" "4"
#  $ C: chr "5" "6"
#  $ D: chr "7" "8"
#  - attr(*, ".internal.selfref")=<externalptr>
https://www.r-bloggers.com/2013/09/using-colclasses-to-load-data-more-quickly-in-r/
# Speed up a large read.csv() by sniffing the column classes from the first
# few rows and passing them back in via colClasses on the full read.
sampleData <- read.csv("huge-file.csv", header = TRUE, nrows = 5)

# vapply() guarantees exactly one class name per column (type-stable, unlike
# sapply(), which would return a list if any column had several classes).
classes <- vapply(sampleData, function(column) class(column)[1L], character(1))

# NOTE(review): classes inferred from only 5 rows can be too narrow (e.g. a
# column that looks integer at first but holds decimals further down);
# raise nrows above if the full read fails.
largeData <- read.csv("huge-file.csv", header = TRUE, colClasses = classes)
str(largeData)
drop/select
# drop: leave columns B and C out of the result.
data %>% fread(drop = c("B", "C"))                          # less typing, easier to read
data %>% fread(drop = 2:3)                                  # same using column numbers
data %>% fread(colClasses = c("B" = "NULL", "C" = "NULL"))  # as read.csv
data %>% fread(colClasses = list(NULL = c("B", "C")))       # same

# select: keep only columns A and D.
data %>% fread(select = c("A", "D"))                        # less typing, easier to read
data %>% fread(select = c(1, 4))                            # same using column numbers

# Every call above returns:
#    A D
# 1: 1 7
# 2: 2 8
https://stackoverrun.com/ko/q/5099655
# Inline CSV text; rows are separated by "\n" so fread() reads the string as
# data rather than as a file name.
data <- "A,B,C,D\n1,3,5,7\n2,4,6,8\n"

data %>% fread()
#    A B C D
# 1: 1 3 5 7
# 2: 2 4 6 8

data %>% fread() %>% str()
# Classes ‘data.table’ and 'data.frame': 2 obs. of 4 variables:
#  $ A: int 1 2
#  $ B: int 3 4
#  $ C: int 5 6
#  $ D: int 7 8
#  - attr(*, ".internal.selfref")=<externalptr>

# colClasses ----
data %>%
  fread(colClasses = c(B = "character", C = "character", D = "character")) %>%
  str()
# Classes ‘data.table’ and 'data.frame': 2 obs. of 4 variables:
#  $ A: int 1 2
#  $ B: chr "3" "4"
#  $ C: chr "5" "6"
#  $ D: chr "7" "8"
#  - attr(*, ".internal.selfref")=<externalptr>

data %>% fread(colClasses = list(character = c("B", "C", "D")))  # same as above, saves typing
data %>% fread(colClasses = list(character = 2:4))               # same as above, using column numbers

# drop ----
data %>% fread(colClasses = c("B" = "NULL", "C" = "NULL"))  # as read.csv
#    A D
# 1: 1 7
# 2: 2 8
data %>% fread(colClasses = list(NULL = c("B", "C")))       # same
data %>% fread(drop = c("B", "C"))                          # same but less typing, easier to read
data %>% fread(drop = 2:3)                                  # same using column numbers

# select ----
# (in read.csv you need to work out which to drop)
data %>% fread(select = c("A", "D"))                        # less typing, easier to read
data %>% fread(select = c(1, 4))                            # same using column numbers
ex
library(tidyverse)
library(data.table)

# Collect the CSV files under ./data whose names match the pattern.
files <- list.files(path = "./data", pattern = "[t].+csv", full.names = TRUE)

# Read each file into its own data.table. lapply() always returns a plain
# list; sapply() may "simplify" equally shaped tables into a list-matrix,
# which rbindlist() cannot stack.
dd <- lapply(files, function(x) {
  fread(input = x, sep = ",", header = TRUE, encoding = "UTF-8")
})

# for(i in 1:length(dd)){
#   names(dd[[1]][,8]) <- "rename"
# }

# Stack all tables into one and remove fully duplicated rows.
dd <- rbindlist(dd)
dd <- dd[!duplicated(dd)]
# identical(unique(dd), dd[!duplicated(dd)])

# Write the de-duplicated table. The original passed `x = d1`, which is not
# defined anywhere in this snippet; `dd` is assumed to be the intended object.
fwrite(x = dd, file = "aaa.csv", sep = ",", quote = TRUE, row.names = FALSE)