Time series data
시계열 데이터 (ts)
co2 {datasets}
1959~1997년(39년간)의 월별 CO2 농도 468개 (from Mauna Loa Observatory)
# Print the built-in `co2` series from {datasets}; the output below is the
# first few rows of the monthly table.
co2
#         Jan    Feb    Mar    Apr    May    Jun    Jul    Aug    Sep    Oct    Nov    Dec
# 1959 315.42 316.31 316.50 317.56 318.13 318.00 316.39 314.65 313.68 313.18 314.66 315.43
# 1960 316.27 316.81 317.42 318.87 319.87 319.43 318.01 315.74 314.00 313.68 314.84 316.03
# 1961 316.73 317.54 318.38 319.31 320.42 319.61 318.42 316.63 314.83 315.16 315.94 316.85
# ...

# Compact structure summary: a time series of 468 values.
str(co2)
## Time-Series [1:468] from 1959 to 1998: 315 316 316 318 318 ...

# A ts object carries only two attributes: `tsp` (start, end, frequency)
# and its class.
attributes(co2)
## $tsp
## [1] 1959.000 1997.917   12.000
## $class
## [1] "ts"
sunspots {datasets}
https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/sunspot.month.html
흑점 개수의 월별 평균값 (1749–1983)
# Flatten the `sunspots` ts into a data.table: the ts time index is converted
# to yearmon and then to Date, and the observations become a plain vector.
dt_sunspots <- data.table(
  date = as.Date(as.yearmon(time(sunspots))),
  value = as.vector(sunspots)
)

# Semi-transparent line chart with a year label every 20 years on the x axis.
p <- ggplot(dt_sunspots, aes(x = date, y = value)) +
  geom_line(alpha = .66) +
  scale_x_date(date_breaks = "20 years", date_labels = "%Y") +
  theme_ipsum()
p
AirPassengers {datasets}
> AirPassengers
     Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
1949 112 118 132 129 121 135 148 148 136 119 104 118
1950 115 126 141 135 125 149 170 170 158 133 114 140
...
# ggfortify is attached so that autoplot() can draw a ts object directly.
library(ggfortify)
ggplot2::autoplot(AirPassengers)
TS => Data.table
from co2 (using zoo)
https://www.rdocumentation.org/packages/zoo/versions/1.8-6
# Show the class of the index after converting ts time -> yearmon -> Date.
class(as.Date(as.yearmon(time(co2))))
library(zoo)

# Build a two-column data.table from the `co2` ts object:
#   index - the ts time index converted to yearmon and then to Date
#   value - the observations as a plain numeric vector
# as.vector() (rather than as.matrix()) keeps `value` an ordinary column,
# matching how the sunspots table is built elsewhere in these notes.
data.table(
  index = as.Date(as.yearmon(time(co2))),
  value = as.vector(co2)
)
from AirPassengers (using broom)
broom::tidy.ts (fortify) 로 data.frame형태로 변경가능
https://cran.r-project.org/web/packages/broom/vignettes/broom.html
https://ggplot2.tidyverse.org/reference/fortify.html
fortify()대신 broom의 tidy()사용
library(broom)

# tidy() converts the ts into a tibble with an `index` and a `value` column;
# the printout below shows the first ten of 144 rows.
tidy(AirPassengers)
# # A tibble: 144 x 2
#    index value
#  1 1949    112
#  2 1949.   118
#  3 1949.   132
#  4 1949.   129
#  5 1949.   121
#  6 1949.   135
#  7 1950.   148
#  8 1950.   148
#  9 1950.   136
# 10 1950.   119
# # … with 134 more rows
ggfortify는 ggplot2::autoplot()을 이용해서 ts object를 그릴 수 있도록 도와준다.
# Line chart of the tidied AirPassengers series (index on x, value on y).
ggplot(tidy(AirPassengers), aes(x = index, y = value)) +
  geom_line()
Data.table => Xts
co2ts (from co2new)
최근 Full data from Earth System Research Laboratory
library(lubridate)

url <- "ftp://aftp.cmdl.noaa.gov/products/trends/co2/co2_mm_mlo.txt"

# Keep a local copy in "co2new.dat"; download it unless the file already
# exists and was modified within the last four weeks.
if (!(file.exists("co2new.dat") &&
      now() <= file.mtime("co2new.dat") + weeks(4))) {
  download.file(url, "co2new.dat")
}

# Read the cached file and label its seven columns.
co2new <- data.table(read.table("co2new.dat"))
names(co2new) <- c(
  "year", "month", "decimal_data", "average",
  "interpolated", "trend", "ndays"
)

# Monthly ts of the interpolated values, first observation March 1958.
co2ts <- ts(co2new$interpolated, frequency = 12, start = c(1958, 3))
library(zoo)
library(xts)

# Monthly ts of the interpolated CO2 values, first observation March 1958.
co2ts <- ts(co2new$interpolated, start = c(1958, 3), frequency = 12)

# ts -> zoo
co2zoo <- as.zoo(co2ts)

# ts -> xts, either directly or through the zoo object.
co2xts1 <- as.xts(co2ts)
co2xts2 <- as.xts(co2zoo)

# Raw column -> xts with an explicit yearmon index: one month per row of
# co2new, starting at 1958-03. `length.out` is spelled out in full instead of
# relying on partial matching of `length`.
co2xts3 <- xts(
  co2new$interpolated,
  order.by = as.yearmon(
    seq(as.Date("1958-03-01"), by = "months", length.out = nrow(co2new))
  )
)
ts object 만들기
- index : Index (i.e. date or time) for the "ts" object.
- series : Name of the series (multivariate "ts" objects only).
- value : Value of the observation.
# Univariate ts built from the `average` column, first observation March 1958.
# NOTE(review): the printout contains -99.99 entries, presumably the source's
# missing-value marker; confirm against the data file before analysing.
ts_single <- ts(co2new$average, frequency = 12, start = c(1958, 3))
#         Jan    Feb    Mar    Apr    May    Jun    Jul    Aug    Sep    Oct    Nov    Dec
# 1958               315.71 317.45 317.50 -99.99 315.86 314.93 313.20 -99.99 313.33 314.67
# 1959 315.62 316.38 316.71 317.72 318.29 318.15 316.54 314.80 313.84 313.26 314.80 315.58
# 1960 316.43 316.97 317.58 319.02 320.03 319.59 318.18 315.91 314.16 313.83 315.00 316.19
# ...

# Multivariate ts: three columns of co2new sharing the same monthly index.
ts_multi <- ts(
  co2new[, list(decimal_data, interpolated, trend)],
  frequency = 12,
  start = c(1958, 3)
)
#          decimal_data interpolated  trend
# Mar 1958     1958.208       315.71 314.62
# Apr 1958     1958.292       317.45 315.29
# May 1958     1958.375       317.50 314.71
# ....
Airline Flights From NYC
nycflights13
2013년 New York을 출발한 항공 데이터
library(nycflights13)

flight <- flights %>% data.table()

# One row per calendar day with N = number of flight records on that day.
dd <- data.table(index = make_date(flight$year, flight$month, flight$day))
dd <- dd[order(index), .N, by = .(index)]

# Day-of-week as a labelled factor (TRUE spelled out; `T` can be reassigned).
dd[, weekdays := index %>% wday(label = TRUE)]

# Week-of-month index from day-of-month `d` and numeric day-of-week `w`:
# the value increases by one each time `w` wraps back to 1.
ufunc_mw <- function(d, w) ceiling((d - w) / 7) + 1

# Store the week-of-month as a factor, then reverse its level order.
# NOTE(review): the reversal presumably puts week 1 at the top of a
# calendar-style plot; confirm against the plotting code.
dd[, monthweeks := ufunc_mw(mday(index), wday(index)) %>% factor()]
dd[, monthweeks := factor(monthweeks, levels = rev(levels(monthweeks)))]
numeric => Data.table (with Index)
river (Index가 없는 Single Numeric)
# Read the plain numeric values from the remote file.
river <- scan("https://www.stat.uiowa.edu/~luke/data/river.dat")

# The values carry no index of their own, so pair them with their positions.
riverDT <- data.table(
  index = seq_len(length(river)),
  data = river
)
from Web Data
GDP Growth Data
# Scrape Web Pages --------------------------------------------------------
library("rvest") # Easily Harvest (Scrape) Web Pages

# GDP data
URL <- "https://www.multpl.com/us-real-gdp-growth-rate/table/by-quarter"

# Read the data from the web page; the first table found on the page is
# used and its two columns are renamed.
gdptbl <- URL %>% read_html() %>% html_table()
gdptbl <- gdptbl[[1]] %>% data.table()
names(gdptbl) <- c("index", "value")

# Clean the data ----------------------------------------------------------
library(lubridate)
# Parse the dates with mdy(), then strip the percent sign from the values.
gdptbl[, index := mdy(index)]
# stringr is namespace-qualified because it is not attached in this section.
gdptbl[, value := stringr::str_replace(value, "%", "") %>% as.numeric()]

# irregular (unequally spaced)
gdptbl <- gdptbl[index >= make_date(1948, 1, 1), ]
gdptbl <- gdptbl[order(index), ] # increasing order

gdptbl
#           index value
#   1: 1948-03-31  2.60
#   2: 1948-06-30  4.58
#   3: 1948-09-30  5.39
#   4: 1948-12-31  3.89
#   5: 1949-03-31  0.94
#  ---
# 282: 2018-06-30  3.20
# 283: 2018-09-30  3.13
# 284: 2018-12-31  2.52
# 285: 2019-03-31  2.65
# 286: 2019-06-30  2.28

# Quarterly ts (frequency = 4) whose first observation is the first quarter
# of 1948, matching the first row kept above.
gdpts <- ts(gdptbl$value, start = 1948, frequency = 4)
example
https://www.neonscience.org/dc-convert-date-time-POSIX-r
# https://machinelearningmastery.com/time-series-datasets-for-machine-learning/
# Shampoo Sales Dataset
URL <- "https://raw.githubusercontent.com/jbrownlee/Datasets/master/shampoo.csv"
dd <- fread(URL)

# group = 1 joins all rows into a single path.
# NOTE(review): themeSixx is not defined in this section; presumably a ggplot
# theme object created elsewhere.
ggplot(dd, aes(Month, Sales, group = 1)) +
  geom_path() +
  themeSixx
# https://machinelearningmastery.com/time-series-datasets-for-machine-learning/
# Minimum Daily Temperatures Dataset
URL <- "https://raw.githubusercontent.com/jbrownlee/Datasets/master/daily-min-temperatures.csv"
dd <- fread(URL)

# group = 1 joins all rows into a single path.
ggplot(dd, aes(Date, Temp, group = 1)) +
  geom_path()
# https://stackoverflow.com/questions/16596811/display-the-x-axis-on-ggplot-as-month-only-in-r
url <- "http://nwis.waterdata.usgs.gov/usa/nwis/uv/?cb_00060=on&cb_00065=on&format=rdb&period=&begin_date=2009-01-01&end_date=2012-12-31&site_no=02428400"
dd <- fread(url, header = TRUE)
sapply(dd, class) # interactive check of the column types as read

# Drop the first data row and give the eight columns readable names.
# NOTE(review): row 1 is presumably the RDB field-format line; confirm.
dd <- dd[-1, ]
names(dd) <- c(
  "Agency", "SiteNo", "Datetime", "TZ",
  "Discharge", "Status", "Gageheight", "gstatus"
)

# Parse the timestamp once; `date` is then reused to derive the year, so no
# second anydate() pass over it is needed.
dd$date <- anydate(dd$Datetime)
dd[, `:=`(
  Discharge = as.numeric(Discharge),
  Year = as.numeric(format(date, "%Y"))
)]

# One panel per year with its own x range; x axis labelled by abbreviated
# month name with a break every month.
# NOTE(review): themeSixx is not defined in this section; presumably a ggplot
# theme object created elsewhere.
dd %>%
  ggplot(aes(x = date, y = Discharge)) +
  geom_line() +
  facet_wrap(~Year, scales = "free_x") +
  scale_x_date(
    labels = scales::date_format("%b"),
    breaks = scales::date_breaks("month")
  ) +
  themeSixx