# https://machinelearningmastery.com/multivariate-time-series-forecasting-lstms-keras/
# https://machinelearningmastery.com/convert-time-series-supervised-learning-problem-python/
# LSTM model for multivariate time series forecasting
# How to transform a raw dataset into something we can use for time series forecasting.
# How to prepare data and fit an LSTM for a multivariate time series forecasting problem.
# How to make a forecast and rescale the result back into the original units.
### Title: Air Pollution Forecasting ----
source(file.path(getwd(),"../00.global_dl.R"))
library(reticulate)
library(keras)
#.rs.restartR() # cmd+shift+F10
use_condaenv(condaenv='sixxDL', required=T) #cf.> use_python, use_virtualenv, use_miniconda
use_python(python="~/.local/share/r-miniconda/envs/sixxDL/bin/python" )
###### For mac ~~~
#use_backend(backend="plaidml") # (cf. "tensorflow", "cntk", "theano")
###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## 10. Load data ----
###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## ` ` Raw Data ----------------------------------------------------------------
# the weather and the level of pollution
# each hour for five years
# at the US embassy in Beijing, China
# https://archive.ics.uci.edu/ml/datasets/Beijing+Multi-Site+Air-Quality+Data
URL <- "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pollution.csv"
d0 <- fread(URL)
d0 %>% str()
d1 <- copy(d0)
d1[ ,daytime:=ymd_h(str_c(year,month,day,hour, sep="."))]
d1[ , ':='(year=NULL,month=NULL,day=NULL,hour=NULL)]
## Missing values : NA pm2.5 readings counted per month
d1[is.na(pm2.5), .(month=format(daytime, "%Y-%m"))] %>%
  ggplot(aes(month)) + geom_bar() + coord_flip()
# drop the first 24 hours
d2 <- d1[daytime>=ymd("2010-01-02", tz="UTC"), ]
# mark all NA values with 0
d2[is.na(pm2.5), pm2.5:=0]
# manually specify column names
colNm <- c('No','pollution','dew','temp','press','wnd_dir','wnd_spd','snow','rain','daytime')
setnames(d2, colNm)
# line plot of each series; plots[[5]] (wnd_dir) is categorical, so it is
# skipped in the arranged figure below
plots <- lapply(1:8, function(i){
  d2 %>% ggplot(aes(x=daytime, y=get(colNm[i+1]))) + geom_line() +
    labs(x="time", y=colNm[i+1]) +
    theme(plot.margin=unit(c(t=6, r=3, b=0, l=3), unit="pt"))
})
ggarrange(plots[[1]], plots[[2]], plots[[3]], plots[[4]], plots[[6]], plots[[7]], plots[[8]], ncol=1)
#write.csv(d2[,2:10], 'pollution.csv')  # uncomment once: the Python appendix below reads this file
## ` ` Preprocess : plotting/ Normalize / rescale / ---------------------------
# integer encode - Wind direction
d3 <- d2[ ,2:9]
d3[ , wnd_dir:=wnd_dir %>% as.factor() %>% as.numeric()]
#d3[ , wnd_dir:=as.character(as.numeric(as.factor(wnd_dir)))]
#to_categorical(d3[ , wnd_dir])
# ensure all data is float
d3 %>% str()
d4 <- d3[, lapply(.SD, as.numeric)]
d4 %>% glimpse()
# normalize features scaled
uF_MinMax_Normalize <- function(x){ return((x- min(x)) /(max(x)-min(x))) }
d5 <- sapply(d4, uF_MinMax_Normalize) %>% as.data.table()
# normal <- preProcess(d4, method="range")
# d5 <- predict(normal, d4)
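# series_to_supervised() is called below but only defined in the Python
# appendix; here is an R port of that helper (a minimal sketch assuming
# data coercible to data.table). It frames a series as supervised learning:
# n_in lag columns as inputs, n_out lead columns as outputs.
series_to_supervised <- function(data, n_in=1, n_out=1, dropnan=TRUE){
  dt <- as.data.table(data)
  n_vars <- ncol(dt)
  cols <- list()
  # input sequence (t-n, ..., t-1)
  for(i in n_in:1){
    lagged <- dt[, lapply(.SD, shift, n=i, type="lag")]
    setnames(lagged, sprintf("var%d(t-%d)", 1:n_vars, i))
    cols[[length(cols)+1]] <- lagged
  }
  # forecast sequence (t, t+1, ..., t+n-1)
  for(i in 0:(n_out-1)){
    led <- dt[, lapply(.SD, shift, n=i, type="lead")]
    setnames(led, if(i == 0) sprintf("var%d(t)", 1:n_vars)
                  else       sprintf("var%d(t+%d)", 1:n_vars, i))
    cols[[length(cols)+1]] <- led
  }
  agg <- do.call(cbind, cols)
  # drop rows with NA values introduced by the shifting
  if(dropnan) agg <- na.omit(agg)
  agg
}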
# frame the scaled series as supervised learning (1 lag step -> predict t),
# then, mirroring the Python appendix, keep only the 8 lagged inputs plus
# var1(t), the pollution value we want to predict
reframed <- series_to_supervised(d5, 1, 1)
reframed <- reframed[, 1:9]
reframed %>% head()
## ` ` INPUT LAYER -------------------------------------------------------------
###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## 20. Train the model ----
###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## ` ` Build/Reshape/complie the model -----------------------------------------
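# A minimal split/reshape/compile sketch following the linked tutorial:
# first year (365*24 hours) for training, the rest for testing; inputs
# reshaped to the 3D [samples, timesteps, features] array an LSTM expects;
# LSTM(50) -> Dense(1) with MAE loss and the Adam optimizer.
values  <- as.matrix(reframed)
n_train <- 365 * 24
n_test  <- nrow(values) - n_train
train_X <- values[1:n_train, 1:8]
train_y <- values[1:n_train, 9]
test_X  <- values[(n_train+1):nrow(values), 1:8]
test_y  <- values[(n_train+1):nrow(values), 9]
# reshape inputs to 3D [samples, timesteps=1, features=8]
dim(train_X) <- c(n_train, 1, 8)
dim(test_X)  <- c(n_test, 1, 8)
model <- keras_model_sequential() %>%
  layer_lstm(units=50, input_shape=c(1, 8)) %>%
  layer_dense(units=1)
model %>% compile(loss="mae", optimizer="adam")
summary(model)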
## ` ` Train(fitting) the model : history, summary -----------------------------
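# Fitting sketch per the tutorial: 50 epochs, batch size 72, validating on
# the held-out test year; shuffle=FALSE keeps the temporal order intact.
history <- model %>% fit(
  train_X, train_y,
  epochs=50, batch_size=72,
  validation_data=list(test_X, test_y),
  shuffle=FALSE
)
plot(history)  # training vs. validation loss curves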
###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## 30. Evaluation ----
###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## ` ` Evaluate accuracy -------------------------------------------------------
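# Evaluation sketch: because the scaling above is a simple columnwise
# min-max, predictions invert back to original units with the min/max of
# the raw pollution column alone; RMSE is then in the original pm2.5 units,
# as in the tutorial.
yhat  <- model %>% predict(test_X)
p_min <- min(d4$pollution); p_max <- max(d4$pollution)
inv_yhat <- yhat   * (p_max - p_min) + p_min
inv_y    <- test_y * (p_max - p_min) + p_min
rmse <- sqrt(mean((inv_y - inv_yhat)^2))
cat("Test RMSE:", rmse, "\n")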
## ` ` Improve the model -------------------------------------------------------
###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## 40. Make predictions ----
###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ` ` explain Model ------------------------------------------------------------
# ` ` NEW DATA predictions -----------------------------------------------------
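# One-step-ahead sketch for new data: take the most recent scaled feature
# row as the lag-1 input, reshape to [1, 1, 8], predict, and rescale with
# p_min/p_max from the evaluation step above.
last_obs <- as.matrix(d5[.N])
dim(last_obs) <- c(1, 1, 8)
next_scaled    <- model %>% predict(last_obs)
next_pollution <- next_scaled * (p_max - p_min) + p_min
next_pollution  # forecast pollution for the next hour, original units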
###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## 90. Appendix : original Python implementation ----
###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Python from the machinelearningmastery tutorial; run it inside
# repl_python() (the reticulate setup at the top of this script applies).
#repl_python()
from pandas import read_csv
from matplotlib import pyplot
# load dataset
dataset = read_csv('pollution.csv', header=0, index_col=0)
values = dataset.values
# specify columns to plot
groups = [0, 1, 2, 3, 5, 6, 7]
i = 1
# plot each column
pyplot.figure()
for group in groups:
    pyplot.subplot(len(groups), 1, i)
    pyplot.plot(values[:, group])
    pyplot.title(dataset.columns[group], y=0.5, loc='right')
    i += 1
pyplot.show()
# prepare data for lstm
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
# convert series to supervised learning
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    n_vars = 1 if type(data) is list else data.shape[1]
    df = DataFrame(data)
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    # put it all together
    agg = concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg
# load dataset
dataset = read_csv('pollution.csv', header=0, index_col=0)
# keep the 8 feature columns (drop the trailing daytime column written from R)
values = dataset.values[:, 0:8]
# integer encode direction
encoder = LabelEncoder()
values[:,4] = encoder.fit_transform(values[:,4])
# ensure all data is float
values = values.astype('float32')
# normalize features
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(values)
# frame as supervised learning
reframed = series_to_supervised(scaled, 1, 1)
# drop columns we don't want to predict
reframed.drop(reframed.columns[[9,10,11,12,13,14,15]], axis=1, inplace=True)
print(reframed.head())
# tf$keras$callbacks : objects passed to the model during tf$keras training
# to extend or modify its behaviour. Particularly useful callbacks:
#   tf$keras$callbacks$ModelCheckpoint       # saves model & weights during training
#   tf$keras$callbacks$LearningRateScheduler
#   tf$keras$callbacks$EarlyStopping
#   tf$keras$callbacks$TensorBoard
# See also tf$data$dataset for input pipelines.
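# A short sketch wiring those callbacks into fit() via the R keras wrappers
# (callback_early_stopping() etc. map to tf$keras$callbacks$*); the
# checkpoint filepath is an example name, not one used elsewhere here.
cb <- list(
  callback_early_stopping(monitor="val_loss", patience=5),
  callback_model_checkpoint(filepath="pollution_lstm_best.h5", save_best_only=TRUE)
)
# history <- model %>% fit(train_X, train_y, epochs=50, batch_size=72,
#                          validation_data=list(test_X, test_y),
#                          shuffle=FALSE, callbacks=cb)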