RNN – Air pollution

Published by onesixx on

https://onesixx.com/global-dl
# https://machinelearningmastery.com/multivariate-time-series-forecasting-lstms-keras/
# https://machinelearningmastery.com/convert-time-series-supervised-learning-problem-python/

# LSTM model for multivariate time series forecasting 

# How to transform a raw dataset into something we can use for time series forecasting.
# How to prepare DATA and fit an LSTM for a multivariate time series forecasting problem.
# How to make a forecast and rescale the result back into the original units.

### Title: Air Pollution Forecasting ----
# Shared project setup script; presumably attaches the packages used below
# (data.table, lubridate, stringr, ggplot2, ggpubr, ...) -- TODO confirm.
source(file.path(getwd(), "../00.global_dl.R"))

library(reticulate)
library(keras)
#.rs.restartR()  # cmd+shift+F10
# Bind reticulate to the 'sixxDL' conda environment. `TRUE` is spelled out
# because `T` is an ordinary variable that can be reassigned.
# cf.> use_python, use_virtualenv, use_miniconda
use_condaenv(condaenv = "sixxDL", required = TRUE)
use_python(python = "~/.local/share/r-miniconda/envs/sixxDL/bin/python")

###### For mac ~~~
#use_backend(backend="plaidml") #  (cf. "tensorflow", "cntk", "theano")

###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## 10. Load data ----
###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

## ` ` Raw Data ----------------------------------------------------------------
# the weather and the level of pollution 
# each hour for five years 
# at the US embassy in Beijing, China
# https://archive.ics.uci.edu/ml/datasets/Beijing+PM2.5+Data

# Download the raw hourly observations.
URL <- "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pollution.csv"
d0 <- fread(URL)
str(d0)

# Work on a copy so the by-reference `:=` updates never touch the download.
d1 <- copy(d0)
# Collapse year/month/day/hour into one timestamp column, then drop the parts.
d1[, daytime := ymd_h(str_c(year, month, day, hour, sep = "."))]
d1[, c("year", "month", "day", "hour") := NULL]

## MISSING: number of missing pm2.5 readings per calendar month.
# geom_bar() counts rows per discrete x value; geom_histogram(stat = "count")
# drew the same bars but warned about ignored binning parameters.
d1[is.na(pm2.5), .(year_month = format(daytime, "%Y-%m"))] %>%
  ggplot(aes(year_month)) +
  geom_bar() +
  coord_flip()

# drop the first 24 hours
d2 <- d1[daytime >= ymd("2010-01-02", tz = "UTC"), ]
# mark all NA values with 0
d2[is.na(pm2.5), pm2.5 := 0]

# manually specify column names
# New column names, listed in the table's current column order.
colNm <- c(
  "No", "pollution", "dew", "temp", "press",
  "wnd_dir", "wnd_spd", "snow", "rain", "daytime"
)
# setnames() renames by reference; `names(d2) <- ...` would copy the data.table.
setnames(d2, colNm)

# One line chart per variable in columns 2..9, each plotted against time.
plot <- lapply(colNm[2:9], function(nm) {
  ggplot(d2, aes(x = daytime, y = .data[[nm]])) +
    geom_line() +
    labs(x = "time", y = nm) +
    theme(plot.margin = unit(c(t = 6, r = 3, b = 0, l = 3), units = "pt"))
})
# Stack every panel except the 5th (wnd_dir), which is only integer-encoded
# later in the script.
ggarrange(plotlist = plot[-5], ncol = 1)

#write.csv(d2[,2:10],'pollution.csv')


## ` ` Preprocess : plotting/ Normalize / rescale /  ---------------------------
# integer encode - Wind direction
# Keep columns 2..9 (the model variables); `No` and `daytime` are left behind.
d3 <- d2[, .SD, .SDcols = 2:9]

# Replace each wind-direction label by the integer code of its factor level.
d3[, wnd_dir := as.numeric(as.factor(wnd_dir))]
#d3[ , wnd_dir:=as.character(as.numeric(as.factor(wnd_dir)))]
#to_categorical(d3[ , wnd_dir])


# Inspect the column types, then coerce every column to double.
str(d3)
d4 <- d3[, lapply(.SD, as.numeric)]
glimpse(d4)

# normalize features scaled
#' Rescale a numeric vector linearly onto [0, 1].
#'
#' @param x A numeric (or logical) vector.
#' @param na.rm If `TRUE` (default), missing values are ignored when finding
#'   the minimum and maximum and remain `NA` in the result. If `FALSE`, any
#'   `NA` in `x` makes the whole result `NA`, as in the original version.
#' @return A double vector the same length as `x`. A vector whose values are
#'   all equal maps to 0 instead of `NaN` (0 / 0).
uF_MinMax_Normalize <- function(x, na.rm = TRUE) {
  # Nothing to scale: avoid range() warnings on empty or all-NA input.
  if (length(x) == 0L || all(is.na(x))) {
    return(as.numeric(x))
  }
  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]
  if (is.na(span)) {
    # Only reachable with na.rm = FALSE and at least one NA in x.
    return(rep(NA_real_, length(x)))
  }
  if (span == 0) {
    # Constant input: divide by 1 so the result is 0 rather than NaN.
    span <- 1
  }
  (x - rng[1]) / span
}
# Scale every column to [0, 1]. lapply() over .SD always yields a data.table,
# whereas sapply() changes its return type with the shape of its input.
d5 <- d4[, lapply(.SD, uF_MinMax_Normalize)]

# normal <- preProcess(d4, method="range")
# d5 <- predict(normal, d4)

# NOTE(review): series_to_supervised() has no R definition in the code shown
# here (only a Python version appears further down) -- confirm that an R
# implementation is provided elsewhere, e.g. by 00.global_dl.R.
reframed <- series_to_supervised(d5, 1, 1)




## ` ` INPUT LAYER -------------------------------------------------------------


###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## 20. Train the model ----
###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## ` ` Build/Reshape/compile the model -----------------------------------------
## ` ` Train(fitting) the model : history, summary -----------------------------

###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## 30. Evaluation ----
###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## ` ` Evaluate accuracy -------------------------------------------------------
## ` ` Improve the model -------------------------------------------------------

###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## 40. Make predictions ----
###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ` ` explain Model ------------------------------------------------------------
# ` ` NEW DATA predictions -----------------------------------------------------

python

# library(reticulate)
# library(keras)
# #.rs.restartR()  # cmd+shift+F10
# use_condaenv(condaenv='sixxDL', required=T) #cf.> use_python, use_virtualenv, use_miniconda
# use_python(python="~/.local/share/r-miniconda/envs/sixxDL/bin/python" )
# #Sys.setenv(RETICULATE_PYTHON = "python/bin/python")
# #py_config()

#repl_python()

from pandas import read_csv
from matplotlib import pyplot
# load dataset
dataset = read_csv('pollution.csv', header=0, index_col=0)
values = dataset.values
# specify columns to plot; index 4 is left out (it is the column that is
# label-encoded as the wind direction further down)
groups = [0, 1, 2, 3, 5, 6, 7]
# plot each column in its own row; subplot positions are 1-based
pyplot.figure()
for i, group in enumerate(groups, start=1):
    pyplot.subplot(len(groups), 1, i)
    pyplot.plot(values[:, group])
    pyplot.title(dataset.columns[group], y=0.5, loc='right')
pyplot.show()


# prepare data for lstm
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

# convert series to supervised learning
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Frame a time series as a supervised-learning table of shifted columns.

    Args:
        data: Observations as a flat list (one variable), a list of rows, a
            2-D array, or a DataFrame. Anything accepted by ``DataFrame(data)``.
        n_in: Number of lag steps (t-n_in ... t-1) to include as inputs.
        n_out: Number of steps (t ... t+n_out-1) to include as outputs.
        dropnan: If True, drop the rows that contain NaN after shifting.

    Returns:
        DataFrame whose columns are named ``var<j>(t-<i>)``, ``var<j>(t)`` and
        ``var<j>(t+<i>)``, with one block of columns per shift.
    """
    df = DataFrame(data)
    # Count variables from the frame itself. The previous
    # `1 if type(data) is list` shortcut was wrong for a list of rows, where
    # the frame has several columns and the generated names did not match.
    n_vars = df.shape[1]
    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += ['var%d(t-%d)' % (j + 1, i) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += ['var%d(t)' % (j + 1) for j in range(n_vars)]
        else:
            names += ['var%d(t+%d)' % (j + 1, i) for j in range(n_vars)]
    # put it all together
    agg = concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg

# read the prepared CSV; its first column becomes the index
dataset = read_csv('pollution.csv', header=0, index_col=0)
# keep the first eight columns as a raw array
values = dataset.values[:, 0:8]
# label-encode the direction column (index 4) in place
encoder = LabelEncoder()
values[:, 4] = encoder.fit_transform(values[:, 4])

# cast the whole array to float32
values = values.astype('float32')

# rescale every feature to the [0, 1] range
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(values)

# one lag step in, one step out
reframed = series_to_supervised(scaled, 1, 1)

# positions 9-15 are var2(t)..var8(t); dropping them leaves var1(t) as the
# only column from the current time step
reframed.drop(reframed.columns[list(range(9, 16))], axis=1, inplace=True)
print(reframed.head())
Categories: Keras

onesixx

Blog Owner

Subscribe
Notify of
guest

0 Comments
Oldest
Newest Most Voted
Inline Feedbacks
View all comments
0
Would love your thoughts, please comment.x
()
x