Data :: wine
Wine Quality
The Wine Quality data set (http://archive.ics.uci.edu/ml/datasets/Wine+Quality) comes from the UC Irvine Machine Learning Repository.
It gives us a nice mix of classification and regression problems to test on.
Two datasets are included, related to red and white vinho verde wine samples, from the north of Portugal. The goal is to model wine quality based on physicochemical tests (see [Cortez et al., 2009], [Web Link]).
Input variables (based on physicochemical tests):
1 – fixed acidity
2 – volatile acidity
3 – citric acid
4 – residual sugar
5 – chlorides
6 – free sulfur dioxide
7 – total sulfur dioxide
8 – density
9 – pH
10 – sulphates
11 – alcohol
Output variable (based on sensory data):
12 – quality (score between 0 and 10)
# Build `wineQ`: the red and white Wine Quality CSV files stacked into one
# data.table, with a `color` label and three factor columns.
# NOTE(review): data.table() is used unqualified, so the data.table package
# must already be attached (library(data.table)) — confirm.
PATH_ADATA <- "/Users/onesixx/Dropbox/Rhome/aData/"

url_wineQuality_red <- "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
url_wineQuality_white <- "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"

# Both files are read as semicolon-separated with a header row.
# TRUE/FALSE are spelled out: T and F are ordinary variables that can be
# reassigned, whereas TRUE and FALSE are reserved words.
wineQuality_red <- read.table(
  url_wineQuality_red,
  header = TRUE, sep = ";", stringsAsFactors = FALSE
)
wineQuality_white <- read.table(
  url_wineQuality_white,
  header = TRUE, sep = ";", stringsAsFactors = FALSE
)

# Tag each table with its wine colour before stacking them.
wineQ_red <- data.table(wineQuality_red, color = "red")
wineQ_white <- data.table(wineQuality_white, color = "white")
wineQ <- rbind(wineQ_red, wineQ_white)

# Drop the intermediates; only `wineQ` (and PATH_ADATA) are kept.
rm(
  url_wineQuality_red, url_wineQuality_white,
  wineQuality_red, wineQuality_white,
  wineQ_red, wineQ_white
)

# Derived factor columns (levels "0"/"1").
wineQ$is_red <- as.factor(ifelse(wineQ$color == "red", 1, 0))
# high_quality must be computed while `quality` is still numeric: the `> 6`
# comparison is not meaningful once `quality` has been converted to a factor.
wineQ$high_quality <- as.factor(ifelse(wineQ$quality > 6, 1, 0))
wineQ$quality <- as.factor(wineQ$quality)
# Write `wineQ` to "wineQ.rds" under PATH_ADATA, then read it back from the
# same file.
# NOTE(review): str_c is presumably stringr::str_c; stringr must be attached.
saveRDS(object = wineQ, file = str_c(PATH_ADATA, "wineQ.rds"))
wineQ <- readRDS(file = str_c(PATH_ADATA, "wineQ.rds"))
### Data Original
# Reload the saved table and take a five-column subset as `dd`.
# NOTE(review): str_c is presumably stringr::str_c, and `wineQ` is presumably
# a data.table (the `with` argument below is data.table syntax) — confirm
# that stringr and data.table are attached.
PATH_ADATA <- "/Users/onesixx/Dropbox/Rhome/aData/"
wineQ <- readRDS(str_c(PATH_ADATA, "wineQ.rds"))

cols <- c("is_red", "fixed.acidity", "density", "pH", "alcohol")

# with = FALSE makes `cols` be treated as a character vector of column names.
# FALSE is spelled out because F is a reassignable variable.
dd <- wineQ[, cols, with = FALSE]
wine
#* Data Loading ----
# Read the comma-separated wine.data file into `wine`. The file is read with
# header = FALSE, so all 14 column names are supplied explicitly.
wineUrl <- 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
wine <- read.table(
  file = wineUrl,
  sep = ",",
  header = FALSE,
  col.names = c(
    "Cultivar", "Alcohol", "Malic.acid", "Ash",
    "Alcalinity.of.ash", "Magnesium", "Total.phenols",
    "Flavanoids", "Nonflavanoid.phenols", "Proanthocyanin",
    "Color.intensity", "Hue", "OD280.OD315.of.diluted.wines",
    "Proline"
  ),
  stringsAsFactors = FALSE
)
wine price
# Load the BC Liquor Store product price list and reduce it to seven renamed
# columns for four product classes.
# NOTE(review): %>% must be in scope (magrittr or dplyr attached) — confirm.
rawDataUrl <- "http://pub.data.gov.bc.ca/datasets/176284/BC_Liquor_Store_Product_Price_List.csv"
bcl <- read.csv(rawDataUrl, stringsAsFactors = FALSE)

# Product classes to keep.
products <- c("BEER", "REFRESHMENT BEVERAGE", "SPIRITS", "WINE")

# All three verbs are namespaced with dplyr:: so that a bare `rename` cannot
# resolve to a different package's function (e.g. plyr::rename) or fail when
# dplyr is not attached.
bcl <- dplyr::filter(bcl, PRODUCT_CLASS_NAME %in% products) %>%
  dplyr::select(
    PRODUCT_CLASS_NAME,
    PRODUCT_MINOR_CLASS_NAME,
    PRODUCT_LONG_NAME,
    PRODUCT_COUNTRY_ORIGIN_NAME,
    PRODUCT_ALCOHOL_PERCENT,
    CURRENT_DISPLAY_PRICE,
    SWEETNESS_CODE
  ) %>%
  dplyr::rename(
    Type = PRODUCT_CLASS_NAME,
    Subtype = PRODUCT_MINOR_CLASS_NAME,
    Name = PRODUCT_LONG_NAME,
    Country = PRODUCT_COUNTRY_ORIGIN_NAME,
    Alcohol_Content = PRODUCT_ALCOHOL_PERCENT,
    Price = CURRENT_DISPLAY_PRICE,
    Sweetness = SWEETNESS_CODE
  )

# Shorten the exact label "REFRESHMENT BEVERAGE" to "REFRESHMENT"; the regex
# is anchored, so other values are left unchanged.
bcl$Type <- sub("^REFRESHMENT BEVERAGE$", "REFRESHMENT", bcl$Type)

# Show the first rows.
head(bcl)