# Titanic
# Kaggle competition: https://www.kaggle.com/c/titanic
# Background article: https://www.mirror.co.uk/news/uk-news/iceberg-sunk-titanic-100000-years-7506651

# Reference kernel: https://www.kaggle.com/erikbruin/titanic-2nd-degree-families-and-majority-voting
# Columns: 'PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'Fare', 'Embarked'
library(Hmisc)
library(knitr)
library(ggplot2)
library(dplyr)
library(caret)
library(randomForest)
library(ROCR)
library(cowplot)
# Load the train and test CSV files. Both "NA" and the empty string are read
# as missing values, and text columns are kept as character vectors.
train <- read.csv(
  "./input/train.csv",
  stringsAsFactors = FALSE,
  na.strings = c("NA", "")
)
test <- read.csv(
  "./input/test.csv",
  stringsAsFactors = FALSE,
  na.strings = c("NA", "")
)

# Print the structure (column names, types, first values) of each data set.
str(train)
str(test)

# Set Survived to NA on the test rows so both frames expose the same column
# set before stacking (presumably test.csv ships without labels -- confirm).
test$Survived <- NA

# Stack train on top of test; rbind() on data frames matches columns by name.
# NOTE: the variable shares its name with base::all(); the name is kept
# because the rest of the script refers to it.
all <- rbind(train, test)
# Check Missing Data
# Count the NA values in every column once, then show only the columns that
# actually contain missing values.
na_counts <- colSums(is.na(all))
na_counts[na_counts > 0]
# Recode the categorical columns in one pass: Survived and Sex become
# unordered factors, Pclass becomes an ordered factor.
all <- transform(
  all,
  Survived = as.factor(Survived),
  Sex = as.factor(Sex),
  Pclass = as.ordered(Pclass)
)
# Bar chart of the Survived counts, restricted to the rows where Survived is
# not NA; every bar is annotated with its count.
ggplot(
  data = subset(all, !is.na(Survived)),
  mapping = aes(x = Survived, fill = Survived)
) +
  geom_bar(stat = "count") +
  geom_label(aes(label = ..count..), stat = "count") +
  labs(x = "How many people died and survived on the Titanic?")
# p1: passenger counts by Sex over every row of `all`.
p1 <- all %>%
  ggplot(aes(x = Sex, fill = Sex)) +
  geom_bar(stat = "count", position = "dodge") +
  geom_label(stat = "count", aes(label = ..count..)) +
  labs(x = "All data")

# p2: counts by Sex split by Survived, using only the rows where Survived is
# not NA.
# The bars are dodged side by side, so the labels must be dodged by the same
# amount; otherwise both labels of a Sex category are drawn on the category
# centre instead of over their own bar. 0.9 is the default geom_bar() width.
p2 <- all[!is.na(all$Survived), ] %>%
  ggplot(aes(x = Sex, fill = Survived)) +
  geom_bar(stat = "count", position = "dodge") +
  geom_label(
    stat = "count",
    aes(label = ..count..),
    position = position_dodge(width = 0.9)
  ) +
  labs(x = "Training data only")

# Show both charts side by side.
plot_grid(p1, p2)
# Stacked bar chart of `diamonds` clarity filled by cut, with a translucent
# "DRAFT!" watermark drawn across it.
# draw_label() positions its text in the coordinate system of the plot it is
# added to. Wrapping the chart in ggdraw() switches to the 0-1 canvas
# coordinates, so the default label position (0.5, 0.5) is the centre of the
# whole figure rather than the data point (0.5, 0.5) in the panel corner.
ggdraw(
  ggplot(diamonds, aes(clarity, fill = cut)) +
    geom_bar() +
    theme(axis.text.x = element_text(angle = 70, vjust = 0.5))
) +
  draw_label("DRAFT!", angle = 45, size = 80, alpha = .2)