Galton’s father & son height data

Published by onesixx on

https://rstudio-pubs-static.s3.amazonaws.com/204984_dd2112475db84af2a03260c4a4f830ac.html

 

#install.packages("UsingR")

library(tidyverse)
library(data.table)
library(UsingR)
data(father.son)
# Work on a data.table copy of the father/son height data.
dd <- data.table(father.son)
# Base scatter plot: father height on the x axis, son height on the y axis.
g <- ggplot(dd, aes(x = fheight, y = sheight)) +
  geom_point(size = 2, alpha = 0.7) +
  labs(
    x = "Height of father",
    y = "Height of son",
    title = "Father-son Height Data"
  )

# Means and standard deviations of the father (x) and son (y) heights.
shmean <- mean(dd$sheight)
fhmean <- mean(dd$fheight)
shsd <- sd(dd$sheight)
fhsd <- sd(dd$fheight)
# Pearson correlation between father and son heights. The columns are selected
# by name so the result does not depend on the column order (or count) of dd.
fscor <- cor(dd$fheight, dd$sheight)

# slope = r * s_y / s_x - father data is on the x axis.
bhat <- fscor * shsd / fhsd
# y-intercept = mean(y) - slope * mean(x)
ahat <- shmean - bhat * fhmean
# print regression line parameters
print(c(intercept = ahat, slope = bhat))

# Mean squared error: the average of the squared residuals between the
# observed son heights and the line ahat + bhat * fheight.
fitted_sheight <- ahat + bhat * father.son$fheight
squared_resid <- (father.son$sheight - fitted_sheight)^2
MSE <- sum(squared_resid) / nrow(father.son)
# Root mean squared error.
RMSE <- sqrt(MSE)

# Minimum and maximum father height observed in the data.
fhmin <- min(father.son$fheight)
fhmax <- max(father.son$fheight)

# 101 equally spaced father heights spanning [fhmin, fhmax], and the son
# heights the regression line predicts for them.
xdat <- seq(fhmin, fhmax, length.out = 101)
ydat <- ahat + bhat * xdat

# Regression line data frame. Columns are named for what they hold
# (fheight = x values, sheight = predicted y values), matching the axes of
# the scatter plot.
regressionLine <- data.frame(fheight = xdat, sheight = ydat)

# Plot of the data set with the regression line overlaid in red.
g +
  geom_line(
    data = regressionLine,
    aes(x = fheight, y = sheight),
    lwd = 1.5,
    colour = "red"
  )

# Fit the same line with R's built-in linear model and show the fit summary,
# for comparison with the hand-computed ahat / bhat.
fslm <- lm(sheight ~ fheight, data = dd)
summary(fslm)

 

 

 

Categories: Analysis

onesixx

Blog Owner

Leave a Reply

Your email address will not be published.