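Galton-style father & son height regression (uses the `father.son` data set from the UsingR package, which that package documents as Pearson's data)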
https://rstudio-pubs-static.s3.amazonaws.com/204984_dd2112475db84af2a03260c4a4f830ac.html
# Simple linear regression of son's height on father's height.
#
# The slope and intercept are first computed by hand from the means, standard
# deviations and correlation of the two columns, the fitted line is drawn over
# a scatter plot of the data, and the result is then cross-checked against R's
# built-in lm().

# install.packages("UsingR")  # one-time setup; provides the father.son data
library(tidyverse)
library(data.table)
library(UsingR)

data(father.son)
dd <- father.son %>% data.table()

# Scatter plot of the raw data: father on the x axis, son on the y axis.
g <- dd %>%
  ggplot(aes(x = fheight, y = sheight)) +
  geom_point(size = 2, alpha = 0.7) +
  xlab("Height of father") +
  ylab("Height of son") +
  ggtitle("Father-son Height Data")

# Mean and standard deviation of the father and son heights.
shmean <- mean(dd$sheight)
fhmean <- mean(dd$fheight)
shsd <- sd(dd$sheight)
fhsd <- sd(dd$fheight)

# Correlation between father and son heights. The columns are selected by
# name so the result does not depend on the column order of the table.
fscor <- cor(dd$fheight, dd$sheight)

# Slope = r * s_y / s_x (father data is on the x axis).
bhat <- fscor * shsd / fhsd

# y-intercept = mean(y) - slope * mean(x)
ahat <- shmean - bhat * fhmean

# Print the regression line parameters.
print(c(intercept = ahat, slope = bhat))

# Mean squared error: the average of the squared residuals of the hand-fitted
# line. This divides by n, so RMSE is slightly smaller than the "residual
# standard error" reported by summary(lm), which divides by n - 2.
MSE <- mean((dd$sheight - (ahat + bhat * dd$fheight))^2)

# Root mean squared error.
RMSE <- sqrt(MSE)

# Minimum and maximum father height.
fhmin <- min(dd$fheight)
fhmax <- max(dd$fheight)

# 101 equally spaced father heights spanning [fhmin, fhmax], and the son
# height the fitted line predicts at each of them.
xdat <- seq(fhmin, fhmax, length.out = 101)
ydat <- ahat + bhat * xdat

# Regression line data frame. Columns carry the same names as the data they
# correspond to: fheight is the x value, sheight the predicted y value.
regressionLine <- data.frame(fheight = xdat, sheight = ydat)

# Plot of the data set with the regression line overlaid. print() is explicit
# so the plot is also drawn when the script is run through source().
print(
  g +
    geom_line(
      data = regressionLine,
      aes(x = fheight, y = sheight),
      lwd = 1.5,
      colour = "red"
    )
)

# Fit the same model with R's built-in linear regression for comparison; the
# coefficients should match ahat (intercept) and bhat (slope) above.
fslm <- lm(sheight ~ fheight, data = dd)
fslm %>%
  summary() %>%
  print()