# Scree Plot for Hierarchical clustering
# https://www.youtube.com/watch?v=aMYCFtoBrdA by Gopal Malakar
library(ggplot2)
### 1. Data & EDA ----
# (Hierarchical clustering with the sample data)
# Way 1: parse the sample from an inline, whitespace-delimited string.
temp_str <- "Name physics math
P 15 20
Q 20 15
R 26 21
X 44 52
Y 50 45
Z 57 38
A 80 85
B 90 88
C 98 98"
# read.table(text = ) opens and closes its own text connection, so there is
# no need for closeAllConnections(), which would also close every unrelated
# user connection and undo any active sink() diversion.
base_data <- read.table(text = temp_str, header = TRUE)
# Way 2: build the same table directly (this overwrites the Way 1 result).
base_data <- data.frame(
  Name = c("P", "Q", "R", "X", "Y", "Z", "A", "B", "C"),
  physics = c(15, 20, 26, 44, 50, 57, 80, 90, 98),
  math = c(20, 15, 21, 52, 45, 38, 85, 88, 98)
)
# Check the structure and dimensions of the data.
# Plain calls are used instead of `%>%`: the pipe comes from magrittr, and
# only ggplot2 is attached in this script.
str(base_data)
dim(base_data)
# Scatter plot of the raw scores; each point is annotated with the
# student's name, left-aligned and shifted one unit to the right.
ggplot(data = base_data, mapping = aes(x = physics, y = math, label = Name)) +
  geom_point() +
  geom_text(hjust = 0, nudge_x = 1) +
  ggtitle("Base Data") +
  theme_bw()
### 2. Obtain distance matrix ----
# Euclidean distances computed on columns 2 and 3 only (the numeric scores);
# column 1 holds the names and is left out.
my_dist <- dist(base_data[c(2, 3)], method = "euclidean")
### 3. Apply Hierarchical Clustering ----
fit <- hclust(my_dist, method = "ward.D2")
# Inspect the components of the fitted tree. A plain call is used instead of
# `%>%`: the pipe comes from magrittr, and only ggplot2 is attached here.
str(fit)
### 4. Decide # of Cluster ----
# 4.1. Scree Plot
# Way 1
# The i-th merge height is paired with the cluster count in descending
# order: the first height with length(fit$height), the last with 1.
ggplot(
  data.frame(
    n_clusters = rev(seq_along(fit$height)),
    merge_height = fit$height
  ),
  aes(x = n_clusters, y = merge_height)
) +
  geom_point() +
  geom_line() +
  theme_bw() +
  ggtitle("Scree Plot of HCluster(euclidean, ward.D2)") +
  xlab("# of clusters") +
  ylab("Height")
# Way 2
# A 0 is prepended to the merge heights so the vector has one entry per
# observation; the x axis is derived from the tree (fit$order has one entry
# per observation) rather than hard-coding 9.
plot(rev(seq_along(fit$order)), c(0, fit$height), type = "b",
     xlab = "# of clusters", ylab = "Dendogram Height")
# Way 3
# Same vector as Way 2, built in one step instead of growing it element by
# element inside the loop.
Dendogram_Height <- c(0, fit$height)
fit$height
# Print each stored height next to its index (from 2 onwards), as before.
for (i in seq_along(Dendogram_Height)[-1]) {
  print(paste0(i, " : ", Dendogram_Height[i]))
}
plot(rev(seq_along(Dendogram_Height)), Dendogram_Height, type = "b",
     xlab = "# of clusters", ylab = "Dendogram Height")
### Way 2 - Dendrogram
# Step through k = 8, 7, 6: redraw the dendrogram each time and outline the
# k clusters in red.
# (One can use this step to take a look at execution)
invisible(lapply(c(8, 7, 6), function(k) {
  plot(fit, labels = base_data$Name)
  rect.hclust(fit, k = k, border = "red")
}))
# Final view: outline the required 3 clusters in blue.
plot(fit, labels = base_data$Name)
rect.hclust(fit, k = 3, border = "blue")
### 5. Cut tree (into 3 clusters) ----
# Cut the fitted tree at k = 3 and keep the result in my_groups.
my_groups <- cutree(tree = fit, k = 3)
# (Original) Scree_Plot_for_Hierarchical_clustering_Using_R.R
#-----------------------------------------------
# Hierarchical clustering with the sample data
#------------------------------------------------
# Reading data into R similar to CARDS: the table is embedded in the script
# as a whitespace-delimited string with a header row.
temp_str <- "Name physics math
P 15 20
Q 20 15
R 26 21
X 44 52
Y 50 45
Z 57 38
A 80 85
B 90 88
C 98 98"
# read.table(text = ) opens and closes its own text connection, so there is
# no need for closeAllConnections(), which would also close every unrelated
# user connection and undo any active sink() diversion.
base_data <- read.table(text = temp_str, header = TRUE)
# Check the type of each variable using str()
str(base_data)
# Plot data
# The fill colour is looked up by the integer level code of Name.
# as.integer(factor()) yields those codes whether Name is a character vector
# (the read.table() default since R 4.0) or already a factor. unclass() on a
# character vector returns the strings themselves, which index the unnamed
# colour vector as NA and leave every point unfilled.
plot(base_data$physics, base_data$math,
     pch = 21,
     bg = c("red", "green3", "blue", "red", "green3", "blue",
            "red", "green3", "blue")[as.integer(factor(base_data$Name))],
     main = "Base Data")
# Step 01 - build the Euclidean distance matrix from columns 2 and 3
# (the numeric scores) and print it
my_dist <- dist(x = base_data[c(2, 3)], method = "euclidean")
print(my_dist)
# Step 02 - hierarchical clustering on those distances (Ward's method, D2)
fit <- hclust(d = my_dist, method = "ward.D2")
# Step 03 - Display dendrogram
plot(fit, labels = base_data$Name)
# Merge heights with a leading 0, giving one entry per observation. Built in
# one step from the tree instead of a loop over a hard-coded 2:9.
Dendogram_Height <- c(0, fit$height)
# Heights in merge order ...
plot(seq_along(Dendogram_Height), Dendogram_Height, type = "b",
     xlab = "Sequence of merging", ylab = "Dendogram Height")
# ... and the same heights against a descending cluster count (scree plot).
plot(rev(seq_along(Dendogram_Height)), Dendogram_Height, type = "b",
     xlab = "# of clusters", ylab = "Dendogram Height")
# Step 04 - draw dendrogram with color borders
# One can use this step to take a look at execution
# rect.hclust() adds rectangles to whatever plot is currently active. The
# preceding statements leave a scree plot active, so the dendrogram is
# redrawn first; otherwise the k = 8 rectangles land on the scree plot.
plot(fit, labels = base_data$Name)
rect.hclust(fit, k = 8, border = "red")
plot(fit, labels = base_data$Name)
rect.hclust(fit, k = 7, border = "red")
plot(fit, labels = base_data$Name)
rect.hclust(fit, k = 6, border = "red")
# draw color borders around the required clusters
plot(fit, labels = base_data$Name)
rect.hclust(fit, k = 3, border = "blue")
# Cut the fitted tree at k = 3 and keep the result in my_groups
my_groups <- cutree(tree = fit, k = 3)