K-Means Clustering
Implementation
<- function(X, K, tol=0.0001, maxit=1000) {
kmeans
# X dimensions
<- nrow(X)
N <- ncol(X)
J
# Distance to centroid function
<- function(x, c, k) {
cdist rowSums(t(t(x) - c[k,])^2)
}
# Vector of cluster memberships
<- numeric(N)
s
# Distances to each centroid
<- matrix(0, nrow=N, ncol=K)
d
# Kmeans++ initialisation
<- matrix(0, nrow=K, ncol=J) #centroids on rows, variables on columns
C 1,] <- X[sample(1:N, 1),]
C[for (k in 2:K) {
-1] <- cdist(X, C, k-1) #compute distances to previously initialised centroid
d[,kif (k == 2) w <- d[,1] / sum(d,1)
if (k > 2) {
<- apply(d[,1:(k-1)], 1, min)
w <- w/sum(w)
w
}<- X[sample(1:N, 1, prob=w),]
C[k,]
}
# Loop
<- Inf
diff <- 0
it while (diff > tol & it < maxit) {
# Membership step
for (k in 1:K) {
<- cdist(X, C, k)
d[,k]
}<- apply(d, 1, which.min)
s
# Centroid mean step
<- C
old for (k in 1:K) {
<- apply(X[s == k,], 2, mean)
C[k,]
}
# Checks
<- it + 1
it <- max(abs(C - old))
diff
}
# Output
return(list(iterations = it,
centroids = C,
clusters = s))
}
Testing the function:
# Simulate draws from 3 distributions
set.seed(1234)
<- 500
n <- c(rnorm(n, -3, 1), rnorm(n, 0, 1), rnorm(n, 3, 1))
x1 <- c(rnorm(n, 0, 1), rnorm(n, -3, 1), rnorm(n, 3, 1))
x2
# Combine into matrix
<- matrix(c(x1, x2), nrow=n*3, ncol=2)
m
# Estimate clusters
<- kmeans(m, 3)
est
# Plot
library(ggplot2)
ggplot(mapping=aes(x=x1, y=x2, color=est$clusters)) +
geom_point() +
theme(aspect.ratio = 1)