K-Means Clustering
Implementation
kmeans <- function(X, K, tol=0.0001, maxit=1000) {
# X dimensions
N <- nrow(X)
J <- ncol(X)
# Distance to centroid function
cdist <- function(x, c, k) {
rowSums(t(t(x) - c[k,])^2)
}
# Vector of cluster memberships
s <- numeric(N)
# Distances to each centroid
d <- matrix(0, nrow=N, ncol=K)
# Kmeans++ initialisation
C <- matrix(0, nrow=K, ncol=J) #centroids on rows, variables on columns
C[1,] <- X[sample(1:N, 1),]
for (k in 2:K) {
d[,k-1] <- cdist(X, C, k-1) #compute distances to previously initialised centroid
if (k == 2) w <- d[,1] / sum(d,1)
if (k > 2) {
w <- apply(d[,1:(k-1)], 1, min)
w <- w/sum(w)
}
C[k,] <- X[sample(1:N, 1, prob=w),]
}
# Loop
diff <- Inf
it <- 0
while (diff > tol & it < maxit) {
# Membership step
for (k in 1:K) {
d[,k] <- cdist(X, C, k)
}
s <- apply(d, 1, which.min)
# Centroid mean step
old <- C
for (k in 1:K) {
C[k,] <- apply(X[s == k,], 2, mean)
}
# Checks
it <- it + 1
diff <- max(abs(C - old))
}
# Output
return(list(iterations = it,
centroids = C,
clusters = s))
}Testing the function:
# Simulate draws from 3 distributions
set.seed(1234)
n <- 500
x1 <- c(rnorm(n, -3, 1), rnorm(n, 0, 1), rnorm(n, 3, 1))
x2 <- c(rnorm(n, 0, 1), rnorm(n, -3, 1), rnorm(n, 3, 1))
# Combine into matrix
m <- matrix(c(x1, x2), nrow=n*3, ncol=2)
# Estimate clusters
est <- kmeans(m, 3)
# Plot
library(ggplot2)
ggplot(mapping=aes(x=x1, y=x2, color=est$clusters)) +
geom_point() +
theme(aspect.ratio = 1)