Gaussian Mixture Models
Univariate Gaussian Mixture Models
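For reference, the model being fitted below is a K-component mixture of univariate normals. Writing θ_k for the variance of component k (matching the code's `theta`) and π_k for the mixing proportions, the density is

$$
p(x) = \sum_{k=1}^{K} \pi_k \, \mathcal{N}\!\left(x \mid \mu_k, \theta_k\right),
\qquad \sum_{k=1}^{K} \pi_k = 1, \quad \pi_k \ge 0.
$$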
Implementation
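The function below iterates the standard EM updates for this model; the formulas here simply restate what the code does. The E-step computes the responsibilities

$$
\gamma_{ik} = \frac{\pi_k \, \mathcal{N}(x_i \mid \mu_k, \theta_k)}{\sum_{j=1}^{K} \pi_j \, \mathcal{N}(x_i \mid \mu_j, \theta_j)},
$$

and the M-step re-estimates the parameters from them:

$$
N_k = \sum_{i=1}^{N} \gamma_{ik}, \qquad
\mu_k = \frac{1}{N_k} \sum_{i=1}^{N} \gamma_{ik}\, x_i, \qquad
\theta_k = \frac{1}{N_k} \sum_{i=1}^{N} \gamma_{ik}\,(x_i - \mu_k)^2, \qquad
\pi_k = \frac{N_k}{N}.
$$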
ugmm <- function(X, K, tol=0.0001, maxit=1000) {
  # Number of observations
  N <- length(X)
  # Initialise means by sampling K points from the data
  mu <- sample(X, K)
  # Initialise variances to the overall sample variance
  theta <- rep(var(X), K)
  # Initialise mixing proportions
  prop <- rep(1/K, K)
  # Initialise the responsibility matrix (one column per component)
  g <- matrix(0, nrow=N, ncol=K)
  # Initialise the convergence criterion
  diff <- Inf
  old <- 0
  # Initialise iteration counter
  it <- 0
  # EM algorithm
  while (it < maxit) {
    # E-step: responsibilities under the current parameters
    for (k in 1:K) {
      g[, k] <- prop[k] * dnorm(X, mu[k], sqrt(theta[k]))
    }
    g <- g / rowSums(g)
    # M-step: effective number of observations per component
    Nk <- colSums(g)
    for (k in 1:K) {
      # Weighted mean mu_k
      mu[k] <- sum(g[, k] * X) / Nk[k]
      # Weighted variance theta_k
      theta[k] <- sum(g[, k] * (X - mu[k])^2) / Nk[k]
      # Mixing proportion
      prop[k] <- Nk[k] / N
    }
    # Expected complete-data log-likelihood, used as the convergence criterion
    lli <- numeric(N)
    for (k in 1:K) {
      lli <- lli + g[, k] * (log(prop[k]) + dnorm(X, mu[k], sqrt(theta[k]), log=TRUE))
    }
    ll <- sum(lli)
    # Recalculate the change since the previous iteration
    diff <- abs(ll - old)
    # Update iteration counter
    it <- it + 1
    # Stop if converged
    if (diff < tol) break
    # Otherwise store the current value for the next comparison
    old <- ll
  }
  # Return the estimates; theta is reported on the standard-deviation scale
  return(list(it=it, mu=mu, theta=sqrt(theta), prop=prop))
}

Testing the function:
# Simulate draws from 3 distributions
set.seed(1234)
x1 <- rnorm(1000, 0, 1)
x2 <- rnorm(1500, -4, 2)
x3 <- rnorm(500, 2, 0.5)
# Combine into single vector
x <- c(x1, x2, x3)
# Density Plot
plot(density(x))
# Estimate clusters
ugmm(x, 3)

## $it
## [1] 231
##
## $mu
## [1] -3.9174651 1.9712504 -0.1162445
##
## $theta
## [1] 2.0231925 0.5045694 0.8715375
##
## $prop
## [1] 0.5054033 0.1863845 0.3082122
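As a quick follow-up sketch (not part of the original function; `fit`, `responsibilities`, `grid` and `mixture_density` are illustrative names), the returned parameters can be used to recover cluster assignments and to overlay the fitted mixture density on the kernel density plot:

# Refit and keep the result (initialisation is random, so estimates may vary slightly)
fit <- ugmm(x, 3)

# Posterior probability that each observation belongs to each component;
# fit$theta is already on the standard-deviation scale, as dnorm() expects
responsibilities <- sapply(1:3, function(k) {
  fit$prop[k] * dnorm(x, fit$mu[k], fit$theta[k])
})
responsibilities <- responsibilities / rowSums(responsibilities)

# Hard cluster assignment: the component with the largest responsibility
cluster <- max.col(responsibilities)
table(cluster)

# Fitted mixture density evaluated on a grid
grid <- seq(min(x), max(x), length.out = 500)
mixture_density <- rowSums(sapply(1:3, function(k) {
  fit$prop[k] * dnorm(grid, fit$mu[k], fit$theta[k])
}))

# Compare the kernel density estimate with the fitted mixture
plot(density(x), main = "Kernel density vs fitted mixture")
lines(grid, mixture_density, lty = 2)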