Gaussian Mixture Model and Expectation-Maximization Algorithm - Code
Zhao Zhao

Gaussian Mixture Models

In this notebook, we will look at density modeling with Gaussian mixture models (GMMs). In a Gaussian mixture model, the density of the data is described as a weighted sum of K Gaussian components,

$$p(\boldsymbol{x}) = \sum_{k=1}^{K} \pi_k \, \mathcal{N}\big(\boldsymbol{x} \mid \boldsymbol{\mu}_k, \boldsymbol{\Sigma}_k\big), \qquad \pi_k \ge 0, \quad \sum_{k=1}^{K} \pi_k = 1,$$

where $\pi_k$ are the mixture weights and $\boldsymbol{\mu}_k$, $\boldsymbol{\Sigma}_k$ are the mean and covariance of the $k$-th component.

Define a GMM from which we generate data

Set up the true GMM from which we will generate data.

import numpy as np 
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal
import scipy.linalg as la
import matplotlib.cm as cm
from matplotlib import rc
import time
from IPython import display

# Choose a GMM with 3 components

# means
m = np.zeros((3,2))
m[0] = np.array([1.2, 0.4])
m[1] = np.array([-4.4, 1.0])
m[2] = np.array([4.1, -0.3])

# covariances
S = np.zeros((3,2,2))
S[0] = np.array([[0.8, -0.4], [-0.4, 1.0]])
S[1] = np.array([[1.2, -0.8], [-0.8, 1.0]])
S[2] = np.array([[1.2, 0.6], [0.6, 3.0]])

# mixture weights
w = np.array([0.3, 0.2, 0.5])
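
Before generating data, it is worth checking that each covariance matrix is symmetric positive definite; a Cholesky factorization fails if it is not. A minimal sanity-check sketch:

# Sanity check: every covariance must be symmetric positive definite.
for k in range(3):
    np.linalg.cholesky(S[k])  # raises LinAlgError if S[k] is not a valid covariance
print("All covariance matrices are positive definite.")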

Generate some data

N_split = 200  # number of data points per mixture component
N = N_split*3  # total number of data points
x = []
y = []
for k in range(3):
    x_tmp, y_tmp = np.random.multivariate_normal(m[k], S[k], N_split).T
    x = np.hstack([x, x_tmp])
    y = np.hstack([y, y_tmp])

data = np.vstack([x, y])
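
Note that this draws exactly N_split points from every component, so the mixture weights w play no role in the sampling. A minimal sketch of ancestral sampling that does respect the weights (first draw a component index according to w, then draw from that Gaussian; the seed is arbitrary):

rng = np.random.default_rng(0)
ks = rng.choice(3, size=N, p=w)  # component index for each sample, drawn with probabilities w
samples = np.array([rng.multivariate_normal(m[k], S[k]) for k in ks])  # shape (N, 2)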
clear
clc
close all
% Choose a GMM with 3 components with 2-D data

% define the means
m = zeros(3, 2);
m(1, :) = [1.2, 0.4];
m(2, :) = [-4.4, 1.0];
m(3, :) = [4.1, -0.3];

% define the covariances
S = zeros(2, 2, 3);
S(:, :, 1) = [0.8, -0.4; -0.4, 1.0];
S(:, :, 2) = [1.2, -0.8; -0.8, 1.0];
S(:, :, 3) = [1.2, 0.6; 0.6, 3.0];

% mixture weights
w = [0.3, 0.2, 0.5];

rng(10)
N_split = 200; % number of data points per mixture component
N = N_split*3; % total number of data points
data = [];
for k = 1:3
    tmp = mvnrnd(m(k, :), S(:, :, k), N_split);
    data = [data; tmp];
end
x = data(:, 1);
y = data(:, 2);

Visualization of the dataset

X, Y = np.meshgrid(np.linspace(-10, 10, 100), np.linspace(-10, 10, 100))
pos = np.dstack((X, Y))

mvn = multivariate_normal(m[0,:].ravel(), S[0,:,:])  # mean and covariance of the first Gaussian component
xx = mvn.pdf(pos)  # density of that component evaluated on the (X, Y) grid

# plot the dataset
plt.figure()
plt.title("Mixture components")
plt.plot(x, y, 'ko', alpha=0.3)  # scatter plot of the data
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")

# plot the individual components of the GMM
plt.plot(m[:,0], m[:,1], 'or')  # means of the three components

for k in range(3):
    mvn = multivariate_normal(m[k,:].ravel(), S[k,:,:])
    xx = mvn.pdf(pos)
    plt.contour(X, Y, xx, alpha=1.0, zorder=10)  # contours of component k on the grid

# plot the GMM
plt.figure()
plt.title("GMM")
plt.plot(x, y, 'ko', alpha=0.3)  # scatter plot of the data
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")

# build the GMM
gmm = 0
for k in range(3):
    mix_comp = multivariate_normal(m[k,:].ravel(), S[k,:,:])
    gmm += w[k]*mix_comp.pdf(pos)  # weight each component by w[k]

plt.contour(X, Y, gmm, alpha=1.0, zorder=10);  # contours of the full mixture density
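
Since the mixture weights sum to one, the surface plotted above is itself a probability density. A quick Riemann-sum check on the grid (a sketch, assuming the X, Y and gmm arrays from above) should give a value close to 1:

# Approximate the integral of the GMM density over the plotted grid.
cell_area = (X[0, 1] - X[0, 0]) * (Y[1, 0] - Y[0, 0])
print(np.sum(gmm) * cell_area)  # expected: close to 1.0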
% plot the dataset
figure(1)
plot(x, y, 'o')
hold on
scatter1 = scatter(x, y, 'MarkerFaceColor', 'k', 'MarkerEdgeColor', 'k');
scatter1.MarkerFaceAlpha = .3;

% plot the individual components of the GMM
plot(m(:, 1), m(:, 2), 'o')
scatter2 = scatter(m(:, 1), m(:, 2), 'MarkerFaceColor', 'r', 'MarkerEdgeColor', 'r');
scatter2.MarkerFaceAlpha = .1;

% plot the Gaussian components on the mesh grid
[X, Y] = meshgrid(linspace(-10, 10, 100), linspace(-10, 10, 100));
for k = 1:3
    p = mvnpdf([X(:) Y(:)], m(k, :), S(:, :, k)); % Gaussian probability density
    Z = reshape(p, size(X));                      % reshape the density values onto the grid
    contour(X, Y, Z, 'LineWidth', 1.2)
    hold on
end
xlabel('$x_1$', "Interpreter", "latex")
ylabel('$x_2$', "Interpreter", "latex")
title('Mixture components')
hold off

% plot the GMM
figure(2)
plot(x, y, 'o')
hold on
scatter1 = scatter(x, y, 'MarkerFaceColor', 'k', 'MarkerEdgeColor', 'k');
scatter1.MarkerFaceAlpha = .3;

% plot the individual components of the GMM
plot(m(:, 1), m(:, 2), 'o')
scatter2 = scatter(m(:, 1), m(:, 2), 'MarkerFaceColor', 'r', 'MarkerEdgeColor', 'r');
scatter2.MarkerFaceAlpha = .1;
xlabel('$x_1$', "Interpreter", "latex")
ylabel('$x_2$', "Interpreter", "latex")
title('GMM')

[X, Y] = meshgrid(linspace(-10, 10, 100), linspace(-10, 10, 100));
gmm = zeros(100, 100);
for k = 1:3
    p = mvnpdf([X(:) Y(:)], m(k, :), S(:, :, k)); % Gaussian probability density
    gmm = gmm + w(k).*reshape(p, size(X));        % weighted sum of the components on the grid
end
contour(X, Y, gmm, 'LineWidth', 1.2)
hold off

Train the GMM via EM

Initialize the parameters for EM
K = 3  # number of clusters

means = np.zeros((K,2))
covs = np.zeros((K,2,2))
for k in range(K):
    means[k] = np.random.normal(size=(2,))
    covs[k] = np.eye(2)

weights = np.ones((K,1))/K
print("Initial mean vectors (one per row):\n" + str(means))

K = 3; % number of clusters
means = zeros(K, 2);
covs = zeros(2, 2, K);
for k = 1:K
    means(k, :) = randn(1, 2);
    covs(:, :, k) = eye(2);
end
weights = ones(K, 1)/K;

Calculate the log-likelihood of the GMM
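
The quantity we monitor during training is the negative log-likelihood of the data under the current mixture parameters,

$$\mathrm{NLL} = -\sum_{n=1}^{N} \log \left( \sum_{k=1}^{K} \pi_k \, \mathcal{N}(\boldsymbol{x}_n \mid \boldsymbol{\mu}_k, \boldsymbol{\Sigma}_k) \right),$$

which EM never increases from one iteration to the next.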

NLL = []  # negative log-likelihood of the GMM after each EM iteration
gmm_nll = 0
for k in range(K):
    gmm_nll += weights[k]*multivariate_normal.pdf(mean=means[k,:], cov=covs[k,:,:], x=data.T)  # likelihood of each data point
NLL += [-np.sum(np.log(gmm_nll))]

plt.figure()
plt.plot(x, y, 'ko', alpha=0.3)
plt.plot(means[:,0], means[:,1], 'oy', markersize=25)

for k in range(K):
    rv = multivariate_normal(means[k,:], covs[k,:,:])
    plt.contour(X, Y, rv.pdf(pos), alpha=1.0, zorder=10)  # contours of the components at the initial parameters

plt.xlabel("$x_1$");
plt.ylabel("$x_2$");
NLL = 0; % negative log-likelihood of the GMM
gmm_nll = 0;
for k = 1:K
    gmm_nll = gmm_nll + weights(k).* mvnpdf(data, means(k, :), covs(:, :, k));
end
NLL = NLL + (-sum(log(gmm_nll)));
NLL_all(1) = NLL;

% plot the GMM with initial parameters
figure(3)
plot(x, y, 'o')
hold on
scatter1 = scatter(x, y, 'MarkerFaceColor', 'k', 'MarkerEdgeColor', 'k');
scatter1.MarkerFaceAlpha = .3;

xlabel('$x_1$', "Interpreter", "latex")
ylabel('$x_2$', "Interpreter", "latex")
title('Initial GMM')

[X, Y] = meshgrid(linspace(-10, 10, 100), linspace(-10, 10, 100));
gmm = zeros(100, 100);
for k = 1:K
    p = mvnpdf([X(:) Y(:)], means(k, :), covs(:, :, k)); % Gaussian probability density
    Z = reshape(p, size(X));                             % reshape the density values onto the grid
    contour(X, Y, Z, 'LineWidth', 1.2)
    hold on
end
hold off

First, we define the responsibilities (which are updated in the E-step), given the current model parameters, as

$$r_{nk} = \frac{\pi_k \, \mathcal{N}(\boldsymbol{x}_n \mid \boldsymbol{\mu}_k, \boldsymbol{\Sigma}_k)}{\sum_{j=1}^{K} \pi_j \, \mathcal{N}(\boldsymbol{x}_n \mid \boldsymbol{\mu}_j, \boldsymbol{\Sigma}_j)}.$$

Given the responsibilities we just defined, we can update the model parameters in the M-step as follows:

$$N_k = \sum_{n=1}^{N} r_{nk}, \qquad \boldsymbol{\mu}_k = \frac{1}{N_k}\sum_{n=1}^{N} r_{nk}\,\boldsymbol{x}_n, \qquad \boldsymbol{\Sigma}_k = \frac{1}{N_k}\sum_{n=1}^{N} r_{nk}\,(\boldsymbol{x}_n - \boldsymbol{\mu}_k)(\boldsymbol{x}_n - \boldsymbol{\mu}_k)^\top, \qquad \pi_k = \frac{N_k}{N}.$$

EM Algorithm
r = np.zeros((K,N))  # will store the responsibilities

for em_iter in range(100):
    # E-step: update responsibilities
    for k in range(K):
        r[k] = weights[k]*multivariate_normal.pdf(mean=means[k,:], cov=covs[k,:,:], x=data.T)

    r = r/np.sum(r, axis=0)  # normalise over the components for each data point

    # M-step
    N_k = np.sum(r, axis=1)

    for k in range(K):
        # update means
        means[k] = np.sum(r[k]*data, axis=1)/N_k[k]

        # update covariances (square-root trick for the weighted sum of outer products)
        diff = data - means[k:k+1].T
        _tmp = np.sqrt(r[k:k+1])*diff
        covs[k] = np.inner(_tmp, _tmp)/N_k[k]

    # update weights
    weights = N_k/N

    # negative log-likelihood
    gmm_nll = 0
    for k in range(K):
        gmm_nll += weights[k]*multivariate_normal.pdf(mean=means[k,:].ravel(), cov=covs[k,:,:], x=data.T)
    NLL += [-np.sum(np.log(gmm_nll))]

    plt.figure()
    plt.plot(x, y, 'ko', alpha=0.3)
    plt.plot(means[:,0], means[:,1], 'oy', markersize=25)
    for k in range(K):
        rv = multivariate_normal(means[k,:], covs[k])
        plt.contour(X, Y, rv.pdf(pos), alpha=1.0, zorder=10)

    plt.xlabel("$x_1$")
    plt.ylabel("$x_2$")
    plt.text(x=3.5, y=8, s="EM iteration "+str(em_iter+1))

    if la.norm(NLL[em_iter+1]-NLL[em_iter]) < 1e-6:
        print("Converged after iteration ", em_iter+1)
        break

# plot the final mixture model
plt.figure()
gmm = 0
for k in range(K):
    mix_comp = multivariate_normal(means[k,:].ravel(), covs[k,:,:])
    gmm += weights[k]*mix_comp.pdf(pos)

plt.plot(x, y, 'ko', alpha=0.3)
plt.contour(X, Y, gmm, alpha=1.0, zorder=10)
plt.xlim([-8, 8]);
plt.ylim([-6, 6]);

plt.figure()
plt.semilogy(np.linspace(1, len(NLL), len(NLL)), NLL)
plt.xlabel("EM iteration");
plt.ylabel("Negative log-likelihood");

# mark selected iterations on the NLL curve
idx = [0, 1, 9, em_iter+1]
for i in idx:
    plt.plot(i+1, NLL[i], 'or')
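
The covariance update above avoids an explicit loop over the data: since $(\sqrt{r_{nk}}\,\boldsymbol{d}_n)(\sqrt{r_{nk}}\,\boldsymbol{d}_n)^\top = r_{nk}\,\boldsymbol{d}_n\boldsymbol{d}_n^\top$ with $\boldsymbol{d}_n = \boldsymbol{x}_n - \boldsymbol{\mu}_k$, the weighted sum of outer products can be written as np.inner of the scaled deviation matrix with itself. A quick numerical check (a sketch, assuming the r, means, N_k and data variables left over from the EM loop above):

# Explicit weighted sum of outer products for component 0 ...
diff = data - means[0:1].T
explicit = sum(r[0, n] * np.outer(diff[:, n], diff[:, n]) for n in range(N)) / N_k[0]
# ... versus the square-root trick used in the M-step above.
trick = np.inner(np.sqrt(r[0:1]) * diff, np.sqrt(r[0:1]) * diff) / N_k[0]
print(np.allclose(explicit, trick))  # expected: True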
r = zeros(K, N); % store the responsibilities r(k, n)
N_iter = 100;
for iter = 1:N_iter
    % E-step: update responsibilities
    for k = 1:K
        r(k, :) = weights(k).* mvnpdf(data, means(k, :), covs(:, :, k));
    end
    r = r./sum(r); % sum(r) sums over the components, normalising each column (data point)

    % M-step
    N_k = sum(r, 2);

    % update the means
    for k = 1:K
        tmp = 0;
        for n = 1:N
            tmp = tmp + r(k, n)*data(n, :);
        end
        means(k, :) = tmp/N_k(k);
    end

    % update the covariances
    for k = 1:K
        tmp_covs = 0;
        for n = 1:N
            tmp_covs = tmp_covs + r(k, n)*(data(n, :) - means(k, :))' * (data(n, :) - means(k, :));
        end
        covs(:, :, k) = tmp_covs/N_k(k);
    end

    % update weights
    weights = N_k/N;

    % negative log-likelihood
    gmm_nll = 0;
    for k = 1:K
        gmm_nll = gmm_nll + weights(k).* mvnpdf(data, means(k, :), covs(:, :, k));
    end
    NLL = -sum(log(gmm_nll));
    NLL_all(iter+1) = NLL;

    % plot the current components every 10 iterations
    if mod(iter, 10) == 0
        figure(iter+4)
        plot(x, y, 'o')
        hold on
        scatter1 = scatter(x, y, 'MarkerFaceColor', 'k', 'MarkerEdgeColor', 'k');
        scatter1.MarkerFaceAlpha = .3;

        xlabel('$x_1$', "Interpreter", "latex")
        ylabel('$x_2$', "Interpreter", "latex")
        title(['Iteration-', num2str(iter)])

        [X, Y] = meshgrid(linspace(-10, 10, 100), linspace(-10, 10, 100));
        for k = 1:K
            p = mvnpdf([X(:) Y(:)], means(k, :), covs(:, :, k)); % Gaussian probability density
            Z = reshape(p, size(X));                             % reshape the density values onto the grid
            contour(X, Y, Z, 'LineWidth', 1.2)
            hold on
        end
        hold off
    end

    % check convergence of the negative log-likelihood
    if abs(NLL_all(iter+1) - NLL_all(iter)) < 1e-6
        disp(['Converged after iteration-', num2str(iter+1)])
        break
    end
end

% plot the NLL
figure(222)
plot(1:length(NLL_all), NLL_all)
xlabel('EM iteration');
ylabel('Negative log-likelihood');
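
As a final sanity check, the learned parameters can be compared against the true ones used to generate the data. Since EM may recover the components in a different order, each learned mean is matched to its nearest true mean first. A minimal sketch, assuming the variables from the Python cells above:

# Compare learned parameters to the ground truth (components may be permuted).
for k in range(K):
    j = np.argmin(np.linalg.norm(m - means[k], axis=1))  # index of the closest true component
    print("component", k, ": learned mean", np.round(means[k], 2), "true mean", m[j],
          "| learned weight", np.round(weights[k], 2), "true weight", w[j])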