Gaussian Mixture Model and Expectation-Maximization Algorithm - Code
Zhao Zhao

Gaussian Mixture Models

In this notebook, we will look at density modeling with Gaussian mixture models (GMMs). In a Gaussian mixture model, the density of the data is described as a weighted sum of K Gaussian components,

$$p(\boldsymbol{x}) = \sum_{k=1}^{K} \pi_k \, \mathcal{N}\big(\boldsymbol{x} \mid \boldsymbol{\mu}_k, \boldsymbol{\Sigma}_k\big), \qquad \pi_k \ge 0, \quad \sum_{k=1}^{K} \pi_k = 1,$$

where $\pi_k$ are the mixture weights and $\boldsymbol{\mu}_k$, $\boldsymbol{\Sigma}_k$ are the mean and covariance of the $k$-th component.

Define a GMM from which we generate data

Set up the true GMM from which we will generate data.

import numpy as np 
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal
import scipy.linalg as la
import matplotlib.cm as cm
from matplotlib import rc
import time
from IPython import display

# Choose a GMM with 3 components

# means
m = np.zeros((3,2))
m[0] = np.array([1.2, 0.4])
m[1] = np.array([-4.4, 1.0])
m[2] = np.array([4.1, -0.3])

# covariances
S = np.zeros((3,2,2))
S[0] = np.array([[0.8, -0.4], [-0.4, 1.0]])
S[1] = np.array([[1.2, -0.8], [-0.8, 1.0]])
S[2] = np.array([[1.2, 0.6], [0.6, 3.0]])

# mixture weights
w = np.array([0.3, 0.2, 0.5])
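
Before generating data, it is worth checking that each covariance matrix is symmetric positive definite; a Cholesky factorization fails if it is not. A minimal sanity-check sketch:

# Sanity check: every covariance must be symmetric positive definite.
for k in range(3):
    np.linalg.cholesky(S[k])  # raises LinAlgError if S[k] is not a valid covariance
print("All covariance matrices are positive definite.")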

Generate some data

N_split = 200  # number of data points per mixture component
N = N_split*3  # total number of data points
x = []
y = []
for k in range(3):
    x_tmp, y_tmp = np.random.multivariate_normal(m[k], S[k], N_split).T
    x = np.hstack([x, x_tmp])
    y = np.hstack([y, y_tmp])

data = np.vstack([x, y])
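
Note that this draws exactly N_split points from every component, so the mixture weights w play no role in the sampling. A minimal sketch of ancestral sampling that does respect the weights (first draw a component index according to w, then draw from that Gaussian; the seed is arbitrary):

rng = np.random.default_rng(0)
ks = rng.choice(3, size=N, p=w)  # component index for each sample, drawn with probabilities w
samples = np.array([rng.multivariate_normal(m[k], S[k]) for k in ks])  # shape (N, 2)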
clear
clc
close all
% Choose a GMM with 3 components with 2-D data

% define the means
m = zeros(3, 2);
m(1, :) = [1.2, 0.4];
m(2, :) = [-4.4, 1.0];
m(3, :) = [4.1, -0.3];

% define the covariances
S = zeros(2, 2, 3);
S(:, :, 1) = [0.8, -0.4; -0.4, 1.0];
S(:, :, 2) = [1.2, -0.8; -0.8, 1.0];
S(:, :, 3) = [1.2, 0.6; 0.6, 3.0];

% mixture weights
w = [0.3, 0.2, 0.5];

rng(10)
N_split = 200; % number of data points per mixture component
N = N_split*3; % total number of data points
data = [];
for k = 1:3
    tmp = mvnrnd(m(k, :), S(:, :, k), N_split);
    data = [data; tmp];
end
x = data(:, 1);
y = data(:, 2);

Visualization of the dataset

X, Y = np.meshgrid(np.linspace(-10, 10, 100), np.linspace(-10, 10, 100))
pos = np.dstack((X, Y))

mvn = multivariate_normal(m[0,:].ravel(), S[0,:,:])  # mean and covariance of the first Gaussian component
xx = mvn.pdf(pos)  # density of that component evaluated on the (X, Y) grid

# plot the dataset
plt.figure()
plt.title("Mixture components")
plt.plot(x, y, 'ko', alpha=0.3)  # scatter plot of the data
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")

# plot the individual components of the GMM
plt.plot(m[:,0], m[:,1], 'or')  # means of the three components

for k in range(3):
    mvn = multivariate_normal(m[k,:].ravel(), S[k,:,:])
    xx = mvn.pdf(pos)
    plt.contour(X, Y, xx, alpha=1.0, zorder=10)  # contours of component k on the grid

# plot the GMM
plt.figure()
plt.title("GMM")
plt.plot(x, y, 'ko', alpha=0.3)  # scatter plot of the data
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")

# build the GMM
gmm = 0
for k in range(3):
    mix_comp = multivariate_normal(m[k,:].ravel(), S[k,:,:])
    gmm += w[k]*mix_comp.pdf(pos)  # weight each component by w[k]

plt.contour(X, Y, gmm, alpha=1.0, zorder=10);  # contours of the full mixture density
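
Since the mixture weights sum to one, the surface plotted above is itself a probability density. A quick Riemann-sum check on the grid (a sketch, assuming the X, Y and gmm arrays from above) should give a value close to 1:

# Approximate the integral of the GMM density over the plotted grid.
cell_area = (X[0, 1] - X[0, 0]) * (Y[1, 0] - Y[0, 0])
print(np.sum(gmm) * cell_area)  # expected: close to 1.0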
% plot the dataset
figure(1)
plot(x, y, 'o')
hold on
scatter1 = scatter(x, y, 'MarkerFaceColor', 'k', 'MarkerEdgeColor', 'k');
scatter1.MarkerFaceAlpha = .3;

% plot the individual components of the GMM
plot(m(:, 1), m(:, 2), 'o')
scatter2 = scatter(m(:, 1), m(:, 2), 'MarkerFaceColor', 'r', 'MarkerEdgeColor', 'r');
scatter2.MarkerFaceAlpha = .1;

% plot the Gaussian components on the mesh grid
[X, Y] = meshgrid(linspace(-10, 10, 100), linspace(-10, 10, 100));
for k = 1:3
    p = mvnpdf([X(:) Y(:)], m(k, :), S(:, :, k)); % Gaussian probability density
    Z = reshape(p, size(X));                      % reshape the density values onto the grid
    contour(X, Y, Z, 'LineWidth', 1.2)
    hold on
end
xlabel('$x_1$', "Interpreter", "latex")
ylabel('$x_2$', "Interpreter", "latex")
title('Mixture components')
hold off

% plot the GMM
figure(2)
plot(x, y, 'o')
hold on
scatter1 = scatter(x, y, 'MarkerFaceColor', 'k', 'MarkerEdgeColor', 'k');
scatter1.MarkerFaceAlpha = .3;

% plot the individual components of the GMM
plot(m(:, 1), m(:, 2), 'o')
scatter2 = scatter(m(:, 1), m(:, 2), 'MarkerFaceColor', 'r', 'MarkerEdgeColor', 'r');
scatter2.MarkerFaceAlpha = .1;
xlabel('$x_1$', "Interpreter", "latex")
ylabel('$x_2$', "Interpreter", "latex")
title('GMM')

[X, Y] = meshgrid(linspace(-10, 10, 100), linspace(-10, 10, 100));
gmm = zeros(100, 100);
for k = 1:3
    p = mvnpdf([X(:) Y(:)], m(k, :), S(:, :, k)); % Gaussian probability density
    gmm = gmm + w(k).*reshape(p, size(X));        % weighted sum of the components on the grid
end
contour(X, Y, gmm, 'LineWidth', 1.2)
hold off

Train the GMM via EM

Initialize the parameters for EM
K = 3  # number of clusters

means = np.zeros((K,2))
covs = np.zeros((K,2,2))
for k in range(K):
    means[k] = np.random.normal(size=(2,))
    covs[k] = np.eye(2)

weights = np.ones((K,1))/K
print("Initial mean vectors (one per row):\n" + str(means))

K = 3; % number of clusters
means = zeros(K, 2);
covs = zeros(2, 2, K);
for k = 1:K
    means(k, :) = randn(1, 2);
    covs(:, :, k) = eye(2);
end
weights = ones(K, 1)/K;

Calculate the log-likelihood of the GMM
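
The quantity we monitor during training is the negative log-likelihood of the data under the current mixture parameters,

$$\mathrm{NLL} = -\sum_{n=1}^{N} \log \left( \sum_{k=1}^{K} \pi_k \, \mathcal{N}(\boldsymbol{x}_n \mid \boldsymbol{\mu}_k, \boldsymbol{\Sigma}_k) \right),$$

which EM never increases from one iteration to the next.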

NLL = []  # negative log-likelihood of the GMM after each EM iteration
gmm_nll = 0
for k in range(K):
    gmm_nll += weights[k]*multivariate_normal.pdf(mean=means[k,:], cov=covs[k,:,:], x=data.T)  # likelihood of each data point
NLL += [-np.sum(np.log(gmm_nll))]

plt.figure()
plt.plot(x, y, 'ko', alpha=0.3)
plt.plot(means[:,0], means[:,1], 'oy', markersize=25)

for k in range(K):
    rv = multivariate_normal(means[k,:], covs[k,:,:])
    plt.contour(X, Y, rv.pdf(pos), alpha=1.0, zorder=10)  # contours of the components at the initial parameters

plt.xlabel("$x_1$");
plt.ylabel("$x_2$");
NLL = 0; % negative log-likelihood of the GMM
gmm_nll = 0;
for k = 1:K
    gmm_nll = gmm_nll + weights(k).* mvnpdf(data, means(k, :), covs(:, :, k));
end
NLL = NLL + (-sum(log(gmm_nll)));
NLL_all(1) = NLL;

% plot the GMM with initial parameters
figure(3)
plot(x, y, 'o')
hold on
scatter1 = scatter(x, y, 'MarkerFaceColor', 'k', 'MarkerEdgeColor', 'k');
scatter1.MarkerFaceAlpha = .3;

xlabel('$x_1$', "Interpreter", "latex")
ylabel('$x_2$', "Interpreter", "latex")
title('Initial GMM')

[X, Y] = meshgrid(linspace(-10, 10, 100), linspace(-10, 10, 100));
gmm = zeros(100, 100);
for k = 1:K
    p = mvnpdf([X(:) Y(:)], means(k, :), covs(:, :, k)); % Gaussian probability density
    Z = reshape(p, size(X));                             % reshape the density values onto the grid
    contour(X, Y, Z, 'LineWidth', 1.2)
    hold on
end
hold off

First, we define the responsibilities (which are updated in the E-step), given the current model parameters, as

$$r_{nk} = \frac{\pi_k \, \mathcal{N}(\boldsymbol{x}_n \mid \boldsymbol{\mu}_k, \boldsymbol{\Sigma}_k)}{\sum_{j=1}^{K} \pi_j \, \mathcal{N}(\boldsymbol{x}_n \mid \boldsymbol{\mu}_j, \boldsymbol{\Sigma}_j)}.$$

Given the responsibilities we just defined, we can update the model parameters in the M-step as follows:

$$N_k = \sum_{n=1}^{N} r_{nk}, \qquad \boldsymbol{\mu}_k = \frac{1}{N_k}\sum_{n=1}^{N} r_{nk}\,\boldsymbol{x}_n, \qquad \boldsymbol{\Sigma}_k = \frac{1}{N_k}\sum_{n=1}^{N} r_{nk}\,(\boldsymbol{x}_n - \boldsymbol{\mu}_k)(\boldsymbol{x}_n - \boldsymbol{\mu}_k)^\top, \qquad \pi_k = \frac{N_k}{N}.$$

EM Algorithm
r = np.zeros((K,N))  # will store the responsibilities

for em_iter in range(100):
    # E-step: update responsibilities
    for k in range(K):
        r[k] = weights[k]*multivariate_normal.pdf(mean=means[k,:], cov=covs[k,:,:], x=data.T)

    r = r/np.sum(r, axis=0)  # normalise over the components for each data point

    # M-step
    N_k = np.sum(r, axis=1)

    for k in range(K):
        # update means
        means[k] = np.sum(r[k]*data, axis=1)/N_k[k]

        # update covariances (square-root trick for the weighted sum of outer products)
        diff = data - means[k:k+1].T
        _tmp = np.sqrt(r[k:k+1])*diff
        covs[k] = np.inner(_tmp, _tmp)/N_k[k]

    # update weights
    weights = N_k/N

    # negative log-likelihood
    gmm_nll = 0
    for k in range(K):
        gmm_nll += weights[k]*multivariate_normal.pdf(mean=means[k,:].ravel(), cov=covs[k,:,:], x=data.T)
    NLL += [-np.sum(np.log(gmm_nll))]

    plt.figure()
    plt.plot(x, y, 'ko', alpha=0.3)
    plt.plot(means[:,0], means[:,1], 'oy', markersize=25)
    for k in range(K):
        rv = multivariate_normal(means[k,:], covs[k])
        plt.contour(X, Y, rv.pdf(pos), alpha=1.0, zorder=10)

    plt.xlabel("$x_1$")
    plt.ylabel("$x_2$")
    plt.text(x=3.5, y=8, s="EM iteration "+str(em_iter+1))

    if la.norm(NLL[em_iter+1]-NLL[em_iter]) < 1e-6:
        print("Converged after iteration ", em_iter+1)
        break

# plot the final mixture model
plt.figure()
gmm = 0
for k in range(K):
    mix_comp = multivariate_normal(means[k,:].ravel(), covs[k,:,:])
    gmm += weights[k]*mix_comp.pdf(pos)

plt.plot(x, y, 'ko', alpha=0.3)
plt.contour(X, Y, gmm, alpha=1.0, zorder=10)
plt.xlim([-8, 8]);
plt.ylim([-6, 6]);

plt.figure()
plt.semilogy(np.linspace(1, len(NLL), len(NLL)), NLL)
plt.xlabel("EM iteration");
plt.ylabel("Negative log-likelihood");

# mark selected iterations on the NLL curve
idx = [0, 1, 9, em_iter+1]
for i in idx:
    plt.plot(i+1, NLL[i], 'or')
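
The covariance update above avoids an explicit loop over the data: since $(\sqrt{r_{nk}}\,\boldsymbol{d}_n)(\sqrt{r_{nk}}\,\boldsymbol{d}_n)^\top = r_{nk}\,\boldsymbol{d}_n\boldsymbol{d}_n^\top$ with $\boldsymbol{d}_n = \boldsymbol{x}_n - \boldsymbol{\mu}_k$, the weighted sum of outer products can be written as np.inner of the scaled deviation matrix with itself. A quick numerical check (a sketch, assuming the r, means, N_k and data variables left over from the EM loop above):

# Explicit weighted sum of outer products for component 0 ...
diff = data - means[0:1].T
explicit = sum(r[0, n] * np.outer(diff[:, n], diff[:, n]) for n in range(N)) / N_k[0]
# ... versus the square-root trick used in the M-step above.
trick = np.inner(np.sqrt(r[0:1]) * diff, np.sqrt(r[0:1]) * diff) / N_k[0]
print(np.allclose(explicit, trick))  # expected: True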
r = zeros(K, N); % store the responsibilities r(k, n)
N_iter = 100;
for iter = 1:N_iter
    % E-step: update responsibilities
    for k = 1:K
        r(k, :) = weights(k).* mvnpdf(data, means(k, :), covs(:, :, k));
    end
    r = r./sum(r); % sum(r) sums over the components, normalising each column (data point)

    % M-step
    N_k = sum(r, 2);

    % update the means
    for k = 1:K
        tmp = 0;
        for n = 1:N
            tmp = tmp + r(k, n)*data(n, :);
        end
        means(k, :) = tmp/N_k(k);
    end

    % update the covariances
    for k = 1:K
        tmp_covs = 0;
        for n = 1:N
            tmp_covs = tmp_covs + r(k, n)*(data(n, :) - means(k, :))' * (data(n, :) - means(k, :));
        end
        covs(:, :, k) = tmp_covs/N_k(k);
    end

    % update weights
    weights = N_k/N;

    % negative log-likelihood
    gmm_nll = 0;
    for k = 1:K
        gmm_nll = gmm_nll + weights(k).* mvnpdf(data, means(k, :), covs(:, :, k));
    end
    NLL = -sum(log(gmm_nll));
    NLL_all(iter+1) = NLL;

    % plot the current components every 10 iterations
    if mod(iter, 10) == 0
        figure(iter+4)
        plot(x, y, 'o')
        hold on
        scatter1 = scatter(x, y, 'MarkerFaceColor', 'k', 'MarkerEdgeColor', 'k');
        scatter1.MarkerFaceAlpha = .3;

        xlabel('$x_1$', "Interpreter", "latex")
        ylabel('$x_2$', "Interpreter", "latex")
        title(['Iteration-', num2str(iter)])

        [X, Y] = meshgrid(linspace(-10, 10, 100), linspace(-10, 10, 100));
        for k = 1:K
            p = mvnpdf([X(:) Y(:)], means(k, :), covs(:, :, k)); % Gaussian probability density
            Z = reshape(p, size(X));                             % reshape the density values onto the grid
            contour(X, Y, Z, 'LineWidth', 1.2)
            hold on
        end
        hold off
    end

    % check convergence of the negative log-likelihood
    if abs(NLL_all(iter+1) - NLL_all(iter)) < 1e-6
        disp(['Converged after iteration-', num2str(iter+1)])
        break
    end
end

% plot the NLL
figure(222)
plot(1:length(NLL_all), NLL_all)
xlabel('EM iteration');
ylabel('Negative log-likelihood');
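
As a final sanity check, the learned parameters can be compared against the true ones used to generate the data. Since EM may recover the components in a different order, each learned mean is matched to its nearest true mean first. A minimal sketch, assuming the variables from the Python cells above:

# Compare learned parameters to the ground truth (components may be permuted).
for k in range(K):
    j = np.argmin(np.linalg.norm(m - means[k], axis=1))  # index of the closest true component
    print("component", k, ": learned mean", np.round(means[k], 2), "true mean", m[j],
          "| learned weight", np.round(weights[k], 2), "true weight", w[j])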