G. Nuel
October 9, 2015
\( \boldsymbol{X}=X_1,\ldots,X_n \) and \( \boldsymbol{S}=S_1,\ldots,S_n \) iid with:
\( \theta=(\boldsymbol{\pi},\boldsymbol{\mu},\sigma) \)
size (cm) of men (\( S_i=1 \)) and women (\( S_i=2 \))
pi=c(0.4,0.6) ; mu=c(175,165) ; sigma=10
set.seed(42); n=100; sigma=1
s=sample(1:2,size=n,replace=TRUE,prob=pi)
x=rnorm(n,mean=mu[s],sd=sigma)
set.seed(42); n=100; sigma=5
s=sample(1:2,size=n,replace=TRUE,prob=pi)
x=rnorm(n,mean=mu[s],sd=sigma)
set.seed(42); n=100; sigma=10
s=sample(1:2,size=n,replace=TRUE,prob=pi)
x=rnorm(n,mean=mu[s],sd=sigma)
\[ P(S_i=j|X_i=x)\propto \pi_j \text{dnorm}(x,\mu_j,\sigma) \]
eta=cbind(pi[1]*dnorm(x,mu[1],sigma),
pi[2]*dnorm(x,mu[2],sigma))
eta=eta/apply(eta,1,sum)
\[ P(S_i=j|X_i=x)\propto \pi_j \text{dnorm}(x,\mu_j,\sigma) \]
eta=cbind(pi[1]*dnorm(x,mu[1],sigma),
pi[2]*dnorm(x,mu[2],sigma))
eta=eta/apply(eta,1,sum)
\[ P(S_i=j|X_i=x)\propto \pi_j \text{dnorm}(x,\mu_j,\sigma) \]
eta=cbind(pi[1]*dnorm(x,mu[1],sigma),
pi[2]*dnorm(x,mu[2],sigma))
eta=eta/apply(eta,1,sum)
pred=apply(eta,1,which.max)
table(s,pred)
pred
s 1 2
1 30 16
2 8 46
mean(pred!=s)
[1] 0.24
Expectation-Maximization algorithm (Dempster'77)
\[ \pi_j = \frac{1}{n} \sum_{i=1}^n \eta_i(j) \]
\[ \mu_j = \frac{\sum_{i=1}^n \eta_i(j)X_i}{\sum_{i=1}^n \eta_i(j)} \]
\[ \sigma^2= \frac{1}{n} \sum_{i=1}^n \sum_{j=1}^2 \eta_i(j)(X_i-\mu_j)^2 \]
Expectation-Maximization algorithm (Dempster'77)
\[ \pi_j = \frac{S_0(j)}{n} \quad \mu_j = \frac{S_1(j)}{S_0(j)} \]
\[ \sigma^2= \frac{1}{n} \sum_{j=1}^2 \left( S_2(j)-2 S_1(j)\mu_j +S_0(j)\mu_j^2 \right) \]
\[ S_d(j)=\sum_{i=1}^n \eta_i(j) X_i^d \]
# arbitrary initialization
pi=c(0.5,0.5); mu=c(180,160); sigma=20
for (iter in 1:100) {
# compute posterior probabilities
eta=cbind(pi[1]*dnorm(x,mu[1],sigma),
pi[2]*dnorm(x,mu[2],sigma))
eta=eta/apply(eta,1,sum)
# compute exhaustive stats
S0=apply(eta,2,sum); S1=apply(eta*x,2,sum);
S2=apply(eta*x^2,2,sum);
# update parameters
pi=S0/n; mu=S1/S0
sigma=sqrt(sum(S2-2*S1*mu+S0*mu^2)/n)
}
unlist(list(pi=pi,mu=mu,sigma=sigma))
pi1 pi2 mu1 mu2 sigma
0 0.5000 0.5000 180.0 160.0 20.00
1 0.4917 0.5083 172.3 166.5 10.66
2 0.4911 0.5089 172.3 166.5 10.66
3 0.4904 0.5096 172.3 166.5 10.65
pi1 pi2 mu1 mu2 sigma
96 0.4012 0.5988 179.1 162.8 7.656
97 0.4015 0.5985 179.1 162.8 7.653
98 0.4017 0.5983 179.1 162.8 7.650
99 0.4019 0.5981 179.1 162.8 7.648
pi1 pi2 mu1 mu2 sigma
196 0.4037 0.5963 179 162.8 7.63
197 0.4037 0.5963 179 162.8 7.63
198 0.4037 0.5963 179 162.8 7.63
199 0.4037 0.5963 179 162.8 7.63
x
and s
in R x
s
\[ P(\mathbf{S})=P(S_1)\prod_{i=2}^n P(S_i | S_{i-1}) \]
with \( P(S_1=r)=0.5 \) and \( P(S_i=s | S_{i-1}=r)=\pi(r,s) \) we get: \[ P(\mathbf{S})=0.5\prod_{i=2}^n \pi(S_{i-1},S_i) \]
pi=rbind(c(0.8,0.2),
c(0.1,0.9))
s=rep(NA,n)
s[1]=sample(1:2,size=1)
for (i in 2:n)
s[i]=sample(1:2,prob=pi[s[i-1],],size=1)
\[ P(\mathbf{S})=0.5\prod_{i=2}^n \pi(S_{i-1},S_i) \]
\[ P(X_i=x|S_i=j)=\text{dnorm}(x,\mu_j,\sigma) \]
\( \theta=(\boldsymbol{\pi},\boldsymbol{\mu},\sigma) \)
size (cm) of men (\( S_i=1 \)) and women (\( S_i=2 \))
pi=rbind(c(0.8,0.2),
c(0.1,0.9));
mu=c(175,165) ; sigma=10
s=rep(NA,n)
s[1]=sample(1:2,size=1)
for (i in 2:n)
s[i]=sample(1:2,prob=pi[s[i-1],],size=1)
x=rnorm(n,mu[s],sigma)
s=rep(NA,n)
s[1]=sample(1:2,size=1)
for (i in 2:n)
s[i]=sample(1:2,prob=pi[s[i-1],],size=1)
x=rnorm(n,mu[s],sigma)
s=rep(NA,n)
s[1]=sample(1:2,size=1)
for (i in 2:n)
s[i]=sample(1:2,prob=pi[s[i-1],],size=1)
x=rnorm(n,mu[s],sigma)
\[ P(S_i=j|\boldsymbol{X})=\frac{\sum_{\boldsymbol{S}_{-i}} P(S_i=j,\boldsymbol{S}_{-i},\boldsymbol{X})} {\sum_{\boldsymbol{S}} P(\boldsymbol{S},\boldsymbol{X})} \]
\[ F_i(j)=P(S_i=j,X_1,\ldots,X_i) \]
\[ B_i(j)=P(X_{i+1},\ldots,X_{n} |S_i=j) \]
\[ P(S_i=j,\boldsymbol{X})=F_i(s)B_i(s) \]
\[ P(S_{i-1}=j,S_i=k,\boldsymbol{X})=F_{i-1}(j)\pi(j,k)e_i(k)B_i(k) \]
with \( e_i(k)=P(X_i|S_i=k) \)
\[ P(S_i=j,\boldsymbol{X})=F_i(j)B_i(j) \]
\[ P(S_{i-1}=j,S_i=k,\boldsymbol{X})=F_{i-1}(j)\pi(j,k)e_i(k)B_i(k) \]
\[ F_1(j)=0.5 e_1(j) \quad F_i(k)=\sum_{j} F_{i-1}(j)\pi(j,k)e_i(k) \]
\[ B_n(k)=1 \quad B_{i-1}(j)=\sum_{k} \pi(j,k)e_i(k)B_i(k) \]
# emission matrix
e=cbind(dnorm(x,mu[1],sigma),
dnorm(x,mu[2],sigma))
# forward recursion
Fw=matrix(NA,n,2)
Fw[1,]=0.5*e[1,]
for (i in 2:n) for (k in 1:2)
Fw[i,k]=sum(Fw[i-1,]*pi[,k]*e[i,k])
# forward recursion
Bk=matrix(NA,n,2)
Bk[n,]=1
for (i in seq(n,2,by=-1)) for (j in 1:2)
Bk[i-1,j]=sum(pi[j,]*e[i,]*Bk[i,])
# loglik and validation
log(apply(Fw*Bk,1,sum))[1:10]
[1] -381.5 -381.5 -381.5 -381.5 -381.5 -381.5 -381.5 -381.5 -381.5 -381.5
\[ P(S_i=k|\boldsymbol{X}) = \frac{F_i(k)B_i(k)}{\exp(\text{loglik})} \]
Expectation-Maximization algorithm (Dempster'77)
\[ S_d(j)=\sum_{i=1}^n \eta_i(j) X_i^d \quad \mu_j = \frac{S_1(j)}{S_0(j)} \]
\[ \sigma^2= \frac{1}{n} \sum_{j=1}^2 S_2(j)-2 S_1(j)\mu_j +S_0(j)\mu_j^2 \]
\[ \pi(j,k) \propto \sum_{i=2}^n P(S_{i-1}=j,S_{i}=k,\boldsymbol{X}) \]
posterior=function(x,pi,mu,sigma) {
e=cbind(dnorm(x,mu[1],sigma),
dnorm(x,mu[2],sigma))
Fw=matrix(NA,n,2)
Fw[1,]=0.5*e[1,]
for (i in 2:n) for (k in 1:2)
Fw[i,k]=sum(Fw[i-1,]*pi[,k]*e[i,k])
Bk=matrix(NA,n,2)
Bk[n,]=1
for (i in seq(n,2,by=-1)) for (j in 1:2)
Bk[i-1,j]=sum(pi[j,]*e[i,]*Bk[i,])
# compute posterior distribution
eta=Fw*Bk/sum(Fw[1,]*Bk[1,])
count=matrix(NA,2,2)
for (j in 1:2) for (k in 1:2)
count[j,k]=sum(Fw[-n,j]*
pi[j,k]*e[-1,k]*Bk[-1,k])
return(list(eta=eta,count=count))
}
# arbitrary initialization
pi=rbind(c(0.6,0.4),c(0.4,0.6));
mu=c(180,160); sigma=20
for (iter in 1:200) {
# posterior distributions
post=posterior(x,pi,mu,sigma)
attach(post)
# compute exhaustive stats
S0=apply(eta,2,sum); S1=apply(eta*x,2,sum);
S2=apply(eta*x^2,2,sum);
# update parameters
pi=S0/n; mu=S1/S0
sigma=sqrt(sum(S2-2*S1*mu+S0*mu^2)/n)
pi=count/apply(count,2,sum)
}
unlist(list(pi=pi,mu=mu,sigma=sigma))
diagpi1 diagpi2 mu1 mu2 sigma
0 0.6000 0.6000 180.0 160.0 20.00
1 0.5857 0.6356 171.5 164.8 10.93
2 0.6019 0.6459 171.9 164.5 10.82
3 0.6210 0.6591 172.3 164.1 10.67
diagpi1 diagpi2 mu1 mu2 sigma
40 0.7938 0.8770 177.6 162.5 8.833
41 0.7940 0.8772 177.6 162.5 8.834
42 0.7941 0.8773 177.6 162.5 8.835
43 0.7942 0.8774 177.6 162.5 8.836
diagpi1 diagpi2 mu1 mu2 sigma
80 0.7951 0.8781 177.6 162.6 8.843
81 0.7951 0.8781 177.6 162.6 8.843
82 0.7951 0.8781 177.6 162.6 8.843
83 0.7951 0.8781 177.6 162.6 8.843
diagpi1 diagpi2 mu1 mu2 sigma
0.7951 0.8781 177.6458 162.5613 8.8432
error rate = 0.09
pi1 pi2 mu1 mu2 sigma
0.3764 0.6236 177.6925 162.0651 8.5508
error rate = 0.17
x
and s
in R x
s