############################################ # STAT 740 R examples # Tim Hanson # U. South Carolina # September 27, 2017 ############################################ ############################################ # finite mixture model ############################################ # The "Galaxy data" is # a numeric vector of velocities in km/sec of 82 # galaxies from 6 well-separated conic sections # of an unfilled survey of the Corona Borealis region. # Multimodality in such surveys is evidence for voids # and superclusters in the far universe. library(MASS) # has galaxies data set x=galaxies/1000 em=function(x,J){ n=length(x) w=matrix(0,n,J) delta=(max(x)-min(x))/(J+1) o=list(pi=rep(1/J,J),mu=min(x)+delta*1:J,sd=rep(delta/2,J),aic=0,it=0) repeat{ for(i in 1:n){ w[i,]=dnorm(x[i],o$mu,o$sd)*o$pi w[i,]=w[i,]/sum(w[i,]) } to=c(o$mu,o$sd,o$pi) for(j in 1:J){ o$mu[j]=sum(w[,j]*x)/sum(w[,j]) o$sd[j]=sqrt(sum(w[,j]*(x-o$mu[j])^2)/sum(w[,j])) o$pi[j]=sum(w[,j])/n } o$it=o$it+1 if(max(abs(c(o$mu,o$sd,o$pi)-to))<0.0001){break} } o$aic=-2*sum(log(colSums(dnorm(t(matrix(rep(x,J),n,J)),o$mu,o$sd)*o$pi)))+ 2*(3*J-1) o } J=3 # number of components o=em(x,J) hist(x,freq=F,xlim=c(min(x),max(x)),ylim=c(0,0.2)) g=seq(min(x),max(x),length=1000) dens=Vectorize(function(x){sum(dnorm(x,o$mu,o$sd)*o$pi)}) lines(g,dens(g)) ############################################ # finite mixture model # another example where truth is known ############################################ # pi=(0.2,0.6,0.2), mu=(-5,0,5), sd=(1,2,1) n=500 x=rep(0,n) for(i in 1:n){ x[i]=sum(rmultinom(1,1,c(0.2,0.6,0.2))*c(rnorm(1,-5,1),rnorm(1,0,2),rnorm(1,5,1))) } o=em(x,3) hist(x,freq=F,ylim=c(0,0.15)) g=seq(min(x),max(x),length=1000) dens=Vectorize(function(x){sum(dnorm(x,o$mu,o$sd)*o$pi)}) lines(g,dens(g)) # bootstrapped SEs "by hand" emb=function(x,J,os){ n=length(x) w=matrix(0,n,J) # starting values from real data o=list(mu=os$mu,sd=os$sd,pi=os$pi) repeat{ for(i in 1:n){ w[i,]=dnorm(x[i],o$mu,o$sd)*o$pi w[i,]=w[i,]/sum(w[i,]) } to=c(o$mu,o$sd,o$pi) for(j in 1:J){ o$mu[j]=sum(w[,j]*x)/sum(w[,j]) o$sd[j]=sqrt(sum(w[,j]*(x-o$mu[j])^2)/sum(w[,j])) o$pi[j]=sum(w[,j])/n } if(max(abs(c(o$mu,o$sd,o$pi)-to))<0.0001){break} } o } m=500 # bootstrap replicates J=3 # number of clusters os=em(x,J); os=list(mu=os$mu,sd=os$sd,pi=os$pi) se1=rep(0,3*J); se2=se1 # this is slooooowwwwww..... for(i in 1:m){ if(floor(i/100)==i/100){cat(i," bootsrapped parameters...","\n")} o=emb(sample(x,length(x),replace=T),J,os) se1=se1+c(o$mu,o$sd,o$pi) se2=se2+c(o$mu,o$sd,o$pi)^2 } se1=se1/m se2=sqrt(se2/m-se1^2) se2 ############################################ # exponential w/ censored data # x=times, d=censoring (0=censored, 1=not) ############################################ library(MASS) # has VA data data(VA) VAs=subset(VA,prior==0) attach(VAs) help(VA) em=function(x,d){ o=list(la=1/mean(x),aic=0,it=0,se=0) repeat{ to=o$la o$la=length(x)/(sum(x[d==1])+sum(x[d==0]+1/to)) o$it=o$it+1 if(abs(o$la-to)<0.00001){break} } o$aic=2-2*(sum(log(dexp(x[d==1],o$la)))+sum(log(1-pexp(x[d==0],o$la)))) o$se=o$la/sqrt(sum(d==1)) o } o=em(stime,status) hist(stime,freq=F) g=seq(0,600,length=1000) lines(g,dexp(g,o$la)) # standard error via bootstrap m=500 # bootstrap replicates n=length(stime) se1=0; se2=0 for(i in 1:m){ if(floor(i/100)==i/100){cat(i," bootsrapped parameters...","\n")} indices=sample(n,n,replace=T) o=em(stime[indices],status[indices]) se1=se1+o$la se2=se2+o$la^2 } se1=se1/m se2=sqrt(se2/m-se1^2) se2