library(mice)
library(VIM)
library(lattice)
library(ggplot2)

# To see the data matrix (containing variables for the 50 states, as of 1977):

state.x77

# Creating a data frame out of a matrix:

state.x77.df <- data.frame(state.x77)

names(state.x77.df) <- c("Popul", "Income", "Illit", "LifeExp", "Murder", "HSGrad", "Frost", "Area")

attach(state.x77.df)

# randomly removing some data values to make them 'missing':

missing.index <- sample(1:400,size=40,replace=F)

state.x77.miss <- state.x77

state.x77.miss[missing.index] <- NA

state.x77.miss <- data.frame(state.x77.miss)

names(state.x77.miss) <- c("Popul", "Income", "Illit", "LifeExp", "Murder", "HSGrad", "Frost", "Area")

# diagnostic plot:

pbox(state.x77.miss)

# Multiple Imputation by the Predictive Mean Matching (PMM) method

imp1 <- mice(state.x77.miss) # 5 imputed data sets are created by default; use "m=" argument to adjust this

# Seeing which values were imputed:

imp1$imp$Area

# More diagnostic plots to compare distributions of original values (blue) to distributions of imputed values (red)

xyplot(imp1,Income ~ Murder+HSGrad+Frost,pch=18,cex=1)

densityplot(imp1)

stripplot(imp1, pch = 20, cex = 1.2)

#

completed.Data1 <- complete(imp1, 1) # uses the 1st imputed data set

pca.1 <- princomp(completed.Data1,cor=T)
summary(pca.1, loadings=T)
biplot(pca.1,main='PCA with 1st imputed data')

# Compare to original PCA with the complete data:

windows()

pca.org <- princomp(state.x77.df,cor=T)
summary(pca.org, loadings=T)
biplot(pca.org,main='PCA with original data')

# To account for imputation variation, should do the analysis 
# with SEVERAL imputed data sets (5 are produced by default)
# and see whether results change:

completed.Data2 <- complete(imp1, 2) # uses the 2nd imputed data set

pca.2 <- princomp(completed.Data2,cor=T)
summary(pca.2, loadings=T)
biplot(pca.2,main='PCA with 2nd imputed data')

completed.Data3 <- complete(imp1, 3) # uses the 3rd imputed data set

pca.3 <- princomp(completed.Data3,cor=T)
summary(pca.3, loadings=T)
biplot(pca.3,main='PCA with 3rd imputed data')


### Looking at all the imputed data sets at once:

# Combining the imputed data with the original data:

imp_tot2 <- complete(imp1, 'long', inc=TRUE)

# Doing a regression, and pooling estimates and standard errors across all 5 imputed data sets:

modelFit1 <- with(imp1,lm(Income ~ Murder+HSGrad+Frost))
summary(pool(modelFit1))