library(mice) library(VIM) library(lattice) library(ggplot2) # To see the data matrix (containing variables for the 50 states, as of 1977): state.x77 # Creating a data frame out of a matrix: state.x77.df <- data.frame(state.x77) names(state.x77.df) <- c("Popul", "Income", "Illit", "LifeExp", "Murder", "HSGrad", "Frost", "Area") attach(state.x77.df) # randomly removing some data values to make them 'missing': missing.index <- sample(1:400,size=40,replace=F) state.x77.miss <- state.x77 state.x77.miss[missing.index] <- NA state.x77.miss <- data.frame(state.x77.miss) names(state.x77.miss) <- c("Popul", "Income", "Illit", "LifeExp", "Murder", "HSGrad", "Frost", "Area") # diagnostic plot: pbox(state.x77.miss) # Multiple Imputation by the Predictive Mean Matching (PMM) method imp1 <- mice(state.x77.miss) # 5 imputed data sets are created by default; use "m=" argument to adjust this # Seeing which values were imputed: imp1$imp$Area # More diagnostic plots to compare distributions of original values (blue) to distributions of imputed values (red) xyplot(imp1,Income ~ Murder+HSGrad+Frost,pch=18,cex=1) densityplot(imp1) stripplot(imp1, pch = 20, cex = 1.2) # completed.Data1 <- complete(imp1, 1) # uses the 1st imputed data set pca.1 <- princomp(completed.Data1,cor=T) summary(pca.1, loadings=T) biplot(pca.1,main='PCA with 1st imputed data') # Compare to original PCA with the complete data: windows() pca.org <- princomp(state.x77.df,cor=T) summary(pca.org, loadings=T) biplot(pca.org,main='PCA with original data') # To account for imputation variation, should do the analysis # with SEVERAL imputed data sets (5 are produced by default) # and see whether results change: completed.Data2 <- complete(imp1, 2) # uses the 2nd imputed data set pca.2 <- princomp(completed.Data2,cor=T) summary(pca.2, loadings=T) biplot(pca.2,main='PCA with 2nd imputed data') completed.Data3 <- complete(imp1, 3) # uses the 3rd imputed data set pca.3 <- princomp(completed.Data3,cor=T) summary(pca.3, loadings=T) biplot(pca.3,main='PCA with 3rd imputed data') ### Looking at all the imputed data sets at once: # Combining the imputed data with the original data: imp_tot2 <- complete(imp1, 'long', inc=TRUE) # Doing a regression, and pooling estimates and standard errors across all 5 imputed data sets: modelFit1 <- with(imp1,lm(Income ~ Murder+HSGrad+Frost)) summary(pool(modelFit1))