library(mice) library(VIM) library(lattice) library(ggplot2) # To see the data matrix (containing variables for the 50 states, as of 1977): state.x77 # Creating a data frame out of a matrix: state.x77.df <- data.frame(state.x77) names(state.x77.df) <- c("Popul", "Income", "Illit", "LifeExp", "Murder", "HSGrad", "Frost", "Area") attach(state.x77.df) # randomly removing some data values to make them 'missing': missing.index <- sample(1:400,size=40,replace=F) state.x77.miss <- state.x77 state.x77.miss[missing.index] <- NA state.x77.miss <- data.frame(state.x77.miss) names(state.x77.miss) <- c("Popul", "Income", "Illit", "LifeExp", "Murder", "HSGrad", "Frost", "Area") # summary plot of missingness: md.pattern(state.x77.miss) # diagnostic plot: pbox(state.x77.miss) # Multiple Imputation by the Predictive Mean Matching (PMM) method # PMM uses an imputation model (e.g., a regression model) to predict the value of the target variable # for the observation with the missing value. # It then uses a pool of candidate observations (usually between 3 and 10) with complete cases # whose predicted values for that variable (based on the imputation model) are # "close" to the predicted value of the observation with the missing data. # It then randomly selects one of the candidate observations and uses the OBSERVED value of the target variable # from that selected candidate observation as the imputed value of the variable for the observation with missing data. imp.data <- mice(state.x77.miss) # 5 imputed data sets are created by default; use "m=" argument to adjust this # Seeing which values were imputed: imp.data$imp$Area # More diagnostic plots to compare distributions of original values (blue) to distributions of imputed values (red) xyplot(imp.data,Income ~ Murder+HSGrad+Frost,pch=18,cex=1) densityplot(imp.data) stripplot(imp.data, pch = 20, cex = 1.2) # completed.Data1 <- complete(imp.data, 1) # uses the 1st imputed data set pca.1 <- princomp(completed.Data1,cor=T) summary(pca.1, loadings=T) biplot(pca.1,main='PCA with 1st imputed data') # Compare to original PCA with the complete data: dev.new() pca.org <- princomp(state.x77.df,cor=T) summary(pca.org, loadings=T) biplot(pca.org,main='PCA with original data') # To account for imputation variation, should do the analysis # with SEVERAL imputed data sets (5 are produced by default) # and see whether results change: completed.Data2 <- complete(imp.data, 2) # uses the 2nd imputed data set pca.2 <- princomp(completed.Data2,cor=T) summary(pca.2, loadings=T) biplot(pca.2,main='PCA with 2nd imputed data') completed.Data3 <- complete(imp.data, 3) # uses the 3rd imputed data set pca.3 <- princomp(completed.Data3,cor=T) summary(pca.3, loadings=T) biplot(pca.3,main='PCA with 3rd imputed data') ### Looking at all the imputed data sets at once: # Combining the imputed data with the original data: imp_tot2 <- complete(imp.data, 'long', inc=TRUE) # Doing a regression, and pooling estimates and standard errors across all 5 imputed data sets: modelFit1 <- with(imp.data,lm(Income ~ Murder+HSGrad+Frost)) summary(pool(modelFit1))