#############################################################################
######################                                                      #
# R example code for multidimensional scaling and correspondence analysis: #
######################                                                      #
#############################################################################

# The built-in R data set called eurodist contains a "dist" object
# with road distances between 21 European cities.

# For information, type: help(eurodist)

# To see the (partial) distance matrix, type: eurodist

###############################################################################
###############################################################################

######################################
# Classical multidimensional scaling #
######################################

# Classical MDS for the European distances using the cmdscale function:

# The default number of dimensions is k = 2.
# (TRUE is spelled out: T is an ordinary variable that can be reassigned.)
euro.mds.2 <- cmdscale(eurodist, eig = TRUE)
euro.mds.2

# The first number in the GOF section is what I called P_k, which is
# (almost) the P^(1) criterion given near the top of page 96.

# Changing the number of dimensions k:
euro.mds.4 <- cmdscale(eurodist, k = 4, eig = TRUE)
euro.mds.4

# Using the P_k criterion for a variety of values of k
# to choose the appropriate amount of dimension reduction:

# If we are considering values of k from 1 to, say, 12:
max.k <- 12

# P.k[kk] will hold the first GOF value of the kk-dimensional solution.
P.k <- rep(0, max.k)
# SStress is allocated but only filled by the commented-out line below.
SStress <- rep(0, max.k)

for (kk in seq_len(max.k)) {
  my.mds.kk <- cmdscale(eurodist, k = kk, eig = TRUE)
  P.k[kk] <- my.mds.kk$GOF[1]
  # SStress[kk] <- ( sum( (eurodist^2 - (dist(my.mds.kk$points))^2)^2 ) /
  #                  sum(eurodist^4) )^0.5
}

# Table of k alongside the corresponding P_k value:
cbind(seq_len(max.k), P.k)

# It looks like 2 or 3 dimensions would be reasonable,
# and 4 gives quite a good fit.
# A 2-D representation of the solution for k=2:

par(pty = "s")  # Creates a square plotting region

## The reason for the (-2500, 2500) limits below is because the coordinates
## in the first two dimensions, which can be seen by typing:
##   euro.mds.2$points
## range from -2048.449113 to 2290.274680.
## Making the axes go from -2500 to 2500 allows all the labels to fit nicely
## on the plot.
## For other data sets, you'll have to adjust these numbers.
## You can look at the '$points' component of the output and see the smallest
## and largest values and use limits that add a little space beyond this
## range.

# Passing the two-column coordinate matrix lets plot() and text() take
# column 1 as x and column 2 as y; type = "n" draws empty axes so that only
# the text labels mark the cities.
plot(
  euro.mds.2$points[, 1:2],
  type = "n",
  xlab = "Coordinate 1",
  ylab = "Coordinate 2",
  xlim = c(-2500, 2500),
  ylim = c(-2500, 2500)
)
text(euro.mds.2$points[, 1:2], labels = labels(eurodist))

# Same thing, but using abbreviations:
euro.abb <- abbreviate(labels(eurodist))

plot(
  euro.mds.2$points[, 1:2],
  type = "n",
  xlab = "Coordinate 1",
  ylab = "Coordinate 2",
  xlim = c(-2500, 2500),
  ylim = c(-2500, 2500)
)
text(euro.mds.2$points[, 1:2], labels = euro.abb)

# Maybe reflecting across the x-axis (negating Coordinate 2) would produce
# a better reflection of reality!
# Same 2-D map with the sign of the second coordinate flipped:
plot(
  euro.mds.2$points[, 1], -euro.mds.2$points[, 2],
  type = "n",
  xlab = "Coordinate 1",
  ylab = "Coordinate 2",
  xlim = c(-2500, 2500),
  ylim = c(-2500, 2500)
)
text(euro.mds.2$points[, 1], -euro.mds.2$points[, 2], labels = euro.abb)

### A 3-D plot of the 3-D solution, including labels of the city names:

library(scatterplot3d)

# The first three coordinates of the k = 4 fit are used for the three axes.
s3d <- scatterplot3d(
  euro.mds.4$points[, 1],
  euro.mds.4$points[, 2],
  euro.mds.4$points[, 3],                      # x y and z axis
  color = "blue", pch = 19,                    # filled blue circles
  type = "h",                                  # vertical lines to the x-y plane
  main = "3-D Scatterplot of 3-D MDS Solution",
  xlab = "Coordinate 1",
  ylab = "Coordinate 2",
  zlab = "Coordinate 3"
)
s3d.coords <- s3d$xyz.convert(
  euro.mds.4$points[, 1],
  euro.mds.4$points[, 2],
  euro.mds.4$points[, 3]
)  # convert 3D coords to 2D projection
text(
  s3d.coords$x, s3d.coords$y,   # x and y coordinates
  labels = labels(eurodist),    # text to plot
  cex = .7, pos = 4             # shrink text and place to right of points
)

### This plot works well.

###############################################################################
###############################################################################

# Cola dissimilarity matrix:
# [Data from Table 5.1 (Subject 1)]

cola.diss <- matrix(
  c(
     0, 16, 81, 56, 87, 60, 84, 50, 99, 16,
    16,  0, 47, 32, 68, 35, 94, 87, 25, 92,
    81, 47,  0, 71, 44, 21, 98, 79, 53, 90,
    56, 32, 71,  0, 71, 98, 57, 73, 98, 83,
    87, 68, 44, 71,  0, 34, 99, 19, 52, 79,
    60, 35, 21, 98, 34,  0, 99, 92, 17, 44,
    84, 94, 98, 57, 99, 99,  0, 45, 99, 24,
    50, 87, 79, 73, 19, 92, 45,  0, 84, 18,
    99, 25, 53, 98, 52, 17, 99, 84,  0, 98,
    16, 92, 90, 83, 79, 44, 24, 18, 98,  0
  ),
  ncol = 10, nrow = 10, byrow = TRUE
)

# Using the P_k criterion for a variety of values of k
# to choose the appropriate amount of dimension reduction:

# If we are considering values of k from 1 to, say, 8:
max.k <- 8

P.k <- rep(0, max.k)
# SStress is allocated but only filled by the commented-out line below.
SStress <- rep(0, max.k)

for (kk in seq_len(max.k)) {
  my.mds.kk <- cmdscale(cola.diss, k = kk, eig = TRUE)
  P.k[kk] <- my.mds.kk$GOF[1]
  # SStress[kk] <- ( sum( (cola.diss^2 - (dist(my.mds.kk$points))^2)^2 ) /
  #                  sum(cola.diss^4) )^0.5
}

cbind(seq_len(max.k), P.k)

# Plotting SStress against k for several values of k:
# plot(1:max.k, SStress, type='b')

# It looks like we need at least 4 dimensions,
# certainly no more than 5.

cola.mds.4 <- cmdscale(cola.diss, k = 4, eig = TRUE)
cola.mds.4

# A representation of the 2-dimensional solution
# (the first two coordinates of the k = 4 fit):
par(pty = "s")  # Creates a square plotting region
plot(
  cola.mds.4$points[, 1], cola.mds.4$points[, 2],
  type = "n",
  xlab = "Coordinate 1",
  ylab = "Coordinate 2",
  xlim = c(-55, 55),
  ylim = c(-55, 55)
)
# No 'labels' argument: text() then labels each point with its index 1..10.
text(cola.mds.4$points[, 1], cola.mds.4$points[, 2])

# Interesting: Note the positions of colas 1, 8, and 10 on the map
# and examine the pairwise dissimilarities for these three colas...

###############################################################################
###############################################################################

# Reading in the College data set from the 'ISLR' package:
# install.packages("ISLR") # if needed
library(ISLR)
data(College)

LargeColleges <- subset(College, Apps >= 7600)
coll.abbs <- abbreviate(row.names(LargeColleges))

# Drop the first column so that only the remaining columns enter the
# distance calculation.
LargeColleges.numeric <- LargeColleges[, -1]
# NOTE: the original script called attach(LargeColleges.numeric) here. No
# attached column name is ever used below (every access goes through the data
# frame), so the call is dropped to keep the search path unchanged.

# finding standard deviations of variables
# (vapply works column by column without coercing the data frame to a matrix)
std <- vapply(LargeColleges.numeric, sd, numeric(1))

# dividing each variable by its standard deviation:
LargeColleges.std <- sweep(LargeColleges.numeric, 2, std, FUN = "/")

# Calculating Euclidean distances between these 80 scaled observations:
coll.dist <- dist(LargeColleges.std)

# Using the P_k criterion for a variety of values of k
# to choose the appropriate amount of dimension reduction:

# If we are considering values of k from 1 to, say, 6:
max.k <- 6

P.k <- rep(0, max.k)
SStress <- rep(0, max.k)

for (kk in seq_len(max.k)) {
  my.mds.kk <- cmdscale(coll.dist, k = kk, eig = TRUE)
  P.k[kk] <- my.mds.kk$GOF[1]
  # Square root of: squared differences between the squared original
  # distances and the squared distances among the fitted points, summed and
  # divided by the sum of the fourth powers of the original distances.
  SStress[kk] <- (
    sum((coll.dist^2 - (dist(my.mds.kk$points))^2)^2) / sum(coll.dist^4)
  )^0.5
}

cbind(seq_len(max.k), P.k)

# Plotting SStress against k for several values of k:
# This makes more sense when the MDS is based on Euclidean distances:
plot(seq_len(max.k), SStress, type = "b")

# Using 4 or 5 dimensions seems appropriate here.
coll.mds.5 <- cmdscale(coll.dist, k = 5, eig = TRUE)
coll.mds.5

# A representation of the 2-dimensional solution
# (the first two coordinates of the k = 5 fit):
par(pty = "s")  # Creates a square plotting region

## The value of 'cex' controls how large the labels are printed on the plot.
plot(
  coll.mds.5$points[, 1], coll.mds.5$points[, 2],
  type = "n",
  xlab = "Coordinate 1",
  ylab = "Coordinate 2",
  xlim = c(-7, 7),
  ylim = c(-7, 7)
)
text(
  coll.mds.5$points[, 1], coll.mds.5$points[, 2],
  labels = coll.abbs, cex = 0.7
)

# Zooming in on the cluster of points in the middle:
plot(
  coll.mds.5$points[, 1], coll.mds.5$points[, 2],
  type = "n",
  xlab = "Coordinate 1",
  ylab = "Coordinate 2",
  xlim = c(-7, 7),
  ylim = c(-4, 4)
)
text(
  coll.mds.5$points[, 1], coll.mds.5$points[, 2],
  labels = coll.abbs, cex = 0.7
)

# Is it better to use the full row names as labels?
# The Zoomed-in plot (doesn't show Rutgers):
plot(
  coll.mds.5$points[, 1], coll.mds.5$points[, 2],
  type = "n",
  xlab = "Coordinate 1",
  ylab = "Coordinate 2",
  xlim = c(-7, 7),
  ylim = c(-4, 4)
)
text(
  coll.mds.5$points[, 1], coll.mds.5$points[, 2],
  labels = row.names(LargeColleges), cex = 0.5
)

## Zooming in even more to see our favorite university:
plot(
  coll.mds.5$points[, 1], coll.mds.5$points[, 2],
  type = "n",
  xlab = "Coordinate 1",
  ylab = "Coordinate 2",
  xlim = c(0, 4),
  ylim = c(-2, 0)
)
text(
  coll.mds.5$points[, 1], coll.mds.5$points[, 2],
  labels = row.names(LargeColleges), cex = 0.7
)

### A 3-D plot of the 3-D solution, including labels of the college names:

library(scatterplot3d)

s3d <- scatterplot3d(
  coll.mds.5$points[, 1],
  coll.mds.5$points[, 2],
  coll.mds.5$points[, 3],                      # x y and z axis
  color = "blue", pch = 19,                    # filled blue circles
  type = "h",                                  # vertical lines to the x-y plane
  main = "3-D Scatterplot of 3-D MDS Solution",
  xlab = "Coordinate 1",
  ylab = "Coordinate 2",
  zlab = "Coordinate 3"
)
s3d.coords <- s3d$xyz.convert(
  coll.mds.5$points[, 1],
  coll.mds.5$points[, 2],
  coll.mds.5$points[, 3]
)  # convert 3D coords to 2D projection
text(
  s3d.coords$x, s3d.coords$y,          # x and y coordinates
  labels = row.names(LargeColleges),   # text to plot
  cex = .5, pos = 4                    # shrink text 50% and place to right of points
)

### Honestly there are too many observations for this plot to work well.

###############################################################################
###############################################################################

library(MASS)  # loading the MASS package

# Nonmetric scaling on the cola distances:
cola.iso <- isoMDS(cola.diss, k = 2)
plot(cola.iso$points, type = "n", xlab = "Coordinate 1", ylab = "Coordinate 2")
text(cola.iso$points, labels = as.character(seq_len(nrow(cola.diss))))
abline(h = 0)
abline(v = 0)

###############################
#   Correspondence analysis   #
###############################

# Several contingency tables from the book:

girls <- matrix(
  c(
    21, 21, 14, 13,  8,
     8,  9,  6,  8,  2,
     2,  3,  4, 10, 10
  ),
  byrow = TRUE, ncol = 5, nrow = 3,
  dimnames = list(
    c("nbf", "bfns", "bfs"),
    c("AG1", "AG2", "AG3", "AG4", "AG5")
  )
)

smokemoms <- matrix(
  c(
    50, 315, 24, 4012,
     9,  40,  6,  459,
    41, 147, 14, 1594,
     4,  11,  1,  124
  ),
  byrow = TRUE, ncol = 4, nrow = 4,
  dimnames = list(
    c("YNS", "YS", "ONS", "OS"),
    c("pd", "pa", "ftd", "fta")
  )
)

hodgkin <- matrix(
  c(
     74, 18, 12,
     68, 16, 12,
    154, 54, 58,
     18, 10, 44
  ),
  byrow = TRUE, ncol = 3, nrow = 4,
  dimnames = list(
    c("LP", "NS", "MC", "LD"),
    c("positive", "partial", "none")
  )
)

# One that's not in the book --- a snoring / heart disease data set:
snore.heart.data <- matrix(
  c(24, 35, 51, 1355, 603, 416),
  nrow = 2, ncol = 3, byrow = TRUE,
  dimnames = list(
    c("Yes", "No"),
    c("Never", "Occasionally", "~ Every Night")
  )
)
as.table(snore.heart.data)

#########################################################
## Code to get the matrices of chi-squared distances
## for both rows and columns
#########################################################

################# BEGIN CODE ###########################

# Enter name of data matrix / contingency table:
mymat <- hodgkin

rr <- nrow(mymat)
cc <- ncol(mymat)

row.sums <- rowSums(mymat)
col.sums <- colSums(mymat)
N <- sum(row.sums)  # grand total of the table

# pijcol holds the row profiles (each row divided by its row total);
# pijrow holds the column profiles (each column divided by its column total).
pijcol <- sweep(mymat, 1, row.sums, FUN = "/")
pijrow <- sweep(mymat, 2, col.sums, FUN = "/")

# Distance matrices start at zero; the diagonal is never overwritten.
distmat.row <- matrix(0, nrow = rr, ncol = rr)
distmat.col <- matrix(0, nrow = cc, ncol = cc)

# Chi-squared distance between every pair of rows.
# seq_len(i - 1) is empty for i = 1, whereas 1:(i - 1) would yield c(1, 0)
# and index a non-existent row 0.
for (i in seq_len(rr)) {
  for (ii in seq_len(i - 1)) {
    my.hold <- sqrt(sum((N / col.sums) * (pijcol[i, ] - pijcol[ii, ])^2))
    distmat.row[i, ii] <- my.hold
    distmat.row[ii, i] <- my.hold
  }
}

# Chi-squared distance between every pair of columns.
for (j in seq_len(cc)) {
  for (jj in seq_len(j - 1)) {
    my.hold <- sqrt(sum((N / row.sums) * (pijrow[, j] - pijrow[, jj])^2))
    distmat.col[j, jj] <- my.hold
    distmat.col[jj, j] <- my.hold
  }
}

round(distmat.col, digits = 2)

round(distmat.row, digits = 2)

################### END CODE ###########################
#########################################################

# A straightforward chi-square test for independence of rows and columns:
chisq.test(hodgkin)

# We see that histological classification and response to treatment
# are clearly associated.

############################################################################

# A quick way to do correspondence analysis using the "corresp" function
# in the MASS library:

library(MASS)  # loading the MASS package

corresp(hodgkin, nf = 2)

# The two-dimensional solution.

biplot(corresp(hodgkin, nf = 2))
abline(h = 0)
abline(v = 0)

# It apparently uses a slightly different algorithm than the book's method:
# While the results are close, they are not identical.

# Some conclusions:
# LP and NS have very similar row profiles.
# "No response" occurs especially often with LD.
# "Positive response" is associated with LP and with NS.
# "Partial response" is linked with MC.

# Examining the 1-dimensional solution can be useful in interpretation
# as well:

corresp(hodgkin, nf = 1)

# Note "LD" and "none" both have large positive coordinates.
# Note "LP" (and "NS") and "positive" both have large negative coordinates.
# But "LD" and "positive" have opposite signs.

# To (nearly) replicate the book's examples in section 5.3 and in 5.3.1,
# replace "hodgkin" with "girls" or "smokemoms" in the above code.

# What are the substantive conclusions in the "smoking / motherhood" analysis?