#############################################################################
######################                                                      #           
# R example code for multidimensional scaling and correspondence analysis:  #
######################                                                      #
#############################################################################


# The built-in R data set `eurodist` is a "dist" object holding the
# road distances between 21 European cities.
# For information, type:

help(eurodist)

# To see the (partial) distance matrix, type:

eurodist

###############################################################################
###############################################################################

######################################
# Classical multidimensional scaling #
######################################

# Classical MDS for the European distances using the cmdscale function.
# The default number of dimensions is k = 2.
# eig = TRUE is needed so that the result is a list carrying the `points`
# and `GOF` components used below.  TRUE is spelled out because T is an
# ordinary variable that can be reassigned, whereas TRUE is a reserved word.

euro.mds.2 <- cmdscale(eurodist, eig = TRUE)
euro.mds.2

# The first number in the GOF section is what I called P_k, which is
# (almost) the P^(1) criterion given near the top of page 96.

# Changing the number of dimensions k:

euro.mds.4 <- cmdscale(eurodist, k = 4, eig = TRUE)
euro.mds.4

# Using the P_k criterion for a variety of values of k
# to choose the appropriate amount of dimension reduction.

# If we are considering values of k from 1 to, say, 12:

max.k <- 12
P.k <- rep(0, max.k)
SStress <- rep(0, max.k)  # stays all zero unless the SStress line is enabled
for (kk in seq_len(max.k)) {
  my.mds.kk <- cmdscale(eurodist, k = kk, eig = TRUE)
  P.k[kk] <- my.mds.kk$GOF[1]  # first GOF entry = P_k for this k
  # Optional SStress criterion (disabled for this data set):
  #SStress[kk] <- ( sum( (eurodist^2 - (dist(my.mds.kk$points))^2)^2 )/sum(eurodist^4) )^0.5
}

# Table of k (first column) against P_k (second column):
cbind(seq_len(max.k), P.k)


# It looks like 2 or 3 dimensions would be reasonable, and 4 gives quite a good fit.

# Drawing the k = 2 solution as a two-dimensional map of the cities:

par(pty = "s")  # square plotting region, so both axes use the same scale

## Why the axis limits are (-2500, 2500):
## the coordinates in the first two dimensions (type  euro.mds.2$points
## to see them) range from -2048.449113 to 2290.274680.
## Letting both axes run from -2500 to 2500 leaves room for every label.
## For other data sets these numbers must be adjusted: look at the smallest
## and largest values in the '$points' component of the output and choose
## limits that add a little space beyond that range.

# type = "n" sets up empty axes; text() then writes each city name at its
# fitted position.
plot(euro.mds.2$points[, 1:2], type = "n",
     xlim = c(-2500, 2500), ylim = c(-2500, 2500),
     xlab = "Coordinate 1", ylab = "Coordinate 2")
text(euro.mds.2$points[, 1:2], labels = labels(eurodist))

# Same thing, but using abbreviations:

euro.abb <- abbreviate(labels(eurodist))

plot(euro.mds.2$points[, 1:2], type = "n",
     xlim = c(-2500, 2500), ylim = c(-2500, 2500),
     xlab = "Coordinate 1", ylab = "Coordinate 2")
text(euro.mds.2$points[, 1:2], labels = euro.abb)

# Maybe reflecting across the x-axis (i.e., negating the second coordinate)
# would produce a better reflection of reality!

plot(cbind(euro.mds.2$points[, 1], -euro.mds.2$points[, 2]), type = "n",
     xlim = c(-2500, 2500), ylim = c(-2500, 2500),
     xlab = "Coordinate 1", ylab = "Coordinate 2")
text(cbind(euro.mds.2$points[, 1], -euro.mds.2$points[, 2]),
     labels = euro.abb)

### 3-D display of the first three coordinates of the k = 4 solution,
### with each point labelled by its city name:

library(scatterplot3d)

s3d <- scatterplot3d(
  euro.mds.4$points[, 1],   # x axis
  euro.mds.4$points[, 2],   # y axis
  euro.mds.4$points[, 3],   # z axis
  main = "3-D Scatterplot of 3-D MDS Solution",
  xlab = "Coordinate 1",
  ylab = "Coordinate 2",
  zlab = "Coordinate 3",
  type = "h",               # vertical lines down to the x-y plane
  pch = 19, color = "blue"  # filled blue circles
)

# Convert the 3-D coordinates to their 2-D positions on the drawn projection
# so that ordinary text() can place the labels:
s3d.coords <- s3d$xyz.convert(euro.mds.4$points[, 1],
                              euro.mds.4$points[, 2],
                              euro.mds.4$points[, 3])
text(s3d.coords,                     # projected x and y coordinates
     labels = labels(eurodist),      # city names
     pos = 4, cex = .7)              # to the right of each point, shrunk

### This plot works well.


###############################################################################
###############################################################################


# Cola dissimilarity matrix:
# [Data from Table 5.1 (Subject 1)]
#
# A symmetric 10 x 10 matrix with a zero diagonal; entry [i, j] is the
# dissimilarity recorded between cola i and cola j.  Each c(...) below is
# one row of the matrix.

cola.diss <- rbind(
  c( 0, 16, 81, 56, 87, 60, 84, 50, 99, 16),
  c(16,  0, 47, 32, 68, 35, 94, 87, 25, 92),
  c(81, 47,  0, 71, 44, 21, 98, 79, 53, 90),
  c(56, 32, 71,  0, 71, 98, 57, 73, 98, 83),
  c(87, 68, 44, 71,  0, 34, 99, 19, 52, 79),
  c(60, 35, 21, 98, 34,  0, 99, 92, 17, 44),
  c(84, 94, 98, 57, 99, 99,  0, 45, 99, 24),
  c(50, 87, 79, 73, 19, 92, 45,  0, 84, 18),
  c(99, 25, 53, 98, 52, 17, 99, 84,  0, 98),
  c(16, 92, 90, 83, 79, 44, 24, 18, 98,  0)
)


# Using the P_k criterion for a variety of values of k
# to choose the appropriate amount of dimension reduction.

# If we are considering values of k from 1 to, say, 8:

max.k <- 8
P.k <- rep(0, max.k)
SStress <- rep(0, max.k)  # stays all zero unless the SStress line is enabled
for (kk in seq_len(max.k)) {
  # eig = TRUE (spelled out: T is a reassignable variable) makes cmdscale
  # return the list that carries the GOF component.
  my.mds.kk <- cmdscale(cola.diss, k = kk, eig = TRUE)
  P.k[kk] <- my.mds.kk$GOF[1]  # first GOF entry = P_k for this k
  # Optional SStress criterion (disabled for this data set):
  #SStress[kk] <- ( sum( (cola.diss^2 - (dist(my.mds.kk$points))^2)^2 )/sum(cola.diss^4) )^0.5
}

# Table of k (first column) against P_k (second column):
cbind(seq_len(max.k), P.k)

# Plotting SStress against k for several values of k
# (disabled, because SStress is not computed above):

#plot(1:max.k, SStress, type='b')


# It looks like we need at least 4 dimensions,
# certainly no more than 5.

cola.mds.4 <- cmdscale(cola.diss, k = 4, eig = TRUE)
cola.mds.4

# A two-dimensional picture of the cola solution, drawn from the first two
# coordinates of the k = 4 fit:


par(pty = "s")  # square plotting region

plot(cola.mds.4$points[, 1:2], type = "n",
     xlim = c(-55, 55), ylim = c(-55, 55),
     xlab = "Coordinate 1", ylab = "Coordinate 2")
# Each cola is labelled by its row number in cola.diss (1, 2, ..., 10):
text(cola.mds.4$points[, 1:2],
     labels = seq_len(nrow(cola.mds.4$points)))

# Interesting:  Note the positions of colas 1, 8, and 10 on the map
# and examine the pairwise dissimilarities for these three colas...

###############################################################################
###############################################################################


# Reading in the College data set from the 'ISLR' package:

# install.packages("ISLR")  # if needed

library(ISLR)
data(College)

# Keep only the colleges that received at least 7600 applications:
LargeColleges <- subset(College, Apps >= 7600)

# Short plotting labels built from the row names:
coll.abbs <- abbreviate(row.names(LargeColleges))

# Drop the first column so that only the columns to be scaled remain
# (presumably column 1 is the non-numeric Private indicator -- confirm
# with str(College)).
LargeColleges.numeric <- LargeColleges[, -1]

# NOTE: the data frame is intentionally not attach()ed.  Nothing later in
# this script refers to its columns by bare name, and attaching a data frame
# only adds it to the search path, where it can mask or be masked by other
# objects.

# Standard deviation of every remaining column.  vapply() works column by
# column on the data frame and stops with an error if any result is not a
# single number.
std <- vapply(LargeColleges.numeric, sd, numeric(1))

# Dividing each variable by its standard deviation:
LargeColleges.std <- sweep(LargeColleges.numeric, 2, std, FUN = "/")

# Calculating Euclidean distances between the scaled observations
# (80 of them, according to the original notes for this example):

coll.dist <- dist(LargeColleges.std)


# Using the P_k criterion for a variety of values of k
# to choose the appropriate amount of dimension reduction.

# If we are considering values of k from 1 to, say, 6:

max.k <- 6
P.k <- rep(0, max.k)
SStress <- rep(0, max.k)
for (kk in seq_len(max.k)) {
  # eig = TRUE (spelled out: T is a reassignable variable) makes cmdscale
  # return the list that carries the points and GOF components.
  my.mds.kk <- cmdscale(coll.dist, k = kk, eig = TRUE)
  P.k[kk] <- my.mds.kk$GOF[1]  # first GOF entry = P_k for this k
  # SStress: compares the squared original distances with the squared
  # distances between the fitted k-dimensional points.
  SStress[kk] <- ( sum( (coll.dist^2 - (dist(my.mds.kk$points))^2)^2 ) /
                     sum(coll.dist^4) )^0.5
}

# Table of k (first column) against P_k (second column):
cbind(seq_len(max.k), P.k)

# Plotting SStress against k for several values of k.
# This makes more sense when the MDS is based on Euclidean distances:

plot(seq_len(max.k), SStress, type = "b",
     xlab = "1:max.k", ylab = "SStress")

# Using 4 or 5 dimensions seems appropriate here.

coll.mds.5 <- cmdscale(coll.dist, k = 5, eig = TRUE)
coll.mds.5

# Two-dimensional pictures of the college solution, drawn from the first two
# coordinates of the k = 5 fit:

par(pty = "s")  # square plotting region

## 'cex' sets how large the labels are printed on the plot.

# Full view, abbreviated college names:
plot(coll.mds.5$points[, 1:2], type = "n",
     xlim = c(-7, 7), ylim = c(-7, 7),
     xlab = "Coordinate 1", ylab = "Coordinate 2")
text(coll.mds.5$points[, 1:2], labels = coll.abbs, cex = 0.7)

# Zooming in on the cluster of points in the middle
# (narrower vertical range):

plot(coll.mds.5$points[, 1:2], type = "n",
     xlim = c(-7, 7), ylim = c(-4, 4),
     xlab = "Coordinate 1", ylab = "Coordinate 2")
text(coll.mds.5$points[, 1:2], labels = coll.abbs, cex = 0.7)

# Is it better to use the full row names as labels?

# The zoomed-in plot again, with full names (doesn't show Rutgers):

plot(coll.mds.5$points[, 1:2], type = "n",
     xlim = c(-7, 7), ylim = c(-4, 4),
     xlab = "Coordinate 1", ylab = "Coordinate 2")
text(coll.mds.5$points[, 1:2], labels = row.names(LargeColleges), cex = 0.5)

## Zooming in even more to see our favorite university:

plot(coll.mds.5$points[, 1:2], type = "n",
     xlim = c(0, 4), ylim = c(-2, 0),
     xlab = "Coordinate 1", ylab = "Coordinate 2")
text(coll.mds.5$points[, 1:2], labels = row.names(LargeColleges), cex = 0.7)

### 3-D display of the first three coordinates of the k = 5 solution,
### with each point labelled by its college name:

library(scatterplot3d)

s3d <- scatterplot3d(
  coll.mds.5$points[, 1],   # x axis
  coll.mds.5$points[, 2],   # y axis
  coll.mds.5$points[, 3],   # z axis
  main = "3-D Scatterplot of 3-D MDS Solution",
  xlab = "Coordinate 1",
  ylab = "Coordinate 2",
  zlab = "Coordinate 3",
  type = "h",               # vertical lines down to the x-y plane
  pch = 19, color = "blue"  # filled blue circles
)

# Convert the 3-D coordinates to their 2-D positions on the drawn projection
# so that ordinary text() can place the labels:
s3d.coords <- s3d$xyz.convert(coll.mds.5$points[, 1],
                              coll.mds.5$points[, 2],
                              coll.mds.5$points[, 3])
text(s3d.coords,                          # projected x and y coordinates
     labels = row.names(LargeColleges),   # college names
     pos = 4, cex = .5)                   # right of each point, half size

### Honestly there are too many observations for this plot to work well.

###############################################################################
###############################################################################

library(MASS)  # provides isoMDS()

# Nonmetric scaling of the cola dissimilarities, fitted in k = 2 dimensions:

cola.iso <- isoMDS(cola.diss, k = 2)

# Empty axes first, then each cola written as its row number in cola.diss;
# the two reference lines mark the origin.
plot(cola.iso$points, type = "n",
     xlab = "Coordinate 1", ylab = "Coordinate 2")
text(cola.iso$points, labels = as.character(seq_len(nrow(cola.diss))))
abline(h = 0, v = 0)


###############################
# Correspondence analysis     #
###############################

# Several contingency tables from the book.
# Each table is typed in row by row (byrow = TRUE), and the dimnames give
# the row labels first and the column labels second.  TRUE is spelled out
# everywhere because T is an ordinary variable that can be reassigned.

girls <- matrix(c(
  21, 21, 14, 13,  8,
   8,  9,  6,  8,  2,
   2,  3,  4, 10, 10),
  byrow = TRUE, ncol = 5, nrow = 3,
  dimnames = list(c('nbf', 'bfns', 'bfs'),
                  c('AG1', 'AG2', 'AG3', 'AG4', 'AG5')))

smokemoms <- matrix(c(
  50, 315, 24, 4012,
   9,  40,  6,  459,
  41, 147, 14, 1594,
   4,  11,  1,  124),
  byrow = TRUE, ncol = 4, nrow = 4,
  dimnames = list(c('YNS', 'YS', 'ONS', 'OS'),
                  c('pd', 'pa', 'ftd', 'fta')))

hodgkin <- matrix(c(
   74, 18, 12,
   68, 16, 12,
  154, 54, 58,
   18, 10, 44),
  byrow = TRUE, ncol = 3, nrow = 4,
  dimnames = list(c('LP', 'NS', 'MC', 'LD'),
                  c('positive', 'partial', 'none')))

# One that's not in the book --- a snoring / heart disease data set:

snore.heart.data <- matrix(c(
    24,  35,  51,
  1355, 603, 416),
  byrow = TRUE, ncol = 3, nrow = 2,
  dimnames = list(c("Yes", "No"),
                  c("Never", "Occasionally", "~ Every Night")))

# Print it laid out as a contingency table:
as.table(snore.heart.data)

#########################################################
## Code to get the matrices of chi-squared distances
## for both rows and columns
#########################################################
#################  BEGIN CODE ###########################

# Enter name of data matrix / contingency table:
mymat <- hodgkin
rr <- nrow(mymat)
cc <- ncol(mymat)
row.sums <- rowSums(mymat)
col.sums <- colSums(mymat)
N <- sum(row.sums)  # grand total of the table

# Profiles.  Note the naming:
#   pijcol[i, ] is row i divided by its row total (the profile of row i),
#   pijrow[, j] is column j divided by its column total (profile of column j).
# Assigning through [] fills the preallocated matrices and keeps them free
# of dimnames.
pijcol <- matrix(0, nrow = rr, ncol = cc)
pijrow <- matrix(0, nrow = rr, ncol = cc)
pijcol[] <- mymat / row.sums                 # recycles row.sums down each column
pijrow[] <- sweep(mymat, 2, col.sums, "/")   # divides column j by col.sums[j]

distmat.row <- matrix(0, nrow = rr, ncol = rr)
distmat.col <- matrix(0, nrow = cc, ncol = cc)

# Chi-squared distance between rows i and ii: squared differences between
# the two row profiles, each weighted by N / (column total), summed, then
# square-rooted.  seq_len(i - 1) is empty when i is 1, so the inner loop is
# skipped there; 1:(i - 1) would instead run over 1 and 0.
for (i in seq_len(rr)) {
  for (ii in seq_len(i - 1)) {
    my.hold <- sqrt(sum((N / col.sums) * (pijcol[i, ] - pijcol[ii, ])^2))
    distmat.row[i, ii] <- my.hold
    distmat.row[ii, i] <- my.hold   # fill both triangles: symmetric matrix
  }
}

# Same construction for columns j and jj, weighting by N / (row total):
for (j in seq_len(cc)) {
  for (jj in seq_len(j - 1)) {
    my.hold <- sqrt(sum((N / row.sums) * (pijrow[, j] - pijrow[, jj])^2))
    distmat.col[j, jj] <- my.hold
    distmat.col[jj, j] <- my.hold
  }
}

# Distances between columns, then between rows, to two decimals:
round(distmat.col, digits = 2)
round(distmat.row, digits = 2)

###################  END CODE ###########################
#########################################################

# A straightforward chi-square test for independence of rows and columns:

chisq.test(hodgkin)

# We see that histological classification and response to treatment are
# clearly associated.

############################################################################

# A quick way to do correspondence analysis: the "corresp" function
# in the MASS library.

library(MASS)  # loading the MASS package

# The two-dimensional solution, printed and then drawn as a biplot with
# reference lines through the origin:
hodgkin.ca.2 <- corresp(hodgkin, nf = 2)
hodgkin.ca.2

biplot(hodgkin.ca.2)
abline(h = 0, v = 0)

# It apparently uses a slightly different algorithm than the book's method:
# While the results are close, they are not identical.

# Some conclusions:

# LP and NS have very similar row profiles.
# "No response" occurs especially often with LD.
# "Positive response" is associated with LP and with NS.
# "Partial response" is linked with MC.

# Examining the 1-dimensional solution can be useful in interpretation as well:

corresp(hodgkin, nf = 1)

# Note "LD" and "none" both have large positive coordinates.
# Note "LP" (and "NS") and "positive" both have large negative coordinates.
# But "LD" and "positive" have opposite signs.

# To (nearly) replicate the book's examples in section 5.3 and in 5.3.1,
# replace "hodgkin" with "girls" or "smokemoms" in the above code.

# What are the substantive conclusions in the "smoking / motherhood" analysis?