##########################################################################
######################                                                   #           
# R example code to calculate summary statistics for multivariate data:  #
######################                                                   #
##########################################################################


# load 'HSAUR3' package:
library(HSAUR3)

# load USairpollution data set:
data(USairpollution)
# See first 5 data rows:
head(USairpollution)

# The observations are 41 cities (data were collected in 1981).
# These are the 7 variables:
# SO2: SO2 (sulphur dioxide) content of air in micrograms per cubic metre;
# temp: average annual temperature in degrees Fahrenheit;
# manu: number of manufacturing enterprises employing 20 or more workers;
# popul: population size (1970 census) in thousands;
# wind: average annual wind speed in miles per hour;
# precip: average annual precipitation in inches;
# predays: average number of days with precipitation per year.

#attach(USairpollution) # attaching the data frame


############################################
############################################

# Getting the sample mean vector:

colMeans(USairpollution)

# Getting the sample covariance matrix:

var(USairpollution)

# Rounding off:  round(var(USairpollution),digits=2)

# Note the diagonal elements of this are the sample variances for the 7 variables.

############################################
############################################

# Getting the sample correlation matrix:

cor(USairpollution)

# Rounding off:  round(cor(USairpollution),digits=2)

############################################
############################################

# Calculations to show the relationship between the covariance and correlation matrices:

# This is the long way...

my.S <- var(USairpollution)

D.minus.12 <- diag( 1/sqrt(diag(my.S) )  )

my.R <- D.minus.12 %*% my.S %*% D.minus.12

my.R

# Getting the correlation matrix (if you are given the covariance matrix) the quick way:

cov2cor(my.S)

## Be careful about the difference between the 'cor' and 'cov2cor' functions!

# NOTE:  If you have the DATA matrix and you want to get the sample correlation matrix, then use
# the 'cor' function, e.g.:
# cor(data_matrix)

# If you have the COVARIANCE matrix and you want to calculate the correlation matrix from this, then use
# the 'cov2cor' function, e.g.:
# cov2cor(covar_matrix)


############################################
############################################

# Getting distance matrix (Euclidean distances) between raw observations

dis <- dist(USairpollution)

dist2full<-function(ds){
  n<-attr(ds,"Size")
  full<-matrix(0,n,n)
  full[lower.tri(full)]<-ds
  dist.mat<-full+t(full)
  rownames(dist.mat) <- colnames(dist.mat) <- attributes(ds)$Labels
  return(dist.mat)
}
dis.matrix<-dist2full(dis)
round(dis.matrix,digits=2)

# Getting distance matrix (Euclidean distances) between SCALED observations

std <- sapply(USairpollution, sd)  # finding standard deviations of variables

USairpollution.std <- sweep(USairpollution,2,std,FUN="/")  # dividing each variable by its standard deviation

dis <- dist(USairpollution.std)

dis.matrix<-dist2full(dis)
round(dis.matrix,digits=2)

# Finding the smallest and largest distances within a distance matrix or distance object,
# and also printing which pairs of objects these correspond to:
# (Make sure dist2full function above has been copied into workspace first)

# Copy and paste the following pick.smallest.largest function into R:

############
##
#
pick.smallest.largest <- function(ds, howmany=1){
if (is.matrix(ds)==TRUE) {
if (is.vector(rownames(ds))==TRUE) {
  ds.Labels <- rownames(ds)
  } else {
  ds.Labels <- 1:nrow(ds)
  }
#print(ds.Labels)
disLT <- ds[lower.tri(ds)]
ds2 <- ds
} else {
if (is.vector(attributes(ds)$Labels)==TRUE) {
  ds.Labels <- attributes(ds)$Labels
  } else {
  ds.Labels <- 1:attributes(ds)$Size
  }
ds2 <- dist2full(ds)
disLT <- ds2[lower.tri(ds2)]
}
smallest.few <- sort(disLT)[1:howmany]
largest.few <- sort(disLT,decreasing=T)[1:howmany]
print(paste("smallest", howmany, "distances are: ")); print(smallest.few)
object.pairs <- object.pairs2 <- matrix(0,nrow=howmany,ncol=2)
for (i in 1:howmany){
object.pairs[i,] <- which(ds2==smallest.few[i],arr.ind=T)[2,]
}
print("these smallest distances correspond to the pairs of objects given in the rows below:"); print(object.pairs)
object.pairs.labels <- matrix(ds.Labels[object.pairs],ncol=2)
print("the labels for these pairs of objects given in the rows below:"); print(object.pairs.labels)
print(paste("largest", howmany, "distances are: ")); print(largest.few)
for (i in 1:howmany){
object.pairs2[i,] <- which(ds2==largest.few[i],arr.ind=T)[2,]
}
print("these largest distances correspond to the pairs of objects given in the rows below:"); print(object.pairs2)
object.pairs2.labels <- matrix(ds.Labels[object.pairs2],ncol=2)
print("the labels for these pairs of objects are given in the rows below:"); print(object.pairs2.labels)
}
#
##
############

# Using the pick.smallest.largest function:
pick.smallest.largest(dis,3)
pick.smallest.largest(dis.matrix,3)


# A measure of distance between the variables:

1 - abs(cor(USairpollution))

############################################
############################################

# Normal Q-Q plots for the first four variables in the data set, considered separately:
# This checks normality of the marginal distributions

par(mfrow=c(2,2))  # Setting up for a 2 by 2 array of plots

qqnorm(USairpollution[,1], ylab="Ordered Observations", main = "Normal Q-Q Plot, Sulphur Dioxide")
qqline(USairpollution[,1])
qqnorm(USairpollution[,2], ylab="Ordered Observations", main = "Normal Q-Q Plot, Temperature")
qqline(USairpollution[,2])
qqnorm(USairpollution[,3], ylab="Ordered Observations", main = "Normal Q-Q Plot, Manufacturing")
qqline(USairpollution[,3])
qqnorm(USairpollution[,4], ylab="Ordered Observations", main = "Normal Q-Q Plot, Population")
qqline(USairpollution[,4])

par(mfrow=c(1,1))


# Using Everitt's chisplot function to check multivariate normality of entire data set:

# Copy the chisplot function into R:
###############
###
chisplot <- function(x) {
    if (!is.matrix(x)) stop("x is not a matrix")

    ### determine dimensions
    n <- nrow(x)
    p <- ncol(x)
    #
    xbar <- apply(x, 2, mean)
    S <- var(x)
    S <- solve(S,tol=1e-40)
    index <- (1:n)/(n+1)
    #
    xcent <- t(t(x) - xbar)
    di <- apply(xcent, 1, function(x,S) x %*% S %*% x,S)
    #
    quant <- qchisq(index,p)
    plot(quant, sort(di), ylab = "Ordered distances",
         xlab = "Chi-square quantile", lwd=2,pch=1)
   
}
###
###############

# Use the chisplot function:

chisplot(as.matrix(USairpollution))


# Try it on a bigger data set (Fisher's iris data set, which is a built-in data set in R):

fisher.iris <- iris[,1:4]
chisplot(as.matrix(fisher.iris))

# For the three types of irises, separately:

fisher.iris.setosa <- iris[1:50,1:4]
chisplot(as.matrix(fisher.iris.setosa))

fisher.iris.versicolor <- iris[51:100,1:4]
chisplot(as.matrix(fisher.iris.versicolor))

fisher.iris.virginica <- iris[101:150,1:4]
chisplot(as.matrix(fisher.iris.virginica))