########################################################################## ###################### # # R example code to calculate summary statistics for multivariate data: # ###################### # ########################################################################## # load 'HSAUR3' package: library(HSAUR3) # load USairpollution data set: data(USairpollution) # See first 5 data rows: head(USairpollution) # The observations are 41 cities (data were collected in 1981). # These are the 7 variables: # SO2: SO2 (sulphur dioxide) content of air in micrograms per cubic metre; # temp: average annual temperature in degrees Fahrenheit; # manu: number of manufacturing enterprises employing 20 or more workers; # popul: population size (1970 census) in thousands; # wind: average annual wind speed in miles per hour; # precip: average annual precipitation in inches; # predays: average number of days with precipitation per year. #attach(USairpollution) # attaching the data frame ############################################ ############################################ # Getting the sample mean vector: colMeans(USairpollution) # Getting the sample covariance matrix: var(USairpollution) # Rounding off: round(var(USairpollution),digits=2) # Note the diagonal elements of this are the sample variances for the 7 variables. ############################################ ############################################ # Getting the sample correlation matrix: cor(USairpollution) # Rounding off: round(cor(USairpollution),digits=2) ############################################ ############################################ # Calculations to show the relationship between the covariance and correlation matrices: # This is the long way... my.S <- var(USairpollution) D.minus.12 <- diag( 1/sqrt(diag(my.S) ) ) my.R <- D.minus.12 %*% my.S %*% D.minus.12 my.R # Getting the correlation matrix (if you are given the covariance matrix) the quick way: cov2cor(my.S) ## Be careful about the difference between the 'cor' and 'cov2cor' functions! # NOTE: If you have the DATA matrix and you want to get the sample correlation matrix, then use # the 'cor' function, e.g.: # cor(data_matrix) # If you have the COVARIANCE matrix and you want to calculate the correlation matrix from this, then use # the 'cov2cor' function, e.g.: # cov2cor(covar_matrix) ############################################ ############################################ # Getting distance matrix (Euclidean distances) between raw observations dis <- dist(USairpollution) dist2full<-function(ds){ n<-attr(ds,"Size") full<-matrix(0,n,n) full[lower.tri(full)]<-ds dist.mat<-full+t(full) rownames(dist.mat) <- colnames(dist.mat) <- attributes(ds)$Labels return(dist.mat) } dis.matrix<-dist2full(dis) round(dis.matrix,digits=2) # Getting distance matrix (Euclidean distances) between SCALED observations std <- sapply(USairpollution, sd) # finding standard deviations of variables USairpollution.std <- sweep(USairpollution,2,std,FUN="/") # dividing each variable by its standard deviation dis <- dist(USairpollution.std) dis.matrix<-dist2full(dis) round(dis.matrix,digits=2) # Finding the smallest and largest distances within a distance matrix or distance object, # and also printing which pairs of objects these correspond to: # (Make sure dist2full function above has been copied into workspace first) # Copy and paste the following pick.smallest.largest function into R: ############ ## # pick.smallest.largest <- function(ds, howmany=1){ if (is.matrix(ds)==TRUE) { if (is.vector(rownames(ds))==TRUE) { ds.Labels <- rownames(ds) } else { ds.Labels <- 1:nrow(ds) } #print(ds.Labels) disLT <- ds[lower.tri(ds)] ds2 <- ds } else { if (is.vector(attributes(ds)$Labels)==TRUE) { ds.Labels <- attributes(ds)$Labels } else { ds.Labels <- 1:attributes(ds)$Size } ds2 <- dist2full(ds) disLT <- ds2[lower.tri(ds2)] } smallest.few <- sort(disLT)[1:howmany] largest.few <- sort(disLT,decreasing=T)[1:howmany] print(paste("smallest", howmany, "distances are: ")); print(smallest.few) object.pairs <- object.pairs2 <- matrix(0,nrow=howmany,ncol=2) for (i in 1:howmany){ object.pairs[i,] <- which(ds2==smallest.few[i],arr.ind=T)[2,] } print("these smallest distances correspond to the pairs of objects given in the rows below:"); print(object.pairs) object.pairs.labels <- matrix(ds.Labels[object.pairs],ncol=2) print("the labels for these pairs of objects given in the rows below:"); print(object.pairs.labels) print(paste("largest", howmany, "distances are: ")); print(largest.few) for (i in 1:howmany){ object.pairs2[i,] <- which(ds2==largest.few[i],arr.ind=T)[2,] } print("these largest distances correspond to the pairs of objects given in the rows below:"); print(object.pairs2) object.pairs2.labels <- matrix(ds.Labels[object.pairs2],ncol=2) print("the labels for these pairs of objects are given in the rows below:"); print(object.pairs2.labels) } # ## ############ # Using the pick.smallest.largest function: pick.smallest.largest(dis,3) pick.smallest.largest(dis.matrix,3) # A measure of distance between the variables: 1 - abs(cor(USairpollution)) ############################################ ############################################ # Normal Q-Q plots for the first four variables in the data set, considered separately: # This checks normality of the marginal distributions par(mfrow=c(2,2)) # Setting up for a 2 by 2 array of plots qqnorm(USairpollution[,1], ylab="Ordered Observations", main = "Normal Q-Q Plot, Sulphur Dioxide") qqline(USairpollution[,1]) qqnorm(USairpollution[,2], ylab="Ordered Observations", main = "Normal Q-Q Plot, Temperature") qqline(USairpollution[,2]) qqnorm(USairpollution[,3], ylab="Ordered Observations", main = "Normal Q-Q Plot, Manufacturing") qqline(USairpollution[,3]) qqnorm(USairpollution[,4], ylab="Ordered Observations", main = "Normal Q-Q Plot, Population") qqline(USairpollution[,4]) par(mfrow=c(1,1)) # Using Everitt's chisplot function to check multivariate normality of entire data set: # Copy the chisplot function into R: ############### ### chisplot <- function(x) { if (!is.matrix(x)) stop("x is not a matrix") ### determine dimensions n <- nrow(x) p <- ncol(x) # xbar <- apply(x, 2, mean) S <- var(x) S <- solve(S,tol=1e-40) index <- (1:n)/(n+1) # xcent <- t(t(x) - xbar) di <- apply(xcent, 1, function(x,S) x %*% S %*% x,S) # quant <- qchisq(index,p) plot(quant, sort(di), ylab = "Ordered distances", xlab = "Chi-square quantile", lwd=2,pch=1) } ### ############### # Use the chisplot function: chisplot(as.matrix(USairpollution)) # Try it on a bigger data set (Fisher's iris data set, which is a built-in data set in R): fisher.iris <- iris[,1:4] chisplot(as.matrix(fisher.iris)) # For the three types of irises, separately: fisher.iris.setosa <- iris[1:50,1:4] chisplot(as.matrix(fisher.iris.setosa)) fisher.iris.versicolor <- iris[51:100,1:4] chisplot(as.matrix(fisher.iris.versicolor)) fisher.iris.virginica <- iris[101:150,1:4] chisplot(as.matrix(fisher.iris.virginica))