# R Examples for Chapter 14: Inference for Categorical Data ## Example 1: # The M&Ms data set: # The observed counts are 61, 64, 54, 61, 96, 64. # We test whether the corresponding cell probabilities are # (0.13, 0.14, 0.13, 0.24, 0.20, 0.16) using the chisq.test() function in R: chisq.test(c(61, 64, 54, 61, 96, 64), p=c(0.13, 0.14, 0.13, 0.24, 0.20, 0.16), correct=FALSE) ## Example 2(a): # Category counts are 229, 211, 93, 35, 8. # Category probabilities based on a Poisson(1) distribution: pois.probs <- c(dpois(0:3,lambda=1), ppois(3,lambda=1,lower=F)) sum(pois.probs) # check that they sum to 1... # Expected counts under H0: pois.probs*576 # All are at least 5... chisq.test(c(229, 211, 93, 35, 8), p=pois.probs, correct=FALSE) ## Example 2(b): # Category counts are 229, 211, 93, 35, 8. # Category probabilities based on a Poisson distribution (mean unspecified): # First need to estimate lambda using MLE (see notes) lambda.hat_ML <- 534/576 pois.probs.est <- c(dpois(0:3,lambda=lambda.hat_ML), ppois(3,lambda=lambda.hat_ML,lower=F)) sum(pois.probs.est) # check that they sum to 1... # Expected counts under H0: pois.probs.est*576 # All are at least 5... # Can't use 'chisq.test' directly (R doesn't know to take away 1 d.f. for the estimated parameter): chisq.stat <- chisq.test(c(229, 211, 93, 35, 8), p=pois.probs.est, correct=FALSE)$statistic print(paste("Chi-square test stat= ",round(chisq.stat,2),"P-value= ", round(1-pchisq(chisq.stat,df=3),3) )) ## Example 3: # R code to analyze an r by c contingency table # and do chi-squared test of independence # We look at the happiness / marital status data set: # The observed counts may be input as a matrix as follows: happy.data <- matrix(c(1320,467,603,93,119,127), nrow=2, ncol=3, byrow=TRUE, dimnames = list(c("Very/Pretty Happy", "Not too Happy"), c("Married", "Divorced/Separated", "Never Married"))) # Printing the 2 by 3 table: as.table(happy.data) # We test for independence of the two classifications # using the chisq.test() function in R: chisq.test(happy.data, correct=FALSE) # The P-value is near zero. # A quick way to get expected cell counts to verify large-sample assumption is met: expected.counts <- (apply(happy.data,1,sum) %o% apply(happy.data,2,sum))/sum(happy.data) print(expected.counts) ## Example 5: # R code to analyze an r by c contingency table # and do chi-squared test of homogeneity # We look at the political ideology data set: # The observed counts may be input as a matrix as follows: polit.data <- matrix(c(13,21,16,17,13,12,20,16,22), nrow=3, ncol=3, byrow=TRUE, dimnames = list(c("Liberal", "Moderate", "Conservative"), c("Clemson", "UofSC", "CofC"))) # Printing the 3 by 3 table: as.table(polit.data) # We test for homogeneity # using the chisq.test() function in R: chisq.test(polit.data, correct=FALSE) # The P-value is 0.416. # A quick way to get expected cell counts to verify large-sample assumption is met: expected.counts <- (apply(polit.data,1,sum) %o% apply(polit.data,2,sum))/sum(polit.data) print(expected.counts)