1 Introduction

R is one of the most popular scripting langurages in both academic and industry, which is widely employed in ITechology, engineering, Biomedical, econometrics and finance etc. The Comprehensive R Archieve Network(CRAN) has a considerable and consistenly increasing number of packages that contain lots of high-quality functions that make R more popular in many disciplines.

This notebook includes the R code that produces related outputs in the lecture notes. You can copy-and-paste the code to your RStudio console for realizing the results and plots. In order to use some certain packages, you need first install them from the CRAN by using install.packages(“package_name”). Then, you can source the according package by doing library(packagename) in order to use the functions inside it.

2 Probability

2.1 GPA simulation

x1=rnorm(30,630,50)     ##SAT MATH scores
x2=rnorm(30,3,.25)      ##the MATH 141 grades
y=0.5*x1+2*x2+rnorm(30) ##linear relationship with random noise
y=(max(y)-y)/(max(y)-min(y))*100 ##scaled to range from 0 to 100

library(scatterplot3d)
with(mtcars, {
  s3d<-scatterplot3d(x1, x2, y,           #x y and z axis
                      color="blue",pch=19, #filled blue circles
                      type="h", #vertical lines to the x-y plane
                      main=" ",
                      xlab="SAT MATH",
                      ylab="MATH 141",
                      zlab="STAT 509")
})

2.2 stock price simulation

n=480
xx <- vector("numeric",n)
ww <- rnorm(n)     # innovations (process errors)
xx[1:2] <- c(0,1)  # set first 2 times to innovations
# simulate AR(2)
for(t in 3:n) { 
  xx[t] <- 0.571+0.7*xx[t-1] + 0.25*xx[t-2] + ww[t]
}

3 Discrete Distribution

3.1 Discrete Distribution

x = c(0,1,2,3,4,5,6)
prob = c(0.10,0.15,0.20,0.25,0.20,0.06,0.04)
cdf = c(0,cumsum(prob)); cdf.plot = stepfun(x,cdf,f=0)
op<-par(mfrow=c(1,2))
# Plot PMF
plot(x,prob,type="h",xlab="x",ylab="PMF",ylim=c(0,0.26))
abline(h=0)
# Plot CDF
plot.stepfun(cdf.plot,xlab="x",ylab="CDF",verticals=FALSE,do.points=TRUE,main="",pch=16)

3.2 Binomial Distribution

y = 0:10; prob = dbinom(y,10,0.4)
cdf = c(0,cumsum(prob)); cdf.plot = stepfun(y,cdf,f=0)
op<-par(mfrow=c(1,2))
# Plot PMF
plot(y,prob,type="h",xlab="y",ylab="PMF",ylim=c(0,0.275))
abline(h=0)
# Plot CDF

plot.stepfun(cdf.plot,xlab="y",ylab="CDF",verticals=FALSE,do.points=TRUE,main="",pch=16)

3.3 Geometric Distribution

y = 1:30; prob = dgeom(y-1,0.25)
cdf = c(0,cumsum(prob)); cdf.plot = stepfun(y,cdf,f=0)
op<-par(mfrow=c(1,2))
# Plot PMF
plot(y,prob,type="h",xlab="y",ylab="PMF",ylim=c(0,0.25))
abline(h=0)
# Plot CDF
plot.stepfun(cdf.plot,xlab="y",ylab="CDF",verticals=FALSE,do.points=TRUE,main="",pch=16)

3.4 Negative Binomial Distribution

y = 3:70; prob = dnbinom(y-3,3,0.15)
cdf = c(0,cumsum(prob)); cdf.plot = stepfun(y,cdf,f=0)
op<-par(mfrow=c(1,2))
# Plot PMF
plot(y,prob,type="h",xlab="y",ylab="PMF",ylim=c(0,0.05))
abline(h=0)
# Plot CDF
plot.stepfun(cdf.plot,xlab="y",ylab="CDF",verticals=FALSE,do.points=TRUE,main="",pch=16)

3.5 Hypergeometric Distribution

# Hypergeometric PMF and CDF
y = 0:5; prob = dhyper(y,10,100-10,5)
cdf = c(0,cumsum(prob)); cdf.plot = stepfun(y,cdf,f=0)
op<-par(mfrow=c(1,2))
# Plot PMF
plot(y,prob,type="h",xlab="y",ylab="PMF",ylim=c(0,0.6))
abline(h=0)
# Plot CDF
plot.stepfun(cdf.plot,xlab="y",ylab="CDF",verticals=FALSE,do.points=TRUE,main="",pch=16)

3.6 Poisson Distribution

y = 0:10; prob = dpois(y,2.5)
cdf = c(0,cumsum(prob)); cdf.plot = stepfun(y,cdf,f=0)
op<-par(mfrow=c(1,2))
# Plot PMF
plot(y,prob,type="h",xlab="y",ylab="PMF",ylim=c(0,0.275))
abline(h=0)
# Plot CDF
plot.stepfun(cdf.plot,xlab="y",ylab="CDF",verticals=FALSE,do.points=TRUE,main="",pch=16)

4 Continuous Distribution

4.1 Continuous Distribution

mypdf<- function(y) ifelse(y>12.5,20*exp(-20*(y-12.5)),0)
mycdf<- function(y) ifelse(y>12.5,1-exp(-20*(y-12.5)),0)
par(mfrow=c(1,2))
curve(mypdf,12.4,13,xlab="y",ylab = "pdf")
curve(mycdf,12.4,13,xlab="y",ylab = "cdf")

##calculate E(Y)
integrand1<-function(y) y*20*exp(-20*(y-12.5))
integrate(integrand1,lower=12.5,upper = Inf)

## 12.55 with absolute error < 1.3e-07

##calculate Var(Y)
integrand2<-function(y) (y-12.55)^2*20*exp(-20*(y-12.5))
integrate(integrand2,lower=12.5,upper = Inf)

## 0.0025 with absolute error < 4.1e-08

##calculate the median
integrand3<-function(y) 1-exp(-20*(y-12.5))-0.5
uniroot(integrand3,interval = c(12.5,13))$root

## [1] 12.53463

4.2 exponential distribution

par(mfrow=c(1,2))
curve(dexp(x,0.5),0,8,lty=1,col=1,ylab="pdf")
curve(dexp(x,1),0,8,lty=2,col=2,add=TRUE)
curve(dexp(x,2),0,8,lty=3,col=3,add=TRUE)
legend("topright",legend=c(expression(lambda==1/2),
                             expression(lambda==1),
                             expression(lambda==2)),
        col=1:3,lty=1:3,seg.len=1,bty="n")

curve(pexp(x,0.5),0,8,lty=1,col=1,ylab="cdf")
curve(pexp(x,1),0,8,lty=2,col=2,add=TRUE)
curve(pexp(x,2),0,8,lty=3,col=3,add=TRUE)

4.3 Gamma distribution

par(mfrow=c(1,2))
curve(dgamma(x,1,.5),0,15,ylim=c(0,0.5),col=1,ylab="pdf")
curve(dgamma(x,2,1),0,15,col=2,add=TRUE)
curve(dgamma(x,5,2),0,15,col=3,add=TRUE)
curve(dgamma(x,9,2),0,15,col=4,add=TRUE)
legend("topright",
       legend=c(expression(paste(alpha==1,", ",lambda==0.5)),
                expression(paste(alpha==2,", ",lambda==1)),
                expression(paste(alpha==5,", ",lambda==2)),
                expression(paste(alpha==9,", ",lambda==2))),
        col=1:4,lty=rep(1,4),seg.len=1,bty="n")

curve(pgamma(x,1,.5),0,15,col=1,ylab="cdf")
curve(pgamma(x,2,1),0,15,col=2,add=TRUE)
curve(pgamma(x,5,2),0,15,col=3,add=TRUE)
curve(pgamma(x,9,2),0,15,col=4,add=TRUE)

4.4 Normal distribution

par(mfrow=c(1,2))
curve(dnorm(x,-2,2),-10,10,ylim=c(0,0.5),col=1,ylab="pdf")
curve(dnorm(x,0,1),-10,10,col=2,add=TRUE)
curve(dnorm(x,1,3),-10,10,col=3,add=TRUE)
legend("topright",
       legend=c(expression(paste(mu==-2,", ",sigma==2)),
                expression(paste(mu==0,", ",sigma==1)),
                expression(paste(mu==1,", ",sigma==3))),
        col=1:3,lty=rep(1,3),seg.len=1,bty="n")

curve(pnorm(x,-2,2),-10,10,col=1,ylab="cdf")
curve(pnorm(x,0,1),-10,10,col=2,add=TRUE)
curve(pnorm(x,1,3),-10,10,col=3,add=TRUE)

5 Reliability and Lifetime Distribution

5.1 Weibull distribution

par(mfrow=c(1,2))
curve(dweibull(x,2,5),0,25,col=1,lty=1,xlab="t",ylab="pdf")
curve(dweibull(x,2,10),0,25,col=2,lty=2,add=T)
curve(dweibull(x,3,10),0,25,col=3,lty=3,add=T)
legend("topright",
       legend=c(expression(paste(beta==2,", ",eta==5)),
                expression(paste(beta==2,", ",eta==10)),
                expression(paste(beta==3,", ",eta==10))),
        col=1:3,lty=1:3,seg.len=1,bty="n")

curve(pweibull(x,2,5),0,25,col=1,lty=1,xlab="t",ylab="cdf")
curve(pweibull(x,2,10),0,25,col=2,lty=2,add=T)
curve(pweibull(x,3,10),0,25,col=3,lty=3,add=T)

5.2 hazard function

Given \(T \sim Weibull(\beta,\eta)\), the hazard function of \(T\) is \[h_T(t)=\frac{f_T(t)}{S_T(t)}=\frac{\frac{\beta}{\eta}(\frac{t}{\eta})^{\beta-1} e^{-(t/\eta)^{\beta}}}{e^{-(t/\eta)^{\beta}}}=\frac{\beta}{\eta}(\frac{t}{\eta})^{\beta-1} \]

par(mfrow=c(2,2))
curve(dweibull(x,3)/pweibull(x,3,lower.tail=F),0,4,col=1,
      lty=1,xlab="t",ylab="hazard")
legend("bottomright",legend=expression(paste(beta==3,", ",eta==1)),
       col=1,lty=1,seg.len=1,bty="n")

curve(dweibull(x,1.5)/pweibull(x,1.5,lower.tail = F),0,4,
      col=2,lty=2,xlab="t",ylab="")
legend("bottomright",legend=expression(paste(beta==1.5,", ",eta==1)),
       col=2,lty=2,seg.len=1,bty="n")

curve(dweibull(x,1)/pweibull(x,1,lower.tail = F),0,4,
      col=3,lty=3,xlab="t",ylab="hazard")
legend("topright",legend=expression(paste(beta==1,", ",eta==1)),
       col=3,lty=3,seg.len=1,bty="n")

curve(dweibull(x,0.5)/pweibull(x,0.5,lower.tail = F),0,4,
      col=4,lty=4,xlab = "t",ylab = " ")
legend("topright",legend=expression(paste(beta==0.5,", ",eta==1)),
       col=4,lty=4,seg.len=1,bty="n")

5.3 Quantile-quantile plot

library(MASS)
##input the cart data
cart.data = c(3.9,4.2,5.4,6.5,7.0,8.8,9.2,11.4,14.3,15.1,
              15.3,15.5,17.9,18.0,19.0,19.0,23.9,24.8,26.0,34.2)
# Fit Weibull model
fitdistr(cart.data,densfun="weibull")

##      shape        scale   
##    1.9889187   16.9359807 
##  ( 0.3504024) ( 2.0081518)

beta.hat=1.99; eta.hat=16.94

##draw the QQ plot 
library(car)
par(mfrow=c(1,1))
qqPlot(cart.data,distribution="weibull",shape=beta.hat,scale=eta.hat,
       xlab="Weibull quantiles",ylab="Cart data",pch=16)

6 Statistical Inference

6.1 Battery lifetime data

battery = c(4285,2066,2584,1009,318,1429,981,1402,1137,414,
564,604,14,4152,737,852,1560,1786,520,396,
1278,209,349,478,3032,1461,701,1406,261,83,
205,602,3770,726,3894,2662,497,35,2778,1379,
3920,1379,99,510,582,308,3367,99,373,454)

# Create histogram and boxplot
op<-par(mfrow=c(1,2))
hist(battery,xlab="Lifetime (in hours)",ylab="Count",main="")
boxplot(battery,xlab="",ylab="Lifetime (in hours)",col="grey")

par(op)
# Calculate summary statistics
mean(battery) ## sample mean

## [1] 1274.14

var(battery) ## sample variance

## [1] 1505156

sd(battery) ## sample standard deviation

## [1] 1226.848

6.2 Brake time example

y = seq(0,3,0.01)
# Plot population distribution
curve(dnorm(x,1.5,sqrt(0.16)),0,3,type="l",lty=1,col=1,lwd=2,
      xlab="Brake times (sec)",ylab="PDF",ylim=c(0,5))
# Add sampling distribution when n=5
curve(dnorm(x,1.5,sqrt(0.16/5)),0,3,lty=4,col=2,lwd=2,add=TRUE)
# Add sampling distribution when n=25
curve(dnorm(x,1.5,sqrt(0.16/25)),0,3,lty=8,col=3,lwd=2,add=TRUE)
abline(h=0)
# Add legend
legend("topleft",lty = c(1,4,8),bty = "n",cex = 0.8,col=1:3,lwd=rep(2,3),
c(
expression(paste("Population distribution")),
expression(paste("Sample mean, n=5")),
expression(paste("Sample mean, n=25"))
))

6.3 Rat example

y = seq(0,20,0.01)
# Plot population distribution
curve(dexp(x,1/5),0,20,type="l",lty=1,col=1,lwd=2,
      xlab="Time to death (in days)",ylab="PDF",ylim=c(0,0.4))
# Add the exact distribution for sample mean when n=5
curve(dgamma(x,5,1),0,20,col=2,lwd=2,lty=4,add=TRUE)
# Add the exact distribution for sample mean when n=25
curve(dgamma(x,25,5),0,20,col=3,lwd=2,lty=8,add=TRUE)
abline(h=0)
abline(v=0,lty=2)
# Add legend
legend("topright",lty = c(1,4,8),bty="n",cex=0.8,col=1:3,lwd=rep(2,3),
c(
expression(paste("Population distribution")),
expression(paste("Sample mean, n=5")),
expression(paste("Sample mean, n=25"))
))

6.4 \(N(0,1)\) and t PDFs

# Plot N(0,1) pdf
curve(dnorm(x,0,1),-5,5,type="l",lty=1,col=1,lwd=2,
      xlab="",ylab="PDF",ylim=c(0,0.4))
# Add other pdfs
curve(dt(x,2),-5,5,col=2,lwd=2,lty=4,add=TRUE)
curve(dt(x,10),-5,5,col=3,lwd=2,lty=8,add=TRUE)
abline(h=0)
# Add legend
legend("topleft",lty = c(1,4,8),col=1:3,lwd=rep(2,3),cex=.8,
c(
expression(paste("N(0,1)")),
expression(paste("t(2)")),
expression(paste("t(10)"))
))

6.5 Pipe diameter data

pipes = c(1.296,1.320,1.311,1.298,1.315,1.305,1.278,1.294,1.311,1.290,1.284,1.287,1.289,1.292,1.301,
1.298,1.287,1.302,1.304,1.301,1.313,1.315,1.306,1.289,1.291)
# Calculate summary statistics
mean(pipes) ## sample mean

## [1] 1.29908

sd(pipes) ## sample standard deviation

## [1] 0.01108272

# Plot t(24) pdf
curve(dt(x,24),-5,5,type="l",lty=1,lwd=2,
      xlab="t",ylab="PDF",ylim=c(0,0.4))
abline(h=0)
# Add points
points(x=4.096,y=0,pch=4,cex=1.5)

# Create normal qqplot
# Need to install (then load) car package in R
qqPlot(pipes,distribution="norm",xlab="N(0,1) quantiles",ylab="Pipe data",pch=16)

7 One-Sample Inference

7.1 t pdf with quantiles and shading

x = seq(-4,4,0.001)
pdf = dt(x,10)
plot(x,pdf,type="l",lty=1,xlab="t",ylab="PDF",ylim=c(0,0.4))
abline(h=0)
x = seq(-4,qt(0.025,10),0.001)
y = dt(x,10)
polygon(c(-4,x,qt(0.025,10)),c(0,y,0),col="grey")
# Add points
points(x=qt(0.025,10),y=0,pch=19,cex=1.5)
x = seq(qt(0.975,10),4,0.001)
y = dt(x,10)
polygon(c(qt(0.975,10),x,4),c(0,y,0),col="grey")
# Add points
points(x=qt(0.975,10),y=0,pch=19,cex=1.5)
# Add text
text(-0.025,0.1,expression(1-alpha))
text(-3.5,0.05,expression(alpha/2))
text(3.5,0.05,expression(alpha/2))

7.2 Cadmium data

cadmium = c(0.044,0.030,0.052,0.044,0.046,0.020,0.066,
0.052,0.049,0.030,0.040,0.045,0.039,0.039,
0.039,0.057,0.050,0.056,0.061,0.042,0.055,
0.037,0.062,0.062,0.070,0.061,0.061,0.058,
0.053,0.060,0.047,0.051,0.054,0.042,0.051)
# Calculate summary statistics
mean(cadmium) ## sample mean

## [1] 0.04928571

sd(cadmium) ## sample standard deviation

## [1] 0.0110894

# Calculate confidence interval directly
t.test(cadmium,conf.level=0.99)$conf.int

## [1] 0.04417147 0.05439996
## attr(,"conf.level")
## [1] 0.99

# Create normal qqplot
# Need to install (then load) car package in R
qqPlot(cadmium,distribution="norm",xlab="N(0,1) quantiles",ylab="Cadmium data",pch=16)

7.3 Chi-square PDFs

y = seq(0,45,0.01)
# Plot chi-square pdf with 5 df
plot(y,dchisq(y,5),type="l",lty=1,xlab=expression(chi^2),ylab="PDF",ylim=c(0,0.16),col=1,lwd=2)
# Add other pdfs
lines(y,dchisq(y,10),lty=4,col=2,lwd=2)
lines(y,dchisq(y,20),lty=8,col=3,lwd=2)
abline(h=0)
# Add legend
legend(25,0.125,lty = c(1,4,8),col=1:3,lwd=rep(2,3),
c(
expression(paste(nu, "=5")),
expression(paste(nu, "=10")),
expression(paste(nu, "=20"))
))

7.4 chi-square pdf with quantiles and shading

x = seq(0,30,0.001)
pdf = dchisq(x,10)
plot(x,pdf,type="l",lty=1,xlab=expression(chi^2),ylab="PDF",ylim=c(0,0.1))
abline(h=0)
x = seq(0,qchisq(0.025,10),0.001)
y = dchisq(x,10)
polygon(c(0,x,qchisq(0.025,10)),c(0,y,0),col="grey")
# Add points
points(x=qchisq(0.025,10),y=0,pch=19,cex=1.5)
x = seq(qchisq(0.975,10),30,0.001)
y = dchisq(x,10)
polygon(c(qchisq(0.975,10),x,30),c(0,y,0),col="grey")
# Add points
points(x=qchisq(0.975,10),y=0,pch=19,cex=1.5)
# Add text
text(9.8,0.02,expression(1-alpha))
text(0.5,0.01,expression(alpha/2))
text(25,0.0075,expression(alpha/2))

7.5 IKEA diameter data

diameters = c(1.206,1.190,1.200,1.195,1.201,1.200,1.198,1.196,1.195,
1.202,1.203,1.210,1.206,1.193,1.207,1.201,1.199,1.200,1.199,
1.204,1.194,1.203,1.194,1.199,1.203,1.200,1.197,1.208,1.199,
1.205,1.199,1.204,1.202,1.196,1.211,1.204)
 
# R does not have an internal function to calculate CI for population variance
# I wrote this function to do it
var.interval = function(data,conf.level=0.95){
df = length(data)-1
chi.lower = qchisq((1-conf.level)/2,df)
chi.upper = qchisq((1+conf.level)/2,df)
s2 = var(data)
c(df*s2/chi.upper,df*s2/chi.lower)
}
 
# CI for population variance
var.interval(diameters)

## [1] 1.545590e-05 3.997717e-05

# CI for population standard deviation
sd.interval = sqrt(var.interval(diameters))
sd.interval

## [1] 0.003931399 0.006322751

# Create normal qqplot
# Need to install (then load) car package in R
qqPlot(diameters,distribution="norm",xlab="N(0,1) quantiles",ylab="Diameter data",pch=16)

7.6 N(0,1) pdf with quantiles and shading

x = seq(-4,4,0.001)
pdf = dnorm(x,0,1)
plot(x,pdf,type="l",lty=1,xlab="z",ylab="PDF",ylim=c(0,0.4))
abline(h=0)
x = seq(-4,qnorm(0.025,0,1),0.001)
y = dnorm(x,0,1)
polygon(c(-4,x,qnorm(0.025,0,1)),c(0,y,0),col="grey")
# Add points
points(x=qnorm(0.025,0,1),y=0,pch=19,cex=1.5)
x = seq(qnorm(0.975,0,1),4,0.001)
y = dnorm(x,0,1)
polygon(c(qnorm(0.975,0,1),x,4),c(0,y,0),col="grey")
# Add points
points(x=qnorm(0.975,0,1),y=0,pch=19,cex=1.5)
# Add text
text(-0.025,0.1,expression(1-alpha))
text(-2.8,0.04,expression(alpha/2))
text(2.8,0.04,expression(alpha/2))

8 Two-Sample Inference

8.1 two-normal distribution

y = seq(35,75,0.01)
curve(dnorm(x,50,3),35,75,type="l",lty=1,xlab="y",ylab="PDF",
     ylim=c(0,0.15),col="red",lwd=2)
curve(dnorm(x,60,3),35,75,lty=4,col="blue",lwd=2,add=TRUE)
abline(h=0)
# Add legend
legend("topright",lty = c(1,4),col=c("red","blue"),lwd=rep(2,2),
c(
expression(paste("Population 1")),
expression(paste("Population 2"))
))

8.2 t pdf with quantiles and shading

x = seq(-4,4,0.001)
pdf = dt(x,10)
plot(x,pdf,type="l",lty=1,xlab="t",ylab="PDF",ylim=c(0,0.4))
abline(h=0)
x = seq(-4,qt(0.025,10),0.001)
y = dt(x,10)
polygon(c(-4,x,qt(0.025,10)),c(0,y,0),col="grey")
# Add points
points(x=qt(0.025,10),y=0,pch=19,cex=1.5)
x = seq(qt(0.975,10),4,0.001)
y = dt(x,10)
polygon(c(qt(0.975,10),x,4),c(0,y,0),col="grey")
# Add points
points(x=qt(0.975,10),y=0,pch=19,cex=1.5)
# Add text
text(-0.025,0.1,expression(1-alpha))
text(-3.5,0.05,expression(alpha/2))
text(3.5,0.05,expression(alpha/2))

8.3 Fish weight data

loc.1 = c(21.9,18.5,12.3,16.7,21.0,15.1,18.2,23.0,36.8,26.6)
loc.2 = c(21.0,19.6,14.4,16.9,23.4,14.6,10.4,16.5)
# Create side by side boxplots
boxplot(loc.1,loc.2,xlab="",names=c("Location 1","Location 2"),ylab="Weight (in ounces)",ylim=c(0,40),col="grey")

# Calculate confidence interval
t.test(loc.1,loc.2,conf.level=0.90,var.equal=TRUE)$conf.int

## [1] -0.9404376  8.7604376
## attr(,"conf.level")
## [1] 0.9

# Page 116
# Create normal qqplots for each sample
# Need to install (then load) car package in R
op<-par(mfrow=c(1,2))
qqPlot(loc.1,distribution="norm",xlab="N(0,1) quantiles",ylab="Location 1",pch=16)
qqPlot(loc.2,distribution="norm",xlab="N(0,1) quantiles",ylab="Location 2",pch=16)

par(op)

8.4 White paper data

plant.1 = c(3.01,2.58,3.04,1.75,2.87,2.57,2.51,2.93,2.85,3.09,
1.43,3.36,3.18,2.74,2.25,1.95,3.68,2.29,1.86,2.63,
2.83,2.04,2.23,1.92,3.02)
plant.2 = c(3.99,2.08,3.66,1.53,4.27,4.31,2.62,4.52,3.80,5.30,
3.41,0.82,3.03,1.95,6.45,1.86,1.87,3.98,2.74,4.81)
# Create side by side boxplots
par(mfrow=c(1,1 ))
boxplot(plant.1,plant.2,xlab="",names=c("Plant 1","Plant 2"),ylab="Weight (in 100s lb)",ylim=c(0,8),col="grey")

# Calculate confidence interval
t.test(plant.1,plant.2,conf.level=0.95,var.equal=FALSE)$conf.int

## [1] -1.46179176 -0.06940824
## attr(,"conf.level")
## [1] 0.95

# Create normal qqplots for each sample
# Need to install (then load) car package in R
op<-par(mfrow=c(1,2))
qqPlot(plant.1,distribution="norm",xlab="N(0,1) quantiles",ylab="Plant 1",pch=16)
qqPlot(plant.2,distribution="norm",xlab="N(0,1) quantiles",ylab="Plant 2",pch=16)

par(op)

8.5 Ergonomics data: Matched pairs analysis

before = c(81.3,87.2,86.1,82.2,90.8,86.9,96.5,73.0,84.2,74.5,72.0,73.8,74.2,
74.9,75.8,72.6,80.8,66.5,72.2,56.5,82.4,88.8,80.0,91.1,97.5,70.0)
after = c(78.9,91.4,78.3,78.3,84.4,67.4,92.8,69.9,63.8,69.7,68.4,71.8,58.3,
58.3,62.5,70.2,58.7,66.6,60.7,65.0,73.7,80.4,78.8,81.8,91.6,74.2)
# Create data differences
diff = before-after
# Calculate confidence interval
t.test(diff,conf.level=0.95)$conf.int

## [1] 3.636034 9.894735
## attr(,"conf.level")
## [1] 0.95

# Create normal qqplots for each sample
# Need to install (then load) car package in R
par(mfrow=c(1,1))
qqPlot(diff,distribution="norm",xlab="N(0,1) quantiles",ylab="Data differences",pch=16)

8.6 F PDFs

curve(df(x,5,10),0,8,type="l",lty=1,xlab="F",ylab="PDF",
      ylim=c(0,0.85),col=1,lwd=2)
# Add other F pdfs
curve(df(x,5,20),0,8,lty=4,col=2,lwd=2,add=TRUE)
curve(df(x,10,20),0,8,lty=8,col=3,lwd=3,add=TRUE)
abline(h=0)
# Add legend
legend(4,0.5,lty = c(1,4,8),col=1:3,lwd=rep(2,3),
c(
expression(paste(nu[1], "=5, ", nu[2], "=10")),
expression(paste(nu[1], "=5, ", nu[2], "=20")),
expression(paste(nu[1], "=10, ", nu[2], "=20"))
))

8.7 F pdf with quantiles and shading

x = seq(0,6,0.001)
pdf = df(x,10,10)
plot(x,pdf,type="l",lty=1,xlab="F",ylab="PDF",ylim=c(0,0.8))
abline(h=0)
x = seq(0,qf(0.025,10,10),0.001)
y = df(x,10,10)
polygon(c(0,x,qf(0.025,10,10)),c(0,y,0),col="grey")
# Add points
points(x=qf(0.025,10,10),y=0,pch=19,cex=1.5)
x = seq(qf(0.975,10,10),6,0.001)
y = df(x,10,10)
polygon(c(qf(0.975,10,10),x,6),c(0,y,0),col="grey")
# Add points
points(x=qf(0.975,10,10),y=0,pch=19,cex=1.5)
# Add text
text(1.2,0.15,expression(1-alpha))
text(0.0,0.2,expression(alpha/2))
text(4.5,0.05,expression(alpha/2))

8.8 Paint fill volume data

process.1 = c(127.75,127.87,127.86,127.92,128.03,127.94,127.91,128.10,128.01,128.11,127.79,127.93,
127.89,127.96,127.80,127.94,128.02,127.82,128.11,127.92,127.74,127.78,127.85,127.96)
process.2 = c(127.90,127.90,127.74,127.93,127.62,127.76,127.63,127.93,127.86,127.73,127.82,127.84,
128.06,127.88,127.85,127.60,128.02,128.05,127.95,127.89,127.82,127.92,127.71,127.78)
# Create side by side boxplots
boxplot(process.1,process.2,xlab="",names=c("Process 1","Process 2"),
ylab="Fluid ounces",ylim=c(127.5,128.25),col="grey")

# R does not have an internal function to calculate CI for the ratio of population variances
# I wrote this function to do it
ratio.var.interval = function(data.1,data.2,conf.level=0.95){
df.1 = length(data.1)-1
df.2 = length(data.2)-1
F.lower = qf((1-conf.level)/2,df.1,df.2)
F.upper = qf((1+conf.level)/2,df.1,df.2)
s2.1 = var(data.1)
s2.2 = var(data.2)
c((s2.2/s2.1)*F.lower,(s2.2/s2.1)*F.upper)
}
 
# CI for ratio of population variances
ratio.var.interval(process.1,process.2)

## [1] 0.5885236 3.1448830

# Page 130
# Create normal qqplots for each sample
# Need to install (then load) car package in R
op<-par(mfrow=c(1,2))
qqPlot(process.1,distribution="norm",xlab="N(0,1) quantiles",ylab="Process 1",pch=16)
qqPlot(process.2,distribution="norm",xlab="N(0,1) quantiles",ylab="Process 2",pch=16)

par(op)

8.9 Irish HBV data

# R does not have an internal function to calculate CI for the difference of two population proportions
# I wrote this function to do it
proportion.diff.interval = function(y.1,n.1,y.2,n.2,conf.level=0.95){
z.upper = qnorm((1+conf.level)/2)
var.1 = (y.1/n.1)*(1-y.1/n.1)/n.1
var.2 = (y.2/n.2)*(1-y.2/n.2)/n.2
se = sqrt(var.1+var.2)
moe = z.upper*se
c((y.1/n.1-y.2/n.2)-moe,(y.1/n.1-y.2/n.2)+moe)
}
# CI for difference of two population proportions
proportion.diff.interval(18,82,28,555)

## [1] 0.07764115 0.26048234

9 One-way ANOVA

9.1 Mortar strength data

OCM = c(51.45,42.96,41.11,48.06,38.27,38.88,42.74,49.62)
PIM = c(64.97,64.21,57.39,52.79,64.87,53.27,51.24,55.87,61.76,67.15)
RM = c(48.95,62.41,52.11,60.45,58.07,52.16,61.71,61.06,57.63,56.80)
PCM = c(35.28,38.59,48.64,50.99,51.52,52.85,46.75,48.31)
# Create side-by-side boxplots
boxplot(OCM,PIM,RM,PCM,
xlab="",names=c("OCM","PIM","RM","PCM"),ylab="Strength (MPa)",col="grey")

9.2 Perform F test/Create ANOVA table

# Create response with all data
strength = c(OCM,PIM,RM,PCM)
# Create a treatment indicator variable
mortar.type = c(
rep("OCM",length(OCM)),
rep("PIM",length(PIM)),
rep("RM",length(RM)),
rep("PCM",length(PCM))
)
# mortar.type = factor(mortar.type)
mortar=data.frame(type=mortar.type,strength=strength)
# Analysis of variance
pander(anova(lm(strength ~ mortar.type)))

Analysis of Variance Table
	Df	Sum Sq	Mean Sq	F value	Pr(>F)
mortar.type	3	1521	507	16.85	9.576e-07
Residuals	32	962.9	30.09	NA	NA

9.3 Follow-up analysis

f = seq(0,18,0.01)
plot(f,df(f,3,32),type="l",lty=1,xlab="F",ylab="PDF",ylim=c(0,0.75))
abline(h=0)
# Add points
points(x=16.848,y=0,pch=4,cex=1.5)

# Tukey confidence intervals
TukeyHSD(aov(lm(strength ~ mortar.type)),conf.level=0.95)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = lm(strength ~ mortar.type))
## 
## $mortar.type
##             diff       lwr       upr     p adj
## PCM-OCM  2.48000 -4.950955  9.910955 0.8026758
## PIM-OCM 15.21575  8.166127 22.265373 0.0000097
## RM-OCM  12.99875  5.949127 20.048373 0.0001138
## PIM-PCM 12.73575  5.686127 19.785373 0.0001522
## RM-PCM  10.51875  3.469127 17.568373 0.0016850
## RM-PIM  -2.21700 -8.863448  4.429448 0.8029266

10 Simple Linear Regression

10.1 sewage example

filtration.rate=c(125.3,98.2,201.4,147.3,145.9,124.7,112.2,120.2,161.2,178.9,
159.5,145.8,75.1,151.4,144.2,125.0,198.8,132.5,159.6,110.7)
moisture=c(77.9,76.8,81.5,79.8,78.2,78.3,77.5,77.0,80.1,80.2,
79.9,79.0,76.7,78.2,79.5,78.1,81.5,77.0,79.0,78.6)

# Fit the model
fit = lm(moisture~filtration.rate)
# Construct scatterplot (with least squares line superimposed)
plot(filtration.rate,moisture,xlab="Filtration rate (kg-DS/m/hr)",
ylab="Moisture (Percentage)",pch=16)
abline(fit)

# Confidence interval for beta0 and beta1
confint(fit,level=0.95)

##                       2.5 %      97.5 %
## (Intercept)     71.49309400 74.42399995
## filtration.rate  0.03087207  0.05119547

# Regression output for inference
summary(fit)

## 
## Call:
## lm(formula = moisture ~ filtration.rate)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.39552 -0.27694  0.03548  0.42913  1.09901 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     72.958547   0.697528 104.596  < 2e-16 ***
## filtration.rate  0.041034   0.004837   8.484 1.05e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6653 on 18 degrees of freedom
## Multiple R-squared:  0.7999, Adjusted R-squared:  0.7888 
## F-statistic: 71.97 on 1 and 18 DF,  p-value: 1.052e-07

# Calculate confidence/prediction intervals
predict(fit,data.frame(filtration.rate=150),level=0.95,interval="confidence")

##        fit      lwr      upr
## 1 79.11361 78.78765 79.43958

predict(fit,data.frame(filtration.rate=150),level=0.95,interval="prediction")

##        fit     lwr      upr
## 1 79.11361 77.6783 80.54893

curve(dt(x,18),-9,9,xlab="t",ylab="PDF",ylim=c(0,0.4),lwd=1.5)
abline(h=0)
points(x=8.484,y=0,pch=4,cex=1.5)

10.2 Confidence/Prediction bands

fit = lm(moisture~filtration.rate)
x.not = seq(80,200,1)
conf = predict(fit,data.frame(filtration.rate=x.not),level=0.95,interval="confidence")
pred = predict(fit,data.frame(filtration.rate=x.not),level=0.95,interval="prediction")

plot(filtration.rate,moisture,xlab = "Filtration rate (kg-DS/m/hr)",
ylab = "Moisture (Percentage)",pch=16,
xlim=c(min(filtration.rate),max(filtration.rate)),
ylim=c(min(moisture),max(moisture)))
abline(fit)
par(new=T)
plot(x.not,conf[,2],type="l",lty=2,xlab="",ylab="",col="red",
xlim=c(min(filtration.rate),max(filtration.rate)),
ylim=c(min(moisture),max(moisture)))
par(new=T)
plot(x.not,conf[,3],type="l",lty=2,xlab="",ylab="",col="red",
xlim=c(min(filtration.rate),max(filtration.rate)),
ylim=c(min(moisture),max(moisture)))
par(new=T)
plot(x.not,pred[,2],type="l",lty=4,xlab="",ylab="",col="blue",
xlim=c(min(filtration.rate),max(filtration.rate)),
ylim=c(min(moisture),max(moisture)))
par(new=T)
plot(x.not,pred[,3],type="l",lty=4,xlab="",ylab="",col="blue",
xlim=c(min(filtration.rate),max(filtration.rate)),
ylim=c(min(moisture),max(moisture)))
# Create legend
# You MUST "click" where you want it to go (on figure)
names = c("95% confidence","95% prediction")
legend("topleft",legend=names,lty=c(2,4),bty="n",col=c("red","blue"))

STAT 509 R Tutorial

Chong Ma

2017-04-20