# STAT 720 sp 2019 Lec 11 supplement: bootstrapping

###############################################################
###############################################################
# IID bootstrap: Consider the pivot 
# T_n = \sqrt{n}(\bar X_n - \mu)/S_n 
# use IID bootstrap to construc a CI for \mu.
###############################################################
###############################################################

# generate a random sample from a non-Normal distribution:
n <- 15
nu <- 2
x <- rchisq(n,nu) # has expected value equal to nu
x.bar <- mean(x)
sigma.hat <- sd(x)

# run Monte-Carlo simulation to get approximate quantiles of 
# distribution of T_n^*
B <- 1000
T.star <- numeric()
for( b in 1:B)
{
	
	x.star <- sample(x,n,replace=TRUE)
	sigma.hat.star <- sd(x.star)	
	x.bar.star <- mean(x.star)
	
	T.star[b] <- sqrt(n)*(x.bar.star - x.bar)/sigma.hat.star

}

# get Monte-Carlo-approximated alpha/2 and 1-alpha/2 quantiles
alpha <- 0.05
G.alphaby2 <- sort(T.star)[floor(B*(1-alpha/2))]
G.1minusalphaby2 <- sort(T.star)[floor(B*(alpha/2))]

# construct (1-alpha)*100% CI for the mean using the bootstrap
# estimates of the quantiles; record whether the interval
# contains the true mean.
lo.ci <- x.bar - G.alphaby2 * sigma.hat / sqrt(n)
up.ci <- x.bar - G.1minusalphaby2 * sigma.hat / sqrt(n)
covered <- (lo.ci < nu) & (up.ci > nu)

lo.ci
up.ci
covered

###############################################################
# run simulation to get coverage probability:
###############################################################

# specify parameters
n <- 15
nu <- 2

alpha <- 0.05
S <- 100
B <- 1000

# use the bootstrap to build a (1-alpha)*100% CI for the mean
# on S randomly generated datasets
lo.ci <- up.ci <- lo.ci.t <- up.ci.t <- numeric()
boot.covered <- t.covered <- logical(S)
for( s in 1:S )
{	
	
	# generate data
	x <- rchisq(n,nu)
	x.bar <- mean(x)
	sigma.hat <- sd(x)
	
	# run Monte-Carlo simulation to get bootstrap distribution of 
	# the pivot quantity
	T.star <- numeric(B)
	for( b in 1:B )
	{
		
		x.star <- sample(x,n,replace=TRUE)
		sigma.hat.star <- sd(x.star)	
		x.bar.star <- mean(x.star)
		
		T.star[b] <- sqrt(n)*(x.bar.star - x.bar)/sigma.hat.star
	
	}
	
	# extract desired quantiles and construct (1-alpha)*100% CI for the mean
	G.alphaby2 <- sort(T.star)[floor(B*(1-alpha/2))]
	G.1minusalphaby2 <- sort(T.star)[floor(B*(alpha/2))]
	
	lo.ci[s] <- x.bar - G.alphaby2 * sigma.hat / sqrt(n)
	up.ci[s] <- x.bar - G.1minusalphaby2 * sigma.hat / sqrt(n)
	
	
	# for comparison construct t-based (1-alpha)*100% CI for the mean
	lo.ci.t[s] <- x.bar - qt(1-alpha/2,n-1) * sigma.hat / sqrt(n)
	up.ci.t[s] <- x.bar + qt(1-alpha/2,n-1) * sigma.hat / sqrt(n)
	
	# record whether the intervals covered the true mean
	boot.covered[s] <- (lo.ci[s] < nu) & (up.ci[s] > nu)
	t.covered[s] <- (lo.ci.t[s] < nu) & (up.ci.t[s] > nu)
	
}

# plot the confidence intervals and display the coverage proportion over the 
# simulated data sets
par(mfrow=c(1,2))

# the bootstrap intervals:
plot(NA,main = paste("IID bootstrap CI coverage (",mean(boot.covered),")",sep=""),
		ylim=c(0,1),
		xlim = range(lo.ci, up.ci),
		xlab = "Confidence intervals",
		yaxt = "n",
		ylab= "")
abline(v = nu,lty=3)
for(s in 1:S)
{
	lines(y = c(s/S,s/S),x = c(lo.ci[s], up.ci[s]),lty = 2 - boot.covered[s])
}

# the t-based intervals:
plot(NA,main = paste("t-based CI coverage (",mean(t.covered),")",sep=""),
		ylim=c(0,1),
		xlim = range(lo.ci.t, up.ci.t),
		xlab = "Confidence intervals",
		yaxt = "n",
		ylab= "")
abline(v = nu,lty=3)
for(s in 1:S)
{
	lines(y = c(s/S,s/S), x = c(lo.ci.t[s], up.ci.t[s]), lty = 2 - t.covered[s])
}

###############################################################
###############################################################
# Moving-block bootstrap for time series: Consider the pivot 
# T_n = sqrt{n}(\bar X_n - \mu) and use MBB to build confidence 
# intervals for \mu.  In addition, estimate Var(T_n) with MBB.
###############################################################
###############################################################

library(tscourse)

# generate some data
phi <- c(.3,.2)
theta <- c(.4)
n <- 300
sigma <- 2
X <- get.ARMA.data(phi,theta,sigma,n) # true mean is equal to zero.
X.bar <- mean(X)

# choose block size l and build matrix with n- l + 1 rows 
# such that each row is a block; we will resample the rows
# of this matrix
l <- floor(sqrt(n))
X.blocks <- matrix(NA,n-l+1,l)
for( j in 1:(n-l+1))
{
	
	X.blocks[j,] <- X[j:(j+l-1)]
	
}

# compute the mean of the MBB-induced distribution:
X.bar.MBB <- mean(X.blocks)

# compute MBB bootstrap estimator of var.T analytically
accum <- 0
for( j in 1:(n-l+1) )
	for(k in 1:l)
		for(m in 1:l)
		{
			
			accum <- accum + (X.blocks[j,k] - X.bar.MBB)*(X.blocks[j,m] - X.bar.MBB)
			
		}
		
var.T.hat.analytic <- accum / (l*(n-l+1))

# run Monte-Carlo simulation to get approximate quantiles of 
# distribution of T_n^*
B <- 500
T.star <- numeric(B)
for( b in 1:B)
{
	n.blocks <- floor(n/l) + 1 # choose an extra block if n/l is non-integer
	which.blocks <- sample(1:(n-l+1),n.blocks,replace=TRUE)
	X.star.mat <- X.blocks[which.blocks,]
	X.star <- as.vector(t(X.star.mat))[1:n] # truncate to length n
	
	X.bar.star <- mean(X.star)
	T.star[b] <- sqrt(n)*( X.bar.star - X.bar.MBB )
}

# compute MC version of Var(T_n) in order to compare it
# with the analytical one
var.T.hat.MC <- var(T.star)

var.T.hat.MC
var.T.hat.analytic

# get Monte-Carlo-approximated alpha/2 and 1-alpha/2 quantiles
alpha <- 0.05
G.alphaby2 <- sort(T.star)[floor(B*(1-alpha/2))]
G.1minusalphaby2 <- sort(T.star)[floor(B*(alpha/2))]
	
# construct (1-alpha)*100% CI for the mean using the MBB
# estimates of the quantiles; record whether the interval
# contains the true mean.
lo.ci <- X.bar - G.alphaby2 / sqrt(n)
up.ci <- X.bar - G.1minusalphaby2 / sqrt(n)
covered <- (lo.ci < 0) & (up.ci > 0)

lo.ci
up.ci
covered

###############################################################
# run simulation to get coverage probability:
###############################################################
rm(list=ls())

# specify parameters:
phi <- c(.3,.2)
theta <- c(.4)
n <- 300
sigma <- 2

alpha <- 0.05
S <- 100
B <- 1000
l <- floor(sqrt(n))

# use the bootstrap to build a (1-alpha)*100% CI for the mean
# and to estimate Var(T_n) on S randomly generated datasets
lo.ci <- up.ci <- var.T.hat.analytic <- var.T.hat.MC <- numeric()
covered <- logical()
for(s in 1:S)
{
	# generate data; compute sample mean.
	X <- get.ARMA.data(phi,theta,sigma,n)
	X.bar <- mean(X)
	
	# build matrix with n- l + 1 rows such that each row is a block
	X.blocks <- matrix(NA,n-l+1,l)
	for( j in 1:(n-l+1))
	{
		
		X.blocks[j,] <- X[j:(j+l-1)]
		
	}
	
	# compute the mean of the MBB-induced distribution
	X.bar.MBB <- mean(X.blocks)
	
	# compute MBB bootstrap estimator of var.T analytically
	accum <- 0
	for( j in 1:(n-l+1) )
		for(k in 1:l)
			for(m in 1:l)
			{
				
				accum <- accum + (X.blocks[j,k] - X.bar.MBB)*(X.blocks[j,m] - X.bar.MBB)
				
			}
	var.T.hat.analytic[s] <- accum / (l*(n-l+1))
	
	# run Monte-Carlo simulation to get approximate quantiles of 
	# distribution of T_n^*
	T.star <- numeric(B)
	for( b in 1:B)
	{
		n.blocks <- floor(n/l) + 1 # choose an extra block if n/l is non-integer
		which.blocks <- sample(1:(n-l+1),n.blocks,replace=TRUE)
		X.star.mat <- X.blocks[which.blocks,]
		X.star <- as.vector(t(X.star.mat))[1:n] # truncate to length n
		
		X.bar.star <- mean(X.star)
		T.star[b] <- sqrt(n)*( X.bar.star - X.bar.MBB )
	}
	
	# compute MC version of Var(T_n) in order to compare it
	# with the analytical one
	var.T.hat.MC[s] <- var(T.star)
	
	# get Monte-Carlo-approximated alpha/2 and 1-alpha/2 quantiles
	G.alphaby2 <- sort(T.star)[floor(B*(1-alpha/2))]
	G.1minusalphaby2 <- sort(T.star)[floor(B*(alpha/2))]	

	# construct (1-alpha)*100% CI for the mean using the MBB
	# estimates of the quantiles; record whether the interval
	# contains the true mean.
	lo.ci[s] <- X.bar - G.alphaby2 / sqrt(n)
	up.ci[s] <- X.bar - G.1minusalphaby2 / sqrt(n)
	covered[s] <- (lo.ci[s] < 0) & (up.ci[s] > 0)

}

# plot the confidence intervals and display the coverage proportion over the 
# simulated data sets
par(mfrow=c(1,1))

plot(NA,main = paste("Moving-blocks bootstrap CI coverage (",mean(covered),")",sep=""),
		ylim=c(0,1),
		xlim = range(lo.ci, up.ci),
		xlab = "Confidence intervals",
		yaxt = "n",
		ylab= "")
abline(v = 0,lty=3)
for(s in 1:S)
{
	lines(y = c(s/S,s/S),x = c(lo.ci[s], up.ci[s]),lty = 2 - covered[s])
}

# compute the true value of Var(T_n) and check performance of 
# the MBB estimator
gamma <- ARMAacvf(phi,theta,sigma,max.lag=l)
var.T <- sum(gamma,gamma[-1])
var.T
E.var.T.hat.analytic  <- mean(var.T.hat.analytic)

# plot histogram of the MBB estimates of Var(T_n) with vertical lines
# at the true value and the mean of the estimates.  This gives us a sense of 
# the bias.
hist(var.T.hat.analytic)
abline(v=var.T,lty=3)
abline(v=E.var.T.hat.analytic,lty=2)