# Using R to plot an empirical distribution function (edf) for a set of data,
# followed by a bootstrap example for the 85th percentile.
#
# NOTE: this script draws random numbers (rnorm, sample); call set.seed()
# before running it if reproducible output is needed.

# Example from class:
female.heights <- c(69.3, 66.3, 62.6, 62.9, 67.4)

# Three different styles of edf plot:
plot(ecdf(female.heights))
plot(ecdf(female.heights), verticals = TRUE)
plot(ecdf(female.heights), verticals = TRUE, do.points = FALSE)

# An example with a simulated data set with LOTS of observations:
simul.height.data <- rnorm(n = 500, mean = 65, sd = 2.5)

plot(ecdf(simul.height.data))
plot(ecdf(simul.height.data), verticals = TRUE)
plot(ecdf(simul.height.data), verticals = TRUE, do.points = FALSE)

# Note how close the estimator of the cdf comes to the true cdf when the
# sample size is very large:
gridpts <- seq(55, 75, by = 0.1)
# True cdf superimposed in red on the last edf plot.
lines(gridpts, pnorm(gridpts, mean = 65, sd = 2.5), col = "red")

### Bootstrap example:

# Calculate the 85th percentile of a sample vector.
#
# input.vec: numeric vector of observations.
# Returns:   the value of quantile(input.vec, probs = 0.85), i.e. a length-one
#            numeric carrying the name "85%".
perc85 <- function(input.vec) {
  # `probs` is spelled out in full rather than relying on partial argument
  # matching of `prob`.
  quantile(input.vec, probs = 0.85)
}

bmi.samp <- c(
  21.8, 36.6, 22.0, 24.4, 22.2, 20.0, 19.2, 21.6, 27.2, 28.9,
  19.4, 28.1, 18.6, 26.6, 20.6, 26.7, 26.5, 25.3, 29.6, 24.7
)

my.n <- length(bmi.samp)

# Defining the number of resamples:
my.m <- 1000

# Setting up the matrix to hold bootstrap-sample values: every one of the
# my.m rows is a copy of the original sample.
setup.data.matrix <- matrix(bmi.samp, nrow = my.m, ncol = my.n, byrow = TRUE)

# Carrying out the sampling (with replacement), one resample per row.
# apply() returns each row's result as a COLUMN, giving a my.n x my.m matrix.
bootstrap.data.matrix <- apply(
  setup.data.matrix, 1, sample,
  size = my.n, replace = TRUE
)

# Transposing to get back to the same dimensions as setup.data.matrix
# (one bootstrap sample per row).
bootstrap.data.matrix <- t(bootstrap.data.matrix)

# Calculating the 85th percentile for each of the bootstrap samples:
my.85.percs <- apply(bootstrap.data.matrix, 1, perc85)

# Bootstrap standard error of this statistic:
sd(my.85.percs)

# 95% bootstrap (percentile) interval estimate for the population
# 85th percentile:
lower.upper.CI <- quantile(my.85.percs, probs = c(0.025, 0.975))

# paste() is vectorised, so pasting the label directly onto the two endpoints
# would yield two separate strings; collapse the endpoints first so that a
# single message is printed.
ci.message <- paste(
  "95% bootstrap interval for 85th percentile: ",
  paste(round(lower.upper.CI, 2), collapse = ", ")
)
print(ci.message)