## Section 5.5 examples

# Pairwise two-point slopes (Y_j - Y_i)/(X_j - X_i) over all pairs i < j,
# sorted in increasing order.  Pairs with tied x-values are dropped, since
# their slope is undefined (Inf for distinct y, NaN for tied y and on the
# diagonal); is.finite() removes both cases cleanly.  This replaces the old
# trick of indexing every second element of the doubled slope vector, which
# read past the end of the vector and only worked because sort() silently
# drops the resulting NAs.
pairwise.slopes <- function(x, y) {
  slope.matrix <- outer(y, y, '-') / outer(x, x, '-')
  # lower triangle = each unordered pair exactly once (i < j)
  slopes <- slope.matrix[lower.tri(slope.matrix)]
  sort(slopes[is.finite(slopes)])
}

## MBA Example:

# GMAT = X
gmat <- c(710, 610, 640, 580, 545, 560, 610, 530, 560, 540, 570, 560)
# GPA = Y
gpa <- c(4.0, 4.0, 3.9, 3.8, 3.7, 3.6, 3.5, 3.5, 3.5, 3.3, 3.2, 3.2)

# Plot of data and least-squares line:
plot(gmat, gpa)
abline(lm(gpa ~ gmat))
# Estimated intercept and slope:
lm(gpa ~ gmat)

## Testing H0: beta = 0.01
# Under H0 the residuals U = Y - 0.01*X carry no trend in X, so we test
# for zero Spearman correlation between X and U.  (gmat has ties, so R
# will warn that the exact p-value cannot be computed and fall back.)
U <- gpa - 0.01 * gmat
cor.test(gmat, U, alternative = "two.sided", method = "spearman",
         exact = TRUE)

#### Distribution-free CI for slope:
ordered.slopes <- pairwise.slopes(gmat, gpa)
N <- length(ordered.slopes)   # 62: 66 pairs minus 4 with tied gmat values
# Since w_0.975 = 28 for n = 12:
r <- floor(0.5 * (N - 28))
s <- N + 1 - r   # already an integer; no ceiling() needed
print(c(ordered.slopes[r], ordered.slopes[s]))

### Drug dosage example
dose <- c(1, 2, 3, 4, 5, 6, 7)
lipid <- c(2.5, 3.1, 3.4, 4.0, 4.6, 11.1, 5.1)
plot(dose, lipid)
ls.est <- lm(lipid ~ dose)
abline(ls.est, col = 'blue')

# Theil estimator: slope = median of the pairwise slopes; intercept chosen
# so the fitted line passes through (median(x), median(y)).
theil.est.b <- median(pairwise.slopes(dose, lipid))
theil.est.a <- median(lipid) - theil.est.b * median(dose)
abline(theil.est.a, theil.est.b, col = 'red')
# Which method is more robust against outliers?
################################
## Nonparametric Regression
################################
##
## The Nadaraya-Watson kernel smoothing estimator
## on the "automobiles" data set:
##
horsepower <- c(49, 55, 55, 70, 53, 70, 55, 62, 62, 80,
                73, 92, 92, 73, 66, 73, 78, 92, 78, 90,
                92, 74, 95, 81, 95, 92, 92, 92, 90, 52,
                103, 84, 84, 102, 102, 81, 90, 90, 102, 102,
                130, 95, 95, 102, 95, 93, 100, 100, 98, 130,
                115, 115, 115, 115, 180, 160, 130, 96, 115, 100,
                100, 145, 120, 140, 140, 150, 165, 165, 165, 165,
                245, 280, 162, 162, 140, 140, 175, 322, 238, 263,
                295, 236)
mileage <- c(65.4, 56, 55.9, 49, 46.5, 46.2, 45.4, 59.2, 53.3, 43.4,
             41.1, 40.9, 40.9, 40.4, 39.6, 39.3, 38.9, 38.8, 38.2, 42.2,
             40.9, 40.7, 40, 39.3, 38.8, 38.4, 38.4, 38.4, 29.5, 46.9,
             36.3, 36.1, 36.1, 35.4, 35.3, 35.1, 35.1, 35.0, 33.2, 32.9,
             32.3, 32.2, 32.2, 32.2, 32.2, 31.5, 31.5, 31.4, 31.4, 31.2,
             33.7, 32.6, 31.3, 31.3, 30.4, 28.9, 28, 28, 28, 28,
             28, 27.7, 25.6, 25.3, 23.9, 23.6, 23.6, 23.6, 23.6, 23.6,
             23.5, 23.4, 23.4, 23.1, 22.9, 22.9, 19.5, 18.1, 17.2, 17,
             16.7, 13.2)

# Helper: scatterplot of the data overlaid with one kernel-smoothed fit.
plot.with.smooth <- function(kern, bw, title) {
  plot(horsepower, mileage, main = title)
  lines(ksmooth(horsepower, mileage, kern, bandwidth = bw))
}

## Using normal kernel:
# A too-small bandwidth:
plot.with.smooth("normal", 25, "bandwidth=25")
# A too-large bandwidth:
plot.with.smooth("normal", 75, "bandwidth=75")
# A reasonable bandwidth:
plot.with.smooth("normal", 60, "bandwidth=60")
## We see some problems with biased estimation at the edge of the
## scatterplot.

## Using uniform kernel:
# A reasonable bandwidth:
plot.with.smooth("box", 60, "uniform kernel, bandwidth=60")
# Not the best choice of kernel.