## Section 5.5 examples

# Pairwise two-point slopes (Y_j - Y_i)/(X_j - X_i) over all pairs i < j,
# sorted in increasing order.  Pairs with tied x-values are dropped, since
# their slope is undefined (Inf for distinct y, NaN for tied y and on the
# diagonal); is.finite() removes both cases cleanly.  This replaces the old
# trick of indexing every second element of the doubled slope vector, which
# read past the end of the vector and only worked because sort() silently
# drops the resulting NAs.
pairwise.slopes <- function(x, y) {
  slope.matrix <- outer(y, y, '-') / outer(x, x, '-')
  # lower triangle = each unordered pair exactly once (i < j)
  slopes <- slope.matrix[lower.tri(slope.matrix)]
  sort(slopes[is.finite(slopes)])
}

## MBA Example:

# GMAT = X
gmat <- c(710, 610, 640, 580, 545, 560, 610, 530, 560, 540, 570, 560)
# GPA = Y
gpa <- c(4.0, 4.0, 3.9, 3.8, 3.7, 3.6, 3.5, 3.5, 3.5, 3.3, 3.2, 3.2)

# Plot of data and least-squares line:
plot(gmat, gpa)
abline(lm(gpa ~ gmat))
# Estimated intercept and slope:
lm(gpa ~ gmat)

## Testing H0: beta = 0.01
# Under H0 the residuals U = Y - 0.01*X carry no trend in X, so we test
# for zero Spearman correlation between X and U.  (gmat has ties, so R
# will warn that the exact p-value cannot be computed and fall back.)
U <- gpa - 0.01 * gmat
cor.test(gmat, U, alternative = "two.sided", method = "spearman",
         exact = TRUE)

#### Distribution-free CI for slope:
ordered.slopes <- pairwise.slopes(gmat, gpa)
N <- length(ordered.slopes)   # 62: 66 pairs minus 4 with tied gmat values
# Since w_0.975 = 28 for n = 12:
r <- floor(0.5 * (N - 28))
s <- N + 1 - r   # already an integer; no ceiling() needed
print(c(ordered.slopes[r], ordered.slopes[s]))

### Drug dosage example
dose <- c(1, 2, 3, 4, 5, 6, 7)
lipid <- c(2.5, 3.1, 3.4, 4.0, 4.6, 11.1, 5.1)
plot(dose, lipid)
ls.est <- lm(lipid ~ dose)
abline(ls.est, col = 'blue')

# Theil estimator: slope = median of the pairwise slopes; intercept chosen
# so the fitted line passes through (median(x), median(y)).
theil.est.b <- median(pairwise.slopes(dose, lipid))
theil.est.a <- median(lipid) - theil.est.b * median(dose)
abline(theil.est.a, theil.est.b, col = 'red')
# Which method is more robust against outliers?
################################
## Nonparametric Regression
################################
##
## The Nadaraya-Watson kernel smoothing estimator
## on the "automobiles" data set:
##
horsepower <- c(49, 55, 55, 70, 53, 70, 55, 62, 62, 80,
                73, 92, 92, 73, 66, 73, 78, 92, 78, 90,
                92, 74, 95, 81, 95, 92, 92, 92, 90, 52,
                103, 84, 84, 102, 102, 81, 90, 90, 102, 102,
                130, 95, 95, 102, 95, 93, 100, 100, 98, 130,
                115, 115, 115, 115, 180, 160, 130, 96, 115, 100,
                100, 145, 120, 140, 140, 150, 165, 165, 165, 165,
                245, 280, 162, 162, 140, 140, 175, 322, 238, 263,
                295, 236)
mileage <- c(65.4, 56, 55.9, 49, 46.5, 46.2, 45.4, 59.2, 53.3, 43.4,
             41.1, 40.9, 40.9, 40.4, 39.6, 39.3, 38.9, 38.8, 38.2, 42.2,
             40.9, 40.7, 40, 39.3, 38.8, 38.4, 38.4, 38.4, 29.5, 46.9,
             36.3, 36.1, 36.1, 35.4, 35.3, 35.1, 35.1, 35.0, 33.2, 32.9,
             32.3, 32.2, 32.2, 32.2, 32.2, 31.5, 31.5, 31.4, 31.4, 31.2,
             33.7, 32.6, 31.3, 31.3, 30.4, 28.9, 28, 28, 28, 28,
             28, 27.7, 25.6, 25.3, 23.9, 23.6, 23.6, 23.6, 23.6, 23.6,
             23.5, 23.4, 23.4, 23.1, 22.9, 22.9, 19.5, 18.1, 17.2, 17,
             16.7, 13.2)

# Helper: scatterplot of the data overlaid with one kernel-smoothed fit.
plot.with.smooth <- function(kern, bw, title) {
  plot(horsepower, mileage, main = title)
  lines(ksmooth(horsepower, mileage, kern, bandwidth = bw))
}

## Using normal kernel:
# A too-small bandwidth:
plot.with.smooth("normal", 25, "bandwidth=25")
# A too-large bandwidth:
plot.with.smooth("normal", 75, "bandwidth=75")
# A reasonable bandwidth:
plot.with.smooth("normal", 60, "bandwidth=60")
## We see some problems with biased estimation at the edge of the
## scatterplot.

## Using uniform kernel:
# A reasonable bandwidth:
plot.with.smooth("box", 60, "uniform kernel, bandwidth=60")
# Not the best choice of kernel.