# R code to analyze the simulated (X,Y) data
# using nonparametric kernel regression 


# Save the data file into a directory and 
# use the full path name:

simul101.data <- read.table(file = "http://people.stat.sc.edu/Hitchcock/simulated101datapairs.txt", 
header=FALSE, col.names = c('x', 'y'))

# attaching the data frame:

attach(simul101.data)

# First look at a scatter plot of Y against X:

plot(x, y)

# There appears to be a relationship between Y and X, but it is
# not a conventional functional relationship

########## Kernel Regression Estimation ######################


# The ksmooth() function in R offers the 
# Nadaraya-Watson kernel regression estimator
# with the choice of "normal" or "box" (uniform) kernels.

# Using the kernel regression estimator 
# with a relatively large bandwidth:

nonpar.reg.largebw <- ksmooth(x, y, kernel="normal", bandwidth=0.8)

# Plotting the estimated curve on top of the data:

plot(x, y); lines(nonpar.reg.largebw)

# Using the kernel regression estimator 
# with a relatively small bandwidth:

nonpar.reg.smallbw <- ksmooth(x, y, kernel="normal", bandwidth=0.05)

# Plotting the estimated curve on top of the data:

plot(x, y); lines(nonpar.reg.smallbw)

# Using the kernel regression estimator 
# with the default bandwidth of 0.5:

nonpar.reg.default <- ksmooth(x, y, kernel="normal")

# Plotting the estimated curve on top of the data:

plot(x, y); lines(nonpar.reg.default)

# The default bandwidth seems a bit too large here:
# Let's use a bandwidth of 0.2:

nonpar.reg <- ksmooth(x, y, kernel="normal", bandwidth=0.2)

# Plotting the estimated curve on top of the data:

plot(x, y); lines(nonpar.reg)

# For comparision, the true mean function these data were simulated 
# from is shown in red here:

lines(nonpar.reg$x, (sin(2*pi*(nonpar.reg$x^3)))^3, col="red")

# You can alter the bandwidth and see if another 
# bandwidth would have done better.


# Note:  Various automated bandwidth selection functions are available 
# in packages such as "ks" that you can download from CRAN.  Type:
help.search("bandwidth")
# for details.

########## Kernel Regression Estimation on a Real Data Set ######################


# Save the data file into a directory and 
# use the full path name:

# Using the Old Faithful geyser data set (actually this is a data set 
# built into R; type: help(faithful) for details about it.)

oldfaithful.data <- read.table(file = "http://people.stat.sc.edu/Hitchcock/oldfaithfuldata.txt", 
header=FALSE, col.names = c('obsno', 'eruption.length', 'waiting.time'))

# The X variable here is the length of time (in minutes) it takes for the geyser to erupt.
# The Y variable is the waiting time until the next eruption.
# It is believed that the waiting time depends on the eruption's length of time.

# attaching the data frame:

attach(oldfaithful.data)

# First look at a scatter plot of waiting.time against eruption.length:

plot(eruption.length, waiting.time)

# The relationship between the variables seems somewhat linear, but is that really the best fit?

# Using the kernel regression estimator 
# with the default bandwidth of 0.5:

oldfaithful.reg.default <- ksmooth(eruption.length, waiting.time, kernel="normal")

# Plotting the estimated curve on top of the data:

plot(eruption.length, waiting.time); lines(oldfaithful.reg.default)

# A smaller bandwidth produces an implausibly wiggly curve:

oldfaithful.reg.smallbw <- ksmooth(eruption.length, waiting.time, kernel="normal", bandwidth=0.1); 
plot(eruption.length, waiting.time); lines(oldfaithful.reg.smallbw)


# A larger bandwidth produces an overly smooth (?) curve:

oldfaithful.reg.largebw <- ksmooth(eruption.length, waiting.time, kernel="normal", bandwidth=2); 
plot(eruption.length, waiting.time); lines(oldfaithful.reg.largebw)

##########################################################################################

### The 'loess' method is similar to kernel smoothing, and can be more useful for making predictions:

# span (between 0 and 1) plays a similar role as the bandwidth in kernel regression:
of.lo <- loess(waiting.time ~ eruption.length, data=oldfaithful.data)  # span=0.75 is the default 'span'
plot(of.lo);lines(sort(of.lo$x),sort(of.lo$fit))

predict(of.lo, data.frame(eruption.length = c(2.0, 3.0, 4.0)), se = TRUE)
#predicting Y for three different values of X (standard errors of prediction are included)

# A more wiggly fit:

of.lo.w <- loess(waiting.time ~ eruption.length, data=oldfaithful.data, span=0.3)  
plot(of.lo.w);lines(sort(of.lo.w$x),sort(of.lo.w$fit))