# R code to analyze the simulated (X,Y) data # using nonparametric kernel regression # Save the data file into a directory and # use the full path name: simul101.data <- read.table(file = "http://people.stat.sc.edu/Hitchcock/simulated101datapairs.txt", header=FALSE, col.names = c('x', 'y')) # attaching the data frame: attach(simul101.data) # First look at a scatter plot of Y against X: plot(x, y) # There appears to be a relationship between Y and X, but it is # not a conventional functional relationship ########## Kernel Regression Estimation ###################### # The ksmooth() function in R offers the # Nadaraya-Watson kernel regression estimator # with the choice of "normal" or "box" (uniform) kernels. # Using the kernel regression estimator # with a relatively large bandwidth: nonpar.reg.largebw <- ksmooth(x, y, kernel="normal", bandwidth=0.8) # Plotting the estimated curve on top of the data: plot(x, y); lines(nonpar.reg.largebw) # Using the kernel regression estimator # with a relatively small bandwidth: nonpar.reg.smallbw <- ksmooth(x, y, kernel="normal", bandwidth=0.05) # Plotting the estimated curve on top of the data: plot(x, y); lines(nonpar.reg.smallbw) # Using the kernel regression estimator # with the default bandwidth of 0.5: nonpar.reg.default <- ksmooth(x, y, kernel="normal") # Plotting the estimated curve on top of the data: plot(x, y); lines(nonpar.reg.default) # The default bandwidth seems a bit too large here: # Let's use a bandwidth of 0.2: nonpar.reg <- ksmooth(x, y, kernel="normal", bandwidth=0.2) # Plotting the estimated curve on top of the data: plot(x, y); lines(nonpar.reg) # For comparision, the true mean function these data were simulated # from is shown in red here: lines(nonpar.reg$x, (sin(2*pi*(nonpar.reg$x^3)))^3, col="red") # You can alter the bandwidth and see if another # bandwidth would have done better. # Note: Various automated bandwidth selection functions are available # in packages such as "ks" that you can download from CRAN. Type: help.search("bandwidth") # for details. ########## Kernel Regression Estimation on a Real Data Set ###################### # Save the data file into a directory and # use the full path name: # Using the Old Faithful geyser data set (actually this is a data set # built into R; type: help(faithful) for details about it.) oldfaithful.data <- read.table(file = "http://people.stat.sc.edu/Hitchcock/oldfaithfuldata.txt", header=FALSE, col.names = c('obsno', 'eruption.length', 'waiting.time')) # The X variable here is the length of time (in minutes) it takes for the geyser to erupt. # The Y variable is the waiting time until the next eruption. # It is believed that the waiting time depends on the eruption's length of time. # attaching the data frame: attach(oldfaithful.data) # First look at a scatter plot of waiting.time against eruption.length: plot(eruption.length, waiting.time) # The relationship between the variables seems somewhat linear, but is that really the best fit? # Using the kernel regression estimator # with the default bandwidth of 0.5: oldfaithful.reg.default <- ksmooth(eruption.length, waiting.time, kernel="normal") # Plotting the estimated curve on top of the data: plot(eruption.length, waiting.time); lines(oldfaithful.reg.default) # A smaller bandwidth produces an implausibly wiggly curve: oldfaithful.reg.smallbw <- ksmooth(eruption.length, waiting.time, kernel="normal", bandwidth=0.1); plot(eruption.length, waiting.time); lines(oldfaithful.reg.smallbw) # A larger bandwidth produces an overly smooth (?) curve: oldfaithful.reg.largebw <- ksmooth(eruption.length, waiting.time, kernel="normal", bandwidth=2); plot(eruption.length, waiting.time); lines(oldfaithful.reg.largebw) ########################################################################################## ### The 'loess' method is similar to kernel smoothing, and can be more useful for making predictions: # span (between 0 and 1) plays a similar role as the bandwidth in kernel regression: of.lo <- loess(waiting.time ~ eruption.length, data=oldfaithful.data) # span=0.75 is the default 'span' plot(of.lo);lines(sort(of.lo$x),sort(of.lo$fit)) predict(of.lo, data.frame(eruption.length = c(2.0, 3.0, 4.0)), se = TRUE) #predicting Y for three different values of X (standard errors of prediction are included) # A more wiggly fit: of.lo.w <- loess(waiting.time ~ eruption.length, data=oldfaithful.data, span=0.3) plot(of.lo.w);lines(sort(of.lo.w$x),sort(of.lo.w$fit))