# R code to analyze the simulated (X,Y) data # using spline methods # Save the data file into a directory and # use the full path name: simul101.data <- read.table(file = "http://people.stat.sc.edu/Hitchcock/simulated101datapairs.txt", header=FALSE, col.names = c('x', 'y')) # attaching the data frame: attach(simul101.data) # First look at a scatter plot of Y against X: plot(x, y) # There appears to be a relationship between Y and X, but it is # not a conventional functional relationship ############### (Cubic) Regression Splines ###################### # Load the "splines" package: library(splines) # The bs() function in R produces B-splines, a computationally efficient # way to compute cubic regression splines ###################################### # Specifying 4 equally spaced knots: number.knots <- 4 spacings <- seq(from=min(x),to=max(x),length=number.knots+2)[2:(number.knots+1)] regr.spline <- lm(y ~ bs(x, df = NULL, knots=spacings, degree = 3, intercept=T)) # plotting the data with the regression spline overlain: x.values <- seq(from=min(x), to=max(x), length=200) plot(x, y); lines(x.values, predict(regr.spline, data.frame(x=x.values))) ###################################### # Specifying 10 equally spaced knots: number.knots <- 10 spacings <- seq(from=min(x),to=max(x),length=number.knots+2)[2:(number.knots+1)] regr.spline <- lm(y ~ bs(x, df = NULL, knots=spacings, degree = 3, intercept=T)) # plotting the data with the regression spline overlain: x.values <- seq(from=min(x), to=max(x), length=200) plot(x, y); lines(x.values, predict(regr.spline, data.frame(x=x.values))) ###################################### # We can specify unequally spaced knots. For example, we could put more # knots in the "wiggly" region of the data (from x=0.5 to x=1.0) and # fewer knots in the "flat" region of the data: regr.spline <- lm(y ~ bs(x, df = NULL, knots=c(0.25, 0.5, 0.6, 0.7, 0.8, 0.9), degree = 3, intercept=T)) x.values <- seq(from=min(x), to=max(x), length=200) plot(x, y); lines(x.values, predict(regr.spline, data.frame(x=x.values))) # Seems to work well for this function!! ########## (Cubic) Smoothing Splines ###################### # The smooth.spline() function in R computes (cubic) smoothing splines smoothspline.reg <- smooth.spline(x, y) # plotting the data with the smoothing spline overlain: plot(x, y); lines(smoothspline.reg) # By default, the value of the smoothing parameter is # determined by (generalized) cross-validation. # We can see the default choice of smoothing parameter by typing: smoothspline.reg$spar # The default choice is 0.5296. # We can specify a larger smoothing parameter value: smoothspline.reg.large <- smooth.spline(x, y, spar = 0.9) plot(x, y); lines(smoothspline.reg.large) # Or we can specify a smaller smoothing parameter value: smoothspline.reg.small <- smooth.spline(x, y, spar = 0.3) plot(x, y); lines(smoothspline.reg.small) ######### Spline-based Regression with the Old Faithful Data Set ####### # Save the data file into a directory and # use the full path name: oldfaithful.data <- read.table(file = "http://people.stat.sc.edu/Hitchcock/oldfaithfuldata.txt", header=FALSE, col.names = c('obsno', 'eruption.length', 'waiting.time')) # The X variable here is the length of time (in minutes) it takes for the geyser to erupt. # The Y variable is the waiting time until the next eruption. # attaching the data frame: attach(oldfaithful.data) ################################## # A regression spline with relatively more knots placed where there is denser data: regr.spline.OF <- lm(waiting.time ~ bs(eruption.length, df = NULL, knots=c(1.9, 2.2, 2.5, 3.0, 3.6, 4.1, 4.6), degree = 3, intercept=T)) x.values <- seq(from=min(eruption.length), to=max(eruption.length), length=200) plot(eruption.length, waiting.time); lines(x.values, predict(regr.spline.OF, data.frame(eruption.length=x.values))) ####################################### # Using the smoothing-spline method: smoothspline.reg.OF <- smooth.spline(eruption.length, waiting.time) plot(eruption.length, waiting.time); lines(smoothspline.reg.OF)