## loading packages: library(tidyverse) library(mdsr) library(Lahman) names(Teams) ## Getting information about the columns in Teams: ## str(Teams) glimpse(Teams) ## In R, most things are stored as vectors! a <- "a string" class(a) is.vector(a) length(a) ## Vectorized operation (takes a vector as input, returns a vector as output): exp(1:3) ## A summary function (takes a vector as input, returns a single number as output): mean(1:3) ## The 'if' function can only accept as input a logical vector of length 1: if (c(TRUE, FALSE)) { cat("This is a great book!") } # gives error ## An iterative operation using a loop (not recommended): averages <- NULL for (i in 15:40) { averages[i - 14] <- mean(Teams[, i], na.rm = TRUE) } names(averages) <- names(Teams)[15:40] averages ## Simpler code using the colMeans function (recommended) colMeans(Teams[,15:40], na.rm = TRUE) # Note that using the numbers 15 and 40 in the code makes this code non-reproducible on other data tables # or on a potentially altered version of this data table... ## Same iterative operation using 'map_dbl': Teams %>% select(15:40) %>% map_dbl(mean, na.rm = TRUE) ## Why does this attempted use of 'map_dbl' with 'mean' fail? Teams %>% select(teamID) %>% map_dbl(mean, na.rm = TRUE) ## This works: Teams %>% select(name) %>% map(nchar) ## comparing a loop to a vectorized operation to applying a function with 'map_dbl': # install.packages("bench") library(bench) exploop <- function(vec){ evec <- rep(0, length=length(vec)) for (i in 1:length(vec)) { evec[i] <- exp(vec[i]) } return(evec) } x <- 1:100000 bench::mark( exploop(x), exp(x), map_dbl(x, exp) ) ## Using 'across' to specify WHICH variables to summarize: Teams %>% summarize(across(where(is.numeric), mean, na.rm = TRUE)) ## A more updated syntax, avoids warning... Teams %>% summarize(across(where(is.numeric), \(x) mean(x, na.rm = TRUE)) ) ## Another way to use 'across' to specify WHICH variables to summarize: Teams %>% summarize(across(c(yearID, R:SF, BPF), mean, na.rm = TRUE)) ## Summaries of the Angels franchise, separated by different versions of the team name: angels <- Teams %>% filter(franchID == "ANA") %>% group_by(teamID, name) %>% summarize(began = first(yearID), ended = last(yearID)) %>% arrange(began) angels ## Summaries of the Braves franchise, separated by (surprisingly many!) versions of the team name: braves <- Teams %>% filter(franchID == "ATL") %>% group_by(teamID, name) %>% summarize(began = first(yearID), ended = last(yearID)) %>% arrange(began) braves ## Iterating manually to see how long each 'angels' team name is: angels_names <- angels %>% pull(name) angels_names # a character vector containing the various Angels team names nchar(angels_names[1]) nchar(angels_names[2]) nchar(angels_names[3]) nchar(angels_names[4]) ## Using 'map_int' to automate the iterated operations is better: map_int(angels_names, nchar) ## Since 'nchar' is vectorized, using it directly is even better! nchar(angels_names) ## writing our own function 'top5' to pick out the top 5 seasons based on Wins: top5 <- function(data, team_name) { data %>% filter(name == team_name) %>% select(teamID, yearID, W, L, name) %>% arrange(desc(W)) %>% head(n = 5) } ## ----------------------------------------------------------------------------- angels_names %>% map(top5, data = Teams) ## Each element of 'angels_names' will in turn be the value of the 'team_name' argument in the 'top5' function. ## 'map_dfr' will return a data frame (which we can then summarize) rather than a list, which 'map' returns: angels_names %>% map_dfr(top5, data = Teams) ## Summary table separated by team name: angels_names %>% map_dfr(top5, data = Teams) %>% group_by(teamID, name) %>% summarize(N = n(), mean_wins = mean(W)) %>% arrange(desc(mean_wins)) ## Fit for Nonlinear Pythagorean Winning Percentage Model to predict Winning Percentage, for teams since 1954, ## as a function of 'run_ratio' = RS/RA # Defining the nonlinear function of x (run_ratio will play the role of x): exp_wpct <- function(x) { return(1/(1 + (1/x)^2)) } TeamRuns <- Teams %>% filter(yearID >= 1954) %>% rename(RS = R) %>% mutate(WPct = W / (W + L), run_ratio = RS/RA) %>% select(yearID, teamID, lgID, WPct, run_ratio) ggplot(data = TeamRuns, aes(x = run_ratio, y = WPct)) + geom_vline(xintercept = 1, color = "darkgray", linetype = 2) + geom_hline(yintercept = 0.5, color = "darkgray", linetype = 2) + geom_point(alpha = 0.2) + stat_function(fun = exp_wpct, size = 2, color = "blue") + xlab("Ratio of Runs Scored to Runs Allowed") + ylab("Winning Percentage") ## Letting the exponent k be a parameter and using nonlinear least squares to estimate k: TeamRuns %>% nls( formula = WPct ~ 1/(1 + (1/run_ratio)^k), start = list(k = 2) ) %>% coef() ## A general function that will fit the nonlinear regression model and return the estimate of k: fit_k <- function(x) { mod <- nls( formula = WPct ~ 1/(1 + (1/run_ratio)^k), data = x, start = list(k = 2) ) return(tibble(k = coef(mod), n = nrow(x))) } ## Applying the 'fit_k' function once to the whole data set fit_k(TeamRuns) ## using group_modify to apply the 'fit_k' function to each decade: TeamRuns %>% mutate(decade = yearID %/% 10 * 10) %>% # defines a 'decade' variable group_by(decade) %>% # groups the rows by decade instead of having one row for each year group_modify(~fit_k(.x)) # Note the .x is a placeholder for the input data frame for a decade ## A function to pick out the team that was the leader in home runs: hr_leader <- function(x) { # x is a subset of Teams for a single year and league x %>% select(teamID, HR) %>% arrange(desc(HR)) %>% head(1) } ## Using 'hr_leader' to pick out the team that was the leader in home runs for a particular league and year: Teams %>% filter(yearID == 1961 & lgID == "AL") %>% hr_leader() ## Now applying this function over all league-year combinations with 'group_modify': hr_leaders <- Teams %>% group_by(yearID, lgID) %>% group_modify(~hr_leader(.x), .keep = TRUE) # Note the .x is a placeholder for the input data frame for a league-year combination tail(hr_leaders, 4) # Printing the most recent league leaders ## For all the different major leagues (did you know there were so many?), finding the average number of HRs per season by its league leaders throughout history: hr_leaders %>% group_by(lgID) %>% summarize(mean_hr = mean(HR)) ## Just restricting attention to the "modern" era: hr_leaders %>% filter(yearID >= 1916) %>% group_by(lgID) %>% summarize(mean_hr = mean(HR)) ## Number of home runs hit by the team with the most home runs, 1916--2023. ## Note how the AL has consistently bested the NL since the introduction of the designated hitter (DH) in 1973. hr_leaders %>% filter(yearID >= 1916) %>% ggplot(aes(x = yearID, y = HR, color = lgID)) + geom_line() + geom_point() + geom_smooth(se = FALSE) + geom_vline(xintercept = 1973) + annotate( "text", x = 1974, y = 25, label = "AL adopts DH", hjust = "left" ) + geom_vline(xintercept = 2022) + # updating the book's code, since the NL adopted the DH in 2022... annotate( "text", x = 2000, y = 30, label = "NL adopts DH", hjust = "left" ) + labs(x = "Year", y = "Home runs", color = "League") ## Do you see a couple of outlier years? Any explanation for those? dev.new() # setting up new plotting window ## The above plot just focused on the league-leading teams for each year. ## Let's look at the average number of home runs hit among ALL teams each year in each league, since 1916: ## A function to find average number of home runs hit among ALL teams: hr_avg_overall <- function(x) { # x is a subset of Teams for a single year and league x %>% summarize(avg_team_HRs = mean(HR)) } ## Now applying this function over all league-year combinations with 'group_modify': hr_avg_all_teams <- Teams %>% group_by(yearID, lgID) %>% group_modify(~hr_avg_overall(.x), .keep = TRUE) # Note the .x is a placeholder for the input data frame for a league-year combination # The most recent years: tail(hr_avg_all_teams) ## Number of team home runs hit on average, 1916--2023 ## Note how the AL has consistently bested the NL since the introduction of the designated hitter (DH) in 1973. hr_avg_all_teams %>% filter(yearID >= 1916) %>% ggplot(aes(x = yearID, y = avg_team_HRs, color = lgID)) + geom_line() + geom_point() + geom_smooth(se = FALSE) + geom_vline(xintercept = 1973) + annotate( "text", x = 1974, y = 25, label = "AL adopts DH", hjust = "left" ) + geom_vline(xintercept = 2022) + # updating the book's code, since the NL adopted the DH in 2022... annotate( "text", x = 2000, y = 30, label = "NL adopts DH", hjust = "left" ) + labs(x = "Year", y = "Home runs", color = "League") ## Similar pattern as above with slight differences (1920s/1930s), but note the different numbers on the y-axis. #### Extended example on BMI: ## loading package: library(NHANES) head(NHANES) ## Basic scatterplot of BMI against age: ggplot(NHANES, aes(x = Age, y = BMI)) + geom_point() + geom_smooth() ## Write a generic function to plot BMI against any other variable: bmi_plot <- function(.data, x_var) { ggplot(.data, aes(y = BMI)) + aes_string(x = x_var) + geom_jitter(alpha = 0.3) + # geom_jitter is in case one of the variables has lots of tied values... geom_smooth() + labs( title = paste("BMI by", x_var), subtitle = "NHANES", caption = "US National Center for Health Statistics (NCHS)" ) } ## Using 'bmi_plot' function to plot BMI against only Age: bmi_plot(NHANES, "Age") # install.packages("patchwork") library(patchwork) ## Now using 'map' to apply the 'bmi_plot' function to repeatedly plot BMI against a set of other variables: c("Age", "HHIncomeMid", "PhysActiveDays", "TVHrsDay", "AlcoholDay", "Pulse") %>% map(bmi_plot, .data = NHANES) %>% patchwork::wrap_plots(ncol = 2) #names(NHANES)[24:35] %>% # map(bmi_plot, .data = NHANES) %>% # patchwork::wrap_plots(ncol = 3)