## loading packages:
library(tidyverse)
library(mdsr)
library(Lahman)
names(Teams)


## Getting information about the columns in Teams:
## str(Teams)
 glimpse(Teams)


## In R, most things are stored as vectors!
a <- "a string"
class(a)
is.vector(a)
length(a)


## Vectorized operation (takes a vector as input, returns a vector as output):
exp(1:3)


## A summary function (takes a vector as input, returns a single number as output):
mean(1:3)


## The 'if' function can only accept as input a logical vector of length 1:
if (c(TRUE, FALSE)) {
  cat("This is a great book!")
}
# gives error


## An iterative operation using a loop (not recommended):
averages <- NULL
for (i in 15:40) {
  averages[i - 14] <- mean(Teams[, i], na.rm = TRUE)
}
names(averages) <- names(Teams)[15:40]
averages

## Simpler code using the colMeans function (recommended)
colMeans(Teams[,15:40], na.rm = TRUE)

# Note that using the numbers 15 and 40 in the code makes this code non-reproducible on other data tables
# or on a potentially altered version of this data table...

## Same iterative operation using 'map_dbl':
Teams %>% 
  select(15:40) %>%
  map_dbl(mean, na.rm = TRUE)


## Why does this attempted use of 'map_dbl' with 'mean' fail?
Teams %>% 
  select(teamID) %>% 
  map_dbl(mean, na.rm = TRUE)


## This works:
Teams %>% 
  select(name) %>% 
  map(nchar)


## comparing a loop to a vectorized operation to applying a function with 'map_dbl':
# install.packages("bench")
library(bench)
exploop <- function(vec){
evec <- rep(0, length=length(vec))
for (i in 1:length(vec)) {
evec[i] <- exp(vec[i])
}
return(evec)
}
x <- 1:100000
bench::mark(
  exploop(x),
  exp(x),
  map_dbl(x, exp)
)


## Using 'across' to specify WHICH variables to summarize:
Teams %>%
  summarize(across(where(is.numeric), mean, na.rm = TRUE))

## A more updated syntax, avoids warning...
Teams %>%
  summarize(across(where(is.numeric), \(x) mean(x, na.rm = TRUE)) )


## Another way to use 'across' to specify WHICH variables to summarize:
Teams %>%
  summarize(across(c(yearID, R:SF, BPF), mean, na.rm = TRUE))


## Summaries of the Angels franchise, separated by different versions of the team name:
angels <- Teams %>% 
  filter(franchID == "ANA") %>% 
  group_by(teamID, name) %>%
  summarize(began = first(yearID), ended = last(yearID)) %>% 
  arrange(began)
angels

## Summaries of the Braves franchise, separated by (surprisingly many!) versions of the team name:
braves <- Teams %>% 
  filter(franchID == "ATL") %>% 
  group_by(teamID, name) %>%
  summarize(began = first(yearID), ended = last(yearID)) %>% 
  arrange(began)
braves


## Iterating manually to see how long each 'angels' team name is:
angels_names <- angels %>%
  pull(name)
angels_names  # a character vector containing the various Angels team names

nchar(angels_names[1])
nchar(angels_names[2])
nchar(angels_names[3])
nchar(angels_names[4])


## Using 'map_int' to automate the iterated operations is better:
map_int(angels_names, nchar)


## Since 'nchar' is vectorized, using it directly is even better!
nchar(angels_names)


## writing our own function 'top5' to pick out the top 5 seasons based on Wins:
top5 <- function(data, team_name) {
  data %>%
    filter(name == team_name) %>%
    select(teamID, yearID, W, L, name) %>%
    arrange(desc(W)) %>%
    head(n = 5)
}


## -----------------------------------------------------------------------------
angels_names %>%
  map(top5, data = Teams)

## Each element of 'angels_names' will in turn be the value of the 'team_name' argument in the 'top5' function.


## 'map_dfr' will return a data frame (which we can then summarize) rather than a list, which 'map' returns:

angels_names %>% 
  map_dfr(top5, data = Teams)

## Summary table separated by team name:
angels_names %>% 
  map_dfr(top5, data = Teams) %>%
  group_by(teamID, name) %>%
  summarize(N = n(), mean_wins = mean(W)) %>%
  arrange(desc(mean_wins))


## Fit for Nonlinear Pythagorean Winning Percentage Model to predict Winning Percentage, for teams since 1954,
## as a function of 'run_ratio' = RS/RA

# Defining the nonlinear function of x (run_ratio will play the role of x):
exp_wpct <- function(x) { 
  return(1/(1 + (1/x)^2))
}

TeamRuns <- Teams %>% 
  filter(yearID >= 1954) %>%
  rename(RS = R) %>% 
  mutate(WPct = W / (W + L), run_ratio = RS/RA) %>%
  select(yearID, teamID, lgID, WPct, run_ratio)

ggplot(data = TeamRuns, aes(x = run_ratio, y = WPct)) +
  geom_vline(xintercept = 1, color = "darkgray", linetype = 2) +
  geom_hline(yintercept = 0.5, color = "darkgray", linetype = 2) +
  geom_point(alpha = 0.2) + 
  stat_function(fun = exp_wpct, size = 2, color = "blue") + 
  xlab("Ratio of Runs Scored to Runs Allowed") + 
  ylab("Winning Percentage")


## Letting the exponent k be a parameter and using nonlinear least squares to estimate k:
TeamRuns %>%
  nls(
    formula = WPct ~ 1/(1 + (1/run_ratio)^k), 
    start = list(k = 2)
  ) %>%
  coef()


## A general function that will fit the nonlinear regression model and return the estimate of k:
fit_k <- function(x) {
  mod <- nls(
    formula = WPct ~ 1/(1 + (1/run_ratio)^k), 
    data = x,
    start = list(k = 2)
  )
  return(tibble(k = coef(mod), n = nrow(x)))
}


## Applying the 'fit_k' function once to the whole data set
fit_k(TeamRuns)


## using group_modify to apply the 'fit_k' function to each decade:
TeamRuns %>% 
  mutate(decade = yearID %/% 10 * 10) %>%   # defines a 'decade' variable
  group_by(decade) %>%    # groups the rows by decade instead of having one row for each year
  group_modify(~fit_k(.x))  # Note the .x is a placeholder for the input data frame for a decade


## A function to pick out the team that was the leader in home runs:
hr_leader <- function(x) {
# x is a subset of Teams for a single year and league
  x %>% 
    select(teamID, HR) %>% 
    arrange(desc(HR)) %>% 
    head(1)
}


##  Using 'hr_leader' to pick out the team that was the leader in home runs for a particular league and year:
Teams %>% 
  filter(yearID == 1961 & lgID == "AL") %>% 
  hr_leader()


## Now applying this function over all league-year combinations with 'group_modify':
hr_leaders <- Teams %>% 
  group_by(yearID, lgID) %>% 
  group_modify(~hr_leader(.x), .keep = TRUE)  # Note the .x is a placeholder for the input data frame for a league-year combination

tail(hr_leaders, 4)  # Printing the most recent league leaders


## For all the different major leagues (did you know there were so many?), finding the average number of HRs per season by its league leaders throughout history:
hr_leaders %>%
  group_by(lgID) %>%
  summarize(mean_hr = mean(HR))


## Just restricting attention to the "modern" era:
hr_leaders %>%
  filter(yearID >= 1916) %>%
  group_by(lgID) %>%
  summarize(mean_hr = mean(HR))


## Number of home runs hit by the team with the most home runs, 1916--2023. 
## Note how the AL has consistently bested the NL since the introduction of the designated hitter (DH) in 1973.
hr_leaders %>% 
  filter(yearID >= 1916) %>%
  ggplot(aes(x = yearID, y = HR, color = lgID)) + 
  geom_line() + 
  geom_point() + 
  geom_smooth(se = FALSE) + 
  geom_vline(xintercept = 1973) + 
  annotate(
    "text", x = 1974, y = 25, 
    label = "AL adopts DH", hjust = "left"
  ) +
 geom_vline(xintercept = 2022) +     # updating the book's code, since the NL adopted the DH in 2022...
  annotate(
    "text", x = 2000, y = 30, 
    label = "NL adopts DH", hjust = "left"
  ) +
  labs(x = "Year", y = "Home runs", color = "League")

## Do you see a couple of outlier years?  Any explanation for those?

dev.new()  # setting up new plotting window

## The above plot just focused on the league-leading teams for each year.
## Let's look at the average number of home runs hit among ALL teams each year in each league, since 1916:


## A function to find average number of home runs hit among ALL teams:
hr_avg_overall <- function(x) {
# x is a subset of Teams for a single year and league
  x %>% 
    summarize(avg_team_HRs = mean(HR)) 
}


## Now applying this function over all league-year combinations with 'group_modify':
hr_avg_all_teams <- Teams %>% 
  group_by(yearID, lgID) %>% 
  group_modify(~hr_avg_overall(.x), .keep = TRUE)  # Note the .x is a placeholder for the input data frame for a league-year combination

# The most recent years:
tail(hr_avg_all_teams)


## Number of team home runs hit on average, 1916--2023
## Note how the AL has consistently bested the NL since the introduction of the designated hitter (DH) in 1973.
hr_avg_all_teams %>% 
  filter(yearID >= 1916) %>%
  ggplot(aes(x = yearID, y = avg_team_HRs, color = lgID)) + 
  geom_line() + 
  geom_point() + 
  geom_smooth(se = FALSE) + 
  geom_vline(xintercept = 1973) + 
  annotate(
    "text", x = 1974, y = 25, 
    label = "AL adopts DH", hjust = "left"
  ) +
 geom_vline(xintercept = 2022) +     # updating the book's code, since the NL adopted the DH in 2022...
  annotate(
    "text", x = 2000, y = 30, 
    label = "NL adopts DH", hjust = "left"
  ) +
  labs(x = "Year", y = "Home runs", color = "League")

## Similar pattern as above with slight differences (1920s/1930s), but note the different numbers on the y-axis.


#### Extended example on BMI:

## loading package:
library(NHANES)

head(NHANES)


## Basic scatterplot of BMI against age:
ggplot(NHANES, aes(x = Age, y = BMI)) +
  geom_point() + 
  geom_smooth()


## Write a generic function to plot BMI against any other variable:
bmi_plot <- function(.data, x_var) {
  ggplot(.data, aes(y = BMI)) +
    aes_string(x = x_var) + 
    geom_jitter(alpha = 0.3) + # geom_jitter is in case one of the variables has lots of tied values...
    geom_smooth() + 
    labs(
      title = paste("BMI by", x_var),
      subtitle = "NHANES",
      caption = "US National Center for Health Statistics (NCHS)"
    )
}


## Using 'bmi_plot' function to plot BMI against only Age:
bmi_plot(NHANES, "Age")

# install.packages("patchwork")
library(patchwork)

## Now using 'map' to apply the 'bmi_plot' function to repeatedly plot BMI against a set of other variables:
c("Age", "HHIncomeMid", "PhysActiveDays", 
  "TVHrsDay", "AlcoholDay", "Pulse") %>%
  map(bmi_plot, .data = NHANES) %>%
  patchwork::wrap_plots(ncol = 2)


#names(NHANES)[24:35] %>%
#  map(bmi_plot, .data = NHANES) %>%
#  patchwork::wrap_plots(ncol = 3)