## loading packages
library(tidyverse)
library(mdsr)


# Creating a simple data frame with students in a Math class:
name <- c("Jenny", "James", "Ming", "Alisha", "Tara", "Niels")
test <- c(78, 81, 74, 82, 83, 91)
quiz1 <- c(9,10,8,9.5,8.5,8)
math <- data.frame(name,test,quiz1)
print(math)

# Creating a simple data frame with students in a Reading class:
student <- c("Kyle", "Jenny", "Alisha", "Bob", "Laura")
exercise <- c(3,4.5,5,4,5)
test <- c(72, 91, 90, 84, 88)
reading <- data.frame(student, exercise, test)
print(reading)

# A basic inner join, and seeing the result:
both_inner <- math %>%
 inner_join(reading, by = c("name" = "student"))
head(both_inner,10)
nrow(both_inner)

# A basic left join, and seeing the result:
both_left <- math %>%
 left_join(reading, by = c("name" = "student"))
head(both_left,10)
nrow(both_left)

# A basic right join, and seeing the result:
both_right <- math %>%
 right_join(reading, by = c("name" = "student"))
head(both_right,10)
nrow(both_right)

# A basic full join, and seeing the result:
both_full <- math %>%
 full_join(reading, by = c("name" = "student"))
head(both_full,10)
nrow(both_full)

# It can be convenient to rename some of the columns in the joined data set
# (with the pipe operation, this really could be done while doing the join):
both_inner <- both_inner %>%
 rename(math_test = test.x, reading_test=test.y)
head(both_inner,10)

# Can actually join by a logical condition other than equality (less common):

both_left_better_math <- math %>%
 left_join(reading, by = join_by(test > test))
head(both_left_better_math,10)
nrow(both_left_better_math)

# gives all comparison cases where the math test is higher than the reading test...


# installing and loading the 'nycflights13' package, and glimpsing the 'flights' data frame:
# install.packages("nycflights13")
library(nycflights13)
glimpse(flights)


## The 'airlines' data frame with the full airline names as a column:
head(airlines, 3)
dim(airlines)
print(airlines)

## Inner join of the 'flights' data frame with the 'airlines' data frame.
## note the key (ID) column is called "carrier" in both tables.
flights_joined <- flights %>% 
  inner_join(airlines, by = c("carrier" = "carrier"))
glimpse(flights_joined)

# Because of the identical name of the key column, could simply use:

flights_joined <- flights %>% 
  inner_join(airlines, by = join_by(carrier))
glimpse(flights_joined)


## Just selecting a few columns from the joined table:
flights_joined %>% 
  select(carrier, name, flight, origin, dest) %>% 
  head(3)


## Note in this case, the inner join produces a table with the same number of rows as the first table
nrow(flights)
nrow(flights_joined)


## Creating a data frame with only the airports that are in the Pacific time zone:
nrow(airports)
airports_pt <- airports %>%
  filter(tz == -8)
nrow(airports_pt)


## Inner join to obtain only the flights that had destinations in the Pacific time zone:
nyc_dests_pt <- flights %>% 
  inner_join(airports_pt, by = c("dest" = "faa"))
nrow(nyc_dests_pt)


## Left join of 'flights' and 'airports_pt'.  What will result?
nyc_dests <- flights %>% 
  left_join(airports_pt, by = c("dest" = "faa"))  # Note the key column has a different name in the two datasets being joined
glimpse(nyc_dests)

# Only contains airport characteristics for SOME of the flights...

# Summary counts of number of flights going to the Pacific time zone and number going elsewhere:
nyc_dests %>%
  summarize(
    num_flights = n(),
    num_flights_pt = sum(!is.na(name)),
    num_flights_not_pt = sum(is.na(name))
  )

# Joining multiple tables:

# A third simple table:
student <- c("Kyle", "Jenny", "Alisha", "Bob")
activity <- c(4, 6, 8, 9)
test <- c(55, 78, 92, 82)
biology <- data.frame(student, activity, test)

all_full <- math %>%
 full_join(reading, by = c("name" = "student")) %>%
 full_join(biology, by = c("name" = "student")) %>%
 rename(math_test = test.x, reading_test=test.y, biology_test=test)
print(all_full)


############ Baseball example:

## Creating a data table 'manny' with the seasonal batting statistics for Manny Ramirez:
library(Lahman)
manny <- Batting %>%
  filter(playerID == "ramirma02")
nrow(manny)


## Getting a few summary statistics for the 'manny' table:
manny %>% 
  summarize(
    span = paste(min(yearID), max(yearID), sep = "-"),  #the 'paste' function pastes numbers and character strings together into one big character string
    num_years = n_distinct(yearID),  # n_distinct returns the number of distinct (nonidentical) values of a variable
    num_teams = n_distinct(teamID), 
    BA = sum(H)/sum(AB), 
    tH = sum(H), 
    tHR = sum(HR), 
    tRBI = sum(RBI)
  )


## Same summary table, but now grouped by team, and arranged from earliest span of employment to latest span:
manny %>% 
  group_by(teamID) %>%
  summarize(
    span = paste(min(yearID), max(yearID), sep = "-"), 
    num_years = n_distinct(yearID), 
    num_teams = n_distinct(teamID), 
    BA = sum(H)/sum(AB), 
    tH = sum(H), 
    tHR = sum(HR), 
    tRBI = sum(RBI)
  ) %>%
  arrange(span)


## Similar summary table, but now grouped by league, and arranged from earliest span of employment to latest span:
manny %>% 
  group_by(lgID) %>%
  summarize(
    span = paste(min(yearID), max(yearID), sep = "-"), 
    num_years = n_distinct(yearID), 
    num_teams = n_distinct(teamID), 
    BA = sum(H)/sum(AB), 
    tH = sum(H), 
    tHR = sum(HR), 
    tRBI = sum(RBI)
  ) %>%
  arrange(span)


## Only picking the rows where Manny has at least 30 home runs, and counting the number of rows in this filtered table:
manny %>% 
  filter(HR >= 30) %>% 
  nrow()
## This doesn't actually count all the seasons when Manny hit at least 30 home runs.  Why?

## This does count all the seasons when Manny hit at least 30 home runs:
manny %>% 
  group_by(yearID) %>% 
  summarize(tHR = sum(HR)) %>%
  filter(tHR >= 30) %>% 
  nrow()


## Checking for all people named "Manny Ramirez" in the master dataset called "People"
## Note the book's code uses Master but the master data set is now called People.
People %>% 
  filter(nameLast == "Ramirez" & nameFirst == "Manny")

## Note that this returns characteristics of the player that are permanent and do not change from season to season.
## Well, the 'weight' variable might change, but we don't keep track of that...


## We can merge the 'Batting' table with the 'People' table to bring the variable birthYear in.
## That allows us to track the Age of the player over the years.
Batting %>% 
  filter(playerID == "ramirma02") %>%
  inner_join(People, by = c("playerID" = "playerID")) %>%
  group_by(yearID) %>%
  summarize(
    Age = max(yearID - birthYear),  # note we don't really need to ask for the max of Age ... the min or the mean would give the same result
    num_teams = n_distinct(teamID), 
    BA = sum(H)/sum(AB), 
    tH = sum(H), 
    tHR = sum(HR), 
    tRBI = sum(RBI)
  ) %>%
  arrange(yearID)


## Calculating some specialized summary statistics, by Year, with the 'summarize' function and saving the result as a new data table:
manny_by_season <- Batting %>%
  filter(playerID == "ramirma02") %>%
  inner_join(People, by = c("playerID" = "playerID"))  %>%
  group_by(yearID) %>%
  summarize(
    Age = max(yearID - birthYear), 
    num_teams = n_distinct(teamID), 
    BA = sum(H)/sum(AB), 
    tH = sum(H), 
    tHR = sum(HR), 
    tRBI = sum(RBI), 
    OBP = sum(H + BB + HBP) / sum(AB + BB + SF + HBP),
    SLG = sum(H + X2B + 2 * X3B + 3 * HR) / sum(AB)
  ) %>%
  mutate(OPS = OBP + SLG) %>% 
  arrange(desc(OPS))  # arranging the table from best OPS to worst
manny_by_season


## Calculating a few statistics (OBP, SLG, OPS) for the major leagues as a whole for the years Manny was active:
mlb <- Batting %>%
  filter(yearID %in% 1993:2011) %>%
  group_by(yearID) %>%
  summarize(
    lg_OBP = sum(H + BB + HBP, na.rm = TRUE) / 
      sum(AB + BB + SF + HBP, na.rm = TRUE), 
    lg_SLG = sum(H + X2B + 2*X3B + 3*HR, na.rm = TRUE) /   #Note doubles, triples, HRs are already counted as hits in the 'H' column
      sum(AB, na.rm = TRUE)
  ) %>%
  mutate(lg_OPS = lg_OBP + lg_SLG)


## Merging Manny's seasonal statistics with the major league seasonal statistics so we can compare Manny to the major-league standard each year.
manny_ratio <- manny_by_season %>%
  inner_join(mlb, by = c("yearID" = "yearID")) %>%
  mutate(OPS_plus = OPS / lg_OPS) %>%
  select(yearID, Age, OPS, lg_OPS, OPS_plus) %>%
  arrange(desc(OPS_plus))
manny_ratio


## Creating a data set 'ripken' with Cal Ripken's seasonal statistics, and merging it with the 'mlb' data table:
ripken <- Batting %>% 
  filter(playerID == "ripkeca01")
ripken %>%
  inner_join(mlb, by = c("yearID" = "yearID")) %>%
  nrow()
# same
mlb %>%
  inner_join(ripken, by = c("yearID" = "yearID")) %>%
  nrow()


## The left join will return all years when Ripken played, but lg_OPS will only have a value for the years when Ramirez also played:
ripken %>% 
  left_join(mlb, by = c("yearID" = "yearID")) %>%
  select(yearID, playerID, lg_OPS) %>% 
  head(3)  # just the first 3 rows of the left-joined table


ripken %>% 
  left_join(mlb, by = c("yearID" = "yearID")) %>%
  select(yearID, playerID, lg_OPS) %>% 
  head(Inf)  # all the rows of the left-joined table


## Reversing the tables in the left join returns all years of 'mlb' (which are all years that Ramirez played), but Ripken only played for some of those:
mlb %>%
 left_join(ripken, by = c("yearID" = "yearID")) %>%
 select(yearID, playerID, lg_OPS)

## Basically equivalent to doing "ripken right_join mlb" with some slight differences in data display:
ripken %>% 
  right_join(mlb, by = c("yearID" = "yearID")) %>%
  select(yearID, playerID, lg_OPS)

## Full join gets all rows from either player's career:

ripken %>% 
  full_join(mlb, by = c("yearID" = "yearID")) %>%
  select(yearID, playerID, lg_OPS)