## loading packages and looking at the 'presidential' data frame
library(tidyverse)
library(mdsr)
presidential


## Using the 'select' function to get a subset of columns
select(presidential, name, party)


## Using the 'filter' function to get a subset of rows
filter(presidential, party == "Republican")


## Selecting, and filtering on a compound condition
select(
  filter(presidential, lubridate::year(start) > 1973 & party == "Democratic"), 
  name
)


## Another way to do the same thing, but with the pipe operation:
presidential %>%
  filter(lubridate::year(start) > 1973 & party == "Democratic") %>% 
  select(name)


## Other examples with logicals:

try1<-1:3
try2<-4:6
try3<-c(5,0,3)

(try1 < try2)

(try1 < try3)

(try1 < try2) & (try1 <try3)  # elementwise AND

(try1 < try2) && (try1 <try3) # Gives error

all(try1 < try2) && all(try1 <try3)

(try1 < try2) | (try1 <try3)  # elementwise OR

all(try1 < try2) || all(try1 <try3)

try1 != try3  # not-equals operator

## The 'interval' function can calculate the duration of time between two date values:
library(lubridate)
my_presidents <- presidential %>%
  mutate(term.length = interval(start, end) / dyears(1))
my_presidents


## The 'year' function will pick out the year of a date value:
my_presidents <- my_presidents %>% 
  mutate(elected = year(start) - 1)
my_presidents

## Similar examples with the 'month', 'wday', and 'day' functions:
# (label=T returns the abbreviated word of the weekday rather than the number)
my_presidents_more <- my_presidents %>% 
  mutate(inaug_month = month(start)) %>% 
  mutate(inaug_weekday = wday(start,label=T)) %>%  
  mutate(inaug_day = day(start)) 
my_presidents_more


## Putting missing values for presidents who were not actually elected
## the syntax for 'ifelse' is:  ifelse(test_condition, result_if_TRUE, result_if_FALSE)
my_presidents <- my_presidents %>%
  mutate(elected = ifelse(elected %in% c(1962, 1973), NA, elected))
my_presidents


## Stylistic choice to prefer underscores to periods in variable names, function names, etc.
my_presidents <- my_presidents %>% 
  rename(term_length = term.length)
my_presidents


## ordering the rows based on one column's values
my_presidents %>% 
  arrange(desc(term_length))


## A nested sorting.  Can you explain the sorted result?
my_presidents %>% 
  arrange(desc(term_length), party, elected)

## If Hmisc is loaded, may need to detach it to avoid confusion with Hmisc's summarize function:
# detach(package:Hmisc) 
# Or could use the 'summarise' spelling which works in the 'dplyr' package.

## Summary statistics for the whole data set
my_presidents %>%
  summarize(
    N = n(), 
    first_year = min(year(start)), 
    last_year = max(year(end)), 
    num_dems = sum(party == "Democratic"), 
    years = sum(term_length), 
    avg_term_length = mean(term_length)
  )


## Summary statistics, separate by party:

my_presidents %>% 
  group_by(party) %>% 
  summarize(
    N = n(), 
    first_year = min(year(start)), 
    last_year = max(year(end)), 
    num_dems = sum(party == "Democratic"), 
    years = sum(term_length), 
    avg_term_length = mean(term_length)
  )


## install and load package:
# install.packages("Lahman")
library(Lahman)
dim(Teams)

head(Teams)

## Selecting a few columns from several recent years for the New York Mets (this creates two new data frames in the workspace)
mets <- Teams %>% 
  filter(teamID == "NYN")
my_mets <- mets %>% 
  filter(yearID %in% 2004:2012)
my_mets %>% 
  select(yearID, teamID, W, L)


## Checking the number of rows in the resulting data set:
nrow(mets)


## doing the same selection and filtering with one line rather than through pipe operations:
select(filter(Teams, teamID == "NYN" & yearID %in% 2004:2012),
  yearID, teamID, W, L)


## Same operation, but using the pipe operator and not creating new data frames:
## Teams %>%
##   filter(teamID == "NYN" & yearID %in% 2004:2012) %>%
##   select(yearID, teamID, W, L)

# Clean up workspace:
rm(mets)
rm(my_mets)


## Similar, but selecting more columns for this data frame 'mets_ben'
mets_ben <- Teams %>% 
  select(yearID, teamID, W, L, R, RA) %>%
  filter(teamID == "NYN" & yearID %in% 2004:2012)
mets_ben


## Changing R ("runs") column name to RS ("runs scored")
mets_ben <- mets_ben %>% 
  rename(RS = R)    # new name = old name
mets_ben


## Creating a new "winning percentage" column:
mets_ben <- mets_ben %>% 
  mutate(WPct = W / (W + L))
mets_ben


## "Pythagorean" estimate of expected winning percentage based on runs scored vs. runs allowed:
mets_ben <- mets_ben %>% 
  mutate(WPct_hat = 1 / (1 + (RA/RS)^2))
mets_ben


## New variable: "Expected" number of wins
mets_ben <- mets_ben %>% 
  mutate(W_hat = WPct_hat * (W + L))
mets_ben


## Picking which years the Mets won at least as many games than "expected"
filter(mets_ben, W >= W_hat)
## Picking which years the Mets won fewer games than "expected"
filter(mets_ben, W < W_hat)

# Were the Mets "especially lucky" during this time period, or "especially unlucky"?


## sorting rows from best record to worst:
arrange(mets_ben, desc(WPct))


## Sorting from "luckiest" years to "unluckiest" years:
mets_ben %>% 
  mutate(Diff = W - W_hat) %>% 
  arrange(desc(Diff))


## Summary statistics for a single variable:
mets_ben %>%
  skim(W)

## Summary statistics for all variables:
mets_ben %>%
  skim()

## Use skimr::skim to get summary statistics for different TYPES of varaibles at once:

mpg %>%
skimr::skim() %>%
print(n=Inf)  ## This prints all rows of the resulting data frame

## Summary statistics for several variables:
mets_ben %>% 
  summarize(
    num_years = n(), 
    total_W = sum(W), 
    total_L = sum(L), 
    total_WPct = sum(W) / sum(W + L), 
    sum_resid = sum(W - W_hat)
  )
# If an error, change 'summarize' to 'summarise'


## Using nested-ifelse to define a gm (general manager) column:
mets_ben <- mets_ben %>% 
  mutate(
    gm = ifelse(
      yearID == 2004, 
      "Duquette", 
      ifelse(
        yearID >= 2011, 
        "Alderson", 
        "Minaya")
    )
  )


## Same operation, but using case-when (simpler syntax)
mets_ben <- mets_ben %>% 
  mutate(
    gm = case_when(
      yearID == 2004 ~ "Duquette", 
      yearID >= 2011 ~ "Alderson", 
      TRUE ~ "Minaya"
    )
  )

## Similar operation, but using case-when with .default:

mets_ben <- mets_ben %>% 
  mutate(
    gm = case_when(
      yearID == 2004 ~ "Duquette", 
      yearID >= 2011 ~ "Alderson",
      .default = "Minaya"  # This will assign "Minaya" if the previous logicals are FALSE or in the case of missing values
    )
  )


## Summary statistics, separated by value of general manager:
mets_ben %>% 
  group_by(gm) %>% 
  summarize(
    num_years = n(), 
    total_W = sum(W), 
    total_L = sum(L), 
    total_WPct = sum(W) / sum(W + L), 
    sum_resid = sum(W - W_hat)
  ) %>%
  arrange(desc(sum_resid))


## Doing all the above operations at once, using chaining with pipe operators:
Teams %>%
  select(yearID, teamID, W, L, R, RA) %>%
  filter(teamID == "NYN" & yearID %in% 2004:2012) %>%
  rename(RS = R) %>% 
  mutate(
    WPct = W / (W + L), 
    WPct_hat = 1 / (1 + (RA/RS)^2), 
    W_hat = WPct_hat * (W + L), 
    gm = case_when(
      yearID == 2004 ~ "Duquette", 
      yearID >= 2011 ~ "Alderson", 
      TRUE ~ "Minaya"
    )
  ) %>%
  group_by(gm) %>%
  summarize(
    num_years = n(), 
    total_W = sum(W), 
    total_L = sum(L),
    total_WPct = sum(W) / sum(W + L), 
    sum_resid = sum(W - W_hat)
  ) %>%
  arrange(desc(sum_resid))


## All franchises from that era, from unluckiest to luckiest:
Teams %>%
  select(yearID, teamID, franchID, W, L, R, RA) %>%
  filter(yearID %in% 2004:2012) %>%
  rename(RS = R) %>% 
  mutate(
    WPct = W / (W + L), 
    WPct_hat = 1 / (1 + (RA/RS)^2), 
    W_hat = WPct_hat * (W + L)
  ) %>%
  group_by(franchID) %>%
  summarize(
    num_years = n(), 
    total_W = sum(W), 
    total_L = sum(L),
    total_WPct = sum(W) / sum(W + L), 
    sum_resid = sum(W - W_hat)
  ) %>%
  arrange(sum_resid) %>%
  head(6)

# Change head(6)
# to print(n=20) or whatever to see more rows,
# or print (n=Inf) to see all rows

# A simple scatterplot to look at the association between luck and success:
Teams %>%
  select(yearID, teamID, franchID, W, L, R, RA) %>%
  filter(yearID %in% 2004:2012) %>%
  rename(RS = R) %>% 
  mutate(
    WPct = W / (W + L), 
    WPct_hat = 1 / (1 + (RA/RS)^2), 
    W_hat = WPct_hat * (W + L)
  ) %>%
  group_by(franchID) %>%
  summarize(
    num_years = n(), 
    total_W = sum(W), 
    total_L = sum(L),
    total_WPct = sum(W) / sum(W + L), 
    sum_resid = sum(W - W_hat)
  ) %>%
  arrange(sum_resid) %>%
ggplot(aes(x=sum_resid, y=total_WPct)) + geom_point()