## loading packages and looking at the 'presidential' data frame library(tidyverse) library(mdsr) presidential ## Using the 'select' function to get a subset of columns select(presidential, name, party) ## Using the 'filter' function to get a subset of rows filter(presidential, party == "Republican") ## Selecting, and filtering on a compound condition select( filter(presidential, lubridate::year(start) > 1973 & party == "Democratic"), name ) ## Another way to do the same thing, but with the pipe operation: presidential %>% filter(lubridate::year(start) > 1973 & party == "Democratic") %>% select(name) ## Other examples with logicals: try1<-1:3 try2<-4:6 try3<-c(5,0,3) (try1 < try2) (try1 < try3) (try1 < try2) & (try1 % mutate(term.length = interval(start, end) / dyears(1)) my_presidents ## The 'year' function will pick out the year of a date value: my_presidents <- my_presidents %>% mutate(elected = year(start) - 1) my_presidents ## Similar examples with the 'month', 'wday', and 'day' functions: # (label=T returns the abbreviated word of the weekday rather than the number) my_presidents_more <- my_presidents %>% mutate(inaug_month = month(start)) %>% mutate(inaug_weekday = wday(start,label=T)) %>% mutate(inaug_day = day(start)) my_presidents_more ## Putting missing values for presidents who were not actually elected ## the syntax for 'ifelse' is: ifelse(test_condition, result_if_TRUE, result_if_FALSE) my_presidents <- my_presidents %>% mutate(elected = ifelse(elected %in% c(1962, 1973), NA, elected)) my_presidents ## Stylistic choice to prefer underscores to periods in variable names, function names, etc. my_presidents <- my_presidents %>% rename(term_length = term.length) my_presidents ## ordering the rows based on one column's values my_presidents %>% arrange(desc(term_length)) ## A nested sorting. Can you explain the sorted result? my_presidents %>% arrange(desc(term_length), party, elected) ## If Hmisc is loaded, may need to detach it to avoid confusion with Hmisc's summarize function: # detach(package:Hmisc) # Or could use the 'summarise' spelling which works in the 'dplyr' package. ## Summary statistics for the whole data set my_presidents %>% summarize( N = n(), first_year = min(year(start)), last_year = max(year(end)), num_dems = sum(party == "Democratic"), years = sum(term_length), avg_term_length = mean(term_length) ) ## Summary statistics, separate by party: my_presidents %>% group_by(party) %>% summarize( N = n(), first_year = min(year(start)), last_year = max(year(end)), num_dems = sum(party == "Democratic"), years = sum(term_length), avg_term_length = mean(term_length) ) ## install and load package: # install.packages("Lahman") library(Lahman) dim(Teams) head(Teams) ## Selecting a few columns from several recent years for the New York Mets (this creates two new data frames in the workspace) mets <- Teams %>% filter(teamID == "NYN") my_mets <- mets %>% filter(yearID %in% 2004:2012) my_mets %>% select(yearID, teamID, W, L) ## Checking the number of rows in the resulting data set: nrow(mets) ## doing the same selection and filtering with one line rather than through pipe operations: select(filter(Teams, teamID == "NYN" & yearID %in% 2004:2012), yearID, teamID, W, L) ## Same operation, but using the pipe operator and not creating new data frames: ## Teams %>% ## filter(teamID == "NYN" & yearID %in% 2004:2012) %>% ## select(yearID, teamID, W, L) # Clean up workspace: rm(mets) rm(my_mets) ## Similar, but selecting more columns for this data frame 'mets_ben' mets_ben <- Teams %>% select(yearID, teamID, W, L, R, RA) %>% filter(teamID == "NYN" & yearID %in% 2004:2012) mets_ben ## Changing R ("runs") column name to RS ("runs scored") mets_ben <- mets_ben %>% rename(RS = R) # new name = old name mets_ben ## Creating a new "winning percentage" column: mets_ben <- mets_ben %>% mutate(WPct = W / (W + L)) mets_ben ## "Pythagorean" estimate of expected winning percentage based on runs scored vs. runs allowed: mets_ben <- mets_ben %>% mutate(WPct_hat = 1 / (1 + (RA/RS)^2)) mets_ben ## New variable: "Expected" number of wins mets_ben <- mets_ben %>% mutate(W_hat = WPct_hat * (W + L)) mets_ben ## Picking which years the Mets won at least as many games than "expected" filter(mets_ben, W >= W_hat) ## Picking which years the Mets won fewer games than "expected" filter(mets_ben, W < W_hat) # Were the Mets "especially lucky" during this time period, or "especially unlucky"? ## sorting rows from best record to worst: arrange(mets_ben, desc(WPct)) ## Sorting from "luckiest" years to "unluckiest" years: mets_ben %>% mutate(Diff = W - W_hat) %>% arrange(desc(Diff)) ## Summary statistics for a single variable: mets_ben %>% skim(W) ## Summary statistics for several variables: mets_ben %>% summarize( num_years = n(), total_W = sum(W), total_L = sum(L), total_WPct = sum(W) / sum(W + L), sum_resid = sum(W - W_hat) ) # If an error, change 'summarize' to 'summarise' ## Using nested-ifelse to define a gm (general manager) column: mets_ben <- mets_ben %>% mutate( gm = ifelse( yearID == 2004, "Duquette", ifelse( yearID >= 2011, "Alderson", "Minaya") ) ) ## Same operation, but using case-when (simpler syntax) mets_ben <- mets_ben %>% mutate( gm = case_when( yearID == 2004 ~ "Duquette", yearID >= 2011 ~ "Alderson", TRUE ~ "Minaya" ) ) ## Summary statistics, separated by value of general manager: mets_ben %>% group_by(gm) %>% summarize( num_years = n(), total_W = sum(W), total_L = sum(L), total_WPct = sum(W) / sum(W + L), sum_resid = sum(W - W_hat) ) %>% arrange(desc(sum_resid)) ## Doing all the above operations at once, using chaining with pipe operators: Teams %>% select(yearID, teamID, W, L, R, RA) %>% filter(teamID == "NYN" & yearID %in% 2004:2012) %>% rename(RS = R) %>% mutate( WPct = W / (W + L), WPct_hat = 1 / (1 + (RA/RS)^2), W_hat = WPct_hat * (W + L), gm = case_when( yearID == 2004 ~ "Duquette", yearID >= 2011 ~ "Alderson", TRUE ~ "Minaya" ) ) %>% group_by(gm) %>% summarize( num_years = n(), total_W = sum(W), total_L = sum(L), total_WPct = sum(W) / sum(W + L), sum_resid = sum(W - W_hat) ) %>% arrange(desc(sum_resid)) ## All franchises from that era, from unluckiest to luckiest: Teams %>% select(yearID, teamID, franchID, W, L, R, RA) %>% filter(yearID %in% 2004:2012) %>% rename(RS = R) %>% mutate( WPct = W / (W + L), WPct_hat = 1 / (1 + (RA/RS)^2), W_hat = WPct_hat * (W + L) ) %>% group_by(franchID) %>% summarize( num_years = n(), total_W = sum(W), total_L = sum(L), total_WPct = sum(W) / sum(W + L), sum_resid = sum(W - W_hat) ) %>% arrange(sum_resid) %>% head(6) # Change head(6) # to print(n=20) or whatever to see more rows, # or print (n=Inf) to see all rows