## loading packages library(tidyverse) library(mdsr) # Creating a simple data frame with students in a Math class: name <- c("Jenny", "James", "Ming", "Alisha", "Tara", "Niels") test <- c(78, 81, 74, 82, 83, 91) quiz1 <- c(9,10,8,9.5,8.5,8) math <- data.frame(name,test,quiz1) print(math) # Creating a simple data frame with students in a Reading class: student <- c("Kyle", "Jenny", "Alisha", "Bob", "Laura") exercise <- c(3,4.5,5,4,5) test <- c(72, 91, 90, 84, 88) reading <- data.frame(student, exercise, test) print(reading) # A basic inner join, and seeing the result: both_inner <- math %>% inner_join(reading, by = c("name" = "student")) head(both_inner,10) nrow(both_inner) # A basic left join, and seeing the result: both_left <- math %>% left_join(reading, by = c("name" = "student")) head(both_left,10) nrow(both_left) # A basic right join, and seeing the result: both_right <- math %>% right_join(reading, by = c("name" = "student")) head(both_right,10) nrow(both_right) # A basic full join, and seeing the result: both_full <- math %>% full_join(reading, by = c("name" = "student")) head(both_full,10) nrow(both_full) # It can be convenient to rename some of the columns in the joined data set # (with the pipe operation, this really could be done while doing the join): both_inner <- both_inner %>% rename(math_test = test.x, reading_test=test.y) head(both_inner,10) # Can actually join by a logical condition other than equality (less common): both_left_better_math <- math %>% left_join(reading, by = join_by(test > test)) head(both_left_better_math,10) nrow(both_left_better_math) # gives all comparison cases where the math test is higher than the reading test... # installing and loading the 'nycflights13' package, and glimpsing the 'flights' data frame: # install.packages("nycflights13") library(nycflights13) glimpse(flights) ## The 'airlines' data frame with the full airline names as a column: head(airlines, 3) dim(airlines) print(airlines) ## Inner join of the 'flights' data frame with the 'airlines' data frame. ## note the key (ID) column is called "carrier" in both tables. flights_joined <- flights %>% inner_join(airlines, by = c("carrier" = "carrier")) glimpse(flights_joined) # Because of the identical name of the key column, could simply use: flights_joined <- flights %>% inner_join(airlines, by = join_by(carrier)) glimpse(flights_joined) ## Just selecting a few columns from the joined table: flights_joined %>% select(carrier, name, flight, origin, dest) %>% head(3) ## Note in this case, the inner join produces a table with the same number of rows as the first table nrow(flights) nrow(flights_joined) ## Creating a data frame with only the airports that are in the Pacific time zone: nrow(airports) airports_pt <- airports %>% filter(tz == -8) nrow(airports_pt) ## Inner join to obtain only the flights that had destinations in the Pacific time zone: nyc_dests_pt <- flights %>% inner_join(airports_pt, by = c("dest" = "faa")) nrow(nyc_dests_pt) ## Left join of 'flights' and 'airports_pt'. What will result? nyc_dests <- flights %>% left_join(airports_pt, by = c("dest" = "faa")) # Note the key column has a different name in the two datasets being joined glimpse(nyc_dests) # Only contains airport characteristics for SOME of the flights... # Summary counts of number of flights going to the Pacific time zone and number going elsewhere: nyc_dests %>% summarize( num_flights = n(), num_flights_pt = sum(!is.na(name)), num_flights_not_pt = sum(is.na(name)) ) # Joining multiple tables: # A third simple table: student <- c("Kyle", "Jenny", "Alisha", "Bob") activity <- c(4, 6, 8, 9) test <- c(55, 78, 92, 82) biology <- data.frame(student, activity, test) all_full <- math %>% full_join(reading, by = c("name" = "student")) %>% full_join(biology, by = c("name" = "student")) %>% rename(math_test = test.x, reading_test=test.y, biology_test=test) print(all_full) ############ Baseball example: ## Creating a data table 'manny' with the seasonal batting statistics for Manny Ramirez: library(Lahman) manny <- Batting %>% filter(playerID == "ramirma02") nrow(manny) ## Getting a few summary statistics for the 'manny' table: manny %>% summarize( span = paste(min(yearID), max(yearID), sep = "-"), #the 'paste' function pastes numbers and character strings together into one big character string num_years = n_distinct(yearID), # n_distinct returns the number of distinct (nonidentical) values of a variable num_teams = n_distinct(teamID), BA = sum(H)/sum(AB), tH = sum(H), tHR = sum(HR), tRBI = sum(RBI) ) ## Same summary table, but now grouped by team, and arranged from earliest span of employment to latest span: manny %>% group_by(teamID) %>% summarize( span = paste(min(yearID), max(yearID), sep = "-"), num_years = n_distinct(yearID), num_teams = n_distinct(teamID), BA = sum(H)/sum(AB), tH = sum(H), tHR = sum(HR), tRBI = sum(RBI) ) %>% arrange(span) ## Similar summary table, but now grouped by league, and arranged from earliest span of employment to latest span: manny %>% group_by(lgID) %>% summarize( span = paste(min(yearID), max(yearID), sep = "-"), num_years = n_distinct(yearID), num_teams = n_distinct(teamID), BA = sum(H)/sum(AB), tH = sum(H), tHR = sum(HR), tRBI = sum(RBI) ) %>% arrange(span) ## Only picking the rows where Manny has at least 30 home runs, and counting the number of rows in this filtered table: manny %>% filter(HR >= 30) %>% nrow() ## This doesn't actually count all the seasons when Manny hit at least 30 home runs. Why? ## This does count all the seasons when Manny hit at least 30 home runs: manny %>% group_by(yearID) %>% summarize(tHR = sum(HR)) %>% filter(tHR >= 30) %>% nrow() ## Checking for all people named "Manny Ramirez" in the master dataset called "People" ## Note the book's code uses Master but the master data set is now called People. People %>% filter(nameLast == "Ramirez" & nameFirst == "Manny") ## Note that this returns characteristics of the player that are permanent and do not change from season to season. ## Well, the 'weight' variable might change, but we don't keep track of that... ## We can merge the 'Batting' table with the 'People' table to bring the variable birthYear in. ## That allows us to track the Age of the player over the years. Batting %>% filter(playerID == "ramirma02") %>% inner_join(People, by = c("playerID" = "playerID")) %>% group_by(yearID) %>% summarize( Age = max(yearID - birthYear), # note we don't really need to ask for the max of Age ... the min or the mean would give the same result num_teams = n_distinct(teamID), BA = sum(H)/sum(AB), tH = sum(H), tHR = sum(HR), tRBI = sum(RBI) ) %>% arrange(yearID) ## Calculating some specialized summary statistics, by Year, with the 'summarize' function and saving the result as a new data table: manny_by_season <- Batting %>% filter(playerID == "ramirma02") %>% inner_join(People, by = c("playerID" = "playerID")) %>% group_by(yearID) %>% summarize( Age = max(yearID - birthYear), num_teams = n_distinct(teamID), BA = sum(H)/sum(AB), tH = sum(H), tHR = sum(HR), tRBI = sum(RBI), OBP = sum(H + BB + HBP) / sum(AB + BB + SF + HBP), SLG = sum(H + X2B + 2 * X3B + 3 * HR) / sum(AB) ) %>% mutate(OPS = OBP + SLG) %>% arrange(desc(OPS)) # arranging the table from best OPS to worst manny_by_season ## Calculating a few statistics (OBP, SLG, OPS) for the major leagues as a whole for the years Manny was active: mlb <- Batting %>% filter(yearID %in% 1993:2011) %>% group_by(yearID) %>% summarize( lg_OBP = sum(H + BB + HBP, na.rm = TRUE) / sum(AB + BB + SF + HBP, na.rm = TRUE), lg_SLG = sum(H + X2B + 2*X3B + 3*HR, na.rm = TRUE) / #Note doubles, triples, HRs are already counted as hits in the 'H' column sum(AB, na.rm = TRUE) ) %>% mutate(lg_OPS = lg_OBP + lg_SLG) ## Merging Manny's seasonal statistics with the major league seasonal statistics so we can compare Manny to the major-league standard each year. manny_ratio <- manny_by_season %>% inner_join(mlb, by = c("yearID" = "yearID")) %>% mutate(OPS_plus = OPS / lg_OPS) %>% select(yearID, Age, OPS, lg_OPS, OPS_plus) %>% arrange(desc(OPS_plus)) manny_ratio ## Creating a data set 'ripken' with Cal Ripken's seasonal statistics, and merging it with the 'mlb' data table: ripken <- Batting %>% filter(playerID == "ripkeca01") ripken %>% inner_join(mlb, by = c("yearID" = "yearID")) %>% nrow() # same mlb %>% inner_join(ripken, by = c("yearID" = "yearID")) %>% nrow() ## The left join will return all years when Ripken played, but lg_OPS will only have a value for the years when Ramirez also played: ripken %>% left_join(mlb, by = c("yearID" = "yearID")) %>% select(yearID, playerID, lg_OPS) %>% head(3) # just the first 3 rows of the left-joined table ripken %>% left_join(mlb, by = c("yearID" = "yearID")) %>% select(yearID, playerID, lg_OPS) %>% head(Inf) # all the rows of the left-joined table ## Reversing the tables in the left join returns all years of 'mlb' (which are all years that Ramirez played), but Ripken only played for some of those: mlb %>% left_join(ripken, by = c("yearID" = "yearID")) %>% select(yearID, playerID, lg_OPS) ## Basically equivalent to doing "ripken right_join mlb" with some slight differences in data display: ripken %>% right_join(mlb, by = c("yearID" = "yearID")) %>% select(yearID, playerID, lg_OPS) ## Full join gets all rows from either player's career: ripken %>% full_join(mlb, by = c("yearID" = "yearID")) %>% select(yearID, playerID, lg_OPS)