## HW 3 Example R code: library(mdsr) library(tidyverse) #Problem 2, Chapter 4 Exercises: library(babynames) babynames %>% select(n > 100) babynames %>% select(- year) babynames %>% mutate(name_length == nchar(name)) babynames %>% sex == M %>% select(-prop) babynames %>% select(year, year, sex) babynames %>% group_by(n) %>% summarize(ave = mean(n)) babynames %>% group_by(n > 100) %>% summarize(total = sum(n)) #Problem 3, Chapter 4 Exercises: library(tidyverse) mtcars %>% group_by(cyl) %>% summarize(avg_mpg = mean(mpg)) %>% filter(am == 1) # Corrected: library(tidyverse) mtcars %>% filter(am == 1) %>% group_by(cyl) %>% summarize(avg_mpg = mean(mpg)) # Problem 11, Chapter 4 exercises: library(mdsr) head(Violations) # If basing the data set on having at least 50 VIOLATIONS: summary_by_zip <- Violations %>% filter(!is.na(violation_code)) %>% group_by(zipcode) %>% summarise(N=n(), median_violation_score = median(score,na.rm=T) ) summary_by_zip %>% filter(N>=50) %>% print(n=Inf) # Looking at Pattern: g11 <- summary_by_zip %>% filter(N>=50) %>% ggplot(aes(x=N,y=median_violation_score)) g11 + geom_point()+ ylab("Median Violation Score") + xlab("Number of Violations") # If basing the data set on having at least 50 INSPECTIONS: summary_by_zip <- Violations %>% group_by(zipcode) %>% summarise(N=n(), total_violations = sum(!is.na(violation_code)), median_violation_score = median(score,na.rm=T) ) summary_by_zip %>% filter(N>=50) %>% print(n=Inf) # Looking at Pattern: g11 <- summary_by_zip %>% filter(N>=50) %>% ggplot(aes(x=total_violations,y=median_violation_score)) g11 + geom_point()+ ylab("Median Violation Score") + xlab("Number of Violations") # If basing the data set on zip codes IN MANHATTAN having at least 50 INSPECTIONS: summary_by_zip <- Violations %>% filter(boro == "MANHATTAN") %>% group_by(zipcode) %>% summarise(N=n(), total_violations = sum(!is.na(violation_code)), median_violation_score = median(score,na.rm=T) ) summary_by_zip %>% filter(N>=50) %>% print(n=Inf) # Looking at Pattern: g11 <- summary_by_zip %>% filter(N>=50) %>% ggplot(aes(x=total_violations,y=median_violation_score)) g11 + geom_point()+ ylab("Median Violation Score") + xlab("Number of Violations") #Problem 13, Chapter 4 exercises: library(Lahman) ## Selecting a few columns for the Angels: all_angels <- Teams %>% filter(teamID == "CAL" | teamID == "ANA" | teamID == "LAA") %>% select(yearID, teamID, lgID, W, L, WSWin) all_angels_sort <- all_angels %>% mutate(WinPct = W/(W+L)) %>% arrange(desc(WinPct)) head(all_angels_sort,10) #or: print(all_angels_sort[1:10,]) #Problem 14, Chapter 4 exercises: library(nycflights13) head(flights) plane_summary <- flights %>% group_by(tailnum) %>% summarise(N=n() ) %>% arrange(desc(N)) plane_summary flights %>% filter(tailnum=='N725MQ') %>% group_by(month) %>% summarise(N=n() ) %>% ggplot(aes( x = month, y = N ) ) + geom_line() + xlab("Month") + ylab("Number of Flights") # Problem 2, Chapter 5 exercises: # (a) ## great_HR_SB <- Batting %>% group_by(playerID) %>% summarize( tHR = sum(HR), tSB = sum(SB) ) %>% filter(tHR >= 300 & tSB >= 300) %>% arrange(desc(tHR)) # arranging the table from best tHR to worst great_HR_SB great_HR_SB %>% inner_join(People, by = c("playerID" = "playerID")) %>% select(playerID, nameFirst, nameLast, tHR, tSB) # (b) ## great_W_SO <- Pitching %>% group_by(playerID) %>% summarize( tW = sum(W), tSO = sum(SO) ) %>% filter(tW >= 300 & tSO >= 3000) %>% arrange(desc(tW)) # arranging the table from best tW to worst great_W_SO great_W_SO %>% inner_join(People, by = c("playerID" = "playerID")) %>% select(playerID, nameFirst, nameLast, tW, tSO) # (c) ## seasons_over_50HR <- Batting %>% filter(HR >= 50) %>% mutate(BatAvg = H/AB) %>% arrange(desc(BatAvg)) # arranging the table from best BatAvg to worst seasons_over_50HR seasons_over_50HR %>% inner_join(People, by = c("playerID" = "playerID")) %>% select(playerID, yearID, nameFirst, nameLast, HR, BatAvg) # Problem 4, Chapter 5 exercises: # inner join, to get the planes that flew from NYC in 2013: planes_NYC <- planes %>% inner_join(flights, by = c("tailnum" = "tailnum")) head(planes_NYC,10) nrow(planes_NYC) planes_NYC %>% arrange(year.x) # using n_distinct to get the planes that flew from NYC in 2013: planes_NYC %>% summarize(N=n(), num_planes=n_distinct(tailnum))