## HW 3 Example R code:

library(mdsr)
library(tidyverse)

#Problem 2, Chapter 4 Exercises:

library(babynames)

babynames %>% select(n > 100)

babynames %>% select(- year)

babynames %>% mutate(name_length == nchar(name))

babynames %>% sex == M %>% select(-prop)

babynames %>% select(year, year, sex)

babynames %>% group_by(n) %>% summarize(ave = mean(n))

babynames %>% group_by(n > 100) %>% summarize(total = sum(n))


#Problem 3, Chapter 4 Exercises:

library(tidyverse)
mtcars %>%
  group_by(cyl) %>%
  summarize(avg_mpg = mean(mpg)) %>%
  filter(am == 1)

# Corrected:

library(tidyverse)
mtcars %>%
  filter(am == 1) %>%
  group_by(cyl) %>%
  summarize(avg_mpg = mean(mpg))

# Problem 11, Chapter 4 exercises:

library(mdsr)
head(Violations)

# If basing the data set on having at least 50 VIOLATIONS:

summary_by_zip <- Violations %>%
 filter(!is.na(violation_code)) %>%
 group_by(zipcode) %>%
 summarise(N=n(),
 median_violation_score = median(score,na.rm=T)
  )

summary_by_zip %>%
 filter(N>=50) %>%
 print(n=Inf)

# Looking at Pattern:

g11 <- summary_by_zip %>%
 filter(N>=50) %>%
 ggplot(aes(x=N,y=median_violation_score))
g11 + geom_point()+
  ylab("Median Violation Score") + 
  xlab("Number of Violations")


# If basing the data set on having at least 50 INSPECTIONS:

summary_by_zip <- Violations %>%
 group_by(zipcode) %>%
 summarise(N=n(),
 total_violations = sum(!is.na(violation_code)),
 median_violation_score = median(score,na.rm=T)
  )

summary_by_zip %>%
 filter(N>=50) %>%
 print(n=Inf)

# Looking at Pattern:

g11 <- summary_by_zip %>%
 filter(N>=50) %>%
 ggplot(aes(x=total_violations,y=median_violation_score))
g11 + geom_point()+
  ylab("Median Violation Score") + 
  xlab("Number of Violations")


# If basing the data set on zip codes IN MANHATTAN having at least 50 INSPECTIONS:

summary_by_zip <- Violations %>%
 filter(boro == "MANHATTAN") %>%
 group_by(zipcode) %>%
 summarise(N=n(),
 total_violations = sum(!is.na(violation_code)),
 median_violation_score = median(score,na.rm=T)
  )

summary_by_zip %>%
 filter(N>=50) %>%
 print(n=Inf)

# Looking at Pattern:

g11 <- summary_by_zip %>%
 filter(N>=50) %>%
 ggplot(aes(x=total_violations,y=median_violation_score))
g11 + geom_point()+
  ylab("Median Violation Score") + 
  xlab("Number of Violations")




#Problem 13, Chapter 4 exercises:

library(Lahman)
## Selecting a few columns for the Angels:
all_angels <- Teams %>% 
  filter(teamID == "CAL" | teamID == "ANA" | teamID == "LAA") %>%
  select(yearID, teamID, lgID, W, L, WSWin)

all_angels_sort <- all_angels %>% 
  mutate(WinPct = W/(W+L)) %>%
  arrange(desc(WinPct)) 

head(all_angels_sort,10)
#or:
print(all_angels_sort[1:10,])


#Problem 14, Chapter 4 exercises:

library(nycflights13)
head(flights)

plane_summary <- flights %>%
 group_by(tailnum) %>%
 summarise(N=n()
  ) %>%
arrange(desc(N))

plane_summary

flights %>%
filter(tailnum=='N725MQ') %>%
group_by(month) %>% 
 summarise(N=n()
  ) %>%
ggplot(aes(
    x = month, 
    y = N
  )
) + 
  geom_line() +
  xlab("Month") + 
  ylab("Number of Flights")

# Problem 2, Chapter 5 exercises:

# (a)

## 
great_HR_SB <- Batting %>%
group_by(playerID) %>%
  summarize(
    tHR = sum(HR), 
    tSB = sum(SB)
  ) %>%
  filter(tHR >= 300 & tSB >= 300) %>% 
  arrange(desc(tHR))  # arranging the table from best tHR to worst
great_HR_SB

great_HR_SB %>%
inner_join(People, by = c("playerID" = "playerID")) %>%
select(playerID, nameFirst, nameLast, tHR, tSB)

# (b)

## 
great_W_SO <- Pitching %>%
group_by(playerID) %>%
  summarize(
    tW = sum(W), 
    tSO = sum(SO)
  ) %>%
  filter(tW >= 300 & tSO >= 3000) %>% 
  arrange(desc(tW))  # arranging the table from best tW to worst
great_W_SO

great_W_SO %>%
inner_join(People, by = c("playerID" = "playerID")) %>%
select(playerID, nameFirst, nameLast, tW, tSO)

# (c)


## 
seasons_over_50HR <- Batting %>%
  filter(HR >= 50) %>% 
  mutate(BatAvg = H/AB) %>%
  arrange(desc(BatAvg))  # arranging the table from best BatAvg to worst
seasons_over_50HR

seasons_over_50HR %>%
inner_join(People, by = c("playerID" = "playerID")) %>%
select(playerID, yearID, nameFirst, nameLast, HR, BatAvg)

# Problem 4, Chapter 5 exercises:

# inner join, to get the planes that flew from NYC in 2013:
planes_NYC <- planes %>%
 inner_join(flights, by = c("tailnum" = "tailnum"))
head(planes_NYC,10)
nrow(planes_NYC)

planes_NYC %>%
arrange(year.x)

# using n_distinct to get the planes that flew from NYC in 2013:
planes_NYC %>%
 summarize(N=n(),
num_planes=n_distinct(tailnum))