## loading packages
library(tidyverse)
library(mdsr)
# install.packages("googlesheets4")
library(googlesheets4)


## This section of code requires authentication and permission ... it didn't work for me.
hiv_key <- "1kWH_xdJDM4SMfT_Kzpkk-1yuxWChfurZuWYjfmv51EA"
hiv <- read_sheet(hiv_key) %>%
  rename(Country = 1) %>%
  filter(
    Country %in% c("United States", "France", "South Africa")
  ) %>%
  select(Country, `1979`, `1989`, `1999`, `2009`) %>%
  unnest(cols = c(`2009`)) %>%
  mutate(across(matches("[0-9]"), as.double))
hiv
##

## Another way to obtain the wide-format HIV data in the book, which can be downloaded from the Gapminder website:

# Reading a .csv file from a web location:
hiv_data <- readr::read_csv(file="https://people.stat.sc.edu/hitchcock/adults_with_hiv_percent_age_15_49.csv")
# Or if reading from a file stored in a directory on your computer:
# hiv_data <- readr::read_csv(file="Z:/Z from MATHSTAT/My Documents/teaching/stat_542/adults_with_hiv_percent_age_15_49.csv")
head(hiv_data,10)

hiv <- hiv_data %>%
  rename(Country = 1) %>%
  filter(
    Country %in% c("USA", "France", "South Africa")  # Note the U.S. is labeled "USA" in this data set, not "United States" as in the book code
  ) %>%
  select(Country, `1979`, `1989`, `1999`, `2009`) %>%
  unnest(cols = c(`2009`)) %>%
  mutate(across(matches("[0-9]"), as.double))
hiv


## Presenting the same data in long form:
hiv %>% 
  pivot_longer(-Country, names_to = "Year", values_to = "hiv_rate")


## Counts for a sample of a few baby name/year combinations:
library(babynames)
set.seed(325)
names_short <- babynames %>%
  slice_sample(n = 7) %>%
  select(-prop)
names_short %>%
  mdsr_table(
    caption = "A data table showing how many babies were given each name in each year in the United States, for a few names.", 
    digits = 0
  ) %>%
  kableExtra::kable_styling(full_width = FALSE)


## arranging names from most popular to least, over all years:
popular_names <- babynames %>% 
  group_by(sex, name) %>%
  summarize(total_births = sum(n)) %>% 
  arrange(desc(total_births))

# Basic printing of output:
head(popular_names,10)


## mdsr_table output of 10 most popular names:
popular_names %>%
  head(10) %>%
  mdsr_table(caption = "The most popular baby names across all years.")


## 
neat <- Elections %>%
  mutate(Ward = factor(Ward)) %>%
  mutate(Precinct = factor(Precinct, 
                        levels = c("1","1C","2","2D","3","3A","4","4D",
                                 "5","5A","6","6C","7","8","9","10"))) %>%
  select(1,2,6,7,8,10)
names(neat) <- c("ward", "precinct","registered","voters","absentee","total_turnout")

head(neat)

## printing the 'neat' election data in mdsr_table format:
neat %>%
  slice(round(seq(from = 1, to = 25, by = 3))) %>%
  mdsr_table(caption = "A selection from the Minneapolis election data in tidy form.")


## geom_jitter does a jittered dotplot of the various precincts, here separated by Ward on the x-axis:
ggplot(data = neat, aes(x = ward, y = 100 * total_turnout)) + 
  geom_jitter(width = 0.05, alpha = 0.5) + ylim(0,55) + 
  ylab("Voter Turnout (%)") + xlab("Ward")


## Printing selected voter data rows as an mdsr table:
mdsr::Minneapolis2013[c(6,2,10,5,27), ] %>%
  as_tibble() %>%
  mdsr_table(caption = "Individual ballots in the Minneapolis election. Each voter votes in one precinct within one ward. The ballot marks the voter's first three choices for mayor.") %>%
  kableExtra::column_spec(2:4, width = "9em")

## Printing selected voter data rows to R console:
mdsr::Minneapolis2013[c(6,2,10,5,27), ] %>%
  as_tibble() %>%
 print(n=Inf)


## loading 'Cherry' data frame and creating a data frame called 'runners':
data(Cherry)
runners <- Cherry


## Printing selected runner data rows as an mdsr table:
mdsr_table(runners[15996:16010, c(1,5,2,6,4)], caption = "An excerpt of runners' performance over time in a 10-mile race.") %>%
  kableExtra::column_spec(1, width = "10em") %>%
  kableExtra::column_spec(2, width = "3em")


## 
## help(HELPrct)


## Reading in the blood pressure data:
# From a web location:
BP_full <- readr::read_csv(file="https://people.stat.sc.edu/hitchcock/Table6_8.csv")
# from a directory on my computer:
#BP_full <- readr::read_csv(file="Z:/Z from MATHSTAT/My Documents/teaching/stat_542/Table6_8.csv")
BP_narrow <- BP_full %>%
  distinct(subject, when, .keep_all = TRUE) %>%
  select(subject, when, sbp) %>%
  arrange(desc(when), subject)
BP_wide <- BP_narrow %>%
  pivot_wider(names_from = when, values_from = sbp) %>%
  select(subject, before, after)


## ----wide-example
##mdsr_table(BP_wide, caption = "A blood pressure data table in a wide format.")

BP_wide

## ----narrow-example
## mdsr_table(BP_narrow, caption = "A tidy blood pressure data table in a narrow format.")

BP_narrow

## defining a column for change in blood pressure:
BP_wide %>% 
  mutate(change = after - before)


## Printing the BP_full table as an mdsr table:
mdsr_table(BP_full, caption = "A data table extending the information in the previous two to include additional variables and repeated measurements. The narrow format facilitates including new cases or variables.")

BP_full

## Making a wide table from the narrow table:
BP_narrow %>% 
  pivot_wider(names_from = when, values_from = sbp)


## Making a long narrow table from the wide table:
BP_wide %>% 
  pivot_longer(-subject, names_to = "when", values_to = "sbp")


## Using 'summarize' to calculate mean SBP by subject:
BP_full %>%
  group_by(subject, when) %>%
  summarize(mean_sbp = mean(sbp, na.rm = TRUE))

## Changing number of digits printed in columns of a tibble (optional):
BP_full %>%
  group_by(subject, when) %>%
  summarize(mean_sbp = num(mean(sbp, na.rm = TRUE),digits=2))


##### Wrong approach:
## attempting to combine individual measurements, but 'paste' creates character vectors (not the best way):
BP_summary <- BP_full %>%
  group_by(subject, when) %>%
  summarize(
    sbps = paste(sbp, collapse = ", "),
    dbps = paste(dbp, collapse = ", ")
  )
## Doesn't give correct means for subject-time combinations:
BP_summary %>%
  mutate(mean_sbp = mean(parse_number(sbps)))
###################


##### Better approach:
## adding a variable that is a list-column:
BP_nested <- BP_full %>%
  group_by(subject, when) %>%
  nest()
BP_nested


## Just using 'pull' gives an error:
BP_nested %>%
  mutate(sbp_list = pull(data, sbp))


## apply 'pull' to each item in the list using the 'map' function:
BP_nested <- BP_nested %>%
  mutate(sbp_list = map(data, pull, sbp))
BP_nested
## This creates another list-column.

## The 'pluck' function lets you see what's inside the list-column:
BP_nested %>% 
  pluck("sbp_list")
## Note the elements of this list-column have different sizes.

## The 'map' function will apply the 'mean' operation to each element in he list-column:
BP_nested <- BP_nested %>%
  mutate(sbp_mean = map(sbp_list, mean, na.rm = TRUE))
BP_nested


## We can now 'unnest' the last column to turn it back into a vector rather than a list-column:
BP_nested %>%
  unnest(cols = c(sbp_mean))


## We can unnest the sbp_list column as well, but this creates more rows and a longer table:
BP_nested %>%
  unnest(cols = c(sbp_list))

### Some more baby name examples:

## There aren't very many "Boys Named Sue":
babynames %>% 
  filter(name == "Sue") %>%
  group_by(name, sex) %>% 
  summarize(total = sum(n))


## 'Robin' is somewhat gender-neutral, but favoring females:
babynames %>% 
  filter(name == "Robin") %>%
  group_by(name, sex) %>% 
  summarize(total = sum(n))


## If we're comparing gender counts for a lot of names, it's more convenient to have the display in a wide format.
## Compare these options:

babynames %>%
  filter(name %in% c("Sue", "Robin", "Leslie")) %>%
  group_by(name, sex) %>%
  summarize(total = sum(n))

# Use of pivot_wider to turn a long table into a wide one:
babynames %>%
  filter(name %in% c("Sue", "Robin", "Leslie")) %>%
  group_by(name, sex) %>%
  summarize(total = sum(n)) %>%
  pivot_wider(
    names_from = sex, 
    values_from = total
  )


## This shows gender counts in the wide form for ALL names:
baby_wide <- babynames %>%
  group_by(sex, name) %>%
  summarize(total = sum(n)) %>%
  pivot_wider(
    names_from = sex, 
    values_from = total, 
    values_fill = 0  # Puts 0 if a name has no babies of one gender
  )
head(baby_wide, 3)  # Only printing the first three names (note the zeroes filled in)


## Arranging the names based on which gender ratio is closest to 50/50:
baby_wide %>% 
  filter(M > 50000, F > 50000) %>%
  mutate(ratio = pmin(M / F, F / M) ) %>%   
# the ratio would be 1 if there were exactly the same number of males and females for the name.
# pmin returns the minimum ratio for each pair (i.e., for each name); 'min' would return the minimum of all the ratios for the whole data set
  arrange(desc(ratio)) %>% 
  head(3)
# Can change the head(3) to see more gender-neutral names...

# Taylor Swift effect?

StartYr <- 1880
#StartYr <- 1980

baby_wide_Taylor_by_year <- babynames %>%
  filter(name=="Taylor" & year >=StartYr) %>%
  group_by(year, sex) %>%
  summarize(total = sum(n)) %>%
  pivot_wider(
    names_from = sex, 
    values_from = total, 
    values_fill = 0  # Puts 0 if a name has no babies of one gender
  ) %>%
mutate(adj_ratio = (M+1)/(F+1)) 

t1<- ggplot(data = baby_wide_Taylor_by_year, aes(x=year)) + geom_line(aes(y=adj_ratio)) + 
 geom_vline(xintercept = 1989, col='red') + geom_hline(yintercept = 1, col='blue') + 
  coord_trans(y = "log10")
t1