## loading packages library(tidyverse) library(mdsr) # install.packages("googlesheets4") library(googlesheets4) ## This section of code requires authentication and permission ... it didn't work for me. hiv_key <- "1kWH_xdJDM4SMfT_Kzpkk-1yuxWChfurZuWYjfmv51EA" hiv <- read_sheet(hiv_key) %>% rename(Country = 1) %>% filter( Country %in% c("United States", "France", "South Africa") ) %>% select(Country, `1979`, `1989`, `1999`, `2009`) %>% unnest(cols = c(`2009`)) %>% mutate(across(matches("[0-9]"), as.double)) hiv ## ## Another way to obtain the wide-format HIV data in the book, which can be downloaded from the Gapminder website: # Reading a .csv file from a web location: hiv_data <- readr::read_csv(file="https://people.stat.sc.edu/hitchcock/adults_with_hiv_percent_age_15_49.csv") # Or if reading from a file stored in a directory on your computer: # hiv_data <- readr::read_csv(file="Z:/Z from MATHSTAT/My Documents/teaching/stat_542/adults_with_hiv_percent_age_15_49.csv") head(hiv_data,10) hiv <- hiv_data %>% rename(Country = 1) %>% filter( Country %in% c("USA", "France", "South Africa") # Note the U.S. is labeled "USA" in this data set, not "United States" as in the book code ) %>% select(Country, `1979`, `1989`, `1999`, `2009`) %>% unnest(cols = c(`2009`)) %>% mutate(across(matches("[0-9]"), as.double)) hiv ## Presenting the same data in long form: hiv %>% pivot_longer(-Country, names_to = "Year", values_to = "hiv_rate") ## Counts for a sample of a few baby name/year combinations: library(babynames) set.seed(325) names_short <- babynames %>% slice_sample(n = 7) %>% select(-prop) names_short %>% mdsr_table( caption = "A data table showing how many babies were given each name in each year in the United States, for a few names.", digits = 0 ) %>% kableExtra::kable_styling(full_width = FALSE) ## arranging names from most popular to least, over all years: popular_names <- babynames %>% group_by(sex, name) %>% summarize(total_births = sum(n)) %>% arrange(desc(total_births)) # Basic printing of output: head(popular_names,10) ## mdsr_table output of 10 most popular names: popular_names %>% head(10) %>% mdsr_table(caption = "The most popular baby names across all years.") ## neat <- Elections %>% mutate(Ward = factor(Ward)) %>% mutate(Precinct = factor(Precinct, levels = c("1","1C","2","2D","3","3A","4","4D", "5","5A","6","6C","7","8","9","10"))) %>% select(1,2,6,7,8,10) names(neat) <- c("ward", "precinct","registered","voters","absentee","total_turnout") head(neat) ## printing the 'neat' election data in mdsr_table format: neat %>% slice(round(seq(from = 1, to = 25, by = 3))) %>% mdsr_table(caption = "A selection from the Minneapolis election data in tidy form.") ## geom_jitter does a jittered dotplot of the various precincts, here separated by Ward on the x-axis: ggplot(data = neat, aes(x = ward, y = 100 * total_turnout)) + geom_jitter(width = 0.05, alpha = 0.5) + ylim(0,55) + ylab("Voter Turnout (%)") + xlab("Ward") ## Printing selected voter data rows as an mdsr table: mdsr::Minneapolis2013[c(6,2,10,5,27), ] %>% as_tibble() %>% mdsr_table(caption = "Individual ballots in the Minneapolis election. Each voter votes in one precinct within one ward. The ballot marks the voter's first three choices for mayor.") %>% kableExtra::column_spec(2:4, width = "9em") ## Printing selected voter data rows to R console: mdsr::Minneapolis2013[c(6,2,10,5,27), ] %>% as_tibble() %>% print(n=Inf) ## loading 'Cherry' data frame and creating a data frame called 'runners': data(Cherry) runners <- Cherry ## Printing selected runner data rows as an mdsr table: mdsr_table(runners[15996:16010, c(1,5,2,6,4)], caption = "An excerpt of runners' performance over time in a 10-mile race.") %>% kableExtra::column_spec(1, width = "10em") %>% kableExtra::column_spec(2, width = "3em") ## ## help(HELPrct) ## Reading in the blood pressure data: # From a web location: BP_full <- readr::read_csv(file="https://people.stat.sc.edu/hitchcock/Table6_8.csv") # from a directory on my computer: #BP_full <- readr::read_csv(file="Z:/Z from MATHSTAT/My Documents/teaching/stat_542/Table6_8.csv") BP_narrow <- BP_full %>% distinct(subject, when, .keep_all = TRUE) %>% select(subject, when, sbp) %>% arrange(desc(when), subject) BP_wide <- BP_narrow %>% pivot_wider(names_from = when, values_from = sbp) %>% select(subject, before, after) ## ----wide-example ##mdsr_table(BP_wide, caption = "A blood pressure data table in a wide format.") BP_wide ## ----narrow-example ## mdsr_table(BP_narrow, caption = "A tidy blood pressure data table in a narrow format.") BP_narrow ## defining a column for change in blood pressure: BP_wide %>% mutate(change = after - before) ## Printing the BP_full table as an mdsr table: mdsr_table(BP_full, caption = "A data table extending the information in the previous two to include additional variables and repeated measurements. The narrow format facilitates including new cases or variables.") BP_full ## Making a wide table from the narrow table: BP_narrow %>% pivot_wider(names_from = when, values_from = sbp) ## Making a long narrow table from the wide table: BP_wide %>% pivot_longer(-subject, names_to = "when", values_to = "sbp") ## Using 'summarize' to calculate mean SBP by subject: BP_full %>% group_by(subject, when) %>% summarize(mean_sbp = mean(sbp, na.rm = TRUE)) ## Changing number of digits printed in columns of a tibble (optional): BP_full %>% group_by(subject, when) %>% summarize(mean_sbp = num(mean(sbp, na.rm = TRUE),digits=2)) ##### Wrong approach: ## attempting to combine individual measurements, but 'paste' creates character vectors (not the best way): BP_summary <- BP_full %>% group_by(subject, when) %>% summarize( sbps = paste(sbp, collapse = ", "), dbps = paste(dbp, collapse = ", ") ) ## Doesn't give correct means for subject-time combinations: BP_summary %>% mutate(mean_sbp = mean(parse_number(sbps))) ################### ##### Better approach: ## adding a variable that is a list-column: BP_nested <- BP_full %>% group_by(subject, when) %>% nest() BP_nested ## Just using 'pull' gives an error: BP_nested %>% mutate(sbp_list = pull(data, sbp)) ## apply 'pull' to each item in the list using the 'map' function: BP_nested <- BP_nested %>% mutate(sbp_list = map(data, pull, sbp)) BP_nested ## This creates another list-column. ## The 'pluck' function lets you see what's inside the list-column: BP_nested %>% pluck("sbp_list") ## Note the elements of this list-column have different sizes. ## The 'map' function will apply the 'mean' operation to each element in he list-column: BP_nested <- BP_nested %>% mutate(sbp_mean = map(sbp_list, mean, na.rm = TRUE)) BP_nested ## We can now 'unnest' the last column to turn it back into a vector rather than a list-column: BP_nested %>% unnest(cols = c(sbp_mean)) ## We can unnest the sbp_list column as well, but this creates more rows and a longer table: BP_nested %>% unnest(cols = c(sbp_list)) ### Some more baby name examples: ## There aren't very many "Boys Named Sue": babynames %>% filter(name == "Sue") %>% group_by(name, sex) %>% summarize(total = sum(n)) ## 'Robin' is somewhat gender-neutral, but favoring females: babynames %>% filter(name == "Robin") %>% group_by(name, sex) %>% summarize(total = sum(n)) ## If we're comparing gender counts for a lot of names, it's more convenient to have the display in a wide format. ## Compare these options: babynames %>% filter(name %in% c("Sue", "Robin", "Leslie")) %>% group_by(name, sex) %>% summarize(total = sum(n)) # Use of pivot_wider to turn a long table into a wide one: babynames %>% filter(name %in% c("Sue", "Robin", "Leslie")) %>% group_by(name, sex) %>% summarize(total = sum(n)) %>% pivot_wider( names_from = sex, values_from = total ) ## This shows gender counts in the wide form for ALL names: baby_wide <- babynames %>% group_by(sex, name) %>% summarize(total = sum(n)) %>% pivot_wider( names_from = sex, values_from = total, values_fill = 0 # Puts 0 if a name has no babies of one gender ) head(baby_wide, 3) # Only printing the first three names (note the zeroes filled in) ## Arranging the names based on which gender ratio is closest to 50/50: baby_wide %>% filter(M > 50000, F > 50000) %>% mutate(ratio = pmin(M / F, F / M) ) %>% # the ratio would be 1 if there were exactly the same number of males and females for the name. # pmin returns the minimum ratio for each pair (i.e., for each name); 'min' would return the minimum of all the ratios for the whole data set arrange(desc(ratio)) %>% head(3) # Can change the head(3) to see more gender-neutral names... # Taylor Swift effect? StartYr <- 1880 #StartYr <- 1980 baby_wide_Taylor_by_year <- babynames %>% filter(name=="Taylor" & year >=StartYr) %>% group_by(year, sex) %>% summarize(total = sum(n)) %>% pivot_wider( names_from = sex, values_from = total, values_fill = 0 # Puts 0 if a name has no babies of one gender ) %>% mutate(adj_ratio = (M+1)/(F+1)) t1<- ggplot(data = baby_wide_Taylor_by_year, aes(x=year)) + geom_line(aes(y=adj_ratio)) + geom_vline(xintercept = 1989, col='red') + geom_hline(yintercept = 1, col='blue') + coord_trans(y = "log10") t1