## ----cache=FALSE, echo=FALSE,include=FALSE------------------------------------
knitr::opts_chunk$set(error = TRUE)


## ----mdsr, message=FALSE------------------------------------------------------
library(mdsr)
library(tidyverse)


# Creating the CIACountries data frame:
CIACountries %>%
  select(-area, -pop) %>%
  head() %>%
  mdsr_table(caption = "A selection of variables from the first six rows of the CIA countries data table.") %>%
  kableExtra::column_spec(1, width = "10em")

# Printing the first few rows of the CIACountries data frame:
head(CIACountries)

# Simple scatterplot using geom_point (Fig. 3.1):
g <- ggplot(data = CIACountries, aes(y = gdp, x = educ))
g + geom_point(size = 3)

# Try changing the size value
# The warning message is because of missing values in either gdp or educ, or both, for some countries

# Addition of information on a third variable (categorical) using the Color aesthetic (Fig. 3.2):
g + geom_point(aes(color = net_users), size = 3)

# Replacing the plotting characters with country names (Fig. 3.3):
g + geom_text(aes(label = country, color = net_users), size = 3)

#Shorter (but less descriptive?) names using abbreviate:
g + geom_text(aes(label = abbreviate(country), color = net_users), size = 3)

# Adding information on a fourth variable using the size of the bubbles (Fig. 3.4):
g + geom_point(aes(color = net_users, size = roadways))


# Plotting the y-axis in log10 scale (Fig. 3.5):
g + 
  geom_point(aes(color = net_users, size = roadways)) + 
  coord_trans(y = "log10")


# A different way to present y-axis on log scale, and a better y-axis label (Fig. 3.6):
g + 
  geom_point(aes(color = net_users, size = roadways)) + 
  scale_y_continuous(
    name = "Gross Domestic Product", 
    trans = "log10",
    labels = scales::comma
  )


# Using facets rather than colors to separate the different levels of net_users (Fig. 3.7):
g + 
  geom_point(alpha = 0.9, aes(size = roadways)) + 
  coord_trans(y = "log10") + 
  facet_wrap(~net_users, nrow = 1) + 
  theme(legend.position = "top")
# What are pros and cons of using facets vs. colors here?

# A simple example of facet_grid to show a grid of facets for two categorical variables:

# Based on the 'mpg' dataset:
head(mpg)

# creating a simple ggplot object for a scatterplot of city gas mileage against displacement
 p <- ggplot(mpg, aes(displ, cty)) + geom_point()
# Use vars() to supply variables from the dataset:
# grid of scatterplots for combinations of type of drive (rows) and number of cylinders (columns)
p + facet_grid(vars(drv), vars(cyl))
# grid of scatterplots for combinations of class of vehicle (rows) and number of cylinders (columns)
p + facet_grid(vars(class), vars(cyl))


## Obtaining the ChargesNJ data frame from a larger data frame, MedicareCharges:
ChargesNJ <- MedicareCharges %>%
  filter(stateProvider == "NJ")


## ----drg-NJ, echo=FALSE,warning=FALSE, message=FALSE, results='asis'----------
set.seed(101)
ChargesNJ %>% 
  head(10) %>%
  mdsr_table(caption = "Glyph-ready data for the barplot layer.")

# First few observations of ChargesNJ data frame:
head(ChargesNJ)


## Bar graph of average charges for various medical procedures in New Jersey (Fig. 3.8)
p <- ggplot(
  data = ChargesNJ, 
  aes(x = reorder(drg, mean_charge), y = mean_charge)
) +
  geom_col(fill = "gray") +
  ylab("Statewide Average Charges ($)") + 
  xlab("Medical Procedure (DRG)") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = rel(0.5)))
p
# Note the first group of lines of code create the graphics object p,
# but we have to type p to display the graph here


## Bar graph with second layer to compare New Jersey to other states. Each dot represents one state, while the bars represent New Jersey (Fig 3.9)
p + geom_point(data = MedicareCharges, size = 1, alpha = 0.3) 

# Note than MedicareCharges data frame has data from ALL the states...
# Maybe we should make the dots exclude the NJ values:

MedicareChargesNoNJ <- MedicareCharges %>%
  filter(stateProvider != "NJ")
p + geom_point(data = MedicareChargesNoNJ, size = 1, alpha = 0.3) 


## Creates basic ggplot object that we can then build on:
g <- ggplot(data = SAT_2010, aes(x = math))


## Histogram of math SAT scores by state (Fig. 3.10)
g + geom_histogram(binwidth = 10) + labs(x = "Average Math SAT score")

## Trying different bin widths:

g + geom_histogram(binwidth = 6) + labs(x = "Average Math SAT score")


g + geom_histogram(binwidth = 14) + labs(x = "Average Math SAT score")

# Which looks better for this data set?

# Could also specify the number of bins directly:
g + geom_histogram(bins = 8) + labs(x = "Average Math SAT score")


## Density plot of average math SAT scores by state (Fig. 3.11)
g + geom_density(adjust = 0.3)

# Change the "adjust" argument to get a smoother or more wiggly density estimate:

# lower bandwidth = more wiggly estimate
g + geom_density(adjust = 0.1)

# higher bandwidth = smoother estimate
g + geom_density(adjust = 0.9)


## bar plot of average math SAT scores for a selection of states (Fig. 3.12)
ggplot(
  data = head(SAT_2010, 10), 
  aes(x = reorder(state, math), y = math)
) +
  geom_col() +
  labs(x = "State", y = "Average Math SAT score")

# Installing (if needed) and loading the mosaicData package:

# install.packages("mosaicData")
library(mosaicData)

## Stacked Bar Plot with Color (Fig. 3.13)
ggplot(data = mosaicData::HELPrct, aes(x = homeless)) + 
  geom_bar(aes(fill = substance), position = "fill") +
  scale_fill_brewer(palette = "Spectral") + 
  coord_flip()

# Using 'fct_relevel' to reorder the levels of the factor 'substance' before redoing the plot:
HELPrct_alt <- mosaicData::HELPrct  %>%
  mutate(substance = fct_relevel(substance, c('heroin','cocaine','alcohol')))

ggplot(data = HELPrct_alt, aes(x = homeless)) + 
  geom_bar(aes(fill = substance), position = "fill") +
  scale_fill_brewer(palette = "Spectral") + 
  coord_flip()


## Stacked VERTICAL Bar Plot with Color (like Fig. 3.13 but without flipping the coordinates)
ggplot(data = mosaicData::HELPrct, aes(x = homeless)) + 
  geom_bar(aes(fill = substance), position = "fill") +
  scale_fill_brewer(palette = "Spectral") 

## Note the difference between position="stack" (shows counts) and position="fill" (shows proportions):
ggplot(data = mosaicData::HELPrct, aes(x = homeless)) + 
  geom_bar(aes(fill = substance), position = "stack") +
  scale_fill_brewer(palette = "Spectral") 


## Creating a basic ggplot object
g <- ggplot(
  data = SAT_2010, 
  aes(x = expenditure, y = math)
) + 
  geom_point()
#plotting it:
g

## Plotting it as a scatterplot with a trend line:
g <- g + 
  geom_smooth(method = "lm", se = FALSE) +     # Note "lm" will show a LINEAR trend
  xlab("Average expenditure per student ($1000)") +
  ylab("Average score on math SAT")
g

## Creating categories ("low", "medium", "high") of SAT_2010 with the 'cut' function:
SAT_2010 <- SAT_2010 %>%
  mutate(
    SAT_rate = cut(
      sat_pct, 
      breaks = c(0, 30, 60, 100), 
      labels = c("low", "medium", "high")
    )
  )
g <- g %+% SAT_2010

## Plotting the symbolic scatterplot with SAT categories separated by color (Fig. 3.14):
g + aes(color = SAT_rate)


## Plotting 3 separate scatterplots for the SAT categories (Fig. 3.15):
g + facet_wrap(~ SAT_rate)


## install and load "NHANES" package:
# install.packages("NHANES")
library(NHANES)
# Scatterplot of height vs. age, separated in Color by Gender, with smooth trend curves (NONlinear here) added (Fig. 3.16):
ggplot(
  data = slice_sample(NHANES, n = 1000), 
  aes(x = Age, y = Height, color = fct_relevel(Gender, "male"))  #reorders Gender levels so "male" is first
) + 
  geom_point() + 
  geom_smooth() + 
  xlab("Age (years)") + 
  ylab("Height (cm)") +
  labs(color = "Gender")


## time series plot showing the change in temperature at the MacLeish field station in 2015 (Fig. 3.17)
## install and load "macleish" package:
# install.packages("macleish")
library(macleish)
ggplot(data = whately_2015, aes(x = when, y = temperature)) + 
  geom_line(color = "darkgray") + 
  geom_smooth() + 
  xlab(NULL) + 
  ylab("Temperature (degrees Celsius)")


## Creating summary statistics by month for the whately_2015 data frame:
whately_2015 %>%
  mutate(month = as.factor(lubridate::month(when, label = TRUE))) %>%
  group_by(month) %>% 
  skim(temperature) %>%
  select(-na)

## Note we had to pull the Month out of the original data frame's 'when' variable (more about this later)
head(whately_2015)

## boxplot of temperatures by month at the MacLeish field station (Fig. 3.18)
ggplot(
  data = whately_2015, 
  aes(
    x = lubridate::month(when, label = TRUE), 
    y = temperature
  )
) + 
  geom_boxplot() +
  xlab("Month") + 
  ylab("Temperature (degrees Celsius)")


## installing and loading packages:
library(NHANES)
# install.packages("ggmosaic")
library(ggmosaic)
## Creating the mosaic plot with the three variables:
mosaic_to_plot <- NHANES %>%
  filter(Age > 19) %>%
  mutate(AgeDecade = droplevels(AgeDecade)) %>%
  select(AgeDecade, Diabetes, BMI_WHO) %>% 
  na.omit()

## Plotting the Mosaic plot (eikosogram) of diabetes by age and weight status (BMI) (Fig. 3.19)
ggplot(mosaic_to_plot) +
  geom_mosaic(
    aes(x = product(BMI_WHO, AgeDecade), fill = Diabetes)
  ) + 
  ylab("BMI") + 
  xlab("Age (by decade)") + 
  coord_flip()


# install.packages("mosaic")
library(mosaic)

## A choropleth map displaying oil production by countries (Fig. 3.20)
# Country outline
CIACountries %>%
  select(country, oil_prod) %>%
  mutate(oil_prod_disc = cut(oil_prod, 
                             breaks = c(0, 1e3, 1e5, 1e6, 1e7, 1e8), 
                             labels = c(">1000", ">10,000", ">100,000", ">1 million", ">10 million"))) %>%
mosaic::mWorldMap(key = "country") +
  geom_polygon(aes(fill = oil_prod_disc)) + 
  scale_fill_brewer("Oil Prod. (bbl/day)", na.value = "white") +
  theme(legend.position = "top")


## Figure 3.21 code
# An example
library(tidygraph)
library(mdsr)
CellEdges <- Cancer
SmallEdges <- head(CellEdges,200)
#VV <- mdsr::edgesToVertices( SmallEdges, 
#                       from=cellLine, to=otherCellLine )
g <- SmallEdges %>%
  select(cellLine, otherCellLine, correlation) %>%
  as_tbl_graph(directed = FALSE) %>%
  mutate(type = substr(name, 0, 2))

library(ggraph)
ggraph(g, layout = 'kk') +
  geom_edge_arc(aes(width = correlation), color = "lightgray", strength = 0.2) +
  geom_node_point(aes(color = type), size = 10, alpha = 0.6) +
  geom_node_text(aes(label = type)) + 
  scale_edge_width_continuous(range = c(0.1, 1)) +
  guides(color = guide_legend(override.aes = list(size = 6))) + 
  theme_void() + 
  coord_cartesian(clip = "off")


## Extended example
library(babynames)
BabynamesDist <- make_babynames_dist()
BabynamesDist


## ----eval=FALSE---------------------------------------------------------------
## BabynamesDist %>%
##   filter(name == "Benjamin")


## creating a data frame called 'joseph' with only the males Josephs:
joseph <- BabynamesDist %>%
  filter(name == "Joseph" & sex == "M")
name_plot <- ggplot(data = joseph, aes(x = year))  # basic ggplot object (with x-axis) to build on


## adding barplots with estimated number of Josephs (who are presently -- in 2014) alive for each year of birth
name_plot <- name_plot +
  geom_col(
    aes(y = count_thousands * alive_prob), 
    fill = "#b2d7e9", 
    color = "white",
    size = 0.1
  )


## adding line plot with estimated number of Josephs BORN each year
name_plot <- name_plot + 
  geom_line(aes(y = count_thousands), size = 2)


## Adding a y-axis label
name_plot <- name_plot +
  ylab("Number of People (thousands)") + 
  xlab(NULL)


## printing a summary of the ggplot object so far
summary(name_plot)


## Calculating the median year of birth for all the Josephs in the data set.
wtd_quantile <- Hmisc::wtd.quantile  # a weighted quantile function
median_yob <- joseph %>%
  summarize(
    year = wtd_quantile(year, est_alive_today, probs = 0.5)
  ) %>% 
  pull(year)
median_yob


## Putting a blue bar at the median year (and zero-height bars at the other years):
name_plot <- name_plot +
  geom_col(
    color = "white", fill = "#008fd5", 
    aes(y = ifelse(year == median_yob, est_alive_today / 1000, 0))
  )


## Some text to add to plots:
context <- tribble(
  ~year, ~num_people, ~label,
  1935, 40, "Number of Josephs\nborn each year",
  1915, 13, "Number of Josephs\nborn each year
  \nestimated to be alive\non 1/1/2014", 
  2003, 40, "The median\nliving Joseph\nis 37 years old", 
)

## Placing the text created above onto the graph (Fig. 3.22):
name_plot +
  ggtitle("Age Distribution of American Boys Named Joseph") + 
  geom_text(
    data = context, 
    aes(y = num_people, label = label, color = label)
  ) + 
  geom_curve(
    x = 1990, xend = 1974, y = 40, yend = 24, 
    arrow = arrow(length = unit(0.3, "cm")), curvature = 0.5
  ) + 
  scale_color_manual(
    guide = FALSE, 
    values = c("black", "#b2d7e9", "darkgray")
  ) + 
  ylim(0, 42)

## Note the text pieces (related specifically to the name "Joseph") after the 
# name_plot +
## are not part of the name_plot object.  Why?

## This allows us to use the generic "name_plot" object (updating the data argument)
## using the %+% operator to override the default existing data frame
## and to create similar plots for other names:

## For "Josephine" (Fig. 3.23):
name_plot %+% filter(
  BabynamesDist, 
  name == "Josephine" & sex == "F"
)

## Let's try some more:

MyName <- "Robert"
MySex <- "M"
name_plot %+% filter(
  BabynamesDist, 
  name == MyName & sex == MySex
)

## Note the blue bar is still based on the median for the "joseph" data frame, so ignore it.

## For more "unisex" names:

## For "Jessie" (Fig. 3.24):
names_plot <- name_plot + 
  facet_wrap(~sex)
names_plot %+% filter(BabynamesDist, name == "Jessie")


## Gender breakdown for the three most unisex names (Fig. 3.25):
many_names_plot <- name_plot + 
  facet_grid(name ~ sex)
mnp <- many_names_plot %+% filter(
  BabynamesDist, 
  name %in% c("Jessie", "Marion", "Jackie")
)
mnp


## Same plot, but switching rows and columns in facet_grid:
mnp + facet_grid(sex ~ name)


## Finding the 25 most common female names today (as of 2014):
com_fem <- BabynamesDist %>%
  filter(n > 100, sex == "F") %>% 
  group_by(name) %>%
  mutate(wgt = est_alive_today / sum(est_alive_today))

female_name_count <- rev(sort(tapply(com_fem$est_alive_today,com_fem$name,sum)))[1:25]
print(data.frame(female_name_count))

## Finding the 25 most common male names today:
com_m <- BabynamesDist %>%
  filter(n > 100, sex == "M") %>% 
  group_by(name) %>%
  mutate(wgt = est_alive_today / sum(est_alive_today))

male_name_count <- rev(sort(tapply(com_m$est_alive_today,com_m$name,sum)))[1:25]
print(data.frame(male_name_count))

## The book gives a plot based on these that we will not discuss here.