## ----cache=FALSE, echo=FALSE,include=FALSE------------------------------------ knitr::opts_chunk$set(error = TRUE) ## ----mdsr, message=FALSE------------------------------------------------------ library(mdsr) library(tidyverse) # Creating the CIACountries data frame: CIACountries %>% select(-area, -pop) %>% head() %>% mdsr_table(caption = "A selection of variables from the first six rows of the CIA countries data table.") %>% kableExtra::column_spec(1, width = "10em") # Printing the first few rows of the CIACountries data frame: head(CIACountries) # Simple scatterplot using geom_point (Fig. 3.1): g <- ggplot(data = CIACountries, aes(y = gdp, x = educ)) g + geom_point(size = 3) # Try changing the size value # The warning message is because of missing values in either gdp or educ, or both, for some countries # Addition of information on a third variable (categorical) using the Color aesthetic (Fig. 3.2): g + geom_point(aes(color = net_users), size = 3) # Replacing the plotting characters with country names (Fig. 3.3): g + geom_text(aes(label = country, color = net_users), size = 3) #Shorter (but less descriptive?) names using abbreviate: g + geom_text(aes(label = abbreviate(country), color = net_users), size = 3) # Adding information on a fourth variable using the size of the bubbles (Fig. 3.4): g + geom_point(aes(color = net_users, size = roadways)) # Plotting the y-axis in log10 scale (Fig. 3.5): g + geom_point(aes(color = net_users, size = roadways)) + coord_trans(y = "log10") # A different way to present y-axis on log scale, and a better y-axis label (Fig. 3.6): g + geom_point(aes(color = net_users, size = roadways)) + scale_y_continuous( name = "Gross Domestic Product", trans = "log10", labels = scales::comma ) # Using facets rather than colors to separate the different levels of net_users (Fig. 3.7): g + geom_point(alpha = 0.9, aes(size = roadways)) + coord_trans(y = "log10") + facet_wrap(~net_users, nrow = 1) + theme(legend.position = "top") # What are pros and cons of using facets vs. colors here? # A simple example of facet_grid to show a grid of facets for two categorical variables: # Based on the 'mpg' dataset: head(mpg) # creating a simple ggplot object for a scatterplot of city gas mileage against displacement p <- ggplot(mpg, aes(displ, cty)) + geom_point() # Use vars() to supply variables from the dataset: # grid of scatterplots for combinations of type of drive (rows) and number of cylinders (columns) p + facet_grid(vars(drv), vars(cyl)) # grid of scatterplots for combinations of class of vehicle (rows) and number of cylinders (columns) p + facet_grid(vars(class), vars(cyl)) ## Obtaining the ChargesNJ data frame from a larger data frame, MedicareCharges: ChargesNJ <- MedicareCharges %>% filter(stateProvider == "NJ") ## ----drg-NJ, echo=FALSE,warning=FALSE, message=FALSE, results='asis'---------- set.seed(101) ChargesNJ %>% head(10) %>% mdsr_table(caption = "Glyph-ready data for the barplot layer.") # First few observations of ChargesNJ data frame: head(ChargesNJ) ## Bar graph of average charges for various medical procedures in New Jersey (Fig. 3.8) p <- ggplot( data = ChargesNJ, aes(x = reorder(drg, mean_charge), y = mean_charge) ) + geom_col(fill = "gray") + ylab("Statewide Average Charges ($)") + xlab("Medical Procedure (DRG)") + theme(axis.text.x = element_text(angle = 90, hjust = 1, size = rel(0.5))) p # Note the first group of lines of code create the graphics object p, # but we have to type p to display the graph here ## Bar graph with second layer to compare New Jersey to other states. Each dot represents one state, while the bars represent New Jersey (Fig 3.9) p + geom_point(data = MedicareCharges, size = 1, alpha = 0.3) # Note than MedicareCharges data frame has data from ALL the states... # Maybe we should make the dots exclude the NJ values: MedicareChargesNoNJ <- MedicareCharges %>% filter(stateProvider != "NJ") p + geom_point(data = MedicareChargesNoNJ, size = 1, alpha = 0.3) ## Creates basic ggplot object that we can then build on: g <- ggplot(data = SAT_2010, aes(x = math)) ## Histogram of math SAT scores by state (Fig. 3.10) g + geom_histogram(binwidth = 10) + labs(x = "Average Math SAT score") ## Trying different bin widths: g + geom_histogram(binwidth = 6) + labs(x = "Average Math SAT score") g + geom_histogram(binwidth = 14) + labs(x = "Average Math SAT score") # Which looks better for this data set? # Could also specify the number of bins directly: g + geom_histogram(bins = 8) + labs(x = "Average Math SAT score") ## Density plot of average math SAT scores by state (Fig. 3.11) g + geom_density(adjust = 0.3) # Change the "adjust" argument to get a smoother or more wiggly density estimate: # lower bandwidth = more wiggly estimate g + geom_density(adjust = 0.1) # higher bandwidth = smoother estimate g + geom_density(adjust = 0.9) ## bar plot of average math SAT scores for a selection of states (Fig. 3.12) ggplot( data = head(SAT_2010, 10), aes(x = reorder(state, math), y = math) ) + geom_col() + labs(x = "State", y = "Average Math SAT score") # Installing (if needed) and loading the mosaicData package: # install.packages("mosaicData") library(mosaicData) ## Stacked Bar Plot with Color (Fig. 3.13) ggplot(data = mosaicData::HELPrct, aes(x = homeless)) + geom_bar(aes(fill = substance), position = "fill") + scale_fill_brewer(palette = "Spectral") + coord_flip() # Using 'fct_relevel' to reorder the levels of the factor 'substance' before redoing the plot: HELPrct_alt <- mosaicData::HELPrct %>% mutate(substance = fct_relevel(substance, c('heroin','cocaine','alcohol'))) ggplot(data = HELPrct_alt, aes(x = homeless)) + geom_bar(aes(fill = substance), position = "fill") + scale_fill_brewer(palette = "Spectral") + coord_flip() ## Stacked VERTICAL Bar Plot with Color (like Fig. 3.13 but without flipping the coordinates) ggplot(data = mosaicData::HELPrct, aes(x = homeless)) + geom_bar(aes(fill = substance), position = "fill") + scale_fill_brewer(palette = "Spectral") ## Note the difference between position="stack" (shows counts) and position="fill" (shows proportions): ggplot(data = mosaicData::HELPrct, aes(x = homeless)) + geom_bar(aes(fill = substance), position = "stack") + scale_fill_brewer(palette = "Spectral") ## Creating a basic ggplot object g <- ggplot( data = SAT_2010, aes(x = expenditure, y = math) ) + geom_point() #plotting it: g ## Plotting it as a scatterplot with a trend line: g <- g + geom_smooth(method = "lm", se = FALSE) + # Note "lm" will show a LINEAR trend xlab("Average expenditure per student ($1000)") + ylab("Average score on math SAT") g ## Creating categories ("low", "medium", "high") of SAT_2010 with the 'cut' function: SAT_2010 <- SAT_2010 %>% mutate( SAT_rate = cut( sat_pct, breaks = c(0, 30, 60, 100), labels = c("low", "medium", "high") ) ) g <- g %+% SAT_2010 ## Plotting the symbolic scatterplot with SAT categories separated by color (Fig. 3.14): g + aes(color = SAT_rate) ## Plotting 3 separate scatterplots for the SAT categories (Fig. 3.15): g + facet_wrap(~ SAT_rate) ## install and load "NHANES" package: # install.packages("NHANES") library(NHANES) # Scatterplot of height vs. age, separated in Color by Gender, with smooth trend curves (NONlinear here) added (Fig. 3.16): ggplot( data = slice_sample(NHANES, n = 1000), aes(x = Age, y = Height, color = fct_relevel(Gender, "male")) #reorders Gender levels so "male" is first ) + geom_point() + geom_smooth() + xlab("Age (years)") + ylab("Height (cm)") + labs(color = "Gender") ## time series plot showing the change in temperature at the MacLeish field station in 2015 (Fig. 3.17) ## install and load "macleish" package: # install.packages("macleish") library(macleish) ggplot(data = whately_2015, aes(x = when, y = temperature)) + geom_line(color = "darkgray") + geom_smooth() + xlab(NULL) + ylab("Temperature (degrees Celsius)") ## Creating summary statistics by month for the whately_2015 data frame: whately_2015 %>% mutate(month = as.factor(lubridate::month(when, label = TRUE))) %>% group_by(month) %>% skim(temperature) %>% select(-na) ## Note we had to pull the Month out of the original data frame's 'when' variable (more about this later) head(whately_2015) ## boxplot of temperatures by month at the MacLeish field station (Fig. 3.18) ggplot( data = whately_2015, aes( x = lubridate::month(when, label = TRUE), y = temperature ) ) + geom_boxplot() + xlab("Month") + ylab("Temperature (degrees Celsius)") ## installing and loading packages: library(NHANES) # install.packages("ggmosaic") library(ggmosaic) ## Creating the mosaic plot with the three variables: mosaic_to_plot <- NHANES %>% filter(Age > 19) %>% mutate(AgeDecade = droplevels(AgeDecade)) %>% select(AgeDecade, Diabetes, BMI_WHO) %>% na.omit() ## Plotting the Mosaic plot (eikosogram) of diabetes by age and weight status (BMI) (Fig. 3.19) ggplot(mosaic_to_plot) + geom_mosaic( aes(x = product(BMI_WHO, AgeDecade), fill = Diabetes) ) + ylab("BMI") + xlab("Age (by decade)") + coord_flip() # install.packages("mosaic") library(mosaic) ## A choropleth map displaying oil production by countries (Fig. 3.20) # Country outline CIACountries %>% select(country, oil_prod) %>% mutate(oil_prod_disc = cut(oil_prod, breaks = c(0, 1e3, 1e5, 1e6, 1e7, 1e8), labels = c(">1000", ">10,000", ">100,000", ">1 million", ">10 million"))) %>% mosaic::mWorldMap(key = "country") + geom_polygon(aes(fill = oil_prod_disc)) + scale_fill_brewer("Oil Prod. (bbl/day)", na.value = "white") + theme(legend.position = "top") ## Figure 3.21 code # An example library(tidygraph) library(mdsr) CellEdges <- Cancer SmallEdges <- head(CellEdges,200) #VV <- mdsr::edgesToVertices( SmallEdges, # from=cellLine, to=otherCellLine ) g <- SmallEdges %>% select(cellLine, otherCellLine, correlation) %>% as_tbl_graph(directed = FALSE) %>% mutate(type = substr(name, 0, 2)) library(ggraph) ggraph(g, layout = 'kk') + geom_edge_arc(aes(width = correlation), color = "lightgray", strength = 0.2) + geom_node_point(aes(color = type), size = 10, alpha = 0.6) + geom_node_text(aes(label = type)) + scale_edge_width_continuous(range = c(0.1, 1)) + guides(color = guide_legend(override.aes = list(size = 6))) + theme_void() + coord_cartesian(clip = "off") ## Extended example library(babynames) BabynamesDist <- make_babynames_dist() BabynamesDist ## ----eval=FALSE--------------------------------------------------------------- ## BabynamesDist %>% ## filter(name == "Benjamin") ## creating a data frame called 'joseph' with only the males Josephs: joseph <- BabynamesDist %>% filter(name == "Joseph" & sex == "M") name_plot <- ggplot(data = joseph, aes(x = year)) # basic ggplot object (with x-axis) to build on ## adding barplots with estimated number of Josephs (who are presently -- in 2014) alive for each year of birth name_plot <- name_plot + geom_col( aes(y = count_thousands * alive_prob), fill = "#b2d7e9", color = "white", size = 0.1 ) ## adding line plot with estimated number of Josephs BORN each year name_plot <- name_plot + geom_line(aes(y = count_thousands), size = 2) ## Adding a y-axis label name_plot <- name_plot + ylab("Number of People (thousands)") + xlab(NULL) ## printing a summary of the ggplot object so far summary(name_plot) ## Calculating the median year of birth for all the Josephs in the data set. wtd_quantile <- Hmisc::wtd.quantile # a weighted quantile function median_yob <- joseph %>% summarize( year = wtd_quantile(year, est_alive_today, probs = 0.5) ) %>% pull(year) median_yob ## Putting a blue bar at the median year (and zero-height bars at the other years): name_plot <- name_plot + geom_col( color = "white", fill = "#008fd5", aes(y = ifelse(year == median_yob, est_alive_today / 1000, 0)) ) ## Some text to add to plots: context <- tribble( ~year, ~num_people, ~label, 1935, 40, "Number of Josephs\nborn each year", 1915, 13, "Number of Josephs\nborn each year \nestimated to be alive\non 1/1/2014", 2003, 40, "The median\nliving Joseph\nis 37 years old", ) ## Placing the text created above onto the graph (Fig. 3.22): name_plot + ggtitle("Age Distribution of American Boys Named Joseph") + geom_text( data = context, aes(y = num_people, label = label, color = label) ) + geom_curve( x = 1990, xend = 1974, y = 40, yend = 24, arrow = arrow(length = unit(0.3, "cm")), curvature = 0.5 ) + scale_color_manual( guide = FALSE, values = c("black", "#b2d7e9", "darkgray") ) + ylim(0, 42) ## Note the text pieces (related specifically to the name "Joseph") after the # name_plot + ## are not part of the name_plot object. Why? ## This allows us to use the generic "name_plot" object (updating the data argument) ## using the %+% operator to override the default existing data frame ## and to create similar plots for other names: ## For "Josephine" (Fig. 3.23): name_plot %+% filter( BabynamesDist, name == "Josephine" & sex == "F" ) ## Let's try some more: MyName <- "Robert" MySex <- "M" name_plot %+% filter( BabynamesDist, name == MyName & sex == MySex ) ## Note the blue bar is still based on the median for the "joseph" data frame, so ignore it. ## For more "unisex" names: ## For "Jessie" (Fig. 3.24): names_plot <- name_plot + facet_wrap(~sex) names_plot %+% filter(BabynamesDist, name == "Jessie") ## Gender breakdown for the three most unisex names (Fig. 3.25): many_names_plot <- name_plot + facet_grid(name ~ sex) mnp <- many_names_plot %+% filter( BabynamesDist, name %in% c("Jessie", "Marion", "Jackie") ) mnp ## Same plot, but switching rows and columns in facet_grid: mnp + facet_grid(sex ~ name) ## Finding the 25 most common female names today (as of 2014): com_fem <- BabynamesDist %>% filter(n > 100, sex == "F") %>% group_by(name) %>% mutate(wgt = est_alive_today / sum(est_alive_today)) female_name_count <- rev(sort(tapply(com_fem$est_alive_today,com_fem$name,sum)))[1:25] print(data.frame(female_name_count)) ## Finding the 25 most common male names today: com_m <- BabynamesDist %>% filter(n > 100, sex == "M") %>% group_by(name) %>% mutate(wgt = est_alive_today / sum(est_alive_today)) male_name_count <- rev(sort(tapply(com_m$est_alive_today,com_m$name,sum)))[1:25] print(data.frame(male_name_count)) ## The book gives a plot based on these that we will not discuss here.