### Example Chapter 17 R code: ## Fake names for Table 17.1 in book: fake_names <- tribble( ~Date, ~Last_Name, ~First_Name, ~Address, ~Age, ~Cause_death, "Aug 31, 1854", "Jones", "Thomas", "26 Broad St.", 37, "cholera", "Aug 31, 1854", "Jones", "Mary", "26 Broad St.", 11, "cholera", "Oct 1, 1854", "Warwick", "Martin", "14 Broad St.", 23, "cholera" ) mdsr_table(fake_names, caption = "Hypothetical data from 1854 cholera outbreak.") %>% kableExtra::column_spec(1, width = "6em") %>% kableExtra::column_spec(4, width = "6em") %>% kableExtra::column_spec(6, width = "6em") ## ignore this... # root <- fs::path("data/shp/") ## loading needed packages: library(tidyverse) library(mdsr) library(sf) # A *very* simple and context-free plot of locations of Cholera Deaths in the 1854 epidemic: plot(CholeraDeaths["Count"]) ## The Figure 17.2 from the book shows John Snow's original map of the 1854 Broad Street cholera outbreak. Source: Wikipedia. # knitr::include_graphics("gfx/1280px-Snow-cholera-map-1.jpg") ## Can we recreate this in R? ## First, download and unzip the files in the zipped folder at ## http://rtwilson.com/downloads/SnowGIS_SHP.zip library(sf) # The correct path on your computer will be different, will depend on where you sent the files when you unzipped the folder above. root <- fs::path("D:/Files_from_Orange_Flash_Drive") # Naming the root directory where the files are located dsn <- fs::path(root, "stat542files", "SnowGIS_SHP") # Specifiying the root directory object, plus the next folder name, then the zipped folder name. # listing the files in the directory: list.files(dsn) ## Listing the "layers" (the sets of shapefiles in the directory) st_layers(dsn) ## Loading the "Cholera_Deaths" later into R: CholeraDeaths <- st_read(dsn, layer = "Cholera_Deaths") class(CholeraDeaths) CholeraDeaths # There are 250 rows of the data frame for the 250 locations. # Each location has an ID number and a count of number of cholera deaths at that location. # The "geometry" of each location is connected to a specific Coordinate Reference System (CRS) ## Simple ggplot2 plot of the cholera deaths (a little context here -- latitude/longitude -- but not much) ggplot(CholeraDeaths) + geom_sf() ## An Erroneous reproduction of John Snow's original map of the 1854 cholera outbreak (dots representing the deaths from cholera are off by hundreds of meters) # install and load the "ggspatial", "prettymapr", "raster" packages: # install.packages("ggspatial") library(ggspatial) # install.packages("prettymapr") library(prettymapr) # install.packages("raster") library(raster) ggplot(CholeraDeaths) + annotation_map_tile(type = "osm", zoomin = 0) + geom_sf(aes(size = Count), alpha = 0.7) # Close, but not quite right. Why? ## the bounding box values for the CholeraDeaths object: st_bbox(CholeraDeaths) # The coordinate values in the CholeraDeaths object and those in the map tiles pulled from Open Street Map ("osm") are not in the same units. # The 'annotation_map_tile' function tries to compensate and adjust, but doesn't get it exactly right... ## The world map according to the Mercator and Gall--Peters projections #install.packages("mapproj") library(mapproj) library(maps) map("world", projection = "mercator", wrap = TRUE) dev.new() map("world", projection = "cylequalarea", param = 45, wrap = TRUE) ## The difference is slighter on smaller-scale maps, but still evident: map("state", projection = "mercator", wrap = TRUE) dev.new() map("state", projection = "cylequalarea", param = 45, wrap = TRUE) ## Better? The contiguous United States according to the Lambert conformal conic and Albers equal area projections ## The scales are set to be true on the 20th and 50th parallels map( "state", projection = "lambert", parameters = c(lat0 = 20, lat1 = 50), wrap = TRUE ) dev.new() map( "state", projection = "albers", parameters = c(lat0 = 20, lat1 = 50), wrap = TRUE ) ## Retrieving the WKT (Well-Known Text) version of the Coordinate Reference System for the 'CholeraDeaths' object: st_crs(CholeraDeaths) # Find the model of the earth used: (OSGB36 / British National Grid) # Find the method of encoding: Transverse Mercator # Find the Ellipsoid type and the units of length and angle # The EPSG code for this CRS is 27700 ## Sjhowing different representations of several Coordinate Reference Systems based on their EPSG codes: st_crs(4326)$epsg st_crs(3857)$Wkt st_crs(27700)$proj4string # 'proj4string' is not too interpretable ... ## 'st_transform' function will project the 'CholeraDeaths' object to the EPSG code 4326 cholera_4326 <- CholeraDeaths %>% st_transform(4326) ## We see the bounding box values are now in units of latitude and longitude: st_bbox(cholera_4326) ## The book says this code to reproduce John Snow's original map of the 1854 cholera outbreak is still wrong. ## However, that's based on an older version of the 'sp' package. ## If you're using the latest version, this plot is correct: ggplot(cholera_4326) + annotation_map_tile(type = "osm", zoomin = 0) + geom_sf(aes(size = Count), alpha = 0.7) #### The following code is to "fix" the problem the book noted, but actually, with the newest version, #### it produces the same map as the code above... ## help file for the "spTransform-methods" function in the "rgdal" package: ## help("spTransform-methods", package = "rgdal") ## Key quote from help file: ## "Not providing the appropriate +datum and +towgs84 tags led to coordinates being out by hundreds of metres. ## Unfortunately, there is no easy way to provide this information: the user has to know the correct metadata ## for the data being used, even if this can be hard to discover." ## Note the proj4string for 'CholeraDeaths' has no +datum and +towgs84 tags: st_crs(CholeraDeaths)$proj4string ## Setting the CRS to be 27700 (recommended EPSG for these data) and then transforming to EPSG 4326: cholera_latlong <- CholeraDeaths %>% st_set_crs(27700) %>% st_transform(4326) snow <- ggplot(cholera_latlong) + annotation_map_tile(type = "osm", zoomin = 0) + geom_sf(aes(size = Count)) snow # printing the map object that we saved as 'snow' ## Accurate Recreation of John Snow's original map of the 1854 cholera outbreak, with the location of the water pumps added. pumps <- st_read(dsn, layer = "Pumps") pumps_latlong <- pumps %>% st_set_crs(27700) %>% st_transform(4326) snow + geom_sf(data = pumps_latlong, size = 3, color = "red") ## The size of each black dot is proportional to the number of people who died from cholera at that location. ## The red dots indicate the location of public water pumps. ## The strong clustering of deaths around the water pump on Broad(wick) Street suggests that perhaps the cholera was spread through water obtained at that pump. ## This was a famous early historical example of the use of statistics and graphics in epidemiology, which eventually saved lives! # install.packages("tidygeocoder") library(tidygeocoder) ## creating a tibble with the location of the White House: white_house <- tibble( address = "The White House, Washington, DC" ) %>% tidygeocoder::geocode(address, method = "osm") head(white_house) # install.packages("leaflet") library(leaflet) white_house_map <- leaflet() %>% addTiles() %>% addMarkers(data = white_house) white_house_map # Nice interactive map is the result... ## Adding a popup to the White House map: white_house <- white_house %>% mutate( title = "The White House", street_address = "1600 Pennsylvania Ave" ) white_house_map %>% addPopups( data = white_house, popup = ~paste0("", title, "
", street_address) ) ## ----eval=TRUE, message=FALSE------------------------------------------------- library(fec12) # If having difficulty with the 'fec12' package, can get the data frames needed for this example by: # results_house <- as_tibble(read.table(file="https://people.stat.sc.edu/hitchcock/results_house.txt", header=T)) # candidates <- as_tibble(read.table(file="https://people.stat.sc.edu/hitchcock/candidates.txt", header=T)) ## Counting the number of congressional (House) elections: 435, plus a few from Puerto Rico, US Virgin Islands, etc. results_house %>% group_by(state, district_id) %>% summarize(N = n()) %>% nrow() ## All the candidates in every district, ordered by number of votes -- this is a big table! results_house %>% left_join(candidates, by = "cand_id") %>% dplyr::select(state, district_id, cand_name, party, general_votes) %>% # had to specify the 'dplyr' package here to avoid error... arrange(desc(general_votes)) ## Creating a 'district_elections' data frame with several new variables district_elections <- results_house %>% mutate(district = parse_number(district_id)) %>% group_by(state, district) %>% summarize( N = n(), total_votes = sum(general_votes, na.rm = TRUE), d_votes = sum(ifelse(party == "D", general_votes, 0), na.rm = TRUE), r_votes = sum(ifelse(party == "R", general_votes, 0), na.rm = TRUE) ) %>% mutate( other_votes = total_votes - d_votes - r_votes, r_prop = r_votes / total_votes, winner = ifelse(r_votes > d_votes, "Republican", "Democrat") ) ## Creating a 'nc_results' data frame with ONLY the North Carolina districts: nc_results <- district_elections %>% filter(state == "NC") nc_results %>% dplyr::select(-state) ## the 'skim' function here gives a summary of the 'total_votes' variable across the 13 districts ## (not much variability in total votes across districts): nc_results %>% skim(total_votes) %>% dplyr::select(-na) ## The summary of the vote totals and partywise proportions for the whole state of NC, not by district: nc_results %>% summarize( N = n(), state_votes = sum(total_votes), state_d = sum(d_votes), state_r = sum(r_votes) ) %>% mutate( d_prop = state_d / state_votes, r_prop = state_r / state_votes ) ## Ordering the NC districts by proportion of Republican votes: nc_results %>% dplyr::select(district, r_prop, winner) %>% arrange(desc(r_prop)) # Which party won more of the "competitive" districts? ## One approach to getting the files from this zipped folder into R... ## src <- "http://cdmaps.polisci.ucla.edu/shp/districts113.zip" ## dsn_districts <- usethis::use_zip(src, destdir = fs::path("data_large")) ## Another way: ## First, download and unzip the files in the zipped folder at ## http://cdmaps.polisci.ucla.edu/shp/districts113.zip library(sf) # The correct path on your computer will be different, will depend on where you sent the files when you unzipped the folder above. root <- fs::path("D:/Files_from_Orange_Flash_Drive") # Naming the root directory where the files are located dsn_districts <- fs::path(root, "stat542files", "districts113", "districtShapes") # Specifying the root directory object, plus the next folder name, then the zipped folder name. ## Reading the shapefiles into R as an 'sf' object, which is a data frame: library(sf) st_layers(dsn_districts) districts <- st_read(dsn_districts, layer = "districts113") %>% mutate(DISTRICT = parse_number(as.character(DISTRICT))) glimpse(districts) ## This contains many states... ## We just want the NC districts, so we filter on "North Carolina": nc_shp <- districts %>% filter(STATENAME == "North Carolina") # Creating a basic map of the NC districts: nc_shp %>% st_geometry() %>% plot(col = gray.colors(nrow(nc_shp))) ## Merging the NC election results with the NC map information: ## The key column here is "DISTRICT" = "district" nc_merged <- nc_shp %>% st_transform(4326) %>% inner_join(nc_results, by = c("DISTRICT" = "district")) glimpse(nc_merged) ## Making a Bichromatic choropleth map of the results of the 2012 congressional elections in North Carolina ## We fill with colors that are blue or red according to the value of the "winner" column in the 'nc_merged' table: nc <- ggplot(data = nc_merged, aes(fill = winner)) + annotation_map_tile(zoom = 6, type = "osm") + geom_sf(alpha = 0.5) + scale_fill_manual("Winner", values = c("blue", "red")) + geom_sf_label(aes(label = DISTRICT), fill = "white") + theme_void() nc ## Full color choropleth map of the results of the 2012 congressional elections in North Carolina. ## The blue and red here are shaded according to the proportion of votes for the winning party in that district: nc + aes(fill = r_prop) + scale_fill_distiller( "Proportion\nRepublican", palette = "RdBu", limits = c(0.2, 0.8) ) ## Book: "The clustering of Democratic voters is evident from the deeper blue in Democratic districts, versus the pale red in the more numerous Republican districts." ### An interactive plot with 'leaflet': ## Setting up a color palette with a range of colors from red to blue: library(leaflet) pal <- colorNumeric(palette = "RdBu", domain = c(0, 1)) ## Creating a similar leaflet plot: leaflet_nc <- leaflet(nc_merged) %>% addTiles() %>% addPolygons( weight = 1, fillOpacity = 0.7, color = ~pal(1 - r_prop), popup = ~paste("District", DISTRICT, "
", round(r_prop, 4)) ) %>% setView(lng = -80, lat = 35, zoom = 7) leaflet_nc # Try panning and zooming ... ## This generalizes the display to either save the map as a .png file or to display it in a web browser: if (knitr::is_latex_output()) { # mdsr::save_webshot(leaflet_nc, path_to_img = "gfx/leaflet-nc.png") knitr::include_graphics("gfx/leaflet-nc.png") } else { leaflet_nc } ### Example with ALL congressional district results, not just North Carolina: ## Need to connect the state names with the state abbreviations across data sets: districts_full <- districts %>% left_join( tibble(state.abb, state.name), # These two character vectors are built into R... by = c("STATENAME" = "state.name") ) %>% left_join( district_elections, by = c("state.abb" = "state", "DISTRICT" = "district") ) ## Setting up a bounding box and preparing a world map that specifically shows North America: box <- st_bbox(districts_full) world <- map_data("world") %>% st_as_sf(coords = c("long", "lat")) %>% group_by(group) %>% summarize(region = first(region), do_union = FALSE) %>% st_cast("POLYGON") %>% st_set_crs(4269) ## Filling districts with red or blue according to the proportion of votes in the U.S. congressional elections of 2012: ## (Note this uses the Mercator projection) map_4269 <- ggplot(data = districts_full) + geom_sf(data = world, size = 0.1) + geom_sf(aes(fill = r_prop), size = 0.1) + scale_fill_distiller(palette = "RdBu", limits = c(0, 1)) + theme_void() + labs(fill = "Proportion\nRepublican") + xlim(-180, -50) + ylim(box[c("ymin", "ymax")]) map_4269 # Works, but the projection is inaccurate ... dev.new() ## Filling districts with red or blue according to the proportion of votes in the U.S. congressional elections of 2012: ## (Note this uses the Albers equal area projection) districts_aea <- districts_full %>% st_transform(5070) box <- st_bbox(districts_aea) map_4269 %+% districts_aea + xlim(box[c("xmin", "xmax")]) + ylim(box[c("ymin", "ymax")]) # More accurate projection? ## This code can write an 'sf' object to a KML (Keyhole Markup Language) file, which can then be read by Google Earth: ## nc_merged %>% ## st_transform(4326) %>% ## st_write("/tmp/nc_congress113.kml", driver = "kml") ## A Google Earth representation of the NC district results map: knitr::include_graphics("gfx/googleearth-nc.png")