# Example code for Chapter 6, Part 2

library(tidyverse)
library(mdsr)

## Saving a data set as an RDS file:
## saveRDS(mtcars, file = "mtcars.rds", compress = TRUE)
## Reading an RDS file using readRDS:
## mtcars1 <- readRDS("mtcars.rds")

## Showing the basic format of a .csv file:
library(babynames)
write.csv(head(babynames), row.names = FALSE)

## Reading a file directly from a website
mdsr_url <- "https://raw.githubusercontent.com/beanumber/mdsr/master/data-raw/"  # This is the partial URL
houses <- mdsr_url %>%
  paste0("houses-for-sale.csv") %>%  # This file name finishes the URL; paste0() pastes the two parts together as one string
  read_csv()
head(houses, 3)

### Another example of reading .csv and Excel files from external sources:
# install.packages("openxlsx")
library(openxlsx)
library(readxl)

election1 <- openxlsx::read.xlsx(xlsxFile = "https://people.stat.sc.edu/hitchcock/Minneapolis_tidy.xlsx")
# openxlsx::read.xlsx will work for reading from URLs
# election1 <- readxl::read_excel(path = "Z:/Z from MATHSTAT/My Documents/teaching/stat_542/Minneapolis_tidy.xlsx")
# readxl::read_excel will work only for reading from a path name on a local computer
# If you're sure which type of file (.xls or .xlsx) you're reading in, you could use 'read_xls' or 'read_xlsx' directly:
# election1 <- readxl::read_xlsx(path = "Z:/Z from MATHSTAT/My Documents/teaching/stat_542/Minneapolis_tidy.xlsx")
election1

library(readr)
election2 <- read_csv(file = "https://people.stat.sc.edu/hitchcock/Minneapolis_tidy.csv")
# Or could use the base R 'read.csv' function (not as fast with large files):
# election2 <- read.csv(file = "https://people.stat.sc.edu/hitchcock/Minneapolis_tidy.csv")
election2

# If the first row does NOT contain variable names:
# (note that openxlsx::read.xlsx uses the arguments 'xlsxFile' and 'colNames', not 'path' and 'col_names')
election3 <- openxlsx::read.xlsx(xlsxFile = "https://people.stat.sc.edu/hitchcock/Minneapolis_tidy_no_headers.xlsx", colNames = FALSE)
# readxl alternatives (readxl reads local files only, so first download the file from
# https://people.stat.sc.edu/hitchcock/Minneapolis_tidy_no_headers.xlsx):
# election3 <- readxl::read_excel(path = "Minneapolis_tidy_no_headers.xlsx", col_names = FALSE)
# If you're sure which type of file (.xls or .xlsx) you're reading in, you could use 'read_xls' or 'read_xlsx' directly:
# election3 <- readxl::read_xlsx(path = "Minneapolis_tidy_no_headers.xlsx", col_names = FALSE)
election3
# Then we should provide the names in a separate step:
names(election3) <- c("ward", "precinct", "registered", "voters", "absentee", "total_turnout")
election3

election4 <- read_csv(file = "https://people.stat.sc.edu/hitchcock/Minneapolis_tidy_no_headers.csv", col_names = FALSE)
# Or could use the base R 'read.csv' function (not as fast with large files):
# election4 <- read.csv(file = "https://people.stat.sc.edu/hitchcock/Minneapolis_tidy_no_headers.csv", header = FALSE)
election4
# Then we should provide the names in a separate step:
names(election4) <- c("ward", "precinct", "registered", "voters", "absentee", "total_turnout")

rm(election1, election2, election3, election4)

## For files in which the delimiter that separates data values is something other than a comma,
## we can use read_delim in the readr package:
# read_delim(file = "fullpathname.txt", delim = "|")

## There are lots of other options in the read_csv and read_delim functions...
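## A small sketch (not from the book) of a few commonly used read_csv options:
## 'col_types' declares column types up front, 'na' says which strings count as
## missing, and 'n_max' caps the number of rows read. The column names 'ward'
## and 'precinct' are assumed to match the Minneapolis file used above.
election_sample <- read_csv(
  file = "https://people.stat.sc.edu/hitchcock/Minneapolis_tidy.csv",
  col_types = cols(ward = col_integer(), precinct = col_character()),  # assumed column names
  na = c("", "NA"),  # treat empty strings and literal "NA" as missing
  n_max = 10         # read at most 10 data rows
)
election_sample
rm(election_sample)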
## Reading HTML tables from a website
library(rvest)
url <- "http://en.wikipedia.org/wiki/Mile_run_world_record_progression"
tables <- url %>%
  read_html() %>%
  html_nodes("table")

## The resulting object, tables, is a list:
is.list(tables)
## There are 15 tables in the list 'tables':
length(tables)

## Plucking the 3rd of the 15 tables and saving it as 'amateur':
amateur <- tables %>%
  purrr::pluck(3) %>%
  html_table()
print(amateur, n = Inf)

## Printing as an mdsr_table:
head(amateur) %>%
  mdsr_table(caption = "The third table embedded in the Wikipedia page on running records.") %>%
  kableExtra::column_spec(1, width = "3em") %>%
  kableExtra::column_spec(2, width = "10em") %>%
  kableExtra::column_spec(3:4, width = "8em")

## Plucking the 5th of the 15 tables and calling it 'records':
records <- tables %>%
  purrr::pluck(5) %>%  # Changed book's code from 4 to 5 since this is now the 5th table on the webpage
  html_table()
print(records, n = Inf)

## Same, but removing the Auto column, which only has a couple of values:
records <- tables %>%
  purrr::pluck(5) %>%  # Changed book's code from 4 to 5 since this is now the 5th table on the webpage
  html_table() %>%
  select(-Auto)  # -Auto selects all the columns EXCEPT Auto
print(records, n = Inf)

## Printing as an mdsr_table:
head(records) %>%
  mdsr_table(caption = "The fifth table embedded in the Wikipedia page on running records.") %>%  # changed from 'fourth' to 'fifth'
  kableExtra::column_spec(1, width = "3em") %>%
  kableExtra::column_spec(c(2, 4), width = "9em")

# Calculating how much each record reduced the previous one by (in seconds), and how long
# (in days) each record lasted (or has lasted, for the current record):
records_more <- records %>%
  mutate(reduction = c(NA, -diff(as.numeric(lubridate::ms(records$Time))))) %>%
  # 'ms' converts the character column 'Time' into a minutes-seconds object
  mutate(lasted = diff(c(na.omit(lubridate::dmy(unlist(strsplit(records$Date, "[[]")))), today())))
  # 'strsplit' splits each string at a [ character, dropping footnote markers so 'dmy' can parse the date
print(records_more, n = Inf)

# Does how much you break the record by predict how long your record will last?
ggplot(data = records_more, aes(y = parse_number(as.character(lasted)), x = reduction)) +
  geom_point() +
  ylab("Record duration (days)") +
  xlab("Reduction in previous record (seconds)")

# Some useful R packages for dealing with APIs:
# install.packages(c("httr", "jsonlite"))
library(httr)
library(jsonlite)

# install.packages("devtools")
# library(devtools)

## Install and load the gtrendsR package:
# install.packages("gtrendsR")
library(gtrendsR)

## Get web query activity for the keywords NHL, NBA, MLB, MLS in the last hour:
myresult <- gtrends(c("NHL", "NBA", "MLB", "MLS"), time = "now 1-H")
head(myresult)
plot(myresult)
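## A minimal sketch (not from the book) of calling a web API directly with httr
## and jsonlite. The endpoint here is the public GitHub API, used purely as an
## example; any API that returns JSON works the same way.
resp <- httr::GET("https://api.github.com/repos/beanumber/mdsr")
httr::status_code(resp)  # 200 means the request succeeded
repo_info <- jsonlite::fromJSON(httr::content(resp, as = "text", encoding = "UTF-8"))
repo_info$full_name  # a field from the parsed JSON, now an ordinary R object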
## houses table with coded categorical variables:
houses %>%
  select(fuel, heat, sewer, construction) %>%
  head(5)

## The same five rows, printed as an mdsr_table:
houses %>%
  select(fuel, heat, sewer, construction) %>%
  head(5) %>%
  mdsr_table(caption = "Four of the variables from the tables giving features of the Saratoga houses stored as integer codes. Each case is a different house.")

## A table of translations of what the numeric codes for the various categories mean, taken from a .csv file on the web:
translations <- mdsr_url %>%  # The first part of the URL (specified in a previous example)
  paste0("house_codes.csv") %>%  # finishing the URL with the file name
  read_csv()
translations %>% head(15)

## Putting the 'translations' table in wide format and calling it 'codes':
codes <- translations %>%
  pivot_wider(
    names_from = system_type,
    values_from = meaning,
    values_fill = "invalid"
    # Any variable/number combination that is not specified in 'translations' is an invalid code,
    # so we fill the missing cells of 'codes' with the word 'invalid'
  )

## Printing 'codes' as an mdsr_table:
codes %>%
  mdsr_table(caption = "The translations data table rendered in a wide format.")

## Merging the information in the original 'houses' table with the information in the 'codes' table
## converts the numeric codes to more meaningful string values.
## Why do we want a LEFT join here? A left join keeps every row of 'houses',
## even when a code has no match in 'codes', rather than dropping those houses.
houses <- houses %>%
  left_join(
    codes %>% select(code, fuel_type),
    by = c(fuel = "code")
  ) %>%
  left_join(
    codes %>% select(code, heat_type),
    by = c(heat = "code")
  ) %>%
  left_join(
    codes %>% select(code, sewer_type),
    by = c(sewer = "code")
  )
houses %>%
  select(fuel_type, heat_type, sewer_type) %>%
  head()

## Printing as an mdsr_table:
houses %>%
  select(fuel_type, heat_type, sewer_type) %>%
  head() %>%
  mdsr_table(caption = "The Saratoga houses data with re-coded categorical variables.")

## Glimpse a few variables in the ordway_birds data frame:
ordway_birds %>%
  select(Timestamp, Year, Month, Day) %>%
  glimpse()

# Try to calculate the mean year for the data set
# (this fails to give a number because Year is stored as character):
mean(ordway_birds$Year, na.rm = TRUE)

## Using parse_number to extract numeric information from a character string and store it as a numeric column:
library(readr)
ordway_birds <- ordway_birds %>%
  mutate(
    Month = parse_number(Month),
    Year = parse_number(Year),
    Day = parse_number(Day)
  )
ordway_birds %>%
  select(Timestamp, Year, Month, Day) %>%
  glimpse()

# Try to calculate the mean year for the data set now:
mean(ordway_birds$Year, na.rm = TRUE)  # na.rm = TRUE removes missing values before calculating the mean

# Note Timestamp (which has date-time information) is still a character variable, so we can't do mathematical operations on it.
## Use mdy_hms to convert Timestamp to a true date-time (dttm) object called 'When':
library(lubridate)
birds <- ordway_birds %>%
  mutate(When = mdy_hms(Timestamp)) %>%
  select(Timestamp, Year, Month, Day, When, DataEntryPerson)
birds %>% glimpse()

## Now we can plot 'When' on a meaningful numeric axis:
birds %>%
  ggplot(aes(x = When, y = DataEntryPerson)) +
  geom_point(alpha = 0.1, position = "jitter")

## The 'first', 'last', and 'interval' functions can work on date-time values:
bird_summary <- birds %>%
  group_by(DataEntryPerson) %>%
  summarize(
    start = first(When),   # Picks out the first (here, earliest) date-time value for a person
    finish = last(When)    # Picks out the last (here, latest) date-time value for a person
  ) %>%
  mutate(duration = interval(start, finish) / ddays(1))
  # 'interval' computes the time span between two date-time values;
  # dividing by ddays(1) expresses that span in days

# Printing summary table:
bird_summary %>% na.omit()

## Printing as an mdsr_table:
bird_summary %>%
  na.omit() %>%
  mdsr_table(caption = "Starting and ending dates for each transcriber involved in the Ordway Birds project.") %>%
  kableExtra::column_spec(1, width = "10em")
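## A small sketch (not from the book) of how interval() and the d* duration
## helpers interact, using made-up dates:
span <- interval(ymd("2021-01-01"), ymd("2021-03-01"))
span / ddays(1)   # length of the span in days (59)
span / dhours(1)  # the same span in hours (59 * 24 = 1416)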
## Classes of date-time objects:
now()
class(now())
class(as.POSIXlt(now()))
as.POSIXct(now())  # stored internally as the number of seconds since Jan. 1, 1970
as.POSIXlt(now())  # stored internally as a list of date-time components (second, minute, hour, day, month, year, ...)
# Display and usage of the two classes are similar...
now() - birds$When[1]

## A date that does not include a time:
as.Date(now())
# Also: today()

## Converting date-time information stored in a character object into a true date-time object:
library(lubridate)
example <- c("2021-04-29 06:00:00", "2021-12-31 12:00:00")
str(example)
converted <- ymd_hms(example)
str(converted)

# See the difference: subtracting the character vector produces an error,
# while subtracting the converted date-times works:
# now() - example   # error: no date arithmetic is defined for character strings
now() - converted

## Math on date-time values:
converted
converted[2] - converted[1]

## (Book figure at this point: screenshot of Wikipedia's list of Japanese nuclear reactors.)
library(rvest)

## Find the HTML tables on a specific website:
tables <- "https://en.wikipedia.org/wiki/List_of_commercial_nuclear_reactors" %>%  # Had to update the URL from the book's code...
  read_html() %>%
  html_nodes(css = "table")

## Detecting the table(s) that contain the string "Fukushima Daiichi" and saving its/their index number(s) as 'idx':
idx <- tables %>%
  html_text() %>%
  str_detect("Fukushima Daiichi") %>%
  which()

reactors <- tables %>%
  purrr::pluck(idx) %>%
  html_table(fill = TRUE) %>%  # run the first three lines in this block to see the originally imported column names...
  janitor::clean_names() %>%   # run the first four lines in this block to see the effect of the clean_names function...
  rename(
    reactor_type = type,  # Providing more illustrative names to some columns...
    reactor_model = model,
    capacity_net = capacity_mw
  ) %>%
  tail(-1)  # drop the first row
glimpse(reactors)

## Creating a new 'plant_status' variable and
## converting some character columns into date-time values:
reactors <- reactors %>%
  mutate(
    plant_status = ifelse(
      str_detect(status, "Shut down"),
      "Shut down", "Not formally shut down"
    ),
    construct_date = dmy(beginbuilding),  # Had to update some of the column names from the book's code
    operation_date = dmy(commercialoperation),
    closure_date = dmy(closed)
  )
glimpse(reactors)

## Now the data are ready for some statistical analysis, like this symbolic scatterplot with smooth trend curves:
ggplot(
  data = reactors,
  aes(
    x = construct_date, y = capacity_net,
    color = plant_status
  )
) +
  geom_point() +
  geom_smooth() +
  xlab("Date of Plant Construction") +
  ylab("Net Plant Capacity (MW)")
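## A hedged follow-up sketch (not from the book): for plants that have been shut
## down, how long was each in commercial service? 'operation_date' and
## 'closure_date' are the columns created just above; rows where either date
## failed to parse drop out via na.omit().
reactors %>%
  filter(plant_status == "Shut down") %>%
  mutate(service_years = interval(operation_date, closure_date) / dyears(1)) %>%
  select(reactor_type, capacity_net, service_years) %>%
  na.omit() %>%
  arrange(desc(service_years)) %>%
  head()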