# Example code for Chapter 6, Part 2

library(tidyverse)
library(mdsr)

## Saving a data set as an RDS file:
## saveRDS(mtcars, file = "mtcars.rds", compress = TRUE)
## Reading an RDS file using readRDS:
## mtcars1 <- readRDS("mtcars.rds")

## Showing the basic format of a .csv file:
library(babynames)
write.csv(head(babynames), row.names = FALSE)

## Reading a file directly from a website
mdsr_url <- "https://raw.githubusercontent.com/beanumber/mdsr/master/data-raw/"  # This is the partial URL
houses <- mdsr_url %>%
  paste0("houses-for-sale.csv") %>%  # This file name finishes the URL; paste0() pastes the two parts together as one string
  read_csv()
head(houses, 3)

### Another example of reading .csv and Excel files from external sources:
# install.packages("openxlsx")
library(openxlsx)
library(readxl)

election1 <- openxlsx::read.xlsx(xlsxFile = "https://people.stat.sc.edu/hitchcock/Minneapolis_tidy.xlsx")
# openxlsx::read.xlsx will work for reading from URLs
# election1 <- readxl::read_excel(path = "Z:/Z from MATHSTAT/My Documents/teaching/stat_542/Minneapolis_tidy.xlsx")
# readxl::read_excel will work only for reading from a path name on a local computer
# If you're sure which type of file (.xls or .xlsx) you're reading in, you could use 'read_xls' or 'read_xlsx' directly:
# election1 <- readxl::read_xlsx(path = "Z:/Z from MATHSTAT/My Documents/teaching/stat_542/Minneapolis_tidy.xlsx")
election1

library(readr)
election2 <- read_csv(file = "https://people.stat.sc.edu/hitchcock/Minneapolis_tidy.csv")
# Or could use the base R 'read.csv' function (not as fast with large files):
# election2 <- read.csv(file = "https://people.stat.sc.edu/hitchcock/Minneapolis_tidy.csv")
election2

# If the first row does NOT contain variable names:
# (note that openxlsx::read.xlsx uses the arguments 'xlsxFile' and 'colNames', not 'path' and 'col_names')
election3 <- openxlsx::read.xlsx(xlsxFile = "https://people.stat.sc.edu/hitchcock/Minneapolis_tidy_no_headers.xlsx", colNames = FALSE)
# readxl alternatives (readxl reads local files only, so first download the file from
# https://people.stat.sc.edu/hitchcock/Minneapolis_tidy_no_headers.xlsx):
# election3 <- readxl::read_excel(path = "Minneapolis_tidy_no_headers.xlsx", col_names = FALSE)
# If you're sure which type of file (.xls or .xlsx) you're reading in, you could use 'read_xls' or 'read_xlsx' directly:
# election3 <- readxl::read_xlsx(path = "Minneapolis_tidy_no_headers.xlsx", col_names = FALSE)
election3
# Then we should provide the names in a separate step:
names(election3) <- c("ward", "precinct", "registered", "voters", "absentee", "total_turnout")
election3

election4 <- read_csv(file = "https://people.stat.sc.edu/hitchcock/Minneapolis_tidy_no_headers.csv", col_names = FALSE)
# Or could use the base R 'read.csv' function (not as fast with large files):
# election4 <- read.csv(file = "https://people.stat.sc.edu/hitchcock/Minneapolis_tidy_no_headers.csv", header = FALSE)
election4
# Then we should provide the names in a separate step:
names(election4) <- c("ward", "precinct", "registered", "voters", "absentee", "total_turnout")

rm(election1, election2, election3, election4)

## For files in which the delimiter that separates data values is something other than a comma,
## we can use read_delim in the readr package:
# read_delim(file = "fullpathname.txt", delim = "|")

## There are lots of other options in the read_csv and read_delim functions...
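## A small sketch (not from the book) of a few commonly used read_csv options:
## 'col_types' declares column types up front, 'na' says which strings count as
## missing, and 'n_max' caps the number of rows read. The column names 'ward'
## and 'precinct' are assumed to match the Minneapolis file used above.
election_sample <- read_csv(
  file = "https://people.stat.sc.edu/hitchcock/Minneapolis_tidy.csv",
  col_types = cols(ward = col_integer(), precinct = col_character()),  # assumed column names
  na = c("", "NA"),  # treat empty strings and literal "NA" as missing
  n_max = 10         # read at most 10 data rows
)
election_sample
rm(election_sample)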
## Reading HTML tables from a website
library(rvest)
url <- "http://en.wikipedia.org/wiki/Mile_run_world_record_progression"
tables <- url %>%
  read_html() %>%
  html_nodes("table")

## The resulting object, tables, is a list:
is.list(tables)
## There are 15 tables in the list 'tables':
length(tables)

## Plucking the 3rd of the 15 tables and saving it as 'amateur':
amateur <- tables %>%
  purrr::pluck(3) %>%
  html_table()
print(amateur, n = Inf)

## Printing as an mdsr_table:
head(amateur) %>%
  mdsr_table(caption = "The third table embedded in the Wikipedia page on running records.") %>%
  kableExtra::column_spec(1, width = "3em") %>%
  kableExtra::column_spec(2, width = "10em") %>%
  kableExtra::column_spec(3:4, width = "8em")

## Plucking the 5th of the 15 tables and calling it 'records':
records <- tables %>%
  purrr::pluck(5) %>%  # Changed book's code from 4 to 5 since this is now the 5th table on the webpage
  html_table()
print(records, n = Inf)

## Same, but removing the Auto column, which only has a couple of values:
records <- tables %>%
  purrr::pluck(5) %>%  # Changed book's code from 4 to 5 since this is now the 5th table on the webpage
  html_table() %>%
  select(-Auto)  # -Auto selects all the columns EXCEPT Auto
print(records, n = Inf)

## Printing as an mdsr_table:
head(records) %>%
  mdsr_table(caption = "The fifth table embedded in the Wikipedia page on running records.") %>%  # changed from 'fourth' to 'fifth'
  kableExtra::column_spec(1, width = "3em") %>%
  kableExtra::column_spec(c(2, 4), width = "9em")

# Calculating how much each record reduced the previous one by (in seconds), and how long
# (in days) each record lasted (or has lasted, for the current record):
records_more <- records %>%
  mutate(reduction = c(NA, -diff(as.numeric(lubridate::ms(records$Time))))) %>%
  # 'ms' converts the character column 'Time' into a minutes-seconds object
  mutate(lasted = diff(c(na.omit(lubridate::dmy(unlist(strsplit(records$Date, "[[]")))), today())))
  # 'strsplit' splits each string at a [ character, dropping footnote markers so 'dmy' can parse the date
print(records_more, n = Inf)

# Does how much you break the record by predict how long your record will last?
ggplot(data = records_more, aes(y = parse_number(as.character(lasted)), x = reduction)) +
  geom_point() +
  ylab("Record duration (days)") +
  xlab("Reduction in previous record (seconds)")

# Some useful R packages for dealing with APIs:
# install.packages(c("httr", "jsonlite"))
library(httr)
library(jsonlite)

# install.packages("devtools")
# library(devtools)

## Install and load the gtrendsR package:
# install.packages("gtrendsR")
library(gtrendsR)

## Get web query activity for the keywords NHL, NBA, MLB, MLS in the last hour:
myresult <- gtrends(c("NHL", "NBA", "MLB", "MLS"), time = "now 1-H")
head(myresult)
plot(myresult)
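## A minimal sketch (not from the book) of calling a web API directly with httr
## and jsonlite. The endpoint here is the public GitHub API, used purely as an
## example; any API that returns JSON works the same way.
resp <- httr::GET("https://api.github.com/repos/beanumber/mdsr")
httr::status_code(resp)  # 200 means the request succeeded
repo_info <- jsonlite::fromJSON(httr::content(resp, as = "text", encoding = "UTF-8"))
repo_info$full_name  # a field from the parsed JSON, now an ordinary R object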
## houses table with coded categorical variables:
houses %>%
  select(fuel, heat, sewer, construction) %>%
  head(5)

## The same five rows, printed as an mdsr_table:
houses %>%
  select(fuel, heat, sewer, construction) %>%
  head(5) %>%
  mdsr_table(caption = "Four of the variables from the tables giving features of the Saratoga houses stored as integer codes. Each case is a different house.")

## A table of translations of what the numeric codes for the various categories mean, taken from a .csv file on the web:
translations <- mdsr_url %>%  # The first part of the URL (specified in a previous example)
  paste0("house_codes.csv") %>%  # finishing the URL with the file name
  read_csv()
translations %>% head(15)

## Putting the 'translations' table in wide format and calling it 'codes':
codes <- translations %>%
  pivot_wider(
    names_from = system_type,
    values_from = meaning,
    values_fill = "invalid"
    # Any variable/number combination that is not specified in 'translations' is an invalid code,
    # so we fill the missing cells of 'codes' with the word 'invalid'
  )

## Printing 'codes' as an mdsr_table:
codes %>%
  mdsr_table(caption = "The translations data table rendered in a wide format.")

## Merging the information in the original 'houses' table with the information in the 'codes' table
## converts the numeric codes to more meaningful string values.
## Why do we want a LEFT join here? A left join keeps every row of 'houses',
## even when a code has no match in 'codes', rather than dropping those houses.
houses <- houses %>%
  left_join(
    codes %>% select(code, fuel_type),
    by = c(fuel = "code")
  ) %>%
  left_join(
    codes %>% select(code, heat_type),
    by = c(heat = "code")
  ) %>%
  left_join(
    codes %>% select(code, sewer_type),
    by = c(sewer = "code")
  )
houses %>%
  select(fuel_type, heat_type, sewer_type) %>%
  head()

## Printing as an mdsr_table:
houses %>%
  select(fuel_type, heat_type, sewer_type) %>%
  head() %>%
  mdsr_table(caption = "The Saratoga houses data with re-coded categorical variables.")

## Glimpse a few variables in the ordway_birds data frame:
ordway_birds %>%
  select(Timestamp, Year, Month, Day) %>%
  glimpse()

# Try to calculate the mean year for the data set
# (this fails to give a number because Year is stored as character):
mean(ordway_birds$Year, na.rm = TRUE)

## Using parse_number to extract numeric information from a character string and store it as a numeric column:
library(readr)
ordway_birds <- ordway_birds %>%
  mutate(
    Month = parse_number(Month),
    Year = parse_number(Year),
    Day = parse_number(Day)
  )
ordway_birds %>%
  select(Timestamp, Year, Month, Day) %>%
  glimpse()

# Try to calculate the mean year for the data set now:
mean(ordway_birds$Year, na.rm = TRUE)  # na.rm = TRUE removes missing values before calculating the mean

# Note Timestamp (which has date-time information) is still a character variable, so we can't do mathematical operations on it.
## Use mdy_hms to convert Timestamp to a true date-time (dttm) object called 'When':
library(lubridate)
birds <- ordway_birds %>%
  mutate(When = mdy_hms(Timestamp)) %>%
  select(Timestamp, Year, Month, Day, When, DataEntryPerson)
birds %>% glimpse()

## Now we can plot 'When' on a meaningful numeric axis:
birds %>%
  ggplot(aes(x = When, y = DataEntryPerson)) +
  geom_point(alpha = 0.1, position = "jitter")

## The 'first', 'last', and 'interval' functions can work on date-time values:
bird_summary <- birds %>%
  group_by(DataEntryPerson) %>%
  summarize(
    start = first(When),   # Picks out the first (here, earliest) date-time value for a person
    finish = last(When)    # Picks out the last (here, latest) date-time value for a person
  ) %>%
  mutate(duration = interval(start, finish) / ddays(1))
  # 'interval' computes the time span between two date-time values;
  # dividing by ddays(1) expresses that span in days

# Printing summary table:
bird_summary %>% na.omit()

## Printing as an mdsr_table:
bird_summary %>%
  na.omit() %>%
  mdsr_table(caption = "Starting and ending dates for each transcriber involved in the Ordway Birds project.") %>%
  kableExtra::column_spec(1, width = "10em")
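## A small sketch (not from the book) of how interval() and the d* duration
## helpers interact, using made-up dates:
span <- interval(ymd("2021-01-01"), ymd("2021-03-01"))
span / ddays(1)   # length of the span in days (59)
span / dhours(1)  # the same span in hours (59 * 24 = 1416)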
## Classes of date-time objects:
now()
class(now())
class(as.POSIXlt(now()))
as.POSIXct(now())  # stored internally as the number of seconds since Jan. 1, 1970
as.POSIXlt(now())  # stored internally as a list of date-time components (second, minute, hour, day, month, year, ...)
# Display and usage of the two classes are similar...
now() - birds$When[1]

## A date that does not include a time:
as.Date(now())
# Also: today()

## Converting date-time information stored in a character object into a true date-time object:
library(lubridate)
example <- c("2021-04-29 06:00:00", "2021-12-31 12:00:00")
str(example)
converted <- ymd_hms(example)
str(converted)

# See the difference: subtracting the character vector produces an error,
# while subtracting the converted date-times works:
# now() - example   # error: no date arithmetic is defined for character strings
now() - converted

## Math on date-time values:
converted
converted[2] - converted[1]

## (Book figure at this point: screenshot of Wikipedia's list of Japanese nuclear reactors.)
library(rvest)

## Find the HTML tables on a specific website:
tables <- "https://en.wikipedia.org/wiki/List_of_commercial_nuclear_reactors" %>%  # Had to update the URL from the book's code...
  read_html() %>%
  html_nodes(css = "table")

## Detecting the table(s) that contain the string "Fukushima Daiichi" and saving its/their index number(s) as 'idx':
idx <- tables %>%
  html_text() %>%
  str_detect("Fukushima Daiichi") %>%
  which()

reactors <- tables %>%
  purrr::pluck(idx) %>%
  html_table(fill = TRUE) %>%  # run the first three lines in this block to see the originally imported column names...
  janitor::clean_names() %>%   # run the first four lines in this block to see the effect of the clean_names function...
  rename(
    reactor_type = type,  # Providing more illustrative names to some columns...
    reactor_model = model,
    capacity_net = capacity_mw
  ) %>%
  tail(-1)  # drop the first row
glimpse(reactors)

## Creating a new 'plant_status' variable and
## converting some character columns into date-time values:
reactors <- reactors %>%
  mutate(
    plant_status = ifelse(
      str_detect(status, "Shut down"),
      "Shut down", "Not formally shut down"
    ),
    construct_date = dmy(beginbuilding),  # Had to update some of the column names from the book's code
    operation_date = dmy(commercialoperation),
    closure_date = dmy(closed)
  )
glimpse(reactors)

## Now the data are ready for some statistical analysis, like this symbolic scatterplot with smooth trend curves:
ggplot(
  data = reactors,
  aes(
    x = construct_date, y = capacity_net,
    color = plant_status
  )
) +
  geom_point() +
  geom_smooth() +
  xlab("Date of Plant Construction") +
  ylab("Net Plant Capacity (MW)")
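## A hedged follow-up sketch (not from the book): for plants that have been shut
## down, how long was each in commercial service? 'operation_date' and
## 'closure_date' are the columns created just above; rows where either date
## failed to parse drop out via na.omit().
reactors %>%
  filter(plant_status == "Shut down") %>%
  mutate(service_years = interval(operation_date, closure_date) / dyears(1)) %>%
  select(reactor_type, capacity_net, service_years) %>%
  na.omit() %>%
  arrange(desc(service_years)) %>%
  head()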