# Example R code, Chapter 19, Part 1

## loading packages
library(tidyverse)
library(mdsr)

# URL of the plain-text edition of Macbeth hosted by Project Gutenberg:
macbeth_url <- "http://www.gutenberg.org/cache/epub/1129/pg1129.txt"

## 'Macbeth_raw' ships with the 'mdsr' package, so we load that copy first.
## Loading the packaged copy needs no network access, and it is the version
## that the rest of this script works with.
data(Macbeth_raw, package = "mdsr")

## Fallback only: if the packaged copy could not be loaded, fetch the text
## from the URL instead. Downloading unconditionally would be wasted work,
## because the packaged copy would replace the downloaded text anyway.
if (!exists("Macbeth_raw")) {
  Macbeth_raw <- RCurl::getURL(macbeth_url)
}

# Number of elements in 'Macbeth_raw' ...
length(Macbeth_raw)

# ... and the number of characters in it
nchar(Macbeth_raw)

# 'Macbeth_raw' is not a data frame: it is just one VERY long character string

## 'str_split' cuts the long character string apart wherever the separator
## occurs, giving many shorter character strings.
## The separator used here is the end-of-line sequence "\r\n".

# str_split hands back a list; 'pluck(..., 1)' keeps only its first element,
# which is the character vector of individual lines.
macbeth <- pluck(str_split(Macbeth_raw, "\r\n"), 1)

length(macbeth)

head(macbeth)  # the opening strings are the publisher's notes...


## a run of consecutive strings taken from inside the document:
macbeth[seq(300, 310)]


## Which lines are spoken by the character MACBETH?
## We keep only the strings that contain "  MACBETH" (two leading spaces).
## 'str_subset' returns exactly those matching strings:

macbeth_lines <- str_subset(macbeth, "  MACBETH")
length(macbeth_lines)
head(macbeth_lines)


## Which lines are spoken by the character MACDUFF?
## Same idea: count the strings that contain "  MACDUFF".

length(str_subset(macbeth, "  MACDUFF"))


## str_subset keeps only the elements that contain the specified string,
## so its result is as long as the number of matches:
length(str_subset(macbeth, "  MACBETH"))

## str_detect instead returns one TRUE/FALSE per element of the input, so its
## result always has the same length as the whole object being searched:
length(str_detect(macbeth, "  MACBETH"))

# The first 6 lines of 'macbeth' do NOT contain the string "  MACBETH":
head(str_detect(macbeth, "  MACBETH"))

# str_which gives the indices of the elements where "  MACBETH" *does* appear:
str_which(macbeth, "  MACBETH")


## 'str_extract' (from the 'stringr' package) returns the piece of each string
## that actually matched the pattern, rather than the whole string:
pattern <- "  MACBETH"
head(str_extract(str_subset(macbeth, pattern), pattern))


## The '.' metacharacter matches any single character, so "MAC." matches every
## string that contains 'MAC' followed by at least one more character.
## That covers MACBETH and MACDUFF, but also some unrelated words ...
head(str_subset(macbeth, "MAC."), 12)

## To search for a literal period (.), escape it with two backslashes:
head(str_subset(macbeth, "MACBETH\\."))


## [B-Z] matches any one character in the range B through Z:
head(str_subset(macbeth, "MAC[B-Z]"))

## With [D-Z] the pattern no longer matches MACBETH:
head(str_subset(macbeth, "MAC[D-Z]"))


## (B|D) matches EITHER a B or a D, so this finds MACBETH or MACDUFF:
head(str_subset(macbeth, "MAC(B|D)"))


## ^ anchors the pattern to the beginning of the string: the match must start
## at the first character. Compare the anchored and un-anchored searches:

head(str_subset(macbeth, "^  MAC[B-Z]"))

head(str_subset(macbeth, " MAC[B-Z]"))

## $ anchors the pattern to the end of the string: the match must finish at
## the last character. Compare the anchored and un-anchored searches:

head(str_subset(macbeth, "MACBETH$"))

head(str_subset(macbeth, "MACBETH"))


## Quantifiers apply to the preceding element of the pattern (here, a space):
##   ?  that element appears 0 times or 1 time
##   *  that element appears 0 or more times
##   +  that element appears 1 or more times

head(str_subset(macbeth, "^ ?MAC[B-Z]"))
head(str_subset(macbeth, "^ *MAC[B-Z]"))
head(str_subset(macbeth, "^ +MAC[B-Z]"))


## All of these searches are case-sensitive!
## The same word in three different capitalizations gives three results:

head(str_subset(macbeth, "MACBETH"))

head(str_subset(macbeth, "Macbeth"))

head(str_subset(macbeth, "macbeth"))

### An extended example:

## 'macbeth_chars' is a small tibble describing four characters of the play:
## 'name' is the display name and 'regexp' is the pattern marking a line on
## which that character begins to speak (two spaces, the name in capitals,
## and a literal period).
macbeth_chars <- tibble(
  name = c("Macbeth", "Lady Macbeth", "Banquo", "Duncan"),
  regexp = c(
    "  MACBETH\\.",
    "  LADY MACBETH\\.",
    "  BANQUO\\.",
    "  DUNCAN\\."
  )
)

## 'speaks' is a list-column holding one logical vector per character.
## Each vector has one element per string in 'macbeth': TRUE where the string
## shows that character speaking, FALSE otherwise.
macbeth_chars <- mutate(
  macbeth_chars,
  speaks = map(regexp, function(rx) str_detect(string = macbeth, pattern = rx))
)

# Inspecting the structure of 'macbeth_chars':
str(macbeth_chars)


## Unnesting the 'speaks' list-column gives a long-format data frame called
## 'speaker_freq', with one row per (character, line of text) pair.
## We also:
##   * add the line number of each row,
##   * convert 'speaks' from logical (TRUE/FALSE) to numeric (1/0),
##   * drop the junk strings at the beginning and end of the document by
##     keeping only lines 219 through 3171 (2953 lines of the actual play).
speaker_freq <- macbeth_chars %>%
  unnest(cols = speaks) %>%
  mutate(
    # unnest() stacks one full-length block of rows per character, so the
    # line numbers repeat once for every row of 'macbeth_chars'. Taking the
    # repeat count from the data keeps this correct if characters are added
    # or removed.
    line = rep(seq_along(macbeth), times = nrow(macbeth_chars)),
    speaks = as.numeric(speaks)
  ) %>%
  filter(line > 218 & line < 3172)

# With four characters, 'speaker_freq' has 2953 * 4 = 11812 rows
# rather than 2953:

dim(speaker_freq)

head(speaker_freq)

glimpse(speaker_freq)


## Using the tools above, build a tibble recording the line on which each Act
## of the play begins:
##   line      - index into 'macbeth' of the line that opens the Act
##   line_text - full text of that line
##   labels    - just the matched "ACT <roman numeral>" part, used as a label

# A line opening an Act starts with "ACT " followed by one or more of the
# letters I and V. Inside [ ] the '|' is a literal character, not "or", so the
# class is written [IV] to avoid also matching a '|'.
act_pattern <- "^ACT [IV]+"

acts <- tibble(
  line = str_which(macbeth, act_pattern),
  line_text = str_subset(macbeth, act_pattern),
  labels = str_extract(line_text, act_pattern)
)

head(acts)

## Smoothed speaking frequency of each of the four characters, plotted against
## the line number (i.e., over the course of the play).
## Dotted vertical lines, labelled with the Act name, show where each Act begins.

ggplot(speaker_freq, aes(x = line, y = speaks)) +
  # one loess curve per character, without confidence bands
  geom_smooth(aes(color = name), method = "loess", span = 0.4, se = FALSE) +
  # Act boundaries come from the separate 'acts' tibble
  geom_vline(
    aes(xintercept = line),
    data = acts,
    linetype = 3, color = "darkgray"
  ) +
  # Act labels sit at a fixed height, starting at each boundary line
  geom_text(
    aes(y = 0.085, label = labels),
    data = acts,
    color = "darkgray", hjust = "left"
  ) +
  ylim(0, NA) +
  labs(x = "Line Number", y = "Proportion of Speeches") +
  scale_color_brewer(palette = "Set2")