# Example R code, Chapter 19, Part 1

## Loading packages
library(tidyverse)
library(mdsr)

# URL of the plain-text file of Macbeth from the Gutenberg Project:
macbeth_url <- "http://www.gutenberg.org/cache/epub/1129/pg1129.txt"

## Loading the 'Macbeth_raw' object, which is part of the 'mdsr' package:
data(Macbeth_raw)

# Getting the text from the URL into an R object is only needed when the
# packaged copy is not available. Downloading unconditionally would be wasted
# work, because data() above would overwrite the result anyway.
# (The download needs the 'RCurl' package and a network connection.)
if (!exists("Macbeth_raw")) {
  Macbeth_raw <- RCurl::getURL(macbeth_url)
}

length(Macbeth_raw)
nchar(Macbeth_raw)
# 'Macbeth_raw' is actually just one VERY long character string

## str_split() splits the long character string into a vector of many
## character strings. We split at the end-of-line sequence "\r\n".
# str_split() returns a list: we only want the first element
macbeth <- Macbeth_raw %>%
  str_split("\r\n") %>%
  pluck(1) # plucks the first element from the list

length(macbeth)
head(macbeth) # the first few strings are the publisher's notes...

## Picking some consecutive strings from inside the document:
macbeth[300:310]

## Finding the lines where the character MACBETH speaks
## by looking for the subset of strings with " MACBETH" in them.
## The 'str_subset' function does this:
macbeth_lines <- macbeth %>%
  str_subset(" MACBETH")

length(macbeth_lines)
head(macbeth_lines)

## Finding the lines where the character MACDUFF speaks
## by looking for the subset of strings with " MACDUFF" in them.
macbeth %>%
  str_subset(" MACDUFF") %>%
  length()

## str_subset() keeps only the elements that contain the given string:
macbeth %>%
  str_subset(" MACBETH") %>%
  length()

## str_detect() gives back a logical vector as long as the whole input:
## TRUE where an element contains the given string, FALSE where it does not.
macbeth %>%
  str_detect(" MACBETH") %>%
  length()

# The first 6 elements of 'macbeth' do NOT contain the string " MACBETH":
macbeth %>%
  str_detect(" MACBETH") %>%
  head()

# str_which() reports the indices of the elements where " MACBETH" *does*
# appear:
macbeth %>%
  str_which(" MACBETH")

## str_extract() (from 'stringr') pulls out the piece of each element that
## actually matched the pattern:
pattern <- " MACBETH"
macbeth %>%
  str_subset(pattern) %>%
  str_extract(pattern) %>%
  head()

## The '.' metacharacter matches any single character, so "MAC." finds strings
## that contain 'MAC' followed by one more character.
## This includes MACBETH and MACDUFF, but also some unrelated words ...
macbeth %>%
  str_subset("MAC.") %>%
  head(12)

## To search for a literal period (.), escape it with two backslashes:
macbeth %>%
  str_subset("MACBETH\\.") %>%
  head()

## [B-Z] stands for any single character between B and Z:
macbeth %>%
  str_subset("MAC[B-Z]") %>%
  head()

## Using [D-Z] instead excludes MACBETH from the search results:
macbeth %>%
  str_subset("MAC[D-Z]") %>%
  head()

## (B|D) means EITHER B or D, so this finds MACBETH or MACDUFF:
macbeth %>%
  str_subset("MAC(B|D)") %>%
  head()

## ^ restricts the match to the beginning of the line of text.
## Note the difference here:
macbeth %>%
  str_subset("^ MAC[B-Z]") %>%
  head()
macbeth %>%
  str_subset(" MAC[B-Z]") %>%
  head()

## The $ searches only for the specified string when it appears at the end of
## the line of text.
## Compare a search anchored at the end of the line with an unanchored one:
macbeth %>%
  str_subset("MACBETH$") %>%
  head()
macbeth %>%
  str_subset("MACBETH") %>%
  head()

## Quantifiers apply to the previous element of the pattern (here, a space):
##   ?  the element appears 0 times or 1 time
##   *  the element appears 0 or more times
##   +  the element appears 1 or more times
macbeth %>%
  str_subset("^ ?MAC[B-Z]") %>%
  head()
macbeth %>%
  str_subset("^ *MAC[B-Z]") %>%
  head()
macbeth %>%
  str_subset("^ +MAC[B-Z]") %>%
  head()

## All of these searches are case-sensitive!
## See the difference in these three results:
macbeth %>%
  str_subset("MACBETH") %>%
  head()
macbeth %>%
  str_subset("Macbeth") %>%
  head()
macbeth %>%
  str_subset("macbeth") %>%
  head()

### An extended example:

## The next statement builds a row-wise tibble (a 'tribble'), which is a handy
## way to enter a small number of data values by typing them directly.
## The tibble is named 'macbeth_chars' because it describes four characters in
## the play 'Macbeth'.
## The 'mutate' verb then adds a column called 'speaks' that holds, for EACH of
## the four characters, a logical vector of length 3194.
## An element is TRUE if the corresponding string shows that character
## speaking and FALSE if it does not.
## One row per character: the display name and the regular expression that
## marks one of that character's speeches. 'speaks' is a list-column holding,
## for each character, one logical value per element of 'macbeth'.
## The look-behind (?<!LADY) keeps " LADY MACBETH." lines from also being
## counted as Macbeth's, since " MACBETH." is a substring of " LADY MACBETH.".
macbeth_chars <- tribble(
  ~name, ~regexp,
  "Macbeth", "(?<!LADY) MACBETH\\.",
  "Lady Macbeth", " LADY MACBETH\\.",
  "Banquo", " BANQUO\\.",
  "Duncan", " DUNCAN\\.",
) %>%
  mutate(speaks = map(regexp, str_detect, string = macbeth))

# Visualizing the form of 'macbeth_chars':
str(macbeth_chars)

## Unnesting the 'speaks' variable and saving the result into a long-format
## data frame called 'speaker_freq'.
## 'line' numbers the elements of 'macbeth' once per character; using
## seq_along() and nrow() keeps it correct if the text or the number of
## characters changes.
## 'speaks' is converted from logical (TRUE/FALSE) to numeric (1/0).
## The filter removes the junk strings at the beginning and end of the
## document, leaving 2953 lines of text from the actual play (rather than 3194).
speaker_freq <- macbeth_chars %>%
  unnest(cols = speaks) %>%
  mutate(
    line = rep(seq_along(macbeth), times = nrow(macbeth_chars)),
    speaks = as.numeric(speaks)
  ) %>%
  filter(line > 218 & line < 3172)

# Instead of 2953 rows, 'speaker_freq' has 2953*4 = 11812 rows
# (one copy of the play's lines per character):
dim(speaker_freq)
head(speaker_freq)
glimpse(speaker_freq)

## A tibble recording on which line each Act of the play begins.
## [IV]+ matches a run of the Roman-numeral letters I and V; inside a character
## class '|' is a literal character, so it must not be written as [I|V].
act_pattern <- "^ACT [IV]+"
acts <- tibble(
  line = str_which(macbeth, act_pattern),
  line_text = str_subset(macbeth, act_pattern),
  labels = str_extract(line_text, act_pattern)
)

head(acts)

## Plots of the speaking frequency over time of each of the four characters.
## Vertical lines indicate when each Act begins.
ggplot(data = speaker_freq, aes(x = line, y = speaks)) +
  geom_smooth(
    aes(color = name),
    method = "loess",
    se = FALSE,
    span = 0.4
  ) +
  geom_vline(
    data = acts,
    aes(xintercept = line),
    color = "darkgray",
    lty = 3
  ) +
  geom_text(
    data = acts,
    aes(y = 0.085, label = labels),
    hjust = "left",
    color = "darkgray"
  ) +
  ylim(c(0, NA)) +
  xlab("Line Number") +
  ylab("Proportion of Speeches") +
  scale_color_brewer(palette = "Set2")