# HW 7 R code, STAT 542

library(tidyverse)
library(mdsr)
library(tidytext)  # provides unnest_tokens(), get_stopwords(), get_sentiments()
library(skimr)     # provides skim()

## Preparing the data:
data(Macbeth_raw)

# str_split returns a list: we only want the first element
macbeth <- Macbeth_raw %>%
  str_split("\r\n") %>%
  pluck(1)  # plucks the first element from the list

# Problem 1(a), Chapter 19 Exercises:
## str_subset returns the subset of the elements containing the specified string:
macbeth %>%
  str_subset("^ [A-Z ]+\\.") %>%
  head(12)

# The total number of speaking lines:
macbeth %>%
  str_subset("^ [A-Z ]+\\.") %>%
  length()

# Problem 1(b), Chapter 19 Exercises:
# Define a "hyphenated word" to be any lowercase letter followed by a
# hyphen (-) followed by another lowercase letter.
macbeth %>%
  str_subset("[a-z]-[a-z]")

# Problem 2(a), Chapter 19 Exercises:
macbeth %>%
  str_subset("([a-z]more |[a-z]less )")

# Problem 2(b), Chapter 19 Exercises:
macbeth %>%
  str_subset("(Exit|Exeunt)")

# Problem 3, Chapter 19 Exercises:
x <- c(
  "popular", "popularity", "popularize", "popularise",
  "Popular", "Population", "repopulate", "reproduce",
  "happy family", "happier\tfamily", " happy family", "P6dn"
)
x
str_subset(x, pattern = "pop")                  # 1
str_detect(x, pattern = "^pop")                 # 2
str_detect(x, pattern = "populari[sz]e")        # 3
str_detect(x, pattern = "pop.*e")               # 4
str_detect(x, pattern = "p[a-z]*e")             # 5
str_detect(x, pattern = "^[Pp][a-z]+.*n")       # 6
str_subset(x, pattern = "^[^Pp]")               # 7
str_detect(x, pattern = "^[A-Za-p]")            # 8
str_detect(x, pattern = "[ ]")                  # 9
str_subset(x, pattern = "[\t]")                 # 10
str_detect(x, pattern = "[ \t]")                # 11
str_subset(x, pattern = "^[ ]")                 # 12

# Problem 10, Chapter 19 exercises:
# Importing the poem from a text file on the web:
emily_url <- "https://people.stat.sc.edu/hitchcock/lonelyhouse.txt"
emily_raw <- RCurl::getURL(emily_url)

# str_split returns a list: we only want the first element
emily <- emily_raw %>%
  str_split("\r\n") %>%
  pluck(1)  # plucks the first element from the list

# length(emily)
# head(emily, 25)

# One row per raw line of the poem:
d_emily <- tibble(txt = emily)
d_emily

# Tokenize into one word per row:
d_emily %>%
  unnest_tokens(output = word, input = txt)

# Tokenize and drop common stop words:
d_emily_clean <- d_emily %>%
  unnest_tokens(output = word, input = txt) %>%
  anti_join(get_stopwords(), by = "word")

# install.packages("textdata")
library(textdata)

# Uses the 'afinn' lexicon, which rates words using integers from
# most negative (-5) to most positive (5):
afinn <- get_sentiments("afinn")

# Summarize the sentiment scores of the poem's (non-stop-word) words:
emily_sentiments <- d_emily_clean %>%
  left_join(afinn, by = "word") %>%
  skim(value)
emily_sentiments