# HW 7 R code, STAT 542

library(tidyverse)
library(mdsr)
library(tidytext)  # provides unnest_tokens(), get_stopwords(), get_sentiments()
library(skimr)     # provides skim()

## Preparing the data:
data(Macbeth_raw)

# str_split returns a list: we only want the first element
macbeth <- Macbeth_raw %>%
  str_split("\r\n") %>%
  pluck(1)  # plucks the first element from the list

# Problem 1(a), Chapter 19 Exercises:
## str_subset returns the subset of the elements containing the specified string:
macbeth %>%
  str_subset("^ [A-Z ]+\\.") %>%
  head(12)

# The total number of speaking lines:
macbeth %>%
  str_subset("^ [A-Z ]+\\.") %>%
  length()

# Problem 1(b), Chapter 19 Exercises:
# Define a "hyphenated word" to be any lowercase letter followed by a
# hyphen (-) followed by another lowercase letter.
macbeth %>%
  str_subset("[a-z]-[a-z]")

# Problem 2(a), Chapter 19 Exercises:
macbeth %>%
  str_subset("([a-z]more |[a-z]less )")

# Problem 2(b), Chapter 19 Exercises:
macbeth %>%
  str_subset("(Exit|Exeunt)")

# Problem 3, Chapter 19 Exercises:
x <- c(
  "popular", "popularity", "popularize", "popularise",
  "Popular", "Population", "repopulate", "reproduce",
  "happy family", "happier\tfamily", " happy family", "P6dn"
)
x
str_subset(x, pattern = "pop")                  # 1
str_detect(x, pattern = "^pop")                 # 2
str_detect(x, pattern = "populari[sz]e")        # 3
str_detect(x, pattern = "pop.*e")               # 4
str_detect(x, pattern = "p[a-z]*e")             # 5
str_detect(x, pattern = "^[Pp][a-z]+.*n")       # 6
str_subset(x, pattern = "^[^Pp]")               # 7
str_detect(x, pattern = "^[A-Za-p]")            # 8
str_detect(x, pattern = "[ ]")                  # 9
str_subset(x, pattern = "[\t]")                 # 10
str_detect(x, pattern = "[ \t]")                # 11
str_subset(x, pattern = "^[ ]")                 # 12

# Problem 10, Chapter 19 exercises:
# Importing the poem from a text file on the web:
emily_url <- "https://people.stat.sc.edu/hitchcock/lonelyhouse.txt"
emily_raw <- RCurl::getURL(emily_url)

# str_split returns a list: we only want the first element
emily <- emily_raw %>%
  str_split("\r\n") %>%
  pluck(1)  # plucks the first element from the list

# length(emily)
# head(emily, 25)

# One row per raw line of the poem:
d_emily <- tibble(txt = emily)
d_emily

# Tokenize into one word per row:
d_emily %>%
  unnest_tokens(output = word, input = txt)

# Tokenize and drop common stop words:
d_emily_clean <- d_emily %>%
  unnest_tokens(output = word, input = txt) %>%
  anti_join(get_stopwords(), by = "word")

# install.packages("textdata")
library(textdata)

# Uses the 'afinn' lexicon, which rates words using integers from
# most negative (-5) to most positive (5):
afinn <- get_sentiments("afinn")

# Summarize the sentiment scores of the poem's (non-stop-word) words:
emily_sentiments <- d_emily_clean %>%
  left_join(afinn, by = "word") %>%
  skim(value)
emily_sentiments