# Example R code, Chapter 19, Part 2

## Loading packages:
library(tidyverse)
library(mdsr)

################################################################################
#
# TEXT ANALYSIS OF THE "Data Science Papers" CORPUS
#
################################################################################

## This is one way to create the DataSciencePapers data frame:
## library(aRxiv)
## DataSciencePapers <- arxiv_search(
##   query = '"Data Science"',
##   limit = 20000,
##   batchsize = 100
## )

## Or just load it, since it's already in the 'mdsr' package that we loaded earlier:
data(DataSciencePapers)

## Information about the 15 columns. There are 1089 rows, one for each research paper in the dataset.
glimpse(DataSciencePapers)

## The dates in 'submitted' and 'updated' are initially read as character strings.
## Let's convert them to date-time objects using the 'ymd_hms' function in the 'lubridate' package:
library(lubridate)
DataSciencePapers <- DataSciencePapers %>%
  mutate(
    submitted = lubridate::ymd_hms(submitted),
    updated = lubridate::ymd_hms(updated)
  )
glimpse(DataSciencePapers)

## The 'tally' function in the 'mosaic' package summarizes the counts of papers by year
## (similar to what the 'table' function in base R can do):
mosaic::tally(~ year(submitted), data = DataSciencePapers)

## Examining one specific paper by filtering on its ID number:
DataSciencePapers %>%
  filter(id == "1809.02408v2") %>%
  glimpse()

## Examining the primary_category of the papers is not very helpful...
DataSciencePapers %>%
  group_by(primary_category) %>%
  count() %>%
  head()

## Using 'str_extract', we can pick out the primary field (the part of the primary_category variable BEFORE the period):
DataSciencePapers <- DataSciencePapers %>%
  mutate(field = str_extract(primary_category, "^[a-z,-]+"))
mosaic::tally(x = ~ field, margins = TRUE, data = DataSciencePapers) %>%
  sort()
# What are the most common fields for Data Science papers here?

# 'unnest_tokens' splits the text lines in each abstract into *tokens*, which here are words.
# The sorting below returns the most common words within each abstract --- not too interesting, as we see!
# install.packages("tidytext")
library(tidytext)
DataSciencePapers %>%
  unnest_tokens(word, abstract) %>%
  count(id, word, sort = TRUE)

## Now, ignore the 'stopwords' and focus only on "meaningful" words in our search.
## Note: To see the default list of stopwords in English, type:
# print(get_stopwords(), n = Inf)

### Using the 'unnest_tokens' function, the created object 'arxiv_words' is a "flattened" form of the 'DataSciencePapers' data frame
### where the last column is called 'word' and each row is one word of an abstract.
### The 'anti_join' function produces all rows of the FIRST table (here, the flattened table) that do NOT have
### a match in the second table (here, the list of stopwords).
### This 'anti_join' therefore produces a table with the stopwords REMOVED.
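## (Optional aside, not part of the book's code: a tiny made-up example of what 'anti_join' does here.
##  'toy_words' is a hypothetical mini-table; the rows whose 'word' matches a stopword are dropped.)
toy_words <- tibble(word = c("the", "tidy", "analysis", "of", "data"))
toy_words %>%
  anti_join(get_stopwords(), by = "word")
# should keep only "tidy", "analysis", and "data"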
arxiv_words <- DataSciencePapers %>%
  unnest_tokens(word, abstract) %>%
  anti_join(get_stopwords(), by = "word")
arxiv_words %>%
  count(id, word, sort = TRUE)

## Creating a variable called 'abstract_clean' that is like the 'abstract' variable,
## but which has all stopwords removed and all characters converted to lowercase:
arxiv_abstracts <- arxiv_words %>%
  group_by(id) %>%
  summarize(abstract_clean = paste(word, collapse = " "))

# Left-joining to incorporate the 'abstract_clean' information along with the information in the 'DataSciencePapers' table:
arxiv_papers <- DataSciencePapers %>%
  left_join(arxiv_abstracts, by = "id")

## Picking one single paper out of the data table and calling it 'single_paper':
single_paper <- arxiv_papers %>%
  filter(id == "1809.02408v2")

## The beginning of the original abstract of the 'single_paper':
single_paper %>%
  pull(abstract) %>%
  strwrap() %>%
  head()

## The beginning of the cleaned-up abstract of the 'single_paper':
single_paper %>%
  pull(abstract_clean) %>%
  strwrap() %>%
  head(4)

## Making a word cloud of the text in the abstracts:
# install.packages("wordcloud")
# install.packages("tm")
# install.packages("slam")
library(wordcloud)
# set.seed(1962)
# set.seed(1966)
# The appearance of the word cloud changes slightly with different seeds. Choose a particular seed to get replicable results...
arxiv_abstracts %>%
  pull(abstract_clean) %>%
  wordcloud(
    max.words = 40,
    scale = c(8, 1),
    colors = topo.colors(n = 30),
    random.color = TRUE
  )

## Sentiment analysis:
# install.packages("textdata")
library(textdata)
afinn <- get_sentiments("afinn")
# Uses the 'afinn' lexicon, which rates words using integers from most negative (-5) to most positive (5)

## Showing a sample of words and how positive or negative they are... run this a few times to see a good selection of words:
afinn %>%
  slice_sample(n = 15) %>%
  arrange(desc(value))

## Joining the 'arxiv_words' data frame to this lexicon, to find the sentiment value of each word in 'arxiv_words':
arxiv_words %>%
  inner_join(afinn, by = "word") %>%
  select(word, id, value)

## The left join is better here, since there may be some words in the abstracts that do not appear in the 'afinn' lexicon.
## In the left-joined table, these will have a missing sentiment value, which will be treated as 0 when we sum the words' sentiment values.
## For each paper (labeled by 'id'), we will sum the sentiment values for the words in its abstract.
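## (Optional aside, not part of the book's code: a quick check of how many words in the abstracts have
##  no match in the 'afinn' lexicon -- these get an NA sentiment value after the left join,
##  which is why na.rm = TRUE is used when summing below.)
arxiv_words %>%
  left_join(afinn, by = "word") %>%
  summarize(
    total_words = n(),
    unmatched_words = sum(is.na(value))  # words with no afinn sentiment score
  )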
## With 'mutate', we create a 'sentiment_per_word' variable that gives the average sentiment per word
## (so as not to favor shorter or longer abstracts):
arxiv_sentiments <- arxiv_words %>%
  left_join(afinn, by = "word") %>%
  group_by(id) %>%
  summarize(
    num_words = n(),
    sentiment = sum(value, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(sentiment_per_word = sentiment / num_words) %>%
  arrange(desc(sentiment))

# Printing the 10 most positive in terms of SUMMED sentiment values:
head(arxiv_sentiments, 10)

# Printing the 10 most positive in terms of AVERAGE (per-word) sentiment values:
arxiv_sentiments %>%
  arrange(desc(sentiment_per_word)) %>%
  head(10)

## Some summary statistics for the 'sentiment' and 'sentiment_per_word' variables across the 1089 abstracts:
arxiv_papers <- arxiv_papers %>%
  left_join(arxiv_sentiments, by = "id")
arxiv_papers %>%
  skim(sentiment, sentiment_per_word)
# ('skim' comes from the 'skimr' package; run library(skimr) first if it is not already attached.)

## Pulling out the abstract with the most positive sentiment and looking at it using 'strwrap':
most_positive <- arxiv_papers %>%
  filter(sentiment_per_word == max(sentiment_per_word)) %>%
  pull(abstract)
strwrap(most_positive)

## A time series plot of average sentiment plotted against submission date.
## Using the color aesthetic, we can create two curves based on whether the field was 'cs' (computer science) or not:
ggplot(
  arxiv_papers,
  aes(x = submitted, y = sentiment_per_word, color = field == "cs")
) +
  geom_smooth(se = TRUE) +
  scale_color_brewer("Computer Science?", palette = "Set2") +  # scale_color_brewer lets us pick a color palette
  labs(x = "Date submitted", y = "Sentiment score per word")

## Finding all the bigrams in the 'arxiv_papers' data:
arxiv_bigrams <- arxiv_papers %>%
  unnest_tokens(
    arxiv_bigram,
    abstract_clean,
    token = "ngrams",
    n = 2
  ) %>%
  select(arxiv_bigram, id)  # only selecting two of the columns
arxiv_bigrams

## How many times each bigram appears, sorted from most to least:
arxiv_bigrams %>%
  count(arxiv_bigram, sort = TRUE)
# What do you notice about the 4th most common bigram?

### Recall 'arxiv_words' is a "flattened" form of the 'DataSciencePapers' data frame (with stopwords removed)
### where the last column is called 'word' and each row is one word of an abstract.

## Using 'count' and 'arrange', we can list the most common words in the collection of abstracts:
arxiv_words %>%
  count(word) %>%
  arrange(desc(n)) %>%
  head()

## The 'bind_tf_idf' function will calculate and print the
## tf (term frequency), idf (inverse document frequency), and tf_idf (product of tf and idf)
## for all terms in the collection of documents.
## We must specify which variable identifies each document in the collection -- here, 'id' identifies the document.
tidy_DTM <- arxiv_words %>%
  count(id, word) %>%
  bind_tf_idf(word, id, n)

## This shows the words with the largest tf values:
tidy_DTM %>%
  arrange(desc(tf)) %>%
  head()

## The same table, but showing the words with the largest idf values:
tidy_DTM %>%
  arrange(desc(idf), desc(n)) %>%
  head()

## Which abstract contains the word 'wildfire', which has such a high idf score?
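## (Optional aside, not part of the book's code: the idf values can be reproduced by hand, since
##  idf(word) = log(N / number of documents containing the word), with N the number of abstracts.
##  A word like 'wildfire' has a high idf precisely because it appears in very few abstracts.
##  The helper objects 'n_docs' and 'n_docs_wildfire' are just for this aside.)
n_docs <- n_distinct(arxiv_words$id)    # total number of abstracts in the corpus
n_docs_wildfire <- tidy_DTM %>%
  filter(word == "wildfire") %>%
  nrow()                                # number of abstracts containing 'wildfire'
log(n_docs / n_docs_wildfire)           # should match the idf value shown for 'wildfire' above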
arxiv_papers %>%
  pull(abstract) %>%
  str_subset("wildfire") %>%
  strwrap() %>%
  head()

## Look at how many abstracts the word 'implications' appears in:
tidy_DTM %>%
  filter(word == "implications")

## This will print the words in a specific document (the one with id 1809.02408v2) that have the highest tf_idf scores:
## (Recall that tf_idf is a characteristic of the COMBINATION of the word AND the document.)
tidy_DTM %>%
  filter(id == "1809.02408v2") %>%
  arrange(desc(tf_idf)) %>%
  head()
# These are words that appear frequently in THIS document, but not so frequently in the whole collection of documents.

## This result shows the abstracts in which the word "covid" appears much more frequently than it appears across the whole corpus of documents:
tidy_DTM %>%
  filter(word == "covid") %>%
  arrange(desc(tf_idf)) %>%
  head() %>%
  left_join(select(arxiv_papers, id, abstract), by = "id")

## This lists the document-term combinations with the highest tf_idf scores:
tidy_DTM %>%
  arrange(desc(tf_idf)) %>%
  head() %>%
  left_join(select(arxiv_papers, id, abstract), by = "id")

## cast_dtm calculates the document-term matrix.
## By default, the entries for a particular document-term combination would be the term frequency in that document,
## but the option
##   weighting = tm::weightTfIdf
## specifies that we want the entries to be the normalized tf_idf values.
tm_DTM <- arxiv_words %>%
  count(id, word) %>%
  cast_dtm(id, word, n, weighting = tm::weightTfIdf)
tm_DTM
# Sparsity of 99% indicates that most words do not appear in most documents.

## This will find all the terms with tf_idf scores (since that's what's in tm_DTM here) that are at least 7 (summed across documents):
tm::findFreqTerms(tm_DTM, lowfreq = 7)

## Printing the terms with the highest tf_idf scores (summed across documents):
tm_DTM %>%
  as.matrix() %>%
  as_tibble() %>%
  map_dbl(sum) %>%
  sort(decreasing = TRUE) %>%
  head()

## The 'findAssocs' function here will find words that tend to appear in the same documents as the word "causal" does:
tm::findAssocs(tm_DTM, terms = "causal", corlimit = 0.35)

################################################################################
#
# TEXT ANALYSIS OF ARTICLE BY Prof. Hitchcock
#
################################################################################

# Importing the article from a text file on the web:
DBH_url <- "https://people.stat.sc.edu/hitchcock/HistoryStatisticsCourseTASrev.txt"
DBH_raw <- RCurl::getURL(DBH_url)
length(DBH_raw)
nchar(DBH_raw)

# str_split returns a list: we only want the first element
dbh <- DBH_raw %>%
  str_split("\r\n") %>%
  pluck(1)  # plucks the first element from the list
length(dbh)
head(dbh, 25)

## This uses some tools we've mentioned to create a tibble recording the line on which each section of the article begins...
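## (Optional aside with made-up strings, not part of the article analysis: what does the pattern "^[1-9] +"
##  used below actually match? A line that STARTS with a single digit 1-9 followed by at least one space,
##  i.e., a numbered section heading. 'toy_lines' is hypothetical.)
toy_lines <- c("1  Introduction", "Some body text", "2  Course Structure", "Not a heading: 3 credits")
str_which(toy_lines, "^[1-9] +")    # positions of the matching lines (1 and 3 here)
str_subset(toy_lines, "^[1-9] +")   # the matching lines themselves
str_extract(toy_lines, "^[1-9] +")  # the leading section numbers (with trailing spaces); NA where no match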
sections <- tibble(
  line = str_which(dbh, "^[1-9] +"),
  line_text = str_subset(dbh, "^[1-9] +"),
  labels = str_extract(line_text, "^[1-9] +")
)

# install.packages("tidytext")
library(tidytext)

d <- tibble(txt = dbh)
d
d %>%
  unnest_tokens(output = word, input = txt)

d_clean <- d %>%
  unnest_tokens(output = word, input = txt) %>%
  anti_join(get_stopwords(), by = "word")

# Default word cloud:
wordcloud(d_clean$word)

# Formatted word cloud:
wordcloud(
  d_clean$word,
  max.words = 40,
  scale = c(5, 1),
  colors = topo.colors(n = 30),
  random.color = TRUE
)

# Section analysis:
num_lines_per_section <- diff(c(1, sections$line, nrow(d) + 1))
Section_ID <- rep(0:6, num_lines_per_section)
# Cleaning up the short end-sections:
# 7 = Supplementary Material, 8 = Acknowledgments, 9 = Disclosure, 10 = References
Section_ID[189:191] <- 7
Section_ID[192:195] <- 8
Section_ID[196:198] <- 9
Section_ID[199:226] <- 10
d_w_section <- data.frame(d, Section_ID)

## Cleaning out stopwords separately by section:
clean_stopwords <- function(x) {
  result <- x %>%
    unnest_tokens(output = word, input = txt) %>%
    anti_join(get_stopwords(), by = "word")
  return(result)
}

d_clean_by_section <- vector("list", max(d_w_section$Section_ID) + 1)  # +1 to account for the "Section 0"
for (i in 0:max(d_w_section$Section_ID)) {
  d_clean_by_section[[i + 1]] <- d_w_section %>%  # +1 to account for the "Section 0"
    filter(Section_ID == i) %>%
    clean_stopwords()
}
d_clean_all_sections <- bind_rows(d_clean_by_section)

## Sentiment analysis:
# install.packages("textdata")
library(textdata)
afinn <- get_sentiments("afinn")
# Uses the 'afinn' lexicon, which rates words using integers from most negative (-5) to most positive (5)

dbh_sentiments <- d_clean_all_sections %>%
  left_join(afinn, by = "word") %>%
  group_by(Section_ID) %>%
  summarize(
    num_words = n(),
    sentiment = sum(value, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(sentiment_per_word = sentiment / num_words) %>%
  arrange(desc(sentiment))

# Printing the most positive in terms of SUMMED sentiment values:
head(dbh_sentiments, 11)

# Printing the most positive in terms of AVERAGE (per-word) sentiment values:
dbh_sentiments %>%
  arrange(desc(sentiment_per_word)) %>%
  head(11)

# Recall:
sections$line_text
# 7 = Supplementary Material, 8 = Acknowledgments, 9 = Disclosure, 10 = References

# Printing the most positive (only counting the "real" sections, 1-6) in terms of AVERAGE (per-word) sentiment values:
dbh_sentiments %>%
  filter(Section_ID %in% 1:6) %>%
  arrange(desc(sentiment_per_word)) %>%
  head(10)

## Pulling out section 8 (the one with the most positive sentiment) and looking at it using 'strwrap':
strwrap(d_clean_all_sections$word[d_clean_all_sections$Section_ID == 8])
d_clean_all_sections %>%
  filter(Section_ID == 8) %>%
  left_join(afinn, by = "word")

## Bigrams and N-grams:
## 4-grams:
## Finding all the 4-grams in the article:
dbh_4grams <- d %>%
  unnest_tokens(output = dbh_4gram, input = txt, token = "ngrams", n = 4) %>%
  select(dbh_4gram)  # only selecting the one column of 4-grams
dbh_4grams

## How many times each 4-gram appears, sorted from most to least:
dbh_4grams %>%
  count(dbh_4gram, sort = TRUE)

## Most common words in the whole article:
d_clean %>%
  count(word) %>%
  arrange(desc(n)) %>%
  head()

## The 'bind_tf_idf' function will calculate and print the
## tf (term frequency), idf (inverse document frequency), and tf_idf (product of tf and idf)
## for all terms in the collection of documents.
## Here, each "document" is a section and the "collection" is the whole article -- 'Section_ID' identifies the document.
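## (Optional aside, not part of the book's code: the same tf, idf, and tf_idf values can be computed
##  "by hand" with dplyr, which makes the definitions explicit:
##  tf = n / (total words in the section), idf = log(#sections / #sections containing the word),
##  and tf_idf = tf * idf. The name 'hand_tf_idf' is just for this illustration.)
hand_tf_idf <- d_clean_all_sections %>%
  filter(Section_ID %in% 1:6) %>%
  count(Section_ID, word) %>%
  group_by(Section_ID) %>%
  mutate(tf = n / sum(n)) %>%               # term frequency within each section
  ungroup() %>%
  add_count(word, name = "n_sections") %>%  # number of sections containing each word
  mutate(
    idf = log(n_distinct(Section_ID) / n_sections),
    tf_idf = tf * idf
  )
hand_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  head()
# These values should agree with the output of bind_tf_idf() just below.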
dbh_DTM <- d_clean_all_sections %>%
  filter(Section_ID %in% 1:6) %>%
  count(Section_ID, word) %>%
  bind_tf_idf(word, Section_ID, n)

## This shows the words with the largest tf values:
dbh_DTM %>%
  arrange(desc(tf)) %>%
  head()

## The same table, but showing the words with the largest idf values:
dbh_DTM %>%
  arrange(desc(idf), desc(n)) %>%
  head()

## The same table, but showing the word/section combinations with the largest tf_idf values:
dbh_DTM %>%
  arrange(desc(tf_idf), desc(n)) %>%
  head()

## cast_dtm calculates the document-term matrix.
## Here, the entries for a particular section-term combination are the term frequency in that section (the default choice):
dbh_tm_DTM <- d_clean_all_sections %>%
  filter(Section_ID %in% 1:6) %>%
  count(Section_ID, word) %>%
  cast_dtm(Section_ID, word, n)
dbh_tm_DTM

## The 'findAssocs' function here will find words that tend to appear in the same sections as the word "guest" does:
tm::findAssocs(dbh_tm_DTM, terms = "guest", corlimit = 0.92)

################################################################################
#
# Extended example: TEXT ANALYSIS OF Beatles song titles
#
################################################################################

library(rvest)
url <- "http://en.wikipedia.org/wiki/List_of_songs_recorded_by_the_Beatles"
tables <- url %>%
  read_html() %>%
  html_nodes("table")
Beatles_songs <- tables %>%
  purrr::pluck(3) %>%
  html_table(fill = TRUE) %>%
  janitor::clean_names() %>%
  select(song, lead_vocal_s_d)
glimpse(Beatles_songs)

## Removing the quotation marks (denoted WITHIN the string as \") from the 'song' variable, and renaming the 'lead vocal' variable:
Beatles_songs <- Beatles_songs %>%
  mutate(song = str_remove_all(song, pattern = '\\"')) %>%
  rename(vocals = lead_vocal_s_d)
glimpse(Beatles_songs)

## Counts for each vocalist (or combination of vocalists):
Beatles_songs %>%
  group_by(vocals) %>%
  count() %>%
  arrange(desc(n))
# The table of results is slightly different from what's shown in the book ...
# Wikipedia must have updated its data ...

## Comparing the number of songs on which Paul McCartney sang lead vocals to the number on which John Lennon did
## (each count includes songs with shared lead vocals):
Beatles_songs %>%
  pull(vocals) %>%
  str_subset("McCartney") %>%
  length()
Beatles_songs %>%
  pull(vocals) %>%
  str_subset("Lennon") %>%
  length()

## The number of songs sung by McCartney or by John Lennon or by both:
Beatles_songs %>%
  pull(vocals) %>%
  str_subset("(McCartney|Lennon)") %>%
  length()

## The number of songs sung by both Lennon AND McCartney, NOT including those with sole vocals from either:
pj_regexp <- "(McCartney|Lennon).*(McCartney|Lennon)"
Beatles_songs %>%
  pull(vocals) %>%
  str_subset(pj_regexp) %>%
  length()

## Using 'str_detect' to get the same result as above:
Beatles_songs %>%
  filter(str_detect(vocals, pj_regexp)) %>%
  nrow()

# And listing the first few of these songs on which they both sang:
Beatles_songs %>%
  filter(str_detect(vocals, pj_regexp)) %>%
  select(song, vocals) %>%
  head()

## Listing the most common words in Beatles song titles (removing stopwords):
Beatles_songs %>%
  unnest_tokens(word, song) %>%
  anti_join(get_stopwords(), by = "word") %>%
  count(word, sort = TRUE) %>%
  arrange(desc(n)) %>%
  head()

## An analysis by albums and by year:
# The full data set with more columns, including the album each song appeared on ...
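## (Optional aside, not part of the book's code: peeking at the cleaned-up column names of the
##  Wikipedia table before renaming them below -- 'janitor::clean_names' is what produces
##  names like 'lead_vocal_s_d'.)
tables %>%
  purrr::pluck(3) %>%
  html_table(fill = TRUE) %>%
  janitor::clean_names() %>%
  names()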
Beatles_songs_full <- tables %>%
  purrr::pluck(3) %>%
  html_table(fill = TRUE) %>%
  janitor::clean_names() %>%
  rename(
    vocals = lead_vocal_s_d,
    album = core_catalogue_release_s,
    writer = songwriter_s
  ) %>%
  mutate(
    song = str_remove_all(song, pattern = '\\"'),
    album = str_remove_all(album, pattern = '\\"')
  ) %>%
  select(-ref_s)
glimpse(Beatles_songs_full)

unique(Beatles_songs_full$album)
# Some of the album titles could use some cleaning up ...
mypatterns <- c("Let It BePast Masters", "Past Masters", "Magical Mystery Tour")
Beatles_songs_full$album <- ifelse(Beatles_songs_full$album == mypatterns[1],
                                   "Let It Be", Beatles_songs_full$album)
Beatles_songs_full$album <- ifelse(str_detect(Beatles_songs_full$album, mypatterns[2]),
                                   "Past Masters", Beatles_songs_full$album)
Beatles_songs_full$album <- ifelse(str_detect(Beatles_songs_full$album, mypatterns[3]),
                                   "Magical Mystery Tour", Beatles_songs_full$album)
unique(Beatles_songs_full$album)
# Looks better!

## Did the length of Beatles song titles change over time?
ggplot(data = Beatles_songs_full, aes(x = year, y = nchar(song))) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE) +
  ylab("Number of Characters in Song Title")

## Was the length of Beatles song titles different across albums?
ggplot(data = Beatles_songs_full, aes(x = album, y = nchar(song))) +
  geom_boxplot() +
  ylab("Number of Characters in Song Title") +
  scale_x_discrete(guide = guide_axis(n.dodge = 2))  # so that the album titles don't run into each other...

# Or could make the boxplots horizontal...
ggplot(data = Beatles_songs_full, aes(x = album, y = nchar(song))) +
  geom_boxplot() +
  ylab("Number of Characters in Song Title") +
  coord_flip()

## A quick sentiment analysis:
Beatles_words <- Beatles_songs_full %>%
  unnest_tokens(word, song) %>%
  anti_join(get_stopwords(), by = "word")

# install.packages("textdata")
library(textdata)
afinn <- get_sentiments("afinn")
# Uses the 'afinn' lexicon, which rates words using integers from most negative (-5) to most positive (5)

## Joining the 'Beatles_words' data frame to this lexicon, to find the sentiment value of each word in 'Beatles_words':
Beatles_words %>%
  inner_join(afinn, by = "word") %>%
  select(word, year, album, value)

Beatles_sentiments_by_album <- Beatles_words %>%
  left_join(afinn, by = "word") %>%
  group_by(album) %>%
  summarize(
    num_words = n(),
    sentiment = sum(value, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(sentiment_per_word = sentiment / num_words) %>%
  arrange(desc(sentiment))

# Printing the most positive in terms of AVERAGE (per-word) sentiment values:
Beatles_sentiments_by_album %>%
  arrange(desc(sentiment_per_word)) %>%
  head(15)
## Not sure this is very valuable ...
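## (Optional aside, not part of the book's code: a quick visualization of the per-word sentiment scores
##  by album computed above, which may be easier to scan than the printed table.)
ggplot(
  Beatles_sentiments_by_album,
  aes(x = reorder(album, sentiment_per_word), y = sentiment_per_word)
) +
  geom_col() +
  coord_flip() +
  labs(x = "Album", y = "Sentiment score per word in song titles")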