# Example R code, Chapter 19, Part 2

## Loading packages:
library(tidyverse)
library(mdsr)

################################################################################
#
# TEXT ANALYSIS OF THE "Data Science Papers" CORPUS
#
################################################################################

## This is one way to create the DataSciencePapers data frame:
## library(aRxiv)
## DataSciencePapers <- arxiv_search(
##   query = '"Data Science"',
##   limit = 20000,
##   batchsize = 100
## )

## Or just load it, since it's already in the 'mdsr' package that we loaded earlier:
data(DataSciencePapers)

## Information about the 15 columns. There are 1089 rows, one for each research paper in the dataset.
glimpse(DataSciencePapers)

## The dates in 'submitted' and 'updated' are initially read as character strings.
## Let's convert them to date-time objects using the 'ymd_hms' function in the 'lubridate' package:
library(lubridate)
DataSciencePapers <- DataSciencePapers %>%
  mutate(
    submitted = lubridate::ymd_hms(submitted),
    updated = lubridate::ymd_hms(updated)
  )
glimpse(DataSciencePapers)

## The 'tally' function in the 'mosaic' package summarizes the counts of papers by year
## (similar to what the 'table' function in base R can do):
mosaic::tally(~ year(submitted), data = DataSciencePapers)

## Examining one specific paper by filtering on its ID number:
DataSciencePapers %>%
  filter(id == "1809.02408v2") %>%
  glimpse()

## Examining the primary_category of the papers is not very helpful...
DataSciencePapers %>%
  group_by(primary_category) %>%
  count() %>%
  head()

## Using 'str_extract', we can pick out the primary field (the part of the primary_category variable BEFORE the period):
DataSciencePapers <- DataSciencePapers %>%
  mutate(field = str_extract(primary_category, "^[a-z,-]+"))
mosaic::tally(x = ~ field, margins = TRUE, data = DataSciencePapers) %>%
  sort()
# What are the most common fields for Data Science papers here?

# 'unnest_tokens' splits the text lines in each abstract into *tokens*, which here are words.
# The sorting below returns the most common words within each abstract --- not too interesting, as we see!
# install.packages("tidytext")
library(tidytext)
DataSciencePapers %>%
  unnest_tokens(word, abstract) %>%
  count(id, word, sort = TRUE)

## Now, ignore the 'stopwords' and focus only on "meaningful" words in our search.
## Note: To see the default list of stopwords in English, type:
# print(get_stopwords(), n = Inf)

### Using the 'unnest_tokens' function, the created object 'arxiv_words' is a "flattened" form of the 'DataSciencePapers' data frame
### where the last column is called 'word' and each row is one word of an abstract.
### The 'anti_join' function produces all rows of the FIRST table (here, the flattened table) that do NOT have
### a match in the second table (here, the list of stopwords).
### This 'anti_join' therefore produces a table with the stopwords REMOVED.
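## (Optional aside, not part of the book's code: a tiny made-up example of what 'anti_join' does here.
##  'toy_words' is a hypothetical mini-table; the rows whose 'word' matches a stopword are dropped.)
toy_words <- tibble(word = c("the", "tidy", "analysis", "of", "data"))
toy_words %>%
  anti_join(get_stopwords(), by = "word")
# should keep only "tidy", "analysis", and "data"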
arxiv_words <- DataSciencePapers %>%
  unnest_tokens(word, abstract) %>%
  anti_join(get_stopwords(), by = "word")
arxiv_words %>%
  count(id, word, sort = TRUE)

## Creating a variable called 'abstract_clean' that is like the 'abstract' variable,
## but which has all stopwords removed and all characters converted to lowercase:
arxiv_abstracts <- arxiv_words %>%
  group_by(id) %>%
  summarize(abstract_clean = paste(word, collapse = " "))

# Left-joining to incorporate the 'abstract_clean' information along with the information in the 'DataSciencePapers' table:
arxiv_papers <- DataSciencePapers %>%
  left_join(arxiv_abstracts, by = "id")

## Picking one single paper out of the data table and calling it 'single_paper':
single_paper <- arxiv_papers %>%
  filter(id == "1809.02408v2")

## The beginning of the original abstract of the 'single_paper':
single_paper %>%
  pull(abstract) %>%
  strwrap() %>%
  head()

## The beginning of the cleaned-up abstract of the 'single_paper':
single_paper %>%
  pull(abstract_clean) %>%
  strwrap() %>%
  head(4)

## Making a word cloud of the text in the abstracts:
# install.packages("wordcloud")
# install.packages("tm")
# install.packages("slam")
library(wordcloud)
# set.seed(1962)
# set.seed(1966)
# The appearance of the word cloud changes slightly with different seeds. Choose a particular seed to get replicable results...
arxiv_abstracts %>%
  pull(abstract_clean) %>%
  wordcloud(
    max.words = 40,
    scale = c(8, 1),
    colors = topo.colors(n = 30),
    random.color = TRUE
  )

## Sentiment analysis:
# install.packages("textdata")
library(textdata)
afinn <- get_sentiments("afinn")
# Uses the 'afinn' lexicon, which rates words using integers from most negative (-5) to most positive (5)

## Showing a sample of words and how positive or negative they are... run this a few times to see a good selection of words:
afinn %>%
  slice_sample(n = 15) %>%
  arrange(desc(value))

## Joining the 'arxiv_words' data frame to this lexicon, to find the sentiment value of each word in 'arxiv_words':
arxiv_words %>%
  inner_join(afinn, by = "word") %>%
  select(word, id, value)

## The left join is better here, since there may be some words in the abstracts that do not appear in the 'afinn' lexicon.
## In the left-joined table, these will have a missing sentiment value, which will be treated as 0 when we sum the words' sentiment values.
## For each paper (labeled by 'id'), we will sum the sentiment values for the words in its abstract.
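## (Optional aside, not part of the book's code: a quick check of how many words in the abstracts have
##  no match in the 'afinn' lexicon -- these get an NA sentiment value after the left join,
##  which is why na.rm = TRUE is used when summing below.)
arxiv_words %>%
  left_join(afinn, by = "word") %>%
  summarize(
    total_words = n(),
    unmatched_words = sum(is.na(value))  # words with no afinn sentiment score
  )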
## With 'mutate', we create a 'sentiment_per_word' variable that gives the average sentiment per word
## (so as not to favor shorter or longer abstracts):
arxiv_sentiments <- arxiv_words %>%
  left_join(afinn, by = "word") %>%
  group_by(id) %>%
  summarize(
    num_words = n(),
    sentiment = sum(value, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(sentiment_per_word = sentiment / num_words) %>%
  arrange(desc(sentiment))

# Printing the 10 most positive in terms of SUMMED sentiment values:
head(arxiv_sentiments, 10)

# Printing the 10 most positive in terms of AVERAGE (per-word) sentiment values:
arxiv_sentiments %>%
  arrange(desc(sentiment_per_word)) %>%
  head(10)

## Some summary statistics for the 'sentiment' and 'sentiment_per_word' variables across the 1089 abstracts:
arxiv_papers <- arxiv_papers %>%
  left_join(arxiv_sentiments, by = "id")
arxiv_papers %>%
  skim(sentiment, sentiment_per_word)
# ('skim' comes from the 'skimr' package; run library(skimr) first if it is not already attached.)

## Pulling out the abstract with the most positive sentiment and looking at it using 'strwrap':
most_positive <- arxiv_papers %>%
  filter(sentiment_per_word == max(sentiment_per_word)) %>%
  pull(abstract)
strwrap(most_positive)

## A time series plot of average sentiment plotted against submission date.
## Using the color aesthetic, we can create two curves based on whether the field was 'cs' (computer science) or not:
ggplot(
  arxiv_papers,
  aes(x = submitted, y = sentiment_per_word, color = field == "cs")
) +
  geom_smooth(se = TRUE) +
  scale_color_brewer("Computer Science?", palette = "Set2") +  # scale_color_brewer lets us pick a color palette
  labs(x = "Date submitted", y = "Sentiment score per word")

## Finding all the bigrams in the 'arxiv_papers' data:
arxiv_bigrams <- arxiv_papers %>%
  unnest_tokens(
    arxiv_bigram,
    abstract_clean,
    token = "ngrams",
    n = 2
  ) %>%
  select(arxiv_bigram, id)  # only selecting two of the columns
arxiv_bigrams

## How many times each bigram appears, sorted from most to least:
arxiv_bigrams %>%
  count(arxiv_bigram, sort = TRUE)
# What do you notice about the 4th most common bigram?

### Recall 'arxiv_words' is a "flattened" form of the 'DataSciencePapers' data frame (with stopwords removed)
### where the last column is called 'word' and each row is one word of an abstract.

## Using 'count' and 'arrange', we can list the most common words in the collection of abstracts:
arxiv_words %>%
  count(word) %>%
  arrange(desc(n)) %>%
  head()

## The 'bind_tf_idf' function will calculate and print the
## tf (term frequency), idf (inverse document frequency), and tf_idf (product of tf and idf)
## for all terms in the collection of documents.
## We must specify which variable identifies each document in the collection -- here, 'id' identifies the document.
tidy_DTM <- arxiv_words %>%
  count(id, word) %>%
  bind_tf_idf(word, id, n)

## This shows the words with the largest tf values:
tidy_DTM %>%
  arrange(desc(tf)) %>%
  head()

## The same table, but showing the words with the largest idf values:
tidy_DTM %>%
  arrange(desc(idf), desc(n)) %>%
  head()

## Which abstract contains the word 'wildfire', which has such a high idf score?
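## (Optional aside, not part of the book's code: the idf values can be reproduced by hand, since
##  idf(word) = log(N / number of documents containing the word), with N the number of abstracts.
##  A word like 'wildfire' has a high idf precisely because it appears in very few abstracts.
##  The helper objects 'n_docs' and 'n_docs_wildfire' are just for this aside.)
n_docs <- n_distinct(arxiv_words$id)    # total number of abstracts in the corpus
n_docs_wildfire <- tidy_DTM %>%
  filter(word == "wildfire") %>%
  nrow()                                # number of abstracts containing 'wildfire'
log(n_docs / n_docs_wildfire)           # should match the idf value shown for 'wildfire' above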
arxiv_papers %>%
  pull(abstract) %>%
  str_subset("wildfire") %>%
  strwrap() %>%
  head()

## Look at how many abstracts the word 'implications' appears in:
tidy_DTM %>%
  filter(word == "implications")

## This will print the words in a specific document (the one with id 1809.02408v2) that have the highest tf_idf scores:
## (Recall that tf_idf is a characteristic of the COMBINATION of the word AND the document.)
tidy_DTM %>%
  filter(id == "1809.02408v2") %>%
  arrange(desc(tf_idf)) %>%
  head()
# These are words that appear frequently in THIS document, but not so frequently in the whole collection of documents.

## This result shows the abstracts in which the word "covid" appears much more frequently than it appears across the whole corpus of documents:
tidy_DTM %>%
  filter(word == "covid") %>%
  arrange(desc(tf_idf)) %>%
  head() %>%
  left_join(select(arxiv_papers, id, abstract), by = "id")

## This lists the document-term combinations with the highest tf_idf scores:
tidy_DTM %>%
  arrange(desc(tf_idf)) %>%
  head() %>%
  left_join(select(arxiv_papers, id, abstract), by = "id")

## cast_dtm calculates the document-term matrix.
## By default, the entries for a particular document-term combination would be the term frequency in that document,
## but the option
##   weighting = tm::weightTfIdf
## specifies that we want the entries to be the normalized tf_idf values.
tm_DTM <- arxiv_words %>%
  count(id, word) %>%
  cast_dtm(id, word, n, weighting = tm::weightTfIdf)
tm_DTM
# Sparsity of 99% indicates that most words do not appear in most documents.

## This will find all the terms with tf_idf scores (since that's what's in tm_DTM here) that are at least 7 (summed across documents):
tm::findFreqTerms(tm_DTM, lowfreq = 7)

## Printing the terms with the highest tf_idf scores (summed across documents):
tm_DTM %>%
  as.matrix() %>%
  as_tibble() %>%
  map_dbl(sum) %>%
  sort(decreasing = TRUE) %>%
  head()

## The 'findAssocs' function here will find words that tend to appear in the same documents as the word "causal" does:
tm::findAssocs(tm_DTM, terms = "causal", corlimit = 0.35)

################################################################################
#
# TEXT ANALYSIS OF ARTICLE BY Prof. Hitchcock
#
################################################################################

# Importing the article from a text file on the web:
DBH_url <- "https://people.stat.sc.edu/hitchcock/HistoryStatisticsCourseTASrev.txt"
DBH_raw <- RCurl::getURL(DBH_url)
length(DBH_raw)
nchar(DBH_raw)

# str_split returns a list: we only want the first element
dbh <- DBH_raw %>%
  str_split("\r\n") %>%
  pluck(1)  # plucks the first element from the list
length(dbh)
head(dbh, 25)

## This uses some tools we've mentioned to create a tibble recording the line on which each section of the article begins...
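## (Optional aside with made-up strings, not part of the article analysis: what does the pattern "^[1-9] +"
##  used below actually match? A line that STARTS with a single digit 1-9 followed by at least one space,
##  i.e., a numbered section heading. 'toy_lines' is hypothetical.)
toy_lines <- c("1  Introduction", "Some body text", "2  Course Structure", "Not a heading: 3 credits")
str_which(toy_lines, "^[1-9] +")    # positions of the matching lines (1 and 3 here)
str_subset(toy_lines, "^[1-9] +")   # the matching lines themselves
str_extract(toy_lines, "^[1-9] +")  # the leading section numbers (with trailing spaces); NA where no match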
sections <- tibble(
  line = str_which(dbh, "^[1-9] +"),
  line_text = str_subset(dbh, "^[1-9] +"),
  labels = str_extract(line_text, "^[1-9] +")
)

# install.packages("tidytext")
library(tidytext)

d <- tibble(txt = dbh)
d
d %>%
  unnest_tokens(output = word, input = txt)

d_clean <- d %>%
  unnest_tokens(output = word, input = txt) %>%
  anti_join(get_stopwords(), by = "word")

# Default word cloud:
wordcloud(d_clean$word)

# Formatted word cloud:
wordcloud(
  d_clean$word,
  max.words = 40,
  scale = c(5, 1),
  colors = topo.colors(n = 30),
  random.color = TRUE
)

# Section analysis:
num_lines_per_section <- diff(c(1, sections$line, nrow(d) + 1))
Section_ID <- rep(0:6, num_lines_per_section)
# Cleaning up the short end-sections:
# 7 = Supplementary Material, 8 = Acknowledgments, 9 = Disclosure, 10 = References
Section_ID[189:191] <- 7
Section_ID[192:195] <- 8
Section_ID[196:198] <- 9
Section_ID[199:226] <- 10
d_w_section <- data.frame(d, Section_ID)

## Cleaning out stopwords separately by section:
clean_stopwords <- function(x) {
  result <- x %>%
    unnest_tokens(output = word, input = txt) %>%
    anti_join(get_stopwords(), by = "word")
  return(result)
}

d_clean_by_section <- vector("list", max(d_w_section$Section_ID) + 1)  # +1 to account for the "Section 0"
for (i in 0:max(d_w_section$Section_ID)) {
  d_clean_by_section[[i + 1]] <- d_w_section %>%  # +1 to account for the "Section 0"
    filter(Section_ID == i) %>%
    clean_stopwords()
}
d_clean_all_sections <- bind_rows(d_clean_by_section)

## Sentiment analysis:
# install.packages("textdata")
library(textdata)
afinn <- get_sentiments("afinn")
# Uses the 'afinn' lexicon, which rates words using integers from most negative (-5) to most positive (5)

dbh_sentiments <- d_clean_all_sections %>%
  left_join(afinn, by = "word") %>%
  group_by(Section_ID) %>%
  summarize(
    num_words = n(),
    sentiment = sum(value, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(sentiment_per_word = sentiment / num_words) %>%
  arrange(desc(sentiment))

# Printing the most positive in terms of SUMMED sentiment values:
head(dbh_sentiments, 11)

# Printing the most positive in terms of AVERAGE (per-word) sentiment values:
dbh_sentiments %>%
  arrange(desc(sentiment_per_word)) %>%
  head(11)

# Recall:
sections$line_text
# 7 = Supplementary Material, 8 = Acknowledgments, 9 = Disclosure, 10 = References

# Printing the most positive (only counting the "real" sections, 1-6) in terms of AVERAGE (per-word) sentiment values:
dbh_sentiments %>%
  filter(Section_ID %in% 1:6) %>%
  arrange(desc(sentiment_per_word)) %>%
  head(10)

## Pulling out section 8 (the one with the most positive sentiment) and looking at it using 'strwrap':
strwrap(d_clean_all_sections$word[d_clean_all_sections$Section_ID == 8])
d_clean_all_sections %>%
  filter(Section_ID == 8) %>%
  left_join(afinn, by = "word")

## Bigrams and N-grams:
## 4-grams:
## Finding all the 4-grams in the article:
dbh_4grams <- d %>%
  unnest_tokens(output = dbh_4gram, input = txt, token = "ngrams", n = 4) %>%
  select(dbh_4gram)  # only selecting the one column of 4-grams
dbh_4grams

## How many times each 4-gram appears, sorted from most to least:
dbh_4grams %>%
  count(dbh_4gram, sort = TRUE)

## Most common words in the whole article:
d_clean %>%
  count(word) %>%
  arrange(desc(n)) %>%
  head()

## The 'bind_tf_idf' function will calculate and print the
## tf (term frequency), idf (inverse document frequency), and tf_idf (product of tf and idf)
## for all terms in the collection of documents.
## Here, each "document" is a section and the "collection" is the whole article -- 'Section_ID' identifies the document.
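## (Optional aside, not part of the book's code: the same tf, idf, and tf_idf values can be computed
##  "by hand" with dplyr, which makes the definitions explicit:
##  tf = n / (total words in the section), idf = log(#sections / #sections containing the word),
##  and tf_idf = tf * idf. The name 'hand_tf_idf' is just for this illustration.)
hand_tf_idf <- d_clean_all_sections %>%
  filter(Section_ID %in% 1:6) %>%
  count(Section_ID, word) %>%
  group_by(Section_ID) %>%
  mutate(tf = n / sum(n)) %>%               # term frequency within each section
  ungroup() %>%
  add_count(word, name = "n_sections") %>%  # number of sections containing each word
  mutate(
    idf = log(n_distinct(Section_ID) / n_sections),
    tf_idf = tf * idf
  )
hand_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  head()
# These values should agree with the output of bind_tf_idf() just below.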
dbh_DTM <- d_clean_all_sections %>%
  filter(Section_ID %in% 1:6) %>%
  count(Section_ID, word) %>%
  bind_tf_idf(word, Section_ID, n)

## This shows the words with the largest tf values:
dbh_DTM %>%
  arrange(desc(tf)) %>%
  head()

## The same table, but showing the words with the largest idf values:
dbh_DTM %>%
  arrange(desc(idf), desc(n)) %>%
  head()

## The same table, but showing the word/section combinations with the largest tf_idf values:
dbh_DTM %>%
  arrange(desc(tf_idf), desc(n)) %>%
  head()

## cast_dtm calculates the document-term matrix.
## Here, the entries for a particular section-term combination are the term frequency in that section (the default choice):
dbh_tm_DTM <- d_clean_all_sections %>%
  filter(Section_ID %in% 1:6) %>%
  count(Section_ID, word) %>%
  cast_dtm(Section_ID, word, n)
dbh_tm_DTM

## The 'findAssocs' function here will find words that tend to appear in the same sections as the word "guest" does:
tm::findAssocs(dbh_tm_DTM, terms = "guest", corlimit = 0.92)

################################################################################
#
# Extended example: TEXT ANALYSIS OF Beatles song titles
#
################################################################################

library(rvest)
url <- "http://en.wikipedia.org/wiki/List_of_songs_recorded_by_the_Beatles"
tables <- url %>%
  read_html() %>%
  html_nodes("table")
Beatles_songs <- tables %>%
  purrr::pluck(3) %>%
  html_table(fill = TRUE) %>%
  janitor::clean_names() %>%
  select(song, lead_vocal_s_d)
glimpse(Beatles_songs)

## Removing the quotation marks (denoted WITHIN the string as \") from the 'song' variable, and renaming the 'lead vocal' variable:
Beatles_songs <- Beatles_songs %>%
  mutate(song = str_remove_all(song, pattern = '\\"')) %>%
  rename(vocals = lead_vocal_s_d)
glimpse(Beatles_songs)

## Counts for each vocalist (or combination of vocalists):
Beatles_songs %>%
  group_by(vocals) %>%
  count() %>%
  arrange(desc(n))
# The table of results is slightly different from what's shown in the book ...
# Wikipedia must have updated its data ...

## Comparing the number of songs on which Paul McCartney sang lead vocals to the number on which John Lennon did
## (each count includes songs with shared lead vocals):
Beatles_songs %>%
  pull(vocals) %>%
  str_subset("McCartney") %>%
  length()
Beatles_songs %>%
  pull(vocals) %>%
  str_subset("Lennon") %>%
  length()

## The number of songs sung by McCartney or by John Lennon or by both:
Beatles_songs %>%
  pull(vocals) %>%
  str_subset("(McCartney|Lennon)") %>%
  length()

## The number of songs sung by both Lennon AND McCartney, NOT including those with sole vocals from either:
pj_regexp <- "(McCartney|Lennon).*(McCartney|Lennon)"
Beatles_songs %>%
  pull(vocals) %>%
  str_subset(pj_regexp) %>%
  length()

## Using 'str_detect' to get the same result as above:
Beatles_songs %>%
  filter(str_detect(vocals, pj_regexp)) %>%
  nrow()

# And listing the first few of these songs on which they both sang:
Beatles_songs %>%
  filter(str_detect(vocals, pj_regexp)) %>%
  select(song, vocals) %>%
  head()

## Listing the most common words in Beatles song titles (removing stopwords):
Beatles_songs %>%
  unnest_tokens(word, song) %>%
  anti_join(get_stopwords(), by = "word") %>%
  count(word, sort = TRUE) %>%
  arrange(desc(n)) %>%
  head()

## An analysis by albums and by year:
# The full data set with more columns, including the album each song appeared on ...
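## (Optional aside, not part of the book's code: peeking at the cleaned-up column names of the
##  Wikipedia table before renaming them below -- 'janitor::clean_names' is what produces
##  names like 'lead_vocal_s_d'.)
tables %>%
  purrr::pluck(3) %>%
  html_table(fill = TRUE) %>%
  janitor::clean_names() %>%
  names()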
Beatles_songs_full <- tables %>%
  purrr::pluck(3) %>%
  html_table(fill = TRUE) %>%
  janitor::clean_names() %>%
  rename(
    vocals = lead_vocal_s_d,
    album = core_catalogue_release_s,
    writer = songwriter_s
  ) %>%
  mutate(
    song = str_remove_all(song, pattern = '\\"'),
    album = str_remove_all(album, pattern = '\\"')
  ) %>%
  select(-ref_s)
glimpse(Beatles_songs_full)

unique(Beatles_songs_full$album)
# Some of the album titles could use some cleaning up ...
mypatterns <- c("Let It BePast Masters", "Past Masters", "Magical Mystery Tour")
Beatles_songs_full$album <- ifelse(Beatles_songs_full$album == mypatterns[1],
                                   "Let It Be", Beatles_songs_full$album)
Beatles_songs_full$album <- ifelse(str_detect(Beatles_songs_full$album, mypatterns[2]),
                                   "Past Masters", Beatles_songs_full$album)
Beatles_songs_full$album <- ifelse(str_detect(Beatles_songs_full$album, mypatterns[3]),
                                   "Magical Mystery Tour", Beatles_songs_full$album)
unique(Beatles_songs_full$album)
# Looks better!

## Did the length of Beatles song titles change over time?
ggplot(data = Beatles_songs_full, aes(x = year, y = nchar(song))) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE) +
  ylab("Number of Characters in Song Title")

## Was the length of Beatles song titles different across albums?
ggplot(data = Beatles_songs_full, aes(x = album, y = nchar(song))) +
  geom_boxplot() +
  ylab("Number of Characters in Song Title") +
  scale_x_discrete(guide = guide_axis(n.dodge = 2))  # so that the album titles don't run into each other...

# Or could make the boxplots horizontal...
ggplot(data = Beatles_songs_full, aes(x = album, y = nchar(song))) +
  geom_boxplot() +
  ylab("Number of Characters in Song Title") +
  coord_flip()

## A quick sentiment analysis:
Beatles_words <- Beatles_songs_full %>%
  unnest_tokens(word, song) %>%
  anti_join(get_stopwords(), by = "word")

# install.packages("textdata")
library(textdata)
afinn <- get_sentiments("afinn")
# Uses the 'afinn' lexicon, which rates words using integers from most negative (-5) to most positive (5)

## Joining the 'Beatles_words' data frame to this lexicon, to find the sentiment value of each word in 'Beatles_words':
Beatles_words %>%
  inner_join(afinn, by = "word") %>%
  select(word, year, album, value)

Beatles_sentiments_by_album <- Beatles_words %>%
  left_join(afinn, by = "word") %>%
  group_by(album) %>%
  summarize(
    num_words = n(),
    sentiment = sum(value, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(sentiment_per_word = sentiment / num_words) %>%
  arrange(desc(sentiment))

# Printing the most positive in terms of AVERAGE (per-word) sentiment values:
Beatles_sentiments_by_album %>%
  arrange(desc(sentiment_per_word)) %>%
  head(15)
## Not sure this is very valuable ...
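## (Optional aside, not part of the book's code: a quick visualization of the per-word sentiment scores
##  by album computed above, which may be easier to scan than the printed table.)
ggplot(
  Beatles_sentiments_by_album,
  aes(x = reorder(album, sentiment_per_word), y = sentiment_per_word)
) +
  geom_col() +
  coord_flip() +
  labs(x = "Album", y = "Sentiment score per word in song titles")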