---
title: "Blog 4 -Supervised Learning Part 1"
author: "Rhowena Vespa"
desription: "Dictionary and Polarity scores"
date: "10/29/2022"
format:
html:
toc: true
code-fold: true
code-copy: true
code-tools: true
categories:
- Blog 4
- Polarity1
- dictionary
- healthcare
- supervised learning
- ggplot2
---
```{r}
#| label: setup
#| warning: false
knitr::opts_chunk$set(echo = TRUE, warning = FALSE)
```
```{r}
library(readr)
library(dplyr)
library(quanteda)
library(quanteda.textstats)
library(quanteda.textplots)
library(ggplot2)
library(DT)
library(tm)
library(stringr)
library(tidytext)
library(plyr)
library(tidyverse)
library(quanteda.textmodels)
library(devtools)
library(caret)
library(quanteda.dictionaries)
#library(devtools)
#devtools::install_github("kbenoit/quanteda.dictionaries")
library(syuzhet)
#remotes::install_github("quanteda/quanteda.sentiment")
library(quanteda.sentiment)
```
This fourth blog post begins the supervised machine learning work from weeks 7 and 8. Using tweet replies as the corpus, sentiment and polarity scores are calculated and visualized. The next post will continue with supervised learning and build models for polarity classification.
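As a minimal sketch of the dictionary approach used throughout this post (two made-up replies; `data_dictionary_NRC` comes from quanteda.dictionaries, loaded above): tokenize the text, build a document-feature matrix, and count matches against the NRC emotion categories.
```{r}
# toy example only: two invented replies, scored against the NRC dictionary
toy <- corpus(c(t1 = "Great job lowering drug prices!",
                t2 = "Inflation is a disaster and prices keep rising."))
tokens(toy) %>%
  dfm() %>%
  dfm_lookup(data_dictionary_NRC)
```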
```{r}
# Read in data
IRA <- read_csv("IRA_med.csv")
# remove @-mentions (Twitter handles can contain letters, digits, and underscores)
IRA$text <- gsub("@[[:alnum:]_]*", "", IRA$text)
```
```{r}
IRA_corpus <- corpus(IRA, text_field = "text")
```
# Tokenize and stem
```{r}
IRA_tokens <- tokens(IRA_corpus)
IRA_tokens <- tokens_wordstem(IRA_tokens)
```
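As a quick check of what `tokens_wordstem()` does, here is a toy example on a made-up sentence (Snowball stemming reduces inflected forms, e.g. "prices" to "price" and "rising" to "rise"):
```{r}
# toy example: stemming collapses inflected word forms
tokens_wordstem(tokens("Prices are rising and nothing has been won"))
```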
# NRC sentiment (week 8 lecture)
```{r}
# use liwcalike() to estimate sentiment using NRC dictionary
IRAreviewSentiment_nrc <- liwcalike(IRA_corpus, data_dictionary_NRC)
names(IRAreviewSentiment_nrc)
```
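For reference, `liwcalike()` returns one row per document with LIWC-style summary columns (docname, Segment, WPS, WC, Sixltr, Dic), a percentage-of-words column for each NRC category (anger, anticipation, disgust, fear, joy, negative, positive, sadness, surprise, trust), and punctuation measures (AllPunc, Period, Comma, and so on). The positive and negative columns are what the histograms below plot.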
```{r}
ggplot(IRAreviewSentiment_nrc) +
  geom_histogram(aes(x = positive)) +
  theme_bw()
```
```{r}
# replies in which more than 15% of the words are NRC-positive
IRA_corpus[which(IRAreviewSentiment_nrc$positive > 15)]
```
```{r}
ggplot(IRAreviewSentiment_nrc) +
  geom_histogram(aes(x = negative)) +
  theme_bw()
```
```{r}
# replies in which more than 15% of the words are NRC-negative
IRA_corpus[which(IRAreviewSentiment_nrc$negative > 15)]
```
```{r}
# create a full dfm for comparison
IRA_Dfm <- tokens(IRA_corpus,
                  remove_punct = TRUE,
                  remove_symbols = TRUE,
                  remove_numbers = TRUE,
                  remove_url = TRUE,
                  split_hyphens = FALSE,
                  include_docvars = TRUE) %>%
  tokens_tolower() %>%
  dfm()
head(IRA_Dfm, 10)
dim(IRA_Dfm)
# convert corpus to dfm using the dictionary
IRADfm_nrc <- tokens(IRA_corpus,
                     remove_punct = TRUE,
                     remove_symbols = TRUE,
                     remove_numbers = TRUE,
                     remove_url = TRUE,
                     split_hyphens = FALSE,
                     include_docvars = TRUE) %>%
  tokens_tolower() %>%
  dfm() %>%
  dfm_lookup(data_dictionary_NRC)
dim(IRADfm_nrc)
head(IRADfm_nrc, 10)
class(IRADfm_nrc)
```
# POLARITY
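The polarity score computed below is the net-sentiment ratio

$$
\text{polarity} = \frac{\text{positive} - \text{negative}}{\text{positive} + \text{negative}},
$$

which runs from -1 (only negative hits) to +1 (only positive hits). Replies with no positive or negative hits are assigned a polarity of 0 to avoid dividing by zero.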
```{r}
IRAdf_nrc <- convert(IRADfm_nrc, to = "data.frame")
names(IRAdf_nrc)
IRAdf_nrc$polarity <- (IRAdf_nrc$positive - IRAdf_nrc$negative)/(IRAdf_nrc$positive + IRAdf_nrc$negative)
IRAdf_nrc$polarity[(IRAdf_nrc$positive + IRAdf_nrc$negative) == 0] <- 0
ggplot(IRAdf_nrc) +
  geom_histogram(aes(x = polarity)) +
  theme_bw()
```
# NEW DATAFRAME WITH TEXT AND POLARITY
```{r}
# build a data frame of the cleaned tweet texts and bind it to the NRC counts
IRA_text_df <- data.frame(text = as.character(IRA_corpus))
IRAdf_nrc_CBIND <- cbind(IRAdf_nrc, IRA_text_df)
```
```{r}
# make sure the text column is plain character before building a corpus from it
IRAdf_nrc_CBIND$text <- as.character(IRAdf_nrc_CBIND$text)
typeof(IRAdf_nrc_CBIND$text)
```
# NEW CORPUS with polarity scores
```{r}
# corpus with the NRC counts and polarity attached as docvars
IRApolarity_corpus <- corpus(IRAdf_nrc_CBIND, text_field = "text")
```
```{r}
writeLines(head(IRA_corpus[which(IRAdf_nrc$polarity == 1)]))
```
# APPLY DICTIONARY within context
```{r}
# tokenize corpus
IRAtokens <- tokens(IRA_corpus, remove_punct = TRUE)
# what are the context (target) words or phrases
IRA_words <- c("inflation","POTUS", "price*","joe", "biden", "trump","medicare","drug","cost","america*","won","lost")
```
```{r}
# retain only our tokens and their context
IRAtokens_HC <- tokens_keep(IRAtokens, pattern = phrase(IRA_words), window = 40)
```
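To see what the `window` argument keeps, here is a toy example with a made-up sentence and a small window: only the matched target token and up to two tokens on either side survive.
```{r}
# toy example: keep "price*" matches plus two neighbouring tokens on each side
toy <- tokens("gas and grocery prices keep climbing while wages stay flat")
tokens_keep(toy, pattern = "price*", window = 2)
```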
```{r}
# keep only the negative and positive keys of the LSD2015 dictionary
# (drops the neg_positive and neg_negative negation categories)
IRAdata_dictionary_LSD2015_pos_neg <- data_dictionary_LSD2015[1:2]
IRAtokens_HC_lsd <- tokens_lookup(IRAtokens_HC,
                                  dictionary = IRAdata_dictionary_LSD2015_pos_neg)
```
# Convert to dfm
```{r}
IRAdfm_HC <- dfm(IRAtokens_HC_lsd)
head(IRAdfm_HC, 10)
```
```{r}
# convert to data frame
IRAmat_HC <- convert(IRAdfm_HC, to = "data.frame")
```
```{r}
# drop replies with no positive or negative hits (they would give 0/0 in the polarity score)
IRAmat_HC <- IRAmat_HC[which((IRAmat_HC$negative + IRAmat_HC$positive) > 0), ]
```
```{r}
# print a little summary info
paste("We have ",nrow(IRAmat_HC)," tweets replies that mention positive or negative words in the context of Inflation terms.", sep="")
```
```{r}
# create polarity scores
IRAmat_HC$polarity <- (IRAmat_HC$positive - IRAmat_HC$negative)/(IRAmat_HC$positive + IRAmat_HC$negative)
# summary
summary(IRAmat_HC$polarity)
# plot
ggplot(IRAmat_HC) +
  geom_histogram(aes(x = polarity)) +
  theme_bw()
```
## REFERENCES
1. The White House (2022). "BY THE NUMBERS: The Inflation Reduction Act." Available at: <https://www.whitehouse.gov/briefing-room/statements-releases/2022/08/15/by-the-numbers-the-inflation-reduction-act/> [Accessed 15 October 2022].
2. Biden, J. [@POTUS] (2022, October 15). "We pay more for our prescription drugs than any other nation in the world. It's outrageous. But now, instead of money going into the pockets of drug companies, it will go into your pockets in the form of lower drug prices." Twitter. Retrieved October 15, 2022, from https://twitter.com/POTUS/status/1581374573815431168
3. Silge, J. and Robinson, D. (n.d.). Text Mining with R: A Tidy Approach. Retrieved October 15, 2022, from https://www.tidytextmining.com/