Nayan Jani
Topic Modeling

Nayan Jani


December 9, 2022


# Set echo = TRUE as the default knitr chunk option for the whole document,
# so the source code of each chunk is shown in the rendered output.
knitr::opts_chunk$set(echo = TRUE)


The FIFA 2022 World Cup has captured almost everyone’s attention this year. However, much of that attention has focused on the human rights violations surrounding the tournament. The host nation, Qatar, has been under pressure over these violations because of its treatment of foreigners in the country and its failure to be inclusive. Migrant workers who helped build the stadiums were mistreated, underpaid, overworked, and in some cases killed in the lead-up to the World Cup. The host nation has also not allowed the LGBTQ community to represent themselves, because of the host nation’s beliefs. Soccer fans from both cultures (host nation vs. foreigners) have argued over which values should be respected at the global level.


Find the overall Sentiment of the comments (Positive and Negative, Other emotions)

What is the main focus in the comments (what topic is most important to the people in the comments)

Based on the most important topics and the sentiment of the related comments, are those comments correctly classified as positive or negative? If so, what is the comments’ point of view (Western culture vs. Middle Eastern culture)? Are those comments “socially correct” (logical/acceptable POV vs. stereotyped/stigmatized POV)?

Read in Data

# Load the comments file for the "BBC" video and build a quanteda corpus.
df_bbc <- read_csv("_data/comments_bbc.csv")

# The comment text sits in the column named "i"; keep only that column,
# renamed to `text`, and clean it:
#  * decode the HTML apostrophe entity "&#39;" to "'". Deleting every
#    literal "39" left "&#;" debris in the text and also removed those
#    digits from unrelated numbers (e.g. "1939");
#  * drop any angle brackets.
df_bbc <- df_bbc %>%
  rename(text = "i") %>%
  select(text) %>%
  mutate(
    text = str_replace_all(text, fixed("&#39;"), "'"),
    text = str_remove_all(text, "[<>]")
  )

corpus_bbc <- corpus(df_bbc)

# Store the corpus summary, tagged with the source video, as the corpus'
# document variables.
corpus_bbc_summary <- summary(corpus_bbc)
corpus_bbc_summary$video <- "BBC"
docvars(corpus_bbc) <- corpus_bbc_summary

# Load the comments file for the "Maqwell" video and build a quanteda corpus.
df_q <- read_csv("_data/comments_q.csv")

# NOTE(review): the column being renamed is itself a comment sentence, which
# suggests the CSV has no header row and its first comment is consumed as the
# column name -- confirm against the raw file.
#
# Cleaning:
#  * decode the HTML apostrophe entity "&#39;" to "'". Deleting every
#    literal "39" left "&#;" debris in the text and also removed those
#    digits from unrelated numbers (e.g. "1939");
#  * drop any angle brackets.
df_q <- df_q %>%
  rename(
    text = "I’ll try to get the next video essay out in less than a month lol"
  ) %>%
  select(text) %>%
  mutate(
    text = str_replace_all(text, fixed("&#39;"), "'"),
    text = str_remove_all(text, "[<>]")
  )

corpus_q <- corpus(df_q)

# Store the corpus summary, tagged with the source video, as the corpus'
# document variables.
corpus_q_summary <- summary(corpus_q)
corpus_q_summary$video <- "Maqwell"
docvars(corpus_q) <- corpus_q_summary

# Load the comments file for the "MaqwellRev" video and build a quanteda
# corpus.
df_qRev <- read_csv("_data/comments_qRev.csv")

# NOTE(review): the column being renamed is itself a comment sentence, which
# suggests the CSV has no header row and its first comment is consumed as the
# column name -- confirm against the raw file.
#
# Cleaning:
#  * decode the HTML apostrophe entity "&#39;" to "'". Deleting every
#    literal "39" left "&#;" debris in the text and also removed those
#    digits from unrelated numbers (e.g. "1939");
#  * drop any angle brackets.
df_qRev <- df_qRev %>%
  rename(
    text = "I’ll try to get the next video essay out in less than a month lol"
  ) %>%
  select(text) %>%
  mutate(
    text = str_replace_all(text, fixed("&#39;"), "'"),
    text = str_remove_all(text, "[<>]")
  )

corpus_qRev <- corpus(df_qRev)

# Store the corpus summary, tagged with the source video, as the corpus'
# document variables.
corpus_qRev_summary <- summary(corpus_qRev)
corpus_qRev_summary$video <- "MaqwellRev"
docvars(corpus_qRev) <- corpus_qRev_summary

# Load the comments file for the "sky" video and build a quanteda corpus.
df_sky <- read_csv("_data/comments_sky.csv")

# The comment text sits in the column named "i"; keep only that column,
# renamed to `text`, and clean it:
#  * decode the HTML apostrophe entity "&#39;" to "'". Deleting every
#    literal "39" left "&#;" debris in the text and also removed those
#    digits from unrelated numbers (e.g. "1939");
#  * drop any angle brackets.
df_sky <- df_sky %>%
  rename(text = "i") %>%
  select(text) %>%
  mutate(
    text = str_replace_all(text, fixed("&#39;"), "'"),
    text = str_remove_all(text, "[<>]")
  )

corpus_sky <- corpus(df_sky)

# Store the corpus summary, tagged with the source video, as the corpus'
# document variables.
corpus_sky_summary <- summary(corpus_sky)
corpus_sky_summary$video <- "sky"
docvars(corpus_sky) <- corpus_sky_summary

# Load the comments_bbcQ file (labelled "BBC") and build a quanteda corpus.
df_bbcQ <- read_csv("_data/comments_bbcQ.csv")

# The comment text sits in the column named "i"; keep only that column,
# renamed to `text`, and clean it:
#  * decode the HTML apostrophe entity "&#39;" to "'". Deleting every
#    literal "39" left "&#;" debris in the text and also removed those
#    digits from unrelated numbers (e.g. "1939");
#  * drop any angle brackets.
df_bbcQ <- df_bbcQ %>%
  rename(text = "i") %>%
  select(text) %>%
  mutate(
    text = str_replace_all(text, fixed("&#39;"), "'"),
    text = str_remove_all(text, "[<>]")
  )

corpus_bbcQ <- corpus(df_bbcQ)

# Corpus summary tagged with the source video (kept as a separate table; it
# is not attached to the corpus as document variables here).
corpus_bbcQ_summary <- summary(corpus_bbcQ)
corpus_bbcQ_summary$video <- "BBC"

# Load the comments_bbcOL file (labelled "BBC") and build a quanteda corpus.
df_bbcOL <- read_csv("_data/comments_bbcOL.csv")

# The comment text sits in the column named "i"; keep only that column,
# renamed to `text`, and clean it:
#  * decode the HTML apostrophe entity "&#39;" to "'". Deleting every
#    literal "39" left "&#;" debris in the text and also removed those
#    digits from unrelated numbers (e.g. "1939");
#  * drop any angle brackets.
df_bbcOL <- df_bbcOL %>%
  rename(text = "i") %>%
  select(text) %>%
  mutate(
    text = str_replace_all(text, fixed("&#39;"), "'"),
    text = str_remove_all(text, "[<>]")
  )

corpus_bbcOL <- corpus(df_bbcOL)

# Corpus summary tagged with the source video (kept as a separate table; it
# is not attached to the corpus as document variables here).
corpus_bbcOL_summary <- summary(corpus_bbcOL)
corpus_bbcOL_summary$video <- "BBC"

# Load the comments file for the "Business Insider" video and build a
# quanteda corpus.
df_BI <- read_csv("_data/comments_BI.csv")

# The comment text sits in the column named "i"; keep only that column,
# renamed to `text`, and clean it:
#  * decode the HTML apostrophe entity "&#39;" to "'". Deleting every
#    literal "39" left "&#;" debris in the text and also removed those
#    digits from unrelated numbers (e.g. "1939");
#  * drop any angle brackets.
df_BI <- df_BI %>%
  rename(text = "i") %>%
  select(text) %>%
  mutate(
    text = str_replace_all(text, fixed("&#39;"), "'"),
    text = str_remove_all(text, "[<>]")
  )

corpus_BI <- corpus(df_BI)

# Corpus summary tagged with the source video (kept as a separate table; it
# is not attached to the corpus as document variables here).
corpus_BI_summary <- summary(corpus_BI)
corpus_BI_summary$video <- "Business Insider"

# Load the comments_fra file and build a quanteda corpus.
df_fra <- read_csv("_data/comments_fra.csv")

# The comment text sits in the column named "i"; keep only that column,
# renamed to `text`, and clean it:
#  * decode the HTML apostrophe entity "&#39;" to "'". Deleting every
#    literal "39" left "&#;" debris in the text and also removed those
#    digits from unrelated numbers (e.g. "1939");
#  * drop any angle brackets.
df_fra <- df_fra %>%
  rename(text = "i") %>%
  select(text) %>%
  mutate(
    text = str_replace_all(text, fixed("&#39;"), "'"),
    text = str_remove_all(text, "[<>]")
  )

corpus_fra <- corpus(df_fra)

# Corpus summary tagged with the source video (kept as a separate table; it
# is not attached to the corpus as document variables here).
# NOTE(review): the label reads "France 21" -- confirm whether "France 24"
# was intended.
corpus_fra_summary <- summary(corpus_fra)
corpus_fra_summary$video <- "France 21"

# Load the comments file for the "Harris" video and build a quanteda corpus.
df_H <- read_csv("_data/comments_H.csv")

# The comment text sits in the column named "i"; keep only that column,
# renamed to `text`, and clean it:
#  * decode the HTML apostrophe entity "&#39;" to "'". Deleting every
#    literal "39" left "&#;" debris in the text and also removed those
#    digits from unrelated numbers (e.g. "1939");
#  * drop any angle brackets.
df_H <- df_H %>%
  rename(text = "i") %>%
  select(text) %>%
  mutate(
    text = str_replace_all(text, fixed("&#39;"), "'"),
    text = str_remove_all(text, "[<>]")
  )

corpus_H <- corpus(df_H)

# Corpus summary tagged with the source video (kept as a separate table; it
# is not attached to the corpus as document variables here).
corpus_H_summary <- summary(corpus_H)
corpus_H_summary$video <- "Harris"
# Stack the cleaned comment tables of all nine videos into one data frame and
# give every comment a character id equal to its row position.
full_df <- rbind(
  df_bbc, df_q, df_qRev, df_sky, df_bbcQ, df_bbcOL, df_BI, df_fra, df_H
)
# seq_len() instead of 1:nrow(): with zero rows, 1:0 would yield c(1, 0).
full_df$id <- as.character(seq_len(nrow(full_df)))

# A tibble: 6 × 2
  text                                                                     id   
  <chr>                                                                    <chr>
1 Looking forward to it if you just act normal respect the culture and co… 1    
2 Honestly every country in the world has done bad. May Allah bless these… 2    
3 if you don't like it stay home and the last people talking about human … 3    
4 So we can’t boycott a football game ( a trivial matter)  to protest a r… 4    
5 I see everyone kept their mouth shut with Russia&#;s world cup           5    
6 Did Qatar invade any country and kill millions? Who are robbing Africa?… 6    
# Corpus over every comment, plus its summary table.
full_corpus <- corpus(full_df$text)
full_corpus_summary <- summary(full_corpus)

# Lower-case the raw comments and split them into word tokens.
# word_tokenizer() only splits on word boundaries; it performs no
# part-of-speech filtering, so no POS arguments are passed to it. Function
# words are removed later through the stop-word list given to
# create_vocabulary().
tokens1 <- word_tokenizer(tolower(full_df$text))

head(tokens1, 2)
 [1] "looking"    "forward"    "to"         "it"         "if"        
 [6] "you"        "just"       "act"        "normal"     "respect"   
[11] "the"        "culture"    "and"        "country"    "and"       
[16] "you"        "will"       "be"         "ok"         "no"        
[21] "one"        "needs"      "to"         "know"       "someone"   
[26] "is"         "gay"        "just"       "like"       "no"        
[31] "one"        "needs"      "to"         "know"       "if"        
[36] "you"        "hetero"     "just"       "save"       "that"      
[41] "for"        "behind"     "closed"     "doors"      "and"       
[46] "make"       "sure"       "it"         "s"          "between"   
[51] "consenting" "adults"     "children"   "and"        "animals"   
[56] "are"        "not"        "consenting" "adults"     "btw"       
[61] "you"        "perverts"  

 [1] "honestly"  "every"     "country"   "in"        "the"       "world"    
 [7] "has"       "done"      "bad"       "may"       "allah"     "bless"    
[13] "these"     "workers"   "who"       "have"      "done"      "so"       
[19] "much"      "for"       "the"       "country"   "however"   "to"       
[25] "call"      "it"        "slavery"   "is"        "too"       "far"      
[31] "they"      "r"         "not"       "taking"    "them"      "without"  
[37] "there"     "will"      "like"      "when"      "british"   "and"      
[43] "americans" "killed"    "and"       "took"      "thousands" "of"       
[49] "slaves"    "they"      "come"      "here"      "for"       "better"   
[55] "options"   "of"        "course"    "it"        "can"       "be"       
[61] "better"    "however"   "it"        "is"        "much"      "better"   
[67] "than"      "what"      "was"       "happening" "to"        "them"     
[73] "in"        "their"     "own"       "countries"
# Tokenise the corpus with quanteda, discarding numbers, URLs, punctuation
# and symbols. TRUE is spelled out because `T` is an ordinary variable that
# can be reassigned.
full_tokens <- tokens(
  full_corpus,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_punct = TRUE,
  remove_symbols = TRUE
)
full_tokens <- tokens_tolower(full_tokens)

# Remove English stop words plus the residue tokens "quot", "href" and "don";
# min_nchar = 3 additionally applies a three-character minimum token length.
full_tokens <- tokens_select(
  full_tokens,
  pattern = c(stopwords("en"), "quot", "href", "don"),
  selection = "remove",
  min_nchar = 3
)

head(full_tokens, 2)
Tokens consisting of 2 documents.
text1 :
 [1] "looking" "forward" "just"    "act"     "normal"  "respect" "culture"
 [8] "country" "one"     "needs"   "know"    "someone"
[ ... and 22 more ]

text2 :
 [1] "honestly" "every"    "country"  "world"    "done"     "bad"     
 [7] "may"      "allah"    "bless"    "workers"  "done"     "much"    
[ ... and 25 more ]

Topic Model

For LDA topic Modeling, I used the package text2vec. I first wanted to find the best value for K and then extract the top 10 words from my topics.

# Fix R's random seed so the fitted topics can be reproduced between renders.
# NOTE(review): assumes text2vec derives the sampler's seeds from R's RNG --
# confirm against the installed text2vec version.
set.seed(2022)

# Iterator over the tokenised comments, keyed by comment id.
it1 <- itoken(tokens1, ids = full_df$id, progressbar = FALSE)

# Hand-picked stop words excluded from the vocabulary.
stop_words1 <- c(
  "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your",
  "yours", "a", "the", "in", "as", "on", "is", "it", "to", "of", "are", "not",
  "and", "quot", "don", "", "an", "have", "this", "if", "they", "v", "2",
  "their", "can", "than"
)
v1 <- create_vocabulary(it1, stopwords = stop_words1)

# v1 <- prune_vocabulary(v1, term_count_min = 5)

vectorizer1 <- vocab_vectorizer(v1)

# Document-term matrix in triplet ("dgTMatrix") form.
dtm1 <- create_dtm(it1, vectorizer1, type = "dgTMatrix")

# Five-topic LDA model with the given document-topic and topic-word priors.
lda_model1 <- LDA$new(
  n_topics = 5,
  doc_topic_prior = 0.1,
  topic_word_prior = 0.01
)

# Fit for up to 1000 iterations, checking convergence every 25 iterations
# against a tolerance of 0.001; returns the document-topic distribution.
doc_topic_distr1 <- lda_model1$fit_transform(
  x = dtm1,
  n_iter = 1000,
  convergence_tol = 0.001,
  n_check_convergence = 25,
  progressbar = FALSE
)
INFO  [14:58:37.340] early stopping at 125 iteration
INFO  [14:58:37.889] early stopping at 75 iteration
# Print the 10 top-ranked words of each topic of the fitted model (one column
# per topic). lambda = 0.4 is the ranking parameter passed to get_top_words();
# presumably it trades word probability against distinctiveness -- see the
# text2vec documentation.
lda_model1$get_top_words(n = 10,
                        lambda = 0.4)
      [,1]      [,2]        [,3]    [,4]       [,5]          
 [1,] "feel"    "east"      "cup"   "respect"  "workers"     
 [2,] "people"  "middle"    "world" "or"       "for"         
 [3,] "way"     "was"       "will"  "there"    "he"          
 [4,] "still"   "lot"       "be"    "when"     "working"     
 [5,] "all"     "but"       "watch" "culture"  "conditions"  
 [6,] "rules"   "were"      "https" "west"     "these"       
 [7,] "saudi"   "man"       "2022"  "laws"     "from"        
 [8,] "company" "even"      "href"  "done"     "construction"
 [9,] "left"    "countries" "fifa"  "football" "lost"        
[10,] "am"      "qatari"    "host"  "no"       "migrant"     

The topic model implies that the major topic of discussion is showing respect to both cultures (Western and Middle Eastern). The other topics in the model suggest discussions about migrant workers and how they are treated.

Sentiment Analysis

For Sentiment Analysis I used the packages tidytext and sentimentr. Sentimentr attempts to take into account valence shifters (i.e., negators, amplifiers (intensifiers), de-amplifiers (downtoners), and adversative conjunctions). This will give me better results than before.

Warning: package 'sentimentr' was built under R version 4.2.2

Attaching package: 'sentimentr'
The following object is masked from 'package:syuzhet':

# Split every comment into sentences and compute one sentiment row per
# comment.
mytext <- get_sentences(full_df$text)
senti <- sentiment_by(mytext)

# Keep only comments longer than four words. After this filter the row
# positions of `senti` no longer match the rows of `full_df`; use the
# `element_id` column to map a row back to its original comment.
senti <- senti %>%
  filter(word_count > 4)

# Histogram of the per-comment average sentiment. Built with ggplot()
# because qplot() is deprecated as of ggplot2 3.4.0.
ggplot(senti, aes(x = ave_sentiment)) +
  geom_histogram(binwidth = 0.1) +
  ggtitle("Sentiment Histogram")
Warning: `qplot()` was deprecated in ggplot2 3.4.0.

# Print the first few comments whose average sentiment exceeds 0.5.
# `senti` has been filtered by word count, so its row positions do not line
# up with `full_corpus`; look the comments up through `element_id` (the
# position of the comment in the original input) rather than through the
# row positions returned by which().
writeLines(head(
  full_corpus[senti$element_id[which(senti$ave_sentiment > .5)]]
))
a country that has modern day slavery, human rights abuses, sharia law which is extremely discriminatory to women and people of any other religion other than Islam and Christianity, one of the highest death rates of workers, yet it&#;s okay to play football there, amazing isn&#;t it, people say money can&#;t buy you everything, show them this shitshow
This Arabic nation is little humanitarian concepts among workers , especially  home maid workers conditions are very brutal no human rights and laws. like fisherman attitude.
A golden opportunity to combat negative stereotypes... used to make them worse.
Chad Qatar : Bribes Virgin West to submission. Humiliaties the West. Doesn&#;t care about stupid lgbt flag. Profits
now do the same when US host a worldwide event lol
Im glad someone made a vid about this, I lived in Doha for 8 years and recently left back to my country. Whenever I would go home from school you could see workers being forced to work in the summers peak temp hour even though it was made illegal to work from 12pm to 4pm or something like that? Theres also a huge lack of safety, cranes with cargo would be moved OVER MOVING TRAFFIC:brbrTo any adults who plan to go to the world cup I will tell you now that many places in Qatar will not serve alchoholic drinks because they need a license and can only purchase their alchohol from QDC. So if you do want alchohol I&#;d reccommend going to a hotel like the Grand Hyatt or something. So don&#;t get upset if ya cant find a place to get beer. (but in the stadium there  will be alchohol drinking zones).

The overall sentiment of the corpus is skewed right, suggesting that most of the comments are negative. The most positive comments show more western culture beliefs and criticisms of Qatar. The reasoning and language the commenters use are socially acceptable based on their knowledge of the situation in Qatar and their experiences. Some of the comments are jokes but the main points get across about their beliefs.


# Document-feature matrix built from the cleaned quanteda tokens.
full_dfm <- full_tokens %>%
  dfm()

# Reduced matrix, trimmed with a minimum term frequency of 2.
smaller_dfm <- full_dfm %>%
  dfm_trim(min_termfreq = 2)

Different K’s

Based on the results of Held-Out Likelihood and Semantic Coherence, I would want to test different K’s between 1-10.

Pairwise Correlation

Here I want to examine correlation among words, which indicates how often they appear together relative to how often they appear separately. The pairwise_cor() function in widyr lets us find the phi coefficient between words based on how often they appear in the same section. Here I pick particular terms of interest and find the other terms most associated with them and create a visualization of the correlations and clusters of words.

# One row per (section, word).
#  * A section is a run of 10 consecutive comments (row number %/% 10);
#    section 0, i.e. the first nine comments, is discarded.
#  * Words are dropped when they are missing, appear in the tidytext
#    `stop_words` lexicon, or are one of the listed residue tokens (URL and
#    markup fragments, bare numbers, the contraction fragment "ve").
section_words <- full_df %>%
  mutate(section = row_number() %/% 10) %>%
  filter(section > 0) %>%
  unnest_tokens(word, text) %>%
  filter(
    !is.na(word),
    !word %in% stop_words$word,
    !word %in% c("https", "href", "", "3", "2", "1", "12", "ve")
  )

Warning: package 'widyr' was built under R version 4.2.2
Warning: package 'ggraph' was built under R version 4.2.2
# Pairwise correlation between words across sections.
# Only words with at least 15 rows in `section_words` are kept, so that
# correlations are not computed for rare words; the result is sorted by
# correlation (sort = TRUE).
word_cors <- section_words %>%
  group_by(word) %>%
  filter(n() >= 15) %>%
  pairwise_cor(word, section, sort = TRUE)


# Network graph of the word pairs whose correlation exceeds 0.35: nodes are
# words, and edge transparency is mapped to the correlation.
# The chain must end in a complete layer; a trailing `+` with nothing after
# it is a syntax error, so it is closed with theme_void() (blank background,
# no axes).
word_cors %>%
  filter(correlation > .35) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()
Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.

# For each word of interest, plot the six words most correlated with it, one
# facet per word of interest.
# The chain must end in a complete layer; a trailing `+` with nothing after
# it is a syntax error, so it is closed with coord_flip(), which puts the
# word labels on the vertical axis.
word_cors %>%
  filter(item1 %in% c(
    "western", "qatar", "lgbtq", "workers",
    "rights", "respect", "country", "cultures"
  )) %>%
  group_by(item1) %>%
  slice_max(correlation, n = 6) %>%
  ungroup() %>%
  mutate(item2 = reorder(item2, correlation)) %>%
  ggplot(aes(item2, correlation)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ item1, scales = "free") +
  coord_flip()

The words of interest that I picked are “western”, “qatar”, “lgbtq”, “workers”, “rights”, “respect”, “country”, and “cultures”. The correlation of the word “respect” with “culture” and “politics” suggests that, when commenters talk about respect, they are discussing respect for the culture and politics of a particular region. The network visualization shows the correlations among many words. The relationships here are symmetrical rather than directional. The connections between words help verify which topics are being discussed across all of the comments. For example, the cluster surrounding the word “worker” shows that it is correlated with negative terms relating to the treatment and conditions the workers received.

Next Post

For my next post, I will put all my work together with my full dataset and continue to fine tune my analysis.