Adithya Parupudi
Author

Adithya Parupudi

Published

October 11, 2022

Libraries

Reading in all the libraries :)

Code
library(quanteda)
library(tidyverse)
library(rvest)
library(stringr)
library(tokenizers)
library(tm)
library(wordcloud)
library(wordcloud2)
library(stopwords)
library(tidyverse)
library(tidytext)
library(ggplot2)
knitr::opts_chunk$set(echo=TRUE)

Reading Data

From CSV

Code
dataset2 <- read_csv("./100FamousPeople.csv")
New names:
Rows: 116 Columns: 4
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(3): people_names, links, content dbl (1): ...1
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `` -> `...1`
Code
head(dataset2)
# A tibble: 6 × 4
   ...1 people_names     links                                           content
  <dbl> <chr>            <chr>                                           <chr>  
1     1 Abraham Lincoln  https://www.biographyonline.net/politicians/am… "“With…
2     2 Adolf Hitler     https://www.biographyonline.net/military/adolf… "Adolf…
3     3 Albert Einstein  https://www.biographyonline.net/scientists/alb… "Born …
4     4 Alfred Hitchcock https://www.biographyonline.net/actors/alfred-… "Sir A…
5     5 Amelia Earhart ( https://www.biographyonline.net/adventurers/am… "Ameli…
6     6 Angelina Jolie   https://www.biographyonline.net/actors/angelin… "Angel…
Code
dataset2<- tibble(dataset2)
Code
# counting number of words each row

str_count(dataset2$content, '\\s+')+1
  [1] 1903 1144 1879 1269 1445 1482 1441 1386 1082 1616 1567  734 1519  523  777
 [16]  598  975  862 1267 1043 1083  914 1402 1539 1086 1064 1325 1467 1165 1851
 [31] 1301 1491  654 1718 1252 1289 1510 1780 2546 1193 2133 1023 1432 1811 1872
 [46] 1173 1054  915 1800 1426 1638 1492  987 1030  886 1831  637 1405 1809 1915
 [61] 1218  852 1340  826 1268 1496 1746  536  851 1393 1650 1170 1201  930  774
 [76] 1052 1249  914 1698 1188 2668 1190 1634 1156 1253 1031 1223  914 1244 1035
 [91] 1464 1641 1265 1366 1161 1253 1436  993 2108 2215  427 1664 2184 1287 1370
[106]  745 1669 1185 1489 1559 1323 1361 1626 1763  979 1373

Using STM

Code
tidy_dataset <- dataset2 %>% 
  mutate(line=row_number()) %>%  # adding row number
  unnest_tokens(word, content) %>%  # tokenising content column to a 'word' column
  anti_join(stop_words) #removing stop words 
Joining, by = "word"
Code
tidy_dataset %>% count(word, sort=TRUE) # countind word frequency in the tidy_dataset
# A tibble: 15,377 × 2
   word          n
   <chr>     <int>
 1 world       486
 2 people      425
 3 war         351
 4 life        346
 5 time        339
 6 famous      320
 7 including   226
 8 women       171
 9 biography   168
10 oxford      159
# … with 15,367 more rows

Exploring tf-idf

Code
tidy_dataset %>% 
  count(people_names, word, sort=TRUE) %>% 
  bind_tf_idf(word,people_names,n) %>% 
  group_by(people_names) %>% 
  top_n(10) %>% 
  ungroup() %>% 
  mutate(word=reorder(word,tf_idf)) %>% 
  filter(., people_names=='Muhammad Ali' | people_names=='Lionel Messi' | people_names=='Nelson Mandela')  %>% 
  ggplot(aes(word,tf_idf, fill=people_names)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~people_names, scales="free") +
  coord_flip() 
Selecting by tf_idf

explore topic modeling

Code
library(stm)
Warning: package 'stm' was built under R version 4.2.2
stm v1.3.6 successfully loaded. See ?stm for help. 
 Papers, resources, and other materials at structuraltopicmodel.com
Code
tidy_dfm <- tidy_dataset %>% 
  count(people_names, word, sort=TRUE) %>% 
  cast_dfm(people_names, word, n)

topic_model <- stm(tidy_dfm, K = 6, init.type = 'Spectral')
Beginning Spectral Initialization 
     Calculating the gram matrix...
     Using only 10000 most frequent terms during initialization...
     Finding anchor words...
    ......
     Recovering initialization...
    ....................................................................................................
Initialization complete.
....................................................................................................................
Completed E-Step (0 seconds). 
Completed M-Step. 
Completing Iteration 1 (approx. per word bound = -8.775) 
....................................................................................................................
Completed E-Step (0 seconds). 
Completed M-Step. 
Completing Iteration 2 (approx. per word bound = -7.897, relative change = 1.001e-01) 
....................................................................................................................
Completed E-Step (0 seconds). 
Completed M-Step. 
Completing Iteration 3 (approx. per word bound = -7.844, relative change = 6.681e-03) 
....................................................................................................................
Completed E-Step (0 seconds). 
Completed M-Step. 
Completing Iteration 4 (approx. per word bound = -7.837, relative change = 8.981e-04) 
....................................................................................................................
Completed E-Step (0 seconds). 
Completed M-Step. 
Completing Iteration 5 (approx. per word bound = -7.834, relative change = 3.905e-04) 
Topic 1: women, queen, john, world, time 
 Topic 2: people, world, time, ford, gates 
 Topic 3: war, world, people, soviet, french 
 Topic 4: people, rights, civil, king, gandhi 
 Topic 5: world, ali, time, record, famous 
 Topic 6: life, da, film, freud, vinci 
....................................................................................................................
Completed E-Step (0 seconds). 
Completed M-Step. 
Completing Iteration 6 (approx. per word bound = -7.832, relative change = 2.422e-04) 
....................................................................................................................
Completed E-Step (0 seconds). 
Completed M-Step. 
Completing Iteration 7 (approx. per word bound = -7.831, relative change = 1.175e-04) 
....................................................................................................................
Completed E-Step (0 seconds). 
Completed M-Step. 
Completing Iteration 8 (approx. per word bound = -7.830, relative change = 6.183e-05) 
....................................................................................................................
Completed E-Step (0 seconds). 
Completed M-Step. 
Completing Iteration 9 (approx. per word bound = -7.830, relative change = 4.783e-05) 
....................................................................................................................
Completed E-Step (0 seconds). 
Completed M-Step. 
Completing Iteration 10 (approx. per word bound = -7.830, relative change = 3.034e-05) 
Topic 1: women, queen, john, time, world 
 Topic 2: people, world, time, ford, gates 
 Topic 3: war, world, people, french, life 
 Topic 4: people, rights, civil, king, gandhi 
 Topic 5: world, ali, time, record, famous 
 Topic 6: life, da, film, freud, vinci 
....................................................................................................................
Completed E-Step (0 seconds). 
Completed M-Step. 
Completing Iteration 11 (approx. per word bound = -7.830, relative change = 2.048e-05) 
....................................................................................................................
Completed E-Step (0 seconds). 
Completed M-Step. 
Completing Iteration 12 (approx. per word bound = -7.829, relative change = 1.575e-05) 
....................................................................................................................
Completed E-Step (0 seconds). 
Completed M-Step. 
Completing Iteration 13 (approx. per word bound = -7.829, relative change = 1.133e-05) 
....................................................................................................................
Completed E-Step (0 seconds). 
Completed M-Step. 
Model Converged 
Code
summary(topic_model)
A topic model with 6 topics, 116 documents and a 15377 word dictionary.
Topic 1 Top Words:
     Highest Prob: women, queen, john, time, world, life, famous 
     FREX: magdalene, andrews, julie, pankhurst, j.k.rowling, amelia, anthony 
     Lift: 24th, andrews, baker, ballet, benedict, caesar’s, cleopatra’s 
     Score: audrey, hepburn, pope, wilde, magdalene, lennon, andrews 
Topic 2 Top Words:
     Highest Prob: people, world, time, ford, gates, einstein, jobs 
     FREX: web, apple, berners, baden, microsoft, hawking, computers 
     Lift: couzens, melinda, stewart’s, 1,000,000, 1,529, 1.25, 1.5 
     Score: web, berners, apple, baden, microsoft, keynes, hawking 
Topic 3 Top Words:
     Highest Prob: war, world, people, french, life, soviet, famous 
     FREX: bolivar, orwell, chanel, gaulle, putin, suu, kyi 
     Lift: 130george, 1755, 1766, 1778, 1781, 1783, 1785 
     Score: bolivar, orwell, chanel, castro, putin, gaulle, thatcher 
Topic 4 Top Words:
     Highest Prob: people, rights, civil, king, gandhi, lincoln, south 
     FREX: krugman, selassie, rosa, taliban, kennedy, mandela, haile 
     Lift: f.w, mohandas, vice, 16,000, 180after, 1842, 1850s 
     Score: krugman, tutu, parks, mandela, bus, selassie, luther 
Topic 5 Top Words:
     Highest Prob: world, ali, time, record, famous, people, player 
     FREX: messi, zatopek, olympic, ruth, babe, football, bolt 
     Lift: 0.94, 05, 07, 100m2015, 115, 19.19, 220 
     Score: messi, zatopek, ruth, olympic, babe, bolt, football 
Topic 6 Top Words:
     Highest Prob: life, da, film, freud, vinci, famous, plato 
     FREX: vinci, plato, bergman, columbus, wright, mao, wilbur 
     Lift: 10.9, 11the, 123concertospiano, 1350s, 1451, 1452, 1466 
     Score: plato, vinci, freud, van, gogh, leonardo, columbus 

Beta matrix

Code
tidy_beta <- tidy(topic_model)

tidy_beta %>% group_by(topic) %>% 
  top_n(10) %>% 
  ungroup() %>% 
  mutate(term=reorder(term,beta)) %>% 
  ggplot(aes(term,beta, fill=topic)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~topic, scales="free") +
  coord_flip() 
Selecting by beta

using Gamma matrix

Code
tidy_gamma <- tidy(topic_model, matrix="gamma", document_names = rownames(tidy_dfm))


ggplot(tidy_gamma, aes(gamma, fill = as.factor(topic)))+
  geom_histogram(show.legend = FALSE)+
  facet_wrap(~topic, ncol=3)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.