# Echo the source of every chunk in the rendered document.
knitr::opts_chunk$set(echo = TRUE)
Ethan Campbell
November 16, 2022
There are 6 teams included in this study: 2 from the top of the table, 2 from the middle, and 2 from the bottom. They are listed in that order, from top to bottom. The data needed to be web scraped from each team's match report pages. These pages are located on each team's official website and include information about the match, statistics, and quotes from both the players and the managers. The data will cover the current season and all of last season.
A. Does the language used by Premier League soccer teams change over the course of the season?
B. Does the language grow in correlation to the success of the season?
The hypothesis will be tested as follows:
The Premier league soccer team language does not change over the course of the season.
The Premier league soccer team language does change over the course of the season.
The language does not correlate to the success of the season.
The language does correlate to the success of the season.
flowchart LR A[Web Scrape] --> B(Preprocess) B --> C[Organize] C --> D(TDM) D --> E[Sentiment Analysis] E --> F[Research Question 1] F --> G{Conclusion} D --> H[DTM] H --> I[Document Similarity] I --> J[Research Question 2] J --> G{Conclusion}
Here is the beginning of the web scraping process. I was unable to find a way to make the web scraper search for one object and then proceed to the next page to scrape what is inside. For the time being, I decided to manually web scrape the information. The tidying process is the real issue, as the scraped text contains many unwanted artefacts. For example, there are a lot of newline characters (\n).
## The function is working at reading in the data however. parts of the cleaning process are failing and I am thinking this is because I am not specifying the create values
# I need to remove punct, capitalization, stopwords like (the, a ',') finish repeating the process to all teams and adjusting the function until it grabs every single problem once this is complete we should be able to tokenize then corpus and work with the data
# Download one Arsenal match report and return the text of its article body.
#
# url:  address of the match-report page, handed to read_html().
# css:  CSS selector of the node that holds the article; defaults to
#       ".article-body", the selector this function has always used.
# data: unused; kept so calls that pass three arguments keep working.
#
# Returns the text of the first node matching `css`.
Web_scrape_function_Arsenal <- function(url, css = ".article-body", data = NULL) {
  page <- read_html(url)
  # NOTE(review): in this copy of the file the pipe ended after html_node()
  # and the function body was never closed. html_text() is assumed as the
  # missing stage because tidy_function() works on character data - confirm.
  page %>%
    html_node(css = css) %>%
    html_text()
}
# Strip scraping artefacts from the text of an Arsenal match report.
#
# data: character vector of report text.
#
# Returns a character vector of the same length with line breaks, photo
# gallery captions, video-player boilerplate, " - ", brackets, "||" and "#"
# removed.
#
# Every rule except the gallery caption is matched literally
# (fixed = TRUE), so "|", "(" and ")" are never read as regex operators.
# The old pattern "||" was an empty regex alternation that matched at every
# position, interleaving "#" through the text so that the interview
# boilerplate rules after it could not match.
tidy_function <- function(data) {
  strip_literal <- function(text, literal) {
    gsub(literal, "", text, fixed = TRUE)
  }

  # Line breaks (a real "\n" or a literal "/n") are parked as "####" and only
  # dropped at the very end together with every other "#". Until then no
  # rule below can match across a former line break.
  cleaned <- gsub("\n", "####", data, fixed = TRUE)
  cleaned <- gsub("/n", "####", cleaned, fixed = TRUE)

  # Gallery captions such as "13 of 42To buy official ...". [0-9]+ consumes
  # the whole counter; the previous single-digit class left the tens digits
  # behind, which is what the hard-coded "1111111111222..." removals were
  # compensating for.
  cleaned <- gsub(
    "[0-9]+ of [0-9]+To buy official Arsenal pictures visit Arsenal Pics",
    "",
    cleaned
  )

  # Page boilerplate, removed while " - " and "||" are still present so the
  # phrases can match exactly as written.
  boilerplate <- c(
    "WHAT HAPPENED",
    "Play videoWatch Arsenal video online05:24Highlights | Crystal Palace 0-2 Arsenal - bitesize",
    "'Play videoWatch Arsenal video online02:17Mikel Arteta post-match interview | Crystal Palace 0-2 Arsenal | Premier LeagueArteta: \'",
    "\"read everything from his press conferencePlay videoWatch Arsenal video online02:07William Saliba post-match interview || Premier LeagueSaliba:"
  )
  for (phrase in boilerplate) {
    cleaned <- strip_literal(cleaned, phrase)
  }

  # Remaining punctuation noise; "#" goes last because it also clears the
  # line-break markers inserted above.
  for (noise in c(" - ", "''", "(", ")", "||", "#")) {
    cleaned <- strip_literal(cleaned, noise)
  }

  cleaned
}
# Every report is sent through tidy_function() twice because one pass still
# leaves some artefacts behind; this is a temporary workaround.
Arsenal_url <- ""
Match_1 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_2 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_3 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_4 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_5 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_6 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_7 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
# Arsenal 2021 season: scrape each report, then tidy it twice
Arsenal_url <- ""
Match_1_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_2_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_3_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_4_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_5_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_6_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_7_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_8_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_9_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_10_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_11_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_12_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_13_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_14_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_15_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_16_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_17_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_18_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_19_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_20_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_21_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_22_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_23_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_24_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_25_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_26_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_27_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_28_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_29_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_30_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_31_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_32_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_33_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_34_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_35_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_36_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_37_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_38_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_39_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_40_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
# Manchester City data
# Download one Manchester City match report and return its cleaned text.
#
# url:  address of the match-report page, handed to read_html().
# css:  CSS selector of the article node; defaults to
#       ".article-body__article-text", the selector used so far.
# data: unused; kept so calls that pass three arguments keep working.
#
# Returns the article text with line breaks, " - ", brackets and "#" removed.
Web_scrape_function_mancity <- function(url,
                                        css = ".article-body__article-text",
                                        data = NULL) {
  # NOTE(review): in this copy of the file the pipe ended after html_node()
  # and the function body was never closed. html_text() is assumed as the
  # missing stage because the cleaning below is string-based - confirm.
  report <- read_html(url) %>%
    html_node(css = css) %>%
    html_text()

  # Line breaks ("\n" or a literal "/n") are parked as "####" and dropped by
  # the final "#" rule, so " - " is only removed within a single line. The
  # former str_remove_all("/n") / str_remove_all("\n") steps ran after both
  # had already been replaced and could never match, so they are gone.
  report %>%
    str_replace_all("\n", "####") %>%
    str_replace_all("/n", "####") %>%
    str_remove_all(" - ") %>%
    str_remove_all("\\(") %>%
    str_remove_all("\\)") %>%
    str_remove_all("#")
}
# Manchester City: one scrape per match report
mancity_url <- ""
Manc_1 <- mancity_url %>% Web_scrape_function_mancity()
mancity_url <- ""
Manc_2 <- mancity_url %>% Web_scrape_function_mancity()
mancity_url <- ""
Manc_3 <- mancity_url %>% Web_scrape_function_mancity()
mancity_url <- ""
Manc_4 <- mancity_url %>% Web_scrape_function_mancity()
mancity_url <- ""
Manc_5 <- mancity_url %>% Web_scrape_function_mancity()
mancity_url <- ""
Manc_6 <- mancity_url %>% Web_scrape_function_mancity()
mancity_url <- ""
Manc_7 <- mancity_url %>% Web_scrape_function_mancity()
# New Castle
# New Castle United first match against nottingham forest
# 1 rule for 1 bots crawl delay 5 seconds, scrapable
<polite session>
User-agent: polite R package
robots.txt: 1 rules are defined for 1 bots
Crawl delay: 5 sec
The path is scrapable for this user-agent
# Download one Newcastle United match report and return its cleaned text.
#
# url:  address of the match-report page, handed to read_html().
# css:  CSS selector of the article node; defaults to ".article__body",
#       the selector used so far.
# data: unused; kept so calls that pass three arguments keep working.
#
# Returns the article text with line breaks, " - ", brackets, double quotes
# and "#" removed.
Web_scrape_function_Newcastle <- function(url, css = ".article__body", data = NULL) {
  # NOTE(review): in this copy of the file the pipe ended after html_node()
  # and the function body was never closed. html_text() is assumed as the
  # missing stage because the cleaning below is string-based - confirm.
  report <- read_html(url) %>%
    html_node(css = css) %>%
    html_text()

  # Line breaks ("\n" or a literal "/n") are parked as "####" and dropped by
  # the final "#" rule, so " - " is only removed within a single line. The
  # former str_remove_all("/n") / str_remove_all("\n") steps ran after both
  # had already been replaced and could never match, so they are gone.
  report %>%
    str_replace_all("\n", "####") %>%
    str_replace_all("/n", "####") %>%
    str_remove_all(" - ") %>%
    str_remove_all("\\(") %>%
    str_remove_all("\\)") %>%
    str_remove_all("\"") %>%
    str_remove_all("#")
}
# Newcastle United: one scrape per match report
Newcastle_url <- ""
nc_1 <- Newcastle_url %>% Web_scrape_function_Newcastle()
Newcastle_url <- ""
nc_2 <- Newcastle_url %>% Web_scrape_function_Newcastle()
Newcastle_url <- ""
nc_3 <- Newcastle_url %>% Web_scrape_function_Newcastle()
Newcastle_url <- ""
nc_4 <- Newcastle_url %>% Web_scrape_function_Newcastle()
Newcastle_url <- ""
nc_5 <- Newcastle_url %>% Web_scrape_function_Newcastle()
Newcastle_url <- ""
nc_6 <- Newcastle_url %>% Web_scrape_function_Newcastle()
Newcastle_url <- ""
nc_7 <- Newcastle_url %>% Web_scrape_function_Newcastle()
# Everton
# Everton vs Chelsea
# 1 rule for 1 bots crawl delay 5 seconds, scrapable
<polite session>
User-agent: polite R package
robots.txt: 1 rules are defined for 1 bots
Crawl delay: 5 sec
The path is scrapable for this user-agent
# Download one Everton match report and return its cleaned text.
#
# url:  address of the match-report page, handed to read_html().
# css:  CSS selector of the article node.
#       NOTE(review): the selector is blank ("") in this copy of the file,
#       so the default is blank too - restore the real selector before use.
# data: unused; kept so calls that pass three arguments keep working.
#
# Returns the article text with line breaks, " - ", brackets, double quotes
# and "#" removed.
Web_scrape_function_Everton <- function(url, css = "", data = NULL) {
  # NOTE(review): in this copy of the file the pipe ended after html_node()
  # and the function body was never closed. html_text() is assumed as the
  # missing stage because the cleaning below is string-based - confirm.
  report <- read_html(url) %>%
    html_node(css = css) %>%
    html_text()

  # Line breaks ("\n" or a literal "/n") are parked as "####" and dropped by
  # the final "#" rule, so " - " is only removed within a single line. The
  # former str_remove_all("/n") / str_remove_all("\n") steps ran after both
  # had already been replaced and could never match, so they are gone.
  report %>%
    str_replace_all("\n", "####") %>%
    str_replace_all("/n", "####") %>%
    str_remove_all(" - ") %>%
    str_remove_all("\\(") %>%
    str_remove_all("\\)") %>%
    str_remove_all("\"") %>%
    str_remove_all("#")
}
# Everton: one scrape per match report
Everton_url <- ""
ever_1 <- Everton_url %>% Web_scrape_function_Everton()
Everton_url <- ""
ever_2 <- Everton_url %>% Web_scrape_function_Everton()
Everton_url <- ""
ever_3 <- Everton_url %>% Web_scrape_function_Everton()
Everton_url <- ""
ever_4 <- Everton_url %>% Web_scrape_function_Everton()
Everton_url <- ""
ever_5 <- Everton_url %>% Web_scrape_function_Everton()
Everton_url <- ""
ever_6 <- Everton_url %>% Web_scrape_function_Everton()
Everton_url <- ""
ever_7 <- Everton_url %>% Web_scrape_function_Everton()
# Leicester
# Leicester against Brentford
# 1 bot 1 rule scrapable 5 second crawl
<polite session>
User-agent: polite R package
robots.txt: 1 rules are defined for 1 bots
Crawl delay: 5 sec
The path is scrapable for this user-agent
# Download one Leicester City match report and return its cleaned text.
#
# url:  address of the match-report page, handed to read_html().
# css:  CSS selector of the article node; defaults to ".col-12", the
#       selector used so far.
# data: unused; kept so calls that pass three arguments keep working.
#
# Returns the article text with line breaks, " - ", brackets, double quotes,
# "#" and the "More on this story" footer removed.
Web_scrape_function_Leicester <- function(url, css = ".col-12", data = NULL) {
  # NOTE(review): in this copy of the file the pipe ended after html_node()
  # and the function body was never closed. html_text() is assumed as the
  # missing stage because the cleaning below is string-based - confirm.
  report <- read_html(url) %>%
    html_node(css = css) %>%
    html_text()

  # Line breaks ("\n" or a literal "/n") are parked as "####" and dropped by
  # the "#" rule, so " - " is only removed within a single line. The former
  # str_remove_all("/n") / str_remove_all("\n") steps ran after both had
  # already been replaced and could never match, so they are gone.
  report %>%
    str_replace_all("\n", "####") %>%
    str_replace_all("/n", "####") %>%
    str_remove_all(" - ") %>%
    str_remove_all("\\(") %>%
    str_remove_all("\\)") %>%
    str_remove_all("\"") %>%
    str_remove_all("#") %>%
    str_remove_all("More on this story. . . In Photos -")
}
# Leicester City: one scrape per match report
Leicester_url <- ""
lei_1 <- Leicester_url %>% Web_scrape_function_Leicester()
Leicester_url <- ""
lei_2 <- Leicester_url %>% Web_scrape_function_Leicester()
Leicester_url <- ""
lei_3 <- Leicester_url %>% Web_scrape_function_Leicester()
Leicester_url <- ""
lei_4 <- Leicester_url %>% Web_scrape_function_Leicester()
Leicester_url <- ""
lei_5 <- Leicester_url %>% Web_scrape_function_Leicester()
Leicester_url <- ""
lei_6 <- Leicester_url %>% Web_scrape_function_Leicester()
Leicester_url <- ""
lei_7 <- Leicester_url %>% Web_scrape_function_Leicester()
# West Ham
# West Ham vs Manchester City
# Download one West Ham United match report and return its cleaned text.
#
# url:  address of the match-report page, handed to read_html().
# css:  CSS selector of the article node; defaults to
#       ".m-article__columns", the selector used so far.
# data: unused; kept so calls that pass three arguments keep working.
#
# Returns the article text with line breaks, " - ", brackets, double quotes,
# "#" and the "More on this story" footer removed.
Web_scrape_function_WestHam <- function(url,
                                        css = ".m-article__columns",
                                        data = NULL) {
  # NOTE(review): in this copy of the file the pipe ended after html_node()
  # and the function body was never closed. html_text() is assumed as the
  # missing stage because the cleaning below is string-based - confirm.
  report <- read_html(url) %>%
    html_node(css = css) %>%
    html_text()

  # Line breaks ("\n" or a literal "/n") are parked as "####" and dropped by
  # the "#" rule, so " - " is only removed within a single line. The former
  # str_remove_all("/n") / str_remove_all("\n") steps ran after both had
  # already been replaced and could never match, so they are gone.
  report %>%
    str_replace_all("\n", "####") %>%
    str_replace_all("/n", "####") %>%
    str_remove_all(" - ") %>%
    str_remove_all("\\(") %>%
    str_remove_all("\\)") %>%
    str_remove_all("\"") %>%
    str_remove_all("#") %>%
    str_remove_all("More on this story. . . In Photos -")
}
# West Ham United: one scrape per match report
WestHam_url <- ""
wh_1 <- WestHam_url %>% Web_scrape_function_WestHam()
WestHam_url <- ""
wh_2 <- WestHam_url %>% Web_scrape_function_WestHam()
WestHam_url <- ""
wh_3 <- WestHam_url %>% Web_scrape_function_WestHam()
WestHam_url <- ""
wh_4 <- WestHam_url %>% Web_scrape_function_WestHam()
WestHam_url <- ""
wh_5 <- WestHam_url %>% Web_scrape_function_WestHam()
WestHam_url <- ""
wh_6 <- WestHam_url %>% Web_scrape_function_WestHam()
WestHam_url <- ""
wh_7 <- WestHam_url %>% Web_scrape_function_WestHam()
# Gather each club's match reports into one character vector; these vectors
# are what gets turned into corpora for preprocessing.

# Arsenal: current-season reports first, then the 2021 season
Arsenal <- c(
  Match_1, Match_2, Match_3, Match_4, Match_5, Match_6, Match_7,
  Match_1_2021, Match_2_2021, Match_3_2021, Match_4_2021, Match_5_2021,
  Match_6_2021, Match_7_2021, Match_8_2021, Match_9_2021, Match_10_2021,
  Match_11_2021, Match_12_2021, Match_13_2021, Match_14_2021, Match_15_2021,
  Match_16_2021, Match_17_2021, Match_18_2021, Match_19_2021, Match_20_2021,
  Match_21_2021, Match_22_2021, Match_23_2021, Match_24_2021, Match_25_2021,
  Match_26_2021, Match_27_2021, Match_28_2021, Match_29_2021, Match_30_2021,
  Match_31_2021, Match_32_2021, Match_33_2021, Match_34_2021, Match_35_2021,
  Match_36_2021, Match_37_2021, Match_38_2021, Match_39_2021, Match_40_2021
)
Arsenal_corpus <- corpus(Arsenal)

# Manchester City
Manchester_City <- c(
  Manc_1, Manc_2, Manc_3, Manc_4, Manc_5, Manc_6, Manc_7
)

# Newcastle United
Newcastle_United <- c(
  nc_1, nc_2, nc_3, nc_4, nc_5, nc_6, nc_7
)

# Everton
Everton <- c(
  ever_1, ever_2, ever_3, ever_4, ever_5, ever_6, ever_7
)

# Leicester
Leicester <- c(
  lei_1, lei_2, lei_3, lei_4, lei_5, lei_6, lei_7
)

# West Ham United
West_Ham_United <- c(
  wh_1, wh_2, wh_3, wh_4, wh_5, wh_6, wh_7
)

# Names of the team vectors above, in the order the loop processes them
Prem <- c(
  "Arsenal",
  "Manchester_City",
  "Newcastle_United",
  "Everton",
  "Leicester",
  "West_Ham_United"
)
# Build `<team>_corpus` and `<team>_summary` for every team named in Prem.
#
# The earlier version assembled each statement as a string with paste() and,
# in this copy of the file, never evaluated those strings or closed the
# loop. The statements are now executed directly; get()/assign() look up the
# team vector and publish the results under the same object names the
# strings referred to.
for (team in Prem) {
  # create the corpus from the team's character vector of match reports
  team_corpus <- corpus(get(team))

  # prefix every document name with the team name; otherwise the default
  # names (text1, text2, ...) are duplicated across teams
  docnames(team_corpus) <- paste(team, docnames(team_corpus), sep = "-")

  # per-document summary data
  team_summary <- summary(team_corpus)

  # team indicator
  team_summary$Team <- team

  # match indicator: the first run of digits in the document name
  team_summary$Match <- as.numeric(str_extract(team_summary$Text, "[0-9]+"))

  # attach the summary to the corpus as document-level metadata
  docvars(team_corpus) <- team_summary

  assign(paste0(team, "_corpus"), team_corpus)
  assign(paste0(team, "_summary"), team_summary)
}
[1] "tmpNames <- docnames(Arsenal_corpus)"
[1] "docnames(Arsenal_corpus) <- paste(\"Arsenal\", tmpNames, sep = \"-\")"
[1] "tmpNames <- docnames(Manchester_City_corpus)"
[1] "docnames(Manchester_City_corpus) <- paste(\"Manchester_City\", tmpNames, sep = \"-\")"
[1] "tmpNames <- docnames(Newcastle_United_corpus)"
[1] "docnames(Newcastle_United_corpus) <- paste(\"Newcastle_United\", tmpNames, sep = \"-\")"
[1] "tmpNames <- docnames(Everton_corpus)"
[1] "docnames(Everton_corpus) <- paste(\"Everton\", tmpNames, sep = \"-\")"
[1] "tmpNames <- docnames(Leicester_corpus)"
[1] "docnames(Leicester_corpus) <- paste(\"Leicester\", tmpNames, sep = \"-\")"
[1] "tmpNames <- docnames(West_Ham_United_corpus)"
[1] "docnames(West_Ham_United_corpus) <- paste(\"West_Ham_United\", tmpNames, sep = \"-\")"
[1] 82
Corpus consisting of 47 documents, showing 47 documents:
Text Types Tokens Sentences Text Types Tokens Sentences
Arsenal-text1 474 1027 12 Arsenal-text1 474 1027 12
Arsenal-text2 561 1297 22 Arsenal-text2 561 1297 22
Arsenal-text3 517 1124 15 Arsenal-text3 517 1124 15
Arsenal-text4 429 894 9 Arsenal-text4 429 894 9
Arsenal-text5 408 790 12 Arsenal-text5 408 790 12
Arsenal-text6 438 916 6 Arsenal-text6 438 916 6
Arsenal-text7 490 1107 21 Arsenal-text7 490 1107 21
Arsenal-text8 392 758 13 Arsenal-text8 392 758 13
Arsenal-text9 362 662 11 Arsenal-text9 362 662 11
Arsenal-text10 231 379 3 Arsenal-text10 231 379 3
Arsenal-text11 355 643 10 Arsenal-text11 355 643 10
Arsenal-text12 311 569 10 Arsenal-text12 311 569 10
Arsenal-text13 184 284 3 Arsenal-text13 184 284 3
Arsenal-text14 267 474 6 Arsenal-text14 267 474 6
Arsenal-text15 279 453 7 Arsenal-text15 279 453 7
Arsenal-text16 280 526 4 Arsenal-text16 280 526 4
Arsenal-text17 266 478 20 Arsenal-text17 266 478 20
Arsenal-text18 336 659 12 Arsenal-text18 336 659 12
Arsenal-text19 289 498 7 Arsenal-text19 289 498 7
Arsenal-text20 330 577 4 Arsenal-text20 330 577 4
Arsenal-text21 332 626 10 Arsenal-text21 332 626 10
Arsenal-text22 342 611 8 Arsenal-text22 342 611 8
Arsenal-text23 340 612 5 Arsenal-text23 340 612 5
Arsenal-text24 241 440 5 Arsenal-text24 241 440 5
Arsenal-text25 252 458 3 Arsenal-text25 252 458 3
Arsenal-text26 355 693 9 Arsenal-text26 355 693 9
Arsenal-text27 288 493 2 Arsenal-text27 288 493 2
Arsenal-text28 438 865 12 Arsenal-text28 438 865 12
Arsenal-text29 256 455 14 Arsenal-text29 256 455 14
Arsenal-text30 389 775 11 Arsenal-text30 389 775 11
Arsenal-text31 271 490 13 Arsenal-text31 271 490 13
Arsenal-text32 340 683 30 Arsenal-text32 340 683 30
Arsenal-text33 254 445 4 Arsenal-text33 254 445 4
Arsenal-text34 404 1122 35 Arsenal-text34 404 1122 35
Arsenal-text35 298 555 5 Arsenal-text35 298 555 5
Arsenal-text36 260 429 2 Arsenal-text36 260 429 2
Arsenal-text37 386 768 16 Arsenal-text37 386 768 16
Arsenal-text38 247 464 8 Arsenal-text38 247 464 8
Arsenal-text39 274 478 8 Arsenal-text39 274 478 8
Arsenal-text40 315 612 17 Arsenal-text40 315 612 17
Arsenal-text41 291 560 17 Arsenal-text41 291 560 17
Arsenal-text42 337 688 19 Arsenal-text42 337 688 19
Arsenal-text43 324 659 17 Arsenal-text43 324 659 17
Arsenal-text44 280 497 8 Arsenal-text44 280 497 8
Arsenal-text45 316 595 13 Arsenal-text45 316 595 13
Arsenal-text46 296 590 15 Arsenal-text46 296 590 15
Arsenal-text47 268 495 9 Arsenal-text47 268 495 9
Team Match
Arsenal 1
Arsenal 2
Arsenal 3
Arsenal 4
Arsenal 5
Arsenal 6
Arsenal 7
Arsenal 8
Arsenal 9
Arsenal 10
Arsenal 11
Arsenal 12
Arsenal 13
Arsenal 14
Arsenal 15
Arsenal 16
Arsenal 17
Arsenal 18
Arsenal 19
Arsenal 20
Arsenal 21
Arsenal 22
Arsenal 23
Arsenal 24
Arsenal 25
Arsenal 26
Arsenal 27
Arsenal 28
Arsenal 29
Arsenal 30
Arsenal 31
Arsenal 32
Arsenal 33
Arsenal 34
Arsenal 35
Arsenal 36
Arsenal 37
Arsenal 38
Arsenal 39
Arsenal 40
Arsenal 41
Arsenal 42
Arsenal 43
Arsenal 44
Arsenal 45
Arsenal 46
Arsenal 47
first minutes goal league ball premier city shot half back
308 269 255 251 248 239 203 200 199 185
just game second time home two side arsenal corner away
156 156 155 141 133 121 119 113 111 110
# Creating a table to show the highest frequency items and then ranking them
# NOTE(review): the right-hand side of this assignment is garbled in this
# copy of the file ("word_counts <-,dec=T))"). It is rebuilt as the sorted
# per-feature totals of Prem_dfm, which fits the surviving ",dec=T))" tail,
# the single "Frequency" column and the rank over ncol(Prem_dfm) - confirm.
word_counts <- as.data.frame(sort(colSums(Prem_dfm), decreasing = TRUE))
colnames(word_counts) <- c("Frequency")
word_counts$Rank <- seq_len(nrow(word_counts))

# Rank against frequency (Zipf's law).
# NOTE(review): the original chain ended in a dangling "+", so a final layer
# (for example a theme) appears to have been lost; add it back if needed.
ggplot(word_counts, mapping = aes(x = Rank, y = Frequency)) +
  geom_point() +
  labs(title = "Zipf's Law", x = "Rank", y = "Frequency")
# Two-stage trim of the document-feature matrix:
#   1. drop features with a total frequency below 10
#   2. drop features that appear in fewer than 10% of documents
#      (match reports)
Prem_smaller_dfm <- Prem_dfm %>%
  dfm_trim(min_termfreq = 10) %>%
  dfm_trim(min_docfreq = 0.1, docfreq_type = "prop")
Document-feature matrix of: 82 documents, 629 features (73.81% sparse) and 6 docvars.
docs gabriel martinelli header late goal got premier league campaign
Arsenal-text1 2 3 2 1 6 1 9 8 3
Arsenal-text2 3 6 3 1 3 1 5 5 2
Arsenal-text3 3 3 1 2 8 1 10 11 0
Arsenal-text4 7 4 2 2 4 1 8 8 1
Arsenal-text5 4 5 0 0 5 1 7 9 0
Arsenal-text6 2 5 1 0 5 2 5 7 0
docs perfect
Arsenal-text1 1
Arsenal-text2 2
Arsenal-text3 0
Arsenal-text4 0
Arsenal-text5 1
Arsenal-text6 2
[ reached max_ndoc ... 76 more documents, reached max_nfeat ... 619 more features ]
# Creating the FCM
# Stricter trim than before: total frequency of at least 20, and present in
# at least 30% of documents
Prem_smaller_dfm <- Prem_dfm %>%
  dfm_trim(min_termfreq = 20) %>%
  dfm_trim(min_docfreq = .3, docfreq_type = "prop")
# feature co-occurrence matrix built from the trimmed dfm
Prem_smaller_fcm <- Prem_smaller_dfm %>%
  fcm()
# check the dimensions (i.e., the number of rows and the number of columnns)
# of the matrix we created
[1] 179 179
# names of the 30 top features of the FCM
myFeatures <- Prem_smaller_fcm %>%
  topfeatures(30) %>%
  names()
# keep only those features in the matrix
Prem_smaller_fcm <- Prem_smaller_fcm %>%
  fcm_select(pattern = myFeatures, selection = "keep")
# vertex weights: log of each feature's column total
size <- Prem_smaller_fcm %>%
  colSums() %>%
  log()
# network plot, vertex sizes scaled relative to the largest weight
textplot_network(Prem_smaller_fcm, vertex_size = size / max(size) * 3)
title: "Text as Data Final Project"
author: "Ethan Campbell"
description: "Research into English Premier League and how language changes depending on the season"
date: "11/16/2022"
toc: true
code-fold: true
code-copy: true
code-tools: true
- Blog Post 2
knitr::opts_chunk$set(echo = TRUE)
# Loading Packages
#| warning: false
# Data Sources
There are 6 teams included in this study: 2 from the top of the table, 2 from the middle, and 2 from the bottom. They are listed in that order, from top to bottom. The data needed to be web scraped from each team's match report pages. These pages are located on each team's official website and include information about the match, statistics, and quotes from both the players and the managers. The data will cover the current season and all of last season.
[Arsenal Data](
[Manchester City Data](
[Newcastle United Data](
[Everton Data](
[Leicester Data](
[West Ham United Data](
# Hypothesis for project
::: callout-note
## Research Questions
A. Does the language used by Premier League soccer teams change over the course of the season?
B. Does the language grow in correlation to the success of the season?
The hypothesis will be tested as follows:
::: callout-tip
## H~0A~
The Premier league soccer team language [does not]{.underline} change over the course of the season.
::: callout-tip
## H~1A~
The Premier league soccer team language [does]{.underline} change over the course of the season.
::: callout-tip
## H~0B~
The language [does not]{.underline} correlate to the success of the season.
::: callout-tip
## H~1B~
The language [does]{.underline} correlate to the success of the season.
# Analytic planning
flowchart LR
A[Web Scrape] --> B(Preprocess)
B --> C[Organize]
C --> D(TDM)
D --> E[Sentiment Analysis]
E --> F[Research Question 1]
F --> G{Conclusion}
D --> H[DTM]
H --> I[Document Similarity]
I --> J[Research Question 2]
J --> G{Conclusion}
# Web Scraping/Tidying data
Here is the beginning of the web scraping process. I was unable to find a way to make the web scraper search for one object and then proceed to the next page to scrape what is inside. For the time being, I decided to manually web scrape the information. The tidying process is the real issue, as the scraped text contains many unwanted artefacts. For example, there are a lot of newline characters (`\n`).
## The function is working at reading in the data however. parts of the cleaning process are failing and I am thinking this is because I am not specifying the create values
# I need to remove punct, capitalization, stopwords like (the, a ',') finish repeating the process to all teams and adjusting the function until it grabs every single problem once this is complete we should be able to tokenize then corpus and work with the data
# Download one Arsenal match report and return the text of its article body.
#
# url:  address of the match-report page, handed to read_html().
# css:  CSS selector of the node that holds the article; defaults to
#       ".article-body", the selector this function has always used.
# data: unused; kept so calls that pass three arguments keep working.
#
# Returns the text of the first node matching `css`.
Web_scrape_function_Arsenal <- function(url, css = ".article-body", data = NULL) {
  page <- read_html(url)
  # NOTE(review): in this copy of the file the pipe ended after html_node()
  # and the function body was never closed. html_text() is assumed as the
  # missing stage because tidy_function() works on character data - confirm.
  page %>%
    html_node(css = css) %>%
    html_text()
}
# Strip scraping artefacts from the text of an Arsenal match report.
#
# data: character vector of report text.
#
# Returns a character vector of the same length with line breaks, photo
# gallery captions, video-player boilerplate, " - ", brackets, "||" and "#"
# removed.
#
# Every rule except the gallery caption is matched literally
# (fixed = TRUE), so "|", "(" and ")" are never read as regex operators.
# The old pattern "||" was an empty regex alternation that matched at every
# position, interleaving "#" through the text so that the interview
# boilerplate rules after it could not match.
tidy_function <- function(data) {
  strip_literal <- function(text, literal) {
    gsub(literal, "", text, fixed = TRUE)
  }

  # Line breaks (a real "\n" or a literal "/n") are parked as "####" and only
  # dropped at the very end together with every other "#". Until then no
  # rule below can match across a former line break.
  cleaned <- gsub("\n", "####", data, fixed = TRUE)
  cleaned <- gsub("/n", "####", cleaned, fixed = TRUE)

  # Gallery captions such as "13 of 42To buy official ...". [0-9]+ consumes
  # the whole counter; the previous single-digit class left the tens digits
  # behind, which is what the hard-coded "1111111111222..." removals were
  # compensating for.
  cleaned <- gsub(
    "[0-9]+ of [0-9]+To buy official Arsenal pictures visit Arsenal Pics",
    "",
    cleaned
  )

  # Page boilerplate, removed while " - " and "||" are still present so the
  # phrases can match exactly as written.
  boilerplate <- c(
    "WHAT HAPPENED",
    "Play videoWatch Arsenal video online05:24Highlights | Crystal Palace 0-2 Arsenal - bitesize",
    "'Play videoWatch Arsenal video online02:17Mikel Arteta post-match interview | Crystal Palace 0-2 Arsenal | Premier LeagueArteta: \'",
    "\"read everything from his press conferencePlay videoWatch Arsenal video online02:07William Saliba post-match interview || Premier LeagueSaliba:"
  )
  for (phrase in boilerplate) {
    cleaned <- strip_literal(cleaned, phrase)
  }

  # Remaining punctuation noise; "#" goes last because it also clears the
  # line-break markers inserted above.
  for (noise in c(" - ", "''", "(", ")", "||", "#")) {
    cleaned <- strip_literal(cleaned, noise)
  }

  cleaned
}
# Every report is sent through tidy_function() twice because one pass still
# leaves some artefacts behind; this is a temporary workaround.
Arsenal_url <- ""
Match_1 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_2 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_3 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_4 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_5 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_6 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_7 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
# Arsenal 2021 season: scrape each report, then tidy it twice
Arsenal_url <- ""
Match_1_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_2_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_3_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_4_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_5_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_6_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_7_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_8_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_9_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_10_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_11_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_12_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_13_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_14_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_15_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_16_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_17_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_18_2021 <- Web_scrape_function_Arsenal(Arsenal_url) %>%
  tidy_function() %>% tidy_function()
Arsenal_url <- ""
Match_19_2021 <- Web_scrape_function_Arsenal(Arsenal_url)
Match_19_2021 <- tidy_function(Match_19_2021)
Match_19_2021 <- tidy_function(Match_19_2021)
Arsenal_url <- ""
Match_20_2021 <- Web_scrape_function_Arsenal(Arsenal_url)
Match_20_2021 <- tidy_function(Match_20_2021)
Match_20_2021 <- tidy_function(Match_20_2021)
Arsenal_url <- ""
Match_21_2021 <- Web_scrape_function_Arsenal(Arsenal_url)
Match_21_2021 <- tidy_function(Match_21_2021)
Match_21_2021 <- tidy_function(Match_21_2021)
Arsenal_url <- ""
Match_22_2021 <- Web_scrape_function_Arsenal(Arsenal_url)
Match_22_2021 <- tidy_function(Match_22_2021)
Match_22_2021 <- tidy_function(Match_22_2021)
Arsenal_url <- ""
Match_23_2021 <- Web_scrape_function_Arsenal(Arsenal_url)
Match_23_2021 <- tidy_function(Match_23_2021)
Match_23_2021 <- tidy_function(Match_23_2021)
Arsenal_url <- ""
Match_24_2021 <- Web_scrape_function_Arsenal(Arsenal_url)
Match_24_2021 <- tidy_function(Match_24_2021)
Match_24_2021 <- tidy_function(Match_24_2021)
Arsenal_url <- ""
Match_25_2021 <- Web_scrape_function_Arsenal(Arsenal_url)
Match_25_2021 <- tidy_function(Match_25_2021)
Match_25_2021 <- tidy_function(Match_25_2021)
Arsenal_url <- ""
Match_26_2021 <- Web_scrape_function_Arsenal(Arsenal_url)
Match_26_2021 <- tidy_function(Match_26_2021)
Match_26_2021 <- tidy_function(Match_26_2021)
Arsenal_url <- ""
Match_27_2021 <- Web_scrape_function_Arsenal(Arsenal_url)
Match_27_2021 <- tidy_function(Match_27_2021)
Match_27_2021 <- tidy_function(Match_27_2021)
Arsenal_url <- ""
Match_28_2021 <- Web_scrape_function_Arsenal(Arsenal_url)
Match_28_2021 <- tidy_function(Match_28_2021)
Match_28_2021 <- tidy_function(Match_28_2021)
Arsenal_url <- ""
Match_29_2021 <- Web_scrape_function_Arsenal(Arsenal_url)
Match_29_2021 <- tidy_function(Match_29_2021)
Match_29_2021 <- tidy_function(Match_29_2021)
Arsenal_url <- ""
Match_30_2021 <- Web_scrape_function_Arsenal(Arsenal_url)
Match_30_2021 <- tidy_function(Match_30_2021)
Match_30_2021 <- tidy_function(Match_30_2021)
Arsenal_url <- ""
Match_31_2021 <- Web_scrape_function_Arsenal(Arsenal_url)
Match_31_2021 <- tidy_function(Match_31_2021)
Match_31_2021 <- tidy_function(Match_31_2021)
Arsenal_url <- ""
Match_32_2021 <- Web_scrape_function_Arsenal(Arsenal_url)
Match_32_2021 <- tidy_function(Match_32_2021)
Match_32_2021 <- tidy_function(Match_32_2021)
Arsenal_url <- ""
Match_33_2021 <- Web_scrape_function_Arsenal(Arsenal_url)
Match_33_2021 <- tidy_function(Match_33_2021)
Match_33_2021 <- tidy_function(Match_33_2021)
Arsenal_url <- ""
Match_34_2021 <- Web_scrape_function_Arsenal(Arsenal_url)
Match_34_2021 <- tidy_function(Match_34_2021)
Match_34_2021 <- tidy_function(Match_34_2021)
Arsenal_url <- ""
Match_35_2021 <- Web_scrape_function_Arsenal(Arsenal_url)
Match_35_2021 <- tidy_function(Match_35_2021)
Match_35_2021 <- tidy_function(Match_35_2021)
Arsenal_url <- ""
Match_36_2021 <- Web_scrape_function_Arsenal(Arsenal_url)
Match_36_2021 <- tidy_function(Match_36_2021)
Match_36_2021 <- tidy_function(Match_36_2021)
Arsenal_url <- ""
Match_37_2021 <- Web_scrape_function_Arsenal(Arsenal_url)
Match_37_2021 <- tidy_function(Match_37_2021)
Match_37_2021 <- tidy_function(Match_37_2021)
Arsenal_url <- ""
Match_38_2021 <- Web_scrape_function_Arsenal(Arsenal_url)
Match_38_2021 <- tidy_function(Match_38_2021)
Match_38_2021 <- tidy_function(Match_38_2021)
Arsenal_url <- ""
Match_39_2021 <- Web_scrape_function_Arsenal(Arsenal_url)
Match_39_2021 <- tidy_function(Match_39_2021)
Match_39_2021 <- tidy_function(Match_39_2021)
Arsenal_url <- ""
Match_40_2021 <- Web_scrape_function_Arsenal(Arsenal_url)
Match_40_2021 <- tidy_function(Match_40_2021)
Match_40_2021 <- tidy_function(Match_40_2021)
# Manchester City data
# Scrape one Manchester City match report and strip unwanted characters.
#
# Arguments:
#   url  - address of the match-report page, handed to read_html().
#   css  - ignored; the selector is always ".article-body__article-text".
#          Kept in the signature so existing calls keep working.
#   data - ignored; kept in the signature so existing calls keep working.
# Returns: the cleaned report text produced by the chain below.
Web_scrape_function_mancity <- function(url, css, data) {
  page <- read_html(url)
  css <- ".article-body__article-text"
  # NOTE(review): html_text() is assumed as the text-extraction step -- confirm.
  data <- page %>%
    html_node(css = css) %>%
    html_text()
  # Line breaks (real "\n" and literal "/n") become a "####" placeholder; the
  # final "#" removal strips that placeholder along with any other "#".
  data <- data %>%
    str_replace_all("\n", "####") %>%
    str_replace_all("/n", "####") %>%
    str_remove_all(" - ") %>%
    str_remove_all("\\(") %>%
    str_remove_all("\\)") %>%
    str_remove_all("#")
  data
}
# Manchester City: one URL assignment and one scrape per match report.
# NOTE(review): the URLs are empty strings in this copy; confirm the real
# match-report addresses before running.
mancity_url <- ""
Manc_1 <- mancity_url %>% Web_scrape_function_mancity()
mancity_url <- ""
Manc_2 <- mancity_url %>% Web_scrape_function_mancity()
mancity_url <- ""
Manc_3 <- mancity_url %>% Web_scrape_function_mancity()
mancity_url <- ""
Manc_4 <- mancity_url %>% Web_scrape_function_mancity()
mancity_url <- ""
Manc_5 <- mancity_url %>% Web_scrape_function_mancity()
mancity_url <- ""
Manc_6 <- mancity_url %>% Web_scrape_function_mancity()
mancity_url <- ""
Manc_7 <- mancity_url %>% Web_scrape_function_mancity()
# Newcastle United
# Newcastle United's first match, against Nottingham Forest
# 1 rule for 1 bot, crawl delay of 5 seconds; scrapable
# Scrape one Newcastle United match report and strip unwanted characters.
#
# Arguments:
#   url  - address of the match-report page, handed to read_html().
#   css  - ignored; the selector is always ".article__body".
#          Kept in the signature so existing calls keep working.
#   data - ignored; kept in the signature so existing calls keep working.
# Returns: the cleaned report text produced by the chain below.
Web_scrape_function_Newcastle <- function(url, css, data) {
  page <- read_html(url)
  css <- ".article__body"
  # NOTE(review): html_text() is assumed as the text-extraction step -- confirm.
  data <- page %>%
    html_node(css = css) %>%
    html_text()
  # Line breaks (real "\n" and literal "/n") become a "####" placeholder; the
  # final "#" removal strips that placeholder along with any other "#".
  data <- data %>%
    str_replace_all("\n", "####") %>%
    str_replace_all("/n", "####") %>%
    str_remove_all(" - ") %>%
    str_remove_all("\\(") %>%
    str_remove_all("\\)") %>%
    str_remove_all("\"") %>%
    str_remove_all("#")
  data
}
# Newcastle United: one URL assignment and one scrape per match report.
# NOTE(review): the URLs are empty strings in this copy; confirm the real
# match-report addresses before running.
Newcastle_url <- ""
nc_1 <- Newcastle_url %>% Web_scrape_function_Newcastle()
Newcastle_url <- ""
nc_2 <- Newcastle_url %>% Web_scrape_function_Newcastle()
Newcastle_url <- ""
nc_3 <- Newcastle_url %>% Web_scrape_function_Newcastle()
Newcastle_url <- ""
nc_4 <- Newcastle_url %>% Web_scrape_function_Newcastle()
Newcastle_url <- ""
nc_5 <- Newcastle_url %>% Web_scrape_function_Newcastle()
Newcastle_url <- ""
nc_6 <- Newcastle_url %>% Web_scrape_function_Newcastle()
Newcastle_url <- ""
nc_7 <- Newcastle_url %>% Web_scrape_function_Newcastle()
# Everton
# Everton vs Chelsea
# 1 rule for 1 bot, crawl delay of 5 seconds; scrapable
# Scrape one Everton match report and strip unwanted characters.
#
# Arguments:
#   url  - address of the match-report page, handed to read_html().
#   css  - ignored; the selector is set inside the function.
#          Kept in the signature so existing calls keep working.
#   data - ignored; kept in the signature so existing calls keep working.
# Returns: the cleaned report text produced by the chain below.
Web_scrape_function_Everton <- function(url, css, data) {
  page <- read_html(url)
  # NOTE(review): the selector is an empty string here; html_node() needs a
  # real CSS selector for the article body -- confirm and fill in.
  css <- ""
  # NOTE(review): html_text() is assumed as the text-extraction step -- confirm.
  data <- page %>%
    html_node(css = css) %>%
    html_text()
  # Line breaks (real "\n" and literal "/n") become a "####" placeholder; the
  # final "#" removal strips that placeholder along with any other "#".
  data <- data %>%
    str_replace_all("\n", "####") %>%
    str_replace_all("/n", "####") %>%
    str_remove_all(" - ") %>%
    str_remove_all("\\(") %>%
    str_remove_all("\\)") %>%
    str_remove_all("\"") %>%
    str_remove_all("#")
  data
}
# Everton: one URL assignment and one scrape per match report.
# NOTE(review): the URLs are empty strings in this copy; confirm the real
# match-report addresses before running.
Everton_url <- ""
ever_1 <- Everton_url %>% Web_scrape_function_Everton()
Everton_url <- ""
ever_2 <- Everton_url %>% Web_scrape_function_Everton()
Everton_url <- ""
ever_3 <- Everton_url %>% Web_scrape_function_Everton()
Everton_url <- ""
ever_4 <- Everton_url %>% Web_scrape_function_Everton()
Everton_url <- ""
ever_5 <- Everton_url %>% Web_scrape_function_Everton()
Everton_url <- ""
ever_6 <- Everton_url %>% Web_scrape_function_Everton()
Everton_url <- ""
ever_7 <- Everton_url %>% Web_scrape_function_Everton()
# Leicester
# Leicester against Brentford
# 1 rule for 1 bot, crawl delay of 5 seconds; scrapable
# Scrape one Leicester City match report and strip unwanted characters.
#
# Arguments:
#   url  - address of the match-report page, handed to read_html().
#   css  - ignored; the selector is always ".col-12".
#          Kept in the signature so existing calls keep working.
#   data - ignored; kept in the signature so existing calls keep working.
# Returns: the cleaned report text produced by the chain below.
Web_scrape_function_Leicester <- function(url, css, data) {
  page <- read_html(url)
  css <- ".col-12"
  # NOTE(review): html_text() is assumed as the text-extraction step -- confirm.
  data <- page %>%
    html_node(css = css) %>%
    html_text()
  # Line breaks (real "\n" and literal "/n") become a "####" placeholder; the
  # "#" removal strips that placeholder along with any other "#".
  # NOTE(review): the last pattern is treated as a regex (each "." matches any
  # character) and runs after " - " has already been removed -- confirm it
  # still matches the footer text it is meant to drop.
  data <- data %>%
    str_replace_all("\n", "####") %>%
    str_replace_all("/n", "####") %>%
    str_remove_all(" - ") %>%
    str_remove_all("\\(") %>%
    str_remove_all("\\)") %>%
    str_remove_all("\"") %>%
    str_remove_all("#") %>%
    str_remove_all("More on this story. . . In Photos -")
  data
}
# Leicester City: one URL assignment and one scrape per match report.
# NOTE(review): the URLs are empty strings in this copy; confirm the real
# match-report addresses before running.
Leicester_url <- ""
lei_1 <- Leicester_url %>% Web_scrape_function_Leicester()
Leicester_url <- ""
lei_2 <- Leicester_url %>% Web_scrape_function_Leicester()
Leicester_url <- ""
lei_3 <- Leicester_url %>% Web_scrape_function_Leicester()
Leicester_url <- ""
lei_4 <- Leicester_url %>% Web_scrape_function_Leicester()
Leicester_url <- ""
lei_5 <- Leicester_url %>% Web_scrape_function_Leicester()
Leicester_url <- ""
lei_6 <- Leicester_url %>% Web_scrape_function_Leicester()
Leicester_url <- ""
lei_7 <- Leicester_url %>% Web_scrape_function_Leicester()
# West Ham
# West Ham vs Manchester City
# Scrape one West Ham United match report and strip unwanted characters.
#
# Arguments:
#   url  - address of the match-report page, handed to read_html().
#   css  - ignored; the selector is always ".m-article__columns".
#          Kept in the signature so existing calls keep working.
#   data - ignored; kept in the signature so existing calls keep working.
# Returns: the cleaned report text produced by the chain below.
Web_scrape_function_WestHam <- function(url, css, data) {
  page <- read_html(url)
  css <- ".m-article__columns"
  # NOTE(review): html_text() is assumed as the text-extraction step -- confirm.
  data <- page %>%
    html_node(css = css) %>%
    html_text()
  # Line breaks (real "\n" and literal "/n") become a "####" placeholder; the
  # "#" removal strips that placeholder along with any other "#".
  # NOTE(review): the last pattern is treated as a regex (each "." matches any
  # character) and runs after " - " has already been removed -- confirm it
  # is still wanted for West Ham pages.
  data <- data %>%
    str_replace_all("\n", "####") %>%
    str_replace_all("/n", "####") %>%
    str_remove_all(" - ") %>%
    str_remove_all("\\(") %>%
    str_remove_all("\\)") %>%
    str_remove_all("\"") %>%
    str_remove_all("#") %>%
    str_remove_all("More on this story. . . In Photos -")
  data
}
# West Ham United: one URL assignment and one scrape per match report.
# NOTE(review): the URLs are empty strings in this copy; confirm the real
# match-report addresses before running.
WestHam_url <- ""
wh_1 <- WestHam_url %>% Web_scrape_function_WestHam()
WestHam_url <- ""
wh_2 <- WestHam_url %>% Web_scrape_function_WestHam()
WestHam_url <- ""
wh_3 <- WestHam_url %>% Web_scrape_function_WestHam()
WestHam_url <- ""
wh_4 <- WestHam_url %>% Web_scrape_function_WestHam()
WestHam_url <- ""
wh_5 <- WestHam_url %>% Web_scrape_function_WestHam()
WestHam_url <- ""
wh_6 <- WestHam_url %>% Web_scrape_function_WestHam()
WestHam_url <- ""
wh_7 <- WestHam_url %>% Web_scrape_function_WestHam()
# Preprocessing
# Gather each team's scraped match reports into one vector per team; these
# vectors are what the corpora are built from.

# Arsenal: the 7 current-season reports followed by the 40 reports from 2021
Arsenal <- c(
  Match_1, Match_2, Match_3, Match_4, Match_5, Match_6, Match_7,
  Match_1_2021, Match_2_2021, Match_3_2021, Match_4_2021, Match_5_2021,
  Match_6_2021, Match_7_2021, Match_8_2021, Match_9_2021, Match_10_2021,
  Match_11_2021, Match_12_2021, Match_13_2021, Match_14_2021, Match_15_2021,
  Match_16_2021, Match_17_2021, Match_18_2021, Match_19_2021, Match_20_2021,
  Match_21_2021, Match_22_2021, Match_23_2021, Match_24_2021, Match_25_2021,
  Match_26_2021, Match_27_2021, Match_28_2021, Match_29_2021, Match_30_2021,
  Match_31_2021, Match_32_2021, Match_33_2021, Match_34_2021, Match_35_2021,
  Match_36_2021, Match_37_2021, Match_38_2021, Match_39_2021, Match_40_2021
)
Arsenal_corpus <- corpus(Arsenal)

# Manchester City
Manchester_City <- c(
  Manc_1, Manc_2, Manc_3, Manc_4, Manc_5, Manc_6, Manc_7
)

# Newcastle United
Newcastle_United <- c(
  nc_1, nc_2, nc_3, nc_4, nc_5, nc_6, nc_7
)

# Everton
Everton <- c(
  ever_1, ever_2, ever_3, ever_4, ever_5, ever_6, ever_7
)

# Leicester
Leicester <- c(
  lei_1, lei_2, lei_3, lei_4, lei_5, lei_6, lei_7
)

# West Ham United
West_Ham_United <- c(
  wh_1, wh_2, wh_3, wh_4, wh_5, wh_6, wh_7
)
# Names of the per-team character vectors to turn into corpora.
Prem <- c("Arsenal", "Manchester_City", "Newcastle_United", "Everton", "Leicester", "West_Ham_United")
# For every team, build `<team>_corpus` and `<team>_summary` in the current
# environment. The objects are created directly with get()/assign() instead of
# pasting code strings together.
for (team in Prem) {
  # create the corpus from the team's vector of match reports
  team_corpus <- corpus(get(team))
  # prefix document names with the team name so they stay unique once the
  # team corpora are combined
  tmpNames <- docnames(team_corpus)
  docnames(team_corpus) <- paste(team, tmpNames, sep = "-")
  # summary data, one row per document; n is set explicitly because
  # summary() reports at most 100 documents by default
  team_summary <- summary(team_corpus, n = ndoc(team_corpus))
  # team indicator
  team_summary$Team <- team
  # match indicator: first run of digits found in the document name
  team_summary$Match <- as.numeric(str_extract(team_summary$Text, "[0-9]+"))
  # attach the summary to the corpus as document-level metadata
  docvars(team_corpus) <- team_summary
  assign(paste0(team, "_corpus"), team_corpus)
  assign(paste0(team, "_summary"), team_summary)
}
# Merge the six team corpora into one combined corpus.
Prem <- c(
  Arsenal_corpus,
  Manchester_City_corpus,
  Newcastle_United_corpus,
  Everton_corpus,
  Leicester_corpus,
  West_Ham_United_corpus
)
Prem_summary <- summary(Prem)
# Arsenal-only subset, selected through the Team document variable
Arsenal_1 <- corpus_subset(Prem, Team == "Arsenal")
# Tokenise the combined corpus (dropping punctuation and symbols) and build a
# lower-cased document-feature matrix from the tokens.
Prem_dfm <- tokens(Prem,
                   remove_punct = TRUE,
                   remove_symbols = TRUE) %>%
  dfm(tolower = TRUE)
# Show the 20 most frequent features; run as its own statement because it
# reads Prem_dfm, which only exists once the assignment above has finished.
topfeatures(Prem_dfm, 20)
# Creating a table of feature frequencies, sorted from most to least frequent,
# and then ranking them
word_counts <- as.data.frame(sort(colSums(Prem_dfm), decreasing = TRUE))
colnames(word_counts) <- c("Frequency")
# rank 1 = most frequent feature (rows are already sorted)
word_counts$Rank <- seq_len(ncol(Prem_dfm))
# Frequency against rank
ggplot(word_counts, mapping = aes(x = Rank, y = Frequency)) +
  geom_point() +
  labs(title = "Zipf's Law", x = "Rank", y = "Frequency")
# Trim the DFM in two steps: first keep features that occur at least 10 times
# overall, then keep those appearing in at least 10% of the documents
# (match reports).
Prem_smaller_dfm <- Prem_dfm %>%
  dfm_trim(min_termfreq = 10) %>%
  dfm_trim(min_docfreq = 0.1, docfreq_type = "prop")
# Word cloud of the trimmed features
textplot_wordcloud(
  Prem_smaller_dfm,
  min_count = 50,
  random_order = FALSE
)
# Feature co-occurrence matrix (FCM)
# Start again from the full DFM with stricter thresholds: a term frequency of
# at least 20 and presence in at least 30% of the documents.
Prem_smaller_dfm <- Prem_dfm %>%
  dfm_trim(min_termfreq = 20) %>%
  dfm_trim(min_docfreq = .3, docfreq_type = "prop")
# build the FCM from the trimmed DFM
Prem_smaller_fcm <- fcm(Prem_smaller_dfm)
# names of the 30 top features of the FCM
myFeatures <- Prem_smaller_fcm %>%
  topfeatures(30) %>%
  names()
# keep only those features in the matrix
Prem_smaller_fcm <- fcm_select(
  Prem_smaller_fcm,
  pattern = myFeatures,
  selection = "keep"
)
# vertex weights: log of each feature's column total
size <- log(colSums(Prem_smaller_fcm))
# network plot, vertex sizes rescaled so the largest is 3
textplot_network(
  Prem_smaller_fcm,
  vertex_size = size / max(size) * 3
)