library(tidyverse)
library(viridis)
library(patchwork)
library(hrbrthemes)
library(fmsb)
library(colormap)
# Document-wide chunk defaults: echo the code, but keep warnings and messages
# out of the rendered output.
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
Challenge 8 Instructions
true
true
true
Joining Data
Challenge Overview
Overview of SNL Dataset
Read in data
Reading the three different SNL datasets from their CSV files
# Load the three SNL tables (seasons, casts, actors) from CSV.
# show_col_types = FALSE is passed to every read_csv() call so the three
# reads behave the same way.
snl_seasons <- read_csv(
  "~/Desktop/601_Spring_2023/posts/_data/snl_seasons.csv",
  show_col_types = FALSE
)
snl_casts <- read_csv(
  "~/Desktop/601_Spring_2023/posts/_data/snl_casts.csv",
  show_col_types = FALSE
)
snl_actors <- read_csv(
  "~/Desktop/601_Spring_2023/posts/_data/snl_actors.csv",
  show_col_types = FALSE
)

# Preview the first rows of the seasons table.
head(snl_seasons)
# A tibble: 6 × 5
sid year first_epid last_epid n_episodes
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1975 19751011 19760731 24
2 2 1976 19760918 19770521 22
3 3 1977 19770924 19780520 20
4 4 1978 19781007 19790526 20
5 5 1979 19791013 19800524 20
6 6 1980 19801115 19810411 13
# Preview the first six rows of the casts table.
head(snl_casts, n = 6L)
# A tibble: 6 × 8
aid sid featured first_epid last_epid update_anchor n_episodes
<chr> <dbl> <lgl> <dbl> <dbl> <lgl> <dbl>
1 A. Whitney Brown 11 TRUE 19860222 NA FALSE 8
2 A. Whitney Brown 12 TRUE NA NA FALSE 20
3 A. Whitney Brown 13 TRUE NA NA FALSE 13
4 A. Whitney Brown 14 TRUE NA NA FALSE 20
5 A. Whitney Brown 15 TRUE NA NA FALSE 20
6 A. Whitney Brown 16 TRUE NA NA FALSE 20
# ℹ 1 more variable: season_fraction <dbl>
# Preview the first six rows of the actors table.
head(snl_actors, n = 6L)
# A tibble: 6 × 4
aid url type gender
<chr> <chr> <chr> <chr>
1 Kate McKinnon /Cast/?KaMc cast female
2 Alex Moffat /Cast/?AlMo cast male
3 Ego Nwodim /Cast/?EgNw cast unknown
4 Chris Redd /Cast/?ChRe cast male
5 Kenan Thompson /Cast/?KeTh cast male
6 Carey Mulligan /Guests/?3677 guest andy
Briefly Describe the data
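The data come as three related tables about “Saturday Night Live”. The dataset snl_seasons contains 46 observations and 5 variables, one row per season, giving the year, the first and last episode IDs, and the number of episodes. The dataset snl_casts contains 614 observations and 8 variables describing cast members in each season (whether they were featured, whether they anchored Update, and how many episodes they appeared in). The dataset snl_actors contains 2,306 observations and 4 variables with details about each actor: a URL, type, and gender. The tables share key columns: `sid` (season) links seasons to casts, and `aid` (actor) links casts to actors.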
library(summarytools)
# Print a rendered summarytools data-frame summary of the seasons table.
print(
  summarytools::dfSummary(
    snl_seasons,
    varnumbers = FALSE,
    plain.ascii = FALSE,
    style = "grid",
    graph.magnif = 0.70,
    valid.col = FALSE
  ),
  method = "render",
  table.classes = "table-condensed"
)
Data Frame Summary
snl_seasons
Dimensions: 46 x 5Duplicates: 0
Variable | Stats / Values | Freqs (% of Valid) | Graph | Missing | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
sid [numeric] |
|
46 distinct values | 0 (0.0%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
year [numeric] |
|
46 distinct values | 0 (0.0%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
first_epid [numeric] |
|
46 distinct values | 0 (0.0%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
last_epid [numeric] |
|
46 distinct values | 0 (0.0%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
n_episodes [numeric] |
|
|
0 (0.0%) |
Generated by summarytools 1.0.1 (R version 4.2.2)
2023-05-05
# Print a rendered summarytools data-frame summary of the casts table.
print(
  summarytools::dfSummary(
    snl_casts,
    varnumbers = FALSE,
    plain.ascii = FALSE,
    style = "grid",
    graph.magnif = 0.70,
    valid.col = FALSE
  ),
  method = "render",
  table.classes = "table-condensed"
)
Data Frame Summary
snl_casts
Dimensions: 614 x 8Duplicates: 0
Variable | Stats / Values | Freqs (% of Valid) | Graph | Missing | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
aid [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
sid [numeric] |
|
46 distinct values | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
featured [logical] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
first_epid [numeric] |
|
35 distinct values | 564 (91.9%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
last_epid [numeric] |
|
17 distinct values | 597 (97.2%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
update_anchor [logical] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
n_episodes [numeric] |
|
22 distinct values | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
season_fraction [numeric] |
|
36 distinct values | 0 (0.0%) |
Generated by summarytools 1.0.1 (R version 4.2.2)
2023-05-05
# Print a rendered summarytools data-frame summary of the actors table.
print(
  summarytools::dfSummary(
    snl_actors,
    varnumbers = FALSE,
    plain.ascii = FALSE,
    style = "grid",
    graph.magnif = 0.70,
    valid.col = FALSE
  ),
  method = "render",
  table.classes = "table-condensed"
)
Data Frame Summary
snl_actors
Dimensions: 2306 x 4Duplicates: 0
Variable | Stats / Values | Freqs (% of Valid) | Graph | Missing | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
aid [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
url [character] |
|
|
57 (2.5%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
type [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
gender [character] |
|
|
0 (0.0%) |
Generated by summarytools 1.0.1 (R version 4.2.2)
2023-05-05
Tidy Data (as needed)
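To tidy the data, I rename the key columns in each dataset to more descriptive names, and then recode inconsistent values of `Gender` and `Type` in the actors table.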
# Renaming columns in seasons: every column gets a descriptive name.
# `Season` matches the name given to `sid` in the casts table.
snl_seasons <- snl_seasons %>%
  rename(
    Season = sid,
    Year = year,
    FirstEpisode = first_epid,
    LastEpisode = last_epid,
    TotEpisodes = n_episodes
  )
# Renaming columns in casts. `first_epid` and `last_epid` are left as they are.
snl_casts <- snl_casts %>%
  rename(
    Actor = aid,
    Season = sid,
    Featured = featured,
    Anchor = update_anchor,
    Episodes = n_episodes,
    EpisodesProp = season_fraction
  )

# Renaming columns in actors. `url` is left as it is. After this step the
# casts and actors tables share the key column `Actor` (formerly `aid`).
snl_actors <- snl_actors %>%
  rename(
    Actor = aid,
    Type = type,
    Gender = gender
  )
# Mutate fields: recode stray category labels in the actors table.
# In Gender, the text "andy" is replaced with "unknown"; in Type, the text
# "unknown" is replaced with "celebrity".
snl_actors <- snl_actors %>%
  mutate(
    Gender = str_replace(Gender, "andy", "unknown"),
    Type = str_replace(Type, "unknown", "celebrity")
  )
Join Data
Be sure to include a sanity check, and double-check that case count is correct!
library(dplyr)
# Join the casts table to the actors table. Both tables had `aid` renamed to
# `Actor` during tidying, so the key must be "Actor"; asking merge() for "aid"
# fails with "'by' must specify a uniquely valid column".
snl_joined <- merge(x = snl_casts, y = snl_actors, by = "Actor")

# Sanity check: merge() keeps only matching keys by default, so the joined
# table should have exactly as many rows as the casts table. Fewer rows means
# some cast members have no actor record; more rows means an actor is listed
# more than once in the actors table.
c(casts_rows = nrow(snl_casts), joined_rows = nrow(snl_joined))
Error in fix.by(by.x, x): 'by' must specify a uniquely valid column
# Print a rendered summarytools data-frame summary of the joined table.
print(
  summarytools::dfSummary(
    snl_joined,
    varnumbers = FALSE,
    plain.ascii = FALSE,
    style = "grid",
    graph.magnif = 0.70,
    valid.col = FALSE
  ),
  method = "render",
  table.classes = "table-condensed"
)
Error in summarytools::dfSummary(snl_joined, varnumbers = FALSE, plain.ascii = FALSE, : object 'snl_joined' not found