library(tidyverse)
library(ggplot2)
library(lubridate)
# Chunk defaults: show the code, hide warnings and messages in the output.
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
Challenge 8 - SNL
Read in snl data
- snl ⭐⭐⭐⭐⭐
# Load the actors table (one row per cast member or guest) and show its size.
snl_actors <- read.csv("_data/snl_actors.csv")
dim(snl_actors)
[1] 2306 4
head(snl_actors)
aid url type gender
1 Kate McKinnon /Cast/?KaMc cast female
2 Alex Moffat /Cast/?AlMo cast male
3 Ego Nwodim /Cast/?EgNw cast unknown
4 Chris Redd /Cast/?ChRe cast male
5 Kenan Thompson /Cast/?KeTh cast male
6 Carey Mulligan /Guests/?3677 guest andy
# Load the casts table (one row per cast member per season) and show its size.
snl_casts <- read.csv("_data/snl_casts.csv")
dim(snl_casts)
[1] 614 8
head(snl_casts)
aid sid featured first_epid last_epid update_anchor n_episodes
1 A. Whitney Brown 11 True 19860222 NA False 8
2 A. Whitney Brown 12 True NA NA False 20
3 A. Whitney Brown 13 True NA NA False 13
4 A. Whitney Brown 14 True NA NA False 20
5 A. Whitney Brown 15 True NA NA False 20
6 A. Whitney Brown 16 True NA NA False 20
season_fraction
1 0.4444444
2 1.0000000
3 1.0000000
4 1.0000000
5 1.0000000
6 1.0000000
# Load the seasons table (one row per season) and show its size.
snl_seasons <- read.csv("_data/snl_seasons.csv")
dim(snl_seasons)
[1] 46 5
head(snl_seasons)
sid year first_epid last_epid n_episodes
1 1 1975 19751011 19760731 24
2 2 1976 19760918 19770521 22
3 3 1977 19770924 19780520 20
4 4 1978 19781007 19790526 20
5 5 1979 19791013 19800524 20
6 6 1980 19801115 19810411 13
Briefly describe the data
The SNL data consists of three data sets:
SNL actors is a list of all cast and guest members who have been on SNL (2306). There is a link out to more information, what type of actor they were (cast vs guest) and their gender. Each row is an actor.
SNL casts, on the other hand, is a much more robust data set in terms of variables. While it only includes the cast members (614) (not guests), it includes information on the seasons they were on the show, how many episodes in each season, and the dates of the first and last episodes in a given season that they were in. Most values for first and last episode are NA, as they are only included if they differ from the first or last episode of that season. This will be dealt with later on.
The SNL casts data has the general format we would like to use for analysis. Each row represents an “actor-season” (one cast member in one season). This is what each case should be in our final data set.
Finally, the SNL seasons data contains information on the seasons (46 total), including year, dates of the first and last episode, and episode count per season. Each row is a season.
SNL actors is a list of all cast and guest members who have been on SNL (2306). There is a link out to more information, what type of actor they were (cast vs guest) and their gender. Each row is an actor.
SNL casts, on the other hand, is a much more robust data set in terms of variables. While it only includes the cast members (614) (not guests), it includes information on the seasons they were on the show, how many episodes in each season, and the dates of the first and last episodes in a given season that they were in. Most values for first and last episode are NA, as they are only included if they differ from the first or last episode of that season. This will be dealt with later on.
The SNL casts data has the general format we would like to use for analysis. Each row represents an “actor-season” (one cast member in one season). This is what each case should be in our final data set.
Finally, the SNL seasons data contains information on the seasons (46 total), including year, dates of the first and last episode, and episode count per season. Each row is a season.
Tidy and mutation before joining
I will do enough tidy-ing to join the data, and then work on some additional mutations after to clean the final data set.
Before moving on, I’m going to filter out guests from the actors data set, as we don’t have any information about them except for gender to use in an analysis.
# Keep only cast members (drop guests) and retain just the join key and gender.
snl_actors <- snl_actors %>%
  filter(type == "cast") %>%
  select(aid, gender)
As you can see, there are two columns (whether the cast member was an update anchor, and whether they were featured) which are currently character, but should be logical. This is changed below.
#showing character class before mutation
class(snl_casts$update_anchor)
[1] "character"
class(snl_casts$featured)
[1] "character"
# Convert the "True"/"False" strings to logical values.
# case_when() yields NA for any value that matches neither branch.
snl_casts <- snl_casts %>%
  mutate(
    update_anchor = case_when(
      update_anchor == "True" ~ TRUE,
      update_anchor == "False" ~ FALSE
    ),
    featured = case_when(
      featured == "True" ~ TRUE,
      featured == "False" ~ FALSE
    )
  )
#showing logical class after mutation
class(snl_casts$update_anchor)
[1] "logical"
class(snl_casts$featured)
[1] "logical"
Join Data
As we join the three data sets, I will use the SNL casts data as the primary data set, and add information from the other data sets into this. So my case count at the end should be 614 and each case will represent an “actor-season”.
First, I will add the data about each season into the casts data. This includes first and last date of the season, number of episodes, and year.
Next, I use the SNL actors data to add gender of the cast member into my SNL casts data.
# Attach season-level columns to every cast row. Columns present in both
# tables (first_epid, last_epid, n_episodes) receive the .x (casts) and
# .y (seasons) suffixes.
snl_castsandseasons <- left_join(snl_casts, snl_seasons, by = "sid")

# Add the remaining actors-table columns to each cast row, matched on actor id.
snl_castsseasonsandactors <- left_join(
  snl_castsandseasons,
  snl_actors,
  by = "aid"
)

head(snl_castsseasonsandactors)
aid sid featured first_epid.x last_epid.x update_anchor
1 A. Whitney Brown 11 TRUE 19860222 NA FALSE
2 A. Whitney Brown 12 TRUE NA NA FALSE
3 A. Whitney Brown 13 TRUE NA NA FALSE
4 A. Whitney Brown 14 TRUE NA NA FALSE
5 A. Whitney Brown 15 TRUE NA NA FALSE
6 A. Whitney Brown 16 TRUE NA NA FALSE
n_episodes.x season_fraction year first_epid.y last_epid.y n_episodes.y
1 8 0.4444444 1985 19851109 19860524 18
2 20 1.0000000 1986 19861011 19870523 20
3 13 1.0000000 1987 19871017 19880227 13
4 20 1.0000000 1988 19881008 19890520 20
5 20 1.0000000 1989 19890930 19900519 20
6 20 1.0000000 1990 19900929 19910518 20
gender
1 male
2 male
3 male
4 male
5 male
6 male
dim(snl_castsseasonsandactors)
[1] 614 13
My data now consists of all variables and is the correct number of rows. There are still some things to do in terms of tidy-ing.
Currently, there are 4 date columns in the data: two “first episode” columns and two “last episode” columns. Since our cases are “actor-seasons”, I’m going to combine these so that each row uses the dates of the first and last episode of the season, unless the actor was only there for a partial season. In those cases, the date will reflect the first or last episode they were involved in. This will reduce the column count from 13 to 11, as you can see below.
I also needed to turn the numeric date columns into actual dates.
Finally, there are two episode counts, one for the number of episodes an actor was involved in and one for the number of episodes in a season. I’ve renamed these to be clearer.
# Build the final combined data set (one row per actor-season).
snl_all <- snl_castsseasonsandactors %>%
  # Use the actor-specific date (.x) when present; otherwise fall back to the
  # season's first/last episode date (.y).
  mutate(
    first_episode = coalesce(first_epid.x, first_epid.y),
    last_episode = coalesce(last_epid.x, last_epid.y)
  ) %>%
  # Parse the yyyymmdd numbers into Date values.
  mutate(
    first_episode = ymd(first_episode),
    last_episode = ymd(last_episode)
  ) %>%
  # Drop the four source date columns now that they are combined.
  select(-c(first_epid.x, first_epid.y, last_epid.x, last_epid.y)) %>%
  # Distinguish the actor's episode count from the season's episode count.
  rename(
    actor_episodes = n_episodes.x,
    season_episodes = n_episodes.y
  )
#printing dimensions and summary
dim(snl_casts)
[1] 614 8
dim(snl_all)
[1] 614 11
head(snl_all)
aid sid featured update_anchor actor_episodes season_fraction
1 A. Whitney Brown 11 TRUE FALSE 8 0.4444444
2 A. Whitney Brown 12 TRUE FALSE 20 1.0000000
3 A. Whitney Brown 13 TRUE FALSE 13 1.0000000
4 A. Whitney Brown 14 TRUE FALSE 20 1.0000000
5 A. Whitney Brown 15 TRUE FALSE 20 1.0000000
6 A. Whitney Brown 16 TRUE FALSE 20 1.0000000
year season_episodes gender first_episode last_episode
1 1985 18 male 1986-02-22 1986-05-24
2 1986 20 male 1986-10-11 1987-05-23
3 1987 13 male 1987-10-17 1988-02-27
4 1988 20 male 1988-10-08 1989-05-20
5 1989 20 male 1989-09-30 1990-05-19
6 1990 20 male 1990-09-29 1991-05-18
# Render a summary table of the combined data set.
# NOTE(review): valid.col = FALSE presumably hides the valid-count column —
# confirm against the summarytools documentation.
print(
  summarytools::dfSummary(snl_all, valid.col = FALSE),
  method = "render"
)
Data Frame Summary
snl_all
Dimensions: 614 x 11Duplicates: 0
No | Variable | Stats / Values | Freqs (% of Valid) | Graph | Missing | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | aid [character] |
|
|
0 (0.0%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
2 | sid [integer] |
|
46 distinct values | 0 (0.0%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
3 | featured [logical] |
|
|
0 (0.0%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
4 | update_anchor [logical] |
|
|
0 (0.0%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
5 | actor_episodes [integer] |
|
22 distinct values | 0 (0.0%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
6 | season_fraction [numeric] |
|
36 distinct values | 0 (0.0%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
7 | year [integer] |
|
46 distinct values | 0 (0.0%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
8 | season_episodes [integer] |
|
|
0 (0.0%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
9 | gender [character] |
|
|
7 (1.1%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
10 | first_episode [Date] |
|
80 distinct values | 0 (0.0%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
11 | last_episode [Date] |
|
63 distinct values | 0 (0.0%) |
Generated by summarytools 1.0.1 (R version 4.2.2)
2023-04-26