library(tidyverse)
library(ggplot2)
::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE) knitr
Challenge 8 Solutions
Challenge Overview
Working with the SNL Dataset for this Challenge.
Read in data
The SNL data set consists of three different csv files. They are read into separate dataframes.
# Reading in the SNL dataset
<- read_csv("_data/snl_actors.csv", show_col_types = FALSE)
actors <- read_csv("_data/snl_casts.csv", show_col_types = FALSE)
casts <- read_csv("_data/snl_seasons.csv", show_col_types = FALSE)
seasons
head(actors)
# A tibble: 6 × 4
aid url type gender
<chr> <chr> <chr> <chr>
1 Kate McKinnon /Cast/?KaMc cast female
2 Alex Moffat /Cast/?AlMo cast male
3 Ego Nwodim /Cast/?EgNw cast unknown
4 Chris Redd /Cast/?ChRe cast male
5 Kenan Thompson /Cast/?KeTh cast male
6 Carey Mulligan /Guests/?3677 guest andy
head(casts)
# A tibble: 6 × 8
aid sid featured first_epid last_epid update_…¹ n_epi…² seaso…³
<chr> <dbl> <lgl> <dbl> <dbl> <lgl> <dbl> <dbl>
1 A. Whitney Brown 11 TRUE 19860222 NA FALSE 8 0.444
2 A. Whitney Brown 12 TRUE NA NA FALSE 20 1
3 A. Whitney Brown 13 TRUE NA NA FALSE 13 1
4 A. Whitney Brown 14 TRUE NA NA FALSE 20 1
5 A. Whitney Brown 15 TRUE NA NA FALSE 20 1
6 A. Whitney Brown 16 TRUE NA NA FALSE 20 1
# … with abbreviated variable names ¹update_anchor, ²n_episodes,
# ³season_fraction
head(seasons)
# A tibble: 6 × 5
sid year first_epid last_epid n_episodes
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1975 19751011 19760731 24
2 2 1976 19760918 19770521 22
3 3 1977 19770924 19780520 20
4 4 1978 19781007 19790526 20
5 5 1979 19791013 19800524 20
6 6 1980 19801115 19810411 13
Briefly describe the data
There are three datasets. The first one is actors, which has 4 fields and 2306 observations. It basically consists of details about each actor, the type and their gender. The second dataset is casts which has 8 fields and 614 observations. Each row corresponds to a season that a particular actor has featured in. It also has details about the number of episodes that actor has featured in that particular season. The third dataset is seasons which consists of 5 fields and 46 observations. Each row corresponds to a season, the year it was released, the number of episodes in that season and the first and last episode dates.
Tidy Data (as needed)
# Actors
# Find the different types and genders
unique(actors$type)
[1] "cast" "guest" "crew" "unknown"
unique(actors$gender)
[1] "female" "male" "unknown" "andy"
<- actors %>% select(-c(url))
actors actors
# A tibble: 2,306 × 3
aid type gender
<chr> <chr> <chr>
1 Kate McKinnon cast female
2 Alex Moffat cast male
3 Ego Nwodim cast unknown
4 Chris Redd cast male
5 Kenan Thompson cast male
6 Carey Mulligan guest andy
7 Marcus Mumford guest male
8 Aidy Bryant cast female
9 Steve Higgins crew male
10 Mikey Day cast male
# … with 2,296 more rows
We can see that in actors, there is a particular type called ‘unknown’. In gender we can also see ‘unknown’ and another field called ‘andy’. We can retain these for now and remove them later if needed. We can remove the url field as it seems redundant for our analysis here.
The other two datasets seem to have the data in place. We can rename a few of these columns for better readability.
# Renaming columns in actors
<- actors %>%
actors rename(
Actor = aid,
Type = type,
Gender = gender
)
# Renaming columns in casts
<- casts %>%
casts rename(
Actor = aid,
Season = sid,
Featured = featured,
Anchor = update_anchor,
Episodes = n_episodes,
EpisodesProp = season_fraction
)
# Renaming columns in seasons
<- seasons %>%
seasons rename(
Season = sid,
Year = year,
FirstEpisode = first_epid,
LastEpisode = last_epid,
TotEpisodes = n_episodes,
)
There are some fields in these datasets which can be mutated. The ‘andy’ field under Gender in actors can also be mutated to ‘unknown’.
<- actors %>%
actors mutate(Gender = str_replace(Gender, "andy", "unknown"), Type = str_replace(Type, "unknown", "celebrity")
)
Join Data
We can now join the actors and casts data so that we can get an indicator of the gender and typeof all the actors being featured in different seasons. Here I am doing a left join wherein for each row in the casts table, I want the respective fields from actors table to be merged.
= merge(x=casts,y=actors,by="Actor",all.x=TRUE) actorsCasts
Next, I would also like to merge the seasons table to this, to get the Year and the total number of episodes.
= merge(x=actorsCasts, y=seasons, by="Season", all.x=TRUE) snl
Now we have a single table with all of the SNL data. From this table, I can now drop some redundant columns like ‘EpisodesProp’, ‘FirstEpisode’, ‘LastEpisode’ as these details can be inferred from the other fields.
<- snl %>% select(-c(EpisodesProp,FirstEpisode,LastEpisode)) %>% filter(Featured == FALSE) snl
From this table, we can further calculate the number of female, male and unknowns who have taken part from 2010-2020 and not featured.
<- snl %>% select(c(Season, Actor, Gender, Year)) %>%
genCount filter(Year > 2010) %>%
group_by(Season, Gender) %>%
summarise(count = n())
genCount
# A tibble: 21 × 3
# Groups: Season [10]
Season Gender count
<dbl> <chr> <int>
1 37 female 3
2 37 male 7
3 38 female 2
4 38 male 8
5 39 female 5
6 39 male 5
7 40 female 4
8 40 male 4
9 41 female 5
10 41 male 7
# … with 11 more rows