library(tidyverse)
library(ggplot2)
library(dplyr)
library(tidyr)
library(readr)
::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE) knitr
Challenge 8
challenge_8
railroads
snl
faostat
debt
Joining Data
Reading in the data
This dataset will look at SNL actor/actress gender over time.
<- read_csv("_data/snl_actors.csv")
actors <- read_csv("_data/snl_casts.csv")
casts <- read_csv("_data/snl_seasons.csv") seasons
tibble(actors, 10)
# A tibble: 2,306 × 5
aid url type gender `10`
<chr> <chr> <chr> <chr> <dbl>
1 Kate McKinnon /Cast/?KaMc cast female 10
2 Alex Moffat /Cast/?AlMo cast male 10
3 Ego Nwodim /Cast/?EgNw cast unknown 10
4 Chris Redd /Cast/?ChRe cast male 10
5 Kenan Thompson /Cast/?KeTh cast male 10
6 Carey Mulligan /Guests/?3677 guest andy 10
7 Marcus Mumford /Guests/?3679 guest male 10
8 Aidy Bryant /Cast/?AiBr cast female 10
9 Steve Higgins /Crew/?StHi crew male 10
10 Mikey Day /Cast/?MiDa cast male 10
# … with 2,296 more rows
tibble(casts, 10)
# A tibble: 614 × 9
aid sid featured first…¹ last_…² updat…³ n_epi…⁴ seaso…⁵ `10`
<chr> <dbl> <lgl> <dbl> <dbl> <lgl> <dbl> <dbl> <dbl>
1 A. Whitney Brown 11 TRUE 1.99e7 NA FALSE 8 0.444 10
2 A. Whitney Brown 12 TRUE NA NA FALSE 20 1 10
3 A. Whitney Brown 13 TRUE NA NA FALSE 13 1 10
4 A. Whitney Brown 14 TRUE NA NA FALSE 20 1 10
5 A. Whitney Brown 15 TRUE NA NA FALSE 20 1 10
6 A. Whitney Brown 16 TRUE NA NA FALSE 20 1 10
7 Alan Zweibel 5 TRUE 1.98e7 NA FALSE 5 0.25 10
8 Sasheer Zamata 39 TRUE 2.01e7 NA FALSE 11 0.524 10
9 Sasheer Zamata 40 TRUE NA NA FALSE 21 1 10
10 Sasheer Zamata 41 FALSE NA NA FALSE 21 1 10
# … with 604 more rows, and abbreviated variable names ¹first_epid, ²last_epid,
# ³update_anchor, ⁴n_episodes, ⁵season_fraction
tibble(seasons, 10)
# A tibble: 46 × 6
sid year first_epid last_epid n_episodes `10`
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1975 19751011 19760731 24 10
2 2 1976 19760918 19770521 22 10
3 3 1977 19770924 19780520 20 10
4 4 1978 19781007 19790526 20 10
5 5 1979 19791013 19800524 20 10
6 6 1980 19801115 19810411 13 10
7 7 1981 19811003 19820522 20 10
8 8 1982 19820925 19830514 20 10
9 9 1983 19831008 19840512 19 10
10 10 1984 19841006 19850413 17 10
# … with 36 more rows
In order to join these datasets I will have to join actors and casts by “aid” or actor ID and then join that with seasons through “sid” or season ID.
unique(actors$ "type")
[1] "cast" "guest" "crew" "unknown"
As a way to filter out the potential skewing of data through guests starring or crew (though those would also be interesting to look at later) I will filter these out of our datasets.
<- casts %>%
casts_gender_count left_join(filter(actors, type=="cast"),
by="aid") %>%
count(sid, gender)
tibble(casts_gender_count)
# A tibble: 102 × 3
sid gender n
<dbl> <chr> <int>
1 1 female 3
2 1 male 6
3 2 female 3
4 2 male 5
5 3 female 3
6 3 male 6
7 4 female 3
8 4 male 6
9 5 female 3
10 5 male 12
# … with 92 more rows
<- casts_gender_count %>%
cast_prop_all group_by(sid) %>%
mutate(prop=n/sum(n)) %>%
ungroup () %>%
select(-n) %>%
pivot_wider(names_from = gender,
values_from = prop) %>%
mutate(across(everything(),~replace_na(.,0))) %>%
pivot_longer(c(female, male, `NA`, unknown),
values_to = "prop",
names_to = "gender")
tibble(cast_prop_all)
# A tibble: 184 × 3
sid gender prop
<dbl> <chr> <dbl>
1 1 female 0.333
2 1 male 0.667
3 1 NA 0
4 1 unknown 0
5 2 female 0.375
6 2 male 0.625
7 2 NA 0
8 2 unknown 0
9 3 female 0.333
10 3 male 0.667
# … with 174 more rows