Challenge 8

challenge_8
Adithya Parupudi
snl data
Joining Data
Author

Adithya Parupudi

Published

August 25, 2022

library(tidyverse)
library(ggplot2)

knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)

Read in data

snl_actors <- read_csv("_data/snl_actors.csv",show_col_types = FALSE)
snl_casts <- read_csv("_data/snl_casts.csv",show_col_types = FALSE)
snl_seasons <- read_csv("_data/snl_seasons.csv",show_col_types = FALSE)

Briefly describe the data

Tidy Data (as needed)

Is your data already tidy, or is there work to be done? Be sure to anticipate your end result to provide a sanity check, and document your work here.

print(summarytools::dfSummary(snl_actors, 
                              varnumbers = FALSE, 
                              plain.ascii = FALSE, 
                              style = "grid", 
                              graph.magnif = 0.70, 
                              valid.col = FALSE), 
      method = 'render', 
      table.classes = 'table-condensed')

Data Frame Summary

snl_actors

Dimensions: 2306 x 4
Duplicates: 0
Variable Stats / Values Freqs (% of Valid) Graph Missing
aid [character]
1. 'N Sync
2. 070 Shake
3. 10,000 Maniacs
4. 14 Karat Soul
5. 2 Chainz
6. 3-D
7. 3RDEYEGIRL
8. 50 Cent
9. A Tribe Called Quest
10. A$AP Rocky
[ 2296 others ]
1(0.0%)
1(0.0%)
1(0.0%)
1(0.0%)
1(0.0%)
1(0.0%)
1(0.0%)
1(0.0%)
1(0.0%)
1(0.0%)
2296(99.6%)
0 (0.0%)
url [character]
1. /Cast/?AbEl
2. /Cast/?AdSa
3. /Cast/?AiBr
4. /Cast/?AlFr
5. /Cast/?AlMo
6. /Cast/?AlZw
7. /Cast/?AmPo
8. /Cast/?AnDi
9. /Cast/?AnGa
10. /Cast/?AnMH
[ 2239 others ]
1(0.0%)
1(0.0%)
1(0.0%)
1(0.0%)
1(0.0%)
1(0.0%)
1(0.0%)
1(0.0%)
1(0.0%)
1(0.0%)
2239(99.6%)
57 (2.5%)
type [character]
1. cast
2. crew
3. guest
4. unknown
154(6.7%)
170(7.4%)
1926(83.5%)
56(2.4%)
0 (0.0%)
gender [character]
1. andy
2. female
3. male
4. unknown
21(0.9%)
671(29.1%)
1226(53.2%)
388(16.8%)
0 (0.0%)

Generated by summarytools 1.0.1 (R version 4.2.1)
2022-09-04

print(summarytools::dfSummary(snl_casts, 
                              varnumbers = FALSE, 
                              plain.ascii = FALSE, 
                              style = "grid", 
                              graph.magnif = 0.70, 
                              valid.col = FALSE), 
      method = 'render', 
      table.classes = 'table-condensed')

Data Frame Summary

snl_casts

Dimensions: 614 x 8
Duplicates: 0
Variable Stats / Values Freqs (% of Valid) Graph Missing
aid [character]
1. Kenan Thompson
2. Darrell Hammond
3. Seth Meyers
4. Al Franken
5. Fred Armisen
6. Kate McKinnon
7. Tim Meadows
8. Aidy Bryant
9. Bobby Moynihan
10. Cecily Strong
[ 146 others ]
18(2.9%)
14(2.3%)
13(2.1%)
11(1.8%)
11(1.8%)
10(1.6%)
10(1.6%)
9(1.5%)
9(1.5%)
9(1.5%)
500(81.4%)
0 (0.0%)
sid [numeric]
Mean (sd) : 25.5 (13.1)
min ≤ med ≤ max:
1 ≤ 26 ≤ 46
IQR (CV) : 22 (0.5)
46 distinct values 0 (0.0%)
featured [logical]
1. FALSE
2. TRUE
451(73.5%)
163(26.5%)
0 (0.0%)
first_epid [numeric]
Mean (sd) : 19909635 (111264.8)
min ≤ med ≤ max:
19770115 ≤ 19901110 ≤ 20141025
IQR (CV) : 156624.2 (0)
35 distinct values 564 (91.9%)
last_epid [numeric]
Mean (sd) : 19944038 (126122.2)
min ≤ med ≤ max:
19751011 ≤ 19950225 ≤ 20140201
IQR (CV) : 190005 (0)
17 distinct values 597 (97.2%)
update_anchor [logical]
1. FALSE
2. TRUE
541(88.1%)
73(11.9%)
0 (0.0%)
n_episodes [numeric]
Mean (sd) : 18.7 (4)
min ≤ med ≤ max:
1 ≤ 20 ≤ 24
IQR (CV) : 2 (0.2)
22 distinct values 0 (0.0%)
season_fraction [numeric]
Mean (sd) : 0.9 (0.2)
min ≤ med ≤ max:
0 ≤ 1 ≤ 1
IQR (CV) : 0 (0.2)
36 distinct values 0 (0.0%)

Generated by summarytools 1.0.1 (R version 4.2.1)
2022-09-04

print(summarytools::dfSummary(snl_seasons, 
                              varnumbers = FALSE, 
                              plain.ascii = FALSE, 
                              style = "grid", 
                              graph.magnif = 0.70, 
                              valid.col = FALSE), 
      method = 'render', 
      table.classes = 'table-condensed')

Data Frame Summary

snl_seasons

Dimensions: 46 x 5
Duplicates: 0
Variable Stats / Values Freqs (% of Valid) Graph Missing
sid [numeric]
Mean (sd) : 23.5 (13.4)
min ≤ med ≤ max:
1 ≤ 23.5 ≤ 46
IQR (CV) : 22.5 (0.6)
46 distinct values 0 (0.0%)
year [numeric]
Mean (sd) : 1997.5 (13.4)
min ≤ med ≤ max:
1975 ≤ 1997.5 ≤ 2020
IQR (CV) : 22.5 (0)
46 distinct values 0 (0.0%)
first_epid [numeric]
Mean (sd) : 19975965 (134209.3)
min ≤ med ≤ max:
19751011 ≤ 19975927 ≤ 20201003
IQR (CV) : 224910.2 (0)
46 distinct values 0 (0.0%)
last_epid [numeric]
Mean (sd) : 19985509 (134223.9)
min ≤ med ≤ max:
19760731 ≤ 19985512 ≤ 20210410
IQR (CV) : 225066.2 (0)
46 distinct values 0 (0.0%)
n_episodes [numeric]
Mean (sd) : 19.7 (2.3)
min ≤ med ≤ max:
12 ≤ 20 ≤ 24
IQR (CV) : 1 (0.1)
12:1(2.2%)
13:2(4.3%)
17:2(4.3%)
18:2(4.3%)
19:3(6.5%)
20:23(50.0%)
21:7(15.2%)
22:5(10.9%)
24:1(2.2%)
0 (0.0%)

Generated by summarytools 1.0.1 (R version 4.2.1)
2022-09-04

colnames(snl_actors)
[1] "aid"    "url"    "type"   "gender"
colnames(snl_casts)
[1] "aid"             "sid"             "featured"        "first_epid"     
[5] "last_epid"       "update_anchor"   "n_episodes"      "season_fraction"
colnames(snl_seasons)
[1] "sid"        "year"       "first_epid" "last_epid"  "n_episodes"

Replacing NA in snl_casts with 0

snl_casts<- snl_casts %>% 
  replace_na(list(`first_epid`=0,
                  `last_epid`=0))

Removing url column from snl_actors as its not useful it is getting difficult to clean it.

snl_actors<- snl_actors %>% 
  select(-url)

Join Data

actors_casts <- snl_actors %>%
  inner_join(snl_casts, by = "aid")

fully_joined_dataset <- actors_casts %>%
  inner_join(snl_seasons, by = "sid")

fully_joined_dataset
# A tibble: 614 × 14
   aid        type  gender   sid featu…¹ first…² last_…³ updat…⁴ n_epi…⁵ seaso…⁶
   <chr>      <chr> <chr>  <dbl> <lgl>     <dbl>   <dbl> <lgl>     <dbl>   <dbl>
 1 Kate McKi… cast  female    37 TRUE     2.01e7       0 FALSE         5   0.227
 2 Kate McKi… cast  female    38 TRUE     0            0 FALSE        21   1    
 3 Kate McKi… cast  female    39 FALSE    0            0 FALSE        21   1    
 4 Kate McKi… cast  female    40 FALSE    0            0 FALSE        21   1    
 5 Kate McKi… cast  female    41 FALSE    0            0 FALSE        21   1    
 6 Kate McKi… cast  female    42 FALSE    0            0 FALSE        21   1    
 7 Kate McKi… cast  female    43 FALSE    0            0 FALSE        21   1    
 8 Kate McKi… cast  female    44 FALSE    0            0 FALSE        21   1    
 9 Kate McKi… cast  female    45 FALSE    0            0 FALSE        18   1    
10 Kate McKi… cast  female    46 FALSE    0            0 FALSE        17   1    
# … with 604 more rows, 4 more variables: year <dbl>, first_epid.y <dbl>,
#   last_epid.y <dbl>, n_episodes.y <dbl>, and abbreviated variable names
#   ¹​featured, ²​first_epid.x, ³​last_epid.x, ⁴​update_anchor, ⁵​n_episodes.x,
#   ⁶​season_fraction
# ℹ Use `print(n = ...)` to see more rows, and `colnames()` to see all variable names