Challenge 8

challenge_8

Adithya Parupudi

snl data

Joining Data

Author

Adithya Parupudi

Published

August 25, 2022

library(tidyverse)
library(ggplot2)

knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)

Read in data

snl_actors <- read_csv("_data/snl_actors.csv",show_col_types = FALSE)
snl_casts <- read_csv("_data/snl_casts.csv",show_col_types = FALSE)
snl_seasons <- read_csv("_data/snl_seasons.csv",show_col_types = FALSE)

Briefly describe the data

Tidy Data (as needed)

Is your data already tidy, or is there work to be done? Be sure to anticipate your end result to provide a sanity check, and document your work here.

print(summarytools::dfSummary(snl_actors, 
                              varnumbers = FALSE, 
                              plain.ascii = FALSE, 
                              style = "grid", 
                              graph.magnif = 0.70, 
                              valid.col = FALSE), 
      method = 'render', 
      table.classes = 'table-condensed')

Data Frame Summary

snl_actors

Dimensions: 2306 x 4
Duplicates: 0

Variable

Stats / Values

Freqs (% of Valid)

Graph

Missing

aid [character]

1. 'N Sync

2. 070 Shake

3. 10,000 Maniacs

4. 14 Karat Soul

5. 2 Chainz

6. 3-D

7. 3RDEYEGIRL

8. 50 Cent

9. A Tribe Called Quest

10. A$AP Rocky

[ 2296 others ]

1	(	0.0%	)
1	(	0.0%	)
1	(	0.0%	)
1	(	0.0%	)
1	(	0.0%	)
1	(	0.0%	)
1	(	0.0%	)
1	(	0.0%	)
1	(	0.0%	)
1	(	0.0%	)
2296	(	99.6%	)

0 (0.0%)

url [character]

1. /Cast/?AbEl

2. /Cast/?AdSa

3. /Cast/?AiBr

4. /Cast/?AlFr

5. /Cast/?AlMo

6. /Cast/?AlZw

7. /Cast/?AmPo

8. /Cast/?AnDi

9. /Cast/?AnGa

10. /Cast/?AnMH

[ 2239 others ]

1	(	0.0%	)
1	(	0.0%	)
1	(	0.0%	)
1	(	0.0%	)
1	(	0.0%	)
1	(	0.0%	)
1	(	0.0%	)
1	(	0.0%	)
1	(	0.0%	)
1	(	0.0%	)
2239	(	99.6%	)

57 (2.5%)

type [character]

1. cast

2. crew

3. guest

4. unknown

154	(	6.7%	)
170	(	7.4%	)
1926	(	83.5%	)
56	(	2.4%	)

0 (0.0%)

gender [character]

1. andy

2. female

3. male

4. unknown

21	(	0.9%	)
671	(	29.1%	)
1226	(	53.2%	)
388	(	16.8%	)

0 (0.0%)

Generated by summarytools 1.0.1 (R version 4.2.1)
2022-09-04

print(summarytools::dfSummary(snl_casts, 
                              varnumbers = FALSE, 
                              plain.ascii = FALSE, 
                              style = "grid", 
                              graph.magnif = 0.70, 
                              valid.col = FALSE), 
      method = 'render', 
      table.classes = 'table-condensed')

Data Frame Summary

snl_casts

Dimensions: 614 x 8
Duplicates: 0

Variable

Stats / Values

Freqs (% of Valid)

Graph

Missing

aid [character]

1. Kenan Thompson

2. Darrell Hammond

3. Seth Meyers

4. Al Franken

5. Fred Armisen

6. Kate McKinnon

7. Tim Meadows

8. Aidy Bryant

9. Bobby Moynihan

10. Cecily Strong

[ 146 others ]

18	(	2.9%	)
14	(	2.3%	)
13	(	2.1%	)
11	(	1.8%	)
11	(	1.8%	)
10	(	1.6%	)
10	(	1.6%	)
9	(	1.5%	)
9	(	1.5%	)
9	(	1.5%	)
500	(	81.4%	)

0 (0.0%)

sid [numeric]

Mean (sd) : 25.5 (13.1)

min ≤ med ≤ max:

1 ≤ 26 ≤ 46

IQR (CV) : 22 (0.5)

46 distinct values

0 (0.0%)

featured [logical]

1. FALSE

2. TRUE

451	(	73.5%	)
163	(	26.5%	)

0 (0.0%)

first_epid [numeric]

Mean (sd) : 19909635 (111264.8)

min ≤ med ≤ max:

19770115 ≤ 19901110 ≤ 20141025

IQR (CV) : 156624.2 (0)

35 distinct values

564 (91.9%)

last_epid [numeric]

Mean (sd) : 19944038 (126122.2)

min ≤ med ≤ max:

19751011 ≤ 19950225 ≤ 20140201

IQR (CV) : 190005 (0)

17 distinct values

597 (97.2%)

update_anchor [logical]

1. FALSE

2. TRUE

541	(	88.1%	)
73	(	11.9%	)

0 (0.0%)

n_episodes [numeric]

Mean (sd) : 18.7 (4)

min ≤ med ≤ max:

1 ≤ 20 ≤ 24

IQR (CV) : 2 (0.2)

22 distinct values

0 (0.0%)

season_fraction [numeric]

Mean (sd) : 0.9 (0.2)

min ≤ med ≤ max:

0 ≤ 1 ≤ 1

IQR (CV) : 0 (0.2)

36 distinct values

0 (0.0%)

Generated by summarytools 1.0.1 (R version 4.2.1)
2022-09-04

print(summarytools::dfSummary(snl_seasons, 
                              varnumbers = FALSE, 
                              plain.ascii = FALSE, 
                              style = "grid", 
                              graph.magnif = 0.70, 
                              valid.col = FALSE), 
      method = 'render', 
      table.classes = 'table-condensed')

Data Frame Summary

snl_seasons

Dimensions: 46 x 5
Duplicates: 0

Variable

Stats / Values

Freqs (% of Valid)

Graph

Missing

sid [numeric]

Mean (sd) : 23.5 (13.4)

min ≤ med ≤ max:

1 ≤ 23.5 ≤ 46

IQR (CV) : 22.5 (0.6)

46 distinct values

0 (0.0%)

year [numeric]

Mean (sd) : 1997.5 (13.4)

min ≤ med ≤ max:

1975 ≤ 1997.5 ≤ 2020

IQR (CV) : 22.5 (0)

46 distinct values

0 (0.0%)

first_epid [numeric]

Mean (sd) : 19975965 (134209.3)

min ≤ med ≤ max:

19751011 ≤ 19975927 ≤ 20201003

IQR (CV) : 224910.2 (0)

46 distinct values

0 (0.0%)

last_epid [numeric]

Mean (sd) : 19985509 (134223.9)

min ≤ med ≤ max:

19760731 ≤ 19985512 ≤ 20210410

IQR (CV) : 225066.2 (0)

46 distinct values

0 (0.0%)

n_episodes [numeric]

Mean (sd) : 19.7 (2.3)

min ≤ med ≤ max:

12 ≤ 20 ≤ 24

IQR (CV) : 1 (0.1)

12	:	1	(	2.2%	)
13	:	2	(	4.3%	)
17	:	2	(	4.3%	)
18	:	2	(	4.3%	)
19	:	3	(	6.5%	)
20	:	23	(	50.0%	)
21	:	7	(	15.2%	)
22	:	5	(	10.9%	)
24	:	1	(	2.2%	)

0 (0.0%)

Generated by summarytools 1.0.1 (R version 4.2.1)
2022-09-04

colnames(snl_actors)

[1] "aid"    "url"    "type"   "gender"

colnames(snl_casts)

[1] "aid"             "sid"             "featured"        "first_epid"     
[5] "last_epid"       "update_anchor"   "n_episodes"      "season_fraction"

colnames(snl_seasons)

[1] "sid"        "year"       "first_epid" "last_epid"  "n_episodes"

Replacing NA in snl_casts with 0

snl_casts<- snl_casts %>% 
  replace_na(list(`first_epid`=0,
                  `last_epid`=0))

Removing url column from snl_actors as its not useful it is getting difficult to clean it.

snl_actors<- snl_actors %>% 
  select(-url)

Join Data

actors_casts <- snl_actors %>%
  inner_join(snl_casts, by = "aid")

fully_joined_dataset <- actors_casts %>%
  inner_join(snl_seasons, by = "sid")

fully_joined_dataset

# A tibble: 614 × 14
   aid        type  gender   sid featu…¹ first…² last_…³ updat…⁴ n_epi…⁵ seaso…⁶
   <chr>      <chr> <chr>  <dbl> <lgl>     <dbl>   <dbl> <lgl>     <dbl>   <dbl>
 1 Kate McKi… cast  female    37 TRUE     2.01e7       0 FALSE         5   0.227
 2 Kate McKi… cast  female    38 TRUE     0            0 FALSE        21   1    
 3 Kate McKi… cast  female    39 FALSE    0            0 FALSE        21   1    
 4 Kate McKi… cast  female    40 FALSE    0            0 FALSE        21   1    
 5 Kate McKi… cast  female    41 FALSE    0            0 FALSE        21   1    
 6 Kate McKi… cast  female    42 FALSE    0            0 FALSE        21   1    
 7 Kate McKi… cast  female    43 FALSE    0            0 FALSE        21   1    
 8 Kate McKi… cast  female    44 FALSE    0            0 FALSE        21   1    
 9 Kate McKi… cast  female    45 FALSE    0            0 FALSE        18   1    
10 Kate McKi… cast  female    46 FALSE    0            0 FALSE        17   1    
# … with 604 more rows, 4 more variables: year <dbl>, first_epid.y <dbl>,
#   last_epid.y <dbl>, n_episodes.y <dbl>, and abbreviated variable names
#   ¹featured, ²first_epid.x, ³last_epid.x, ⁴update_anchor, ⁵n_episodes.x,
#   ⁶season_fraction
# ℹ Use `print(n = ...)` to see more rows, and `colnames()` to see all variable names