Challenge 5: Public Schools Visualization

laurenzichittella

challenge_5

public_schools

Introduction to Visualization

Author

Lauren Zichittella

Published

March 25, 2023

library(tidyverse)
library(ggplot2)

knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)

Description Of Public Schools Characteristics, 2017-2017

This dataset provides characteristics on public schools, their students and teachers, in the US as of the 2017-2018 school year.

Please see steps below for steps to preparing this data for unvariate and by variate visualizations

Code Section 1 - Read in data Public School Characteristic

Read in a data set Public_School_Characteristics_2017-18.csv

After Review this data outside of R prior to import, it’s clear this is a wide dataset with plenty of variables.For the sake of efficiency in this exercise, this dataset will be limited to variables and observations that can be used to study the association between student in primary and secondary schools by state, charter status, and magnet status.

Steps

Read in dataset, limit variables to those that will be utilized to filter observations and create visualizations
Describe data to inform additional filtering or selection of variables that remains prior to tidying
Create new variable representing ratio of students to teachers

# Read in Public Schools Characteristics Dataset 
   
   # import data 
   publicschool_char_raw <-read_csv("_data/Public_School_Characteristics_2017-18.csv")

   # get full list of column names
   colnames(publicschool_char_raw)

 [1] "X"                "Y"                "OBJECTID"         "NCESSCH"         
 [5] "NMCNTY"           "SURVYEAR"         "STABR"            "LEAID"           
 [9] "ST_LEAID"         "LEA_NAME"         "SCH_NAME"         "LSTREET1"        
[13] "LSTREET2"         "LSTREET3"         "LCITY"            "LSTATE"          
[17] "LZIP"             "LZIP4"            "PHONE"            "GSLO"            
[21] "GSHI"             "VIRTUAL"          "TOTFRL"           "FRELCH"          
[25] "REDLCH"           "PK"               "KG"               "G01"             
[29] "G02"              "G03"              "G04"              "G05"             
[33] "G06"              "G07"              "G08"              "G09"             
[37] "G10"              "G11"              "G12"              "G13"             
[41] "TOTAL"            "MEMBER"           "AM"               "HI"              
[45] "BL"               "WH"               "HP"               "TR"              
[49] "FTE"              "LATCOD"           "LONCOD"           "ULOCALE"         
[53] "STUTERATIO"       "STITLEI"          "AMALM"            "AMALF"           
[57] "ASALM"            "ASALF"            "HIALM"            "HIALF"           
[61] "BLALM"            "BLALF"            "WHALM"            "WHALF"           
[65] "HPALM"            "HPALF"            "TRALM"            "TRALF"           
[69] "TOTMENROL"        "TOTFENROL"        "STATUS"           "UG"              
[73] "AE"               "SCHOOL_TYPE_TEXT" "SY_STATUS_TEXT"   "SCHOOL_LEVEL"    
[77] "AS"               "CHARTER_TEXT"     "MAGNET_TEXT"

  # select columns to id school, state, enrollment, staff, and other types of status 
  publicschool_nvars <-
     publicschool_char_raw %>% 
        select (NCESSCH, LSTATE, PK, KG, G01, G02, G03, G04, G05, G06, G07, G08, G09, G10, G11, G12, TOTAL, FTE, SCHOOL_TYPE_TEXT, SY_STATUS_TEXT, SCHOOL_LEVEL, CHARTER_TEXT,  MAGNET_TEXT)
  
     # confirm successful selection of vars
     colnames(publicschool_nvars)

 [1] "NCESSCH"          "LSTATE"           "PK"               "KG"              
 [5] "G01"              "G02"              "G03"              "G04"             
 [9] "G05"              "G06"              "G07"              "G08"             
[13] "G09"              "G10"              "G11"              "G12"             
[17] "TOTAL"            "FTE"              "SCHOOL_TYPE_TEXT" "SY_STATUS_TEXT"  
[21] "SCHOOL_LEVEL"     "CHARTER_TEXT"     "MAGNET_TEXT"

     head(publicschool_nvars)

# A tibble: 6 × 23
  NCESSCH     LSTATE    PK    KG   G01   G02   G03   G04   G05   G06   G07   G08
  <chr>       <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0200510004… AK        30    81    63    80    62    58    73    NA    NA    NA
2 0200610004… AK        NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
3 0200390004… AK        NA    23    23    27    22    25    28    19    NA    NA
4 0200390004… AK        NA    40    43    42    46    46    43    NA    NA    NA
5 0200390005… AK        NA    NA    NA    NA    NA    NA    NA    NA     0     1
6 0200700005… AK         0     0     3     1     2     2     2     1     5     1
# … with 11 more variables: G09 <dbl>, G10 <dbl>, G11 <dbl>, G12 <dbl>,
#   TOTAL <dbl>, FTE <dbl>, SCHOOL_TYPE_TEXT <chr>, SY_STATUS_TEXT <chr>,
#   SCHOOL_LEVEL <chr>, CHARTER_TEXT <chr>, MAGNET_TEXT <chr>

     # quick summary to inform whether to filter observations 
     print(summarytools::dfSummary(publicschool_nvars, 
                        plain.ascii  = FALSE, 
                        style        = "grid", 
                        graph.magnif = 0.70, ),
      method = 'render',
      table.classes = 'table-condensed')

Data Frame Summary

publicschool_nvars

Dimensions: 100729 x 23
Duplicates: 0

NCESSCH [character]

1. 010000500870
2. 010000500871
3. 010000500879
4. 010000500889
5. 010000501616
6. 010000502150
7. 010000600193
8. 010000600872
9. 010000600876
10. 010000600877
[ 100719 others ]

1	(	0.0%	)
1	(	0.0%	)
1	(	0.0%	)
1	(	0.0%	)
1	(	0.0%	)
1	(	0.0%	)
1	(	0.0%	)
1	(	0.0%	)
1	(	0.0%	)
1	(	0.0%	)
100719	(	100.0%	)

100729 (100.0%)

0 (0.0%)

LSTATE [character]

1. CA
2. TX
3. NY
4. FL
5. IL
6. MI
7. OH
8. PA
9. NC
10. NJ
[ 45 others ]

10325	(	10.3%	)
9320	(	9.3%	)
4808	(	4.8%	)
4377	(	4.3%	)
4245	(	4.2%	)
3736	(	3.7%	)
3610	(	3.6%	)
2990	(	3.0%	)
2693	(	2.7%	)
2595	(	2.6%	)
52030	(	51.7%	)

100729 (100.0%)

0 (0.0%)

PK [numeric]

Mean (sd) : 34.8 (53.5)
min ≤ med ≤ max:
0 ≤ 22 ≤ 1912
IQR (CV) : 43 (1.5)

468 distinct values

36108 (35.8%)

64621 (64.2%)

KG [numeric]

Mean (sd) : 65 (46.9)
min ≤ med ≤ max:
0 ≤ 62 ≤ 948
IQR (CV) : 57 (0.7)

393 distinct values

57045 (56.6%)

43684 (43.4%)

G01 [numeric]

Mean (sd) : 64.4 (44.8)
min ≤ med ≤ max:
0 ≤ 62 ≤ 1408
IQR (CV) : 56 (0.7)

353 distinct values

57396 (57.0%)

43333 (43.0%)

G02 [numeric]

Mean (sd) : 64.6 (44.4)
min ≤ med ≤ max:
0 ≤ 63 ≤ 688
IQR (CV) : 56 (0.7)

345 distinct values

57461 (57.0%)

43268 (43.0%)

G03 [numeric]

Mean (sd) : 66.4 (46.3)
min ≤ med ≤ max:
0 ≤ 64 ≤ 783
IQR (CV) : 59 (0.7)

358 distinct values

57476 (57.1%)

43253 (42.9%)

G04 [numeric]

Mean (sd) : 67.9 (48.7)
min ≤ med ≤ max:
0 ≤ 65 ≤ 877
IQR (CV) : 61 (0.7)

382 distinct values

57259 (56.8%)

43470 (43.2%)

G05 [numeric]

Mean (sd) : 69.7 (56.7)
min ≤ med ≤ max:
0 ≤ 64 ≤ 985
IQR (CV) : 65 (0.8)

494 distinct values

56056 (55.7%)

44673 (44.3%)

G06 [numeric]

Mean (sd) : 91.5 (108.4)
min ≤ med ≤ max:
0 ≤ 56 ≤ 1155
IQR (CV) : 111 (1.2)

641 distinct values

42144 (41.8%)

58585 (58.2%)

G07 [numeric]

Mean (sd) : 102.7 (126.2)
min ≤ med ≤ max:
0 ≤ 52 ≤ 1439
IQR (CV) : 153 (1.2)

687 distinct values

37047 (36.8%)

63682 (63.2%)

G08 [numeric]

Mean (sd) : 101.9 (127.1)
min ≤ med ≤ max:
0 ≤ 50 ≤ 1608
IQR (CV) : 152 (1.2)

700 distinct values

37280 (37.0%)

63449 (63.0%)

G09 [numeric]

Mean (sd) : 124.7 (185.8)
min ≤ med ≤ max:
0 ≤ 40 ≤ 2799
IQR (CV) : 166 (1.5)

987 distinct values

32230 (32.0%)

68499 (68.0%)

G10 [numeric]

Mean (sd) : 120.4 (178.1)
min ≤ med ≤ max:
0 ≤ 39 ≤ 1837
IQR (CV) : 157 (1.5)

945 distinct values

32023 (31.8%)

68706 (68.2%)

G11 [numeric]

Mean (sd) : 115.4 (170.1)
min ≤ med ≤ max:
0 ≤ 40 ≤ 1719
IQR (CV) : 149 (1.5)

914 distinct values

32009 (31.8%)

68720 (68.2%)

G12 [numeric]

Mean (sd) : 114.1 (165.5)
min ≤ med ≤ max:
0 ≤ 43 ≤ 2580
IQR (CV) : 150 (1.5)

891 distinct values

31915 (31.7%)

68814 (68.3%)

TOTAL [numeric]

Mean (sd) : 515.7 (450.2)
min ≤ med ≤ max:
0 ≤ 434 ≤ 14286
IQR (CV) : 408 (0.9)

2945 distinct values

98500 (97.8%)

2229 (2.2%)

FTE [numeric]

Mean (sd) : 32.6 (25.6)
min ≤ med ≤ max:
0 ≤ 27.6 ≤ 1419
IQR (CV) : 24 (0.8)

10066 distinct values

95496 (94.8%)

5233 (5.2%)

SCHOOL_TYPE_TEXT [character]

1. Alternative/other school
2. Regular school
3. Special education school
4. Vocational school

5531	(	5.5%	)
91737	(	91.1%	)
1948	(	1.9%	)
1513	(	1.5%	)

100729 (100.0%)

0 (0.0%)

SY_STATUS_TEXT [character]

1. Currently operational
2. New school
3. School has changed agency
4. School has reopened
5. School temporarily closed
6. School to be operational
7. School was operational bu

98557	(	97.8%	)
1103	(	1.1%	)
110	(	0.1%	)
41	(	0.0%	)
500	(	0.5%	)
341	(	0.3%	)
77	(	0.1%	)

100729 (100.0%)

0 (0.0%)

SCHOOL_LEVEL [character]

1. Adult Education
2. Elementary
3. High
4. Middle
5. Not Applicable
6. Not Reported
7. Other
8. Prekindergarten
9. Secondary
10. Ungraded

28	(	0.0%	)
53287	(	52.9%	)
22977	(	22.8%	)
16506	(	16.4%	)
796	(	0.8%	)
1113	(	1.1%	)
3824	(	3.8%	)
1430	(	1.4%	)
602	(	0.6%	)
166	(	0.2%	)

100729 (100.0%)

0 (0.0%)

CHARTER_TEXT [character]

1. No
2. Not Applicable
3. Yes

87007	(	86.4%	)
6387	(	6.3%	)
7335	(	7.3%	)

100729 (100.0%)

0 (0.0%)

MAGNET_TEXT [character]

1. Missing
2. No
3. Not Applicable
4. Yes

6256	(	6.2%	)
77531	(	77.0%	)
13520	(	13.4%	)
3422	(	3.4%	)

100729 (100.0%)

0 (0.0%)

Generated by summarytools 1.0.1 (R version 4.2.2)
2023-03-28

     #remove observations to limit to schools that are currently operational  
     publicschool_nvars_oper <-
         publicschool_nvars %>%
            filter(SY_STATUS_TEXT == "Currently operational")
       
     
        #sanity checks
        table(publicschool_nvars_oper$SY_STATUS_TEXT)


Currently operational 
                98557

     # investigate observations where missing student or faculty counts    
     summary(publicschool_nvars_oper$TOTAL)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
    0.0   249.0   436.0   518.5   655.0 14286.0    1287

     summary(publicschool_nvars_oper$FTE)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00   17.26   27.79   32.74   41.00 1419.00    4189

       missing_students <-
          publicschool_nvars_oper %>%
             filter(is.na(TOTAL))
        
         head(missing_students)

# A tibble: 6 × 23
  NCESSCH     LSTATE    PK    KG   G01   G02   G03   G04   G05   G06   G07   G08
  <chr>       <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0101410018… AL        NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
2 0100480006… AL        NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
3 0100510006… AL        NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
4 0100630007… AL        NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
5 0100780007… AL        NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
6 0100900007… AL        NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
# … with 11 more variables: G09 <dbl>, G10 <dbl>, G11 <dbl>, G12 <dbl>,
#   TOTAL <dbl>, FTE <dbl>, SCHOOL_TYPE_TEXT <chr>, SY_STATUS_TEXT <chr>,
#   SCHOOL_LEVEL <chr>, CHARTER_TEXT <chr>, MAGNET_TEXT <chr>

         summary(missing_students$FTE)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
 0.0000  0.0925  4.0150  7.3210 11.0000 60.4000     623

       missing_teachers <-
          publicschool_nvars_oper %>%
             filter(is.na(FTE))
        
         head(missing_teachers)

# A tibble: 6 × 23
  NCESSCH     LSTATE    PK    KG   G01   G02   G03   G04   G05   G06   G07   G08
  <chr>       <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0101410018… AL        NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
2 0100510006… AL        NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
3 0100960008… AL        NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
4 0101140008… AL        NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
5 0101140008… AL        NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
6 0101350008… AL        NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
# … with 11 more variables: G09 <dbl>, G10 <dbl>, G11 <dbl>, G12 <dbl>,
#   TOTAL <dbl>, FTE <dbl>, SCHOOL_TYPE_TEXT <chr>, SY_STATUS_TEXT <chr>,
#   SCHOOL_LEVEL <chr>, CHARTER_TEXT <chr>, MAGNET_TEXT <chr>

         summary(missing_teachers$TOTAL)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
    0.0     5.0   132.0   307.0   497.8  5683.0     623

# Since it doesn't make much sense for a school to not have students or teachers observations where either faculty or student counts are missing, filter these observations 
         
# Also it will make the most sense to evaluate the relationship of student to staff in primary and secondary schools only 
         
   publicschool_pretidy <-
      publicschool_nvars_oper %>%
          filter(!is.na(FTE)|!is.na(TOTAL))
   
     table(publicschool_nvars_oper$SCHOOL_LEVEL)


Adult Education      Elementary            High          Middle  Not Applicable 
             26           52752           22685           16291             744 
   Not Reported           Other Prekindergarten       Secondary        Ungraded 
            300            3727            1338             537             157

   publicschool_pretidy <-
      publicschool_pretidy %>%
         filter(SCHOOL_LEVEL %in% c("Elementary", "High", "Middle", "Secondary"))
   
     table(publicschool_pretidy$SCHOOL_LEVEL)


Elementary       High     Middle  Secondary 
     52745      22606      16289        537

     #sanity check
     print(summarytools::dfSummary(publicschool_pretidy, 
                        plain.ascii  = FALSE, 
                        style        = "grid", 
                        graph.magnif = 0.70, ),
      method = 'render',
      table.classes = 'table-condensed')

Data Frame Summary

publicschool_pretidy

Dimensions: 92177 x 23
Duplicates: 0

NCESSCH [character]

1. 010000500870
2. 010000500871
3. 010000500879
4. 010000500889
5. 010000501616
6. 010000502150
7. 010000600193
8. 010000600872
9. 010000600876
10. 010000600877
[ 92167 others ]

1	(	0.0%	)
1	(	0.0%	)
1	(	0.0%	)
1	(	0.0%	)
1	(	0.0%	)
1	(	0.0%	)
1	(	0.0%	)
1	(	0.0%	)
1	(	0.0%	)
1	(	0.0%	)
92167	(	100.0%	)

92177 (100.0%)

0 (0.0%)

LSTATE [character]

1. CA
2. TX
3. NY
4. IL
5. FL
6. OH
7. MI
8. PA
9. NC
10. NJ
[ 45 others ]

9574	(	10.4%	)
8298	(	9.0%	)
4541	(	4.9%	)
3996	(	4.3%	)
3714	(	4.0%	)
3393	(	3.7%	)
3231	(	3.5%	)
2891	(	3.1%	)
2518	(	2.7%	)
2447	(	2.7%	)
47574	(	51.6%	)

92177 (100.0%)

0 (0.0%)

PK [numeric]

Mean (sd) : 31.1 (36.8)
min ≤ med ≤ max:
0 ≤ 22 ≤ 650
IQR (CV) : 42 (1.2)

315 distinct values

32980 (35.8%)

59197 (64.2%)

KG [numeric]

Mean (sd) : 67.7 (45.6)
min ≤ med ≤ max:
0 ≤ 65 ≤ 948
IQR (CV) : 54 (0.7)

382 distinct values

53020 (57.5%)

39157 (42.5%)

G01 [numeric]

Mean (sd) : 67.2 (43.1)
min ≤ med ≤ max:
0 ≤ 65 ≤ 678
IQR (CV) : 52 (0.6)

339 distinct values

53274 (57.8%)

38903 (42.2%)

G02 [numeric]

Mean (sd) : 67.5 (43)
min ≤ med ≤ max:
0 ≤ 66 ≤ 548
IQR (CV) : 53 (0.6)

333 distinct values

53313 (57.8%)

38864 (42.2%)

G03 [numeric]

Mean (sd) : 69.5 (45)
min ≤ med ≤ max:
0 ≤ 67 ≤ 757
IQR (CV) : 55 (0.6)

349 distinct values

53271 (57.8%)

38906 (42.2%)

G04 [numeric]

Mean (sd) : 71.2 (47.4)
min ≤ med ≤ max:
0 ≤ 68 ≤ 763
IQR (CV) : 57 (0.7)

375 distinct values

52936 (57.4%)

39241 (42.6%)

G05 [numeric]

Mean (sd) : 73.1 (55.9)
min ≤ med ≤ max:
0 ≤ 68 ≤ 808
IQR (CV) : 60 (0.8)

487 distinct values

51724 (56.1%)

40453 (43.9%)

G06 [numeric]

Mean (sd) : 98.3 (110.6)
min ≤ med ≤ max:
0 ≤ 63 ≤ 1155
IQR (CV) : 114 (1.1)

637 distinct values

37794 (41.0%)

54383 (59.0%)

G07 [numeric]

Mean (sd) : 112.1 (129.5)
min ≤ med ≤ max:
0 ≤ 62 ≤ 1281
IQR (CV) : 170 (1.2)

684 distinct values

32730 (35.5%)

59447 (64.5%)

G08 [numeric]

Mean (sd) : 111.1 (130.3)
min ≤ med ≤ max:
0 ≤ 60 ≤ 1337
IQR (CV) : 170 (1.2)

693 distinct values

32964 (35.8%)

59213 (64.2%)

G09 [numeric]

Mean (sd) : 138.5 (193)
min ≤ med ≤ max:
0 ≤ 54 ≤ 2799
IQR (CV) : 199 (1.4)

983 distinct values

27976 (30.4%)

64201 (69.6%)

G10 [numeric]

Mean (sd) : 133.6 (184.8)
min ≤ med ≤ max:
0 ≤ 53 ≤ 1649
IQR (CV) : 189 (1.4)

938 distinct values

27874 (30.2%)

64303 (69.8%)

G11 [numeric]

Mean (sd) : 128 (176.4)
min ≤ med ≤ max:
0 ≤ 52 ≤ 1650
IQR (CV) : 179 (1.4)

908 distinct values

27919 (30.3%)

64258 (69.7%)

G12 [numeric]

Mean (sd) : 126.1 (170.7)
min ≤ med ≤ max:
0 ≤ 55 ≤ 2125
IQR (CV) : 178 (1.4)

885 distinct values

27877 (30.2%)

64300 (69.8%)

TOTAL [numeric]

Mean (sd) : 531 (439.7)
min ≤ med ≤ max:
0 ≤ 448 ≤ 5839
IQR (CV) : 394 (0.8)

2906 distinct values

92077 (99.9%)

100 (0.1%)

FTE [numeric]

Mean (sd) : 33.5 (25.4)
min ≤ med ≤ max:
0 ≤ 28.3 ≤ 1419
IQR (CV) : 23.4 (0.8)

9938 distinct values

89244 (96.8%)

2933 (3.2%)

SCHOOL_TYPE_TEXT [character]

1. Alternative/other school
2. Regular school
3. Special education school
4. Vocational school

3975	(	4.3%	)
86522	(	93.9%	)
627	(	0.7%	)
1053	(	1.1%	)

92177 (100.0%)

0 (0.0%)

SY_STATUS_TEXT [character]

1. Currently operational

92177	(	100.0%	)

92177 (100.0%)

0 (0.0%)

SCHOOL_LEVEL [character]

1. Elementary
2. High
3. Middle
4. Secondary

52745	(	57.2%	)
22606	(	24.5%	)
16289	(	17.7%	)
537	(	0.6%	)

92177 (100.0%)

0 (0.0%)

CHARTER_TEXT [character]

1. No
2. Not Applicable
3. Yes

80634	(	87.5%	)
5738	(	6.2%	)
5805	(	6.3%	)

92177 (100.0%)

0 (0.0%)

MAGNET_TEXT [character]

1. Missing
2. No
3. Not Applicable
4. Yes

5671	(	6.2%	)
71179	(	77.2%	)
12022	(	13.0%	)
3305	(	3.6%	)

92177 (100.0%)

0 (0.0%)

Generated by summarytools 1.0.1 (R version 4.2.2)
2023-03-28

Code Section 2 - Tidy dataset

Steps

Mutate to create indicators for magnet and charter schools (.= unknown, 1= true, 0=false)
Mutate to create school level with two categories: primary (elementary) & secondary (middle, high school, secondar school)
Select only variables needed for analysis
Tidy dataset to create one observation for measure: student count, teacher count

# Mutate    CHARTER_TEXT to create indicator variable  

  publicschool_pretidy2<-
     publicschool_pretidy %>% 
        mutate(charter_ind =  if_else(CHARTER_TEXT== "Yes", 1,
                              if_else(CHARTER_TEXT== "No", 0, 9)))
               
               
     #sanity check
     table(publicschool_pretidy2$CHARTER_TEXT, publicschool_pretidy2$charter_ind, useNA="ifany")

                
                     0     1     9
  No             80634     0     0
  Not Applicable     0     0  5738
  Yes                0  5805     0

# Mutate MAGNET_TEXT to indicator variable
     
  publicschool_pretidy2<-
     publicschool_pretidy2 %>% 
        mutate(magnet_ind =  if_else(MAGNET_TEXT== "Yes", 1,
                             if_else(MAGNET_TEXT== "No", 0, 9)))
               
     #sanity check
     table(publicschool_pretidy2$MAGNET_TEXT, publicschool_pretidy2$magnet_ind, useNA="ifany")

                
                     0     1     9
  Missing            0     0  5671
  No             71179     0     0
  Not Applicable     0     0 12022
  Yes                0  3305     0

# Mutate SCHOOL_LEVEL to indicator variable     
     
  publicschool_pretidy2<-
     publicschool_pretidy2 %>% 
        mutate(level =  if_else(SCHOOL_LEVEL== "Elementary", 'primary', 'secondary'))
               
     # sanity check
     table(publicschool_pretidy2$level, useNA="ifany")


  primary secondary 
    52745     39432

# Remove school string from SCHOOL_TYPE_TEXT 
     
  publicschool_pretidy2<-
     publicschool_pretidy2 %>% 
        mutate(school_type = str_remove(SCHOOL_TYPE_TEXT, 'school'))
  
     # sanity check
     table(publicschool_pretidy2$school_type, publicschool_pretidy2$SCHOOL_TYPE_TEXT,useNA="ifany")

                    
                     Alternative/other school Regular school
  Alternative/other                      3975              0
  Regular                                   0          86522
  Special education                         0              0
  Vocational                                0              0
                    
                     Special education school Vocational school
  Alternative/other                         0                 0
  Regular                                   0                 0
  Special education                       627                 0
  Vocational                                0              1053

# Rename variables to get more descriptive name prior to pivot
     
  publicschool_pretidy2 <-
    publicschool_pretidy2 %>% 
       rename(  "total_students" = "TOTAL"
              , "total_faculty"  = "FTE"
              , "state"          = "LSTATE"
              , "school_id"      = "NCESSCH")
  
  colnames(publicschool_pretidy2)

 [1] "school_id"        "state"            "PK"               "KG"              
 [5] "G01"              "G02"              "G03"              "G04"             
 [9] "G05"              "G06"              "G07"              "G08"             
[13] "G09"              "G10"              "G11"              "G12"             
[17] "total_students"   "total_faculty"    "SCHOOL_TYPE_TEXT" "SY_STATUS_TEXT"  
[21] "SCHOOL_LEVEL"     "CHARTER_TEXT"     "MAGNET_TEXT"      "charter_ind"     
[25] "magnet_ind"       "level"            "school_type"

# Calculate ratio 
     
  publicschool_pretidy2 <-
    publicschool_pretidy2 %>% 
       mutate(ratio_student_faculty = total_students/total_faculty) 
  
  colnames(publicschool_pretidy2)

 [1] "school_id"             "state"                 "PK"                   
 [4] "KG"                    "G01"                   "G02"                  
 [7] "G03"                   "G04"                   "G05"                  
[10] "G06"                   "G07"                   "G08"                  
[13] "G09"                   "G10"                   "G11"                  
[16] "G12"                   "total_students"        "total_faculty"        
[19] "SCHOOL_TYPE_TEXT"      "SY_STATUS_TEXT"        "SCHOOL_LEVEL"         
[22] "CHARTER_TEXT"          "MAGNET_TEXT"           "charter_ind"          
[25] "magnet_ind"            "level"                 "school_type"          
[28] "ratio_student_faculty"

# Limit to analysis variables and values 
  

  publicschool_pretidy2<-
     publicschool_pretidy2 %>% 
         select("school_id", "state", "level", "school_type", "magnet_ind", "charter_ind", "total_students", "total_faculty", "ratio_student_faculty")
  
  head(publicschool_pretidy2)

# A tibble: 6 × 9
  school_id    state level     school_…¹ magne…² chart…³ total…⁴ total…⁵ ratio…⁶
  <chr>        <chr> <chr>     <chr>       <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
1 020051000480 AK    primary   "Regular…       0       0     447  24.9     18.0 
2 020061000470 AK    secondary "Alterna…       0       0      30   3       10   
3 020039000448 AK    primary   "Regular…       0       1     167  10.4     16.1 
4 020039000463 AK    primary   "Regular…       0       1     260  16.8     15.5 
5 020039000513 AK    secondary "Alterna…       0       0       5   0.670    7.46
6 020072000340 AK    primary   "Regular…       0       0     217  13.5     16.1 
# … with abbreviated variable names ¹school_type, ²magnet_ind, ³charter_ind,
#   ⁴total_students, ⁵total_faculty, ⁶ratio_student_faculty

# pivot data to tidy by creating variables type of total (student versus faculty) and actual total (n)  
  publicschool_tidy <- 
     publicschool_pretidy2 %>%
        pivot_longer( cols     = c("total_students", "total_faculty", "ratio_student_faculty"), 
                      names_to = "measure_type"  )
  
     #sanity checks
     colnames(publicschool_tidy)

[1] "school_id"    "state"        "level"        "school_type"  "magnet_ind"  
[6] "charter_ind"  "measure_type" "value"

     print(summarytools::dfSummary(publicschool_tidy, 
                        plain.ascii  = FALSE, 
                        style        = "grid", 
                        graph.magnif = 0.70, ),
      method = 'render',
      table.classes = 'table-condensed')

Data Frame Summary

publicschool_tidy

Dimensions: 276531 x 8
Duplicates: 0

school_id [character]

1. 010000500870
2. 010000500871
3. 010000500879
4. 010000500889
5. 010000501616
6. 010000502150
7. 010000600193
8. 010000600872
9. 010000600876
10. 010000600877
[ 92167 others ]

3	(	0.0%	)
3	(	0.0%	)
3	(	0.0%	)
3	(	0.0%	)
3	(	0.0%	)
3	(	0.0%	)
3	(	0.0%	)
3	(	0.0%	)
3	(	0.0%	)
3	(	0.0%	)
276501	(	100.0%	)

276531 (100.0%)

0 (0.0%)

state [character]

1. CA
2. TX
3. NY
4. IL
5. FL
6. OH
7. MI
8. PA
9. NC
10. NJ
[ 45 others ]

28722	(	10.4%	)
24894	(	9.0%	)
13623	(	4.9%	)
11988	(	4.3%	)
11142	(	4.0%	)
10179	(	3.7%	)
9693	(	3.5%	)
8673	(	3.1%	)
7554	(	2.7%	)
7341	(	2.7%	)
142722	(	51.6%	)

276531 (100.0%)

0 (0.0%)

level [character]

1. primary
2. secondary

158235	(	57.2%	)
118296	(	42.8%	)

276531 (100.0%)

0 (0.0%)

school_type [character]

1. Alternative/other ·
2. Regular ·
3. Special education ·
4. Vocational ·

11925	(	4.3%	)
259566	(	93.9%	)
1881	(	0.7%	)
3159	(	1.1%	)

276531 (100.0%)

0 (0.0%)

magnet_ind [numeric]

Mean (sd) : 1.8 (3.5)
min ≤ med ≤ max:
0 ≤ 0 ≤ 9
IQR (CV) : 0 (2)

0	:	213537	(	77.2%	)
1	:	9915	(	3.6%	)
9	:	53079	(	19.2%	)

276531 (100.0%)

0 (0.0%)

charter_ind [numeric]

Mean (sd) : 0.6 (2.2)
min ≤ med ≤ max:
0 ≤ 0 ≤ 9
IQR (CV) : 0 (3.5)

0	:	241902	(	87.5%	)
1	:	17415	(	6.3%	)
9	:	17214	(	6.2%	)

276531 (100.0%)

0 (0.0%)

measure_type [character]

1. ratio_student_faculty
2. total_faculty
3. total_students

92177	(	33.3%	)
92177	(	33.3%	)
92177	(	33.3%	)

276531 (100.0%)

0 (0.0%)

value [numeric]

Mean (sd) : Inf (NaN)
min ≤ med ≤ max:
0 ≤ 28 ≤ Inf
IQR (CV) : 270.3 (NaN)

79293 distinct values

270152 (97.7%)

6379 (2.3%)

Generated by summarytools 1.0.1 (R version 4.2.2)
2023-03-28

Univariate Visualizations

I’m using this exercise more to figure out ggplot than the dataset hence very simple choices in what is being displayed.

This plot shows the total count of schools per state stacked by level

library(ggplot2)

# isolate ratio 
publicschool_ratio <- 
     publicschool_tidy %>%
         filter(school_type == "Regular " & measure_type == "ratio_student_faculty" & !is.na(value))


   # group to get mean per categories 
   publicschools_ratio_grouped <-
      publicschool_ratio %>%
          group_by(state, level)%>%
              summarise(n_schools = n(),
                        mean_value   = mean(value),
                        med_value    = median(value))%>%
              arrange(state, level )

   # Stacked
   ggplot(publicschools_ratio_grouped, aes(fill=level, y=n_schools, x=state)) + 
          geom_bar(position="stack", stat="identity") + 
          ggtitle("Count of Schools by State and Education Level with Student and Faculty")+
          labs(y = "School Count", x = "State", colour = "Education Level")+
          theme(axis.text.x = element_text(angle = 90))

Bivariate Visualization(s)

For sake of simplicity, will evaluate the relationship of students to faculty via scatter plot, grouping data my school to get the mean of both to make the graph less busy

# isolate totals 
publicschool_totals <- 
     publicschool_tidy %>%
         filter(school_type == "Regular " & measure_type == "total_students"& !is.na(value)|measure_type == "total_faculty"&!is.na(value))   


   publicschools_totals_grouped <-
      publicschool_totals %>%
          group_by(state, measure_type)%>%
              summarise( mean_value   = mean(value) )%>%
              arrange(state, measure_type)


# pivot wide to plot student n by faculty b 
  publicschool_totals_wide <-
     publicschools_totals_grouped %>%
        pivot_wider(names_from = measure_type, values_from = mean_value)
  
  publicschool_plot_wide <-
     publicschool_totals_wide %>%
         filter(!is.na(total_students)&!is.na(total_faculty))   

  #scatter student n by faculty b, color by state  
  ggplot(publicschool_plot_wide, aes(x=total_students, y=total_faculty, color = state)) +         geom_point()+
  geom_smooth(method=lm , color="red", se=FALSE)+
  ggtitle("Average Count of Students and Faculty by State") +
  labs(x = "Student Count", y = "Faculty Count")