Challenge 5: Public Schools Visualization

laurenzichittella
challenge_5
public_schools
Introduction to Visualization
Author

Lauren Zichittella

Published

March 25, 2023

library(tidyverse)
library(ggplot2)

knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)

Description Of Public Schools Characteristics, 2017-2017

This dataset provides characteristics on public schools, their students and teachers, in the US as of the 2017-2018 school year.

Please see steps below for steps to preparing this data for unvariate and by variate visualizations

Code Section 1 - Read in data Public School Characteristic

Read in a data set Public_School_Characteristics_2017-18.csv

After Review this data outside of R prior to import, it’s clear this is a wide dataset with plenty of variables.For the sake of efficiency in this exercise, this dataset will be limited to variables and observations that can be used to study the association between student in primary and secondary schools by state, charter status, and magnet status.

Steps

  1. Read in dataset, limit variables to those that will be utilized to filter observations and create visualizations
  2. Describe data to inform additional filtering or selection of variables that remains prior to tidying
  3. Create new variable representing ratio of students to teachers
# Read in Public Schools Characteristics Dataset 
   
   # import data 
   publicschool_char_raw <-read_csv("_data/Public_School_Characteristics_2017-18.csv")

   # get full list of column names
   colnames(publicschool_char_raw)
 [1] "X"                "Y"                "OBJECTID"         "NCESSCH"         
 [5] "NMCNTY"           "SURVYEAR"         "STABR"            "LEAID"           
 [9] "ST_LEAID"         "LEA_NAME"         "SCH_NAME"         "LSTREET1"        
[13] "LSTREET2"         "LSTREET3"         "LCITY"            "LSTATE"          
[17] "LZIP"             "LZIP4"            "PHONE"            "GSLO"            
[21] "GSHI"             "VIRTUAL"          "TOTFRL"           "FRELCH"          
[25] "REDLCH"           "PK"               "KG"               "G01"             
[29] "G02"              "G03"              "G04"              "G05"             
[33] "G06"              "G07"              "G08"              "G09"             
[37] "G10"              "G11"              "G12"              "G13"             
[41] "TOTAL"            "MEMBER"           "AM"               "HI"              
[45] "BL"               "WH"               "HP"               "TR"              
[49] "FTE"              "LATCOD"           "LONCOD"           "ULOCALE"         
[53] "STUTERATIO"       "STITLEI"          "AMALM"            "AMALF"           
[57] "ASALM"            "ASALF"            "HIALM"            "HIALF"           
[61] "BLALM"            "BLALF"            "WHALM"            "WHALF"           
[65] "HPALM"            "HPALF"            "TRALM"            "TRALF"           
[69] "TOTMENROL"        "TOTFENROL"        "STATUS"           "UG"              
[73] "AE"               "SCHOOL_TYPE_TEXT" "SY_STATUS_TEXT"   "SCHOOL_LEVEL"    
[77] "AS"               "CHARTER_TEXT"     "MAGNET_TEXT"     
  # select columns to id school, state, enrollment, staff, and other types of status 
  publicschool_nvars <-
     publicschool_char_raw %>% 
        select (NCESSCH, LSTATE, PK, KG, G01, G02, G03, G04, G05, G06, G07, G08, G09, G10, G11, G12, TOTAL, FTE, SCHOOL_TYPE_TEXT, SY_STATUS_TEXT, SCHOOL_LEVEL, CHARTER_TEXT,  MAGNET_TEXT)
  
     # confirm successful selection of vars
     colnames(publicschool_nvars)
 [1] "NCESSCH"          "LSTATE"           "PK"               "KG"              
 [5] "G01"              "G02"              "G03"              "G04"             
 [9] "G05"              "G06"              "G07"              "G08"             
[13] "G09"              "G10"              "G11"              "G12"             
[17] "TOTAL"            "FTE"              "SCHOOL_TYPE_TEXT" "SY_STATUS_TEXT"  
[21] "SCHOOL_LEVEL"     "CHARTER_TEXT"     "MAGNET_TEXT"     
     head(publicschool_nvars)
# A tibble: 6 × 23
  NCESSCH     LSTATE    PK    KG   G01   G02   G03   G04   G05   G06   G07   G08
  <chr>       <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0200510004… AK        30    81    63    80    62    58    73    NA    NA    NA
2 0200610004… AK        NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
3 0200390004… AK        NA    23    23    27    22    25    28    19    NA    NA
4 0200390004… AK        NA    40    43    42    46    46    43    NA    NA    NA
5 0200390005… AK        NA    NA    NA    NA    NA    NA    NA    NA     0     1
6 0200700005… AK         0     0     3     1     2     2     2     1     5     1
# … with 11 more variables: G09 <dbl>, G10 <dbl>, G11 <dbl>, G12 <dbl>,
#   TOTAL <dbl>, FTE <dbl>, SCHOOL_TYPE_TEXT <chr>, SY_STATUS_TEXT <chr>,
#   SCHOOL_LEVEL <chr>, CHARTER_TEXT <chr>, MAGNET_TEXT <chr>
     # quick summary to inform whether to filter observations 
     print(summarytools::dfSummary(publicschool_nvars, 
                        plain.ascii  = FALSE, 
                        style        = "grid", 
                        graph.magnif = 0.70, ),
      method = 'render',
      table.classes = 'table-condensed')

Data Frame Summary

publicschool_nvars

Dimensions: 100729 x 23
Duplicates: 0
No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
1 NCESSCH [character]
1. 010000500870
2. 010000500871
3. 010000500879
4. 010000500889
5. 010000501616
6. 010000502150
7. 010000600193
8. 010000600872
9. 010000600876
10. 010000600877
[ 100719 others ]
1 ( 0.0% )
1 ( 0.0% )
1 ( 0.0% )
1 ( 0.0% )
1 ( 0.0% )
1 ( 0.0% )
1 ( 0.0% )
1 ( 0.0% )
1 ( 0.0% )
1 ( 0.0% )
100719 ( 100.0% )
100729 (100.0%) 0 (0.0%)
2 LSTATE [character]
1. CA
2. TX
3. NY
4. FL
5. IL
6. MI
7. OH
8. PA
9. NC
10. NJ
[ 45 others ]
10325 ( 10.3% )
9320 ( 9.3% )
4808 ( 4.8% )
4377 ( 4.3% )
4245 ( 4.2% )
3736 ( 3.7% )
3610 ( 3.6% )
2990 ( 3.0% )
2693 ( 2.7% )
2595 ( 2.6% )
52030 ( 51.7% )
100729 (100.0%) 0 (0.0%)
3 PK [numeric]
Mean (sd) : 34.8 (53.5)
min ≤ med ≤ max:
0 ≤ 22 ≤ 1912
IQR (CV) : 43 (1.5)
468 distinct values 36108 (35.8%) 64621 (64.2%)
4 KG [numeric]
Mean (sd) : 65 (46.9)
min ≤ med ≤ max:
0 ≤ 62 ≤ 948
IQR (CV) : 57 (0.7)
393 distinct values 57045 (56.6%) 43684 (43.4%)
5 G01 [numeric]
Mean (sd) : 64.4 (44.8)
min ≤ med ≤ max:
0 ≤ 62 ≤ 1408
IQR (CV) : 56 (0.7)
353 distinct values 57396 (57.0%) 43333 (43.0%)
6 G02 [numeric]
Mean (sd) : 64.6 (44.4)
min ≤ med ≤ max:
0 ≤ 63 ≤ 688
IQR (CV) : 56 (0.7)
345 distinct values 57461 (57.0%) 43268 (43.0%)
7 G03 [numeric]
Mean (sd) : 66.4 (46.3)
min ≤ med ≤ max:
0 ≤ 64 ≤ 783
IQR (CV) : 59 (0.7)
358 distinct values 57476 (57.1%) 43253 (42.9%)
8 G04 [numeric]
Mean (sd) : 67.9 (48.7)
min ≤ med ≤ max:
0 ≤ 65 ≤ 877
IQR (CV) : 61 (0.7)
382 distinct values 57259 (56.8%) 43470 (43.2%)
9 G05 [numeric]
Mean (sd) : 69.7 (56.7)
min ≤ med ≤ max:
0 ≤ 64 ≤ 985
IQR (CV) : 65 (0.8)
494 distinct values 56056 (55.7%) 44673 (44.3%)
10 G06 [numeric]
Mean (sd) : 91.5 (108.4)
min ≤ med ≤ max:
0 ≤ 56 ≤ 1155
IQR (CV) : 111 (1.2)
641 distinct values 42144 (41.8%) 58585 (58.2%)
11 G07 [numeric]
Mean (sd) : 102.7 (126.2)
min ≤ med ≤ max:
0 ≤ 52 ≤ 1439
IQR (CV) : 153 (1.2)
687 distinct values 37047 (36.8%) 63682 (63.2%)
12 G08 [numeric]
Mean (sd) : 101.9 (127.1)
min ≤ med ≤ max:
0 ≤ 50 ≤ 1608
IQR (CV) : 152 (1.2)
700 distinct values 37280 (37.0%) 63449 (63.0%)
13 G09 [numeric]
Mean (sd) : 124.7 (185.8)
min ≤ med ≤ max:
0 ≤ 40 ≤ 2799
IQR (CV) : 166 (1.5)
987 distinct values 32230 (32.0%) 68499 (68.0%)
14 G10 [numeric]
Mean (sd) : 120.4 (178.1)
min ≤ med ≤ max:
0 ≤ 39 ≤ 1837
IQR (CV) : 157 (1.5)
945 distinct values 32023 (31.8%) 68706 (68.2%)
15 G11 [numeric]
Mean (sd) : 115.4 (170.1)
min ≤ med ≤ max:
0 ≤ 40 ≤ 1719
IQR (CV) : 149 (1.5)
914 distinct values 32009 (31.8%) 68720 (68.2%)
16 G12 [numeric]
Mean (sd) : 114.1 (165.5)
min ≤ med ≤ max:
0 ≤ 43 ≤ 2580
IQR (CV) : 150 (1.5)
891 distinct values 31915 (31.7%) 68814 (68.3%)
17 TOTAL [numeric]
Mean (sd) : 515.7 (450.2)
min ≤ med ≤ max:
0 ≤ 434 ≤ 14286
IQR (CV) : 408 (0.9)
2945 distinct values 98500 (97.8%) 2229 (2.2%)
18 FTE [numeric]
Mean (sd) : 32.6 (25.6)
min ≤ med ≤ max:
0 ≤ 27.6 ≤ 1419
IQR (CV) : 24 (0.8)
10066 distinct values 95496 (94.8%) 5233 (5.2%)
19 SCHOOL_TYPE_TEXT [character]
1. Alternative/other school
2. Regular school
3. Special education school
4. Vocational school
5531 ( 5.5% )
91737 ( 91.1% )
1948 ( 1.9% )
1513 ( 1.5% )
100729 (100.0%) 0 (0.0%)
20 SY_STATUS_TEXT [character]
1. Currently operational
2. New school
3. School has changed agency
4. School has reopened
5. School temporarily closed
6. School to be operational
7. School was operational bu
98557 ( 97.8% )
1103 ( 1.1% )
110 ( 0.1% )
41 ( 0.0% )
500 ( 0.5% )
341 ( 0.3% )
77 ( 0.1% )
100729 (100.0%) 0 (0.0%)
21 SCHOOL_LEVEL [character]
1. Adult Education
2. Elementary
3. High
4. Middle
5. Not Applicable
6. Not Reported
7. Other
8. Prekindergarten
9. Secondary
10. Ungraded
28 ( 0.0% )
53287 ( 52.9% )
22977 ( 22.8% )
16506 ( 16.4% )
796 ( 0.8% )
1113 ( 1.1% )
3824 ( 3.8% )
1430 ( 1.4% )
602 ( 0.6% )
166 ( 0.2% )
100729 (100.0%) 0 (0.0%)
22 CHARTER_TEXT [character]
1. No
2. Not Applicable
3. Yes
87007 ( 86.4% )
6387 ( 6.3% )
7335 ( 7.3% )
100729 (100.0%) 0 (0.0%)
23 MAGNET_TEXT [character]
1. Missing
2. No
3. Not Applicable
4. Yes
6256 ( 6.2% )
77531 ( 77.0% )
13520 ( 13.4% )
3422 ( 3.4% )
100729 (100.0%) 0 (0.0%)

Generated by summarytools 1.0.1 (R version 4.2.2)
2023-03-28

     #remove observations to limit to schools that are currently operational  
     publicschool_nvars_oper <-
         publicschool_nvars %>%
            filter(SY_STATUS_TEXT == "Currently operational")
       
     
        #sanity checks
        table(publicschool_nvars_oper$SY_STATUS_TEXT)

Currently operational 
                98557 
     # investigate observations where missing student or faculty counts    
     summary(publicschool_nvars_oper$TOTAL)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
    0.0   249.0   436.0   518.5   655.0 14286.0    1287 
     summary(publicschool_nvars_oper$FTE)    
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00   17.26   27.79   32.74   41.00 1419.00    4189 
       missing_students <-
          publicschool_nvars_oper %>%
             filter(is.na(TOTAL))
        
         head(missing_students)
# A tibble: 6 × 23
  NCESSCH     LSTATE    PK    KG   G01   G02   G03   G04   G05   G06   G07   G08
  <chr>       <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0101410018… AL        NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
2 0100480006… AL        NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
3 0100510006… AL        NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
4 0100630007… AL        NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
5 0100780007… AL        NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
6 0100900007… AL        NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
# … with 11 more variables: G09 <dbl>, G10 <dbl>, G11 <dbl>, G12 <dbl>,
#   TOTAL <dbl>, FTE <dbl>, SCHOOL_TYPE_TEXT <chr>, SY_STATUS_TEXT <chr>,
#   SCHOOL_LEVEL <chr>, CHARTER_TEXT <chr>, MAGNET_TEXT <chr>
         summary(missing_students$FTE)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
 0.0000  0.0925  4.0150  7.3210 11.0000 60.4000     623 
       missing_teachers <-
          publicschool_nvars_oper %>%
             filter(is.na(FTE))
        
         head(missing_teachers)
# A tibble: 6 × 23
  NCESSCH     LSTATE    PK    KG   G01   G02   G03   G04   G05   G06   G07   G08
  <chr>       <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0101410018… AL        NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
2 0100510006… AL        NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
3 0100960008… AL        NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
4 0101140008… AL        NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
5 0101140008… AL        NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
6 0101350008… AL        NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
# … with 11 more variables: G09 <dbl>, G10 <dbl>, G11 <dbl>, G12 <dbl>,
#   TOTAL <dbl>, FTE <dbl>, SCHOOL_TYPE_TEXT <chr>, SY_STATUS_TEXT <chr>,
#   SCHOOL_LEVEL <chr>, CHARTER_TEXT <chr>, MAGNET_TEXT <chr>
         summary(missing_teachers$TOTAL)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
    0.0     5.0   132.0   307.0   497.8  5683.0     623 
# Since it doesn't make much sense for a school to not have students or teachers observations where either faculty or student counts are missing, filter these observations 
         
# Also it will make the most sense to evaluate the relationship of student to staff in primary and secondary schools only 
         
   publicschool_pretidy <-
      publicschool_nvars_oper %>%
          filter(!is.na(FTE)|!is.na(TOTAL))
   
     table(publicschool_nvars_oper$SCHOOL_LEVEL)

Adult Education      Elementary            High          Middle  Not Applicable 
             26           52752           22685           16291             744 
   Not Reported           Other Prekindergarten       Secondary        Ungraded 
            300            3727            1338             537             157 
   publicschool_pretidy <-
      publicschool_pretidy %>%
         filter(SCHOOL_LEVEL %in% c("Elementary", "High", "Middle", "Secondary"))
   
     table(publicschool_pretidy$SCHOOL_LEVEL)   

Elementary       High     Middle  Secondary 
     52745      22606      16289        537 
     #sanity check
     print(summarytools::dfSummary(publicschool_pretidy, 
                        plain.ascii  = FALSE, 
                        style        = "grid", 
                        graph.magnif = 0.70, ),
      method = 'render',
      table.classes = 'table-condensed')    

Data Frame Summary

publicschool_pretidy

Dimensions: 92177 x 23
Duplicates: 0
No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
1 NCESSCH [character]
1. 010000500870
2. 010000500871
3. 010000500879
4. 010000500889
5. 010000501616
6. 010000502150
7. 010000600193
8. 010000600872
9. 010000600876
10. 010000600877
[ 92167 others ]
1 ( 0.0% )
1 ( 0.0% )
1 ( 0.0% )
1 ( 0.0% )
1 ( 0.0% )
1 ( 0.0% )
1 ( 0.0% )
1 ( 0.0% )
1 ( 0.0% )
1 ( 0.0% )
92167 ( 100.0% )
92177 (100.0%) 0 (0.0%)
2 LSTATE [character]
1. CA
2. TX
3. NY
4. IL
5. FL
6. OH
7. MI
8. PA
9. NC
10. NJ
[ 45 others ]
9574 ( 10.4% )
8298 ( 9.0% )
4541 ( 4.9% )
3996 ( 4.3% )
3714 ( 4.0% )
3393 ( 3.7% )
3231 ( 3.5% )
2891 ( 3.1% )
2518 ( 2.7% )
2447 ( 2.7% )
47574 ( 51.6% )
92177 (100.0%) 0 (0.0%)
3 PK [numeric]
Mean (sd) : 31.1 (36.8)
min ≤ med ≤ max:
0 ≤ 22 ≤ 650
IQR (CV) : 42 (1.2)
315 distinct values 32980 (35.8%) 59197 (64.2%)
4 KG [numeric]
Mean (sd) : 67.7 (45.6)
min ≤ med ≤ max:
0 ≤ 65 ≤ 948
IQR (CV) : 54 (0.7)
382 distinct values 53020 (57.5%) 39157 (42.5%)
5 G01 [numeric]
Mean (sd) : 67.2 (43.1)
min ≤ med ≤ max:
0 ≤ 65 ≤ 678
IQR (CV) : 52 (0.6)
339 distinct values 53274 (57.8%) 38903 (42.2%)
6 G02 [numeric]
Mean (sd) : 67.5 (43)
min ≤ med ≤ max:
0 ≤ 66 ≤ 548
IQR (CV) : 53 (0.6)
333 distinct values 53313 (57.8%) 38864 (42.2%)
7 G03 [numeric]
Mean (sd) : 69.5 (45)
min ≤ med ≤ max:
0 ≤ 67 ≤ 757
IQR (CV) : 55 (0.6)
349 distinct values 53271 (57.8%) 38906 (42.2%)
8 G04 [numeric]
Mean (sd) : 71.2 (47.4)
min ≤ med ≤ max:
0 ≤ 68 ≤ 763
IQR (CV) : 57 (0.7)
375 distinct values 52936 (57.4%) 39241 (42.6%)
9 G05 [numeric]
Mean (sd) : 73.1 (55.9)
min ≤ med ≤ max:
0 ≤ 68 ≤ 808
IQR (CV) : 60 (0.8)
487 distinct values 51724 (56.1%) 40453 (43.9%)
10 G06 [numeric]
Mean (sd) : 98.3 (110.6)
min ≤ med ≤ max:
0 ≤ 63 ≤ 1155
IQR (CV) : 114 (1.1)
637 distinct values 37794 (41.0%) 54383 (59.0%)
11 G07 [numeric]
Mean (sd) : 112.1 (129.5)
min ≤ med ≤ max:
0 ≤ 62 ≤ 1281
IQR (CV) : 170 (1.2)
684 distinct values 32730 (35.5%) 59447 (64.5%)
12 G08 [numeric]
Mean (sd) : 111.1 (130.3)
min ≤ med ≤ max:
0 ≤ 60 ≤ 1337
IQR (CV) : 170 (1.2)
693 distinct values 32964 (35.8%) 59213 (64.2%)
13 G09 [numeric]
Mean (sd) : 138.5 (193)
min ≤ med ≤ max:
0 ≤ 54 ≤ 2799
IQR (CV) : 199 (1.4)
983 distinct values 27976 (30.4%) 64201 (69.6%)
14 G10 [numeric]
Mean (sd) : 133.6 (184.8)
min ≤ med ≤ max:
0 ≤ 53 ≤ 1649
IQR (CV) : 189 (1.4)
938 distinct values 27874 (30.2%) 64303 (69.8%)
15 G11 [numeric]
Mean (sd) : 128 (176.4)
min ≤ med ≤ max:
0 ≤ 52 ≤ 1650
IQR (CV) : 179 (1.4)
908 distinct values 27919 (30.3%) 64258 (69.7%)
16 G12 [numeric]
Mean (sd) : 126.1 (170.7)
min ≤ med ≤ max:
0 ≤ 55 ≤ 2125
IQR (CV) : 178 (1.4)
885 distinct values 27877 (30.2%) 64300 (69.8%)
17 TOTAL [numeric]
Mean (sd) : 531 (439.7)
min ≤ med ≤ max:
0 ≤ 448 ≤ 5839
IQR (CV) : 394 (0.8)
2906 distinct values 92077 (99.9%) 100 (0.1%)
18 FTE [numeric]
Mean (sd) : 33.5 (25.4)
min ≤ med ≤ max:
0 ≤ 28.3 ≤ 1419
IQR (CV) : 23.4 (0.8)
9938 distinct values 89244 (96.8%) 2933 (3.2%)
19 SCHOOL_TYPE_TEXT [character]
1. Alternative/other school
2. Regular school
3. Special education school
4. Vocational school
3975 ( 4.3% )
86522 ( 93.9% )
627 ( 0.7% )
1053 ( 1.1% )
92177 (100.0%) 0 (0.0%)
20 SY_STATUS_TEXT [character] 1. Currently operational
92177 ( 100.0% )
92177 (100.0%) 0 (0.0%)
21 SCHOOL_LEVEL [character]
1. Elementary
2. High
3. Middle
4. Secondary
52745 ( 57.2% )
22606 ( 24.5% )
16289 ( 17.7% )
537 ( 0.6% )
92177 (100.0%) 0 (0.0%)
22 CHARTER_TEXT [character]
1. No
2. Not Applicable
3. Yes
80634 ( 87.5% )
5738 ( 6.2% )
5805 ( 6.3% )
92177 (100.0%) 0 (0.0%)
23 MAGNET_TEXT [character]
1. Missing
2. No
3. Not Applicable
4. Yes
5671 ( 6.2% )
71179 ( 77.2% )
12022 ( 13.0% )
3305 ( 3.6% )
92177 (100.0%) 0 (0.0%)

Generated by summarytools 1.0.1 (R version 4.2.2)
2023-03-28

Code Section 2 - Tidy dataset

Steps

  1. Mutate to create indicators for magnet and charter schools (.= unknown, 1= true, 0=false)
  2. Mutate to create school level with two categories: primary (elementary) & secondary (middle, high school, secondar school)
  3. Select only variables needed for analysis
  4. Tidy dataset to create one observation for measure: student count, teacher count
# Mutate    CHARTER_TEXT to create indicator variable  

  publicschool_pretidy2<-
     publicschool_pretidy %>% 
        mutate(charter_ind =  if_else(CHARTER_TEXT== "Yes", 1,
                              if_else(CHARTER_TEXT== "No", 0, 9)))
               
               
     #sanity check
     table(publicschool_pretidy2$CHARTER_TEXT, publicschool_pretidy2$charter_ind, useNA="ifany")
                
                     0     1     9
  No             80634     0     0
  Not Applicable     0     0  5738
  Yes                0  5805     0
# Mutate MAGNET_TEXT to indicator variable
     
  publicschool_pretidy2<-
     publicschool_pretidy2 %>% 
        mutate(magnet_ind =  if_else(MAGNET_TEXT== "Yes", 1,
                             if_else(MAGNET_TEXT== "No", 0, 9)))
               
     #sanity check
     table(publicschool_pretidy2$MAGNET_TEXT, publicschool_pretidy2$magnet_ind, useNA="ifany")    
                
                     0     1     9
  Missing            0     0  5671
  No             71179     0     0
  Not Applicable     0     0 12022
  Yes                0  3305     0
# Mutate SCHOOL_LEVEL to indicator variable     
     
  publicschool_pretidy2<-
     publicschool_pretidy2 %>% 
        mutate(level =  if_else(SCHOOL_LEVEL== "Elementary", 'primary', 'secondary'))
               
     # sanity check
     table(publicschool_pretidy2$level, useNA="ifany")   

  primary secondary 
    52745     39432 
# Remove school string from SCHOOL_TYPE_TEXT 
     
  publicschool_pretidy2<-
     publicschool_pretidy2 %>% 
        mutate(school_type = str_remove(SCHOOL_TYPE_TEXT, 'school'))
  
     # sanity check
     table(publicschool_pretidy2$school_type, publicschool_pretidy2$SCHOOL_TYPE_TEXT,useNA="ifany")   
                    
                     Alternative/other school Regular school
  Alternative/other                      3975              0
  Regular                                   0          86522
  Special education                         0              0
  Vocational                                0              0
                    
                     Special education school Vocational school
  Alternative/other                         0                 0
  Regular                                   0                 0
  Special education                       627                 0
  Vocational                                0              1053
# Rename variables to get more descriptive name prior to pivot
     
  publicschool_pretidy2 <-
    publicschool_pretidy2 %>% 
       rename(  "total_students" = "TOTAL"
              , "total_faculty"  = "FTE"
              , "state"          = "LSTATE"
              , "school_id"      = "NCESSCH")
  
  colnames(publicschool_pretidy2)  
 [1] "school_id"        "state"            "PK"               "KG"              
 [5] "G01"              "G02"              "G03"              "G04"             
 [9] "G05"              "G06"              "G07"              "G08"             
[13] "G09"              "G10"              "G11"              "G12"             
[17] "total_students"   "total_faculty"    "SCHOOL_TYPE_TEXT" "SY_STATUS_TEXT"  
[21] "SCHOOL_LEVEL"     "CHARTER_TEXT"     "MAGNET_TEXT"      "charter_ind"     
[25] "magnet_ind"       "level"            "school_type"     
# Calculate ratio 
     
  publicschool_pretidy2 <-
    publicschool_pretidy2 %>% 
       mutate(ratio_student_faculty = total_students/total_faculty) 
  
  colnames(publicschool_pretidy2)    
 [1] "school_id"             "state"                 "PK"                   
 [4] "KG"                    "G01"                   "G02"                  
 [7] "G03"                   "G04"                   "G05"                  
[10] "G06"                   "G07"                   "G08"                  
[13] "G09"                   "G10"                   "G11"                  
[16] "G12"                   "total_students"        "total_faculty"        
[19] "SCHOOL_TYPE_TEXT"      "SY_STATUS_TEXT"        "SCHOOL_LEVEL"         
[22] "CHARTER_TEXT"          "MAGNET_TEXT"           "charter_ind"          
[25] "magnet_ind"            "level"                 "school_type"          
[28] "ratio_student_faculty"
# Limit to analysis variables and values 
  

  publicschool_pretidy2<-
     publicschool_pretidy2 %>% 
         select("school_id", "state", "level", "school_type", "magnet_ind", "charter_ind", "total_students", "total_faculty", "ratio_student_faculty")
  
  head(publicschool_pretidy2)
# A tibble: 6 × 9
  school_id    state level     school_…¹ magne…² chart…³ total…⁴ total…⁵ ratio…⁶
  <chr>        <chr> <chr>     <chr>       <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
1 020051000480 AK    primary   "Regular…       0       0     447  24.9     18.0 
2 020061000470 AK    secondary "Alterna…       0       0      30   3       10   
3 020039000448 AK    primary   "Regular…       0       1     167  10.4     16.1 
4 020039000463 AK    primary   "Regular…       0       1     260  16.8     15.5 
5 020039000513 AK    secondary "Alterna…       0       0       5   0.670    7.46
6 020072000340 AK    primary   "Regular…       0       0     217  13.5     16.1 
# … with abbreviated variable names ¹​school_type, ²​magnet_ind, ³​charter_ind,
#   ⁴​total_students, ⁵​total_faculty, ⁶​ratio_student_faculty
# pivot data to tidy by creating variables type of total (student versus faculty) and actual total (n)  
  publicschool_tidy <- 
     publicschool_pretidy2 %>%
        pivot_longer( cols     = c("total_students", "total_faculty", "ratio_student_faculty"), 
                      names_to = "measure_type"  )
  
     #sanity checks
     colnames(publicschool_tidy)
[1] "school_id"    "state"        "level"        "school_type"  "magnet_ind"  
[6] "charter_ind"  "measure_type" "value"       
     print(summarytools::dfSummary(publicschool_tidy, 
                        plain.ascii  = FALSE, 
                        style        = "grid", 
                        graph.magnif = 0.70, ),
      method = 'render',
      table.classes = 'table-condensed')    

Data Frame Summary

publicschool_tidy

Dimensions: 276531 x 8
Duplicates: 0
No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
1 school_id [character]
1. 010000500870
2. 010000500871
3. 010000500879
4. 010000500889
5. 010000501616
6. 010000502150
7. 010000600193
8. 010000600872
9. 010000600876
10. 010000600877
[ 92167 others ]
3 ( 0.0% )
3 ( 0.0% )
3 ( 0.0% )
3 ( 0.0% )
3 ( 0.0% )
3 ( 0.0% )
3 ( 0.0% )
3 ( 0.0% )
3 ( 0.0% )
3 ( 0.0% )
276501 ( 100.0% )
276531 (100.0%) 0 (0.0%)
2 state [character]
1. CA
2. TX
3. NY
4. IL
5. FL
6. OH
7. MI
8. PA
9. NC
10. NJ
[ 45 others ]
28722 ( 10.4% )
24894 ( 9.0% )
13623 ( 4.9% )
11988 ( 4.3% )
11142 ( 4.0% )
10179 ( 3.7% )
9693 ( 3.5% )
8673 ( 3.1% )
7554 ( 2.7% )
7341 ( 2.7% )
142722 ( 51.6% )
276531 (100.0%) 0 (0.0%)
3 level [character]
1. primary
2. secondary
158235 ( 57.2% )
118296 ( 42.8% )
276531 (100.0%) 0 (0.0%)
4 school_type [character]
1. Alternative/other 
·
2. Regular 
·
3. Special education 
·
4. Vocational 
·
11925 ( 4.3% )
259566 ( 93.9% )
1881 ( 0.7% )
3159 ( 1.1% )
276531 (100.0%) 0 (0.0%)
5 magnet_ind [numeric]
Mean (sd) : 1.8 (3.5)
min ≤ med ≤ max:
0 ≤ 0 ≤ 9
IQR (CV) : 0 (2)
0 : 213537 ( 77.2% )
1 : 9915 ( 3.6% )
9 : 53079 ( 19.2% )
276531 (100.0%) 0 (0.0%)
6 charter_ind [numeric]
Mean (sd) : 0.6 (2.2)
min ≤ med ≤ max:
0 ≤ 0 ≤ 9
IQR (CV) : 0 (3.5)
0 : 241902 ( 87.5% )
1 : 17415 ( 6.3% )
9 : 17214 ( 6.2% )
276531 (100.0%) 0 (0.0%)
7 measure_type [character]
1. ratio_student_faculty
2. total_faculty
3. total_students
92177 ( 33.3% )
92177 ( 33.3% )
92177 ( 33.3% )
276531 (100.0%) 0 (0.0%)
8 value [numeric]
Mean (sd) : Inf (NaN)
min ≤ med ≤ max:
0 ≤ 28 ≤ Inf
IQR (CV) : 270.3 (NaN)
79293 distinct values 270152 (97.7%) 6379 (2.3%)

Generated by summarytools 1.0.1 (R version 4.2.2)
2023-03-28

Univariate Visualizations

I’m using this exercise more to figure out ggplot than the dataset hence very simple choices in what is being displayed.

This plot shows the total count of schools per state stacked by level

library(ggplot2)

# isolate ratio 
publicschool_ratio <- 
     publicschool_tidy %>%
         filter(school_type == "Regular " & measure_type == "ratio_student_faculty" & !is.na(value))


   # group to get mean per categories 
   publicschools_ratio_grouped <-
      publicschool_ratio %>%
          group_by(state, level)%>%
              summarise(n_schools = n(),
                        mean_value   = mean(value),
                        med_value    = median(value))%>%
              arrange(state, level )

   # Stacked
   ggplot(publicschools_ratio_grouped, aes(fill=level, y=n_schools, x=state)) + 
          geom_bar(position="stack", stat="identity") + 
          ggtitle("Count of Schools by State and Education Level with Student and Faculty")+
          labs(y = "School Count", x = "State", colour = "Education Level")+
          theme(axis.text.x = element_text(angle = 90))

Bivariate Visualization(s)

For sake of simplicity, will evaluate the relationship of students to faculty via scatter plot, grouping data my school to get the mean of both to make the graph less busy

# isolate totals 
publicschool_totals <- 
     publicschool_tidy %>%
         filter(school_type == "Regular " & measure_type == "total_students"& !is.na(value)|measure_type == "total_faculty"&!is.na(value))   


   publicschools_totals_grouped <-
      publicschool_totals %>%
          group_by(state, measure_type)%>%
              summarise( mean_value   = mean(value) )%>%
              arrange(state, measure_type)


# pivot wide to plot student n by faculty b 
  publicschool_totals_wide <-
     publicschools_totals_grouped %>%
        pivot_wider(names_from = measure_type, values_from = mean_value)
  
  publicschool_plot_wide <-
     publicschool_totals_wide %>%
         filter(!is.na(total_students)&!is.na(total_faculty))   

  #scatter student n by faculty b, color by state  
  ggplot(publicschool_plot_wide, aes(x=total_students, y=total_faculty, color = state)) +         geom_point()+
  geom_smooth(method=lm , color="red", se=FALSE)+
  ggtitle("Average Count of Students and Faculty by State") +
  labs(x = "Student Count", y = "Faculty Count")