DACSS 601 HW5

Data Visualization

Snehal Prabhu
2/28/2022

Read the Active Duty Marital Status Excel Sheet

mdata <- read_excel("ActiveDuty_MaritalStatus.xls", skip=8)
colnames(mdata)
 [1] "...1"        "Pay Grade"   "Male...3"    "Female...4" 
 [5] "Total...5"   "Male...6"    "Female...7"  "Total...8"  
 [9] "Male...9"    "Female...10" "Total...11"  "Male...12"  
[13] "Female...13" "Total...14"  "Male...15"   "Female...16"
[17] "Total...17" 
colnames(mdata) <- c("d1","pay_grade","m_single_nochild","f_single_nochild","d2","m_single_child","f_single_child","d3","m_joint","f_joint","d4","m_married","f_married","d5","m_total","f_total","d6")

head(mdata)
# A tibble: 6 x 17
  d1    pay_grade m_single_nochild f_single_nochild     d2
  <chr> <chr>                <dbl>            <dbl>  <dbl>
1 <NA>  E-1                  31229             5717  36946
2 <NA>  E-2                  53094             8388  61482
3 <NA>  E-3                 131091            21019 152110
4 <NA>  E-4                 112710            16381 129091
5 <NA>  E-5                  57989            11021  69010
6 <NA>  E-6                  19125             4654  23779
# ... with 12 more variables: m_single_child <dbl>,
#   f_single_child <dbl>, d3 <dbl>, m_joint <dbl>, f_joint <dbl>,
#   d4 <dbl>, m_married <dbl>, f_married <dbl>, d5 <dbl>,
#   m_total <dbl>, f_total <dbl>, d6 <dbl>

Delete the columns which are repetative or not required for analysis.

mdata <- select(mdata, ! starts_with("d"))
head(mdata,12)
# A tibble: 12 x 11
   pay_grade      m_single_nochild f_single_nochild m_single_child
   <chr>                     <dbl>            <dbl>          <dbl>
 1 E-1                       31229             5717            563
 2 E-2                       53094             8388           1457
 3 E-3                      131091            21019           4264
 4 E-4                      112710            16381           9491
 5 E-5                       57989            11021          10937
 6 E-6                       19125             4654          10369
 7 E-7                        5446             1913           6530
 8 E-8                        1009              438           1786
 9 E-9                         381              202            579
10 TOTAL ENLISTED           412074            69733          45976
11 O-1                       13495             3081            402
12 O-2                       11029             2715            426
# ... with 7 more variables: f_single_child <dbl>, m_joint <dbl>,
#   f_joint <dbl>, m_married <dbl>, f_married <dbl>, m_total <dbl>,
#   f_total <dbl>

Lets compare the distribution of total count of officers in different categories.

total_paygrade <- filter(mdata, str_detect(pay_grade, "^TOTAL"))
total_paygrade
# A tibble: 3 x 11
  pay_grade      m_single_nochild f_single_nochild m_single_child
  <chr>                     <dbl>            <dbl>          <dbl>
1 TOTAL ENLISTED           412074            69733          45976
2 TOTAL OFFICER             44163            13746           4458
3 TOTAL WARRANT              1374              355            994
# ... with 7 more variables: f_single_child <dbl>, m_joint <dbl>,
#   f_joint <dbl>, m_married <dbl>, f_married <dbl>, m_total <dbl>,
#   f_total <dbl>
total_paygrade_pivotted <- pivot_longer(total_paygrade, c("m_single_nochild":"f_married"), names_to = "category", values_to = "count")
head(total_paygrade_pivotted,10)
# A tibble: 10 x 5
   pay_grade      m_total f_total category          count
   <chr>            <dbl>   <dbl> <chr>             <dbl>
 1 TOTAL ENLISTED 1017201  166801 m_single_nochild 412074
 2 TOTAL ENLISTED 1017201  166801 f_single_nochild  69733
 3 TOTAL ENLISTED 1017201  166801 m_single_child    45976
 4 TOTAL ENLISTED 1017201  166801 f_single_child    21759
 5 TOTAL ENLISTED 1017201  166801 m_joint           40696
 6 TOTAL ENLISTED 1017201  166801 f_joint           37570
 7 TOTAL ENLISTED 1017201  166801 m_married        518455
 8 TOTAL ENLISTED 1017201  166801 f_married         37739
 9 TOTAL OFFICER   177403   34338 m_single_nochild  44163
10 TOTAL OFFICER   177403   34338 f_single_nochild  13746
ggplot(total_paygrade_pivotted, aes(x=category, y=count, fill=category)) +
  geom_bar(stat = "identity", position = 'dodge') +
  facet_wrap(vars(pay_grade), scales = "free") +
  scale_x_discrete(guide = guide_axis(n.dodge=4)) +
  theme_bw() 

We can also look at the numbers within a specific pay_grade

enlisted <- mdata %>%
  filter(! str_detect(pay_grade, "^TOTAL")) %>%
  filter(str_detect(pay_grade, "^E"))
enlisted
# A tibble: 9 x 11
  pay_grade m_single_nochild f_single_nochild m_single_child
  <chr>                <dbl>            <dbl>          <dbl>
1 E-1                  31229             5717            563
2 E-2                  53094             8388           1457
3 E-3                 131091            21019           4264
4 E-4                 112710            16381           9491
5 E-5                  57989            11021          10937
6 E-6                  19125             4654          10369
7 E-7                   5446             1913           6530
8 E-8                   1009              438           1786
9 E-9                    381              202            579
# ... with 7 more variables: f_single_child <dbl>, m_joint <dbl>,
#   f_joint <dbl>, m_married <dbl>, f_married <dbl>, m_total <dbl>,
#   f_total <dbl>
officer <- mdata %>%
  filter(! str_detect(pay_grade, "^TOTAL")) %>%
  filter(str_detect(pay_grade, "^O"))
officer
# A tibble: 10 x 11
   pay_grade m_single_nochild f_single_nochild m_single_child
   <chr>                <dbl>            <dbl>          <dbl>
 1 O-1                  13495             3081            402
 2 O-2                  11029             2715            426
 3 O-3                  14551             5056           1442
 4 O-4                   3480             1720           1190
 5 O-5                   1244              810            729
 6 O-6                    353              349            261
 7 O-7                      5                7              7
 8 O-8                      4                7              0
 9 O-9                      1                1              1
10 O-10                     1                0              0
# ... with 7 more variables: f_single_child <dbl>, m_joint <dbl>,
#   f_joint <dbl>, m_married <dbl>, f_married <dbl>, m_total <dbl>,
#   f_total <dbl>
warrant <- mdata %>%
  filter(! str_detect(pay_grade, "^TOTAL")) %>%
  filter(str_detect(pay_grade, "^W"))
warrant
# A tibble: 5 x 11
  pay_grade m_single_nochild f_single_nochild m_single_child
  <chr>                <dbl>            <dbl>          <dbl>
1 W-1                    354               68            160
2 W-2                    658              151            358
3 W-3                    221               77            283
4 W-4                    116               47            169
5 W-5                     25               12             24
# ... with 7 more variables: f_single_child <dbl>, m_joint <dbl>,
#   f_joint <dbl>, m_married <dbl>, f_married <dbl>, m_total <dbl>,
#   f_total <dbl>

Lets observe the change in the count of enlisted officers in different pay_grade by gender and their marital information.

enlisted %>%
  gather(key = "m_by", value = "count", 6:9) %>%
  mutate(marriage_with = case_when(endsWith(as.character(m_by), "joint") ~ "joint", TRUE ~ "Civilian")) %>%
  mutate(gender = case_when(startsWith(as.character(m_by), "m") ~ "male", TRUE ~ "female")) %>%
  ggplot(aes(x = marriage_with, y= count, fill=gender)) +
  geom_bar(stat = "identity", position = 'dodge') +
  facet_wrap(vars(pay_grade), scales="free") +
  theme_bw() +
  labs(title="Total Enlisted")

Lets observe the change in the count of officers enlisted in different pay_grade by gender and being single with and without a dependent child.

officer %>%
  gather(key = "child_status", value = "count", 2:5) %>%
  mutate(children=case_when(endsWith(as.character(child_status), "_child") ~ "Child", TRUE ~ "NoChild")) %>%
  mutate(gender=case_when(startsWith(as.character(child_status), "m") ~ "male", TRUE ~ "female")) %>%
  ggplot(aes(x = children, y = count, fill = gender)) +
  geom_bar(stat = "identity", position = 'dodge') +
  facet_wrap(vars(pay_grade), scales="free") +
  theme_bw() +
  labs(title="Officers")

Comparing the distribution of warrants by gender in different pay_grades.

warrant %>%
  gather(key = "gender", value = "count", 10:11) %>%
  ggplot(aes(x=gender, y=count)) +
  geom_bar(stat = "identity", position = 'dodge') +
  facet_wrap(vars(pay_grade)) +
  theme_bw() +
  labs(title = "Warrant Distribution", x="Gender", y="count")

Reuse

Text and figures are licensed under Creative Commons Attribution CC BY-NC 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".

Citation

For attribution, please cite this work as

Prabhu (2022, March 2). Data Analytics and Computational Social Science: DACSS 601 HW5. Retrieved from https://github.com/DACSS/dacss_course_website/posts/httprpubscomsnehalhw5/

BibTeX citation

@misc{prabhu2022dacss,
  author = {Prabhu, Snehal},
  title = {Data Analytics and Computational Social Science: DACSS 601 HW5},
  url = {https://github.com/DACSS/dacss_course_website/posts/httprpubscomsnehalhw5/},
  year = {2022}
}