Data Visualization
mdata <- read_excel("ActiveDuty_MaritalStatus.xls", skip=8)
colnames(mdata)
[1] "...1" "Pay Grade" "Male...3" "Female...4"
[5] "Total...5" "Male...6" "Female...7" "Total...8"
[9] "Male...9" "Female...10" "Total...11" "Male...12"
[13] "Female...13" "Total...14" "Male...15" "Female...16"
[17] "Total...17"
colnames(mdata) <- c("d1","pay_grade","m_single_nochild","f_single_nochild","d2","m_single_child","f_single_child","d3","m_joint","f_joint","d4","m_married","f_married","d5","m_total","f_total","d6")
head(mdata)
# A tibble: 6 x 17
d1 pay_grade m_single_nochild f_single_nochild d2
<chr> <chr> <dbl> <dbl> <dbl>
1 <NA> E-1 31229 5717 36946
2 <NA> E-2 53094 8388 61482
3 <NA> E-3 131091 21019 152110
4 <NA> E-4 112710 16381 129091
5 <NA> E-5 57989 11021 69010
6 <NA> E-6 19125 4654 23779
# ... with 12 more variables: m_single_child <dbl>,
# f_single_child <dbl>, d3 <dbl>, m_joint <dbl>, f_joint <dbl>,
# d4 <dbl>, m_married <dbl>, f_married <dbl>, d5 <dbl>,
# m_total <dbl>, f_total <dbl>, d6 <dbl>
Delete the columns which are repetative or not required for analysis.
mdata <- select(mdata, ! starts_with("d"))
head(mdata,12)
# A tibble: 12 x 11
pay_grade m_single_nochild f_single_nochild m_single_child
<chr> <dbl> <dbl> <dbl>
1 E-1 31229 5717 563
2 E-2 53094 8388 1457
3 E-3 131091 21019 4264
4 E-4 112710 16381 9491
5 E-5 57989 11021 10937
6 E-6 19125 4654 10369
7 E-7 5446 1913 6530
8 E-8 1009 438 1786
9 E-9 381 202 579
10 TOTAL ENLISTED 412074 69733 45976
11 O-1 13495 3081 402
12 O-2 11029 2715 426
# ... with 7 more variables: f_single_child <dbl>, m_joint <dbl>,
# f_joint <dbl>, m_married <dbl>, f_married <dbl>, m_total <dbl>,
# f_total <dbl>
Lets compare the distribution of total count of officers in different categories.
total_paygrade <- filter(mdata, str_detect(pay_grade, "^TOTAL"))
total_paygrade
# A tibble: 3 x 11
pay_grade m_single_nochild f_single_nochild m_single_child
<chr> <dbl> <dbl> <dbl>
1 TOTAL ENLISTED 412074 69733 45976
2 TOTAL OFFICER 44163 13746 4458
3 TOTAL WARRANT 1374 355 994
# ... with 7 more variables: f_single_child <dbl>, m_joint <dbl>,
# f_joint <dbl>, m_married <dbl>, f_married <dbl>, m_total <dbl>,
# f_total <dbl>
total_paygrade_pivotted <- pivot_longer(total_paygrade, c("m_single_nochild":"f_married"), names_to = "category", values_to = "count")
head(total_paygrade_pivotted,10)
# A tibble: 10 x 5
pay_grade m_total f_total category count
<chr> <dbl> <dbl> <chr> <dbl>
1 TOTAL ENLISTED 1017201 166801 m_single_nochild 412074
2 TOTAL ENLISTED 1017201 166801 f_single_nochild 69733
3 TOTAL ENLISTED 1017201 166801 m_single_child 45976
4 TOTAL ENLISTED 1017201 166801 f_single_child 21759
5 TOTAL ENLISTED 1017201 166801 m_joint 40696
6 TOTAL ENLISTED 1017201 166801 f_joint 37570
7 TOTAL ENLISTED 1017201 166801 m_married 518455
8 TOTAL ENLISTED 1017201 166801 f_married 37739
9 TOTAL OFFICER 177403 34338 m_single_nochild 44163
10 TOTAL OFFICER 177403 34338 f_single_nochild 13746
ggplot(total_paygrade_pivotted, aes(x=category, y=count, fill=category)) +
geom_bar(stat = "identity", position = 'dodge') +
facet_wrap(vars(pay_grade), scales = "free") +
scale_x_discrete(guide = guide_axis(n.dodge=4)) +
theme_bw()
We can also look at the numbers within a specific pay_grade
enlisted <- mdata %>%
filter(! str_detect(pay_grade, "^TOTAL")) %>%
filter(str_detect(pay_grade, "^E"))
enlisted
# A tibble: 9 x 11
pay_grade m_single_nochild f_single_nochild m_single_child
<chr> <dbl> <dbl> <dbl>
1 E-1 31229 5717 563
2 E-2 53094 8388 1457
3 E-3 131091 21019 4264
4 E-4 112710 16381 9491
5 E-5 57989 11021 10937
6 E-6 19125 4654 10369
7 E-7 5446 1913 6530
8 E-8 1009 438 1786
9 E-9 381 202 579
# ... with 7 more variables: f_single_child <dbl>, m_joint <dbl>,
# f_joint <dbl>, m_married <dbl>, f_married <dbl>, m_total <dbl>,
# f_total <dbl>
officer <- mdata %>%
filter(! str_detect(pay_grade, "^TOTAL")) %>%
filter(str_detect(pay_grade, "^O"))
officer
# A tibble: 10 x 11
pay_grade m_single_nochild f_single_nochild m_single_child
<chr> <dbl> <dbl> <dbl>
1 O-1 13495 3081 402
2 O-2 11029 2715 426
3 O-3 14551 5056 1442
4 O-4 3480 1720 1190
5 O-5 1244 810 729
6 O-6 353 349 261
7 O-7 5 7 7
8 O-8 4 7 0
9 O-9 1 1 1
10 O-10 1 0 0
# ... with 7 more variables: f_single_child <dbl>, m_joint <dbl>,
# f_joint <dbl>, m_married <dbl>, f_married <dbl>, m_total <dbl>,
# f_total <dbl>
warrant <- mdata %>%
filter(! str_detect(pay_grade, "^TOTAL")) %>%
filter(str_detect(pay_grade, "^W"))
warrant
# A tibble: 5 x 11
pay_grade m_single_nochild f_single_nochild m_single_child
<chr> <dbl> <dbl> <dbl>
1 W-1 354 68 160
2 W-2 658 151 358
3 W-3 221 77 283
4 W-4 116 47 169
5 W-5 25 12 24
# ... with 7 more variables: f_single_child <dbl>, m_joint <dbl>,
# f_joint <dbl>, m_married <dbl>, f_married <dbl>, m_total <dbl>,
# f_total <dbl>
Lets observe the change in the count of enlisted officers in different pay_grade by gender and their marital information.
enlisted %>%
gather(key = "m_by", value = "count", 6:9) %>%
mutate(marriage_with = case_when(endsWith(as.character(m_by), "joint") ~ "joint", TRUE ~ "Civilian")) %>%
mutate(gender = case_when(startsWith(as.character(m_by), "m") ~ "male", TRUE ~ "female")) %>%
ggplot(aes(x = marriage_with, y= count, fill=gender)) +
geom_bar(stat = "identity", position = 'dodge') +
facet_wrap(vars(pay_grade), scales="free") +
theme_bw() +
labs(title="Total Enlisted")
Lets observe the change in the count of officers enlisted in different pay_grade by gender and being single with and without a dependent child.
officer %>%
gather(key = "child_status", value = "count", 2:5) %>%
mutate(children=case_when(endsWith(as.character(child_status), "_child") ~ "Child", TRUE ~ "NoChild")) %>%
mutate(gender=case_when(startsWith(as.character(child_status), "m") ~ "male", TRUE ~ "female")) %>%
ggplot(aes(x = children, y = count, fill = gender)) +
geom_bar(stat = "identity", position = 'dodge') +
facet_wrap(vars(pay_grade), scales="free") +
theme_bw() +
labs(title="Officers")
Comparing the distribution of warrants by gender in different pay_grades.
warrant %>%
gather(key = "gender", value = "count", 10:11) %>%
ggplot(aes(x=gender, y=count)) +
geom_bar(stat = "identity", position = 'dodge') +
facet_wrap(vars(pay_grade)) +
theme_bw() +
labs(title = "Warrant Distribution", x="Gender", y="count")
Text and figures are licensed under Creative Commons Attribution CC BY-NC 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".
For attribution, please cite this work as
Prabhu (2022, March 2). Data Analytics and Computational Social Science: DACSS 601 HW5. Retrieved from https://github.com/DACSS/dacss_course_website/posts/httprpubscomsnehalhw5/
BibTeX citation
@misc{prabhu2022dacss, author = {Prabhu, Snehal}, title = {Data Analytics and Computational Social Science: DACSS 601 HW5}, url = {https://github.com/DACSS/dacss_course_website/posts/httprpubscomsnehalhw5/}, year = {2022} }