library(tidyverse)
library(readxl)
library(ggplot2)
::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE) knitr
Challenge 8 Instructions
Challenge Overview
Today’s challenge is to:
- read in multiple data sets, and describe the data set using both words and any supporting information (e.g., tables, etc)
- tidy data (as needed, including sanity checks)
- mutate variables as needed (including sanity checks)
- join two or more data sets and analyze some aspect of the joined data
I chose this dataset because it will be very similar to my final project dataset.
Read in data
Read in one (or more) of the following datasets, using the correct R package and command.
- military marriages ⭐⭐
<- c("payGrade","singleWO_Child_M","singleWO_Child_F","singleWO_Child_T",
cols "singleW_Child_M","singleW_Child_F","singleW_Child_T",
"JointService_M","JointService_F","JointService_T",
"CivilianMarriage_M","CivilianMarriage_F","CivilianMarriage_T",
"Total_M","Total_F","TotalT")
<- read_excel("_data/ActiveDuty_MaritalStatus.xls",range= "AirForce!B10:Q31",col_names = cols)
AirForce_MaritalStatus <- read_excel("_data/ActiveDuty_MaritalStatus.xls",range= "MarineCorps!B10:Q37",col_names = cols)
MarineCorps_MaritalStatus <- read_excel("_data/ActiveDuty_MaritalStatus.xls",range= "Navy!B10:Q36",col_names = cols)
Navy_MaritalStatus <- read_excel("_data/ActiveDuty_MaritalStatus.xls",range= "Army!B10:Q37",col_names = cols)
Army_MaritalStatus
head(AirForce_MaritalStatus)
# A tibble: 6 × 16
payGrade singleWO_Ch…¹ singl…² singl…³ singl…⁴ singl…⁵ singl…⁶ Joint…⁷ Joint…⁸
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 E-1 7721 1550 9271 27 5 32 49 27
2 E-2 4380 1010 5390 33 9 42 97 105
3 E-3 29725 7108 36833 396 266 662 1258 1687
4 E-4 20805 4756 25561 987 842 1829 3036 3207
5 E-5 14623 4104 18727 2755 2171 4926 6154 5519
6 E-6 3660 1377 5037 2446 1449 3895 3654 3263
# … with 7 more variables: JointService_T <dbl>, CivilianMarriage_M <dbl>,
# CivilianMarriage_F <dbl>, CivilianMarriage_T <dbl>, Total_M <dbl>,
# Total_F <dbl>, TotalT <dbl>, and abbreviated variable names
# ¹singleWO_Child_M, ²singleWO_Child_F, ³singleWO_Child_T, ⁴singleW_Child_M,
# ⁵singleW_Child_F, ⁶singleW_Child_T, ⁷JointService_M, ⁸JointService_F
head(MarineCorps_MaritalStatus)
# A tibble: 6 × 16
payGrade singleWO_Ch…¹ singl…² singl…³ singl…⁴ singl…⁵ singl…⁶ Joint…⁷ Joint…⁸
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 E-1 6232 583 6815 54 3 57 20 19
2 E-2 15916 1336 17252 190 12 202 98 109
3 E-3 34868 1864 36732 574 167 741 780 928
4 E-4 17862 1026 18888 527 272 799 1070 944
5 E-5 7490 590 8080 931 338 1269 1199 730
6 E-6 2051 208 2259 1060 226 1286 568 341
# … with 7 more variables: JointService_T <dbl>, CivilianMarriage_M <dbl>,
# CivilianMarriage_F <dbl>, CivilianMarriage_T <dbl>, Total_M <dbl>,
# Total_F <dbl>, TotalT <dbl>, and abbreviated variable names
# ¹singleWO_Child_M, ²singleWO_Child_F, ³singleWO_Child_T, ⁴singleW_Child_M,
# ⁵singleW_Child_F, ⁶singleW_Child_T, ⁷JointService_M, ⁸JointService_F
head(Navy_MaritalStatus)
# A tibble: 6 × 16
payGrade singleWO_Ch…¹ singl…² singl…³ singl…⁴ singl…⁵ singl…⁶ Joint…⁷ Joint…⁸
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 E-1 7820 2275 10095 117 34 151 30 57
2 E-2 11198 2718 13916 237 87 324 113 164
3 E-3 28163 6396 34559 681 788 1469 861 1288
4 E-4 23285 4266 27551 967 949 1916 1526 1691
5 E-5 18856 3649 22505 2837 1787 4624 2191 2099
6 E-6 5917 1429 7346 2725 1284 4009 1654 1461
# … with 7 more variables: JointService_T <dbl>, CivilianMarriage_M <dbl>,
# CivilianMarriage_F <dbl>, CivilianMarriage_T <dbl>, Total_M <dbl>,
# Total_F <dbl>, TotalT <dbl>, and abbreviated variable names
# ¹singleWO_Child_M, ²singleWO_Child_F, ³singleWO_Child_T, ⁴singleW_Child_M,
# ⁵singleW_Child_F, ⁶singleW_Child_T, ⁷JointService_M, ⁸JointService_F
head(Army_MaritalStatus)
# A tibble: 6 × 16
payGrade singleWO_Ch…¹ singl…² singl…³ singl…⁴ singl…⁵ singl…⁶ Joint…⁷ Joint…⁸
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 E-1 9456 1309 10765 365 80 445 40 38
2 E-2 21600 3324 24924 997 167 1164 130 201
3 E-3 38335 5651 43986 2613 699 3312 680 999
4 E-4 50758 6333 57091 7010 2599 9609 3029 3936
5 E-5 17020 2678 19698 4414 2280 6694 2915 2769
6 E-6 7497 1640 9137 4138 2003 6141 2598 1896
# … with 7 more variables: JointService_T <dbl>, CivilianMarriage_M <dbl>,
# CivilianMarriage_F <dbl>, CivilianMarriage_T <dbl>, Total_M <dbl>,
# Total_F <dbl>, TotalT <dbl>, and abbreviated variable names
# ¹singleWO_Child_M, ²singleWO_Child_F, ³singleWO_Child_T, ⁴singleW_Child_M,
# ⁵singleW_Child_F, ⁶singleW_Child_T, ⁷JointService_M, ⁸JointService_F
Briefly describe the data
Tidy Data (as needed)
Is your data already tidy, or is there work to be done? Be sure to anticipate your end result to provide a sanity check, and document your work here.
#final shape:
#cols = Pay grade, sex, Martital status
<- function(inSheet){
tidyData # First we remove totals
<- inSheet%>%
tidySheet select(c("payGrade","singleWO_Child_M","singleWO_Child_F",
"singleW_Child_M","singleW_Child_F",
"JointService_M","JointService_F",
"CivilianMarriage_M","CivilianMarriage_F"))%>%
filter(!str_detect(`payGrade`,"TOTAL"))
#could just put this in the same statement but for readability, I will separate.
# pivoting and fixing col names
<-tidySheet%>%
pivotedSheetpivot_longer(cols=c("singleWO_Child_M","singleWO_Child_F",
"singleW_Child_M","singleW_Child_F",
"JointService_M","JointService_F",
"CivilianMarriage_M","CivilianMarriage_F"), names_to = "Marital Status",values_to = "Count")%>%
mutate(`Marital Status`,Sex=substr(`Marital Status`,nchar(`Marital Status`),nchar(`Marital Status`)),.after=`payGrade`)%>%
mutate(`Marital Status`,`Marital Status`=substr(`Marital Status`,0,nchar(`Marital Status`)-2))
<- pivotedSheet
inSheet }
Are there any variables that require mutation to be usable in your analysis stream? For example, do you need to calculate new values in order to graph them? Can string values be represented numerically? Do you need to turn any variables into factors and reorder for ease of graphics and visualization?
Document your work here.
Using above declared function to clean all data.
<- tidyData(AirForce_MaritalStatus)
AirForce_MaritalStatus<- tidyData(MarineCorps_MaritalStatus)
MarineCorps_MaritalStatus<- tidyData(Navy_MaritalStatus)
Navy_MaritalStatus<- tidyData(Army_MaritalStatus)
Army_MaritalStatus
<- c("payGrade", "Sex", "Marital Status", "Airforce_Count")
colNamescolnames(AirForce_MaritalStatus)<-colNames
<- c("payGrade", "Sex", "Marital Status", "MarineCorps_Count")
colNamescolnames(MarineCorps_MaritalStatus)<-colNames
<- c("payGrade", "Sex", "Marital Status", "Navy_Count")
colNamescolnames(Navy_MaritalStatus)<-colNames
<- c("payGrade", "Sex", "Marital Status", "Army_Count")
colNamescolnames(Army_MaritalStatus)<-colNames
head(AirForce_MaritalStatus)
# A tibble: 6 × 4
payGrade Sex `Marital Status` Airforce_Count
<chr> <chr> <chr> <dbl>
1 E-1 M singleWO_Child 7721
2 E-1 F singleWO_Child 1550
3 E-1 M singleW_Child 27
4 E-1 F singleW_Child 5
5 E-1 M JointService 49
6 E-1 F JointService 27
head(MarineCorps_MaritalStatus)
# A tibble: 6 × 4
payGrade Sex `Marital Status` MarineCorps_Count
<chr> <chr> <chr> <dbl>
1 E-1 M singleWO_Child 6232
2 E-1 F singleWO_Child 583
3 E-1 M singleW_Child 54
4 E-1 F singleW_Child 3
5 E-1 M JointService 20
6 E-1 F JointService 19
head(Navy_MaritalStatus)
# A tibble: 6 × 4
payGrade Sex `Marital Status` Navy_Count
<chr> <chr> <chr> <dbl>
1 E-1 M singleWO_Child 7820
2 E-1 F singleWO_Child 2275
3 E-1 M singleW_Child 117
4 E-1 F singleW_Child 34
5 E-1 M JointService 30
6 E-1 F JointService 57
head(Army_MaritalStatus)
# A tibble: 6 × 4
payGrade Sex `Marital Status` Army_Count
<chr> <chr> <chr> <dbl>
1 E-1 M singleWO_Child 9456
2 E-1 F singleWO_Child 1309
3 E-1 M singleW_Child 365
4 E-1 F singleW_Child 80
5 E-1 M JointService 40
6 E-1 F JointService 38
Join Data
Be sure to include a sanity check, and double-check that case count is correct!
First I ensure each type of marital status has information. Lengths of tibbles will be different because of different amounts of pay grades, if each tot col has 4 however that means there is representation of each category.
%>%
AirForce_MaritalStatusgroup_by(payGrade)%>%
summarise(tot = n_distinct(`Marital Status`))
# A tibble: 19 × 2
payGrade tot
<chr> <int>
1 E-1 4
2 E-2 4
3 E-3 4
4 E-4 4
5 E-5 4
6 E-6 4
7 E-7 4
8 E-8 4
9 E-9 4
10 O-1 4
11 O-10 4
12 O-2 4
13 O-3 4
14 O-4 4
15 O-5 4
16 O-6 4
17 O-7 4
18 O-8 4
19 O-9 4
%>%
MarineCorps_MaritalStatusgroup_by(`payGrade`)%>%
summarise(tot = n_distinct(`Marital Status`))
# A tibble: 24 × 2
payGrade tot
<chr> <int>
1 E-1 4
2 E-2 4
3 E-3 4
4 E-4 4
5 E-5 4
6 E-6 4
7 E-7 4
8 E-8 4
9 E-9 4
10 O-1 4
# … with 14 more rows
%>%
Navy_MaritalStatusgroup_by(`payGrade`)%>%
summarise(tot = n_distinct(`Marital Status`))
# A tibble: 23 × 2
payGrade tot
<chr> <int>
1 E-1 4
2 E-2 4
3 E-3 4
4 E-4 4
5 E-5 4
6 E-6 4
7 E-7 4
8 E-8 4
9 E-9 4
10 O-1 4
# … with 13 more rows
%>%
Army_MaritalStatusgroup_by(`payGrade`)%>%
summarise(tot = n_distinct(`Marital Status`))
# A tibble: 24 × 2
payGrade tot
<chr> <int>
1 E-1 4
2 E-2 4
3 E-3 4
4 E-4 4
5 E-5 4
6 E-6 4
7 E-7 4
8 E-8 4
9 E-9 4
10 O-1 4
# … with 14 more rows
Next I see how many different pay grades there are. From below we see that once all joins are done we should have at least 24 distinct pay grades. ::: {.cell}
nrow(AirForce_MaritalStatus)
[1] 152
nrow(MarineCorps_MaritalStatus)
[1] 192
%>%
AirForce_MaritalStatusdistinct((payGrade))%>%
nrow()
[1] 19
%>%
MarineCorps_MaritalStatusdistinct((payGrade))%>%
nrow()
[1] 24
%>%
Navy_MaritalStatusdistinct((payGrade))%>%
nrow()
[1] 23
%>%
Army_MaritalStatusdistinct((payGrade))%>%
nrow()
[1] 24
:::
<-full_join(AirForce_MaritalStatus,MarineCorps_MaritalStatus,by=c("payGrade"="payGrade","Sex"="Sex","Marital Status"="Marital Status"))
finalTable
<-full_join(finalTable,Navy_MaritalStatus,by=c("payGrade"="payGrade","Sex"="Sex","Marital Status"="Marital Status"))
finalTable
<-full_join(finalTable,Army_MaritalStatus,by=c("payGrade"="payGrade","Sex"="Sex","Marital Status"="Marital Status"))
finalTable
%>%
finalTabledistinct((payGrade))%>%
nrow()
[1] 24
%>%
finalTablegroup_by(`payGrade`)%>%
summarise(tot = n_distinct(`Marital Status`))
# A tibble: 24 × 2
payGrade tot
<chr> <int>
1 E-1 4
2 E-2 4
3 E-3 4
4 E-4 4
5 E-5 4
6 E-6 4
7 E-7 4
8 E-8 4
9 E-9 4
10 O-1 4
# … with 14 more rows
Final Result of joins: ::: {.cell}
head(finalTable)
# A tibble: 6 × 7
payGrade Sex `Marital Status` Airforce_Count MarineCorps_C…¹ Navy_…² Army_…³
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 E-1 M singleWO_Child 7721 6232 7820 9456
2 E-1 F singleWO_Child 1550 583 2275 1309
3 E-1 M singleW_Child 27 54 117 365
4 E-1 F singleW_Child 5 3 34 80
5 E-1 M JointService 49 20 30 40
6 E-1 F JointService 27 19 57 38
# … with abbreviated variable names ¹MarineCorps_Count, ²Navy_Count,
# ³Army_Count
:::