Challenge data

I went back to try to tidy some of the untidy data

April Merleaux
2022-03-27

Australian Marriage Survey

#Australian marriage survey data; just doing Table 1
#response to the question"Should the law be changed to allow same-sex couples to marry?"
library(readxl)
library(tidyverse)
excel_sheets("~/DATA/Classwork/Copy of Copy of australian_marriage_law_postal_survey_2017_response_final.xlsx")
[1] "Contents"          "Table 1"           "Table 2"          
[4] "Explanatory Notes"
ausMar <- read_excel("~/DATA/Classwork/Copy of Copy of australian_marriage_law_postal_survey_2017_response_final.xlsx", sheet = 2, skip = 6, col_types = c("text", "numeric", "skip", "numeric", "skip", "skip", "skip", "skip", "skip", "skip", "numeric", "skip", "numeric", "skip", "skip", "skip"), n_max = 9)
colnames(ausMar) <- c("Territory", "Yes", "No", "Unclear", "No Response")
ausMar1 <- ausMar %>% 
  rowwise() %>% 
  mutate(NoResp = `Unclear` + `No Response`) %>%
  select(-`Unclear`, -`No Response`) %>%
  pivot_longer(cols = `Yes`:`NoResp`, names_to = "response") %>%
  filter(`Territory` != "Australia") 
ausMar1$Territory <- str_remove(ausMar1$Territory, "\\(.\\)")
ausMar1
# A tibble: 24 x 3
   Territory       response   value
   <chr>           <chr>      <dbl>
 1 New South Wales Yes      2374362
 2 New South Wales No       1736838
 3 New South Wales NoResp   1076481
 4 Victoria        Yes      2145629
 5 Victoria        No       1161098
 6 Victoria        NoResp    754662
 7 Queensland      Yes      1487060
 8 Queensland      No        961015
 9 Queensland      NoResp    702798
10 South Australia Yes       592528
# ... with 14 more rows
ausMarSumm <- ausMar1 %>%
group_by(response) %>%
summarize(votes = sum(value)) 

ausMar1 %>%
  group_by(Territory) %>%
  ggplot(data = ausMar1, mapping = aes(x = response, y = value)) +
  geom_col()
ausMar1 %>%
  ggplot(data = ausMar1, mapping = aes(x = Territory, y = value, fill = response)) +
  geom_col() +
  coord_flip()

Railroad Employment

#total railroad employment by state and county
library(readxl)
excel_sheets("~/DATA/Classwork/CopyStateCounty2012.xlsx")
[1] "StateCounty"
railroad <- read_excel("~/DATA/Classwork/CopyStateCounty2012.xlsx", skip = 4, col_types = c("text", "skip", "text", "skip", "numeric"))
colnames(railroad) <- c("State", "County", "Total")
railroad <- railroad %>%
drop_na(County) %>%
filter(County != "AP", State != "AP")
railroad
# A tibble: 2,928 x 3
   State County               Total
   <chr> <chr>                <dbl>
 1 AK    ANCHORAGE                7
 2 AK    FAIRBANKS NORTH STAR     2
 3 AK    JUNEAU                   3
 4 AK    MATANUSKA-SUSITNA        2
 5 AK    SITKA                    1
 6 AK    SKAGWAY MUNICIPALITY    88
 7 AL    AUTAUGA                102
 8 AL    BALDWIN                143
 9 AL    BARBOUR                  1
10 AL    BIBB                    25
# ... with 2,918 more rows

Active Duty Military Marriage Status
variables Marital Status (Married, Single) children (Yes, No, NA) Pay grade (24 levels) gender (M, F) spouse (Military, Civilian, None)

#active duty military; just doing the Total DoD sheet for now and not trying to create a function to loop through as in the blog example
library(readxl)
excel_sheets("ActiveDuty_MaritalStatus.xls")
[1] "TotalDoD"    "AirForce"    "MarineCorps" "Navy"       
[5] "Army"       
Active <- read_excel("ActiveDuty_MaritalStatus.xls",
col_types = c("skip", "text", "numeric",
"numeric", "skip", "numeric", "numeric", "skip",
"numeric", "numeric", "skip", "numeric", "numeric",
"skip", "skip", "skip", "skip"),
skip = 8, sheet = "TotalDoD")
#rename columns with a _ separating the parts that will eventually be turned into grouping variables
colnames(Active) <- c("PayGrade", 
                      "Single_No_Male_None", 
                      "Single_No_Female_None", 
                      "Single_Yes_Male_None", 
                      "Single_Yes_Female_None", 
                      "Married_NA_Male_Military", 
                      "Married_NA_Female_Military", 
                      "Married_NA_Male_Civilian", 
                      "Married_NA_Female_Civilian")
#NA PayGrade if it has the word TOTAL in it and then drop_na to drop rows with NA (this two moves get rid of a bunch of random junk)
#TidierActive <- Active %>% mutate(across(c(PayGrade),  ~ replace(.,   str_detect(., "TOTAL"), NA)))
#TidierActive <- TidierActive %>% drop_na(PayGrade)

#works better
TidierActive <- filter(Active, str_detect(PayGrade, "TOTAL", negate=TRUE))

#pivot longer
ActivePivot1 <- TidierActive %>%
  pivot_longer(!PayGrade, names_to = "MaritalStatus", values_to = "count")

# separate into columns with demographics
#is this wrong? to separate the elements of the variable?
ActivePivot1 <- ActivePivot1 %>%
  separate(MaritalStatus, c("MarStat", "Kids", "Sex", "Spouse"), sep = "_") %>%
  separate(PayGrade, c("PayGrade", "PayLevel"), sep = "-")
ActivePivot1
# A tibble: 192 x 7
   PayGrade PayLevel MarStat Kids  Sex    Spouse   count
   <chr>    <chr>    <chr>   <chr> <chr>  <chr>    <dbl>
 1 E        1        Single  No    Male   None     31229
 2 E        1        Single  No    Female None      5717
 3 E        1        Single  Yes   Male   None       563
 4 E        1        Single  Yes   Female None       122
 5 E        1        Married NA    Male   Military   139
 6 E        1        Married NA    Female Military   141
 7 E        1        Married NA    Male   Civilian  5060
 8 E        1        Married NA    Female Civilian   719
 9 E        2        Single  No    Male   None     53094
10 E        2        Single  No    Female None      8388
# ... with 182 more rows
#crosstab
WageSex <- ActivePivot1 %>%
  group_by(Sex, PayGrade) %>%
  tally(count) %>%
  spread(Sex, n)

WageMarital <- ActivePivot1 %>%
  group_by(MarStat, PayGrade) %>%
  tally(count) %>%
  spread(MarStat, n)

#basic visual
ggplot(data = ActivePivot1) +
geom_col(mapping = aes(x = MarStat, y = count, fill = Sex)) 
#I am not going to try to loop through the other sheets, but I am going to try to repeat the whole thing with another sheet

#read in AirForce sheet
AirForce <- read_excel("ActiveDuty_MaritalStatus.xls",
col_types = c("skip", "text", "numeric",
"numeric", "skip", "numeric", "numeric", "skip",
"numeric", "numeric", "skip", "numeric", "numeric",
"skip", "skip", "skip", "skip"),
skip = 8, sheet = "AirForce") 

#rename columns with a _ separating the parts that will eventually be turned into grouping variables
colnames(AirForce) <- c("PayGrade", 
                      "Single_No_Male_None", 
                      "Single_No_Female_None", 
                      "Single_Yes_Male_None", 
                      "Single_Yes_Female_None", 
                      "Married_NA_Male_Military", 
                      "Married_NA_Female_Military", 
                      "Married_NA_Male_Civilian", 
                      "Married_NA_Female_Civilian")
#drop row if PayGrade has the word TOTAL in it (also drops rows that are all NAs, not sure why)
TidierAirForce <- filter(AirForce, str_detect(PayGrade, "TOTAL", negate=TRUE))

#pivot longer
TidyAirForce <- TidierAirForce %>%
  pivot_longer(!PayGrade, names_to = "MaritalStatus", values_to = "count")

# separate into columns with demographics
#is this wrong? to separate the elements of the variable?
TidyAirForce <- TidyAirForce %>%
  separate(MaritalStatus, c("MarStat", "Kids", "Sex", "Spouse"), sep = "_") %>%
  separate(PayGrade, c("PayGrade", "PayLevel"), sep = "-") %>%
  mutate("branch" = "Air Force")

# create an empty tibble to hold this data, to which I can add the other sheets 
#create a vector of the column names
columnsMilMar <- colnames(TidyAirForce)

# this is a dumb way to do it
#PayGradesMaritalStatus_All <- as_tibble(data.frame(matrix(nrow=0,ncol=length(columnsMilMar), dimnames = list(c(), c(columnsMilMar))))) 

#better but I don't really understand it lol
Tidy_ActiveDuty_MaritalStatus <- columnsMilMar %>% purrr::map_dfc(setNames, object = list(character())) 
Tidy_ActiveDuty_MaritalStatus$count <- as.numeric(Tidy_ActiveDuty_MaritalStatus$count)

#join to the main list Tidy_ActiveDuty_MaritalStatus
Tidy_ActiveDuty_MaritalStatus <- bind_rows(list(Tidy_ActiveDuty_MaritalStatus, TidyAirForce))

#repeat for the next branch
#read in MarineCorps sheet
MarineCorps <- read_excel("ActiveDuty_MaritalStatus.xls",
col_types = c("skip", "text", "numeric",
"numeric", "skip", "numeric", "numeric", "skip",
"numeric", "numeric", "skip", "numeric", "numeric",
"skip", "skip", "skip", "skip"),
skip = 8, sheet = "MarineCorps") 

#rename columns with a _ separating the parts that will eventually be turned into grouping variables
colnames(MarineCorps) <- c("PayGrade", 
                      "Single_No_Male_None", 
                      "Single_No_Female_None", 
                      "Single_Yes_Male_None", 
                      "Single_Yes_Female_None", 
                      "Married_NA_Male_Military", 
                      "Married_NA_Female_Military", 
                      "Married_NA_Male_Civilian", 
                      "Married_NA_Female_Civilian")
#drop row if PayGrade has the word TOTAL in it (also drops rows that are all NAs, not sure why)
TidierMarineCorps <- filter(MarineCorps, str_detect(PayGrade, "TOTAL", negate=TRUE))

#pivot longer
TidyMarineCorps <- TidierMarineCorps %>%
  pivot_longer(!PayGrade, names_to = "MaritalStatus", values_to = "count")

# separate into columns with demographics
#is this wrong? to separate the elements of the variable?
TidyMarineCorps <- TidyMarineCorps %>%
  separate(MaritalStatus, c("MarStat", "Kids", "Sex", "Spouse"), sep = "_") %>%
  separate(PayGrade, c("PayGrade", "PayLevel"), sep = "-") %>%
  mutate("branch" = "MarineCorps")

#join to the main list Tidy_ActiveDuty_MaritalStatus
Tidy_ActiveDuty_MaritalStatus <- bind_rows(list(Tidy_ActiveDuty_MaritalStatus, TidyMarineCorps))

#and again...

excel_sheets("ActiveDuty_MaritalStatus.xls")
[1] "TotalDoD"    "AirForce"    "MarineCorps" "Navy"       
[5] "Army"       
Navy <- read_excel("ActiveDuty_MaritalStatus.xls",
col_types = c("skip", "text", "numeric",
"numeric", "skip", "numeric", "numeric", "skip",
"numeric", "numeric", "skip", "numeric", "numeric",
"skip", "skip", "skip", "skip"),
skip = 8, sheet = "Navy")
#rename columns with a _ separating the parts that will eventually be turned into grouping variables
colnames(Navy) <- c("PayGrade",
"Single_No_Male_None",
"Single_No_Female_None",
"Single_Yes_Male_None",
"Single_Yes_Female_None",
"Married_NA_Male_Military",
"Married_NA_Female_Military",
"Married_NA_Male_Civilian",
"Married_NA_Female_Civilian")
#drop row if PayGrade has the word TOTAL in it (also drops rows that are all NAs, not sure why)
TidierNavy <- filter(Navy, str_detect(PayGrade, "TOTAL", negate=TRUE))
#pivot longer
TidyNavy <- TidierNavy %>%
pivot_longer(!PayGrade, names_to = "MaritalStatus", values_to = "count")
# separate into columns with demographics
#is this wrong? to separate the elements of the variable?
TidyNavy <- TidyNavy %>%
separate(MaritalStatus, c("MarStat", "Kids", "Sex", "Spouse"), sep = "_") %>%
separate(PayGrade, c("PayGrade", "PayLevel"), sep = "-") %>%
mutate("branch" = "Navy")
#join to the main list Tidy_ActiveDuty_MaritalStatus
Tidy_ActiveDuty_MaritalStatus <- bind_rows(list(Tidy_ActiveDuty_MaritalStatus, TidyNavy))

#that's enough...definitely see that advantage of writing a function to do that since there is a lot of room for copy paste error (and of course if there were many more sheets)

Reuse

Text and figures are licensed under Creative Commons Attribution CC BY-NC 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".

Citation

For attribution, please cite this work as

Merleaux (2022, March 30). Data Analytics and Computational Social Science: Challenge data. Retrieved from https://github.com/DACSS/dacss_course_website/posts/httprpubscomamerleaux882960/

BibTeX citation

@misc{merleaux2022challenge,
  author = {Merleaux, April},
  title = {Data Analytics and Computational Social Science: Challenge data},
  url = {https://github.com/DACSS/dacss_course_website/posts/httprpubscomamerleaux882960/},
  year = {2022}
}