I went back to try to tidy some of the untidy data
Australian Marriage Survey
#Australian marriage survey data; just doing Table 1
#response to the question"Should the law be changed to allow same-sex couples to marry?"
library(readxl)
library(tidyverse)
excel_sheets("~/DATA/Classwork/Copy of Copy of australian_marriage_law_postal_survey_2017_response_final.xlsx")
[1] "Contents" "Table 1" "Table 2"
[4] "Explanatory Notes"
ausMar <- read_excel("~/DATA/Classwork/Copy of Copy of australian_marriage_law_postal_survey_2017_response_final.xlsx", sheet = 2, skip = 6, col_types = c("text", "numeric", "skip", "numeric", "skip", "skip", "skip", "skip", "skip", "skip", "numeric", "skip", "numeric", "skip", "skip", "skip"), n_max = 9)
colnames(ausMar) <- c("Territory", "Yes", "No", "Unclear", "No Response")
ausMar1 <- ausMar %>%
rowwise() %>%
mutate(NoResp = `Unclear` + `No Response`) %>%
select(-`Unclear`, -`No Response`) %>%
pivot_longer(cols = `Yes`:`NoResp`, names_to = "response") %>%
filter(`Territory` != "Australia")
ausMar1$Territory <- str_remove(ausMar1$Territory, "\\(.\\)")
ausMar1
# A tibble: 24 x 3
Territory response value
<chr> <chr> <dbl>
1 New South Wales Yes 2374362
2 New South Wales No 1736838
3 New South Wales NoResp 1076481
4 Victoria Yes 2145629
5 Victoria No 1161098
6 Victoria NoResp 754662
7 Queensland Yes 1487060
8 Queensland No 961015
9 Queensland NoResp 702798
10 South Australia Yes 592528
# ... with 14 more rows
ausMarSumm <- ausMar1 %>%
group_by(response) %>%
summarize(votes = sum(value))
ausMar1 %>%
group_by(Territory) %>%
ggplot(data = ausMar1, mapping = aes(x = response, y = value)) +
geom_col()
ausMar1 %>%
ggplot(data = ausMar1, mapping = aes(x = Territory, y = value, fill = response)) +
geom_col() +
coord_flip()
Railroad Employment
#total railroad employment by state and county
library(readxl)
excel_sheets("~/DATA/Classwork/CopyStateCounty2012.xlsx")
[1] "StateCounty"
railroad <- read_excel("~/DATA/Classwork/CopyStateCounty2012.xlsx", skip = 4, col_types = c("text", "skip", "text", "skip", "numeric"))
colnames(railroad) <- c("State", "County", "Total")
railroad <- railroad %>%
drop_na(County) %>%
filter(County != "AP", State != "AP")
railroad
# A tibble: 2,928 x 3
State County Total
<chr> <chr> <dbl>
1 AK ANCHORAGE 7
2 AK FAIRBANKS NORTH STAR 2
3 AK JUNEAU 3
4 AK MATANUSKA-SUSITNA 2
5 AK SITKA 1
6 AK SKAGWAY MUNICIPALITY 88
7 AL AUTAUGA 102
8 AL BALDWIN 143
9 AL BARBOUR 1
10 AL BIBB 25
# ... with 2,918 more rows
Active Duty Military Marriage Status
variables Marital Status (Married, Single) children (Yes, No, NA) Pay grade (24 levels) gender (M, F) spouse (Military, Civilian, None)
#active duty military; just doing the Total DoD sheet for now and not trying to create a function to loop through as in the blog example
library(readxl)
excel_sheets("ActiveDuty_MaritalStatus.xls")
[1] "TotalDoD" "AirForce" "MarineCorps" "Navy"
[5] "Army"
Active <- read_excel("ActiveDuty_MaritalStatus.xls",
col_types = c("skip", "text", "numeric",
"numeric", "skip", "numeric", "numeric", "skip",
"numeric", "numeric", "skip", "numeric", "numeric",
"skip", "skip", "skip", "skip"),
skip = 8, sheet = "TotalDoD")
#rename columns with a _ separating the parts that will eventually be turned into grouping variables
colnames(Active) <- c("PayGrade",
"Single_No_Male_None",
"Single_No_Female_None",
"Single_Yes_Male_None",
"Single_Yes_Female_None",
"Married_NA_Male_Military",
"Married_NA_Female_Military",
"Married_NA_Male_Civilian",
"Married_NA_Female_Civilian")
#NA PayGrade if it has the word TOTAL in it and then drop_na to drop rows with NA (this two moves get rid of a bunch of random junk)
#TidierActive <- Active %>% mutate(across(c(PayGrade), ~ replace(., str_detect(., "TOTAL"), NA)))
#TidierActive <- TidierActive %>% drop_na(PayGrade)
#works better
TidierActive <- filter(Active, str_detect(PayGrade, "TOTAL", negate=TRUE))
#pivot longer
ActivePivot1 <- TidierActive %>%
pivot_longer(!PayGrade, names_to = "MaritalStatus", values_to = "count")
# separate into columns with demographics
#is this wrong? to separate the elements of the variable?
ActivePivot1 <- ActivePivot1 %>%
separate(MaritalStatus, c("MarStat", "Kids", "Sex", "Spouse"), sep = "_") %>%
separate(PayGrade, c("PayGrade", "PayLevel"), sep = "-")
ActivePivot1
# A tibble: 192 x 7
PayGrade PayLevel MarStat Kids Sex Spouse count
<chr> <chr> <chr> <chr> <chr> <chr> <dbl>
1 E 1 Single No Male None 31229
2 E 1 Single No Female None 5717
3 E 1 Single Yes Male None 563
4 E 1 Single Yes Female None 122
5 E 1 Married NA Male Military 139
6 E 1 Married NA Female Military 141
7 E 1 Married NA Male Civilian 5060
8 E 1 Married NA Female Civilian 719
9 E 2 Single No Male None 53094
10 E 2 Single No Female None 8388
# ... with 182 more rows
#crosstab
WageSex <- ActivePivot1 %>%
group_by(Sex, PayGrade) %>%
tally(count) %>%
spread(Sex, n)
WageMarital <- ActivePivot1 %>%
group_by(MarStat, PayGrade) %>%
tally(count) %>%
spread(MarStat, n)
#basic visual
ggplot(data = ActivePivot1) +
geom_col(mapping = aes(x = MarStat, y = count, fill = Sex))
#I am not going to try to loop through the other sheets, but I am going to try to repeat the whole thing with another sheet
#read in AirForce sheet
AirForce <- read_excel("ActiveDuty_MaritalStatus.xls",
col_types = c("skip", "text", "numeric",
"numeric", "skip", "numeric", "numeric", "skip",
"numeric", "numeric", "skip", "numeric", "numeric",
"skip", "skip", "skip", "skip"),
skip = 8, sheet = "AirForce")
#rename columns with a _ separating the parts that will eventually be turned into grouping variables
colnames(AirForce) <- c("PayGrade",
"Single_No_Male_None",
"Single_No_Female_None",
"Single_Yes_Male_None",
"Single_Yes_Female_None",
"Married_NA_Male_Military",
"Married_NA_Female_Military",
"Married_NA_Male_Civilian",
"Married_NA_Female_Civilian")
#drop row if PayGrade has the word TOTAL in it (also drops rows that are all NAs, not sure why)
TidierAirForce <- filter(AirForce, str_detect(PayGrade, "TOTAL", negate=TRUE))
#pivot longer
TidyAirForce <- TidierAirForce %>%
pivot_longer(!PayGrade, names_to = "MaritalStatus", values_to = "count")
# separate into columns with demographics
#is this wrong? to separate the elements of the variable?
TidyAirForce <- TidyAirForce %>%
separate(MaritalStatus, c("MarStat", "Kids", "Sex", "Spouse"), sep = "_") %>%
separate(PayGrade, c("PayGrade", "PayLevel"), sep = "-") %>%
mutate("branch" = "Air Force")
# create an empty tibble to hold this data, to which I can add the other sheets
#create a vector of the column names
columnsMilMar <- colnames(TidyAirForce)
# this is a dumb way to do it
#PayGradesMaritalStatus_All <- as_tibble(data.frame(matrix(nrow=0,ncol=length(columnsMilMar), dimnames = list(c(), c(columnsMilMar)))))
#better but I don't really understand it lol
Tidy_ActiveDuty_MaritalStatus <- columnsMilMar %>% purrr::map_dfc(setNames, object = list(character()))
Tidy_ActiveDuty_MaritalStatus$count <- as.numeric(Tidy_ActiveDuty_MaritalStatus$count)
#join to the main list Tidy_ActiveDuty_MaritalStatus
Tidy_ActiveDuty_MaritalStatus <- bind_rows(list(Tidy_ActiveDuty_MaritalStatus, TidyAirForce))
#repeat for the next branch
#read in MarineCorps sheet
MarineCorps <- read_excel("ActiveDuty_MaritalStatus.xls",
col_types = c("skip", "text", "numeric",
"numeric", "skip", "numeric", "numeric", "skip",
"numeric", "numeric", "skip", "numeric", "numeric",
"skip", "skip", "skip", "skip"),
skip = 8, sheet = "MarineCorps")
#rename columns with a _ separating the parts that will eventually be turned into grouping variables
colnames(MarineCorps) <- c("PayGrade",
"Single_No_Male_None",
"Single_No_Female_None",
"Single_Yes_Male_None",
"Single_Yes_Female_None",
"Married_NA_Male_Military",
"Married_NA_Female_Military",
"Married_NA_Male_Civilian",
"Married_NA_Female_Civilian")
#drop row if PayGrade has the word TOTAL in it (also drops rows that are all NAs, not sure why)
TidierMarineCorps <- filter(MarineCorps, str_detect(PayGrade, "TOTAL", negate=TRUE))
#pivot longer
TidyMarineCorps <- TidierMarineCorps %>%
pivot_longer(!PayGrade, names_to = "MaritalStatus", values_to = "count")
# separate into columns with demographics
#is this wrong? to separate the elements of the variable?
TidyMarineCorps <- TidyMarineCorps %>%
separate(MaritalStatus, c("MarStat", "Kids", "Sex", "Spouse"), sep = "_") %>%
separate(PayGrade, c("PayGrade", "PayLevel"), sep = "-") %>%
mutate("branch" = "MarineCorps")
#join to the main list Tidy_ActiveDuty_MaritalStatus
Tidy_ActiveDuty_MaritalStatus <- bind_rows(list(Tidy_ActiveDuty_MaritalStatus, TidyMarineCorps))
#and again...
excel_sheets("ActiveDuty_MaritalStatus.xls")
[1] "TotalDoD" "AirForce" "MarineCorps" "Navy"
[5] "Army"
Navy <- read_excel("ActiveDuty_MaritalStatus.xls",
col_types = c("skip", "text", "numeric",
"numeric", "skip", "numeric", "numeric", "skip",
"numeric", "numeric", "skip", "numeric", "numeric",
"skip", "skip", "skip", "skip"),
skip = 8, sheet = "Navy")
#rename columns with a _ separating the parts that will eventually be turned into grouping variables
colnames(Navy) <- c("PayGrade",
"Single_No_Male_None",
"Single_No_Female_None",
"Single_Yes_Male_None",
"Single_Yes_Female_None",
"Married_NA_Male_Military",
"Married_NA_Female_Military",
"Married_NA_Male_Civilian",
"Married_NA_Female_Civilian")
#drop row if PayGrade has the word TOTAL in it (also drops rows that are all NAs, not sure why)
TidierNavy <- filter(Navy, str_detect(PayGrade, "TOTAL", negate=TRUE))
#pivot longer
TidyNavy <- TidierNavy %>%
pivot_longer(!PayGrade, names_to = "MaritalStatus", values_to = "count")
# separate into columns with demographics
#is this wrong? to separate the elements of the variable?
TidyNavy <- TidyNavy %>%
separate(MaritalStatus, c("MarStat", "Kids", "Sex", "Spouse"), sep = "_") %>%
separate(PayGrade, c("PayGrade", "PayLevel"), sep = "-") %>%
mutate("branch" = "Navy")
#join to the main list Tidy_ActiveDuty_MaritalStatus
Tidy_ActiveDuty_MaritalStatus <- bind_rows(list(Tidy_ActiveDuty_MaritalStatus, TidyNavy))
#that's enough...definitely see that advantage of writing a function to do that since there is a lot of room for copy paste error (and of course if there were many more sheets)
Text and figures are licensed under Creative Commons Attribution CC BY-NC 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".
For attribution, please cite this work as
Merleaux (2022, March 30). Data Analytics and Computational Social Science: Challenge data. Retrieved from https://github.com/DACSS/dacss_course_website/posts/httprpubscomamerleaux882960/
BibTeX citation
@misc{merleaux2022challenge, author = {Merleaux, April}, title = {Data Analytics and Computational Social Science: Challenge data}, url = {https://github.com/DACSS/dacss_course_website/posts/httprpubscomamerleaux882960/}, year = {2022} }