library(tidyverse)
library(ggplot2)
library(skimr)
library(summarytools)
library(readxl)
if(!require(janitor))
install.packages("janitor",repos = "https://cran.us.r-project.org")
library(purrr)
::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE) knitr
Challenge 7
Read in data
Reading in the ‘australian_marriage_tidy.csv’ dataset.
<- read_csv("_data/australian_marriage_tidy.csv") australian_marriage
Briefly describe the data
print(summarytools::dfSummary(australian_marriage, varnumbers = FALSE, plain.ascii = FALSE, graph.magnif = 0.50, style = "grid", valid.col = FALSE),
method = 'render', table.classes = 'table-condensed')
Data Frame Summary
australian_marriage
Dimensions: 16 x 4Duplicates: 0
Variable | Stats / Values | Freqs (% of Valid) | Graph | Missing | ||||||||||||||||||||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
territory [character] |
|
|
0 (0.0%) | |||||||||||||||||||||||||||||||||||||||||
resp [character] |
|
|
0 (0.0%) | |||||||||||||||||||||||||||||||||||||||||
count [numeric] |
|
16 distinct values | 0 (0.0%) | |||||||||||||||||||||||||||||||||||||||||
percent [numeric] |
|
16 distinct values | 0 (0.0%) |
Generated by summarytools 1.0.1 (R version 4.2.1)
2022-08-31
skim(australian_marriage)
Name | australian_marriage |
Number of rows | 16 |
Number of columns | 4 |
_______________________ | |
Column type frequency: | |
character | 2 |
numeric | 2 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
territory | 0 | 1 | 8 | 31 | 0 | 8 | 0 |
resp | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
count | 0 | 1 | 793202.1 | 778436.94 | 31690 | 159008.00 | 524226 | 1242588.50 | 2374362 | ▇▃▁▂▂ |
percent | 0 | 1 | 50.0 | 14.67 | 26 | 37.23 | 50 | 62.77 | 74 | ▂▇▁▇▂ |
This dataset has 16 rows and 4 columns, of which 2 are character type and 2 are numeric. There are no missing values. It describes the number of people who support/don’t support same sex marriage in 8 Australian regions.
Tidy Data
The entries in the ‘territory’ column can be turned into factors to aid visualization. Also, the ‘resp’ column values of “yes” and “no” can be converted to “2” and “1” respectively.
$territory <- as.factor(australian_marriage$territory)
australian_marriageclass(australian_marriage$territory)
[1] "factor"
$resp <- as.numeric(as.factor(australian_marriage$resp)) australian_marriage
Visualization with Multiple Dimensions
I used a pie chart to figure out the proportion of married to not-married people in the different Australian states.
$resp <- as.factor(australian_marriage$resp)
australian_marriage
<- ggplot(australian_marriage, aes(x="", y=percent, fill=resp)) +
marriage_pie geom_bar(stat="identity", width=1) +
coord_polar("y", start=0) + labs(title = "Support for Same-Sex Marriage Across Australian States") + theme_void() +
theme(strip.text = element_text(size = 5)) + guides(fill=guide_legend(title="Responses")) + scale_fill_manual(values = unique(australian_marriage$resp), labels = c("No","Yes")) + geom_text(aes(label = paste0(round(percent), "%")),
position = position_stack(vjust = 0.5))
<- marriage_pie+facet_wrap(~ territory)
marriage_pie marriage_pie
The Australian Capital Territory (c) overall seems to support same-sex marriage the most, while New South Wales has lesser people that support it.
ABC Poll
Read in data
<- read_csv("_data/abc_poll_2021.csv") abc_poll
Briefly describe the data
print(summarytools::dfSummary(abc_poll, varnumbers = FALSE, plain.ascii = FALSE, graph.magnif = 0.50, style = "grid", valid.col = FALSE),
method = 'render', table.classes = 'table-condensed')
Data Frame Summary
abc_poll
Dimensions: 527 x 31Duplicates: 0
Variable | Stats / Values | Freqs (% of Valid) | Graph | Missing | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
id [numeric] |
|
527 distinct values | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
xspanish [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
complete_status [character] | 1. qualified |
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
ppage [numeric] |
|
72 distinct values | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
ppeduc5 [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
ppeducat [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
ppgender [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
ppethm [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
pphhsize [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
ppinc7 [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
ppmarit5 [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
ppmsacat [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
ppreg4 [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
pprent [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
ppstaten [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
PPWORKA [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
ppemploy [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
Q1_a [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
Q1_b [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
Q1_c [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
Q1_d [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
Q1_e [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
Q1_f [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
Q2 [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
Q3 [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
Q4 [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
Q5 [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
QPID [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
ABCAGE [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
Contact [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
weights_pid [numeric] |
|
453 distinct values | 0 (0.0%) |
Generated by summarytools 1.0.1 (R version 4.2.1)
2022-08-31
skim(abc_poll)
Name | abc_poll |
Number of rows | 527 |
Number of columns | 31 |
_______________________ | |
Column type frequency: | |
character | 28 |
numeric | 3 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
xspanish | 0 | 1 | 7 | 7 | 0 | 2 | 0 |
complete_status | 0 | 1 | 9 | 9 | 0 | 1 | 0 |
ppeduc5 | 0 | 1 | 29 | 64 | 0 | 5 | 0 |
ppeducat | 0 | 1 | 11 | 26 | 0 | 4 | 0 |
ppgender | 0 | 1 | 4 | 6 | 0 | 2 | 0 |
ppethm | 0 | 1 | 8 | 22 | 0 | 5 | 0 |
pphhsize | 0 | 1 | 1 | 9 | 0 | 6 | 0 |
ppinc7 | 0 | 1 | 16 | 20 | 0 | 7 | 0 |
ppmarit5 | 0 | 1 | 7 | 13 | 0 | 5 | 0 |
ppmsacat | 0 | 1 | 10 | 14 | 0 | 2 | 0 |
ppreg4 | 0 | 1 | 4 | 9 | 0 | 4 | 0 |
pprent | 0 | 1 | 15 | 57 | 0 | 3 | 0 |
ppstaten | 0 | 1 | 4 | 20 | 0 | 49 | 0 |
PPWORKA | 0 | 1 | 5 | 36 | 0 | 9 | 0 |
ppemploy | 0 | 1 | 11 | 17 | 0 | 3 | 0 |
Q1_a | 0 | 1 | 7 | 10 | 0 | 3 | 0 |
Q1_b | 0 | 1 | 7 | 10 | 0 | 3 | 0 |
Q1_c | 0 | 1 | 7 | 10 | 0 | 3 | 0 |
Q1_d | 0 | 1 | 7 | 10 | 0 | 3 | 0 |
Q1_e | 0 | 1 | 7 | 10 | 0 | 3 | 0 |
Q1_f | 0 | 1 | 7 | 10 | 0 | 3 | 0 |
Q2 | 0 | 1 | 14 | 20 | 0 | 4 | 0 |
Q3 | 0 | 1 | 2 | 7 | 0 | 3 | 0 |
Q4 | 0 | 1 | 4 | 11 | 0 | 5 | 0 |
Q5 | 0 | 1 | 7 | 11 | 0 | 3 | 0 |
QPID | 0 | 1 | 7 | 14 | 0 | 5 | 0 |
ABCAGE | 0 | 1 | 3 | 5 | 0 | 4 | 0 |
Contact | 0 | 1 | 35 | 38 | 0 | 2 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
id | 0 | 1 | 7230264.00 | 152.28 | 7230001.00 | 7230132.50 | 7230264.00 | 7230395.50 | 7230527.00 | ▇▇▇▇▇ |
ppage | 0 | 1 | 53.39 | 17.10 | 18.00 | 40.00 | 55.00 | 67.00 | 91.00 | ▅▆▇▇▂ |
weights_pid | 0 | 1 | 1.00 | 0.60 | 0.32 | 0.63 | 0.85 | 1.15 | 6.26 | ▇▁▁▁▁ |
This dataset seems to be a national sample survey from 2019 that includes questions related to respondents’ political attitudes and demographics, along with some survey administration questions. There are 527 rows and 31 columns in this dataset, of which 28 are character type and 3 are numeric. No missing values are present. ppeducat
seems to have more complete information than ppeduc5
, so I’ll be using this column for further visualization of respondents’ educational background.
Tidy Data
First, I’ll be ordering ppeducat
values by increasing educational level instead of alphabetically.
<- unique(abc_poll$ppeducat)
education_levels education_levels
[1] "High school" "Bachelors degree or higher"
[3] "Some college" "Less than high school"
<- abc_poll %>%
abc_poll_new mutate(education = factor(ppeducat, levels = education_levels[c(4,1,3,2)])) %>%
select(-ppeducat)
table(abc_poll_new$education)
Less than high school High school
29 133
Some college Bachelors degree or higher
158 207
unique(abc_poll$QPID)
[1] "A Democrat" "An Independent" "Something else" "A Republican"
[5] "Skipped"
# removing extra characters such as "A" and "An"
<- abc_poll_new %>%
abc_poll_new mutate(partyid = str_remove(QPID, "A[n]* "),
partyid = na_if(partyid, "Skipped")) %>%
select(-QPID)
table(abc_poll_new$partyid)
Democrat Independent Republican Something else
176 168 152 28
Then, I replaced all “Skipped” entries across columns with NA:
<-abc_poll_new%>%
abc_poll_new mutate(across(starts_with("Q"), ~ na_if(.x, "Skipped")))
map(select(abc_poll_new, starts_with("Q")), table)
$Q1_a
Approve Disapprove
329 193
$Q1_b
Approve Disapprove
192 322
$Q1_c
Approve Disapprove
272 248
$Q1_d
Approve Disapprove
192 321
$Q1_e
Approve Disapprove
212 301
$Q1_f
Approve Disapprove
281 230
$Q2
Not concerned at all Not so concerned Somewhat concerned
65 147 221
Very concerned
94
$Q3
No Yes
107 415
$Q4
Excellent Good Not so good Poor
60 215 97 149
$Q5
Optimistic Pessimistic
229 295
Finally, I shortened some entries in the ppethm
column by removing the redundant “Non-Hispanic” part.
unique(abc_poll_new$ppethm)
[1] "White, Non-Hispanic" "Other, Non-Hispanic" "Black, Non-Hispanic"
[4] "Hispanic" "2+ Races, Non-Hispanic"
# shortening
<- abc_poll_new %>%
abc_poll_new mutate(ethnicity = str_remove(ppethm,", Non-Hispanic")) %>%
select(-ppethm)
unique(abc_poll_new$ethnicity)
[1] "White" "Other" "Black" "Hispanic" "2+ Races"
Visualization with Multiple Dimensions
First, I look at the distribution of party affiliations by age using geom_point()
and percentage labels for easy comparison:
<- abc_poll_new %>%
abc_poll_viz select(partyid,ABCAGE) %>%
group_by(partyid,ABCAGE) %>%
tally()
abc_poll_viz
# A tibble: 19 × 3
# Groups: partyid [5]
partyid ABCAGE n
<chr> <chr> <int>
1 Democrat 18-29 23
2 Democrat 30-49 54
3 Democrat 50-64 49
4 Democrat 65+ 50
5 Independent 18-29 20
6 Independent 30-49 44
7 Independent 50-64 50
8 Independent 65+ 54
9 Republican 18-29 10
10 Republican 30-49 41
11 Republican 50-64 52
12 Republican 65+ 49
13 Something else 18-29 6
14 Something else 30-49 8
15 Something else 50-64 6
16 Something else 65+ 8
17 <NA> 18-29 1
18 <NA> 30-49 1
19 <NA> 65+ 1
<- ggplot(abc_poll_viz, aes(partyid, ABCAGE)) + geom_point(aes(size = n), colour = "orange") + xlab("") + ylab("")
abc_table + scale_size_continuous(range=c(10,20)) + geom_text(aes(label = scales::percent(n/sum(n), accuracy = .1, trim = FALSE)), size = 3) + theme(panel.background=element_blank(), panel.grid.major = element_line(color = "red", size = 0.5),legend.position="none") + labs(title = "Distribution of Party Affiliations by Age") abc_table
Then, I wanted to see how the proportion of people who were optimistic about the next 12 months with the current party by age and party affiliation, but I wasn’t able to properly visualize it beyond these tables:
<- abc_poll_new %>%
abc_poll_table tabyl(partyid,ABCAGE,Q5, show_missing_levels = FALSE) %>%
adorn_totals("row") %>%
adorn_percentages("all") %>%
adorn_pct_formatting(digits = 1) %>%
%>%
adorn_ns adorn_title()
abc_poll_table
$Optimistic
ABCAGE
partyid 18-29 30-49 50-64 65+
Democrat 7.4% (17) 15.3% (35) 17.5% (40) 17.5% (40)
Independent 2.6% (6) 7.4% (17) 8.7% (20) 7.4% (17)
Republican 0.9% (2) 3.1% (7) 3.9% (9) 3.5% (8)
Something else 1.3% (3) 0.9% (2) 1.3% (3) 0.9% (2)
<NA> 0.4% (1) 0.0% (0) 0.0% (0) 0.0% (0)
Total 12.7% (29) 26.6% (61) 31.4% (72) 29.3% (67)
$Pessimistic
ABCAGE
partyid 18-29 30-49 50-64 65+
Democrat 2.0% (6) 6.4% (19) 3.1% (9) 3.1% (9)
Independent 4.7% (14) 9.2% (27) 10.2% (30) 12.5% (37)
Republican 2.7% (8) 11.5% (34) 14.6% (43) 13.9% (41)
Something else 1.0% (3) 2.0% (6) 1.0% (3) 1.7% (5)
<NA> 0.0% (0) 0.3% (1) 0.0% (0) 0.0% (0)
Total 10.5% (31) 29.5% (87) 28.8% (85) 31.2% (92)
$NA_
ABCAGE
partyid 65+
Democrat 33.3% (1)
Something else 33.3% (1)
<NA> 33.3% (1)
Total 100.0% (3)
:::