Code
library(tidyverse)
library(tidyr)
library(dplyr)
library(ggplot2)
::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE) knitr
Erika Nagai
September 27, 2022
Installing useful packages
Today’s challenge is to:
pivot_longer
Read in one (or more) of the following datasets, using the correct R package and command.
I’m using “USA households” dataset.
# A tibble: 6 × 16
...1 ...2 Total Under…¹ $15,0…² $25,0…³ $35,0…⁴ $50,0…⁵ $75,0…⁶ $100,…⁷
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 ALL RACES <NA> NA NA NA NA NA NA NA NA
2 2019 128451 100 9.1 8 8.3 11.7 16.5 12.3 15.5
3 2018 128579 100 10.1 8.8 8.7 12 17 12.5 15
4 2017 2 127669 100 10 9.1 9.2 12 16.4 12.4 14.7
5 2017 127586 100 10.1 9.1 9.2 11.9 16.3 12.6 14.8
6 2016 126224 100 10.4 9 9.2 12.3 16.7 12.2 15
# … with 6 more variables: `$150,000\r\nto\r\n$199,999` <dbl>,
# `$200,000 and over` <dbl>, Estimate...13 <dbl>,
# `Margin of error1 (±)...14` <dbl>, Estimate...15 <chr>,
# `Margin of error1 (±)...16` <chr>, and abbreviated variable names
# ¹`Under $15,000`, ²`$15,000\r\nto\r\n$24,999`, ³`$25,000\r\nto\r\n$34,999`,
# ⁴`$35,000\r\nto\r\n$49,999`, ⁵`$50,000\r\nto\r\n$74,999`,
# ⁶`$75,000\r\nto\r\n$99,999`, ⁷`$100,000\r\nto\r\n$149,999`
This data is very dirty and hard to read. Before start analyzing it, let’s clean the data!
As
function (..., list = character(), package = NULL, lib.loc = NULL,
verbose = getOption("verbose"), envir = .GlobalEnv, overwrite = TRUE)
{
fileExt <- function(x) {
db <- grepl("\\.[^.]+\\.(gz|bz2|xz)$", x)
ans <- sub(".*\\.", "", x)
ans[db] <- sub(".*\\.([^.]+\\.)(gz|bz2|xz)$", "\\1\\2",
x[db])
ans
}
my_read_table <- function(...) {
lcc <- Sys.getlocale("LC_COLLATE")
on.exit(Sys.setlocale("LC_COLLATE", lcc))
Sys.setlocale("LC_COLLATE", "C")
read.table(...)
}
stopifnot(is.character(list))
names <- c(as.character(substitute(list(...))[-1L]), list)
if (!is.null(package)) {
if (!is.character(package))
stop("'package' must be a character vector or NULL")
}
paths <- find.package(package, lib.loc, verbose = verbose)
if (is.null(lib.loc))
paths <- c(path.package(package, TRUE), if (!length(package)) getwd(),
paths)
paths <- unique(normalizePath(paths[file.exists(paths)]))
paths <- paths[dir.exists(file.path(paths, "data"))]
dataExts <- tools:::.make_file_exts("data")
if (length(names) == 0L) {
db <- matrix(character(), nrow = 0L, ncol = 4L)
for (path in paths) {
entries <- NULL
packageName <- if (file_test("-f", file.path(path,
"DESCRIPTION")))
basename(path)
else "."
if (file_test("-f", INDEX <- file.path(path, "Meta",
"data.rds"))) {
entries <- readRDS(INDEX)
}
else {
dataDir <- file.path(path, "data")
entries <- tools::list_files_with_type(dataDir,
"data")
if (length(entries)) {
entries <- unique(tools::file_path_sans_ext(basename(entries)))
entries <- cbind(entries, "")
}
}
if (NROW(entries)) {
if (is.matrix(entries) && ncol(entries) == 2L)
db <- rbind(db, cbind(packageName, dirname(path),
entries))
else warning(gettextf("data index for package %s is invalid and will be ignored",
sQuote(packageName)), domain = NA, call. = FALSE)
}
}
colnames(db) <- c("Package", "LibPath", "Item", "Title")
footer <- if (missing(package))
paste0("Use ", sQuote(paste("data(package =", ".packages(all.available = TRUE))")),
"\n", "to list the data sets in all *available* packages.")
else NULL
y <- list(title = "Data sets", header = NULL, results = db,
footer = footer)
class(y) <- "packageIQR"
return(y)
}
paths <- file.path(paths, "data")
for (name in names) {
found <- FALSE
for (p in paths) {
tmp_env <- if (overwrite)
envir
else new.env()
if (file_test("-f", file.path(p, "Rdata.rds"))) {
rds <- readRDS(file.path(p, "Rdata.rds"))
if (name %in% names(rds)) {
found <- TRUE
if (verbose)
message(sprintf("name=%s:\t found in Rdata.rds",
name), domain = NA)
thispkg <- sub(".*/([^/]*)/data$", "\\1", p)
thispkg <- sub("_.*$", "", thispkg)
thispkg <- paste0("package:", thispkg)
objs <- rds[[name]]
lazyLoad(file.path(p, "Rdata"), envir = tmp_env,
filter = function(x) x %in% objs)
break
}
else if (verbose)
message(sprintf("name=%s:\t NOT found in names() of Rdata.rds, i.e.,\n\t%s\n",
name, paste(names(rds), collapse = ",")),
domain = NA)
}
if (file_test("-f", file.path(p, "Rdata.zip"))) {
warning("zipped data found for package ", sQuote(basename(dirname(p))),
".\nThat is defunct, so please re-install the package.",
domain = NA)
if (file_test("-f", fp <- file.path(p, "filelist")))
files <- file.path(p, scan(fp, what = "", quiet = TRUE))
else {
warning(gettextf("file 'filelist' is missing for directory %s",
sQuote(p)), domain = NA)
next
}
}
else {
files <- list.files(p, full.names = TRUE)
}
files <- files[grep(name, files, fixed = TRUE)]
if (length(files) > 1L) {
o <- match(fileExt(files), dataExts, nomatch = 100L)
paths0 <- dirname(files)
paths0 <- factor(paths0, levels = unique(paths0))
files <- files[order(paths0, o)]
}
if (length(files)) {
for (file in files) {
if (verbose)
message("name=", name, ":\t file= ...", .Platform$file.sep,
basename(file), "::\t", appendLF = FALSE,
domain = NA)
ext <- fileExt(file)
if (basename(file) != paste0(name, ".", ext))
found <- FALSE
else {
found <- TRUE
zfile <- file
zipname <- file.path(dirname(file), "Rdata.zip")
if (file.exists(zipname)) {
Rdatadir <- tempfile("Rdata")
dir.create(Rdatadir, showWarnings = FALSE)
topic <- basename(file)
rc <- .External(C_unzip, zipname, topic,
Rdatadir, FALSE, TRUE, FALSE, FALSE)
if (rc == 0L)
zfile <- file.path(Rdatadir, topic)
}
if (zfile != file)
on.exit(unlink(zfile))
switch(ext, R = , r = {
library("utils")
sys.source(zfile, chdir = TRUE, envir = tmp_env)
}, RData = , rdata = , rda = load(zfile,
envir = tmp_env), TXT = , txt = , tab = ,
tab.gz = , tab.bz2 = , tab.xz = , txt.gz = ,
txt.bz2 = , txt.xz = assign(name, my_read_table(zfile,
header = TRUE, as.is = FALSE), envir = tmp_env),
CSV = , csv = , csv.gz = , csv.bz2 = ,
csv.xz = assign(name, my_read_table(zfile,
header = TRUE, sep = ";", as.is = FALSE),
envir = tmp_env), found <- FALSE)
}
if (found)
break
}
if (verbose)
message(if (!found)
"*NOT* ", "found", domain = NA)
}
if (found)
break
}
if (!found) {
warning(gettextf("data set %s not found", sQuote(name)),
domain = NA)
}
else if (!overwrite) {
for (o in ls(envir = tmp_env, all.names = TRUE)) {
if (exists(o, envir = envir, inherits = FALSE))
warning(gettextf("an object named %s already exists and will not be overwritten",
sQuote(o)))
else assign(o, get(o, envir = tmp_env, inherits = FALSE),
envir = envir)
}
rm(tmp_env)
}
}
invisible(names)
}
<bytecode: 0x0000027b396d5ae8>
<environment: namespace:utils>
# A tibble: 383 × 16
Year Total…¹ Total Under…² $15,0…³ $25,0…⁴ $35,0…⁵ $50,0…⁶ $75,0…⁷ $100,…⁸
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 ALL RA… <NA> NA NA NA NA NA NA NA NA
2 2019 128451 100 9.1 8 8.3 11.7 16.5 12.3 15.5
3 2018 128579 100 10.1 8.8 8.7 12 17 12.5 15
4 2017 2 127669 100 10 9.1 9.2 12 16.4 12.4 14.7
5 2017 127586 100 10.1 9.1 9.2 11.9 16.3 12.6 14.8
6 2016 126224 100 10.4 9 9.2 12.3 16.7 12.2 15
7 2015 125819 100 10.6 10 9.6 12.1 16.1 12.4 14.9
8 2014 124587 100 11.4 10.5 9.6 12.6 16.4 12.1 14
9 2013 3 123931 100 11.4 10.3 9.5 12.5 16.8 12 13.9
10 2013 4 122952 100 11.3 10.4 9.7 13.1 17 12.5 13.6
# … with 373 more rows, 6 more variables: `$150,000\r\nto\r\n$199,999` <dbl>,
# `$200,000 and over` <dbl>, Median_Income_Estimate <dbl>,
# Median_Margin_Error <dbl>, Mean_Income_Estimate <chr>,
# Mean_Margin_Error <chr>, and abbreviated variable names ¹Total_Number,
# ²`Under $15,000`, ³`$15,000\r\nto\r\n$24,999`, ⁴`$25,000\r\nto\r\n$34,999`,
# ⁵`$35,000\r\nto\r\n$49,999`, ⁶`$50,000\r\nto\r\n$74,999`,
# ⁷`$75,000\r\nto\r\n$99,999`, ⁸`$100,000\r\nto\r\n$149,999`
# A tibble: 35 × 16
Year Total…¹ Total Under…² $15,0…³ $25,0…⁴ $35,0…⁵ $50,0…⁶ $75,0…⁷ $100,…⁸
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 "1975 … 2948 100 16.4 14.7 15 17.5 21.5 8.9 4.7
2 "1974 … 2897 100 13 15.4 13.8 18 22.1 10.4 5.7
3 "1973" 2722 100 12.2 14.2 14 18.8 22.3 11.4 5.8
4 "1972 … 2655 100 11.9 16 13.5 20.6 22.1 9.5 4.8
5 "N Not… <NA> NA NA NA NA NA NA NA NA
6 "1 A m… <NA> NA NA NA NA NA NA NA NA
7 "2 Est… <NA> NA NA NA NA NA NA NA NA
8 "3 The… <NA> NA NA NA NA NA NA NA NA
9 "4 The… <NA> NA NA NA NA NA NA NA NA
10 "5 Imp… <NA> NA NA NA NA NA NA NA NA
# … with 25 more rows, 6 more variables: `$150,000\r\nto\r\n$199,999` <dbl>,
# `$200,000 and over` <dbl>, Median_Income_Estimate <dbl>,
# Median_Margin_Error <dbl>, Mean_Income_Estimate <chr>,
# Mean_Margin_Error <chr>, and abbreviated variable names ¹Total_Number,
# ²`Under $15,000`, ³`$15,000\r\nto\r\n$24,999`, ⁴`$25,000\r\nto\r\n$34,999`,
# ⁵`$35,000\r\nto\r\n$49,999`, ⁶`$50,000\r\nto\r\n$74,999`,
# ⁷`$75,000\r\nto\r\n$99,999`, ⁸`$100,000\r\nto\r\n$149,999`
We can see that the last 31 rows are not part of data but notes, so we should drop them.
# A tibble: 6 × 16
Year Total…¹ Total Under…² $15,0…³ $25,0…⁴ $35,0…⁵ $50,0…⁶ $75,0…⁷ $100,…⁸
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1977 3304 100 13.7 14.8 14.1 18.2 20.8 10.6 6
2 1976 18 3081 100 16.2 15.3 13.6 18.1 20 10.3 5.1
3 1975 19 2948 100 16.4 14.7 15 17.5 21.5 8.9 4.7
4 1974 19… 2897 100 13 15.4 13.8 18 22.1 10.4 5.7
5 1973 2722 100 12.2 14.2 14 18.8 22.3 11.4 5.8
6 1972 21 2655 100 11.9 16 13.5 20.6 22.1 9.5 4.8
# … with 6 more variables: `$150,000\r\nto\r\n$199,999` <dbl>,
# `$200,000 and over` <dbl>, Median_Income_Estimate <dbl>,
# Median_Margin_Error <dbl>, Mean_Income_Estimate <chr>,
# Mean_Margin_Error <chr>, and abbreviated variable names ¹Total_Number,
# ²`Under $15,000`, ³`$15,000\r\nto\r\n$24,999`, ⁴`$25,000\r\nto\r\n$34,999`,
# ⁵`$35,000\r\nto\r\n$49,999`, ⁶`$50,000\r\nto\r\n$74,999`,
# ⁷`$75,000\r\nto\r\n$99,999`, ⁸`$100,000\r\nto\r\n$149,999`
# A tibble: 6 × 16
race Year Total…¹ Under…² $15,0…³ $25,0…⁴ $35,0…⁵ $50,0…⁶ $75,0…⁷ $100,…⁸
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 ALL RAC… 2019 128451 9.1 8 8.3 11.7 16.5 12.3 15.5
2 ALL RAC… 2018 128579 10.1 8.8 8.7 12 17 12.5 15
3 ALL RAC… 2017… 127669 10 9.1 9.2 12 16.4 12.4 14.7
4 ALL RAC… 2017 127586 10.1 9.1 9.2 11.9 16.3 12.6 14.8
5 ALL RAC… 2016 126224 10.4 9 9.2 12.3 16.7 12.2 15
6 ALL RAC… 2015 125819 10.6 10 9.6 12.1 16.1 12.4 14.9
# … with 6 more variables: `$150,000\r\nto\r\n$199,999` <dbl>,
# `$200,000 and over` <dbl>, Median_Income_Estimate <dbl>,
# Median_Margin_Error <dbl>, Mean_Income_Estimate <chr>,
# Mean_Margin_Error <chr>, and abbreviated variable names ¹Total_Number,
# ²`Under $15,000`, ³`$15,000\r\nto\r\n$24,999`, ⁴`$25,000\r\nto\r\n$34,999`,
# ⁵`$35,000\r\nto\r\n$49,999`, ⁶`$50,000\r\nto\r\n$74,999`,
# ⁷`$75,000\r\nto\r\n$99,999`, ⁸`$100,000\r\nto\r\n$149,999`
This data is about the annual income per U.S. household from 1967 to 2019 by the racial composition of that household. It shows (1) Distribution of the income and (2) Median income (3) Mean income.
The name of columns
[1] "race" "Year"
[3] "Total_Number" "Under $15,000"
[5] "$15,000\r\nto\r\n$24,999" "$25,000\r\nto\r\n$34,999"
[7] "$35,000\r\nto\r\n$49,999" "$50,000\r\nto\r\n$74,999"
[9] "$75,000\r\nto\r\n$99,999" "$100,000\r\nto\r\n$149,999"
[11] "$150,000\r\nto\r\n$199,999" "$200,000 and over"
[13] "Median_Income_Estimate" "Median_Margin_Error"
[15] "Mean_Income_Estimate" "Mean_Margin_Error"
Racial classifications include:
The current dataframe “data” includes several observations in the same row. Let’s make this dataframe longer.
# In order to use pivot_longer, the data type of the columns that will be combined need to be needs to be the same.
data$Total_Number <- as.numeric(data$Total_Number)
data$Mean_Income_Estimate <- as.numeric(data$Mean_Income_Estimate)
data$Mean_Margin_Error <- as.numeric(data$Mean_Margin_Error)
data$Median_Income_Estimate <- as.numeric(data$Median_Income_Estimate)
data$Median_Margin_Error <- as.numeric(data$Median_Margin_Error)
str(data$Total_Number)
num [1:340] 128451 128579 127669 127586 126224 ...
# A tibble: 4,760 × 4
race Year measure value
<chr> <chr> <chr> <dbl>
1 ALL RACES 2019 "Total_Number" 128451
2 ALL RACES 2019 "Under $15,000" 9.1
3 ALL RACES 2019 "$15,000\r\nto\r\n$24,999" 8
4 ALL RACES 2019 "$25,000\r\nto\r\n$34,999" 8.3
5 ALL RACES 2019 "$35,000\r\nto\r\n$49,999" 11.7
6 ALL RACES 2019 "$50,000\r\nto\r\n$74,999" 16.5
7 ALL RACES 2019 "$75,000\r\nto\r\n$99,999" 12.3
8 ALL RACES 2019 "$100,000\r\nto\r\n$149,999" 15.5
9 ALL RACES 2019 "$150,000\r\nto\r\n$199,999" 8.3
10 ALL RACES 2019 "$200,000 and over" 10.3
# … with 4,750 more rows
The dataframe is now cleaner, however it can be improved by the below ideas. * Replace “‘’” with “to” * The values in the following columns are proportion, not absolute number. So it may be better to consider them as one observation (because the total of these values is always 100) rather than consider these values as several observations. “Under $15,000” “$15,000$24,999” “$25,000$34,999” “$35,000$49,999” “$50,000$74,999”
“$75,000$99,999” “$100,000$149,999” “$150,000$199,999” “$200,000 and over”
---
title: "Challenge 3 Instructions"
author: "Erika Nagai"
desription: "Tidy Data: Pivoting"
date: "09/27/2022"
format:
html:
toc: true
code-fold: true
code-copy: true
code-tools: true
categories:
- challenge_3
- animal_weights
- eggs
- australian_marriage
- usa_households
- sce_labor
---
Installing useful packages
```{r}
#| label: setup
#| warning: false
#| message: false
library(tidyverse)
library(tidyr)
library(dplyr)
library(ggplot2)
knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)
```
## Challenge Overview
Today's challenge is to:
1. read in a data set, and describe the data set using both words and any supporting information (e.g., tables, etc)
2. identify what needs to be done to tidy the current data
3. anticipate the shape of pivoted data
4. pivot the data into tidy format using `pivot_longer`
## Read in data
Read in one (or more) of the following datasets, using the correct R package and command.
- animal_weights.csv ⭐
- eggs_tidy.csv ⭐⭐ or organiceggpoultry.xls ⭐⭐⭐
- australian_marriage\*.xls ⭐⭐⭐
- USA Households\*.xlsx ⭐⭐⭐⭐
- sce_labor_chart_data_public.xlsx 🌟🌟🌟🌟🌟
I'm using "USA households" dataset.
```{r}
library(readxl)
original_data <- read_excel("_data/USA Households by Total Money Income, Race, and Hispanic Origin of Householder 1967 to 2019.xlsx", skip = 4)
head(original_data)
```
This data is very dirty and hard to read. Before start analyzing it, let's clean the data!
## Anticipate the End Result
As
1. Let's clean column names first
```{r}
data
data <- original_data %>%
rename(
Year = ...1,
Total_Number = ...2,
Median_Income_Estimate = Estimate...13,
Median_Margin_Error = `Margin of error1 (±)...14`,
Mean_Income_Estimate = Estimate...15,
Mean_Margin_Error = `Margin of error1 (±)...16`)
data
#QUESTION: I manually renamed the column names instead of merging two rows ("Mean income (dollars)" + "Estimate") because I didn't find a way to do it. But this is not realistic if we have more columns. Any tips?
```
2. Let's look at the end of the data and clean if necessary.
```{r}
tail(data,35)
```
We can see that the last 31 rows are not part of data but notes, so we should drop them.
```{r}
data <- head(data, -31)
tail(data)
```
3. Clean "Year" column
* "Year" column includes the information of racial classification as well. Make a new column that can contains it.
* Change the order of columns
* Drop the rows that have only racial classification information
* Some values in the "Year" column includes the number for footnotes. Remove this number.
```{r}
data <- data %>%
mutate(race = str_extract(Year, "^(\\D)+")) %>% #extract only text data and create a new column that contains it
fill(race, .direction = 'down') %>%
select(race, Year, everything())
data <- data[!(is.na(data$Total_Number)),]
data <- data[, colnames(data)!= "Total"]
head(data)
```
### Briefly describe the data
This data is about the annual income per U.S. household from 1967 to 2019 by the racial composition of that household.
It shows (1) Distribution of the income and (2) Median income (3) Mean income.
The name of columns
```{r}
colnames(data)
```
Racial classifications include:
```{r}
unique(data$race)
```
### Challenge: Describe the final dimensions
The current dataframe "data" includes several observations in the same row.
Let's make this dataframe longer.
```{r}
# In order to use pivot_longer, the data type of the columns that will be combined need to be needs to be the same.
data$Total_Number <- as.numeric(data$Total_Number)
data$Mean_Income_Estimate <- as.numeric(data$Mean_Income_Estimate)
data$Mean_Margin_Error <- as.numeric(data$Mean_Margin_Error)
data$Median_Income_Estimate <- as.numeric(data$Median_Income_Estimate)
data$Median_Margin_Error <- as.numeric(data$Median_Margin_Error)
str(data$Total_Number)
long_data <- pivot_longer(data, col = c(3:16),
names_to = "measure",
values_to = "value"
)
long_data
```
The dataframe is now cleaner, however it can be improved by the below ideas.
* Replace "'\r\nto\r\n'" with "to"
* The values in the following columns are proportion, not absolute number. So it may be better to consider them as one observation (because the total of these values is always 100) rather than consider these values as several observations.
"Under $15,000" "$15,000\r\nto\r\n$24,999" "$25,000\r\nto\r\n$34,999" "$35,000\r\nto\r\n$49,999" "$50,000\r\nto\r\n$74,999"
"$75,000\r\nto\r\n$99,999" "$100,000\r\nto\r\n$149,999" "$150,000\r\nto\r\n$199,999" "$200,000 and over"