Code
library(tidyverse)
library(tidyr)
library(dplyr)
library(ggplot2)
knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)Erika Nagai
September 27, 2022
Installing useful packages
Today’s challenge is to:
pivot_longerRead in one (or more) of the following datasets, using the correct R package and command.
I’m using “USA households” dataset.
# A tibble: 6 × 16
  ...1      ...2   Total Under…¹ $15,0…² $25,0…³ $35,0…⁴ $50,0…⁵ $75,0…⁶ $100,…⁷
  <chr>     <chr>  <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
1 ALL RACES <NA>      NA    NA      NA      NA      NA      NA      NA      NA  
2 2019      128451   100     9.1     8       8.3    11.7    16.5    12.3    15.5
3 2018      128579   100    10.1     8.8     8.7    12      17      12.5    15  
4 2017 2    127669   100    10       9.1     9.2    12      16.4    12.4    14.7
5 2017      127586   100    10.1     9.1     9.2    11.9    16.3    12.6    14.8
6 2016      126224   100    10.4     9       9.2    12.3    16.7    12.2    15  
# … with 6 more variables: `$150,000\r\nto\r\n$199,999` <dbl>,
#   `$200,000 and over` <dbl>, Estimate...13 <dbl>,
#   `Margin of error1 (±)...14` <dbl>, Estimate...15 <chr>,
#   `Margin of error1 (±)...16` <chr>, and abbreviated variable names
#   ¹`Under $15,000`, ²`$15,000\r\nto\r\n$24,999`, ³`$25,000\r\nto\r\n$34,999`,
#   ⁴`$35,000\r\nto\r\n$49,999`, ⁵`$50,000\r\nto\r\n$74,999`,
#   ⁶`$75,000\r\nto\r\n$99,999`, ⁷`$100,000\r\nto\r\n$149,999`This data is very dirty and hard to read. Before start analyzing it, let’s clean the data!
As
function (..., list = character(), package = NULL, lib.loc = NULL, 
    verbose = getOption("verbose"), envir = .GlobalEnv, overwrite = TRUE) 
{
    fileExt <- function(x) {
        db <- grepl("\\.[^.]+\\.(gz|bz2|xz)$", x)
        ans <- sub(".*\\.", "", x)
        ans[db] <- sub(".*\\.([^.]+\\.)(gz|bz2|xz)$", "\\1\\2", 
            x[db])
        ans
    }
    my_read_table <- function(...) {
        lcc <- Sys.getlocale("LC_COLLATE")
        on.exit(Sys.setlocale("LC_COLLATE", lcc))
        Sys.setlocale("LC_COLLATE", "C")
        read.table(...)
    }
    stopifnot(is.character(list))
    names <- c(as.character(substitute(list(...))[-1L]), list)
    if (!is.null(package)) {
        if (!is.character(package)) 
            stop("'package' must be a character vector or NULL")
    }
    paths <- find.package(package, lib.loc, verbose = verbose)
    if (is.null(lib.loc)) 
        paths <- c(path.package(package, TRUE), if (!length(package)) getwd(), 
            paths)
    paths <- unique(normalizePath(paths[file.exists(paths)]))
    paths <- paths[dir.exists(file.path(paths, "data"))]
    dataExts <- tools:::.make_file_exts("data")
    if (length(names) == 0L) {
        db <- matrix(character(), nrow = 0L, ncol = 4L)
        for (path in paths) {
            entries <- NULL
            packageName <- if (file_test("-f", file.path(path, 
                "DESCRIPTION"))) 
                basename(path)
            else "."
            if (file_test("-f", INDEX <- file.path(path, "Meta", 
                "data.rds"))) {
                entries <- readRDS(INDEX)
            }
            else {
                dataDir <- file.path(path, "data")
                entries <- tools::list_files_with_type(dataDir, 
                  "data")
                if (length(entries)) {
                  entries <- unique(tools::file_path_sans_ext(basename(entries)))
                  entries <- cbind(entries, "")
                }
            }
            if (NROW(entries)) {
                if (is.matrix(entries) && ncol(entries) == 2L) 
                  db <- rbind(db, cbind(packageName, dirname(path), 
                    entries))
                else warning(gettextf("data index for package %s is invalid and will be ignored", 
                  sQuote(packageName)), domain = NA, call. = FALSE)
            }
        }
        colnames(db) <- c("Package", "LibPath", "Item", "Title")
        footer <- if (missing(package)) 
            paste0("Use ", sQuote(paste("data(package =", ".packages(all.available = TRUE))")), 
                "\n", "to list the data sets in all *available* packages.")
        else NULL
        y <- list(title = "Data sets", header = NULL, results = db, 
            footer = footer)
        class(y) <- "packageIQR"
        return(y)
    }
    paths <- file.path(paths, "data")
    for (name in names) {
        found <- FALSE
        for (p in paths) {
            tmp_env <- if (overwrite) 
                envir
            else new.env()
            if (file_test("-f", file.path(p, "Rdata.rds"))) {
                rds <- readRDS(file.path(p, "Rdata.rds"))
                if (name %in% names(rds)) {
                  found <- TRUE
                  if (verbose) 
                    message(sprintf("name=%s:\t found in Rdata.rds", 
                      name), domain = NA)
                  thispkg <- sub(".*/([^/]*)/data$", "\\1", p)
                  thispkg <- sub("_.*$", "", thispkg)
                  thispkg <- paste0("package:", thispkg)
                  objs <- rds[[name]]
                  lazyLoad(file.path(p, "Rdata"), envir = tmp_env, 
                    filter = function(x) x %in% objs)
                  break
                }
                else if (verbose) 
                  message(sprintf("name=%s:\t NOT found in names() of Rdata.rds, i.e.,\n\t%s\n", 
                    name, paste(names(rds), collapse = ",")), 
                    domain = NA)
            }
            if (file_test("-f", file.path(p, "Rdata.zip"))) {
                warning("zipped data found for package ", sQuote(basename(dirname(p))), 
                  ".\nThat is defunct, so please re-install the package.", 
                  domain = NA)
                if (file_test("-f", fp <- file.path(p, "filelist"))) 
                  files <- file.path(p, scan(fp, what = "", quiet = TRUE))
                else {
                  warning(gettextf("file 'filelist' is missing for directory %s", 
                    sQuote(p)), domain = NA)
                  next
                }
            }
            else {
                files <- list.files(p, full.names = TRUE)
            }
            files <- files[grep(name, files, fixed = TRUE)]
            if (length(files) > 1L) {
                o <- match(fileExt(files), dataExts, nomatch = 100L)
                paths0 <- dirname(files)
                paths0 <- factor(paths0, levels = unique(paths0))
                files <- files[order(paths0, o)]
            }
            if (length(files)) {
                for (file in files) {
                  if (verbose) 
                    message("name=", name, ":\t file= ...", .Platform$file.sep, 
                      basename(file), "::\t", appendLF = FALSE, 
                      domain = NA)
                  ext <- fileExt(file)
                  if (basename(file) != paste0(name, ".", ext)) 
                    found <- FALSE
                  else {
                    found <- TRUE
                    zfile <- file
                    zipname <- file.path(dirname(file), "Rdata.zip")
                    if (file.exists(zipname)) {
                      Rdatadir <- tempfile("Rdata")
                      dir.create(Rdatadir, showWarnings = FALSE)
                      topic <- basename(file)
                      rc <- .External(C_unzip, zipname, topic, 
                        Rdatadir, FALSE, TRUE, FALSE, FALSE)
                      if (rc == 0L) 
                        zfile <- file.path(Rdatadir, topic)
                    }
                    if (zfile != file) 
                      on.exit(unlink(zfile))
                    switch(ext, R = , r = {
                      library("utils")
                      sys.source(zfile, chdir = TRUE, envir = tmp_env)
                    }, RData = , rdata = , rda = load(zfile, 
                      envir = tmp_env), TXT = , txt = , tab = , 
                      tab.gz = , tab.bz2 = , tab.xz = , txt.gz = , 
                      txt.bz2 = , txt.xz = assign(name, my_read_table(zfile, 
                        header = TRUE, as.is = FALSE), envir = tmp_env), 
                      CSV = , csv = , csv.gz = , csv.bz2 = , 
                      csv.xz = assign(name, my_read_table(zfile, 
                        header = TRUE, sep = ";", as.is = FALSE), 
                        envir = tmp_env), found <- FALSE)
                  }
                  if (found) 
                    break
                }
                if (verbose) 
                  message(if (!found) 
                    "*NOT* ", "found", domain = NA)
            }
            if (found) 
                break
        }
        if (!found) {
            warning(gettextf("data set %s not found", sQuote(name)), 
                domain = NA)
        }
        else if (!overwrite) {
            for (o in ls(envir = tmp_env, all.names = TRUE)) {
                if (exists(o, envir = envir, inherits = FALSE)) 
                  warning(gettextf("an object named %s already exists and will not be overwritten", 
                    sQuote(o)))
                else assign(o, get(o, envir = tmp_env, inherits = FALSE), 
                  envir = envir)
            }
            rm(tmp_env)
        }
    }
    invisible(names)
}
<bytecode: 0x0000027b396d5ae8>
<environment: namespace:utils># A tibble: 383 × 16
   Year    Total…¹ Total Under…² $15,0…³ $25,0…⁴ $35,0…⁵ $50,0…⁶ $75,0…⁷ $100,…⁸
   <chr>   <chr>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
 1 ALL RA… <NA>       NA    NA      NA      NA      NA      NA      NA      NA  
 2 2019    128451    100     9.1     8       8.3    11.7    16.5    12.3    15.5
 3 2018    128579    100    10.1     8.8     8.7    12      17      12.5    15  
 4 2017 2  127669    100    10       9.1     9.2    12      16.4    12.4    14.7
 5 2017    127586    100    10.1     9.1     9.2    11.9    16.3    12.6    14.8
 6 2016    126224    100    10.4     9       9.2    12.3    16.7    12.2    15  
 7 2015    125819    100    10.6    10       9.6    12.1    16.1    12.4    14.9
 8 2014    124587    100    11.4    10.5     9.6    12.6    16.4    12.1    14  
 9 2013 3  123931    100    11.4    10.3     9.5    12.5    16.8    12      13.9
10 2013 4  122952    100    11.3    10.4     9.7    13.1    17      12.5    13.6
# … with 373 more rows, 6 more variables: `$150,000\r\nto\r\n$199,999` <dbl>,
#   `$200,000 and over` <dbl>, Median_Income_Estimate <dbl>,
#   Median_Margin_Error <dbl>, Mean_Income_Estimate <chr>,
#   Mean_Margin_Error <chr>, and abbreviated variable names ¹Total_Number,
#   ²`Under $15,000`, ³`$15,000\r\nto\r\n$24,999`, ⁴`$25,000\r\nto\r\n$34,999`,
#   ⁵`$35,000\r\nto\r\n$49,999`, ⁶`$50,000\r\nto\r\n$74,999`,
#   ⁷`$75,000\r\nto\r\n$99,999`, ⁸`$100,000\r\nto\r\n$149,999`# A tibble: 35 × 16
   Year    Total…¹ Total Under…² $15,0…³ $25,0…⁴ $35,0…⁵ $50,0…⁶ $75,0…⁷ $100,…⁸
   <chr>   <chr>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
 1 "1975 … 2948      100    16.4    14.7    15      17.5    21.5     8.9     4.7
 2 "1974 … 2897      100    13      15.4    13.8    18      22.1    10.4     5.7
 3 "1973"  2722      100    12.2    14.2    14      18.8    22.3    11.4     5.8
 4 "1972 … 2655      100    11.9    16      13.5    20.6    22.1     9.5     4.8
 5 "N Not… <NA>       NA    NA      NA      NA      NA      NA      NA      NA  
 6 "1 A m… <NA>       NA    NA      NA      NA      NA      NA      NA      NA  
 7 "2 Est… <NA>       NA    NA      NA      NA      NA      NA      NA      NA  
 8 "3 The… <NA>       NA    NA      NA      NA      NA      NA      NA      NA  
 9 "4 The… <NA>       NA    NA      NA      NA      NA      NA      NA      NA  
10 "5 Imp… <NA>       NA    NA      NA      NA      NA      NA      NA      NA  
# … with 25 more rows, 6 more variables: `$150,000\r\nto\r\n$199,999` <dbl>,
#   `$200,000 and over` <dbl>, Median_Income_Estimate <dbl>,
#   Median_Margin_Error <dbl>, Mean_Income_Estimate <chr>,
#   Mean_Margin_Error <chr>, and abbreviated variable names ¹Total_Number,
#   ²`Under $15,000`, ³`$15,000\r\nto\r\n$24,999`, ⁴`$25,000\r\nto\r\n$34,999`,
#   ⁵`$35,000\r\nto\r\n$49,999`, ⁶`$50,000\r\nto\r\n$74,999`,
#   ⁷`$75,000\r\nto\r\n$99,999`, ⁸`$100,000\r\nto\r\n$149,999`We can see that the last 31 rows are not part of data but notes, so we should drop them.
# A tibble: 6 × 16
  Year     Total…¹ Total Under…² $15,0…³ $25,0…⁴ $35,0…⁵ $50,0…⁶ $75,0…⁷ $100,…⁸
  <chr>    <chr>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
1 1977     3304      100    13.7    14.8    14.1    18.2    20.8    10.6     6  
2 1976 18  3081      100    16.2    15.3    13.6    18.1    20      10.3     5.1
3 1975 19  2948      100    16.4    14.7    15      17.5    21.5     8.9     4.7
4 1974 19… 2897      100    13      15.4    13.8    18      22.1    10.4     5.7
5 1973     2722      100    12.2    14.2    14      18.8    22.3    11.4     5.8
6 1972 21  2655      100    11.9    16      13.5    20.6    22.1     9.5     4.8
# … with 6 more variables: `$150,000\r\nto\r\n$199,999` <dbl>,
#   `$200,000 and over` <dbl>, Median_Income_Estimate <dbl>,
#   Median_Margin_Error <dbl>, Mean_Income_Estimate <chr>,
#   Mean_Margin_Error <chr>, and abbreviated variable names ¹Total_Number,
#   ²`Under $15,000`, ³`$15,000\r\nto\r\n$24,999`, ⁴`$25,000\r\nto\r\n$34,999`,
#   ⁵`$35,000\r\nto\r\n$49,999`, ⁶`$50,000\r\nto\r\n$74,999`,
#   ⁷`$75,000\r\nto\r\n$99,999`, ⁸`$100,000\r\nto\r\n$149,999`# A tibble: 6 × 16
  race     Year  Total…¹ Under…² $15,0…³ $25,0…⁴ $35,0…⁵ $50,0…⁶ $75,0…⁷ $100,…⁸
  <chr>    <chr> <chr>     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
1 ALL RAC… 2019  128451      9.1     8       8.3    11.7    16.5    12.3    15.5
2 ALL RAC… 2018  128579     10.1     8.8     8.7    12      17      12.5    15  
3 ALL RAC… 2017… 127669     10       9.1     9.2    12      16.4    12.4    14.7
4 ALL RAC… 2017  127586     10.1     9.1     9.2    11.9    16.3    12.6    14.8
5 ALL RAC… 2016  126224     10.4     9       9.2    12.3    16.7    12.2    15  
6 ALL RAC… 2015  125819     10.6    10       9.6    12.1    16.1    12.4    14.9
# … with 6 more variables: `$150,000\r\nto\r\n$199,999` <dbl>,
#   `$200,000 and over` <dbl>, Median_Income_Estimate <dbl>,
#   Median_Margin_Error <dbl>, Mean_Income_Estimate <chr>,
#   Mean_Margin_Error <chr>, and abbreviated variable names ¹Total_Number,
#   ²`Under $15,000`, ³`$15,000\r\nto\r\n$24,999`, ⁴`$25,000\r\nto\r\n$34,999`,
#   ⁵`$35,000\r\nto\r\n$49,999`, ⁶`$50,000\r\nto\r\n$74,999`,
#   ⁷`$75,000\r\nto\r\n$99,999`, ⁸`$100,000\r\nto\r\n$149,999`This data is about the annual income per U.S. household from 1967 to 2019 by the racial composition of that household. It shows (1) Distribution of the income and (2) Median income (3) Mean income.
The name of columns
 [1] "race"                       "Year"                      
 [3] "Total_Number"               "Under $15,000"             
 [5] "$15,000\r\nto\r\n$24,999"   "$25,000\r\nto\r\n$34,999"  
 [7] "$35,000\r\nto\r\n$49,999"   "$50,000\r\nto\r\n$74,999"  
 [9] "$75,000\r\nto\r\n$99,999"   "$100,000\r\nto\r\n$149,999"
[11] "$150,000\r\nto\r\n$199,999" "$200,000 and over"         
[13] "Median_Income_Estimate"     "Median_Margin_Error"       
[15] "Mean_Income_Estimate"       "Mean_Margin_Error"         Racial classifications include:
The current dataframe “data” includes several observations in the same row. Let’s make this dataframe longer.
# In order to use pivot_longer, the data type of the columns that will be combined need to be needs to be the same.
data$Total_Number <- as.numeric(data$Total_Number)
data$Mean_Income_Estimate <- as.numeric(data$Mean_Income_Estimate)
data$Mean_Margin_Error <- as.numeric(data$Mean_Margin_Error)
data$Median_Income_Estimate <- as.numeric(data$Median_Income_Estimate)
data$Median_Margin_Error <- as.numeric(data$Median_Margin_Error)
str(data$Total_Number) num [1:340] 128451 128579 127669 127586 126224 ...# A tibble: 4,760 × 4
   race      Year  measure                         value
   <chr>     <chr> <chr>                           <dbl>
 1 ALL RACES 2019  "Total_Number"               128451  
 2 ALL RACES 2019  "Under $15,000"                   9.1
 3 ALL RACES 2019  "$15,000\r\nto\r\n$24,999"        8  
 4 ALL RACES 2019  "$25,000\r\nto\r\n$34,999"        8.3
 5 ALL RACES 2019  "$35,000\r\nto\r\n$49,999"       11.7
 6 ALL RACES 2019  "$50,000\r\nto\r\n$74,999"       16.5
 7 ALL RACES 2019  "$75,000\r\nto\r\n$99,999"       12.3
 8 ALL RACES 2019  "$100,000\r\nto\r\n$149,999"     15.5
 9 ALL RACES 2019  "$150,000\r\nto\r\n$199,999"      8.3
10 ALL RACES 2019  "$200,000 and over"              10.3
# … with 4,750 more rowsThe dataframe is now cleaner, however it can be improved by the below ideas. * Replace “‘’” with “to” * The values in the following columns are proportion, not absolute number. So it may be better to consider them as one observation (because the total of these values is always 100) rather than consider these values as several observations. “Under $15,000” “$15,000$24,999” “$25,000$34,999” “$35,000$49,999” “$50,000$74,999”
“$75,000$99,999” “$100,000$149,999” “$150,000$199,999” “$200,000 and over”
---
title: "Challenge 3 Instructions"
author: "Erika Nagai"
desription: "Tidy Data: Pivoting"
date: "09/27/2022"
format:
  html:
    toc: true
    code-fold: true
    code-copy: true
    code-tools: true
categories:
  - challenge_3
  - animal_weights
  - eggs
  - australian_marriage
  - usa_households
  - sce_labor
---
Installing useful packages
```{r}
#| label: setup
#| warning: false
#| message: false
library(tidyverse)
library(tidyr)
library(dplyr)
library(ggplot2)
knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)
```
## Challenge Overview
Today's challenge is to:
1.  read in a data set, and describe the data set using both words and any supporting information (e.g., tables, etc)
2.  identify what needs to be done to tidy the current data
3.  anticipate the shape of pivoted data
4.  pivot the data into tidy format using `pivot_longer`
## Read in data
Read in one (or more) of the following datasets, using the correct R package and command.
-   animal_weights.csv ⭐
-   eggs_tidy.csv ⭐⭐ or organiceggpoultry.xls ⭐⭐⭐
-   australian_marriage\*.xls ⭐⭐⭐
-   USA Households\*.xlsx ⭐⭐⭐⭐
-   sce_labor_chart_data_public.xlsx 🌟🌟🌟🌟🌟
I'm using "USA households" dataset.
```{r}
library(readxl)
original_data <- read_excel("_data/USA Households by Total Money Income, Race, and Hispanic Origin of Householder 1967 to 2019.xlsx", skip = 4)
head(original_data)
```
This data is very dirty and hard to read. Before start analyzing it, let's clean the data!
## Anticipate the End Result
As 
1. Let's clean column names first
```{r}
data
data <- original_data %>%
  rename(
    Year = ...1,
    Total_Number = ...2,
    Median_Income_Estimate = Estimate...13,
    Median_Margin_Error = `Margin of error1 (±)...14`,
    Mean_Income_Estimate = Estimate...15,
    Mean_Margin_Error = `Margin of error1 (±)...16`)
data
#QUESTION: I manually renamed the column names instead of merging two rows ("Mean income (dollars)" + "Estimate") because I didn't find a way to do it. But this is not realistic if we have more columns. Any tips?
```
2. Let's look at the end of the data and clean if necessary.
```{r}
tail(data,35)
```
We can see that the last 31 rows are not part of data but notes, so we should drop them.
```{r}
data <- head(data, -31)
tail(data)
```
3. Clean "Year" column
* "Year" column includes the information of racial classification as well. Make a new column that can contains it.
* Change the order of columns
* Drop the rows that have only racial classification information
* Some values in the "Year" column includes the number for footnotes. Remove this number. 
```{r}
data <- data %>%
  mutate(race = str_extract(Year, "^(\\D)+")) %>% #extract only text data and create a new column that contains it
  fill(race, .direction = 'down') %>% 
  select(race, Year, everything()) 
data <- data[!(is.na(data$Total_Number)),]
data <- data[, colnames(data)!= "Total"]
head(data)
```
### Briefly describe the data
This data is about the annual income per U.S. household from 1967 to 2019 by the racial composition of that household.
It shows (1) Distribution of the income and (2) Median income (3) Mean income.
The name of columns 
```{r}
colnames(data)
```
Racial classifications include:
```{r}
unique(data$race)
```
### Challenge: Describe the final dimensions
The current dataframe "data" includes several observations in the same row.
Let's make this dataframe longer.
```{r}
# In order to use pivot_longer, the data type of the columns that will be combined need to be needs to be the same.
data$Total_Number <- as.numeric(data$Total_Number)
data$Mean_Income_Estimate <- as.numeric(data$Mean_Income_Estimate)
data$Mean_Margin_Error <- as.numeric(data$Mean_Margin_Error)
data$Median_Income_Estimate <- as.numeric(data$Median_Income_Estimate)
data$Median_Margin_Error <- as.numeric(data$Median_Margin_Error)
str(data$Total_Number)
long_data <- pivot_longer(data, col = c(3:16),
                         names_to = "measure",
                         values_to = "value"
                         ) 
  
long_data
```
The dataframe is now cleaner, however it can be improved by the below ideas.
* Replace "'\r\nto\r\n'" with "to"
* The values in the following columns are proportion, not absolute number. So it may be better to consider them as one observation (because the total of these values is always 100) rather than consider these values as several observations.
"Under $15,000" "$15,000\r\nto\r\n$24,999"   "$25,000\r\nto\r\n$34,999"   "$35,000\r\nto\r\n$49,999"   "$50,000\r\nto\r\n$74,999"  
"$75,000\r\nto\r\n$99,999"   "$100,000\r\nto\r\n$149,999" "$150,000\r\nto\r\n$199,999" "$200,000 and over"