Challenge 3 Instructions

challenge_3

animal_weights

eggs

australian_marriage

usa_households

sce_labor

Author

Erika Nagai

Published

September 27, 2022

Installing useful packages

Code

library(tidyverse)
library(tidyr)
library(dplyr)
library(ggplot2)

knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)

Challenge Overview

Today’s challenge is to:

read in a data set, and describe the data set using both words and any supporting information (e.g., tables, etc)
identify what needs to be done to tidy the current data
anticipate the shape of pivoted data
pivot the data into tidy format using pivot_longer

Read in data

Read in one (or more) of the following datasets, using the correct R package and command.

animal_weights.csv ⭐
eggs_tidy.csv ⭐⭐ or organiceggpoultry.xls ⭐⭐⭐
australian_marriage*.xls ⭐⭐⭐
USA Households*.xlsx ⭐⭐⭐⭐
sce_labor_chart_data_public.xlsx 🌟🌟🌟🌟🌟

I’m using “USA households” dataset.

Code

library(readxl)
original_data <- read_excel("_data/USA Households by Total Money Income, Race, and Hispanic Origin of Householder 1967 to 2019.xlsx", skip = 4)

head(original_data)

# A tibble: 6 × 16
  ...1      ...2   Total Under…¹ $15,0…² $25,0…³ $35,0…⁴ $50,0…⁵ $75,0…⁶ $100,…⁷
  <chr>     <chr>  <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
1 ALL RACES <NA>      NA    NA      NA      NA      NA      NA      NA      NA  
2 2019      128451   100     9.1     8       8.3    11.7    16.5    12.3    15.5
3 2018      128579   100    10.1     8.8     8.7    12      17      12.5    15  
4 2017 2    127669   100    10       9.1     9.2    12      16.4    12.4    14.7
5 2017      127586   100    10.1     9.1     9.2    11.9    16.3    12.6    14.8
6 2016      126224   100    10.4     9       9.2    12.3    16.7    12.2    15  
# … with 6 more variables: `$150,000\r\nto\r\n$199,999` <dbl>,
#   `$200,000 and over` <dbl>, Estimate...13 <dbl>,
#   `Margin of error1 (±)...14` <dbl>, Estimate...15 <chr>,
#   `Margin of error1 (±)...16` <chr>, and abbreviated variable names
#   ¹`Under $15,000`, ²`$15,000\r\nto\r\n$24,999`, ³`$25,000\r\nto\r\n$34,999`,
#   ⁴`$35,000\r\nto\r\n$49,999`, ⁵`$50,000\r\nto\r\n$74,999`,
#   ⁶`$75,000\r\nto\r\n$99,999`, ⁷`$100,000\r\nto\r\n$149,999`

This data is very dirty and hard to read. Before start analyzing it, let’s clean the data!

Anticipate the End Result

Let’s clean column names first

Code

data

function (..., list = character(), package = NULL, lib.loc = NULL, 
    verbose = getOption("verbose"), envir = .GlobalEnv, overwrite = TRUE) 
{
    fileExt <- function(x) {
        db <- grepl("\\.[^.]+\\.(gz|bz2|xz)$", x)
        ans <- sub(".*\\.", "", x)
        ans[db] <- sub(".*\\.([^.]+\\.)(gz|bz2|xz)$", "\\1\\2", 
            x[db])
        ans
    }
    my_read_table <- function(...) {
        lcc <- Sys.getlocale("LC_COLLATE")
        on.exit(Sys.setlocale("LC_COLLATE", lcc))
        Sys.setlocale("LC_COLLATE", "C")
        read.table(...)
    }
    stopifnot(is.character(list))
    names <- c(as.character(substitute(list(...))[-1L]), list)
    if (!is.null(package)) {
        if (!is.character(package)) 
            stop("'package' must be a character vector or NULL")
    }
    paths <- find.package(package, lib.loc, verbose = verbose)
    if (is.null(lib.loc)) 
        paths <- c(path.package(package, TRUE), if (!length(package)) getwd(), 
            paths)
    paths <- unique(normalizePath(paths[file.exists(paths)]))
    paths <- paths[dir.exists(file.path(paths, "data"))]
    dataExts <- tools:::.make_file_exts("data")
    if (length(names) == 0L) {
        db <- matrix(character(), nrow = 0L, ncol = 4L)
        for (path in paths) {
            entries <- NULL
            packageName <- if (file_test("-f", file.path(path, 
                "DESCRIPTION"))) 
                basename(path)
            else "."
            if (file_test("-f", INDEX <- file.path(path, "Meta", 
                "data.rds"))) {
                entries <- readRDS(INDEX)
            }
            else {
                dataDir <- file.path(path, "data")
                entries <- tools::list_files_with_type(dataDir, 
                  "data")
                if (length(entries)) {
                  entries <- unique(tools::file_path_sans_ext(basename(entries)))
                  entries <- cbind(entries, "")
                }
            }
            if (NROW(entries)) {
                if (is.matrix(entries) && ncol(entries) == 2L) 
                  db <- rbind(db, cbind(packageName, dirname(path), 
                    entries))
                else warning(gettextf("data index for package %s is invalid and will be ignored", 
                  sQuote(packageName)), domain = NA, call. = FALSE)
            }
        }
        colnames(db) <- c("Package", "LibPath", "Item", "Title")
        footer <- if (missing(package)) 
            paste0("Use ", sQuote(paste("data(package =", ".packages(all.available = TRUE))")), 
                "\n", "to list the data sets in all *available* packages.")
        else NULL
        y <- list(title = "Data sets", header = NULL, results = db, 
            footer = footer)
        class(y) <- "packageIQR"
        return(y)
    }
    paths <- file.path(paths, "data")
    for (name in names) {
        found <- FALSE
        for (p in paths) {
            tmp_env <- if (overwrite) 
                envir
            else new.env()
            if (file_test("-f", file.path(p, "Rdata.rds"))) {
                rds <- readRDS(file.path(p, "Rdata.rds"))
                if (name %in% names(rds)) {
                  found <- TRUE
                  if (verbose) 
                    message(sprintf("name=%s:\t found in Rdata.rds", 
                      name), domain = NA)
                  thispkg <- sub(".*/([^/]*)/data$", "\\1", p)
                  thispkg <- sub("_.*$", "", thispkg)
                  thispkg <- paste0("package:", thispkg)
                  objs <- rds[[name]]
                  lazyLoad(file.path(p, "Rdata"), envir = tmp_env, 
                    filter = function(x) x %in% objs)
                  break
                }
                else if (verbose) 
                  message(sprintf("name=%s:\t NOT found in names() of Rdata.rds, i.e.,\n\t%s\n", 
                    name, paste(names(rds), collapse = ",")), 
                    domain = NA)
            }
            if (file_test("-f", file.path(p, "Rdata.zip"))) {
                warning("zipped data found for package ", sQuote(basename(dirname(p))), 
                  ".\nThat is defunct, so please re-install the package.", 
                  domain = NA)
                if (file_test("-f", fp <- file.path(p, "filelist"))) 
                  files <- file.path(p, scan(fp, what = "", quiet = TRUE))
                else {
                  warning(gettextf("file 'filelist' is missing for directory %s", 
                    sQuote(p)), domain = NA)
                  next
                }
            }
            else {
                files <- list.files(p, full.names = TRUE)
            }
            files <- files[grep(name, files, fixed = TRUE)]
            if (length(files) > 1L) {
                o <- match(fileExt(files), dataExts, nomatch = 100L)
                paths0 <- dirname(files)
                paths0 <- factor(paths0, levels = unique(paths0))
                files <- files[order(paths0, o)]
            }
            if (length(files)) {
                for (file in files) {
                  if (verbose) 
                    message("name=", name, ":\t file= ...", .Platform$file.sep, 
                      basename(file), "::\t", appendLF = FALSE, 
                      domain = NA)
                  ext <- fileExt(file)
                  if (basename(file) != paste0(name, ".", ext)) 
                    found <- FALSE
                  else {
                    found <- TRUE
                    zfile <- file
                    zipname <- file.path(dirname(file), "Rdata.zip")
                    if (file.exists(zipname)) {
                      Rdatadir <- tempfile("Rdata")
                      dir.create(Rdatadir, showWarnings = FALSE)
                      topic <- basename(file)
                      rc <- .External(C_unzip, zipname, topic, 
                        Rdatadir, FALSE, TRUE, FALSE, FALSE)
                      if (rc == 0L) 
                        zfile <- file.path(Rdatadir, topic)
                    }
                    if (zfile != file) 
                      on.exit(unlink(zfile))
                    switch(ext, R = , r = {
                      library("utils")
                      sys.source(zfile, chdir = TRUE, envir = tmp_env)
                    }, RData = , rdata = , rda = load(zfile, 
                      envir = tmp_env), TXT = , txt = , tab = , 
                      tab.gz = , tab.bz2 = , tab.xz = , txt.gz = , 
                      txt.bz2 = , txt.xz = assign(name, my_read_table(zfile, 
                        header = TRUE, as.is = FALSE), envir = tmp_env), 
                      CSV = , csv = , csv.gz = , csv.bz2 = , 
                      csv.xz = assign(name, my_read_table(zfile, 
                        header = TRUE, sep = ";", as.is = FALSE), 
                        envir = tmp_env), found <- FALSE)
                  }
                  if (found) 
                    break
                }
                if (verbose) 
                  message(if (!found) 
                    "*NOT* ", "found", domain = NA)
            }
            if (found) 
                break
        }
        if (!found) {
            warning(gettextf("data set %s not found", sQuote(name)), 
                domain = NA)
        }
        else if (!overwrite) {
            for (o in ls(envir = tmp_env, all.names = TRUE)) {
                if (exists(o, envir = envir, inherits = FALSE)) 
                  warning(gettextf("an object named %s already exists and will not be overwritten", 
                    sQuote(o)))
                else assign(o, get(o, envir = tmp_env, inherits = FALSE), 
                  envir = envir)
            }
            rm(tmp_env)
        }
    }
    invisible(names)
}
<bytecode: 0x0000027b396d5ae8>
<environment: namespace:utils>

Code

data <- original_data %>%
  rename(
    Year = ...1,
    Total_Number = ...2,
    Median_Income_Estimate = Estimate...13,
    Median_Margin_Error = `Margin of error1 (±)...14`,
    Mean_Income_Estimate = Estimate...15,
    Mean_Margin_Error = `Margin of error1 (±)...16`)

data

# A tibble: 383 × 16
   Year    Total…¹ Total Under…² $15,0…³ $25,0…⁴ $35,0…⁵ $50,0…⁶ $75,0…⁷ $100,…⁸
   <chr>   <chr>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
 1 ALL RA… <NA>       NA    NA      NA      NA      NA      NA      NA      NA  
 2 2019    128451    100     9.1     8       8.3    11.7    16.5    12.3    15.5
 3 2018    128579    100    10.1     8.8     8.7    12      17      12.5    15  
 4 2017 2  127669    100    10       9.1     9.2    12      16.4    12.4    14.7
 5 2017    127586    100    10.1     9.1     9.2    11.9    16.3    12.6    14.8
 6 2016    126224    100    10.4     9       9.2    12.3    16.7    12.2    15  
 7 2015    125819    100    10.6    10       9.6    12.1    16.1    12.4    14.9
 8 2014    124587    100    11.4    10.5     9.6    12.6    16.4    12.1    14  
 9 2013 3  123931    100    11.4    10.3     9.5    12.5    16.8    12      13.9
10 2013 4  122952    100    11.3    10.4     9.7    13.1    17      12.5    13.6
# … with 373 more rows, 6 more variables: `$150,000\r\nto\r\n$199,999` <dbl>,
#   `$200,000 and over` <dbl>, Median_Income_Estimate <dbl>,
#   Median_Margin_Error <dbl>, Mean_Income_Estimate <chr>,
#   Mean_Margin_Error <chr>, and abbreviated variable names ¹Total_Number,
#   ²`Under $15,000`, ³`$15,000\r\nto\r\n$24,999`, ⁴`$25,000\r\nto\r\n$34,999`,
#   ⁵`$35,000\r\nto\r\n$49,999`, ⁶`$50,000\r\nto\r\n$74,999`,
#   ⁷`$75,000\r\nto\r\n$99,999`, ⁸`$100,000\r\nto\r\n$149,999`

Code

#QUESTION: I manually renamed the column names instead of merging two rows ("Mean income (dollars)" + "Estimate") because I didn't find a way to do it. But this is not realistic if we have more columns. Any tips?

Let’s look at the end of the data and clean if necessary.

Code

tail(data,35)

# A tibble: 35 × 16
   Year    Total…¹ Total Under…² $15,0…³ $25,0…⁴ $35,0…⁵ $50,0…⁶ $75,0…⁷ $100,…⁸
   <chr>   <chr>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
 1 "1975 … 2948      100    16.4    14.7    15      17.5    21.5     8.9     4.7
 2 "1974 … 2897      100    13      15.4    13.8    18      22.1    10.4     5.7
 3 "1973"  2722      100    12.2    14.2    14      18.8    22.3    11.4     5.8
 4 "1972 … 2655      100    11.9    16      13.5    20.6    22.1     9.5     4.8
 5 "N Not… <NA>       NA    NA      NA      NA      NA      NA      NA      NA  
 6 "1 A m… <NA>       NA    NA      NA      NA      NA      NA      NA      NA  
 7 "2 Est… <NA>       NA    NA      NA      NA      NA      NA      NA      NA  
 8 "3 The… <NA>       NA    NA      NA      NA      NA      NA      NA      NA  
 9 "4 The… <NA>       NA    NA      NA      NA      NA      NA      NA      NA  
10 "5 Imp… <NA>       NA    NA      NA      NA      NA      NA      NA      NA  
# … with 25 more rows, 6 more variables: `$150,000\r\nto\r\n$199,999` <dbl>,
#   `$200,000 and over` <dbl>, Median_Income_Estimate <dbl>,
#   Median_Margin_Error <dbl>, Mean_Income_Estimate <chr>,
#   Mean_Margin_Error <chr>, and abbreviated variable names ¹Total_Number,
#   ²`Under $15,000`, ³`$15,000\r\nto\r\n$24,999`, ⁴`$25,000\r\nto\r\n$34,999`,
#   ⁵`$35,000\r\nto\r\n$49,999`, ⁶`$50,000\r\nto\r\n$74,999`,
#   ⁷`$75,000\r\nto\r\n$99,999`, ⁸`$100,000\r\nto\r\n$149,999`

We can see that the last 31 rows are not part of data but notes, so we should drop them.

Code

data <- head(data, -31)
tail(data)

# A tibble: 6 × 16
  Year     Total…¹ Total Under…² $15,0…³ $25,0…⁴ $35,0…⁵ $50,0…⁶ $75,0…⁷ $100,…⁸
  <chr>    <chr>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
1 1977     3304      100    13.7    14.8    14.1    18.2    20.8    10.6     6  
2 1976 18  3081      100    16.2    15.3    13.6    18.1    20      10.3     5.1
3 1975 19  2948      100    16.4    14.7    15      17.5    21.5     8.9     4.7
4 1974 19… 2897      100    13      15.4    13.8    18      22.1    10.4     5.7
5 1973     2722      100    12.2    14.2    14      18.8    22.3    11.4     5.8
6 1972 21  2655      100    11.9    16      13.5    20.6    22.1     9.5     4.8
# … with 6 more variables: `$150,000\r\nto\r\n$199,999` <dbl>,
#   `$200,000 and over` <dbl>, Median_Income_Estimate <dbl>,
#   Median_Margin_Error <dbl>, Mean_Income_Estimate <chr>,
#   Mean_Margin_Error <chr>, and abbreviated variable names ¹Total_Number,
#   ²`Under $15,000`, ³`$15,000\r\nto\r\n$24,999`, ⁴`$25,000\r\nto\r\n$34,999`,
#   ⁵`$35,000\r\nto\r\n$49,999`, ⁶`$50,000\r\nto\r\n$74,999`,
#   ⁷`$75,000\r\nto\r\n$99,999`, ⁸`$100,000\r\nto\r\n$149,999`

Clean “Year” column

“Year” column includes the information of racial classification as well. Make a new column that can contains it.
Change the order of columns
Drop the rows that have only racial classification information
Some values in the “Year” column includes the number for footnotes. Remove this number.

Code

data <- data %>%
  mutate(race = str_extract(Year, "^(\\D)+")) %>% #extract only text data and create a new column that contains it
  fill(race, .direction = 'down') %>% 
  select(race, Year, everything()) 

data <- data[!(is.na(data$Total_Number)),]
data <- data[, colnames(data)!= "Total"]



head(data)

# A tibble: 6 × 16
  race     Year  Total…¹ Under…² $15,0…³ $25,0…⁴ $35,0…⁵ $50,0…⁶ $75,0…⁷ $100,…⁸
  <chr>    <chr> <chr>     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
1 ALL RAC… 2019  128451      9.1     8       8.3    11.7    16.5    12.3    15.5
2 ALL RAC… 2018  128579     10.1     8.8     8.7    12      17      12.5    15  
3 ALL RAC… 2017… 127669     10       9.1     9.2    12      16.4    12.4    14.7
4 ALL RAC… 2017  127586     10.1     9.1     9.2    11.9    16.3    12.6    14.8
5 ALL RAC… 2016  126224     10.4     9       9.2    12.3    16.7    12.2    15  
6 ALL RAC… 2015  125819     10.6    10       9.6    12.1    16.1    12.4    14.9
# … with 6 more variables: `$150,000\r\nto\r\n$199,999` <dbl>,
#   `$200,000 and over` <dbl>, Median_Income_Estimate <dbl>,
#   Median_Margin_Error <dbl>, Mean_Income_Estimate <chr>,
#   Mean_Margin_Error <chr>, and abbreviated variable names ¹Total_Number,
#   ²`Under $15,000`, ³`$15,000\r\nto\r\n$24,999`, ⁴`$25,000\r\nto\r\n$34,999`,
#   ⁵`$35,000\r\nto\r\n$49,999`, ⁶`$50,000\r\nto\r\n$74,999`,
#   ⁷`$75,000\r\nto\r\n$99,999`, ⁸`$100,000\r\nto\r\n$149,999`

Briefly describe the data

This data is about the annual income per U.S. household from 1967 to 2019 by the racial composition of that household. It shows (1) Distribution of the income and (2) Median income (3) Mean income.

The name of columns

Code

colnames(data)

 [1] "race"                       "Year"                      
 [3] "Total_Number"               "Under $15,000"             
 [5] "$15,000\r\nto\r\n$24,999"   "$25,000\r\nto\r\n$34,999"  
 [7] "$35,000\r\nto\r\n$49,999"   "$50,000\r\nto\r\n$74,999"  
 [9] "$75,000\r\nto\r\n$99,999"   "$100,000\r\nto\r\n$149,999"
[11] "$150,000\r\nto\r\n$199,999" "$200,000 and over"         
[13] "Median_Income_Estimate"     "Median_Margin_Error"       
[15] "Mean_Income_Estimate"       "Mean_Margin_Error"

Racial classifications include:

Code

unique(data$race)

 [1] "ALL RACES"                     "WHITE ALONE "                 
 [3] "WHITE "                        "WHITE ALONE, NOT HISPANIC "   
 [5] "WHITE, NOT HISPANIC "          "BLACK ALONE OR IN COMBINATION"
 [7] "BLACK ALONE "                  "BLACK "                       
 [9] "ASIAN ALONE OR IN COMBINATION" "ASIAN ALONE "                 
[11] "ASIAN AND PACIFIC ISLANDER "   "HISPANIC (ANY RACE) "

Challenge: Describe the final dimensions

The current dataframe “data” includes several observations in the same row. Let’s make this dataframe longer.

Code

# In order to use pivot_longer, the data type of the columns that will be combined need to be needs to be the same.
data$Total_Number <- as.numeric(data$Total_Number)
data$Mean_Income_Estimate <- as.numeric(data$Mean_Income_Estimate)
data$Mean_Margin_Error <- as.numeric(data$Mean_Margin_Error)
data$Median_Income_Estimate <- as.numeric(data$Median_Income_Estimate)
data$Median_Margin_Error <- as.numeric(data$Median_Margin_Error)

str(data$Total_Number)

 num [1:340] 128451 128579 127669 127586 126224 ...

Code

long_data <- pivot_longer(data, col = c(3:16),
                         names_to = "measure",
                         values_to = "value"
                         ) 

  
long_data

# A tibble: 4,760 × 4
   race      Year  measure                         value
   <chr>     <chr> <chr>                           <dbl>
 1 ALL RACES 2019  "Total_Number"               128451  
 2 ALL RACES 2019  "Under $15,000"                   9.1
 3 ALL RACES 2019  "$15,000\r\nto\r\n$24,999"        8  
 4 ALL RACES 2019  "$25,000\r\nto\r\n$34,999"        8.3
 5 ALL RACES 2019  "$35,000\r\nto\r\n$49,999"       11.7
 6 ALL RACES 2019  "$50,000\r\nto\r\n$74,999"       16.5
 7 ALL RACES 2019  "$75,000\r\nto\r\n$99,999"       12.3
 8 ALL RACES 2019  "$100,000\r\nto\r\n$149,999"     15.5
 9 ALL RACES 2019  "$150,000\r\nto\r\n$199,999"      8.3
10 ALL RACES 2019  "$200,000 and over"              10.3
# … with 4,750 more rows

The dataframe is now cleaner, however it can be improved by the below ideas. * Replace “‘’” with “to” * The values in the following columns are proportion, not absolute number. So it may be better to consider them as one observation (because the total of these values is always 100) rather than consider these values as several observations. “Under $15,000” “$15,000$24,999” “$25,000$34,999” “$35,000$49,999” “$50,000$74,999”
“$75,000$99,999” “$100,000$149,999” “$150,000$199,999” “$200,000 and over”