library(tidyverse)
library(ggplot2)
library("readxl")
library(lubridate)
# Global chunk options for the rendered document: show the code, but hide
# warnings and messages in the output.
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
Challenge 9
Challenge Overview
Today’s challenge is simple. Create a function, and use it to perform a data analysis / cleaning / visualization task.
Examples of such functions are:
1) A function that reads in and cleans a dataset.
2) A function that computes summary statistics (e.g., computes the z score for a variable).
3) A function that plots a histogram.
That’s it!
Create a function to read in and clean a dataset.
I will be reusing the `debt_in_trillions` dataset, as I’m already familiar with it from Challenge 6. Here, using a function, I’m creating a new column called `Date` by parsing the `Year and Quarter` column into a single date-time value.
Q1, Q2, Q3, and Q4 are mapped to months 01, 04, 07, and 10 respectively.
# Load the workbook into `data` and open it in the data viewer.
data <- read_xlsx("_data/debt_in_trillions.xlsx")
view(data)
#' Read the debt workbook and add a parsed `Date` column.
#'
#' @param x Unused. Kept only so that existing calls keep working.
#' @param path Path of the Excel workbook to read. Defaults to the file the
#'   function previously hard-coded.
#' @return The tibble returned by `read_excel()` with one extra column,
#'   `Date`, obtained by parsing `Year and Quarter` with
#'   `parse_date_time(orders = "yq")`.
debt_data <- function(x, path = "_data/debt_in_trillions.xlsx") {
  read_excel(path) %>%
    mutate(Date = parse_date_time(`Year and Quarter`, orders = "yq"))
}

debt_data()
# A tibble: 74 × 9
`Year and Quarter` Mortgage HE Revolvin…¹ Auto …² Credi…³ Stude…⁴ Other Total
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 03:Q1 4.94 0.242 0.641 0.688 0.241 0.478 7.23
2 03:Q2 5.08 0.26 0.622 0.693 0.243 0.486 7.38
3 03:Q3 5.18 0.269 0.684 0.693 0.249 0.477 7.56
4 03:Q4 5.66 0.302 0.704 0.698 0.253 0.449 8.07
5 04:Q1 5.84 0.328 0.72 0.695 0.260 0.446 8.29
6 04:Q2 5.97 0.367 0.743 0.697 0.263 0.423 8.46
7 04:Q3 6.21 0.426 0.751 0.706 0.33 0.41 8.83
8 04:Q4 6.36 0.468 0.728 0.717 0.346 0.423 9.04
9 05:Q1 6.51 0.502 0.725 0.71 0.364 0.394 9.21
10 05:Q2 6.70 0.528 0.774 0.717 0.374 0.402 9.49
# … with 64 more rows, 1 more variable: Date <dttm>, and abbreviated variable
# names ¹`HE Revolving`, ²`Auto Loan`, ³`Credit Card`, ⁴`Student Loan`
Function to calculate summary statistics - z-score of a variable
Below is a new function that calculates the z-score of a variable; in our use case, the variable is a column of the data.
#' Compute the z-score of every element of a numeric vector.
#'
#' @param val A numeric vector.
#' @param na.rm If `TRUE`, missing values are ignored when computing the mean
#'   and standard deviation (they remain `NA` in the result). With the default
#'   `FALSE`, any `NA` in `val` makes every element of the result `NA`.
#' @return A numeric vector the same length as `val`, holding
#'   `(val - mean(val)) / sd(val)`.
z_score <- function(val, na.rm = FALSE) {
  (val - mean(val, na.rm = na.rm)) / sd(val, na.rm = na.rm)
}
Calculating z-score for total column in the data.
# Re-read the raw workbook and standardise its `Total` column.
data <- read_excel("_data/debt_in_trillions.xlsx")
z_score(data$Total)
[1] -2.61296476 -2.52529286 -2.42693486 -2.13369932 -2.00512152 -1.90705078
[7] -1.69275444 -1.57290935 -1.47805593 -1.31425999 -1.14173132 -1.02102445
[13] -0.80293627 -0.59306374 -0.38376573 -0.26776994 -0.16320711 0.04046059
[19] 0.20287792 0.34024566 0.43498418 0.47094920 0.51478515 0.51162529
[25] 0.43101999 0.36069861 0.28704502 0.22218159 0.19391516 0.09400595
[31] 0.03735818 -0.03801898 -0.01434871 -0.02865429 -0.06789407 -0.13936449
[37] -0.19712906 -0.22715130 -0.26966586 -0.25185571 -0.31505302 -0.35986566
[43] -0.28690149 -0.14844192 -0.07432871 -0.08467009 -0.03985745 0.02736151
[49] 0.04115002 0.04229906 0.16409752 0.19339809 0.27153295 0.29164118
[55] 0.32783601 0.45767776 0.54328139 0.60877679 0.67542123 0.78630379
[61] 0.82249861 0.86960934 0.99542944 1.01381411 1.08505472 1.19536276
[67] 1.24821869 1.35910125 1.44815201 1.42861829 1.47860162 1.59695296
[73] 1.64578724 1.82561232
Function to plot graph
# Add the parsed `Date` column, then reshape to long format: the seven debt
# columns become (`Type`, `Value`) pairs, one row per quarter and debt type.
# NOTE(review): `Total` is pivoted together with its components, so the
# stacked area chart drawn by plot_trend() places it on top of the parts --
# confirm this is intended.
data <- data %>%
  mutate(Date = parse_date_time(`Year and Quarter`, orders = "yq")) %>%
  pivot_longer(
    cols = c(
      `Mortgage`, `HE Revolving`, `Auto Loan`, `Credit Card`,
      `Student Loan`, `Other`, `Total`
    ),
    names_to = "Type",
    values_to = "Value"
  )
#' Draw an area chart of `Value` over `Date`, filled by `Type`.
#'
#' @param data A data frame with `Date`, `Value`, and `Type` columns.
#' @return A ggplot object.
plot_trend <- function(data) {
  ggplot(data, aes(x = Date, y = Value, fill = Type)) +
    geom_area()
}

plot_trend(data)