challenge_4
abc_poll
eggs
fed_rates
hotel_bookings
debt
More data wrangling: pivoting
Author

Shuqi Hong

Published

June 9, 2023

Code
library(tidyverse)

knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)

fed <- read_csv("_data/FedFundsRate.csv")

fed<- fed %>% fill(`Real GDP (Percent Change)`, .direction = "down") %>% 
  fill(`Inflation Rate`, .direction = "down") %>% 
  fill(`Effective Federal Funds Rate`, .direction = "down") %>%
  mutate_at (c(1:3), as.character) #%>% str()

fed %>% summarise_all(funs(sum(is.na(.))))
# A tibble: 1 × 10
   Year Month   Day `Federal Funds Target Rate` `Federal Funds Upper Target`
  <int> <int> <int>                       <int>                        <int>
1     0     0     0                         442                          801
# ℹ 5 more variables: `Federal Funds Lower Target` <int>,
#   `Effective Federal Funds Rate` <int>, `Real GDP (Percent Change)` <int>,
#   `Unemployment Rate` <int>, `Inflation Rate` <int>
Code
glimpse(fed)
Rows: 904
Columns: 10
$ Year                           <chr> "1954", "1954", "1954", "1954", "1954",…
$ Month                          <chr> "7", "8", "9", "10", "11", "12", "1", "…
$ Day                            <chr> "1", "1", "1", "1", "1", "1", "1", "1",…
$ `Federal Funds Target Rate`    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ `Federal Funds Upper Target`   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ `Federal Funds Lower Target`   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ `Effective Federal Funds Rate` <dbl> 0.80, 1.22, 1.06, 0.85, 0.83, 1.28, 1.3…
$ `Real GDP (Percent Change)`    <dbl> 4.6, 4.6, 4.6, 8.0, 8.0, 8.0, 11.9, 11.…
$ `Unemployment Rate`            <dbl> 5.8, 6.0, 6.1, 5.7, 5.3, 5.0, 4.9, 4.7,…
$ `Inflation Rate`               <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
Code
head(fed)
# A tibble: 6 × 10
  Year  Month Day   `Federal Funds Target Rate` `Federal Funds Upper Target`
  <chr> <chr> <chr>                       <dbl>                        <dbl>
1 1954  7     1                              NA                           NA
2 1954  8     1                              NA                           NA
3 1954  9     1                              NA                           NA
4 1954  10    1                              NA                           NA
5 1954  11    1                              NA                           NA
6 1954  12    1                              NA                           NA
# ℹ 5 more variables: `Federal Funds Lower Target` <dbl>,
#   `Effective Federal Funds Rate` <dbl>, `Real GDP (Percent Change)` <dbl>,
#   `Unemployment Rate` <dbl>, `Inflation Rate` <dbl>
Code
fed %>% count(Year)
# A tibble: 64 × 2
   Year      n
   <chr> <int>
 1 1954      6
 2 1955     12
 3 1956     12
 4 1957     12
 5 1958     12
 6 1959     12
 7 1960     12
 8 1961     12
 9 1962     12
10 1963     12
# ℹ 54 more rows

Dates can be added, so it should be character not factor. I also fill the several columns.

Below code: I just filter Federal Funds Target Rate without NA

Code
TargetRate <- fed %>% 
  filter(!is.na(`Federal Funds Target Rate`)) %>% 
  select(-c(`Federal Funds Upper Target`, `Federal Funds Lower Target`))

TargetRate
# A tibble: 462 × 8
   Year  Month Day   `Federal Funds Target Rate` `Effective Federal Funds Rate`
   <chr> <chr> <chr>                       <dbl>                          <dbl>
 1 1982  9     27                           10.2                          10.3 
 2 1982  10    1                            10                             9.71
 3 1982  10    7                             9.5                           9.71
 4 1982  11    1                             9.5                           9.2 
 5 1982  11    19                            9                             9.2 
 6 1982  12    1                             9                             8.95
 7 1982  12    14                            8.5                           8.95
 8 1983  1     1                             8.5                           8.68
 9 1983  2     1                             8.5                           8.51
10 1983  3     1                             8.5                           8.77
# ℹ 452 more rows
# ℹ 3 more variables: `Real GDP (Percent Change)` <dbl>,
#   `Unemployment Rate` <dbl>, `Inflation Rate` <dbl>
Code
#newfed <- fed %>% select(-c(`Federal Funds Target Rate`,  `Federal Funds Upper Target`, `Federal Funds Lower Target`)) 

#newfed
Code
hotel <- read_csv("_data/hotel_bookings.csv")

#Find unique values in all columns
#hotel %>% lapply(unique)
hotel
# A tibble: 119,390 × 32
   hotel        is_canceled lead_time arrival_date_year arrival_date_month
   <chr>              <dbl>     <dbl>             <dbl> <chr>             
 1 Resort Hotel           0       342              2015 July              
 2 Resort Hotel           0       737              2015 July              
 3 Resort Hotel           0         7              2015 July              
 4 Resort Hotel           0        13              2015 July              
 5 Resort Hotel           0        14              2015 July              
 6 Resort Hotel           0        14              2015 July              
 7 Resort Hotel           0         0              2015 July              
 8 Resort Hotel           0         9              2015 July              
 9 Resort Hotel           1        85              2015 July              
10 Resort Hotel           1        75              2015 July              
# ℹ 119,380 more rows
# ℹ 27 more variables: arrival_date_week_number <dbl>,
#   arrival_date_day_of_month <dbl>, stays_in_weekend_nights <dbl>,
#   stays_in_week_nights <dbl>, adults <dbl>, children <dbl>, babies <dbl>,
#   meal <chr>, country <chr>, market_segment <chr>,
#   distribution_channel <chr>, is_repeated_guest <dbl>,
#   previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>, …
Code
hotelclean <- hotel %>% 
  mutate_at(c(1,2,5,13,14,15,20,21,27,31,16), as.factor)

hotelclean <- hotelclean %>%
  mutate_at(c(6,8,9,10,11,12,18,19,21,22,26,28,29,30,17), as.integer, na.rm=T) 

hotelclean <- hotelclean %>%
  mutate_at(c(4,7,32,3), as.character) %>% select(-company) %>% mutate(state = case_when
         (is_canceled == "1" ~ "canceled",
          is_canceled == "0" ~ "regular")) %>% 
  select(-is_canceled) 

#separate reservation status date into year, month, date columns

hotelclean <- hotelclean %>% separate(reservation_status_date, into= c("reservation_status_year","reservation_month", "reservation_status_date"), sep="-")

hotelclean 
# A tibble: 119,390 × 33
   hotel   lead_time arrival_date_year arrival_date_month arrival_date_week_nu…¹
   <fct>   <chr>     <chr>             <fct>                               <int>
 1 Resort… 342       2015              July                                   27
 2 Resort… 737       2015              July                                   27
 3 Resort… 7         2015              July                                   27
 4 Resort… 13        2015              July                                   27
 5 Resort… 14        2015              July                                   27
 6 Resort… 14        2015              July                                   27
 7 Resort… 0         2015              July                                   27
 8 Resort… 9         2015              July                                   27
 9 Resort… 85        2015              July                                   27
10 Resort… 75        2015              July                                   27
# ℹ 119,380 more rows
# ℹ abbreviated name: ¹​arrival_date_week_number
# ℹ 28 more variables: arrival_date_day_of_month <chr>,
#   stays_in_weekend_nights <int>, stays_in_week_nights <int>, adults <int>,
#   children <int>, babies <int>, meal <fct>, country <fct>,
#   market_segment <fct>, distribution_channel <fct>, is_repeated_guest <int>,
#   previous_cancellations <int>, previous_bookings_not_canceled <int>, …
Code
glimpse(hotelclean)
Rows: 119,390
Columns: 33
$ hotel                          <fct> Resort Hotel, Resort Hotel, Resort Hote…
$ lead_time                      <chr> "342", "737", "7", "13", "14", "14", "0…
$ arrival_date_year              <chr> "2015", "2015", "2015", "2015", "2015",…
$ arrival_date_month             <fct> July, July, July, July, July, July, Jul…
$ arrival_date_week_number       <int> 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,…
$ arrival_date_day_of_month      <chr> "1", "1", "1", "1", "1", "1", "1", "1",…
$ stays_in_weekend_nights        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ stays_in_week_nights           <int> 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, …
$ adults                         <int> 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
$ children                       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ babies                         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ meal                           <fct> BB, BB, BB, BB, BB, BB, BB, FB, BB, HB,…
$ country                        <fct> PRT, PRT, GBR, GBR, GBR, GBR, PRT, PRT,…
$ market_segment                 <fct> Direct, Direct, Direct, Corporate, Onli…
$ distribution_channel           <fct> Direct, Direct, Direct, Corporate, TA/T…
$ is_repeated_guest              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ previous_cancellations         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ previous_bookings_not_canceled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ reserved_room_type             <fct> C, C, A, A, A, A, C, C, A, D, E, D, D, …
$ assigned_room_type             <int> 3, 3, 3, 1, 1, 1, 3, 3, 1, 4, 5, 4, 5, …
$ booking_changes                <int> 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ deposit_type                   <chr> "No Deposit", "No Deposit", "No Deposit…
$ agent                          <chr> "NULL", "NULL", "NULL", "304", "240", "…
$ days_in_waiting_list           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ customer_type                  <fct> Transient, Transient, Transient, Transi…
$ adr                            <int> 0, 0, 75, 75, 98, 98, 107, 103, 82, 105…
$ required_car_parking_spaces    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ total_of_special_requests      <int> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 3, …
$ reservation_status             <fct> Check-Out, Check-Out, Check-Out, Check-…
$ reservation_status_year        <chr> "2015", "2015", "2015", "2015", "2015",…
$ reservation_month              <chr> "07", "07", "07", "07", "07", "07", "07…
$ reservation_status_date        <chr> "01", "01", "02", "02", "03", "03", "03…
$ state                          <chr> "regular", "regular", "regular", "regul…