Challenge 4

challenge_4
hotel_bookings
More data wrangling: pivoting
Author

Pranav Komaravolu

Published

May 18, 2023

library(tidyverse)
library(readr)
library(dplyr)
library(lubridate)
knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)

Data Description

Reading the dataset

data <- read_csv("_data/hotel_bookings.csv", show_col_types = FALSE)
head(data)
# A tibble: 6 × 32
  hotel   is_ca…¹ lead_…² arriv…³ arriv…⁴ arriv…⁵ arriv…⁶ stays…⁷ stays…⁸ adults
  <chr>     <dbl>   <dbl>   <dbl> <chr>     <dbl>   <dbl>   <dbl>   <dbl>  <dbl>
1 Resort…       0     342    2015 July         27       1       0       0      2
2 Resort…       0     737    2015 July         27       1       0       0      2
3 Resort…       0       7    2015 July         27       1       0       1      1
4 Resort…       0      13    2015 July         27       1       0       1      1
5 Resort…       0      14    2015 July         27       1       0       2      2
6 Resort…       0      14    2015 July         27       1       0       2      2
# … with 22 more variables: children <dbl>, babies <dbl>, meal <chr>,
#   country <chr>, market_segment <chr>, distribution_channel <chr>,
#   is_repeated_guest <dbl>, previous_cancellations <dbl>,
#   previous_bookings_not_canceled <dbl>, reserved_room_type <chr>,
#   assigned_room_type <chr>, booking_changes <dbl>, deposit_type <chr>,
#   agent <chr>, company <chr>, days_in_waiting_list <dbl>,
#   customer_type <chr>, adr <dbl>, required_car_parking_spaces <dbl>, …

Columns in the dataset

colnames(data)
 [1] "hotel"                          "is_canceled"                   
 [3] "lead_time"                      "arrival_date_year"             
 [5] "arrival_date_month"             "arrival_date_week_number"      
 [7] "arrival_date_day_of_month"      "stays_in_weekend_nights"       
 [9] "stays_in_week_nights"           "adults"                        
[11] "children"                       "babies"                        
[13] "meal"                           "country"                       
[15] "market_segment"                 "distribution_channel"          
[17] "is_repeated_guest"              "previous_cancellations"        
[19] "previous_bookings_not_canceled" "reserved_room_type"            
[21] "assigned_room_type"             "booking_changes"               
[23] "deposit_type"                   "agent"                         
[25] "company"                        "days_in_waiting_list"          
[27] "customer_type"                  "adr"                           
[29] "required_car_parking_spaces"    "total_of_special_requests"     
[31] "reservation_status"             "reservation_status_date"       

The Dimensions of the dataset is as follows:

dim(data)
[1] 119390     32

From the above cells we can see that the dataset has 32 columns of which 3 columns only depict the arrival date this can be brought down to one column and that will reduce the dimensionality of the dataset to (119390, 30).

Tidying the dataset

The month column in the dataset is as follows:

head(data["arrival_date_month"])
# A tibble: 6 × 1
  arrival_date_month
  <chr>             
1 July              
2 July              
3 July              
4 July              
5 July              
6 July              

This can be changed to numerical data so that the date can be represented as a string of numerical values depicting month, day and year. The code to mutate the data is as follows:

mutated_data <- data %>%
                  mutate(arrival_date_month = case_when(
                    arrival_date_month == "January" ~ 1,
                    arrival_date_month == "Febuary" ~ 2,
                    arrival_date_month == "March" ~ 3,
                    arrival_date_month == "April" ~ 4,
                    arrival_date_month == "May" ~ 5,
                    arrival_date_month == "June" ~ 6,
                    arrival_date_month == "July" ~ 7,
                    arrival_date_month == "August" ~ 8,
                    arrival_date_month == "September" ~ 9,
                    arrival_date_month == "October" ~ 10,
                    arrival_date_month == "November" ~ 11,
                    arrival_date_month == "December" ~ 12
                  ))
head(mutated_data)
# A tibble: 6 × 32
  hotel   is_ca…¹ lead_…² arriv…³ arriv…⁴ arriv…⁵ arriv…⁶ stays…⁷ stays…⁸ adults
  <chr>     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>  <dbl>
1 Resort…       0     342    2015       7      27       1       0       0      2
2 Resort…       0     737    2015       7      27       1       0       0      2
3 Resort…       0       7    2015       7      27       1       0       1      1
4 Resort…       0      13    2015       7      27       1       0       1      1
5 Resort…       0      14    2015       7      27       1       0       2      2
6 Resort…       0      14    2015       7      27       1       0       2      2
# … with 22 more variables: children <dbl>, babies <dbl>, meal <chr>,
#   country <chr>, market_segment <chr>, distribution_channel <chr>,
#   is_repeated_guest <dbl>, previous_cancellations <dbl>,
#   previous_bookings_not_canceled <dbl>, reserved_room_type <chr>,
#   assigned_room_type <chr>, booking_changes <dbl>, deposit_type <chr>,
#   agent <chr>, company <chr>, days_in_waiting_list <dbl>,
#   customer_type <chr>, adr <dbl>, required_car_parking_spaces <dbl>, …

Variables to be mutated

Now we can mutate all the three columns arrival_date_year, arrival_date_month, arrival_date_day_of_month to a single column arrival_date

final_data <- mutated_data %>%
                mutate(
                  arrival_date = make_date(arrival_date_year, arrival_date_month, arrival_date_day_of_month)
                ) %>%
                select(-c(arrival_date_day_of_month, arrival_date_month, arrival_date_year))

head(final_data)
# A tibble: 6 × 30
  hotel      is_ca…¹ lead_…² arriv…³ stays…⁴ stays…⁵ adults child…⁶ babies meal 
  <chr>        <dbl>   <dbl>   <dbl>   <dbl>   <dbl>  <dbl>   <dbl>  <dbl> <chr>
1 Resort Ho…       0     342      27       0       0      2       0      0 BB   
2 Resort Ho…       0     737      27       0       0      2       0      0 BB   
3 Resort Ho…       0       7      27       0       1      1       0      0 BB   
4 Resort Ho…       0      13      27       0       1      1       0      0 BB   
5 Resort Ho…       0      14      27       0       2      2       0      0 BB   
6 Resort Ho…       0      14      27       0       2      2       0      0 BB   
# … with 20 more variables: country <chr>, market_segment <chr>,
#   distribution_channel <chr>, is_repeated_guest <dbl>,
#   previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>,
#   reserved_room_type <chr>, assigned_room_type <chr>, booking_changes <dbl>,
#   deposit_type <chr>, agent <chr>, company <chr>, days_in_waiting_list <dbl>,
#   customer_type <chr>, adr <dbl>, required_car_parking_spaces <dbl>,
#   total_of_special_requests <dbl>, reservation_status <chr>, …

The dimensions of the new modified dataset is as follows:

dim(final_data)
[1] 119390     30

Therefore we achieved the desired target and reduced the divided arrival date data into a single column.