Tidy/Consolidate a dataset

challenge4
Neha Jhurani
hotel_bookings.csv
Author

Neha Jhurani

Published

April 12, 2023

Code
library(tidyverse)

knitr::opts_chunk$set(echo = TRUE)

Analysing hotel_bookings data

Code
library(readr)

#reading hotel_bookings csv data
hotel_bookings_data <- read_csv("_data/hotel_bookings.csv")
Rows: 119390 Columns: 32
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (13): hotel, arrival_date_month, meal, country, market_segment, distrib...
dbl  (18): is_canceled, lead_time, arrival_date_year, arrival_date_week_numb...
date  (1): reservation_status_date

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
#extracting all the column names
colnames(hotel_bookings_data)
 [1] "hotel"                          "is_canceled"                   
 [3] "lead_time"                      "arrival_date_year"             
 [5] "arrival_date_month"             "arrival_date_week_number"      
 [7] "arrival_date_day_of_month"      "stays_in_weekend_nights"       
 [9] "stays_in_week_nights"           "adults"                        
[11] "children"                       "babies"                        
[13] "meal"                           "country"                       
[15] "market_segment"                 "distribution_channel"          
[17] "is_repeated_guest"              "previous_cancellations"        
[19] "previous_bookings_not_canceled" "reserved_room_type"            
[21] "assigned_room_type"             "booking_changes"               
[23] "deposit_type"                   "agent"                         
[25] "company"                        "days_in_waiting_list"          
[27] "customer_type"                  "adr"                           
[29] "required_car_parking_spaces"    "total_of_special_requests"     
[31] "reservation_status"             "reservation_status_date"       
Code
dim(hotel_bookings_data)
[1] 119390     32
Code
# The dataset contains around 119300 bookings at 2 Hotels from July 2015 to August 2017. There are 32 features for each booking

# We see that there are a lot of variables that can be combined as they represent the same information. For ex - if we wnt to know the arriavl date, we have 4 different attrs for it, i.e., year of arrival, month of arrival, day of arrival and week of arrival. These features separated might help us with some info like, if we want to know the busiest months every year, etc. But, for now, these information makes more sense to our usecase together.

library(dplyr)
library(lubridate)

Attaching package: 'lubridate'

The following objects are masked from 'package:base':

    date, intersect, setdiff, union
Code
#need to format month from str to it's respective number. For example - 'January' -> 1. We can use case_when along with mutate() as shown below
consolidated_hotel_bookings_data <- hotel_bookings_data %>%
    mutate(month = case_when(
       arrival_date_month == "January" ~ 1,
       arrival_date_month == "February" ~ 2,
       arrival_date_month == "March" ~ 3,
       arrival_date_month == "April" ~ 4,
       arrival_date_month == "May" ~ 5,
       arrival_date_month == "June" ~ 6,
       arrival_date_month == "July" ~ 7,
       arrival_date_month == "August" ~ 8,
       arrival_date_month == "September" ~ 9,
       arrival_date_month == "October" ~ 10,
       arrival_date_month == "November" ~ 11,
       arrival_date_month == "December" ~ 12,
     )) %>%
    mutate(arrival_date = make_date(arrival_date_year, month, arrival_date_day_of_month)) %>% #combining separate parts of date into one
    select(-c(arrival_date_year, arrival_date_month, arrival_date_day_of_month, month, arrival_date_week_number))  # removing extra columns

view(consolidated_hotel_bookings_data)

dim(consolidated_hotel_bookings_data)
[1] 119390     29
Code
#The number of columns reduce to 29