library(tidyverse)
library(readr)
library(dplyr)
library(lubridate)
::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE) knitr
Challenge 4
Data Description
Reading the dataset
<- read_csv("_data/hotel_bookings.csv", show_col_types = FALSE)
data head(data)
# A tibble: 6 × 32
hotel is_ca…¹ lead_…² arriv…³ arriv…⁴ arriv…⁵ arriv…⁶ stays…⁷ stays…⁸ adults
<chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Resort… 0 342 2015 July 27 1 0 0 2
2 Resort… 0 737 2015 July 27 1 0 0 2
3 Resort… 0 7 2015 July 27 1 0 1 1
4 Resort… 0 13 2015 July 27 1 0 1 1
5 Resort… 0 14 2015 July 27 1 0 2 2
6 Resort… 0 14 2015 July 27 1 0 2 2
# … with 22 more variables: children <dbl>, babies <dbl>, meal <chr>,
# country <chr>, market_segment <chr>, distribution_channel <chr>,
# is_repeated_guest <dbl>, previous_cancellations <dbl>,
# previous_bookings_not_canceled <dbl>, reserved_room_type <chr>,
# assigned_room_type <chr>, booking_changes <dbl>, deposit_type <chr>,
# agent <chr>, company <chr>, days_in_waiting_list <dbl>,
# customer_type <chr>, adr <dbl>, required_car_parking_spaces <dbl>, …
Columns in the dataset
colnames(data)
[1] "hotel" "is_canceled"
[3] "lead_time" "arrival_date_year"
[5] "arrival_date_month" "arrival_date_week_number"
[7] "arrival_date_day_of_month" "stays_in_weekend_nights"
[9] "stays_in_week_nights" "adults"
[11] "children" "babies"
[13] "meal" "country"
[15] "market_segment" "distribution_channel"
[17] "is_repeated_guest" "previous_cancellations"
[19] "previous_bookings_not_canceled" "reserved_room_type"
[21] "assigned_room_type" "booking_changes"
[23] "deposit_type" "agent"
[25] "company" "days_in_waiting_list"
[27] "customer_type" "adr"
[29] "required_car_parking_spaces" "total_of_special_requests"
[31] "reservation_status" "reservation_status_date"
The Dimensions of the dataset is as follows:
dim(data)
[1] 119390 32
From the above cells we can see that the dataset has 32 columns of which 3 columns only depict the arrival date this can be brought down to one column and that will reduce the dimensionality of the dataset to (119390, 30).
Tidying the dataset
The month column in the dataset is as follows:
head(data["arrival_date_month"])
# A tibble: 6 × 1
arrival_date_month
<chr>
1 July
2 July
3 July
4 July
5 July
6 July
This can be changed to numerical data so that the date can be represented as a string of numerical values depicting month, day and year. The code to mutate the data is as follows:
<- data %>%
mutated_data mutate(arrival_date_month = case_when(
== "January" ~ 1,
arrival_date_month == "Febuary" ~ 2,
arrival_date_month == "March" ~ 3,
arrival_date_month == "April" ~ 4,
arrival_date_month == "May" ~ 5,
arrival_date_month == "June" ~ 6,
arrival_date_month == "July" ~ 7,
arrival_date_month == "August" ~ 8,
arrival_date_month == "September" ~ 9,
arrival_date_month == "October" ~ 10,
arrival_date_month == "November" ~ 11,
arrival_date_month == "December" ~ 12
arrival_date_month
))head(mutated_data)
# A tibble: 6 × 32
hotel is_ca…¹ lead_…² arriv…³ arriv…⁴ arriv…⁵ arriv…⁶ stays…⁷ stays…⁸ adults
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Resort… 0 342 2015 7 27 1 0 0 2
2 Resort… 0 737 2015 7 27 1 0 0 2
3 Resort… 0 7 2015 7 27 1 0 1 1
4 Resort… 0 13 2015 7 27 1 0 1 1
5 Resort… 0 14 2015 7 27 1 0 2 2
6 Resort… 0 14 2015 7 27 1 0 2 2
# … with 22 more variables: children <dbl>, babies <dbl>, meal <chr>,
# country <chr>, market_segment <chr>, distribution_channel <chr>,
# is_repeated_guest <dbl>, previous_cancellations <dbl>,
# previous_bookings_not_canceled <dbl>, reserved_room_type <chr>,
# assigned_room_type <chr>, booking_changes <dbl>, deposit_type <chr>,
# agent <chr>, company <chr>, days_in_waiting_list <dbl>,
# customer_type <chr>, adr <dbl>, required_car_parking_spaces <dbl>, …
Variables to be mutated
Now we can mutate all the three columns arrival_date_year, arrival_date_month, arrival_date_day_of_month to a single column arrival_date
<- mutated_data %>%
final_data mutate(
arrival_date = make_date(arrival_date_year, arrival_date_month, arrival_date_day_of_month)
%>%
) select(-c(arrival_date_day_of_month, arrival_date_month, arrival_date_year))
head(final_data)
# A tibble: 6 × 30
hotel is_ca…¹ lead_…² arriv…³ stays…⁴ stays…⁵ adults child…⁶ babies meal
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 Resort Ho… 0 342 27 0 0 2 0 0 BB
2 Resort Ho… 0 737 27 0 0 2 0 0 BB
3 Resort Ho… 0 7 27 0 1 1 0 0 BB
4 Resort Ho… 0 13 27 0 1 1 0 0 BB
5 Resort Ho… 0 14 27 0 2 2 0 0 BB
6 Resort Ho… 0 14 27 0 2 2 0 0 BB
# … with 20 more variables: country <chr>, market_segment <chr>,
# distribution_channel <chr>, is_repeated_guest <dbl>,
# previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>,
# reserved_room_type <chr>, assigned_room_type <chr>, booking_changes <dbl>,
# deposit_type <chr>, agent <chr>, company <chr>, days_in_waiting_list <dbl>,
# customer_type <chr>, adr <dbl>, required_car_parking_spaces <dbl>,
# total_of_special_requests <dbl>, reservation_status <chr>, …
The dimensions of the new modified dataset is as follows:
dim(final_data)
[1] 119390 30
Therefore we achieved the desired target and reduced the divided arrival date data into a single column.