Challenge 6
Pradhakshya Dhanakumar
Hotel Bookings
ggplot2
Author

Pradhakshya Dhanakumar

Published

May 5, 2023

Code
library(tidyverse)
library(dplyr)
# Set the default knitr chunk option so each chunk's source code is echoed
# in the rendered document.
knitr::opts_chunk$set(echo = TRUE)

Reading Data

Code
# Load the hotel bookings CSV into a data frame.
df <- utils::read.csv(file = "_data/hotel_bookings.csv")
Code
# Preview the first six rows.
head(df, n = 6L)
         hotel is_canceled lead_time arrival_date_year arrival_date_month
1 Resort Hotel           0       342              2015               July
2 Resort Hotel           0       737              2015               July
3 Resort Hotel           0         7              2015               July
4 Resort Hotel           0        13              2015               July
5 Resort Hotel           0        14              2015               July
6 Resort Hotel           0        14              2015               July
  arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights
1                       27                         1                       0
2                       27                         1                       0
3                       27                         1                       0
4                       27                         1                       0
5                       27                         1                       0
6                       27                         1                       0
  stays_in_week_nights adults children babies meal country market_segment
1                    0      2        0      0   BB     PRT         Direct
2                    0      2        0      0   BB     PRT         Direct
3                    1      1        0      0   BB     GBR         Direct
4                    1      1        0      0   BB     GBR      Corporate
5                    2      2        0      0   BB     GBR      Online TA
6                    2      2        0      0   BB     GBR      Online TA
  distribution_channel is_repeated_guest previous_cancellations
1               Direct                 0                      0
2               Direct                 0                      0
3               Direct                 0                      0
4            Corporate                 0                      0
5                TA/TO                 0                      0
6                TA/TO                 0                      0
  previous_bookings_not_canceled reserved_room_type assigned_room_type
1                              0                  C                  C
2                              0                  C                  C
3                              0                  A                  C
4                              0                  A                  A
5                              0                  A                  A
6                              0                  A                  A
  booking_changes deposit_type agent company days_in_waiting_list customer_type
1               3   No Deposit  NULL    NULL                    0     Transient
2               4   No Deposit  NULL    NULL                    0     Transient
3               0   No Deposit  NULL    NULL                    0     Transient
4               0   No Deposit   304    NULL                    0     Transient
5               0   No Deposit   240    NULL                    0     Transient
6               0   No Deposit   240    NULL                    0     Transient
  adr required_car_parking_spaces total_of_special_requests reservation_status
1   0                           0                         0          Check-Out
2   0                           0                         0          Check-Out
3  75                           0                         0          Check-Out
4  75                           0                         0          Check-Out
5  98                           0                         1          Check-Out
6  98                           0                         1          Check-Out
  reservation_status_date
1              2015-07-01
2              2015-07-01
3              2015-07-02
4              2015-07-02
5              2015-07-03
6              2015-07-03

Describe Data

Code
# List the column names of the data frame.
names(df)
 [1] "hotel"                          "is_canceled"                   
 [3] "lead_time"                      "arrival_date_year"             
 [5] "arrival_date_month"             "arrival_date_week_number"      
 [7] "arrival_date_day_of_month"      "stays_in_weekend_nights"       
 [9] "stays_in_week_nights"           "adults"                        
[11] "children"                       "babies"                        
[13] "meal"                           "country"                       
[15] "market_segment"                 "distribution_channel"          
[17] "is_repeated_guest"              "previous_cancellations"        
[19] "previous_bookings_not_canceled" "reserved_room_type"            
[21] "assigned_room_type"             "booking_changes"               
[23] "deposit_type"                   "agent"                         
[25] "company"                        "days_in_waiting_list"          
[27] "customer_type"                  "adr"                           
[29] "required_car_parking_spaces"    "total_of_special_requests"     
[31] "reservation_status"             "reservation_status_date"       
Code
# Report the number of rows and columns.
c(nrow(df), ncol(df))
[1] 119390     32

Tidying the data

Code
# Drop every row that contains at least one NA.
# NOTE(review): the "NULL" text shown in `agent` / `company` is presumably
# read as an ordinary string, not NA, so those rows are kept -- confirm.
df <- stats::na.omit(df)

Time dependent Visualization

Code
library(ggplot2)

The resulting plot shows the total number of bookings for each month and hotel combination in the year 2015, built with the filter, group_by, summarise and ggplot functions. The bars are color-coded by hotel (City Hotel or Resort Hotel).

Code
# Count 2015 bookings for every arrival month / hotel pair and plot them as
# side-by-side bars, one colour per hotel.
df %>%
  filter(arrival_date_year == 2015) %>%
  # Month names are text, so without explicit levels the x-axis would sort
  # alphabetically; month.name puts them in calendar order.
  mutate(
    arrival_date_month = factor(arrival_date_month, levels = month.name)
  ) %>%
  group_by(arrival_date_month, hotel) %>%
  summarise(total_bookings = n()) %>%
  ggplot(aes(x = arrival_date_month, y = total_bookings, fill = hotel)) +
  geom_col(position = "dodge") +
  labs(x = "Month", y = "Total Bookings", title = "Monthly bookings by hotel") +
  # Naming the colours ties each one to its hotel regardless of level order.
  scale_fill_manual(
    values = c("City Hotel" = "#4e79a7", "Resort Hotel" = "#f28e2c"),
    name = "Hotel"
  )
`summarise()` has grouped output by 'arrival_date_month'. You can override
using the `.groups` argument.

Visualizing Part-Whole Relationships

This creates a bar chart showing the total number of bookings for each arrival month. Each bar is filled on a green-to-red gradient according to the number of cancelled bookings in that month, so months with few cancellations appear green and months with many appear red. The legend shows the range of cancelled bookings that the gradient covers.

Code
# Per arrival month, count all bookings and the cancelled ones, then draw one
# bar per month whose height is the total and whose fill colour encodes the
# number of cancellations (green = few, red = many).
df %>%
  # Month names are text, so without explicit levels the x-axis would sort
  # alphabetically; month.name puts them in calendar order.
  mutate(
    arrival_date_month = factor(arrival_date_month, levels = month.name)
  ) %>%
  group_by(arrival_date_month) %>%
  summarize(
    total_bookings = n(),
    cancelled_bookings = sum(is_canceled)
  ) %>%
  ggplot(
    aes(x = arrival_date_month, y = total_bookings, fill = cancelled_bookings)
  ) +
  geom_col() +
  scale_fill_gradient(low = "green", high = "red") +
  labs(
    title = "Total Bookings and Cancelled Bookings by Month",
    x = "Month of Arrival",
    y = "Total Bookings",
    fill = "Cancelled Bookings"
  ) +
  theme_minimal()