Global Hotel Bookings: 2015-2017

challenge_2
Michele Carlin
hotel_bookings
Author

Michele Carlin

Published

February 28, 2023

Code
library(tidyverse)

knitr::opts_chunk$set(echo = TRUE)
install.packages(summarytools)
Error in install.packages(summarytools): object 'summarytools' not found
Code
library(summarytools)
install.packages(dplyr)
Error in install.packages(dplyr): object 'dplyr' not found
Code
library(dplyr)

Read in and view ‘hotel bookings’ dataset.

Code
library(readr)
hotel_bookings <- read_csv ("_data/hotel_bookings.csv")
Rows: 119390 Columns: 32
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (13): hotel, arrival_date_month, meal, country, market_segment, distrib...
dbl  (18): is_canceled, lead_time, arrival_date_year, arrival_date_week_numb...
date  (1): reservation_status_date

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
View(hotel_bookings)

List of variables in dataset.

Code
str(hotel_bookings)
spc_tbl_ [119,390 × 32] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ hotel                         : chr [1:119390] "Resort Hotel" "Resort Hotel" "Resort Hotel" "Resort Hotel" ...
 $ is_canceled                   : num [1:119390] 0 0 0 0 0 0 0 0 1 1 ...
 $ lead_time                     : num [1:119390] 342 737 7 13 14 14 0 9 85 75 ...
 $ arrival_date_year             : num [1:119390] 2015 2015 2015 2015 2015 ...
 $ arrival_date_month            : chr [1:119390] "July" "July" "July" "July" ...
 $ arrival_date_week_number      : num [1:119390] 27 27 27 27 27 27 27 27 27 27 ...
 $ arrival_date_day_of_month     : num [1:119390] 1 1 1 1 1 1 1 1 1 1 ...
 $ stays_in_weekend_nights       : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
 $ stays_in_week_nights          : num [1:119390] 0 0 1 1 2 2 2 2 3 3 ...
 $ adults                        : num [1:119390] 2 2 1 1 2 2 2 2 2 2 ...
 $ children                      : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
 $ babies                        : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
 $ meal                          : chr [1:119390] "BB" "BB" "BB" "BB" ...
 $ country                       : chr [1:119390] "PRT" "PRT" "GBR" "GBR" ...
 $ market_segment                : chr [1:119390] "Direct" "Direct" "Direct" "Corporate" ...
 $ distribution_channel          : chr [1:119390] "Direct" "Direct" "Direct" "Corporate" ...
 $ is_repeated_guest             : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
 $ previous_cancellations        : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
 $ previous_bookings_not_canceled: num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
 $ reserved_room_type            : chr [1:119390] "C" "C" "A" "A" ...
 $ assigned_room_type            : chr [1:119390] "C" "C" "C" "A" ...
 $ booking_changes               : num [1:119390] 3 4 0 0 0 0 0 0 0 0 ...
 $ deposit_type                  : chr [1:119390] "No Deposit" "No Deposit" "No Deposit" "No Deposit" ...
 $ agent                         : chr [1:119390] "NULL" "NULL" "NULL" "304" ...
 $ company                       : chr [1:119390] "NULL" "NULL" "NULL" "NULL" ...
 $ days_in_waiting_list          : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
 $ customer_type                 : chr [1:119390] "Transient" "Transient" "Transient" "Transient" ...
 $ adr                           : num [1:119390] 0 0 75 75 98 ...
 $ required_car_parking_spaces   : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
 $ total_of_special_requests     : num [1:119390] 0 0 0 0 1 1 0 1 1 0 ...
 $ reservation_status            : chr [1:119390] "Check-Out" "Check-Out" "Check-Out" "Check-Out" ...
 $ reservation_status_date       : Date[1:119390], format: "2015-07-01" "2015-07-01" ...
 - attr(*, "spec")=
  .. cols(
  ..   hotel = col_character(),
  ..   is_canceled = col_double(),
  ..   lead_time = col_double(),
  ..   arrival_date_year = col_double(),
  ..   arrival_date_month = col_character(),
  ..   arrival_date_week_number = col_double(),
  ..   arrival_date_day_of_month = col_double(),
  ..   stays_in_weekend_nights = col_double(),
  ..   stays_in_week_nights = col_double(),
  ..   adults = col_double(),
  ..   children = col_double(),
  ..   babies = col_double(),
  ..   meal = col_character(),
  ..   country = col_character(),
  ..   market_segment = col_character(),
  ..   distribution_channel = col_character(),
  ..   is_repeated_guest = col_double(),
  ..   previous_cancellations = col_double(),
  ..   previous_bookings_not_canceled = col_double(),
  ..   reserved_room_type = col_character(),
  ..   assigned_room_type = col_character(),
  ..   booking_changes = col_double(),
  ..   deposit_type = col_character(),
  ..   agent = col_character(),
  ..   company = col_character(),
  ..   days_in_waiting_list = col_double(),
  ..   customer_type = col_character(),
  ..   adr = col_double(),
  ..   required_car_parking_spaces = col_double(),
  ..   total_of_special_requests = col_double(),
  ..   reservation_status = col_character(),
  ..   reservation_status_date = col_date(format = "")
  .. )
 - attr(*, "problems")=<externalptr> 

dfSummary

Code
view(dfSummary(hotel_bookings))
Switching method to 'browser'
Output file written: C:\Users\CarlinML\AppData\Local\Temp\RtmpszR3TJ\file41c471ac4ea0.html

Describing dataset

The ‘hotel_bookings’ dataset contains 119,390 rows and 32 columns. Each observation is that of a single hotel booking between 2015 and 2017 and provides information on type of resort, arrival date, number of nights/people, room type, country, market segment, etc.

#Create new variable for total number of guests.

Code
hotel_bookings <- hotel_bookings %>%
    mutate (guests = adults + children + babies)
select(hotel_bookings, "adults", "children", "babies", "guests")
# A tibble: 119,390 × 4
   adults children babies guests
    <dbl>    <dbl>  <dbl>  <dbl>
 1      2        0      0      2
 2      2        0      0      2
 3      1        0      0      1
 4      1        0      0      1
 5      2        0      0      2
 6      2        0      0      2
 7      2        0      0      2
 8      2        0      0      2
 9      2        0      0      2
10      2        0      0      2
# … with 119,380 more rows

Create a new ‘season’ variable using ‘case_when’.

Code
hotel_bookings <- hotel_bookings %>%
  mutate(season = case_when(
         arrival_date_month == "December" | arrival_date_month == "January" | arrival_date_month == "February" ~ "Winter",
        arrival_date_month == "March" | arrival_date_month == "April" | arrival_date_month == "May" ~ "Spring",
        arrival_date_month == "June" | arrival_date_month == "July" | arrival_date_month == "August" ~ "Summer",
       arrival_date_month == "September" | arrival_date_month == "October" | arrival_date_month == "November" ~ "Fall")
  )
table(select(hotel_bookings, season))
season
  Fall Spring Summer Winter 
 28462  32674  37477  20777 

Create crosstabs of Total number of guests by Season. Summer bookings tend to have higher guest totals possibly because families travel more when kids are on summer break.

Code
xtabs(~ guests + season, hotel_bookings)
      season
guests  Fall Spring Summer Winter
    0     44     48     43     45
    1   6438   6563   4279   5301
    2  20043  22794  25999  13212
    3   1395   2364   5112   1623
    4    517    880   1970    562
    5     10     25     69     33
    6      1      0      0      0
    10     2      0      0      0
    12     0      0      1      1
    20     2      0      0      0
    26     5      0      0      0
    27     2      0      0      0
    40     1      0      0      0
    50     1      0      0      0
    55     1      0      0      0

Calculate min/max/mean average daily rate (adr) and group data by hotel type and arrival month.

Code
GrpByHotelType <- hotel_bookings %>%
    group_by(hotel, arrival_date_month) %>%
    summarise(minADR = min(adr), maxADR = max(adr), meanADR = mean(adr)) 
`summarise()` has grouped output by 'hotel'. You can override using the
`.groups` argument.
Code
tail(GrpByHotelType)
# A tibble: 6 × 5
# Groups:   hotel [1]
  hotel        arrival_date_month minADR maxADR meanADR
  <chr>        <chr>               <dbl>  <dbl>   <dbl>
1 Resort Hotel June                 0      319.   110. 
2 Resort Hotel March               -6.38   194.    57.5
3 Resort Hotel May                  0      226.    78.8
4 Resort Hotel November             0      175     48.3
5 Resort Hotel October              0      246.    62.1
6 Resort Hotel September            0      308.    93.3

Graph adr by type of hotel and arrival month.

Code
ggplot(hotel_bookings, aes(fill=hotel, x=arrival_date_month, y=adr)) + 
  geom_bar(position="dodge", stat="identity")

Sort data to find outlier(s).

Code
hotel_bookings %>%
  arrange(desc(adr)) %>%
  select(hotel, adr)
# A tibble: 119,390 × 2
   hotel          adr
   <chr>        <dbl>
 1 City Hotel   5400 
 2 City Hotel    510 
 3 Resort Hotel  508 
 4 City Hotel    452.
 5 Resort Hotel  450 
 6 Resort Hotel  437 
 7 Resort Hotel  426.
 8 Resort Hotel  402 
 9 Resort Hotel  397.
10 Resort Hotel  392 
# … with 119,380 more rows

One hotel had an average daily rate of -6.4 and another of 5400 (the next highest adr is 510).Re-run bar chart with these two outliers excluded, summarize by season instead of month, and add title/axis labels.

Code
hotel_bookings %>%
  filter(adr > 0 & adr < 600) %>%
  ggplot(aes(fill=hotel, x=season, y=adr)) + 
  geom_bar(position="dodge", stat="identity") +
   labs(title = "Hotel Average Daily Rate by Type of Hotel and Season", y = "Avg Daily Rate", x = "Season")

Graph adr by market segment and type of hotel.

Code
hotel_bookings %>%
  filter(adr > 0 & adr < 600) %>%
  ggplot(aes(fill=hotel, x=market_segment, y=adr)) + 
  geom_bar(position="dodge", stat="identity") +
     labs(title = "Hotel Average Daily Rate by Type of Hotel and Market Segment", y = "Avg Daily Rate", x = "Market Segment") +
  scale_x_discrete(guide = guide_axis(n.dodge=2))

Aviation and Undefined don’t appear to have any Resort Hotel data. Run crosstabs to verify.

How do I leave empty space for teal column instead of red taking over width of both columns?

Code
xtabs(~ market_segment + hotel, hotel_bookings)
               hotel
market_segment  City Hotel Resort Hotel
  Aviation             237            0
  Complementary        542          201
  Corporate           2986         2309
  Direct              6093         6513
  Groups             13975         5836
  Offline TA/TO      16747         7472
  Online TA          38748        17729
  Undefined              2            0