Homework-3

hw2

hotel_bookings

Author

Siddharth Nammara Kalyana Raman

Published

December 17, 2022

Code

#load the libraries
library(tidyverse)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
✔ tibble  3.1.8      ✔ dplyr   1.0.10
✔ tidyr   1.2.1      ✔ stringr 1.5.0 
✔ readr   2.1.3      ✔ forcats 0.5.2

Warning: package 'ggplot2' was built under R version 4.2.2

Warning: package 'tibble' was built under R version 4.2.2

Warning: package 'stringr' was built under R version 4.2.2

── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()

Code

library(psych)


Attaching package: 'psych'

The following objects are masked from 'package:ggplot2':

    %+%, alpha

Code

library(summarytools)

Warning: package 'summarytools' was built under R version 4.2.2


Attaching package: 'summarytools'

The following object is masked from 'package:tibble':

    view

Code

knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)

Read Data

Code

#read data

hotel.bookings <- read.csv("_data/hotel_bookings.csv")

Describe Data

Code

#checking the dimensions of the data
dim(hotel.bookings)

[1] 119390     32

Changing the column name to make it more readable

Code

colnames(hotel.bookings)

 [1] "hotel"                          "is_canceled"                   
 [3] "lead_time"                      "arrival_date_year"             
 [5] "arrival_date_month"             "arrival_date_week_number"      
 [7] "arrival_date_day_of_month"      "stays_in_weekend_nights"       
 [9] "stays_in_week_nights"           "adults"                        
[11] "children"                       "babies"                        
[13] "meal"                           "country"                       
[15] "market_segment"                 "distribution_channel"          
[17] "is_repeated_guest"              "previous_cancellations"        
[19] "previous_bookings_not_canceled" "reserved_room_type"            
[21] "assigned_room_type"             "booking_changes"               
[23] "deposit_type"                   "agent"                         
[25] "company"                        "days_in_waiting_list"          
[27] "customer_type"                  "adr"                           
[29] "required_car_parking_spaces"    "total_of_special_requests"     
[31] "reservation_status"             "reservation_status_date"

Code

colnames(hotel.bookings)[28] <- "average_daily_rate"

view first few rows of the data

Code

head(hotel.bookings)

         hotel is_canceled lead_time arrival_date_year arrival_date_month
1 Resort Hotel           0       342              2015               July
2 Resort Hotel           0       737              2015               July
3 Resort Hotel           0         7              2015               July
4 Resort Hotel           0        13              2015               July
5 Resort Hotel           0        14              2015               July
6 Resort Hotel           0        14              2015               July
  arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights
1                       27                         1                       0
2                       27                         1                       0
3                       27                         1                       0
4                       27                         1                       0
5                       27                         1                       0
6                       27                         1                       0
  stays_in_week_nights adults children babies meal country market_segment
1                    0      2        0      0   BB     PRT         Direct
2                    0      2        0      0   BB     PRT         Direct
3                    1      1        0      0   BB     GBR         Direct
4                    1      1        0      0   BB     GBR      Corporate
5                    2      2        0      0   BB     GBR      Online TA
6                    2      2        0      0   BB     GBR      Online TA
  distribution_channel is_repeated_guest previous_cancellations
1               Direct                 0                      0
2               Direct                 0                      0
3               Direct                 0                      0
4            Corporate                 0                      0
5                TA/TO                 0                      0
6                TA/TO                 0                      0
  previous_bookings_not_canceled reserved_room_type assigned_room_type
1                              0                  C                  C
2                              0                  C                  C
3                              0                  A                  C
4                              0                  A                  A
5                              0                  A                  A
6                              0                  A                  A
  booking_changes deposit_type agent company days_in_waiting_list customer_type
1               3   No Deposit  NULL    NULL                    0     Transient
2               4   No Deposit  NULL    NULL                    0     Transient
3               0   No Deposit  NULL    NULL                    0     Transient
4               0   No Deposit   304    NULL                    0     Transient
5               0   No Deposit   240    NULL                    0     Transient
6               0   No Deposit   240    NULL                    0     Transient
  average_daily_rate required_car_parking_spaces total_of_special_requests
1                  0                           0                         0
2                  0                           0                         0
3                 75                           0                         0
4                 75                           0                         0
5                 98                           0                         1
6                 98                           0                         1
  reservation_status reservation_status_date
1          Check-Out              2015-07-01
2          Check-Out              2015-07-01
3          Check-Out              2015-07-02
4          Check-Out              2015-07-02
5          Check-Out              2015-07-03
6          Check-Out              2015-07-03

Checking the characted data types

Code

table(sapply(hotel.bookings, function(x) typeof(x)))


character    double   integer 
       14         1        17

Code

sapply(
  hotel.bookings, function(x) n_distinct(x)
  )

                         hotel                    is_canceled 
                             2                              2 
                     lead_time              arrival_date_year 
                           479                              3 
            arrival_date_month       arrival_date_week_number 
                            12                             53 
     arrival_date_day_of_month        stays_in_weekend_nights 
                            31                             17 
          stays_in_week_nights                         adults 
                            35                             14 
                      children                         babies 
                             6                              5 
                          meal                        country 
                             5                            178 
                market_segment           distribution_channel 
                             8                              5 
             is_repeated_guest         previous_cancellations 
                             2                             15 
previous_bookings_not_canceled             reserved_room_type 
                            73                             10 
            assigned_room_type                booking_changes 
                            12                             21 
                  deposit_type                          agent 
                             3                            334 
                       company           days_in_waiting_list 
                           353                            128 
                 customer_type             average_daily_rate 
                             4                           8879 
   required_car_parking_spaces      total_of_special_requests 
                             5                              6 
            reservation_status        reservation_status_date 
                             3                            926

finding the unique values for each column

Code

unique(hotel.bookings$hotel)

[1] "Resort Hotel" "City Hotel"

Code

unique(hotel.bookings$arrival_date_year)

[1] 2015 2016 2017

Code

unique(hotel.bookings$reservation_status)

[1] "Check-Out" "Canceled"  "No-Show"

Code

unique(hotel.bookings$distribution_channel)

[1] "Direct"    "Corporate" "TA/TO"     "Undefined" "GDS"

Code

unique(hotel.bookings$customer_type)

[1] "Transient"       "Contract"        "Transient-Party" "Group"

There are 119,390 observations and 32 variables in this dataset (rows). We can infer from the variable names that it contains hotel reservation information. 18 variables include numerical values, whereas 14 variables contain character values. Nevertheless, a few character variables and a few numerical variables genuinely represent dates. Except for “adr,” all of the variable names are sufficiently descriptive. The term “adr” stands for “average daily rate.” Therefore, I substituted “average daily rate” for the original column name “adr.”

The dataset includes two different kinds of hotels: resort hotels and city hotels. Bookings are made for the years 2015, 2016 and 2017. The information includes reservations from 178 nations. Thus, the information should belong to a significant global network of hotels. The dataset includes no-shows as well as completed and canceled reservations. Therefore, each observation contains data on a reservation, such as the hotel category and the country for which the reservation is made, the number of visitors, the dates, the daily rates, the lengths of stays, and some categorical information about the customer and the reservation channel. Each row and each column in the dataset correspond to an observation. Thus, pivoting is not required.

Code

hotel.bookings <- mutate(
  hotel.bookings, 
  number_of_guests = adults + children + babies,
  total_stay = stays_in_weekend_nights + stays_in_week_nights) # adding two new variables

Code

print(dfSummary(hotel.bookings, 
                varnumbers= FALSE, 
                plain.ascii= FALSE, 
                style= "grid", 
                graph.magnif= 0.80, 
                valid.col= TRUE),
      method= 'render', 
      table.classes= 'table-condensed')

Data Frame Summary

hotel.bookings

Dimensions: 119390 x 34
Duplicates: 31994

Variable

Stats / Values

Freqs (% of Valid)

Graph

Valid

Missing

hotel [character]

1. City Hotel

2. Resort Hotel

79330	(	66.4%	)
40060	(	33.6%	)

119390 (100.0%)

0 (0.0%)

is_canceled [integer]

Min : 0

Mean : 0.4

Max : 1

0	:	75166	(	63.0%	)
1	:	44224	(	37.0%	)

119390 (100.0%)

0 (0.0%)

lead_time [integer]

Mean (sd) : 104 (106.9)

min ≤ med ≤ max:

0 ≤ 69 ≤ 737

IQR (CV) : 142 (1)

479 distinct values

119390 (100.0%)

0 (0.0%)

arrival_date_year [integer]

Mean (sd) : 2016.2 (0.7)

min ≤ med ≤ max:

2015 ≤ 2016 ≤ 2017

IQR (CV) : 1 (0)

2015	:	21996	(	18.4%	)
2016	:	56707	(	47.5%	)
2017	:	40687	(	34.1%	)

119390 (100.0%)

0 (0.0%)

arrival_date_month [character]

1. August

2. July

3. May

4. October

5. April

6. June

7. September

8. March

9. February

10. November

[ 2 others ]

13877	(	11.6%	)
12661	(	10.6%	)
11791	(	9.9%	)
11160	(	9.3%	)
11089	(	9.3%	)
10939	(	9.2%	)
10508	(	8.8%	)
9794	(	8.2%	)
8068	(	6.8%	)
6794	(	5.7%	)
12709	(	10.6%	)

119390 (100.0%)

0 (0.0%)

arrival_date_week_number [integer]

Mean (sd) : 27.2 (13.6)

min ≤ med ≤ max:

1 ≤ 28 ≤ 53

IQR (CV) : 22 (0.5)

53 distinct values

119390 (100.0%)

0 (0.0%)

arrival_date_day_of_month [integer]

Mean (sd) : 15.8 (8.8)

min ≤ med ≤ max:

1 ≤ 16 ≤ 31

IQR (CV) : 15 (0.6)

31 distinct values

119390 (100.0%)

0 (0.0%)

stays_in_weekend_nights [integer]

Mean (sd) : 0.9 (1)

min ≤ med ≤ max:

0 ≤ 1 ≤ 19

IQR (CV) : 2 (1.1)

17 distinct values

119390 (100.0%)

0 (0.0%)

stays_in_week_nights [integer]

Mean (sd) : 2.5 (1.9)

min ≤ med ≤ max:

0 ≤ 2 ≤ 50

IQR (CV) : 2 (0.8)

35 distinct values

119390 (100.0%)

0 (0.0%)

adults [integer]

Mean (sd) : 1.9 (0.6)

min ≤ med ≤ max:

0 ≤ 2 ≤ 55

IQR (CV) : 0 (0.3)

14 distinct values

119390 (100.0%)

0 (0.0%)

children [integer]

Mean (sd) : 0.1 (0.4)

min ≤ med ≤ max:

0 ≤ 0 ≤ 10

IQR (CV) : 0 (3.8)

0	:	110796	(	92.8%	)
1	:	4861	(	4.1%	)
2	:	3652	(	3.1%	)
3	:	76	(	0.1%	)
10	:	1	(	0.0%	)

119386 (100.0%)

4 (0.0%)

babies [integer]

Mean (sd) : 0 (0.1)

min ≤ med ≤ max:

0 ≤ 0 ≤ 10

IQR (CV) : 0 (12.3)

0	:	118473	(	99.2%	)
1	:	900	(	0.8%	)
2	:	15	(	0.0%	)
9	:	1	(	0.0%	)
10	:	1	(	0.0%	)

119390 (100.0%)

0 (0.0%)

meal [character]

1. BB

2. FB

3. HB

4. SC

5. Undefined

92310	(	77.3%	)
798	(	0.7%	)
14463	(	12.1%	)
10650	(	8.9%	)
1169	(	1.0%	)

119390 (100.0%)

0 (0.0%)

country [character]

1. PRT

2. GBR

3. FRA

4. ESP

5. DEU

6. ITA

7. IRL

8. BEL

9. BRA

10. NLD

[ 168 others ]

48590	(	40.7%	)
12129	(	10.2%	)
10415	(	8.7%	)
8568	(	7.2%	)
7287	(	6.1%	)
3766	(	3.2%	)
3375	(	2.8%	)
2342	(	2.0%	)
2224	(	1.9%	)
2104	(	1.8%	)
18590	(	15.6%	)

119390 (100.0%)

0 (0.0%)

market_segment [character]

1. Aviation

2. Complementary

3. Corporate

4. Direct

5. Groups

6. Offline TA/TO

7. Online TA

8. Undefined

237	(	0.2%	)
743	(	0.6%	)
5295	(	4.4%	)
12606	(	10.6%	)
19811	(	16.6%	)
24219	(	20.3%	)
56477	(	47.3%	)
2	(	0.0%	)

119390 (100.0%)

0 (0.0%)

distribution_channel [character]

1. Corporate

2. Direct

3. GDS

4. TA/TO

5. Undefined

6677	(	5.6%	)
14645	(	12.3%	)
193	(	0.2%	)
97870	(	82.0%	)
5	(	0.0%	)

119390 (100.0%)

0 (0.0%)

is_repeated_guest [integer]

Min : 0

Mean : 0

Max : 1

0	:	115580	(	96.8%	)
1	:	3810	(	3.2%	)

119390 (100.0%)

0 (0.0%)

previous_cancellations [integer]

Mean (sd) : 0.1 (0.8)

min ≤ med ≤ max:

0 ≤ 0 ≤ 26

IQR (CV) : 0 (9.7)

15 distinct values

119390 (100.0%)

0 (0.0%)

previous_bookings_not_canceled [integer]

Mean (sd) : 0.1 (1.5)

min ≤ med ≤ max:

0 ≤ 0 ≤ 72

IQR (CV) : 0 (10.9)

73 distinct values

119390 (100.0%)

0 (0.0%)

reserved_room_type [character]

1. A

2. B

3. C

4. D

5. E

6. F

7. G

8. H

9. L

10. P

85994	(	72.0%	)
1118	(	0.9%	)
932	(	0.8%	)
19201	(	16.1%	)
6535	(	5.5%	)
2897	(	2.4%	)
2094	(	1.8%	)
601	(	0.5%	)
6	(	0.0%	)
12	(	0.0%	)

119390 (100.0%)

0 (0.0%)

assigned_room_type [character]

1. A

2. D

3. E

4. F

5. G

6. C

7. B

8. H

9. I

10. K

[ 2 others ]

74053	(	62.0%	)
25322	(	21.2%	)
7806	(	6.5%	)
3751	(	3.1%	)
2553	(	2.1%	)
2375	(	2.0%	)
2163	(	1.8%	)
712	(	0.6%	)
363	(	0.3%	)
279	(	0.2%	)
13	(	0.0%	)

119390 (100.0%)

0 (0.0%)

booking_changes [integer]

Mean (sd) : 0.2 (0.7)

min ≤ med ≤ max:

0 ≤ 0 ≤ 21

IQR (CV) : 0 (2.9)

21 distinct values

119390 (100.0%)

0 (0.0%)

deposit_type [character]

1. No Deposit

2. Non Refund

3. Refundable

104641	(	87.6%	)
14587	(	12.2%	)
162	(	0.1%	)

119390 (100.0%)

0 (0.0%)

agent [character]

1. 9
2. NULL
3. 240
4. 1
5. 14
6. 7
7. 6
8. 250
9. 241
10. 28
[ 324 others ]

31961	(	26.8%	)
16340	(	13.7%	)
13922	(	11.7%	)
7191	(	6.0%	)
3640	(	3.0%	)
3539	(	3.0%	)
3290	(	2.8%	)
2870	(	2.4%	)
1721	(	1.4%	)
1666	(	1.4%	)
33250	(	27.8%	)

119390 (100.0%)

0 (0.0%)

company [character]

1. NULL
2. 40
3. 223
4. 67
5. 45
6. 153
7. 174
8. 219
9. 281
10. 154
[ 343 others ]

112593	(	94.3%	)
927	(	0.8%	)
784	(	0.7%	)
267	(	0.2%	)
250	(	0.2%	)
215	(	0.2%	)
149	(	0.1%	)
141	(	0.1%	)
138	(	0.1%	)
133	(	0.1%	)
3793	(	3.2%	)

119390 (100.0%)

0 (0.0%)

days_in_waiting_list [integer]

Mean (sd) : 2.3 (17.6)

min ≤ med ≤ max:

0 ≤ 0 ≤ 391

IQR (CV) : 0 (7.6)

128 distinct values

119390 (100.0%)

0 (0.0%)

customer_type [character]

1. Contract

2. Group

3. Transient

4. Transient-Party

4076	(	3.4%	)
577	(	0.5%	)
89613	(	75.1%	)
25124	(	21.0%	)

119390 (100.0%)

0 (0.0%)

average_daily_rate [numeric]

Mean (sd) : 101.8 (50.5)

min ≤ med ≤ max:

-6.4 ≤ 94.6 ≤ 5400

IQR (CV) : 56.7 (0.5)

8879 distinct values

119390 (100.0%)

0 (0.0%)

required_car_parking_spaces [integer]

Mean (sd) : 0.1 (0.2)

min ≤ med ≤ max:

0 ≤ 0 ≤ 8

IQR (CV) : 0 (3.9)

0	:	111974	(	93.8%	)
1	:	7383	(	6.2%	)
2	:	28	(	0.0%	)
3	:	3	(	0.0%	)
8	:	2	(	0.0%	)

119390 (100.0%)

0 (0.0%)

total_of_special_requests [integer]

Mean (sd) : 0.6 (0.8)

min ≤ med ≤ max:

0 ≤ 0 ≤ 5

IQR (CV) : 1 (1.4)

0	:	70318	(	58.9%	)
1	:	33226	(	27.8%	)
2	:	12969	(	10.9%	)
3	:	2497	(	2.1%	)
4	:	340	(	0.3%	)
5	:	40	(	0.0%	)

119390 (100.0%)

0 (0.0%)

reservation_status [character]

1. Canceled

2. Check-Out

3. No-Show

43017	(	36.0%	)
75166	(	63.0%	)
1207	(	1.0%	)

119390 (100.0%)

0 (0.0%)

reservation_status_date [character]

1. 2015-10-21
2. 2015-07-06
3. 2016-11-25
4. 2015-01-01
5. 2016-01-18
6. 2015-07-02
7. 2016-12-07
8. 2015-12-18
9. 2016-02-09
10. 2016-04-04
[ 916 others ]

1461	(	1.2%	)
805	(	0.7%	)
790	(	0.7%	)
763	(	0.6%	)
625	(	0.5%	)
469	(	0.4%	)
450	(	0.4%	)
423	(	0.4%	)
412	(	0.3%	)
382	(	0.3%	)
112810	(	94.5%	)

119390 (100.0%)

0 (0.0%)

number_of_guests [integer]

Mean (sd) : 2 (0.7)

min ≤ med ≤ max:

0 ≤ 2 ≤ 55

IQR (CV) : 0 (0.4)

15 distinct values

119386 (100.0%)

4 (0.0%)

total_stay [integer]

Mean (sd) : 3.4 (2.6)

min ≤ med ≤ max:

0 ≤ 3 ≤ 69

IQR (CV) : 2 (0.7)

45 distinct values

119390 (100.0%)

0 (0.0%)

Generated by summarytools 1.0.1 (R version 4.2.1)
2022-12-21

Some descriptive statistics for the data’s numeric variables are included in this table. For instance, 37% of reservations are canceled in reality. Reservations are typically made 104 days prior to the intended arrival date. Every reservation is typically booked for 1.97 people. An average of one reservation every ten includes a child or infant. The typical stay lasts 3.43 days. 22% of reservations are modified after the fact. Last but not least, hotels’ average daily charge is $101.

Let’s check for null values

Code

colSums(is.na(hotel.bookings))

                         hotel                    is_canceled 
                             0                              0 
                     lead_time              arrival_date_year 
                             0                              0 
            arrival_date_month       arrival_date_week_number 
                             0                              0 
     arrival_date_day_of_month        stays_in_weekend_nights 
                             0                              0 
          stays_in_week_nights                         adults 
                             0                              0 
                      children                         babies 
                             4                              0 
                          meal                        country 
                             0                              0 
                market_segment           distribution_channel 
                             0                              0 
             is_repeated_guest         previous_cancellations 
                             0                              0 
previous_bookings_not_canceled             reserved_room_type 
                             0                              0 
            assigned_room_type                booking_changes 
                             0                              0 
                  deposit_type                          agent 
                             0                              0 
                       company           days_in_waiting_list 
                             0                              0 
                 customer_type             average_daily_rate 
                             0                              0 
   required_car_parking_spaces      total_of_special_requests 
                             0                              0 
            reservation_status        reservation_status_date 
                             0                              0 
              number_of_guests                     total_stay 
                             4                              0

There are just four numerical variables that are missing values. However, when we look at the dataset itself and the summary table above, we can see that some “NULL” values appear as strings. Agent and Company variables in the summary table contain “NULL” values. I’ll check each column independently to see if it contains the value “NULL” as a final sanity check.

Code

nulls <- sapply(hotel.bookings, function(x) table(grepl("NULL", x)))

for (i in 1:32) {
  if (!is.na(nulls[[i]][2])) {
    print(nulls[i])
    }
}

$country

 FALSE   TRUE 
118902    488 

$agent

 FALSE   TRUE 
103050  16340 

$company

 FALSE   TRUE 
  6797 112593

Thus, country, agent and company have “NULL” values.

Code

#rounding up the values
round(100*prop.table(table(grepl("NULL", hotel.bookings$country))), 2)


FALSE  TRUE 
99.59  0.41

Code

round(100*prop.table(table(grepl("NULL", hotel.bookings$agent))), 2)


FALSE  TRUE 
86.31 13.69

Code

round(100*prop.table(table(grepl("NULL", hotel.bookings$company))), 2)


FALSE  TRUE 
 5.69 94.31

For reservations, 0.41% of the “country” data, 13.89% of the “agent” data, and 94.31% of the “business” data are missing.

Some analysis

The summary table indicates that a city hotel’s daily fee might reach $5,400 while resort hotels only charge $508 per day. I think that’s the actual situation.

Now let’s do some in depth analysis on the given data.

Code

hotel.bookings %>% 
  arrange(desc(average_daily_rate)) %>% 
  slice_head(n=10) %>% 
  select(hotel, arrival_date_year, 
         country, 
         agent, 
         number_of_guests, 
         total_stay, 
         reservation_status, 
         average_daily_rate)

          hotel arrival_date_year country agent number_of_guests total_stay
1    City Hotel              2016     PRT    12                2          1
2    City Hotel              2017     ITA   159                1          1
3  Resort Hotel              2015     PRT  NULL                2          1
4    City Hotel              2016     PRT  NULL                4          2
5  Resort Hotel              2017     PRT   314                2         14
6  Resort Hotel              2017     PRT   250                4          6
7  Resort Hotel              2017     PRT   250                4          8
8  Resort Hotel              2017     ESP   240                4          5
9  Resort Hotel              2017     MAR   250                5          8
10 Resort Hotel              2017     PRT   250                3         10
   reservation_status average_daily_rate
1            Canceled            5400.00
2           Check-Out             510.00
3           Check-Out             508.00
4           Check-Out             451.50
5            Canceled             450.00
6            Canceled             437.00
7           Check-Out             426.25
8           Check-Out             402.00
9           Check-Out             397.38
10           Canceled             392.00

Code

hotel.bookings %>% 
  arrange(average_daily_rate) %>% 
  slice_head(n=10) %>% 
  select(hotel, arrival_date_year, 
         country, 
         agent, 
         number_of_guests, 
         total_stay, 
         reservation_status, 
         average_daily_rate)

          hotel arrival_date_year country agent number_of_guests total_stay
1  Resort Hotel              2017     GBR   273                2         10
2  Resort Hotel              2015     PRT  NULL                2          0
3  Resort Hotel              2015     PRT  NULL                2          0
4  Resort Hotel              2015     PRT  NULL                4          1
5  Resort Hotel              2015     PRT   240                2          0
6  Resort Hotel              2015     PRT   250                1          0
7  Resort Hotel              2015     PRT  NULL                2          0
8  Resort Hotel              2015     PRT   240                2          0
9  Resort Hotel              2015     PRT   305                2          2
10 Resort Hotel              2015     PRT   305                1          2
   reservation_status average_daily_rate
1           Check-Out              -6.38
2           Check-Out               0.00
3           Check-Out               0.00
4           Check-Out               0.00
5           Check-Out               0.00
6           Check-Out               0.00
7           Check-Out               0.00
8           Check-Out               0.00
9            Canceled               0.00
10          Check-Out               0.00

Therefore, it appears that the row with the average daily fee of 5,400 dollars is an error. A row with a negative average daily rate is also present. I’ll take both of them away.

Code

hotel.bookings <- hotel.bookings %>% filter(average_daily_rate>=0 & average_daily_rate<=510)

Code

hotel.bookings %>% 
  select(hotel, average_daily_rate) %>% 
  group_by(hotel) %>% 
  summarise_if(is.numeric, list(min = min, max = max, mean = mean, std_dev = sd, median = median), na.rm = TRUE)

# A tibble: 2 × 6
  hotel          min   max  mean std_dev median
  <chr>        <dbl> <dbl> <dbl>   <dbl>  <dbl>
1 City Hotel       0   510 105.     39.3   99.9
2 Resort Hotel     0   508  95.0    61.4   75

Code

hotel.bookings %>% 
  select(country) %>% 
  group_by(country) %>% 
  count() %>% 
  arrange(desc(n)) %>% 
  head(n=10)

# A tibble: 10 × 2
# Groups:   country [10]
   country     n
   <chr>   <int>
 1 PRT     48589
 2 GBR     12128
 3 FRA     10415
 4 ESP      8568
 5 DEU      7287
 6 ITA      3766
 7 IRL      3375
 8 BEL      2342
 9 BRA      2224
10 NLD      2104

Code

hotel.bookings %>% 
  filter(country %in% c("PRT", "GBR", "ESP", "FRA", "ITA")) %>% 
  select(country,is_canceled) %>% 
  group_by(country) %>% 
  summarise_if(is.numeric, mean, na.rm = TRUE) %>% 
  arrange(desc(is_canceled))

# A tibble: 5 × 2
  country is_canceled
  <chr>         <dbl>
1 PRT           0.566
2 ITA           0.354
3 ESP           0.254
4 GBR           0.202
5 FRA           0.186

Code

hotel.bookings %>% 
  filter(reservation_status == "Check-Out") %>% 
  select(country, number_of_guests) %>% 
  group_by(country) %>% 
  summarise_if(is.numeric, sum, na.rm = TRUE) %>% 
  arrange(desc(number_of_guests)) %>% 
  head(n=10)

# A tibble: 10 × 2
   country number_of_guests
   <chr>              <int>
 1 PRT                37670
 2 GBR                19256
 3 FRA                17299
 4 ESP                13213
 5 DEU                11659
 6 IRL                 5102
 7 ITA                 4894
 8 BEL                 3855
 9 NLD                 3374
10 USA                 3212

Code

hotel.bookings %>% 
  filter(reservation_status == "Check-Out") %>% 
  select(country, number_of_guests) %>% 
  group_by(country) %>% 
  summarise_if(is.numeric, sum, na.rm = TRUE) %>% 
  arrange(desc(number_of_guests)) %>% 
  head(n=10)

# A tibble: 10 × 2
   country number_of_guests
   <chr>              <int>
 1 PRT                37670
 2 GBR                19256
 3 FRA                17299
 4 ESP                13213
 5 DEU                11659
 6 IRL                 5102
 7 ITA                 4894
 8 BEL                 3855
 9 NLD                 3374
10 USA                 3212

Code

hotel.bookings %>% 
  select(country, average_daily_rate) %>% 
  group_by(country) %>% 
  summarise_if(is.numeric, mean, na.rm = TRUE) %>% 
  arrange(desc(average_daily_rate)) %>% 
  head(n=10)

# A tibble: 10 × 2
   country average_daily_rate
   <chr>                <dbl>
 1 DJI                   273 
 2 AIA                   265 
 3 AND                   203.
 4 UMI                   200 
 5 LAO                   182.
 6 MYT                   178.
 7 NCL                   176.
 8 GEO                   169.
 9 COM                   165.
10 FRO                   155.

Code

table(hotel.bookings$arrival_date_month)


    April    August  December  February   January      July      June     March 
    11089     13877      6780      8068      5929     12661     10939      9792 
      May  November   October September 
    11791      6794     11160     10508

City hotels charge an average of $11 more per night than resort hotels. In contrast, the price variety for resort hotels is greater than that for city hotels.

Portugal, Great Britain, France, Spain, Germany, Italy, Ireland, Belgium, Brazil, and the Netherlands top the list of countries with the most reservations, followed by Great Britain, France, and Spain. But as we can see, 56% of hotel reservations for Portugal are really canceled. Italy and Spain each have a 35% share of this percentage. Nevertheless, out of all of them, Portugal has hosted a total of 37,670 visitors over the course of three years.

The most expensive hotels, according to average daily rates, are found in Djibouti, Anguilla, Andorra, United States Minor Outlying Islands, Laos, and so on. It appears that hotels in small nations with few visitors are significantly more expensive.

The busiest times of the year for hotels are in August, July, and May, respectively.

Finally, Let me check how many rows have a daily rate of zero last.

Code

hotel.bookings %>% 
  filter(average_daily_rate == 0) %>% 
  count()

     n
1 1959

1959 reservations have a daily rate of nothing.

Code

hotel.bookings %>% 
  filter(average_daily_rate == 0) %>% 
  group_by(country) %>% 
  count() %>% 
  arrange(desc(n)) %>% 
  head()

# A tibble: 6 × 2
# Groups:   country [6]
  country     n
  <chr>   <int>
1 PRT      1550
2 ESP        79
3 GBR        73
4 FRA        55
5 DEU        41
6 NULL       21

Portugal makes up the majority of the zero values. Therefore, further research into the accuracy of hotel data for Portugal is

Code

head(hotel.bookings)

         hotel is_canceled lead_time arrival_date_year arrival_date_month
1 Resort Hotel           0       342              2015               July
2 Resort Hotel           0       737              2015               July
3 Resort Hotel           0         7              2015               July
4 Resort Hotel           0        13              2015               July
5 Resort Hotel           0        14              2015               July
6 Resort Hotel           0        14              2015               July
  arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights
1                       27                         1                       0
2                       27                         1                       0
3                       27                         1                       0
4                       27                         1                       0
5                       27                         1                       0
6                       27                         1                       0
  stays_in_week_nights adults children babies meal country market_segment
1                    0      2        0      0   BB     PRT         Direct
2                    0      2        0      0   BB     PRT         Direct
3                    1      1        0      0   BB     GBR         Direct
4                    1      1        0      0   BB     GBR      Corporate
5                    2      2        0      0   BB     GBR      Online TA
6                    2      2        0      0   BB     GBR      Online TA
  distribution_channel is_repeated_guest previous_cancellations
1               Direct                 0                      0
2               Direct                 0                      0
3               Direct                 0                      0
4            Corporate                 0                      0
5                TA/TO                 0                      0
6                TA/TO                 0                      0
  previous_bookings_not_canceled reserved_room_type assigned_room_type
1                              0                  C                  C
2                              0                  C                  C
3                              0                  A                  C
4                              0                  A                  A
5                              0                  A                  A
6                              0                  A                  A
  booking_changes deposit_type agent company days_in_waiting_list customer_type
1               3   No Deposit  NULL    NULL                    0     Transient
2               4   No Deposit  NULL    NULL                    0     Transient
3               0   No Deposit  NULL    NULL                    0     Transient
4               0   No Deposit   304    NULL                    0     Transient
5               0   No Deposit   240    NULL                    0     Transient
6               0   No Deposit   240    NULL                    0     Transient
  average_daily_rate required_car_parking_spaces total_of_special_requests
1                  0                           0                         0
2                  0                           0                         0
3                 75                           0                         0
4                 75                           0                         0
5                 98                           0                         1
6                 98                           0                         1
  reservation_status reservation_status_date number_of_guests total_stay
1          Check-Out              2015-07-01                2          0
2          Check-Out              2015-07-01                2          0
3          Check-Out              2015-07-02                1          1
4          Check-Out              2015-07-02                1          1
5          Check-Out              2015-07-03                2          2
6          Check-Out              2015-07-03                2          2

Code

colnames(hotel.bookings)

 [1] "hotel"                          "is_canceled"                   
 [3] "lead_time"                      "arrival_date_year"             
 [5] "arrival_date_month"             "arrival_date_week_number"      
 [7] "arrival_date_day_of_month"      "stays_in_weekend_nights"       
 [9] "stays_in_week_nights"           "adults"                        
[11] "children"                       "babies"                        
[13] "meal"                           "country"                       
[15] "market_segment"                 "distribution_channel"          
[17] "is_repeated_guest"              "previous_cancellations"        
[19] "previous_bookings_not_canceled" "reserved_room_type"            
[21] "assigned_room_type"             "booking_changes"               
[23] "deposit_type"                   "agent"                         
[25] "company"                        "days_in_waiting_list"          
[27] "customer_type"                  "average_daily_rate"            
[29] "required_car_parking_spaces"    "total_of_special_requests"     
[31] "reservation_status"             "reservation_status_date"       
[33] "number_of_guests"               "total_stay"

Code

ggplot(hotel.bookings, aes(x = customer_type, y = is_canceled))+
  geom_col(fill = "green")+
  theme_minimal()+
  labs(
    title = "Cancellations vs Customer_type",
    x = "Customer_type",
    y = "Cancellations"
  )

From the above graph we can clearly infer that the transient customers are the ones who do most number of cancellations and the group customers are the ones doing least number of cancellations.

Code

cancel_data<-hotel.bookings %>% 
  select(country, is_canceled) %>% 
  group_by(country) %>% 
  summarise_if(is.numeric, sum, na.rm = TRUE) %>% 
  arrange(desc(is_canceled)) %>% 
  head(n=10)

Code

ggplot(data = cancel_data, mapping = aes(x= is_canceled, y= reorder(country, is_canceled)))+
  geom_col(aes(fill = country))+
   theme_minimal()+
  coord_flip()+
  labs(title = "Country vs Cancellations",
       y = "Country",
       x = "Cancellations")+
 theme(legend.position = "none")

From the above graph we can clearly see that the number of cancellations differ for the people from different countries and is maximum for thr person from Portugal.

Code

booking_changes_room_type<-hotel.bookings %>% 
  select(assigned_room_type, booking_changes) %>% 
  group_by(assigned_room_type) %>% 
  summarise_if(is.numeric, sum, na.rm = TRUE) %>% 
  arrange(desc(booking_changes)) %>% 
  head(n=10)

Code

ggplot(data = booking_changes_room_type, mapping = aes(x = booking_changes, y = reorder(assigned_room_type, booking_changes))) +
  geom_col(mapping = aes(fill = booking_changes), position = "dodge")

Code

  labs(x = NULL, y = NULL,
       fill = NULL,
       title = "room type and booking changes") +
  scale_fill_brewer(palette = 1) +
  theme_minimal() +
  theme(legend.position = "top")

NULL

From the above graph we can clearly see that there is a lot of co-relation between the room_type assigned and the booking changes. The people who were assigned room A are the ones doing most number of booking changes.

I have failed to answer the questions What distinguishes hotel reservations for resorts from those for cities? I can also improvise on building visualizations for the data.