library(tidyverse)
library(ggplot2)
library(summarytools)
library(lubridate)
# Global chunk options: show the code, hide warnings and messages in the output.
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
Challenge 7
Challenge Overview
library(readr)
# Read the hotel bookings CSV into a tibble.
hotel_bookings <- read_csv("_data/hotel_bookings.csv")
Briefly describe the data
The data has 119390 rows and 32 columns: 13 categorical (character) variables, 18 numeric variables and one date column.

## Tidy Data (as needed)

In order to tidy the data, we first need to understand it.
# Preview the first six rows together with the column types.
head(hotel_bookings)
# A tibble: 6 × 32
hotel is_ca…¹ lead_…² arriv…³ arriv…⁴ arriv…⁵ arriv…⁶ stays…⁷ stays…⁸ adults
<chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Resort… 0 342 2015 July 27 1 0 0 2
2 Resort… 0 737 2015 July 27 1 0 0 2
3 Resort… 0 7 2015 July 27 1 0 1 1
4 Resort… 0 13 2015 July 27 1 0 1 1
5 Resort… 0 14 2015 July 27 1 0 2 2
6 Resort… 0 14 2015 July 27 1 0 2 2
# … with 22 more variables: children <dbl>, babies <dbl>, meal <chr>,
# country <chr>, market_segment <chr>, distribution_channel <chr>,
# is_repeated_guest <dbl>, previous_cancellations <dbl>,
# previous_bookings_not_canceled <dbl>, reserved_room_type <chr>,
# assigned_room_type <chr>, booking_changes <dbl>, deposit_type <chr>,
# agent <chr>, company <chr>, days_in_waiting_list <dbl>,
# customer_type <chr>, adr <dbl>, required_car_parking_spaces <dbl>, …
Document your work here.
# List the names of all columns in the data set.
colnames(hotel_bookings)
[1] "hotel" "is_canceled"
[3] "lead_time" "arrival_date_year"
[5] "arrival_date_month" "arrival_date_week_number"
[7] "arrival_date_day_of_month" "stays_in_weekend_nights"
[9] "stays_in_week_nights" "adults"
[11] "children" "babies"
[13] "meal" "country"
[15] "market_segment" "distribution_channel"
[17] "is_repeated_guest" "previous_cancellations"
[19] "previous_bookings_not_canceled" "reserved_room_type"
[21] "assigned_room_type" "booking_changes"
[23] "deposit_type" "agent"
[25] "company" "days_in_waiting_list"
[27] "customer_type" "adr"
[29] "required_car_parking_spaces" "total_of_special_requests"
[31] "reservation_status" "reservation_status_date"
# funModeling provides plot_num(); dfSummary() comes from summarytools,
# which is loaded at the top of the document.
library(funModeling)
# Plot the distribution of every numeric column.
plot_num(hotel_bookings)
# Per-column overview: type, values/statistics, valid and missing counts.
dfSummary(hotel_bookings)
Data Frame Summary
hotel_bookings
Dimensions: 119390 x 32
Duplicates: 31994
-----------------------------------------------------------------------------------------------------------------------------------
No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
---- -------------------------------- -------------------------- ---------------------- ---------------------- ---------- ---------
1 hotel 1. City Hotel 79330 (66.4%) IIIIIIIIIIIII 119390 0
[character] 2. Resort Hotel 40060 (33.6%) IIIIII (100.0%) (0.0%)
2 is_canceled Min : 0 0 : 75166 (63.0%) IIIIIIIIIIII 119390 0
[numeric] Mean : 0.4 1 : 44224 (37.0%) IIIIIII (100.0%) (0.0%)
Max : 1
3 lead_time Mean (sd) : 104 (106.9) 479 distinct values : 119390 0
[numeric] min < med < max: : (100.0%) (0.0%)
0 < 69 < 737 :
IQR (CV) : 142 (1) : : .
: : : . .
4 arrival_date_year Mean (sd) : 2016.2 (0.7) 2015 : 21996 (18.4%) III 119390 0
[numeric] min < med < max: 2016 : 56707 (47.5%) IIIIIIIII (100.0%) (0.0%)
2015 < 2016 < 2017 2017 : 40687 (34.1%) IIIIII
IQR (CV) : 1 (0)
5 arrival_date_month 1. August 13877 (11.6%) II 119390 0
[character] 2. July 12661 (10.6%) II (100.0%) (0.0%)
3. May 11791 ( 9.9%) I
4. October 11160 ( 9.3%) I
5. April 11089 ( 9.3%) I
6. June 10939 ( 9.2%) I
7. September 10508 ( 8.8%) I
8. March 9794 ( 8.2%) I
9. February 8068 ( 6.8%) I
10. November 6794 ( 5.7%) I
[ 2 others ] 12709 (10.6%) II
6 arrival_date_week_number Mean (sd) : 27.2 (13.6) 53 distinct values . : . . . 119390 0
[numeric] min < med < max: . : : : : : : (100.0%) (0.0%)
1 < 28 < 53 . : : : : : : : : :
IQR (CV) : 22 (0.5) : : : : : : : : : :
: : : : : : : : : :
7 arrival_date_day_of_month Mean (sd) : 15.8 (8.8) 31 distinct values : 119390 0
[numeric] min < med < max: : : : . : : . : : (100.0%) (0.0%)
1 < 16 < 31 : : : : : : : : : :
IQR (CV) : 15 (0.6) : : : : : : : : : :
: : : : : : : : : :
8 stays_in_weekend_nights Mean (sd) : 0.9 (1) 17 distinct values : 119390 0
[numeric] min < med < max: : (100.0%) (0.0%)
0 < 1 < 19 :
IQR (CV) : 2 (1.1) : :
: :
9 stays_in_week_nights Mean (sd) : 2.5 (1.9) 35 distinct values : 119390 0
[numeric] min < med < max: : (100.0%) (0.0%)
0 < 2 < 50 :
IQR (CV) : 2 (0.8) :
:
10 adults Mean (sd) : 1.9 (0.6) 14 distinct values : 119390 0
[numeric] min < med < max: : (100.0%) (0.0%)
0 < 2 < 55 :
IQR (CV) : 0 (0.3) :
:
11 children Mean (sd) : 0.1 (0.4) 0 : 110796 (92.8%) IIIIIIIIIIIIIIIIII 119386 4
[numeric] min < med < max: 1 : 4861 ( 4.1%) (100.0%) (0.0%)
0 < 0 < 10 2 : 3652 ( 3.1%)
IQR (CV) : 0 (3.8) 3 : 76 ( 0.1%)
10 : 1 ( 0.0%)
12 babies Mean (sd) : 0 (0.1) 0 : 118473 (99.2%) IIIIIIIIIIIIIIIIIII 119390 0
[numeric] min < med < max: 1 : 900 ( 0.8%) (100.0%) (0.0%)
0 < 0 < 10 2 : 15 ( 0.0%)
IQR (CV) : 0 (12.3) 9 : 1 ( 0.0%)
10 : 1 ( 0.0%)
13 meal 1. BB 92310 (77.3%) IIIIIIIIIIIIIII 119390 0
[character] 2. FB 798 ( 0.7%) (100.0%) (0.0%)
3. HB 14463 (12.1%) II
4. SC 10650 ( 8.9%) I
5. Undefined 1169 ( 1.0%)
14 country 1. PRT 48590 (40.7%) IIIIIIII 119390 0
[character] 2. GBR 12129 (10.2%) II (100.0%) (0.0%)
3. FRA 10415 ( 8.7%) I
4. ESP 8568 ( 7.2%) I
5. DEU 7287 ( 6.1%) I
6. ITA 3766 ( 3.2%)
7. IRL 3375 ( 2.8%)
8. BEL 2342 ( 2.0%)
9. BRA 2224 ( 1.9%)
10. NLD 2104 ( 1.8%)
[ 168 others ] 18590 (15.6%) III
15 market_segment 1. Aviation 237 ( 0.2%) 119390 0
[character] 2. Complementary 743 ( 0.6%) (100.0%) (0.0%)
3. Corporate 5295 ( 4.4%)
4. Direct 12606 (10.6%) II
5. Groups 19811 (16.6%) III
6. Offline TA/TO 24219 (20.3%) IIII
7. Online TA 56477 (47.3%) IIIIIIIII
8. Undefined 2 ( 0.0%)
16 distribution_channel 1. Corporate 6677 ( 5.6%) I 119390 0
[character] 2. Direct 14645 (12.3%) II (100.0%) (0.0%)
3. GDS 193 ( 0.2%)
4. TA/TO 97870 (82.0%) IIIIIIIIIIIIIIII
5. Undefined 5 ( 0.0%)
17 is_repeated_guest Min : 0 0 : 115580 (96.8%) IIIIIIIIIIIIIIIIIII 119390 0
[numeric] Mean : 0 1 : 3810 ( 3.2%) (100.0%) (0.0%)
Max : 1
18 previous_cancellations Mean (sd) : 0.1 (0.8) 15 distinct values : 119390 0
[numeric] min < med < max: : (100.0%) (0.0%)
0 < 0 < 26 :
IQR (CV) : 0 (9.7) :
:
19 previous_bookings_not_canceled Mean (sd) : 0.1 (1.5) 73 distinct values : 119390 0
[numeric] min < med < max: : (100.0%) (0.0%)
0 < 0 < 72 :
IQR (CV) : 0 (10.9) :
:
20 reserved_room_type 1. A 85994 (72.0%) IIIIIIIIIIIIII 119390 0
[character] 2. B 1118 ( 0.9%) (100.0%) (0.0%)
3. C 932 ( 0.8%)
4. D 19201 (16.1%) III
5. E 6535 ( 5.5%) I
6. F 2897 ( 2.4%)
7. G 2094 ( 1.8%)
8. H 601 ( 0.5%)
9. L 6 ( 0.0%)
10. P 12 ( 0.0%)
21 assigned_room_type 1. A 74053 (62.0%) IIIIIIIIIIII 119390 0
[character] 2. D 25322 (21.2%) IIII (100.0%) (0.0%)
3. E 7806 ( 6.5%) I
4. F 3751 ( 3.1%)
5. G 2553 ( 2.1%)
6. C 2375 ( 2.0%)
7. B 2163 ( 1.8%)
8. H 712 ( 0.6%)
9. I 363 ( 0.3%)
10. K 279 ( 0.2%)
[ 2 others ] 13 ( 0.0%)
22 booking_changes Mean (sd) : 0.2 (0.7) 21 distinct values : 119390 0
[numeric] min < med < max: : (100.0%) (0.0%)
0 < 0 < 21 :
IQR (CV) : 0 (2.9) :
:
23 deposit_type 1. No Deposit 104641 (87.6%) IIIIIIIIIIIIIIIII 119390 0
[character] 2. Non Refund 14587 (12.2%) II (100.0%) (0.0%)
3. Refundable 162 ( 0.1%)
24 agent 1. 9 31961 (26.8%) IIIII 119390 0
[character] 2. NULL 16340 (13.7%) II (100.0%) (0.0%)
3. 240 13922 (11.7%) II
4. 1 7191 ( 6.0%) I
5. 14 3640 ( 3.0%)
6. 7 3539 ( 3.0%)
7. 6 3290 ( 2.8%)
8. 250 2870 ( 2.4%)
9. 241 1721 ( 1.4%)
10. 28 1666 ( 1.4%)
[ 324 others ] 33250 (27.8%) IIIII
25 company 1. NULL 112593 (94.3%) IIIIIIIIIIIIIIIIII 119390 0
[character] 2. 40 927 ( 0.8%) (100.0%) (0.0%)
3. 223 784 ( 0.7%)
4. 67 267 ( 0.2%)
5. 45 250 ( 0.2%)
6. 153 215 ( 0.2%)
7. 174 149 ( 0.1%)
8. 219 141 ( 0.1%)
9. 281 138 ( 0.1%)
10. 154 133 ( 0.1%)
[ 343 others ] 3793 ( 3.2%)
26 days_in_waiting_list Mean (sd) : 2.3 (17.6) 128 distinct values : 119390 0
[numeric] min < med < max: : (100.0%) (0.0%)
0 < 0 < 391 :
IQR (CV) : 0 (7.6) :
:
27 customer_type 1. Contract 4076 ( 3.4%) 119390 0
[character] 2. Group 577 ( 0.5%) (100.0%) (0.0%)
3. Transient 89613 (75.1%) IIIIIIIIIIIIIII
4. Transient-Party 25124 (21.0%) IIII
28 adr Mean (sd) : 101.8 (50.5) 8879 distinct values : 119390 0
[numeric] min < med < max: : (100.0%) (0.0%)
-6.4 < 94.6 < 5400 :
IQR (CV) : 56.7 (0.5) :
:
29 required_car_parking_spaces Mean (sd) : 0.1 (0.2) 0 : 111974 (93.8%) IIIIIIIIIIIIIIIIII 119390 0
[numeric] min < med < max: 1 : 7383 ( 6.2%) I (100.0%) (0.0%)
0 < 0 < 8 2 : 28 ( 0.0%)
IQR (CV) : 0 (3.9) 3 : 3 ( 0.0%)
8 : 2 ( 0.0%)
30 total_of_special_requests Mean (sd) : 0.6 (0.8) 0 : 70318 (58.9%) IIIIIIIIIII 119390 0
[numeric] min < med < max: 1 : 33226 (27.8%) IIIII (100.0%) (0.0%)
0 < 0 < 5 2 : 12969 (10.9%) II
IQR (CV) : 1 (1.4) 3 : 2497 ( 2.1%)
4 : 340 ( 0.3%)
5 : 40 ( 0.0%)
31 reservation_status 1. Canceled 43017 (36.0%) IIIIIII 119390 0
[character] 2. Check-Out 75166 (63.0%) IIIIIIIIIIII (100.0%) (0.0%)
3. No-Show 1207 ( 1.0%)
32 reservation_status_date min : 2014-10-17 926 distinct values . : : : : 119390 0
[Date] med : 2016-08-07 : : : : : : . (100.0%) (0.0%)
max : 2017-09-14 . : : : : : : :
range : 2y 10m 28d : : : : : : : :
. : : : : : : : :
-----------------------------------------------------------------------------------------------------------------------------------
# Count the missing (NA) entries in each column; vapply() pins the result
# to exactly one integer per column.
vapply(hotel_bookings, function(col) sum(is.na(col)), integer(1))
hotel is_canceled
0 0
lead_time arrival_date_year
0 0
arrival_date_month arrival_date_week_number
0 0
arrival_date_day_of_month stays_in_weekend_nights
0 0
stays_in_week_nights adults
0 0
children babies
4 0
meal country
0 0
market_segment distribution_channel
0 0
is_repeated_guest previous_cancellations
0 0
previous_bookings_not_canceled reserved_room_type
0 0
assigned_room_type booking_changes
0 0
deposit_type agent
0 0
company days_in_waiting_list
0 0
customer_type adr
0 0
required_car_parking_spaces total_of_special_requests
0 0
reservation_status reservation_status_date
0 0
We see that only `children` has missing values (4 rows); every other column is complete.
# NOTE: is.null() tests the column object as a whole, not its elements, so
# this yields 0 for every column. It cannot detect the literal "NULL" strings
# stored in character columns.
sapply(hotel_bookings,function(x)sum(is.null(x)))
hotel is_canceled
0 0
lead_time arrival_date_year
0 0
arrival_date_month arrival_date_week_number
0 0
arrival_date_day_of_month stays_in_weekend_nights
0 0
stays_in_week_nights adults
0 0
children babies
0 0
meal country
0 0
market_segment distribution_channel
0 0
is_repeated_guest previous_cancellations
0 0
previous_bookings_not_canceled reserved_room_type
0 0
assigned_room_type booking_changes
0 0
deposit_type agent
0 0
company days_in_waiting_list
0 0
customer_type adr
0 0
required_car_parking_spaces total_of_special_requests
0 0
reservation_status reservation_status_date
0 0
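No column is a NULL object, but this check does not look at the values themselves: `agent`, `company` and `country` contain the literal string "NULL" (see the summary above and the country codes below), which stands in for missing data.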
# List the distinct country codes that occur in the data.
unique(hotel_bookings$country)
[1] "PRT" "GBR" "USA" "ESP" "IRL" "FRA" "NULL" "ROU" "NOR" "OMN"
[11] "ARG" "POL" "DEU" "BEL" "CHE" "CN" "GRC" "ITA" "NLD" "DNK"
[21] "RUS" "SWE" "AUS" "EST" "CZE" "BRA" "FIN" "MOZ" "BWA" "LUX"
[31] "SVN" "ALB" "IND" "CHN" "MEX" "MAR" "UKR" "SMR" "LVA" "PRI"
[41] "SRB" "CHL" "AUT" "BLR" "LTU" "TUR" "ZAF" "AGO" "ISR" "CYM"
[51] "ZMB" "CPV" "ZWE" "DZA" "KOR" "CRI" "HUN" "ARE" "TUN" "JAM"
[61] "HRV" "HKG" "IRN" "GEO" "AND" "GIB" "URY" "JEY" "CAF" "CYP"
[71] "COL" "GGY" "KWT" "NGA" "MDV" "VEN" "SVK" "FJI" "KAZ" "PAK"
[81] "IDN" "LBN" "PHL" "SEN" "SYC" "AZE" "BHR" "NZL" "THA" "DOM"
[91] "MKD" "MYS" "ARM" "JPN" "LKA" "CUB" "CMR" "BIH" "MUS" "COM"
[101] "SUR" "UGA" "BGR" "CIV" "JOR" "SYR" "SGP" "BDI" "SAU" "VNM"
[111] "PLW" "QAT" "EGY" "PER" "MLT" "MWI" "ECU" "MDG" "ISL" "UZB"
[121] "NPL" "BHS" "MAC" "TGO" "TWN" "DJI" "STP" "KNA" "ETH" "IRQ"
[131] "HND" "RWA" "KHM" "MCO" "BGD" "IMN" "TJK" "NIC" "BEN" "VGB"
[141] "TZA" "GAB" "GHA" "TMP" "GLP" "KEN" "LIE" "GNB" "MNE" "UMI"
[151] "MYT" "FRO" "MMR" "PAN" "BFA" "LBY" "MLI" "NAM" "BOL" "PRY"
[161] "BRB" "ABW" "AIA" "SLV" "DMA" "PYF" "GUY" "LCA" "ATA" "GTM"
[171] "ASM" "MRT" "NCL" "KIR" "SDN" "ATF" "SLE" "LAO"
Looking at the country codes, one of the values (in the first row of the output) is the string "NULL". We should now inspect the other attributes of those rows to decide whether they still carry usable data.
# Show the bookings whose country is recorded as the literal string "NULL".
hotel_bookings %>%
  filter(country == "NULL")
# A tibble: 488 × 32
hotel is_ca…¹ lead_…² arriv…³ arriv…⁴ arriv…⁵ arriv…⁶ stays…⁷ stays…⁸ adults
<chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Resor… 0 118 2015 July 27 1 4 10 1
2 Resor… 1 0 2016 Februa… 8 15 0 0 0
3 Resor… 1 8 2016 July 30 21 0 1 1
4 Resor… 1 39 2016 August 36 30 0 5 2
5 Resor… 1 0 2016 October 42 13 0 1 1
6 Resor… 1 0 2016 Novemb… 48 21 0 0 0
7 Resor… 1 4 2016 Decemb… 52 20 0 1 1
8 Resor… 1 4 2016 Decemb… 52 20 0 1 1
9 Resor… 1 87 2017 April 18 30 2 3 2
10 Resor… 1 87 2017 April 18 30 2 3 2
# … with 478 more rows, 22 more variables: children <dbl>, babies <dbl>,
# meal <chr>, country <chr>, market_segment <chr>,
# distribution_channel <chr>, is_repeated_guest <dbl>,
# previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>,
# reserved_room_type <chr>, assigned_room_type <chr>, booking_changes <dbl>,
# deposit_type <chr>, agent <chr>, company <chr>, days_in_waiting_list <dbl>,
# customer_type <chr>, adr <dbl>, required_car_parking_spaces <dbl>, …
We see that the other columns of these rows are populated, so we keep them; they only need to be excluded when doing country-wise analysis.
# Build a single Date column from the separate day, month-name and year columns.
hotel_bookings_mutate <- hotel_bookings %>%
  mutate(
    # Glue the parts into a day-month-year string, e.g. "1-July-2015".
    arrival_date = str_c(
      arrival_date_day_of_month,
      arrival_date_month,
      arrival_date_year,
      sep = "-"
    ),
    # Parse the day-month-year string into a Date.
    arrival_date = dmy(arrival_date)
  )
The arrival date is now a single, properly formatted date column; let us view its first few values.
# Check the first six parsed arrival dates.
head(hotel_bookings_mutate$arrival_date)
[1] "2015-07-01" "2015-07-01" "2015-07-01" "2015-07-01" "2015-07-01"
[6] "2015-07-01"
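The arrival date is now stored in year-month-day format. Next, we combine the children and babies columns into a single count of child guests.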
# Merge children and babies into one child_guests column, then drop the
# two original columns.
# Note: children is NA in 4 rows, so child_guests is NA for those rows too.
hotel_bookings_mutate <- hotel_bookings_mutate %>%
  mutate(child_guests = children + babies) %>%
  select(-c(children, babies))
We can also add a few other variables, such as the date on which each booking was made (the arrival date minus the lead time), to study advance bookings.
# Booking date = arrival date minus lead_time; subtracting a number from a
# Date moves it back by that many days.
hotel_bookings_mutate <- hotel_bookings_mutate %>%
  mutate(date_of_booking = arrival_date - lead_time)
head(hotel_bookings_mutate)
# A tibble: 6 × 33
hotel is_ca…¹ lead_…² arriv…³ arriv…⁴ arriv…⁵ arriv…⁶ stays…⁷ stays…⁸ adults
<chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Resort… 0 342 2015 July 27 1 0 0 2
2 Resort… 0 737 2015 July 27 1 0 0 2
3 Resort… 0 7 2015 July 27 1 0 1 1
4 Resort… 0 13 2015 July 27 1 0 1 1
5 Resort… 0 14 2015 July 27 1 0 2 2
6 Resort… 0 14 2015 July 27 1 0 2 2
# … with 23 more variables: meal <chr>, country <chr>, market_segment <chr>,
# distribution_channel <chr>, is_repeated_guest <dbl>,
# previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>,
# reserved_room_type <chr>, assigned_room_type <chr>, booking_changes <dbl>,
# deposit_type <chr>, agent <chr>, company <chr>, days_in_waiting_list <dbl>,
# customer_type <chr>, adr <dbl>, required_car_parking_spaces <dbl>,
# total_of_special_requests <dbl>, reservation_status <chr>, …
# Total length of stay = week nights + weekend nights.
hotel_bookings_mutate <- hotel_bookings_mutate %>%
  mutate(stay_in_nights = stays_in_week_nights + stays_in_weekend_nights)
head(hotel_bookings_mutate)
# A tibble: 6 × 34
hotel is_ca…¹ lead_…² arriv…³ arriv…⁴ arriv…⁵ arriv…⁶ stays…⁷ stays…⁸ adults
<chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Resort… 0 342 2015 July 27 1 0 0 2
2 Resort… 0 737 2015 July 27 1 0 0 2
3 Resort… 0 7 2015 July 27 1 0 1 1
4 Resort… 0 13 2015 July 27 1 0 1 1
5 Resort… 0 14 2015 July 27 1 0 2 2
6 Resort… 0 14 2015 July 27 1 0 2 2
# … with 24 more variables: meal <chr>, country <chr>, market_segment <chr>,
# distribution_channel <chr>, is_repeated_guest <dbl>,
# previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>,
# reserved_room_type <chr>, assigned_room_type <chr>, booking_changes <dbl>,
# deposit_type <chr>, agent <chr>, company <chr>, days_in_waiting_list <dbl>,
# customer_type <chr>, adr <dbl>, required_car_parking_spaces <dbl>,
# total_of_special_requests <dbl>, reservation_status <chr>, …
Here we add an extra column, `stay_in_nights` (the total number of nights stayed), which we will use in the visualizations below.
Visualization with Multiple Dimensions
# Bar chart of the number of bookings per hotel type, with the count printed
# on each bar.
ggplot(hotel_bookings_mutate, aes(x = hotel)) +
  geom_bar(fill = "lightblue") +
  # after_stat(count) replaces the deprecated ..count.. notation.
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = 1) +
  labs(title = "Hotel type and count", x = "Hotel_type", y = "Count")
I have learned how to add count labels to the bars of a graph.
# Stacked bar chart of bookings per market segment, split by hotel type.
ggplot(hotel_bookings_mutate, aes(x = market_segment, fill = hotel)) +
  # geom_bar() has no `bins` argument (that belongs to geom_histogram()),
  # so it is dropped here.
  geom_bar() +
  labs(
    title = "Market_Wise_booking",
    x = "market_segment",
    y = "Frequency",
    # labs() takes the legend title as a plain string, not a guide object.
    fill = "hotel_type"
  ) +
  scale_fill_manual(values = c("navyblue", "red"))
I have also learned how to set the fill colors of a plot manually.
# Scatter plot of adults against total nights stayed, colored by hotel type.
hotel_bookings_mutate %>%
  ggplot(aes(x = adults, y = stay_in_nights, color = hotel)) +
  geom_point(alpha = 1) +
  labs(title = "scatter plot of stay in nights and adult guests")
# Cancelled vs. not-cancelled bookings, one panel per hotel type, with the
# count printed just above each bar.
ggplot(hotel_bookings_mutate, aes(x = factor(is_canceled))) +
  geom_bar(color = "black", fill = "lightpink", alpha = 1) +
  # after_stat(count) replaces the deprecated ..count.. notation.
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.25) +
  facet_wrap(~hotel) +
  # is_canceled is coded 0/1; relabel the two levels for readability.
  scale_x_discrete("Canceled_tag", labels = c("No", "Yes"))
# Mean number of days on the waiting list per arrival month.
# arrival_date_month is a character column, so ggplot would sort the x axis
# alphabetically and the line would join the months in the wrong order.
# A factor with month.name levels keeps calendar order.
ggplot(
  hotel_bookings_mutate,
  aes(
    x = factor(arrival_date_month, levels = month.name),
    y = days_in_waiting_list,
    group = 1
  )
) +
  stat_summary(fun = "mean", geom = "line", col = "black") +
  labs(title = "avg waitlist", x = "month", y = "waitlist_days")
The highest average waiting times are observed in May and October.