Reading datasets: hotel_bookings.csv

challenge_2
Susmita Madineni
hotel_bookings
readr
Author

Susmita Madineni

Published

February 27, 2022

Code
library(tidyverse)

# Chunk defaults: echo the code, and leave warnings and messages out of the
# rendered output
knitr::opts_chunk$set(
  echo = TRUE,
  warning = FALSE,
  message = FALSE
)

Read in the Data

Read in the following data set, available in the posts/_data folder: - hotel_bookings.csv ⭐⭐⭐⭐

Code
# Load hotel_bookings.csv from the _data folder into a tibble, then hand the
# result to view()
library(readr)
hotel_bookings_data <- read_csv(file = "_data/hotel_bookings.csv")
view(hotel_bookings_data)
Code
# Show the leading rows of the dataset (head() returns six rows by default)
hotel_bookings_data %>%
  head()
# A tibble: 6 × 32
  hotel   is_ca…¹ lead_…² arriv…³ arriv…⁴ arriv…⁵ arriv…⁶ stays…⁷ stays…⁸ adults
  <chr>     <dbl>   <dbl>   <dbl> <chr>     <dbl>   <dbl>   <dbl>   <dbl>  <dbl>
1 Resort…       0     342    2015 July         27       1       0       0      2
2 Resort…       0     737    2015 July         27       1       0       0      2
3 Resort…       0       7    2015 July         27       1       0       1      1
4 Resort…       0      13    2015 July         27       1       0       1      1
5 Resort…       0      14    2015 July         27       1       0       2      2
6 Resort…       0      14    2015 July         27       1       0       2      2
# … with 22 more variables: children <dbl>, babies <dbl>, meal <chr>,
#   country <chr>, market_segment <chr>, distribution_channel <chr>,
#   is_repeated_guest <dbl>, previous_cancellations <dbl>,
#   previous_bookings_not_canceled <dbl>, reserved_room_type <chr>,
#   assigned_room_type <chr>, booking_changes <dbl>, deposit_type <chr>,
#   agent <chr>, company <chr>, days_in_waiting_list <dbl>,
#   customer_type <chr>, adr <dbl>, required_car_parking_spaces <dbl>, …
Code
# Number of rows, then number of columns, of the dataset
c(nrow(hotel_bookings_data), ncol(hotel_bookings_data))
[1] 119390     32
Code
# List every column name in the dataset
names(hotel_bookings_data)
 [1] "hotel"                          "is_canceled"                   
 [3] "lead_time"                      "arrival_date_year"             
 [5] "arrival_date_month"             "arrival_date_week_number"      
 [7] "arrival_date_day_of_month"      "stays_in_weekend_nights"       
 [9] "stays_in_week_nights"           "adults"                        
[11] "children"                       "babies"                        
[13] "meal"                           "country"                       
[15] "market_segment"                 "distribution_channel"          
[17] "is_repeated_guest"              "previous_cancellations"        
[19] "previous_bookings_not_canceled" "reserved_room_type"            
[21] "assigned_room_type"             "booking_changes"               
[23] "deposit_type"                   "agent"                         
[25] "company"                        "days_in_waiting_list"          
[27] "customer_type"                  "adr"                           
[29] "required_car_parking_spaces"    "total_of_special_requests"     
[31] "reservation_status"             "reservation_status_date"       
Code
# Rename the cryptic `adr` column so its meaning is obvious. The rename is
# done by name rather than by position (28), so it stays correct even if the
# column order of the input ever changes.
hotel_bookings_data <- hotel_bookings_data %>%
  rename(average_daily_rate = adr)

# Count how many columns there are of each storage type. vapply() guarantees
# a character vector regardless of the input, unlike sapply().
table(vapply(hotel_bookings_data, typeof, character(1)))

character    double 
       13        19 
Code
# Number of distinct values found in each column
vapply(hotel_bookings_data, n_distinct, integer(1))
                         hotel                    is_canceled 
                             2                              2 
                     lead_time              arrival_date_year 
                           479                              3 
            arrival_date_month       arrival_date_week_number 
                            12                             53 
     arrival_date_day_of_month        stays_in_weekend_nights 
                            31                             17 
          stays_in_week_nights                         adults 
                            35                             14 
                      children                         babies 
                             6                              5 
                          meal                        country 
                             5                            178 
                market_segment           distribution_channel 
                             8                              5 
             is_repeated_guest         previous_cancellations 
                             2                             15 
previous_bookings_not_canceled             reserved_room_type 
                            73                             10 
            assigned_room_type                booking_changes 
                            12                             21 
                  deposit_type                          agent 
                             3                            334 
                       company           days_in_waiting_list 
                           353                            128 
                 customer_type             average_daily_rate 
                             4                           8879 
   required_car_parking_spaces      total_of_special_requests 
                             5                              6 
            reservation_status        reservation_status_date 
                             3                            926 
Code
# Distinct values of the hotel column
hotel_bookings_data %>%
  pull(hotel) %>%
  unique()
[1] "Resort Hotel" "City Hotel"  
Code
# Distinct values of the country column
hotel_bookings_data %>%
  pull(country) %>%
  unique()
  [1] "PRT"  "GBR"  "USA"  "ESP"  "IRL"  "FRA"  "NULL" "ROU"  "NOR"  "OMN" 
 [11] "ARG"  "POL"  "DEU"  "BEL"  "CHE"  "CN"   "GRC"  "ITA"  "NLD"  "DNK" 
 [21] "RUS"  "SWE"  "AUS"  "EST"  "CZE"  "BRA"  "FIN"  "MOZ"  "BWA"  "LUX" 
 [31] "SVN"  "ALB"  "IND"  "CHN"  "MEX"  "MAR"  "UKR"  "SMR"  "LVA"  "PRI" 
 [41] "SRB"  "CHL"  "AUT"  "BLR"  "LTU"  "TUR"  "ZAF"  "AGO"  "ISR"  "CYM" 
 [51] "ZMB"  "CPV"  "ZWE"  "DZA"  "KOR"  "CRI"  "HUN"  "ARE"  "TUN"  "JAM" 
 [61] "HRV"  "HKG"  "IRN"  "GEO"  "AND"  "GIB"  "URY"  "JEY"  "CAF"  "CYP" 
 [71] "COL"  "GGY"  "KWT"  "NGA"  "MDV"  "VEN"  "SVK"  "FJI"  "KAZ"  "PAK" 
 [81] "IDN"  "LBN"  "PHL"  "SEN"  "SYC"  "AZE"  "BHR"  "NZL"  "THA"  "DOM" 
 [91] "MKD"  "MYS"  "ARM"  "JPN"  "LKA"  "CUB"  "CMR"  "BIH"  "MUS"  "COM" 
[101] "SUR"  "UGA"  "BGR"  "CIV"  "JOR"  "SYR"  "SGP"  "BDI"  "SAU"  "VNM" 
[111] "PLW"  "QAT"  "EGY"  "PER"  "MLT"  "MWI"  "ECU"  "MDG"  "ISL"  "UZB" 
[121] "NPL"  "BHS"  "MAC"  "TGO"  "TWN"  "DJI"  "STP"  "KNA"  "ETH"  "IRQ" 
[131] "HND"  "RWA"  "KHM"  "MCO"  "BGD"  "IMN"  "TJK"  "NIC"  "BEN"  "VGB" 
[141] "TZA"  "GAB"  "GHA"  "TMP"  "GLP"  "KEN"  "LIE"  "GNB"  "MNE"  "UMI" 
[151] "MYT"  "FRO"  "MMR"  "PAN"  "BFA"  "LBY"  "MLI"  "NAM"  "BOL"  "PRY" 
[161] "BRB"  "ABW"  "AIA"  "SLV"  "DMA"  "PYF"  "GUY"  "LCA"  "ATA"  "GTM" 
[171] "ASM"  "MRT"  "NCL"  "KIR"  "SDN"  "ATF"  "SLE"  "LAO" 
Code
# Distinct values of the distribution_channel column
hotel_bookings_data %>%
  pull(distribution_channel) %>%
  unique()
[1] "Direct"    "Corporate" "TA/TO"     "Undefined" "GDS"      

Describe the data

This dataset contains information about hotel reservations. It has 119,390 rows and 32 columns. Based on the above, we can infer that 13 columns (variables) are of character datatype and 19 are of double datatype. Almost all of the column names are readable for the user, except adr (column 28), which stands for average daily rate. The reservation data are provided for two types of hotels (Resort Hotel and City Hotel), 178 distinct country codes (one of which is the placeholder "NULL"), and the years 2015, 2016 and 2017. Each observation in the dataset describes one reservation: the hotel category, the arrival date (day, month, year, week), the number of visitors (adults, children, babies), the country, the number of nights stayed, the meal, previous cancellations, reserved room type, customer type, number of special requests, etc. Since each row corresponds to one observation and each column to one variable, the data are already tidy and pivoting is not required.

Code
# Derive two per-booking totals:
#   total_number_of_guests = adults + children + babies
#   total_days_stay        = weekend nights + week nights
# A missing value in any addend makes the corresponding total NA.
hotel_bookings_data <- hotel_bookings_data %>%
  mutate(
    total_number_of_guests = adults + children + babies,
    total_days_stay = stays_in_weekend_nights + stays_in_week_nights
  )

# Base summary() of every column. The extra arguments previously passed here
# (varnumbers, plain.ascii, style, graph.magnif, valid.col, method,
# table.classes) are not arguments of base summary()/print() and had no
# effect on the output, so they are omitted.
# NOTE(review): they look like they were meant for summarytools::dfSummary();
# confirm whether that richer table was the intent.
summary(hotel_bookings_data)
    hotel            is_canceled       lead_time   arrival_date_year
 Length:119390      Min.   :0.0000   Min.   :  0   Min.   :2015     
 Class :character   1st Qu.:0.0000   1st Qu.: 18   1st Qu.:2016     
 Mode  :character   Median :0.0000   Median : 69   Median :2016     
                    Mean   :0.3704   Mean   :104   Mean   :2016     
                    3rd Qu.:1.0000   3rd Qu.:160   3rd Qu.:2017     
                    Max.   :1.0000   Max.   :737   Max.   :2017     
                                                                    
 arrival_date_month arrival_date_week_number arrival_date_day_of_month
 Length:119390      Min.   : 1.00            Min.   : 1.0             
 Class :character   1st Qu.:16.00            1st Qu.: 8.0             
 Mode  :character   Median :28.00            Median :16.0             
                    Mean   :27.17            Mean   :15.8             
                    3rd Qu.:38.00            3rd Qu.:23.0             
                    Max.   :53.00            Max.   :31.0             
                                                                      
 stays_in_weekend_nights stays_in_week_nights     adults      
 Min.   : 0.0000         Min.   : 0.0         Min.   : 0.000  
 1st Qu.: 0.0000         1st Qu.: 1.0         1st Qu.: 2.000  
 Median : 1.0000         Median : 2.0         Median : 2.000  
 Mean   : 0.9276         Mean   : 2.5         Mean   : 1.856  
 3rd Qu.: 2.0000         3rd Qu.: 3.0         3rd Qu.: 2.000  
 Max.   :19.0000         Max.   :50.0         Max.   :55.000  
                                                              
    children           babies              meal             country         
 Min.   : 0.0000   Min.   : 0.000000   Length:119390      Length:119390     
 1st Qu.: 0.0000   1st Qu.: 0.000000   Class :character   Class :character  
 Median : 0.0000   Median : 0.000000   Mode  :character   Mode  :character  
 Mean   : 0.1039   Mean   : 0.007949                                        
 3rd Qu.: 0.0000   3rd Qu.: 0.000000                                        
 Max.   :10.0000   Max.   :10.000000                                        
 NA's   :4                                                                  
 market_segment     distribution_channel is_repeated_guest
 Length:119390      Length:119390        Min.   :0.00000  
 Class :character   Class :character     1st Qu.:0.00000  
 Mode  :character   Mode  :character     Median :0.00000  
                                         Mean   :0.03191  
                                         3rd Qu.:0.00000  
                                         Max.   :1.00000  
                                                          
 previous_cancellations previous_bookings_not_canceled reserved_room_type
 Min.   : 0.00000       Min.   : 0.0000                Length:119390     
 1st Qu.: 0.00000       1st Qu.: 0.0000                Class :character  
 Median : 0.00000       Median : 0.0000                Mode  :character  
 Mean   : 0.08712       Mean   : 0.1371                                  
 3rd Qu.: 0.00000       3rd Qu.: 0.0000                                  
 Max.   :26.00000       Max.   :72.0000                                  
                                                                         
 assigned_room_type booking_changes   deposit_type          agent          
 Length:119390      Min.   : 0.0000   Length:119390      Length:119390     
 Class :character   1st Qu.: 0.0000   Class :character   Class :character  
 Mode  :character   Median : 0.0000   Mode  :character   Mode  :character  
                    Mean   : 0.2211                                        
                    3rd Qu.: 0.0000                                        
                    Max.   :21.0000                                        
                                                                           
   company          days_in_waiting_list customer_type      average_daily_rate
 Length:119390      Min.   :  0.000      Length:119390      Min.   :  -6.38   
 Class :character   1st Qu.:  0.000      Class :character   1st Qu.:  69.29   
 Mode  :character   Median :  0.000      Mode  :character   Median :  94.58   
                    Mean   :  2.321                         Mean   : 101.83   
                    3rd Qu.:  0.000                         3rd Qu.: 126.00   
                    Max.   :391.000                         Max.   :5400.00   
                                                                              
 required_car_parking_spaces total_of_special_requests reservation_status
 Min.   :0.00000             Min.   :0.0000            Length:119390     
 1st Qu.:0.00000             1st Qu.:0.0000            Class :character  
 Median :0.00000             Median :0.0000            Mode  :character  
 Mean   :0.06252             Mean   :0.5714                              
 3rd Qu.:0.00000             3rd Qu.:1.0000                              
 Max.   :8.00000             Max.   :5.0000                              
                                                                         
 reservation_status_date total_number_of_guests total_days_stay 
 Min.   :2014-10-17      Min.   : 0.000         Min.   : 0.000  
 1st Qu.:2016-02-01      1st Qu.: 2.000         1st Qu.: 2.000  
 Median :2016-08-07      Median : 2.000         Median : 3.000  
 Mean   :2016-07-30      Mean   : 1.968         Mean   : 3.428  
 3rd Qu.:2017-02-08      3rd Qu.: 2.000         3rd Qu.: 4.000  
 Max.   :2017-09-14      Max.   :55.000         Max.   :69.000  
                         NA's   :4                              
Code
# Keep only the bookings whose country code is "PRT"
hotel_bookings_data %>%
  filter(country == "PRT")
# A tibble: 48,590 × 34
   hotel  is_ca…¹ lead_…² arriv…³ arriv…⁴ arriv…⁵ arriv…⁶ stays…⁷ stays…⁸ adults
   <chr>    <dbl>   <dbl>   <dbl> <chr>     <dbl>   <dbl>   <dbl>   <dbl>  <dbl>
 1 Resor…       0     342    2015 July         27       1       0       0      2
 2 Resor…       0     737    2015 July         27       1       0       0      2
 3 Resor…       0       0    2015 July         27       1       0       2      2
 4 Resor…       0       9    2015 July         27       1       0       2      2
 5 Resor…       1      85    2015 July         27       1       0       3      2
 6 Resor…       1      75    2015 July         27       1       0       3      2
 7 Resor…       1      23    2015 July         27       1       0       4      2
 8 Resor…       0      35    2015 July         27       1       0       4      2
 9 Resor…       0      37    2015 July         27       1       0       4      2
10 Resor…       0      37    2015 July         27       1       0       4      2
# … with 48,580 more rows, 24 more variables: children <dbl>, babies <dbl>,
#   meal <chr>, country <chr>, market_segment <chr>,
#   distribution_channel <chr>, is_repeated_guest <dbl>,
#   previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>,
#   reserved_room_type <chr>, assigned_room_type <chr>, booking_changes <dbl>,
#   deposit_type <chr>, agent <chr>, company <chr>, days_in_waiting_list <dbl>,
#   customer_type <chr>, average_daily_rate <dbl>, …
Code
# Bookings with country code "PRT" that were not made by a repeated guest
hotel_bookings_data %>%
  filter(country == "PRT", is_repeated_guest == 0)
# A tibble: 45,340 × 34
   hotel  is_ca…¹ lead_…² arriv…³ arriv…⁴ arriv…⁵ arriv…⁶ stays…⁷ stays…⁸ adults
   <chr>    <dbl>   <dbl>   <dbl> <chr>     <dbl>   <dbl>   <dbl>   <dbl>  <dbl>
 1 Resor…       0     342    2015 July         27       1       0       0      2
 2 Resor…       0     737    2015 July         27       1       0       0      2
 3 Resor…       0       0    2015 July         27       1       0       2      2
 4 Resor…       0       9    2015 July         27       1       0       2      2
 5 Resor…       1      85    2015 July         27       1       0       3      2
 6 Resor…       1      75    2015 July         27       1       0       3      2
 7 Resor…       1      23    2015 July         27       1       0       4      2
 8 Resor…       0      35    2015 July         27       1       0       4      2
 9 Resor…       0      37    2015 July         27       1       0       4      2
10 Resor…       0      37    2015 July         27       1       0       4      2
# … with 45,330 more rows, 24 more variables: children <dbl>, babies <dbl>,
#   meal <chr>, country <chr>, market_segment <chr>,
#   distribution_channel <chr>, is_repeated_guest <dbl>,
#   previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>,
#   reserved_room_type <chr>, assigned_room_type <chr>, booking_changes <dbl>,
#   deposit_type <chr>, agent <chr>, company <chr>, days_in_waiting_list <dbl>,
#   customer_type <chr>, average_daily_rate <dbl>, …
Code
# For each country, list up to ten bookings with the highest
# average_daily_rate, keeping only the hotel, country and
# average_daily_rate columns
hotel_bookings_data %>%
  select(hotel, country, average_daily_rate) %>%
  arrange(desc(average_daily_rate)) %>%
  group_by(country) %>%
  slice_head(n = 10)
# A tibble: 1,220 × 3
# Groups:   country [178]
   hotel        country average_daily_rate
   <chr>        <chr>                <dbl>
 1 City Hotel   ABW                   158.
 2 City Hotel   ABW                    99 
 3 City Hotel   AGO                   305 
 4 City Hotel   AGO                   291.
 5 City Hotel   AGO                   274.
 6 City Hotel   AGO                   266.
 7 City Hotel   AGO                   266.
 8 Resort Hotel AGO                   255.
 9 Resort Hotel AGO                   255.
10 City Hotel   AGO                   252.
# … with 1,210 more rows

Grouped Summary Statistics

The summary table above (the output of summary()) shows descriptive statistics for the numeric variables present in the data. Based on these statistics, reservations are made on average 104 days before the anticipated arrival date, and 37% of the reservations end up being cancelled. The average length of stay is 3.43 days, and the average reservation is for 1.97 guests. On average, a reservation is changed 0.22 times after it is made (this is the mean number of booking changes, not the share of reservations that were modified). The average daily rate for the hotels is $101.83.

Code
# The ten bookings with the highest average daily rate, restricted to the
# columns needed to judge whether the rates look plausible
hotel_bookings_data %>%
  select(
    hotel, arrival_date_year, country, agent,
    total_number_of_guests, total_days_stay,
    reservation_status, average_daily_rate
  ) %>%
  arrange(desc(average_daily_rate)) %>%
  head(n = 10)
# A tibble: 10 × 8
   hotel        arrival_date_year country agent total_…¹ total…² reser…³ avera…⁴
   <chr>                    <dbl> <chr>   <chr>    <dbl>   <dbl> <chr>     <dbl>
 1 City Hotel                2016 PRT     12           2       1 Cancel…   5400 
 2 City Hotel                2017 ITA     159          1       1 Check-…    510 
 3 Resort Hotel              2015 PRT     NULL         2       1 Check-…    508 
 4 City Hotel                2016 PRT     NULL         4       2 Check-…    452.
 5 Resort Hotel              2017 PRT     314          2      14 Cancel…    450 
 6 Resort Hotel              2017 PRT     250          4       6 Cancel…    437 
 7 Resort Hotel              2017 PRT     250          4       8 Check-…    426.
 8 Resort Hotel              2017 ESP     240          4       5 Check-…    402 
 9 Resort Hotel              2017 MAR     250          5       8 Check-…    397.
10 Resort Hotel              2017 PRT     250          3      10 Cancel…    392 
# … with abbreviated variable names ¹​total_number_of_guests, ²​total_days_stay,
#   ³​reservation_status, ⁴​average_daily_rate

According to the table above, the average daily rate for a city hotel reaches as high as $5400 per night, while the highest rate for a resort hotel is only $508 per night. The $5400 value might be genuine, but let’s perform some analysis.

Code
# The ten bookings with the lowest average daily rate, restricted to the
# columns needed to judge whether the rates look plausible
hotel_bookings_data %>%
  select(
    hotel, arrival_date_year, country, agent,
    total_number_of_guests, total_days_stay,
    reservation_status, average_daily_rate
  ) %>%
  arrange(average_daily_rate) %>%
  head(n = 10)
# A tibble: 10 × 8
   hotel        arrival_date_year country agent total_…¹ total…² reser…³ avera…⁴
   <chr>                    <dbl> <chr>   <chr>    <dbl>   <dbl> <chr>     <dbl>
 1 Resort Hotel              2017 GBR     273          2      10 Check-…   -6.38
 2 Resort Hotel              2015 PRT     NULL         2       0 Check-…    0   
 3 Resort Hotel              2015 PRT     NULL         2       0 Check-…    0   
 4 Resort Hotel              2015 PRT     NULL         4       1 Check-…    0   
 5 Resort Hotel              2015 PRT     240          2       0 Check-…    0   
 6 Resort Hotel              2015 PRT     250          1       0 Check-…    0   
 7 Resort Hotel              2015 PRT     NULL         2       0 Check-…    0   
 8 Resort Hotel              2015 PRT     240          2       0 Check-…    0   
 9 Resort Hotel              2015 PRT     305          2       2 Cancel…    0   
10 Resort Hotel              2015 PRT     305          1       2 Check-…    0   
# … with abbreviated variable names ¹​total_number_of_guests, ²​total_days_stay,
#   ³​reservation_status, ⁴​average_daily_rate

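If we observe the above table, the average daily rate for one resort hotel booking is recorded as -6.38, which is incorrect because a rate cannot be negative. The $5400 rate seen earlier is also far out of line with every other booking (the next highest rate is $510), so it is most likely an error as well. I will remove both observations.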

Code
# Keep only bookings with 0 <= average_daily_rate <= 510 (both bounds
# inclusive). This removes the negative rate (-6.38) and the 5400 outlier
# while keeping the zero-rate bookings that are examined later.
hotel_bookings_data <- hotel_bookings_data %>%
  filter(between(average_daily_rate, 0, 510))

# Summary statistics of the average daily rate for each hotel type.
# Written with summarise() directly instead of the superseded summarise_if().
hotel_bookings_data %>%
  group_by(hotel) %>%
  summarise(
    min = min(average_daily_rate, na.rm = TRUE),
    max = max(average_daily_rate, na.rm = TRUE),
    mean = mean(average_daily_rate, na.rm = TRUE),
    std_dev = sd(average_daily_rate, na.rm = TRUE),
    median = median(average_daily_rate, na.rm = TRUE)
  )
# A tibble: 2 × 6
  hotel          min   max  mean std_dev median
  <chr>        <dbl> <dbl> <dbl>   <dbl>  <dbl>
1 City Hotel       0   510 105.     39.3   99.9
2 Resort Hotel     0   508  95.0    61.4   75  

Based on the above statistics, the mean average daily rate is higher for city hotels than for resort hotels, while the standard deviation is lower for city hotels than for resort hotels.

Code
# Total number of guests per country, counting only bookings whose
# reservation_status is "Check-Out"; show the ten countries with the most
# guests. Missing guest totals are ignored in the sum.
# Written with summarise() directly instead of the superseded summarise_if().
hotel_bookings_data %>%
  filter(reservation_status == "Check-Out") %>%
  group_by(country) %>%
  summarise(
    total_number_of_guests = sum(total_number_of_guests, na.rm = TRUE)
  ) %>%
  arrange(desc(total_number_of_guests)) %>%
  head(n = 10)
# A tibble: 10 × 2
   country total_number_of_guests
   <chr>                    <dbl>
 1 PRT                      37670
 2 GBR                      19256
 3 FRA                      17299
 4 ESP                      13213
 5 DEU                      11659
 6 IRL                       5102
 7 ITA                       4894
 8 BEL                       3855
 9 NLD                       3374
10 USA                       3212
Code
# Share of cancelled bookings for five selected countries, highest first.
# is_canceled is a 0/1 indicator, so its mean is the cancellation rate.
# Written with summarise() directly instead of the superseded summarise_if().
hotel_bookings_data %>%
  filter(country %in% c("PRT", "GBR", "ESP", "FRA", "ITA")) %>%
  group_by(country) %>%
  summarise(is_canceled = mean(is_canceled, na.rm = TRUE)) %>%
  arrange(desc(is_canceled))
# A tibble: 5 × 2
  country is_canceled
  <chr>         <dbl>
1 PRT           0.566
2 ITA           0.354
3 ESP           0.254
4 GBR           0.202
5 FRA           0.186

Based on the above tables, Portugal, Great Britain, France, Spain, Germany, Ireland, Italy, Belgium, the Netherlands and the USA top the list of countries by total number of guests on checked-out reservations. But we can also see that 56.63% of the reservations for Portugal are cancelled. Italy and Spain have 35.39% and 25.4% cancellations, respectively.

Code
# Mean average daily rate per country; show the ten countries with the
# highest mean.
# Written with summarise() directly instead of the superseded summarise_if().
hotel_bookings_data %>%
  group_by(country) %>%
  summarise(
    average_daily_rate = mean(average_daily_rate, na.rm = TRUE)
  ) %>%
  arrange(desc(average_daily_rate)) %>%
  head(n = 10)
# A tibble: 10 × 2
   country average_daily_rate
   <chr>                <dbl>
 1 DJI                   273 
 2 AIA                   265 
 3 AND                   203.
 4 UMI                   200 
 5 LAO                   182.
 6 MYT                   178.
 7 NCL                   176.
 8 GEO                   169.
 9 COM                   165.
10 FRO                   155.

The above table lists the 10 countries with the highest mean average daily rate (the rates are averaged over all bookings grouped by country). The highest rates are found for Djibouti, Anguilla, Andorra, the United States Minor Outlying Islands, and so on, which suggests that hotel stays associated with small countries with fewer visitors are more expensive.

Code
# Count the bookings whose average daily rate is exactly zero

hotel_bookings_data %>%
  filter(average_daily_rate == 0) %>%
  summarise(n = n())
# A tibble: 1 × 1
      n
  <int>
1  1959
Code
# Number of bookings with a zero average daily rate in each country,
# largest counts first (head() keeps the first six rows)
hotel_bookings_data %>%
  filter(average_daily_rate == 0) %>%
  group_by(country) %>%
  count() %>%
  arrange(desc(n)) %>%
  head(n = 6)
# A tibble: 6 × 2
# Groups:   country [6]
  country     n
  <chr>   <int>
1 PRT      1550
2 ESP        79
3 GBR        73
4 FRA        55
5 DEU        41
6 NULL       21

Based on the above tables, there are 1959 observations with an average daily rate of 0. Portugal has 1550 of these observations, followed by Spain and Great Britain. Since Portugal has by far the most observations with a zero average daily rate, this needs further investigation.