Mutating dataset using lubridate, stringr: hotel_bookings.csv

challenge_4
hotel_bookings.csv
readr
lubridate, stringr
Author

Susmita Madineni

Published

March 25, 2022

Code
library(tidyverse)
library(lubridate)


knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)

Read in data

Code
library(readr)
hotel_bookings_data <- read_csv("_data/hotel_bookings.csv")
View(hotel_bookings_data)
Code
# Preview the first few rows of the dataset
head(hotel_bookings_data)
# A tibble: 6 × 32
  hotel   is_ca…¹ lead_…² arriv…³ arriv…⁴ arriv…⁵ arriv…⁶ stays…⁷ stays…⁸ adults
  <chr>     <dbl>   <dbl>   <dbl> <chr>     <dbl>   <dbl>   <dbl>   <dbl>  <dbl>
1 Resort…       0     342    2015 July         27       1       0       0      2
2 Resort…       0     737    2015 July         27       1       0       0      2
3 Resort…       0       7    2015 July         27       1       0       1      1
4 Resort…       0      13    2015 July         27       1       0       1      1
5 Resort…       0      14    2015 July         27       1       0       2      2
6 Resort…       0      14    2015 July         27       1       0       2      2
# … with 22 more variables: children <dbl>, babies <dbl>, meal <chr>,
#   country <chr>, market_segment <chr>, distribution_channel <chr>,
#   is_repeated_guest <dbl>, previous_cancellations <dbl>,
#   previous_bookings_not_canceled <dbl>, reserved_room_type <chr>,
#   assigned_room_type <chr>, booking_changes <dbl>, deposit_type <chr>,
#   agent <chr>, company <chr>, days_in_waiting_list <dbl>,
#   customer_type <chr>, adr <dbl>, required_car_parking_spaces <dbl>, …
Code
# Understanding the dimensions of the dataset 
dim(hotel_bookings_data)
[1] 119390     32
Code
# Identifying the column names of the dataset 
colnames(hotel_bookings_data)
 [1] "hotel"                          "is_canceled"                   
 [3] "lead_time"                      "arrival_date_year"             
 [5] "arrival_date_month"             "arrival_date_week_number"      
 [7] "arrival_date_day_of_month"      "stays_in_weekend_nights"       
 [9] "stays_in_week_nights"           "adults"                        
[11] "children"                       "babies"                        
[13] "meal"                           "country"                       
[15] "market_segment"                 "distribution_channel"          
[17] "is_repeated_guest"              "previous_cancellations"        
[19] "previous_bookings_not_canceled" "reserved_room_type"            
[21] "assigned_room_type"             "booking_changes"               
[23] "deposit_type"                   "agent"                         
[25] "company"                        "days_in_waiting_list"          
[27] "customer_type"                  "adr"                           
[29] "required_car_parking_spaces"    "total_of_special_requests"     
[31] "reservation_status"             "reservation_status_date"       
Code
# Changing column name to make it more readable
colnames(hotel_bookings_data)[28] <- "average_daily_rate"

# Identifying the data types of the columns
sapply(hotel_bookings_data, class)
                         hotel                    is_canceled 
                   "character"                      "numeric" 
                     lead_time              arrival_date_year 
                     "numeric"                      "numeric" 
            arrival_date_month       arrival_date_week_number 
                   "character"                      "numeric" 
     arrival_date_day_of_month        stays_in_weekend_nights 
                     "numeric"                      "numeric" 
          stays_in_week_nights                         adults 
                     "numeric"                      "numeric" 
                      children                         babies 
                     "numeric"                      "numeric" 
                          meal                        country 
                   "character"                    "character" 
                market_segment           distribution_channel 
                   "character"                    "character" 
             is_repeated_guest         previous_cancellations 
                     "numeric"                      "numeric" 
previous_bookings_not_canceled             reserved_room_type 
                     "numeric"                    "character" 
            assigned_room_type                booking_changes 
                   "character"                      "numeric" 
                  deposit_type                          agent 
                   "character"                    "character" 
                       company           days_in_waiting_list 
                   "character"                      "numeric" 
                 customer_type             average_daily_rate 
                   "character"                      "numeric" 
   required_car_parking_spaces      total_of_special_requests 
                     "numeric"                      "numeric" 
            reservation_status        reservation_status_date 
                   "character"                         "Date" 
Code
table(sapply(hotel_bookings_data, function(x) typeof(x)))

character    double 
       13        19 
Code
sapply(hotel_bookings_data, function(x) n_distinct(x))
                         hotel                    is_canceled 
                             2                              2 
                     lead_time              arrival_date_year 
                           479                              3 
            arrival_date_month       arrival_date_week_number 
                            12                             53 
     arrival_date_day_of_month        stays_in_weekend_nights 
                            31                             17 
          stays_in_week_nights                         adults 
                            35                             14 
                      children                         babies 
                             6                              5 
                          meal                        country 
                             5                            178 
                market_segment           distribution_channel 
                             8                              5 
             is_repeated_guest         previous_cancellations 
                             2                             15 
previous_bookings_not_canceled             reserved_room_type 
                            73                             10 
            assigned_room_type                booking_changes 
                            12                             21 
                  deposit_type                          agent 
                             3                            334 
                       company           days_in_waiting_list 
                           353                            128 
                 customer_type             average_daily_rate 
                             4                           8879 
   required_car_parking_spaces      total_of_special_requests 
                             5                              6 
            reservation_status        reservation_status_date 
                             3                            926 
Code
# Identifying unique values for columns - hotel, country, distribution_channel
unique(hotel_bookings_data$hotel)
[1] "Resort Hotel" "City Hotel"  
Code
unique(hotel_bookings_data$country)
  [1] "PRT"  "GBR"  "USA"  "ESP"  "IRL"  "FRA"  "NULL" "ROU"  "NOR"  "OMN" 
 [11] "ARG"  "POL"  "DEU"  "BEL"  "CHE"  "CN"   "GRC"  "ITA"  "NLD"  "DNK" 
 [21] "RUS"  "SWE"  "AUS"  "EST"  "CZE"  "BRA"  "FIN"  "MOZ"  "BWA"  "LUX" 
 [31] "SVN"  "ALB"  "IND"  "CHN"  "MEX"  "MAR"  "UKR"  "SMR"  "LVA"  "PRI" 
 [41] "SRB"  "CHL"  "AUT"  "BLR"  "LTU"  "TUR"  "ZAF"  "AGO"  "ISR"  "CYM" 
 [51] "ZMB"  "CPV"  "ZWE"  "DZA"  "KOR"  "CRI"  "HUN"  "ARE"  "TUN"  "JAM" 
 [61] "HRV"  "HKG"  "IRN"  "GEO"  "AND"  "GIB"  "URY"  "JEY"  "CAF"  "CYP" 
 [71] "COL"  "GGY"  "KWT"  "NGA"  "MDV"  "VEN"  "SVK"  "FJI"  "KAZ"  "PAK" 
 [81] "IDN"  "LBN"  "PHL"  "SEN"  "SYC"  "AZE"  "BHR"  "NZL"  "THA"  "DOM" 
 [91] "MKD"  "MYS"  "ARM"  "JPN"  "LKA"  "CUB"  "CMR"  "BIH"  "MUS"  "COM" 
[101] "SUR"  "UGA"  "BGR"  "CIV"  "JOR"  "SYR"  "SGP"  "BDI"  "SAU"  "VNM" 
[111] "PLW"  "QAT"  "EGY"  "PER"  "MLT"  "MWI"  "ECU"  "MDG"  "ISL"  "UZB" 
[121] "NPL"  "BHS"  "MAC"  "TGO"  "TWN"  "DJI"  "STP"  "KNA"  "ETH"  "IRQ" 
[131] "HND"  "RWA"  "KHM"  "MCO"  "BGD"  "IMN"  "TJK"  "NIC"  "BEN"  "VGB" 
[141] "TZA"  "GAB"  "GHA"  "TMP"  "GLP"  "KEN"  "LIE"  "GNB"  "MNE"  "UMI" 
[151] "MYT"  "FRO"  "MMR"  "PAN"  "BFA"  "LBY"  "MLI"  "NAM"  "BOL"  "PRY" 
[161] "BRB"  "ABW"  "AIA"  "SLV"  "DMA"  "PYF"  "GUY"  "LCA"  "ATA"  "GTM" 
[171] "ASM"  "MRT"  "NCL"  "KIR"  "SDN"  "ATF"  "SLE"  "LAO" 
Code
unique(hotel_bookings_data$distribution_channel)
[1] "Direct"    "Corporate" "TA/TO"     "Undefined" "GDS"      

Briefly describe the data

This dataset gives the information about hotel reservations. The dataset has 11930 rows and 32 columns. Based on the above, we can infer that there are 13 columns(variables) are of character datatype and 19 variables are of double datatype. Almost all the columns are readable for the user, except adr(column 28) which stands for average daily rate. The hotel reservation data is provided for two types of hotels(Resort, City hotels) in various countries(178 countries) in years 2015, 2016 and 2017. Each observation in the dataset provides information of hotel category, reservation of the hotel information like arrival date(date, month, year, week), number of visitors(adult, children), country, number of days staying, meal, previous cancellations, reserved room types, customer type, number of special requests etc.

Tidy Data (as needed)

Is your data already tidy, or is there work to be done? Be sure to anticipate your end result to provide a sanity check, and document your work here.

In the process of tidying the data we can check if there are any NULL values in the data, and if the class type and values present in a particular column are easy to identify or not. For example, a class can be defined as a character, but there might be numeric values in it, which makes it hard for the user in analyzing the dataset.

While analyzing the dataset, I have found that few of the columns like country, company has NULL values in it. I will remove the country cases where there is a NULL value since it doesn’t help in data analysis. I will also change the class of company and agent to numeric since they have numeric values in it but are defined as character classes. I will make the NULL values in these two columns as NA.

Code
table(hotel_bookings_data$country)

  ABW   AGO   AIA   ALB   AND   ARE   ARG   ARM   ASM   ATA   ATF   AUS   AUT 
    2   362     1    12     7    51   214     8     1     2     1   426  1263 
  AZE   BDI   BEL   BEN   BFA   BGD   BGR   BHR   BHS   BIH   BLR   BOL   BRA 
   17     1  2342     3     1    12    75     5     1    13    26    10  2224 
  BRB   BWA   CAF   CHE   CHL   CHN   CIV   CMR    CN   COL   COM   CPV   CRI 
    4     1     5  1730    65   999     6    10  1279    71     2    24    19 
  CUB   CYM   CYP   CZE   DEU   DJI   DMA   DNK   DOM   DZA   ECU   EGY   ESP 
    8     1    51   171  7287     1     1   435    14   103    27    32  8568 
  EST   ETH   FIN   FJI   FRA   FRO   GAB   GBR   GEO   GGY   GHA   GIB   GLP 
   83     3   447     1 10415     5     4 12129    22     3     4    18     2 
  GNB   GRC   GTM   GUY   HKG   HND   HRV   HUN   IDN   IMN   IND   IRL   IRN 
    9   128     4     1    29     1   100   230    35     2   152  3375    83 
  IRQ   ISL   ISR   ITA   JAM   JEY   JOR   JPN   KAZ   KEN   KHM   KIR   KNA 
   14    57   669  3766     6     8    21   197    19     6     2     1     2 
  KOR   KWT   LAO   LBN   LBY   LCA   LIE   LKA   LTU   LUX   LVA   MAC   MAR 
  133    16     2    31     8     1     3     7    81   287    55    16   259 
  MCO   MDG   MDV   MEX   MKD   MLI   MLT   MMR   MNE   MOZ   MRT   MUS   MWI 
    4     1    12    85    10     1    18     1     5    67     1     7     2 
  MYS   MYT   NAM   NCL   NGA   NIC   NLD   NOR   NPL  NULL   NZL   OMN   PAK 
   28     2     1     1    34     1  2104   607     1   488    74    18    14 
  PAN   PER   PHL   PLW   POL   PRI   PRT   PRY   PYF   QAT   ROU   RUS   RWA 
    9    29    40     1   919    12 48590     4     1    15   500   632     2 
  SAU   SDN   SEN   SGP   SLE   SLV   SMR   SRB   STP   SUR   SVK   SVN   SWE 
   48     1    11    39     1     2     1   101     2     5    65    57  1024 
  SYC   SYR   TGO   THA   TJK   TMP   TUN   TUR   TWN   TZA   UGA   UKR   UMI 
    2     3     2    59     9     3    39   248    51     5     2    68     1 
  URY   USA   UZB   VEN   VGB   VNM   ZAF   ZMB   ZWE 
   32  2097     4    26     1     8    80     2     4 

There are 488 rows with country value as NULL.

Code
table(hotel_bookings_data$company)

    10    100    101    102    103    104    105    106    107    108    109 
     1      1      1      1     16      1      8      2      9     11      1 
    11    110    112    113    115    116    118     12    120    122    126 
     1     52     13     36      4      6      7     14     14     18      1 
   127    130    132    135    137    139     14    140    142    143    144 
    15     12      1     66      4      3      9      1      1     17     27 
   146    148    149    150    153    154    158    159     16    160    163 
     3     37      5     19    215    133      2      6      5      1     17 
   165    167    168    169    174    178    179     18    180    183    184 
     3      7      2     65    149     27     24      1      5     16      1 
   185    186    192    193    195    197     20    200    202    203    204 
     4     12      4     16     38     47     50      3     38     13     34 
   207    209    210    212    213    215    216    217    218    219     22 
     9     19      2      1      1      8     21      2     43    141      6 
   220    221    222    223    224    225    227    229    230    232    233 
     4     27      2    784      3      7     24      1      3      2    114 
   234    237    238    240    242    243    245    246    250    251    253 
     1      1     33      3     62      2      3      3      2     18      1 
   254    255    257    258    259    260    263    264    268    269    270 
    10      6      1      1      2      3     14      2     14     33     43 
   271    272    273    274    275    277    278    279     28    280    281 
     2      3      1     14      3      5      2      8      5     48    138 
   282    284    286    287    288    289     29    290    291    292    293 
     4      1     21      5      1      2      2     17     12     18      5 
   297    301    302    304    305    307    308    309     31    311    312 
     7      1      5      2      1     36     33      1     17      2      3 
   313    314    316    317    318    319     32    320    321    323    324 
     1      1      2      9      1      3      1      1      2     10      9 
   325    329    330    331    332    333    334    337    338     34    341 
     2     12      4     61      2     11      3     25     12      8      5 
   342    343    346    347    348    349     35    350    351    352    353 
    48     29     14      1     59      2      1      3      2      1      4 
   355    356    357    358    360    361    362    364    365    366    367 
    13     10      5      7     12      2      2      6     29     24     14 
   368    369     37    370    371    372    373    376    377    378    379 
     1      5     10      2     11      3      1      1      5      3      9 
    38    380    382    383    384    385    386    388     39    390    391 
    51     12      5      6      9     30      1      7      8     13      2 
   392    393    394    395    396    397    398    399     40    400    401 
     4      1      6      4     18     15      1     11    927      2      1 
   402    403    405    407    408    409    410    411    412    413    415 
     1      2    119     22     15     12      5      2      1      1      1 
   416    417    418    419     42    420    421    422    423    424    425 
     1      1     25      1      5      1      9      1      2     24      1 
   426    428    429     43    433    435    436    437    439    442    443 
     4     13      2     29      2     12      2      7      6      1      5 
   444    445    446    447    448     45    450    451    452    454    455 
     5      4      1      2      4    250     10      6      4      1      1 
   456    457    458    459     46    460    461    465    466     47    470 
     2      3      2      5     26      3      1     12      3     72      5 
   477    478    479     48    481    482    483    484    485    486    487 
    23      2      1      5      1      2      2      2     14      2      1 
   489     49    490    491    492    494    496    497    498    499    501 
     1      5      5      2      2      4      1      1     58      1      1 
   504    506    507     51    511    512    513    514    515    516    518 
    11      1     23     99      6      3      2      2      6      1      2 
    52    520    521    523    525    528     53    530    531    534    539 
     2      1      7     19     15      2      8      5      1      2      2 
    54    541    543     59      6     61     62     64     65     67     68 
     1      1      2      7      1      2     47      1      1    267     46 
    71     72     73     76     77     78      8     80     81     82     83 
     2     30      3      1      1     22      1      1     23     14      9 
    84     85     86     88      9     91     92     93     94     96     99 
     3      2     32     22     37     48     13      3     87      1     12 
  NULL 
112593 
Code
table(hotel_bookings_data$agent)

    1    10   103   104   105   106   107    11   110   111   112   114   115 
 7191   260    21    53    14     2     2   395    12    16    15     1   225 
  117   118   119    12   121   122   126   127   128   129    13   132   133 
    1    69   304   578    37     2    14    45    23    14    82   143    56 
  134   135   138   139    14   141   142   143   144   146   147   148   149 
  482     2   287     8  3640     6   137   172     1   124   156     4    28 
   15   150   151   152   153   154   155   156   157   158   159    16   162 
  402     5    56   183    25   193    94   190    61     1    89   246    37 
  163   165   167   168    17   170   171   173   174   175   177   179   180 
    7     1     3   184   241    93   607    29    22   195   347     2     4 
  181   182   183   184   185   187    19   191   192   193   195   196   197 
   59     8    45    52    78    24  1061   198    41    15   193   301     1 
    2    20   201   205   208    21   210   211   213   214   215   216   219 
  162   540    42    27   173   875     7     2     1     5    15     1    13 
   22   220   223   227   229    23   232   234   235   236    24   240   241 
  382   104    18     2   786    25     2   128    29   247    22 13922  1721 
  242   243   244   245   247   248   249    25   250   251   252   253   254 
  780   514     4    37     1   131    51     3  2870   220    29    87    29 
  256   257   258    26   261   262   265   267   269    27   270   273   275 
   24    24     3   401    38    22     1     1     2   450     6   349     8 
  276   278    28   280   281   282   283   285   286   287   288   289    29 
    8     1  1666     1    82     2     2     1    45     8    14     1   683 
  290   291   294   295   296   298   299     3    30   300   301   302   303 
   19     1     1     4    42   472     1  1336   484     1     1     3     2 
  304   305   306   307   308    31   310   313   314   315    32   321   323 
    1    45    35    14    54   162    25    36   927   284    15     3    25 
  324   325   326   327   328    33   330   331   332   333   334   335   336 
    9     6   165    20     9    31   125     2    55     1    28     4    23 
  337   339    34   341   344   346   348    35   350   352   354   355   358 
    1    77   294     4     8     1    22   109    28     1    14     4     1 
  359    36   360   363   364   367   368    37   370   371   375   378    38 
   21   100    15     6    19     1    45  1230     3     4    40    36   274 
  384   385   387   388    39   390   391   393   394   397     4    40   403 
    2    60    32     1   127    57     2    13    33     1    47  1039     4 
  404   405   406   408    41   410   411   414   416   418    42   420   423 
    2     5     1     1    75   133    16     2     1     8   211     3    19 
  425   426   427   429   430   431   432   433   434   436   438    44   440 
   16     3     3     5     4     1     1     1    33    49     2   292    56 
  441   444   446   449    45   450   451   453   454   455   459   461   464 
    7     1     1     2    32     1     1     1     2    19    16     2    98 
  467   468   469    47   472   474   475   476   479   480   481   483   484 
   39    49     2    50     1    17     8     2    32     1     8     1    11 
  492   493   495   497     5    50   502   508   509   510    52   526   527 
   28    35    57     1   330    20    24     6    10     2   137    10    35 
   53   531   535    54    55    56    57    58    59     6    60    61    63 
   18    68     3     1    16   375    28   335     1  3290    19     2    29 
   64    66    67    68    69     7    70    71    72    73    74    75    77 
   23    44   127   211    90  3539     1    73     6     1    20    73    33 
   78    79     8    81    82    83    85    86    87    88    89     9    90 
   37    47  1514     6    77   696   554   338    77    19    99 31961     1 
   91    92    93    94    95    96    98    99  NULL 
   58     7     1   114   135   537   124    68 16340 

There are 16340 rows as NULL in agent column and 112593 rows as NULL in company column.

Code
# filtering out NULL values in country column

hotel_bookings_data <- hotel_bookings_data %>% 
  filter(!(country == "NULL"))
table(hotel_bookings_data$country)

  ABW   AGO   AIA   ALB   AND   ARE   ARG   ARM   ASM   ATA   ATF   AUS   AUT 
    2   362     1    12     7    51   214     8     1     2     1   426  1263 
  AZE   BDI   BEL   BEN   BFA   BGD   BGR   BHR   BHS   BIH   BLR   BOL   BRA 
   17     1  2342     3     1    12    75     5     1    13    26    10  2224 
  BRB   BWA   CAF   CHE   CHL   CHN   CIV   CMR    CN   COL   COM   CPV   CRI 
    4     1     5  1730    65   999     6    10  1279    71     2    24    19 
  CUB   CYM   CYP   CZE   DEU   DJI   DMA   DNK   DOM   DZA   ECU   EGY   ESP 
    8     1    51   171  7287     1     1   435    14   103    27    32  8568 
  EST   ETH   FIN   FJI   FRA   FRO   GAB   GBR   GEO   GGY   GHA   GIB   GLP 
   83     3   447     1 10415     5     4 12129    22     3     4    18     2 
  GNB   GRC   GTM   GUY   HKG   HND   HRV   HUN   IDN   IMN   IND   IRL   IRN 
    9   128     4     1    29     1   100   230    35     2   152  3375    83 
  IRQ   ISL   ISR   ITA   JAM   JEY   JOR   JPN   KAZ   KEN   KHM   KIR   KNA 
   14    57   669  3766     6     8    21   197    19     6     2     1     2 
  KOR   KWT   LAO   LBN   LBY   LCA   LIE   LKA   LTU   LUX   LVA   MAC   MAR 
  133    16     2    31     8     1     3     7    81   287    55    16   259 
  MCO   MDG   MDV   MEX   MKD   MLI   MLT   MMR   MNE   MOZ   MRT   MUS   MWI 
    4     1    12    85    10     1    18     1     5    67     1     7     2 
  MYS   MYT   NAM   NCL   NGA   NIC   NLD   NOR   NPL   NZL   OMN   PAK   PAN 
   28     2     1     1    34     1  2104   607     1    74    18    14     9 
  PER   PHL   PLW   POL   PRI   PRT   PRY   PYF   QAT   ROU   RUS   RWA   SAU 
   29    40     1   919    12 48590     4     1    15   500   632     2    48 
  SDN   SEN   SGP   SLE   SLV   SMR   SRB   STP   SUR   SVK   SVN   SWE   SYC 
    1    11    39     1     2     1   101     2     5    65    57  1024     2 
  SYR   TGO   THA   TJK   TMP   TUN   TUR   TWN   TZA   UGA   UKR   UMI   URY 
    3     2    59     9     3    39   248    51     5     2    68     1    32 
  USA   UZB   VEN   VGB   VNM   ZAF   ZMB   ZWE 
 2097     4    26     1     8    80     2     4 

Identify variables that need to be mutated

Are there any variables that require mutation to be usable in your analysis stream? For example, are all time variables correctly coded as dates? Are all string variables reduced and cleaned to sensible categories? Do you need to turn any variables into factors and reorder for ease of graphics and visualization?

I have observed that each of the date, month, year are presented in a different column, instead of this we can combine all three columns and have 1 column as date to track the reservations.

Code
# Mutating the month, date, year columns to 1 column - date arrived and mutating adults, children, babies to total_number_of_guests and mutating stays_in_weekend_nights, stays_in_week_nights to total_days_stay

hotel_bookings_data_m <- hotel_bookings_data %>%
  mutate(date_arrived = str_c(arrival_date_month, arrival_date_day_of_month, arrival_date_year, sep = "/"), 
        date_arrived = mdy(date_arrived),
        total_days_stay = stays_in_weekend_nights + stays_in_week_nights,
        total_number_of_guests = adults + children + babies) %>%
  select(-c(arrival_date_month, arrival_date_day_of_month, arrival_date_year))

hotel_bookings_data_m
# A tibble: 118,902 × 32
   hotel     is_ca…¹ lead_…² arriv…³ stays…⁴ stays…⁵ adults child…⁶ babies meal 
   <chr>       <dbl>   <dbl>   <dbl>   <dbl>   <dbl>  <dbl>   <dbl>  <dbl> <chr>
 1 Resort H…       0     342      27       0       0      2       0      0 BB   
 2 Resort H…       0     737      27       0       0      2       0      0 BB   
 3 Resort H…       0       7      27       0       1      1       0      0 BB   
 4 Resort H…       0      13      27       0       1      1       0      0 BB   
 5 Resort H…       0      14      27       0       2      2       0      0 BB   
 6 Resort H…       0      14      27       0       2      2       0      0 BB   
 7 Resort H…       0       0      27       0       2      2       0      0 BB   
 8 Resort H…       0       9      27       0       2      2       0      0 FB   
 9 Resort H…       1      85      27       0       3      2       0      0 BB   
10 Resort H…       1      75      27       0       3      2       0      0 HB   
# … with 118,892 more rows, 22 more variables: country <chr>,
#   market_segment <chr>, distribution_channel <chr>, is_repeated_guest <dbl>,
#   previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>,
#   reserved_room_type <chr>, assigned_room_type <chr>, booking_changes <dbl>,
#   deposit_type <chr>, agent <chr>, company <chr>, days_in_waiting_list <dbl>,
#   customer_type <chr>, average_daily_rate <dbl>,
#   required_car_parking_spaces <dbl>, total_of_special_requests <dbl>, …

We can see that date_arrived, total_days_stay, total_number_of_guests columns are present after the mutation. Let’s understand the statistics of these columns.

Code
# Calculating summaries of date_arrived, total_days_stay, total_number_of_guests columns

summary(hotel_bookings_data_m$date_arrived)
        Min.      1st Qu.       Median         Mean      3rd Qu.         Max. 
"2015-07-01" "2016-03-14" "2016-09-07" "2016-08-29" "2017-03-19" "2017-08-31" 
Code
summary(hotel_bookings_data_m$total_days_stay)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.000   2.000   3.000   3.431   4.000  57.000 
Code
summary(hotel_bookings_data_m$total_number_of_guests)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  0.000   2.000   2.000   1.971   2.000  55.000       4 

Based on the above summary, we can say that the reservation data from July 1st 2015 to 31st August 2017 are present and the minimum and maximum number of days stayed are 0, 57. The minimum and maximum total number of guests stayed are 0 and 55.

Mutating the class of agent and company fields to numeric

Code
# Mutating the class of agent and company fields to numeric and updating NULL values as NA
hotel_bookings_data_m <- hotel_bookings_data_m %>%
  mutate(across(c(company, agent), ~ replace(.,str_detect(.,"NULL"), NA))) %>% mutate_at(vars(company, agent), as.numeric)

# verifying whether the classes are updated as numeric for company and agent columns
is.numeric(hotel_bookings_data_m$company)
[1] TRUE
Code
is.numeric(hotel_bookings_data_m$agent)
[1] TRUE
Code
sapply(hotel_bookings_data_m, class)
                         hotel                    is_canceled 
                   "character"                      "numeric" 
                     lead_time       arrival_date_week_number 
                     "numeric"                      "numeric" 
       stays_in_weekend_nights           stays_in_week_nights 
                     "numeric"                      "numeric" 
                        adults                       children 
                     "numeric"                      "numeric" 
                        babies                           meal 
                     "numeric"                    "character" 
                       country                 market_segment 
                   "character"                    "character" 
          distribution_channel              is_repeated_guest 
                   "character"                      "numeric" 
        previous_cancellations previous_bookings_not_canceled 
                     "numeric"                      "numeric" 
            reserved_room_type             assigned_room_type 
                   "character"                    "character" 
               booking_changes                   deposit_type 
                     "numeric"                    "character" 
                         agent                        company 
                     "numeric"                      "numeric" 
          days_in_waiting_list                  customer_type 
                     "numeric"                    "character" 
            average_daily_rate    required_car_parking_spaces 
                     "numeric"                      "numeric" 
     total_of_special_requests             reservation_status 
                     "numeric"                    "character" 
       reservation_status_date                   date_arrived 
                        "Date"                         "Date" 
               total_days_stay         total_number_of_guests 
                     "numeric"                      "numeric" 
Code
hotel_bookings_data_m
# A tibble: 118,902 × 32
   hotel     is_ca…¹ lead_…² arriv…³ stays…⁴ stays…⁵ adults child…⁶ babies meal 
   <chr>       <dbl>   <dbl>   <dbl>   <dbl>   <dbl>  <dbl>   <dbl>  <dbl> <chr>
 1 Resort H…       0     342      27       0       0      2       0      0 BB   
 2 Resort H…       0     737      27       0       0      2       0      0 BB   
 3 Resort H…       0       7      27       0       1      1       0      0 BB   
 4 Resort H…       0      13      27       0       1      1       0      0 BB   
 5 Resort H…       0      14      27       0       2      2       0      0 BB   
 6 Resort H…       0      14      27       0       2      2       0      0 BB   
 7 Resort H…       0       0      27       0       2      2       0      0 BB   
 8 Resort H…       0       9      27       0       2      2       0      0 FB   
 9 Resort H…       1      85      27       0       3      2       0      0 BB   
10 Resort H…       1      75      27       0       3      2       0      0 HB   
# … with 118,892 more rows, 22 more variables: country <chr>,
#   market_segment <chr>, distribution_channel <chr>, is_repeated_guest <dbl>,
#   previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>,
#   reserved_room_type <chr>, assigned_room_type <chr>, booking_changes <dbl>,
#   deposit_type <chr>, agent <dbl>, company <dbl>, days_in_waiting_list <dbl>,
#   customer_type <chr>, average_daily_rate <dbl>,
#   required_car_parking_spaces <dbl>, total_of_special_requests <dbl>, …