challenge_4
maanusri balasubramanian
hotel_bookings
More data wrangling: pivoting
Author

Maanusri Balasubramanian

Published

May 5, 2023

Code
library(tidyverse)

knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)

Challenge Overview

Today’s challenge is to:

  1. read in a data set, and describe the data set using both words and any supporting information (e.g., tables, etc)
  2. tidy data (as needed, including sanity checks)
  3. identify variables that need to be mutated
  4. mutate variables and sanity check all mutations

Read in data

Read in one (or more) of the following datasets, using the correct R package and command.

  • abc_poll.csv ⭐
  • poultry_tidy.xlsx or organiceggpoultry.xls⭐⭐
  • FedFundsRate.csv⭐⭐⭐
  • hotel_bookings.csv⭐⭐⭐⭐
  • debt_in_trillions.xlsx ⭐⭐⭐⭐⭐
Code
# reading the dataset from csv
bookings <- read_csv("_data/hotel_bookings.csv")

# number of rows
nrow(bookings)
[1] 119390
Code
# number of columns
ncol(bookings)
[1] 32
Code
# taking a peak into the dataset
head(bookings)
# A tibble: 6 × 32
  hotel        is_canceled lead_time arrival_date_year arrival_date_month
  <chr>              <dbl>     <dbl>             <dbl> <chr>             
1 Resort Hotel           0       342              2015 July              
2 Resort Hotel           0       737              2015 July              
3 Resort Hotel           0         7              2015 July              
4 Resort Hotel           0        13              2015 July              
5 Resort Hotel           0        14              2015 July              
6 Resort Hotel           0        14              2015 July              
# ℹ 27 more variables: arrival_date_week_number <dbl>,
#   arrival_date_day_of_month <dbl>, stays_in_weekend_nights <dbl>,
#   stays_in_week_nights <dbl>, adults <dbl>, children <dbl>, babies <dbl>,
#   meal <chr>, country <chr>, market_segment <chr>,
#   distribution_channel <chr>, is_repeated_guest <dbl>,
#   previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>,
#   reserved_room_type <chr>, assigned_room_type <chr>, …
Code
# printing column names
colnames(bookings)
 [1] "hotel"                          "is_canceled"                   
 [3] "lead_time"                      "arrival_date_year"             
 [5] "arrival_date_month"             "arrival_date_week_number"      
 [7] "arrival_date_day_of_month"      "stays_in_weekend_nights"       
 [9] "stays_in_week_nights"           "adults"                        
[11] "children"                       "babies"                        
[13] "meal"                           "country"                       
[15] "market_segment"                 "distribution_channel"          
[17] "is_repeated_guest"              "previous_cancellations"        
[19] "previous_bookings_not_canceled" "reserved_room_type"            
[21] "assigned_room_type"             "booking_changes"               
[23] "deposit_type"                   "agent"                         
[25] "company"                        "days_in_waiting_list"          
[27] "customer_type"                  "adr"                           
[29] "required_car_parking_spaces"    "total_of_special_requests"     
[31] "reservation_status"             "reservation_status_date"       
Code
str(bookings)
spc_tbl_ [119,390 × 32] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ hotel                         : chr [1:119390] "Resort Hotel" "Resort Hotel" "Resort Hotel" "Resort Hotel" ...
 $ is_canceled                   : num [1:119390] 0 0 0 0 0 0 0 0 1 1 ...
 $ lead_time                     : num [1:119390] 342 737 7 13 14 14 0 9 85 75 ...
 $ arrival_date_year             : num [1:119390] 2015 2015 2015 2015 2015 ...
 $ arrival_date_month            : chr [1:119390] "July" "July" "July" "July" ...
 $ arrival_date_week_number      : num [1:119390] 27 27 27 27 27 27 27 27 27 27 ...
 $ arrival_date_day_of_month     : num [1:119390] 1 1 1 1 1 1 1 1 1 1 ...
 $ stays_in_weekend_nights       : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
 $ stays_in_week_nights          : num [1:119390] 0 0 1 1 2 2 2 2 3 3 ...
 $ adults                        : num [1:119390] 2 2 1 1 2 2 2 2 2 2 ...
 $ children                      : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
 $ babies                        : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
 $ meal                          : chr [1:119390] "BB" "BB" "BB" "BB" ...
 $ country                       : chr [1:119390] "PRT" "PRT" "GBR" "GBR" ...
 $ market_segment                : chr [1:119390] "Direct" "Direct" "Direct" "Corporate" ...
 $ distribution_channel          : chr [1:119390] "Direct" "Direct" "Direct" "Corporate" ...
 $ is_repeated_guest             : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
 $ previous_cancellations        : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
 $ previous_bookings_not_canceled: num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
 $ reserved_room_type            : chr [1:119390] "C" "C" "A" "A" ...
 $ assigned_room_type            : chr [1:119390] "C" "C" "C" "A" ...
 $ booking_changes               : num [1:119390] 3 4 0 0 0 0 0 0 0 0 ...
 $ deposit_type                  : chr [1:119390] "No Deposit" "No Deposit" "No Deposit" "No Deposit" ...
 $ agent                         : chr [1:119390] "NULL" "NULL" "NULL" "304" ...
 $ company                       : chr [1:119390] "NULL" "NULL" "NULL" "NULL" ...
 $ days_in_waiting_list          : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
 $ customer_type                 : chr [1:119390] "Transient" "Transient" "Transient" "Transient" ...
 $ adr                           : num [1:119390] 0 0 75 75 98 ...
 $ required_car_parking_spaces   : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
 $ total_of_special_requests     : num [1:119390] 0 0 0 0 1 1 0 1 1 0 ...
 $ reservation_status            : chr [1:119390] "Check-Out" "Check-Out" "Check-Out" "Check-Out" ...
 $ reservation_status_date       : Date[1:119390], format: "2015-07-01" "2015-07-01" ...
 - attr(*, "spec")=
  .. cols(
  ..   hotel = col_character(),
  ..   is_canceled = col_double(),
  ..   lead_time = col_double(),
  ..   arrival_date_year = col_double(),
  ..   arrival_date_month = col_character(),
  ..   arrival_date_week_number = col_double(),
  ..   arrival_date_day_of_month = col_double(),
  ..   stays_in_weekend_nights = col_double(),
  ..   stays_in_week_nights = col_double(),
  ..   adults = col_double(),
  ..   children = col_double(),
  ..   babies = col_double(),
  ..   meal = col_character(),
  ..   country = col_character(),
  ..   market_segment = col_character(),
  ..   distribution_channel = col_character(),
  ..   is_repeated_guest = col_double(),
  ..   previous_cancellations = col_double(),
  ..   previous_bookings_not_canceled = col_double(),
  ..   reserved_room_type = col_character(),
  ..   assigned_room_type = col_character(),
  ..   booking_changes = col_double(),
  ..   deposit_type = col_character(),
  ..   agent = col_character(),
  ..   company = col_character(),
  ..   days_in_waiting_list = col_double(),
  ..   customer_type = col_character(),
  ..   adr = col_double(),
  ..   required_car_parking_spaces = col_double(),
  ..   total_of_special_requests = col_double(),
  ..   reservation_status = col_character(),
  ..   reservation_status_date = col_date(format = "")
  .. )
 - attr(*, "problems")=<externalptr> 
Code
unique(bookings$hotel)
[1] "Resort Hotel" "City Hotel"  
Code
unique(bookings$arrival_date_year)
[1] 2015 2016 2017
Code
unique(bookings$customer_type)
[1] "Transient"       "Contract"        "Transient-Party" "Group"          

Briefly describe the data

The dataset contains information about hotel bookings made in Resort Hotel and City Hotel during the years 2015, 2016 and 2017. The dataset contains 119390 rows and 32 columns. Each entry in the dataset contains information about a particular booking like guest information, date of arrival, room information, payment information, show/no-show information, etc.

Tidy Data (as needed)

Is your data already tidy, or is there work to be done? Be sure to anticipate your end result to provide a sanity check, and document your work here.

The data is not tidy and work needs to be done: 1. The date information is split in multiple columns, combining these would make is easier for analysis. Creating a new column “arrival_date” by getting the date from “arrival_date_day_of_month”, “arrival_date_month”,“arrival_date_year” columns and dropping these 3 columns. 2. previous_bookings_not_canceled - this information is redundant and can be calculated from the columns “stays_in_weekend_nights” and “stays_in_week_nights”. So we can drop this column. 3. is_repeated_guest - again, this is redundant information. If atleast one of “stays_in_weekend_nights” and “stays_in_week_nights” is non-zero, then is_repeated_guest would be 1, else 0. So we can remove this column too.

We’ll be dropping 3 + 1 + 1 columns and adding 1 column. So we’ll have 28 columns after tidying.

Code
# checking out the values in columns to better understand the data
unique(bookings$is_repeated_guest)
[1] 0 1
Code
select(bookings, previous_bookings_not_canceled)
# A tibble: 119,390 × 1
   previous_bookings_not_canceled
                            <dbl>
 1                              0
 2                              0
 3                              0
 4                              0
 5                              0
 6                              0
 7                              0
 8                              0
 9                              0
10                              0
# ℹ 119,380 more rows
Code
# combining arrival date information into 1 column
tidy_bookings <- bookings %>%
  mutate(arrival_date = (str_c(arrival_date_day_of_month, arrival_date_month, arrival_date_year, sep = "/")), arrival_date = dmy(arrival_date), .after = lead_time)

# removing all the above mentioned redundant data
tidy_bookings <- tidy_bookings[,-c(5, 6, 8, 14, 16)]
tidy_bookings
# A tibble: 119,390 × 28
   hotel        is_canceled lead_time arrival_date arrival_date_week_number
   <chr>              <dbl>     <dbl> <date>                          <dbl>
 1 Resort Hotel           0       342 2015-07-01                         27
 2 Resort Hotel           0       737 2015-07-01                         27
 3 Resort Hotel           0         7 2015-07-01                         27
 4 Resort Hotel           0        13 2015-07-01                         27
 5 Resort Hotel           0        14 2015-07-01                         27
 6 Resort Hotel           0        14 2015-07-01                         27
 7 Resort Hotel           0         0 2015-07-01                         27
 8 Resort Hotel           0         9 2015-07-01                         27
 9 Resort Hotel           1        85 2015-07-01                         27
10 Resort Hotel           1        75 2015-07-01                         27
# ℹ 119,380 more rows
# ℹ 23 more variables: stays_in_weekend_nights <dbl>,
#   stays_in_week_nights <dbl>, adults <dbl>, children <dbl>, babies <dbl>,
#   country <chr>, distribution_channel <chr>, is_repeated_guest <dbl>,
#   previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>,
#   reserved_room_type <chr>, assigned_room_type <chr>, booking_changes <dbl>,
#   deposit_type <chr>, agent <chr>, company <chr>, …

Any additional comments?

Identify variables that need to be mutated

Are there any variables that require mutation to be usable in your analysis stream? For example, are all time variables correctly coded as dates? Are all string variables reduced and cleaned to sensible categories? Do you need to turn any variables into factors and reorder for ease of graphics and visualization?

Document your work here.

The column “agent” is stored as string. Looking at it, the codes range from 1 to 535. So we can convert these to numeric type, for easier analysis/filtering.

Code
# checking out the values in the column "agent"
unique(bookings$agent)
  [1] "NULL" "304"  "240"  "303"  "15"   "241"  "8"    "250"  "115"  "5"   
 [11] "175"  "134"  "156"  "243"  "242"  "3"    "105"  "40"   "147"  "306" 
 [21] "184"  "96"   "2"    "127"  "95"   "146"  "9"    "177"  "6"    "143" 
 [31] "244"  "149"  "167"  "300"  "171"  "305"  "67"   "196"  "152"  "142" 
 [41] "261"  "104"  "36"   "26"   "29"   "258"  "110"  "71"   "181"  "88"  
 [51] "251"  "275"  "69"   "248"  "208"  "256"  "314"  "126"  "281"  "273" 
 [61] "253"  "185"  "330"  "334"  "328"  "326"  "321"  "324"  "313"  "38"  
 [71] "155"  "68"   "335"  "308"  "332"  "94"   "348"  "310"  "339"  "375" 
 [81] "66"   "327"  "387"  "298"  "91"   "245"  "385"  "257"  "393"  "168" 
 [91] "405"  "249"  "315"  "75"   "128"  "307"  "11"   "436"  "1"    "201" 
[101] "183"  "223"  "368"  "336"  "291"  "464"  "411"  "481"  "10"   "154" 
[111] "468"  "410"  "390"  "440"  "495"  "492"  "493"  "434"  "57"   "531" 
[121] "420"  "483"  "526"  "472"  "429"  "16"   "446"  "34"   "78"   "139" 
[131] "252"  "270"  "47"   "114"  "301"  "193"  "182"  "135"  "350"  "195" 
[141] "352"  "355"  "159"  "363"  "384"  "360"  "331"  "367"  "64"   "406" 
[151] "163"  "414"  "333"  "427"  "431"  "430"  "426"  "438"  "433"  "418" 
[161] "441"  "282"  "432"  "72"   "450"  "180"  "454"  "455"  "59"   "451" 
[171] "254"  "358"  "469"  "165"  "467"  "510"  "337"  "476"  "502"  "527" 
[181] "479"  "508"  "535"  "302"  "497"  "187"  "13"   "7"    "27"   "14"  
[191] "22"   "17"   "28"   "42"   "20"   "19"   "45"   "37"   "61"   "39"  
[201] "21"   "24"   "41"   "50"   "30"   "54"   "52"   "12"   "44"   "31"  
[211] "83"   "32"   "63"   "60"   "55"   "56"   "89"   "87"   "118"  "86"  
[221] "85"   "210"  "214"  "129"  "179"  "138"  "174"  "170"  "153"  "93"  
[231] "151"  "119"  "35"   "173"  "58"   "53"   "133"  "79"   "235"  "192" 
[241] "191"  "236"  "162"  "215"  "157"  "287"  "132"  "234"  "98"   "77"  
[251] "103"  "107"  "262"  "220"  "121"  "205"  "378"  "23"   "296"  "290" 
[261] "229"  "33"   "286"  "276"  "425"  "484"  "323"  "403"  "219"  "394" 
[271] "509"  "111"  "423"  "4"    "70"   "82"   "81"   "74"   "92"   "99"  
[281] "90"   "112"  "117"  "106"  "148"  "158"  "144"  "211"  "213"  "216" 
[291] "232"  "150"  "267"  "227"  "247"  "278"  "280"  "285"  "289"  "269" 
[301] "295"  "265"  "288"  "122"  "294"  "325"  "341"  "344"  "346"  "359" 
[311] "283"  "364"  "370"  "371"  "25"   "141"  "391"  "397"  "416"  "404" 
[321] "299"  "197"  "73"   "354"  "444"  "408"  "461"  "388"  "453"  "459" 
[331] "474"  "475"  "480"  "449" 
Code
# selecting and converting type to numeric (in case of "null" values, it gets set to NA)
agent <- select(bookings, agent)
bookings$agent <- as.numeric(as.character(bookings$agent))

# sanity check
agent_nums <- unique(bookings$agent)
agent_nums[order(agent_nums)]
  [1]   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  19
 [19]  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37
 [37]  38  39  40  41  42  44  45  47  50  52  53  54  55  56  57  58  59  60
 [55]  61  63  64  66  67  68  69  70  71  72  73  74  75  77  78  79  81  82
 [73]  83  85  86  87  88  89  90  91  92  93  94  95  96  98  99 103 104 105
 [91] 106 107 110 111 112 114 115 117 118 119 121 122 126 127 128 129 132 133
[109] 134 135 138 139 141 142 143 144 146 147 148 149 150 151 152 153 154 155
[127] 156 157 158 159 162 163 165 167 168 170 171 173 174 175 177 179 180 181
[145] 182 183 184 185 187 191 192 193 195 196 197 201 205 208 210 211 213 214
[163] 215 216 219 220 223 227 229 232 234 235 236 240 241 242 243 244 245 247
[181] 248 249 250 251 252 253 254 256 257 258 261 262 265 267 269 270 273 275
[199] 276 278 280 281 282 283 285 286 287 288 289 290 291 294 295 296 298 299
[217] 300 301 302 303 304 305 306 307 308 310 313 314 315 321 323 324 325 326
[235] 327 328 330 331 332 333 334 335 336 337 339 341 344 346 348 350 352 354
[253] 355 358 359 360 363 364 367 368 370 371 375 378 384 385 387 388 390 391
[271] 393 394 397 403 404 405 406 408 410 411 414 416 418 420 423 425 426 427
[289] 429 430 431 432 433 434 436 438 440 441 444 446 449 450 451 453 454 455
[307] 459 461 464 467 468 469 472 474 475 476 479 480 481 483 484 492 493 495
[325] 497 502 508 509 510 526 527 531 535  NA

Any additional comments?