Thrishul Challenge 3

challenge_2
railroads
faostat
hotel_bookings
Author

Meredith Rolfe

Published

August 16, 2022

Code
library(tidyverse)

knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)

Challenge Overview

Today’s challenge is to

  1. read in a data set, and describe the data using both words and any supporting information (e.g., tables, etc)
  2. provide summary statistics for different interesting groups within the data, and interpret those statistics

Read in the Data

Read in one (or more) of the following data sets, available in the posts/_data folder, using the correct R package and command.

  • railroad*.csv or StateCounty2012.xls ⭐
  • FAOstat*.csv or birds.csv ⭐⭐⭐
  • hotel_bookings.csv ⭐⭐⭐⭐
Code
df<- read.csv(file = "_data/hotel_bookings.csv")

Add any comments or documentation as needed. More challenging data may require additional code chunks and documentation.

Describe the data

Using a combination of words and results of R commands, can you provide a high level description of the data? Describe as efficiently as possible where/how the data was (likely) gathered, indicate the cases and variables (both the interpretation and any details you deem useful to the reader to fully understand your chosen data).

Code
str(data)
function (..., list = character(), package = NULL, lib.loc = NULL, verbose = getOption("verbose"), 
    envir = .GlobalEnv, overwrite = TRUE)  
Code
num_observations <- nrow(df)
num_variables <- ncol(df)
cat("Dataset Size: ", num_observations, " observations, ", num_variables, " variables\n")
Dataset Size:  119390  observations,  32  variables
Code
# histogram of is_canceled
hist(df$is_canceled)

Code
#scatterplot of stays_in_weekend_nights, stays_in_week_nights
plot(df$stays_in_weekend_nights, df$stays_in_week_nights)

Code
# boxplot of arrival_date_week_number
boxplot(df$arrival_date_week_number, )
Error in boxplot.default(df$arrival_date_week_number, ): argument is missing, with no default
Code
summary(data)
Error in object[[i]]: object of type 'closure' is not subsettable

Provide Grouped Summary Statistics

Conduct some exploratory data analysis, using dplyr commands such as group_by(), select(), filter(), and summarise(). Find the central tendency (mean, median, mode) and dispersion (standard deviation, mix/max/quantile) for different subgroups within the data set.

Code
# Load necessary libraries
library(dplyr)
library(ggplot2)

# Read data
hotel_data <- df
Code
data <- select(df,-where(is.character))
cor(data)
                                is_canceled     lead_time arrival_date_year
is_canceled                     1.000000000  0.2931233558      0.0166598602
lead_time                       0.293123356  1.0000000000      0.0401420998
arrival_date_year               0.016659860  0.0401420998      1.0000000000
arrival_date_week_number        0.008148065  0.1268708127     -0.5405613308
arrival_date_day_of_month      -0.006130079  0.0022675527     -0.0002210192
stays_in_weekend_nights        -0.001791078  0.0856711329      0.0214973654
stays_in_week_nights            0.024764629  0.1657993639      0.0308832973
adults                          0.060017213  0.1195186926      0.0296351442
children                                 NA            NA                NA
babies                         -0.032491089 -0.0209150163     -0.0131920747
is_repeated_guest              -0.084793418 -0.1244099080      0.0103413169
previous_cancellations          0.110132808  0.0860418019     -0.1198220705
previous_bookings_not_canceled -0.057357723 -0.0735481679      0.0292180512
booking_changes                -0.144380991  0.0001488301      0.0308723496
days_in_waiting_list            0.054185824  0.1700841843     -0.0564966020
adr                             0.047556598 -0.0630768525      0.1975800885
required_car_parking_spaces    -0.195497817 -0.1164505701     -0.0136844109
total_of_special_requests      -0.234657774 -0.0957120489      0.1085314863
                               arrival_date_week_number
is_canceled                                 0.008148065
lead_time                                   0.126870813
arrival_date_year                          -0.540561331
arrival_date_week_number                    1.000000000
arrival_date_day_of_month                   0.066809253
stays_in_weekend_nights                     0.018207653
stays_in_week_nights                        0.015558297
adults                                      0.025909057
children                                             NA
babies                                      0.010395480
is_repeated_guest                          -0.030130758
previous_cancellations                      0.035500909
previous_bookings_not_canceled             -0.020903552
booking_changes                             0.005507503
days_in_waiting_list                        0.022933027
adr                                         0.075790643
required_car_parking_spaces                 0.001920423
total_of_special_requests                   0.026149364
                               arrival_date_day_of_month
is_canceled                                -0.0061300789
lead_time                                   0.0022675527
arrival_date_year                          -0.0002210192
arrival_date_week_number                    0.0668092530
arrival_date_day_of_month                   1.0000000000
stays_in_weekend_nights                    -0.0163542995
stays_in_week_nights                       -0.0281735214
adults                                     -0.0015659791
children                                              NA
babies                                     -0.0002303647
is_repeated_guest                          -0.0061450207
previous_cancellations                     -0.0270107761
previous_bookings_not_canceled             -0.0002997868
booking_changes                             0.0106128560
days_in_waiting_list                        0.0227275352
adr                                         0.0302451948
required_car_parking_spaces                 0.0086834665
total_of_special_requests                   0.0030621241
                               stays_in_weekend_nights stays_in_week_nights
is_canceled                               -0.001791078           0.02476463
lead_time                                  0.085671133           0.16579936
arrival_date_year                          0.021497365           0.03088330
arrival_date_week_number                   0.018207653           0.01555830
arrival_date_day_of_month                 -0.016354300          -0.02817352
stays_in_weekend_nights                    1.000000000           0.49896882
stays_in_week_nights                       0.498968818           1.00000000
adults                                     0.091871020           0.09297551
children                                            NA                   NA
babies                                     0.018482810           0.02019097
is_repeated_guest                         -0.087239379          -0.09724497
previous_cancellations                    -0.012774619          -0.01399243
previous_bookings_not_canceled            -0.042715235          -0.04874255
booking_changes                            0.063281316           0.09620945
days_in_waiting_list                      -0.054151113          -0.00201981
adr                                        0.049341906           0.06523748
required_car_parking_spaces               -0.018553809          -0.02485942
total_of_special_requests                  0.072670830           0.06819178
                                     adults children        babies
is_canceled                     0.060017213       NA -0.0324910892
lead_time                       0.119518693       NA -0.0209150163
arrival_date_year               0.029635144       NA -0.0131920747
arrival_date_week_number        0.025909057       NA  0.0103954801
arrival_date_day_of_month      -0.001565979       NA -0.0002303647
stays_in_weekend_nights         0.091871020       NA  0.0184828105
stays_in_week_nights            0.092975513       NA  0.0201909744
adults                          1.000000000       NA  0.0181456419
children                                 NA        1            NA
babies                          0.018145642       NA  1.0000000000
is_repeated_guest              -0.146426116       NA -0.0089426341
previous_cancellations         -0.006738096       NA -0.0075009985
previous_bookings_not_canceled -0.107983172       NA -0.0065504543
booking_changes                -0.051672774       NA  0.0834397814
days_in_waiting_list           -0.008283347       NA -0.0106205427
adr                             0.230641216       NA  0.0291856903
required_car_parking_spaces     0.014784817       NA  0.0373833559
total_of_special_requests       0.122883546       NA  0.0978887921
                               is_repeated_guest previous_cancellations
is_canceled                         -0.084793418            0.110132808
lead_time                           -0.124409908            0.086041802
arrival_date_year                    0.010341317           -0.119822070
arrival_date_week_number            -0.030130758            0.035500909
arrival_date_day_of_month           -0.006145021           -0.027010776
stays_in_weekend_nights             -0.087239379           -0.012774619
stays_in_week_nights                -0.097244972           -0.013992431
adults                              -0.146426116           -0.006738096
children                                      NA                     NA
babies                              -0.008942634           -0.007500998
is_repeated_guest                    1.000000000            0.082293234
previous_cancellations               0.082293234            1.000000000
previous_bookings_not_canceled       0.418055995            0.152728115
booking_changes                      0.012091787           -0.026992663
days_in_waiting_list                -0.022234965            0.005928941
adr                                 -0.134314447           -0.065645638
required_car_parking_spaces          0.077089573           -0.018492250
total_of_special_requests            0.013050009           -0.048384118
                               previous_bookings_not_canceled booking_changes
is_canceled                                     -0.0573577232   -0.1443809911
lead_time                                       -0.0735481679    0.0001488301
arrival_date_year                                0.0292180512    0.0308723496
arrival_date_week_number                        -0.0209035517    0.0055075035
arrival_date_day_of_month                       -0.0002997868    0.0106128560
stays_in_weekend_nights                         -0.0427152350    0.0632813159
stays_in_week_nights                            -0.0487425495    0.0962094460
adults                                          -0.1079831725   -0.0516727735
children                                                   NA              NA
babies                                          -0.0065504543    0.0834397814
is_repeated_guest                                0.4180559949    0.0120917873
previous_cancellations                           0.1527281149   -0.0269926626
previous_bookings_not_canceled                   1.0000000000    0.0116075289
booking_changes                                  0.0116075289    1.0000000000
days_in_waiting_list                            -0.0093969779   -0.0116339446
adr                                             -0.0721441957    0.0196176738
required_car_parking_spaces                      0.0476530869    0.0656201914
total_of_special_requests                        0.0378237757    0.0528334357
                               days_in_waiting_list         adr
is_canceled                             0.054185824  0.04755660
lead_time                               0.170084184 -0.06307685
arrival_date_year                      -0.056496602  0.19758009
arrival_date_week_number                0.022933027  0.07579064
arrival_date_day_of_month               0.022727535  0.03024519
stays_in_weekend_nights                -0.054151113  0.04934191
stays_in_week_nights                   -0.002019810  0.06523748
adults                                 -0.008283347  0.23064122
children                                         NA          NA
babies                                 -0.010620543  0.02918569
is_repeated_guest                      -0.022234965 -0.13431445
previous_cancellations                  0.005928941 -0.06564564
previous_bookings_not_canceled         -0.009396978 -0.07214420
booking_changes                        -0.011633945  0.01961767
days_in_waiting_list                    1.000000000 -0.04075641
adr                                    -0.040756412  1.00000000
required_car_parking_spaces            -0.030600046  0.05662809
total_of_special_requests              -0.082729719  0.17218526
                               required_car_parking_spaces
is_canceled                                   -0.195497817
lead_time                                     -0.116450570
arrival_date_year                             -0.013684411
arrival_date_week_number                       0.001920423
arrival_date_day_of_month                      0.008683466
stays_in_weekend_nights                       -0.018553809
stays_in_week_nights                          -0.024859423
adults                                         0.014784817
children                                                NA
babies                                         0.037383356
is_repeated_guest                              0.077089573
previous_cancellations                        -0.018492250
previous_bookings_not_canceled                 0.047653087
booking_changes                                0.065620191
days_in_waiting_list                          -0.030600046
adr                                            0.056628092
required_car_parking_spaces                    1.000000000
total_of_special_requests                      0.082626338
                               total_of_special_requests
is_canceled                                 -0.234657774
lead_time                                   -0.095712049
arrival_date_year                            0.108531486
arrival_date_week_number                     0.026149364
arrival_date_day_of_month                    0.003062124
stays_in_weekend_nights                      0.072670830
stays_in_week_nights                         0.068191782
adults                                       0.122883546
children                                              NA
babies                                       0.097888792
is_repeated_guest                            0.013050009
previous_cancellations                      -0.048384118
previous_bookings_not_canceled               0.037823776
booking_changes                              0.052833436
days_in_waiting_list                        -0.082729719
adr                                          0.172185264
required_car_parking_spaces                  0.082626338
total_of_special_requests                    1.000000000

Group by City hotel and mean, median, and mode of lead_time

Code
# Group by hotel type and calculate mean, median, and mode of lead_time
hotel_lead_time_city <- hotel_data %>%
  group_by(hotel = 'City Hotel') %>%
  summarise(
    mean_lead_time = mean(lead_time),
    median_lead_time = median(lead_time),
    mode_lead_time = lead_time[which.max(table(lead_time))]
  )
print(hotel_lead_time_city)
# A tibble: 1 × 4
  hotel      mean_lead_time median_lead_time mode_lead_time
  <chr>               <dbl>            <dbl>          <int>
1 City Hotel           104.               69            342

Group by Resort hotel and mean, median, and mode of lead_time

Code
# Group by hotel type and calculate mean, median, and mode of lead_time
hotel_lead_time_resort <- hotel_data %>%
  group_by(hotel = 'Resort Hotel') %>%
  summarise(
    mean_lead_time = mean(lead_time),
    median_lead_time = median(lead_time),
    mode_lead_time = lead_time[which.max(table(lead_time))]
  )
print(hotel_lead_time_resort)
# A tibble: 1 × 4
  hotel        mean_lead_time median_lead_time mode_lead_time
  <chr>                 <dbl>            <dbl>          <int>
1 Resort Hotel           104.               69            342

only reservations from Portugal and standard deviation of adr

Code
# Filter for only reservations from Portugal and calculate standard deviation of adr
portugal_adr_sd <- hotel_data %>%
  filter(country == "PRT") %>%
  summarise(adr_sd = sd(adr))
print(portugal_adr_sd)
    adr_sd
1 53.69684

min, max, and 25th, 50th, and 75th percentiles of total_of_special_requests

Code
# Calculate min, max, and 25th, 50th, and 75th percentiles of total_of_special_requests
special_req_summary <- hotel_data %>%
  summarise(
    min_special_req = min(total_of_special_requests),
    max_special_req = max(total_of_special_requests),
    q25_special_req = quantile(total_of_special_requests, 0.25),
    q50_special_req = quantile(total_of_special_requests, 0.5),
    q75_special_req = quantile(total_of_special_requests, 0.75))
print(special_req_summary)
  min_special_req max_special_req q25_special_req q50_special_req
1               0               5               0               0
  q75_special_req
1               1