Code
library(tidyverse)
::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE) knitr
Meredith Rolfe
August 16, 2022
Today’s challenge is to
Read in one (or more) of the following data sets, available in the posts/_data
folder, using the correct R package and command.
Add any comments or documentation as needed. More challenging data may require additional code chunks and documentation.
Using a combination of words and results of R commands, can you provide a high level description of the data? Describe as efficiently as possible where/how the data was (likely) gathered, indicate the cases and variables (both the interpretation and any details you deem useful to the reader to fully understand your chosen data).
function (..., list = character(), package = NULL, lib.loc = NULL, verbose = getOption("verbose"),
envir = .GlobalEnv, overwrite = TRUE)
Dataset Size: 119390 observations, 32 variables
Conduct some exploratory data analysis, using dplyr commands such as group_by()
, select()
, filter()
, and summarise()
. Find the central tendency (mean, median, mode) and dispersion (standard deviation, mix/max/quantile) for different subgroups within the data set.
is_canceled lead_time arrival_date_year
is_canceled 1.000000000 0.2931233558 0.0166598602
lead_time 0.293123356 1.0000000000 0.0401420998
arrival_date_year 0.016659860 0.0401420998 1.0000000000
arrival_date_week_number 0.008148065 0.1268708127 -0.5405613308
arrival_date_day_of_month -0.006130079 0.0022675527 -0.0002210192
stays_in_weekend_nights -0.001791078 0.0856711329 0.0214973654
stays_in_week_nights 0.024764629 0.1657993639 0.0308832973
adults 0.060017213 0.1195186926 0.0296351442
children NA NA NA
babies -0.032491089 -0.0209150163 -0.0131920747
is_repeated_guest -0.084793418 -0.1244099080 0.0103413169
previous_cancellations 0.110132808 0.0860418019 -0.1198220705
previous_bookings_not_canceled -0.057357723 -0.0735481679 0.0292180512
booking_changes -0.144380991 0.0001488301 0.0308723496
days_in_waiting_list 0.054185824 0.1700841843 -0.0564966020
adr 0.047556598 -0.0630768525 0.1975800885
required_car_parking_spaces -0.195497817 -0.1164505701 -0.0136844109
total_of_special_requests -0.234657774 -0.0957120489 0.1085314863
arrival_date_week_number
is_canceled 0.008148065
lead_time 0.126870813
arrival_date_year -0.540561331
arrival_date_week_number 1.000000000
arrival_date_day_of_month 0.066809253
stays_in_weekend_nights 0.018207653
stays_in_week_nights 0.015558297
adults 0.025909057
children NA
babies 0.010395480
is_repeated_guest -0.030130758
previous_cancellations 0.035500909
previous_bookings_not_canceled -0.020903552
booking_changes 0.005507503
days_in_waiting_list 0.022933027
adr 0.075790643
required_car_parking_spaces 0.001920423
total_of_special_requests 0.026149364
arrival_date_day_of_month
is_canceled -0.0061300789
lead_time 0.0022675527
arrival_date_year -0.0002210192
arrival_date_week_number 0.0668092530
arrival_date_day_of_month 1.0000000000
stays_in_weekend_nights -0.0163542995
stays_in_week_nights -0.0281735214
adults -0.0015659791
children NA
babies -0.0002303647
is_repeated_guest -0.0061450207
previous_cancellations -0.0270107761
previous_bookings_not_canceled -0.0002997868
booking_changes 0.0106128560
days_in_waiting_list 0.0227275352
adr 0.0302451948
required_car_parking_spaces 0.0086834665
total_of_special_requests 0.0030621241
stays_in_weekend_nights stays_in_week_nights
is_canceled -0.001791078 0.02476463
lead_time 0.085671133 0.16579936
arrival_date_year 0.021497365 0.03088330
arrival_date_week_number 0.018207653 0.01555830
arrival_date_day_of_month -0.016354300 -0.02817352
stays_in_weekend_nights 1.000000000 0.49896882
stays_in_week_nights 0.498968818 1.00000000
adults 0.091871020 0.09297551
children NA NA
babies 0.018482810 0.02019097
is_repeated_guest -0.087239379 -0.09724497
previous_cancellations -0.012774619 -0.01399243
previous_bookings_not_canceled -0.042715235 -0.04874255
booking_changes 0.063281316 0.09620945
days_in_waiting_list -0.054151113 -0.00201981
adr 0.049341906 0.06523748
required_car_parking_spaces -0.018553809 -0.02485942
total_of_special_requests 0.072670830 0.06819178
adults children babies
is_canceled 0.060017213 NA -0.0324910892
lead_time 0.119518693 NA -0.0209150163
arrival_date_year 0.029635144 NA -0.0131920747
arrival_date_week_number 0.025909057 NA 0.0103954801
arrival_date_day_of_month -0.001565979 NA -0.0002303647
stays_in_weekend_nights 0.091871020 NA 0.0184828105
stays_in_week_nights 0.092975513 NA 0.0201909744
adults 1.000000000 NA 0.0181456419
children NA 1 NA
babies 0.018145642 NA 1.0000000000
is_repeated_guest -0.146426116 NA -0.0089426341
previous_cancellations -0.006738096 NA -0.0075009985
previous_bookings_not_canceled -0.107983172 NA -0.0065504543
booking_changes -0.051672774 NA 0.0834397814
days_in_waiting_list -0.008283347 NA -0.0106205427
adr 0.230641216 NA 0.0291856903
required_car_parking_spaces 0.014784817 NA 0.0373833559
total_of_special_requests 0.122883546 NA 0.0978887921
is_repeated_guest previous_cancellations
is_canceled -0.084793418 0.110132808
lead_time -0.124409908 0.086041802
arrival_date_year 0.010341317 -0.119822070
arrival_date_week_number -0.030130758 0.035500909
arrival_date_day_of_month -0.006145021 -0.027010776
stays_in_weekend_nights -0.087239379 -0.012774619
stays_in_week_nights -0.097244972 -0.013992431
adults -0.146426116 -0.006738096
children NA NA
babies -0.008942634 -0.007500998
is_repeated_guest 1.000000000 0.082293234
previous_cancellations 0.082293234 1.000000000
previous_bookings_not_canceled 0.418055995 0.152728115
booking_changes 0.012091787 -0.026992663
days_in_waiting_list -0.022234965 0.005928941
adr -0.134314447 -0.065645638
required_car_parking_spaces 0.077089573 -0.018492250
total_of_special_requests 0.013050009 -0.048384118
previous_bookings_not_canceled booking_changes
is_canceled -0.0573577232 -0.1443809911
lead_time -0.0735481679 0.0001488301
arrival_date_year 0.0292180512 0.0308723496
arrival_date_week_number -0.0209035517 0.0055075035
arrival_date_day_of_month -0.0002997868 0.0106128560
stays_in_weekend_nights -0.0427152350 0.0632813159
stays_in_week_nights -0.0487425495 0.0962094460
adults -0.1079831725 -0.0516727735
children NA NA
babies -0.0065504543 0.0834397814
is_repeated_guest 0.4180559949 0.0120917873
previous_cancellations 0.1527281149 -0.0269926626
previous_bookings_not_canceled 1.0000000000 0.0116075289
booking_changes 0.0116075289 1.0000000000
days_in_waiting_list -0.0093969779 -0.0116339446
adr -0.0721441957 0.0196176738
required_car_parking_spaces 0.0476530869 0.0656201914
total_of_special_requests 0.0378237757 0.0528334357
days_in_waiting_list adr
is_canceled 0.054185824 0.04755660
lead_time 0.170084184 -0.06307685
arrival_date_year -0.056496602 0.19758009
arrival_date_week_number 0.022933027 0.07579064
arrival_date_day_of_month 0.022727535 0.03024519
stays_in_weekend_nights -0.054151113 0.04934191
stays_in_week_nights -0.002019810 0.06523748
adults -0.008283347 0.23064122
children NA NA
babies -0.010620543 0.02918569
is_repeated_guest -0.022234965 -0.13431445
previous_cancellations 0.005928941 -0.06564564
previous_bookings_not_canceled -0.009396978 -0.07214420
booking_changes -0.011633945 0.01961767
days_in_waiting_list 1.000000000 -0.04075641
adr -0.040756412 1.00000000
required_car_parking_spaces -0.030600046 0.05662809
total_of_special_requests -0.082729719 0.17218526
required_car_parking_spaces
is_canceled -0.195497817
lead_time -0.116450570
arrival_date_year -0.013684411
arrival_date_week_number 0.001920423
arrival_date_day_of_month 0.008683466
stays_in_weekend_nights -0.018553809
stays_in_week_nights -0.024859423
adults 0.014784817
children NA
babies 0.037383356
is_repeated_guest 0.077089573
previous_cancellations -0.018492250
previous_bookings_not_canceled 0.047653087
booking_changes 0.065620191
days_in_waiting_list -0.030600046
adr 0.056628092
required_car_parking_spaces 1.000000000
total_of_special_requests 0.082626338
total_of_special_requests
is_canceled -0.234657774
lead_time -0.095712049
arrival_date_year 0.108531486
arrival_date_week_number 0.026149364
arrival_date_day_of_month 0.003062124
stays_in_weekend_nights 0.072670830
stays_in_week_nights 0.068191782
adults 0.122883546
children NA
babies 0.097888792
is_repeated_guest 0.013050009
previous_cancellations -0.048384118
previous_bookings_not_canceled 0.037823776
booking_changes 0.052833436
days_in_waiting_list -0.082729719
adr 0.172185264
required_car_parking_spaces 0.082626338
total_of_special_requests 1.000000000
# Group by hotel type and calculate mean, median, and mode of lead_time
hotel_lead_time_city <- hotel_data %>%
group_by(hotel = 'City Hotel') %>%
summarise(
mean_lead_time = mean(lead_time),
median_lead_time = median(lead_time),
mode_lead_time = lead_time[which.max(table(lead_time))]
)
print(hotel_lead_time_city)
# A tibble: 1 × 4
hotel mean_lead_time median_lead_time mode_lead_time
<chr> <dbl> <dbl> <int>
1 City Hotel 104. 69 342
# Group by hotel type and calculate mean, median, and mode of lead_time
hotel_lead_time_resort <- hotel_data %>%
group_by(hotel = 'Resort Hotel') %>%
summarise(
mean_lead_time = mean(lead_time),
median_lead_time = median(lead_time),
mode_lead_time = lead_time[which.max(table(lead_time))]
)
print(hotel_lead_time_resort)
# A tibble: 1 × 4
hotel mean_lead_time median_lead_time mode_lead_time
<chr> <dbl> <dbl> <int>
1 Resort Hotel 104. 69 342
# Calculate min, max, and 25th, 50th, and 75th percentiles of total_of_special_requests
special_req_summary <- hotel_data %>%
summarise(
min_special_req = min(total_of_special_requests),
max_special_req = max(total_of_special_requests),
q25_special_req = quantile(total_of_special_requests, 0.25),
q50_special_req = quantile(total_of_special_requests, 0.5),
q75_special_req = quantile(total_of_special_requests, 0.75))
print(special_req_summary)
min_special_req max_special_req q25_special_req q50_special_req
1 0 5 0 0
q75_special_req
1 1
---
title: "Thrishul Challenge 2"
author: "Meredith Rolfe"
desription: "Data wrangling: using group() and summarise()"
date: "08/16/2022"
format:
html:
toc: true
code-fold: true
code-copy: true
code-tools: true
categories:
- challenge_2
- railroads
- faostat
- hotel_bookings
---
```{r}
#| label: setup
#| warning: false
#| message: false
library(tidyverse)
knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)
```
## Challenge Overview
Today's challenge is to
1) read in a data set, and describe the data using both words and any supporting information (e.g., tables, etc)
2) provide summary statistics for different interesting groups within the data, and interpret those statistics
## Read in the Data
Read in one (or more) of the following data sets, available in the `posts/_data` folder, using the correct R package and command.
- railroad\*.csv or StateCounty2012.xls ⭐
- FAOstat\*.csv or birds.csv ⭐⭐⭐
- hotel_bookings.csv ⭐⭐⭐⭐
```{r}
df<- read.csv(file = "_data/hotel_bookings.csv")
```
Add any comments or documentation as needed. More challenging data may require additional code chunks and documentation.
## Describe the data
Using a combination of words and results of R commands, can you provide a high level description of the data? Describe as efficiently as possible where/how the data was (likely) gathered, indicate the cases and variables (both the interpretation and any details you deem useful to the reader to fully understand your chosen data).
```{r}
#| label: summary
str(data)
num_observations <- nrow(df)
num_variables <- ncol(df)
cat("Dataset Size: ", num_observations, " observations, ", num_variables, " variables\n")
# histogram of is_canceled
hist(df$is_canceled)
#scatterplot of stays_in_weekend_nights, stays_in_week_nights
plot(df$stays_in_weekend_nights, df$stays_in_week_nights)
# boxplot of arrival_date_week_number
boxplot(df$arrival_date_week_number)
```
## Provide Grouped Summary Statistics
Conduct some exploratory data analysis, using dplyr commands such as `group_by()`, `select()`, `filter()`, and `summarise()`. Find the central tendency (mean, median, mode) and dispersion (standard deviation, mix/max/quantile) for different subgroups within the data set.
```{r}
# Load necessary libraries
library(dplyr)
library(ggplot2)
# Read data
hotel_data <- df
```
```{r}
data <- select(df,-where(is.character))
cor(data)
```
### Group by City hotel and mean, median, and mode of lead_time
```{r}
# Group by hotel type and calculate mean, median, and mode of lead_time
hotel_lead_time_city <- hotel_data %>%
group_by(hotel = 'City Hotel') %>%
summarise(
mean_lead_time = mean(lead_time),
median_lead_time = median(lead_time),
mode_lead_time = lead_time[which.max(table(lead_time))]
)
print(hotel_lead_time_city)
```
### Group by Resort hotel and mean, median, and mode of lead_time
```{r}
# Group by hotel type and calculate mean, median, and mode of lead_time
hotel_lead_time_resort <- hotel_data %>%
group_by(hotel = 'Resort Hotel') %>%
summarise(
mean_lead_time = mean(lead_time),
median_lead_time = median(lead_time),
mode_lead_time = lead_time[which.max(table(lead_time))]
)
print(hotel_lead_time_resort)
```
### only reservations from Portugal and standard deviation of adr
```{r}
# Filter for only reservations from Portugal and calculate standard deviation of adr
portugal_adr_sd <- hotel_data %>%
filter(country == "PRT") %>%
summarise(adr_sd = sd(adr))
print(portugal_adr_sd)
```
### min, max, and 25th, 50th, and 75th percentiles of total_of_special_requests
```{r}
# Calculate min, max, and 25th, 50th, and 75th percentiles of total_of_special_requests
special_req_summary <- hotel_data %>%
summarise(
min_special_req = min(total_of_special_requests),
max_special_req = max(total_of_special_requests),
q25_special_req = quantile(total_of_special_requests, 0.25),
q50_special_req = quantile(total_of_special_requests, 0.5),
q75_special_req = quantile(total_of_special_requests, 0.75))
print(special_req_summary)
```