Code
library(tidyverse)
library(lubridate)
knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)
FNU Avinesh Krishnan
May 15, 2022
Today’s challenge is to:
Read in one (or more) of the following datasets, using the correct R package and command.
# A tibble: 6 × 32
hotel is_ca…¹ lead_…² arriv…³ arriv…⁴ arriv…⁵ arriv…⁶ stays…⁷ stays…⁸ adults
<chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Resort… 0 342 2015 July 27 1 0 0 2
2 Resort… 0 737 2015 July 27 1 0 0 2
3 Resort… 0 7 2015 July 27 1 0 1 1
4 Resort… 0 13 2015 July 27 1 0 1 1
5 Resort… 0 14 2015 July 27 1 0 2 2
6 Resort… 0 14 2015 July 27 1 0 2 2
# … with 22 more variables: children <dbl>, babies <dbl>, meal <chr>,
# country <chr>, market_segment <chr>, distribution_channel <chr>,
# is_repeated_guest <dbl>, previous_cancellations <dbl>,
# previous_bookings_not_canceled <dbl>, reserved_room_type <chr>,
# assigned_room_type <chr>, booking_changes <dbl>, deposit_type <chr>,
# agent <chr>, company <chr>, days_in_waiting_list <dbl>,
# customer_type <chr>, adr <dbl>, required_car_parking_spaces <dbl>, …
spc_tbl_ [119,390 × 32] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ hotel : chr [1:119390] "Resort Hotel" "Resort Hotel" "Resort Hotel" "Resort Hotel" ...
$ is_canceled : num [1:119390] 0 0 0 0 0 0 0 0 1 1 ...
$ lead_time : num [1:119390] 342 737 7 13 14 14 0 9 85 75 ...
$ arrival_date_year : num [1:119390] 2015 2015 2015 2015 2015 ...
$ arrival_date_month : chr [1:119390] "July" "July" "July" "July" ...
$ arrival_date_week_number : num [1:119390] 27 27 27 27 27 27 27 27 27 27 ...
$ arrival_date_day_of_month : num [1:119390] 1 1 1 1 1 1 1 1 1 1 ...
$ stays_in_weekend_nights : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
$ stays_in_week_nights : num [1:119390] 0 0 1 1 2 2 2 2 3 3 ...
$ adults : num [1:119390] 2 2 1 1 2 2 2 2 2 2 ...
$ children : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
$ babies : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
$ meal : chr [1:119390] "BB" "BB" "BB" "BB" ...
$ country : chr [1:119390] "PRT" "PRT" "GBR" "GBR" ...
$ market_segment : chr [1:119390] "Direct" "Direct" "Direct" "Corporate" ...
$ distribution_channel : chr [1:119390] "Direct" "Direct" "Direct" "Corporate" ...
$ is_repeated_guest : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
$ previous_cancellations : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
$ previous_bookings_not_canceled: num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
$ reserved_room_type : chr [1:119390] "C" "C" "A" "A" ...
$ assigned_room_type : chr [1:119390] "C" "C" "C" "A" ...
$ booking_changes : num [1:119390] 3 4 0 0 0 0 0 0 0 0 ...
$ deposit_type : chr [1:119390] "No Deposit" "No Deposit" "No Deposit" "No Deposit" ...
$ agent : chr [1:119390] "NULL" "NULL" "NULL" "304" ...
$ company : chr [1:119390] "NULL" "NULL" "NULL" "NULL" ...
$ days_in_waiting_list : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
$ customer_type : chr [1:119390] "Transient" "Transient" "Transient" "Transient" ...
$ adr : num [1:119390] 0 0 75 75 98 ...
$ required_car_parking_spaces : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
$ total_of_special_requests : num [1:119390] 0 0 0 0 1 1 0 1 1 0 ...
$ reservation_status : chr [1:119390] "Check-Out" "Check-Out" "Check-Out" "Check-Out" ...
$ reservation_status_date : Date[1:119390], format: "2015-07-01" "2015-07-01" ...
- attr(*, "spec")=
.. cols(
.. hotel = col_character(),
.. is_canceled = col_double(),
.. lead_time = col_double(),
.. arrival_date_year = col_double(),
.. arrival_date_month = col_character(),
.. arrival_date_week_number = col_double(),
.. arrival_date_day_of_month = col_double(),
.. stays_in_weekend_nights = col_double(),
.. stays_in_week_nights = col_double(),
.. adults = col_double(),
.. children = col_double(),
.. babies = col_double(),
.. meal = col_character(),
.. country = col_character(),
.. market_segment = col_character(),
.. distribution_channel = col_character(),
.. is_repeated_guest = col_double(),
.. previous_cancellations = col_double(),
.. previous_bookings_not_canceled = col_double(),
.. reserved_room_type = col_character(),
.. assigned_room_type = col_character(),
.. booking_changes = col_double(),
.. deposit_type = col_character(),
.. agent = col_character(),
.. company = col_character(),
.. days_in_waiting_list = col_double(),
.. customer_type = col_character(),
.. adr = col_double(),
.. required_car_parking_spaces = col_double(),
.. total_of_special_requests = col_double(),
.. reservation_status = col_character(),
.. reservation_status_date = col_date(format = "")
.. )
- attr(*, "problems")=<externalptr>
[1] "hotel" "is_canceled"
[3] "lead_time" "arrival_date_year"
[5] "arrival_date_month" "arrival_date_week_number"
[7] "arrival_date_day_of_month" "stays_in_weekend_nights"
[9] "stays_in_week_nights" "adults"
[11] "children" "babies"
[13] "meal" "country"
[15] "market_segment" "distribution_channel"
[17] "is_repeated_guest" "previous_cancellations"
[19] "previous_bookings_not_canceled" "reserved_room_type"
[21] "assigned_room_type" "booking_changes"
[23] "deposit_type" "agent"
[25] "company" "days_in_waiting_list"
[27] "customer_type" "adr"
[29] "required_car_parking_spaces" "total_of_special_requests"
[31] "reservation_status" "reservation_status_date"
Is your data already tidy, or is there work to be done? Be sure to anticipate your end result to provide a sanity check, and document your work here.
Any additional comments?
Are there any variables that require mutation to be usable in your analysis stream? For example, are all time variables correctly coded as dates? Are all string variables reduced and cleaned to sensible categories? Do you need to turn any variables into factors and reorder for ease of graphics and visualization?
Document your work here.
# Build a single arrival date from the separate year / month-name / day
# columns, then drop the pieces (and the week number) that the new Date
# column makes redundant.
mutated_data <- hotelbookdata %>%
  mutate(
    # month.name is base R's vector of English month names, so match()
    # returns the month number (1-12), or NA for an unrecognised name.
    month = match(arrival_date_month, month.name),
    date_of_arrival = make_date(
      arrival_date_year, month, arrival_date_day_of_month
    )
  ) %>%
  select(
    -c(
      arrival_date_year, arrival_date_month, arrival_date_day_of_month,
      month, arrival_date_week_number
    )
  )
mutated_data
# A tibble: 119,390 × 29
hotel is_ca…¹ lead_…² stays…³ stays…⁴ adults child…⁵ babies meal country
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
1 Resort H… 0 342 0 0 2 0 0 BB PRT
2 Resort H… 0 737 0 0 2 0 0 BB PRT
3 Resort H… 0 7 0 1 1 0 0 BB GBR
4 Resort H… 0 13 0 1 1 0 0 BB GBR
5 Resort H… 0 14 0 2 2 0 0 BB GBR
6 Resort H… 0 14 0 2 2 0 0 BB GBR
7 Resort H… 0 0 0 2 2 0 0 BB PRT
8 Resort H… 0 9 0 2 2 0 0 FB PRT
9 Resort H… 1 85 0 3 2 0 0 BB PRT
10 Resort H… 1 75 0 3 2 0 0 HB PRT
# … with 119,380 more rows, 19 more variables: market_segment <chr>,
# distribution_channel <chr>, is_repeated_guest <dbl>,
# previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>,
# reserved_room_type <chr>, assigned_room_type <chr>, booking_changes <dbl>,
# deposit_type <chr>, agent <chr>, company <chr>, days_in_waiting_list <dbl>,
# customer_type <chr>, adr <dbl>, required_car_parking_spaces <dbl>,
# total_of_special_requests <dbl>, reservation_status <chr>, …
After eliminating four of the initial date columns (year, month, day, and week) and introducing a new column to represent the complete expected arrival date, the total number of columns (variables) in our dataset is 29.
Any additional comments?
---
title: "Challenge 4"
author: "FNU Avinesh Krishnan"
description: "More data wrangling: pivoting"
date: "05/15/2022"
format:
html:
toc: true
code-fold: true
code-copy: true
code-tools: true
categories:
- challenge_4
- FNU Avinesh Krishnan
- HOTEL
---
```{r}
#| label: setup
#| warning: false
#| message: false
# Packages: tidyverse for import/wrangling, lubridate for date construction.
library(tidyverse)
library(lubridate)

# Chunk defaults for the whole document: show code, hide warnings/messages.
knitr::opts_chunk$set(
  echo = TRUE,
  warning = FALSE,
  message = FALSE
)
```
## Challenge Overview
Today's challenge is to:
1) read in a data set, and describe the data set using both words and any supporting information (e.g., tables, etc)
2) tidy data (as needed, including sanity checks)
3) identify variables that need to be mutated
4) mutate variables and sanity check all mutations
## Read in data
Read in one (or more) of the following datasets, using the correct R package and command.
- abc_poll.csv ⭐
- poultry_tidy.xlsx or organiceggpoultry.xls⭐⭐
- FedFundsRate.csv⭐⭐⭐
- hotel_bookings.csv⭐⭐⭐⭐
- debt_in_trillions.xlsx ⭐⭐⭐⭐⭐
```{r}
# Read the hotel bookings CSV into a tibble with readr's read_csv().
# NOTE(review): the path is absolute to one user's Desktop, so the document
# will not render on another machine -- consider a project-relative path.
hotelbookdata <- read_csv("~/Desktop/601_Spring_2023/posts/_data/hotel_bookings.csv")
```
### Briefly describe the data
```{r}
# Preview the first rows of the raw data (head() defaults to six rows).
head(hotelbookdata)
```
```{r}
# Show each column's type and first few values, plus the readr column spec.
str(hotelbookdata)
```
```{r}
# Number of rows and columns in the raw data.
dim(hotelbookdata)
```
```{r}
# List every variable name in the raw data.
colnames(hotelbookdata)
```
```{r}
# Distinct values of the `hotel` column.
unique(hotelbookdata$hotel)
```
## Tidy Data (as needed)
Is your data already tidy, or is there work to be done? Be sure to anticipate your end result to provide a sanity check, and document your work here.
```{r}
```
Any additional comments?
## Identify variables that need to be mutated
Are there any variables that require mutation to be usable in your analysis stream? For example, are all time variables correctly coded as dates? Are all string variables reduced and cleaned to sensible categories? Do you need to turn any variables into factors and reorder for ease of graphics and visualization?
Document your work here.
```{r}
# Build a single arrival date from the separate year / month-name / day
# columns, then drop the pieces (and the week number) that the new Date
# column makes redundant.
mutated_data <- hotelbookdata %>%
  mutate(
    # month.name is base R's vector of English month names, so match()
    # returns the month number (1-12), or NA for an unrecognised name.
    month = match(arrival_date_month, month.name),
    date_of_arrival = make_date(
      arrival_date_year, month, arrival_date_day_of_month
    )
  ) %>%
  select(
    -c(
      arrival_date_year, arrival_date_month, arrival_date_day_of_month,
      month, arrival_date_week_number
    )
  )
mutated_data
```
After eliminating four of the initial date columns (year, month, day, and week) and introducing a new column to represent the complete expected arrival date, the total number of columns (variables) in our dataset is 29.
```{r}
# Sanity check: 32 original columns - 4 date-part columns + 1 new date = 29.
ncol(mutated_data)
```
```{r}
# Rows and columns after the mutation; the row count should be unchanged.
dim(mutated_data)
```
Any additional comments?