Code
library(tidyverse)
library(readr)
::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE) knitr
Shuqi Hong
June 6, 2023
Here are the analysis for railroad file.
rr <- read_csv("_data/railroad_2012_clean_county.csv")
#discard county and group others
rr2 <- rr %>% select(state,total_employees) %>% group_by(state)
rr3 <- rr %>% select(state,total_employees)
#Find mode function
compute_mode <- function(x){
mode_index <- which.max(table(x))
mode <- x[mode_index]
return(mode)
}
#Calculate central tendency and dispersion for each state
rr2 %>%
summarise(mean_employees=mean(total_employees),median_employees=median(total_employees), mode_employees=compute_mode(total_employees), iqr_employees=IQR(total_employees), sd_employees = sd(total_employees) , min_employees=min(total_employees), max_employees=max(total_employees))
# A tibble: 53 × 8
state mean_employees median_employees mode_employees iqr_employees
<chr> <dbl> <dbl> <dbl> <dbl>
1 AE 2 2 2 0
2 AK 17.2 2.5 2 4
3 AL 63.5 26 13 47
4 AP 1 1 1 0
5 AR 53.8 16.5 5 33.8
6 AZ 210. 94 270 296
7 CA 239. 61 9 188
8 CO 64.0 10 553 39
9 CT 324 125 486 167.
10 DC 279 279 279 0
# ℹ 43 more rows
# ℹ 3 more variables: sd_employees <dbl>, min_employees <dbl>,
# max_employees <dbl>
#Calculate central tendency and dispersion for all states
rr3 %>%
summarise(mean_employees=mean(total_employees),median_employees=median(total_employees), mode_employees=compute_mode(total_employees),iqr_employees=IQR(total_employees), sd_employees = sd(total_employees) , min_employees=min(total_employees), max_employees=max(total_employees))
# A tibble: 1 × 7
mean_employees median_employees mode_employees iqr_employees sd_employees
<dbl> <dbl> <dbl> <dbl> <dbl>
1 87.2 21 2 58 284.
# ℹ 2 more variables: min_employees <dbl>, max_employees <dbl>
# A tibble: 6 × 3
state county total_employees
<chr> <chr> <dbl>
1 AK ANCHORAGE 7
2 AK FAIRBANKS NORTH STAR 2
3 AK JUNEAU 3
4 AK MATANUSKA-SUSITNA 2
5 AK SITKA 1
6 AK SKAGWAY MUNICIPALITY 88
Here are the analysis for hotel data
[1] "hotel" "is_canceled"
[3] "lead_time" "arrival_date_year"
[5] "arrival_date_month" "arrival_date_week_number"
[7] "arrival_date_day_of_month" "stays_in_weekend_nights"
[9] "stays_in_week_nights" "adults"
[11] "children" "babies"
[13] "meal" "country"
[15] "market_segment" "distribution_channel"
[17] "is_repeated_guest" "previous_cancellations"
[19] "previous_bookings_not_canceled" "reserved_room_type"
[21] "assigned_room_type" "booking_changes"
[23] "deposit_type" "agent"
[25] "company" "days_in_waiting_list"
[27] "customer_type" "adr"
[29] "required_car_parking_spaces" "total_of_special_requests"
[31] "reservation_status" "reservation_status_date"
[1] "Resort Hotel" "City Hotel"
# A tibble: 79,330 × 33
hotel is_canceled lead_time arrival_date_year arrival_date_month
<chr> <dbl> <dbl> <dbl> <chr>
1 City Hotel 0 6 2015 July
2 City Hotel 1 88 2015 July
3 City Hotel 1 65 2015 July
4 City Hotel 1 92 2015 July
5 City Hotel 1 100 2015 July
6 City Hotel 1 79 2015 July
7 City Hotel 0 3 2015 July
8 City Hotel 1 63 2015 July
9 City Hotel 1 62 2015 July
10 City Hotel 1 62 2015 July
# ℹ 79,320 more rows
# ℹ 28 more variables: arrival_date_week_number <dbl>,
# arrival_date_day_of_month <dbl>, stays_in_weekend_nights <dbl>,
# stays_in_week_nights <dbl>, adults <dbl>, children <dbl>, babies <dbl>,
# meal <chr>, country <chr>, market_segment <chr>,
# distribution_channel <chr>, is_repeated_guest <dbl>,
# previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>, …
# A tibble: 40,060 × 33
hotel is_canceled lead_time arrival_date_year arrival_date_month
<chr> <dbl> <dbl> <dbl> <chr>
1 Resort Hotel 0 342 2015 July
2 Resort Hotel 0 737 2015 July
3 Resort Hotel 0 7 2015 July
4 Resort Hotel 0 13 2015 July
5 Resort Hotel 0 14 2015 July
6 Resort Hotel 0 14 2015 July
7 Resort Hotel 0 0 2015 July
8 Resort Hotel 0 9 2015 July
9 Resort Hotel 1 85 2015 July
10 Resort Hotel 1 75 2015 July
# ℹ 40,050 more rows
# ℹ 28 more variables: arrival_date_week_number <dbl>,
# arrival_date_day_of_month <dbl>, stays_in_weekend_nights <dbl>,
# stays_in_week_nights <dbl>, adults <dbl>, children <dbl>, babies <dbl>,
# meal <chr>, country <chr>, market_segment <chr>,
# distribution_channel <chr>, is_repeated_guest <dbl>,
# previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>, …
state
hotel canceled regular
City Hotel 33102 46228
Resort Hotel 11122 28938
children
hotel 0 1 2 3 10
City Hotel 74220 3023 2024 59 0
Resort Hotel 36576 1838 1628 17 1
City Hotel Resort Hotel
79326 40060
From this new table, we can compare the number of cancel state of two hotel.
# A tibble: 6 × 34
hotel is_canceled lead_time arrival_date_year arrival_date_month
<chr> <dbl> <dbl> <dbl> <chr>
1 Resort Hotel 0 342 2015 July
2 Resort Hotel 0 737 2015 July
3 Resort Hotel 0 7 2015 July
4 Resort Hotel 0 13 2015 July
5 Resort Hotel 0 14 2015 July
6 Resort Hotel 0 14 2015 July
# ℹ 29 more variables: arrival_date_week_number <dbl>,
# arrival_date_day_of_month <dbl>, stays_in_weekend_nights <dbl>,
# stays_in_week_nights <dbl>, adults <dbl>, children <dbl>, babies <dbl>,
# meal <chr>, country <chr>, market_segment <chr>,
# distribution_channel <chr>, is_repeated_guest <dbl>,
# previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>,
# reserved_room_type <chr>, assigned_room_type <chr>, …
# A tibble: 2 × 179
hotel PRT GBR USA ESP IRL FRA `NULL` ROU NOR OMN ARG POL
<chr> <lis> <lis> <lis> <lis> <lis> <lis> <list> <lis> <lis> <lis> <lis> <lis>
1 Reso… <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
2 City… <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# ℹ 166 more variables: DEU <list>, BEL <list>, CHE <list>, CN <list>,
# GRC <list>, ITA <list>, NLD <list>, DNK <list>, RUS <list>, SWE <list>,
# AUS <list>, EST <list>, CZE <list>, BRA <list>, FIN <list>, MOZ <list>,
# BWA <list>, LUX <list>, SVN <list>, ALB <list>, IND <list>, CHN <list>,
# MEX <list>, MAR <list>, UKR <list>, SMR <list>, LVA <list>, PRI <list>,
# SRB <list>, CHL <list>, AUT <list>, BLR <list>, LTU <list>, TUR <list>,
# ZAF <list>, AGO <list>, ISR <list>, CYM <list>, ZMB <list>, CPV <list>, …
I don’t know why the cell became <dbl[123]>. Maybe I still don’t know the meaning of pivot_wider. I mean what does it mean to swap columns and rows?
---
title: "Challenge 2"
author: "Shuqi Hong"
description: "Data wrangling: using group() and summarise()"
date: "6/6/2023"
format:
html:
toc: true
code-fold: true
code-copy: true
code-tools: true
categories:
- challenge_2
- railroads
- faostat
- hotel_bookings
---
```{r}
#| label: setup
#| warning: false
#| message: false
library(tidyverse)
library(readr)
knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)
```
Here are the analysis for railroad file.
```{r}
rr <- read_csv("_data/railroad_2012_clean_county.csv")
#discard county and group others
rr2 <- rr %>% select(state,total_employees) %>% group_by(state)
rr3 <- rr %>% select(state,total_employees)
#Find mode function
compute_mode <- function(x){
mode_index <- which.max(table(x))
mode <- x[mode_index]
return(mode)
}
#Calculate central tendency and dispersion for each state
rr2 %>%
summarise(mean_employees=mean(total_employees),median_employees=median(total_employees), mode_employees=compute_mode(total_employees), iqr_employees=IQR(total_employees), sd_employees = sd(total_employees) , min_employees=min(total_employees), max_employees=max(total_employees))
#Calculate central tendency and dispersion for all states
rr3 %>%
summarise(mean_employees=mean(total_employees),median_employees=median(total_employees), mode_employees=compute_mode(total_employees),iqr_employees=IQR(total_employees), sd_employees = sd(total_employees) , min_employees=min(total_employees), max_employees=max(total_employees))
#Filter out the number of employees of each county in AK
rr%>% filter(`state`=="AK")
```
Here are the analysis for hotel data
```{r}
hb <- read_csv("_data/hotel_bookings.csv")
#See the name of each colunm
colnames(hb)
#See what kind of hotel names there are
unique(hb$hotel)
#Change the name of is_canceled which is not clear to see
hb <- hb %>%
mutate(state = case_when
(is_canceled == "1" ~ "canceled",
is_canceled == "0" ~ "regular"))
#make a new table for each hotel
CityHotel <-hb %>% filter(`hotel` == "City Hotel")
CityHotel
ResortHotel <-hb %>% filter(`hotel` == "Resort Hotel")
ResortHotel
```
```{r}
#Make a corsstable
comparison_hs <-xtabs(~hotel+state, hb)
comparison_hs
comparison_hc <-xtabs(~hotel+children, hb)
comparison_hc
rowSums(comparison_hc)
```
From this new table, we can compare the number of cancel state of two hotel.
```{r}
hb<- hb %>% mutate(`customer number` =adults + babies + children)
head(hb)
hb %>% select(country,"customer number", hotel) %>%
pivot_wider(names_from=country, values_from="customer number")
```
I don't know why the cell became <dbl[123]>. Maybe I still don't know the meaning of pivot_wider. I mean what does it mean to swap columns and rows?