challenge_2
railroads
faostat
hotel_bookings
Data wrangling: using group() and summarise()
Author

Shuqi Hong

Published

June 6, 2023

Code
library(tidyverse)
library(readr)
knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)

Here are the analysis for railroad file.

Code
rr <- read_csv("_data/railroad_2012_clean_county.csv")

#discard county and group others
rr2 <- rr %>% select(state,total_employees) %>% group_by(state) 

rr3 <- rr %>% select(state,total_employees)

#Find mode function
compute_mode <- function(x){
  mode_index <- which.max(table(x))
  mode <- x[mode_index]
return(mode)
}

#Calculate central tendency and dispersion for each state
rr2 %>%
  summarise(mean_employees=mean(total_employees),median_employees=median(total_employees), mode_employees=compute_mode(total_employees), iqr_employees=IQR(total_employees), sd_employees = sd(total_employees) , min_employees=min(total_employees), max_employees=max(total_employees)) 
# A tibble: 53 × 8
   state mean_employees median_employees mode_employees iqr_employees
   <chr>          <dbl>            <dbl>          <dbl>         <dbl>
 1 AE               2                2                2           0  
 2 AK              17.2              2.5              2           4  
 3 AL              63.5             26               13          47  
 4 AP               1                1                1           0  
 5 AR              53.8             16.5              5          33.8
 6 AZ             210.              94              270         296  
 7 CA             239.              61                9         188  
 8 CO              64.0             10              553          39  
 9 CT             324              125              486         167. 
10 DC             279              279              279           0  
# ℹ 43 more rows
# ℹ 3 more variables: sd_employees <dbl>, min_employees <dbl>,
#   max_employees <dbl>
Code
#Calculate central tendency and dispersion for all states
rr3 %>%
  summarise(mean_employees=mean(total_employees),median_employees=median(total_employees), mode_employees=compute_mode(total_employees),iqr_employees=IQR(total_employees), sd_employees = sd(total_employees) , min_employees=min(total_employees), max_employees=max(total_employees)) 
# A tibble: 1 × 7
  mean_employees median_employees mode_employees iqr_employees sd_employees
           <dbl>            <dbl>          <dbl>         <dbl>        <dbl>
1           87.2               21              2            58         284.
# ℹ 2 more variables: min_employees <dbl>, max_employees <dbl>
Code
#Filter out the number of employees of each county in AK
rr%>% filter(`state`=="AK")
# A tibble: 6 × 3
  state county               total_employees
  <chr> <chr>                          <dbl>
1 AK    ANCHORAGE                          7
2 AK    FAIRBANKS NORTH STAR               2
3 AK    JUNEAU                             3
4 AK    MATANUSKA-SUSITNA                  2
5 AK    SITKA                              1
6 AK    SKAGWAY MUNICIPALITY              88

Here are the analysis for hotel data

Code
hb <- read_csv("_data/hotel_bookings.csv")

#See the name of each colunm
colnames(hb)
 [1] "hotel"                          "is_canceled"                   
 [3] "lead_time"                      "arrival_date_year"             
 [5] "arrival_date_month"             "arrival_date_week_number"      
 [7] "arrival_date_day_of_month"      "stays_in_weekend_nights"       
 [9] "stays_in_week_nights"           "adults"                        
[11] "children"                       "babies"                        
[13] "meal"                           "country"                       
[15] "market_segment"                 "distribution_channel"          
[17] "is_repeated_guest"              "previous_cancellations"        
[19] "previous_bookings_not_canceled" "reserved_room_type"            
[21] "assigned_room_type"             "booking_changes"               
[23] "deposit_type"                   "agent"                         
[25] "company"                        "days_in_waiting_list"          
[27] "customer_type"                  "adr"                           
[29] "required_car_parking_spaces"    "total_of_special_requests"     
[31] "reservation_status"             "reservation_status_date"       
Code
#See what kind of hotel names there are
unique(hb$hotel)
[1] "Resort Hotel" "City Hotel"  
Code
#Change the name of is_canceled which is not clear to see
hb <- hb %>% 
  mutate(state = case_when
         (is_canceled == "1" ~ "canceled",
          is_canceled == "0" ~ "regular"))

#make a new table for each hotel
CityHotel <-hb %>% filter(`hotel` == "City Hotel")
CityHotel
# A tibble: 79,330 × 33
   hotel      is_canceled lead_time arrival_date_year arrival_date_month
   <chr>            <dbl>     <dbl>             <dbl> <chr>             
 1 City Hotel           0         6              2015 July              
 2 City Hotel           1        88              2015 July              
 3 City Hotel           1        65              2015 July              
 4 City Hotel           1        92              2015 July              
 5 City Hotel           1       100              2015 July              
 6 City Hotel           1        79              2015 July              
 7 City Hotel           0         3              2015 July              
 8 City Hotel           1        63              2015 July              
 9 City Hotel           1        62              2015 July              
10 City Hotel           1        62              2015 July              
# ℹ 79,320 more rows
# ℹ 28 more variables: arrival_date_week_number <dbl>,
#   arrival_date_day_of_month <dbl>, stays_in_weekend_nights <dbl>,
#   stays_in_week_nights <dbl>, adults <dbl>, children <dbl>, babies <dbl>,
#   meal <chr>, country <chr>, market_segment <chr>,
#   distribution_channel <chr>, is_repeated_guest <dbl>,
#   previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>, …
Code
ResortHotel <-hb %>% filter(`hotel` == "Resort Hotel")
ResortHotel
# A tibble: 40,060 × 33
   hotel        is_canceled lead_time arrival_date_year arrival_date_month
   <chr>              <dbl>     <dbl>             <dbl> <chr>             
 1 Resort Hotel           0       342              2015 July              
 2 Resort Hotel           0       737              2015 July              
 3 Resort Hotel           0         7              2015 July              
 4 Resort Hotel           0        13              2015 July              
 5 Resort Hotel           0        14              2015 July              
 6 Resort Hotel           0        14              2015 July              
 7 Resort Hotel           0         0              2015 July              
 8 Resort Hotel           0         9              2015 July              
 9 Resort Hotel           1        85              2015 July              
10 Resort Hotel           1        75              2015 July              
# ℹ 40,050 more rows
# ℹ 28 more variables: arrival_date_week_number <dbl>,
#   arrival_date_day_of_month <dbl>, stays_in_weekend_nights <dbl>,
#   stays_in_week_nights <dbl>, adults <dbl>, children <dbl>, babies <dbl>,
#   meal <chr>, country <chr>, market_segment <chr>,
#   distribution_channel <chr>, is_repeated_guest <dbl>,
#   previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>, …
Code
#Make a corsstable
comparison_hs <-xtabs(~hotel+state, hb)
comparison_hs
              state
hotel          canceled regular
  City Hotel      33102   46228
  Resort Hotel    11122   28938
Code
comparison_hc <-xtabs(~hotel+children, hb)
comparison_hc
              children
hotel              0     1     2     3    10
  City Hotel   74220  3023  2024    59     0
  Resort Hotel 36576  1838  1628    17     1
Code
rowSums(comparison_hc)
  City Hotel Resort Hotel 
       79326        40060 

From this new table, we can compare the number of cancel state of two hotel.

Code
hb<- hb %>% mutate(`customer number` =adults + babies + children)
head(hb)
# A tibble: 6 × 34
  hotel        is_canceled lead_time arrival_date_year arrival_date_month
  <chr>              <dbl>     <dbl>             <dbl> <chr>             
1 Resort Hotel           0       342              2015 July              
2 Resort Hotel           0       737              2015 July              
3 Resort Hotel           0         7              2015 July              
4 Resort Hotel           0        13              2015 July              
5 Resort Hotel           0        14              2015 July              
6 Resort Hotel           0        14              2015 July              
# ℹ 29 more variables: arrival_date_week_number <dbl>,
#   arrival_date_day_of_month <dbl>, stays_in_weekend_nights <dbl>,
#   stays_in_week_nights <dbl>, adults <dbl>, children <dbl>, babies <dbl>,
#   meal <chr>, country <chr>, market_segment <chr>,
#   distribution_channel <chr>, is_repeated_guest <dbl>,
#   previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>,
#   reserved_room_type <chr>, assigned_room_type <chr>, …
Code
hb %>% select(country,"customer number", hotel) %>% 
  pivot_wider(names_from=country, values_from="customer number")
# A tibble: 2 × 179
  hotel PRT   GBR   USA   ESP   IRL   FRA   `NULL` ROU   NOR   OMN   ARG   POL  
  <chr> <lis> <lis> <lis> <lis> <lis> <lis> <list> <lis> <lis> <lis> <lis> <lis>
1 Reso… <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>
2 City… <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>
# ℹ 166 more variables: DEU <list>, BEL <list>, CHE <list>, CN <list>,
#   GRC <list>, ITA <list>, NLD <list>, DNK <list>, RUS <list>, SWE <list>,
#   AUS <list>, EST <list>, CZE <list>, BRA <list>, FIN <list>, MOZ <list>,
#   BWA <list>, LUX <list>, SVN <list>, ALB <list>, IND <list>, CHN <list>,
#   MEX <list>, MAR <list>, UKR <list>, SMR <list>, LVA <list>, PRI <list>,
#   SRB <list>, CHL <list>, AUT <list>, BLR <list>, LTU <list>, TUR <list>,
#   ZAF <list>, AGO <list>, ISR <list>, CYM <list>, ZMB <list>, CPV <list>, …

I don’t know why the cell became <dbl[123]>. Maybe I still don’t know the meaning of pivot_wider. I mean what does it mean to swap columns and rows?