library(tidyverse)
library(ggplot2)
# Chunk defaults for the rendered document: show the code, but suppress
# warnings and messages in the output.
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
Challenge 6
Challenge Overview
Today’s challenge is to:
- read in a data set, and describe the data set using both words and any supporting information (e.g., tables, etc)
- tidy data (as needed, including sanity checks)
- mutate variables as needed (including sanity checks)
- create at least one graph including time (evolution)
- try to make them “publication” ready (optional)
- Explain why you choose the specific graph type
- Create at least one graph depicting part-whole or flow relationships
- try to make them “publication” ready (optional)
- Explain why you choose the specific graph type
R Graph Gallery is a good starting point for thinking about what information is conveyed in standard graph types, and includes example R code.
(be sure to only include the category tags for the data you use!)
Read in data
Read in one (or more) of the following datasets, using the correct R package and command.
- debt ⭐
- fed_rate ⭐⭐
- abc_poll ⭐⭐⭐
- usa_hh ⭐⭐⭐
- hotel_bookings ⭐⭐⭐⭐
- AB_NYC ⭐⭐⭐⭐⭐
# Load the hotel bookings data set into a base data.frame.
# Missing values in this file are stored as the literal text "NULL" (see the
# agent, company and country columns in the output below); read.csv() keeps
# them as strings rather than converting them to NA.
# NOTE(review): the path is specific to one machine - consider a
# project-relative path so the document renders elsewhere.
hotelbookdata <- read.csv("~/Documents/601/601_Spring_2023/posts/_data/hotel_bookings.csv")
Briefly describe the data
# Preview the first six rows to get a feel for the columns and their values.
head(hotelbookdata, n = 6L)
hotel is_canceled lead_time arrival_date_year arrival_date_month
1 Resort Hotel 0 342 2015 July
2 Resort Hotel 0 737 2015 July
3 Resort Hotel 0 7 2015 July
4 Resort Hotel 0 13 2015 July
5 Resort Hotel 0 14 2015 July
6 Resort Hotel 0 14 2015 July
arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights
1 27 1 0
2 27 1 0
3 27 1 0
4 27 1 0
5 27 1 0
6 27 1 0
stays_in_week_nights adults children babies meal country market_segment
1 0 2 0 0 BB PRT Direct
2 0 2 0 0 BB PRT Direct
3 1 1 0 0 BB GBR Direct
4 1 1 0 0 BB GBR Corporate
5 2 2 0 0 BB GBR Online TA
6 2 2 0 0 BB GBR Online TA
distribution_channel is_repeated_guest previous_cancellations
1 Direct 0 0
2 Direct 0 0
3 Direct 0 0
4 Corporate 0 0
5 TA/TO 0 0
6 TA/TO 0 0
previous_bookings_not_canceled reserved_room_type assigned_room_type
1 0 C C
2 0 C C
3 0 A C
4 0 A A
5 0 A A
6 0 A A
booking_changes deposit_type agent company days_in_waiting_list customer_type
1 3 No Deposit NULL NULL 0 Transient
2 4 No Deposit NULL NULL 0 Transient
3 0 No Deposit NULL NULL 0 Transient
4 0 No Deposit 304 NULL 0 Transient
5 0 No Deposit 240 NULL 0 Transient
6 0 No Deposit 240 NULL 0 Transient
adr required_car_parking_spaces total_of_special_requests reservation_status
1 0 0 0 Check-Out
2 0 0 0 Check-Out
3 75 0 0 Check-Out
4 75 0 0 Check-Out
5 98 0 1 Check-Out
6 98 0 1 Check-Out
reservation_status_date
1 2015-07-01
2 2015-07-01
3 2015-07-02
4 2015-07-02
5 2015-07-03
6 2015-07-03
# List the column names of the bookings data frame.
names(hotelbookdata)
[1] "hotel" "is_canceled"
[3] "lead_time" "arrival_date_year"
[5] "arrival_date_month" "arrival_date_week_number"
[7] "arrival_date_day_of_month" "stays_in_weekend_nights"
[9] "stays_in_week_nights" "adults"
[11] "children" "babies"
[13] "meal" "country"
[15] "market_segment" "distribution_channel"
[17] "is_repeated_guest" "previous_cancellations"
[19] "previous_bookings_not_canceled" "reserved_room_type"
[21] "assigned_room_type" "booking_changes"
[23] "deposit_type" "agent"
[25] "company" "days_in_waiting_list"
[27] "customer_type" "adr"
[29] "required_car_parking_spaces" "total_of_special_requests"
[31] "reservation_status" "reservation_status_date"
# Report the size of the data set as (rows, columns).
c(nrow(hotelbookdata), ncol(hotelbookdata))
[1] 119390 32
# Show each column's type together with its first few values.
str(object = hotelbookdata)
'data.frame': 119390 obs. of 32 variables:
$ hotel : chr "Resort Hotel" "Resort Hotel" "Resort Hotel" "Resort Hotel" ...
$ is_canceled : int 0 0 0 0 0 0 0 0 1 1 ...
$ lead_time : int 342 737 7 13 14 14 0 9 85 75 ...
$ arrival_date_year : int 2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
$ arrival_date_month : chr "July" "July" "July" "July" ...
$ arrival_date_week_number : int 27 27 27 27 27 27 27 27 27 27 ...
$ arrival_date_day_of_month : int 1 1 1 1 1 1 1 1 1 1 ...
$ stays_in_weekend_nights : int 0 0 0 0 0 0 0 0 0 0 ...
$ stays_in_week_nights : int 0 0 1 1 2 2 2 2 3 3 ...
$ adults : int 2 2 1 1 2 2 2 2 2 2 ...
$ children : int 0 0 0 0 0 0 0 0 0 0 ...
$ babies : int 0 0 0 0 0 0 0 0 0 0 ...
$ meal : chr "BB" "BB" "BB" "BB" ...
$ country : chr "PRT" "PRT" "GBR" "GBR" ...
$ market_segment : chr "Direct" "Direct" "Direct" "Corporate" ...
$ distribution_channel : chr "Direct" "Direct" "Direct" "Corporate" ...
$ is_repeated_guest : int 0 0 0 0 0 0 0 0 0 0 ...
$ previous_cancellations : int 0 0 0 0 0 0 0 0 0 0 ...
$ previous_bookings_not_canceled: int 0 0 0 0 0 0 0 0 0 0 ...
$ reserved_room_type : chr "C" "C" "A" "A" ...
$ assigned_room_type : chr "C" "C" "C" "A" ...
$ booking_changes : int 3 4 0 0 0 0 0 0 0 0 ...
$ deposit_type : chr "No Deposit" "No Deposit" "No Deposit" "No Deposit" ...
$ agent : chr "NULL" "NULL" "NULL" "304" ...
$ company : chr "NULL" "NULL" "NULL" "NULL" ...
$ days_in_waiting_list : int 0 0 0 0 0 0 0 0 0 0 ...
$ customer_type : chr "Transient" "Transient" "Transient" "Transient" ...
$ adr : num 0 0 75 75 98 ...
$ required_car_parking_spaces : int 0 0 0 0 0 0 0 0 0 0 ...
$ total_of_special_requests : int 0 0 0 0 1 1 0 1 1 0 ...
$ reservation_status : chr "Check-Out" "Check-Out" "Check-Out" "Check-Out" ...
$ reservation_status_date : chr "2015-07-01" "2015-07-01" "2015-07-02" "2015-07-02" ...
# List every distinct value of `country` that appears in the bookings.
# The result is a one-column data frame, in order of first appearance.
hotelbookdata %>%
  distinct(country)
country
1 PRT
2 GBR
3 USA
4 ESP
5 IRL
6 FRA
7 NULL
8 ROU
9 NOR
10 OMN
11 ARG
12 POL
13 DEU
14 BEL
15 CHE
16 CN
17 GRC
18 ITA
19 NLD
20 DNK
21 RUS
22 SWE
23 AUS
24 EST
25 CZE
26 BRA
27 FIN
28 MOZ
29 BWA
30 LUX
31 SVN
32 ALB
33 IND
34 CHN
35 MEX
36 MAR
37 UKR
38 SMR
39 LVA
40 PRI
41 SRB
42 CHL
43 AUT
44 BLR
45 LTU
46 TUR
47 ZAF
48 AGO
49 ISR
50 CYM
51 ZMB
52 CPV
53 ZWE
54 DZA
55 KOR
56 CRI
57 HUN
58 ARE
59 TUN
60 JAM
61 HRV
62 HKG
63 IRN
64 GEO
65 AND
66 GIB
67 URY
68 JEY
69 CAF
70 CYP
71 COL
72 GGY
73 KWT
74 NGA
75 MDV
76 VEN
77 SVK
78 FJI
79 KAZ
80 PAK
81 IDN
82 LBN
83 PHL
84 SEN
85 SYC
86 AZE
87 BHR
88 NZL
89 THA
90 DOM
91 MKD
92 MYS
93 ARM
94 JPN
95 LKA
96 CUB
97 CMR
98 BIH
99 MUS
100 COM
101 SUR
102 UGA
103 BGR
104 CIV
105 JOR
106 SYR
107 SGP
108 BDI
109 SAU
110 VNM
111 PLW
112 QAT
113 EGY
114 PER
115 MLT
116 MWI
117 ECU
118 MDG
119 ISL
120 UZB
121 NPL
122 BHS
123 MAC
124 TGO
125 TWN
126 DJI
127 STP
128 KNA
129 ETH
130 IRQ
131 HND
132 RWA
133 KHM
134 MCO
135 BGD
136 IMN
137 TJK
138 NIC
139 BEN
140 VGB
141 TZA
142 GAB
143 GHA
144 TMP
145 GLP
146 KEN
147 LIE
148 GNB
149 MNE
150 UMI
151 MYT
152 FRO
153 MMR
154 PAN
155 BFA
156 LBY
157 MLI
158 NAM
159 BOL
160 PRY
161 BRB
162 ABW
163 AIA
164 SLV
165 DMA
166 PYF
167 GUY
168 LCA
169 ATA
170 GTM
171 ASM
172 MRT
173 NCL
174 KIR
175 SDN
176 ATF
177 SLE
178 LAO
# Count the distinct values of `country`.
# Note: the total includes the literal "NULL" placeholder (row 7 of the
# listing above), so the number of real country codes is one lower.
n_distinct(hotelbookdata$country)
[1] 178
Tidy Data (as needed)
Is your data already tidy, or is there work to be done? Be sure to anticipate your end result to provide a sanity check, and document your work here.
# Combine the arrival month name, day of month and year into a single text
# column (e.g. "July 1 2015") that can later be parsed into a Date.
mutated_hotelbookdata <- hotelbookdata %>%
  mutate(
    arrival_date = paste(
      arrival_date_month, arrival_date_day_of_month, arrival_date_year
    )
  )

# Sanity check: inspect the new column. This call previously referenced a
# non-existent object `bookings` and failed with "object 'bookings' not found".
head(mutated_hotelbookdata$arrival_date)
Are there any variables that require mutation to be usable in your analysis stream? For example, do you need to calculate new values in order to graph them? Can string values be represented numerically? Do you need to turn any variables into factors and reorder for ease of graphics and visualization?
Document your work here.
Time Dependent Visualization
# Number of checked-out Resort Hotel bookings per arrival date.
# Result columns: arrival_date (text), reservation_count, full_date (Date).
resort_reservations <- mutated_hotelbookdata %>%
  # Keep only stays that actually happened at the resort hotel.
  filter(reservation_status == "Check-Out", hotel == "Resort Hotel") %>%
  group_by(arrival_date) %>%
  # n() counts the rows in each group directly; no helper column of 1s needed.
  summarise(reservation_count = n()) %>%
  # Parse the text date so it can be placed on a continuous time axis.
  # %B matches full month names in the current locale, so this needs an
  # English locale to parse "July", "August", ...
  mutate(full_date = as.Date(arrival_date, format = "%B %d %Y"))
# Line chart of reservations per arrival date: the x axis is a continuous
# date, so a line shows how the count evolves over time.
ggplot(
  data = resort_reservations,
  mapping = aes(x = full_date, y = reservation_count)
) +
  geom_line(colour = "blue") +
  labs(
    title = "Resort Hotel Reservation Trend",
    x = "Date",
    y = "Number of reservations"
  )
Visualizing Part-Whole Relationships
# Longest waiting-list duration observed for each arrival month.
# Grouping is by month name only, so bookings from all years are pooled.
data_grouped <- mutated_hotelbookdata %>%
  group_by(arrival_date_month) %>%
  summarise(max_waiting_days = max(days_in_waiting_list)) %>%
  # Attach a fixed dummy date (the 1st of the month in 2021) so the months
  # sort chronologically on a date axis; the year itself carries no meaning.
  # %B matches month names in the current locale (English required here).
  mutate(
    month = as.Date(paste0(arrival_date_month, "-01-2021"), format = "%B-%d-%Y")
  )
# Line chart of the maximum waiting-list duration by arrival month.
# A single blue line layer is drawn; the earlier duplicate geom_line() added
# an identical black line that was hidden underneath it.
# NOTE(review): this section is titled part-whole, but a line of monthly
# maxima does not show parts of a whole - consider a stacked or filled bar
# chart (e.g. bookings per month split by market_segment) instead.
ggplot(data_grouped, aes(month, max_waiting_days)) +
  geom_line(color = "blue") +
  # Label each tick with the full month name, one tick per month.
  scale_x_date(date_labels = "%B", date_breaks = "1 month") +
  labs(title = "Maximum Waiting List Duration per Month days")