Code
library(tidyverse)
library(Hmisc)
library(psych)
library(readxl)
library(stringr)
::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE) knitr
Akhilesh Kumar Meghwal
August 22, 2022
posts/_data
folder, using the read_excel R package and command.Rows: 74
Columns: 13
$ Year <chr> "2013", "Certified Organic Eggs:", "Extra Large \nDozen", "Extra …
$ Jan. <chr> NA, NA, "290", "188.13", "267.5", "178", "Jan.", NA, NA, "288.5",…
$ Feb. <chr> NA, NA, "290", "188.13", "267.5", "178", "Feb.", NA, NA, "288.5",…
$ Mar. <chr> NA, NA, "290", "188.13", "267.5", "178", "Mar.", NA, NA, "288.5",…
$ Apr. <chr> NA, NA, "290", "188.13", "267.5", "178", "Apr.", NA, NA, "288.5",…
$ May <chr> NA, NA, "290", "188.13", "267.5", "178", "May", NA, NA, "288.5", …
$ June <chr> "Cents per pound", NA, "290", "188.13", "267.5", "178", "June", "…
$ July <chr> NA, NA, "290", "188.13", "267.5", "178", "July", NA, NA, "288.5",…
$ Aug. <chr> NA, NA, "290", "188.13", "267.5", "178", "Aug.", NA, NA, "288.5",…
$ Sep. <chr> NA, NA, "290", "188.13", "267.5", "178", "Sep.", NA, NA, "290", "…
$ Oct. <chr> NA, NA, "290", "188.13", "267.5", "178", "Oct.", NA, NA, "290", "…
$ Nov. <chr> NA, NA, "290", "188.13", "267.5", "178", "Nov.", NA, NA, "290", "…
$ Dec. <chr> NA, NA, "290", "188.13", "267.5", "178", "Dec.", NA, NA, "290", "…
Variable | Stats / Values | Freqs (% of Valid) | Graph | Missing | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Year [numeric] |
|
|
0 (0.0%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
Certified_Organic_Eggs [character] |
|
|
0 (0.0%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
Month [character] |
|
|
0 (0.0%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
Price [numeric] |
|
46 distinct values | 0 (0.0%) |
Generated by summarytools 1.0.1 (R version 4.2.1)
2022-09-04
# A tibble: 40 × 5
# Groups: Year, Certified_Organic_Eggs [40]
Year Certified_Organic_Eggs Price_min Price_max Price_mean
<fct> <chr> <dbl> <dbl> <dbl>
1 2004 "Large \nDozen" 225 234. 230.
2 2005 "Large \nDozen" 234. 234. 234.
3 2006 "Large \nDozen" 234. 236. 234.
4 2007 "Large \nDozen" 234. 237 237.
5 2008 "Large \nDozen" 237 278. 267.
6 2009 "Large \nDozen" 272. 278. 274
7 2010 "Large \nDozen" 268. 268 268.
8 2011 "Large \nDozen" 268. 270 269.
9 2012 "Large \nDozen" 268. 268. 268.
10 2013 "Large \nDozen" 268. 268. 268.
# … with 30 more rows
# ℹ Use `print(n = ...)` to see more rows
# A tibble: 480 × 4
Year Certified_Organic_Eggs Month Price
<fct> <chr> <chr> <dbl>
1 2013 "Extra Large \nDozen" Jan. 290
2 2013 "Extra Large \nDozen" Feb. 290
3 2013 "Extra Large \nDozen" Mar. 290
4 2013 "Extra Large \nDozen" Apr. 290
5 2013 "Extra Large \nDozen" May 290
6 2013 "Extra Large \nDozen" June 290
7 2013 "Extra Large \nDozen" July 290
8 2013 "Extra Large \nDozen" Aug. 290
9 2013 "Extra Large \nDozen" Sep. 290
10 2013 "Extra Large \nDozen" Oct. 290
11 2013 "Extra Large \nDozen" Nov. 290
12 2013 "Extra Large \nDozen" Dec. 290
13 2013 "Extra Large 1/2 Doz." Jan. 188.
14 2013 "Extra Large 1/2 Doz." Feb. 188.
15 2013 "Extra Large 1/2 Doz." Mar. 188.
16 2013 "Extra Large 1/2 Doz." Apr. 188.
17 2013 "Extra Large 1/2 Doz." May 188.
18 2013 "Extra Large 1/2 Doz." June 188.
19 2013 "Extra Large 1/2 Doz." July 188.
20 2013 "Extra Large 1/2 Doz." Aug. 188.
21 2013 "Extra Large 1/2 Doz." Sep. 188.
22 2013 "Extra Large 1/2 Doz." Oct. 188.
23 2013 "Extra Large 1/2 Doz." Nov. 188.
24 2013 "Extra Large 1/2 Doz." Dec. 188.
25 2013 "Large \nDozen" Jan. 268.
26 2013 "Large \nDozen" Feb. 268.
27 2013 "Large \nDozen" Mar. 268.
28 2013 "Large \nDozen" Apr. 268.
29 2013 "Large \nDozen" May 268.
30 2013 "Large \nDozen" June 268.
31 2013 "Large \nDozen" July 268.
32 2013 "Large \nDozen" Aug. 268.
33 2013 "Large \nDozen" Sep. 268.
34 2013 "Large \nDozen" Oct. 268.
35 2013 "Large \nDozen" Nov. 268.
36 2013 "Large \nDozen" Dec. 268.
37 2013 "Large \n1/2 Doz." Jan. 178
38 2013 "Large \n1/2 Doz." Feb. 178
39 2013 "Large \n1/2 Doz." Mar. 178
40 2013 "Large \n1/2 Doz." Apr. 178
# … with 440 more rows
# ℹ Use `print(n = ...)` to see more rows
# A tibble: 40 × 14
Year Certified…¹ Jan. Feb. Mar. Apr. May June July Aug. Sep. Oct.
<fct> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2013 "Extra Lar… 290 290 290 290 290 290 290 290 290 290
2 2013 "Extra Lar… 188. 188. 188. 188. 188. 188. 188. 188. 188. 188.
3 2013 "Large \nD… 268. 268. 268. 268. 268. 268. 268. 268. 268. 268.
4 2013 "Large \n1… 178 178 178 178 178 178 178 178 178 178
5 2012 "Extra Lar… 288. 288. 288. 288. 288. 288. 288. 288. 290 290
6 2012 "Extra Lar… 186. 186. 186. 186. 186. 186. 186. 186. 188. 188.
7 2012 "Large \nD… 268. 268. 268. 268. 268. 268. 268. 268. 268. 268.
8 2012 "Large \n1… 174. 174. 173. 173. 173. 173. 173. 173. 178 178
9 2011 "Extra Lar… 286. 286. 286. 286. 286. 286. 286. 286. 286. 286.
10 2011 "Extra Lar… 186. 186. 186. 186. 186. 186. 186. 186. 186. 186.
11 2011 "Large \nD… 268. 268. 268. 270 270 270 270 270 270 270
12 2011 "Large \n1… 174. 174. 174. 174. 174. 174. 174. 174. 174. 174.
13 2010 "Extra Lar… 286. 286. 286. 286. 286. 286. 286. 286. 286. 286.
14 2010 "Extra Lar… 186. 186. 186. 186. 186. 186. 186. 186. 186. 186.
15 2010 "Large \nD… 268 268 268 268 268 268 268 268. 268. 268.
16 2010 "Large \n1… 174. 174. 174. 174. 174. 174. 174. 174. 174. 174.
17 2009 "Extra Lar… 286. 286. 286. 286. 286. 286. 286. 286. 286. 286.
18 2009 "Extra Lar… 186. 186. 186. 186. 186. 186. 186. 186. 186. 186.
19 2009 "Large \nD… 278. 278. 278. 278. 278. 272. 272. 272. 272. 272.
20 2009 "Large \n1… 174. 174. 174. 174. 174. 174. 174. 174. 174. 174.
21 2008 "Extra Lar… 245 245 245 286. 286. 286. 286. 286. 286. 286.
22 2008 "Extra Lar… 139 139 139 186. 186. 186. 186. 186. 186. 186.
23 2008 "Large \nD… 237 237 237 278. 278. 278. 278. 278. 278. 278.
24 2008 "Large \n1… 132 132 132 174. 174. 174. 174. 174. 174. 174.
25 2007 "Extra Lar… 241. 244. 245 245 245 245 245 245 245 245
26 2007 "Extra Lar… 136. 138. 139 139 139 139 139 139 139 139
27 2007 "Large \nD… 234. 236. 237 237 237 237 237 237 237 237
28 2007 "Large \n1… 128. 131. 132 132 132 132 132 132 132 132
29 2006 "Extra Lar… 241 240 241. 242. 242. 242. 242. 242. 242. 242.
30 2006 "Extra Lar… 136. 136. 136. 136. 136. 136. 136. 136. 136. 136.
31 2006 "Large \nD… 234. 234. 234. 234. 234. 234. 234. 234. 234. 234.
32 2006 "Large \n1… 128. 128. 128. 128. 128. 128. 128. 128. 128. 128.
33 2005 "Extra Lar… 241 241 241 241 241 241 241 241 241 241
34 2005 "Extra Lar… 136. 136. 136. 136. 136. 136. 136. 136. 136. 136.
35 2005 "Large \nD… 234. 234. 234. 234. 234. 234. 234. 234. 234. 234.
36 2005 "Large \n1… 128. 128. 128. 128. 128. 128. 128. 128. 128. 128.
37 2004 "Extra Lar… 230 230 230 234. 236 241 241 241 241 241
38 2004 "Extra Lar… 132 134. 137 137 137 137 137 137 136. 136.
39 2004 "Large \nD… 230 226. 225 225 225 231. 234. 234. 234. 234.
40 2004 "Large \n1… 126 128. 131 131 131 134. 134. 134. 130. 128.
# … with 2 more variables: Nov. <dbl>, Dec. <dbl>, and abbreviated variable
# name ¹Certified_Organic_Eggs
# ℹ Use `colnames()` to see all variable names
posts/_data
folder, using the read_excel R package and command.Rows: 99
Columns: 13
$ Year <chr> "2013", "Organic young chicken:", "Whole", "B/S Breast", "Bone-in…
$ Jan. <chr> NA, NA, "238.5", "703.75", "390.5", "203.5", "216.25", "Jan.", NA…
$ Feb. <chr> NA, NA, "238.5", "703.75", "390.5", "203.5", "216.25", "Feb.", NA…
$ Mar. <chr> NA, NA, "238.5", "703.75", "390.5", "203.5", "216.25", "Mar.", NA…
$ Apr. <chr> NA, NA, "238.5", "703.75", "390.5", "203.5", "216.25", "Apr.", NA…
$ May <chr> NA, NA, "238.5", "703.75", "390.5", "203.5", "216.25", "May", NA,…
$ June <chr> "Cents per pound", NA, "238.5", "703.75", "390.5", "203.5", "216.…
$ July <chr> NA, NA, "238.5", "703.75", "390.5", "203.5", "216.25", "July", NA…
$ Aug. <chr> NA, NA, "238.5", "703.75", "390.5", "203.5", "216.25", "Aug.", NA…
$ Sep. <chr> NA, NA, "238.5", "703.75", "390.5", "203.5", "216.25", "Sep.", NA…
$ Oct. <chr> NA, NA, "238.5", "703.75", "390.5", "203.5", "216.25", "Oct.", NA…
$ Nov. <chr> NA, NA, "238.5", "703.75", "390.5", "203.5", "216.25", "Nov.", NA…
$ Dec. <chr> NA, NA, "238.5", "703.75", "390.5", "203.5", "216.25", "Dec.", NA…
Variable | Stats / Values | Freqs (% of Valid) | Graph | Missing | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Year [numeric] |
|
|
0 (0.0%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
Organic_Young_Chicken [character] |
|
|
0 (0.0%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
Month [character] |
|
|
0 (0.0%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
Price [numeric] |
|
33 distinct values | 0 (0.0%) |
Generated by summarytools 1.0.1 (R version 4.2.1)
2022-09-04
# A tibble: 50 × 5
# Groups: Year, Organic_Young_Chicken [50]
Year Organic_Young_Chicken Price_min Price_max Price_mean
<fct> <chr> <dbl> <dbl> <dbl>
1 2004 Whole Legs 194. 204. 199.
2 2005 Whole Legs 204. 204. 204.
3 2006 Whole Legs 204. 204. 204.
4 2007 Whole Legs 204. 204. 204.
5 2008 Whole Legs 204. 204. 204.
6 2009 Whole Legs 204. 204. 204.
7 2010 Whole Legs 204. 204. 204.
8 2011 Whole Legs 204. 204. 204.
9 2012 Whole Legs 204. 204. 204.
10 2013 Whole Legs 204. 204. 204.
11 2004 Whole 198. 217 212.
12 2005 Whole 217 217 217
13 2006 Whole 217 220. 220.
14 2007 Whole 220. 220. 220.
15 2008 Whole 220. 248 237.
16 2009 Whole 248 248 248
17 2010 Whole 235 248 239.
18 2011 Whole 235 235 235
19 2012 Whole 235 238. 238.
20 2013 Whole 238. 238. 238.
21 2004 Thighs 0 203 184.
22 2005 Thighs 213 222 221.
23 2006 Thighs 222 222 222
24 2007 Thighs 222 222 222
25 2008 Thighs 222 222 222
26 2009 Thighs 222 222 222
27 2010 Thighs 215 222 219.
28 2011 Thighs 215 215 215
29 2012 Thighs 215 216. 216.
30 2013 Thighs 216. 216. 216.
31 2004 Bone-in Breast 0 390. 195.
32 2005 Bone-in Breast 390. 390. 390.
33 2006 Bone-in Breast 390. 390. 390.
34 2007 Bone-in Breast 390. 390. 390.
35 2008 Bone-in Breast 390. 390. 390.
36 2009 Bone-in Breast 390. 390. 390.
37 2010 Bone-in Breast 390. 390. 390.
38 2011 Bone-in Breast 390. 390. 390.
39 2012 Bone-in Breast 390. 390. 390.
40 2013 Bone-in Breast 390. 390. 390.
41 2004 B/S Breast 641 646. 643.
42 2005 B/S Breast 644 646. 645.
43 2006 B/S Breast 646. 646. 646.
44 2007 B/S Breast 646. 646. 646.
45 2008 B/S Breast 646. 646. 646.
46 2009 B/S Breast 646. 646. 646.
47 2010 B/S Breast 638. 646. 643.
48 2011 B/S Breast 638. 638. 638.
49 2012 B/S Breast 638. 704. 695.
50 2013 B/S Breast 704. 704. 704.
---
title: "Homework 2"
author: "Akhilesh Kumar Meghwal"
desription: ""
date: "08/22/2022"
format:
html:
toc: true
code-fold: true
code-copy: true
code-tools: true
categories:
- hw2
---
## Homework-2 Overview
- Read in a dataset from the _data folder in the course blog repository, or choose your own data. If you decide to use one of the datasets we have provided, please use a challenging dataset - check with us if you are not sure.
- Clean the data as needed using dplyr and related tidyverse packages.
- Provide a narrative about the data set (look it up if you aren't sure what you have got) and the variables in your dataset, including what type of data each variable is. The goal of this step is to communicate in a visually appealing way to non-experts - not to replicate r-code.
- Identify potential research questions that your dataset can help answer.
```{r}
#| label: setup
#| warning: false
#| message: false
library(tidyverse)
library(Hmisc)
library(psych)
library(readxl)
library(stringr)
knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)
```
## Data Reading
#### Read dataset 'organiceggpoultry.xls', sheet 2, available in the `posts/_data` folder, using the read_excel R package and command.
```{r}
organicegg <- read_excel("_data/organiceggpoultry.xls", sheet = "Organic egg prices, 2004-13", skip = 1)
glimpse(organicegg)
```
## Data Wrangling
##### - mutate function to convert column type to numeric
##### - filter along with regex to remove row consisting NA,Year & Certified Organic Eggs:
##### - replace to replace all NA value with 0
##### - filter to remove unnecessary tail of 41, 42 43 rows
```{r}
organicegg <-organicegg%>%
mutate_at(vars(colnames(organicegg)[2:13]), function(x)as.numeric(x)) %>%
filter(!is.na(Year) & !grepl("^2", Year) & !grepl("^Ce", Year)) %>%
replace(is.na(.), 0)%>%
filter(!row_number() %in% c(41, 42, 43))
```
## Data Wrangling
##### - changed name of first column
##### - create vector of same length as the number of rows in the organicegg dataset and same distribution of years as original dataset
##### - column bind to merge Year vector & organicegg dataset
##### - pivot_longer on All Month Columns
```{r}
names(organicegg)[1] = "Certified_Organic_Eggs"
Year <- rep(c(2013,2012,2011,2010,2009,2008,2007,2006,2005,2004),each=4)
organicegg<- cbind(Year, organicegg)
organicegg <- organicegg%>%
pivot_longer(colnames(organicegg)[3:14], names_to = 'Month', values_to = 'Price')
```
## Describe Data
#### - summary of organicegg dataset using summarytools::dfSummary
```{r}
print(summarytools::dfSummary(organicegg,
varnumbers = FALSE,
plain.ascii = FALSE,
style = "grid",
graph.magnif = 0.70,
valid.col = FALSE),
method = 'render',
table.classes = 'table-condensed')
```
```{r}
# a <-as.data.frame(strsplit(organicegg$Certified_Organic_Eggs, split = '([a-zA-Z\\ -1/-2])'))
# View(a)
```
## Group Summary Statistics
#### - Year wise, Certified_Organic_Eggs wise Price Distribution
```{r}
organicegg$Year<-as.factor(organicegg$Year)
organicegg %>%
group_by(Year, Certified_Organic_Eggs) %>%
summarise(Price_min=min(Price),Price_max=max(Price),Price_mean=mean(Price), .groups = 'keep') %>%
arrange(desc(Certified_Organic_Eggs))
print(organicegg, n=40)
```
## Group Summary Statistics
#### - pivot_wider to expand Month wise Price value across columns
```{r}
organicegg %>%
pivot_wider(names_from = Month, values_from = Price) %>%
print(n=40)
```
## Data Reading
#### - read dataset 'organiceggpoultry.xls', sheet 3, , available in the `posts/_data` folder, using the read_excel R package and command.
```{r}
organicepoultry <- read_excel("_data/organiceggpoultry.xls", sheet = "Organic poultry prices, 2004-13", skip = 1)
glimpse(organicepoultry)
```
## Data Wrangling
#### - mutate function to convert column type to numeric
#### - filter along with regex to remove rows consisting NA, Year & Organic young chicken:
#### - replace to replace all NA value with 0
#### - filter to remove unnecessary tail of 51, 52 and 53 rows
```{r}
organicepoultry <-organicepoultry%>%
mutate_at(vars(colnames(organicepoultry)[2:13]), function(x)as.numeric(x)) %>%
filter(!is.na(Year) & !grepl("^2", Year) & !grepl("^Or", Year)) %>%
replace(is.na(.), 0)%>%
filter(!row_number() %in% c(51, 52, 53))
```
## Data Wrangling
#### - changed name of first column to Organic_Young_Chicken
#### - create vector of same length as the number of rows in the organicegg dataset and same distribution of years as original dataset
#### - column bind to merge, Year vector & organicegg
#### - pivot_longer on All Month Columns
```{r}
names(organicepoultry)[1] = "Organic_Young_Chicken"
Year <- rep(c(2013,2012,2011,2010,2009,2008,2007,2006,2005,2004),each=5)
organicepoultry<- cbind(Year, organicepoultry)
organicepoultry<-organicepoultry%>%
pivot_longer(colnames(organicepoultry)[3:14], names_to = 'Month', values_to = 'Price')
```
## Describe Data
#### - Summary of organicepoultry Dataset, using summarytools::dfSummary
```{r}
print(summarytools::dfSummary(organicepoultry,
varnumbers = FALSE,
plain.ascii = FALSE,
style = "grid",
graph.magnif = 0.70,
valid.col = FALSE),
method = 'render',
table.classes = 'table-condensed')
```
## Group Summary Statistics
#### - Year wise, Certified_Organic_Eggs wise Price Distribution
```{r}
organicepoultry$Year<-as.factor(organicepoultry$Year)
organicepoultry %>%
group_by(Year, Organic_Young_Chicken) %>%
summarise(Price_min=min(Price),Price_max=max(Price),Price_mean=mean(Price), .groups = 'keep') %>%
arrange(desc(Organic_Young_Chicken)) %>%
print(n = 50)
```