Importing/Tidying Organic Egg and Poultry Data
# Global chunk options: echo the code, hide messages and warnings in the
# rendered document. knitr option names are case-sensitive, so the option
# must be spelled `message`; a capitalised `Message` is silently ignored.
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
My data is one of the uncleaned data sets provided by the course. It is called organic egg poultry, and it shows price changes for different egg and organic chicken products from January 2004 to December 2013.
I chose this data set because it wasn’t perfectly clean and required some alteration to make it useful in statistical analysis. I also like the idea of working with data across time because I know time-series analysis will be a very useful skill for my future career. Also, poultry products and prices can be related to environmental issues.
# Read the price table from the workbook. Only the cell range A5:L125 is
# imported, and cells holding the text "too few" are treated as missing.
organiceggpoultry <- read_excel(
  path = "organiceggpoultry.xlsx",
  range = "A5:L125",
  na = "too few"
)
# View(organiceggpoultry)
# Combine the Month and Year columns into a single space-separated Date
# column; the two source columns are dropped (remove = TRUE).
Organic_Egg_Poultry <- organiceggpoultry %>%
  unite(Date, Month, Year, sep = " ", remove = TRUE)
# Replace the spreadsheet headers (some contain embedded line breaks) with
# short, syntactic column names. Each pair is new_name = "old name".
Organic_Egg_Poultry <- Organic_Egg_Poultry %>%
  rename(
    Dozen_XL = "Extra Large \r\nDozen",
    Half_Dozen_XL = "Extra Large 1/2 Doz.\r\n1/2 Dozen",
    Dozen_Lg = "Large \r\nDozen",
    Half_Dozen_Lg = "Large \r\n1/2 Doz.",
    BI_Breast = "Bone-in Breast",
    BS_Breast = "B/S Breast",
    Whole_Legs = "Whole Legs"
  )
# Keep only the date plus the four egg and five chicken price columns,
# in this order.
Organic_Egg_Poultry_NEW <- Organic_Egg_Poultry %>%
  select(
    Date,
    Dozen_XL, Half_Dozen_XL, Dozen_Lg, Half_Dozen_Lg,
    Whole, BS_Breast, BI_Breast, Whole_Legs, Thighs
  )
# Wide egg table: the date plus the four egg price columns, stored as a
# plain data.frame.
Egg_Products <- Organic_Egg_Poultry_NEW %>%
  select(Date, Dozen_XL, Half_Dozen_XL, Dozen_Lg, Half_Dozen_Lg) %>%
  as.data.frame()
# View(Egg_Products)
# Reshape the egg table from wide to long: the product name goes to
# Egg_Product and its price to Egg_Price, one row per date/product pair.
Egg_Products_New <- pivot_longer(
  Egg_Products,
  cols = c(Dozen_XL, Half_Dozen_XL, Dozen_Lg, Half_Dozen_Lg),
  names_to = "Egg_Product",
  values_to = "Egg_Price"
)
# view(Egg_Products_New)
# Wide chicken table: the date plus the five chicken price columns, stored
# as a plain data.frame.
Chicken_Products <- Organic_Egg_Poultry_NEW %>%
  select(Date, Whole, BS_Breast, BI_Breast, Whole_Legs, Thighs) %>%
  as.data.frame()
# view(Chicken_Products)
# Reshape the chicken table from wide to long: the product name goes to
# Chicken_Product and its price to Chicken_Price.
Chicken_Products_New <- pivot_longer(
  Chicken_Products,
  cols = c(Whole, BS_Breast, BI_Breast, Whole_Legs, Thighs),
  names_to = "Chicken_Product",
  values_to = "Chicken_Price"
)
# view(Chicken_Products_New)
# Preview the long-format egg table.
Egg_Products_New %>% as_tibble()
# A tibble: 480 x 3
Date Egg_Product Egg_Price
<chr> <chr> <dbl>
1 January 2004 Dozen_XL 230
2 January 2004 Half_Dozen_XL 132
3 January 2004 Dozen_Lg 230
4 January 2004 Half_Dozen_Lg 126
5 February 2004 Dozen_XL 230
6 February 2004 Half_Dozen_XL 134.
7 February 2004 Dozen_Lg 226.
8 February 2004 Half_Dozen_Lg 128.
9 March 2004 Dozen_XL 230
10 March 2004 Half_Dozen_XL 137
# ... with 470 more rows
# Render every egg price as text with exactly two decimal places.
Egg_Products_New[["Egg_Price"]] <- sprintf(
  "%#.2f",
  Egg_Products_New[["Egg_Price"]]
)
# view(Egg_Products_New)
# Preview the long-format chicken table.
Chicken_Products_New %>% as_tibble()
# A tibble: 600 x 3
Date Chicken_Product Chicken_Price
<chr> <chr> <dbl>
1 January 2004 Whole 198.
2 January 2004 BS_Breast 646.
3 January 2004 BI_Breast NA
4 January 2004 Whole_Legs 194.
5 January 2004 Thighs NA
6 February 2004 Whole 198.
7 February 2004 BS_Breast 642.
8 February 2004 BI_Breast NA
9 February 2004 Whole_Legs 194.
10 February 2004 Thighs 203
# ... with 590 more rows
# Render every chicken price as text with exactly two decimal places.
Chicken_Products_New[["Chicken_Price"]] <- sprintf(
  "%#.2f",
  Chicken_Products_New[["Chicken_Price"]]
)
# view(Chicken_Products_New)
# Parse the "Month Year" strings (e.g. "January 2004") into Date values
# in both long tables.
Chicken_Products_New[["Date"]] <- parse_date(
  Chicken_Products_New[["Date"]],
  format = "%B %Y"
)
Egg_Products_New[["Date"]] <- parse_date(
  Egg_Products_New[["Date"]],
  format = "%B %Y"
)
# Preview the chicken table after formatting and date parsing.
Chicken_Products_New %>% as_tibble()
# A tibble: 600 x 3
Date Chicken_Product Chicken_Price
<date> <chr> <chr>
1 2004-01-01 Whole 197.50
2 2004-01-01 BS_Breast 645.50
3 2004-01-01 BI_Breast NA
4 2004-01-01 Whole_Legs 193.50
5 2004-01-01 Thighs NA
6 2004-02-01 Whole 197.50
7 2004-02-01 BS_Breast 642.50
8 2004-02-01 BI_Breast NA
9 2004-02-01 Whole_Legs 193.50
10 2004-02-01 Thighs 203.00
# ... with 590 more rows
# Preview the egg table after formatting and date parsing.
Egg_Products_New %>% as_tibble()
# A tibble: 480 x 3
Date Egg_Product Egg_Price
<date> <chr> <chr>
1 2004-01-01 Dozen_XL 230.00
2 2004-01-01 Half_Dozen_XL 132.00
3 2004-01-01 Dozen_Lg 230.00
4 2004-01-01 Half_Dozen_Lg 126.00
5 2004-02-01 Dozen_XL 230.00
6 2004-02-01 Half_Dozen_XL 134.50
7 2004-02-01 Dozen_Lg 226.25
8 2004-02-01 Half_Dozen_Lg 128.50
9 2004-03-01 Dozen_XL 230.00
10 2004-03-01 Half_Dozen_XL 137.00
# ... with 470 more rows
# Convert the two-decimal price strings back to numeric (double) columns.
Egg_Products_New[["Egg_Price"]] <- as.numeric(Egg_Products_New[["Egg_Price"]])
Chicken_Products_New[["Chicken_Price"]] <- as.numeric(
  Chicken_Products_New[["Chicken_Price"]]
)
# Preview the chicken table with numeric prices.
Chicken_Products_New %>% as_tibble()
# A tibble: 600 x 3
Date Chicken_Product Chicken_Price
<date> <chr> <dbl>
1 2004-01-01 Whole 198.
2 2004-01-01 BS_Breast 646.
3 2004-01-01 BI_Breast NA
4 2004-01-01 Whole_Legs 194.
5 2004-01-01 Thighs NA
6 2004-02-01 Whole 198.
7 2004-02-01 BS_Breast 642.
8 2004-02-01 BI_Breast NA
9 2004-02-01 Whole_Legs 194.
10 2004-02-01 Thighs 203
# ... with 590 more rows
# Preview the egg table with numeric prices.
Egg_Products_New %>% as_tibble()
# A tibble: 480 x 3
Date Egg_Product Egg_Price
<date> <chr> <dbl>
1 2004-01-01 Dozen_XL 230
2 2004-01-01 Half_Dozen_XL 132
3 2004-01-01 Dozen_Lg 230
4 2004-01-01 Half_Dozen_Lg 126
5 2004-02-01 Dozen_XL 230
6 2004-02-01 Half_Dozen_XL 134.
7 2004-02-01 Dozen_Lg 226.
8 2004-02-01 Half_Dozen_Lg 128.
9 2004-03-01 Dozen_XL 230
10 2004-03-01 Half_Dozen_XL 137
# ... with 470 more rows
#The two decimal places don't show in the tibble but can be seen here:
#view(Chicken_Products_New)
#view(Egg_Products_New)
#Future data tidying I may need to do: 1. Change the prices from cents/unit to dollars/unit. 2. Standardize the price of eggs so that the half-dozen and dozen containers are comparable, by finding a price per egg. 3. Can I change the date to remove the ‘day’ and keep just month/year? 4. Can I display, e.g., Jan 2004 in graphs instead of 2004-01?
Text and figures are licensed under Creative Commons Attribution CC BY-NC 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".
For attribution, please cite this work as
Lennon (2022, Jan. 25). Data Analytics and Computational Social Science: Hw #3. Retrieved from https://github.com/DACSS/dacss_course_website/posts/httpsrpubscomlennont857314/
BibTeX citation
@misc{lennon2022hw, author = {Lennon, Tim}, title = {Data Analytics and Computational Social Science: Hw #3}, url = {https://github.com/DACSS/dacss_course_website/posts/httpsrpubscomlennont857314/}, year = {2022} }