Challenge 8

Joining Data
Author

Nayan Jani

Published

August 25, 2022

library(tidyverse)
library(ggplot2)
library(lubridate)

knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)

Read in data

birds<-read_csv("_data/birds.csv", col_names = c("Domain_Code", "Domain", "Area_Code", "Area", "Element_Code","Element", "Item_Code","Item", "Year_Code", "Year","Unit", "Value", "Flag", "Flag_Description" ), skip=1)%>%
  select(-c("Year_Code"))
birds
ABCDEFGHIJ0123456789
Domain_Code
<chr>
Domain
<chr>
Area_Code
<dbl>
Area
<chr>
Element_Code
<dbl>
Element
<chr>
Item_Code
<dbl>
Item
<chr>
Year
<dbl>
Unit
<chr>
QALive Animals2Afghanistan5112Stocks1057Chickens19611000 Head
QALive Animals2Afghanistan5112Stocks1057Chickens19621000 Head
QALive Animals2Afghanistan5112Stocks1057Chickens19631000 Head
QALive Animals2Afghanistan5112Stocks1057Chickens19641000 Head
QALive Animals2Afghanistan5112Stocks1057Chickens19651000 Head
QALive Animals2Afghanistan5112Stocks1057Chickens19661000 Head
QALive Animals2Afghanistan5112Stocks1057Chickens19671000 Head
QALive Animals2Afghanistan5112Stocks1057Chickens19681000 Head
QALive Animals2Afghanistan5112Stocks1057Chickens19691000 Head
QALive Animals2Afghanistan5112Stocks1057Chickens19701000 Head
livestock <- read_csv("_data/FAOSTAT_livestock.csv", col_names = c("Domain_Code", "Domain", "Area_Code", "Area", "Element_Code","Element", "Item_Code","Item", "Year_Code", "Year","Unit", "Value", "Flag", "Flag_Description" ), skip=1) %>%
  select(-c("Year_Code"))
livestock
ABCDEFGHIJ0123456789
Domain_Code
<chr>
Domain
<chr>
Area_Code
<dbl>
Area
<chr>
Element_Code
<dbl>
Element
<chr>
Item_Code
<dbl>
Item
<chr>
Year
<dbl>
Unit
<chr>
QALive Animals2Afghanistan5111Stocks1107Asses1961Head
QALive Animals2Afghanistan5111Stocks1107Asses1962Head
QALive Animals2Afghanistan5111Stocks1107Asses1963Head
QALive Animals2Afghanistan5111Stocks1107Asses1964Head
QALive Animals2Afghanistan5111Stocks1107Asses1965Head
QALive Animals2Afghanistan5111Stocks1107Asses1966Head
QALive Animals2Afghanistan5111Stocks1107Asses1967Head
QALive Animals2Afghanistan5111Stocks1107Asses1968Head
QALive Animals2Afghanistan5111Stocks1107Asses1969Head
QALive Animals2Afghanistan5111Stocks1107Asses1970Head
c_groups <- read_csv("_data/FAOSTAT_country_groups.csv",col_names = c("Country_Group_Code","Country_Group","Country_Code", "Country", "M49_Code", "ISO2_Code", "ISO3_Code"), skip=1)
c_groups
ABCDEFGHIJ0123456789
Country_Group_Code
<dbl>
Country_Group
<chr>
Country_Code
<dbl>
Country
<chr>
M49_Code
<chr>
ISO2_Code
<chr>
ISO3_Code
<chr>
5100Africa4Algeria012DZDZA
5100Africa7Angola024AOAGO
5100Africa53Benin204BJBEN
5100Africa20Botswana072BWBWA
5100Africa233Burkina Faso854BFBFA
5100Africa29Burundi108BIBDI
5100Africa35Cabo Verde132CVCPV
5100Africa32Cameroon120CMCMR
5100Africa37Central African Republic140CFCAF
5100Africa39Chad148TDTCD
c_groups %>%
  count(Country_Group_Code,Country_Code) %>% 
  filter(n > 1)
ABCDEFGHIJ0123456789
Country_Group_Code
<dbl>
Country_Code
<dbl>
n
<int>

My final filter is what I think the Primary key is for c_groups.

Join pt1

b_sum <-birds %>%
  group_by(Item)%>%
  summarise(avg_stocks = mean(Value, na.rm=TRUE),
            med_stocks = median(Value, na.rm=TRUE),
            n_missing = sum(is.na(Value)))

l_sum <-livestock %>%
  group_by(Item)%>%
  summarise(avg_stocks = mean(Value, na.rm=TRUE),
            med_stocks = median(Value, na.rm=TRUE),
            n_missing = sum(is.na(Value)))

b_sum
ABCDEFGHIJ0123456789
Item
<chr>
avg_stocks
<dbl>
med_stocks
<dbl>
n_missing
<int>
Chickens207930.80810783.5104
Ducks23071.673510.0417
Geese and guinea fowls10291.937258.0139
Pigeons, other birds6163.3752800.058
Turkeys15227.919528.0318
l_sum
ABCDEFGHIJ0123456789
Item
<chr>
avg_stocks
<dbl>
med_stocks
<dbl>
n_missing
<int>
Asses114226030000139
Buffaloes10189405260000185
Camels156635723100096
Cattle226212851393394166
Goats11799739435080227
Horses126437657100208
Mules4687671400080
Pigs15423572349039112
Sheep2145417970925788
item_sum <- full_join(b_sum,l_sum)

item_sum
ABCDEFGHIJ0123456789
Item
<chr>
avg_stocks
<dbl>
med_stocks
<dbl>
n_missing
<int>
Chickens207930.80810783.5104
Ducks23071.673510.0417
Geese and guinea fowls10291.937258.0139
Pigeons, other birds6163.3752800.058
Turkeys15227.919528.0318
Asses1142260.00630000.0139
Buffaloes10189405.200260000.0185
Camels1566356.858231000.096
Cattle22621285.1281393393.5166
Goats11799738.637435080.0227
ggplot(item_sum, aes(x=Item, y=avg_stocks))+
  geom_bar(stat="identity") +
  coord_flip() +
  geom_text(aes(label = round(avg_stocks), color = "red"), size=3, hjust=.6) +
  theme(legend.position = "none")

Here I summarized the Birds and Livestock stocks and joined them into one table. I then created a Bar graph showing the average amount of stock for all animals.

livestock %>%
  count(Area_Code,Item, Year) %>% 
  filter(n > 1)
ABCDEFGHIJ0123456789
Area_Code
<dbl>
Item
<chr>
Year
<dbl>
n
<int>
livestock_new <- livestock %>%
  mutate(index = row_number()) %>%
  select(index, everything())

c_groups_new <- c_groups %>%
  mutate(index = row_number()) %>%
  select(index, everything())

livestock_new
ABCDEFGHIJ0123456789
index
<int>
Domain_Code
<chr>
Domain
<chr>
Area_Code
<dbl>
Area
<chr>
Element_Code
<dbl>
Element
<chr>
Item_Code
<dbl>
Item
<chr>
Year
<dbl>
1QALive Animals2Afghanistan5111Stocks1107Asses1961
2QALive Animals2Afghanistan5111Stocks1107Asses1962
3QALive Animals2Afghanistan5111Stocks1107Asses1963
4QALive Animals2Afghanistan5111Stocks1107Asses1964
5QALive Animals2Afghanistan5111Stocks1107Asses1965
6QALive Animals2Afghanistan5111Stocks1107Asses1966
7QALive Animals2Afghanistan5111Stocks1107Asses1967
8QALive Animals2Afghanistan5111Stocks1107Asses1968
9QALive Animals2Afghanistan5111Stocks1107Asses1969
10QALive Animals2Afghanistan5111Stocks1107Asses1970
c_groups_new
ABCDEFGHIJ0123456789
index
<int>
Country_Group_Code
<dbl>
Country_Group
<chr>
Country_Code
<dbl>
Country
<chr>
M49_Code
<chr>
ISO2_Code
<chr>
ISO3_Code
<chr>
15100Africa4Algeria012DZDZA
25100Africa7Angola024AOAGO
35100Africa53Benin204BJBEN
45100Africa20Botswana072BWBWA
55100Africa233Burkina Faso854BFBFA
65100Africa29Burundi108BIBDI
75100Africa35Cabo Verde132CVCPV
85100Africa32Cameroon120CMCMR
95100Africa37Central African Republic140CFCAF
105100Africa39Chad148TDTCD

The First filter represents the possible primary key for livestock. I created a Index for livestock and c_groups so that I could join them together.

Join pt2

livestock %>%
  full_join(birds)
ABCDEFGHIJ0123456789
Domain_Code
<chr>
Domain
<chr>
Area_Code
<dbl>
Area
<chr>
Element_Code
<dbl>
Element
<chr>
Item_Code
<dbl>
Item
<chr>
Year
<dbl>
Unit
<chr>
QALive Animals2Afghanistan5111Stocks1107Asses1961Head
QALive Animals2Afghanistan5111Stocks1107Asses1962Head
QALive Animals2Afghanistan5111Stocks1107Asses1963Head
QALive Animals2Afghanistan5111Stocks1107Asses1964Head
QALive Animals2Afghanistan5111Stocks1107Asses1965Head
QALive Animals2Afghanistan5111Stocks1107Asses1966Head
QALive Animals2Afghanistan5111Stocks1107Asses1967Head
QALive Animals2Afghanistan5111Stocks1107Asses1968Head
QALive Animals2Afghanistan5111Stocks1107Asses1969Head
QALive Animals2Afghanistan5111Stocks1107Asses1970Head
livestock_new %>%
  inner_join(c_groups_new)
ABCDEFGHIJ0123456789
index
<int>
Domain_Code
<chr>
Domain
<chr>
Area_Code
<dbl>
Area
<chr>
Element_Code
<dbl>
Element
<chr>
Item_Code
<dbl>
Item
<chr>
Year
<dbl>
1QALive Animals2Afghanistan5111Stocks1107Asses1961
2QALive Animals2Afghanistan5111Stocks1107Asses1962
3QALive Animals2Afghanistan5111Stocks1107Asses1963
4QALive Animals2Afghanistan5111Stocks1107Asses1964
5QALive Animals2Afghanistan5111Stocks1107Asses1965
6QALive Animals2Afghanistan5111Stocks1107Asses1966
7QALive Animals2Afghanistan5111Stocks1107Asses1967
8QALive Animals2Afghanistan5111Stocks1107Asses1968
9QALive Animals2Afghanistan5111Stocks1107Asses1969
10QALive Animals2Afghanistan5111Stocks1107Asses1970