Descriptive and Summary Statistics of a dataset

Neha Jhurani

April 12, 2023


# Default chunk option: echo the source code of every chunk in the output.
knitr::opts_chunk$set(echo = TRUE)

Analysing birds data


# Load the birds dataset from the project's _data directory.
birds_data <- read_csv(file = "_data/birds.csv")
Rows: 30977 Columns: 14
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (8): Domain Code, Domain, Area, Element, Item, Unit, Flag, Flag Description
dbl (6): Area Code, Element Code, Item Code, Year Code, Year, Value

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#extracting all the column names
 [1] "Domain Code"      "Domain"           "Area Code"        "Area"            
 [5] "Element Code"     "Element"          "Item Code"        "Item"            
 [9] "Year Code"        "Year"             "Unit"             "Value"           
[13] "Flag"             "Flag Description"
# Keep only the columns needed for the analysis: the area, the kind of
# poultry (Item), the year and the recorded value.
extracted_birds_data <- birds_data %>%
  select(Area, Item, Year, Value)

# Show the largest values first. desc() needs the bare column name: the
# original desc('value') ranked a constant string, so no row was reordered.
# NOTE(review): the table printed after this chunk predates the fix; re-knit
# to refresh it.
extracted_birds_data %>%
  arrange(desc(Value))
# A tibble: 30,977 × 4
   Area        Item      Year Value
   <chr>       <chr>    <dbl> <dbl>
 1 Afghanistan Chickens  1961  4700
 2 Afghanistan Chickens  1962  4900
 3 Afghanistan Chickens  1963  5000
 4 Afghanistan Chickens  1964  5300
 5 Afghanistan Chickens  1965  5500
 6 Afghanistan Chickens  1966  5800
 7 Afghanistan Chickens  1967  6600
 8 Afghanistan Chickens  1968  6290
 9 Afghanistan Chickens  1969  6300
10 Afghanistan Chickens  1970  6000
# … with 30,967 more rows
# Keep only the records for 2003. Year is parsed as a double, so compare
# against a number instead of the string "2003", which only matched through
# implicit numeric-to-character coercion.
filtered_data <- filter(extracted_birds_data, Year == 2003)

# Mean of Value for every Area, ignoring missing values.
# NOTE(review): the original note also claimed that, on average, countries
# hold more chickens (about 58.4 million head) than other livestock birds;
# that needs a per-Item summary, which this chunk does not compute - confirm.
birds_data %>%
  group_by(Area) %>%
  summarise(avg_stocks = mean(Value, na.rm = TRUE))
# A tibble: 248 × 2
   Area                avg_stocks
   <chr>                    <dbl>
 1 Afghanistan             8099. 
 2 Africa                196561. 
 3 Albania                 2278. 
 4 Algeria                17621. 
 5 American Samoa            41.4
 6 Americas              856356. 
 7 Angola                  9453. 
 8 Antigua and Barbuda       93.6
 9 Argentina              18844. 
10 Armenia                 2062. 
# … with 238 more rows
#Analyzing the mean and median of Value for each year. The means are more than an order of magnitude larger than the medians (e.g. 36,752 vs. 1,033 in 1961), which indicates a strongly right-skewed distribution: a few very large values pull the mean far above the median.
    summarise (
      avg_year = mean (Value, na.rm = TRUE),
      med_year = median(Value, na.rm = TRUE)
Adding missing grouping variables: `Year`
# A tibble: 58 × 3
    Year avg_year med_year
   <dbl>    <dbl>    <dbl>
 1  1961   36752.    1033 
 2  1962   37787.    1014 
 3  1963   38736.    1106 
 4  1964   39325.    1103 
 5  1965   40334.    1104 
 6  1966   41229.    1088.
 7  1967   43240.    1193 
 8  1968   44420.    1252.
 9  1969   45607.    1267 
10  1970   47706.    1259 
# … with 48 more rows