Read and describe dataset

challenge1
Neha Jhurani
birds.csv, wild_bird_data.xlsx
Author

Neha Jhurani

Published

February 22, 2023

Code
library(tidyverse)

knitr::opts_chunk$set(echo = TRUE)

Analysing birds data

Code
library(readr)

#reading birds csv data
birds_data <- read_csv("_data/birds.csv")
Rows: 30977 Columns: 14
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (8): Domain Code, Domain, Area, Element, Item, Unit, Flag, Flag Description
dbl (6): Area Code, Element Code, Item Code, Year Code, Year, Value

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
#visualizing birds data in spreadsheet style
view(birds_data)

#getting the number of rows and columns in the csv
dim(birds_data)
[1] 30977    14
Code
#viewing the last few rows of the birds data
tail(birds_data)
# A tibble: 6 × 14
  Domai…¹ Domain Area …² Area  Eleme…³ Element Item …⁴ Item  Year …⁵  Year Unit 
  <chr>   <chr>    <dbl> <chr>   <dbl> <chr>     <dbl> <chr>   <dbl> <dbl> <chr>
1 QA      Live …    5504 Poly…    5112 Stocks     1068 Ducks    2013  2013 1000…
2 QA      Live …    5504 Poly…    5112 Stocks     1068 Ducks    2014  2014 1000…
3 QA      Live …    5504 Poly…    5112 Stocks     1068 Ducks    2015  2015 1000…
4 QA      Live …    5504 Poly…    5112 Stocks     1068 Ducks    2016  2016 1000…
5 QA      Live …    5504 Poly…    5112 Stocks     1068 Ducks    2017  2017 1000…
6 QA      Live …    5504 Poly…    5112 Stocks     1068 Ducks    2018  2018 1000…
# … with 3 more variables: Value <dbl>, Flag <chr>, `Flag Description` <chr>,
#   and abbreviated variable names ¹​`Domain Code`, ²​`Area Code`,
#   ³​`Element Code`, ⁴​`Item Code`, ⁵​`Year Code`
Code
#extracting all the column names
colnames(birds_data)
 [1] "Domain Code"      "Domain"           "Area Code"        "Area"            
 [5] "Element Code"     "Element"          "Item Code"        "Item"            
 [9] "Year Code"        "Year"             "Unit"             "Value"           
[13] "Flag"             "Flag Description"
Code
#selecting only the unique values of Item column
unique(select(birds_data, c("Item")))
# A tibble: 5 × 1
  Item                  
  <chr>                 
1 Chickens              
2 Ducks                 
3 Geese and guinea fowls
4 Turkeys               
5 Pigeons, other birds  
Code
#filtering all the rows which has the Year as 2018
filter(birds_data, Year == "2018")
# A tibble: 577 × 14
   Domain Cod…¹ Domain Area …² Area  Eleme…³ Element Item …⁴ Item  Year …⁵  Year
   <chr>        <chr>    <dbl> <chr>   <dbl> <chr>     <dbl> <chr>   <dbl> <dbl>
 1 QA           Live …       2 Afgh…    5112 Stocks     1057 Chic…    2018  2018
 2 QA           Live …       3 Alba…    5112 Stocks     1057 Chic…    2018  2018
 3 QA           Live …       3 Alba…    5112 Stocks     1068 Ducks    2018  2018
 4 QA           Live …       3 Alba…    5112 Stocks     1072 Gees…    2018  2018
 5 QA           Live …       3 Alba…    5112 Stocks     1079 Turk…    2018  2018
 6 QA           Live …       4 Alge…    5112 Stocks     1057 Chic…    2018  2018
 7 QA           Live …       4 Alge…    5112 Stocks     1068 Ducks    2018  2018
 8 QA           Live …       4 Alge…    5112 Stocks     1072 Gees…    2018  2018
 9 QA           Live …       4 Alge…    5112 Stocks     1079 Turk…    2018  2018
10 QA           Live …       5 Amer…    5112 Stocks     1057 Chic…    2018  2018
# … with 567 more rows, 4 more variables: Unit <chr>, Value <dbl>, Flag <chr>,
#   `Flag Description` <chr>, and abbreviated variable names ¹​`Domain Code`,
#   ²​`Area Code`, ³​`Element Code`, ⁴​`Item Code`, ⁵​`Year Code`
Code
#finding the mean value for each item in a specific area during a specific year
birds_data %>% group_by(Year,Area,Item) %>% summarise(mean(Value), .groups = 'drop')
# A tibble: 30,977 × 4
    Year Area        Item                   `mean(Value)`
   <dbl> <chr>       <chr>                          <dbl>
 1  1961 Afghanistan Chickens                        4700
 2  1961 Africa      Chickens                      274201
 3  1961 Africa      Ducks                           6231
 4  1961 Africa      Geese and guinea fowls          3882
 5  1961 Africa      Pigeons, other birds            5800
 6  1961 Africa      Turkeys                         1213
 7  1961 Albania     Chickens                        1580
 8  1961 Albania     Ducks                             NA
 9  1961 Albania     Geese and guinea fowls            NA
10  1961 Albania     Turkeys                           NA
# … with 30,967 more rows
Code
#summarizing the values in each column of birds data, It gives us (minimum, 1st quartile, median, mean, 3rd quartile, maximum, number of NA's present) for a column that has numerical values and gives (length, class, mode) for a column that has character values
summary(birds_data)
 Domain Code           Domain            Area Code        Area          
 Length:30977       Length:30977       Min.   :   1   Length:30977      
 Class :character   Class :character   1st Qu.:  79   Class :character  
 Mode  :character   Mode  :character   Median : 156   Mode  :character  
                                       Mean   :1202                     
                                       3rd Qu.: 231                     
                                       Max.   :5504                     
                                                                        
  Element Code    Element            Item Code        Item          
 Min.   :5112   Length:30977       Min.   :1057   Length:30977      
 1st Qu.:5112   Class :character   1st Qu.:1057   Class :character  
 Median :5112   Mode  :character   Median :1068   Mode  :character  
 Mean   :5112                      Mean   :1066                     
 3rd Qu.:5112                      3rd Qu.:1072                     
 Max.   :5112                      Max.   :1083                     
                                                                    
   Year Code         Year          Unit               Value         
 Min.   :1961   Min.   :1961   Length:30977       Min.   :       0  
 1st Qu.:1976   1st Qu.:1976   Class :character   1st Qu.:     171  
 Median :1992   Median :1992   Mode  :character   Median :    1800  
 Mean   :1991   Mean   :1991                      Mean   :   99411  
 3rd Qu.:2005   3rd Qu.:2005                      3rd Qu.:   15404  
 Max.   :2018   Max.   :2018                      Max.   :23707134  
                                                  NA's   :1036      
     Flag           Flag Description  
 Length:30977       Length:30977      
 Class :character   Class :character  
 Mode  :character   Mode  :character  
                                      
                                      
                                      
                                      

Analysing wild birds data

Code
library(readxl)

#reading wild_bird_data xlsx
wild_bird_data <- read_xlsx("_data/wild_bird_data.xlsx")

#visualizing wild bird data in spreadsheet style
view(wild_bird_data)

#extracting the number of rows and columns in the excel
dim(wild_bird_data)
[1] 147   2
Code
#viewing the last 10 rows of wild bird data
tail(wild_bird_data, n = 10)
# A tibble: 10 × 2
   Reference        `Taken from Figure 1 of Nee et al.`
   <chr>            <chr>                              
 1 4450.50815600577 4788.66353161336                   
 2 4223.72945322751 433.148027881688                   
 3 2320.09569921356 151.097182999144                   
 4 1064.32682601983 106.902249798449                   
 5 1137.96479906865 53.9362035620238                   
 6 1003.03939853867 22.3522084608936                   
 7 1042.06074444654 1758.76989729032                   
 8 1106.07510035687 3974.59483110465                   
 9 1368.36501582366 9797.18806953683                   
10 2053.74863827143 20660.849081157                    
Code
#extracting all the column names
colnames(wild_bird_data)
[1] "Reference"                         "Taken from Figure 1 of Nee et al."
Code
#extracting a subset of wild bird data
wild_bird_data[4:12,]
# A tibble: 9 × 2
  Reference        `Taken from Figure 1 of Nee et al.`
  <chr>            <chr>                              
1 8.63858738018464 2592996.86778979                   
2 10.6897349302105 3524193.2266336                    
3 7.41722577905587 389806.168891807                   
4 9.1169347252776  604765.97978904                    
5 8.03684333000353 192360.511579436                   
6 8.70473119796067 250452.449623033                   
7 8.89032317828959 16997.4156415239                   
8 9.51590845877281 595.09393677964                    
9 10.9430490453536 864.665387886239