library(tidyverse)
library(ggplot2)
# Global knitr chunk options: echo the code, and keep warnings and messages
# out of the rendered output.
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
Homework 2
Homework Overview
HW 2: For this homework, your goal is to read in a more complicated dataset. Please use the category tag “hw2” as well as a tag for the dataset you choose to use. Read in a dataset. It’s strongly recommended that you choose a dataset you’re considering using for the final project. If you decide to use one of the datasets we have provided, please use a challenging dataset - check with us if you are not sure. Clean the data as needed using dplyr and related tidyverse packages. Provide a narrative about the data set (look it up if you aren’t sure what you have got) and the variables in your dataset, including what type of data each variable is. The goal of this step is to communicate in a visually appealing way to non-experts - not to replicate r-code. Identify potential research questions that your dataset can help answer.
Describe the Data
# Read the raw dataset from the CSV file into `world_data`.
world_data <- read_csv("_data/world-data-2023.csv")
# Print the data so a preview appears in the rendered document.
world_data
# List the name of every column in the dataset
names(world_data)
[1] "Country"
[2] "Density\n(P/Km2)"
[3] "Abbreviation"
[4] "Agricultural Land( %)"
[5] "Land Area(Km2)"
[6] "Armed Forces size"
[7] "Birth Rate"
[8] "Calling Code"
[9] "Capital/Major City"
[10] "Co2-Emissions"
[11] "CPI"
[12] "CPI Change (%)"
[13] "Currency-Code"
[14] "Fertility Rate"
[15] "Forested Area (%)"
[16] "Gasoline Price"
[17] "GDP"
[18] "Gross primary education enrollment (%)"
[19] "Gross tertiary education enrollment (%)"
[20] "Infant mortality"
[21] "Largest city"
[22] "Life expectancy"
[23] "Maternal mortality ratio"
[24] "Minimum wage"
[25] "Official language"
[26] "Out of pocket health expenditure"
[27] "Physicians per thousand"
[28] "Population"
[29] "Population: Labor force participation (%)"
[30] "Tax revenue (%)"
[31] "Total tax rate"
[32] "Unemployment rate"
[33] "Urban_population"
[34] "Latitude"
[35] "Longitude"
# Distinct values of the Country column, in order of first appearance
unique(world_data[["Country"]])
[1] "Afghanistan" "Albania"
[3] "Algeria" "Andorra"
[5] "Angola" "Antigua and Barbuda"
[7] "Argentina" "Armenia"
[9] "Australia" "Austria"
[11] "Azerbaijan" "The Bahamas"
[13] "Bahrain" "Bangladesh"
[15] "Barbados" "Belarus"
[17] "Belgium" "Belize"
[19] "Benin" "Bhutan"
[21] "Bolivia" "Bosnia and Herzegovina"
[23] "Botswana" "Brazil"
[25] "Brunei" "Bulgaria"
[27] "Burkina Faso" "Burundi"
[29] "Ivory Coast" "Cape Verde"
[31] "Cambodia" "Cameroon"
[33] "Canada" "Central African Republic"
[35] "Chad" "Chile"
[37] "China" "Colombia"
[39] "Comoros" "Republic of the Congo"
[41] "Costa Rica" "Croatia"
[43] "Cuba" "Cyprus"
[45] "Czech Republic" "Democratic Republic of the Congo"
[47] "Denmark" "Djibouti"
[49] "Dominica" "Dominican Republic"
[51] "Ecuador" "Egypt"
[53] "El Salvador" "Equatorial Guinea"
[55] "Eritrea" "Estonia"
[57] "Eswatini" "Ethiopia"
[59] "Fiji" "Finland"
[61] "France" "Gabon"
[63] "The Gambia" "Georgia"
[65] "Germany" "Ghana"
[67] "Greece" "Grenada"
[69] "Guatemala" "Guinea"
[71] "Guinea-Bissau" "Guyana"
[73] "Haiti" "Vatican City"
[75] "Honduras" "Hungary"
[77] "Iceland" "India"
[79] "Indonesia" "Iran"
[81] "Iraq" "Republic of Ireland"
[83] "Israel" "Italy"
[85] "Jamaica" "Japan"
[87] "Jordan" "Kazakhstan"
[89] "Kenya" "Kiribati"
[91] "Kuwait" "Kyrgyzstan"
[93] "Laos" "Latvia"
[95] "Lebanon" "Lesotho"
[97] "Liberia" "Libya"
[99] "Liechtenstein" "Lithuania"
[101] "Luxembourg" "Madagascar"
[103] "Malawi" "Malaysia"
[105] "Maldives" "Mali"
[107] "Malta" "Marshall Islands"
[109] "Mauritania" "Mauritius"
[111] "Mexico" "Federated States of Micronesia"
[113] "Moldova" "Monaco"
[115] "Mongolia" "Montenegro"
[117] "Morocco" "Mozambique"
[119] "Myanmar" "Namibia"
[121] "Nauru" "Nepal"
[123] "Netherlands" "New Zealand"
[125] "Nicaragua" "Niger"
[127] "Nigeria" "North Korea"
[129] "North Macedonia" "Norway"
[131] "Oman" "Pakistan"
[133] "Palau" "Palestinian National Authority"
[135] "Panama" "Papua New Guinea"
[137] "Paraguay" "Peru"
[139] "Philippines" "Poland"
[141] "Portugal" "Qatar"
[143] "Romania" "Russia"
[145] "Rwanda" "Saint Kitts and Nevis"
[147] "Saint Lucia" "Saint Vincent and the Grenadines"
[149] "Samoa" "San Marino"
[151] "S�����������" "Saudi Arabia"
[153] "Senegal" "Serbia"
[155] "Seychelles" "Sierra Leone"
[157] "Singapore" "Slovakia"
[159] "Slovenia" "Solomon Islands"
[161] "Somalia" "South Africa"
[163] "South Korea" "South Sudan"
[165] "Spain" "Sri Lanka"
[167] "Sudan" "Suriname"
[169] "Sweden" "Switzerland"
[171] "Syria" "Tajikistan"
[173] "Tanzania" "Thailand"
[175] "East Timor" "Togo"
[177] "Tonga" "Trinidad and Tobago"
[179] "Tunisia" "Turkey"
[181] "Turkmenistan" "Tuvalu"
[183] "Uganda" "Ukraine"
[185] "United Arab Emirates" "United Kingdom"
[187] "United States" "Uruguay"
[189] "Uzbekistan" "Vanuatu"
[191] "Venezuela" "Vietnam"
[193] "Yemen" "Zambia"
[195] "Zimbabwe"
For my final project, I will be analyzing a comprehensive dataset that provides various socio-economic and demographic information from 195 countries. This dataset encompasses a wide range of information, including statistics related to population characteristics, environmental aspects, economic factors, educational indicators, healthcare measures, and various other data points.
Tidy the Data
# Does the dataset contain at least one missing value anywhere?
any(is.na(world_data))
[1] TRUE
# Count the missing values per column, then keep the names of the columns
# that have at least one
na_per_column <- colSums(is.na(world_data))
names(na_per_column[na_per_column > 0])
[1] "Abbreviation"
[2] "Agricultural Land( %)"
[3] "Land Area(Km2)"
[4] "Armed Forces size"
[5] "Birth Rate"
[6] "Calling Code"
[7] "Capital/Major City"
[8] "Co2-Emissions"
[9] "CPI"
[10] "CPI Change (%)"
[11] "Currency-Code"
[12] "Fertility Rate"
[13] "Forested Area (%)"
[14] "Gasoline Price"
[15] "GDP"
[16] "Gross primary education enrollment (%)"
[17] "Gross tertiary education enrollment (%)"
[18] "Infant mortality"
[19] "Largest city"
[20] "Life expectancy"
[21] "Maternal mortality ratio"
[22] "Minimum wage"
[23] "Official language"
[24] "Out of pocket health expenditure"
[25] "Physicians per thousand"
[26] "Population"
[27] "Population: Labor force participation (%)"
[28] "Tax revenue (%)"
[29] "Total tax rate"
[30] "Unemployment rate"
[31] "Urban_population"
[32] "Latitude"
[33] "Longitude"
This data set has several missing values, and running the code above shows that there are missing values in every column except country name and density. There are also several rows where a name is unfinished or incomplete. To tidy the data, I’ve replaced all incomplete names of countries, capital cities, and largest cities with the correct names.
# Repair names that are truncated or mis-encoded in the raw file.
# Rows are addressed by position, so this must run on `world_data` exactly as
# it was read in (before any filtering or re-ordering).

# Row positions to overwrite in each column
rows_to_change_country <- c(151)
rows_to_change_capital <- c(24, 32, 38, 41, 77, 105, 113, 137, 151, 176, 177)
rows_to_change_largest_city <- c(
  24, 38, 41, 44, 77, 105, 113, 151, 169, 170, 176, 177
)

# Replacement values, in the same order as the row positions above
new_names_country <- c("Sao Tome and Principe")
new_names_capital <- c(
  "Brasília", "Yaoundé", "Bogotá", "San José", "Reykjavík", "Malé",
  "Chişinău", "Asunción", "São Tomé", "Lomé", "Nuku'alofa"
)
# NOTE(review): row 24 is Brazil in the country listing; its largest city is
# presumably São Paulo rather than Brasília -- confirm against the raw value.
new_names_largest_city <- c(
  "Brasília", "Bogotá", "San José", "Nicosia", "Reykjavík", "Malé",
  "Chişinău", "São Tomé", "Stockholm", "Zürich", "Lomé", "Nuku'alofa"
)

# Each row position needs exactly one replacement; otherwise the assignment
# below would silently recycle or misalign values.
stopifnot(
  length(rows_to_change_country) == length(new_names_country),
  length(rows_to_change_capital) == length(new_names_capital),
  length(rows_to_change_largest_city) == length(new_names_largest_city)
)

# Apply changes to the specified rows and columns
world_data$Country[rows_to_change_country] <- new_names_country
world_data$`Capital/Major City`[rows_to_change_capital] <- new_names_capital
world_data$`Largest city`[rows_to_change_largest_city] <- new_names_largest_city

# Print the cleaned data
world_data
Potential research questions
Potential research questions to investigate are:
Analyzing the correlation between unemployment rate and other indicators such as armed forces size, life expectancy, and population labor force participation.
An environmental analysis of factors like CO2 emissions and their correlation with factors like fertility rate, infant mortality, maternal mortality ratio, and life expectancy.
An education analysis using gross primary and tertiary education enrollment and its correlation with factors like unemployment rate, GDP, fertility rate, and urban population size.