Class Project 1

final project

Joseph Vincent

Author

Joseph Vincent

Published

July 16, 2023

Code

library(tidyverse)
library(dplyr)
library(lubridate)

knitr::opts_chunk$set(echo = TRUE)

Reading, Cleaning and Combining Data Sets

I’m using the CDC’s database for Air Quality Measures on the National Environmental Health Tracking Network, a dataset from the state of California’s Department of Health and Human Services (HHS) on annual number of deaths by county, and per-county population data from the United States Census Bureau.

Code

#reading in air quality data
airquality <- read_csv("Final Project Data/Air_Quality_Measures_on_the_National_Environmental_Health_Tracking_Network.csv")

Rows: 218635 Columns: 14
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (8): MeasureName, MeasureType, StratificationLevel, StateName, CountyNam...
dbl (6): MeasureId, StateFips, CountyFips, ReportYear, Value, MonitorOnly

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Code

#reading in deaths dataset
calideaths <- read_csv("Final Project Data/2021-05-14_deaths_final_1999_2013_county_year_sup.csv") %>%
  #filtering for occurrence deaths (i.e. all deaths that occurred, disregarding residence)
  filter(Geography_Type == "Occurrence") %>%
  #de-selecting geography type as they are now all occurrence
  select(-Geography_Type)

Rows: 292320 Columns: 10
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (7): County, Geography_Type, Strata, Strata_Name, Cause, Cause_Desc, Ann...
dbl (3): Year, Count, Annotation_Code

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Code

#reading in population data
populations <- read_csv("Final Project Data/co-est00int-tot.csv")

Rows: 3194 Columns: 20
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (2): STNAME, CTYNAME
dbl (18): SUMLEV, REGION, DIVISION, STATE, COUNTY, ESTIMATESBASE2000, POPEST...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Code

#cleaning air quality data
airqualitycali <- airquality %>%
  #arranging by year
  arrange(`ReportYear`) %>%
  #filtering for california only and 2000-2010
  filter(StateName == "California" & `ReportYear` %in% c(2000:2010)) %>%
  #selecting only relevant columns
  select(CountyName, ReportYear, MeasureName, Value) %>%
  #renaming county and year to be consistent with deaths dataset
  rename("County" = `CountyName`, "Year" = `ReportYear`) %>%
  #pivoting so that each row is a year-county for merging data
  pivot_wider(names_from = MeasureName, values_from = Value) %>%
  #renaming Air Quality columns
  rename("Ozone Days Delete" = `Number of days with maximum 8-hour average ozone concentration over the National Ambient Air Quality Standard`,
         "Ozone Person Days Delete" = `Number of person-days with maximum 8-hour average ozone concentration over the National Ambient Air Quality Standard`,
         "PM2.5 Percent of Days Delete" = `Percent of days with PM2.5 levels over the National Ambient Air Quality Standard (NAAQS)`,
         "PM2.5 Person Days Delete" = `Person-days with PM2.5 over the National Ambient Air Quality Standard`,
         "PM2.5 Annual Average Delete" = `Annual average ambient concentrations of PM2.5 in micrograms per cubic meter (based on seasonal averages and daily measurement)`,
         "Ozone Days" = `Number of days with maximum 8-hour average ozone concentration over the National Ambient Air Quality Standard (monitor and modeled data)`,
         "Ozone Person Days" = `Number of person-days with maximum 8-hour average ozone concentration over the National Ambient Air Quality Standard (monitor and modeled data)`,
         "PM2.5 Percent of Days" = `Percent of days with PM2.5 levels over the National Ambient Air Quality Standard (monitor and modeled data)`,
         "PM2.5 Person Days" = `Number of person-days with PM2.5 over the National Ambient Air Quality Standard (monitor and modeled data)`,
         "PM2.5 Annual Average" = `Annual average ambient concentrations of PM 2.5 in micrograms per cubic meter, based on seasonal averages and daily measurement (monitor and modeled data)`)

#filling in modeled data for first year, where there is no modeled data
airqualitycali <- airqualitycali %>%
  mutate(`Ozone Days` = case_when(
    `Year` == 2000 ~ `Ozone Days Delete`,
    TRUE ~ as.numeric(as.character(`Ozone Days`)))) %>%
  mutate(`Ozone Person Days` = case_when(
    `Year` == 2000 ~ `Ozone Person Days Delete`,
    TRUE ~ as.numeric(as.character(`Ozone Person Days`)))) %>%
  mutate(`PM2.5 Percent of Days` = case_when(
    `Year` == 2000  ~ `PM2.5 Percent of Days Delete`,
    TRUE ~ as.numeric(as.character(`PM2.5 Percent of Days`)))) %>%
  mutate(`PM2.5 Person Days` = case_when(
    `Year` == 2000 ~ `PM2.5 Person Days Delete`,
    TRUE ~ as.numeric(as.character(`PM2.5 Person Days`)))) %>%
  mutate(`PM2.5 Annual Average` = case_when(
    `Year` == 2000 ~ `PM2.5 Annual Average Delete`,
    TRUE ~ as.numeric(as.character(`PM2.5 Annual Average`)))) %>%
  select(!contains("Delete"))

#cleaning deaths data
calideathsclean <- calideaths %>%
  #filtering for 2000-2010
  filter(`Year` %in% c(2000:2010)) %>%
  #filling in suppressed data
  mutate(Count = case_when(
    Annotation_Code == 1 ~ 0,
    TRUE ~ as.numeric(as.character(Count)))) %>%
  #de-selecting some unused columns
  select(-Cause_Desc, -Annotation_Code, -Annotation_Desc) %>%
  #focusing on relevant conditions
  filter(Cause %in% c("ALL", "CLD", "HTD")) %>%
  rename("Deaths" = `Count`) %>%
  rename("Strata Name" = `Strata_Name`)

# cleaning population data
populationstidy <- populations %>%
  filter(STNAME == "California" & CTYNAME != "California") %>%
  separate(CTYNAME, into = c("County", "Delete"), sep = " County") %>%
  select(County, POPESTIMATE2000:POPESTIMATE2010, -CENSUS2010POP) %>%
  rename("2000" = POPESTIMATE2000,
         "2001" = POPESTIMATE2001,
         "2002" = POPESTIMATE2002,
         "2003" = POPESTIMATE2003,
         "2004" = POPESTIMATE2004,
         "2005" = POPESTIMATE2005,
         "2006" = POPESTIMATE2006,
         "2007" = POPESTIMATE2007,
         "2008" = POPESTIMATE2008,
         "2009" = POPESTIMATE2009,
         "2010" = POPESTIMATE2010) %>%
  pivot_longer(col = c(`2000`:`2010`), names_to = "Year", values_to = "Population") %>%
  mutate(`Year` = as.numeric(Year))

#merging data
airqualityanddeaths <- left_join(calideathsclean, airqualitycali, by = c("County", "Year"))
airqualityanddeaths <- left_join(airqualityanddeaths, populationstidy, by = c("County", "Year"))

#For the remaining missing air quality data not filled in by models in 2000, replacing NAs with zero, as they are all small rural counties that will skew means higher for 2000
airqualityanddeaths <- airqualityanddeaths %>%
  mutate(`Ozone Days` = replace_na(`Ozone Days`, 0),
         `Ozone Person Days` = replace_na(`Ozone Person Days`, 0),
         `PM2.5 Percent of Days` = replace_na(`PM2.5 Percent of Days`, 0),
         `PM2.5 Person Days` = replace_na(`PM2.5 Person Days`, 0),
         `PM2.5 Annual Average` = replace_na(`PM2.5 Annual Average`, 0)) %>%
# Creating standardized deaths per 100,000 column
  mutate("Deaths per 100,000" = `Deaths`/`Population`*100000) %>%
# Creating a raw PM2.5 Days column
  mutate("PM2.5 Days" = `PM2.5 Percent of Days`/100*365) %>%
# Re-arranging
  select(Year, County, Population, Strata, `Strata Name`, Cause, Deaths, `Deaths per 100,000`, `Ozone Days`, `Ozone Person Days`, `PM2.5 Days`, `PM2.5 Person Days`, `PM2.5 Percent of Days`, `PM2.5 Annual Average`) %>% 
# Turning Year into a date for ease of plotting time series'
  mutate(Year = make_date(year=Year)) %>%
  filter(County != "Sierra")

Creating a Pared-down Data Frame for Project 1

The columns I’m mainly interested in are Deaths per 100,000 and PM2.5 Days. PM2.5 Days is defined as the number of days in a calendar year where PM2.5 levels (Particulate Matter less than 2.5 nm in diatmeter) exceeded the national average. Essentially, higher PM2.5 Days means there was worse air quality in that county during the year. Each observation is a county-year. The data ranges from 2000-2010 for the state of California.

Code

partonedata <- airqualityanddeaths %>%
  filter(Strata == "Total Population" &
           Cause == "ALL") %>%
  select(Year, County, `Deaths per 100,000`, `PM2.5 Days`, Population)
head(partonedata)

# A tibble: 6 × 5
  Year       County    `Deaths per 100,000` `PM2.5 Days` Population
  <date>     <chr>                    <dbl>        <dbl>      <dbl>
1 2000-01-01 Alameda                   677.        26.6     1449840
2 2000-01-01 Alpine                    910.         0          1209
3 2000-01-01 Amador                    964.         0         35153
4 2000-01-01 Butte                    1123.        47.9      203807
5 2000-01-01 Calaveras                 664.         5.79      40645
6 2000-01-01 Colusa                    648.         0         18817

1

a. Plotting PDFs and CDFs

There is clearly a greater density of PM2.5 days at or near 0 for the year. There is an extremely noticable right (positive) skew.

The Deaths per 100,000 data looks a bit closer to a normal distribution; however, there is some left (negative) skew.

Code

partonedata %>%
  ggplot(aes(`PM2.5 Days`)) +
  geom_density() +
  labs(title = "PM2.5 Days PDF",
       x = "PM2.5 Days",
       y = "Density")

Code

partonedata %>%
  ggplot(aes(`PM2.5 Days`)) +
  stat_ecdf(geom = "step") +
  labs(title = "PM2.5 Days CDF",
       x = "PM2.5 Days",
       y = "Cumulative Density")

Code

partonedata %>%
  ggplot(aes(`Deaths per 100,000`)) +
  geom_density() +
  labs(title = "Deaths per 100,000 PDF",
       x = "Deaths per 100,000",
       y = "Density")

Code

partonedata %>%
  ggplot(aes(`Deaths per 100,000`)) +
  stat_ecdf(geom = "step") +
  labs(title = "Deaths per 100,000 CDF",
       x = "Deaths per 100,000",
       y = "Cumulative Density")

b. Mean and Standard Deviation of Variables, Grouped by Year

Mean Deaths per 100,000 and Mean PM2.5 Days both noticably decline throughout the decade when grouped by year, as seen in the table below.

Code

averages <- partonedata %>%
  group_by(Year) %>%
  summarise(MeanDeathsper100k = mean(`Deaths per 100,000`),
            SDDeathsper100k = sd(`Deaths per 100,000`),
            MeanPM2.5Days = mean(`PM2.5 Days`),
            SDPM2.5Days = sd(`PM2.5 Days`),
            MeanPopulation = mean(Population),
            SDPopulatoin = sd(Population))
averages

# A tibble: 11 × 7
   Year       MeanDeathsper100k SDDeathsper100k MeanPM…¹ SDPM2…² MeanP…³ SDPop…⁴
   <date>                 <dbl>           <dbl>    <dbl>   <dbl>   <dbl>   <dbl>
 1 2000-01-01              746.            197.    20.1    26.3  596218.  1.36e6
 2 2001-01-01              729.            218.    17.5    23.3  604841.  1.38e6
 3 2002-01-01              728.            216.    22.0    21.4  611726.  1.39e6
 4 2003-01-01              757.            211.     8.09   15.7  618415.  1.40e6
 5 2004-01-01              703.            209.     9.66   12.5  624055.  1.41e6
 6 2005-01-01              720.            218.    10.0    11.3  628501.  1.41e6
 7 2006-01-01              717.            216.     8.33    9.51 631891.  1.40e6
 8 2007-01-01              694.            217.    12.5    16.5  635911.  1.40e6
 9 2008-01-01              709.            194.    17.2    12.2  642123.  1.41e6
10 2009-01-01              683.            210.     5.67   10.1  648386.  1.42e6
11 2010-01-01              688.            214.     2.67    5.91 655195.  1.43e6
# … with abbreviated variable names ¹MeanPM2.5Days, ²SDPM2.5Days,
#   ³MeanPopulation, ⁴SDPopulatoin

c. Splitting Data by PM2.5 Days Greater or Less than Mean

I chose to split my data by county-years greater or equal to the mean PM2.5 Days and county-years less than the mean PM2.5 Days.

Code

PM2.5greaterthanmean <- partonedata %>%
  filter(`PM2.5 Days` >= mean(`PM2.5 Days`))
  
PM2.5lessthanmean <- partonedata %>%
  filter(`PM2.5 Days` < mean(`PM2.5 Days`))

d. For Split Data, Finding Differences in Means, Standard Errors and Confidence Intervals

Difference in Mean Deaths per 100,000

Surprisingly, the mean Deaths per 100,000 rate for counties with greater PM2.5 days is about 40 less than the mean for counties with less PM2.5 days. I will explore this more below.

Code

#Finding difference in Means
meangreater <- mean(PM2.5greaterthanmean$`Deaths per 100,000`)
meangreater

[1] 688.9798

Code

meanless <- mean(PM2.5lessthanmean$`Deaths per 100,000`)
meanless

[1] 728.2283

Code

meangreater - meanless

[1] -39.24847

Standard Error of Deaths per 100,000 when PM2.5 >= Mean

The standard error of Deaths per 100,000 for PM2.5 days greater than the mean is about 10.

Code

# Calculating Standard Error of Deaths per 100,000 when PM2.5 >= Mean
StandarderrorPM2.5greater <- (sd(PM2.5greaterthanmean$`Deaths per 100,000`)/(sqrt(nrow(PM2.5greaterthanmean))))
StandarderrorPM2.5greater

[1] 10.07148

Confidence Interval of Deaths per 100,000 when PM2.5 >= Mean

The confidence interval ranges from the lower bound of around 669 to the upper bound of arround 709.

Code

# Calculating Confidence Interval
lowergreater <- meangreater - 1.96 * StandarderrorPM2.5greater
lowergreater

[1] 669.2397

Code

uppergreater <- meangreater + 1.96 * StandarderrorPM2.5greater
uppergreater

[1] 708.7199

Standard Error of Deaths per 100,000 when PM2.5 < Mean

The standard error of Deaths per 100,000 for PM2.5 days less than the mean is about 11. This is larger standard error than for the ‘greater than mean’ counties and contributes to the wider confidence interval as seen below.

Code

# Calculating Standard Error of Deaths per 100,000 when PM2.5 < Mean
StandarderrorPM2.5less <- (sd(PM2.5lessthanmean$`Deaths per 100,000`)/(sqrt(nrow(PM2.5lessthanmean))))
StandarderrorPM2.5less

[1] 11.34218

Confidence Interval of Deaths per 100,000 when PM2.5 < Mean

The confidence interval ranges from the lower bound of around 666 to the upper bound of arround 711.

Code

# Calculating Confidence Interval
lowerless <- meangreater - 1.96 * StandarderrorPM2.5less
lowerless

[1] 666.7492

Code

upperless <- meangreater + 1.96 * StandarderrorPM2.5less
upperless

[1] 711.2105

e. Creating a Scatter Plot of All Data

It is clear from the scatter plot below that there is an extremely high variance in Deaths per 100,000 for counties with low, or zero, PM2.5 Days. Once past about 30 PM2.5 Days, there does appear to be a slight upward trend in death rate. I believe there is ommitted variable bias present, which I will explore more moving forward.

Code

partonedata %>%
  ggplot(aes(x = `PM2.5 Days`, y = `Deaths per 100,000`)) +
  geom_point() +
  labs(title = "PM2.5 Days vs Deaths per 100k",
       subtitle = "Across all County-Years, CA, 2000-2010",
       x = "PM 2.5 Days",
       y = "Deaths per 100,000")

2.

Performing Simple Regression Analysis on Entire Data Set

Surprisingly, the beta estimator for PM2.5 days is -1.2, suggesting that for each additional PM2.5 Day in a year, a county is expected to have 1.2 less deaths per 100,000.

Code

alldatalm <- lm(formula = `Deaths per 100,000` ~ `PM2.5 Days`, data = partonedata)
summary(alldatalm)


Call:
lm(formula = `Deaths per 100,000` ~ `PM2.5 Days`, data = partonedata)

Residuals:
    Min      1Q  Median      3Q     Max 
-730.43 -133.95  -15.82  133.68  580.06 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)   730.427     10.273   71.10   <2e-16 ***
`PM2.5 Days`   -1.206      0.490   -2.46   0.0141 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 209.6 on 625 degrees of freedom
Multiple R-squared:  0.009593,  Adjusted R-squared:  0.008008 
F-statistic: 6.054 on 1 and 625 DF,  p-value: 0.01415

a. Creating a Scatter Plot with a Linear Regression Line

Below, I’ve added this simple regression line to the scatter plot. Clearly, this is not the trend you would expect, and I believe there are a large number of variables missing from this analysis that might explain this.

Code

partonedata %>%
  ggplot(aes(x = `PM2.5 Days`, y = `Deaths per 100,000`)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(title = "PM2.5 Days vs Deaths per 100k",
       subtitle = "Across all County-Years, CA, 2000-2010",
       x = "PM 2.5 Days",
       y = "Deaths per 100,000")

`geom_smooth()` using formula = 'y ~ x'

Looking Ahead at Population As a Variable

Quickly, I’ll look ahead at what happens when you split counties by those greater or less than the mean for population. My theory is that small counties will have a large variance in death rate due to the sample size being much smaller. Additionally, there may be other factors at play that affect death rate for small, rural counties that I cannot account for here.

Code

Popgreaterthanmean <- partonedata %>%
  filter(Population >= mean(Population))
  
Poplessthanmean <- partonedata %>%
  filter(Population < mean(Population))

Performing Simple Regression Analysis on Split Data

The beta estimator for PM2.5 Days is indeed positive when only looking at larger counties. This will need to be explored further. I’ve also plotted the scatter plots with regression lines below.

Code

Popgreaterlm <- lm(formula = `Deaths per 100,000` ~ `PM2.5 Days`, data = Popgreaterthanmean)
summary(Popgreaterlm)


Call:
lm(formula = `Deaths per 100,000` ~ `PM2.5 Days`, data = Popgreaterthanmean)

Residuals:
     Min       1Q   Median       3Q      Max 
-107.803  -56.364   -0.728   34.238  206.137 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  650.7043     7.5990   85.63   <2e-16 ***
`PM2.5 Days`   0.4769     0.2249    2.12   0.0355 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 67.74 on 159 degrees of freedom
Multiple R-squared:  0.0275,    Adjusted R-squared:  0.02138 
F-statistic: 4.495 on 1 and 159 DF,  p-value: 0.03554

Code

Poplesslm <- lm(formula = `Deaths per 100,000` ~ `PM2.5 Days`, data = Poplessthanmean)
summary(Poplesslm)


Call:
lm(formula = `Deaths per 100,000` ~ `PM2.5 Days`, data = Poplessthanmean)

Residuals:
    Min      1Q  Median      3Q     Max 
-750.72 -165.05   13.89  179.67  583.93 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  750.7222    13.4077  55.992   <2e-16 ***
`PM2.5 Days`  -2.0413     0.9552  -2.137   0.0331 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 237.1 on 464 degrees of freedom
Multiple R-squared:  0.009747,  Adjusted R-squared:  0.007613 
F-statistic: 4.567 on 1 and 464 DF,  p-value: 0.03311

Plotting Split Data with Regression Lines

Code

Popgreaterthanmean %>%
  ggplot(aes(x = `PM2.5 Days`, y = `Deaths per 100,000`)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(title = "PM2.5 Days vs Deaths per 100k",
       subtitle = "County-Years with Pop > Mean, CA, 2000-2010",
       x = "PM 2.5 Days",
       y = "Deaths per 100,000")

`geom_smooth()` using formula = 'y ~ x'

Code

Poplessthanmean %>%
  ggplot(aes(x = `PM2.5 Days`, y = `Deaths per 100,000`)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(title = "PM2.5 Days vs Deaths per 100k",
       subtitle = "County-Years with Pop < Mean, CA, 2000-2010",
       x = "PM 2.5 Days",
       y = "Deaths per 100,000")

`geom_smooth()` using formula = 'y ~ x'