HNguyen HW 4

HW 4: Descriptive statistics and data visualization.

Henry Nguyen
2022-04-22

Load Packages

HIV

1a. Read in your dataset

Import Data HIV Incidence Data

# Data set #1: HIV incidence by state.
# Source: https://gis.cdc.gov/grasp/nchhstpatlas/tables.html

# Read the export (its first 10 lines are skipped), keep columns 2, 3, 5 and 6
# by position, and name the fourth retained column `Rate`.
HIV.State <- read_csv("HIV.by.State.CSV", skip = 10) %>%
  select(2, 3, 5, 6) %>%
  rename(Rate = 4)

# Wide layout of the case counts: the first three columns of HIV.State are
# spread so that each value of Year becomes its own column holding Cases.
HIV.State.Wide.Year <- pivot_wider(
  select(HIV.State, 1:3),
  names_from = Year,
  values_from = Cases
)

1b - Compute descriptive Statistics

I will compute the mean, median, and standard deviation of cases by state, by year, and in total.

Here I calculate the mean and median for cases across all states from 2008-2021.

# Overall mean and median of Cases taken over every row of HIV.State.
HIV.Statistics.Total <- summarise(
  HIV.State,
  HIV.Mean.Total = mean(Cases),
  HIV.Median.Total = median(Cases)
)

HIV.Statistics.Total
# A tibble: 1 × 2
  HIV.Mean.Total HIV.Median.Total
           <dbl>            <dbl>
1           754.             352.

Next, I calculate the mean, median, and standard deviation for each year from 2008-2019.

Rate calculations are from 2008-2019, as there are no data for the years 2020 and 2021.

Now I calculate the mean, median, and SD for each state from 2008-2021

# HIV rows grouped by Geography; this grouped table is reused by later chunks.
By.State <- HIV.State %>%
  group_by(Geography)

# Mean, median and standard deviation of Cases within each Geography,
# ordered from the smallest median to the largest.
HIV.Statistics.by.State <- By.State %>%
  summarise(
    HIV.Mean.by.State = mean(Cases),
    HIV.Median.by.State = median(Cases),
    HIV.SD.by.State = sd(Cases)
  ) %>%
  arrange(HIV.Median.by.State)

HIV.Statistics.by.State
# A tibble: 51 × 4
   Geography     HIV.Mean.by.State HIV.Median.by.State HIV.SD.by.State
   <chr>                     <dbl>               <dbl>           <dbl>
 1 Vermont                    13.8                14              5.10
 2 Wyoming                    14.2                14.5            5.51
 3 North Dakota               23.6                19.5           12.6 
 4 Montana                    20.9                20.5            6.07
 5 Alaska                     28                  27.5            7.57
 6 South Dakota               29.1                29.5            7.29
 7 Idaho                      36.6                36.5            9.39
 8 New Hampshire              36.6                38              8.08
 9 Maine                      41.4                46             13.6 
10 Rhode Island               82.2                77.5           25.2 
# … with 41 more rows
# Restrict the grouped HIV rows to years before 2020.
Rate.By.State <- By.State %>%
  filter(Year < 2020)

# Rate is converted to numeric so that it can be averaged below.
Rate.By.State$Rate <- as.numeric(Rate.By.State$Rate)

Rate.By.State <- Rate.By.State %>%
  group_by(Geography)

# Mean, median and standard deviation of Rate within each Geography,
# ordered from the smallest median to the largest.
HIV.Statistics.by.State.Rate <- Rate.By.State %>%
  summarise(
    HIV.Rate.Mean.by.State = mean(Rate),
    HIV.Rate.Median.by.State = median(Rate),
    HIV.Rate.SD.by.State = sd(Rate)
  ) %>%
  arrange(HIV.Rate.Median.by.State)

HIV.Statistics.by.State.Rate
# A tibble: 51 × 4
   Geography     HIV.Rate.Mean.by… HIV.Rate.Median.b… HIV.Rate.SD.by.…
   <chr>                     <dbl>              <dbl>            <dbl>
 1 Montana                    2.62               2.55            0.589
 2 Vermont                    2.78               2.85            0.805
 3 Idaho                      2.91               2.9             0.779
 4 North Dakota               3.86               3.2             1.99 
 5 Wyoming                    3.24               3.25            1.10 
 6 New Hampshire              3.37               3.4             0.656
 7 Maine                      3.92               4.2             0.972
 8 South Dakota               4.29               4.25            0.908
 9 Iowa                       4.45               4.55            0.485
10 Alaska                     4.93               4.6             1.19 
# … with 41 more rows

Healthcare GDP

Import and tidy Healthcare GDP data

I tidy the data even more. Ambulatory health services and hospitals are included in health care and social assistance along with other industries; therefore, I will remove them.

# Read the state GDP export (its first 4 lines are skipped), keep column 2 and
# columns 4-17 by position, stack the `2008`..`2020` columns into Year/GDP
# pairs, and retain only the "Health care and social assistance" rows.
# Year comes from the former column names, so it is stored as text.
# NOTE(review): rows are filtered on Description only; any non-state GeoName
# present in the file is kept — confirm that is intended.
GDP.State <- read_csv("StateGDP.csv", skip = 4) %>%
  select(2, 4:17) %>%
  pivot_longer(
    cols = `2008`:`2020`,
    names_to = "Year",
    values_to = "GDP"
  ) %>%
  filter(Description == "Health care and social assistance")

1b cont. GDP Statistics total

# Overall mean and median of GDP taken over every row of GDP.State.
# NOTE(review): GDP.State contains a "United States *" row for each year (see
# the printed GDP.by.YEAR table), so these totals mix national and state
# figures — confirm that is intended.
GDP.Statistics.Total <- GDP.State %>%
  summarise(
    GDP.Mean.Total = mean(GDP),
    GDP.Median.Total = median(GDP)
  )

GDP.Statistics.Total
# A tibble: 1 × 2
  GDP.Mean.Total GDP.Median.Total
           <dbl>            <dbl>
1         65088.           21526.

GDP statistics by Year

# GDP rows grouped by Year.
GDP.by.YEAR <- GDP.State %>%
  group_by(Year)

# Mean, median and standard deviation of GDP within each Year.
GDP.Statistics.by.Year <- GDP.by.YEAR %>%
  summarise(
    GDP.Mean.Year = mean(GDP),
    GDP.Median.Year = median(GDP),
    GDP.SD.Year = sd(GDP)
  )

# NOTE(review): this displays the grouped input rows, not the
# GDP.Statistics.by.Year summary computed just above — confirm which was meant.
GDP.by.YEAR
# A tibble: 780 × 4
# Groups:   Year [13]
   GeoName         Description                       Year      GDP
   <chr>           <chr>                             <chr>   <dbl>
 1 United States * Health care and social assistance 2008  1017197
 2 United States * Health care and social assistance 2009  1078771
 3 United States * Health care and social assistance 2010  1112327
 4 United States * Health care and social assistance 2011  1149944
 5 United States * Health care and social assistance 2012  1195074
 6 United States * Health care and social assistance 2013  1230767
 7 United States * Health care and social assistance 2014  1266432
 8 United States * Health care and social assistance 2015  1337785
 9 United States * Health care and social assistance 2016  1406434
10 United States * Health care and social assistance 2017  1461525
# … with 770 more rows

GDP statistics by State

# GDP rows grouped by GeoName.
GDP.by.State <- GDP.State %>%
  group_by(GeoName)

# Mean, median and standard deviation of GDP within each GeoName,
# ordered from the smallest median to the largest.
# NOTE(review): the printed result has 60 GeoName rows, more than 50 states
# plus DC, so aggregate rows such as "United States *" appear to be included.
GDP.Statistics.by.State <- GDP.by.State %>%
  summarise(
    GDP.Mean.State = mean(GDP),
    GDP.Median.State = median(GDP),
    GDP.SD.State = sd(GDP)
  ) %>%
  arrange(GDP.Median.State)

GDP.Statistics.by.State
# A tibble: 60 × 4
   GeoName      GDP.Mean.State GDP.Median.State GDP.SD.State
   <chr>                 <dbl>            <dbl>        <dbl>
 1 Wyoming               1584.            1586.         131.
 2 Vermont               3172.            3091.         412.
 3 Alaska                3542.            3474.         633.
 4 North Dakota          3664.            3603.         763.
 5 Montana               4290.            4116.         749.
 6 South Dakota          4251.            4125.         793.
 7 Delaware              4725.            4704.         738.
 8 Hawaii                5272.            5077.         868.
 9 Idaho                 5333.            5147.        1036.
10 Rhode Island          5571.            5557.         464.
# … with 50 more rows

2 Create Visualizations

HIV

The goal of this visualization is to see if there are states with a different trend in incident HIV infections and rates.

# Figure 1a: Trend in cases from 2008-2021 across the states of the United States.
# One line per Geography, all drawn on a single shared panel.
F1A.point.hiv.ALL.cases <- HIV.State %>%
  ggplot(aes(x = Year, y = Cases, group = Geography)) +
  geom_line()

F1A.point.hiv.ALL.cases
# Figure 1b: Trend in cases from 2008-2021 by individual state.
# One panel per Geography laid out on 12 rows; scales = "free_y" lets each
# panel use its own y-axis range while the x-axis stays shared.
F1b.point.hiv.ALL.cases.WRAPPED <- ggplot(
  data = HIV.State,
  mapping = aes(x = Year, y = Cases, group = Geography)
) +
  geom_line() +
  # The stray empty argument (`nrow=12,,`) was removed; ncol keeps its default.
  facet_wrap(~Geography, nrow = 12, scales = "free_y")

F1b.point.hiv.ALL.cases.WRAPPED
# Figure 1c: Trend in cases from 2008-2021 by individual state.
# Jittered points with a smoothed trend, one panel per Geography on 12 rows;
# scales = "free" gives every panel its own x- and y-axis range.
F1C.point.hiv <- HIV.State %>%
  ggplot(aes(x = Year, y = Cases)) +
  geom_point(position = "jitter") +
  geom_smooth() +
  facet_wrap(~Geography, nrow = 12, scales = "free")

F1C.point.hiv
# Figure 2a: Trend in incident rate of HIV infection from 2008-2019 across the
# states of the United States.
# One line per Geography with a marker at each observation. The points are
# drawn without jitter so they sit exactly on the vertices of their line;
# random jitter moved the markers off the (unjittered) lines and made the
# figure differ between runs.
F2a.point.Rate.hiv.LINE <- ggplot(
  data = Rate.By.State,
  mapping = aes(x = Year, y = Rate, group = Geography)
) +
  geom_point() +
  geom_line()

F2a.point.Rate.hiv.LINE
# Figure 2b: Trend in incident rate of HIV infection from 2008-2019 by state.
# Points, connecting line and a smoothed trend, one panel per Geography on
# 12 rows; scales = "free" gives every panel its own axis ranges.
F2b.point.Rate.hiv.LINE.WRAP <- Rate.By.State %>%
  ggplot(aes(x = Year, y = Rate, group = Geography)) +
  geom_point() +
  geom_line() +
  geom_smooth() +
  facet_wrap(~Geography, nrow = 12, scales = "free")

F2b.point.Rate.hiv.LINE.WRAP

Healthcare GDP

The goal of this visualization is to see the change in healthcare GDP over time by state.

# Figure 3a. Trend in healthcare GDP across the United States from 2008-2020.
# One line with markers per GeoName, all drawn on a single shared panel.
F3a.point.gdp <- GDP.State %>%
  ggplot(aes(x = Year, y = GDP, group = GeoName)) +
  geom_point() +
  geom_line()

F3a.point.gdp
# Figure 3b. Trend in healthcare GDP across the United States by state from
# 2008-2020.
# Points with a smoothed trend, one panel per GeoName laid out on 12 rows;
# scales = "free" gives every panel its own axis ranges.
F3b.point.gdp.WRAPPED <- ggplot(
  data = GDP.State,
  mapping = aes(x = Year, y = GDP, group = GeoName)
) +
  geom_point() +
  geom_smooth() +
  # The stray empty argument (`~GeoName,,`) was removed; ncol keeps its default.
  facet_wrap(~GeoName, nrow = 12, scales = "free")

F3b.point.gdp.WRAPPED

3. Describe Visualizations

3a. What variables are being visualized?

These visualizations look at the following variables:

These visualizations look at the following variables:

3b. What questions are you answering with the visualizations?

3c. What conclusions can you make from the visualization?

4 Identify Limitations to Visualizations

Reuse

Text and figures are licensed under Creative Commons Attribution CC BY-NC 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".

Citation

For attribution, please cite this work as

Nguyen (2022, April 27). Data Analytics and Computational Social Science: HNguyen HW 4. Retrieved from https://github.com/DACSS/dacss_course_website/posts/httpsrpubscomhenryfnp893359/

BibTeX citation

@misc{nguyen2022hnguyen,
  author = {Nguyen, Henry},
  title = {Data Analytics and Computational Social Science: HNguyen HW 4},
  url = {https://github.com/DACSS/dacss_course_website/posts/httpsrpubscomhenryfnp893359/},
  year = {2022}
}