HW 4: Descriptive statistics and data visualization.
# Data set #1
# Source: https://gis.cdc.gov/grasp/nchhstpatlas/tables.html
# Read the export (the first 10 lines of the file are skipped), keep columns
# 2, 3, 5 and 6 by position, and name the fourth retained column `Rate`.
HIV.State <- read_csv("HIV.by.State.CSV", skip = 10) %>%
  select(2, 3, 5, 6) %>%
  rename(Rate = 4)

# Wide layout: drop `Rate` (keep only the first three columns), then spread
# the `Cases` values into one column per `Year`.
HIV.State.Wide.Year <- pivot_wider(
  select(HIV.State, 1:3),
  names_from = Year,
  values_from = Cases
)
I will compute the mean, median, and standard deviation of cases by state, by year, and across all years in total.
# Pooled mean and median of `Cases` over every row of HIV.State
# (no grouping is applied here).
HIV.Statistics.Total <- summarise(
  HIV.State,
  HIV.Mean.Total = mean(Cases),
  HIV.Median.Total = median(Cases)
)
HIV.Statistics.Total
# A tibble: 1 × 2
HIV.Mean.Total HIV.Median.Total
<dbl> <dbl>
1 754. 352.
Rate calculations cover 2008-2019 because no rate data are available for 2020 and 2021.
# Group the case data by Geography; the grouped table is kept under its own
# name so it can be reused after this summary.
By.State <- HIV.State %>%
  group_by(Geography)

# One row per Geography with the mean, median and standard deviation of
# `Cases`, sorted ascending by the median.
HIV.Statistics.by.State <- By.State %>%
  summarise(
    HIV.Mean.by.State = mean(Cases),
    HIV.Median.by.State = median(Cases),
    HIV.SD.by.State = sd(Cases)
  ) %>%
  arrange(HIV.Median.by.State)
HIV.Statistics.by.State
# A tibble: 51 × 4
Geography HIV.Mean.by.State HIV.Median.by.State HIV.SD.by.State
<chr> <dbl> <dbl> <dbl>
1 Vermont 13.8 14 5.10
2 Wyoming 14.2 14.5 5.51
3 North Dakota 23.6 19.5 12.6
4 Montana 20.9 20.5 6.07
5 Alaska 28 27.5 7.57
6 South Dakota 29.1 29.5 7.29
7 Idaho 36.6 36.5 9.39
8 New Hampshire 36.6 38 8.08
9 Maine 41.4 46 13.6
10 Rhode Island 82.2 77.5 25.2
# … with 41 more rows
# Incidence-rate summary per Geography, restricted to rows with Year < 2020.
# `By.State` is already grouped by Geography, and filter()/mutate() keep that
# grouping, so no re-grouping is needed before summarise().
Rate.By.State <- By.State %>%
  filter(Year < 2020) %>%
  # Coerce Rate to numeric; any value that cannot be parsed becomes NA.
  mutate(Rate = as.numeric(Rate))

# One row per Geography with the mean, median and standard deviation of
# `Rate`, sorted ascending by the median.
HIV.Statistics.by.State.Rate <- Rate.By.State %>%
  summarise(
    HIV.Rate.Mean.by.State = mean(Rate),
    HIV.Rate.Median.by.State = median(Rate),
    HIV.Rate.SD.by.State = sd(Rate)
  ) %>%
  arrange(HIV.Rate.Median.by.State)
HIV.Statistics.by.State.Rate
# A tibble: 51 × 4
Geography HIV.Rate.Mean.by… HIV.Rate.Median.b… HIV.Rate.SD.by.…
<chr> <dbl> <dbl> <dbl>
1 Montana 2.62 2.55 0.589
2 Vermont 2.78 2.85 0.805
3 Idaho 2.91 2.9 0.779
4 North Dakota 3.86 3.2 1.99
5 Wyoming 3.24 3.25 1.10
6 New Hampshire 3.37 3.4 0.656
7 Maine 3.92 4.2 0.972
8 South Dakota 4.29 4.25 0.908
9 Iowa 4.45 4.55 0.485
10 Alaska 4.93 4.6 1.19
# … with 41 more rows
I tidy the data even further. Ambulatory health services and hospitals are already included in health care and social assistance, along with other industries, so I will remove them.
# Pooled mean and median of the `GDP` column over every row of GDP.State.
GDP.Statistics.Total <- GDP.State %>%
  summarise(
    GDP.Mean.Total = mean(GDP),
    GDP.Median.Total = median(GDP)
  )
GDP.Statistics.Total
# A tibble: 1 × 2
GDP.Mean.Total GDP.Median.Total
<dbl> <dbl>
1 65088. 21526.
# Group the GDP data by Year; the grouped table keeps its own name.
GDP.by.YEAR <- group_by(GDP.State, Year)

# One row per Year with the mean, median and standard deviation of `GDP`.
GDP.Statistics.by.Year <- summarise(
  GDP.by.YEAR,
  GDP.Mean.Year = mean(GDP),
  GDP.Median.Year = median(GDP),
  GDP.SD.Year = sd(GDP)
)
# Display the per-year summary itself rather than the grouped input data,
# matching how the other summary tables in this analysis are shown.
GDP.Statistics.by.Year
# A tibble: 780 × 4
# Groups: Year [13]
GeoName Description Year GDP
<chr> <chr> <chr> <dbl>
1 United States * Health care and social assistance 2008 1017197
2 United States * Health care and social assistance 2009 1078771
3 United States * Health care and social assistance 2010 1112327
4 United States * Health care and social assistance 2011 1149944
5 United States * Health care and social assistance 2012 1195074
6 United States * Health care and social assistance 2013 1230767
7 United States * Health care and social assistance 2014 1266432
8 United States * Health care and social assistance 2015 1337785
9 United States * Health care and social assistance 2016 1406434
10 United States * Health care and social assistance 2017 1461525
# … with 770 more rows
# Group the GDP data by GeoName; the grouped table keeps its own name.
GDP.by.State <- GDP.State %>%
  group_by(GeoName)

# One row per GeoName with the mean, median and standard deviation of `GDP`,
# sorted ascending by the median.
GDP.Statistics.by.State <- GDP.by.State %>%
  summarise(
    GDP.Mean.State = mean(GDP),
    GDP.Median.State = median(GDP),
    GDP.SD.State = sd(GDP)
  ) %>%
  arrange(GDP.Median.State)
GDP.Statistics.by.State
# A tibble: 60 × 4
GeoName GDP.Mean.State GDP.Median.State GDP.SD.State
<chr> <dbl> <dbl> <dbl>
1 Wyoming 1584. 1586. 131.
2 Vermont 3172. 3091. 412.
3 Alaska 3542. 3474. 633.
4 North Dakota 3664. 3603. 763.
5 Montana 4290. 4116. 749.
6 South Dakota 4251. 4125. 793.
7 Delaware 4725. 4704. 738.
8 Hawaii 5272. 5077. 868.
9 Idaho 5333. 5147. 1036.
10 Rhode Island 5571. 5557. 464.
# … with 50 more rows
The goal of this visualization is to see whether any states show a different trend in incident HIV infections and infection rates.
# Figure 1a: Trend in cases from 2008-2021 across the states of the United States
# Cases against Year, one line per Geography, all on a single shared panel.
F1A.point.hiv.ALL.cases <- ggplot(
  HIV.State,
  aes(x = Year, y = Cases, group = Geography)
) +
  geom_line()
F1A.point.hiv.ALL.cases
# Figure 1b: Trend in cases from 2008-2021 by individual state
# Same line plot as a small-multiples grid: one panel per Geography laid out
# in 12 rows, each panel with its own y-axis range ("free_y").
F1b.point.hiv.ALL.cases.WRAPPED <- ggplot(
  data = HIV.State,
  mapping = aes(x = Year, y = Cases, group = Geography)
) +
  geom_line() +
  facet_wrap(~Geography, nrow = 12, scales = "free_y")
F1b.point.hiv.ALL.cases.WRAPPED
# Figure 1c: Trend in cases from 2008-2021 by individual state
# Jittered points plus a smoother, one panel per Geography in 12 rows; both
# axes are scaled independently per panel ("free").
F1C.point.hiv <- ggplot(HIV.State, aes(x = Year, y = Cases)) +
  geom_point(position = "jitter") +
  geom_smooth() +
  facet_wrap(~Geography, nrow = 12, scales = "free")
F1C.point.hiv
# Figure 2a: Trend in incident rate of HIV infection from 2008-2019 across
# the states of the United States
# Rate against Year: jittered points with one connecting line per Geography.
F2a.point.Rate.hiv.LINE <- ggplot(
  Rate.By.State,
  aes(x = Year, y = Rate, group = Geography)
) +
  geom_point(position = "jitter") +
  geom_line()
F2a.point.Rate.hiv.LINE
# Figure 2b: Trend in incident rate of HIV infection from 2008-2019 by state
# Layers are drawn in the order points, lines, smoother; each Geography gets
# its own panel (12 rows) with independently scaled axes.
F2b.point.Rate.hiv.LINE.WRAP <- ggplot(
  Rate.By.State,
  aes(x = Year, y = Rate, group = Geography)
) +
  geom_point() +
  geom_line() +
  geom_smooth() +
  facet_wrap(~Geography, nrow = 12, scales = "free")
F2b.point.Rate.hiv.LINE.WRAP
The goal of this visualization is to see how healthcare GDP changes over time in each state.
# Figure 3a. Trend in healthcare GDP across the United States from 2008-2021.
# GDP against Year: points with one connecting line per GeoName on a single
# shared panel.
F3a.point.gdp <- ggplot(
  GDP.State,
  aes(x = Year, y = GDP, group = GeoName)
) +
  geom_point() +
  geom_line()
F3a.point.gdp
# Figure 3b. Trend in healthcare GDP across the United States by state from
# 2008-2021.
# Points plus a smoother, one panel per GeoName laid out in 12 rows, with both
# axes scaled independently per panel ("free").
F3b.point.gdp.WRAPPED <- ggplot(
  data = GDP.State,
  mapping = aes(x = Year, y = GDP, group = GeoName)
) +
  geom_point() +
  geom_smooth() +
  facet_wrap(~GeoName, nrow = 12, scales = "free")
F3b.point.gdp.WRAPPED
These visualizations look at the following variables:
Figures 1a, 2a, and 3a seek to answer the question: “What are the trends in incident HIV infections, rate of HIV infections, and healthcare GDP across the United States over this time frame?”
Figures 1b, 1c, 2b, and 3b seek to answer the question: “What are the trends in incident HIV infections, rate of HIV infections, and healthcare GDP by state over this time frame?”
There is an overall trend that incident HIV infections have decreased from 2008-2021 (Figure 1a and 2a).
Incident cases have decreased in all states, although some states had outlier years in which incident cases increased in recent years, for example North Dakota, Tennessee, Utah, Washington, West Virginia, and Wyoming (Figure 1b and 1c).
Looking at the incidence rates over time, you can see that most states had case rates decrease over time (Figure 2b).
Looking at rates, some states appear to have reached a plateau in case rates after an increasing trend (e.g., Arizona, Arkansas, Nevada, North Dakota) (Figure 2b).
Looking at rates, some states appear to have increasing case rates (e.g., Mississippi, Montana, New Mexico, North Dakota, Ohio, Oklahoma, Washington, West Virginia) (Figure 2b).
Looking at healthcare GDP, most states have increased their healthcare GDP over time (Figure 3a and 3b).
Text and figures are licensed under Creative Commons Attribution CC BY-NC 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".
For attribution, please cite this work as
Nguyen (2022, April 27). Data Analytics and Computational Social Science: HNguyen HW 4. Retrieved from https://github.com/DACSS/dacss_course_website/posts/httpsrpubscomhenryfnp893359/
BibTeX citation
@misc{nguyen2022hnguyen, author = {Nguyen, Henry}, title = {Data Analytics and Computational Social Science: HNguyen HW 4}, url = {https://github.com/DACSS/dacss_course_website/posts/httpsrpubscomhenryfnp893359/}, year = {2022} }