hw1
descriptive statistics
probability
Homework 1
Author

Asch Harwood

Published

February 28, 2023

Code
library(dplyr)
library(ggplot2)
library(knitr)
library(kableExtra)
Code
library(readxl)
# Read the lung-capacity workbook from the project's data folder.
df <- read_excel(path = "_data/LungCapData.xls")

Question 1

a

Code
# Mean and standard deviation of lung capacity; these parameterise the
# normal curve drawn over the histogram below.
lc_mean <- mean(df$LungCap)
lc_sd <- sd(df$LungCap)

# dnorm() gives the normal *density* (not a probability) at each observed
# lung capacity; the values are stored in a new `probs` column.
df$probs <- dnorm(x = df$LungCap, mean = lc_mean, sd = lc_sd)

# Density-scaled histogram with the fitted normal curve in red.
# after_stat(density) replaces the `..density..` notation, which ggplot2
# deprecated in version 3.4.0.
ggplot(df, aes(x = LungCap)) +
  geom_histogram(
    aes(y = after_stat(density)),
    binwidth = 1, color = "black", fill = "white"
  ) +
  stat_function(
    fun = dnorm, args = list(mean = lc_mean, sd = lc_sd), color = "red"
  ) +
  xlab("Lung Capacity") +
  ylab("Density")

b

Compare the probability distribution of the LungCap with respect to Males and Females?

Code
# Box plot of lung capacity, one box per gender.
ggplot(data = df, mapping = aes(x = Gender, y = LungCap)) + geom_boxplot()

Males have higher mean lung capacity, a slightly wider interquartile range, and a slightly higher standard deviation.

Code
# Spread of lung capacity within each gender: interquartile range first,
# then standard deviation (male before female in both cases).
with(df, IQR(LungCap[Gender == "male"]))
with(df, IQR(LungCap[Gender == "female"]))
with(df, sd(LungCap[Gender == "male"]))
with(df, sd(LungCap[Gender == "female"]))

c

Code
# Mean lung capacity for each smoking status, shown as a plain-text table.
# The summary column is left unnamed, so it is labelled `mean(LungCap)`.
mean_df <- summarise(group_by(df, Smoke), mean(LungCap))

kable(mean_df, format = "simple")
Smoke mean(LungCap)
no 7.770188
yes 8.645454
Code
# Box plot of lung capacity, one box per smoking status.
ggplot(data = df, mapping = aes(x = Smoke, y = LungCap)) + geom_boxplot()

The average lung capacity for non-smokers is 7.77 compared to 8.65 for smokers, which is counterintuitive: I would expect smokers to have a lower average lung capacity than non-smokers. However, there is more variance in lung capacity among non-smokers, and this comparison does not account for age (see part e), which likely explains the finding.

d

Code
# Bin ages into four labelled groups. cut() closes intervals on the right,
# so the bins are (0, 13], (13, 15], (15, 17] and (17, Inf].
df[["AgeGroup"]] <- cut(
  x = df$Age,
  breaks = c(0, 13, 15, 17, Inf),
  labels = c("<=13", "14 to 15", "16 to 17", ">=18"),
  right = TRUE
)
Code
# Mean lung capacity within each age group, rendered as a table.
# The summary column is left unnamed, so it is labelled `mean(LungCap)`.
age_group_mean <- summarise(group_by(df, AgeGroup), mean(LungCap))
kable(x = age_group_mean)
AgeGroup mean(LungCap)
<=13 6.411932
14 to 15 9.045417
16 to 17 10.245876
>=18 10.964688
Code
# Box plot of lung capacity, one box per age group.
ggplot(data = df, mapping = aes(x = AgeGroup, y = LungCap)) + geom_boxplot()

There appears to be a relationship between age and lung capacity, where as age group increases, lung capacity increases as well. This is understandable given that developmentally a younger child would have a lower lung capacity than a young adult.

e

Code
# Box plots of lung capacity per age group, with smokers and non-smokers
# drawn side by side and distinguished by fill colour.
ggplot(
  data = df,
  mapping = aes(x = AgeGroup, y = LungCap, fill = Smoke)
) +
  geom_boxplot()

Code
# Overlaid, semi-transparent histograms of lung capacity by smoking status,
# one panel per age group, each panel with its own axis scales.
ggplot(data = df, mapping = aes(x = LungCap, fill = Smoke)) +
  geom_histogram(position = "identity", bins = 10, alpha = 0.5) +
  facet_wrap(~AgeGroup, scales = "free", nrow = 2) +
  labs(
    title = "Distribution of Lung Capacity by Age Group, by Smoking Status",
    x = "Lung Capacity",
    y = "Frequency"
  )

When controlling for age, average smoker lung capacity is lower in every age group except "<=13". In that group there are far fewer smokers, as one would (hope to) expect, which likely explains the slightly higher mean lung capacity for smokers.

Question 2

Code
# Frequency table: number of prisoners (num_prisoners) observed with each
# count of prior convictions (convictions).
prisoners <- data.frame(
  convictions = c(0, 1, 2, 3, 4),
  num_prisoners = c(128, 434, 160, 64, 24)
)

# Total number of prisoners, used as the denominator for every probability
# below. A direct sum() yields the same scalar as summarising the data frame
# and converting the one-cell result back to a number.
total_prisoners <- sum(prisoners$num_prisoners)

a

Code
# Share of prisoners with exactly two prior convictions, to two decimals.
prob_2_convictions <- round(
  with(prisoners, num_prisoners[convictions == 2]) / total_prisoners,
  digits = 2
)

Probability of exactly 2 convictions: 0.2

b

Code
# Prisoners with fewer than two prior convictions, then their share of the
# total, to two decimals.
sum_prisoner_lessthan_2 <- with(prisoners, sum(num_prisoners[convictions < 2]))
prob_lessthan_2_convictions <- round(
  sum_prisoner_lessthan_2 / total_prisoners,
  digits = 2
)

Probability of less than 2 convictions: 0.69

c

Code
# Prisoners with at most two prior convictions, then their share of the
# total, to two decimals.
sum_prisoner_2_orless <- with(prisoners, sum(num_prisoners[convictions <= 2]))
prob_2_orless_convictions <- round(
  sum_prisoner_2_orless / total_prisoners,
  digits = 2
)

Probability of 2 or fewer convictions: 0.89

d

Code
# Prisoners with more than two prior convictions, then their share of the
# total, to two decimals.
sum_prisoner_morethan_2 <- with(prisoners, sum(num_prisoners[convictions > 2]))
prob_morethan_2_convictions <- round(
  sum_prisoner_morethan_2 / total_prisoners,
  digits = 2
)

Probability of more than 2 convictions: 0.11

e

Code
# Probability that a prisoner has a given number of prior convictions.
#
# num_prisoners:   despite its name, this is the number of convictions to look
#                  up (the name is kept so existing calls keep working).
# total_prisoners: total number of prisoners, used as the denominator.
#
# Reads the global `prisoners` data frame and returns the count recorded for
# that number of convictions divided by `total_prisoners`.
get_prob <- function(num_prisoners, total_prisoners) {
  is_match <- prisoners$convictions == num_prisoners
  num_convictions <- prisoners$num_prisoners[is_match]
  num_convictions / total_prisoners
}

# One probability per row. vapply() over the convictions column avoids
# apply(), which first coerces the whole data frame to a matrix, and the
# denominator is taken from the data instead of a hard-coded 810.
prisoners$prob <- vapply(
  prisoners$convictions,
  get_prob,
  numeric(1),
  total_prisoners = sum(prisoners$num_prisoners)
)

# Expected value of a discrete distribution: sum of value * probability.
expected_value <- sum(prisoners$convictions * prisoners$prob)

Expected value of the number of convictions for a randomly selected prisoner: 1.2864198

f

Code
# Variance and standard deviation of the NUMBER OF CONVICTIONS of a randomly
# selected prisoner. Calling sd()/var() on prisoners$num_prisoners would
# instead measure the spread of the five frequency counts, which is not the
# quantity asked for.
#
# For a discrete distribution: Var(X) = sum((x - E[X])^2 * P(X = x)).
conviction_probs <- prisoners$num_prisoners / sum(prisoners$num_prisoners)
mean_convictions <- sum(prisoners$convictions * conviction_probs)
var_convictions <- sum(
  (prisoners$convictions - mean_convictions)^2 * conviction_probs
)
std_convictions <- sqrt(var_convictions)

Standard deviation: 161.0838291

Variance: $2.5948 \times 10^{4}$