---
title: "Homework 2"
author: "Asch Harwood"
description: "hw2"
date: "3/28/23"
format:
html:
toc: true
code-fold: true
code-copy: true
code-tools: true
categories:
- hw2
- hypothesis_testing
---
```{r message=FALSE}
library(dplyr)
library(ggplot2)
```
# Q1
```{r}
# create data frame
df <- data.frame(
"Surgical Procedure" = c("Bypass", "Angiography"),
"Sample Size" = c(539, 847),
"Mean Wait Time" = c(19, 18),
"Standard Deviation" = c(10, 9)
)
# bypass confidence interval
mean <- 19
n <- 539
sd <- 10
ci_level <- .9
tail_area <- (1 - ci_level)/2
t_score <- qt(p=1-tail_area, df = n - 1)
b_ci <- c(mean - t_score * sd/sqrt(n), mean + t_score * sd/sqrt(n))
# angiography
mean <- 18
n <- 847
sd <- 9
ci_level <- .9
tail_area <- (1 - ci_level)/2
t_score <- qt(p=1-tail_area, df = n - 1)
a_ci <- c(mean - t_score * sd/sqrt(n), mean + t_score * sd/sqrt(n))
cat("Bypass 90% Confidence Interval: (", b_ci[1], ", ", b_ci[2], ")", sep = "")
cat("\n")
cat("\n")
cat("Angiography 90% Confidence Interval: (", a_ci[1], ", ", a_ci[2], ")", sep = "")
```
### Interpretation
Bypass 90% confidence interval: (18.29029, 19.70971)

Angiography 90% confidence interval: (17.49078, 18.50922)

The angiography confidence interval is narrower. Both intervals use the same 90% confidence level, but angiography's larger sample size and smaller standard deviation both shrink the standard error.
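A quick width comparison, reusing `b_ci` and `a_ci` from the chunk above, makes this concrete:
```{r}
# CI width = 2 * t * sd / sqrt(n): larger n and smaller sd both shrink it
cat("Bypass CI width:", diff(b_ci), "\n")
cat("Angiography CI width:", diff(a_ci), "\n")
```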
# Q2
```{r}
n <- 1031 # number of Americans surveyed
college_essential <- 567 # number who believe college is essential
point_estimate <- college_essential/n
prop_results <- prop.test(college_essential, n)
cat("College is essential point estimate:", point_estimate)
cat("\n")
cat("\n")
cat("College is essential 95% Confidence Interval: (", prop_results$conf.int[1], ", ", prop_results$conf.int[2], ")", sep = "")
```
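Note that `prop.test()` inverts the Wilson score test (with continuity correction by default), so its interval differs slightly from the hand formula. As a minimal cross-check, a sketch of the textbook normal-approximation (Wald) interval:
```{r}
# Wald interval: p_hat +/- z * sqrt(p_hat * (1 - p_hat) / n)
z <- qnorm(0.975)
se_hat <- sqrt(point_estimate * (1 - point_estimate) / n)
cat("Wald 95% CI: (", point_estimate - z * se_hat, ", ",
    point_estimate + z * se_hat, ")", sep = "")
```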
# Q3
```{r}
# solve for n: margin of error E = z * (sd / sqrt(n))
# rearranged: n = (z * sd / E)^2
# estimate sd from the price range: sd ~ (high - low) / 4
high_price <- 200
low_price <- 30
sd <- (high_price - low_price) * .25
mean <- (high_price + low_price)/2 # midpoint estimate of the mean price
margin_of_error <- 5 # want the mean price to within +/- $5
ci_level <- .95
z_score <- qnorm(1 - (1 - ci_level)/2)
estimated_sample_size <- (z_score * sd / margin_of_error)^2
cat("The estimated sample size is:", ceiling(estimated_sample_size))
```
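As a sanity check on the formula above, the implied margin of error at the estimated sample size should come in at or just under $5:
```{r}
n_est <- ceiling(estimated_sample_size)
implied_margin <- qnorm(0.975) * sd / sqrt(n_est)
cat("Implied margin of error with n =", n_est, "is", implied_margin)
```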
# Q4
```{r}
# goal: test whether mean female income is in line with the union agreement
mean_weekly_income_predicted <- 500
n <- 9
mean_sampled <- 410
sd_sample <- 90
```
### Q4_A
#### Approach and Assumptions
- Approach 1: Construct a confidence interval
- Approach 2: One-sample, two-tailed t-test
- Assumptions:
  1. Sample is random and representative of the underlying population
  2. Data are approximately normally distributed
  3. Sample observations are independent
  4. Population standard deviation is not known, so we use the t-distribution
- Hypotheses:
  1. H0: mean female weekly salary equals the union weekly salary ($500)
  2. H1: mean female weekly salary differs from the union weekly salary, higher or lower (the observed sample mean is $410)
```{r}
#confidence interval calculation
mean <- 410
union_mean <- 500
n <- 9
sd <- 90
alpha <- 0.05
ci_level <- .95
tail_area <- (1 - ci_level)/2
t_score <- qt(p=1-tail_area, df = n - 1)
a_ci <- c(mean - t_score * sd/sqrt(n), mean + t_score * sd/sqrt(n))
#p-value calculation
#compute t statistic
t_stat <- (mean - union_mean)/(sd/sqrt(n))
#compute degrees of freedom
df <- n - 1
#use the lower/left tail because the t statistic is negative
#multiply by two for the two-tailed test
p_val <- 2*pt(t_stat, df)
cat("Approach 1: Confidence Interval")
cat("\n")
cat("\n")
cat("Female Salary 95% Confidence Interval: (", a_ci[1], ", ", a_ci[2], ")", sep = "")
cat("\n")
cat("T score:", t_score)
cat("\n")
cat("\n")
cat("Interpretation")
cat("\n")
cat("\n")
cat("The 95% confidence interval for the weekly female salary of $410 does not include $500, so we can reject the null hypothesis in favor of the alternative hypthosis that weekly female salary differs from $500 a week")
cat("Approach 2: Calculat P Value")
cat("\n")
cat("\n")
cat("T statistic:", t_stat)
cat("\n")
cat("Two-tailed P Value", p_val)
cat("\n")
cat("Is P value less than alpha?:", p_val < alpha)
cat("\n")
cat("\n")
cat("Interpretation")
cat("\n")
cat("\n")
cat("We can reject the null hypothesis in favor of the alternative hypothosis that weekly female salary differs from $500 a week because the observed p value of", p_val, "is less than our designated signifiance level of", alpha, ".")
```
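The two approaches are equivalent: rejecting at alpha = 0.05 when |t| exceeds the critical value is the same as the 95% confidence interval excluding the union mean. A minimal cross-check, reusing the objects from the chunk above:
```{r}
# both comparisons agree (TRUE here), so both approaches reject H0
cat("|t| exceeds critical value:", abs(t_stat) > t_score, "\n")
cat("CI excludes $500:", union_mean < a_ci[1] || union_mean > a_ci[2], "\n")
```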
### Q4_B
```{r}
#p-value calculation
mean <- 410
union_mean <- 500
n <- 9
sd <- 90
alpha <- 0.05
#compute t statistic
t_stat <- (mean - union_mean)/(sd/sqrt(n))
#compute degrees of freedom
df <- n - 1
p_val <- pt(t_stat, df, lower.tail = TRUE)
#one-tailed (lower tail) p value: tests whether the true mean is less than the union wage
cat("Lower Tail P Value:", p_val)
cat("\n")
cat("This lower tail p value can be used to test whether the observed weekly female salary is less than the union wage. In this case, if we were doing a one-tailed t-test with an alpha significance level of 0.05, we could reject the null hypothesis in favor of the altnernative that weekly female salary is less than the union wage with a p value of", p_val,".")
```
### Q4_C
```{r}
mean <- 410
union_mean <- 500
n <- 9
sd <- 90
alpha <- 0.05
#compute t statistic
t_stat <- (mean - union_mean)/(sd/sqrt(n))
#compute degrees of freedom
df <- n - 1
p_val <- pt(t_stat, df, lower.tail = FALSE)
#one-tailed (upper tail) p value: tests whether the true mean is greater than the union wage
cat("Upper Tail P Value:", p_val)
cat("\n")
cat("This upper tail p value can be used to test whether the observed weekly female salary is more than the union wage. In this case, if we were doing a one-tailed t-test with an alpha significance level of 0.05, we cannot reject the null hypothesis in favor of the altnernative that weekly female salary is less than the union wage with a p value of", p_val,".")
```
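Note how the three p-values relate: the lower and upper tail p-values sum to 1, and the two-tailed p-value from Q4_A is twice the smaller of them. A quick check:
```{r}
lower_p <- pt(t_stat, df, lower.tail = TRUE)
upper_p <- pt(t_stat, df, lower.tail = FALSE)
cat("lower + upper =", lower_p + upper_p, "\n")
cat("2 * min(lower, upper) =", 2 * min(lower_p, upper_p), "\n")
```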
# Q5
```{r}
n <- 1000 #sample size
df <- n - 1 #degrees of freedom
se <- 10.0 #standard error
pop_mean <- 500 #assumed pop mean
y_jones <- 519.5
y_smith <- 519.7
t_jones <- 1.95
t_smith <- 1.97
p_jones <- 0.051
p_smith <- 0.049
# helper functions - computes t-stat
t_stat_func <- function(mean_1, test_mean, standard_error){
t_stat <- (mean_1 - test_mean)/(standard_error)
return(t_stat)
}
#helper function - computes p-value
p_val_func <- function(t_stat, df, tail_type){
if (t_stat < 0){
p_val <- pt(t_stat, df, lower.tail = TRUE)
} else{
p_val <- pt(t_stat, df, lower.tail = FALSE)
}
if (tail_type == 2){
p_val <- p_val * 2
}
return(p_val)
}
```
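As a sanity check, `p_val_func()` with `tail_type = 2` should match the standard two-sided formula `2 * pt(-abs(t), df)`; the check value below is arbitrary:
```{r}
t_check <- 1.95 # hypothetical t statistic; any value works
cat("p_val_func:", p_val_func(t_check, df, 2), "\n")
cat("2 * pt(-abs(t), df):", 2 * pt(-abs(t_check), df), "\n")
```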
### Q5_A
```{r}
#compute t statistics and p values for Jones and Smith
t_stat_jones <- t_stat_func(y_jones, pop_mean, se)
p_val_jones <- p_val_func(t_stat_jones, df, 2)
t_stat_smith <- t_stat_func(y_smith, pop_mean, se)
p_val_smith <- p_val_func(t_stat_smith, df, 2)
cat("Jones")
cat("\n")
cat("T statistic:", t_stat_jones,",", "P value:", p_val_jones)
cat("\n")
cat("\n")
cat("Smith")
cat("\n")
cat("T statistic:", t_stat_smith,",", "P value:", p_val_smith)
```
### Q5_B
With an alpha of 0.05, Smith's result is statistically significant, while Jones's result is not.
### Q5_C
The chosen significance level (alpha = 0.05, for example) is typically a rule of thumb in the social sciences. In this case, rounding to two decimal places would put both of these p-values at 0.05. Context matters as well: depending on what is being measured, the difference between 519.5 and 519.7 might not be meaningful, or it could represent a huge real-world difference. The implications of committing a Type I or Type II error are also pertinent to interpreting these p-values. By reporting the actual p-values, readers are empowered to evaluate the researchers' decisions for themselves.
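The rounding point is easy to see directly:
```{r}
# rounding to two decimal places erases the "significance" gap
round(c(jones = p_val_jones, smith = p_val_smith), 2)
```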
# Q6
#### Hypotheses
H0: There is no association between snack choice and grade level (the proportion of students choosing a healthy snack is the same in 6th, 7th, and 8th grade)

H1: There is an association between snack choice and grade level
#### Chi-Squared Test
- We will use a chi-squared test of independence, since the data are counts in a 2x3 contingency table
```{r}
snack_counts <- matrix(
c(31, 43, 51, 69, 57, 49),
nrow = 2,
byrow = TRUE
)
rownames(snack_counts) <- c("Healthy_Snack", "Unhealthy_Snack")
colnames(snack_counts) <- c("6th grade", "7th grade", "8th grade")
snack_counts
# Perform the Chi-square test
chi_square_test <- chisq.test(snack_counts)
# Print the test results
print(chi_square_test)
```
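To see which cells drive the result, a short follow-up sketch inspecting the expected counts and Pearson residuals that `chisq.test()` returns (positive residuals mean more observations than independence would predict):
```{r}
chi_square_test$expected # counts expected under independence
chi_square_test$residuals # (observed - expected) / sqrt(expected)
```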
#### Interpretation
With a p-value of 0.01547 (below alpha = 0.05), we can reject the null hypothesis that there is no association between snack choice and grade level in favor of the alternative hypothesis that there is an association between snack choice and grade level.
```{r}
# Create a dataframe
snack_data <- data.frame(
Grade_Level = c("6th grade", "7th grade", "8th grade"),
Healthy_Snack = c(31, 43, 51),
Unhealthy_Snack = c(69, 57, 49)
)
# Calculate the proportions
snack_data$Total <- snack_data$Healthy_Snack + snack_data$Unhealthy_Snack
snack_data$Healthy_Prop <- snack_data$Healthy_Snack / snack_data$Total
snack_data$Unhealthy_Prop <- snack_data$Unhealthy_Snack / snack_data$Total
# Reshape the data to a long format
snack_data_long <- tidyr::gather(snack_data, key = "Snack_Type", value = "Proportion", Healthy_Prop, Unhealthy_Prop)
# Create a bar plot using ggplot2
ggplot(snack_data_long, aes(x = Grade_Level, y = Proportion, fill = Snack_Type)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Proportion of Healthy and Unhealthy Snacks by Grade Level",
x = "Grade Level",
y = "Proportion",
fill = "Snack Type") +
theme_minimal()
```
# Q7
```{r}
# Data from the table
data_matrix <- matrix(
c(
6.2, 9.3, 6.8, 6.1, 6.7, 7.5,
7.5, 8.2, 8.5, 8.2, 7.0, 9.3,
5.8, 6.4, 5.6, 7.1, 3.0, 3.5
),
nrow = 3,
byrow = TRUE
)
# Create a dataframe with Area and Value columns
data_df <- data.frame(
Area = factor(rep(c("Area 1", "Area 2", "Area 3"), each = 6)),
Value = as.vector(t(data_matrix))
)
# compute and display the group means
mean_df <- data_df %>%
  group_by(Area) %>%
  summarise(mean_val = mean(Value))
mean_df
```
### Answer
#### Hypotheses
H0: All groups have the same population mean (mu_area1 = mu_area2 = mu_area3)

H1: At least one group has a different population mean
#### ANOVA
- Since we are trying to determine whether the observed differences in area means are statistically significant, we will use a one-way analysis of variance (ANOVA) test
```{r}
anova_results <- aov(Value ~ Area, data_df)
summary(anova_results)
```
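As an optional follow-up beyond what the question asks, Tukey's HSD on the fitted model shows which pairwise differences drive the result:
```{r}
# pairwise mean comparisons with family-wise error control
TukeyHSD(anova_results)
```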
#### Interpretation
With a p-value of 0.00397, we can reject the null hypothesis that the three area means are equal in favor of the alternative hypothesis that at least one area has a different mean.