Homework_2

hw1

challenge1

my name

dataset

ggplot2

Author

Paritosh G

Published

May 26, 2023

Libraries

Code

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.1     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.2     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Q.1)

Code

# Summary statistics for the two procedures, one vector per column;
# position 1 is bypass, position 2 is angiography.
surgical_procedures <- c("bypass", "angiography")
sample_size         <- c(539, 847)
mean_wait_time      <- c(19, 18)
standard_deviation  <- c(10, 9)

# Collect the columns into one data frame (column names match the vectors).
df <- data.frame(
  surgical_procedures = surgical_procedures,
  sample_size         = sample_size,
  mean_wait_time      = mean_wait_time,
  standard_deviation  = standard_deviation
)

CI for bypass

Code

# Confidence level for bypass and angiography
conf_level <- 0.9

# Lower and upper tail probabilities implied by conf_level (0.05 and 0.95),
# so the interval follows the stated confidence level instead of hard-coded
# quantiles.
alpha <- 1 - conf_level
tail_probs <- c(alpha / 2, 1 - alpha / 2)

# Standard error for bypass: sample SD 10, sample size 539
bypass_se <- 10 / sqrt(539)

# Confidence interval for bypass: sample mean 19 plus/minus the t critical
# value (n - 1 degrees of freedom) times the standard error
bypassCI <- 19 + qt(tail_probs, df = 539 - 1) * bypass_se
bypassCI

[1] 18.29029 19.70971

CI for angiography

Code

# Standard error for angiography: sample SD 9, sample size 847
angio_se <- 9 / sqrt(847)

# Signed margins from the 5th and 95th t percentiles (n - 1 degrees of
# freedom), then centred on the sample mean of 18
angio_margin <- qt(c(0.05, 0.95), df = 847 - 1) * angio_se
angioCI <- 18 + angio_margin
angioCI

[1] 17.49078 18.50922

Size of the confidence interval (twice the margin of error)

Code

# Twice the signed margin of error for bypass; the positive entry is the
# full width of the interval.
2 * qt(c(0.05, 0.95), df = 539 - 1) * bypass_se

[1] -1.419421  1.419421

Code

# Twice the signed margin of error for angiography; the positive entry is
# the full width of the interval.
2 * qt(c(0.05, 0.95), df = 847 - 1) * angio_se

[1] -1.018436  1.018436

The confidence interval for angiography is narrower.

Q.2)

Code

# Sample proportion: 567 of the 1031 Americans surveyed (about 0.55) believe
# a college education is essential for success
n_surveyed <- 1031
p <- 567 / n_surveyed

# 95% confidence level, and the total tail probability it leaves over
conf <- 0.95
alpha <- 1 - conf

# Standard error of the sample proportion
college_se <- sqrt(p * (1 - p) / n_surveyed)

# Confidence interval: normal quantiles derived from conf (0.025 and 0.975)
collegeCI <- p + qnorm(c(alpha / 2, 1 - alpha / 2)) * college_se
collegeCI

[1] 0.5195839 0.5803191

Q.3)

Code

# Spread of the values (200 - 30); named value_range so it does not mask
# base::range()
value_range <- 200 - 30

# Rough standard deviation estimate: one quarter of the range
population_sd <- value_range / 4

# Critical z value for a two-sided 95% interval
z <- qnorm(0.975)
s <- population_sd

# Half-width that z * s / sqrt(n) must not exceed
margin_of_error <- 5

# Required sample size, n = (z * s / E)^2. In practice this is rounded up
# to the next whole observation.
n <- ((z * s) / margin_of_error)^2
print(n)

[1] 277.5454

Q.4)

Setting up our t-test formula

Code

# One-sample t statistic: how many standard errors the sample mean lies
# from the hypothesised mean.
#   x_bar  sample mean
#   mu     hypothesised population mean
#   sd     sample standard deviation
#   n      sample size
t_test <- function(x_bar, mu, sd, n) {
  standard_error <- sd / sqrt(n)
  (x_bar - mu) / standard_error
}

# t statistic for a sample mean of 410 against a hypothesised mean of 500
# (sd = 90, n = 9)
t_statistic <- t_test(x_bar = 410, mu = 500, sd = 90, n = 9)

A)

Code

# Sample size; the t distribution below uses n - 1 degrees of freedom
n <- 9

# Two-sided p-value: twice the tail area beyond |t|. Using -abs() keeps the
# result valid for either sign of t_statistic; doubling the lower tail of a
# positive statistic would give a value above 1.
pval_two_tail <- 2 * pt(-abs(t_statistic), df = n - 1)
pval_two_tail

[1] 0.01707168

Rejecting the null hypothesis that the population mean is 500 (p ≈ 0.017 < 0.05).

B)

Code

# Left-tailed p-value: probability of a t value at or below the observed
# statistic, with n - 1 degrees of freedom
pval_left_side_tail <- pt(t_statistic, df = n - 1, lower.tail = TRUE)
pval_left_side_tail

[1] 0.008535841

Rejecting the null hypothesis that the population mean is at least 500, in favour of the alternative that it is less than 500 (p ≈ 0.0085 < 0.05).

C)

Code

# Right-tailed p-value: probability of a t value above the observed
# statistic, with n - 1 degrees of freedom
pval_right_side_tail <- pt(t_statistic, df = n - 1, lower.tail = FALSE)
pval_right_side_tail

[1] 0.9914642

Fail to reject the null hypothesis that the population mean is at most 500 (p ≈ 0.99); there is no evidence that the mean is greater than 500.

Q.5)

Code

# t values
# Jones: sample mean 519.5, hypothesised population mean 500,
# standard error 10.0
jones_mean <- 519.5
t_jones <- (jones_mean - 500) / 10
t_jones

[1] 1.95

Code

# Smith: sample mean 519.7, hypothesised population mean 500,
# standard error 10.0
smith_mean <- 519.7
t_smith <- (smith_mean - 500) / 10
t_smith

[1] 1.97

Code

# p values
# Jones: area of one tail beyond |t| (999 degrees of freedom), doubled for
# the two-sided test
jones_tail <- pt(-abs(t_jones), df = 999)
p_jones <- 2 * jones_tail
p_jones

[1] 0.05145555

Code

# Smith: area of one tail beyond |t| (999 degrees of freedom), doubled for
# the two-sided test
smith_tail <- pt(-abs(t_smith), df = 999)
p_smith <- 2 * smith_tail
p_smith

[1] 0.04911426

Jones's result is not significant at the 0.05 level (p ≈ 0.0515), but Smith's result is significant (p ≈ 0.0491).

Q.6)

Code

# Data frame creation: one row per student, 100 students in each grade
grade <- rep(c("6th grade", "7th grade", "8th grade"), each = 100)

# Within each grade the healthy-snack students come first, then the
# unhealthy-snack students: 31/69, 43/57 and 51/49.
snack <- rep(
  rep(c("healthy snack", "unhealthy snack"), times = 3),
  times = c(31, 69, 43, 57, 51, 49)
)
snack_df <- data.frame(grade, snack)

Using a chi-squared test, as we are testing for an association between two categorical variables.

Code

# Contingency table of snack choice (rows) by grade (columns)
table(snack_df[["snack"]], snack_df[["grade"]])

                 
                  6th grade 7th grade 8th grade
  healthy snack          31        43        51
  unhealthy snack        69        57        49

Code

# Pearson chi-squared test of independence between snack choice and grade
chisq.test(x = snack_df$snack, y = snack_df$grade)


    Pearson's Chi-squared test

data:  snack_df$snack and snack_df$grade
X-squared = 8.3383, df = 2, p-value = 0.01547

Since the p-value (0.01547) is below 0.05, we reject the null hypothesis of independence: there is an association between grade and choice of snack.

Q.7)

Code

# Six cost observations for each of the three areas
Area <- rep(c("Area1", "Area2", "Area3"), each = 6)

# Costs listed area by area, in the same order as the Area labels
area1_cost <- c(6.2, 9.3, 6.8, 6.1, 6.7, 7.5)
area2_cost <- c(7.5, 8.2, 8.5, 8.2, 7.0, 9.3)
area3_cost <- c(5.8, 6.4, 5.6, 7.1, 3.0, 3.5)
cost <- c(area1_cost, area2_cost, area3_cost)

Area_cost <- data.frame(Area, cost)

Using one-way ANOVA, as we are comparing the means of more than two groups.

Code

# One-way ANOVA: does mean cost differ between the areas?
one.way <- aov(formula = cost ~ Area, data = Area_cost)

# ANOVA table; the Pr(>F) column holds the p-value for the Area effect
summary(one.way)

            Df Sum Sq Mean Sq F value  Pr(>F)   
Area         2  25.66  12.832   8.176 0.00397 **
Residuals   15  23.54   1.569                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Libraries

Q.1)

Q.2)

Q.3)

Q.4)

A)

B)

C)

Q.5)

Q.6)

Q.7)