hw1
challenge1
my name
dataset
ggplot2
Author

Paritosh G

Published

May 26, 2023

Libraries

Code
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.1     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.2     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Q.1)

Code
# Summary statistics for the two procedures, one vector per column
surgical_procedures <- c("bypass", "angiography")
sample_size <- c(539, 847)
mean_wait_time <- c(19, 18)
standard_deviation <- c(10, 9)

# Combine the vectors into one table, naming every column explicitly
df <- data.frame(
  surgical_procedures = surgical_procedures,
  sample_size = sample_size,
  mean_wait_time = mean_wait_time,
  standard_deviation = standard_deviation
)

CI for bypass

Code
# Confidence level shared by the bypass and angiography intervals
conf_level <- 0.9

# Standard error of the mean for bypass: sd / sqrt(n)
bypass_se <- 10 / sqrt(539)

# t-based confidence interval for bypass: mean + t quantile * SE, df = n - 1.
# The tail probabilities are derived from conf_level instead of being
# hard-coded, so changing conf_level actually changes the interval.
bypassCI <- 19 +
  qt(c((1 - conf_level) / 2, (1 + conf_level) / 2), df = 539 - 1) * bypass_se
bypassCI
[1] 18.29029 19.70971

CI for angiography

Code
# Standard error of the mean for angiography: sd / sqrt(n)
angio_se <- 9 / sqrt(847)

# t-based confidence interval for angiography, df = n - 1.
# The tail probabilities come from conf_level (defined with the bypass
# interval) rather than being hard-coded, so both intervals share one level.
angioCI <- 18 +
  qt(c((1 - conf_level) / 2, (1 + conf_level) / 2), df = 847 - 1) * angio_se
angioCI
[1] 17.49078 18.50922

The width of each confidence interval is twice its margin of error

Code
# Bypass: twice the margin of error. The two elements are equal in magnitude
# and opposite in sign because the t quantiles are symmetric about zero; the
# magnitude is the full width of the interval.
2 * qt(c(0.05, 0.95), df = 539 - 1) * bypass_se
[1] -1.419421  1.419421
Code
# Angiography: twice the margin of error. The two elements are equal in
# magnitude and opposite in sign because the t quantiles are symmetric about
# zero; the magnitude is the full width of the interval.
2 * qt(c(0.05, 0.95), df = 847 - 1) * angio_se
[1] -1.018436  1.018436
  • The confidence interval for angiography is narrower.

Q.2)

Code
# Sample proportion: 567 of the 1031 Americans surveyed (about 55%) believe
# college education is essential for success
p <- 567 / 1031

# 95% confidence level
conf <- 0.95

# Standard error of a sample proportion: sqrt(p * (1 - p) / n)
college_se <- sqrt(p * (1 - p) / 1031)

# Normal-approximation confidence interval. The quantiles are derived from
# conf instead of being hard-coded, so changing conf changes the interval.
collegeCI <- p + qnorm(c((1 - conf) / 2, (1 + conf) / 2)) * college_se
collegeCI
[1] 0.5195839 0.5803191

Q.3)

Code
# Approximate the standard deviation as one quarter of the range (200 - 30)
range <- 200 - 30
population_sd <- range / 4

# Upper 97.5% standard normal quantile
z <- qnorm(0.975)
s <- population_sd

# n = (z * s / 5)^2; the value is fractional and is printed unrounded
n <- ((z * s) / 5)^2
print(n)
[1] 277.5454

Q.4)

  • Setting up our t-test formula

    Code
    # One-sample t statistic: how far the sample mean x_bar lies from the
    # hypothesised mean mu, measured in standard errors (sd / sqrt(n)).
    t_test <- function(x_bar, mu, sd, n) {
      std_error <- sd / sqrt(n)
      (x_bar - mu) / std_error
    }
    
    # t statistic for a sample mean of 410 against mu = 500 (sd = 90, n = 9)
    t_statistic <- t_test(x_bar = 410, mu = 500, sd = 90, n = 9)

A)

Code
# Sample size; the t distribution below uses n - 1 degrees of freedom
n <- 9

# Two-sided p-value: twice the tail area beyond |t|.
# -abs() keeps this valid whichever sign the statistic has; doubling
# pt(t_statistic) alone is only correct when the statistic is negative and
# would exceed 1 for a positive one.
pval_two_tail <- 2 * pt(-abs(t_statistic), df = n - 1)
pval_two_tail
[1] 0.01707168

Reject the null hypothesis that the population mean is 500 (p ≈ 0.017 < 0.05).

B)

Code
# Lower-tail p-value: probability of a t value at or below the observed
# statistic, with n - 1 degrees of freedom
pval_left_side_tail <- pt(t_statistic, df = n - 1, lower.tail = TRUE)
pval_left_side_tail
[1] 0.008535841
  • Reject the null hypothesis that the population mean is at least 500 (p ≈ 0.0085 < 0.05); the data support a mean below 500.

C)

Code
# Upper-tail p-value: probability of a t value above the observed statistic,
# with n - 1 degrees of freedom
pval_right_side_tail <- pt(t_statistic, df = n - 1, lower.tail = FALSE)
pval_right_side_tail
[1] 0.9914642
  • Fail to reject the null hypothesis that the population mean is at most 500 (p ≈ 0.99).

    Q.5)

    Code
    # t values
    # Jones: (sample mean 519.5 - hypothesised mean 500) / standard error 10
    t_jones <- (519.5 - 500) / 10
    t_jones
    [1] 1.95
Code
# Smith: (sample mean 519.7 - hypothesised mean 500) / standard error 10
t_smith <- (519.7 - 500) / 10
t_smith
[1] 1.97
Code
# p values
# Jones, two-sided: twice the lower-tail area at -|t| with 999 degrees of
# freedom
p_jones <- 2 * pt(-abs(t_jones), df = 999)
p_jones
[1] 0.05145555
Code
# Smith, two-sided: twice the lower-tail area at -|t| with 999 degrees of
# freedom
p_smith <- 2 * pt(-abs(t_smith), df = 999)
p_smith
[1] 0.04911426
  • Jones’s result is not significant at the 0.05 level, but Smith’s result is significant.

    Q.6)

Code
# Build one row per student: 100 students in each of the three grades
grade <- rep(c("6th grade", "7th grade", "8th grade"), each = 100)

# Snack choice per student: within each grade the healthy choices come first,
# then the unhealthy ones, with the counts given in `times`
snack <- rep(
  rep(c("healthy snack", "unhealthy snack"), times = 3),
  times = c(31, 69, 43, 57, 51, 49)
)

snack_df <- data.frame(grade, snack)

Using chi square test as we are testing association between two categorical variables.

Code
# Contingency table of snack choice (rows) by grade (columns)
table(snack_df[["snack"]], snack_df[["grade"]])
                 
                  6th grade 7th grade 8th grade
  healthy snack          31        43        51
  unhealthy snack        69        57        49
Code
# Pearson chi-squared test on the snack and grade columns of snack_df
chisq.test(snack_df$snack,snack_df$grade)

    Pearson's Chi-squared test

data:  snack_df$snack and snack_df$grade
X-squared = 8.3383, df = 2, p-value = 0.01547
  • There is a statistically significant association between grade and choice of snack (p ≈ 0.015 < 0.05).

Q.7)

Code
# Six observations for each of the three areas, in area order
Area <- rep(c("Area1", "Area2", "Area3"), each = 6)

# Costs, one row of values per area
cost <- c(
  6.2, 9.3, 6.8, 6.1, 6.7, 7.5, # Area1
  7.5, 8.2, 8.5, 8.2, 7.0, 9.3, # Area2
  5.8, 6.4, 5.6, 7.1, 3.0, 3.5  # Area3
)

Area_cost <- data.frame(Area, cost)

Using one-way ANOVA because we are comparing the means of more than two groups.

Code
# Fit a one-way ANOVA of cost on Area, then print the ANOVA table
one.way <- aov(formula = cost ~ Area, data = Area_cost)
summary(one.way)
            Df Sum Sq Mean Sq F value  Pr(>F)   
Area         2  25.66  12.832   8.176 0.00397 **
Residuals   15  23.54   1.569                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1