hw3
regression analysis
Homework 3
Author

Guanhua Tan

Published

April 1, 2023

Code
library(tidyverse)
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
✔ tibble  3.1.8      ✔ dplyr   1.0.10
✔ tidyr   1.2.1      ✔ stringr 1.4.1 
✔ readr   2.1.3      ✔ forcats 0.5.2 
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
Code
library(smss)
library(alr4)
Loading required package: car
Loading required package: carData

Attaching package: 'car'

The following object is masked from 'package:dplyr':

    recode

The following object is masked from 'package:purrr':

    some

Loading required package: effects
lattice theme set by effectsTheme()
See ?effectsTheme for details.
Code
data(UN11)

Question 1

Code
glimpse(UN11)
Rows: 199
Columns: 6
$ region    <fct> Asia, Europe, Africa, Africa, Caribbean, Latin Amer, Asia, C…
$ group     <fct> other, other, africa, africa, other, other, other, other, oe…
$ fertility <dbl> 5.968, 1.525, 2.142, 5.135, 2.000, 2.172, 1.735, 1.671, 1.94…
$ ppgdp     <dbl> 499.0, 3677.2, 4473.0, 4321.9, 13750.1, 9162.1, 3030.7, 2285…
$ lifeExpF  <dbl> 49.49, 80.40, 75.00, 53.17, 81.10, 79.89, 77.33, 77.75, 84.2…
$ pctUrban  <dbl> 23, 53, 67, 59, 100, 93, 64, 47, 89, 68, 52, 84, 89, 29, 45,…
Code
summary(UN11)
        region      group       fertility         ppgdp         
 Africa    :53   oecd  : 31   Min.   :1.134   Min.   :   114.8  
 Asia      :50   other :115   1st Qu.:1.754   1st Qu.:  1283.0  
 Europe    :39   africa: 53   Median :2.262   Median :  4684.5  
 Latin Amer:20                Mean   :2.761   Mean   : 13012.0  
 Caribbean :17                3rd Qu.:3.545   3rd Qu.: 15520.5  
 Oceania   :17                Max.   :6.925   Max.   :105095.4  
 (Other)   : 3                                                  
    lifeExpF        pctUrban     
 Min.   :48.11   Min.   : 11.00  
 1st Qu.:65.66   1st Qu.: 39.00  
 Median :75.89   Median : 59.00  
 Mean   :72.29   Mean   : 57.93  
 3rd Qu.:79.58   3rd Qu.: 75.00  
 Max.   :87.12   Max.   :100.00  
                                 
  1. the variable ppgdp is the predictor and the variable fertility is the response.

Code
# scatterplot
ggplot(UN11, aes(x=ppgdp, y=fertility))+
  geom_point()+
  geom_smooth(method = 'lm')
`geom_smooth()` using formula = 'y ~ x'

When ppgdp is lower than 25000, fertility surges. when ppgdp is greater than 25000, fertility maintains stable. I don’t believe a stright-line mean function would be plausible for a summary of this graph.

Code
# scatterplot log(data)
ggplot(UN11, aes(x=log(ppgdp), y=fertility))+
  geom_point()+
  geom_smooth(method = 'lm')
`geom_smooth()` using formula = 'y ~ x'

Yes, the simple line regression model seem plausible for a summary of this graphic.

Question 2

Code
ggplot(UN11, aes(x=log(ppgdp), y=fertility))+
  geom_point()+
  geom_smooth(method="lm")
`geom_smooth()` using formula = 'y ~ x'

Code
cor(UN11$ppgdp,UN11$fertility)
[1] -0.4399891
Code
UN11$income.pound=UN11$ppgdp*1.33
ggplot(UN11, aes(x=log(income.pound), y=fertility))+
  geom_point()+
  geom_smooth(method = 'lm')
`geom_smooth()` using formula = 'y ~ x'

Code
cor.test(UN11$income.pound, UN11$fertility)

    Pearson's product-moment correlation

data:  UN11$income.pound and UN11$fertility
t = -6.877, df = 197, p-value = 7.903e-11
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.5456842 -0.3205140
sample estimates:
       cor 
-0.4399891 
  1. the slopes of the prediction equation maintain the same.

  2. the correlation doesn’t change.

Question 3

Code
data("water")
summary(water)
      Year          APMAM            APSAB           APSLAKE     
 Min.   :1948   Min.   : 2.700   Min.   : 1.450   Min.   : 1.77  
 1st Qu.:1958   1st Qu.: 4.975   1st Qu.: 3.390   1st Qu.: 3.36  
 Median :1969   Median : 7.080   Median : 4.460   Median : 4.62  
 Mean   :1969   Mean   : 7.323   Mean   : 4.652   Mean   : 4.93  
 3rd Qu.:1980   3rd Qu.: 9.115   3rd Qu.: 5.685   3rd Qu.: 5.83  
 Max.   :1990   Max.   :18.080   Max.   :11.960   Max.   :13.02  
     OPBPC             OPRC           OPSLAKE           BSAAM       
 Min.   : 4.050   Min.   : 4.350   Min.   : 4.600   Min.   : 41785  
 1st Qu.: 7.975   1st Qu.: 7.875   1st Qu.: 8.705   1st Qu.: 59857  
 Median : 9.550   Median :11.110   Median :12.140   Median : 69177  
 Mean   :12.836   Mean   :12.002   Mean   :13.522   Mean   : 77756  
 3rd Qu.:16.545   3rd Qu.:14.975   3rd Qu.:16.920   3rd Qu.: 92206  
 Max.   :43.370   Max.   :24.850   Max.   :33.070   Max.   :146345  
Code
pairs(~APMAM+APSAB+APSLAKE+OPBPC+OPRC+OPSLAKE+BSAAM, data=water)

The sctterplot martrix clearly demonstrates that there are positive correlations between any two sites.

Question 4

Code
data("Rateprof")
summary(Rateprof)
    gender       numYears        numRaters       numCourses     pepper   
 female:159   Min.   : 1.000   Min.   :10.00   Min.   : 1.000   no :320  
 male  :207   1st Qu.: 6.000   1st Qu.:15.00   1st Qu.: 3.000   yes: 46  
              Median :10.000   Median :24.00   Median : 4.000            
              Mean   : 8.347   Mean   :28.58   Mean   : 4.251            
              3rd Qu.:11.000   3rd Qu.:37.00   3rd Qu.: 5.000            
              Max.   :11.000   Max.   :86.00   Max.   :12.000            
                                                                         
    discipline          dept        quality       helpfulness   
 Hum     :134   English   : 49   Min.   :1.409   Min.   :1.364  
 SocSci  : 66   Math      : 34   1st Qu.:2.936   1st Qu.:3.069  
 STEM    :103   Biology   : 20   Median :3.612   Median :3.662  
 Pre-prof: 63   Chemistry : 20   Mean   :3.575   Mean   :3.631  
                Psychology: 20   3rd Qu.:4.250   3rd Qu.:4.351  
                Spanish   : 20   Max.   :4.981   Max.   :5.000  
                (Other)   :203                                  
    clarity         easiness     raterInterest     sdQuality      
 Min.   :1.333   Min.   :1.391   Min.   :1.098   Min.   :0.09623  
 1st Qu.:2.871   1st Qu.:2.548   1st Qu.:2.934   1st Qu.:0.87508  
 Median :3.600   Median :3.148   Median :3.305   Median :1.15037  
 Mean   :3.525   Mean   :3.135   Mean   :3.310   Mean   :1.05610  
 3rd Qu.:4.214   3rd Qu.:3.692   3rd Qu.:3.692   3rd Qu.:1.28730  
 Max.   :5.000   Max.   :4.900   Max.   :4.909   Max.   :1.67739  
                                                                  
 sdHelpfulness      sdClarity        sdEasiness     sdRaterInterest 
 Min.   :0.0000   Min.   :0.0000   Min.   :0.3162   Min.   :0.3015  
 1st Qu.:0.9902   1st Qu.:0.9085   1st Qu.:0.9045   1st Qu.:1.0848  
 Median :1.2860   Median :1.1712   Median :1.0247   Median :1.2167  
 Mean   :1.1719   Mean   :1.0970   Mean   :1.0196   Mean   :1.1965  
 3rd Qu.:1.4365   3rd Qu.:1.3328   3rd Qu.:1.1485   3rd Qu.:1.3326  
 Max.   :1.8091   Max.   :1.8091   Max.   :1.6293   Max.   :1.7246  
                                                                    
Code
pairs(~quality+helpfulness+clarity+easiness+raterInterest, data=Rateprof)

The sccatterplot matrix suggests that there are positive correlations between quality and helpfulness, helpfulness and clarity, quality and clarity. It also indicates that there is no strong correlation between the rest of them.

Question 5

Code
data("student.survey")
summary(student.survey)
      subj       ge           ag              hi              co       
 Min.   : 1.00   f:31   Min.   :22.00   Min.   :2.000   Min.   :2.600  
 1st Qu.:15.75   m:29   1st Qu.:24.00   1st Qu.:3.000   1st Qu.:3.175  
 Median :30.50          Median :26.50   Median :3.350   Median :3.500  
 Mean   :30.50          Mean   :29.17   Mean   :3.308   Mean   :3.453  
 3rd Qu.:45.25          3rd Qu.:31.00   3rd Qu.:3.625   3rd Qu.:3.725  
 Max.   :60.00          Max.   :71.00   Max.   :4.000   Max.   :4.000  
                                                                       
       dh             dr               tv               sp        
 Min.   :   0   Min.   : 0.200   Min.   : 0.000   Min.   : 0.000  
 1st Qu.: 205   1st Qu.: 1.450   1st Qu.: 3.000   1st Qu.: 3.000  
 Median : 640   Median : 2.000   Median : 6.000   Median : 5.000  
 Mean   :1232   Mean   : 3.818   Mean   : 7.267   Mean   : 5.483  
 3rd Qu.:1350   3rd Qu.: 5.000   3rd Qu.:10.000   3rd Qu.: 7.000  
 Max.   :8000   Max.   :20.000   Max.   :37.000   Max.   :16.000  
                                                                  
       ne               ah             ve          pa    
 Min.   : 0.000   Min.   : 0.000   Mode :logical   d:21  
 1st Qu.: 2.000   1st Qu.: 0.000   FALSE:60        i:24  
 Median : 3.000   Median : 0.500                   r:15  
 Mean   : 4.083   Mean   : 1.433                         
 3rd Qu.: 5.250   3rd Qu.: 2.000                         
 Max.   :14.000   Max.   :11.000                         
                                                         
                     pi                re         ab              aa         
 very liberal         : 8   never       :15   Mode :logical   Mode :logical  
 liberal              :24   occasionally:29   FALSE:60        FALSE:59       
 slightly liberal     : 6   most weeks  : 7                   NA's :1        
 moderate             :10   every week  : 9                                  
 slightly conservative: 6                                                    
 conservative         : 4                                                    
 very conservative    : 2                                                    
     ld         
 Mode :logical  
 FALSE:44       
 NA's :16       
                
                
                
                
Code
# (i)
lm_ideology_religiosity <-lm(as.numeric(pi)~as.numeric(re), data=student.survey)
summary(lm_ideology_religiosity)

Call:
lm(formula = as.numeric(pi) ~ as.numeric(re), data = student.survey)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.81243 -0.87160  0.09882  1.12840  3.09882 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)      0.9308     0.4252   2.189   0.0327 *  
as.numeric(re)   0.9704     0.1792   5.416 1.22e-06 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.345 on 58 degrees of freedom
Multiple R-squared:  0.3359,    Adjusted R-squared:  0.3244 
F-statistic: 29.34 on 1 and 58 DF,  p-value: 1.221e-06
Code
# ii
lm_hi_tv <- lm(hi~tv, data=student.survey)
summary(lm_hi_tv)

Call:
lm(formula = hi ~ tv, data = student.survey)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.2583 -0.2456  0.0417  0.3368  0.7051 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  3.441353   0.085345  40.323   <2e-16 ***
tv          -0.018305   0.008658  -2.114   0.0388 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.4467 on 58 degrees of freedom
Multiple R-squared:  0.07156,   Adjusted R-squared:  0.05555 
F-statistic: 4.471 on 1 and 58 DF,  p-value: 0.03879
Code
ggplot(student.survey, aes(x=re, fill=pi))+
  geom_bar(position="fill")

Code
ggplot(student.survey, aes(x=tv, y=log(hi)))+
   geom_smooth(method="lm")
`geom_smooth()` using formula = 'y ~ x'

For the first regression analysis, the people who are very conservative come to church every week. By contrast, the people who are liberal or very liberal come to church rarely. It indicates the close relationship between political ideology and religion.

For the second regression analysis, the graphic demonstrates that students who spent more time watching TV achieve lower gpa. In other words, there is a negative association between gpa and hours of watching TV.