import the dataset

good <- read.csv("good.csv")
good <- na.omit(good[,c("CHID",
                        "mathraw97",
                        "AGE97",
                        "faminc97",
                        "bthwht",
                        "WICpreg",
                        "HOME97")])
attach(good)

Descriptive Statistics

summary(good[,c("mathraw97",
                "AGE97",
                "faminc97",
                "bthwht",
                "WICpreg",
                "HOME97")])
##    mathraw97         AGE97           faminc97          bthwht     
##  Min.   : 0.00   Min.   : 3.000   Min.   :     0   Min.   :0.000  
##  1st Qu.:15.00   1st Qu.: 5.000   1st Qu.: 20686   1st Qu.:0.000  
##  Median :36.00   Median : 7.000   Median : 40705   Median :0.000  
##  Mean   :35.83   Mean   : 7.395   Mean   : 52060   Mean   :0.406  
##  3rd Qu.:54.00   3rd Qu.:10.000   3rd Qu.: 67477   3rd Qu.:1.000  
##  Max.   :98.00   Max.   :13.000   Max.   :784611   Max.   :1.000  
##     WICpreg           HOME97     
##  Min.   :0.0000   Min.   : 7.90  
##  1st Qu.:0.0000   1st Qu.:18.00  
##  Median :0.0000   Median :20.50  
##  Mean   :0.4167   Mean   :20.19  
##  3rd Qu.:1.0000   3rd Qu.:22.40  
##  Max.   :1.0000   Max.   :27.00
sd(mathraw97)
## [1] 22.28801
sd(AGE97)
## [1] 2.922137
sd(faminc97)
## [1] 53175.04
sd(bthwht)
## [1] 0.4911999
sd(WICpreg)
## [1] 0.4931412
sd(HOME97)
## [1] 3.068104
table(bthwht)
## bthwht
##    0    1 
## 1213  829
table(WICpreg)
## WICpreg
##    0    1 
## 1191  851
cor(good[,c("AGE97",
            "faminc97",
            "bthwht",
            "WICpreg",
            "HOME97")], use="complete.obs")
##                AGE97    faminc97      bthwht     WICpreg      HOME97
## AGE97     1.00000000  0.05414526  0.20405292 -0.08530692  0.19893080
## faminc97  0.05414526  1.00000000 -0.10282942 -0.38393247  0.39448347
## bthwht    0.20405292 -0.10282942  1.00000000  0.12644926 -0.09637508
## WICpreg  -0.08530692 -0.38393247  0.12644926  1.00000000 -0.40249151
## HOME97    0.19893080  0.39448347 -0.09637508 -0.40249151  1.00000000

Build the multiple regression analysis.

good1 <- good[,c("mathraw97",
                 "AGE97",
                 "faminc97",
                 "bthwht",
                 "WICpreg")]
model1 <- lm(mathraw97 ~ AGE97+faminc97+bthwht+WICpreg, data=good1)

Diagnostics & Corrections

Check for Linearity

Check linearity with scatter plot matrix.

pairs(good1, panel=panel.smooth)

Examine the relationship:

  1. AGE97 & mathraw97
ggplot(good1, aes(x=AGE97, y=mathraw97)) +
  geom_point(size=0.6) +
  xlab("Respondent's Age") +
  ylab("Math Test Score") +
  theme_bw() +
  geom_smooth(method="loess")
## `geom_smooth()` using formula 'y ~ x'

2. faminc97 & mathraw97

ggplot(good1, aes(x=faminc97, y=mathraw97)) +
  geom_point(size=0.6) +
  xlab("Respondent's Family Income") +
  ylab("Math Test Score") +
  theme_bw() +
  geom_smooth(method="loess")
## `geom_smooth()` using formula 'y ~ x'

Check for Homoscedasticity

plot(model1)  #enter to get to residual plot 

1. model residuals

good.res<-resid(model1)
  1. fitted values
fitted.res<-fitted(model1)
plot(fitted.res,good.res)
abline(0, 0, lty=2, col="grey") 
lines(lowess(good.res ~ fitted.res), col="red")

Check for Normality

  1. AGE97
#probability plots
hist(AGE97, freq=FALSE)
lines(density(na.omit(AGE97)))

#quantile-quantile plots
qqnorm(AGE97)
qqline(AGE97,col="red")

2. faminc97

#probability plots
hist(faminc97,freq=FALSE)
lines(density(faminc97))

#quantile-quantile plots
qqnorm(faminc97)
qqline(faminc97,col="red")

3. Residuals

hist(good.res,10)

boxplot(good.res,main="Boxplot of residuals") 

Check for Omitted Relevant Variables

Using Added Variable Plots

good2 <- good[c("mathraw97",
                "AGE97",
                "faminc97",
                "bthwht",
                "WICpreg",
                "HOME97")]
# mathraw97
model_math <- lm(mathraw97 ~ AGE97+faminc97+bthwht+WICpreg, data=good2)
# HOME97
model_home <- lm(HOME97 ~ AGE97+faminc97+bthwht+WICpreg, data=good2)

Compare plotted residuals.

plot(density(resid(model_math)))

plot(density(resid(model_home)))

qqnorm(resid(model_math))
qqline(resid(model_math), col="red")

qqnorm(resid(model_home))
qqline(resid(model_home), col="red")

plot(model_home$residuals, model_math$residuals)
abline(lm(model_math$residuals ~ model_home$residuals), col="red")
lines(lowess(model_math$residuals ~ model_home$residuals), col="blue")

Data Transformation

  1. Centering AGE97
#Create a new centered AGE97 variable.
good2$AGE97c <- good2$AGE97-mean(good2$AGE97)
#Create a new centered AGE97 squared variable
good2$AGE97c2 <- (good2$AGE97c**2)
  1. Log-Transform faminc97
good2$logfaminc <- ifelse(good2$faminc97 <= 1, 0, 
                          ifelse(good2$faminc97 > 1, log(good2$faminc97), NA))

Create a new model after transformation.

model2 <- lm(mathraw97 ~ AGE97c+AGE97c2+logfaminc+bthwht+WICpreg+HOME97, data=good2)

Extract the model fitted values and residuals.

pred1 <- as.data.frame(model2$fitted.values)
pred1$residuals <- (model2$residuals)

Produce a scatter plot of residuals against IVs.

plot(good2$logfaminc, pred1$residuals)
abline(lm(pred1$residuals ~ good2$logfaminc), col="red")  
lines(lowess(good2$logfaminc, pred1$residuals), col="blue")

plot(good2$HOME97, pred1$residuals)
abline(lm(pred1$residuals~good2$HOME97), col="red")  
lines(lowess(good2$HOME97, pred1$residuals), col="blue")

summary(model2)
## 
## Call:
## lm(formula = mathraw97 ~ AGE97c + AGE97c2 + logfaminc + bthwht + 
##     WICpreg + HOME97, data = good2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -41.070  -4.582   0.050   4.973  28.446 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 18.14896    2.00751   9.041  < 2e-16 ***
## AGE97c       6.90390    0.06400 107.874  < 2e-16 ***
## AGE97c2     -0.08536    0.02481  -3.441 0.000592 ***
## logfaminc    0.62995    0.17378   3.625 0.000296 ***
## bthwht      -1.59559    0.38854  -4.107 4.17e-05 ***
## WICpreg     -2.26227    0.41342  -5.472 5.00e-08 ***
## HOME97       0.66547    0.06871   9.685  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.043 on 2035 degrees of freedom
## Multiple R-squared:  0.8701, Adjusted R-squared:  0.8698 
## F-statistic:  2273 on 6 and 2035 DF,  p-value: < 2.2e-16

Outlier Examination

good3 <- good

good3$AGE97c <- good3$AGE97-mean(good3$AGE97)
good3$AGE97c2 <- (good3$AGE97c**2)
good3$logfaminc <- ifelse(good3$faminc97 <= 1, 0, 
                          ifelse(good3$faminc97 > 1, log(good3$faminc97), NA))

model3 <- lm(mathraw97 ~ AGE97c+AGE97c2+logfaminc+bthwht+WICpreg+HOME97, data=good3)

Create a new data frame for outlier information.

outliers <- good3[,c("CHID",
                     "mathraw97",
                     "AGE97c",
                     "AGE97c2",
                     "logfaminc",
                     "bthwht",
                     "WICpreg",
                     "HOME97")]
outliers$r <- rstudent(model3)
outliers$lev <- hatvalues(model3)
outliers$cd <- cooks.distance(model3)
outliers$dffit <- dffits(model3)

Create a separate data frame with the dfbetas.

dfbetaR <- dfbetas(model3)

Check the order of the variable names.

colnames(dfbetaR)
## [1] "(Intercept)" "AGE97c"      "AGE97c2"     "logfaminc"   "bthwht"     
## [6] "WICpreg"     "HOME97"

Assign new variable names.

colnames(dfbetaR) <- c("int_dfb","agec_dfb","agec2_dfb",
                       "income_dfb","bthwht_dfb",
                       "wic_dfb","home_dfb")
outliers<-cbind(outliers,dfbetaR)
head(outliers)
##    CHID mathraw97    AGE97c   AGE97c2 logfaminc bthwht WICpreg HOME97
## 3  4041         8 -3.394711 11.524063  9.038364      1       1   15.1
## 8  4180        58  4.605289 21.208686  8.670482      1       0   19.1
## 9  5032        75  4.605289 21.208686 11.397620      1       1   21.0
## 11 6034        70  5.605289 31.419264 11.652700      1       0   23.1
## 12 7041        11 -1.394711  1.945219 10.111904      1       1   13.5
## 13 7044         6 -3.394711 11.524063 10.111904      1       1   16.0
##             r         lev           cd        dffit       int_dfb     agec_dfb
## 3   0.2971785 0.003148028 3.986019e-05  0.016700189  5.558141e-03 -0.007379273
## 8  -0.8363879 0.005351258 5.377343e-04 -0.061348060 -3.386257e-02 -0.023928782
## 9   1.1927873 0.004774372 9.748380e-04  0.082615273 -4.073782e-02  0.034819061
## 11 -0.6590167 0.007067356 4.417257e-04 -0.055598746  1.451809e-02 -0.020079938
## 12 -1.1029433 0.004731514 8.260799e-04 -0.076047186 -1.519217e-02  0.004354774
## 13 -0.1105540 0.002990600 5.239874e-06 -0.006054859  9.265463e-05  0.002826915
##        agec2_dfb   income_dfb   bthwht_dfb      wic_dfb     home_dfb
## 3   0.0004813350 -0.002299047  0.007698965  0.002097753 -0.005406757
## 8  -0.0251478015  0.033696409 -0.009780305  0.029116906  0.003958870
## 9   0.0368936152  0.033335099  0.011082923  0.044719278  0.006376854
## 11 -0.0408258323 -0.008100480 -0.003812535  0.004392093 -0.006192089
## 12  0.0338562674 -0.023771850 -0.032929307 -0.010787744  0.049811718
## 13 -0.0001619237 -0.001433873 -0.002981657 -0.001720253  0.002001586
  1. Discrepency
#abs(standardized resid) > 2 (or 3)
plot(outliers$r, 
     xlab="Index",
     ylab="studentized residuals",
     pch=19)
#Add fit lines.
abline(lm(r~CHID, data=outliers), col="red")
abline(h=2, col="blue")
abline(h=-2, col="blue")
abline(h=3, col="green")
abline(h=-3, col="green")

rstudent1 <- subset(outliers,abs(r)>3)
View(rstudent1)
  1. Leverage
#leverage > (2k+2)/N 
plot(outliers$lev, 
     xlab="Index",
     ylab="leverage",
     pch=19)
#Add fit lines.
abline(lm(lev~CHID, data=outliers), col="red")
abline(h=0.007, col="blue")
abline(h=0.01, col="green")

leverage1 <- subset(outliers,lev> .01)
View(leverage1)
  1. Influence (Cook’s D) Cook’s D > 4/N
model4 <- lm(mathraw97 ~ AGE97c+AGE97c2+logfaminc+bthwht+WICpreg+HOME97, data=subset(outliers, cd<(4/2042)))
summary(model4)
## 
## Call:
## lm(formula = mathraw97 ~ AGE97c + AGE97c2 + logfaminc + bthwht + 
##     WICpreg + HOME97, data = subset(outliers, cd < (4/2042)))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.5466  -4.4276  -0.0635   4.4143  20.6705 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 15.71109    1.98034   7.934 3.58e-15 ***
## AGE97c       7.06038    0.05550 127.204  < 2e-16 ***
## AGE97c2     -0.05990    0.02157  -2.777  0.00554 ** 
## logfaminc    1.04998    0.18251   5.753 1.02e-08 ***
## bthwht      -1.50088    0.33155  -4.527 6.35e-06 ***
## WICpreg     -1.75488    0.36510  -4.807 1.65e-06 ***
## HOME97       0.55821    0.06080   9.181  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.695 on 1927 degrees of freedom
## Multiple R-squared:  0.9072, Adjusted R-squared:  0.9069 
## F-statistic:  3139 on 6 and 1927 DF,  p-value: < 2.2e-16

Maybe our critical cooks.d value of 4/2042 is too strict.

large_cd <- subset(outliers, cd >(4/2042))

Next, produce a histogram and print the sample quantiles of our Cook’s D outlier measure to the console

library(Hmisc)
## 载入需要的程辑包:lattice
## 载入需要的程辑包:survival
## 载入需要的程辑包:Formula
## 
## 载入程辑包:'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
describe(large_cd$cd)
## large_cd$cd 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      108        0      108        1 0.004932 0.004036 0.002049 0.002111 
##      .25      .50      .75      .90      .95 
## 0.002421 0.003123 0.004684 0.007934 0.009792 
## 
## lowest : 0.001970330 0.001993940 0.002011764 0.002031514 0.002041046
## highest: 0.016160333 0.020026022 0.026734741 0.034488956 0.047132584
##                                                                          
## Value      0.0020 0.0025 0.0030 0.0035 0.0040 0.0045 0.0050 0.0055 0.0070
## Frequency      20     19     18     12      8      6      3      4      2
## Proportion  0.185  0.176  0.167  0.111  0.074  0.056  0.028  0.037  0.019
##                                                                          
## Value      0.0075 0.0080 0.0085 0.0100 0.0160 0.0200 0.0265 0.0345 0.0470
## Frequency       2      4      3      2      1      1      1      1      1
## Proportion  0.019  0.037  0.028  0.019  0.009  0.009  0.009  0.009  0.009
## 
## For the frequency table, variable is rounded to the nearest 0.0005
hist(large_cd$cd)

quantile(large_cd$cd, probs = seq(0, 1, 0.05))
##          0%          5%         10%         15%         20%         25% 
## 0.001970330 0.002048829 0.002111474 0.002213343 0.002314805 0.002421454 
##         30%         35%         40%         45%         50%         55% 
## 0.002562883 0.002715239 0.002919652 0.003039767 0.003123255 0.003386273 
##         60%         65%         70%         75%         80%         85% 
## 0.003634526 0.003769098 0.004139253 0.004683894 0.005371739 0.007008524 
##         90%         95%        100% 
## 0.007934118 0.009792413 0.047132584

Take a closer look at these observations

large_cd2<-subset(outliers, cd > 0.007008524)
#examine observations in this object
View(large_cd2)

Adjusted Model

outliers2<-subset(outliers, cd < 0.007008524)

#Now you can compare the results produced from our model1 model with the results
#produced with our new dataset.
model5 <- lm(mathraw97 ~ AGE97c+AGE97c2+logfaminc+bthwht+WICpreg+HOME97, data=outliers2)

summary(model5)
## 
## Call:
## lm(formula = mathraw97 ~ AGE97c + AGE97c2 + logfaminc + bthwht + 
##     WICpreg + HOME97, data = outliers2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -27.4896  -4.6052   0.0335   4.8690  27.9359 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 14.88466    2.10568   7.069 2.14e-12 ***
## AGE97c       6.93708    0.06172 112.394  < 2e-16 ***
## AGE97c2     -0.07857    0.02401  -3.273  0.00108 ** 
## logfaminc    0.90921    0.19071   4.768 2.00e-06 ***
## bthwht      -1.59216    0.37373  -4.260 2.14e-05 ***
## WICpreg     -1.91186    0.40395  -4.733 2.37e-06 ***
## HOME97       0.67532    0.06688  10.097  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.714 on 2018 degrees of freedom
## Multiple R-squared:   0.88,  Adjusted R-squared:  0.8796 
## F-statistic:  2466 on 6 and 2018 DF,  p-value: < 2.2e-16
detach(good)