good <- read.csv("good.csv")
good <- na.omit(good[,c("CHID",
"mathraw97",
"AGE97",
"faminc97",
"bthwht",
"WICpreg",
"HOME97")])
attach(good)
summary(good[,c("mathraw97",
"AGE97",
"faminc97",
"bthwht",
"WICpreg",
"HOME97")])
## mathraw97 AGE97 faminc97 bthwht
## Min. : 0.00 Min. : 3.000 Min. : 0 Min. :0.000
## 1st Qu.:15.00 1st Qu.: 5.000 1st Qu.: 20686 1st Qu.:0.000
## Median :36.00 Median : 7.000 Median : 40705 Median :0.000
## Mean :35.83 Mean : 7.395 Mean : 52060 Mean :0.406
## 3rd Qu.:54.00 3rd Qu.:10.000 3rd Qu.: 67477 3rd Qu.:1.000
## Max. :98.00 Max. :13.000 Max. :784611 Max. :1.000
## WICpreg HOME97
## Min. :0.0000 Min. : 7.90
## 1st Qu.:0.0000 1st Qu.:18.00
## Median :0.0000 Median :20.50
## Mean :0.4167 Mean :20.19
## 3rd Qu.:1.0000 3rd Qu.:22.40
## Max. :1.0000 Max. :27.00
sd(mathraw97)
## [1] 22.28801
sd(AGE97)
## [1] 2.922137
sd(faminc97)
## [1] 53175.04
sd(bthwht)
## [1] 0.4911999
sd(WICpreg)
## [1] 0.4931412
sd(HOME97)
## [1] 3.068104
table(bthwht)
## bthwht
## 0 1
## 1213 829
table(WICpreg)
## WICpreg
## 0 1
## 1191 851
cor(good[,c("AGE97",
"faminc97",
"bthwht",
"WICpreg",
"HOME97")], use="complete.obs")
## AGE97 faminc97 bthwht WICpreg HOME97
## AGE97 1.00000000 0.05414526 0.20405292 -0.08530692 0.19893080
## faminc97 0.05414526 1.00000000 -0.10282942 -0.38393247 0.39448347
## bthwht 0.20405292 -0.10282942 1.00000000 0.12644926 -0.09637508
## WICpreg -0.08530692 -0.38393247 0.12644926 1.00000000 -0.40249151
## HOME97 0.19893080 0.39448347 -0.09637508 -0.40249151 1.00000000
Build the multiple regression analysis.
good1 <- good[,c("mathraw97",
"AGE97",
"faminc97",
"bthwht",
"WICpreg")]
model1 <- lm(mathraw97 ~ AGE97+faminc97+bthwht+WICpreg, data=good1)
Check linearity with scatter plot matrix.
pairs(good1, panel=panel.smooth)
Examine the relationship:
ggplot(good1, aes(x=AGE97, y=mathraw97)) +
geom_point(size=0.6) +
xlab("Respondent's Age") +
ylab("Math Test Score") +
theme_bw() +
geom_smooth(method="loess")
## `geom_smooth()` using formula 'y ~ x'
2. faminc97 & mathraw97
ggplot(good1, aes(x=faminc97, y=mathraw97)) +
geom_point(size=0.6) +
xlab("Respondent's Family Income") +
ylab("Math Test Score") +
theme_bw() +
geom_smooth(method="loess")
## `geom_smooth()` using formula 'y ~ x'
plot(model1) #enter to get to residual plot
1. model residuals
good.res<-resid(model1)
fitted.res<-fitted(model1)
plot(fitted.res,good.res)
abline(0, 0, lty=2, col="grey")
lines(lowess(good.res ~ fitted.res), col="red")
#probability plots
hist(AGE97, freq=FALSE)
lines(density(na.omit(AGE97)))
#quantile-quantile plots
qqnorm(AGE97)
qqline(AGE97,col="red")
2. faminc97
#probability plots
hist(faminc97,freq=FALSE)
lines(density(faminc97))
#quantile-quantile plots
qqnorm(faminc97)
qqline(faminc97,col="red")
3. Residuals
hist(good.res,10)
boxplot(good.res,main="Boxplot of residuals")
Using Added Variable Plots
good2 <- good[c("mathraw97",
"AGE97",
"faminc97",
"bthwht",
"WICpreg",
"HOME97")]
# mathraw97
model_math <- lm(mathraw97 ~ AGE97+faminc97+bthwht+WICpreg, data=good2)
# HOME97
model_home <- lm(HOME97 ~ AGE97+faminc97+bthwht+WICpreg, data=good2)
Compare plotted residuals.
plot(density(resid(model_math)))
plot(density(resid(model_home)))
qqnorm(resid(model_math))
qqline(resid(model_math), col="red")
qqnorm(resid(model_home))
qqline(resid(model_home), col="red")
plot(model_home$residuals, model_math$residuals)
abline(lm(model_math$residuals ~ model_home$residuals), col="red")
lines(lowess(model_math$residuals ~ model_home$residuals), col="blue")
#Create a new centered AGE97 variable.
good2$AGE97c <- good2$AGE97-mean(good2$AGE97)
#Create a new centered AGE97 squared variable
good2$AGE97c2 <- (good2$AGE97c**2)
good2$logfaminc <- ifelse(good2$faminc97 <= 1, 0,
ifelse(good2$faminc97 > 1, log(good2$faminc97), NA))
Create a new model after transformation.
model2 <- lm(mathraw97 ~ AGE97c+AGE97c2+logfaminc+bthwht+WICpreg+HOME97, data=good2)
Extract the model fitted values and residuals.
pred1 <- as.data.frame(model2$fitted.values)
pred1$residuals <- (model2$residuals)
Produce a scatter plot of residuals against IVs.
plot(good2$logfaminc, pred1$residuals)
abline(lm(pred1$residuals ~ good2$logfaminc), col="red")
lines(lowess(good2$logfaminc, pred1$residuals), col="blue")
plot(good2$HOME97, pred1$residuals)
abline(lm(pred1$residuals~good2$HOME97), col="red")
lines(lowess(good2$HOME97, pred1$residuals), col="blue")
summary(model2)
##
## Call:
## lm(formula = mathraw97 ~ AGE97c + AGE97c2 + logfaminc + bthwht +
## WICpreg + HOME97, data = good2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -41.070 -4.582 0.050 4.973 28.446
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.14896 2.00751 9.041 < 2e-16 ***
## AGE97c 6.90390 0.06400 107.874 < 2e-16 ***
## AGE97c2 -0.08536 0.02481 -3.441 0.000592 ***
## logfaminc 0.62995 0.17378 3.625 0.000296 ***
## bthwht -1.59559 0.38854 -4.107 4.17e-05 ***
## WICpreg -2.26227 0.41342 -5.472 5.00e-08 ***
## HOME97 0.66547 0.06871 9.685 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.043 on 2035 degrees of freedom
## Multiple R-squared: 0.8701, Adjusted R-squared: 0.8698
## F-statistic: 2273 on 6 and 2035 DF, p-value: < 2.2e-16
good3 <- good
good3$AGE97c <- good3$AGE97-mean(good3$AGE97)
good3$AGE97c2 <- (good3$AGE97c**2)
good3$logfaminc <- ifelse(good3$faminc97 <= 1, 0,
ifelse(good3$faminc97 > 1, log(good3$faminc97), NA))
model3 <- lm(mathraw97 ~ AGE97c+AGE97c2+logfaminc+bthwht+WICpreg+HOME97, data=good3)
Create a new data frame for outlier information.
outliers <- good3[,c("CHID",
"mathraw97",
"AGE97c",
"AGE97c2",
"logfaminc",
"bthwht",
"WICpreg",
"HOME97")]
outliers$r <- rstudent(model3)
outliers$lev <- hatvalues(model3)
outliers$cd <- cooks.distance(model3)
outliers$dffit <- dffits(model3)
Create a separate data frame with the dfbetas.
dfbetaR <- dfbetas(model3)
Check the order of the variable names.
colnames(dfbetaR)
## [1] "(Intercept)" "AGE97c" "AGE97c2" "logfaminc" "bthwht"
## [6] "WICpreg" "HOME97"
Assign new variable names.
colnames(dfbetaR) <- c("int_dfb","agec_dfb","agec2_dfb",
"income_dfb","bthwht_dfb",
"wic_dfb","home_dfb")
outliers<-cbind(outliers,dfbetaR)
head(outliers)
## CHID mathraw97 AGE97c AGE97c2 logfaminc bthwht WICpreg HOME97
## 3 4041 8 -3.394711 11.524063 9.038364 1 1 15.1
## 8 4180 58 4.605289 21.208686 8.670482 1 0 19.1
## 9 5032 75 4.605289 21.208686 11.397620 1 1 21.0
## 11 6034 70 5.605289 31.419264 11.652700 1 0 23.1
## 12 7041 11 -1.394711 1.945219 10.111904 1 1 13.5
## 13 7044 6 -3.394711 11.524063 10.111904 1 1 16.0
## r lev cd dffit int_dfb agec_dfb
## 3 0.2971785 0.003148028 3.986019e-05 0.016700189 5.558141e-03 -0.007379273
## 8 -0.8363879 0.005351258 5.377343e-04 -0.061348060 -3.386257e-02 -0.023928782
## 9 1.1927873 0.004774372 9.748380e-04 0.082615273 -4.073782e-02 0.034819061
## 11 -0.6590167 0.007067356 4.417257e-04 -0.055598746 1.451809e-02 -0.020079938
## 12 -1.1029433 0.004731514 8.260799e-04 -0.076047186 -1.519217e-02 0.004354774
## 13 -0.1105540 0.002990600 5.239874e-06 -0.006054859 9.265463e-05 0.002826915
## agec2_dfb income_dfb bthwht_dfb wic_dfb home_dfb
## 3 0.0004813350 -0.002299047 0.007698965 0.002097753 -0.005406757
## 8 -0.0251478015 0.033696409 -0.009780305 0.029116906 0.003958870
## 9 0.0368936152 0.033335099 0.011082923 0.044719278 0.006376854
## 11 -0.0408258323 -0.008100480 -0.003812535 0.004392093 -0.006192089
## 12 0.0338562674 -0.023771850 -0.032929307 -0.010787744 0.049811718
## 13 -0.0001619237 -0.001433873 -0.002981657 -0.001720253 0.002001586
#abs(standardized resid) > 2 (or 3)
plot(outliers$r,
xlab="Index",
ylab="studentized residuals",
pch=19)
#Add fit lines.
abline(lm(r~CHID, data=outliers), col="red")
abline(h=2, col="blue")
abline(h=-2, col="blue")
abline(h=3, col="green")
abline(h=-3, col="green")
rstudent1 <- subset(outliers,abs(r)>3)
View(rstudent1)
#leverage > (2k+2)/N
plot(outliers$lev,
xlab="Index",
ylab="leverage",
pch=19)
#Add fit lines.
abline(lm(lev~CHID, data=outliers), col="red")
abline(h=0.007, col="blue")
abline(h=0.01, col="green")
leverage1 <- subset(outliers,lev> .01)
View(leverage1)
model4 <- lm(mathraw97 ~ AGE97c+AGE97c2+logfaminc+bthwht+WICpreg+HOME97, data=subset(outliers, cd<(4/2042)))
summary(model4)
##
## Call:
## lm(formula = mathraw97 ~ AGE97c + AGE97c2 + logfaminc + bthwht +
## WICpreg + HOME97, data = subset(outliers, cd < (4/2042)))
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.5466 -4.4276 -0.0635 4.4143 20.6705
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.71109 1.98034 7.934 3.58e-15 ***
## AGE97c 7.06038 0.05550 127.204 < 2e-16 ***
## AGE97c2 -0.05990 0.02157 -2.777 0.00554 **
## logfaminc 1.04998 0.18251 5.753 1.02e-08 ***
## bthwht -1.50088 0.33155 -4.527 6.35e-06 ***
## WICpreg -1.75488 0.36510 -4.807 1.65e-06 ***
## HOME97 0.55821 0.06080 9.181 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.695 on 1927 degrees of freedom
## Multiple R-squared: 0.9072, Adjusted R-squared: 0.9069
## F-statistic: 3139 on 6 and 1927 DF, p-value: < 2.2e-16
Maybe our critical cooks.d value of 4/2042 is too strict.
large_cd <- subset(outliers, cd >(4/2042))
Next, produce a histogram and print the sample quantiles of our Cook’s D outlier measure to the console
library(Hmisc)
## 载入需要的程辑包:lattice
## 载入需要的程辑包:survival
## 载入需要的程辑包:Formula
##
## 载入程辑包:'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
describe(large_cd$cd)
## large_cd$cd
## n missing distinct Info Mean Gmd .05 .10
## 108 0 108 1 0.004932 0.004036 0.002049 0.002111
## .25 .50 .75 .90 .95
## 0.002421 0.003123 0.004684 0.007934 0.009792
##
## lowest : 0.001970330 0.001993940 0.002011764 0.002031514 0.002041046
## highest: 0.016160333 0.020026022 0.026734741 0.034488956 0.047132584
##
## Value 0.0020 0.0025 0.0030 0.0035 0.0040 0.0045 0.0050 0.0055 0.0070
## Frequency 20 19 18 12 8 6 3 4 2
## Proportion 0.185 0.176 0.167 0.111 0.074 0.056 0.028 0.037 0.019
##
## Value 0.0075 0.0080 0.0085 0.0100 0.0160 0.0200 0.0265 0.0345 0.0470
## Frequency 2 4 3 2 1 1 1 1 1
## Proportion 0.019 0.037 0.028 0.019 0.009 0.009 0.009 0.009 0.009
##
## For the frequency table, variable is rounded to the nearest 0.0005
hist(large_cd$cd)
quantile(large_cd$cd, probs = seq(0, 1, 0.05))
## 0% 5% 10% 15% 20% 25%
## 0.001970330 0.002048829 0.002111474 0.002213343 0.002314805 0.002421454
## 30% 35% 40% 45% 50% 55%
## 0.002562883 0.002715239 0.002919652 0.003039767 0.003123255 0.003386273
## 60% 65% 70% 75% 80% 85%
## 0.003634526 0.003769098 0.004139253 0.004683894 0.005371739 0.007008524
## 90% 95% 100%
## 0.007934118 0.009792413 0.047132584
Take a closer look at these observations
large_cd2<-subset(outliers, cd > 0.007008524)
#examine observations in this object
View(large_cd2)
outliers2<-subset(outliers, cd < 0.007008524)
#Now you can compare the results produced from our model1 model with the results
#produced with our new dataset.
model5 <- lm(mathraw97 ~ AGE97c+AGE97c2+logfaminc+bthwht+WICpreg+HOME97, data=outliers2)
summary(model5)
##
## Call:
## lm(formula = mathraw97 ~ AGE97c + AGE97c2 + logfaminc + bthwht +
## WICpreg + HOME97, data = outliers2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.4896 -4.6052 0.0335 4.8690 27.9359
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.88466 2.10568 7.069 2.14e-12 ***
## AGE97c 6.93708 0.06172 112.394 < 2e-16 ***
## AGE97c2 -0.07857 0.02401 -3.273 0.00108 **
## logfaminc 0.90921 0.19071 4.768 2.00e-06 ***
## bthwht -1.59216 0.37373 -4.260 2.14e-05 ***
## WICpreg -1.91186 0.40395 -4.733 2.37e-06 ***
## HOME97 0.67532 0.06688 10.097 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.714 on 2018 degrees of freedom
## Multiple R-squared: 0.88, Adjusted R-squared: 0.8796
## F-statistic: 2466 on 6 and 2018 DF, p-value: < 2.2e-16
detach(good)