HW2 Applied Questions: 1 Problem 6
HW2 Applied Questions: 1 Problem 6
HW2 Applied Questions: 1 Problem 6
魏羿晖 2020200732
1 Problem 6
1.1 a
setwd("D:/files/study/ISL")
auto = read.csv("Auto.csv", header = T, na.strings = "?")
auto = na.omit(auto)
library(MASS)
library(ISLR)
lm.fit = lm(mpg~horsepower, data = auto)
summary(lm.fit)
##
## Call:
## lm(formula = mpg ~ horsepower, data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.5710 -3.2592 -0.3435 2.7630 16.9240
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.935861 0.717499 55.66 <2e-16 ***
1
1 PROBLEM 6 2
1.2 b
plot(auto$horsepower, auto$mpg)
abline(lm.fit)
1 PROBLEM 6 3
40
auto$mpg
30
20
10
auto$horsepower
1.3 c
par(mfrow = c(2,2))
plot(lm.fit)
2 PROBLEM 7 4
Standardized residuals
Residuals vs Fitted Normal Q−Q
0 2 4
323 330323
15
Residuals
−3
5 10 15 20 25 30 −3 −2 −1 0 1 2 3
Standardized residuals
Scale−Location Residuals vs Leverage
323
330
334
117
14
9
2
1.0
−2
Cook's distance
0.0
残差图的拟合线为 U 型,可以看出残差与预测值存在相关关系,表明数据
具有非线性。Scale-Location 图表明残差项可能有着非恒定的方差。
2 Problem 7
2.1 a
auto$name = as.factor(auto$name)
pairs(auto)
2 PROBLEM 7 5
mpg
10
7
cylinders
3
displacement
100
horsepower
50
weight
1500
acceleration
10
70 82
year
origin
1.0
0 300
name
2.2 b
2.3 c
##
## Call:
## lm(formula = mpg ~ . - name, data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5903 -2.1565 -0.1169 1.8690 13.0604
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.218435 4.644294 -3.707 0.00024 ***
## cylinders -0.493376 0.323282 -1.526 0.12780
## displacement 0.019896 0.007515 2.647 0.00844 **
## horsepower -0.016951 0.013787 -1.230 0.21963
## weight -0.006474 0.000652 -9.929 < 2e-16 ***
## acceleration 0.080576 0.098845 0.815 0.41548
## year 0.750773 0.050973 14.729 < 2e-16 ***
## origin 1.426141 0.278136 5.127 4.67e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.328 on 384 degrees of freedom
2 PROBLEM 7 7
2.4 d
par(mfrow = c(2,2))
plot(lm.fit1) Standardized residuals
3323
327 327323
Residuals
26 326
2
5
−2
−10
10 15 20 25 30 35 −3 −2 −1 0 1 2 3
Standardized residuals
3323
32726 327 0.5
394
2
1.0
−2
14
Cook's distance
0.0
plot(predict(lm.fit1), rstudent(lm.fit1))
2 PROBLEM 7 8
rstudent(lm.fit1)
3
0
−3
10 15 20 25 30 35
predict(lm.fit1)
残差图的拟合线为 U 型,可以看出残差与预测值仍存在一定的相关关系。
观测点 14 为高杠杆点。存在异常大的离群点,因为一部分点的学生化残差
大于 3。
2.5 e
##
## Call:
## lm(formula = mpg ~ cylinders * displacement, data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.0432 -2.4308 -0.2263 2.2048 20.9051
##
2 PROBLEM 7 9
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 48.22040 2.34712 20.545 < 2e-16 ***
## cylinders -2.41838 0.53456 -4.524 8.08e-06 ***
## displacement -0.13436 0.01615 -8.321 1.50e-15 ***
## cylinders:displacement 0.01182 0.00207 5.711 2.24e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.454 on 388 degrees of freedom
## Multiple R-squared: 0.6769, Adjusted R-squared: 0.6744
## F-statistic: 271 on 3 and 388 DF, p-value: < 2.2e-16
summary(lm.fit3)
##
## Call:
## lm(formula = mpg ~ weight + displacement + displacement:weight,
## data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.8664 -2.4801 -0.3355 1.8071 17.9429
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.372e+01 1.940e+00 27.697 < 2e-16 ***
## weight -8.931e-03 8.474e-04 -10.539 < 2e-16 ***
## displacement -7.831e-02 1.131e-02 -6.922 1.85e-11 ***
## weight:displacement 1.744e-05 2.789e-06 6.253 1.06e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.097 on 388 degrees of freedom
2 PROBLEM 7 10
选取相关系数最大的两组变量分别进行有交互项的线性回归,均存在统计
显著的交互作用。
2.6 f
##
## Call:
## lm(formula = mpg ~ log(displacement) + sqrt(weight) + year +
## I(year^2), data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.9700 -1.8228 -0.0033 1.6218 13.5212
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 434.93957 75.22099 5.782 1.52e-08 ***
## log(displacement) -3.42454 0.90388 -3.789 0.000176 ***
## sqrt(weight) -0.51231 0.06243 -8.206 3.43e-15 ***
## year -10.40999 1.97815 -5.262 2.36e-07 ***
## I(year^2) 0.07339 0.01301 5.641 3.26e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.123 on 387 degrees of freedom
## Multiple R-squared: 0.8416, Adjusted R-squared: 0.8399
## F-statistic: 513.9 on 4 and 387 DF, p-value: < 2.2e-16
3 PROBLEM 8 11
par(mfrow=c(2,2))
plot(lm.fit4)
Standardized residuals
Residuals vs Fitted Normal Q−Q
5 15
323 245323
Residuals
245
327 327
2
−2
−10
10 15 20 25 30 35 −3 −2 −1 0 1 2 3
Standardized residuals
Scale−Location Residuals vs Leverage
0.0 1.0 2.0
245323
327 4
0 387
Cook's
112distance
335
−4
选取 lm.fit1 中统计显著的几个预测变量,进行不同变换后的回归结果仍是
统计显著的。残差图中的点几乎没有规律,表明模型对数据的拟合度较好。
杠杆图中点的杠杆值均较小。但 Normal Q-Q 图则表示残差可能不是正态
分布的。
3 Problem 8
3.1 a
summary(Weekly)
Year
1990
Lag1
−15
Lag2
−15
Lag3
−15
Lag4
−15
Lag5
−15
6
Volume
0
Today
−15
相关系数矩阵和散点图表示,大部分预测变量之间的相关系数接近于 0,
前几天的投资回报率与当天的回报率相关性很小。唯一的特例是 year 和
volume。可以看到 volume 随着 year 的增加逐渐增长。
3.2 b
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
3 PROBLEM 8 14
3.3 c
table(glm.pred, Weekly$Direction)
##
## glm.pred Down Up
## Down 54 48
## Up 430 557
mean(glm.pred == Weekly$Direction)
## [1] 0.5610652
3.4 d 逻辑斯蒂回归
##
## Call:
## glm(formula = Direction ~ Lag2, family = binomial, data = Weekly,
## subset = tr)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.536 -1.264 1.021 1.091 1.368
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.20326 0.06428 3.162 0.00157 **
3 PROBLEM 8 16
##
## glm.pred1 Down Up
## Down 9 5
## Up 34 56
mean(glm.pred1 == Weekly$Direction[!tr])
## [1] 0.625
3.5 e LDA
##
## lda.class Down Up
## Down 9 5
## Up 34 56
mean(lda.class == Weekly$Direction[!tr])
## [1] 0.625
3.6 f QDA
##
## qda.class Down Up
## Down 0 0
## Up 43 61
mean(qda.class == Weekly$Direction[!tr])
## [1] 0.5865385
3.7 g KNN: K = 1
library(class)
train.X = as.matrix(Weekly$Lag2[tr])
test.X = as.matrix(Weekly$Lag2[!tr])
train.Direction = Weekly$Direction[tr]
set.seed(1)
knn.pred = knn(train.X, test.X, train.Direction, k = 1)
table(knn.pred, Weekly$Direction[!tr])
##
## knn.pred Down Up
## Down 21 30
3 PROBLEM 8 19
## Up 22 31
mean(knn.pred == Weekly$Direction[!tr])
## [1] 0.5
3.8 i
3.8.1 KNN :K = 3
library(class)
train.X = as.matrix(Weekly$Lag2[tr])
test.X = as.matrix(Weekly$Lag2[!tr])
train.Direction = Weekly$Direction[tr]
set.seed(1)
knn.pred = knn(train.X, test.X, train.Direction, k = 3)
table(knn.pred, Weekly$Direction[!tr])
##
## knn.pred Down Up
## Down 16 20
## Up 27 41
mean(knn.pred == Weekly$Direction[!tr])
## [1] 0.5480769
3.8.2 KNN :K = 4
library(class)
train.X = as.matrix(Weekly$Lag2[tr])
3 PROBLEM 8 20
test.X = as.matrix(Weekly$Lag2[!tr])
train.Direction = Weekly$Direction[tr]
set.seed(1)
knn.pred = knn(train.X, test.X, train.Direction, k = 4)
table(knn.pred, Weekly$Direction[!tr])
##
## knn.pred Down Up
## Down 20 17
## Up 23 44
mean(knn.pred == Weekly$Direction[!tr])
## [1] 0.6153846
3.8.3 KNN :K = 5
library(class)
train.X = as.matrix(Weekly$Lag2[tr])
test.X = as.matrix(Weekly$Lag2[!tr])
train.Direction = Weekly$Direction[tr]
set.seed(1)
knn.pred = knn(train.X, test.X, train.Direction, k = 5)
table(knn.pred, Weekly$Direction[!tr])
##
## knn.pred Down Up
## Down 16 21
## Up 27 40
mean(knn.pred == Weekly$Direction[!tr])
## [1] 0.5384615
3 PROBLEM 8 21
3.8.4 KNN :K = 10
library(class)
train.X = as.matrix(Weekly$Lag2[tr])
test.X = as.matrix(Weekly$Lag2[!tr])
train.Direction = Weekly$Direction[tr]
set.seed(1)
knn.pred = knn(train.X, test.X, train.Direction, k = 10)
table(knn.pred, Weekly$Direction[!tr])
##
## knn.pred Down Up
## Down 17 21
## Up 26 40
mean(knn.pred == Weekly$Direction[!tr])
## [1] 0.5480769
3.8.5 调整后的逻辑斯蒂回归
##
## Call:
## glm(formula = Direction ~ Lag1 * Lag2, family = binomial, data = Weekly,
## subset = tr)
3 PROBLEM 8 22
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.573 -1.259 1.003 1.086 1.596
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.211419 0.064589 3.273 0.00106 **
## Lag1 -0.051505 0.030727 -1.676 0.09370 .
## Lag2 0.053471 0.029193 1.832 0.06700 .
## Lag1:Lag2 0.001921 0.007460 0.257 0.79680
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1354.7 on 984 degrees of freedom
## Residual deviance: 1346.9 on 981 degrees of freedom
## AIC: 1354.9
##
## Number of Fisher Scoring iterations: 4
glm.probs1 = predict(glm.fit1, te, type = "response")
glm.pred1 = rep("Down",length(glm.probs1))
glm.pred1[glm.probs1 > .5] = "Up"
table(glm.pred1, Weekly$Direction[!tr])
##
## glm.pred1 Down Up
## Down 7 8
## Up 36 53
mean(glm.pred1 == Weekly$Direction[!tr])
## [1] 0.5769231
3 PROBLEM 8 23
##
## lda.class Down Up
## Down 7 8
## Up 36 53
mean(lda.class == Weekly$Direction[!tr])
## [1] 0.5769231
3 PROBLEM 8 24
##
## qda.class Down Up
## Down 23 36
## Up 20 25
mean(qda.class == Weekly$Direction[!tr])
## [1] 0.4615385