setwd("C:/Users/CHEN/Desktop/金融论文")
knitr::opts_chunk$set(
cache = TRUE,
fig.align = 'center',
out.width = "100%",
message = FALSE,
warning = FALSE)
y <- c(0.77,0.79,0.89,1.03,0.85,0.86,0.91,1.01,1.02,1.05,0.96,1.07)
x1 <- c(26.7,26.2,27.2,26.8,24.1,24.0,23.6,22.1,22.2,21.4,20.3,19.1)
x2 <- c(5596,5539,5546,4131,5637,4266,2185,5360,4166,4496,2686,2627)
x3 <- c(0.916,0.808,0.670,0.648,0.621,0.605,0.513,0.494,0.484,0.478,0.471,0.457)
x4 <- c(9247,10428,12407,14163,15636,17476,18979 ,20653,22553,24579,26679,27904)
x5 <- c( 27.14,27.22,27.45,27.86, 28.17 ,28.56,29.07,30.11,30.75,31.88,32.76,32.84)
data <- data.frame(y,x1,x2,x3,x4,x5)
library(car)
model_lm <- lm(y~x1+x2+x3+x4+x5)
#模型详细结果
summary(model_lm)
##
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4 + x5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.077629 -0.030150 -0.001994 0.019373 0.081463
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.472e+00 1.600e+00 -0.921 0.393
## x1 5.608e-02 3.579e-02 1.567 0.168
## x2 1.739e-05 2.511e-05 0.693 0.514
## x3 -2.421e-01 7.531e-01 -0.321 0.759
## x4 2.588e-05 5.719e-05 0.453 0.667
## x5 2.284e-02 9.355e-02 0.244 0.815
##
## Residual standard error: 0.06326 on 6 degrees of freedom
## Multiple R-squared: 0.796, Adjusted R-squared: 0.626
## F-statistic: 4.682 on 5 and 6 DF, p-value: 0.04348
#VIF
vif(model_lm)
## x1 x2 x3 x4 x5
## 25.655976 2.773748 33.292128 350.918621 109.064833
#变量x1-x5的相关系数
cor(data[,2:6])
## x1 x2 x3 x4 x5
## x1 1.0000000 0.6078049 0.8294011 -0.9641181 -0.9583671
## x2 0.6078049 1.0000000 0.6327106 -0.6853585 -0.6374646
## x3 0.8294011 0.6327106 1.0000000 -0.9148242 -0.8285551
## x4 -0.9641181 -0.6853585 -0.9148242 1.0000000 0.9792614
## x5 -0.9583671 -0.6374646 -0.8285551 0.9792614 1.0000000
#模型参数
coefficients(model_lm)
## (Intercept) x1 x2 x3 x4
## -1.472400e+00 5.608026e-02 1.739076e-05 -2.420773e-01 2.588390e-05
## x5
## 2.284343e-02
#模型参数的置信区间
confint(model_lm)
## 2.5 % 97.5 %
## (Intercept) -5.386303e+00 2.4415018030
## x1 -3.149962e-02 0.1436601343
## x2 -4.404728e-05 0.0000788288
## x3 -2.084833e+00 1.6006778850
## x4 -1.140532e-04 0.0001658210
## x5 -2.060604e-01 0.2517472888
#模型预测值
fitted(model_lm)
## 1 2 3 4 5 6 7 8
## 0.7598374 0.7893467 0.9354336 0.9485372 0.8750555 0.9060133 0.9202158 0.9629974
## 9 10 11 12
## 1.0140608 1.0546419 1.0376293 1.0062312
#模型残差值
residuals(model_lm)
## 1 2 3 4 5
## 0.0101626158 0.0006533057 -0.0454335859 0.0814628034 -0.0250555157
## 6 7 8 9 10
## -0.0460133119 -0.0102158087 0.0470026406 0.0059392015 -0.0046418619
## 11 12
## -0.0776292564 0.0637687738
#模型方差分析
anova(model_lm)
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## x1 1 0.049562 0.049562 12.3842 0.01253 *
## x2 1 0.001547 0.001547 0.3866 0.55697
## x3 1 0.027138 0.027138 6.7810 0.04044 *
## x4 1 0.015193 0.015193 3.7964 0.09928 .
## x5 1 0.000239 0.000239 0.0596 0.81522
## Residuals 6 0.024012 0.004002
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#AIC
AIC(model_lm)
## [1] -26.51462
从VIF猜测变量间有严重的多重共线性,计算相关系数后发现确实存在多重共线性,下面用step函数对变量进行筛选
model_lm.step <- step(model_lm, direction = "backward")
## Start: AIC=-62.57
## y ~ x1 + x2 + x3 + x4 + x5
##
## Df Sum of Sq RSS AIC
## - x5 1 0.0002386 0.024251 -64.450
## - x3 1 0.0004135 0.024426 -64.364
## - x4 1 0.0008198 0.024832 -64.166
## - x2 1 0.0019199 0.025932 -63.646
## <none> 0.024012 -62.569
## - x1 1 0.0098250 0.033837 -60.453
##
## Step: AIC=-64.45
## y ~ x1 + x2 + x3 + x4
##
## Df Sum of Sq RSS AIC
## - x3 1 0.0001944 0.024445 -66.355
## - x2 1 0.0035479 0.027799 -64.812
## <none> 0.024251 -64.450
## - x1 1 0.0140032 0.038254 -60.981
## - x4 1 0.0151932 0.039444 -60.613
##
## Step: AIC=-66.35
## y ~ x1 + x2 + x4
##
## Df Sum of Sq RSS AIC
## - x2 1 0.003852 0.028297 -66.599
## <none> 0.024445 -66.355
## - x1 1 0.021089 0.045534 -60.890
## - x4 1 0.042137 0.066582 -56.331
##
## Step: AIC=-66.6
## y ~ x1 + x4
##
## Df Sum of Sq RSS AIC
## <none> 0.028297 -66.599
## - x1 1 0.017774 0.046071 -62.750
## - x4 1 0.039832 0.068129 -58.055
summary(model_lm.step)
##
## Call:
## lm(formula = y ~ x1 + x4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.087371 -0.030756 -0.001341 0.024210 0.080283
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.059e+00 7.404e-01 -1.431 0.18630
## x1 5.609e-02 2.359e-02 2.378 0.04139 *
## x4 3.628e-05 1.019e-05 3.559 0.00613 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05607 on 9 degrees of freedom
## Multiple R-squared: 0.7596, Adjusted R-squared: 0.7061
## F-statistic: 14.22 on 2 and 9 DF, p-value: 0.001639
vif(model_lm.step)
## x1 x4
## 14.18917 14.18917
根据AIC 赤池信息准则,模型最后选x1,x4变量参与建模。但是VIF 还是高。
#检验是否可以进行主成分,Overall MSA大于0.5就可以
library(psych)
KMO(cor(data[2:6]))
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = cor(data[2:6]))
## Overall MSA = 0.59
## MSA for each item =
## x1 x2 x3 x4 x5
## 0.69 0.61 0.54 0.55 0.59
#主成分分析降维
model_lm.princomp <- princomp(data[2:6],cor = T)
summary(model_lm.princomp,loadings=TRUE)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 2.0607097 0.70683004 0.4573920 0.206454854 0.0451189337
## Proportion of Variance 0.8493049 0.09992174 0.0418415 0.008524721 0.0004071436
## Cumulative Proportion 0.8493049 0.94922664 0.9910681 0.999592856 1.0000000000
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## x1 0.464 0.271 0.347 0.753 0.153
## x2 0.367 -0.922 0.104
## x3 0.445 -0.855 0.246
## x4 -0.482 -0.144 0.184 0.844
## x5 -0.468 -0.216 -0.371 0.628 -0.449
screeplot(model_lm.princomp,type="lines")
predict(model_lm.princomp)
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## [1,] 3.2162992 0.05946270 -0.97952843 0.01729405 0.012537261
## [2,] 2.6519339 -0.05958836 -0.40943759 -0.09682358 -0.059978946
## [3,] 2.1819789 -0.12533446 0.52123056 0.29214393 -0.014257984
## [4,] 1.3754172 0.80894956 0.40106064 0.29327959 0.029936870
## [5,] 1.0716288 -0.70726111 0.27239737 -0.29850277 0.015909361
## [6,] 0.3491829 0.22917250 0.16300452 -0.21021323 0.106305710
## [7,] -0.8847448 1.61852534 0.39593322 -0.23062062 -0.054203211
## [8,] -0.6253771 -1.11882482 0.38920443 -0.17184766 -0.055248783
## [9,] -1.3013894 -0.32000757 0.23950828 0.06232217 0.017354865
## [10,] -1.7867432 -0.82783201 -0.01417894 0.25180965 0.008244198
## [11,] -2.9266256 0.28563033 -0.44033275 0.19219150 -0.031927395
## [12,] -3.3215608 0.15710790 -0.53886129 -0.10103305 0.025328055
model_lm1 <- lm(y~x1*x2*x3*x4*x5)
#模型详细结果
summary(model_lm1)
##
## Call:
## lm(formula = y ~ x1 * x2 * x3 * x4 * x5)
##
## Residuals:
## ALL 12 residuals are 0: no residual degrees of freedom!
##
## Coefficients: (20 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.153e+01 NaN NaN NaN
## x1 9.301e-01 NaN NaN NaN
## x2 2.775e-03 NaN NaN NaN
## x3 1.670e+01 NaN NaN NaN
## x4 7.243e-04 NaN NaN NaN
## x5 -2.661e-01 NaN NaN NaN
## x1:x2 -8.220e-05 NaN NaN NaN
## x1:x3 -4.406e-01 NaN NaN NaN
## x2:x3 -4.695e-04 NaN NaN NaN
## x1:x4 -1.278e-05 NaN NaN NaN
## x2:x4 -2.864e-08 NaN NaN NaN
## x3:x4 -3.229e-04 NaN NaN NaN
## x1:x5 NA NA NA NA
## x2:x5 NA NA NA NA
## x3:x5 NA NA NA NA
## x4:x5 NA NA NA NA
## x1:x2:x3 NA NA NA NA
## x1:x2:x4 NA NA NA NA
## x1:x3:x4 NA NA NA NA
## x2:x3:x4 NA NA NA NA
## x1:x2:x5 NA NA NA NA
## x1:x3:x5 NA NA NA NA
## x2:x3:x5 NA NA NA NA
## x1:x4:x5 NA NA NA NA
## x2:x4:x5 NA NA NA NA
## x3:x4:x5 NA NA NA NA
## x1:x2:x3:x4 NA NA NA NA
## x1:x2:x3:x5 NA NA NA NA
## x1:x2:x4:x5 NA NA NA NA
## x1:x3:x4:x5 NA NA NA NA
## x2:x3:x4:x5 NA NA NA NA
## x1:x2:x3:x4:x5 NA NA NA NA
##
## Residual standard error: NaN on 0 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: NaN
## F-statistic: NaN on 11 and 0 DF, p-value: NA
#模型参数
coefficients(model_lm1)
## (Intercept) x1 x2 x3 x4
## -2.152597e+01 9.301000e-01 2.774761e-03 1.669894e+01 7.243050e-04
## x5 x1:x2 x1:x3 x2:x3 x1:x4
## -2.661066e-01 -8.219911e-05 -4.406450e-01 -4.694717e-04 -1.277791e-05
## x2:x4 x3:x4 x1:x5 x2:x5 x3:x5
## -2.863702e-08 -3.229305e-04 NA NA NA
## x4:x5 x1:x2:x3 x1:x2:x4 x1:x3:x4 x2:x3:x4
## NA NA NA NA NA
## x1:x2:x5 x1:x3:x5 x2:x3:x5 x1:x4:x5 x2:x4:x5
## NA NA NA NA NA
## x3:x4:x5 x1:x2:x3:x4 x1:x2:x3:x5 x1:x2:x4:x5 x1:x3:x4:x5
## NA NA NA NA NA
## x2:x3:x4:x5 x1:x2:x3:x4:x5
## NA NA
#模型参数的置信区间(该模型为空)
confint(model_lm1)
## 2.5 % 97.5 %
## (Intercept) NaN NaN
## x1 NaN NaN
## x2 NaN NaN
## x3 NaN NaN
## x4 NaN NaN
## x5 NaN NaN
## x1:x2 NaN NaN
## x1:x3 NaN NaN
## x2:x3 NaN NaN
## x1:x4 NaN NaN
## x2:x4 NaN NaN
## x3:x4 NaN NaN
## x1:x5 NaN NaN
## x2:x5 NaN NaN
## x3:x5 NaN NaN
## x4:x5 NaN NaN
## x1:x2:x3 NaN NaN
## x1:x2:x4 NaN NaN
## x1:x3:x4 NaN NaN
## x2:x3:x4 NaN NaN
## x1:x2:x5 NaN NaN
## x1:x3:x5 NaN NaN
## x2:x3:x5 NaN NaN
## x1:x4:x5 NaN NaN
## x2:x4:x5 NaN NaN
## x3:x4:x5 NaN NaN
## x1:x2:x3:x4 NaN NaN
## x1:x2:x3:x5 NaN NaN
## x1:x2:x4:x5 NaN NaN
## x1:x3:x4:x5 NaN NaN
## x2:x3:x4:x5 NaN NaN
## x1:x2:x3:x4:x5 NaN NaN
#模型预测值
fitted(model_lm1)
## 1 2 3 4 5 6 7 8 9 10 11 12
## 0.77 0.79 0.89 1.03 0.85 0.86 0.91 1.01 1.02 1.05 0.96 1.07
#模型残差值
residuals(model_lm1)
## 1 2 3 4 5 6 7 8 9 10 11 12
## 0 0 0 0 0 0 0 0 0 0 0 0
#模型方差分析
anova(model_lm1)
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## x1 1 0.049562 0.049562 NaN NaN
## x2 1 0.001547 0.001547 NaN NaN
## x3 1 0.027138 0.027138 NaN NaN
## x4 1 0.015193 0.015193 NaN NaN
## x5 1 0.000239 0.000239 NaN NaN
## x1:x2 1 0.001104 0.001104 NaN NaN
## x1:x3 1 0.013203 0.013203 NaN NaN
## x2:x3 1 0.003462 0.003462 NaN NaN
## x1:x4 1 0.004060 0.004060 NaN NaN
## x2:x4 1 0.001196 0.001196 NaN NaN
## x3:x4 1 0.000987 0.000987 NaN NaN
## Residuals 0 0.000000 NaN
#AIC
AIC(model_lm1)
## [1] -Inf
这个模型过拟合了
library(lars)
names(data) <- c("生态效率值","第二产业比重","人均水资源量","单位GDP能耗",
"居民人均可支配收入","金融发展指数")
lx <- as.matrix(data[2:6])
ly <- as.matrix(data[1])
larresult <- lars(lx,ly,type = "lasso")
larresult
##
## Call:
## lars(x = lx, y = ly, type = "lasso")
## R-squared: 0.796
## Sequence of LASSO moves:
## 单位GDP能耗 金融发展指数 第二产业比重 人均水资源量 居民人均可支配收入
## Var 3 5 1 2 4
## Step 1 2 3 4 5
plot(larresult)
summary(larresult)
## LARS/LASSO
## Call: lars(x = lx, y = ly, type = "lasso")
## Df Rss Cp
## 0 1 0.117692 19.4078
## 1 2 0.060820 7.1972
## 2 3 0.038084 3.5162
## 3 4 0.033337 4.3299
## 4 5 0.025582 4.3923
## 5 6 0.024012 6.0000
larresult$Cp[which.min(larresult$Cp)]
## 2
## 3.516187
larresult$beta
## 第二产业比重 人均水资源量 单位GDP能耗 居民人均可支配收入 金融发展指数
## 0 0.000000000 0.000000e+00 0.0000000 0.00000e+00 0.00000000
## 1 0.000000000 0.000000e+00 -0.2766809 0.00000e+00 0.00000000
## 2 0.000000000 0.000000e+00 -0.4229568 0.00000e+00 0.01003891
## 3 0.009604011 0.000000e+00 -0.4467589 0.00000e+00 0.02083052
## 4 0.034266042 7.489358e-06 -0.5292337 0.00000e+00 0.05013105
## 5 0.056080258 1.739076e-05 -0.2420773 2.58839e-05 0.02284343
## attr(,"scaled:scale")
## [1] 8.952607e+00 4.196196e+03 4.846885e-01 2.072191e+04 7.062343e+00
#系数
coef <- coef.lars(larresult,mode="step",s=larresult$df[which.min(larresult$Cp)])
#不为0的系数
coef[coef!=0]
## 单位GDP能耗 金融发展指数
## -0.42295683 0.01003891
#求截距值,其中s的含义和求取coef中的s相同,代表第几次迭代对应的模型的截距值。
predict(larresult,data.frame(x1=0,x2=0,x3=0,x4=0,x5=0),
s=larresult$df[which.min(larresult$Cp)]) #
## $s
##
## 3
##
## $fraction
##
## 0.4
##
## $mode
## [1] "step"
##
## $fit
## [1] 0.8907181