金融论文数据分析

setwd("C:/Users/CHEN/Desktop/金融论文")
knitr::opts_chunk$set(
  cache = TRUE,
  fig.align = 'center',
  out.width = "100%",
  message = FALSE,
  warning = FALSE)

0.1 数据导入

y <- c(0.77,0.79,0.89,1.03,0.85,0.86,0.91,1.01,1.02,1.05,0.96,1.07)
x1 <- c(26.7,26.2,27.2,26.8,24.1,24.0,23.6,22.1,22.2,21.4,20.3,19.1)
x2 <- c(5596,5539,5546,4131,5637,4266,2185,5360,4166,4496,2686,2627)
x3 <- c(0.916,0.808,0.670,0.648,0.621,0.605,0.513,0.494,0.484,0.478,0.471,0.457)
x4 <- c(9247,10428,12407,14163,15636,17476,18979 ,20653,22553,24579,26679,27904)
x5 <- c( 27.14,27.22,27.45,27.86, 28.17 ,28.56,29.07,30.11,30.75,31.88,32.76,32.84)
data <- data.frame(y,x1,x2,x3,x4,x5)

0.2 线性回归

library(car)

0.2.1 不含交互项的线性回归模型

model_lm <- lm(y~x1+x2+x3+x4+x5)
#模型详细结果
summary(model_lm)

## 
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4 + x5)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.077629 -0.030150 -0.001994  0.019373  0.081463 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.472e+00  1.600e+00  -0.921    0.393
## x1           5.608e-02  3.579e-02   1.567    0.168
## x2           1.739e-05  2.511e-05   0.693    0.514
## x3          -2.421e-01  7.531e-01  -0.321    0.759
## x4           2.588e-05  5.719e-05   0.453    0.667
## x5           2.284e-02  9.355e-02   0.244    0.815
## 
## Residual standard error: 0.06326 on 6 degrees of freedom
## Multiple R-squared:  0.796,  Adjusted R-squared:  0.626 
## F-statistic: 4.682 on 5 and 6 DF,  p-value: 0.04348

#VIF
vif(model_lm)

##         x1         x2         x3         x4         x5 
##  25.655976   2.773748  33.292128 350.918621 109.064833

#变量x1-x5的相关系数
cor(data[,2:6])

##            x1         x2         x3         x4         x5
## x1  1.0000000  0.6078049  0.8294011 -0.9641181 -0.9583671
## x2  0.6078049  1.0000000  0.6327106 -0.6853585 -0.6374646
## x3  0.8294011  0.6327106  1.0000000 -0.9148242 -0.8285551
## x4 -0.9641181 -0.6853585 -0.9148242  1.0000000  0.9792614
## x5 -0.9583671 -0.6374646 -0.8285551  0.9792614  1.0000000

#模型参数
coefficients(model_lm)

##   (Intercept)            x1            x2            x3            x4 
## -1.472400e+00  5.608026e-02  1.739076e-05 -2.420773e-01  2.588390e-05 
##            x5 
##  2.284343e-02

#模型参数的置信区间
confint(model_lm)

##                     2.5 %       97.5 %
## (Intercept) -5.386303e+00 2.4415018030
## x1          -3.149962e-02 0.1436601343
## x2          -4.404728e-05 0.0000788288
## x3          -2.084833e+00 1.6006778850
## x4          -1.140532e-04 0.0001658210
## x5          -2.060604e-01 0.2517472888

#模型预测值
fitted(model_lm)

##         1         2         3         4         5         6         7         8 
## 0.7598374 0.7893467 0.9354336 0.9485372 0.8750555 0.9060133 0.9202158 0.9629974 
##         9        10        11        12 
## 1.0140608 1.0546419 1.0376293 1.0062312

#模型残差值
residuals(model_lm)

##             1             2             3             4             5 
##  0.0101626158  0.0006533057 -0.0454335859  0.0814628034 -0.0250555157 
##             6             7             8             9            10 
## -0.0460133119 -0.0102158087  0.0470026406  0.0059392015 -0.0046418619 
##            11            12 
## -0.0776292564  0.0637687738

#模型方差分析
anova(model_lm)

## Analysis of Variance Table
## 
## Response: y
##           Df   Sum Sq  Mean Sq F value  Pr(>F)  
## x1         1 0.049562 0.049562 12.3842 0.01253 *
## x2         1 0.001547 0.001547  0.3866 0.55697  
## x3         1 0.027138 0.027138  6.7810 0.04044 *
## x4         1 0.015193 0.015193  3.7964 0.09928 .
## x5         1 0.000239 0.000239  0.0596 0.81522  
## Residuals  6 0.024012 0.004002                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#AIC
AIC(model_lm)

## [1] -26.51462

从VIF猜测变量间有严重的多重共线性，计算相关系数后发现确实存在多重共线性，下面用step函数对变量进行筛选

model_lm.step <- step(model_lm, direction = "backward")

## Start:  AIC=-62.57
## y ~ x1 + x2 + x3 + x4 + x5
## 
##        Df Sum of Sq      RSS     AIC
## - x5    1 0.0002386 0.024251 -64.450
## - x3    1 0.0004135 0.024426 -64.364
## - x4    1 0.0008198 0.024832 -64.166
## - x2    1 0.0019199 0.025932 -63.646
## <none>              0.024012 -62.569
## - x1    1 0.0098250 0.033837 -60.453
## 
## Step:  AIC=-64.45
## y ~ x1 + x2 + x3 + x4
## 
##        Df Sum of Sq      RSS     AIC
## - x3    1 0.0001944 0.024445 -66.355
## - x2    1 0.0035479 0.027799 -64.812
## <none>              0.024251 -64.450
## - x1    1 0.0140032 0.038254 -60.981
## - x4    1 0.0151932 0.039444 -60.613
## 
## Step:  AIC=-66.35
## y ~ x1 + x2 + x4
## 
##        Df Sum of Sq      RSS     AIC
## - x2    1  0.003852 0.028297 -66.599
## <none>              0.024445 -66.355
## - x1    1  0.021089 0.045534 -60.890
## - x4    1  0.042137 0.066582 -56.331
## 
## Step:  AIC=-66.6
## y ~ x1 + x4
## 
##        Df Sum of Sq      RSS     AIC
## <none>              0.028297 -66.599
## - x1    1  0.017774 0.046071 -62.750
## - x4    1  0.039832 0.068129 -58.055

summary(model_lm.step)

## 
## Call:
## lm(formula = y ~ x1 + x4)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.087371 -0.030756 -0.001341  0.024210  0.080283 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -1.059e+00  7.404e-01  -1.431  0.18630   
## x1           5.609e-02  2.359e-02   2.378  0.04139 * 
## x4           3.628e-05  1.019e-05   3.559  0.00613 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05607 on 9 degrees of freedom
## Multiple R-squared:  0.7596, Adjusted R-squared:  0.7061 
## F-statistic: 14.22 on 2 and 9 DF,  p-value: 0.001639

vif(model_lm.step)

##       x1       x4 
## 14.18917 14.18917

根据AIC 赤池信息准则，模型最后选x1,x4变量参与建模。但是VIF 还是高。

#检验是否可以进行主成分,Overall MSA大于0.5就可以
library(psych)
KMO(cor(data[2:6]))

## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = cor(data[2:6]))
## Overall MSA =  0.59
## MSA for each item = 
##   x1   x2   x3   x4   x5 
## 0.69 0.61 0.54 0.55 0.59

#主成分分析降维
model_lm.princomp <- princomp(data[2:6],cor = T)
summary(model_lm.princomp,loadings=TRUE)

## Importance of components:
##                           Comp.1     Comp.2    Comp.3      Comp.4       Comp.5
## Standard deviation     2.0607097 0.70683004 0.4573920 0.206454854 0.0451189337
## Proportion of Variance 0.8493049 0.09992174 0.0418415 0.008524721 0.0004071436
## Cumulative Proportion  0.8493049 0.94922664 0.9910681 0.999592856 1.0000000000
## 
## Loadings:
##    Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## x1  0.464  0.271  0.347  0.753  0.153
## x2  0.367 -0.922  0.104              
## x3  0.445        -0.855         0.246
## x4 -0.482 -0.144         0.184  0.844
## x5 -0.468 -0.216 -0.371  0.628 -0.449

screeplot(model_lm.princomp,type="lines")

predict(model_lm.princomp)

##           Comp.1      Comp.2      Comp.3      Comp.4       Comp.5
##  [1,]  3.2162992  0.05946270 -0.97952843  0.01729405  0.012537261
##  [2,]  2.6519339 -0.05958836 -0.40943759 -0.09682358 -0.059978946
##  [3,]  2.1819789 -0.12533446  0.52123056  0.29214393 -0.014257984
##  [4,]  1.3754172  0.80894956  0.40106064  0.29327959  0.029936870
##  [5,]  1.0716288 -0.70726111  0.27239737 -0.29850277  0.015909361
##  [6,]  0.3491829  0.22917250  0.16300452 -0.21021323  0.106305710
##  [7,] -0.8847448  1.61852534  0.39593322 -0.23062062 -0.054203211
##  [8,] -0.6253771 -1.11882482  0.38920443 -0.17184766 -0.055248783
##  [9,] -1.3013894 -0.32000757  0.23950828  0.06232217  0.017354865
## [10,] -1.7867432 -0.82783201 -0.01417894  0.25180965  0.008244198
## [11,] -2.9266256  0.28563033 -0.44033275  0.19219150 -0.031927395
## [12,] -3.3215608  0.15710790 -0.53886129 -0.10103305  0.025328055

0.2.2 含所有交互项的线性回归模型

model_lm1 <- lm(y~x1*x2*x3*x4*x5)
#模型详细结果
summary(model_lm1)

## 
## Call:
## lm(formula = y ~ x1 * x2 * x3 * x4 * x5)
## 
## Residuals:
## ALL 12 residuals are 0: no residual degrees of freedom!
## 
## Coefficients: (20 not defined because of singularities)
##                  Estimate Std. Error t value Pr(>|t|)
## (Intercept)    -2.153e+01        NaN     NaN      NaN
## x1              9.301e-01        NaN     NaN      NaN
## x2              2.775e-03        NaN     NaN      NaN
## x3              1.670e+01        NaN     NaN      NaN
## x4              7.243e-04        NaN     NaN      NaN
## x5             -2.661e-01        NaN     NaN      NaN
## x1:x2          -8.220e-05        NaN     NaN      NaN
## x1:x3          -4.406e-01        NaN     NaN      NaN
## x2:x3          -4.695e-04        NaN     NaN      NaN
## x1:x4          -1.278e-05        NaN     NaN      NaN
## x2:x4          -2.864e-08        NaN     NaN      NaN
## x3:x4          -3.229e-04        NaN     NaN      NaN
## x1:x5                  NA         NA      NA       NA
## x2:x5                  NA         NA      NA       NA
## x3:x5                  NA         NA      NA       NA
## x4:x5                  NA         NA      NA       NA
## x1:x2:x3               NA         NA      NA       NA
## x1:x2:x4               NA         NA      NA       NA
## x1:x3:x4               NA         NA      NA       NA
## x2:x3:x4               NA         NA      NA       NA
## x1:x2:x5               NA         NA      NA       NA
## x1:x3:x5               NA         NA      NA       NA
## x2:x3:x5               NA         NA      NA       NA
## x1:x4:x5               NA         NA      NA       NA
## x2:x4:x5               NA         NA      NA       NA
## x3:x4:x5               NA         NA      NA       NA
## x1:x2:x3:x4            NA         NA      NA       NA
## x1:x2:x3:x5            NA         NA      NA       NA
## x1:x2:x4:x5            NA         NA      NA       NA
## x1:x3:x4:x5            NA         NA      NA       NA
## x2:x3:x4:x5            NA         NA      NA       NA
## x1:x2:x3:x4:x5         NA         NA      NA       NA
## 
## Residual standard error: NaN on 0 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:    NaN 
## F-statistic:   NaN on 11 and 0 DF,  p-value: NA

#模型参数
coefficients(model_lm1)

##    (Intercept)             x1             x2             x3             x4 
##  -2.152597e+01   9.301000e-01   2.774761e-03   1.669894e+01   7.243050e-04 
##             x5          x1:x2          x1:x3          x2:x3          x1:x4 
##  -2.661066e-01  -8.219911e-05  -4.406450e-01  -4.694717e-04  -1.277791e-05 
##          x2:x4          x3:x4          x1:x5          x2:x5          x3:x5 
##  -2.863702e-08  -3.229305e-04             NA             NA             NA 
##          x4:x5       x1:x2:x3       x1:x2:x4       x1:x3:x4       x2:x3:x4 
##             NA             NA             NA             NA             NA 
##       x1:x2:x5       x1:x3:x5       x2:x3:x5       x1:x4:x5       x2:x4:x5 
##             NA             NA             NA             NA             NA 
##       x3:x4:x5    x1:x2:x3:x4    x1:x2:x3:x5    x1:x2:x4:x5    x1:x3:x4:x5 
##             NA             NA             NA             NA             NA 
##    x2:x3:x4:x5 x1:x2:x3:x4:x5 
##             NA             NA

#模型参数的置信区间(该模型为空)
confint(model_lm1)

##                2.5 % 97.5 %
## (Intercept)      NaN    NaN
## x1               NaN    NaN
## x2               NaN    NaN
## x3               NaN    NaN
## x4               NaN    NaN
## x5               NaN    NaN
## x1:x2            NaN    NaN
## x1:x3            NaN    NaN
## x2:x3            NaN    NaN
## x1:x4            NaN    NaN
## x2:x4            NaN    NaN
## x3:x4            NaN    NaN
## x1:x5            NaN    NaN
## x2:x5            NaN    NaN
## x3:x5            NaN    NaN
## x4:x5            NaN    NaN
## x1:x2:x3         NaN    NaN
## x1:x2:x4         NaN    NaN
## x1:x3:x4         NaN    NaN
## x2:x3:x4         NaN    NaN
## x1:x2:x5         NaN    NaN
## x1:x3:x5         NaN    NaN
## x2:x3:x5         NaN    NaN
## x1:x4:x5         NaN    NaN
## x2:x4:x5         NaN    NaN
## x3:x4:x5         NaN    NaN
## x1:x2:x3:x4      NaN    NaN
## x1:x2:x3:x5      NaN    NaN
## x1:x2:x4:x5      NaN    NaN
## x1:x3:x4:x5      NaN    NaN
## x2:x3:x4:x5      NaN    NaN
## x1:x2:x3:x4:x5   NaN    NaN

#模型预测值
fitted(model_lm1)

##    1    2    3    4    5    6    7    8    9   10   11   12 
## 0.77 0.79 0.89 1.03 0.85 0.86 0.91 1.01 1.02 1.05 0.96 1.07

#模型残差值
residuals(model_lm1)

##  1  2  3  4  5  6  7  8  9 10 11 12 
##  0  0  0  0  0  0  0  0  0  0  0  0

#模型方差分析
anova(model_lm1)

## Analysis of Variance Table
## 
## Response: y
##           Df   Sum Sq  Mean Sq F value Pr(>F)
## x1         1 0.049562 0.049562     NaN    NaN
## x2         1 0.001547 0.001547     NaN    NaN
## x3         1 0.027138 0.027138     NaN    NaN
## x4         1 0.015193 0.015193     NaN    NaN
## x5         1 0.000239 0.000239     NaN    NaN
## x1:x2      1 0.001104 0.001104     NaN    NaN
## x1:x3      1 0.013203 0.013203     NaN    NaN
## x2:x3      1 0.003462 0.003462     NaN    NaN
## x1:x4      1 0.004060 0.004060     NaN    NaN
## x2:x4      1 0.001196 0.001196     NaN    NaN
## x3:x4      1 0.000987 0.000987     NaN    NaN
## Residuals  0 0.000000      NaN

#AIC
AIC(model_lm1)

## [1] -Inf

这个模型过拟合了

0.3 Lasso回归

library(lars)
names(data) <- c("生态效率值","第二产业比重","人均水资源量","单位GDP能耗",
                 "居民人均可支配收入","金融发展指数")
lx <- as.matrix(data[2:6])
ly <- as.matrix(data[1])
larresult <- lars(lx,ly,type = "lasso")
larresult

## 
## Call:
## lars(x = lx, y = ly, type = "lasso")
## R-squared: 0.796 
## Sequence of LASSO moves:
##      单位GDP能耗 金融发展指数 第二产业比重 人均水资源量 居民人均可支配收入
## Var            3            5            1            2                  4
## Step           1            2            3            4                  5

plot(larresult)

summary(larresult)

## LARS/LASSO
## Call: lars(x = lx, y = ly, type = "lasso")
##   Df      Rss      Cp
## 0  1 0.117692 19.4078
## 1  2 0.060820  7.1972
## 2  3 0.038084  3.5162
## 3  4 0.033337  4.3299
## 4  5 0.025582  4.3923
## 5  6 0.024012  6.0000

larresult$Cp[which.min(larresult$Cp)]

##        2 
## 3.516187

larresult$beta

##   第二产业比重 人均水资源量 单位GDP能耗 居民人均可支配收入 金融发展指数
## 0  0.000000000 0.000000e+00   0.0000000        0.00000e+00   0.00000000
## 1  0.000000000 0.000000e+00  -0.2766809        0.00000e+00   0.00000000
## 2  0.000000000 0.000000e+00  -0.4229568        0.00000e+00   0.01003891
## 3  0.009604011 0.000000e+00  -0.4467589        0.00000e+00   0.02083052
## 4  0.034266042 7.489358e-06  -0.5292337        0.00000e+00   0.05013105
## 5  0.056080258 1.739076e-05  -0.2420773        2.58839e-05   0.02284343
## attr(,"scaled:scale")
## [1] 8.952607e+00 4.196196e+03 4.846885e-01 2.072191e+04 7.062343e+00

#系数
coef <- coef.lars(larresult,mode="step",s=larresult$df[which.min(larresult$Cp)])
#不为0的系数
coef[coef!=0]

##  单位GDP能耗 金融发展指数 
##  -0.42295683   0.01003891

#求截距值,其中s的含义和求取coef中的s相同，代表第几次迭代对应的模型的截距值。
predict(larresult,data.frame(x1=0,x2=0,x3=0,x4=0,x5=0),
        s=larresult$df[which.min(larresult$Cp)]) #

## $s
##   
## 3 
## 
## $fraction
##     
## 0.4 
## 
## $mode
## [1] "step"
## 
## $fit
## [1] 0.8907181

金融论文数据分析

2022-08-24

0.1 数据导入

0.2 线性回归

0.2.1 不含交互项的线性回归模型

0.2.2 含所有交互项的线性回归模型

0.3 Lasso回归