source(here::here("R/load-pkg-basic.R"))
data("mtcars")
经验法则
我们首先给出本文的结论:
直接对公式(
formula()
)对象获得变量名,可以使用函数labels(terms(your_mod, keep.order = TRUE))
会明确得到公式所指定的全部协变量,但是不包含截距变量。基于回归估计获得变量名,可以得到实际有效的全部自变量(包括截距项)。具体过程为:先进行
lm()
回归拟合,然后使用summary()
函数得到拟合摘要,最后再使用rownames(coef(smry))
获得模型估计的全部变量明。
下面将具体进行分析和解释。
常规操作
很容易地,我们可以构造一个简单的回归模型
<- formula("mpg ~ vs + gear +am +wt")) (model
mpg ~ vs + gear + am + wt
为了获得模型中的回归变量,我们可以使用函数labels(terms(your_mod, keep.order = TRUE))
labels(terms(model, keep.order = TRUE))
[1] "vs" "gear" "am" "wt"
扩展数据和模型
<- mtcars %>%
dt_dummy ::mutate(
dplyrgear_3 = ifelse(gear==3, 1, 0),
gear_4 = ifelse(gear==4, 1, 0), gear_5 = ifelse(gear==5, 1, 0)
%>%
) ::mutate(
dplyram_1 = ifelse(am==1, 1, 0),
am_0 = ifelse(am==0, 1, 0),
vs_1 = ifelse(vs==1, 1, 0),
vs_0 = ifelse(vs==0, 1, 0) ) %>%
rownames_to_column(var = "obs")
对数据集进行扩展,构造了多个虚拟变量:
```{r}
#| label: tbl-dummy
#| tbl-cap: "包含虚拟变量的mtcars数据集(部分)"
dt_dummy %>%
head(., n = 10) %>%
kable()
```
obs | mpg | cyl | disp | hp | drat | wt | qsec | vs | am | gear | carb | gear_3 | gear_4 | gear_5 | am_1 | am_0 | vs_1 | vs_0 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Mazda RX4 | 21.0 | 6 | 160.0 | 110 | 3.90 | 2.620 | 16.46 | 0 | 1 | 4 | 4 | 0 | 1 | 0 | 1 | 0 | 0 | 1 |
Mazda RX4 Wag | 21.0 | 6 | 160.0 | 110 | 3.90 | 2.875 | 17.02 | 0 | 1 | 4 | 4 | 0 | 1 | 0 | 1 | 0 | 0 | 1 |
Datsun 710 | 22.8 | 4 | 108.0 | 93 | 3.85 | 2.320 | 18.61 | 1 | 1 | 4 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 |
Hornet 4 Drive | 21.4 | 6 | 258.0 | 110 | 3.08 | 3.215 | 19.44 | 1 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 1 | 1 | 0 |
Hornet Sportabout | 18.7 | 8 | 360.0 | 175 | 3.15 | 3.440 | 17.02 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 1 | 0 | 1 |
Valiant | 18.1 | 6 | 225.0 | 105 | 2.76 | 3.460 | 20.22 | 1 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 1 | 1 | 0 |
Duster 360 | 14.3 | 8 | 360.0 | 245 | 3.21 | 3.570 | 15.84 | 0 | 0 | 3 | 4 | 1 | 0 | 0 | 0 | 1 | 0 | 1 |
Merc 240D | 24.4 | 4 | 146.7 | 62 | 3.69 | 3.190 | 20.00 | 1 | 0 | 4 | 2 | 0 | 1 | 0 | 0 | 1 | 1 | 0 |
Merc 230 | 22.8 | 4 | 140.8 | 95 | 3.92 | 3.150 | 22.90 | 1 | 0 | 4 | 2 | 0 | 1 | 0 | 0 | 1 | 1 | 0 |
Merc 280 | 19.2 | 6 | 167.6 | 123 | 3.92 | 3.440 | 18.30 | 1 | 0 | 4 | 4 | 0 | 1 | 0 | 0 | 1 | 1 | 0 |
我们也给出更加复杂的多种公式情形:
<- list(
list_model basic = "mpg ~ vs_1 + gear_4 +gear_5 +am_1 +wt",
basic1 = "mpg ~ 1 + vs_1 + gear_4 +gear_5 +am_1 +wt",
noIntercept = "mpg ~ -1 + vs_1 + gear_4 +gear_5 +am_1 + wt",
onlyIntercept = "mpg ~ 1" ,
logx = "mpg ~ vs_1 + gear_4 +gear_5 +am_1 +log(wt)",
logxy = "log(mpg) ~ vs_1 + gear_4 +gear_5 +am_1 +log(wt)",
sqry = "mpg^2 ~ vs_1 + gear_4 +gear_5 +am_1 +log(wt)",
colon = "mpg ~ vs_1 +(gear_4 +gear_5):am_1 +log(wt)",
star = "mpg ~ vs_1 +(gear_4 +gear_5)*am_1 + log(wt)"
)
<- as.data.frame(list_model) %>%
tbl_model t(.) %>%
as.data.frame() %>%
rename_all(., ~"formula") %>%
rownames_to_column(var = "model") %>%
add_column(index = 1:nrow(.), .before = "model")
模型列表展示如下:
kable(tbl_model)
index | model | formula |
---|---|---|
1 | basic | mpg ~ vs_1 + gear_4 +gear_5 +am_1 +wt |
2 | basic1 | mpg ~ 1 + vs_1 + gear_4 +gear_5 +am_1 +wt |
3 | noIntercept | mpg ~ -1 + vs_1 + gear_4 +gear_5 +am_1 + wt |
4 | onlyIntercept | mpg ~ 1 |
5 | logx | mpg ~ vs_1 + gear_4 +gear_5 +am_1 +log(wt) |
6 | logxy | log(mpg) ~ vs_1 + gear_4 +gear_5 +am_1 +log(wt) |
7 | sqry | mpg^2 ~ vs_1 + gear_4 +gear_5 +am_1 +log(wt) |
8 | colon | mpg ~ vs_1 +(gear_4 +gear_5):am_1 +log(wt) |
9 | star | mpg ~ vs_1 +(gear_4 +gear_5)*am_1 + log(wt) |
基于公式对象获取变量名
类似地,我们可以使用函数labels(terms(your_mod, keep.order = TRUE))
获得全部的变量名:
<- tbl_model %>%
tbl_terms mutate(terms = map(.x = formula,
~labels(terms(formula(.x), keep.order = TRUE))))
kable(tbl_terms)
index | model | formula | terms |
---|---|---|---|
1 | basic | mpg ~ vs_1 + gear_4 +gear_5 +am_1 +wt | vs_1 , gear_4, gear_5, am_1 , wt |
2 | basic1 | mpg ~ 1 + vs_1 + gear_4 +gear_5 +am_1 +wt | vs_1 , gear_4, gear_5, am_1 , wt |
3 | noIntercept | mpg ~ -1 + vs_1 + gear_4 +gear_5 +am_1 + wt | vs_1 , gear_4, gear_5, am_1 , wt |
4 | onlyIntercept | mpg ~ 1 | |
5 | logx | mpg ~ vs_1 + gear_4 +gear_5 +am_1 +log(wt) | vs_1 , gear_4 , gear_5 , am_1 , log(wt) |
6 | logxy | log(mpg) ~ vs_1 + gear_4 +gear_5 +am_1 +log(wt) | vs_1 , gear_4 , gear_5 , am_1 , log(wt) |
7 | sqry | mpg^2 ~ vs_1 + gear_4 +gear_5 +am_1 +log(wt) | vs_1 , gear_4 , gear_5 , am_1 , log(wt) |
8 | colon | mpg ~ vs_1 +(gear_4 +gear_5):am_1 +log(wt) | vs_1 , gear_4:am_1, gear_5:am_1, log(wt) |
9 | star | mpg ~ vs_1 +(gear_4 +gear_5)*am_1 + log(wt) | vs_1 , gear_4 , gear_5 , am_1 , gear_4:am_1, gear_5:am_1, log(wt) |
基于回归估计获取变量名
当然,我们可以先进行lm()
回归拟合,然后使用summary()
函数得到拟合摘要,最后再使用rownames(coef(smry))
获得模型估计的全部自变量:
<- tbl_terms %>%
tbl_fit mutate(smry = map(.x = formula,
~summary(lm(formula = .x, data = dt_dummy)))) %>%
mutate(vars = map(.x = smry, ~rownames(coef(.x))))
%>%
tbl_fit select(-smry) %>%
kable()
index | model | formula | terms | vars |
---|---|---|---|---|
1 | basic | mpg ~ vs_1 + gear_4 +gear_5 +am_1 +wt | vs_1 , gear_4, gear_5, am_1 , wt | (Intercept), vs_1 , gear_4 , gear_5 , am_1 , wt |
2 | basic1 | mpg ~ 1 + vs_1 + gear_4 +gear_5 +am_1 +wt | vs_1 , gear_4, gear_5, am_1 , wt | (Intercept), vs_1 , gear_4 , gear_5 , am_1 , wt |
3 | noIntercept | mpg ~ -1 + vs_1 + gear_4 +gear_5 +am_1 + wt | vs_1 , gear_4, gear_5, am_1 , wt | vs_1 , gear_4, gear_5, am_1 , wt |
4 | onlyIntercept | mpg ~ 1 | (Intercept) | |
5 | logx | mpg ~ vs_1 + gear_4 +gear_5 +am_1 +log(wt) | vs_1 , gear_4 , gear_5 , am_1 , log(wt) | (Intercept), vs_1 , gear_4 , gear_5 , am_1 , log(wt) |
6 | logxy | log(mpg) ~ vs_1 + gear_4 +gear_5 +am_1 +log(wt) | vs_1 , gear_4 , gear_5 , am_1 , log(wt) | (Intercept), vs_1 , gear_4 , gear_5 , am_1 , log(wt) |
7 | sqry | mpg^2 ~ vs_1 + gear_4 +gear_5 +am_1 +log(wt) | vs_1 , gear_4 , gear_5 , am_1 , log(wt) | (Intercept), vs_1 , gear_4 , gear_5 , am_1 , log(wt) |
8 | colon | mpg ~ vs_1 +(gear_4 +gear_5):am_1 +log(wt) | vs_1 , gear_4:am_1, gear_5:am_1, log(wt) | (Intercept), vs_1 , log(wt) , gear_4:am_1, gear_5:am_1 |
9 | star | mpg ~ vs_1 +(gear_4 +gear_5)*am_1 + log(wt) | vs_1 , gear_4 , gear_5 , am_1 , gear_4:am_1, gear_5:am_1, log(wt) | (Intercept), vs_1 , gear_4 , gear_5 , am_1 , log(wt) |
注意事项
以上对比中,我们需要注意 表 4 两个比较隐蔽的问题。
第一,对于仅含截距的模型onlyIntercept:mpg ~ 1,其term
列的变量集为空,但是其vars
列的变量集为(Intercept)
。
\[ \begin{aligned} \begin{split} mpg_i=&+\beta_{1}+u_i \end{split} \quad \text{(only Intercept)}\quad \end{aligned} \tag{1}\]
其回归摘要报告如下:
$smry[4][[1]] tbl_fit
Call:
lm(formula = .x, data = dt_dummy)
Residuals:
Min 1Q Median 3Q Max
-9.6906 -4.6656 -0.8906 2.7094 13.8094
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 20.091 1.065 18.86 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 6.027 on 31 degrees of freedom
第二,对于模型star:mpg ~ vs_1 +(gear_4 +gear_5)*am_1 + log(wt),term
列的变量集比vars
列的变量集要更多(具体原因下面会解释)。
\[ \begin{aligned} \begin{split} mpg_i=&+\beta_{1}+\beta_{2}vs\_1_i+\beta_{3}gear\_4_i\\&+\beta_{4}gear\_5_i+\beta_{5}am\_1_i+\beta_{6}gear\_4:am\_1_i\\&+\beta_{7}gear\_5:am\_1_i+\beta_{8}log(wt)_i+u_i \end{split} \quad \text{(colinearity)}\quad \end{aligned} \tag{2}\]
其回归摘要报告如下:
$smry[9][[1]] tbl_fit
Call:
lm(formula = .x, data = dt_dummy)
Residuals:
Min 1Q Median 3Q Max
-3.2814 -1.6668 -0.6301 1.3684 5.5136
Coefficients: (2 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 36.0746 3.9120 9.221 1.11e-09 ***
vs_1 1.6079 1.4861 1.082 0.289
gear_4 1.4742 1.7084 0.863 0.396
gear_5 -0.6913 2.0150 -0.343 0.734
am_1 -0.3195 2.0264 -0.158 0.876
log(wt) -15.1572 2.7363 -5.539 8.14e-06 ***
gear_4:am_1 NA NA NA NA
gear_5:am_1 NA NA NA NA
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.48 on 26 degrees of freedom
Multiple R-squared: 0.858, Adjusted R-squared: 0.8306
F-statistic: 31.41 on 5 and 26 DF, p-value: 3.135e-10
但是我们可以看到,我们提取得到了拟合估计变量名实际上少了两个:
<- tbl_fit %>% filter(model == "star") %>%
terms_formula pull("terms") %>% unlist()
<- tbl_fit %>% filter(model == "star") %>%
(terms_fit pull("vars") %>% unlist())
[1] "(Intercept)" "vs_1" "gear_4" "gear_5" "am_1"
[6] "log(wt)"
print(glue("变量差异:{setdiff(terms_formula, terms_fit)}"))
变量差异:gear_4:am_1
变量差异:gear_5:am_1
这是因为模型star:mpg ~ vs_1 +(gear_4 +gear_5)*am_1 + log(wt)出现了完全共线性问题,也即:
\[ gear\_4:am\_1_i = gear\_4 \times am\_1_i \\ gear\_5:am\_1_i = gear\_5 \times am\_1_i \]