# Load required libraries
library(tidyverse)
library(haven) # For reading Stata files
library(ivreg) # For instrumental variables regression
library(knitr) # For generating tables
library(kableExtra)
library(broom) # For tidying model results
library(here)
library(gt) # For creating gt tables
source(here("04-topics/rep-ak1991/Rcode/ak1991-gt-quarto.R"))
# Read the data
data <- read_dta(here("04-topics/rep-ak1991/stata/raw_data.dta"))
# Rename variables
data <- data %>%
rename(
AGE = v1,
AGEQ = v2,
EDUC = v4,
ENOCENT = v5,
ESOCENT = v6,
LWKLYWGE = v9,
MARRIED = v10,
MIDATL = v11,
MT = v12,
NEWENG = v13,
CENSUS = v16,
SOB = v17, # State of birth - key variable for Table VII
QOB = v18,
RACE = v19,
SMSA = v20,
SOATL = v21,
WNOCENT = v24,
WSOCENT = v25,
YOB = v27
)
# Adjust YOB for years >= 1900
data <- data %>%
mutate(YOB = if_else(YOB >= 1900, YOB - 1900, YOB))
# Create YOB dummies
for (i in 0:9) {
data <- data %>%
mutate(!!paste0("YR", i) := if_else(YOB %in% c(20 + i, 30 + i, 40 + i), 1, 0))
}
# Create QOB dummies
for (i in 1:4) {
data <- data %>%
mutate(!!paste0("QTR", i) := if_else(QOB == i, 1, 0))
}
# Create QOB*YOB interaction dummies
for (j in 1:3) {
for (i in 0:9) {
data <- data %>%
mutate(!!paste0("QTR", j, "YR", i) := !!sym(paste0("QTR", j)) * !!sym(paste0("YR", i)))
}
}
# Create COHORT variable and adjust AGEQ
data <- data %>%
mutate(
COHORT = case_when(
YOB <= 29 & YOB >= 20 ~ 2029,
YOB <= 39 & YOB >= 30 ~ 3039,
YOB <= 49 & YOB >= 40 ~ 4049,
TRUE ~ NA_real_
),
AGEQ = if_else(CENSUS == 80, AGEQ - 1900, AGEQ),
AGEQSQ = AGEQ * AGEQ
)
# Select particular men born
# Filter data for COHORT >3000 and COHORT < 3040
data <- data %>%
filter(COHORT > 3000 & COHORT < 3040)
# Create state dummies
# First get unique states of birth
unique_states <- sort(unique(data$SOB))
num_states <- length(unique_states)
# Create state dummies (state1 to state51)
for (i in 1:num_states) {
data <- data %>%
mutate(!!paste0("state", i) := if_else(SOB == unique_states[i], 1, 0))
}
# Create QOB*SOB interaction dummies
for (j in 1:3) {
for (i in 1:(num_states-1)) {
data <- data %>%
mutate(!!paste0("QTR", j, "state", i) := !!sym(paste0("QTR", j)) * !!sym(paste0("state", i)))
}
}
# Define included and excluded instrument variables
# YR0-YR8 are part of both formulas and instruments
yr_vars <- paste0("YR", 0:8)
# QTR*YR interactions as instruments (excluded instruments)
qtr_yr_vars <- c()
for (j in 1:3) {
for (i in 0:9) {
qtr_yr_vars <- c(qtr_yr_vars, paste0("QTR", j, "YR", i))
}
}
# QTR*state interactions as instruments
qtr_state_vars <- c()
for (j in 1:3) {
for (i in 1:(num_states-1)) {
qtr_state_vars <- c(qtr_state_vars, paste0("QTR", j, "state", i))
}
}
# State dummies
state_vars <- paste0("state", 1:(num_states-1))
# Create formula strings for the excluded instruments (for clarity)
excluded_instruments <- c(qtr_yr_vars, qtr_state_vars)
excluded_instr_str <- paste(excluded_instruments, collapse = " + ")
# Define regression formulas
# Model 1: OLS with state dummies (no IV)
formula1 <- as.formula(paste("LWKLYWGE ~ EDUC + ", paste(yr_vars, collapse = " + "), " + ",
paste(state_vars, collapse = " + ")))
# Model 2: IV with QTR*YR and QTR*state as instruments
formula2 <- as.formula(paste("LWKLYWGE ~ EDUC + ", paste(yr_vars, collapse = " + "), " + ",
paste(state_vars, collapse = " + "), " | ",
paste(yr_vars, collapse = " + "), " + ",
paste(state_vars, collapse = " + "), " + ",
excluded_instr_str))
# Model 3: OLS with state dummies and age controls
formula3 <- as.formula(paste("LWKLYWGE ~ EDUC + ", paste(yr_vars, collapse = " + "), " + AGEQ + AGEQSQ + ",
paste(state_vars, collapse = " + ")))
# Model 4: IV with age controls
formula4 <- as.formula(paste("LWKLYWGE ~ EDUC + ", paste(yr_vars, collapse = " + "), " + AGEQ + AGEQSQ + ",
paste(state_vars, collapse = " + "), " | ",
paste(yr_vars, collapse = " + "), " + AGEQ + AGEQSQ + ",
paste(state_vars, collapse = " + "), " + ",
excluded_instr_str))
# Model 5: OLS with demographic controls
formula5 <- as.formula(paste("LWKLYWGE ~ EDUC + RACE + MARRIED + SMSA + NEWENG + MIDATL + ENOCENT + ",
"WNOCENT + SOATL + ESOCENT + WSOCENT + MT + ",
paste(yr_vars, collapse = " + "), " + ",
paste(state_vars, collapse = " + ")))
# Model 6: IV with demographic controls
formula6 <- as.formula(paste("LWKLYWGE ~ EDUC + RACE + MARRIED + SMSA + NEWENG + MIDATL + ENOCENT + ",
"WNOCENT + SOATL + ESOCENT + WSOCENT + MT + ",
paste(yr_vars, collapse = " + "), " + ",
paste(state_vars, collapse = " + "), " | ",
"RACE + MARRIED + SMSA + NEWENG + MIDATL + ENOCENT + ",
"WNOCENT + SOATL + ESOCENT + WSOCENT + MT + ",
paste(yr_vars, collapse = " + "), " + ",
paste(state_vars, collapse = " + "), " + ",
excluded_instr_str))
# Model 7: OLS with all controls
formula7 <- as.formula(paste("LWKLYWGE ~ EDUC + RACE + MARRIED + SMSA + NEWENG + MIDATL + ENOCENT + ",
"WNOCENT + SOATL + ESOCENT + WSOCENT + MT + ",
paste(yr_vars, collapse = " + "), " + AGEQ + AGEQSQ + ",
paste(state_vars, collapse = " + ")))
# Model 8: IV with all controls
formula8 <- as.formula(paste("LWKLYWGE ~ EDUC + RACE + MARRIED + SMSA + NEWENG + MIDATL + ENOCENT + ",
"WNOCENT + SOATL + ESOCENT + WSOCENT + MT + ",
paste(yr_vars, collapse = " + "), " + AGEQ + AGEQSQ + ",
paste(state_vars, collapse = " + "), " | ",
"RACE + MARRIED + SMSA + NEWENG + MIDATL + ENOCENT + ",
"WNOCENT + SOATL + ESOCENT + WSOCENT + MT + AGEQ + AGEQSQ + ",
paste(yr_vars, collapse = " + "), " + ",
paste(state_vars, collapse = " + "), " + ",
excluded_instr_str))
# Run regressions
model1 <- lm(formula1, data = data)
# summary(model1)
model2 <- ivreg(formula2, data = data)
# summary(model2)
model3 <- lm(formula3, data = data)
# summary(model3)
model4 <- ivreg(formula4, data = data)
# summary(model4)
model5 <- lm(formula5, data = data)
# summary(model5)
model6 <- ivreg(formula6, data = data)
# summary(model6)
model7 <- lm(formula7, data = data)
# summary(model7)
model8 <- ivreg(formula8, data = data)
# summary(model8)
# Extract coefficients and standard errors
table_data <- bind_rows(
broom::tidy(model1, conf.int = TRUE) %>% mutate(model = "model1"),
broom::tidy(model2, conf.int = TRUE) %>% mutate(model = "model2"),
broom::tidy(model3, conf.int = TRUE) %>% mutate(model = "model3"),
broom::tidy(model4, conf.int = TRUE) %>% mutate(model = "model4"),
broom::tidy(model5, conf.int = TRUE) %>% mutate(model = "model5"),
broom::tidy(model6, conf.int = TRUE) %>% mutate(model = "model6"),
broom::tidy(model7, conf.int = TRUE) %>% mutate(model = "model7"),
broom::tidy(model8, conf.int = TRUE) %>% mutate(model = "model8")
) %>%
filter(term %in% c("EDUC", "RACE", "SMSA", "MARRIED", "AGEQ", "AGEQSQ")) %>%
mutate(
term = case_when(
term == "EDUC" ~ "Years of education",
term == "RACE" ~ "Race(1 = black)",
term == "SMSA" ~ "SMSA (1 = center city)",
term == "MARRIED" ~ "Married (1 = married)",
term == "AGEQ" ~ "Age",
term == "AGEQSQ" ~ "Age-squared",
TRUE ~ term
),
p.sig = case_when(
p.value < 0.001 ~ "***",
p.value < 0.01 ~ "**",
p.value < 0.05 ~ "*",
TRUE ~ ""
),
value = ak91_coef_cell(estimate, std.error, p.sig)
) %>%
select(term, model, value) %>%
pivot_wider(
names_from = model,
values_from = value
) %>%
# reorder the term
mutate(
term = factor(
term,
levels = c("Years of education", "Race(1 = black)", "SMSA (1 = center city)",
"Married (1 = married)", "Age", "Age-squared"))
) %>%
arrange(term) %>%
rename_all(~ all_of(c("Independent Variables", "(1)
OLS", "(2)
TSLS",
"(3)
OLS", "(4)
TSLS", "(5)
OLS", "(6)
TSLS",
"(7)
OLS", "(8)
TSLS")))
# Create the table using kableExtra
kbl_out <- table_data %>%
kbl(
format = "html", escape = FALSE,
scroll = TRUE,
align = c("l", "c", "c", "c", "c", "c", "c", "c", "c", "c"),
caption = "Table VII: OLS and TSLS Estimates of the Return to Education for Men Born 1930-1939: 1980 Census"
) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
footnote(
general = "Yes-No Dummies are not kept in the shown figure.",
number = c(
"$^{*}$ p<0.05; $^{**}$ p<0.01; $^{***}$ p<0.001",
"The dependent variable is the log of weekly wage. The sample includes men born 1930-1939 in the 1980 Census.",
"The excluded instrumental variables for all TSLS models are not shown in the table and include quarter of birth-year of birth (27 interactions) and quarter of birth-state of birth (150 interactions) dummies."
)
)
# Create the table using gt
table_data <- ak91_quarto_blank_df(table_data)
gt_tbl <- table_data %>%
gt() %>%
# Set title
tab_header(
title = "Table VII: OLS and TSLS Estimates of the Return to Education for Men Born 1930-1939: 1980 Census"
) %>%
# Set column alignment
cols_align(
align = "left",
columns = 1
) %>%
cols_align(
align = "center",
columns = 2:9
) %>%
# Add footnotes
tab_footnote(
footnote = "Yes-No Dummies are not kept in the shown figure.",
locations = cells_title(groups = "title")
) %>%
tab_footnote(
footnote = md("$^{*}p<0.05; ^{**}p<0.01; ^{***}p<0.001$"),
locations = cells_title(groups = "title")
) %>%
tab_footnote(
footnote = "The dependent variable is the log of weekly wage. The sample includes men born 1930-1939 in the 1980 Census.",
locations = cells_title(groups = "title")
) %>%
tab_footnote(
footnote = md("The excluded instrumental variables for all TSLS models are not shown in the table and include quarter of birth-year of birth (27 interactions) and quarter of birth-state of birth (150 interactions) dummies."),
locations = cells_title(groups = "title")
) %>%
# Set table style
opt_row_striping() %>%
tab_options(
table.font.size = px(14),
heading.background.color = "#e8e8e8",
column_labels.font.weight = "bold",
table.border.top.width = px(3),
table.border.bottom.width = px(3),
container.width = pct(100),
table.width = px(900) # 固定宽度会自动启用滚动条
) %>%
# Ensure math symbols render correctly
fmt_markdown(columns = everything()) %>%
# Ensure all column headers use gt package's html() syntax
cols_label(.list = ak91_gt_cols_label(names(table_data)))
# Save the objects to RData file
save(table_data, gt_tbl, file = here("04-topics/rep-ak1991/Rcode/Table_VII.RData"))