# Load required libraries
library(tidyverse)
library(haven) # For reading Stata files
library(ivreg) # For instrumental variables regression
library(knitr) # For generating tables
library(kableExtra)
library(broom) # For tidying model results
library(here)
library(gt) # For creating gt tables
source(here("04-topics/rep-ak1991/Rcode/ak1991-gt-quarto.R"))
# Read the data
data <- read_dta(here("04-topics/rep-ak1991/stata/raw_data.dta"))
# Rename variables
data <- data %>%
rename(
AGE = v1,
AGEQ = v2,
EDUC = v4,
ENOCENT = v5,
ESOCENT = v6,
LWKLYWGE = v9,
MARRIED = v10,
MIDATL = v11,
MT = v12,
NEWENG = v13,
CENSUS = v16,
QOB = v18,
RACE = v19,
SMSA = v20,
SOATL = v21,
WNOCENT = v24,
WSOCENT = v25,
YOB = v27
)
# Adjust YOB for years >= 1900
data <- data %>%
mutate(YOB = if_else(YOB >= 1900, YOB - 1900, YOB))
# Create YOB dummies
for (i in 0:9) {
data <- data %>%
mutate(!!paste0("YR", i) := if_else(YOB %in% c(20 + i, 30 + i, 40 + i), 1, 0))
}
# Create QOB dummies
for (i in 1:4) {
data <- data %>%
mutate(!!paste0("QTR", i) := if_else(QOB == i, 1, 0))
}
# Create QOB*YOB interaction dummies
for (j in 1:3) {
for (i in 0:9) {
data <- data %>%
mutate(!!paste0("QTR", j, "YR", i) := !!sym(paste0("QTR", j)) * !!sym(paste0("YR", i)))
}
}
# Create COHORT variable and adjust AGEQ
data <- data %>%
mutate(
COHORT = case_when(
YOB <= 29 & YOB >= 20 ~ 2029,
YOB <= 39 & YOB >= 30 ~ 3039,
YOB <= 49 & YOB >= 40 ~ 4049,
TRUE ~ NA_real_
),
AGEQ = if_else(CENSUS == 80, AGEQ - 1900, AGEQ),
AGEQSQ = AGEQ * AGEQ
)
# Select particular men born
# Filter data for COHORT > 4000
data <- data %>%
filter(COHORT > 4000)
# Define excluded instrumental variables
iv_excluded <- c(
"QTR1YR0", "QTR1YR1", "QTR1YR2", "QTR1YR3", "QTR1YR4", "QTR1YR5", "QTR1YR6",
"QTR1YR7", "QTR1YR8", "QTR1YR9", "QTR2YR0", "QTR2YR1", "QTR2YR2", "QTR2YR3",
"QTR2YR4", "QTR2YR5", "QTR2YR6", "QTR2YR7", "QTR2YR8", "QTR2YR9", "QTR3YR0",
"QTR3YR1", "QTR3YR2", "QTR3YR3", "QTR3YR4", "QTR3YR5", "QTR3YR6", "QTR3YR7",
"QTR3YR8", "QTR3YR9"
)
# Define regression formulas
formula1 <- LWKLYWGE ~ EDUC + YR0 + YR1 + YR2 + YR3 + YR4 + YR5 + YR6 + YR7 + YR8
formula2 <- LWKLYWGE ~ EDUC + YR0 + YR1 + YR2 + YR3 + YR4 + YR5 + YR6 + YR7 + YR8 |
QTR1YR0 + QTR1YR1 + QTR1YR2 + QTR1YR3 + QTR1YR4 + QTR1YR5 +
QTR1YR6 + QTR1YR7 + QTR1YR8 + QTR1YR9 + QTR2YR0 + QTR2YR1 + QTR2YR2 +
QTR2YR3 + QTR2YR4 + QTR2YR5 + QTR2YR6 + QTR2YR7 + QTR2YR8 + QTR2YR9 +
QTR3YR0 + QTR3YR1 + QTR3YR2 + QTR3YR3 + QTR3YR4 + QTR3YR5 + QTR3YR6 +
QTR3YR7 + QTR3YR8 + QTR3YR9 + YR0 + YR1 + YR2 + YR3 + YR4 + YR5 +
YR6 + YR7 + YR8
iv2_included <- c(
"YR0", "YR1", "YR2", "YR3", "YR4", "YR5", "YR6", "YR7", "YR8"
)
iv2_all <- c(iv2_included, iv_excluded)
formula3 <- LWKLYWGE ~ EDUC + YR0 + YR1 + YR2 + YR3 + YR4 + YR5 + YR6 + YR7 + YR8 +
AGEQ + AGEQSQ
formula4 <- LWKLYWGE ~ EDUC + YR0 + YR1 + YR2 + YR3 + YR4 + YR5 + YR6 + YR7 + YR8 +
AGEQ + AGEQSQ | QTR1YR0 + QTR1YR1 + QTR1YR2 + QTR1YR3 +
QTR1YR4 + QTR1YR5 + QTR1YR6 + QTR1YR7 + QTR1YR8 + QTR1YR9 + QTR2YR0 +
QTR2YR1 + QTR2YR2 + QTR2YR3 + QTR2YR4 + QTR2YR5 + QTR2YR6 + QTR2YR7 +
QTR2YR8 + QTR2YR9 + QTR3YR0 + QTR3YR1 + QTR3YR2 + QTR3YR3 + QTR3YR4 +
QTR3YR5 + QTR3YR6 + QTR3YR7 + QTR3YR8 + QTR3YR9 +
YR0 + YR1 + YR2 + YR3 + YR4 + YR5 + YR6 + YR7 + YR8 +
AGEQ + AGEQSQ
iv4_included <- c(
"YR0", "YR1", "YR2", "YR3", "YR4", "YR5", "YR6", "YR7", "YR8",
"AGEQ", "AGEQSQ"
)
iv4_all <- c(iv4_included, iv_excluded)
formula5 <- LWKLYWGE ~ EDUC + RACE + MARRIED + SMSA + NEWENG + MIDATL + ENOCENT +
WNOCENT + SOATL + ESOCENT + WSOCENT + MT + YR0 + YR1 + YR2 + YR3 +
YR4 + YR5 + YR6 + YR7 + YR8
formula6 <- LWKLYWGE ~ EDUC + RACE + MARRIED + SMSA + NEWENG + MIDATL + ENOCENT +
WNOCENT + SOATL + ESOCENT + WSOCENT + MT + YR0 + YR1 + YR2 + YR3 +
YR4 + YR5 + YR6 + YR7 + YR8 | QTR1YR0 + QTR1YR1 + QTR1YR2 + QTR1YR3 +
QTR1YR4 + QTR1YR5 + QTR1YR6 + QTR1YR7 + QTR1YR8 + QTR1YR9 + QTR2YR0 +
QTR2YR1 + QTR2YR2 + QTR2YR3 + QTR2YR4 + QTR2YR5 + QTR2YR6 + QTR2YR7 +
QTR2YR8 + QTR2YR9 + QTR3YR0 + QTR3YR1 + QTR3YR2 + QTR3YR3 + QTR3YR4 +
QTR3YR5 + QTR3YR6 + QTR3YR7 + QTR3YR8 + QTR3YR9 + YR0 + YR1 + YR2 +
YR3 + YR4 + YR5 + YR6 + YR7 + YR8 +
RACE + MARRIED + SMSA +
NEWENG + MIDATL + ENOCENT + WNOCENT + SOATL + ESOCENT + WSOCENT + MT
iv6_included <- c(
"YR0", "YR1", "YR2", "YR3", "YR4", "YR5", "YR6", "YR7", "YR8",
"RACE", "MARRIED", "SMSA",
"NEWENG", "MIDATL", "ENOCENT", "WNOCENT", "SOATL", "ESOCENT", "WSOCENT", "MT"
)
iv6_all <- c(iv6_included, iv_excluded)
formula7 <- LWKLYWGE ~ EDUC + RACE + MARRIED + SMSA + NEWENG + MIDATL + ENOCENT +
WNOCENT + SOATL + ESOCENT + WSOCENT + MT + YR0 + YR1 + YR2 + YR3 +
YR4 + YR5 + YR6 + YR7 + YR8 + AGEQ + AGEQSQ
formula8 <- LWKLYWGE ~ EDUC + RACE + MARRIED + SMSA + NEWENG + MIDATL + ENOCENT +
WNOCENT + SOATL + ESOCENT + WSOCENT + MT + AGEQ + AGEQSQ +
YR0 + YR1 + YR2 + YR3 + YR4 + YR5 + YR6 + YR7 + YR8 |
QTR1YR0 + QTR1YR1 + QTR1YR2 + QTR1YR3 + QTR1YR4 + QTR1YR5 + QTR1YR6 +
QTR1YR7 + QTR1YR8 + QTR1YR9 + QTR2YR0 + QTR2YR1 + QTR2YR2 + QTR2YR3 +
QTR2YR4 + QTR2YR5 + QTR2YR6 + QTR2YR7 + QTR2YR8 + QTR2YR9 + QTR3YR0 +
QTR3YR1 + QTR3YR2 + QTR3YR3 + QTR3YR4 + QTR3YR5 + QTR3YR6 + QTR3YR7 +
QTR3YR8 + QTR3YR9 +
YR0 + YR1 + YR2 + YR3 + YR4 + YR5 + YR6 + YR7 + YR8 +
RACE + MARRIED + SMSA +
NEWENG + MIDATL + ENOCENT + WNOCENT + SOATL + ESOCENT + WSOCENT + MT +
AGEQ + AGEQSQ
iv8_included <- c(
"YR0", "YR1", "YR2", "YR3", "YR4", "YR5", "YR6", "YR7", "YR8",
"RACE", "MARRIED", "SMSA",
"NEWENG", "MIDATL", "ENOCENT", "WNOCENT", "SOATL", "ESOCENT", "WSOCENT", "MT",
"AGEQ", "AGEQSQ"
)
iv8_all <- c(iv8_included, iv_excluded)
# Run regressions
model1 <- lm(formula1, data = data)
model2 <- ivreg(formula2, data = data)
model3 <- lm(formula3, data = data)
model4 <- ivreg(formula4, data = data)
model5 <- lm(formula5, data = data)
model6 <- ivreg(formula6, data = data)
model7 <- lm(formula7, data = data)
model8 <- ivreg(formula8, data = data)
# Extract coefficients and standard errors
table_data <- bind_rows(
broom::tidy(model1, conf.int = TRUE) %>% mutate(model = "model1"),
broom::tidy(model2, conf.int = TRUE) %>% mutate(model = "model2"),
broom::tidy(model3, conf.int = TRUE) %>% mutate(model = "model3"),
broom::tidy(model4, conf.int = TRUE) %>% mutate(model = "model4"),
broom::tidy(model5, conf.int = TRUE) %>% mutate(model = "model5"),
broom::tidy(model6, conf.int = TRUE) %>% mutate(model = "model6"),
broom::tidy(model7, conf.int = TRUE) %>% mutate(model = "model7"),
broom::tidy(model8, conf.int = TRUE) %>% mutate(model = "model8")
) %>%
filter(term %in% c("EDUC", "RACE", "SMSA", "MARRIED", "AGEQ", "AGEQSQ")) %>%
mutate(
term = case_when(
term == "EDUC" ~ "Years of education",
term == "RACE" ~ "Race (1 = black)",
term == "SMSA" ~ "SMSA (1 = center city)",
term == "MARRIED" ~ "Married (1 = married)",
term == "AGEQ" ~ "Age",
term == "AGEQSQ" ~ "Age-squared",
TRUE ~ term
),
p.sig = case_when(
p.value < 0.001 ~ "***",
p.value < 0.01 ~ "**",
p.value < 0.05 ~ "*",
TRUE ~ ""
),
value = ak91_coef_cell(estimate, std.error, p.sig)
) %>%
select(term, model, value) %>%
pivot_wider(
names_from = model,
values_from = value
) %>%
# reorder the term
mutate(
term = factor(
term,
levels = c("Years of education", "Race (1 = black)", "SMSA (1 = center city)",
"Married (1 = married)", "Age", "Age-squared"))
) %>%
arrange(term) %>%
rename_all(~ all_of(c("Independent Variables", "(1)
OLS", "(2)
TSLS",
"(3)
OLS", "(4)
TSLS", "(5)
OLS", "(6)
TSLS",
"(7)
OLS", "(8)
TSLS")))
# Create the table using kableExtra
kbl_out <- table_data %>%
kbl(
format = "html", escape = FALSE,
scroll = TRUE,
align = c("l", "c", "c", "c", "c", "c", "c", "c", "c", "c"),
caption = "Table VI: OLS and TSLS Estimates of the Return to Education for Men Born 1940-1949: 1980 Census"
) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
footnote(
general = "Yes-No Dummies are not kept in the shown figure.",
number = c(
"$^{*}$ p<0.05; $^{**}$ p<0.01; $^{***}$ p<0.001",
"The dependent variable is the log of weekly wage. The sample includes men born 1940-1949 in the 1980 Census.",
"The excluded instrumental variables for all TSLS models are not shown in the table and are the same which are the 27 interaction terms between the quarter of birth and year of birth $QTR_j \\times YR_i$, $i \\in \\{0, 1, \\dots, 9\\}$ and $j \\in \\{1, 2, 3\\}$."
)
)
# Create the table using gt
table_data <- ak91_quarto_blank_df(table_data)
gt_tbl <- table_data %>%
gt() %>%
# Set title
tab_header(
title = "Table VI: OLS and TSLS Estimates of the Return to Education for Men Born 1940-1949: 1980 Census"
) %>%
# Set column alignment
cols_align(
align = "left",
columns = 1
) %>%
cols_align(
align = "center",
columns = 2:9
) %>%
# Add footnotes
tab_footnote(
footnote = "Yes-No Dummies are not kept in the shown figure.",
locations = cells_title(groups = "title")
) %>%
tab_footnote(
footnote = md("$^{*}p<0.05; ^{**}p<0.01; ^{***}p<0.001$"),
locations = cells_title(groups = "title")
) %>%
tab_footnote(
footnote = "The dependent variable is the log of weekly wage. The sample includes men born 1940-1949 in the 1980 Census.",
locations = cells_title(groups = "title")
) %>%
tab_footnote(
footnote = md("The excluded instrumental variables for all TSLS models are not shown in the table and are the same which are the 27 interaction terms between the quarter of birth and year of birth $QTR_j \\times YR_i$, $i \\in \\{0, 1, \\dots, 9\\}$ and $j \\in \\{1, 2, 3\\}$."),
locations = cells_title(groups = "title")
) %>%
# Set table style
opt_row_striping() %>%
tab_options(
table.font.size = px(14),
heading.background.color = "#e8e8e8",
column_labels.font.weight = "bold",
table.border.top.width = px(3),
table.border.bottom.width = px(3),
container.width = pct(100),
table.width = px(100)
) %>%
# Ensure math symbols render correctly
fmt_markdown(columns = everything()) %>%
# Ensure all column headers use gt package's html() syntax
cols_label(.list = ak91_gt_cols_label(names(table_data)))
# Save the objects to RData file
save(table_data, gt_tbl, file = here("04-topics/rep-ak1991/Rcode/Table_VI.RData"))