# Load required libraries library(tidyverse) library(haven) # For reading Stata files library(ivreg) # For instrumental variables regression library(knitr) # For generating tables library(kableExtra) library(broom) # For tidying model results library(here) library(gt) # For creating gt tables source(here("04-topics/rep-ak1991/Rcode/ak1991-gt-quarto.R")) # Read the data data <- read_dta(here("04-topics/rep-ak1991/stata/raw_data.dta")) # Rename variables data <- data %>% rename( AGE = v1, AGEQ = v2, EDUC = v4, ENOCENT = v5, ESOCENT = v6, LWKLYWGE = v9, MARRIED = v10, MIDATL = v11, MT = v12, NEWENG = v13, CENSUS = v16, SOB = v17, QOB = v18, RACE = v19, SMSA = v20, SOATL = v21, WNOCENT = v24, WSOCENT = v25, YOB = v27 ) # Adjust YOB for years >= 1900 data <- data %>% mutate(YOB = if_else(YOB >= 1900, YOB - 1900, YOB)) # Create YOB dummies for (i in 0:9) { data <- data %>% mutate(!!paste0("YR", i) := if_else(YOB %in% c(20 + i, 30 + i, 40 + i), 1, 0)) } # Create QOB dummies for (i in 1:4) { data <- data %>% mutate(!!paste0("QTR", i) := if_else(QOB == i, 1, 0)) } # Create QOB*YOB interaction dummies for (j in 1:3) { for (i in 0:9) { data <- data %>% mutate(!!paste0("QTR", j, "YR", i) := !!sym(paste0("QTR", j)) * !!sym(paste0("YR", i))) } } # Create COHORT variable and adjust AGEQ data <- data %>% mutate( COHORT = case_when( YOB <= 29 & YOB >= 20 ~ 2029, YOB <= 39 & YOB >= 30 ~ 3039, YOB <= 49 & YOB >= 40 ~ 4049, TRUE ~ NA_real_ ), AGEQ = if_else(CENSUS == 80, AGEQ - 1900, AGEQ), AGEQSQ = AGEQ * AGEQ ) # Select particular men born - Filter for black men born 1930-1939 data <- data %>% filter(COHORT > 3000 & COHORT < 3040 & RACE == 1) # Create state dummies using tabulate # Create dummy variables for unique SOB values data %>% select(SOB) %>% group_by(SOB) %>% summarise(n = n()) %>% print(n = 100) unique_states <- sort(unique(data$SOB)) num_states <- length(unique_states) # 50 for (i in 1:(num_states-1)) { data <- data %>% mutate(!!paste0("state", i) := if_else(SOB == unique_states[i], 1, 0)) } # Create QOB*SOB interaction dummies for (j in 1:3) { for (i in 1:(num_states-1)) { # Check if the state variable exists before creating interaction if (paste0("state", i) %in% colnames(data)) { data <- data %>% mutate(!!paste0("QTR", j, "state", i) := !!sym(paste0("QTR", j)) * !!sym(paste0("state", i))) } } } # Define included and excluded instrument variables # YR0-YR8 are part of both formulas yr_vars <- paste0("YR", 0:8) # QTR*YR interactions as instruments (excluded instruments) qtr_yr_vars <- c() for (j in 1:3) { for (i in 0:9) { qtr_yr_vars <- c(qtr_yr_vars, paste0("QTR", j, "YR", i)) } } # State dummies state_vars <- paste0("state", 1:(num_states-1)) state_vars <- state_vars[state_vars %in% colnames(data)] # QTR*state interactions as instruments qtr_state_vars <- c() for (j in 1:3) { for (i in 1:(num_states-1)) { var_name <- paste0("QTR", j, "state", i) if (var_name %in% colnames(data)) { qtr_state_vars <- c(qtr_state_vars, var_name) } } } # Create formula strings for the excluded instruments (for clarity) excluded_instruments <- c(qtr_yr_vars, qtr_state_vars) excluded_instr_str <- paste(excluded_instruments, collapse = " + ") # Define regression formulas # Model 1: OLS with state dummies (no IV) formula1 <- as.formula(paste("LWKLYWGE ~ EDUC + ", paste(yr_vars, collapse = " + "), " + ", paste(state_vars, collapse = " + "))) # Model 2: IV with QTR*YR and QTR*state as instruments formula2 <- as.formula(paste("LWKLYWGE ~ EDUC + ", paste(yr_vars, collapse = " + "), " + ", paste(state_vars, collapse = " + "), " | ", paste(yr_vars, collapse = " + "), " + ", paste(state_vars, collapse = " + "), " + ", excluded_instr_str)) # Model 3: OLS with state dummies and age controls formula3 <- as.formula(paste("LWKLYWGE ~ EDUC + ", paste(yr_vars, collapse = " + "), " + AGEQ + AGEQSQ + ", paste(state_vars, collapse = " + "))) # Model 4: IV with age controls formula4 <- as.formula(paste("LWKLYWGE ~ EDUC + ", paste(yr_vars, collapse = " + "), " + AGEQ + AGEQSQ + ", paste(state_vars, collapse = " + "), " | ", paste(yr_vars, collapse = " + "), " + AGEQ + AGEQSQ + ", paste(state_vars, collapse = " + "), " + ", excluded_instr_str)) # Model 5: OLS with demographic controls formula5 <- as.formula(paste("LWKLYWGE ~ EDUC + MARRIED + SMSA + NEWENG + MIDATL + ENOCENT + ", "WNOCENT + SOATL + ESOCENT + WSOCENT + MT + ", paste(yr_vars, collapse = " + "), " + ", paste(state_vars, collapse = " + "))) # Model 6: IV with demographic controls formula6 <- as.formula(paste("LWKLYWGE ~ EDUC + MARRIED + SMSA + NEWENG + MIDATL + ENOCENT + ", "WNOCENT + SOATL + ESOCENT + WSOCENT + MT + ", paste(yr_vars, collapse = " + "), " + ", paste(state_vars, collapse = " + "), " | ", "MARRIED + SMSA + NEWENG + MIDATL + ENOCENT + ", "WNOCENT + SOATL + ESOCENT + WSOCENT + MT + ", paste(yr_vars, collapse = " + "), " + ", paste(state_vars, collapse = " + "), " + ", excluded_instr_str)) # Model 7: OLS with all controls formula7 <- as.formula(paste("LWKLYWGE ~ EDUC + MARRIED + SMSA + NEWENG + MIDATL + ENOCENT + ", "WNOCENT + SOATL + ESOCENT + WSOCENT + MT + ", paste(yr_vars, collapse = " + "), " + AGEQ + AGEQSQ + ", paste(state_vars, collapse = " + "))) # Model 8: IV with all controls formula8 <- as.formula(paste("LWKLYWGE ~ EDUC + MARRIED + SMSA + NEWENG + MIDATL + ENOCENT + ", "WNOCENT + SOATL + ESOCENT + WSOCENT + MT + ", paste(yr_vars, collapse = " + "), " + AGEQ + AGEQSQ + ", paste(state_vars, collapse = " + "), " | ", "MARRIED + SMSA + NEWENG + MIDATL + ENOCENT + ", "WNOCENT + SOATL + ESOCENT + WSOCENT + MT + AGEQ + AGEQSQ + ", paste(yr_vars, collapse = " + "), " + ", paste(state_vars, collapse = " + "), " + ", excluded_instr_str)) # Run regressions model1 <- lm(formula1, data = data) # summary(model1) model2 <- ivreg(formula2, data = data) # summary(model2) model3 <- lm(formula3, data = data) # summary(model3) model4 <- ivreg(formula4, data = data) # summary(model4) model5 <- lm(formula5, data = data) # summary(model5) model6 <- ivreg(formula6, data = data) # summary(model6) model7 <- lm(formula7, data = data) # summary(model7) model8 <- ivreg(formula8, data = data) # summary(model8) # Extract coefficients and standard errors table_data <- bind_rows( broom::tidy(model1, conf.int = TRUE) %>% mutate(model = "model1"), broom::tidy(model2, conf.int = TRUE) %>% mutate(model = "model2"), broom::tidy(model3, conf.int = TRUE) %>% mutate(model = "model3"), broom::tidy(model4, conf.int = TRUE) %>% mutate(model = "model4"), broom::tidy(model5, conf.int = TRUE) %>% mutate(model = "model5"), broom::tidy(model6, conf.int = TRUE) %>% mutate(model = "model6"), broom::tidy(model7, conf.int = TRUE) %>% mutate(model = "model7"), broom::tidy(model8, conf.int = TRUE) %>% mutate(model = "model8") ) %>% filter(term %in% c("EDUC", "SMSA", "MARRIED", "AGEQ", "AGEQSQ")) %>% mutate( term = case_when( term == "EDUC" ~ "Years of education", term == "SMSA" ~ "SMSA (1 = center city)", term == "MARRIED" ~ "Married (1 = married)", term == "AGEQ" ~ "Age", term == "AGEQSQ" ~ "Age-squared", TRUE ~ term ), p.sig = case_when( p.value < 0.001 ~ "***", p.value < 0.01 ~ "**", p.value < 0.05 ~ "*", TRUE ~ "" ), value = ak91_coef_cell(estimate, std.error, p.sig) ) %>% select(term, model, value) %>% pivot_wider( names_from = model, values_from = value ) %>% # reorder the term mutate( term = factor( term, levels = c("Years of education", "SMSA (1 = center city)", "Married (1 = married)", "Age", "Age-squared")) ) %>% arrange(term) %>% rename_all(~ all_of(c("Independent Variables", "(1)
OLS", "(2)
TSLS", "(3)
OLS", "(4)
TSLS", "(5)
OLS", "(6)
TSLS", "(7)
OLS", "(8)
TSLS"))) # Create the table using kableExtra kbl_out <- table_data %>% kbl( format = "html", escape = FALSE, scroll = TRUE, align = c("l", "c", "c", "c", "c", "c", "c", "c", "c", "c"), caption = "Table VIII: OLS and TSLS Estimates of the Return to Education for Black Men Born 1930-1939: 1980 Census" ) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>% footnote( general = "Yes-No Dummies are not kept in the shown figure.", number = c( "$^{*}$ p<0.05; $^{**}$ p<0.01; $^{***}$ p<0.001", "The dependent variable is the log of weekly wage. The sample includes black men born 1930-1939 in the 1980 Census.", "The excluded instrumental variables for all TSLS models are not shown in the table and include quarter of birth-year of birth (27 interactions) and 150 quarter of birth-state of birth interactions." ) ) # Create the table using gt table_data <- ak91_quarto_blank_df(table_data) gt_tbl <- table_data %>% gt() %>% # Set title tab_header( title = "Table VIII: OLS and TSLS Estimates of the Return to Education for Black Men Born 1930-1939: 1980 Census" ) %>% # Set column alignment cols_align( align = "left", columns = 1 ) %>% cols_align( align = "center", columns = 2:9 ) %>% # Add footnotes tab_footnote( footnote = "Yes-No Dummies are not kept in the shown figure.", locations = cells_title(groups = "title") ) %>% tab_footnote( footnote = md("$^{*}p<0.05; ^{**}p<0.01; ^{***}p<0.001$"), locations = cells_title(groups = "title") ) %>% tab_footnote( footnote = "The dependent variable is the log of weekly wage. The sample includes black men born 1930-1939 in the 1980 Census.", locations = cells_title(groups = "title") ) %>% tab_footnote( footnote = md("The excluded instrumental variables for all TSLS models are not shown in the table and include quarter of birth-year of birth (27 interactions) and 150 quarter of birth-state of birth interactions."), locations = cells_title(groups = "title") ) %>% # Set table style opt_row_striping() %>% tab_options( table.font.size = px(14), heading.background.color = "#e8e8e8", column_labels.font.weight = "bold", table.border.top.width = px(3), table.border.bottom.width = px(3), container.width = pct(100), table.width = px(900) # 固定宽度会自动启用滚动条 ) %>% # Ensure math symbols render correctly fmt_markdown(columns = everything()) %>% # Ensure all column headers use gt package's html() syntax cols_label(.list = ak91_gt_cols_label(names(table_data))) # Save the objects to RData file save(table_data, gt_tbl, file = here("04-topics/rep-ak1991/Rcode/Table_VIII.RData"))