# Bound et al. (1995) Table 3 – simulated quarter-of-birth instruments. library(tidyverse) library(ivreg) library(here) library(gt) source(here("04-topics/rep-bound1995/Rcode/bound1995-data-prep.R")) source(here("04-topics/rep-bound1995/Rcode/bound1995-iv-diagnostics.R")) source(here("04-topics/rep-bound1995/Rcode/bound1995-gt-quarto.R")) n_reps <- as.integer(Sys.getenv("BOUND95_SIM_REPS", "500")) n_cores <- suppressWarnings(as.integer(Sys.getenv("BOUND95_SIM_CORES", ""))) if (is.na(n_cores) || n_cores < 1L) { n_cores <- max(1L, parallel::detectCores() - 1L) } seed_base <- as.integer(Sys.getenv("BOUND95_SIM_SEED", "1995")) bound95_sim_checkpoint_path <- function(label) { slug <- gsub("[^a-zA-Z0-9]+", "_", label) here("04-topics/rep-bound1995/Rcode", paste0("Table_III_spec_", slug, ".rds")) } run_one_rep <- function(spec, data, seed) { d <- bound95_resimulate_qob(data, seed = seed) m <- bound95_run_iv(d, spec$controls, spec$excluded, spec$included) fs <- bound95_first_stage_stats( d, excluded = spec$excluded, included = spec$included ) educ <- bound95_educ_coef(m) tibble( rep = NA_integer_, coef = as.numeric(educ$estimate), se = as.numeric(educ$std.error), f_first = as.numeric(fs$f_excluded) ) } run_spec_sims <- function(spec, data, n_reps, n_cores, seed_base) { ckpt <- bound95_sim_checkpoint_path(spec$label) if (file.exists(ckpt)) { existing <- readRDS(ckpt) if (nrow(existing) >= n_reps) { message(" checkpoint hit: ", ckpt, " (", nrow(existing), " reps)") return(existing |> slice_head(n = n_reps)) } } reps_done <- if (file.exists(ckpt)) nrow(readRDS(ckpt)) else 0L if (reps_done > 0L) { message(" resuming from rep ", reps_done + 1L, " / ", n_reps) } run_indices <- seq.int(reps_done + 1L, n_reps) if (length(run_indices) == 0L) { return(readRDS(ckpt)) } one <- function(i) { out <- run_one_rep(spec, data, seed = seed_base + i) out$rep <- i out } if (n_cores <= 1L || length(run_indices) == 1L) { new_rows <- purrr::map_dfr(run_indices, function(i) { if (i %% 10L == 0L || i == n_reps) { message(" rep ", i, " / ", n_reps) } one(i) }) } else { message(" parallel: ", n_cores, " workers, ", length(run_indices), " reps") cl <- tryCatch( parallel::makePSOCKcluster(n_cores), error = function(e) { message(" cluster setup failed: ", conditionMessage(e)) NULL } ) if (is.null(cl)) { message(" falling back to serial execution") new_rows <- purrr::map_dfr(run_indices, function(i) { if (i %% 10L == 0L || i == n_reps) { message(" rep ", i, " / ", n_reps) } one(i) }) } else { on.exit(parallel::stopCluster(cl), add = TRUE) parallel::clusterExport( cl, c( "spec", "data", "seed_base", "one", "run_one_rep", "bound95_resimulate_qob", "bound95_resimulate_qob_inplace", "bound95_run_iv", "bound95_first_stage_stats", "bound95_educ_coef", "bound95_effective_excluded", "bound95_run_iv_chunked", "bound95_first_stage_chunked", "bound95_chunk_moments", "bound95_ols_from_moments", "bound95_chunk_wx", "bound95_qr_solve", "bound95_rank_keep", "bound95_subset_mom", "bound95_chunk_size" ), envir = environment() ) parallel::clusterEvalQ(cl, { library(here) library(tidyverse) library(ivreg) source(here("04-topics/rep-bound1995/Rcode/bound1995-data-prep.R")) source(here("04-topics/rep-bound1995/Rcode/bound1995-iv-diagnostics.R")) }) chunks <- split(run_indices, cut(seq_along(run_indices), breaks = min(n_cores, length(run_indices)), labels = FALSE )) chunk_results <- parallel::parLapply(cl, chunks, function(idxs) { purrr::map_dfr(idxs, one) }) parallel::stopCluster(cl) on.exit(NULL) new_rows <- bind_rows(chunk_results) } } if (reps_done > 0L) { new_rows <- bind_rows(readRDS(ckpt), new_rows) } saveRDS(new_rows, ckpt) new_rows } summarise_sim <- function(sim_results) { sim_results |> mutate( coef = as.numeric(coef), se = as.numeric(se), f_first = as.numeric(f_first) ) |> group_by(spec) |> summarise( coef_mean = mean(coef, na.rm = TRUE), coef_sd = sd(coef, na.rm = TRUE), coef_p05 = unname(quantile(coef, 0.05, na.rm = TRUE)), coef_median = median(coef, na.rm = TRUE), coef_p95 = unname(quantile(coef, 0.95, na.rm = TRUE)), se_mean = mean(se, na.rm = TRUE), se_sd = sd(se, na.rm = TRUE), se_p05 = unname(quantile(se, 0.05, na.rm = TRUE)), se_median = median(se, na.rm = TRUE), se_p95 = unname(quantile(se, 0.95, na.rm = TRUE)), f_mean = mean(f_first, na.rm = TRUE), f_sd = sd(f_first, na.rm = TRUE), f_median = median(f_first, na.rm = TRUE), .groups = "drop" ) } build_gt_table <- function(sim_summary, sim_specs, n_reps) { stat_keys <- c("mean", "sd", "p05", "median", "p95") row_labels <- c( "Mean", "Standard deviation", "5th percentile", "Median", "95th percentile" ) build_block <- function(prefix) { out <- tibble(` ` = row_labels) for (sp in sim_specs) { pref <- sim_summary |> filter(.data$spec == sp$label) out[[sp$label]] <- vapply(stat_keys, function(k) { bound95_stat_cell(pref[[paste0(prefix, "_", k)]], bound95_digits) }, character(1)) } out } table_coef <- build_block("coef") table_se <- build_block("se") f_row <- tibble(` ` = "Mean") for (sp in sim_specs) { pref <- sim_summary |> filter(.data$spec == sp$label) f_row[[sp$label]] <- bound95_stat_cell(pref$f_mean, bound95_digits) } table_data <- bind_rows( tibble(Section = "Estimated coefficient", table_coef), tibble(Section = "Estimated standard error", table_se), tibble(Section = "First-stage F (excluded instruments)", f_row) ) gt_tbl <- table_data |> gt(groupname_col = "Section") |> tab_header( title = md("Table 3: IV Estimates Using Simulated Quarter of Birth"), subtitle = md(paste0(n_reps, " replications; men born 1930–1939")) ) |> cols_align(align = "left", columns = 1:2) |> cols_align(align = "center", columns = 3:6) |> tab_footnote( footnote = paste( "Quarter of birth is randomly permuted within the sample (Krueger suggestion).", "With invalid instruments, second-stage coefficients look plausible but the", "first-stage F statistic clusters near 1 (see mean F row)." ), locations = cells_title(groups = "title") ) |> fmt_markdown(columns = everything()) list(table_data = table_data, gt_tbl = gt_tbl) } message("Table III simulation: ", n_reps, " replications × 4 specs (cores = ", n_cores, ")") data_base <- load_bound95_data(with_state = TRUE) message("Sample loaded: N = ", nrow(data_base)) demo <- bound95_demo_controls yob <- bound95_yob_controls state <- bound95_state_controls(data_base) qtr <- bound95_qtr_iv qtr_yr <- bound95_qtr_yr_iv excluded_t2 <- c(qtr, qtr_yr, bound95_state_iv(data_base)) sim_specs <- list( list(label = "1 (4)", controls = c(yob, demo), excluded = qtr_yr, included = c(yob, demo)), list( label = "1 (6)", controls = c(yob, "AGEQ", "AGEQSQ", demo), excluded = qtr_yr, included = c(yob, "AGEQ", "AGEQSQ", demo) ), list( label = "2 (2)", controls = c(yob, demo, state), excluded = excluded_t2, included = c(yob, demo, state) ), list( label = "2 (4)", controls = c(yob, "AGEQ", "AGEQSQ", demo, state), excluded = excluded_t2, included = c(yob, "AGEQ", "AGEQSQ", demo, state) ) ) sim_results <- purrr::map_dfr(sim_specs, function(spec) { message("Spec ", spec$label, " ...") t0 <- Sys.time() reps <- run_spec_sims(spec, data_base, n_reps, n_cores, seed_base) message(" done in ", round(difftime(Sys.time(), t0, units = "mins"), 1), " min") reps |> mutate(spec = spec$label) |> relocate(spec, rep) }) sim_summary <- summarise_sim(sim_results) paper_targets <- tibble::tribble( ~spec, ~coef_mean, ~coef_sd, ~se_mean, ~f_mean, "1 (4)", 0.062, 0.038, 0.037, 1.0, "1 (6)", 0.061, 0.039, 0.039, 1.0, "2 (2)", 0.060, 0.015, 0.015, 1.0, "2 (4)", 0.060, 0.015, 0.015, 1.0 ) message("Simulation summary vs Bound (1995) Table 3:") print(sim_summary) print(paper_targets) gt_out <- build_gt_table(sim_summary, sim_specs, n_reps) table_data <- gt_out$table_data gt_tbl <- gt_out$gt_tbl save( sim_results, sim_summary, table_data, gt_tbl, paper_targets, n_reps, n_cores, file = here("04-topics/rep-bound1995/Rcode/Table_III.RData") ) message("Saved: 04-topics/rep-bound1995/Rcode/Table_III.RData")