# Bound et al. (1995) Table 3 – simulated quarter-of-birth instruments.
library(tidyverse)
library(ivreg)
library(here)
library(gt)
source(here("04-topics/rep-bound1995/Rcode/bound1995-data-prep.R"))
source(here("04-topics/rep-bound1995/Rcode/bound1995-iv-diagnostics.R"))
source(here("04-topics/rep-bound1995/Rcode/bound1995-gt-quarto.R"))

n_reps <- as.integer(Sys.getenv("BOUND95_SIM_REPS", "500"))
n_cores <- suppressWarnings(as.integer(Sys.getenv("BOUND95_SIM_CORES", "")))
if (is.na(n_cores) || n_cores < 1L) {
  n_cores <- max(1L, parallel::detectCores() - 1L)
}
seed_base <- as.integer(Sys.getenv("BOUND95_SIM_SEED", "1995"))

bound95_sim_checkpoint_path <- function(label) {
  slug <- gsub("[^a-zA-Z0-9]+", "_", label)
  here("04-topics/rep-bound1995/Rcode", paste0("Table_III_spec_", slug, ".rds"))
}

run_one_rep <- function(spec, data, seed) {
  d <- bound95_resimulate_qob(data, seed = seed)
  m <- bound95_run_iv(d, spec$controls, spec$excluded, spec$included)
  fs <- bound95_first_stage_stats(
    d, excluded = spec$excluded, included = spec$included
  )
  educ <- bound95_educ_coef(m)
  tibble(
    rep = NA_integer_,
    coef = as.numeric(educ$estimate),
    se = as.numeric(educ$std.error),
    f_first = as.numeric(fs$f_excluded)
  )
}

run_spec_sims <- function(spec, data, n_reps, n_cores, seed_base) {
  ckpt <- bound95_sim_checkpoint_path(spec$label)
  if (file.exists(ckpt)) {
    existing <- readRDS(ckpt)
    if (nrow(existing) >= n_reps) {
      message("  checkpoint hit: ", ckpt, " (", nrow(existing), " reps)")
      return(existing |> slice_head(n = n_reps))
    }
  }

  reps_done <- if (file.exists(ckpt)) nrow(readRDS(ckpt)) else 0L
  if (reps_done > 0L) {
    message("  resuming from rep ", reps_done + 1L, " / ", n_reps)
  }

  run_indices <- seq.int(reps_done + 1L, n_reps)
  if (length(run_indices) == 0L) {
    return(readRDS(ckpt))
  }

  one <- function(i) {
    out <- run_one_rep(spec, data, seed = seed_base + i)
    out$rep <- i
    out
  }

  if (n_cores <= 1L || length(run_indices) == 1L) {
    new_rows <- purrr::map_dfr(run_indices, function(i) {
      if (i %% 10L == 0L || i == n_reps) {
        message("    rep ", i, " / ", n_reps)
      }
      one(i)
    })
  } else {
    message("  parallel: ", n_cores, " workers, ", length(run_indices), " reps")
    cl <- tryCatch(
      parallel::makePSOCKcluster(n_cores),
      error = function(e) {
        message("  cluster setup failed: ", conditionMessage(e))
        NULL
      }
    )
    if (is.null(cl)) {
      message("  falling back to serial execution")
      new_rows <- purrr::map_dfr(run_indices, function(i) {
        if (i %% 10L == 0L || i == n_reps) {
          message("    rep ", i, " / ", n_reps)
        }
        one(i)
      })
    } else {
      on.exit(parallel::stopCluster(cl), add = TRUE)
      parallel::clusterExport(
        cl,
        c(
        "spec", "data", "seed_base", "one", "run_one_rep",
        "bound95_resimulate_qob", "bound95_resimulate_qob_inplace",
          "bound95_run_iv", "bound95_first_stage_stats", "bound95_educ_coef",
          "bound95_effective_excluded", "bound95_run_iv_chunked",
          "bound95_first_stage_chunked", "bound95_chunk_moments",
          "bound95_ols_from_moments", "bound95_chunk_wx", "bound95_qr_solve",
          "bound95_rank_keep", "bound95_subset_mom", "bound95_chunk_size"
        ),
        envir = environment()
      )
      parallel::clusterEvalQ(cl, {
        library(here)
        library(tidyverse)
        library(ivreg)
        source(here("04-topics/rep-bound1995/Rcode/bound1995-data-prep.R"))
        source(here("04-topics/rep-bound1995/Rcode/bound1995-iv-diagnostics.R"))
      })
      chunks <- split(run_indices, cut(seq_along(run_indices),
        breaks = min(n_cores, length(run_indices)),
        labels = FALSE
      ))
      chunk_results <- parallel::parLapply(cl, chunks, function(idxs) {
        purrr::map_dfr(idxs, one)
      })
      parallel::stopCluster(cl)
      on.exit(NULL)
      new_rows <- bind_rows(chunk_results)
    }
  }

  if (reps_done > 0L) {
    new_rows <- bind_rows(readRDS(ckpt), new_rows)
  }
  saveRDS(new_rows, ckpt)
  new_rows
}

summarise_sim <- function(sim_results) {
  sim_results |>
    mutate(
      coef = as.numeric(coef),
      se = as.numeric(se),
      f_first = as.numeric(f_first)
    ) |>
    group_by(spec) |>
    summarise(
      coef_mean = mean(coef, na.rm = TRUE),
      coef_sd = sd(coef, na.rm = TRUE),
      coef_p05 = unname(quantile(coef, 0.05, na.rm = TRUE)),
      coef_median = median(coef, na.rm = TRUE),
      coef_p95 = unname(quantile(coef, 0.95, na.rm = TRUE)),
      se_mean = mean(se, na.rm = TRUE),
      se_sd = sd(se, na.rm = TRUE),
      se_p05 = unname(quantile(se, 0.05, na.rm = TRUE)),
      se_median = median(se, na.rm = TRUE),
      se_p95 = unname(quantile(se, 0.95, na.rm = TRUE)),
      f_mean = mean(f_first, na.rm = TRUE),
      f_sd = sd(f_first, na.rm = TRUE),
      f_median = median(f_first, na.rm = TRUE),
      .groups = "drop"
    )
}

build_gt_table <- function(sim_summary, sim_specs, n_reps) {
  stat_keys <- c("mean", "sd", "p05", "median", "p95")
  row_labels <- c(
    "Mean", "Standard deviation", "5th percentile", "Median", "95th percentile"
  )

  build_block <- function(prefix) {
    out <- tibble(` ` = row_labels)
    for (sp in sim_specs) {
      pref <- sim_summary |> filter(.data$spec == sp$label)
      out[[sp$label]] <- vapply(stat_keys, function(k) {
        bound95_stat_cell(pref[[paste0(prefix, "_", k)]], bound95_digits)
      }, character(1))
    }
    out
  }

  table_coef <- build_block("coef")
  table_se <- build_block("se")

  f_row <- tibble(` ` = "Mean")
  for (sp in sim_specs) {
    pref <- sim_summary |> filter(.data$spec == sp$label)
    f_row[[sp$label]] <- bound95_stat_cell(pref$f_mean, bound95_digits)
  }

  table_data <- bind_rows(
    tibble(Section = "Estimated coefficient", table_coef),
    tibble(Section = "Estimated standard error", table_se),
    tibble(Section = "First-stage F (excluded instruments)", f_row)
  )

  gt_tbl <- table_data |>
    gt(groupname_col = "Section") |>
    tab_header(
      title = md("Table 3: IV Estimates Using Simulated Quarter of Birth"),
      subtitle = md(paste0(n_reps, " replications; men born 1930–1939"))
    ) |>
    cols_align(align = "left", columns = 1:2) |>
    cols_align(align = "center", columns = 3:6) |>
    tab_footnote(
      footnote = paste(
        "Quarter of birth is randomly permuted within the sample (Krueger suggestion).",
        "With invalid instruments, second-stage coefficients look plausible but the",
        "first-stage F statistic clusters near 1 (see mean F row)."
      ),
      locations = cells_title(groups = "title")
    ) |>
    fmt_markdown(columns = everything())

  list(table_data = table_data, gt_tbl = gt_tbl)
}

message("Table III simulation: ", n_reps, " replications × 4 specs (cores = ", n_cores, ")")
data_base <- load_bound95_data(with_state = TRUE)
message("Sample loaded: N = ", nrow(data_base))

demo <- bound95_demo_controls
yob <- bound95_yob_controls
state <- bound95_state_controls(data_base)
qtr <- bound95_qtr_iv
qtr_yr <- bound95_qtr_yr_iv
excluded_t2 <- c(qtr, qtr_yr, bound95_state_iv(data_base))

sim_specs <- list(
  list(label = "1 (4)", controls = c(yob, demo), excluded = qtr_yr, included = c(yob, demo)),
  list(
    label = "1 (6)",
    controls = c(yob, "AGEQ", "AGEQSQ", demo),
    excluded = qtr_yr,
    included = c(yob, "AGEQ", "AGEQSQ", demo)
  ),
  list(
    label = "2 (2)",
    controls = c(yob, demo, state),
    excluded = excluded_t2,
    included = c(yob, demo, state)
  ),
  list(
    label = "2 (4)",
    controls = c(yob, "AGEQ", "AGEQSQ", demo, state),
    excluded = excluded_t2,
    included = c(yob, "AGEQ", "AGEQSQ", demo, state)
  )
)

sim_results <- purrr::map_dfr(sim_specs, function(spec) {
  message("Spec ", spec$label, " ...")
  t0 <- Sys.time()
  reps <- run_spec_sims(spec, data_base, n_reps, n_cores, seed_base)
  message("  done in ", round(difftime(Sys.time(), t0, units = "mins"), 1), " min")
  reps |> mutate(spec = spec$label) |> relocate(spec, rep)
})

sim_summary <- summarise_sim(sim_results)

paper_targets <- tibble::tribble(
  ~spec, ~coef_mean, ~coef_sd, ~se_mean, ~f_mean,
  "1 (4)", 0.062, 0.038, 0.037, 1.0,
  "1 (6)", 0.061, 0.039, 0.039, 1.0,
  "2 (2)", 0.060, 0.015, 0.015, 1.0,
  "2 (4)", 0.060, 0.015, 0.015, 1.0
)

message("Simulation summary vs Bound (1995) Table 3:")
print(sim_summary)
print(paper_targets)

gt_out <- build_gt_table(sim_summary, sim_specs, n_reps)
table_data <- gt_out$table_data
gt_tbl <- gt_out$gt_tbl

save(
  sim_results, sim_summary, table_data, gt_tbl, paper_targets, n_reps, n_cores,
  file = here("04-topics/rep-bound1995/Rcode/Table_III.RData")
)
message("Saved: 04-topics/rep-bound1995/Rcode/Table_III.RData")