# Semiparametric MTE and treatment-parameter helpers for CHV2011.
# Polynomial outcome equation follows chv2011-table4a-core.R (author bootstrap_MT.R layout).
source(here::here("04-topics/rep-chv2011/Rcode/chv2011-data-prep.R"))
source(here::here("04-topics/rep-chv2011/Rcode/chv2011-table4a-core.R"))

chv2011_mte_grid <- seq(0, 1, by = 0.04)

chv2011_normal_mte_path <- function(aer_dir = chv2011_aer_dir) {
  file.path(aer_dir, "normalmodel/mtexvb.out")
}

#' Author normal-model MTE curve (shipped mtexvb.out) or Heckman–Vytlacil proxy.
load_chv2011_normal_mte_curve <- function(aer_dir = chv2011_aer_dir) {
  path <- chv2011_normal_mte_path(aer_dir)
  if (file.exists(path)) {
    mat <- as.matrix(utils::read.table(path))
    return(tibble::tibble(u = mat[, 1L], mte = mat[, 3L]))
  }
  ana <- load_chv2011_analysis(trim = TRUE)
  df <- ana$data
  delta <- stats::coef(stats::lm(
    stats::as.formula(paste("wage ~ state +", paste(chv2011_x, collapse = " + "))),
    data = df
  ))[["state"]]
  sigma_v <- 0.5
  u <- seq(0.001, 0.999, length.out = 101)
  tibble::tibble(u = u, mte = (as.numeric(delta) + sigma_v * stats::qnorm(1 - u)) / 4)
}

chv2011_attach_propensity <- function(df, spec = "baseline", trim = TRUE) {
  prop <- estimate_propensity(df, spec)
  df$phat <- prop$phat
  df$xbeta <- prop$xbeta
  df$zgamma <- prop$zgamma
  if (trim) {
    df <- trim_common_support(df)
  }
  df
}

mte_derivative_at_u <- function(fit, u, degree = 4L, regressors = chv2011_wage_x) {
  b <- stats::coef(fit)
  n_x <- length(regressors)
  x_main <- stats::model.matrix(fit)[, seq_len(n_x + 1L), drop = FALSE]
  x_bar <- colMeans(x_main)
  b_prop <- b[n_x + 2L]
  int_start <- n_x + 2L + degree
  b_inter <- b[int_start:(int_start + n_x - 1L)]
  deriv <- sum(x_bar * c(b_prop, b_inter))
  if (degree >= 2L) {
    for (j in seq_len(degree - 1L)) {
      pwr <- j + 1L
      deriv <- deriv + pwr * u^(pwr - 1L) * b[n_x + 2L + j]
    }
  }
  as.numeric(deriv)
}

estimate_mte_polynomial <- function(df, degree = 4L, spec = "baseline") {
  regressors <- chv2011_wage_regressors(spec)
  d <- df
  d$prop <- d$phat
  fml <- chv2011_table4a_poly_formula(degree, regressors = regressors)
  fit <- stats::lm(fml, data = d)
  mte_vals <- vapply(
    chv2011_mte_grid,
    function(u) mte_derivative_at_u(fit, u, degree = degree, regressors = regressors),
    numeric(1)
  )
  list(mte = mte_vals, grid = chv2011_mte_grid, model = fit, degree = degree, spec = spec)
}

chv2011_trim_mte_grid <- function(mte_obj) {
  g <- mte_obj$grid
  m <- mte_obj$mte / 4
  keep <- g >= chv2011_p_bounds[1] & g <= chv2011_p_bounds[2]
  list(grid = g[keep], mte = m[keep])
}

compute_tilde_return_weights <- function(grid, phat, state) {
  phat1 <- phat[state == 1L]
  phat0 <- phat[state == 0L]
  tt_w <- vapply(grid, function(u) mean(phat1 > u, na.rm = TRUE), numeric(1))
  tut_w <- vapply(grid, function(u) mean(phat0 <= u, na.rm = TRUE), numeric(1))
  ate_w <- rep(1, length(grid))
  list(
    ate_w = ate_w / sum(ate_w),
    tt_w = tt_w / sum(tt_w),
    tut_w = tut_w / sum(tut_w)
  )
}

estimate_ols_iv_returns <- function(df) {
  d <- wage_interaction_terms(df)
  ols_f <- stats::as.formula(paste(
    "wage ~ state +", paste(chv2011_x, collapse = " + ")
  ))
  iv_f <- stats::as.formula(paste(
    "wage ~ state +", paste(chv2011_x, collapse = " + "),
    "|", paste(chv2011_x, collapse = " + "),
    "+ pub4 + lwage5_17 + lurate_17 + tuition"
  ))
  ols_fit <- stats::lm(ols_f, data = d)
  iv_fit <- AER::ivreg(iv_f, data = d)
  list(
    ols = stats::coef(ols_fit)[["state"]] / 4,
    iv = stats::coef(iv_fit)[["state"]] / 4,
    ols_fit = ols_fit,
    iv_fit = iv_fit
  )
}

compute_late_pairs <- function(mte_obj) {
  g <- mte_obj$grid
  m <- mte_obj$mte / 4
  pairs <- list(
    c(0, 0.04), c(0.08, 0.12), c(0.16, 0.20), c(0.24, 0.28),
    c(0.32, 0.36), c(0.40, 0.44), c(0.48, 0.52), c(0.56, 0.60),
    c(0.64, 0.68), c(0.72, 0.76), c(0.80, 0.84), c(0.88, 0.92)
  )
  idx <- function(u) which.min(abs(g - u))
  purrr::map_dfr(pairs, function(pr) {
    i1 <- idx(pr[1L])
    i2 <- idx(pr[2L])
    late1 <- mean(m[i1:min(i1 + 1L, length(m))])
    late2 <- mean(m[i2:min(i2 + 1L, length(m))])
    tibble::tibble(
      interval1 = paste0(pr[1L], "-", pr[1L] + 0.04),
      interval2 = paste0(pr[2L], "-", pr[2L] + 0.04),
      diff = late2 - late1
    )
  })
}

compute_mprte_weights <- function(grid, policy = c("z_shift", "p_shift", "p_scale")) {
  policy <- match.arg(policy)
  u <- grid
  switch(
    policy,
    z_shift = stats::dnorm(stats::qnorm(pmin(pmax(u, 1e-4), 1 - 1e-4))) /
      sum(stats::dnorm(stats::qnorm(pmin(pmax(u, 1e-4), 1 - 1e-4)))),
    p_shift = rep(1 / length(u), length(u)),
    p_scale = u / sum(u)
  )
}

weighted_mte_mean <- function(mte, grid, weights) {
  w <- weights / sum(weights)
  sum(mte * w)
}

compute_semiparam_returns <- function(df, spec = "baseline") {
  mte_obj <- estimate_mte_polynomial(df, spec = spec)
  m_full <- mte_obj$mte / 4
  g_full <- mte_obj$grid
  trimmed <- chv2011_trim_mte_grid(mte_obj)
  w <- compute_tilde_return_weights(trimmed$grid, df$phat, df$state)
  iv_ols <- estimate_ols_iv_returns(df)
  tibble::tibble(
    parameter = c(
      "ATE_tilde", "TT_tilde", "TUT_tilde",
      "MPRTE_Z", "MPRTE_P", "MPRTE_scaleP",
      "IV", "OLS"
    ),
    estimate = c(
      sum(trimmed$mte * w$ate_w),
      sum(trimmed$mte * w$tt_w),
      sum(trimmed$mte * w$tut_w),
      weighted_mte_mean(m_full, g_full, compute_mprte_weights(g_full, "z_shift")),
      weighted_mte_mean(m_full, g_full, compute_mprte_weights(g_full, "p_shift")),
      weighted_mte_mean(m_full, g_full, compute_mprte_weights(g_full, "p_scale")),
      iv_ols$iv, iv_ols$ols
    )
  )
}

chv2011_prep_boot_df <- function(df, spec = "baseline") {
  df <- chv2011_attach_school(df)
  if (identical(spec, "nodrop")) {
    df <- filter_chv2011_spec(df, spec)
  }
  df <- add_chv2011_wage_spec_vars(df, spec)
  chv2011_attach_propensity(df, spec = spec, trim = TRUE)
}

estimate_normal_mte_returns <- function(df) {
  curve <- load_chv2011_normal_mte_curve()
  u <- curve$u
  mte <- curve$mte
  iv_ols <- estimate_ols_iv_returns(df)
  tibble::tibble(
    parameter = c("ATE", "TT", "TUT", "MPRTE_Z", "MPRTE_P", "MPRTE_scaleP", "IV", "OLS"),
    estimate = c(
      weighted_mte_mean(mte, u, rep(1, length(u))),
      weighted_mte_mean(mte, u, u),
      weighted_mte_mean(mte, u, 1 - u),
      weighted_mte_mean(mte, u, compute_mprte_weights(u, "z_shift")),
      weighted_mte_mean(mte, u, compute_mprte_weights(u, "p_shift")),
      weighted_mte_mean(mte, u, compute_mprte_weights(u, "p_scale")),
      iv_ols$iv, iv_ols$ols
    )
  )
}

chv2011_mte_bootstrap_indices <- function(
    n,
    B = as.integer(Sys.getenv("CHV2011_BOOT", "50")),
    use_bootdata = NULL,
    boot_dir = chv2011_boot_dir()) {
  if (is.null(use_bootdata)) {
    use_bootdata <- !identical(Sys.getenv("CHV2011_USE_BOOTDATA", "TRUE"), "FALSE")
  }
  status <- if (use_bootdata) bootdata_status(boot_dir) else list(ok = FALSE)
  if (isTRUE(status$ok) && status$B >= B) {
    idx_mat <- readRDS(file.path(boot_dir, "bootstrap_indices.rds"))
    return(list(mode = "bootdata", idx_mat = idx_mat[, seq_len(B - 1L), drop = FALSE], B = B))
  }
  set.seed(as.integer(Sys.getenv("CHV2011_BOOT_SEED", "703296661")))
  list(
    mode = "resample",
    idx_mat = matrix(sample.int(n, n * B, replace = TRUE), nrow = n, ncol = B),
    B = B
  )
}

bootstrap_analysis_draw <- function(df_full, idx, spec = "baseline") {
  dfb <- df_full[idx, , drop = FALSE]
  chv2011_prep_boot_df(dfb, spec = spec)
}

bootstrap_semiparam_returns <- function(
    df,
    B = as.integer(Sys.getenv("CHV2011_BOOT", "50")),
    spec = "baseline",
    seed = as.integer(Sys.getenv("CHV2011_BOOT_SEED", "703296661"))) {
  params <- compute_semiparam_returns(df, spec = spec)$parameter
  mat <- matrix(NA_real_, nrow = length(params), ncol = B)
  rownames(mat) <- params
  boot_dir <- chv2011_boot_dir_for_spec(spec)
  use_bootdata <- !identical(Sys.getenv("CHV2011_USE_BOOTDATA", "TRUE"), "FALSE") &&
    isTRUE(bootdata_status(boot_dir)$ok)
  boot <- chv2011_mte_bootstrap_indices(
    nrow(df),
    B = B,
    use_bootdata = use_bootdata,
    boot_dir = boot_dir
  )
  raw_full <- load_chv2011_raw() |>
    add_chv2011_derived() |>
    filter_chv2011_spec(spec) |>
    add_chv2011_wage_spec_vars(spec)

  for (b in seq_len(B)) {
    tryCatch({
      dfb <- if (boot$mode == "bootdata") {
        chv2011_prep_boot_df(
          load_chv2011_boot_sample(b, boot_dir = boot_dir, for_analysis = TRUE),
          spec = spec
        )
      } else {
        bootstrap_analysis_draw(raw_full, boot$idx_mat[, b], spec = spec)
      }
      est <- compute_semiparam_returns(dfb, spec = spec)
      mat[, b] <- est$estimate[match(params, est$parameter)]
    }, error = function(e) NULL)
  }

  tibble::tibble(
    parameter = params,
    estimate = compute_semiparam_returns(df, spec = spec)$estimate,
    se = apply(mat, 1, stats::sd, na.rm = TRUE),
    boot_mode = boot$mode
  )
}

run_chv2011_spec <- function(
    spec = "baseline",
    B_boot = as.integer(Sys.getenv("CHV2011_BOOT", "50"))) {
  ana <- load_chv2011_analysis(spec = spec, trim = TRUE)
  df <- ana$data
  list(
    data = df,
    spec = spec,
    mode = ana$mode,
    semiparam = bootstrap_semiparam_returns(df, B = B_boot, spec = spec),
    normal = estimate_normal_mte_returns(df),
    mte = estimate_mte_polynomial(df, spec = spec),
    iv_ols = estimate_ols_iv_returns(df)
  )
}