# Semiparametric MTE and treatment-parameter helpers for CHV2011. # Polynomial outcome equation follows chv2011-table4a-core.R (author bootstrap_MT.R layout). source(here::here("04-topics/rep-chv2011/Rcode/chv2011-data-prep.R")) source(here::here("04-topics/rep-chv2011/Rcode/chv2011-table4a-core.R")) chv2011_mte_grid <- seq(0, 1, by = 0.04) chv2011_normal_mte_path <- function(aer_dir = chv2011_aer_dir) { file.path(aer_dir, "normalmodel/mtexvb.out") } #' Author normal-model MTE curve (shipped mtexvb.out) or Heckman–Vytlacil proxy. load_chv2011_normal_mte_curve <- function(aer_dir = chv2011_aer_dir) { path <- chv2011_normal_mte_path(aer_dir) if (file.exists(path)) { mat <- as.matrix(utils::read.table(path)) return(tibble::tibble(u = mat[, 1L], mte = mat[, 3L])) } ana <- load_chv2011_analysis(trim = TRUE) df <- ana$data delta <- stats::coef(stats::lm( stats::as.formula(paste("wage ~ state +", paste(chv2011_x, collapse = " + "))), data = df ))[["state"]] sigma_v <- 0.5 u <- seq(0.001, 0.999, length.out = 101) tibble::tibble(u = u, mte = (as.numeric(delta) + sigma_v * stats::qnorm(1 - u)) / 4) } chv2011_attach_propensity <- function(df, spec = "baseline", trim = TRUE) { prop <- estimate_propensity(df, spec) df$phat <- prop$phat df$xbeta <- prop$xbeta df$zgamma <- prop$zgamma if (trim) { df <- trim_common_support(df) } df } mte_derivative_at_u <- function(fit, u, degree = 4L, regressors = chv2011_wage_x) { b <- stats::coef(fit) n_x <- length(regressors) x_main <- stats::model.matrix(fit)[, seq_len(n_x + 1L), drop = FALSE] x_bar <- colMeans(x_main) b_prop <- b[n_x + 2L] int_start <- n_x + 2L + degree b_inter <- b[int_start:(int_start + n_x - 1L)] deriv <- sum(x_bar * c(b_prop, b_inter)) if (degree >= 2L) { for (j in seq_len(degree - 1L)) { pwr <- j + 1L deriv <- deriv + pwr * u^(pwr - 1L) * b[n_x + 2L + j] } } as.numeric(deriv) } estimate_mte_polynomial <- function(df, degree = 4L, spec = "baseline") { regressors <- chv2011_wage_regressors(spec) d <- df d$prop <- d$phat fml <- chv2011_table4a_poly_formula(degree, regressors = regressors) fit <- stats::lm(fml, data = d) mte_vals <- vapply( chv2011_mte_grid, function(u) mte_derivative_at_u(fit, u, degree = degree, regressors = regressors), numeric(1) ) list(mte = mte_vals, grid = chv2011_mte_grid, model = fit, degree = degree, spec = spec) } chv2011_trim_mte_grid <- function(mte_obj) { g <- mte_obj$grid m <- mte_obj$mte / 4 keep <- g >= chv2011_p_bounds[1] & g <= chv2011_p_bounds[2] list(grid = g[keep], mte = m[keep]) } compute_tilde_return_weights <- function(grid, phat, state) { phat1 <- phat[state == 1L] phat0 <- phat[state == 0L] tt_w <- vapply(grid, function(u) mean(phat1 > u, na.rm = TRUE), numeric(1)) tut_w <- vapply(grid, function(u) mean(phat0 <= u, na.rm = TRUE), numeric(1)) ate_w <- rep(1, length(grid)) list( ate_w = ate_w / sum(ate_w), tt_w = tt_w / sum(tt_w), tut_w = tut_w / sum(tut_w) ) } estimate_ols_iv_returns <- function(df) { d <- wage_interaction_terms(df) ols_f <- stats::as.formula(paste( "wage ~ state +", paste(chv2011_x, collapse = " + ") )) iv_f <- stats::as.formula(paste( "wage ~ state +", paste(chv2011_x, collapse = " + "), "|", paste(chv2011_x, collapse = " + "), "+ pub4 + lwage5_17 + lurate_17 + tuition" )) ols_fit <- stats::lm(ols_f, data = d) iv_fit <- AER::ivreg(iv_f, data = d) list( ols = stats::coef(ols_fit)[["state"]] / 4, iv = stats::coef(iv_fit)[["state"]] / 4, ols_fit = ols_fit, iv_fit = iv_fit ) } compute_late_pairs <- function(mte_obj) { g <- mte_obj$grid m <- mte_obj$mte / 4 pairs <- list( c(0, 0.04), c(0.08, 0.12), c(0.16, 0.20), c(0.24, 0.28), c(0.32, 0.36), c(0.40, 0.44), c(0.48, 0.52), c(0.56, 0.60), c(0.64, 0.68), c(0.72, 0.76), c(0.80, 0.84), c(0.88, 0.92) ) idx <- function(u) which.min(abs(g - u)) purrr::map_dfr(pairs, function(pr) { i1 <- idx(pr[1L]) i2 <- idx(pr[2L]) late1 <- mean(m[i1:min(i1 + 1L, length(m))]) late2 <- mean(m[i2:min(i2 + 1L, length(m))]) tibble::tibble( interval1 = paste0(pr[1L], "-", pr[1L] + 0.04), interval2 = paste0(pr[2L], "-", pr[2L] + 0.04), diff = late2 - late1 ) }) } compute_mprte_weights <- function(grid, policy = c("z_shift", "p_shift", "p_scale")) { policy <- match.arg(policy) u <- grid switch( policy, z_shift = stats::dnorm(stats::qnorm(pmin(pmax(u, 1e-4), 1 - 1e-4))) / sum(stats::dnorm(stats::qnorm(pmin(pmax(u, 1e-4), 1 - 1e-4)))), p_shift = rep(1 / length(u), length(u)), p_scale = u / sum(u) ) } weighted_mte_mean <- function(mte, grid, weights) { w <- weights / sum(weights) sum(mte * w) } compute_semiparam_returns <- function(df, spec = "baseline") { mte_obj <- estimate_mte_polynomial(df, spec = spec) m_full <- mte_obj$mte / 4 g_full <- mte_obj$grid trimmed <- chv2011_trim_mte_grid(mte_obj) w <- compute_tilde_return_weights(trimmed$grid, df$phat, df$state) iv_ols <- estimate_ols_iv_returns(df) tibble::tibble( parameter = c( "ATE_tilde", "TT_tilde", "TUT_tilde", "MPRTE_Z", "MPRTE_P", "MPRTE_scaleP", "IV", "OLS" ), estimate = c( sum(trimmed$mte * w$ate_w), sum(trimmed$mte * w$tt_w), sum(trimmed$mte * w$tut_w), weighted_mte_mean(m_full, g_full, compute_mprte_weights(g_full, "z_shift")), weighted_mte_mean(m_full, g_full, compute_mprte_weights(g_full, "p_shift")), weighted_mte_mean(m_full, g_full, compute_mprte_weights(g_full, "p_scale")), iv_ols$iv, iv_ols$ols ) ) } chv2011_prep_boot_df <- function(df, spec = "baseline") { df <- chv2011_attach_school(df) if (identical(spec, "nodrop")) { df <- filter_chv2011_spec(df, spec) } df <- add_chv2011_wage_spec_vars(df, spec) chv2011_attach_propensity(df, spec = spec, trim = TRUE) } estimate_normal_mte_returns <- function(df) { curve <- load_chv2011_normal_mte_curve() u <- curve$u mte <- curve$mte iv_ols <- estimate_ols_iv_returns(df) tibble::tibble( parameter = c("ATE", "TT", "TUT", "MPRTE_Z", "MPRTE_P", "MPRTE_scaleP", "IV", "OLS"), estimate = c( weighted_mte_mean(mte, u, rep(1, length(u))), weighted_mte_mean(mte, u, u), weighted_mte_mean(mte, u, 1 - u), weighted_mte_mean(mte, u, compute_mprte_weights(u, "z_shift")), weighted_mte_mean(mte, u, compute_mprte_weights(u, "p_shift")), weighted_mte_mean(mte, u, compute_mprte_weights(u, "p_scale")), iv_ols$iv, iv_ols$ols ) ) } chv2011_mte_bootstrap_indices <- function( n, B = as.integer(Sys.getenv("CHV2011_BOOT", "50")), use_bootdata = NULL, boot_dir = chv2011_boot_dir()) { if (is.null(use_bootdata)) { use_bootdata <- !identical(Sys.getenv("CHV2011_USE_BOOTDATA", "TRUE"), "FALSE") } status <- if (use_bootdata) bootdata_status(boot_dir) else list(ok = FALSE) if (isTRUE(status$ok) && status$B >= B) { idx_mat <- readRDS(file.path(boot_dir, "bootstrap_indices.rds")) return(list(mode = "bootdata", idx_mat = idx_mat[, seq_len(B - 1L), drop = FALSE], B = B)) } set.seed(as.integer(Sys.getenv("CHV2011_BOOT_SEED", "703296661"))) list( mode = "resample", idx_mat = matrix(sample.int(n, n * B, replace = TRUE), nrow = n, ncol = B), B = B ) } bootstrap_analysis_draw <- function(df_full, idx, spec = "baseline") { dfb <- df_full[idx, , drop = FALSE] chv2011_prep_boot_df(dfb, spec = spec) } bootstrap_semiparam_returns <- function( df, B = as.integer(Sys.getenv("CHV2011_BOOT", "50")), spec = "baseline", seed = as.integer(Sys.getenv("CHV2011_BOOT_SEED", "703296661"))) { params <- compute_semiparam_returns(df, spec = spec)$parameter mat <- matrix(NA_real_, nrow = length(params), ncol = B) rownames(mat) <- params boot_dir <- chv2011_boot_dir_for_spec(spec) use_bootdata <- !identical(Sys.getenv("CHV2011_USE_BOOTDATA", "TRUE"), "FALSE") && isTRUE(bootdata_status(boot_dir)$ok) boot <- chv2011_mte_bootstrap_indices( nrow(df), B = B, use_bootdata = use_bootdata, boot_dir = boot_dir ) raw_full <- load_chv2011_raw() |> add_chv2011_derived() |> filter_chv2011_spec(spec) |> add_chv2011_wage_spec_vars(spec) for (b in seq_len(B)) { tryCatch({ dfb <- if (boot$mode == "bootdata") { chv2011_prep_boot_df( load_chv2011_boot_sample(b, boot_dir = boot_dir, for_analysis = TRUE), spec = spec ) } else { bootstrap_analysis_draw(raw_full, boot$idx_mat[, b], spec = spec) } est <- compute_semiparam_returns(dfb, spec = spec) mat[, b] <- est$estimate[match(params, est$parameter)] }, error = function(e) NULL) } tibble::tibble( parameter = params, estimate = compute_semiparam_returns(df, spec = spec)$estimate, se = apply(mat, 1, stats::sd, na.rm = TRUE), boot_mode = boot$mode ) } run_chv2011_spec <- function( spec = "baseline", B_boot = as.integer(Sys.getenv("CHV2011_BOOT", "50"))) { ana <- load_chv2011_analysis(spec = spec, trim = TRUE) df <- ana$data list( data = df, spec = spec, mode = ana$mode, semiparam = bootstrap_semiparam_returns(df, B = B_boot, spec = spec), normal = estimate_normal_mte_returns(df), mte = estimate_mte_polynomial(df, spec = spec), iv_ols = estimate_ols_iv_returns(df) ) }