# Shared data preparation for CHV2011 replication (Carneiro, Heckman, Vytlacil 2011). library(here) library(tidyverse) library(haven) chv2011_aer_dir <- here("04-topics/rep-chv2011/replication_aer") # Outcome-equation controls (wage equation X). chv2011_x <- c( "exp", "expsq", "cafqt", "cafqt2", "mhgc", "mhgc2", "numsibs", "numsibs2", "urban14", "lavlocwage17", "lavlocwage172", "avurate", "avurate2", "d57", "d58", "d59", "d60", "d61", "d62", "d63", "lwage5", "lurate" ) # Choice-equation controls (excluding cohort dummies in Table 3 display). chv2011_choice_x <- c( "cafqt", "cafqt2", "mhgc", "mhgc2", "numsibs", "numsibs2", "urban14", "lavlocwage17", "lavlocwage172", "avurate", "avurate2" ) chv2011_cohort <- paste0("d", 57:63) chv2011_inst_base <- c("pub4", "lwage5_17", "lurate_17", "tuition") chv2011_inst_interact <- c("cafqt", "mhgc", "numsibs") # Table 4a / bootstrap_MT regressors (wage equation, no P interactions). chv2011_wage_x <- chv2011_x chv2011_p_bounds <- c(0.0324, 0.9775) #' Wage-equation regressors by Table 6 / robustness spec. chv2011_wage_regressors <- function(spec = "baseline") { if (identical(spec, "dropdummies")) { return(c(chv2011_wage_x, "hsdrop", "somecol")) } chv2011_wage_x } #' Willis–Rosen schooling dummies for Table 6(a) col 3 (wage equation only). add_chv2011_wage_spec_vars <- function(df, spec = "baseline") { if (!identical(spec, "dropdummies")) { return(df) } df |> mutate( hsdrop = as.integer(school == 1L), somecol = as.integer(school == 3L) ) } #' Merge `school` onto bootstrap rows (not stored in bootdata/*.asc). chv2011_attach_school <- function(df, aer_dir = chv2011_aer_dir) { if ("school" %in% names(df)) { return(df) } sch <- load_chv2011_raw(aer_dir) |> dplyr::select(caseid, school) dplyr::left_join(df, sch, by = "caseid") } #' Bootstrap directory for Table 6 specs (nodrop uses table6/bootdata_nodrop when present). chv2011_boot_dir_for_spec <- function(spec = "baseline", aer_dir = chv2011_aer_dir) { if (identical(spec, "nodrop")) { nodrop_dir <- file.path(aer_dir, "table6/bootdata_nodrop") if (isTRUE(bootdata_status(nodrop_dir)$ok)) { return(nodrop_dir) } } chv2011_boot_dir(aer_dir) } # Paper Table A-3 targets (Carneiro, Heckman and Vytlacil 2011, Appendix). chv2011_paper_a3 <- function() { tribble( ~var, ~mean0, ~sd0, ~mean1, ~sd1, ~source, "wage", 2.2089, 0.4412, 2.5496, 0.4959, "basic", "exp", 10.1042, 3.1260, 6.8404, 3.2522, "basic", "cafqt", -0.0446, 0.8673, 0.9515, 0.7498, "basic", "mhgc", 11.3083, 2.1056, 12.9121, 2.2789, "basic", "numsibs", 3.2630, 2.0842, 2.5849, 1.6450, "basic", "urban14", 0.6995, 0.4587, 0.7895, 0.4078, "basic", "lwage5", 10.2645, 0.1597, 10.3220, 0.1660, "local", "lurate", 6.7971, 1.3310, 6.8226, 1.1983, "local", "pub4", 0.4625, 0.4988, 0.5884, 0.4924, "local", "lwage5_17", 10.2780, 0.1619, 10.2736, 0.1651, "local", "lurate_17", 7.0804, 1.7846, 7.0846, 1.8449, "local", "tuition", 22.0164, 7.8730, 21.1105, 8.0683, "local", "lavlocwage17", 10.2673, 0.1798, 10.2991, 0.1945, "local", "avurate", 6.2942, 1.0156, 6.2077, 0.9536, "local" ) } # Paper Table 3 / A-4 average marginal derivatives (full sample; phat1 in getboot.do). chv2011_paper_table3_derivs <- function() { tribble( ~variable, ~paper, "cafqt", 0.2826, "mhgc", 0.0441, "numsibs", -0.0233, "urban14", 0.0340, "lavlocwage17", 0.1820, "avurate", 0.0058, "pub4", 0.0529, "lwage5_17", -0.2687, "lurate_17", 0.0149, "tuition", -0.0027 ) } chv2011_iv_terms <- function(include_numsibs = TRUE) { base <- chv2011_inst_base controls <- if (include_numsibs) { chv2011_inst_interact } else { c("cafqt", "mhgc") } c(base, as.vector(outer(base, controls, paste, sep = "_"))) } detect_chv2011_data_mode <- function( aer_dir = chv2011_aer_dir) { dataset_path <- file.path(aer_dir, "dataset.dta") newid_path <- file.path(aer_dir, "newid.dta") if (file.exists(dataset_path)) { return(list(mode = "full", message = "Using merged dataset.dta from geocode pipeline.")) } if (file.exists(newid_path)) { return(list(mode = "geocode", message = "Using newid.dta merge (geocode available).")) } list( mode = "bundled_aligned", message = paste( "Merging basicvariables.dta and localvariables.dta by row order", "(author-aligned replication bundle; N should be 1747)." ) ) } load_chv2011_raw <- function(aer_dir = chv2011_aer_dir) { mode_info <- detect_chv2011_data_mode(aer_dir) if (mode_info$mode == "full") { df <- read_dta(file.path(aer_dir, "dataset.dta")) } else if (mode_info$mode == "geocode") { basic <- read_dta(file.path(aer_dir, "basicvariables.dta")) |> arrange(caseid) newid <- read_dta(file.path(aer_dir, "newid.dta")) |> arrange(caseid) local <- read_dta(file.path(aer_dir, "localvariables.dta")) |> arrange(newid) df <- basic |> left_join(newid, by = "caseid") |> left_join(local, by = "newid") } else { basic <- read_dta(file.path(aer_dir, "basicvariables.dta")) local <- read_dta(file.path(aer_dir, "localvariables.dta")) |> select(-newid) df <- bind_cols(basic, local) } attr(df, "chv2011_data_mode") <- mode_info df } validate_chv2011_merge <- function( df = add_chv2011_derived(load_chv2011_raw()), tol_basic = 0.002, tol_local = 0.15) { mode <- attr(df, "chv2011_data_mode") mode_label <- if (is.list(mode) && !is.null(mode$mode)) mode$mode else "unknown" basic <- read_dta(file.path(chv2011_aer_dir, "basicvariables.dta")) local <- read_dta(file.path(chv2011_aer_dir, "localvariables.dta")) id_corr <- stats::cor(basic$caseid, local$newid) targets <- chv2011_paper_a3() report <- targets |> rowwise() |> mutate( got_mean0 = mean(df[[var]][df$state == 0], na.rm = TRUE), got_mean1 = mean(df[[var]][df$state == 1], na.rm = TRUE), got_sd0 = sd(df[[var]][df$state == 0], na.rm = TRUE), got_sd1 = sd(df[[var]][df$state == 1], na.rm = TRUE), dmean0 = got_mean0 - mean0, dmean1 = got_mean1 - mean1, dsd0 = got_sd0 - sd0, dsd1 = got_sd1 - sd1, tol = if_else(source == "basic", tol_basic, tol_local), ok_mean = abs(dmean0) <= tol & abs(dmean1) <= tol ) |> ungroup() list( mode = mode_label, n = nrow(df), n0 = sum(df$state == 0), n1 = sum(df$state == 1), caseid_newid_cor = id_corr, merge_note = paste( "Bundled AER files: bind_cols(basic, local[-newid]) by row order.", "Without geocode newid.dta, local instruments are only approximately aligned", sprintf("(cor(caseid, newid) = %.3f).", id_corr) ), all_basic_ok = all(report$ok_mean[report$source == "basic"]), all_local_ok = all(report$ok_mean[report$source == "local"]), max_mean_diff = max(abs(c(report$dmean0, report$dmean1))), report = report ) } validate_chv2011_table3 <- function( df = add_chv2011_derived(load_chv2011_raw()), fit = glm(choice_formula("baseline"), data = df, family = binomial(link = "logit")), tol_control = 0.04, tol_instrument = 0.03) { avder <- average_marginal_derivatives(fit, df) cmp <- chv2011_paper_table3_derivs() |> left_join(avder, by = "variable") |> mutate( diff = avg_deriv - paper, group = if_else(variable %in% chv2011_inst_base, "instrument", "control"), tol = if_else(group == "instrument", tol_instrument, tol_control), ok = abs(diff) <= tol ) list( estimates = cmp, joint_iv_p = chv2011_joint_iv_test(fit), all_controls_ok = all(cmp$ok[cmp$group == "control"]), all_instruments_ok = all(cmp$ok[cmp$group == "instrument"]), note = paste( "Point estimates use full-sample logit + normalden(xbeta), matching avder_ddd.do on phat1.", "Paper joint IV p = 0.0001 requires geocode merge (dataset.dta); bundled row merge", "attenuates instrument relevance." ) ) } chv2011_joint_iv_test <- function(fit, terms = chv2011_iv_terms()) { terms <- intersect(terms, names(coef(fit))) if (length(terms) == 0) { return(NA_real_) } restricted <- stats::update.formula(formula(fit), paste(". ~ . -", paste(terms, collapse = " - "))) fit_r <- stats::update(fit, restricted) lr <- stats::anova(fit_r, fit, test = "Chisq") lr$`Pr(>Chi)`[2] } add_chv2011_derived <- function(df) { df |> mutate( tuition = tuit4c, cafqt2 = cafqt^2, mhgc2 = mhgc^2, numsibs2 = numsibs^2, lavlocwage172 = lavlocwage17^2, avurate2 = avurate^2 ) |> chv2011_add_iv_interactions() } chv2011_add_iv_interactions <- function(df, inst = chv2011_inst_base, controls = chv2011_inst_interact) { out <- df for (z in inst) { for (x in controls) { nm <- paste0(z, "_", x) out[[nm]] <- out[[z]] * out[[x]] } } out } trim_common_support <- function(df, bounds = chv2011_p_bounds, phat_col = "phat") { df |> filter(.data[[phat_col]] >= bounds[1], .data[[phat_col]] <= bounds[2]) } choice_formula <- function(spec = "baseline") { x_part <- switch( spec, baseline = paste(c(chv2011_choice_x, chv2011_cohort), collapse = " + "), allxinz = paste(c(chv2011_x, chv2011_cohort), collapse = " + "), nointall = paste(c(chv2011_choice_x, chv2011_cohort, chv2011_inst_base), collapse = " + "), notuitnounemp = paste(c(chv2011_choice_x, chv2011_cohort, "pub4", "lwage5_17"), collapse = " + "), notuit = paste(c(chv2011_choice_x, chv2011_cohort, "pub4", "lwage5_17", "lurate_17"), collapse = " + "), paste(c(chv2011_choice_x, chv2011_cohort), collapse = " + ") ) z_part <- switch( spec, nointall = "", notuitnounemp = "", notuit = "", { iv_terms <- c( chv2011_inst_base, as.vector(outer(chv2011_inst_base, chv2011_inst_interact, paste, sep = "_")) ) paste(iv_terms, collapse = " + ") } ) rhs <- if (nzchar(z_part)) paste(x_part, z_part, sep = " + ") else x_part stats::as.formula(paste("state ~", rhs)) } filter_chv2011_spec <- function(df, spec = "baseline") { switch( spec, nodrop = df |> filter(school != 1), df ) } estimate_propensity <- function(df, spec = "baseline") { fml <- choice_formula(spec) fit <- glm(fml, data = df, family = binomial(link = "logit")) xbeta <- as.numeric(predict(fit, type = "link")) phat <- as.numeric(predict(fit, type = "response")) xgamma <- predict(fit, newdata = df, type = "link", se.fit = FALSE) # Linear index from X-only part (for policy weights). x_mat <- model.matrix( stats::update(fml, . ~ . - pub4 - lwage5_17 - lurate_17 - tuition - pub4_cafqt - pub4_mhgc - pub4_numsibs - lwage5_17_cafqt - lwage5_17_mhgc - lwage5_17_numsibs - lurate_17_cafqt - lurate_17_mhgc - lurate_17_numsibs - tuition_cafqt - tuition_mhgc - tuition_numsibs), data = df ) if (ncol(x_mat) > 1) { xgamma_val <- as.numeric(x_mat %*% coef(fit)[names(coef(fit)) %in% colnames(x_mat)]) } else { xgamma_val <- rep(0, nrow(df)) } zgamma <- xbeta - xgamma_val list(model = fit, phat = phat, xbeta = xbeta, xgamma = xgamma_val, zgamma = zgamma) } average_marginal_derivatives <- function(fit, df) { xb <- as.numeric(predict(fit, type = "link")) dens <- dnorm(xb) b <- coef(fit) nm <- names(b) getb <- function(k) if (k %in% nm) b[[k]] else 0 tibble( variable = c( "cafqt", "mhgc", "numsibs", "urban14", "lavlocwage17", "avurate", "pub4", "lwage5_17", "lurate_17", "tuition" ), avg_deriv = c( mean(dens * (getb("cafqt") + 2 * getb("cafqt2") * df$cafqt + getb("pub4_cafqt") * df$pub4 + getb("lwage5_17_cafqt") * df$lwage5_17 + getb("lurate_17_cafqt") * df$lurate_17 + getb("tuition_cafqt") * df$tuition)), mean(dens * (getb("mhgc") + 2 * getb("mhgc2") * df$mhgc + getb("pub4_mhgc") * df$pub4 + getb("lwage5_17_mhgc") * df$lwage5_17 + getb("lurate_17_mhgc") * df$lurate_17 + getb("tuition_mhgc") * df$tuition)), mean(dens * (getb("numsibs") + 2 * getb("numsibs2") * df$numsibs + getb("pub4_numsibs") * df$pub4 + getb("lwage5_17_numsibs") * df$lwage5_17 + getb("lurate_17_numsibs") * df$lurate_17 + getb("tuition_numsibs") * df$tuition)), mean(dens * getb("urban14")), mean(dens * (getb("lavlocwage17") + 2 * getb("lavlocwage172") * df$lavlocwage17)), mean(dens * (getb("avurate") + 2 * getb("avurate2") * df$avurate)), mean(dens * (getb("pub4") + getb("pub4_cafqt") * df$cafqt + getb("pub4_mhgc") * df$mhgc + getb("pub4_numsibs") * df$numsibs)), mean(dens * (getb("lwage5_17") + getb("lwage5_17_cafqt") * df$cafqt + getb("lwage5_17_mhgc") * df$mhgc + getb("lwage5_17_numsibs") * df$numsibs)), mean(dens * (getb("lurate_17") + getb("lurate_17_cafqt") * df$cafqt + getb("lurate_17_mhgc") * df$mhgc + getb("lurate_17_numsibs") * df$numsibs)), mean(dens * (getb("tuition") + getb("tuition_cafqt") * df$cafqt + getb("tuition_mhgc") * df$mhgc + getb("tuition_numsibs") * df$numsibs)) ) ) } bootstrap_avg_derivatives <- function( df, B = 250, seed = 703296661, spec = "baseline", boot_dir = chv2011_boot_dir(), use_bootdata = NULL) { if (is.null(use_bootdata)) { use_bootdata <- !identical(Sys.getenv("CHV2011_USE_BOOTDATA", "TRUE"), "FALSE") } status <- if (use_bootdata) bootdata_status(boot_dir) else list(ok = FALSE) if (isTRUE(status$ok) && status$B >= B) { boot <- matrix(NA_real_, nrow = 10, ncol = B) for (b in seq_len(B)) { dfb <- load_chv2011_boot_sample(b, boot_dir = boot_dir, for_analysis = TRUE) fitb <- glm(choice_formula(spec), data = dfb, family = binomial(link = "logit")) boot[, b] <- average_marginal_derivatives(fitb, dfb)$avg_deriv } return(apply(boot, 1, sd, na.rm = TRUE)) } set.seed(seed) n <- nrow(df) idx_mat <- matrix(sample.int(n, n * B, replace = TRUE), nrow = n, ncol = B) boot <- matrix(NA_real_, nrow = 10, ncol = B) for (b in seq_len(B)) { dfb <- df[idx_mat[, b], , drop = FALSE] fitb <- glm(choice_formula(spec), data = dfb, family = binomial(link = "logit")) boot[, b] <- average_marginal_derivatives(fitb, dfb)$avg_deriv } apply(boot, 1, sd, na.rm = TRUE) } load_chv2011_analysis <- function(spec = "baseline", trim = TRUE) { raw <- load_chv2011_raw() |> add_chv2011_derived() |> filter_chv2011_spec(spec) |> add_chv2011_wage_spec_vars(spec) prop <- estimate_propensity(raw, spec) raw$phat <- prop$phat raw$xbeta <- prop$xbeta raw$zgamma <- prop$zgamma if (trim) { raw <- trim_common_support(raw) } list( data = raw, propensity = prop, mode = attr(load_chv2011_raw(), "chv2011_data_mode"), spec = spec ) } make_bootstrap_indices <- function(n, B = 250, seed = 703296661) { set.seed(seed) matrix(sample.int(n, n * B, replace = TRUE), nrow = n, ncol = B) } # ---- Bootstrap samples (Stata getboot.do) ------------------------------------ #' Column order written to bootdata/phat*.asc (matches getboot.do outfile). chv2011_boot_columns <- function() { c( "caseid", "state", "wage", "const", "exp", "expsq", "cafqt", "mhgc", paste0("d", 57:63), "lwage5", "lurate", "pub4", "lwage5_17", "lurate_17", "urban14", "lavlocwage17", "avurate", "numsibs", "tuit4c" ) } chv2011_boot_dir <- function(aer_dir = chv2011_aer_dir) { file.path(aer_dir, "bootdata") } #' Build auxboot frame: sorted by caseid with observation index n (Stata egen n = group(caseid)). prepare_chv2011_boot_base <- function(aer_dir = chv2011_aer_dir) { cols <- chv2011_boot_columns() load_chv2011_raw(aer_dir) |> arrange(caseid) |> mutate(n = row_number()) |> select(all_of(cols), n) } write_chv2011_boot_asc <- function(df, path, cols = chv2011_boot_columns()) { dir.create(dirname(path), recursive = TRUE, showWarnings = FALSE) mat <- df |> select(all_of(cols)) write.table( mat, file = path, row.names = FALSE, col.names = FALSE, sep = " ", quote = FALSE ) invisible(path) } bootdata_status <- function(boot_dir = chv2011_boot_dir()) { meta_path <- file.path(boot_dir, "boot_meta.rds") if (!file.exists(meta_path)) { return(list( ok = FALSE, message = "bootdata/ not generated — run Rcode/getboot.R or generate_chv2011_bootdata()" )) } meta <- readRDS(meta_path) asc_files <- list.files(boot_dir, pattern = "^phat[0-9]+\\.asc$", full.names = FALSE) asc_count <- length(asc_files) list( ok = asc_count >= meta$B, B = meta$B, n = meta$n, asc_count = asc_count, data_mode = meta$data_mode, seed = meta$seed, boot_dir = boot_dir, message = if (asc_count >= meta$B) { sprintf("%d bootstrap .asc files ready (N = %d)", asc_count, meta$n) } else { sprintf("Incomplete: %d / %d .asc files", asc_count, meta$B) } ) } #' R port of replication_aer/getboot.do. #' #' Writes phat1.asc (full sample) and phat2.asc … phatB.asc (bootstrap draws), #' plus boot_base.rds, bootstrap_indices.rds, and boot_meta.rds for fast R reuse. generate_chv2011_bootdata <- function( B = as.integer(Sys.getenv("CHV2011_BOOT", "250")), seed = as.integer(Sys.getenv("CHV2011_BOOT_SEED", "703296661")), boot_dir = chv2011_boot_dir(), write_asc = TRUE, overwrite = isTRUE(as.logical(Sys.getenv("CHV2011_BOOT_OVERWRITE", "FALSE"))), aer_dir = chv2011_aer_dir) { if (B < 1L) { stop("B must be >= 1") } base <- prepare_chv2011_boot_base(aer_dir) n <- nrow(base) cols <- chv2011_boot_columns() mode_info <- detect_chv2011_data_mode(aer_dir) dir.create(boot_dir, recursive = TRUE, showWarnings = FALSE) set.seed(seed) idx_mat <- if (B > 1L) { matrix(sample.int(n, n * (B - 1L), replace = TRUE), nrow = n, ncol = B - 1L) } else { matrix(integer(0), nrow = n, ncol = 0L) } paths <- character(B) if (write_asc) { for (b in seq_len(B)) { path_b <- file.path(boot_dir, sprintf("phat%d.asc", b)) if (!file.exists(path_b) || overwrite) { sample_b <- if (b == 1L) { base } else { base[idx_mat[, b - 1L], , drop = FALSE] } write_chv2011_boot_asc(sample_b, path_b, cols) } paths[b] <- path_b } } saveRDS(base, file.path(boot_dir, "boot_base.rds")) saveRDS(idx_mat, file.path(boot_dir, "bootstrap_indices.rds")) saveRDS( list( n = n, B = B, seed = seed, columns = cols, data_mode = mode_info$mode, generated = Sys.time() ), file.path(boot_dir, "boot_meta.rds") ) invisible(list( paths = paths, n = n, B = B, idx_mat = idx_mat, base = base, status = bootdata_status(boot_dir) )) } #' Load bootstrap sample i (1 = full sample / phat1, 2…B = resamples). load_chv2011_boot_sample <- function( i, boot_dir = chv2011_boot_dir(), for_analysis = FALSE) { if (i < 1L) { stop("i must be >= 1") } meta_path <- file.path(boot_dir, "boot_meta.rds") if (!file.exists(meta_path)) { stop( "bootdata/ not found under ", boot_dir, ". Run generate_chv2011_bootdata() or Rscript Rcode/getboot.R first." ) } meta <- readRDS(meta_path) if (i > meta$B) { stop(sprintf("Only %d bootstrap samples available (requested i = %d)", meta$B, i)) } cols <- chv2011_boot_columns() base <- readRDS(file.path(boot_dir, "boot_base.rds")) df <- if (i == 1L) { base[, cols, drop = FALSE] } else { idx <- readRDS(file.path(boot_dir, "bootstrap_indices.rds"))[, i - 1L] base[idx, cols, drop = FALSE] } if (for_analysis) { df <- add_chv2011_derived(df) } df } wage_interaction_terms <- function(df, phat_col = "phat") { p <- df[[phat_col]] s <- df$state out <- df for (v in chv2011_x) { out[[paste0("p", v)]] <- p * out[[v]] out[[paste0("s", v)]] <- s * out[[v]] } out } annualize_return <- function(x) x / 4