# Table 4(a) linearity test — port of mainresults/table4a/bootstrap_MT.R # Probit first step; HC robust Wald + pivotal bootstrap; Romano–Wolf (10%) critical value. chv2011_table4a_instruments <- function() { c( "cafqt", "mhgc", "numsibs", "urban14", "lavlocwage17", "avurate", "cafqt2", "mhgc2", "numsibs2", "lavlocwage172", "avurate2", chv2011_cohort, "pub4", "pub4_cafqt", "pub4_mhgc", "pub4_numsibs", "lwage5_17", "lwage5_17_cafqt", "lwage5_17_mhgc", "lwage5_17_numsibs", "lurate_17", "lurate_17_cafqt", "lurate_17_mhgc", "lurate_17_numsibs", "tuition", "tuition_cafqt", "tuition_mhgc", "tuition_numsibs" ) } chv2011_table4a_data_path <- function( aer_dir = chv2011_aer_dir) { file.path(aer_dir, "mainresults/table4a/data_with_tuition.RData") } #' Export analysis dataset for author bootstrap_MT.R (object name: data). export_chv2011_table4a_data <- function( path = chv2011_table4a_data_path(), aer_dir = chv2011_aer_dir) { data <- load_chv2011_raw(aer_dir) |> add_chv2011_derived() |> arrange(caseid) dir.create(dirname(path), recursive = TRUE, showWarnings = FALSE) save(data, file = path) invisible(path) } chv2011_table4a_poly_formula <- function(degree, regressors = chv2011_wage_x) { poly_hi <- if (degree >= 2L) { paste(paste0("I(prop^", 2:degree, ")"), collapse = " + ") } else { NULL } # Match bootstrap_MT.R term order: X, prop, P^2..P^d, then X:prop (coef layout for MT). rhs <- paste( c(regressors, "prop", poly_hi, paste0(regressors, ":prop")), collapse = " + " ) stats::as.formula(paste("wage ~", rhs)) } chv2011_table4a_poly_idx <- function(degree, n_x = length(chv2011_wage_x)) { k <- degree - 1L (n_x + 3L):(n_x + 2L + k) } chv2011_table4a_fit_degree <- function(data, prop, degree) { db <- data db$prop <- prop fit <- lm(chv2011_table4a_poly_formula(degree), data = db) idx <- chv2011_table4a_poly_idx(degree) OLS <- coef(fit)[idx] cov_hc <- tryCatch( car::hccm(fit)[idx, idx, drop = FALSE], error = function(e) vcov(fit)[idx, idx, drop = FALSE] ) list( fit = fit, idx = idx, OLS = OLS, cov_hc = cov_hc, stat_hc = as.numeric(t(OLS) %*% solve(cov_hc) %*% OLS), p_asymp_hc = stats::pchisq( as.numeric(t(OLS) %*% solve(cov_hc) %*% OLS), df = degree - 1L, lower.tail = FALSE ) ) } chv2011_table4a_mt_cov <- function(probit, poly_fit, prop, degree) { n_reg <- length(chv2011_wage_x) V1 <- vcov(probit) V2 <- vcov(poly_fit) Z <- model.matrix(probit) X <- model.matrix(poly_fit) b <- coef(poly_fit) x_main <- X[, seq_len(n_reg + 1L), drop = FALSE] b_prop <- b[n_reg + 2L] int_start <- n_reg + 2L + degree b_inter <- b[int_start:(int_start + n_reg - 1L)] deriv <- as.numeric(x_main %*% c(b_prop, b_inter)) if (degree >= 2L) { for (j in seq_len(degree - 1L)) { pwr <- j + 1L deriv <- deriv + pwr * prop^(pwr - 1L) * b[n_reg + 2L + j] } } wc <- as.numeric(poly_fit$residuals^2 * stats::dnorm(stats::qnorm(prop)) * deriv) C <- crossprod(X, wc * Z) R <- crossprod(sandwich::estfun(poly_fit), sandwich::estfun(probit)) V2 + V2 %*% (C %*% V1 %*% t(C) - R %*% V1 %*% t(C) - C %*% V1 %*% t(R)) %*% V2 } chv2011_table4a_degree_test <- function(data, prop, probit, degree) { fit_info <- chv2011_table4a_fit_degree(data, prop, degree) M <- chv2011_table4a_mt_cov(probit, fit_info$fit, prop, degree) cov_mt <- M[fit_info$idx, fit_info$idx, drop = FALSE] stat_mt <- as.numeric(t(fit_info$OLS) %*% solve(cov_mt) %*% fit_info$OLS) fit_info$p_asymp_mt <- stats::pchisq(stat_mt, df = degree - 1L, lower.tail = FALSE) fit_info$stat_mt <- stat_mt fit_info$cov_mt <- cov_mt fit_info$probit <- probit fit_info } chv2011_table4a_bootstrap_draw <- function(data, inst, degree, OLS, idx, stat_ref) { N <- nrow(data) idx_s <- sample.int(N, N, replace = TRUE) db <- data[idx_s, , drop = FALSE] pb <- glm( state ~ ., data = db |> dplyr::select(state, dplyr::all_of(inst)), family = binomial(link = "probit") ) pr <- fitted(pb) fit_b <- chv2011_table4a_fit_degree(db, pr, degree) cov_b <- tryCatch( car::hccm(fit_b$fit)[idx, idx, drop = FALSE], error = function(e) vcov(fit_b$fit)[idx, idx, drop = FALSE] ) if (any(!is.finite(cov_b)) || rcond(cov_b) < 1e-12) { return(list(stat_star = NA_real_)) } stat_star <- as.numeric( t(fit_b$OLS - OLS) %*% solve(cov_b) %*% (fit_b$OLS - OLS) ) list(stat_star = stat_star) } #' Run Table 4(a) linearity tests (degrees 2–5 by default). chv2011_table4a_run <- function( data, B = as.integer(Sys.getenv("CHV2011_BOOT_4A", "1000")), degrees = 2:5, seed = 703296661L, rw_quantile = 0.10) { inst <- chv2011_table4a_instruments() data <- data |> arrange(caseid) probit <- glm( state ~ ., data = data |> dplyr::select(state, dplyr::all_of(inst)), family = binomial(link = "probit") ) prop <- fitted(probit) full <- lapply(degrees, function(d) { chv2011_table4a_degree_test(data, prop, probit, d) }) names(full) <- paste0("deg", degrees) p_asymp_hc <- vapply(full, function(x) x$p_asymp_hc, numeric(1)) p_asymp_mt <- vapply(full, function(x) x$p_asymp_mt, numeric(1)) stat_hc <- vapply(full, function(x) x$stat_hc, numeric(1)) set.seed(seed) stat_star <- matrix(NA_real_, nrow = length(degrees), ncol = B) p_star <- matrix(NA_real_, nrow = length(degrees), ncol = B) p_boot <- numeric(length(degrees)) for (b in seq_len(B)) { for (j in seq_along(degrees)) { d <- degrees[j] draw <- chv2011_table4a_bootstrap_draw( data, inst, d, full[[j]]$OLS, full[[j]]$idx, stat_hc[j] ) stat_star[j, b] <- draw$stat_star } } for (j in seq_along(degrees)) { p_boot[j] <- mean(stat_star[j, ] > stat_hc[j], na.rm = TRUE) } for (b in seq_len(B)) { for (j in seq_along(degrees)) { p_star[j, b] <- mean(stat_star[j, ] > stat_star[j, b], na.rm = TRUE) } } critical_rw <- stats::quantile( apply(p_star, 2, min, na.rm = TRUE), rw_quantile, na.rm = TRUE ) reject_rw <- min(p_boot, na.rm = TRUE) < critical_rw list( degrees = degrees, p_boot = p_boot, p_asymp_hc = p_asymp_hc, p_asymp_mt = p_asymp_mt, critical_rw = as.numeric(critical_rw), reject_rw = reject_rw, B = B, n = nrow(data) ) } chv2011_paper_table4a <- function() { tibble::tribble( ~polynomial_order, ~paper_p, 2L, 0.035, 3L, 0.049, 4L, 0.086, 5L, 0.122 ) } chv2011_paper_table_a5 <- function() { tibble::tribble( ~polynomial_order, ~paper_p, 2L, 0.004, 3L, 0.006, 4L, 0.022, 5L, 0.026 ) } chv2011_table4a_wald <- function(ols, cov) { if (any(!is.finite(cov)) || rcond(cov) < 1e-12) { return(NA_real_) } as.numeric(t(ols) %*% solve(cov) %*% ols) } chv2011_table4a_est_degree <- function( data, degree, inst = chv2011_table4a_instruments()) { probit <- glm( state ~ ., data = data |> dplyr::select(state, dplyr::all_of(inst)), family = binomial(link = "probit") ) prop <- fitted(probit) fit_info <- chv2011_table4a_fit_degree(data, prop, degree) list(probit = probit, prop = prop, fit_info = fit_info, inst = inst) } chv2011_table4a_inner_vcov_list <- function( data, degrees, inst, B_inner = 100L) { nd <- length(degrees) idx_list <- lapply(degrees, chv2011_table4a_poly_idx) coef_arr <- lapply(idx_list, function(idx) { matrix(NA_real_, nrow = B_inner, ncol = length(idx)) }) N <- nrow(data) for (b in seq_len(B_inner)) { db <- data[sample.int(N, N, replace = TRUE), , drop = FALSE] pb <- glm( state ~ ., data = db |> dplyr::select(state, dplyr::all_of(inst)), family = binomial(link = "probit") ) pr <- fitted(pb) for (j in seq_len(nd)) { fi <- chv2011_table4a_fit_degree(db, pr, degrees[j]) coef_arr[[j]][b, ] <- fi$OLS } } lapply(coef_arr, stats::cov) } chv2011_table4a_inner_vcov <- function( data, degree, inst, B_inner = 100L) { chv2011_table4a_inner_vcov_list(data, degree, inst, B_inner)[[1L]] } #' Table A-5: double bootstrap for estimated P (description1.tex / double_bootstrap.R). chv2011_table4a_run_a5 <- function( data, B_outer = as.integer(Sys.getenv("CHV2011_BOOT_A5_OUT", "500")), B_inner = as.integer(Sys.getenv("CHV2011_BOOT_A5_IN", "100")), degrees = 2:5, seed = 703296661L, rw_quantile = 0.10) { inst <- chv2011_table4a_instruments() data <- data |> dplyr::arrange(caseid) nd <- length(degrees) full <- lapply(seq_along(degrees), function(j) { d <- degrees[j] est <- chv2011_table4a_est_degree(data, d, inst) list( OLS = est$fit_info$OLS, idx = est$fit_info$idx ) }) names(full) <- paste0("deg", degrees) inner_full <- chv2011_table4a_inner_vcov_list(data, degrees, inst, B_inner) T_full <- vapply(seq_along(degrees), function(j) { chv2011_table4a_wald(full[[j]]$OLS, inner_full[[j]]) }, numeric(1)) set.seed(seed) T_star <- matrix(NA_real_, nrow = nd, ncol = B_outer) N <- nrow(data) for (b in seq_len(B_outer)) { db <- data[sample.int(N, N, replace = TRUE), , drop = FALSE] inner_b <- chv2011_table4a_inner_vcov_list(db, degrees, inst, B_inner) for (j in seq_along(degrees)) { est_b <- chv2011_table4a_est_degree(db, degrees[j], inst) T_star[j, b] <- chv2011_table4a_wald( est_b$fit_info$OLS - full[[j]]$OLS, inner_b[[j]] ) } if (b %% max(1L, B_outer %/% 10L) == 0L) { message("Table A-5 outer bootstrap: ", b, " / ", B_outer) } } p_boot <- vapply(seq_along(degrees), function(j) { mean(T_star[j, ] > T_full[j], na.rm = TRUE) }, numeric(1)) p_star <- matrix(NA_real_, nrow = nd, ncol = B_outer) for (b in seq_len(B_outer)) { for (j in seq_along(degrees)) { p_star[j, b] <- mean(T_star[j, ] > T_star[j, b], na.rm = TRUE) } } critical_rw <- stats::quantile( apply(p_star, 2, min, na.rm = TRUE), rw_quantile, na.rm = TRUE ) reject_rw <- min(p_boot, na.rm = TRUE) < critical_rw list( degrees = degrees, p_boot = p_boot, critical_rw = as.numeric(critical_rw), reject_rw = reject_rw, B_outer = B_outer, B_inner = B_inner, n = N ) }