# First-stage and overidentification diagnostics for Bound (1995) tables. source(here("04-topics/rep-bound1995/Rcode/bound1995-chunked-iv.R")) bound95_effective_excluded <- function( data, excluded, included, endog = "EDUC") { excluded <- excluded[excluded %in% names(data)] included <- included[included %in% names(data)] # Skip rank scan for large AK-style interaction sets (Table 2); too slow at N ≥330k. if (length(excluded) > 50L) { return(excluded) } mm <- stats::model.matrix( stats::reformulate(c(included, excluded), endog), data = data ) qr_mm <- qr(mm) keep <- colnames(mm)[qr_mm$pivot[seq_len(qr_mm$rank)]] intersect(excluded, keep) } bound95_first_stage_stats <- function( data, endog = "EDUC", excluded, included) { if (length(excluded) > 50L) { return(bound95_first_stage_chunked(data, endog, excluded, included)) } excluded <- bound95_effective_excluded(data, excluded, included, endog) included <- included[included %in% names(data)] f_reduced <- stats::reformulate(included, endog) f_full <- stats::reformulate(c(included, excluded), endog) m_red <- lm(f_reduced, data = data) m_full <- lm(f_full, data = data) an <- anova(m_red, m_full) f_stat <- unname(an$F[2]) partial_r2 <- 1 - sum(residuals(m_full)^2) / sum(residuals(m_red)^2) list( f_excluded = f_stat, partial_r2 = partial_r2, n_excluded = length(excluded), excluded = excluded ) } bound95_basmann_f <- function(model, excluded, included, data) { if (inherits(model, "bound95_iv")) { return(bound95_basmann_f_chunked(model, data)) } excluded <- bound95_effective_excluded(data, excluded, included) q <- length(excluded) - 1L if (q < 1L) { return(NA_real_) } sm <- summary(model, diagnostics = TRUE) if (!is.null(sm$diagnostics) && "Sargan" %in% rownames(sm$diagnostics)) { sargan <- sm$diagnostics["Sargan", "statistic"] return(unname(sargan / q)) } u_hat <- residuals(model) exog <- c(included, excluded) exog <- exog[exog %in% names(data)] n <- nobs(model) aux <- lm(u_hat ~ ., data = data[exog]) r2 <- summary(aux)$r.squared sargan <- n * r2 unname(sargan / q) } bound95_educ_coef <- function(model) { if (inherits(model, "bound95_iv")) { return(model$bound95_educ) } broom::tidy(model, conf.int = FALSE) |> dplyr::filter(.data$term == "EDUC") |> dplyr::slice(1) } bound95_run_ols <- function(data, controls, excluded = character()) { rhs <- c("EDUC", controls) stats::lm(stats::reformulate(rhs, "LWKLYWGE"), data = data) } bound95_run_iv <- function(data, controls, excluded, included = controls) { excluded <- bound95_effective_excluded(data, excluded, included) if (length(excluded) > 50L) { return(bound95_run_iv_chunked(data, controls, excluded, included)) } endog_rhs <- paste(c("EDUC", controls), collapse = " + ") instr <- paste(c(included, excluded), collapse = " + ") f <- stats::as.formula(paste("LWKLYWGE ~", endog_rhs, "|", instr)) ivreg::ivreg(f, data = data) }