# Memory-efficient IV for large instrument sets (Table 2: ~180 IVs, N ≈ 330k). bound95_chunk_size <- 40000L bound95_chunk_moments <- function(data, varlist, yvar, chunk = bound95_chunk_size) { n <- nrow(data) XtX <- NULL Xty <- NULL yty <- 0 cols <- NULL for (start in seq(1L, n, by = chunk)) { idx <- start:min(start + chunk - 1L, n) sub <- data[idx, , drop = FALSE] mm <- stats::model.matrix(stats::reformulate(varlist, yvar), data = sub) y <- sub[[yvar]] if (is.null(XtX)) { cols <- colnames(mm) p <- ncol(mm) XtX <- matrix(0, p, p) Xty <- matrix(0, p, 1) } XtX <- XtX + crossprod(mm) Xty <- Xty + crossprod(mm, y) yty <- yty + sum(y * y) } list(XtX = XtX, Xty = Xty, yty = yty, n = n, cols = cols) } bound95_ols_from_moments <- function(mom) { beta <- bound95_qr_solve(mom$XtX, mom$Xty) rss <- mom$yty - as.numeric(crossprod(beta, mom$Xty)) list(beta = beta, rss = rss, mom = mom, k = ncol(mom$XtX)) } bound95_chunk_wx <- function(data, w_vars, x_vars, chunk = bound95_chunk_size) { n <- nrow(data) WtX <- NULL for (start in seq(1L, n, by = chunk)) { idx <- start:min(start + chunk - 1L, n) sub <- data[idx, , drop = FALSE] W <- stats::model.matrix(stats::reformulate(w_vars), data = sub) X <- stats::model.matrix(stats::reformulate(x_vars), data = sub) if (is.null(WtX)) { WtX <- matrix(0, ncol(W), ncol(X)) } WtX <- WtX + crossprod(W, X) } WtX } bound95_qr_solve <- function(A, B) { qr.solve(A, B, tol = 1e-10) } bound95_rank_keep <- function(XtX, cols, tol = 1e-10) { piv <- qr(XtX, tol = tol)$pivot rank <- qr(XtX, tol = tol)$rank keep <- piv[seq_len(rank)] list( keep = keep, cols = cols[keep], XtX = XtX[keep, keep, drop = FALSE] ) } bound95_subset_mom <- function(mom, keep) { mom$XtX <- mom$XtX[keep, keep, drop = FALSE] mom$Xty <- mom$Xty[keep, , drop = FALSE] mom$cols <- mom$cols[keep] mom } bound95_run_iv_chunked <- function( data, controls, excluded, included = controls, y = "LWKLYWGE", endog = "EDUC") { excluded <- bound95_effective_excluded(data, excluded, included, endog) included <- included[included %in% names(data)] w_vars <- c(included, excluded) x_vars <- c(endog, controls) mom_w <- bound95_chunk_moments(data, w_vars, y) mom_x <- bound95_chunk_moments(data, x_vars, y) w_rank <- bound95_rank_keep(mom_w$XtX, mom_w$cols) mom_w <- bound95_subset_mom(mom_w, w_rank$keep) WtX <- bound95_chunk_wx(data, w_vars, x_vars)[w_rank$keep, , drop = FALSE] WtW_inv_WtX <- bound95_qr_solve(mom_w$XtX, WtX) WtW_inv_Wty <- bound95_qr_solve(mom_w$XtX, mom_w$Xty) XtPwX <- crossprod(WtX, WtW_inv_WtX) XtPwy <- crossprod(WtX, WtW_inv_Wty) beta <- bound95_qr_solve(XtPwX, XtPwy) names(beta) <- mom_x$cols rss <- mom_x$yty - as.numeric(crossprod(beta, mom_x$Xty)) sigma2 <- rss / mom_x$n vcov <- sigma2 * qr.solve(XtPwX, diag(ncol(XtPwX)), tol = 1e-10) educ_idx <- match(endog, names(beta)) educ_est <- unname(beta[educ_idx]) educ_se <- sqrt(vcov[educ_idx, educ_idx]) z <- educ_est / educ_se educ_row <- tibble::tibble( term = endog, estimate = educ_est, std.error = educ_se, statistic = z, p.value = 2 * stats::pnorm(-abs(z)) ) structure( list( coefficients = beta, vcov = vcov, nobs = mom_x$n, resid_rss = rss, sigma2 = sigma2, w_vars = w_vars, x_vars = x_vars, excluded = excluded, included = included, y = y, bound95_educ = educ_row ), class = c("bound95_iv", "list"), formula = stats::as.formula( paste(y, "~", paste(x_vars, collapse = " + "), "|", paste(w_vars, collapse = " + ")) ) ) } bound95_first_stage_chunked <- function(data, endog, excluded, included) { excluded <- bound95_effective_excluded(data, excluded, included, endog) included <- included[included %in% names(data)] q <- length(excluded) red <- bound95_ols_from_moments(bound95_chunk_moments(data, included, endog)) full_mom <- bound95_chunk_moments(data, c(included, excluded), endog) fr <- bound95_rank_keep(full_mom$XtX, full_mom$cols) full_mom <- bound95_subset_mom(full_mom, fr$keep) full <- bound95_ols_from_moments(full_mom) f_stat <- ((red$rss - full$rss) / q) / (full$rss / (full$mom$n - full$k)) partial_r2 <- 1 - full$rss / red$rss list( f_excluded = unname(f_stat), partial_r2 = partial_r2, n_excluded = q, excluded = excluded ) } bound95_basmann_f_chunked <- function(model, data) { q <- length(model$excluded) - 1L if (q < 1L) { return(NA_real_) } chunk <- bound95_chunk_size n <- model$nobs uu <- 0 WtW <- NULL Wtu <- NULL w_cols <- NULL for (start in seq(1L, n, by = chunk)) { idx <- start:min(start + chunk - 1L, n) sub <- data[idx, , drop = FALSE] X <- stats::model.matrix( stats::reformulate(model$x_vars, model$y), data = sub ) y <- sub[[model$y]] b <- model$coefficients[match(colnames(X), names(model$coefficients))] u <- y - as.numeric(X %*% b) W <- stats::model.matrix(stats::reformulate(model$w_vars), data = sub) if (is.null(WtW)) { w_cols <- colnames(W) p <- ncol(W) WtW <- matrix(0, p, p) Wtu <- matrix(0, p, 1) } WtW <- WtW + crossprod(W) Wtu <- Wtu + crossprod(W, u) uu <- uu + sum(u * u) } fr <- bound95_rank_keep(WtW, w_cols) beta_u <- bound95_qr_solve(fr$XtX, Wtu[fr$keep, , drop = FALSE]) rss_u <- uu - as.numeric(crossprod(beta_u, Wtu[fr$keep, , drop = FALSE])) r2 <- 1 - rss_u / uu sargan <- n * r2 unname(sargan / q) }