# Memory-efficient IV for large instrument sets (Table 2: ~180 IVs, N ≈ 330k).
bound95_chunk_size <- 40000L

bound95_chunk_moments <- function(data, varlist, yvar, chunk = bound95_chunk_size) {
  n <- nrow(data)
  XtX <- NULL
  Xty <- NULL
  yty <- 0
  cols <- NULL

  for (start in seq(1L, n, by = chunk)) {
    idx <- start:min(start + chunk - 1L, n)
    sub <- data[idx, , drop = FALSE]
    mm <- stats::model.matrix(stats::reformulate(varlist, yvar), data = sub)
    y <- sub[[yvar]]
    if (is.null(XtX)) {
      cols <- colnames(mm)
      p <- ncol(mm)
      XtX <- matrix(0, p, p)
      Xty <- matrix(0, p, 1)
    }
    XtX <- XtX + crossprod(mm)
    Xty <- Xty + crossprod(mm, y)
    yty <- yty + sum(y * y)
  }

  list(XtX = XtX, Xty = Xty, yty = yty, n = n, cols = cols)
}

bound95_ols_from_moments <- function(mom) {
  beta <- bound95_qr_solve(mom$XtX, mom$Xty)
  rss <- mom$yty - as.numeric(crossprod(beta, mom$Xty))
  list(beta = beta, rss = rss, mom = mom, k = ncol(mom$XtX))
}

bound95_chunk_wx <- function(data, w_vars, x_vars, chunk = bound95_chunk_size) {
  n <- nrow(data)
  WtX <- NULL

  for (start in seq(1L, n, by = chunk)) {
    idx <- start:min(start + chunk - 1L, n)
    sub <- data[idx, , drop = FALSE]
    W <- stats::model.matrix(stats::reformulate(w_vars), data = sub)
    X <- stats::model.matrix(stats::reformulate(x_vars), data = sub)
    if (is.null(WtX)) {
      WtX <- matrix(0, ncol(W), ncol(X))
    }
    WtX <- WtX + crossprod(W, X)
  }

  WtX
}

bound95_qr_solve <- function(A, B) {
  qr.solve(A, B, tol = 1e-10)
}

bound95_rank_keep <- function(XtX, cols, tol = 1e-10) {
  piv <- qr(XtX, tol = tol)$pivot
  rank <- qr(XtX, tol = tol)$rank
  keep <- piv[seq_len(rank)]
  list(
    keep = keep,
    cols = cols[keep],
    XtX = XtX[keep, keep, drop = FALSE]
  )
}

bound95_subset_mom <- function(mom, keep) {
  mom$XtX <- mom$XtX[keep, keep, drop = FALSE]
  mom$Xty <- mom$Xty[keep, , drop = FALSE]
  mom$cols <- mom$cols[keep]
  mom
}

bound95_run_iv_chunked <- function(
    data,
    controls,
    excluded,
    included = controls,
    y = "LWKLYWGE",
    endog = "EDUC") {
  excluded <- bound95_effective_excluded(data, excluded, included, endog)
  included <- included[included %in% names(data)]
  w_vars <- c(included, excluded)
  x_vars <- c(endog, controls)

  mom_w <- bound95_chunk_moments(data, w_vars, y)
  mom_x <- bound95_chunk_moments(data, x_vars, y)
  w_rank <- bound95_rank_keep(mom_w$XtX, mom_w$cols)
  mom_w <- bound95_subset_mom(mom_w, w_rank$keep)

  WtX <- bound95_chunk_wx(data, w_vars, x_vars)[w_rank$keep, , drop = FALSE]

  WtW_inv_WtX <- bound95_qr_solve(mom_w$XtX, WtX)
  WtW_inv_Wty <- bound95_qr_solve(mom_w$XtX, mom_w$Xty)

  XtPwX <- crossprod(WtX, WtW_inv_WtX)
  XtPwy <- crossprod(WtX, WtW_inv_Wty)

  beta <- bound95_qr_solve(XtPwX, XtPwy)
  names(beta) <- mom_x$cols
  rss <- mom_x$yty - as.numeric(crossprod(beta, mom_x$Xty))
  sigma2 <- rss / mom_x$n
  vcov <- sigma2 * qr.solve(XtPwX, diag(ncol(XtPwX)), tol = 1e-10)

  educ_idx <- match(endog, names(beta))
  educ_est <- unname(beta[educ_idx])
  educ_se <- sqrt(vcov[educ_idx, educ_idx])
  z <- educ_est / educ_se

  educ_row <- tibble::tibble(
    term = endog,
    estimate = educ_est,
    std.error = educ_se,
    statistic = z,
    p.value = 2 * stats::pnorm(-abs(z))
  )

  structure(
    list(
      coefficients = beta,
      vcov = vcov,
      nobs = mom_x$n,
      resid_rss = rss,
      sigma2 = sigma2,
      w_vars = w_vars,
      x_vars = x_vars,
      excluded = excluded,
      included = included,
      y = y,
      bound95_educ = educ_row
    ),
    class = c("bound95_iv", "list"),
    formula = stats::as.formula(
      paste(y, "~", paste(x_vars, collapse = " + "), "|", paste(w_vars, collapse = " + "))
    )
  )
}

bound95_first_stage_chunked <- function(data, endog, excluded, included) {
  excluded <- bound95_effective_excluded(data, excluded, included, endog)
  included <- included[included %in% names(data)]
  q <- length(excluded)

  red <- bound95_ols_from_moments(bound95_chunk_moments(data, included, endog))
  full_mom <- bound95_chunk_moments(data, c(included, excluded), endog)
  fr <- bound95_rank_keep(full_mom$XtX, full_mom$cols)
  full_mom <- bound95_subset_mom(full_mom, fr$keep)
  full <- bound95_ols_from_moments(full_mom)

  f_stat <- ((red$rss - full$rss) / q) / (full$rss / (full$mom$n - full$k))
  partial_r2 <- 1 - full$rss / red$rss

  list(
    f_excluded = unname(f_stat),
    partial_r2 = partial_r2,
    n_excluded = q,
    excluded = excluded
  )
}

bound95_basmann_f_chunked <- function(model, data) {
  q <- length(model$excluded) - 1L
  if (q < 1L) {
    return(NA_real_)
  }

  chunk <- bound95_chunk_size
  n <- model$nobs
  uu <- 0
  WtW <- NULL
  Wtu <- NULL
  w_cols <- NULL

  for (start in seq(1L, n, by = chunk)) {
    idx <- start:min(start + chunk - 1L, n)
    sub <- data[idx, , drop = FALSE]
    X <- stats::model.matrix(
      stats::reformulate(model$x_vars, model$y),
      data = sub
    )
    y <- sub[[model$y]]
    b <- model$coefficients[match(colnames(X), names(model$coefficients))]
    u <- y - as.numeric(X %*% b)
    W <- stats::model.matrix(stats::reformulate(model$w_vars), data = sub)
    if (is.null(WtW)) {
      w_cols <- colnames(W)
      p <- ncol(W)
      WtW <- matrix(0, p, p)
      Wtu <- matrix(0, p, 1)
    }
    WtW <- WtW + crossprod(W)
    Wtu <- Wtu + crossprod(W, u)
    uu <- uu + sum(u * u)
  }

  fr <- bound95_rank_keep(WtW, w_cols)
  beta_u <- bound95_qr_solve(fr$XtX, Wtu[fr$keep, , drop = FALSE])
  rss_u <- uu - as.numeric(crossprod(beta_u, Wtu[fr$keep, , drop = FALSE]))
  r2 <- 1 - rss_u / uu
  sargan <- n * r2
  unname(sargan / q)
}