# Shared data preparation for CHV2011 replication (Carneiro, Heckman, Vytlacil 2011).
library(here)
library(tidyverse)
library(haven)

chv2011_aer_dir <- here("04-topics/rep-chv2011/replication_aer")

# Outcome-equation controls (wage equation X).
chv2011_x <- c(
  "exp", "expsq", "cafqt", "cafqt2", "mhgc", "mhgc2", "numsibs", "numsibs2",
  "urban14", "lavlocwage17", "lavlocwage172", "avurate", "avurate2",
  "d57", "d58", "d59", "d60", "d61", "d62", "d63", "lwage5", "lurate"
)

# Choice-equation controls (excluding cohort dummies in Table 3 display).
chv2011_choice_x <- c(
  "cafqt", "cafqt2", "mhgc", "mhgc2", "numsibs", "numsibs2",
  "urban14", "lavlocwage17", "lavlocwage172", "avurate", "avurate2"
)

chv2011_cohort <- paste0("d", 57:63)

chv2011_inst_base <- c("pub4", "lwage5_17", "lurate_17", "tuition")

chv2011_inst_interact <- c("cafqt", "mhgc", "numsibs")

# Table 4a / bootstrap_MT regressors (wage equation, no P interactions).
chv2011_wage_x <- chv2011_x

chv2011_p_bounds <- c(0.0324, 0.9775)

#' Wage-equation regressors by Table 6 / robustness spec.
chv2011_wage_regressors <- function(spec = "baseline") {
  if (identical(spec, "dropdummies")) {
    return(c(chv2011_wage_x, "hsdrop", "somecol"))
  }
  chv2011_wage_x
}

#' Willis–Rosen schooling dummies for Table 6(a) col 3 (wage equation only).
add_chv2011_wage_spec_vars <- function(df, spec = "baseline") {
  if (!identical(spec, "dropdummies")) {
    return(df)
  }
  df |>
    mutate(
      hsdrop = as.integer(school == 1L),
      somecol = as.integer(school == 3L)
    )
}

#' Merge `school` onto bootstrap rows (not stored in bootdata/*.asc).
chv2011_attach_school <- function(df, aer_dir = chv2011_aer_dir) {
  if ("school" %in% names(df)) {
    return(df)
  }
  sch <- load_chv2011_raw(aer_dir) |> dplyr::select(caseid, school)
  dplyr::left_join(df, sch, by = "caseid")
}

#' Bootstrap directory for Table 6 specs (nodrop uses table6/bootdata_nodrop when present).
chv2011_boot_dir_for_spec <- function(spec = "baseline", aer_dir = chv2011_aer_dir) {
  if (identical(spec, "nodrop")) {
    nodrop_dir <- file.path(aer_dir, "table6/bootdata_nodrop")
    if (isTRUE(bootdata_status(nodrop_dir)$ok)) {
      return(nodrop_dir)
    }
  }
  chv2011_boot_dir(aer_dir)
}

# Paper Table A-3 targets (Carneiro, Heckman and Vytlacil 2011, Appendix).
chv2011_paper_a3 <- function() {
  tribble(
    ~var, ~mean0, ~sd0, ~mean1, ~sd1, ~source,
    "wage", 2.2089, 0.4412, 2.5496, 0.4959, "basic",
    "exp", 10.1042, 3.1260, 6.8404, 3.2522, "basic",
    "cafqt", -0.0446, 0.8673, 0.9515, 0.7498, "basic",
    "mhgc", 11.3083, 2.1056, 12.9121, 2.2789, "basic",
    "numsibs", 3.2630, 2.0842, 2.5849, 1.6450, "basic",
    "urban14", 0.6995, 0.4587, 0.7895, 0.4078, "basic",
    "lwage5", 10.2645, 0.1597, 10.3220, 0.1660, "local",
    "lurate", 6.7971, 1.3310, 6.8226, 1.1983, "local",
    "pub4", 0.4625, 0.4988, 0.5884, 0.4924, "local",
    "lwage5_17", 10.2780, 0.1619, 10.2736, 0.1651, "local",
    "lurate_17", 7.0804, 1.7846, 7.0846, 1.8449, "local",
    "tuition", 22.0164, 7.8730, 21.1105, 8.0683, "local",
    "lavlocwage17", 10.2673, 0.1798, 10.2991, 0.1945, "local",
    "avurate", 6.2942, 1.0156, 6.2077, 0.9536, "local"
  )
}

# Paper Table 3 / A-4 average marginal derivatives (full sample; phat1 in getboot.do).
chv2011_paper_table3_derivs <- function() {
  tribble(
    ~variable, ~paper,
    "cafqt", 0.2826,
    "mhgc", 0.0441,
    "numsibs", -0.0233,
    "urban14", 0.0340,
    "lavlocwage17", 0.1820,
    "avurate", 0.0058,
    "pub4", 0.0529,
    "lwage5_17", -0.2687,
    "lurate_17", 0.0149,
    "tuition", -0.0027
  )
}

chv2011_iv_terms <- function(include_numsibs = TRUE) {
  base <- chv2011_inst_base
  controls <- if (include_numsibs) {
    chv2011_inst_interact
  } else {
    c("cafqt", "mhgc")
  }
  c(base, as.vector(outer(base, controls, paste, sep = "_")))
}

detect_chv2011_data_mode <- function(
    aer_dir = chv2011_aer_dir) {
  dataset_path <- file.path(aer_dir, "dataset.dta")
  newid_path <- file.path(aer_dir, "newid.dta")
  if (file.exists(dataset_path)) {
    return(list(mode = "full", message = "Using merged dataset.dta from geocode pipeline."))
  }
  if (file.exists(newid_path)) {
    return(list(mode = "geocode", message = "Using newid.dta merge (geocode available)."))
  }
  list(
    mode = "bundled_aligned",
    message = paste(
      "Merging basicvariables.dta and localvariables.dta by row order",
      "(author-aligned replication bundle; N should be 1747)."
    )
  )
}

load_chv2011_raw <- function(aer_dir = chv2011_aer_dir) {
  mode_info <- detect_chv2011_data_mode(aer_dir)
  if (mode_info$mode == "full") {
    df <- read_dta(file.path(aer_dir, "dataset.dta"))
  } else if (mode_info$mode == "geocode") {
    basic <- read_dta(file.path(aer_dir, "basicvariables.dta")) |> arrange(caseid)
    newid <- read_dta(file.path(aer_dir, "newid.dta")) |> arrange(caseid)
    local <- read_dta(file.path(aer_dir, "localvariables.dta")) |> arrange(newid)
    df <- basic |>
      left_join(newid, by = "caseid") |>
      left_join(local, by = "newid")
  } else {
    basic <- read_dta(file.path(aer_dir, "basicvariables.dta"))
    local <- read_dta(file.path(aer_dir, "localvariables.dta")) |>
      select(-newid)
    df <- bind_cols(basic, local)
  }
  attr(df, "chv2011_data_mode") <- mode_info
  df
}

validate_chv2011_merge <- function(
    df = add_chv2011_derived(load_chv2011_raw()),
    tol_basic = 0.002,
    tol_local = 0.15) {
  mode <- attr(df, "chv2011_data_mode")
  mode_label <- if (is.list(mode) && !is.null(mode$mode)) mode$mode else "unknown"

  basic <- read_dta(file.path(chv2011_aer_dir, "basicvariables.dta"))
  local <- read_dta(file.path(chv2011_aer_dir, "localvariables.dta"))
  id_corr <- stats::cor(basic$caseid, local$newid)

  targets <- chv2011_paper_a3()
  report <- targets |>
    rowwise() |>
    mutate(
      got_mean0 = mean(df[[var]][df$state == 0], na.rm = TRUE),
      got_mean1 = mean(df[[var]][df$state == 1], na.rm = TRUE),
      got_sd0 = sd(df[[var]][df$state == 0], na.rm = TRUE),
      got_sd1 = sd(df[[var]][df$state == 1], na.rm = TRUE),
      dmean0 = got_mean0 - mean0,
      dmean1 = got_mean1 - mean1,
      dsd0 = got_sd0 - sd0,
      dsd1 = got_sd1 - sd1,
      tol = if_else(source == "basic", tol_basic, tol_local),
      ok_mean = abs(dmean0) <= tol & abs(dmean1) <= tol
    ) |>
    ungroup()

  list(
    mode = mode_label,
    n = nrow(df),
    n0 = sum(df$state == 0),
    n1 = sum(df$state == 1),
    caseid_newid_cor = id_corr,
    merge_note = paste(
      "Bundled AER files: bind_cols(basic, local[-newid]) by row order.",
      "Without geocode newid.dta, local instruments are only approximately aligned",
      sprintf("(cor(caseid, newid) = %.3f).", id_corr)
    ),
    all_basic_ok = all(report$ok_mean[report$source == "basic"]),
    all_local_ok = all(report$ok_mean[report$source == "local"]),
    max_mean_diff = max(abs(c(report$dmean0, report$dmean1))),
    report = report
  )
}

validate_chv2011_table3 <- function(
    df = add_chv2011_derived(load_chv2011_raw()),
    fit = glm(choice_formula("baseline"), data = df, family = binomial(link = "logit")),
    tol_control = 0.04,
    tol_instrument = 0.03) {
  avder <- average_marginal_derivatives(fit, df)
  cmp <- chv2011_paper_table3_derivs() |>
    left_join(avder, by = "variable") |>
    mutate(
      diff = avg_deriv - paper,
      group = if_else(variable %in% chv2011_inst_base, "instrument", "control"),
      tol = if_else(group == "instrument", tol_instrument, tol_control),
      ok = abs(diff) <= tol
    )

  list(
    estimates = cmp,
    joint_iv_p = chv2011_joint_iv_test(fit),
    all_controls_ok = all(cmp$ok[cmp$group == "control"]),
    all_instruments_ok = all(cmp$ok[cmp$group == "instrument"]),
    note = paste(
      "Point estimates use full-sample logit + normalden(xbeta), matching avder_ddd.do on phat1.",
      "Paper joint IV p = 0.0001 requires geocode merge (dataset.dta); bundled row merge",
      "attenuates instrument relevance."
    )
  )
}

chv2011_joint_iv_test <- function(fit, terms = chv2011_iv_terms()) {
  terms <- intersect(terms, names(coef(fit)))
  if (length(terms) == 0) {
    return(NA_real_)
  }
  restricted <- stats::update.formula(formula(fit), paste(". ~ . -", paste(terms, collapse = " - ")))
  fit_r <- stats::update(fit, restricted)
  lr <- stats::anova(fit_r, fit, test = "Chisq")
  lr$`Pr(>Chi)`[2]
}

add_chv2011_derived <- function(df) {
  df |>
    mutate(
      tuition = tuit4c,
      cafqt2 = cafqt^2,
      mhgc2 = mhgc^2,
      numsibs2 = numsibs^2,
      lavlocwage172 = lavlocwage17^2,
      avurate2 = avurate^2
    ) |>
    chv2011_add_iv_interactions()
}

chv2011_add_iv_interactions <- function(df, inst = chv2011_inst_base, controls = chv2011_inst_interact) {
  out <- df
  for (z in inst) {
    for (x in controls) {
      nm <- paste0(z, "_", x)
      out[[nm]] <- out[[z]] * out[[x]]
    }
  }
  out
}

trim_common_support <- function(df, bounds = chv2011_p_bounds, phat_col = "phat") {
  df |>
    filter(.data[[phat_col]] >= bounds[1], .data[[phat_col]] <= bounds[2])
}

choice_formula <- function(spec = "baseline") {
  x_part <- switch(
    spec,
    baseline = paste(c(chv2011_choice_x, chv2011_cohort), collapse = " + "),
    allxinz = paste(c(chv2011_x, chv2011_cohort), collapse = " + "),
    nointall = paste(c(chv2011_choice_x, chv2011_cohort, chv2011_inst_base), collapse = " + "),
    notuitnounemp = paste(c(chv2011_choice_x, chv2011_cohort, "pub4", "lwage5_17"), collapse = " + "),
    notuit = paste(c(chv2011_choice_x, chv2011_cohort, "pub4", "lwage5_17", "lurate_17"), collapse = " + "),
    paste(c(chv2011_choice_x, chv2011_cohort), collapse = " + ")
  )
  z_part <- switch(
    spec,
    nointall = "",
    notuitnounemp = "",
    notuit = "",
    {
      iv_terms <- c(
        chv2011_inst_base,
        as.vector(outer(chv2011_inst_base, chv2011_inst_interact, paste, sep = "_"))
      )
      paste(iv_terms, collapse = " + ")
    }
  )
  rhs <- if (nzchar(z_part)) paste(x_part, z_part, sep = " + ") else x_part
  stats::as.formula(paste("state ~", rhs))
}

filter_chv2011_spec <- function(df, spec = "baseline") {
  switch(
    spec,
    nodrop = df |> filter(school != 1),
    df
  )
}

estimate_propensity <- function(df, spec = "baseline") {
  fml <- choice_formula(spec)
  fit <- glm(fml, data = df, family = binomial(link = "logit"))
  xbeta <- as.numeric(predict(fit, type = "link"))
  phat <- as.numeric(predict(fit, type = "response"))
  xgamma <- predict(fit, newdata = df, type = "link", se.fit = FALSE)
  # Linear index from X-only part (for policy weights).
  x_mat <- model.matrix(
    stats::update(fml, . ~ . - pub4 - lwage5_17 - lurate_17 - tuition -
      pub4_cafqt - pub4_mhgc - pub4_numsibs -
      lwage5_17_cafqt - lwage5_17_mhgc - lwage5_17_numsibs -
      lurate_17_cafqt - lurate_17_mhgc - lurate_17_numsibs -
      tuition_cafqt - tuition_mhgc - tuition_numsibs),
    data = df
  )
  if (ncol(x_mat) > 1) {
    xgamma_val <- as.numeric(x_mat %*% coef(fit)[names(coef(fit)) %in% colnames(x_mat)])
  } else {
    xgamma_val <- rep(0, nrow(df))
  }
  zgamma <- xbeta - xgamma_val
  list(model = fit, phat = phat, xbeta = xbeta, xgamma = xgamma_val, zgamma = zgamma)
}

average_marginal_derivatives <- function(fit, df) {
  xb <- as.numeric(predict(fit, type = "link"))
  dens <- dnorm(xb)
  b <- coef(fit)
  nm <- names(b)
  getb <- function(k) if (k %in% nm) b[[k]] else 0

  tibble(
    variable = c(
      "cafqt", "mhgc", "numsibs", "urban14", "lavlocwage17", "avurate",
      "pub4", "lwage5_17", "lurate_17", "tuition"
    ),
    avg_deriv = c(
      mean(dens * (getb("cafqt") + 2 * getb("cafqt2") * df$cafqt +
        getb("pub4_cafqt") * df$pub4 + getb("lwage5_17_cafqt") * df$lwage5_17 +
        getb("lurate_17_cafqt") * df$lurate_17 + getb("tuition_cafqt") * df$tuition)),
      mean(dens * (getb("mhgc") + 2 * getb("mhgc2") * df$mhgc +
        getb("pub4_mhgc") * df$pub4 + getb("lwage5_17_mhgc") * df$lwage5_17 +
        getb("lurate_17_mhgc") * df$lurate_17 + getb("tuition_mhgc") * df$tuition)),
      mean(dens * (getb("numsibs") + 2 * getb("numsibs2") * df$numsibs +
        getb("pub4_numsibs") * df$pub4 + getb("lwage5_17_numsibs") * df$lwage5_17 +
        getb("lurate_17_numsibs") * df$lurate_17 + getb("tuition_numsibs") * df$tuition)),
      mean(dens * getb("urban14")),
      mean(dens * (getb("lavlocwage17") + 2 * getb("lavlocwage172") * df$lavlocwage17)),
      mean(dens * (getb("avurate") + 2 * getb("avurate2") * df$avurate)),
      mean(dens * (getb("pub4") + getb("pub4_cafqt") * df$cafqt +
        getb("pub4_mhgc") * df$mhgc + getb("pub4_numsibs") * df$numsibs)),
      mean(dens * (getb("lwage5_17") + getb("lwage5_17_cafqt") * df$cafqt +
        getb("lwage5_17_mhgc") * df$mhgc + getb("lwage5_17_numsibs") * df$numsibs)),
      mean(dens * (getb("lurate_17") + getb("lurate_17_cafqt") * df$cafqt +
        getb("lurate_17_mhgc") * df$mhgc + getb("lurate_17_numsibs") * df$numsibs)),
      mean(dens * (getb("tuition") + getb("tuition_cafqt") * df$cafqt +
        getb("tuition_mhgc") * df$mhgc + getb("tuition_numsibs") * df$numsibs))
    )
  )
}

bootstrap_avg_derivatives <- function(
    df,
    B = 250,
    seed = 703296661,
    spec = "baseline",
    boot_dir = chv2011_boot_dir(),
    use_bootdata = NULL) {
  if (is.null(use_bootdata)) {
    use_bootdata <- !identical(Sys.getenv("CHV2011_USE_BOOTDATA", "TRUE"), "FALSE")
  }

  status <- if (use_bootdata) bootdata_status(boot_dir) else list(ok = FALSE)
  if (isTRUE(status$ok) && status$B >= B) {
    boot <- matrix(NA_real_, nrow = 10, ncol = B)
    for (b in seq_len(B)) {
      dfb <- load_chv2011_boot_sample(b, boot_dir = boot_dir, for_analysis = TRUE)
      fitb <- glm(choice_formula(spec), data = dfb, family = binomial(link = "logit"))
      boot[, b] <- average_marginal_derivatives(fitb, dfb)$avg_deriv
    }
    return(apply(boot, 1, sd, na.rm = TRUE))
  }

  set.seed(seed)
  n <- nrow(df)
  idx_mat <- matrix(sample.int(n, n * B, replace = TRUE), nrow = n, ncol = B)
  boot <- matrix(NA_real_, nrow = 10, ncol = B)
  for (b in seq_len(B)) {
    dfb <- df[idx_mat[, b], , drop = FALSE]
    fitb <- glm(choice_formula(spec), data = dfb, family = binomial(link = "logit"))
    boot[, b] <- average_marginal_derivatives(fitb, dfb)$avg_deriv
  }
  apply(boot, 1, sd, na.rm = TRUE)
}

load_chv2011_analysis <- function(spec = "baseline", trim = TRUE) {
  raw <- load_chv2011_raw() |>
    add_chv2011_derived() |>
    filter_chv2011_spec(spec) |>
    add_chv2011_wage_spec_vars(spec)
  prop <- estimate_propensity(raw, spec)
  raw$phat <- prop$phat
  raw$xbeta <- prop$xbeta
  raw$zgamma <- prop$zgamma
  if (trim) {
    raw <- trim_common_support(raw)
  }
  list(
    data = raw,
    propensity = prop,
    mode = attr(load_chv2011_raw(), "chv2011_data_mode"),
    spec = spec
  )
}

make_bootstrap_indices <- function(n, B = 250, seed = 703296661) {
  set.seed(seed)
  matrix(sample.int(n, n * B, replace = TRUE), nrow = n, ncol = B)
}

# ---- Bootstrap samples (Stata getboot.do) ------------------------------------

#' Column order written to bootdata/phat*.asc (matches getboot.do outfile).
chv2011_boot_columns <- function() {
  c(
    "caseid", "state", "wage", "const", "exp", "expsq", "cafqt", "mhgc",
    paste0("d", 57:63),
    "lwage5", "lurate", "pub4", "lwage5_17", "lurate_17",
    "urban14", "lavlocwage17", "avurate", "numsibs", "tuit4c"
  )
}

chv2011_boot_dir <- function(aer_dir = chv2011_aer_dir) {
  file.path(aer_dir, "bootdata")
}

#' Build auxboot frame: sorted by caseid with observation index n (Stata egen n = group(caseid)).
prepare_chv2011_boot_base <- function(aer_dir = chv2011_aer_dir) {
  cols <- chv2011_boot_columns()
  load_chv2011_raw(aer_dir) |>
    arrange(caseid) |>
    mutate(n = row_number()) |>
    select(all_of(cols), n)
}

write_chv2011_boot_asc <- function(df, path, cols = chv2011_boot_columns()) {
  dir.create(dirname(path), recursive = TRUE, showWarnings = FALSE)
  mat <- df |> select(all_of(cols))
  write.table(
    mat,
    file = path,
    row.names = FALSE,
    col.names = FALSE,
    sep = " ",
    quote = FALSE
  )
  invisible(path)
}

bootdata_status <- function(boot_dir = chv2011_boot_dir()) {
  meta_path <- file.path(boot_dir, "boot_meta.rds")
  if (!file.exists(meta_path)) {
    return(list(
      ok = FALSE,
      message = "bootdata/ not generated — run Rcode/getboot.R or generate_chv2011_bootdata()"
    ))
  }
  meta <- readRDS(meta_path)
  asc_files <- list.files(boot_dir, pattern = "^phat[0-9]+\\.asc$", full.names = FALSE)
  asc_count <- length(asc_files)
  list(
    ok = asc_count >= meta$B,
    B = meta$B,
    n = meta$n,
    asc_count = asc_count,
    data_mode = meta$data_mode,
    seed = meta$seed,
    boot_dir = boot_dir,
    message = if (asc_count >= meta$B) {
      sprintf("%d bootstrap .asc files ready (N = %d)", asc_count, meta$n)
    } else {
      sprintf("Incomplete: %d / %d .asc files", asc_count, meta$B)
    }
  )
}

#' R port of replication_aer/getboot.do.
#'
#' Writes phat1.asc (full sample) and phat2.asc … phatB.asc (bootstrap draws),
#' plus boot_base.rds, bootstrap_indices.rds, and boot_meta.rds for fast R reuse.
generate_chv2011_bootdata <- function(
    B = as.integer(Sys.getenv("CHV2011_BOOT", "250")),
    seed = as.integer(Sys.getenv("CHV2011_BOOT_SEED", "703296661")),
    boot_dir = chv2011_boot_dir(),
    write_asc = TRUE,
    overwrite = isTRUE(as.logical(Sys.getenv("CHV2011_BOOT_OVERWRITE", "FALSE"))),
    aer_dir = chv2011_aer_dir) {
  if (B < 1L) {
    stop("B must be >= 1")
  }

  base <- prepare_chv2011_boot_base(aer_dir)
  n <- nrow(base)
  cols <- chv2011_boot_columns()
  mode_info <- detect_chv2011_data_mode(aer_dir)

  dir.create(boot_dir, recursive = TRUE, showWarnings = FALSE)

  set.seed(seed)
  idx_mat <- if (B > 1L) {
    matrix(sample.int(n, n * (B - 1L), replace = TRUE), nrow = n, ncol = B - 1L)
  } else {
    matrix(integer(0), nrow = n, ncol = 0L)
  }

  paths <- character(B)
  if (write_asc) {
    for (b in seq_len(B)) {
      path_b <- file.path(boot_dir, sprintf("phat%d.asc", b))
      if (!file.exists(path_b) || overwrite) {
        sample_b <- if (b == 1L) {
          base
        } else {
          base[idx_mat[, b - 1L], , drop = FALSE]
        }
        write_chv2011_boot_asc(sample_b, path_b, cols)
      }
      paths[b] <- path_b
    }
  }

  saveRDS(base, file.path(boot_dir, "boot_base.rds"))
  saveRDS(idx_mat, file.path(boot_dir, "bootstrap_indices.rds"))
  saveRDS(
    list(
      n = n,
      B = B,
      seed = seed,
      columns = cols,
      data_mode = mode_info$mode,
      generated = Sys.time()
    ),
    file.path(boot_dir, "boot_meta.rds")
  )

  invisible(list(
    paths = paths,
    n = n,
    B = B,
    idx_mat = idx_mat,
    base = base,
    status = bootdata_status(boot_dir)
  ))
}

#' Load bootstrap sample i (1 = full sample / phat1, 2…B = resamples).
load_chv2011_boot_sample <- function(
    i,
    boot_dir = chv2011_boot_dir(),
    for_analysis = FALSE) {
  if (i < 1L) {
    stop("i must be >= 1")
  }

  meta_path <- file.path(boot_dir, "boot_meta.rds")
  if (!file.exists(meta_path)) {
    stop(
      "bootdata/ not found under ", boot_dir,
      ". Run generate_chv2011_bootdata() or Rscript Rcode/getboot.R first."
    )
  }

  meta <- readRDS(meta_path)
  if (i > meta$B) {
    stop(sprintf("Only %d bootstrap samples available (requested i = %d)", meta$B, i))
  }

  cols <- chv2011_boot_columns()
  base <- readRDS(file.path(boot_dir, "boot_base.rds"))

  df <- if (i == 1L) {
    base[, cols, drop = FALSE]
  } else {
    idx <- readRDS(file.path(boot_dir, "bootstrap_indices.rds"))[, i - 1L]
    base[idx, cols, drop = FALSE]
  }

  if (for_analysis) {
    df <- add_chv2011_derived(df)
  }
  df
}

wage_interaction_terms <- function(df, phat_col = "phat") {
  p <- df[[phat_col]]
  s <- df$state
  out <- df
  for (v in chv2011_x) {
    out[[paste0("p", v)]] <- p * out[[v]]
    out[[paste0("s", v)]] <- s * out[[v]]
  }
  out
}

annualize_return <- function(x) x / 4