# Load required libraries
library(tidyverse)
library(haven)     # For reading Stata files
library(ivreg)     # For instrumental variables regression
library(knitr)     # For generating tables
library(kableExtra)
library(broom)     # For tidying model results
library(here)
library(gt)         # For creating gt tables
source(here("04-topics/rep-ak1991/Rcode/ak1991-gt-quarto.R"))

# Read the data
data <- read_dta(here("04-topics/rep-ak1991/stata/raw_data.dta"))

# Rename variables
data <- data %>%
  rename(
    AGE = v1,
    AGEQ = v2,
    EDUC = v4,
    ENOCENT = v5,
    ESOCENT = v6,
    LWKLYWGE = v9,
    MARRIED = v10,
    MIDATL = v11,
    MT = v12,
    NEWENG = v13,
    CENSUS = v16,
    SOB = v17,     # State of birth - key variable for Table VII
    QOB = v18,
    RACE = v19,
    SMSA = v20,
    SOATL = v21,
    WNOCENT = v24,
    WSOCENT = v25,
    YOB = v27
  )

# Adjust YOB for years >= 1900
data <- data %>%
  mutate(YOB = if_else(YOB >= 1900, YOB - 1900, YOB))

# Create YOB dummies
for (i in 0:9) {
  data <- data %>%
    mutate(!!paste0("YR", i) := if_else(YOB %in% c(20 + i, 30 + i, 40 + i), 1, 0))
}

# Create QOB dummies
for (i in 1:4) {
  data <- data %>%
    mutate(!!paste0("QTR", i) := if_else(QOB == i, 1, 0))
}

# Create QOB*YOB interaction dummies
for (j in 1:3) {
  for (i in 0:9) {
    data <- data %>%
      mutate(!!paste0("QTR", j, "YR", i) := !!sym(paste0("QTR", j)) * !!sym(paste0("YR", i)))
  }
}

# Create COHORT variable and adjust AGEQ
data <- data %>%
  mutate(
    COHORT = case_when(
      YOB <= 29 & YOB >= 20 ~ 2029,
      YOB <= 39 & YOB >= 30 ~ 3039,
      YOB <= 49 & YOB >= 40 ~ 4049,
      TRUE ~ NA_real_
    ),
    AGEQ = if_else(CENSUS == 80, AGEQ - 1900, AGEQ),
    AGEQSQ = AGEQ * AGEQ
  )

# Select particular men born
# Filter data for COHORT >3000 and COHORT < 3040
data <- data %>%
  filter(COHORT > 3000 & COHORT < 3040)

# Create state dummies
# First get unique states of birth
unique_states <- sort(unique(data$SOB))
num_states <- length(unique_states)

# Create state dummies (state1 to state51)
for (i in 1:num_states) {
  data <- data %>%
    mutate(!!paste0("state", i) := if_else(SOB == unique_states[i], 1, 0))
}

# Create QOB*SOB interaction dummies
for (j in 1:3) {
  for (i in 1:(num_states-1)) {
    data <- data %>%
      mutate(!!paste0("QTR", j, "state", i) := !!sym(paste0("QTR", j)) * !!sym(paste0("state", i)))
  }
}

# Define included and excluded instrument variables
# YR0-YR8 are part of both formulas and instruments
yr_vars <- paste0("YR", 0:8)

# QTR*YR interactions as instruments (excluded instruments)
qtr_yr_vars <- c()
for (j in 1:3) {
  for (i in 0:9) {
    qtr_yr_vars <- c(qtr_yr_vars, paste0("QTR", j, "YR", i))
  }
}

# QTR*state interactions as instruments
qtr_state_vars <- c()
for (j in 1:3) {
  for (i in 1:(num_states-1)) {
    qtr_state_vars <- c(qtr_state_vars, paste0("QTR", j, "state", i))
  }
}

# State dummies
state_vars <- paste0("state", 1:(num_states-1))

# Create formula strings for the excluded instruments (for clarity)
excluded_instruments <- c(qtr_yr_vars, qtr_state_vars)
excluded_instr_str <- paste(excluded_instruments, collapse = " + ")

# Define regression formulas
# Model 1: OLS with state dummies (no IV)
formula1 <- as.formula(paste("LWKLYWGE ~ EDUC + ", paste(yr_vars, collapse = " + "), " + ", 
                             paste(state_vars, collapse = " + ")))

# Model 2: IV with QTR*YR and QTR*state as instruments
formula2 <- as.formula(paste("LWKLYWGE ~ EDUC + ", paste(yr_vars, collapse = " + "), " + ", 
                             paste(state_vars, collapse = " + "), " | ", 
                             paste(yr_vars, collapse = " + "), " + ", 
                             paste(state_vars, collapse = " + "), " + ", 
                             excluded_instr_str))

# Model 3: OLS with state dummies and age controls
formula3 <- as.formula(paste("LWKLYWGE ~ EDUC + ", paste(yr_vars, collapse = " + "), " + AGEQ + AGEQSQ + ", 
                             paste(state_vars, collapse = " + ")))

# Model 4: IV with age controls
formula4 <- as.formula(paste("LWKLYWGE ~ EDUC + ", paste(yr_vars, collapse = " + "), " + AGEQ + AGEQSQ + ", 
                             paste(state_vars, collapse = " + "), " | ", 
                             paste(yr_vars, collapse = " + "), " + AGEQ + AGEQSQ + ", 
                             paste(state_vars, collapse = " + "), " + ", 
                             excluded_instr_str))

# Model 5: OLS with demographic controls
formula5 <- as.formula(paste("LWKLYWGE ~ EDUC + RACE + MARRIED + SMSA + NEWENG + MIDATL + ENOCENT + ",
                             "WNOCENT + SOATL + ESOCENT + WSOCENT + MT + ", 
                             paste(yr_vars, collapse = " + "), " + ", 
                             paste(state_vars, collapse = " + ")))

# Model 6: IV with demographic controls
formula6 <- as.formula(paste("LWKLYWGE ~ EDUC + RACE + MARRIED + SMSA + NEWENG + MIDATL + ENOCENT + ",
                             "WNOCENT + SOATL + ESOCENT + WSOCENT + MT + ", 
                             paste(yr_vars, collapse = " + "), " + ", 
                             paste(state_vars, collapse = " + "), " | ", 
                             "RACE + MARRIED + SMSA + NEWENG + MIDATL + ENOCENT + ",
                             "WNOCENT + SOATL + ESOCENT + WSOCENT + MT + ", 
                             paste(yr_vars, collapse = " + "), " + ", 
                             paste(state_vars, collapse = " + "), " + ", 
                             excluded_instr_str))

# Model 7: OLS with all controls
formula7 <- as.formula(paste("LWKLYWGE ~ EDUC + RACE + MARRIED + SMSA + NEWENG + MIDATL + ENOCENT + ",
                             "WNOCENT + SOATL + ESOCENT + WSOCENT + MT + ", 
                             paste(yr_vars, collapse = " + "), " + AGEQ + AGEQSQ + ", 
                             paste(state_vars, collapse = " + ")))

# Model 8: IV with all controls
formula8 <- as.formula(paste("LWKLYWGE ~ EDUC + RACE + MARRIED + SMSA + NEWENG + MIDATL + ENOCENT + ",
                             "WNOCENT + SOATL + ESOCENT + WSOCENT + MT + ", 
                             paste(yr_vars, collapse = " + "), " + AGEQ + AGEQSQ + ", 
                             paste(state_vars, collapse = " + "), " | ", 
                             "RACE + MARRIED + SMSA + NEWENG + MIDATL + ENOCENT + ",
                             "WNOCENT + SOATL + ESOCENT + WSOCENT + MT + AGEQ + AGEQSQ + ", 
                             paste(yr_vars, collapse = " + "), " + ", 
                             paste(state_vars, collapse = " + "), " + ", 
                             excluded_instr_str))

# Run regressions
model1 <- lm(formula1, data = data)
# summary(model1)
model2 <- ivreg(formula2, data = data)
# summary(model2)
model3 <- lm(formula3, data = data)
# summary(model3)
model4 <- ivreg(formula4, data = data)
# summary(model4)
model5 <- lm(formula5, data = data)
# summary(model5)
model6 <- ivreg(formula6, data = data)
# summary(model6)
model7 <- lm(formula7, data = data)
# summary(model7)
model8 <- ivreg(formula8, data = data)
# summary(model8)

# Extract coefficients and standard errors
table_data <- bind_rows(
  broom::tidy(model1, conf.int = TRUE) %>% mutate(model = "model1"),
  broom::tidy(model2, conf.int = TRUE) %>% mutate(model = "model2"),
  broom::tidy(model3, conf.int = TRUE) %>% mutate(model = "model3"),
  broom::tidy(model4, conf.int = TRUE) %>% mutate(model = "model4"),
  broom::tidy(model5, conf.int = TRUE) %>% mutate(model = "model5"),
  broom::tidy(model6, conf.int = TRUE) %>% mutate(model = "model6"),
  broom::tidy(model7, conf.int = TRUE) %>% mutate(model = "model7"),
  broom::tidy(model8, conf.int = TRUE) %>% mutate(model = "model8")
) %>%
  filter(term %in% c("EDUC", "RACE", "SMSA", "MARRIED", "AGEQ", "AGEQSQ")) %>%
  mutate(
    term = case_when(
      term == "EDUC" ~ "Years of education",
      term == "RACE" ~ "Race(1 = black)",
      term == "SMSA" ~ "SMSA (1 = center city)",
      term == "MARRIED" ~ "Married (1 = married)",
      term == "AGEQ" ~ "Age",
      term == "AGEQSQ" ~ "Age-squared",
      TRUE ~ term
    ),
    p.sig = case_when(
      p.value < 0.001 ~ "***",
      p.value < 0.01 ~ "**",
      p.value < 0.05 ~ "*",
      TRUE ~ ""
    ),
    value = ak91_coef_cell(estimate, std.error, p.sig)
  ) %>%
  select(term, model, value) %>%
  pivot_wider(
    names_from = model,
    values_from = value
  ) %>%
  # reorder the term
  mutate(
    term = factor(
      term, 
      levels = c("Years of education", "Race(1 = black)", "SMSA (1 = center city)", 
                "Married (1 = married)", "Age", "Age-squared"))
  ) %>%
  arrange(term) %>%
  rename_all(~ all_of(c("Independent Variables", "(1) <br>OLS", "(2) <br>TSLS", 
                       "(3) <br>OLS", "(4) <br>TSLS", "(5) <br>OLS", "(6) <br>TSLS", 
                       "(7) <br>OLS", "(8) <br>TSLS")))

# Create the table using kableExtra
kbl_out <- table_data %>%
  kbl(
    format = "html", escape = FALSE,
    scroll = TRUE,
    align = c("l", "c", "c", "c", "c", "c", "c", "c", "c", "c"),
    caption = "Table VII: OLS and TSLS Estimates of the Return to Education for Men Born 1930-1939: 1980 Census"
    ) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
  footnote(
    general = "Yes-No Dummies are not kept in the shown figure.",
    number = c(
      "$^{*}$ p<0.05; $^{**}$ p<0.01; $^{***}$ p<0.001",
      "The dependent variable is the log of weekly wage. The sample includes men born 1930-1939 in the 1980 Census.",
      "The excluded instrumental variables for all TSLS models are not shown in the table and include quarter of birth-year of birth (27 interactions) and quarter of birth-state of birth (150 interactions) dummies."
    )
  )

# Create the table using gt
table_data <- ak91_quarto_blank_df(table_data)
gt_tbl <- table_data %>%
  gt() %>%
  # Set title
  tab_header(
    title = "Table VII: OLS and TSLS Estimates of the Return to Education for Men Born 1930-1939: 1980 Census"
  ) %>%
  # Set column alignment
  cols_align(
    align = "left",
    columns = 1
  ) %>%
  cols_align(
    align = "center",
    columns = 2:9
  ) %>%
  # Add footnotes
  tab_footnote(
    footnote = "Yes-No Dummies are not kept in the shown figure.",
    locations = cells_title(groups = "title")
  ) %>%
  tab_footnote(
    footnote = md("$^{*}p<0.05; ^{**}p<0.01; ^{***}p<0.001$"),
    locations = cells_title(groups = "title")
  ) %>%
  tab_footnote(
    footnote = "The dependent variable is the log of weekly wage. The sample includes men born 1930-1939 in the 1980 Census.",
    locations = cells_title(groups = "title")
  ) %>%
  tab_footnote(
    footnote = md("The excluded instrumental variables for all TSLS models are not shown in the table and include quarter of birth-year of birth (27 interactions) and quarter of birth-state of birth (150 interactions) dummies."),
    locations = cells_title(groups = "title")
  ) %>%
  # Set table style
  opt_row_striping() %>%
  tab_options(
    table.font.size = px(14),
    heading.background.color = "#e8e8e8",
    column_labels.font.weight = "bold",
    table.border.top.width = px(3),
    table.border.bottom.width = px(3),
    container.width = pct(100),
    table.width = px(900)  # 固定宽度会自动启用滚动条
  ) %>%
  # Ensure math symbols render correctly
  fmt_markdown(columns = everything()) %>%
  # Ensure all column headers use gt package's html() syntax
  cols_label(.list = ak91_gt_cols_label(names(table_data)))

# Save the objects to RData file
save(table_data, gt_tbl, file = here("04-topics/rep-ak1991/Rcode/Table_VII.RData"))