# Load required libraries
library(tidyverse)
library(haven)  # For reading Stata files
library(ivreg)  # For instrumental variables regression
library(knitr)  # For generating tables
library(kableExtra)
library(broom)  # For tidying model results
library(here)
library(gt)  # For creating gt tables
source(here("04-topics/rep-ak1991/Rcode/ak1991-gt-quarto.R"))

# Read the data
data <- read_dta(here("04-topics/rep-ak1991/stata/raw_data.dta"))

# Rename variables
data <- data %>%
  rename(
    AGE = v1,
    AGEQ = v2,
    EDUC = v4,
    ENOCENT = v5,
    ESOCENT = v6,
    LWKLYWGE = v9,
    MARRIED = v10,
    MIDATL = v11,
    MT = v12,
    NEWENG = v13,
    CENSUS = v16,
    QOB = v18,
    RACE = v19,
    SMSA = v20,
    SOATL = v21,
    WNOCENT = v24,
    WSOCENT = v25,
    YOB = v27
  )

# Adjust YOB for years >= 1900
data <- data %>%
  mutate(YOB = if_else(YOB >= 1900, YOB - 1900, YOB))

# Create YOB dummies
for (i in 0:9) {
  data <- data %>%
    mutate(!!paste0("YR", i) := if_else(YOB %in% c(20 + i, 30 + i, 40 + i), 1, 0))
}

# Create QOB dummies
for (i in 1:4) {
  data <- data %>%
    mutate(!!paste0("QTR", i) := if_else(QOB == i, 1, 0))
}

# Create QOB*YOB interaction dummies
for (j in 1:3) {
  for (i in 0:9) {
    data <- data %>%
      mutate(!!paste0("QTR", j, "YR", i) := !!sym(paste0("QTR", j)) * !!sym(paste0("YR", i)))
  }
}

# Create COHORT variable and adjust AGEQ
data <- data %>%
  mutate(
    COHORT = case_when(
      YOB <= 29 & YOB >= 20 ~ 2029,
      YOB <= 39 & YOB >= 30 ~ 3039,
      YOB <= 49 & YOB >= 40 ~ 4049,
      TRUE ~ NA_real_
    ),
    AGEQ = if_else(CENSUS == 80, AGEQ - 1900, AGEQ),
    AGEQSQ = AGEQ * AGEQ
  )

# Select particular men born
# Filter data for COHORT > 4000
data <- data %>%
  filter(COHORT > 4000)

# Define excluded instrumental variables
iv_excluded <- c(
  "QTR1YR0", "QTR1YR1", "QTR1YR2", "QTR1YR3", "QTR1YR4", "QTR1YR5", "QTR1YR6",
  "QTR1YR7", "QTR1YR8", "QTR1YR9", "QTR2YR0", "QTR2YR1", "QTR2YR2", "QTR2YR3",
  "QTR2YR4", "QTR2YR5", "QTR2YR6", "QTR2YR7", "QTR2YR8", "QTR2YR9", "QTR3YR0",
  "QTR3YR1", "QTR3YR2", "QTR3YR3", "QTR3YR4", "QTR3YR5", "QTR3YR6", "QTR3YR7",
  "QTR3YR8", "QTR3YR9"
)

# Define regression formulas
formula1 <- LWKLYWGE ~ EDUC + YR0 + YR1 + YR2 + YR3 + YR4 + YR5 + YR6 + YR7 + YR8

formula2 <- LWKLYWGE ~ EDUC + YR0 + YR1 + YR2 + YR3 + YR4 + YR5 + YR6 + YR7 + YR8 |
            QTR1YR0 + QTR1YR1 + QTR1YR2 + QTR1YR3 + QTR1YR4 + QTR1YR5 +
            QTR1YR6 + QTR1YR7 + QTR1YR8 + QTR1YR9 + QTR2YR0 + QTR2YR1 + QTR2YR2 +
            QTR2YR3 + QTR2YR4 + QTR2YR5 + QTR2YR6 + QTR2YR7 + QTR2YR8 + QTR2YR9 +
            QTR3YR0 + QTR3YR1 + QTR3YR2 + QTR3YR3 + QTR3YR4 + QTR3YR5 + QTR3YR6 +
            QTR3YR7 + QTR3YR8 + QTR3YR9 + YR0 + YR1 + YR2 + YR3 + YR4 + YR5 +
            YR6 + YR7 + YR8
iv2_included <- c(
  "YR0", "YR1", "YR2", "YR3", "YR4", "YR5", "YR6", "YR7", "YR8"
)
iv2_all <- c(iv2_included, iv_excluded)

formula3 <- LWKLYWGE ~ EDUC + YR0 + YR1 + YR2 + YR3 + YR4 + YR5 + YR6 + YR7 + YR8 +
            AGEQ + AGEQSQ

formula4 <- LWKLYWGE ~ EDUC + YR0 + YR1 + YR2 + YR3 + YR4 + YR5 + YR6 + YR7 + YR8 +
            AGEQ + AGEQSQ | QTR1YR0 + QTR1YR1 + QTR1YR2 + QTR1YR3 +
            QTR1YR4 + QTR1YR5 + QTR1YR6 + QTR1YR7 + QTR1YR8 + QTR1YR9 + QTR2YR0 +
            QTR2YR1 + QTR2YR2 + QTR2YR3 + QTR2YR4 + QTR2YR5 + QTR2YR6 + QTR2YR7 +
            QTR2YR8 + QTR2YR9 + QTR3YR0 + QTR3YR1 + QTR3YR2 + QTR3YR3 + QTR3YR4 +
            QTR3YR5 + QTR3YR6 + QTR3YR7 + QTR3YR8 + QTR3YR9 +
            YR0 + YR1 + YR2 + YR3 + YR4 + YR5 + YR6 + YR7 + YR8 +
            AGEQ + AGEQSQ

iv4_included <- c(
  "YR0", "YR1", "YR2", "YR3", "YR4", "YR5", "YR6", "YR7", "YR8",
  "AGEQ", "AGEQSQ"
)
iv4_all <- c(iv4_included, iv_excluded)

formula5 <- LWKLYWGE ~ EDUC + RACE + MARRIED + SMSA + NEWENG + MIDATL + ENOCENT +
            WNOCENT + SOATL + ESOCENT + WSOCENT + MT + YR0 + YR1 + YR2 + YR3 +
            YR4 + YR5 + YR6 + YR7 + YR8

formula6 <- LWKLYWGE ~ EDUC + RACE + MARRIED + SMSA + NEWENG + MIDATL + ENOCENT +
            WNOCENT + SOATL + ESOCENT + WSOCENT + MT + YR0 + YR1 + YR2 + YR3 +
            YR4 + YR5 + YR6 + YR7 + YR8 | QTR1YR0 + QTR1YR1 + QTR1YR2 + QTR1YR3 +
            QTR1YR4 + QTR1YR5 + QTR1YR6 + QTR1YR7 + QTR1YR8 + QTR1YR9 + QTR2YR0 +
            QTR2YR1 + QTR2YR2 + QTR2YR3 + QTR2YR4 + QTR2YR5 + QTR2YR6 + QTR2YR7 +
            QTR2YR8 + QTR2YR9 + QTR3YR0 + QTR3YR1 + QTR3YR2 + QTR3YR3 + QTR3YR4 +
            QTR3YR5 + QTR3YR6 + QTR3YR7 + QTR3YR8 + QTR3YR9 + YR0 + YR1 + YR2 +
            YR3 + YR4 + YR5 + YR6 + YR7 + YR8 +
            RACE + MARRIED + SMSA +
            NEWENG + MIDATL + ENOCENT +  WNOCENT + SOATL + ESOCENT + WSOCENT + MT
iv6_included <- c(
  "YR0", "YR1", "YR2", "YR3", "YR4", "YR5", "YR6", "YR7", "YR8",
  "RACE", "MARRIED", "SMSA",
  "NEWENG", "MIDATL", "ENOCENT", "WNOCENT", "SOATL", "ESOCENT", "WSOCENT", "MT"
)
iv6_all <- c(iv6_included, iv_excluded)

formula7 <- LWKLYWGE ~ EDUC + RACE + MARRIED + SMSA + NEWENG + MIDATL + ENOCENT +
            WNOCENT + SOATL + ESOCENT + WSOCENT + MT + YR0 + YR1 + YR2 + YR3 +
            YR4 + YR5 + YR6 + YR7 + YR8 + AGEQ + AGEQSQ

formula8 <- LWKLYWGE ~ EDUC + RACE + MARRIED + SMSA + NEWENG + MIDATL + ENOCENT +
            WNOCENT + SOATL + ESOCENT + WSOCENT + MT + AGEQ + AGEQSQ +
            YR0 + YR1 + YR2 + YR3 + YR4 + YR5 + YR6 + YR7 + YR8 |
            QTR1YR0 + QTR1YR1 + QTR1YR2 + QTR1YR3 + QTR1YR4 + QTR1YR5 + QTR1YR6 +
            QTR1YR7 + QTR1YR8 + QTR1YR9 + QTR2YR0 + QTR2YR1 + QTR2YR2 + QTR2YR3 +
            QTR2YR4 + QTR2YR5 + QTR2YR6 + QTR2YR7 + QTR2YR8 + QTR2YR9 + QTR3YR0 +
            QTR3YR1 + QTR3YR2 + QTR3YR3 + QTR3YR4 + QTR3YR5 + QTR3YR6 + QTR3YR7 +
            QTR3YR8 + QTR3YR9 +
            YR0 + YR1 + YR2 + YR3 + YR4 + YR5 + YR6 + YR7 + YR8 +
            RACE + MARRIED + SMSA +
            NEWENG + MIDATL + ENOCENT + WNOCENT + SOATL + ESOCENT + WSOCENT + MT +
            AGEQ + AGEQSQ

iv8_included <- c(
  "YR0", "YR1", "YR2", "YR3", "YR4", "YR5", "YR6", "YR7", "YR8",
  "RACE", "MARRIED", "SMSA",
  "NEWENG", "MIDATL", "ENOCENT", "WNOCENT", "SOATL", "ESOCENT", "WSOCENT", "MT",
  "AGEQ", "AGEQSQ"
)
iv8_all <- c(iv8_included, iv_excluded)

# Run regressions
model1 <- lm(formula1, data = data)
model2 <- ivreg(formula2, data = data)
model3 <- lm(formula3, data = data)
model4 <- ivreg(formula4, data = data)
model5 <- lm(formula5, data = data)
model6 <- ivreg(formula6, data = data)
model7 <- lm(formula7, data = data)
model8 <- ivreg(formula8, data = data)

# Extract coefficients and standard errors
table_data <- bind_rows(
  broom::tidy(model1, conf.int = TRUE) %>% mutate(model = "model1"),
  broom::tidy(model2, conf.int = TRUE) %>% mutate(model = "model2"),
  broom::tidy(model3, conf.int = TRUE) %>% mutate(model = "model3"),
  broom::tidy(model4, conf.int = TRUE) %>% mutate(model = "model4"),
  broom::tidy(model5, conf.int = TRUE) %>% mutate(model = "model5"),
  broom::tidy(model6, conf.int = TRUE) %>% mutate(model = "model6"),
  broom::tidy(model7, conf.int = TRUE) %>% mutate(model = "model7"),
  broom::tidy(model8, conf.int = TRUE) %>% mutate(model = "model8")
) %>%
  filter(term %in% c("EDUC", "RACE", "SMSA", "MARRIED", "AGEQ", "AGEQSQ")) %>%
  mutate(
    term = case_when(
      term == "EDUC" ~ "Years of education",
      term == "RACE" ~ "Race (1 = black)",
      term == "SMSA" ~ "SMSA (1 = center city)",
      term == "MARRIED" ~ "Married (1 = married)",
      term == "AGEQ" ~ "Age",
      term == "AGEQSQ" ~ "Age-squared",
      TRUE ~ term
    ),
    p.sig = case_when(
      p.value < 0.001 ~ "***",
      p.value < 0.01 ~ "**",
      p.value < 0.05 ~ "*",
      TRUE ~ ""
    ),
    value = ak91_coef_cell(estimate, std.error, p.sig)
  ) %>%
  select(term, model, value) %>%
  pivot_wider(
    names_from = model,
    values_from = value
  ) %>%
  # reorder the term
  mutate(
    term = factor(
      term,
      levels = c("Years of education", "Race (1 = black)", "SMSA (1 = center city)",
                "Married (1 = married)", "Age", "Age-squared"))
  ) %>%
  arrange(term) %>%
  rename_all(~ all_of(c("Independent Variables", "(1) <br>OLS", "(2) <br>TSLS",
                       "(3) <br>OLS", "(4) <br>TSLS", "(5) <br>OLS", "(6) <br>TSLS",
                       "(7) <br>OLS", "(8) <br>TSLS")))

# Create the table using kableExtra
kbl_out <- table_data %>%
  kbl(
    format = "html", escape = FALSE,
    scroll = TRUE,
    align = c("l", "c", "c", "c", "c", "c", "c", "c", "c", "c"),
    caption = "Table VI: OLS and TSLS Estimates of the Return to Education for Men Born 1940-1949: 1980 Census"
    ) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
  footnote(
    general = "Yes-No Dummies are not kept in the shown figure.",
    number = c(
      "$^{*}$ p<0.05; $^{**}$ p<0.01; $^{***}$ p<0.001",
      "The dependent variable is the log of weekly wage. The sample includes men born 1940-1949 in the 1980 Census.",
      "The excluded instrumental variables for all TSLS models are not shown in the table and are the same which are the 27 interaction terms between the quarter of birth and year of birth $QTR_j \\times YR_i$, $i \\in \\{0, 1, \\dots, 9\\}$ and $j \\in \\{1, 2, 3\\}$."
    )
  )

# Create the table using gt
table_data <- ak91_quarto_blank_df(table_data)
gt_tbl <- table_data %>%
  gt() %>%
  # Set title
  tab_header(
    title = "Table VI: OLS and TSLS Estimates of the Return to Education for Men Born 1940-1949: 1980 Census"
  ) %>%
  # Set column alignment
  cols_align(
    align = "left",
    columns = 1
  ) %>%
  cols_align(
    align = "center",
    columns = 2:9
  ) %>%
  # Add footnotes
  tab_footnote(
    footnote = "Yes-No Dummies are not kept in the shown figure.",
    locations = cells_title(groups = "title")
  ) %>%
  tab_footnote(
    footnote = md("$^{*}p<0.05; ^{**}p<0.01; ^{***}p<0.001$"),
    locations = cells_title(groups = "title")
  ) %>%
  tab_footnote(
    footnote = "The dependent variable is the log of weekly wage. The sample includes men born 1940-1949 in the 1980 Census.",
    locations = cells_title(groups = "title")
  ) %>%
  tab_footnote(
    footnote = md("The excluded instrumental variables for all TSLS models are not shown in the table and are the same which are the 27 interaction terms between the quarter of birth and year of birth $QTR_j \\times YR_i$, $i \\in \\{0, 1, \\dots, 9\\}$ and $j \\in \\{1, 2, 3\\}$."),
    locations = cells_title(groups = "title")
  ) %>%
  # Set table style
  opt_row_striping() %>%
  tab_options(
    table.font.size = px(14),
    heading.background.color = "#e8e8e8",
    column_labels.font.weight = "bold",
    table.border.top.width = px(3),
    table.border.bottom.width = px(3),
    container.width = pct(100),
    table.width = px(100)
  ) %>%
  # Ensure math symbols render correctly
  fmt_markdown(columns = everything()) %>%
  # Ensure all column headers use gt package's html() syntax
  cols_label(.list = ak91_gt_cols_label(names(table_data)))

# Save the objects to RData file
save(table_data, gt_tbl, file = here("04-topics/rep-ak1991/Rcode/Table_VI.RData"))