# Load required libraries
library(tidyverse)
library(haven)  # For reading Stata files
library(ivreg)  # For instrumental variables regression
library(knitr)  # For generating tables
library(kableExtra)
library(broom)  # For tidying model results
library(here)
library(gt)
source(here("04-topics/rep-ak1991/Rcode/ak1991-gt-quarto.R"))

# Read the data
data <- read_dta(here("04-topics/rep-ak1991/stata/raw_data.dta"))

# Rename variables
data <- data %>%
  rename(
    AGE = v1,
    AGEQ = v2,
    EDUC = v4,
    ENOCENT = v5,
    ESOCENT = v6,
    LWKLYWGE = v9,
    MARRIED = v10,
    MIDATL = v11,
    MT = v12,
    NEWENG = v13,
    CENSUS = v16,
    QOB = v18,
    RACE = v19,
    SMSA = v20,
    SOATL = v21,
    WNOCENT = v24,
    WSOCENT = v25,
    YOB = v27
  )

# Adjust YOB for years >= 1900
data <- data %>%
  mutate(YOB = if_else(YOB >= 1900, YOB - 1900, YOB))

# Create YOB dummies
for (i in 0:9) {
  data <- data %>%
    mutate(!!paste0("YR", i) := if_else(YOB %in% c(20 + i, 30 + i, 40 + i), 1, 0))
}

# Create QOB dummies
for (i in 1:4) {
  data <- data %>%
    mutate(!!paste0("QTR", i) := if_else(QOB == i, 1, 0))
}

# Create QOB*YOB interaction dummies
for (j in 1:3) {
  for (i in 0:9) {
    data <- data %>%
      mutate(!!paste0("QTR", j, "YR", i) := !!sym(paste0("QTR", j)) * !!sym(paste0("YR", i)))
  }
}

# Create COHORT variable and adjust AGEQ
data <- data %>%
  mutate(
    COHORT = case_when(
      YOB <= 29 & YOB >= 20 ~ 2029,
      YOB <= 39 & YOB >= 30 ~ 3039,
      YOB <= 49 & YOB >= 40 ~ 4049,
      TRUE ~ NA_real_
    ),
    AGEQ = if_else(CENSUS == 80, AGEQ - 1900, AGEQ),
    AGEQSQ = AGEQ * AGEQ
  )

# Select particular men born
# Filter data for COHORT >3000 and  COHORT < 3040
data <- data %>%
  filter(COHORT > 3000 & COHORT < 3040)

## Define excluded instrumental variables
iv_excluded <- c(
  "QTR1YR0", "QTR1YR1", "QTR1YR2", "QTR1YR3", "QTR1YR4", "QTR1YR5", "QTR1YR6", 
  "QTR1YR7", "QTR1YR8", "QTR1YR9", "QTR2YR0", "QTR2YR1", "QTR2YR2", "QTR2YR3", 
  "QTR2YR4", "QTR2YR5", "QTR2YR6", "QTR2YR7", "QTR2YR8", "QTR2YR9", "QTR3YR0", 
  "QTR3YR1", "QTR3YR2", "QTR3YR3", "QTR3YR4", "QTR3YR5", "QTR3YR6", "QTR3YR7", 
  "QTR3YR8", "QTR3YR9"
)

# Define regression formulas
formula1 <- LWKLYWGE ~ EDUC + YR0 + YR1 + YR2 + YR3 + YR4 + YR5 + YR6 + YR7 + YR8

formula2 <- LWKLYWGE ~ EDUC + YR0 + YR1 + YR2 + YR3 + YR4 + YR5 + YR6 + YR7 + YR8 | 
            QTR1YR0 + QTR1YR1 + QTR1YR2 + QTR1YR3 + QTR1YR4 + QTR1YR5 + 
            QTR1YR6 + QTR1YR7 + QTR1YR8 + QTR1YR9 + QTR2YR0 + QTR2YR1 + QTR2YR2 + 
            QTR2YR3 + QTR2YR4 + QTR2YR5 + QTR2YR6 + QTR2YR7 + QTR2YR8 + QTR2YR9 + 
            QTR3YR0 + QTR3YR1 + QTR3YR2 + QTR3YR3 + QTR3YR4 + QTR3YR5 + QTR3YR6 + 
            QTR3YR7 + QTR3YR8 + QTR3YR9 + YR0 + YR1 + YR2 + YR3 + YR4 + YR5 + 
            YR6 + YR7 + YR8

iv2_included <- c(
  "YR0", "YR1", "YR2", "YR3", "YR4", "YR5", "YR6", "YR7", "YR8"
)
iv2_all <- c(iv2_included, iv_excluded)

formula3 <- LWKLYWGE ~ EDUC + YR0 + YR1 + YR2 + YR3 + YR4 + YR5 + YR6 + YR7 + YR8 + 
            AGEQ + AGEQSQ
formula4 <- LWKLYWGE ~ EDUC + YR0 + YR1 + YR2 + YR3 + YR4 + YR5 + YR6 + YR7 + YR8 + 
            AGEQ + AGEQSQ | QTR1YR0 + QTR1YR1 + QTR1YR2 + QTR1YR3 + 
            QTR1YR4 + QTR1YR5 + QTR1YR6 + QTR1YR7 + QTR1YR8 + QTR1YR9 + QTR2YR0 + 
            QTR2YR1 + QTR2YR2 + QTR2YR3 + QTR2YR4 + QTR2YR5 + QTR2YR6 + QTR2YR7 + 
            QTR2YR8 + QTR2YR9 + QTR3YR0 + QTR3YR1 + QTR3YR2 + QTR3YR3 + QTR3YR4 + 
            QTR3YR5 + QTR3YR6 + QTR3YR7 + QTR3YR8 + QTR3YR9 + YR0 + YR1 + YR2 + 
            YR3 + YR4 + YR5 + YR6 + YR7 + YR8 + 
            AGEQ + AGEQSQ

iv4_included <- c(
  "YR0", "YR1", "YR2", "YR3", "YR4", "YR5", "YR6", "YR7", "YR8",
  "AGEQ", "AGEQSQ"
)
iv4_all <- c(iv4_included, iv_excluded)

formula5 <- LWKLYWGE ~ EDUC + RACE + MARRIED + SMSA + NEWENG + MIDATL + ENOCENT + 
            WNOCENT + SOATL + ESOCENT + WSOCENT + MT + YR0 + YR1 + YR2 + YR3 + 
            YR4 + YR5 + YR6 + YR7 + YR8
formula6 <- LWKLYWGE ~ EDUC + RACE + MARRIED + SMSA + NEWENG + MIDATL + ENOCENT + 
            WNOCENT + SOATL + ESOCENT + WSOCENT + MT + YR0 + YR1 + YR2 + YR3 + 
            YR4 + YR5 + YR6 + YR7 + YR8 | RACE + MARRIED + SMSA + NEWENG + MIDATL + ENOCENT + 
            WNOCENT + SOATL + ESOCENT + WSOCENT + MT + QTR1YR0 + QTR1YR1 + 
            QTR1YR2 + QTR1YR3 + QTR1YR4 + QTR1YR5 + QTR1YR6 + QTR1YR7 + QTR1YR8 + 
            QTR1YR9 + QTR2YR0 + QTR2YR1 + QTR2YR2 + QTR2YR3 + QTR2YR4 + QTR2YR5 + 
            QTR2YR6 + QTR2YR7 + QTR2YR8 + QTR2YR9 + QTR3YR0 + QTR3YR1 + QTR3YR2 + 
            QTR3YR3 + QTR3YR4 + QTR3YR5 + QTR3YR6 + QTR3YR7 + QTR3YR8 + QTR3YR9 + 
            YR0 + YR1 + YR2 + YR3 + YR4 + YR5 + YR6 + YR7 + YR8

iv6_included <- c(
  "YR0", "YR1", "YR2", "YR3", "YR4", "YR5", "YR6", "YR7", "YR8",
  "RACE", "MARRIED", "SMSA",
  "NEWENG", "MIDATL", "ENOCENT", "WNOCENT", "SOATL", "ESOCENT", "WSOCENT", "MT"
)
iv6_all <- c(iv6_included, iv_excluded)

formula7 <- LWKLYWGE ~ EDUC + RACE + MARRIED + SMSA + NEWENG + MIDATL + ENOCENT + 
            WNOCENT + SOATL + ESOCENT + WSOCENT + MT + YR0 + YR1 + YR2 + YR3 + 
            YR4 + YR5 + YR6 + YR7 + YR8 + AGEQ + AGEQSQ
formula8 <- LWKLYWGE ~ EDUC + RACE + MARRIED + SMSA + NEWENG + MIDATL + ENOCENT + 
            WNOCENT + SOATL + ESOCENT + WSOCENT + MT + AGEQ + AGEQSQ + 
            YR0 + YR1 + YR2 + YR3 + YR4 + YR5 + YR6 + YR7 + YR8 | RACE + MARRIED + SMSA + NEWENG + MIDATL + ENOCENT + 
            WNOCENT + SOATL + ESOCENT + WSOCENT + MT + AGEQ + AGEQSQ + 
            QTR1YR0 + QTR1YR1 + QTR1YR2 + QTR1YR3 + QTR1YR4 + QTR1YR5 + QTR1YR6 + 
            QTR1YR7 + QTR1YR8 + QTR1YR9 + QTR2YR0 + QTR2YR1 + QTR2YR2 + QTR2YR3 + 
            QTR2YR4 + QTR2YR5 + QTR2YR6 + QTR2YR7 + QTR2YR8 + QTR2YR9 + QTR3YR0 + 
            QTR3YR1 + QTR3YR2 + QTR3YR3 + QTR3YR4 + QTR3YR5 + QTR3YR6 + QTR3YR7 + 
            QTR3YR8 + QTR3YR9 + YR0 + YR1 + YR2 + YR3 + YR4 + YR5 + YR6 + YR7 + YR8

## included instrumental variables
iv8_included <- c(
  "YR0", "YR1", "YR2", "YR3", "YR4", "YR5", "YR6", "YR7", "YR8",
  "RACE", "MARRIED", "SMSA",
  "NEWENG", "MIDATL", "ENOCENT", "WNOCENT", "SOATL", "ESOCENT", "WSOCENT", "MT",
  "AGEQ", "AGEQSQ"
)
iv8_all <- c(iv8_included, iv_excluded)

# Run regressions
model1 <- lm(formula1, data = data)
# summary(model1)
model2 <- ivreg(formula2, data = data)
# summary(model2)
model3 <- lm(formula3, data = data)
# summary(model3)
model4 <- ivreg(formula4, data = data)
# summary(model4)
model5 <- lm(formula5, data = data)
# summary(model5)
model6 <- ivreg(formula6, data = data)
# summary(model6)
model7 <- lm(formula7, data = data)
# summary(model7)
model8 <- ivreg(formula8, data = data)
# summary(model8)

# Extract coefficients and standard errors
table_data <- bind_rows(
  broom::tidy(model1, conf.int = TRUE) %>% mutate(model = "model1"),
  broom::tidy(model2, conf.int = TRUE) %>% mutate(model = "model2"),
  broom::tidy(model3, conf.int = TRUE) %>% mutate(model = "model3"),
  broom::tidy(model4, conf.int = TRUE) %>% mutate(model = "model4"),
  broom::tidy(model5, conf.int = TRUE) %>% mutate(model = "model5"),
  broom::tidy(model6, conf.int = TRUE) %>% mutate(model = "model6"),
  broom::tidy(model7, conf.int = TRUE) %>% mutate(model = "model7"),
  broom::tidy(model8, conf.int = TRUE) %>% mutate(model = "model8")
) %>%
  filter(term %in% c("EDUC", "RACE", "SMSA", "MARRIED", "AGEQ", "AGEQSQ")) %>%
  mutate(
    term = case_when(
      term == "EDUC" ~ "Years of education",
      term == "RACE" ~ "Race (1 = black)",
      term == "SMSA" ~ "SMSA (1 = center city)",
      term == "MARRIED" ~ "Married (1 = married)",
      term == "AGEQ" ~ "Age",
      term == "AGEQSQ" ~ "Age-squared",
      TRUE ~ term
    ),
    p.sig = case_when(
      p.value < 0.001 ~ "***",
      p.value < 0.01 ~ "**",
      p.value < 0.05 ~ "*",
      TRUE ~ ""
    ),
    value = ak91_coef_cell(estimate, std.error, p.sig)
  ) %>%
  select(term, model, value) %>%
  pivot_wider(
    names_from = model,
    values_from = value
  ) %>%
  # reorder the term
  mutate(
    term = factor(
      term, 
      levels = c("Years of education", "Race (1 = black)", "SMSA (1 = center city)", 
                "Married (1 = married)", "Age", "Age-squared"))
  ) %>%
  arrange(term) %>%
  rename_all(~ all_of(c("Independent Variables", "(1) <br>OLS", "(2) <br>TSLS", 
                       "(3) <br>OLS", "(4) <br>TSLS", "(5) <br>OLS", "(6) <br>TSLS", 
                       "(7) <br>OLS", "(8) <br>TSLS")))

# Create the table using kableExtra
kbl_out <- table_data %>%
  kbl(
    format = "html", escape = FALSE,
    scroll = TRUE,
    align = c("l", "c", "c", "c", "c", "c", "c", "c", "c", "c"),
    caption = "Table V: OLS and TSLS Estimates of the Return to Education for Men Born 1930-1939: 1980 Census"
    ) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
  footnote(
    general = "Yes-No Dummies are not kept in the shown figure.",
    number = c(
      "$^{*}$ p<0.05; $^{**}$ p<0.01; $^{***}$ p<0.001",
      "The dependent variable is the log of weekly wage. The sample includes men born 1930-1939 in the 1980 Census.",
      "The excluded instrumental variables for all TSLS models are not shown in the table and are the same which are the 27 interaction terms between the quarter of birth and year of birth $QTR_j \\times YR_i$, $i \\in \\{0, 1, \\dots, 9\\}$ and $j \\in \\{1, 2, 3\\}$."
    )
  )

# Create the table using gt
table_data <- ak91_quarto_blank_df(table_data)
gt_tbl <- table_data %>%
  gt() %>%
  # Set title
  tab_header(
    title = "Table V: OLS and TSLS Estimates of the Return to Education for Men Born 1930-1939: 1980 Census"
  ) %>%
  # Set column alignment
  cols_align(
    align = "left",
    columns = 1
  ) %>%
  cols_align(
    align = "center",
    columns = 2:9
  ) %>%
  # Add footnotes
  tab_footnote(
    footnote = "Yes-No Dummies are not kept in the shown figure.",
    locations = cells_title(groups = "title")
  ) %>%
  tab_footnote(
    footnote = md("$^{*}p<0.05; ^{**}p<0.01; ^{***}p<0.001$"),
    locations = cells_title(groups = "title")
  ) %>%
  tab_footnote(
    footnote = "The dependent variable is the log of weekly wage. The sample includes men born 1930-1939 in the 1980 Census.",
    locations = cells_title(groups = "title")
  ) %>%
  tab_footnote(
    footnote = md("The excluded instrumental variables for all TSLS models are not shown in the table and are the same which are the 27 interaction terms between the quarter of birth and year of birth $QTR_j \\times YR_i$, $i \\in \\{0, 1, \\dots, 9\\}$ and $j \\in \\{1, 2, 3\\}$."),
    locations = cells_title(groups = "title")
  ) %>%
  # Set table style
  opt_row_striping() %>%
  tab_options(
    table.font.size = px(14),
    heading.background.color = "#e8e8e8",
    column_labels.font.weight = "bold",
    table.border.top.width = px(3),
    table.border.bottom.width = px(3),
    container.width = pct(100),
    table.width = pct(100)
  ) %>%
  # Ensure math symbols render correctly
  fmt_markdown(columns = everything()) %>%
  # Ensure all column headers use gt package's html() syntax
  cols_label(.list = ak91_gt_cols_label(names(table_data)))

# Save the objects to RData file
save(table_data, gt_tbl, file = here("04-topics/rep-ak1991/Rcode/Table_V.RData"))