# Load required packages----
library(here)
library(tidyverse)
library(haven)  # For reading Stata data
library(broom)  # For tidying regression results
library(knitr)  # For generating tables
library(kableExtra)
library(systemfit)  # For seemingly unrelated regression
library(gt)
source(here("04-topics/rep-ak1991/Rcode/ak1991-gt-quarto.R"))

# Read the data
raw_data <- read_dta("D:/github/course-em-eng/04-topics/rep-ak1991/stata/raw_data.dta")

# Rename variables to match Stata code
data <- raw_data %>%
  rename(
    AGE = v1,
    AGEQ = v2,
    EDUC = v4,
    ENOCENT = v5,
    ESOCENT = v6,
    LWKLYWGE = v9,
    MARRIED = v10,
    MIDATL = v11,
    MT = v12,
    NEWENG = v13,
    CENSUS = v16,
    SOB = v17,
    QOB = v18,
    RACE = v19,
    SMSA = v20,
    SOATL = v21,
    WNOCENT = v24,
    WSOCENT = v25,
    YOB = v27
  )

# Adjust YOB for years >= 1900
data <- data %>%
  mutate(YOB = if_else(YOB >= 1900, YOB - 1900, YOB))

# Create YOB dummies
for (i in 0:9) {
  data <- data %>%
    mutate(!!paste0("YR", i) := if_else(YOB %in% c(20 + i, 30 + i, 40 + i), 1, 0))
}

# Create QOB dummies
for (i in 1:4) {
  data <- data %>%
    mutate(!!paste0("QTR", i) := if_else(QOB == i, 1, 0))
}

# Create QOB*YOB interaction dummies
for (j in 1:3) {
  for (i in 0:9) {
    data <- data %>%
      mutate(!!paste0("QTR", j, "YR", i) := !!sym(paste0("QTR", j)) * !!sym(paste0("YR", i)))
  }
}

# Create COHORT variable
data <- data %>%
  mutate(
    COHORT = case_when(
      YOB <= 29 & YOB >= 20 ~ 2029,
      YOB <= 39 & YOB >= 30 ~ 3039,
      YOB <= 49 & YOB >= 40 ~ 4049,
      TRUE ~ NA_real_
    ),
    AGEQ = if_else(CENSUS == 80, AGEQ - 1900, AGEQ),
    AGEQSQ = AGEQ * AGEQ
  )

# Panel A: 1970 Census - Men Born 1920-1929
panel_a <- data %>%
  filter(COHORT == 2029)

# Summary statistics for Panel A
panel_a_summary <- panel_a %>%
  group_by(QTR1) %>%
  summarise(
    ln_wage_mean = mean(LWKLYWGE, na.rm = TRUE),
    educ_mean = mean(EDUC, na.rm = TRUE)
  )

# Regressions for Panel A
# Wage regression
wage_reg_a <- lm(LWKLYWGE ~ QTR1, data = panel_a)
wage_reg_a_tidy <- tidy(wage_reg_a)

# Education regression
educ_reg_a <- lm(EDUC ~ QTR1, data = panel_a)
educ_reg_a_tidy <- tidy(educ_reg_a)

# Seemingly unrelated regression for Panel A
sur_a <- systemfit(list(
  wage = LWKLYWGE ~ QTR1,
  educ = EDUC ~ QTR1
), data = panel_a)

# Calculate Wald estimate for Panel A
# Note: In Stata, the standard error is calculated using nlcom command after sureg
# We use the SUR results from systemfit to match Stata's calculation
# From coef(sur_a), we can see:
# wage_QTR1 is at index 2 (-0.008978875)
# educ_QTR1 is at index 4 (-0.125555298)
wald_a <- coef(sur_a)[2] / coef(sur_a)[4]  # ratio of QTR1 coefficients

# Extract the variance-covariance matrix from SUR
# From vcov_sur_a, we can see:
# wage_QTR1 variance is at [2,2] (9.070617e-06)
# educ_QTR1 variance is at [4,4] (2.414637e-04)
# wage_QTR1-educ_QTR1 covariance is at [2,4] (0.000000e+00)
vcov_sur_a <- vcov(sur_a)

# Calculate standard error using the delta method with SUR variance-covariance matrix
# The formula follows Stata's nlcom command calculation:
# SE = sqrt(Var(b1/b2)) = sqrt(
#   (Var(b1)/b2^2) + (b1^2*Var(b2)/b2^4) - (2*b1*Cov(b1,b2)/b2^3)
# )
# where b1 is the coefficient of QTR1 in wage equation
# and b2 is the coefficient of QTR1 in education equation
wald_a_se <- sqrt(
  (vcov_sur_a[2,2] / (coef(sur_a)[4]^2)) +
  ((coef(sur_a)[2]^2) * vcov_sur_a[4,4] / (coef(sur_a)[4]^4)) -
  (2 * coef(sur_a)[2] * vcov_sur_a[2,4] / (coef(sur_a)[4]^3))
)

# OLS return to education for Panel A
ols_a <- lm(LWKLYWGE ~ EDUC, data = panel_a)
ols_a_tidy <- tidy(ols_a)

# Panel B: 1980 Census - Men Born 1930-1939
panel_b <- data %>%
  filter(COHORT == 3039)

# Summary statistics for Panel B
panel_b_summary <- panel_b %>%
  group_by(QTR1) %>%
  summarise(
    ln_wage_mean = mean(LWKLYWGE, na.rm = TRUE),
    educ_mean = mean(EDUC, na.rm = TRUE)
  )

# Regressions for Panel B
# Wage regression
wage_reg_b <- lm(LWKLYWGE ~ QTR1, data = panel_b)
wage_reg_b_tidy <- tidy(wage_reg_b)

# Education regression
educ_reg_b <- lm(EDUC ~ QTR1, data = panel_b)
educ_reg_b_tidy <- tidy(educ_reg_b)

# Seemingly unrelated regression for Panel B
sur_b <- systemfit(list(
  wage = LWKLYWGE ~ QTR1,
  educ = EDUC ~ QTR1
), data = panel_b)

# Calculate Wald estimate for Panel B
# Note: In Stata, the standard error is calculated using nlcom command after sureg
# We use the SUR results from systemfit to match Stata's calculation
# From coef(sur_b), we can see:
# wage_QTR1 is at index 2
# educ_QTR1 is at index 4
wald_b <- coef(sur_b)[2] / coef(sur_b)[4]  # ratio of QTR1 coefficients

# Extract the variance-covariance matrix from SUR
# From vcov_sur_b, we can see:
# wage_QTR1 variance is at [2,2]
# educ_QTR1 variance is at [4,4]
# wage_QTR1-educ_QTR1 covariance is at [2,4]
vcov_sur_b <- vcov(sur_b)

# Calculate standard error using the delta method with SUR variance-covariance matrix
# The formula follows Stata's nlcom command calculation:
# SE = sqrt(Var(b1/b2)) = sqrt(
#   (Var(b1)/b2^2) + (b1^2*Var(b2)/b2^4) - (2*b1*Cov(b1,b2)/b2^3)
# )
# where b1 is the coefficient of QTR1 in wage equation
# and b2 is the coefficient of QTR1 in education equation
wald_b_se <- sqrt(
  (vcov_sur_b[2,2] / (coef(sur_b)[4]^2)) +
  ((coef(sur_b)[2]^2) * vcov_sur_b[4,4] / (coef(sur_b)[4]^4)) -
  (2 * coef(sur_b)[2] * vcov_sur_b[2,4] / (coef(sur_b)[4]^3))
)

# OLS return to education for Panel B
ols_b <- lm(LWKLYWGE ~ EDUC, data = panel_b)
ols_b_tidy <- tidy(ols_b)

# Create table data
table_data <- tibble(
  "Variable" = c(
    "$ln$(wkly. wage)",
    "Education",
    "Wald est. of return to education",
    "OLS return to education",
    "$ln$(wkly. wage)",
    "Education",
    "Wald est. of return to education",
    "OLS return to education"
  ),
  "Born in 1st quarter of year" = c(
    ak91_fmt_num(panel_a_summary$ln_wage_mean[panel_a_summary$QTR1 == 1]),
    ak91_fmt_num(panel_a_summary$educ_mean[panel_a_summary$QTR1 == 1]),
    "",
    "",
    ak91_fmt_num(panel_b_summary$ln_wage_mean[panel_b_summary$QTR1 == 1]),
    ak91_fmt_num(panel_b_summary$educ_mean[panel_b_summary$QTR1 == 1]),
    "",
    ""
  ),
  "Born in 2nd, 3rd, or 4th quarter of year" = c(
    ak91_fmt_num(panel_a_summary$ln_wage_mean[panel_a_summary$QTR1 == 0]),
    ak91_fmt_num(panel_a_summary$educ_mean[panel_a_summary$QTR1 == 0]),
    "",
    "",
    ak91_fmt_num(panel_b_summary$ln_wage_mean[panel_b_summary$QTR1 == 0]),
    ak91_fmt_num(panel_b_summary$educ_mean[panel_b_summary$QTR1 == 0]),
    "",
    ""
  ),
  "Difference (std. error) (1)-(2)" = c(
    ak91_fmt_br_se(wage_reg_a_tidy$estimate[2], wage_reg_a_tidy$std.error[2]),
    ak91_fmt_br_se(educ_reg_a_tidy$estimate[2], educ_reg_a_tidy$std.error[2]),
    ak91_fmt_br_se(wald_a, wald_a_se),
    ak91_fmt_br_se(ols_a_tidy$estimate[2], ols_a_tidy$std.error[2]),
    ak91_fmt_br_se(wage_reg_b_tidy$estimate[2], wage_reg_b_tidy$std.error[2]),
    ak91_fmt_br_se(educ_reg_b_tidy$estimate[2], educ_reg_b_tidy$std.error[2]),
    ak91_fmt_br_se(wald_b, wald_b_se),
    ak91_fmt_br_se(ols_b_tidy$estimate[2], ols_b_tidy$std.error[2])
  )
)

# Create the table using kableExtra
kbl_out <- table_data %>%
  kbl(
    format = "html",
    align = c('l', 'c', 'c', 'c'),
    caption = "Table 3: Wald Estimates",
    escape = FALSE
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
  add_header_above(c(" " = 1, "(1)" = 1, "(2)" = 1, "(3)" = 1)) %>%
  add_header_above(c("PANEL A: WALD ESTIMATES FOR 1970 CENSUS - MEN BORN 1920-1929" = 4)) %>%
  pack_rows("PANEL B: WALD ESTIMATES FOR 1980 CENSUS - MEN BORN 1930-1939", 5, 8) %>%
  footnote(
    general = c(
      "The results of standard errors of Wald estimates is different from Stata's nlcom command, eg. R result is 0.02556816 in panel A while Stata's result is 0218682."
      )
  )

# Create the table using gt
table_data <- ak91_quarto_blank_df(table_data)
gt_tbl <- table_data %>%
  gt() %>%
  # Set title
  tab_header(
    title = "Table 3: Wald Estimates"
  ) %>%
  # Set column alignment
  cols_align(
    align = "left",
    columns = 1
  ) %>%
  cols_align(
    align = "center",
    columns = 2:4
  ) %>%
  # Add top panel header
  # tab_spanner(
  #   label = "PANEL A: WALD ESTIMATES FOR 1970 CENSUS - MEN BORN 1920-1929",
  #   columns = c("Variable", "Born in 1st quarter of year", "Born in 2nd, 3rd, or 4th quarter of year", "Difference (std. error) (1)-(2)"),
  #   level = 2
  # ) %>%
  # Add column numbers as headers
  tab_spanner(
    label = "(1)",
    columns = "Born in 1st quarter of year"
  ) %>%
  tab_spanner(
    label = "(2)",
    columns = "Born in 2nd, 3rd, or 4th quarter of year"
  ) %>%
  tab_spanner(
    label = "(3)",
    columns = "Difference (std. error) (1)-(2)"
  ) %>%
  # Define Panel B before Panel A so gt displays Panel A first (paper order)
  tab_row_group(
    label = "PANEL B: WALD ESTIMATES FOR 1980 CENSUS - MEN BORN 1930-1939",
    rows = 5:8
  ) %>%
  tab_row_group(
    label = "PANEL A: WALD ESTIMATES FOR 1970 CENSUS - MEN BORN 1920-1929",
    rows = 1:4
  ) %>%
  # Add footnote
  tab_footnote(
    footnote = "The results of standard errors of Wald estimates is different from Stata's nlcom command, eg. R result is 0.02556816 in panel A while Stata's result is 0218682.",
    locations = cells_title(groups = "title")
  ) %>%
  # Set table style
  opt_row_striping() %>%
  tab_options(
    table.font.size = px(14),
    heading.background.color = "#e8e8e8",
    column_labels.font.weight = "bold",
    table.border.top.width = px(3),
    table.border.bottom.width = px(3),
    row_group.background.color = "#f8f8f8",
    row_group.font.weight = "bold",
    quarto.disable_processing = TRUE
  ) %>%
  # Ensure math symbols render correctly
  fmt_markdown(columns = everything()) %>%
  # Ensure all column headers use gt package's html() syntax
  cols_label(.list = ak91_gt_cols_label(names(table_data)))

# Save the objects to RData file
save(table_data, gt_tbl, file = here("04-topics/rep-ak1991/Rcode/Table_III.RData"))