# Load required packages---- library(here) library(tidyverse) library(haven) # For reading Stata data library(broom) # For tidying regression results library(knitr) # For generating tables library(kableExtra) library(systemfit) # For seemingly unrelated regression library(gt) source(here("04-topics/rep-ak1991/Rcode/ak1991-gt-quarto.R")) # Read the data raw_data <- read_dta("D:/github/course-em-eng/04-topics/rep-ak1991/stata/raw_data.dta") # Rename variables to match Stata code data <- raw_data %>% rename( AGE = v1, AGEQ = v2, EDUC = v4, ENOCENT = v5, ESOCENT = v6, LWKLYWGE = v9, MARRIED = v10, MIDATL = v11, MT = v12, NEWENG = v13, CENSUS = v16, SOB = v17, QOB = v18, RACE = v19, SMSA = v20, SOATL = v21, WNOCENT = v24, WSOCENT = v25, YOB = v27 ) # Adjust YOB for years >= 1900 data <- data %>% mutate(YOB = if_else(YOB >= 1900, YOB - 1900, YOB)) # Create YOB dummies for (i in 0:9) { data <- data %>% mutate(!!paste0("YR", i) := if_else(YOB %in% c(20 + i, 30 + i, 40 + i), 1, 0)) } # Create QOB dummies for (i in 1:4) { data <- data %>% mutate(!!paste0("QTR", i) := if_else(QOB == i, 1, 0)) } # Create QOB*YOB interaction dummies for (j in 1:3) { for (i in 0:9) { data <- data %>% mutate(!!paste0("QTR", j, "YR", i) := !!sym(paste0("QTR", j)) * !!sym(paste0("YR", i))) } } # Create COHORT variable data <- data %>% mutate( COHORT = case_when( YOB <= 29 & YOB >= 20 ~ 2029, YOB <= 39 & YOB >= 30 ~ 3039, YOB <= 49 & YOB >= 40 ~ 4049, TRUE ~ NA_real_ ), AGEQ = if_else(CENSUS == 80, AGEQ - 1900, AGEQ), AGEQSQ = AGEQ * AGEQ ) # Panel A: 1970 Census - Men Born 1920-1929 panel_a <- data %>% filter(COHORT == 2029) # Summary statistics for Panel A panel_a_summary <- panel_a %>% group_by(QTR1) %>% summarise( ln_wage_mean = mean(LWKLYWGE, na.rm = TRUE), educ_mean = mean(EDUC, na.rm = TRUE) ) # Regressions for Panel A # Wage regression wage_reg_a <- lm(LWKLYWGE ~ QTR1, data = panel_a) wage_reg_a_tidy <- tidy(wage_reg_a) # Education regression educ_reg_a <- lm(EDUC ~ QTR1, data = panel_a) educ_reg_a_tidy <- tidy(educ_reg_a) # Seemingly unrelated regression for Panel A sur_a <- systemfit(list( wage = LWKLYWGE ~ QTR1, educ = EDUC ~ QTR1 ), data = panel_a) # Calculate Wald estimate for Panel A # Note: In Stata, the standard error is calculated using nlcom command after sureg # We use the SUR results from systemfit to match Stata's calculation # From coef(sur_a), we can see: # wage_QTR1 is at index 2 (-0.008978875) # educ_QTR1 is at index 4 (-0.125555298) wald_a <- coef(sur_a)[2] / coef(sur_a)[4] # ratio of QTR1 coefficients # Extract the variance-covariance matrix from SUR # From vcov_sur_a, we can see: # wage_QTR1 variance is at [2,2] (9.070617e-06) # educ_QTR1 variance is at [4,4] (2.414637e-04) # wage_QTR1-educ_QTR1 covariance is at [2,4] (0.000000e+00) vcov_sur_a <- vcov(sur_a) # Calculate standard error using the delta method with SUR variance-covariance matrix # The formula follows Stata's nlcom command calculation: # SE = sqrt(Var(b1/b2)) = sqrt( # (Var(b1)/b2^2) + (b1^2*Var(b2)/b2^4) - (2*b1*Cov(b1,b2)/b2^3) # ) # where b1 is the coefficient of QTR1 in wage equation # and b2 is the coefficient of QTR1 in education equation wald_a_se <- sqrt( (vcov_sur_a[2,2] / (coef(sur_a)[4]^2)) + ((coef(sur_a)[2]^2) * vcov_sur_a[4,4] / (coef(sur_a)[4]^4)) - (2 * coef(sur_a)[2] * vcov_sur_a[2,4] / (coef(sur_a)[4]^3)) ) # OLS return to education for Panel A ols_a <- lm(LWKLYWGE ~ EDUC, data = panel_a) ols_a_tidy <- tidy(ols_a) # Panel B: 1980 Census - Men Born 1930-1939 panel_b <- data %>% filter(COHORT == 3039) # Summary statistics for Panel B panel_b_summary <- panel_b %>% group_by(QTR1) %>% summarise( ln_wage_mean = mean(LWKLYWGE, na.rm = TRUE), educ_mean = mean(EDUC, na.rm = TRUE) ) # Regressions for Panel B # Wage regression wage_reg_b <- lm(LWKLYWGE ~ QTR1, data = panel_b) wage_reg_b_tidy <- tidy(wage_reg_b) # Education regression educ_reg_b <- lm(EDUC ~ QTR1, data = panel_b) educ_reg_b_tidy <- tidy(educ_reg_b) # Seemingly unrelated regression for Panel B sur_b <- systemfit(list( wage = LWKLYWGE ~ QTR1, educ = EDUC ~ QTR1 ), data = panel_b) # Calculate Wald estimate for Panel B # Note: In Stata, the standard error is calculated using nlcom command after sureg # We use the SUR results from systemfit to match Stata's calculation # From coef(sur_b), we can see: # wage_QTR1 is at index 2 # educ_QTR1 is at index 4 wald_b <- coef(sur_b)[2] / coef(sur_b)[4] # ratio of QTR1 coefficients # Extract the variance-covariance matrix from SUR # From vcov_sur_b, we can see: # wage_QTR1 variance is at [2,2] # educ_QTR1 variance is at [4,4] # wage_QTR1-educ_QTR1 covariance is at [2,4] vcov_sur_b <- vcov(sur_b) # Calculate standard error using the delta method with SUR variance-covariance matrix # The formula follows Stata's nlcom command calculation: # SE = sqrt(Var(b1/b2)) = sqrt( # (Var(b1)/b2^2) + (b1^2*Var(b2)/b2^4) - (2*b1*Cov(b1,b2)/b2^3) # ) # where b1 is the coefficient of QTR1 in wage equation # and b2 is the coefficient of QTR1 in education equation wald_b_se <- sqrt( (vcov_sur_b[2,2] / (coef(sur_b)[4]^2)) + ((coef(sur_b)[2]^2) * vcov_sur_b[4,4] / (coef(sur_b)[4]^4)) - (2 * coef(sur_b)[2] * vcov_sur_b[2,4] / (coef(sur_b)[4]^3)) ) # OLS return to education for Panel B ols_b <- lm(LWKLYWGE ~ EDUC, data = panel_b) ols_b_tidy <- tidy(ols_b) # Create table data table_data <- tibble( "Variable" = c( "$ln$(wkly. wage)", "Education", "Wald est. of return to education", "OLS return to education", "$ln$(wkly. wage)", "Education", "Wald est. of return to education", "OLS return to education" ), "Born in 1st quarter of year" = c( ak91_fmt_num(panel_a_summary$ln_wage_mean[panel_a_summary$QTR1 == 1]), ak91_fmt_num(panel_a_summary$educ_mean[panel_a_summary$QTR1 == 1]), "", "", ak91_fmt_num(panel_b_summary$ln_wage_mean[panel_b_summary$QTR1 == 1]), ak91_fmt_num(panel_b_summary$educ_mean[panel_b_summary$QTR1 == 1]), "", "" ), "Born in 2nd, 3rd, or 4th quarter of year" = c( ak91_fmt_num(panel_a_summary$ln_wage_mean[panel_a_summary$QTR1 == 0]), ak91_fmt_num(panel_a_summary$educ_mean[panel_a_summary$QTR1 == 0]), "", "", ak91_fmt_num(panel_b_summary$ln_wage_mean[panel_b_summary$QTR1 == 0]), ak91_fmt_num(panel_b_summary$educ_mean[panel_b_summary$QTR1 == 0]), "", "" ), "Difference (std. error) (1)-(2)" = c( ak91_fmt_br_se(wage_reg_a_tidy$estimate[2], wage_reg_a_tidy$std.error[2]), ak91_fmt_br_se(educ_reg_a_tidy$estimate[2], educ_reg_a_tidy$std.error[2]), ak91_fmt_br_se(wald_a, wald_a_se), ak91_fmt_br_se(ols_a_tidy$estimate[2], ols_a_tidy$std.error[2]), ak91_fmt_br_se(wage_reg_b_tidy$estimate[2], wage_reg_b_tidy$std.error[2]), ak91_fmt_br_se(educ_reg_b_tidy$estimate[2], educ_reg_b_tidy$std.error[2]), ak91_fmt_br_se(wald_b, wald_b_se), ak91_fmt_br_se(ols_b_tidy$estimate[2], ols_b_tidy$std.error[2]) ) ) # Create the table using kableExtra kbl_out <- table_data %>% kbl( format = "html", align = c('l', 'c', 'c', 'c'), caption = "Table 3: Wald Estimates", escape = FALSE ) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>% add_header_above(c(" " = 1, "(1)" = 1, "(2)" = 1, "(3)" = 1)) %>% add_header_above(c("PANEL A: WALD ESTIMATES FOR 1970 CENSUS - MEN BORN 1920-1929" = 4)) %>% pack_rows("PANEL B: WALD ESTIMATES FOR 1980 CENSUS - MEN BORN 1930-1939", 5, 8) %>% footnote( general = c( "The results of standard errors of Wald estimates is different from Stata's nlcom command, eg. R result is 0.02556816 in panel A while Stata's result is 0218682." ) ) # Create the table using gt table_data <- ak91_quarto_blank_df(table_data) gt_tbl <- table_data %>% gt() %>% # Set title tab_header( title = "Table 3: Wald Estimates" ) %>% # Set column alignment cols_align( align = "left", columns = 1 ) %>% cols_align( align = "center", columns = 2:4 ) %>% # Add top panel header # tab_spanner( # label = "PANEL A: WALD ESTIMATES FOR 1970 CENSUS - MEN BORN 1920-1929", # columns = c("Variable", "Born in 1st quarter of year", "Born in 2nd, 3rd, or 4th quarter of year", "Difference (std. error) (1)-(2)"), # level = 2 # ) %>% # Add column numbers as headers tab_spanner( label = "(1)", columns = "Born in 1st quarter of year" ) %>% tab_spanner( label = "(2)", columns = "Born in 2nd, 3rd, or 4th quarter of year" ) %>% tab_spanner( label = "(3)", columns = "Difference (std. error) (1)-(2)" ) %>% # Define Panel B before Panel A so gt displays Panel A first (paper order) tab_row_group( label = "PANEL B: WALD ESTIMATES FOR 1980 CENSUS - MEN BORN 1930-1939", rows = 5:8 ) %>% tab_row_group( label = "PANEL A: WALD ESTIMATES FOR 1970 CENSUS - MEN BORN 1920-1929", rows = 1:4 ) %>% # Add footnote tab_footnote( footnote = "The results of standard errors of Wald estimates is different from Stata's nlcom command, eg. R result is 0.02556816 in panel A while Stata's result is 0218682.", locations = cells_title(groups = "title") ) %>% # Set table style opt_row_striping() %>% tab_options( table.font.size = px(14), heading.background.color = "#e8e8e8", column_labels.font.weight = "bold", table.border.top.width = px(3), table.border.bottom.width = px(3), row_group.background.color = "#f8f8f8", row_group.font.weight = "bold", quarto.disable_processing = TRUE ) %>% # Ensure math symbols render correctly fmt_markdown(columns = everything()) %>% # Ensure all column headers use gt package's html() syntax cols_label(.list = ak91_gt_cols_label(names(table_data))) # Save the objects to RData file save(table_data, gt_tbl, file = here("04-topics/rep-ak1991/Rcode/Table_III.RData"))