# Load required packages---- library(here) library(tidyverse) library(haven) # For reading Stata data library(broom) # For tidying regression results library(knitr) # For generating tables library(kableExtra) library(gt) source(here("04-topics/rep-ak1991/Rcode/ak1991-gt-quarto.R")) # Set working directory #setwd("D:/github/course-em-eng/04-topics/rep-ak1991/stata") # Read data---- data <- read_dta(here("04-topics/rep-ak1991/stata/raw_data.dta")) # Rename variables---- data <- data %>% rename( AGE = v1, AGEQ = v2, EDUC = v4, ENOCENT = v5, ESOCENT = v6, LWKLYWGE = v9, MARRIED = v10, MIDATL = v11, MT = v12, NEWENG = v13, CENSUS = v16, QOB = v18, RACE = v19, SMSA = v20, SOATL = v21, WNOCENT = v24, WSOCENT = v25, YOB = v27 ) # View(head(data,1000)) # Data preprocessing ---- data <- data %>% mutate( # Adjust YOB (only if >= 1900). the cohort of YOB YOB = ifelse(YOB >= 1900, YOB - 1900, YOB), # Create YOB dummy variables. the year of each cohort. YR0 = ifelse(YOB == 20 | YOB == 30 | YOB == 40, 1, 0), YR1 = ifelse(YOB == 21 | YOB == 31 | YOB == 41, 1, 0), YR2 = ifelse(YOB == 22 | YOB == 32 | YOB == 42, 1, 0), YR3 = ifelse(YOB == 23 | YOB == 33 | YOB == 43, 1, 0), YR4 = ifelse(YOB == 24 | YOB == 34 | YOB == 44, 1, 0), YR5 = ifelse(YOB == 25 | YOB == 35 | YOB == 45, 1, 0), YR6 = ifelse(YOB == 26 | YOB == 36 | YOB == 46, 1, 0), YR7 = ifelse(YOB == 27 | YOB == 37 | YOB == 47, 1, 0), YR8 = ifelse(YOB == 28 | YOB == 38 | YOB == 48, 1, 0), YR9 = ifelse(YOB == 29 | YOB == 39 | YOB == 49, 1, 0), # Create QOB dummy variables QTR1 = ifelse(QOB == 1, 1, 0), QTR2 = ifelse(QOB == 2, 1, 0), QTR3 = ifelse(QOB == 3, 1, 0), QTR4 = ifelse(QOB == 4, 1, 0), # Create YQ* variables ## year and quarter one YQ000 = QTR1 * YR0, YQ100 = QTR1 * YR1, YQ200 = QTR1 * YR2, YQ300 = QTR1 * YR3, YQ400 = QTR1 * YR4, YQ500 = QTR1 * YR5, YQ600 = QTR1 * YR6, YQ700 = QTR1 * YR7, YQ800 = QTR1 * YR8, YQ900 = QTR1 * YR9, ## year and quarter two YQ025 = QTR2 * YR0, YQ125 = QTR2 * YR1, YQ225 = QTR2 * YR2, YQ325 = QTR2 * YR3, YQ425 = QTR2 * YR4, YQ525 = QTR2 * YR5, YQ625 = QTR2 * YR6, YQ725 = QTR2 * YR7, YQ825 = QTR2 * YR8, YQ925 = QTR2 * YR9, ## year and quarter three YQ050 = QTR3 * YR0, YQ150 = QTR3 * YR1, YQ250 = QTR3 * YR2, YQ350 = QTR3 * YR3, YQ450 = QTR3 * YR4, YQ550 = QTR3 * YR5, YQ650 = QTR3 * YR6, YQ750 = QTR3 * YR7, YQ850 = QTR3 * YR8, YQ950 = QTR3 * YR9, ## year and quarter four YQ075 = QTR4 * YR0, YQ175 = QTR4 * YR1, YQ275 = QTR4 * YR2, YQ375 = QTR4 * YR3, YQ475 = QTR4 * YR4, YQ575 = QTR4 * YR5, YQ675 = QTR4 * YR6, YQ775 = QTR4 * YR7, YQ875 = QTR4 * YR8, YQ975 = QTR4 * YR9, # Create COHORT variable (following Stata's approach) COHORT = 2029, COHORT = ifelse(YOB <= 39 & YOB >= 30, 3039, COHORT), COHORT = ifelse(YOB <= 49 & YOB >= 40, 4049, COHORT), # Adjust AGEQ variable (following Stata's approach) AGEQ = ifelse(CENSUS == 80, AGEQ - 1900, AGEQ), #? AGEQSQ = AGEQ * AGEQ, # Create YQ variable following Stata's approach exactly YQ = 0 ) ## Create YQ variable following Stata's approach exactly---- for (j in 1:4) { for (i in 20:49) { data$YQ[data$YOB == i & data$QOB == j] <- 100 * i + 25 * (j - 1) } } ## Create education level dummy variables ---- data <- data %>% mutate( edu_all = ifelse(EDUC >= 0, 1, 0), hs_grad = ifelse(EDUC >= 12, 1, 0), bach_grad = ifelse(EDUC >= 16, 1, 0), ms_grad = ifelse(EDUC >= 18, 1, 0), doc_grad = ifelse(EDUC >= 20, 1, 0) ) # Define function to calculate moving average ---- calculate_ma <- function(df =data, cohort=3039, var_name="hs_grad", var_type="dep_var_d") { # Calculate mean for each YQ* if (var_type == "dep_var_s") { ma_data <- df %>% #dplyr::filter(YQ >= YQ_lwr, YQ <= YQ_upr) %>% dplyr::filter(COHORT == cohort) %>% # Specify age group group_by(YQ) %>% summarise(mean = mean(EDUC, na.rm = TRUE), n = n()) %>% arrange(YQ) %>% ungroup() } else if (var_type == "dep_var_d") { ma_data <- df %>% #dplyr::filter(YQ >= YQ_lwr, YQ <= YQ_upr) %>% dplyr::filter(COHORT == cohort) %>% # Specify age group group_by(YQ) %>% summarise(mean = mean(!!sym(var_name), na.rm = TRUE), n = n()) %>% arrange(YQ) %>% ungroup() } else { stop("Invalid var_type value.") } # Create result data frame result <- ma_data %>% #select(YQ, MA) %>% mutate( var.type = var_type, var = var_name, cohort.add = cohort ) return(result) } # Calculate moving averages for each education level---- ## MA for schooling years---- ma_result <- bind_rows( calculate_ma(data, 3039, "edu_all", var_type = "dep_var_s"), calculate_ma(data, 4049, "edu_all", var_type = "dep_var_s"), calculate_ma(data, 3039, "hs_grad", var_type = "dep_var_s"), calculate_ma(data, 4049, "hs_grad", var_type = "dep_var_s"), calculate_ma(data, 3039, "bach_grad", var_type = "dep_var_s"), calculate_ma(data, 4049, "bach_grad", var_type = "dep_var_s"), calculate_ma(data, 3039, "ms_grad", var_type = "dep_var_s"), calculate_ma(data, 4049, "ms_grad", var_type = "dep_var_s"), calculate_ma(data, 3039, "doc_grad", var_type = "dep_var_s"), calculate_ma(data, 4049, "doc_grad", var_type = "dep_var_s") ) # adjust MA result ma_result_adj <- ma_result %>% select(var, cohort.add, YQ, mean, n) %>% group_by(var) %>% mutate( lag2 = lag(mean, 2), lag1 = lag(mean, 1), lag0 = lag(mean, 0), lead1 = lead(mean, 1), lead2 = lead(mean, 2) ) %>% ungroup() %>% mutate( MA_adj = (lag2 + lag1 + lead1 + lead2) / 4 ) ## MA for dummy education---- ma_result_d <- bind_rows( calculate_ma(data, 3039, "edu_all", var_type = "dep_var_d"), calculate_ma(data, 4049, "edu_all", var_type = "dep_var_d"), calculate_ma(data, 3039, "hs_grad", var_type = "dep_var_d"), calculate_ma(data, 4049, "hs_grad", var_type = "dep_var_d"), calculate_ma(data, 3039, "bach_grad", var_type = "dep_var_d"), calculate_ma(data, 4049, "bach_grad", var_type = "dep_var_d"), calculate_ma(data, 3039, "ms_grad", var_type = "dep_var_d"), calculate_ma(data, 4049, "ms_grad", var_type = "dep_var_d"), calculate_ma(data, 3039, "doc_grad", var_type = "dep_var_d"), calculate_ma(data, 4049, "doc_grad", var_type = "dep_var_d") ) # adjust MA result ma_result_d_adj <- ma_result_d %>% select(var, cohort.add, YQ, mean, n) %>% group_by(var) %>% mutate( lag2 = lag(mean, 2), lag1 = lag(mean, 1), lag0 = lag(mean, 0), lead1 = lead(mean, 1), lead2 = lead(mean, 2) ) %>% ungroup() %>% mutate( MA_adj = (lag2 + lag1 + lead1 + lead2) / 4 ) # Create regression analysis function ---- run_regression <- function(df=data, cohort=3039, dep_var="bach_grad", dep_type = "dep_var_d") { if (dep_type == "dep_var_s") { # For dep_var_s case, MA dataset uses actual "years of education" ma_result_adj_target <- ma_result_adj } else if (dep_type == "dep_var_d") { # For dep_var_d case, MA dataset uses "education" dummy variable ma_result_adj_target <- ma_result_d_adj } else { stop("Invalid var_type value.") } # filter ma_data ma_data_filter <- ma_result_adj_target %>% ungroup() %>% dplyr::filter(var ==dep_var) %>% dplyr::filter(cohort.add == cohort) %>% select(YQ, MA_adj, cohort.add) # Set YQ lower and upper bounds # Set age group if (cohort == 3039) { YQ_lwr <- 3050 YQ_upr <- 3975 cohort_lwr <- 3000 cohort_upr <- 3049 } else if (cohort == 4049) { YQ_lwr <- 4000 YQ_upr <- 4925 cohort_lwr <- 4000 cohort_upr <- 9000 # Assuming 9000 is a reasonable upper bound } else { stop("Invalid cohort value.") } # Ensure data filtering is correct reg_data <- df %>% #dplyr::filter(!!sym(dep_var) ==1) %>% # Specify education level dplyr::filter(COHORT == cohort) %>% # Specify age group # left join ma data left_join( ., ma_data_filter %>% dplyr::filter(cohort.add == cohort) %>% select(-cohort.add), by = "YQ" ) %>% mutate(dep_var_s = EDUC - MA_adj ) %>% # Don't delete rows with NA MA values yet mutate(dep_var_d = !!sym(dep_var) - MA_adj) # Don't delete rows with NA MA values yet # Check group means ## Variable case: years of education cohort.mean <- reg_data %>% dplyr::filter(!!sym(dep_var) ==1) %>% # Specify education level select(EDUC,YQ, MA_adj, dep_var_s, QTR1, QTR2, QTR3) %>% dplyr::filter(YQ >= YQ_lwr & YQ <= YQ_upr) %>% # Remove some samples important! pull("EDUC") %>% mean() ## Variable case: education level dummy variable cohort.mean.dummy <- df %>% select(EDUC,YQ, QTR1, QTR2, QTR3,all_of(dep_var)) %>% dplyr::filter(YQ >= YQ_lwr & YQ <= YQ_upr) %>% # Remove some samples important! pull(dep_var) %>% mean(., na.rm = TRUE) if (dep_type == "dep_var_s") { # For dep_var_s case, dataset specifies education level ols_data <- reg_data %>% dplyr::filter(!!sym(dep_var) ==1) %>% # Specify education level dplyr::filter(!is.na(MA_adj)) # Remove rows with NA MA values # Output mean cohort.mean.out <- cohort.mean } else if (dep_type == "dep_var_d") { # For dep_var_d case, dataset does not specify education level ols_data <- reg_data %>% dplyr::filter(!is.na(MA_adj)) # Remove rows with NA MA values # Output mean cohort.mean.out <- cohort.mean.dummy } else { stop("Invalid var_type value.") } n.obs <- nrow(ols_data) cat("Cohort:", cohort, "Education level: ", dep_var, "Number of observations after filtering:", n.obs, "\n") # Run regression mod_str <- str_c(dep_type, "~ QTR1 + QTR2 + QTR3") model <- lm(formula(mod_str), data = ols_data # Remove rows with NA MA values here ) # Calculate F-test smry.mod <- summary(model) f_test<- smry.mod$fstatistic f_stat <- unname(f_test["value"]) f_d1 <- f_test["numdf"] f_d2 <- f_test["dendf"] f_pval <- unname(pf(f_stat, f_d1, f_d2, lower.tail = FALSE)) # Return regression results and F-test results list( dep.type = dep_type, mean = cohort.mean.out, obs = n.obs, coef = tidy(model) %>% filter(term != "(Intercept)"), f_test = list(f_stat = f_stat, f_pval = f_pval) ) } # Run all regressions ---- results <- list( # Total years of education model11 = run_regression(data, 3039, "edu_all", dep_type = "dep_var_s"), model12 = run_regression(data, 4049, "edu_all", dep_type = "dep_var_s"), # High school graduation model21 = run_regression(data, 3039, "hs_grad", dep_type = "dep_var_d"), model22 = run_regression(data, 4049, "hs_grad", dep_type = "dep_var_d"), # Year of High school graduation model31 = run_regression(data, 3039, "hs_grad", dep_type = "dep_var_s"), model32 = run_regression(data, 4049, "hs_grad", dep_type = "dep_var_s"), # Bachelor's degree model41 = run_regression(data, 3039, "bach_grad", dep_type = "dep_var_d"), model42 = run_regression(data, 4049, "bach_grad", dep_type = "dep_var_d"), # Master's degree model51 = run_regression(data, 3039, "ms_grad"), model52 = run_regression(data, 4049, "ms_grad"), # Doctoral degree model61 = run_regression(data, 3039, "doc_grad"), model62 = run_regression(data, 4049, "doc_grad") ) # Generate results table results_coef <- map_dfr(results, ~.x$coef, .id = "model") %>% select(model, term, estimate, std.error) %>% mutate( estimate_str = ak91_fmt_br_se(estimate, std.error) ) %>% select(-estimate, -std.error) %>% pivot_wider( names_from = term, values_from = estimate_str ) results_mean <- sapply(results, \(x) x$mean) %>% as.data.frame() %>% setNames(c("Mean")) %>% rownames_to_column("model") %>% mutate( Mean = ak91_fmt_num(Mean) ) results_Ftest <- sapply(results, \(x) x$f_test) %>% t() %>% as.data.frame() %>% rownames_to_column("model") %>% mutate( `F-test` = str_c( ak91_fmt_num(f_stat), "
[", ak91_fmt_num(f_pval), "]" ) ) %>% select(-f_stat, -f_pval) # Create formatted table ---- table_data <- tibble( "Outcome variable" = rep(c( "Total years of education", "High school graduate", "Years of educ. for high school graduates", "College graduates", "Completed master's degree", "Completed doctoral degree" ), each = 2), "Birth cohort" = rep(c("1930-1939", "1940-1949"), 6) ) %>% bind_cols( ., results_mean %>% select(-model), results_coef %>% select(-model), results_Ftest %>% select(-model) ) # Output formatted kableExtra table ---- kbl_out <- kableExtra::kbl(table_data, caption = "Table 1: The Effect of Quarter of Birth on Various Educational Outcome Variables", format = "html", escape = FALSE, align = c("l", "c", "c", "c", "c", "c", "c")) %>% kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% kableExtra::add_header_above(c(" " = 1, " " = 2, "Quarter-of-birth effect" = 3, " " =1)) # Output formatted gt table ---- table_data <- ak91_quarto_blank_df(table_data) gt_tbl <- table_data %>% gt() %>% # Set title tab_header( title = "Table 1: The Effect of Quarter of Birth on Various Educational Outcome Variables" ) %>% # Set column alignment cols_align( align = "left", columns = 1 ) %>% cols_align( align = "center", columns = 2:7 ) %>% # Add column spanners (similar to add_header_above in kableExtra) tab_spanner( label = "Quarter-of-birth effect", columns = c("QTR1", "QTR2", "QTR3") ) %>% # # Set table style opt_row_striping() %>% tab_options( table.font.size = px(14), heading.background.color = "#e8e8e8", column_labels.font.weight = "bold", table.border.top.width = px(3), table.border.bottom.width = px(3) ) %>% # Ensure math symbols render correctly fmt_markdown(columns = everything()) %>% # Ensure all column headers use gt package's html() syntax cols_label(.list = ak91_gt_cols_label(names(table_data))) # Save the objects to RData file save(table_data, gt_tbl, file = here("04-topics/rep-ak1991/Rcode/Table_I.RData"))