# Card (1993) Table 1 – sample characteristics (cols 2–3 reproducible). library(tidyverse) library(here) library(gt) source(here("04-topics/rep-card1993/Rcode/card1993-data-prep.R")) source(here("04-topics/rep-card1993/Rcode/card1993-gt-quarto.R")) data <- load_card93_data() stopifnot(nrow(data) == 3613L) s2 <- data s3 <- card93_wage_sample(data) stopifnot(nrow(s3) == 3010L) pct <- function(x) 100 * mean(x, na.rm = TRUE) mean_num <- function(x) mean(x, na.rm = TRUE) region_ne <- function(d) pct(d$reg661 + d$reg662) region_midwest <- function(d) pct(d$reg663 + d$reg664) region_west <- function(d) pct(d$reg668 + d$reg669) region_south <- function(d) pct(d$south66) age_pct <- function(d, lo, hi) { pct(d$age66 >= lo & d$age66 <= hi) } section_row <- function(label) { tibble( stub = label, col1 = " ", col2 = " ", col3 = " ", is_header = TRUE ) } stat_row <- function(label, fn, digits = 1L, col1 = "–") { tibble( stub = label, col1 = col1, col2 = card93_fmt_num(fn(s2), digits), col3 = card93_fmt_num(fn(s3), digits), is_header = FALSE ) } # Section headers in stub column preserve @tbl-card-1 row order (no gt row_group reordering). rows <- bind_rows( section_row("1. Age Distribution in 1966:"), stat_row("Age 14-15 (%)", function(d) age_pct(d, 14, 15)), stat_row("Age 16-17", function(d) age_pct(d, 16, 17)), stat_row("Age 18-20", function(d) age_pct(d, 18, 20)), stat_row("Age 21-24", function(d) age_pct(d, 21, 24)), section_row("2. Regional Distribution in 1966:"), stat_row("Northeast (%)", region_ne), stat_row("Midwest", region_midwest), stat_row("South", region_south), stat_row("West", region_west), stat_row("3. Lived in SMSA 1966 (%)", function(d) pct(d$smsa66r)), stat_row("4. Lived Near 4-year College in 1966 (%)", function(d) pct(d$nearc4)), section_row("5. Family structure at Age 14:"), stat_row("Mother & Father (%)", function(d) pct(d$momdad14)), stat_row("Mother Only (%)", function(d) pct(d$sinmom14)), section_row("6. Average Parental Education"), stat_row("Mother's Education (yrs)", function(d) mean_num(d$momed), digits = 1L), stat_row("Father's Education (yrs)", function(d) mean_num(d$daded), digits = 1L), stat_row("7. Percent Black", function(d) pct(d$black)), stat_row("8. Average score on KWW Test", function(d) mean_num(d$kww), digits = 1L), stat_row("9. Interviewed in 1976 (%)", function(d) 100), stat_row("10. Mean Education in 1976", function(d) mean_num(d$ed76), digits = 1L), stat_row("11. Live in south in 1976 (%)", function(d) pct(d$reg76r)), tibble( stub = "12. Sample size", col1 = "5225", col2 = as.character(nrow(s2)), col3 = as.character(nrow(s3)), is_header = FALSE ) ) header_rows <- which(rows$is_header) paper_targets <- tribble( ~row, ~col2, ~col3, "Age 14-15", 25.3, 25.5, "Age 16-17", 23.8, 24.1, "Age 18-20", 26.1, 24.6, "Age 21-24", 26.7, 25.8, "Northeast", 20.0, 20.7, "Midwest", 26.3, 26.0, "South", 41.3, 41.4, "West", 12.5, 11.9, "SMSA 1966", 64.3, 65.0, "Near college", 67.8, 68.2, "Mom & Dad", 79.2, 78.9, "Mom only", 10.0, 10.1, "Mom ed", 10.4, 10.3, "Dad ed", 10.0, 10.0, "Black", 23.0, 23.0, "KWW", 33.5, 33.5, "Mean ed76", 13.2, 13.3, "South 1976", 40.0, 40.3, "N col2", 3613, NA_real_, "N col3", NA_real_, 3010 ) replication <- tibble( col2_n = nrow(s2), col3_n = nrow(s3), age_14_15_c2 = age_pct(s2, 14, 15), age_14_15_c3 = age_pct(s3, 14, 15), nearc4_c2 = pct(s2$nearc4), nearc4_c3 = pct(s3$nearc4), black_c2 = pct(s2$black), mean_ed_c2 = mean_num(s2$ed76), mean_ed_c3 = mean_num(s3$ed76) ) message("Table I replication (selected stats):") print(replication) table_data <- ak91_quarto_blank_df(rows |> select(stub, col1, col2, col3, is_header)) gt_tbl <- table_data |> gt() |> cols_hide(columns = is_header) |> cols_label( stub = "", col1 = html("(1)
Overall
NLS-YM"), col2 = html("(2)
1976 interview;
valid education"), col3 = html("(3)
Valid wage
& education") ) gt_tbl <- card93_gt_standard( gt_tbl, stub_col = "stub", data_cols = c("col1", "col2", "col3"), header_rows = header_rows ) gt_tbl <- gt_tbl |> card93_gt_source_notes( c( paste( "**Notes:** Means are based on all available valid observations", "in each subsample (Card, Table 1 notes)." ) ) ) |> card93_gt_col_footnote( "col1", paste( "Column (1) shows published values for the original NLS-YM cohort (N = 5225).", "The proximity extract (`nls.dat`) contains only the 1976 cross-section;", "column (1) cannot be fully replicated from the distributed data file." ) ) |> card93_gt_stub_footnote( table_data, "2. Regional Distribution in 1966:", paste( "Northeast = `reg661` + `reg662`; Midwest = `reg663` + `reg664`;", "West = `reg668` + `reg669`; South = `south66`." ) ) gt_tbl <- card93_gt_finalize(gt_tbl) save(table_data, gt_tbl, replication, paper_targets, file = here("04-topics/rep-card1993/Rcode/Table_I.RData"))