# Card (1993) Table 1 – sample characteristics (cols 2–3 reproducible).
library(tidyverse)
library(here)
library(gt)
source(here("04-topics/rep-card1993/Rcode/card1993-data-prep.R"))
source(here("04-topics/rep-card1993/Rcode/card1993-gt-quarto.R"))
data <- load_card93_data()
stopifnot(nrow(data) == 3613L)
s2 <- data
s3 <- card93_wage_sample(data)
stopifnot(nrow(s3) == 3010L)
pct <- function(x) 100 * mean(x, na.rm = TRUE)
mean_num <- function(x) mean(x, na.rm = TRUE)
region_ne <- function(d) pct(d$reg661 + d$reg662)
region_midwest <- function(d) pct(d$reg663 + d$reg664)
region_west <- function(d) pct(d$reg668 + d$reg669)
region_south <- function(d) pct(d$south66)
age_pct <- function(d, lo, hi) {
pct(d$age66 >= lo & d$age66 <= hi)
}
section_row <- function(label) {
tibble(
stub = label,
col1 = " ",
col2 = " ",
col3 = " ",
is_header = TRUE
)
}
stat_row <- function(label, fn, digits = 1L, col1 = "–") {
tibble(
stub = label,
col1 = col1,
col2 = card93_fmt_num(fn(s2), digits),
col3 = card93_fmt_num(fn(s3), digits),
is_header = FALSE
)
}
# Section headers in stub column preserve @tbl-card-1 row order (no gt row_group reordering).
rows <- bind_rows(
section_row("1. Age Distribution in 1966:"),
stat_row("Age 14-15 (%)", function(d) age_pct(d, 14, 15)),
stat_row("Age 16-17", function(d) age_pct(d, 16, 17)),
stat_row("Age 18-20", function(d) age_pct(d, 18, 20)),
stat_row("Age 21-24", function(d) age_pct(d, 21, 24)),
section_row("2. Regional Distribution in 1966:"),
stat_row("Northeast (%)", region_ne),
stat_row("Midwest", region_midwest),
stat_row("South", region_south),
stat_row("West", region_west),
stat_row("3. Lived in SMSA 1966 (%)", function(d) pct(d$smsa66r)),
stat_row("4. Lived Near 4-year College in 1966 (%)", function(d) pct(d$nearc4)),
section_row("5. Family structure at Age 14:"),
stat_row("Mother & Father (%)", function(d) pct(d$momdad14)),
stat_row("Mother Only (%)", function(d) pct(d$sinmom14)),
section_row("6. Average Parental Education"),
stat_row("Mother's Education (yrs)", function(d) mean_num(d$momed), digits = 1L),
stat_row("Father's Education (yrs)", function(d) mean_num(d$daded), digits = 1L),
stat_row("7. Percent Black", function(d) pct(d$black)),
stat_row("8. Average score on KWW Test", function(d) mean_num(d$kww), digits = 1L),
stat_row("9. Interviewed in 1976 (%)", function(d) 100),
stat_row("10. Mean Education in 1976", function(d) mean_num(d$ed76), digits = 1L),
stat_row("11. Live in south in 1976 (%)", function(d) pct(d$reg76r)),
tibble(
stub = "12. Sample size",
col1 = "5225",
col2 = as.character(nrow(s2)),
col3 = as.character(nrow(s3)),
is_header = FALSE
)
)
header_rows <- which(rows$is_header)
paper_targets <- tribble(
~row, ~col2, ~col3,
"Age 14-15", 25.3, 25.5,
"Age 16-17", 23.8, 24.1,
"Age 18-20", 26.1, 24.6,
"Age 21-24", 26.7, 25.8,
"Northeast", 20.0, 20.7,
"Midwest", 26.3, 26.0,
"South", 41.3, 41.4,
"West", 12.5, 11.9,
"SMSA 1966", 64.3, 65.0,
"Near college", 67.8, 68.2,
"Mom & Dad", 79.2, 78.9,
"Mom only", 10.0, 10.1,
"Mom ed", 10.4, 10.3,
"Dad ed", 10.0, 10.0,
"Black", 23.0, 23.0,
"KWW", 33.5, 33.5,
"Mean ed76", 13.2, 13.3,
"South 1976", 40.0, 40.3,
"N col2", 3613, NA_real_,
"N col3", NA_real_, 3010
)
replication <- tibble(
col2_n = nrow(s2),
col3_n = nrow(s3),
age_14_15_c2 = age_pct(s2, 14, 15),
age_14_15_c3 = age_pct(s3, 14, 15),
nearc4_c2 = pct(s2$nearc4),
nearc4_c3 = pct(s3$nearc4),
black_c2 = pct(s2$black),
mean_ed_c2 = mean_num(s2$ed76),
mean_ed_c3 = mean_num(s3$ed76)
)
message("Table I replication (selected stats):")
print(replication)
table_data <- ak91_quarto_blank_df(rows |> select(stub, col1, col2, col3, is_header))
gt_tbl <- table_data |>
gt() |>
cols_hide(columns = is_header) |>
cols_label(
stub = "",
col1 = html("(1)
Overall
NLS-YM"),
col2 = html("(2)
1976 interview;
valid education"),
col3 = html("(3)
Valid wage
& education")
)
gt_tbl <- card93_gt_standard(
gt_tbl,
stub_col = "stub",
data_cols = c("col1", "col2", "col3"),
header_rows = header_rows
)
gt_tbl <- gt_tbl |>
card93_gt_source_notes(
c(
paste(
"**Notes:** Means are based on all available valid observations",
"in each subsample (Card, Table 1 notes)."
)
)
) |>
card93_gt_col_footnote(
"col1",
paste(
"Column (1) shows published values for the original NLS-YM cohort (N = 5225).",
"The proximity extract (`nls.dat`) contains only the 1976 cross-section;",
"column (1) cannot be fully replicated from the distributed data file."
)
) |>
card93_gt_stub_footnote(
table_data,
"2. Regional Distribution in 1966:",
paste(
"Northeast = `reg661` + `reg662`; Midwest = `reg663` + `reg664`;",
"West = `reg668` + `reg669`; South = `south66`."
)
)
gt_tbl <- card93_gt_finalize(gt_tbl)
save(table_data, gt_tbl, replication, paper_targets, file = here("04-topics/rep-card1993/Rcode/Table_I.RData"))