From 665b7944564a785a8e6488c3e407da69731c648e Mon Sep 17 00:00:00 2001 From: jori-kandra Date: Thu, 14 May 2026 14:54:43 -0400 Subject: [PATCH 1/2] April 2026 update --- code/ado/process_rawbasic.ado | 2 +- documentation/docs/changes/changelog.md | 3 +++ documentation/mkdocs.yml | 4 ++-- master.do | 6 +++--- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/code/ado/process_rawbasic.ado b/code/ado/process_rawbasic.ado index 23e71e5..8741a8c 100644 --- a/code/ado/process_rawbasic.ado +++ b/code/ado/process_rawbasic.ado @@ -128,7 +128,7 @@ if `datenum' >= tm(2022m10) & `datenum' <= tm(2024m5) { * determine dictionary/NBER do-file to use * June 2024 - present -if tm(2024m6) <= `datenum' & `datenum' <= tm(2026m3) local nberprogname cpsbjun2024 +if tm(2024m6) <= `datenum' & `datenum' <= tm(2026m4) local nberprogname cpsbjun2024 * March 2021 - May 2024 if tm(2021m3) <= `datenum' & `datenum' <= tm(2024m5) local nberprogname cpsbmar2021 * January 2020 - February 2021 diff --git a/documentation/docs/changes/changelog.md b/documentation/docs/changes/changelog.md index d0a0a7d..57b7cb5 100644 --- a/documentation/docs/changes/changelog.md +++ b/documentation/docs/changes/changelog.md @@ -6,6 +6,9 @@ If you use the EPI extracts for your research, please cite them as Economic Policy Institute. {{ year }}. Current Population Survey Extracts, Version {{ version }}, https://microdata.epi.org. 
## Recent changes +### Version 2026.5.14 -- 2026-05-14 +#### Added +* April 2026 extracts ### Version 2026.4.13 -- 2026-04-13 #### Added * March 2026 extracts diff --git a/documentation/mkdocs.yml b/documentation/mkdocs.yml index 399b110..37dbfaf 100644 --- a/documentation/mkdocs.yml +++ b/documentation/mkdocs.yml @@ -45,6 +45,6 @@ plugins: dev_addr: '127.0.0.1:8282' extra: - version: 2026.4.13 - latestdata: 2026m3 + version: 2026.5.14 + latestdata: 2026m4 year: 2026 diff --git a/master.do b/master.do index b341af2..77eff77 100644 --- a/master.do +++ b/master.do @@ -41,7 +41,7 @@ set trace off * DATA VERSION ******************************************************************************* * The version is saved in the dataset labels and notes -global dataversion 2026.4.13 +global dataversion 2026.5.14 ******************************************************************************* @@ -85,11 +85,11 @@ adopath ++ ${code}ado * process the raw data and convert it to Stata format * this is only necessary for additional months of data * process_rawmarch, begin(1998) end(2018) -*process_rawbasic, begin(2026m3) end(2026m3) +process_rawbasic, begin(2025m5) end(2026m4) * create EPI's extracts from the processed raw data * creates both basic monthly and ORG subsample -create_extracts, begin(2017m1) end(2026m3) +create_extracts, begin(1962m1) end(2026m4) do ${code}tc_fix.do * create documentation From 5cf559bd63a34162f71bcf214b7e58f0cda8fa04 Mon Sep 17 00:00:00 2001 From: jori-kandra Date: Fri, 15 May 2026 11:23:57 -0400 Subject: [PATCH 2/2] fix veteran for CPS basic (1976-1981) --- .gitignore | 5 +- CLAUDE.md | 268 ++++++++++++++++++++++++ code/variables/generate_veteran.do | 10 +- documentation/docs/changes/changelog.md | 2 + extracts_to_feather.R | 5 +- 5 files changed, 283 insertions(+), 7 deletions(-) create mode 100644 CLAUDE.md diff --git a/.gitignore b/.gitignore index 966a8bd..9124500 100644 --- a/.gitignore +++ b/.gitignore @@ -18,4 +18,7 @@ pension_health.R *.smcl 
*.feather -tmpdata.do \ No newline at end of file +tmpdata.do + +test_veteran_change.do +check_vet_raw.do diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..e070352 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,268 @@ +# EPI CPS Microdata Extracts — CLAUDE.md + +## Project Overview + +This project produces uniform, cleaned extracts of the Current Population Survey (CPS) for public release at https://microdata.epi.org. It covers four CPS samples: + +| Sample | Years | Source | +|--------|-------|--------| +| **Basic Monthly** | 1976–present | Unicon (pre-1994), Census/BLS raw ASCII (1994+) | +| **May** | 1973–1981 | Unicon | +| **ORG** (Outgoing Rotation Group) | 1979–present | Unicon (pre-1994), Census (1994+) | +| **March (ASEC)** | 1962–present | Unicon (pre-1998), Census (1998+) | + +Output files: `epi_cps{basic|march|may|org}_{year}.{dta|feather}` in `extracts/`. + +--- + +## Tech Stack + +- **Primary language:** Stata (`.do`, `.ado`) +- **R:** used only for `.dta → .feather` conversion (`extracts_to_feather.R`, 6-core parallel) +- **Documentation:** MkDocs + Material theme (source: `documentation/docs/`) +- **Deployment:** AWS S3 via `aws s3 sync`, `rsync` for internal data dirs +- **Orchestration:** `Makefile` (not `targets`/`make` for data; Stata is called manually) + +--- + +## Directory Structure + +``` +master.do # Entry point — sets globals, loads programs, example calls +Makefile # Release pipeline (feather conversion, docs, deploy) +extracts_to_feather.R # DTA → Feather converter (R, parallel) +suppdata/ + cpiurs_extended.csv # CPI-U-RS values used for wage trimming and deflation + state_geocodes.csv # State FIPS, Census codes, divisions, regions +code/ + ado/ # Stata programs loaded via adopath + create_extracts.ado # Main workhorse — month-by-month extract creation + process_rawbasic.ado # Converts Census/BLS raw ASCII to Stata (1994+) + load_epiextracts.ado # Public user-facing utility to load extracts + load_rawcps.ado # Dev utility 
to load raw CPS data + merge_rawextracts.ado # Dev utility: merge raw + processed for testing + topcode_impute.ado # Pareto-based top-code adjustment for wages + keepifexist.ado # Safely keep only existing variables + valuelabel2md.ado # Converts value labels to markdown tables + variables/ + generate_*.do # One script per variable (139 total) + dictionaries/ # NBER/Census format dicts for raw ASCII parsing (~100 files) + reweights/ # Reweight dictionaries + docs/ + variables_groups.csv # Master variable list with groups/categories + descriptions/ + shortdesc/*.md # One-line variable descriptions + longdesc/*.md # Full variable documentation + availability/*.md # Year/sample availability per variable + analysis/ + *_analysis.do # Webdoc scripts generating analysis figures for docs + createdocs.do # Generates MkDocs markdown from Stata/CSV sources +documentation/ + mkdocs.yml # Site config (version, latestdata, theme) + docs/ # MkDocs source pages + variables/ # Auto-generated variable doc pages (10 subdirs) + changes/changelog.md # Release changelog (versioned) + methodology/ # Technical methodology docs +packages/stata/ # Stata package files for public distribution +``` + +--- + +## Global Variables Set by `master.do` + +```stata +global dataversion "2026.5.14" /* YYYY.M.DD — embedded in dataset labels/notes */ +global code "code/" +global extracts "extracts/" +global suppdata "suppdata/" +global codevars "code/variables/" +global codedocs "code/docs/" +global dictionaries "code/dictionaries/" +global reweights "code/dictionaries/reweights/" +global docs "documentation/docs/" + +* Raw data paths (external — not in repo) +global censusbasicraw "/data/cps/basic/census/raw/" +global censusbasicstata "/data/cps/basic/census/stata/" +global censusmarchraw "/data/cps/march/census/raw/" +global censusmarchstata "/data/cps/march/census/stata/" +global uniconbasic "/data/cps/basic/unicon/" +global uniconmay "/data/cps/may/unicon/" +global uniconmarch 
"/data/cps/march/unicon/" +global uniconorg "/data/cps/org/unicon/" +``` + +--- + +## Pipeline: Creating Extracts + +### 1. Convert raw ASCII to Stata (Census data, 1994+) +```stata +do master.do +process_rawbasic, begin(2025m5) end(2026m4) +``` +Uses NBER dictionaries in `code/dictionaries/` to parse fixed-width ASCII files from Census. + +### 2. Create EPI extracts (all samples, all years) +```stata +create_extracts, begin(1962m1) end(2026m4) +do ${code}tc_fix.do /* retroactive top-code corrections for 2023-2024 */ +``` +`create_extracts.ado` loops year-by-year and month-by-month, setting context globals then running each `generate_*.do` variable script. Output: one `.dta` per year per sample in `extracts/`. + +**Context globals set inside `create_extracts`:** +- `$monthlycps`, `$maycps`, `$marchcps` — which sample is active +- `$basicfile` — 1 if this month has a basic monthly file +- `$earnerinfo` — 1 if ORG/earnings data available +- `$date` — current month in Stata monthly date format + +### 3. Generate documentation +```stata +do code/docs/createdocs.do +``` +Uses `webdoc` to create MkDocs markdown from variable descriptions and analysis scripts. + +--- + +## Pipeline: Release + +Run via `make` from the project root: + +```makefile +all: createdocs deploywebdocs createfeather deploydata deploywebcode deploywebdata +``` + +| Target | Action | +|--------|--------| +| `createfeather` | `Rscript extracts_to_feather.R` — parallel DTA→Feather (6 cores) | +| `deploydata` | `rsync` `.dta`/`.feather` to `/data/cps/{sample}/epi/` | +| `createdocs` | `mkdocs build` in `documentation/` | +| `deploywebdocs` | `aws s3 sync documentation/site/ s3://microdata.epi.org/` | +| `deploywebcode` | Copy `.ado` files to S3 (Stata package distribution) | +| `deploywebdata` | Zip by decade, tar.gz Feather bundles, sync to S3 | + +--- + +## Variable System + +### 139 variables across 10 categories + +Each variable has its own `code/variables/generate_{varname}.do` script. 
The master list with group assignments is `code/docs/variables_groups.csv`. + +| Category | Count | Key Variables | +|----------|-------|---------------| +| Demographics | ~20 | `age`, `female`, `hispanic`, `wbho`, `wbhao`, `wbhom`, `married`, `veteran`, `citizen`, disability flags (`diffhear`, `diffmemory`, etc.) | +| Employment | ~20 | `lfstat`, `emp`, `unemp`, `union`, `unmem`, `uncov`, `selfemp`, `cow1`, `cow2`, `pubsec`, `pubst`, `pubfed`, `publoc` | +| Income/Wages | ~12 | `wage`, `wageotc`, `weekpay`, `wage_noadj`, `earnhour`, `otcamt`, `faminc`, `a_weekpay`, `tc_weekpay` | +| Hours | ~10 | `hoursu1`, `hoursu2`, `hourslw1`, `hourslw2`, `hoursumay`, `hoursvary`, `hoursu1i` (imputed), `hourslwtw` (telework) | +| Education | 4 | `educ`, `gradecom`, `gradehi`, `gradeatn` | +| Geography | ~8 | `statefips`, `division`, `region`, `countyfips`, `cbsafips`, `cbsasize`, `metstat` | +| Industry | ~10 | `indcode`, `ind70/80/90/02/07/12/17/22`, `mind03`, `mind16`, `dind03` | +| Occupation | ~10 | `occcode`, `occ70/80/90/00/10/18`, `mocc03`, `mocc10`, `docc03` | +| Identifiers | ~15 | `hrhhid`, `hhid`, `famid`, `year`, `month`, `hhseq`, `pernum` | +| Weights | ~12 | `finalwgt`, `basicwgt`, `cmpwgt`, `orgwgt`, `hhwgt`, `famwgt`, `earnwgt` | + +### Variable script conventions + +Scripts check context globals to handle era-specific source variable names: +```stata +* typical pattern in a generate_*.do script +if $monthlycps { + if $date < tm(1994m1) { + * Unicon era variable names + gen wage = prernwa ... + } + else { + * Census era variable names + gen wage = pternwa ... 
+ } +} +``` + +--- + +## Key Technical Details + +### CPI adjustment +- Uses **CPI-U-RS** (extended) from `suppdata/cpiurs_extended.csv` +- Base year: **1989** +- Wage trimming bounds: `wage_lower = $0.50 × CPI_ratio`, `wage_upper = $100.00 × CPI_ratio` +- 2026 CPI: projected using CBO's 2025–2026 growth rate (applied in `create_extracts.ado`) + +### Top-code handling +| Period | Method | +|--------|--------| +| Pre-April 2023 | Fixed ceiling ($2,884.61/week); Pareto imputation via `topcode_impute.ado` | +| April 2023–March 2024 | Dynamic Census top-codes (phased rollout by `minsamp`); `tc_fix.do` applies retroactive corrections | +| April 2024+ | No adjustment needed (dynamic codes fully standard) | + +### Hours imputation (`hoursu1i`) +For non-hourly workers (`paidhre == 0`) who report "hours vary," hours are imputed via regression on demographics and industry. Used in wage calculation. + +### Race/ethnicity coding evolution +- `wbho`: White, Black, Hispanic, Other (1989+) +- `wbhao`: adds Asian category (1989+) +- `wbhom`: multiracial-aware version (2003+) +- `wbo_only`, `wbho_only`: single-race only variants + +### Industry/occupation over time +Census reclassifies codes periodically. Both raw Census codes (`ind02`, `ind07`, etc.) and harmonized codes (`mind16`, `mocc10`) are included. Harmonized codes require manual crosswalk maintenance. 
+ +--- + +## Stata Dependencies (must be installed) + +```stata +ssc install webdoc, replace +ssc install gtools +ssc install ashell, replace +ssc install moreobs, replace +ssc install maptile, replace +ssc install spmap, replace +ssc install labutil, replace +maptile_install using "http://files.michaelstepner.com/geo_statehex.zip" +maptile_install using "http://files.michaelstepner.com/geo_cbsa2013.zip" +maptile_install using "http://files.michaelstepner.com/geo_state.zip" +maptile_install using "http://files.michaelstepner.com/geo_county2014.zip" +``` + +Also requires: Python `tabulate` module, EPI CPI Stata package (`github.com/Economic/cpi`). + +--- + +## Versioning & Release Cadence + +- **Version format:** `YYYY.M.DD` (e.g., `2026.5.14` = May 14, 2026) +- **Frequency:** Monthly, typically ~2 weeks after Census releases new CPS data +- **Version is set** in `master.do` (`global dataversion`) and `documentation/mkdocs.yml` (`extra.version`, `extra.latestdata`) +- **Changelog:** `documentation/docs/changes/changelog.md` — must be updated with every release +- **Critical bugs** may trigger retroactive updates across prior releases (e.g., the 2023–2024 top-code fix) + +### Steps to update for a new month +1. Update `global dataversion` in `master.do` +2. Update `version` and `latestdata` in `documentation/mkdocs.yml` +3. `process_rawbasic, begin(...) end(...)` for new months +4. `create_extracts, begin(1962m1) end(...)` (or targeted date range) +5. `do ${code}tc_fix.do` if applicable +6. `do code/docs/createdocs.do` +7. 
`make` (or individual make targets as needed) + +--- + +## Development & Testing + +- **Test template:** `code/variables/test_variable.do` — loads a date range, merges raw+processed via `merge_rawextracts.ado`, runs crosstabs +- **No automated test suite** — testing is manual spot-checks and crosstab review +- **Log file:** `test_create_extracts.log` — generated during test runs +- **Dev workflow:** use `merge_rawextracts.ado` to load a processed extract and merge back specific raw Census variables for comparison + +--- + +## Common Gotchas + +- **October 2025 is skipped** in `create_extracts.ado` (hardcoded `continue` for 2025m10) — BLS did not collect CPS household data for October 2025 because of the federal government shutdown, so there is no basic monthly file for that month +- **Unicon vs Census era** variable names differ; all `generate_*.do` scripts must handle both +- **`keepifexist.ado`** is used throughout because not all variables exist in all samples/years — use it instead of `keep varlist` to avoid errors +- **`adopath ++ ${code}ado`** in `master.do` must run before any program is called; always run `master.do` first +- **CPI 2026** is a projection, not official data — will need updating when BLS releases actuals +- The `.feather` files use Apache Arrow format; `extracts_to_feather.R` uses the `arrow` R package with `haven` for label preservation diff --git a/code/variables/generate_veteran.do b/code/variables/generate_veteran.do index c7c12ea..182853b 100644 --- a/code/variables/generate_veteran.do +++ b/code/variables/generate_veteran.do @@ -6,11 +6,13 @@ gen byte veteran = . 
if $monthlycps == 1 | $maycps == 1 { + * vet is a basic questionnaire item in the Unicon era, not ORG-specific, + * so this block does not require $earnerinfo + if tm(1973m1) <= $date & $date <= tm(1993m12) { + replace veteran = 0 if vet == 6 + replace veteran = 1 if 1 <= vet & vet <= 5 + } if $earnerinfo == 1 { - if tm(1973m1) <= $date & $date <= tm(1993m12) { - replace veteran = 0 if vet == 6 - replace veteran = 1 if 1 <= vet & vet <= 5 - } if tm(1994m12) <= $date & $date <= tm(2005m7) { replace veteran = 0 if peafwhen == 6 replace veteran = 1 if 1 <= peafwhen & peafwhen <= 5 diff --git a/documentation/docs/changes/changelog.md b/documentation/docs/changes/changelog.md index 57b7cb5..2619146 100644 --- a/documentation/docs/changes/changelog.md +++ b/documentation/docs/changes/changelog.md @@ -9,6 +9,8 @@ If you use the EPI extracts for your research, please cite them as ### Version 2026.5.14 -- 2026-05-14 #### Added * April 2026 extracts +#### Modified +* Fixed `veteran` to correctly populate for CPS Basic Monthly 1976–1981. Previously, `veteran` was missing for these years because the recode was inadvertently dependent on earner/ORG information being present. Thanks to Ben Zipperer for flagging this error! ### Version 2026.4.13 -- 2026-04-13 #### Added * March 2026 extracts diff --git a/extracts_to_feather.R b/extracts_to_feather.R index b891378..1ecf8ee 100644 --- a/extracts_to_feather.R +++ b/extracts_to_feather.R @@ -41,9 +41,10 @@ dta_to_feather <- function(x) { return(NULL) } -# grab filenames to convert +# grab filenames to convert, skipping any whose .feather is already newer than its .dta (pattern is a regex, not a glob) extractsdir <- "extracts" -files <- list.files(extractsdir, pattern = "*.dta", full.names = TRUE) +files <- list.files(extractsdir, pattern = "\\.dta$", full.names = TRUE) |> + Filter(\(x) { f <- sub("\\.dta$", ".feather", x); !file.exists(f) || file.mtime(f) < file.mtime(x) }, x = _) # process files in parallel registerDoParallel(cores=6)