StatFunGen · danielnachun · Jun 24, 2026 · Jun 24, 2026
diff --git a/code/SoS/mnm_analysis/mnm_methods/mnm_regression.ipynb b/code/SoS/mnm_analysis/mnm_methods/mnm_regression.ipynb
@@ -660,8 +660,16 @@
     "# Optional genotype-derived covariates applied uniformly across contexts;\n",
     "# per-context covariates come from the manifest cov_path column.\n",
     "parameter: genotype_covariates = path('.')\n",
+    "# Construct-time variant/sample QC (stored on the QtlDataset; cutoff 0 = off).\n",
     "parameter: maf_cutoff = 0.0\n",
     "parameter: xvar_cutoff = 0.0\n",
+    "parameter: mac_cutoff = 0.0\n",
+    "parameter: imiss_cutoff = 0.0\n",
+    "# Drop indel variants (default keeps them, matching QtlDataset()).\n",
+    "parameter: drop_indel = False\n",
+    "# Optional whitespace-delimited ID files to restrict samples / variants (\".\" = none).\n",
+    "parameter: keep_samples = \".\"\n",
+    "parameter: keep_variants = \".\"\n",
     "output: f\"{cwd:a}/qtl_dataset/{study}.qtl_dataset.rds\"\n",
     "task: trunk_workers = 1, trunk_size = 1, walltime = walltime, mem = mem, cores = numThreads, tags = f\"{step_name}_{_output:bn}\"\n",
     "bash: expand = '${ }', stderr = f\"{_output}.stderr\", stdout = f\"{_output}.stdout\", container = container\n",
@@ -673,6 +681,11 @@
     "        ${'--transpose-covariates' if transpose_covariates else ''} \\\n",
     "        --maf-cutoff ${maf_cutoff} \\\n",
     "        --xvar-cutoff ${xvar_cutoff} \\\n",
+    "        --mac-cutoff ${mac_cutoff} \\\n",
+    "        --imiss-cutoff ${imiss_cutoff} \\\n",
+    "        ${'--drop-indel' if drop_indel else ''} \\\n",
+    "        ${('--keep-samples ' + str(keep_samples)) if keep_samples != '.' else ''} \\\n",
+    "        ${('--keep-variants ' + str(keep_variants)) if keep_variants != '.' else ''} \\\n",
     "        --output ${_output}"
    ]
   },
@@ -701,12 +714,43 @@
     "parameter: fine_mapping_coverage = 0.95\n",
     "# Comma-separated context names to restrict both passes to; empty = all contexts.\n",
     "parameter: contexts = \"\"\n",
+    "# Fine-mapping SER pre-screen (0 = off, <0 = adaptive 3/nVariants).\n",
+    "parameter: pip_cutoff_to_skip = 0.0\n",
+    "# TWAS weight-learning knobs.\n",
+    "parameter: min_twas_maf = 0.01\n",
+    "parameter: min_twas_xvar = 0.01\n",
+    "parameter: max_cv_variants = 5000\n",
+    "parameter: cv_folds = 5\n",
+    "parameter: cv_threads = 1\n",
+    "# Reproducibility: integer RNG seed; -1 = unset.\n",
+    "parameter: seed = -1\n",
+    "# Opt-in per-analysis overrides of the QtlDataset's construct-time filters,\n",
+    "# applied to both passes. Sentinels: a negative cutoff / drop_indel False /\n",
+    "# \".\" path = leave the dataset's stored value untouched.\n",
+    "parameter: maf_cutoff = -1.0\n",
+    "parameter: mac_cutoff = -1.0\n",
+    "parameter: xvar_cutoff = -1.0\n",
+    "parameter: imiss_cutoff = -1.0\n",
+    "parameter: drop_indel = False\n",
+    "parameter: keep_samples = \".\"\n",
+    "parameter: keep_variants = \".\"\n",
     "# Optional JSON of per-method kwargs forwarded to the pipelines, e.g.\n",
     "# '{\"susie\":{\"L\":10}}' for fine-mapping or '{\"lasso\":{\"nfolds\":10}}' for TWAS.\n",
     "parameter: fine_mapping_method_args = \"\"\n",
     "parameter: twas_method_args = \"\"\n",
     "fail_if(len(region_name) == 0, \"susie_twas: pass --region-name <gene-id> [<gene-id> ...] to choose which gene(s) to analyze.\")\n",
     "qtl_dataset = path(f\"{cwd:a}/qtl_dataset/{name}.qtl_dataset.rds\")\n",
+    "# Assemble the shared opt-in worker flags once (command construction only).\n",
+    "seed_arg = f\"--seed {seed}\" if seed >= 0 else \"\"\n",
+    "filter_overrides = \" \".join(filter(None, [\n",
+    "    f\"--maf-cutoff {maf_cutoff}\"       if maf_cutoff   >= 0 else \"\",\n",
+    "    f\"--mac-cutoff {mac_cutoff}\"       if mac_cutoff   >= 0 else \"\",\n",
+    "    f\"--xvar-cutoff {xvar_cutoff}\"     if xvar_cutoff  >= 0 else \"\",\n",
+    "    f\"--imiss-cutoff {imiss_cutoff}\"   if imiss_cutoff >= 0 else \"\",\n",
+    "    \"--drop-indel\"                     if drop_indel        else \"\",\n",
+    "    f\"--keep-samples {keep_samples}\"   if keep_samples  != \".\" else \"\",\n",
+    "    f\"--keep-variants {keep_variants}\" if keep_variants != \".\" else \"\",\n",
+    "]))\n",
     "input: qtl_dataset, for_each = \"region_name\"\n",
     "output: fine_mapping = f\"{cwd:a}/fine_mapping/{name}.{_region_name}.univariate_bvsr.rds\",\n",
     "        twas_weights = f\"{cwd:a}/twas_weights/{name}.{_region_name}.univariate_twas_weights.rds\"\n",
@@ -718,6 +762,8 @@
     "        --cis-window ${cis_window} \\\n",
     "        --methods ${fine_mapping_methods} \\\n",
     "        --coverage ${fine_mapping_coverage} \\\n",
+    "        --pip-cutoff-to-skip ${pip_cutoff_to_skip} \\\n",
+    "        ${seed_arg} ${filter_overrides} \\\n",
     "        ${(\"--contexts \" + contexts) if contexts else \"\"} \\\n",
     "        ${(\"--method-args '\" + fine_mapping_method_args + \"'\") if fine_mapping_method_args else \"\"} \\\n",
     "        --output ${_output[\"fine_mapping\"]}\n",
@@ -727,113 +773,18 @@
     "        --gene-id ${_region_name} \\\n",
     "        --cis-window ${cis_window} \\\n",
     "        --methods ${twas_methods} \\\n",
+    "        --min-twas-maf ${min_twas_maf} \\\n",
+    "        --min-twas-xvar ${min_twas_xvar} \\\n",
+    "        --max-cv-variants ${max_cv_variants} \\\n",
+    "        --cv-folds ${cv_folds} \\\n",
+    "        --cv-threads ${cv_threads} \\\n",
     "        --fine-mapping-result ${_output[\"fine_mapping\"]} \\\n",
+    "        ${seed_arg} ${filter_overrides} \\\n",
     "        ${(\"--contexts \" + contexts) if contexts else \"\"} \\\n",
     "        ${(\"--method-args '\" + twas_method_args + \"'\") if twas_method_args else \"\"} \\\n",
     "        --output ${_output[\"twas_weights\"]}"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "kernel": "SoS"
-   },
-   "outputs": [],
-   "source": [
-    "[ensemble_twas_weight]\n",
-    "# Ensemble multiple per-method TWAS weights (susie/susie_inf/mrash/enet/lasso/mcp/scad/l0learn/bayes_r/bayes_c)\n",
-    "# into a single stacked-regression ensemble weight, using pecotmr::ensemble_weights().\n",
-    "# Input: the *.univariate_twas_weights.rds produced by the [susie_twas] step (one per region/peak).\n",
-    "# Output: *.ensemble_twas_weights.rds, identical structure but with an added `ensemble_weights` column.\n",
-    "# Default -1e9 keeps every method (no R^2 filtering) so the ensemble always\n",
-    "# computes; raise this (e.g. 0.01) on real data to drop poorly-predicting methods.\n",
-    "parameter: ensemble_r2_threshold = -1e9\n",
-    "parameter: ensemble_solver = \"quadprog\"\n",
-    "parameter: ensemble_alpha = 1.0\n",
-    "depends: sos_variable(\"regional_data\")\n",
-    "meta_info = regional_data['meta_info']\n",
-    "input: regional_data[\"data\"], group_by = lambda x: group_by_region(x, regional_data[\"data\"]), group_with = \"meta_info\"\n",
-    "output_files = [f'{cwd:a}/twas_weights/{name}.{_meta_info[0].split(\":\")[0]}_{_meta_info[2]}.ensemble_twas_weights.rds']\n",
-    "output: output_files\n",
-    "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'\n",
-    "R: expand = '${ }', stdout = f\"{_output[0]:nn}.ensemble_twas_weight.stdout\", stderr = f\"{_output[0]:nn}.ensemble_twas_weight.stderr\", container = container\n",
-    "    options(warn=1)\n",
-    "    library(pecotmr)\n",
-    "    # ---- locate the upstream univariate_twas_weights.rds for this region ----\n",
-    "    region_tag <- \"${_meta_info[0].split(':')[0]}_${_meta_info[2]}\"\n",
-    "    in_rds <- paste0(\"${cwd:a}\", \"/twas_weights/\", \"${name}\", \".\", region_tag, \".univariate_twas_weights.rds\")\n",
-    "    if (!file.exists(in_rds)) stop(paste0(\"Missing upstream TWAS weights file: \", in_rds, \" -- run the susie_twas step first.\"))\n",
-    "    obj <- readRDS(in_rds)\n",
-    "    # ---- reload the regional phenotype y (needed as the stacking response) ----\n",
-    "    phenotype_files = c(${\",\".join(['\"%s\"' % x.absolute() for x in _input[1:len(_input)//2+1]])})\n",
-    "    covariate_files = c(${\",\".join(['\"%s\"' % x.absolute() for x in _input[len(_input)//2+1:]])})\n",
-    "    conditions = c(${\",\".join(['\"%s\"' % x for x in _meta_info[4:]])})\n",
-    "    fdat <- load_regional_univariate_data(genotype = ${_input[0]:anr},\n",
-    "                                          phenotype = phenotype_files,\n",
-    "                                          covariate = covariate_files,\n",
-    "                                          region = ${(\"'%s'\" % _meta_info[0]) if not trans_analysis else 'NULL'},\n",
-    "                                          association_window = \"${_meta_info[1]}\",\n",
-    "                                          conditions = conditions,\n",
-    "                                          maf_cutoff = ${maf},\n",
-    "                                          mac_cutoff = ${mac},\n",
-    "                                          imiss_cutoff = ${imiss},\n",
-    "                                          keep_indel = ${\"TRUE\" if indel else \"FALSE\"},\n",
-    "                                          extract_region_name = list(${\",\".join([(\"c('\"+x+\"')\") if isinstance(x, str) else (\"c\"+ str(x)) for x in _meta_info[3]])}),\n",
-    "                                          phenotype_header = ${\"4\" if int(_meta_info[0].split('-')[-1])>0 else \"1\"},\n",
-    "                                          region_name_col = ${\"4\" if int(_meta_info[0].split('-')[-1])>0 else \"1\"},\n",
-    "                                          scale_residuals = FALSE)\n",
-    "    # ---- per region in this object, run the ensemble ----\n",
-    "    ensemble_r2_threshold <- ${ensemble_r2_threshold}\n",
-    "    ensemble_solver <- \"${ensemble_solver}\"\n",
-    "    ensemble_alpha <- ${ensemble_alpha}\n",
-    "    for (rg in names(obj)) {\n",
-    "        for (ph in names(obj[[rg]])) {\n",
-    "            res <- obj[[rg]][[ph]]\n",
-    "            if (is.null(res$twas_cv_result) || is.null(res$twas_cv_result$performance) || is.null(res$twas_weights)) {\n",
-    "                message(\"Skipping \", rg, \"/\", ph, \": no twas_cv_result/twas_weights present.\"); next\n",
-    "            }\n",
-    "            # observed phenotype y for this condition (matched by name to the loaded fdat)\n",
-    "            y <- tryCatch(fdat$residual_Y[[ph]], error = function(e) NULL)\n",
-    "            if (is.null(y)) y <- tryCatch(fdat$residual_Y[[1]], error = function(e) NULL)\n",
-    "            if (is.null(y)) { message(\"Skipping \", rg, \"/\", ph, \": could not recover phenotype y.\"); next }\n",
-    "            y <- as.numeric(y)\n",
-    "            # rank methods by CV R-squared (same logic as twas_weights_pipeline)\n",
-    "            method_rsq <- vapply(res$twas_cv_result$performance, function(perf) perf[1, \"rsq\"], numeric(1))\n",
-    "            names(method_rsq) <- gsub(\"_performance$\", \"\", names(method_rsq))\n",
-    "            passing <- !is.na(method_rsq) & method_rsq >= ensemble_r2_threshold\n",
-    "            n_passing <- sum(passing)\n",
-    "            if (n_passing < 2) {\n",
-    "                surviving <- if (n_passing == 1) names(method_rsq)[passing] else NA\n",
-    "                message(\"Ensemble skipped for \", rg, \"/\", ph, \": \", n_passing, \" of \", length(method_rsq),\n",
-    "                        \" methods passed R-squared cutoff \", ensemble_r2_threshold, \" (need >= 2).\")\n",
-    "                obj[[rg]][[ph]]$ensemble_weights <- NULL\n",
-    "                next\n",
-    "            }\n",
-    "            passing_base <- names(method_rsq)[passing]\n",
-    "            passing_pred_names <- paste0(passing_base, \"_predicted\")\n",
-    "            passing_weight_names <- paste0(passing_base, \"_weights\")\n",
-    "            filtered_cv <- res$twas_cv_result\n",
-    "            filtered_cv$prediction <- filtered_cv$prediction[passing_pred_names]\n",
-    "            filtered_weights <- res$twas_weights[passing_weight_names]\n",
-    "            message(\"Computing ensemble TWAS weights via stacked regression for \", rg, \"/\", ph,\n",
-    "                    \" using \", n_passing, \" methods: \", paste(passing_base, collapse=\", \"))\n",
-    "            ens_result <- ensemble_weights(cv_results = filtered_cv, Y = y,\n",
-    "                                           twas_weight_list = filtered_weights,\n",
-    "                                           solver = ensemble_solver, alpha = ensemble_alpha)\n",
-    "            if (!is.null(ens_result$ensemble_twas_weights)) {\n",
-    "                ens_wt <- ens_result$ensemble_twas_weights\n",
-    "                if (!is.matrix(ens_wt)) ens_wt <- matrix(ens_wt, ncol = 1)\n",
-    "                obj[[rg]][[ph]]$twas_weights$ensemble_weights <- ens_wt\n",
-    "                obj[[rg]][[ph]]$ensemble <- ens_result\n",
-    "            }\n",
-    "        }\n",
-    "    }\n",
-    "    dir.create(dirname(\"${_output[0]}\"), showWarnings = FALSE, recursive = TRUE)\n",
-    "    saveRDS(obj, \"${_output[0]}\", compress = \"xz\")\n",
-    "    message(\"Saved ensemble TWAS weights to: \", \"${_output[0]}\")\n"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,

diff --git a/code/script/pecotmr_integration/fine_mapping.R b/code/script/pecotmr_integration/fine_mapping.R
@@ -85,10 +85,56 @@ parser <- add_argument(parser, "--L-greedy",
 parser <- add_argument(parser, "--method-args",
                        help = "JSON object {token: {kwarg: value, ...}, ...} for fineMappingPipeline()",
                        type = "character", default = "")
+parser <- add_argument(parser, "--seed",
+                       help = "Integer RNG seed set before fitting (reproducibility); unset = no seeding",
+                       type = "integer", default = NA)
+parser <- add_argument(parser, "--pip-cutoff-to-skip",
+                       help = "Single-effect (SER) pre-screen cutoff (fineMappingPipeline pipCutoffToSkip), QTL mode; 0 = off, <0 = adaptive 3/nVariants",
+                       type = "numeric", default = 0)
+# --- Per-analysis overrides of the QtlDataset's construct-time filters. Each is
+# opt-in (unset leaves the dataset's stored value); applied to the loaded object
+# before the pipeline runs (QTL mode only).
+parser <- add_argument(parser, "--maf-cutoff",
+                       help = "Override QtlDataset mafCutoff for this analysis",
+                       type = "numeric", default = NA)
+parser <- add_argument(parser, "--mac-cutoff",
+                       help = "Override QtlDataset macCutoff",
+                       type = "numeric", default = NA)
+parser <- add_argument(parser, "--xvar-cutoff",
+                       help = "Override QtlDataset xvarCutoff",
+                       type = "numeric", default = NA)
+parser <- add_argument(parser, "--imiss-cutoff",
+                       help = "Override QtlDataset imissCutoff",
+                       type = "numeric", default = NA)
+parser <- add_argument(parser, "--drop-indel",
+                       help = "Drop indels (sets keepIndel = FALSE); omit to use the dataset's stored value",
+                       flag = TRUE)
+parser <- add_argument(parser, "--keep-samples",
+                       help = "Path to a whitespace-delimited file of sample IDs to restrict to (overrides keepSamples)",
+                       type = "character", default = "")
+parser <- add_argument(parser, "--keep-variants",
+                       help = "Path to a whitespace-delimited file of variant IDs to restrict to (overrides keepVariants)",
+                       type = "character", default = "")
 parser <- add_argument(parser, "--output",
                        help = "Output RDS path", type = "character")
 argv <- parse_args(parser)
 
+# Opt-in overrides of a loaded QtlDataset's construct-time filter slots. Filters
+# apply lazily at extraction, so mutating the slot re-filters without rebuilding
+# the RDS. Shared by both QTL workers (see twas_weights.R).
+apply_qd_filter_overrides <- function(qd, argv) {
+  if (!is.na(argv$maf_cutoff))   qd@mafCutoff   <- argv$maf_cutoff
+  if (!is.na(argv$mac_cutoff))   qd@macCutoff   <- argv$mac_cutoff
+  if (!is.na(argv$xvar_cutoff))  qd@xvarCutoff  <- argv$xvar_cutoff
+  if (!is.na(argv$imiss_cutoff)) qd@imissCutoff <- argv$imiss_cutoff
+  if (isTRUE(argv$drop_indel))   qd@keepIndel   <- FALSE
+  read_ids <- function(p) if (nzchar(p) && p != "." && file.exists(p))
+    unique(trimws(unlist(strsplit(readLines(p), "[[:space:]]+")))) else NULL
+  ks <- read_ids(argv$keep_samples);  if (!is.null(ks)) qd@keepSamples  <- ks
+  kv <- read_ids(argv$keep_variants); if (!is.null(kv)) qd@keepVariants <- kv
+  qd
+}
+
 # Parse --method-args into a nested named list of per-method kwargs.
 # Empty / '.' / '{}' all yield NULL (which makes us pass the char-vector
 # form of methods= to fineMappingPipeline).
@@ -119,6 +165,9 @@ secondary_cov <- as.numeric(trimws(strsplit(argv$secondary_coverage, ",", fixed
 median_abs_corr <- if (length(argv$median_abs_corr) != 1L || is.na(argv$median_abs_corr))
                      NULL else argv$median_abs_corr
 
+# Seed up front for reproducible fits (mirrors the legacy susie_twas set.seed).
+if (length(argv$seed) == 1L && !is.na(argv$seed)) set.seed(as.integer(argv$seed))
+
 parse_region <- function(s) {
   m <- regmatches(s, regexec("^([^:]+):([0-9]+)-([0-9]+)$", s))[[1L]]
   if (length(m) != 4L)
@@ -186,10 +235,13 @@ if (has_gwas) {
   if (!has_gene && !has_region)
     stop("QTL mode requires --gene-id (with --cis-window) or --region.")
   qd <- readRDS(argv$qtl_dataset)
-  # `contexts` is QTL-mode only (NULL = all contexts), so it rides on qtl_args
+  qd <- apply_qd_filter_overrides(qd, argv)
+  # `contexts` and `pipCutoffToSkip` are QTL-mode only, so they ride on qtl_args
   # rather than the mode-shared cs_args.
   qtl_args <- c(list(qd), cs_args,
-                list(cisWindow = argv$cis_window, contexts = contexts_arg))
+                list(cisWindow       = argv$cis_window,
+                     contexts        = contexts_arg,
+                     pipCutoffToSkip = argv$pip_cutoff_to_skip))
   res <- if (has_region) {
     do.call(fineMappingPipeline, c(qtl_args, list(region = parse_region(argv$region))))
   } else {