From 9e3eb54944d3a6316c95c1775f926ddaaeb05cd0 Mon Sep 17 00:00:00 2001 From: Daniel Nachun Date: Fri, 26 Jun 2026 23:18:54 -0700 Subject: [PATCH] mvsusie fixes --- .../mnm_methods/mnm_regression.ipynb | 557 +++++------------- .../script/pecotmr_integration/fine_mapping.R | 121 +++- .../pecotmr_integration/region_manifest.R | 161 +++++ .../script/pecotmr_integration/twas_weights.R | 25 + 4 files changed, 429 insertions(+), 435 deletions(-) create mode 100644 code/script/pecotmr_integration/region_manifest.R diff --git a/code/SoS/mnm_analysis/mnm_methods/mnm_regression.ipynb b/code/SoS/mnm_analysis/mnm_methods/mnm_regression.ipynb index 075cd8a8..00f7449b 100644 --- a/code/SoS/mnm_analysis/mnm_methods/mnm_regression.ipynb +++ b/code/SoS/mnm_analysis/mnm_methods/mnm_regression.ipynb @@ -313,7 +313,7 @@ "source": [ "### Multivariate analysis: mvSuSiE and mr.mash (`mnm`)\n", "\n", - "Fine-maps multiple molecular phenotypes jointly across a shared window using mvSuSiE with mr.mash priors. This step needs **multiple contexts** (e.g. the same genes measured under two or more conditions). The toy set ships a single context, so for this MWE we add a **second, synthetic context** (`example/colocboost_ctx2.bed.gz`, derived from context 1 with added noise \u2014 *demo data only, not a real measurement*) and a multi-context phenotype manifest. With that in place the `mnm` step now runs end-to-end on the toy data and produces `multicontext_bvsr.rds`, `multicontext_twas_weights.rds`, and `multicontext_data.rds` per gene." + "Fine-maps multiple molecular phenotypes jointly across a shared window using mvSuSiE with mr.mash priors. This step needs **multiple contexts** (e.g. the same genes measured under two or more conditions). The toy set ships a single context, so for this MWE we add a **second, synthetic context** (`example/colocboost_ctx2.bed.gz`, derived from context 1 with added noise — *demo data only, not a real measurement*) and a multi-context phenotype manifest. With that in place the `mnm` step now runs end-to-end on the toy data and produces `multicontext_bvsr.rds`, `multicontext_twas_weights.rds`, and `multicontext_data.rds` per gene." ] }, { @@ -795,79 +795,45 @@ "outputs": [], "source": [ "[mnm]\n", - "# Prior model file generated from mashr. \n", - "# Default will be used if it does not exist.\n", - "parameter: mixture_prior = path()\n", - "parameter: mixture_prior_cv = path()\n", - "parameter: prior_weights_min = 5e-4\n", - "parameter: prior_canonical_matrices = False\n", - "parameter: sample_partition = path() \n", + "# Multivariate cross-context fine-mapping with mvSuSiE over the pre-built\n", + "# QtlDataset (from qtl_dataset_construct), one gene per fan-out unit. Each gene's\n", + "# contexts are fit jointly (jointSpecification=\"context\"). The canonical mixture\n", + "# prior is used by default; pass --prior-twas-weights (a TwasWeights RDS from a\n", + "# preceding `twas_weights.R --methods mrmash` run) for the mr.mash data-driven\n", + "# reweighted prior. Replaces the legacy mnm.R + multivariate_analysis_pipeline.\n", + "parameter: cis_window = 1000000\n", "parameter: mvsusie_max_iter = 200\n", - "parameter: mrmash_max_iter = 5000\n", - "depends: sos_variable(\"regional_data\")\n", - "# Check if both 'data' and 'meta_info' are empty lists\n", - "stop_if(len(regional_data['data']) == 0, f'Either genotype or phenotype data are not available for region {\", \".join(region_name)}.')\n", - "\n", - "meta_info = regional_data['meta_info']\n", - "\n", - "input: regional_data[\"data\"], group_by = lambda x: group_by_region(x, regional_data[\"data\"]), group_with = \"meta_info\"\n", - "if skip_fine_mapping and skip_twas_weights:\n", - " save_data = True\n", - " output_files = [f'{cwd:a}/data/{name}.{_meta_info[0].split(\":\")[0]}_{_meta_info[2]}.multicontext_data.rds']\n", - "elif not skip_fine_mapping and skip_twas_weights:\n", - " output_files = [f'{cwd:a}/multivariate_fine_mapping/{name}.{_meta_info[0].split(\":\")[0]}_{_meta_info[2]}.multicontext_bvsr.rds']\n", - "elif skip_fine_mapping and not skip_twas_weights:\n", - " # Warning that fine-mapping is required for TWAS weights\n", - " if not ld_reference_meta_file.is_dir():\n", - " print(\"WARNING: Fine-mapping is required to generate TWAS weights. Setting skip_fine_mapping to False. \"\n", - " \"Fine-mapping will be performed and saved on variants listed in %s.\" % ld_reference_meta_file)\n", - " else:\n", - " print(\"WARNING: Fine-mapping is required to generate TWAS weights. Setting skip_fine_mapping to False.\")\n", - " skip_fine_mapping = False\n", - " output_files = [f'{cwd:a}/multivariate_fine_mapping/{name}.{_meta_info[0].split(\":\")[0]}_{_meta_info[2]}.multicontext_bvsr.rds',\n", - " f'{cwd:a}/multivariate_twas_weights/{name}.{_meta_info[0].split(\":\")[0]}_{_meta_info[2]}.multicontext_twas_weights.rds']\n", - "else: \n", - " output_files = [f'{cwd:a}/multivariate_fine_mapping/{name}.{_meta_info[0].split(\":\")[0]}_{_meta_info[2]}.multicontext_bvsr.rds',\n", - " f'{cwd:a}/multivariate_twas_weights/{name}.{_meta_info[0].split(\":\")[0]}_{_meta_info[2]}.multicontext_twas_weights.rds'] \n", - "output: output_files\n", - "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'\n", - "bash: expand= \"${ }\", stderr = f\"{_output[0]:nn}.mnm.stderr\", stdout = f\"{_output[0]:nn}.mnm.stdout\", container = container, entrypoint = entrypoint\n", - " Rscript ${modular_script_dir}/mnm_analysis/mnm_methods/mnm.R \\\n", - " --genotype ${_input[0]:a} \\\n", - " --phenotype \"${\",\".join([str(x.absolute()) for x in _input[1:len(_input)//2+1]])}\" \\\n", - " --covariate \"${\",\".join([str(x.absolute()) for x in _input[len(_input)//2+1:]])}\" \\\n", - " --region \"${_meta_info[0]}\" \\\n", - " --window \"${_meta_info[1]}\" \\\n", - " --region-name \"${_meta_info[2]}\" \\\n", - " --extract-region-names \"${\"|\".join([x if isinstance(x,str) else \",\".join(x) for x in _meta_info[3]])}\" \\\n", - " --conditions \"${\",\".join(_meta_info[4:])}\" \\\n", - " --skip-analysis-pip-cutoff \"${\",\".join(skip_analysis_pip_cutoff)}\" \\\n", - " --maf ${maf} \\\n", - " --mac ${mac} \\\n", - " --imiss ${imiss} \\\n", - " ${\"--indel\" if indel else \"\"} \\\n", - " ${\"--keep-samples \" + str(keep_samples) if keep_samples.is_file() else \"\"} \\\n", - " ${\"--keep-variants \" + str(keep_variants) if not keep_variants.is_dir() else \"\"} \\\n", - " ${\"--save-data\" if save_data else \"\"} \\\n", + "parameter: pip_cutoff_to_skip = 0.0\n", + "# Comma-separated context names to restrict to; empty = all contexts.\n", + "parameter: contexts = \"\"\n", + "# Optional TwasWeights RDS (preceding mr.mash run) supplying the data-driven prior.\n", + "parameter: prior_twas_weights = path()\n", + "parameter: prior_weights_min = 1e-10\n", + "# Optional JSON of extra mvsusie kwargs (overrides the max_iter default), e.g. '{\"mvsusie\":{\"max_iter\":500}}'.\n", + "parameter: mvsusie_method_args = \"\"\n", + "fail_if(len(region_name) == 0, \"mnm: pass --region-name [ ...] to choose which gene(s) to analyze.\")\n", + "qtl_dataset = path(f\"{cwd:a}/qtl_dataset/{name}.qtl_dataset.rds\")\n", + "mvsusie_args = mvsusie_method_args if mvsusie_method_args else f'{{\"mvsusie\":{{\"max_iter\":{mvsusie_max_iter}}}}}'\n", + "input: qtl_dataset, for_each = \"region_name\"\n", + "output: f\"{cwd:a}/multivariate_fine_mapping/{name}.{_region_name}.multicontext_bvsr.rds\"\n", + "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f\"{step_name}_{_output:bn}\"\n", + "bash: expand = \"${ }\", stderr = f\"{_output:n}.stderr\", stdout = f\"{_output:n}.stdout\", container = container, entrypoint = entrypoint\n", + " Rscript ${modular_script_dir}/pecotmr_integration/fine_mapping.R \\\n", + " --qtl-dataset ${_input} \\\n", + " --gene-id ${_region_name} \\\n", + " --cis-window ${cis_window} \\\n", + " --methods mvsusie \\\n", + " --joint-specification context \\\n", + " --coverage ${coverage[0]} \\\n", + " ${(\"--secondary-coverage \" + \",\".join([str(x) for x in coverage[1:]])) if len(coverage) > 1 else \"\"} \\\n", " --pip-cutoff ${pip_cutoff} \\\n", - " --coverage \"${\",\".join([str(x) for x in coverage])}\" \\\n", + " --pip-cutoff-to-skip ${pip_cutoff_to_skip} \\\n", " --seed ${seed} \\\n", - " --cwd ${cwd:a} \\\n", - " ${\"--skip-fine-mapping\" if skip_fine_mapping else \"\"} \\\n", - " ${\"--skip-twas-weights\" if skip_twas_weights else \"\"} \\\n", - " --xvar-cutoff ${xvar_cutoff} \\\n", - " --mvsusie-max-iter ${mvsusie_max_iter} \\\n", - " --mrmash-max-iter ${mrmash_max_iter} \\\n", - " --max-cv-variants ${max_cv_variants} \\\n", - " --twas-cv-folds ${twas_cv_folds} \\\n", - " --twas-cv-threads ${twas_cv_threads} \\\n", - " ${\"--ld-reference-meta-file \" + str(ld_reference_meta_file) if not ld_reference_meta_file.is_dir() else \"\"} \\\n", - " ${\"--mixture-prior \" + str(mixture_prior) if mixture_prior.is_file() else \"\"} \\\n", - " ${\"--mixture-prior-cv \" + str(mixture_prior_cv) if mixture_prior_cv.is_file() else \"\"} \\\n", - " --prior-weights-min ${prior_weights_min} \\\n", - " ${\"--prior-canonical-matrices\" if prior_canonical_matrices else \"\"} \\\n", - " ${\"--sample-partition \" + str(sample_partition) if sample_partition.is_file() else \"\"} \\\n", - " --output-prefix ${_output[0]:nn}\n" + " ${(\"--contexts \" + contexts) if contexts else \"\"} \\\n", + " --method-args '${mvsusie_args}' \\\n", + " ${(\"--twas-weights \" + str(prior_twas_weights)) if prior_twas_weights.is_file() else \"\"} \\\n", + " --data-driven-prior-weights-cutoff ${prior_weights_min} \\\n", + " --output ${_output}\n" ] }, { @@ -878,295 +844,59 @@ }, "outputs": [], "source": [ - "[mnm_genes]\n", - "depends: sos_variable(\"regional_data\")\n", - "# Check if both 'data' and 'meta_info' are empty lists\n", - "stop_if(len(regional_data['data']) == 0, f'Either genotype or phenotype data are not available for region {\", \".join(region_name)}.')\n", - " \n", - "meta_info = regional_data['meta_info']\n", - "genes = meta_info[0][3]\n", - "if isinstance(genes, tuple):\n", - " genes = genes[0] if isinstance(genes[0], tuple) else list(genes)\n", - "else:\n", - " genes = [genes]\n", - "\n", - "if len(skip_analysis_pip_cutoff) == 0:\n", - " skip_analysis_pip_cutoff = [0.0] * len(genes)\n", - "\n", - "if len(skip_analysis_pip_cutoff) == 1:\n", - " skip_analysis_pip_cutoff = skip_analysis_pip_cutoff * len(genes)\n", - "\n", - "skip_analysis_pip_cutoff = [\"'{}'={}\".format(genes[i], skip_analysis_pip_cutoff[i].split('=')[1]) for i in range(len(genes))]\n", - "\n", - "parameter: pheno_id_map_file = path\n", - "parameter: fine_mapping_meta = path\n", - "parameter: data_driven_prior = False\n", - "# Parameters below are relevant when we use data driven prior\n", - "# which is an experimental feature\n", - "parameter: n_random = 10\n", - "parameter: n_null = 10\n", - "parameter: independent_variant_list = path()\n", - "parameter: prior_weights_min = 5e-4\n", - "parameter: prior_canonical_matrices = False\n", - "# This is relevant to cross-validation\n", - "parameter: sample_partition = path() \n", + "[mnm_genes_1]\n", + "# Resolve per-locus regions (region_id -> chr:start-end) into a manifest via\n", + "# region_manifest.R for the cross-gene (jointSpecification=\"trait\") mvSuSiE fit.\n", + "# Loci come from --customized-association-windows (or --region-list); restrict\n", + "# with --region-name. Falls back to per-gene cis windows from the manifest.\n", + "input: None\n", + "output: f\"{cwd:a}/multivariate_fine_mapping/{name}.mnm_genes_region_manifest.tsv\"\n", + "task: trunk_workers = 1, trunk_size = 1, walltime = walltime, mem = mem, cores = numThreads, tags = f\"{step_name}_{_output:bn}\"\n", + "bash: expand = \"${ }\", stderr = f\"{_output}.stderr\", stdout = f\"{_output}.stdout\", container = container, entrypoint = entrypoint\n", + " Rscript ${modular_script_dir}/pecotmr_integration/region_manifest.R \\\n", + " --pheno-manifest ${phenoFile[0]} \\\n", + " --cis-window ${cis_window if cis_window > 0 else 1000000} \\\n", + " ${(\"--customized-association-windows \" + str(customized_association_windows)) if customized_association_windows.is_file() else \"\"} \\\n", + " ${(\"--region-name \" + \",\".join(region_name)) if len(region_name) > 0 else \"\"} \\\n", + " ${(\"--region-list \" + str(region_list)) if region_list.is_file() else \"\"} \\\n", + " --output ${_output}\n", + "\n", + "[mnm_genes_2]\n", + "# Cross-gene (multi-trait) mvSuSiE per locus: region mode +\n", + "# jointSpecification=\"trait\" joins the genes whose coordinates overlap the\n", + "# locus. Optional mr.mash data-driven prior via --prior-twas-weights (canonical\n", + "# otherwise). Replaces the legacy multigene combine + multigene_udr machinery.\n", "parameter: mvsusie_max_iter = 200\n", - "parameter: mrmash_max_iter = 5000\n", - "\n", - "if not data_driven_prior:\n", - " prior_canonical_matrices = True\n", - "input: regional_data[\"data\"],group_by = lambda x: group_by_region(x, regional_data[\"data\"]), group_with = \"meta_info\"\n", - "\n", - "if skip_fine_mapping and skip_twas_weights:\n", - " save_data = True\n", - " output_files = [f'{cwd:a}/data/{name}.{_meta_info[0].split(\":\")[0]}_{_meta_info[2]}.multigene_data.rds']\n", - "elif not skip_fine_mapping and skip_twas_weights:\n", - " output_files = [f'{cwd:a}/multivariate_fine_mapping/{name}.{_meta_info[0].split(\":\")[0]}_{_meta_info[2]}.multigene_bvsr.rds']\n", - "elif skip_fine_mapping and not skip_twas_weights:\n", - " output_files = [f'{cwd:a}/multivariate_twas_weights/{name}.{_meta_info[0].split(\":\")[0]}_{_meta_info[2]}.multigene_twas_weights.rds']\n", - "else: \n", - " output_files = [f'{cwd:a}/multivariate_fine_mapping/{name}.{_meta_info[0].split(\":\")[0]}_{_meta_info[2]}.multigene_bvsr.rds',\n", - " f'{cwd:a}/multivariate_twas_weights/{name}.{_meta_info[0].split(\":\")[0]}_{_meta_info[2]}.multigene_twas_weights.rds'] \n", - "output: output_files\n", - "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'\n", - "R: expand = '${ }', stdout = f\"{_output[0]:nn}.mnm_genes.stdout\", stderr = f\"{_output[0]:nn}.mnm_genes.stderr\", container = container, entrypoint = entrypoint\n", - "\n", - " library(data.table)\n", - " library(dplyr)\n", - " library(pecotmr)\n", - " combine_result_list <- function(univariate_finemapping_metadata, condition_name, conditions = NULL, extracted_region){\n", - " regional_window_combined_susie_result <- list()\n", - " regional_window_combined_sumstats_result <- list()\n", - " extracted_univariate_finemapping_metadata <- univariate_finemapping_metadata%>%filter(region_id%in%extracted_region)\n", - " for(i in 1:dim(extracted_univariate_finemapping_metadata)[1]){\n", - " gene_id <- extracted_univariate_finemapping_metadata$region_id[i]\n", - " susie_result <- readRDS(extracted_univariate_finemapping_metadata$combined_data[i])\n", - " sumstats_result <- readRDS(extracted_univariate_finemapping_metadata$combined_data_sumstats[i]) \n", - " if (!is.null(conditions)) {\n", - " if (any(names(susie_result[[gene_id]]) %in% conditions)) {\n", - " extracted_conditions = names(susie_result[[gene_id]])[names(susie_result[[gene_id]]) %in% conditions]\n", - " for (j in 1:length(extracted_conditions)){\n", - " regional_window_combined_susie_result[[condition_name]][[extracted_conditions[j]]] <- susie_result[[gene_id]][[extracted_conditions[j]]]\n", - " regional_window_combined_sumstats_result[[condition_name]][[extracted_conditions[j]]] <- sumstats_result[[gene_id]][[extracted_conditions[j]]]\n", - " }\n", - " }\n", - " } else {\n", - " regional_window_combined_susie_result[[condition_name]][[gene_id]] <- susie_result[[gene_id]][[condition_name]]\n", - " regional_window_combined_sumstats_result[[condition_name]][[gene_id]] <- sumstats_result[[gene_id]][[condition_name]]\n", - " }\n", - " } \n", - " return(list(regional_window_combined_susie_result = regional_window_combined_susie_result, regional_window_combined_sumstats_result = regional_window_combined_sumstats_result))\n", - " }\n", - "\n", - " \n", - " extract_non_null_cs_list <- function(combined_list, condition_name, extracted_region) {\n", - " # List to store results for genes passing the first step\n", - " extracted_regional_window_combined_susie_result <- list()\n", - " extracted_regional_window_combined_sumstats_result <- list()\n", - "\n", - " # Iterate over each gene's susie and sumstats results\n", - " filtered_region <- extracted_region[extracted_region%in%names(combined_list[[\"regional_window_combined_susie_result\"]][[condition_name]])]\n", - " for (gene_id in filtered_region) {\n", - " susie_result <- get_nested_element(combined_list,c(\"regional_window_combined_susie_result\",condition_name,gene_id))\n", - "\n", - " # Check if there exits top_loci.\n", - " if (!is.null(susie_result[[\"top_loci\"]])&&nrow(susie_result[[\"top_loci\"]])[1]!=0) {\n", - " # If not all zeros, add this gene's susie and sumstats results to the lists\n", - " extracted_regional_window_combined_susie_result[[condition_name]][[gene_id]] <- get_nested_element(combined_list,c(\"regional_window_combined_susie_result\",condition_name,gene_id))\n", - " extracted_regional_window_combined_sumstats_result[[condition_name]][[gene_id]] <- get_nested_element(combined_list,c(\"regional_window_combined_sumstats_result\",condition_name,gene_id))\n", - " }\n", - " }\n", - " return(list(extracted_regional_window_combined_susie_result = extracted_regional_window_combined_susie_result, extracted_regional_window_combined_sumstats_result = extracted_regional_window_combined_sumstats_result))\n", - " }\n", - " \n", - " process_and_compare_variants <- function(combined_list, condition_name) {\n", - " extracted_regional_window_combined_susie_result <- list()\n", - " extracted_regional_window_combined_sumstats_result <- list()\n", - " compare_variants <- function(variants1, variants2) {\n", - " return(any(variants1 %in% variants2) | any(variants2 %in% variants1))\n", - " }\n", - " subset_top_loci_variants <- function(df) {\n", - " find_non_zero_rows <- function(df) {\n", - " non_zero_indices <- which(rowSums(df!= 0) > 0)\n", - " return(non_zero_indices)\n", - " }\n", - " non_zero_indices <- find_non_zero_rows(df)\n", - "\n", - " if (length(non_zero_indices) > 0) {\n", - " non_zero_index_variants <- df[non_zero_indices, , drop = FALSE]%>%select(variant_id)%>%pull(variant_id)\n", - " zero_index_filtered_variants <- df[-non_zero_indices, ,drop = FALSE]%>%filter(pip>=0.01)%>%select(variant_id)%>%pull(variant_id)\n", - " top_loci_variants <- c(non_zero_index_variants, zero_index_filtered_variants)\n", - " } else {\n", - " zero_index_filtered_variants <- df%>%filter(pip>=0.01)%>%select(variant_id)%>%pull(variant_id)\n", - " top_loci_variants <- zero_index_filtered_variants\n", - " }\n", - " return(top_loci_variants)\n", - " }\n", - " \n", - " for (i in 1:length(combined_list[[\"extracted_regional_window_combined_susie_result\"]][[condition_name]])) {\n", - " susie_result_1 <- combined_list[[\"extracted_regional_window_combined_susie_result\"]][[condition_name]][[i]]\n", - " gene_id <- names(combined_list[[\"extracted_regional_window_combined_susie_result\"]][[condition_name]])[i]\n", - " \n", - " # Extract top_loci variants\n", - " variants1 <- subset_top_loci_variants(susie_result_1[[\"top_loci\"]])\n", - " if (length(variants1) > 0) {\n", - " different_in_all_genes <- TRUE\n", - " \n", - " for (j in 1:length(combined_list[[\"extracted_regional_window_combined_susie_result\"]][[condition_name]])) {\n", - " if (i != j) {\n", - " susie_result_2 <- combined_list[[\"extracted_regional_window_combined_susie_result\"]][[condition_name]][[j]]\n", - " variants2 <- subset_top_loci_variants(susie_result_2[[\"top_loci\"]])\n", - " \n", - " # Compare variants\n", - " if (length(variants2) > 0) {\n", - " if (compare_variants(variants1, variants2)) {\n", - " # If variants are not totally different across all genes, set the flag to FALSE\n", - " different_in_all_genes <- FALSE\n", - " break\n", - " }\n", - " }\n", - " }\n", - " }\n", - " \n", - " # After the loop, check the flag\n", - " if (!different_in_all_genes) {\n", - " extracted_regional_window_combined_susie_result[[condition_name]][[gene_id]] <- get_nested_element(combined_list, c(\"extracted_regional_window_combined_susie_result\", condition_name, gene_id))\n", - " extracted_regional_window_combined_sumstats_result[[condition_name]][[gene_id]] <- get_nested_element(combined_list, c(\"extracted_regional_window_combined_sumstats_result\", condition_name, gene_id))\n", - " }\n", - " }\n", - " }\n", - " return(list(extracted_regional_window_combined_susie_result = extracted_regional_window_combined_susie_result, extracted_regional_window_combined_sumstats_result = extracted_regional_window_combined_sumstats_result))\n", - " }\n", - "\n", - " fine_mapping_metadata = fread(${fine_mapping_meta:r})\n", - " phenotype_files = c(${\",\".join(['\"%s\"' % x.absolute() for x in _input[1:len(_input)//2+1]])})\n", - " covariate_files = c(${\",\".join(['\"%s\"' % x.absolute() for x in _input[len(_input)//2+1:]])})\n", - " condition_name = ${(\"'%s'\" % _meta_info[-1])}\n", - " region_names = ${\",\".join([(\"c('\"+x+\"')\") if isinstance(x, str) else (\"c\"+ str(x)) for x in _meta_info[3]])}\n", - " if (any(region_names %in% fine_mapping_metadata$region_id)) {\n", - " filtered_region <- fine_mapping_metadata%>%filter(region_id%in%region_names)%>%select(region_id)%>%pull(region_id)\n", - " filtered_event_names <- filtered_region\n", - " } else {\n", - " pheno_id_map <- fread(${pheno_id_map_file:r})\n", - " gene_ids <- pheno_id_map%>%filter(ID%in%region_names)%>%select(gene)%>%pull(gene)%>%{.[!duplicated(.)]}\n", - " filtered_region <- fine_mapping_metadata%>%filter(region_id%in%gene_ids)%>%select(region_id)%>%pull(region_id)\n", - " filtered_event_names <- pheno_id_map%>%filter(ID%in%region_names&gene%in%filtered_region)%>%select(ID)%>%pull(ID)\n", - " }\n", - " keep_samples = NULL\n", - " if (${\"TRUE\" if keep_samples.is_file() else \"FALSE\"}) {\n", - " keep_samples = unlist(strsplit(readLines(${keep_samples:ar}), \"\\\\s+\"))\n", - " message(paste(length(keep_samples), \"samples are selected to be loaded for analysis\"))\n", - " }\n", - " \n", - " fdat = load_regional_multivariate_data(genotype = ${_input[0]:anr},\n", - " phenotype = phenotype_files,\n", - " covariate = covariate_files,\n", - " region = ${(\"'%s'\" % _meta_info[1]) if int(_meta_info[1].split('-')[-1])>0 else 'NULL'},\n", - " maf_cutoff = ${maf}, \n", - " mac_cutoff = ${mac},\n", - " imiss_cutoff = ${imiss},\n", - " conditions = NULL,\n", - " association_window = ${(\"'%s'\" % _meta_info[1]) if int(_meta_info[1].split('-')[-1])>0 else 'NULL'},\n", - " extract_region_name = list(filtered_event_names),\n", - " region_name_col = ${\"4\" if int(_meta_info[1].split('-')[-1])>0 else \"1\"},\n", - " keep_indel = ${\"TRUE\" if indel else \"FALSE\"},\n", - " keep_samples = keep_samples,\n", - " keep_variants = ${'\"%s\"' % keep_variants if not keep_variants.is_dir() else \"NULL\"},\n", - " phenotype_header = ${\"4\" if int(_meta_info[1].split('-')[-1])>0 else \"1\"}, # skip first 4 rows of transposed phenotype for chr, start, end and ID\n", - " scale_residuals = FALSE)\n", - " if (${\"TRUE\" if save_data else \"FALSE\"}) {\n", - " saveRDS(fdat, \"${_output[0]:ann}.multigene_data.rds\", compress='xz')\n", - " }\n", - " if (${\"FALSE\" if skip_fine_mapping and skip_twas_weights else \"TRUE\"}) {\n", - " # Extract and consolidate univariate results to determine which genes to focus on\n", - " if (grepl(\"sQTL\", condition_name)) {conditions <- paste(condition_name, filtered_event_names, sep = \"_\")} else {conditions <- NULL}\n", - " combined_regional_window_list <- combine_result_list(fine_mapping_metadata, condition_name, conditions, filtered_region)\n", - " if (grepl(\"sQTL\",condition_name)) { filtered_region <- paste(condition_name, filtered_event_names, sep = \"_\") }\n", - " extracted_non_null_combined_susie_list <- extract_non_null_cs_list(combined_list = combined_regional_window_list, condition_name, filtered_region)\n", - " if (length(extracted_non_null_combined_susie_list[[\"extracted_regional_window_combined_susie_result\"]])!=0) {\n", - " extracted_combined_susie_list <- process_and_compare_variants(combined_list = extracted_non_null_combined_susie_list, condition_name)\n", - " } else {\n", - " extracted_combined_susie_list <- NULL\n", - " }\n", - " if (!is.null(extracted_combined_susie_list)&length(extracted_combined_susie_list[[\"extracted_regional_window_combined_susie_result\"]])!=0) {\n", - " if (grepl(\"sQTL\", condition_name)) {\n", - " filtered_event_names <- sub(paste0(\"^\", condition_name, \"_\"), \"\", names(get_nested_element(extracted_combined_susie_list,c(\"extracted_regional_window_combined_susie_result\", condition_name))))\n", - " } else if (grepl(\"pQTL\", condition_name)) {\n", - " filtered_event_names <- pheno_id_map %>%\n", - " filter(ID%in%region_names)%>%\n", - " filter(gene %in% names(get_nested_element(extracted_combined_susie_list, c(\"extracted_regional_window_combined_susie_result\", condition_name)))) %>%\n", - " select(ID) %>% pull(ID)\n", - " } else {\n", - " filtered_event_names <- names(get_nested_element(extracted_combined_susie_list, c(\"extracted_regional_window_combined_susie_result\", condition_name)))\n", - " }\n", - " } else {\n", - " filtered_event_names <- NULL\n", - " }\n", - "\n", - " if (\"${_meta_info[2]}\" != \"${_meta_info[3]}\") {\n", - " region_name = c(\"${_meta_info[2]}\", c(${\",\".join([(\"c('\"+x+\"')\") if isinstance(x, str) else (\"c\"+ str(x)) for x in _meta_info[3]])}))\n", - " } else {\n", - " region_name = \"${_meta_info[2]}\"\n", - " }\n", - "\n", - " region_info = list(region_coord=parse_region(\"${_meta_info[0]}\"), grange=parse_region(\"${_meta_info[1]}\"), region_name=region_name)\n", - " \n", - " dd_prior <- NULL\n", - " if (${\"TRUE\" if data_driven_prior else \"FALSE\"}) {\n", - " # FIXME: please complete this function call\n", - " dd_prior <- multigene_udr(extracted_combined_susie_list, coverage = paste0(\"cs_coverage_\",${coverage[0]},sep=\"\"), independent_variant_list = ${independent_variant_list:r}, n_random = ${n_random}, n_null = ${n_null}, seed = ${seed})\n", - " }\n", - " if (length(filtered_event_names)>=2) {\n", - " pip_cutoff_to_skip = list(${\",\".join(skip_analysis_pip_cutoff)})\n", - " pip_cutoff_to_skip = unlist(pip_cutoff_to_skip[filtered_event_names])\n", - " dd_prior_cv <- dd_prior\n", - " set.seed(${seed})\n", - " result <- multivariate_analysis_pipeline(\n", - " X=fdat$X,\n", - " Y=fdat$residual_Y[,filtered_event_names],\n", - " maf=fdat$maf,\n", - " X_variance = fdat$X_variance,\n", - " other_quantities = list(dropped_samples = fdat$dropped_samples),\n", - " # filters\n", - " imiss_cutoff = ${imiss}, \n", - " maf_cutoff = ${maf},\n", - " xvar_cutoff = ${xvar_cutoff},\n", - " ld_reference_meta_file = ${('\"%s\"' % ld_reference_meta_file) if not ld_reference_meta_file.is_dir() else \"NULL\"},\n", - " pip_cutoff_to_skip = pip_cutoff_to_skip,\n", - " # methods parameter configuration\n", - " max_L = -1,\n", - " data_driven_prior_matrices = dd_prior, \n", - " data_driven_prior_matrices_cv = dd_prior_cv, \n", - " data_driven_prior_weights_cutoff = ${prior_weights_min}, \n", - " canonical_prior_matrices = ${\"TRUE\" if prior_canonical_matrices else \"FALSE\"},\n", - " mvsusie_max_iter = ${mvsusie_max_iter}, \n", - " estimate_residual_variance = ${'TRUE' if estimate_residual_variance else 'FALSE'},\n", - " mrmash_max_iter = ${mrmash_max_iter},\n", - " # fine-mapping results summary\n", - " signal_cutoff = ${pip_cutoff},\n", - " coverage = c(${\",\".join([str(x) for x in coverage])}),\n", - " # TWAS weights and CV for TWAS weights\n", - " twas_weights = ${\"TRUE\" if not skip_twas_weights else \"FALSE\"},\n", - " sample_partition = ${\"NULL\" if sample_partition=='.' or sample_partition=='' else sample_partition},\n", - " max_cv_variants = ${max_cv_variants}, \n", - " cv_folds = ${twas_cv_folds},\n", - " cv_threads = ${twas_cv_threads} \n", - " )\n", - " result$region_info <- region_info\n", - " if (!is.null(result$twas_weights_result)) {\n", - " for (r in names(result$twas_weights_result)) {\n", - " result$twas_weights_result[[r]]$region_info <- region_info\n", - " }\n", - " saveRDS(list(\"${_meta_info[2]}\" = result$twas_weights_result), \"${_output[0]:ann}.multigene_twas_weights.rds\", compress='xz')\n", - " result$twas_weights_result <- NULL\n", - " }\n", - " saveRDS(list(\"${_meta_info[2]}\" = result), \"${_output[0]:ann}.multigene_bvsr.rds\", compress='xz') \n", - " } else {\n", - " for (f in c(${_output:ar,})) {\n", - " saveRDS(NULL, f)\n", - " }\n", - " }\n", - " }" + "parameter: pip_cutoff_to_skip = 0.0\n", + "parameter: contexts = \"\"\n", + "parameter: prior_twas_weights = path()\n", + "parameter: prior_weights_min = 1e-10\n", + "parameter: mvsusie_method_args = \"\"\n", + "import csv\n", + "manifest = path(f\"{cwd:a}/multivariate_fine_mapping/{name}.mnm_genes_region_manifest.tsv\")\n", + "jobs = list(csv.DictReader(open(manifest), delimiter='\\t'))\n", + "stop_if(len(jobs) == 0, \"mnm_genes: empty region manifest; check --region-name / --customized-association-windows.\")\n", + "qtl_dataset = path(f\"{cwd:a}/qtl_dataset/{name}.qtl_dataset.rds\")\n", + "mvsusie_args = mvsusie_method_args if mvsusie_method_args else f'{{\"mvsusie\":{{\"max_iter\":{mvsusie_max_iter}}}}}'\n", + "input: qtl_dataset, for_each = \"jobs\"\n", + "output: f\"{cwd:a}/multivariate_fine_mapping/{name}.{_jobs['region_id']}.multigene_bvsr.rds\"\n", + "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f\"{step_name}_{_output:bn}\"\n", + "bash: expand = \"${ }\", stderr = f\"{_output:n}.stderr\", stdout = f\"{_output:n}.stdout\", container = container, entrypoint = entrypoint\n", + " Rscript ${modular_script_dir}/pecotmr_integration/fine_mapping.R \\\n", + " --qtl-dataset ${_input} \\\n", + " --region ${_jobs['ld_block']} \\\n", + " --methods mvsusie \\\n", + " --joint-specification trait \\\n", + " --coverage ${coverage[0]} \\\n", + " ${(\"--secondary-coverage \" + \",\".join([str(x) for x in coverage[1:]])) if len(coverage) > 1 else \"\"} \\\n", + " --pip-cutoff ${pip_cutoff} \\\n", + " --pip-cutoff-to-skip ${pip_cutoff_to_skip} \\\n", + " --seed ${seed} \\\n", + " ${(\"--contexts \" + contexts) if contexts else \"\"} \\\n", + " --method-args '${mvsusie_args}' \\\n", + " ${(\"--twas-weights \" + str(prior_twas_weights)) if prior_twas_weights.is_file() else \"\"} \\\n", + " --data-driven-prior-weights-cutoff ${prior_weights_min} \\\n", + " --output ${_output}\n" ] }, { @@ -1177,64 +907,63 @@ }, "outputs": [], "source": [ - "[fsusie]\n", - "# prior can be either of [\"mixture_normal\", \"mixture_normal_per_scale\"]\n", + "[fsusie_1]\n", + "# Resolve per-locus regions into a manifest via region_manifest.R for the\n", + "# functional SuSiE fit (one locus = one fsusie unit; fsusieR::susiF jointly fits\n", + "# the multi-trait function within each context). Loci come from\n", + "# --customized-association-windows (or --region-list); restrict with --region-name.\n", + "input: None\n", + "output: f\"{cwd:a}/fsusie/{name}.fsusie_region_manifest.tsv\"\n", + "task: trunk_workers = 1, trunk_size = 1, walltime = walltime, mem = mem, cores = numThreads, tags = f\"{step_name}_{_output:bn}\"\n", + "bash: expand = \"${ }\", stderr = f\"{_output}.stderr\", stdout = f\"{_output}.stdout\", container = container, entrypoint = entrypoint\n", + " Rscript ${modular_script_dir}/pecotmr_integration/region_manifest.R \\\n", + " --pheno-manifest ${phenoFile[0]} \\\n", + " --cis-window ${cis_window if cis_window > 0 else 1000000} \\\n", + " ${(\"--customized-association-windows \" + str(customized_association_windows)) if customized_association_windows.is_file() else \"\"} \\\n", + " ${(\"--region-name \" + \",\".join(region_name)) if len(region_name) > 0 else \"\"} \\\n", + " ${(\"--region-list \" + str(region_list)) if region_list.is_file() else \"\"} \\\n", + " --output ${_output}\n", + "\n", + "[fsusie_2]\n", + "# Functional SuSiE per locus (fsusieR::susiF) over the pre-built QtlDataset.\n", + "# fsusie-specific knobs ride on --method-args; --susie-top-pc>0 additionally\n", + "# fine-maps each context's top principal components with univariate SuSiE\n", + "# (usePCA; ports the legacy fsusie.R susie_on_top_pc). Replaces fsusie.R.\n", "parameter: prior = \"mixture_normal\"\n", + "parameter: max_scale = 10\n", + "parameter: min_purity = 0.5\n", + "parameter: post_processing = \"TI\"\n", "parameter: max_SNP_EM = 100\n", - "# Max scale is such that 2^max_scale being the number of phenotypes in the transformed space. Default to 2^10 = 1024. Don't change it unless you know what you are doing. Max_scale should be at least larger than 5.\n", - "parameter: max_scale = 10\n", - "# Purity and coverage used to call cs\n", - "parameter: min_purity = 0.5\n", - "# Epigenetics mark filter\n", - "parameter: epigenetics_mark_treshold = 16\n", - "# Run susie for top pc of the fsusie input\n", - "parameter: susie_top_pc = 10\n", - "# Run fsusie with this post-processing option. Available options are TI, HMM, and none. TI is recommended as default. HMM performs particularly well when analyzing data with, say, few sampling points (i.e., ncol(Y)<30) or when the data are particularly noisy (low signal-to-noise ratio).\n", - "parameter: post_processing = \"TI\"\n", - "# Run fsusie with small sample correction \n", "parameter: small_sample_correction = False\n", - "\n", - "depends: sos_variable(\"regional_data\")\n", - "# Check if both 'data' and 'meta_info' are empty lists\n", - "stop_if(len(regional_data['data']) == 0, f'Either genotype or phenotype data are not available for region {\", \".join(region_name)}.')\n", - "meta_info = regional_data['meta_info']\n", - "input: regional_data[\"data\"], group_by = lambda x: group_by_region(x, regional_data[\"data\"]), group_with = \"meta_info\"\n", - "output: f'{cwd:a}/{step_name[:-2]}/{name}.{_meta_info[0].replace(\":\", \"_\").replace(\"-\", \"_\")}.fsusie_{prior}_{post_processing}_{\"_top_pc_weights\" if not skip_twas_weights else \"\"}.rds'\n", - "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'\n", - "bash: expand= \"${ }\", stderr = f\"{_output:n}.stderr\", stdout = f\"{_output:n}.stdout\", container = container, entrypoint = entrypoint\n", - " Rscript ${modular_script_dir}/mnm_analysis/mnm_methods/fsusie.R \\\n", - " --genotype ${_input[0]:a} \\\n", - " --phenotype \"${\",\".join([str(x.absolute()) for x in _input[1:len(_input)//2+1]])}\" \\\n", - " --covariate \"${\",\".join([str(x.absolute()) for x in _input[len(_input)//2+1:]])}\" \\\n", - " --region \"${_meta_info[0]}\" \\\n", - " --window \"${_meta_info[1]}\" \\\n", - " --region-name \"${_meta_info[2]}\" \\\n", - " --conditions \"${\",\".join(_meta_info[4:])}\" \\\n", - " --maf ${maf} \\\n", - " --mac ${mac} \\\n", - " --imiss ${imiss} \\\n", - " ${\"--indel\" if indel else \"\"} \\\n", - " ${\"--keep-samples \" + str(keep_samples) if keep_samples.is_file() else \"\"} \\\n", - " ${\"--keep-variants \" + str(keep_variants) if not keep_variants.is_dir() else \"\"} \\\n", - " --prior \"${prior}\" \\\n", - " --max-snp-em ${max_SNP_EM} \\\n", - " --max-scale ${max_scale} \\\n", - " --min-purity ${min_purity} \\\n", - " --epigenetics-mark-threshold ${epigenetics_mark_treshold} \\\n", - " --susie-top-pc ${susie_top_pc} \\\n", - " --post-processing \"${post_processing}\" \\\n", - " ${\"--small-sample-correction\" if small_sample_correction else \"\"} \\\n", + "# >0 -> also fine-map the top `susie_top_pc` PCs per context with univariate SuSiE.\n", + "parameter: susie_top_pc = 0\n", + "parameter: pip_cutoff_to_skip = 0.0\n", + "parameter: contexts = \"\"\n", + "# Optional JSON overriding the assembled fsusie kwargs, e.g. '{\"fsusie\":{\"prior\":\"mixture_normal_per_scale\"}}'.\n", + "parameter: fsusie_method_args = \"\"\n", + "import csv, json\n", + "manifest = path(f\"{cwd:a}/fsusie/{name}.fsusie_region_manifest.tsv\")\n", + "jobs = list(csv.DictReader(open(manifest), delimiter='\\t'))\n", + "stop_if(len(jobs) == 0, \"fsusie: empty region manifest; check --region-name / --customized-association-windows.\")\n", + "qtl_dataset = path(f\"{cwd:a}/qtl_dataset/{name}.qtl_dataset.rds\")\n", + "fsusie_args = fsusie_method_args if fsusie_method_args else json.dumps({\"fsusie\": {\"prior\": prior, \"max_scale\": max_scale, \"min_purity\": min_purity, \"post_processing\": post_processing, \"max_SNP_EM\": max_SNP_EM, \"cor_small\": bool(small_sample_correction)}})\n", + "input: qtl_dataset, for_each = \"jobs\"\n", + "output: f\"{cwd:a}/fsusie/{name}.{_jobs['region_id']}.fsusie.rds\"\n", + "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f\"{step_name}_{_output:bn}\"\n", + "bash: expand = \"${ }\", stderr = f\"{_output:n}.stderr\", stdout = f\"{_output:n}.stdout\", container = container, entrypoint = entrypoint\n", + " Rscript ${modular_script_dir}/pecotmr_integration/fine_mapping.R \\\n", + " --qtl-dataset ${_input} \\\n", + " --region ${_jobs['ld_block']} \\\n", + " --methods fsusie \\\n", + " --coverage ${coverage[0]} \\\n", + " ${(\"--secondary-coverage \" + \",\".join([str(x) for x in coverage[1:]])) if len(coverage) > 1 else \"\"} \\\n", " --pip-cutoff ${pip_cutoff} \\\n", - " --coverage \"${\",\".join([str(x) for x in coverage])}\" \\\n", - " --L-greedy ${init_L} \\\n", - " --L ${L} \\\n", - " ${\"--skip-twas-weights\" if skip_twas_weights else \"\"} \\\n", - " --max-cv-variants ${max_cv_variants} \\\n", - " --twas-cv-folds ${twas_cv_folds} \\\n", - " --twas-cv-threads ${twas_cv_threads} \\\n", - " ${\"--save-data\" if save_data else \"\"} \\\n", - " --output-prefix ${_output:n} \\\n", - " --cwd ${cwd:a}\n" + " --pip-cutoff-to-skip ${pip_cutoff_to_skip} \\\n", + " --seed ${seed} \\\n", + " ${(\"--contexts \" + contexts) if contexts else \"\"} \\\n", + " --method-args '${fsusie_args}' \\\n", + " ${(\"--use-pca --n-pcs \" + str(susie_top_pc)) if susie_top_pc > 0 else \"\"} \\\n", + " --output ${_output}\n" ] }, { @@ -1330,4 +1059,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/code/script/pecotmr_integration/fine_mapping.R b/code/script/pecotmr_integration/fine_mapping.R index 5baf570f..1a592fa0 100644 --- a/code/script/pecotmr_integration/fine_mapping.R +++ b/code/script/pecotmr_integration/fine_mapping.R @@ -29,6 +29,26 @@ # forwarded to the underlying fitter (susieR::susie, # susieR::susie_rss, mvsusieR::mvsusie, fsusieR::susiF, # etc.). Example: '{"susie":{"L":1,"refine":false}}'. +# For mvsusie / fsusie this is how you pass their kwargs, +# e.g. '{"mvsusie":{"max_iter":200}}' or +# '{"fsusie":{"prior":"mixture_normal","max_scale":10}}'. +# +# Multivariate / joint fits (QTL mode only; mvsusie / fsusie tokens): +# --joint-specification Comma-separated joint-fit axes (subset of +# context,trait,study). "context" joins a gene's contexts +# (cross-context mvSuSiE); "trait" joins the genes in the +# region (cross-trait / multi-gene). Empty (default) leaves +# the implicit per-(context,trait) branch. +# --twas-weights Optional TwasWeights RDS from a preceding mr.mash run +# (twas_weights.R --methods mrmash) supplying the mvSuSiE +# data-driven reweighted prior. Omit for the canonical +# prior. Requires a pecotmr build with twasWeights support. +# --data-driven-prior-weights-cutoff Prior-component weight floor for the +# reweighted mvSuSiE prior (fineMappingPipeline +# dataDrivenPriorWeightsCutoff). Only used with --twas-weights. +# --use-pca / --n-pcs Fine-map each multi-trait context's top principal +# components with univariate SuSiE (ports fsusie.R +# susie_on_top_pc). Requires a pecotmr build with usePCA. # --output Output RDS path. suppressPackageStartupMessages({ @@ -91,6 +111,24 @@ parser <- add_argument(parser, "--seed", parser <- add_argument(parser, "--pip-cutoff-to-skip", help = "Single-effect (SER) pre-screen cutoff (fineMappingPipeline pipCutoffToSkip), QTL mode; 0 = off, <0 = adaptive 3/nVariants", type = "numeric", default = 0) +# --- Multivariate / joint-fit knobs (QTL mode; mvsusie / fsusie). Each is +# opt-in and omitted from the pipeline call when left at its default, so this +# wrapper also runs against a pecotmr build that predates twasWeights / usePCA. +parser <- add_argument(parser, "--joint-specification", + help = "Comma-separated joint-fit axes (context,trait,study) for mvsusie/fsusie; empty = implicit per-(context,trait) branch", + type = "character", default = "") +parser <- add_argument(parser, "--twas-weights", + help = "Optional TwasWeights RDS (preceding mr.mash run) supplying the mvSuSiE data-driven prior; omit for the canonical prior", + type = "character", default = "") +parser <- add_argument(parser, "--data-driven-prior-weights-cutoff", + help = "Prior-component weight floor for the reweighted mvSuSiE prior (only used with --twas-weights)", + type = "numeric", default = 1e-10) +parser <- add_argument(parser, "--use-pca", + help = "Fine-map each multi-trait context's top PCs with univariate SuSiE (usePCA; ports fsusie.R susie_on_top_pc)", + flag = TRUE) +parser <- add_argument(parser, "--n-pcs", + help = "Cap on top principal components per context when --use-pca (nPCs)", + type = "integer", default = 10L) # --- Per-analysis overrides of the QtlDataset's construct-time filters. Each is # opt-in (unset leaves the dataset's stored value); applied to the loaded object # before the pipeline runs (QTL mode only). @@ -190,11 +228,18 @@ contexts_arg <- if (nzchar(argv$contexts) && argv$contexts != ".") methods <- trimws(strsplit(argv$methods, ",", fixed = TRUE)[[1L]]) -# Build the final `methods` argument for fineMappingPipeline as the named-list -# form {token: kwargs}. The pipeline's SuSiE fit defaults (L = --L, -# L_greedy = --L-greedy) seed every SuSiE-family token; any matching -# --method-args entry overrides them per key (explicit > default). Keys in -# --method-args must be among the --methods tokens (no silent typos). +# Joint-fit axes for mvsusie/fsusie: "" -> NULL (implicit per-tuple branch). +joint_spec <- if (nzchar(argv$joint_specification) && argv$joint_specification != ".") + trimws(strsplit(argv$joint_specification, ",", fixed = TRUE)[[1L]]) else NULL +# Optional mr.mash data-driven prior (TwasWeights from a preceding mrmash run). +twas_weights_obj <- if (nzchar(argv$twas_weights) && argv$twas_weights != "." && + file.exists(argv$twas_weights)) readRDS(argv$twas_weights) else NULL + +# Build the `methods` argument: the bare tokens, or the named-list {token: kwargs} +# form when --method-args is given. SuSiE L / L_greedy defaults + susie-family +# routing live in pecotmr (.fmNormalizeMethods); the wrapper only forwards the +# --L / --L-greedy values as top-level args (see cs_args). --method-args keys +# must be among the --methods tokens (no silent typos). if (!is.null(parsed_method_args)) { unknown <- setdiff(names(parsed_method_args), methods) if (length(unknown) > 0L) @@ -202,19 +247,18 @@ if (!is.null(parsed_method_args)) { paste(unknown, collapse = ", "), "'; --methods = '", paste(methods, collapse = ", "), "').") } -.susie_family <- c("susie", "susieInf", "susieAsh") -.fit_defaults <- list(L = argv$L, L_greedy = argv[["L_greedy"]]) -methods_arg <- setNames(lapply(methods, function(tk) { - base <- if (tk %in% .susie_family) .fit_defaults else list() - user <- if (!is.null(parsed_method_args) && tk %in% names(parsed_method_args)) - parsed_method_args[[tk]] else list() - modifyList(base, user) -}), methods) +methods_arg <- if (is.null(parsed_method_args)) methods else + setNames(lapply(methods, function(tk) + if (tk %in% names(parsed_method_args)) parsed_method_args[[tk]] else list()), + methods) # Credible-set / coverage knobs common to both modes. medianAbsCorr is added # only when set (NULL -> omitted), so the call also works against a pecotmr -# that predates that argument. +# that predates that argument. L / L_greedy are forwarded top-level; pecotmr +# seeds the susie-family tokens and applies explicit --method-args overrides. cs_args <- list(methods = methods_arg, + L = argv$L, + Lgreedy = argv[["L_greedy"]], coverage = argv$coverage, secondaryCoverage = secondary_cov, signalCutoff = argv$pip_cutoff, @@ -238,20 +282,55 @@ if (has_gwas) { qd <- apply_qd_filter_overrides(qd, argv) # `contexts` and `pipCutoffToSkip` are QTL-mode only, so they ride on qtl_args # rather than the mode-shared cs_args. + # cisWindow is a gene-mode (traitId) knob — it expands each trait's own + # coordinates. In region mode the literal variant window comes from --region, + # and passing both is an error, so cisWindow rides on the traitId branch only. qtl_args <- c(list(qd), cs_args, - list(cisWindow = argv$cis_window, - contexts = contexts_arg, + list(contexts = contexts_arg, pipCutoffToSkip = argv$pip_cutoff_to_skip)) - res <- if (has_region) { - do.call(fineMappingPipeline, c(qtl_args, list(region = parse_region(argv$region)))) - } else { - do.call(fineMappingPipeline, c(qtl_args, list(traitId = argv$gene_id))) + # Opt-in multivariate / joint knobs. Each is added only when the user set it, + # so the call stays compatible with a pecotmr that predates the argument + # (mirrors the medianAbsCorr handling above). + if (!is.null(joint_spec)) qtl_args$jointSpecification <- joint_spec + if (!is.null(twas_weights_obj)) { + qtl_args$twasWeights <- twas_weights_obj + qtl_args$dataDrivenPriorWeightsCutoff <- argv$data_driven_prior_weights_cutoff + } + if (isTRUE(argv$use_pca)) { + qtl_args$usePCA <- TRUE + qtl_args$nPCs <- argv$n_pcs } label <- if (has_region) paste0("region '", argv$region, "'") else paste0("gene '", argv$gene_id, "'") + run_fm <- function() if (has_region) { + do.call(fineMappingPipeline, c(qtl_args, list(region = parse_region(argv$region)))) + } else { + do.call(fineMappingPipeline, c(qtl_args, + list(traitId = argv$gene_id, + cisWindow = argv$cis_window))) + } + # A multivariate fan-out unit can legitimately have nothing to jointly fit: + # a locus with < 2 genes overlapping it (cross-trait / mnm_genes / fsusie), a + # single-context trait (cross-context), etc. fineMappingPipeline signals this + # in two ways that only the multivariate paths (explicit jointSpecification AND + # auto-detected mvsusie / fsusie) raise -- the univariate SuSiE family never + # does: the early multivariate guard ("mvsusie/fsusie requires multi-trait + # [or multi-context] input ...") and the late dispatch check ("no joint fits + # produced"). For a per-locus batch each is the honest NULL result, not a + # failure, so write NULL rather than aborting the substep; every other error + # propagates unchanged. + res <- tryCatch(run_fm(), error = function(e) { + if (grepl("no joint fits produced|requires multi-trait|requires multi-context", + conditionMessage(e))) { + message("fine_mapping.R: no multivariate fit for ", label, + " (scope yields < 2 joinable conditions); writing NULL.") + NULL + } else stop(e) + }) } dir.create(dirname(argv$output), showWarnings = FALSE, recursive = TRUE) saveRDS(res, argv$output) +n_rows <- if (is.null(res)) 0L else nrow(res) cat(sprintf("Wrote fineMapping result for %s (%d row(s)) to %s\n", - label, nrow(res), argv$output)) + label, n_rows, argv$output)) diff --git a/code/script/pecotmr_integration/region_manifest.R b/code/script/pecotmr_integration/region_manifest.R new file mode 100644 index 00000000..1b13b833 --- /dev/null +++ b/code/script/pecotmr_integration/region_manifest.R @@ -0,0 +1,161 @@ +#!/usr/bin/env Rscript +# region_manifest.R +# +# Resolve mnm_regression's [mnm_genes] / [fsusie] per-locus analysis units into +# a manifest TSV, so the downstream step can fan out region-mode +# fineMappingPipeline over its rows with inline csv.DictReader (no notebook-local +# Python parsing). One row per region (locus). +# +# A "region" here is a genomic window that a cross-trait (multi-gene) mvSuSiE or +# a functional fSuSiE fit runs over; fineMappingPipeline auto-selects the traits +# whose coordinates overlap the window. Region source priority: +# 1. --customized-association-windows BED (#chr start end region_id): each row +# is a locus. Filtered to --region-name when given. +# 2. --region-list: a file whose LAST column lists region IDs; resolved against +# the customized windows (else treated as gene IDs, see 3). +# 3. A selected ID present only in --pheno-manifest (a gene): that gene's +# coordinates +/- --cis-window (single-gene fallback). +# +# Inputs: +# --pheno-manifest QtlDataset phenotype manifest TSV (ID, #chr, start, end); +# supplies gene coordinates for the single-gene fallback and +# the default region set when no windows/list are given. +# --customized-association-windows Optional BED (#chr start end region_id). +# --region-name Optional comma-separated region/gene IDs to restrict to. +# --region-list Optional file whose LAST column lists region IDs. +# --cis-window bp added on each side of a gene for the single-gene +# fallback window. Default 1000000. +# --output Output manifest TSV path. +# +# Output TSV columns (one row per region): region_id, chr, start, end, ld_block. + +suppressPackageStartupMessages({ + library(argparser) +}) + +parser <- arg_parser("Resolve per-locus regions into a manifest TSV for mnm_genes / fsusie") +parser <- add_argument(parser, "--pheno-manifest", + help = "QtlDataset phenotype manifest TSV (ID, #chr, start, end)", + type = "character") +parser <- add_argument(parser, "--customized-association-windows", + help = "Optional BED (#chr start end region_id) of loci", + type = "character", default = "") +parser <- add_argument(parser, "--region-name", + help = "Optional comma-separated region/gene IDs to restrict to", + type = "character", default = "") +parser <- add_argument(parser, "--region-list", + help = "Optional file whose last column lists region IDs", + type = "character", default = "") +parser <- add_argument(parser, "--cis-window", + help = "bp added on each side of a gene for the single-gene fallback window", + type = "numeric", default = 1000000) +parser <- add_argument(parser, "--output", + help = "Output manifest TSV path", type = "character") +argv <- parse_args(parser) + +norm_chr <- function(x) { + x <- as.character(x) + ifelse(is.na(x) | !nzchar(x), x, + ifelse(startsWith(x, "chr"), x, paste0("chr", x))) +} + +# ----- Gene coordinates from the phenotype manifest ------------------------- +if (is.null(argv$pheno_manifest) || !nzchar(argv$pheno_manifest) || + !file.exists(argv$pheno_manifest)) + stop("--pheno-manifest is required and must exist: ", argv$pheno_manifest) +pm <- read.table(argv$pheno_manifest, header = TRUE, sep = "\t", + stringsAsFactors = FALSE, check.names = FALSE, comment.char = "") +id_col <- intersect(c("ID", "gene_id", "phenotype_id"), names(pm))[1L] +chr_col <- intersect(c("#chr", "chrom", "chr"), names(pm))[1L] +start_col <- intersect(c("start", "Start"), names(pm))[1L] +end_col <- intersect(c("end", "End"), names(pm))[1L] +if (any(is.na(c(id_col, chr_col, start_col, end_col)))) + stop("--pheno-manifest needs ID, #chr/chrom, start, end columns (got: ", + paste(names(pm), collapse = ", "), ").") +genes <- as.character(pm[[id_col]]) +gchr <- norm_chr(pm[[chr_col]]) +gstart <- suppressWarnings(as.integer(pm[[start_col]])) +gend <- suppressWarnings(as.integer(pm[[end_col]])) +uniqGenes <- unique(genes) +geneCoord <- lapply(uniqGenes, function(g) { + idx <- which(genes == g) + list(chr = gchr[idx][[1L]], start = min(gstart[idx], na.rm = TRUE), + end = max(gend[idx], na.rm = TRUE)) +}) +names(geneCoord) <- uniqGenes + +# ----- Customized association windows (loci) -------------------------------- +windows <- list() +windowOrder <- character(0) +if (nzchar(argv$customized_association_windows) && + argv$customized_association_windows != "." && + file.exists(argv$customized_association_windows)) { + caw <- read.table(argv$customized_association_windows, header = FALSE, + sep = "", stringsAsFactors = FALSE, comment.char = "#") + for (i in seq_len(nrow(caw))) { + rid <- as.character(caw[[4L]][[i]]) + windows[[rid]] <- list(chr = norm_chr(caw[[1L]][[i]]), + start = as.integer(caw[[2L]][[i]]), + end = as.integer(caw[[3L]][[i]])) + windowOrder <- c(windowOrder, rid) + } +} + +# ----- Resolve the selected region IDs -------------------------------------- +selected <- character(0) +rn <- trimws(strsplit(argv$region_name, ",")[[1L]]) +rn <- rn[nzchar(rn)] +if (length(rn) > 0L) { + selected <- rn +} else if (nzchar(argv$region_list) && argv$region_list != "." && + file.exists(argv$region_list)) { + for (line in readLines(argv$region_list)) { + line <- trimws(line) + if (!nzchar(line) || startsWith(line, "#")) next + parts <- strsplit(line, "\\s+")[[1L]] + selected <- c(selected, parts[[length(parts)]]) + } + selected <- unique(selected) +} else if (length(windowOrder) > 0L) { + selected <- windowOrder # all loci from the windows file +} else { + selected <- uniqGenes # degenerate: per-gene (single-trait) fallback + message("NOTE: no --customized-association-windows / --region-list / ", + "--region-name given; emitting one region per gene from the ", + "phenotype manifest (single-trait regions are degenerate for ", + "cross-trait / fSuSiE fits).") +} + +cis <- as.integer(argv$cis_window) +resolve <- function(id) { + if (!is.null(windows[[id]])) { + w <- windows[[id]] + return(list(chr = w$chr, start = max(w$start, 0L), end = w$end)) + } + if (!is.null(geneCoord[[id]])) { + c0 <- geneCoord[[id]] + return(list(chr = c0$chr, start = max(c0$start - cis, 0L), end = c0$end + cis)) + } + stop("Region/gene '", id, "' not found in --customized-association-windows ", + "or --pheno-manifest; cannot determine its window.") +} + +rows <- list() +for (id in selected) { + r <- resolve(id) + rows[[length(rows) + 1L]] <- data.frame( + region_id = gsub("[^A-Za-z0-9_]", "_", id), + chr = r$chr, start = r$start, end = r$end, + ld_block = sprintf("%s:%d-%d", r$chr, r$start, r$end), + stringsAsFactors = FALSE) +} +if (length(rows) == 0L) + stop("No regions selected; check --region-name / --region-list / ", + "--customized-association-windows against the phenotype manifest.") +out <- do.call(rbind, rows) + +dir.create(dirname(argv$output), showWarnings = FALSE, recursive = TRUE) +write.table(out, file = argv$output, sep = "\t", quote = FALSE, + row.names = FALSE, na = "") +cat(sprintf("Wrote region manifest with %d region(s) to %s\n", + nrow(out), argv$output)) diff --git a/code/script/pecotmr_integration/twas_weights.R b/code/script/pecotmr_integration/twas_weights.R index fc6117da..13718c5b 100644 --- a/code/script/pecotmr_integration/twas_weights.R +++ b/code/script/pecotmr_integration/twas_weights.R @@ -30,6 +30,14 @@ # values are kwargs lists forwarded to the # underlying per-method learner. Example: # '{"lasso":{"nfolds":10}}'. +# --mixture-prior Optional RDS of mr.mash data-driven prior +# covariance matrices (dataDrivenPriorMatrices) +# for the 'mrmash' method. Omitted -> canonical +# prior (canonicalPriorMatrices = TRUE). mr.mash +# requires one of these; this wrapper supplies a +# default so 'mrmash' runs out of the box, and is +# also the producer of the mvSuSiE data-driven +# prior consumed by fine_mapping.R --twas-weights. # --output Output RDS path (one TwasWeights) suppressPackageStartupMessages({ @@ -65,6 +73,13 @@ parser <- add_argument(parser, "--fine-mapping-result", parser <- add_argument(parser, "--method-args", help = "JSON object {token: {kwarg: value, ...}, ...} for twasWeightsPipeline()", type = "character", default = "") +parser <- add_argument(parser, "--mixture-prior", + help = paste0("Path to an RDS of mr.mash data-driven prior covariance matrices ", + "(dataDrivenPriorMatrices) for the 'mrmash' method. When omitted, ", + "mr.mash uses the canonical prior set (canonicalPriorMatrices = TRUE). ", + "Only consulted when 'mrmash' is in --methods and the prior is not ", + "already set via --method-args."), + type = "character", default = "") parser <- add_argument(parser, "--min-twas-maf", help = "Minimum MAF for the variants used to learn TWAS weights (twasWeightsPipeline minTwasMaf), applied on top of the dataset's construct-time mafCutoff", type = "numeric", default = 0.01) @@ -179,6 +194,15 @@ methods_arg <- if (is.null(parsed_method_args)) { }), methods) } +# mr.mash data-driven prior override. The wrapper stays thin: it hands the prior +# covariance set to pecotmr as a MashPrior object; twasWeightsPipeline unpacks it +# (.unpackMashPrior) and injects it into the mr.mash fit. pecotmr's mrmash method +# default supplies the canonical set, so the common case needs no prior here. +mash_prior <- if (nzchar(argv$mixture_prior) && argv$mixture_prior != "." && + file.exists(argv$mixture_prior)) { + MashPrior(fullFit = readRDS(argv$mixture_prior)) +} else NULL + parse_region <- function(s) { m <- regmatches(s, regexec("^([^:]+):([0-9]+)-([0-9]+)$", s))[[1L]] if (length(m) != 4L) @@ -209,6 +233,7 @@ fmr <- if (nzchar(fmr_path) && fmr_path != "." && file.exists(fmr_path)) { # the weights (on top of the QtlDataset's construct-time cutoffs); the cv* knobs # control the cross-validated predictive-performance refits. tw_args <- list(methods = methods_arg, + mashPrior = mash_prior, cisWindow = argv$cis_window, contexts = contexts_arg, minTwasMaf = argv$min_twas_maf,