From fb937fb94ce941dd519a1b669816e8b0a3476fa0 Mon Sep 17 00:00:00 2001 From: Anjing Liu Date: Tue, 23 Jun 2026 01:31:42 -0700 Subject: [PATCH 1/3] Fix output filename for .bed TPM input in bulk_expression_QC.R normalize_output_prefix now strips exactly 3 extensions (mirroring SOS :bnnn) instead of pattern-matching known suffixes, so .bed inputs like protocol_example.rnaseq.tpm_matrix.bed correctly resolve to protocol_example rather than the full basename. Co-Authored-By: Claude Sonnet 4.6 --- code/script/molecular_phenotypes/QC/bulk_expression_QC.R | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/code/script/molecular_phenotypes/QC/bulk_expression_QC.R b/code/script/molecular_phenotypes/QC/bulk_expression_QC.R index ea7400526..7fef50261 100644 --- a/code/script/molecular_phenotypes/QC/bulk_expression_QC.R +++ b/code/script/molecular_phenotypes/QC/bulk_expression_QC.R @@ -52,10 +52,9 @@ if (is.null(opt[["tpm-gct"]])) stop("--tpm-gct is required") dir.create(opt$cwd, showWarnings = FALSE, recursive = TRUE) -normalize_output_prefix <- function(path) { - stem <- sub("\\.(gct|GCT)(\\.gz)?$", "", basename(path)) - stem <- sub("\\.gene_tpm$", "", stem) - stem <- sub("\\.tpm$", "", stem) +normalize_output_prefix <- function(path, n_ext = 3) { + stem <- basename(path) + for (i in seq_len(n_ext)) stem <- sub("\\.[^.]+$", "", stem) stem } From c1162c3928fc3ac601dd2f7c77833217858b1627 Mon Sep 17 00:00:00 2001 From: Anjing Liu Date: Tue, 23 Jun 2026 01:32:51 -0700 Subject: [PATCH 2/3] Fix reference_data paths to use input/reference_data prefix in gene_annotation notebook Update all example command paths from reference_data/ to input/reference_data/ to match the actual tutorial directory layout. Co-Authored-By: Claude Sonnet 4.6 --- .../phenotype/gene_annotation.ipynb | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/code/SoS/data_preprocessing/phenotype/gene_annotation.ipynb b/code/SoS/data_preprocessing/phenotype/gene_annotation.ipynb index 2de6ed2f3..8d4ae4802 100644 --- a/code/SoS/data_preprocessing/phenotype/gene_annotation.ipynb +++ b/code/SoS/data_preprocessing/phenotype/gene_annotation.ipynb @@ -63,8 +63,8 @@ "| `input/rnaseq/protocol_example.leafcutter.intron_count.tsv` | Toy leafcutter intron-count table (IDs `chr:start:end:clu_N_strand`). |\n", "| `input/rnaseq/protocol_example.leafcutter.phenotype.bed.gz` | Toy leafcutter intron-excision phenotype matrix. |\n", "| `input/rnaseq/protocol_example.psichomics.phenotype.tsv` | Toy psichomics phenotype matrix (IDs end in `_`). |\n", - "| `reference_data/Homo_sapiens.GRCh38.103.chr.reformatted.collapse_only.gene.ERCC.gtf` | Collapsed gene-model GTF (gene/protein annotation). |\n", - "| `reference_data/Homo_sapiens.GRCh38.103.chr.gtf` | Full gene-model GTF with exons (leafcutter/psichomics). |\n" + "| `input/reference_data/Homo_sapiens.GRCh38.103.chr.reformatted.collapse_only.gene.ERCC.gtf` | Collapsed gene-model GTF (gene/protein annotation). |\n", + "| `input/reference_data/Homo_sapiens.GRCh38.103.chr.gtf` | Full gene-model GTF with exons (leafcutter/psichomics). |\n" ] }, { @@ -132,7 +132,7 @@ "sos run pipeline/gene_annotation.ipynb annotate_coord \\\n", " --cwd output/gene_annotation \\\n", " --phenoFile input/rnaseq/protocol_example.rnaseq.bed.gz \\\n", - " --coordinate-annotation reference_data/Homo_sapiens.GRCh38.103.chr.reformatted.collapse_only.gene.ERCC.gtf \\\n", + " --coordinate-annotation input/reference_data/Homo_sapiens.GRCh38.103.chr.reformatted.collapse_only.gene.ERCC.gtf \\\n", " --phenotype-id-column gene_id\n" ] }, @@ -183,7 +183,7 @@ "sos run pipeline/gene_annotation.ipynb annotate_coord \\\n", " --cwd output/gene_annotation \\\n", " --phenoFile input/proteomics/protocol_example.protein.no_coord.tsv \\\n", - " --coordinate-annotation reference_data/Homo_sapiens.GRCh38.103.chr.reformatted.collapse_only.gene.ERCC.gtf \\\n", + " --coordinate-annotation input/reference_data/Homo_sapiens.GRCh38.103.chr.reformatted.collapse_only.gene.ERCC.gtf \\\n", " --phenotype-id-column gene_id \\\n", " --molecular-trait-type protein\n" ] @@ -227,7 +227,7 @@ " --cwd output/gene_annotation \\\n", " --phenoFile input/rnaseq/protocol_example.leafcutter.phenotype.bed.gz \\\n", " --intron-count input/rnaseq/protocol_example.leafcutter.intron_count.tsv \\\n", - " --coordinate-annotation reference_data/Homo_sapiens.GRCh38.103.chr.gtf \\\n", + " --coordinate-annotation input/reference_data/Homo_sapiens.GRCh38.103.chr.gtf \\\n", " --map-stra site\n" ] }, @@ -293,7 +293,7 @@ " --cwd output/gene_annotation \\\n", " --phenoFile input/rnaseq/protocol_example.leafcutter.phenotype.bed.gz \\\n", " --intron-count input/rnaseq/protocol_example.leafcutter.intron_count.tsv \\\n", - " --coordinate-annotation reference_data/Homo_sapiens.GRCh38.103.chr.gtf \\\n", + " --coordinate-annotation input/reference_data/Homo_sapiens.GRCh38.103.chr.gtf \\\n", " --map-stra site\n" ] }, @@ -335,7 +335,7 @@ "sos run pipeline/gene_annotation.ipynb annotate_psichomics_isoforms \\\n", " --cwd output/gene_annotation \\\n", " --phenoFile input/rnaseq/protocol_example.psichomics.phenotype.tsv \\\n", - " --coordinate-annotation reference_data/Homo_sapiens.GRCh38.103.chr.gtf\n" + " --coordinate-annotation input/reference_data/Homo_sapiens.GRCh38.103.chr.gtf\n" ] }, { @@ -623,7 +623,7 @@ "kernel": "SoS" }, "source": [ - "The gtf used here should be the collapsed gtf, i.e. the final output of reference_data gtf processing and the one used to called rnaseq." + "The gtf used here should be the collapsed gtf, i.e. the final output of input/reference_data gtf processing and the one used to called rnaseq." ] }, { @@ -763,4 +763,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} From ede57c1b565e2f6df90cc29da4c59bb2f5a386a0 Mon Sep 17 00:00:00 2001 From: Anjing Liu Date: Tue, 23 Jun 2026 01:35:37 -0700 Subject: [PATCH 3/3] Remove spurious input/ prefix from descriptive text in gene_annotation notebook Co-Authored-By: Claude Sonnet 4.6 --- code/SoS/data_preprocessing/phenotype/gene_annotation.ipynb | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/code/SoS/data_preprocessing/phenotype/gene_annotation.ipynb b/code/SoS/data_preprocessing/phenotype/gene_annotation.ipynb index 8d4ae4802..750b7c06c 100644 --- a/code/SoS/data_preprocessing/phenotype/gene_annotation.ipynb +++ b/code/SoS/data_preprocessing/phenotype/gene_annotation.ipynb @@ -622,9 +622,7 @@ "metadata": { "kernel": "SoS" }, - "source": [ - "The gtf used here should be the collapsed gtf, i.e. the final output of input/reference_data gtf processing and the one used to called rnaseq." - ] + "source": "The gtf used here should be the collapsed gtf, i.e. the final output of reference_data gtf processing and the one used to called rnaseq." }, { "cell_type": "code", @@ -763,4 +761,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file