diff --git a/code/SoS/data_preprocessing/phenotype/gene_annotation.ipynb b/code/SoS/data_preprocessing/phenotype/gene_annotation.ipynb index 2de6ed2f3..750b7c06c 100644 --- a/code/SoS/data_preprocessing/phenotype/gene_annotation.ipynb +++ b/code/SoS/data_preprocessing/phenotype/gene_annotation.ipynb @@ -63,8 +63,8 @@ "| `input/rnaseq/protocol_example.leafcutter.intron_count.tsv` | Toy leafcutter intron-count table (IDs `chr:start:end:clu_N_strand`). |\n", "| `input/rnaseq/protocol_example.leafcutter.phenotype.bed.gz` | Toy leafcutter intron-excision phenotype matrix. |\n", "| `input/rnaseq/protocol_example.psichomics.phenotype.tsv` | Toy psichomics phenotype matrix (IDs end in `_`). |\n", - "| `reference_data/Homo_sapiens.GRCh38.103.chr.reformatted.collapse_only.gene.ERCC.gtf` | Collapsed gene-model GTF (gene/protein annotation). |\n", - "| `reference_data/Homo_sapiens.GRCh38.103.chr.gtf` | Full gene-model GTF with exons (leafcutter/psichomics). |\n" + "| `input/reference_data/Homo_sapiens.GRCh38.103.chr.reformatted.collapse_only.gene.ERCC.gtf` | Collapsed gene-model GTF (gene/protein annotation). |\n", + "| `input/reference_data/Homo_sapiens.GRCh38.103.chr.gtf` | Full gene-model GTF with exons (leafcutter/psichomics). |\n" ] }, { @@ -132,7 +132,7 @@ "sos run pipeline/gene_annotation.ipynb annotate_coord \\\n", " --cwd output/gene_annotation \\\n", " --phenoFile input/rnaseq/protocol_example.rnaseq.bed.gz \\\n", - " --coordinate-annotation reference_data/Homo_sapiens.GRCh38.103.chr.reformatted.collapse_only.gene.ERCC.gtf \\\n", + " --coordinate-annotation input/reference_data/Homo_sapiens.GRCh38.103.chr.reformatted.collapse_only.gene.ERCC.gtf \\\n", " --phenotype-id-column gene_id\n" ] }, @@ -183,7 +183,7 @@ "sos run pipeline/gene_annotation.ipynb annotate_coord \\\n", " --cwd output/gene_annotation \\\n", " --phenoFile input/proteomics/protocol_example.protein.no_coord.tsv \\\n", - " --coordinate-annotation reference_data/Homo_sapiens.GRCh38.103.chr.reformatted.collapse_only.gene.ERCC.gtf \\\n", + " --coordinate-annotation input/reference_data/Homo_sapiens.GRCh38.103.chr.reformatted.collapse_only.gene.ERCC.gtf \\\n", " --phenotype-id-column gene_id \\\n", " --molecular-trait-type protein\n" ] @@ -227,7 +227,7 @@ " --cwd output/gene_annotation \\\n", " --phenoFile input/rnaseq/protocol_example.leafcutter.phenotype.bed.gz \\\n", " --intron-count input/rnaseq/protocol_example.leafcutter.intron_count.tsv \\\n", - " --coordinate-annotation reference_data/Homo_sapiens.GRCh38.103.chr.gtf \\\n", + " --coordinate-annotation input/reference_data/Homo_sapiens.GRCh38.103.chr.gtf \\\n", " --map-stra site\n" ] }, @@ -293,7 +293,7 @@ " --cwd output/gene_annotation \\\n", " --phenoFile input/rnaseq/protocol_example.leafcutter.phenotype.bed.gz \\\n", " --intron-count input/rnaseq/protocol_example.leafcutter.intron_count.tsv \\\n", - " --coordinate-annotation reference_data/Homo_sapiens.GRCh38.103.chr.gtf \\\n", + " --coordinate-annotation input/reference_data/Homo_sapiens.GRCh38.103.chr.gtf \\\n", " --map-stra site\n" ] }, @@ -335,7 +335,7 @@ "sos run pipeline/gene_annotation.ipynb annotate_psichomics_isoforms \\\n", " --cwd output/gene_annotation \\\n", " --phenoFile input/rnaseq/protocol_example.psichomics.phenotype.tsv \\\n", - " --coordinate-annotation reference_data/Homo_sapiens.GRCh38.103.chr.gtf\n" + " --coordinate-annotation input/reference_data/Homo_sapiens.GRCh38.103.chr.gtf\n" ] }, { @@ -622,9 +622,7 @@ "metadata": { "kernel": "SoS" }, - "source": [ - "The gtf used here should be the collapsed gtf, i.e. the final output of reference_data gtf processing and the one used to called rnaseq." - ] + "source": "The gtf used here should be the collapsed gtf, i.e. the final output of reference_data gtf processing and the one used to called rnaseq." }, { "cell_type": "code", diff --git a/code/script/molecular_phenotypes/QC/bulk_expression_QC.R b/code/script/molecular_phenotypes/QC/bulk_expression_QC.R index ea7400526..7fef50261 100644 --- a/code/script/molecular_phenotypes/QC/bulk_expression_QC.R +++ b/code/script/molecular_phenotypes/QC/bulk_expression_QC.R @@ -52,10 +52,9 @@ if (is.null(opt[["tpm-gct"]])) stop("--tpm-gct is required") dir.create(opt$cwd, showWarnings = FALSE, recursive = TRUE) -normalize_output_prefix <- function(path) { - stem <- sub("\\.(gct|GCT)(\\.gz)?$", "", basename(path)) - stem <- sub("\\.gene_tpm$", "", stem) - stem <- sub("\\.tpm$", "", stem) +normalize_output_prefix <- function(path, n_ext = 3) { + stem <- basename(path) + for (i in seq_len(n_ext)) stem <- sub("\\.[^.]+$", "", stem) stem }