Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
c1d75ae
fix bug with weights not being used at all
OlivierCoen Jun 3, 2026
019c51f
Update merge_counts.py
OlivierCoen Jun 3, 2026
3b45e96
improve missing value imputation by sampling data in posterior pred
OlivierCoen Jun 3, 2026
a062e45
make computation of gene statistics more scalable
OlivierCoen Jun 3, 2026
0aa6d9e
fix possible read timeout errors with expression atlas
OlivierCoen Jun 3, 2026
0950704
Create get_eatlas_supported_species.py
OlivierCoen Jun 3, 2026
cd9326c
update default weights
OlivierCoen Jun 5, 2026
cdfd8e7
publish merged counts
OlivierCoen Jun 8, 2026
632e975
fix bug for strict syntax
OlivierCoen Jun 11, 2026
9f858e7
update snapshots
OlivierCoen Jun 11, 2026
df434b1
Update nextflow_schema.json
OlivierCoen Jun 11, 2026
24c8253
Merge branch 'nf-core:dev' into dev
OlivierCoen Jun 11, 2026
c26ec00
pass pre-commit
OlivierCoen Jun 11, 2026
90d745b
Update nextflow_schema.json
OlivierCoen Jun 11, 2026
f78eed0
update config
OlivierCoen Jun 11, 2026
f74bbb5
set knn as default missing value imputer and expose some imputation p…
OlivierCoen Jun 11, 2026
f83b742
replace all float64 casts to float32
OlivierCoen Jun 11, 2026
ef4ef91
add batch k means clustering before knn imputation for more scalability
OlivierCoen Jun 12, 2026
1ec6790
Update impute_missing_values.py
OlivierCoen Jun 12, 2026
19d50a7
update nf-test snapshots
OlivierCoen Jun 12, 2026
c5aa9b2
pass linters
OlivierCoen Jun 12, 2026
719851a
update multiqc
OlivierCoen Jun 12, 2026
d1bc5c4
Update default.nf.test.snap
OlivierCoen Jun 12, 2026
8d1f8ed
upgrade volume necessary for nf-test CI runners
OlivierCoen Jun 12, 2026
7f8abce
Update detect_rare_genes.py
OlivierCoen Jun 16, 2026
40d337c
decrease number of decimals when exporting to csv from polars
OlivierCoen Jun 16, 2026
9690c68
upgrade nf-test version to 0.9.5
OlivierCoen Jun 16, 2026
4738b35
simplify and make more consistent all genorm scripts
OlivierCoen Jun 18, 2026
b21eafa
set default nb of sections to 10
OlivierCoen Jun 18, 2026
0ae4409
update snapshots
OlivierCoen Jun 18, 2026
44e75f1
add possibility to provide gff file as URL
OlivierCoen Jun 19, 2026
0e33f9b
Update main.nf.test
OlivierCoen Jun 19, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .github/workflows/nf-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ concurrency:

env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
NFT_VER: "0.9.3"
NFT_VER: "0.9.5"
NFT_WORKDIR: "~"
NXF_ANSI_LOG: false
NXF_SINGULARITY_CACHEDIR: ${{ github.workspace }}/.singularity
Expand All @@ -30,6 +30,7 @@ jobs:
runs-on: # use self-hosted runners
- runs-on=${{ github.run_id }}-nf-test-changes
- runner=4cpu-linux-x64
- volume=40gb
outputs:
shard: ${{ steps.set-shards.outputs.shard }}
total_shards: ${{ steps.set-shards.outputs.total_shards }}
Expand Down Expand Up @@ -64,6 +65,7 @@ jobs:
runs-on: # use self-hosted runners
- runs-on=${{ github.run_id }}-nf-test
- runner=4cpu-linux-x64
- volume=40gb
strategy:
fail-fast: false
matrix:
Expand Down
1 change: 0 additions & 1 deletion .nf-core.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ lint:
- tests/test_data/genorm/compute_m_measure/input/std.0.0.parquet
- tests/test_data/genorm/compute_m_measure/input/std.1.2.parquet
- tests/test_data/genorm/compute_m_measure/input/std.1.2.parquet
schema_lint: false

nf_core_version: 3.5.2
repository_type: pipeline
Expand Down
2 changes: 1 addition & 1 deletion bin/aggregate_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def concat_cast_to_string_and_drop_duplicates(files: list[Path]) -> pl.DataFrame
def cast_count_columns_to_float(df: pl.DataFrame) -> pl.DataFrame:
return df.select(
pl.col(config.GENE_ID_COLNAME),
pl.exclude(config.GENE_ID_COLNAME).cast(pl.Float64),
pl.exclude(config.GENE_ID_COLNAME).cast(pl.Float32),
)


Expand Down
6 changes: 5 additions & 1 deletion bin/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,17 @@ def parse_table(file: Path):
raise ValueError(f"Unsupported file format: {file.suffix}")


def get_nb_rows(lf: pl.LazyFrame):
return lf.select(pl.len()).collect().item()


def parse_count_table(file: Path):
df = parse_table(file)
first_col = df.columns[0]
# whatever the name of the first col, rename it to "gene_id"
return df.rename({first_col: config.GENE_ID_COLNAME}).select(
pl.col(config.GENE_ID_COLNAME).cast(pl.String()),
pl.exclude(config.GENE_ID_COLNAME).cast(pl.Float64()),
pl.exclude(config.GENE_ID_COLNAME).cast(pl.Float32()),
)


Expand Down
66 changes: 38 additions & 28 deletions bin/compute_gene_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
# outfile names
ALL_GENES_RESULT_OUTFILE_SUFFIX = "stats_all_genes.csv"

RCV_MULTIFILER = 1.4826 # see https://pmc.ncbi.nlm.nih.gov/articles/PMC9196089/
RCV_MULTIPLIER = 1.4826 # see https://pmc.ncbi.nlm.nih.gov/articles/PMC9196089/

# quantile intervals
NB_QUANTILES = 100
Expand Down Expand Up @@ -95,9 +95,9 @@ def parse_args():
return parser.parse_args()


def get_counts(file: Path) -> pl.DataFrame:
def get_counts(file: Path) -> pl.LazyFrame:
# sorting dataframe (necessary to get consistent output)
return pl.read_parquet(file).sort(config.GENE_ID_COLNAME, descending=False)
return pl.scan_parquet(file).sort(config.GENE_ID_COLNAME, descending=False)


def get_colname(colname: str, platform: str | None) -> str:
Expand Down Expand Up @@ -125,22 +125,41 @@ def get_valid_samples(


def compute_ratios_null_values(
df: pl.DataFrame, valid_samples: list[str], platform: str | None
) -> pl.DataFrame:
# the samples showing a low gene count will not be taken into account for the zero count penalty
nb_nulls = df.select(pl.exclude(config.GENE_ID_COLNAME).is_null()).sum_horizontal()
lf: pl.LazyFrame, valid_samples: list[str], platform: str | None
) -> pl.LazyFrame:
samples_cols = [
col for col in lf.collect_schema().names() if col != config.GENE_ID_COLNAME
]
nb_samples = len(samples_cols) - 1
found_valid_samples = [sample for sample in valid_samples if sample in samples_cols]

found_valid_samples = [sample for sample in valid_samples if sample in df.columns]
# the samples showing a low gene count will not be taken into account for the zero count penalty
nb_nulls = (
lf.select(
pl.exclude(config.GENE_ID_COLNAME).is_null()
) # select all columns except GENE_ID_COLNAME and check if they are null
.select(
pl.sum_horizontal(pl.all()).alias("nb_nulls_all_samples")
) # sum the number of null values across all columns
.collect()
.to_series()
)

if found_valid_samples:
nb_nulls_valid_samples = df.select(
pl.col(found_valid_samples).is_null()
).sum_horizontal()
nb_nulls_valid_samples = (
lf.select(
pl.col(found_valid_samples).is_null()
) # select all columns in valid_samples and check if they are null
.select(
pl.sum_horizontal(pl.all()).alias("nb_nulls_valid_samples")
) # sum the number of null values across all columns
.collect()
.to_series()
)
else:
nb_nulls_valid_samples = nb_nulls

nb_samples = len(df.columns) - 1
return df.select(
return lf.select(
pl.col(config.GENE_ID_COLNAME),
(nb_nulls / nb_samples).alias(
get_colname(config.RATIO_NULLS_COLNAME, platform)
Expand Down Expand Up @@ -174,7 +193,7 @@ def get_main_statistics(lf: pl.LazyFrame, platform: str | None) -> pl.LazyFrame:
(pl.col("std") / pl.col("mean")).alias(
get_colname(config.COEFFICIENT_OF_VARIATION_COLNAME, platform)
),
(pl.col("mad") / pl.col("median") * RCV_MULTIFILER).alias(
(pl.col("mad") / pl.col("median") * RCV_MULTIPLIER).alias(
get_colname(config.ROBUST_COEFFICIENT_OF_VARIATION_MEDIAN_COLNAME, platform)
),
)
Expand Down Expand Up @@ -244,15 +263,12 @@ def main():
)

logger.info("Loading count data (before missing value imputation)")
non_imputed_count_df = get_counts(args.count_file)
non_imputed_count_lf = get_counts(args.count_file)

ratio_nulls_df = compute_ratios_null_values(
non_imputed_count_df, valid_samples, args.platform
ratio_nulls_lf = compute_ratios_null_values(
non_imputed_count_lf, valid_samples, args.platform
)

# deleting non_imputed_count_df in order to free unused memory
del non_imputed_count_df

# if the user provided an imputed count file, use it; otherwise, use the original count file
if args.imputed_count_file:
logger.info("Using imputed count file")
Expand All @@ -262,20 +278,14 @@ def main():
count_file = args.count_file

logger.info("Loading count data...")
count_df = get_counts(count_file)
logger.info(
f"Loaded count data with {count_df.shape[0]} rows and {count_df.shape[1]} columns"
)
count_lf = get_counts(count_file)

logger.info("Computing statistics and stability score")
count_lf = count_df.lazy()
# getting expression statistics
stat_lf = get_main_statistics(count_lf, args.platform)

# adding column for nb of null values for each gene
stat_lf = stat_lf.join(
ratio_nulls_df.lazy(), on=config.GENE_ID_COLNAME, how="inner"
)
stat_lf = stat_lf.join(ratio_nulls_lf, on=config.GENE_ID_COLNAME, how="inner")

# adding a column for the frequency of zero values
stat_lf = compute_ratio_zeros(count_lf, stat_lf, args.platform)
Expand Down
Loading
Loading