nf-core · OlivierCoen · Jun 3, 2026 · Jun 3, 2026 · Jun 3, 2026 · Jun 3, 2026
diff --git a/.github/workflows/nf-test.yml b/.github/workflows/nf-test.yml
@@ -18,7 +18,7 @@ concurrency:
 
 env:
   GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-  NFT_VER: "0.9.3"
+  NFT_VER: "0.9.5"
   NFT_WORKDIR: "~"
   NXF_ANSI_LOG: false
   NXF_SINGULARITY_CACHEDIR: ${{ github.workspace }}/.singularity
@@ -30,6 +30,7 @@ jobs:
     runs-on: # use self-hosted runners
       - runs-on=${{ github.run_id }}-nf-test-changes
       - runner=4cpu-linux-x64
+      - volume=40gb
     outputs:
       shard: ${{ steps.set-shards.outputs.shard }}
       total_shards: ${{ steps.set-shards.outputs.total_shards }}
@@ -64,6 +65,7 @@ jobs:
     runs-on: # use self-hosted runners
       - runs-on=${{ github.run_id }}-nf-test
       - runner=4cpu-linux-x64
+      - volume=40gb
     strategy:
       fail-fast: false
       matrix:

diff --git a/.nf-core.yml b/.nf-core.yml
@@ -15,7 +15,6 @@ lint:
     - tests/test_data/genorm/compute_m_measure/input/std.0.0.parquet
     - tests/test_data/genorm/compute_m_measure/input/std.1.2.parquet
     - tests/test_data/genorm/compute_m_measure/input/std.1.2.parquet
-  schema_lint: false
 
 nf_core_version: 3.5.2
 repository_type: pipeline

diff --git a/bin/aggregate_results.py b/bin/aggregate_results.py
@@ -115,7 +115,7 @@ def concat_cast_to_string_and_drop_duplicates(files: list[Path]) -> pl.DataFrame
 def cast_count_columns_to_float(df: pl.DataFrame) -> pl.DataFrame:
     return df.select(
         pl.col(config.GENE_ID_COLNAME),
-        pl.exclude(config.GENE_ID_COLNAME).cast(pl.Float64),
+        pl.exclude(config.GENE_ID_COLNAME).cast(pl.Float32),
     )
 
 

diff --git a/bin/common.py b/bin/common.py
@@ -46,13 +46,17 @@ def parse_table(file: Path):
         raise ValueError(f"Unsupported file format: {file.suffix}")
 
 
+def get_nb_rows(lf: pl.LazyFrame):
+    return lf.select(pl.len()).collect().item()
+
+
 def parse_count_table(file: Path):
     df = parse_table(file)
     first_col = df.columns[0]
     # whatever the name of the first col, rename it to "gene_id"
     return df.rename({first_col: config.GENE_ID_COLNAME}).select(
         pl.col(config.GENE_ID_COLNAME).cast(pl.String()),
-        pl.exclude(config.GENE_ID_COLNAME).cast(pl.Float64()),
+        pl.exclude(config.GENE_ID_COLNAME).cast(pl.Float32()),
     )
 
 

diff --git a/bin/compute_gene_statistics.py b/bin/compute_gene_statistics.py
@@ -15,7 +15,7 @@
 # outfile names
 ALL_GENES_RESULT_OUTFILE_SUFFIX = "stats_all_genes.csv"
 
-RCV_MULTIFILER = 1.4826  # see https://pmc.ncbi.nlm.nih.gov/articles/PMC9196089/
+RCV_MULTIPLIER = 1.4826  # see https://pmc.ncbi.nlm.nih.gov/articles/PMC9196089/
 
 # quantile intervals
 NB_QUANTILES = 100
@@ -95,9 +95,9 @@ def parse_args():
     return parser.parse_args()
 
 
-def get_counts(file: Path) -> pl.DataFrame:
+def get_counts(file: Path) -> pl.LazyFrame:
     # sorting dataframe (necessary to get consistent output)
-    return pl.read_parquet(file).sort(config.GENE_ID_COLNAME, descending=False)
+    return pl.scan_parquet(file).sort(config.GENE_ID_COLNAME, descending=False)
 
 
 def get_colname(colname: str, platform: str | None) -> str:
@@ -125,22 +125,41 @@ def get_valid_samples(
 
 
 def compute_ratios_null_values(
-    df: pl.DataFrame, valid_samples: list[str], platform: str | None
-) -> pl.DataFrame:
-    # the samples showing a low gene count will not be taken into account for the zero count penalty
-    nb_nulls = df.select(pl.exclude(config.GENE_ID_COLNAME).is_null()).sum_horizontal()
+    lf: pl.LazyFrame, valid_samples: list[str], platform: str | None
+) -> pl.LazyFrame:
+    samples_cols = [
+        col for col in lf.collect_schema().names() if col != config.GENE_ID_COLNAME
+    ]
+    nb_samples = len(samples_cols)  # gene id column is already excluded above
+    found_valid_samples = [sample for sample in valid_samples if sample in samples_cols]
 
-    found_valid_samples = [sample for sample in valid_samples if sample in df.columns]
+    # the samples showing a low gene count will not be taken into account for the zero count penalty
+    nb_nulls = (
+        lf.select(
+            pl.exclude(config.GENE_ID_COLNAME).is_null()
+        )  # select all columns except GENE_ID_COLNAME and check if they are null
+        .select(
+            pl.sum_horizontal(pl.all()).alias("nb_nulls_all_samples")
+        )  # sum the number of null values across all columns
+        .collect()
+        .to_series()
+    )
 
     if found_valid_samples:
-        nb_nulls_valid_samples = df.select(
-            pl.col(found_valid_samples).is_null()
-        ).sum_horizontal()
+        nb_nulls_valid_samples = (
+            lf.select(
+                pl.col(found_valid_samples).is_null()
+            )  # select all columns in valid_samples and check if they are null
+            .select(
+                pl.sum_horizontal(pl.all()).alias("nb_nulls_valid_samples")
+            )  # sum the number of null values across all columns
+            .collect()
+            .to_series()
+        )
     else:
         nb_nulls_valid_samples = nb_nulls
 
-    nb_samples = len(df.columns) - 1
-    return df.select(
+    return lf.select(
         pl.col(config.GENE_ID_COLNAME),
         (nb_nulls / nb_samples).alias(
             get_colname(config.RATIO_NULLS_COLNAME, platform)
@@ -174,7 +193,7 @@ def get_main_statistics(lf: pl.LazyFrame, platform: str | None) -> pl.LazyFrame:
         (pl.col("std") / pl.col("mean")).alias(
             get_colname(config.COEFFICIENT_OF_VARIATION_COLNAME, platform)
         ),
-        (pl.col("mad") / pl.col("median") * RCV_MULTIFILER).alias(
+        (pl.col("mad") / pl.col("median") * RCV_MULTIPLIER).alias(
             get_colname(config.ROBUST_COEFFICIENT_OF_VARIATION_MEDIAN_COLNAME, platform)
         ),
     )
@@ -244,15 +263,12 @@ def main():
     )
 
     logger.info("Loading count data (before missing value imputation)")
-    non_imputed_count_df = get_counts(args.count_file)
+    non_imputed_count_lf = get_counts(args.count_file)
 
-    ratio_nulls_df = compute_ratios_null_values(
-        non_imputed_count_df, valid_samples, args.platform
+    ratio_nulls_lf = compute_ratios_null_values(
+        non_imputed_count_lf, valid_samples, args.platform
     )
 
-    # deleting non_imputed_count_df in order to free unused memory
-    del non_imputed_count_df
-
     # if the user provided an imputed count file, use it; otherwise, use the original count file
     if args.imputed_count_file:
         logger.info("Using imputed count file")
@@ -262,20 +278,14 @@ def main():
         count_file = args.count_file
 
     logger.info("Loading count data...")
-    count_df = get_counts(count_file)
-    logger.info(
-        f"Loaded count data with {count_df.shape[0]} rows and {count_df.shape[1]} columns"
-    )
+    count_lf = get_counts(count_file)
 
     logger.info("Computing statistics and stability score")
-    count_lf = count_df.lazy()
     # getting expression statistics
     stat_lf = get_main_statistics(count_lf, args.platform)
 
     # adding column for nb of null values for each gene
-    stat_lf = stat_lf.join(
-        ratio_nulls_df.lazy(), on=config.GENE_ID_COLNAME, how="inner"
-    )
+    stat_lf = stat_lf.join(ratio_nulls_lf, on=config.GENE_ID_COLNAME, how="inner")
 
     # adding a column for the frequency of zero values
     stat_lf = compute_ratio_zeros(count_lf, stat_lf, args.platform)