From a2d7c6449d18764dcb6c6b62e1dd7df74047287c Mon Sep 17 00:00:00 2001 From: Jonathan Sulc Date: Wed, 3 Jun 2026 16:22:30 +0200 Subject: [PATCH] Fix yeojohnson lambda estimate counting missing values in n estimate_yeojohnson_lambda computed n with length(x) before dropping NAs, so the count of "observations" included missing values. That inflated n in the log-likelihood, shifting the optimized lambda. Appending NAs to a vector therefore changed the estimated lambda. Compute n after removing missing values so the log-likelihood uses the number of nonmissing observations. Add a regression test asserting the estimated lambda is unchanged when NAs are appended. --- DESCRIPTION | 2 +- R/yeojohnson.R | 2 +- tests/testthat/test_yeojohnson.R | 7 +++++++ 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 8326fdb..ce3476e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: bestNormalize Type: Package Title: Normalizing Transformation Functions -Version: 1.9.2 +Version: 1.9.3 Date: 2025-11-29 Authors@R: person("Ryan A", "Peterson", email = "ryan-peterson@uiowa.edu", diff --git a/R/yeojohnson.R b/R/yeojohnson.R index e70f1d2..c24756c 100644 --- a/R/yeojohnson.R +++ b/R/yeojohnson.R @@ -126,9 +126,9 @@ print.yeojohnson <- function(x, ...) { # Helper functions that estimates yj lambda parameter #' @importFrom stats var optimize estimate_yeojohnson_lambda <- function(x, lower = -5, upper = 5, eps = .001, ...) 
{ - n <- length(x) ccID <- !is.na(x) x <- x[ccID] + n <- length(x) pos_idx = which(x >= 0) neg_idx = which(x < 0) diff --git a/tests/testthat/test_yeojohnson.R b/tests/testthat/test_yeojohnson.R index e50cb4f..b1c2302 100644 --- a/tests/testthat/test_yeojohnson.R +++ b/tests/testthat/test_yeojohnson.R @@ -33,6 +33,13 @@ test_that('yeojohnson correctly handles missing new data', { expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA), inverse = TRUE)[2]) }) +test_that('yeojohnson lambda is unaffected by appended missing values', { + expect_equal( + yeojohnson(train)$lambda, + yeojohnson(c(train, NA, NA, NA))$lambda + ) +}) + # without standardization yeojohnson_obj <- yeojohnson(train, standardize = FALSE)