From e91352707705043c3288d0aa16ef2e9065fe289f Mon Sep 17 00:00:00 2001
From: Dimitry Wintermantel <dywintermantel@gmail.com>
Date: Wed, 26 Feb 2025 14:08:48 +0100
Subject: [PATCH 1/2] Initial commit - adding allocate_treatments function that
 assigns subjects to treatment groups while balancing specified covariates

---
 R/allocate_treatments.R | 208 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 208 insertions(+)
 create mode 100644 R/allocate_treatments.R

diff --git a/R/allocate_treatments.R b/R/allocate_treatments.R
new file mode 100644
index 00000000..7c481ddd
--- /dev/null
+++ b/R/allocate_treatments.R
@@ -0,0 +1,208 @@
+library(anticlust)
+
+#' Allocate Treatments to Subjects
+#'
+#' This function assigns subjects to treatment groups while balancing specified covariates and ensuring replicates per group.
+#'
+#' @param data A data frame containing the subjects to be allocated, including columns for the covariates.
+#' @param treatments A vector specifying the treatment group labels.
+#' @param treatment_var A string specifying the name of the treatment column to be created. Defaults to `"Treatment"`. 
+#' @param covariates A vector of column names in `data` representing covariates to balance.
+#' @param n_replicates Integer. Specifies the number of replicates per treatment group. If `NULL`, either all subjects are allocated or the highest possible equal number per treatment is allocated depending on `ensure_equal_n_replicates`. Defaults to `NULL`.
+#' @param ensure_equal_n_replicates Logical. If `TRUE`, ensures equal numbers of replicates across treatment groups. Defaults to `TRUE`. If `FALSE` all subjects are allocated. 
+#' @param keep_excluded_data Logical. If `TRUE`, retains subjects that were not allocated within the specified constraints. Defaults to `FALSE`.
+#' @param objective A string specifying the objective function for anticlustering. Options include `"variance"` (default), `"diversity"`, `"average-diversity"`, `"kplus"`, and `"dispersion"`. 
+#' @param method. A string specifying the optimization method for anticlustering. Options include `"local-maximum"` (default), `"exchange"`, `"brusco"`, `"ilp"`, and `"2PML"`.
+#' @param repetitions Integer. Specifies the number of times the optimization is repeated when using heuristic methods (`"exchange"`, `"local-maximum"`, `"brusco"`, or `"2PML"`). The best solution is selected. Defaults to `10`.
+#' @param match_within A column name in `data` (optional). Specifies a variable within which matching should occur, ensuring that subjects are grouped within subsets defined by this variable. Defaults to `NULL`.
+#' @param standardize Logical. If `TRUE`, covariates are standardized via `scale()` before optimization starts. Defaults to `TRUE`.
+#' 
+#'
+#' @return A data frame with subjects assigned to treatments. The column name for treatment allocation is specified by `treatment_var`. If `keep_excluded_data = TRUE`, excluded subjects are included in the output with `NA` in the treatment column.
+#'
+#' @details
+#' The function balances covariates among treatment groups by creating groups of similar individuals based on the specified covariates. The treatments are then assigned to these groups in a way that minimizes variance within groups.
+#'
+#' - If `ensure_equal_n_replicates = TRUE`, the number of replicates per treatment is enforced.
+#' - If `keep_excluded_data = TRUE`, subjects that cannot be allocated to treatments are retained in the output.
+#' - Covariates can be scaled to standardize their ranges if `standardize = TRUE`.
+#' - If `match_within` is specified, subjects are grouped within levels of the specified variable, ensuring allocations respect the structure defined by this variable.
+#'
+#' @section Validations:
+#' The function validates the following conditions:
+#' - All required arguments (`data`, `treatments`, `covariates`) are provided.
+#' - Covariates exist in the `data` frame and are numeric.
+#' - Covariates do not have constant values across all rows.
+#'
+#' @examples
+#' # Example dataset: Bee data
+#' set.seed(123)
+#' example_bee_data <- data.frame(
+#'   ID = as.factor(as.character(seq(1, 100, 1))),
+#'   Bee_count = rnorm(100, mean = 1000, sd = 200),
+#'   Colony_weight = rnorm(100, mean = 500, sd = 100))
+#' 
+#' # Example dataset: Site data
+#' example_site_data <- data.frame(
+#'   Site = as.factor(as.character(seq(1, 16, 1))),
+#'   Organic = as.factor(as.character(c(rep("yes", 4), rep("no", 12)))), 
+#'   Field_quality = rnorm(16, mean = 3, sd = 0.5))
+#' 
+#' treatments <- c("Control", "Pesticide")
+#' bee_covariates <- c("Bee_count", "Colony_weight")
+#' 
+#' # Allocate bee data to treatments
+#' allocated_bee_data <- allocate_treatments(
+#'   example_bee_data, 
+#'   treatments = treatments, 
+#'   covariates = bee_covariates,
+#'   ensure_equal_n_replicates = TRUE, 
+#'   keep_excluded_data = FALSE)
+#' 
+#' # Allocate site data to treatments while matching within "Organic"
+#' allocated_site_data <- allocate_treatments(
+#'   example_site_data, 
+#'   treatments = treatments, 
+#'   covariates = "Field_quality", 
+#'   ensure_equal_n_replicates = TRUE, 
+#'   keep_excluded_data = FALSE,
+#'   match_within = "Organic")
+#'
+#' @section Required Packages:
+#' This function requires the following packages:
+#' - `anticlust`: for anticlustering optimization.
+#'
+#' @export
+#' 
+
+allocate_treatments <- function(data, treatments, treatment_var = "Treatment",
+                                covariates, n_replicates = NULL, 
+                                ensure_equal_n_replicates = TRUE, 
+                                keep_excluded_data = FALSE,
+                                objective = "variance", 
+                                method = "local-maximum",
+                                repetitions = 10, 
+                                match_within = NULL,
+                                standardize = TRUE) {
+  
+  # Use validate_args for argument validation
+  validate_args <- function(arg, arg_name) {
+    if (missing(arg) || is.null(arg)) {
+      stop(paste0("Error: The required argument '", arg_name, "' is missing or NULL. Please provide a valid input."))
+    }
+  }
+  
+  # Validate required arguments
+  validate_args(data, "data")
+  validate_args(treatments, "treatments")
+  validate_args(covariates, "covariates")
+  
+  # Check that specified covariates are present in the data
+  missing_covariates <- setdiff(covariates, colnames(data))
+  if (length(missing_covariates) > 0) {
+    stop("The following covariates are missing in the data: ", paste(missing_covariates, collapse = ", "), ". Ensure these covariates are included in the dataset.")
+  }
+  
+  # Check if any covariate is not numeric 
+  non_numeric_covariates <- covariates[!sapply(data[, covariates, drop = FALSE], is.numeric)]
+  if (length(non_numeric_covariates) > 0) {
+    stop("The following covariates are not numeric: ", paste(non_numeric_covariates, collapse = ", "), ". Convert these covariates to numeric before running the function.")
+  }
+  
+  # Check if any covariate takes the same value for all rows
+  constant_covariates <- covariates[sapply(data[, covariates, drop = FALSE], function(x) length(unique(x[!is.na(x)])) == 1)]
+  if (length(constant_covariates) > 0) {
+    stop("The following covariates take the same value for all observations: ", paste(constant_covariates, collapse = ", "), ". Remove these covariates or provide covariates with variability.")
+  }
+  
+  # Validate match_within
+  if (!is.null(match_within)) {
+    if (!match_within %in% colnames(data)) {
+      stop("The column specified in match_within ('", match_within, "') is not present in the data frame. Please ensure it is a valid column name.")
+    }
+    match_within_vector <- data[[match_within]]
+  } else {
+    match_within_vector <- NULL
+  }
+  
+  # Shuffle treatments for random assignment
+  treatments_shuffled <- sample(treatments)
+  
+  # Calculate the number of replicates per treatment if not provided
+  n_treatments <- length(treatments)
+  
+  if (is.null(n_replicates)) {
+    n_replicates <- floor(nrow(data) / n_treatments)
+  } else {
+    if (!ensure_equal_n_replicates) {
+      ensure_equal_n_replicates <- TRUE
+      warning("'ensure_equal_n_replicates' was changed to TRUE as 'n_replicates' was manually specified.")
+    }
+  }
+  
+  # Scale covariates if specified
+  covariate_data <- if (standardize) {
+    scale(data[, covariates, drop = FALSE])
+  } else {
+    data[, covariates, drop = FALSE]
+  }
+  
+  if (ensure_equal_n_replicates | n_treatments * n_replicates == nrow(data)) {
+    # Create groups with similar individuals based on covariates
+    data$set_similar <- matching(
+      covariate_data, 
+      p = n_treatments,
+      match_within = match_within_vector
+    )
+    
+    # Include individuals based on the number of replicates
+    included <- data[!is.na(data$set_similar) & data$set_similar <= n_replicates, ]
+    
+    # Assign individuals of the same set to different treatment groups
+    included[[treatment_var]] <- anticlustering(
+      included[, covariates, drop = FALSE], 
+      K = n_treatments,
+      objective = objective, 
+      method = method,
+      categories = included$set_similar,
+      repetitions = repetitions
+    ) 
+    
+    included$set_similar <- NULL
+    
+    if (keep_excluded_data) {
+      # Extract data that was excluded from the experiment
+      excluded <- data[data$set_similar > n_replicates | is.na(data$set_similar), ]
+      excluded$set_similar <- NULL
+      excluded[[treatment_var]] <- NA
+      
+      # Combine included and excluded data
+      output <- rbind(included, excluded)
+    } else {
+      output <- included
+    }
+  } else {
+    # Directly assign treatments without ensuring equal replicates
+    data[[treatment_var]] <- anticlustering(
+      covariate_data, 
+      K = n_treatments,
+      objective = objective, 
+      method = method,
+      categories = match_within_vector,
+      repetitions = repetitions
+    )
+    output <- data
+  }
+  
+  # Map treatment values to shuffled treatments
+  treatment_map <- setNames(treatments_shuffled, as.character(1:n_treatments))
+  output[[treatment_var]] <- unname(treatment_map[as.character(output[[treatment_var]])])
+  
+  # Convert treatment variable to factor
+  output[[treatment_var]] <- factor(output[[treatment_var]], levels = treatments)
+  
+  # Reorder columns to have treatment_var first
+  output <- output[, c(treatment_var, setdiff(names(output), treatment_var))]
+  
+  return(output)
+}

From d306abacd9a7ac5b3912bdea99f2bae773c4e90c Mon Sep 17 00:00:00 2001
From: Dimitry Wintermantel <dywintermantel@gmail.com>
Date: Tue, 17 Mar 2026 21:08:43 +0100
Subject: [PATCH 2/2] Update allocation function and add second function for
 multi-level designs

---
 R/allocate_treatments.R        |  19 +++-
 R/allocate_within_treatments.R | 185 +++++++++++++++++++++++++++++++++
 2 files changed, 200 insertions(+), 4 deletions(-)
 create mode 100644 R/allocate_within_treatments.R

diff --git a/R/allocate_treatments.R b/R/allocate_treatments.R
index 7c481ddd..0e008bc6 100644
--- a/R/allocate_treatments.R
+++ b/R/allocate_treatments.R
@@ -12,8 +12,8 @@ library(anticlust)
 #' @param ensure_equal_n_replicates Logical. If `TRUE`, ensures equal numbers of replicates across treatment groups. Defaults to `TRUE`. If `FALSE` all subjects are allocated. 
 #' @param keep_excluded_data Logical. If `TRUE`, retains subjects that were not allocated within the specified constraints. Defaults to `FALSE`.
 #' @param objective A string specifying the objective function for anticlustering. Options include `"variance"` (default), `"diversity"`, `"average-diversity"`, `"kplus"`, and `"dispersion"`. 
-#' @param method. A string specifying the optimization method for anticlustering. Options include `"local-maximum"` (default), `"exchange"`, `"brusco"`, `"ilp"`, and `"2PML"`.
-#' @param repetitions Integer. Specifies the number of times the optimization is repeated when using heuristic methods (`"exchange"`, `"local-maximum"`, `"brusco"`, or `"2PML"`). The best solution is selected. Defaults to `10`.
+#' @param method A string specifying the optimization method for anticlustering. Options include `"local-maximum"` (default), `"exchange"`, `"brusco"`, `"ilp"`, and `"2PML"`.
+#' @param repetitions Integer. Specifies the number of times the optimization is repeated when using heuristic methods (`"exchange"`, `"local-maximum"`, `"brusco"`, or `"2PML"`). The best solution is selected. Defaults to `100`.
 #' @param match_within A column name in `data` (optional). Specifies a variable within which matching should occur, ensuring that subjects are grouped within subsets defined by this variable. Defaults to `NULL`.
 #' @param standardize Logical. If `TRUE`, covariates are standardized via `scale()` before optimization starts. Defaults to `TRUE`.
 #' 
@@ -33,6 +33,16 @@ library(anticlust)
 #' - All required arguments (`data`, `treatments`, `covariates`) are provided.
 #' - Covariates exist in the `data` frame and are numeric.
 #' - Covariates do not have constant values across all rows.
+#' 
+#' @section How to cite:
+#' If you use this function in academic work, please cite:
+#' Wintermantel, D., Osterman, J., Mair, M. M., & Hartig, F. (in preparation).
+#' *Equivalence testing in pesticide risk assessment – Evaluation and practical guidance
+#' for design, analysis and interpretation*.
+#'
+#' The treatment allocation implemented here relies on anticlustering functions.
+#' Users are encouraged to also cite the `anticlust` package where appropriate
+#' (see `citation("anticlust")`).
 #'
 #' @examples
 #' # Example dataset: Bee data
@@ -76,12 +86,13 @@ library(anticlust)
 #' 
 
 allocate_treatments <- function(data, treatments, treatment_var = "Treatment",
-                                covariates, n_replicates = NULL, 
+                                covariates, 
+                                n_replicates = NULL, 
                                 ensure_equal_n_replicates = TRUE, 
                                 keep_excluded_data = FALSE,
                                 objective = "variance", 
                                 method = "local-maximum",
-                                repetitions = 10, 
+                                repetitions = 100, 
                                 match_within = NULL,
                                 standardize = TRUE) {
   
diff --git a/R/allocate_within_treatments.R b/R/allocate_within_treatments.R
new file mode 100644
index 00000000..4e742db9
--- /dev/null
+++ b/R/allocate_within_treatments.R
@@ -0,0 +1,185 @@
+#' Allocate Subjects to Groups *Within* Treatments
+#'
+#' This function assigns subjects to experimental groups *within each treatment level* while balancing specified covariates and (optionally) ensuring equal replicates per group. Internally, it applies [`allocate_treatments()`] to each treatment subset, using the available groups as the target labels.
+#'
+#' @param data_subjects A data frame containing the subjects to be allocated. Must include a column named in `treatment_var` as well as the specified `covariates`.
+#' @param data_groups A data frame describing the available groups within each treatment. Must include columns named in `group_var` and `treatment_var` indicating which groups belong to which treatment.
+#' @param treatment_var A string specifying the name of the treatment column present in both `data_subjects` and `data_groups`.
+#' @param group_var A string specifying the name of the group column in `data_groups` to which subjects will be allocated within each treatment (e.g., site, cage, plot).
+#' @param covariates A vector of column names in `data_subjects` representing covariates to balance.
+#' @param ensure_equal_n_replicates Logical. If `TRUE`, ensures equal numbers of replicates across groups within each treatment (handled inside `allocate_treatments()`). Defaults to `TRUE`. If `FALSE`, all subjects are allocated without enforcing equal replicates.
+#' @param keep_excluded_data Logical. If `TRUE`, retains subjects that were not allocated within the specified constraints (as determined by `allocate_treatments()`). Defaults to `FALSE`.
+#' @param objective A string specifying the objective function for anticlustering. Options include `"variance"` (default), `"diversity"`, `"average-diversity"`, `"kplus"`, and `"dispersion"`.
+#' @param method A string specifying the optimization method for anticlustering. Options include `"local-maximum"` (default), `"exchange"`, `"brusco"`, `"ilp"`, and `"2PML"`.
+#' @param repetitions Integer. Specifies the number of times the optimization is repeated when using heuristic methods (`"exchange"`, `"local-maximum"`, `"brusco"`, or `"2PML"`). The best solution is selected. Defaults to `10`.
+#' @param match_within A column name in `data_subjects` (optional). Specifies a variable within which matching should occur, ensuring that subjects are grouped within subsets defined by this variable before allocation. Defaults to `NULL`.
+#' @param standardize Logical. If `TRUE`, covariates are standardized via `scale()` before optimization starts. Defaults to `TRUE`.
+#'
+#' @return
+#' A data frame containing the merged group information from `data_groups` and the subject allocations from `data_subjects`. The output includes:
+#' - the original treatment column `treatment_var`,
+#' - the allocated group column `group_var` (created by assigning subjects to groups within each treatment),
+#' - all remaining columns from both inputs (joined per treatment).
+#' If `keep_excluded_data = TRUE`, excluded subjects are included with `NA` in the group column.
+#'
+#' @details
+#' The function operates treatment-by-treatment:
+#' 1. For each treatment level present in `data_subjects[[treatment_var]]`, it identifies the corresponding set of groups in `data_groups` that belong to that treatment.
+#' 2. It subsets the subjects to the current treatment and calls [`allocate_treatments()`], passing the available groups as the `treatments` argument, to balance the specified `covariates`.
+#' 3. It restores the original treatment column and merges the per-treatment allocations with the corresponding rows in `data_groups`.
+#'
+#' - If `ensure_equal_n_replicates = TRUE`, the number of replicates per group (within a treatment) is enforced by the internal call to `allocate_treatments()`.
+#' - If `keep_excluded_data = TRUE`, subjects that cannot be allocated under the constraints are retained in the output with `NA` in `group_var`.
+#' - Covariates can be scaled to standardize their ranges if `standardize = TRUE`.
+#' - If `match_within` is specified, subjects are matched within levels of the specified variable prior to allocation, respecting that structure.
+#'
+#' @section Validations:
+#' The function validates the following conditions:
+#' - All required arguments (`data_subjects`, `data_groups`, `group_var`, `covariates`) are provided.
+#' - `treatment_var` exists in `data_subjects`; `group_var` exists in `data_groups`.
+#' - Types are harmonized internally (e.g., factors/characters) to ensure consistent merging and labeling.
+#' Additional checks on covariates (presence, numeric type, non-constant) are performed inside [`allocate_treatments()`].
+#'
+#' @section How to cite:
+#' If you use this function in academic work, please cite:
+#' Wintermantel, D., Osterman, J., Mair, M. M., & Hartig, F. (in preparation).
+#' *Equivalence testing in pesticide risk assessment – Evaluation and practical guidance
+#' for design, analysis and interpretation*.
+#'
+#' The treatment allocation implemented here relies on anticlustering functions
+#' Users are encouraged to also cite the `anticlust` package where appropriate
+#' (see `citation("anticlust")`).
+#'
+#' @examples
+#' set.seed(123)
+#'
+#' # Example dataset: Bee subjects (to be allocated to treatments, then to sites within treatments)
+#' example_bee_data <- data.frame(
+#'   ID = as.factor(as.character(seq(1, 100, 1))),
+#'   Bee_count = rnorm(100, mean = 1000, sd = 200),
+#'   Colony_weight = rnorm(100, mean = 500, sd = 100)
+#' )
+#'
+#' # Example dataset: Sites (groups) – with site-level covariates and a blocking variable
+#' example_site_data <- data.frame(
+#'   Site = as.factor(as.character(seq(1, 16, 1))),
+#'   Irrigated = as.factor(as.character(c(rep("yes", 4), rep("no", 12)))),
+#'   Field_quality = rnorm(16, mean = 3, sd = 0.5)
+#' )
+#'
+#' treatments <- c("Control", "Pesticide")
+#' bee_covariates <- c("Bee_count", "Colony_weight")
+#'
+#' # 1) Allocate subjects to treatments (balancing colony-level covariates)
+#' allocated_bee_data <- allocate_treatments(
+#'   data = example_bee_data,
+#'   treatments = treatments,
+#'   covariates = bee_covariates,
+#'   ensure_equal_n_replicates = TRUE,
+#'   keep_excluded_data = FALSE
+#' )
+#' nrow(allocated_bee_data)
+#'
+#' # 2) Allocate sites to treatments (balancing site quality and matching within irrigation status)
+#' allocated_site_data <- allocate_treatments(
+#'   data = example_site_data,
+#'   treatments = treatments,
+#'   covariates = "Field_quality",
+#'   ensure_equal_n_replicates = TRUE,
+#'   keep_excluded_data = FALSE,
+#'   match_within = "Irrigated"
+#' )
+#' nrow(allocated_site_data)
+#'
+#' # 3) Allocate subjects to specific sites *within each treatment*
+#' allocated_data <- allocate_within_treatments(
+#'   data_subjects = allocated_bee_data,
+#'   data_groups   = allocated_site_data,
+#'   treatment_var = "Treatment",
+#'   group_var     = "Site",
+#'   covariates    = bee_covariates,
+#'   ensure_equal_n_replicates = TRUE,
+#'   keep_excluded_data = FALSE,
+#'   standardize = TRUE
+#' )
+#' head(allocated_data)
+#'
+#' @seealso
+#' [`allocate_treatments()`] for the underlying allocation within each treatment using anticlustering.
+#'
+#' @section Required Packages:
+#' This function relies on:
+#' - `anticlust`: for anticlustering optimization (used within `allocate_treatments()`).
+#'
+#' @export
+
+allocate_within_treatments <- function(data_subjects, data_groups, treatment_var, group_var, covariates, 
+                                       ensure_equal_n_replicates = TRUE, keep_excluded_data = FALSE,  
+                                       objective = "variance", method = "local-maximum",
+                                       repetitions = 10, match_within = NULL, standardize = TRUE) {
+  
+  # Helper function to validate arguments
+  validate_args <- function(arg, arg_name) {
+    if (missing(arg) || is.null(arg)) {
+      stop(paste0("Error: The required argument '", arg_name, "' is missing or NULL. Please provide a valid input."))
+    }
+  }
+  
+  # Validate necessary arguments
+  validate_args(data_subjects, "data_subjects")
+  validate_args(data_groups, "data_groups")
+  validate_args(group_var, "group_var")
+  validate_args(covariates, "covariates")
+  
+  if (!treatment_var %in% names(data_subjects)) {
+    stop("The treatment_var column is missing in data_subjects.")
+  }
+  if (!group_var %in% names(data_groups)) {
+    stop("The group_var column is missing in data_groups.")
+  }
+  
+  treatments <- unique(as.character(data_subjects[[treatment_var]]))
+  
+  # Ensure correct data types
+  data_subjects[[treatment_var]] <- as.character(data_subjects[[treatment_var]])
+  data_groups[[group_var]] <- as.factor(as.character(data_groups[[group_var]]))
+  data_groups[[treatment_var]] <- as.character(data_groups[[treatment_var]])
+  
+  # Function to allocate subjects to groups within a single treatment
+  allocate_to_groups_within_treatment <- function(treatment) {
+    sub_group_data <- data_groups[data_groups[[treatment_var]] == treatment, , drop = FALSE]
+    groups <- unique(as.character(sub_group_data[[group_var]]))
+    
+    sub_subject_data <- data_subjects[data_subjects[[treatment_var]] == treatment, , drop = FALSE]
+    
+    allocated_sub_subject_data <- allocate_treatments(
+      data = sub_subject_data,
+      treatments = groups,
+      covariates = covariates,
+      ensure_equal_n_replicates = ensure_equal_n_replicates,
+      keep_excluded_data = keep_excluded_data,
+      objective = objective, 
+      method = method,
+      repetitions = repetitions,
+      match_within = match_within,
+      standardize = standardize
+    ) 
+    
+    # Rename the column that was overwritten by group assignment
+    colnames(allocated_sub_subject_data)[colnames(allocated_sub_subject_data) == treatment_var] <- group_var
+    
+    # Add the original treatment column back
+    allocated_sub_subject_data[[treatment_var]] <- treatment
+    
+    # Merge subject and group data for this treatment
+    merged_data <- merge(sub_group_data, allocated_sub_subject_data, all = TRUE)
+    
+    return(merged_data)
+  }
+  
+  # Apply the allocation for each treatment
+  allocated_data_list <- lapply(treatments, allocate_to_groups_within_treatment)
+  allocated_data <- do.call(rbind, allocated_data_list)
+  
+  return(allocated_data)
+}
\ No newline at end of file