From e91352707705043c3288d0aa16ef2e9065fe289f Mon Sep 17 00:00:00 2001 From: Dimitry Wintermantel Date: Wed, 26 Feb 2025 14:08:48 +0100 Subject: [PATCH 1/2] Initial commit - adding allocate_treatments function that assigns subjects to treatment groups while balancing specified covariates --- R/allocate_treatments.R | 208 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 208 insertions(+) create mode 100644 R/allocate_treatments.R diff --git a/R/allocate_treatments.R b/R/allocate_treatments.R new file mode 100644 index 00000000..7c481ddd --- /dev/null +++ b/R/allocate_treatments.R @@ -0,0 +1,208 @@ +library(anticlust) + +#' Allocate Treatments to Subjects +#' +#' This function assigns subjects to treatment groups while balancing specified covariates and ensuring replicates per group. +#' +#' @param data A data frame containing the subjects to be allocated, including columns for the covariates. +#' @param treatments A vector specifying the treatment group labels. +#' @param treatment_var A string specifying the name of the treatment column to be created. Defaults to `"Treatment"`. +#' @param covariates A vector of column names in `data` representing covariates to balance. +#' @param n_replicates Integer. Specifies the number of replicates per treatment group. If `NULL`, either all subjects are allocated or the highest possible equal number per treatment is allocated depending on `ensure_equal_n_replicates`. Defaults to `NULL`. +#' @param ensure_equal_n_replicates Logical. If `TRUE`, ensures equal numbers of replicates across treatment groups. Defaults to `TRUE`. If `FALSE` all subjects are allocated. +#' @param keep_excluded_data Logical. If `TRUE`, retains subjects that were not allocated within the specified constraints. Defaults to `FALSE`. +#' @param objective A string specifying the objective function for anticlustering. Options include `"variance"` (default), `"diversity"`, `"average-diversity"`, `"kplus"`, and `"dispersion"`. +#' @param method. A string specifying the optimization method for anticlustering. Options include `"local-maximum"` (default), `"exchange"`, `"brusco"`, `"ilp"`, and `"2PML"`. +#' @param repetitions Integer. Specifies the number of times the optimization is repeated when using heuristic methods (`"exchange"`, `"local-maximum"`, `"brusco"`, or `"2PML"`). The best solution is selected. Defaults to `10`. +#' @param match_within A column name in `data` (optional). Specifies a variable within which matching should occur, ensuring that subjects are grouped within subsets defined by this variable. Defaults to `NULL`. +#' @param standardize Logical. If `TRUE`, covariates are standardized via `scale()` before optimization starts. Defaults to `TRUE`. +#' +#' +#' @return A data frame with subjects assigned to treatments. The column name for treatment allocation is specified by `treatment_var`. If `keep_excluded_data = TRUE`, excluded subjects are included in the output with `NA` in the treatment column. +#' +#' @details +#' The function balances covariates among treatment groups by creating groups of similar individuals based on the specified covariates. The treatments are then assigned to these groups in a way that minimizes variance within groups. +#' +#' - If `ensure_equal_n_replicates = TRUE`, the number of replicates per treatment is enforced. +#' - If `keep_excluded_data = TRUE`, subjects that cannot be allocated to treatments are retained in the output. +#' - Covariates can be scaled to standardize their ranges if `standardize = TRUE`. +#' - If `match_within` is specified, subjects are grouped within levels of the specified variable, ensuring allocations respect the structure defined by this variable. +#' +#' @section Validations: +#' The function validates the following conditions: +#' - All required arguments (`data`, `treatments`, `covariates`) are provided. +#' - Covariates exist in the `data` frame and are numeric. +#' - Covariates do not have constant values across all rows. +#' +#' @examples +#' # Example dataset: Bee data +#' set.seed(123) +#' example_bee_data <- data.frame( +#' ID = as.factor(as.character(seq(1, 100, 1))), +#' Bee_count = rnorm(100, mean = 1000, sd = 200), +#' Colony_weight = rnorm(100, mean = 500, sd = 100)) +#' +#' # Example dataset: Site data +#' example_site_data <- data.frame( +#' Site = as.factor(as.character(seq(1, 16, 1))), +#' Organic = as.factor(as.character(c(rep("yes", 4), rep("no", 12)))), +#' Field_quality = rnorm(16, mean = 3, sd = 0.5)) +#' +#' treatments <- c("Control", "Pesticide") +#' bee_covariates <- c("Bee_count", "Colony_weight") +#' +#' # Allocate bee data to treatments +#' allocated_bee_data <- allocate_treatments( +#' example_bee_data, +#' treatments = treatments, +#' covariates = bee_covariates, +#' ensure_equal_n_replicates = TRUE, +#' keep_excluded_data = FALSE) +#' +#' # Allocate site data to treatments while matching within "Organic" +#' allocated_site_data <- allocate_treatments( +#' example_site_data, +#' treatments = treatments, +#' covariates = "Field_quality", +#' ensure_equal_n_replicates = TRUE, +#' keep_excluded_data = FALSE, +#' match_within = "Organic") +#' +#' @section Required Packages: +#' This function requires the following packages: +#' - `anticlust`: for anticlustering optimization. +#' +#' @export +#' + +allocate_treatments <- function(data, treatments, treatment_var = "Treatment", + covariates, n_replicates = NULL, + ensure_equal_n_replicates = TRUE, + keep_excluded_data = FALSE, + objective = "variance", + method = "local-maximum", + repetitions = 10, + match_within = NULL, + standardize = TRUE) { + + # Use validate_args for argument validation + validate_args <- function(arg, arg_name) { + if (missing(arg) || is.null(arg)) { + stop(paste0("Error: The required argument '", arg_name, "' is missing or NULL. Please provide a valid input.")) + } + } + + # Validate required arguments + validate_args(data, "data") + validate_args(treatments, "treatments") + validate_args(covariates, "covariates") + + # Check that specified covariates are present in the data + missing_covariates <- setdiff(covariates, colnames(data)) + if (length(missing_covariates) > 0) { + stop("The following covariates are missing in the data: ", paste(missing_covariates, collapse = ", "), ". Ensure these covariates are included in the dataset.") + } + + # Check if any covariate is not numeric + non_numeric_covariates <- covariates[!sapply(data[, covariates, drop = FALSE], is.numeric)] + if (length(non_numeric_covariates) > 0) { + stop("The following covariates are not numeric: ", paste(non_numeric_covariates, collapse = ", "), ". Convert these covariates to numeric before running the function.") + } + + # Check if any covariate takes the same value for all rows + constant_covariates <- covariates[sapply(data[, covariates, drop = FALSE], function(x) length(unique(x[!is.na(x)])) == 1)] + if (length(constant_covariates) > 0) { + stop("The following covariates take the same value for all observations: ", paste(constant_covariates, collapse = ", "), ". Remove these covariates or provide covariates with variability.") + } + + # Validate match_within + if (!is.null(match_within)) { + if (!match_within %in% colnames(data)) { + stop("The column specified in match_within ('", match_within, "') is not present in the data frame. Please ensure it is a valid column name.") + } + match_within_vector <- data[[match_within]] + } else { + match_within_vector <- NULL + } + + # Shuffle treatments for random assignment + treatments_shuffled <- sample(treatments) + + # Calculate the number of replicates per treatment if not provided + n_treatments <- length(treatments) + + if (is.null(n_replicates)) { + n_replicates <- floor(nrow(data) / n_treatments) + } else { + if (!ensure_equal_n_replicates) { + ensure_equal_n_replicates <- TRUE + warning("'ensure_equal_n_replicates' was changed to TRUE as 'n_replicates' was manually specified.") + } + } + + # Scale covariates if specified + covariate_data <- if (standardize) { + scale(data[, covariates, drop = FALSE]) + } else { + data[, covariates, drop = FALSE] + } + + if (ensure_equal_n_replicates | n_treatments * n_replicates == nrow(data)) { + # Create groups with similar individuals based on covariates + data$set_similar <- matching( + covariate_data, + p = n_treatments, + match_within = match_within_vector + ) + + # Include individuals based on the number of replicates + included <- data[!is.na(data$set_similar) & data$set_similar <= n_replicates, ] + + # Assign individuals of the same set to different treatment groups + included[[treatment_var]] <- anticlustering( + included[, covariates, drop = FALSE], + K = n_treatments, + objective = objective, + method = method, + categories = included$set_similar, + repetitions = repetitions + ) + + included$set_similar <- NULL + + if (keep_excluded_data) { + # Extract data that was excluded from the experiment + excluded <- data[data$set_similar > n_replicates | is.na(data$set_similar), ] + excluded$set_similar <- NULL + excluded[[treatment_var]] <- NA + + # Combine included and excluded data + output <- rbind(included, excluded) + } else { + output <- included + } + } else { + # Directly assign treatments without ensuring equal replicates + data[[treatment_var]] <- anticlustering( + covariate_data, + K = n_treatments, + objective = objective, + method = method, + categories = match_within_vector, + repetitions = repetitions + ) + output <- data + } + + # Map treatment values to shuffled treatments + treatment_map <- setNames(treatments_shuffled, as.character(1:n_treatments)) + output[[treatment_var]] <- unname(treatment_map[as.character(output[[treatment_var]])]) + + # Convert treatment variable to factor + output[[treatment_var]] <- factor(output[[treatment_var]], levels = treatments) + + # Reorder columns to have treatment_var first + output <- output[, c(treatment_var, setdiff(names(output), treatment_var))] + + return(output) +} From d306abacd9a7ac5b3912bdea99f2bae773c4e90c Mon Sep 17 00:00:00 2001 From: Dimitry Wintermantel Date: Tue, 17 Mar 2026 21:08:43 +0100 Subject: [PATCH 2/2] Update allocation function and add second function for multi-level designs --- R/allocate_treatments.R | 19 +++- R/allocate_within_treatments.R | 185 +++++++++++++++++++++++++++++++++ 2 files changed, 200 insertions(+), 4 deletions(-) create mode 100644 R/allocate_within_treatments.R diff --git a/R/allocate_treatments.R b/R/allocate_treatments.R index 7c481ddd..0e008bc6 100644 --- a/R/allocate_treatments.R +++ b/R/allocate_treatments.R @@ -12,8 +12,8 @@ library(anticlust) #' @param ensure_equal_n_replicates Logical. If `TRUE`, ensures equal numbers of replicates across treatment groups. Defaults to `TRUE`. If `FALSE` all subjects are allocated. #' @param keep_excluded_data Logical. If `TRUE`, retains subjects that were not allocated within the specified constraints. Defaults to `FALSE`. #' @param objective A string specifying the objective function for anticlustering. Options include `"variance"` (default), `"diversity"`, `"average-diversity"`, `"kplus"`, and `"dispersion"`. -#' @param method. A string specifying the optimization method for anticlustering. Options include `"local-maximum"` (default), `"exchange"`, `"brusco"`, `"ilp"`, and `"2PML"`. -#' @param repetitions Integer. Specifies the number of times the optimization is repeated when using heuristic methods (`"exchange"`, `"local-maximum"`, `"brusco"`, or `"2PML"`). The best solution is selected. Defaults to `10`. +#' @param method A string specifying the optimization method for anticlustering. Options include `"local-maximum"` (default), `"exchange"`, `"brusco"`, `"ilp"`, and `"2PML"`. +#' @param repetitions Integer. Specifies the number of times the optimization is repeated when using heuristic methods (`"exchange"`, `"local-maximum"`, `"brusco"`, or `"2PML"`). The best solution is selected. Defaults to `100`. #' @param match_within A column name in `data` (optional). Specifies a variable within which matching should occur, ensuring that subjects are grouped within subsets defined by this variable. Defaults to `NULL`. #' @param standardize Logical. If `TRUE`, covariates are standardized via `scale()` before optimization starts. Defaults to `TRUE`. #' @@ -33,6 +33,16 @@ library(anticlust) #' - All required arguments (`data`, `treatments`, `covariates`) are provided. #' - Covariates exist in the `data` frame and are numeric. #' - Covariates do not have constant values across all rows. +#' +#' @section How to cite: +#' If you use this function in academic work, please cite: +#' Wintermantel, D., Osterman, J., Mair, M. M., & Hartig, F. (in preparation). +#' *Equivalence testing in pesticide risk assessment – Evaluation and practical guidance +#' for design, analysis and interpretation*. +#' +#' The treatment allocation implemented here relies on anticlustering functions. +#' Users are encouraged to also cite the `anticlust` package where appropriate +#' (see `citation("anticlust")`). #' #' @examples #' # Example dataset: Bee data @@ -76,12 +86,13 @@ library(anticlust) #' allocate_treatments <- function(data, treatments, treatment_var = "Treatment", - covariates, n_replicates = NULL, + covariates, + n_replicates = NULL, ensure_equal_n_replicates = TRUE, keep_excluded_data = FALSE, objective = "variance", method = "local-maximum", - repetitions = 10, + repetitions = 100, match_within = NULL, standardize = TRUE) { diff --git a/R/allocate_within_treatments.R b/R/allocate_within_treatments.R new file mode 100644 index 00000000..4e742db9 --- /dev/null +++ b/R/allocate_within_treatments.R @@ -0,0 +1,185 @@ +#' Allocate Subjects to Groups *Within* Treatments +#' +#' This function assigns subjects to experimental groups *within each treatment level* while balancing specified covariates and (optionally) ensuring equal replicates per group. Internally, it applies [`allocate_treatments()`] to each treatment subset, using the available groups as the target labels. +#' +#' @param data_subjects A data frame containing the subjects to be allocated. Must include a column named in `treatment_var` as well as the specified `covariates`. +#' @param data_groups A data frame describing the available groups within each treatment. Must include columns named in `group_var` and `treatment_var` indicating which groups belong to which treatment. +#' @param treatment_var A string specifying the name of the treatment column present in both `data_subjects` and `data_groups`. +#' @param group_var A string specifying the name of the group column in `data_groups` to which subjects will be allocated within each treatment (e.g., site, cage, plot). +#' @param covariates A vector of column names in `data_subjects` representing covariates to balance. +#' @param ensure_equal_n_replicates Logical. If `TRUE`, ensures equal numbers of replicates across groups within each treatment (handled inside `allocate_treatments()`). Defaults to `TRUE`. If `FALSE`, all subjects are allocated without enforcing equal replicates. +#' @param keep_excluded_data Logical. If `TRUE`, retains subjects that were not allocated within the specified constraints (as determined by `allocate_treatments()`). Defaults to `FALSE`. +#' @param objective A string specifying the objective function for anticlustering. Options include `"variance"` (default), `"diversity"`, `"average-diversity"`, `"kplus"`, and `"dispersion"`. +#' @param method A string specifying the optimization method for anticlustering. Options include `"local-maximum"` (default), `"exchange"`, `"brusco"`, `"ilp"`, and `"2PML"`. +#' @param repetitions Integer. Specifies the number of times the optimization is repeated when using heuristic methods (`"exchange"`, `"local-maximum"`, `"brusco"`, or `"2PML"`). The best solution is selected. Defaults to `10`. +#' @param match_within A column name in `data_subjects` (optional). Specifies a variable within which matching should occur, ensuring that subjects are grouped within subsets defined by this variable before allocation. Defaults to `NULL`. +#' @param standardize Logical. If `TRUE`, covariates are standardized via `scale()` before optimization starts. Defaults to `TRUE`. +#' +#' @return +#' A data frame containing the merged group information from `data_groups` and the subject allocations from `data_subjects`. The output includes: +#' - the original treatment column `treatment_var`, +#' - the allocated group column `group_var` (created by assigning subjects to groups within each treatment), +#' - all remaining columns from both inputs (joined per treatment). +#' If `keep_excluded_data = TRUE`, excluded subjects are included with `NA` in the group column. +#' +#' @details +#' The function operates treatment-by-treatment: +#' 1. For each treatment level present in `data_subjects[[treatment_var]]`, it identifies the corresponding set of groups in `data_groups` that belong to that treatment. +#' 2. It subsets the subjects to the current treatment and calls [`allocate_treatments()`], passing the available groups as the `treatments` argument, to balance the specified `covariates`. +#' 3. It restores the original treatment column and merges the per-treatment allocations with the corresponding rows in `data_groups`. +#' +#' - If `ensure_equal_n_replicates = TRUE`, the number of replicates per group (within a treatment) is enforced by the internal call to `allocate_treatments()`. +#' - If `keep_excluded_data = TRUE`, subjects that cannot be allocated under the constraints are retained in the output with `NA` in `group_var`. +#' - Covariates can be scaled to standardize their ranges if `standardize = TRUE`. +#' - If `match_within` is specified, subjects are matched within levels of the specified variable prior to allocation, respecting that structure. +#' +#' @section Validations: +#' The function validates the following conditions: +#' - All required arguments (`data_subjects`, `data_groups`, `group_var`, `covariates`) are provided. +#' - `treatment_var` exists in `data_subjects`; `group_var` exists in `data_groups`. +#' - Types are harmonized internally (e.g., factors/characters) to ensure consistent merging and labeling. +#' Additional checks on covariates (presence, numeric type, non-constant) are performed inside [`allocate_treatments()`]. +#' +#' @section How to cite: +#' If you use this function in academic work, please cite: +#' Wintermantel, D., Osterman, J., Mair, M. M., & Hartig, F. (in preparation). +#' *Equivalence testing in pesticide risk assessment – Evaluation and practical guidance +#' for design, analysis and interpretation*. +#' +#' The treatment allocation implemented here relies on anticlustering functions +#' Users are encouraged to also cite the `anticlust` package where appropriate +#' (see `citation("anticlust")`). +#' +#' @examples +#' set.seed(123) +#' +#' # Example dataset: Bee subjects (to be allocated to treatments, then to sites within treatments) +#' example_bee_data <- data.frame( +#' ID = as.factor(as.character(seq(1, 100, 1))), +#' Bee_count = rnorm(100, mean = 1000, sd = 200), +#' Colony_weight = rnorm(100, mean = 500, sd = 100) +#' ) +#' +#' # Example dataset: Sites (groups) – with site-level covariates and a blocking variable +#' example_site_data <- data.frame( +#' Site = as.factor(as.character(seq(1, 16, 1))), +#' Irrigated = as.factor(as.character(c(rep("yes", 4), rep("no", 12)))), +#' Field_quality = rnorm(16, mean = 3, sd = 0.5) +#' ) +#' +#' treatments <- c("Control", "Pesticide") +#' bee_covariates <- c("Bee_count", "Colony_weight") +#' +#' # 1) Allocate subjects to treatments (balancing colony-level covariates) +#' allocated_bee_data <- allocate_treatments( +#' data = example_bee_data, +#' treatments = treatments, +#' covariates = bee_covariates, +#' ensure_equal_n_replicates = TRUE, +#' keep_excluded_data = FALSE +#' ) +#' nrow(allocated_bee_data) +#' +#' # 2) Allocate sites to treatments (balancing site quality and matching within irrigation status) +#' allocated_site_data <- allocate_treatments( +#' data = example_site_data, +#' treatments = treatments, +#' covariates = "Field_quality", +#' ensure_equal_n_replicates = TRUE, +#' keep_excluded_data = FALSE, +#' match_within = "Irrigated" +#' ) +#' nrow(allocated_site_data) +#' +#' # 3) Allocate subjects to specific sites *within each treatment* +#' allocated_data <- allocate_within_treatments( +#' data_subjects = allocated_bee_data, +#' data_groups = allocated_site_data, +#' treatment_var = "Treatment", +#' group_var = "Site", +#' covariates = bee_covariates, +#' ensure_equal_n_replicates = TRUE, +#' keep_excluded_data = FALSE, +#' standardize = TRUE +#' ) +#' head(allocated_data) +#' +#' @seealso +#' [`allocate_treatments()`] for the underlying allocation within each treatment using anticlustering. +#' +#' @section Required Packages: +#' This function relies on: +#' - `anticlust`: for anticlustering optimization (used within `allocate_treatments()`). +#' +#' @export + +allocate_within_treatments <- function(data_subjects, data_groups, treatment_var, group_var, covariates, + ensure_equal_n_replicates = TRUE, keep_excluded_data = FALSE, + objective = "variance", method = "local-maximum", + repetitions = 10, match_within = NULL, standardize = TRUE) { + + # Helper function to validate arguments + validate_args <- function(arg, arg_name) { + if (missing(arg) || is.null(arg)) { + stop(paste0("Error: The required argument '", arg_name, "' is missing or NULL. Please provide a valid input.")) + } + } + + # Validate necessary arguments + validate_args(data_subjects, "data_subjects") + validate_args(data_groups, "data_groups") + validate_args(group_var, "group_var") + validate_args(covariates, "covariates") + + if (!treatment_var %in% names(data_subjects)) { + stop("The treatment_var column is missing in data_subjects.") + } + if (!group_var %in% names(data_groups)) { + stop("The group_var column is missing in data_groups.") + } + + treatments <- unique(as.character(data_subjects[[treatment_var]])) + + # Ensure correct data types + data_subjects[[treatment_var]] <- as.character(data_subjects[[treatment_var]]) + data_groups[[group_var]] <- as.factor(as.character(data_groups[[group_var]])) + data_groups[[treatment_var]] <- as.character(data_groups[[treatment_var]]) + + # Function to allocate subjects to groups within a single treatment + allocate_to_groups_within_treatment <- function(treatment) { + sub_group_data <- data_groups[data_groups[[treatment_var]] == treatment, , drop = FALSE] + groups <- unique(as.character(sub_group_data[[group_var]])) + + sub_subject_data <- data_subjects[data_subjects[[treatment_var]] == treatment, , drop = FALSE] + + allocated_sub_subject_data <- allocate_treatments( + data = sub_subject_data, + treatments = groups, + covariates = covariates, + ensure_equal_n_replicates = ensure_equal_n_replicates, + keep_excluded_data = keep_excluded_data, + objective = objective, + method = method, + repetitions = repetitions, + match_within = match_within, + standardize = standardize + ) + + # Rename the column that was overwritten by group assignment + colnames(allocated_sub_subject_data)[colnames(allocated_sub_subject_data) == treatment_var] <- group_var + + # Add the original treatment column back + allocated_sub_subject_data[[treatment_var]] <- treatment + + # Merge subject and group data for this treatment + merged_data <- merge(sub_group_data, allocated_sub_subject_data, all = TRUE) + + return(merged_data) + } + + # Apply the allocation for each treatment + allocated_data_list <- lapply(treatments, allocate_to_groups_within_treatment) + allocated_data <- do.call(rbind, allocated_data_list) + + return(allocated_data) +} \ No newline at end of file