From 78572b014c1015ebb10726f4df4d7cb5289803b6 Mon Sep 17 00:00:00 2001 From: donishadsmith Date: Sat, 21 Mar 2026 00:07:19 -0400 Subject: [PATCH] Replace list output with R6 classes --- .pre-commit-config.yaml | 4 +- CHANGELOG.md | 6 + DESCRIPTION | 9 +- NAMESPACE | 10 +- R/append_param_keys.R | 2 +- R/categorical_encoding.R | 76 +- R/{classCV.R => class_cv.R} | 1073 +++++++++-------- R/constants.R | 265 ++-- R/curves.R | 126 -- R/curves_utils.R | 266 ++-- R/error_handling.R | 278 ++--- R/genFolds.R | 126 -- R/plot.vswift.R | 96 -- R/plot_utils.R | 63 +- R/print.vswift.R | 83 -- R/print_utils.R | 386 +++--- R/r6_classes.R | 856 +++++++++++++ R/sampling.R | 53 + R/utils.R | 425 ++++--- R/validation.R | 4 +- README.md | 869 ++----------- ...eme_gradient_boosting_cv_precision_ad..png | Bin 3780 -> 3725 bytes ...xtreme_gradient_boosting_cv_recall_ad..png | Bin 3755 -> 3699 bytes ...ient_boosting_train_test_precision_ad..png | Bin 4373 -> 4369 bytes ...radient_boosting_train_test_recall_ad..png | Bin 4338 -> 4338 bytes .../naivebayes_cv_precision_recall_curve.png | Bin 5540 -> 5389 bytes assets/curves/naivebayes_cv_roc_curve.png | Bin 6377 -> 6204 bytes ...ayes_train_test_precision_recall_curve.png | Bin 5169 -> 5041 bytes .../naivebayes_train_test_roc_curve.png | Bin 6209 -> 6074 bytes ...gularized_logistic_regression_cv_F1_No.png | Bin 3579 -> 0 bytes ...ularized_logistic_regression_cv_F1_Yes.png | Bin 3592 -> 0 bytes ...ed_logistic_regression_cv_Precision_No.png | Bin 3772 -> 0 bytes ...d_logistic_regression_cv_Precision_Yes.png | Bin 3768 -> 0 bytes ...rized_logistic_regression_cv_Recall_No.png | Bin 3716 -> 0 bytes ...ized_logistic_regression_cv_Recall_Yes.png | Bin 3735 -> 0 bytes ..._regression_cv_classification_accuracy.png | Bin 3799 -> 3746 bytes ...gularized_logistic_regression_cv_f1_No.png | Bin 0 -> 3527 bytes ...ularized_logistic_regression_cv_f1_Yes.png | Bin 0 -> 3539 bytes ...ed_logistic_regression_cv_precision_No.png | Bin 0 -> 3720 bytes 
...d_logistic_regression_cv_precision_Yes.png | Bin 0 -> 3716 bytes ...rized_logistic_regression_cv_recall_No.png | Bin 0 -> 3665 bytes ...ized_logistic_regression_cv_recall_Yes.png | Bin 0 -> 3683 bytes inst/WORDLIST | 171 +-- man/CurveResult.Rd | 242 ++++ man/Vswift.Rd | 755 ++++++++++++ man/{classCV.Rd => class_cv.Rd} | 285 +++-- man/genFolds.Rd | 62 - man/plot.vswift.Rd | 81 -- man/prCurve.Rd | 91 -- man/print.vswift.Rd | 64 - man/rocCurve.Rd | 91 -- .../{tests_classCV.R => tests_class_cv.R} | 239 ++-- tests/testthat/tests_genFolds.R | 39 - .../{tests_plot.vswift.R => tests_plot.R} | 6 +- .../{tests_prCurve.R => tests_pr_curve.R} | 348 +++--- .../{tests_print.vswift.R => tests_print.R} | 46 +- .../{tests_rocCurve.R => tests_roc_curve.R} | 348 +++--- tests/testthat/utils.R | 13 +- vignettes/vswift-intro.Rmd | 163 ++- 59 files changed, 4403 insertions(+), 3717 deletions(-) rename R/{classCV.R => class_cv.R} (54%) delete mode 100644 R/curves.R delete mode 100644 R/genFolds.R delete mode 100644 R/plot.vswift.R delete mode 100644 R/print.vswift.R create mode 100644 R/r6_classes.R create mode 100644 R/sampling.R delete mode 100644 assets/thyroid/regularized_logistic_regression_cv_F1_No.png delete mode 100644 assets/thyroid/regularized_logistic_regression_cv_F1_Yes.png delete mode 100644 assets/thyroid/regularized_logistic_regression_cv_Precision_No.png delete mode 100644 assets/thyroid/regularized_logistic_regression_cv_Precision_Yes.png delete mode 100644 assets/thyroid/regularized_logistic_regression_cv_Recall_No.png delete mode 100644 assets/thyroid/regularized_logistic_regression_cv_Recall_Yes.png create mode 100644 assets/thyroid/regularized_logistic_regression_cv_f1_No.png create mode 100644 assets/thyroid/regularized_logistic_regression_cv_f1_Yes.png create mode 100644 assets/thyroid/regularized_logistic_regression_cv_precision_No.png create mode 100644 assets/thyroid/regularized_logistic_regression_cv_precision_Yes.png create mode 100644 
assets/thyroid/regularized_logistic_regression_cv_recall_No.png create mode 100644 assets/thyroid/regularized_logistic_regression_cv_recall_Yes.png create mode 100644 man/CurveResult.Rd create mode 100644 man/Vswift.Rd rename man/{classCV.Rd => class_cv.Rd} (50%) delete mode 100644 man/genFolds.Rd delete mode 100644 man/plot.vswift.Rd delete mode 100644 man/prCurve.Rd delete mode 100644 man/print.vswift.Rd delete mode 100644 man/rocCurve.Rd rename tests/testthat/{tests_classCV.R => tests_class_cv.R} (67%) delete mode 100644 tests/testthat/tests_genFolds.R rename tests/testthat/{tests_plot.vswift.R => tests_plot.R} (74%) rename tests/testthat/{tests_prCurve.R => tests_pr_curve.R} (69%) rename tests/testthat/{tests_print.vswift.R => tests_print.R} (75%) rename tests/testthat/{tests_rocCurve.R => tests_roc_curve.R} (70%) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6b9b3d6..42d5b9a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ # R specific hooks: https://github.com/lorenzwalthert/precommit repos: - repo: https://github.com/lorenzwalthert/precommit - rev: v0.4.3.9017 + rev: v0.4.3.9020 hooks: - id: style-files args: [--style_pkg=styler, --style_fun=tidyverse_style] @@ -40,7 +40,7 @@ repos: - id: parsable-R - id: no-browser-statement - id: no-print-statement - exclude: '.*tests_print.*' + exclude: '.*tests_print.*|.*s3methods.*|.*print\.vswift.*' - id: no-debug-statement - id: deps-in-desc - id: pkgdown diff --git a/CHANGELOG.md b/CHANGELOG.md index 948c9b5..10adbf3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,12 @@ All notable future changes to vswift will be documented in this file. noted in the changelog (i.e new functions or parameters, changes in parameter defaults or function names, etc). - *.patch* : Contains no new features, simply fixes any identified bugs. 
+## [0.6.0] - 2026-03-21 +### 🚀 New/Added +- Returns R6 classes +- Deleted `genFolds` +- Change all methods from camelcase to snakecase + ## [0.5.0.9006] - 2025-08-15 ### 🐛 Fixes - Correct misspelling of youden in docs and list components diff --git a/DESCRIPTION b/DESCRIPTION index 9b168d0..7ebd28e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: vswift Title: Classification Model Evaluation -Version: 0.5.0.9006 -Date: 2025-08-15 +Version: 0.6.0 +Date: 2026-03-21 Authors@R: person(given = "Donisha", family = "Smith", role = c("aut", "cre"), @@ -25,7 +25,8 @@ Imports: future.apply, kknn, glmnet, - data.table + data.table, + R6 Suggests: testthat, covr, @@ -34,5 +35,5 @@ Suggests: VignetteBuilder: knitr Encoding: UTF-8 -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 Config/testthat/edition: 3 diff --git a/NAMESPACE b/NAMESPACE index 1e84e1c..47d9368 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,13 +1,11 @@ # Generated by roxygen2: do not edit by hand -S3method(plot,vswift) -S3method(print,vswift) -export(classCV) +export(CurveResult) +export(Vswift) +export(class_cv) export(contr.dummy) export(contr.ordinal) -export(genFolds) -export(prCurve) -export(rocCurve) +importFrom(R6,R6Class) importFrom(data.table,":=") importFrom(data.table,.SD) importFrom(data.table,data.table) diff --git a/R/append_param_keys.R b/R/append_param_keys.R index 9c7592e..0272994 100644 --- a/R/append_param_keys.R +++ b/R/append_param_keys.R @@ -1,5 +1,5 @@ # Function for appending missing keys to parameters that are lists -.append_param_keys <- function(param, struct, models = NULL, caller = "classCV", ...) { +.append_param_keys <- function(param, struct, models = NULL, ...) 
{ # Evaluate to get default keys for specific parameters default_keys <- eval(as.list(.DEFAULT_KEYS[[2]])[[param]]) diff --git a/R/categorical_encoding.R b/R/categorical_encoding.R index 55ad763..fa0d6e1 100644 --- a/R/categorical_encoding.R +++ b/R/categorical_encoding.R @@ -1,38 +1,38 @@ -# Create dictionary for target variable if needed for certain algos -.create_dictionary <- function(target_vector, threshold = NULL, alternate_warning = FALSE, curve_method = NULL) { - counter <- 0 - new_classes <- c() - class_dict <- list() - - for (class in names(table(target_vector))) { - new_classes <- c(new_classes, paste(class, "=", counter, collapse = " ")) - class_dict[[class]] <- counter - counter <- counter + 1 - } - - standard_msg <- sprintf( - "due to %s being specified", ifelse(is.null(threshold), "'logistic' or 'xgboost'", "`model_params$threshold`") - ) - - msg <- if (!alternate_warning) standard_msg else sprintf("for `%sCurve`", curve_method) - - warning(sprintf( - "creating keys for target variable %s;\n classes are now encoded: %s", - msg, paste(new_classes, collapse = ", ") - )) - - return(class_dict) -} - -# Helper function to convert keys -.convert_keys <- function(target_vector, keys, direction) { - if (direction == "encode") { - labels <- sapply(target_vector, function(x) keys[[x]]) - } else { - converted_keys <- as.list(names(keys)) - names(converted_keys) <- as.character(as.vector(unlist(keys))) - labels <- sapply(target_vector, function(x) converted_keys[[as.character(x)]]) - } - - return(labels) -} +# Create dictionary for target variable if needed for certain algos +.create_dictionary <- function(target_vector, threshold = NULL, alternate_warning = FALSE, curve_method = NULL) { + counter <- 0 + new_classes <- c() + class_dict <- list() + + for (class in names(table(target_vector))) { + new_classes <- c(new_classes, paste(class, "=", counter, collapse = " ")) + class_dict[[class]] <- counter + counter <- counter + 1 + } + + standard_msg <- sprintf( + 
"due to %s being specified", ifelse(is.null(threshold), "'logistic' or 'xgboost'", "`model_params$threshold`") + ) + + msg <- if (!alternate_warning) standard_msg else sprintf("for `%s_curve`", curve_method) + + warning(sprintf( + "creating keys for target variable %s;\n classes are now encoded: %s", + msg, paste(new_classes, collapse = ", ") + )) + + return(class_dict) +} + +# Helper function to convert keys +.convert_keys <- function(target_vector, keys, direction) { + if (direction == "encode") { + labels <- sapply(target_vector, function(x) keys[[x]]) + } else { + converted_keys <- as.list(names(keys)) + names(converted_keys) <- as.character(as.vector(unlist(keys))) + labels <- sapply(target_vector, function(x) converted_keys[[as.character(x)]]) + } + + return(labels) +} diff --git a/R/classCV.R b/R/class_cv.R similarity index 54% rename from R/classCV.R rename to R/class_cv.R index 00cea32..7737c42 100644 --- a/R/classCV.R +++ b/R/class_cv.R @@ -1,486 +1,587 @@ -#' Perform Train-Test Splitting and/or Cross-Validation on Classification Data -#' -#' @name classCV -#' -#' @description Performs train-test splitting and/or cross-validation on classification data using various -#' classification algorithms. -#' -#' @param data A data frame. -#' -#' @param formula A formula specifying the model to use. This argument cannot be used when \code{target} -#' (and optionally \code{predictors}) is specified. Default is \code{NULL}. -#' -#' @param target The name or numerical index of the target (response) variable in \code{data}. This argument cannot be -#' used when \code{formula} is specified. Default is \code{NULL}. -#' -#' @param predictors A vector of variable names or numerical indices indicating the predictors in \code{data}, -#' used in conjunction with \code{target}. Default is \code{NULL}. -#' -#' @param models A character string or a character vector specifying the classification algorithm(s) to use. 
-#' The following options are available: -#' \itemize{ -#' \item \code{"lda"}: Linear Discriminant Analysis -#' \item \code{"qda"}: Quadratic Discriminant Analysis -#' \item \code{"logistic"}: Unregularized Logistic Regression -#' \item \code{"regularized_logistic"}: Regularized Logistic Regression -#' \item \code{"svm"}: Support Vector Machine -#' \item \code{"naivebayes"}: Naive Bayes -#' \item \code{"nnet"}: Neural Network -#' \item \code{"knn"}: K-Nearest Neighbors -#' \item \code{"decisiontree"}: Decision Tree -#' \item \code{"randomforest"}: Random Forest -#' \item \code{"multinom"}: Unregularized Multinomial Logistic Regression -#' \item \code{"regularized_multinomial"}: Regularized Multinomial Logistic Regression -#' \item \code{"xgboost"}: Extreme Gradient Boosting -#' } -#' \strong{Notes:} -#' \itemize{ -#' \item \code{"knn"}: The \code{ks} parameter should be set to specify the desired value of \emph{k}, ensuring that -#' the same value is used in all folds. If \code{ks} is not provided, the optimal \emph{k} is automatically selected -#' using the \pkg{kknn} package. -#' \item \code{"nnet"}: An additional argument \code{size} must be specified. -#' \item \code{"regularized_logistic"} and \code{"regularized_multinomial"}: If \code{"lambda"} is specified in the -#' additional arguments and is a vector of length > 1, then internal nested cross-validation is done on the training -#' set to determine the optimal lambda value. The number of folds for the nested cross-validation can be -#' specified by using \code{nfolds} in the additional arguments. If \code{"stratified"} is \code{TRUE}, then the -#' relative proportions of the classes in the training set will be retained in each fold. -#' \item \code{"xgboost"}: The following \code{objective} functions are supported: -#' \code{"reg:logistic"}, \code{"binary:logistic"}, \code{"binary:logitraw"}, \code{"binary:hinge"}, -#' and \code{"multi:softprob"}. 
-#' } -#' -#' @param model_params A list that can include the following elements: -#' \itemize{ -#' \item \code{"map_args"}: A list of named sub-lists used when more than one model is specified in \code{models}. -#' Each sub-list corresponds to a particular model in the \code{models}] parameter and contains the arguments that will -#' be passed to that model. Default is \code{NULL}. Refer to the "Additional Model Parameters" section for acceptable -#' arguments. -#' \item \code{"threshold"}: A numeric value in the interval [0, 1] that serves as the cutoff value for assigning binary targets. -#' Observations are assigned to the class coded as "1" if \code{P(Class = 1 | Features) >= threshold}; otherwise, they -#' are assigned to the class coded as "0". A default threshold of 0.5 is used when \code{"logistic"} is included in -#' \code{models}, or when \code{"xgboost"} is included in \code{models} with one of these objective functions: -#' \code{"reg:logistic"}, \code{"binary:logistic"}, or \code{"binary:logitraw"}. If \code{NULL}, the remaining models -#' will use there respective default assignment methods (maximizing the posterior probability). Default is \code{NULL}. -#' \item \code{"rule"}: A character that dictates the rule used to select the optimal lambda when using -#' \code{regularized_logistic} or \code{"regularized_multinomial"}. Available options are: \code{"min"} or -#' \code{"1se"}. Default is \code{"min"}. -#' \item \code{verbose}: A logical value indicating whether to state the optimal lambda based on the nested -#' cross-validation. \item \code{"final_model"}: A logical value indicating whether to use all complete observations -#' in the input data for model training. Default is \code{FALSE}. -#' } -#' -#' @param train_params A list that can contain the following parameters: -#' \itemize{ -#' \item \code{split}: A numeric value between 0 and 1 indicating the proportion of data to use -#' for training. 
The remaining observations are allocated to the test set. If not specified or set to \code{NULL}, no -#' train-test splitting is performed. Note that this split is separate from cross-validation. Default is \code{NULL}. -#' \item \code{n_folds}: An integer greater than 2 specifying the number of folds for cross-validation. If \code{NULL}, -#' no cross-validation is performed. Default is \code{NULL}. -#' \item \code{stratified}: A logical value indicating whether stratified sampling should be used during splitting. -#' Default is \code{FALSE}. -#' \item \code{random_seed}: A numeric value for the random seed to ensure reproducibility of random splitting and any -#' model training that relies on random starts. Default is \code{NULL}. -#' \item \code{standardize}: A logical or a numeric/character vector. If \code{TRUE}, all numeric columns -#' (except the target) are standardized by computing the mean and standard deviation from the training subset and -#' applying them to both the training and test/validation sets. This prevents data leakage. A vector of column indices -#' or names can also be provided to only standardize specific columns. -#' \item \code{remove_obs}: A logical value indicating whether to remove observations in the test/validation set that -#' contain levels of categorical predictors not seen in the training data. Some algorithms may produce errors when -#' encountering such levels in the validation data during prediction. Default is \code{FALSE}. -#' } -#' -#' @param impute_params A list defining how to handle missing values among predictors/features. During imputation, the -#' target variable is excluded from both training and test/validation sets. Prior to imputation, unlabeled data -#' (observations with missing targets) are removed, and any specified train-test split or cross-validation folds are -#' created. A separate imputation model is then generated for each training subset (one for the train-test split and -#' one per fold). 
Each imputation model is applied to both its corresponding training and test/validation subsets to -#' minimize data leakage. Note that numerical columns are automatically standardized (regardless of -#' \code{train_params$standardize}) before imputation occurs. The \pkg{recipes} package is used for imputation. The -#' following parameters are available: -#' \itemize{ -#' \item \code{method}: A character specifying the imputation method. Options include: -#' \itemize{ -#' \item \code{"impute_bag"}: Bagged Trees Imputation -#' \item \code{"impute_knn"}: K-Nearest Neighbors Imputation -#' } -#' Default is \code{NULL}. -#' \item \code{args}: A list of additional arguments for the chosen imputation method. -#' \itemize{ -#' \item \code{"impute_bag"}: \code{trees}, \code{seed_val} -#' \item \code{"impute_knn"}: \code{neighbors} -#' } -#' For more details about these arguments, consult the \pkg{recipes} documentation. Default is \code{NULL}. -#' } -#' -#' @param save A list that may include the following: -#' \itemize{ -#' \item \code{models}: A logical value indicating whether to save the trained models (including imputation models) -#' used for train-test splits or cross-validation. Default is \code{FALSE}. -#' \item \code{data}: A logical value indicating whether to save all training and test/validation sets used during -#' train-test splitting and/or cross-validation. Default is \code{FALSE}. -#' } -#' -#' @param parallel_configs A list that may include the following: -#' \itemize{ -#' \item \code{n_cores}: A numeric value specifying the number of cores for parallel processing. Default is \code{NULL}. -#' \item \code{future.seed}: A numeric value indicating the seed to use with \pkg{future} for parallel processing. -#' } -#' -#' @param ... Additional arguments for the chosen classification algorithm. 
These arguments serve as an alternative to -#' specifying model-specific parameters in \code{model_params$map_args} when only a single model is specified in -#' \code{models}. If multiple models are specified, then \code{map_args} must be used. Refer to each algorithm's -#' documentation for details on additional arguments. -#' -#' @section Additional Model Parameters: -#' Each element in \code{models} accepts arguments specific to its underlying classification algorithm. Refer to the -#' original package documentation for more information about these arguments. Further details on the external package -#' functions used for each model are provided in the "Package Dependencies" section. -#' The available arguments for each \code{models} are: -#' \itemize{ -#' \item \code{"lda"}: \code{prior}, \code{method}, \code{nu}, \code{tol} -#' \item \code{"qda"}: \code{prior}, \code{method}, \code{nu} -#' \item \code{"logistic"}: \code{weights}, \code{singular.ok}, \code{maxit} -#' \item \code{"regularized_logistic"}: \code{"alpha"}, \code{"lambda"}, \code{"penalty.factor"}, \code{"maxit"}, -#' \code{"thresh"}, \code{"nfolds"} -#' \item \code{"svm"}: \code{kernel}, \code{degree}, \code{gamma}, \code{cost}, \code{nu}, \code{class.weights}, -#' \code{shrinking}, \code{epsilon}, \code{tolerance}, \code{cachesize} -#' \item \code{"naivebayes"}: \code{prior}, \code{laplace}, \code{usekernel}, \code{usepoisson} -#' \item \code{"nnet"}: \code{size}, \code{rang}, \code{decay}, \code{maxit}, \code{softmax}, \code{entropy}, -#' \code{abstol}, \code{reltol}, \code{Hess}, \code{skip} -#' \item \code{"knn"}: \code{kmax}, \code{ks}, \code{distance}, \code{kernel} -#' \item \code{"decisiontree"}: \code{parms}, \code{control}, \code{cost} -#' \item \code{"randomforest"}: \code{weights}, \code{ntree}, \code{mtry}, \code{nodesize}, \code{importance}, -#' \code{localImp}, \code{nPerm}, \code{proximity}, \code{keep.forest}, \code{norm.votes} -#' \item \code{"multinom"}: \code{Hess} -#' \item 
\code{"regularized_multinomial"}: \code{"alpha"}, \code{"lambda"}, \code{"penalty.factor"}, \code{"maxit"}, -#' \code{"thresh"}, \code{"nfolds"} -#' \item \code{"xgboost"}: \code{params}, \code{nrounds}, \code{print_every_n}, \code{feval}, \code{verbose}, -#' \code{early_stopping_rounds}, \code{obj}, \code{save_period}, \code{save_name} -#' } -#' -#' @section Package Dependencies: -#' Each option of \code{models} uses the following function from the specified packages: -#' \itemize{ -#' \item \code{"lda"}: \code{lda} from \pkg{MASS} package -#' \item \code{"qda"}: \code{qda} from \pkg{MASS} package -#' \item \code{"logistic"}: \code{glm} from \pkg{base} package with \code{family = "binomial"} -#' \item \code{"regularized_logistic"}: \code{glmnet} from \pkg{glmnet} package with \code{family = "binomial"} and using -#' \code{cv.glmnet} to select the optimal lambda. -#' \item \code{"svm"}: \code{svm()} from \pkg{e1071} package -#' \item \code{"naivebayes"}: \code{naive_bayes} from \pkg{naivebayes} package -#' \item \code{"nnet"}: \code{nnet} from \pkg{nnet} package -#' \item \code{"knn"}: \code{train.kknn} from \pkg{kknn} package -#' \item \code{"decisiontree"}: \code{rpart} from \pkg{rpart} package -#' \item \code{"randomforest"}: \code{randomForest} from \pkg{randomForest} package -#' \item \code{"multinom"}: \code{multinom} from \pkg{nnet} package -#' \item \code{"regularized_logistic"}: \code{glmnet} from \pkg{glmnet} package with \code{family = "multinomial"} and using -#' \code{cv.glmnet} to select the optimal lambda. -#' \item \code{"xgboost"}: \code{xgb.train} from \pkg{xgboost} package -#' } -#' -#' @return A list (vswift object) containing: -#' \itemize{ -#' \item Any train-test split or cross-validation results (if specified). -#' \item Performance metrics. -#' \item Class distribution details for the training set, test set, and folds (if applicable). -#' \item Saved models (if requested). -#' \item Saved datasets (if requested). 
-#' \item A final model (if requested). -#' } -#' -#' @seealso \code{\link{print.vswift}}, \code{\link{plot.vswift}} -#' -#' @examples -#' # Load an example dataset -#' data(iris) -#' -#' # Perform a train-test split with an 80% training set using LDA -#' result <- classCV( -#' data = iris, -#' target = "Species", -#' models = "lda", -#' train_params = list(split = 0.8) -#' ) -#' -#' # Print parameters and metrics -#' result -#' -#' # Perform 5-fold cross-validation using Extreme Gradient Boosting -#' # w/ additional parameters: params & nrounds -#' result <- classCV( -#' data = iris, -#' formula = Species ~ ., -#' models = "xgboost", -#' train_params = list(n_folds = 5, random_seed = 123), -#' params = list( -#' objective = "multi:softprob", -#' num_class = 3, -#' eta = 0.3, -#' max_depth = 6 -#' ), -#' nrounds = 10 -#' ) -#' -#' # Print parameters and metrics -#' result -#' -#' -#' # Perform 5-fold cross-validation a train-test split with multiple models -#' map_args <- list("knn" = list(ks = 5), "nnet" = list(size = 20)) -#' result <- classCV( -#' data = iris, -#' target = 5, -#' predictors = c(1:3), -#' models = c("decisiontree", "knn", "nnet", "svm"), -#' model_params = list(map_args = map_args), -#' train_params = list( -#' n_folds = 5, -#' stratified = TRUE, -#' random_seed = 123 -#' ) -#' ) -#' -#' # Print parameters and metrics -#' result -#' -#' @author Donisha Smith -#' -#' @importFrom stats as.formula complete.cases glm model.matrix predict sd -#' @importFrom data.table := data.table .SD -#' -#' @export -classCV <- function(data, - formula = NULL, - target = NULL, - predictors = NULL, - models, - model_params = list( - "map_args" = NULL, "threshold" = NULL, "rule" = "min", "verbose" = TRUE, - "final_model" = FALSE - ), - train_params = list( - "split" = NULL, "n_folds" = NULL, "stratified" = FALSE, - "random_seed" = NULL, "standardize" = FALSE, "remove_obs" = FALSE - ), - impute_params = list("method" = NULL, "args" = NULL), - save = list("models" = 
FALSE, "data" = FALSE), - parallel_configs = list("n_cores" = NULL, "future.seed" = NULL), - ...) { - # Ensure model type is lowercase - if (!is.null(models)) models <- tolower(models) - - # Ensure model types are unique - models <- unique(models) - - # Append arguments; append missing so that default arguments appear in the output list and in order - model_params <- .append_param_keys("model_params", model_params, models, ...) - train_params <- .append_param_keys("train_params", train_params) - impute_params <- .append_param_keys("impute_params", impute_params) - save <- .append_param_keys("save", save) - parallel_configs <- .append_param_keys("parallel_configs", parallel_configs) - - # Checking if inputs are valid - .error_handling( - data = data, formula = formula, target = target, predictors = predictors, models = models, - model_params = model_params, train_params = train_params, impute_params = impute_params, - save = save, parallel_configs = parallel_configs, caller = "classCV" - ) - - # Get character form of target and predictor variables - vars <- .get_var_names(formula, target, predictors, data) - - # Get information on unlabeled data and labeled data with missing features - missing_info <- .missing_summary(data, vars$target) - - # Ensure data row names have an enforced order - rownames(data) <- seq(nrow(data)) - - # Clean data; Unlabeled data dropped and labeled missing data dropped if imputation is not requested - clean_outputs <- .clean_data(data, missing_info, !is.null(impute_params$method)) - preprocessed_data <- clean_outputs$cleaned_data - perform_imputation <- clean_outputs$perform_imputation - - # Ensure target is factored and get all levels of character columns obtained if svm in models - factored <- .convert_to_factor(preprocessed_data, vars$target, models, train_params$remove_obs) - preprocessed_data <- factored$data - col_levels <- factored$col_levels - - # Store information - final_output <- .store_parameters( - formula, missing_info, 
preprocessed_data, vars, models, model_params, train_params, - impute_params, save, parallel_configs - ) - - # Create class dictionary - if (any(models %in% c("logistic", "xgboost")) || !is.null(model_params$threshold)) { - final_output$class_summary$keys <- .create_dictionary(preprocessed_data[, vars$target], model_params$threshold) - } - - # Sampling data - if (!is.null(train_params$split) || !is.null(train_params$n_folds)) { - # Initialize list to store sample indices - final_output$data_partitions <- list() - final_output <- .sampling(preprocessed_data, train_params, vars$target, final_output) - # Create the empty dataframes for metrics - final_output$metrics <- .expand_dataframe(train_params, models, final_output$class_summary$classes) - } - - # Generate vector for iteration - iters <- .gen_iterations(train_params, model_params) - - # Obtain imputation model, if imputation requested - if (!is.null(impute_params$method) && perform_imputation) { - impute_models <- list() - for (i in iters) { - if (i != "final") { - test_indices <- .get_indices(final_output$data_partitions$indices, i) - df_list <- .partition(preprocessed_data, test_indices) - impute_models[[i]] <- .impute_prep( - train = df_list$train, vars = vars, impute_params = impute_params - ) - } else { - impute_models[[i]] <- .impute_prep( - preprocessed_data = preprocessed_data, vars = vars, - impute_params = impute_params - ) - } - } - } - - # Create kwargs - if (!is.null(train_params$split) || !is.null(train_params$n_folds)) { - kwargs <- list( - preprocessed_data = preprocessed_data, - formula = final_output$configs$formula, - model_params = model_params, - vars = vars, - train_params = train_params, - col_levels = col_levels, - class_summary = final_output$class_summary, - save_mods = save$models, - met_df = final_output$metrics, - indices = final_output$data_partitions$indices, - impute_models = if (exists("impute_models")) impute_models[!names(impute_models) == "final"] else NULL - ) - } - - # 
Iterate to obtain validation metrics, training models, and final model for each algo - for (model in models) { - if (exists("kwargs")) { - if (is.null(parallel_configs$n_cores) || parallel_configs$n_cores <= 1) { - kwargs$iters <- iters[!iters == "final"] - kwargs$model <- model - train_out <- .sequential(kwargs) - } else { - kwargs$model <- model - train_out <- .parallel(kwargs, parallel_configs, iters[!iters == "final"]) - } - - # Add metrics information and model information - if ("split" %in% iters) { - final_output$metrics[[model]]$split <- train_out$metrics$split - train_out$metrics <- train_out$metrics[!names(train_out$metrics) == "split"] - } - - if (!is.null(train_params$n_folds)) { - cv_df <- .merge_df( - iters[!iters %in% c("split", "final")], - train_out$metrics$cv, - final_output$metrics[[model]]$cv - ) - - final_output$metrics[[model]]$cv <- .get_desc(cv_df, train_params$n_folds) - } - - if ("models" %in% names(train_out)) final_output$models[[model]] <- train_out$models - - if ("optimal_lambdas" %in% names(train_out)) final_output$metrics[[model]]$optimal_lambdas <- train_out$optimal_lambdas - } - - # Generate final model - if ("final" %in% iters) { - preproc_kwargs <- list() - - if (exists("impute_models") && "final" %in% names(impute_models)) { - preproc_kwargs$prep <- impute_models$final - } - - if (!is.null(preproc_kwargs$impute) || isTRUE(train_params$standardize)) { - preproc_kwargs <- c(preproc_kwargs, list("vars" = vars, "standardize" = train_params$standardize)) - preprocessed_data <- .prep_data(preprocessed_data = preprocessed_data, preproc_kwargs = preproc_kwargs) - } - - # Generate model depending on chosen models - if (startsWith(model, "regularized")) { - final_out <- .regularized( - id = "Final Model", - model = model, - vars = vars, - data = preprocessed_data, - add_args = model_params$mod_args, - random_seed = train_params$random_seed, - stratified = if (is.null(train_params$stratified)) FALSE else train_params$stratified, - rule = 
if (is.null(model_params$rule)) "min" else model_params$rule, - verbose = if (is.null(model_params$verbose)) TRUE else model_params$verbose - ) - - if ("optimal_lambda" %in% names(final_out)) { - vec <- c("final" = final_out$optimal_lambda) - final_output$metrics[[model]]$optimal_lambdas <- c(final_output$metrics[[model]]$optimal_lambdas, vec) - final_out$optimal_lambda <- NULL - } - - final_output$models[[model]]$final <- final_out - } else { - final_output$models[[model]]$final <- .generate_model( - model = model, - formula = final_output$configs$formula, - vars = vars, - data = preprocessed_data, - add_args = model_params$mod_args, - random_seed = train_params$random_seed - ) - } - } - } - - # Save data - if (isTRUE(save$data)) { - if (exists("kwargs")) { - for (i in iters[!iters == "final"]) { - test_indices <- .get_indices(kwargs$indices, i) - # Get training and validation data - df_list <- .partition(kwargs$preprocessed_data, test_indices) - # Prep data - df_list <- .prep_data(i, df_list$train, df_list$test, kwargs) - - # Store data - if (i == "split") { - final_output$data_partitions$dataframes$split <- df_list - } else { - final_output$data_partitions$dataframes$cv[[i]] <- df_list - } - } - } - - # Will already be standardized and imputed - if ("final" %in% iters) final_output$data_partitions$dataframes$preprocessed_data <- preprocessed_data - } - - # Save imputation models - if (save$models && exists("impute_models")) { - for (i in names(impute_models)) { - if (i %in% c("split", "final")) { - id <- ifelse(i == "final", "preprocessed_data", i) - final_output$imputation_models[[id]] <- impute_models[[i]] - } else { - final_output$imputation_models$cv[[i]] <- impute_models[[i]] - } - } - } - - # Make list a vswift class - class(final_output) <- "vswift" - - return(final_output) -} +#' Perform Train-Test Splitting and/or Cross-Validation on Classification Data +#' +#' @name class_cv +#' +#' @description Performs train-test splitting and/or cross-validation on 
+#' classification data using various classification algorithms. +#' +#' @param data A data frame. +#' +#' @param formula A formula specifying the model to use. This argument cannot +#' be used when \code{target} (and optionally \code{predictors}) is specified. +#' Default is \code{NULL}. +#' +#' @param target The name or numerical index of the target (response) variable +#' in \code{data}. This argument cannot be used when \code{formula} is specified. +#' Default is \code{NULL}. +#' +#' @param predictors A vector of variable names or numerical indices indicating +#' the predictors in \code{data}, used in conjunction with \code{target}. +#' Default is \code{NULL}. +#' +#' @param models A character string or a character vector specifying the +#' classification algorithm(s) to use. The following options are available: +#' \itemize{ +#' \item \code{"lda"}: Linear Discriminant Analysis +#' \item \code{"qda"}: Quadratic Discriminant Analysis +#' \item \code{"logistic"}: Unregularized Logistic Regression +#' \item \code{"regularized_logistic"}: Regularized Logistic Regression +#' \item \code{"svm"}: Support Vector Machine +#' \item \code{"naivebayes"}: Naive Bayes +#' \item \code{"nnet"}: Neural Network +#' \item \code{"knn"}: K-Nearest Neighbors +#' \item \code{"decisiontree"}: Decision Tree +#' \item \code{"randomforest"}: Random Forest +#' \item \code{"multinom"}: Unregularized Multinomial Logistic Regression +#' \item \code{"regularized_multinomial"}: Regularized Multinomial Logistic +#' Regression +#' \item \code{"xgboost"}: Extreme Gradient Boosting +#' } +#' \strong{Notes:} +#' \itemize{ +#' \item \code{"knn"}: The \code{ks} parameter should be set to specify the +#' desired value of \emph{k}, ensuring that the same value is used in all +#' folds. If \code{ks} is not provided, the optimal \emph{k} is automatically +#' selected using the \pkg{kknn} package. +#' \item \code{"nnet"}: An additional argument \code{size} must be specified. 
+#' \item \code{"regularized_logistic"} and \code{"regularized_multinomial"}: +#' If \code{"lambda"} is specified in the additional arguments and is a vector +#' of length > 1, then internal nested cross-validation is done on the +#' training set to determine the optimal lambda value. The number of folds for +#' the nested cross-validation can be specified by using \code{nfolds} in the +#' additional arguments. If \code{"stratified"} is \code{TRUE}, then the +#' relative proportions of the classes in the training set will be retained +#' in each fold. +#' \item \code{"xgboost"}: The following \code{objective} functions are +#' supported: \code{"reg:logistic"}, \code{"binary:logistic"}, +#' \code{"binary:logitraw"}, \code{"binary:hinge"}, and \code{"multi:softprob"}. +#' } +#' +#' @param model_params A list that can include the following elements: +#' \itemize{ +#' \item \code{"map_args"}: A list of named sub-lists used when more than one +#' model is specified in \code{models}. Each sub-list corresponds to a +#' particular model in the \code{models} parameter and contains the +#' arguments that will be passed to that model. Default is \code{NULL}. Refer +#' to the "Additional Model Parameters" section for acceptable arguments. +#' \item \code{"threshold"}: A numeric value in the interval [0, 1] that serves +#' as the cutoff value for assigning binary targets. Observations are assigned +#' to the class coded as "1" if \code{P(Class = 1 | Features) >= threshold}; +#' otherwise, they are assigned to the class coded as "0". A default threshold +#' of 0.5 is used when \code{"logistic"} is included in \code{models}, or when +#' \code{"xgboost"} is included in \code{models} with one of these objective +#' functions: \code{"reg:logistic"}, \code{"binary:logistic"}, or +#' \code{"binary:logitraw"}. If \code{NULL}, the remaining models will use +#' their respective default assignment methods (maximizing the posterior +#' probability). Default is \code{NULL}.
+#' \item \code{"rule"}: A character that dictates the rule used to select +#' the optimal lambda when using \code{regularized_logistic} or +#' \code{"regularized_multinomial"}. Available options are: \code{"min"} or +#' \code{"1se"}. Default is \code{"min"}. +#' \item \code{verbose}: A logical value indicating whether to state the +#' optimal lambda based on the nested cross-validation. +#' \item \code{"final_model"}: A logical value indicating whether to use all +#' complete observations in the input data for model training. Default is +#' \code{FALSE}. +#' } +#' +#' @param train_params A list that can contain the following parameters: +#' \itemize{ +#' \item \code{split}: A numeric value between 0 and 1 indicating the +#' proportion of data to use for training. The remaining observations are +#' allocated to the test set. If not specified or set to \code{NULL}, no +#' train-test splitting is performed. Note that this split is separate from +#' cross-validation. Default is \code{NULL}. +#' \item \code{n_folds}: An integer greater than 2 specifying the number of +#' folds for cross-validation. If \code{NULL}, no cross-validation is +#' performed. Default is \code{NULL}. +#' \item \code{stratified}: A logical value indicating whether stratified +#' sampling should be used during splitting. Default is \code{FALSE}. +#' \item \code{random_seed}: A numeric value for the random seed to ensure +#' reproducibility of random splitting and any model training that relies on +#' random starts. Default is \code{NULL}. +#' \item \code{standardize}: A logical or a numeric/character vector. If +#' \code{TRUE}, all numeric columns (except the target) are standardized by +#' computing the mean and standard deviation from the training subset and +#' applying them to both the training and test/validation sets. This prevents +#' data leakage. A vector of column indices or names can also be provided to +#' only standardize specific columns. 
+#' \item \code{remove_obs}: A logical value indicating whether to remove +#' observations in the test/validation set that contain levels of categorical +#' predictors not seen in the training data. Some algorithms may produce errors +#' when encountering such levels in the validation data during prediction. +#' Default is \code{FALSE}. +#' } +#' +#' @param impute_params A list defining how to handle missing values among +#' predictors/features. During imputation, the target variable is excluded from +#' both training and test/validation sets. Prior to imputation, unlabeled data +#' (observations with missing targets) are removed, and any specified train-test +#' split or cross-validation folds are created. A separate imputation model is +#' then generated for each training subset (one for the train-test split and +#' one per fold). Each imputation model is applied to both its corresponding +#' training and test/validation subsets to minimize data leakage. Note that +#' numerical columns are automatically standardized (regardless of +#' \code{train_params$standardize}) before imputation occurs. The +#' \pkg{recipes} package is used for imputation. The following parameters are +#' available: +#' \itemize{ +#' \item \code{method}: A character specifying the imputation method. Options +#' include: +#' \itemize{ +#' \item \code{"impute_bag"}: Bagged Trees Imputation +#' \item \code{"impute_knn"}: K-Nearest Neighbors Imputation +#' } +#' Default is \code{NULL}. +#' \item \code{args}: A list of additional arguments for the chosen imputation +#' method. +#' \itemize{ +#' \item \code{"impute_bag"}: \code{trees}, \code{seed_val} +#' \item \code{"impute_knn"}: \code{neighbors} +#' } +#' For more details about these arguments, consult the \pkg{recipes} +#' documentation. Default is \code{NULL}. 
+#' } +#' +#' @param save A list that may include the following: +#' \itemize{ +#' \item \code{models}: A logical value indicating whether to save the trained +#' models (including imputation models) used for train-test splits or +#' cross-validation. Default is \code{FALSE}. +#' \item \code{data}: A logical value indicating whether to save all training +#' and test/validation sets used during train-test splitting and/or +#' cross-validation. Default is \code{FALSE}. +#' } +#' +#' @param parallel_configs A list that may include the following: +#' \itemize{ +#' \item \code{n_cores}: A numeric value specifying the number of cores for +#' parallel processing. Default is \code{NULL}. +#' \item \code{future.seed}: A numeric value indicating the seed to use with +#' \pkg{future} for parallel processing. +#' } +#' +#' @param ... Additional arguments for the chosen classification algorithm. +#' These arguments serve as an alternative to specifying model-specific +#' parameters in \code{model_params$map_args} when only a single model is +#' specified in \code{models}. If multiple models are specified, then +#' \code{map_args} must be used. Refer to each algorithm's documentation for +#' details on additional arguments. +#' +#' @section Additional Model Parameters: +#' Each element in \code{models} accepts arguments specific to its underlying +#' classification algorithm. Refer to the original package documentation for +#' more information about these arguments. Further details on the external +#' package functions used for each model are provided in the "Package +#' Dependencies" section. 
The available arguments for each option of \code{models} are: +#' \itemize{ +#' \item \code{"lda"}: \code{prior}, \code{method}, \code{nu}, \code{tol} +#' \item \code{"qda"}: \code{prior}, \code{method}, \code{nu} +#' \item \code{"logistic"}: \code{weights}, \code{singular.ok}, \code{maxit} +#' \item \code{"regularized_logistic"}: \code{"alpha"}, \code{"lambda"}, +#' \code{"penalty.factor"}, \code{"maxit"}, \code{"thresh"}, \code{"nfolds"} +#' \item \code{"svm"}: \code{kernel}, \code{degree}, \code{gamma}, \code{cost}, +#' \code{nu}, \code{class.weights}, \code{shrinking}, \code{epsilon}, +#' \code{tolerance}, \code{cachesize} +#' \item \code{"naivebayes"}: \code{prior}, \code{laplace}, \code{usekernel}, +#' \code{usepoisson} +#' \item \code{"nnet"}: \code{size}, \code{rang}, \code{decay}, \code{maxit}, +#' \code{softmax}, \code{entropy}, \code{abstol}, \code{reltol}, \code{Hess}, +#' \code{skip} +#' \item \code{"knn"}: \code{kmax}, \code{ks}, \code{distance}, \code{kernel} +#' \item \code{"decisiontree"}: \code{parms}, \code{control}, \code{cost} +#' \item \code{"randomforest"}: \code{classwt}, \code{ntree}, \code{mtry}, +#' \code{nodesize}, \code{importance}, \code{localImp}, \code{nPerm}, +#' \code{proximity}, \code{keep.forest}, \code{norm.votes} +#' \item \code{"multinom"}: \code{Hess} +#' \item \code{"regularized_multinomial"}: \code{"alpha"}, \code{"lambda"}, +#' \code{"penalty.factor"}, \code{"maxit"}, +#' \code{"thresh"}, \code{"nfolds"} +#' \item \code{"xgboost"}: \code{params}, \code{nrounds}, \code{print_every_n}, +#' \code{feval}, \code{verbose}, \code{early_stopping_rounds}, \code{obj}, +#' \code{save_period}, \code{save_name} +#' } +#' +#' @section Package Dependencies: +#' Each option of \code{models} uses the following function from the specified +#' packages: +#' \itemize{ +#' \item \code{"lda"}: \code{lda} from \pkg{MASS} package +#' \item \code{"qda"}: \code{qda} from \pkg{MASS} package +#' \item \code{"logistic"}: \code{glm} from \pkg{stats} package with
+#' \code{family = "binomial"} +#' \item \code{"regularized_logistic"}: \code{glmnet} from \pkg{glmnet} package +#' with \code{family = "binomial"} and using \code{cv.glmnet} to select the +#' optimal lambda. +#' \item \code{"svm"}: \code{svm} from \pkg{e1071} package +#' \item \code{"naivebayes"}: \code{naive_bayes} from \pkg{naivebayes} package +#' \item \code{"nnet"}: \code{nnet} from \pkg{nnet} package +#' \item \code{"knn"}: \code{train.kknn} from \pkg{kknn} package +#' \item \code{"decisiontree"}: \code{rpart} from \pkg{rpart} package +#' \item \code{"randomforest"}: \code{randomForest} from \pkg{randomForest} +#' package +#' \item \code{"multinom"}: \code{multinom} from \pkg{nnet} package +#' \item \code{"regularized_multinomial"}: \code{glmnet} from \pkg{glmnet} package +#' with \code{family = "multinomial"} and using +#' \code{cv.glmnet} to select the optimal lambda. +#' \item \code{"xgboost"}: \code{xgb.train} from \pkg{xgboost} package +#' } +#' +#' @return A \code{\link{Vswift}} object containing: +#' \itemize{ +#' \item Configuration parameters accessible via \code{$configs()}. +#' \item Performance metrics accessible via \code{$metrics()}. +#' \item Class distribution details accessible via \code{$class_info()}. +#' \item Missing data summary accessible via \code{$get_missing_data_summary()}. +#' \item Data partition indices and dataframes accessible via +#' \code{$get_partition()} (if requested). +#' \item Trained models accessible via \code{$get_trained_model()} (if +#' requested). +#' \item Imputation models accessible via \code{$get_imputation_model()} (if +#' requested).
+#' } +#' +#' @seealso \code{\link{Vswift}}, \code{\link{CurveResult}} +#' +#' @seealso \code{\link{Vswift}} +#' +#' @examples +#' # Load an example dataset +#' data(iris) +#' +#' # Perform a train-test split with an 80% training set using LDA +#' results <- class_cv( +#' data = iris, +#' target = "Species", +#' models = "lda", +#' train_params = list(split = 0.8) +#' ) +#' +#' # Print parameters and metrics +#' results$print() +#' +#' # Perform 5-fold cross-validation using Extreme Gradient Boosting +#' # w/ additional parameters: params & nrounds +#' results <- class_cv( +#' data = iris, +#' formula = Species ~ ., +#' models = "xgboost", +#' train_params = list(n_folds = 5, random_seed = 123), +#' params = list( +#' objective = "multi:softprob", +#' num_class = 3, +#' eta = 0.3, +#' max_depth = 6 +#' ), +#' nrounds = 10 +#' ) +#' +#' # Print parameters and metrics +#' results$print() +#' +#' # Perform stratified 5-fold cross-validation with multiple models +#' map_args <- list("knn" = list(ks = 5), "nnet" = list(size = 20)) +#' results <- class_cv( +#' data = iris, +#' target = 5, +#' predictors = c(1:3), +#' models = c("decisiontree", "knn", "nnet", "svm"), +#' model_params = list(map_args = map_args), +#' train_params = list( +#' n_folds = 5, +#' stratified = TRUE, +#' random_seed = 123 +#' ) +#' ) +#' +#' # Print parameters and metrics +#' results$print() +#' +#' @importFrom stats as.formula complete.cases glm model.matrix predict sd +#' @importFrom data.table := data.table .SD +#' +#' @export +class_cv <- function(data, + formula = NULL, + target = NULL, + predictors = NULL, + models, + model_params = list( + "map_args" = NULL, "threshold" = NULL, "rule" = "min", + "verbose" = TRUE, "final_model" = FALSE + ), + train_params = list( + "split" = NULL, "n_folds" = NULL, "stratified" = FALSE, + "random_seed" = NULL, "standardize" = FALSE, + "remove_obs" = FALSE + ), + impute_params = list("method" = NULL, "args" = NULL), + save = list("models" = FALSE,
"data" = FALSE), + parallel_configs = list( + "n_cores" = NULL, "future.seed" = NULL + ), + ...) { + # Ensure model type is lowercase + if (!is.null(models)) models <- tolower(models) + + # Ensure model types are unique + models <- unique(models) + + # Append arguments; append missing so that default arguments appear in the + # output list and in order + model_params <- .append_param_keys("model_params", model_params, models, ...) + train_params <- .append_param_keys("train_params", train_params) + impute_params <- .append_param_keys("impute_params", impute_params) + save <- .append_param_keys("save", save) + parallel_configs <- .append_param_keys("parallel_configs", parallel_configs) + + # Checking if inputs are valid + .error_handling( + data = data, formula = formula, target = target, predictors = predictors, + models = models, model_params = model_params, train_params = train_params, + impute_params = impute_params, save = save, + parallel_configs = parallel_configs + ) + + # Get character form of target and predictor variables + vars <- .get_var_names(formula, target, predictors, data) + + # Get information on unlabeled data and labeled data with missing features + missing_info <- .missing_summary(data, vars$target) + + # Ensure data row names have an enforced order + rownames(data) <- seq(nrow(data)) + + # Clean data; Unlabeled data dropped and labeled missing data dropped if + # imputation is not requested + clean_outputs <- .clean_data( + data, missing_info, !is.null(impute_params$method) + ) + preprocessed_data <- clean_outputs$cleaned_data + perform_imputation <- clean_outputs$perform_imputation + + # Ensure target is factored and get all levels of character columns + # obtained if svm in models + factored <- .convert_to_factor( + preprocessed_data, vars$target, models, train_params$remove_obs + ) + preprocessed_data <- factored$data + col_levels <- factored$col_levels + + # Store information + final_output <- .store_parameters( + formula, missing_info, 
preprocessed_data, vars, models, model_params, + train_params, impute_params, save, parallel_configs + ) + + # Create class dictionary + if (any(models %in% c("logistic", "xgboost")) || + !is.null(model_params$threshold)) { + final_output$class_summary$keys <- .create_dictionary( + preprocessed_data[, vars$target], model_params$threshold + ) + } + + # Sampling data + if (!is.null(train_params$split) || !is.null(train_params$n_folds)) { + # Initialize list to store sample indices + final_output$data_partitions <- list() + final_output <- .sampling( + preprocessed_data, train_params, vars$target, final_output + ) + # Create the empty dataframes for metrics + final_output$metrics <- .expand_dataframe( + train_params, models, final_output$class_summary$classes + ) + } + + # Generate vector for iteration + iters <- .gen_iterations(train_params, model_params) + + # Obtain imputation model, if imputation requested + if (!is.null(impute_params$method) && perform_imputation) { + impute_models <- list() + for (i in iters) { + if (i != "final") { + test_indices <- .get_indices(final_output$data_partitions$indices, i) + df_list <- .partition(preprocessed_data, test_indices) + impute_models[[i]] <- .impute_prep( + train = df_list$train, vars = vars, impute_params = impute_params + ) + } else { + impute_models[[i]] <- .impute_prep( + preprocessed_data = preprocessed_data, vars = vars, + impute_params = impute_params + ) + } + } + } + + # Create kwargs + if (!is.null(train_params$split) || !is.null(train_params$n_folds)) { + if (exists("impute_models")) { + impute_models_arg <- impute_models[!names(impute_models) == "final"] + } else { + impute_models_arg <- NULL + } + + kwargs <- list( + preprocessed_data = preprocessed_data, + formula = final_output$configs$formula, + model_params = model_params, + vars = vars, + train_params = train_params, + col_levels = col_levels, + class_summary = final_output$class_summary, + save_mods = save$models, + met_df = final_output$metrics, + 
indices = final_output$data_partitions$indices, + impute_models = impute_models_arg + ) + } + + # Iterate to obtain validation metrics, training models, and final model + # for each algo + for (model in models) { + if (exists("kwargs")) { + if (is.null(parallel_configs$n_cores) || parallel_configs$n_cores <= 1) { + kwargs$iters <- iters[!iters == "final"] + kwargs$model <- model + train_out <- .sequential(kwargs) + } else { + kwargs$model <- model + train_out <- .parallel(kwargs, parallel_configs, iters[!iters == "final"]) + } + + # Add metrics information and model information + if ("split" %in% iters) { + final_output$metrics[[model]]$split <- train_out$metrics$split + train_out$metrics <- train_out$metrics[!names(train_out$metrics) == "split"] + } + + if (!is.null(train_params$n_folds)) { + cv_df <- .merge_df( + iters[!iters %in% c("split", "final")], + train_out$metrics$cv, + final_output$metrics[[model]]$cv + ) + + final_output$metrics[[model]]$cv <- .get_desc(cv_df, train_params$n_folds) + } + + if ("models" %in% names(train_out)) { + final_output$models[[model]] <- train_out$models + } + + if ("optimal_lambdas" %in% names(train_out)) { + final_output$metrics[[model]]$optimal_lambdas <- train_out$optimal_lambdas + } + } + + # Generate final model + if ("final" %in% iters) { + preproc_kwargs <- list() + + if (exists("impute_models") && "final" %in% names(impute_models)) { + preproc_kwargs$prep <- impute_models$final + } + + if (!is.null(preproc_kwargs$prep) || isTRUE(train_params$standardize)) { + preproc_kwargs <- c( + preproc_kwargs, + list("vars" = vars, "standardize" = train_params$standardize) + ) + preprocessed_data <- .prep_data( + preprocessed_data = preprocessed_data, preproc_kwargs = preproc_kwargs + ) + } + + # Generate model depending on chosen models + if (startsWith(model, "regularized")) { + final_out <- .regularized( + id = "Final Model", + model = model, + vars = vars, + data = preprocessed_data, + add_args = model_params$mod_args, + 
random_seed = train_params$random_seed, + stratified = ifelse( + is.null(train_params$stratified), FALSE, train_params$stratified + ), + rule = ifelse(is.null(model_params$rule), "min", model_params$rule), + verbose = ifelse( + is.null(model_params$verbose), TRUE, model_params$verbose + ) + ) + + if ("optimal_lambda" %in% names(final_out)) { + vec <- c("final" = final_out$optimal_lambda) + final_output$metrics[[model]]$optimal_lambdas <- c( + final_output$metrics[[model]]$optimal_lambdas, vec + ) + final_out$optimal_lambda <- NULL + } + + final_output$models[[model]]$final <- final_out + } else { + final_output$models[[model]]$final <- .generate_model( + model = model, + formula = final_output$configs$formula, + vars = vars, + data = preprocessed_data, + add_args = model_params$mod_args, + random_seed = train_params$random_seed + ) + } + } + } + + # Save data + if (isTRUE(save$data)) { + if (exists("kwargs")) { + for (i in iters[!iters == "final"]) { + test_indices <- .get_indices(kwargs$indices, i) + # Get training and validation data + df_list <- .partition(kwargs$preprocessed_data, test_indices) + # Prep data + df_list <- .prep_data(i, df_list$train, df_list$test, kwargs) + + # Store data + if (i == "split") { + final_output$data_partitions$dataframes$split <- df_list + } else { + final_output$data_partitions$dataframes$cv[[i]] <- df_list + } + } + } + + # Will already be standardized and imputed + if ("final" %in% iters) { + final_output$data_partitions$dataframes$preprocessed_data <- preprocessed_data + } + } + + # Save imputation models + if (save$models && exists("impute_models")) { + for (i in names(impute_models)) { + if (i %in% c("split", "final")) { + id <- ifelse(i == "final", "preprocessed_data", i) + final_output$imputation_models[[id]] <- impute_models[[i]] + } else { + final_output$imputation_models$cv[[i]] <- impute_models[[i]] + } + } + } + + results <- Vswift$new( + configs = final_output$configs, + class_summary = final_output$class_summary, + 
metrics = final_output$metrics, + trained_models = final_output$models, + missing_data_summary = final_output$missing_data_summary, + data_partitions = final_output$data_partitions, + imputation_models = final_output$imputation_models + ) + + return(results) +} diff --git a/R/constants.R b/R/constants.R index 4291e56..2291dd1 100644 --- a/R/constants.R +++ b/R/constants.R @@ -1,130 +1,135 @@ -# A list mapping default keys to certain parameters; Lazy evaluation -.DEFAULT_KEYS <- substitute({ - list( - "model_params" = list( - "map_args" = NULL, - "threshold" = NULL, - "rule" = "min", - "final_model" = FALSE, - "verbose" = TRUE - ), - "train_params" = .train_params_keys(caller), - "impute_params" = list( - "method" = NULL, - "args" = NULL - ), - "save" = list( - "models" = FALSE, - "data" = FALSE - ), - "parallel_configs" = list( - "n_cores" = NULL, - "future.seed" = NULL - ) - ) -}) - -# Helper function to return train keys depending on function call -.train_params_keys <- function(caller) { - keys <- list("split" = NULL, "n_folds" = NULL, "stratified" = FALSE, "random_seed" = NULL) - - if (caller == "classCV") keys <- c(keys, list("standardize" = FALSE, "remove_obs" = FALSE)) - - return(keys) -} - -# A list mapping parameters to certain types; The %s_params parameters not included since their class is checked -# by .append_param_keys -.PARAM_TYPES <- list( - primary = list( - data = c("data.frame"), - formula = c("formula", "NULL"), - target = c("character", "numeric", "integer", "NULL"), - predictors = c("character", "numeric", "integer", "NULL"), - models = c("character"), - model_params = c("list"), - train_params = c("list"), - impute_params = c("list"), - parallel_configs = c("list"), - save = c("list"), - create_data = c("logical") # Only in genFolds - ), - secondary = list( - map_args = c("list", "NULL"), - threshold = c("numeric", "NULL"), - rule = c("character", "NULL"), - final_model = c("logical"), - verbose = c("logical", "NULL"), - split = c("numeric", 
"NULL"), - n_folds = c("numeric", "NULL"), - stratified = c("logical"), - random_seed = c("numeric", "NULL"), - standardize = c("logical", "numeric", "integer", "character"), - remove_obs = c("logical"), - method = c("character", "NULL"), - args = c("list", "NULL"), - models = c("logical"), - data = c("logical"), - n_cores = c("numeric", "NULL"), - future.seed = c("numeric", "NULL") - ) -) - -# A list mapping models to their proper names -.MODEL_LIST <- list( - "lda" = "Linear Discriminant Analysis", - "qda" = "Quadratic Discriminant Analysis", - "svm" = "Support Vector Machine", - "nnet" = "Neural Network", - "decisiontree" = "Decision Tree", - "randomforest" = "Random Forest", - "xgboost" = "Extreme Gradient Boosting", - "logistic" = "Unegularized Logistic Regression", - "regularized_logistic" = "Regularized Logistic Regression", - "regularized_multinomial" = "Regularized Multinomial Logistic Regression", - "multinom" = "Unregularized Multinomial Logistic Regression", - "knn" = "K-Nearest Neighbors", "naivebayes" = "Naive Bayes" -) - - -# List of valid arguments for each model type -.GLMNET_ARGS <- c( - "alpha", "lambda", "penalty.factor", "maxit", "thresh", "nfolds" -) - -# List of valid arguments for each model type -.VALID_ARGS <- list( - "model" = list( - "lda" = c("prior", "method", "nu", "tol"), - "qda" = c("prior", "method", "nu"), - "logistic" = c("weights", "singular.ok", "maxit"), - "regularized_logistic" = .GLMNET_ARGS, - "svm" = c( - "kernel", "degree", "gamma", "cost", "nu", "class.weights", "shrinking", - "epsilon", "tolerance", "cachesize" - ), - "naivebayes" = c( - "prior", "laplace", "usekernel", "usepoisson" - ), - "nnet" = c( - "size", "rang", "decay", "maxit", "softmax", "entropy", "abstol", "reltol", "Hess", - "skip" - ), - "knn" = c("kmax", "ks", "distance", "kernel"), - "decisiontree" = c("method", "parms", "control", "cost"), - "randomforest" = c( - "classwt", "ntree", "mtry", "nodesize", "importance", "localImp", - "nPerm", "proximity", 
"keep.forest", "norm.votes" - ), - "multinom" = c("Hess"), - "regularized_multinomial" = .GLMNET_ARGS, - "xgboost" = c( - "params", "nrounds", "print_every_n", "feval", "verbose", - "early_stopping_rounds", "obj", "save_period", "save_name" - ) - ), - "imputation" = list( - "impute_bag" = c("trees", "seed_val"), - "impute_knn" = c("neighbors") - ) -) +# A list mapping default keys to certain parameters; Lazy evaluation +.DEFAULT_KEYS <- substitute({ + list( + "model_params" = list( + "map_args" = NULL, + "threshold" = NULL, + "rule" = "min", + "final_model" = FALSE, + "verbose" = TRUE + ), + "train_params" = .train_params_keys(), + "impute_params" = list( + "method" = NULL, + "args" = NULL + ), + "save" = list( + "models" = FALSE, + "data" = FALSE + ), + "parallel_configs" = list( + "n_cores" = NULL, + "future.seed" = NULL + ) + ) +}) + +# Helper function to return train keys depending on function call +.train_params_keys <- function() { + keys <- list( + "split" = NULL, + "n_folds" = NULL, + "stratified" = FALSE, + "random_seed" = NULL, + "standardize" = FALSE, + "remove_obs" = FALSE + ) + + return(keys) +} + +# A list mapping parameters to certain types; The %s_params parameters not included since their class is checked +# by .append_param_keys +.PARAM_TYPES <- list( + primary = list( + data = c("data.frame"), + formula = c("formula", "NULL"), + target = c("character", "numeric", "integer", "NULL"), + predictors = c("character", "numeric", "integer", "NULL"), + models = c("character"), + model_params = c("list"), + train_params = c("list"), + impute_params = c("list"), + parallel_configs = c("list"), + save = c("list"), + create_data = c("logical") # Only in genFolds + ), + secondary = list( + map_args = c("list", "NULL"), + threshold = c("numeric", "NULL"), + rule = c("character", "NULL"), + final_model = c("logical"), + verbose = c("logical", "NULL"), + split = c("numeric", "NULL"), + n_folds = c("numeric", "NULL"), + stratified = c("logical"), + random_seed = 
c("numeric", "NULL"), + standardize = c("logical", "numeric", "integer", "character"), + remove_obs = c("logical"), + method = c("character", "NULL"), + args = c("list", "NULL"), + models = c("logical"), + data = c("logical"), + n_cores = c("numeric", "NULL"), + future.seed = c("numeric", "NULL") + ) +) + +# A list mapping models to their proper names +.MODEL_LIST <- list( + "lda" = "Linear Discriminant Analysis", + "qda" = "Quadratic Discriminant Analysis", + "svm" = "Support Vector Machine", + "nnet" = "Neural Network", + "decisiontree" = "Decision Tree", + "randomforest" = "Random Forest", + "xgboost" = "Extreme Gradient Boosting", + "logistic" = "Unegularized Logistic Regression", + "regularized_logistic" = "Regularized Logistic Regression", + "regularized_multinomial" = "Regularized Multinomial Logistic Regression", + "multinom" = "Unregularized Multinomial Logistic Regression", + "knn" = "K-Nearest Neighbors", "naivebayes" = "Naive Bayes" +) + + +# List of valid arguments for each model type +.GLMNET_ARGS <- c( + "alpha", "lambda", "penalty.factor", "maxit", "thresh", "nfolds" +) + +# List of valid arguments for each model type +.VALID_ARGS <- list( + "model" = list( + "lda" = c("prior", "method", "nu", "tol"), + "qda" = c("prior", "method", "nu"), + "logistic" = c("weights", "singular.ok", "maxit"), + "regularized_logistic" = .GLMNET_ARGS, + "svm" = c( + "kernel", "degree", "gamma", "cost", "nu", "class.weights", "shrinking", + "epsilon", "tolerance", "cachesize" + ), + "naivebayes" = c( + "prior", "laplace", "usekernel", "usepoisson" + ), + "nnet" = c( + "size", "rang", "decay", "maxit", "softmax", "entropy", "abstol", "reltol", "Hess", + "skip" + ), + "knn" = c("kmax", "ks", "distance", "kernel"), + "decisiontree" = c("method", "parms", "control", "cost"), + "randomforest" = c( + "classwt", "ntree", "mtry", "nodesize", "importance", "localImp", + "nPerm", "proximity", "keep.forest", "norm.votes" + ), + "multinom" = c("Hess"), + "regularized_multinomial" = 
.GLMNET_ARGS, + "xgboost" = c( + "params", "nrounds", "print_every_n", "feval", "verbose", + "early_stopping_rounds", "obj", "save_period", "save_name" + ) + ), + "imputation" = list( + "impute_bag" = c("trees", "seed_val"), + "impute_knn" = c("neighbors") + ) +) diff --git a/R/curves.R b/R/curves.R deleted file mode 100644 index 1afc984..0000000 --- a/R/curves.R +++ /dev/null @@ -1,126 +0,0 @@ -#' Plot Receiver Operating Characteristic (ROC) Curves for Binary Classification Tasks -#' -#' @name rocCurve -#' -#' @description Produces ROC curves and computes the area under the curve (AUC) and Youden's Index. -#' Only works for binary classification tasks. -#' -#' @param x A list object of class \code{"vswift"}. Note that the models must be saved using -#' \code{save = list("models" = TRUE)} in \code{classCV} for this function to work. -#' -#' @param data A data frame. If \code{NULL}, then the preprocessed data muse be saved using -#' \code{save = list("data" = TRUE)} in \code{classCV} Default = \code{NULL}. -#' -#' @param models A character string or a character vector specifying the classification algorithm(s) to plot curves -#' for. If \code{NULL}, all models will be plotted. 
The following options are available: -#' \itemize{ -#' \item \code{"lda"}: Linear Discriminant Analysis -#' \item \code{"qda"}: Quadratic Discriminant Analysis -#' \item \code{"logistic"}: Unregularized Logistic Regression -#' \item \code{"regularized_logistic"}: Regularized Logistic Regression -#' \item \code{"svm"}: Support Vector Machine -#' \item \code{"naivebayes"}: Naive Bayes -#' \item \code{"nnet"}: Neural Network -#' \item \code{"knn"}: K-Nearest Neighbors -#' \item \code{"decisiontree"}: Decision Tree -#' \item \code{"randomforest"}: Random Forest -#' \item \code{"multinom"}: Unregularized Multinomial Logistic Regression -#' \item \code{"regularized_multinomial"}: Regularized Multinomial Logistic Regression -#' \item \code{"xgboost"}: Extreme Gradient Boosting -#' } -#' Default = \code{NULL}. -#' -#' @param split A logical value indicating whether to plot curves for the train-test split results. Default is -#' \code{TRUE}. -#' -#' @param cv A logical value indicating whether to plot curves for cross-validation results. Default is \code{TRUE}. -#' -#' @param thresholds A numerical vector specifying the thresholds to use when producing the curves. If left as NULL -#' the unique probability values produced by the training model will be used as thresholds. Default is \code{NULL}. -#' -#' @param return_output A logical value indicating whether to return the output list. Default is \code{TRUE}. -#' -#' @param path A character string specifying the directory (with a trailing slash) to save the plots. -#' Default is \code{NULL}. -#' -#' @param ... Additional arguments passed to the \code{png} function. -#' -#' @return A list containing thresholds used to generate the ROC curve, target labels, false positive rates (FPR), -#' true positive rates (TPR), area under the curve (AUC), and Youden's Index for all training and validation sets -#' for each model. 
-#' -#' @examples -#' # Load an example dataset -#' data <- iris -#' -#' # Make Binary -#' data$Species <- ifelse(data$Species == "setosa", "setosa", "not setosa") -#' -#' # Perform a train-test split with an 80% training set and stratified sampling using QDA -#' result <- classCV( -#' data = data, -#' target = "Species", -#' models = "qda", -#' train_params = list(split = 0.8, stratified = TRUE, random_seed = 123), -#' save = list(data = TRUE, models = TRUE) -#' ) -#' -#' # Get ROC curve -#' rocCurve(result, return_output = FALSE) -#' -#' @author Donisha Smith -#' -#' @importFrom grDevices rainbow -#' @importFrom graphics lines -#' -#' @export -rocCurve <- function(x, data = NULL, models = NULL, split = TRUE, cv = TRUE, thresholds = NULL, return_output = TRUE, - path = NULL, ...) { - return(.curve_entry(x, data, models, split, cv, thresholds, return_output, "roc", path, ...)) -} - -#' Plot Precision-Recall (PR) Curves for Binary Classification Tasks -#' -#' @name prCurve -#' -#' @description Produces PR curves and computes the area under the curve (AUC) and the threshold with the maximum F1. -#' score. Only works for binary classification tasks. -#' -#' @inheritParams rocCurve -#' -#' @return A list containing thresholds used to generate the PR curve, target labels, precision, recall, -#' area under the curve (AUC), and maximum F1 score and its associated optimal threshold for all training and -#' validation sets for each model. 
-#' -#' @examples -#' # Load an example dataset -#' data <- iris -#' -#' # Make Binary -#' data$Species <- ifelse(data$Species == "setosa", "setosa", "not setosa") -#' -#' # Perform a train-test split with an 80% training set and stratified sampling using QDA -#' result <- classCV( -#' data = data, -#' target = "Species", -#' models = "qda", -#' train_params = list(split = 0.8, stratified = TRUE, random_seed = 123), -#' save = list(data = TRUE, models = TRUE) -#' ) -#' -#' # Get PR curve -#' prCurve(result, return_output = FALSE) -#' -#' @author Donisha Smith -#' -#' -#' @export -prCurve <- function() {} - -# Get the function signature from rocCurve -formals(prCurve) <- formals(rocCurve) - -# Substitute in the body of the prCurve function -body(prCurve) <- substitute({ - return(.curve_entry(x, data, models, split, cv, thresholds, return_output, "pr", path, ...)) -}) diff --git a/R/curves_utils.R b/R/curves_utils.R index 4e9bcbc..012fa56 100644 --- a/R/curves_utils.R +++ b/R/curves_utils.R @@ -8,81 +8,85 @@ return_output = TRUE, curve_method, path = NULL, ...) 
{ - if (inherits(x, "vswift")) { - # Perform checks and get dictionary class keys and variables - info <- .perform_checks(x, data, curve_method) + info <- .perform_checks(x, data, curve_method) - # Unlist keys to turn into a named vector - info$keys <- unlist(info$keys) + info$keys <- unlist(info$keys) - # Get valid models - models <- .intersect_models(x, models) - - if ("xgboost" %in% models && x$configs$model_params$map_args$xgboost$params$objective == "multi:softmax") { - warnings("'xgboost' cannot be specified when the 'multi:softmax; objective is used since probabilties are needed") - models <- models[!models == "xgboost"] - } + model_params <- x$configs("model_params") + if ("xgboost" %in% models && + model_params$map_args$xgboost$params$objective == "multi:softmax") { + warning("'xgboost' cannot be specified when the 'multi:softmax' objective is used since probabilities are needed") + models <- models[!models == "xgboost"] + } - if ("xgboost" %in% models && x$configs$model_params$map_args$xgboost$params$objective == "binary:hinge") { - if (is.null(thresholds)) stop("`thresholds` must be specified since 'xgboost' uses the 'binary:hinge' objective") + if ("xgboost" %in% models && model_params$map_args$xgboost$params$objective == "binary:hinge") { + if (is.null(thresholds)) { + stop( + "`thresholds` must be specified since 'xgboost' uses the 'binary:hinge' objective" + ) } + } - if (length(models) == 0) stop("no valid models to plot") + if (length(models) == 0) stop("no valid models to plot") - # Iterate over models - output <- list() + # Iterate over models + output <- list() - for (model in models) { - output[[model]] <- .curve_pipeline( - x, data, model, .MODEL_LIST[[model]], split, cv, thresholds, info, curve_method, path, ... - ) + for (model in models) { + output[[model]] <- .curve_pipeline( + x, data, model, .MODEL_LIST[[model]], split, cv, thresholds, info, + curve_method, path, ...
+ ) - if (!isTRUE(return_output)) output[[model]] <- NULL - } + if (!isTRUE(return_output)) output[[model]] <- NULL + } - if (isTRUE(return_output)) { - return(output) - } - } else { - stop("`x` must be an object of class 'vswift'") + if (isTRUE(return_output)) { + return(output) } } -# Helper function to perform checks to ensure information needed is available and to obtain information needed for plotting +# Helper function to perform checks to ensure information needed is +# available and to obtain information needed for plotting .perform_checks <- function(x, data, curve_method) { - if (is.null(x$models)) { - stop("models must be saved in order to use `rocCurve`") + if (is.null(x$get_trained_model())) { + stop("models must be saved in order to use this method") } # Check if data is available - if (!is.data.frame(data) && is.null(x$data_partitions$dataframes)) { - stop("data cannot be NULL if dataframes were not saved by `classCV`") + if (!is.data.frame(data) && is.null(x$get_partition("dataframes"))) { + stop("data cannot be NULL if dataframes were not saved by `class_cv`") } # Check if target is binary - df <- .get_data(x, data)$data - - vars <- .get_var_names(formula = x$configs$formula, data = df) - - if (length(x$class_summary$classes) != 2) { - stop("`rocCurve` currently only supports binary targets") + if (length(x$classes) != 2) { + stop("`roc_curve` and `pr_curve` currently only supports binary targets") } # Convert target - class_keys <- .create_dictionary(x$class_summary$classes, alternate_warning = TRUE, curve_method = curve_method) + class_keys <- .create_dictionary( + x$classes, + alternate_warning = TRUE, curve_method = curve_method + ) + + df <- .get_data(x, data)$data + + vars <- .get_var_names(formula = x$configs()$formula, data = df) return(list("keys" = class_keys, "vars" = vars)) } # Helper function to get data, indices, and models -.get_data <- function(x, data, id = NULL, foldid = NULL, get_indices = FALSE, vars = NULL, model = NULL, - 
discard_unusable_data = TRUE) { +.get_data <- function(x, data, id = NULL, foldid = NULL, get_indices = FALSE, + vars = NULL, model = NULL, discard_unusable_data = TRUE) { preprocess <- ifelse(is.data.frame(data), TRUE, FALSE) # Get information for indexing for either dataframes or the test set indices - id <- ifelse(is.null(id), names(x$data_partitions$indices)[1], id) + id <- ifelse(is.null(id), names(x$get_partition()$indices)[1], id) - if (!is.null(x$data_partitions$indices$cv)) { - foldid <- ifelse(is.null(foldid), names(x$data_partitions$indices$cv)[1], foldid) + if (!is.null(x$get_partition("indices")$cv)) { + foldid <- ifelse( + is.null(foldid), names(x$get_partition("indices")$cv)[1], foldid + ) } # Get data @@ -91,30 +95,29 @@ rownames(df) <- seq(nrow(df)) # Discard missing labels if (discard_unusable_data) { - miss_info <- .missing_summary(data, all.vars(x$configs$formula)[1]) - discard_indices <- c(miss_info$unlabeled_data_indices, miss_info$missing_all_features_indices) + miss_info <- .missing_summary(data, all.vars(x$configs("formula"))[1]) + discard_indices <- c( + miss_info$unlabeled_data_indices, + miss_info$missing_all_features_indices + ) if (length(discard_indices) != 0) df <- df[-discard_indices, ] } } else { if (id == "split") { - df <- rbind( - x$data_partitions$dataframes$split$train, x$data_partitions$dataframes$split$test - ) + split_df <- x$get_partition("dataframes", "split") + df <- rbind(split_df$train, split_df$test) } else { - df <- rbind( - x$data_partitions$dataframes$cv[[foldid]]$train, x$data_partitions$dataframes$cv[[foldid]]$test - ) + cv_df <- x$get_partition("dataframes", "cv") + df <- rbind(cv_df[[foldid]]$train, cv_df[[foldid]]$test) } } - # Sort rows if data extracted from vswift object if (!is.data.frame(data)) df <- df[order(as.numeric(rownames(df))), ] - # Ensure all characters are factors if (isTRUE(preprocess) && !is.null(vars)) { out <- .convert_to_factor(df, vars$target, model, remove_obs = FALSE) miss_info <- 
.missing_summary(out$data, vars$target) - impute <- ifelse(!is.null(x$imputation_models), TRUE, FALSE) + impute <- ifelse(!is.null(x$get_imputation_model()), TRUE, FALSE) cleaned_data <- .clean_data(out$data, miss_info, impute, FALSE) out$data <- cleaned_data$cleaned_data } else { @@ -123,64 +126,95 @@ # Get the test set if (get_indices) { - indices <- if (id == "split") x$data_partitions$indices$split$test else x$data_partitions$indices$cv[[foldid]] - out$indices <- indices + if (id == "split") { + out$indices <- x$get_partition("indices", "split", "test") + } else { + out$indices <- x$get_partition("indices", "cv", foldid) + } } return(out) } # Helper function to perform quick preparation of input dataframe -.quick_prep <- function(x, df_list, id, foldid, info, preprocess, model, col_levels) { +.quick_prep <- function( + x, df_list, id, foldid, info, preprocess, model, col_levels +) { # Check imputation first - if (!is.null(x$imputation_models) && isTRUE(preprocess)) { - prep <- if (id == "split") x$imputation_models$split else x$imputation_models$cv[[foldid]] - df_list <- .impute_bake(train = df_list$train, test = df_list$test, vars = info$vars, prep = prep) + if (!is.null(x$get_imputation_model()) && isTRUE(preprocess)) { + if (id == "split") { + prep <- x$get_imputation_model("split") + } else { + prep <- x$get_imputation_model("cv")[[foldid]] + } + df_list <- .impute_bake( + train = df_list$train, test = df_list$test, vars = info$vars, prep = prep + ) } # Determine if standardizing is needed - standardize <- ((isTRUE(x$configs$train_params$standardize) || is.numeric(x$configs$train_params$standardize)) && - is.null(x$imputation_models)) + condition1 <- isTRUE(x$configs("train_params")$standardize) + condition2 <- is.numeric(x$configs("train_params")$standardize) + condition3 <- is.null(x$get_imputation_model()) + standardize <- (condition1 || condition2) && condition3 - # Check if standardized need standardized if (standardize) { df_list <- 
.standardize_train( df_list$train, df_list$test, - standardize = x$configs$train_params$standardize, info$vars$target + standardize = x$configs("train_params")$standardize, info$vars$target ) } # Relevel columns if svm if (model == "svm" && !is.null(col_levels)) { - for (i in names(df_list)) df_list[[i]] <- .relevel_cols(df_list[[i]], col_levels) + for (i in names(df_list)) { + df_list[[i]] <- .relevel_cols(df_list[[i]], col_levels) + } } return(df_list) } # Helper function that serves as the pipeline for producing curves -.curve_pipeline <- function(x, data, model, plot_title, split, cv, thresholds, info, curve_method, path, ...) { +.curve_pipeline <- function( + x, data, model, plot_title, split, cv, thresholds, info, curve_method, + path, ... +) { out <- list() - if (isTRUE(split) && !is.null(x$configs$train_params$split)) { - out$split <- .get_thresholds(x, data, "split", NULL, model, thresholds, info) + if (isTRUE(split) && !is.null(x$configs("train_params")$split)) { + out$split <- .get_thresholds( + x, data, "split", NULL, model, thresholds, info + ) for (i in c("train", "test")) { - out$split[[i]] <- c(out$split[[i]], .get_curve_metrics(out$split[[i]], curve_method)) + out$split[[i]] <- c( + out$split[[i]], .get_curve_metrics(out$split[[i]], curve_method) + ) # Rename tpr to recall - if (curve_method != "roc") names(out$split[[i]]$metrics) <- .rename_metrics(out$split[[i]]$metrics) + if (curve_method != "roc") { + names(out$split[[i]]$metrics) <- .rename_metrics(out$split[[i]]$metrics) + } } # Plot curves .plot_curve(out$split, curve_method, "train_test", model, path, ...) 
} - if (isTRUE(cv) && !is.null(x$configs$train_params$n_folds)) { - for (foldid in paste0("fold", seq(x$configs$train_params$n_folds))) { - out$cv[[foldid]] <- .get_thresholds(x, data, "cv", foldid, model, thresholds, info) - out$cv[[foldid]] <- c(out$cv[[foldid]], .get_curve_metrics(out$cv[[foldid]], curve_method)) + if (isTRUE(cv) && !is.null(x$configs("train_params", "n_folds"))) { + for (foldid in paste0("fold", seq(x$configs("train_params")$n_folds))) { + out$cv[[foldid]] <- .get_thresholds( + x, data, "cv", foldid, model, thresholds, info + ) + out$cv[[foldid]] <- c( + out$cv[[foldid]], .get_curve_metrics(out$cv[[foldid]], curve_method) + ) # Rename tpr to recall - if (curve_method != "roc") names(out$cv[[foldid]]$metrics) <- .rename_metrics(out$cv[[foldid]]$metrics) + if (curve_method != "roc") { + names(out$cv[[foldid]]$metrics) <- .rename_metrics( + out$cv[[foldid]]$metrics + ) + } } # Plot curves @@ -198,12 +232,19 @@ # Obtain AUC and Youden's Index or Max F1 if (curve_method == "roc") { - out$auc <- .integrate(fpr = out$metrics$fpr, tpr = out$metrics$tpr, curve_method = curve_method) - out$youdens_indx <- .youdens_indx(out$metrics$fpr, out$metrics$tpr, x$thresholds) + out$auc <- .integrate( + fpr = out$metrics$fpr, tpr = out$metrics$tpr, curve_method = curve_method + ) + out$youdens_indx <- .youdens_indx( + out$metrics$fpr, out$metrics$tpr, x$thresholds + ) } else { - out$auc <- .integrate(precision = out$metrics$precision, tpr = out$metrics$tpr, curve_method = curve_method) + out$auc <- .integrate( + precision = out$metrics$precision, tpr = out$metrics$tpr, + curve_method = curve_method + ) scores <- .maxf1(out$metrics$tpr, out$metrics$precision, x$thresholds) - out$maxF1 <- scores$maxF1 + out$max_f1 <- scores$max_f1 out$optimal_threshold <- scores$optimal_threshold } @@ -219,13 +260,14 @@ } # Helper function to obtain thresholds used for ROC curve -.get_thresholds <- function(x, data, id, foldid = NULL, model, thresholds, info) { +.get_thresholds <- 
function(x, data, id, foldid = NULL, model, + thresholds, info) { preprocess <- ifelse(is.data.frame(data), TRUE, FALSE) # Get training model if (id == "split") { - train_mod <- x$models[[model]]$split + train_mod <- x$get_trained_model(model, "split") } else { - train_mod <- x$models[[model]]$cv[[foldid]] + train_mod <- x$get_trained_model(model, "cv")[[foldid]] } # Get data @@ -233,10 +275,15 @@ # Partition training and test data df_list <- .partition(out$data, out$indices) - if (preprocess) df_list <- .quick_prep(x, df_list, id, foldid, info, preprocess, model, out$col_levels) + if (preprocess) { + df_list <- .quick_prep( + x, df_list, id, foldid, info, preprocess, model, out$col_levels + ) + } + model_params <- x$configs("model_params")$map_args$xgboost$params$objective results <- .prediction( - id, model, train_mod, info$vars, df_list, NULL, x$configs$model_params$map_args$xgboost$params$objective, + id, model, train_mod, info$vars, df_list, NULL, model_params, length(info$keys), probs = TRUE, keys = info$keys, caller = "curve" ) @@ -251,20 +298,25 @@ out[[name]]$probs <- results$pred[[name]] if (inherits(results$ground[[name]], "character")) { - out[[name]]$labels <- unlist(Map(function(x) info$key[[x]], results$ground[[name]])) + out[[name]]$labels <- unlist( + Map(function(x) info$keys[[x]], results$ground[[name]]) + ) } } # For ids that start with fold, unnest if (id == "cv") { - out <- list("thresholds" = out$test$thresholds, "probs" = out$test$probs, "labels" = out$test$labels) + out <- list( + "thresholds" = out$test$thresholds, "probs" = out$test$probs, + "labels" = out$test$labels + ) } return(out) } -# Helper function to compute true positive rates (recall) and false positive rates for each -# threshold using outer product matrix +# Helper function to compute true positive rates (recall) and false positive +# rates for each threshold using outer product matrix .compute_scores <- function(probs, thresholds, ground, curve_method) { # Create outer
product matrix; rows = probs and cols = thresholds mat <- outer(probs, thresholds, ">=") @@ -272,11 +324,9 @@ true_pos <- colSums(mat[ground == 1, ]) # Subtract column sums from true_pos to obtain false_pos false_pos <- colSums(mat) - true_pos - # Compute tpr tpr <- true_pos / sum(ground) if (curve_method == "roc") { - # Compute fpr fpr <- false_pos / sum(!ground) return(list("fpr" = fpr, "tpr" = tpr)) } else { @@ -295,7 +345,7 @@ return(list("x" = x, "y" = y)) } -# Helper function to add anchor for prCurve +# Helper function to add anchor for pr_curve .add_anchor <- function(x, y, plot = FALSE) { paired_list <- Map(list, "x" = x, "y" = y) # Sort @@ -345,13 +395,15 @@ order_names <- c("tpr", "precision") } - # Order by decreasing -> increasing for metric that is x-axis (fpr for roc and tpr for pr) - # For metric that is y-axis order from increasing -> decreasing - # Each first instance of duplicated pairs will have the minimum x paired with the maximum y + # Order by decreasing -> increasing for metric that is x-axis (fpr for roc + # and tpr for pr). For metric that is y-axis order from increasing -> + # decreasing. 
Each first instance of duplicated pairs will have the minimum x + # paired with the maximum y paired_list_ordered <- .order_paired_list(paired_list, order_names) i <- ifelse(curve_method == "roc", "fpr", "tpr") - # Obtain the fpr or tpr values, determine which is not duplicated to retain only the first instance + # Obtain the fpr or tpr values, determine which is not duplicated to retain + # only the first instance paired_list_final <- paired_list_ordered[!duplicated(sapply(paired_list_ordered, function(x) x[[i]]))] return(paired_list_final) @@ -362,7 +414,9 @@ paired_list <- .create_paired_list(fpr, precision, tpr, curve_method) N <- length(paired_list) - 1 # sum all areas to compute total area = auc - auc <- sum(sapply(seq(N), function(x) .trapezoid(paired_list[[x]], paired_list[[x + 1]], curve_method))) + auc <- sum( + sapply(seq(N), function(x) .trapezoid(paired_list[[x]], paired_list[[x + 1]], curve_method)) + ) return(auc) } @@ -398,7 +452,9 @@ # Select index of max F1 score max_indx <- which.max(f1_scores) - return(list("maxF1" = f1_scores[max_indx], "optimal_threshold" = thresholds[max_indx])) + return(list( + "max_f1" = f1_scores[max_indx], "optimal_threshold" = thresholds[max_indx] + )) } # Helper function to plot curves @@ -466,7 +522,11 @@ legend_colors <- c(legend_colors, "black") legend_lty <- c(legend_lty, 2) - legend("bottomright", legend = legend_labels, col = legend_colors, lty = legend_lty) + legend( + "bottomright", + legend = legend_labels, col = legend_colors, + lty = legend_lty, bty = "n" + ) # Dashed line if (curve_method == "roc") { @@ -483,11 +543,13 @@ .curve_names <- function(curve_method) { if (curve_method == "roc") { names <- list( - "main" = "ROC", "png" = "roc", "x" = "False Positive Rate (FPR)", "y" = "True Positive Rate (TPR)" + "main" = "ROC", "png" = "roc", "x" = "False Positive Rate (FPR)", + "y" = "True Positive Rate (TPR)" ) } else { names <- list( - "main" = "Precision-Recall", "png" = "precision_recall", "x" = "Recall", "y" 
= "Precision" + "main" = "Precision-Recall", "png" = "precision_recall", "x" = "Recall", + "y" = "Precision" ) } diff --git a/R/error_handling.R b/R/error_handling.R index 90480ba..8830323 100644 --- a/R/error_handling.R +++ b/R/error_handling.R @@ -1,147 +1,131 @@ -# Helper function for classCV and genFolds to check if inputs are valid -.error_handling <- function(data, formula = NULL, target = NULL, predictors = NULL, models = NULL, - model_params = NULL, train_params = NULL, impute_params = NULL, save = NULL, - parallel_configs = NULL, create_data = NULL, caller = NULL) { - valid_models <- names(.VALID_ARGS$model) - valid_imputes <- names(.VALID_ARGS$imputation) - - # Create list of parameters - if (caller == "classCV") { - params_list <- list( - data = data, formula = formula, target = target, predictors = predictors, models = models, - train_params = train_params, model_params = model_params, impute_params = impute_params, - save = save, parallel_configs = parallel_configs - ) - } else { - params_list <- list(data = data, target = target, train_params = train_params, create_data = create_data) - } - - # Check types - for (param in names(params_list)) .type_validator(param, params_list[[param]]) - - # Determine to stop execution - .stop_execution(train_params, model_params, caller) - - if (!is.null(train_params$n_folds) && train_params$n_folds <= 2) stop("`train_params$n_folds` must greater than 2") - - if (!is.null(train_params$split) && c(train_params$split < 0 || train_params$split > 1)) { - stop("`train_params$split` must a numeric value from 0 to 1") - } - - # Check formula and target - msg <- ifelse(caller == "classCV", "either `formula` or `target` must be specified", "`target` must be specified") - if (inherits(c(formula, target), "NULL")) stop(msg) - - # Check vars - .check_vars(formula, target, predictors, data) - - # Exit early for genFolds - if (caller == "genFolds") { - return(0) - } - - # Check that only formula and target are specified - if 
(!is.null(formula) && any(!is.null(target), !is.null(predictors))) { - stop(sprintf("`formula` cannot be used when `target` or `predictors` are specified")) - } - - # Check models - error_msg <- "invalid model specified in `%s`, the following is a list of valid models: '%s'" - - if (!is.null(models) && !all(models %in% valid_models)) { - stop(sprintf(error_msg, "models", paste(valid_models, collapse = "', '"))) - } - - # Check map_args - .check_map_args(model_params, valid_models, error_msg) - - # Check rule - if (any(c("regularized_logistic", "regularized_multinomial") %in% models) && !is.null(model_params$rule)) { - intersect_char <- intersect(c("min", "1se"), model_params$rule) - if (length(intersect_char) == 0) stop("'min' and '1se' are the only valid options for `model_params$rule`") - } - - # Check if target is binary - .check_binary_models(data, formula, target, models, model_params) - - # Check if impute method and args is valid - .check_imputes(valid_imputes, impute_params) - - # Check n_cores - .check_cores(parallel_configs, train_params) -} - -.stop_execution <- function(train_params, model_params, caller) { - # Check split, n_folds - void <- all(is.null(train_params$split), is.null(train_params$n_folds)) - - if (caller == "classCV") { - void <- all(void, is.null(model_params$final_model) || isFALSE(model_params$final_model)) - } - - if (void) { - if (caller == "genFolds") { - msg <- "neither `train_params$split` or `train_params$n_folds` specified" - } else { - msg <- "neither `train_params$split`, `train_params$n_folds`, or `model_params$final_model` specified" - } - stop(msg) - } -} - -.check_binary_models <- function(data, formula, target, models, model_params) { - if (!is.null(formula)) target <- .get_var_names(formula = formula, data = data)$target - binary_target <- length(levels(factor(data[, target], exclude = NA))) == 2 - - obj <- c("reg:logistic", "binary:logistic", "binary:logitraw") - binary_models <- (any(c("logistic", 
"regularized_logistic") %in% models) || - "xgboost" %in% models && model_params$map_args$xgboost$params$objective %in% obj) - - if (binary_models && !binary_target) { - stop("'logistic', 'regularized_logistic', and 'xgboost' (with a logistic regression objective) requires a binary target") - } - - # Check threshold - if (!is.null(model_params$threshold)) { - valid_threshold <- model_params$threshold >= 0 && model_params$threshold <= 1 - if (!valid_threshold) stop("`model_params$threshold` must a numeric value from 0 to 1") - } -} - -.check_map_args <- function(model_params, valid_models, error_msg) { - map_args_models <- names(model_params$map_args) - if (!is.null(map_args_models) && !all(map_args_models %in% valid_models)) { - stop(sprintf(error_msg, "model_params$map_args", paste(valid_models, collapse = "', '"))) - } - - if (!is.null(model_params$map_args)) .check_args(model_params = model_params, caller = "model") -} - -.check_imputes <- function(valid_imputes, impute_params) { - msg <- sprintf( - "invalid method specified in `impute_params$method`, the following is a list of valid methods: '%s'", - paste(valid_imputes, collapse = "', '") - ) - - if (!is.null(impute_params$method)) { - if (!impute_params$method %in% valid_imputes) stop(msg) - - if (!is.null(impute_params$args)) .check_args(impute_params = impute_params, caller = "imputation") - } -} - -.check_cores <- function(parallel_configs, train_params) { - # Check n_cores - if (!is.null(parallel_configs$n_cores)) { - if (is.null(train_params$n_folds)) { - stop("parallel processing is only available when `train_params$n_folds` is not NULL") - } - - if (parallel_configs$n_cores > as.vector(future::availableCores())) { - stop(sprintf( - "more cores specified than available; only %s cores available but %s cores specified", - as.vector(future::availableCores()), parallel_configs$n_cores - )) - } - } -} +# Helper function for class_cv to check if inputs are valid +.error_handling <- function(data, formula = 
NULL, target = NULL, predictors = NULL, models = NULL, + model_params = NULL, train_params = NULL, impute_params = NULL, save = NULL, + parallel_configs = NULL, create_data = NULL) { + valid_models <- names(.VALID_ARGS$model) + valid_imputes <- names(.VALID_ARGS$imputation) + + params_list <- list( + data = data, formula = formula, target = target, predictors = predictors, models = models, + train_params = train_params, model_params = model_params, impute_params = impute_params, + save = save, parallel_configs = parallel_configs + ) + + # Check types + for (param in names(params_list)) .type_validator(param, params_list[[param]]) + + .stop_execution(train_params, model_params) + + if (!is.null(train_params$n_folds) && train_params$n_folds <= 2) { + stop("`train_params$n_folds` must greater than 2") + } + + if (!is.null(train_params$split) && c(train_params$split < 0 || train_params$split > 1)) { + stop("`train_params$split` must a numeric value from 0 to 1") + } + + # Check formula and target + if (inherits(c(formula, target), "NULL")) { + stop("either `formula` or `target` must be specified") + } + + # Check vars + .check_vars(formula, target, predictors, data) + + # Check that only formula and target are specified + if (!is.null(formula) && any(!is.null(target), !is.null(predictors))) { + stop(sprintf("`formula` cannot be used when `target` or `predictors` are specified")) + } + + # Check models + error_msg <- "invalid model specified in `%s`, the following is a list of valid models: '%s'" + + if (!is.null(models) && !all(models %in% valid_models)) { + stop(sprintf(error_msg, "models", paste(valid_models, collapse = "', '"))) + } + + .check_map_args(model_params, valid_models, error_msg) + + # Check rule + if (any(c("regularized_logistic", "regularized_multinomial") %in% models) && !is.null(model_params$rule)) { + intersect_char <- intersect(c("min", "1se"), model_params$rule) + if (length(intersect_char) == 0) { + stop("'min' and '1se' are the only valid options 
for `model_params$rule`") + } + } + + .check_binary_models(data, formula, target, models, model_params) + + .check_imputes(valid_imputes, impute_params) + + .check_cores(parallel_configs, train_params) +} + +.stop_execution <- function(train_params, model_params) { + void <- all(is.null(train_params$split), is.null(train_params$n_folds)) + void <- all(void, is.null(model_params$final_model) || isFALSE(model_params$final_model)) + if (void) { + msg <- "neither `train_params$split`, `train_params$n_folds`, or `model_params$final_model` specified" + stop(msg) + } +} + +.check_binary_models <- function(data, formula, target, models, model_params) { + if (!is.null(formula)) target <- .get_var_names(formula = formula, data = data)$target + binary_target <- length(levels(factor(data[, target], exclude = NA))) == 2 + + obj <- c("reg:logistic", "binary:logistic", "binary:logitraw") + binary_models <- (any(c("logistic", "regularized_logistic") %in% models) || + "xgboost" %in% models && model_params$map_args$xgboost$params$objective %in% obj) + + if (binary_models && !binary_target) { + stop("'logistic', 'regularized_logistic', and 'xgboost' (with a logistic regression objective) requires a binary target") + } + + # Check threshold + if (!is.null(model_params$threshold)) { + valid_threshold <- model_params$threshold >= 0 && model_params$threshold <= 1 + if (!valid_threshold) { + stop("`model_params$threshold` must a numeric value from 0 to 1") + } + } +} + +.check_map_args <- function(model_params, valid_models, error_msg) { + map_args_models <- names(model_params$map_args) + if (!is.null(map_args_models) && !all(map_args_models %in% valid_models)) { + stop(sprintf(error_msg, "model_params$map_args", paste(valid_models, collapse = "', '"))) + } + + if (!is.null(model_params$map_args)) { + .check_args(model_params = model_params, caller = "model") + } +} + +.check_imputes <- function(valid_imputes, impute_params) { + msg <- sprintf( + "invalid method specified in 
`impute_params$method`, the following is a list of valid methods: '%s'", + paste(valid_imputes, collapse = "', '") + ) + + if (!is.null(impute_params$method)) { + if (!impute_params$method %in% valid_imputes) stop(msg) + + if (!is.null(impute_params$args)) .check_args(impute_params = impute_params, caller = "imputation") + } +} + +.check_cores <- function(parallel_configs, train_params) { + if (!is.null(parallel_configs$n_cores)) { + if (is.null(train_params$n_folds)) { + stop("parallel processing is only available when `train_params$n_folds` is not NULL") + } + + if (parallel_configs$n_cores > as.vector(future::availableCores())) { + stop(sprintf( + "more cores specified than available; only %s cores available but %s cores specified", + as.vector(future::availableCores()), parallel_configs$n_cores + )) + } + } +} diff --git a/R/genFolds.R b/R/genFolds.R deleted file mode 100644 index 1f58532..0000000 --- a/R/genFolds.R +++ /dev/null @@ -1,126 +0,0 @@ -#' Create Split Datasets and/or Folds with Optional Stratification -#' -#' @name genFolds -#' -#' @description A standalone function to generate train-test split datasets and/or cross-validation folds, optionally -#' performing stratified sampling based on class distribution. -#' -#' @param data A data frame. -#' -#' @param target A numeric or character value specifying the target variable. Only required if\code{stratified = TRUE}. -#' Default is \code{NULL}. -#' -#' @param train_params A list that can contain the following parameters: -#' \itemize{ -#' \item \code{split}: A numeric value between 0 and 1 indicating the proportion of data to use -#' for training. The remaining observations are allocated to the test set. If not specified or set to \code{NULL}, no -#' train-test splitting is performed. Note that this split is separate from cross-validation. Default is \code{NULL}. -#' \item \code{n_folds}: An integer greater than 2 specifying the number of folds for cross-validation. 
If \code{NULL}, -#' no cross-validation is performed. Default is \code{NULL}. -#' \item \code{stratified}: A logical value indicating whether stratified sampling should be used during splitting. -#' Default is \code{FALSE}. -#' \item \code{random_seed}: A numeric value for the random seed to ensure reproducibility of random splitting and any -#' model training that relies on random starts. Default is \code{NULL}. -#' } -#' -#' @param create_data A logical value indicating whether to create all training and test/validation data frames. -#' Default is \code{FALSE}. -#' -#' @return A list containing the indices for train-test splitting and/or cross-validation, with information on class -#' distribution in the training, test sets, and folds (if applicable). It also includes the generated split datasets -#' and folds based on those indices. -#' -#' @examples -#' # Load example dataset -#' -#' data(iris) -#' -#' # Obtain indices for 80% training/test split and 5-fold CV -#' -#' output <- genFolds( -#' data = iris, -#' target = "Species", -#' train_params = list(split = 0.8, n_folds = 5, random_seed = 123) -#' ) -#' -#' @author Donisha Smith -#' -#' @export -genFolds <- function(data, - target, - train_params = list(split = NULL, n_folds = NULL, stratified = FALSE, random_seed = NULL), - create_data = FALSE) { - # Append train_params - train_params <- .append_param_keys("train_params", train_params, caller = "genFolds") - # Check validity of inputs - .error_handling(data = data, target = target, train_params = train_params, create_data = create_data, caller = "genFolds") - - # Initialize final output list - final_output <- list("configs" = train_params) - final_output <- c(final_output, .append_output(data[, target], train_params$stratified)) - final_output$data_partitions <- list() - - # Perform sampling - final_output <- .sampling(data, train_params, target, final_output) - - # Get data partitions - if (isTRUE(create_data)) { - final_output$data_partitions$dataframes <- 
.create_data(data, final_output$data_partitions$indices) - } - - return(final_output) -} - -# Sampling function used by classCV and genFolds -.sampling <- function(data, train_params, target, final_output) { - # Base args - base_args <- list(N = nrow(data), random_seed = train_params$random_seed) - - if (isTRUE(train_params$stratified)) { - # Create args list - strat_args <- list( - classes = final_output$class_summary$classes, - class_indxs = final_output$class_summary$indices, - class_props = final_output$class_summary$proportions - ) - - strat_args <- c(base_args, strat_args) - # Get stratified indices - if (!is.null(train_params$split)) { - strat_args$split <- train_params$split - final_output$data_partitions$indices$split <- do.call(.stratified_split, strat_args) - # Get proportions of classes in the stratified indices - final_output$data_partitions$proportions$split <- .get_proportions( - data[, target], - final_output$data_partitions$indices$split - ) - } - - if (!is.null(train_params$n_folds)) { - # Remove split arg - strat_args <- strat_args[!names(strat_args) == "split"] - strat_args$n_folds <- train_params$n_folds - final_output$data_partitions$indices$cv <- do.call(.stratified_cv, strat_args) - # Get proportions of classes in the stratified indices - final_output$data_partitions$proportions$cv <- .get_proportions( - data[, target], - final_output$data_partitions$indices$cv - ) - } - } else { - # Non-stratified sampling - if (!is.null(train_params$split)) { - base_args$split <- train_params$split - final_output$data_partitions$indices$split <- do.call(.split, base_args) - } - - if (!is.null(train_params$n_folds)) { - # Remove split arg - base_args <- base_args[!names(base_args) == "split"] - base_args$n_folds <- train_params$n_folds - final_output$data_partitions$indices$cv <- do.call(.cv, base_args) - } - } - - return(final_output) -} diff --git a/R/plot.vswift.R b/R/plot.vswift.R deleted file mode 100644 index 0b2cc2f..0000000 --- a/R/plot.vswift.R +++ 
/dev/null @@ -1,96 +0,0 @@ -#' Plot Model Evaluation Metrics -#' -#' @aliases plot.vswift -#' -#' @description Plots classification metrics (accuracy, precision, recall, and f1 for each class). -#' -#' @param x A list object of class \code{"vswift"}. -#' -#' @param metrics A character vector indicating which metrics to plot. Supported options are \code{"accuracy"}, -#' \code{"precision"}, \code{"recall"}, and \code{"f1"}. Default is \code{c("accuracy", "precision", "recall", "f1")}. - -#' -#' @param models A character string or a character vector specifying the classification algorithm(s) evaluation metrics -#' to plot. If \code{NULL}, all models will be plotted. The following options are available: -#' \itemize{ -#' \item \code{"lda"}: Linear Discriminant Analysis -#' \item \code{"qda"}: Quadratic Discriminant Analysis -#' \item \code{"logistic"}: Unregularized Logistic Regression -#' \item \code{"regularized_logistic"}: Regularized Logistic Regression -#' \item \code{"svm"}: Support Vector Machine -#' \item \code{"naivebayes"}: Naive Bayes -#' \item \code{"nnet"}: Neural Network -#' \item \code{"knn"}: K-Nearest Neighbors -#' \item \code{"decisiontree"}: Decision Tree -#' \item \code{"randomforest"}: Random Forest -#' \item \code{"multinom"}: Unregularized Multinomial Logistic Regression -#' \item \code{"regularized_multinomial"}: Regularized Multinomial Logistic Regression -#' \item \code{"xgboost"}: Extreme Gradient Boosting -#' } -#' Default = \code{NULL}. -#' -#' @param split A logical value indicating whether to plot metrics for the train-test split results. Default is -#' \code{TRUE}. -#' -#' @param cv A logical value indicating whether to plot metrics for cross-validation results. Default is \code{TRUE}. -#' -#' @param class_names A vector of the specific classes to plot. If \code{NULL}, plots are generated for all classes. -#' Default is \code{NULL}. -#' -#' @param path A character string specifying the directory (with a trailing slash) to save the plots. 
-#' Default is \code{NULL}. -#' -#' @param ... Additional arguments passed to the \code{png} function. -#' -#' @examples -#' # Load an example dataset -#' data(iris) -#' -#' # Perform a train-test split with an 80% training set and stratified sampling using QDA -#' -#' result <- classCV( -#' data = iris, -#' target = "Species", -#' models = "qda", -#' train_params = list(split = 0.8, stratified = TRUE, random_seed = 123), -#' save = list(models = TRUE) -#' ) -#' -#' -#' # Plot performance metrics for train-test split -#' -#' plot(result, class_names = "setosa", metrics = "f1") -#' -#' @importFrom grDevices dev.off dev.new graphics.off png -#' @importFrom graphics axis abline legend -#' -#' @author Donisha Smith -#' @method plot vswift -#' -#' @export -"plot.vswift" <- function(x, metrics = c("accuracy", "precision", "recall", "f1"), models = NULL, split = TRUE, - cv = TRUE, class_names = NULL, path = NULL, ...) { - # Lowercase and intersect common names - metrics <- intersect(unlist(lapply(metrics, function(x) tolower(x))), c("accuracy", "precision", "recall", "f1")) - if (length(metrics) == 0) { - stop(sprintf("no metrics specified, available metrics: %s", paste(c("accuracy", "precision", "recall", "f1"), collapse = ", "))) - } - # intersect common names - if (!is.null(class_names)) { - class_names <- intersect(class_names, x$class_summary$classes) - if (length(class_names) == 0) { - stop(sprintf("no classes specified, available classes: %s", paste(x$class_summary$classes, collapse = ", "))) - } - } - - # Get models - models <- .intersect_models(x, models) - - # Iterate over models - for (model in models) { - .plot( - x = x, metrics = metrics, model = model, plot_title = .MODEL_LIST[[model]], split = split, cv = cv, - class_names = class_names, path = path, ... 
- ) - } -} diff --git a/R/plot_utils.R b/R/plot_utils.R index e104ac9..bb7a40c 100644 --- a/R/plot_utils.R +++ b/R/plot_utils.R @@ -14,7 +14,7 @@ # Entry point for plotting train-test split and cross-validation evaluation metrics .plot <- function(x, metrics, model, plot_title, split, cv, class_names, path, ...) { # Get dataframe - df <- x$metrics[[model]] + df <- x$metrics(model) if (!is.null(path)) { # Get OS separator @@ -27,11 +27,13 @@ } # Create Metrics List - metrics_list <- list("precision" = "Precision", "recall" = "Recall", "f1" = "F1") + metrics_list <- list( + "precision" = "Precision", "recall" = "Recall", "f1" = "F1" + ) # Get classes if (is.null(class_names)) { - classes <- x$class_summary$classes + classes <- x$classes } else { classes <- class_names } @@ -55,7 +57,9 @@ } # Function to plot train-test split evaluation metrics -.plot_split <- function(df, classes, metrics, metrics_list, plot_title, path, os.sep, png_name, ...) { +.plot_split <- function( + df, classes, metrics, metrics_list, plot_title, path, os.sep, png_name, ... +) { # Base plot kwargs plot_kwargs <- list(x = 1:2, ylim = 0:1, xlab = "Set", xaxt = "n") axis_kwargs <- list(side = 1, at = 1:2, labels = c("Training", "Test")) @@ -70,7 +74,10 @@ # Create png if (!is.null(path)) { png( - filename = paste0(path, os.sep, sprintf("%s_train_test_classification_accuracy.png", png_name)), + filename = paste0( + path, os.sep, + sprintf("%s_train_test_classification_accuracy.png", png_name) + ), ... ) } @@ -100,7 +107,10 @@ if (!is.null(path)) { png(filename = paste0( path, os.sep, - sprintf("%s_train_test_%s_%s.png", png_name, metric, paste(unlist(strsplit(class, split = " ")), collapse = "_")) + sprintf( + "%s_train_test_%s_%s.png", png_name, tolower(metric), + paste(unlist(strsplit(class, split = " ")), collapse = "_") + ) ), ...) 
} @@ -115,9 +125,12 @@ } # Function to plot cross-validation evaluation metrics -.plot_cv <- function(df, classes, metrics, metrics_list, plot_title, path, os.sep, png_name, ...) { - # Get the last row index subtracted by three to avoid getting mean, standard dev, and standard error - idx <- nrow(df$cv) - 3 +.plot_cv <- function( + df, classes, metrics, metrics_list, plot_title, path, os.sep, png_name, ... +) { + # Get the last row index subtracted by three to avoid getting mean, + # standard dev, and standard error + index <- nrow(df$cv) - 3 # Create vector of metrics to obtain col_names <- c() @@ -129,15 +142,18 @@ intersected_metrics <- intersect(metrics, names(metrics_list)) converted_metrics <- lapply(intersected_metrics, function(x) metrics_list[[x]]) # Get column names from dataframe - col_names <- c(col_names, as.vector(sapply(classes, function(x) paste("Class:", x, converted_metrics)))) + col_names <- c( + col_names, + as.vector(sapply(classes, function(x) paste("Class:", x, converted_metrics))) + ) } # Base plot kwargs - plot_kwargs <- list(x = 1:idx, ylim = c(0, 1), xlab = "Folds", xaxt = "n") + plot_kwargs <- list(x = 1:index, ylim = c(0, 1), xlab = "Folds", xaxt = "n") for (col_name in col_names) { # Get values - plot_kwargs$y <- df$cv[1:idx, col_name] + plot_kwargs$y <- df$cv[1:index, col_name] # Get Title and if (col_name == "Classification Accuracy") { @@ -164,7 +180,10 @@ plot_kwargs$main <- sprintf("%s - Class: %s", plot_title, class_name) if (!is.null(path)) { - full_png_name <- sprintf("%s_cv_%s_%s.png", png_name, metric_name, paste(class_name, collapse = "_")) + full_png_name <- sprintf( + "%s_cv_%s_%s.png", png_name, tolower(metric_name), + paste(class_name, collapse = "_") + ) filename <- paste0(path, os.sep, full_png_name) } } @@ -175,14 +194,22 @@ # Generate plot do.call(plot, plot_kwargs) # Add axis info - axis(side = 1, at = as.integer(1:idx), labels = as.integer(1:idx)) + axis(side = 1, at = as.integer(1:index), labels = 
as.integer(1:index)) # Add mean and standard deviation to the plot abline(h = mean(plot_kwargs$y, na.rm = TRUE), col = "red", lwd = 1) - abline(h = mean(plot_kwargs$y, na.rm = TRUE) + sd(plot_kwargs$y, na.rm = TRUE), col = "blue", lty = 2, lwd = 1) - abline(h = mean(plot_kwargs$y, na.rm = TRUE) - sd(plot_kwargs$y, na.rm = TRUE), col = "blue", lty = 2, lwd = 1) + abline( + h = mean(plot_kwargs$y, na.rm = TRUE) + sd(plot_kwargs$y, na.rm = TRUE), + col = "blue", lty = 2, lwd = 1 + ) + abline( + h = mean(plot_kwargs$y, na.rm = TRUE) - sd(plot_kwargs$y, na.rm = TRUE), + col = "blue", lty = 2, lwd = 1 + ) - # Add legend - legend("bottomright", legend = c("Mean", "Mean \U00B1 SD"), col = c("red", "blue"), lty = c(1, 2), lwd = 1) + legend("bottomright", + legend = c("Mean", "Mean \U00B1 SD"), + col = c("red", "blue"), lty = c(1, 2), lwd = 1, bty = "n" + ) # Use dev.new for certain R environments or dev.off if png is used .display(path) diff --git a/R/print.vswift.R b/R/print.vswift.R deleted file mode 100644 index d7a3a40..0000000 --- a/R/print.vswift.R +++ /dev/null @@ -1,83 +0,0 @@ -#' Print Parameter Information and/or Model Evaluation Metrics -#' -#' @aliases print.vswift -#' -#' @description Prints model configuration details and/or model evaluation metrics (classification accuracy, precision, -#' recall, and F1 scores). -#' -#' @param x A list object of class \code{"vswift"}. -#' -#' @param configs A logical value indicating whether to print model configuration information from the vswift -#' object. Default is \code{TRUE}. -#' -#' @param metrics A logical value indicating whether to print model evaluation metrics from the vswift object. If -#' \code{TRUE}, precision, recall, and F1 scores for each class will be displayed, along with their mean values -#' (if cross-validation was used). Default is \code{TRUE}. -#' -#' @param models A character string or a character vector specifying the classification algorithm(s) information to be -#' printed. 
If \code{NULL}, all model information will be printed. The following options are available: -#' \itemize{ -#' \item \code{"lda"}: Linear Discriminant Analysis -#' \item \code{"qda"}: Quadratic Discriminant Analysis -#' \item \code{"logistic"}: Unregularized Logistic Regression -#' \item \code{"regularized_logistic"}: Regularized Logistic Regression -#' \item \code{"svm"}: Support Vector Machine -#' \item \code{"naivebayes"}: Naive Bayes -#' \item \code{"nnet"}: Neural Network -#' \item \code{"knn"}: K-Nearest Neighbors -#' \item \code{"decisiontree"}: Decision Tree -#' \item \code{"randomforest"}: Random Forest -#' \item \code{"multinom"}: Unregularized Multinomial Logistic Regression -#' \item \code{"regularized_multinomial"}: Regularized Multinomial Logistic Regression -#' \item \code{"xgboost"}: Extreme Gradient Boosting -#' } -#' Default = \code{NULL}. -#' -#' @param ... No additional arguments are currently supported. -#' -#' @examples -#' # Load an example dataset -#' -#' data(iris) -#' -#' # Perform a train-test split with an 80% training set using LDA -#' -#' result <- classCV( -#' data = iris, -#' target = "Species", -#' models = "lda", -#' train_params = list(split = 0.8, stratified = TRUE, random_seed = 123) -#' ) -#' -#' # Print parameter information and performance metrics -#' print(result) -#' -#' @importFrom utils capture.output -#' -#' @author Donisha Smith -#' @method print vswift -#' -#' @export -"print.vswift" <- function(x, configs = TRUE, metrics = TRUE, models = NULL, ...) 
{ - # Get models - models <- .intersect_models(x, models) - - # Calculate string length of classes - str_list <- .dashed_lines(x$class_summary$classes, TRUE) - for (model in models) { - cat(paste("Model:", .MODEL_LIST[[model]]), "\n\n") - # Print parameter information - if (configs) .print_configs(x, model) - - if (metrics) { - if (is.data.frame(x$metrics[[model]]$split)) { - .print_metrics_split(x, x$metrics[[model]]$split, str_list$max, str_list$diff) - } - if (is.data.frame(x$metrics[[model]]$cv)) { - .print_metrics_cv(x, x$metrics[[model]]$cv, str_list$max, str_list$diff) - } - } - # Add dashed line to separate each model - .dashed_lines(x$class_summary$classes) - } -} diff --git a/R/print_utils.R b/R/print_utils.R index 4384902..34f2c25 100644 --- a/R/print_utils.R +++ b/R/print_utils.R @@ -1,182 +1,204 @@ -# Function to print configs to console -.print_configs <- function(x, model) { - # Print parameter information - if (x$configs$n_features > 20) { - cat(sprintf("Target: %s\n\n", all.vars(x$configs$formula)[1])) - } else { - str <- capture.output(dput(deparse(x$configs$formula))) - str <- gsub("\\s+", " ", paste(str, collapse = "")) - str <- gsub('\"', "", str) - cat(sprintf("Formula: %s\n\n", str)) - } - cat(sprintf("Number of Features: %s\n\n", x$configs$n_features)) - cat(sprintf("Classes: %s\n\n", paste(x$class_summary$classes, collapse = ", "))) - str <- capture.output(dput(x$configs$train_params)) - str <- gsub("\\s+", " ", paste(str, collapse = "")) - cat(sprintf("Training Parameters: %s\n\n", str)) - - # Modify model parameters - info <- x$configs$model_params - - # Show threshold - val <- .determine_threshold(model, info$map_args$xgboost$params$objective, info$threshold, FALSE) - if (!is.null(val)) info$threshold <- val - - if (!startsWith(model, "regularized") || (startsWith(model, "regularized") && is.null(info$rule))) { - info <- info[!names(info) %in% c("rule", "verbose")] - } - - info$map_args <- info$map_args[!names(info$map_args) != model] - 
- if (length(info$map_args) == 0) { - info$map_args <- NULL - info <- c(list(map_args = NULL), info) - } - - str <- capture.output(dput(info)) - str <- gsub("\\s+", " ", paste(str, collapse = "")) - cat(sprintf("Model Parameters: %s\n\n", str)) - - # Print sample size and missing data for user transparency - cat(sprintf("Unlabeled Observations: %s\n\n", x$missing_data_summary$unlabeled_observations)) - cat(sprintf("Incomplete Labeled Observations: %s\n\n", x$missing_data_summary$incomplete_labeled_observations)) - cat(sprintf("Observations Missing All Features: %s\n\n", x$missing_data_summary$observations_missing_all_features)) - - if (!is.null(x$configs$impute_params$method)) { - total <- x$missing_data_summary$complete_observations + x$missing_data_summary$incomplete_labeled_observations - cat(sprintf("Sample Size (Complete + Imputed Incomplete Labeled Observations): %s\n\n", total)) - } else { - cat(sprintf("Sample Size (Complete Observations): %s\n\n", x$missing_data_summary$complete_observations)) - } - - str <- capture.output(dput(x$configs$impute_params)) - str <- gsub("\\s+", " ", paste(str, collapse = "")) - cat(sprintf("Imputation Parameters: %s\n\n", str)) - - # Print information for parallel processing - str <- capture.output(dput(x$configs$parallel_configs)) - str <- gsub("\\s+", " ", paste(str, collapse = "")) - cat(sprintf("Parallel Configs: %s\n\n", str)) -} - -# Function to print train-test split metrics to console -.print_metrics_split <- function(x, data, max_str_len, str_diff) { - for (set in c("Training", "Test")) { - # Variable for which class string length to print to ensure all values have equal spacing - class_pos <- 1 - # Print name of the set metrics to be printed and add underscores - cat("\n\n", set, "\n") - cat(rep("_", 21), "\n\n") - # Print classification accuracy - cat("Classification Accuracy: ", format(round(data[data$Set == set, "Classification Accuracy"], 2), nsmall = 2), "\n\n") - # Print name of metrics - cat("Class:", rep("", 
max_str_len - 1), "Precision:", "", "Recall:", strrep(" ", 5), "F1:\n\n") - # For loop to obtain vector of values for each class - - for (class in x$class_summary$classes) { - # Get class specific columns - class_cols <- .split_colnames(class, data) - - # Print metric corresponding to class - class_met <- sapply(data[data$Set == set, class_cols], function(x) format(round(x, 2), nsmall = 2)) - # Add spacing - padding <- nchar(paste("Class:", "", "Pre")) - - # Pad output with strings - formatted_class_met <- c() - for (i in seq_along(class_met)) { - formatted_class_met <- c(formatted_class_met, class_met[i]) - if (i != length(class_met)) { - if (i == 1) space <- if (class_met[i] != "NaN") rep("", 4) else rep("", 5) - if (i == 2) space <- if (class_met[i] != "NaN") rep("", 5) else rep("", 6) - formatted_class_met <- c(formatted_class_met, space) - } - } - - cat(class, rep("", (padding + str_diff[class_pos])), paste(formatted_class_met, collapse = " "), "\n") - - class_pos <- class_pos + 1 - } - } -} - -# Function to print cross validation metrics to console -.print_metrics_cv <- function(x, data, max_str_len, str_diff) { - # Variable for which class string length to print to ensure all values have equal spacing - class_pos <- 1 - # Get number of folds to select the correct rows for mean and stdev - n_folds <- x$configs$train_params$n_folds - # Print parameters name - cat("\n\n", "Cross-validation (CV)", "\n") - cat(rep("_", 21), "\n\n") - mean_cv <- round(data[data$Fold == "Mean CV:", "Classification Accuracy"], 2) - sd_cv <- round(data[data$Fold == "Standard Deviation CV:", "Classification Accuracy"], 2) - acc_met <- c(format(mean_cv, nsmall = 2), format(sd_cv, nsmall = 2)) - acc_met <- sprintf("%s \U00B1 %s (SD)", acc_met[1], acc_met[2]) - cat("Average Classification Accuracy: ", acc_met, "\n\n") - cat( - "Class:", rep("", max_str_len), strrep(" ", 2), "Average Precision:", strrep(" ", 6), - "Average Recall:", strrep(" ", 10), "Average F1:\n\n" - ) - - # Go through 
column names, split the colnames and class name to see if the column name is the metric for that class - for (class in x$class_summary$classes) { - # Get class specific columns - class_cols <- .split_colnames(class, data) - - # Print metric corresponding to class - mean_met <- sapply(data[((n_folds + 1)), class_cols], function(x) format(round(x, 2), nsmall = 2)) - sd_met <- sapply(data[((n_folds + 2)), class_cols], function(x) format(round(x, 2), nsmall = 2)) - sd_met_pos <- 1 - class_met <- c() - - for (metric in mean_met) { - class_met <- c(class_met, sprintf("%s \u00B1 %s (SD)", metric, sd_met[sd_met_pos])) - sd_met_pos <- sd_met_pos + 1 - } - - # Pad output with strings - formatted_class_met <- c() - for (i in seq_along(class_met)) { - formatted_class_met <- c(formatted_class_met, class_met[i]) - space <- if (class_met[i] == "NaN \u00B1 NA (SD)") rep("", 9) else rep("", 6) - if (i != length(class_met)) formatted_class_met <- c(formatted_class_met, space) - } - - # Add spacing - padding <- nchar(paste("Class:", "", "Ave")) - cat(class, rep("", (padding + str_diff[class_pos])), paste(formatted_class_met), "\n") - # Update variable - class_pos <- class_pos + 1 - } -} - -.split_colnames <- function(class, data) { - class_cols <- c() - for (colname in colnames(data)) { - split_colname <- unlist(strsplit(colname, split = " ")) - # Remove the first and last element, corresponds to "Class:" and some metric name - split_colname <- split_colname[-c(1, length(split_colname))] - split_classname <- unlist(strsplit(class, split = " ")) - if (all(split_classname %in% split_colname) && length(setdiff(split_colname, split_classname)) == 0) { - # Store colnames for the class is variable - class_cols <- c(class_cols, colname) - } - } - - return(class_cols) -} - -# Calculate string length of classes to create a border of dashed lines -.dashed_lines <- function(classes, return_str = FALSE) { - str_len <- sapply(classes, nchar) - max_str_len <- max(str_len) - cat("\n") - 
partial_output_names <- "Average Precision: Average Recall: Average F1:\n\n" - cat(rep("-", nchar(paste("Class:", strrep(" ", max_str_len), partial_output_names)) %/% 1.5), "\n") - cat("\n\n") - - if (return_str) { - return(list("max" = max_str_len, "diff" = max_str_len - str_len)) - } -} +# Calculate string length of classes to create a border of dashed lines +.dashed_lines <- function(classes, return_str = FALSE) { + str_len <- sapply(classes, nchar) + max_str_len <- max(str_len) + partial_output_names <- "Average Precision: Average Recall: Average F1:\n\n" + str <- paste("\nClass:", strrep(" ", max_str_len), partial_output_names) + cat(rep("-", nchar(str) %/% 1.5), "\n\n\n") + + if (return_str) { + return(list("max" = max_str_len, "diff" = max_str_len - str_len)) + } +} + +# Function to print configs to console +.print_configs <- function(x, model) { + # Print parameter information + if (x$configs("n_features") > 20) { + cat(sprintf("Target: %s\n\n", all.vars(x$configs("formula"))[1])) + } else { + str <- capture.output(dput(deparse(x$configs("formula")))) + str <- gsub("\\s+", " ", paste(str, collapse = "")) + str <- gsub('\"', "", str) + cat(sprintf("Formula: %s\n\n", str)) + } + + cat(sprintf("Number of Features: %s\n\n", x$configs("n_features"))) + cat(sprintf("Classes: %s\n\n", paste(x$classes, collapse = ", "))) + + str <- capture.output(dput(x$configs("train_params"))) + str <- gsub("\\s+", " ", paste(str, collapse = "")) + cat(sprintf("Training Parameters: %s\n\n", str)) + + # Modify model parameters + info <- x$configs("model_params") + + # Show threshold + val <- .determine_threshold( + model, info$map_args$xgboost$params$objective, info$threshold, FALSE + ) + if (!is.null(val)) info$threshold <- val + + if (!startsWith(model, "regularized") || + (startsWith(model, "regularized") && is.null(info$rule))) { + info <- info[!names(info) %in% c("rule", "verbose")] + } + + info$map_args <- info$map_args[!names(info$map_args) != model] + + if 
(length(info$map_args) == 0) { + info$map_args <- NULL + info <- c(list(map_args = NULL), info) + } + + str <- capture.output(dput(info)) + str <- gsub("\\s+", " ", paste(str, collapse = "")) + cat(sprintf("Model Parameters: %s\n\n", str)) + + # Print sample size and missing data for user transparency + missing_data_summary <- x$get_missing_data_summary() + complete_obs <- missing_data_summary$complete_observations + incomplete_obs <- missing_data_summary$incomplete_labeled_observations + + cat(sprintf( + "Unlabeled Observations: %s\n\n", + missing_data_summary$unlabeled_observations + )) + cat(sprintf("Incomplete Labeled Observations: %s\n\n", incomplete_obs)) + cat(sprintf( + "Observations Missing All Features: %s\n\n", + missing_data_summary$observations_missing_all_features + )) + + if (!is.null(x$configs("impute_params", "method"))) { + total <- complete_obs + incomplete_obs + cat(sprintf( + "Sample Size (Complete + Imputed Incomplete Labeled Observations): %s\n\n", + total + )) + } else { + cat(sprintf("Sample Size (Complete Observations): %s\n\n", complete_obs)) + } + + str <- capture.output(dput(x$configs("impute_params"))) + str <- gsub("\\s+", " ", paste(str, collapse = "")) + cat(sprintf("Imputation Parameters: %s\n\n", str)) + + str <- capture.output(dput(x$configs("parallel_configs"))) + str <- gsub("\\s+", " ", paste(str, collapse = "")) + cat(sprintf("Parallel Configs: %s\n\n", str)) +} + +.format_metric <- function(value) { + if (is.na(value)) { + return("NaN") + } + format(round(value, 2), nsmall = 2) +} + + +.format_mean_sd <- function(mean_val, sd_val) { + sprintf( + "%s \u00B1 %s (SD)", + .format_metric(mean_val), + .format_metric(sd_val) + ) +} + +# Helper to get class-specific column names from a metrics dataframe +.get_class_columns <- function(class, data) { + class_cols <- c() + for (colname in colnames(data)) { + split_colname <- unlist(strsplit(colname, split = " ")) + class_name_components <- split_colname[-c(1, length(split_colname))] + 
split_classname <- unlist(strsplit(class, split = " ")) + if (all(split_classname %in% class_name_components) && + length(setdiff(class_name_components, split_classname)) == 0) { + class_cols <- c(class_cols, colname) + } + } + + return(class_cols) +} + +# Function to print train-test split metrics to console +.print_metrics_split <- function(x, model, max_str_len) { + data <- x$metrics(model, "split") + + met_width <- 14 + + for (set in data$Set) { + cat(sprintf("\n\n %s\n", set)) + cat(rep("_", 21), "\n\n") + + acc <- data[data$Set == set, "Classification Accuracy"] + cat(sprintf(" Classification Accuracy: %s\n\n", .format_metric(acc))) + + # sprintf has alignment patterns %-*s, is dynamic left alignment + cat(sprintf( + " %-*s %-*s %-*s %s\n\n", + max_str_len + 6, "Class:", + met_width, "Precision:", + met_width, "Recall:", + "F1:" + )) + + for (class in x$classes) { + class_cols <- .get_class_columns(class, data) + vals <- sapply(data[data$Set == set, class_cols], .format_metric) + + cat(sprintf( + " %-*s %-*s %-*s %s\n", + max_str_len + 6, class, + met_width, vals[1], + met_width, vals[2], + vals[3] + )) + } + } +} + +# Function to print cross validation metrics to console +.print_metrics_cv <- function(x, model, max_str_len) { + data <- x$metrics(model, "cv") + + cat(sprintf("\n\n Cross-validation (CV)\n")) + cat(rep("_", 21), "\n\n") + + # Classification accuracy + mean_acc <- data[data$Fold == "Mean CV:", "Classification Accuracy"] + sd_acc <- data[data$Fold == "Standard Deviation CV:", "Classification Accuracy"] + cat(sprintf( + " Average Classification Accuracy: %s\n\n", + .format_mean_sd(mean_acc, sd_acc) + )) + + met_width <- 24 + + cat(sprintf( + " %-*s %-*s %-*s %s\n\n", + max_str_len + 6, "Class:", + met_width, "Average Precision:", + met_width, "Average Recall:", + "Average F1:" + )) + + for (class in x$classes) { + class_cols <- .get_class_columns(class, data) + + mean_vals <- as.numeric(data[x$n_folds + 1, class_cols]) + sd_vals <- 
as.numeric(data[x$n_folds + 2, class_cols]) + + formatted <- sapply(seq_along(mean_vals), function(i) { + .format_mean_sd(mean_vals[i], sd_vals[i]) + }) + + cat(sprintf( + " %-*s %-*s %-*s %s\n", + max_str_len + 6, class, + met_width, formatted[1], + met_width, formatted[2], + formatted[3] + )) + } +} diff --git a/R/r6_classes.R b/R/r6_classes.R new file mode 100644 index 0000000..d878c7e --- /dev/null +++ b/R/r6_classes.R @@ -0,0 +1,856 @@ +#' Classification Results +#' +#' @name Vswift +#' +#' @description +#' An R6 class containing classification results produced by \code{\link{class_cv}}. +#' Provides methods for accessing metrics, trained models, data partitions, +#' and generating plots and curves. +#' +#' @examples +#' result <- class_cv( +#' data = iris, +#' target = "Species", +#' models = c("svm", "lda"), +#' train_params = list(split = 0.8, n_folds = 5, random_seed = 123) +#' ) +#' +#' result$summary() +#' result$metrics("svm", "cv") +#' result$plot(metrics = "f1") +#' +#' @importFrom R6 R6Class +#' @export +Vswift <- R6Class("Vswift", + public = list( + #' @description Create a new vswift result object. + #' + #' @param configs List of configuration parameters. + #' + #' @param class_summary List with class-level info. + #' + #' @param metrics Named list of per-model metric dataframes. + #' + #' @param trained_models Named list of trained models. + #' + #' @param missing_data_summary Named list of missing data information. + #' + #' @param data_partitions List with indices and dataframes. + #' + #' @param imputation_models List of prep objects. 
+ initialize = function(configs, class_summary, metrics, trained_models, + missing_data_summary, data_partitions, + imputation_models) { + private$.configs <- configs + private$.class_summary <- class_summary + private$.metrics <- metrics + private$.trained_models <- trained_models + private$.missing_data_summary <- missing_data_summary + private$.data_partitions <- data_partitions + private$.imputation_models <- imputation_models + }, + #' @description Retrieve evaluation metrics. + #' + #' @param model Character. Model name. \code{NULL} returns all. + #' + #' @param type Character. "split" or "cv". \code{NULL} returns all for that + #' model. Default is \code{NULL}. + #' + #' @return A data.frame or named list. + metrics = function(model, type = NULL) { + if (all(is.null(model), !is.null(type))) { + stop("`model` cannot be `NULL` while type is not `NULL`") + } + + if (is.null(model)) { + return(private$.metrics) + } + + obj <- .get_object(private$.metrics, model) + if (is.null(type)) { + return(obj) + } + + return(.get_object(obj, type)) + }, + #' @description Retrieve configuration parameters. + #' + #' @param param Character. Config key. \code{NULL} returns all. Default + #' is \code{NULL}. + #' + #' @param keys Character or list of characters. The sub-keys within param. + #' \code{NULL} returns all keys of \code{param}. Default is \code{NULL}. + #' + #' @return The requested configuration value. + configs = function(param = NULL, keys = NULL) { + if (is.null(param)) { + return(private$.configs) + } + + obj <- .get_object(private$.configs, param) + # NULL default needed for early return + if (is.null(keys)) { + return(obj) + } + + + for (key in c(keys)) obj <- .get_object(obj, key) + + return(obj) + }, + #' @description Retrieve trained model objects. + #' + #' @param model Character. Model name. If \code{NULL}, returns all + #' all models. Default is \code{NULL}. + #' + #' @param partition Character. "split", "final", or "fold1".."foldN". 
+    #' If \code{NULL}, returns all partitions. Default is \code{NULL}.
+    #'
+    #' @return A trained model object or named list.
+    get_trained_model = function(model = NULL, partition = NULL) {
+      stored <- private$.trained_models
+      # A NULL store or a NULL model both return the store unchanged.
+      if (is.null(stored) || is.null(model)) {
+        return(stored)
+      }
+
+      model_entry <- .get_object(stored, model)
+      if (is.null(partition)) model_entry else .get_object(model_entry, partition)
+    },
+    #' @description Retrieve the imputation objects.
+    #'
+    #' @param model Character. Model name. If \code{NULL}, returns
+    #' all models. Default is \code{NULL}.
+    #'
+    #' @param partition Character. "split", "final", or "fold1".."foldN".
+    #' If \code{NULL}, returns all partitions. Default is \code{NULL}.
+    #'
+    #' @return An imputation model object or named list.
+    get_imputation_model = function(model = NULL, partition = NULL) {
+      stored <- private$.imputation_models
+      # A NULL store or a NULL model both return the store unchanged.
+      if (is.null(stored) || is.null(model)) {
+        return(stored)
+      }
+
+      model_entry <- .get_object(stored, model)
+      if (is.null(partition)) model_entry else .get_object(model_entry, partition)
+    },
+    #' @description Retrieve missing data summary.
+    #'
+    #' @param what Character. The specific missing data information.
+    #' \code{NULL} returns all. Default is \code{NULL}.
+    #'
+    #' @return The requested missing data information.
+    get_missing_data_summary = function(what = NULL) {
+      missing_info <- private$.missing_data_summary
+      if (is.null(what)) missing_info else .get_object(missing_info, what)
+    },
+    #' @description Retrieve data partition information.
+    #'
+    #' @param what Character. "indices", "proportions", or "dataframes".
+    #' Default is \code{NULL}.
+    #'
+    #' @param partition Character. "split" or "fold1".."foldN". If \code{NULL},
+    #' returns all partitions. Default is \code{NULL}.
+    #'
+    #' @param set Character. "train" or "test".
\code{NULL}, returns the
+    #' training and test set. Default is \code{NULL}.
+    #'
+    #' @return Requested partition data.
+    get_partition = function(what = NULL, partition = NULL, set = NULL) {
+      current <- private$.data_partitions
+      # Walk what -> partition -> set, stopping at the first NULL selector;
+      # selectors after a NULL one are not consulted.
+      for (selector in list(what, partition, set)) {
+        if (is.null(selector)) {
+          return(current)
+        }
+
+        current <- .get_object(current, selector)
+      }
+
+      current
+    },
+    #' @description Retrieve class summary information.
+    #'
+    #' @param what Character. "classes", "keys", "proportions", or "indices".
+    #'
+    #' @return The requested class summary component.
+    class_info = function(what) {
+      .get_object(private$.class_summary, what)
+    },
+    #' @description List models present in this result object.
+    #' @return Character vector of model names.
+    available_models = function() {
+      self$configs("models")
+    },
+    #' @description Prints model configuration details and/or model evaluation
+    #' metrics (classification accuracy, precision, recall, and F1 scores).
+    #'
+    #' @param configs A logical value indicating whether to print model
+    #' configuration information from the vswift class. Default is \code{TRUE}.
+    #'
+    #' @param metrics A logical value indicating whether to print model
+    #' evaluation metrics from the vswift class. If \code{TRUE}, precision,
+    #' recall, and F1 scores for each class will be displayed, along with their
+    #' mean values (if cross-validation was used). Default is \code{TRUE}.
+    #'
+    #' @param models A character string or a character vector specifying the
+    #' classification algorithm(s) information to be printed. If \code{NULL},
+    #' all model information will be printed.
The following options are
+    #' available:
+    #' \itemize{
+    #'  \item \code{"lda"}: Linear Discriminant Analysis
+    #'  \item \code{"qda"}: Quadratic Discriminant Analysis
+    #'  \item \code{"logistic"}: Unregularized Logistic Regression
+    #'  \item \code{"regularized_logistic"}: Regularized Logistic Regression
+    #'  \item \code{"svm"}: Support Vector Machine
+    #'  \item \code{"naivebayes"}: Naive Bayes
+    #'  \item \code{"nnet"}: Neural Network
+    #'  \item \code{"knn"}: K-Nearest Neighbors
+    #'  \item \code{"decisiontree"}: Decision Tree
+    #'  \item \code{"randomforest"}: Random Forest
+    #'  \item \code{"multinom"}: Unregularized Multinomial Logistic Regression
+    #'  \item \code{"regularized_multinomial"}: Regularized Multinomial Logistic
+    #'  Regression
+    #'  \item \code{"xgboost"}: Extreme Gradient Boosting
+    #' }
+    #' Default = \code{NULL}.
+    #'
+    #' @param ... Accepted so that extra arguments do not cause an
+    #' "unused argument" error; currently ignored.
+    #'
+    #' @examples
+    #' # Load an example dataset
+    #'
+    #' data(iris)
+    #'
+    #' # Perform a train-test split with an 80% training set using LDA
+    #'
+    #' results <- class_cv(
+    #'   data = iris,
+    #'   target = "Species",
+    #'   models = "lda",
+    #'   train_params = list(split = 0.8, stratified = TRUE, random_seed = 123)
+    #' )
+    #'
+    #' # Print parameter information and performance metrics
+    #' results$print()
+    #'
+    #' @importFrom utils capture.output
+    print = function(configs = TRUE, metrics = TRUE, models = NULL, ...) {
+      # Calculate string length of classes
+      str_list <- .dashed_lines(self$classes, TRUE)
+      for (model in private$.resolve_models(models)) {
+        cat(paste("Model:", .MODEL_LIST[[model]]), "\n\n")
+        if (configs) .print_configs(self, model)
+
+        # Both metric printers receive the same arguments.
+        kwargs <- list(
+          x = self, model = model, max_str_len = str_list$max
+        )
+
+        if (metrics) {
+          if (self$has_split) do.call(.print_metrics_split, kwargs)
+          if (self$has_cv) do.call(.print_metrics_cv, kwargs)
+        }
+
+        .dashed_lines(self$classes)
+      }
+
+      invisible(self)
+    },
+    #' @description Plots classification metrics (accuracy,
precision, recall, + #' and f1 for each class). + #' + #' @param metrics A character vector indicating which metrics to plot. + #' Supported options are "accuracy", "recall", "precision", "f1". Default is + #' \code{c("accuracy", "precision", "recall", "f1")}. + #' + #' @param models A character string or a character vector specifying the + #' classification algorithm(s) evaluation metrics to plot. If \code{NULL}, + #' all models will be plotted. The following options are available: + #' \itemize{ + #' \item \code{"lda"}: Linear Discriminant Analysis + #' \item \code{"qda"}: Quadratic Discriminant Analysis + #' \item \code{"logistic"}: Unregularized Logistic Regression + #' \item \code{"regularized_logistic"}: Regularized Logistic Regression + #' \item \code{"svm"}: Support Vector Machine + #' \item \code{"naivebayes"}: Naive Bayes + #' \item \code{"nnet"}: Neural Network + #' \item \code{"knn"}: K-Nearest Neighbors + #' \item \code{"decisiontree"}: Decision Tree + #' \item \code{"randomforest"}: Random Forest + #' \item \code{"multinom"}: Unregularized Multinomial Logistic Regression + #' \item \code{"regularized_multinomial"}: Regularized Multinomial Logistic + #' Regression + #' \item \code{"xgboost"}: Extreme Gradient Boosting + #' } + #' Default = \code{NULL}. + #' + #' @param split A logical value indicating whether to plot metrics for the + #' train-test split results. Default is \code{TRUE}. + #' + #' @param cv A logical value indicating whether to plot metrics for + #' cross-validation results. Default is \code{TRUE}. + #' + #' @param class_names A vector of the specific classes to plot. If + #' \code{NULL}, plots are generated for all classes. Default is \code{NULL}. + #' + #' @param path A character string specifying the directory (with a trailing + #' slash) to save the plots. + #' Default is \code{NULL}. + #' + #' @param ... Additional arguments passed to the \code{png} function. 
+    #'
+    #' @examples
+    #' # Load an example dataset
+    #' data(iris)
+    #'
+    #' # Perform a train-test split with an 80% training set and stratified
+    #' # sampling using QDA
+    #' results <- class_cv(
+    #'   data = iris,
+    #'   target = "Species",
+    #'   models = "qda",
+    #'   train_params = list(split = 0.8, stratified = TRUE, random_seed = 123),
+    #'   save = list(models = TRUE)
+    #' )
+    #'
+    #'
+    #' # Plot performance metrics for train-test split
+    #'
+    #' results$plot(class_names = "setosa", metrics = "f1")
+    #'
+    #' @importFrom grDevices dev.off dev.new graphics.off png
+    #' @importFrom graphics axis abline legend
+    plot = function(metrics = c("accuracy", "precision", "recall", "f1"),
+                    models = NULL, split = TRUE, cv = TRUE, class_names = NULL,
+                    path = NULL, ...) {
+      supported <- c("accuracy", "precision", "recall", "f1")
+      # Lower-case the request and keep only supported metric names.
+      metrics <- intersect(unlist(lapply(metrics, tolower)), supported)
+      if (length(metrics) == 0) {
+        stop(
+          sprintf(
+            "no metrics specified, available metrics: %s",
+            paste(supported, collapse = ", ")
+          )
+        )
+      }
+
+      # A NULL class_names is passed through to .plot unchanged.
+      if (!is.null(class_names)) {
+        class_names <- intersect(class_names, self$classes)
+        if (length(class_names) == 0) {
+          stop(
+            sprintf(
+              "no classes specified, available classes: %s",
+              paste(self$classes, collapse = ", ")
+            )
+          )
+        }
+      }
+
+      for (model_name in private$.resolve_models(models)) {
+        .plot(
+          x = self, metrics = metrics, model = model_name,
+          plot_title = .MODEL_LIST[[model_name]], split = split, cv = cv,
+          class_names = class_names, path = path, ...
+        )
+      }
+
+      invisible(self)
+    },
+    #' @description Print a compact summary of results.
+    #' @return The object itself, invisibly.
+    summary = function() {
+      cat("Classification Results\n")
+      cat("-----------------------------\n")
+      cat(" Models: ", paste(self$available_models(), collapse = ", "), "\n")
+      cat(" Classes: ", paste(self$classes, collapse = ", "), "\n")
+      if (self$has_split) {
+        train_split <- self$configs("train_params", "split")
+        test_split <- 1 - train_split
+        cat(" Split: ", sprintf("%s (Training), %s (Test)", train_split, test_split), "\n")
+      }
+      if (self$has_cv) cat(" Folds: ", self$n_folds, "\n")
+
+      # Only report train-test accuracy when a split was configured, so the
+      # "split" metrics are never requested for a CV-only run.
+      if (self$has_split) {
+        cat("\n Mean Classification Accuracy (Train-Test Split):\n")
+        for (model in self$available_models()) {
+          split_df <- self$metrics(model, "split")
+          if (is.data.frame(split_df)) {
+            train_acc <- split_df[split_df$Set == "Training", "Classification Accuracy"]
+            test_acc <- split_df[split_df$Set == "Test", "Classification Accuracy"]
+            cat(
+              sprintf(
+                " %-30s %.3f (Training), %.3f (Test)\n",
+                .MODEL_LIST[[model]], train_acc, test_acc
+              )
+            )
+          }
+        }
+      }
+
+      # Same guard for CV: no header and no lookup without folds.
+      if (self$has_cv) {
+        cat("\n Mean Classification Accuracy (CV):\n")
+        for (model in self$available_models()) {
+          cv_df <- self$metrics(model, "cv")
+          if (is.data.frame(cv_df)) {
+            acc <- cv_df[cv_df$Fold == "Mean CV:", "Classification Accuracy"]
+            cat(sprintf(" %-30s %.3f\n", .MODEL_LIST[[model]], acc))
+          }
+        }
+      }
+
+      invisible(self)
+    },
+    #' @description Produces ROC curves and computes the area under the curve
+    #' (AUC) and Youden's Index. Only works for binary classification tasks.
+    #'
+    #' @param data A data frame. If \code{NULL}, then the preprocessed data must
+    #' be saved using \code{save = list("data" = TRUE)} in \code{class_cv}.
+    #' Default = \code{NULL}.
+    #'
+    #' @param models A character string or a character vector specifying the
+    #' classification algorithm(s) to plot curves for. If \code{NULL}, all
+    #' models will be plotted.
The following options are available: + #' \itemize{ + #' \item \code{"lda"}: Linear Discriminant Analysis + #' \item \code{"qda"}: Quadratic Discriminant Analysis + #' \item \code{"logistic"}: Unregularized Logistic Regression + #' \item \code{"regularized_logistic"}: Regularized Logistic Regression + #' \item \code{"svm"}: Support Vector Machine + #' \item \code{"naivebayes"}: Naive Bayes + #' \item \code{"nnet"}: Neural Network + #' \item \code{"knn"}: K-Nearest Neighbors + #' \item \code{"decisiontree"}: Decision Tree + #' \item \code{"randomforest"}: Random Forest + #' \item \code{"multinom"}: Unregularized Multinomial Logistic Regression + #' \item \code{"regularized_multinomial"}: Regularized Multinomial Logistic + #' Regression + #' \item \code{"xgboost"}: Extreme Gradient Boosting + #' } + #' Default = \code{NULL}. + #' + #' @param split A logical value indicating whether to plot curves for the + #' train-test split results. Default is \code{TRUE}. + #' + #' @param cv A logical value indicating whether to plot curves for + #' cross-validation results. Default is \code{TRUE}. + #' + #' @param thresholds A numerical vector specifying the thresholds to use + #' when producing the curves. If left as \code{NULL} the unique probability + #' values produced by the training model will be used as thresholds. + #' Default is \code{NULL}. + #' + #' @param return_output A logical value indicating whether to return the + #' output list. Default is \code{TRUE}. + #' + #' @param path A character string specifying the directory (with a trailing + #' slash) to save the plots. Default is \code{NULL}. + #' + #' @param ... Additional arguments passed to the \code{png} function. + #' + #' @return A \code{\link{CurveResult}} object containing thresholds, target + #' labels, false positive rates (FPR), true positive rates (TPR), area under + #' the curve (AUC), and Youden's Index for all training and validation sets + #' for each model. 
+    #'
+    #' @examples
+    #' # Load an example dataset
+    #' data <- iris
+    #'
+    #' # Make Binary
+    #' data$Species <- ifelse(data$Species == "setosa", "setosa", "not setosa")
+    #'
+    #' # Perform a train-test split with an 80% training set and stratified
+    #' # sampling using QDA
+    #' results <- class_cv(
+    #'   data = data,
+    #'   target = "Species",
+    #'   models = "qda",
+    #'   train_params = list(split = 0.8, stratified = TRUE, random_seed = 123),
+    #'   save = list(data = TRUE, models = TRUE)
+    #' )
+    #'
+    #' # Get ROC curve
+    #' results$roc_curve(return_output = FALSE)
+    #'
+    #' @importFrom grDevices rainbow
+    #' @importFrom graphics lines
+    roc_curve = function(data = NULL,
+                         models = NULL,
+                         split = TRUE,
+                         cv = TRUE,
+                         thresholds = NULL,
+                         return_output = TRUE,
+                         path = NULL, ...) {
+      # Resolve the model selection first so an invalid selection is reported
+      # before any curve work starts.
+      selected_models <- private$.resolve_models(models)
+      curve_data <- .curve_entry(
+        self, data, selected_models, split, cv, thresholds, return_output,
+        "roc", path, ...
+      )
+
+      CurveResult$new(curve_data, "roc")
+    },
+    #' @description Produces PR curves and computes the area under the curve
+    #' (AUC) and the threshold with the maximum F1 score. Only works for binary
+    #' classification tasks.
+    #'
+    #' @param data A data frame. If \code{NULL}, then the preprocessed data
+    #' must be saved using \code{save = list("data" = TRUE)} in \code{class_cv}.
+    #' Default = \code{NULL}.
+    #'
+    #' @param models A character string or a character vector specifying the
+    #' classification algorithm(s) to plot curves for. If \code{NULL}, all
+    #' models will be plotted.
The following options are available: + #' \itemize{ + #' \item \code{"lda"}: Linear Discriminant Analysis + #' \item \code{"qda"}: Quadratic Discriminant Analysis + #' \item \code{"logistic"}: Unregularized Logistic Regression + #' \item \code{"regularized_logistic"}: Regularized Logistic Regression + #' \item \code{"svm"}: Support Vector Machine + #' \item \code{"naivebayes"}: Naive Bayes + #' \item \code{"nnet"}: Neural Network + #' \item \code{"knn"}: K-Nearest Neighbors + #' \item \code{"decisiontree"}: Decision Tree + #' \item \code{"randomforest"}: Random Forest + #' \item \code{"multinom"}: Unregularized Multinomial Logistic Regression + #' \item \code{"regularized_multinomial"}: Regularized Multinomial Logistic + #' Regression + #' \item \code{"xgboost"}: Extreme Gradient Boosting + #' } + #' Default = \code{NULL}. + #' + #' @param split A logical value indicating whether to plot curves for the + #' train-test split results. Default is \code{TRUE}. + #' + #' @param cv A logical value indicating whether to plot curves for + #' cross-validation results. Default is \code{TRUE}. + #' + #' @param thresholds A numerical vector specifying the thresholds to use + #' when producing the curves. If left as \code{NULL} the unique probability + #' values produced by the training model will be used as thresholds. + #' Default is \code{NULL}. + #' + #' @param return_output A logical value indicating whether to return the + #' output list. Default is \code{TRUE}. + #' + #' @param path A character string specifying the directory (with a trailing + #' slash) to save the plots. + #' Default is \code{NULL}. + #' + #' @param ... Additional arguments passed to the \code{png} function. + #' + #' @return A \code{\link{CurveResult}} object containing thresholds, target + #' labels, precision, recall, area under the curve (AUC), and maximum F1 + #' score and its associated optimal threshold for all training and validation + #' sets for each model. 
+    #'
+    #' @examples
+    #' # Load an example dataset
+    #' data <- iris
+    #'
+    #' # Make Binary
+    #' data$Species <- ifelse(data$Species == "setosa", "setosa", "not setosa")
+    #'
+    #' # Perform a train-test split with an 80% training set and stratified
+    #' # sampling using QDA
+    #' results <- class_cv(
+    #'   data = data,
+    #'   target = "Species",
+    #'   models = "qda",
+    #'   train_params = list(split = 0.8, stratified = TRUE, random_seed = 123),
+    #'   save = list(data = TRUE, models = TRUE)
+    #' )
+    #'
+    #' # Get PR curve
+    #' results$pr_curve(return_output = FALSE)
+    pr_curve = function(data = NULL,
+                        models = NULL,
+                        split = TRUE,
+                        cv = TRUE,
+                        thresholds = NULL,
+                        return_output = TRUE,
+                        path = NULL,
+                        ...) {
+      models <- private$.resolve_models(models)
+      results <- .curve_entry(
+        self, data, models, split, cv, thresholds, return_output, "pr",
+        path, ...
+      )
+
+      return(CurveResult$new(results, "pr"))
+    }
+  ),
+  private = list(
+    .configs = NULL,
+    .class_summary = NULL,
+    .metrics = NULL,
+    .trained_models = NULL,
+    .missing_data_summary = NULL,
+    .data_partitions = NULL,
+    .imputation_models = NULL,
+    # Map a user-supplied model selection onto the models stored in this
+    # object. NULL selects every available model. Names are matched
+    # case-insensitively; unknown names trigger a warning, and an error is
+    # raised when none of the requested names is available.
+    .resolve_models = function(target_models) {
+      available <- self$available_models()
+      if (is.null(target_models)) {
+        return(available)
+      }
+
+      # Make lowercase
+      requested <- tolower(unlist(target_models, use.names = FALSE))
+      models <- intersect(requested, available)
+
+      if (length(models) == 0) stop("no valid models specified in `models`")
+
+      # Invalid names are the requested ones that are not available; the
+      # reverse difference can never contain a truly unknown name.
+      invalid_models <- setdiff(requested, available)
+      if (length(invalid_models) > 0) {
+        warning(
+          sprintf(
+            "invalid model in models or information for specified model not present in vswift x: %s",
+            paste(invalid_models, collapse = ", ")
+          )
+        )
+      }
+
+      return(models)
+    }
+  ),
+  active = list(
+    #' @field classes Character vector of target classes.
+    classes = function() private$.class_summary$classes,
+    #' @field n_models Number of models in this result.
+    n_models = function() {
+      length(private$.metrics)
+    },
+    #' @field model_names Character vector of model names.
+    model_names = function() {
+      names(private$.metrics)
+    },
+    #' @field has_split TRUE if train-test split was performed.
+    has_split = function() {
+      split_config <- private$.configs$train_params$split
+      !is.null(split_config)
+    },
+    #' @field has_cv TRUE if cross-validation was performed.
+    has_cv = function() {
+      fold_config <- private$.configs$train_params$n_folds
+      !is.null(fold_config)
+    },
+    #' @field n_folds Number of CV folds. NULL if no CV.
+    n_folds = function() {
+      private$.configs$train_params$n_folds
+    }
+  )
+)
+
+
+#' Curve Results
+#'
+#' @name CurveResult
+#'
+#' @description
+#' An R6 class containing ROC or Precision-Recall curve results produced by
+#' the \code{roc_curve} or \code{pr_curve} methods of a \code{\link{Vswift}}
+#' object. Provides methods for accessing probabilities, AUC, optimal
+#' thresholds, and comparing models.
+#'
+#' @export
+CurveResult <- R6::R6Class(
+  "CurveResult",
+  public = list(
+    #' @description Create a new CurveResult object.
+    #'
+    #' @param results Named list of curve results keyed by model name.
+    #'
+    #' @param curve_type Character. Either "roc" or "pr".
+    initialize = function(results, curve_type) {
+      private$.results <- results
+      private$.type <- curve_type
+    },
+
+    #' @description Retrieve curve results for a specific model.
+    #'
+    #' @param model Character. Model name. \code{NULL} returns all models.
+    #' Default is \code{NULL}.
+    #'
+    #' @return A named list of curve results, or all results if \code{model}
+    #' is \code{NULL}.
+    get_model = function(model = NULL) {
+      all_results <- private$.results
+      if (is.null(model)) all_results else .get_object(all_results, model)
+    },
+
+    #' @description Retrieve predicted probabilities for a model partition.
+    #'
+    #' @param model Character. Model name.
+    #'
+    #' @param partition Character. "split" or "fold1".."foldN".
+    #'
+    #' @param set Character. "train" or "test".
+    #'
+    #' @return A numeric vector of predicted probabilities.
+    get_probs = function(model, partition, set) {
+      # Descend model -> partition -> set; each step errors on an unknown name.
+      obj <- .get_object(private$.results, model)
+      obj <- .get_object(obj, partition)
+
+      return(.get_object(obj, set)$probs)
+    },
+
+    #' @description Retrieve the area under the curve (AUC).
+    #'
+    #' @param model Character. Model name. If \code{NULL}, returns AUC for
+    #' all models as a named vector. Default is \code{NULL}.
+    #'
+    #' @param partition Character. "split" or "fold1".."foldN". Default is
+    #' \code{"split"}.
+    #'
+    #' @param set Character. "train" or "test". Default is \code{"test"}.
+    #'
+    #' @return A numeric value or named numeric vector of AUC values.
+    get_auc = function(model = NULL, partition = "split", set = "test") {
+      if (!is.null(model)) {
+        # Single-model path: partition is looked up directly under the model.
+        obj <- .get_object(private$.results, model)
+        obj <- .get_object(obj, partition)
+
+        return(.get_object(obj, set)$auc)
+      }
+
+      # All-models path.
+      # NOTE(review): here fold partitions are read from `$cv[[partition]]`
+      # and `set` is not used, whereas the single-model path above looks the
+      # partition up directly under the model and then applies `set`. The two
+      # paths assume different layouts for fold results -- confirm against
+      # what `.curve_entry` produces.
+      auc <- sapply(names(private$.results), function(x) {
+        if (partition == "split") {
+          private$.results[[x]]$split[[set]]$auc
+        } else {
+          private$.results[[x]]$cv[[partition]]$auc
+        }
+      })
+
+      return(auc)
+    },
+
+    #' @description Retrieve the maximum F1 score. Only available for
+    #' Precision-Recall curves.
+    #'
+    #' @param model Character. Model name. If \code{NULL}, returns max F1 for
+    #' all models as a named vector. Default is \code{NULL}.
+    #'
+    #' @param partition Character. "split" or "fold1".."foldN". Default is
+    #' \code{"split"}.
+    #'
+    #' @param set Character. "train" or "test". Default is \code{"test"}.
+    #'
+    #' @return A numeric value, named numeric vector, or \code{NULL} if the
+    #' curve type is not "pr".
+    get_max_f1 = function(model = NULL, partition = "split", set = "test") {
+      if (private$.type != "pr") {
+        return(NULL)
+      }
+
+      if (is.null(model)) {
+        # One value per stored model.
+        # NOTE(review): fold partitions are read from `$cv[[partition]]` and
+        # `set` is not used here, unlike the single-model path below --
+        # confirm the result layout.
+        return(
+          sapply(names(private$.results), function(model_name) {
+            if (partition == "split") {
+              private$.results[[model_name]]$split[[set]]$max_f1
+            } else {
+              private$.results[[model_name]]$cv[[partition]]$max_f1
+            }
+          })
+        )
+      }
+
+      model_entry <- .get_object(private$.results, model)
+      partition_entry <- .get_object(model_entry, partition)
+
+      .get_object(partition_entry, set)$max_f1
+    },
+
+    #' @description Retrieve the optimal threshold. For ROC curves, this is
+    #' Youden's Index. For PR curves, this is the threshold that maximizes the
+    #' F1 score.
+    #'
+    #' @param model Character. Model name. If \code{NULL}, returns optimal
+    #' thresholds for all models as a named vector. Default is \code{NULL}.
+    #'
+    #' @param partition Character. "split" or "fold1".."foldN". Default is
+    #' \code{"split"}.
+    #'
+    #' @param set Character. "train" or "test". Default is \code{"test"}.
+    #'
+    #' @return A numeric value or named numeric vector of optimal thresholds.
+    get_optimal_threshold = function(model = NULL, partition = "split",
+                                     set = "test") {
+      # Name of the stored field that holds the threshold for this curve type.
+      field <- ifelse(
+        private$.type == "pr", "optimal_threshold", "youdens_index"
+      )
+
+      if (is.null(model)) {
+        # One value per stored model (same layout caveat as in get_max_f1).
+        return(
+          sapply(names(private$.results), function(model_name) {
+            if (partition == "split") {
+              private$.results[[model_name]]$split[[set]][[field]]
+            } else {
+              private$.results[[model_name]]$cv[[partition]][[field]]
+            }
+          })
+        )
+      }
+
+      model_entry <- .get_object(private$.results, model)
+      partition_entry <- .get_object(model_entry, partition)
+
+      .get_object(partition_entry, set)[[field]]
+    },
+
+    #' @description Retrieve curve metrics (FPR/TPR for ROC, or
+    #' precision/recall for PR curves).
+    #'
+    #' @param model Character. Model name.
+    #'
+    #' @param partition Character. "split" or "fold1".."foldN". Default is
+    #' \code{"split"}.
+    #'
+    #' @param set Character. "train" or "test". Default is \code{"test"}.
+    #'
+    #' @return A named list containing the curve metrics.
+    get_metrics = function(model, partition = "split", set = "test") {
+      # Descend model -> partition -> set; each step errors on an unknown name.
+      obj <- .get_object(private$.results, model)
+      obj <- .get_object(obj, partition)
+
+      return(.get_object(obj, set)$metrics)
+    },
+
+    #' @description Compare AUC across all models for a given partition.
+    #'
+    #' @param partition Character. "split" or "fold1".."foldN".
+    #'
+    #' @param set Character. "train" or "test".
+    #'
+    #' @return A data.frame with columns \code{model} and \code{auc}.
+    compare = function(partition, set) {
+      models <- names(private$.results)
+      # One get_auc() call per model; row.names = NULL drops the names that
+      # sapply() attaches to its result.
+      data.frame(
+        model = models,
+        auc = sapply(models, function(m) self$get_auc(m, partition, set)),
+        row.names = NULL
+      )
+    }
+  ),
+  private = list(
+    .results = NULL,
+    .type = NULL
+  )
+)
+
+# Return element `name` of `x`, or stop with a message listing the valid
+# names when `name` is not among names(x).
+.get_object <- function(x, name) {
+  if (!name %in% names(x)) {
+    valid_names <- names(x)
+    valid_name_str <- paste(valid_names, collapse = ", ")
+    stop(
+      sprintf(
+        "'%s' is not a valid name, valid names are: %s",
+        name, valid_name_str
+      )
+    )
+  }
+
+  return(x[[name]])
+}
diff --git a/R/sampling.R b/R/sampling.R
new file mode 100644
index 0000000..9b83fbd
--- /dev/null
+++ b/R/sampling.R
@@ -0,0 +1,53 @@
+# Sampling helper: fills final_output$data_partitions with split and/or CV
+# indices (plus class proportions when stratified) and returns final_output.
+# NOTE(review): the original comment said "used by classCV and genFolds"; this
+# patch renames classCV to class_cv and deletes R/genFolds.R -- confirm callers.
+.sampling <- function(data, train_params, target, final_output) {
+  # Base args
+  base_args <- list(N = nrow(data), random_seed = train_params$random_seed)
+
+  if (isTRUE(train_params$stratified)) {
+    # Create args list
+    strat_args <- list(
+      classes = final_output$class_summary$classes,
+      class_indxs = final_output$class_summary$indices,
+      class_props = final_output$class_summary$proportions
+    )
+
+    strat_args <- c(base_args, strat_args)
+    # Get stratified indices
+    if (!is.null(train_params$split)) {
+      strat_args$split <- train_params$split
+      final_output$data_partitions$indices$split <- do.call(.stratified_split, strat_args)
+      # Get proportions of classes in the stratified indices
+      final_output$data_partitions$proportions$split <- .get_proportions(
+        data[, target],
+        final_output$data_partitions$indices$split
+      )
+    }
+
+    if (!is.null(train_params$n_folds)) {
+      # Remove split arg
+      strat_args <- strat_args[!names(strat_args) == "split"]
+      strat_args$n_folds <- train_params$n_folds
+      final_output$data_partitions$indices$cv <- do.call(.stratified_cv, strat_args)
+      # Get proportions of classes in the stratified indices
+      final_output$data_partitions$proportions$cv <- .get_proportions(
+        data[, target],
+        final_output$data_partitions$indices$cv
+      )
+    }
+  } else {
+    # Non-stratified sampling
+    if (!is.null(train_params$split)) {
+      base_args$split <- train_params$split
+      final_output$data_partitions$indices$split <- do.call(.split, base_args)
+    }
+
+    if (!is.null(train_params$n_folds)) {
+      # Remove split arg
+      base_args <- base_args[!names(base_args) == "split"]
+      base_args$n_folds <- train_params$n_folds
+      final_output$data_partitions$indices$cv <- do.call(.cv, base_args)
+    }
+  }
+
+  return(final_output)
+}
diff --git a/R/utils.R b/R/utils.R
index 4d9dbd8..29d2da8 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -1,222 +1,203 @@
-# Helper function to partition data; indices is always the test set
-.partition <- function(data, indices) {
-  return(list("train" = data[-indices, ], "test" = data[indices, ]))
-}
-
-# Helper function to generate all dataframes; primarily used if save$data = TRUE
-.create_data <- function(data, subsets) {
-  df_list <- list()
-  # Get data for train-test split
-  if ("split" %in% names(subsets)) {
-    df_list$split$train <- data[subsets$split$train, ]
-    df_list$split$test <- data[subsets$split$test, ]
-  }
-
-  # Get train and test partitions for cv
-  if ("cv" %in% names(subsets)) {
-    for (fold in names(subsets$cv)) {
-      df_list$cv[[fold]]$train <- data[-subsets$cv[[fold]], ]
-      df_list$cv[[fold]]$test <- data[subsets$cv[[fold]], ]
-    }
-  }
-
-  return(df_list)
-}
-
-# Helper function to get test indices
-.get_indices <- function(obj, id) {
-  if (id == "split") {
-
return(obj[[id]]$test) - } else { - # Should be foldn, such as fold1, fold2, etc - return(obj$cv[[id]]) - } -} - -# Helper function to generate foldid -.get_foldid <- function(cv_indxs, N) { - # Replace fold ids with numbers - names(cv_indxs) <- 1:length(cv_indxs) - # Get the fold number of each observation - foldid <- as.vector(sapply(1:N, function(indx) .get_key(indx, cv_indxs))) - - # Returns vector where each position has an id, corresponding to the fold an observation belongs to - return(foldid) -} - -# Helper function to obtain key of a value in a sublist -.get_key <- function(indx, cv_indxs) { - # Get a bool vector that indicates which sublist has the indx/observation - bool_vec <- sapply(cv_indxs, function(x) indx %in% x) - - # Return fold id for index - return(as.numeric(names(bool_vec)[bool_vec])) -} - -# Helper function to unnest parallel list -.unnest <- function(par_list, iters, model, saved_mods) { - targets <- c("metrics") - metrics <- list() - lambdas <- c() - - if (isTRUE(saved_mods)) { - targets <- c("metrics", "models") - models <- list() - } - - # Append the optimal lambdas; use c() to retain names - if (startsWith(model, "regularized")) { - for (i in seq_along(iters)) lambdas <- c(lambdas, par_list[[i]]$optimal_lambda) - } - - for (target in targets) { - for (i in seq_along(iters)) { - if (target == "metrics") { - if (iters[i] == "split") { - metrics$split <- par_list[[i]]$metrics$split - } else { - metrics$cv <- c(metrics$cv, par_list[[i]]$metrics$cv) - } - } else { - if (iters[i] == "split") { - models$split <- par_list[[i]]$models$split - } else { - models$cv <- c(models$cv, par_list[[i]]$models$cv) - } - } - } - } - - out <- list("metrics" = metrics) - - if (isTRUE(saved_mods)) out$models <- models - - if (length(lambdas) > 0) out$optimal_lambdas <- lambdas - - return(out) -} - - -# Helper function to prep the data for validation -.prep_data <- function(i = NULL, train = NULL, test = NULL, kwargs = NULL, preprocessed_data = NULL, 
preproc_kwargs = NULL) { - is_standardized <- FALSE - - if (is.null(preprocessed_data)) { - # Impute; determine if impute_models is not NULL or an empty list - if (!is.null(kwargs$impute_models) && length(kwargs$impute_models) > 0) { - df_list <- .impute_bake(train = train, test = test, vars = kwargs$vars, prep = kwargs$impute_models[[i]]) - train <- df_list$train - test <- df_list$test - is_standardized <- TRUE - } - - # Standardize - if (isFALSE(is_standardized) && isTRUE(kwargs$train_params$standardize)) { - df_list <- .standardize_train(train, test, kwargs$train_params$standardize, target = kwargs$vars$target) - train <- df_list$train - test <- df_list$test - } - - return(list("train" = train, "test" = test)) - } else { - # Impute; determine if impute_models is not NULL or an empty list - if (!is.null(preproc_kwargs$prep)) { - df_list <- .impute_bake( - preprocessed_data = preprocessed_data, - vars = preproc_kwargs$vars, prep = preproc_kwargs$prep - ) - preprocessed_data <- df_list$preprocessed_data - is_standardized <- TRUE - } - - # Standardize - if (isFALSE(is_standardized) && isTRUE(preproc_kwargs$standardize)) { - df_list <- .standardize(preprocessed_data, standardize = TRUE, preproc_kwargs$vars$target) - } - - return(df_list$preprocessed_data) - } -} - - -# Helper function for to remove observations in test set with factors in predictors not observed during train -.remove_obs <- function(train, test, col_levels, id) { - # Iterate over columns and check for the factors that exist in test set but not the train set - for (col in names(col_levels)) { - delete_rows <- which(!test[, col] %in% train[, col]) - obs <- row.names(test)[delete_rows] - - if (length(obs) > 0) { - warning(sprintf( - "for predictor `%s` in `%s` data partition has at least one class the model has not trained on\nthese observations will be temporarily removed: %s", - col, id, paste(obs, collapse = ",") - )) - test <- test[-delete_rows, ] - } - } - - return(list("test" = test)) -} - -# 
Helper function to get models present in vswift object -.intersect_models <- function(x, models) { - # Get models - if (is.null(models)) { - models <- x$configs$models - } else { - # Make lowercase - models <- sapply(models, function(x) tolower(x)) - models <- intersect(models, x$configs$models) - - if (length(models) == 0) stop("no valid models specified in `models`") - - # Warning when invalid models specified - invalid_models <- models[which(!models %in% models)] - if (length(invalid_models) > 0) { - warning(sprintf( - "invalid model in models or information for specified model not present in vswift x: %s", - paste(unlist(invalid_models), collapse = ", ") - )) - } - } - - return(models) -} - - -# Helper function to convert matrices to vectors -# Handle prediction output, some models will produce a matrix with posterior probabilities for binary outcomes -.tovec <- function(model, result, keys) { - convert <- ( - !(model %in% c("logistic", "regularized_logistic", "nnet", "multinom", "xgboost")) && - !is.null(keys) && length(dim(result)) == 2 - ) - - if (convert) result <- result[, names(keys)[keys == 1]] - - return(as.vector(result)) -} - -# Helper function to determine if default boundary should be used -.determine_threshold <- function(model, obj, threshold, issue_warning = TRUE) { - xgboost_logistic <- c("reg:logistic", "binary:logistic", "binary:logitraw") - check_bool <- obj %in% xgboost_logistic - - if ((model == "logistic" || (model == "xgboost" && isTRUE(check_bool))) && is.null(threshold)) { - threshold <- 0.5 - if (issue_warning) warning(sprintf("using a default threshold of 0.5 to classify groups for %s model", model)) - } else if (model == "xgboost" && obj == "binary:hinge") { - threshold <- NULL - } - - return(threshold) -} - -# Helper function to ensure all columns have the same levels as the original data for svm -.relevel_cols <- function(data, col_levels) { - data[, names(col_levels)] <- data.frame( - lapply(names(col_levels), function(col) 
# Helper function to partition data; indices is always the test set
#
# Args:
#   data: data frame to split.
#   indices: row indices of the test set.
#
# Returns:
#   list with "train" (all rows not in `indices`) and "test" (rows in `indices`).
.partition <- function(data, indices) {
  # Negative subsetting with an empty index vector selects *zero* rows, so an empty test set must
  # leave the training data untouched instead
  train <- if (length(indices) > 0) data[-indices, , drop = FALSE] else data

  return(list("train" = train, "test" = data[indices, , drop = FALSE]))
}

# Helper function to generate all dataframes; primarily used if save$data = TRUE
#
# Args:
#   data: data frame the indices in `subsets` refer to.
#   subsets: list that may contain `split` (with `train` and `test` indices) and/or `cv`
#     (named list of test indices, one element per fold).
#
# Returns:
#   list mirroring `subsets`, holding the train and test data frames of every partition.
.create_data <- function(data, subsets) {
  df_list <- list()

  # Get data for train-test split
  if ("split" %in% names(subsets)) {
    df_list$split$train <- data[subsets$split$train, , drop = FALSE]
    df_list$split$test <- data[subsets$split$test, , drop = FALSE]
  }

  # Get train and test partitions for cv; each fold stores only its test indices
  if ("cv" %in% names(subsets)) {
    for (fold in names(subsets$cv)) {
      df_list$cv[[fold]] <- .partition(data, subsets$cv[[fold]])
    }
  }

  return(df_list)
}

# Helper function to get test indices
#
# Args:
#   obj: list of indices with a `split` element and/or a `cv` element.
#   id: "split" or the name of a fold.
#
# Returns:
#   The test indices of the requested partition.
.get_indices <- function(obj, id) {
  if (id == "split") {
    return(obj[[id]]$test)
  }

  # Should be foldn, such as fold1, fold2, etc
  return(obj$cv[[id]])
}

# Helper function to generate foldid
#
# Args:
#   cv_indxs: list of index vectors, one element per fold.
#   N: number of observations.
#
# Returns:
#   Numeric vector of length N where each position has an id, corresponding to the fold an
#   observation belongs to. Errors if an observation is not in exactly one fold.
.get_foldid <- function(cv_indxs, N) {
  # Replace fold ids with numbers
  names(cv_indxs) <- seq_along(cv_indxs)

  # Get the fold number of each observation; seq_len() keeps N = 0 from iterating over c(1, 0) and
  # vapply() guarantees a numeric vector rather than a silently malformed list
  foldid <- vapply(seq_len(N), function(indx) .get_key(indx, cv_indxs), numeric(1))

  return(foldid)
}

# Helper function to obtain key of a value in a sublist
#
# Args:
#   indx: a single observation index.
#   cv_indxs: list of index vectors whose names are fold numbers.
#
# Returns:
#   Numeric fold id(s) of the sublists containing `indx`.
.get_key <- function(indx, cv_indxs) {
  # Get a bool vector that indicates which sublist has the indx/observation
  bool_vec <- vapply(cv_indxs, function(x) indx %in% x, logical(1))

  # Return fold id for index
  return(as.numeric(names(bool_vec)[bool_vec]))
}
# Helper function to prep the data for validation
#
# Works in one of two modes, selected by `preprocessed_data`:
#   * `preprocessed_data` is NULL: `train` and `test` are prepared using `kwargs` and returned as
#     list("train" = ..., "test" = ...).
#   * `preprocessed_data` is supplied: that single data frame is prepared using `preproc_kwargs`
#     and returned directly.
#
# Args:
#   i: key/position of the imputation model in `kwargs$impute_models` to use.
#   train, test: data partitions.
#   kwargs: list read for `impute_models`, `vars`, and `train_params$standardize`.
#   preprocessed_data: data frame to prepare as a whole.
#   preproc_kwargs: list read for `prep`, `vars`, and `standardize`.
.prep_data <- function(
    i = NULL, train = NULL, test = NULL, kwargs = NULL,
    preprocessed_data = NULL, preproc_kwargs = NULL
) {
  # NOTE(review): standardization is skipped whenever imputation ran; presumably the imputation
  # recipe already standardizes -- confirm against `.impute_bake`
  is_standardized <- FALSE

  if (is.null(preprocessed_data)) {
    # Impute; determine if impute_models is not NULL or an empty list
    if (!is.null(kwargs$impute_models) && length(kwargs$impute_models) > 0) {
      df_list <- .impute_bake(
        train = train, test = test, vars = kwargs$vars,
        prep = kwargs$impute_models[[i]]
      )
      train <- df_list$train
      test <- df_list$test
      is_standardized <- TRUE
    }

    # Standardize
    if (isFALSE(is_standardized) && isTRUE(kwargs$train_params$standardize)) {
      df_list <- .standardize_train(train, test, kwargs$train_params$standardize, target = kwargs$vars$target)
      train <- df_list$train
      test <- df_list$test
    }

    return(list("train" = train, "test" = test))
  }

  # Impute; determine if a prepped imputation model was supplied
  if (!is.null(preproc_kwargs$prep)) {
    df_list <- .impute_bake(
      preprocessed_data = preprocessed_data,
      vars = preproc_kwargs$vars, prep = preproc_kwargs$prep
    )
    preprocessed_data <- df_list$preprocessed_data
    is_standardized <- TRUE
  }

  # Standardize
  if (isFALSE(is_standardized) && isTRUE(preproc_kwargs$standardize)) {
    df_list <- .standardize(preprocessed_data, standardize = TRUE, preproc_kwargs$vars$target)
    preprocessed_data <- df_list$preprocessed_data
  }

  # Return the tracked data frame rather than `df_list`, which does not exist when neither step ran
  return(preprocessed_data)
}


# Helper function for to remove observations in test set with factors in predictors not observed during train
#
# Args:
#   train, test: data partitions.
#   col_levels: named list; only its names (the columns to check) are used here.
#   id: label of the partition, used in the warning message.
#
# Returns:
#   list("test" = ...) holding `test` without the rows whose values were never seen in `train`.
.remove_obs <- function(train, test, col_levels, id) {
  # Iterate over columns and check for the factors that exist in test set but not the train set
  for (col in names(col_levels)) {
    delete_rows <- which(!test[[col]] %in% train[[col]])

    if (length(delete_rows) == 0) next

    obs <- row.names(test)[delete_rows]
    warning(sprintf(
      "for predictor `%s` in `%s` data partition has at least one class the model has not trained on\nthese observations will be temporarily removed: %s",
      col, id, paste(obs, collapse = ",")
    ))
    # drop = FALSE keeps a data frame even when only one column is left
    test <- test[-delete_rows, , drop = FALSE]
  }

  return(list("test" = test))
}


# Helper function to convert matrices to vectors
# Handle prediction output, some models will produce a matrix with posterior probabilities for binary outcomes
#
# Args:
#   model: name of the model that produced `result`.
#   result: prediction output (vector or two-dimensional object with one column per class).
#   keys: named encoding of the classes, or NULL; the column named after the class encoded as 1 is kept.
#
# Returns:
#   `result` as a plain vector.
.tovec <- function(model, result, keys) {
  # Models listed here are excluded from column selection and are only flattened
  no_column_pick <- c("logistic", "regularized_logistic", "nnet", "multinom", "xgboost")

  convert <- !(model %in% no_column_pick) && !is.null(keys) && length(dim(result)) == 2

  if (convert) result <- result[, names(keys)[keys == 1]]

  return(as.vector(result))
}
# Helper function to ensure all columns have the same levels as the original data for svm
#
# Args:
#   data: data frame containing the columns named in `col_levels`.
#   col_levels: named list mapping a column name to the full set of levels that column should carry.
#
# Returns:
#   `data` with each listed column rebuilt as a factor using the supplied levels.
.relevel_cols <- function(data, col_levels) {
  target_cols <- names(col_levels)

  # Rebuild every listed column as a factor carrying the complete level set
  releveled <- lapply(
    target_cols,
    function(col_name) factor(data[, col_name], levels = col_levels[[col_name]])
  )

  # Columns are written back by position, in the same order as `target_cols`
  data[, target_cols] <- data.frame(releveled)

  return(data)
}
## Supported Classification Algorithms -The following classification algorithms are available through their respective R packages: +The following classification algorithms are available through their respective +R packages: - `lda` from MASS package for Linear Discriminant Analysis - `qda` from MASS package for Quadratic Discriminant Analysis - - `glm` from base package with `family = "binomial"` for Unregularized Logistic Regression - - `glmnet` from `glmnet` package with `family = "binomial"` or `family = "multinomial"`and using `cv.glmnet` to select the optimal lambda for + - `glm` from stats package with `family = "binomial"` for Unregularized + Logistic Regression + - `glmnet` from `glmnet` package with `family = "binomial"` or + `family = "multinomial"` and using `cv.glmnet` to select the optimal lambda for Regularized Logistic Regression and Regularized Multinomial Logistic Regression. - `svm` from e1071 package for Support Vector Machine - `naive_bayes` from naivebayes package for Naive Bayes @@ -20,29 +25,45 @@ The following classification algorithms are available through their respective R - `train.kknn` from kknn package for K-Nearest Neighbors - `rpart` from rpart package for Decision Trees - `randomForest` from randomForest package for Random Forest - - `multinom` from nnet package for Unregularized Multinomial Logistic Regression + - `multinom` from nnet package for Unregularized Multinomial Logistic + Regression - `xgb.train` from xgboost package for Extreme Gradient Boosting ## Features ### Data Handling -- **Versatile Data Splitting**: Perform train-test splits or cross-validation on your classification data. -- **Stratified Sampling Option**: Ensure representative class distribution using stratified sampling based on class proportions. -- **Handling Unseen Categorical Levels**: Automatically exclude observations from the validation/test set with categories not seen during model training.
+- **Versatile Data Splitting**: Perform train-test splits or cross-validation +on your classification data. +- **Stratified Sampling Option**: Ensure representative class distribution +using stratified sampling based on class proportions. +- **Handling Unseen Categorical Levels**: Automatically exclude observations +from the validation/test set with categories not seen during model training. ### Model Configuration -- **Support for Popular Algorithms**: Choose from a wide range of classification algorithms. Multiple algorithms can be specified in a single function call. -- **Model Saving Capabilities**: Save all models utilized for training and testing for both train-test splitting and cross-validation. +- **Support for Popular Algorithms**: Choose from a wide range of classification +algorithms. Multiple algorithms can be specified in a single function call. +- **Model Saving Capabilities**: Save all models utilized for training and +testing for both train-test splitting and cross-validation. - **Final Model Creation**: Easily create and save final models for future use. -- **Dataset Saving Options**: Preserve split datasets and folds for reproducibility. -- **Parallel Processing**: Utilize multi-core processing for cross-validation through the future package, configurable via `n_cores` and `future.seed` keys in the `parallel_configs` parameter. +- **Dataset Saving Options**: Preserve split datasets and folds for +reproducibility. +- **Parallel Processing**: Utilize multi-core processing for cross-validation +through the future package, configurable via `n_cores` and `future.seed` keys +in the `parallel_configs` parameter. ### Data Preprocessing -- **Missing Data Imputation**: Select either Bagged Tree Imputation or KNN Imputation, implemented using the recipes package. Imputation only uses feature data (specifically observations where not all features are missing) from the training set to prevent leakage. 
-- **Automatic Numerical Encoding**: Target variable classes are automatically encoded numerically for algorithms requiring numerical inputs. +- **Missing Data Imputation**: Select either Bagged Tree Imputation or KNN +Imputation, implemented using the recipes package. Imputation only uses feature +data (specifically observations where not all features are missing) from the +training set to prevent leakage. +- **Automatic Numerical Encoding**: Target variable classes are automatically +encoded numerically for algorithms requiring numerical inputs. ### Model Evaluation -- **Comprehensive Metrics**: Generate and save performance metrics including classification accuracy, precision, recall, and F1 for each class. For binary classification tasks, produce ROC (Receiver Operating Characteristic) and PR (Precision-Recall) curves and calculate AUC (Area Under Curve) scores. +- **Comprehensive Metrics**: Generate and save performance metrics including +classification accuracy, precision, recall, and F1 for each class. For binary +classification tasks, produce ROC (Receiver Operating Characteristic) and PR +(Precision-Recall) curves and calculate AUC (Area Under Curve) scores. ## Installation @@ -64,7 +85,7 @@ help(package = "vswift") ```R # Install 'vswift' package install.packages( - "https://github.com/donishadsmith/vswift/releases/download/0.5.0.9006/vswift_0.5.0.9006.tar.gz", + "https://github.com/donishadsmith/vswift/releases/download/0.6.0/vswift_0.6.0.tar.gz", repos = NULL, type = "source" ) @@ -74,7 +95,7 @@ help(package = "vswift") ``` ## Usage -The type of classification algorithm is specified using the `models` parameter in the `classCV` function. +The type of classification algorithm is specified using the `models` parameter in the `class_cv` function. 
Acceptable inputs for the `models` parameter includes: @@ -121,14 +142,14 @@ library(vswift) map_args <- list(regularized_logistic = list(alpha = 1, nfolds = 3)) # Perform train-test split and cross-validation with stratified sampling -results <- classCV( +results <- class_cv( data = thyroid_data, formula = Recurred ~ ., models = "regularized_logistic", model_params = list( - map_args = map_args, - rule = "1se", # rule can be "min" or "1se" - verbose = TRUE + map_args = map_args, + rule = "1se", # rule can be "min" or "1se" + verbose = TRUE ), train_params = list( split = 0.8, @@ -156,27 +177,41 @@ Model: regularized_logistic | Partition: Fold 5 | Optimal lambda: 0.01253 (neste - Print optimal lambda values. ```R -print(results$metrics$regularized_logistic$optimal_lambdas) +results$metrics("regularized_logistic", "optimal_lambdas") ``` -
- -Output - +**Output** ``` split fold1 fold2 fold3 fold4 fold5 0.094590537 0.009834647 0.079494739 0.013763132 0.005649260 0.012525544 ``` -
+```R +# Quick summary +results$summary() +``` +**Output** +``` +Classification Results +----------------------------- + Models: regularized_logistic + Classes: No, Yes + Split: 0.8 (Training), 0.2 (Test) + Folds: 5 + + Mean Classification Accuracy (Train-Test Split): + Regularized Logistic Regression 0.928 (Training), 0.910 (Test) + + Mean Classification Accuracy (CV): + Regularized Logistic Regression 0.948 +``` ```R # Print parameter information and model evaluation metrics -print(results, parameters = TRUE, metrics = TRUE) +results$print(configs = TRUE, metrics = TRUE) ```
@@ -184,7 +219,7 @@ print(results, parameters = TRUE, metrics = TRUE) Output ``` -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - - - - - -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Model: Regularized Logistic Regression @@ -213,7 +248,7 @@ Parallel Configs: list(n_cores = NULL, future.seed = NULL) -Training +Training _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ Classification Accuracy: 0.93 @@ -252,7 +287,7 @@ Yes 0.97 ± 0.03 (SD) 0.84 ± 0.12 (SD) 0.90 ± 0.06 (SD) ```R # Plot model evaluation metrics -plot(results, split = TRUE, cv = TRUE, path = getwd()) +results$plot(split = TRUE, cv = TRUE, path = getwd()) ```
@@ -282,8 +317,8 @@ ROC and PR curves are only available for binary classification tasks. To generat saved. ```R -# Can `target` parameter, which accepts characters and integers instead of `formula` -results <- classCV( +# Can use `target` parameter, which accepts characters and integers instead of `formula` +results <- class_cv( data = thyroid_data, target = "Recurred", # Using 17, the column index of "Recurred" is also valid models = "naivebayes", @@ -298,16 +333,23 @@ results <- classCV( ) ``` -Output consists of a list containing thresholds used to generate the ROC, target labels, False Positive Rates (FPR), -True Positive Rates (TPR)/Recall, Area Under The Curve (AUC), and Youden's Index for all training and validation sets -for each model. For the PR curve, the outputs replace the FPR with Precision and Youden's Index with the maximum -F1 score and its associated optimal threshold. +Output consists of a `CurveResult` object containing thresholds used to generate the ROC, target labels, False Positive Rates (FPR), True Positive Rates (TPR)/Recall, Area Under The Curve (AUC), and Youden's Index for all training and validation sets for each model. For the PR curve, the outputs replace the FPR with Precision and Youden's Index with the maximum F1 score and its associated optimal threshold. ```R # Will derive thresholds from the probabilities -roc_output <- rocCurve(results, thyroid_data, return_output = TRUE, thresholds = NULL, path = getwd()) +roc_output <- results$roc_curve( + data = thyroid_data, + return_output = TRUE, + thresholds = NULL, + path = getwd() +) -pr_output <- prCurve(results, thyroid_data, return_output = TRUE, thresholds = NULL, path = getwd()) +pr_output <- results$pr_curve( + data = thyroid_data, + return_output = TRUE, + thresholds = NULL, + path = getwd() +) ```
@@ -316,12 +358,12 @@ pr_output <- prCurve(results, thyroid_data, return_output = TRUE, thresholds = N ``` Warning message: -In .create_dictionary(x$class_summary$classes, TRUE) : +In .create_dictionary(x$classes, TRUE) : creating keys for target variable for `rocCurve`; classes are now encoded: No = 0, Yes = 1 Warning message: -In .create_dictionary(x$class_summary$classes, TRUE) : +In .create_dictionary(x$classes, TRUE) : creating keys for target variable for `prCurve`; classes are now encoded: No = 0, Yes = 1 ``` @@ -333,468 +375,40 @@ In .create_dictionary(x$class_summary$classes, TRUE) :
+Access curve results using the `CurveResult` methods: ```R -print(roc_output) +# Get AUC for a specific model and partition +roc_output$get_auc("naivebayes", "split", "test") + +# Get probabilities +roc_output$get_probs("naivebayes", "split", "train") + +# Get curve metrics (FPR/TPR for ROC, precision/recall for PR) +roc_output$get_metrics("naivebayes", "split", "test") + +# Get optimal threshold (Youden's Index for ROC, max F1 threshold for PR) +roc_output$get_optimal_threshold("naivebayes", "split", "test") + +# Compare AUC across all models +roc_output$compare("split", "test") ``` +
- Output - - $naivebayes - $naivebayes$split - $naivebayes$split$train - $naivebayes$split$train$thresholds - [1] 0.000000e+00 3.693169e-12 6.027285e-12 4.073907e-11 4.968343e-11 8.108774e-11 8.381271e-11 9.095559e-11 9.298886e-11 9.346182e-11 1.114830e-10 - [12] 1.422435e-10 1.495389e-10 1.587088e-10 1.749989e-10 1.817692e-10 2.138270e-10 2.230345e-10 2.388617e-10 2.522502e-10 3.014117e-10 3.081804e-10 - [23] 3.520544e-10 4.990782e-10 5.274227e-10 6.636250e-10 1.169893e-09 2.079998e-09 2.664128e-09 2.994029e-09 3.799429e-09 4.480205e-09 8.110897e-09 - [34] 1.160010e-08 1.906848e-08 2.064864e-08 4.727744e-08 4.855115e-08 1.106413e-07 2.729630e-07 3.255273e-07 4.366589e-07 4.367644e-07 4.377311e-07 - [45] 4.491246e-07 4.605240e-07 4.802007e-07 4.941300e-07 7.626642e-07 7.711416e-07 7.982413e-07 1.141365e-06 1.141641e-06 1.144168e-06 1.144997e-06 - [56] 1.150071e-06 1.150071e-06 1.151461e-06 1.159124e-06 1.190156e-06 1.206049e-06 1.209843e-06 1.210205e-06 1.219731e-06 1.233172e-06 1.255177e-06 - [67] 1.285671e-06 1.312232e-06 1.320459e-06 1.327175e-06 1.359847e-06 1.542586e-06 1.589408e-06 1.653207e-06 1.913469e-06 1.938166e-06 1.942925e-06 - [78] 1.952949e-06 1.955310e-06 2.019883e-06 2.048006e-06 2.049502e-06 2.065634e-06 2.145067e-06 2.145594e-06 2.149258e-06 2.183214e-06 2.485015e-06 - [89] 2.756766e-06 3.163302e-06 3.167126e-06 3.188202e-06 3.193598e-06 3.221965e-06 3.317270e-06 3.466089e-06 3.761135e-06 4.180838e-06 4.375333e-06 - [100] 4.394098e-06 5.025238e-06 5.040660e-06 5.510162e-06 5.880527e-06 6.284863e-06 6.601731e-06 8.102284e-06 8.600226e-06 8.819485e-06 8.904729e-06 - [111] 9.205136e-06 9.645657e-06 9.923509e-06 1.026900e-05 1.062422e-05 1.064212e-05 1.176210e-05 1.222003e-05 1.605820e-05 1.624896e-05 1.702907e-05 - [122] 1.706613e-05 1.786223e-05 1.877624e-05 1.982338e-05 1.991122e-05 2.209691e-05 2.484095e-05 2.641378e-05 2.866620e-05 3.027296e-05 3.316481e-05 - [133] 3.320879e-05 5.117616e-05 5.327541e-05 5.394977e-05 5.488148e-05 5.718563e-05 
5.983486e-05 6.241298e-05 6.974128e-05 7.051644e-05 8.795213e-05 - [144] 9.358483e-05 1.136694e-04 1.172686e-04 1.183107e-04 1.429573e-04 1.611606e-04 1.939608e-04 2.188150e-04 2.214605e-04 2.607734e-04 2.653517e-04 - [155] 3.571503e-04 3.856822e-04 4.807467e-04 5.274486e-04 5.299381e-04 6.543110e-04 6.621021e-04 6.801931e-04 9.347791e-04 1.140267e-03 1.393163e-03 - [166] 1.450880e-03 1.557559e-03 1.742214e-03 1.869285e-03 1.950507e-03 1.958437e-03 1.962707e-03 2.855645e-03 3.687773e-03 4.079561e-03 5.480979e-03 - [177] 6.199631e-03 7.280673e-03 8.575042e-03 8.980047e-03 1.037477e-02 1.169980e-02 1.650789e-02 1.922552e-02 2.256963e-02 2.507362e-02 2.562613e-02 - [188] 3.231358e-02 3.259402e-02 3.444110e-02 4.256348e-02 6.708378e-02 7.683335e-02 7.999753e-02 9.268128e-02 1.387462e-01 1.442460e-01 2.284205e-01 - [199] 2.674336e-01 2.915627e-01 3.236000e-01 3.998560e-01 4.034738e-01 4.077277e-01 4.332884e-01 4.591731e-01 5.209434e-01 5.455645e-01 6.364075e-01 - [210] 6.773157e-01 7.260277e-01 7.279223e-01 7.307607e-01 7.541637e-01 7.771591e-01 7.878685e-01 7.926057e-01 8.284044e-01 9.219936e-01 9.530463e-01 - [221] 9.549080e-01 9.798928e-01 9.858326e-01 9.878472e-01 9.899242e-01 9.943402e-01 9.979936e-01 9.981558e-01 9.982179e-01 9.983899e-01 9.985805e-01 - [232] 9.991084e-01 9.992326e-01 9.997728e-01 9.997956e-01 9.999074e-01 9.999161e-01 9.999199e-01 9.999397e-01 9.999576e-01 9.999603e-01 9.999753e-01 - [243] 9.999791e-01 9.999804e-01 9.999879e-01 9.999906e-01 9.999933e-01 9.999935e-01 9.999935e-01 9.999940e-01 9.999941e-01 9.999949e-01 9.999979e-01 - [254] 9.999984e-01 9.999986e-01 9.999989e-01 9.999989e-01 9.999993e-01 9.999994e-01 9.999994e-01 9.999997e-01 9.999997e-01 9.999998e-01 9.999998e-01 - [265] 9.999999e-01 9.999999e-01 9.999999e-01 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 - [276] 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
1.000000e+00 1.000000e+00 1.000000e+00 - [287] 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 - - $naivebayes$split$train$probs - [1] 2.994029e-09 3.081804e-10 4.480205e-09 2.145067e-06 1.495389e-10 1.114830e-10 1.422435e-10 9.095559e-11 1.169893e-09 3.520544e-10 2.138270e-10 - [12] 2.079998e-09 2.522502e-10 4.727744e-08 1.587088e-10 1.749989e-10 8.381271e-11 6.636250e-10 3.799429e-09 8.108774e-11 4.073907e-11 9.346182e-11 - [23] 5.274227e-10 4.968343e-11 8.381271e-11 4.990782e-10 1.160010e-08 3.014117e-10 8.110897e-09 9.095559e-11 6.027285e-12 1.422435e-10 1.817692e-10 - [34] 9.298886e-11 2.388617e-10 3.693169e-12 2.230345e-10 6.199631e-03 2.145594e-06 2.664128e-09 2.729630e-07 6.601731e-06 1.219731e-06 7.711416e-07 - [45] 1.062422e-05 7.982413e-07 4.605240e-07 1.982338e-05 4.394098e-06 4.491246e-07 5.488148e-05 2.064864e-08 4.491246e-07 1.312232e-06 1.312232e-06 - [56] 4.941300e-07 5.510162e-06 4.367644e-07 4.377311e-07 4.366589e-07 1.210205e-06 4.802007e-07 4.377311e-07 7.626642e-07 1.064212e-05 1.542586e-06 - [67] 1.906848e-08 1.991122e-05 2.562613e-02 9.923509e-06 5.025238e-06 1.429573e-04 9.878472e-01 3.466089e-06 1.589408e-06 8.600226e-06 1.106413e-07 - [78] 3.316481e-05 2.149258e-06 1.172686e-04 2.049502e-06 1.183107e-04 2.065634e-06 2.019883e-06 3.221965e-06 2.048006e-06 5.394977e-05 4.807467e-04 - [89] 6.284863e-06 2.485015e-06 1.144997e-06 1.702907e-05 1.209843e-06 1.955310e-06 3.856822e-04 1.653207e-06 1.150071e-06 3.221965e-06 1.952949e-06 - [100] 1.150071e-06 5.117616e-05 1.159124e-06 3.571503e-04 8.102284e-06 2.653517e-04 1.938166e-06 1.144997e-06 9.645657e-06 1.320459e-06 1.144997e-06 - [111] 3.221965e-06 3.163302e-06 1.327175e-06 3.761135e-06 5.040660e-06 1.942925e-06 1.144168e-06 7.051644e-05 2.866620e-05 3.167126e-06 3.188202e-06 - [122] 1.359847e-06 6.621021e-04 1.206049e-06 1.190156e-06 8.795213e-05 1.151461e-06 3.317270e-06 1.150071e-06 1.255177e-06 1.159124e-06 2.484095e-05 - [133] 1.026900e-05 
1.877624e-05 1.141641e-06 4.375333e-06 1.285671e-06 1.611606e-04 1.233172e-06 2.183214e-06 2.641378e-05 1.136694e-04 3.193598e-06 - [144] 4.180838e-06 5.880527e-06 5.274486e-04 2.209691e-05 2.855645e-03 1.176210e-05 2.756766e-06 1.141365e-06 1.913469e-06 3.761135e-06 1.706613e-05 - [155] 1.786223e-05 5.718563e-05 4.855115e-08 3.255273e-07 1.624896e-05 6.801931e-04 4.079561e-03 2.674336e-01 2.607734e-04 1.650789e-02 7.683335e-02 - [166] 4.034738e-01 5.209434e-01 7.878685e-01 8.904729e-06 1.939608e-04 7.541637e-01 5.299381e-04 1.140267e-03 6.974128e-05 4.077277e-01 1.557559e-03 - [177] 9.549080e-01 8.284044e-01 9.991084e-01 9.999994e-01 9.982179e-01 9.979936e-01 9.943402e-01 9.999906e-01 9.999753e-01 9.999791e-01 6.708378e-02 - [188] 1.387462e-01 9.268128e-02 5.480979e-03 8.819485e-06 6.241298e-05 9.985805e-01 5.327541e-05 1.869285e-03 1.922552e-02 4.256348e-02 3.027296e-05 - [199] 1.442460e-01 3.687773e-03 9.347791e-04 1.958437e-03 1.605820e-05 3.320879e-05 1.169980e-02 8.575042e-03 9.358483e-05 5.983486e-05 9.205136e-06 - [210] 1.742214e-03 2.188150e-04 1.222003e-05 1.450880e-03 1.393163e-03 8.980047e-03 6.543110e-04 3.259402e-02 3.231358e-02 2.915627e-01 2.507362e-02 - [221] 1.950507e-03 2.214605e-04 6.364075e-01 7.279223e-01 9.219936e-01 3.444110e-02 6.773157e-01 7.260277e-01 1.962707e-03 7.771591e-01 7.307607e-01 - [232] 2.284205e-01 7.280673e-03 1.037477e-02 7.999753e-02 4.591731e-01 3.236000e-01 9.999941e-01 9.981558e-01 9.999979e-01 9.798928e-01 9.999935e-01 - [243] 9.997956e-01 9.992326e-01 9.999993e-01 9.999074e-01 9.999989e-01 9.899242e-01 9.999199e-01 9.999161e-01 9.999989e-01 9.999935e-01 9.999940e-01 - [254] 9.999999e-01 9.999984e-01 9.999999e-01 9.999999e-01 9.530463e-01 5.455645e-01 9.983899e-01 3.998560e-01 1.000000e+00 9.999879e-01 9.999986e-01 - [265] 9.999994e-01 9.999397e-01 1.000000e+00 9.999603e-01 4.332884e-01 9.997728e-01 9.999576e-01 9.999933e-01 7.926057e-01 9.999949e-01 1.000000e+00 - [276] 1.000000e+00 1.000000e+00 1.000000e+00 
9.999998e-01 9.999804e-01 9.858326e-01 1.000000e+00 1.000000e+00 2.256963e-02 1.000000e+00 9.999998e-01 - [287] 1.000000e+00 1.000000e+00 9.999997e-01 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 - [298] 9.999997e-01 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 - - $naivebayes$split$train$labels - No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No Yes Yes No No No No No - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 - No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - No No No No No No No No No No No No No No No No No No Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes No No No No No - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 - No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes No No Yes Yes Yes Yes - 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 - Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes No Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes - 1 1 1 1 1 1 1 1 1 
1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 - - $naivebayes$split$train$metrics - $naivebayes$split$train$metrics$fpr - [1] 1.00000000 1.00000000 0.99543379 0.99086758 0.98630137 0.98173516 0.97716895 0.96803653 0.95890411 0.95433790 0.94977169 0.94520548 0.93607306 - [14] 0.93150685 0.92694064 0.92237443 0.91780822 0.91324201 0.90867580 0.90410959 0.89954338 0.89497717 0.89041096 0.88584475 0.88127854 0.87671233 - [27] 0.87214612 0.86757991 0.86301370 0.85844749 0.85388128 0.84931507 0.84474886 0.84018265 0.83561644 0.83105023 0.82648402 0.82191781 0.81735160 - [40] 0.81278539 0.80821918 0.80365297 0.79908676 0.79452055 0.78538813 0.77625571 0.77168950 0.76712329 0.76255708 0.75799087 0.75342466 0.74885845 - [53] 0.74429224 0.73972603 0.73515982 0.72146119 0.71232877 0.70776256 0.70319635 0.69406393 0.68949772 0.68493151 0.68036530 0.67579909 0.67123288 - [66] 0.66666667 0.66210046 0.65753425 0.64840183 0.64383562 0.63926941 0.63470320 0.63013699 0.62557078 0.62100457 0.61643836 0.61187215 0.60730594 - [79] 0.60273973 0.59817352 0.59360731 0.58904110 0.58447489 0.57990868 0.57534247 0.57077626 0.56621005 0.56164384 0.55707763 0.55251142 0.54794521 - [92] 0.54337900 0.53881279 0.53424658 0.52054795 0.51598174 0.51141553 0.50228311 0.49771689 0.49315068 0.48858447 0.48401826 0.47945205 0.47488584 - [105] 0.47031963 0.46575342 0.46118721 0.45662100 0.45205479 0.44748858 0.44292237 0.43835616 0.43378995 0.42922374 0.42465753 0.42009132 0.41552511 - [118] 0.41095890 0.40639269 0.40182648 0.39726027 0.39269406 0.38812785 0.38356164 0.37899543 0.37442922 0.36986301 0.36529680 0.36073059 0.35616438 - [131] 0.35159817 0.34703196 0.34246575 0.33789954 0.33333333 0.32876712 0.32420091 0.31963470 0.31506849 0.31050228 0.30593607 0.30136986 0.29680365 - [144] 0.29223744 0.28767123 0.28310502 0.27853881 0.27397260 0.27397260 0.26940639 0.26484018 0.26027397 0.25570776 0.25114155 0.24657534 0.24200913 - [157] 0.23744292 0.23287671 0.22831050 0.22374429 0.21917808 0.21461187 
0.21004566 0.20547945 0.20091324 0.19634703 0.19178082 0.19178082 0.18721461 - [170] 0.18264840 0.17808219 0.17351598 0.16894977 0.16438356 0.15981735 0.15525114 0.15525114 0.15068493 0.14611872 0.14155251 0.13698630 0.13242009 - [183] 0.12785388 0.12328767 0.11872146 0.11415525 0.10958904 0.10502283 0.10045662 0.09589041 0.09132420 0.08675799 0.08675799 0.08219178 0.08219178 - [196] 0.08219178 0.08219178 0.07762557 0.07305936 0.06849315 0.06392694 0.06392694 0.06392694 0.05936073 0.05936073 0.05479452 0.05479452 0.05022831 - [209] 0.05022831 0.04566210 0.04109589 0.03652968 0.03196347 0.02739726 0.02283105 0.01826484 0.01369863 0.01369863 0.01369863 0.00913242 0.00913242 - [222] 0.00913242 0.00913242 0.00913242 0.00913242 0.00913242 0.00913242 0.00913242 0.00913242 0.00913242 0.00913242 0.00456621 0.00456621 0.00456621 - [235] 0.00456621 0.00456621 0.00456621 0.00456621 0.00456621 0.00456621 0.00456621 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 - [248] 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 - [261] 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 - [274] 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 - [287] 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 - - $naivebayes$split$train$metrics$tpr - [1] 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 - [14] 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 - [27] 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 
1.00000000 - [40] 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 - [53] 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 - [66] 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 - [79] 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 - [92] 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 - [105] 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 - [118] 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 - [131] 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 - [144] 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 0.98837209 0.98837209 0.98837209 0.98837209 0.98837209 0.98837209 0.98837209 0.98837209 - [157] 0.98837209 0.98837209 0.98837209 0.98837209 0.98837209 0.98837209 0.98837209 0.98837209 0.98837209 0.98837209 0.98837209 0.97674419 0.97674419 - [170] 0.97674419 0.97674419 0.97674419 0.97674419 0.97674419 0.97674419 0.97674419 0.96511628 0.96511628 0.96511628 0.96511628 0.96511628 0.96511628 - [183] 0.96511628 0.96511628 0.96511628 0.96511628 0.96511628 0.96511628 0.96511628 0.96511628 0.96511628 0.96511628 0.95348837 0.95348837 0.94186047 - [196] 0.93023256 0.91860465 0.91860465 0.91860465 0.91860465 0.91860465 0.90697674 0.89534884 0.89534884 0.88372093 0.88372093 0.87209302 0.87209302 - [209] 0.86046512 0.86046512 
0.86046512 0.86046512 0.86046512 0.86046512 0.86046512 0.86046512 0.86046512 0.84883721 0.83720930 0.83720930 0.82558140 - [222] 0.81395349 0.80232558 0.79069767 0.77906977 0.76744186 0.75581395 0.74418605 0.73255814 0.72093023 0.70930233 0.70930233 0.69767442 0.68604651 - [235] 0.67441860 0.66279070 0.65116279 0.63953488 0.62790698 0.61627907 0.60465116 0.60465116 0.59302326 0.58139535 0.56976744 0.55813953 0.54651163 - [248] 0.53488372 0.52325581 0.51162791 0.50000000 0.48837209 0.47674419 0.46511628 0.45348837 0.44186047 0.43023256 0.41860465 0.40697674 0.39534884 - [261] 0.38372093 0.37209302 0.36046512 0.34883721 0.33720930 0.32558140 0.31395349 0.30232558 0.29069767 0.27906977 0.26744186 0.25581395 0.24418605 - [274] 0.23255814 0.22093023 0.20930233 0.19767442 0.18604651 0.17441860 0.16279070 0.15116279 0.13953488 0.12790698 0.11627907 0.10465116 0.09302326 - [287] 0.08139535 0.06976744 0.05813953 0.04651163 0.03488372 0.02325581 0.01162791 - - - $naivebayes$split$train$auc - [1] 0.984443 - - $naivebayes$split$train$youdens_indx - [1] 0.06708378 - - - $naivebayes$split$test - $naivebayes$split$test$thresholds - [1] 0.000000e+00 3.419046e-12 8.037675e-11 1.446776e-10 2.736345e-10 4.103528e-10 6.237191e-10 7.855963e-10 1.320994e-09 6.122650e-09 8.910814e-09 - [12] 1.885894e-08 1.141641e-06 1.171399e-06 1.173949e-06 1.209843e-06 1.233172e-06 1.255177e-06 1.285671e-06 1.367424e-06 1.498882e-06 1.568718e-06 - [23] 1.938634e-06 3.193598e-06 3.221965e-06 3.249283e-06 4.025117e-06 6.763704e-06 6.858539e-06 9.334476e-06 1.007429e-05 1.540776e-05 1.861228e-05 - [34] 2.127390e-05 2.297209e-05 3.914902e-05 5.179590e-05 7.984316e-05 8.398688e-05 8.937363e-05 3.008463e-04 3.473288e-04 4.111522e-04 7.764788e-04 - [45] 9.035761e-04 1.294476e-03 4.548041e-03 6.388913e-03 1.560677e-02 4.588421e-02 1.236871e-01 1.463831e-01 5.246222e-01 6.944354e-01 7.739301e-01 - [56] 7.851533e-01 8.557994e-01 8.570802e-01 8.702376e-01 9.336602e-01 9.508416e-01 9.655271e-01 9.911272e-01 
9.987593e-01 9.989888e-01 9.995555e-01 - [67] 9.998730e-01 9.999771e-01 9.999957e-01 9.999971e-01 9.999985e-01 9.999997e-01 9.999999e-01 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 - [78] 1.000000e+00 - - $naivebayes$split$test$probs - [1] 1.861228e-05 5.179590e-05 3.249283e-06 1.173949e-06 4.025117e-06 7.855963e-10 8.557994e-01 8.570802e-01 1.209843e-06 2.736345e-10 1.233172e-06 - [12] 4.548041e-03 4.111522e-04 9.987593e-01 1.255177e-06 2.297209e-05 1.171399e-06 3.008463e-04 3.193598e-06 8.037675e-11 1.320994e-09 8.910814e-09 - [23] 7.764788e-04 6.122650e-09 1.294476e-03 7.739301e-01 1.446776e-10 1.540776e-05 1.007429e-05 3.221965e-06 6.858539e-06 1.367424e-06 6.944354e-01 - [34] 9.911272e-01 9.334476e-06 3.419046e-12 8.937363e-05 9.998730e-01 8.398688e-05 6.388913e-03 2.127390e-05 1.885894e-08 1.938634e-06 7.984316e-05 - [45] 1.463831e-01 6.237191e-10 1.498882e-06 1.568718e-06 1.285671e-06 4.588421e-02 3.473288e-04 1.141641e-06 4.103528e-10 6.763704e-06 3.914902e-05 - [56] 1.000000e+00 9.999971e-01 8.702376e-01 9.336602e-01 9.999771e-01 9.508416e-01 9.655271e-01 1.236871e-01 1.000000e+00 9.999957e-01 7.851533e-01 - [67] 9.999997e-01 9.995555e-01 9.999985e-01 9.999999e-01 1.000000e+00 9.989888e-01 1.000000e+00 5.246222e-01 1.000000e+00 1.560677e-02 9.035761e-04 - [78] 1.000000e+00 - - $naivebayes$split$test$labels - No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - No No No No No No No No No No No No No No No No Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes No Yes - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 - - $naivebayes$split$test$metrics - $naivebayes$split$test$metrics$fpr - [1] 1.00000000 1.00000000 0.98214286 0.96428571 0.94642857 0.92857143 0.91071429 0.89285714 0.87500000 0.85714286 0.83928571 0.82142857 0.80357143 0.78571429 - 
[15] 0.76785714 0.75000000 0.73214286 0.71428571 0.69642857 0.67857143 0.66071429 0.64285714 0.62500000 0.60714286 0.58928571 0.57142857 0.55357143 0.53571429 - [29] 0.51785714 0.50000000 0.48214286 0.46428571 0.44642857 0.42857143 0.41071429 0.39285714 0.37500000 0.35714286 0.33928571 0.32142857 0.30357143 0.28571429 - [43] 0.26785714 0.25000000 0.23214286 0.21428571 0.19642857 0.17857143 0.16071429 0.16071429 0.14285714 0.14285714 0.12500000 0.12500000 0.10714286 0.08928571 - [57] 0.08928571 0.07142857 0.05357143 0.05357143 0.05357143 0.05357143 0.05357143 0.03571429 0.01785714 0.01785714 0.01785714 0.00000000 0.00000000 0.00000000 - [71] 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 - - $naivebayes$split$test$metrics$tpr - [1] 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 - [15] 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 - [29] 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 - [43] 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 0.95454545 0.95454545 0.90909091 0.90909091 0.86363636 0.86363636 0.86363636 - [57] 0.81818182 0.81818182 0.81818182 0.77272727 0.72727273 0.68181818 0.63636364 0.63636364 0.63636364 0.59090909 0.54545455 0.54545455 0.50000000 0.45454545 - [71] 0.40909091 0.36363636 0.31818182 0.27272727 0.22727273 0.18181818 0.13636364 0.09090909 - - - $naivebayes$split$test$auc - [1] 0.9691558 - - $naivebayes$split$test$youdens_indx - [1] 0.01560677 - - - - $naivebayes$cv - $naivebayes$cv$fold1 - $naivebayes$cv$fold1$thresholds - [1] 0.000000e+00 6.926573e-10 1.058739e-09 1.097283e-09 1.239473e-09 1.446012e-09 1.450237e-09 5.841954e-09 3.720668e-07 8.195183e-07 
9.145964e-07 - [12] 9.146899e-07 1.109063e-06 1.379724e-06 1.380430e-06 1.487064e-06 2.110256e-06 2.178912e-06 2.403434e-06 2.473826e-06 2.708080e-06 3.125280e-06 - [23] 3.126877e-06 3.773267e-06 4.100084e-06 1.074193e-05 1.506352e-05 1.591691e-05 2.134440e-05 3.224282e-05 5.942329e-05 7.628979e-05 8.247039e-05 - [34] 8.341840e-05 8.380451e-05 1.154517e-04 1.224300e-04 1.414311e-04 1.703968e-04 1.712512e-04 2.166325e-04 2.249853e-04 5.620210e-04 9.429119e-04 - [45] 2.056291e-03 3.031717e-03 2.091306e-02 2.998587e-02 3.129405e-02 3.199544e-02 5.565634e-02 9.688890e-02 1.451437e-01 3.304639e-01 5.749348e-01 - [56] 6.052083e-01 8.630677e-01 8.739984e-01 8.841471e-01 9.470591e-01 9.921251e-01 9.923163e-01 9.932885e-01 9.986615e-01 9.990137e-01 9.999360e-01 - [67] 9.999626e-01 9.999707e-01 9.999870e-01 9.999879e-01 9.999895e-01 9.999982e-01 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 - [78] 1.000000e+00 - - $naivebayes$cv$fold1$probs - [1] 9.932885e-01 1.506352e-05 1.591691e-05 1.446012e-09 5.620210e-04 1.380430e-06 5.565634e-02 3.031717e-03 5.749348e-01 1.379724e-06 3.720668e-07 - [12] 8.341840e-05 3.304639e-01 9.688890e-02 6.052083e-01 3.773267e-06 1.058739e-09 1.450237e-09 9.470591e-01 3.129405e-02 2.249853e-04 1.154517e-04 - [23] 1.487064e-06 4.100084e-06 1.239473e-09 7.628979e-05 2.178912e-06 8.195183e-07 3.125280e-06 6.926573e-10 2.056291e-03 2.134440e-05 8.739984e-01 - [34] 2.166325e-04 1.414311e-04 9.146899e-07 1.097283e-09 1.109063e-06 3.224282e-05 2.110256e-06 3.126877e-06 5.942329e-05 2.091306e-02 1.703968e-04 - [45] 2.473826e-06 8.247039e-05 5.841954e-09 1.712512e-04 2.708080e-06 9.145964e-07 9.429119e-04 1.224300e-04 8.380451e-05 1.074193e-05 2.403434e-06 - [56] 3.199544e-02 1.451437e-01 9.986615e-01 2.998587e-02 9.921251e-01 9.999870e-01 9.999360e-01 1.000000e+00 8.630677e-01 1.000000e+00 9.923163e-01 - [67] 1.000000e+00 8.841471e-01 9.999895e-01 1.000000e+00 1.000000e+00 9.999879e-01 9.999707e-01 1.000000e+00 9.999982e-01 9.990137e-01 
9.999626e-01 - - $naivebayes$cv$fold1$labels - Yes No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No - 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - No No No No No No No No No No No No No No No No No Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 - - $naivebayes$cv$fold1$metrics - $naivebayes$cv$fold1$metrics$fpr - [1] 1.00000000 1.00000000 0.98181818 0.96363636 0.94545455 0.92727273 0.90909091 0.89090909 0.87272727 0.85454545 0.83636364 0.81818182 0.80000000 0.78181818 - [15] 0.76363636 0.74545455 0.72727273 0.70909091 0.69090909 0.67272727 0.65454545 0.63636364 0.61818182 0.60000000 0.58181818 0.56363636 0.54545455 0.52727273 - [29] 0.50909091 0.49090909 0.47272727 0.45454545 0.43636364 0.41818182 0.40000000 0.38181818 0.36363636 0.34545455 0.32727273 0.30909091 0.29090909 0.27272727 - [43] 0.25454545 0.23636364 0.21818182 0.20000000 0.18181818 0.16363636 0.16363636 0.14545455 0.12727273 0.10909091 0.09090909 0.09090909 0.07272727 0.05454545 - [57] 0.03636364 0.03636364 0.01818182 0.01818182 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 - [71] 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 - - $naivebayes$cv$fold1$metrics$tpr - [1] 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 - [15] 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 - [29] 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 - [43] 1.00000000 1.00000000 1.00000000 1.00000000 
1.00000000 1.00000000 0.95454545 0.95454545 0.95454545 0.95454545 0.95454545 0.90909091 0.90909091 0.90909091 - [57] 0.90909091 0.86363636 0.86363636 0.81818182 0.81818182 0.77272727 0.72727273 0.68181818 0.63636364 0.59090909 0.54545455 0.50000000 0.45454545 0.40909091 - [71] 0.36363636 0.31818182 0.27272727 0.22727273 0.18181818 0.13636364 0.09090909 0.04545455 - - - $naivebayes$cv$fold1$auc - [1] 0.9876033 - - $naivebayes$cv$fold1$youdens_indx - [1] 0.8630677 - - - $naivebayes$cv$fold2 - $naivebayes$cv$fold2$thresholds - [1] 0.000000e+00 8.252106e-11 1.394845e-10 2.045964e-10 2.397596e-10 7.316426e-10 1.367383e-09 4.848305e-09 5.958924e-09 3.490987e-08 5.976694e-08 - [12] 3.769434e-07 1.134405e-06 1.220656e-06 1.228132e-06 1.258924e-06 1.338448e-06 1.371539e-06 1.413623e-06 1.466895e-06 2.317212e-06 2.401335e-06 - [23] 3.378434e-06 3.505122e-06 4.670208e-06 5.823203e-06 7.394891e-06 7.772923e-06 9.477956e-06 1.254589e-05 1.349301e-05 1.694008e-05 2.044235e-05 - [34] 2.168101e-05 2.691045e-05 3.172650e-05 6.236732e-05 9.466039e-05 9.970190e-05 1.860573e-04 4.088123e-04 4.319679e-04 4.706078e-04 5.042684e-04 - [45] 8.696632e-04 1.339262e-03 1.597441e-03 4.610006e-03 8.067712e-03 8.189751e-03 1.291712e-02 4.322432e-02 9.411275e-02 3.420353e-01 4.144298e-01 - [56] 4.492723e-01 5.166574e-01 7.168681e-01 9.352064e-01 9.782054e-01 9.993518e-01 9.994584e-01 9.994638e-01 9.999012e-01 9.999700e-01 9.999792e-01 - [67] 9.999988e-01 9.999990e-01 9.999996e-01 9.999997e-01 9.999998e-01 9.999998e-01 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 - - $naivebayes$cv$fold2$probs - [1] 5.166574e-01 3.378434e-06 1.367383e-09 4.322432e-02 3.505122e-06 2.044235e-05 1.254589e-05 4.848305e-09 1.220656e-06 3.769434e-07 8.252106e-11 - [12] 1.394845e-10 4.492723e-01 5.042684e-04 2.045964e-10 4.670208e-06 3.490987e-08 1.258924e-06 1.220656e-06 1.371539e-06 1.134405e-06 1.338448e-06 - [23] 9.477956e-06 5.976694e-08 2.317212e-06 1.339262e-03 9.970190e-05 8.696632e-04 8.067712e-03 
1.338448e-06 6.236732e-05 7.772923e-06 4.706078e-04 - [34] 7.394891e-06 3.172650e-05 4.610006e-03 1.291712e-02 9.994584e-01 2.397596e-10 2.401335e-06 5.823203e-06 1.860573e-04 9.411275e-02 1.228132e-06 - [45] 9.466039e-05 7.316426e-10 1.466895e-06 1.694008e-05 1.413623e-06 1.349301e-05 2.168101e-05 4.319679e-04 1.597441e-03 4.088123e-04 2.691045e-05 - [56] 5.958924e-09 9.999990e-01 9.999700e-01 4.144298e-01 9.782054e-01 8.189751e-03 9.993518e-01 9.999997e-01 9.999792e-01 9.999998e-01 1.000000e+00 - [67] 7.168681e-01 1.000000e+00 3.420353e-01 9.999012e-01 1.000000e+00 9.999996e-01 9.352064e-01 9.999998e-01 9.994638e-01 1.000000e+00 9.999988e-01 - - $naivebayes$cv$fold2$labels - Yes No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No - 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - No No No No No No No No No No No No No No No No No Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 - - $naivebayes$cv$fold2$metrics - $naivebayes$cv$fold2$metrics$fpr - [1] 1.00000000 1.00000000 0.98181818 0.96363636 0.94545455 0.92727273 0.90909091 0.89090909 0.87272727 0.85454545 0.83636364 0.81818182 0.80000000 0.78181818 - [15] 0.74545455 0.72727273 0.70909091 0.67272727 0.65454545 0.63636364 0.61818182 0.60000000 0.58181818 0.56363636 0.54545455 0.52727273 0.50909091 0.49090909 - [29] 0.47272727 0.45454545 0.43636364 0.41818182 0.40000000 0.38181818 0.36363636 0.34545455 0.32727273 0.30909091 0.29090909 0.27272727 0.25454545 0.23636364 - [43] 0.21818182 0.20000000 0.18181818 0.16363636 0.14545455 0.12727273 0.10909091 0.09090909 0.09090909 0.07272727 0.05454545 0.03636364 0.03636364 0.03636364 - [57] 0.01818182 0.01818182 0.01818182 0.01818182 0.01818182 0.01818182 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 - [71] 0.00000000 0.00000000 
0.00000000 0.00000000 0.00000000 0.00000000 - - $naivebayes$cv$fold2$metrics$tpr - [1] 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 - [15] 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 - [29] 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 - [43] 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 0.95454545 0.95454545 0.95454545 0.95454545 0.90909091 0.86363636 - [57] 0.86363636 0.81818182 0.77272727 0.72727273 0.68181818 0.63636364 0.63636364 0.59090909 0.54545455 0.50000000 0.45454545 0.40909091 0.36363636 0.31818182 - [71] 0.27272727 0.22727273 0.18181818 0.13636364 0.09090909 0.04545455 - - - $naivebayes$cv$fold2$auc - [1] 0.9917355 - - $naivebayes$cv$fold2$youdens_indx - [1] 0.3420353 - - - $naivebayes$cv$fold3 - $naivebayes$cv$fold3$thresholds - [1] 0.000000e+00 4.166955e-11 1.268524e-09 1.387761e-09 2.041215e-09 2.230543e-09 4.596017e-09 5.548072e-09 1.170537e-08 2.000165e-08 3.034705e-08 - [12] 3.730429e-08 1.534516e-07 8.949720e-07 1.107594e-06 1.214897e-06 1.353577e-06 1.388951e-06 1.429185e-06 1.627416e-06 1.821953e-06 2.063055e-06 - [23] 2.152679e-06 2.250336e-06 2.825387e-06 2.864401e-06 3.685952e-06 4.149009e-06 5.500331e-06 6.206848e-06 6.813480e-06 1.226435e-05 1.602321e-05 - [34] 1.645971e-05 3.038054e-05 4.486434e-05 4.740408e-05 6.535659e-05 9.124965e-05 2.032303e-04 2.341545e-04 4.644807e-04 1.003675e-03 1.191548e-03 - [45] 1.377197e-03 2.295484e-03 2.371533e-03 4.086496e-03 7.058532e-03 1.207092e-02 5.945217e-02 1.369425e-01 1.768495e-01 1.936103e-01 1.944124e-01 - [56] 2.737499e-01 6.055941e-01 8.747338e-01 8.950181e-01 9.178667e-01 9.521727e-01 9.977758e-01 9.998251e-01 9.998717e-01 
9.999022e-01 9.999642e-01 - [67] 9.999747e-01 9.999975e-01 9.999990e-01 9.999996e-01 9.999999e-01 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 - [78] 1.000000e+00 - - $naivebayes$cv$fold3$probs - [1] 1.000000e+00 1.821953e-06 1.170537e-08 4.149009e-06 1.429185e-06 2.737499e-01 3.038054e-05 5.500331e-06 1.268524e-09 1.107594e-06 2.152679e-06 - [12] 1.388951e-06 4.596017e-09 4.086496e-03 1.645971e-05 2.825387e-06 1.353577e-06 7.058532e-03 2.864401e-06 3.685952e-06 8.949720e-07 4.740408e-05 - [23] 2.041215e-09 1.369425e-01 5.945217e-02 1.627416e-06 5.548072e-09 1.377197e-03 2.000165e-08 2.230543e-09 8.950181e-01 2.063055e-06 2.250336e-06 - [34] 4.486434e-05 1.387761e-09 2.371533e-03 1.214897e-06 1.944124e-01 3.730429e-08 8.747338e-01 1.003675e-03 1.207092e-02 1.534516e-07 6.055941e-01 - [45] 1.191548e-03 9.124965e-05 1.936103e-01 6.813480e-06 1.602321e-05 3.034705e-08 2.295484e-03 4.166955e-11 6.206848e-06 2.341545e-04 6.535659e-05 - [56] 1.226435e-05 9.178667e-01 9.999747e-01 1.000000e+00 4.644807e-04 9.999022e-01 2.032303e-04 9.998251e-01 9.998717e-01 9.999996e-01 1.000000e+00 - [67] 9.999642e-01 1.000000e+00 1.000000e+00 9.977758e-01 9.521727e-01 1.000000e+00 9.999975e-01 1.768495e-01 1.000000e+00 9.999999e-01 9.999990e-01 - - $naivebayes$cv$fold3$labels - Yes No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No - 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - No No No No No No No No No No No No No No No No No Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 - - $naivebayes$cv$fold3$metrics - $naivebayes$cv$fold3$metrics$fpr - [1] 1.00000000 1.00000000 0.98181818 0.96363636 0.94545455 0.92727273 0.90909091 0.89090909 0.87272727 0.85454545 0.83636364 0.81818182 0.80000000 0.78181818 - [15] 0.76363636 0.74545455 0.72727273 
0.70909091 0.69090909 0.67272727 0.65454545 0.63636364 0.61818182 0.60000000 0.58181818 0.56363636 0.54545455 0.52727273 - [29] 0.50909091 0.49090909 0.47272727 0.45454545 0.43636364 0.41818182 0.40000000 0.38181818 0.36363636 0.34545455 0.32727273 0.30909091 0.30909091 0.29090909 - [43] 0.29090909 0.27272727 0.25454545 0.23636364 0.21818182 0.20000000 0.18181818 0.16363636 0.14545455 0.12727273 0.10909091 0.10909091 0.09090909 0.07272727 - [57] 0.05454545 0.03636364 0.01818182 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 - [71] 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 - - $naivebayes$cv$fold3$metrics$tpr - [1] 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 - [15] 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 - [29] 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 0.95454545 0.95454545 - [43] 0.90909091 0.90909091 0.90909091 0.90909091 0.90909091 0.90909091 0.90909091 0.90909091 0.90909091 0.90909091 0.90909091 0.86363636 0.86363636 0.86363636 - [57] 0.86363636 0.86363636 0.86363636 0.86363636 0.81818182 0.77272727 0.72727273 0.68181818 0.63636364 0.59090909 0.54545455 0.50000000 0.45454545 0.40909091 - [71] 0.36363636 0.31818182 0.27272727 0.22727273 0.18181818 0.13636364 0.09090909 0.04545455 - - - $naivebayes$cv$fold3$auc - [1] 0.9690083 - - $naivebayes$cv$fold3$youdens_indx - [1] 0.9178667 - - - $naivebayes$cv$fold4 - $naivebayes$cv$fold4$thresholds - [1] 0.000000e+00 2.750609e-11 5.826425e-11 1.047954e-09 1.586441e-09 3.670578e-09 3.840366e-09 9.993278e-09 2.340079e-08 8.764292e-08 3.923538e-07 - [12] 7.444014e-07 7.510190e-07 7.524146e-07 
7.700181e-07 8.327453e-07 9.910310e-07 1.189423e-06 1.199997e-06 1.226250e-06 1.461871e-06 1.573581e-06 - [23] 1.812051e-06 2.468316e-06 2.470309e-06 4.245418e-06 4.398834e-06 7.873324e-06 8.295479e-06 1.021129e-05 1.139041e-05 1.415351e-05 1.422147e-05 - [34] 1.523270e-05 1.855146e-05 1.947724e-05 4.247748e-05 4.484758e-05 4.803528e-05 7.233260e-05 1.554400e-04 3.769016e-04 4.842950e-04 6.300584e-04 - [45] 9.585976e-04 2.631964e-03 6.537015e-03 1.606167e-02 2.765587e-02 7.096413e-02 1.050832e-01 1.194691e-01 5.348245e-01 6.415981e-01 6.595952e-01 - [56] 7.202827e-01 7.460445e-01 7.618050e-01 8.120549e-01 8.197187e-01 9.525616e-01 9.973930e-01 9.974010e-01 9.999581e-01 9.999870e-01 9.999990e-01 - [67] 9.999999e-01 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 - - $naivebayes$cv$fold4$probs - [1] 4.484758e-05 6.537015e-03 5.826425e-11 7.700181e-07 1.523270e-05 1.199997e-06 2.750609e-11 1.573581e-06 8.197187e-01 1.415351e-05 4.247748e-05 - [12] 3.840366e-09 4.842950e-04 1.606167e-02 1.855146e-05 1.947724e-05 2.340079e-08 7.873324e-06 3.769016e-04 8.120549e-01 7.510190e-07 1.586441e-09 - [23] 9.910310e-07 4.803528e-05 1.047954e-09 1.139041e-05 7.444014e-07 8.327453e-07 9.999870e-01 4.398834e-06 2.470309e-06 3.923538e-07 1.021129e-05 - [34] 6.300584e-04 1.554400e-04 7.202827e-01 3.670578e-09 1.226250e-06 2.468316e-06 9.993278e-09 8.295479e-06 1.812051e-06 4.245418e-06 2.631964e-03 - [45] 7.618050e-01 7.524146e-07 1.422147e-05 9.585976e-04 6.415981e-01 1.461871e-06 1.189423e-06 7.096413e-02 8.764292e-08 7.460445e-01 7.233260e-05 - [56] 9.999990e-01 1.000000e+00 1.000000e+00 6.595952e-01 2.765587e-02 1.000000e+00 9.974010e-01 1.000000e+00 1.000000e+00 9.973930e-01 9.999999e-01 - [67] 1.000000e+00 1.000000e+00 1.000000e+00 5.348245e-01 1.000000e+00 9.525616e-01 9.999581e-01 1.050832e-01 1.000000e+00 1.194691e-01 - - $naivebayes$cv$fold4$labels - No No No No No No No No No No No No No No No 
No No No No No No No No No No No No No No No No No No No No No No No No - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - No No No No No No No No No No No No No No No No Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 - - $naivebayes$cv$fold4$metrics - $naivebayes$cv$fold4$metrics$fpr - [1] 1.00000000 1.00000000 0.98181818 0.96363636 0.94545455 0.92727273 0.90909091 0.89090909 0.87272727 0.85454545 0.83636364 0.81818182 0.80000000 0.78181818 - [15] 0.76363636 0.74545455 0.72727273 0.70909091 0.69090909 0.67272727 0.65454545 0.63636364 0.61818182 0.60000000 0.58181818 0.56363636 0.54545455 0.52727273 - [29] 0.50909091 0.49090909 0.47272727 0.45454545 0.43636364 0.41818182 0.40000000 0.38181818 0.36363636 0.34545455 0.32727273 0.30909091 0.29090909 0.27272727 - [43] 0.25454545 0.23636364 0.21818182 0.20000000 0.18181818 0.16363636 0.14545455 0.14545455 0.12727273 0.12727273 0.12727273 0.12727273 0.10909091 0.10909091 - [57] 0.09090909 0.07272727 0.05454545 0.03636364 0.01818182 0.01818182 0.01818182 0.01818182 0.01818182 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 - [71] 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 - - $naivebayes$cv$fold4$metrics$tpr - [1] 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 - [15] 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 - [29] 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 - [43] 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 0.95238095 0.95238095 0.90476190 0.85714286 0.80952381 0.80952381 0.76190476 - 
[57] 0.76190476 0.76190476 0.76190476 0.76190476 0.76190476 0.71428571 0.66666667 0.61904762 0.57142857 0.57142857 0.52380952 0.47619048 0.42857143 0.38095238 - [71] 0.33333333 0.28571429 0.23809524 0.19047619 0.14285714 0.09523810 0.04761905 - - - $naivebayes$cv$fold4$auc - [1] 0.9701299 - - $naivebayes$cv$fold4$youdens_indx - [1] 0.02765587 - - - $naivebayes$cv$fold5 - $naivebayes$cv$fold5$thresholds - [1] 0.000000e+00 1.243220e-10 1.505994e-10 1.770026e-10 2.778590e-10 3.390278e-10 3.435670e-10 3.699224e-10 4.367596e-10 9.970310e-10 1.035714e-08 - [12] 6.799967e-08 1.115610e-07 1.190574e-07 1.428305e-07 1.502362e-07 1.539886e-07 1.544179e-07 1.555239e-07 1.626120e-07 1.642041e-07 1.675243e-07 - [23] 1.682050e-07 2.602299e-07 2.672825e-07 3.393142e-07 3.412692e-07 3.503210e-07 3.506751e-07 4.240769e-07 4.581072e-07 5.379954e-07 8.287792e-07 - [34] 9.181698e-07 1.044712e-06 1.279986e-06 1.644985e-06 1.795628e-06 8.046071e-06 1.310011e-05 1.575581e-05 4.086641e-05 7.917471e-05 3.765162e-04 - [45] 5.668416e-04 6.099128e-04 4.584922e-03 9.840668e-03 9.889498e-03 1.374765e-02 1.442990e-02 4.569236e-02 8.051530e-02 1.883479e-01 5.990581e-01 - [56] 8.951428e-01 9.203794e-01 9.885687e-01 9.972530e-01 9.985056e-01 9.987838e-01 9.992696e-01 9.996667e-01 9.999425e-01 9.999562e-01 9.999678e-01 - [67] 9.999835e-01 9.999869e-01 9.999942e-01 9.999946e-01 9.999967e-01 9.999971e-01 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 - - $naivebayes$cv$fold5$probs - [1] 9.840668e-03 4.086641e-05 5.990581e-01 1.626120e-07 4.240769e-07 1.555239e-07 1.682050e-07 5.379954e-07 3.412692e-07 1.310011e-05 1.644985e-06 - [12] 5.668416e-04 3.412692e-07 3.393142e-07 1.279986e-06 2.672825e-07 1.770026e-10 1.502362e-07 2.602299e-07 1.374765e-02 1.539886e-07 3.503210e-07 - [23] 9.181698e-07 9.970310e-10 1.428305e-07 1.642041e-07 1.675243e-07 9.985056e-01 1.044712e-06 2.778590e-10 1.035714e-08 1.243220e-10 3.435670e-10 - [34] 3.699224e-10 7.917471e-05 4.569236e-02 3.765162e-04 1.442990e-02 
3.506751e-07 3.390278e-10 1.544179e-07 1.795628e-06 1.505994e-10 9.885687e-01 - [45] 4.584922e-03 1.115610e-07 4.581072e-07 8.046071e-06 8.287792e-07 1.575581e-05 1.190574e-07 4.367596e-10 9.889498e-03 6.799967e-08 6.099128e-04 - [56] 1.000000e+00 9.999946e-01 1.000000e+00 9.987838e-01 9.999869e-01 1.000000e+00 9.203794e-01 9.999562e-01 9.972530e-01 9.999678e-01 9.999971e-01 - [67] 9.996667e-01 8.951428e-01 9.992696e-01 1.883479e-01 9.999835e-01 9.999425e-01 8.051530e-02 9.999942e-01 1.000000e+00 9.999967e-01 - - $naivebayes$cv$fold5$labels - No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No No - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - No No No No No No No No No No No No No No No No Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 - - $naivebayes$cv$fold5$metrics - $naivebayes$cv$fold5$metrics$fpr - [1] 1.00000000 1.00000000 0.98181818 0.96363636 0.94545455 0.92727273 0.90909091 0.89090909 0.87272727 0.85454545 0.83636364 0.81818182 0.80000000 0.78181818 - [15] 0.76363636 0.74545455 0.72727273 0.70909091 0.69090909 0.67272727 0.65454545 0.63636364 0.61818182 0.60000000 0.58181818 0.56363636 0.54545455 0.50909091 - [29] 0.49090909 0.47272727 0.45454545 0.43636364 0.41818182 0.40000000 0.38181818 0.36363636 0.34545455 0.32727273 0.30909091 0.29090909 0.27272727 0.25454545 - [43] 0.23636364 0.21818182 0.20000000 0.18181818 0.16363636 0.14545455 0.12727273 0.10909091 0.09090909 0.07272727 0.05454545 0.05454545 0.05454545 0.03636364 - [57] 0.03636364 0.03636364 0.01818182 0.01818182 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 - [71] 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 - - $naivebayes$cv$fold5$metrics$tpr - [1] 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 
1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 - [15] 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 - [29] 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 - [43] 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 0.95238095 0.90476190 0.90476190 - [57] 0.85714286 0.80952381 0.80952381 0.76190476 0.76190476 0.71428571 0.66666667 0.61904762 0.57142857 0.52380952 0.47619048 0.42857143 0.38095238 0.33333333 - [71] 0.28571429 0.23809524 0.19047619 0.14285714 0.09523810 0.04761905 - - - $naivebayes$cv$fold5$auc - [1] 0.9926407 - - $naivebayes$cv$fold5$youdens_indx - [1] 0.0805153 - - + +Output
-Optimal thresholds values can be used as input for `classCV` to assess the performance when using a specific threshold. +Optimal threshold values can be used as input for `class_cv` to assess the performance when using a specific threshold. ```R -avg_youdens_indx <- mean(sapply(roc_output$naivebayes$cv, function(x) x$youdens_indx)) +# Get average Youden's Index across folds +nb_results <- roc_output$get_model("naivebayes") +avg_youdens_indx <- mean(sapply(nb_results$cv, function(x) x$youdens_indx)) # Using 17, the column index of "Recurred" -results <- classCV( +results <- class_cv( data = thyroid_data, target = 17, models = "naivebayes", @@ -810,7 +424,7 @@ results <- classCV( save = list(models = TRUE) ) -print(results) +results$print() ``` @@ -875,7 +489,7 @@ for (i in 1:ncol(thyroid_data)) { thyroid_data[sample(1:nrow(thyroid_data), size = round(nrow(thyroid_data) * .01)), i] <- NA } -results <- classCV( +results <- class_cv( formula = Recurred ~ ., data = thyroid_data, models = "randomforest", @@ -891,7 +505,7 @@ results <- classCV( save = list(models = FALSE, data = FALSE) ) -print(results) +results$print() ``` @@ -973,255 +587,6 @@ Yes 0.95 ± 0.03 (SD) 0.92 ± 0.03 (SD) 0.94 ± 0.01 (SD)
-Displaying what is contained in the vswift object by converting its class to a list and using R's base `print` function. - -```R -class(results) <- "list" -print(results) -``` - -
- Output - - $configs - $configs$formula - Recurred ~ . - - $configs$n_features - [1] 16 - - $configs$models - [1] "randomforest" - - $configs$model_params - $configs$model_params$map_args - NULL - - $configs$model_params$threshold - NULL - - $configs$model_params$rule - NULL - - $configs$model_params$final_model - [1] FALSE - - $configs$model_params$verbose - NULL - - - $configs$train_params - $configs$train_params$split - [1] 0.8 - - $configs$train_params$n_folds - [1] 5 - - $configs$train_params$stratified - [1] TRUE - - $configs$train_params$random_seed - [1] 123 - - $configs$train_params$standardize - [1] TRUE - - $configs$train_params$remove_obs - [1] FALSE - - - $configs$impute_params - $configs$impute_params$method - [1] "impute_bag" - - $configs$impute_params$args - $configs$impute_params$args$trees - [1] 20 - - $configs$impute_params$args$seed_val - [1] 123 - - - - $configs$parallel_configs - $configs$parallel_configs$n_cores - NULL - - $configs$parallel_configs$future.seed - NULL - - - $configs$save - $configs$save$models - [1] FALSE - - $configs$save$data - [1] FALSE - - - - $missing_data_summary - $missing_data_summary$unlabeled_observations - [1] 8 - - $missing_data_summary$observations_missing_all_features - [1] 0 - - $missing_data_summary$incomplete_labeled_observations - [1] 110 - - $missing_data_summary$complete_observations - [1] 265 - - - $class_summary - $class_summary$classes - [1] "No" "Yes" - - $class_summary$proportions - target_vector - No Yes - 0.7173333 0.2826667 - - $class_summary$indices - $class_summary$indices$No - [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 - [35] 35 36 37 38 39 40 41 42 43 44 45 46 47 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 - [69] 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 - [103] 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 
132 133 134 135 136 137 138 139 140 141 142 - [137] 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 - [171] 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 - [205] 211 212 213 214 215 216 217 218 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 - [239] 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 334 335 348 - - $class_summary$indices$Yes - [1] 48 86 87 88 89 90 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 293 294 295 296 297 298 299 300 - [35] 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 336 - [69] 337 338 339 340 341 342 343 344 345 346 347 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 - [103] 372 373 374 375 - - - - $data_partitions - $data_partitions$indices - $data_partitions$indices$split - $data_partitions$indices$split$train - [1] 185 14 201 124 255 270 335 159 96 97 282 203 286 191 98 275 143 247 105 73 26 7 176 279 217 170 79 82 43 109 123 77 149 32 - [35] 260 115 274 272 175 75 23 161 194 54 141 252 254 172 243 34 70 276 264 64 147 216 103 283 288 38 21 213 41 181 287 61 16 122 - [69] 100 6 92 205 39 165 292 51 246 4 13 245 133 271 53 22 95 166 25 35 174 118 30 146 202 127 116 164 65 148 68 157 128 80 - [103] 91 171 142 52 256 112 104 169 195 17 46 55 212 207 187 24 119 113 251 108 155 5 71 209 158 262 215 56 76 84 284 162 49 78 - [137] 144 117 258 1 348 160 208 285 150 94 248 145 153 20 110 177 99 36 193 114 192 50 42 60 85 11 184 8 163 173 67 140 111 151 - [171] 244 44 178 183 45 289 131 93 33 40 211 10 277 83 9 259 261 59 62 190 263 242 129 291 280 218 58 29 130 250 154 120 189 121 - [205] 257 
139 334 278 66 239 126 253 74 125 214 86 315 375 225 236 374 333 351 373 303 329 344 372 308 327 338 304 330 349 302 301 369 314 - [239] 332 235 354 221 361 305 297 341 356 234 310 307 309 346 366 299 224 233 355 296 237 316 313 320 87 317 323 230 298 229 359 362 306 318 - [273] 321 238 363 232 319 312 295 364 358 220 360 222 326 322 223 367 231 347 370 340 336 342 311 228 89 88 343 170 280 - - $data_partitions$indices$split$test - [1] 197 135 69 249 19 152 2 138 206 101 167 265 267 132 27 210 107 268 168 179 240 241 180 18 137 198 12 196 273 266 3 15 37 269 - [35] 188 28 63 134 72 290 200 102 31 204 186 182 106 57 281 136 47 81 156 48 350 352 331 90 357 345 219 227 293 325 368 328 371 300 - [69] 324 365 226 339 294 353 199 337 - - - $data_partitions$indices$cv - $data_partitions$indices$cv$fold1 - [1] 293 52 185 14 201 124 255 270 335 159 96 97 282 203 286 191 98 275 143 247 105 73 26 7 176 279 217 170 79 82 43 109 123 77 - [35] 149 32 260 115 274 272 175 75 23 161 194 54 141 252 254 172 243 34 70 276 264 329 225 351 366 360 237 304 233 347 307 313 359 326 - [69] 228 90 340 355 364 305 297 350 - - $data_partitions$indices$cv$fold2 - [1] 116 148 59 40 4 15 84 158 192 61 25 114 200 29 41 210 140 36 177 199 152 138 198 76 179 81 189 153 102 110 207 169 60 94 - [35] 241 134 126 249 288 216 19 55 63 205 266 120 257 28 142 135 168 259 193 130 306 220 299 318 89 356 231 295 375 238 333 311 315 332 - [69] 222 324 336 361 48 316 302 - - $data_partitions$indices$cv$fold3 - [1] 122 157 21 147 91 30 174 156 83 182 33 65 100 139 13 208 250 69 144 183 181 128 53 334 213 50 62 12 258 11 9 103 106 291 - [35] 39 256 95 271 45 195 251 85 242 263 265 277 38 285 240 246 202 146 151 49 300 371 309 367 227 339 330 308 235 345 223 314 86 312 - [69] 341 352 294 319 354 358 301 - - $data_partitions$indices$cv$fold4 - [1] 206 10 145 58 107 209 8 253 71 248 37 160 93 133 46 80 218 74 163 5 180 6 57 16 131 211 113 42 108 281 284 171 150 118 - [35] 278 212 155 127 244 184 196 187 2 119 166 24 
56 165 173 3 287 164 269 22 296 373 226 368 310 362 363 338 365 327 219 346 369 88 - [69] 303 357 353 232 229 337 221 - - $data_partitions$indices$cv$fold5 - [1] 162 111 215 204 125 129 214 190 239 186 273 154 99 64 348 104 78 290 132 167 268 117 18 92 137 289 31 51 44 262 121 66 101 136 - [35] 178 112 47 72 280 35 283 1 68 17 267 188 245 261 292 20 67 197 27 325 349 321 342 236 234 224 87 323 331 328 344 372 370 343 - [69] 298 230 322 320 317 374 - - - - $data_partitions$proportions - $data_partitions$proportions$split - $data_partitions$proportions$split$train - - No Yes - 0.7209302 0.2790698 - - $data_partitions$proportions$split$test - - No Yes - 0.7105263 0.2894737 - - - $data_partitions$proportions$cv - $data_partitions$proportions$cv$fold1 - - No Yes - 0.7105263 0.2894737 - - $data_partitions$proportions$cv$fold2 - - No Yes - 0.72 0.28 - - $data_partitions$proportions$cv$fold3 - - No Yes - 0.72 0.28 - - $data_partitions$proportions$cv$fold4 - - No Yes - 0.72 0.28 - - $data_partitions$proportions$cv$fold5 - - No Yes - 0.7162162 0.2837838 - - - - - $metrics - $metrics$randomforest - $metrics$randomforest$split - Set Classification Accuracy Class: No Precision Class: No Recall Class: No F1 Class: Yes Precision Class: Yes Recall Class: Yes F1 - 1 Training 0.9966555 0.9953704 1.000000 0.9976798 1.0000000 0.9880952 0.9940120 - 2 Test 0.9605263 0.9811321 0.962963 0.9719626 0.9130435 0.9545455 0.9333333 - - $metrics$randomforest$cv - Fold Classification Accuracy Class: No Precision Class: No Recall Class: No F1 Class: Yes Precision Class: Yes Recall - 1 Fold 1 0.973684211 0.981481481 0.981481481 0.981481481 0.95454545 0.95454545 - 2 Fold 2 0.973333333 0.964285714 1.000000000 0.981818182 1.00000000 0.90476190 - 3 Fold 3 0.960000000 0.963636364 0.981481481 0.972477064 0.95000000 0.90476190 - 4 Fold 4 0.960000000 0.963636364 0.981481481 0.972477064 0.95000000 0.90476190 - 5 Fold 5 0.959459459 0.980769231 0.962264151 0.971428571 0.90909091 0.95238095 - 6 Mean CV: 
0.965295401 0.970761831 0.981341719 0.975936473 0.95272727 0.92424242 - 7 Standard Deviation CV: 0.007502020 0.009467624 0.013343010 0.005234449 0.03223750 0.02668577 - 8 Standard Error CV: 0.003355006 0.004234050 0.005967175 0.002340917 0.01441705 0.01193424 - Class: Yes F1 - 1 0.954545455 - 2 0.950000000 - 3 0.926829268 - 4 0.926829268 - 5 0.930232558 - 6 0.937687310 - 7 0.013483016 - 8 0.006029788 - - -
- ### Using Parallel Processing Parallel processing operates at the fold level, which means the system can simultaneously process multiple cross-validation folds (and the train-test split) even when training a single model. @@ -1271,7 +636,7 @@ print("Without Parallel Processing:") start <- proc.time() # Run the same model without parallel processing -results <- classCV( +results <- class_cv( data = ad_data, target = "ad.", models = c("knn", "svm", "decisiontree", "xgboost"), @@ -1298,7 +663,7 @@ options(future.globals.maxSize = 1200 * 1024^2) start_par <- proc.time() # Run model using parallel processing with 4 cores -results <- classCV( +results <- class_cv( data = ad_data, target = "ad.", models = c("knn", "svm", "decisiontree", "xgboost"), @@ -1351,7 +716,7 @@ In .create_dictionary(preprocessed_data[, vars$target]) : ```R # Print parameter information and model evaluation metrics; If number of features > 20, the target replaces the formula -print(results, models = c("xgboost", "knn")) +results$print(models = c("xgboost", "knn")) ```
@@ -1488,8 +853,7 @@ nonad. 0.97 ± 0.01 (SD) 0.95 ± 0.01 (SD) 0.96 ± 0.01 ( ```R # Plot results -plot( - results, +results$plot( models = "xgboost", class_names = "ad.", metrics = c("precision", "recall"), @@ -1511,5 +875,6 @@ plot( ## Acknowledgements The development of this package was inspired by other machine learning packages such as -topepo's [caret](https://github.com/topepo/caret) package and the -[scikit-learn](https://github.com/scikit-learn/scikit-learn) package. +topepo's [caret](https://github.com/topepo/caret) package, the +[scikit-learn](https://github.com/scikit-learn/scikit-learn) package, and the +[mlr3](https://github.com/mlr-org/mlr3) package. diff --git a/assets/ads/extreme_gradient_boosting_cv_precision_ad..png b/assets/ads/extreme_gradient_boosting_cv_precision_ad..png index 5c7c9110e1df78e6692b824e6a979296cc5a45b9..d47ae6e34ecc8b288ed79f85c5c8236a3f726677 100644 GIT binary patch delta 2945 zcmYLLc{o)2AFhjLtV5O&GO~+Mma&9rn5)d7ZZwuqvScqinfsf0r1HBeBg$B^&MnF` zwjy*$h#~tjjKNj58QT~;zfrg6cm6rg^F8PD`Ml?S-_QG;Psjf{PKF+~_p(J;{t=zU z%odEJRUNZ=so(j@$o6tuSamQ}@%@@MvoLVJ2V*->^yc#)|LpIZ@kEzvhg;p_<*zgU z?tdPmsvwPo9l}9*`F|G|az7L*NbouWBaHLIu$>M#c<;wEE@_LObbN(-8XAP~jOF~5 zDB>)Bp30_QX)b#RX|vc{orm1LH@2A`&k~Tmp@VfwWWw8+&Pwo zb*mFW78fVOPc5H~j?Mm%D{C2(I&&6`?xDZcytXxHMdfe`vy^K6ngEaVDe zwxNuQ_MkON=CxscHpW(^rf5KT}4GfsfNMXjchaB>i^ zNK)I6hu682Uv8KIh68rfT_SvHZM1eIhU5V5BsdFp5@(X3vq@4a8h13RkAYMo+`5|t z4N*7zj*o%SM5p#D#tJI=^bPFR><7^g0y(N7jjvQqXuDFtx0WVV6D(@^_3-k!(1`LH z)7Fz0E`6g^R?4OIIr2e;MiC_8eEyt3vR)^Rc9R|-BB(uXM}Q{T+Lu-1>*XL!Axhwu zEYPf=*?L({gfigTaBLC$M&4Uxp4n8WxV9I0Sp@d#!dwc#S4OGHzUn)byDGAF^BLM| z0jy2$*y!J5Z)_|3Lbf(;>c$2Wo#U34lU8&!>~AX#CA6+lTWC#B*)uaSl(k7_!2&$; zdlU^vpvHu*OnH^pm~}g*7q+U5OUaj9oD(TNDJwe6vjS%k`?W1Yc~*QPR+i0n4+E%Z ztk=jSsYT)=*om;ruY;B2P42Y`J!H;s2u-f|DhZYX0h0<>vj%XhvYRJakFCg%3Q4)H za#z>Eq-wVlL!c1>`q|o%VMAL%vvb_i8c65j(_#om!lwJ6>-Fy&S!&Pqr0pbfV< z-ZRXl%X`{htHemuhTANK5uHH0lP9{s!z9j7G1#>pFqre(mS>n~>1Vo-IIbC-I+lqf 
zOM|6INFWz8#HkaG#H!+&Ey+gzV*A;2>*xYY=4_Jlx`vz45P`9~`IHr!QQls<%>&@Q z$6#Q2<@voA5P8*kdZ_vabm?#u?$JK2>`=@9VTo#q;F<}K4+~?PvV|vI75_#1c=Yj5XWn2eJVGHmz-m(zFkYd10q6?NA z)0lP4d7VD-cq81cTm!aMoocM{tU32#CQASA(%Z&AiVi5s@r)bu?oT8HQSM#)Tg51c zW;yUG&x?2a9KNZr7<3WLIm~6B>j7!&1Haq?)6Z@{d0twO_(SnN@xYYFPL^Mn3UzUJ zU(gXP!;L%8>>dydhK=y0*a<0ZNWUCLfC`#3Rkx(Q9;z(&76q!>d zRGHX379wdgV~TO3@SEguj_A9|BaMa!*aPx=HOFN|^0^EOQ9f~fR;g1Hx5WCdBU|j! zI%RmK%>Z9|qmVUgrSwd2{LzymVjYQ#EnsMUauZhFMJ}iN1iH?ABY#5dQg&$`G}(hz7)AO{{&Ahy86r6`5r zUoUdB1wj4|PqKY_LzhdAD*ywBaPRYAy4_zcibJj@e);fT(7q+PEW8JV^y)X<5CqIW zx>X!dta%`g2G?i@j`SRWaEMhn%-y~_F7{U_3FL9Ei^6rGDA< z;8!jhz)caP{bwp=4SKi?g0}(M(D(ED>=g<0D|zj!hkq1L`0r35Bkxbr&C^k zmpP^|6q4VI-QOHr{HvY)xq{KJbCQXQ`}6MCI=0}P|lt^jwvj1lBe|T;ga}#B@J>%wbgLt1$#<4XCl&0A2}bzqc8w9BsVSiQ*+y+ZpZ_%uSV!#l*9UJDP*WE0^mZ zpo4=WBL6G^r|9f``}+YpMEO?gnix3cyExJqInhdW_B9CW2hmulSB?_fI?;L&)9|o< zBZU2%Y+;HPszz&(51H4LdAqO9cRjFn``StV$eBU<@Y%Kt zajtRQ({KmY-wa~JZ_E-;pTUSbk(#qFMh$=}h&*YYeY523h;#9S$Y`FakK98HFAWv{ zw$(YYVtq?`wUc#QI;~b?LjlJmN|xvegu?k?Y`{qy$7dvG-NL^bE>Y(#A&lhz_g7e0 aYSY5bAoccXr%v@l+{@M)jd)|_iTfXo4yMfj delta 2943 zcmYjSc{~&RAFrN?skvD>^N=GVM@&TsBe5T0uG~jNMvh$1YJ22Zh?Pkja!n~@j=7(S zB#h;p6?5NAuBV26)APK3uiy8d@Avz8y*}sr{rP-SrP`$`!AGsKEliB;?&DVpKqRB~ zDD;&!waB%%sQqh04Q58?@|XpExR{jr#`0F`^N;I4qmQ#bn_eygnlmTX_s5=;@CdXC z^P#v4M2eEF2}f8buo=7tNa`y#a-zF{mke7UQ+J16}U zS|su3-cpJ8D4`E?SGdr#=bQBY_t-*#^<=@r7%MxSCg+Fti^`42&wVv)z4~=I*8}aEvI|fB-Z&SXAlw7%5cE&F+^oQCfWYC%&tt24{ie9(+Pwa!h z;*SxHn=zN~i?XS3C(+ghD+xSNe&YQ2PyH3*U~r2xB8F0nA9L;FVwmZvlNEO1rj35O zaEaiB8?19G^ynwr(LXyoGxjY`>Jr|ZGp~g?_^;Ga{G)lW0|S8_S#R0y`~EHn!KRe2 zW}YtNLBTL2A|q2X>s)Q-uS^EuGzAz*vcUl{OkLxprgYRY)3GFET(+zFsLx)mhs*Gd zIT$||aghyA-!ywE;tHq$yV}vf>LhfQ@++@bS$f>c+jtrg9Q67_MnttB-u`V?FlERL zqv8GY$6}L6)8pLYb+ZWj@D}z(RBv9QeK=BV%7rxjkohApq7G|cQk3F*CqU2q74f8+ zUZ^O+KB`MjFQ z#or+)0hSW^nj?Y>)nQZ#;SnQ}m{UeAIf_LBlE-dwDU$YR!$XsDa2~#g<){&LwWzG% zwl=|h0A`>bq?Y@v1jHv2Lz_C|C<#Z^U2+0JjV6`;b7)CC)?N|Sk@Wd19$I;-oC_DC 
zTBavx#!^POp|JFb^#o&<1!5d|ydrM|mimBsCLbimO_{ncviCe;HW~NSBUC_@@85v_ zk)QbU!BtA*wF3PniZ}AhxbGmeJ5A1>CcXJd=0GUEeZ`QqeTE!8`p0nOc)_r+Y6hGy z;|_YC2BUNq?!vJJhUH^_Hmt-0*JgpEf{U#lWI+H1?SUNvnqwav<+>X#CYfF^nc1a` zE-quvc;%`ke%QQOf&*oiqgL?=-5SCs+$YXDN`iw~**Xx{1yhc0s4LJ9wP5)2&`zl9 zR>87A&J-aKZor>wMVehYI!`g0@{+$3OIJ{pJbJwnLSQhx(xS$ghA+YAu}^rNu=yZ4 zq=2fmKV{=~PEWxCEA@bf4|V7cgi$7?AGd^4mch8lM^1jXfX}G*)ygepp*iJ`>Ud5Z zERJ_M`;f{de{lTuV-D*^7V6ofJTwL#z9FZ&yw=*Dglc5^dhXB+Loa03kb;E8(vF;U z#8ZPWY;q@BX;I3S?TcGKW z4LLcd<^-Wb90L?a@v`R&OJ_|FHAoNB=I?E~?HJ+Vd)gLTI;q4;;|slwK5Pan00VA3F=ZdcA^&mI>JYOQ%$6_#L0{5t zJ;T*4LFFKT9NyfTib6rjW1PlGW?E?Q3(ZsX+j|3SgJn)qWc%$|pbiPnUC&u*X6MgB z93|~&%Ux)s9K;S^P0`yZE&rLo6!1`W=43p5odXUxFNbEFay$J33Qp-U5Qh*F&pp2i z2~s-57Q<<{(=jycyOY9LCQv_lb%5c#j#aVMdZ$RR8IHi8HLxA#d zssJF7waZUTGDGl$-$jpL-`!H>pC2+p@cfTD0NTIQA#7 z4-UbnJ>)Su5T6B8tSpxpXKtv8#2qW}{+3)6{#{(EE%06s-iQS>#BKn=$n#+e(xBc} zOb+f;y`C5e~=mwe#9Y6lUL05~7NMq&)D3X;bOVVmDZfT}asYD(=akHv+4h@=xdfuq4 zE@D0Ag2-I)ho8E97)qH^_t(Iz@#Rt9$R3H_Z`V!cb+Vn1@UiLnu$NLZ*Kyw+%@3ex zZyILjA)`CNcR3pqZ_vkWI7Nh8rE#JR3Y*ii?5!%G%(mGgG`n?YP^)Q;oj*)$4UTSd z*Znm++vZyx5DKSnm&C<}O*&LgSB{aJUCH;3-E>VtxTB;Jhd!)wR9T;P>IOQrC)^yE zk9`P7g0A9DkkanayTXPsb65TrBR9E{hmJFxztmE!`_QMm8n_<&Bo8Q39{YSnxdC^C z9VC8@uV2H~M)0I86INM~od({NGS^1Mg0)cSq)Sgt@64YI zPS&k9^^{x;lIRUJ@;0!yY}nR`l{ z(?|L-Cf02&Dxs}A$^Id|jTJ&c!i2bcr~!F*RGs_pGqQJK!Icx6OowDYc}uJYwUNsXhkt{R4*`ZZc=}m80xfez}H?Vpgk7 zcgLj<1y+^8=tTOJ`0uLek-c^_X&H|z8c|l9rMLQfeKwe6Jmw$&I64HThkx3r{C7Tp zk$RD&#f#A8lBuRRsG+zZE2g}Pq&MK^d@!gPfSm-YitWi#1rrgEy-I23Rxh$8=$W(I4 zWHq-~%TX=4?nmUdI>a#7nUMTe=XcKY*XQ-TUeD+AJn#4C^LonmENEA1Y&?xXI@o(e z=ZP7&3qEr8B)jFd5DpB-12ct7=vaewH!zLwT6dmtMGL7F= z+A3fF*sJc398F%<>KEeKBcrq|{YYX7ZTLmoy#7paj7=SPO3t$&FynLITt1fGS_R=w z*y{``Cw$!Q{o=FEKx)*3ktC#FBcWEk#B`~;R@izKY3nI@>Go{-hBSR`ieqnC5Ic2c zDniQhcndc#+=Vc;5_B?SyB_(DFqr`|>gbc|iiMikY)!zDt_EmAB7gCTp?%`^OT4 zuMmAC!BWu#lKxS_xBhJ#EkIO%%#JeU>3`$bx1||1@!s^O{osO;{jP59rC=-0G;wh_ zl;=2qAc~$Oju|Z4!*fIu-+7k~oQ!mtsAVSOnQn|atvXcYB#xyRdEu$a*Cxpbbf(x^ 
z(P}4mZ}02el5c&DLgToldDPJUy8v}$5uh5bd`pTGROz2P)NO@hJ;*;88_jzZl=7tL zAg?0md5B^~{xeMrDGSUp7UOd3&a zUDmI*=N$5#Hzic`wDG}rY-yD|*;bc0$wiCRK-KpzpWg=^w$N4?xTQSy$Rr`wbz=%P zf`$U?)39JAq7Mk6vPnli%2_c!5gVprtOPWRjB9GrSYNj62POaJ$rIUdpG7nf`8$CV z_sfK>3)J9#tDsLCRCu4mr%BY{ha!?(66zTYqF!vu!;s2vvF{U9{`h|;`$`xCS4BCR z)vXQL3j@Is1UOgJcp9RVIPE6+f?n%_fFMenR{oiMzrRqkICYL3o-Cx3&C`|VM5b{r zyFrkv=~B&^@teXgL80y~rawGl8>(>iv zS=rk%DgbeAQkjCH1Gr@lVaEONBGYa~$WMzmc5Zmv+KNA*A zlgb}rA_9j^0n*KXus1z{Zmm8JqLQ#-u{nHy@vAh^MZ=a?6Ge$#O|Cu9L+Q^58)Qo0 zOT2QZwp9xA2Rw*6m_||9N_3awHG4RT1Duk}Kk3S(>i$PN_j1@zf$kQ(gSq1uAEpai z+0C(ePp(w>j;H;q{re@Lz;9%I@t!Pn0t5?5%ce&hXXiT0m94IRKv{}@4TeGfuF(o& z6`?s=Cb6wrO2-9F58g-o8GmUTxUkRY<62$Amuj(374a6SqnsQ$w%f}+Ko17d3?R!M zk*nZ7)=64w*K~yHZ3CleBYXo=`Nan!{^q?ov~6#ksSsaP9>V!AyBg`ODxa!bq?bls zc~$_ikZfm{3vu^9O3iFrcI#2?`mJ)8e_+|1Op;O1d{fl*=x6tw*!gT(O%FnUPZ)xB|MPD?KVkf&DwBH*2xeWeCqTk?-e|GZkj=t%5dM*q ziK@1-tz{=&L|WH{dBloHw3=xf5OCsOH8S8v4nJrYv#~wOO7sc<9dAj1pE_5SK z220E|+@RX#vVQnIm?6I}%+jmU2q9~L5RFmiF*Q)~Nl#vF@2Dm(u?qp(eST>v`wu7C zC8kN%0`g$F^h3AoN@f&wFz>U(AV;1lmn8Elix1P9Ki}Ob5QLevwA?y*%mV%15v-_c z?5=EqmhkkNjJ#-iU=P{+ZzU3X_2+uO)tA?jQ72Tit@ustrtDUY)g=Lb3(#bEWKH~w zOQR)VeKyL%JPzN1&azqVxvF(jXun*WU$!Shz-_Ox9^S=kL2|TkblmGDk>mDL6IT!Y zNABn0PeCiK%EcNzAz>#@vk%{SGPZe_E-WNNKwKBC&7`#dif13BkBHzh-q=vKvw;(+YF!!wpG7OIR@?0KKLlT<-cWX3_v=yknisJc_< z1%WX9Lf=v17ykm2d4;qQ1%2VI+^B_wceH7C+Ki0B2?_POjP*`2Z zw!9Ym^Z266cSvjVoq{Axd}U~JD0E;l%*+CfNIU_eHo0cO{+-I7T~`3;fxan7GXhe^z9eN6PXz>u)769oj{O)lJv>q*eqZ&KR2$VO4k!^eK7Ejx9FZB;E zok3P2xKauZKUmL#aFX1)yM$)^Y@wJAiFw<97CN;At2tMAkH4LE7;H|w{suJWqLg{; zvPpu`_@edu6W2e0{8q*LfYHZUa^b;?>XWw_d95#ePr{9&JZ8J$iBgm$d$KO3{%-i` z>0E3-ICT&nzkDE7njXZo^?eC5i*h6A@FHpXrj&Df;^%g+UsRWAZyvq6Wr&Jh4QQQ6 z*)-tBi;J(!q+&Np5uFO6f!oaN6y_%*nmyP=eI5LLBmHgfUx|YoWDDu&?oe?Y3;YlF C-H2!a delta 2944 zcmYjR2T+sQ7T(pwP#y#X6G{+70wM@3H39}f5CbGGs6glzN<*9S(Kt8L=gxC zfi=j|VyII6QA7-&5iImPk=`tnB(xWFcHiEaJ9F=ubHDS=cfNDKAxc$P@SkyHjIH(Q zsNDHH$=JbK#p5N$J=(EqoMhfDBoud}toL=QTE-#Ei3k{oAH8(vKZun?+n)Bb5^5(vh 
zLr{h$-j;;qNua9q)7X&MOPcMZbyxuU(3eubrR(aHRE;m|Bff3CIc)A;4U48TvJbXe zOe{$LlBX>Bqht4n`WM_aPF}HEn=Fv88mp|#xGP{WAH$HH5!!VR)}LmWS4`0)O~VcG z9yQrK#9K1S{SX2PhWQ%UVNVG$qyd_7TtQ^2e-yGjGIzL}P`b1hxrG6e6R5t(1zjhdsrV#_m`f#W-sJy%o9GtZJ}&anQkKf?ZXE zxZCFphUeBN@|c<+Gnh5jW^pj}Pp#VG`o z{m!ED#wy4S%Fb9`Ah@Ip!y2`(L(d?I9i8Axc0-1ART&Sp+rkI}_%-(3E* zrU4cD^N9w+0n^HrPh+d&1Q*5yoez6IU5t&2ZOCA#kn-j%qqb17?F;K?-NM+1p*@?# zp|S;BhU5V;=_e3`L})s!VF&P0%^d+s?ez$Y-|yS5;v%;4do5)w`fBj3ke{goT080q z(Bb!D2Q>)=kR-owk5xd>{ALMYv>ou+es-6aXYJaJ|8)wiz70e53zyNw&1RRa!#M|W zf5^`5EZ>h}_^vkoD#BgCE=@!)zYzn8t?Ure9!V$|$L*Bx^*H)@8I=5jUcbTVc@y(9 zW%$AJ4p5`xEMy7jd1K@Dot+34A+g!jw(`tVohA#GCodu0|J&B3HjQCDd+^qpvRzf0 z^cgc%X)elBjC+#jXS3oFGxrDuMT5m$?F*sDS*~)UL%@S3l5TRaq!pODd#;|4Cr)Cu8l=D=z{f~r7P#<8qS*KIFDVs1-Y~%Dwa_vO z?N~Pp$%VvX;70hye@m;h_(NJ+f%rMmr?^M7A>`b5l%g=$D=Lo<-;kXHZx>X~G;&p4 z&a9=F>Fdo)RFyS+)G4P2$+gU4GRivYPDm~gtX_C9ODhm-b}aZmWR|IalQE#R4{ z+nq8K-`XE}K+P^|4xv@Q-f7E1m9P1|gRUqa=l(DT)sMU_E~VX?g%X1mU*p9cEQ&BJ zvU~ShQj{zPyRPYh_}=~Gm7OE%49=SboF zO3GqwU8hn^fsb6C&*dOgdZM4c-)*FbSBVyi0fC||1Lcn#y(rDr ztW>FEe_DeolXyS7G(s!Zt@}-Xlq*i=`34nUw^sq2g90CYJUrT|q-Fx5?{j4mq9QKlaka znNf5WFIuyp)qCQe8pA21`$W)X=J7+abB9!nx%6r3mf8n5^J!`kYsMTzAtg9hZ!4<< zl&Q{Xf7i6QX1fLSS_+=jcf~{IdVy-zn><{0>-}31rGLEeFJsc$Ki=8f z%Z)bRaD)zKM))qfSV>u39K;B}1fZWD@Sw>;j>+@nWR^O$Qr=92RG%ot=9^G?atWdt zhLe^0oaAqHw{lWcf0>zZ7rSaL2SBY_9ZY=@Ta0-74jYzaj#&KCpxtf~L8QWTT6+KJ zt~?~S_wqb*NA$BH1iKsrsf!X}*@N zE3$jfeNfem!urJdmHTOjUk8KHheQH6Q+ddYL&V;1_xVD#dS^gPg=zEdxPf(!&1wvFd@ySkc?2 z+OhcyhdB&=rLX{YC@z7{Nt}N2Sm6TEG-bk)4~vX`999giEw7)aS{#7}XwJe6Xvm)1 zAR`)4au@Og)+6Nq^K5Ttz)!FMDk5g|CoO&r0zvfxL9yvI`iaLs^CFJ3;P^F{e+2{B zHX<)5kbARvF;<+bsc}zpTIoZ%#PPc3#)5j^`&Z`jbg>S@cX>5M4GxRf020n4c(Pf@jV>&ZvlQTs+om8K`Z2YEK z=P4EwL0s%VGPr=g=Ri>r*8FB^F+1XI$2t_b-Yt%!d5Mo~$nJwgTg*_084$CB#zuHr zf`k!yA&YmZv*4p-xd0B%i@z@D8!`c6M>lL)i1CPoX^&<>%=y4cqvE;@#vO};QH{9! 
zB>AqnhR~i+^_@9tDirYbVk1EmFSug1i(JL?cV+!q!!jbwVE&pSip5 PC(**#;cP2x{3!ne{XVB2 diff --git a/assets/ads/extreme_gradient_boosting_train_test_precision_ad..png b/assets/ads/extreme_gradient_boosting_train_test_precision_ad..png index eb329535c77b46d40b959cd88037e33e7dab1bc0..994e3a46840474feb769be70ce883fd799986ae1 100644 GIT binary patch literal 4369 zcmds5c~p~E7LU*rB$e0^NM{tHMMY5(X;C1usAwqy0RcrI1j-TwWR(yEl9*DZf=Cnr z6(lGq$XXT?_P_{Yi3%74h5(63*aEVKon$_sJup4aIi52!|M-0`?|tw0?tSOJyS;mI z)f#QON#P#~AP{Ji*^kF%R~G3!Do;wsf#^8YG8LT%ymSDO(&gp<4%%Ffe_uU#rEVBkt(VT6)cFNV{DhBzHv=|UxuaigHh{jl%T zNL$h_zH(PVmC<070#t5%n!v>fwi>unvNCdNs*AlCTK<{k7l*r;&Kf>f^e1Hs&oN5} z2e^Lam3-%i&N@ug9D9JW+)y0#7~*-p`49qf1M2b6lF~pE${X^pgrqvhcT&M7A?6vO zVq9)b#E{GOJrnBsKH)U%6rbscAL%Ky((G>C96(MM&V1PPD&Y0&HV8)bv@YlCAS@zq zqH2-RsUK^#n|+FtCK-1>WpCvCrqNxT6>`6W7JG7%TM`FGr<8juGN+FbLzuIEo|Kf# zim}L2`X2FtN$6-!{-Ke|m+@km<~9xG(R05tnC2JY4kY5B&3p%2)ot!H?R3Pj`*?Zh zjxD#M`{YM)Fpc&R4ciDCq^*tp%@Bj68M$_0o!w|N-3HrVH4EnWbXUeA;5!~=B*2?| zDk#&(O4`|pUb*5>(M-)Pvq@)0A>1kW{r> zB3VY~LPB4V7O`Th?GE`_^#cOmdq_*&UCf#86*|C?STRQ_tgj3eC5+6RHlw%ED8tnYEms ziYjVpNVIf(&}~L0`7p_k!Y(@VVUhI`jEHeDuF8WesX#j4Fpz;4U`rBbd`7HUM}8^X zck7?q%{@#c#ATHJzycS1a7^HU{f+P3+Ls`i8fCf5>eq3 z>4FV%r!e^W-Q=53ZfABiH8py;+q5cTR}}C;9VyDlCKX?3>*ROq5cs)}PEv6W61%&) z!pU%e*KabD=J8MxOo3Syx9A+Ad03vgSrP%MS~5GM?LfR!dajV1`os!Ox6F}!N1ySA zg<(c{lchK>=0Rw-@%0?|u}&vJgNCovoIF1i{Xl=9-?*o@GGyj1Hgd6?c;}5>5+v2X z*Z<<=pzx%4XrR(f_|HP5Xs7cWdq@UW!q8jtp7h6y!H znQ)mR=KeMfwqEO?dsY#Xp;&%;Cw2UW6yPc=xH zRiOg5+b?odvM$Q=J$0m^xZuPFWu)|3u@>?dXlTA%de65@dwkpZ5PYFXgLCInOafOl zlJ!Gb`w51hgPwIzKjOI`*PRk`m~hmeg>s5GP(SU9zg9_D`lfS>Qc83iCHH$>Pol{? zFzK~*7==c+h_6f9n7ty#Ur+W~fry+f$7ylI;HEkmWmv=F5?wKv;YkV1cz9XE17#|kn=bER? 
zjlYD$0r>*_S?+Hkk=I)Gs=eKg4C(UWvpYK=2J&l;{WE%eQ2}XGJzX-O{tIG!aVTN57^>*XUg};u_Kfi9((8-Eg#oePjSm1t53^Z{KzwTa3S5gt@q#*8Wf(x&vYzszIpp&P z5s__)_hpqAQa42b$;`q2OCXu4hXLMtxw+yyz;)M3kdJsb`Q|}@gsHf!Rp2`;-3?v` zY=ZzUBw)|~?4wmoUz2nvS5DG-`xueH_Cllab-}Z&CDUiyNYK7SuWlFnBpIu zLc?4A0^V!u_Iti?Wue;9bvJ1 zFn{+;;nLUu@9uR(ELSA*v1E~d=pP(0XPGxx&b_OLY?Zu1RdI(XVW@V9C0G7NWWTUIg{@vL@` z$I6Fbx>FWknFgj#JA|Noa!j91aEb;Trk>)8Q`0*F=eeK3d;@+Gi$IRqey?vXtu8;`%@Tr1J~>{kgOa(vDp5+=b>(0v)wCi29gz2!&*BIz kF^vkJX7x|2!X?m!?|A-^Z(HM*|JpD+fj(Yv%=PDg0rG-dvj6}9 literal 4373 zcmdrQdsvd^o^U)iouh3To@ZwgIVIhATf&{8Vy6EV*o0hhM@ycOGAQafFuS) z!vJuJNaxbHS~>HE=v?4S2QXSXU908NkX#yuO9uvx3orsit`;ByfUDI4Vp=*4 zVA68wONd1Y0B!R_l0OKv%D~;#*@u`Z<{2Da*OLXAijCFhuzwt^6z@2_n`ikQ;Xo&F zJ(=D!qd^}OwT(h(PC6nu?~`@~%DoPr)xkf9+IVd}^bWDhJNn|(*WwjF>p#_9Y5mG% zJ@fyEc%aJ4y1%p9j})>n-0-C`2b~QL3JT+3L^#|D5zaB}S{k9eQbnmEjXSm^A^O^4 zkCFKcp0&6*-rJ$Pc5??rE0@wL1P2ZOP2T|}9_LzkP0~lQwn$98B>cOdS;pBlgZX93 z;%~h8F`)!`Ji9zmnC?b#AkB2D(VqC(yQfUwa@^n)rD|*+BR@T8i!lA|JY5mfXZ6dy zZ6qkXg5oDc;Ae5{jYQwuLA46`0l4-nsU(l3Dqr0xVNr0n+5By-)2UjOUg5g*bygF+ z?iB6B(7h+NYF^m6&V^%QdUa+Vzh!(lREF>j9q8DK)7ng+89h50RvujIvjm?2g4=Un zMqL9;%o4ZxlFM#oUl5NW$f3%XQjtd1t9cmnQrsRABI`;`f>>o#1;WXNUdsC_Fhm#2geUgtQPr``vXZOz}$3?1Ad^0z4`9X>z^#J?BGEG2EMD%37 z@EpU%^dyvXAm&yUBA~eT$m>+{?S$I>H-^hl2>++L2UzICH|KYp;Ixk(ni=?_nsoVA zwu$Cxahm-RQhD#hNaJJ1TvxYLbI3FieSV^8kbHds(-}`OG4XuJD_s{nuyDLv-#@vt zyhjDGg0^U=T#2SxfG>`ToNIU~Swnnj+(&wFTRWXz>b?b4ICDzvxkSx++-o~uT| zgp|i}Sf%|IiPe;7W>~A9Q_CP&f;tvr)#S&RvEBizJVjQ#J*drnk8LK3>dw*SbWIKNBfRi2hwL=P7oFSGgSl&*C>Q`5Ai{Rfz4-z#*V zoo^ufc(D2vk0;;Y3aP0Z@x=BJf&xn-N|S4Gl9*}sVbXrAdV&YRVx)L=1*dbIhl?mO z^~HwB@KwoNvpFiBTiW)7xnA2x@jD2VfRCF6Pon6mv|;LN9r%rQ^b?T)2OR^ofms&{ zP)@|V^Pr(CuXQfNLZPby|5wBd%%p&4-Gyu(dcoviX|$WJLEDv#BD7~r&pT>-6?ed551nUr3MBHA)448hY)gMAEf`rluGQ@{m8qHRk3%|Gn>%tV}C zL2tK#p?6vFI`iMw{iA~K0i!Jsa?cAJj&JDz8`^Tz@r6QRUVsP}9@)1KY$)&9!n9*U z?z!pvWNswRIyHPFH;PGX8kB+G@iH?_-!*0RJN6R%7`>)OGhXgHxDRBQUdRrNK$ziX zki;8K4HIUE6`tL@sR`#kNHHm>^bHtBrPObP{4-qM18XkF{-0K+|3+bnf2%c>!|Fim 
zJLu&K7Va4Iz^as^C#eaH!r~hit-^;E$06p}r}5G^MW;;7tViPz1(lVOlIrrR+9ew= zugXlD(EYtHOnaav_TKmsM`Tp4z#qNnM7v9FP@n!c%KGE+&ayXUyEg4Gj;3yqO{%|U zjrHn(bgjZ6R$vRDoMm&j$(>~|qfgNbt_i&nlaOU68?8>y$>^VtO>O=0xb@ae*+FS$ zzP9cDL&h1&HFdN~%I(>Hn1}t+#6WI8yW)&DiG{9NmhUxguT>qbO+Ei;Ioa!TD@j)( zbR1?VdeX(!;+{Vy92TQHt!Ip4h`<^o`A?|0$93^5V+HHr0pEltIJz$9W-uWW7Xjb4 zoUvOsTeaunl1uWIc0`;nDGCeJMST9zHC4qbY^2VH$BViqFmhrj!qm|%Q=6Boz$Isy z`coW*lzyT^ccqI!G151i!F6axx*B~7Z2FbeS@M)-)v?|EhE*8If z+D?jF)6mbfN``rC)RO5$($Ss%?`+e_e__ukOns2BWE>t|!K!c9HY^7=IK{Al82KhJ=R2w!VCU+h zHV!N!{pzp27%4}d7zG^zT)_!lxFNd~XcI;~68s93|D;fC>h$zprbK4ez>H zrt`ZN^oO#?Z(a+Oy+B={&Flnz{}#|d`}uDHE$ieN>F~h}`f9IOabxuB{SF57b<$HF zvafov%zVp+&R~Wt`#%xk@w$KRt;>O7!^+mPzvx9Ah+%-fcCv44;pMnObJ?rL~y^dd*1c;AOPl#a9wC!$> zJXm!Tgi~_pRr}LK8Od@@`R#@IP%c*MurbEbK-8daPgwnLc1vvTo*s%QJHJa$E@DEsysgn38m~^BC%-Yk*540R$-vqc# z%|=)~uta%y2sJ1}0w{nE$}u2))^is)U5{@T38x$HEY$eB3_L$y@K?TNPvoFnxmc{t ShJwYvH{5r5x|Vz$eCZ#91(2-( diff --git a/assets/ads/extreme_gradient_boosting_train_test_recall_ad..png b/assets/ads/extreme_gradient_boosting_train_test_recall_ad..png index f9b56746114eef64a8c12210a07891a52a5179c7..e48e0c3c1ed05a079d16c341ce9fd70a77753bc5 100644 GIT binary patch delta 1530 zcmZvb4Nwwh7{?PONJ?}Isi}~+oB0);m!_s7eI2)~WN9I%qLubxil?YxDvX#`u55Q) zGY{uSUCHuW%AsiJ95^LVkL4$%;^z_+L2-!6u-UoY?Y(>Ny?dVb|2+Taz2B>K);iba z?sR=GU?VjgtMC>Ie6msb93#x494C1Yt+xY4if&)jE>=1CJPW=1vx7ucI~04_Sgn7M zq@cQavxDb(6Vq>_h2`a4071fyhPK*B1k+T{YTtQ`E>DBsL-Qpq$6$1d90V4u&DcqKw zqAUaxWX*C0OU>3%CwOl7=%(Um3pV{3-JWWT5w80N5;H8aMTH8vF$aZ)BiN`$hOrw zd^S65ss2~=;u3#ukwE58DImYSm^e@u8o0g@W_pw=OR*h1SdzmdnToQDfN1of2WcQ< zcX5dCf6D6fJ9&wu(33GH!FbW( zxOfK0;-AMG?B4@Z!Fp;m`hf+l=PLMeDK#Sx{eB;eo#UlZ+-?V|oW< zdotYOnHppuZVQV>66f1|c2qER79(t0oi*b`ULR|>++u+>2ZB0nA$)|@3r3oG_;f<) z$`|}9X;(U`+soW_+xM zJ4t({-z?STY)! 
zl;*)l2jbF(&I6r#FeIwP-*g<;; z4^hxBU3-%AJ3yRf@^o`om5Lm^G|4oVMTiWRT+17MzYtZ{q?wZI7HgJ%qq1(u&QkAwlJQSoCS z?XioXwM0+f)Gxe{$2-U zNNx2R3|oDd;@o6<$BR(yutJtpw#AT9@>7PhQ|oB-@L=xr-rhhOJMk!QR1 zZ_iy8$Mu&dXB$tpCgMgep*RfAZw-3%A>4dJNL+$%CR-g9@M$ui zJQyN$;O)F@7YxG(hX4<1o>IS9QBaLL?#L`WZ?!bAc!+iT+7CG+aOx3 z!xbfXuZ3BEF4J|zTzcreUgoAh78=vS*d47&9s0RxbJz36~oWPcR_Zck~@C#w9H3{iNNq=L|`%a>(6{lfSNskW> zMZ`a-d?;R%%RU;34-IVL6FKMT;ik8DB_cfj*pimkQ!KMV4#VC2LP%gG*M9919Bg&5g)jEk&MC?e zYa~8vLy|>0NBD%D<-G)7aaFkusJD0VQ{NMN*)~JTDVs{|UR_1hRb;v(;|Wi1s!!G^ zX9FYY_hjTDoBs=LD@1^NM-OTyIiB2Oa+S?@$%2~~>+}&DQxu~exs)|Hp%@zvjUeQeL(^g4+;H&Y{K=%vqU5`S z(HTiEKW_2Gor?YAZ0|rF>Htr=*#r?#J{YSg=(q)0jxAM~^Xdpy(wbi}5#YI{op6kn zATmW4a>!jb5m!x4O7owXVR{PSH>dpxsCt1rorR90#3Z9!g1LwqX?K1btc~H`UlFV7 z5!Wrip4{5nINPiR6#H^1O(`_PYAip;o*2&ell%Mw!3(6L;FF@toS}O@HTI0jGD3Eewd^$*J4GnjIT=eNri?9Xmh3SoCSx0VNVe=e zcA{({ON9K6=Xu}v@87wub6wx%-1mL%?Q?$$B@-kFtmlnbzv^pimYVPk}^)`(x zoAWpDMknW63HVkv7(Vzs959qP_=6-#JZoPbxPSC43{BsCc;dBYhG4gv|7D`|;rdbS zmm`Tf)wUxwLT&iOk6OmIVeABzFO_1zx#m^Glb|l|A(K9C*L&Bj@;8*ujH@0siCEfVoF zYmy>w-yTLE*w_u{XN>(M9E~K|UuV5(H}bc*xtxDQn$A4Lwe;1S!VuB?(8hyAKHDgJ zj+WKg_`dwKJz*lumVSooQ6=V&({^~h;Hx`ZYKjvE;S7u|Co82l0n_#Fs?CYZu4!Gq zWis1NtosB9T1OA9ZMmOhj#C!BWtZ1J!Ufg5$u2JyqZt|>A*uOpr81!@yLq((AL#O~ zW~=lBjYgUk*`<-a?3PGp&p!?oN z%{3`nyORA8T$rpxK(rpd> zy>`|Y7sgVWHPkk#UH>q|F=r1m1~aT$oth>owRn1)rEJDk%*+(}C!IYb|@ARd;|)iz^Y+ zmXGp1aB#^`NXFP@lo3?M2O0Jz&9=jr*j&hPgz~fI4+AV4UV~}TR)i8U3*20|pNUrI zNxZ1%wqB;rnVUTgrO1A)w7I&h8@X6MLe7oMqb^sLScwiR4K{8l4O5)7=bi<74*ZGKUR`M^@<#-TKb4RK({)*E_-TX?3i_`AgspZMjR3lB46BVC3+>1383_qxVqP2fBX_b1&Q?(EatlD$K_zo#8L zme~(P+}VxF5*fdl(mOiB=ma%RV>Kg8UY30|SHRk;kxiHzl-qs?Eu1?vuLD+AdRVIZ$O%*VVPZ8W984lRk0M^w15 z-?WD7q?uJ<0_mv#6|vC4yWXESv0*&x8a_zWgThjFY;RpUX#8`6QH&^U<$53;sL8I^ z2}irX0R(}Y+qEMi)}*(VH3EUUQ4kXCuh_Yrot#c*+scAPti*Nl2wF+ZI;}U1so|)j zUTIg~S0m+jnAypdPr^j>#wuXGG^!{>j}uuFsHefh$47U-!9akb{)dHPO@4K zk#LEf;PD}2EvJPNx0Bm|)Zvd&o$9-~AE{WL+jaf@>{iO-7G{+T|90-xmzW+Ve)>Yp 
zf;z$%p_YzHx4psJ!vtppO`UBcp)=7UIm{fp-OSic8A?g&TyWhZfNhsdBPKsMzU%sy^ zKy?N;titKjf=jOgcmSL5M?FVE2sIR!BloqOS;FbQiHZw?Rf=1EA4ao)t&Y3j<>-MD zvFpX=2cp`uDHZMTyX?x(%M7W9^W!387#A?%2;Ubv ztfCKG+bv$oJoq~siY*qb%!@o(YhnUR^lL2l0NFc2l@%&W0uUOGHSV%Mr|-s{-LxKK z{*e}VJM*r;KroOSY=8|a`!e4?uyd)jrr<$q7*I;p`!kF247g~*bvfR9iDk|ICH4Ey z&-Gd6EJn&?@=;3FA1dk=I`u_PK8zzD(Nc!jG+`@a@Bx&l_U z@#wy9tST)7Rk1*TRQ^Uisb$z?TremAcWlO8z(`&+n}HI90*jTGql%`#-M1oehBuGI z1SN-d!M4dOhZEALN(8lq;=G8t?E$O}h56C-8a(Z{DbS@h?9d_K<0lV58+j zpJhl?qN2~6l*BV+BCTWIxe$U~lfe5o;~N*TCf~jyQQu^$LxV_C+7e|W`C|?2z4Z2} zX=`j#PFxk?#)mX}()ofY^(OY-pNk%@-fLz`e^iGe^P*e^}6~qRK%bJ%7x* zfo^_Xi)U9u;7~0eQ5CaQ6RKb2+SZX_x_8d1BEK(!SQUq50~rHs{85#L&KTx>{s9`pdj@#gNzvD_<>Q8qQ`4uA!y>W zPEhf^0gqic11raC#W8f@At(*xYk63(Y%s;nM&$S^SGBwep7tpw&~45?Hhn z7X)WAG*yV+r@7?Yr2jLvM5mu(7k*1LVMat=q+3yp9&7gc5uIh^xq=1k{7Q~axpH+0 z5#wV@4T4)|aS$M`wIh&EGWAzr9JGx+Bo_}6lLbkvmI;F!B>Km0!VmC8Z>mW?r!Rw< z2W(525@QlIlzur;%01@buWU)vS1KaCXi1+ujH3XNJ-6sbTj<+wgpY>Thq4n?tNe;h zMiqW&y~!iJ@ED1#6>6YAEO4-FXg{7)kn`jCTq=n+F$4ng4!Qxc8)W>Tit{&2_gvj{ zG`#;u?FA6yM&)E&O7Mn;0S<{>Zae3vqlfhiNItKLuVwx-fzOJfaBAkINKNuQ!dD+m zrSm3+)#fw!%~|%Yp)==Yla=Y;DluBnn!q`>3 z-C-eV`P0{{hfo8qzBDpC*Fn5p4uw24+>^zKw@)CazFq~j2!F}c=G%owu(p=mT)-Hz z`y$GcK$KY&Hjp>TTb6Si7Mm3q!5w~;Cyo8KU9-)SEqi#bu&^;lgd4I3Pkb?#4<+hJ z2Zwr|R!)_^s8atlS#4gN2Y8;M(5Fh>qsk+#w^qf>k9f+WJw}oTL|f3V@!l?D!oZ&a z`d1)z4cHx?ShbJTCp#9miB|oR9B?h=k#eD5cBplRKekE|KW|Y*xYyd}l52hiTWS~o z*1bh;vK|w@b?^ykE$+l&TzGS6>Gf}cz+%Buduh)SWjZgt=vBQMpjLMYIR0?0=$v?9 zsP8)*53FL9-uolh>t@#oa%+p~Lp~|^T7$_g2YX8L7+bCKtJ;~4a$yQUw z=goaTc2BFz^|&?%BIjvmfTiNMom{nDJ5!z;n}w61$XY9JRFA?tS6!v&ZjO3;XubCI zksKb%uJ_YTFnxQ{R!ZfN4FRxadEcl|re;`ushn9ZX2N3MWDiyXU7~ux3Cbg$*HVNm zawxMGA+Ek}ci0Wy=%hTvIak^q+#CZrH>uMLVaU>x-(5{ z$bQ{IApS~Nj(Wz!zDvgBV)S2aSG1?uiZtEG{uU39#CVIKbXTHT0E0Dt+Cu+9FFL3F zRfeb4k_5#l8(S(%#>Z+V-=8TvwCNPP6=Pt1@3saYOfS!vm1kiH}Zgz6UpSPEdy@ z8!riEZ5=C9jg6ZCrqq)bT^j{H=|W=TF|YKk8|24yK=FusUL@?UI4?^OCBnDz2BS qx#I1i)U}jYJodZ@{s}!j`wl=HeC0{hZk{;@KKeSw+Lf9Z?Ee9Kr7UUy literal 5540 
zcmbU_2{=^m*Ap|@Vnhbn%aUD~XzZb=Q1-Dk#2`i_`%*^8mhrQMEH#O&$uik>vy^N} zB!k8-8D!rw|EvG)|A=8BAoh?6iy~mbi$KDHBoYBj0wIlnf)M^l60i`$9}WTmMIc}a z-~}?0Ku9K#Ndz(oX(TL!ghi1EV8fC~WROAvku(yCMkdi{-4I$g4MHZM$OJ5z1hzjy zWE#i<0y2$8gFwI`Xe0uS3^q^#Y=`JquBZnP$cYd`JstDFY-&zo9=Tc|M!;<#I^ByP zEg=7M=FCmL1eOY51Y6-3fUe!9(e4#kT_L>~PM@*f9oyQ`mX}|R-`A@w{Yr%9OU`N@ zQIKW7%Figy2IXYH$I=6B5tvDB1s~xI)xU?XVs36YBiFTB8?;=r)7Iz}HM4(!T@m4l zze3vzV=GTO=-z4?8CjfoqgC-d{OqEggrgU*h#$tAM02V0W36$dS~m6egG;ZNG)whX zW@?{zV7!2K8)nVMGjR-GC#9(0TwUs1V)wSEime;JIPTP_*_k`GW7R#m7VepRo6|zV zfo%Ra?Yp(;P*U)6q14JR>EqUmmbf$UU-Agb8eh2OJ$s9WlpDW0Xt*Un(@%E0R#LNB zGGH-4toN_H2kHk&y|qT2i})ENL2`N`vGHaktOUEb*7yNYV(y+@&Wqy(F zk3PPbm(X~5Kg=%@;qqc+Nt3JvBS5+y#ErRVOgJ?o{)zx; zRQ90KhWsr2ciatk!4Zzq&zLJKmC6lieCMFjN|zP1E|r(nFzkKdL$42ZJt@Grjutrk z1B*_T9u3l(Quok5q}wzn(Rm*_ZaMCZt((ToAilwhqfpIKa-e6K%IVgQk@lZiTeTCd zPz~jkZY~}CIReX-P2h<3QKge&A=jDwJ8+k0xn*+Grr|a>M0U2#i}4Sebr4bsY+drF zX~r?dny-Y2;QZeBw7LGv`F9a687EbPUh%Des-g?UKPF%e0CYaKECI_#t zQ&b0PB)}yQARpH*WQ3rCv+ZneQ77GqG}lh9qJ&#uwz3mo%f{ws>5V$@IzoSYN&Q)X zy*vQSAc6^&G>mAY`}YD(K$OmM`Xq2fP)_hjZ`USI>26Y!AF*fC-GqG)Tdm&=_&}l@ z6NYiRu4`B)Hg;&JUcVN1xWXmUF(rhVa#$Gsi24$s{fwUV6f7ig`e2*-G^f!g)Sny( z2i70}`zR~4u&e2KscVL>I-{;P_jyLSB);$Y9qgm}2z*z=T(0DGk=l55s7WPmRa*fX zKo9(&`==BORM%UDJzI|b7%-;oU4K<>yNnJ$CsI#&!@54lt`2_=^Nqb--BTQYNsS-c zr~8m%!yMM;78FHs<~cRo>nG%%trkIXp63`fC_%nD$kFyz+=i-zlCpEhnV=H7Jw_&Z zg#qtC4#L)_31hO_Yp}~BMjA0(ixh=*MO_mp`H+m@kCK7j&KwzYsk1-o8S=eTr-CQt zZlHhU5z3JSgk8y(yGp^3?Lj~E(qkTo;DriI!-BIDK%Coaee#<)7J$Gv@Q4l%a)@YZ zg&j*Jy5>K|oVKvERpCnzQMC^)z%&>-oda^{-|FzSP(~Df1vBPTLfu%r)Nm&%g;gbX z&7*qcc+7aCQz9`}ADqMULvKp89-kKGkqm^MXY4SKKK{7L})oJ>=-ii+xy=DN=g-7t@Ryg?0|08aU5qNlsQ=M+oBjG#j-IutR(w(3` zD7a;f^Nh1ik^vs;#m7!HW6t6ns2=$g-dKrYL93HA!NZ_PV8q|PZA%}0J=2h|E6Rb~TzC>pHX)KN3k^{!t(Q95WT^yNO~?hmorObK0Y zO{grH4%mVGkF>)I{6=(@F!{??FaVvHy5xeRjOFR=4=8u9dtFsr6svtbDG3JWbF3g) zY`A6?>|f~`k_1L@RRkEoJHhEGG0Fxc3F z*>3#V`jydaH0Y*0zI@DN_MYG&{^$$0znNmQTdZbge9TTz1=ZNqS(Vm1M;x3s`qz98 
z9MVFg(CfwOd~>}-^y=8Ft~)Kqh7nNj%?m@8>&2dXQ51J=$t*CHXJ8no2Ns#!tyCW;&%oa|%F<0mrikPdr*@Eg%PVa@a4cRQ_F>|5ABi&hV9@!s@*<~9 zNBDCq7e%N`fBR(caxYOH@jYzkH%j)2~ON{m$H5xUCn+~3B; z_stusl05^N56WO^6z!0(BMdV>$567>c)Ctg}dV7mBzOP$denBUH2nBO+$|Dw;y;Cq){ zVp=d1xx!S`R}Sc59hWQw3A3#+Vxe;DaKQ8(z1B*m{T41zR+;NnO%i zQGdWReys1gNT49-qgL3Bm*KsEY`u6%!IW$l1yJZi{GII-6>v9Q*F;~HHTAxr*GF8I z6FlI<4>;mORvc%f?&++kKb1R-1O5cKIGgF8`p#M_LzTzgfWH)U1Q?u$X8kiu{JlC* zBK*y_4TQZv$Eonb23aAIgc%nz5fJzLcO(}=zUIx)=ROz%||H3Por9V5A>PE84i&z<_*u7zL=_huNTxt+DX3UDWb|z)PcaC z{`*~eC+R(d(haq`m)14S9yM{JkCG$;!^^ae)0VzP7mO_}U1)J^nC{V7H7QF%K`%Ky zq>P|^Sb>l+1qo%UmMs~tzqM9&u3P1s?2h#K`LL~(Xd3^p(652{AP4r*#}Gi~NU@*L zw-N^V2Ndcg-v%|kNz36kb%5`*b-wG8Mff{4^vmVpq=+!dg9X2gXzk1fvGjYYZ`_`)rhi$w*l zSDa|Dzu--K+O!O)9USj^{HDt^pJ*u=Ty~{@f3|D&P7eO`c;sN8`&M{JtQ+IzCprD9 zM(Z@9?C;0#<@M2-o#cf%>;`}7Jt+5XkAtRQl~&K5%L9kB$q$c?ykl2qeoxqKrkoF? zn|wbsq|Erm*+{4Ob)wkzcoN(gNiqPE4Ui;u{#b)D!+kZ2Y~y?+(^RCGk4;MLwF6~a z@qw%d{ZVnMWrjo&+R{w)bJhbGkxQI(do1H=p-1K2J)Ki!h6X^K8^~a+_-sft0iND= z%g7WN*;y@aRE-PQyJdtby_ViWry=&Wi&Bs~&9BC-_@%~VLYn2s|NRLEwvX>x#RM=!}9z>KGHWJNle_d-7u9ffa z5SMh-DKqT*G_1gT_%bL`W*CrnVZ|9#vGUFsIO@T`HLbYjIT#QYt9;H~f*kp=Vo0>^ zQYx_!QfAopY$Pkc-nkCtqT|h+Ufh*9BQ=(wmZZlqm0`>r)+L0{YV+4y-Wl=$1W&162# zb#vc|UkhrPsk&J#J27fXny-Bv1PZD2;Cv-#u>^Q%M2k7EWO{ifs1M|54u8TK9xtpT zxDV!3(SiIW?%4X)ZHhrp)SY?eubiLl&K#@?F`IgaPAYS!4|1`(%ayJ!W^7@=3DmnK zr)(&mU*{bIxTZwRUYNXwlgoA^@BcGU%x|*%$CNTdwKcuJKA1-jcs>zIwn^#c^gLz8 zYYfgi=sA>w#n?@b%#Yqg+*)hfW+G^So>BVCbIe{w@7`H+augzC`|gUF`6z-CRiFf4 zsQ3*>sn#|F#}(cJ#cV=gWS==7eSzZDc+sPIAmN9$2Zzt6EWK>=oAC~5hhaOP!WH|U z7ci1}M(yTni_mF2vI}qnYM5qFPi{@PN+6f(dhucxWsYL6W|+-hTA~_?@A^C?_+sAR z7^0h&Ce5|V%NV4QeoHxZL+E{p7I%B_D=F1mtWmeZbdhJvPAcwB&&50aO7sk#Ue*c~ z8Q?#RFH3k1?x5_YX@zysO6Skk%U{q7%vaEnL8CI6hGVJbV@E7XDvIBZcb+TCVJxtb zB@w}2xn{9+(7j7S=HroRivAON#id#N=6ulG%G0!xSXMvXXYMbrQ4O$PUhXHgSnVzP zIv4<9yczn-E{SWFDhyc{bqC|Dx$P{BJGB+99JuaiKg)nNa&&j;#d|P5>iHC+Admdz zAM8Rp4I^)1YPi0qe5`YMlFG4fI5<2y*LW>u@bNQzyGHtx-xM1ruhB>kJk_*%B(BSa 
zR-#oCvNA4(n>w$`(R!u7)U8H3unx^)nHXJv!k*Jpi=nH`{9@VK&&vvAJM6cnUKS)e z%<$h^$IH$*z2evUlBoZhHT*9F!mt8P^-z=7@3J>A=!-HkEAtv+0RpBh^nen-OxMFG zm|#=kbh+*=N8!NeNEhDM6>pz!rH=X;$R30Cc~{7sMj&{m8UE|ZiEAT9NxCj*20q1C zIG!=qxRMa560wk4YeGV1iSw3po*btDiE(svE z6)-3H!(*=HGwveGIVIxI+(o|_$u~|2e2U{{;)dCnnp9X&gg}8!$ydIPtcc&vZT0U` z2Muudcr{O(ZhnIZbRdu^5(x5^yu@hH*n6+LWPb?&cO z`;{;G)B51V=$uqK_SAxzqbvo&6qglqm+ z&5RQ=Unw(91zYg%8gGU+v-eIs_icgx^b339u~H2FZ=W9oM__e83hK+RvH+Pkhc;@y zQjmwDv*tf{7sNtU?5JAB`r#s~zTNfBmD{BU<{(kw1KZy{`0J$|aXoI5p~#}xe|Xwz MsBf%SegT932f+zZ*Z=?k diff --git a/assets/curves/naivebayes_cv_roc_curve.png b/assets/curves/naivebayes_cv_roc_curve.png index 676ca7b7aea0ec8724f439b9f59bb2696a83aaeb..47d654750a2951cfe382b0360e92f511f7c5a067 100644 GIT binary patch literal 6204 zcmbVxc|6qL_y1I5-&?F>d&^caLuDCTLK<5OvV|#(EHSpoGHRw!WF}-u$P%(I89Plx zmI&Dz%OJ)YhL~ZP?|6T{zdwJE$M^fakMlbB@i^x`@8jHa&pEHxy{{KHEsS|MML7Wg z01xcC;cWom80qL?Kf#oka9R{FyXBi^Rz}Po0DuAj2mllcfHFA_fM60HfM5U^2&RH5 zg#r*qN+^@#pa=vM!Q^--j!8tszeogtDLop5fZ`BG4UfPviHN}C5O^kM;Gh89zhE2! 
zfctk44hO~I5IAPTGl_^J;&DVKXW$V4JOYYmPJx3Sq47kf63>(~@OTCh&tUWd7`+St z5eFsW5&sP$GMIWMBQh8a0Dw6K1CL`6nZ!geWlYeK=94Lq<;yW)?Y;Gvn5pDi#i{jp*}mgtUT&F zs+xaj4KkF`Aes=!*y`1#jBf<_on~#lirxs?ikw69NF}|W)O7y-*L&Ldn7#F|)&1#9 zHwb+DuMfh;-6geurAHzj5eE*Z(3VL7-H1$L*{> z-6z?k&Y#=q*Z*Bo`tl4~UHe*x#P}OMh%wY8vel1M$u{p>P}!f~#2)f(JDRl-jEFR)o#50NZHNnh9}G1 zxNI|TV@=Q=qhg*hSInsIciQR+rR@Z-yP0e}u8x?f^a|@fXHoO9z-ZO2pEA}EH#`0( z!#Ps^_Cl?7wu8jZHkNjUFq_vQd6qjzpqDba+$_JV_b2mkEi40*R^N&g;~{9DP6oGIiQq%mfmaG+3$a?f7bMM`yEs=5*EI_8d8aM;t5ZUOb&-^l6ig1ET@l z_xDcfm|GmPCr^#2y4b21tyq#@aGtP8>R3dBQ|J)aw#C7r5l zw|V1+nY?FN#ytuf5NvvfRx4kK>2b2y*Qg;IUPZD~IZr{~+AMeZFjWYQkKrlidRFxlyRHlCVOO<2T2+A@^hJC>pXOH4c*t9Jpy11M ztp97P-ray5p;3?_KfBzuts(=6lD&{P8aB|fFr{SYIit~rfdKpCXA;85q@F1}T?H8Z zBvRmj`r2*NikO1~;LT7o0sc>+(12XI~Dj>Wgi@F`BR!0vN={&-lR;;pyTds(m&MItaA%n{b zc8x7wx_J(Cyof*kp;LwpBg(~rM;#|*aPL<30+H-3@#iUG{z${SE+^1)Gv`F6(B<~s z{Al;KeNAf7^B+)d)4bS?UoH`uulEx$hYp0xUg3fT-F+P;t#Px1!bSLbcGM;)6l z+XSX`>)IKKp5Bx2RZ6{H*MQeR#AEea)o8!se(0@oEF06=5oX!J9W(;}MB63QYs`^- zX%A5lV6e(9+C&y5q?${*4zon>Q`ZwlubC|hX=(uSx3#9}aCRrwpE=BO-lOWVxr#i_ zo|)WraCIZcY&}?)5c+hStP}DM0!3_S<-%zRNuT+jFjuy5yZ4svZWb$A{(i!MSjVZy z5!xhM_A=EJCw8YO8vQCH=qPBkwJs+2AGl*km1(7WCH>^Tn@?P%Z6Vc8?3rof2YcAA zUk@kapOf4+Bm;M!dju@*RPr}*yTlb~r(Sc_N5;~6$cKqg-t>qG5s(2g!-yWQrrfYQ z_L~JfH+pLW#{H^h!fP_Y!g1p&Dg&5Rbvd$rHCi!>#E;~sQ(-C|GyNl!x?rXQz-_p& z4TgQSCF|LYD$<9S?$H6U*nTl5obMhiNF%vPWnGDE3n z$n&Fvm+J*<7N7R>k#HhA>?-24rD-FNBsq%idbXBW|E2`U_SdufhFJ6zW6S>bbW& z{b-Q6^7xtuEq`jA2VZdT=7#r8vaCm^3~+GqX780eA#t!9gEKJ&QfFI3%yQyg1lF=H z0l_DLShJh+0lEdG+T+V;gRNMTkvbq*$Y{Ip8Tmf}>L)-io;7Xzyt}oqP}Of{pnU2S z2)^07nbGit2;o!bZN%Od0u?&1ce$6)1$E3nVvD$k%Ofpids+&V_D|XFf=nHd6)58e zvAQu6lRN22B&5Fk2|uZ^EMyNkT$`}e$H)6%JBmY{u*$JCzKdUi`M@mCfbdSngp`A1 z9kOV?B+vD?g`^(`4IGGh@F{Pfs&+c5LIj7w=7ZwKpFf=o=TlQtm)F(Ppi(q<%Z=pU zVNR6mp1-?Js4pc##%Y9Gi{fgoY&DCVuc_h*n1cjVB55H}8WL3sk3|}|)0oD+aqbL*1cY^~ zt{k3Ih7AdTgbIyLxyO)HMq=aN)xC6&k0q&$#>S^p=?(0EoWQ>>dvKySk26z(Uc~Ejmf# z#hrBpw9#h7;d}B}DvR-IFgM};pW2l|6UfCWbZ~JwDr{Z1O|myG(!I?st8^=IrLdbC 
zQy<<(&I%V4f4!d(L9z30|1%m%N&6CnLb@8RkA_jw0tWp!<0YrN-1}e#8>%6rIc$A= z@iEj`_r8g0uDPdC&YdPBefHXX8Im9#Lwdsk+=*XIUhU?0f)$-tF~_{r8bfBoq-}mD z{EMefVKNfquS3#i(=cf~O4n&YVu^qGRw71pmL33yD6yc4XQ|HQzNm1jiMStUeh2Q> zn?>&!+%e5D@R9=1@_bgn1g5xMi&}r;XAYObvJtOx_7XDy=j5o=)tLuV!abTE%py~3 zzik}6s@L74AfS{Lu-kbyIhunje`T~LrteO$e1p-Yd##18^Pdjl&yLxK8q1_!YMIN? z%4ZlL+3WhezkRJvke>DH{MnI?feH1$6KLR)4DLS^FNRZ42LTXrlgedFW2w}Z=}%-i zS&|SI_EW?ZGC(72&i#{$4V7#M)QcDWz!4B zy(P78%gJkKwtnQW;3H4vS=pq&5+6N;a~6Nkx6vj4k%8{rNnbM$eN2GuF~e8J`JtM5 z>r&j}^`BtRfumPMopseeSu}Is837SiG zECp)RdQ7E1^5&_}*|pE^3-Gj_QQ%}f(?rafdk4tmZ9!wwbzf5I`-GM3hHX|VFLFdp zv$p#~=xW)QkAYAQ@7uDDP6<%-Ep0Jr8DZ4-n*!Xef-&q-&SHkZ6Jku&ai%I*pcPJC zI~z*aAeyhDXKJIzSUO`sGT8+~#givro~e(WJVkxBZ``u?jvi0lwQ}Gl`xUgpVU#}- zwA*62yo4>wX+W+tHBW2dS^f~X)|zmiO4Aoxhz5<)#UDE37k;|0VxO)LC_8)Rg8vZ&EtANS zy-COHH}3Ux!ZymOjU`ToT(I7;nf%ie!q(Xlcwnp^uTc~F#PG_KI=1rXSACw(_b!(S zn^Y)d+u4QL+W^bGmXOo8D-_agf|ozwo%)1x&pl44#&*3N_08V+Fjf}R9ODaGM%U|9 zekLXEUq!k^Ij3RjpY!>yZ}+64X_6At?=B_j2dmsveAXCH!5>P~MS9pw_`xn65-|$}O(}td<|54LGv|>|7faXiEyU zxgW`E@nwDEgACVw-!7B(_3Lr68ih2%`?Fs_+)?~#@RMHyYk_8 zrl0+F_QbL|rzZ*t3fz(Vm`s3&CceY|Jq?0wt{U3-4nEFJ&pWYX?yoy?10vh%xJ&xt z3u%V+b%t`t&DR#IFWLIU3z+yOtW@fvJ!nB^XQuKDujAD@xTV7hu;l(Zfd{%jvnUcHLVX# zZrt$Sf79Dd2o#D}(@T9hjcRD%!aC?g%i{iA?NtU@7zaxa6|uRq3_1$`IwYw47;scr zA>6&8F!@|&o)p+kZu0%s_o0c6ZNO2ZgHEz4>VNs?19woe^iRjsWG9=&K%pZXfl8)Z$gaF%+M6qDdP&T2GTqS2r6go&(R}v&_wKu(-rG@Soh^&>ONya# zSzW;cN^TnWzf@kF`CZMDq6z{}E-s~o=kE#Mcj!iba-DN;=yEq@9s^iUI{2OxHgRo+ z&E;`KsS*pjXB#)P&R4(c@;B#?>+#M#Cq8sS=3+1}>s}e6?&JLMLOknVw!BAg!RzDI zBRBRR`T3|!6`I_Xk`*V~%vNyp1;p-adJRC7759VZEuNEYvR){k9rjQl>M7T@ z_E)XJE+cN@?a*QXQ-ufmY#$Qr@Y_*wsP#`1?pe7e@2#Al9mH1v`x3>fm?P+%) zAaGwEdM5aB{%P*kV~~$U@sOR%&%Tz)r9wAiId742wtM}cQq%v#-i@Pm zmY4YSk@Z@=n%=wNQojs^RoyRl3hO1{5!ZA&AeFg~%cg`Y2lMRW{0c~dd%woZ1I2s^ z^Dhhk*4+B8Lenp~`a>kZvjp#9p-}sVWjK`y_A+y0+39mG0n33AoG;Ttknb~ z`IK#B!PDa|{>3EKjz6_0U0ZEy67BqvI^y-p!NyQF2LZ9%~(wk@9Z-yEzMh-x$Z{zOU_1_(GOTg zfx;|X>uDWO(dZzxTKdC0hROP+d`qP{Z1U!hr~HlZ8vSyRPxnQ*O1t$MxIkUB*r#p( 
z_d1x-r#dyiQzqf)Lp$H%9G|RnI-O(LswZUs=O?L{yA@OKYsVhitd$kZP;CS1N1a{7 zrXr`NHpA_(Wj-_q@1NW^%lFsrR1mi1avlpiM+f7eYX zZd4`wt$TQF6i+V=|5nuq&9I5ReU& zZK$00rOkXyRlH?t{ncRO^x{7a6YtgggJ5@Ke`wl9II*`(plC%Rj?nTzYsQ)ep{Xr)1tLAIuzr=&_I`->DjggojOr$*v6$rE=QjqVG5oD#f_fN z%8Z1F8%{eVf)U`9V;+Vn4u@6m_h;J>N)3Wo`%A|f>@R*%&XykIX*Bxe`qbz@_fP+$ cYw4{>o&@CY($iaCj{X9`j4TYxuENp(1tdKk@c;k- literal 6377 zcmbVRc|26__rEjN2`Q2_D$5v4rm{;uC4^9R31b_=AbW_3sO&qRnCz8(A4`LogzVXO zCi~tXnjy_^`h36N-=DwN>-)Q}bI-l^oag=A=bZDL^W4{SU*Ff)Vmrlm3IG5$?Ynmj z0RTk&b28J>Bs!<`-_tG|_wN~N(5?WW3;;+#Ss75K@dN-#BP@WV0#qbTL6a&2$UjPD z8c$G0B9)Oej#W;e(KPZOBod%W|1?4>Cm{bESY!f?rjghLB$mdh3CcjiKi~u;knm5V zgaqY;1Y`p3z|v?sVH%q-P2*H762Kyrv9vA{l>eZy(=;WPCZ}St)M+f0+6z#7slarC z@-!CtZ_qTArl+xKDwPTVv@WRF1nM-6Xb75&2Ku9^Re+!XfV)Zi&Mo7BtZhtm_G0bX zPRyp2H}~oCLdbn*SSG_!&UR$l-o(e0v@4I-I;hlEBh`}>H(1j4pmT^&1FoaG?E8jo ztfqtxhYOdk4e`tKEmAo3HlnUrNm>Qc+^ zcOmmN%SX%MKVc2lyT+kMat6dq!)B{zrLj}lN%yazc?ZOZw=8p=LSFE{#3(GxZj_IB zj1Rp#fw76wHB9K>SJ|T$)-mD)=%JN`p!v6!;5wLTwzK7LC(k((>;l}~(&FSN>hMHi z)`-*=Vn?+;Z2!otruk%4{pq>Vr)+s@uSJCXXAlR2D_v$CEYEX&fF8G0hV4nZ zRf2a$s>I>It(4NTYA&?GjOLV}MZXY_FsiWid)95b^DwWh5*5nfUgI6pP#0hYc7nj2 z;n86rC99nq_?=(8R)g-D({-_YTR+b%KZGqtq;5a0S-B%2sZkjeb$hfGYQj{*@zkZN zSWOP(ZNC_Sr`g*&&7DnhYYwI!*RwE?ag#T9%;(-lF&gP@Ja!D4tht6QFAJzTp7fwf z8?N*fT;hlB0kguW{iN`AM2SL3oXTk&WQ9X+r z2AQbgjpG5O0QXs-mvfc**yP7HlzqrFbOS7J&CFUzmvP|ElVZQ?P8o;=_qHw!7~~<{ zCeyf0f7eRC|FVa3^rgfIO{g+l$9)UsSEDKzAU-6U1b><{M#pl4)ghqs13J|)&iK9E zBI9TLlhqp%P1t1zvd2{MQH%d6MjgAcWzt9>yjX3$Ko&7fv-dXIg5k^5uuSx5XQ^c= zqw?u$3tVtxQKbAi#G!a0s&TUE@xjUTf`1##;r;siflfwcK{avH){+wEKi=X)tNU(^ zbn?Pj@b2me#Z)@Pm~=N5Z~`g1{`a%;e-V=Op0~nsARs0I-6&CT%tHoHhm^G%Cjuu? 
z9IENYSZKG+N)VzCu;7Xf)rw*Cne3^IH&PQz0@Uw|6FYx?x2(9?nS?x(ipY`(^w8WY zH3W^0Foql21}NC^cySE%O2l|*NvZeWeq$1`eDZ2NhYusw`QZM$@dDVAy5|(fOjC*i zWBBUAyikSGakT5{+v4Dud$W(A_=b@lA8?G$$qjbFE<=`1Ucna-($+PHr=F0OEmV&X zCxZD@MV`=njnrEm^rcr*2&rlZ1(4=tOLtC$Ks8bxq)>X9~L%WOMK>{@!T69y9EnOf9)YJM!Biw<8hs?dZ0 zYOR4J766LBwCAD_PKWoPXim5e0G?ohi>)CKQUH<@`+Jknovky7b;jV?J+>&Dl#1_^ z=uNr6E{n&1Ti0Iw`rz(2h&Z7j?L64+Bm!XX&hEShBGQwUtscyMpobQ?I-eo>_;AG_ z@MhAEchOEpByW-lOYT|X1R7aAICx@4La<`wu*Ma3!U0x1^8p99Y zB5n*?jP>GWd=ahAjk}>OPiK$Yw_F52g3%Mm-_C-1xYr@rbc86ll2{{eM{Dr>a6m;uVoq@|qO1SN^&j6x)!mx2#ECa$$E5?@KWaLw=*^vxIGSaB z{R+YQWA2-NQZ&f7 zxk-7!m?^{%cuDz)029W2bK1rSzZxj3;o(cNU!NGgwK=M5 zJt=^Yu6b=~?EAL1CH1qyV~!eqm~cNU@~<cK|{|7;C=|)h`6|0*eSB*Rhrk9 z{3mjN*7*-djai%|{!3fv+}%Qb(?`!2bxnMX35&YuD%z}0XCclT3$by|4`gQcw7~Tm zds*T5jlU&jeInv*1>FVwgxXK@h|d#=*G)rN42*VaBy#uIgo+*7f7UvNf;SWGpU^S( zD10*iaVur#=6os8#-+rc5DmO!aX)#~L6_wb49}V57i$C|4ekzvR7OA>MPs?(Zn!wK z%&~M(zuXI?RNPh7M`aI1JWwrXgz@vFPc&vP&Rtu3p1& zW1c`5=bo8*p~7`Dr{U9Nq6q0oZ!*`~NBuSF07kIYZteHn*Jgeb^dU*MW(E+-?<@YC zYrL^KR2LQUeW&(HJY#xJ&`<^d{XE{tmPTa|iDmZG3BI6J?f!itwB^BXa}czByowF! 
zW}sm^Ytt_J_y&5I;~+1whsiVMAdl1R;plam!!kBC-k1F#HvxjyGE;JoS<#$Jn|zrL zRx(pckLl4yOPj0{{6bQ)CGIeQp1#8_-=#}$&?Li>|lbvTcR*BWh+53V#FU@e8DSOvb2mLG&>KF zi7!$@4tnMMeQcFK5yq>M>uO8~T6PIt%n+M#j$TPSdj#DWM@&S80T+s|Y%h*otgrMd z$JRgiTvVm^5Co)Yr0V^^LEYd3xv8(~Xl$y#35sDwJfal~|A?51nws1nHbugE_=%;e@#W5veY@qPio&3t=3(xB z9@8f(1WwpPbbS;Ps%WP8k2 zY4ei)WJweQ&*NavBQH>|Q*)4v#;Iod)%&X6O2EoDt(L{FCk4+z{Sg(@F>ygH(l6UD z_mPQiWM#TJ%Z)YJe3t!h{H#2vVEB1TZ?t?YvcMwjx*^HlFmRW^J38PPNq0dUs>Uv^ zfdBhj-)T?3U~jst?5t+(#&Y@0rbj0P9@EUeijXU_PChYZ@+xnB?4fIxHjTnKZBX(C zEtKB1FDWtydH2=}J6D@Jysg^F?A7l47A|=X_k?pmU+n}|S3k&?e1$v>g7Xn_u@!O8C zJ$UCB_ot3)8I(|IWG|)uXsnJwW`uc|wxc4#^qKc3tppSt21bk*I5FNks4;YQ2=^YU z0r#W!_R)5!i2ESoi~92&5^u=zFV4b?a*0)IvUK?U2jU3NZ~4Y*^<4HMB@`D`KQa90 zdSM6A46s)~IJY7o363%T~A9aU}M$L@wKyDC!Csy2qww|TFjaso&j8{2%F4{G# zx>qQtg#omZ*ok9|JQwiL?F`NgeIZmPy8Xv~xL7Z{QaMHCrqT&{>sE8^nB84Ic!A6A zDSRnV{7h)_<9i`y{L4}9jqx_N_@m>Vm+W$g2XB?kC~VmdxgqwUtikbql%Y4z65$B* zcx*R7IMCbj6#uP(T|;4v0lZ=BdG$%|`N<6j(K&RjF|!RL^U z>ItsT6O3#{YZ@gZ!BQ^HrsHqN*zlXC%31aFQX4Cyn83$n>74eaq6S`vN)~vs^#hIl z^d4y@sTRdrkawWna{}}y7g7I(ja8UZgw5>SdU^($A|yDUb*{VMWBU?&0FIs|=VfLw zU-=Z`u|+-MC^_Ucv$cU_8J1b6Y z^ZPS}O9mA{22*4d$yfKuzc!#pAXh2BWu z+d4qE#7}+0dg-8C=R}F;@;u~9xBvPVouYV$=SwUtiCO3G6Ch;kL2}1`Yuf(uN_SQdn;?Y~I z_fo66mI}O5fS3o9yrNe0IP85AvXAo+g*8!q3_sh_Si31xOP`C}GVTM2=E1 zP@LcFc!=Y_K5 z=goIMEt+;#0#CRNeR2&2{4cc?L!CfRxNlFZ1zc@^!rdJL@1$$D;whqsTC1g3%|tk+ z<|Aezt}cWp@7$Es@ZuiRU`+gY24wBUeHzdtaF>1jN-^JXkQN3DXwxhHhWrKDh?Sz5 zf+hb6hPhfIVA1{cz+4z{BiR0F@&?rWqAstpkSjRmm4}E&l3r%W68vP%>dvgV7j7}J7pOK{S?;`qvHGAOxHGRVU)OZMBT4XZx+ zlIo-LF8cJRHc)-?&>lI0F~dOzGSg%|!@4>w(8)U;QM>2I&Bs?)tu!sPxh4ZYBaI@>ghGWv<`3vA9#JT~>|BvWR6;#l~i%4&1M$#pH)}oT5;z zN~Is4gp3`(r<|f_tV0>@Y}?pgA!G0Hosy4}p!{?X{@x(HnOhZK;!=o6{?fPw7^e8c zZ$8p-YR%+w(?Yz1ZK@1nu*0j`V-|+T$Tu)FOh%h(J!#kqJs5HcH;A1(lo}u?!6_WTZPYXvCE3=fuQUP*Ua~H1;e8e(d@`Jz9 z$=&lKD)CYu>b0BK?uG@{(<>(B%x%}AbLS4|!vaf^!I6Q(M$8+{`VgKCRfEeP6jWdc z4nU7-LaoqTzUHk|7yrk8MMY6h3eXn`$aq$Zcx)*z5>xQYbh7+l9KI_FSG5@vZUjze 
z|LVp}0z-B8I?At$6+-{7y|r)XO_Q!oe?NZ9tUxE<{ZAZBHvAYVSCLu+B+G{i6rW_J zCF)90vodB>b9+lAEAAoo#3g@)BQPy%2gc*sEb;WD-@T2HA=6sbq!b?-(vy)}m&p8$ zSJk4WZY%g{P{CdM~7RtgyFQiuql)Ld&enQ!+VMETS~KLaKx}+yZ+q;Kw@Vu z44Yp6;@ikHq%={y78zStKh|(H2(t72oZE5{+;sMh_L?w DBQ`Q5 diff --git a/assets/curves/naivebayes_train_test_precision_recall_curve.png b/assets/curves/naivebayes_train_test_precision_recall_curve.png index 03b8b7ae15d37d85009fe8baf75a9c4cb2ebd258..f1c1536c1689448928c292d13fb6a9ac55f54cdc 100644 GIT binary patch delta 4445 zcmYjT2{@E(_ir(lvG2S>5i>EC&=6jF2@zw-zK)P3!<0RvshK1ZuZ9`JkPtDJtc5`n zl0C`34Oz1dk)3b!{r}hXJ=b-v>wcc+KKJ=OzjL17Ic4B7&^yl4=6ngp*Z;8&C#__# zM~+sD8WyM#vt>I5One_WT~v1oTlA|MA91+*zOz>#_V}2a(~eKzv~lh=KGb0*sLV_3 z{3VGgJs!!_S*A4qg|p|!^IxMwMv}*>vjWve5nh8~M%t1I?w#+s)%@spZUKKbE znsRv9s|ck2OoW#0UG6g(%SbOaGee(oViu=vDSU`20g!WqUp*SEk+#o!uoM;OqVkSr zH0f!6bhmIJ<8S?u5$DWjuQrXl5|CnPfv=E^@0#cL zQqbh{V)Ev%1L6IMGMmlH2Eg{$Vj9A|TM(iPnb;nX(PuxJmy1;Bp-v6z>Y&vDCY0S> zSJLf!Y3K5P3+O6-ld#RcX4+w09*?58pugk2#NLiA_-8W<+RPwaWhXQRv0ZORSqlEK z=8~&d%Ykm$2F^M42#x9MUb3&}UW;6M`5Mq6_3NN_e81&KV`G)~jnw^rGUuhK8)Iz3 zH`Ae$rbpW8bp;O(9W#B2BfMz;cDNZ`O-Y@sD!~T56AzDww{k)-c$T#3RWDsj1Vm0c z5jYg>Nv8J#)Q63Bl~g02HkW$#=NFclI1Z;NA{>1!Nm(X^P3is-m2uB$Qe{Q(Uy|h! 
zy$TOJ$V=JJp?BKr;|>*;oRHk(gKh1ual+B?6OdB?@?Dw3$JJ=>8*VzYdD6GVzI5i6 z`Of+#qbmxYPuhi9@;YA()pAJ5AY>G!!7t?Bun3@bxj4%r;yQ9wB6`1npJ3opvj1+I zEqb7nj;iU62KU&gUq|jdwB2FYf${`1@ND<`&J7 zx^KQJH<>;o^Y=$Sj_!WMyI9H{F%o#$;KZ|;Z#lU9AScWGLiqffsjF*(c$%#C))5}# zo<*~d;E%?5%$g;$fQL$y=WBTKXaXOsYndc%UmLuTOrUiv?Amh<+V$=e9?Mut#?v}P zifOP7SsUN0>;1T`e2?OW|Xr;%HvL63gocnNlp^ z9mpfhpx(+IVrjm@#`I&5eapoa*D=*o&W}Mm$e{H3(MxAPUJ1wacW#5WM~$OaApj$I zICXqRTwZ{KtAti@n#O(BnVR$~K@iByVxf#Cs035_JEOnK(b(p!o(j^LDwym(@^`-B zkUs@nm0h!c4M$G1H$cz4z~p@Es=Nt=JuP8+uhb%NE+v7f@vhtROB!;6K(3y}9NI-6 zr6E`gCoo`mIGgsik`J~1Jpw2}x`T;QT`*@4Xh5<6jAm7+2oLEZ2cSf}Dn{vdISy%zLg%S4gh==6f;=Mf}AxX*d}MBfmIeRtki=T)-GU&3smERdct$Q?FV z{Aa`t(dt|dXJ!Rwf)N?!NWfUtwe2Gr-)JS+xkI~=E2nXPB8DA+4N`0TQJz$m5ZN1V z_WYUPg8)9)XvI>jsEY_8e|Hu~2Wkp1_J2_$KC%1Ar|tSwh;f%7>Woy6h>ga%pB6`x z9#FDDynsZ4BSi|}VHsza3y1iHsQjthojjM&tDw7=12YJfRP~keKI5>-9SN=|OV@AA zla(C_-Zmvss^#Dh(x^T~tyj=GxA1efkV-yO>Up~Tuk6tonadWKj`Q>%XS!WChsty& zcVlY7;mah^pSMx;p)$WXXsV352tSZHU!EGz4SSNnq*|*isL98GXbop$&L=}t*Qv1| zUDY%g6#>Bh>MmjMn<_xY-cnY^t?^$#i?dF2;bUy@{oczK^i$2kka^Ey0&34ks#%zF zJOk5l$5+VpWntwU1twRtd*1g6MHu_nE_fFA7q!>A0VPiT&ZaULt>tF~yvPU`gP|^Z zUW_X z-fMTD#M1$Sv6&Jc94=yyp}LE4a7Ff>sKpDwn(i)9+Cdt_h|A#T1^ixgWg8_iDMh$- z!UYX>+w#29VtS~wLpDbY6L~1GyU_HhtxYhrW_*vl z&@xk3x+*gDil}9j0x}|tif`}@@n8G0@QKOymI0Z#HUU#(K)|BcsWfFB5fUD^nn{UjbKL~{s3StR zrzb|+G}9Th)Qv{#DT5jku>&%x{FAMxK@HfZG9TmzLlzCh2rd1o-3@|&S}@b#!l@X9 z@(D{p)D3)K+?!PffWO%B>hTBa`#tkAhkdtgXI;n-)d1Z3T2%5}2xD;;fbDa3rhn=T znhT+W7p7I(Mp<%b?#~V^5|Hb8jD-&9=36f|*s}$j&4=qVlL90z@k(kPvGmx0+VqhD z0{kbRO?)SX)ilq7ZCcPn?i|38Cuu+_Yb+b!dYR;KbS7q(=2uWHx1e^IUh}Kq_juk< zD2l%v<5^us(1^cYuqpyCqam7Yo}Thl-5-m4T>s~2=1187JNgt7DWU^}p-}B%(`yC6PJYz3|mfoe^LKZd?ZE7;DklEMM zY-)D1j-L4dVUJlWr>)0|x}Ug79bN^WFHY4)~KpwqleMiZ1#ZL5h* zCR?^+Jda(!o=t)&U^Gh-x8g2FCHGm1NTDk#z3gOk(e7U#Q1EB!fQ%llXr85y>FOJ0|Dz^p~z9UH#MJSfI~7Lgp2ccfQHN!`+_F5HqY*g|?_wAF@c zg)o`RizPRp&1zC?DjkC@+y+uL(|V)pr&(P)tfu%ORescs+`>$v$=eFydQb&5d#pYC zcUk8h>+2_CWGH~~iXi)OCc|Zd>mh|x*_;DnkWu(Pu4qaILe;lmwLR9QY_XM8Q^g`4 
z8q`xV(=3~QxNeU9&W?`QH9G#Ir5&5;VN))0iI{25EQ95LE>;N%Tq~!DI`eQ>Rr_J_+#1)F77|Rpy%2Lr6USELyN9#ToHF z9!j-F?OOG+o1L-H$SSokUT$@Cd-?7FM~HQR=uhW4oyPj{v+?Dk3)tP0z^lq+NYr&5$`RJRn;#fF5Sh`BrKc0 zJZP`r{96?>V_$h7+ljUX2sc@4CY7Z0BR+2gRHhrEu^Lx1uP8L@GVzUWeH>7 zs##~1085%`weSQ_mjun-!WRm9J-&^qsX{i1+Ic zdnzwb=StD~UTIB*;U*^F7ZHVPiz}%iV@@DB$>$>4eOF3^oE2}mHURUJZXd*~LBbT9 zvUE2yzp5m;a2#%SH%Dw|27%0wqF?;}GKaUWH+1J&%3JKUgumi*Zpy5q6HCxRRkZuD z-+$gNI8xhnR4_!UbU`Uhp&`QFVX_1*jSBI7(ubhHW#=>s>kwDIWd~sY{jAIKVe0f) zbWd;w84yXh#GTiMwF$lwMpLlm-_>n~r8RYh9m$PE3~WeQSJVbBn`J#OIcJ__YV&I8 z9GFBPwNN&8V&G}gf1smJu(x3`^1k04>%94hEnm&w40oPIAc=<6fFrBrT%y9+C4<^l ze$5}SG-ThR!}53xzxYHtP0@t@kk|hGua~!7uVy`><>+r!vzSe$HUdigjZmr8M-tD% z%EHXcPn#f{pc>#I;~+|cDO~^bWv{=+E4HF+(-7Xr4hyJS^r!=iPBLs>%9(x;S3fw! z`IVW_kxVH4`w zlXoqwXbE1?{IP9`cAxkT=L>VT(i9u<%r7-F1MSX)(BF)fCF*DrW$nFe2kvoKl~lD% z)DOvA*rS}~lJ&(=S@;=!ll}h<-Hqi#O|Lin7JdA;q~*3YggVu_Z>4-@mg1tyJD`2j zRCkGhF*0|+Y~_T{7A>ye-YaDop}3-+^+woZR(?d?O`Pu6wV;2wB(Zi^vgPiy<#NTt zOW`xgiQ6HPElxTo&am}xxBZKgDc1Xm(+-US+V|UA_kXTd?AmHG-kxn+$iiLV6~tw+ gvY(hJW8ypFZWt{StDaf1IK{Y(4Pe(x^x&BP0Q7L4JOBUy delta 4574 zcmZuzXH=6()OOb-bP$7pw1`6JAV>*)l@g^&3q?vOB25Sa0#dRj2-1~Ek-CJUK|b_qlhbTC7^UitRTODElql>y{ze zKXaI{6AgmVC5kVJ@ZQlL&qp#TSrzI{cJ~_}j*iO&D0a;5p4@2fFa+sxp3$RiAO0G* zY4|OL;wm7r&QP2E2O%-yjGGO?4=PP~24y*ISp5s(*wpfkUsaKTca91VCJm(aj~vz< z7~Q3I*9OU<)w?_uGg*@(+|yXeorAk=u7Xk3;CQGxPq;$)>@Dqf_{5LS&2xh@VLG(P z{FK4=9X@RrZu61VOVnM0XRqZ+m~7-R zc6ePjiqHb?%iIM@B7D zZSvYi`ChX*Qi9{g+6mH|9PJhe90)A&ZOobEiGFiyJ9pw^sbrH`XF$7AJmlFpt9V~` zMB4mJ>RT+EaJfVV~Qokg8Qy?vUX#3mV*PNwfLlZbn%3 zzPPXAM}JIq8-A5pt#3!ps)(!mOhjByw2V<9uXymV0e4-< z?-#V)dLP(q6?;_l!pAd8c4T)Nz5Mbcmm034cI?8%!FPI5(?k{i95TIVsrZ{f{T?MjE72pC5gGjp3kRnFd6x4xQ(g)kUf7RgAVrw0z-{f=q19ED4TC+)qEea zmJZXCM9p9cl+6qh<@*2HoMjd&z5a@#6~(kVtYbkF2QjVAC*Y96x)^Qo5H4A91j@+> zhV^O(uM_QR=qqM09xwIg+L>R@e@Y=9OqbjMC})@#%%g*7AZ&ne9V7(z_J#9V z4S)o&(Lezb?T|llG2QA`;=I{=IGrnhiz6i4jhYK7ZoCFnSeatVYgUoHoPRn|%nt{b z3n(Hq9($vXAm$wbA2W<)3n))BC>Mu(m4}{SJs8Fv%1vGe%v4RQ{Csqs({RWF9oj`s 
zuf<=nw`m@M*eF=EPd)^k9BpjcR%Om?l6gf_NVuIVTXAG0$q#`5i}eqmsh{Uo%cl%6 zI(-yY1rGI;XCCMy6(2FLwoo~*G-1@OISb!yo@dbnQN!%cvZU2YG#U<+CUn~3s4CLx zLfo|wG5s~ImICgsC4Bl1{P%B4H~G!!w7_J!SWOH|aTj3%v6f)uz#%3djY(Pv&JMok zLoriyC0sUV{FxawhW+^KZdqy6Mw50q7zNUpxh=6}ft%8`QSJOu!^uHQLj^?Lw z6~vmTI}W8h#l13=a(s@f_UMo@zsg;%{1OMy!F`5hW3mZB^;-jm;Z?@X0)d~WR2tHC z@>0|H&{+>vX{4US`}*8qj;QfD|`cv1Y( z3YXRl*)Pb9_GS6aNXH(f#NxHgA$n8=ZA1@SK8U_aPRRMbr4N?8H?MxjTRc{x&mN#V z8N`17HxIa22bx`iFwC8W_JY~C$An}SVeG{}Xpz-wq*{`uYYYx~sTHj%=3P5GfrJXT zQVvah=lR`L%M{ok9z?zEDVfc1mEBKE-lusw za4EQkH|qs^`($^@)uyXVb2!@s4v47B@`u#TaDmbuS+ibMR1HLFvh-A~rNh-3y+hUrpJ3=UZ5lwLF*hnwUQ((QRUfS7s# zxBFpy#T0*9tIBJeGd9m5M_WzX`!{JPAK+JE?P*bm92!5*oi z_OWf0ddf`9SOR+}lMmA+w?Jnc{eWHz%|Yk-_b78%bof;Sou00M&r^g(In}saP~@=t zuG}gKPx^-pO<69S>UsoU$QarrJP(q+$IiZ{hi_L>d%wPnuMBe(&Z+fOIA>pQR$vD> zt#wD>USz9~;glH-vBfP7<0Zp(f)_>8yIfK8Dgr zs{8cV{{)?VhTbb+0lppL0cSQy%sN1!D}z7*ai2El)7p(4|M8$*LY;{d`*})VpK)`$ zrsN!b&bQmMmPx*^&UTdI_EhUWif_7x!{n6KQ}{FN|G}Sq4><3MBhp#UsGOCF6x@`e zd*xhpD{14}0b)OxDH0z4CUAOf(7iD^$-7$(`0_erNXsv{362^vo1`LYB~=2LOg78q%`{`b3FWfEgmznIqi^7iFa%DK}o38ZonruDh6H#AA5~ zIGtN989=RMW#9ZLK2g?)K2WlW-aTA+a%tTqT4iWg&aY+j^0N?mGQ)o$oAIGGWkUv< zx@f10@M^!x$HHG_jiDvj^5ZF`24Tqc1#H&Q4fd!8FId@8d~+>(@;Z>SC0XoHp#Jtt z-C?*b4~e_?D_8Vgx9*3s+EY%krUzXLeQwPfRGJN5eOWH#WWqYx_parFL1Y_ty?Fl? 
zs;4Wj^l`s_c<`I!iF?7RXw5lJ$8A%$4uv)-mh~WgWro;kh7q@tpCOlsjpBHuu%g%1 zE}Y1u)$~~9Z6xj~SQHnE2>aHSf?(jsR(55kP5OQ)WXzcVmaI^CcomW@5~c6v)%C>d z=$4nJJ%nmi&o=zsm677fw!GTGn6xjp5*T_Pk*v=hDz6a0UL#-&tDq6 zu1zlxs|9MNdrEhz``j&c&-d-2vD!dQt5Ut1SJn9@+thvxnSsiNiomSr_qo{NQRTBLrqXUKX~R*9ciQI81q;qyr%y2{^_PkdLv-pGM%VNcTAa_Mm% z>6@IHY08}o&57v zE*4YQZ@-qPaYz$?L%7+IK%;zO2FIK)%lO8DO(Na?N87cMr{c~gTrimJm?QPvre}gT zMqD$lr1_emAF4TLh!|S(WqP4@qFwDU=h{Lnef{zk8_%x|)DKKRM}HF3*<0+&I3kX{ zs1dECj*7`0A@)0#_gQ>w-_EP?d3&uzaD>H7CF&uoAgZlQWYNckB_(rDvg1qZ?-H=hqD2MP}x2z+Y zT2kxTPKLb4tN26(!AGdhQj+yXJVyKqbnc{SH7%Jnej;_=wR6R@;CN6YO2fC(7O%5R z)}{XSLSd_qF=dxH&*R)6@dn#sTm)T4*=o9za+Udj17Q(xSFs;V8fk2`WNXb25Xc61Dx(WEq{o9*^2m4qwKKWTLU`W=8d+ahq+RdB-Db49MSt z>BZ+;U3^p=372)F7A!?m4uYx%?((e=-^qfhG0&>OnziN{{X!ZG5(EB}}>l{EE1fp0V`VvcJ` z)WU$h|F#w1zMA7SkxN6j|1OHmk2KI&M^+W)>!bgUw21))-JPsJ9r1)v{NxJW!-Bh_mv+Bok9p9 zbG%x|$Y!d8s_~)Uw>btBksy|eG>J;A*4yDZ6ChF1t9&X+*Ijf63b#Miu{vCy}$uu zl$|@c@{E7DaB*PB5`~yJGFbOU8{)IKg)R7^tIcIhRDK?DA>g!Zn zt(dN_|Jt1Da_gxf$dQ3iM}IR)6RKDkv5VK4wALTZ%nCl{(IS2ewZTnQ?iQgzrJ>F4 zmr0@N8IfahVAI@E2-SMM$9A|`%+G1mab0ysSbx!(N!D>~;?af5<7k|i$vz#!ss%1o z6mn{iy|Uf9ZM}|KJ1C;WTC`3#h7i{5N{^8H6;8~#h!JYUHjNxEF|%!W7I1q&_!|U2 dz3;hC7?zy^@I72fOK0fcEj<(6iW_L${{X8d+o}Kn diff --git a/assets/curves/naivebayes_train_test_roc_curve.png b/assets/curves/naivebayes_train_test_roc_curve.png index d86182ca40b98d625ada0c452bb0f1069d8f5a03..32301352dc31dd69abad431f9f3b8378c12cab03 100644 GIT binary patch literal 6074 zcmbU_XIN89)43W5(t^@Eh=BAU3K$fWYC#Z?UJ^ku{E_smY&ojo)A%*5O`GGJpCUMEb6bKFn!Ras=grYY*h(@Dh=_oh|but5|!(=!L z1xL{#9-d5ZWYoWGC=eZcLWF`RqfQsWO#Bi zDw+P^>5ZIB#wU~MkcLNr@F+N*E+QFz!i^`>C-HPV4Ueah@iZDa8BWHd{+FCgqtDO* znMR|5Ky+prKABE&LU@8ZA*aunwD_olKs>K*>)w12lCzr2lKr!uJL-CyqPYAd&m}U0 z&7Din|M@57o2g=|>7u6qe3M4InI=D$?$w=s9`UwtJsc^may4BAW57#>`=_7&=h58= z5KMu?P>4y29n1;A#+(M)BauID!2HBA)&D!R9~0(dFCL8b6(e) z&!~Fsv9pEiI<#t>SGYyBH~BRv3~i)_gdZgA;GW&QhV6LB&Am#M*aebcW#%jSIwp0h zm+%}d^UCLUz9t?tq*Q~5#s?G5RV3E 
z64{FZ;u)Wg)7|X`9HRh+~oCcP;)lBtn<5uk-)gE2%R?YnW)vV3@C|5QU;-Phm zPgi0u+I~kyt2{%Lw(TIrkUx@Ltnt{>`1Oza%YVvaTUV@vZNHnejJN(d$d64yH`pxl zWM4no@n?(r9M?|0H_^iM zcTimQU`jc1E2^a3e9hR=t!6xO6>a2jBruQuYOq5^39K9^2gkA zP>`DrGpg9sSsrRzVxslQvZZS5Ip5`@E$X-DiNzyhMA0QFn-gQR1D}L!SN*gvh>xzF zn4(H-w9$>ux*h|kFF{ceCJR4J0N{5RN&ZV#iCIb3w59uz{^V+|?p zV&7b(k*G_ny^*)_Tc28YvsOX0!%P^h(=BgSpdpR3j<9C}hyA`UW{9^mE--*O3&V%R zGkJ1xA{jGkNAdM-FMf*ztjk&Z&#Dt+Hn^|`vMI7zI&1bgC5^$YE)Jnn$M?1e<=TFf z(1*Y-;4{^Na~Ni9n`K8-A6>=X>gbE zdL+AU^)dKX@BvFbU!#IxEOOQO&Ew2Bg{**|N=UN!e|$WVFI$2Y*mMkfDz0v`lBfjk zik+K%{%;gGWQa2;@~#@kpTg$)s@G3pp@yq|*`gLB;jvRq>Kx@=@0Q|IKo|@)#OqYZ zy1X3I@Q)#%Q0b@B6v)G>N+VD-(q*DkukgZCt%z%rfeZbiOvCftksel|TnYWbj$-Pv z6XEi?Aa+2}&SW~h0#MiT{~;jA!lHU8gblKjY_qK%F^%H5c6Dk2aA0|JbK_UMzogti zm2J}OB^q6tI_fw33iB;}2R=IcmUa!UPhg28j`EHXC2({?1A?jO-%yC_AfWLV z8L+%gLIe&WSqW;cx$nAT0mlSVo{@LA|6 z@A*#h)`x(}u+k~NT9%};^SD4?G9Ec0J_M_sHbI$-s6DQe)xbtGLyqNba3@_n3F1?^ zf)j5fj;>OCU#2D&h&OgOwk_1&uWFdYKc8_Z`ljK7fq{3G6A(klC3=9K?}J7mB{)9} zv`Hpq*)suBMHCGWI)z~7Hzjvpkm2v(;X@>o11P?L9@Kj;xluf8PBZ1DcOPkQ6d11a zVly5w5;zQnPXfPEhtMiz!&@xmywvN~H&lJ5H2!RGFLxVtdZcvnl!Ix>q)j|R8RnV_dakvkuB;J*ta1UXn0g#kPHzg zSYVCEVk4SXCb)EbnXH&F`|t1A>^0Y1$A5)J%}DzG#?eN)jr?Tc_Q2gM13U+cQ<_y& zK5Fe?9_tuyZuyMl(Kyqm6;-NVP|wHT&O!%=GY=S)mPSgUs7W~u(syXJP%R55cS||K z1E|{B2Sp#w=rSn9S0D%funx_9-sE0M7R@P`c}`xKHNMaEDRtokpPt60-4(V~myJ-e z8~GO)@;5J5`T`RH&kMKnxx5%_bkCR1%6@Tleu3BfsH)PH$>CJ5$z{YYxnc1wIGUun zu!9X=^9uZe)#1~;D;9F<3n`8=Bg53KWR&{eX-qfCClGRIOh!q`5QzIA09Py(CL?`bEAz2NT+SuyJie|Bnn1I-jUIfamvzRi2~vm-F5 zZCP>BKRiP35$nHHhB3UsS@JICDcnM(2-@AW^8Gy36=UR;Mg?a8*cYC)*?#kW(w=bp z4130FAJrQh6Ma&9M8R5Yn^!(%6AYb)#M!ce86cO#bfR;SkE6B^UCcsL>`4z8qANyB zH=G+DE;MCrOGFpMFtblq&uf2*|-St z5bqVR=n;Jz=Y;MaSr6iXl9@nk^wHUEXx0bvzlYQy_o?7oP5 z1Acu|-)6(!=p=rQqTmX7ges;?z`Vp=>QSVZ6OT~Mv@)2tX&P52OaD-r!mfLN+W(2u zi_)T-g+`3(RkWQe2CUOa%!y5OfrgrK=Dxo$3wJf+9{8?cj#39M;HZEfbK+E0@Yd@f z5`82({Yz!JfbO0x1|+(vH_s#z(DQxi_I$ZW0gM8~I=FLw46dcF3g{k=4!?q&_X9kAG7lgvA@3f@mhkd^I5nJFe zG$W9O5zmA_#wHDjChbF8doKa5s!5n 
zVx(4hapgJq>6c&++jUoPYG<%Rpz*#IIQpp~0@fNzCgz*(d>z6Kj#pBlkgxoj$ewuA# zv1U8V=hcS?-@ZNi$TX}Z?P0q_|yRGv&5MmRrQnpo%ow>p9W-+%}u~xS(#TCCZ zmUR|o&cia}cJ5g*Co5s6T|G7m0XfgbN>GfKRDXYnywCZNjZsloijP<)=Qjswt5Pv> z=KWHop-dmIbL*PpA~ou_a5tl^^hvIHRTs_y8(|85Oi|NiG1{<02V*WQ@P~e6UwG-N z;=VCBZPiIfvP3x`zVNUTrk*MAh0wF7GgkBsC^K2Y4imMjv9rNK+T5%JiL<3h*f|cv z91ey;w8+_Y>$G`@po1JArmMY~k+O}+_NriLTYtZ&Ml!D2Q6-NvM@{>ksev*|oB}9* zW88(@SM+A=DxUbQRYn5p-j6tCrWB8_U+F>O#Hd8HJ?pdf49Z>pvlbW_b;r7jB-DQ; zS99t;Hm>6m`@4sE*8X;zmd3#=-zf&i3!gu6MF`|1yZfH1Q)4z>K(u}epRs;zdwfKl zb0J!`m}(wW^Nk9P24@@)!ndthcyvVO%@3bh%2P!SoRB^|h*4;-0B`cl;-62X!YC$S z*^9L1U%W#26Xo*wd)3zIgD<=poGF<#ZFruEJEu}@mMxvx6pz)nlDBY$yB;`>!-Af# zT#0wT|KJ`8ZBokJ#P8Ibf~y1={HL{jNxF4}xLUJxuWiq<=RV^;(lO`vK6^Yjf{n<6 zDL9+>zStWz=qsTq3>H^1Ek8ilb{B$osWFxfFTu785FJ?YxxQsvv$8jgyxLH5DSzcY z(uAIe^sbl63R|!l6@>S510A&WwIFPE_{K|7#IT~M#a8LVFs@b^15yjce$JdtqzsBD zLf1HRK5tl?s*$?n!*)W;xck_DlXv~O!l_Qm7N{+&)I9Dq(>zFTsN6@pW8&=`kvgTN zEka#FK^R!Na74td>ig_R*a5oQ8g{5U+g2)w4yU<(OjT)Lw5(-p=n(VZrzh5T>E3ot znT5rur)u8Y)B!Q#Pw)$rk0tLUwWo9Eyh0?-d#AvWkDLA45qYl!4QzP0b8-+eN#-54 zCR#qCzm`3`h30B|JSxu*%TGegeh6Q8f6>BfZx=3GX_nS-wzA2u-62LmLCMA~a>iQb zm!F!UD>~`#(pUXzi9KZ2aIcea;*ob%8%Frpd-%d7_DufBgnH>3=-EZ}djwP8NtalS zBWjRZVk|z7^Y0nmqv3FvN^coLaxY8JV5`XSMYwM%X4&NO`pr1ot+7tkICnMQ@-aRkn^;-kG7MS1B zrgzdpr5H9nNQW#cWG*}VqH{M3%oV*)RJ=p_ar(;uVoCuP-rtAn-UyWM0RD0g|CD6C zG-u)QT3p0>RXzF(y%DF0zjM8;mbj$ZC~hUJ*IV(+t8?GmX%dqEv2fM8ANR&a%z)(S zYW^;dTgyqhcIo0rajWZ49^3AykDgU>zz?Zkqgb!XN5DF^?>=re_;wK?g@F)$4@~J5=wo$Zz&3 z?6PyuvgJIVqCRaiZLS|Ju-!a<+6Kb?`WjBdsmngBFOBc#rDP)CJx7Bu1Wx&TfKC{0 zsbfmxRagI$x6L6el^M6Z6w^zSx}hD8;8tb5JN3tnYQf5lDxW5R8&~K2^K}TL_YEB+ zJ+qy9q-pGs-*-zBbE&t2H+jl!izb9p1+y3DhocZR{BLOz7LJO$h~L?+&fSL^X1%4y zj`{pqi=7L~LnZTDhfzIl0bG>MpQ7=~3M~vHy{|^9`jneYd?kvkJc7j*TZTmgCFdKJ zw}Bwmb(ZZ9Rx#E-Y|DD3`CWnSX?y`ACDtqU(J8{3*)uZk8-uwICylb*r}l0z0u@Q6 zt5CZL<#6c4#o{O4u0A*C9M{LI8QiO#L_N96+v`VODsw&JFzb5pvSQ7rqavp`$*{{z zVF9^U_<)gjKDL-HxCKe4C;Yzp?g29#b+l0ApF9yJ;%RF{T`Zj9 
zb4nev>Y5yCe#{d#VybvXc>xV$68!&_6FJL}n~jzJMWx>75h~}aBzj$lZUxIFK@~54 z>hoBp+jJ}!+eYcsV+BvuW2pmy0m2^!;bh}uuQXcilNl>(cj1W3ebMaCMj3h+wqt)6 zwVbjrvzv38?^#GFG5*!SRQ@&V#n2ypdj1{r<1%C3GuVWm!ADbST8>lskv6T;^`1qdy^} zsB-I@y-BgIQWJ4bMT>0w(^+BJZ!ppO8r2e=Y98FwR{KCwb43+y12~IXYZW(yjMG(A zo)ga>Cu0QGsdEvrJT@`j*QpH{$^+*+m+z!)a$I9@=Cs!;RvYL|Z#-)8RReYbgIGh?$rbZ607D&uEAG`r6Atb)w%(vhx5-U6c m*j1QqdzyUp#s8uLd<+F|&bbZo9Q<=qgTJk3q+6+t#QqN(bkUmt literal 6209 zcmbVRcUTik*N>ObL4))rAWi9_KxTzkO!UnLX!s=IrjwoINued*4K#nSqA^1OhP|+|@A$ zfi4ivPdX}!j3T9oR6vMxjS7XP zs2~t32nxlhq)-YD@y84SqDap{2$dAX`GG;CPznx#NkL#J^ejaMl=8Cz zNl8JZP!0^G;8Jjy6dZ+~VGtk;LIp!bfrOsjUlX%o$^Z>+Ty!*m6p~mtiNB%Zw?d+ z7WO;Y>XMJFd2q73wGv!-^!eH=y;>rSeI`qBF)e?-8!_Q^u(p-CEUP48d}7C^BMhCo z4on0Z?Cxfc5#mX9@A7n(T2Gv_XTtJI(;{dl$XC5ESw=R+N84Qsza5kUg_Ou9$Dyhs z`D*VQYF>Km?2eTF%4tlOzI!UuvwLVS_KE!cpu-`7Rx2_HT=C=)`{r2cFCjy#QHh;_ z>lHB7Tt(LAG`EM-=xc|mzHdvrBk+^DGSD8gwPw}wj0V}Xx?MZn+TK;0#TSEu9DrP7-!wH_7pcWo@wp)W=RqdyKoq$E>PJ`IeNtbE3M2aY|& zBd>S#jIsmrer2_@I-1DS;^F1(cc{caPai*&nDWd-!{DrO47F7A^3ByTX6hbu zIvH;7xesA^<)knVPOZ1YITLM$o9+K1GhEUb75V9yuU?koxjmF0o;Gu0%M`yKm6W>1 z;p5;HZ^&z13r*`D>2}I?FZa>t*HGHfVt&_3uG`k?+WY)ti_@9ai{i0e2>RBqNZp>L z0sQOX?X?FkrjZ~9Y1n_;h7H|QF`tUb=Tm;g~J*A;7=kR7BKkruV zPb*bzA}v_f(EPAL+>Ev%l7al`qvKVf%)bgy`4PXY$g61S%nPINJdr9+_0`Isf4lzz zitjSjgj|l^0QVX_K~AO0JITT=F88eG73V&rfhAeq$vuRS<{eYCm``gpV;4^Dsyb1F zZx+GUAiePx1zn<~FeTwlxhK=kSCFEHvV1O3mOL%uUD&gZ`#%5$Qb4VA1QWnU=Tn%E zx8^X7f-|ISN8+((kd{iu^o0`H$uPDSchC3CfLbkwA{hO}e*7UgFlR=~m(B@{(?k&< zg3xdk#&@sad-N%*2b`{hem3cG@I{{jOn$3*G~v^#w9|j)x^U&-o_;&~sK^$!Ze#=yd@cI&bz7K$&W72IuD~wj~w%$y@|y@06(&f&X^Et1&Z! 
zwsV}R0lnsx;py{^eC7*u+oO66!F(dzw~4t4=VBHQ51X{OpbzY{RxFq^L=P-8AvnN zA}I=9p70~k0?3Vz)$&dJaNIBXNm2o9N_8G4>E*4-Gx?n0fdN-jwg-*WVL1r2gpx}G zH$P%Spgu`+5tb#%jXUfHOpF@s;eh3jWM3Z851EBB<|>fw8dc4abb0CDPqc`IM%$dI3?~{a zq?iAE0LXUmZTg*g3kF>k%W8Q%V>Z|oj??l5Y(s>CxR>`B|G2wN+~}Ru_`S`H>Ox@V z{aUgAB%ffS2#pm!XBUNnY!SbCHN@A$SDK~4vbRaapB=KVT=bgZKyktoU}@qb2~u?4 zHzLS0*auy1=z-rfT@YIr7Hm;pe8428o@u%XEW0>iMikBF@mTe+Lwi>NbrJyMq5oy1 zf#Po0CZUuIb)6tcIz(S}oi`mnm=x0BLS=j@y>#jzsnQ|2zRzG^9vmwbL`uZ4y{cue@0rxl~Pbs6A-(Ku#@+JEUg6@gRK2R9SF5a|!AzlCZ{5@*f^$8de zSET$xA4#NKB~-w1`%I(og_jHmuP)ga5uzv^BK!@EU*EG%C{)i1J&4}nLTMaK2l%7t ze?}4Lh2|&rwPLmtX&#Jwuo_X7GV+su9ZlDqDL69&h6HF_`g(W_qD2C@HN7%wM+C5m z_8;I3Z7$4UQxJa(^itkm_rjmPYv=D03rVOH3ZfSba=KeTu)2Yo(%eXRhi{~X@#R_w ztppzMs9h^{{bz*})kyg;KV>-{S*UuvK=%KP<-a`gwkM5VY*NioOWX2?Lhs1j>S|Fl zHS0bvA5v_3d&4AkvGN`URd&RTeR#cm+gbb+tk{Bk#$hzRO*c_!3fuH1+oi#qdXeZ0 zs1<}Fu#HEv8#RQPbg|Cx3RHS4$gvr@9z@eC0Sip4V+|OhNJC{KLrE2-$2MzOR~n;< z!pW6c3?*9=#lCtamo!v^hU4jBRp<4~mH?Hv(rr0v!O7Rxa4XL{Mfnyu__{WjSqnH- ze$iEGYCzqAp*4P8-*Kda=IWFp0$?HYYx{DLXH_fY5x@;rDbndyG8Xy#6)}e3OoyHF zRTatpHB3=j12-B5Rucf14-82&+epCWHAB*@PY^MyL`ef`0t}IvFKYPptC#>gtUSo& z90jnfHu2iDf(BFr*ijmIGgZh~#M;|dvnaSeH?5}HMcXBo8S5~?BgOl~5l%J1)7w#F z%Iei*y>vx@FAJ{tavZ)w8E@QLWC|C)+*=Zy-persNrqd5&n`Qi%uWw9x$q~#?ZeF? 
zv*q^Ik!YcPhcaOkV1}6WC^nm!Wx*`6tu9GRuHisgSHM&v~smosEha7Y?!P%Er7poq3_BGwCP&(6%oYRyq7OT>0CFg*xHh%sru$kyp$W87L zlw8^j6ahNNRe69)`drRgmeWfBn{T4$U9_L5cHBy6f#MubZ&Pr3g70VI;prIm348CH zPY`puZ9_@VC=1>4cHry>14&~1h{#rgXr>#|_xKZCI(kmM=1D!eL@bE858ZyJmPvEB z<>GAryG%ko$v=B&=bAvzV*?f!B+>m?g;B$5`X{uAabi))N)AVY%aD5G747<7p=oMf z5k`th3z3$i8l{rWgTS+0KUxpgX8D#1)@BtfrKXXyEmq}UKQ;NZMWFX_{nUQ_Kp8n9 zsdWqying~WN}L>x624hx+m+(?dw*J$IV2SRnaR9Px>x40m3Au<)xQ)XnP}K(@Dra;JFy79BFpTPYpHjwa#9L*q!YXi+g`>y?DxVdSgrH zdH2QR=zQ89gFta^AtT~W9WgulH&D0Vx^dSeJIWz;vDsDf@FOfvfvb2PrMoB1oNsUd zz`;N@Y(R=@6D@YD4p3f)QP+4;9?fwKZhnC2;SoR-9)CUBpWJw&kBMBQ#J$YP^qyxx)K=S(vaKnRL+( zyhdOuUW$r_q6%ce!bzJVApj9)yn&i;h?%60yh2Iq%oz2^Gp6mR8;4U-LDbqctd70q z&8jkT;*8<5T#hE`Na*!6gTQ|PC|bglmrktvE?Qzh;A4K6QUUzqK!1WRm;lE`I%sQMNPZrg?zLYS~^v08Fml%-KjE5*~D`qG zPr>4FTfCYwf8O3uGOK`5vm)1y3IeSc`mAhqH`VTuFUG3#x{{QzVL$KT+M#`O$iB!K zUwiU~Y1=#zQ3X8hpCfW2PfkB`Cd*6s$>JUGNV5o0$)z3v7pm;kpukX4n~UjtxWW$> z`!OGoUHT@-fM>{eu+MRIg3v2Q1BH z&ey^u0T`?Fsm!hktYN4Cm0QG!SJGVmdLIynNgAQvTZme5xGT!{TrTg6A4=`7{THY! z4?FU04u5VWTi28*GO0|-Aw;9#F#MXev;vehWSX%Y==yEmSTZvbFND&4ge+KF ziPt{~`IcDp&@#B~kBRA=ZUgi9G$i|U#w-h-g>-+dW_N! z$7C-{!O0g!dC!GkF8`*BySEy)cn#Y(IHo5nmZ`fZ#Vu%MKg%9Idb8v0fRIvBoGOTo z8oi^c<2&V2u6nXs%c~Ewkrld@nT!{TtbCy8r)`1NBBt=_AC+9~(8zp^e^KN*6^Up# zj#5&OJHS3I=JiY{6n8Gx9B1+79hu@^dSWL0#^9LnuxwZMx8^VN-0tjJe>@iI9jE5Z zD0_HyOoFoL==q#zUE4(8_!izuIsl%HK8()WA0xO6R5rX(wu|G2y@!uQC&^6i-{Y`B zQoDqCvV>UZWT%Ds8vFXQRW%I%X~06VOfQNDzznvP(Tj<(SK2|G}Idew$scLHoV z5IJViRM3xG*P{4Q#=tWqVU;K5X8|`UDYGbqVW&pvBu#388p5=*9b@+VT02$@9HVvY zVIrAH$`GJlz&|wy+OcFPsIeH5u7=b@b7m*=PxKOV`xO4az?cxFSs9M;BX z|D@BPI45Cm9XzcT2sz)@dtt)-~q&LO^wq+ZAC&Ec;P~ME=`^e{;W?P-|HN_ufs9>g! 
zbQ`W)6-vfCGlTs)OHa0?ck9p@&$Hpc(&e)K4)Z>@;RhJ5J1*~}M5b-G_U(Rc8Oa0= zrkb|SxZpY04O2NT;0|sbrbduiv-~8w`|XFlP}JG!*BeR=s*In9dGuITip!s|SPsuz zId%0elr&t%BfcWGLU-1^_!HmE{*xYgAMgkH95 zSk|g3R^FBh^UcybpkB7NH9*hWjK^1ANv>*_blb2ief%MiHSH&H3T$c?nZGT)9?iy#-zljHzkcp38KRBnevG|Q}0eK zj(3AkdoA+DkS6Jx-Q9$*_B%}RE!m%B25B>T-e%NnxS3ZFtTMvm3K5$RP(72zGI2!5zKVynOSv41%=g7*;nT>zTCVqKzX zv#iZ36aXA4x41WzRsP+{8_03XKlx4;pp{?H-jrlx;NKzp0sRy2viQ9z0=W7rN`Sq-{34e{s)Na@DP2 zXo#Vo{B2jcWojgV;zRQKuLq$Na7*i<>A04qxl-P-O?c4ttD^3WYri0B)WNKF8l_6c zd+DFjuhsxZds@WJ`9Pm{W$UJ=RY+3jH7j*)4wQI-`CPgFs3~qt-P^iqQFU=7XkeGE zJg;fz2Y#UqAz*8qtP#Ysi;~G736;Dn*|*@&LKTJ7{g_rKVn-ix?STXINVTYPx_E5Jb4M5kH{j`|;` CV>F@w diff --git a/assets/thyroid/regularized_logistic_regression_cv_F1_No.png b/assets/thyroid/regularized_logistic_regression_cv_F1_No.png deleted file mode 100644 index c235bc17b0662a0ddf12c4cd0be36fa25f92f738..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3579 zcmb_fdpuO>8=p2|NFkAXg_MzzgeD`BB=^=OxtGqET%twHkeb<2GcH3e2@$1TF>cdw z8`^18Xmvp7(ve&wH*r*jY-5 zDTu*fFbS)phmXTxYd9;X=vv5fOw8^+)C@V;I9Wg~42FWi02oXFL!n?O0EVK#01z@k zHz*jeVnacif&w4`Q4~nn5E{Y(FaTIZKn<{RfB=O8P$10)KnegtT0lX;D8D@nz$gHU z0ss(#LP326f)onKrm!IqfB*~xP#^^o00bdnLp_iJp)LUk3fQ1Pz^0(s6o3svvicBP z0Chr&Ef5GG2S7joQXtSOb_EWZkNXwAg2C3mv^s3&6q(J<2_Ukb${#uN_|#gB7d!4^ z&C3n%RNXbGSJ$6V-#t+Mv`Zi`%VD+|4y6^#V;4W{SMA&$;FS<&ESI2dn|ev#aZn!i zXzK|jAu`b?R8o5bq0y|&NK{sY@#fo&VQP_kuWNS%+nt-?j8gZFh$IDb&!eBGUl1Y> z+9jQO`TY1mybEt=dZz-tsH(iV<;C0cE^2|NQJa-wY%Cibxohd}JQZ>La(i>ve6b~V z#@CYp6tu_&PA@Z-6$LRe5?)#1pR(SMgcGCh_qrgoF#y2YhsbMstBAH&=8kB}WxFzaO6L-xQ3iY2^u^%%2wbLp zC9b2wU9~|c;N_Va<9RF%(R)`x)5@au!@>JS{tH%%5BtV~H|H$wMtIz>Tpil*+ZLi0 z&XN}~U2o?S>J)c*)~-X3q*Fs`AMdY_OTOS2m{jOdxvS2pj*t;$U6T>aZtkBTD2G*$ zd}dimVHHJ{soZ)G?v4=aq2UFoyV+9oRc-cQW44f{D{T%uZnuv3h^#%ZUia zft=!71$mwDRF2pC$%!P6$+eVryN!eeQwLq!DIGcW>PSRD;}xE!Z2C*<>6ni|Y>9P< z@j@@TAfNbX*W;MZ2GVOfzv;mqDW&gdj5nR+uT+vmh`lVy)2z;twh=7q%XwR<@%2Sb zHZMq*ayRJ3d?KJvRyRE#(q&~JdsRhEY$b_bf-)Sp&i_58{Tr70 
zCO#`!S8B-&X?~l^=yC~(M%$;G56{c9T7<3K84>M|M{dINiC3hx6UhdVHi}9*Gx;lQ zNfp{_7MAC*%A7hl=F7^K?M9*}y+t|-$IgGgHo@5%x+mpfU0{kV>XNj!-GqTr?r@HQ zk;8k7e?ht5O&oyUWcraS3wqHwQf6z%s}5&g#r4#ry5K;M-KarMW_P!9UMvOLM=w zft9$jDO~n`xMJ%f!`B>n@}HL)I|;^o$GBGPiJtBHVTW?0l{%iP95UB@QIv0p9Gx^b z*X$Ht3FUZLMWX6~^HK_^JB1ulkIzF1iJVKTMQyF}?+fwUmyjW$$6o6q^$JJQAHjcu zoRU=fuEd%LoO*GbKx{PIT%T$D#gJg$Xr)M3abEIMU~Ds8b#C~W?zD)An8r#?Wu1hs z<80j-ttJXaC2?Ar2aN)utTWm|E|l0Hk7NFWBK!ZC@jsUM>~2C&fxj;1c%ly$(QEfy zs^h8XhBT0{x179XZ;IhvcM`eRTn*K`!-yZ)zK~{_8>(r|7)TbfUQ%zvYCBzX3UkE*`ZBA>%G($lEpQ*!i#c;Q zhY;vHsCb=|hgIi~a&w=#gm~dJSv1m-kS6_2bcNO)H^sn{11~-d?1y!T8xQEvh@QZ&#Z2wV z!w{!uhTq$NF<7qOjOv$}aK%sXb2$>nGzr1imH8|0Ek};_!J5AG1?9wS>lI<|oxXnW zl-u#)2;Xf+x6dH-7!kiN#SA2;9q?J^2v^fRtK-4llUntIsU=CJ17Z=`E{auE;&VDa zQ;JfP)@8%A{_1+q6QW*y%MleffeEr$c5<-_?}s{Fu8~}hHm2U6O6>S$g!?$hptt)C zC=r2EZavy94>{DX0SDA4*6SQA%cb@IzPb}hKB8-42{9JqI#cK-eF9QPk3bYvK2|6d z9hxkqbojl4UIK?qj^}JW6qkY(Mm+he{c5{V4}LdWKGbsGVAiW_=aJI-0S99wM`=Ob z`i4iHG<_^YDdhEpm*~~r{&uxIG%bWQBqukPW*t+ltJ!(mq=ig7?Mptq_;(u1}E%07=Qdu=R;0gyb6CaHa z={09?1~NpWu5o{sn8ho<+R2LdyZ*<-{W}(|!4N8U#gSBLqX%VXz9^11)SM&CU0k^C zv;X-Rj8%i1o_|X9va2^a){mHT;zisxU2xTZI9(fsPo=p|X8Ev%H?pF>RJf^dy$eaR zeAwxZSB4BXI7sH>Zg+@bb3gAHtw)b^BIds5Il@5%0>H>{E{d6)C6;&yiHE!*Ct|Qj moEcI$`o--n?9 diff --git a/assets/thyroid/regularized_logistic_regression_cv_F1_Yes.png b/assets/thyroid/regularized_logistic_regression_cv_F1_Yes.png deleted file mode 100644 index f249516d3e6e2a8ccb35a5cfae5a3561a5033549..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3592 zcmcgvX;>3!63$>CkjOELIwBGT2`UKeKsW*+0tSX#6h#CQbR|Fp35p;B;v^y?QH&GA zEvJHtgGex3ird3PL`05=906@b2oWOp4Q3mh>(A`2`#ih-RM*%2e%0OIQT0~!#XW8g zibypi3`IKJ;Ax=oQBB*)2$HkKfwJ;bC2BW}WQWy>g!%<*37L3A! 
zAm|1Mqby@MNV9MhNO(9FBtpm+{1D z7>j~qQ7DkbuU0%3izj3WA(8SZFdhZRV?jdU@gNaGJ&@u-T~Z!TD&$F}LKaTQq6m4A ztUM%?LYTo8*Km0!j7g4WWv>KVkeVg#r)yG`t0QRvO-z#PP zn}K&T0>;71NiWNDu|2UHi43zQQ<8n8najmO2}*GOHQb>>{M3v}>q&AQS>t=ydHV9fBth7d%P|Z0)3KoZ0R~pi4-vJEeV(FXsb3T!XV4{1$_D^NqQ$)SS%@Jx3~ce_uFHC2xyJ;RMfTj}&`{`0lTK-Nra$`mTVn^D-(hR4o| zo~bIRjxhZg4mggFUB{PL(i0V#o?rVuX-JKsPrM84Z;%zaPWB6*Ef3Dp)-g0&Y4EnP zX$+yMTeIBsGr-ZBbzeKQIJj*D6GIoUQ_LDnsTqCsqOPusT_I_vS23~G?~=9p z>5Wswtk^2N>lthEC-RlMy^{RFlR@?D%Wf= zhgTz$1G`S%DXtThsi@2N$%fTd-q9D{NYKcYV+yL2UyvQ`(CXVRQc7ivIns($4aqG4G90xepUt@HSu#jJn!PJT0u zybKC08jVYrY_(@2RfY{iRWv)v-A1{})=u9T%ESyhQ-5W_i1iOOh;;$KyuX&$wx1Sj z;htjGHzzA-lymd~B47O6pQez{NKn&HW?G+iL8}}5wtU7m5xIL9%S&ls-#W!@)H0XH zu)=U$O>_5v~uONLi-pI5KhaXm9=Uz}IpR&T|TtLqm9ajXa51^QV` zGC@s$)tyiR3D-%RW2}M}W%-(>!>mHtJ>hfoQP6vm7<%5EfXwoiRWJR+aWlYkw7Ti% zCHumZW1^B3|5M2{^VDWrJ42J(z*xxe7L!Ta0>}@kAFWL$W#X3iuc$$DX!O4JhP=ED zYRHmk#RDBJEvS_*(emun?OVSz$MxBd&U-g?JXnbz^qjrMN+7&;xpat1D>VU%EQ=J3x>?W;if1n$aP2_+l~x-J#(Nb7P~RWxzaX+T+N6Xa}4B}4%t zadkn^7v$UjjfX&-G2~&AX1v{@G-6$dZvPK~e}^YJc>(F2d==pPclmOgQm!Z${a{a% zkNC(iW8e4FR~EI~(BwaRZ{?cj@Ye5b6_=g1vhqj%`)gnI4O*p%1MK)s9Q_g^0);)O zj_@iQWY}HsDF1IOjX%#^SMvkgGN~=V40ofwuvwb{u08h`Mav=e^r@hrPvn;me|hRk zlApHO1h^9(yP1+TYTe&>d_~Wdh%v*bog0D6Iq_R z{>lBR9=suZA(sQjFO3PpE;mkcu0|ru65_(P(iw#uuy`h3%?F5(c#e4rA|GCY?Y$NB792R-jT_dG&rb zXEMvC#%}rm$pUKMbmZZFIOg%!Z)@c&-HI;@g$W5DXLFb?GniK!^pC0^PsuFy7`Px_ z@HHE|wq;C!G2nRQTQ_uwPq+|aiB}$QdW8p_Qp}+l=y}h8ci{_;#fpE_qPOHPsEq1% zdfoV>MK}jh_OO<=`4265s%k*shK+#Dd8L~z^42v^<(0G|wj~T4N%$+js!swRh_iaX9e^R=4pO#?Frk1pKeBCOiM*>S_{jKN8peAB$@%hsVJ z`!l<>Aq(JPoIBaHv0$meq*Vj$Z}m52SK0xFM5{JS#I@}XjJp&a=5CR7Ephv7uAGpA z?JlwP Gr~eZ`5R4%J diff --git a/assets/thyroid/regularized_logistic_regression_cv_Precision_No.png b/assets/thyroid/regularized_logistic_regression_cv_Precision_No.png deleted file mode 100644 index 4418cd62cc33e7aa5d2461308b65d7b99de6b442..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3772 
zcmbVP3pms3A77^x!g2|PkqG6Q>&&eZBC~WtLyKZAx#uzyPFo$Oymx5c-p?j&%zTpJ`%t3{sSN#4Lh~uvU6wpp z09q8rcO9vN2}%o4-|amk4^Aa`hKXrH{%@wEG2WQe^J#X5cTK^AWmx%{O?y~y-*$RA z-IEzjFFo@x^R#iX)a7-}lYrv_8>uasIv*SrwYWA#jm8Wf zG&#u`#t|$Pgf?iGzjUo=2T05&AQ5$|%slhqUXi7$JSSzlDDFb)nm_q`{{8$QnhHOp z)XeV+<3mU-$HyRdVZk}FXt~d55Ikx6E@g0=PPiAZ&QcyK=(v7$jf zHzsoS&7kY}(s+905&`F)nW^=DY3bNpMVEbtnR&-2NTm?7Elv`6pHC4?$}Cq+;JlMm zjVMc+X4O2oKd!B%kMNy*^>4XcyPf}I_->e_suRnmNzoVOVSggOxDPXg_+BE>9 zbgi2?S|F@^+#Rx6WuEm=|#=bJI}2n+(x8plI|&8a5v~8BWPu>Wm@_Jo(Bpx+N+>|fJ1y4+&mGkWRO<8tTImUeI*4}`>zMK+8e27@jTWr-3y+p?Kje)RJ`V`_s~=YJ;G9?Ff*`8S+SEC=V_>!#-Rr6(>DE z>lNNq?xGm$?n+jlpIv+;(ekRNnjwrUoG$k4=B5xlBkvHX;tJam&i7V+&~X1!=x<9O zF-I}nQ5ud{O9}10w8^4K$k<>oE^HUxswhspH72a%?jdpcpUxWA!?Y|U8%zE%)hKM^ z?@(eRbxmMk^*>R|=X9GS-qa>^Z~M!{N<``xesOptIu1 z!-`a60-Gtsd3!gTu1<^~o9!1EKdQrJZ~c*6_V=tucU#2m63r2D+@l$^EktMXW4NE0 zYQ;=8D#?|vG5hK?idj(SIPauO&v6T?WE;ri0O22>K@(*V3zlb+VwqN!h$hW z4I++eQJHx`-db|9HlczBFhl^Z{wiAxu&NbQ6H?;`lrcw9>aCn{I_af*&t`QaN^1gJPdgf zyZm-r&6L8aRSILJ;i4=mu55ndsB*5{_}AkE`Tpw*aaZNWyNac>APZAsm1yW?#-d1- zqu&&GjHU#M-1MQ!bkDE$QCXq)~QKX&7MwKs5HfGIddI{9#zx1aToHVS`n zwSUuR{10(AVRS|bm3+hmUk-SydUuqOP3WCQDRRnL5pj8S?eIFUwMA@_A2{(a7mn56 z!|eTp^?-sCLEl{iP*p?>`n#&>skb$^SBmVmourEB$Q=yE)la2Dq=66yJopa5>eugq z{o|SWZ?WjRcP7&`a~NancJD4C$96l_7xWJ;mjO(VOB-58pKj^w8K8*G5>)Yu2{>qo zg=u_YTWOHer~m?NxyCX>Cj^5-RD1h+W}A;xe*D64jT(-37vxdrd)D0sf@v$H1O!L1 z-3Twk!J4EQ>!dBHxpHZbW+w1W!Q7h}tu=QMrSj`@OLIBn7LV&O4Kn!rSnO)?s@_B}JuG=LvJ64TemShv!V<6cg+yBZR8W_Op zoaLC{2^D#4&5nG%*V#jT>QAtS)}cc~r2W9*O=o%9ob`7s1nXlugqH1eOYf5p7lKIy zZ?WKH-7weQm5PWB z-f#_2!~!_HaIa~wYO~l|3jtHKt6bBjs0MQ^U4|YJkN$;Dirl{Lq3s|$7^B>W(#T(8 zONPcg4y}by${*Db3T_JkfS;DVlj&;IW$%K+wcxKPSGhlXW+!nGporDg9%os%2_)Nu zF6>i=B@Gp;$WmKV)7X2DCT9A9SGD!<1VN~M$@D$3fAT4qq{qU;K($Wj%gZ6*mt3-< zW1E!{(7&1R?6Os>qV{7arqAquZe@A-)Q2XsY`~8jH z3R7prkcq(%^qSqJjnU{JJkVXSN(U7dZxWhv5>`(iQ`;$FD8$-IWim#8e!t?!p1j*2 z|FxPB6!}tJCQ%2g|8c^Y=M1}7csBN;H!A-$iVB3AnZg0&%h)gdl=g-4OwWw?Imzc^ 
z)zRlnMpbcNC-yj!E(?t%Boh-yZ_jwPPg1W-AduNVezhRSs=}b*>g~mb%}zsH^Zv;X zZ9>|tugRV!DQ`if2e^ihPqopARttNa%X}4^1-rtl^$X~ZT;i!uqaKIggP>*(Pp41V z#xy46?R(AHn&u10J6@_Q2-KNm+#&K%iWkP1KSO*c;&)g)qefx6rqX_TgQVI<)@7x7 sZb^5iF;T-Q=%JKf(fA*^3T^B>9eiR`{;=ui&0DCQwd1k-M?LZX1&NvH0ssI2 diff --git a/assets/thyroid/regularized_logistic_regression_cv_Precision_Yes.png b/assets/thyroid/regularized_logistic_regression_cv_Precision_Yes.png deleted file mode 100644 index 244120ff4862455a69b2db4e7405edb36ed073a5..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3768 zcmb_f2~-nV7L6JQP(cI%!2l{EveRrr89??1Wl=yx7NZ@DKqDlIYzjyiL6$(+Weuxr zYLH+=Akfvc(ot4vkPw6nhz+uYMcE=`3fq~}=gjFjcAu{E>iyLF@27sfci;X0$2r53AYFYfIxr<1Rnta2mnL?WCTcu zP4EMN05@#_tjPcf3mqWC!iK$JKM(AolK^)$!u8obPz!Y0Xi8LkWPn%4fnu`4tMeCbUvHT=d;NGn+&q)uzViG z=EI$^V)OZYc=mLVPbb4ppM5voVY31U+66}-!^XnYIV5i;Unr7ImyWH}O-jf5A5ola z&`Wr$e|*q})pL(F=vP_I6l>v2$^39qND2&dn_g--r+eiuaYqb>?rG z+ev(8T4Nw6DFBTgIEO+eL}Lh|Y9e24Op1i6Ph+1p88`QoO!4~nnuJ{aF6*UfLzB|L z+WN|Yvt?>79XO#U7c5S5?H-eUO)=V)Rzg^BcP`R{dOU$zl1ko_o4q_FS4-9KT1IUk z@X}O_abdEeKMjY|TV`O(WTB;_0W*}T$X&(5?lk{(jfNHI&UuIS^WUqnaH~MP#=Ups z8y7_6)-nBRsp5J=Eb4OGHpnEK5^iz5vTt7w z2ZJrcbt{u9th#Or9Xjc*Mq678$@F{gJ{9bA()_aWk-n)eo{A*Ble9=$EE(Kg2W{jG zTo>>Giqs*g5tf3(gq>6+6b?-#M~2C{K@6XWtA3WQwG>h=B?eo@!h3wu)vQTwS>syG zy{j9^Qp^o|WSpF)+3V;Q&!p#m!j}0v;0ex+QFN^qt={^>c1m?lmLo~irXva~L4osB zhZ0!`OeU}*^k)eW;nRG+5!Ic@lc3cp)Za-{Uq*Vf92V1rI;$qCpB%S-ERW;vdRFEG z6qe}^pw!(vUWo8@U4@_iA7>pFqfIV7LzG`-~bPMiLeM@Bi9I;tklT-jORtv=ov zk!ydpUrBieI}S}da3`=C-pY22bCfn_){Xr8+R*$Zp0?7 zh^$|`;mHRYR(cBdEoQxKP&z$l{GP#dr^`amB9^o#cb6DJOv{(KA>w*A^J9x2-v8v+ z?@;xvxUn0-AyXTAwuZK+4|yD6#Bs%%bh=aIT|x^VOsAJl5^1|!asdCE13dKi&L(a< zZqWogs|NqI?oOBNbI9DvK_A=iPdz2O^y76OpS`f9b9ZpsPk+7~b@!l51w-}@VJ_ge zf2rK^N1SOzYHHRSQPftu?Np`78&wId%eB8fd3kcxrlxnQpAeEPC+yD)PmR$xaBLV! 
zXb!G9@^klto=_;IB^@2(zuT4g1aOnEHkFOyjD@H{6t^Z#g_ijedQ$7is-f+Qi|&;< zLWAbMBqD|&lZ?H50aSk~YK0fn)qtj$p7P0N@K0*7t0s4)N{(oLOMjDJ#!k;sMYnDs zonV8?`8+yOHF3?bl+oLhiiz8=GmD!KJg|a-naBvLgBt6ClheHF_VF^zf2B-do;D6e z{&-tV%_>20ALJdSzj86^&ym&F(;DpQqZVR#XKnEgQ`x+%lJl`ow_us{>&bgr!m0O& zGkLcc4~km|>{L95lDaZS);I8&u-v3h4?^eI|=@RIb^W6Ks$N3jC-CWJe2ohJ~xtI2}C2R=sh>lL5v$s}japcbM6BE0a9u}KB6j9?sB8ZOU39XW8`X#tHZxas7Hg6r zy@`!ETXAHiMxYO^TW9uIB}_k1Ugmt!B?-IzLoWNO_QFe)v$!86SM3JA)&_^WZA*MD z&SvJk)=lJyRuHnCmoEuH2VqRd!a(~R&%7N(S84?Ia5e3>8TBD^RZBzn_BAPQj?nL( zzv{c;Zd@WfE9YWedrI00FI9QY(wZ?zRB1@)W{Sb@)9OAb0_PZ+;PkO0hJfbU<;TEt z1iCM7yannLoNpGldTq^WIx%qg^MrxW1qP@j1DT#sI=MM!wWv{v;`J(F_JE6guBX>< zH}m}=mjEdl2)#0F>6?y`yU9I-UeOR?;0P;Lu?BsEoa!9cZ(ZPc%#lP0xaiG@x**vO zW|XwpY&@D_$pB`9}!@c=<~$P}id zj3wxEy~`^P3yMTNR}9h9_EmClTrd-^$Wqi4QxqzC<&0Me z>bJ%h)*pppR>xYEi}#?mlGvO+98PMnHS{_hkpCjpH1G>j;gY|v{qaVRsTwXG?KFA4|ZElW9@1J;fufCf2&~S z|H`ap_j|#JwN<;Owb8j7Ye`=WON~BFpI!-iky4XWSOL*VW8>q;f_l*ENFL@I7@_V> zn_OU-w9Pw=$UwAZKQ+^dk=6c))C}vr zJot~?v&X*S1;%py-}fp)?JWFOPqFs< zs!H98P&!9^F0ip%Bq&?B*Gi{Fyr*6m?9W(RdFqK1csd#iNTzO-n4SMn9L`w^x;PBn zyL7?6cMExZ+%BU|bUL~td$s*cty%@XQA?_+pj7n?6x8~rZkGANNGlRA>@lp7XPqk< zoH7((`5XT#DmCB zLQ4{rjGuQO=oOsxz#;|q0O_X`>aV5jL0L>VYLhsfmR5MtB6Fs0w8S4WzBjQtJagus z_0jPob`QIXKllV3d&}Kv(ALa6qxAYAQWKqD{Wg4}+)2J@hWfUNt8mrNC`1fhE&V!$ zTsv#((&o-s%zYiPkk_*0na!!52wAM637|9JwQL0K(!)%!xNa7FVlGVk z!`@KCiPhDiEou^l388Y&#OXZW*r*_+%f^Xvvb6(ZOPs*oE8h3X*1mA zGd4BWElVWtfR0A0n4b<~V{KQKO*1ke(&A@xKZh$dBU7C7+2G+#9{gQ|zUG|KxlKIJ z-Z0&yJICng#7F6O*A}~xTr@8ZV;Db)NP8MaTYU~Ai_vTwc2=%}(}9j!3g{W=oO(Co z@`|?5jb!WBQ`&p8Iu~$!unL!~sa232d&EE;3VPI>1JcX}gToU;0|E^5GGdii3X$v# zMViizIj!yTbECL%i^5Tr?yHXm;o3GnBtH~90rNQVc=m5Jd!~I zXitg30VV;);BYvA0VE`cga<&o*j?GK1~4Oqn_B|>;AUxZ{&Hl&LZKjrL6R`5)JVFeTgaK=Bj$tX`Z^qKpAn?|DM8g$cjK zfkIs_CIp;}^9m7B62^9%Z_wrw=cP;@bB2DkupkH=ET29zGQ>)eavvj=?=Wc(5q0Jx z%h_H_WR`u6t4mcrO~4>d+!RM&)6$B{&2#VbzZq!#09%$HIxRV5@Pxt5{h_?q1B^hvodS?g&^>v#j5XRgq2?4CS_ z=+fsH7SCftu^rsjYVus*D~;FjMR{KAsa*X*mcUA-i@mH>IGasfb1Ar%E7!Z>*T~7S 
z7bQNMU!l{x)Q)JF(HO^fKP-D++PWSfM%Vj)odyPWsr6Dx!r|;Vj&KhQ+h8JZRH`sx4c#|JbsiWf(Pf?O3y3)#r1JR=v$Q@wVKBy|t@L4dL_WCuBCKrYDP}B zB$NY8a)Zb`vBj#@uh+#UY7&SF&9^gD_Je0f`j;~V`>Nkl&ti6Wc~+%--e9503$r>{ zGftQvP7LmL4U^Efje{--XjMM9yqK=M-CwCV8YdR-6W-!xst`{Vpzj)~t_udyfP*`V z_QXXSD^$a*M#iaYrUh4{2D+-m3TnbtGK9J2xfPf1AWpB5GJMibERdhEdrIQ{;hpas zLYGu9S9?@gVYXFFghnKS7=Xc2L)X<<5gA+*)=wN}PvGqmFe zmu~Kyxn%9FYna?s*dxou@pI-Omxpd7wbqmtHuPyT?Cw|IF7IsXCz!2e+*iM@MTv6I zZI7k)m451anX5C=dn+xcTp07Qg-A^LpkOtdK(_p*(EBY&NBL5Zf%OD}(#73x{2s@s zAGG72R8Mjr=M%m`!cMuYAD;WLnX3d&Jm*Hq&vC*~2Py7t$ec#H*tz4cXWN9TMjcad@#98Bt|w?tRTSS&7yygYIHh9 zS=KOQf4ySM2k%p%7Drzq=tUmHZ%;d4pP;@1S<78-PD6K1XHs4|qGNPyvyG>=BnGKG zmTr{rzJJWz&S9|An=Kk>aME!K>zI5#phM*#Kwj+*l3gQGxN+)<*Rn zwYduw#JW!R&8d|0M1Z05BJ7WdPh0i9W1@AeBHys%p_Iw=<7*CDrV zC#Ne364H2*aOpK|J{iP;?=rnYT$?C8(on76eD-g}1ujI{vNN0UGD{v9;0Lfk4RRoA z?>)fvDOOeQ+b>!zzUzq5w9Y584AKH73IJdfs7(0L@A zs!tx#6%r~ur{W5n$0gfE#XNZ`o^eC6ux&zV`-EfHYbxQb&)Jo z7gG|D4{RDR3njA|f0_2VES0(TBK2yAdV9u@u|J;{qQ+atoyN0q2j&@bHpe{QQX>p{ z1XfbVEPv|Ulz0nXak}~KG4m8)4L4Y9(d{wMDws* zr1}~EQxwW1kQ=R~K1)(cCVvs~NUQp2d`uhs-gLK=yYnNzFwd3jRd829;JE>H`tFE1 z*Ml`XxfR~T-7l8+M;eiG+<4Dd|4Z@zlP!Ns?*OO|x-Lc^6nbafEvgjA&9Ky&963xW z0isYo{EP{@xcMaI*38qXQ>CKN?`s*%Uhe=1bt)=M1p58If>8Z@XtCX1=b~)Zc(*{! 
z3yR=XZwshsmgu^w1ak(Goc7_aQXyf9Xxe{~TmPdgD!Nf&71VVh|EXWPW4?0#K5Ree zFWM#)8aZ56vK3*!Bqn=y+o)lG9$(xJE39#Aelzw{>Ux5gw1();^w|~f&yPe~bAM5h zC9ru4<>!1oM@+Y4kH6++sIORQGHmF!U(VRXZI1GJ+|HO$9cfi0oi?xy;rld9cyp0qV(?NKI4p zfa)DNpg&|~c_9%fcoA6L_vz918NkaUu2?Z^{&uc=L6A|3GU3XwpLm4KZp^p5gc>#p zE*x`e?0Ig~$a(b|(&i`K+=g%_rslqetOS6(KhL-HwTsN-7IL?qUHnz4GKAb90_|@+ zlX zD=UHP1LAdEtO;wT+G&a?v;B7kt7pp(k_A z&}k#}NQ7|Xg z%0Swzff2*7Yi7WF`dReiiAb7ioh&7pLXW$E{`A6o5LcDf6g}%yzcc+Pa-OV3aiVpt z+xza1RHQ7;c4rcH1_mx8x+XwRE?FZ%}?)LY_9%$VdvjtP_;kdzmIB?WL!_VS{$W zF37YO%o2=7ePeG&1>5QrWNn|?tN=5z4iVG^(1zU74=IkySZ?i)rkw;Lq^%et)CT^R b`^=7XzGp|OIg2#0dz-X0g`3nFd13zv$Trx^ diff --git a/assets/thyroid/regularized_logistic_regression_cv_Recall_Yes.png b/assets/thyroid/regularized_logistic_regression_cv_Recall_Yes.png deleted file mode 100644 index fd302f10744058c28796d2dad82ff63a1c10aea4..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3735 zcmcInd0bOh7LFxsD!YJCl!^i>h=>UE$*vIDK|qXU2{k-HKnQ}eBT1D?Sb`wRE}%qq z2@t4kW-MX^3^B5XgcK?o76s9;lX*}((`{zzACuq7x%Zvr<=&I?efPcVj`j$#&5E00 zFqoLtiQ^|>uuYuxuZRF-vEFP?god9SZCxy&5e5TbFc1dg!vFvVfG~gn1IdsHx&UC{ zx($Fd0RSN(0|ZEzP&5<=!a#5%0vf>e4;ds7Kmw$hAejJ?A6CeS}WJs8h2U29n#V3>bOfsL(BmhhT$RtCuk;LRfPDnBN zd_J^#GRP+rprG~Gb=!Id8XBMxZ`6+z8M4_PWhYKE|vn^7xw5~J$nk?T{AW>_4oFMF#v@<%W zui}Nw5jHbuW_dToqCC%A@2OUo2tBjyu;DS!M=IC_np%%niw!&L;Yj z_^+L$>VCYtIO$&;pgm$#Het%e7&emls#K?Jtv4<+MUtSyaxp!gQ{g^%^U^E^F~g3v zplApleEv{rYDi%hP6oIu^?Khq!1MpMf5$!UcU5q& zX#Z40A8jumAql|_lhvP)XhDhbc502RM|-T^1&rd0o8V)=ML#E$a?D~uC1={N?SPHm z$B-B);%Tt({WeBm!y?s&70-EppC>SlYSo^*t74}f>TD60J%DSqm?2WzO^oe%@EoHO z*X6-RqM$m};oVdOOUke!>z)zCCpS9cD+5|=W`TRKr-fH5>uA|yQ{i)aOKlmOcGOeG z-w=^I(1Vo{BboIRI+1BB21Bv|JVkY@_SivwZduW^`fR@%owXlW^gq>Te~wz>S#@yi z=T6-INW{#gxfc6V7?*^2aaxa0*N1d}VGpU?J3R;(>gXjZOLeSW;+gW6qw>1)zu=F_|M`{-~HEw~W>J z6~L8(uO%W}4X3VgBz-ju@_lHfRVf9X=D{L$vds};^$h`}gP}0h)46?t!E4rvLmbmW z`53tiJG?_~oh(mnnbxq+2)z{PF3RSjdzA!RXdfIOjKx620Mt=9B5*t-a*sybVgL54 
z+Sp@v!bDTf>9-hertIozO=LGW|5O<%BED(p6Pe8JT`0WzD(hAT3iD99t0mzuJ&hxd z==YY`;2U3Dd!hV+&)@R@rmPeMrOi}iLB>{8d{de4Z-|Xp&-u7lQERsvaqCSocC0gk zq@+lrWqInI7}`v}D5uq7E4f1xTCIB>bs|Zk2bq3kzSzr&X|g(qjq*_EJXcF5u?AX4CqGE*f3B!JaQFetvR7eVtOzE9oLz zGp7-fwoCNRWFH>`VL zICo=XC7(sq|Bcdf`>_4Reuns>G5w|81|sW)(9CTd(le4F+9@zFs!AvY(Skc3au+)|q= zl!!~KX!%WfXvT5cx!-MM{qe6X-DRNs?7mW$uaqJXzp>52M7e~GNvV3cl`PBcyePXl zxB42DG|IgyS6C=BAsQQYa}}eY2oz>=7D6tpRTQW+#=Z2%B{N;OCFfncW`Ad+$Dnjs zyYyG6x4OEa=B&=Udjh}rfSmXH@%C{n&sio{wPI5C zX3YsP6NyIDj4HHICw3iy=V+ZGDOLI6qO9_iTXQ;#yZ2-R;$nG@Zov%i3g)q6>Mv!7 zg9*i!&^_Sz$B)mAYWM8=3v3LFR7IBwLU21ZbKP4Do?tpD-JG?(K7)h8I9dd$rSg+G zj+hrh)d<)5U6M*)d?;E!(W~Hxo}$s}W^?<&m(Byk-5Gz?^#wyPL8x@Vvszo*Y` zql<{g_XW;?I%NE-*=7JD9jNlqlZ+cS>np$!!XY%5-;K1jSI|zW!a)dk8BRhUlG{MH zQWBajl(4nsx6JUzPW;>OqM5!_Z9w1a_O2Un%h953#nNEEhx5N4t^9()zT(2gD5qKJ z-THpJ288Y4CW}+RV07$?aSY;M7O?xHFL*W z?Uc&by!9VQhVv&=mE-)~ORAIVoCyb}qIOFu=_bRyTKJ5C&ChM#p|ejxmCXXNZn#Cj zA7@vdlO3aj2=399$(29tj9@B8Dp<%@6Gg-oElEE@HENOWs?g}>rlZCK8U5pcd+hb4!W}-At7PWYtDNx6%BOIq*gWH*sz$@36^dAJL{qRr;5Wmr zk!}0FQTBQ$I~DE~ILy`>Zn{DZI^KAYk}}`>WP17^<}=TD4e9N=-*~xjlF)gZOd7T@ zQZCY|A$|C9U_rdG*`tvm#5^-hrr1~G6sQjclF-)~>XaljE8bI-S6i+3@6Ie%*>3)* z0OghT5Fag}lgwdTSO!ZL9;51FqvRDRX5WdcGDO~bpy9$&K!)w%(|-lqjdsVVHM0E& z-%4I~nQNhTBVE^qm}2d7Up%jIAn8);H7FVmulL97T%ryme3x&!i^m+duMl>?%8QKw z(#_kW$B1xZs;!>U{iX2o%dsDa9P;rUt^P96vUvf@VZ19uX(g|yWqU@OKgoJ_oAyivGC0VEYMCNC_C44GmZj_oq zPRRWQGykTM2+DI~#&SRx^JPPf_0PS0+7a`@s1{G955)@WyhvBuq)zK+}Tyo|@l9}2Lk z)(vA{3L!cwTe3{nI@XLaLw0#f_?&o^%`?{~|e!698vUDL{Gm`09gUcb0XETMu z`^*0{DMTkodg7m<9V$Fh27*$9E(LjaV13IwzwM9N*?|kQ&9px6Gb8c2WG#&f$i{Bo z;qKQ;=Al8J#eB`=;R;cif(lduuN+`*N$RFH`UeKA8d%`x`mH0moG$!`Re6m{&-m;Q z)rZ!}juK%oLrG+2dy@QU#NZtKZG&REp1SD)i8R-lxty)=r9>sen9 zj}py?a+2>7XmO#XkA(H)b=R02Y%Zccd_s6TMCqr>(iFU+zrdTmQ-fl%0ACN~_o%_E zdYY$@$!_nqoZjfiS>wyBm;BI8J2M6ReTT9Ioh3hT+yNaO(Z^$I)g5>}4nONuNn=PZ zo73_A*(r?)FjB^qQ3No#)RhhixSx@5?0M>-Lo+;ebTypgX>xDt8A ztta1QGgvLe-LdjeQ)Kv?chT0kGRrlsyaws1spR(%{WoivSE;Lj98+EezO#<%mB4*X 
zv(5AzVI9SovVea8haiZEER~^B8Bg^SflM^bucz-*>FGgrYad-zuO9oJ>RwnC@2sq! z7_>K|tlsnz(Rn4YI@W8WY20Tix7zr|;;OpLP^Qwx?AW%3d9AK0P3-MapXPc~*8NvU z)BRhkaF=EUoyP&qF;-~5Ntd$(=V_7+q%7-+mz#~s;f)o4KrFvU4qYnh==g;(TN>)~ zEaLjHhV{T33pq8YaSCIIdL%?fm?f#%|Af7Gy5DU5sNXD%s_;`XFFvDuXdNf{%|azPLad7Ywm^0WaPzd zaR~_Jj5RX@dvdvDI)pAoGJ0CbUiQ=q@7I|ygx)9Dc{@TUmRc;KP{ATEqNuvL11T7R zO$au~mT3Hqly!IR2nv!#R-NK3qHC!s78O_8#7wxRVM>_Z7V|IsciB|nhbf^nQnO^6 zEpg$MW@yCxP~uIWX*RyLBuA(6qYJiY&o0#C;wsaDH>V|jkn0TS7=CH!FZ@Ija95P8 zl7>X`Z5^jgT^%x=-y4v~!HcTHNOe;+u33R}!(|r0+@AFOPT)GkuseQS;$EeRMIazU zW}@?Yx>d8^1*)A#T{R2H@^I=aK@{lTKLN1))3H$(r1SrHm`Bp;UUJh$KPkccWC_Xs zdztQ3M$q=SJ?VM?8rNYWdpZvWgI{Uet*{(YcIL)YkQJ~%Uk2@kU8*rb6g>xR&6^5Z zj*leVXAo6@(R?N5NwMuPF$%RZtwP;{ZSufBWQ*ei7kJ zb{g5v3Y}IVgV15tM6+6GUVYcRCL!mT3ytIKScCT>yE*ny`NVBsBtqx5Edzrm+2yS` zb@{pffv^a8+k)41mxdQ152D$k1xe&on=RLkf{+vL z%WbZO2P#X;+=B9aakfqZ4-h1~6k|qHtr|dmZP#F-D2n6@oMumVv*1&nQWojpkwN># zaSi;-I`j62@U~&DGXryPD9&SxP516=ytdr--up<}GI>wFw^}+9iMi*{_EH{s61;Zx zMmA>Pxg@BbN+IV(TX%V)WXr|in{~zn9icv2mwGS^BmxQX)G8DmumBwW>9a~^VgnYA zGFYK$cH}TCmet~Ayehi>U!Q=DIm-Y1i|1`V2If>aC{U{p5<#U$kJz3)e%(B2OdK4B zsr{$lBPL4V^09^Kp1HfsH*|aW9`tGQDbsysq7}17+u+x>M^3vEy^MF+ddzyhnRE)k z*4#y^!v>XzjGh(E)keaH+%@X zWaBROWjg0}2P3HtoMOjils1NdsGwo>F*uNfj!LO5fc@{k0%i_vyXt;lv%6qm2RG}k z-g$?K=fWvs+ry|2F34ZrL#9?7u@G_g+T|N$j8q3YXTP3X3>GXPj*ZjJT)u<9BtrCd ztNA=`5nGaWCl~14&-1nDv)-`6&xFVQSvL+`F;4P?f8vD$oN#t9R zRN`LK>-D4&(12yC=jdoD98|>5;~xm+yfyPtPn{_9cDo7eq*ZLss28{rsq^ZI z4U~fz$nRPoy)Ni~*5YA|Tj z@4{+S3kZU$ASgivcm)88*=a=jJGc~ezcz_YkI4eUGF}4Vr?YC+_+c487lW6Swg>o~ zE!vwJIRj5Slxo-e@W?YA;){ro12Ap|pySc+7hv8!Jl1JUfvj@k2)J}TF4 zZjX(>m_P3KvNyuevg|}cktc=c7w8bM%?%&X$kb>*1{jpnmrNAx_x`gm>r!t^coMGC z!H71Y^VbhwMRRd`qE&cGQ&N z%P({V4UY6D^mK+gdie!C+EaGEEJVDm66Wzus-!anjQJmG<~aYoA|g~YNhgLl+IJ=Z zk9zEuIQ#yNNRcY!*Hg=~KT>`OmMA0yK1<~XC2%4P#v#z^TZH+M5AKX2qTSR|9&Ts6 z%87Y2OGq0`T-8CXk)u}v2+b3Q2_Hv37;-{$6 z&sH#`Bzr+tk7O(wphR99iHk;ocde|cnR1y8y5_RLIAl6v5! 
z!H4XWe4msAN~GSZSl{;fV-!6_l`gUhGcKLz0BSg5Mg^L0q&%~oKd@jsw>kbg3l0BR zn<-EmZC-AasmneL^()>PFN`*aihmOr+{4W1?XeyWL35X0lp!~slYIki)Goj&$3t`% z#^Ah4yB*MQVk%gpe}^~xkqeVQ}*!Pu1#Zul6h2CMuT$5{GbJ-+3FZH zt@v9;iB`};*mR)k6gKo3sbU7wmqjad>i*Ji*jPyyz!pn zuLQpO(+6?I6Kl`!2fa)@G4K1}BR2FgDG`ifU+LuVBwBN=WN~AP_P0iCe+YOyET_R4 iwCq%BwxBxv@y;F*T5j>(E`%x%_?a4;pDi|YCH)5^_~QKl delta 3092 zcmYjSc|25mA8whsGqyou!VHnf63SpgCI&TR+_*@VC|Sxj7{+>Prnr&mX1H}T*1A~} z#xhwOvR33GYfP5xvSi=h!~5RP=RJR&b3VWG{XWn4`99CdKzE}H_zs$v5sVCeqcCTa z`I7n?#7!QIX!guC6p)g%e4H6S5@0Z=Ew^Cnpi5+qk?#IliW_G>KeI$|4l(R$(}Uz-tBh3%Fhv zeNA0dyldM|eotzHV(DahuekZ*u1*uJl#oSrf2$tWn~GS0WUjZ&6(2x%SC6N8v2ELT_fr-Uykyx@914g@+SR?v#_XHz9aFBd#Zxpq`6|WNEK%BYC*5B zjT3b%C*%6jzNMzfh+Xz5kV^~i-)u#7CyAxz$G;KV=+L2fvToTp;h1t^E3daJ+|BT} z+%ZGw2Hf?umx8~}cLkG}ZcSO*jxT>%g}xcuoxJ4=U48YI`*uC3nWW_t+001(9y>DJ zp;xEBJzOKn;Tx$vO<0+!-_avhn}ek;S0`m=njaF`8l4+yavB6gs5pB5&azAKi5}D$ zRo&Kxh^v06n%#z-QZJr4U3t%oe!>dJj$P_%u8vE2RrWxsLY)+$DFpTrGduYzbhd@!NJctq|_Q;(Q=p}RFE zyA;~S0-pPiv&~Q3$>b^RZfc?RT0F)SwJ;J=s(Mlzo6ExDR3PKF8lDw-LlTOq&p#A~ zq8+zA)E=$NOs>lD4w_VHhkPnM!N#@*F?Q7ZM3{ls2Hz)THH#0`T;6b4xw9=u5ZKlF zN|Gh~#k~PYDfIi46MSz#r0Wt$Bj4>)OENYH3h`=&NTz5>UB{5u>6}!#b=g49LZf|$ zVMJG;Pqt{6MZBJHrE?aHaSwV1$fR@OhVNc*B^nK_j$cJVVrkB#+kCAiK8YX0sa&Bf zWy*O%W{MpOQEKQ1?-Z@vnDKG9AG_*t(7ym%GR^f$5uf5-NEbbgpw@eu&fA>$u%LRX z_|UoiO5E7bPS7Z>nNa;}$CSR{qM}I~Go4zvzG*b1ak{xsJY6UpNKoaW+iG_#3hEb* z+A|u1ceZ=VULkzO!%uf{PVcnTBH&^$&Tw&;`EcvlgEVOsu#Z6jcK;>i=$7*RyEx7$ zS7;(q*j#YaKu7Ian5!VjKuCo+4&BV7iRP@%@vU6c=8LWxGKENIGN`K}aomyPYc5pE z>SVk3D(}#;EG*Xmy~V!3NaMW(F#D)aX<0aTQj2r8G9^McZaiXSeWQA@@be)uv|p2N z9541eLJZ_?-pRpFasB>fuFhu!z_aK($7(ZjjKFa>FJ^au@lR3 zF%1GyQ8s=@D^WcthqLgtt-Ukk`EL_Lec%xfVCKo~q?^ zzJntUD{v-_P$xV5I(Y#itZiVwx9fYYKUan}p8<8R-OvH|?Ojny`q zKW^{$64~l?+{kPO8~J>F*`K|R$FXDCU7Utl*_fy}5KNlrY5E=XK)8nov|ES054w#u z2@4jT+81F~HuT)F{_X`+sgtr;h|(xG#17Y^uapvJ7lFB|1XrC^`746bR|RE(`eo+i$& zX4v7dUQ=2{S0gZfQ@n25+;`diG4e1Nan=Si4T}~4KRf1l3Q?xCw>Wdu@k;AnV-!@Q z!_k%`%=}2z%Ef$0#)J4_oJn&fNH8Gs6y>E$xCsds={EWDNQa%oF3ac^xz>)pANjw< 
z12yZ0?eHJ?af*(&RSKqBL`T^;9}y!B#7x-LD;7Bz;+55TVzI>;dmVkKvl!)DOL_Z~ zU+c^BcKsCMT}u0HLjKJ4K%@*T7LQgz>{Fw6*eBxr0fk)FAGG@PBssd7@cEYSRT)X0 z1{5h31bMHP9kYnVCf3#tC!p&~5N@Ov`tVA@y?RC6SbOxx;e_M|HpQSh8p#ta+2TXF zATGaI`TCD*gXXB~y)OAJ`NK9w8n(CG)1RCKjr1)MO=3b)kJ|UWGx3A7$G9UgvE9IV z1WOxSpT7|)X!0@pIydce*sBthTke{nE7Fi?(-W^J#?n**PGBM6!r^gOnXe#wdSI)q z*JmQ7j78n?c(9hfDh8l9^cXu-5ab1)@`0)pkRleUv>g+~+Vk`!uw+TQJZTh`3}w&` zX9g)hU#8VXfzV6r+W~hFbyk0jJ`P;Y)%_7Znny+p?+mNgUPqa%#n! zPs%hd-(3(75>RsZYb?!oCnfRnjn0p6g=T0<)Knl;`b}gQHZ>AIZdf3g1CLy&@T}Y# zfIG#-H#f};q>WHQwoATt;*ivcg12XD{$#CBE!N{27c7P(I4l>vAC0+^Bb)VK{nb^H z9OgLz$xDT0H%qb;Xf;5|&eSF{_=}g8Ojhr)#xDWeK!8jSED^H1*(M$7>mYAs(s#1S zEokr(1ZIAc6YRTtwwTHq(3May#!&IEo^y?T%P&tmTTRQmrCJE1FkKLL@h=JOrT0@V zws%dRq0*vs?ZEhB5_#OZ5~l(Q>UXr522UZZsTOhY;oKU(-g(tx?TU}c4pR-_$dw9& z8vveL3vux9^{-$0>2zsBFgl<;X2pQ@m~W}?jA)pKx$j+-H$NO(A2>^U<$qN8Gv51N z-AJv&d}#dKC1}6eT*|bOCGp(`N7Q4H^i8j?lN~!6VpR@A^yhJ-9(TR!ZG51Mel}xZ zv~-UB?PiJR3du#4;?m#0;KiJ4$e>E z!iig?7`R!{~Sq zNU-WzDh-Y{%@7`V%P8YN0i5CZbQC2E2XnE${$#PgeZ#cz`c7%2nM;2vW&4=U*LIYc zHsEoo8*p#n1JrK8xo|E<>%5>qd=9 z>8(=p3AFAqVCgp_g~JbX?$6!jF1t=BF8Y67TFTXs-pHYOldTmmBcs(Qnu}xFE7n+W zpjDbhD%Lr9T+j$oZxS-6&ZAkgKUOUX1s7|A-m}W4zS<&6A36c#dp_}MoPGv8q7>I_bd?NO6@oqy@%u*ylDNsiIAN8Gg(&AD8{g8wp04S_ z?(~<)u_wHPc_JP_Ohb^PwlAKb)^bzCYnmHbM7&BBXVHmg2Y(og9!U}-f6-=F)8gaQAFyVuf@Ua Z*b!`=4vSnZ;e;IkFM=`ADBsYT`X8GF=t}?q diff --git a/assets/thyroid/regularized_logistic_regression_cv_f1_No.png b/assets/thyroid/regularized_logistic_regression_cv_f1_No.png new file mode 100644 index 0000000000000000000000000000000000000000..eccdbd6210489cc20373b005e42a3bf480c591af GIT binary patch literal 3527 zcmcIndpuNm8$Z*gfSzg#1NJkp)t*Sw7c);{p0Pk+kQUp`8?9y)bHB%&IWD6Fr!TLnIvn>N{QdR~PgKe-E=OjEZ#m^<-8 z4ro?(S41RIi9ymjXiDqBDx8F@IP>-27xH(6?@!Qi3`F1KbH^*y`ovR)IfhDOliQeS z1HBCQis~0eC8(%n)5{z6*%kit274Q>)k=^J8OST&^&Y+Z;sY=%+tm87pgE-xU1h!xMl7FDgrWq=_v$%=d-8R673PZRR{Je|EcGmCzugnndm=?$5W6ko zpmvv8tia62ni?UzmAhfg7Yb6THxCah(#wRe&e)I$mD3h^P34lq>`I;R46fBt 
z!kt^SL{T|{$H~-KbjqyWf3>soNI=iIl5cYgU%g+)ld~f*j!fC~ZQc1qY}5E)g1e#B zltr@(|C!QC-gHe{iFoh=W<;0yBhGF2U9Fi@er_W#^}Irlmr`fD!l)68lX$P%p29lj z&$^jb(a|`&u-V!%4!QFXhHN->rMK|n`41T$?n)oVZ-s3LaCk~J=jfl%y-Rlm<#?^16BFSUIn-Pi#&onqdjF=@xC7~?g$TN&pu9j zNv>^5axRUybs}Z3F>m#&(9SE|x3{1T0rT_Rb`SmqSE;G#7fWtdvDqx@gF|!i9j(p7 z_7EaZx7409SRXG|Hpy{&-_(!T4}~eX|ZF{q5{G^?-lnp^WvsRnbX7N zV=mXeetEn2JjJ*?Ftqw{2^&=t8ku=Cq}wHgFyPAgsd&dpvSD9ZXy45{QUqDuvHq|; zhkjSAe_6WAMtb3JPsh3LV-J?jMaY#Mxrq0$r}VlA4bB?>;(Facf2Fo(Fsyny)h{~h zO|1#v@yVpSuN77VX|%1T`Yib&2{u&hsZL0(p50AQ7k*p419!hMJoUR`kQ^mYZN~2| zN71!Cn}Iz`D#V|F)N$_*TPCV?4xI!OS}kKb@bYZ# zfg-Ffoccj(_5B^^Opr*%0ZB7SVYE;t3RzrJ*GuI(EAPx!Z|Ihj)EjxsCaJ6@#MPMW z9a)NbKhS{CpmPX{n3+|388fk$79D zao`!%XOHF^;hcV7{D1f8^S<|bkW6#c={wo6CFT_B^>X6A(=8_+PnR%zT;@K|75260 z&?|qi9xOtK7&h%T;=P3ASQc-x*=}!+Qt#IE3I`XPG$6j zYUji0l1s0Ns^_)$gI*KsY^!kj9OC#Uv_j&ryJpv_6q&vnJNeL zJz+COZFen*iz8u_z$SUX*YWml%q}x|Tz9+qy9uSgq2`ORtL&3YnXh%6_tjtH^iMf%Kp4GX)E8 zm2rswf&3)D-6p4g!8M}6$U&!|J!!4(AyrPFFkz*jXuvQ22W`cdc%q1iw>Oo~jm4t$ zcb}7_w5Z3dP6|SVD<1`X4uxBlpdV}OII5q-T~sB@p@H z=L>uJWr4}uR-6yIcQJTWFt4tZoJY`O0r0aPtbX*E?_0_;?)QIZ;gr6$y+1e+kU-}x z&$pX11dN~@hCz?RL8xjr*|G}R_Ij#0se)U~cPRD1Bok$lTKkAfynw2+p?l~lJPB77 z{feOBL})MH;Idw2sl<=#KVcLv1@>oDO=@MED6B6lCxSkbinb({V(|TE=&&5jf;GKq z%ku^l*pIoV7zE2M-0Y0fQ5B`c5kgos^*-*`qFxtR}T>ThGmhS zLwY)5_xxk``5ajoJ0;eA*p&spbk`IvD_`d{<4h%0c!4+b&{M(xd=O{jJ_Z_8)h5BP z-j$|?W}Nx29T3c&w-Tl#*QRZvrsxgK1UT2 z#J6XhwXXiGlb`RohxRa6t{xiO(^b literal 0 HcmV?d00001 diff --git a/assets/thyroid/regularized_logistic_regression_cv_f1_Yes.png b/assets/thyroid/regularized_logistic_regression_cv_f1_Yes.png new file mode 100644 index 0000000000000000000000000000000000000000..172a9b4147dc9ceca144d6fd80a971eddb5e5d34 GIT binary patch literal 3539 zcmbVPd03L^7XK;?mrPBO1`R7GHJ9v`3!!Q5qq${`jhGEhMv6{ik-28!l$m9inVA|X zGoraBMyQC9<5sR|rs6`8p}AzZLMr#enLAJSkNZ4!c@F1=^Zs7m_nhx{e&0uer@ND) zJVG7-07d5??7aYB4Nv-(T?a;7&!~GU&ND1{ucyy@ZSg$bT)y0c0GS zjKhJ5zflY_nZYG%8PGAnDG7ri;W8u=E*Z@w$*0%z>uKb^=8OtM;no=jlQVK}fz 
z!iFjvZY0&&l%izSp`3SHf6{~`#;rQY|9R@Hnosc=rIMNVbJ;syL)#{U5lX_D%h;x> z&+dhe4tPvEVzTB&&fpl%piR3XYRa|Si&h7lTmivjjKROwxam)_S)s3X7N!z$)q z^O|nqnzmbb*4~V+9{MD^+!QUI9nif|{L;-@W)=N%g4ASa=szCHSs-bh|mz2ql_|hbP1OGwwng%!sC<;C|t5e}$Io{EwYw&#jik)&`t{ z+GWq=@$Q1*Q<-~O>|nml-F~UGyz{=Kg6h5v<>A9v{qOG~x%u?gz%sSF1C%9w* z)u$YM_Qsg7ml-cZ8;yTD$1IkcG&g*8m8b8Xd`529bxImd`It$Ed5GB@-lDx*#U<$J z=%pkvmOjsCDV7yJjSqR`eI8lu8jCDcFsry4h)FS)4122nYVFK6)$j|+>8$yXI!0JI z*N>Uo2k<#>ASK>IoBdOxo@@#DB0JHG3#vr;w@~o}uMPg=*K#psg;$S^kpvDN70Dr{ zjr#T9KZz^cb8m^*wzYu$gXpjH1`z(Q<=o+8G-vJ_v$aK9*#p0#_bvL#wbt)yvY?Mq zxYOdXk$052XZN(qrXRR+s`BP5$9v5nM@8d-uRb~YlISUS-;C|=qRd7F>|EfHcKhPm?riyoZ0h#uM zj;@Vb$LaLEi`UfvT177*_GDjaT5-_90x&|1WM+K-?Ttvg$&eawm(zC$sF@NoBaY3nmL zvCJCr;V?DyWyC*{K$}f1eAiNFs{34az#c$VXeK4?I>9c1?o}nQyE?#K_D}kx1T)ts`U5xSym{|%?{t7wyW3W)gz_JZq`lF>x-dU zlj0s0VJ%MA1~-pWIj_8@go(ZuwTATK-(1FY`i>0M?RJANc{D042U<7YKSeJd7dsG? z|HGZ;Mb2egMJbO^%-ePSSGMurx!k?d>$I+plUjIl<{qnkO&2y4GtX(((lq<4WZ!th zF;PAiT~s`mc;3j?PS+%#)8s$0&4g%6mD2E|l?ice%nGxbwg$~V)`n+it0Lr!BNV;4 z?d@_aI}x(&G#$St&#JnO??{ODnLcj>i9g`WfwGh1dCCEPacSDry9iCgjrT`}M|x{i zH1*xZyl~I?)Idl)u)^si18JgqV9Lsd?m;U_|0=WhO8~E&eWYoTbHYwhy@B&2K+Geb zP%~Q9K~{fBza1H9ghJX^?@T!hIS5kRE6IP4Js}U&j>N246-~We_|g;0#0%F0z#5EM zh3`Z3`LiubHj?Zv9V!QVxgiy+b8VvQ3Q+J?#u_cy>NU>}Esg`7^C1J@@lb$EmY- zs&6#2HHew@M~C`;Un6U^v!;uV*~VV{f?682&<;0q6)X`&FBaIWo6y?IG==UDu8UQ(u;H*Z1|0XZyDT;2!M7H(J)q-OT3W-v!dPX7tWGZvRQ<s)SxHyxLyFEiol2DGF9HP*ITqI$gHe`hq1C+>qWw{H3Ap!9&pp$8mP25!_(A& z*0?1hZ{qnAs~*NXhEiB0vN^TzS@!zG+El~P|CTi?89Uhp)o}x_3&~nG*(%Xt$ts%m zBcAd%dGy2^ysftJUI7q_pFaeug1%#W?y>FvdIaUW66kNrVr=_uYUg#d+H&T&gJ<+8 za6(ed?csY~Rhw8-#KJN`HpL_6H%9F3Y4)wg{8^s!)OaH^3DOohB+BRXYNs?z9y!N) zVo_5$cld+w@uzyokUg!yDQiD3f4#1G?B_1~(@LARsERj0bW^fvjh`P11RkfYW?nBRPP@qcp+)LEJQH8ySY{G8DEXn3H7azRmWaX n%W^F{G;`ySN#>4kQEq_*R)t7T_4Yn0{d4B*;BH@RizWRVgCTp@ literal 0 HcmV?d00001 diff --git a/assets/thyroid/regularized_logistic_regression_cv_precision_No.png b/assets/thyroid/regularized_logistic_regression_cv_precision_No.png new file mode 100644 
index 0000000000000000000000000000000000000000..22dfc9c0b8c5b8b1f06cf251c52964bef506a511 GIT binary patch literal 3720 zcmb_fc|25m8z12vl!;4ptyw2bA=DU4wi(OIj3P_P79&MQm#nE~DzaqaCKU=bB1tnk zvUgGxS*L7+CWGutgpuVv+zu)gX&pg|?W{l`)PQ?%*0x(pFSP&Y*5n)8)CIT9W8y7&NQi)WEvxoqd2ta(Dih)soTTFyei5M!8 z2qCCc%-6$p_obvvrJzMqb&FoTQtn^BgrOu!D+}{&LobC)w}~o?ENQMO?T)J>qU@TXz`FvD~A>rQCi+J-&IE=m)=-EO*{-xUij9um4!k zOuZ6#@D8EoML*7tvO0ZRa3MyAVTsA~GS<_PN%1LNtIoLhMlG+qfGc75b$!1&IW0YK zZ2{woqj0C(kH8zM12}G^XG(!>Wo)d%d%1=BTwc(%nyxcBSbLpa^({WpS;&2m`!KOw zY@+5NJD_jiet+77kgac46c4fd%i4>rT1XdS8J7e5tF6y17VNW65tMt+2^)cbChE>mN9|mjut=gxPALgQPAR;R_nry;c~*N;?(89$<{Enr3vYZ zFKQI7x=#4+@VU8rSy-pyzWNDv7rDsWNV*h#c+{4`E?FsgTbx|8; z6X#&du;)rJeM?8*HKy@`L++FC9@%xWZ`lNQeuUIDEq#%UtmBn<;1F}q+6v0RKs!O0sn~v* z;MaTOfg1@jVk)6D&-!WVUB2ni5b>xZvX879)q4b4QpoWF-79VLFTO>zKZ;2S@y4s7 zHBZ%tX?D9h9<;`2=T&sg%(nmP)xv7tla#Y;-R}SSRLYNsVTLMMNq{GrEM2%Y@Y3&* z>yJWTc=>pMUqJwIiAFxan9%O{IIH8j`P8ZTbb%JyaRsppp1^#8X!fu)mzCLFl0$fv zX=FPYnBP|4-?0N}gE24+6zuo+(O)1N`$X=3F%$|V3Cstu^(Q(03*Kt9VU3E#gUsu+ zq$v8=Qo2)yfF=AwYA(T)BU5?1Z+>7mU#!WTV&v+;Rp%3mXuk4OKEX$$6;ERSGPxKE zt?e*-dC&7KNx z7F#OYT;Th(jU0qbjD0GF&oyZeo>tRel8AJL%U}GFZ1$(Tmr+ON9MW~$hxRSj-BDSV zfo#m4jRt4vrtOZ!vJ8?w@#2W50%S&)Ib3dDnBd!&MS#P%GEba zdWJlCKJ~}}{C+`bLOEQAkyPkwR|~neRGxJxAx%VP3v~7)%A4#Rd!WJ=EK(W6H8OIo zx+1et=YquonyG1r9OrVh$mTDf;4Uujnk?}r{2X%mgGdmjR!Jj-{>V38V^L4wF(T^EgBzIg8#yalK)&677T?&tpVnB z@MDVR=`4k^%UAJiS=j-5_(mAG@v)rEK5>qO2_Sgj;KC_VU&1OPJ6kMcs-Uvm*+2?Z zV5^5a)g1~$0dBM&q9C)m-zDtJ0a!x+Q z{Oaa^{f`=k|C`h1J!6N93lDE=5QQ9+WvaiQp-nb_n9sJI@Q!1f_$$Gs(~^^8;~>N6 z7EwA=pJk$w>pz;31X|-aI-L@gbeq3eEkv$WHD2#Cyms&a zxifX%*Jq*cK2y1RsR1uzGGonserIyaEVqXCF4;pGObDjSsE$k4Epm?-N;y}#6>s5- zwToVxoNK!EDa*r^!QE=w^coFh$!^&=V@`f;H|N4>N^vr*DwHi zuGio;4XaDWYDqX#A~HULv<|x_<4h%&E2z>sSB;*u*2dkKC_Sv!;AX?fbkWK!x@_kF zCU%RU?f7-ZY}iyvEvL515L@P4ju7qS`h2JYX@-;B0iz+4D{} z!=g&fXDdy{yO6G956bs|8iacF^K#3hy_P4x-*iKqx4;*|31mq<4xk*az0^ZM;N@=@ zRya(BYlOwjH4yG3)K_N;R>$FiWK7gc2SmTyTAzbxrm>YHGGsen5b*@z2B+<1i?g5p 
z!Vx%YtiWk6!eb_KpfPCRF)2>7s7jBH3N0vP^VcgSTQ)d+%Q9k2PDkY{cjbC~f6XWz z>vKi`*ZQL)$V_Dqa+rtWpaPJG>6(rpJ+WY>ZsfE-%=0(D6uw*%Y`!X^e&%FTkobJ$ z_PP9sljOyzRf}SoA=1TBmnv@f3BRW_cX<^fwe5~G=Q2#bJL^8@lt)DUI{&~>7X%P; zG|s$lu3=A&t?1+|Po3CYE1#c!7k>Y(xoAhL?}f?Q)01kksb97D<%c{C7~UZ`9yuC2 v7t_^%Er!)U8EHZVVl_`<%{71Mz1QKm=V7`Bge}f&d{J6s@fJm9p5*@k)5+9Y literal 0 HcmV?d00001 diff --git a/assets/thyroid/regularized_logistic_regression_cv_precision_Yes.png b/assets/thyroid/regularized_logistic_regression_cv_precision_Yes.png new file mode 100644 index 0000000000000000000000000000000000000000..775fd4ce91c717e6c0a1837997db37c536b40b44 GIT binary patch literal 3716 zcmc&%dpwlc8y~w<pU<}c{NB&=K4<1R-}9dHoab_$^Q#@sQdC$* z7=b{DT3eafBM<@{{ww%BEIB5OW5Ug--HB7?aEm}-5eNW*;32SB1QtMGsR#gsMerXi z0^rNAFsEVxm_RHQCN_+QaR34Uz98TR;6EThr2Z=Hy%RI2fkKv$<_(;YFZ&yQ; zTZ~Kf1f@{Wt379Ak%^RlESzv(j*@V`<)$z;1HE_kzo%eOTh6m=oFo{I@?^J#_gHC zPjl7=vlHBCeJS|pbcKbyb|Z^kPL*>;{jV6(K?W_$+5fut?iP)L8P7mmYPo}nf*n$_V`5(fV9ao|3|ie|{IJ0C00JUd+{WvSiH zdsDdWtVi+&dv`&=!LcJn+@r&h1~nj$ISXmllrCubnSU_$zeS}dG*Kj zSBImgcTqM}bZa#9B6&>Z8Z67~+4+-HIL>YW;$2F3dDcOA^)=tHTErpvAEElo#Ggd=kd z?-1b+wr*k0ZtAYp1`5ez_xo8AB$;Oxg)p={CUd*X&R!?h#jf*4J)@Gx{8NSn44*;M z^UTOyUc(qMcc`3R@@9G26bg#m`o5B5$247njaINhTbTh)q zf#jvp^6EkoZ6|{HV0iiMlbashwO#7P46b$G?06vU+vAX~Lsk?NNFkq_7jeD(XXyH= z*F>}zO?&Gi6ONvP+STH0iNtj`DY9vPG3R~an_TbI*xRT!=v8G2CDFFe4uyKMQO#JA zy)UE{7DR90w2j>`h+j-8K?WTvm$fuL{2kQ#Hj`ph5#eq@ny`$`O=x-(dUH9^caAeb zZ^G*dlUw`@X{Teds_rNzakl&BQc{O6D5RJW1S>Td&%1**?#Yu)FJbw)W#Z&AYx{Sf zk*O5ZOZ(2!8T%uK6Mu9RUp$pU&w02zPI9$R9uOb~#OMPR8xU;>U^U`P^o*Am+ z<<#dLm`S{5R3{{OL}&o|Ao4A_>Yrz|FG!p}^!9)ZMsYuLTN+4CEsCs_mFc$4a?`OI z5e7r0Fs&!H%c)J`gKG;Ztpm`wiRxKcyZ$szs?B!M*1afx60P+$f8LErm4u$TLcKuD=x@f(h3^2IsM(c6;h|2_+Uqh^0O5no;3>`unaa^hz} zG#Ml>&_xq}sz}*-{2cT~`gwS~GvF$eHfUbi-dGfzT@s&Hsh|B@GB(DXvoOuz$qi886QLghNNIk#<)xAmW)^7A5wwjh0s)@7!DMI z5lVVl!xn?(w>@Adrv%#fcS*orCaZ=fmg-e9_-HiwxdzEg=}o{_d%QsA$oFYolNsG< zK@hAF4Kv8g;k`OO$60wqRO*=YK;|hXn}=y?C^Vgv+V*#0WI-vUg 
zo*t2s@8za#xzOvY6FcNU5to(kJaS-Zpn2HZWb`8oZ8GJOMLMj1>&ERmONCqg*N&LF zDCAtv8|`VsTDk;D%RmO}q-5>3FU>Nij=$FN-EyyLoT!J=5o7V7$XXRrqt&r>aV#)QH!dsJ&d?V2>aX=T1}8Gv&lXBp|) z!C7I?8Tugqcfg1+u-5y~&+t?`Y%NpRZj>@O=BfX8Z-rghPhC&moT8r^f`mOZ+&x-Q<(^AdyGA)*r&ocW~1AW z+h1+5>`BKY%yJ*Rjg6E`c}!H0zApfI>~JUX)Hb-A9sd5{`kq4PA6f@1OcC%LF&j%p zaqA7iS+QDuIAgGM?7CU0@Z(qPrlexdz7M!N&f?ec;z=#f@ba&TrFV%xSERfabllrr z96Wjr{Bc$58paJLzHQ5wnz2%<^S-J_zvIDB9VG4?)LAQ`H_0jW3#scthi8ed9IL3` z(OZiK1|?bAHLkv>2Yud0q-eZHCaTx~*+A*^s$Vc`8wLu9U-PETRmiqaeA2VMuRL1# z!=9kk;WSBJh);)&6<4aTSnpbsq;qIdrEcsd%eJIe^2OUmI}V|a!2I6f#nqW3sEhR) zJM~^*)V8n!q7S>CSb6U)&}HQ~1W%fIh~8o0=u_;=g-8!p!p9xu0*(itLtPvZ1C&$= zu)`mu{fI8+$H4z`V=EIcC=f@EHJ{UXXK1BO?#5`5DS~px7~(PEH^gPlX6z)cb1D*( zID1SIXuexjoBytW&J_yZRAVbQ%juJJALtj2FdOP_$GGQ*&gZDv*uPBUq%r-|8pJ+D zEGkxhN^?QSn&FC3wq$A1iEyO|&GkWBbeCIJK$QJrgz81oN}p|Z$MeQqp~kB?T^2U_ zp#vIM@_G91)h;)s^Ag-`Nacz$X4_j|&TA(h{k795_T0Ub7&Kg>zv>8@DG42dLL%Cl}V=M3X~*ken-)9@*zh27nw_b$5Y5ADZpaFH*S+{x8O`zH5-OzcpQ zQO!xlu6BuUzaNBlBVRk9)5I#J#Dt?_<2MFg=t76PN&C(upT|B7C>ix^EKAIAV6kYo zZbqv_{A`>5>6~`eN^N9x+UjpG6hzAdE2~o#1|v;b`$i^2l9Z<#g}OpZU6q_xJRPkz zOHwqK7St0`LyWd4OycXxfp80fMJ9)W9eQG3ibyD)@k*~^NLMn()&9;re|hQQp_+4f U7w!+GDDZES*5){~5)(Z6zkM^$kpKVy literal 0 HcmV?d00001 diff --git a/assets/thyroid/regularized_logistic_regression_cv_recall_No.png b/assets/thyroid/regularized_logistic_regression_cv_recall_No.png new file mode 100644 index 0000000000000000000000000000000000000000..e196d2bfbb859daa641123971413b9acce502f3f GIT binary patch literal 3665 zcmcgvdo+|=8=q=~DI_t)K zCd+P|XWg*URjAN6dbxpn;G*1gjMbUa=gqGkPx!|!6*zQenlE&*&rb}7CNiy!G90p+Ke=hF&cEIWhgU8hzrcUG1F-s9&q(Ifxb)u-n~k>MH%5Cm z6hhl?F58$oZ&_fU#?b@uHjA##IK2YGAOs&KJj%OtHmc0E;c_NYYkr3H6 zZPGHC{Va>6{vz=1Nh%JlC# z9WyNYn&v3S{k*2yZ{_M#x8#S?(~0zBx@(+XR5 zE{7DzP4Jn|BMowBoDW02R;ma46g{JZ{f+G^GHyFPF7VIo$6Z*hd|E11_XV#O9y%Z; z+Z-M7jDE{jQ()b&4`3C)v8_DhL@1;TY^^-NBr2(Q7B)ND`%vlC>dW;_it`&7N~hq$ zh!X?b{1Xb)d1UhE1^&c;L`>MZCGa zdLG+z3P!u{b)PY9HDg{rNlJ?OZsGI9r)b86$LbM3gWpeOmLr#*z3o?bSTHXiRKt5p 
zuG(HfCu?PsVpuYsYYEPaUa(=4+7$10no89`yNmj&d!-tx`bGG|;%p$}3lAF|>Y+R3 zW051CQG~M2F~Y6yA<`}dtJPHv4cr=%WcIUCK9Z^GxMk|rJ&B)F8+~iiq8=Z8lA-$O z!98)!)PeW!4;hdqspk7WhUh6^E~M#B7fcPjJ+KatbD0)*v8*q)j;RK z{s>>oON@q1E;?FbhB&cG+4M7WukGR0?OrM<=m5+GQ`c{=SfLdH(XmhOiNejI<(D}p z^@Ye~j++HJ%V2a`#B2@I>^aWEsf8bEzKDWNia}>ydrGXm8j0GqbXzGj6tULU?(#A) zdxE%lWsTsjr6?lYsM2ABTRPv9y5+s^6uW!;rlfX#j}GeL;OCM3PEKn$Y_b~U7TzpQM8^KwtumCm=GFO+QqSK{Qh$Z4ltV~mucbF@5Oa$-A0krQzBUSE zyO^7?PK>a&mrG>vY+|fD@(u|TZmxvT{JvZ}f|ah}(djt(^<){8VA|WRN#ZE`M(sYf z%ZJ&pr@J@=?_m3H^m|ye>}_nl>tAJxzl?HqYTtTlTMz( zsx$%`Ai~mD06YODiDVifr9E(ZYemb8?FWj|d~KuP^+%oOPk6ZUt-kKPl-&BFdamyka+-2|#BJ{-Pkal#V?(!FkmDkwMG2w%r$W+wSAJm5&eT(_u-kS60`a?RcKIvF#RfNvv}q^;a# zc65l}hVJXN73`y%@RvHDO6gvPpFJ8PFOR3ht&Ir=(Zw)zc<_R_6s7T}`GeS_{uuf(K|3n=lNa5uBEL~(XA+E`vp zSK~Z-Mr)kdo~Z76O)tfadG=@+LLQw}!10VLlZ?Az>3iTyli{7s;4;8!E0qyne=_b* zig3`Ls)YhMXxvFBM&ym(wjnkaUE&9Ob=we$4Tdtch~vvr1mgy*5<+T{XRM5ZHf68Z z@?xlV3@+0egp{I+M8NO9O4%0hT)HCi6PJkfH$YD&miaDo$SS0NTJ`^T{`f8QhLf=R zgxn%G0}ZnyyHru<24HOsZOdYkJK#EfhhwS?Opg+bGeebX5E5Jux8e3U~AjmFd&CIA(EdXHV^g(#_B~i)%|mT_2b;nt@OIM`AMpcZ5^B#71S5{|HlQ zZGLUB(R<;Ixq(Q33tewJx}3i^;MQm*L4^ zNYPc{{(6;F+gnx`(WvlhoMKVLplMj0hHJ)!YT4;<*Zsb@YTW3?yejGU^vwptaG3m{ zCQ?MppZ=(=acQ}HgU{ZB{&%XJAE>o?CypUWAJKhUqJi+@&kUOE^Qh9UDYsMQ8`tyj z9)}UK>3=q`7!hN!FEY~yZLTKF7-mUd3rmnjn2cM)ZgVO~w>Zj1aW=lt3}-K>s5Cms zNZVfJzqxggdR70WevB$F^L^yKyFO>X1_bIQwQoVa#xw_?DR35d2}j%Y3p(ynF-9!c zW<1;MLw5v!Z+Mv31v7$AZQx2bXF8$TQ|gKS@W3N>4d3?TT-i(`lx;#SG<$TMQ?Pko zZ;YVZ#7AoTZkgq+(0%u^s7d* z=D2K4>b{P(gYqo=P|fkv5jMOuHTPpdFsoMWCghW5|a+tq`lyBSqBi9qb!BO7q UFJpc@`}M0K*3{CZ*vO6WZ@WLcSO5S3 literal 0 HcmV?d00001 diff --git a/assets/thyroid/regularized_logistic_regression_cv_recall_Yes.png b/assets/thyroid/regularized_logistic_regression_cv_recall_Yes.png new file mode 100644 index 0000000000000000000000000000000000000000..d4d22af58dd3488727f782772e5e2caa7bb51218 GIT binary patch literal 3683 zcmb_f3p|u*7oXOY!Mfy2OVsqW0*bK6U zQEu&lb_Mn=i~lIDBIw|&3;`nA8`*Y|z%J9D1rJ~|4Wex$ 
z1_xS%7=y#%K;F{;4vhpUado*et_8H>;O#R59fEwy#B{hX1=&zT^q>S zq*B4Q(Mz|i&-7avH`?Gk9!u0MaT4wP!WXRr=y=?3#%CH65fE_(mT zg~%o`ihXu2P@9}lWN!b~iJCs0Y4CMc$-8$q{f+$yC5_?YNG;vq$fuo^}0TzbvY z%p;<5%!59AxhyG#q)5^+2$AG=QoSErxxMKPD$eeJ)McPW3eFYkP(Y zdz%Il_jTo(MiwU{XJQgx?0%oMzC>p@JyRY^EXu{b?VQo@L%Vxnw3MWMI^Jwy>o6G( zhZk?P?X_Ov=qnPPe~SOG@?3G&dN89LV;6}MKFdz@Z=l*rh98P@=BJiz_2H~SPF1lj z8`rAkqhztJ^M^ocag(F{AYtnVzV7}hg^0xL8iVVz^b?iYPQ?i~iz*-H_$fy(qOpeM zBi~DvE=>(|u3!rCKfEbtwHPd%ANJSs9P)p@7MP7Q$)~VfZmj6ncJ&Z$pXx-NIbb85%Q0>D6Yj#6H zJesToN{%sPdsx7tW%sC|3tq|qjH*kv@O7ACCY9g;vZ0TQZvgn)&_+6V# zvhr}V%Pd`YQeD`IeS{Htqp0{<5MR~RGa9|pQvw>+0V^|?&X%(>WMg}I;fW~h`-xV( zNwvk1Dl5N$Fvh{IM}Xr1WK4PWQ5DaCUN9fLsGtW)^LU{lB(6me`_ zOR9r%eK|`Dq;%<^T_^2>ctV!Cb!1pO-Nd-jm$V}P~@Ve1$x-i&~Yh+7@_Vn|yMGEP!6|8oI zh=Qh0NA;V0{p`1V4#I5CQD05e-DxenuX4Duc%$R#%&W84zO?@KpUl@L$Rc)R5ZrI= zbop4fZUQQM7s{iD1i=Hn&LQd@MGjatCMSt0f>(-K9|ydnPq?vQmesnI4CeR^&06rH zvFgTmD3n)NeVOyVSSux)>yeQ$9(<=}eY8+Jn|t!fxORUC9MH8O69G@_kNJ)gr4i7T zq((6LIfBnd{(Q4OKeSUz>M6{FiOLv+(2kKm0p*?I8aZk5a?W*JXoF~HbBb7BCR50K z(2HjqbMVAAvgG0a3U@w6`Zut3)f{|aj93#p!6&q%9RC+^E30Ehi7Zd=&T>e$bO$?@ zH49eLrdwn?y3c2qw!wR<8qLL z{|jh;dU%yP+lis-my2KOY{=>&M)+yxv^yVnVx*OWcB*>57?J{RM#wq3x%8E;sU-V1 zuo8{k=504^o-5}?_ztQPh!;-PLTTH612-isBo1X2GzX9GS>#Q*l9s6~K+V{dO3bWC zpFHAE=1~O4YtIsGF^OvIq;0EDn~need*A&2i%chew>0{qD7Pn(Q}?87PyFShY$W2V zG)0@@FxaL3=&pW&!^S2Gnqu$A&eh-3c#GEizWL$z-uyr@rB>WLe%C5fnVU<`L8n zyWk30tamBXOT|5;hP*0Txx4dY z!bl}XcorO<*4pYfuGV|tGsW=&Q59dllUv#Ig^3G8~vpdz2K_m!}KTY zMx%EBxaB&mCJa=V9E@>BKFUelS7sRR2k%LqX)01FLXPF=yu+Pb4Fjb`P}=V%g?p~u zO}N+eE088h$W00jG|)j@U{+sNI7&QSG3e8h$3s zT8YjIrHdwfLV4b2B}YINs@Et!`~0_V^^nf^4OvKLR25#Bk@N@0n=C1#EqGQkyV6@_ zWnFV0?DE}(LlRI>XyJKCwgY}M5x zf1lo&?_K1hCqGyPSJf||u%lTD4*SS+E@7jqyY=e!>hl%KNDXE(UEdxxl+%s%Sr{Uo zxmq7Ht)sg2$Ne~V;}OuMZ|1eVpK#P62+KV6Avhu)Yrlly0XuwH2gCGP>+7#OQ(8uF z3=bKn)K=*7y}ppQQH~m?xCa*~(~np?&l4stwrfn@iS8*HURiB+6lM3_jUOL&UwlJ> zGfpUY7BJO2a+u3Gi0B{Z?g*~L%EURAN_B|-U67xx&u!tk9{99_&<#WU;RHq2vQB5& 
zR14qe+MEJm@|RMc&Z*ajr;v?NdMg`>GZy#D2ri58kwjDPqj&S_%`I$yj~5GMFcY+G zJu&o>t(-f>2AJQXPQS@xHo+ODVFrhRq}ra1Zx%-j!*ma>I?UVf(UbxJKXGOP|ENE) t(gQ9~%}BG6cU6rsQ}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-CurveResult-new}{}}} +\subsection{Method \code{new()}}{ +Create a new CurveResult object. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{CurveResult$new(results, curve_type)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{results}}{Named list of curve results keyed by model name.} + +\item{\code{curve_type}}{Character. Either "roc" or "pr".} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-CurveResult-get_model}{}}} +\subsection{Method \code{get_model()}}{ +Retrieve curve results for a specific model. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{CurveResult$get_model(model = NULL)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{model}}{Character. Model name. \code{NULL} returns all models. +Default is \code{NULL}.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +A named list of curve results, or all results if \code{model} +is \code{NULL}. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-CurveResult-get_probs}{}}} +\subsection{Method \code{get_probs()}}{ +Retrieve predicted probabilities for a model partition. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{CurveResult$get_probs(model, partition, set)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{model}}{Character. Model name.} + +\item{\code{partition}}{Character. "split" or "fold1".."foldN".} + +\item{\code{set}}{Character. "train" or "test".} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +A numeric vector of predicted probabilities. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-CurveResult-get_auc}{}}} +\subsection{Method \code{get_auc()}}{ +Retrieve the area under the curve (AUC). +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{CurveResult$get_auc(model = NULL, partition = "split", set = "test")}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{model}}{Character. Model name. If \code{NULL}, returns AUC for +all models as a named vector. Default is \code{NULL}.} + +\item{\code{partition}}{Character. "split" or "fold1".."foldN". Default is +\code{"split"}.} + +\item{\code{set}}{Character. "train" or "test". Default is \code{"test"}.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +A numeric value or named numeric vector of AUC values. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-CurveResult-get_max_f1}{}}} +\subsection{Method \code{get_max_f1()}}{ +Retrieve the maximum F1 score. Only available for +Precision-Recall curves. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{CurveResult$get_max_f1(model = NULL, partition = "split", set = "test")}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{model}}{Character. Model name. If \code{NULL}, returns max F1 for +all models as a named vector. Default is \code{NULL}.} + +\item{\code{partition}}{Character. "split" or "fold1".."foldN". Default is +\code{"split"}.} + +\item{\code{set}}{Character. "train" or "test". Default is \code{"test"}.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +A numeric value, named numeric vector, or \code{NULL} if the +curve type is not "pr". +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-CurveResult-get_optimal_threshold}{}}} +\subsection{Method \code{get_optimal_threshold()}}{ +Retrieve the optimal threshold. For ROC curves, this is +Youden's Index. For PR curves, this is the threshold that maximizes the +F1 score. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{CurveResult$get_optimal_threshold( + model = NULL, + partition = "split", + set = "test" +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{model}}{Character. Model name. If \code{NULL}, returns optimal +thresholds for all models as a named vector. Default is \code{NULL}.} + +\item{\code{partition}}{Character. "split" or "fold1".."foldN". Default is +\code{"split"}.} + +\item{\code{set}}{Character. "train" or "test". Default is \code{"test"}.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +A numeric value or named numeric vector of optimal thresholds. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-CurveResult-get_metrics}{}}} +\subsection{Method \code{get_metrics()}}{ +Retrieve curve metrics (FPR/TPR for ROC, or +precision/recall for PR curves). +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{CurveResult$get_metrics(model, partition = "split", set = "test")}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{model}}{Character. Model name.} + +\item{\code{partition}}{Character. "split" or "fold1".."foldN". Default is +\code{"split"}.} + +\item{\code{set}}{Character. "train" or "test". Default is \code{"test"}.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +A named list containing the curve metrics. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-CurveResult-compare}{}}} +\subsection{Method \code{compare()}}{ +Compare AUC across all models for a given partition. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{CurveResult$compare(partition, set)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{partition}}{Character. "split" or "fold1".."foldN".} + +\item{\code{set}}{Character. "train" or "test".} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +A data.frame with columns \code{model} and \code{auc}. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-CurveResult-clone}{}}} +\subsection{Method \code{clone()}}{ +The objects of this class are cloneable with this method. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{CurveResult$clone(deep = FALSE)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{deep}}{Whether to make a deep clone.} +} +\if{html}{\out{
}} +} +} +} diff --git a/man/Vswift.Rd b/man/Vswift.Rd new file mode 100644 index 0000000..9bbd107 --- /dev/null +++ b/man/Vswift.Rd @@ -0,0 +1,755 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/r6_classes.R +\name{Vswift} +\alias{Vswift} +\title{Classification Results} +\description{ +An R6 class containing classification results produced by \code{\link{class_cv}}. +Provides methods for accessing metrics, trained models, data partitions, +and generating plots and curves. +} +\examples{ +result <- class_cv( + data = iris, + target = "Species", + models = c("svm", "lda"), + train_params = list(split = 0.8, n_folds = 5, random_seed = 123) +) + +result$summary() +result$metrics("svm", "cv") +result$plot(metrics = "f1") + + +## ------------------------------------------------ +## Method `Vswift$print` +## ------------------------------------------------ + +# Load an example dataset + +data(iris) + +# Perform a train-test split with an 80\% training set using LDA + +results <- class_cv( + data = iris, + target = "Species", + models = "lda", + train_params = list(split = 0.8, stratified = TRUE, random_seed = 123) +) + +# Print parameter information and performance metrics +results$print() + + +## ------------------------------------------------ +## Method `Vswift$plot` +## ------------------------------------------------ + +# Load an example dataset +data(iris) + +# Perform a train-test split with an 80\% training set and stratified +# sampling using QDA +results <- class_cv( + data = iris, + target = "Species", + models = "qda", + train_params = list(split = 0.8, stratified = TRUE, random_seed = 123), + save = list(models = TRUE) +) + + +# Plot performance metrics for train-test split + +results$plot(class_names = "setosa", metrics = "f1") + + +## ------------------------------------------------ +## Method `Vswift$roc_curve` +## ------------------------------------------------ + +# Load an example dataset +data <- iris + +# Make 
Binary +data$Species <- ifelse(data$Species == "setosa", "setosa", "not setosa") + +# Perform a train-test split with an 80\% training set and stratified +# sampling using QDA +results <- class_cv( + data = data, + target = "Species", + models = "qda", + train_params = list(split = 0.8, stratified = TRUE, random_seed = 123), + save = list(data = TRUE, models = TRUE) +) + +# Get ROC curve +results$roc_curve(return_output = FALSE) + + +## ------------------------------------------------ +## Method `Vswift$pr_curve` +## ------------------------------------------------ + +# Load an example dataset +data <- iris + +# Make Binary +data$Species <- ifelse(data$Species == "setosa", "setosa", "not setosa") + +# Perform a train-test split with an 80\% training set and stratified +# sampling using QDA +results <- class_cv( + data = data, + target = "Species", + models = "qda", + train_params = list(split = 0.8, stratified = TRUE, random_seed = 123), + save = list(data = TRUE, models = TRUE) +) + +# Get PR curve +results$pr_curve(return_output = FALSE) +} +\section{Active bindings}{ +\if{html}{\out{
}} +\describe{ +\item{\code{classes}}{Character vector of target classes.} + +\item{\code{n_models}}{Number of models in this result.} + +\item{\code{model_names}}{Character vector of model names.} + +\item{\code{has_split}}{TRUE if train-test split was performed.} + +\item{\code{has_cv}}{TRUE if cross-validation was performed.} + +\item{\code{n_folds}}{Number of CV folds. NULL if no CV.} +} +\if{html}{\out{
}} +} +\section{Methods}{ +\subsection{Public methods}{ +\itemize{ +\item \href{#method-Vswift-new}{\code{Vswift$new()}} +\item \href{#method-Vswift-metrics}{\code{Vswift$metrics()}} +\item \href{#method-Vswift-configs}{\code{Vswift$configs()}} +\item \href{#method-Vswift-get_trained_model}{\code{Vswift$get_trained_model()}} +\item \href{#method-Vswift-get_imputation_model}{\code{Vswift$get_imputation_model()}} +\item \href{#method-Vswift-get_missing_data_summary}{\code{Vswift$get_missing_data_summary()}} +\item \href{#method-Vswift-get_partition}{\code{Vswift$get_partition()}} +\item \href{#method-Vswift-class_info}{\code{Vswift$class_info()}} +\item \href{#method-Vswift-available_models}{\code{Vswift$available_models()}} +\item \href{#method-Vswift-print}{\code{Vswift$print()}} +\item \href{#method-Vswift-plot}{\code{Vswift$plot()}} +\item \href{#method-Vswift-summary}{\code{Vswift$summary()}} +\item \href{#method-Vswift-roc_curve}{\code{Vswift$roc_curve()}} +\item \href{#method-Vswift-pr_curve}{\code{Vswift$pr_curve()}} +\item \href{#method-Vswift-clone}{\code{Vswift$clone()}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Vswift-new}{}}} +\subsection{Method \code{new()}}{ +Create a new vswift result object. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Vswift$new( + configs, + class_summary, + metrics, + trained_models, + missing_data_summary, + data_partitions, + imputation_models +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{configs}}{List of configuration parameters.} + +\item{\code{class_summary}}{List with class-level info.} + +\item{\code{metrics}}{Named list of per-model metric dataframes.} + +\item{\code{trained_models}}{Named list of trained models.} + +\item{\code{missing_data_summary}}{Named list of missing data information.} + +\item{\code{data_partitions}}{List with indices and dataframes.} + +\item{\code{imputation_models}}{List of prep objects.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Vswift-metrics}{}}} +\subsection{Method \code{metrics()}}{ +Retrieve evaluation metrics. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Vswift$metrics(model, type = NULL)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{model}}{Character. Model name. \code{NULL} returns all.} + +\item{\code{type}}{Character. "split" or "cv". \code{NULL} returns all for that +model. Default is \code{NULL}.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +A data.frame or named list. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Vswift-configs}{}}} +\subsection{Method \code{configs()}}{ +Retrieve configuration parameters. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Vswift$configs(param = NULL, keys = NULL)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{param}}{Character. Config key. \code{NULL} returns all. Default +is \code{NULL}.} + +\item{\code{keys}}{Character or list of characters. The sub-keys within param. +\code{NULL} returns all keys of \code{param}. Default is \code{NULL}.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +The requested configuration value. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Vswift-get_trained_model}{}}} +\subsection{Method \code{get_trained_model()}}{ +Retrieve trained model objects. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Vswift$get_trained_model(model = NULL, partition = NULL)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{model}}{Character. Model name. If \code{NULL}, returns +all models. Default is \code{NULL}.} + +\item{\code{partition}}{Character. "split", "final", or "fold1".."foldN". +If \code{NULL}, returns all partitions. Default is \code{NULL}.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +A trained model object or named list. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Vswift-get_imputation_model}{}}} +\subsection{Method \code{get_imputation_model()}}{ +Retrieve the imputation objects. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Vswift$get_imputation_model(model = NULL, partition = NULL)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{model}}{Character. Model name. If \code{NULL}, returns +all models. Default is \code{NULL}.} + +\item{\code{partition}}{Character. "split", "final", or "fold1".."foldN". +If \code{NULL}, returns all partitions. Default is \code{NULL}.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +An imputation model object or named list. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Vswift-get_missing_data_summary}{}}} +\subsection{Method \code{get_missing_data_summary()}}{ +Retrieve missing data summary. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Vswift$get_missing_data_summary(what = NULL)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{what}}{Character. The specific missing data information. +\code{NULL} returns all. Default is \code{NULL}.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +The requested missing data information. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Vswift-get_partition}{}}} +\subsection{Method \code{get_partition()}}{ +Retrieve data partition information. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Vswift$get_partition(what = NULL, partition = NULL, set = NULL)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{what}}{Character. "indices", "proportions", or "dataframes". +Default is \code{NULL}.} + +\item{\code{partition}}{Character. "split" or "fold1".."foldN". If \code{NULL}, +returns all partitions. Default is \code{NULL}.} + +\item{\code{set}}{Character. "train" or "test". If \code{NULL}, returns both the +training and test sets. Default is \code{NULL}.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Requested partition data. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Vswift-class_info}{}}} +\subsection{Method \code{class_info()}}{ +Retrieve class summary information. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Vswift$class_info(what)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{what}}{Character. "classes", "keys", "proportions", or "indices".} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +The requested class summary component. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Vswift-available_models}{}}} +\subsection{Method \code{available_models()}}{ +List models present in this result object. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Vswift$available_models()}\if{html}{\out{
}} +} + +\subsection{Returns}{ +Character vector of model names. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Vswift-print}{}}} +\subsection{Method \code{print()}}{ +Prints model configuration details and/or model evaluation +metrics (classification accuracy, precision, recall, and F1 scores). +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Vswift$print(configs = TRUE, metrics = TRUE, models = NULL)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{configs}}{A logical value indicating whether to print model +configuration information from the vswift class. Default is \code{TRUE}.} + +\item{\code{metrics}}{A logical value indicating whether to print model +evaluation metrics from the vswift class. If \code{TRUE}, precision, +recall, and F1 scores for each class will be displayed, along with their +mean values (if cross-validation was used). Default is \code{TRUE}.} + +\item{\code{models}}{A character string or a character vector specifying the +classification algorithm(s) information to be printed. If \code{NULL}, +all model information will be printed. The following options are +available: +\itemize{ + \item \code{"lda"}: Linear Discriminant Analysis + \item \code{"qda"}: Quadratic Discriminant Analysis + \item \code{"logistic"}: Unregularized Logistic Regression + \item \code{"regularized_logistic"}: Regularized Logistic Regression + \item \code{"svm"}: Support Vector Machine + \item \code{"naivebayes"}: Naive Bayes + \item \code{"nnet"}: Neural Network + \item \code{"knn"}: K-Nearest Neighbors + \item \code{"decisiontree"}: Decision Tree + \item \code{"randomforest"}: Random Forest + \item \code{"multinom"}: Unregularized Multinomial Logistic Regression + \item \code{"regularized_multinomial"}: Regularized Multinomial Logistic + Regression + \item \code{"xgboost"}: Extreme Gradient Boosting + } + Default = \code{NULL}.} + +\item{\code{...}}{No additional arguments are currently supported.} +} +\if{html}{\out{
}} +} +\subsection{Examples}{ +\if{html}{\out{
}} +\preformatted{# Load an example dataset + +data(iris) + +# Perform a train-test split with an 80\% training set using LDA + +results <- class_cv( + data = iris, + target = "Species", + models = "lda", + train_params = list(split = 0.8, stratified = TRUE, random_seed = 123) +) + +# Print parameter information and performance metrics +results$print() + +} +\if{html}{\out{
}} + +} + +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Vswift-plot}{}}} +\subsection{Method \code{plot()}}{ +Plots classification metrics (accuracy, precision, recall, +and f1 for each class). +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Vswift$plot( + metrics = c("accuracy", "precision", "recall", "f1"), + models = NULL, + split = TRUE, + cv = TRUE, + class_names = NULL, + path = NULL, + ... +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{metrics}}{A character vector indicating which metrics to plot. +Supported options are "accuracy", "recall", "precision", "f1". Default is +\code{c("accuracy", "precision", "recall", "f1")}.} + +\item{\code{models}}{A character string or a character vector specifying the +classification algorithm(s) evaluation metrics to plot. If \code{NULL}, +all models will be plotted. The following options are available: +\itemize{ + \item \code{"lda"}: Linear Discriminant Analysis + \item \code{"qda"}: Quadratic Discriminant Analysis + \item \code{"logistic"}: Unregularized Logistic Regression + \item \code{"regularized_logistic"}: Regularized Logistic Regression + \item \code{"svm"}: Support Vector Machine + \item \code{"naivebayes"}: Naive Bayes + \item \code{"nnet"}: Neural Network + \item \code{"knn"}: K-Nearest Neighbors + \item \code{"decisiontree"}: Decision Tree + \item \code{"randomforest"}: Random Forest + \item \code{"multinom"}: Unregularized Multinomial Logistic Regression + \item \code{"regularized_multinomial"}: Regularized Multinomial Logistic + Regression + \item \code{"xgboost"}: Extreme Gradient Boosting + } + Default = \code{NULL}.} + +\item{\code{split}}{A logical value indicating whether to plot metrics for the +train-test split results. Default is \code{TRUE}.} + +\item{\code{cv}}{A logical value indicating whether to plot metrics for +cross-validation results. Default is \code{TRUE}.} + +\item{\code{class_names}}{A vector of the specific classes to plot. If +\code{NULL}, plots are generated for all classes. Default is \code{NULL}.} + +\item{\code{path}}{A character string specifying the directory (with a trailing +slash) to save the plots. +Default is \code{NULL}.} + +\item{\code{...}}{Additional arguments passed to the \code{png} function.} +} +\if{html}{\out{
}} +} +\subsection{Examples}{ +\if{html}{\out{
}} +\preformatted{# Load an example dataset +data(iris) + +# Perform a train-test split with an 80\% training set and stratified +# sampling using QDA +results <- class_cv( + data = iris, + target = "Species", + models = "qda", + train_params = list(split = 0.8, stratified = TRUE, random_seed = 123), + save = list(models = TRUE) +) + + +# Plot performance metrics for train-test split + +results$plot(class_names = "setosa", metrics = "f1") + +} +\if{html}{\out{
}} + +} + +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Vswift-summary}{}}} +\subsection{Method \code{summary()}}{ +Print a compact summary of results. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Vswift$summary()}\if{html}{\out{
}} +} + +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Vswift-roc_curve}{}}} +\subsection{Method \code{roc_curve()}}{ +Produces ROC curves and computes the area under the curve +(AUC) and Youden's Index. Only works for binary classification tasks. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Vswift$roc_curve( + data = NULL, + models = NULL, + split = TRUE, + cv = TRUE, + thresholds = NULL, + return_output = TRUE, + path = NULL, + ... +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{data}}{A data frame. If \code{NULL}, then the preprocessed data must +be saved using \code{save = list("data" = TRUE)} in \code{class_cv}. +Default = \code{NULL}.} + +\item{\code{models}}{A character string or a character vector specifying the +classification algorithm(s) to plot curves for. If \code{NULL}, all +models will be plotted. The following options are available: +\itemize{ + \item \code{"lda"}: Linear Discriminant Analysis + \item \code{"qda"}: Quadratic Discriminant Analysis + \item \code{"logistic"}: Unregularized Logistic Regression + \item \code{"regularized_logistic"}: Regularized Logistic Regression + \item \code{"svm"}: Support Vector Machine + \item \code{"naivebayes"}: Naive Bayes + \item \code{"nnet"}: Neural Network + \item \code{"knn"}: K-Nearest Neighbors + \item \code{"decisiontree"}: Decision Tree + \item \code{"randomforest"}: Random Forest + \item \code{"multinom"}: Unregularized Multinomial Logistic Regression + \item \code{"regularized_multinomial"}: Regularized Multinomial Logistic + Regression + \item \code{"xgboost"}: Extreme Gradient Boosting + } + Default = \code{NULL}.} + +\item{\code{split}}{A logical value indicating whether to plot curves for the +train-test split results. Default is \code{TRUE}.} + +\item{\code{cv}}{A logical value indicating whether to plot curves for +cross-validation results. Default is \code{TRUE}.} + +\item{\code{thresholds}}{A numerical vector specifying the thresholds to use +when producing the curves. If left as \code{NULL} the unique probability +values produced by the training model will be used as thresholds. +Default is \code{NULL}.} + +\item{\code{return_output}}{A logical value indicating whether to return the +output list. Default is \code{TRUE}.} + +\item{\code{path}}{A character string specifying the directory (with a trailing +slash) to save the plots. 
Default is \code{NULL}.} + +\item{\code{...}}{Additional arguments passed to the \code{png} function.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +A \code{\link{CurveResult}} object containing thresholds, target +labels, false positive rates (FPR), true positive rates (TPR), area under +the curve (AUC), and Youden's Index for all training and validation sets +for each model. +} +\subsection{Examples}{ +\if{html}{\out{
}} +\preformatted{# Load an example dataset +data <- iris + +# Make Binary +data$Species <- ifelse(data$Species == "setosa", "setosa", "not setosa") + +# Perform a train-test split with an 80\% training set and stratified +# sampling using QDA +results <- class_cv( + data = data, + target = "Species", + models = "qda", + train_params = list(split = 0.8, stratified = TRUE, random_seed = 123), + save = list(data = TRUE, models = TRUE) +) + +# Get ROC curve +results$roc_curve(return_output = FALSE) + +} +\if{html}{\out{
}} + +} + +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Vswift-pr_curve}{}}} +\subsection{Method \code{pr_curve()}}{ +Produces PR curves and computes the area under the curve +(AUC) and the threshold with the maximum F1 score. Only works for binary +classification tasks. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Vswift$pr_curve( + data = NULL, + models = NULL, + split = TRUE, + cv = TRUE, + thresholds = NULL, + return_output = TRUE, + path = NULL, + ... +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{data}}{A data frame. If \code{NULL}, then the preprocessed data +must be saved using \code{save = list("data" = TRUE)} in \code{class_cv}. +Default = \code{NULL}.} + +\item{\code{models}}{A character string or a character vector specifying the +classification algorithm(s) to plot curves for. If \code{NULL}, all +models will be plotted. The following options are available: +\itemize{ + \item \code{"lda"}: Linear Discriminant Analysis + \item \code{"qda"}: Quadratic Discriminant Analysis + \item \code{"logistic"}: Unregularized Logistic Regression + \item \code{"regularized_logistic"}: Regularized Logistic Regression + \item \code{"svm"}: Support Vector Machine + \item \code{"naivebayes"}: Naive Bayes + \item \code{"nnet"}: Neural Network + \item \code{"knn"}: K-Nearest Neighbors + \item \code{"decisiontree"}: Decision Tree + \item \code{"randomforest"}: Random Forest + \item \code{"multinom"}: Unregularized Multinomial Logistic Regression + \item \code{"regularized_multinomial"}: Regularized Multinomial Logistic + Regression + \item \code{"xgboost"}: Extreme Gradient Boosting + } + Default = \code{NULL}.} + +\item{\code{split}}{A logical value indicating whether to plot curves for the +train-test split results. Default is \code{TRUE}.} + +\item{\code{cv}}{A logical value indicating whether to plot curves for +cross-validation results. Default is \code{TRUE}.} + +\item{\code{thresholds}}{A numerical vector specifying the thresholds to use +when producing the curves. If left as \code{NULL} the unique probability +values produced by the training model will be used as thresholds. +Default is \code{NULL}.} + +\item{\code{return_output}}{A logical value indicating whether to return the +output list. Default is \code{TRUE}.} + +\item{\code{path}}{A character string specifying the directory (with a trailing +slash) to save the plots. 
+Default is \code{NULL}.} + +\item{\code{...}}{Additional arguments passed to the \code{png} function.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +A \code{\link{CurveResult}} object containing thresholds, target +labels, precision, recall, area under the curve (AUC), and maximum F1 +score and its associated optimal threshold for all training and validation +sets for each model. +} +\subsection{Examples}{ +\if{html}{\out{
}} +\preformatted{# Load an example dataset +data <- iris + +# Make Binary +data$Species <- ifelse(data$Species == "setosa", "setosa", "not setosa") + +# Perform a train-test split with an 80\% training set and stratified +# sampling using QDA +results <- class_cv( + data = data, + target = "Species", + models = "qda", + train_params = list(split = 0.8, stratified = TRUE, random_seed = 123), + save = list(data = TRUE, models = TRUE) +) + +# Get PR curve +results$pr_curve(return_output = FALSE) +} +\if{html}{\out{
}} + +} + +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Vswift-clone}{}}} +\subsection{Method \code{clone()}}{ +The objects of this class are cloneable with this method. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Vswift$clone(deep = FALSE)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{deep}}{Whether to make a deep clone.} +} +\if{html}{\out{
}} +} +} +} diff --git a/man/classCV.Rd b/man/class_cv.Rd similarity index 50% rename from man/classCV.Rd rename to man/class_cv.Rd index ca4e0e0..45435e8 100644 --- a/man/classCV.Rd +++ b/man/class_cv.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/classCV.R -\name{classCV} -\alias{classCV} +% Please edit documentation in R/class_cv.R +\name{class_cv} +\alias{class_cv} \title{Perform Train-Test Splitting and/or Cross-Validation on Classification Data} \usage{ -classCV( +class_cv( data, formula = NULL, target = NULL, @@ -23,17 +23,20 @@ classCV( \arguments{ \item{data}{A data frame.} -\item{formula}{A formula specifying the model to use. This argument cannot be used when \code{target} -(and optionally \code{predictors}) is specified. Default is \code{NULL}.} +\item{formula}{A formula specifying the model to use. This argument cannot +be used when \code{target} (and optionally \code{predictors}) is specified. +Default is \code{NULL}.} -\item{target}{The name or numerical index of the target (response) variable in \code{data}. This argument cannot be -used when \code{formula} is specified. Default is \code{NULL}.} +\item{target}{The name or numerical index of the target (response) variable +in \code{data}. This argument cannot be used when \code{formula} is specified. +Default is \code{NULL}.} -\item{predictors}{A vector of variable names or numerical indices indicating the predictors in \code{data}, -used in conjunction with \code{target}. Default is \code{NULL}.} +\item{predictors}{A vector of variable names or numerical indices indicating +the predictors in \code{data}, used in conjunction with \code{target}. +Default is \code{NULL}.} -\item{models}{A character string or a character vector specifying the classification algorithm(s) to use. -The following options are available: +\item{models}{A character string or a character vector specifying the +classification algorithm(s) to use. 
The following options are available: \itemize{ \item \code{"lda"}: Linear Discriminant Analysis \item \code{"qda"}: Quadratic Discriminant Analysis @@ -46,168 +49,218 @@ The following options are available: \item \code{"decisiontree"}: Decision Tree \item \code{"randomforest"}: Random Forest \item \code{"multinom"}: Unregularized Multinomial Logistic Regression - \item \code{"regularized_multinomial"}: Regularized Multinomial Logistic Regression + \item \code{"regularized_multinomial"}: Regularized Multinomial Logistic + Regression \item \code{"xgboost"}: Extreme Gradient Boosting } \strong{Notes:} \itemize{ - \item \code{"knn"}: The \code{ks} parameter should be set to specify the desired value of \emph{k}, ensuring that - the same value is used in all folds. If \code{ks} is not provided, the optimal \emph{k} is automatically selected - using the \pkg{kknn} package. + \item \code{"knn"}: The \code{ks} parameter should be set to specify the + desired value of \emph{k}, ensuring that the same value is used in all + folds. If \code{ks} is not provided, the optimal \emph{k} is automatically + selected using the \pkg{kknn} package. \item \code{"nnet"}: An additional argument \code{size} must be specified. - \item \code{"regularized_logistic"} and \code{"regularized_multinomial"}: If \code{"lambda"} is specified in the - additional arguments and is a vector of length > 1, then internal nested cross-validation is done on the training - set to determine the optimal lambda value. The number of folds for the nested cross-validation can be - specified by using \code{nfolds} in the additional arguments. If \code{"stratified"} is \code{TRUE}, then the - relative proportions of the classes in the training set will be retained in each fold. - \item \code{"xgboost"}: The following \code{objective} functions are supported: - \code{"reg:logistic"}, \code{"binary:logistic"}, \code{"binary:logitraw"}, \code{"binary:hinge"}, - and \code{"multi:softprob"}. 
+ \item \code{"regularized_logistic"} and \code{"regularized_multinomial"}: + If \code{"lambda"} is specified in the additional arguments and is a vector + of length > 1, then internal nested cross-validation is done on the + training set to determine the optimal lambda value. The number of folds for + the nested cross-validation can be specified by using \code{n_folds} in the + additional arguments. If \code{"stratified"} is \code{TRUE}, then the + relative proportions of the classes in the training set will be retained + in each fold. + \item \code{"xgboost"}: The following \code{objective} functions are + supported: \code{"reg:logistic"}, \code{"binary:logistic"}, + \code{"binary:logitraw"}, \code{"binary:hinge"}, and \code{"multi:softprob"}. }} \item{model_params}{A list that can include the following elements: \itemize{ - \item \code{"map_args"}: A list of named sub-lists used when more than one model is specified in \code{models}. - Each sub-list corresponds to a particular model in the \code{models}] parameter and contains the arguments that will - be passed to that model. Default is \code{NULL}. Refer to the "Additional Model Parameters" section for acceptable - arguments. - \item \code{"threshold"}: A numeric value in the interval [0, 1] that serves as the cutoff value for assigning binary targets. - Observations are assigned to the class coded as "1" if \code{P(Class = 1 | Features) >= threshold}; otherwise, they - are assigned to the class coded as "0". A default threshold of 0.5 is used when \code{"logistic"} is included in - \code{models}, or when \code{"xgboost"} is included in \code{models} with one of these objective functions: - \code{"reg:logistic"}, \code{"binary:logistic"}, or \code{"binary:logitraw"}. If \code{NULL}, the remaining models - will use there respective default assignment methods (maximizing the posterior probability). Default is \code{NULL}. 
- \item \code{"rule"}: A character that dictates the rule used to select the optimal lambda when using - \code{regularized_logistic} or \code{"regularized_multinomial"}. Available options are: \code{"min"} or + \item \code{"map_args"}: A list of named sub-lists used when more than one + model is specified in \code{models}. Each sub-list corresponds to a + particular model in the \code{models} parameter and contains the + arguments that will be passed to that model. Default is \code{NULL}. Refer + to the "Additional Model Parameters" section for acceptable arguments. + \item \code{"threshold"}: A numeric value in the interval [0, 1] that serves + as the cutoff value for assigning binary targets. Observations are assigned + to the class coded as "1" if \code{P(Class = 1 | Features) >= threshold}; + otherwise, they are assigned to the class coded as "0". A default threshold + of 0.5 is used when \code{"logistic"} is included in \code{models}, or when + \code{"xgboost"} is included in \code{models} with one of these objective + functions: \code{"reg:logistic"}, \code{"binary:logistic"}, or + \code{"binary:logitraw"}. If \code{NULL}, the remaining models will use + their respective default assignment methods (maximizing the posterior + probability). Default is \code{NULL}. + \item \code{"rule"}: A character that dictates the rule used to select + the optimal lambda when using \code{"regularized_logistic"} or + \code{"regularized_multinomial"}. Available options are: \code{"min"} or \code{"1se"}. Default is \code{"min"}. - \item \code{verbose}: A logical value indicating whether to state the optimal lambda based on the nested - cross-validation. \item \code{"final_model"}: A logical value indicating whether to use all complete observations - in the input data for model training. Default is \code{FALSE}. + \item \code{verbose}: A logical value indicating whether to state the + optimal lambda based on the nested cross-validation.
+ \item \code{"final_model"}: A logical value indicating whether to use all + complete observations in the input data for model training. Default is + \code{FALSE}. }} \item{train_params}{A list that can contain the following parameters: \itemize{ - \item \code{split}: A numeric value between 0 and 1 indicating the proportion of data to use - for training. The remaining observations are allocated to the test set. If not specified or set to \code{NULL}, no - train-test splitting is performed. Note that this split is separate from cross-validation. Default is \code{NULL}. - \item \code{n_folds}: An integer greater than 2 specifying the number of folds for cross-validation. If \code{NULL}, - no cross-validation is performed. Default is \code{NULL}. - \item \code{stratified}: A logical value indicating whether stratified sampling should be used during splitting. + \item \code{split}: A numeric value between 0 and 1 indicating the + proportion of data to use for training. The remaining observations are + allocated to the test set. If not specified or set to \code{NULL}, no + train-test splitting is performed. Note that this split is separate from + cross-validation. Default is \code{NULL}. + \item \code{n_folds}: An integer greater than 2 specifying the number of + folds for cross-validation. If \code{NULL}, no cross-validation is + performed. Default is \code{NULL}. + \item \code{stratified}: A logical value indicating whether stratified + sampling should be used during splitting. Default is \code{FALSE}. + \item \code{random_seed}: A numeric value for the random seed to ensure + reproducibility of random splitting and any model training that relies on + random starts. Default is \code{NULL}. + \item \code{standardize}: A logical or a numeric/character vector. 
If + \code{TRUE}, all numeric columns (except the target) are standardized by + computing the mean and standard deviation from the training subset and + applying them to both the training and test/validation sets. This prevents + data leakage. A vector of column indices or names can also be provided to + only standardize specific columns. + \item \code{remove_obs}: A logical value indicating whether to remove + observations in the test/validation set that contain levels of categorical + predictors not seen in the training data. Some algorithms may produce errors + when encountering such levels in the validation data during prediction. Default is \code{FALSE}. - \item \code{random_seed}: A numeric value for the random seed to ensure reproducibility of random splitting and any - model training that relies on random starts. Default is \code{NULL}. - \item \code{standardize}: A logical or a numeric/character vector. If \code{TRUE}, all numeric columns - (except the target) are standardized by computing the mean and standard deviation from the training subset and - applying them to both the training and test/validation sets. This prevents data leakage. A vector of column indices - or names can also be provided to only standardize specific columns. - \item \code{remove_obs}: A logical value indicating whether to remove observations in the test/validation set that - contain levels of categorical predictors not seen in the training data. Some algorithms may produce errors when - encountering such levels in the validation data during prediction. Default is \code{FALSE}. }} -\item{impute_params}{A list defining how to handle missing values among predictors/features. During imputation, the -target variable is excluded from both training and test/validation sets. Prior to imputation, unlabeled data -(observations with missing targets) are removed, and any specified train-test split or cross-validation folds are -created. 
A separate imputation model is then generated for each training subset (one for the train-test split and -one per fold). Each imputation model is applied to both its corresponding training and test/validation subsets to -minimize data leakage. Note that numerical columns are automatically standardized (regardless of -\code{train_params$standardize}) before imputation occurs. The \pkg{recipes} package is used for imputation. The -following parameters are available: +\item{impute_params}{A list defining how to handle missing values among +predictors/features. During imputation, the target variable is excluded from +both training and test/validation sets. Prior to imputation, unlabeled data +(observations with missing targets) are removed, and any specified train-test +split or cross-validation folds are created. A separate imputation model is +then generated for each training subset (one for the train-test split and +one per fold). Each imputation model is applied to both its corresponding +training and test/validation subsets to minimize data leakage. Note that +numerical columns are automatically standardized (regardless of +\code{train_params$standardize}) before imputation occurs. The +\pkg{recipes} package is used for imputation. The following parameters are +available: \itemize{ - \item \code{method}: A character specifying the imputation method. Options include: + \item \code{method}: A character specifying the imputation method. Options + include: \itemize{ \item \code{"impute_bag"}: Bagged Trees Imputation \item \code{"impute_knn"}: K-Nearest Neighbors Imputation } Default is \code{NULL}. - \item \code{args}: A list of additional arguments for the chosen imputation method. + \item \code{args}: A list of additional arguments for the chosen imputation + method. \itemize{ \item \code{"impute_bag"}: \code{trees}, \code{seed_val} \item \code{"impute_knn"}: \code{neighbors} } - For more details about these arguments, consult the \pkg{recipes} documentation. 
Default is \code{NULL}. + For more details about these arguments, consult the \pkg{recipes} + documentation. Default is \code{NULL}. }} \item{save}{A list that may include the following: \itemize{ -\item \code{models}: A logical value indicating whether to save the trained models (including imputation models) -used for train-test splits or cross-validation. Default is \code{FALSE}. -\item \code{data}: A logical value indicating whether to save all training and test/validation sets used during -train-test splitting and/or cross-validation. Default is \code{FALSE}. +\item \code{models}: A logical value indicating whether to save the trained +models (including imputation models) used for train-test splits or +cross-validation. Default is \code{FALSE}. +\item \code{data}: A logical value indicating whether to save all training +and test/validation sets used during train-test splitting and/or +cross-validation. Default is \code{FALSE}. }} \item{parallel_configs}{A list that may include the following: \itemize{ -\item \code{n_cores}: A numeric value specifying the number of cores for parallel processing. Default is \code{NULL}. -\item \code{future.seed}: A numeric value indicating the seed to use with \pkg{future} for parallel processing. +\item \code{n_cores}: A numeric value specifying the number of cores for +parallel processing. Default is \code{NULL}. +\item \code{future.seed}: A numeric value indicating the seed to use with +\pkg{future} for parallel processing. }} -\item{...}{Additional arguments for the chosen classification algorithm. These arguments serve as an alternative to -specifying model-specific parameters in \code{model_params$map_args} when only a single model is specified in -\code{models}. If multiple models are specified, then \code{map_args} must be used. Refer to each algorithm's -documentation for details on additional arguments.} +\item{...}{Additional arguments for the chosen classification algorithm. 
+These arguments serve as an alternative to specifying model-specific +parameters in \code{model_params$map_args} when only a single model is +specified in \code{models}. If multiple models are specified, then +\code{map_args} must be used. Refer to each algorithm's documentation for +details on additional arguments.} } \value{ -A list (vswift object) containing: +A \code{\link{Vswift}} object containing: \itemize{ - \item Any train-test split or cross-validation results (if specified). - \item Performance metrics. - \item Class distribution details for the training set, test set, and folds (if applicable). - \item Saved models (if requested). - \item Saved datasets (if requested). - \item A final model (if requested). + \item Configuration parameters accessible via \code{$configs()}. + \item Performance metrics accessible via \code{$metrics()}. + \item Class distribution details accessible via \code{$class_info()}. + \item Missing data summary accessible via \code{$get_missing_data_summary()}. + \item Data partition indices and dataframes accessible via + \code{$get_partition()} (if requested). + \item Trained models accessible via \code{$get_trained_model()} (if + requested). + \item Imputation models accessible via \code{$get_imputation_model()} (if + requested). } } \description{ -Performs train-test splitting and/or cross-validation on classification data using various -classification algorithms. +Performs train-test splitting and/or cross-validation on +classification data using various classification algorithms. } \section{Additional Model Parameters}{ -Each element in \code{models} accepts arguments specific to its underlying classification algorithm. Refer to the -original package documentation for more information about these arguments. Further details on the external package -functions used for each model are provided in the "Package Dependencies" section. 
-The available arguments for each \code{models} are: +Each element in \code{models} accepts arguments specific to its underlying +classification algorithm. Refer to the original package documentation for +more information about these arguments. Further details on the external +package functions used for each model are provided in the "Package +Dependencies" section. The available arguments for each \code{models} are: \itemize{ \item \code{"lda"}: \code{prior}, \code{method}, \code{nu}, \code{tol} \item \code{"qda"}: \code{prior}, \code{method}, \code{nu} \item \code{"logistic"}: \code{weights}, \code{singular.ok}, \code{maxit} - \item \code{"regularized_logistic"}: \code{"alpha"}, \code{"lambda"}, \code{"penalty.factor"}, \code{"maxit"}, - \code{"thresh"}, \code{"nfolds"} - \item \code{"svm"}: \code{kernel}, \code{degree}, \code{gamma}, \code{cost}, \code{nu}, \code{class.weights}, - \code{shrinking}, \code{epsilon}, \code{tolerance}, \code{cachesize} - \item \code{"naivebayes"}: \code{prior}, \code{laplace}, \code{usekernel}, \code{usepoisson} - \item \code{"nnet"}: \code{size}, \code{rang}, \code{decay}, \code{maxit}, \code{softmax}, \code{entropy}, - \code{abstol}, \code{reltol}, \code{Hess}, \code{skip} + \item \code{"regularized_logistic"}: \code{"alpha"}, \code{"lambda"}, + \code{"penalty.factor"}, \code{"maxit"}, \code{"thresh"}, \code{"nfolds"} + \item \code{"svm"}: \code{kernel}, \code{degree}, \code{gamma}, \code{cost}, + \code{nu}, \code{class.weights}, \code{shrinking}, \code{epsilon}, + \code{tolerance}, \code{cachesize} + \item \code{"naivebayes"}: \code{prior}, \code{laplace}, \code{usekernel}, + \code{usepoisson} + \item \code{"nnet"}: \code{size}, \code{rang}, \code{decay}, \code{maxit}, + \code{softmax}, \code{entropy}, \code{abstol}, \code{reltol}, \code{Hess}, + \code{skip} \item \code{"knn"}: \code{kmax}, \code{ks}, \code{distance}, \code{kernel} \item \code{"decisiontree"}: \code{parms}, \code{control}, \code{cost} - \item 
\code{"randomforest"}: \code{weights}, \code{ntree}, \code{mtry}, \code{nodesize}, \code{importance}, - \code{localImp}, \code{nPerm}, \code{proximity}, \code{keep.forest}, \code{norm.votes} + \item \code{"randomforest"}: \code{weights}, \code{ntree}, \code{mtry}, + \code{nodesize}, \code{importance}, \code{localImp}, \code{nPerm}, + \code{proximity}, \code{keep.forest}, \code{norm.votes} \item \code{"multinom"}: \code{Hess} - \item \code{"regularized_multinomial"}: \code{"alpha"}, \code{"lambda"}, \code{"penalty.factor"}, \code{"maxit"}, + \item \code{"regularized_multinomial"}: \code{"alpha"}, \code{"lambda"}, + \code{"penalty.factor"}, \code{"maxit"}, \code{"thresh"}, \code{"nfolds"} - \item \code{"xgboost"}: \code{params}, \code{nrounds}, \code{print_every_n}, \code{feval}, \code{verbose}, - \code{early_stopping_rounds}, \code{obj}, \code{save_period}, \code{save_name} + \item \code{"xgboost"}: \code{params}, \code{nrounds}, \code{print_every_n}, + \code{feval}, \code{verbose}, \code{early_stopping_rounds}, \code{obj}, + \code{save_period}, \code{save_name} } } \section{Package Dependencies}{ -Each option of \code{models} uses the following function from the specified packages: +Each option of \code{models} uses the following function from the specified +packages: \itemize{ \item \code{"lda"}: \code{lda} from \pkg{MASS} package \item \code{"qda"}: \code{qda} from \pkg{MASS} package -\item \code{"logistic"}: \code{glm} from \pkg{base} package with \code{family = "binomial"} -\item \code{"regularized_logistic"}: \code{glmnet} from \pkg{glmnet} package with \code{family = "binomial"} and using -\code{cv.glmnet} to select the optimal lambda. +\item \code{"logistic"}: \code{glm} from \pkg{stats} package with +\code{family = "binomial"} +\item \code{"regularized_logistic"}: \code{glmnet} from \pkg{glmnet} package +with \code{family = "binomial"} and using \code{cv.glmnet} to select the +optimal lambda.
\item \code{"svm"}: \code{svm()} from \pkg{e1071} package \item \code{"naivebayes"}: \code{naive_bayes} from \pkg{naivebayes} package \item \code{"nnet"}: \code{nnet} from \pkg{nnet} package \item \code{"knn"}: \code{train.kknn} from \pkg{kknn} package \item \code{"decisiontree"}: \code{rpart} from \pkg{rpart} package -\item \code{"randomforest"}: \code{randomForest} from \pkg{randomForest} package +\item \code{"randomforest"}: \code{randomForest} from \pkg{randomForest} +package \item \code{"multinom"}: \code{multinom} from \pkg{nnet} package -\item \code{"regularized_logistic"}: \code{glmnet} from \pkg{glmnet} package with \code{family = "multinomial"} and using +\item \code{"regularized_multinomial"}: \code{glmnet} from \pkg{glmnet} package +with \code{family = "multinomial"} and using \code{cv.glmnet} to select the optimal lambda. \item \code{"xgboost"}: \code{xgb.train} from \pkg{xgboost} package } @@ -218,7 +271,7 @@ Each option of \code{models} uses the following function from the specified pack data(iris) # Perform a train-test split with an 80\% training set using LDA -result <- classCV( +results <- class_cv( data = iris, target = "Species", models = "lda", @@ -226,11 +279,11 @@ result <- classCV( ) # Print parameters and metrics -result +results$print() # Perform 5-fold cross-validation using Extreme Gradient Boosting # w/ additional parameters: params & nrounds -result <- classCV( +results <- class_cv( data = iris, formula = Species ~ ., models = "xgboost", @@ -245,12 +298,11 @@ result <- classCV( ) # Print parameters and metrics -result - +results$print() # Perform 5-fold cross-validation a train-test split with multiple models map_args <- list("knn" = list(ks = 5), "nnet" = list(size = 20)) -result <- classCV( +results <- class_cv( data = iris, target = 5, predictors = c(1:3), @@ -264,12 +316,11 @@ result <- classCV( ) # Print parameters and metrics -result +results$print() } \seealso{ -\code{\link{print.vswift}}, \code{\link{plot.vswift}} -} -\author{
-Donisha Smith +\code{\link{Vswift}}, \code{\link{CurveResult}} + +\code{\link{Vswift}} } diff --git a/man/genFolds.Rd b/man/genFolds.Rd deleted file mode 100644 index 27f9cb0..0000000 --- a/man/genFolds.Rd +++ /dev/null @@ -1,62 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/genFolds.R -\name{genFolds} -\alias{genFolds} -\title{Create Split Datasets and/or Folds with Optional Stratification} -\usage{ -genFolds( - data, - target, - train_params = list(split = NULL, n_folds = NULL, stratified = FALSE, random_seed = - NULL), - create_data = FALSE -) -} -\arguments{ -\item{data}{A data frame.} - -\item{target}{A numeric or character value specifying the target variable. Only required if\code{stratified = TRUE}. -Default is \code{NULL}.} - -\item{train_params}{A list that can contain the following parameters: -\itemize{ - \item \code{split}: A numeric value between 0 and 1 indicating the proportion of data to use - for training. The remaining observations are allocated to the test set. If not specified or set to \code{NULL}, no - train-test splitting is performed. Note that this split is separate from cross-validation. Default is \code{NULL}. - \item \code{n_folds}: An integer greater than 2 specifying the number of folds for cross-validation. If \code{NULL}, - no cross-validation is performed. Default is \code{NULL}. - \item \code{stratified}: A logical value indicating whether stratified sampling should be used during splitting. - Default is \code{FALSE}. - \item \code{random_seed}: A numeric value for the random seed to ensure reproducibility of random splitting and any - model training that relies on random starts. Default is \code{NULL}. - }} - -\item{create_data}{A logical value indicating whether to create all training and test/validation data frames. 
-Default is \code{FALSE}.} -} -\value{ -A list containing the indices for train-test splitting and/or cross-validation, with information on class -distribution in the training, test sets, and folds (if applicable). It also includes the generated split datasets -and folds based on those indices. -} -\description{ -A standalone function to generate train-test split datasets and/or cross-validation folds, optionally -performing stratified sampling based on class distribution. -} -\examples{ -# Load example dataset - -data(iris) - -# Obtain indices for 80\% training/test split and 5-fold CV - -output <- genFolds( - data = iris, - target = "Species", - train_params = list(split = 0.8, n_folds = 5, random_seed = 123) -) - -} -\author{ -Donisha Smith -} diff --git a/man/plot.vswift.Rd b/man/plot.vswift.Rd deleted file mode 100644 index e210377..0000000 --- a/man/plot.vswift.Rd +++ /dev/null @@ -1,81 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/plot.vswift.R -\name{plot.vswift} -\alias{plot.vswift} -\title{Plot Model Evaluation Metrics} -\usage{ -\method{plot}{vswift}( - x, - metrics = c("accuracy", "precision", "recall", "f1"), - models = NULL, - split = TRUE, - cv = TRUE, - class_names = NULL, - path = NULL, - ... -) -} -\arguments{ -\item{x}{A list object of class \code{"vswift"}.} - -\item{metrics}{A character vector indicating which metrics to plot. Supported options are \code{"accuracy"}, -\code{"precision"}, \code{"recall"}, and \code{"f1"}. Default is \code{c("accuracy", "precision", "recall", "f1")}.} - -\item{models}{A character string or a character vector specifying the classification algorithm(s) evaluation metrics -to plot. If \code{NULL}, all models will be plotted. 
The following options are available: -\itemize{ - \item \code{"lda"}: Linear Discriminant Analysis - \item \code{"qda"}: Quadratic Discriminant Analysis - \item \code{"logistic"}: Unregularized Logistic Regression - \item \code{"regularized_logistic"}: Regularized Logistic Regression - \item \code{"svm"}: Support Vector Machine - \item \code{"naivebayes"}: Naive Bayes - \item \code{"nnet"}: Neural Network - \item \code{"knn"}: K-Nearest Neighbors - \item \code{"decisiontree"}: Decision Tree - \item \code{"randomforest"}: Random Forest - \item \code{"multinom"}: Unregularized Multinomial Logistic Regression - \item \code{"regularized_multinomial"}: Regularized Multinomial Logistic Regression - \item \code{"xgboost"}: Extreme Gradient Boosting - } - Default = \code{NULL}.} - -\item{split}{A logical value indicating whether to plot metrics for the train-test split results. Default is -\code{TRUE}.} - -\item{cv}{A logical value indicating whether to plot metrics for cross-validation results. Default is \code{TRUE}.} - -\item{class_names}{A vector of the specific classes to plot. If \code{NULL}, plots are generated for all classes. -Default is \code{NULL}.} - -\item{path}{A character string specifying the directory (with a trailing slash) to save the plots. -Default is \code{NULL}.} - -\item{...}{Additional arguments passed to the \code{png} function.} -} -\description{ -Plots classification metrics (accuracy, precision, recall, and f1 for each class). 
-} -\examples{ -# Load an example dataset -data(iris) - -# Perform a train-test split with an 80\% training set and stratified sampling using QDA - -result <- classCV( - data = iris, - target = "Species", - models = "qda", - train_params = list(split = 0.8, stratified = TRUE, random_seed = 123), - save = list(models = TRUE) -) - - -# Plot performance metrics for train-test split - -plot(result, class_names = "setosa", metrics = "f1") - -} -\author{ -Donisha Smith -} diff --git a/man/prCurve.Rd b/man/prCurve.Rd deleted file mode 100644 index bb2f714..0000000 --- a/man/prCurve.Rd +++ /dev/null @@ -1,91 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/curves.R -\name{prCurve} -\alias{prCurve} -\title{Plot Precision-Recall (PR) Curves for Binary Classification Tasks} -\usage{ -prCurve( - x, - data = NULL, - models = NULL, - split = TRUE, - cv = TRUE, - thresholds = NULL, - return_output = TRUE, - path = NULL, - ... -) -} -\arguments{ -\item{x}{A list object of class \code{"vswift"}. Note that the models must be saved using -\code{save = list("models" = TRUE)} in \code{classCV} for this function to work.} - -\item{data}{A data frame. If \code{NULL}, then the preprocessed data muse be saved using -\code{save = list("data" = TRUE)} in \code{classCV} Default = \code{NULL}.} - -\item{models}{A character string or a character vector specifying the classification algorithm(s) to plot curves -for. If \code{NULL}, all models will be plotted. 
The following options are available: -\itemize{ - \item \code{"lda"}: Linear Discriminant Analysis - \item \code{"qda"}: Quadratic Discriminant Analysis - \item \code{"logistic"}: Unregularized Logistic Regression - \item \code{"regularized_logistic"}: Regularized Logistic Regression - \item \code{"svm"}: Support Vector Machine - \item \code{"naivebayes"}: Naive Bayes - \item \code{"nnet"}: Neural Network - \item \code{"knn"}: K-Nearest Neighbors - \item \code{"decisiontree"}: Decision Tree - \item \code{"randomforest"}: Random Forest - \item \code{"multinom"}: Unregularized Multinomial Logistic Regression - \item \code{"regularized_multinomial"}: Regularized Multinomial Logistic Regression - \item \code{"xgboost"}: Extreme Gradient Boosting - } - Default = \code{NULL}.} - -\item{split}{A logical value indicating whether to plot curves for the train-test split results. Default is -\code{TRUE}.} - -\item{cv}{A logical value indicating whether to plot curves for cross-validation results. Default is \code{TRUE}.} - -\item{thresholds}{A numerical vector specifying the thresholds to use when producing the curves. If left as NULL -the unique probability values produced by the training model will be used as thresholds. Default is \code{NULL}.} - -\item{return_output}{A logical value indicating whether to return the output list. Default is \code{TRUE}.} - -\item{path}{A character string specifying the directory (with a trailing slash) to save the plots. -Default is \code{NULL}.} - -\item{...}{Additional arguments passed to the \code{png} function.} -} -\value{ -A list containing thresholds used to generate the PR curve, target labels, precision, recall, -area under the curve (AUC), and maximum F1 score and its associated optimal threshold for all training and -validation sets for each model. -} -\description{ -Produces PR curves and computes the area under the curve (AUC) and the threshold with the maximum F1. -score. Only works for binary classification tasks. 
-} -\examples{ -# Load an example dataset -data <- iris - -# Make Binary -data$Species <- ifelse(data$Species == "setosa", "setosa", "not setosa") - -# Perform a train-test split with an 80\% training set and stratified sampling using QDA -result <- classCV( - data = data, - target = "Species", - models = "qda", - train_params = list(split = 0.8, stratified = TRUE, random_seed = 123), - save = list(data = TRUE, models = TRUE) -) - -# Get PR curve -prCurve(result, return_output = FALSE) - -} -\author{ -Donisha Smith -} diff --git a/man/print.vswift.Rd b/man/print.vswift.Rd deleted file mode 100644 index a61d93f..0000000 --- a/man/print.vswift.Rd +++ /dev/null @@ -1,64 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/print.vswift.R -\name{print.vswift} -\alias{print.vswift} -\title{Print Parameter Information and/or Model Evaluation Metrics} -\usage{ -\method{print}{vswift}(x, configs = TRUE, metrics = TRUE, models = NULL, ...) -} -\arguments{ -\item{x}{A list object of class \code{"vswift"}.} - -\item{configs}{A logical value indicating whether to print model configuration information from the vswift -object. Default is \code{TRUE}.} - -\item{metrics}{A logical value indicating whether to print model evaluation metrics from the vswift object. If -\code{TRUE}, precision, recall, and F1 scores for each class will be displayed, along with their mean values -(if cross-validation was used). Default is \code{TRUE}.} - -\item{models}{A character string or a character vector specifying the classification algorithm(s) information to be -printed. If \code{NULL}, all model information will be printed. 
The following options are available: -\itemize{ - \item \code{"lda"}: Linear Discriminant Analysis - \item \code{"qda"}: Quadratic Discriminant Analysis - \item \code{"logistic"}: Unregularized Logistic Regression - \item \code{"regularized_logistic"}: Regularized Logistic Regression - \item \code{"svm"}: Support Vector Machine - \item \code{"naivebayes"}: Naive Bayes - \item \code{"nnet"}: Neural Network - \item \code{"knn"}: K-Nearest Neighbors - \item \code{"decisiontree"}: Decision Tree - \item \code{"randomforest"}: Random Forest - \item \code{"multinom"}: Unregularized Multinomial Logistic Regression - \item \code{"regularized_multinomial"}: Regularized Multinomial Logistic Regression - \item \code{"xgboost"}: Extreme Gradient Boosting - } - Default = \code{NULL}.} - -\item{...}{No additional arguments are currently supported.} -} -\description{ -Prints model configuration details and/or model evaluation metrics (classification accuracy, precision, -recall, and F1 scores). -} -\examples{ -# Load an example dataset - -data(iris) - -# Perform a train-test split with an 80\% training set using LDA - -result <- classCV( - data = iris, - target = "Species", - models = "lda", - train_params = list(split = 0.8, stratified = TRUE, random_seed = 123) -) - -# Print parameter information and performance metrics -print(result) - -} -\author{ -Donisha Smith -} diff --git a/man/rocCurve.Rd b/man/rocCurve.Rd deleted file mode 100644 index 6c4335f..0000000 --- a/man/rocCurve.Rd +++ /dev/null @@ -1,91 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/curves.R -\name{rocCurve} -\alias{rocCurve} -\title{Plot Receiver Operating Characteristic (ROC) Curves for Binary Classification Tasks} -\usage{ -rocCurve( - x, - data = NULL, - models = NULL, - split = TRUE, - cv = TRUE, - thresholds = NULL, - return_output = TRUE, - path = NULL, - ... -) -} -\arguments{ -\item{x}{A list object of class \code{"vswift"}. 
Note that the models must be saved using -\code{save = list("models" = TRUE)} in \code{classCV} for this function to work.} - -\item{data}{A data frame. If \code{NULL}, then the preprocessed data muse be saved using -\code{save = list("data" = TRUE)} in \code{classCV} Default = \code{NULL}.} - -\item{models}{A character string or a character vector specifying the classification algorithm(s) to plot curves -for. If \code{NULL}, all models will be plotted. The following options are available: -\itemize{ - \item \code{"lda"}: Linear Discriminant Analysis - \item \code{"qda"}: Quadratic Discriminant Analysis - \item \code{"logistic"}: Unregularized Logistic Regression - \item \code{"regularized_logistic"}: Regularized Logistic Regression - \item \code{"svm"}: Support Vector Machine - \item \code{"naivebayes"}: Naive Bayes - \item \code{"nnet"}: Neural Network - \item \code{"knn"}: K-Nearest Neighbors - \item \code{"decisiontree"}: Decision Tree - \item \code{"randomforest"}: Random Forest - \item \code{"multinom"}: Unregularized Multinomial Logistic Regression - \item \code{"regularized_multinomial"}: Regularized Multinomial Logistic Regression - \item \code{"xgboost"}: Extreme Gradient Boosting - } - Default = \code{NULL}.} - -\item{split}{A logical value indicating whether to plot curves for the train-test split results. Default is -\code{TRUE}.} - -\item{cv}{A logical value indicating whether to plot curves for cross-validation results. Default is \code{TRUE}.} - -\item{thresholds}{A numerical vector specifying the thresholds to use when producing the curves. If left as NULL -the unique probability values produced by the training model will be used as thresholds. Default is \code{NULL}.} - -\item{return_output}{A logical value indicating whether to return the output list. Default is \code{TRUE}.} - -\item{path}{A character string specifying the directory (with a trailing slash) to save the plots. 
-Default is \code{NULL}.} - -\item{...}{Additional arguments passed to the \code{png} function.} -} -\value{ -A list containing thresholds used to generate the ROC curve, target labels, false positive rates (FPR), -true positive rates (TPR), area under the curve (AUC), and Youden's Index for all training and validation sets -for each model. -} -\description{ -Produces ROC curves and computes the area under the curve (AUC) and Youden's Index. -Only works for binary classification tasks. -} -\examples{ -# Load an example dataset -data <- iris - -# Make Binary -data$Species <- ifelse(data$Species == "setosa", "setosa", "not setosa") - -# Perform a train-test split with an 80\% training set and stratified sampling using QDA -result <- classCV( - data = data, - target = "Species", - models = "qda", - train_params = list(split = 0.8, stratified = TRUE, random_seed = 123), - save = list(data = TRUE, models = TRUE) -) - -# Get ROC curve -rocCurve(result, return_output = FALSE) - -} -\author{ -Donisha Smith -} diff --git a/tests/testthat/tests_classCV.R b/tests/testthat/tests_class_cv.R similarity index 67% rename from tests/testthat/tests_classCV.R rename to tests/testthat/tests_class_cv.R index 07c6180..65b117d 100644 --- a/tests/testthat/tests_classCV.R +++ b/tests/testthat/tests_class_cv.R @@ -1,4 +1,4 @@ -# Testing classCV function +# Testing class_cv function library(vswift) library(testthat) @@ -10,7 +10,7 @@ skip_test <- function() { test_that("Fail due to `train_params` not being list", { data <- iris - expect_error(result <- classCV( + expect_error(result <- class_cv( data = data, target = "Species", models = "lda", train_params = NULL ), "`train_params` must be a list") @@ -19,7 +19,7 @@ test_that("Fail due to `train_params` not being list", { test_that("Fail due to lack of nesting for `train_params`", { data <- iris expect_error( - result <- classCV( + result <- class_cv( data = data, target = "Species", models = "lda", train_params = list() ), @@ -29,163 
+29,181 @@ test_that("Fail due to lack of nesting for `train_params`", { test_that("test train-test split and no stratified sampling", { data <- iris - expect_no_error(result <- classCV( + expect_no_error(result <- class_cv( data = data, target = "Species", models = "lda", train_params = list(split = 0.8, standardize = TRUE) )) # Ensure values are greater than or equal to 0 and less than or equal to one - split_df <- result$metrics$lda$split + split_df <- result$metrics("lda", "split") expect_true(all(split_df[, 2:ncol(split_df)] >= 0 & split_df[, 2:ncol(split_df)] <= 1)) }) test_that("test train-test split and no stratified sampling w/ invalid key", { data <- iris - expect_warning(result <- classCV( + expect_warning(result <- class_cv( data = data, target = "Species", models = "lda", train_params = list(split = 0.8, standardize = TRUE, invalid_key = "1") )) # Ensure values are greater than or equal to 0 and less than or equal to one - split_df <- result$metrics$lda$split + split_df <- result$metrics("lda", "split") expect_true(all(split_df[, 2:ncol(split_df)] >= 0 & split_df[, 2:ncol(split_df)] <= 1)) }) test_that("test new formula method", { data <- iris - result1 <- classCV( + result1 <- class_cv( data = data, target = "Species", models = "qda", train_params = list(split = 0.8, random_seed = 123) ) - expect_no_error(result2 <- classCV( + expect_no_error(result2 <- class_cv( formula = Species ~ ., data = data, models = "qda", train_params = list(split = 0.8, random_seed = 123) )) - expect_equal(result1$metrics$lda$split, result2$metrics$lda$split) + expect_equal(result1$metrics("qda", "split"), result2$metrics("qda", "split")) # Ensure values are greater than or equal to 0 and less than or equal to one - split_df <- result1$metrics$qda$split + split_df <- result1$metrics("qda", "split") expect_true(all(split_df[, 2:ncol(split_df)] >= 0 & split_df[, 2:ncol(split_df)] <= 1)) }) test_that("CV no stratified sampling", { data <- iris - expect_no_error(result <- 
classCV(data = data, target = "Species", models = "svm", train_params = list(n_folds = 3))) + expect_no_error(result <- class_cv(data = data, target = "Species", models = "svm", train_params = list(n_folds = 3))) # Ensure values are greater than or equal to 0 and less than or equal to one - cv_df <- result$metrics$svm$cv + cv_df <- result$metrics("svm", "cv") expect_true(all(cv_df[, 2:ncol(cv_df)] >= 0 & cv_df[, 2:ncol(cv_df)] <= 1)) }) test_that("CV with stratified", { data <- iris - expect_no_error(result <- classCV( + expect_no_error(result <- class_cv( data = data, target = "Species", models = "nnet", size = 5, train_params = list(n_folds = 3, stratified = TRUE, random_seed = 123) )) - expect_true(all(c("proportions", "indices") %in% names(result$class_summary))) + class_summary <- result$class_info("proportions") + expect_true(!is.null(class_summary)) }) test_that("train-test split and k-fold CV with stratified", { data <- iris - expect_no_error(result <- classCV( + expect_no_error(result <- class_cv( data = data, target = "Species", models = "naivebayes", train_params = list(split = 0.8, n_folds = 3, stratified = TRUE), save = list(data = TRUE) )) # Check that data partition indices between train test split are independent - expect_false(any(result$data_partitions$split$train %in% result$data_partitions$split$train)) + split_train <- result$get_partition("indices", "split", "train") + split_test <- result$get_partition("indices", "split", "test") + expect_false(any(split_train %in% split_test)) # Check that indices are assigned correctly when dataframes are made for modeling - expect_true(all(result$data_partitions$indices$split$train %in% rownames(result$data_partitions$dataframes$split$train))) - expect_true(all(result$data_partitions$indices$split$test %in% rownames(result$data_partitions$dataframes$split$test))) + split_train_df <- result$get_partition("dataframes", "split", "train") + split_test_df <- result$get_partition("dataframes", "split", "test") + 
expect_true(all(split_train %in% rownames(split_train_df))) + expect_true(all(split_test %in% rownames(split_test_df))) - train_len <- length(result$data_partitions$indices$split$train) - test_len <- length(result$data_partitions$indices$split$test) + train_len <- length(split_train) + test_len <- length(split_test) expect_true(c(train_len + test_len) == nrow(data)) folds <- paste0("fold", 1:3) # Check that data partition indices between folds are independent + cv_indices <- result$get_partition("indices", "cv") for (i in folds) { for (j in folds) { if (i != j) { - expect_false(any(result$data_partitions$indices$cv[[i]] %in% result$data_partitions$indices$cv[[j]])) + expect_false(any(cv_indices[[i]] %in% cv_indices[[j]])) } } } # Check that fold train and test data are correct for (i in folds) { - train_indxs <- as.numeric(unlist(result$data_partitions$indices$cv[!names(result$data_partitions$indices$cv) == i])) - test_indxs <- as.numeric(unlist(result$data_partitions$indices$cv[[i]])) + train_indxs <- as.numeric(unlist(cv_indices[!names(cv_indices) == i])) + test_indxs <- as.numeric(unlist(cv_indices[[i]])) - expect_true(all(train_indxs %in% rownames(result$data_partitions$dataframes$cv[[i]]$train))) - expect_true(all(test_indxs %in% rownames(result$data_partitions$dataframes$cv[[i]]$test))) + fold_train_df <- result$get_partition("dataframes", "cv")[[i]]$train + fold_test_df <- result$get_partition("dataframes", "cv")[[i]]$test + + expect_true(all(train_indxs %in% rownames(fold_train_df))) + expect_true(all(test_indxs %in% rownames(fold_test_df))) } # Ensure values are greater than or equal to 0 and less than or equal to one - split_df <- result$metrics$naivebayes$split - cv_df <- result$metrics$naivebayes$cv + split_df <- result$metrics("naivebayes", "split") + cv_df <- result$metrics("naivebayes", "cv") expect_true(all(split_df[, 2:ncol(split_df)] >= 0 & split_df[, 2:ncol(split_df)] <= 1)) expect_true(all(cv_df[, 2:ncol(cv_df)] >= 0 & cv_df[, 2:ncol(cv_df)] <= 
1)) }) test_that("train-test split and k-fold CV without stratified sampling", { data <- iris - expect_no_error(result <- classCV( + expect_no_error(result <- class_cv( data = data, target = "Species", models = "multinom", train_params = list(split = 0.8, n_folds = 3), save = list(data = TRUE) )) # Test again since regular split uses different code from stratified # Check that data partition indices between train test split are independent - expect_false(any(result$data_partitions$split$train %in% result$data_partitions$split$train)) + split_train <- result$get_partition("indices", "split", "train") + split_test <- result$get_partition("indices", "split", "test") + expect_false(any(split_train %in% split_test)) # Check that indices are assigned correctly when dataframes are made for modeling - expect_true(all(result$data_partitions$indices$split$train %in% rownames(result$data_partitions$dataframes$split$train))) - expect_true(all(result$data_partitions$indices$split$test %in% rownames(result$data_partitions$dataframes$split$test))) + split_train_df <- result$get_partition("dataframes", "split", "train") + split_test_df <- result$get_partition("dataframes", "split", "test") + expect_true(all(split_train %in% rownames(split_train_df))) + expect_true(all(split_test %in% rownames(split_test_df))) - train_len <- length(result$data_partitions$indices$split$train) - test_len <- length(result$data_partitions$indices$split$test) + train_len <- length(split_train) + test_len <- length(split_test) expect_true(c(train_len + test_len) == nrow(data)) folds <- paste0("fold", 1:3) # Check that data partition indices between folds are independent + cv_indices <- result$get_partition("indices", "cv") for (i in folds) { for (j in folds) { if (i != j) { - expect_false(any(result$data_partitions$indices$cv[[i]] %in% result$data_partitions$indices$cv[[j]])) + expect_false(any(cv_indices[[i]] %in% cv_indices[[j]])) } } } # Check that fold train and test data are correct for (i in folds) 
{ - train_indxs <- as.numeric(unlist(result$data_partitions$indices$cv[!names(result$data_partitions$indices$cv) == i])) - test_indxs <- as.numeric(unlist(result$data_partitions$indices$cv[[i]])) + train_indxs <- as.numeric(unlist(cv_indices[!names(cv_indices) == i])) + test_indxs <- as.numeric(unlist(cv_indices[[i]])) + + fold_train_df <- result$get_partition("dataframes", "cv")[[i]]$train + fold_test_df <- result$get_partition("dataframes", "cv")[[i]]$test - expect_true(all(train_indxs %in% rownames(result$data_partitions$dataframes$cv[[i]]$train))) - expect_true(all(test_indxs %in% rownames(result$data_partitions$dataframes$cv[[i]]$test))) + expect_true(all(train_indxs %in% rownames(fold_train_df))) + expect_true(all(test_indxs %in% rownames(fold_test_df))) } # Ensure values are greater than or equal to 0 and less than or equal to one - split_df <- result$metrics$multinom$split - cv_df <- result$metrics$multinom$cv + split_df <- result$metrics("multinom", "split") + cv_df <- result$metrics("multinom", "cv") expect_true(all(split_df[, 2:ncol(split_df)] >= 0 & split_df[, 2:ncol(split_df)] <= 1)) expect_true(all(cv_df[, 2:ncol(cv_df)] >= 0 & cv_df[, 2:ncol(cv_df)] <= 1)) }) test_that("test final", { data <- iris - expect_no_error(result <- classCV( + expect_no_error(result <- class_cv( data = data, target = "Species", models = "multinom", train_params = list(standardize = TRUE), model_params = list(final_model = TRUE) )) - expect_true(all(!is.na(result$models$multinom$final))) + final_model <- result$get_trained_model("multinom", "final") + expect_true(all(!is.na(final_model))) # Should stop - expect_error(result <- classCV( + expect_error(result <- class_cv( data = data, target = "Species", models = "multinom", train_params = list(standardize = TRUE), model_params = list(final_model = FALSE) @@ -201,7 +219,7 @@ test_that("test final w imputation", { } # Without folds - expect_warning(expect_warning(result <- classCV( + expect_warning(expect_warning(result <- 
class_cv( data = data, target = "Species", models = "multinom", train_params = list(standardize = TRUE), model_params = list(final_model = TRUE), @@ -209,10 +227,11 @@ test_that("test final w imputation", { save = list(data = TRUE) ))) - expect_true(all(!is.na(result$data_partitions$dataframes$final))) + final_data <- result$get_partition("dataframes", "preprocessed_data") + expect_true(all(!is.na(final_data))) # With folds - expect_warning(expect_warning(result <- classCV( + expect_warning(expect_warning(result <- class_cv( data = data, target = "Species", models = "multinom", train_params = list(n_folds = 3), model_params = list(final_model = TRUE), @@ -220,7 +239,8 @@ test_that("test final w imputation", { save = list(data = TRUE) ))) - expect_true(all(!is.na(result$data_partitions$dataframes$final))) + final_data <- result$get_partition("dataframes", "preprocessed_data") + expect_true(all(!is.na(final_data))) }) test_that("test imputation and missing data", { @@ -234,7 +254,7 @@ test_that("test imputation and missing data", { data[10, colnames(data)[colnames(data) != "Species"]] <- NA # knn - expect_warning(expect_warning(result <- classCV( + expect_warning(expect_warning(result <- class_cv( data = data, target = "Species", train_params = list(split = 0.8, n_folds = 4, stratified = TRUE), impute_params = list(method = "impute_knn", args = list(neighbors = 5)), @@ -242,7 +262,7 @@ test_that("test imputation and missing data", { ))) # bag - expect_warning(expect_warning(result <- classCV( + expect_warning(expect_warning(result <- class_cv( data = data, target = "Species", train_params = list(split = 0.8, n_folds = 4, stratified = FALSE), impute_params = list(method = "impute_bag", args = list(trees = 5)), @@ -250,7 +270,7 @@ test_that("test imputation and missing data", { ))) # complete cases only - expect_warning(result <- classCV( + expect_warning(result <- class_cv( data = data, target = "Species", train_params = list(split = 0.8, n_folds = 4, stratified = 
TRUE), models = "decisiontree", model_params = list(final_model = TRUE), @@ -260,33 +280,39 @@ test_that("test imputation and missing data", { test_that("test random seed", { data <- iris - result_1 <- classCV( + result_1 <- class_cv( data = data, target = "Species", train_params = list(split = 0.8, n_folds = 3, stratified = TRUE, random_seed = 123), models = "knn", ks = 5 ) - result_2 <- classCV( + result_2 <- class_cv( data = data, target = "Species", train_params = list(split = 0.8, n_folds = 3, stratified = TRUE, random_seed = 123), models = "knn", model_params = list(map_args = list(knn = list(ks = 5))) ) - expect_equal(result_1$data_partitions$indices$split$train, result_2$data_partitions$indices$split$train) - expect_equal(result_1$metrics$knn$cv, result_2$metrics$knn$cv) + expect_equal( + result_1$get_partition("indices", "split", "train"), + result_2$get_partition("indices", "split", "train") + ) + expect_equal(result_1$metrics("knn", "cv"), result_2$metrics("knn", "cv")) - result_1 <- classCV( + result_1 <- class_cv( data = data, target = "Species", train_params = list(split = 0.8, n_folds = 3, stratified = FALSE, random_seed = 123), models = "knn", ks = 5 ) - result_2 <- classCV( + result_2 <- class_cv( data = data, target = "Species", train_params = list(split = 0.8, n_folds = 3, stratified = FALSE, random_seed = 123), models = "knn", model_params = list(map_args = list(knn = list(ks = 5))) ) - expect_equal(result_1$data_partitions$indices$split$train, result_2$data_partitions$indices$split$train) - expect_equal(result_1$metrics$knn$cv, result_2$metrics$knn$cv) + expect_equal( + result_1$get_partition("indices", "split", "train"), + result_2$get_partition("indices", "split", "train") + ) + expect_equal(result_1$metrics("knn", "cv"), result_2$metrics("knn", "cv")) }) test_that("running multiple models", { @@ -298,7 +324,7 @@ test_that("running multiple models", { max_depth = 6 ), nrounds = 10)) - expect_warning(result <- classCV( + expect_warning(result 
<- class_cv( data = data, target = 5, models = c("knn", "svm", "xgboost", "randomforest"), train_params = list( split = 0.8, n_folds = 3, stratified = TRUE, @@ -322,7 +348,7 @@ test_that("running multiple models", { ) models <- c("knn", "svm", "logistic", "xgboost", "randomforest") - expect_warning(expect_warning(result <- classCV( + expect_warning(expect_warning(result <- class_cv( data = data, target = 5, models = models, train_params = list( split = 0.8, n_folds = 3, standardize = TRUE, @@ -334,8 +360,8 @@ test_that("running multiple models", { # Ensure values are greater than or equal to 0 and less than or equal to one for (model in models) { - split_df <- result$metrics[[model]]$split - cv_df <- result$metrics[[model]]$cv + split_df <- result$metrics(model, "split") + cv_df <- result$metrics(model, "cv") expect_true(all(split_df[, 2:ncol(split_df)] >= 0 & split_df[, 2:ncol(split_df)] <= 1)) expect_true(all(cv_df[, 2:ncol(cv_df)] >= 0 & cv_df[, 2:ncol(cv_df)] <= 1)) } @@ -352,22 +378,22 @@ test_that("n_cores", { max_depth = 6 ), nrounds = 10)) - expect_warning(result1 <- classCV( + expect_warning(result1 <- class_cv( data = data, target = 5, models = c("knn", "svm", "xgboost", "randomforest"), train_params = list(split = 0.8, n_folds = 3, stratified = TRUE, random_seed = 123), save = list(models = TRUE), model_params = list(map_args = args), parallel_configs = list(n_cores = 2, future.seed = 100) )) - expect_warning(result2 <- classCV( + expect_warning(result2 <- class_cv( data = data, target = 5, models = c("knn", "svm", "xgboost", "randomforest"), train_params = list(split = 0.8, n_folds = 3, stratified = TRUE, random_seed = 123), save = list(models = TRUE), model_params = list(map_args = args), parallel_configs = list(n_cores = 2, future.seed = 100) )) - expect_equal(result1$metrics$knn$split, result2$metrics$knn$split) - expect_equal(result1$metrics$knn$cv, result2$metrics$knn$cv) + expect_equal(result1$metrics("knn", "split"), result2$metrics("knn", 
"split")) + expect_equal(result1$metrics("knn", "cv"), result2$metrics("knn", "cv")) }) test_that("ensure parallel and nonparallel outputs are equal", { @@ -375,13 +401,13 @@ test_that("ensure parallel and nonparallel outputs are equal", { skip_test() - expect_no_error(result1 <- classCV( + expect_no_error(result1 <- class_cv( data = data, target = 5, models = "lda", train_params = list(n_folds = 3, stratified = TRUE, random_seed = 123), save = list(models = TRUE), )) - expect_no_error(result2 <- classCV( + expect_no_error(result2 <- class_cv( data = data, target = 5, models = "lda", train_params = list(n_folds = 3, stratified = TRUE, random_seed = 123), save = list(models = TRUE), @@ -391,8 +417,7 @@ test_that("ensure parallel and nonparallel outputs are equal", { expect_true(exists("result1") && !is.null(result1)) expect_true(exists("result2") && !is.null(result2)) - expect_equal(result1$metrics$lda$split, result2$metrics$lda$split) - expect_equal(result1$metrics$lda$cv, result2$metrics$lda$cv) + expect_equal(result1$metrics("lda", "cv"), result2$metrics("lda", "cv")) }) test_that("xgboost objectives-single", { @@ -402,7 +427,7 @@ test_that("xgboost objectives-single", { bin_obj <- c("reg:logistic", "binary:logistic", "binary:hinge", "binary:logitraw") for (obj in bin_obj) { - result <- classCV( + result <- class_cv( data = df, formula = Species ~ ., models = "xgboost", @@ -415,13 +440,13 @@ test_that("xgboost objectives-single", { nrounds = 10, save = list(models = T) ) - expect_true(all(!is.na(result$metrics$xgboost$cv))) + expect_true(all(!is.na(result$metrics("xgboost", "cv")))) } multi_obj <- c("multi:softprob", "multi:softmax") for (obj in multi_obj) { - result <- classCV( + result <- class_cv( data = df, formula = Species ~ ., models = "xgboost", @@ -435,7 +460,7 @@ test_that("xgboost objectives-single", { nrounds = 10, save = list(models = T) ) - expect_true(all(!is.na(result$metrics$xgboost$cv))) + expect_true(all(!is.na(result$metrics("xgboost", 
"cv")))) } }) @@ -453,7 +478,7 @@ test_that("xgboost objectives-multi", { max_depth = 6 ), nrounds = 10)) - result <- classCV( + result <- class_cv( data = df, formula = Species ~ ., models = c("xgboost", "knn"), @@ -461,10 +486,10 @@ test_that("xgboost objectives-multi", { model_params = list(map_args = args) ) - expect_true(all(!is.na(result$metrics$xgboost$split))) - expect_true(all(!is.na(result$metrics$xgboost$cv))) - expect_true(all(!is.na(result$metrics$knn$split))) - expect_true(all(!is.na(result$metrics$knn$cv))) + expect_true(all(!is.na(result$metrics("xgboost", "split")))) + expect_true(all(!is.na(result$metrics("xgboost", "cv")))) + expect_true(all(!is.na(result$metrics("knn", "split")))) + expect_true(all(!is.na(result$metrics("knn", "cv")))) } for (obj in multi_obj) { @@ -474,7 +499,7 @@ test_that("xgboost objectives-multi", { eta = 0.8, max_depth = 6 ), nrounds = 10)) - result <- classCV( + result <- class_cv( data = df, formula = Species ~ ., models = c("xgboost", "knn"), @@ -482,10 +507,10 @@ test_that("xgboost objectives-multi", { model_params = list(map_args = args) ) - expect_true(all(!is.na(result$metrics$xgboost$split))) - expect_true(all(!is.na(result$metrics$xgboost$cv))) - expect_true(all(!is.na(result$metrics$knn$split))) - expect_true(all(!is.na(result$metrics$knn$cv))) + expect_true(all(!is.na(result$metrics("xgboost", "split")))) + expect_true(all(!is.na(result$metrics("xgboost", "cv")))) + expect_true(all(!is.na(result$metrics("knn", "split")))) + expect_true(all(!is.na(result$metrics("knn", "cv")))) } }) @@ -502,7 +527,7 @@ test_that("binary target", { max_depth = 6 ), nrounds = 10)) - result <- classCV( + result <- class_cv( data = df, formula = Species ~ ., models = c("logistic", "xgboost"), @@ -510,12 +535,12 @@ test_that("binary target", { model_params = list(map_args = args) ) - expect_true(all(!is.na(result$metrics$xgboost$split))) - expect_true(all(!is.na(result$metrics$xgboost$cv))) - 
expect_true(all(!is.na(result$metrics$logistic$split))) - expect_true(all(!is.na(result$metrics$logistic$cv))) + expect_true(all(!is.na(result$metrics("xgboost", "split")))) + expect_true(all(!is.na(result$metrics("xgboost", "cv")))) + expect_true(all(!is.na(result$metrics("logistic", "split")))) + expect_true(all(!is.na(result$metrics("logistic", "cv")))) - result <- classCV( + result <- class_cv( data = df, target = "Species", models = c("xgboost", "logistic"), @@ -523,10 +548,10 @@ test_that("binary target", { model_params = list(map_args = args) ) - expect_true(all(!is.na(result$metrics$xgboost$split))) - expect_true(all(!is.na(result$metrics$xgboost$cv))) - expect_true(all(!is.na(result$metrics$logistic$split))) - expect_true(all(!is.na(result$metrics$logistic$cv))) + expect_true(all(!is.na(result$metrics("xgboost", "split")))) + expect_true(all(!is.na(result$metrics("xgboost", "cv")))) + expect_true(all(!is.na(result$metrics("logistic", "split")))) + expect_true(all(!is.na(result$metrics("logistic", "cv")))) } }) @@ -537,7 +562,7 @@ test_that("test regularized", { map_args <- list(regularized_logistic = list(alpha = 1, nfolds = 3)) - result <- classCV( + result <- class_cv( data = df, target = "Species", models = c("regularized_logistic", "regularized_multinomial"), @@ -545,15 +570,15 @@ test_that("test regularized", { model_params = list(map_args = map_args) ) - expect_true(all(!is.na(result$metrics$regularized_logistic$split))) - expect_true(all(!is.na(result$metrics$regularized_multinomial$split))) + expect_true(all(!is.na(result$metrics("regularized_logistic", "split")))) + expect_true(all(!is.na(result$metrics("regularized_multinomial", "split")))) - expect_true(all(!is.na(result$metrics$regularized_logistic$cv))) - expect_true(all(!is.na(result$metrics$regularized_multinomial$cv))) + expect_true(all(!is.na(result$metrics("regularized_logistic", "cv")))) + expect_true(all(!is.na(result$metrics("regularized_multinomial", "cv")))) # With final - result <- 
classCV( + result <- class_cv( data = df, target = "Species", models = c("regularized_logistic", "regularized_multinomial"), @@ -570,7 +595,7 @@ test_that("test threshold no xgboost", { mods <- names(vswift:::.MODEL_LIST)[!names(vswift:::.MODEL_LIST) == "xgboost"] map_args <- list(regularized_logistic = list(alpha = 1, nfolds = 3), knn = list(ks = 5), nnet = list(size = 4)) - result <- classCV( + result <- class_cv( data = df, target = "Species", models = mods, @@ -579,8 +604,8 @@ test_that("test threshold no xgboost", { ) for (mod in mods) { - expect_true(all(!is.na(result$metrics[[mod]]$split))) - expect_true(all(!is.na(result$metrics[[mod]]$cv))) + expect_true(all(!is.na(result$metrics(mod, "split")))) + expect_true(all(!is.na(result$metrics(mod, "cv")))) } }) @@ -598,7 +623,7 @@ test_that("test threshold for xgboost", { max_depth = 6 ), nrounds = 10)) - result <- classCV( + result <- class_cv( data = df, formula = Species ~ ., models = "xgboost", @@ -606,7 +631,7 @@ test_that("test threshold for xgboost", { model_params = list(map_args = args) ) - expect_true(all(!is.na(result$metrics$xgboost$split))) - expect_true(all(!is.na(result$metrics$xgboost$cv))) + expect_true(all(!is.na(result$metrics("xgboost", "split")))) + expect_true(all(!is.na(result$metrics("xgboost", "cv")))) } }) diff --git a/tests/testthat/tests_genFolds.R b/tests/testthat/tests_genFolds.R deleted file mode 100644 index 48be36b..0000000 --- a/tests/testthat/tests_genFolds.R +++ /dev/null @@ -1,39 +0,0 @@ -library(vswift) -library(testthat) - -test_that("testing if split and cv works for genFolds", { - data <- iris - expect_no_error( - folds <- genFolds(data = data, target = 5, train_params = list(split = 0.8, n_folds = 5, stratified = T), create_data = T) - ) - expect_no_error( - folds <- genFolds(data = data, target = "Species", train_params = list(split = 0.8, n_folds = 5, stratified = F), create_data = T) - ) - expect_true(is.data.frame(folds$data_partitions$dataframes$cv$fold1$train)) - 
expect_true(is.data.frame(folds$data_partitions$dataframes$split$train)) -}) - -test_that("testing if split works for genFolds", { - data <- iris - expect_no_error( - folds <- genFolds(data = data, target = 5, train_params = list(split = 0.8, stratified = T), create_data = T) - ) - expect_no_error( - folds <- genFolds(data = data, target = 5, train_params = list(split = 0.8, stratified = F), create_data = T) - ) - - expect_true(!is.null(folds)) -}) - - -test_that("testing if cv works for genFolds", { - data <- iris - expect_no_error( - folds <- genFolds(data = data, target = 5, train_params = list(n_folds = 5, stratified = T), create_data = T) - ) - expect_no_error( - folds <- genFolds(data = data, target = 5, train_params = list(n_folds = 5, stratified = F), create_data = T) - ) - - expect_true(!is.null(folds)) -}) diff --git a/tests/testthat/tests_plot.vswift.R b/tests/testthat/tests_plot.R similarity index 74% rename from tests/testthat/tests_plot.vswift.R rename to tests/testthat/tests_plot.R index 6255af1..a7fe420 100644 --- a/tests/testthat/tests_plot.vswift.R +++ b/tests/testthat/tests_plot.R @@ -6,18 +6,18 @@ test_that("testing plot function", { args <- list("knn" = list(ks = 3), "nnet" = list(size = 10)) - expect_no_error(result <- classCV( + expect_no_error(result <- class_cv( data = data, target = 5, models = c("knn", "randomforest", "nnet", "svm"), train_params = list(split = 0.8, n_folds = 5, remove_obs = T, stratified = T), model_params = list(map_args = args), save = list(models = T, data = T) )) expect_no_error( - plot(result, models = "knn", split = T, cv = T, class_names = "setosa") + result$plot(models = "knn", split = T, cv = T, class_names = "setosa") ) expect_no_error( - plot(result, models = "knn", class_names = "setosa", path = getwd()) + result$plot(models = "knn", class_names = "setosa", path = getwd()) ) for (png_file in list.files(getwd(), pattern = ".png")) { diff --git a/tests/testthat/tests_prCurve.R b/tests/testthat/tests_pr_curve.R 
similarity index 69% rename from tests/testthat/tests_prCurve.R rename to tests/testthat/tests_pr_curve.R index e005204..2f9cdc8 100644 --- a/tests/testthat/tests_prCurve.R +++ b/tests/testthat/tests_pr_curve.R @@ -1,171 +1,177 @@ -library(vswift) -library(testthat) - -source("utils.R") - -# Test that each model works with train-test splitting alone -test_that("test pr curve", { - data <- iris - - data$Species <- ifelse(data$Species == "setosa", "setosa", "not setosa") - data$Species <- factor(data$Species) - - map_args <- list( - "knn" = list(ks = 5), - "xgboost" = list( - params = list( - booster = "gbtree", - objective = "reg:logistic", - lambda = 0.0003, - alpha = 0.0003, - eta = 0.8, - max_depth = 6 - ), - nrounds = 10 - ), - "regularized_logistic" = list(alpha = 1, nfolds = 3), - "nnet" = list(size = 2) - ) - - models <- c( - "regularized_logistic", "regularized_multinomial", "multinom", "knn", "nnet", "lda", "qda", - "svm", "decisiontree", "randomforest", "logistic", "naivebayes", "xgboost" - ) - - results <- classCV( - formula = Species ~ ., - data = data, - models = models, - model_params = list(map_args = map_args, rule = "1se", verbose = TRUE), - train_params = list( - split = 0.8, - n_folds = 5, - standardize = T, - stratified = TRUE, - random_seed = 123 - ), - save = list(models = TRUE, data = TRUE) - ) - - # With thresholds derived from models - pr_output <- prCurve(results, path = getwd()) - check_png() - expect_true(length(pr_output) == "13") - check_metrics(pr_output, "pr") - - # With specified thresholds - pr_output <- prCurve(results, path = getwd(), thresholds = seq(0, 0.9, 0.1)) - check_png() - expect_true(length(pr_output) == "13") - check_metrics(pr_output, "pr") -}) - -test_that("test equivalence with standardizing", { - data <- iris - - data$Species <- ifelse(data$Species == "setosa", "setosa", "not setosa") - data$Species <- factor(data$Species) - - result1 <- classCV( - formula = Species ~ ., - data = data, - models = "svm", - 
train_params = list( - split = 0.8, - n_folds = 5, - standardize = T, - stratified = TRUE, - random_seed = 123 - ), - save = list(models = TRUE, data = TRUE) - ) - - output1 <- prCurve(result1) - - result2 <- classCV( - formula = Species ~ ., - data = data, - models = "svm", - train_params = list( - split = 0.8, - n_folds = 5, - standardize = T, - stratified = TRUE, - random_seed = 123 - ), - save = list(models = TRUE, data = TRUE) - ) - - output2 <- prCurve(result2, data) - - for (fold in names(output1$svm$cv)) { - for (i in names(output1$svm$cv[[fold]])) { - if (i == "metrics") { - expect_true(all(output1$svm$cv[[fold]]$metrics$recall == output2$svm$cv[[fold]]$metrics$recall)) - expect_true(all(output1$svm$cv[[fold]]$metrics$precision == output2$svm$cv[[fold]]$metrics$precision)) - } else { - expect_true(all(output1$svm$cv[[fold]][[i]] == output2$svm$cv[[fold]][[i]])) - } - } - } -}) - - -test_that("test equivalence with imputation", { - data <- iris - - data$Species <- ifelse(data$Species == "setosa", "setosa", "not setosa") - data$Species <- factor(data$Species) - - set.seed(123) - - # Introduce some missing data - for (i in 1:ncol(data)) { - data[sample(1:nrow(data), size = round(nrow(data) * .01)), i] <- NA - } - - result1 <- classCV( - formula = Species ~ ., - data = data, - models = "svm", - train_params = list( - split = 0.8, - n_folds = 5, - stratified = TRUE, - random_seed = 123, - standardize = TRUE - ), - impute_params = list(method = "impute_bag", args = list(trees = 20, seed_val = 123)), - save = list(models = TRUE, data = TRUE) - ) - - output1 <- rocCurve(result1) - - result2 <- classCV( - formula = Species ~ ., - data = data, - models = "svm", - train_params = list( - split = 0.8, - n_folds = 5, - stratified = TRUE, - random_seed = 123, - standardize = TRUE - ), - impute_params = list(method = "impute_bag", args = list(trees = 20, seed_val = 123)), - save = list(models = TRUE) - ) - - output2 <- rocCurve(result2, data) - - for (fold in 
names(output1$svm$cv)) { - for (i in names(output1$svm$cv[[fold]])) { - if (i == "metrics") { - expect_true(all(output1$svm$cv[[fold]]$metrics$precision == output2$svm$cv[[fold]]$metrics$precision)) - expect_true(all(output1$svm$cv[[fold]]$metrics$recall == output2$svm$cv[[fold]]$metrics$recall)) - } else { - expect_true(all(output1$svm$cv[[fold]][[i]] == output2$svm$cv[[fold]][[i]])) - } - } - } -}) +library(vswift) +library(testthat) + +source("utils.R") + +# Test that each model works with train-test splitting alone +test_that("test pr curve", { + data <- iris + + data$Species <- ifelse(data$Species == "setosa", "setosa", "not setosa") + data$Species <- factor(data$Species) + + map_args <- list( + "knn" = list(ks = 5), + "xgboost" = list( + params = list( + booster = "gbtree", + objective = "reg:logistic", + lambda = 0.0003, + alpha = 0.0003, + eta = 0.8, + max_depth = 6 + ), + nrounds = 10 + ), + "regularized_logistic" = list(alpha = 1, nfolds = 3), + "nnet" = list(size = 2) + ) + + models <- c( + "regularized_logistic", "regularized_multinomial", "multinom", "knn", "nnet", "lda", "qda", + "svm", "decisiontree", "randomforest", "logistic", "naivebayes", "xgboost" + ) + + results <- class_cv( + formula = Species ~ ., + data = data, + models = models, + model_params = list(map_args = map_args, rule = "1se", verbose = TRUE), + train_params = list( + split = 0.8, + n_folds = 5, + standardize = T, + stratified = TRUE, + random_seed = 123 + ), + save = list(models = TRUE, data = TRUE) + ) + + # With thresholds derived from models + pr_output <- results$pr_curve(path = getwd()) + check_png() + expect_true(length(pr_output$get_model()) == 13) + check_metrics(pr_output, "pr") + + # With specified thresholds + pr_output <- results$pr_curve(path = getwd(), thresholds = seq(0, 0.9, 0.1)) + check_png() + expect_true(length(pr_output$get_model()) == 13) + check_metrics(pr_output, "pr") +}) + +test_that("test equivalence with standardizing", { + data <- iris + + data$Species 
<- ifelse(data$Species == "setosa", "setosa", "not setosa") + data$Species <- factor(data$Species) + + result1 <- class_cv( + formula = Species ~ ., + data = data, + models = "svm", + train_params = list( + split = 0.8, + n_folds = 5, + standardize = T, + stratified = TRUE, + random_seed = 123 + ), + save = list(models = TRUE, data = TRUE) + ) + + output1 <- result1$pr_curve() + + result2 <- class_cv( + formula = Species ~ ., + data = data, + models = "svm", + train_params = list( + split = 0.8, + n_folds = 5, + standardize = T, + stratified = TRUE, + random_seed = 123 + ), + save = list(models = TRUE, data = TRUE) + ) + + output2 <- result2$pr_curve(data = data) + + svm1 <- output1$get_model("svm") + svm2 <- output2$get_model("svm") + + for (fold in names(svm1$cv)) { + for (i in names(svm1$cv[[fold]])) { + if (i == "metrics") { + expect_true(all(svm1$cv[[fold]]$metrics$recall == svm2$cv[[fold]]$metrics$recall)) + expect_true(all(svm1$cv[[fold]]$metrics$precision == svm2$cv[[fold]]$metrics$precision)) + } else { + expect_true(all(svm1$cv[[fold]][[i]] == svm2$cv[[fold]][[i]])) + } + } + } +}) + + +test_that("test equivalence with imputation", { + data <- iris + + data$Species <- ifelse(data$Species == "setosa", "setosa", "not setosa") + data$Species <- factor(data$Species) + + set.seed(123) + + # Introduce some missing data + for (i in 1:ncol(data)) { + data[sample(1:nrow(data), size = round(nrow(data) * .01)), i] <- NA + } + + result1 <- class_cv( + formula = Species ~ ., + data = data, + models = "svm", + train_params = list( + split = 0.8, + n_folds = 5, + stratified = TRUE, + random_seed = 123, + standardize = TRUE + ), + impute_params = list(method = "impute_bag", args = list(trees = 20, seed_val = 123)), + save = list(models = TRUE, data = TRUE) + ) + + output1 <- result1$pr_curve() + + result2 <- class_cv( + formula = Species ~ ., + data = data, + models = "svm", + train_params = list( + split = 0.8, + n_folds = 5, + stratified = TRUE, + random_seed = 123, + 
standardize = TRUE + ), + impute_params = list(method = "impute_bag", args = list(trees = 20, seed_val = 123)), + save = list(models = TRUE) + ) + + output2 <- result2$pr_curve(data = data) + + svm1 <- output1$get_model("svm") + svm2 <- output2$get_model("svm") + + for (fold in names(svm1$cv)) { + for (i in names(svm1$cv[[fold]])) { + if (i == "metrics") { + expect_true(all(svm1$cv[[fold]]$metrics$precision == svm2$cv[[fold]]$metrics$precision)) + expect_true(all(svm1$cv[[fold]]$metrics$recall == svm2$cv[[fold]]$metrics$recall)) + } else { + expect_true(all(svm1$cv[[fold]][[i]] == svm2$cv[[fold]][[i]])) + } + } + } +}) diff --git a/tests/testthat/tests_print.vswift.R b/tests/testthat/tests_print.R similarity index 75% rename from tests/testthat/tests_print.vswift.R rename to tests/testthat/tests_print.R index 9249d65..9e84acd 100644 --- a/tests/testthat/tests_print.vswift.R +++ b/tests/testthat/tests_print.R @@ -1,23 +1,23 @@ -library(vswift) -library(testthat) - -test_that("testing print function", { - data <- iris - - args <- list("knn" = list(ks = 3), "nnet" = list(size = 10)) - - expect_no_error(result <- classCV( - data = data, target = 5, models = c("knn", "randomforest", "nnet", "svm"), - train_params = list(split = 0.8, n_folds = 5, remove_obs = T, stratified = T), - model_params = list(map_args = args), save = list(models = T, data = T) - )) - expect_no_error( - print(result, models = c("knn", "nnet")) - ) - expect_no_error( - print(result) - ) - expect_no_error( - print(result, models = c("knn", "nnet"), metrics = T) - ) -}) +library(vswift) +library(testthat) + +test_that("testing print function", { + data <- iris + + args <- list("knn" = list(ks = 3), "nnet" = list(size = 10)) + + expect_no_error(result <- class_cv( + data = data, target = 5, models = c("knn", "randomforest", "nnet", "svm"), + train_params = list(split = 0.8, n_folds = 5, remove_obs = T, stratified = T), + model_params = list(map_args = args), save = list(models = T, data = T) + )) + 
expect_no_error( + result$print(models = c("knn", "nnet")) + ) + expect_no_error( + print(result) + ) + expect_no_error( + result$print(models = c("knn", "nnet"), metrics = T) + ) +}) diff --git a/tests/testthat/tests_rocCurve.R b/tests/testthat/tests_roc_curve.R similarity index 70% rename from tests/testthat/tests_rocCurve.R rename to tests/testthat/tests_roc_curve.R index 6829b20..d8e61ea 100644 --- a/tests/testthat/tests_rocCurve.R +++ b/tests/testthat/tests_roc_curve.R @@ -1,171 +1,177 @@ -library(vswift) -library(testthat) - -source("utils.R") - -# Test that each model works with train-test splitting alone -test_that("test roc curve", { - data <- iris - - data$Species <- ifelse(data$Species == "setosa", "setosa", "not setosa") - data$Species <- factor(data$Species) - - map_args <- list( - "knn" = list(ks = 5), - "xgboost" = list( - params = list( - booster = "gbtree", - objective = "reg:logistic", - lambda = 0.0003, - alpha = 0.0003, - eta = 0.8, - max_depth = 6 - ), - nrounds = 10 - ), - "regularized_logistic" = list(alpha = 1, nfolds = 3), - "nnet" = list(size = 2) - ) - - models <- c( - "regularized_logistic", "regularized_multinomial", "multinom", "knn", "nnet", "lda", "qda", - "svm", "decisiontree", "randomforest", "logistic", "naivebayes", "xgboost" - ) - - results <- classCV( - formula = Species ~ ., - data = data, - models = models, - model_params = list(map_args = map_args, rule = "1se", verbose = TRUE), - train_params = list( - split = 0.8, - n_folds = 5, - standardize = T, - stratified = TRUE, - random_seed = 123 - ), - save = list(models = TRUE, data = TRUE) - ) - - # With thresholds derived from models - roc_output <- rocCurve(results, path = getwd()) - check_png() - expect_true(length(roc_output) == "13") - check_metrics(roc_output, "roc") - - # With specified thresholds - roc_output <- rocCurve(results, path = getwd(), thresholds = seq(0, 0.9, 0.1)) - check_png() - expect_true(length(roc_output) == "13") - check_metrics(roc_output, "roc") -}) - 
-test_that("test equivalence with standardizing", { - data <- iris - - data$Species <- ifelse(data$Species == "setosa", "setosa", "not setosa") - data$Species <- factor(data$Species) - - result1 <- classCV( - formula = Species ~ ., - data = data, - models = "svm", - train_params = list( - split = 0.8, - n_folds = 5, - standardize = T, - stratified = TRUE, - random_seed = 123 - ), - save = list(models = TRUE, data = TRUE) - ) - - output1 <- rocCurve(result1) - - result2 <- classCV( - formula = Species ~ ., - data = data, - models = "svm", - train_params = list( - split = 0.8, - n_folds = 5, - standardize = T, - stratified = TRUE, - random_seed = 123 - ), - save = list(models = TRUE, data = TRUE) - ) - - output2 <- rocCurve(result2, data) - - for (fold in names(output1$svm$cv)) { - for (i in names(output1$svm$cv[[fold]])) { - if (i == "metrics") { - expect_true(all(output1$svm$cv[[fold]]$metrics$tpr == output2$svm$cv[[fold]]$metrics$tpr)) - expect_true(all(output1$svm$cv[[fold]]$metrics$fpr == output2$svm$cv[[fold]]$metrics$fpr)) - } else { - expect_true(all(output1$svm$cv[[fold]][[i]] == output2$svm$cv[[fold]][[i]])) - } - } - } -}) - - -test_that("test equivalence with imputation", { - data <- iris - - data$Species <- ifelse(data$Species == "setosa", "setosa", "not setosa") - data$Species <- factor(data$Species) - - set.seed(123) - - # Introduce some missing data - for (i in 1:ncol(data)) { - data[sample(1:nrow(data), size = round(nrow(data) * .01)), i] <- NA - } - - result1 <- classCV( - formula = Species ~ ., - data = data, - models = "svm", - train_params = list( - split = 0.8, - n_folds = 5, - stratified = TRUE, - random_seed = 123, - standardize = TRUE - ), - impute_params = list(method = "impute_bag", args = list(trees = 20, seed_val = 123)), - save = list(models = TRUE, data = TRUE) - ) - - output1 <- rocCurve(result1) - - result2 <- classCV( - formula = Species ~ ., - data = data, - models = "svm", - train_params = list( - split = 0.8, - n_folds = 5, - 
stratified = TRUE, - random_seed = 123, - standardize = TRUE - ), - impute_params = list(method = "impute_bag", args = list(trees = 20, seed_val = 123)), - save = list(models = TRUE) - ) - - output2 <- rocCurve(result2, data) - - for (fold in names(output1$svm$cv)) { - for (i in names(output1$svm$cv[[fold]])) { - if (i == "metrics") { - expect_true(all(output1$svm$cv[[fold]]$metrics$tpr == output2$svm$cv[[fold]]$metrics$tpr)) - expect_true(all(output1$svm$cv[[fold]]$metrics$fpr == output2$svm$cv[[fold]]$metrics$fpr)) - } else { - expect_true(all(output1$svm$cv[[fold]][[i]] == output2$svm$cv[[fold]][[i]])) - } - } - } -}) +library(vswift) +library(testthat) + +source("utils.R") + +# Test that each model works with train-test splitting alone +test_that("test roc curve", { + data <- iris + + data$Species <- ifelse(data$Species == "setosa", "setosa", "not setosa") + data$Species <- factor(data$Species) + + map_args <- list( + "knn" = list(ks = 5), + "xgboost" = list( + params = list( + booster = "gbtree", + objective = "reg:logistic", + lambda = 0.0003, + alpha = 0.0003, + eta = 0.8, + max_depth = 6 + ), + nrounds = 10 + ), + "regularized_logistic" = list(alpha = 1, nfolds = 3), + "nnet" = list(size = 2) + ) + + models <- c( + "regularized_logistic", "regularized_multinomial", "multinom", "knn", "nnet", "lda", "qda", + "svm", "decisiontree", "randomforest", "logistic", "naivebayes", "xgboost" + ) + + results <- class_cv( + formula = Species ~ ., + data = data, + models = models, + model_params = list(map_args = map_args, rule = "1se", verbose = TRUE), + train_params = list( + split = 0.8, + n_folds = 5, + standardize = T, + stratified = TRUE, + random_seed = 123 + ), + save = list(models = TRUE, data = TRUE) + ) + + # With thresholds derived from models + roc_output <- results$roc_curve(path = getwd()) + check_png() + expect_true(length(roc_output$get_model()) == 13) + check_metrics(roc_output, "roc") + + # With specified thresholds + roc_output <- 
results$roc_curve(path = getwd(), thresholds = seq(0, 0.9, 0.1)) + check_png() + expect_true(length(roc_output$get_model()) == 13) + check_metrics(roc_output, "roc") +}) + +test_that("test equivalence with standardizing", { + data <- iris + + data$Species <- ifelse(data$Species == "setosa", "setosa", "not setosa") + data$Species <- factor(data$Species) + + result1 <- class_cv( + formula = Species ~ ., + data = data, + models = "svm", + train_params = list( + split = 0.8, + n_folds = 5, + standardize = T, + stratified = TRUE, + random_seed = 123 + ), + save = list(models = TRUE, data = TRUE) + ) + + output1 <- result1$roc_curve() + + result2 <- class_cv( + formula = Species ~ ., + data = data, + models = "svm", + train_params = list( + split = 0.8, + n_folds = 5, + standardize = T, + stratified = TRUE, + random_seed = 123 + ), + save = list(models = TRUE, data = TRUE) + ) + + output2 <- result2$roc_curve(data = data) + + svm1 <- output1$get_model("svm") + svm2 <- output2$get_model("svm") + + for (fold in names(svm1$cv)) { + for (i in names(svm1$cv[[fold]])) { + if (i == "metrics") { + expect_true(all(svm1$cv[[fold]]$metrics$tpr == svm2$cv[[fold]]$metrics$tpr)) + expect_true(all(svm1$cv[[fold]]$metrics$fpr == svm2$cv[[fold]]$metrics$fpr)) + } else { + expect_true(all(svm1$cv[[fold]][[i]] == svm2$cv[[fold]][[i]])) + } + } + } +}) + + +test_that("test equivalence with imputation", { + data <- iris + + data$Species <- ifelse(data$Species == "setosa", "setosa", "not setosa") + data$Species <- factor(data$Species) + + set.seed(123) + + # Introduce some missing data + for (i in 1:ncol(data)) { + data[sample(1:nrow(data), size = round(nrow(data) * .01)), i] <- NA + } + + result1 <- class_cv( + formula = Species ~ ., + data = data, + models = "svm", + train_params = list( + split = 0.8, + n_folds = 5, + stratified = TRUE, + random_seed = 123, + standardize = TRUE + ), + impute_params = list(method = "impute_bag", args = list(trees = 20, seed_val = 123)), + save = list(models 
= TRUE, data = TRUE) + ) + + output1 <- result1$roc_curve() + + result2 <- class_cv( + formula = Species ~ ., + data = data, + models = "svm", + train_params = list( + split = 0.8, + n_folds = 5, + stratified = TRUE, + random_seed = 123, + standardize = TRUE + ), + impute_params = list(method = "impute_bag", args = list(trees = 20, seed_val = 123)), + save = list(models = TRUE) + ) + + output2 <- result2$roc_curve(data = data) + + svm1 <- output1$get_model("svm") + svm2 <- output2$get_model("svm") + + for (fold in names(svm1$cv)) { + for (i in names(svm1$cv[[fold]])) { + if (i == "metrics") { + expect_true(all(svm1$cv[[fold]]$metrics$tpr == svm2$cv[[fold]]$metrics$tpr)) + expect_true(all(svm1$cv[[fold]]$metrics$fpr == svm2$cv[[fold]]$metrics$fpr)) + } else { + expect_true(all(svm1$cv[[fold]][[i]] == svm2$cv[[fold]][[i]])) + } + } + } +}) diff --git a/tests/testthat/utils.R b/tests/testthat/utils.R index 103079b..d7db275 100644 --- a/tests/testthat/utils.R +++ b/tests/testthat/utils.R @@ -5,7 +5,6 @@ check_png <- function() { expect_true(file.size(png_file) > 0) file.remove(png_file) } - file.remove(list.files(getwd(), pattern = "Rplots.pdf")) } @@ -16,9 +15,7 @@ check_conditions <- function(x, curve) { } else { met_names <- c("precision", "recall", "maxF1", "optimal_threshold") } - met_names <- c(met_names, "auc", "thresholds", "probs") - for (name in met_names) { if (name %in% c("tpr", "fpr", "precision", "recall")) { expect_true(all(x$metrics[[name]] >= 0)) @@ -30,12 +27,12 @@ check_conditions <- function(x, curve) { } } - check_metrics <- function(out, curve) { - for (mod in names(out)) { - for (split_method in names(out[[mod]])) { - for (id in names(out[[mod]][[split_method]])) { - x <- out[[mod]][[split_method]][[id]] + results <- out$get_model() + for (mod in names(results)) { + for (split_method in names(results[[mod]])) { + for (id in names(results[[mod]][[split_method]])) { + x <- results[[mod]][[split_method]][[id]] check_conditions(x, curve) } } diff 
--git a/vignettes/vswift-intro.Rmd b/vignettes/vswift-intro.Rmd index c2f42a1..0569211 100644 --- a/vignettes/vswift-intro.Rmd +++ b/vignettes/vswift-intro.Rmd @@ -1,87 +1,76 @@ ---- -title: "Introduction to vswift" -author: "Donisha Smith" -date: "`r Sys.Date()`" -output: - pdf_document: - toc: true - rmarkdown::html_vignette: - fig_width: 8 - fig_height: 6 -vignette: > - %\VignetteIndexEntry{Introduction to vswift} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>" -) -``` - - -### Import **vswift** into the current R session. -```{r setup} -library(vswift) - -# Display documentation for the 'vswift' package -help(package = "vswift") -``` - - -### The main function of the **vswift** package is `classCV`. -```{r} -result1 <- classCV( - data = iris, target = "Species", - train_params = list("split" = 0.8, "random_seed" = 123), - models = "lda" -) - -# Perform a train-test split with five folds using stratified sampling with K-Nearest Neighbors while also specifying an additional argument for knn to specify the number of neighbors to consider -result2 <- classCV( - data = iris, formula = Species ~ ., - train_params = list(split = 0.8, n_folds = 5, stratified = TRUE, random_seed = 123), - models = "knn", - ks = 5 -) - -# Perform a train-test split with an 80% training set and five folds with stratified sampling LDA and knn. Also specify an argument for knn and save the models for lda and knn. - -args <- list("knn" = list(ks = 5)) - -result3 <- classCV( - data = iris, formula = Species ~ ., - train_params = list(split = 0.8, n_folds = 5, stratified = TRUE, random_seed = 123), - models = c("lda", "knn"), - model_params = list(map_args = args) -) -``` - -### `print` can be used to produce command-line output of the performance metrics. -```{r} -print(result3, models = "knn") -``` - - -### `plot` function can be used to visualize performance metrics. 
-```{r} -plot(result3, models = "knn", split = FALSE, cv = TRUE, class_names = c("setosa", "virginica"), metrics = "precision") -``` - -### `rocCurve`` can be used to generate ROC curves and ROC-AUC scores for binary classification targets. The model and preprocessed dataset must be saved. -```{r} -data <- iris -data$Species <- ifelse(data$Species == "setosa", "setosa", "not setosa") - -result <- classCV( - data = data, formula = Species ~ ., - train_params = list(split = 0.8, n_folds = 5, stratified = TRUE, random_seed = 123), - models = c("decisiontree", "randomforest"), - model_params = list(map_args = args), - save = list(models = TRUE) -) - -output <- rocCurve(result, data) -``` +--- +title: "Introduction to vswift" +author: "Donisha Smith" +date: "`r Sys.Date()`" +output: + pdf_document: + toc: true + rmarkdown::html_vignette: + fig_width: 8 + fig_height: 6 +vignette: > + %\VignetteIndexEntry{Introduction to vswift} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +### Import **vswift** into the current R session. +```{r setup} +library(vswift) +# Display documentation for the 'vswift' package +help(package = "vswift") +``` + +### The main function of the **vswift** package is `class_cv`. +```{r} +result1 <- class_cv( + data = iris, target = "Species", + train_params = list("split" = 0.8, "random_seed" = 123), + models = "lda" +) +# Perform a train-test split with five folds using stratified sampling with K-Nearest Neighbors while also specifying an additional argument for knn to specify the number of neighbors to consider +result2 <- class_cv( + data = iris, formula = Species ~ ., + train_params = list(split = 0.8, n_folds = 5, stratified = TRUE, random_seed = 123), + models = "knn", + ks = 5 +) +# Perform a train-test split with an 80% training set and five folds with stratified sampling LDA and knn. 
Also pass a knn-specific argument through `map_args`. +args <- list("knn" = list(ks = 5)) +result3 <- class_cv( + data = iris, formula = Species ~ ., + train_params = list(split = 0.8, n_folds = 5, stratified = TRUE, random_seed = 123), + models = c("lda", "knn"), + model_params = list(map_args = args) +) +``` + +### `print` can be used to produce command-line output of the performance metrics. +```{r} +result3$print(models = "knn") +``` + +### `plot` can be used to visualize performance metrics. +```{r} +result3$plot(models = "knn", split = FALSE, cv = TRUE, class_names = c("setosa", "virginica"), metrics = "precision") +``` + +### `roc_curve` can be used to generate ROC curves and ROC-AUC scores for binary classification targets. The models must be saved; if the preprocessed dataset was not saved, pass the original data to `roc_curve`. +```{r} +data <- iris +data$Species <- ifelse(data$Species == "setosa", "setosa", "not setosa") +result <- class_cv( + data = data, formula = Species ~ ., + train_params = list(split = 0.8, n_folds = 5, stratified = TRUE, random_seed = 123), + models = c("decisiontree", "randomforest"), + model_params = list(map_args = args), + save = list(models = TRUE) +) +output <- result$roc_curve(data) +```