diff --git a/cross-validation/metadata.json b/cross-validation/metadata.json index 88572c3..eb8e41a 100644 --- a/cross-validation/metadata.json +++ b/cross-validation/metadata.json @@ -2,5 +2,5 @@ "name": "Cross validation", "description": "A collection of whizzml scripts and libraries performing k-fold cross-validation", "kind": "package", - "components": ["cross-validation-gen", "basic", "model", "ensemble", "logistic-regression", "boosted-ensemble", "deepnet", "supervised-conf", "linear-regression"] + "components": ["cross-validation-gen", "basic", "model", "ensemble", "logistic-regression", "boosted-ensemble", "deepnet", "supervised-conf", "linear-regression", "purged-cross-validation"] } diff --git a/cross-validation/purged-cross-validation/metadata.json b/cross-validation/purged-cross-validation/metadata.json new file mode 100644 index 0000000..1a35bf0 --- /dev/null +++ b/cross-validation/purged-cross-validation/metadata.json @@ -0,0 +1,6 @@ +{ + "name": "Purged k-fold cross-validation script", + "description": "The objective of this script is to perform a purged k-fold cross-validation of any supervised model built from a time-series-like dataset that has already been sorted. 
The algorithm:\n\n - Divides the dataset into k parts\n - Holds out the data in one of the parts and builds a supervised model\n with the rest of data\n - Removes the edges of the hold out dataset to create the test dataset (avoiding leakage).\n - Evaluates the supervised model with the purged test dataset\n - The second, third and fourth steps are repeated with each of the k parts, so that\n k evaluations are generated\n - Finally, the evaluation metrics are averaged to provide the cross-validation\n metrics.\n\n The **goal** of the script is to produce a\n cross-validation, an evaluation whose metrics are averages of the k evaluations\n created in the cross-validation process.\n\n For more information, please see the [readme](https://github.com/whizzml/examples/tree/master/cross-validation/purged-cross-validation).", + "kind": "script", + "source_code": "script.whizzml" +} diff --git a/cross-validation/purged-cross-validation/readme.md b/cross-validation/purged-cross-validation/readme.md new file mode 100644 index 0000000..7dc0c7b --- /dev/null +++ b/cross-validation/purged-cross-validation/readme.md @@ -0,0 +1,18 @@ +# Script for purged k-fold cross-validation + +The objective of this script is to create a purged k-fold cross-validation +starting from any supervised learning model +built from a time-series kind of dataset that has been previously ordered. + +The algorithm: + +- Divides the dataset into k parts +- Holds out the data in one of the parts and builds the same supervised model + used as input with the rest of data +- Creates a test dataset by purging the edges of the hold out data (15% of it) to + avoid leakage. +- Evaluates the model with the test data +- The second, third and fourth steps are repeated with each of the k parts, + so that k evaluations are generated +- Finally, the evaluation metrics are averaged to provide the cross-validation + metrics. 
diff --git a/cross-validation/purged-cross-validation/script.whizzml b/cross-validation/purged-cross-validation/script.whizzml
new file mode 100644
index 0000000..65db7e2
--- /dev/null
+++ b/cross-validation/purged-cross-validation/script.whizzml
@@ -0,0 +1,471 @@
+;; options allowed for evaluation resources
+(define EVALUATION_OPTIONS ["sample_rate"
+                            "out_of_bag"
+                            "range"
+                            "replacement"
+                            "ordering"
+                            "seed"
+                            "missing_strategy"
+                            "confidence_threshold"
+                            "probability_threshold"
+                            "fields_map"
+                            "negative_class"
+                            "positive_class"
+                            "threshold"
+                            "operating_kind"
+                            "operating_point"])
+
+;; list of supervised learning model types that can be cross-validated
+(define SLMS
+  ["model" "ensemble" "logisticregression" "deepnet" "linearregression"])
+
+;; percentage of rows purged from each edge of a hold-out dataset
+;; (7.5% per edge, that is, 15% of the hold-out data in total)
+(define PURGE_PERCENTAGE 7.5)
+
+;; _transpose
+;;
+;; Applies `map list` over the sublists of m, i.e. turns a list of rows
+;; into a list of columns
+(define (_transpose m)
+  (apply map list m))
+
+;; cross-validation-pre
+;;
+;; retrieves the dataset, model type, objective field and arguments to be
+;; used in the cross-validation
+;; Inputs:
+;;   slm-id: (string) Supervised Model ID
+;; Output: (list) e.g. ["dataset/1111111" ;; dataset-id
+;;                      "logisticregression" ;; model-type
+;;                      "000004" ;; objective-id
+;;                      {"field_balance" true
+;;                       "objective_field" "000004"}] ;; model-options
+;; Raises:
+;;  101: The slm-id argument is not a string
+;;  102: The slm-id is not a supervised model ID
+(define (cross-validation-pre slm-id)
+  (check-resource-id slm-id SLMS)
+  (let (slm (fetch slm-id)
+        model-type (resource-type slm-id)
+        [dataset-id model-options] (slm-arguments slm-id)
+        objective-id (slm "objective_field"))
+    [dataset-id model-type objective-id model-options]))
+
+;; purged-model-cross-validation
+;;
+;; creates a purged k-fold cross-validation
+;; **from an existing model**
+;; Inputs:
+;;   slm-id: (string) Supervised Model ID
+;;   k-folds: (integer) Number of folds
+;;   evaluation-options: (map) Options to use in evaluation creation
+;;   delete-resources?: (boolean) Whether to delete all intermediate resources
+;;
+;; Output: (evaluation-id) ID of the evaluation that averages the k evaluations
+;;
+;; Raises:
+;;  101: The slm-id argument is not a string
+;;  102: The slm-id is not a supervised model ID
+;;  103: The k-folds argument is not an integer
+;;  104: The k-folds argument is not >= 2
+;;  106: Non-compatible model for the dataset objective field type
+;;  107: The number of instances in minority class is less than k
+;;  108: The dataset has fewer rows than folds
+;;
+(define (purged-model-cross-validation slm-id
+                                       k-folds
+                                       evaluation-options
+                                       delete-resources?)
+  (let ([dataset-id
+         model-type
+         objective-id
+         model-options] (cross-validation-pre slm-id))
+    (ds-purged-cross-validation dataset-id
+                                k-folds
+                                objective-id
+                                model-type
+                                model-options
+                                evaluation-options
+                                delete-resources?)))
+
+;; ds-purged-cross-validation
+;;
+;; General procedure to generate a purged cross-validation from a dataset
+;;
+;; Inputs:
+;;   dataset-id: (string) ID of the (already sorted) dataset
+;;   k-folds: (integer) Number of folds
+;;   objective-id: (string) ID of the objective field
+;;   model-type: (string) Type of supervised learning model
+;;   model-options: (map) Options used to build the models
+;;   evaluation-options: (map) Options to use in evaluation creation
+;;   delete-resources?: (boolean) Whether to delete all intermediate resources
+;;
+;; Output: (evaluation-id) ID of the evaluation that averages the k evaluations
+;;
+;; Raises:
+;;  103: The k-folds argument is not an integer
+;;  104: The k-folds argument is not >= 2
+;;  106: Non-compatible model for the dataset objective field type
+;;  107: The number of instances in minority class is less than k
+;;  108: The dataset has fewer rows than folds
+;;
+(define (ds-purged-cross-validation dataset-id
+                                    k-folds
+                                    objective-id
+                                    model-type
+                                    model-options
+                                    evaluation-options
+                                    delete-resources?)
+  (check-integer k-folds 2 false)
+  (let (dataset (fetch dataset-id)
+        dataset-name (dataset "name" false)
+        fields (resource-fields dataset-id)
+        regression? (is-regression objective-id fields)
+        _ (when (not regression?)
+            (check-minority-instances k-folds dataset-id objective-id))
+        _ (check-modeltype model-type regression?)
+        objective-name (fields [objective-id "name"] false)
+        k-fold-datasets (create-linear-k-folds dataset-id k-folds)
+        evaluations (create-k-purged-evaluations k-fold-datasets
+                                                 objective-name
+                                                 dataset-name
+                                                 model-type
+                                                 model-options
+                                                 evaluation-options
+                                                 delete-resources?)
+        evaluations-average (create-and-wait-evaluation {"evaluations"
+                                                         evaluations}))
+    (when delete-resources?
+      (map safe-delete (concat k-fold-datasets evaluations)))
+    evaluations-average))
+
+
+;;----------------------------------------------------------------------------
+
+
+;; check-resource-id
+;;
+;; Validates that the argument is a resource ID and its type. Raises an error
+;; if otherwise.
+;;
+;; Inputs:
+;;   resource-id: (string) Resource ID
+;;   type: (list) Accepted resource types
+;;
+;; Output: (string) Checked resource ID
+(define (check-resource-id resource-id type)
+  (when (not (string? resource-id))
+    (raise {"message" (str "Resource ID string expected. Found "
+                           resource-id " instead.")
+            "code" 101}))
+  (when (not (member? (resource-type resource-id) type))
+    (raise {"message" (str "Failed to find a correct " type " ID.")
+            "code" 102}))
+  resource-id)
+
+
+;; check-integer
+;;
+;; Validates that the argument is an integer. Raises error if otherwise.
+;;
+;; Inputs:
+;;   value: (number) Integer to be checked
+;;   minimum: (number) Minimum value (false if not set)
+;;   maximum: (number) Maximum value (false if not set)
+;;
+;; Output: (number) Checked integer
+(define (check-integer value minimum maximum)
+  (when (not (integer? value))
+    (raise {"message" (str "Integer value expected. Found " value " instead.")
+            "code" 103}))
+  (when (and minimum (< value minimum))
+    (raise {"message" (str "Minimum accepted value is " minimum ". " value
+                           " found.")
+            "code" 104}))
+  (when (and maximum (> value maximum))
+    (raise {"message" (str "Maximum accepted value is " maximum ". " value
+                           " found.")
+            "code" 105}))
+  value)
+
+;; check-categorical-objective-id
+;;
+;; Validates that the argument is a valid categorical objective id in the
+;; reference dataset.
+;;
+;; Inputs:
+;;   objective-id: (string) ID of the objective field
+;;   fields: (map) Fields information
+;;
+;; Raises:
+;;  106: The objective field is not categorical
+(define (check-categorical-objective-id objective-id fields)
+  (when (!= "categorical" (fields [objective-id "optype"] false))
+    (raise {"message" (str "Only classification models can be"
+                           " cross-validated")
+            "code" 106})))
+
+;; is-regression
+;;
+;; Checks whether the objective field is numeric
+;;
+;; Inputs:
+;;   objective-id: (string) ID of the objective field
+;;   fields: (map) Fields information
+;;
+;; Output: (boolean) True if numeric objective field
+(define (is-regression objective-id fields)
+  (= "numeric" (fields [objective-id "optype"] false)))
+
+
+;; check-modeltype
+;;
+;; Checks whether the model type matches the regression / classification
+;; problem
+;;
+;; Inputs:
+;;   model-type: (string) Type of supervised learning model
+;;   regression?: (boolean) Whether the objective field is numeric
+;;
+;; Raises:
+;;  106: Non-compatible model for the dataset objective field type
+(define (check-modeltype model-type regression?)
+  (let (error (or (and (= model-type "logisticregression") regression?)
+                  (and (= model-type "linearregression") (not regression?))))
+    (when error
+      (raise {"message" (str "The " model-type " cannot be used to"
+                             " predict " (if regression? "regressions"
+                                             "classifications")
+                             ".")
+              "code" 106}))))
+
+;; check-minority-instances
+;;
+;; Checks that the number of instances in the minority class is at least
+;; the number of folds
+;;
+;; Raises:
+;;  107: The number of instances in minority class is less than k
+(define (check-minority-instances k-folds dataset-id objective-id)
+  (let (dataset (fetch dataset-id)
+        min-cat (last (dataset ["fields" objective-id "summary" "categories"]
+                               [[]]))
+        min-inst (min-cat 1 0))
+    (when (< min-inst k-folds)
+      (raise {"message" (str "Failed to create a " k-folds "-folds sample: "
+                             "not enough instances in some categories")
+              "code" 107}))))
+
+;; --------------------------------------------------------------------------
+
+
+;; slm-arguments
+;;
+;; Retrieves the origin dataset and configuration options used in a
+;; supervised learning model
+;;
+;; Inputs:
+;;   slm-id: (string) Supervised Model ID
+;;
+;; Output: (list) [dataset-id model-options]
+(define (slm-arguments slm-id)
+  (let (last-step-w (resource-workflow slm-id true true)
+        step (last-step-w ["steps" 0])
+        conf (step "args")
+        input (last-step-w ["inputs" 0])
+        origin-key ((keys (step "origin")) 0)
+        ;; a model built from anything but a dataset gets a new dataset
+        ;; created from that origin
+        dataset-id (if (= origin-key "dataset")
+                     input
+                     (create-dataset (assoc {}
+                                            (str "origin_" origin-key)
+                                            input)))
+        ;; the origin is not a model option
+        conf (dissoc conf origin-key))
+    (log-info "Dataset ID and configuration: " dataset-id conf)
+    [dataset-id conf]))
+
+;; create-linear-k-folds
+;;
+;; creates k consecutive, non-overlapping splits of a dataset that cover
+;; all of its rows
+;;
+;; Inputs:
+;;   dataset-id: (string) Dataset ID
+;;   k-folds: (integer) Number of folds
+;;
+;; Output: (list) List of dataset IDs
+;;
+;; Raises:
+;;  108: The dataset has fewer rows than folds
+;;
+(define (create-linear-k-folds dataset-id k-folds)
+  (let (dataset (fetch dataset-id)
+        rows (dataset "rows")
+        _ (when (< rows k-folds)
+            (raise {"message" (str "Failed to create a " k-folds
+                                   "-folds sample: the dataset has only "
+                                   rows " rows")
+                    "code" 108}))
+        ;; last row (1-based) of the x first folds: (bound 0) is 0 and
+        ;; (bound k-folds) is rows, so no row is lost or repeated
+        bound (lambda (x) (round (/ (* x rows) k-folds)))
+        k-fold-fn (lambda (x)
+                    ;; "range" limits are 1-based and inclusive
+                    (let (start (+ 1 (bound x))
+                          end (bound (+ x 1)))
+                      (log-info "range" (str start) (str end))
+                      (create-dataset {"origin_dataset" dataset-id
+                                       "range" [start end]
+                                       "new_fields" [{"name" "k_fold"
+                                                      "field" (str x)}]})))
+        dataset-ids (map k-fold-fn (range 0 k-folds)))
+    (wait* dataset-ids)))
+
+;; pair-k-folds
+;;
+;; Builds a list of pairs of hold-out and complementary datasets for all
+;; the k-fold dataset IDs.
+;;
+;; Inputs:
+;;   dataset-ids: (list) List of the k-fold dataset IDs
+;;
+;; Output: (list) List of pairs [hold-out dataset, multidataset with the rest]
+;;
+(define (pair-k-folds dataset-ids)
+  (map (lambda(x)
+         [(nth dataset-ids x)
+          (concat (take x dataset-ids)
+                  (drop (+ x 1) dataset-ids))])
+       (range 0 (count dataset-ids))))
+
+
+;; select-map-keys
+;;
+;; Filters the keys in a map, keeping only the ones that appear in the list.
+;; Keys whose value is false are dropped too.
+;;
+;; Inputs:
+;;   a-map: (map) Key, value maps
+;;   keys-list: (list) List of keys to be kept in the map
+;; Output: (map) filtered map with only the keys in the keys-list
+;;
+(define (select-map-keys a-map keys-list)
+  (reduce (lambda (x y) (let (value (a-map y false))
+                          (cond value (assoc x y value) x)))
+          {}
+          keys-list))
+
+;; create-k-models
+;;
+;; Creates the models for a set of k-fold datasets
+;;
+;; Inputs:
+;;   type: (string) type of supervised learning model
+;;   multidatasets: (list) List of lists of dataset IDs once a k-fold is
+;;                         excluded
+;;   objective-name: (string) name of the objective field
+;;   model-options: (map) Options for the model
+;;
+;; Output: (list) model IDs
+;;
+(define (create-k-models type multidatasets objective-name model-options)
+  (let (models (map (lambda (x)
+                      (create type
+                              (merge {"datasets" x
+                                      "objective_field" objective-name}
+                                     model-options)))
+                    multidatasets))
+    (wait* models)))
+
+;; create-k-purged-evaluations
+;;
+;; Creates the models and the evaluations on the purged hold-out data for a
+;; set of k-fold datasets
+;;
+;; Inputs:
+;;   dataset-ids: (list) List of the k-fold dataset IDs
+;;   objective-name: (string) Objective field name
+;;   dataset-name: (string) Name of the origin dataset
+;;   model-type: (string) Type of supervised learning model
+;;   model-options: (map) Options used to build the models/ensembles
+;;   evaluation-options: (map) Options used to build evaluations
+;;   delete-resources?: (boolean) Whether to delete all intermediate resources
+;;
+;; Output: (list) List of evaluation IDs
+;;
+(define (create-k-purged-evaluations dataset-ids
+                                     objective-name
+                                     dataset-name
+                                     model-type
+                                     model-options
+                                     evaluation-options
+                                     delete-resources?)
+  (let (k-fold-pairs (pair-k-folds dataset-ids)
+        evaluation-options (select-map-keys evaluation-options
+                                            EVALUATION_OPTIONS)
+        multidatasets (map last k-fold-pairs)
+        models (create-k-models model-type
+                                multidatasets
+                                objective-name
+                                model-options)
+        ;; the purged test datasets are kept so that they can be deleted
+        ;; together with the models once the evaluations are finished
+        purged-ids (wait* (map purge-dataset dataset-ids))
+        evaluations (iterate (es []
+                              id purged-ids
+                              mid models
+                              idx (range 1 (+ 1 (count dataset-ids))))
+                      (let (name (str "Evaluation tested with subset "
+                                      idx
+                                      " of " dataset-name)
+                            opts (assoc evaluation-options "name" name))
+                        (append es (create-evaluation id mid opts)))))
+    (wait* evaluations)
+    (when delete-resources?
+      (map safe-delete (concat models purged-ids)))
+    evaluations))
+
+
+;; purge-dataset
+;;
+;; purges PURGE_PERCENTAGE (7.5%) of the rows from each edge of the dataset
+;; Inputs:
+;;   ds-id: (dataset-id) dataset ID
+;;
+;; Output: (dataset-id) purged dataset ID
+(define (purge-dataset ds-id)
+  (let (dataset (fetch ds-id)
+        rows (dataset "rows")
+        pruning-rows (round (* (/ rows 100) PURGE_PERCENTAGE))
+        ;; "range" limits are 1-based and inclusive: the same number of
+        ;; rows is dropped at the beginning and at the end
+        start (+ 1 pruning-rows)
+        end (- rows pruning-rows))
+    (log-info "range" (str start) (str end))
+    (create-dataset {"origin_dataset" ds-id
+                     "range" [start end]})))
+
+
+;; safe-delete
+;;
+;; deletes resources ignoring errors
+;; Inputs:
+;;   id: (resource-id) resource to be deleted
+;;
+;; Output: (boolean) true if successful, false if not
+;;
+(define (safe-delete id)
+  (try (delete id)
+       (catch e
+         (log-info (str "Error deleting resource " id " ignored"))
+         false)))
+
+(define purged-cv-output
+  (purged-model-cross-validation slm-id
+                                 k-folds
+                                 evaluation-options
+                                 delete-resources?))
diff --git a/cross-validation/readme.md
b/cross-validation/readme.md index bd8d21d..cc6b2ea 100644 --- a/cross-validation/readme.md +++ b/cross-validation/readme.md @@ -1,7 +1,13 @@ -# k-fold cross-validation +# Cross-validation -In this package you'll find three scripts implementing k-fold -cross-validation for different resources: +In this package you'll find scripts implementing cross-validation for different resources: + +## k-fold cross-validation + +The following scripts perform a k-fold cross-validation for different kinds of +models. The last one can be applied to any existing supervised learning model. +The k-fold splits select random rows from the original dataset to create +k models and evaluations, so they are not suitable for time series data. - [Basic 5-fold cross-validation](./basic) 5-fold cross-validation of a model created with default parameters @@ -24,6 +30,15 @@ cross-validation for different resources: General k-fold cross-validation for existing Supervised Learning Classification Models. +## Purged cross-validation + +The following script performs a k-fold cross-validation compatible with time +series datasets. Test datasets are created by splitting the original +dataset into consecutive blocks of rows, and some data is removed from the edges of each test dataset to avoid leakage. + +- [Purged k-fold cross-validation script](./purged-cross-validation) + General k-fold cross-validation for Supervised Learning Models built on + time series datasets. ## Installation