diff --git a/model_definitions/r-diabetes/README.md b/model_definitions/r-diabetes/README.md deleted file mode 100644 index b61ee146..00000000 --- a/model_definitions/r-diabetes/README.md +++ /dev/null @@ -1,107 +0,0 @@ -# Overview -PIMA Diabetes demo model using R - -# Datasets -The dataset required to train or evaluate this model is the PIMA Indians Diabetes dataset available [here](http://nrvis.com/data/mldata/pima-indians-diabetes.csv). The teradataml code to import it is - -```python -import pandas as pd -from teradataml import copy_to_sql - -df = pd.read_csv("http://nrvis.com/data/mldata/pima-indians-diabetes.csv", header=None) -df.columns = ["NumTimesPrg", "PlGlcConc", "BloodP", "SkinThick", "TwoHourSerIns", "BMI", "DiPedFunc", "Age", "HasDiabetes"] - -copy_to_sql(df = df, table_name = "PIMA", index=True, index_label="PatientId", if_exists="replace") -``` - -```sql -CREATE TABLE PIMA_PATIENT_FEATURES AS - (SELECT - patientid, - numtimesprg, - plglcconc, - bloodp, - skinthick, - twohourserins, - bmi, - dipedfunc, - age - FROM PIMA - ) WITH DATA; - - -CREATE TABLE PIMA_PATIENT_DIAGNOSES AS - (SELECT - patientid, - hasdiabetes - FROM PIMA - ) WITH DATA; -``` - - -# Training -The [training.R](model_modules/training.R) produces the following artifacts - -- model.rds (gbm parameters) - -# Evaluation -Evaluation is defined in the `evaluate` method in [scoring.R](model_modules/scoring.R) and it returns the following metrics - -- Accuracy -- Recall -- Precision -- f1-score - -We produce a number of plots for each evaluation also - -- confusion matrix - -# Scoring -This demo mode supports two types of scoring - - - Batch - - RESTful - -Batch Scoring is supported via the `score` method in [scoring.R](model_modules/scoring.R). - -The following table must exist to write (append) the scores into - -```sql -CREATE MULTISET TABLE pima_patient_predictions ( - job_id VARCHAR(255), -- comes from airflow on job execution - PatientId BIGINT, -- entity key as it is in the source data - HasDiabetes BIGINT, -- if model automatically extracts target - json_report CLOB(1048544000) CHARACTER SET UNICODE -- output of - ) - PRIMARY INDEX ( job_id ); -``` - -RESTful scoring is supported via the `score.restful` function in [scoring.R](model_modules/scoring.R) which implements a predict method which is called by the RESTful Serving Engine. An example request is - - curl -X POST http://localhost:5000/predict \ - -H "Content-Type: application/json" \ - -d '{ - "data": { - "ndarray": [[ - 6, - 148, - 72, - 35, - 0, - 33.6, - 0.627, - 50 - ]], - "names":[ - "NumTimesPrg", - "PlGlcConc", - "BloodP", - "SkinThick", - "TwoHourSerIns", - "BMI", - "DiPedFunc", - "Age" - ] - } - }' - diff --git a/model_definitions/r-diabetes/config.json b/model_definitions/r-diabetes/config.json deleted file mode 100644 index acd145ed..00000000 --- a/model_definitions/r-diabetes/config.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "hyperParameters": { - "shrinkage": 0.01, - "cv.folds": 5, - "n.trees": 3000 - } -} diff --git a/model_definitions/r-diabetes/model.json b/model_definitions/r-diabetes/model.json deleted file mode 100644 index 40177be3..00000000 --- a/model_definitions/r-diabetes/model.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "id": "bf6a52b2-b595-4358-ac4f-24fb41a85c45", - "name": "R Diabetes Prediction", - "description": "R GBM for Diabetes Prediction", - "language": "R", - - "automation": { - "training": { - "resources": { - "cpu": "1", - "memory": "10Gi" - } - }, - "evaluation": { - "resources": { - "cpu": "1", - "memory": "10Gi" - } - }, - "deployment": { - "resources": { - "cpu": "1", - "memory": "10Gi" - } - } - } -} diff --git a/model_definitions/r-diabetes/model_modules/evaluation.R b/model_definitions/r-diabetes/model_modules/evaluation.R deleted file mode 100644 index 68e5bd3f..00000000 --- a/model_definitions/r-diabetes/model_modules/evaluation.R +++ /dev/null @@ -1,44 +0,0 @@ - -LoadPackages <- function() { - library("methods") - library("jsonlite") - library("caret") - library("gbm") - library("DBI") - library("dplyr") - library("tdplyr") -} - -evaluate <- function(data_conf, model_conf, ...) { - model <- readRDS("artifacts/input/model.rds") - print("Evaluating model...") - - suppressPackageStartupMessages(LoadPackages()) - - # Connect to Vantage - con <- aoa_create_context() - - table <- tbl(con, sql(data_conf$sql)) - - # Create dataframe from tibble, selecting the necessary columns and mutating integer64 to integers - data <- table %>% mutate(NumTimesPrg = as.integer(NumTimesPrg), - PlGlcConc = as.integer(PlGlcConc), - BloodP = as.integer(BloodP), - SkinThick = as.integer(SkinThick), - TwoHourSerIns = as.integer(TwoHourSerIns), - HasDiabetes = as.integer(HasDiabetes)) %>% as.data.frame() - - probs <- predict(model, data, na.action = na.pass, type = "response") - preds <- as.integer(ifelse(probs > 0.5, 1, 0)) - - cm <- confusionMatrix(table(preds, data$HasDiabetes)) - - png("artifacts/output/confusion_matrix.png", width = 860, height = 860) - fourfoldplot(cm$table) - dev.off() - - preds$pred <- preds - metrics <- cm$overall - - write(jsonlite::toJSON(metrics, auto_unbox = TRUE, null = "null", keep_vec_names=TRUE), "artifacts/output/metrics.json") -} diff --git a/model_definitions/r-diabetes/model_modules/requirements.R b/model_definitions/r-diabetes/model_modules/requirements.R deleted file mode 100644 index eaa163a3..00000000 --- a/model_definitions/r-diabetes/model_modules/requirements.R +++ /dev/null @@ -1,7 +0,0 @@ -message('Installing packages') -if(!require('gbm')){install.packages('gbm')} -if(!require('devtools')){install.packages('devtools')} -if(!require('caret')){install.packages('caret')} - -#library("devtools") -#install_git("git://github.com/jpmml/r2pmml.git") diff --git a/model_definitions/r-diabetes/model_modules/scoring.R b/model_definitions/r-diabetes/model_modules/scoring.R deleted file mode 100644 index d56a4d2a..00000000 --- a/model_definitions/r-diabetes/model_modules/scoring.R +++ /dev/null @@ -1,73 +0,0 @@ -library(methods) -library(gbm) -library(jsonlite) -library(caret) - -LoadBatchScoringPackages <- function() { - library("gbm") - library("DBI") - library("dplyr") - library("tdplyr") -} - -score.restful <- function(model, data, ...) { - print("Scoring model...") - probs <- predict(model, data, na.action = na.pass, type = "response") - score <- ifelse(probs > 0.5, 1, 0) - score -} - -score.batch <- function(data_conf, model_conf, model_version, job_id, ...) { - model <- initialise_model() - print("Batch scoring model...") - - suppressPackageStartupMessages(LoadBatchScoringPackages()) - - # Connect to Teradata Vantage - con <- aoa_create_context() - - table <- tbl(con, sql(data_conf$sql)) - - # Create dataframe from tibble, selecting the necessary columns and mutating integer64 to integers - data <- table %>% mutate(PatientId = as.integer(PatientId), - NumTimesPrg = as.integer(NumTimesPrg), - PlGlcConc = as.integer(PlGlcConc), - BloodP = as.integer(BloodP), - SkinThick = as.integer(SkinThick), - TwoHourSerIns = as.integer(TwoHourSerIns)) %>% as.data.frame() - - # The model object will be obtain from the environment as it has already been initialised using 'initialise_model' - probs <- predict(model, data, na.action = na.pass, type = "response") - score <- as.integer(ifelse(probs > 0.5, 1, 0)) - print("Finished batch scoring model...") - - # create result dataframe and store in Teradata Vantage - pred_df <- as.data.frame(unlist(score)) - colnames(pred_df) <- c("HasDiabetes") - pred_df$PatientId <- data$PatientId - pred_df$job_id <- job_id - - # tdplyr doesn't match column names on append.. and so to match / use same table schema as for byom predict - # example (see README.md), we must add empty json_report column and change column order manually (v17.0.0.4) - # CREATE MULTISET TABLE pima_patient_predictions - # ( - # job_id VARCHAR(255), -- comes from airflow on job execution - # PatientId BIGINT, -- entity key as it is in the source data - # HasDiabetes BIGINT, -- if model automatically extracts target - # json_report CLOB(1048544000) CHARACTER SET UNICODE -- output of - # ) - # PRIMARY INDEX ( job_id ); - pred_df$json_report <- "" - pred_df <- pred_df[, c("job_id", "PatientId", "HasDiabetes", "json_report")] - - copy_to(con, pred_df, - name=dbplyr::in_schema(data_conf$predictions$database, data_conf$predictions$table), - types = c("varchar(255)", "bigint", "bigint", "clob"), - append=TRUE) - print("Saved batch predictions...") -} - -initialise_model <- function() { - print("Loading model...") - model <- readRDS("artifacts/input/model.rds") -} diff --git a/model_definitions/r-diabetes/model_modules/training.R b/model_definitions/r-diabetes/model_modules/training.R deleted file mode 100644 index c1db79c9..00000000 --- a/model_definitions/r-diabetes/model_modules/training.R +++ /dev/null @@ -1,57 +0,0 @@ -LoadPackages <- function() { - library("gbm") - library("DBI") - library("dplyr") - library("tdplyr") - -} - -suppressPackageStartupMessages(LoadPackages()) - -train <- function(data_conf, model_conf, ...) { - # Connect to Vantage - con <- aoa_create_context() - - table <- tbl(con, sql(data_conf$sql)) - - # Create dataframe from tibble, selecting the necessary columns and mutating integer64 to integers - # select both the feature and target columns (ignorning e.g. entity key) - columns <- unlist(c(data_conf$featureNames, data_conf$targetNames), use.name = TRUE) - data <- table %>% select(all_of(columns)) %>% mutate( - NumTimesPrg = as.integer(NumTimesPrg), - PlGlcConc = as.integer(PlGlcConc), - BloodP = as.integer(BloodP), - SkinThick = as.integer(SkinThick), - TwoHourSerIns = as.integer(TwoHourSerIns), - HasDiabetes = as.integer(HasDiabetes)) %>% as.data.frame() - - # Load hyperparameters from model configuration - hyperparams <- model_conf[["hyperParameters"]] - - print("Training model...") - - # Train model - model <- gbm(HasDiabetes~., - data=data, - shrinkage=hyperparams$shrinkage, - distribution = 'bernoulli', - cv.folds=hyperparams$cv.folds, - n.trees=hyperparams$n.trees, - verbose=FALSE) - - print("Model Trained!") - - # Get optimal number of iterations - best.iter <- gbm.perf(model, plot.it=FALSE, method="cv") - - # clean the model (R stores the dataset on the model.. - model$data <- NULL - - # how to save only best.iter tree? - # model$best.iter <- best.iter - # model$trees <- light$trees[best.iter] - - # Save trained model - print("Saving trained model...") - saveRDS(model, "artifacts/output/model.rds") -}