diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 000000000..e37101be7 --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,283 @@ +version: 2 + +defaults: &defaults + docker: + - image: circleci/python:3.7.2 + working_directory: ~/project + +prepare_venv: &prepare_venv + run: + name: Create venv + command: | + python3 -m venv venv + source venv/bin/activate + pip install --upgrade pip + +prepare_tox: &prepare_tox + run: + name: Install tox + command: | + sudo pip install --upgrade pip + pip install --user tox + +fetch_data: &fetch_data + run: + name: Set script permissions and fetch data + command: | + source venv/bin/activate + chmod +x ./scripts/fetch_kaggle_dataset.sh + ./scripts/fetch_kaggle_dataset.sh + +jobs: + test_regression_model_py36: + docker: + - image: circleci/python:3.6.9 + working_directory: ~/project/packages/regression_model + steps: + - checkout: + path: ~/project + - run: + name: Run tests with Python 3.6 + command: | + sudo pip install --upgrade pip + pip install --user tox + tox -e py36 + + test_regression_model_py37: + docker: + - image: circleci/python:3.7.6 + working_directory: ~/project/packages/regression_model + steps: + - checkout: + path: ~/project + - run: + name: Run tests with Python 3.7 + command: | + sudo pip install --upgrade pip + pip install --user tox + tox -e py37 + + test_regression_model_py38: + docker: + - image: circleci/python:3.8.0 + working_directory: ~/project/packages/regression_model + steps: + - checkout: + path: ~/project + - run: + name: Run tests with Python 3.8 + command: | + sudo pip install --upgrade pip + pip install --user tox + tox -e py38 + + test_ml_api_py36: + docker: + - image: circleci/python:3.6.9 + working_directory: ~/project/packages/ml_api + steps: + - checkout: + path: ~/project + - run: + name: Run API tests with Python 3.6 + command: | + sudo pip install --upgrade pip + pip install --user tox + tox -e py36 + + test_ml_api_py37: + docker: + - image: 
circleci/python:3.7.6 + working_directory: ~/project/packages/ml_api + steps: + - checkout: + path: ~/project + - run: + name: Run API tests with Python 3.7 + command: | + sudo pip install --upgrade pip + pip install --user tox + tox -e py37 + + test_ml_api_py38: + docker: + - image: circleci/python:3.8.1 + working_directory: ~/project/packages/ml_api + steps: + - checkout: + path: ~/project + - run: + name: Run API tests with Python 3.8 + command: | + sudo pip install --upgrade pip + pip install --user tox + tox -e py38 + + train_and_upload_regression_model: + <<: *defaults + steps: + - checkout + - *prepare_venv + - run: + name: Install requirements + command: | + . venv/bin/activate + pip install -r packages/regression_model/requirements.txt + - *fetch_data + - run: + name: Train model + command: | + . venv/bin/activate + PYTHONPATH=./packages/regression_model python3 packages/regression_model/regression_model/train_pipeline.py + - run: + name: Publish model to Gemfury + command: | + . venv/bin/activate + chmod +x ./scripts/publish_model.sh + ./scripts/publish_model.sh ./packages/regression_model/ + + section_9_differential_tests: + <<: *defaults + steps: + - checkout + - *prepare_venv + - run: + name: Capturing previous model predictions + command: | + . venv/bin/activate + pip install -r packages/ml_api/diff_test_requirements.txt + PYTHONPATH=./packages/ml_api python3 packages/ml_api/tests/capture_model_predictions.py + - run: + name: Running differential tests + command: | + . 
venv/bin/activate + pip install -r packages/ml_api/requirements.txt + py.test -vv packages/ml_api/tests -m differential + + section_10_deploy_to_heroku: + <<: *defaults + steps: + - checkout + - run: + name: Deploy to Heroku + command: | + git push https://heroku:$HEROKU_API_KEY@git.heroku.com/$HEROKU_APP_NAME.git master + + section_11_build_and_push_to_heroku_docker: + <<: *defaults + steps: + - checkout + - setup_remote_docker: + docker_layer_caching: true + - run: docker login --username=$HEROKU_EMAIL --password=$HEROKU_API_KEY registry.heroku.com + - run: + name: Setup Heroku CLI + command: | + wget -qO- https://cli-assets.heroku.com/install-ubuntu.sh | sh + - run: + name: Build and Push Image + command: | + make build-ml-api-heroku push-ml-api-heroku + - run: + name: Release to Heroku + command: | + heroku container:release web --app $HEROKU_APP_NAME + + section_12_publish_docker_image_to_aws: + <<: *defaults + working_directory: ~/project/packages/ml_models + steps: + - checkout + - setup_remote_docker + - run: + name: Publishing docker image to aws ECR + command: | + sudo pip install awscli + eval $(aws ecr get-login --no-include-email --region us-east-1) + make build-ml-api-aws tag-ml-api push-ml-api-aws + aws ecs update-service --cluster ml-api-cluster --service custom-service --task-definition first-run-task-definition --force-new-deployment + + section_13_train_and_upload_neural_network_model: + docker: + - image: circleci/python:3.6.4-stretch + working_directory: ~/project + steps: + - checkout + - *prepare_venv + - run: + name: Install requirements + command: | + . venv/bin/activate + pip install -r packages/neural_network_model/requirements.txt + - run: + name: Fetch Training data - 2GB + command: | + . venv/bin/activate + chmod +x ./scripts/fetch_kaggle_large_dataset.sh + ./scripts/fetch_kaggle_large_dataset.sh + - run: + name: Train model + command: | + . 
venv/bin/activate + PYTHONPATH=./packages/neural_network_model python3 packages/neural_network_model/neural_network_model/train_pipeline.py + - run: + name: Publish model to Gemfury + command: | + . venv/bin/activate + chmod +x ./scripts/publish_model.sh + ./scripts/publish_model.sh ./packages/neural_network_model/ + +workflows: + version: 2 + test-all: + jobs: + - test_regression_model_py36 + - test_regression_model_py37 + - test_regression_model_py38 + - test_ml_api_py36 + - test_ml_api_py37 + # - test_ml_api_py38 pending NN model update + - section_9_differential_tests + - train_and_upload_regression_model: + requires: + - test_regression_model_py36 + - test_regression_model_py37 + - test_regression_model_py38 + - test_ml_api_py36 + - test_ml_api_py37 + - section_9_differential_tests + filters: + branches: + only: + - master + # - section_10_deploy_to_heroku: + # requires: + # - train_and_upload_regression_model + # filters: + # branches: + # only: + # - master +# - section_11_build_and_push_to_heroku_docker: +# requires: +# - train_and_upload_regression_model +# filters: +# branches: +# only: +# - master + # - section_12_publish_docker_image_to_aws: + # requires: + # - train_and_upload_regression_model + # filters: + # branches: + # only: + # - master +# - section_13_train_and_upload_neural_network_model: +# requires: +# - test_regression_model +# - test_ml_api +# - section_9_differential_tests + # - train_and_upload_regression_model + # filters: + # branches: + # only: + # - master diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 000000000..b9e54fcad --- /dev/null +++ b/.dockerignore @@ -0,0 +1,9 @@ +jupyter_notebooks* +*/env* +*/venv* +.circleci* +packages/regression_model +*.env +*.log +.git +.gitignore \ No newline at end of file diff --git a/.gitignore b/.gitignore index cda241356..29988fc6f 100644 --- a/.gitignore +++ b/.gitignore @@ -106,10 +106,30 @@ venv.bak/ # pycharm .idea/ -# pickle files -*.pkl - # datafiles 
packages/regression_model/regression_model/datasets/*.csv packages/regression_model/regression_model/datasets/*.zip packages/regression_model/regression_model/datasets/*.txt +train.csv +test.csv +data_description.txt +house-prices-advanced-regression-techniques.zip +sample_submission.csv +test_data_predictions.csv +v2-plant-seedlings-dataset/ +v2-plant-seedlings-dataset.zip + +# all logs +logs/ + +# trained models (will be created in CI) +packages/regression_model/regression_model/trained_models/*.pkl +packages/neural_network_model/neural_network_model/trained_models/*.pkl +packages/neural_network_model/neural_network_model/trained_models/*.h5 +*.h5 +packages/neural_network_model/neural_network_model/datasets/training_data_reference.txt + +.DS_Store + +kaggle.json +packages/ml_api/uploads/* diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 000000000..bbba25c1a --- /dev/null +++ b/Dockerfile @@ -0,0 +1,23 @@ +FROM python:3.6.4 + +# Create the user that will run the app +RUN adduser --disabled-password --gecos '' ml-api-user + +WORKDIR /opt/ml_api + +ARG PIP_EXTRA_INDEX_URL +ENV FLASK_APP run.py + +# Install requirements, including from Gemfury +ADD ./packages/ml_api /opt/ml_api/ +RUN pip install --upgrade pip +RUN pip install -r /opt/ml_api/requirements.txt + +RUN chmod +x /opt/ml_api/run.sh +RUN chown -R ml-api-user:ml-api-user ./ + +USER ml-api-user + +EXPOSE 5000 + +CMD ["bash", "./run.sh"] \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 000000000..f02d80abc --- /dev/null +++ b/LICENSE @@ -0,0 +1,29 @@ +BSD 3-Clause License + +Copyright (c) 2019, Soledad Galli and Christopher Samiullah. Deployment of Machine Learning Models, online course. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. 
Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/Makefile b/Makefile new file mode 100644 index 000000000..7fe16bef3 --- /dev/null +++ b/Makefile @@ -0,0 +1,20 @@ +NAME=udemy-ml-api +# Heroku app name: taken from the environment when set, otherwise falls back to NAME, so the build and push targets always agree on the image tag +HEROKU_APP_NAME ?= $(NAME) +COMMIT_ID=$(shell git rev-parse HEAD) + + +build-ml-api-heroku: + docker build --build-arg PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} -t registry.heroku.com/$(HEROKU_APP_NAME)/web:$(COMMIT_ID) . + +push-ml-api-heroku: + docker push registry.heroku.com/${HEROKU_APP_NAME}/web:$(COMMIT_ID) + +build-ml-api-aws: + docker build --build-arg PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} -t $(NAME):$(COMMIT_ID) . 
+ +push-ml-api-aws: + docker push ${AWS_ACCOUNT_ID}.dkr.ecr.us-east-1.amazonaws.com/$(NAME):$(COMMIT_ID) + +tag-ml-api: + docker tag $(NAME):$(COMMIT_ID) ${AWS_ACCOUNT_ID}.dkr.ecr.us-east-1.amazonaws.com/$(NAME):$(COMMIT_ID) diff --git a/Procfile b/Procfile new file mode 100644 index 000000000..2d349992c --- /dev/null +++ b/Procfile @@ -0,0 +1 @@ +web: gunicorn --pythonpath packages/ml_api --access-logfile - --error-logfile - run:application \ No newline at end of file diff --git a/README.md b/README.md index 692a61f79..7fbf80b75 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,4 @@ -# Deploying Machine Learning Models -For the documentation, visit the course on Udemy. +# Deployment of Machine Learning Models +Accompanying repo for the online course Deployment of Machine Learning Models. + +For the documentation, visit the [course on Udemy](https://www.udemy.com/deployment-of-machine-learning-models/?couponCode=TIDREPO). diff --git a/packages/ml_api/VERSION b/packages/ml_api/VERSION new file mode 100644 index 000000000..9325c3ccd --- /dev/null +++ b/packages/ml_api/VERSION @@ -0,0 +1 @@ +0.3.0 \ No newline at end of file diff --git a/packages/ml_api/api/__init__.py b/packages/ml_api/api/__init__.py new file mode 100644 index 000000000..ad56c24c1 --- /dev/null +++ b/packages/ml_api/api/__init__.py @@ -0,0 +1,4 @@ +from api.config import PACKAGE_ROOT + +with open(PACKAGE_ROOT / 'VERSION') as version_file: + __version__ = version_file.read().strip() diff --git a/packages/ml_api/api/app.py b/packages/ml_api/api/app.py new file mode 100644 index 000000000..40abdb55f --- /dev/null +++ b/packages/ml_api/api/app.py @@ -0,0 +1,20 @@ +from flask import Flask + +from api.config import get_logger + + +_logger = get_logger(logger_name=__name__) + + +def create_app(*, config_object) -> Flask: + """Create a flask app instance.""" + + flask_app = Flask('ml_api') + flask_app.config.from_object(config_object) + + # import blueprints + from api.controller import prediction_app + 
flask_app.register_blueprint(prediction_app) + _logger.debug('Application instance created') + + return flask_app diff --git a/packages/ml_api/api/config.py b/packages/ml_api/api/config.py new file mode 100644 index 000000000..3ca849c99 --- /dev/null +++ b/packages/ml_api/api/config.py @@ -0,0 +1,71 @@ +import logging +from logging.handlers import TimedRotatingFileHandler +import pathlib +import os +import sys + +PACKAGE_ROOT = pathlib.Path(__file__).resolve().parent.parent + +FORMATTER = logging.Formatter( + "%(asctime)s — %(name)s — %(levelname)s —" + "%(funcName)s:%(lineno)d — %(message)s") +LOG_DIR = PACKAGE_ROOT / 'logs' +LOG_DIR.mkdir(exist_ok=True) +LOG_FILE = LOG_DIR / 'ml_api.log' +UPLOAD_FOLDER = PACKAGE_ROOT / 'uploads' +UPLOAD_FOLDER.mkdir(exist_ok=True) + +ALLOWED_EXTENSIONS = set(['png', 'jpg', 'jpeg']) + + +def get_console_handler(): + console_handler = logging.StreamHandler(sys.stdout) + console_handler.setFormatter(FORMATTER) + return console_handler + + +def get_file_handler(): + file_handler = TimedRotatingFileHandler( + LOG_FILE, when='midnight') + file_handler.setFormatter(FORMATTER) + file_handler.setLevel(logging.WARNING) + return file_handler + + +def get_logger(*, logger_name): + """Get logger with prepared handlers.""" + + logger = logging.getLogger(logger_name) + + logger.setLevel(logging.INFO) + + logger.addHandler(get_console_handler()) + logger.addHandler(get_file_handler()) + logger.propagate = False + + return logger + + +class Config: + DEBUG = False + TESTING = False + CSRF_ENABLED = True + SECRET_KEY = 'this-really-needs-to-be-changed' + SERVER_PORT = 5000 + UPLOAD_FOLDER = UPLOAD_FOLDER + + +class ProductionConfig(Config): + DEBUG = False + # Assigned with '=' (a bare 'NAME: value' is only an annotation and sets nothing); port cast to int to match Config.SERVER_PORT + SERVER_ADDRESS = os.environ.get('SERVER_ADDRESS', '0.0.0.0') + SERVER_PORT = int(os.environ.get('SERVER_PORT', '5000')) + + +class DevelopmentConfig(Config): + DEVELOPMENT = True + DEBUG = True + + +class TestingConfig(Config): + TESTING = True diff --git a/packages/ml_api/api/controller.py 
b/packages/ml_api/api/controller.py new file mode 100644 index 000000000..4e683b2dc --- /dev/null +++ b/packages/ml_api/api/controller.py @@ -0,0 +1,92 @@ +from flask import Blueprint, request, jsonify +from regression_model.predict import make_prediction +from regression_model import __version__ as _version +from neural_network_model.predict import make_single_prediction +import os +from werkzeug.utils import secure_filename + +from api.config import get_logger, UPLOAD_FOLDER +from api.validation import validate_inputs, allowed_file +from api import __version__ as api_version + +_logger = get_logger(logger_name=__name__) + + +prediction_app = Blueprint('prediction_app', __name__) + + +@prediction_app.route('/health', methods=['GET']) +def health(): + if request.method == 'GET': + _logger.info('health status OK') + return 'ok' + + +@prediction_app.route('/version', methods=['GET']) +def version(): + if request.method == 'GET': + return jsonify({'model_version': _version, + 'api_version': api_version}) + + +@prediction_app.route('/v1/predict/regression', methods=['POST']) +def predict(): + if request.method == 'POST': + # Step 1: Extract POST data from request body as JSON + json_data = request.get_json() + _logger.debug(f'Inputs: {json_data}') + + # Step 2: Validate the input using marshmallow schema + input_data, errors = validate_inputs(input_data=json_data) + + # Step 3: Model prediction + result = make_prediction(input_data=input_data) + _logger.debug(f'Outputs: {result}') + + # Step 4: Convert numpy ndarray to list + predictions = result.get('predictions').tolist() + version = result.get('version') + + # Step 5: Return the response as JSON + return jsonify({'predictions': predictions, + 'version': version, + 'errors': errors}) + + +@prediction_app.route('/predict/classifier', methods=['POST']) +def predict_image(): + if request.method == 'POST': + # Step 1: check if the post request has the file part + if 'file' not in request.files: + return jsonify('No file 
found'), 400 + + file = request.files['file'] + + # Step 2: Basic file extension validation + if file and allowed_file(file.filename): + filename = secure_filename(file.filename) + + # Step 3: Save the file + # Note, in production, this would require careful + # validation, management and clean up. + file.save(os.path.join(UPLOAD_FOLDER, filename)) + + _logger.debug(f'Inputs: {filename}') + + # Step 4: perform prediction + result = make_single_prediction( + image_name=filename, + image_directory=UPLOAD_FOLDER) + + _logger.debug(f'Outputs: {result}') + + readable_predictions = result.get('readable_predictions') + version = result.get('version') + + # Step 5: Return the response as JSON + return jsonify( + {'readable_predictions': readable_predictions[0], + 'version': version}) + + # Reached only when the Step 2 check fails (outside that if-block): answer 400 instead of returning None + return jsonify('File missing or extension not allowed'), 400 diff --git a/packages/ml_api/api/validation.py b/packages/ml_api/api/validation.py new file mode 100644 index 000000000..c143263a4 --- /dev/null +++ b/packages/ml_api/api/validation.py @@ -0,0 +1,155 @@ +import typing as t + +from marshmallow import Schema, fields +from marshmallow import ValidationError + +from api import config + + +class InvalidInputError(Exception): + """Invalid model input.""" + + +SYNTAX_ERROR_FIELD_MAP = { + '1stFlrSF': 'FirstFlrSF', + '2ndFlrSF': 'SecondFlrSF', + '3SsnPorch': 'ThreeSsnPortch' +} + + +class HouseDataRequestSchema(Schema): + Alley = fields.Str(allow_none=True) + BedroomAbvGr = fields.Integer() + BldgType = fields.Str() + BsmtCond = fields.Str() + BsmtExposure = fields.Str(allow_none=True) + BsmtFinSF1 = fields.Float() + BsmtFinSF2 = fields.Float() + BsmtFinType1 = fields.Str() + BsmtFinType2 = fields.Str() + BsmtFullBath = fields.Float() + BsmtHalfBath = fields.Float() + BsmtQual = fields.Str(allow_none=True) + BsmtUnfSF = fields.Float() + CentralAir = fields.Str() + Condition1 = fields.Str() + Condition2 = fields.Str() + Electrical = fields.Str() + EnclosedPorch = fields.Integer() + ExterCond = fields.Str() + ExterQual = fields.Str() + Exterior1st = 
fields.Str() + Exterior2nd = fields.Str() + Fence = fields.Str(allow_none=True) + FireplaceQu = fields.Str(allow_none=True) + Fireplaces = fields.Integer() + Foundation = fields.Str() + FullBath = fields.Integer() + Functional = fields.Str() + GarageArea = fields.Float() + GarageCars = fields.Float() + GarageCond = fields.Str() + GarageFinish = fields.Str(allow_none=True) + GarageQual = fields.Str() + GarageType = fields.Str(allow_none=True) + GarageYrBlt = fields.Float() + GrLivArea = fields.Integer() + HalfBath = fields.Integer() + Heating = fields.Str() + HeatingQC = fields.Str() + HouseStyle = fields.Str() + Id = fields.Integer() + KitchenAbvGr = fields.Integer() + KitchenQual = fields.Str() + LandContour = fields.Str() + LandSlope = fields.Str() + LotArea = fields.Integer() + LotConfig = fields.Str() + LotFrontage = fields.Float(allow_none=True) + LotShape = fields.Str() + LowQualFinSF = fields.Integer() + MSSubClass = fields.Integer() + MSZoning = fields.Str() + MasVnrArea = fields.Float() + MasVnrType = fields.Str(allow_none=True) + MiscFeature = fields.Str(allow_none=True) + MiscVal = fields.Integer() + MoSold = fields.Integer() + Neighborhood = fields.Str() + OpenPorchSF = fields.Integer() + OverallCond = fields.Integer() + OverallQual = fields.Integer() + PavedDrive = fields.Str() + PoolArea = fields.Integer() + PoolQC = fields.Str(allow_none=True) + RoofMatl = fields.Str() + RoofStyle = fields.Str() + SaleCondition = fields.Str() + SaleType = fields.Str() + ScreenPorch = fields.Integer() + Street = fields.Str() + TotRmsAbvGrd = fields.Integer() + TotalBsmtSF = fields.Float() + Utilities = fields.Str() + WoodDeckSF = fields.Integer() + YearBuilt = fields.Integer() + YearRemodAdd = fields.Integer() + YrSold = fields.Integer() + FirstFlrSF = fields.Integer() + SecondFlrSF = fields.Integer() + ThreeSsnPortch = fields.Integer() + + +def _filter_error_rows(errors: dict, + validated_input: t.List[dict] + ) -> t.List[dict]: + """Remove input data rows with 
errors.""" + + indexes = errors.keys() + # delete them in reverse order so that you + # don't throw off the subsequent indexes. + for index in sorted(indexes, reverse=True): + del validated_input[index] + + return validated_input + + +def validate_inputs(input_data): + """Check prediction inputs against schema.""" + + # set many=True to allow passing in a list + schema = HouseDataRequestSchema(strict=True, many=True) + + # convert syntax error field names (beginning with numbers) + for row in input_data: + for key, value in SYNTAX_ERROR_FIELD_MAP.items(): + if key in row: + row[value] = row.pop(key) + + errors = None + try: + schema.load(input_data) + except ValidationError as exc: + errors = exc.messages + + # convert syntax error field names back + # this is a hack - never name your data + # fields with numbers as the first letter. + for row in input_data: + for key, value in SYNTAX_ERROR_FIELD_MAP.items(): + if value in row: + row[key] = row.pop(value) + + if errors: + validated_input = _filter_error_rows( + errors=errors, + validated_input=input_data) + else: + validated_input = input_data + + return validated_input, errors + + +def allowed_file(filename): + return '.' 
in filename and \ + filename.rsplit('.', 1)[1].lower() in config.ALLOWED_EXTENSIONS diff --git a/packages/ml_api/diff_test_requirements.txt b/packages/ml_api/diff_test_requirements.txt new file mode 100644 index 000000000..37ebe9b56 --- /dev/null +++ b/packages/ml_api/diff_test_requirements.txt @@ -0,0 +1,13 @@ +--extra-index-url=${PIP_EXTRA_INDEX_URL} + +# api +flask>=1.1.1,<1.2.0 + +# schema validation +marshmallow==2.17.0 + +# Set this to the previous model version +regression-model==2.0.19 + +# temporarily necessary as we update sklearn +joblib>=0.14.1,<0.15.0 \ No newline at end of file diff --git a/packages/ml_api/requirements.txt b/packages/ml_api/requirements.txt new file mode 100644 index 000000000..39a8feec1 --- /dev/null +++ b/packages/ml_api/requirements.txt @@ -0,0 +1,14 @@ +--extra-index-url=${PIP_EXTRA_INDEX_URL} + +# api +flask>=1.1.1,<1.2.0 + +# schema validation +marshmallow==2.17.0 + +# Install from gemfury +regression-model==2.0.20 +neural_network_model==0.1.1 + +# Deployment +gunicorn==19.9.0 \ No newline at end of file diff --git a/packages/ml_api/run.py b/packages/ml_api/run.py new file mode 100644 index 000000000..7f60a072a --- /dev/null +++ b/packages/ml_api/run.py @@ -0,0 +1,10 @@ +from api.app import create_app +from api.config import DevelopmentConfig, ProductionConfig + + +application = create_app( + config_object=ProductionConfig) + + +if __name__ == '__main__': + application.run() diff --git a/packages/ml_api/run.sh b/packages/ml_api/run.sh new file mode 100644 index 000000000..f579e6b1a --- /dev/null +++ b/packages/ml_api/run.sh @@ -0,0 +1,3 @@ +#!/usr/bin/env bash +export IS_DEBUG=${DEBUG:-false} +exec gunicorn --bind 0.0.0.0:5000 --access-logfile - --error-logfile - run:application \ No newline at end of file diff --git a/packages/ml_api/test_data_predictions.csv b/packages/ml_api/test_data_predictions.csv new file mode 100644 index 000000000..d1117a25b --- /dev/null +++ b/packages/ml_api/test_data_predictions.csv @@ -0,0 +1,501 @@ 
+,predictions,version +0,143988.30704997465,0.2.0 +1,116598.08159580332,0.2.0 +2,130128.90560814076,0.2.0 +3,113470.10675716968,0.2.0 +4,159022.48121448176,0.2.0 +5,139861.32732907546,0.2.0 +6,227118.89767805065,0.2.0 +7,91953.99400144782,0.2.0 +8,225573.26579772323,0.2.0 +9,125802.8602526304,0.2.0 +10,137481.49149643493,0.2.0 +11,124990.09839895074,0.2.0 +12,133270.15609091,0.2.0 +13,192143.4530280595,0.2.0 +14,123206.5594461486,0.2.0 +15,201801.77975634683,0.2.0 +16,198027.98470170778,0.2.0 +17,185664.94305866087,0.2.0 +18,146728.39264190392,0.2.0 +19,152443.1572738422,0.2.0 +20,197054.58979409203,0.2.0 +21,146781.9115319493,0.2.0 +22,138838.0050135225,0.2.0 +23,259997.45200360558,0.2.0 +24,220904.18524276977,0.2.0 +25,162760.6578114075,0.2.0 +26,81622.7760115488,0.2.0 +27,104671.50728326188,0.2.0 +28,129551.38264993431,0.2.0 +29,95446.01639989471,0.2.0 +30,129507.4444341237,0.2.0 +31,95477.93516568728,0.2.0 +32,129422.6043698834,0.2.0 +33,128062.38086640426,0.2.0 +34,123419.71922835958,0.2.0 +35,128318.94350485185,0.2.0 +36,207431.6698047325,0.2.0 +37,174685.92854135018,0.2.0 +38,204544.1513220886,0.2.0 +39,188046.15280301377,0.2.0 +40,182971.78532877663,0.2.0 +41,70097.27238622728,0.2.0 +42,110733.2059471847,0.2.0 +43,93994.92500037784,0.2.0 +44,252924.35745892464,0.2.0 +45,214641.99038515135,0.2.0 +46,154979.9669243978,0.2.0 +47,160810.80098181101,0.2.0 +48,230690.236786167,0.2.0 +49,196243.15614263792,0.2.0 +50,177792.5604951465,0.2.0 +51,150956.42632815256,0.2.0 +52,168211.15880784288,0.2.0 +53,158387.31855224012,0.2.0 +54,114339.5601018531,0.2.0 +55,90052.36198593948,0.2.0 +56,89964.45949954129,0.2.0 +57,98668.89304456668,0.2.0 +58,121518.86270978909,0.2.0 +59,134198.59781615838,0.2.0 +60,163434.02753944616,0.2.0 +61,135542.55508479764,0.2.0 +62,141825.43043982252,0.2.0 +63,227613.38755000453,0.2.0 +64,188761.60830094197,0.2.0 +65,116489.4563051063,0.2.0 +66,167327.47818717395,0.2.0 +67,183019.80781626955,0.2.0 +68,263704.159135985,0.2.0 
+69,194109.36377179576,0.2.0 +70,300262.7532032975,0.2.0 +71,223004.09657281314,0.2.0 +72,229985.38944263826,0.2.0 +73,184172.20037350367,0.2.0 +74,188222.84233142118,0.2.0 +75,188097.29339417908,0.2.0 +76,172331.10498565168,0.2.0 +77,174886.6907641111,0.2.0 +78,201441.14534017237,0.2.0 +79,178852.47480584026,0.2.0 +80,225286.87493988863,0.2.0 +81,186618.03844702366,0.2.0 +82,253907.81542043414,0.2.0 +83,240359.90484464006,0.2.0 +84,238601.0921535284,0.2.0 +85,177935.77765021168,0.2.0 +86,162057.79394455065,0.2.0 +87,163514.64562596226,0.2.0 +88,133002.50357947565,0.2.0 +89,126285.82757075419,0.2.0 +90,114122.89197558099,0.2.0 +91,118965.43322308766,0.2.0 +92,107820.17501469971,0.2.0 +93,107672.41260124673,0.2.0 +94,161142.56666974662,0.2.0 +95,155175.112064241,0.2.0 +96,159626.62056220102,0.2.0 +97,159289.85166702382,0.2.0 +98,164753.43823200595,0.2.0 +99,130441.66184067688,0.2.0 +100,150115.21843697876,0.2.0 +101,363780.0225506806,0.2.0 +102,330017.780544809,0.2.0 +103,331883.3191102819,0.2.0 +104,406837.5511403465,0.2.0 +105,292997.10969063273,0.2.0 +106,306609.27632288035,0.2.0 +107,329626.60615839734,0.2.0 +108,311532.52238578524,0.2.0 +109,302589.7805774104,0.2.0 +110,313113.53389941505,0.2.0 +111,255492.2795391536,0.2.0 +112,348040.2630000232,0.2.0 +113,286215.77612206567,0.2.0 +114,257811.3774942191,0.2.0 +115,219056.33504400466,0.2.0 +116,221072.9009001751,0.2.0 +117,227272.5447635412,0.2.0 +118,389000.9584031945,0.2.0 +119,333081.2372066048,0.2.0 +120,301748.2795090072,0.2.0 +121,268886.605541231,0.2.0 +122,292214.7783535345,0.2.0 +123,218893.10534405566,0.2.0 +124,198679.87790616706,0.2.0 +125,198256.12319179106,0.2.0 +126,203810.58008877232,0.2.0 +127,200888.22351579432,0.2.0 +128,208173.15639542375,0.2.0 +129,208236.64492513813,0.2.0 +130,204263.56750308358,0.2.0 +131,194016.82016564548,0.2.0 +132,247220.62121392722,0.2.0 +133,186454.85767170336,0.2.0 +134,183808.3284633914,0.2.0 +135,184105.97903285234,0.2.0 +136,239209.89605894414,0.2.0 
+137,184218.80235097196,0.2.0 +138,307821.6280329202,0.2.0 +139,309780.2215794851,0.2.0 +140,250051.75088695402,0.2.0 +141,264234.36472344183,0.2.0 +142,238517.39539507058,0.2.0 +143,253639.64599699862,0.2.0 +144,266777.25555390265,0.2.0 +145,249262.33173072065,0.2.0 +146,354687.6212203011,0.2.0 +147,211718.31772737036,0.2.0 +148,208112.29103266165,0.2.0 +149,269063.04990015837,0.2.0 +150,232554.7387626751,0.2.0 +151,267547.16223942576,0.2.0 +152,259496.4322217068,0.2.0 +153,254987.37388475015,0.2.0 +154,213297.22522688,0.2.0 +155,209521.4853124122,0.2.0 +156,168400.4848772304,0.2.0 +157,168269.52494463106,0.2.0 +158,138015.7063444789,0.2.0 +159,197692.7497359191,0.2.0 +160,210792.23068435694,0.2.0 +161,160895.21637656086,0.2.0 +162,129967.65699942572,0.2.0 +163,148887.7470968613,0.2.0 +164,189032.60710901304,0.2.0 +165,206354.3720483368,0.2.0 +166,170625.45360343822,0.2.0 +167,161155.2832590772,0.2.0 +168,177241.4857453312,0.2.0 +169,152617.9750132888,0.2.0 +170,164767.3082372813,0.2.0 +171,121689.0145099861,0.2.0 +172,114755.20351999925,0.2.0 +173,109385.54490451732,0.2.0 +174,115908.28531894127,0.2.0 +175,127297.15226141199,0.2.0 +176,111687.7144642378,0.2.0 +177,250341.40946203517,0.2.0 +178,231747.51470786144,0.2.0 +179,273940.75455758354,0.2.0 +180,223840.72800951728,0.2.0 +181,207683.72914446727,0.2.0 +182,185613.50839666792,0.2.0 +183,195932.25270587756,0.2.0 +184,248138.38057655803,0.2.0 +185,188290.29546011682,0.2.0 +186,210444.7210381098,0.2.0 +187,205928.18597414377,0.2.0 +188,210044.0320203481,0.2.0 +189,156787.38785618285,0.2.0 +190,149779.3462459088,0.2.0 +191,222254.2913941949,0.2.0 +192,117338.5782329264,0.2.0 +193,144956.37156722017,0.2.0 +194,190502.7599290919,0.2.0 +195,176058.9300745161,0.2.0 +196,113437.17520996452,0.2.0 +197,113005.87286210393,0.2.0 +198,148396.4974016323,0.2.0 +199,155111.51255427708,0.2.0 +200,160895.4088655705,0.2.0 +201,146811.64156366416,0.2.0 +202,161697.96498210484,0.2.0 +203,175408.29205737467,0.2.0 
+204,119486.7853118973,0.2.0 +205,155735.2535739763,0.2.0 +206,161732.25789945782,0.2.0 +207,186302.28474718594,0.2.0 +208,126314.40090076534,0.2.0 +209,161489.29160402366,0.2.0 +210,142192.79730554653,0.2.0 +211,125295.79760954925,0.2.0 +212,133726.54674477206,0.2.0 +213,131402.58297528428,0.2.0 +214,147256.8448434014,0.2.0 +215,130042.3601888925,0.2.0 +216,126109.99661525768,0.2.0 +217,104028.06280588396,0.2.0 +218,139015.86204044707,0.2.0 +219,123915.67823516048,0.2.0 +220,178112.6718654715,0.2.0 +221,125873.4394256058,0.2.0 +222,94911.69337443665,0.2.0 +223,137426.63537243495,0.2.0 +224,110144.45586689096,0.2.0 +225,119424.4928970573,0.2.0 +226,149432.93149379385,0.2.0 +227,163081.24792773716,0.2.0 +228,72754.84825273752,0.2.0 +229,107008.00619034276,0.2.0 +230,97026.69480171583,0.2.0 +231,176624.72236581342,0.2.0 +232,136815.75834336376,0.2.0 +233,136527.98103527437,0.2.0 +234,149254.9171475344,0.2.0 +235,127404.15185928933,0.2.0 +236,150150.4110071018,0.2.0 +237,122947.21890337647,0.2.0 +238,123038.56391694587,0.2.0 +239,106055.04206900226,0.2.0 +240,133737.62620695255,0.2.0 +241,127761.33500718801,0.2.0 +242,148651.3511288533,0.2.0 +243,150394.04939898496,0.2.0 +244,137871.15589031755,0.2.0 +245,137889.2545253325,0.2.0 +246,135021.79176355613,0.2.0 +247,132212.93368155853,0.2.0 +248,132394.6589172383,0.2.0 +249,116451.46796853734,0.2.0 +250,132045.77239979545,0.2.0 +251,93828.92317256187,0.2.0 +252,98304.79957463636,0.2.0 +253,116592.62783055207,0.2.0 +254,98723.66631722648,0.2.0 +255,70121.22021310769,0.2.0 +256,97709.23487001589,0.2.0 +257,117883.99993469544,0.2.0 +258,145026.28625503322,0.2.0 +259,153912.57618886943,0.2.0 +260,93381.08729006874,0.2.0 +261,123495.69496267234,0.2.0 +262,151217.31007381002,0.2.0 +263,70925.4220942242,0.2.0 +264,134164.7860642941,0.2.0 +265,137115.50773650245,0.2.0 +266,112454.46885682318,0.2.0 +267,113576.35603796394,0.2.0 +268,126311.04816994928,0.2.0 +269,130853.87341430226,0.2.0 +270,134365.47254085648,0.2.0 
+271,149331.816504544,0.2.0 +272,113846.4490674583,0.2.0 +273,127309.62370143532,0.2.0 +274,138936.11004121447,0.2.0 +275,126773.14110750334,0.2.0 +276,118674.20763474096,0.2.0 +277,94732.55765810968,0.2.0 +278,115042.27875631058,0.2.0 +279,97413.63757181565,0.2.0 +280,125103.21858739002,0.2.0 +281,127112.78156168538,0.2.0 +282,100712.28345775318,0.2.0 +283,123435.94852302536,0.2.0 +284,146777.37991798244,0.2.0 +285,141324.91303095603,0.2.0 +286,147015.62617541858,0.2.0 +287,182059.49685921244,0.2.0 +288,66635.70748853082,0.2.0 +289,113133.7345902136,0.2.0 +290,115399.86396709623,0.2.0 +291,142613.97712567318,0.2.0 +292,122675.88261778199,0.2.0 +293,128951.35723355877,0.2.0 +294,159633.68071362676,0.2.0 +295,163672.2859152473,0.2.0 +296,200101.77128067127,0.2.0 +297,166260.33914041193,0.2.0 +298,150329.84339014755,0.2.0 +299,140794.76572322496,0.2.0 +300,166102.833620058,0.2.0 +301,140183.19131161584,0.2.0 +302,257819.0508760762,0.2.0 +303,257819.0508760762,0.2.0 +304,257819.0508760762,0.2.0 +305,297489.40422482847,0.2.0 +306,288713.0465842733,0.2.0 +307,238840.80382128613,0.2.0 +308,264054.2118258276,0.2.0 +309,214038.27040784762,0.2.0 +310,216541.14163119273,0.2.0 +311,251482.14382697808,0.2.0 +312,201302.78506297944,0.2.0 +313,221418.6030263962,0.2.0 +314,143245.9627266626,0.2.0 +315,195099.27104358346,0.2.0 +316,194957.58888827328,0.2.0 +317,196553.0339968338,0.2.0 +318,209163.81006532238,0.2.0 +319,137593.75834543034,0.2.0 +320,139886.56269297737,0.2.0 +321,224462.0649769455,0.2.0 +322,249722.4606197197,0.2.0 +323,196221.2726508532,0.2.0 +324,200883.07978660773,0.2.0 +325,236876.5404898464,0.2.0 +326,265449.9719556491,0.2.0 +327,210031.52797804037,0.2.0 +328,250335.16327422266,0.2.0 +329,193702.5517580212,0.2.0 +330,113345.66683243777,0.2.0 +331,141908.87717126816,0.2.0 +332,98061.70102934526,0.2.0 +333,122961.05363435802,0.2.0 +334,117995.15041902235,0.2.0 +335,134068.9122846434,0.2.0 +336,122607.11339521343,0.2.0 +337,128632.12690453106,0.2.0 
+338,130665.06200115388,0.2.0 +339,181867.81868509538,0.2.0 +340,172320.99427457084,0.2.0 +341,163115.13448378997,0.2.0 +342,142692.95549842576,0.2.0 +343,204336.63049215134,0.2.0 +344,151865.2725254776,0.2.0 +345,187999.9387459913,0.2.0 +346,153898.50002741258,0.2.0 +347,201370.60175011388,0.2.0 +348,136260.79769104172,0.2.0 +349,167661.378830941,0.2.0 +350,151900.7260108396,0.2.0 +351,203200.5976776774,0.2.0 +352,275987.18626456213,0.2.0 +353,131731.26809609786,0.2.0 +354,72685.59185678526,0.2.0 +355,264769.3677760745,0.2.0 +356,223505.75506482823,0.2.0 +357,140373.47418071458,0.2.0 +358,165740.37720853413,0.2.0 +359,153501.3958318297,0.2.0 +360,333345.8132030645,0.2.0 +361,284907.13582157245,0.2.0 +362,235976.61331734635,0.2.0 +363,237331.86536503406,0.2.0 +364,222571.43251950064,0.2.0 +365,330547.42125199316,0.2.0 +366,126425.36283381855,0.2.0 +367,150931.15863895716,0.2.0 +368,116973.81860226691,0.2.0 +369,147483.17081444428,0.2.0 +370,137775.93779758728,0.2.0 +371,136213.6538169831,0.2.0 +372,160855.09129555486,0.2.0 +373,180999.95456004038,0.2.0 +374,177875.4323401108,0.2.0 +375,183722.0684301858,0.2.0 +376,183394.03709605164,0.2.0 +377,167171.69796713692,0.2.0 +378,253008.1582497637,0.2.0 +379,208356.18546752,0.2.0 +380,184067.27386951286,0.2.0 +381,184525.57241064525,0.2.0 +382,234914.10484877022,0.2.0 +383,319321.39732491894,0.2.0 +384,329258.81904322456,0.2.0 +385,171807.44667235087,0.2.0 +386,300439.8001753106,0.2.0 +387,168715.42175203658,0.2.0 +388,224083.29347340713,0.2.0 +389,169027.4893700393,0.2.0 +390,219986.76456349975,0.2.0 +391,206599.36694968113,0.2.0 +392,168431.21773772905,0.2.0 +393,198938.11718684685,0.2.0 +394,137044.70162504562,0.2.0 +395,256489.3797086342,0.2.0 +396,169081.6811380493,0.2.0 +397,246159.3182317069,0.2.0 +398,146517.01285907425,0.2.0 +399,115488.93084257792,0.2.0 +400,124226.28849234067,0.2.0 +401,105765.49539858926,0.2.0 +402,105734.63795160982,0.2.0 +403,109307.7618847266,0.2.0 +404,153399.47012489414,0.2.0 
+405,148098.79308079585,0.2.0 +406,256865.85340555105,0.2.0 +407,353705.2884855737,0.2.0 +408,339406.68729405693,0.2.0 +409,370934.7245862843,0.2.0 +410,412758.66452745936,0.2.0 +411,337318.9162127192,0.2.0 +412,292636.5292003634,0.2.0 +413,306738.89042618143,0.2.0 +414,395200.33469924616,0.2.0 +415,265420.90751885757,0.2.0 +416,304674.1881521481,0.2.0 +417,322466.11906014563,0.2.0 +418,309583.69640512683,0.2.0 +419,222251.71906371377,0.2.0 +420,305633.12114918296,0.2.0 +421,246068.43249597988,0.2.0 +422,237392.40028237563,0.2.0 +423,211279.01604200783,0.2.0 +424,228094.0196541859,0.2.0 +425,217362.23612708444,0.2.0 +426,212395.21391217507,0.2.0 +427,192157.327626266,0.2.0 +428,210131.93667451647,0.2.0 +429,218479.26431069477,0.2.0 +430,227732.65975321413,0.2.0 +431,207550.8611689138,0.2.0 +432,196406.28233478937,0.2.0 +433,215352.46117706495,0.2.0 +434,195390.69073167298,0.2.0 +435,268095.89486272854,0.2.0 +436,317322.5783410133,0.2.0 +437,292294.5209052129,0.2.0 +438,256214.48067033372,0.2.0 +439,289956.5518384693,0.2.0 +440,285699.6865787319,0.2.0 +441,238369.04431785582,0.2.0 +442,266162.84585317614,0.2.0 +443,276105.07384260837,0.2.0 +444,241944.78930174315,0.2.0 +445,212994.50831895912,0.2.0 +446,266502.50110652676,0.2.0 +447,203362.7111452237,0.2.0 +448,180227.73055119175,0.2.0 +449,188392.39553333411,0.2.0 +450,142481.50831170173,0.2.0 +451,174912.95802564104,0.2.0 +452,168060.24103720946,0.2.0 +453,170840.3065243665,0.2.0 +454,185335.0674102329,0.2.0 +455,175685.71835342573,0.2.0 +456,182131.57134249242,0.2.0 +457,127731.04705949678,0.2.0 +458,130944.89863769621,0.2.0 +459,105125.80701127343,0.2.0 +460,113673.41846707783,0.2.0 +461,171746.81645701104,0.2.0 +462,147544.47667904384,0.2.0 +463,266570.15210116236,0.2.0 +464,340483.4209594863,0.2.0 +465,193926.64894274823,0.2.0 +466,177273.1783748505,0.2.0 +467,188439.6899965548,0.2.0 +468,179646.3820244513,0.2.0 +469,277801.9107183519,0.2.0 +470,244750.34380769494,0.2.0 +471,264143.13027023565,0.2.0 
+472,264084.9900022445,0.2.0 +473,190623.30283373612,0.2.0 +474,218303.47626378198,0.2.0 +475,209178.35576652727,0.2.0 +476,210247.40015571192,0.2.0 +477,305489.9014144604,0.2.0 +478,206548.65094650167,0.2.0 +479,260901.671279582,0.2.0 +480,234130.08563281858,0.2.0 +481,215084.1602052955,0.2.0 +482,162068.0157257143,0.2.0 +483,175403.3655499554,0.2.0 +484,188329.78909449733,0.2.0 +485,148772.6745077038,0.2.0 +486,135234.48910921262,0.2.0 +487,132981.35850945665,0.2.0 +488,142443.15434220844,0.2.0 +489,172322.6219487221,0.2.0 +490,114015.40802504608,0.2.0 +491,131679.82317114327,0.2.0 +492,140830.26421534023,0.2.0 +493,96630.01740632812,0.2.0 +494,146497.76662391485,0.2.0 +495,161384.411998765,0.2.0 +496,122294.75296565886,0.2.0 +497,187349.35839738324,0.2.0 +498,139773.34125411394,0.2.0 +499,151158.00827612064,0.2.0 diff --git a/packages/ml_api/tests/__init__.py b/packages/ml_api/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/packages/ml_api/tests/capture_model_predictions.py b/packages/ml_api/tests/capture_model_predictions.py new file mode 100644 index 000000000..19a71142a --- /dev/null +++ b/packages/ml_api/tests/capture_model_predictions.py @@ -0,0 +1,35 @@ +""" +This script should only be run in CI. +Never run it locally or you will disrupt the +differential test versioning logic. 
+""" + +import pandas as pd + +from regression_model.predict import make_prediction +from regression_model.processing.data_management import load_dataset + +from api import config + + +def capture_predictions() -> None: + """Save the test data predictions to a CSV.""" + + save_file = 'test_data_predictions.csv' + test_data = load_dataset(file_name='test.csv') + + # we take a slice with no input validation issues + multiple_test_input = test_data[99:600] + + predictions = make_prediction(input_data=multiple_test_input) + + # save predictions for the test dataset + predictions_df = pd.DataFrame(predictions) + + # hack here to save the file to the regression model + # package of the repo, not the installed package + predictions_df.to_csv(f'{config.PACKAGE_ROOT}/{save_file}') + + +if __name__ == '__main__': + capture_predictions() diff --git a/packages/ml_api/tests/conftest.py b/packages/ml_api/tests/conftest.py new file mode 100644 index 000000000..3134a9b4d --- /dev/null +++ b/packages/ml_api/tests/conftest.py @@ -0,0 +1,18 @@ +import pytest + +from api.app import create_app +from api.config import TestingConfig + + +@pytest.fixture +def app(): + app = create_app(config_object=TestingConfig) + + with app.app_context(): + yield app + + +@pytest.fixture +def flask_test_client(app): + with app.test_client() as test_client: + yield test_client diff --git a/packages/ml_api/tests/differential_tests/__init__.py b/packages/ml_api/tests/differential_tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/packages/ml_api/tests/differential_tests/test_differential.py b/packages/ml_api/tests/differential_tests/test_differential.py new file mode 100644 index 000000000..acabf724d --- /dev/null +++ b/packages/ml_api/tests/differential_tests/test_differential.py @@ -0,0 +1,53 @@ +import math + +from regression_model.config import config as model_config +from regression_model.predict import make_prediction +from regression_model.processing.data_management 
import load_dataset +import pandas as pd +import pytest + + +from api import config + + +@pytest.mark.differential +def test_model_prediction_differential( + *, + save_file: str = 'test_data_predictions.csv'): + """ + This test compares the prediction result similarity of + the current model with the previous model's results. + """ + + # Given + # Load the saved previous model predictions + previous_model_df = pd.read_csv(f'{config.PACKAGE_ROOT}/{save_file}') + previous_model_predictions = previous_model_df.predictions.values + + test_data = load_dataset(file_name=model_config.TESTING_DATA_FILE) + multiple_test_input = test_data[99:600] + + # When + current_result = make_prediction(input_data=multiple_test_input) + current_model_predictions = current_result.get('predictions') + + # Then + # diff the current model vs. the old model + assert len(previous_model_predictions) == len( + current_model_predictions) + + # Perform the differential test + for previous_value, current_value in zip( + previous_model_predictions, current_model_predictions): + + # convert numpy float64 to Python float. + previous_value = previous_value.item() + current_value = current_value.item() + + # rel_tol is the relative tolerance – it is the maximum allowed + # difference between a and b, relative to the larger absolute + # value of a or b. For example, to set a tolerance of 5%, pass + # rel_tol=0.05. 
+ assert math.isclose(previous_value, + current_value, + rel_tol=model_config.ACCEPTABLE_MODEL_DIFFERENCE) diff --git a/packages/ml_api/tests/test_controller.py b/packages/ml_api/tests/test_controller.py new file mode 100644 index 000000000..e45179b14 --- /dev/null +++ b/packages/ml_api/tests/test_controller.py @@ -0,0 +1,79 @@ +import io +import json +import math +import os + +from neural_network_model.config import config as ccn_config +from regression_model import __version__ as _version +from regression_model.config import config as model_config +from regression_model.processing.data_management import load_dataset + +from api import __version__ as api_version + + +def test_health_endpoint_returns_200(flask_test_client): + # When + response = flask_test_client.get('/health') + + # Then + assert response.status_code == 200 + + +def test_version_endpoint_returns_version(flask_test_client): + # When + response = flask_test_client.get('/version') + + # Then + assert response.status_code == 200 + response_json = json.loads(response.data) + assert response_json['model_version'] == _version + assert response_json['api_version'] == api_version + + +def test_prediction_endpoint_returns_prediction(flask_test_client): + # Given + # Load the test data from the regression_model package + # This is important as it makes it harder for the test + # data versions to get confused by not spreading it + # across packages. 
+ test_data = load_dataset(file_name=model_config.TESTING_DATA_FILE) + post_json = test_data[0:1].to_json(orient='records') + + # When + response = flask_test_client.post('/v1/predict/regression', + json=json.loads(post_json)) + + # Then + assert response.status_code == 200 + response_json = json.loads(response.data) + prediction = response_json['predictions'] + response_version = response_json['version'] + assert math.ceil(prediction[0]) == 112476 + assert response_version == _version + + +def test_classifier_endpoint_returns_prediction(flask_test_client): + # Given + # Load the test data from the neural_network_model package + # This is important as it makes it harder for the test + # data versions to get confused by not spreading it + # across packages. + data_dir = os.path.abspath(os.path.join(ccn_config.DATA_FOLDER, os.pardir)) + test_dir = os.path.join(data_dir, 'test_data') + black_grass_dir = os.path.join(test_dir, 'Black-grass') + black_grass_image = os.path.join(black_grass_dir, '1.png') + with open(black_grass_image, "rb") as image_file: + file_bytes = image_file.read() + data = dict( + file=(io.BytesIO(bytearray(file_bytes)), "1.png"), + ) + + # When + response = flask_test_client.post('/predict/classifier', + content_type='multipart/form-data', + data=data) + + # Then + assert response.status_code == 200 + response_json = json.loads(response.data) + assert response_json['readable_predictions'] diff --git a/packages/ml_api/tests/test_validation.py b/packages/ml_api/tests/test_validation.py new file mode 100644 index 000000000..d34d86c72 --- /dev/null +++ b/packages/ml_api/tests/test_validation.py @@ -0,0 +1,26 @@ +import json + +from regression_model.config import config +from regression_model.processing.data_management import load_dataset + + +def test_prediction_endpoint_validation_200(flask_test_client): + # Given + # Load the test data from the regression_model package. 
+ # This is important as it makes it harder for the test + # data versions to get confused by not spreading it + # across packages. + test_data = load_dataset(file_name=config.TESTING_DATA_FILE) + post_json = test_data.to_json(orient='records') + + # When + response = flask_test_client.post('/v1/predict/regression', + json=json.loads(post_json)) + + # Then + assert response.status_code == 200 + response_json = json.loads(response.data) + + # Check correct number of errors removed + assert len(response_json.get('predictions')) + len( + response_json.get('errors')) == len(test_data) diff --git a/packages/ml_api/tox.ini b/packages/ml_api/tox.ini new file mode 100644 index 000000000..50e82033a --- /dev/null +++ b/packages/ml_api/tox.ini @@ -0,0 +1,32 @@ +[tox] +envlist = py36, py37, py38 +skipsdist = True + + +[testenv] +install_command = pip install --pre {opts} {packages} +deps = + -rrequirements.txt + +passenv = + PIP_EXTRA_INDEX_URL + KERAS_BACKEND + +setenv = + PYTHONPATH=. + +commands = + pytest \ + -s \ + -v \ + -m "not differential" \ + {posargs:tests} + + +# content of pytest.ini +[pytest] +markers = + integration: mark a test as an integration test. + differential: mark a test as a differential test. 
+filterwarnings = + ignore::DeprecationWarning \ No newline at end of file diff --git a/packages/neural_network_model/MANIFEST.in b/packages/neural_network_model/MANIFEST.in new file mode 100644 index 000000000..f9aca5b03 --- /dev/null +++ b/packages/neural_network_model/MANIFEST.in @@ -0,0 +1,17 @@ +include *.txt +include *.md +include *.cfg +include *.pkl +recursive-include ./neural_network_model/*.py + +include neural_network_model/trained_models/*.pkl +include neural_network_model/trained_models/*.h5 +include neural_network_model/VERSION +include neural_network_model/datasets/test_data/Black-grass/1.png +include neural_network_model/datasets/test_data/Charlock/1.png + +include ./requirements.txt +exclude *.log + +recursive-exclude * __pycache__ +recursive-exclude * *.py[co] \ No newline at end of file diff --git a/packages/neural_network_model/config.yml b/packages/neural_network_model/config.yml new file mode 100644 index 000000000..a939e5708 --- /dev/null +++ b/packages/neural_network_model/config.yml @@ -0,0 +1,4 @@ +MODEL_NAME: ${MODEL_NAME:cnn_model} +PIPELINE_NAME: ${PIPELINE_NAME:cnn_pipe} +CLASSES_PATH: ${CLASSES_PATH:False} +IMAGE_SIZE: ${IMAGE_SIZE:150} diff --git a/packages/neural_network_model/neural_network_model/VERSION b/packages/neural_network_model/neural_network_model/VERSION new file mode 100644 index 000000000..6c6aa7cb0 --- /dev/null +++ b/packages/neural_network_model/neural_network_model/VERSION @@ -0,0 +1 @@ +0.1.0 \ No newline at end of file diff --git a/packages/neural_network_model/neural_network_model/__init__.py b/packages/neural_network_model/neural_network_model/__init__.py new file mode 100644 index 000000000..b6c968d56 --- /dev/null +++ b/packages/neural_network_model/neural_network_model/__init__.py @@ -0,0 +1,7 @@ +import os + +from neural_network_model.config import config + + +with open(os.path.join(config.PACKAGE_ROOT, 'VERSION')) as version_file: + __version__ = version_file.read().strip() diff --git
a/packages/neural_network_model/neural_network_model/config/__init__.py b/packages/neural_network_model/neural_network_model/config/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/packages/neural_network_model/neural_network_model/config/config.py b/packages/neural_network_model/neural_network_model/config/config.py new file mode 100644 index 000000000..4d8b173d7 --- /dev/null +++ b/packages/neural_network_model/neural_network_model/config/config.py @@ -0,0 +1,38 @@ +# The Keras model loading function does not play well with +# Pathlib at the moment, so we are using the old os module +# style + +import os + +PWD = os.path.dirname(os.path.abspath(__file__)) +PACKAGE_ROOT = os.path.abspath(os.path.join(PWD, '..')) +DATASET_DIR = os.path.join(PACKAGE_ROOT, 'datasets') +TRAINED_MODEL_DIR = os.path.join(PACKAGE_ROOT, 'trained_models') +DATA_FOLDER = os.path.join(DATASET_DIR, 'v2-plant-seedlings-dataset') + +# MODEL PERSISTING +MODEL_NAME = 'cnn_model' +PIPELINE_NAME = 'cnn_pipe' +CLASSES_NAME = 'classes' +ENCODER_NAME = 'encoder' + +# MODEL FITTING +IMAGE_SIZE = 150 # 50 for testing, 150 for final model +BATCH_SIZE = 10 +EPOCHS = int(os.environ.get('EPOCHS', 1)) # 1 for testing, 10 for final model + + +with open(os.path.join(PACKAGE_ROOT, 'VERSION')) as version_file: + _version = version_file.read().strip() + +MODEL_FILE_NAME = f'{MODEL_NAME}_{_version}.h5' +MODEL_PATH = os.path.join(TRAINED_MODEL_DIR, MODEL_FILE_NAME) + +PIPELINE_FILE_NAME = f'{PIPELINE_NAME}_{_version}.pkl' +PIPELINE_PATH = os.path.join(TRAINED_MODEL_DIR, PIPELINE_FILE_NAME) + +CLASSES_FILE_NAME = f'{CLASSES_NAME}_{_version}.pkl' +CLASSES_PATH = os.path.join(TRAINED_MODEL_DIR, CLASSES_FILE_NAME) + +ENCODER_FILE_NAME = f'{ENCODER_NAME}_{_version}.pkl' +ENCODER_PATH = os.path.join(TRAINED_MODEL_DIR, ENCODER_FILE_NAME) diff --git a/packages/neural_network_model/neural_network_model/datasets/__init__.py b/packages/neural_network_model/neural_network_model/datasets/__init__.py new 
file mode 100644 index 000000000..e69de29bb diff --git a/packages/neural_network_model/neural_network_model/datasets/test_data/Black-grass/1.png b/packages/neural_network_model/neural_network_model/datasets/test_data/Black-grass/1.png new file mode 100644 index 000000000..c4a76e407 Binary files /dev/null and b/packages/neural_network_model/neural_network_model/datasets/test_data/Black-grass/1.png differ diff --git a/packages/neural_network_model/neural_network_model/datasets/test_data/Charlock/1.png b/packages/neural_network_model/neural_network_model/datasets/test_data/Charlock/1.png new file mode 100644 index 000000000..6c8db80d6 Binary files /dev/null and b/packages/neural_network_model/neural_network_model/datasets/test_data/Charlock/1.png differ diff --git a/packages/neural_network_model/neural_network_model/datasets/test_data/__init__.py b/packages/neural_network_model/neural_network_model/datasets/test_data/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/packages/neural_network_model/neural_network_model/model.py b/packages/neural_network_model/neural_network_model/model.py new file mode 100644 index 000000000..ef31d213b --- /dev/null +++ b/packages/neural_network_model/neural_network_model/model.py @@ -0,0 +1,79 @@ +# for the convolutional network +from keras.models import Sequential +from keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Flatten +from keras.optimizers import Adam +from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint +from keras.wrappers.scikit_learn import KerasClassifier + +from neural_network_model.config import config + + +def cnn_model(kernel_size=(3, 3), + pool_size=(2, 2), + first_filters=32, + second_filters=64, + third_filters=128, + dropout_conv=0.3, + dropout_dense=0.3, + image_size=50): + + model = Sequential() + model.add(Conv2D( + first_filters, + kernel_size, + activation='relu', + input_shape=(image_size, image_size, 3))) + model.add(Conv2D(first_filters, kernel_size, activation = 
'relu')) + model.add(MaxPooling2D(pool_size=pool_size)) + model.add(Dropout(dropout_conv)) + + model.add(Conv2D(second_filters, kernel_size, activation='relu')) + model.add(Conv2D(second_filters, kernel_size, activation ='relu')) + model.add(MaxPooling2D(pool_size=pool_size)) + model.add(Dropout(dropout_conv)) + + model.add(Conv2D(third_filters, kernel_size, activation='relu')) + model.add(Conv2D(third_filters, kernel_size, activation ='relu')) + model.add(MaxPooling2D(pool_size=pool_size)) + model.add(Dropout(dropout_conv)) + + model.add(Flatten()) + model.add(Dense(256, activation="relu")) + model.add(Dropout(dropout_dense)) + model.add(Dense(12, activation="softmax")) + + model.compile(Adam(lr=0.0001), + loss='binary_crossentropy', + metrics=['accuracy']) + + return model + + +checkpoint = ModelCheckpoint(config.MODEL_PATH, + monitor='acc', + verbose=1, + save_best_only=True, + mode='max') + +reduce_lr = ReduceLROnPlateau(monitor='acc', + factor=0.5, + patience=2, + verbose=1, + mode='max', + min_lr=0.00001) + +callbacks_list = [checkpoint, reduce_lr] + +cnn_clf = KerasClassifier(build_fn=cnn_model, + batch_size=config.BATCH_SIZE, + validation_split=10, + epochs=config.EPOCHS, + verbose=1, # progress bar - required for CI job + callbacks=callbacks_list, + image_size=config.IMAGE_SIZE + ) + + +if __name__ == '__main__': + model = cnn_model() + model.summary() diff --git a/packages/neural_network_model/neural_network_model/pipeline.py b/packages/neural_network_model/neural_network_model/pipeline.py new file mode 100644 index 000000000..d8f68a6cc --- /dev/null +++ b/packages/neural_network_model/neural_network_model/pipeline.py @@ -0,0 +1,10 @@ +from sklearn.pipeline import Pipeline + +from neural_network_model.config import config +from neural_network_model.processing import preprocessors as pp +from neural_network_model import model + + +pipe = Pipeline([ + ('dataset', pp.CreateDataset(config.IMAGE_SIZE)), + ('cnn_model', model.cnn_clf)]) diff --git 
a/packages/neural_network_model/neural_network_model/predict.py b/packages/neural_network_model/neural_network_model/predict.py new file mode 100644 index 000000000..56869268c --- /dev/null +++ b/packages/neural_network_model/neural_network_model/predict.py @@ -0,0 +1,67 @@ +import logging + +import pandas as pd + +from neural_network_model import __version__ as _version +from neural_network_model.processing import data_management as dm + +_logger = logging.getLogger(__name__) +KERAS_PIPELINE = dm.load_pipeline_keras() +ENCODER = dm.load_encoder() + + +def make_single_prediction(*, image_name: str, image_directory: str): + """Make a single prediction using the saved model pipeline. + + Args: + image_name: Filename of the image to classify + image_directory: Location of the image to classify + + Returns: + Dictionary with both raw predictions and readable values. + """ + + image_df = dm.load_single_image( + data_folder=image_directory, + filename=image_name) + + prepared_df = image_df['image'].reset_index(drop=True) + _logger.info(f'received input array: {prepared_df}, ' + f'filename: {image_name}') + + predictions = KERAS_PIPELINE.predict(prepared_df) + readable_predictions = ENCODER.encoder.inverse_transform(predictions) + + _logger.info(f'Made prediction: {predictions}' + f' with model version: {_version}') + + return dict(predictions=predictions, + readable_predictions=readable_predictions, + version=_version) + + +def make_bulk_prediction(*, images_df: pd.Series) -> dict: + """Make multiple predictions using the saved model pipeline. + + Currently, this function is primarily for testing purposes, + allowing us to pass in a directory of images for running + bulk predictions. + + Args: + images_df: Pandas series of images + + Returns: + Dictionary with both raw predictions and their classifications.
+ """ + + _logger.info(f'received input df: {images_df}') + + predictions = KERAS_PIPELINE.predict(images_df) + readable_predictions = ENCODER.encoder.inverse_transform(predictions) + + _logger.info(f'Made predictions: {predictions}' + f' with model version: {_version}') + + return dict(predictions=predictions, + readable_predictions=readable_predictions, + version=_version) diff --git a/packages/neural_network_model/neural_network_model/processing/__init__.py b/packages/neural_network_model/neural_network_model/processing/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/packages/neural_network_model/neural_network_model/processing/data_management.py b/packages/neural_network_model/neural_network_model/processing/data_management.py new file mode 100644 index 000000000..675362ca0 --- /dev/null +++ b/packages/neural_network_model/neural_network_model/processing/data_management.py @@ -0,0 +1,130 @@ +import logging +import os +import typing as t +from glob import glob +from pathlib import Path + +import pandas as pd +from keras.models import load_model +from keras.wrappers.scikit_learn import KerasClassifier +from sklearn.externals import joblib +from sklearn.model_selection import train_test_split +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import LabelEncoder + +from neural_network_model import model as m +from neural_network_model.config import config + +_logger = logging.getLogger(__name__) + + +def load_single_image(data_folder: str, filename: str) -> pd.DataFrame: + """Makes dataframe with image path and target.""" + + image_df = [] + + # search for specific image in directory + for image_path in glob(os.path.join(data_folder, f'{filename}')): + tmp = pd.DataFrame([image_path, 'unknown']).T + image_df.append(tmp) + + # concatenate the final df + images_df = pd.concat(image_df, axis=0, ignore_index=True) + images_df.columns = ['image', 'target'] + + return images_df + + +def load_image_paths(data_folder: str) -> 
pd.DataFrame: + """Makes dataframe with image path and target.""" + + images_df = [] + + # navigate within each folder + for class_folder_name in os.listdir(data_folder): + class_folder_path = os.path.join(data_folder, class_folder_name) + + # collect every image path + for image_path in glob(os.path.join(class_folder_path, "*.png")): + tmp = pd.DataFrame([image_path, class_folder_name]).T + images_df.append(tmp) + + # concatenate the final df + images_df = pd.concat(images_df, axis=0, ignore_index=True) + images_df.columns = ['image', 'target'] + + return images_df + + +def get_train_test_target(df: pd.DataFrame): + """Split a dataset into train and test segments.""" + + X_train, X_test, y_train, y_test = train_test_split(df['image'], + df['target'], + test_size=0.20, + random_state=101) + + X_train.reset_index(drop=True, inplace=True) + X_test.reset_index(drop=True, inplace=True) + + y_train.reset_index(drop=True, inplace=True) + y_test.reset_index(drop=True, inplace=True) + + return X_train, X_test, y_train, y_test + + +def save_pipeline_keras(model) -> None: + """Persist keras model to disk.""" + + joblib.dump(model.named_steps['dataset'], config.PIPELINE_PATH) + joblib.dump(model.named_steps['cnn_model'].classes_, config.CLASSES_PATH) + model.named_steps['cnn_model'].model.save(str(config.MODEL_PATH)) + + remove_old_pipelines( + files_to_keep=[config.MODEL_FILE_NAME, config.ENCODER_FILE_NAME, + config.PIPELINE_FILE_NAME, config.CLASSES_FILE_NAME]) + + +def load_pipeline_keras() -> Pipeline: + """Load a Keras Pipeline from disk.""" + + dataset = joblib.load(config.PIPELINE_PATH) + + build_model = lambda: load_model(config.MODEL_PATH) + + classifier = KerasClassifier(build_fn=build_model, + batch_size=config.BATCH_SIZE, + validation_split=10, + epochs=config.EPOCHS, + verbose=2, + callbacks=m.callbacks_list, + # image_size = config.IMAGE_SIZE + ) + + classifier.classes_ = joblib.load(config.CLASSES_PATH) + classifier.model = build_model() + + return Pipeline([ + 
('dataset', dataset), + ('cnn_model', classifier) + ]) + + +def load_encoder() -> LabelEncoder: + encoder = joblib.load(config.ENCODER_PATH) + + return encoder + + +def remove_old_pipelines(*, files_to_keep: t.List[str]) -> None: + """ + Remove old model pipelines, models, encoders and classes. + + This is to ensure there is a simple one-to-one + mapping between the package version and the model + version to be imported and used by other applications. + """ + do_not_delete = files_to_keep + ['__init__.py'] + for model_file in Path(config.TRAINED_MODEL_DIR).iterdir(): + if model_file.name not in do_not_delete: + model_file.unlink() diff --git a/packages/neural_network_model/neural_network_model/processing/errors.py b/packages/neural_network_model/neural_network_model/processing/errors.py new file mode 100644 index 000000000..b92425437 --- /dev/null +++ b/packages/neural_network_model/neural_network_model/processing/errors.py @@ -0,0 +1,6 @@ +class BaseError(Exception): + """Base package error.""" + + +class InvalidModelInputError(BaseError): + """Model input contains an error.""" diff --git a/packages/neural_network_model/neural_network_model/processing/preprocessors.py b/packages/neural_network_model/neural_network_model/processing/preprocessors.py new file mode 100644 index 000000000..37f813c19 --- /dev/null +++ b/packages/neural_network_model/neural_network_model/processing/preprocessors.py @@ -0,0 +1,50 @@ +import numpy as np +import cv2 +from keras.utils import np_utils +from sklearn.preprocessing import LabelEncoder +from sklearn.base import BaseEstimator, TransformerMixin + + +class TargetEncoder(BaseEstimator, TransformerMixin): + + def __init__(self, encoder=LabelEncoder()): + self.encoder = encoder + + def fit(self, X, y=None): + # note that x is the target in this case + self.encoder.fit(X) + return self + + def transform(self, X): + X = X.copy() + X = np_utils.to_categorical(self.encoder.transform(X)) + return X + + +def _im_resize(df, n, image_size): + 
im = cv2.imread(df[n]) + im = cv2.resize(im, (image_size, image_size)) + return im + + +class CreateDataset(BaseEstimator, TransformerMixin): + + def __init__(self, image_size=50): + self.image_size = image_size + + def fit(self, X, y=None): + return self + + def transform(self, X): + X = X.copy() + tmp = np.zeros((len(X), + self.image_size, + self.image_size, 3), dtype='float32') + + for n in range(0, len(X)): + im = _im_resize(X, n, self.image_size) + tmp[n] = im + + print('Dataset Images shape: {} size: {:,}'.format( + tmp.shape, tmp.size)) + return tmp diff --git a/packages/neural_network_model/neural_network_model/train_pipeline.py b/packages/neural_network_model/neural_network_model/train_pipeline.py new file mode 100644 index 000000000..13110b145 --- /dev/null +++ b/packages/neural_network_model/neural_network_model/train_pipeline.py @@ -0,0 +1,27 @@ +from sklearn.externals import joblib + +from neural_network_model import pipeline as pipe +from neural_network_model.config import config +from neural_network_model.processing import data_management as dm +from neural_network_model.processing import preprocessors as pp + + +def run_training(save_result: bool = True): + """Train a Convolutional Neural Network.""" + + images_df = dm.load_image_paths(config.DATA_FOLDER) + X_train, X_test, y_train, y_test = dm.get_train_test_target(images_df) + + enc = pp.TargetEncoder() + enc.fit(y_train) + y_train = enc.transform(y_train) + + pipe.pipe.fit(X_train, y_train) + + if save_result: + joblib.dump(enc, config.ENCODER_PATH) + dm.save_pipeline_keras(pipe.pipe) + + +if __name__ == '__main__': + run_training(save_result=True) diff --git a/packages/neural_network_model/neural_network_model/trained_models/__init__.py b/packages/neural_network_model/neural_network_model/trained_models/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/packages/neural_network_model/requirements.txt b/packages/neural_network_model/requirements.txt new file mode 100644 index 
000000000..ffac1feac --- /dev/null +++ b/packages/neural_network_model/requirements.txt @@ -0,0 +1,18 @@ +# production requirements +pandas==0.23.4 +numpy==1.13.3 +scikit-learn==0.19.0 +Keras==2.1.3 +opencv-python==4.0.0.21 +h5py==2.9.0 +Theano==0.9.0 + +# packaging +setuptools==40.6.3 +wheel==0.32.3 + +# testing requirements +pytest==4.0.2 + +# fetching datasets +kaggle==1.5.1.1 \ No newline at end of file diff --git a/packages/neural_network_model/setup.py b/packages/neural_network_model/setup.py new file mode 100644 index 000000000..dd4e4d6a6 --- /dev/null +++ b/packages/neural_network_model/setup.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import io +import os +from pathlib import Path + +from setuptools import find_packages, setup + + +# Package meta-data. +NAME = 'neural_network_model' +DESCRIPTION = 'Train and deploy neural network model.' +URL = 'your github project' +EMAIL = 'your_email@email.com' +AUTHOR = 'Your name' +REQUIRES_PYTHON = '>=3.6.0' + + +# What packages are required for this module to be executed? +def list_reqs(fname='requirements.txt'): + with open(fname) as fd: + return fd.read().splitlines() + + +# The rest you shouldn't have to touch too much :) +# ------------------------------------------------ +# Except, perhaps the License and Trove Classifiers! +# If you do change the License, remember to change the +# Trove Classifier for that! + +here = os.path.abspath(os.path.dirname(__file__)) + +# Import the README and use it as the long-description. +# Note: this will only work if 'README.md' is present in your MANIFEST.in file! +try: + with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f: + long_description = '\n' + f.read() +except FileNotFoundError: + long_description = DESCRIPTION + + +# Load the package's __version__.py module as a dictionary. 
+ROOT_DIR = Path(__file__).resolve().parent +PACKAGE_DIR = ROOT_DIR / NAME +about = {} +with open(PACKAGE_DIR / 'VERSION') as f: + _version = f.read().strip() + about['__version__'] = _version + + +# Where the magic happens: +setup( + name=NAME, + version=about['__version__'], + description=DESCRIPTION, + long_description=long_description, + long_description_content_type='text/markdown', + author=AUTHOR, + author_email=EMAIL, + python_requires=REQUIRES_PYTHON, + url=URL, + packages=find_packages(exclude=('tests',)), + package_data={'neural_network_model': ['VERSION']}, + install_requires=list_reqs(), + extras_require={}, + include_package_data=True, + license='MIT', + classifiers=[ + # Trove classifiers + # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers + 'License :: OSI Approved :: MIT License', + 'Programming Language :: Python', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: Implementation :: CPython', + 'Programming Language :: Python :: Implementation :: PyPy' + ], +) diff --git a/packages/neural_network_model/tests/__init__.py b/packages/neural_network_model/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/packages/neural_network_model/tests/conftest.py b/packages/neural_network_model/tests/conftest.py new file mode 100644 index 000000000..90aa8aa79 --- /dev/null +++ b/packages/neural_network_model/tests/conftest.py @@ -0,0 +1,20 @@ +import pytest +import os + +from neural_network_model.config import config + + +@pytest.fixture +def black_grass_dir(): + test_data_dir = os.path.join(config.DATASET_DIR, 'test_data') + black_grass_dir = os.path.join(test_data_dir, 'Black-grass') + + return black_grass_dir + + +@pytest.fixture +def charlock_dir(): + test_data_dir = os.path.join(config.DATASET_DIR, 'test_data') + charlock_dir = os.path.join(test_data_dir, 'Charlock') + + return charlock_dir diff --git 
a/packages/neural_network_model/tests/test_predict.py b/packages/neural_network_model/tests/test_predict.py new file mode 100644 index 000000000..020fba5ab --- /dev/null +++ b/packages/neural_network_model/tests/test_predict.py @@ -0,0 +1,17 @@ +from neural_network_model import __version__ as _version +from neural_network_model.predict import (make_single_prediction) + + +def test_make_prediction_on_sample(charlock_dir): + # Given + filename = '1.png' + expected_classification = 'Charlock' + + # When + results = make_single_prediction(image_directory=charlock_dir, + image_name=filename) + + # Then + assert results['predictions'] is not None + assert results['readable_predictions'][0] == expected_classification + assert results['version'] == _version diff --git a/packages/regression_model/regression_model/VERSION b/packages/regression_model/regression_model/VERSION index 6c6aa7cb0..a30e84ffa 100644 --- a/packages/regression_model/regression_model/VERSION +++ b/packages/regression_model/regression_model/VERSION @@ -1 +1 @@ -0.1.0 \ No newline at end of file +2.0.20 \ No newline at end of file diff --git a/packages/regression_model/regression_model/predict.py b/packages/regression_model/regression_model/predict.py index e28ed8a96..7e4ed3d67 100644 --- a/packages/regression_model/regression_model/predict.py +++ b/packages/regression_model/regression_model/predict.py @@ -7,6 +7,7 @@ from regression_model import __version__ as _version import logging +import typing as t _logger = logging.getLogger(__name__) @@ -15,12 +16,22 @@ _price_pipe = load_pipeline(file_name=pipeline_file_name) -def make_prediction(*, input_data) -> dict: - """Make a prediction using the saved model pipeline.""" +def make_prediction(*, input_data: t.Union[pd.DataFrame, dict], + ) -> dict: + """Make a prediction using a saved model pipeline. - data = pd.read_json(input_data) + Args: + input_data: Array of model prediction inputs. 
+ + Returns: + Predictions for each input row, as well as the model version. + """ + + data = pd.DataFrame(input_data) validated_data = validate_inputs(input_data=data) + prediction = _price_pipe.predict(validated_data[config.FEATURES]) + output = np.exp(prediction) results = {"predictions": output, "version": _version} diff --git a/packages/regression_model/regression_model/processing/data_management.py b/packages/regression_model/regression_model/processing/data_management.py index 388412a7d..0357e1219 100644 --- a/packages/regression_model/regression_model/processing/data_management.py +++ b/packages/regression_model/regression_model/processing/data_management.py @@ -6,6 +6,7 @@ from regression_model import __version__ as _version import logging +import typing as t _logger = logging.getLogger(__name__) @@ -28,7 +29,7 @@ def save_pipeline(*, pipeline_to_persist) -> None: save_file_name = f"{config.PIPELINE_SAVE_FILE}{_version}.pkl" save_path = config.TRAINED_MODEL_DIR / save_file_name - remove_old_pipelines(files_to_keep=save_file_name) + remove_old_pipelines(files_to_keep=[save_file_name]) joblib.dump(pipeline_to_persist, save_path) _logger.info(f"saved pipeline: {save_file_name}") @@ -41,14 +42,17 @@ def load_pipeline(*, file_name: str) -> Pipeline: return trained_model -def remove_old_pipelines(*, files_to_keep) -> None: +def remove_old_pipelines(*, files_to_keep: t.List[str]) -> None: """ Remove old model pipelines. + This is to ensure there is a simple one-to-one mapping between the package version and the model version to be imported and used by other applications. + However, we do also include the immediate previous + pipeline version for differential testing purposes. 
""" - + do_not_delete = files_to_keep + ['__init__.py'] for model_file in config.TRAINED_MODEL_DIR.iterdir(): - if model_file.name not in [files_to_keep, "__init__.py"]: + if model_file.name not in do_not_delete: model_file.unlink() diff --git a/packages/regression_model/requirements.txt b/packages/regression_model/requirements.txt index 0f6f283d5..919b75917 100644 --- a/packages/regression_model/requirements.txt +++ b/packages/regression_model/requirements.txt @@ -14,3 +14,6 @@ pytest>=5.3.2,<6.0.0 # packaging setuptools>=41.4.0,<42.0.0 wheel>=0.33.6,<0.34.0 + +# fetching datasets +kaggle>=1.5.6,<1.6.0 diff --git a/packages/regression_model/setup.py b/packages/regression_model/setup.py index 5200fe1e5..264c47805 100644 --- a/packages/regression_model/setup.py +++ b/packages/regression_model/setup.py @@ -10,14 +10,14 @@ # Package meta-data. NAME = 'regression_model' -DESCRIPTION = 'Train and deploy regression model.' -URL = 'your github project' -EMAIL = 'your_email@email.com' -AUTHOR = 'Your name' +DESCRIPTION = 'Regression model for using in the Train In Data online course "Deployment of Machine Learning Models".' +URL = 'https://github.com/trainindata/deploying-machine-learning-models' +EMAIL = 'christopher.samiullah@protonmail.com' +AUTHOR = 'ChristopherGS' REQUIRES_PYTHON = '>=3.6.0' -# What packages are required for this module to be executed? +# Packages that are required for this module to be executed def list_reqs(fname='requirements.txt'): with open(fname) as fd: return fd.read().splitlines() @@ -42,7 +42,7 @@ def list_reqs(fname='requirements.txt'): # Load the package's __version__.py module as a dictionary. 
ROOT_DIR = Path(__file__).resolve().parent -PACKAGE_DIR = ROOT_DIR / NAME +PACKAGE_DIR = ROOT_DIR / 'regression_model' about = {} with open(PACKAGE_DIR / 'VERSION') as f: _version = f.read().strip() @@ -65,7 +65,7 @@ def list_reqs(fname='requirements.txt'): install_requires=list_reqs(), extras_require={}, include_package_data=True, - license='MIT', + license='BSD 3', classifiers=[ # Trove classifiers # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers diff --git a/packages/regression_model/tests/test_predict.py b/packages/regression_model/tests/test_predict.py index 0357307b7..3c7147f89 100644 --- a/packages/regression_model/tests/test_predict.py +++ b/packages/regression_model/tests/test_predict.py @@ -7,10 +7,10 @@ def test_make_single_prediction(): # Given test_data = load_dataset(file_name='test.csv') - single_test_json = test_data[0:1].to_json(orient='records') + single_test_input = test_data[0:1] # When - subject = make_prediction(input_data=single_test_json) + subject = make_prediction(input_data=single_test_input) # Then assert subject is not None @@ -22,10 +22,10 @@ def test_make_multiple_predictions(): # Given test_data = load_dataset(file_name='test.csv') original_data_length = len(test_data) - multiple_test_json = test_data.to_json(orient='records') + multiple_test_input = test_data # When - subject = make_prediction(input_data=multiple_test_json) + subject = make_prediction(input_data=multiple_test_input) # Then assert subject is not None diff --git a/packages/regression_model/tox.ini b/packages/regression_model/tox.ini old mode 100644 new mode 100755 index 7fdec534b..7dde476d4 --- a/packages/regression_model/tox.ini +++ b/packages/regression_model/tox.ini @@ -1,35 +1,25 @@ -# Tox is a generic virtualenv management and test command line tool. Its goal is to -# standardize testing in Python. We will be using it extensively in this course. 
- -# Using Tox we can (on multiple operating systems): -# + Eliminate PYTHONPATH challenges when running scripts/tests -# + Eliminate virtualenv setup confusion -# + Streamline steps such as model training, model publishing - [tox] -envlist = regression_model -skipsdist = True +envlist = py38 + [testenv] -install_command = pip install {opts} {packages} +install_command = pip install --pre {opts} {packages} +whitelist_externals = unzip deps = - -rrequirements.txt - -setenv = - PYTHONPATH=. + -rrequirements.txt -commands = - python regression_model/train_pipeline.py - pytest tests/ - - -[testenv:install_locally] -deps = - {[testenv]deps} +passenv = + KAGGLE_USERNAME + KAGGLE_KEY setenv = - PYTHONPATH=. + PYTHONPATH=. commands = - python regression_model/train_pipeline.py - python setup.py sdist bdist_wheel + # kaggle competitions download -c house-prices-advanced-regression-techniques -p regression_model/datasets/ + # unzip -o regression_model/datasets/house-prices-advanced-regression-techniques.zip -d regression_model/datasets + python regression_model/train_pipeline.py + pytest \ + -s \ + -v \ + {posargs:tests} diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 000000000..e391ca79f --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +-r packages/ml_api/requirements.txt diff --git a/scripts/fetch_kaggle_dataset.sh b/scripts/fetch_kaggle_dataset.sh new file mode 100755 index 000000000..455b9c970 --- /dev/null +++ b/scripts/fetch_kaggle_dataset.sh @@ -0,0 +1,3 @@ +#!/usr/bin/env bash + +kaggle competitions download -c house-prices-advanced-regression-techniques -p packages/regression_model/regression_model/datasets/ \ No newline at end of file diff --git a/scripts/fetch_kaggle_large_dataset.sh b/scripts/fetch_kaggle_large_dataset.sh new file mode 100755 index 000000000..e83841e99 --- /dev/null +++ b/scripts/fetch_kaggle_large_dataset.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +TRAINING_DATA_URL="vbookshelf/v2-plant-seedlings-dataset" 
+NOW=$(date) + +kaggle datasets download -d $TRAINING_DATA_URL -p packages/neural_network_model/neural_network_model/datasets/ && \ +unzip packages/neural_network_model/neural_network_model/datasets/v2-plant-seedlings-dataset.zip -d packages/neural_network_model/neural_network_model/datasets/v2-plant-seedlings-dataset && \ +echo $TRAINING_DATA_URL 'retrieved on:' $NOW > packages/neural_network_model/neural_network_model/datasets/training_data_reference.txt && \ +mkdir -p "./packages/neural_network_model/neural_network_model/datasets/v2-plant-seedlings-dataset/Shepherds Purse" && \ +mv -v "./packages/neural_network_model/neural_network_model/datasets/v2-plant-seedlings-dataset/Shepherd’s Purse/"* "./packages/neural_network_model/neural_network_model/datasets/v2-plant-seedlings-dataset/Shepherds Purse" +rm -rf "./packages/neural_network_model/neural_network_model/datasets/v2-plant-seedlings-dataset/Shepherd’s Purse" \ No newline at end of file diff --git a/scripts/input_test.json b/scripts/input_test.json new file mode 100644 index 000000000..bee61b12a --- /dev/null +++ b/scripts/input_test.json @@ -0,0 +1,82 @@ +[{ + "Id": 1461, + "MSSubClass": 20, + "MSZoning": "RH", + "LotFrontage": 80.0, + "LotArea": 11622, + "Street": "Pave", + "Alley": null, + "LotShape": "Reg", + "LandContour": "Lvl", + "Utilities": "AllPub", + "LotConfig": "Inside", + "LandSlope": "Gtl", + "Neighborhood": "NAmes", + "Condition1": "Feedr", + "Condition2": "Norm", + "BldgType": "1Fam", + "HouseStyle": "1Story", + "OverallQual": 5, + "OverallCond": 6, + "YearBuilt": 1961, + "YearRemodAdd": 1961, + "RoofStyle": "Gable", + "RoofMatl": "CompShg", + "Exterior1st": "VinylSd", + "Exterior2nd": "VinylSd", + "MasVnrType": "None", + "MasVnrArea": 0.0, + "ExterQual": "TA", + "ExterCond": "TA", + "Foundation": "CBlock", + "BsmtQual": "TA", + "BsmtCond": "TA", + "BsmtExposure": "No", + "BsmtFinType1": "Rec", + "BsmtFinSF1": 468.0, + "BsmtFinType2": "LwQ", + "BsmtFinSF2": 144.0, + "BsmtUnfSF": 270.0, + 
"TotalBsmtSF": 882.0, + "Heating": "GasA", + "HeatingQC": "TA", + "CentralAir": "Y", + "Electrical": "SBrkr", + "1stFlrSF": 896, + "2ndFlrSF": 0, + "LowQualFinSF": 0, + "GrLivArea": 896, + "BsmtFullBath": 0.0, + "BsmtHalfBath": 0.0, + "FullBath": 1, + "HalfBath": 0, + "BedroomAbvGr": 2, + "KitchenAbvGr": 1, + "KitchenQual": "TA", + "TotRmsAbvGrd": 5, + "Functional": "Typ", + "Fireplaces": 0, + "FireplaceQu": null, + "GarageType": "Attchd", + "GarageYrBlt": 1961.0, + "GarageFinish": "Unf", + "GarageCars": 1.0, + "GarageArea": 730.0, + "GarageQual": "TA", + "GarageCond": "TA", + "PavedDrive": "Y", + "WoodDeckSF": 140, + "OpenPorchSF": 0, + "EnclosedPorch": 0, + "3SsnPorch": 0, + "ScreenPorch": 120, + "PoolArea": 0, + "PoolQC": null, + "Fence": "MnPrv", + "MiscFeature": null, + "MiscVal": 0, + "MoSold": 6, + "YrSold": 2010, + "SaleType": "WD", + "SaleCondition": "Normal" +}] \ No newline at end of file diff --git a/scripts/publish_model.sh b/scripts/publish_model.sh new file mode 100755 index 000000000..9a1cad78a --- /dev/null +++ b/scripts/publish_model.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +# Building packages and uploading them to a Gemfury repository + +GEMFURY_URL=$GEMFURY_PUSH_URL + +set -e + +DIRS="$@" +BASE_DIR=$(pwd) +SETUP="setup.py" + +warn() { + echo "$@" 1>&2 +} + +die() { + warn "$@" + exit 1 +} + +build() { + DIR="${1/%\//}" + echo "Checking directory $DIR" + cd "$BASE_DIR/$DIR" + [ ! -e $SETUP ] && warn "No $SETUP file, skipping" && return + PACKAGE_NAME=$(python $SETUP --fullname) + echo "Package $PACKAGE_NAME" + python "$SETUP" sdist bdist_wheel || die "Building package $PACKAGE_NAME failed" + for X in $(ls dist) + do + curl -F package=@"dist/$X" "$GEMFURY_URL" || die "Uploading package $PACKAGE_NAME failed on file dist/$X" + done +} + +if [ -n "$DIRS" ]; then + for dir in $DIRS; do + build $dir + done +else + ls -d */ | while read dir; do + build $dir + done +fi \ No newline at end of file