diff --git a/.gitignore b/.gitignore index 4e6646b..6870680 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,8 @@ *.rds *.db *.log +*.parquet +*.csv # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/deep4cast/datasets.py b/deep4cast/datasets.py index 5d54d90..a4f9b53 100644 --- a/deep4cast/datasets.py +++ b/deep4cast/datasets.py @@ -1,94 +1,185 @@ +from fastparquet import ParquetFile import numpy as np +import pandas as pd from torch.utils.data import Dataset from deep4cast import transforms class TimeSeriesDataset(Dataset): - """Takes a list of time series and provides access to windowed subseries for - training. + """Provides windowed subseries for training. Each time series is split into + lookback and horizon examples, with the number of examples in each + calculated using ``lookback``, ``horizon`` and ``step``. Requires either + * a list of ``numpy`` arrays or + * a path to parquet files and a CSV containing partition index and partition length. Arguments: - * time_series (list): List of time series ``numpy`` arrays. * lookback (int): Number of time steps used as input for forecasting. * horizon (int): Number of time steps to forecast. * step (int): Time step size between consecutive examples. - * transform (``transforms.Compose``): Specific transformations to apply to time series examples. - * static_covs (list): Static covariates for each item in ``time_series`` list. + * transform (``transforms.Compose``): List of transformations to apply to time series examples. * thinning (float): Fraction of examples to include. + * split (str): Optional, specifies ``train`` or ``test`` split. + * time_series (list): List of time series ``numpy`` arrays. + * path_parquet (str): File location of partitioned parquet files. + * path_metadata (list): List of CSV file locations containing time series id and length. 
""" - def __init__(self, - time_series, + + def __init__(self, lookback, horizon, - step, + step, transform, - static_covs=None, - thinning=1.0): - self.time_series = time_series + thinning=1.0, + split=None, + time_series=None, + path_parquet=None, + path_metadata=None): self.lookback = lookback self.horizon = horizon self.step = step self.transform = transform - self.static_covs = static_covs + self.split = split + self.path_parquet = path_parquet + self.time_series = time_series + + if path_parquet: + self._examples_parquet(path_metadata=path_metadata) + elif time_series: + self._examples_array() + + # Store the number of training examples + self._len = int(len(self.example_ids) * thinning) + + def _examples_parquet(self, path_metadata): + """Takes a file location of metadata about the length of each time series + and calculates number of examples in each. + + Arguments: + * path_metadata (list): List of CSV files containing time series id and length. + + """ + path_file = ParquetFile(self.path_parquet) + self.partitions = path_file.info['partitions'][0] # Slice each time series into examples, assigning IDs to each - last_id = 0 + example_ids = {} + n_dropped = 0 + for file_meta in path_metadata: + with open(file_meta) as infile: + for line in infile: + line = line.strip('\n') + line = line.split(',') + index = line[0] # Parition name + length = int(line[1]) # Length of time series + # Withhold the horizon for testing + if self.split is 'train': + length -= self.horizon + # At least the horizon is required for zero-padding + if length < self.horizon: + n_dropped += 1 + continue + # Slice each time series into examples, assigning IDs to each + example_ids = self._hashmap( + index=index, + length=length, + example_ids=example_ids) + + # Inform user about time series that were too short + if n_dropped > 0: + print('Dropped {} time series due to length.'.format(n_dropped)) + + self.example_ids = example_ids + + def _examples_array(self): + """Takes a list of time 
series and calculates number of examples in each. + + """ n_dropped = 0 - self.example_ids = {} + example_ids = {} for i, ts in enumerate(self.time_series): - num_examples = (ts.shape[-1] - self.lookback - self.horizon + self.step) // self.step # Time series shorter than the forecast horizon need to be dropped. if ts.shape[-1] < self.horizon: n_dropped += 1 continue - # For short time series zero pad the input - if ts.shape[-1] < self.lookback + self.horizon: - num_examples = 1 - for j in range(num_examples): - self.example_ids[last_id + j] = (i, j * self.step) - last_id += num_examples + # Slice each time series into examples, assigning IDs to each + example_ids = self._hashmap( + index=i, + length=ts.shape[-1], + example_ids=example_ids) # Inform user about time series that were too short if n_dropped > 0: - print("Dropped {}/{} time series due to length.".format( - n_dropped, len(self.time_series) - ) - ) + print('Dropped {} time series due to length.'.format(n_dropped)) + + self.example_ids = example_ids - # Store the number of training examples - self._len = int(self.example_ids.__len__() * thinning) + def _hashmap(self, index, length, example_ids): + """Creates a dictionary of windowed examples indexed on the time series + and location within the time series. + + Arguments: + * index: Either list index or parquet partition name. + * length (int): Length of the indexed time series. + * example_ids (dict): Dictionary where the key is the example + number and the value is the tuple of + (time series index, start position for example slice). 
+ + """ + last_id = len(example_ids) + + # only use last lookback + horizon for test case + if self.split == 'test': + length = length - self.lookback - self.horizon + length = max((length, 0)) + example_ids[last_id] = (index, length) + + return example_ids + + num_examples = (length - self.lookback - + self.horizon + self.step) // self.step + # For short time series we will zero pad the input + num_examples = max((num_examples, 1)) + # (time series index, start position for example slice) + for j in range(num_examples): + example_ids[last_id + j] = (index, j * self.step) + + return example_ids def __len__(self): return self._len def __getitem__(self, idx): - # Get time series ts_id, lookback_id = self.example_ids[idx] - ts = self.time_series[ts_id] + + if self.path_parquet: + path_file = self.path_parquet + self.partitions + '=' + ts_id + '/part.0.parquet' + + ts = ParquetFile(path_file) + ts = ts.to_pandas() + ts = ts.values.T + elif self.time_series: + ts = self.time_series[ts_id] # Prepare input and target. Zero pad if necessary. 
if ts.shape[-1] < self.lookback + self.horizon: # If the time series is too short, we zero pad X = ts[:, :-self.horizon] X = np.pad( - X, - pad_width=((0, 0), (self.lookback - X.shape[-1], 0)), - mode='constant', + X, + pad_width=((0, 0), (self.lookback - X.shape[-1], 0)), + mode='constant', constant_values=0 ) y = ts[:, -self.horizon:] else: X = ts[:, lookback_id:lookback_id + self.lookback] - y = ts[:, lookback_id + self.lookback:lookback_id + self.lookback + self.horizon] + y = ts[:, lookback_id + self.lookback:lookback_id + + self.lookback + self.horizon] # Create the input and output for the sample sample = {'X': X, 'y': y} sample = self.transform(sample) - # Static covariates can be attached - if self.static_covs is not None: - sample['X_stat'] = self.static_covs[ts_id] - return sample diff --git a/docs/conf.py b/docs/conf.py index bce6ca8..16c6407 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -22,4 +22,4 @@ } autoclass_content = "both" use_system_site_packages = True -autodoc_mock_imports = ["numpy", "torch"] \ No newline at end of file +autodoc_mock_imports = ["numpy", "torch", "pandas", "fastparquet"] \ No newline at end of file diff --git a/docs/examples/m4daily.ipynb b/docs/examples/m4daily.ipynb index 5dc65ca..37f9f27 100644 --- a/docs/examples/m4daily.ipynb +++ b/docs/examples/m4daily.ipynb @@ -14,8 +14,8 @@ "execution_count": 1, "metadata": { "ExecuteTime": { - "end_time": "2019-06-28T17:15:02.007580Z", - "start_time": "2019-06-28T17:15:01.380345Z" + "end_time": "2019-08-01T18:02:31.484908Z", + "start_time": "2019-08-01T18:02:30.655744Z" }, "scrolled": true }, @@ -24,8 +24,6 @@ "import numpy as np\n", "import os\n", "import pandas as pd\n", - "import datetime as dt\n", - "import matplotlib.pyplot as plt\n", "\n", "import torch\n", "from torch.utils.data import DataLoader\n", @@ -40,9 +38,7 @@ "np.random.seed(0)\n", "torch.manual_seed(0)\n", "# Use a gpu if available, otherwise use cpu\n", - "device = ('cuda' if torch.cuda.is_available() else 
'cpu')\n", - "\n", - "%matplotlib inline" + "device = ('cuda' if torch.cuda.is_available() else 'cpu')" ] }, { @@ -50,7 +46,7 @@ "metadata": {}, "source": [ "## Dataset\n", - "In this section we inspect the dataset, split it into a training and a test set, and prepare it for easy consuption with PyTorch-based data loaders. Model construction and training will be done in the next section." + "In this section we inspect the dataset, split it into a training and a test set, and prepare it for easy consumption with PyTorch-based data loaders. Model construction and training will be done in the next section." ] }, { @@ -58,8 +54,8 @@ "execution_count": 2, "metadata": { "ExecuteTime": { - "end_time": "2019-06-28T17:15:02.017357Z", - "start_time": "2019-06-28T17:15:02.011736Z" + "end_time": "2019-08-01T18:02:31.493230Z", + "start_time": "2019-08-01T18:02:31.486922Z" } }, "outputs": [], @@ -75,8 +71,8 @@ "execution_count": 3, "metadata": { "ExecuteTime": { - "end_time": "2019-06-28T17:15:18.767394Z", - "start_time": "2019-06-28T17:15:02.019564Z" + "end_time": "2019-08-01T18:02:49.047951Z", + "start_time": "2019-08-01T18:02:31.495355Z" } }, "outputs": [], @@ -92,10 +88,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Divide into train and test\n", + "### Data handling\n", "We use the DataLoader object from PyTorch to build batches from the test data set.\n", "\n", "However, we first need to specify how much history to use in creating a forecast of a given length:\n", + "\n", "- horizon = time steps to forecast\n", "- lookback = time steps leading up to the period to be forecast" ] @@ -105,8 +102,8 @@ "execution_count": 4, "metadata": { "ExecuteTime": { - "end_time": "2019-06-28T17:15:18.771334Z", - "start_time": "2019-06-28T17:15:18.769032Z" + "end_time": "2019-08-01T18:02:49.052666Z", + "start_time": "2019-08-01T18:02:49.050082Z" } }, "outputs": [], @@ -119,7 +116,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We've also found that it is not 
necessary to train on the full dataset, so we here select a 10% random sample of time series for training. We will evaluate on the full dataset later." + "We hold out the final horizon from the training time series." ] }, { @@ -127,18 +124,15 @@ "execution_count": 5, "metadata": { "ExecuteTime": { - "end_time": "2019-06-28T17:15:18.873938Z", - "start_time": "2019-06-28T17:15:18.772798Z" + "end_time": "2019-08-01T18:02:49.145905Z", + "start_time": "2019-08-01T18:02:49.054235Z" } }, "outputs": [], "source": [ - "import random\n", - "\n", "data_train = []\n", "for time_series in data_arr:\n", - " data_train.append(time_series[:, :-horizon],)\n", - "data_train = random.sample(data_train, int(len(data_train) * 0.1))" + " data_train.append(time_series[:, :-horizon],)" ] }, { @@ -160,8 +154,8 @@ "execution_count": 6, "metadata": { "ExecuteTime": { - "end_time": "2019-06-28T17:15:18.950829Z", - "start_time": "2019-06-28T17:15:18.876296Z" + "end_time": "2019-08-01T18:02:49.223970Z", + "start_time": "2019-08-01T18:02:49.148959Z" } }, "outputs": [], @@ -180,7 +174,9 @@ "source": [ "`TimeSeriesDataset` inherits from [Torch Datasets](https://pytorch.org/docs/stable/data.html#torch.utils.data.Dataset) for use with [Torch DataLoader](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader). It handles the creation of the examples used to train the network using `lookback` and `horizon` to partition the time series.\n", "\n", - "The parameter 'step' controls how far apart consective windowed samples from a time series are spaced. For example, for a time series of length 100 and a setup with lookback 24 and horizon 12, we split the original time series into smaller training examples of length 24+12=36. How much these examples are overlapping is controlled by the parameter `step` in `TimeSeriesDataset`." + "The parameter 'step' controls how far apart consecutive windowed samples from a time series are spaced. 
For example, for a time series of length 100 and a setup with lookback 24 and horizon 12, we split the original time series into smaller training examples of length 24+12=36. How much these examples are overlapping is controlled by the parameter `step` in `TimeSeriesDataset`.\n", + "\n", + "We've also found that it is not necessary to train on the full dataset, so we here select a 10% random sample (``thinning = 0.1``) of time series for training. We will evaluate on the full dataset later." ] }, { @@ -188,18 +184,19 @@ "execution_count": 7, "metadata": { "ExecuteTime": { - "end_time": "2019-06-28T17:15:19.243876Z", - "start_time": "2019-06-28T17:15:18.954125Z" + "end_time": "2019-08-01T18:02:51.293731Z", + "start_time": "2019-08-01T18:02:49.227556Z" } }, "outputs": [], "source": [ "data_train = TimeSeriesDataset(\n", - " data_train, \n", - " lookback, \n", - " horizon,\n", + " time_series=data_train, \n", + " lookback=lookback, \n", + " horizon=horizon,\n", " step=1,\n", - " transform=transform\n", + " transform=transform,\n", + " thinning=0.1\n", ")\n", "\n", "# Create mini-batch data loader\n", @@ -208,7 +205,7 @@ " batch_size=512, \n", " shuffle=True, \n", " pin_memory=True,\n", - " num_workers=1\n", + " num_workers=4\n", ")" ] }, @@ -236,8 +233,8 @@ "execution_count": 8, "metadata": { "ExecuteTime": { - "end_time": "2019-06-28T17:15:19.261939Z", - "start_time": "2019-06-28T17:15:19.246822Z" + "end_time": "2019-08-01T18:02:51.311484Z", + "start_time": "2019-08-01T18:02:51.296273Z" } }, "outputs": [ @@ -258,7 +255,6 @@ " horizon=horizon, \n", " hidden_channels=89,\n", " skip_channels=199,\n", - " dense_units=156,\n", " n_layers=7)\n", "\n", "print('Number of model parameters: {}.'.format(model.n_parameters))\n", @@ -281,8 +277,8 @@ "execution_count": 9, "metadata": { "ExecuteTime": { - "end_time": "2019-06-28T17:52:16.907027Z", - "start_time": "2019-06-28T17:15:19.263466Z" + "end_time": "2019-08-01T18:25:43.821740Z", + "start_time": "2019-08-01T18:02:51.313033Z" 
} }, "outputs": [ @@ -290,7 +286,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/home/austin/miniconda3/envs/d4cGithub/lib/python3.6/site-packages/torch/nn/parallel/_functions.py:61: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + "/home/austin/miniconda3/envs/d4c/lib/python3.6/site-packages/torch/nn/parallel/_functions.py:61: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", " warnings.warn('Was asked to gather along dimension 0, but all '\n" ] }, @@ -298,16 +294,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "Epoch 1/5 [915731/915731 (100%)]\tLoss: -1.863526\tElapsed/Remaining: 3m52s/15m30s \n", - "Training error: -2.67e+01.\n", - "Epoch 2/5 [915731/915731 (100%)]\tLoss: -1.963631\tElapsed/Remaining: 11m21s/17m2s \n", - "Training error: -2.71e+01.\n", - "Epoch 3/5 [915731/915731 (100%)]\tLoss: -1.983338\tElapsed/Remaining: 18m42s/12m28s \n", - "Training error: -2.75e+01.\n", - "Epoch 4/5 [915731/915731 (100%)]\tLoss: -1.974977\tElapsed/Remaining: 26m2s/6m30s \n", + "Epoch 1/5 [931419/931419 (100%)]\tLoss: -1.922641\tElapsed/Remaining: 3m17s/13m9s \n", "Training error: -2.78e+01.\n", - "Epoch 5/5 [915731/915731 (100%)]\tLoss: -2.073579\tElapsed/Remaining: 33m20s/0m0s \n", - "Training error: -2.83e+01.\n" + "Epoch 2/5 [931419/931419 (100%)]\tLoss: -2.105988\tElapsed/Remaining: 7m52s/11m48s \n", + "Training error: -2.85e+01.\n", + "Epoch 3/5 [931419/931419 (100%)]\tLoss: -2.109848\tElapsed/Remaining: 12m25s/8m17s \n", + "Training error: -2.97e+01.\n", + "Epoch 4/5 [931419/931419 (100%)]\tLoss: -2.160128\tElapsed/Remaining: 16m60s/4m15s \n", + "Training error: -3.08e+01.\n", + "Epoch 5/5 [931419/931419 (100%)]\tLoss: -2.279575\tElapsed/Remaining: 21m35s/0m0s \n", + "Training error: -3.18e+01.\n" ] } ], @@ -330,8 +326,8 @@ "execution_count": 10, "metadata": { 
"ExecuteTime": { - "end_time": "2019-06-28T17:52:33.409674Z", - "start_time": "2019-06-28T17:52:16.911086Z" + "end_time": "2019-08-01T18:26:00.889421Z", + "start_time": "2019-08-01T18:25:43.825929Z" } }, "outputs": [], @@ -349,26 +345,32 @@ " data_arr.append(ts)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Only retain the final ``lookback`` and ``horizon`` for testing." + ] + }, { "cell_type": "code", "execution_count": 11, "metadata": { "ExecuteTime": { - "end_time": "2019-06-28T17:52:33.421253Z", - "start_time": "2019-06-28T17:52:33.411359Z" + "end_time": "2019-08-01T18:26:00.902606Z", + "start_time": "2019-08-01T18:26:00.891310Z" } }, "outputs": [], "source": [ - "# Sequentialize the training and testing dataset\n", "data_test = []\n", "for time_series in data_arr:\n", " data_test.append(time_series[:, -horizon-lookback:])\n", "\n", "data_test = TimeSeriesDataset(\n", - " data_test, \n", - " lookback, \n", - " horizon, \n", + " time_series=data_test, \n", + " lookback=lookback, \n", + " horizon=horizon,\n", " step=1,\n", " transform=transform\n", ")\n", @@ -376,7 +378,7 @@ " data_test, \n", " batch_size=1024, \n", " shuffle=False,\n", - " num_workers=2\n", + " num_workers=4\n", ")" ] }, @@ -393,8 +395,8 @@ "execution_count": 12, "metadata": { "ExecuteTime": { - "end_time": "2019-06-28T17:52:55.851568Z", - "start_time": "2019-06-28T17:52:33.422806Z" + "end_time": "2019-08-01T18:26:23.230885Z", + "start_time": "2019-08-01T18:26:00.904514Z" } }, "outputs": [], @@ -422,8 +424,8 @@ "execution_count": 13, "metadata": { "ExecuteTime": { - "end_time": "2019-06-28T17:52:55.953031Z", - "start_time": "2019-06-28T17:52:55.853679Z" + "end_time": "2019-08-01T18:26:23.333064Z", + "start_time": "2019-08-01T18:26:23.233143Z" } }, "outputs": [ @@ -431,7 +433,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "SMAPE: 3.1666347980499268%\n" + "SMAPE: 3.197432041168213%\n" ] } ], @@ -445,9 +447,9 @@ ], "metadata": { "kernelspec": { - 
"display_name": "d4cGithub", + "display_name": "d4c", "language": "python", - "name": "d4cgithub" + "name": "d4c" }, "language_info": { "codemirror_mode": { diff --git a/docs/examples/m4daily_distributed.ipynb b/docs/examples/m4daily_distributed.ipynb new file mode 100644 index 0000000..7e7ec9d --- /dev/null +++ b/docs/examples/m4daily_distributed.ipynb @@ -0,0 +1,773 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Tutorial: Distributed Dataset\n", + "\n", + "This notebook is designed to give a simple introduction to forecasting using the Deep4Cast package. The time series data is taken from the [M4 dataset](https://github.com/M4Competition/M4-methods/tree/master/Dataset), specifically, the ``Daily`` subset of the data. \n", + "\n", + "Since most of the content is duplicated from the M4 Daily notebook we will here focus only on how to use the distributed dataset features." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2019-08-01T17:23:44.499193Z", + "start_time": "2019-08-01T17:23:43.686001Z" + }, + "scrolled": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import os\n", + "import pandas as pd\n", + "\n", + "import torch\n", + "from torch.utils.data import DataLoader\n", + "\n", + "from deep4cast.forecasters import Forecaster\n", + "from deep4cast.models import WaveNet\n", + "from deep4cast.datasets import TimeSeriesDataset\n", + "import deep4cast.transforms as transforms\n", + "import deep4cast.metrics as metrics\n", + "\n", + "# Make RNG predictable\n", + "np.random.seed(0)\n", + "torch.manual_seed(0)\n", + "# Use a gpu if available, otherwise use cpu\n", + "device = ('cuda' if torch.cuda.is_available() else 'cpu')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dataset\n", + "In this section we prepare the dataset, write it into parquet files, and prepare it for easy consumption with PyTorch-based data 
loaders. Model construction and training will be done in the next section." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2019-08-01T17:23:44.507062Z", + "start_time": "2019-08-01T17:23:44.501105Z" + } + }, + "outputs": [], + "source": [ + "if not os.path.exists('data/Daily-train.csv'):\n", + " !wget https://raw.githubusercontent.com/M4Competition/M4-methods/master/Dataset/Train/Daily-train.csv -P data/\n", + "if not os.path.exists('data/Daily-test.csv'):\n", + " !wget https://raw.githubusercontent.com/M4Competition/M4-methods/master/Dataset/Test/Daily-test.csv -P data/" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2019-08-01T17:24:00.652072Z", + "start_time": "2019-08-01T17:23:44.509155Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
V1V2V3V4V5V6V7V8V9V10...V9911V9912V9913V9914V9915V9916V9917V9918V9919V9920
0D11017.101019.301017.001019.201018.701015.601018.501018.301018.4...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1D22793.702793.802803.702805.802802.302795.002806.402782.202780.3...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2D31091.301088.501085.701082.901080.101077.301074.501071.701068.9...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
3D41092.001078.001064.001050.001036.001022.001008.001092.001078.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4D52938.632956.442964.412972.413014.973014.233024.083031.973062.7...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

5 rows × 9920 columns

\n", + "
" + ], + "text/plain": [ + " V1 V2 V3 V4 V5 V6 V7 V8 V9 \\\n", + "0 D1 1017.10 1019.30 1017.00 1019.20 1018.70 1015.60 1018.50 1018.30 \n", + "1 D2 2793.70 2793.80 2803.70 2805.80 2802.30 2795.00 2806.40 2782.20 \n", + "2 D3 1091.30 1088.50 1085.70 1082.90 1080.10 1077.30 1074.50 1071.70 \n", + "3 D4 1092.00 1078.00 1064.00 1050.00 1036.00 1022.00 1008.00 1092.00 \n", + "4 D5 2938.63 2956.44 2964.41 2972.41 3014.97 3014.23 3024.08 3031.97 \n", + "\n", + " V10 ... V9911 V9912 V9913 V9914 V9915 V9916 V9917 V9918 V9919 \\\n", + "0 1018.4 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", + "1 2780.3 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", + "2 1068.9 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", + "3 1078.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", + "4 3062.7 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", + "\n", + " V9920 \n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "\n", + "[5 rows x 9920 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv('data/Daily-train.csv')\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We transform from wide to long format to facilitate paritioning parquet files on the time series id." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2019-08-01T17:24:04.016429Z", + "start_time": "2019-08-01T17:24:00.653918Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
V1value
0D11017.10
1D22793.70
2D31091.30
3D41092.00
4D52938.63
\n", + "
" + ], + "text/plain": [ + " V1 value\n", + "0 D1 1017.10\n", + "1 D2 2793.70\n", + "2 D3 1091.30\n", + "3 D4 1092.00\n", + "4 D5 2938.63" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = df.melt(id_vars='V1')\n", + "df = df[df.value.notnull()]\n", + "df = df.reset_index(drop=True)\n", + "df = df.drop('variable', axis=1)\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We create parquet files by paritioning on the time series id. This creates directories with parquet files containing the entirety of the single time series." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2019-08-01T17:24:16.690077Z", + "start_time": "2019-08-01T17:24:04.018107Z" + } + }, + "outputs": [], + "source": [ + "df.to_parquet(\n", + " 'data/m4/daily/',\n", + " engine='fastparquet',\n", + " partition_cols=['V1'],\n", + " compression=None)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data handling\n", + "\n", + "We use the DataLoader object from PyTorch to build batches from the data set.\n", + "\n", + "However, we first need to specify how much history to use in creating a forecast of a given length:\n", + "- horizon = time steps to forecast\n", + "- lookback = time steps leading up to the period to be forecast" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2019-08-01T17:24:16.695828Z", + "start_time": "2019-08-01T17:24:16.692204Z" + } + }, + "outputs": [], + "source": [ + "horizon = 14\n", + "lookback = 128\n", + "\n", + "transform = transforms.Compose([\n", + " transforms.ToTensor(),\n", + " transforms.LogTransform(targets=[0], offset=1.0),\n", + " transforms.RemoveLast(targets=[0]),\n", + " transforms.Target(targets=[0]),\n", + "])" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + 
"ExecuteTime": { + "end_time": "2019-08-01T17:24:17.320766Z", + "start_time": "2019-08-01T17:24:16.697613Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
value
V1
D11006
D10674
D1001006
D10001052
D10011052
\n", + "
" + ], + "text/plain": [ + " value\n", + "V1 \n", + "D1 1006\n", + "D10 674\n", + "D100 1006\n", + "D1000 1052\n", + "D1001 1052" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfg = df.groupby('V1').count()\n", + "dfg.to_csv('data/m4/daily/_metadata_partition.csv', header=None)\n", + "dfg.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`TimeSeriesDataset` inherits from [Torch Datasets](https://pytorch.org/docs/stable/data.html#torch.utils.data.Dataset) for use with [Torch DataLoader](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader). It handles the creation of the examples used to train the network using `lookback` and `horizon` to partition the time series.\n", + "\n", + "Instead of providing an array of ``numpy`` time series, we here provide a path to the paritioned parquet files as well as a list of files locations containing metadata on the time series ids. The metadata file has the partition name (first column) and the length of the time series (second column). This will be used to calculate the number of examples in each time series.\n", + "\n", + "Finally, since the entire time series is stored in the parquet file, if we want to perform a train-test split then we set ``split='train'``, this holds out the final horizon from each time series from training. Setting ``split='test'`` will conversely provide only the final ``lookback`` and ``horizon``." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2019-08-01T17:24:20.372816Z", + "start_time": "2019-08-01T17:24:17.322796Z" + } + }, + "outputs": [], + "source": [ + "data_train = TimeSeriesDataset(\n", + " path_parquet='data/m4/daily/',\n", + " path_metadata=['data/m4/daily/_metadata_partition.csv'],\n", + " lookback=lookback, \n", + " horizon=horizon,\n", + " step=1,\n", + " transform=transform,\n", + " thinning=0.1,\n", + " split='train'\n", + ")\n", + "dataloader_train = DataLoader(\n", + " data_train, \n", + " batch_size=512, \n", + " shuffle=True, \n", + " pin_memory=True,\n", + " num_workers=8\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Modeling and Forecasting" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "ExecuteTime": { + "end_time": "2019-08-01T17:24:20.384823Z", + "start_time": "2019-08-01T17:24:20.374533Z" + } + }, + "outputs": [], + "source": [ + "model = WaveNet(input_channels=1,\n", + " output_channels=1,\n", + " horizon=horizon, \n", + " n_layers=7)\n", + "\n", + "if torch.cuda.device_count() > 1:\n", + " model = torch.nn.DataParallel(model)\n", + "\n", + "optim = torch.optim.Adam(model.parameters(), lr=0.001)\n", + "\n", + "loss = torch.distributions.StudentT" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "ExecuteTime": { + "end_time": "2019-08-01T17:28:15.037658Z", + "start_time": "2019-08-01T17:24:20.386316Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/austin/miniconda3/envs/d4c/lib/python3.6/site-packages/torch/nn/parallel/_functions.py:61: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 
1/1 [931419/931419 (100%)]\tLoss: -1.971866\tElapsed/Remaining: 3m52s/0m0s " + ] + } + ], + "source": [ + "forecaster = Forecaster(model, loss, optim, n_epochs=1, device=device)\n", + "forecaster.fit(dataloader_train, eval_model=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluation\n", + "\n", + "We need to append the ``lookback`` to the test data so that we can make forecasts to compare to actuals." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "ExecuteTime": { + "end_time": "2019-08-01T17:28:31.507650Z", + "start_time": "2019-08-01T17:28:15.039723Z" + } + }, + "outputs": [], + "source": [ + "data_train = pd.read_csv('data/Daily-train.csv')\n", + "data_test = pd.read_csv('data/Daily-test.csv')\n", + "data_train = data_train.iloc[:, 1:].values\n", + "data_test = data_test.iloc[:, 1:].values\n", + "\n", + "data_arr = []\n", + "for ts_train, ts_test in zip(data_train, data_test):\n", + " ts_a = ts_train[~np.isnan(ts_train)]\n", + " ts_b = ts_test\n", + " ts = np.concatenate([ts_a, ts_b])[None, :]\n", + " data_arr.append(ts)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here we provide a list of ``numpy`` arrays containing the train and test time series. ``TimeSeriesDataset`` creates a test split (``split='test'``) providing the final ``lookback`` and ``horizon`` of each time series so that the ``lookback`` can be used to create a forecast." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "ExecuteTime": { + "end_time": "2019-08-01T17:28:31.517061Z", + "start_time": "2019-08-01T17:28:31.509487Z" + } + }, + "outputs": [], + "source": [ + "data_test = TimeSeriesDataset(\n", + " time_series=data_arr,\n", + " lookback=lookback, \n", + " horizon=horizon, \n", + " step=1,\n", + " transform=transform,\n", + " split='test'\n", + ")\n", + "dataloader_test = DataLoader(\n", + " data_test, \n", + " batch_size=1024, \n", + " shuffle=False,\n", + " num_workers=8\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "ExecuteTime": { + "end_time": "2019-08-01T17:28:51.065291Z", + "start_time": "2019-08-01T17:28:31.518886Z" + } + }, + "outputs": [], + "source": [ + "y_test = []\n", + "for example in dataloader_test:\n", + " example = dataloader_test.dataset.transform.untransform(example)\n", + " y_test.append(example['y'])\n", + "y_test = np.concatenate(y_test)\n", + "\n", + "y_samples = forecaster.predict(dataloader_test, n_samples=100)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "ExecuteTime": { + "end_time": "2019-08-01T17:28:51.166945Z", + "start_time": "2019-08-01T17:28:51.067609Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SMAPE: 3.007478952407837%\n" + ] + } + ], + "source": [ + "test_smape = metrics.smape(y_samples, y_test)\n", + "\n", + "print('SMAPE: {}%'.format(test_smape.mean()))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "d4c", + "language": "python", + "name": "d4c" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": 
false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": true, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/docs/get_started.rst b/docs/get_started.rst index f4c1a9b..45b1cee 100644 --- a/docs/get_started.rst +++ b/docs/get_started.rst @@ -8,7 +8,7 @@ Main Requirements ================= - `python 3.6 `_ -- `pytorch 1.0 `_ +- `pytorch 1.1 `_ Installation ============ diff --git a/requirements.txt b/requirements.txt index 47a725c..b055607 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ -torch>=1.0.0 -torchvision>=0.2.1 +torch>=1.1.0 +torchvision>=0.3.0 matplotlib>=3.0.3 numpy>=1.16.2 pandas>=0.24.2 -scipy>=1.2.1 \ No newline at end of file +scipy>=1.2.1 +fastparquet>=0.3.1 \ No newline at end of file