From 06d829766f05571c81b80d17382279c4c11be8d3 Mon Sep 17 00:00:00 2001
From: Michael Hemming
Date: Mon, 7 Apr 2025 14:24:21 +1000
Subject: [PATCH 1/9] Update to Python 3.11 and remove constraints

---
 constraints.txt |  4 ----
 setup.py        | 15 +++++++--------
 2 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/constraints.txt b/constraints.txt
index c14b9998..e69de29b 100644
--- a/constraints.txt
+++ b/constraints.txt
@@ -1,4 +0,0 @@
-cftime<1.1.1;python_version=='3.5'
-netCDF4<1.5.4;python_version=='3.5'
-pandas<0.25.0;python_version=='3.5'
-xarray<0.14.0;python_version=='3.5'
diff --git a/setup.py b/setup.py
index cbe40b92..a6a67d54 100644
--- a/setup.py
+++ b/setup.py
@@ -1,11 +1,11 @@
 from setuptools import setup, find_packages
 
 INSTALL_REQUIRES = [
-    'jsonschema>=2.6.0,<3.0.0',
-    'numpy>=1.13.0',
-    'netCDF4>=1.5.3',
-    'pandas>=0.24.2',
-    'xarray>=0.11.3'
+    'jsonschema>=4.23.0',
+    'numpy>=2.2.4',
+    'netCDF4>=1.7.2',
+    'pandas>=2.2.3',
+    'xarray>=2025.3.1'
 ]
 
 TESTS_REQUIRE = [
@@ -37,7 +37,7 @@
     author_email='projectofficers@emii.org.au',
     description='AODN data tools library',
     zip_safe=False,
-    python_requires='>=3.5',
+    python_requires='>=3.11, <3.12',
     install_requires=INSTALL_REQUIRES,
     tests_require=TESTS_REQUIRE,
     extras_require=EXTRAS_REQUIRE,
@@ -49,8 +49,7 @@
         'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
         'Programming Language :: Python',
         'Programming Language :: Python :: 3',
-        'Programming Language :: Python :: 3.5',
-        'Programming Language :: Python :: 3.6',
+        'Programming Language :: Python :: 3.11',
         'Programming Language :: Python :: Implementation :: CPython',
     ]
 )

From 4dc990bc5876cf736597a0bb5cfa487b25a70a13 Mon Sep 17 00:00:00 2001
From: Michael Hemming
Date: Mon, 7 Apr 2025 14:27:07 +1000
Subject: [PATCH 2/9] Update for pandas 2.0, which removed the
 DataFrame.append() method

---
 .../timeseries_products/hourly_timeseries.py  | 43 ++++++++++---------
 .../velocity_hourly_timeseries.py             |  2 +-
 2 files changed, 24 insertions(+), 21 deletions(-)

diff --git a/aodntools/timeseries_products/hourly_timeseries.py b/aodntools/timeseries_products/hourly_timeseries.py
index bc079534..7f23db26 100644
--- a/aodntools/timeseries_products/hourly_timeseries.py
+++ b/aodntools/timeseries_products/hourly_timeseries.py
@@ -30,27 +30,27 @@ def check_files(file_list, site_code, parameter_names_accepted, input_dir=''):
     :param input_dir: base path where source files are stored
     :return: dictionary with the file name and list of failed tests, list good files chronologically ordered
     """
-
-    file_list_dataframe = pd.DataFrame(columns=["url", "deployment_date"])
+    rows = []
     error_dict = {}
     for file in file_list:
         with xr.open_dataset(os.path.join(input_dir, file)) as nc:
             error_list = check_file(nc, site_code, parameter_names_accepted)
         if error_list:
-            error_dict.update({file: error_list})
+            error_dict[file] = error_list
         else:
-            file_list_dataframe = file_list_dataframe.append({'url': file,
-                                                              'deployment_date': parse(nc.time_deployment_start)},
-                                                             ignore_index=True)
+            rows.append({
+                'url': file,
+                'deployment_date': parse(nc.time_deployment_start)
+            })
+    file_list_dataframe = pd.DataFrame(rows, columns=["url", "deployment_date"])
     file_list_dataframe = file_list_dataframe.sort_values(by='deployment_date')
-    file_list = file_list_dataframe['url'].to_list()
-    if file_list == []:
+    sorted_files = file_list_dataframe['url'].to_list()
+    if not sorted_files:
         raise NoInputFilesError("no valid input files to aggregate")
-    return file_list, error_dict
-
+    return sorted_files, error_dict
 
 def get_parameter_names(nc):
@@ -308,7 +308,7 @@ def PDresample_by_hour(df, function_dict, function_stats):
     df_data = pd.DataFrame(index=pd.DatetimeIndex([]))
     for variable in varnames:
         ds_var = df[variable]
-        ds_var_resample = ds_var.resample('1H', base=0.5)  # shift by half hour to centre bin on the hour
+        ds_var_resample = ds_var.resample('1h', offset='30min')  # shift by half hour to centre bin on the hour
         ds_var_mean = ds_var_resample.apply(function_dict[variable]).astype(np.float32)
         df_data = pd.concat([df_data, ds_var_mean], axis=1, sort=False)
     for stat_method in function_stats:
@@ -366,8 +366,6 @@ def hourly_aggregator(files_to_aggregate, site_code, qcflags, input_dir='', outp
         variable_attribute_dictionary = json.load(json_file)['_variables']
 
     df_data = pd.DataFrame()
-
-    ## create empty DF with dtypes
     metadata_df_types = [('source_file', str),
                          ('instrument_id', str),
@@ -380,6 +378,7 @@ def hourly_aggregator(files_to_aggregate, site_code, qcflags, input_dir='', outp
     parameter_names_all = []
     applied_offset = []
     qc_count_all = {}
+    metadata_rows = []
 
     for file_index, file in enumerate(files_to_aggregate):
         print(file_index)
@@ -398,13 +397,16 @@
             qc_count = get_QCcount(nc_clean, qcflags)
             qc_count_all = update_QCcount(qc_count_all, qc_count)
             nc_clean = good_data_only(nc_clean, qcflags)  # good quality data only
-            df_metadata = df_metadata.append({'source_file': file,
-                                              'instrument_id': utils.get_instrument_id(nc),
-                                              'LONGITUDE': nc.LONGITUDE.squeeze().values,
-                                              'LATITUDE': nc.LATITUDE.squeeze().values,
-                                              'NOMINAL_DEPTH': get_nominal_depth(nc)},
-                                             ignore_index=True)
-
+
+            # Append a new row as a dictionary to the list.
+            metadata_rows.append({
+                'source_file': file,
+                'instrument_id': utils.get_instrument_id(nc),
+                'LONGITUDE': nc.LONGITUDE.squeeze().values,
+                'LATITUDE': nc.LATITUDE.squeeze().values,
+                'NOMINAL_DEPTH': get_nominal_depth(nc)
+            })
+
             # If TIME had out-of-range values before cleaning, nc_clean would now have a CFTimeIndex, which
             # breaks the resampling further down. Here we reset it to a DatetimeIndex as suggested here:
             # https://stackoverflow.com/questions/55786995/converting-cftime-datetimejulian-to-datetime/55787899#55787899
@@ -421,6 +423,7 @@
             df_temp['instrument_index'] = np.repeat(file_index, len(df_temp)).astype(np.int32)
             df_data = pd.concat([df_data, df_temp.reset_index()], ignore_index=True, sort=False)
 
+    df_metadata = pd.DataFrame(metadata_rows, columns=['source_file', 'instrument_id', 'LONGITUDE', 'LATITUDE', 'NOMINAL_DEPTH'])
     df_metadata.index.rename('INSTRUMENT', inplace=True)
     df_data.index.rename('OBSERVATION', inplace=True)  ## rename index to TIME
diff --git a/aodntools/timeseries_products/velocity_hourly_timeseries.py b/aodntools/timeseries_products/velocity_hourly_timeseries.py
index fd12b497..9a7e4911 100644
--- a/aodntools/timeseries_products/velocity_hourly_timeseries.py
+++ b/aodntools/timeseries_products/velocity_hourly_timeseries.py
@@ -58,7 +58,7 @@ def append_resampled_values(nc_cell, ds, slice_start, binning_functions):
 
     # shift the index forward 30min to centre the bins on the hour
     df_cell.index = df_cell.index + pd.Timedelta(minutes=30)
-    df_cell_1H = df_cell.resample('1H')
+    df_cell_1H = df_cell.resample('1h')
     slice_end = len(df_cell_1H) + slice_start
 
     # set binned timestamps
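The same list-then-construct pattern appears in both hunks above, so a condensed, runnable sketch of the pandas 2.x migration may help reviewers; the file names and dates below are invented for illustration, not taken from the repository:

    import pandas as pd

    # pandas < 2.0 allowed growing a DataFrame one row at a time:
    #     df = df.append({'url': url, 'deployment_date': date}, ignore_index=True)
    # DataFrame.append() was deprecated in pandas 1.4 and removed in 2.0, so the
    # patch collects rows in a plain list and builds the DataFrame once.
    rows = []
    for url, date in [('file_b.nc', '2024-02-01'), ('file_a.nc', '2024-01-01')]:
        rows.append({'url': url, 'deployment_date': pd.Timestamp(date)})
    df = pd.DataFrame(rows, columns=['url', 'deployment_date'])
    print(df.sort_values(by='deployment_date')['url'].to_list())

    # The resample calls change for the same release: the 'base' argument was
    # removed in pandas 2.0, and the lowercase '1h' alias replaces the
    # deprecated '1H'.
    s = pd.Series(range(4), index=pd.date_range('2024-01-01 00:15', periods=4, freq='30min'))
    print(s.resample('1h', offset='30min').mean())  # bins centred on the hour

Building the DataFrame once at the end is also cheaper than repeated append() ever was, since each append() call copied the whole frame.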
From 80340a590d59527182fc07d6b135b089169cba27 Mon Sep 17 00:00:00 2001
From: Michael Hemming
Date: Mon, 7 Apr 2025 14:29:03 +1000
Subject: [PATCH 3/9] Update because jsonschema validators no longer accept a
 `types` parameter

---
 aodntools/ncwriter/schema.py | 30 +++++++++++-------------------
 1 file changed, 11 insertions(+), 19 deletions(-)

diff --git a/aodntools/ncwriter/schema.py b/aodntools/ncwriter/schema.py
index 4a2dd579..246c2dbc 100644
--- a/aodntools/ncwriter/schema.py
+++ b/aodntools/ncwriter/schema.py
@@ -1,21 +1,20 @@
 """This module holds schema definitions for validating the various :py:class:`dicts` that make up parts of a
 template, and also the helper functions necessary to validate an object against their respective schema.
 """
-
 import json
-
 import numpy as np
 from jsonschema import validators, Draft4Validator, FormatChecker, ValidationError
 from pkg_resources import resource_filename
 
+def is_array(checker, instance):
+    return isinstance(instance, (list, np.ndarray))
 
-# Create a new validator class (based on Draft4Validator) to allow templates to use
-# * Python types or numpy dtypes to specify variable data types; and
-# * numpy arrays to specify variable data.
-TemplateValidator = validators.create(meta_schema=Draft4Validator.META_SCHEMA,
-                                      validators=Draft4Validator.VALIDATORS)
-format_checker = FormatChecker()
+# Extend the default type checker by redefining "array"
+custom_type_checker = Draft4Validator.TYPE_CHECKER.redefine("array", is_array)
 
+# Create a custom validator that uses the new type checker.
+CustomValidator = validators.extend(Draft4Validator, type_checker=custom_type_checker)
 
+format_checker = FormatChecker()
 @format_checker.checks('datatype')
 def is_python_datatype(value):
@@ -24,32 +23,25 @@ def is_python_datatype(value):
         return True
     if isinstance(value, type):
         return issubclass(value, np.number)
     return False
-
-TYPES = {'array': (list, np.ndarray)}
-
 TEMPLATE_SCHEMA_JSON = resource_filename(__name__, 'template_schema.json')
 with open(TEMPLATE_SCHEMA_JSON) as f:
     TEMPLATE_SCHEMA = json.load(f)
-TemplateValidator.check_schema(TEMPLATE_SCHEMA)
-
-template_validator = TemplateValidator(TEMPLATE_SCHEMA, types=TYPES, format_checker=format_checker)
+CustomValidator.check_schema(TEMPLATE_SCHEMA)
+# Use the custom validator
+template_validator = CustomValidator(TEMPLATE_SCHEMA, format_checker=format_checker)
 
 def validate_template(t):
     template_validator.validate(t)
 
-
 def validate_dimensions(d):
     validate_template({'_dimensions': d})
 
-
 def validate_variables(v):
     validate_template({'_variables': v})
-
-
+
 def validate_global_attributes(a):
     if hasattr(a, 'keys'):
         special = [k for k in a.keys() if k.startswith('_')]
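For reviewers unfamiliar with the jsonschema change this patch works around, a self-contained sketch of the replacement API; the one-line schema below is made up for illustration (the real template schema lives in template_schema.json):

    import numpy as np
    from jsonschema import validators, Draft4Validator, ValidationError

    # Older jsonschema accepted Validator(schema, types={'array': (list, np.ndarray)});
    # the `types` parameter is gone, and custom types are now registered
    # through a TypeChecker.
    def is_array(checker, instance):
        return isinstance(instance, (list, np.ndarray))

    type_checker = Draft4Validator.TYPE_CHECKER.redefine('array', is_array)
    CustomValidator = validators.extend(Draft4Validator, type_checker=type_checker)

    validator = CustomValidator({'type': 'array'})
    validator.validate([1, 2, 3])      # plain lists still pass
    validator.validate(np.arange(3))   # numpy arrays now count as "array" too
    try:
        validator.validate('not an array')
    except ValidationError as e:
        print(e.message)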
From 5d83875f1bdb74d67866563defe8849235c86b59 Mon Sep 17 00:00:00 2001
From: Michael Hemming
Date: Mon, 7 Apr 2025 14:32:12 +1000
Subject: [PATCH 4/9] Update required for newer versions of NumPy

---
 aodntools/timeseries_products/common.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/aodntools/timeseries_products/common.py b/aodntools/timeseries_products/common.py
index 75c75f2d..a0d8f12f 100644
--- a/aodntools/timeseries_products/common.py
+++ b/aodntools/timeseries_products/common.py
@@ -2,6 +2,7 @@
 from datetime import datetime, timezone
 
 import numpy as np
+import xarray as xr
 
 # Common date/time format strings
 TIMESTAMP_FORMAT = '%Y-%m-%dT%H:%M:%SZ'
@@ -179,7 +180,7 @@ def in_water_index(nc):
     """
     time_deployment_start = np.datetime64(nc.attrs['time_deployment_start'][:-1])
     time_deployment_end = np.datetime64(nc.attrs['time_deployment_end'][:-1])
-    TIME = nc['TIME'][:]
+    TIME = nc['TIME'].values
     return (TIME >= time_deployment_start) & (TIME <= time_deployment_end)
 
 def in_water(nc):
@@ -189,7 +190,11 @@ def in_water(nc):
     :param nc: xarray dataset
     :return: xarray dataset
     """
-    return nc.where(in_water_index(nc), drop=True)
+
+    condition = in_water_index(nc)  # This returns a numpy array
+    # Wrap the condition in a DataArray so that it aligns with the TIME coordinate.
+    cond_da = xr.DataArray(condition, dims=["TIME"], coords={"TIME": nc["TIME"].values})
+    return nc.where(cond_da, drop=True)
 
 def current_utc_timestamp(format=TIMESTAMP_FORMAT):
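The wrapping step can be reproduced in isolation. In this toy dataset (variable names chosen to mirror the patch, values invented), the DataArray makes the mask's association with the TIME dimension explicit, so xarray can align it against the dataset rather than having to broadcast a bare ndarray:

    import numpy as np
    import xarray as xr

    times = np.array(['2024-01-01', '2024-01-02', '2024-01-03'], dtype='datetime64[ns]')
    ds = xr.Dataset({'TEMP': ('TIME', [20.1, 20.5, 21.0])}, coords={'TIME': times})

    mask = np.array([False, True, True])  # e.g. only in-water samples are True

    # Tie the mask to the TIME dimension before handing it to .where()
    cond = xr.DataArray(mask, dims=['TIME'], coords={'TIME': ds['TIME'].values})
    print(ds.where(cond, drop=True))  # keeps the two in-water timestamps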
From 987ed14653ec0428874bf0be2cdc9ba0c75fdb6e Mon Sep 17 00:00:00 2001
From: Michael Hemming
Date: Mon, 7 Apr 2025 15:04:09 +1000
Subject: [PATCH 5/9] Fix failing pytest caused by duplicate TIME values

---
 aodntools/timeseries_products/common.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/aodntools/timeseries_products/common.py b/aodntools/timeseries_products/common.py
index a0d8f12f..14c399b0 100644
--- a/aodntools/timeseries_products/common.py
+++ b/aodntools/timeseries_products/common.py
@@ -190,12 +190,11 @@ def in_water(nc):
     :param nc: xarray dataset
     :return: xarray dataset
     """
-
-    condition = in_water_index(nc)  # This returns a numpy array
-    # Wrap the condition in a DataArray so that it aligns with the TIME coordinate.
-    cond_da = xr.DataArray(condition, dims=["TIME"], coords={"TIME": nc["TIME"].values})
-    return nc.where(cond_da, drop=True)
-
+    condition = in_water_index(nc)  # NumPy boolean array
+    # Get the integer indices where condition is True.
+    indices = np.nonzero(condition)[0]
+    # Use positional indexing to select the TIME entries that satisfy the condition.
+    return nc.isel(TIME=indices)
 
 def current_utc_timestamp(format=TIMESTAMP_FORMAT):
     return datetime.now(timezone.utc).strftime(format)
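The failure mode is easiest to see with a contrived index: once TIME contains duplicates, label-based selection becomes ambiguous, while the positional route used above stays well defined. A minimal sketch (values invented):

    import numpy as np
    import xarray as xr

    # Duplicate timestamps, as can occur when deployments overlap.
    times = np.array(['2024-01-01', '2024-01-01', '2024-01-02'], dtype='datetime64[ns]')
    ds = xr.Dataset({'TEMP': ('TIME', [20.1, 20.2, 20.9])}, coords={'TIME': times})

    condition = np.array([True, False, True])

    # np.nonzero turns the boolean mask into integer positions, and isel()
    # selects by position, so the repeated labels never need to be resolved.
    indices = np.nonzero(condition)[0]  # array([0, 2])
    print(ds.isel(TIME=indices))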
From 876847e00c57619dbfcedd578cc57ec25234bb92 Mon Sep 17 00:00:00 2001
From: Michael Hemming
Date: Mon, 7 Apr 2025 15:34:05 +1000
Subject: [PATCH 6/9] `np.cast` was removed in NumPy 2.0

---
 examples/rottnest.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/rottnest.py b/examples/rottnest.py
index 41d6e0d6..e215dbb6 100644
--- a/examples/rottnest.py
+++ b/examples/rottnest.py
@@ -46,7 +46,8 @@
     var_type = var['_datatype']
     for attr in ('valid_min', 'valid_max'):
         if attr in var:
-            var[attr] = np.cast[var_type](var[attr])
+            var[attr] = np.array(var[attr], dtype=var_type)
+
 # update range attributes
 template.add_extent_attributes()

From b74bad4184ef925743bde7abb8fdbdcaa2c3fb6a Mon Sep 17 00:00:00 2001
From: Michael Hemming
Date: Mon, 7 Apr 2025 15:49:50 +1000
Subject: [PATCH 7/9] Update GitHub Actions to use Python 3.11

---
 .github/workflows/test.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 48dada65..1ba9a5f7 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -12,7 +12,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [ '3.8' ]
+        python-version: [ '3.11' ]
     steps:
     - uses: actions/checkout@v2
     - name: Set up Python ${{ matrix.python-version }}
@@ -22,7 +22,6 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        pip install "numpy<1.19.0"
         pip install -r test_requirements.txt
         pip install pytest-cov
     - name: Test with pytest

From 43ad27b58d95b9bb7b88bfe53ce3f0c45d3c6588 Mon Sep 17 00:00:00 2001
From: Michael Hemming
Date: Mon, 7 Apr 2025 15:52:44 +1000
Subject: [PATCH 8/9] Update setup.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index a6a67d54..5ff7f23f 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
     'numpy>=2.2.4',
     'netCDF4>=1.7.2',
     'pandas>=2.2.3',
-    'xarray>=2025.3.1'
+    'xarray>=2023.1.0'
 ]
 
 TESTS_REQUIRE = [
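For context on the np.cast change in PATCH 6: np.cast was a table mapping dtypes to conversion functions. A minimal before/after sketch (attribute values invented for illustration):

    import numpy as np

    var_type = np.float32
    valid_min, valid_max = 0, 40  # e.g. plain Python ints from a template

    # NumPy < 2.0:  np.cast[var_type](valid_min)  returned the value as float32.
    # np.cast was removed in NumPy 2.0; np.array(value, dtype=...) is a direct
    # replacement, yielding a zero-dimensional array of the requested dtype.
    valid_min = np.array(valid_min, dtype=var_type)
    valid_max = np.array(valid_max, dtype=var_type)
    print(valid_min.dtype, valid_max.dtype)  # float32 float32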
From edcfd593d396599a65dd73e0a47f2e23555cda79 Mon Sep 17 00:00:00 2001
From: Michael Hemming
Date: Tue, 8 Apr 2025 09:44:09 +1000
Subject: [PATCH 9/9] Additional comments

---
 aodntools/ncwriter/schema.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/aodntools/ncwriter/schema.py b/aodntools/ncwriter/schema.py
index 246c2dbc..f4ac6b9f 100644
--- a/aodntools/ncwriter/schema.py
+++ b/aodntools/ncwriter/schema.py
@@ -6,16 +6,21 @@
 from jsonschema import validators, Draft4Validator, FormatChecker, ValidationError
 from pkg_resources import resource_filename
 
+# Helper function used to tell the schema validator how to validate objects of type "array"
 def is_array(checker, instance):
     return isinstance(instance, (list, np.ndarray))
 
 # Extend the default type checker by redefining "array"
+# Whenever a schema expects a value of type "array", it will now use the is_array function to decide whether the value is acceptable.
 custom_type_checker = Draft4Validator.TYPE_CHECKER.redefine("array", is_array)
 
 # Create a custom validator that uses the new type checker.
+# Any validation performed with CustomValidator will use the custom array checker.
 CustomValidator = validators.extend(Draft4Validator, type_checker=custom_type_checker)
 
 format_checker = FormatChecker()
+# Define a custom format checker,
+# called when a JSON schema specifies that a value should have the format "datatype".
 @format_checker.checks('datatype')
 def is_python_datatype(value):
     """Return whether the given value is a valid data type specification for a NetCDF variable"""
@@ -25,14 +30,19 @@ def is_python_datatype(value):
         return issubclass(value, np.number)
     return False
 
+# Load the JSON schema file
 TEMPLATE_SCHEMA_JSON = resource_filename(__name__, 'template_schema.json')
 with open(TEMPLATE_SCHEMA_JSON) as f:
     TEMPLATE_SCHEMA = json.load(f)
+
+# Check that the template schema itself is valid according to Draft 4 rules
 CustomValidator.check_schema(TEMPLATE_SCHEMA)
-# Use the custom validator
+# Ready-to-use validator that applies both the custom type and format checks
 template_validator = CustomValidator(TEMPLATE_SCHEMA, format_checker=format_checker)
 
+
+# Validation checks
 def validate_template(t):
     template_validator.validate(t)
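A usage sketch for the freshly commented module; the template dict passed to validate() below is an illustrative guess at a malformed input, not taken from the repository's tests:

    import numpy as np
    from jsonschema import ValidationError
    from aodntools.ncwriter.schema import format_checker, template_validator

    # The custom 'datatype' format accepts numpy number types, matching
    # is_python_datatype() above.
    print(format_checker.conforms(np.float32, 'datatype'))  # True

    # template_validator bundles the schema, the redefined "array" type and
    # the format checker; validate() raises ValidationError on bad input.
    try:
        template_validator.validate({'_dimensions': 'not-a-dict'})
    except ValidationError as e:
        print('template rejected:', e.message)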