From 06d829766f05571c81b80d17382279c4c11be8d3 Mon Sep 17 00:00:00 2001
From: Michael Hemming
Date: Mon, 7 Apr 2025 14:24:21 +1000
Subject: [PATCH 1/9] Update to Python 3.11 and remove constraints

---
 constraints.txt |  4 ----
 setup.py        | 15 +++++++--------
 2 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/constraints.txt b/constraints.txt
index c14b9998..e69de29b 100644
--- a/constraints.txt
+++ b/constraints.txt
@@ -1,4 +0,0 @@
-cftime<1.1.1;python_version=='3.5'
-netCDF4<1.5.4;python_version=='3.5'
-pandas<0.25.0;python_version=='3.5'
-xarray<0.14.0;python_version=='3.5'
diff --git a/setup.py b/setup.py
index cbe40b92..a6a67d54 100644
--- a/setup.py
+++ b/setup.py
@@ -1,11 +1,11 @@
 from setuptools import setup, find_packages
 
 INSTALL_REQUIRES = [
-    'jsonschema>=2.6.0,<3.0.0',
-    'numpy>=1.13.0',
-    'netCDF4>=1.5.3',
-    'pandas>=0.24.2',
-    'xarray>=0.11.3'
+    'jsonschema>=4.23.0',
+    'numpy>=2.2.4',
+    'netCDF4>=1.7.2',
+    'pandas>=2.2.3',
+    'xarray>=2025.3.1'
 ]
 
 TESTS_REQUIRE = [
@@ -37,7 +37,7 @@
     author_email='projectofficers@emii.org.au',
     description='AODN data tools library',
     zip_safe=False,
-    python_requires='>=3.5',
+    python_requires='>=3.11, <3.12',
     install_requires=INSTALL_REQUIRES,
     tests_require=TESTS_REQUIRE,
     extras_require=EXTRAS_REQUIRE,
@@ -49,8 +49,7 @@
         'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
         'Programming Language :: Python',
         'Programming Language :: Python :: 3',
-        'Programming Language :: Python :: 3.5',
-        'Programming Language :: Python :: 3.6',
+        'Programming Language :: Python :: 3.11',
         'Programming Language :: Python :: Implementation :: CPython',
     ]
 )

From 4dc990bc5876cf736597a0bb5cfa487b25a70a13 Mon Sep 17 00:00:00 2001
From: Michael Hemming
Date: Mon, 7 Apr 2025 14:27:07 +1000
Subject: [PATCH 2/9] Update for pandas 2.0, which removed the
 DataFrame.append() method

---
 .../timeseries_products/hourly_timeseries.py  | 43 ++++++++++---------
 .../velocity_hourly_timeseries.py             |  2 +-
 2 files changed, 24 insertions(+), 21 deletions(-)

diff --git a/aodntools/timeseries_products/hourly_timeseries.py b/aodntools/timeseries_products/hourly_timeseries.py
index bc079534..7f23db26 100644
--- a/aodntools/timeseries_products/hourly_timeseries.py
+++ b/aodntools/timeseries_products/hourly_timeseries.py
@@ -30,27 +30,27 @@ def check_files(file_list, site_code, parameter_names_accepted, input_dir=''):
     :param input_dir: base path where source files are stored
     :return: dictionary with the file name and list of failed tests, list good files chronologically ordered
     """
-
-    file_list_dataframe = pd.DataFrame(columns=["url", "deployment_date"])
+    rows = []
     error_dict = {}
     for file in file_list:
         with xr.open_dataset(os.path.join(input_dir, file)) as nc:
             error_list = check_file(nc, site_code, parameter_names_accepted)
         if error_list:
-            error_dict.update({file: error_list})
+            error_dict[file] = error_list
         else:
-            file_list_dataframe = file_list_dataframe.append({'url': file,
-                                                              'deployment_date': parse(nc.time_deployment_start)},
-                                                             ignore_index=True)
+            rows.append({
+                'url': file,
+                'deployment_date': parse(nc.time_deployment_start)
+            })
+    file_list_dataframe = pd.DataFrame(rows, columns=["url", "deployment_date"])
     file_list_dataframe = file_list_dataframe.sort_values(by='deployment_date')
-    file_list = file_list_dataframe['url'].to_list()
-    if file_list == []:
+    sorted_files = file_list_dataframe['url'].to_list()
+    if not sorted_files:
         raise NoInputFilesError("no valid input files to aggregate")
-    return file_list, error_dict
-
+    return sorted_files, error_dict
 
 def get_parameter_names(nc):
@@ -308,7 +308,7 @@ def PDresample_by_hour(df, function_dict, function_stats):
     df_data = pd.DataFrame(index=pd.DatetimeIndex([]))
     for variable in varnames:
         ds_var = df[variable]
-        ds_var_resample = ds_var.resample('1H', base=0.5)  # shift by half hour to centre bin on the hour
+        ds_var_resample = ds_var.resample('1h', offset='30min')  # shift by half hour to centre bin on the hour
         ds_var_mean = ds_var_resample.apply(function_dict[variable]).astype(np.float32)
         df_data = pd.concat([df_data, ds_var_mean], axis=1, sort=False)
     for stat_method in function_stats:
@@ -366,8 +366,6 @@ def hourly_aggregator(files_to_aggregate, site_code, qcflags, input_dir='', outp
         variable_attribute_dictionary = json.load(json_file)['_variables']
 
     df_data = pd.DataFrame()
-
-    ## create empty DF with dtypes
     metadata_df_types = [('source_file', str),
                          ('instrument_id', str),
@@ -380,6 +378,7 @@ def hourly_aggregator(files_to_aggregate, site_code, qcflags, input_dir='', outp
     parameter_names_all = []
     applied_offset = []
     qc_count_all = {}
+    metadata_rows = []
 
     for file_index, file in enumerate(files_to_aggregate):
         print(file_index)
@@ -398,13 +397,16 @@
             qc_count = get_QCcount(nc_clean, qcflags)
             qc_count_all = update_QCcount(qc_count_all, qc_count)
             nc_clean = good_data_only(nc_clean, qcflags)  # good quality data only
-            df_metadata = df_metadata.append({'source_file': file,
-                                              'instrument_id': utils.get_instrument_id(nc),
-                                              'LONGITUDE': nc.LONGITUDE.squeeze().values,
-                                              'LATITUDE': nc.LATITUDE.squeeze().values,
-                                              'NOMINAL_DEPTH': get_nominal_depth(nc)},
-                                             ignore_index=True)
-
+
+            # Append a new row as a dictionary to the list.
+            metadata_rows.append({
+                'source_file': file,
+                'instrument_id': utils.get_instrument_id(nc),
+                'LONGITUDE': nc.LONGITUDE.squeeze().values,
+                'LATITUDE': nc.LATITUDE.squeeze().values,
+                'NOMINAL_DEPTH': get_nominal_depth(nc)
+            })
+
             # If TIME had out-of-range values before cleaning, nc_clean would now have a CFTimeIndex, which
             # breaks the resampling further down. Here we reset it to a DatetimeIndex as suggested here:
             # https://stackoverflow.com/questions/55786995/converting-cftime-datetimejulian-to-datetime/55787899#55787899
@@ -421,6 +423,7 @@
             df_temp['instrument_index'] = np.repeat(file_index, len(df_temp)).astype(np.int32)
             df_data = pd.concat([df_data, df_temp.reset_index()], ignore_index=True, sort=False)
 
+    df_metadata = pd.DataFrame(metadata_rows, columns=['source_file', 'instrument_id', 'LONGITUDE', 'LATITUDE', 'NOMINAL_DEPTH'])
     df_metadata.index.rename('INSTRUMENT', inplace=True)
     df_data.index.rename('OBSERVATION', inplace=True)  ## rename index to TIME
diff --git a/aodntools/timeseries_products/velocity_hourly_timeseries.py b/aodntools/timeseries_products/velocity_hourly_timeseries.py
index fd12b497..9a7e4911 100644
--- a/aodntools/timeseries_products/velocity_hourly_timeseries.py
+++ b/aodntools/timeseries_products/velocity_hourly_timeseries.py
@@ -58,7 +58,7 @@ def append_resampled_values(nc_cell, ds, slice_start, binning_functions):
 
     # shift the index forward 30min to centre the bins on the hour
     df_cell.index = df_cell.index + pd.Timedelta(minutes=30)
-    df_cell_1H = df_cell.resample('1H')
+    df_cell_1H = df_cell.resample('1h')
     slice_end = len(df_cell_1H) + slice_start
 
     # set binned timestamps
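The same list-then-construct pattern appears in both hunks above, so a condensed, runnable sketch of the pandas 2.x migration may help reviewers; the file names and dates below are invented for illustration, not taken from the repository:

    import pandas as pd

    # pandas < 2.0 allowed growing a DataFrame one row at a time:
    #     df = df.append({'url': url, 'deployment_date': date}, ignore_index=True)
    # DataFrame.append() was deprecated in pandas 1.4 and removed in 2.0, so the
    # patch collects rows in a plain list and builds the DataFrame once.
    rows = []
    for url, date in [('file_b.nc', '2024-02-01'), ('file_a.nc', '2024-01-01')]:
        rows.append({'url': url, 'deployment_date': pd.Timestamp(date)})
    df = pd.DataFrame(rows, columns=['url', 'deployment_date'])
    print(df.sort_values(by='deployment_date')['url'].to_list())

    # The resample calls change for the same release: the 'base' argument was
    # removed in pandas 2.0, and the lowercase '1h' alias replaces the
    # deprecated '1H'.
    s = pd.Series(range(4), index=pd.date_range('2024-01-01 00:15', periods=4, freq='30min'))
    print(s.resample('1h', offset='30min').mean())  # bins centred on the hour

Building the DataFrame once at the end is also cheaper than repeated append() ever was, since each append() call copied the whole frame.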
From 80340a590d59527182fc07d6b135b089169cba27 Mon Sep 17 00:00:00 2001
From: Michael Hemming
Date: Mon, 7 Apr 2025 14:29:03 +1000
Subject: [PATCH 3/9] Update because jsonschema validators no longer accept a
 `types` parameter

---
 aodntools/ncwriter/schema.py | 30 +++++++++++-------------------
 1 file changed, 11 insertions(+), 19 deletions(-)

diff --git a/aodntools/ncwriter/schema.py b/aodntools/ncwriter/schema.py
index 4a2dd579..246c2dbc 100644
--- a/aodntools/ncwriter/schema.py
+++ b/aodntools/ncwriter/schema.py
@@ -1,21 +1,20 @@
 """This module holds schema definitions for validating the various :py:class:`dicts` that make up parts of a
 template, and also the helper functions necessary to validate an object against their respective schema.
 """
-
 import json
-
 import numpy as np
 from jsonschema import validators, Draft4Validator, FormatChecker, ValidationError
 from pkg_resources import resource_filename
 
+def is_array(checker, instance):
+    return isinstance(instance, (list, np.ndarray))
 
-# Create a new validator class (based on Draft4Validator) to allow templates to use
-# * Python types or numpy dtypes to specify variable data types; and
-# * numpy arrays to specify variable data.
-TemplateValidator = validators.create(meta_schema=Draft4Validator.META_SCHEMA,
-                                      validators=Draft4Validator.VALIDATORS)
-format_checker = FormatChecker()
+# Extend the default type checker by redefining "array"
+custom_type_checker = Draft4Validator.TYPE_CHECKER.redefine("array", is_array)
 
+# Create a custom validator that uses the new type checker.
+CustomValidator = validators.extend(Draft4Validator, type_checker=custom_type_checker)
 
+format_checker = FormatChecker()
 @format_checker.checks('datatype')
 def is_python_datatype(value):
@@ -24,32 +23,25 @@ def is_python_datatype(value):
         return True
     if isinstance(value, type):
         return issubclass(value, np.number)
     return False
-
-TYPES = {'array': (list, np.ndarray)}
-
 TEMPLATE_SCHEMA_JSON = resource_filename(__name__, 'template_schema.json')
 with open(TEMPLATE_SCHEMA_JSON) as f:
     TEMPLATE_SCHEMA = json.load(f)
-TemplateValidator.check_schema(TEMPLATE_SCHEMA)
-
-template_validator = TemplateValidator(TEMPLATE_SCHEMA, types=TYPES, format_checker=format_checker)
+CustomValidator.check_schema(TEMPLATE_SCHEMA)
+# Use the custom validator
+template_validator = CustomValidator(TEMPLATE_SCHEMA, format_checker=format_checker)
 
 def validate_template(t):
     template_validator.validate(t)
 
-
 def validate_dimensions(d):
     validate_template({'_dimensions': d})
 
-
 def validate_variables(v):
     validate_template({'_variables': v})
-
-
+
 def validate_global_attributes(a):
     if hasattr(a, 'keys'):
         special = [k for k in a.keys() if k.startswith('_')]
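For reviewers unfamiliar with the jsonschema change this patch works around, a self-contained sketch of the replacement API; the one-line schema below is made up for illustration (the real template schema lives in template_schema.json):

    import numpy as np
    from jsonschema import validators, Draft4Validator, ValidationError

    # Older jsonschema accepted Validator(schema, types={'array': (list, np.ndarray)});
    # the `types` parameter is gone, and custom types are now registered
    # through a TypeChecker.
    def is_array(checker, instance):
        return isinstance(instance, (list, np.ndarray))

    type_checker = Draft4Validator.TYPE_CHECKER.redefine('array', is_array)
    CustomValidator = validators.extend(Draft4Validator, type_checker=type_checker)

    validator = CustomValidator({'type': 'array'})
    validator.validate([1, 2, 3])      # plain lists still pass
    validator.validate(np.arange(3))   # numpy arrays now count as "array" too
    try:
        validator.validate('not an array')
    except ValidationError as e:
        print(e.message)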
From 5d83875f1bdb74d67866563defe8849235c86b59 Mon Sep 17 00:00:00 2001
From: Michael Hemming
Date: Mon, 7 Apr 2025 14:32:12 +1000
Subject: [PATCH 4/9] Update required for newer versions of NumPy

---
 aodntools/timeseries_products/common.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/aodntools/timeseries_products/common.py b/aodntools/timeseries_products/common.py
index 75c75f2d..a0d8f12f 100644
--- a/aodntools/timeseries_products/common.py
+++ b/aodntools/timeseries_products/common.py
@@ -2,6 +2,7 @@
 from datetime import datetime, timezone
 
 import numpy as np
+import xarray as xr
 
 # Common date/time format strings
 TIMESTAMP_FORMAT = '%Y-%m-%dT%H:%M:%SZ'
@@ -179,7 +180,7 @@ def in_water_index(nc):
     """
     time_deployment_start = np.datetime64(nc.attrs['time_deployment_start'][:-1])
     time_deployment_end = np.datetime64(nc.attrs['time_deployment_end'][:-1])
-    TIME = nc['TIME'][:]
+    TIME = nc['TIME'].values
     return (TIME >= time_deployment_start) & (TIME <= time_deployment_end)
 
 def in_water(nc):
@@ -189,7 +190,11 @@ def in_water(nc):
     :param nc: xarray dataset
     :return: xarray dataset
     """
-    return nc.where(in_water_index(nc), drop=True)
+
+    condition = in_water_index(nc)  # This returns a numpy array
+    # Wrap the condition in a DataArray so that it aligns with the TIME coordinate.
+    cond_da = xr.DataArray(condition, dims=["TIME"], coords={"TIME": nc["TIME"].values})
+    return nc.where(cond_da, drop=True)
 
 def current_utc_timestamp(format=TIMESTAMP_FORMAT):
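The wrapping step can be reproduced in isolation. In this toy dataset (variable names chosen to mirror the patch, values invented), the DataArray makes the mask's association with the TIME dimension explicit, so xarray can align it against the dataset rather than having to broadcast a bare ndarray:

    import numpy as np
    import xarray as xr

    times = np.array(['2024-01-01', '2024-01-02', '2024-01-03'], dtype='datetime64[ns]')
    ds = xr.Dataset({'TEMP': ('TIME', [20.1, 20.5, 21.0])}, coords={'TIME': times})

    mask = np.array([False, True, True])  # e.g. only in-water samples are True

    # Tie the mask to the TIME dimension before handing it to .where()
    cond = xr.DataArray(mask, dims=['TIME'], coords={'TIME': ds['TIME'].values})
    print(ds.where(cond, drop=True))  # keeps the two in-water timestamps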
From 987ed14653ec0428874bf0be2cdc9ba0c75fdb6e Mon Sep 17 00:00:00 2001
From: Michael Hemming
Date: Mon, 7 Apr 2025 15:04:09 +1000
Subject: [PATCH 5/9] Fix failing pytest caused by duplicate TIME values

---
 aodntools/timeseries_products/common.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/aodntools/timeseries_products/common.py b/aodntools/timeseries_products/common.py
index a0d8f12f..14c399b0 100644
--- a/aodntools/timeseries_products/common.py
+++ b/aodntools/timeseries_products/common.py
@@ -190,12 +190,11 @@ def in_water(nc):
     :param nc: xarray dataset
     :return: xarray dataset
     """
-
-    condition = in_water_index(nc)  # This returns a numpy array
-    # Wrap the condition in a DataArray so that it aligns with the TIME coordinate.
-    cond_da = xr.DataArray(condition, dims=["TIME"], coords={"TIME": nc["TIME"].values})
-    return nc.where(cond_da, drop=True)
-
+    condition = in_water_index(nc)  # NumPy boolean array
+    # Get the integer indices where condition is True.
+    indices = np.nonzero(condition)[0]
+    # Use positional indexing to select the TIME entries that satisfy the condition.
+    return nc.isel(TIME=indices)
 
 def current_utc_timestamp(format=TIMESTAMP_FORMAT):
     return datetime.now(timezone.utc).strftime(format)
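The failure mode is easiest to see with a contrived index: once TIME contains duplicates, label-based selection becomes ambiguous, while the positional route used above stays well defined. A minimal sketch (values invented):

    import numpy as np
    import xarray as xr

    # Duplicate timestamps, as can occur when deployments overlap.
    times = np.array(['2024-01-01', '2024-01-01', '2024-01-02'], dtype='datetime64[ns]')
    ds = xr.Dataset({'TEMP': ('TIME', [20.1, 20.2, 20.9])}, coords={'TIME': times})

    condition = np.array([True, False, True])

    # np.nonzero turns the boolean mask into integer positions, and isel()
    # selects by position, so the repeated labels never need to be resolved.
    indices = np.nonzero(condition)[0]  # array([0, 2])
    print(ds.isel(TIME=indices))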
From 876847e00c57619dbfcedd578cc57ec25234bb92 Mon Sep 17 00:00:00 2001
From: Michael Hemming
Date: Mon, 7 Apr 2025 15:34:05 +1000
Subject: [PATCH 6/9] `np.cast` was removed in NumPy 2.0

---
 examples/rottnest.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/rottnest.py b/examples/rottnest.py
index 41d6e0d6..e215dbb6 100644
--- a/examples/rottnest.py
+++ b/examples/rottnest.py
@@ -46,7 +46,8 @@
     var_type = var['_datatype']
     for attr in ('valid_min', 'valid_max'):
         if attr in var:
-            var[attr] = np.cast[var_type](var[attr])
+            var[attr] = np.array(var[attr], dtype=var_type)
+
 # update range attributes
 template.add_extent_attributes()

From b74bad4184ef925743bde7abb8fdbdcaa2c3fb6a Mon Sep 17 00:00:00 2001
From: Michael Hemming
Date: Mon, 7 Apr 2025 15:49:50 +1000
Subject: [PATCH 7/9] Update GitHub Actions to use Python 3.11

---
 .github/workflows/test.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 48dada65..1ba9a5f7 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -12,7 +12,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [ '3.8' ]
+        python-version: [ '3.11' ]
     steps:
     - uses: actions/checkout@v2
     - name: Set up Python ${{ matrix.python-version }}
@@ -22,7 +22,6 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        pip install "numpy<1.19.0"
         pip install -r test_requirements.txt
         pip install pytest-cov
     - name: Test with pytest

From 43ad27b58d95b9bb7b88bfe53ce3f0c45d3c6588 Mon Sep 17 00:00:00 2001
From: Michael Hemming
Date: Mon, 7 Apr 2025 15:52:44 +1000
Subject: [PATCH 8/9] Update setup.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index a6a67d54..5ff7f23f 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
     'numpy>=2.2.4',
     'netCDF4>=1.7.2',
     'pandas>=2.2.3',
-    'xarray>=2025.3.1'
+    'xarray>=2023.1.0'
 ]
 
 TESTS_REQUIRE = [
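For context on the np.cast change in PATCH 6: np.cast was a table mapping dtypes to conversion functions. A minimal before/after sketch (attribute values invented for illustration):

    import numpy as np

    var_type = np.float32
    valid_min, valid_max = 0, 40  # e.g. plain Python ints from a template

    # NumPy < 2.0:  np.cast[var_type](valid_min)  returned the value as float32.
    # np.cast was removed in NumPy 2.0; np.array(value, dtype=...) is a direct
    # replacement, yielding a zero-dimensional array of the requested dtype.
    valid_min = np.array(valid_min, dtype=var_type)
    valid_max = np.array(valid_max, dtype=var_type)
    print(valid_min.dtype, valid_max.dtype)  # float32 float32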
From edcfd593d396599a65dd73e0a47f2e23555cda79 Mon Sep 17 00:00:00 2001
From: Michael Hemming
Date: Tue, 8 Apr 2025 09:44:09 +1000
Subject: [PATCH 9/9] Additional comments

---
 aodntools/ncwriter/schema.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/aodntools/ncwriter/schema.py b/aodntools/ncwriter/schema.py
index 246c2dbc..f4ac6b9f 100644
--- a/aodntools/ncwriter/schema.py
+++ b/aodntools/ncwriter/schema.py
@@ -6,16 +6,21 @@
 from jsonschema import validators, Draft4Validator, FormatChecker, ValidationError
 from pkg_resources import resource_filename
 
+# Helper function used to tell the schema validator how to validate objects of type "array"
 def is_array(checker, instance):
     return isinstance(instance, (list, np.ndarray))
 
 # Extend the default type checker by redefining "array"
+# Whenever a schema expects a value of type "array", it will now use the is_array function to decide whether the value is acceptable.
 custom_type_checker = Draft4Validator.TYPE_CHECKER.redefine("array", is_array)
 
 # Create a custom validator that uses the new type checker.
+# Any validation performed with CustomValidator will use the custom array checker.
 CustomValidator = validators.extend(Draft4Validator, type_checker=custom_type_checker)
 
 format_checker = FormatChecker()
+# Define a custom format checker,
+# called when a JSON schema specifies that a value should have the format "datatype".
 @format_checker.checks('datatype')
 def is_python_datatype(value):
     """Return whether the given value is a valid data type specification for a NetCDF variable"""
@@ -25,14 +30,19 @@ def is_python_datatype(value):
         return issubclass(value, np.number)
     return False
 
+# Load the JSON schema file
 TEMPLATE_SCHEMA_JSON = resource_filename(__name__, 'template_schema.json')
 with open(TEMPLATE_SCHEMA_JSON) as f:
     TEMPLATE_SCHEMA = json.load(f)
+
+# Check that the template schema itself is valid according to Draft 4 rules
 CustomValidator.check_schema(TEMPLATE_SCHEMA)
-# Use the custom validator
+# Ready-to-use validator that applies both the custom type and format checks
 template_validator = CustomValidator(TEMPLATE_SCHEMA, format_checker=format_checker)
 
+
+# Validation checks
 def validate_template(t):
     template_validator.validate(t)
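A usage sketch for the freshly commented module; the template dict passed to validate() below is an illustrative guess at a malformed input, not taken from the repository's tests:

    import numpy as np
    from jsonschema import ValidationError
    from aodntools.ncwriter.schema import format_checker, template_validator

    # The custom 'datatype' format accepts numpy number types, matching
    # is_python_datatype() above.
    print(format_checker.conforms(np.float32, 'datatype'))  # True

    # template_validator bundles the schema, the redefined "array" type and
    # the format checker; validate() raises ValidationError on bad input.
    try:
        template_validator.validate({'_dimensions': 'not-a-dict'})
    except ValidationError as e:
        print('template rejected:', e.message)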