From 59bb3f543ce5319970600e3671668755b00d58be Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 11 Mar 2026 15:37:06 +0000 Subject: [PATCH 1/3] Define the supported encodings, and test them all. --- lib/iris/fileformats/netcdf/__init__.py | 2 + .../netcdf/_bytecoding_datasets.py | 150 +++++++++++++----- lib/iris/fileformats/netcdf/saver.py | 67 ++++---- .../integration/netcdf/test_stringdata.py | 21 ++- .../netcdf/test_bytecoding_datasets.py | 34 ++-- 5 files changed, 180 insertions(+), 94 deletions(-) diff --git a/lib/iris/fileformats/netcdf/__init__.py b/lib/iris/fileformats/netcdf/__init__.py index d8420c4f38..60b4b5895a 100644 --- a/lib/iris/fileformats/netcdf/__init__.py +++ b/lib/iris/fileformats/netcdf/__init__.py @@ -29,6 +29,7 @@ DECODE_TO_STRINGS_ON_READ, DEFAULT_READ_ENCODING, DEFAULT_WRITE_ENCODING, + SUPPORTED_ENCODINGS, ) from .loader import DEBUG, NetCDFDataProxy, load_cubes from .saver import ( @@ -53,6 +54,7 @@ "MESH_ELEMENTS", "NetCDFDataProxy", "SPATIO_TEMPORAL_AXES", + "SUPPORTED_ENCODINGS", "Saver", "UnknownCellMethodWarning", "load_cubes", diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py index 23ea17750d..4187ca2ffb 100644 --- a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py +++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py @@ -44,6 +44,7 @@ import contextlib import dataclasses import threading +from typing import Callable import warnings import numpy as np @@ -117,8 +118,8 @@ class VariableEncoder: varname: str # just for the error messages dtype: np.dtype is_chardata: bool # just a shortcut for the dtype test - read_encoding: str # IF 'is_chardata': a valid encoding from the codecs package - write_encoding: str # IF 'is_chardata': a valid encoding from the codecs package + read_encoding: str # IF 'is_chardata': one of the supported encodings + write_encoding: str # IF 'is_chardata': one of the supported encodings n_chars_dim: int # IF 'is_chardata': 
length of associated character dimension string_width: int # IF 'is_chardata': width when viewed as strings (i.e. "Uxx") @@ -138,59 +139,30 @@ def __init__(self, cf_var): self.dtype = cf_var.dtype self.is_chardata = np.issubdtype(self.dtype, np.bytes_) if self.is_chardata: - self.read_encoding = self._get_encoding(cf_var, writing=False) - self.write_encoding = self._get_encoding(cf_var, writing=True) + encoding_attr = getattr(cf_var, "_Encoding", None) + self.read_encoding = _identify_encoding( + encoding_attr, var_name=cf_var.name, writing=False + ) + self.write_encoding = _identify_encoding( + encoding_attr, var_name=cf_var.name, writing=True + ) n_chars_dim = 1 # default to 1 for a scalar var if len(cf_var.dimensions) >= 1: dim_name = cf_var.dimensions[-1] if dim_name in cf_var.group().dimensions: n_chars_dim = cf_var.group().dimensions[dim_name].size self.n_chars_dim = n_chars_dim - self.string_width = self._get_string_width(cf_var) + self.string_width = self._get_string_width() - @staticmethod - def _get_encoding(cf_var, writing=False) -> str: - """Get the byte encoding defined for this variable (or None).""" - result = getattr(cf_var, "_Encoding", None) - if result is not None: - try: - # Accept + normalise naming of encodings - result = codecs.lookup(result).name - # NOTE: if encoding does not suit data, errors can occur. - # For example, _Encoding = "ascii", with non-ascii content. - except LookupError: - # Unrecognised encoding name : handle this as just a warning - msg = ( - f"Ignoring unknown encoding for variable {cf_var.name!r}: " - f"_Encoding = {result!r}." 
- ) - warntype = IrisCfSaveWarning if writing else IrisCfLoadWarning - warnings.warn(msg, category=warntype) - # Proceed as if there is no specified encoding - result = None - - if result is None: - if writing: - result = DEFAULT_WRITE_ENCODING - else: - result = DEFAULT_READ_ENCODING - return result - - def _get_string_width(self, cf_var) -> int: + def _get_string_width(self) -> int: """Return the string-length defined for this variable.""" # Work out the actual byte width from the parent dataset dimensions. - strlen = self.n_chars_dim + n_bytes = self.n_chars_dim # Convert the string dimension length (i.e. bytes) to a sufficiently-long # string width, depending on the (read) encoding used. encoding = self.read_encoding - if "utf-16" in encoding: - # Each char needs at least 2 bytes -- including a terminator char - strlen = (strlen // 2) - 1 - elif "utf-32" in encoding: - # Each char needs exactly 4 bytes -- including a terminator char - strlen = (strlen // 4) - 1 - # "ELSE": assume there can be (at most) as many chars as bytes - return strlen + n_chars = _ENCODING_WIDTH_TRANSLATIONS[encoding].nbytes_2_nchars(n_bytes) + return n_chars def decode_bytes_to_stringarray(self, data: np.ndarray) -> np.ndarray: if self.is_chardata: @@ -252,6 +224,98 @@ def context(self, perform_decoding: bool): DEFAULT_WRITE_ENCODING = "ascii" +@dataclasses.dataclass +class EncodingWidthRelations: + """Encode the default string-width <-> byte-dimension relations. + + These translations are just a "best guess"... + + When translating bytes (dtype S1) to strings (dtype Uxx), the chosen (default) + string width may be longer than is needed for the actual content. But it is at + least "safe". + + When translating strings to bytes, we *can* get more bytes than the default + byte dimension length, and the code will then truncate + ( with a warning : see '_identify_encoding' ). + This can be avoided if necessary, in specific cases, by recasting the data to a + dtype with greater width (Uxx). 
+ """ + + nchars_2_nbytes: Callable[[int], int] + nbytes_2_nchars: Callable[[int], int] + + +_ENCODING_WIDTH_TRANSLATIONS = { + "ascii": EncodingWidthRelations(lambda x: x, lambda x: x), + "utf-8": EncodingWidthRelations(lambda x: x, lambda x: x), + "utf-16": EncodingWidthRelations( + nchars_2_nbytes=lambda x: x + 2, + nbytes_2_nchars=lambda x: x - 2, + ), + "utf-32": EncodingWidthRelations( + nchars_2_nbytes=lambda x: (x + 1) * 4, + nbytes_2_nchars=lambda x: x // 4 - 1, + ), +} +SUPPORTED_ENCODINGS = list(_ENCODING_WIDTH_TRANSLATIONS.keys()) + + +def _identify_encoding(encoding, var_name: str, writing: bool = False) -> str: + """Normalise an encoding name + check it is supported. + + Parameters + ---------- + encoding : Any + Select an encoding : None, or a string, or anything printable (via str()). + var_name : str + Name of the relevant dataste variable (i.e. 'var_name') : + used only to produce warning messages. + writing : bool + Specify whether reading or writing, which affects any *default* return value, + i.e. select between DEFAULT_READ_ENCODING / DEFAULT_WRITE_ENCODING. + + If given, and supported, return a normalised encoding name, + -- i.e. always one of SUPPORTED_ENCODINGS. + If not given, or not supported, return the default encoding name. + + If given **but not recognised/supported**, also emit a warning (and return default). + """ + if encoding is not None: + encoding = str(encoding) + + result: str | None = None # not yet 'found' : we will never *return* this + + if encoding is not None: + # Normalise the name : NB must be recognised by Python "codecs". + try: + result = codecs.lookup(encoding).name + except LookupError: + pass + + if result is not None: + if result not in SUPPORTED_ENCODINGS: + # Python "codecs" recognised it, but we don't support it.
+ result = None + + if encoding is not None and result is None: + # Unrecognised encoding name : handle this as just a warning + msg = ( + f"Ignoring unsupported encoding for netCDF variable {var_name!r}: " + f"_Encoding = {encoding!r}, is not recognised as one of the supported " + f"encodings, {SUPPORTED_ENCODINGS}." + ) + warntype = IrisCfSaveWarning if writing else IrisCfLoadWarning + warnings.warn(msg, category=warntype) + + if result is None: + if writing: + result = DEFAULT_WRITE_ENCODING + else: + result = DEFAULT_READ_ENCODING + + return result + + class EncodedVariable(VariableWrapper): """A variable wrapper that translates variable data according to byte encodings.""" diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py index 4bf3f5b3bd..b5b25c22bb 100644 --- a/lib/iris/fileformats/netcdf/saver.py +++ b/lib/iris/fileformats/netcdf/saver.py @@ -1833,17 +1833,46 @@ def _create_generic_cf_array_var( if not is_dataless and np.issubdtype(data.dtype, np.str_): # Deal with string-type variables. # Typically CF label variables, but also possibly ancil-vars ? + + # NOTE: all we are doing here is to calculate the byte dimension length, + # based on the dtype and any encoding attribute. + # The actual char --> byte data *translation* is done by the variable, + # being a _bytecoding_datasets.EncodedVariable. string_dimension_depth = data.dtype.itemsize + if data.dtype.kind == "U": - encoding = element.attributes.get("_Encoding", "ascii") - # TODO: this can fail -- use a sensible warning + default? - encoding = codecs.lookup(encoding).name - if encoding == "utf-32": - # UTF-32 is a special case -- always 4 exactly bytes per char, plus 4 - string_dimension_depth += 4 - else: - # generally, 4 bytes per char in numpy --> make bytewidth = string-width - string_dimension_depth //= 4 + # String content (U) instead of bytes (S). 
+ # For numpy strings, itemsize is **always** a multiple of 4 + if string_dimension_depth % 4 != 0: + msg = ( + "Unexpected numpy string 'itemsize' for element " + f"{cube_or_mesh.name()}: " + f"'dtype.itemsize = {string_dimension_depth}, expected " + "a multiple of four (always)." + ) + raise ValueError(msg) + nchars = string_dimension_depth // 4 + + encoding_attr = element.attributes.get("_Encoding", "ascii") + # Look this up + return a supported encoding name + # NB implements defaults, and warns if the encoding is not recognised. + encoding = bytecoding_datasets._identify_encoding( + encoding=encoding_attr, var_name=cf_name, writing=True + ) + width_fns = bytecoding_datasets._ENCODING_WIDTH_TRANSLATIONS[encoding] + string_dimension_depth = width_fns.nchars_2_nbytes(nchars) + else: + if data.dtype.kind != "S" or data.dtype.itemsize != 1: + # Some type of data we don't "understand". + # NB this includes "Sxx" types other than "S1" : It seems that + # netCDF4 can treat Sxx as if it was Uxx, at least if there is an + # _Encoding attribute. But we don't support that type in Iris. + msg = ( + f"Variable {cf_name!r} has unexpected string/character dtype, " + f"{data.dtype} -- should be either 'S' or 'U' type." + ) + raise ValueError(msg) + string_dimension_name = "string%d" % string_dimension_depth # Determine whether to create the string length dimension. @@ -1861,26 +1890,6 @@ def _create_generic_cf_array_var( # Create the label coordinate variable. cf_var = self._dataset.createVariable(cf_name, "|S1", element_dims) - - # # Convert data from an array of strings into a character array - # # with an extra string-length dimension. - # if len(element_dims) == 1: - # # Scalar variable (only has string dimension). - # data_first = data[0] - # if is_lazy_data(data_first): - # data_first = dask.compute(data_first) - # data = list("%- *s" % (string_dimension_depth, data_first)) - # else: - # # NOTE: at present, can't do this lazily??
- # orig_shape = data.shape - # new_shape = orig_shape + (string_dimension_depth,) - # new_data = np.zeros(new_shape, cf_var.dtype) - # for index in np.ndindex(orig_shape): - # index_slice = tuple(list(index) + [slice(None, None)]) - # new_data[index_slice] = list( - # "%- *s" % (string_dimension_depth, data[index]) - # ) - # data = new_data else: # A normal (numeric) variable. # ensure a valid datatype for the file format. diff --git a/lib/iris/tests/integration/netcdf/test_stringdata.py b/lib/iris/tests/integration/netcdf/test_stringdata.py index 6ceb0fc1f5..908c873bf5 100644 --- a/lib/iris/tests/integration/netcdf/test_stringdata.py +++ b/lib/iris/tests/integration/netcdf/test_stringdata.py @@ -19,7 +19,7 @@ import iris from iris.coords import AuxCoord, DimCoord from iris.cube import Cube -from iris.fileformats.netcdf import _thread_safe_nc +from iris.fileformats.netcdf import SUPPORTED_ENCODINGS, _thread_safe_nc @pytest.fixture(scope="module") @@ -38,14 +38,7 @@ def all_lazy_auxcoords(): PERSIST_TESTFILES: str | None = None NO_ENCODING_STR = "" -TEST_ENCODINGS = [ - NO_ENCODING_STR, - "ascii", - "utf-8", - # "iso8859-1", # a common one-byte-per-char "codepage" type - # "utf-16", - "utf-32", -] +TEST_ENCODINGS = [NO_ENCODING_STR] + SUPPORTED_ENCODINGS # @@ -255,10 +248,12 @@ def test_valid_encodings(self, encoding, readtest_data: SamplefileDetails): assert load_problems_list() == [] assert cube.shape == (N_XDIM,) - if encoding != "utf-32": - expected_string_width = N_CHARS_DIM - else: + if encoding == "utf-32": expected_string_width = (N_CHARS_DIM // 4) - 1 + elif encoding == "utf-16": + expected_string_width = N_CHARS_DIM - 2 + else: + expected_string_width = N_CHARS_DIM assert cube.dtype == f" string array result = v[:] expected = write_strings - if encoding == "utf-8": - # In this case, with the given non-ascii sample data, the + if encoding in ("utf-8", "utf-16"): + # In these cases, with the given non-ascii sample data, the # "default minimum string length" 
is overestimated. - assert strlen == 7 - assert result.dtype == "U7" - # correct the result dtype to pass the write_strings comparison below - truncated_result = result.astype("U4") + if encoding == "utf-8": + assert strlen == 7 + assert result.dtype == "U7" + # correct the result dtype to pass the write_strings comparison below + truncated_result = result.astype("U4") + elif encoding == "utf-16": + assert strlen == 10 + assert result.dtype == "U8" + # correct the result dtype to pass the write_strings comparison below + truncated_result = result.astype("U4") # Also check that content is the same (i.e. not actually truncated) assert np.all(truncated_result == result) result = truncated_result else: - # Close and re-open as "regular" dataset -- just to check the raw content + # Close and re-open as "regular" dataset -- just to check "raw" byte content v = self.undecoded_testvar(ds_encoded, "vxs") result = v[:] expected = write_bytes @@ -449,7 +460,10 @@ def test_read_badencoding_ignore(self, tempdir): ) v[:] = test_utf8_bytes - msg = r"Ignoring unknown encoding for variable 'vxs': _Encoding = 'unknown'\." + msg = ( + r"Ignoring unsupported encoding for netCDF variable 'vxs': " + ".*'unknown', is not recognised as one of the supported encodings" + ) with pytest.warns(IrisCfLoadWarning, match=msg): # raises warning but succeeds, due to default read encoding of 'utf-8' v[:] From 25482297bb1102864a854f9be3dae894bbe9d7d9 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Thu, 12 Mar 2026 14:29:02 +0000 Subject: [PATCH 2/3] Fix utf-16 nchars/nbytes relation. 
--- lib/iris/fileformats/netcdf/_bytecoding_datasets.py | 6 +++--- lib/iris/tests/integration/netcdf/test_stringdata.py | 8 +++++--- .../unit/fileformats/netcdf/test_bytecoding_datasets.py | 2 +- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py index 4187ca2ffb..2d38498708 100644 --- a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py +++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py @@ -249,8 +249,8 @@ class EncodingWidthRelations: "ascii": EncodingWidthRelations(lambda x: x, lambda x: x), "utf-8": EncodingWidthRelations(lambda x: x, lambda x: x), "utf-16": EncodingWidthRelations( - nchars_2_nbytes=lambda x: x + 2, - nbytes_2_nchars=lambda x: x - 2, + nchars_2_nbytes=lambda x: (x + 1) * 2, + nbytes_2_nchars=lambda x: x // 2 - 1, ), "utf-32": EncodingWidthRelations( nchars_2_nbytes=lambda x: (x + 1) * 4, @@ -268,7 +268,7 @@ def _identify_encoding(encoding, var_name: str, writing: bool = False) -> str: encoding : Any Select an encoding : None, or a string, or anything printable (via str()). var_name : str - Name of the relevant dataste variable (i.e. 'var_name') : + Name of the relevant dataset variable (i.e. 'var_name') : used only to produce warning messages. 
writing : bool Specify whether reading or writing, which affects any *default* return value, diff --git a/lib/iris/tests/integration/netcdf/test_stringdata.py b/lib/iris/tests/integration/netcdf/test_stringdata.py index 908c873bf5..585388c5e4 100644 --- a/lib/iris/tests/integration/netcdf/test_stringdata.py +++ b/lib/iris/tests/integration/netcdf/test_stringdata.py @@ -251,7 +251,7 @@ def test_valid_encodings(self, encoding, readtest_data: SamplefileDetails): if encoding == "utf-32": expected_string_width = (N_CHARS_DIM // 4) - 1 elif encoding == "utf-16": - expected_string_width = N_CHARS_DIM - 2 + expected_string_width = (N_CHARS_DIM) // 2 - 1 else: expected_string_width = N_CHARS_DIM assert cube.dtype == f" Date: Thu, 12 Mar 2026 17:21:39 +0000 Subject: [PATCH 3/3] Rationalise error handling + improve messages. --- .../netcdf/_bytecoding_datasets.py | 66 +++++++++++-------- .../netcdf/test_bytecoding_datasets.py | 11 ++-- 2 files changed, 45 insertions(+), 32 deletions(-) diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py index 2d38498708..e493a481b5 100644 --- a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py +++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py @@ -60,7 +60,7 @@ def decode_bytesarray_to_stringarray( - byte_array: np.ndarray, encoding: str, string_width: int + byte_array: np.ndarray, encoding: str, string_width: int, var_name: str ) -> np.ndarray: """Convert an array of bytes to an array of strings, with one less dimension. @@ -77,13 +77,24 @@ def decode_bytesarray_to_stringarray( for ndindex in np.ndindex(var_shape): element_bytes = byte_array[ndindex] bytes = b"".join([b or b"\0" for b in element_bytes]) - string = bytes.decode(encoding) + try: + string = bytes.decode(encoding) + except UnicodeDecodeError as err: + msg = ( + f"Character data in variable {var_name!r} could not be decoded " + f"with the {encoding!r} encoding. 
This can be fixed by setting the " + "variable '_Encoding' attribute to suit the content." + ) + raise ValueError(msg) from err result[ndindex] = string return result def encode_stringarray_as_bytearray( - data: np.typing.ArrayLike, encoding: str, string_dimension_length: int + data: np.typing.ArrayLike, + encoding: str, + string_dimension_length: int, + var_name: str, ) -> np.ndarray: """Encode strings as a bytes array.""" data = np.asanyarray(data) @@ -92,15 +103,28 @@ def encode_stringarray_as_bytearray( right_pad = b"\0" * string_dimension_length for index in np.ndindex(element_shape): string = data[index] - bytes = string.encode(encoding=encoding) + try: + bytes = string.encode(encoding=encoding) + except UnicodeEncodeError as err: + msg = ( + f"String data written to netcdf character variable {var_name!r} " + f"could not be represented in encoding {encoding!r}. " + "This can be fixed by setting a suitable variable '_Encoding' " + 'attribute, e.g. variable._Encoding="UTF-8".' + ) + raise ValueError(msg) from err + n_bytes = len(bytes) # TODO: may want to issue warning or error if we overflow the length? if n_bytes > string_dimension_length: from iris.exceptions import TranslationError msg = ( - f"String {string!r} written to netcdf exceeds string dimension after " - f"encoding : {n_bytes} > {string_dimension_length}." + f"String '{string}' written into netcdf variable {var_name!r} with " + f"encoding {encoding!r} is {n_bytes} bytes long, which exceeds the " + f"string dimension length, {string_dimension_length}. " + 'This can be fixed by converting the data to a "wider" string dtype, ' + f'e.g. cube.data = cube.data.astype("U{n_bytes}").' ) raise TranslationError(msg) @@ -169,15 +193,9 @@ def decode_bytes_to_stringarray(self, data: np.ndarray) -> np.ndarray: # N.B. 
read encoding default is UTF-8 --> a "usually safe" choice encoding = self.read_encoding strlen = self.string_width - try: - data = decode_bytesarray_to_stringarray(data, encoding, strlen) - except UnicodeDecodeError as err: - msg = ( - f"Character data in variable {self.varname!r} could not be decoded " - f"with the {encoding!r} encoding. This can be fixed by setting the " - "variable '_Encoding' attribute to suit the content." - ) - raise ValueError(msg) from err + data = decode_bytesarray_to_stringarray( + data, encoding, strlen, self.varname + ) return data @@ -185,19 +203,11 @@ def encode_strings_as_bytearray(self, data: np.ndarray) -> np.ndarray: if self.is_chardata and data.dtype.kind == "U": # N.B. it is also possible to pass a byte array (dtype "S1"), # to be written directly, without processing. - try: - # N.B. write encoding *default* is "ascii" --> fails bad content - encoding = self.write_encoding - strlen = self.n_chars_dim - data = encode_stringarray_as_bytearray(data, encoding, strlen) - except UnicodeEncodeError as err: - msg = ( - f"String data written to netcdf character variable {self.varname!r} " - f"could not be represented in encoding {self.write_encoding!r}. " - "This can be fixed by setting a suitable variable '_Encoding' " - 'attribute, e.g. ._Encoding="UTF-8".' - ) - raise ValueError(msg) from err + # N.B. 
write encoding *default* is "ascii" --> fails bad content + encoding = self.write_encoding + strlen = self.n_chars_dim + data = encode_stringarray_as_bytearray(data, encoding, strlen, self.varname) + return data diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py index 1687a8edcb..5734d423db 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py +++ b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py @@ -227,12 +227,15 @@ def test_write_badencoding_ignore(self, tempdir): def test_overlength(self, tempdir): # Check expected behaviour with over-length data path = tempdir / "test_writestrings_overlength.nc" - strlen = 5 - ds = make_encoded_dataset(path, strlen=strlen, encoding="ascii") + strlen = 6 + ds = make_encoded_dataset(path, strlen=strlen, encoding="utf8") v = ds.variables["vxs"] - msg = r"String .* written to netcdf exceeds string dimension .* : [0-9]* > 5\." + msg = ( + r"String .* written into netcdf variable 'vxs' with encoding \'utf-8\' " + r"is 7 bytes long, which exceeds .* 6\. This can be fixed by " + ) with pytest.raises(TranslationError, match=msg): - v[:] = ["1", "123456789", "two"] + v[:] = ["1", "éclair", "two"] def test_overlength_splitcoding(self, tempdir): # Check expected behaviour when non-ascii multibyte coding gets truncated