From 59bb3f543ce5319970600e3671668755b00d58be Mon Sep 17 00:00:00 2001
From: Patrick Peglar <patrick.peglar@metoffice.gov.uk>
Date: Wed, 11 Mar 2026 15:37:06 +0000
Subject: [PATCH 1/3] Define the supported encodings, and test them all.

---
 lib/iris/fileformats/netcdf/__init__.py       |   2 +
 .../netcdf/_bytecoding_datasets.py            | 150 +++++++++++++-----
 lib/iris/fileformats/netcdf/saver.py          |  67 ++++----
 .../integration/netcdf/test_stringdata.py     |  21 ++-
 .../netcdf/test_bytecoding_datasets.py        |  34 ++--
 5 files changed, 180 insertions(+), 94 deletions(-)

diff --git a/lib/iris/fileformats/netcdf/__init__.py b/lib/iris/fileformats/netcdf/__init__.py
index d8420c4f38..60b4b5895a 100644
--- a/lib/iris/fileformats/netcdf/__init__.py
+++ b/lib/iris/fileformats/netcdf/__init__.py
@@ -29,6 +29,7 @@
     DECODE_TO_STRINGS_ON_READ,
     DEFAULT_READ_ENCODING,
     DEFAULT_WRITE_ENCODING,
+    SUPPORTED_ENCODINGS,
 )
 from .loader import DEBUG, NetCDFDataProxy, load_cubes
 from .saver import (
@@ -53,6 +54,7 @@
     "MESH_ELEMENTS",
     "NetCDFDataProxy",
     "SPATIO_TEMPORAL_AXES",
+    "SUPPORTED_ENCODINGS",
     "Saver",
     "UnknownCellMethodWarning",
     "load_cubes",
diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py
index 23ea17750d..4187ca2ffb 100644
--- a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py
+++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py
@@ -44,6 +44,7 @@
 import contextlib
 import dataclasses
 import threading
+from typing import Callable
 import warnings
 
 import numpy as np
@@ -117,8 +118,8 @@ class VariableEncoder:
     varname: str  # just for the error messages
     dtype: np.dtype
     is_chardata: bool  # just a shortcut for the dtype test
-    read_encoding: str  # IF 'is_chardata': a valid encoding from the codecs package
-    write_encoding: str  # IF 'is_chardata': a valid encoding from the codecs package
+    read_encoding: str  # IF 'is_chardata': one of the supported encodings
+    write_encoding: str  # IF 'is_chardata': one of the supported encodings
     n_chars_dim: int  # IF 'is_chardata': length of associated character dimension
     string_width: int  # IF 'is_chardata': width when viewed as strings (i.e. "Uxx")
 
@@ -138,59 +139,30 @@ def __init__(self, cf_var):
         self.dtype = cf_var.dtype
         self.is_chardata = np.issubdtype(self.dtype, np.bytes_)
         if self.is_chardata:
-            self.read_encoding = self._get_encoding(cf_var, writing=False)
-            self.write_encoding = self._get_encoding(cf_var, writing=True)
+            encoding_attr = getattr(cf_var, "_Encoding", None)
+            self.read_encoding = _identify_encoding(
+                encoding_attr, var_name=cf_var.name, writing=False
+            )
+            self.write_encoding = _identify_encoding(
+                encoding_attr, var_name=cf_var.name, writing=True
+            )
             n_chars_dim = 1  # default to 1 for a scalar var
             if len(cf_var.dimensions) >= 1:
                 dim_name = cf_var.dimensions[-1]
                 if dim_name in cf_var.group().dimensions:
                     n_chars_dim = cf_var.group().dimensions[dim_name].size
             self.n_chars_dim = n_chars_dim
-            self.string_width = self._get_string_width(cf_var)
+            self.string_width = self._get_string_width()
 
-    @staticmethod
-    def _get_encoding(cf_var, writing=False) -> str:
-        """Get the byte encoding defined for this variable (or None)."""
-        result = getattr(cf_var, "_Encoding", None)
-        if result is not None:
-            try:
-                # Accept + normalise naming of encodings
-                result = codecs.lookup(result).name
-                # NOTE: if encoding does not suit data, errors can occur.
-                # For example, _Encoding = "ascii", with non-ascii content.
-            except LookupError:
-                # Unrecognised encoding name : handle this as just a warning
-                msg = (
-                    f"Ignoring unknown encoding for variable {cf_var.name!r}: "
-                    f"_Encoding = {result!r}."
-                )
-                warntype = IrisCfSaveWarning if writing else IrisCfLoadWarning
-                warnings.warn(msg, category=warntype)
-                # Proceed as if there is no specified encoding
-                result = None
-
-        if result is None:
-            if writing:
-                result = DEFAULT_WRITE_ENCODING
-            else:
-                result = DEFAULT_READ_ENCODING
-        return result
-
-    def _get_string_width(self, cf_var) -> int:
+    def _get_string_width(self) -> int:
         """Return the string-length defined for this variable."""
         # Work out the actual byte width from the parent dataset dimensions.
-        strlen = self.n_chars_dim
+        n_bytes = self.n_chars_dim
         # Convert the string dimension length (i.e. bytes) to a sufficiently-long
         #  string width, depending on the (read) encoding used.
         encoding = self.read_encoding
-        if "utf-16" in encoding:
-            # Each char needs at least 2 bytes -- including a terminator char
-            strlen = (strlen // 2) - 1
-        elif "utf-32" in encoding:
-            # Each char needs exactly 4 bytes -- including a terminator char
-            strlen = (strlen // 4) - 1
-        # "ELSE": assume there can be (at most) as many chars as bytes
-        return strlen
+        n_chars = _ENCODING_WIDTH_TRANSLATIONS[encoding].nbytes_2_nchars(n_bytes)
+        return n_chars
 
     def decode_bytes_to_stringarray(self, data: np.ndarray) -> np.ndarray:
         if self.is_chardata:
@@ -252,6 +224,98 @@ def context(self, perform_decoding: bool):
 DEFAULT_WRITE_ENCODING = "ascii"
 
 
+@dataclasses.dataclass
+class EncodingWidthRelations:
+    """Encode the default string-width <-> byte-dimension relations.
+
+    These translations are just a "best guess"...
+
+    When translating bytes (dtype S1) to strings (dtype Uxx), the chosen (default)
+    string width may be longer than is needed for the actual content.  But it is at
+    least "safe".
+
+    When translating strings to bytes, we *can* get more bytes than the default
+    byte dimension length, and the code will then truncate
+    ( with a warning : see '_identify_encoding' ).
+    This can be avoided if necessary, in specific cases, by recasting the data to a
+    dtype with greater width (Uxx).
+    """
+
+    nchars_2_nbytes: Callable[[int], int]
+    nbytes_2_nchars: Callable[[int], int]
+
+
+_ENCODING_WIDTH_TRANSLATIONS = {
+    "ascii": EncodingWidthRelations(lambda x: x, lambda x: x),
+    "utf-8": EncodingWidthRelations(lambda x: x, lambda x: x),
+    "utf-16": EncodingWidthRelations(
+        nchars_2_nbytes=lambda x: x + 2,
+        nbytes_2_nchars=lambda x: x - 2,
+    ),
+    "utf-32": EncodingWidthRelations(
+        nchars_2_nbytes=lambda x: (x + 1) * 4,
+        nbytes_2_nchars=lambda x: x // 4 - 1,
+    ),
+}
+SUPPORTED_ENCODINGS = list(_ENCODING_WIDTH_TRANSLATIONS.keys())
+
+
+def _identify_encoding(encoding, var_name: str, writing: bool = False) -> str:
+    """Normalise an encoding name + check it is supported.
+
+    Parameters
+    ----------
+    encoding : Any
+        Select an encoding : None, or a string, or anything printable (via str()).
+    var_name : str
+        Name of the relevant dataste variable (i.e. 'var_name') :
+        used only to produce warning messages.
+    writing : bool
+        Specify whether reading or writing, which affects any *default* return value,
+        i.e. select between DEFAULT_READ_ENCODING / DEFAULT_WRITE_ENCODING.
+
+    If given, and supported, return a normalised encoding name,
+    -- i.e. always one of SUPPORTED_ENCODINGS.
+    If not given, or not supported, return the default encoding name.
+
+    If given **but not recognised/supported**, also emit a warning (and return default).
+    """
+    if encoding is not None:
+        encoding = str(encoding)
+
+    result: str | None = None  # not yet 'found' : we will never *return* this
+
+    if encoding is not None:
+        # Normalise the name : NB must be recognised by Python "codecs".
+        try:
+            result = codecs.lookup(encoding).name
+        except LookupError:
+            pass
+
+        if result is not None:
+            if result not in SUPPORTED_ENCODINGS:
+                # Python "codecs" recognised it, but we don't support it.
+                result = None
+
+    if encoding is not None and result is None:
+        # Unrecognised encoding name : handle this as just a warning
+        msg = (
+            f"Ignoring unsupported encoding for netCDF variable {var_name!r}: "
+            f"_Encoding = {encoding!r}, is not recognised as one of the supported "
+            f"encodings, {SUPPORTED_ENCODINGS}."
+        )
+        warntype = IrisCfSaveWarning if writing else IrisCfLoadWarning
+        warnings.warn(msg, category=warntype)
+
+    if result is None:
+        if writing:
+            result = DEFAULT_WRITE_ENCODING
+        else:
+            result = DEFAULT_READ_ENCODING
+
+    return result
+
+
 class EncodedVariable(VariableWrapper):
     """A variable wrapper that translates variable data according to byte encodings."""
 
diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py
index 4bf3f5b3bd..b5b25c22bb 100644
--- a/lib/iris/fileformats/netcdf/saver.py
+++ b/lib/iris/fileformats/netcdf/saver.py
@@ -1833,17 +1833,46 @@ def _create_generic_cf_array_var(
         if not is_dataless and np.issubdtype(data.dtype, np.str_):
             # Deal with string-type variables.
             # Typically CF label variables, but also possibly ancil-vars ?
+
+            # NOTE: all we are doing here is to calculate the byte dimension length,
+            # based on the dtype and any encoding attribute.
+            # The actual char --> byte data *translation* is done by the variable,
+            # being a _bytecoding_datasets.EncodedVariable.
             string_dimension_depth = data.dtype.itemsize
+
             if data.dtype.kind == "U":
-                encoding = element.attributes.get("_Encoding", "ascii")
-                # TODO: this can fail -- use a sensible warning + default?
-                encoding = codecs.lookup(encoding).name
-                if encoding == "utf-32":
-                    # UTF-32 is a special case -- always 4 exactly bytes per char, plus 4
-                    string_dimension_depth += 4
-                else:
-                    # generally, 4 bytes per char in numpy --> make bytewidth = string-width
-                    string_dimension_depth //= 4
+                # String content (U) instead of bytes (S).
+                # For numpy strings, itemsize is **always** a multiple of 4
+                if string_dimension_depth % 4 != 0:
+                    msg = (
+                        "Unexpected numpy string 'itemsize' for element "
+                        f"{cube_or_mesh.name()}: "
+                        f"'dtype.itemsize = {string_dimension_depth}, expected "
+                        "a multiple of four (always)."
+                    )
+                    raise ValueError(msg)
+                nchars = string_dimension_depth // 4
+
+                encoding_attr = element.attributes.get("_Encoding", "ascii")
+                # Look this up + return a supported encoding name
+                # NB applies defaults, and warns if the given encoding is unsupported.
+                encoding = bytecoding_datasets._identify_encoding(
+                    encoding=encoding_attr, var_name=cf_name, writing=True
+                )
+                width_fns = bytecoding_datasets._ENCODING_WIDTH_TRANSLATIONS[encoding]
+                string_dimension_depth = width_fns.nchars_2_nbytes(nchars)
+            else:
+                if data.dtype.kind != "S" or data.dtype.itemsize != 1:
+                    # Some type of data we don't "understand".
+                    # NB this includes "Sxx" types other than "S1" :  It seems that
+                    # netCDF4 can treat Sxx as if it was Uxx, at least if there is an
+                    # _Encoding attribute.  But we don't support that type in Iris.
+                    msg = (
+                        f"Variable {cf_name!r} has unexpected string/character dtype, "
+                        f"{data.dtype} -- should be either 'S' or 'U' type."
+                    )
+                    raise ValueError(msg)
+
             string_dimension_name = "string%d" % string_dimension_depth
 
             # Determine whether to create the string length dimension.
@@ -1861,26 +1890,6 @@ def _create_generic_cf_array_var(
 
             # Create the label coordinate variable.
             cf_var = self._dataset.createVariable(cf_name, "|S1", element_dims)
-
-            # # Convert data from an array of strings into a character array
-            # # with an extra string-length dimension.
-            # if len(element_dims) == 1:
-            #     # Scalar variable (only has string dimension).
-            #     data_first = data[0]
-            #     if is_lazy_data(data_first):
-            #         data_first = dask.compute(data_first)
-            #     data = list("%- *s" % (string_dimension_depth, data_first))
-            # else:
-            #     # NOTE: at present, can't do this lazily??
-            #     orig_shape = data.shape
-            #     new_shape = orig_shape + (string_dimension_depth,)
-            #     new_data = np.zeros(new_shape, cf_var.dtype)
-            #     for index in np.ndindex(orig_shape):
-            #         index_slice = tuple(list(index) + [slice(None, None)])
-            #         new_data[index_slice] = list(
-            #             "%- *s" % (string_dimension_depth, data[index])
-            #         )
-            #     data = new_data
         else:
             # A normal (numeric) variable.
             # ensure a valid datatype for the file format.
diff --git a/lib/iris/tests/integration/netcdf/test_stringdata.py b/lib/iris/tests/integration/netcdf/test_stringdata.py
index 6ceb0fc1f5..908c873bf5 100644
--- a/lib/iris/tests/integration/netcdf/test_stringdata.py
+++ b/lib/iris/tests/integration/netcdf/test_stringdata.py
@@ -19,7 +19,7 @@
 import iris
 from iris.coords import AuxCoord, DimCoord
 from iris.cube import Cube
-from iris.fileformats.netcdf import _thread_safe_nc
+from iris.fileformats.netcdf import SUPPORTED_ENCODINGS, _thread_safe_nc
 
 
 @pytest.fixture(scope="module")
@@ -38,14 +38,7 @@ def all_lazy_auxcoords():
 PERSIST_TESTFILES: str | None = None
 
 NO_ENCODING_STR = "<noencoding>"
-TEST_ENCODINGS = [
-    NO_ENCODING_STR,
-    "ascii",
-    "utf-8",
-    # "iso8859-1",  # a common one-byte-per-char "codepage" type
-    # "utf-16",
-    "utf-32",
-]
+TEST_ENCODINGS = [NO_ENCODING_STR] + SUPPORTED_ENCODINGS
 
 
 #
@@ -255,10 +248,12 @@ def test_valid_encodings(self, encoding, readtest_data: SamplefileDetails):
         assert load_problems_list() == []
         assert cube.shape == (N_XDIM,)
 
-        if encoding != "utf-32":
-            expected_string_width = N_CHARS_DIM
-        else:
+        if encoding == "utf-32":
             expected_string_width = (N_CHARS_DIM // 4) - 1
+        elif encoding == "utf-16":
+            expected_string_width = N_CHARS_DIM - 2
+        else:
+            expected_string_width = N_CHARS_DIM
         assert cube.dtype == f"<U{expected_string_width}"
         cube_data = cube.data
         assert np.all(cube_data == datavar_strings)
@@ -303,6 +298,8 @@ def make_testcube(
         charlen = N_CHARS_DIM
         if encoding_str == "utf-32":
             charlen = charlen // 4 - 1
+        elif encoding_str == "utf-16":
+            charlen = charlen - 2
         strings_dtype = np.dtype(f"U{charlen}")
         coordvar_array = np.array(coordvar_strings, dtype=strings_dtype)
         datavar_array = np.array(datavar_strings, dtype=strings_dtype)
diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py
index 90914a39c7..a06acd9b9e 100644
--- a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py
+++ b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py
@@ -5,6 +5,7 @@
 """Unit tests for :class:`iris.fileformats.netcdf._bytecoding_datasets` module."""
 
 from pathlib import Path
+import warnings
 
 import numpy as np
 import pytest
@@ -12,12 +13,13 @@
 from iris.exceptions import TranslationError
 from iris.fileformats.netcdf._bytecoding_datasets import (
     DECODE_TO_STRINGS_ON_READ,
+    SUPPORTED_ENCODINGS,
     EncodedDataset,
 )
 from iris.fileformats.netcdf._thread_safe_nc import DatasetWrapper
 from iris.warnings import IrisCfLoadWarning, IrisCfSaveWarning
 
-encoding_options = [None, "ascii", "utf-8", "utf-32"]
+encoding_options = [None] + SUPPORTED_ENCODINGS
 
 samples_3_ascii = np.array(
     ["one", "", "seven"],  # N.B. include empty!
@@ -215,7 +217,10 @@ def test_write_badencoding_ignore(self, tempdir):
         path = tempdir / "test_writestrings_badencoding_ignore.nc"
         ds = make_encoded_dataset(path, strlen=5, encoding="unknown")
         v = ds.variables["vxs"]
-        msg = r"Ignoring unknown encoding for variable 'vxs': _Encoding = 'unknown'\."
+        msg = (
+            r"Ignoring unsupported encoding for netCDF variable 'vxs': "
+            ".*'unknown', is not recognised as one of the supported encodings"
+        )
         with pytest.warns(IrisCfSaveWarning, match=msg):
             v[:] = samples_3_ascii  # will work OK
 
@@ -335,18 +340,24 @@ def test_encodings(self, encoding, tempdir, readmode):
             # Test "normal" read --> string array
             result = v[:]
             expected = write_strings
-            if encoding == "utf-8":
-                # In this case, with the given non-ascii sample data, the
+            if encoding in ("utf-8", "utf-16"):
+                # In these cases, with the given non-ascii sample data, the
                 #  "default minimum string length" is overestimated.
-                assert strlen == 7
-                assert result.dtype == "U7"
-                # correct the result dtype to pass the write_strings comparison below
-                truncated_result = result.astype("U4")
+                if encoding == "utf-8":
+                    assert strlen == 7
+                    assert result.dtype == "U7"
+                    # correct the result dtype to pass the write_strings comparison below
+                    truncated_result = result.astype("U4")
+                elif encoding == "utf-16":
+                    assert strlen == 10
+                    assert result.dtype == "U8"
+                    # correct the result dtype to pass the write_strings comparison below
+                    truncated_result = result.astype("U4")
                 # Also check that content is the same (i.e. not actually truncated)
                 assert np.all(truncated_result == result)
                 result = truncated_result
         else:
-            # Close and re-open as "regular" dataset -- just to check the raw content
+            # Close and re-open as "regular" dataset -- just to check "raw" byte content
             v = self.undecoded_testvar(ds_encoded, "vxs")
             result = v[:]
             expected = write_bytes
@@ -449,7 +460,10 @@ def test_read_badencoding_ignore(self, tempdir):
         )
         v[:] = test_utf8_bytes
 
-        msg = r"Ignoring unknown encoding for variable 'vxs': _Encoding = 'unknown'\."
+        msg = (
+            r"Ignoring unsupported encoding for netCDF variable 'vxs': "
+            ".*'unknown', is not recognised as one of the supported encodings"
+        )
         with pytest.warns(IrisCfLoadWarning, match=msg):
             # raises warning but succeeds, due to default read encoding of 'utf-8'
             v[:]

From 25482297bb1102864a854f9be3dae894bbe9d7d9 Mon Sep 17 00:00:00 2001
From: Patrick Peglar <patrick.peglar@metoffice.gov.uk>
Date: Thu, 12 Mar 2026 14:29:02 +0000
Subject: [PATCH 2/3] Fix utf-16 nchars/nbytes relation.

---
 lib/iris/fileformats/netcdf/_bytecoding_datasets.py       | 6 +++---
 lib/iris/tests/integration/netcdf/test_stringdata.py      | 8 +++++---
 .../unit/fileformats/netcdf/test_bytecoding_datasets.py   | 2 +-
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py
index 4187ca2ffb..2d38498708 100644
--- a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py
+++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py
@@ -249,8 +249,8 @@ class EncodingWidthRelations:
     "ascii": EncodingWidthRelations(lambda x: x, lambda x: x),
     "utf-8": EncodingWidthRelations(lambda x: x, lambda x: x),
     "utf-16": EncodingWidthRelations(
-        nchars_2_nbytes=lambda x: x + 2,
-        nbytes_2_nchars=lambda x: x - 2,
+        nchars_2_nbytes=lambda x: (x + 1) * 2,
+        nbytes_2_nchars=lambda x: x // 2 - 1,
     ),
     "utf-32": EncodingWidthRelations(
         nchars_2_nbytes=lambda x: (x + 1) * 4,
@@ -268,7 +268,7 @@ def _identify_encoding(encoding, var_name: str, writing: bool = False) -> str:
     encoding : Any
         Select an encoding : None, or a string, or anything printable (via str()).
     var_name : str
-        Name of the relevant dataste variable (i.e. 'var_name') :
+        Name of the relevant dataset variable (i.e. 'var_name') :
         used only to produce warning messages.
     writing : bool
         Specify whether reading or writing, which affects any *default* return value,
diff --git a/lib/iris/tests/integration/netcdf/test_stringdata.py b/lib/iris/tests/integration/netcdf/test_stringdata.py
index 908c873bf5..585388c5e4 100644
--- a/lib/iris/tests/integration/netcdf/test_stringdata.py
+++ b/lib/iris/tests/integration/netcdf/test_stringdata.py
@@ -251,7 +251,7 @@ def test_valid_encodings(self, encoding, readtest_data: SamplefileDetails):
         if encoding == "utf-32":
             expected_string_width = (N_CHARS_DIM // 4) - 1
         elif encoding == "utf-16":
-            expected_string_width = N_CHARS_DIM - 2
+            expected_string_width = (N_CHARS_DIM) // 2 - 1
         else:
             expected_string_width = N_CHARS_DIM
         assert cube.dtype == f"<U{expected_string_width}"
@@ -295,11 +295,13 @@ def make_testcube(
         datavar_strings = ["bun", "éclair", "sandwich"]
 
     if not byte_data:
+        # Do our own conversion between intended byte dimension and string width
+        # N.B. N_CHARS_DIM is set big enough so the test strings will never overflow
         charlen = N_CHARS_DIM
         if encoding_str == "utf-32":
-            charlen = charlen // 4 - 1
+            charlen = (charlen // 4) - 1
         elif encoding_str == "utf-16":
-            charlen = charlen - 2
+            charlen = (charlen // 2) - 1
         strings_dtype = np.dtype(f"U{charlen}")
         coordvar_array = np.array(coordvar_strings, dtype=strings_dtype)
         datavar_array = np.array(datavar_strings, dtype=strings_dtype)
diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py
index a06acd9b9e..1687a8edcb 100644
--- a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py
+++ b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py
@@ -350,7 +350,7 @@ def test_encodings(self, encoding, tempdir, readmode):
                     truncated_result = result.astype("U4")
                 elif encoding == "utf-16":
                     assert strlen == 10
-                    assert result.dtype == "U8"
+                    assert result.dtype == "U4"
                     # correct the result dtype to pass the write_strings comparison below
                     truncated_result = result.astype("U4")
                 # Also check that content is the same (i.e. not actually truncated)

From e8b4e59743776c50968f5f83dbeeb72a84a3a2f9 Mon Sep 17 00:00:00 2001
From: Patrick Peglar <patrick.peglar@metoffice.gov.uk>
Date: Thu, 12 Mar 2026 17:21:39 +0000
Subject: [PATCH 3/3] Rationalise error handling + improve messages.

---
 .../netcdf/_bytecoding_datasets.py            | 66 +++++++++++--------
 .../netcdf/test_bytecoding_datasets.py        | 11 ++--
 2 files changed, 45 insertions(+), 32 deletions(-)

diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py
index 2d38498708..e493a481b5 100644
--- a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py
+++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py
@@ -60,7 +60,7 @@
 
 
 def decode_bytesarray_to_stringarray(
-    byte_array: np.ndarray, encoding: str, string_width: int
+    byte_array: np.ndarray, encoding: str, string_width: int, var_name: str
 ) -> np.ndarray:
     """Convert an array of bytes to an array of strings, with one less dimension.
 
@@ -77,13 +77,24 @@ def decode_bytesarray_to_stringarray(
     for ndindex in np.ndindex(var_shape):
         element_bytes = byte_array[ndindex]
         bytes = b"".join([b or b"\0" for b in element_bytes])
-        string = bytes.decode(encoding)
+        try:
+            string = bytes.decode(encoding)
+        except UnicodeDecodeError as err:
+            msg = (
+                f"Character data in variable {var_name!r} could not be decoded "
+                f"with the {encoding!r} encoding.  This can be fixed by setting the "
+                "variable '_Encoding' attribute to suit the content."
+            )
+            raise ValueError(msg) from err
         result[ndindex] = string
     return result
 
 
 def encode_stringarray_as_bytearray(
-    data: np.typing.ArrayLike, encoding: str, string_dimension_length: int
+    data: np.typing.ArrayLike,
+    encoding: str,
+    string_dimension_length: int,
+    var_name: str,
 ) -> np.ndarray:
     """Encode strings as a bytes array."""
     data = np.asanyarray(data)
@@ -92,15 +103,28 @@ def encode_stringarray_as_bytearray(
     right_pad = b"\0" * string_dimension_length
     for index in np.ndindex(element_shape):
         string = data[index]
-        bytes = string.encode(encoding=encoding)
+        try:
+            bytes = string.encode(encoding=encoding)
+        except UnicodeEncodeError as err:
+            msg = (
+                f"String data written to netcdf character variable {var_name!r} "
+                f"could not be represented in encoding {encoding!r}.  "
+                "This can be fixed by setting a suitable variable '_Encoding' "
+                'attribute, e.g. variable._Encoding="UTF-8".'
+            )
+            raise ValueError(msg) from err
+
         n_bytes = len(bytes)
         # TODO: may want to issue warning or error if we overflow the length?
         if n_bytes > string_dimension_length:
             from iris.exceptions import TranslationError
 
             msg = (
-                f"String {string!r} written to netcdf exceeds string dimension after "
-                f"encoding : {n_bytes} > {string_dimension_length}."
+                f"String '{string}' written into netcdf variable {var_name!r} with "
+                f"encoding {encoding!r} is {n_bytes} bytes long, which exceeds the "
+                f"string dimension length, {string_dimension_length}. "
+                'This can be fixed by converting the data to a "wider" string dtype, '
+                f'e.g. cube.data = cube.data.astype("U{n_bytes}").'
             )
             raise TranslationError(msg)
 
@@ -169,15 +193,9 @@ def decode_bytes_to_stringarray(self, data: np.ndarray) -> np.ndarray:
             # N.B. read encoding default is UTF-8 --> a "usually safe" choice
             encoding = self.read_encoding
             strlen = self.string_width
-            try:
-                data = decode_bytesarray_to_stringarray(data, encoding, strlen)
-            except UnicodeDecodeError as err:
-                msg = (
-                    f"Character data in variable {self.varname!r} could not be decoded "
-                    f"with the {encoding!r} encoding.  This can be fixed by setting the "
-                    "variable '_Encoding' attribute to suit the content."
-                )
-                raise ValueError(msg) from err
+            data = decode_bytesarray_to_stringarray(
+                data, encoding, strlen, self.varname
+            )
 
         return data
 
@@ -185,19 +203,11 @@ def encode_strings_as_bytearray(self, data: np.ndarray) -> np.ndarray:
         if self.is_chardata and data.dtype.kind == "U":
             # N.B. it is also possible to pass a byte array (dtype "S1"),
             #  to be written directly, without processing.
-            try:
-                # N.B. write encoding *default* is "ascii" --> fails bad content
-                encoding = self.write_encoding
-                strlen = self.n_chars_dim
-                data = encode_stringarray_as_bytearray(data, encoding, strlen)
-            except UnicodeEncodeError as err:
-                msg = (
-                    f"String data written to netcdf character variable {self.varname!r} "
-                    f"could not be represented in encoding {self.write_encoding!r}.  "
-                    "This can be fixed by setting a suitable variable '_Encoding' "
-                    'attribute, e.g. <variable>._Encoding="UTF-8".'
-                )
-                raise ValueError(msg) from err
+            # N.B. write encoding *default* is "ascii" --> fails bad content
+            encoding = self.write_encoding
+            strlen = self.n_chars_dim
+            data = encode_stringarray_as_bytearray(data, encoding, strlen, self.varname)
+
         return data
 
 
diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py
index 1687a8edcb..5734d423db 100644
--- a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py
+++ b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py
@@ -227,12 +227,15 @@ def test_write_badencoding_ignore(self, tempdir):
     def test_overlength(self, tempdir):
         # Check expected behaviour with over-length data
         path = tempdir / "test_writestrings_overlength.nc"
-        strlen = 5
-        ds = make_encoded_dataset(path, strlen=strlen, encoding="ascii")
+        strlen = 6
+        ds = make_encoded_dataset(path, strlen=strlen, encoding="utf8")
         v = ds.variables["vxs"]
-        msg = r"String .* written to netcdf exceeds string dimension .* : [0-9]* > 5\."
+        msg = (
+            r"String .* written into netcdf variable 'vxs' with encoding \'utf-8\' "
+            r"is 7 bytes long, which exceeds .* 6\. This can be fixed by "
+        )
         with pytest.raises(TranslationError, match=msg):
-            v[:] = ["1", "123456789", "two"]
+            v[:] = ["1", "éclair", "two"]
 
     def test_overlength_splitcoding(self, tempdir):
         # Check expected behaviour when non-ascii multibyte coding gets truncated