Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions lib/iris/fileformats/netcdf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
DECODE_TO_STRINGS_ON_READ,
DEFAULT_READ_ENCODING,
DEFAULT_WRITE_ENCODING,
SUPPORTED_ENCODINGS,
)
from .loader import DEBUG, NetCDFDataProxy, load_cubes
from .saver import (
Expand All @@ -53,6 +54,7 @@
"MESH_ELEMENTS",
"NetCDFDataProxy",
"SPATIO_TEMPORAL_AXES",
"SUPPORTED_ENCODINGS",
"Saver",
"UnknownCellMethodWarning",
"load_cubes",
Expand Down
216 changes: 145 additions & 71 deletions lib/iris/fileformats/netcdf/_bytecoding_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
import contextlib
import dataclasses
import threading
from typing import Callable
import warnings

import numpy as np
Expand All @@ -59,7 +60,7 @@


def decode_bytesarray_to_stringarray(
byte_array: np.ndarray, encoding: str, string_width: int
byte_array: np.ndarray, encoding: str, string_width: int, var_name: str
) -> np.ndarray:
"""Convert an array of bytes to an array of strings, with one less dimension.

Expand All @@ -76,13 +77,24 @@ def decode_bytesarray_to_stringarray(
for ndindex in np.ndindex(var_shape):
element_bytes = byte_array[ndindex]
bytes = b"".join([b or b"\0" for b in element_bytes])
string = bytes.decode(encoding)
try:
string = bytes.decode(encoding)
except UnicodeDecodeError as err:
msg = (
f"Character data in variable {var_name!r} could not be decoded "
f"with the {encoding!r} encoding. This can be fixed by setting the "
"variable '_Encoding' attribute to suit the content."
)
raise ValueError(msg) from err
result[ndindex] = string
return result


def encode_stringarray_as_bytearray(
data: np.typing.ArrayLike, encoding: str, string_dimension_length: int
data: np.typing.ArrayLike,
encoding: str,
string_dimension_length: int,
var_name: str,
) -> np.ndarray:
"""Encode strings as a bytes array."""
data = np.asanyarray(data)
Expand All @@ -91,15 +103,28 @@ def encode_stringarray_as_bytearray(
right_pad = b"\0" * string_dimension_length
for index in np.ndindex(element_shape):
string = data[index]
bytes = string.encode(encoding=encoding)
try:
bytes = string.encode(encoding=encoding)
except UnicodeEncodeError as err:
msg = (
f"String data written to netcdf character variable {var_name!r} "
f"could not be represented in encoding {encoding!r}. "
"This can be fixed by setting a suitable variable '_Encoding' "
'attribute, e.g. variable._Encoding="UTF-8".'
)
raise ValueError(msg) from err

n_bytes = len(bytes)
# TODO: may want to issue warning or error if we overflow the length?
if n_bytes > string_dimension_length:
from iris.exceptions import TranslationError

msg = (
f"String {string!r} written to netcdf exceeds string dimension after "
f"encoding : {n_bytes} > {string_dimension_length}."
f"String '{string}' written into netcdf variable {var_name!r} with "
f"encoding {encoding!r} is {n_bytes} bytes long, which exceeds the "
f"string dimension length, {string_dimension_length}. "
'This can be fixed by converting the data to a "wider" string dtype, '
f'e.g. cube.data = cube.data.astype("U{n_bytes}").'
)
raise TranslationError(msg)

Expand All @@ -117,8 +142,8 @@ class VariableEncoder:
varname: str # just for the error messages
dtype: np.dtype
is_chardata: bool # just a shortcut for the dtype test
read_encoding: str # IF 'is_chardata': a valid encoding from the codecs package
write_encoding: str # IF 'is_chardata': a valid encoding from the codecs package
read_encoding: str # IF 'is_chardata': one of the supported encodings
write_encoding: str # IF 'is_chardata': one of the supported encodings
n_chars_dim: int # IF 'is_chardata': length of associated character dimension
string_width: int # IF 'is_chardata': width when viewed as strings (i.e. "Uxx")

Expand All @@ -138,94 +163,51 @@ def __init__(self, cf_var):
self.dtype = cf_var.dtype
self.is_chardata = np.issubdtype(self.dtype, np.bytes_)
if self.is_chardata:
self.read_encoding = self._get_encoding(cf_var, writing=False)
self.write_encoding = self._get_encoding(cf_var, writing=True)
encoding_attr = getattr(cf_var, "_Encoding", None)
self.read_encoding = _identify_encoding(
encoding_attr, var_name=cf_var.name, writing=False
)
self.write_encoding = _identify_encoding(
encoding_attr, var_name=cf_var.name, writing=True
)
n_chars_dim = 1 # default to 1 for a scalar var
if len(cf_var.dimensions) >= 1:
dim_name = cf_var.dimensions[-1]
if dim_name in cf_var.group().dimensions:
n_chars_dim = cf_var.group().dimensions[dim_name].size
self.n_chars_dim = n_chars_dim
self.string_width = self._get_string_width(cf_var)
self.string_width = self._get_string_width()

@staticmethod
def _get_encoding(cf_var, writing=False) -> str:
"""Get the byte encoding defined for this variable (or None)."""
result = getattr(cf_var, "_Encoding", None)
if result is not None:
try:
# Accept + normalise naming of encodings
result = codecs.lookup(result).name
# NOTE: if encoding does not suit data, errors can occur.
# For example, _Encoding = "ascii", with non-ascii content.
except LookupError:
# Unrecognised encoding name : handle this as just a warning
msg = (
f"Ignoring unknown encoding for variable {cf_var.name!r}: "
f"_Encoding = {result!r}."
)
warntype = IrisCfSaveWarning if writing else IrisCfLoadWarning
warnings.warn(msg, category=warntype)
# Proceed as if there is no specified encoding
result = None

if result is None:
if writing:
result = DEFAULT_WRITE_ENCODING
else:
result = DEFAULT_READ_ENCODING
return result

def _get_string_width(self, cf_var) -> int:
def _get_string_width(self) -> int:
"""Return the string-length defined for this variable."""
# Work out the actual byte width from the parent dataset dimensions.
strlen = self.n_chars_dim
n_bytes = self.n_chars_dim
# Convert the string dimension length (i.e. bytes) to a sufficiently-long
# string width, depending on the (read) encoding used.
encoding = self.read_encoding
if "utf-16" in encoding:
# Each char needs at least 2 bytes -- including a terminator char
strlen = (strlen // 2) - 1
elif "utf-32" in encoding:
# Each char needs exactly 4 bytes -- including a terminator char
strlen = (strlen // 4) - 1
# "ELSE": assume there can be (at most) as many chars as bytes
return strlen
n_chars = _ENCODING_WIDTH_TRANSLATIONS[encoding].nbytes_2_nchars(n_bytes)
return n_chars

def decode_bytes_to_stringarray(self, data: np.ndarray) -> np.ndarray:
if self.is_chardata:
# N.B. read encoding default is UTF-8 --> a "usually safe" choice
encoding = self.read_encoding
strlen = self.string_width
try:
data = decode_bytesarray_to_stringarray(data, encoding, strlen)
except UnicodeDecodeError as err:
msg = (
f"Character data in variable {self.varname!r} could not be decoded "
f"with the {encoding!r} encoding. This can be fixed by setting the "
"variable '_Encoding' attribute to suit the content."
)
raise ValueError(msg) from err
data = decode_bytesarray_to_stringarray(
data, encoding, strlen, self.varname
)

return data

def encode_strings_as_bytearray(self, data: np.ndarray) -> np.ndarray:
if self.is_chardata and data.dtype.kind == "U":
# N.B. it is also possible to pass a byte array (dtype "S1"),
# to be written directly, without processing.
try:
# N.B. write encoding *default* is "ascii" --> fails bad content
encoding = self.write_encoding
strlen = self.n_chars_dim
data = encode_stringarray_as_bytearray(data, encoding, strlen)
except UnicodeEncodeError as err:
msg = (
f"String data written to netcdf character variable {self.varname!r} "
f"could not be represented in encoding {self.write_encoding!r}. "
"This can be fixed by setting a suitable variable '_Encoding' "
'attribute, e.g. <variable>._Encoding="UTF-8".'
)
raise ValueError(msg) from err
# N.B. write encoding *default* is "ascii" --> fails bad content
encoding = self.write_encoding
strlen = self.n_chars_dim
data = encode_stringarray_as_bytearray(data, encoding, strlen, self.varname)

return data


Expand All @@ -252,6 +234,98 @@ def context(self, perform_decoding: bool):
DEFAULT_WRITE_ENCODING = "ascii"


@dataclasses.dataclass
class EncodingWidthRelations:
"""Encode the default string-width <-> byte-dimension relations.

These translations are just a "best guess"...

When translating bytes (dtype S1) to strings (dtype Uxx), the chosen (default)
string width may be longer than is needed for the actual content. But it is at
least "safe".

When translating strings to bytes, we *can* get more bytes than the default
byte dimension length, and the code will then truncate
( with a warning : see '_identify_encoding' ).
This can be avoided if necessary, in specific cases, by recasting the data to a
dtype with greater width (Uxx).
"""

nchars_2_nbytes: Callable[[int], int]
nbytes_2_nchars: Callable[[int], int]


_ENCODING_WIDTH_TRANSLATIONS = {
"ascii": EncodingWidthRelations(lambda x: x, lambda x: x),
"utf-8": EncodingWidthRelations(lambda x: x, lambda x: x),
"utf-16": EncodingWidthRelations(
nchars_2_nbytes=lambda x: (x + 1) * 2,
nbytes_2_nchars=lambda x: x // 2 - 1,
),
"utf-32": EncodingWidthRelations(
nchars_2_nbytes=lambda x: (x + 1) * 4,
nbytes_2_nchars=lambda x: x // 4 - 1,
),
}
SUPPORTED_ENCODINGS = list(_ENCODING_WIDTH_TRANSLATIONS.keys())


def _identify_encoding(encoding, var_name: str, writing: bool = False) -> str:
    """Normalise an encoding name + check it is supported.

    Parameters
    ----------
    encoding : Any
        Select an encoding : None, or a string, or anything printable (via str()).
    var_name : str
        Name of the relevant dataset variable (i.e. 'var_name') :
        used only to produce warning messages.
    writing : bool
        Specify whether reading or writing, which affects any *default* return value,
        i.e. select between DEFAULT_READ_ENCODING / DEFAULT_WRITE_ENCODING.

    Returns
    -------
    str
        If 'encoding' is given and supported, a normalised encoding name
        -- i.e. always one of SUPPORTED_ENCODINGS.
        If not given, the default encoding name.
        If given **but not recognised/supported**, a warning is also emitted
        (and the default returned).
    """
    default = DEFAULT_WRITE_ENCODING if writing else DEFAULT_READ_ENCODING
    if encoding is None:
        # No encoding requested : quietly use the default.
        return default

    encoding = str(encoding)
    try:
        # Normalise the name : NB it must be recognised by Python "codecs".
        normalised = codecs.lookup(encoding).name
    except LookupError:
        normalised = None

    if normalised in SUPPORTED_ENCODINGS:
        return normalised

    # Either "codecs" did not recognise the name, or it is a codec which we do
    # not support : handle this as just a warning, and fall back to the default.
    msg = (
        f"Ignoring unsupported encoding for netCDF variable {var_name!r}: "
        f"_Encoding = {encoding!r}, is not recognised as one of the supported "
        f"encodings, {SUPPORTED_ENCODINGS}."
    )
    warntype = IrisCfSaveWarning if writing else IrisCfLoadWarning
    warnings.warn(msg, category=warntype)
    return default


class EncodedVariable(VariableWrapper):
"""A variable wrapper that translates variable data according to byte encodings."""

Expand Down
67 changes: 38 additions & 29 deletions lib/iris/fileformats/netcdf/saver.py
Original file line number Diff line number Diff line change
Expand Up @@ -1833,17 +1833,46 @@ def _create_generic_cf_array_var(
if not is_dataless and np.issubdtype(data.dtype, np.str_):
# Deal with string-type variables.
# Typically CF label variables, but also possibly ancil-vars ?

# NOTE: all we are doing here is to calculate the byte dimension length,
# based on the dtype and any encoding attribute.
# The actual char --> byte data *translation* is done by the variable,
# being a _bytecoding_datasets.EncodedVariable.
string_dimension_depth = data.dtype.itemsize

if data.dtype.kind == "U":
encoding = element.attributes.get("_Encoding", "ascii")
# TODO: this can fail -- use a sensible warning + default?
encoding = codecs.lookup(encoding).name
if encoding == "utf-32":
# UTF-32 is a special case -- always 4 exactly bytes per char, plus 4
string_dimension_depth += 4
else:
# generally, 4 bytes per char in numpy --> make bytewidth = string-width
string_dimension_depth //= 4
# String content (U) instead of bytes (S).
# For numpy strings, itemsize is **always** a multiple of 4
if string_dimension_depth % 4 != 0:
msg = (
"Unexpected numpy string 'itemsize' for element "
f"{cube_or_mesh.name()}: "
f"'dtype.itemsize = {string_dimension_depth}, expected "
"a multiple of four (always)."
)
raise ValueError(msg)
nchars = string_dimension_depth // 4

encoding_attr = element.attributes.get("_Encoding", "ascii")
# Look this up + return a supported encoding name
# NB implements defaults and raises a warning if given not recognised.
encoding = bytecoding_datasets._identify_encoding(
encoding=encoding_attr, var_name=cf_name, writing=True
)
width_fns = bytecoding_datasets._ENCODING_WIDTH_TRANSLATIONS[encoding]
string_dimension_depth = width_fns.nchars_2_nbytes(nchars)
else:
if data.dtype.kind != "S" or data.dtype.itemsize != 1:
# Some type of data we don't "understand".
# NB this includes "Sxx" types other than "S1" : It seems that
# netCDF4 can treat Sxx as if it was Uxx, as least if there is an
# _Encoding attribute. But we don't support that type in Iris.
msg = (
f"Variable {cf_name!r} has unexpected string/character dtype, "
f"{data.dtype} -- should be either 'S' or 'U' type."
)
raise ValueError(msg)

string_dimension_name = "string%d" % string_dimension_depth

# Determine whether to create the string length dimension.
Expand All @@ -1861,26 +1890,6 @@ def _create_generic_cf_array_var(

# Create the label coordinate variable.
cf_var = self._dataset.createVariable(cf_name, "|S1", element_dims)

# # Convert data from an array of strings into a character array
# # with an extra string-length dimension.
# if len(element_dims) == 1:
# # Scalar variable (only has string dimension).
# data_first = data[0]
# if is_lazy_data(data_first):
# data_first = dask.compute(data_first)
# data = list("%- *s" % (string_dimension_depth, data_first))
# else:
# # NOTE: at present, can't do this lazily??
# orig_shape = data.shape
# new_shape = orig_shape + (string_dimension_depth,)
# new_data = np.zeros(new_shape, cf_var.dtype)
# for index in np.ndindex(orig_shape):
# index_slice = tuple(list(index) + [slice(None, None)])
# new_data[index_slice] = list(
# "%- *s" % (string_dimension_depth, data[index])
# )
# data = new_data
else:
# A normal (numeric) variable.
# ensure a valid datatype for the file format.
Expand Down
Loading
Loading