Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions lib/iris/fileformats/netcdf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
DECODE_TO_STRINGS_ON_READ,
DEFAULT_READ_ENCODING,
DEFAULT_WRITE_ENCODING,
SUPPORTED_ENCODINGS,
)
from .loader import DEBUG, NetCDFDataProxy, load_cubes
from .saver import (
Expand All @@ -53,6 +54,7 @@
"MESH_ELEMENTS",
"NetCDFDataProxy",
"SPATIO_TEMPORAL_AXES",
"SUPPORTED_ENCODINGS",
"Saver",
"UnknownCellMethodWarning",
"load_cubes",
Expand Down
216 changes: 145 additions & 71 deletions lib/iris/fileformats/netcdf/_bytecoding_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
import contextlib
import dataclasses
import threading
from typing import Callable
import warnings

import numpy as np
Expand All @@ -59,7 +60,7 @@


def decode_bytesarray_to_stringarray(
byte_array: np.ndarray, encoding: str, string_width: int
byte_array: np.ndarray, encoding: str, string_width: int, var_name: str
) -> np.ndarray:
"""Convert an array of bytes to an array of strings, with one less dimension.

Expand All @@ -76,13 +77,24 @@ def decode_bytesarray_to_stringarray(
for ndindex in np.ndindex(var_shape):
element_bytes = byte_array[ndindex]
bytes = b"".join([b or b"\0" for b in element_bytes])
string = bytes.decode(encoding)
try:
string = bytes.decode(encoding)
except UnicodeDecodeError as err:
msg = (
f"Character data in variable {var_name!r} could not be decoded "
f"with the {encoding!r} encoding. This can be fixed by setting the "
"variable '_Encoding' attribute to suit the content."
)
raise ValueError(msg) from err
result[ndindex] = string
return result


def encode_stringarray_as_bytearray(
data: np.typing.ArrayLike, encoding: str, string_dimension_length: int
data: np.typing.ArrayLike,
encoding: str,
string_dimension_length: int,
var_name: str,
) -> np.ndarray:
"""Encode strings as a bytes array."""
data = np.asanyarray(data)
Expand All @@ -91,15 +103,28 @@ def encode_stringarray_as_bytearray(
right_pad = b"\0" * string_dimension_length
for index in np.ndindex(element_shape):
string = data[index]
bytes = string.encode(encoding=encoding)
try:
bytes = string.encode(encoding=encoding)
except UnicodeEncodeError as err:
msg = (
f"String data written to netcdf character variable {var_name!r} "
f"could not be represented in encoding {encoding!r}. "
"This can be fixed by setting a suitable variable '_Encoding' "
'attribute, e.g. variable._Encoding="UTF-8".'
)
raise ValueError(msg) from err

n_bytes = len(bytes)
# TODO: may want to issue warning or error if we overflow the length?
if n_bytes > string_dimension_length:
from iris.exceptions import TranslationError

msg = (
f"String {string!r} written to netcdf exceeds string dimension after "
f"encoding : {n_bytes} > {string_dimension_length}."
f"String '{string}' written into netcdf variable {var_name!r} with "
f"encoding {encoding!r} is {n_bytes} bytes long, which exceeds the "
f"string dimension length, {string_dimension_length}. "
'This can be fixed by converting the data to a "wider" string dtype, '
f'e.g. cube.data = cube.data.astype("U{n_bytes}").'
)
raise TranslationError(msg)

Expand All @@ -117,8 +142,8 @@ class VariableEncoder:
varname: str # just for the error messages
dtype: np.dtype
is_chardata: bool # just a shortcut for the dtype test
read_encoding: str # IF 'is_chardata': a valid encoding from the codecs package
write_encoding: str # IF 'is_chardata': a valid encoding from the codecs package
read_encoding: str # IF 'is_chardata': one of the supported encodings
write_encoding: str # IF 'is_chardata': one of the supported encodings
n_chars_dim: int # IF 'is_chardata': length of associated character dimension
string_width: int # IF 'is_chardata': width when viewed as strings (i.e. "Uxx")

Expand All @@ -138,94 +163,51 @@ def __init__(self, cf_var):
self.dtype = cf_var.dtype
self.is_chardata = np.issubdtype(self.dtype, np.bytes_)
if self.is_chardata:
self.read_encoding = self._get_encoding(cf_var, writing=False)
self.write_encoding = self._get_encoding(cf_var, writing=True)
encoding_attr = getattr(cf_var, "_Encoding", None)
self.read_encoding = _identify_encoding(
encoding_attr, var_name=cf_var.name, writing=False
)
self.write_encoding = _identify_encoding(
encoding_attr, var_name=cf_var.name, writing=True
)
n_chars_dim = 1 # default to 1 for a scalar var
if len(cf_var.dimensions) >= 1:
dim_name = cf_var.dimensions[-1]
if dim_name in cf_var.group().dimensions:
n_chars_dim = cf_var.group().dimensions[dim_name].size
self.n_chars_dim = n_chars_dim
self.string_width = self._get_string_width(cf_var)
self.string_width = self._get_string_width()

@staticmethod
def _get_encoding(cf_var, writing=False) -> str:
"""Get the byte encoding defined for this variable (or None)."""
result = getattr(cf_var, "_Encoding", None)
if result is not None:
try:
# Accept + normalise naming of encodings
result = codecs.lookup(result).name
# NOTE: if encoding does not suit data, errors can occur.
# For example, _Encoding = "ascii", with non-ascii content.
except LookupError:
# Unrecognised encoding name : handle this as just a warning
msg = (
f"Ignoring unknown encoding for variable {cf_var.name!r}: "
f"_Encoding = {result!r}."
)
warntype = IrisCfSaveWarning if writing else IrisCfLoadWarning
warnings.warn(msg, category=warntype)
# Proceed as if there is no specified encoding
result = None

if result is None:
if writing:
result = DEFAULT_WRITE_ENCODING
else:
result = DEFAULT_READ_ENCODING
return result

def _get_string_width(self, cf_var) -> int:
def _get_string_width(self) -> int:
"""Return the string-length defined for this variable."""
# Work out the actual byte width from the parent dataset dimensions.
strlen = self.n_chars_dim
n_bytes = self.n_chars_dim
# Convert the string dimension length (i.e. bytes) to a sufficiently-long
# string width, depending on the (read) encoding used.
encoding = self.read_encoding
if "utf-16" in encoding:
# Each char needs at least 2 bytes -- including a terminator char
strlen = (strlen // 2) - 1
elif "utf-32" in encoding:
# Each char needs exactly 4 bytes -- including a terminator char
strlen = (strlen // 4) - 1
# "ELSE": assume there can be (at most) as many chars as bytes
return strlen
n_chars = _ENCODING_WIDTH_TRANSLATIONS[encoding].nbytes_2_nchars(n_bytes)
return n_chars

def decode_bytes_to_stringarray(self, data: np.ndarray) -> np.ndarray:
if self.is_chardata:
# N.B. read encoding default is UTF-8 --> a "usually safe" choice
encoding = self.read_encoding
strlen = self.string_width
try:
data = decode_bytesarray_to_stringarray(data, encoding, strlen)
except UnicodeDecodeError as err:
msg = (
f"Character data in variable {self.varname!r} could not be decoded "
f"with the {encoding!r} encoding. This can be fixed by setting the "
"variable '_Encoding' attribute to suit the content."
)
raise ValueError(msg) from err
data = decode_bytesarray_to_stringarray(
data, encoding, strlen, self.varname
)

return data

def encode_strings_as_bytearray(self, data: np.ndarray) -> np.ndarray:
if self.is_chardata and data.dtype.kind == "U":
# N.B. it is also possible to pass a byte array (dtype "S1"),
# to be written directly, without processing.
try:
# N.B. write encoding *default* is "ascii" --> fails bad content
encoding = self.write_encoding
strlen = self.n_chars_dim
data = encode_stringarray_as_bytearray(data, encoding, strlen)
except UnicodeEncodeError as err:
msg = (
f"String data written to netcdf character variable {self.varname!r} "
f"could not be represented in encoding {self.write_encoding!r}. "
"This can be fixed by setting a suitable variable '_Encoding' "
'attribute, e.g. <variable>._Encoding="UTF-8".'
)
raise ValueError(msg) from err
# N.B. write encoding *default* is "ascii" --> fails bad content
encoding = self.write_encoding
strlen = self.n_chars_dim
data = encode_stringarray_as_bytearray(data, encoding, strlen, self.varname)

return data


Expand All @@ -252,6 +234,98 @@ def context(self, perform_decoding: bool):
DEFAULT_WRITE_ENCODING = "ascii"


@dataclasses.dataclass
class EncodingWidthRelations:
"""Encode the default string-width <-> byte-dimension relations.

These translations are just a "best guess"...

When translating bytes (dtype S1) to strings (dtype Uxx), the chosen (default)
string width may be longer than is needed for the actual content. But it is at
least "safe".

When translating strings to bytes, we *can* get more bytes than the default
byte dimension length, and the code will then truncate
( with a warning : see '_identify_encoding' ).
This can be avoided if necessary, in specific cases, by recasting the data to a
dtype with greater width (Uxx).
"""

nchars_2_nbytes: Callable[[int], int]
nbytes_2_nchars: Callable[[int], int]


_ENCODING_WIDTH_TRANSLATIONS = {
"ascii": EncodingWidthRelations(lambda x: x, lambda x: x),
"utf-8": EncodingWidthRelations(lambda x: x, lambda x: x),
"utf-16": EncodingWidthRelations(
nchars_2_nbytes=lambda x: (x + 1) * 2,
nbytes_2_nchars=lambda x: x // 2 - 1,
),
"utf-32": EncodingWidthRelations(
nchars_2_nbytes=lambda x: (x + 1) * 4,
nbytes_2_nchars=lambda x: x // 4 - 1,
),
}
SUPPORTED_ENCODINGS = list(_ENCODING_WIDTH_TRANSLATIONS.keys())


def _identify_encoding(encoding, var_name: str, writing: bool = False) -> str:
    """Normalise an encoding name + check it is supported.

    Parameters
    ----------
    encoding : Any
        Select an encoding : None, or a string, or anything printable (via str()).
    var_name : str
        Name of the relevant dataset variable (i.e. 'var_name') :
        used only to produce warning messages.
    writing : bool
        Specify whether reading or writing, which affects any *default* return value,
        i.e. select between DEFAULT_READ_ENCODING / DEFAULT_WRITE_ENCODING.

    Returns
    -------
    str
        If 'encoding' is given and supported, a normalised encoding name
        -- i.e. always one of SUPPORTED_ENCODINGS.
        If not given, the default encoding name.
        If given **but not recognised/supported**, a warning is also emitted
        (and the default returned).
    """
    default = DEFAULT_WRITE_ENCODING if writing else DEFAULT_READ_ENCODING
    if encoding is None:
        # No encoding requested : quietly use the default.
        return default

    encoding = str(encoding)
    try:
        # Normalise the name : NB it must be recognised by Python "codecs".
        normalised = codecs.lookup(encoding).name
    except LookupError:
        normalised = None

    if normalised in SUPPORTED_ENCODINGS:
        return normalised

    # Either "codecs" did not recognise the name, or it is a codec which we do
    # not support : handle this as just a warning, and fall back to the default.
    msg = (
        f"Ignoring unsupported encoding for netCDF variable {var_name!r}: "
        f"_Encoding = {encoding!r}, is not recognised as one of the supported "
        f"encodings, {SUPPORTED_ENCODINGS}."
    )
    warntype = IrisCfSaveWarning if writing else IrisCfLoadWarning
    warnings.warn(msg, category=warntype)
    return default


class EncodedVariable(VariableWrapper):
"""A variable wrapper that translates variable data according to byte encodings."""

Expand Down
67 changes: 38 additions & 29 deletions lib/iris/fileformats/netcdf/saver.py
Original file line number Diff line number Diff line change
Expand Up @@ -1833,17 +1833,46 @@ def _create_generic_cf_array_var(
if not is_dataless and np.issubdtype(data.dtype, np.str_):
# Deal with string-type variables.
# Typically CF label variables, but also possibly ancil-vars ?

# NOTE: all we are doing here is to calculate the byte dimension length,
# based on the dtype and any encoding attribute.
# The actual char --> byte data *translation* is done by the variable,
# being a _bytecoding_datasets.EncodedVariable.
string_dimension_depth = data.dtype.itemsize

if data.dtype.kind == "U":
encoding = element.attributes.get("_Encoding", "ascii")
# TODO: this can fail -- use a sensible warning + default?
encoding = codecs.lookup(encoding).name
if encoding == "utf-32":
# UTF-32 is a special case -- always 4 exactly bytes per char, plus 4
string_dimension_depth += 4
else:
# generally, 4 bytes per char in numpy --> make bytewidth = string-width
string_dimension_depth //= 4
# String content (U) instead of bytes (S).
# For numpy strings, itemsize is **always** a multiple of 4
if string_dimension_depth % 4 != 0:
msg = (
"Unexpected numpy string 'itemsize' for element "
f"{cube_or_mesh.name()}: "
f"'dtype.itemsize = {string_dimension_depth}, expected "
"a multiple of four (always)."
)
raise ValueError(msg)
nchars = string_dimension_depth // 4

encoding_attr = element.attributes.get("_Encoding", "ascii")
# Look this up + return a supported encoding name
# NB implements defaults and raises a warning if given not recognised.
encoding = bytecoding_datasets._identify_encoding(
encoding=encoding_attr, var_name=cf_name, writing=True
)
width_fns = bytecoding_datasets._ENCODING_WIDTH_TRANSLATIONS[encoding]
string_dimension_depth = width_fns.nchars_2_nbytes(nchars)
else:
if data.dtype.kind != "S" or data.dtype.itemsize != 1:
# Some type of data we don't "understand".
# NB this includes "Sxx" types other than "S1" : It seems that
# netCDF4 can treat Sxx as if it was Uxx, as least if there is an
# _Encoding attribute. But we don't support that type in Iris.
msg = (
f"Variable {cf_name!r} has unexpected string/character dtype, "
f"{data.dtype} -- should be either 'S' or 'U' type."
)
raise ValueError(msg)

string_dimension_name = "string%d" % string_dimension_depth

# Determine whether to create the string length dimension.
Expand All @@ -1861,26 +1890,6 @@ def _create_generic_cf_array_var(

# Create the label coordinate variable.
cf_var = self._dataset.createVariable(cf_name, "|S1", element_dims)

# # Convert data from an array of strings into a character array
# # with an extra string-length dimension.
# if len(element_dims) == 1:
# # Scalar variable (only has string dimension).
# data_first = data[0]
# if is_lazy_data(data_first):
# data_first = dask.compute(data_first)
# data = list("%- *s" % (string_dimension_depth, data_first))
# else:
# # NOTE: at present, can't do this lazily??
# orig_shape = data.shape
# new_shape = orig_shape + (string_dimension_depth,)
# new_data = np.zeros(new_shape, cf_var.dtype)
# for index in np.ndindex(orig_shape):
# index_slice = tuple(list(index) + [slice(None, None)])
# new_data[index_slice] = list(
# "%- *s" % (string_dimension_depth, data[index])
# )
# data = new_data
else:
# A normal (numeric) variable.
# ensure a valid datatype for the file format.
Expand Down
Loading
Loading