Add: Support for NaNs in FixedScaleOffset (zarr-developers#511)

This commit addresses issue zarr-developers#511 by adding support for handling NaN inputs in the FixedScaleOffset class. The changes include: - Introduced a check to ensure that, when a fill_value is provided, the input dtype is floating-point. - Required fill_value to be an integer. - Updated type and casting validation to ensure that fill_value can be safely cast to the specified astype (e.g., a fill_value of 3000 cannot be cast to int8). - Only float -> int -> float transformations are supported, as float -> float already natively supports NaNs without a fill_value. - Added tests for the fill_value options. - Added fixtures for the fill_value version of fixedscaleoffset. References: zarr-developers#511
mps01060 · Oct 7, 2024 · c4779f3 · c4779f3
1 parent 71fcab0
commit c4779f3
Show file tree

Hide file tree

Showing 10 changed files with 123 additions and 7 deletions.
diff --git a/fixture/fixedscaleoffset/fill_value/array.00.npy b/fixture/fixedscaleoffset/fill_value/array.00.npy
diff --git a/fixture/fixedscaleoffset/fill_value/array.01.npy b/fixture/fixedscaleoffset/fill_value/array.01.npy
diff --git a/fixture/fixedscaleoffset/fill_value/codec.00/config.json b/fixture/fixedscaleoffset/fill_value/codec.00/config.json
@@ -0,0 +1,8 @@
+{
+    "astype": "<i2",
+    "dtype": "<f4",
+    "fill_value": -32768,
+    "id": "fixedscaleoffset",
+    "offset": 0,
+    "scale": 100
+}
diff --git a/fixture/fixedscaleoffset/fill_value/codec.00/encoded.00.dat b/fixture/fixedscaleoffset/fill_value/codec.00/encoded.00.dat
diff --git a/fixture/fixedscaleoffset/fill_value/codec.00/encoded.01.dat b/fixture/fixedscaleoffset/fill_value/codec.00/encoded.01.dat
diff --git a/fixture/fixedscaleoffset/fill_value/codec.01/config.json b/fixture/fixedscaleoffset/fill_value/codec.01/config.json
@@ -0,0 +1,8 @@
+{
+    "astype": "<i2",
+    "dtype": "<f4",
+    "fill_value": -32768,
+    "id": "fixedscaleoffset",
+    "offset": 10,
+    "scale": 100
+}
diff --git a/fixture/fixedscaleoffset/fill_value/codec.01/encoded.00.dat b/fixture/fixedscaleoffset/fill_value/codec.01/encoded.00.dat
diff --git a/fixture/fixedscaleoffset/fill_value/codec.01/encoded.01.dat b/fixture/fixedscaleoffset/fill_value/codec.01/encoded.01.dat
diff --git a/numcodecs/fixedscaleoffset.py b/numcodecs/fixedscaleoffset.py
@@ -21,12 +21,24 @@ class FixedScaleOffset(Codec):
         Data type to use for decoded data.
     astype : dtype, optional
         Data type to use for encoded data.
+    fill_value : int, optional
+        Integer used to represent NaNs in the encoded data when `astype` is an
+        integer data type, so that NaNs survive a round trip: they are encoded as
+        `fill_value` and decoded back to NaN. This plays the same role as the
+        `_FillValue` attribute that accompanies `add_offset` and `scale_factor`
+        in netCDF4. `dtype` must be a floating-point type and `fill_value` must
+        be representable in `astype`. Passing `fill_value` together with a
+        floating-point `astype` raises `NotImplementedError`. If not provided,
+        NaNs receive no special handling.
 
     Notes
     -----
-    If `astype` is an integer data type, please ensure that it is
-    sufficiently large to store encoded values. No checks are made and data
-    may become corrupted due to integer overflow if `astype` is too small.
+    If `astype` is an integer data type, please ensure that it is sufficiently
+    large to store encoded values. No checks are made and data may become corrupted
+    due to integer overflow if `astype` is too small.
+
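+    When `fill_value` is provided and `astype` is an integer dtype, NaNs are
+    encoded as this value and are decoded back to NaNs during the reverse
+    transformation. Any encoded element equal to `fill_value` is decoded as NaN,
+    so choose a value that valid data can never produce (for example the minimum
+    of `astype`). Combining `fill_value` with a floating-point `astype` is not
+    implemented, because floats natively support NaNs and need no fill value.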
 
     Examples
     --------
@@ -61,6 +73,18 @@ class FixedScaleOffset(Codec):
     >>> z3
     array([1000.   , 1000.111, 1000.222, 1000.333, 1000.444, 1000.556,
            1000.667, 1000.778, 1000.889, 1001.   ])
+    >>> x_nans = np.linspace(0, 0.1, 10, dtype='f4')
+    >>> x_nans[0] = np.nan
+    >>> x_nans
+    array([       nan, 0.01111111, 0.02222222, 0.03333334, 0.04444445,
+           0.05555556, 0.06666667, 0.07777778, 0.08888889, 0.1       ], dtype=float32)
+    >>> codec = numcodecs.FixedScaleOffset(offset=0, scale=100, dtype='f4', astype='i2', fill_value=-32768)
+    >>> y4 = codec.encode(x_nans)
+    >>> y4
+    array([-32768, 1, 2, 3, 4, 6, 7, 8, 9, 10], dtype=int16)
+    >>> z4 = codec.decode(y4)
+    >>> z4
+    array([ nan, 0.01, 0.02, 0.03, 0.04, 0.06, 0.07, 0.08, 0.09, 0.1 ], dtype=float32)
 
     See Also
     --------
@@ -70,7 +94,7 @@ class FixedScaleOffset(Codec):
 
     codec_id = 'fixedscaleoffset'
 
-    def __init__(self, offset, scale, dtype, astype=None):
+    def __init__(self, offset, scale, dtype, astype=None, fill_value=None):
         self.offset = offset
         self.scale = scale
         self.dtype = np.dtype(dtype)
@@ -80,6 +104,26 @@ def __init__(self, offset, scale, dtype, astype=None):
             self.astype = np.dtype(astype)
         if self.dtype == np.dtype(object) or self.astype == np.dtype(object):
             raise ValueError('object arrays are not supported')
+        if fill_value is not None:
+            if np.issubdtype(self.astype, np.floating):
+                raise NotImplementedError(
+                    'Encoding floats to floats does not require a fill_value '
+                    'since floats natively support NaNs.'
+                )
+            if not np.issubdtype(self.dtype, np.floating):
+                raise TypeError(
+                    f'fill_value requires a floating-point input dtype, but got dtype "{self.dtype}".'
+                )
+            if not isinstance(fill_value, (int, np.integer)):
+                raise TypeError('fill_value must be an integer value')
+            if not np.can_cast(fill_value, self.astype, casting='safe'):
+                raise ValueError(
+                    f'fill_value "{fill_value}"" cannot be safely cast to output dtype "{self.astype}"'
+                )
+            # Convert NumPy integer to Python native types for JSON serialization compatibility
+            if isinstance(fill_value, np.integer):
+                fill_value = int(fill_value)
+        self.fill_value = fill_value
 
     def encode(self, buf):
         # normalise input
@@ -91,6 +135,10 @@ def encode(self, buf):
         # compute scale offset
         enc = (arr - self.offset) * self.scale
 
+        # change nans to fill_value
+        if self.fill_value is not None:
+            enc[np.isnan(enc)] = self.fill_value
+
         # round to nearest integer
         enc = np.around(enc)
 
@@ -109,6 +157,10 @@ def decode(self, buf, out=None):
         # decode scale offset
         dec = (enc / self.scale) + self.offset
 
+        # convert fill_values to nans
+        if self.fill_value is not None:
+            dec[enc==self.fill_value] = np.nan
+
         # convert dtype
         dec = dec.astype(self.dtype, copy=False)
 
@@ -122,12 +174,15 @@ def get_config(self):
             scale=self.scale,
             offset=self.offset,
             dtype=self.dtype.str,
+            fill_value=self.fill_value,
             astype=self.astype.str,
         )
 
     def __repr__(self):
         r = f'{type(self).__name__}(scale={self.scale}, offset={self.offset}, dtype={self.dtype.str!r}'
         if self.astype != self.dtype:
             r += f', astype={self.astype.str!r}'
+        if self.fill_value is not None:
+            r += f', fill_value={self.fill_value}'
         r += ')'
         return r
diff --git a/numcodecs/tests/test_fixedscaleoffset.py b/numcodecs/tests/test_fixedscaleoffset.py
@@ -32,11 +32,25 @@
     FixedScaleOffset(offset=1000, scale=10**12, dtype='<f8'),
 ]
 
+nan_arrays = [
+    np.array([np.nan, 0.0111, 0.0222, 0.0333, 0.0444, 0.0555, 0.0666, 0.0777, 0.0888, 0.1], dtype='f4'),
+    np.array([np.nan, 0.0111, 0.0222, 0.0333, 0.0444, 0.0555, 0.0666, 0.0777, 0.0888, 0.1], dtype='f4') + 10,
+]
+
+nan_codecs = [
+    FixedScaleOffset(offset=0, scale=100, dtype='<f4', astype='<i2', fill_value=np.iinfo('<i2').min),
+    FixedScaleOffset(offset=10, scale=100, dtype='<f4', astype='<i2', fill_value=np.iinfo('<i2').min),
+]
+
 
 def test_encode_decode():
     for arr, codec in itertools.product(arrays, codecs):
         precision = int(np.log10(codec.scale))
         check_encode_decode(arr, codec, precision=precision)
+
+    for arr, codec in zip(nan_arrays, nan_codecs):
+        precision = int(np.log10(codec.scale))
+        check_encode_decode(arr, codec, precision=precision)
 
 
 def test_encode():
@@ -50,23 +64,54 @@ def test_encode():
     assert np.dtype(astype) == actual.dtype
 
 
+def test_encode_nan():
+    dtype = '<f4'
+    astype = '<i2'
+    codec = FixedScaleOffset(
+        offset=0, scale=100, dtype=dtype, astype=astype, fill_value=np.iinfo(astype).min
+    )
+    arr = np.array(
+        [np.nan, 0.0111, 0.0222, 0.0333, 0.0444, 0.0555, 0.0666, 0.0777, 0.0888, 0.1],
+        dtype=dtype
+    )
+    expect = np.array([-32768, 1, 2, 3, 4, 6, 7, 8, 9, 10], dtype=astype)
+    actual = codec.encode(arr)
+    assert_array_equal(expect, actual)
+    assert np.dtype(astype) == actual.dtype
+
+
 def test_config():
-    codec = FixedScaleOffset(dtype='<f8', astype='<i4', scale=10, offset=100)
-    check_config(codec)
+    for codec in codecs + nan_codecs:
+        check_config(codec)
 
 
 def test_repr():
     stmt = "FixedScaleOffset(scale=10, offset=100, dtype='<f8', astype='<i4')"
     check_repr(stmt)
+    stmt = "FixedScaleOffset(scale=100, offset=0, dtype='<f4', astype='<i2', fill_value=-32768)"
+    check_repr(stmt)
 
 
 def test_backwards_compatibility():
     precision = [int(np.log10(codec.scale)) for codec in codecs]
     check_backwards_compatibility(FixedScaleOffset.codec_id, arrays, codecs, precision=precision)
-
+    precision = [int(np.log10(codec.scale)) for codec in nan_codecs]
+    check_backwards_compatibility(FixedScaleOffset.codec_id, nan_arrays, nan_codecs, precision=precision, prefix='fill_value')
 
 def test_errors():
     with pytest.raises(ValueError):
         FixedScaleOffset(dtype=object, astype='i4', scale=10, offset=100)
     with pytest.raises(ValueError):
         FixedScaleOffset(dtype='f8', astype=object, scale=10, offset=100)
+    with pytest.raises(TypeError):
+        FixedScaleOffset(
+            offset=0, scale=100, dtype='i4', astype='i2', fill_value=np.iinfo('i2').min
+        )
+    with pytest.raises(TypeError):
+        FixedScaleOffset(offset=0, scale=100, dtype='f4', astype='i2', fill_value='bad')
+    with pytest.raises(TypeError):
+        FixedScaleOffset(offset=0, scale=100, dtype='f4', astype='i2', fill_value=3.1)
+    with pytest.raises(NotImplementedError):
+        FixedScaleOffset(offset=0, scale=100, dtype='f8', astype='f4', fill_value=3.1)
+    with pytest.raises(ValueError):
+        FixedScaleOffset(offset=0, scale=100, dtype='f8', astype='i1', fill_value=3000)