Skip to content

Commit

Permalink
Add: Support for NaNs in FixedScaleOffset (zarr-developers#511)
Browse files Browse the repository at this point in the history
This commit addresses issue zarr-developers#511 by adding support for handling NaN
inputs in the FixedScaleOffset class. The changes include:

- Introduced a check to ensure that when a fill_value is provided, the
  input dtype must be floating-point.
- fill_value must be an integer value (Python int or NumPy integer)
- Updated type and casting validation to ensure that fill_value can be
  safely cast to the specified astype (e.g. a fill_value of 3000 cannot
  be cast to int8)
- Only support float -> int -> float transformations, as float -> float
  already natively supports NaNs without a fill_value
- Added tests for fill_value options
- Added fixtures for fill_value version of fixedscaleoffset

References: zarr-developers#511
  • Loading branch information
mps01060 committed Oct 7, 2024
1 parent 71fcab0 commit c4779f3
Show file tree
Hide file tree
Showing 10 changed files with 123 additions and 7 deletions.
Binary file added fixture/fixedscaleoffset/fill_value/array.00.npy
Binary file not shown.
Binary file added fixture/fixedscaleoffset/fill_value/array.01.npy
Binary file not shown.
8 changes: 8 additions & 0 deletions fixture/fixedscaleoffset/fill_value/codec.00/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"astype": "<i2",
"dtype": "<f4",
"fill_value": -32768,
"id": "fixedscaleoffset",
"offset": 0,
"scale": 100
}
Binary file not shown.
Binary file not shown.
8 changes: 8 additions & 0 deletions fixture/fixedscaleoffset/fill_value/codec.01/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"astype": "<i2",
"dtype": "<f4",
"fill_value": -32768,
"id": "fixedscaleoffset",
"offset": 10,
"scale": 100
}
Binary file not shown.
Binary file not shown.
63 changes: 59 additions & 4 deletions numcodecs/fixedscaleoffset.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,24 @@ class FixedScaleOffset(Codec):
Data type to use for decoded data.
astype : dtype, optional
Data type to use for encoded data.
fill_value : integer, optional
An integer used to represent NaNs in the encoded data when `astype` is an
integer data type. NaNs are encoded as this value and decoded back to NaN,
which allows them to round-trip. This plays the same role as `_FillValue`
in netCDF4, which is used alongside `add_offset` and `scale_factor`.
`fill_value` is only supported when `dtype` is floating-point and `astype`
is an integer dtype able to hold it; combining it with a floating-point
`astype` raises `NotImplementedError`. If not provided, NaNs receive no
special handling.
Notes
-----
If `astype` is an integer data type, please ensure that it is
sufficiently large to store encoded values. No checks are made and data
may become corrupted due to integer overflow if `astype` is too small.
If `astype` is an integer data type, please ensure that it is sufficiently
large to store encoded values. No checks are made and data may become corrupted
due to integer overflow if `astype` is too small.
When `fill_value` is provided and `astype` is an integer dtype, NaNs are
encoded as this value and are decoded back to NaNs during the reverse
transformation. This is not implemented for astype==float, because `fill_value`
is not required as NaNs are natively supported by floats.
Examples
--------
Expand Down Expand Up @@ -61,6 +73,18 @@ class FixedScaleOffset(Codec):
>>> z3
array([1000. , 1000.111, 1000.222, 1000.333, 1000.444, 1000.556,
1000.667, 1000.778, 1000.889, 1001. ])
>>> x_nans = np.linspace(0, 0.1, 10, dtype='f4')
>>> x_nans[0] = np.nan
>>> x_nans
array([ nan, 0.01111111, 0.02222222, 0.03333334, 0.04444445,
0.05555556, 0.06666667, 0.07777778, 0.08888889, 0.1 ], dtype=float32)
>>> codec = numcodecs.FixedScaleOffset(offset=0, scale=100, dtype='f4', astype='i2', fill_value=-32768)
>>> y4 = codec.encode(x_nans)
>>> y4
array([-32768, 1, 2, 3, 4, 6, 7, 8, 9, 10], dtype=int16)
>>> z4 = codec.decode(y4)
>>> z4
array([ nan, 0.01, 0.02, 0.03, 0.04, 0.06, 0.07, 0.08, 0.09, 0.1 ], dtype=float32)
See Also
--------
Expand All @@ -70,7 +94,7 @@ class FixedScaleOffset(Codec):

codec_id = 'fixedscaleoffset'

def __init__(self, offset, scale, dtype, astype=None):
def __init__(self, offset, scale, dtype, astype=None, fill_value=None):
self.offset = offset
self.scale = scale
self.dtype = np.dtype(dtype)
Expand All @@ -80,6 +104,26 @@ def __init__(self, offset, scale, dtype, astype=None):
self.astype = np.dtype(astype)
if self.dtype == np.dtype(object) or self.astype == np.dtype(object):
raise ValueError('object arrays are not supported')
if fill_value is not None:
    # A fill value only makes sense for float -> integer encodings: a float
    # target can already hold NaN, and a non-float input never contains NaN.
    if np.issubdtype(self.astype, np.floating):
        raise NotImplementedError(
            'Encoding floats to floats does not require a fill_value '
            'since floats natively support NaNs.'
        )
    if not np.issubdtype(self.dtype, np.floating):
        raise TypeError(
            f'fill_value requires a floating-point input dtype, but got dtype "{self.dtype}".'
        )
    # bool is a subclass of int, so it has to be excluded explicitly.
    if isinstance(fill_value, bool) or not isinstance(fill_value, (int, np.integer)):
        raise TypeError('fill_value must be an integer value')
    # Store a native Python int so the codec config stays JSON serializable.
    fill_value = int(fill_value)
    # Check the range by value. np.can_cast cannot be used here: on NumPy >= 2
    # it raises TypeError for Python ints and, for NumPy scalars, compares only
    # the dtypes (so e.g. np.int64(5) would be rejected for an int16 target).
    fits = False
    if np.issubdtype(self.astype, np.integer):
        bounds = np.iinfo(self.astype)
        fits = bounds.min <= fill_value <= bounds.max
    if not fits:
        raise ValueError(
            f'fill_value "{fill_value}" cannot be safely cast to output dtype "{self.astype}"'
        )
self.fill_value = fill_value

def encode(self, buf):
# normalise input
Expand All @@ -91,6 +135,10 @@ def encode(self, buf):
# compute scale offset
enc = (arr - self.offset) * self.scale

# change nans to fill_value
if self.fill_value is not None:
enc[np.isnan(enc)] = self.fill_value

# round to nearest integer
enc = np.around(enc)

Expand All @@ -109,6 +157,10 @@ def decode(self, buf, out=None):
# decode scale offset
dec = (enc / self.scale) + self.offset

# convert fill_values to nans
if self.fill_value is not None:
dec[enc==self.fill_value] = np.nan

# convert dtype
dec = dec.astype(self.dtype, copy=False)

Expand All @@ -122,12 +174,15 @@ def get_config(self):
scale=self.scale,
offset=self.offset,
dtype=self.dtype.str,
fill_value=self.fill_value,
astype=self.astype.str,
)

def __repr__(self):
    """Return a constructor-style description of the codec.

    ``astype`` is listed only when it differs from ``dtype``, and
    ``fill_value`` only when it is not None.
    """
    fields = [
        f'scale={self.scale}',
        f'offset={self.offset}',
        f'dtype={self.dtype.str!r}',
    ]
    if self.astype != self.dtype:
        fields.append(f'astype={self.astype.str!r}')
    if self.fill_value is not None:
        fields.append(f'fill_value={self.fill_value}')
    return f"{type(self).__name__}({', '.join(fields)})"
51 changes: 48 additions & 3 deletions numcodecs/tests/test_fixedscaleoffset.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,25 @@
FixedScaleOffset(offset=1000, scale=10**12, dtype='<f8'),
]

# float32 sample data whose first element is NaN; the second array is the same
# data shifted by 10. Entries pair positionally with ``nan_codecs``.
_nan_base = np.array(
    [np.nan, 0.0111, 0.0222, 0.0333, 0.0444, 0.0555, 0.0666, 0.0777, 0.0888, 0.1],
    dtype='f4',
)
nan_arrays = [_nan_base, _nan_base + 10]

# One int16 codec per array above (offsets 0 and 10), using the smallest int16
# value as the NaN fill value.
nan_codecs = [
    FixedScaleOffset(
        offset=shift, scale=100, dtype='<f4', astype='<i2', fill_value=np.iinfo('<i2').min
    )
    for shift in (0, 10)
]


def test_encode_decode():
    """Run check_encode_decode over all array/codec pairings.

    Plain arrays are crossed with every plain codec; each NaN array is paired
    only with its positionally matching fill_value codec. The precision passed
    is the base-10 logarithm of the codec scale, truncated to an int.
    """
    pairings = itertools.chain(
        itertools.product(arrays, codecs),
        zip(nan_arrays, nan_codecs),
    )
    for data, codec in pairings:
        check_encode_decode(data, codec, precision=int(np.log10(codec.scale)))


def test_encode():
Expand All @@ -50,23 +64,54 @@ def test_encode():
assert np.dtype(astype) == actual.dtype


def test_encode_nan():
    """Encode float32 data starting with NaN and compare with the expected int16 output."""
    decoded_dtype, encoded_dtype = '<f4', '<i2'
    sentinel = np.iinfo(encoded_dtype).min
    codec = FixedScaleOffset(
        offset=0, scale=100, dtype=decoded_dtype, astype=encoded_dtype, fill_value=sentinel
    )
    values = [np.nan, 0.0111, 0.0222, 0.0333, 0.0444, 0.0555, 0.0666, 0.0777, 0.0888, 0.1]
    actual = codec.encode(np.array(values, dtype=decoded_dtype))
    # The NaN slot is expected to come back as the int16 minimum.
    expect = np.array([-32768, 1, 2, 3, 4, 6, 7, 8, 9, 10], dtype=encoded_dtype)
    assert_array_equal(expect, actual)
    assert np.dtype(encoded_dtype) == actual.dtype


def test_config():
# NOTE(review): this page is a diff rendering without +/- markers; the next two
# lines appear to be the previous single-codec body that the loop below
# replaces -- confirm against the repository before relying on them.
codec = FixedScaleOffset(dtype='<f8', astype='<i4', scale=10, offset=100)
check_config(codec)
# Run check_config on every plain codec and every fill_value codec.
for codec in codecs + nan_codecs:
check_config(codec)


def test_repr():
    """Pass constructor strings with and without ``fill_value`` to check_repr."""
    statements = (
        "FixedScaleOffset(scale=10, offset=100, dtype='<f8', astype='<i4')",
        "FixedScaleOffset(scale=100, offset=0, dtype='<f4', astype='<i2', fill_value=-32768)",
    )
    for stmt in statements:
        check_repr(stmt)


def test_backwards_compatibility():
    """Run check_backwards_compatibility for the plain and the fill_value fixtures.

    The fill_value arrays and codecs are checked with ``prefix='fill_value'``;
    the plain ones are checked without a prefix argument.
    """
    cases = (
        (arrays, codecs, {}),
        (nan_arrays, nan_codecs, {'prefix': 'fill_value'}),
    )
    for data, codec_list, extra in cases:
        digits = [int(np.log10(codec.scale)) for codec in codec_list]
        check_backwards_compatibility(
            FixedScaleOffset.codec_id, data, codec_list, precision=digits, **extra
        )

def test_errors():
    """Check that invalid constructor arguments raise the expected exception type."""
    cases = [
        # object dtypes are rejected on either side
        (ValueError, dict(dtype=object, astype='i4', scale=10, offset=100)),
        (ValueError, dict(dtype='f8', astype=object, scale=10, offset=100)),
        # fill_value with a non-float input dtype
        (
            TypeError,
            dict(offset=0, scale=100, dtype='i4', astype='i2', fill_value=np.iinfo('i2').min),
        ),
        # fill_value that is not an integer
        (TypeError, dict(offset=0, scale=100, dtype='f4', astype='i2', fill_value='bad')),
        (TypeError, dict(offset=0, scale=100, dtype='f4', astype='i2', fill_value=3.1)),
        # fill_value combined with a float astype
        (NotImplementedError, dict(offset=0, scale=100, dtype='f8', astype='f4', fill_value=3.1)),
        # fill_value outside the range of astype
        (ValueError, dict(offset=0, scale=100, dtype='f8', astype='i1', fill_value=3000)),
    ]
    for expected, kwargs in cases:
        with pytest.raises(expected):
            FixedScaleOffset(**kwargs)

0 comments on commit c4779f3

Please sign in to comment.