Skip to content

Commit

Permalink
3 byte fields (#32)
Browse files Browse the repository at this point in the history
* Update fromfile to support 3-byte fields

Some FCS files (such as the one generated by the Cytek xP5) store integers in 3-byte fields.  This breaks numpy's parser, which only wants power-of-two sized fields.  So, I've updated fromfile() to parse FCS files as a table of 1-byte unsigned ints, expand those fields to 4 bytes, and then re-view them as the proper dtype.

My only concern is that this may break on an FCS file produced by a big-endian machine (because I'm adding the extra bytes to the beginning of the field).  I don't have ready access to any of those -- do you?

* add tests for 3-byte DATA

* documentation for fromfile() changes
  • Loading branch information
bpteague authored Oct 7, 2021
1 parent 199126a commit 111bf0b
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 6 deletions.
87 changes: 81 additions & 6 deletions fcsparser/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,86 @@

def fromfile(file, dtype, count, *args, **kwargs):
    """Wrapper around np.fromfile to support any file-like object.

    Also supports record dtypes whose fields are NOT a power-of-two
    number of bytes wide.  Some pathological cytometers (e.g. the
    Cytek xP5) store integers in their DATA segment as 3-byte fields,
    which comes in as a dtype like '>u3' and makes numpy's own parser
    fail.  Such records are read as raw one-byte unsigned ints ('u1'),
    each field is zero-padded out to the next power-of-two width, and
    the result is re-viewed as the (widened) requested dtypes.

    Parameters
    ----------
    file : file or file-like
        Binary file (or file-like object) positioned at the DATA segment.
    dtype : str
        Comma-separated field specs, each of the form
        <endian char><type char><byte width> -- e.g. '>u3,>u3'.
    count : int
        Number of records to read.

    Returns
    -------
    numpy.ndarray
        `count` records; any field whose width was not a power of two
        is returned widened to the next power-of-two byte width.
    """
    # how many bytes wide is each requested field, and the whole record?
    dtypes = dtype.split(',')
    field_widths = [int(dt[2:]) for dt in dtypes]
    record_width = sum(field_widths)

    # fast path: every field is already a power-of-two wide, so numpy
    # can parse the requested record dtype directly.
    # (w & (w - 1)) == 0 iff w is a power of two.
    if all(w & (w - 1) == 0 for w in field_widths):
        try:
            return numpy.fromfile(file, dtype=dtype, count=count,
                                  *args, **kwargs)
        except (TypeError, IOError):
            # file-like objects without a real file descriptor
            return numpy.frombuffer(
                file.read(count * numpy.dtype(dtype).itemsize),
                dtype=dtype, count=count, *args, **kwargs)

    # slow path: read the DATA segment as a table of `record_width`
    # one-byte unsigned ints per record...
    raw_dtype = ",".join(['u1'] * record_width)
    try:
        ret = numpy.fromfile(file, dtype=raw_dtype, count=count,
                             *args, **kwargs)
    except (TypeError, IOError):
        ret = numpy.frombuffer(file.read(count * record_width),
                               dtype=raw_dtype, count=count,
                               *args, **kwargs)

    # ...and reshape it into a `count` x `record_width` array of bytes.
    ret = ret.view('u1').reshape((count, record_width))

    ret_dtypes = []
    offset = 0  # column where the current field starts in the PADDED array
    for dt, num_bytes in zip(dtypes, field_widths):
        # Zero bytes must be added on the most-significant side of the
        # field: the front for big-endian fields, the back for
        # little-endian ones (native order resolved via numpy).
        little_endian = (dt[0] == '<'
                         or (dt[0] not in ('<', '>') and numpy.little_endian))

        # pad this field with NUL byte columns until its width is a
        # power of two
        while num_bytes & (num_bytes - 1) != 0:
            insert_at = offset + num_bytes if little_endian else offset
            ret = numpy.insert(ret, insert_at, numpy.zeros(count), axis=1)
            num_bytes += 1

        # record the widened dtype for this field
        ret_dtypes.append(dt[0] + dt[1] + str(num_bytes))

        # advance by the PADDED width so later fields' pad columns land
        # in the right place (using the original widths here would
        # scramble every field after the first padded one)
        offset += num_bytes

    # "cast" the padded bytes back to the (widened) requested dtypes
    return ret.view(','.join(ret_dtypes)).ravel()


class ParserFeatureNotImplementedError(Exception):
Expand Down Expand Up @@ -447,7 +522,7 @@ def read_data(self, file_handle):
if text["$DATATYPE"] == "I":
if len(set(par_numeric_type_list)) > 1:
for channel_number in self.channel_numbers:
valid_bits = numpy.ceil(numpy.log2(numpy.float(text["$P{0}R".format(channel_number)])))
valid_bits = numpy.ceil(numpy.log2(float(text["$P{0}R".format(channel_number)])))

if bytes_per_par_list[channel_number - 1] * 8 == valid_bits:
continue
Expand All @@ -457,7 +532,7 @@ def read_data(self, file_handle):
data[name] = data[name] & bitmask
else:
valid_bits_per_par_list = numpy.array([
2 ** numpy.ceil(numpy.log2(numpy.float(text["$P{0}R".format(i)]))) - 1
2 ** numpy.ceil(numpy.log2(float(text["$P{0}R".format(i)]))) - 1
for i in self.channel_numbers
], dtype=data.dtype)
data &= valid_bits_per_par_list
Expand Down Expand Up @@ -602,4 +677,4 @@ def parse(path, meta_data_only=False, compensate=False, channel_naming='$PnS',
df = fcs_parser.dataframe
df = df.astype(dtype) if dtype else df
return meta, df


Binary file not shown.
6 changes: 6 additions & 0 deletions fcsparser/tests/test_fcs_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
'large fake fcs': os.path.join(BASE_PATH, 'fake_large_fcs', 'fake_large_fcs.fcs'),
'cyflow cube 8': os.path.join(BASE_PATH, 'cyflow_cube_8', 'cyflow_cube_8.fcs'),
'fake bitmask error': os.path.join(BASE_PATH, 'fake_bitmask_error', 'fcs1_cleaned.lmd'),
'Cytek xP5': os.path.join(BASE_PATH, 'Cytek_xP5', 'Cytek_xP5.fcs'),
}

# The group of files below is used for checking behavior other than reading data.
Expand Down Expand Up @@ -215,6 +216,11 @@ def test_mq_FCS_3_1_data_segment(self):
fname = FILE_IDENTIFIER_TO_PATH['mq fcs 3.1']
meta, df = parse_fcs(fname)

def test_Cytek_xP5(self):
    """Parse a Cytek xP5 file whose DATA segment uses 3-byte integers."""
    path = FILE_IDENTIFIER_TO_PATH['Cytek xP5']
    meta, df = parse_fcs(path)

def test_fcs_reader_API(self):
"""Make sure that the API remains consistent."""
for fname in FILE_IDENTIFIER_TO_PATH.values():
Expand Down

0 comments on commit 111bf0b

Please sign in to comment.