Skip to content

Commit

Permalink
3 byte fields (#32)
Browse files Browse the repository at this point in the history
* Update fromfile to support 3-byte fields

Some FCS files (such as the one generated by the Cytek xP5) store integers in 3-byte fields.  This breaks numpy's parser, which only wants power-of-two sized fields.  So, I've updated fromfile() to parse FCS files as a table of 1-byte unsigned ints, expand those fields to 4 bytes, and then re-view them as the proper dtype.

My only concern is that this may break on an FCS file produced by a big-endian machine (because I'm adding the extra bytes to the beginning of the field).  I don't have ready access to any of those -- do you?

* add tests for 3-byte DATA

* documentation for fromfile() changes
  • Loading branch information
bpteague authored Oct 7, 2021
1 parent 199126a commit 111bf0b
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 6 deletions.
87 changes: 81 additions & 6 deletions fcsparser/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,86 @@

def fromfile(file, dtype, count, *args, **kwargs):
    """Wrapper around np.fromfile to support any file-like object.

    Also supports record dtypes whose fields are NOT a power-of-two
    number of bytes wide.  Some pathological cytometers (e.g. the
    Cytek xP5) store integers in their DATA segment as 3-byte fields,
    which comes in as a dtype like '>u3' and makes numpy's own parser
    fail.  Such records are read as raw one-byte unsigned ints ('u1'),
    each field is zero-padded out to the next power-of-two width, and
    the result is re-viewed as the (widened) requested dtypes.

    Parameters
    ----------
    file : file or file-like
        Binary file (or file-like object) positioned at the DATA segment.
    dtype : str
        Comma-separated field specs, each of the form
        <endian char><type char><byte width> -- e.g. '>u3,>u3'.
    count : int
        Number of records to read.

    Returns
    -------
    numpy.ndarray
        `count` records; any field whose width was not a power of two
        is returned widened to the next power-of-two byte width.
    """
    # how many bytes wide is each requested field, and the whole record?
    dtypes = dtype.split(',')
    field_widths = [int(dt[2:]) for dt in dtypes]
    record_width = sum(field_widths)

    # fast path: every field is already a power-of-two wide, so numpy
    # can parse the requested record dtype directly.
    # (w & (w - 1)) == 0 iff w is a power of two.
    if all(w & (w - 1) == 0 for w in field_widths):
        try:
            return numpy.fromfile(file, dtype=dtype, count=count,
                                  *args, **kwargs)
        except (TypeError, IOError):
            # file-like objects without a real file descriptor
            return numpy.frombuffer(
                file.read(count * numpy.dtype(dtype).itemsize),
                dtype=dtype, count=count, *args, **kwargs)

    # slow path: read the DATA segment as a table of `record_width`
    # one-byte unsigned ints per record...
    raw_dtype = ",".join(['u1'] * record_width)
    try:
        ret = numpy.fromfile(file, dtype=raw_dtype, count=count,
                             *args, **kwargs)
    except (TypeError, IOError):
        ret = numpy.frombuffer(file.read(count * record_width),
                               dtype=raw_dtype, count=count,
                               *args, **kwargs)

    # ...and reshape it into a `count` x `record_width` array of bytes.
    ret = ret.view('u1').reshape((count, record_width))

    ret_dtypes = []
    offset = 0  # column where the current field starts in the PADDED array
    for dt, num_bytes in zip(dtypes, field_widths):
        # Zero bytes must be added on the most-significant side of the
        # field: the front for big-endian fields, the back for
        # little-endian ones (native order resolved via numpy).
        little_endian = (dt[0] == '<'
                         or (dt[0] not in ('<', '>') and numpy.little_endian))

        # pad this field with NUL byte columns until its width is a
        # power of two
        while num_bytes & (num_bytes - 1) != 0:
            insert_at = offset + num_bytes if little_endian else offset
            ret = numpy.insert(ret, insert_at, numpy.zeros(count), axis=1)
            num_bytes += 1

        # record the widened dtype for this field
        ret_dtypes.append(dt[0] + dt[1] + str(num_bytes))

        # advance by the PADDED width so later fields' pad columns land
        # in the right place (using the original widths here would
        # scramble every field after the first padded one)
        offset += num_bytes

    # "cast" the padded bytes back to the (widened) requested dtypes
    return ret.view(','.join(ret_dtypes)).ravel()


class ParserFeatureNotImplementedError(Exception):
Expand Down Expand Up @@ -447,7 +522,7 @@ def read_data(self, file_handle):
if text["$DATATYPE"] == "I":
if len(set(par_numeric_type_list)) > 1:
for channel_number in self.channel_numbers:
valid_bits = numpy.ceil(numpy.log2(numpy.float(text["$P{0}R".format(channel_number)])))
valid_bits = numpy.ceil(numpy.log2(float(text["$P{0}R".format(channel_number)])))

if bytes_per_par_list[channel_number - 1] * 8 == valid_bits:
continue
Expand All @@ -457,7 +532,7 @@ def read_data(self, file_handle):
data[name] = data[name] & bitmask
else:
valid_bits_per_par_list = numpy.array([
2 ** numpy.ceil(numpy.log2(numpy.float(text["$P{0}R".format(i)]))) - 1
2 ** numpy.ceil(numpy.log2(float(text["$P{0}R".format(i)]))) - 1
for i in self.channel_numbers
], dtype=data.dtype)
data &= valid_bits_per_par_list
Expand Down Expand Up @@ -602,4 +677,4 @@ def parse(path, meta_data_only=False, compensate=False, channel_naming='$PnS',
df = fcs_parser.dataframe
df = df.astype(dtype) if dtype else df
return meta, df


Binary file not shown.
6 changes: 6 additions & 0 deletions fcsparser/tests/test_fcs_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
'large fake fcs': os.path.join(BASE_PATH, 'fake_large_fcs', 'fake_large_fcs.fcs'),
'cyflow cube 8': os.path.join(BASE_PATH, 'cyflow_cube_8', 'cyflow_cube_8.fcs'),
'fake bitmask error': os.path.join(BASE_PATH, 'fake_bitmask_error', 'fcs1_cleaned.lmd'),
'Cytek xP5': os.path.join(BASE_PATH, 'Cytek_xP5', 'Cytek_xP5.fcs'),
}

# The group of files below is used for checking behavior other than reading data.
Expand Down Expand Up @@ -215,6 +216,11 @@ def test_mq_FCS_3_1_data_segment(self):
fname = FILE_IDENTIFIER_TO_PATH['mq fcs 3.1']
meta, df = parse_fcs(fname)

def test_Cytek_xP5(self):
    """Parse a Cytek xP5 file whose DATA segment uses 3-byte integers."""
    path = FILE_IDENTIFIER_TO_PATH['Cytek xP5']
    meta, df = parse_fcs(path)

def test_fcs_reader_API(self):
"""Make sure that the API remains consistent."""
for fname in FILE_IDENTIFIER_TO_PATH.values():
Expand Down

0 comments on commit 111bf0b

Please sign in to comment.