diff --git a/fcsparser/api.py b/fcsparser/api.py
index f7e6966..cadea0e 100644
--- a/fcsparser/api.py
+++ b/fcsparser/api.py
@@ -19,15 +19,9 @@
 import warnings
 
 import numpy
+import pandas as pd
 import six
 
-try:
-    import pandas as pd
-except ImportError:
-    pd = None
-    warnings.warn(u'pandas is not installed, so the parse_fcs function can only be used together '
-                  u'with numpy.')
-
 logger = logging.getLogger(__name__)
 
 
@@ -484,8 +478,8 @@ def dataframe(self):
         return pd.DataFrame(data, columns=channel_names)
 
 
-def parse(path, meta_data_only=False, output_format='DataFrame', compensate=False,
-          channel_naming='$PnS', reformat_meta=False, data_set=0):
+def parse(path, meta_data_only=False, compensate=False, channel_naming='$PnS',
+          reformat_meta=False, data_set=0, dtype='float32'):
     """Parse an fcs file at the location specified by the path.
 
     Parameters
     ----------
@@ -514,7 +508,13 @@ def parse(path, meta_data_only=False, output_format='DataFrame', compensate=Fals
         into a DataFrame and moved into the '_channels_' key
     data_set: int
         Index of retrieved data set in the fcs file.
-        This value specifies the data set being retrieved from an fcs file with multple data sets.
+        This value specifies the data set being retrieved from an fcs file with multiple data sets.
+    dtype: str | None
+        If provided, all data will be converted to this dtype.
+        This is set by default to auto-convert to float32 to deal with cases in which the original
+        data has been stored using a smaller data type (e.g., uint8). This modifies the original
+        data, but should make follow-up analysis safer in basically all cases.
+
 
     Returns
     -------
@@ -530,32 +530,24 @@ def parse(path, meta_data_only=False, output_format='DataFrame', compensate=Fals
     --------
     fname = '../tests/data/EY_2013-05-03_EID_214_PID_1120_Piperacillin_Well_B7.001.fcs'
     meta = parse_fcs(fname, meta_data_only=True)
-    meta, data_pandas = parse_fcs(fname, meta_data_only=False, output_format='DataFrame')
-    meta, data_numpy = parse_fcs(fname, meta_data_only=False, output_format='ndarray')
+    meta, data_pandas = parse_fcs(fname, meta_data_only=False)
     """
    if compensate:
        raise ParserFeatureNotImplementedError(u'Compensation has not been implemented yet.')
 
-    if reformat_meta or (output_format == 'DataFrame'):
-        if pd is None:
-            raise ImportError(u'You do not have pandas installed.')
-
     read_data = not meta_data_only
-    parsed_fcs = FCSParser(path, read_data=read_data, channel_naming=channel_naming,
+    fcs_parser = FCSParser(path, read_data=read_data, channel_naming=channel_naming,
                            data_set=data_set)
 
     if reformat_meta:
-        parsed_fcs.reformat_meta()
+        fcs_parser.reformat_meta()
 
-    meta = parsed_fcs.annotation
+    meta = fcs_parser.annotation
 
     if meta_data_only:
         return meta
-    elif output_format == 'DataFrame':
-        return meta, parsed_fcs.dataframe
-    elif output_format == 'ndarray':
-        # Constructs numpy matrix
-        return meta, parsed_fcs.data
-    else:
-        raise ValueError(u'The output_format must be either "ndarray" or "DataFrame".')
+    else:  # Then include both meta and dataframe.
+        df = fcs_parser.dataframe
+        df = df.astype(dtype) if dtype else df
+        return meta, df
diff --git a/fcsparser/tests/test_fcs_reader.py b/fcsparser/tests/test_fcs_reader.py
index b3aa214..e560f5f 100755
--- a/fcsparser/tests/test_fcs_reader.py
+++ b/fcsparser/tests/test_fcs_reader.py
@@ -43,8 +43,8 @@
     """Check that the data segmented extracted from the file corresponds to the expected values."""
     file_path = FILE_IDENTIFIER_TO_PATH[file_name]
 
-    meta, matrix = parse_fcs(file_path, output_format='ndarray')
-    diff = numpy.abs(expected_array_values - matrix[0:4, :])
+    meta, df = parse_fcs(file_path)
+    diff = numpy.abs(expected_array_values - df.values[0:4, :])
     return numpy.all(diff < 10 ** -8)  # Is this the proper way to do the test?
 
 
@@ -52,7 +52,7 @@ class TestFCSReader(unittest.TestCase):
     def test_mq_FCS_2_0_text_segment(self):
         """Test TEXT segment parsed from FCS (2.0 format) file from a MACSQuant flow cytometer."""
         fname = FILE_IDENTIFIER_TO_PATH['mq fcs 2.0']
-        meta = parse_fcs(fname, meta_data_only=True, output_format='ndarray')
+        meta = parse_fcs(fname, meta_data_only=True)
         self.assertEqual('EY_2013-07-19_PBS_FCS_2.0_Custom_Without_Add_Well_A1.001.fcs',
                          meta['$FIL'])
         self.assertEqual('MACSQuant', meta['$CYT'])
@@ -60,7 +60,7 @@ def test_mq_FCS_3_0_text_segment(self):
         """Test TEXT segment parsed from FCS (3.0 format) file from a MACSQuant flow cytometer."""
         fname = FILE_IDENTIFIER_TO_PATH['mq fcs 3.0']
-        meta = parse_fcs(fname, meta_data_only=True, output_format='ndarray')
+        meta = parse_fcs(fname, meta_data_only=True)
 
         expected_fname = 'EY_2013-07-19_PID_101_MG1655_Transformants_D01_Well_A4.001.fcs'
         self.assertEqual(expected_fname, meta['$FIL'])
@@ -69,7 +69,7 @@ def test_mq_FCS_3_1_text_segment(self):
         """Test TEXT segment parsed from FCS (3.1 format) file from a MACSQuant flow cytometer."""
         fname = FILE_IDENTIFIER_TO_PATH['mq fcs 3.1']
-        meta = parse_fcs(fname, meta_data_only=True, output_format='ndarray')
+        meta = parse_fcs(fname, meta_data_only=True)
         self.assertEqual('MACSQuant', meta['$CYT'])
 
     def test_mq_FCS_2_0_data_segment(self):
@@ -205,7 +205,7 @@ def test_Fortessa_data_segment(self):
     def test_mq_FCS_3_1_data_segment(self):
         """Test DATA segment parsed from FCS (3.1 format) file from a MACSQuant flow cytometer"""
         fname = FILE_IDENTIFIER_TO_PATH['mq fcs 3.1']
-        meta, matrix = parse_fcs(fname, output_format='ndarray')
+        meta, df = parse_fcs(fname)
 
     def test_fcs_reader_API(self):
         """Make sure that the API remains consistent."""
@@ -214,13 +214,8 @@ def test_fcs_reader_API(self):
         fname = FILE_IDENTIFIER_TO_PATH['mq fcs 3.1']
 
         # Invoke the parser in multiple ways to make sure that all invocations run successfully.
         # This is a shallow test that only verifies consistency.
         meta = parse_fcs(fname, meta_data_only=True)
-        meta, data_pandas = parse_fcs(fname, meta_data_only=False, output_format='DataFrame')
-        meta, data_pandas = parse_fcs(fname, meta_data_only=False, output_format='DataFrame',
-                                      reformat_meta=True)
-        meta, data_numpy = parse_fcs(fname, meta_data_only=False, output_format='ndarray',
-                                     reformat_meta=False)
-        meta, data_numpy = parse_fcs(fname, meta_data_only=False, output_format='ndarray',
-                                     reformat_meta=True)
+        meta, data_pandas = parse_fcs(fname, meta_data_only=False)
+        meta, data_pandas = parse_fcs(fname, meta_data_only=False, reformat_meta=True)
 
         self.assertIsInstance(meta['_channel_names_'], tuple)
         self.assertGreater(len(meta['_channel_names_']), 0)
@@ -278,21 +273,18 @@ def test_speed_of_reading_fcs_files(self):
         number = 1000
         time = timeit.timeit(
-            lambda: parse_fcs(file_path, meta_data_only=True, output_format='DataFrame',
-                              reformat_meta=False), number=number)
+            lambda: parse_fcs(file_path, meta_data_only=True, reformat_meta=False), number=number)
 
         print('Loading fcs file {0} times with meta_data only without reformatting of '
               'meta takes {1} per loop'.format(time / number, number))
 
         time = timeit.timeit(
-            lambda: parse_fcs(file_path, meta_data_only=True, output_format='DataFrame',
-                              reformat_meta=True), number=number)
+            lambda: parse_fcs(file_path, meta_data_only=True, reformat_meta=True), number=number)
 
         print('Loading fcs file {0} times with meta_data only with reformatting of '
               'meta takes {1} per loop'.format(time / number, number))
 
         time = timeit.timeit(
-            lambda: parse_fcs(file_path, meta_data_only=False, output_format='DataFrame',
-                              reformat_meta=False), number=number)
+            lambda: parse_fcs(file_path, meta_data_only=False, reformat_meta=False), number=number)
 
         print('Loading fcs file {0} times both meta and data but without reformatting of '
               'meta takes {1} per loop'.format(time / number, number))
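
Usage sketch (not part of the patch): a minimal example of the post-change call patterns, assuming parse is imported from fcsparser.api as defined above and using the fixture path from the docstring example. Former output_format='ndarray' callers can recover a numpy array from the DataFrame, and the automatic float32 cast can be disabled with dtype=None.

    from fcsparser.api import parse

    path = '../tests/data/EY_2013-05-03_EID_214_PID_1120_Piperacillin_Well_B7.001.fcs'

    # Metadata only, unchanged by this patch.
    meta = parse(path, meta_data_only=True)

    # Metadata plus data; the DataFrame is cast to float32 by default (dtype='float32').
    meta, df = parse(path, reformat_meta=True)

    # Callers that previously passed output_format='ndarray' can use the underlying array.
    matrix = df.values

    # Pass dtype=None to keep whatever dtype was stored in the file.
    meta, raw = parse(path, dtype=None)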