From c32cb06e0df7ca56b77728972c0518d88b692d7a Mon Sep 17 00:00:00 2001
From: gergelycsegzi <33738892+gergelycsegzi@users.noreply.github.com>
Date: Sun, 24 Oct 2021 18:07:25 +0100
Subject: [PATCH] If the FCS file uses whitespace delimiters (e.g. \x0c) avoid
 stripping (#37)

* If the FCS file uses whitespace delimiters (e.g. \x0c) avoid stripping

Otherwise the $BEGINDATA keyword could be cut short. In my case, I had a file whose header had no 'data start' value and whose first data row had:
\x0c$BEGINDATA
Doing both the strip() and the subsequent raw_text = raw_text[1:] therefore cuts off the '$', which results in an error on line 381:
self._data_start = int(text['$BEGINDATA'])

* Added unit tests for whitespace delimiter

Co-authored-by: Gergely Csegzi <gcsegzi@palantir.com>
---
 fcsparser/api.py                   |  4 +++-
 fcsparser/tests/test_fcs_reader.py | 17 +++++++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/fcsparser/api.py b/fcsparser/api.py
index 080a183..796bb85 100644
--- a/fcsparser/api.py
+++ b/fcsparser/api.py
@@ -280,7 +280,9 @@ def _extract_text_dict(raw_text):
         delimiter = raw_text[0]
 
         if raw_text[-1] != delimiter:
-            raw_text = raw_text.strip()
+            # Avoid stripping whitespace delimiter
+            if delimiter.strip() == delimiter:
+                raw_text = raw_text.strip()
             if raw_text[-1] != delimiter:
                 msg = (u'The first two characters were:\n {}. The last two characters were: {}\n'
                        u'Parser expects the same delimiter character in beginning '
diff --git a/fcsparser/tests/test_fcs_reader.py b/fcsparser/tests/test_fcs_reader.py
index 9255ffe..c9b86f7 100755
--- a/fcsparser/tests/test_fcs_reader.py
+++ b/fcsparser/tests/test_fcs_reader.py
@@ -84,6 +84,23 @@ def test_repeated_delimiter_text_segment(self):
         text = parser._extract_text_dict(raw_text)
         self.assertDictEqual(text, {'flow_speed': '3 m/s', 'x': 'a/', 'y': 'b//'})
 
+    def whitespace_delimiter_test_helper(self, has_final_delimiter: bool):
+        parser = FCSParser()
+        delimiter = '\t'
+        text_values = ['$BEGINDATA', '15', '$ENDDATA', '500']
+        raw_text = delimiter + delimiter.join(text_values)
+        if has_final_delimiter:
+            raw_text = raw_text + delimiter
+        text = parser._extract_text_dict(raw_text)
+        self.assertDictEqual(text, {'$BEGINDATA': '15', '$ENDDATA': '500'})
+    
+    def test_whitespace_delimited_text_extraction(self):
+        TestFCSReader.whitespace_delimiter_test_helper(self, has_final_delimiter=True)
+
+
+    def test_whitespace_delimited_text_extraction_no_final_delimiter(self):
+        TestFCSReader.whitespace_delimiter_test_helper(self, has_final_delimiter=False)
+
     def test_mq_FCS_2_0_data_segment(self):
         """Test DATA segment parsed from FCS (2.0 format) file from a MACSQuant flow cytometer"""
         values = np.array([[1.60764902830123901367e-03, 1.46554875373840332031e+00,