added hook for data_readers.Excel.effective_sheet_name to allow multi…

…ple sheets of same type
4dn-dcic · Jan 22, 2025 · d6b2ad1 · d6b2ad1
1 parent 805af4a
commit d6b2ad1
Show file tree

Hide file tree

Showing 4 changed files with 17 additions and 3 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -8,10 +8,12 @@ Change Log
 
 8.17.0
 ======
-* dmichaels / 2025-01-14
+* dmichaels / 2025-01-14 / branch: dmichaels-structured-data-row-mapper-hook-20250114 / PR-324
 * Added hook to structured_data.StructuredDataSet to allow a custom Excel class
   to be used, so a custom column mapping can be provided; this was initially to support
   special/more-intuitive columns for QC values in the submission spreadsheet for smaht-submitr.
+* Added hook to structured_data.StructuredDataSet to allow multiple sheets associated with
+  the same type (via a new data_readers.Excel.effective_sheet_name function).
 
 8.16.6
 ======

diff --git a/dcicutils/data_readers.py b/dcicutils/data_readers.py
@@ -184,6 +184,9 @@ def __init__(self, file: str, reader_class: Optional[Type] = None, include_hidde
     def sheet_reader(self, sheet_name: str) -> ExcelSheetReader:
         return self._reader_class(self, sheet_name=sheet_name, workbook=self._workbook)
 
+    def effective_sheet_name(self, sheet_name: str) -> str:
+        return sheet_name
+
     def open(self) -> None:
         if self._workbook is None:
             import warnings

diff --git a/dcicutils/structured_data.py b/dcicutils/structured_data.py
@@ -320,7 +320,16 @@ def get_counts() -> Tuple[int, int]:
         # Order the sheet names by any specified ordering (e.g. ala snovault.loadxl).
         order = {Schema.type_name(key): index for index, key in enumerate(self._order)} if self._order else {}
         for sheet_name in sorted(excel.sheet_names, key=lambda key: order.get(Schema.type_name(key), sys.maxsize)):
-            self._load_reader(excel.sheet_reader(sheet_name), type_name=Schema.type_name(sheet_name))
+            # This effective_sheet_name function added 2025-01-21 to allow sheets whose sheet names are
+            # other than simply the name of the type, but which do contain that type somehow; e.g.
+            # specifically where the sheet name is like "DSA:ExternalQualityMetric" where the "DSA"
+            # part is purely informational, and the "ExternalQualityMetric" is the type name; so we
+            # now can have multiple sheets of the same type (impossible before as sheet names need
+            # to be unique); this is simply a mechanism to allow the user to partition/organize their
+            # sheets with some data/rows for a given type split across multiple actual sheets.
+            type_name = Schema.type_name(excel.effective_sheet_name(sheet_name))
+            self._load_reader(excel.sheet_reader(sheet_name), type_name=type_name)
+            # self._load_reader(excel.sheet_reader(sheet_name), type_name=Schema.type_name(sheet_name))
             if self._validator_sheet_hook and self.data.get(sheet_name):
                 self._validator_sheet_hook(self, sheet_name, self.data[sheet_name])
         # TODO: Do we really need progress reporting for the below?

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dcicutils"
-version = "8.16.6.1b4"  # TODO: 8.17.0
+version = "8.16.6.1b5"  # TODO: 8.17.0
 description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources"
 authors = ["4DN-DCIC Team <support@4dnucleome.org>"]
 license = "MIT"