diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 27de625d1..8261b2fde 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -8,10 +8,12 @@ Change Log 8.17.0 ====== -* dmichaels / 2025-01-14 +* dmichaels / 2025-01-14 / branch: dmichaels-structured-data-row-mapper-hook-20250114 / PR-324 * Added hook to structured_data.StructuredDataSet to allow a custom Excel class to be use, so a custom column mapping can be provided; this was initially to support special/more-intuitive columns for QC values in the submission spreadsheet for smaht-submitr. +* Added hook to structured_data.StructuredDataSet to allow multiple sheets to be associated with + the same type (via a new data_readers.Excel.effective_sheet_name method). 8.16.6 ====== diff --git a/dcicutils/data_readers.py b/dcicutils/data_readers.py index 589affded..60ff668d8 100644 --- a/dcicutils/data_readers.py +++ b/dcicutils/data_readers.py @@ -184,6 +184,9 @@ def __init__(self, file: str, reader_class: Optional[Type] = None, include_hidde def sheet_reader(self, sheet_name: str) -> ExcelSheetReader: return self._reader_class(self, sheet_name=sheet_name, workbook=self._workbook) + def effective_sheet_name(self, sheet_name: str) -> str: + return sheet_name + def open(self) -> None: if self._workbook is None: import warnings diff --git a/dcicutils/structured_data.py b/dcicutils/structured_data.py index 362d043af..3d0c5d627 100644 --- a/dcicutils/structured_data.py +++ b/dcicutils/structured_data.py @@ -320,7 +320,16 @@ def get_counts() -> Tuple[int, int]: # Order the sheet names by any specified ordering (e.g. ala snovault.loadxl). 
order = {Schema.type_name(key): index for index, key in enumerate(self._order)} if self._order else {} for sheet_name in sorted(excel.sheet_names, key=lambda key: order.get(Schema.type_name(key), sys.maxsize)): - self._load_reader(excel.sheet_reader(sheet_name), type_name=Schema.type_name(sheet_name)) + # The effective_sheet_name function was added 2025-01-21 to allow sheets whose sheet names are + # other than simply the name of the type, but which do contain that type somehow; e.g. + # where the sheet name is like "DSA:ExternalQualityMetric", where the "DSA" + # part is purely informational and "ExternalQualityMetric" is the type name; so we + # can now have multiple sheets of the same type (impossible before as sheet names need + # to be unique); this is simply a mechanism to allow the user to partition/organize their + # sheets, with the data/rows for a given type split across multiple actual sheets. + type_name = Schema.type_name(excel.effective_sheet_name(sheet_name)) + self._load_reader(excel.sheet_reader(sheet_name), type_name=type_name) + # self._load_reader(excel.sheet_reader(sheet_name), type_name=Schema.type_name(sheet_name)) if self._validator_sheet_hook and self.data.get(sheet_name): self._validator_sheet_hook(self, sheet_name, self.data[sheet_name]) # TODO: Do we really need progress reporting for the below? diff --git a/pyproject.toml b/pyproject.toml index a922c0b55..2868a3b2c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "8.16.6.1b4" # TODO: 8.17.0 +version = "8.16.6.1b5" # TODO: 8.17.0 description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT"