feat: add advanced wide form functions #26

Merged · 53 commits · Jan 14, 2025

Commits
e10646b
refactor: change available release versions caching
RaczeQ Nov 5, 2024
ed43112
feat: added timing decorator for aggregating total time with nested c…
RaczeQ Nov 5, 2024
32b5591
feat: added columns_to_download functionality
RaczeQ Nov 7, 2024
a4c7caf
chore: added dummy advanced functions file structure
RaczeQ Nov 7, 2024
6ec6e9e
chore: add pragma no cover
RaczeQ Nov 8, 2024
40c74f7
fix: change test working directory
RaczeQ Nov 8, 2024
aaefd02
chore: prepare theme_type classification for wide form
RaczeQ Nov 8, 2024
f8a711f
chore: add wide form functions base code
RaczeQ Nov 13, 2024
73d0ee7
fix(pre-commit.ci): auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 13, 2024
1b7241a
feat: add working wide form logic for default cases
RaczeQ Nov 24, 2024
550b725
chore: make multiprocessing manager a singleton
RaczeQ Nov 24, 2024
42e3615
chore: cleaned nested Path call
RaczeQ Nov 24, 2024
8784c65
feat: added working buildings with parts download logic
RaczeQ Nov 24, 2024
fab6343
feat: refactor internal data download to work with multiple theme typ…
RaczeQ Dec 4, 2024
d4e38ea
feat: add dedicated function for downloading buildings with parts
RaczeQ Dec 4, 2024
f837525
Merge branch 'main' into 10-add-some-dedicated-functions-for-highways…
RaczeQ Dec 4, 2024
9098f12
fix: remove prints from functions
RaczeQ Dec 4, 2024
9f06060
fix: change windows test path
RaczeQ Dec 4, 2024
3863a81
feat: finish building and poi wide format logic
RaczeQ Dec 8, 2024
baa67c9
feat: add theme and type metadata info
RaczeQ Dec 8, 2024
f63f589
chore: remove unnecessary download buildings function
RaczeQ Dec 10, 2024
75be4f7
feat: add option to download multiple theme type datasets at once
RaczeQ Dec 14, 2024
e229336
chore: remove todo comment
RaczeQ Jan 1, 2025
ee64d30
Merge branch 'main' into 10-add-some-dedicated-functions-for-highways…
RaczeQ Jan 5, 2025
0c8f436
chore: tidy up nested context managers
RaczeQ Jan 5, 2025
90d5345
chore: refactor wide form data download logic
RaczeQ Jan 7, 2025
eef076f
feat: add option to download multiple datasets at once
RaczeQ Jan 7, 2025
a4a1c45
feat: finish wide form combination of multiple datasets
RaczeQ Jan 8, 2025
c30e79a
chore: changed destination directory and added max_workers parameter
RaczeQ Jan 8, 2025
a849471
chore: reorder functions
RaczeQ Jan 8, 2025
bb8f22e
feat: added option to pass list of pyarrow filters and columns to dow…
RaczeQ Jan 8, 2025
19205d8
chore: added changelog entry
RaczeQ Jan 8, 2025
fa819d2
chore: refactor wide form code
RaczeQ Jan 8, 2025
fe8417b
fix: split hierarchy and download columns
RaczeQ Jan 8, 2025
6daa5d0
fix: remove additional raise clause
RaczeQ Jan 9, 2025
6e1566d
feat: add missing progress bars and hierarchy to result file name
RaczeQ Jan 9, 2025
ef69a48
chore: clear wide_form file
RaczeQ Jan 10, 2025
9017843
feat: finalize wide_form api
RaczeQ Jan 12, 2025
b095dc1
feat: change nested fields pyarrow filter separator in cli
RaczeQ Jan 12, 2025
b92831d
feat: add minimal confidence for poi filtering
RaczeQ Jan 13, 2025
b6142be
chore: change pyarrow_filters typing
RaczeQ Jan 13, 2025
5f88363
chore: change pyarrow filters and columns length checking
RaczeQ Jan 13, 2025
9476333
chore: add wide form tests
RaczeQ Jan 14, 2025
01e4fae
chore: add todo comment
RaczeQ Jan 14, 2025
cf5d7b5
fix: reorder columns in wide form
RaczeQ Jan 14, 2025
69b6101
chore: finish wide form tests
RaczeQ Jan 14, 2025
b1201e7
feat: add automatic pyarrow filter fields names detection
RaczeQ Jan 14, 2025
b88c688
chore: reorder columns select in sql
RaczeQ Jan 14, 2025
79ffc8d
feat: finish wide form tests
RaczeQ Jan 14, 2025
70e67e7
chore: increase tests coverage
RaczeQ Jan 14, 2025
7ddabc0
chore: simplify _download_data function
RaczeQ Jan 14, 2025
1983906
chore: add todo comment for tests
RaczeQ Jan 14, 2025
5dd79eb
chore: add todo comment for tests
RaczeQ Jan 14, 2025

Files changed
15 changes: 15 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Added

- Automatic total time wrapper decorator to aggregate nested function calls
- Parameter `columns_to_download` for selecting columns to download from the dataset [#23](https://github.com/kraina-ai/overturemaestro/issues/23)
- Option to pass a list of pyarrow filters and columns for download for each theme type pair when downloading multiple datasets at once

### Changed

- Refactored available release versions caching [#24](https://github.com/kraina-ai/overturemaestro/issues/24)
- Removed hive partitioned parquet schema columns from GeoDataFrame loading

### Deprecated

- Nested fields in PyArrow filters in the CLI are now expected to be separated by a dot, not a comma [#22](https://github.com/kraina-ai/overturemaestro/issues/22)

## [0.1.2] - 2024-12-17

### Added
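For context on the filter entries above: the sketch below is illustrative only and not code from this PR. It shows how a nested field such as `categories.primary` is addressed with `pyarrow.compute` expressions, which is the programmatic counterpart of the new dot-separated CLI syntax; the field names used (`categories.primary`, `confidence`) are examples from the Overture places schema and may differ between releases. When downloading multiple datasets at once, one such filter (and one column list) can be supplied per theme/type pair, as noted in the Added section.

# Illustrative sketch only; not part of this PR.
import pyarrow.compute as pc

# Expression-style filter: a nested field is referenced by passing its path components.
nested_filter = pc.field("categories", "primary") == "eat_and_drink"

# In the CLI (after this PR) the same nested field would be written as
# `categories.primary` (dot separator) instead of the previous comma-separated form.
combined_filter = nested_filter & (pc.field("confidence") >= 0.75)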
2 changes: 2 additions & 0 deletions README.md
@@ -80,6 +80,8 @@ Required:

- `geoarrow-rust-core (>=0.3.0)`: For transforming Arrow data to Shapely objects

- `duckdb (>=1.1.0)`: For transforming downloaded data to the wide format

- `pooch (>=1.6.0)`: For downloading precalculated dataset indexes

- `rich (>=12.0.0)`: For showing progress bars
1 change: 0 additions & 1 deletion overturemaestro/__main__.py
@@ -6,7 +6,6 @@ def main() -> None:
try:
from overturemaestro import __app_name__, cli
except ImportError as exc:
raise
error_msg = (
"Missing optional dependencies required for the CLI."
" Please install required packages using `pip install overturemaestro[cli]`."
27 changes: 27 additions & 0 deletions overturemaestro/_duckdb.py
@@ -0,0 +1,27 @@
"""Helper functions for DuckDB."""

from pathlib import Path
from typing import Union

import duckdb


def _sql_escape(value: str) -> str:
"""Escape value for SQL query."""
return value.replace("'", "''")


def _set_up_duckdb_connection(tmp_dir_path: Union[str, Path]) -> "duckdb.DuckDBPyConnection":
"""Create DuckDB connection in a given directory."""
local_db_file = "db.duckdb"
connection = duckdb.connect(
database=str(Path(tmp_dir_path) / local_db_file),
config=dict(preserve_insertion_order=False),
)
connection.sql("SET enable_progress_bar = false;")
connection.sql("SET enable_progress_bar_print = false;")

connection.install_extension("spatial")
connection.load_extension("spatial")

return connection
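A minimal usage sketch of the two helpers above (an internal API, so subject to change). The `preserve_insertion_order=False` setting and the disabled DuckDB progress bar are presumably chosen to lower memory pressure and to avoid interfering with the library's own rich progress bars.

# Minimal sketch using the helpers above; internal API, shown here for illustration.
from tempfile import TemporaryDirectory

from overturemaestro._duckdb import _set_up_duckdb_connection, _sql_escape

with TemporaryDirectory() as tmp_dir:
    connection = _set_up_duckdb_connection(tmp_dir)  # creates db.duckdb inside tmp_dir

    # _sql_escape doubles single quotes so user-provided strings can be inlined safely.
    category = "McDonald's"
    rows = connection.sql(f"SELECT '{_sql_escape(category)}' AS escaped_value").fetchall()
    print(rows)  # [("McDonald's",)]

    connection.close()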
6 changes: 6 additions & 0 deletions overturemaestro/_exceptions.py
@@ -1 +1,7 @@
class QueryNotGeocodedError(ValueError): ...


class MissingColumnError(ValueError): ...


class HierarchyDepthOutOfBoundsError(ValueError): ...
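All three exceptions subclass `ValueError`, so callers can handle them individually or fall back to a generic `ValueError` handler. The conditions that raise them live elsewhere in the library; the snippet below only sketches the catching side.

# Sketch of the catching side; the raising conditions are defined elsewhere in the library.
from overturemaestro._exceptions import (
    HierarchyDepthOutOfBoundsError,
    MissingColumnError,
    QueryNotGeocodedError,
)

try:
    ...  # call into overturemaestro here
except HierarchyDepthOutOfBoundsError:
    print("Requested hierarchy depth is outside the available range.")
except (MissingColumnError, QueryNotGeocodedError) as exc:
    # Both also derive from ValueError, so `except ValueError` would cover them too.
    print(f"Invalid request: {exc}")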
58 changes: 38 additions & 20 deletions overturemaestro/_parquet_multiprocessing.py
@@ -1,12 +1,13 @@
import multiprocessing
from multiprocessing.managers import SyncManager
from pathlib import Path
from queue import Empty, Queue
from time import sleep, time
from typing import TYPE_CHECKING, Any, Callable, Optional, Union
from typing import TYPE_CHECKING, Any, Callable, Optional, Union, cast

from overturemaestro._rich_progress import VERBOSITY_MODE, TrackProgressSpinner

if TYPE_CHECKING:
if TYPE_CHECKING: # pragma: no cover
from multiprocessing.managers import ValueProxy
from threading import Lock

@@ -31,13 +32,14 @@ def _job(
columns: Optional[list[str]],
filesystem: "fs.FileSystem",
) -> None: # pragma: no cover
import hashlib

import pyarrow.dataset as ds
import pyarrow.parquet as pq

current_pid = multiprocessing.current_process().pid

filepath = save_path / f"{current_pid}.parquet"
writer = None
writers = {}
while not queue.empty():
try:
file_name, row_group_index = None, None
@@ -61,10 +63,16 @@
tracker.value += 1
continue

if not writer:
writer = pq.ParquetWriter(filepath, result_table.schema)
h = hashlib.new("sha256")
h.update(result_table.schema.to_string().encode())
schema_hash = h.hexdigest()

if schema_hash not in writers:
filepath = save_path / str(current_pid) / f"{schema_hash}.parquet"
filepath.parent.mkdir(exist_ok=True, parents=True)
writers[schema_hash] = pq.ParquetWriter(filepath, result_table.schema)

writer.write_table(result_table)
writers[schema_hash].write_table(result_table)

with tracker_lock:
tracker.value += 1
@@ -80,7 +88,7 @@ def _job(
)
raise MultiprocessingRuntimeError(msg) from ex

if writer:
for writer in writers.values():
writer.close()


@@ -107,14 +115,21 @@ def exception(self) -> Optional[tuple[Exception, str]]:
return self._exception


class SingletonContextManager(SyncManager):
def __new__(cls, ctx: multiprocessing.context.SpawnContext) -> "SingletonContextManager":
if not hasattr(cls, "instance"):
cls.instance = ctx.Manager()
return cast(SingletonContextManager, cls.instance)


def _read_row_group_number(path: str, filesystem: "fs.FileSystem") -> int:
import pyarrow.parquet as pq

return int(pq.ParquetFile(path, filesystem=filesystem).num_row_groups)


def map_parquet_dataset(
dataset_path: Union[str, list[str]],
dataset_path: Union[str, Path, list[str], list[Path]],
destination_path: Path,
function: Callable[[str, int, "pa.Table"], "pa.Table"],
progress_description: str,
@@ -157,7 +172,7 @@ def map_parquet_dataset(

from overturemaestro._rich_progress import TrackProgressBar

manager = ctx.Manager()
manager = SingletonContextManager(ctx=ctx)

queue: Queue[tuple[str, int]] = manager.Queue()
tracker: ValueProxy[int] = manager.Value("i", 0)
@@ -178,17 +193,20 @@
no_scan_workers = min(max_workers, no_scan_workers)
no_processing_workers = min(max_workers, no_processing_workers)

with TrackProgressBar(verbosity_mode=verbosity_mode) as progress:
total_files = len(dataset.files)
with ProcessPoolExecutor(max_workers=min(no_scan_workers, total_files)) as ex:
fn = partial(_read_row_group_number, filesystem=dataset.filesystem)
row_group_numbers = list(
progress.track(
ex.map(fn, dataset.files, chunksize=1),
description="Reading all parquet files row groups",
total=total_files,
)
total_files = len(dataset.files)

with (
TrackProgressBar(verbosity_mode=verbosity_mode) as progress,
ProcessPoolExecutor(max_workers=min(no_scan_workers, total_files)) as ex,
):
fn = partial(_read_row_group_number, filesystem=dataset.filesystem)
row_group_numbers = list(
progress.track(
ex.map(fn, dataset.files, chunksize=1),
description="Reading all parquet files row groups",
total=total_files,
)
)

for pq_file, row_group_number in zip(dataset.files, row_group_numbers):
for row_group in range(row_group_number):
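The key behavioural change in `_job` is that a worker no longer keeps a single `ParquetWriter`: result tables are grouped by a SHA-256 hash of their schema's string representation, and each distinct schema gets its own file inside a per-worker subdirectory. This matters once several theme/type datasets are processed in one run, since their mapped tables need not share a schema. Below is a standalone, simplified sketch of that bookkeeping, outside the multiprocessing and queue machinery.

# Simplified sketch of the per-schema writer bookkeeping used in _job above.
import hashlib
from pathlib import Path

import pyarrow as pa
import pyarrow.parquet as pq


def write_tables_grouped_by_schema(
    tables: list[pa.Table], save_path: Path, worker_id: int
) -> None:
    writers: dict[str, pq.ParquetWriter] = {}
    for table in tables:
        # Tables with identical schemas share one output file; differing schemas
        # are routed to separate files named after the schema hash.
        schema_hash = hashlib.sha256(table.schema.to_string().encode()).hexdigest()
        if schema_hash not in writers:
            filepath = save_path / str(worker_id) / f"{schema_hash}.parquet"
            filepath.parent.mkdir(exist_ok=True, parents=True)
            writers[schema_hash] = pq.ParquetWriter(filepath, table.schema)
        writers[schema_hash].write_table(table)
    for writer in writers.values():
        writer.close()

The same file also turns the multiprocessing manager into a process-wide singleton (`SingletonContextManager`), so repeated `map_parquet_dataset` calls reuse one manager instead of spawning a new one per call.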
66 changes: 66 additions & 0 deletions overturemaestro/advanced_functions/__init__.py
@@ -0,0 +1,66 @@
"""
Advanced functions.

This module contains dedicated functions for specific use cases.
"""

# from overturemaestro.advanced_functions.poi import (
# convert_bounding_box_to_pois_geodataframe,
# convert_bounding_box_to_pois_parquet,
# convert_geometry_to_pois_geodataframe,
# convert_geometry_to_pois_parquet,
# )
# from overturemaestro.advanced_functions.transportation import (
# convert_bounding_box_to_roads_geodataframe,
# convert_bounding_box_to_roads_parquet,
# convert_geometry_to_roads_geodataframe,
# convert_geometry_to_roads_parquet,
# )
from overturemaestro.advanced_functions.functions import (
convert_bounding_box_to_wide_form_geodataframe,
convert_bounding_box_to_wide_form_geodataframe_for_all_types,
convert_bounding_box_to_wide_form_geodataframe_for_multiple_types,
convert_bounding_box_to_wide_form_parquet,
convert_bounding_box_to_wide_form_parquet_for_all_types,
convert_bounding_box_to_wide_form_parquet_for_multiple_types,
convert_geometry_to_wide_form_geodataframe,
convert_geometry_to_wide_form_geodataframe_for_all_types,
convert_geometry_to_wide_form_geodataframe_for_multiple_types,
convert_geometry_to_wide_form_parquet,
convert_geometry_to_wide_form_parquet_for_all_types,
convert_geometry_to_wide_form_parquet_for_multiple_types,
)

__all__ = [
"convert_bounding_box_to_wide_form_geodataframe",
"convert_bounding_box_to_wide_form_geodataframe_for_all_types",
"convert_bounding_box_to_wide_form_geodataframe_for_multiple_types",
"convert_bounding_box_to_wide_form_parquet",
"convert_bounding_box_to_wide_form_parquet_for_all_types",
"convert_bounding_box_to_wide_form_parquet_for_multiple_types",
"convert_geometry_to_wide_form_geodataframe",
"convert_geometry_to_wide_form_geodataframe_for_all_types",
"convert_geometry_to_wide_form_geodataframe_for_multiple_types",
"convert_geometry_to_wide_form_parquet",
"convert_geometry_to_wide_form_parquet_for_all_types",
"convert_geometry_to_wide_form_parquet_for_multiple_types",
]

# __all__ = [
# "convert_bounding_box_to_buildings_geodataframe",
# "convert_bounding_box_to_buildings_parquet",
# "convert_bounding_box_to_pois_geodataframe",
# "convert_bounding_box_to_pois_parquet",
# "convert_bounding_box_to_roads_geodataframe",
# "convert_bounding_box_to_roads_parquet",
# "convert_bounding_box_to_wide_form_geodataframe",
# "convert_bounding_box_to_wide_form_parquet",
# "convert_geometry_to_buildings_geodataframe",
# "convert_geometry_to_buildings_parquet",
# "convert_geometry_to_pois_geodataframe",
# "convert_geometry_to_pois_parquet",
# "convert_geometry_to_roads_geodataframe",
# "convert_geometry_to_roads_parquet",
# "convert_geometry_to_wide_form_geodataframe",
# "convert_geometry_to_wide_form_parquet",
# ]
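A hypothetical usage sketch of the exported wide-form API. The exact signatures are defined in `overturemaestro/advanced_functions/functions.py`, which is not part of this excerpt, so the parameter order and bounding-box form below are assumptions for illustration only; `buildings`/`building` and `places`/`place` are standard Overture theme/type names.

# Hypothetical invocation; signatures are assumptions, see the note above.
from overturemaestro.advanced_functions import (
    convert_bounding_box_to_wide_form_geodataframe,
    convert_bounding_box_to_wide_form_geodataframe_for_all_types,
)

bbox = (7.41, 43.72, 7.44, 43.75)  # minx, miny, maxx, maxy

# Single theme/type pair, pivoted into the wide form.
buildings_gdf = convert_bounding_box_to_wide_form_geodataframe("buildings", "building", bbox)

# All available theme/type datasets combined into one wide-form GeoDataFrame.
everything_gdf = convert_bounding_box_to_wide_form_geodataframe_for_all_types(bbox)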