diff --git a/CHANGELOG.md b/CHANGELOG.md
index dd99f24..4161189 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]

+### Changed
+
+- Removed redundant intermediate GeoParquet result file when only one extract covers the whole area [#35](https://github.com/kraina-ai/quackosm/issues/35)
+
+### Fixed
+
+- Added missing `requests` dependency
+
 ## [0.4.0] - 2024-01-31

 ### Added
diff --git a/README.md b/README.md
index 7966747..08c91c3 100644
--- a/README.md
+++ b/README.md
@@ -41,15 +41,26 @@ QuackOSM supports **Python >= 3.9**

 ### Dependencies

 Required:
-- duckdb (>=0.9.2)
-- pyarrow (>=13.0.0)
-- geoarrow-pyarrow (>=0.1.1)
-- geopandas
-- shapely (>=2.0)
-- typeguard
+- duckdb (>=0.9.2) - For all DuckDB operations on PBF files
+- pyarrow (>=13.0.0) - For wrangling Parquet files
+- pyarrow-ops - For easy removal of duplicated features in Parquet files
+- geoarrow-pyarrow (>=0.1.1) - For GeoParquet IO operations
+- geopandas - For returning GeoDataFrames and reading geo files
+- shapely (>=2.0) - For parsing WKT and GeoJSON strings and fixing geometries
+- typeguard - For internal validation of types
+- psutil - For automatic scaling of parameters based on available resources
+- pooch - For downloading `*.osm.pbf` files
+- tqdm - For showing progress bars
+- requests - For iterating over OSM PBF file services
+- beautifulsoup4 - For parsing HTML files and scraping required information

 Optional:
-- typer[all] (click, colorama, rich, shellingham)
+- typer[all] (click, colorama, rich, shellingham) - For the CLI
+- osmnx - For geocoding strings in the CLI
+- h3 - For reading H3 strings in the CLI
+- h3ronpy - For transforming H3 indexes into geometries
+- s2 - For transforming S2 indexes into geometries
+- python-geohash - For transforming GeoHash indexes into geometries

 ## Usage
diff --git a/pdm.lock b/pdm.lock
index 4efd333..add7030 100644
--- a/pdm.lock
+++ b/pdm.lock
@@ -5,7 +5,7 @@ groups = ["default", "dev", "docs", "license", "lint", "test", "cli", "cli-dev"]
 strategy = ["cross_platform"]
 lock_version = "4.4.1"
-content_hash = "sha256:87a6882ff8eac17697cc290dd53104c101175d08fc4585238434d7386b2aa692"
+content_hash = "sha256:457aacc81c0be0f2aa13cd15d20888fff0e50e19e26b5d864b5002874932f844"

 [[package]]
 name = "appnope"
diff --git a/pyproject.toml b/pyproject.toml
index dbaedd9..52953c6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -16,6 +16,7 @@ dependencies = [
     "tqdm",
     "beautifulsoup4",
     "pyarrow-ops",
+    "requests",
 ]
 requires-python = ">=3.9"
 readme = "README.md"
diff --git a/quackosm/pbf_file_reader.py b/quackosm/pbf_file_reader.py
index 5cd68cf..0b24b35 100644
--- a/quackosm/pbf_file_reader.py
+++ b/quackosm/pbf_file_reader.py
@@ -270,29 +270,45 @@ def convert_geometry_filter_to_gpq(
                 explode_tags=explode_tags,
             )
         )
-        if not result_file_path.exists() or ignore_cache:
-            matching_extracts = find_smallest_containing_extract(
-                self.geometry_filter, self.osm_extract_source
-            )
-            pbf_files = download_extracts_pbf_files(matching_extracts, self.working_directory)
-            parsed_geoparquet_files = []
-            for file_path in pbf_files:
-                parsed_geoparquet_file = self.convert_pbf_to_gpq(
-                    file_path,
-                    keep_all_tags=keep_all_tags,
-                    explode_tags=explode_tags,
-                    ignore_cache=ignore_cache,
-                    filter_osm_ids=filter_osm_ids,
-                )
-                parsed_geoparquet_files.append(parsed_geoparquet_file)
+        matching_extracts = find_smallest_containing_extract(
+            self.geometry_filter, self.osm_extract_source
+        )

-            joined_parquet_table = self._drop_duplicates_features_in_pyarrow_table(
-                parsed_geoparquet_files
-            )
-            io.write_geoparquet_table(  # type: ignore
-                joined_parquet_table, result_file_path, primary_geometry_column=GEOMETRY_COLUMN
+        if len(matching_extracts) == 1:
+            pbf_files = download_extracts_pbf_files(matching_extracts, self.working_directory)
+            return self.convert_pbf_to_gpq(
+                pbf_files[0],
+                result_file_path=result_file_path,
+                keep_all_tags=keep_all_tags,
+                explode_tags=explode_tags,
+                ignore_cache=ignore_cache,
+                filter_osm_ids=filter_osm_ids,
             )
+        else:
+            if not result_file_path.exists() or ignore_cache:
+                matching_extracts = find_smallest_containing_extract(
+                    self.geometry_filter, self.osm_extract_source
+                )
+                pbf_files = download_extracts_pbf_files(matching_extracts, self.working_directory)
+
+                parsed_geoparquet_files = []
+                for file_path in pbf_files:
+                    parsed_geoparquet_file = self.convert_pbf_to_gpq(
+                        file_path,
+                        keep_all_tags=keep_all_tags,
+                        explode_tags=explode_tags,
+                        ignore_cache=ignore_cache,
+                        filter_osm_ids=filter_osm_ids,
+                    )
+                    parsed_geoparquet_files.append(parsed_geoparquet_file)
+
+                joined_parquet_table = self._drop_duplicates_features_in_pyarrow_table(
+                    parsed_geoparquet_files
+                )
+                io.write_geoparquet_table(  # type: ignore
+                    joined_parquet_table, result_file_path, primary_geometry_column=GEOMETRY_COLUMN
+                )
         return Path(result_file_path)
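The core of this patch is the new single-extract short circuit in `PbfFileReader.convert_geometry_filter_to_gpq`: when one OSM extract covers the whole geometry filter, the PBF file is converted straight into the final GeoParquet path, while the parse-then-merge path (which previously always produced an extra intermediate result file) only runs when several extracts have to be stitched together. Below is a minimal sketch of that control flow; the helpers `find_extracts`, `download_pbf`, `convert_pbf`, and `merge_deduplicated` are hypothetical stand-ins, not QuackOSM's actual API.

```python
from pathlib import Path
from typing import List, Optional


def find_extracts(geometry: object) -> List[str]:
    """Hypothetical: return the smallest set of extract names covering the geometry."""
    return ["monaco"]


def download_pbf(extract: str, workdir: Path) -> Path:
    """Hypothetical: download an extract and return the local `*.osm.pbf` path."""
    return workdir / f"{extract}.osm.pbf"


def convert_pbf(pbf_path: Path, result_path: Optional[Path] = None) -> Path:
    """Hypothetical: convert one PBF file to GeoParquet, honouring a target path."""
    return result_path if result_path is not None else pbf_path.with_suffix(".parquet")


def merge_deduplicated(parts: List[Path], result_path: Path) -> Path:
    """Hypothetical: merge parts into one file, dropping duplicated features."""
    return result_path


def convert_geometry(
    geometry: object, result_path: Path, workdir: Path, ignore_cache: bool = False
) -> Path:
    extracts = find_extracts(geometry)

    if len(extracts) == 1:
        # One extract covers the whole area: write straight to the final path
        # and skip the redundant intermediate GeoParquet result file.
        return convert_pbf(download_pbf(extracts[0], workdir), result_path)

    if not result_path.exists() or ignore_cache:
        # Multiple extracts: convert each one separately, then merge them while
        # removing features duplicated across neighbouring extracts.
        parts = [convert_pbf(download_pbf(name, workdir)) for name in extracts]
        merge_deduplicated(parts, result_path)

    return result_path
```

Note that in the patch the single-extract branch runs outside the `result_file_path.exists()` check and passes `ignore_cache` down to `convert_pbf_to_gpq`, presumably because that method performs its own cache check; the sketch mirrors this by delegating caching on the single-extract path.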