From 05847c3c77ccd726ebe467289376a57721fb68b7 Mon Sep 17 00:00:00 2001 From: mwip Date: Fri, 20 Sep 2024 21:44:41 +0200 Subject: [PATCH 01/12] Add support for exporting to duckdb (via parquet) This patch adds functionality to export directly to a DuckDB database via the --duckdb flag or using a ".duckdb" or ".db" file. Optionally one can change the table name in which data will be imported. Documentation was mostly copied from existing functions but doctests were updated and checked for consistency with the results. Closes kraina-ai/quackosm#94 --- quackosm/__init__.py | 6 + quackosm/cli.py | 212 ++++++++++---- quackosm/functions.py | 569 ++++++++++++++++++++++++++++++++++++ quackosm/pbf_file_reader.py | 147 ++++++++++ tests/base/test_cli.py | 6 + 5 files changed, 885 insertions(+), 55 deletions(-) diff --git a/quackosm/__init__.py b/quackosm/__init__.py index 5a0d190..093eacb 100644 --- a/quackosm/__init__.py +++ b/quackosm/__init__.py @@ -6,10 +6,13 @@ """ from quackosm.functions import ( + convert_geometry_to_duckdb, convert_geometry_to_geodataframe, convert_geometry_to_parquet, + convert_osm_extract_to_duckdb, convert_osm_extract_to_geodataframe, convert_osm_extract_to_parquet, + convert_pbf_to_duckdb, convert_pbf_to_geodataframe, convert_pbf_to_parquet, ) @@ -23,8 +26,11 @@ __all__ = [ "PbfFileReader", "convert_pbf_to_parquet", + "convert_pbf_to_duckdb", "convert_geometry_to_parquet", + "convert_geometry_to_duckdb", "convert_osm_extract_to_parquet", + "convert_osm_extract_to_duckdb", "convert_pbf_to_geodataframe", "convert_geometry_to_geodataframe", "convert_osm_extract_to_geodataframe", diff --git a/quackosm/cli.py b/quackosm/cli.py index 59fe303..7826e64 100644 --- a/quackosm/cli.py +++ b/quackosm/cli.py @@ -491,6 +491,20 @@ def main( ), ), ] = None, + duckdb: Annotated[ + bool, + typer.Option( + "--duckdb", + help="Export to duckdb database", + ), + ] = None, + duckdb_table_name: Annotated[ + Optional[str], + typer.Option( + "--duckdb-table-name", + 
help="Table name which the data will be imported into in the DuckDB database.", + ), + ] = "quackosm", ignore_cache: Annotated[ bool, typer.Option( @@ -674,34 +688,35 @@ def main( logging.disable(logging.CRITICAL) if pbf_file: - from quackosm.functions import convert_pbf_to_parquet - - geoparquet_path = convert_pbf_to_parquet( - pbf_path=pbf_file, - tags_filter=osm_tags_filter or osm_tags_filter_file, # type: ignore - keep_all_tags=keep_all_tags, - geometry_filter=geometry_filter_value, - explode_tags=explode_tags, - ignore_cache=ignore_cache, - working_directory=working_directory, - result_file_path=result_file_path, - osm_way_polygon_features_config=( - json.loads(Path(osm_way_polygon_features_config).read_text()) - if osm_way_polygon_features_config - else None - ), - filter_osm_ids=filter_osm_ids, # type: ignore - save_as_wkt=wkt_result, - verbosity_mode=verbosity_mode, - ) - elif osm_extract_query: - from quackosm._exceptions import OsmExtractSearchError - from quackosm.functions import convert_osm_extract_to_parquet + # export to DuckDB database + if (result_file_path and result_file_path.suffix in [".duckdb", ".db"]) or duckdb: + from quackosm.functions import convert_pbf_to_duckdb - try: - geoparquet_path = convert_osm_extract_to_parquet( - osm_extract_query=osm_extract_query, - osm_extract_source=osm_extract_source, + result_path = convert_pbf_to_duckdb( + pbf_path=pbf_file, + tags_filter=osm_tags_filter or osm_tags_filter_file, # type: ignore + keep_all_tags=keep_all_tags, + geometry_filter=geometry_filter_value, + explode_tags=explode_tags, + ignore_cache=ignore_cache, + working_directory=working_directory, + result_file_path=result_file_path, + osm_way_polygon_features_config=( + json.loads(Path(osm_way_polygon_features_config).read_text()) + if osm_way_polygon_features_config + else None + ), + filter_osm_ids=filter_osm_ids, # type: ignore + duckdb_table_name=duckdb_table_name, + verbosity_mode=verbosity_mode, + ) + + # export to parquet + else: + from 
quackosm.functions import convert_pbf_to_parquet + + result_path = convert_pbf_to_parquet( + pbf_path=pbf_file, tags_filter=osm_tags_filter or osm_tags_filter_file, # type: ignore keep_all_tags=keep_all_tags, geometry_filter=geometry_filter_value, @@ -718,33 +733,120 @@ def main( save_as_wkt=wkt_result, verbosity_mode=verbosity_mode, ) - except OsmExtractSearchError as ex: - from rich.console import Console + elif osm_extract_query: + # export to DuckDB database + if (result_file_path and result_file_path.suffix in [".duckdb", ".db"]) or duckdb: + from quackosm._exceptions import OsmExtractSearchError + from quackosm.functions import convert_osm_extract_to_duckdb + + try: + result_path = convert_osm_extract_to_duckdb( + osm_extract_query=osm_extract_query, + osm_extract_source=osm_extract_source, + tags_filter=osm_tags_filter or osm_tags_filter_file, # type: ignore + keep_all_tags=keep_all_tags, + geometry_filter=geometry_filter_value, + explode_tags=explode_tags, + ignore_cache=ignore_cache, + working_directory=working_directory, + result_file_path=result_file_path, + osm_way_polygon_features_config=( + json.loads(Path(osm_way_polygon_features_config).read_text()) + if osm_way_polygon_features_config + else None + ), + filter_osm_ids=filter_osm_ids, # type: ignore + duckdb_table_name=duckdb_table_name, + save_as_wkt=wkt_result, + verbosity_mode=verbosity_mode, + ) + except OsmExtractSearchError as ex: + from rich.console import Console + + err_console = Console(stderr=True) + err_console.print(ex) + raise typer.Exit(code=1) from None + + # export to parquet + else: + from quackosm._exceptions import OsmExtractSearchError + from quackosm.functions import convert_osm_extract_to_parquet + + try: + result_path = convert_osm_extract_to_parquet( + osm_extract_query=osm_extract_query, + osm_extract_source=osm_extract_source, + tags_filter=osm_tags_filter or osm_tags_filter_file, # type: ignore + keep_all_tags=keep_all_tags, + geometry_filter=geometry_filter_value, + 
explode_tags=explode_tags, + ignore_cache=ignore_cache, + working_directory=working_directory, + result_file_path=result_file_path, + osm_way_polygon_features_config=( + json.loads(Path(osm_way_polygon_features_config).read_text()) + if osm_way_polygon_features_config + else None + ), + filter_osm_ids=filter_osm_ids, # type: ignore + save_as_wkt=wkt_result, + verbosity_mode=verbosity_mode, + ) + except OsmExtractSearchError as ex: + from rich.console import Console - err_console = Console(stderr=True) - err_console.print(ex) - raise typer.Exit(code=1) from None + err_console = Console(stderr=True) + err_console.print(ex) + raise typer.Exit(code=1) from None else: - from quackosm.functions import convert_geometry_to_parquet - - geoparquet_path = convert_geometry_to_parquet( - geometry_filter=geometry_filter_value, - osm_extract_source=osm_extract_source, - tags_filter=osm_tags_filter or osm_tags_filter_file, # type: ignore - keep_all_tags=keep_all_tags, - explode_tags=explode_tags, - ignore_cache=ignore_cache, - working_directory=working_directory, - result_file_path=result_file_path, - osm_way_polygon_features_config=( - json.loads(Path(osm_way_polygon_features_config).read_text()) - if osm_way_polygon_features_config - else None - ), - filter_osm_ids=filter_osm_ids, # type: ignore - save_as_wkt=wkt_result, - verbosity_mode=verbosity_mode, - geometry_coverage_iou_threshold=geometry_coverage_iou_threshold, - allow_uncovered_geometry=allow_uncovered_geometry, - ) - typer.secho(geoparquet_path, fg="green") + # export to DuckDB database + if (result_file_path and result_file_path.suffix in [".duckdb", ".db"]) or duckdb: + from quackosm.functions import convert_geometry_to_duckdb + + result_path = convert_geometry_to_duckdb( + geometry_filter=geometry_filter_value, + osm_extract_source=osm_extract_source, + tags_filter=osm_tags_filter or osm_tags_filter_file, # type: ignore + keep_all_tags=keep_all_tags, + explode_tags=explode_tags, + ignore_cache=ignore_cache, + 
working_directory=working_directory, + result_file_path=result_file_path, + osm_way_polygon_features_config=( + json.loads(Path(osm_way_polygon_features_config).read_text()) + if osm_way_polygon_features_config + else None + ), + filter_osm_ids=filter_osm_ids, # type: ignore + duckdb_table_name=duckdb_table_name, + save_as_wkt=wkt_result, + verbosity_mode=verbosity_mode, + geometry_coverage_iou_threshold=geometry_coverage_iou_threshold, + allow_uncovered_geometry=allow_uncovered_geometry, + ) + + # export to parquet + else: + from quackosm.functions import convert_geometry_to_parquet + + result_path = convert_geometry_to_parquet( + geometry_filter=geometry_filter_value, + osm_extract_source=osm_extract_source, + tags_filter=osm_tags_filter or osm_tags_filter_file, # type: ignore + keep_all_tags=keep_all_tags, + explode_tags=explode_tags, + ignore_cache=ignore_cache, + working_directory=working_directory, + result_file_path=result_file_path, + osm_way_polygon_features_config=( + json.loads(Path(osm_way_polygon_features_config).read_text()) + if osm_way_polygon_features_config + else None + ), + filter_osm_ids=filter_osm_ids, # type: ignore + save_as_wkt=wkt_result, + verbosity_mode=verbosity_mode, + geometry_coverage_iou_threshold=geometry_coverage_iou_threshold, + allow_uncovered_geometry=allow_uncovered_geometry, + ) + typer.secho(result_path, fg="green") diff --git a/quackosm/functions.py b/quackosm/functions.py index cb30dd0..8b14803 100644 --- a/quackosm/functions.py +++ b/quackosm/functions.py @@ -19,13 +19,582 @@ __all__ = [ "convert_pbf_to_parquet", + "convert_pbf_to_duckdb", "convert_geometry_to_parquet", + "convert_geometry_to_duckdb", "convert_pbf_to_geodataframe", "convert_geometry_to_geodataframe", "convert_osm_extract_to_parquet", + "convert_osm_extract_to_duckdb", "convert_osm_extract_to_geodataframe", ] +def convert_pbf_to_duckdb( + pbf_path: Union[str, Path, Iterable[Union[str, Path]]], + tags_filter: Optional[Union[OsmTagsFilter, 
GroupedOsmTagsFilter]] = None, + geometry_filter: Optional[BaseGeometry] = None, + result_file_path: Optional[Union[str, Path]] = None, + keep_all_tags: bool = False, + explode_tags: Optional[bool] = None, + ignore_cache: bool = False, + filter_osm_ids: Optional[list[str]] = None, + duckdb_table_name: str = "quackosm", + working_directory: Union[str, Path] = "files", + osm_way_polygon_features_config: Optional[Union[OsmWayPolygonConfig, dict[str, Any]]] = None, + verbosity_mode: Literal["silent", "transient", "verbose"] = "transient", + debug_memory: bool = False, + debug_times: bool = False, +) -> Path: + """ + Convert PBF file to DuckDB file. + + Args: + pbf_path (Union[str, Path, Iterable[Union[str, Path]]]): + Path or list of paths of `*.osm.pbf` files to be parsed. Can be a URL. + tags_filter (Union[OsmTagsFilter, GroupedOsmTagsFilter], optional): A dictionary + specifying which tags to download. + The keys should be OSM tags (e.g. `building`, `amenity`). + The values should either be `True` for retrieving all objects with the tag, + string for retrieving a single tag-value pair + or list of strings for retrieving all values specified in the list. + `tags={'leisure': 'park'}` would return parks from the area. + `tags={'leisure': 'park', 'amenity': True, 'shop': ['bakery', 'bicycle']}` + would return parks, all amenity types, bakeries and bicycle shops. + If `None`, handler will allow all of the tags to be parsed. Defaults to `None`. + geometry_filter (BaseGeometry, optional): Region which can be used to filter only + intersecting OSM objects. Defaults to `None`. + result_file_path (Union[str, Path], optional): Where to save + the DuckDB file. If not provided, will be generated based on hashes + from provided tags filter and geometry filter. Defaults to `None`. + keep_all_tags (bool, optional): Works only with the `tags_filter` parameter. + Whether to keep all tags related to the element, or return only those defined + in the `tags_filter`.
When `True`, will override the optional grouping defined + in the `tags_filter`. Defaults to `False`. + explode_tags (bool, optional): Whether to split tags into columns based on OSM tag keys. + If `None`, will be set based on `tags_filter` and `keep_all_tags` parameters. + If there is tags filter defined and `keep_all_tags` is set to `False`, then it will + be set to `True`. Otherwise it will be set to `False`. Defaults to `None`. + ignore_cache (bool, optional): Whether to ignore precalculated geoparquet files or not. + Defaults to False. + filter_osm_ids (list[str], optional): List of OSM features ids to read from the file. + Have to be in the form of 'node/<id>', 'way/<id>' or 'relation/<id>'. + Defaults to an empty list. + duckdb_table_name (str): Table in which to store the OSM data inside the DuckDB database. + working_directory (Union[str, Path], optional): Directory where to save + the parsed `*.parquet` files. Defaults to "files". + osm_way_polygon_features_config (Union[OsmWayPolygonConfig, dict[str, Any]], optional): + Config used to determine which closed way features are polygons. + Modifications to this config are left for experienced OSM users. + Defaults to predefined "osm_way_polygon_features.json". + verbosity_mode (Literal["silent", "transient", "verbose"], optional): Set progress + verbosity mode. Can be one of: silent, transient and verbose. Silent disables + output completely. Transient tracks progress, but removes output after finished. + Verbose leaves all progress outputs in the stdout. Defaults to "transient". + debug_memory (bool, optional): If turned on, will keep all temporary files after operation + for debugging. Defaults to `False`. + debug_times (bool, optional): If turned on, will report timestamps at which second each + step has been executed. Defaults to `False`. + + Returns: + Path: Path to the generated DuckDB file. + + Examples: + Get OSM data from a PBF file.
+ + Tags will be kept in a single column + >>> from pathlib import Path + >>> import quackosm as qosm + + >>> ddb_path = qosm.convert_pbf_to_duckdb(monaco_pbf_path) + >>> ddb_path.as_posix() + 'files/monaco-latest_nofilter_noclip_compact.duckdb' + + >>> import duckdb + >>> with duckdb.connect(str(ddb_path)) as con: + ... con.load_extension('spatial') + ... con.sql("SELECT * FROM quackosm ORDER BY feature_id;") # doctest: +SKIP + ┌──────────────────┬──────────────────────┬──────────────────────────────────────────────┐ + │ feature_id │ tags │ geometry │ + │ varchar │ map(varchar, varch… │ geometry │ + ├──────────────────┼──────────────────────┼──────────────────────────────────────────────┤ + │ node/10005045289 │ {shop=bakery} │ POINT (7.4224498 43.7310532) │ + │ node/10020887517 │ {leisure=swimming_… │ POINT (7.4131561 43.7338391) │ + │ node/10021298117 │ {leisure=swimming_… │ POINT (7.4277743 43.7427669) │ + │ node/10021298717 │ {leisure=swimming_… │ POINT (7.4263029 43.7409734) │ + │ node/10025656383 │ {ferry=yes, name=Q… │ POINT (7.4254971 43.7369002) │ + │ node/10025656390 │ {amenity=restauran… │ POINT (7.4269287 43.7368818) │ + │ node/10025656391 │ {name=Capitainerie… │ POINT (7.4272127 43.7359593) │ + │ node/10025656392 │ {name=Direction de… │ POINT (7.4270392 43.7365262) │ + │ node/10025656393 │ {brand=IQOS, brand… │ POINT (7.4275175 43.7373195) │ + │ node/10025656394 │ {artist_name=Anna … │ POINT (7.4293446 43.737448) │ + │ · │ · │ · │ + │ · │ · │ · │ + │ · │ · │ · │ + │ way/986864693 │ {natural=bare_rock} │ POLYGON ((7.4340482 43.745598, 7.4340263 4… │ + │ way/986864694 │ {barrier=wall} │ LINESTRING (7.4327547 43.7445382, 7.432808… │ + │ way/986864695 │ {natural=bare_rock} │ POLYGON ((7.4332994 43.7449315, 7.4332912 … │ + │ way/986864696 │ {barrier=wall} │ LINESTRING (7.4356006 43.7464325, 7.435574… │ + │ way/986864697 │ {natural=bare_rock} │ POLYGON ((7.4362767 43.74697, 7.4362983 43… │ + │ way/990669427 │ {amenity=shelter, … │ POLYGON ((7.4146087 
43.733883, 7.4146192 4… │ + │ way/990669428 │ {highway=secondary… │ LINESTRING (7.4136598 43.7334433, 7.413640… │ + │ way/990669429 │ {highway=secondary… │ LINESTRING (7.4137621 43.7334251, 7.413746… │ + │ way/990848785 │ {addr:city=Monaco,… │ POLYGON ((7.4142551 43.7339622, 7.4143113 … │ + │ way/993121275 │ {building=yes, nam… │ POLYGON ((7.4321416 43.7481309, 7.4321638 … │ + ├──────────────────┴──────────────────────┴──────────────────────────────────────────────┤ + │ 8154 rows (20 shown) 3 columns │ + └────────────────────────────────────────────────────────────────────────────────────────┘ + + Get only buildings, amenities and highways from a PBF file. + >>> ddb_path = qosm.convert_pbf_to_duckdb( + ... monaco_pbf_path, tags_filter={"building": True, "amenity": True, "highway": True} + ... ) + >>> ddb_path.as_posix() + 'files/monaco-latest_6593ca69098459d039054bc5fe0a87c56681e29a5f59d38ce3485c03cb0e9374_noclip_compact.duckdb' + + Get features for Malé - the capital city of Maldives + + Tags will be kept in a single column. + >>> with duckdb.connect(str(ddb_path)) as con: + ... con.load_extension('spatial') + ... 
con.sql("SELECT * FROM quackosm ORDER BY feature_id;") # doctest: +SKIP + ┌──────────────────┬──────────┬────────────┬─────────────┬───────────────────────────────┐ + │ feature_id │ building │ amenity │ highway │ geometry │ + │ varchar │ varchar │ varchar │ varchar │ geometry │ + ├──────────────────┼──────────┼────────────┼─────────────┼───────────────────────────────┤ + │ node/10025656390 │ NULL │ restaurant │ NULL │ POINT (7.4269287 43.7368818) │ + │ node/10025843517 │ NULL │ restaurant │ NULL │ POINT (7.4219362 43.7367446) │ + │ node/10025852089 │ NULL │ bar │ NULL │ POINT (7.4227543 43.7369926) │ + │ node/10025852090 │ NULL │ restaurant │ NULL │ POINT (7.4225093 43.7369627) │ + │ node/10068880332 │ NULL │ NULL │ bus_stop │ POINT (7.4380858 43.7493026) │ + │ node/10068880335 │ NULL │ bench │ NULL │ POINT (7.4186855 43.7321515) │ + │ node/10127713363 │ NULL │ cafe │ NULL │ POINT (7.4266367 43.7420755) │ + │ node/10601158089 │ NULL │ restaurant │ NULL │ POINT (7.4213086 43.7336187) │ + │ node/10671507005 │ NULL │ bar │ NULL │ POINT (7.4296915 43.7423307) │ + │ node/10674256605 │ NULL │ bar │ NULL │ POINT (7.4213558 43.7336317) │ + │ · │ · │ · │ · │ · │ + │ · │ · │ · │ · │ · │ + │ · │ · │ · │ · │ · │ + │ way/981971425 │ NULL │ NULL │ residential │ LINESTRING (7.4321217 43.74… │ + │ way/982061461 │ NULL │ NULL │ secondary │ LINESTRING (7.4246341 43.74… │ + │ way/982081599 │ NULL │ NULL │ tertiary │ LINESTRING (7.4225202 43.73… │ + │ way/982081600 │ NULL │ NULL │ service │ LINESTRING (7.4225202 43.73… │ + │ way/986029035 │ NULL │ NULL │ path │ LINESTRING (7.4189462 43.73… │ + │ way/990669427 │ NULL │ shelter │ NULL │ POLYGON ((7.4146087 43.7338… │ + │ way/990669428 │ NULL │ NULL │ secondary │ LINESTRING (7.4136598 43.73… │ + │ way/990669429 │ NULL │ NULL │ secondary │ LINESTRING (7.4137621 43.73… │ + │ way/990848785 │ yes │ NULL │ NULL │ POLYGON ((7.4142551 43.7339… │ + │ way/993121275 │ yes │ NULL │ NULL │ POLYGON ((7.4321416 43.7481… │ + 
├──────────────────┴──────────┴────────────┴─────────────┴───────────────────────────────┤ + │ 5902 rows (20 shown) 5 columns │ + └────────────────────────────────────────────────────────────────────────────────────────┘ + + >>> from shapely.geometry import box + >>> ddb_path = qosm.convert_pbf_to_duckdb( + ... maldives_pbf_path, + ... geometry_filter=box( + ... minx=73.4975872, + ... miny=4.1663240, + ... maxx=73.5215528, + ... maxy=4.1818121 + ... ) + ... ) # doctest: +IGNORE_RESULT + >>> ddb_path.as_posix() + 'files/maldives-latest_nofilter_4eeabb20ccd8aefeaa80b9a46a202ab985fd454760823b7012cc7778498a085b_compact.duckdb' + + >>> with duckdb.connect(str(ddb_path)) as con: + ... con.load_extension('spatial') + ... con.sql("SELECT * FROM quackosm ORDER BY feature_id;") # doctest: +SKIP + ┌──────────────────┬──────────────────────┬──────────────────────────────────────────────┐ + │ feature_id │ tags │ geometry │ + │ varchar │ map(varchar, varch… │ geometry │ + ├──────────────────┼──────────────────────┼──────────────────────────────────────────────┤ + │ node/10010180778 │ {brand=Ooredoo, br… │ POINT (73.5179039 4.1752105) │ + │ node/10062500171 │ {contact:facebook=… │ POINT (73.509583 4.1724485) │ + │ node/10078084764 │ {addr:city=Male', … │ POINT (73.5047972 4.1726734) │ + │ node/10078086040 │ {addr:city=Malé, a… │ POINT (73.5031714 4.1759622) │ + │ node/10158825718 │ {addr:postcode=201… │ POINT (73.5083189 4.1730108) │ + │ node/10289176711 │ {addr:street=Dhona… │ POINT (73.5133902 4.1725724) │ + │ node/10294045310 │ {amenity=restauran… │ POINT (73.5091277 4.1735378) │ + │ node/10294045311 │ {amenity=restauran… │ POINT (73.5055534 4.1759515) │ + │ node/10294045411 │ {amenity=restauran… │ POINT (73.5037257 4.1717866) │ + │ node/10294045412 │ {amenity=restauran… │ POINT (73.5024147 4.1761633) │ + │ · │ · │ · │ + │ · │ · │ · │ + │ · │ · │ · │ + │ way/91986244 │ {highway=residenti… │ LINESTRING (73.5069785 4.1704686, 73.50759… │ + │ way/91986245 │ {highway=residenti… │ 
LINESTRING (73.5135834 4.1740562, 73.51383… │ + │ way/91986249 │ {highway=residenti… │ LINESTRING (73.5153971 4.1735146, 73.51601… │ + │ way/91986251 │ {highway=residenti… │ LINESTRING (73.5082522 4.1709887, 73.50823… │ + │ way/91986254 │ {highway=residenti… │ LINESTRING (73.508114 4.1693477, 73.508154… │ + │ way/91986255 │ {landuse=cemetery,… │ POLYGON ((73.507509 4.1731064, 73.5078884 … │ + │ way/91986256 │ {highway=residenti… │ LINESTRING (73.5106692 4.1744828, 73.51082… │ + │ way/935784864 │ {layer=-1, locatio… │ LINESTRING (73.4875382 4.1703263, 73.50074… │ + │ way/935784867 │ {layer=-1, locatio… │ LINESTRING (73.446172 4.1856738, 73.460937… │ + │ way/959150179 │ {amenity=place_of_… │ POLYGON ((73.5184052 4.1755282, 73.5184863… │ + ├──────────────────┴──────────────────────┴──────────────────────────────────────────────┤ + │ 2168 rows (20 shown) 3 columns │ + └────────────────────────────────────────────────────────────────────────────────────────┘ + """ + return PbfFileReader( + tags_filter=tags_filter, + geometry_filter=geometry_filter, + working_directory=working_directory, + osm_way_polygon_features_config=osm_way_polygon_features_config, + verbosity_mode=verbosity_mode, + debug_memory=debug_memory, + debug_times=debug_times, + ).convert_pbf_to_duckdb( + pbf_path=pbf_path, + result_file_path=result_file_path, + keep_all_tags=keep_all_tags, + explode_tags=explode_tags, + ignore_cache=ignore_cache, + filter_osm_ids=filter_osm_ids, + duckdb_table_name=duckdb_table_name, + ) + +def convert_geometry_to_duckdb( + geometry_filter: BaseGeometry = None, + osm_extract_source: Union[OsmExtractSource, str] = OsmExtractSource.any, + tags_filter: Optional[Union[OsmTagsFilter, GroupedOsmTagsFilter]] = None, + result_file_path: Optional[Union[str, Path]] = None, + keep_all_tags: bool = False, + explode_tags: Optional[bool] = None, + ignore_cache: bool = False, + filter_osm_ids: Optional[list[str]] = None, + duckdb_table_name: str = "quackosm", + working_directory: 
Union[str, Path] = "files", + osm_way_polygon_features_config: Optional[Union[OsmWayPolygonConfig, dict[str, Any]]] = None, + verbosity_mode: Literal["silent", "transient", "verbose"] = "transient", + geometry_coverage_iou_threshold: float = 0.01, + allow_uncovered_geometry: bool = False, + debug_memory: bool = False, + debug_times: bool = False, +) -> Path: + """ + Get a DuckDB file with OpenStreetMap features within given geometry. + + Automatically downloads matching OSM extracts from different sources and returns a single file + as a result. + + Args: + geometry_filter (BaseGeometry): Geometry filter used to download matching OSM extracts. + osm_extract_source (Union[OsmExtractSource, str], optional): A source for automatic + downloading of OSM extracts. Can be Geofabrik, BBBike, OSMfr or any. + Defaults to `any`. + tags_filter (Union[OsmTagsFilter, GroupedOsmTagsFilter], optional): A dictionary + specifying which tags to download. + The keys should be OSM tags (e.g. `building`, `amenity`). + The values should either be `True` for retrieving all objects with the tag, + string for retrieving a single tag-value pair + or list of strings for retrieving all values specified in the list. + `tags={'leisure': 'park'}` would return parks from the area. + `tags={'leisure': 'park', 'amenity': True, 'shop': ['bakery', 'bicycle']}` + would return parks, all amenity types, bakeries and bicycle shops. + If `None`, handler will allow all of the tags to be parsed. Defaults to `None`. + result_file_path (Union[str, Path], optional): Where to save + the DuckDB file. If not provided, will be generated based on hashes + from provided tags filter and geometry filter. Defaults to `None`. + keep_all_tags (bool, optional): Works only with the `tags_filter` parameter. + Whether to keep all tags related to the element, or return only those defined + in the `tags_filter`. When `True`, will override the optional grouping defined + in the `tags_filter`. Defaults to `False`.
+ explode_tags (bool, optional): Whether to split tags into columns based on OSM tag keys. + If `None`, will be set based on `tags_filter` and `keep_all_tags` parameters. + If there is tags filter defined and `keep_all_tags` is set to `False`, then it will + be set to `True`. Otherwise it will be set to `False`. Defaults to `None`. + ignore_cache (bool, optional): Whether to ignore precalculated geoparquet files or not. + Defaults to False. + filter_osm_ids (list[str], optional): List of OSM features ids to read from the file. + Have to be in the form of 'node/<id>', 'way/<id>' or 'relation/<id>'. + Defaults to an empty list. + duckdb_table_name (str): Table in which to store the OSM data inside the DuckDB database. + working_directory (Union[str, Path], optional): Directory where to save + the parsed `*.parquet` files. Defaults to "files". + osm_way_polygon_features_config (Union[OsmWayPolygonConfig, dict[str, Any]], optional): + Config used to determine which closed way features are polygons. + Modifications to this config are left for experienced OSM users. + Defaults to predefined "osm_way_polygon_features.json". + verbosity_mode (Literal["silent", "transient", "verbose"], optional): Set progress + verbosity mode. Can be one of: silent, transient and verbose. Silent disables + output completely. Transient tracks progress, but removes output after finished. + Verbose leaves all progress outputs in the stdout. Defaults to "transient". + geometry_coverage_iou_threshold (float): Minimal value of the Intersection over Union metric + for selecting the matching OSM extracts. If best matching extract has value lower than + the threshold, it is discarded (except the first one). Has to be in range between 0 + and 1. Value of 0 will allow every intersected extract, value of 1 will only allow + extracts that match the geometry exactly. Defaults to 0.01. + allow_uncovered_geometry (bool): Suppress an error if some geometry parts aren't covered + by any OSM extract.
Works only when PbfFileReader is asked to download OSM extracts + automatically. Defaults to `False`. + debug_memory (bool, optional): If turned on, will keep all temporary files after operation + for debugging. Defaults to `False`. + debug_times (bool, optional): If turned on, will report timestamps at which second each + step has been executed. Defaults to `False`. + + Returns: + Path: Path to the generated DuckDB file. + + Examples: + Get OSM data from the center of Monaco. + + >>> import quackosm as qosm + >>> from shapely import from_wkt + >>> wkt = ( + ... "POLYGON ((7.41644 43.73598, 7.41644 43.73142, 7.42378 43.73142," + ... " 7.42378 43.73598, 7.41644 43.73598))" + ... ) + >>> ddb_path = qosm.convert_geometry_to_duckdb(from_wkt(wkt)) # doctest: +IGNORE_RESULT + >>> ddb_path.as_posix() + 'files/bf4b33debfd6d3e605555340606df6ce7eea934958c1f3477aca0ccf79e7929f_nofilter_compact.duckdb' + + Inspect the file with duckdb + >>> import duckdb + >>> with duckdb.connect(str(ddb_path)) as con: + ... con.load_extension('spatial') + ... 
con.sql("SELECT * FROM quackosm ORDER BY feature_id;") # doctest: +SKIP + ┌──────────────────┬──────────────────────┬──────────────────────────────────────────────┐ + │ feature_id │ tags │ geometry │ + │ varchar │ map(varchar, varch… │ geometry │ + ├──────────────────┼──────────────────────┼──────────────────────────────────────────────┤ + │ node/10068880335 │ {amenity=bench, ma… │ POINT (7.4186855 43.7321515) │ + │ node/10196648824 │ {contact:city=Mona… │ POINT (7.4193805 43.7337539) │ + │ node/10601158089 │ {addr:city=Monaco,… │ POINT (7.4213086 43.7336187) │ + │ node/10672624925 │ {addr:city=Monaco,… │ POINT (7.4215683 43.7351727) │ + │ node/10674256605 │ {amenity=bar, name… │ POINT (7.4213558 43.7336317) │ + │ node/1074584632 │ {crossing=marked, … │ POINT (7.4188525 43.7323654) │ + │ node/1074584650 │ {crossing=marked, … │ POINT (7.4174145 43.7341601) │ + │ node/1079045434 │ {addr:country=MC, … │ POINT (7.4173175 43.7320823) │ + │ node/1079045443 │ {highway=traffic_s… │ POINT (7.4182804 43.7319223) │ + │ node/10862390705 │ {amenity=drinking_… │ POINT (7.4219582 43.7355272) │ + │ · │ · │ · │ + │ · │ · │ · │ + │ · │ · │ · │ + │ way/952068828 │ {attraction=water_… │ LINESTRING (7.4221787 43.7343579, 7.422176… │ + │ way/952068829 │ {attraction=water_… │ LINESTRING (7.4220996 43.7343719, 7.422131… │ + │ way/952068830 │ {attraction=water_… │ LINESTRING (7.4221161 43.7343595, 7.422119… │ + │ way/952068831 │ {attraction=water_… │ LINESTRING (7.4221421 43.7343773, 7.422159… │ + │ way/952068832 │ {attraction=water_… │ LINESTRING (7.4221748 43.7343815, 7.422173… │ + │ way/952419569 │ {highway=primary, … │ LINESTRING (7.4171229 43.7316079, 7.417117… │ + │ way/952419570 │ {highway=primary, … │ LINESTRING (7.4171473 43.7315034, 7.417166… │ + │ way/952419571 │ {highway=primary, … │ LINESTRING (7.4171671 43.731656, 7.4171486… │ + │ way/952419572 │ {highway=primary, … │ LINESTRING (7.4173054 43.7316813, 7.417276… │ + │ way/952419573 │ {highway=primary, … │ LINESTRING (7.4173897 
43.7316435, 7.417372… │ + ├──────────────────┴──────────────────────┴──────────────────────────────────────────────┤ + │ 1384 rows (20 shown) │ + └────────────────────────────────────────────────────────────────────────────────────────┘ + + Making sure that you are using specific OSM extract source - here Geofabrik. + + >>> ddb_path = qosm.convert_geometry_to_duckdb( + ... from_wkt(wkt), + ... osm_extract_source='Geofabrik', + ... ) # doctest: +IGNORE_RESULT + >>> ddb_path.as_posix() + 'files/bf4b33debfd6d3e605555340606df6ce7eea934958c1f3477aca0ccf79e7929f_nofilter_compact.duckdb' + + Inspect the file with duckdb + >>> with duckdb.connect(str(ddb_path)) as con: + ... con.load_extension('spatial') + ... con.sql("SELECT * FROM quackosm ORDER BY feature_id;") # doctest: +SKIP + ┌──────────────────┬──────────────────────┬──────────────────────────────────────────────┐ + │ feature_id │ tags │ geometry │ + │ varchar │ map(varchar, varch… │ geometry │ + ├──────────────────┼──────────────────────┼──────────────────────────────────────────────┤ + │ node/10068880335 │ {amenity=bench, ma… │ POINT (7.4186855 43.7321515) │ + │ node/10196648824 │ {contact:city=Mona… │ POINT (7.4193805 43.7337539) │ + │ node/10601158089 │ {addr:city=Monaco,… │ POINT (7.4213086 43.7336187) │ + │ node/10672624925 │ {addr:city=Monaco,… │ POINT (7.4215683 43.7351727) │ + │ node/10674256605 │ {amenity=bar, name… │ POINT (7.4213558 43.7336317) │ + │ node/1074584632 │ {crossing=marked, … │ POINT (7.4188525 43.7323654) │ + │ node/1074584650 │ {crossing=marked, … │ POINT (7.4174145 43.7341601) │ + │ node/1079045434 │ {addr:country=MC, … │ POINT (7.4173175 43.7320823) │ + │ node/1079045443 │ {highway=traffic_s… │ POINT (7.4182804 43.7319223) │ + │ node/10862390705 │ {amenity=drinking_… │ POINT (7.4219582 43.7355272) │ + │ · │ · │ · │ + │ · │ · │ · │ + │ · │ · │ · │ + │ way/952068828 │ {attraction=water_… │ LINESTRING (7.4221787 43.7343579, 7.422176… │ + │ way/952068829 │ {attraction=water_… │ LINESTRING
(7.4220996 43.7343719, 7.422131… │ + │ way/952068830 │ {attraction=water_… │ LINESTRING (7.4221161 43.7343595, 7.422119… │ + │ way/952068831 │ {attraction=water_… │ LINESTRING (7.4221421 43.7343773, 7.422159… │ + │ way/952068832 │ {attraction=water_… │ LINESTRING (7.4221748 43.7343815, 7.422173… │ + │ way/952419569 │ {highway=primary, … │ LINESTRING (7.4171229 43.7316079, 7.417117… │ + │ way/952419570 │ {highway=primary, … │ LINESTRING (7.4171473 43.7315034, 7.417166… │ + │ way/952419571 │ {highway=primary, … │ LINESTRING (7.4171671 43.731656, 7.4171486… │ + │ way/952419572 │ {highway=primary, … │ LINESTRING (7.4173054 43.7316813, 7.417276… │ + │ way/952419573 │ {highway=primary, … │ LINESTRING (7.4173897 43.7316435, 7.417372… │ + ├──────────────────┴──────────────────────┴──────────────────────────────────────────────┤ + │ 1384 rows (20 shown) │ + └────────────────────────────────────────────────────────────────────────────────────────┘ + """ + return PbfFileReader( + tags_filter=tags_filter, + geometry_filter=geometry_filter, + working_directory=working_directory, + osm_way_polygon_features_config=osm_way_polygon_features_config, + osm_extract_source=osm_extract_source, + verbosity_mode=verbosity_mode, + geometry_coverage_iou_threshold=geometry_coverage_iou_threshold, + allow_uncovered_geometry=allow_uncovered_geometry, + debug_memory=debug_memory, + debug_times=debug_times, + ).convert_geometry_to_duckdb( + result_file_path=result_file_path, + keep_all_tags=keep_all_tags, + explode_tags=explode_tags, + ignore_cache=ignore_cache, + filter_osm_ids=filter_osm_ids, + duckdb_table_name=duckdb_table_name, + ) + +def convert_osm_extract_to_duckdb( + osm_extract_query: str, + osm_extract_source: Union[OsmExtractSource, str] = OsmExtractSource.any, + tags_filter: Optional[Union[OsmTagsFilter, GroupedOsmTagsFilter]] = None, + geometry_filter: Optional[BaseGeometry] = None, + result_file_path: Optional[Union[str, Path]] = None, + keep_all_tags: bool = False, + 
explode_tags: Optional[bool] = None, + ignore_cache: bool = False, + filter_osm_ids: Optional[list[str]] = None, + duckdb_table_name: str = "quackosm", + working_directory: Union[str, Path] = "files", + osm_way_polygon_features_config: Optional[Union[OsmWayPolygonConfig, dict[str, Any]]] = None, + verbosity_mode: Literal["silent", "transient", "verbose"] = "transient", + debug_memory: bool = False, + debug_times: bool = False, +) -> Path: + """ + Get a single OpenStreetMap extract from a given source and transform it to a DuckDB file. + + Args: + osm_extract_query (str): + Query to find an OpenStreetMap extract from available sources. + osm_extract_source (Union[OsmExtractSource, str], optional): A source for automatic + downloading of OSM extracts. Can be Geofabrik, BBBike, OSMfr or any. + Defaults to `any`. + tags_filter (Union[OsmTagsFilter, GroupedOsmTagsFilter], optional): A dictionary + specifying which tags to download. + The keys should be OSM tags (e.g. `building`, `amenity`). + The values should either be `True` for retrieving all objects with the tag, + string for retrieving a single tag-value pair + or list of strings for retrieving all values specified in the list. + `tags={'leisure': 'park'}` would return parks from the area. + `tags={'leisure': 'park', 'amenity': True, 'shop': ['bakery', 'bicycle']}` + would return parks, all amenity types, bakeries and bicycle shops. + If `None`, handler will allow all of the tags to be parsed. Defaults to `None`. + geometry_filter (BaseGeometry, optional): Region which can be used to filter only + intersecting OSM objects. Defaults to `None`. + result_file_path (Union[str, Path], optional): Where to save + the DuckDB file. If not provided, will be generated based on hashes + from provided tags filter and geometry filter. Defaults to `None`. + keep_all_tags (bool, optional): Works only with the `tags_filter` parameter.
+ Whether to keep all tags related to the element, or return only those defined + in the `tags_filter`. When `True`, will override the optional grouping defined + in the `tags_filter`. Defaults to `False`. + explode_tags (bool, optional): Whether to split tags into columns based on OSM tag keys. + If `None`, will be set based on `tags_filter` and `keep_all_tags` parameters. + If there is tags filter defined and `keep_all_tags` is set to `False`, then it will + be set to `True`. Otherwise it will be set to `False`. Defaults to `None`. + ignore_cache (bool, optional): Whether to ignore precalculated geoparquet files or not. + Defaults to False. + filter_osm_ids: (list[str], optional): List of OSM features ids to read from the file. + Have to be in the form of 'node/<id>', 'way/<id>' or 'relation/<id>'. + Defaults to an empty list. + duckdb_table_name (str): Table in which to store the OSM data inside the DuckDB database. + working_directory (Union[str, Path], optional): Directory where to save + the parsed `*.parquet` files. Defaults to "files". + osm_way_polygon_features_config (Union[OsmWayPolygonConfig, dict[str, Any]], optional): + Config used to determine which closed way features are polygons. + Modifications to this config are left for experienced OSM users. + Defaults to predefined "osm_way_polygon_features.json". + verbosity_mode (Literal["silent", "transient", "verbose"], optional): Set progress + verbosity mode. Can be one of: silent, transient and verbose. Silent disables + output completely. Transient tracks progress, but removes output after finished. + Verbose leaves all progress outputs in the stdout. Defaults to "transient". + debug_memory (bool, optional): If turned on, will keep all temporary files after operation + for debugging. Defaults to `False`. + debug_times (bool, optional): If turned on, will report timestamps at which second each + step has been executed. Defaults to `False`. + + Returns: + Path: Path to the generated DuckDB file.
+ + Examples: + Get OSM data for the Monaco. + + >>> import quackosm as qosm + >>> ddb_path = qosm.convert_osm_extract_to_duckdb( + ... "monaco", osm_extract_source="geofabrik" + ... ) # doctest: +IGNORE_RESULT + >>> ddb_path.as_posix() + 'files/geofabrik_europe_monaco_nofilter_noclip_compact.duckdb' + + Inspect the file with duckdb + >>> import duckdb + ... with duckdb.connect(str(ddb_path)) as con: + ... con.load_extension('spatial') + ... con.sql("SELECT * FROM quackosm ORDER BY feature_id;") # doctest: +SKIP + ┌──────────────────┬──────────────────────┬──────────────────────────────────────────────┐ + │ feature_id │ tags │ geometry │ + │ varchar │ map(varchar, varch… │ geometry │ + ├──────────────────┼──────────────────────┼──────────────────────────────────────────────┤ + │ node/10005045289 │ {shop=bakery} │ POINT (7.4224498 43.7310532) │ + │ node/10020887517 │ {leisure=swimming_… │ POINT (7.4131561 43.7338391) │ + │ node/10021298117 │ {leisure=swimming_… │ POINT (7.4277743 43.7427669) │ + │ node/10021298717 │ {leisure=swimming_… │ POINT (7.4263029 43.7409734) │ + │ node/10025656383 │ {ferry=yes, name=Q… │ POINT (7.4254971 43.7369002) │ + │ node/10025656390 │ {amenity=restauran… │ POINT (7.4269287 43.7368818) │ + │ node/10025656391 │ {name=Capitainerie… │ POINT (7.4272127 43.7359593) │ + │ node/10025656392 │ {name=Direction de… │ POINT (7.4270392 43.7365262) │ + │ node/10025656393 │ {name=IQOS, openin… │ POINT (7.4275175 43.7373195) │ + │ node/10025656394 │ {artist_name=Anna … │ POINT (7.4293446 43.737448) │ + │ · │ · │ · │ + │ · │ · │ · │ + │ · │ · │ · │ + │ way/986864693 │ {natural=bare_rock} │ POLYGON ((7.4340482 43.745598, 7.4340263 4… │ + │ way/986864694 │ {barrier=wall} │ LINESTRING (7.4327547 43.7445382, 7.432808… │ + │ way/986864695 │ {natural=bare_rock} │ POLYGON ((7.4332994 43.7449315, 7.4332912 … │ + │ way/986864696 │ {barrier=wall} │ LINESTRING (7.4356006 43.7464325, 7.435574… │ + │ way/986864697 │ {natural=bare_rock} │ POLYGON ((7.4362767 
43.74697, 7.4362983 43… │ + │ way/990669427 │ {amenity=shelter, … │ POLYGON ((7.4146087 43.733883, 7.4146192 4… │ + │ way/990669428 │ {highway=secondary… │ LINESTRING (7.4136598 43.7334433, 7.413640… │ + │ way/990669429 │ {highway=secondary… │ LINESTRING (7.4137621 43.7334251, 7.413746… │ + │ way/990848785 │ {addr:city=Monaco,… │ POLYGON ((7.4142551 43.7339622, 7.4143113 … │ + │ way/993121275 │ {building=yes, nam… │ POLYGON ((7.4321416 43.7481309, 7.4321638 … │ + ├──────────────────┴──────────────────────┴──────────────────────────────────────────────┤ + │ 7906 rows (20 shown) 3 columns │ + └────────────────────────────────────────────────────────────────────────────────────────┘ + + Full name can also be used. Osm extract source can be skipped. + + >>> ddb_path = qosm.convert_osm_extract_to_duckdb( + ... "geofabrik_europe_monaco" + ... ) # doctest: +IGNORE_RESULT + >>> ddb_path.as_posix() + 'files/geofabrik_europe_monaco_nofilter_noclip_compact.duckdb' + """ + downloaded_osm_extract = download_extract_by_query( + query=osm_extract_query, source=osm_extract_source + ) + return PbfFileReader( + tags_filter=tags_filter, + geometry_filter=geometry_filter, + working_directory=working_directory, + osm_way_polygon_features_config=osm_way_polygon_features_config, + verbosity_mode=verbosity_mode, + debug_memory=debug_memory, + debug_times=debug_times, + ).convert_pbf_to_duckdb( + pbf_path=downloaded_osm_extract, + result_file_path=result_file_path, + keep_all_tags=keep_all_tags, + explode_tags=explode_tags, + ignore_cache=ignore_cache, + filter_osm_ids=filter_osm_ids, + duckdb_table_name=duckdb_table_name, + ) def convert_pbf_to_parquet( pbf_path: Union[str, Path, Iterable[Union[str, Path]]], diff --git a/quackosm/pbf_file_reader.py b/quackosm/pbf_file_reader.py index 82ba409..651aa7a 100644 --- a/quackosm/pbf_file_reader.py +++ b/quackosm/pbf_file_reader.py @@ -698,6 +698,153 @@ def convert_geometry_to_geodataframe( return gdf_parquet + def convert_pbf_to_duckdb( + self, 
+ pbf_path: Union[str, Path, Iterable[Union[str, Path]]], + result_file_path: Optional[Union[str, Path]] = None, + keep_all_tags: bool = False, + explode_tags: Optional[bool] = None, + ignore_cache: bool = False, + filter_osm_ids: Optional[list[str]] = None, + duckdb_table_name: str = "quackosm", + ) -> Path: + """ + Convert PBF file to DuckDB Database. + + Function parses multiple PBF files and saves the parsed OSM objects into a table + in a single DuckDB database file. + + Args: + pbf_path (Union[str, Path, Iterable[Union[str, Path]]]): + Path or list of paths of `*.osm.pbf` files to be parsed. Can be a URL. + result_file_path (Union[str, Path], optional): Where to save + the duckdb file. If not provided, will be generated based on hashes + from provided tags filter and geometry filter. Defaults to `None`. + keep_all_tags (bool, optional): Works only with the `tags_filter` parameter. + Whether to keep all tags related to the element, or return only those defined + in the `tags_filter`. When `True`, will override the optional grouping defined + in the `tags_filter`. Defaults to `False`. + explode_tags (bool, optional): Whether to split tags into columns based on OSM tag keys. + If `None`, will be set based on `tags_filter` and `keep_all_tags` parameters. + If there is tags filter defined and `keep_all_tags` is set to `False`, then it will + be set to `True`. Otherwise it will be set to `False`. Defaults to `None`. + ignore_cache: (bool, optional): Whether to ignore precalculated geoparquet files or not. + Defaults to False. + filter_osm_ids: (list[str], optional): List of OSM features ids to read from the file. + Have to be in the form of 'node/<id>', 'way/<id>' or 'relation/<id>'. + Defaults to an empty list. + duckdb_table_name (str): Table name in which data will be stored inside the DuckDB + database (default: "quackosm") + + Returns: + Path: Path to the generated DuckDB file.
+ """ + if isinstance(pbf_path, (str, Path)): + pbf_path = [pbf_path] + + parsed_geoparquet_file = self.convert_pbf_to_parquet( + pbf_path=pbf_path, + keep_all_tags=keep_all_tags, + explode_tags=explode_tags, + ignore_cache=ignore_cache, + filter_osm_ids=filter_osm_ids, + ) + + # generate result_file_path if missing + result_file_path = Path( + result_file_path + or self._generate_result_file_path( + pbf_path=pbf_path, + filter_osm_ids=filter_osm_ids, + keep_all_tags=keep_all_tags, + explode_tags=explode_tags, + save_as_wkt=False, + ).with_suffix(".duckdb") + ) + + with duckdb.connect(str(result_file_path)) as con: + con.load_extension("spatial") + con.sql(f""" + CREATE TABLE {duckdb_table_name} AS + SELECT * REPLACE (ST_GeomFromWKB(geometry) as geometry) + FROM read_parquet('{str(parsed_geoparquet_file)}'); + """) + + # clean up intermediary parquet + parsed_geoparquet_file.unlink() + + return result_file_path + + def convert_geometry_to_duckdb( + self, + result_file_path: Optional[Union[str, Path]] = None, + keep_all_tags: bool = False, + explode_tags: Optional[bool] = None, + ignore_cache: bool = False, + filter_osm_ids: Optional[list[str]] = None, + duckdb_table_name: str = "quackosm", + ) -> Path: + """ + Get features GeoDataFrame from a provided geometry filter. + + Will automatically find and download OSM extracts covering a given geometry + and return a single GeoDataFrame with parsed OSM objects. + + Args: + result_file_path (Union[str, Path], optional): Where to save + the duckdb file. If not provided, will be generated based on hashes + from provided tags filter and geometry filter. Defaults to `None`. + keep_all_tags (bool, optional): Works only with the `tags_filter` parameter. + Whether to keep all tags related to the element, or return only those defined + in the `tags_filter`. When `True`, will override the optional grouping defined + in the `tags_filter`. Defaults to `False`. 
+ explode_tags (bool, optional): Whether to split tags into columns based on OSM tag keys. + If `None`, will be set based on `tags_filter` and `keep_all_tags` parameters. + If there is tags filter defined and `keep_all_tags` is set to `False`, then it will + be set to `True`. Otherwise it will be set to `False`. Defaults to `None`. + ignore_cache: (bool, optional): Whether to ignore precalculated geoparquet files or not. + Defaults to False. + filter_osm_ids: (list[str], optional): List of OSM features ids to read from the file. + Have to be in the form of 'node/<id>', 'way/<id>' or 'relation/<id>'. + Defaults to an empty list. + duckdb_table_name (str): Table name in which data will be stored inside the DuckDB + database (default: "quackosm") + + Returns: + Path: Path to the generated DuckDB file. + """ + parsed_geoparquet_file = self.convert_geometry_to_parquet( + keep_all_tags=keep_all_tags, + explode_tags=explode_tags, + ignore_cache=ignore_cache, + filter_osm_ids=filter_osm_ids, + ) + + # generate result_file_path if missing + result_file_path = Path( + result_file_path + or self._generate_result_file_path_from_geometry( + filter_osm_ids=filter_osm_ids, + keep_all_tags=keep_all_tags, + explode_tags=explode_tags, + save_as_wkt=False, + ).with_suffix(".duckdb") + ) + + with duckdb.connect(str(result_file_path)) as con: + con.load_extension("spatial") + + con.sql(f""" + CREATE TABLE {duckdb_table_name} AS + SELECT * REPLACE(ST_GeomFromWKB(geometry) as geometry) + FROM read_parquet('{str(parsed_geoparquet_file)}'); + """) + + # clean up intermediary parquet + parsed_geoparquet_file.unlink() + + return result_file_path + def _drop_duplicated_features_in_pyarrow_table( self, parsed_geoparquet_files: list[Path], tmp_dir_path: Path ) -> list[Path]: diff --git a/tests/base/test_cli.py b/tests/base/test_cli.py index 7a017f4..b6d00e6 100644 --- a/tests/base/test_cli.py +++ b/tests/base/test_cli.py @@ -104,6 +104,12 @@ def test_transient_mode(monaco_pbf_file_path_fixture:
str) -> None: @P.case("Ignore cache short", ["--no-cache"], "files/monaco_nofilter_noclip_compact.parquet") # type: ignore @P.case("Output", ["--output", "files/monaco_output.parquet"], "files/monaco_output.parquet") # type: ignore @P.case("Output short", ["-o", "files/monaco_output.parquet"], "files/monaco_output.parquet") # type: ignore +@P.case("DuckDB explicit export", ["--duckdb"], "files/monaco_nofilter_noclip.duckdb") # type: ignore +@P.case( + "DuckDB explicit export with table name", + ["--duckdb", "--duckdb_table_name test"], + "files/monaco_nofilter_noclip.duckdb", +) # type: ignore @P.case("Silent", ["--silent"], "files/monaco_nofilter_noclip_compact.parquet") # type: ignore @P.case("Transient", ["--transient"], "files/monaco_nofilter_noclip_compact.parquet") # type: ignore @P.case( From 4d5a3b337e0282bcba8ca1b0f77447473c5ea9e2 Mon Sep 17 00:00:00 2001 From: mwip Date: Fri, 20 Sep 2024 21:56:42 +0200 Subject: [PATCH 02/12] Fix open mypy issues --- CHANGELOG.md | 4 ++++ quackosm/cli.py | 8 ++++---- quackosm/pbf_file_reader.py | 8 ++++---- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 25f94e9..2f2616d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- Option to export to DuckDB database [#94](https://github.com/kraina-ai/quackosm/issues/119) + ## [0.9.4] - 2024-09-11 ### Changed diff --git a/quackosm/cli.py b/quackosm/cli.py index 7826e64..5aec3b0 100644 --- a/quackosm/cli.py +++ b/quackosm/cli.py @@ -497,7 +497,7 @@ def main( "--duckdb", help="Export to duckdb database", ), - ] = None, + ] = False, duckdb_table_name: Annotated[ Optional[str], typer.Option( @@ -707,7 +707,7 @@ def main( else None ), filter_osm_ids=filter_osm_ids, # type: ignore - duckdb_table_name=duckdb_table_name, + duckdb_table_name=duckdb_table_name or "quackosm", verbosity_mode=verbosity_mode, ) @@ -756,7 +756,7 
@@ def main( else None ), filter_osm_ids=filter_osm_ids, # type: ignore - duckdb_table_name=duckdb_table_name, + duckdb_table_name=duckdb_table_name or "quackosm", save_as_wkt=wkt_result, verbosity_mode=verbosity_mode, ) @@ -818,7 +818,7 @@ def main( else None ), filter_osm_ids=filter_osm_ids, # type: ignore - duckdb_table_name=duckdb_table_name, + duckdb_table_name=duckdb_table_name or "quackosm", save_as_wkt=wkt_result, verbosity_mode=verbosity_mode, geometry_coverage_iou_threshold=geometry_coverage_iou_threshold, diff --git a/quackosm/pbf_file_reader.py b/quackosm/pbf_file_reader.py index 651aa7a..ba46c27 100644 --- a/quackosm/pbf_file_reader.py +++ b/quackosm/pbf_file_reader.py @@ -755,9 +755,9 @@ def convert_pbf_to_duckdb( result_file_path or self._generate_result_file_path( pbf_path=pbf_path, - filter_osm_ids=filter_osm_ids, + filter_osm_ids=filter_osm_ids or [""], keep_all_tags=keep_all_tags, - explode_tags=explode_tags, + explode_tags=explode_tags or False, save_as_wkt=False, ).with_suffix(".duckdb") ) @@ -824,9 +824,9 @@ def convert_geometry_to_duckdb( result_file_path = Path( result_file_path or self._generate_result_file_path_from_geometry( - filter_osm_ids=filter_osm_ids, + filter_osm_ids=filter_osm_ids or [""], keep_all_tags=keep_all_tags, - explode_tags=explode_tags, + explode_tags=explode_tags or False, save_as_wkt=False, ).with_suffix(".duckdb") ) From 22b5e0d531375748ef739aa5980c9e3c14dbdd50 Mon Sep 17 00:00:00 2001 From: mwip Date: Sun, 6 Oct 2024 21:34:45 +0200 Subject: [PATCH 03/12] Fix doctest errors; fix filter_osm_ids typing properly This patch fixes remaining doctest errors that occurred during kraina-ai/quackosm#157. Meanwhile, a remaining bug was discovered around the typing of filter_osm_ids. It was solved, too.
--- quackosm/functions.py | 15 ++++++++------- quackosm/pbf_file_reader.py | 20 ++++++++++++-------- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/quackosm/functions.py b/quackosm/functions.py index 1cc7de4..f0783e9 100644 --- a/quackosm/functions.py +++ b/quackosm/functions.py @@ -105,11 +105,12 @@ def convert_pbf_to_duckdb( >>> from pathlib import Path >>> import quackosm as qosm - >>> ddb_path = qosm.convert_pbf_to_duckdb(monaco_pbf_path) + >>> ddb_path = qosm.convert_pbf_to_duckdb(monaco_pbf_path) # doctest: +IGNORE_RESULT >>> ddb_path.as_posix() - 'files/monaco-latest_nofilter_noclip_compact.duckdb' + 'files/monaco_nofilter_noclip_compact.duckdb' >>> import duckdb + >>> duckdb.load_extension('spatial') >>> with duckdb.connect(str(ddb_path)) as con: ... con.load_extension('spatial') ... con.sql("SELECT * FROM quackosm ORDER BY feature_id;") # doctest: +SKIP @@ -147,9 +148,9 @@ def convert_pbf_to_duckdb( Get only buildings, amenities and highways from a PBF file. >>> ddb_path = qosm.convert_pbf_to_duckdb( ... monaco_pbf_path, tags_filter={"building": True, "amenity": True, "highway": True} - ... ) + ... ) # doctest: +IGNORE_RESULT >>> ddb_path.as_posix() - 'files/monaco-latest_6593ca69098459d039054bc5fe0a87c56681e29a5f59d38ce3485c03cb0e9374_noclip_compact.duckdb' + 'files/monaco_6593ca69098459d039054bc5fe0a87c56681e29a5f59d38ce3485c03cb0e9374_noclip_compact.duckdb' Get features for Malé - the capital city of Maldives @@ -199,7 +200,7 @@ def convert_pbf_to_duckdb( ... ) ... ) # doctest: +IGNORE_RESULT >>> ddb_path.as_posix() - 'files/maldives-latest_nofilter_4eeabb20ccd8aefeaa80b9a46a202ab985fd454760823b7012cc7778498a085b_compact.duckdb' + 'files/maldives_nofilter_4eeabb20ccd8aefeaa80b9a46a202ab985fd454760823b7012cc7778498a085b_compact.duckdb' >>> with duckdb.connect(str(ddb_path)) as con: ... con.load_extension('spatial') @@ -391,7 +392,7 @@ def convert_geometry_to_duckdb( ... osm_extract_source='Geofabrik', ... 
) # doctest: +IGNORE_RESULT >>> ddb_path.as_posix() - 'files/bf4b33debfd6d3e605555340606df6ce7eea934958c1f3477aca0ccf79e7929f_nofilter_compact.parquet' + 'files/bf4b33debfd6d3e605555340606df6ce7eea934958c1f3477aca0ccf79e7929f_nofilter_compact.duckdb' Inspect the file with duckdb >>> with duckdb.connect(str(ddb_path)) as con: @@ -533,7 +534,7 @@ def convert_osm_extract_to_duckdb( Inspect the file with duckdb >>> import duckdb - ... with duckdb.connect(str(ddb_path)) as con: + >>> with duckdb.connect(str(ddb_path)) as con: ... con.load_extension('spatial') ... con.sql("SELECT * FROM quackosm ORDER BY feature_id;") # doctest: +SKIP ┌──────────────────┬──────────────────────┬──────────────────────────────────────────────┐ diff --git a/quackosm/pbf_file_reader.py b/quackosm/pbf_file_reader.py index df81d84..52cf52a 100644 --- a/quackosm/pbf_file_reader.py +++ b/quackosm/pbf_file_reader.py @@ -751,12 +751,15 @@ def convert_pbf_to_duckdb( filter_osm_ids=filter_osm_ids, ) + if filter_osm_ids is None: + filter_osm_ids = [] + # generate result_file_path if missing result_file_path = Path( result_file_path or self._generate_result_file_path( pbf_path=pbf_path, - filter_osm_ids=filter_osm_ids or [""], + filter_osm_ids=filter_osm_ids, keep_all_tags=keep_all_tags, explode_tags=explode_tags or False, save_as_wkt=False, @@ -766,9 +769,8 @@ def convert_pbf_to_duckdb( with duckdb.connect(str(result_file_path)) as con: con.load_extension("spatial") con.sql(f""" - CREATE TABLE {duckdb_table_name} AS - SELECT * REPLACE (ST_GeomFromWKB(geometry) as geometry) - FROM read_parquet('{str(parsed_geoparquet_file)}'); + CREATE OR REPLACE TABLE {duckdb_table_name} AS + SELECT * FROM read_parquet('{parsed_geoparquet_file}'); """) # clean up intermediary parquet @@ -821,11 +823,14 @@ def convert_geometry_to_duckdb( filter_osm_ids=filter_osm_ids, ) + if filter_osm_ids is None: + filter_osm_ids = [] + # generate result_file_path if missing result_file_path = Path( result_file_path or 
self._generate_result_file_path_from_geometry( - filter_osm_ids=filter_osm_ids or [""], + filter_osm_ids=filter_osm_ids, keep_all_tags=keep_all_tags, explode_tags=explode_tags or False, save_as_wkt=False, @@ -836,9 +841,8 @@ def convert_geometry_to_duckdb( con.load_extension("spatial") con.sql(f""" - CREATE TABLE {duckdb_table_name} AS - SELECT * REPLACE(ST_GeomFromWKB(geometry) as geometry) - FROM read_parquet('{str(parsed_geoparquet_file)}'); + CREATE OR REPLACE TABLE {duckdb_table_name} AS + SELECT * FROM read_parquet('{parsed_geoparquet_file}'); """) # clean up intermediary parquet From 80a4a3341e3798aa85029ac78594bc1e484bc983 Mon Sep 17 00:00:00 2001 From: mwip Date: Sun, 6 Oct 2024 21:38:34 +0200 Subject: [PATCH 04/12] Resolve refurb: immut. tuple over list for suffix --- quackosm/cli.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/quackosm/cli.py b/quackosm/cli.py index 285be98..c1ed17d 100644 --- a/quackosm/cli.py +++ b/quackosm/cli.py @@ -703,7 +703,7 @@ def main( logging.disable(logging.CRITICAL) if pbf_file: # export to DuckDB database - if (result_file_path and result_file_path.suffix in [".duckdb", ".db"]) or duckdb: + if (result_file_path and result_file_path.suffix in (".duckdb", ".db")) or duckdb: from quackosm.functions import convert_pbf_to_duckdb result_path = convert_pbf_to_duckdb( @@ -749,7 +749,7 @@ def main( ) elif osm_extract_query: # export to DuckDB database - if (result_file_path and result_file_path.suffix in [".duckdb", ".db"]) or duckdb: + if (result_file_path and result_file_path.suffix in (".duckdb", ".db")) or duckdb: from quackosm._exceptions import OsmExtractSearchError from quackosm.functions import convert_osm_extract_to_duckdb @@ -814,7 +814,7 @@ def main( raise typer.Exit(code=1) from None else: # export to DuckDB database - if (result_file_path and result_file_path.suffix in [".duckdb", ".db"]) or duckdb: + if (result_file_path and result_file_path.suffix in (".duckdb", ".db")) or duckdb: from 
quackosm.functions import convert_geometry_to_duckdb result_path = convert_geometry_to_duckdb( From 9a01c7cb6caac2bbafb670be45da5308e2050511 Mon Sep 17 00:00:00 2001 From: mwip Date: Sun, 6 Oct 2024 21:53:19 +0200 Subject: [PATCH 05/12] Fix typo in test_cli.py --- tests/base/test_cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/base/test_cli.py b/tests/base/test_cli.py index b6d00e6..7130507 100644 --- a/tests/base/test_cli.py +++ b/tests/base/test_cli.py @@ -107,7 +107,7 @@ def test_transient_mode(monaco_pbf_file_path_fixture: str) -> None: @P.case("DuckDB explicit export", ["--duckdb"], "files/monaco_nofilter_noclip.duckdb") # type: ignore @P.case( "DuckDB explicit export with table name", - ["--duckdb", "--duckdb_table_name test"], + ["--duckdb", "--duckdb-table-name test"], "files/monaco_nofilter_noclip.duckdb", ) # type: ignore @P.case("Silent", ["--silent"], "files/monaco_nofilter_noclip_compact.parquet") # type: ignore From eed48d553b8354f7957aa42172d282da54e5b6c1 Mon Sep 17 00:00:00 2001 From: mwip Date: Sun, 6 Oct 2024 22:01:22 +0200 Subject: [PATCH 06/12] Fix test_cli.py for duckdb: correct file, split args --- tests/base/test_cli.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/base/test_cli.py b/tests/base/test_cli.py index 7130507..09da18e 100644 --- a/tests/base/test_cli.py +++ b/tests/base/test_cli.py @@ -104,11 +104,11 @@ def test_transient_mode(monaco_pbf_file_path_fixture: str) -> None: @P.case("Ignore cache short", ["--no-cache"], "files/monaco_nofilter_noclip_compact.parquet") # type: ignore @P.case("Output", ["--output", "files/monaco_output.parquet"], "files/monaco_output.parquet") # type: ignore @P.case("Output short", ["-o", "files/monaco_output.parquet"], "files/monaco_output.parquet") # type: ignore -@P.case("DuckDB explicit export", ["--duckdb"], "files/monaco_nofilter_noclip.duckdb") # type: ignore +@P.case("DuckDB explicit export", ["--duckdb"], 
"files/monaco_nofilter_noclip_compact.duckdb") # type: ignore @P.case( "DuckDB explicit export with table name", - ["--duckdb", "--duckdb-table-name test"], - "files/monaco_nofilter_noclip.duckdb", + ["--duckdb", "--duckdb-table-name", "test"], + "files/monaco_nofilter_noclip_compact.duckdb", ) # type: ignore @P.case("Silent", ["--silent"], "files/monaco_nofilter_noclip_compact.parquet") # type: ignore @P.case("Transient", ["--transient"], "files/monaco_nofilter_noclip_compact.parquet") # type: ignore From de19ff6ca25de35a01f56f1b3d21ff251ed26ba6 Mon Sep 17 00:00:00 2001 From: Kamil Raczycki Date: Tue, 8 Oct 2024 23:15:52 +0200 Subject: [PATCH 07/12] chore: add new test case for increased coverage --- tests/base/test_cli.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/base/test_cli.py b/tests/base/test_cli.py index 09da18e..32f4225 100644 --- a/tests/base/test_cli.py +++ b/tests/base/test_cli.py @@ -684,6 +684,14 @@ def test_proper_args_with_pbf_url() -> None: "nonexistent_extract", ], ) # type: ignore +@P.case( + "OSM extracts with zero matches and duckdb export", + [ + "--duckdb", + "--osm-extract-query", + "quack_extract", + ], +) # type: ignore @P.case( "Wrong IoU threshold value", [ From a30cac029319ae04c8ec0c898761855d7a5ed764 Mon Sep 17 00:00:00 2001 From: Kamil Raczycki Date: Tue, 8 Oct 2024 23:16:07 +0200 Subject: [PATCH 08/12] chore: add author to the changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e8cc140..18fc640 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added -- Option to export to DuckDB database [#94](https://github.com/kraina-ai/quackosm/issues/119) +- Option to export to DuckDB database [#94](https://github.com/kraina-ai/quackosm/issues/119) (implemented by [@mwip](https://github.com/mwip)) ## [0.11.0] - 2024-09-24 From 
abec37aad2f4cfb3293a5c372d884b028aa43e34 Mon Sep 17 00:00:00 2001 From: Kamil Raczycki Date: Tue, 8 Oct 2024 23:37:04 +0200 Subject: [PATCH 09/12] feat: add automatic directory generation for duckdb export --- quackosm/pbf_file_reader.py | 6 +++++- tests/base/test_pbf_file_reader.py | 24 ++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/quackosm/pbf_file_reader.py b/quackosm/pbf_file_reader.py index 52cf52a..ca34737 100644 --- a/quackosm/pbf_file_reader.py +++ b/quackosm/pbf_file_reader.py @@ -707,7 +707,7 @@ def convert_pbf_to_duckdb( explode_tags: Optional[bool] = None, ignore_cache: bool = False, filter_osm_ids: Optional[list[str]] = None, - duckdb_table_name: str = "quackosm", + duckdb_table_name: Optional[str] = "quackosm", ) -> Path: """ Convert PBF file to DuckDB Database. @@ -766,6 +766,10 @@ def convert_pbf_to_duckdb( ).with_suffix(".duckdb") ) + result_file_path.parent.mkdir(exist_ok=True, parents=True) + + duckdb_table_name = duckdb_table_name or "quackosm" + with duckdb.connect(str(result_file_path)) as con: con.load_extension("spatial") con.sql(f""" diff --git a/tests/base/test_pbf_file_reader.py b/tests/base/test_pbf_file_reader.py index 07b7b01..b43bca9 100644 --- a/tests/base/test_pbf_file_reader.py +++ b/tests/base/test_pbf_file_reader.py @@ -1,6 +1,7 @@ """Tests for PbfFileReader.""" import json +import random import urllib.request import warnings from functools import partial @@ -92,6 +93,29 @@ def test_pbf_to_geoparquet_parsing( assert GEOMETRY_COLUMN in decoded_geo_schema["columns"] +@pytest.mark.parametrize( + "result_file_path", + [None, "quackosm.db", "files/quackosm.db", f"files/{random.getrandbits(128)}/quackosm.db"], +) # type: ignore +@pytest.mark.parametrize("table_name", [None, "quackosm", "osm_features"]) # type: ignore +def test_pbf_reader_duckdb_export(result_file_path: Optional[str], table_name: Optional[str]): + """Test proper DuckDB export file generation.""" + pbf_file = 
Path(__file__).parent.parent / "test_files" / "monaco.osm.pbf" + result_path = PbfFileReader().convert_pbf_to_duckdb( + pbf_path=pbf_file, + result_file_path=result_file_path, + duckdb_table_name=table_name, + ignore_cache=True, + ) + + assert result_path.exists(), "DuckDB file doesn't exist" + with duckdb.connect(str(result_path)) as con: + existing_tables = [row[0] for row in con.sql("SHOW TABLES;").fetchall()] + assert table_name or "quackosm" in existing_tables + + result_path.unlink() + + def test_pbf_reader_url_path(): # type: ignore """Test proper URL detection in `PbfFileReader`.""" file_name = "https://download.geofabrik.de/europe/monaco-latest.osm.pbf" From ff53b08979cf9065e118c697cad056114a9c06bc Mon Sep 17 00:00:00 2001 From: Kamil Raczycki Date: Tue, 8 Oct 2024 23:49:13 +0200 Subject: [PATCH 10/12] chore: refactor cli codebase --- quackosm/cli.py | 259 +++++++++++++++++++++++++----------------------- 1 file changed, 133 insertions(+), 126 deletions(-) diff --git a/quackosm/cli.py b/quackosm/cli.py index c1ed17d..951d716 100644 --- a/quackosm/cli.py +++ b/quackosm/cli.py @@ -500,8 +500,10 @@ def main( "--output", "-o", help=( - "Path where to save final geoparquet file. If not provided, it will be generated" + "Path where to save final result file. If not provided, it will be generated" " automatically based on the input pbf file name." + " Can be [bold green].parquet[/bold green] or" + " [bold green].db[/bold green] or [bold green].duckdb[/bold green] extension." ), ), ] = None, @@ -509,7 +511,11 @@ def main( bool, typer.Option( "--duckdb", - help="Export to duckdb database", + help=( + "Export to duckdb database. 
If not provided, data can still be exported if", + " [bold bright_cyan]output[/bold bright_cyan] has [bold green].db[/bold green]" + " or [bold green].duckdb[/bold green] extension.", + ), ), ] = False, duckdb_table_name: Annotated[ @@ -701,36 +707,66 @@ def main( verbosity_mode = "silent" logging.disable(logging.CRITICAL) - if pbf_file: - # export to DuckDB database - if (result_file_path and result_file_path.suffix in (".duckdb", ".db")) or duckdb: - from quackosm.functions import convert_pbf_to_duckdb - result_path = convert_pbf_to_duckdb( - pbf_path=pbf_file, - tags_filter=osm_tags_filter or osm_tags_filter_file, # type: ignore - keep_all_tags=keep_all_tags, - geometry_filter=geometry_filter_value, - explode_tags=explode_tags, - ignore_cache=ignore_cache, - working_directory=working_directory, - result_file_path=result_file_path, - osm_way_polygon_features_config=( - json.loads(Path(osm_way_polygon_features_config).read_text()) - if osm_way_polygon_features_config - else None - ), - filter_osm_ids=filter_osm_ids, # type: ignore - duckdb_table_name=duckdb_table_name or "quackosm", - verbosity_mode=verbosity_mode, - ) - - # export to parquet - else: - from quackosm.functions import convert_pbf_to_parquet + is_duckdb = (result_file_path and result_file_path.suffix in (".duckdb", ".db")) or duckdb + + pbf_file_parquet = pbf_file and not is_duckdb + pbf_file_duckdb = pbf_file and is_duckdb + osm_extract_parquet = osm_extract_query and not is_duckdb + osm_extract_duckdb = osm_extract_query and is_duckdb + geometry_parquet = not pbf_file and not osm_extract_query and not is_duckdb + geometry_duckdb = not pbf_file and not osm_extract_query and is_duckdb + + if pbf_file_parquet: + from quackosm.functions import convert_pbf_to_parquet + + result_path = convert_pbf_to_parquet( + pbf_path=cast(str, pbf_file), + tags_filter=osm_tags_filter or osm_tags_filter_file, # type: ignore + keep_all_tags=keep_all_tags, + geometry_filter=geometry_filter_value, + 
explode_tags=explode_tags, + ignore_cache=ignore_cache, + working_directory=working_directory, + result_file_path=result_file_path, + osm_way_polygon_features_config=( + json.loads(Path(osm_way_polygon_features_config).read_text()) + if osm_way_polygon_features_config + else None + ), + filter_osm_ids=filter_osm_ids, # type: ignore + save_as_wkt=wkt_result, + verbosity_mode=verbosity_mode, + ) + elif pbf_file_duckdb: + from quackosm.functions import convert_pbf_to_duckdb + + result_path = convert_pbf_to_duckdb( + pbf_path=cast(str, pbf_file), + tags_filter=osm_tags_filter or osm_tags_filter_file, # type: ignore + keep_all_tags=keep_all_tags, + geometry_filter=geometry_filter_value, + explode_tags=explode_tags, + ignore_cache=ignore_cache, + working_directory=working_directory, + result_file_path=result_file_path, + osm_way_polygon_features_config=( + json.loads(Path(osm_way_polygon_features_config).read_text()) + if osm_way_polygon_features_config + else None + ), + filter_osm_ids=filter_osm_ids, # type: ignore + duckdb_table_name=duckdb_table_name or "quackosm", + verbosity_mode=verbosity_mode, + ) + elif osm_extract_parquet: + from quackosm._exceptions import OsmExtractSearchError + from quackosm.functions import convert_osm_extract_to_parquet - result_path = convert_pbf_to_parquet( - pbf_path=pbf_file, + try: + result_path = convert_osm_extract_to_parquet( + osm_extract_query=cast(str, osm_extract_query), + osm_extract_source=osm_extract_source, tags_filter=osm_tags_filter or osm_tags_filter_file, # type: ignore keep_all_tags=keep_all_tags, geometry_filter=geometry_filter_value, @@ -747,81 +783,23 @@ def main( save_as_wkt=wkt_result, verbosity_mode=verbosity_mode, ) - elif osm_extract_query: - # export to DuckDB database - if (result_file_path and result_file_path.suffix in (".duckdb", ".db")) or duckdb: - from quackosm._exceptions import OsmExtractSearchError - from quackosm.functions import convert_osm_extract_to_duckdb - - try: - result_path = 
convert_osm_extract_to_duckdb( - osm_extract_query=osm_extract_query, - osm_extract_source=osm_extract_source, - tags_filter=osm_tags_filter or osm_tags_filter_file, # type: ignore - keep_all_tags=keep_all_tags, - geometry_filter=geometry_filter_value, - explode_tags=explode_tags, - ignore_cache=ignore_cache, - working_directory=working_directory, - result_file_path=result_file_path, - osm_way_polygon_features_config=( - json.loads(Path(osm_way_polygon_features_config).read_text()) - if osm_way_polygon_features_config - else None - ), - filter_osm_ids=filter_osm_ids, # type: ignore - duckdb_table_name=duckdb_table_name or "quackosm", - save_as_wkt=wkt_result, - verbosity_mode=verbosity_mode, - ) - except OsmExtractSearchError as ex: - from rich.console import Console - - err_console = Console(stderr=True) - err_console.print(ex) - raise typer.Exit(code=1) from None - - # export to parquet - else: - from quackosm._exceptions import OsmExtractSearchError - from quackosm.functions import convert_osm_extract_to_parquet - - try: - result_path = convert_osm_extract_to_parquet( - osm_extract_query=osm_extract_query, - osm_extract_source=osm_extract_source, - tags_filter=osm_tags_filter or osm_tags_filter_file, # type: ignore - keep_all_tags=keep_all_tags, - geometry_filter=geometry_filter_value, - explode_tags=explode_tags, - ignore_cache=ignore_cache, - working_directory=working_directory, - result_file_path=result_file_path, - osm_way_polygon_features_config=( - json.loads(Path(osm_way_polygon_features_config).read_text()) - if osm_way_polygon_features_config - else None - ), - filter_osm_ids=filter_osm_ids, # type: ignore - save_as_wkt=wkt_result, - verbosity_mode=verbosity_mode, - ) - except OsmExtractSearchError as ex: - from rich.console import Console + except OsmExtractSearchError as ex: + from rich.console import Console - err_console = Console(stderr=True) - err_console.print(ex) - raise typer.Exit(code=1) from None - else: - # export to DuckDB database - if 
(result_file_path and result_file_path.suffix in (".duckdb", ".db")) or duckdb: - from quackosm.functions import convert_geometry_to_duckdb + err_console = Console(stderr=True) + err_console.print(ex) + raise typer.Exit(code=1) from None + elif osm_extract_duckdb: + from quackosm._exceptions import OsmExtractSearchError + from quackosm.functions import convert_osm_extract_to_duckdb - result_path = convert_geometry_to_duckdb( - geometry_filter=geometry_filter_value, + try: + result_path = convert_osm_extract_to_duckdb( + osm_extract_query=cast(str, osm_extract_query), osm_extract_source=osm_extract_source, tags_filter=osm_tags_filter or osm_tags_filter_file, # type: ignore keep_all_tags=keep_all_tags, + geometry_filter=geometry_filter_value, explode_tags=explode_tags, ignore_cache=ignore_cache, working_directory=working_directory, @@ -835,32 +813,61 @@ def main( duckdb_table_name=duckdb_table_name or "quackosm", save_as_wkt=wkt_result, verbosity_mode=verbosity_mode, - geometry_coverage_iou_threshold=geometry_coverage_iou_threshold, - allow_uncovered_geometry=allow_uncovered_geometry, ) + except OsmExtractSearchError as ex: + from rich.console import Console + + err_console = Console(stderr=True) + err_console.print(ex) + raise typer.Exit(code=1) from None + elif geometry_parquet: + from quackosm.functions import convert_geometry_to_parquet + + result_path = convert_geometry_to_parquet( + geometry_filter=geometry_filter_value, + osm_extract_source=osm_extract_source, + tags_filter=osm_tags_filter or osm_tags_filter_file, # type: ignore + keep_all_tags=keep_all_tags, + explode_tags=explode_tags, + ignore_cache=ignore_cache, + working_directory=working_directory, + result_file_path=result_file_path, + osm_way_polygon_features_config=( + json.loads(Path(osm_way_polygon_features_config).read_text()) + if osm_way_polygon_features_config + else None + ), + filter_osm_ids=filter_osm_ids, # type: ignore + save_as_wkt=wkt_result, + verbosity_mode=verbosity_mode, + 
geometry_coverage_iou_threshold=geometry_coverage_iou_threshold, + allow_uncovered_geometry=allow_uncovered_geometry, + ) + elif geometry_duckdb: + from quackosm.functions import convert_geometry_to_duckdb + + result_path = convert_geometry_to_duckdb( + geometry_filter=geometry_filter_value, + osm_extract_source=osm_extract_source, + tags_filter=osm_tags_filter or osm_tags_filter_file, # type: ignore + keep_all_tags=keep_all_tags, + explode_tags=explode_tags, + ignore_cache=ignore_cache, + working_directory=working_directory, + result_file_path=result_file_path, + osm_way_polygon_features_config=( + json.loads(Path(osm_way_polygon_features_config).read_text()) + if osm_way_polygon_features_config + else None + ), + filter_osm_ids=filter_osm_ids, # type: ignore + duckdb_table_name=duckdb_table_name or "quackosm", + save_as_wkt=wkt_result, + verbosity_mode=verbosity_mode, + geometry_coverage_iou_threshold=geometry_coverage_iou_threshold, + allow_uncovered_geometry=allow_uncovered_geometry, + ) + else: + raise RuntimeError("Unknown operation mode") - # export to parquet - else: - from quackosm.functions import convert_geometry_to_parquet - - result_path = convert_geometry_to_parquet( - geometry_filter=geometry_filter_value, - osm_extract_source=osm_extract_source, - tags_filter=osm_tags_filter or osm_tags_filter_file, # type: ignore - keep_all_tags=keep_all_tags, - explode_tags=explode_tags, - ignore_cache=ignore_cache, - working_directory=working_directory, - result_file_path=result_file_path, - osm_way_polygon_features_config=( - json.loads(Path(osm_way_polygon_features_config).read_text()) - if osm_way_polygon_features_config - else None - ), - filter_osm_ids=filter_osm_ids, # type: ignore - save_as_wkt=wkt_result, - verbosity_mode=verbosity_mode, - geometry_coverage_iou_threshold=geometry_coverage_iou_threshold, - allow_uncovered_geometry=allow_uncovered_geometry, - ) typer.secho(result_path, fg="green") From abf678cab179acfc4aa9665bcff712c2825cc24d Mon Sep 17 
00:00:00 2001 From: Kamil Raczycki Date: Tue, 8 Oct 2024 23:54:48 +0200 Subject: [PATCH 11/12] fix: remove commas --- quackosm/cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/quackosm/cli.py b/quackosm/cli.py index 951d716..d30ef8a 100644 --- a/quackosm/cli.py +++ b/quackosm/cli.py @@ -512,9 +512,9 @@ def main( typer.Option( "--duckdb", help=( - "Export to duckdb database. If not provided, data can still be exported if", + "Export to duckdb database. If not provided, data can still be exported if" " [bold bright_cyan]output[/bold bright_cyan] has [bold green].db[/bold green]" - " or [bold green].duckdb[/bold green] extension.", + " or [bold green].duckdb[/bold green] extension." ), ), ] = False, From 9d52a1b30c43388efe6af01b9f57e281d6c2d6da Mon Sep 17 00:00:00 2001 From: Kamil Raczycki Date: Wed, 9 Oct 2024 00:39:39 +0200 Subject: [PATCH 12/12] chore: change working directory test --- tests/base/test_cli.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tests/base/test_cli.py b/tests/base/test_cli.py index 32f4225..8562b88 100644 --- a/tests/base/test_cli.py +++ b/tests/base/test_cli.py @@ -340,16 +340,22 @@ def test_proper_args_with_pbf( ["--geom-filter-file", geometry_boundary_file_path(), "--compact-tags"], "files/6a869bcfa1a49ade8b76569e48e4142bce29098815bf37e57155a18204f2bbbc_nofilter_compact.parquet", ) # type: ignore -@P.case( - "Working directory", - ["--geom-filter-file", geometry_boundary_file_path(), "--working-directory", "files/workdir"], - "files/workdir/6a869bcfa1a49ade8b76569e48e4142bce29098815bf37e57155a18204f2bbbc_nofilter_compact.parquet", -) # type: ignore @P.case( "Ignore cache", ["--geom-filter-file", geometry_boundary_file_path(), "--ignore-cache"], "files/6a869bcfa1a49ade8b76569e48e4142bce29098815bf37e57155a18204f2bbbc_nofilter_compact.parquet", ) # type: ignore +@P.case( + "Working directory", + [ + "--geom-filter-file", + geometry_boundary_file_path(), + 
"--working-directory", + "files/workdir", + "--ignore-cache", + ], + "files/workdir/6a869bcfa1a49ade8b76569e48e4142bce29098815bf37e57155a18204f2bbbc_nofilter_compact.parquet", +) # type: ignore @P.case( "Output", [