diff --git a/config/config.default.yaml b/config/config.default.yaml index ffebff647..a15676251 100644 --- a/config/config.default.yaml +++ b/config/config.default.yaml @@ -870,6 +870,12 @@ costs: # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#clustering clustering: + mode: admin # buses, admin, custom_admin_map, custom_busmap + admin: + level: 1 # Set general level of administrative division, e.g. 0 for countries, 1 for regions, specify individual countries if needed + DE: 3 + FR: 2 + ES: 0 focus_weights: false simplify_network: to_substations: false diff --git a/doc/data-bundle.rst b/doc/data-bundle.rst index f859cd5a3..230a81ea0 100644 --- a/doc/data-bundle.rst +++ b/doc/data-bundle.rst @@ -33,13 +33,6 @@ scope to reduce file size, or are not provided through stable URLs elsewhere. - **License:** `custom `__ - **Description:** Average annual population to calculate regional GDP data (thousand persons) by NUTS 3 regions. -``data/bundle/nama_10r_3gdp.tsv.gz`` - -- **Source:** Eurostat -- **Link:** https://ec.europa.eu/eurostat/databrowser/view/nama_10r_3gdp/default/table?lang=en -- **License:** `custom `__ -- **Description:** Gross domestic product (GDP) at current market prices by NUTS 3 regions. - ``data/bundle/corine`` - **Source:** European Environment Agency (EEA) @@ -94,7 +87,7 @@ scope to reduce file size, or are not provided through stable URLs elsewhere. - **License:** CC0 (`reference `__) - **Description:** Gridded GDP data. -``data/bundle/ppp_2013_1km_Aggregated.tif`` +``data/bundle/ppp_2019_1km_Aggregated.tif`` - **Source:** WorldPop (www.worldpop.org - School of Geography and Environmental Science, University of Southampton; Department of Geography and Geosciences, @@ -104,5 +97,5 @@ scope to reduce file size, or are not provided through stable URLs elsewhere. Funded by The Bill and Melinda Gates Foundation (OPP1134076). https://dx.doi.org/10.5258/SOTON/WP00647 - **Link:** https://hub.worldpop.org/doi/10.5258/SOTON/WP00647 -- **License:** CC-BY 4.0 (`reference `__) +- **License:** CC-BY 4.0 (`reference `__) - **Description:** Gridded population data. diff --git a/doc/release_notes.rst b/doc/release_notes.rst index e931365ce..63972da76 100644 --- a/doc/release_notes.rst +++ b/doc/release_notes.rst @@ -115,6 +115,8 @@ Upcoming Release * Update locations and capacities of ammonia plants. +* Updating all base shapes (country_shapes, europe_shape, nuts3_shapes, ...). The workflow has been modified to use higher resolution and more harmonised shapes (NUTS3 2021 01M data and OSM administration level 1 for non-NUTS3 countries, such as BA, MD, UA, and XK). Data sources for population and GDP p.c. have been updated to JRC ARDECO https://urban.jrc.ec.europa.eu/ardeco/ -- 2019 values are used. `build_gdp_pop_non_nuts3` (originally created to build regional GDP p.c. and population data for MD and UA) is now integrated into `build_shapes` and extended to build regional values for all non-NUTS3 countries using cutouts of the updated datasets `GDP_per_capita_PPP_1990_2015_v2.nc` and `ppp_2019_1km_Aggregated.tif`, + PyPSA-Eur 0.13.0 (13th September 2024) ====================================== diff --git a/rules/build_electricity.smk b/rules/build_electricity.smk index 533429d9a..1b885c532 100755 --- a/rules/build_electricity.smk +++ b/rules/build_electricity.smk @@ -98,17 +98,45 @@ rule base_network: "../scripts/base_network.py" +rule build_osm_boundaries: + input: + json="data/osm-boundaries/json/{country}_adm1.json", + eez=ancient("data/eez/World_EEZ_v12_20231025_LR/eez_v12_lowres.gpkg"), + output: + boundary="data/osm-boundaries/build/{country}_adm1.geojson", + log: + "logs/build_osm_boundaries_{country}.log", + threads: 1 + resources: + mem_mb=1500, + conda: + "../envs/environment.yaml" + script: + "../scripts/build_osm_boundaries.py" + + +rule build_osm_boundaries_all: + input: + "data/osm-boundaries/build/MD_adm1.geojson", + "data/osm-boundaries/build/BA_adm1.geojson", + "data/osm-boundaries/build/UA_adm1.geojson", + "data/osm-boundaries/build/XK_adm1.geojson", + + rule build_shapes: params: countries=config_provider("countries"), input: - naturalearth=ancient("data/naturalearth/ne_10m_admin_0_countries_deu.shp"), eez=ancient("data/eez/World_EEZ_v12_20231025_LR/eez_v12_lowres.gpkg"), - nuts3=ancient("data/nuts/NUTS_RG_03M_2013_4326_LEVL_3.geojson"), - nuts3pop=ancient("data/bundle/nama_10r_3popgdp.tsv.gz"), - nuts3gdp=ancient("data/bundle/nama_10r_3gdp.tsv.gz"), - ch_cantons=ancient("data/ch_cantons.csv"), - ch_popgdp=ancient("data/bundle/je-e-21.03.02.xls"), + nuts3_2021="data/nuts/NUTS_RG_01M_2021_4326_LEVL_3.geojson", + ba_adm1="data/osm-boundaries/build/BA_adm1.geojson", + md_adm1="data/osm-boundaries/build/MD_adm1.geojson", + ua_adm1="data/osm-boundaries/build/UA_adm1.geojson", + xk_adm1="data/osm-boundaries/build/XK_adm1.geojson", + nuts3_gdp="data/jrc-ardeco/ARDECO-SUVGDP.2021.table.csv", + nuts3_pop="data/jrc-ardeco/ARDECO-SNPTD.2021.table.csv", + other_gdp="data/sandbox/GDP_per_capita_PPP_1990_2015_v2.nc", # TODO: update links to data/bundle after data bundle update + other_pop="data/sandbox/ppp_2019_1km_Aggregated.tif", # TODO: update links to data/bundle after data bundle update output: country_shapes=resources("country_shapes.geojson"), offshore_shapes=resources("offshore_shapes.geojson"), @@ -472,42 +500,10 @@ def input_conventional(w): } -# Optional input when having Ukraine (UA) or Moldova (MD) in the countries list -def input_gdp_pop_non_nuts3(w): - countries = set(config_provider("countries")(w)) - if {"UA", "MD"}.intersection(countries): - return {"gdp_pop_non_nuts3": resources("gdp_pop_non_nuts3.geojson")} - return {} - - -rule build_gdp_pop_non_nuts3: - params: - countries=config_provider("countries"), - input: - base_network=resources("networks/base_s.nc"), - regions=resources("regions_onshore_base_s.geojson"), - gdp_non_nuts3="data/bundle/GDP_per_capita_PPP_1990_2015_v2.nc", - pop_non_nuts3="data/bundle/ppp_2013_1km_Aggregated.tif", - output: - resources("gdp_pop_non_nuts3.geojson"), - log: - logs("build_gdp_pop_non_nuts3.log"), - benchmark: - benchmarks("build_gdp_pop_non_nuts3") - threads: 1 - resources: - mem_mb=8000, - conda: - "../envs/environment.yaml" - script: - "../scripts/build_gdp_pop_non_nuts3.py" - - rule build_electricity_demand_base: params: distribution_key=config_provider("load", "distribution_key"), input: - unpack(input_gdp_pop_non_nuts3), base_network=resources("networks/base_s.nc"), regions=resources("regions_onshore_base_s.geojson"), nuts3=resources("nuts3_shapes.geojson"), @@ -593,6 +589,8 @@ def input_cluster_network(w): rule cluster_network: params: + mode=config_provider("clustering", "mode"), + admin=config_provider("clustering", "admin"), cluster_network=config_provider("clustering", "cluster_network"), aggregation_strategies=config_provider( "clustering", "aggregation_strategies", default={} @@ -608,9 +606,9 @@ rule cluster_network: input: unpack(input_cluster_network), network=resources("networks/base_s.nc"), + nuts3_shapes=resources("nuts3_shapes.geojson"), regions_onshore=resources("regions_onshore_base_s.geojson"), regions_offshore=resources("regions_offshore_base_s.geojson"), - busmap=ancient(resources("busmap_base_s.csv")), hac_features=lambda w: ( resources("hac_features.nc") if config_provider("clustering", "cluster_network", "algorithm")(w) diff --git a/rules/build_sector.smk b/rules/build_sector.smk index 087ed1fb0..523ae58f0 100755 --- a/rules/build_sector.smk +++ b/rules/build_sector.smk @@ -468,6 +468,9 @@ rule build_heat_totals: "../scripts/build_heat_totals.py" +# TODO: update long-term to only need NUTS2021, nuts3_shapes.geojson +# However: requires manual updating of NUTS code changes 2016 to 2021: +# https://ec.europa.eu/eurostat/web/nuts/history rule build_biomass_potentials: params: biomass=config_provider("biomass"), diff --git a/rules/retrieve.smk b/rules/retrieve.smk index f34daef95..6a66ac360 100755 --- a/rules/retrieve.smk +++ b/rules/retrieve.smk @@ -18,7 +18,6 @@ if config["enable"]["retrieve"] and config["enable"].get("retrieve_databundle", datafiles = [ "je-e-21.03.02.xls", "nama_10r_3popgdp.tsv.gz", - "nama_10r_3gdp.tsv.gz", "corine/g250_clc06_V18_5.tif", "eea/UNFCCC_v23.csv", "emobility/KFZ__count", @@ -26,8 +25,8 @@ if config["enable"]["retrieve"] and config["enable"].get("retrieve_databundle", "h2_salt_caverns_GWh_per_sqkm.geojson", "natura/natura.tiff", "gebco/GEBCO_2014_2D.nc", - "GDP_per_capita_PPP_1990_2015_v2.nc", - "ppp_2013_1km_Aggregated.tif", + "GDP_per_capita_PPP_1990_2015_v2.nc", # TODO: update + "ppp_2013_1km_Aggregated.tif", # TODO: update ] rule retrieve_databundle: @@ -77,7 +76,7 @@ if config["enable"]["retrieve"] and config["enable"].get("retrieve_databundle", if config["enable"]["retrieve"]: - rule retrieve_nuts_shapes: + rule retrieve_nuts_2013_shapes: input: shapes=storage( "https://gisco-services.ec.europa.eu/distribution/v2/nuts/download/ref-nuts-2013-03m.geojson.zip" @@ -101,6 +100,34 @@ if config["enable"]["retrieve"]: +if config["enable"]["retrieve"]: + + rule retrieve_nuts_2021_shapes: + input: + shapes=storage( + "https://gisco-services.ec.europa.eu/distribution/v2/nuts/download/ref-nuts-2021-01m.geojson.zip" + ), + output: + shapes_level_3="data/nuts/NUTS_RG_01M_2021_4326_LEVL_3.geojson", + shapes_level_2="data/nuts/NUTS_RG_01M_2021_4326_LEVL_2.geojson", + shapes_level_1="data/nuts/NUTS_RG_01M_2021_4326_LEVL_1.geojson", + shapes_level_0="data/nuts/NUTS_RG_01M_2021_4326_LEVL_0.geojson", + params: + zip_file="data/nuts/ref-nuts-2021-01m.geojson.zip", + run: + os.rename(input.shapes, params.zip_file) + with ZipFile(params.zip_file, "r") as zip_ref: + for level in ["LEVL_3", "LEVL_2", "LEVL_1", "LEVL_0"]: + filename = f"NUTS_RG_01M_2021_4326_{level}.geojson" + zip_ref.extract(filename, Path(output.shapes_level_0).parent) + extracted_file = Path(output.shapes_level_0).parent / filename + extracted_file.rename( + getattr(output, f"shapes_level_{level[-1]}") + ) + os.remove(params.zip_file) + + + if config["enable"]["retrieve"] and config["enable"].get("retrieve_cutout", True): rule retrieve_cutout: @@ -371,27 +398,6 @@ if config["enable"]["retrieve"]: -if config["enable"]["retrieve"]: - - # Download directly from naciscdn.org which is a redirect from naturalearth.com - # (https://www.naturalearthdata.com/downloads/10m-cultural-vectors/10m-admin-0-countries/) - # Use point-of-view (POV) variant of Germany so that Crimea is included. - rule retrieve_naturalearth_countries: - input: - storage( - "https://naciscdn.org/naturalearth/10m/cultural/ne_10m_admin_0_countries_deu.zip" - ), - params: - zip="data/naturalearth/ne_10m_admin_0_countries_deu.zip", - output: - countries="data/naturalearth/ne_10m_admin_0_countries_deu.shp", - run: - move(input[0], params["zip"]) - output_folder = Path(output["countries"]).parent - unpack_archive(params["zip"], output_folder) - os.remove(params["zip"]) - - if config["enable"]["retrieve"]: rule retrieve_gem_europe_gas_tracker: @@ -629,6 +635,20 @@ if config["enable"]["retrieve"] and ( ), +if config["enable"]["retrieve"]: + + rule retrieve_osm_boundaries: + output: + json="data/osm-boundaries/json/{country}_adm1.json", + log: + "logs/retrieve_osm_boundaries_{country}_adm1.log", + threads: 1 + conda: + "../envs/retrieve.yaml" + script: + "../scripts/retrieve_osm_boundaries.py" + + if config["enable"]["retrieve"]: rule retrieve_heat_source_utilisation_potentials: @@ -645,3 +665,48 @@ if config["enable"]["retrieve"]: "data/heat_source_utilisation_potentials/{heat_source}.gpkg", script: "../scripts/retrieve_heat_source_utilisation_potentials.py" + + +if config["enable"]["retrieve"]: + + rule retrieve_jrc_ardeco: + output: + ardeco_gdp="data/jrc-ardeco/ARDECO-SUVGDP.2021.table.csv", + ardeco_pop="data/jrc-ardeco/ARDECO-SNPTD.2021.table.csv", + run: + import requests + + urls = { + "ardeco_gdp": "https://urban.jrc.ec.europa.eu/ardeco-api-v2/rest/export/SUVGDP?version=2021&format=csv-table", + "ardeco_pop": "https://urban.jrc.ec.europa.eu/ardeco-api-v2/rest/export/SNPTD?version=2021&format=csv-table", + } + + for key, url in urls.items(): + response = requests.get(url) + output_path = output[key] if key in urls else None + if output_path: + with open(output_path, "wb") as f: + f.write(response.content) + + + +# TODO: Remove before merging into master and after updating databundle on Zenodo: +if config["enable"]["retrieve"]: + + rule retrieve_sandbox_data: + input: + gdp_non_nuts3=storage( + "https://github.com/bobbyxng/sandbox/raw/refs/heads/main/data/GDP_per_capita_PPP_1990_2015_v2.nc", + keep_local=True, + ), + pop_non_nuts_3=storage( + "https://github.com/bobbyxng/sandbox/raw/refs/heads/main/data/ppp_2019_1km_Aggregated.tif", + keep_local=True, + ), + output: + gdp_non_nuts3="data/sandbox/GDP_per_capita_PPP_1990_2015_v2.nc", + pop_non_nuts_3="data/sandbox/ppp_2019_1km_Aggregated.tif", + retries: 1 + run: + for key in input.keys(): + move(input[key], output[key]) diff --git a/scripts/build_electricity_demand_base.py b/scripts/build_electricity_demand_base.py index d999d6f3b..192faf570 100644 --- a/scripts/build_electricity_demand_base.py +++ b/scripts/build_electricity_demand_base.py @@ -44,7 +44,6 @@ def upsample_load( regions_fn: str, load_fn: str, nuts3_fn: str, - gdp_pop_non_nuts3_fn: str, distribution_key: dict[str, float], ) -> pd.DataFrame: substation_lv_i = n.buses.index[n.buses["substation_lv"]] @@ -61,19 +60,7 @@ def upsample_load( for cntry, group in gdf_regions.geometry.groupby(gdf_regions.country): load_ct = load[cntry] - if cntry in ["UA", "MD"]: - # separate handling because nuts3 provides no data for UA+MD - gdp_pop_non_nuts3 = gpd.read_file(gdp_pop_non_nuts3_fn).set_index("Bus") - gdp_pop_non_nuts3 = gdp_pop_non_nuts3.loc[ - (gdp_pop_non_nuts3.country == cntry) - & (gdp_pop_non_nuts3.index.isin(substation_lv_i)) - ] - factors = normed( - gdp_weight * normed(gdp_pop_non_nuts3["gdp"]) - + pop_weight * normed(gdp_pop_non_nuts3["pop"]) - ) - - elif len(group) == 1: + if len(group) == 1: factors = pd.Series(1.0, index=group.index) else: @@ -116,7 +103,6 @@ def upsample_load( regions_fn=snakemake.input.regions, load_fn=snakemake.input.load, nuts3_fn=snakemake.input.nuts3, - gdp_pop_non_nuts3_fn=snakemake.input.get("gdp_pop_non_nuts3"), distribution_key=params.distribution_key, ) diff --git a/scripts/build_gdp_pop_non_nuts3.py b/scripts/build_gdp_pop_non_nuts3.py deleted file mode 100644 index 79d141cc7..000000000 --- a/scripts/build_gdp_pop_non_nuts3.py +++ /dev/null @@ -1,152 +0,0 @@ -# SPDX-FileCopyrightText: Contributors to PyPSA-Eur -# -# SPDX-License-Identifier: MIT -""" -Maps the per-capita GDP and population values to non-NUTS3 regions. - -The script takes as input the country code, a GeoDataFrame containing -the regions, and the file paths to the datasets containing the GDP and -POP values for non-NUTS3 countries. -""" - -import logging - -import geopandas as gpd -import numpy as np -import pandas as pd -import pypsa -import rasterio -import xarray as xr -from _helpers import configure_logging, set_scenario_config -from rasterio.mask import mask -from shapely.geometry import box - -logger = logging.getLogger(__name__) - - -def calc_gdp_pop(country, regions, gdp_non_nuts3, pop_non_nuts3): - """ - Calculate the GDP p.c. and population values for non NUTS3 regions. - - Parameters - ---------- - country (str): The two-letter country code of the non-NUTS3 region. - regions (GeoDataFrame): A GeoDataFrame containing the regions. - gdp_non_nuts3 (str): The file path to the dataset containing the GDP p.c values - for non NUTS3 countries (e.g. MD, UA) - pop_non_nuts3 (str): The file path to the dataset containing the POP values - for non NUTS3 countries (e.g. MD, UA) - - Returns - ------- - tuple: A tuple containing two GeoDataFrames: - - gdp: A GeoDataFrame with the mean GDP p.c. values mapped to each bus. - - pop: A GeoDataFrame with the summed POP values mapped to each bus. - """ - regions = regions.rename(columns={"name": "Bus"}).set_index("Bus") - regions = regions[regions.country == country] - # Create a bounding box for UA, MD from region shape, including a buffer of 10000 metres - bounding_box = ( - gpd.GeoDataFrame(geometry=[box(*regions.total_bounds)], crs=regions.crs) - .to_crs(epsg=3857) - .buffer(10000) - .to_crs(regions.crs) - ) - - # GDP Mapping - logger.info(f"Mapping mean GDP p.c. to non-NUTS3 region: {country}") - with xr.open_dataset(gdp_non_nuts3) as src_gdp: - src_gdp = src_gdp.where( - (src_gdp.longitude >= bounding_box.bounds.minx.min()) - & (src_gdp.longitude <= bounding_box.bounds.maxx.max()) - & (src_gdp.latitude >= bounding_box.bounds.miny.min()) - & (src_gdp.latitude <= bounding_box.bounds.maxy.max()), - drop=True, - ) - gdp = src_gdp.to_dataframe().reset_index() - gdp = gdp.rename(columns={"GDP_per_capita_PPP": "gdp"}) - gdp = gdp[gdp.time == gdp.time.max()] - gdp_raster = gpd.GeoDataFrame( - gdp, - geometry=gpd.points_from_xy(gdp.longitude, gdp.latitude), - crs="EPSG:4326", - ) - gdp_mapped = gpd.sjoin(gdp_raster, regions, predicate="within") - gdp = ( - gdp_mapped.copy() - .groupby(["Bus", "country"]) - .agg({"gdp": "mean"}) - .reset_index(level=["country"]) - ) - - # Population Mapping - logger.info(f"Mapping summed population to non-NUTS3 region: {country}") - with rasterio.open(pop_non_nuts3) as src_pop: - # Mask the raster with the bounding box - out_image, out_transform = mask(src_pop, bounding_box, crop=True) - out_meta = src_pop.meta.copy() - out_meta.update( - { - "driver": "GTiff", - "height": out_image.shape[1], - "width": out_image.shape[2], - "transform": out_transform, - } - ) - masked_data = out_image[0] # Use the first band (rest is empty) - row_indices, col_indices = np.where(masked_data != src_pop.nodata) - values = masked_data[row_indices, col_indices] - - # Affine transformation from pixel coordinates to geo coordinates - x_coords, y_coords = rasterio.transform.xy(out_transform, row_indices, col_indices) - pop_raster = pd.DataFrame({"x": x_coords, "y": y_coords, "pop": values}) - pop_raster = gpd.GeoDataFrame( - pop_raster, - geometry=gpd.points_from_xy(pop_raster.x, pop_raster.y), - crs=src_pop.crs, - ) - pop_mapped = gpd.sjoin(pop_raster, regions, predicate="within") - pop = ( - pop_mapped.groupby(["Bus", "country"]) - .agg({"pop": "sum"}) - .reset_index() - .set_index("Bus") - ) - gdp_pop = regions.join(gdp.drop(columns="country"), on="Bus").join( - pop.drop(columns="country"), on="Bus" - ) - gdp_pop.fillna(0, inplace=True) - - return gdp_pop - - -if __name__ == "__main__": - if "snakemake" not in globals(): - from _helpers import mock_snakemake - - snakemake = mock_snakemake( - "build_gdp_pop_non_nuts3", configfiles=["config/config.osm-raw.yaml"] - ) - configure_logging(snakemake) - set_scenario_config(snakemake) - - n = pypsa.Network(snakemake.input.base_network) - regions = gpd.read_file(snakemake.input.regions) - - gdp_non_nuts3 = snakemake.input.gdp_non_nuts3 - pop_non_nuts3 = snakemake.input.pop_non_nuts3 - - subset = {"MD", "UA"}.intersection(snakemake.params.countries) - - gdp_pop = pd.concat( - [ - calc_gdp_pop(country, regions, gdp_non_nuts3, pop_non_nuts3) - for country in subset - ], - axis=0, - ) - - logger.info( - f"Exporting GDP and POP values for non-NUTS3 regions {snakemake.output}" - ) - gdp_pop.reset_index().to_file(snakemake.output[0], driver="GeoJSON") diff --git a/scripts/build_osm_boundaries.py b/scripts/build_osm_boundaries.py new file mode 100755 index 000000000..923b5416b --- /dev/null +++ b/scripts/build_osm_boundaries.py @@ -0,0 +1,184 @@ +# SPDX-FileCopyrightText: : 2017-2024 The PyPSA-Eur Authors +# +# SPDX-License-Identifier: MIT + +import json +import logging + +import country_converter as coco +import geopandas as gpd +import pandas as pd +from _helpers import configure_logging, set_scenario_config +from build_shapes import eez +from shapely import line_merge +from shapely.geometry import LineString, MultiLineString, MultiPolygon, Polygon + +logger = logging.getLogger(__name__) + +cc = coco.CountryConverter() + +GEO_CRS = "EPSG:4326" +EXCLUDER_LIST = [ + 3788485, # RU version of Sevastopol + 3795586, # RU version of Crimea +] + + +def _create_linestring(row): + """ + Create a LineString object from the given row. + + Parameters + ---------- + row (dict): A dictionary containing the row data. + + Returns + ------- + LineString: A LineString object representing the geometry. + """ + coords = [(coord["lon"], coord["lat"]) for coord in row["geometry"]] + return LineString(coords) + + +def _create_geometries(row, crs=GEO_CRS): + """ + Create geometries from OSM data. + + This function processes OpenStreetMap (OSM) data to create geometries + based on the roles of the members in the data. It supports "outer" and + "inner" roles to construct polygons and multipolygons. + + Parameters + ---------- + - row (dict): A dictionary containing OSM data with a "members" key. + - crs (str, optional): Coordinate reference system for the geometries. + Defaults to GEO_CRS. + + Returns + ------- + - shapely.geometry.Polygon or shapely.geometry.MultiPolygon: The resulting + geometry after processing the OSM data. + """ + valid_roles = ["outer", "inner"] + df = pd.json_normalize(row["members"]) + df = df[df["role"].isin(valid_roles) & ~df["geometry"].isna()] + df.loc[:, "geometry"] = df.apply(_create_linestring, axis=1) + + gdf = gpd.GeoDataFrame(df, geometry="geometry", crs=crs) + outer = line_merge(gdf[gdf["role"] == "outer"].union_all()) + + if isinstance(outer, LineString): + outer = Polygon(outer) + if isinstance(outer, MultiLineString): + polygons = [Polygon(line) for line in outer.geoms] + outer = MultiPolygon(polygons) + + if not gdf[gdf["role"] == "inner"].empty: + inner = line_merge(gdf[gdf["role"] == "inner"].union_all()) + + if isinstance(inner, LineString): + inner = Polygon(inner) + if isinstance(inner, MultiLineString): + polygons = [Polygon(line) for line in inner.geoms] + inner = MultiPolygon(polygons) + + outer = outer.difference(inner) + + return outer + + +def build_osm_boundaries(country, adm1_path, offshore_shapes): + """ + Build administrative boundaries from OSM data for a given country. + + Parameters + ---------- + - country (str): The country code (e.g., 'DE' for Germany). + - adm1_path (str): The file path to the administrative level 1 OSM data in JSON format. + - offshore_shapes (GeoDataFrame): A GeoDataFrame containing offshore shapes to clip the boundaries. + + Returns + ------- + GeoDataFrame: A GeoDataFrame containing the administrative boundaries with columns: + - id: The administrative level 1 code. + - country: The country code. + - name: The administrative level 1 name. + - osm_id: The OSM ID. + - geometry: The geometry of the boundary. + """ + logger.info( + f"Building administrative boundaries from OSM data for country {country}." + ) + + df = json.load(open(adm1_path)) + df = pd.DataFrame(df["elements"]) + + # Filter out ids in excluder list + df = df[~df["id"].isin(EXCLUDER_LIST)] + + df.loc[:, "geometry"] = df.apply(_create_geometries, axis=1) + + col_tags = [ + "ISO3166-2", + "name:en", + ] + + tags = pd.json_normalize(df["tags"]).map(lambda x: str(x) if pd.notnull(x) else x) + + for ct in col_tags: + if ct not in tags.columns: + tags[ct] = pd.NA + + tags = tags.loc[:, col_tags] + + df = pd.concat([df, tags], axis="columns") + df.drop(columns=["type", "tags", "bounds", "members"], inplace=True) + + # Rename columns + df.rename( + columns={ + "id": "osm_id", + "ISO3166-2": "id", + "name:en": "name", + }, + inplace=True, + ) + + df["country"] = country + + # Resort columns + df = df[["id", "country", "name", "osm_id", "geometry"]] + + # If any id is missing, sort by osm_id and number starting from 1 + if df["id"].isna().any(): + df.sort_values(by="osm_id", inplace=True) + df["id"] = country + "-" + df.index.to_series().add(1).astype(str) + + gdf = gpd.GeoDataFrame(df, geometry="geometry", crs=GEO_CRS) + + # Clip gdf by offshore shapes + gdf = gpd.overlay(gdf, offshore_shapes, how="difference") + + # Check if substring in "id" is equal to country, if not, drop + gdf = gdf[gdf["id"].str.startswith(country)] + + return gdf + + +if __name__ == "__main__": + if "snakemake" not in globals(): + from _helpers import mock_snakemake + + snakemake = mock_snakemake("build_osm_boundaries", country="MD") + + configure_logging(snakemake) + set_scenario_config(snakemake) + + country = snakemake.wildcards.country + adm1_path = snakemake.input.json + offshore_shapes = eez(snakemake.input.eez) + + boundaries = build_osm_boundaries(country, adm1_path, offshore_shapes) + + # Export + boundaries.to_file(snakemake.output[0], driver="GeoJSON") diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py index 099e77931..405ef7721 100755 --- a/scripts/build_shapes.py +++ b/scripts/build_shapes.py @@ -4,71 +4,11 @@ """ Creates GIS shape files of the countries, exclusive economic zones and `NUTS3 < https://en.wikipedia.org/wiki/Nomenclature_of_Territorial_Units_for_Statistics> -`_ areas. - -Relevant Settings ------------------ - -.. code:: yaml - - countries: - -.. seealso:: - Documentation of the configuration file ``config/config.yaml`` at - :ref:`toplevel_cf` - -Inputs ------- - -- ``data/bundle/naturalearth/ne_10m_admin_0_countries.shp``: World country shapes - - .. image:: img/countries.png - :scale: 33 % - -- ``data/eez/World_EEZ_v12_20231025_gpkg/eez_v12.gpkg ``: World `exclusive economic zones `_ (EEZ) - - .. image:: img/eez.png - :scale: 33 % - -- ``data/bundle/NUTS_2013_60M_SH/data/NUTS_RG_60M_2013.shp``: Europe NUTS3 regions - - .. image:: img/nuts3.png - :scale: 33 % - -- ``data/bundle/nama_10r_3popgdp.tsv.gz``: Average annual population by NUTS3 region (`eurostat `__) -- ``data/bundle/nama_10r_3gdp.tsv.gz``: Gross domestic product (GDP) by NUTS 3 regions (`eurostat `__) -- ``data/ch_cantons.csv``: Mapping between Swiss Cantons and NUTS3 regions -- ``data/bundle/je-e-21.03.02.xls``: Population and GDP data per Canton (`BFS - Swiss Federal Statistical Office `_ ) - -Outputs -------- - -- ``resources/country_shapes.geojson``: country shapes out of country selection - - .. image:: img/country_shapes.png - :scale: 33 % - -- ``resources/offshore_shapes.geojson``: EEZ shapes out of country selection - - .. image:: img/offshore_shapes.png - :scale: 33 % - -- ``resources/europe_shape.geojson``: Shape of Europe including countries and EEZ - - .. image:: img/europe_shape.png - :scale: 33 % - -- ``resources/nuts3_shapes.geojson``: NUTS3 shapes out of country selection including population and GDP data. - - .. image:: img/nuts3_shapes.png - :scale: 33 % - -Description ------------ +`_ and OSM ADM1 areas (for BA, MD, UA, and XK). """ import logging -from functools import reduce +import unicodedata from itertools import takewhile from operator import attrgetter @@ -76,19 +16,119 @@ import geopandas as gpd import numpy as np import pandas as pd +import rasterio +import xarray as xr from _helpers import configure_logging, set_scenario_config -from shapely.geometry import MultiPolygon, Polygon +from rasterio.mask import mask +from shapely.geometry import MultiPolygon, Polygon, box logger = logging.getLogger(__name__) - cc = coco.CountryConverter() -def _simplify_polys(polys, minarea=0.1, tolerance=None, filterremote=True): +GEO_CRS = "EPSG:4326" +DISTANCE_CRS = "EPSG:3035" +GDP_YEAR = 2019 +POP_YEAR = 2019 +EUROPE_COUNTRIES = [ + "AL", + "AT", + "BA", + "BE", + "BG", + "CH", + "CZ", + "DE", + "DK", + "EE", + "ES", + "FI", + "FR", + "GB", + "GR", + "HR", + "HU", + "IE", + "IT", + "LT", + "LU", + "LV", + "ME", + "MK", + "NL", + "NO", + "PL", + "PT", + "RO", + "RS", + "SE", + "SI", + "SK", + "XK", + "UA", + "MD", +] +DROP_REGIONS = [ + "ES703", + "ES704", + "ES705", + "ES706", + "ES707", + "ES708", + "ES709", + "ES630", + "ES640", + "FRY10", + "FRY20", + "FRY30", + "FRY40", + "FRY50", + "NO0B1", + "NO0B2", + "PT200", + "PT300", +] +OTHER_GDP_TOTAL_2019 = { # in bn. USD + "BA": 20.48, # World Bank + "MD": 11.74, # World Bank + "UA": 153.9, # World Bank + "XK": 7.9, # https://de.statista.com/statistik/daten/studie/415738/umfrage/bruttoinlandsprodukt-bip-des-kosovo/ +} +OTHER_POP_2019 = { # in 1000 persons + "BA": 3361, # World Bank + "MD": 2664, # World Bank + "UA": 44470, # World Bank + "XK": 1782, # World Bank +} +EXCHANGE_EUR_USD_2019 = 1.1 +NUTS3_INCLUDE = [ + "DE80N", + "DEF08", + "DK014", + "DK050", + "GBH34", + "GBJ34", + "GBJ43", + "NL33A", + "NL33C", + "NL342", + "NL411", + "NL412", + "PL428", +] + + +def _simplify_polys( + polys, minarea=100 * 1e6, maxdistance=None, tolerance=None, filterremote=True +): # 100*1e6 = 100 km² if CRS is DISTANCE_CRS if isinstance(polys, MultiPolygon): polys = sorted(polys.geoms, key=attrgetter("area"), reverse=True) mainpoly = polys[0] mainlength = np.sqrt(mainpoly.area / (2.0 * np.pi)) + + if maxdistance is not None: + mainlength = maxdistance + if mainpoly.area > minarea: polys = MultiPolygon( [ @@ -104,32 +144,14 @@ def _simplify_polys(polys, minarea=0.1, tolerance=None, filterremote=True): return polys -def countries(naturalearth, country_list): - df = gpd.read_file(naturalearth) - - # Names are a hassle in naturalearth, try several fields - fieldnames = ( - df[x].where(lambda s: s != "-99") for x in ("ISO_A2", "WB_A2", "ADM0_A3") - ) - df["name"] = reduce(lambda x, y: x.fillna(y), fieldnames, next(fieldnames)).str[:2] - df.replace({"name": {"KV": "XK"}}, inplace=True) - - df = df.loc[ - df.name.isin(country_list) & ((df["scalerank"] == 0) | (df["scalerank"] == 5)) - ] - s = df.set_index("name")["geometry"].map(_simplify_polys).set_crs(df.crs) - - return s - - -def eez(eez, country_list): +def eez(eez, country_list=EUROPE_COUNTRIES): df = gpd.read_file(eez) iso3_list = cc.convert(country_list, src="ISO2", to="ISO3") # noqa: F841 pol_type = ["200NM", "Overlapping claim"] # noqa: F841 df = df.query("ISO_TER1 in @iso3_list and POL_TYPE in @pol_type").copy() df["name"] = cc.convert(df.ISO_TER1, src="ISO3", to="ISO2") s = df.set_index("name").geometry.map( - lambda s: _simplify_polys(s, filterremote=False) + lambda s: _simplify_polys(s, minarea=0.1, filterremote=False) ) s = s.to_frame("geometry").set_crs(df.crs) s.index.name = "name" @@ -146,96 +168,324 @@ def country_cover(country_shapes, eez_shapes=None): return Polygon(shell=europe_shape.exterior) -def nuts3(country_shapes, nuts3, nuts3pop, nuts3gdp, ch_cantons, ch_popgdp): - df = gpd.read_file(nuts3) - df["geometry"] = df["geometry"].map(_simplify_polys) - df = df.rename(columns={"NUTS_ID": "id"})[["id", "geometry"]].set_index("id") +def normalise_text(text): + """ + Removes diacritics from non-standard Latin letters, converts them to their + closest standard 26-letter Latin equivalents, and removes asterisks (*) from the text. + + Args: + text (str): Input string to normalize. + + Returns: + str: Normalized string with only standard Latin letters and no asterisks. + """ + # Normalize Unicode to decompose characters (e.g., č -> c + ̌) + text = unicodedata.normalize("NFD", text) + # Remove diacritical marks by filtering out characters of the 'Mn' (Mark, Nonspacing) category + text = "".join(char for char in text if unicodedata.category(char) != "Mn") + # Remove asterisks + text = text.replace("*", "") + # Optionally, ensure only ASCII characters remain + text = "".join(char for char in text if char.isascii()) + return text + + +def simplify_europe(regions): + """ + Simplifies the geometries of European regions by removing small islands and re-adding selected regions manually. + + Parameters + ---------- + regions (GeoDataFrame): A GeoDataFrame containing the geometries of European regions. + + Returns + ------- + regions (GeoDataFrame): A simplified GeoDataFrame with updated geometries and dropped entries. + """ + logger.info( + "Simplifying geometries for Europe by removing small islands smaller than 500 km2 or further than 200 km away." + ) + coverage = ( + regions.to_crs(DISTANCE_CRS) + .groupby("country")["geometry"] + .apply(lambda x: x.union_all()) + ) + coverage_dk = coverage.loc[["DK"]] + coverage = coverage.apply(_simplify_polys, minarea=500 * 1e6, maxdistance=200 * 1e3) + coverage_dk = coverage_dk.apply( + _simplify_polys, minarea=65 * 1e6, maxdistance=200 * 1e3 + ) + coverage.loc["DK"] = coverage_dk.values[0] + coverage = gpd.GeoDataFrame(geometry=coverage, crs=DISTANCE_CRS).to_crs(GEO_CRS) + + # Re-add selected regions manually + coverage = pd.concat([coverage, regions.loc[NUTS3_INCLUDE, ["geometry"]]]) + shape = coverage.union_all() + + regions_polygon = regions.explode() + regions_polygon = gpd.sjoin( + regions_polygon, + gpd.GeoDataFrame(geometry=[shape], crs=GEO_CRS), + how="inner", + predicate="intersects", + ) + + # Group by level3 and country, and aggregate by union, and name index "index" + regions_polygon = regions_polygon.groupby(["level3"])["geometry"].apply( + lambda x: x.union_all() + ) - pop = pd.read_table(nuts3pop, na_values=[":"], delimiter=" ?\t", engine="python") - pop = ( - pop.set_index( - pd.MultiIndex.from_tuples(pop.pop("unit,geo\\time").str.split(",")) - ) - .loc["THS"] - .map(lambda x: pd.to_numeric(x, errors="coerce")) - .bfill(axis=1) - )["2014"] - - gdp = pd.read_table(nuts3gdp, na_values=[":"], delimiter=" ?\t", engine="python") - gdp = ( - gdp.set_index( - pd.MultiIndex.from_tuples(gdp.pop("unit,geo\\time").str.split(",")) - ) - .loc["EUR_HAB"] - .map(lambda x: pd.to_numeric(x, errors="coerce")) - .bfill(axis=1) - )["2014"] - - cantons = pd.read_csv(ch_cantons) - cantons = cantons.set_index(cantons["HASC"].str[3:])["NUTS"] - cantons = cantons.str.pad(5, side="right", fillchar="0") - - swiss = pd.read_excel(ch_popgdp, skiprows=3, index_col=0) - swiss.columns = swiss.columns.to_series().map(cantons) - - swiss_pop = pd.to_numeric(swiss.loc["Residents in 1000", "CH040":]) - pop = pd.concat([pop, swiss_pop]) - swiss_gdp = pd.to_numeric( - swiss.loc["Gross domestic product per capita in Swiss francs", "CH040":] + # Update regions + regions = regions.loc[regions.index.isin(regions_polygon.index)] + regions.loc[regions_polygon.index, "geometry"] = regions_polygon + + return regions + + +def create_regions( + country_list, + nuts3_path, + ba_adm1_path, + md_adm1_path, + ua_adm1_path, + xk_adm1_path, + offshore_shapes, + nuts3_gdp, + nuts3_pop, + other_gdp, + other_pop, +): + """ + Create regions by processing NUTS and non-NUTS geographical shapes. + + Parameters + ---------- + - country_list (list): List of country codes to include. + - nuts3_path (str): Path to the NUTS3 2021 shapefile. + - ba_adm1_path (str): Path to adm1 boundaries for Bosnia and Herzegovina. + - md_adm1_path (str): Path to adm1 boundaries for Moldova. + - ua_adm1_path (str): Path to adm1 boundaries for Ukraine. + - xk_adm1_path (str): Path to adm1 boundaries for Kosovo. + - offshore_shapes (geopandas.GeoDataFrame): Geographical shapes of the exclusive economic zones. + + Returns + ------- + geopandas.GeoDataFrame: A GeoDataFrame containing the processed regions with columns: + - id: Region identifier. + - country: Country code. + - name: Region name. + - geometry: Geometrical shape of the region. + - level1: Level 1 region identifier. + - level2: Level 2 region identifier. + - level3: Level 3 region identifier. + """ + # Prepare NUTS shapes + logger.info("Processing NUTS regions.") + regions = gpd.read_file(nuts3_path) + regions.loc[regions.CNTR_CODE == "EL", "CNTR_CODE"] = "GR" # Rename "EL" to "GR + regions["NUTS_ID"] = regions["NUTS_ID"].str.replace("EL", "GR") + regions.loc[regions.CNTR_CODE == "UK", "CNTR_CODE"] = "GB" # Rename "UK" to "GB" + regions["NUTS_ID"] = regions["NUTS_ID"].str.replace("UK", "GB") + + # Create new df + regions = regions[["NUTS_ID", "CNTR_CODE", "NAME_LATN", "geometry"]] + + # Rename columns and add level columns + regions = regions.rename( + columns={"NUTS_ID": "id", "CNTR_CODE": "country", "NAME_LATN": "name"} + ) + + # Normalise text + regions["id"] = regions["id"].apply(normalise_text) + + regions["level1"] = regions["id"].str[:3] + regions["level2"] = regions["id"].str[:4] + regions["level3"] = regions["id"] + + # Non NUTS countries + logger.info("Processing non-NUTS regions.") + + ba_adm1 = gpd.read_file(ba_adm1_path) + md_adm1 = gpd.read_file(md_adm1_path) + ua_adm1 = gpd.read_file(ua_adm1_path) + xk_adm1 = gpd.read_file(xk_adm1_path) + + regions_non_nuts = pd.concat([ba_adm1, md_adm1, ua_adm1, xk_adm1]) + regions_non_nuts = regions_non_nuts.drop(columns=["osm_id"]) + + # Normalise text + regions_non_nuts["id"] = regions_non_nuts["id"].apply(normalise_text) + regions_non_nuts["name"] = regions_non_nuts["name"].apply(normalise_text) + + # Add level columns + regions_non_nuts["level1"] = regions_non_nuts["id"] + regions_non_nuts["level2"] = regions_non_nuts["id"] + regions_non_nuts["level3"] = regions_non_nuts["id"] + + # Clip regions by non-NUTS shapes + regions["geometry"] = regions["geometry"].difference( + regions_non_nuts.geometry.union_all() + ) + + # Concatenate NUTS and non-NUTS regions + logger.info("Harmonising NUTS and non-NUTS regions.") + regions = pd.concat([regions, regions_non_nuts]) + regions.set_index("id", inplace=True) + + # Drop regions out of geographical scope + regions = regions.drop(DROP_REGIONS, errors="ignore") + + # Clip regions by offshore shapes + logger.info("Clipping regions by offshore shapes.") + regions["geometry"] = regions["geometry"].difference( + offshore_shapes.geometry.union_all() ) - gdp = pd.concat([gdp, swiss_gdp]) - df = df.join(pd.DataFrame(dict(pop=pop, gdp=gdp))) + # GDP and POP for NUTS3 regions + # GDP + logger.info(f"Importing JRC ARDECO GDP data for year {GDP_YEAR}.") + nuts3_gdp = pd.read_csv(nuts3_gdp, index_col=[0]) + nuts3_gdp = nuts3_gdp.query("LEVEL_ID == 3 and UNIT == 'EUR'") + nuts3_gdp.index = nuts3_gdp.index.str.replace("UK", "GB").str.replace("EL", "GR") + nuts3_gdp = nuts3_gdp[str(GDP_YEAR)] + regions["gdp"] = nuts3_gdp + + # Population + logger.info(f"Importing JRC ARDECO population data for year {POP_YEAR}.") + nuts3_pop = pd.read_csv(nuts3_pop, index_col=[0]) + nuts3_pop = nuts3_pop.query("LEVEL_ID == 3") + nuts3_pop.index = nuts3_pop.index.str.replace("UK", "GB").str.replace("EL", "GR") + nuts3_pop = nuts3_pop[str(POP_YEAR)] + regions["pop"] = nuts3_pop.div(1e3).round(0) + + # GDP and POP for non-NUTS3 regions + other_countries = {"BA", "MD", "UA", "XK"} + + if any(country in country_list for country in other_countries): + gdp_pop = pd.concat( + [ + calc_gdp_pop(country, regions, other_gdp, other_pop) + for country in other_countries + ], + axis=0, + ) + + # Merge NUTS3 and non-NUTS3 regions + regions.loc[gdp_pop.index, ["gdp", "pop"]] = gdp_pop[["gdp", "pop"]] - df["country"] = ( - df.index.to_series().str[:2].replace(dict(UK="GB", EL="GR", KV="XK")) + # Resort columns and rename index + regions = regions[ + ["name", "level1", "level2", "level3", "gdp", "pop", "country", "geometry"] + ] + regions.index.name = "index" + + # Simplify geometries + regions = simplify_europe(regions) + + # Only include countries in the config + regions = regions.query("country in @country_list") + + return regions + + +def calc_gdp_pop(country, regions, gdp_non_nuts3, pop_non_nuts3): + """ + Calculate the GDP p.c. and population values for non NUTS3 regions. + + Parameters + ---------- + - country (str): The two-letter country code of the non-NUTS3 region. + - regions (GeoDataFrame): A GeoDataFrame containing the regions. + - gdp_non_nuts3 (str): The file path to the dataset containing the GDP p.c values + for non NUTS3 countries (e.g. MD, UA) + - pop_non_nuts3 (str): The file path to the dataset containing the POP values + for non NUTS3 countries (e.g. MD, UA) + + Returns + ------- + tuple: A tuple containing two GeoDataFrames: + - gdp: A GeoDataFrame with the mean GDP p.c. values mapped to each bus. + - pop: A GeoDataFrame with the summed POP values mapped to each bus. + """ + region = regions.loc[regions.country == country, ["geometry"]] + # Create a bounding box for UA, MD from region shape, including a buffer of 10000 metres + bounding_box = ( + gpd.GeoDataFrame(geometry=[box(*region.total_bounds)], crs=region.crs) + .to_crs(epsg=3857) + .buffer(10000) + .to_crs(region.crs) ) - excludenuts = pd.Index( - ( - "FRA10", - "FRA20", - "FRA30", - "FRA40", - "FRA50", - "PT200", - "PT300", - "ES707", - "ES703", - "ES704", - "ES705", - "ES706", - "ES708", - "ES709", - "FI2", - "FR9", + # GDP Mapping + logger.info(f"Mapping mean GDP p.c. to non-NUTS3 region: {country}") + with xr.open_dataset(gdp_non_nuts3) as src_gdp: + src_gdp = src_gdp.where( + (src_gdp.longitude >= bounding_box.bounds.minx.min()) + & (src_gdp.longitude <= bounding_box.bounds.maxx.max()) + & (src_gdp.latitude >= bounding_box.bounds.miny.min()) + & (src_gdp.latitude <= bounding_box.bounds.maxy.max()), + drop=True, ) + gdp = src_gdp.to_dataframe().reset_index() + gdp = gdp.rename(columns={"GDP_per_capita_PPP": "gdp"}) + gdp = gdp[gdp.time == gdp.time.max()] + gdp_raster = gpd.GeoDataFrame( + gdp, + geometry=gpd.points_from_xy(gdp.longitude, gdp.latitude), + crs="EPSG:4326", ) - excludecountry = pd.Index(("MT", "TR", "LI", "IS", "CY")) - - df = df.loc[df.index.difference(excludenuts)] - df = df.loc[~df.country.isin(excludecountry)] - - manual = gpd.GeoDataFrame( - [ - ["BA1", "BA", 3234.0], - ["RS1", "RS", 6664.0], - ["AL1", "AL", 2778.0], - ["XK1", "XK", 1587.0], - ], - columns=["NUTS_ID", "country", "pop"], - geometry=gpd.GeoSeries(), - crs=df.crs, + gdp_mapped = gpd.sjoin(gdp_raster, region, predicate="within") + gdp = gdp_mapped.groupby(["id"]).agg({"gdp": "mean"}) + + # Population Mapping + logger.info(f"Mapping summed population to non-NUTS3 region: {country}") + with rasterio.open(pop_non_nuts3) as src_pop: + # Mask the raster with the bounding box + out_image, out_transform = mask(src_pop, bounding_box, crop=True) + out_meta = src_pop.meta.copy() + out_meta.update( + { + "driver": "GTiff", + "height": out_image.shape[1], + "width": out_image.shape[2], + "transform": out_transform, + } + ) + masked_data = out_image[0] # Use the first band (rest is empty) + row_indices, col_indices = np.where(masked_data != src_pop.nodata) + values = masked_data[row_indices, col_indices] + + # Affine transformation from pixel coordinates to geo coordinates + x_coords, y_coords = rasterio.transform.xy(out_transform, row_indices, col_indices) + pop_raster = pd.DataFrame({"x": x_coords, "y": y_coords, "pop": values}) + pop_raster = gpd.GeoDataFrame( + pop_raster, + geometry=gpd.points_from_xy(pop_raster.x, pop_raster.y), + crs=src_pop.crs, ) - manual["geometry"] = manual["country"].map(country_shapes.to_crs(df.crs)) - manual = manual.dropna() - manual = manual.set_index("NUTS_ID") + pop_mapped = gpd.sjoin(pop_raster, region, predicate="within") + pop = pop_mapped.groupby(["id"]).agg({"pop": "sum"}).reset_index().set_index("id") + gdp_pop = region.join(gdp).join(pop).drop(columns="geometry") + gdp_pop.fillna(0, inplace=True) - df = pd.concat([df, manual], sort=False) + # Clean and rescale data to 2019 historical values + gdp_pop["gdp"] = gdp_pop["gdp"].round(0) + gdp_pop["pop"] = gdp_pop["pop"].div(1e3).round(0) + + gdp_pop["pop"] = ( + gdp_pop["pop"].div(gdp_pop["pop"].sum()).mul(OTHER_POP_2019[country]).round(0) + ) - df.loc["ME000", "pop"] = 617.0 + gdp_pop["gdp"] = ( + gdp_pop["gdp"] + .mul(1e9) + .div(gdp_pop["gdp"].sum()) + .mul(OTHER_GDP_TOTAL_2019[country]) + .div(EXCHANGE_EUR_USD_2019) + / (1e3 * gdp_pop["pop"]) + ).round(0) - return df + return gdp_pop if __name__ == "__main__": @@ -246,25 +496,40 @@ def nuts3(country_shapes, nuts3, nuts3pop, nuts3gdp, ch_cantons, ch_popgdp): configure_logging(snakemake) set_scenario_config(snakemake) - country_shapes = countries(snakemake.input.naturalearth, snakemake.params.countries) - - country_shapes.reset_index().to_file(snakemake.output.country_shapes) - + # Offshore regions offshore_shapes = eez(snakemake.input.eez, snakemake.params.countries) offshore_shapes.reset_index().to_file(snakemake.output.offshore_shapes) + # Onshore regions + regions = create_regions( + snakemake.params.countries, + snakemake.input.nuts3_2021, + snakemake.input.ba_adm1, + snakemake.input.md_adm1, + snakemake.input.ua_adm1, + snakemake.input.xk_adm1, + offshore_shapes, + snakemake.input.nuts3_gdp, + snakemake.input.nuts3_pop, + snakemake.input.other_gdp, + snakemake.input.other_pop, + ) + + country_shapes = regions.groupby("country")["geometry"].apply( + lambda x: x.union_all() + ) + country_shapes.crs = regions.crs + country_shapes.index.name = "name" + country_shapes.reset_index().to_file(snakemake.output.country_shapes) + europe_shape = gpd.GeoDataFrame( geometry=[country_cover(country_shapes, offshore_shapes.geometry)], crs=country_shapes.crs, ) europe_shape.reset_index().to_file(snakemake.output.europe_shape) - nuts3_shapes = nuts3( - country_shapes, - snakemake.input.nuts3, - snakemake.input.nuts3pop, - snakemake.input.nuts3gdp, - snakemake.input.ch_cantons, - snakemake.input.ch_popgdp, + # Export regions including GDP and POP data + logger.info( + f"Exporting NUTS3 and ADM1 shapes with GDP and POP values to {snakemake.output.nuts3_shapes}." ) - nuts3_shapes.reset_index().to_file(snakemake.output.nuts3_shapes) + regions.reset_index().to_file(snakemake.output.nuts3_shapes) diff --git a/scripts/cluster_network.py b/scripts/cluster_network.py index 1ea72d66c..3288dd0d2 100644 --- a/scripts/cluster_network.py +++ b/scripts/cluster_network.py @@ -102,6 +102,7 @@ import numpy as np import pandas as pd import pypsa +import tqdm import xarray as xr from _helpers import configure_logging, set_scenario_config from packaging.version import Version, parse @@ -340,7 +341,10 @@ def cluster_regions( if "snakemake" not in globals(): from _helpers import mock_snakemake - snakemake = mock_snakemake("cluster_network", clusters=60) + snakemake = mock_snakemake( + "cluster_network", + clusters=60, + ) configure_logging(snakemake) set_scenario_config(snakemake) @@ -378,6 +382,83 @@ def cluster_regions( custom_busmap.index = custom_busmap.index.astype(str) logger.info(f"Imported custom busmap from {snakemake.input.custom_busmap}") busmap = custom_busmap + + mode = params.mode + if mode == "admin": + level_map = { + 0: "country", + 1: "level1", + 2: "level2", + 3: "level3", + } + admin_level = params.admin["level"] + logger.info(f"Clustering at administrative level {admin_level}.") + # check if BA, MD, UA, or XK are in the network + adm1_countries = ["BA", "MD", "UA", "XK"] + buses = n.buses[["x", "y", "country"]].copy() + # Find the intersection of adm1_countries and n.buses.country + adm1_countries = list( + set(adm1_countries).intersection(buses["country"].unique()) + ) + if adm1_countries: + logger.info( + f"Note that the following countries can only be clustered at a maximum administration level of 1: {adm1_countries}." + ) + + nuts3_regions = gpd.read_file(snakemake.input.nuts3_shapes).set_index( + "index" + ) + nuts3_regions["column"] = level_map[admin_level] + + country_level = {k: v for k, v in params.admin.items() if k != "level"} + if country_level: + country_level_list = "\n".join( + [f"- {k}: level {v}" for k, v in country_level.items()] + ) + logger.info( + f"Setting individual administrative levels for:\n{country_level_list}" + ) + nuts3_regions.loc[ + nuts3_regions["country"].isin(country_level.keys()), "column" + ] = ( + nuts3_regions.loc[ + nuts3_regions["country"].isin(country_level.keys()), "country" + ] + .map(country_level) + .map(level_map) + ) + nuts3_regions["busmap"] = nuts3_regions.apply( + lambda row: row[row["column"]], axis=1 + ) + + # Group by busmap + admin_shapes = nuts3_regions[["busmap", "geometry"]] + admin_shapes = admin_shapes.groupby("busmap")["geometry"].apply( + lambda x: x.union_all() + ) + admin_shapes = gpd.GeoDataFrame( + admin_shapes, geometry="geometry", crs=nuts3_regions.crs + ) + admin_shapes["country"] = admin_shapes.index.str[:2] + + buses["geometry"] = gpd.points_from_xy(buses["x"], buses["y"]) + buses = gpd.GeoDataFrame(buses, geometry="geometry", crs="EPSG:4326") + buses["busmap"] = "" + + # Map based for each country + for country in tqdm.tqdm(buses["country"].unique()): + buses_subset = buses.loc[buses["country"] == country] + + buses.loc[buses_subset.index, "busmap"] = gpd.sjoin_nearest( + buses_subset.to_crs(epsg=3857), + admin_shapes.loc[admin_shapes["country"] == country].to_crs( + epsg=3857 + ), + how="left", + )["busmap_right"] + + busmap = buses["busmap"] + else: algorithm = params.cluster_network["algorithm"] features = None diff --git a/scripts/retrieve_osm_boundaries.py b/scripts/retrieve_osm_boundaries.py new file mode 100644 index 000000000..6070a57d6 --- /dev/null +++ b/scripts/retrieve_osm_boundaries.py @@ -0,0 +1,124 @@ +# SPDX-FileCopyrightText: : 2020-2024 The PyPSA-Eur Authors +# +# SPDX-License-Identifier: MIT +""" +Retrieve administrative boundaries for the specified country using the overpass API and save it +to the specified output files. + +Note that overpass requests are based on a fair +use policy. `retrieve_osm_data` is meant to be used in a way that respects this +policy by fetching the needed data once, only. +""" + +import json +import logging +import time + +import requests +from _helpers import ( # set_scenario_config,; update_config_from_wildcards,; update_config_from_wildcards, + configure_logging, + set_scenario_config, +) + +logger = logging.getLogger(__name__) + + +ADM1_SPECIALS = { + "XK": 5, +} + + +def retrieve_osm_boundaries( + country, + adm1_specials, + output, +): + """ + Retrieve OSM administrative boundaries for the specified country and save it to the specified + output files. + + Parameters + ---------- + country : str + The country code for which the OSM data should be retrieved. + """ + # Overpass API endpoint URL + overpass_url = "https://overpass-api.de/api/interpreter" + + wait_time = 5 + + osm_adm_level = "4" + if country in adm1_specials: + osm_adm_level = adm1_specials[country] # special case e.g. for Kosovo + + retries = 3 + for attempt in range(retries): + logger.info( + f" - Fetching OSM administrative boundaries for {country} (Attempt {attempt + 1})..." + ) + + # Build the overpass query + op_area = f'area["ISO3166-1"="{country}"]' + op_query = f""" + [out:json]; + {op_area}->.searchArea; + ( + relation["boundary"="administrative"]["admin_level"={osm_adm_level}]["name"](area.searchArea); + ); + out body geom; + """ + try: + # Send the request + response = requests.post(overpass_url, data=op_query) + response.raise_for_status() # Raise HTTPError for bad responses + + filepath = output[0] + + with open(filepath, mode="w") as f: + json.dump(response.json(), f, indent=2) + logger.info(" - Done.") + break # Exit the retry loop on success + except (json.JSONDecodeError, requests.exceptions.RequestException) as e: + logger.error( + f"Error for retrieving administrative boundaries in country {country}: {e}" + ) + logger.debug( + f"Response text: {response.text if response else 'No response'}" + ) + if attempt < retries - 1: + wait_time += 15 + logger.info(f"Waiting {wait_time} seconds before retrying...") + time.sleep(wait_time) + else: + logger.error( + f"Failed to retrieve administrative boundaries in country {country} after {retries} attempts." + ) + except Exception as e: + # For now, catch any other exceptions and log them. Treat this + # the same as a RequestException and try to run again two times. + logger.error( + f"Unexpected error in retrieving administrative boundaries in country {country}: {e}" + ) + if attempt < retries - 1: + wait_time += 10 + logger.info(f"Waiting {wait_time} seconds before retrying...") + time.sleep(wait_time) + else: + logger.error( + f"Failed to retrieve administrative boundaries for country {country} after {retries} attempts." + ) + + +if __name__ == "__main__": + if "snakemake" not in globals(): + from _helpers import mock_snakemake + + snakemake = mock_snakemake("retrieve_osm_boundaries", country="XK") + configure_logging(snakemake) + set_scenario_config(snakemake) + + # Retrieve the OSM data + country = snakemake.wildcards.country + output = snakemake.output + + retrieve_osm_boundaries(country, ADM1_SPECIALS, output) diff --git a/test/test_build_shapes.py b/test/test_build_shapes.py index 48aeac7f5..8660d0011 100644 --- a/test/test_build_shapes.py +++ b/test/test_build_shapes.py @@ -15,7 +15,7 @@ sys.path.append("./scripts") -from build_shapes import _simplify_polys, countries, country_cover, eez +from build_shapes import _simplify_polys, eez path_cwd = pathlib.Path.cwd() @@ -59,20 +59,6 @@ def test_simplify_polys(tolerance, expected_tuple, italy_shape): assert all([x == y for x, y in zip(output_tuple, expected_tuple)]) -@pytest.mark.parametrize( - "country_list", - [["MK"], ["IT"]], -) -def test_countries(config, download_natural_earth, country_list): - """ - Verify what is returned by countries. - """ - natural_earth = download_natural_earth - country_shapes_df = countries(natural_earth, country_list) - assert country_shapes_df.shape == (1,) - assert country_shapes_df.index.unique().tolist() == country_list - - @pytest.mark.parametrize( "country_list", [["DE"], ["IT"]], @@ -85,46 +71,3 @@ def test_eez(config, country_list, download_eez): offshore_shapes_gdf = eez(eez_path, country_list) assert offshore_shapes_gdf.shape == (1, 1) assert offshore_shapes_gdf.index == country_list[0] - - -@pytest.mark.parametrize( - "country_list,expected_tuple", - [ - ( - ["IT"], - ( - 89.49, - 61.67, - ), - ), - ( - ["DE"], - ( - 53.72, - 56.46, - ), - ), - ], -) -def test_country_cover( - country_list, download_natural_earth, download_eez, expected_tuple -): - """ - Verify what is returned by country_cover. - """ - natural_earth = download_natural_earth - eez_path = download_eez - country_shapes_gdf = countries(natural_earth, country_list).reset_index() - offshore_shapes_gdf = eez(eez_path, country_list).reset_index() - europe_shape_gdf = gpd.GeoDataFrame( - geometry=[country_cover(country_shapes_gdf, offshore_shapes_gdf.geometry)], - crs=6933, - ) - europe_shape_gdf["area"] = europe_shape_gdf.area - europe_shape_gdf["perimeter"] = europe_shape_gdf.length - output_tuple = ( - np.round(europe_shape_gdf["area"][0], 2), - np.round(europe_shape_gdf["perimeter"][0], 2), - ) - assert len(output_tuple) == len(expected_tuple) - assert all([x == y for x, y in zip(output_tuple, expected_tuple)])