From f51dba8bd64054ad873d8f461e35ac530ae975bc Mon Sep 17 00:00:00 2001 From: Patrick Brosi Date: Tue, 17 Dec 2024 15:20:34 +0100 Subject: [PATCH 1/4] add option --no-untagged-nodes-geometric-relations to ignore untagged nodes during computation of the geometric relations --- include/osm2rdf/config/Config.h | 2 ++ include/osm2rdf/config/Constants.h | 8 +++++ src/config/Config.cpp | 47 ++++++++++++++++++------------ src/osm/OsmiumHandler.cpp | 3 +- 4 files changed, 40 insertions(+), 20 deletions(-) diff --git a/include/osm2rdf/config/Config.h b/include/osm2rdf/config/Config.h index 36dd358b..a1fe7cb6 100644 --- a/include/osm2rdf/config/Config.h +++ b/include/osm2rdf/config/Config.h @@ -80,6 +80,8 @@ struct Config { bool addUntaggedRelations = true; bool addUntaggedAreas = true; + bool addSpatialRelsForUntaggedNodes = true; + int numThreads = std::thread::hardware_concurrency(); // Default settings for data diff --git a/include/osm2rdf/config/Constants.h b/include/osm2rdf/config/Constants.h index 8d341a15..10fde473 100644 --- a/include/osm2rdf/config/Constants.h +++ b/include/osm2rdf/config/Constants.h @@ -175,6 +175,14 @@ const static inline std::string NO_WAY_GEOM_RELATIONS_OPTION_LONG = const static inline std::string NO_WAY_GEOM_RELATIONS_OPTION_HELP = "Do not dump way geometric relations"; +const static inline std::string NO_UNTAGGED_NODES_SPATIAL_RELS_INFO = + "Do not compute spatial relations involving untagged nodes"; +const static inline std::string NO_UNTAGGED_NODES_SPATIAL_RELS_OPTION_SHORT = ""; +const static inline std::string NO_UNTAGGED_NODES_SPATIAL_RELS_OPTION_LONG = + "no-untagged-nodes-geometric-relations"; +const static inline std::string NO_UNTAGGED_NODES_SPATIAL_RELS_OPTION_HELP = + "Do not compute spatial relations involving untagged nodes"; + const static inline std::string NO_UNTAGGED_NODES_INFO = "Do not output untagged nodes"; const static inline std::string NO_UNTAGGED_NODES_OPTION_SHORT = ""; diff --git a/src/config/Config.cpp 
b/src/config/Config.cpp index 4a9d036c..80310065 100644 --- a/src/config/Config.cpp +++ b/src/config/Config.cpp @@ -103,6 +103,11 @@ std::string osm2rdf::config::Config::getInfo(std::string_view prefix) const { oss << "\n" << prefix << osm2rdf::config::constants::NO_UNTAGGED_AREAS_INFO; } + if (!addSpatialRelsForUntaggedNodes) { + oss << "\n" + << prefix + << osm2rdf::config::constants::NO_UNTAGGED_NODES_SPATIAL_RELS_INFO; + } if (simplifyWKT > 0) { oss << "\n" << prefix << osm2rdf::config::constants::SIMPLIFY_WKT_INFO; oss << "\n" @@ -280,17 +285,21 @@ void osm2rdf::config::Config::fromArgs(int argc, char** argv) { osm2rdf::config::constants::ADD_AREA_WAY_LINESTRINGS_OPTION_LONG, osm2rdf::config::constants::ADD_AREA_WAY_LINESTRINGS_OPTION_HELP); - auto noUntaggedNodesOp = - parser.add( - osm2rdf::config::constants::NO_UNTAGGED_NODES_OPTION_SHORT, - osm2rdf::config::constants::NO_UNTAGGED_NODES_OPTION_LONG, - osm2rdf::config::constants::NO_UNTAGGED_NODES_OPTION_HELP); + auto noUntaggedNodesSpatialRelsOp = parser.add( + osm2rdf::config::constants::NO_UNTAGGED_NODES_SPATIAL_RELS_OPTION_SHORT, + osm2rdf::config::constants::NO_UNTAGGED_NODES_SPATIAL_RELS_OPTION_LONG, + osm2rdf::config::constants::NO_UNTAGGED_NODES_SPATIAL_RELS_OPTION_HELP); - auto noUntaggedWaysOp = - parser.add( - osm2rdf::config::constants::NO_UNTAGGED_WAYS_OPTION_SHORT, - osm2rdf::config::constants::NO_UNTAGGED_WAYS_OPTION_LONG, - osm2rdf::config::constants::NO_UNTAGGED_WAYS_OPTION_HELP); + auto noUntaggedNodesOp = parser.add( + osm2rdf::config::constants::NO_UNTAGGED_NODES_OPTION_SHORT, + osm2rdf::config::constants::NO_UNTAGGED_NODES_OPTION_LONG, + osm2rdf::config::constants::NO_UNTAGGED_NODES_OPTION_HELP); + + auto noUntaggedWaysOp = parser.add( + osm2rdf::config::constants::NO_UNTAGGED_WAYS_OPTION_SHORT, + osm2rdf::config::constants::NO_UNTAGGED_WAYS_OPTION_LONG, + osm2rdf::config::constants::NO_UNTAGGED_WAYS_OPTION_HELP); auto noUntaggedRelationsOp = parser.add( @@ -298,11 +307,10 @@ void 
osm2rdf::config::Config::fromArgs(int argc, char** argv) { osm2rdf::config::constants::NO_UNTAGGED_RELATIONS_OPTION_LONG, osm2rdf::config::constants::NO_UNTAGGED_RELATIONS_OPTION_HELP); - auto noUntaggedAreasOp = - parser.add( - osm2rdf::config::constants::NO_UNTAGGED_AREAS_OPTION_SHORT, - osm2rdf::config::constants::NO_UNTAGGED_AREAS_OPTION_LONG, - osm2rdf::config::constants::NO_UNTAGGED_AREAS_OPTION_HELP); + auto noUntaggedAreasOp = parser.add( + osm2rdf::config::constants::NO_UNTAGGED_AREAS_OPTION_SHORT, + osm2rdf::config::constants::NO_UNTAGGED_AREAS_OPTION_LONG, + osm2rdf::config::constants::NO_UNTAGGED_AREAS_OPTION_HELP); auto addWayMetadataOp = parser.add( osm2rdf::config::constants::ADD_WAY_METADATA_OPTION_SHORT, @@ -482,6 +490,8 @@ void osm2rdf::config::Config::fromArgs(int argc, char** argv) { wktDeviation = wktDeviationOp->value(); wktPrecision = wktPrecisionOp->value(); + addSpatialRelsForUntaggedNodes = !noUntaggedNodesSpatialRelsOp->is_set(); + addUntaggedNodes = !noUntaggedNodesOp->is_set(); addUntaggedWays = !noUntaggedWaysOp->is_set(); addUntaggedRelations = !noUntaggedRelationsOp->is_set(); @@ -514,10 +524,9 @@ void osm2rdf::config::Config::fromArgs(int argc, char** argv) { } else if (outputCompressOp->value() == "bz2") { outputCompress = BZ2; } else { - throw popl::invalid_option( - outputCompressOp.get(), - popl::invalid_option::Error::invalid_argument, - popl::OptionName::long_name, outputCompressOp->value(), ""); + throw popl::invalid_option( + outputCompressOp.get(), popl::invalid_option::Error::invalid_argument, + popl::OptionName::long_name, outputCompressOp->value(), ""); } outputKeepFiles = outputKeepFilesOp->is_set(); diff --git a/src/osm/OsmiumHandler.cpp b/src/osm/OsmiumHandler.cpp index 2aa83be4..441d2bc0 100644 --- a/src/osm/OsmiumHandler.cpp +++ b/src/osm/OsmiumHandler.cpp @@ -208,7 +208,8 @@ void osm2rdf::osm::OsmiumHandler::node(const osmium::Node& node) { _progressBar.update(_numTasksDone++); } } - if 
(!_config.noGeometricRelations && !_config.noNodeGeometricRelations) { + if (!_config.noGeometricRelations && !_config.noNodeGeometricRelations && + (!osmNode.tags().empty() || _config.addSpatialRelsForUntaggedNodes)) { _geometryHandler->node(osmNode); #pragma omp critical(progress) { From ba666ab72f7c3a405e5ffdc765729645e2447a9c Mon Sep 17 00:00:00 2001 From: Patrick Brosi Date: Tue, 17 Dec 2024 15:47:07 +0100 Subject: [PATCH 2/4] update libspatialjoin --- vendor/spatialjoin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/spatialjoin b/vendor/spatialjoin index d8f046c9..c229a5e3 160000 --- a/vendor/spatialjoin +++ b/vendor/spatialjoin @@ -1 +1 @@ -Subproject commit d8f046c92f61b9491093e35aa2f1336f6259c09e +Subproject commit c229a5e3fff5a9ab8f1daadc8b4582910ed5318e From 492069c1e2fa54670d66c8511c6eeeb752e8038f Mon Sep 17 00:00:00 2001 From: Patrick Brosi Date: Fri, 20 Dec 2024 14:19:29 +0100 Subject: [PATCH 3/4] for the spatialjoin, encode the object IDs and their type in a variable-length integer, store this integer in a string, and use this string as an ID for libspatialjoin. This makes use of the fact that our IDs are always pairs of <type> and <id>, and has the following benefits: (1) the ID size of a typical OSM id goes down from around 18 bytes to around 5 bytes, (2) the disk space required for the geometry cache (where these IDs are stored) is reduced, especially for nodes (where ONLY the id is stored on disk), and (3) the memory overhead of the relation tracking (via libspatialjoin's "collection of geometry refs" functionality) is greatly reduced.
Note: for further optimizations regarding the memory problem with the dense RAM cache, I am waiting for profiling results from massif --- include/osm2rdf/osm/GeometryHandler.h | 3 ++ src/osm/GeometryHandler.cpp | 73 ++++++++++++++++++++++----- 2 files changed, 64 insertions(+), 12 deletions(-) diff --git a/include/osm2rdf/osm/GeometryHandler.h b/include/osm2rdf/osm/GeometryHandler.h index 4761831d..17be9211 100644 --- a/include/osm2rdf/osm/GeometryHandler.h +++ b/include/osm2rdf/osm/GeometryHandler.h @@ -79,6 +79,9 @@ class GeometryHandler { const std::string& pred); void progressCb(size_t progr); + std::string getSweeperId(uint64_t oid, char type); + std::string getFullID(const std::string& id); + osm2rdf::util::ProgressBar _progressBar; }; diff --git a/src/osm/GeometryHandler.cpp b/src/osm/GeometryHandler.cpp index 5f2143b5..6314bc91 100644 --- a/src/osm/GeometryHandler.cpp +++ b/src/osm/GeometryHandler.cpp @@ -98,9 +98,7 @@ void GeometryHandler::relation(const Relation& rel) { if (!rel.hasGeometry()) return; - const std::string id = _writer->generateIRI( - osm2rdf::ttl::constants::RELATION_NAMESPACE[_config.sourceDataset], - rel.id()); + std::string id = getSweeperId(rel.id(), 5); size_t subId = 0; @@ -137,7 +135,7 @@ template void GeometryHandler::writeRelCb(size_t t, const std::string& a, const std::string& b, const std::string& pred) { - _writer->writeTriple(a, pred, b, t); + _writer->writeTriple(getFullID(a), pred, getFullID(b), t); } // ____________________________________________________________________________ @@ -197,9 +195,7 @@ ::util::geo::I32MultiPolygon GeometryHandler::transform( // ____________________________________________________________________________ template void GeometryHandler::area(const Area& area) { - const std::string id = _writer->generateIRI( - areaNS(area.fromWay() ? AreaFromType::WAY : AreaFromType::RELATION), - area.objId()); + std::string id = getSweeperId(area.objId(), area.fromWay() ? 
3 : 4); _sweeper.add(transform(area.geom()), id, false, _parseBatches[omp_get_thread_num()]); @@ -220,12 +216,66 @@ ::util::geo::I32Point GeometryHandler::transform( static_cast(point.getY() * PREC)}; } +// ____________________________________________________________________________ +template +std::string GeometryHandler::getFullID(const std::string& strid) { + uint64_t id = 0; + + for (size_t i = strid.size() - 1; i > 0; i--) { + id |= + static_cast(reinterpret_cast(strid[i])) + << (8 * (strid.size() - 1 - i)); + } + + if (strid[0] == 1) { + return _writer->generateIRI( + osm2rdf::ttl::constants::NODE_NAMESPACE[_config.sourceDataset], id); + } + + if (strid[0] == 2) { + return _writer->generateIRI( + osm2rdf::ttl::constants::WAY_NAMESPACE[_config.sourceDataset], id); + } + + if (strid[0] == 3) { + return _writer->generateIRI( + areaNS(AreaFromType::WAY), id); + } + + if (strid[0] == 4) { + return _writer->generateIRI( + areaNS(AreaFromType::RELATION), id); + } + + if (strid[0] == 5) { + return _writer->generateIRI( + osm2rdf::ttl::constants::RELATION_NAMESPACE[_config.sourceDataset], id); + } + + return strid; +} + +// ____________________________________________________________________________ +template +std::string GeometryHandler::getSweeperId(uint64_t oid, char type) { + unsigned char id[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int a = 0; + uint64_t tmp; + + while ((tmp = (oid & (0xFFLL << (a * 8))))) { + id[8 - a] = tmp >> (a * 8); + a++; + } + + id[8 - a] = type; + + return reinterpret_cast(id + (8 - a)); +} + // ____________________________________________________________________________ template void GeometryHandler::node(const Node& node) { - std::string id = _writer->generateIRI( - osm2rdf::ttl::constants::NODE_NAMESPACE[_config.sourceDataset], - node.id()); + std::string id = getSweeperId(node.id(), 1); _sweeper.add(transform(node.geom()), id, false, _parseBatches[omp_get_thread_num()]); @@ -241,8 +291,7 @@ template void GeometryHandler::way(const Way& 
way) { if (way.isArea()) return; // skip way relations, will be handled by area() - std::string id = _writer->generateIRI( - osm2rdf::ttl::constants::WAY_NAMESPACE[_config.sourceDataset], way.id()); + std::string id = getSweeperId(way.id(), 2); _sweeper.add(transform(way.geom()), id, false, _parseBatches[omp_get_thread_num()]); From 366155e1f95428eb4722b38e611564f8d2c8941a Mon Sep 17 00:00:00 2001 From: Patrick Brosi Date: Fri, 20 Dec 2024 14:52:46 +0100 Subject: [PATCH 4/4] also allow integers which contain 0 bytes --- src/osm/GeometryHandler.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/osm/GeometryHandler.cpp b/src/osm/GeometryHandler.cpp index 6314bc91..abc7175b 100644 --- a/src/osm/GeometryHandler.cpp +++ b/src/osm/GeometryHandler.cpp @@ -238,13 +238,11 @@ std::string GeometryHandler::getFullID(const std::string& strid) { } if (strid[0] == 3) { - return _writer->generateIRI( - areaNS(AreaFromType::WAY), id); + return _writer->generateIRI(areaNS(AreaFromType::WAY), id); } if (strid[0] == 4) { - return _writer->generateIRI( - areaNS(AreaFromType::RELATION), id); + return _writer->generateIRI(areaNS(AreaFromType::RELATION), id); } if (strid[0] == 5) { @@ -262,14 +260,16 @@ std::string GeometryHandler::getSweeperId(uint64_t oid, char type) { int a = 0; uint64_t tmp; - while ((tmp = (oid & (0xFFLL << (a * 8))))) { + while ((oid >> (a * 8))) { + tmp = (oid & (0xFFLL << (a * 8))); id[8 - a] = tmp >> (a * 8); a++; } id[8 - a] = type; - return reinterpret_cast(id + (8 - a)); + return std::string{reinterpret_cast(id + (8 - a)), + static_cast(a + 1)}; } // ____________________________________________________________________________