diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml
index 530df1a63..6cf30d59e 100644
--- a/settings/icu_tokenizer.yaml
+++ b/settings/icu_tokenizer.yaml
@@ -1,4 +1,5 @@
 query-preprocessing:
+    - step: split_japanese_phrases
     - step: normalize
 normalization:
     - ":: lower ()"
@@ -9,16 +10,17 @@ normalization:
     - "'nº' > 'no'"
     - "ª > a"
     - "º > o"
-    - "[[:Punctuation:][:Symbol:]\u02bc] > ' '"
+    - "[[:Punctuation:][:Symbol:][\u02bc] - [-:]]+ > '-'"
     - "ß > 'ss'" # German szet is unambiguously equal to double ss
-    - "[^[:alnum:] [:Canonical_Combining_Class=Virama:] [:Space:]] >"
+    - "[^[:alnum:] [:Canonical_Combining_Class=Virama:] [:Space:] [-:]] >"
     - "[:Lm:] >"
     - ":: [[:Number:]] Latin ()"
     - ":: [[:Number:]] Ascii ();"
     - ":: [[:Number:]] NFD ();"
     - "[[:Nonspacing Mark:] [:Cf:]] >;"
-    - "[:Space:]+ > ' '"
+    - "[-:]?[:Space:]+[-:]? > ' '"
 transliteration:
+    - "[-:] > ' '"
     - ":: Latin ()"
     - !include icu-rules/extended-unicode-to-asccii.yaml
     - ":: Ascii ()"
diff --git a/src/nominatim_api/query_preprocessing/split_japanese_phrases.py b/src/nominatim_api/query_preprocessing/split_japanese_phrases.py
new file mode 100644
index 000000000..7ab55b5f5
--- /dev/null
+++ b/src/nominatim_api/query_preprocessing/split_japanese_phrases.py
@@ -0,0 +1,61 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2025 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Query preprocessor that divides Japanese addresses into up to three parts:
+prefecture, municipality, and the remaining words.
+The division is not strict; it simply keys on the suffix characters below.
+"""
+from typing import List
+import re
+
+from .config import QueryConfig
+from .base import QueryProcessingFunc
+from ..search.query import Phrase
+
+MATCH_PATTERNS = [
+    r'''
+                (...??[都都道府県縣]) # [group1] prefecture
+                (.+?[市区區町村]) # [group2] municipalities (city/wards/towns/villages)
+                (.+) # [group3] other words
+                ''',
+    r'''
+                (...??[都都道府県縣]) # [group1] prefecture
+                (.+) # [group3] other words
+                ''',
+    r'''
+                (.+?[市区區町村]) # [group2] municipalities (city/wards/towns/villages)
+                (.+) # [group3] other words
+                '''
+]
+
+
+class _JapanesePreprocessing:
+
+    def __init__(self, config: QueryConfig) -> None:
+        self.config = config
+
+    def split_phrase(self, phrase: Phrase) -> Phrase:
+        """
+        Join the parts of the first matching pattern with ':', else return the phrase as is.
+        """
+        for pattern in MATCH_PATTERNS:
+            result = re.match(pattern, phrase.text, re.VERBOSE)
+            if result is not None:
+                return Phrase(phrase.ptype, ':'.join(result.groups()))
+
+        return phrase
+
+    def __call__(self, phrases: List[Phrase]) -> List[Phrase]:
+        """Apply the pattern-based split to every phrase of the query.
+        """
+        return [self.split_phrase(p) for p in phrases]
+
+
+def create(config: QueryConfig) -> QueryProcessingFunc:
+    """ Create the preprocessing function that splits Japanese phrases.
+    """
+    return _JapanesePreprocessing(config)
diff --git a/src/nominatim_api/search/db_search_builder.py b/src/nominatim_api/search/db_search_builder.py
index 632270ef0..a6335c137 100644
--- a/src/nominatim_api/search/db_search_builder.py
+++ b/src/nominatim_api/search/db_search_builder.py
@@ -433,6 +433,7 @@ def get_near_items(self, assignment: TokenAssignment) -> Optional[dbf.WeightedCa
     BreakType.START: 0.0,
     BreakType.END: 0.0,
     BreakType.PHRASE: 0.0,
+    BreakType.SOFT_PHRASE: 0.0,
     BreakType.WORD: 0.1,
     BreakType.PART: 0.2,
     BreakType.TOKEN: 0.4
diff --git a/src/nominatim_api/search/geocoder.py b/src/nominatim_api/search/geocoder.py
index efe5b7216..69455d77a 100644
--- a/src/nominatim_api/search/geocoder.py
+++ b/src/nominatim_api/search/geocoder.py
@@ -133,7 +133,7 @@ def rerank_by_query(self, query: QueryStruct, results: SearchResults) -> None:
         """
         assert self.query_analyzer is not None
         qwords = [word for phrase in query.source
-                  for word in re.split('[, ]+', phrase.text) if word]
+                  for word in re.split('[-,: ]+', phrase.text) if word]
         if not qwords:
             return
 
@@ -146,7 +146,7 @@ def rerank_by_query(self, query: QueryStruct, results: SearchResults) -> None:
             distance = 0.0
             norm = self.query_analyzer.normalize_text(' '.join((result.display_name,
                                                                 result.country_code or '')))
-            words = set((w for w in norm.split(' ') if w))
+            words = set((w for w in re.split('[-,: ]+', norm) if w))
             if not words:
                 continue
             for qword in qwords:
diff --git a/src/nominatim_api/search/icu_tokenizer.py b/src/nominatim_api/search/icu_tokenizer.py
index 5976fbec0..6f1dcf790 100644
--- a/src/nominatim_api/search/icu_tokenizer.py
+++ b/src/nominatim_api/search/icu_tokenizer.py
@@ -7,10 +7,12 @@
 """
 Implementation of query analysis for the ICU tokenizer.
 """
-from typing import Tuple, Dict, List, Optional, NamedTuple, Iterator, Any, cast
+from typing import Tuple, Dict, List, Optional, Iterator, Any, cast
 from collections import defaultdict
 import dataclasses
 import difflib
+import re
+from itertools import zip_longest
 
 from icu import Transliterator
 
@@ -34,17 +36,30 @@
     'C': qmod.TokenType.COUNTRY
 }
 
+PENALTY_IN_TOKEN_BREAK = {
+    qmod.BreakType.START: 0.5,
+    qmod.BreakType.END: 0.5,
+    qmod.BreakType.PHRASE: 0.5,
+    qmod.BreakType.SOFT_PHRASE: 0.5,
+    qmod.BreakType.WORD: 0.1,
+    qmod.BreakType.PART: 0.0,
+    qmod.BreakType.TOKEN: 0.0
+}
+
 
-class QueryPart(NamedTuple):
+@dataclasses.dataclass
+class QueryPart:
     """ Normalized and transliterated form of a single term in the query.
         When the term came out of a split during the transliteration,
         the normalized string is the full word before transliteration.
         The word number keeps track of the word before transliteration
         and can be used to identify partial transliterated terms.
+        Penalty is the break penalty for the break following the token.
     """
     token: str
     normalized: str
     word_number: int
+    penalty: float
 
 
 QueryParts = List[QueryPart]
@@ -58,10 +73,12 @@ def yield_words(terms: List[QueryPart], start: int) -> Iterator[Tuple[str, qmod.
     total = len(terms)
     for first in range(start, total):
         word = terms[first].token
-        yield word, qmod.TokenRange(first, first + 1)
+        penalty = PENALTY_IN_TOKEN_BREAK[qmod.BreakType.WORD]
+        yield word, qmod.TokenRange(first, first + 1, penalty=penalty)
         for last in range(first + 1, min(first + 20, total)):
             word = ' '.join((word, terms[last].token))
-            yield word, qmod.TokenRange(first, last + 1)
+            penalty += terms[last - 1].penalty
+            yield word, qmod.TokenRange(first, last + 1, penalty=penalty)
 
 
 @dataclasses.dataclass
@@ -94,25 +111,25 @@ def rematch(self, norm: str) -> None:
         self.penalty += (distance/len(self.lookup_word))
 
     @staticmethod
-    def from_db_row(row: SaRow) -> 'ICUToken':
+    def from_db_row(row: SaRow, base_penalty: float = 0.0) -> 'ICUToken':
         """ Create a ICUToken from the row of the word table.
         """
         count = 1 if row.info is None else row.info.get('count', 1)
         addr_count = 1 if row.info is None else row.info.get('addr_count', 1)
 
-        penalty = 0.0
+        penalty = base_penalty
         if row.type == 'w':
-            penalty = 0.3
+            penalty += 0.3
         elif row.type == 'W':
             if len(row.word_token) == 1 and row.word_token == row.word:
-                penalty = 0.2 if row.word.isdigit() else 0.3
+                penalty += 0.2 if row.word.isdigit() else 0.3
         elif row.type == 'H':
-            penalty = sum(0.1 for c in row.word_token if c != ' ' and not c.isdigit())
+            penalty += sum(0.1 for c in row.word_token if c != ' ' and not c.isdigit())
             if all(not c.isdigit() for c in row.word_token):
                 penalty += 0.2 * (len(row.word_token) - 1)
         elif row.type == 'C':
             if len(row.word_token) == 1:
-                penalty = 0.3
+                penalty += 0.3
 
         if row.info is None:
             lookup_word = row.word
@@ -202,7 +219,7 @@ async def analyze_query(self, phrases: List[qmod.Phrase]) -> qmod.QueryStruct:
 
         for row in await self.lookup_in_db(list(words.keys())):
             for trange in words[row.word_token]:
-                token = ICUToken.from_db_row(row)
+                token = ICUToken.from_db_row(row, trange.penalty or 0.0)
                 if row.type == 'S':
                     if row.info['op'] in ('in', 'near'):
                         if trange.start == 0:
@@ -242,16 +259,24 @@ def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]:
         wordnr = 0
         for phrase in query.source:
             query.nodes[-1].ptype = phrase.ptype
-            for word in phrase.text.split(' '):
+            phrase_split = re.split('([ :-])', phrase.text)
+            # The zip construct will give us the pairs of word/break from
+            # the regular expression split. As the split array ends on the
+            # final word, we simply use the fillvalue to even out the list and
+            # add the phrase break at the end.
+            for word, breakchar in zip_longest(*[iter(phrase_split)]*2, fillvalue=','):
+                if not word:
+                    continue
                 trans = self.transliterator.transliterate(word)
                 if trans:
                     for term in trans.split(' '):
                         if term:
-                            parts.append(QueryPart(term, word, wordnr))
+                            parts.append(QueryPart(term, word, wordnr,
+                                                   PENALTY_IN_TOKEN_BREAK[qmod.BreakType.TOKEN]))
                             query.add_node(qmod.BreakType.TOKEN, phrase.ptype)
-                    query.nodes[-1].btype = qmod.BreakType.WORD
+                    query.nodes[-1].btype = qmod.BreakType(breakchar)
+                    parts[-1].penalty = PENALTY_IN_TOKEN_BREAK[qmod.BreakType(breakchar)]
                 wordnr += 1
-            query.nodes[-1].btype = qmod.BreakType.PHRASE
 
             for word, wrange in yield_words(parts, phrase_start):
                 words[word].append(wrange)
@@ -272,7 +297,7 @@ def add_extra_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None:
         """ Add tokens to query that are not saved in the database.
         """
         for part, node, i in zip(parts, query.nodes, range(1000)):
-            if len(part.token) <= 4 and part[0].isdigit()\
+            if len(part.token) <= 4 and part.token.isdigit()\
                and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
                 query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
                                 ICUToken(penalty=0.5, token=0,
diff --git a/src/nominatim_api/search/query.py b/src/nominatim_api/search/query.py
index 02ebbb5b9..aa1694313 100644
--- a/src/nominatim_api/search/query.py
+++ b/src/nominatim_api/search/query.py
@@ -21,7 +21,13 @@ class BreakType(enum.Enum):
     END = '>'
     """ End of the query. """
     PHRASE = ','
-    """ Break between two phrases. """
+    """ Hard break between two phrases. Address parts cannot cross hard
+        phrase boundaries."""
+    SOFT_PHRASE = ':'
+    """ Likely break between two phrases. Address parts should not cross soft
+        phrase boundaries. Soft breaks can be inserted by a preprocessor
+        that is analysing the input string.
+    """
     WORD = ' '
     """ Break between words. """
     PART = '-'
@@ -116,6 +122,7 @@ class TokenRange:
     """
     start: int
     end: int
+    penalty: Optional[float] = None
 
     def __lt__(self, other: 'TokenRange') -> bool:
         return self.end <= other.start
diff --git a/src/nominatim_api/search/token_assignment.py b/src/nominatim_api/search/token_assignment.py
index a2e1804c7..0983fd13b 100644
--- a/src/nominatim_api/search/token_assignment.py
+++ b/src/nominatim_api/search/token_assignment.py
@@ -27,6 +27,7 @@ class TypedRange:
     qmod.BreakType.START: 0.0,
     qmod.BreakType.END: 0.0,
     qmod.BreakType.PHRASE: 0.0,
+    qmod.BreakType.SOFT_PHRASE: 0.0,
     qmod.BreakType.WORD: 0.1,
     qmod.BreakType.PART: 0.2,
     qmod.BreakType.TOKEN: 0.4
diff --git a/src/nominatim_db/tokenizer/icu_token_analysis.py b/src/nominatim_db/tokenizer/icu_token_analysis.py
index a3cdcb7af..c1ba106c4 100644
--- a/src/nominatim_db/tokenizer/icu_token_analysis.py
+++ b/src/nominatim_db/tokenizer/icu_token_analysis.py
@@ -25,6 +25,8 @@ class ICUTokenAnalysis:
 
     def __init__(self, norm_rules: str, trans_rules: str,
                  analysis_rules: Mapping[Optional[str], 'TokenAnalyzerRule']):
+        # additional break signs are not relevant during name analysis
+        norm_rules += ";[[:Space:][-:]]+ > ' ';"
         self.normalizer = Transliterator.createFromRules("icu_normalization",
                                                          norm_rules)
         trans_rules += ";[:Space:]+ > ' '"
diff --git a/test/python/api/query_processing/test_split_japanese_phrases.py b/test/python/api/query_processing/test_split_japanese_phrases.py
new file mode 100644
index 000000000..6055f9db5
--- /dev/null
+++ b/test/python/api/query_processing/test_split_japanese_phrases.py
@@ -0,0 +1,34 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2025 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Tests for Japanese phrase splitting.
+"""
+from pathlib import Path
+
+import pytest
+
+from icu import Transliterator
+
+import nominatim_api.search.query as qmod
+from nominatim_api.query_preprocessing.config import QueryConfig
+from nominatim_api.query_preprocessing import split_japanese_phrases
+
+def run_preprocessor_on(query):
+    proc = split_japanese_phrases.create(QueryConfig().set_normalizer(None))
+
+    return proc(query)
+
+
+@pytest.mark.parametrize('inp,outp', [('大阪府大阪市大阪', '大阪府:大阪市:大阪'),
+                                      ('大阪府大阪', '大阪府:大阪'),
+                                      ('大阪市大阪', '大阪市:大阪')])
+def test_split_phrases(inp, outp):
+    query = [qmod.Phrase(qmod.PhraseType.NONE, inp)]
+
+    out = run_preprocessor_on(query)
+
+    assert out == [qmod.Phrase(qmod.PhraseType.NONE, outp)]