diff --git a/CHANGELOG.md b/CHANGELOG.md
index af58f23..3996ee2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,28 @@
 
 (no unreleased changes yet)
 
+## [v0.9.1][]
+
+**Release date: 2020-01-16**
+
+Partially revert some changes regarding the parsing of surface
+alignments as they caused issues, particularly when strings contain `~`
+characters.
+
+### Added
+
+* `ALIGNMENT` production in `penman.lexer.PATTERNS` (this reverts a
+  change in v0.9.0) ([#50][])
+
+### Changed
+
+* Disallow tildes in the `ROLE` and `SYMBOL` patterns (this reverts a
+  change in v0.9.0) ([#50][])
+* `ALIGNMENT` tokens are joined to their previous `ROLE`, `SYMBOL`, or
+  `STRING`, which means they can now be separated with a space (for
+  example, `(a / alpha ~1)`)
+
+
 ## [v0.9.0][]
 
 **Release date: 2020-01-07**
@@ -550,3 +572,4 @@ First release with very basic functionality.
 [#45]: https://github.com/goodmami/penman/issues/45
 [#47]: https://github.com/goodmami/penman/issues/47
 [#48]: https://github.com/goodmami/penman/issues/48
+[#50]: https://github.com/goodmami/penman/issues/50
diff --git a/docs/api/penman.lexer.rst b/docs/api/penman.lexer.rst
index 7fc0ce5..3b30e7c 100644
--- a/docs/api/penman.lexer.rst
+++ b/docs/api/penman.lexer.rst
@@ -12,7 +12,7 @@ penman.lexer
    A dictionary mapping token names to regular expressions. For
    instance::
 
-     'ROLE': r':[^\s()\/:]*'
+     'ROLE': r':[^\s()\/:~]*'
 
    The token names are used later by the :class:`TokenIterator` to
    help with parsing.
diff --git a/penman/__about__.py b/penman/__about__.py
index 7cb66a6..a8dec1b 100644
--- a/penman/__about__.py
+++ b/penman/__about__.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-__version__ = '0.9.0'
+__version__ = '0.9.1'
 __version_info__ = tuple(
     int(x) if x.isdigit() else x
     for x in __version__.replace('.', ' ').replace('-', ' ').split()
diff --git a/penman/codec.py b/penman/codec.py
index 44a78d9..90c7efb 100644
--- a/penman/codec.py
+++ b/penman/codec.py
@@ -137,6 +137,8 @@ def _parse_node(self, tokens: TokenIterator):
                 # for robustness, don't assume next token is the concept
                 if tokens.peek().type in ('SYMBOL', 'STRING'):
                     concept = tokens.next().text
+                    if tokens.peek().type == 'ALIGNMENT':
+                        concept += tokens.next().text
                 else:
                     concept = None
                     logger.warning('Missing concept: %s', slash.line)
@@ -156,13 +158,18 @@ def _parse_edge(self, tokens: TokenIterator):
 
             Edge := Role (Constant | Node)
         """
-        role = tokens.expect('ROLE')
-        target = None
+        role_token = tokens.expect('ROLE')
+        role = role_token.text
+        if tokens.peek().type == 'ALIGNMENT':
+            role += tokens.next().text
 
+        target = None
         _next = tokens.peek()
         next_type = _next.type
         if next_type in ('SYMBOL', 'STRING'):
             target = tokens.next().text
+            if tokens.peek().type == 'ALIGNMENT':
+                target += tokens.next().text
         elif next_type == 'LPAREN':
             target = self._parse_node(tokens)
         # for robustness in parsing, allow edges with no target:
@@ -171,9 +178,9 @@
         elif next_type not in ('ROLE', 'RPAREN'):
             raise tokens.error('Expected: SYMBOL, STRING, LPAREN',
                                token=_next)
         else:
-            logger.warning('Missing target: %s', role.line)
+            logger.warning('Missing target: %s', role_token.line)
 
-        return (role.text, target)
+        return (role, target)
 
     def parse_triples(self, s: str) -> List[BasicTriple]:
         """ Parse a triple conjunction from *s*."""
diff --git a/penman/layout.py b/penman/layout.py
index 89e5f5b..5281ab6 100644
--- a/penman/layout.py
+++ b/penman/layout.py
@@ -161,9 +161,18 @@ def _interpret_node(t: Node, variables: Set[Variable], model: Model):
         # atomic targets
         if is_atomic(target):
+            # remove any alignments
             if target and '~' in target:
-                target, _, alignment = target.partition('~')
-                epis.append(Alignment.from_string(alignment))
+                if target.startswith('"'):
+                    # need to handle alignments on strings differently
+                    # because strings may contain ~ inside the quotes
+                    pivot = target.rindex('"') + 1
+                    if pivot < len(target):
+                        epis.append(Alignment.from_string(target[pivot:]))
+                    target = target[:pivot]
+                else:
+                    target, _, alignment = target.partition('~')
+                    epis.append(Alignment.from_string(alignment))
             triple = (var, role, target)
             if model.is_role_inverted(role):
                 if target in variables:
diff --git a/penman/lexer.py b/penman/lexer.py
index 9ec0929..e562dc3 100644
--- a/penman/lexer.py
+++ b/penman/lexer.py
@@ -20,10 +20,11 @@
 PATTERNS = {
     'COMMENT': r'\#.*$',
     'STRING': r'"[^"\\]*(?:\\.[^"\\]*)*"',
+    'ALIGNMENT': r'~(?:[a-z]\.?)?[0-9]+(?:,[0-9]+)*',
     # ROLE cannot be made up of COLON + SYMBOL because it then becomes
     # difficult to detect anonymous roles: (a : b) vs (a :b c)
-    'ROLE': r':[^\s()\/:]*',
-    'SYMBOL': r'[^\s()\/:]+',
+    'ROLE': r':[^\s()\/:~]*',
+    'SYMBOL': r'[^\s()\/:~]+',
     'LPAREN': r'\(',
     'RPAREN': r'\)',
     'SLASH': r'\/',  # concept (node label) role
@@ -41,7 +42,7 @@ def _compile(*names: str) -> Pattern[str]:
 
 PENMAN_RE = _compile('COMMENT', 'STRING',
                      'LPAREN', 'RPAREN', 'SLASH',
-                     'ROLE', 'SYMBOL',
+                     'ROLE', 'SYMBOL', 'ALIGNMENT',
                      'UNEXPECTED')
 TRIPLE_RE = _compile('COMMENT', 'STRING',
diff --git a/tests/test_codec.py b/tests/test_codec.py
index af0da96..29da9d9 100644
--- a/tests/test_codec.py
+++ b/tests/test_codec.py
@@ -36,6 +36,11 @@ def test_parse(self):
             'a', [(':ARG-of', 'b')])
         assert codec.parse('(a :ARG~1 b~2)') == (
             'a', [(':ARG~1', 'b~2')])
+        # https://github.com/goodmami/penman/issues/50
+        assert codec.parse('(a :ARG "str~ing")') == (
+            'a', [(':ARG', '"str~ing"')])
+        assert codec.parse('(a :ARG "str~ing"~1)') == (
+            'a', [(':ARG', '"str~ing"~1')])
 
     def test_format(self):
         assert codec.format(
@@ -248,6 +253,40 @@ def test_decode_atoms(self):
             ('x20', ':instance', '876-9'),
         ]
 
+    def test_decode_alignments(self):
+        g = decode('(a / alpha~1)')
+        assert g.triples == [
+            ('a', ':instance', 'alpha'),
+        ]
+        assert surface.alignments(g) == {
+            ('a', ':instance', 'alpha'): surface.Alignment((1,)),
+        }
+        assert surface.role_alignments(g) == {}
+
+        assert decode('(a / alpha~1)') == decode('(a / alpha ~1)')
+
+        g = decode('(a :ARG~e.1,2 b)')
+        assert g.triples == [
+            ('a', ':instance', None),
+            ('a', ':ARG', 'b'),
+        ]
+        assert surface.alignments(g) == {}
+        assert surface.role_alignments(g) == {
+            ('a', ':ARG', 'b'): surface.RoleAlignment((1, 2), prefix='e.'),
+        }
+
+        # https://github.com/goodmami/penman/issues/50
+        g = decode('(a :ARG1 "str~ing" :ARG2 "str~ing"~1)')
+        assert g.triples == [
+            ('a', ':instance', None),
+            ('a', ':ARG1', '"str~ing"'),
+            ('a', ':ARG2', '"str~ing"'),
+        ]
+        assert surface.alignments(g) == {
+            ('a', ':ARG2', '"str~ing"'): surface.Alignment((1,)),
+        }
+        assert surface.role_alignments(g) == {}
+
     def test_decode_invalid_graphs(self):
         # some robustness
         g = decode('(g / )')
diff --git a/tests/test_lexer.py b/tests/test_lexer.py
index 22887ce..5e47cb1 100644
--- a/tests/test_lexer.py
+++ b/tests/test_lexer.py
@@ -20,12 +20,18 @@ def _lex(s):
         'RPAREN']
     assert _lex('(a :ROLE~e.1,2 b~3)') == [
         'LPAREN', 'SYMBOL',
-        'ROLE', 'SYMBOL',
+        'ROLE', 'ALIGNMENT', 'SYMBOL', 'ALIGNMENT',
         'RPAREN']
     assert _lex('# comment\n# (n / nope)\n(a / alpha)') == [
         'COMMENT', 'COMMENT',
         'LPAREN', 'SYMBOL', 'SLASH', 'SYMBOL', 'RPAREN']
 
 
+def test_lexing_issue_50():
+    # https://github.com/goodmami/penman/issues/50
+    assert [tok.type for tok in lexer.lex('(a :ROLE "a~b"~1)')] == [
+        'LPAREN', 'SYMBOL', 'ROLE', 'STRING', 'ALIGNMENT', 'RPAREN']
+
+
 def test_lex_triples():
     def _lex(s):
         return [tok.type for tok in lexer.lex(s, pattern=lexer.TRIPLE_RE)]
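
For reference, the fixed behavior can be checked against an installed copy of the
library. The snippet below is a minimal sketch that reuses the inputs from the
tests above; it assumes the `PENMANCodec` class in `penman.codec` plus the
`penman.lexer` and `penman.surface` modules as of v0.9.1.

    # Sketch only: mirrors the assertions in tests/test_lexer.py and
    # tests/test_codec.py above; assumes penman v0.9.1 is installed.
    from penman.codec import PENMANCodec
    from penman import lexer, surface

    # A tilde inside a quoted string stays part of the STRING token, while
    # a trailing ~1 after the closing quote is lexed as a separate ALIGNMENT.
    assert [tok.type for tok in lexer.lex('(a :ROLE "a~b"~1)')] == [
        'LPAREN', 'SYMBOL', 'ROLE', 'STRING', 'ALIGNMENT', 'RPAREN']

    codec = PENMANCodec()

    # During interpretation the alignment is stripped from the constant and
    # stored as surface epidata on the triple.
    g = codec.decode('(a :ARG1 "str~ing" :ARG2 "str~ing"~1)')
    assert g.triples == [
        ('a', ':instance', None),
        ('a', ':ARG1', '"str~ing"'),
        ('a', ':ARG2', '"str~ing"'),
    ]
    assert surface.alignments(g) == {
        ('a', ':ARG2', '"str~ing"'): surface.Alignment((1,)),
    }

    # Alignments may now also be separated from their token by a space.
    assert codec.decode('(a / alpha~1)') == codec.decode('(a / alpha ~1)')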