
Commit

Merge pull request #51 from goodmami/v0.9.1
V0.9.1
  • Loading branch information
2 parents 64effe0 + eba3884 commit eb4d7c4
Showing 8 changed files with 97 additions and 12 deletions.
23 changes: 23 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,28 @@
(no unreleased changes yet)


## [v0.9.1][]

**Release date: 2020-01-16**

Partially revert some changes regarding the parsing of surface
alignments, as they caused issues, particularly when strings contain
`~` characters.

### Added

* `ALIGNMENT` production in `penman.lexer.PATTERNS` (this reverts a
change in v0.9.0) ([#50][])

### Changed

* Disallow tildes in the `ROLE` and `SYMBOL` patterns (this reverts a
change in v0.9.0) ([#50][])
* `ALIGNMENT` tokens are joined to the preceding `ROLE`, `SYMBOL`, or
`STRING` token, which means an alignment may now be separated from it
by a space (for example, `(a / alpha ~1)`)


## [v0.9.0][]

**Release date: 2020-01-07**
@@ -550,3 +572,4 @@ First release with very basic functionality.
[#45]: https://github.com/goodmami/penman/issues/45
[#47]: https://github.com/goodmami/penman/issues/47
[#48]: https://github.com/goodmami/penman/issues/48
[#50]: https://github.com/goodmami/penman/issues/50
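
The behavior described in this changelog entry can be checked from the public API. A minimal sketch, assuming the released `penman` 0.9.1 package and mirroring the tests added in this commit:

```python
import penman
from penman import surface

# An alignment written after a space now joins onto the preceding token,
# so both spellings decode to the same graph and the same alignment.
g1 = penman.decode('(a / alpha~1)')
g2 = penman.decode('(a / alpha ~1)')
assert g1.triples == g2.triples == [('a', ':instance', 'alpha')]
assert surface.alignments(g1) == {
    ('a', ':instance', 'alpha'): surface.Alignment((1,)),
}
```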
2 changes: 1 addition & 1 deletion docs/api/penman.lexer.rst
@@ -12,7 +12,7 @@ penman.lexer
A dictionary mapping token names to regular expressions. For
instance::

'ROLE': r':[^\s()\/:]*'
'ROLE': r':[^\s()\/:~]*'

The token names are used later by the :class:`TokenIterator` to
help with parsing.
2 changes: 1 addition & 1 deletion penman/__about__.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-

__version__ = '0.9.0'
__version__ = '0.9.1'
__version_info__ = tuple(
int(x) if x.isdigit() else x
for x in __version__.replace('.', ' ').replace('-', ' ').split()
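For reference, the version-info expression in this file turns the bumped version string into a tuple; a quick standalone check of that computation:

```python
# same expression as in penman/__about__.py, evaluated for the new version
__version__ = '0.9.1'
__version_info__ = tuple(
    int(x) if x.isdigit() else x
    for x in __version__.replace('.', ' ').replace('-', ' ').split()
)
assert __version_info__ == (0, 9, 1)
```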
15 changes: 11 additions & 4 deletions penman/codec.py
@@ -137,6 +137,8 @@ def _parse_node(self, tokens: TokenIterator):
# for robustness, don't assume next token is the concept
if tokens.peek().type in ('SYMBOL', 'STRING'):
concept = tokens.next().text
if tokens.peek().type == 'ALIGNMENT':
concept += tokens.next().text
else:
concept = None
logger.warning('Missing concept: %s', slash.line)
@@ -156,13 +158,18 @@ def _parse_edge(self, tokens: TokenIterator):
Edge := Role (Constant | Node)
"""
role = tokens.expect('ROLE')
target = None
role_token = tokens.expect('ROLE')
role = role_token.text
if tokens.peek().type == 'ALIGNMENT':
role += tokens.next().text

target = None
_next = tokens.peek()
next_type = _next.type
if next_type in ('SYMBOL', 'STRING'):
target = tokens.next().text
if tokens.peek().type == 'ALIGNMENT':
target += tokens.next().text
elif next_type == 'LPAREN':
target = self._parse_node(tokens)
# for robustness in parsing, allow edges with no target:
@@ -171,9 +178,9 @@
elif next_type not in ('ROLE', 'RPAREN'):
raise tokens.error('Expected: SYMBOL, STRING, LPAREN', token=_next)
else:
logger.warning('Missing target: %s', role.line)
logger.warning('Missing target: %s', role_token.line)

return (role.text, target)
return (role, target)

def parse_triples(self, s: str) -> List[BasicTriple]:
""" Parse a triple conjunction from *s*."""
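At the tree level, the joined `ALIGNMENT` text stays attached to the role or target it annotates. A small usage sketch, assuming `penman` 0.9.1 and mirroring the parser tests added below:

```python
from penman.codec import PENMANCodec

codec = PENMANCodec()

# an alignment on a role stays with the role text
assert codec.parse('(a :ARG~1 b~2)') == ('a', [(':ARG~1', 'b~2')])
# an alignment after a quoted string is appended to the string target
assert codec.parse('(a :ARG "str~ing"~1)') == (
    'a', [(':ARG', '"str~ing"~1')])
```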
13 changes: 11 additions & 2 deletions penman/layout.py
@@ -161,9 +161,18 @@ def _interpret_node(t: Node, variables: Set[Variable], model: Model):

# atomic targets
if is_atomic(target):
# remove any alignments
if target and '~' in target:
target, _, alignment = target.partition('~')
epis.append(Alignment.from_string(alignment))
if target.startswith('"'):
# need to handle alignments on strings differently
# because strings may contain ~ inside the quotes
pivot = target.rindex('"') + 1
if pivot < len(target):
epis.append(Alignment.from_string(target[pivot:]))
target = target[:pivot]
else:
target, _, alignment = target.partition('~')
epis.append(Alignment.from_string(alignment))
triple = (var, role, target)
if model.is_role_inverted(role):
if target in variables:
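The branch added above treats quoted strings specially when stripping a trailing alignment, since a `~` may legitimately appear inside the quotes. A self-contained sketch of that logic (the helper name `split_alignment` is hypothetical, not part of the library):

```python
from penman.surface import Alignment

def split_alignment(target):
    """Hypothetical helper mirroring the alignment handling above."""
    if target and '~' in target:
        if target.startswith('"'):
            # a string may contain '~' inside the quotes, so only text
            # after the closing quote counts as an alignment marker
            pivot = target.rindex('"') + 1
            if pivot < len(target):
                return target[:pivot], Alignment.from_string(target[pivot:])
        else:
            value, _, alignment = target.partition('~')
            return value, Alignment.from_string(alignment)
    return target, None

assert split_alignment('"str~ing"~1') == ('"str~ing"', Alignment((1,)))
assert split_alignment('alpha~2') == ('alpha', Alignment((2,)))
```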
7 changes: 4 additions & 3 deletions penman/lexer.py
@@ -20,10 +20,11 @@
PATTERNS = {
'COMMENT': r'\#.*$',
'STRING': r'"[^"\\]*(?:\\.[^"\\]*)*"',
'ALIGNMENT': r'~(?:[a-z]\.?)?[0-9]+(?:,[0-9]+)*',
# ROLE cannot be made up of COLON + SYMBOL because it then becomes
# difficult to detect anonymous roles: (a : b) vs (a :b c)
'ROLE': r':[^\s()\/:]*',
'SYMBOL': r'[^\s()\/:]+',
'ROLE': r':[^\s()\/:~]*',
'SYMBOL': r'[^\s()\/:~]+',
'LPAREN': r'\(',
'RPAREN': r'\)',
'SLASH': r'\/', # concept (node label) role
@@ -41,7 +42,7 @@ def _compile(*names: str) -> Pattern[str]:
PENMAN_RE = _compile('COMMENT',
'STRING',
'LPAREN', 'RPAREN', 'SLASH',
'ROLE', 'SYMBOL',
'ROLE', 'SYMBOL', 'ALIGNMENT',
'UNEXPECTED')
TRIPLE_RE = _compile('COMMENT',
'STRING',
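In isolation, the reinstated `ALIGNMENT` pattern and the tightened `SYMBOL` pattern split a trailing alignment off the symbol it follows. A quick check, assuming `penman` 0.9.1:

```python
import re

from penman import lexer

# the ALIGNMENT regex matches bare and prefixed index lists
assert re.fullmatch(lexer.PATTERNS['ALIGNMENT'], '~1')
assert re.fullmatch(lexer.PATTERNS['ALIGNMENT'], '~e.1,2')

# SYMBOL no longer consumes '~', so the alignment becomes its own token
assert [t.type for t in lexer.lex('(a / alpha~1)')] == [
    'LPAREN', 'SYMBOL', 'SLASH', 'SYMBOL', 'ALIGNMENT', 'RPAREN']
```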
39 changes: 39 additions & 0 deletions tests/test_codec.py
@@ -36,6 +36,11 @@ def test_parse(self):
'a', [(':ARG-of', 'b')])
assert codec.parse('(a :ARG~1 b~2)') == (
'a', [(':ARG~1', 'b~2')])
# https://github.com/goodmami/penman/issues/50
assert codec.parse('(a :ARG "str~ing")') == (
'a', [(':ARG', '"str~ing"')])
assert codec.parse('(a :ARG "str~ing"~1)') == (
'a', [(':ARG', '"str~ing"~1')])

def test_format(self):
assert codec.format(
@@ -248,6 +253,40 @@ def test_decode_atoms(self):
('x20', ':instance', '876-9'),
]

def test_decode_alignments(self):
g = decode('(a / alpha~1)')
assert g.triples == [
('a', ':instance', 'alpha'),
]
assert surface.alignments(g) == {
('a', ':instance', 'alpha'): surface.Alignment((1,)),
}
assert surface.role_alignments(g) == {}

assert decode('(a / alpha~1)') == decode('(a / alpha ~1)')

g = decode('(a :ARG~e.1,2 b)')
assert g.triples == [
('a', ':instance', None),
('a', ':ARG', 'b'),
]
assert surface.alignments(g) == {}
assert surface.role_alignments(g) == {
('a', ':ARG', 'b'): surface.RoleAlignment((1, 2), prefix='e.'),
}

# https://github.com/goodmami/penman/issues/50
g = decode('(a :ARG1 "str~ing" :ARG2 "str~ing"~1)')
assert g.triples == [
('a', ':instance', None),
('a', ':ARG1', '"str~ing"'),
('a', ':ARG2', '"str~ing"'),
]
assert surface.alignments(g) == {
('a', ':ARG2', '"str~ing"'): surface.Alignment((1,)),
}
assert surface.role_alignments(g) == {}

def test_decode_invalid_graphs(self):
# some robustness
g = decode('(g / )')
8 changes: 7 additions & 1 deletion tests/test_lexer.py
@@ -20,12 +20,18 @@ def _lex(s):
'RPAREN']
assert _lex('(a :ROLE~e.1,2 b~3)') == [
'LPAREN', 'SYMBOL',
'ROLE', 'SYMBOL',
'ROLE', 'ALIGNMENT', 'SYMBOL', 'ALIGNMENT',
'RPAREN']
assert _lex('# comment\n# (n / nope)\n(a / alpha)') == [
'COMMENT', 'COMMENT', 'LPAREN', 'SYMBOL', 'SLASH', 'SYMBOL', 'RPAREN']


def test_lexing_issue_50():
# https://github.com/goodmami/penman/issues/50
assert [tok.type for tok in lexer.lex('(a :ROLE "a~b"~1)')] == [
'LPAREN', 'SYMBOL', 'ROLE', 'STRING', 'ALIGNMENT', 'RPAREN']


def test_lex_triples():
def _lex(s):
return [tok.type for tok in lexer.lex(s, pattern=lexer.TRIPLE_RE)]
