Skip to content

Commit dd316bb

Browse files
committed
Support for joining long numbers with spaces.
1 parent 5960803 commit dd316bb

1 file changed

Lines changed: 14 additions & 5 deletions

File tree

udapi/block/ud/jointoken.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
"""
44
from udapi.core.block import Block
55
import logging
6+
import re
67

78

89
class JoinToken(Block):
@@ -29,8 +30,8 @@ class JoinToken(Block):
2930
underlying text was created directly for UD and can thus be considered
3031
part of the annotation.
3132
32-
At present, this block does not support merging with spaces at all, but
33-
in the future one or more of the options may be added.
33+
At present, this block does not support merging with spaces except for
34+
long numbers, for which it creates words with spaces (option 2).
3435
"""
3536

3637
def __init__(self, misc_name='JoinToken', misc_value=None, **kwargs):
@@ -80,10 +81,18 @@ def process_node(self, node):
8081
logging.warning("MISC %s cannot be used if one of the nodes belongs to a multiword token." % self.misc_name)
8182
node.misc['Bug'] = 'JoiningTokenNotSupportedHere'
8283
return
84+
# In exceptional cases UD allows spaces inside words and then we may
85+
# join tokens even if there was a space between them. Such exceptions
86+
# must be registered for each UD language. We do not access that register
87+
# here, but we permit one special case that is allowed e.g. in Czech and
88+
# French: long numbers (1 000 000).
8389
if prevnode.misc['SpaceAfter'] != 'No':
84-
logging.warning("MISC %s cannot be used if there is space between the tokens." % self.misc_name)
85-
node.misc['Bug'] = 'JoiningTokensWithSpaceNotSupported'
86-
return
90+
if re.fullmatch(r'[0-9,\. ]+', node.form) and re.fullmatch(r'[0-9,\. ]+', prevnode.form):
91+
node.form = ' ' + node.form
92+
else:
93+
logging.warning("MISC %s cannot be used if there is space between the tokens." % self.misc_name)
94+
node.misc['Bug'] = 'JoiningTokensWithSpaceNotSupported'
95+
return
8796
###!!! This block currently must not be applied to data containing
8897
###!!! enhanced dependencies. We must first implement adjustments of
8998
###!!! the enhanced structure.

0 commit comments

Comments
 (0)