33"""
44from udapi .core .block import Block
55import logging
6+ import re
67
78
89class JoinToken (Block ):
@@ -29,8 +30,8 @@ class JoinToken(Block):
2930 underlying text was created directly for UD and can thus be considered
3031 part of the annotation.
3132
32- At present, this block does not support merging with spaces at all, but
33- in the future one or more of the options may be added .
33+ At present, this block does not support merging with spaces except for
34+ long numbers, for which it creates words with spaces (option 2) .
3435 """
3536
3637 def __init__ (self , misc_name = 'JoinToken' , misc_value = None , ** kwargs ):
@@ -80,10 +81,18 @@ def process_node(self, node):
8081 logging .warning ("MISC %s cannot be used if one of the nodes belongs to a multiword token." % self .misc_name )
8182 node .misc ['Bug' ] = 'JoiningTokenNotSupportedHere'
8283 return
84+ # In exceptional cases UD allows spaces inside words, and then we may
85+ # join tokens even if there was a space between them. Such exceptions
86+ # must be registered for each UD language. We do not consult that register
87+ # here, but we do permit one special case that is accepted e.g. in Czech and
88+ # French: long numbers (1 000 000).
8389 if prevnode .misc ['SpaceAfter' ] != 'No' :
84- logging .warning ("MISC %s cannot be used if there is space between the tokens." % self .misc_name )
85- node .misc ['Bug' ] = 'JoiningTokensWithSpaceNotSupported'
86- return
90+ if re .fullmatch (r'[0-9,\. ]+' , node .form ) and re .fullmatch (r'[0-9,\. ]+' , prevnode .form ):
91+ node .form = ' ' + node .form
92+ else :
93+ logging .warning ("MISC %s cannot be used if there is space between the tokens." % self .misc_name )
94+ node .misc ['Bug' ] = 'JoiningTokensWithSpaceNotSupported'
95+ return
8796 ###!!! This block currently must not be applied on data containing
8897 ###!!! enhanced dependencies. We must first implement adjustments of
8998 ###!!! the enhanced structure.
0 commit comments