diff --git a/machine/corpora/corpora_utils.py b/machine/corpora/corpora_utils.py index 09437463..98e67c10 100644 --- a/machine/corpora/corpora_utils.py +++ b/machine/corpora/corpora_utils.py @@ -15,6 +15,14 @@ T = TypeVar("T") +def alignment_exception(refs: Iterable[str]) -> TypeError: + return TypeError( + f'Invalid format in {", ".join(refs)}. ' + "Mismatched key formats. There may be an extraneous tab, " + "missing ref, or inconsistent use of user-defined refs." + ) + + def batch(iterable: Iterable[T], batch_size: int) -> Iterable[Sequence[T]]: if isinstance(iterable, Sequence) and len(iterable) <= batch_size: yield iterable diff --git a/machine/corpora/n_parallel_text_corpus.py b/machine/corpora/n_parallel_text_corpus.py index c2098aae..cf3cac0d 100644 --- a/machine/corpora/n_parallel_text_corpus.py +++ b/machine/corpora/n_parallel_text_corpus.py @@ -2,6 +2,7 @@ from typing import Any, Callable, Iterable, List, Optional, Sequence, Set, cast from ..scripture.verse_ref import Versification +from .corpora_utils import alignment_exception from .n_parallel_text_corpus_base import NParallelTextCorpusBase from .n_parallel_text_row import NParallelTextRow from .scripture_ref import ScriptureRef @@ -181,7 +182,10 @@ def _get_rows(self, generators: List[TextCorpusEnumerator]) -> Iterable[NParalle refs.append(row.ref) else: refs.append(None) - min_ref_indexes = self._min_ref_indexes(refs) + try: + min_ref_indexes = self._min_ref_indexes(refs) + except TypeError as e: + raise alignment_exception([str(r.ref) for r in current_rows if r is not None]) from e non_min_ref_indexes = list(set(range(0, self.n)).difference(min_ref_indexes)) if ( len(min_ref_indexes) < num_remaining_rows @@ -274,6 +278,8 @@ def _get_rows(self, generators: List[TextCorpusEnumerator]) -> Iterable[NParalle if is_completed: num_completed += 1 num_remaining_rows -= 1 + else: + raise alignment_exception([str(current_rows[i].ref) for i in min_ref_indexes]) if range_info.is_in_range: yield range_info.create_row() @@ -362,9 +368,12 @@ def _create_min_ref_rows( yield row def _check_same_ref_rows(self, same_ref_rows: List[TextRow], other_row: TextRow) -> bool: - if len(same_ref_rows) > 0 and self.row_ref_comparer(same_ref_rows[0].ref, other_row.ref) != 0: - same_ref_rows.clear() - return len(same_ref_rows) > 0 + try: + if len(same_ref_rows) > 0 and self.row_ref_comparer(same_ref_rows[0].ref, other_row.ref) != 0: + same_ref_rows.clear() + return len(same_ref_rows) > 0 + except TypeError as e: + raise alignment_exception([str(same_ref_rows[0].ref), str(other_row.ref)]) from e def _create_same_ref_rows( self, diff --git a/machine/corpora/standard_parallel_text_corpus.py b/machine/corpora/standard_parallel_text_corpus.py index eac12954..7521e3f0 100644 --- a/machine/corpora/standard_parallel_text_corpus.py +++ b/machine/corpora/standard_parallel_text_corpus.py @@ -4,6 +4,7 @@ from typing import Callable, Generator, Iterable, Optional from .alignment_corpus import AlignmentCorpus +from .corpora_utils import alignment_exception from .dictionary_alignment_corpus import DictionaryAlignmentCorpus from .n_parallel_text_corpus import NParallelTextCorpus, default_row_ref_comparer from .parallel_text_corpus import ParallelTextCorpus @@ -68,7 +69,10 @@ def _get_rows(self, text_ids: Optional[Iterable[str]]) -> Generator[ParallelText if self._alignment_corpus is not None and all([len(n) > 0 for n in n_row.n_segments]): while True: if alignment_row is not None: - compare_alignment_corpus = self._row_ref_comparer(n_row.ref, alignment_row.ref) + try: + compare_alignment_corpus = self._row_ref_comparer(n_row.ref, alignment_row.ref) + except TypeError as e: + raise alignment_exception([str(r) for r in n_row.n_refs]) from e else: compare_alignment_corpus = 1 if compare_alignment_corpus >= 0: