From cb5324c573f9283a2fed9680a086516064a3cc24 Mon Sep 17 00:00:00 2001 From: fze Date: Sun, 31 May 2026 16:51:46 +0800 Subject: [PATCH] Add move-to-front transform --- data_compression/move_to_front.py | 99 +++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 data_compression/move_to_front.py diff --git a/data_compression/move_to_front.py b/data_compression/move_to_front.py new file mode 100644 index 000000000000..07bdc32f7b34 --- /dev/null +++ b/data_compression/move_to_front.py @@ -0,0 +1,99 @@ +""" +Move-to-front transform. + +The move-to-front transform encodes each symbol as its current index in an +ordered alphabet, then moves that symbol to the front of the alphabet. +It is commonly used after the Burrows-Wheeler transform in lossless +compression pipelines. + +Reference: https://en.wikipedia.org/wiki/Move-to-front_transform +""" + + +def _validated_alphabet(alphabet: str) -> list[str]: + """ + Return a mutable alphabet list after validating uniqueness. + + >>> _validated_alphabet("abc") + ['a', 'b', 'c'] + >>> _validated_alphabet("aba") + Traceback (most recent call last): + ... + ValueError: alphabet must contain unique characters + """ + if not isinstance(alphabet, str): + raise TypeError("alphabet must be a string") + if len(set(alphabet)) != len(alphabet): + raise ValueError("alphabet must contain unique characters") + return list(alphabet) + + +def move_to_front_encode(text: str, alphabet: str) -> list[int]: + """ + Encode text using the move-to-front transform. + + >>> move_to_front_encode("banana", "abcdefghijklmnopqrstuvwxyz") + [1, 1, 13, 1, 1, 1] + >>> move_to_front_encode("banana", "abn") + [1, 1, 2, 1, 1, 1] + >>> move_to_front_encode("", "abc") + [] + >>> move_to_front_encode("bad", "abc") + Traceback (most recent call last): + ... + ValueError: character 'd' is not in the alphabet + """ + if not isinstance(text, str): + raise TypeError("text must be a string") + + symbols = _validated_alphabet(alphabet) + encoded_text: list[int] = [] + + for char in text: + try: + char_index = symbols.index(char) + except ValueError: + message = f"character {char!r} is not in the alphabet" + raise ValueError(message) from None + encoded_text.append(char_index) + symbols.insert(0, symbols.pop(char_index)) + + return encoded_text + + +def move_to_front_decode(encoded_text: list[int], alphabet: str) -> str: + """ + Decode a move-to-front encoded list of indexes. + + >>> move_to_front_decode([1, 1, 13, 1, 1, 1], "abcdefghijklmnopqrstuvwxyz") + 'banana' + >>> move_to_front_decode([1, 1, 2, 1, 1, 1], "abn") + 'banana' + >>> move_to_front_decode([], "abc") + '' + >>> move_to_front_decode([3], "abc") + Traceback (most recent call last): + ... + ValueError: index 3 is not valid for alphabet size 3 + >>> move_to_front_decode([-1], "abc") + Traceback (most recent call last): + ... + ValueError: index -1 is not valid for alphabet size 3 + """ + symbols = _validated_alphabet(alphabet) + decoded_text = [] + + for index in encoded_text: + if not 0 <= index < len(symbols): + message = f"index {index} is not valid for alphabet size {len(symbols)}" + raise ValueError(message) + decoded_text.append(symbols[index]) + symbols.insert(0, symbols.pop(index)) + + return "".join(decoded_text) + + +if __name__ == "__main__": + import doctest + + doctest.testmod()