---------------------------------------------------------------------------
UnicodeDecodeError Traceback (most recent call last)
Cell In[10], line 1
----> 1 html_body = msg.htmlBody
File xxx/lib/python3.10/functools.py:981, in cached_property.__get__(self, instance, owner)
979 val = cache.get(self.attrname, _NOT_FOUND)
980 if val is _NOT_FOUND:
--> 981 val = self.func(instance)
982 try:
983 cache[self.attrname] = val
File xxx/lib/python3.10/site-packages/extract_msg/msg_classes/message_base.py:1174, in MessageBase.htmlBody(self)
1172 elif self.rtfBody:
1173 logger.info('HTML body was not found, attempting to generate from RTF.')
-> 1174 htmlBody = cast(bytes, self.deencapsulateBody(self.rtfBody, DeencapType.HTML))
1175 # This is it's own if statement so we can ensure it will generate
1176 # even if there is an rtfBody, in the event it doesn't have HTML.
1177 if not htmlBody and self.body:
1178 # Convert the plain text body to html.
File xxx/lib/python3.10/site-packages/extract_msg/msg_classes/message_base.py:257, in MessageBase.deencapsulateBody(self, rtfBody, bodyType)
255 logger.exception('Custom deencapsulation function reported data is not encapsulated.')
256 else:
--> 257 if self.deencapsulatedRtf and self.deencapsulatedRtf.content_type == 'html':
258 return self.deencapsulatedRtf.html
260 if bodyType == DeencapType.PLAIN:
File xxx/lib/python3.10/functools.py:981, in cached_property.__get__(self, instance, owner)
979 val = cache.get(self.attrname, _NOT_FOUND)
980 if val is _NOT_FOUND:
--> 981 val = self.func(instance)
982 try:
983 cache[self.attrname] = val
File xxx/lib/python3.10/site-packages/extract_msg/msg_classes/message_base.py:1014, in MessageBase.deencapsulatedRtf(self)
1012 try:
1013 deencapsultor = RTFDE.DeEncapsulator(body)
-> 1014 deencapsultor.deencapsulate()
1015 return deencapsultor
1016 except RTFDE.exceptions.NotEncapsulatedRtf:
File xxx/lib/python3.10/site-packages/RTFDE/deencapsulate.py:120, in DeEncapsulator.deencapsulate(self)
118 raise MalformedEncapsulatedRtf(f"Malformed encapsulated RTF discovered:") from _e
119 Decoder = TextDecoder()
--> 120 Decoder.update_children(self.full_tree)
121 self.get_doc_tree()
122 self.validate_encapsulation()
File xxx/lib/python3.10/site-packages/RTFDE/text_extraction.py:675, in TextDecoder.update_children(self, obj)
673 self.set_font_info(obj)
674 children = obj.children
--> 675 obj.children = [i for i in self.iterate_on_children(children)]
File xxx/lib/python3.10/site-packages/RTFDE/text_extraction.py:675, in <listcomp>(.0)
673 self.set_font_info(obj)
674 children = obj.children
--> 675 obj.children = [i for i in self.iterate_on_children(children)]
File xxx/lib/python3.10/site-packages/RTFDE/text_extraction.py:761, in TextDecoder.iterate_on_children(self, children)
758 yield decoded_hex_tok
759 elif isinstance(item, Tree):
760 # Run this same function recursively on nested trees
--> 761 item.children = [i for i in self.iterate_on_children(item.children)]
762 yield item
763 else:
File xxx/lib/python3.10/site-packages/RTFDE/text_extraction.py:761, in <listcomp>(.0)
758 yield decoded_hex_tok
759 elif isinstance(item, Tree):
760 # Run this same function recursively on nested trees
--> 761 item.children = [i for i in self.iterate_on_children(item.children)]
762 yield item
763 else:
[... skipping similar frames: <listcomp> at line 761 (3 times), TextDecoder.iterate_on_children at line 761 (3 times)]
File xxx/lib/python3.10/site-packages/RTFDE/text_extraction.py:748, in TextDecoder.iterate_on_children(self, children)
746 current_fontdef = self.font_table[self.font_stack[-1]]
747 current_codec = current_fontdef.codec
--> 748 decoded_hex = decode_hex_char(base_bytes, current_codec)
749 # We are replacing a Tree. So, need item.data to access it's info token
750 decoded_hex_tok = Token('STRING',
751 decoded_hex,
752 start_pos=_hex_start_pos,
(...)
756 column=_hex_start_column,
757 end_column=_hex_end_column)
File xxx/lib/python3.10/site-packages/RTFDE/text_extraction.py:627, in decode_hex_char(item, codec)
624 if codec is None:
625 # Default to U.S. Windows default codepage
626 codec = 'CP1252'
--> 627 decoded = item.decode(codec)
628 decoded = decoded.encode()
629 if is_logger_on("RTFDE.text_extraction") is True:
UnicodeDecodeError: 'gbk' codec can't decode byte 0x80 in position 0: incomplete multibyte sequence
Bug Metadata
Describe the bug
A UnicodeDecodeError is raised when accessing the htmlBody property.
UnicodeDecodeError: 'gbk' codec can't decode byte 0x80 in position 0: incomplete multibyte sequence
Traceback
Additional context
I've noticed several similar issues, such as #373, seamustuohy/RTFDE#19, and #103, but the same error still occurs during usage. According to the traceback, the source of the error still seems to be the RTFDE package. Is there any way to fix this, or to get the formatted email body other than through the htmlBody property?