diff --git a/src/scielo/bin/xml/app_modules/app/data/sgml_html.py b/src/scielo/bin/xml/app_modules/app/data/sgml_html.py new file mode 100644 index 000000000..623e33d4b --- /dev/null +++ b/src/scielo/bin/xml/app_modules/app/data/sgml_html.py @@ -0,0 +1,129 @@ +# coding=utf-8 + +import os +import shutil + +from ...generics import fs_utils +from ...generics import encoding +from . import symbols + + +class SGMLHTMLDocment(object): + + def __init__(self, xml_name, html_filename): + self.xml_name = xml_name + self.html_filename = html_filename + self.html_path = os.path.dirname(self.html_filename) + self.html_basename = os.path.basename(self.html_filename) + self.html_name, ext = os.path.splitext(self.html_basename) + self._html_content = None + + @property + def html_content(self): + if self._html_content is None: + self._html_content = fs_utils.read_file(self.html_filename, encoding.SYS_DEFAULT_ENCODING) or '' + return self._html_content + + @property + def html_img_path(self): + path = None + for item in os.listdir(self.html_path): + if os.path.isdir(self.html_path + '/' + item) and item.startswith(self.html_name): + path = self.html_path + '/' + item + break + if path is None: + path = self._alternative_html_img_path(self.html_path, self.html_name) + if path is None: + path = self.html_path + return path + + def _alternative_html_img_path(self): + #name_image001 + new_html_folder = self.html_path + '/' + self.html_name + '_arquivosalt' + if not os.path.isdir(new_html_folder): + os.makedirs(new_html_folder) + for item in os.listdir(self.html_path): + if os.path.isfile(self.html_path + '/' + item) and item.startswith(self.html_name + '_image'): + new_name = item[len(self.html_name)+1:] + shutil.copyfile(self.html_path + '/' + item, new_html_folder + '/' + new_name) + return new_html_folder + + @property + def img_files(self): + return os.listdir(self.html_img_path) + + @property + def img_href_items(self): + return [item for item in self.unknown_href_items if item != 'None'] + + @property + def unknown_href_items(self): + """ + [graphic href="?a20_115"][/graphic] + """ + html_content = self.html_content + if 'href="?' in html_content: + html_content = html_content.replace('href="?', 'href="?') + """ + if '“' in html_content: + html_content = html_content.replace('“', '"') + if '”' in html_content: + html_content = html_content.replace('”', '"') + """ + _items = html_content.replace('href="?', 'href="?--~BREAK~FIXHREF--FIXHREF').split('--~BREAK~FIXHREF--') + items = [item for item in _items if item.startswith('FIXHREF')] + img_src = [] + for item in items: + if 'src="' in item: + src = item[item.find('src="') + len('src="'):] + src = src[0:src.find('"')] + if '/' in src: + src = src[src.find('/') + 1:] + if len(src) > 0: + img_src.append(src) + else: + img_src.append('None') + return img_src + + def get_fontsymbols(self): + r = [] + html_content = self.html_content + if '[fontsymbol]' in html_content.lower(): + for style in ['italic', 'sup', 'sub', 'bold']: + html_content = html_content.replace('<' + style + '>', '[' + style + ']') + html_content = html_content.replace('', '[/' + style + ']') + + html_content = html_content.replace('[fontsymbol]'.upper(), '[fontsymbol]') + html_content = html_content.replace('[/fontsymbol]'.upper(), '[/fontsymbol]') + html_content = html_content.replace('[fontsymbol]', '~BREAK~[fontsymbol]') + html_content = html_content.replace('[/fontsymbol]', '[/fontsymbol]~BREAK~') + + html_fontsymbol_items = [item for item in html_content.split('~BREAK~') if item.startswith('[fontsymbol]')] + for item in html_fontsymbol_items: + item = item.replace('[fontsymbol]', '').replace('[/fontsymbol]', '') + item = item.replace('<', '~BREAK~<').replace('>', '>~BREAK~') + item = item.replace('[', '~BREAK~[').replace(']', ']~BREAK~') + parts = [part for part in item.split('~BREAK~') if not part.endswith('>') and not part.startswith('<')] + + new = '' + for part in parts: + if part.startswith('['): + new += part + else: + + for c in part: + _c = c.strip() + if _c.isdigit() or _c == '': + n = c + else: + try: + n = symbols.get_symbol(c) + except: + n = '?' + new += n + for style in ['italic', 'sup', 'sub', 'bold']: + new = new.replace('[' + style + ']', '<' + style + '>') + new = new.replace('[/' + style + ']', '') + r.append(new) + return r