diff --git a/markup_doc/labeling_utils.py b/markup_doc/labeling_utils.py index d0247f1..d904eb1 100644 --- a/markup_doc/labeling_utils.py +++ b/markup_doc/labeling_utils.py @@ -2,6 +2,7 @@ import json import re import requests +import unicodedata from lxml import etree @@ -674,6 +675,20 @@ def match_subsection(item, sections): ) else None +def normalize_text(text): + text = re.sub(r'<[^>]+>', '', text) # quita etiquetas + text = text.strip().lower() + text = unicodedata.normalize('NFD', text) + text = ''.join(c for c in text if unicodedata.category(c) != 'Mn') + return text + + +def comes_before_or_equal(obj1, obj2): + p1 = normalize_text(obj1.get('value', {}).get('paragraph', '')) + p2 = normalize_text(obj2.get('value', {}).get('paragraph', '')) + return p1 <= p2 + + def create_labeled_object2(i, item, state, sections): obj = {} result = None @@ -688,7 +703,14 @@ def create_labeled_object2(i, item, state, sections): state['label'] = result.get('label') state['body'] = result.get('body') - if state.get('body') and re.search(r"^(refer)", item.get('text').lower()) and match_section(item, sections): + text = item.get('text', '').strip().lower() + + is_references_title = bool(re.fullmatch( + r"(?:referencias|references|referências)\s*[:.]?", + text + )) + + if state.get('body') and is_references_title: state['label'] = '' state['body'] = False state['back'] = True diff --git a/markup_doc/tasks.py b/markup_doc/tasks.py index c0b9f42..42730d7 100644 --- a/markup_doc/tasks.py +++ b/markup_doc/tasks.py @@ -17,7 +17,8 @@ create_labeled_object2, get_data_first_block, get_llm_model_name, - create_special_content_object + create_special_content_object, + comes_before_or_equal ) from markup_doc.models import ProcessStatus @@ -83,6 +84,8 @@ def get_labels(title, user_id): next_item = None obj_reference = [] + obj_postreference = [] + last_obj = None llama_model = False for i, item in enumerate(content): @@ -234,8 +237,7 @@ def get_labels(title, user_id): if item.get('text') is None or item.get('text') == '': state['label_next'] = state['label_next_reset'] if state['reset'] else state['label_next'] - if state['back']: - state['back'] = False + if state['back'] and num_ref > 0: state['body'] = False state['references'] = True else: @@ -262,13 +264,22 @@ def get_labels(title, user_id): else: stream_data_body.append(obj) elif state['back']: - if state['label'] == '': + if state['references']: + obj_postreference.append(obj) + elif state['label'] == '': stream_data_back.append(obj) - if state['label'] == '

': - num_ref = num_ref + 1 - #obj = {}#process_reference(num_ref, obj, user_id) - obj_reference.append({"num_ref": num_ref, "obj": obj, "text": obj['value']['paragraph'],}) - #stream_data_back.append(obj) + elif state['label'] == '

': + if last_obj is not None and not re.search(r"^(refer)",last_obj.get('value', {}).get('paragraph', '').strip().lower()): + if comes_before_or_equal(last_obj, obj): + num_ref = num_ref + 1 + obj_reference.append({"num_ref": num_ref, "obj": obj, "text": obj['value']['paragraph'],}) + else: + obj_postreference.append(obj) + state['references'] = True + else: + num_ref = num_ref + 1 + obj_reference.append({"num_ref": num_ref, "obj": obj, "text": obj['value']['paragraph'],}) + last_obj = obj else: stream_data.append(obj) @@ -296,6 +307,7 @@ def get_labels(title, user_id): output.extend(parsed) # Agrega a la lista de salida stream_data_back.extend(process_references(num_refs, output)) + stream_data_back.extend(obj_postreference) article_docx_markup.content = stream_data article_docx_markup.content_body = stream_data_body diff --git a/markup_doc/xml.py b/markup_doc/xml.py index d200675..5e446a3 100644 --- a/markup_doc/xml.py +++ b/markup_doc/xml.py @@ -723,6 +723,8 @@ def get_xml(article_docx, data_front, data, data_back): node_tit = etree.SubElement(node_reflist, 'title') append_fragment(node_tit, d['value']['paragraph']) if d['value']['label'] == '

': + if 'refid' not in d['value']: + continue values = d['value'] node_ref = etree.SubElement(node_reflist, 'ref', attrib={"id": values['refid']}) #node_label = etree.SubElement(node_ref, 'label') diff --git a/markuplib/function_docx.py b/markuplib/function_docx.py index decf630..301e6d2 100644 --- a/markuplib/function_docx.py +++ b/markuplib/function_docx.py @@ -360,15 +360,12 @@ def extrae_Tabla(element, rels_map, namespaces): obj['formula'] = etree.tostring(mathml_root, pretty_print=True, encoding='unicode') - # obtiene id y nivel - if is_numPr: - numPr = paragraph.find('.//w:numPr', namespaces=paragraph.nsmap) - numId = numPr.find('.//w:numId', namespaces=paragraph.nsmap).get(namespaces_p + 'val') - type_matches = [ - (key, objt) - for key, objt in list_types.items() - if objt.get('numId') == numId - ] + if not obj_image: + paragraph = element + text_paragraph = [] + + # Determina si es parte de una lista + is_numPr = paragraph.find('.//w:numPr', namespaces=paragraph.nsmap) is not None # obtiene id y nivel if is_numPr: @@ -400,76 +397,6 @@ def extrae_Tabla(element, rels_map, namespaces): objl['list'] = '\n'.join(current_list) current_list = [] content.append(objl) - list_type = 'bullet' - if type_matches and type_matches[0][1].get(str(0)) == 'decimal': - list_type = 'order' - - current_list.append(f'[list list-type="{list_type}"]') - else: - #Se terminaron de agregar elementos a la lista - if len(current_list) > 0: - current_list.append('[/list]') - objl = {} - objl['type'] = 'list' - objl['list'] = '\n'.join(current_list) - current_list = [] - content.append(objl) - - for child in paragraph: - if child.tag == '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}hyperlink': - for r in child.findall('w:r', namespaces=child.nsmap): - t_elem = r.find('w:t', namespaces=child.nsmap) - if t_elem is not None and t_elem.text: - text_paragraph.append(t_elem.text) - - elif child.tag == '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}r': - namespaces = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}' - sz_element = child.find('.//w:sz', namespaces=child.nsmap) - obj['font_size'] = 0 - - if sz_element is None: - p_pr = paragraph.find('.//w:rPr/w:sz', namespaces=child.nsmap) - if p_pr is not None: - sz_element = p_pr.find('.//w:pPr', namespaces=child.nsmap) - - if sz_element is not None: - xml_string = etree.tostring(sz_element, pretty_print=True, encoding='unicode') - size_element = objectify.fromstring(xml_string) - font_size_value = size_element.get(namespaces+'val') - obj['font_size'] = int(font_size_value)/2 - - color_element = child.find('.//w:color', namespaces=child.nsmap) - - if color_element is None: - p_pr = paragraph.find('.//w:pPr', namespaces=child.nsmap) - if p_pr is not None: - color_element = p_pr.find('.//w:rPr/w:color', namespaces=child.nsmap) - - if color_element is not None: - xml_string_color = etree.tostring(color_element, pretty_print=True, encoding='unicode') - object_element = objectify.fromstring(xml_string_color) - color_value = object_element.get(namespaces + 'val') - obj['color'] = color_value - - b_tag = child.find('.//w:b', namespaces=child.nsmap) - - if b_tag is None: - p_pr = paragraph.find('.//w:rPr/w:b', namespaces=child.nsmap) - if p_pr is not None: - b_tag = p_pr.find('.//w:pPr', namespaces=child.nsmap) - - if b_tag is not None: - val = b_tag.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val') - obj['bold'] = (val is None or val in ['1', 'true', 'True']) - else: - obj['bold'] = False - - i_tag = child.find('.//w:i', namespaces=child.nsmap) - - if i_tag is None: - p_pr = paragraph.find('.//w:rPr/w:i', namespaces=child.nsmap) - if p_pr is not None: - i_tag = p_pr.find('.//w:pPr', namespaces=child.nsmap) for child in paragraph: if child.tag == '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}hyperlink': @@ -481,7 +408,7 @@ def extrae_Tabla(element, rels_map, namespaces): elif child.tag == '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}r': namespaces = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}' sz_element = child.find('.//w:sz', namespaces=child.nsmap) - obj['font_size'] = 0 + obj['font_size'] = 12 if sz_element is None: p_pr = paragraph.find('.//w:rPr/w:sz', namespaces=child.nsmap) @@ -572,12 +499,24 @@ def extrae_Tabla(element, rels_map, namespaces): sections = [sections[-1]] first_block = '' tmp_content = [] - abstract_mode = False + abstract_mode = False + abstract_started = False for c in content: if abstract_mode: + if not abstract_started: + if c['text'] == '' or c['spacing'] is True: + continue + else: + abstract_started = True + tmp_content.append(c) + continue + + # empezó el abstract: sí encuentra vacío marca fin if c['text'] == '' or c['spacing'] is True: abstract_mode = False + abstract_started = False + continue else: tmp_content.append(c) continue @@ -585,8 +524,10 @@ def extrae_Tabla(element, rels_map, namespaces): if 'paraph' in c: tmp_content.append(c) abstract_mode = False + abstract_started = False if c['paraph'] == '': abstract_mode = True + abstract_started = False continue else: if 'text' in c: @@ -648,17 +589,13 @@ def extrae_Tabla(element, rels_map, namespaces): 'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships' } - if is_numPr: - if 'font_size' in obj: - del obj['font_size'] - current_list.append(f'[list-item]{obj["text"]}[/list-item]') - if isinstance(obj['text'], list) and len(text_paragraph) > 0: - obj2 = {} - obj2['type'] = 'text' - obj2['value'] = ' '.join(text_paragraph) - obj['text'].append(obj2) - text_paragraph = [] - if not is_numPr: - content.append(obj) + table = element + table_data = extrae_Tabla(element, hiperlinks_info, namespaces) + obj = {} + obj['type'] = 'table' + obj['table'] = table_data + + if not is_numPr: + content.append(obj) sections.sort(key=section_priority) return sections, content