scieloorg · eduranm · May 21, 2026 · May 21, 2026 · May 21, 2026 · May 21, 2026
diff --git a/markup_doc/labeling_utils.py b/markup_doc/labeling_utils.py
@@ -2,6 +2,7 @@
 import json
 import re
 import requests
+import unicodedata
 
 from lxml import etree
 
@@ -674,6 +675,20 @@ def match_subsection(item, sections):
     ) else None
 
 
+def normalize_text(text):  # Return text with tags removed, trimmed, lowercased, and stripped of combining marks.
+    text = re.sub(r'<[^>]+>', '', text)  # remove anything shaped like <...> (markup tags)
+    text = text.strip().lower()  # trim surrounding whitespace, then lowercase
+    text = unicodedata.normalize('NFD', text)  # decompose so accents become separate combining characters
+    text = ''.join(c for c in text if unicodedata.category(c) != 'Mn')  # drop nonspacing marks (category Mn)
+    return text
+
+
+def comes_before_or_equal(obj1, obj2):  # True when obj1's normalized paragraph compares <= obj2's.
+    p1 = normalize_text(obj1.get('value', {}).get('paragraph', ''))  # missing 'value' or 'paragraph' falls back to ''
+    p2 = normalize_text(obj2.get('value', {}).get('paragraph', ''))
+    return p1 <= p2  # plain str comparison (code-point order), not locale-aware collation
+
+
 def create_labeled_object2(i, item, state, sections):
     obj = {}
     result = None
@@ -688,7 +703,14 @@ def create_labeled_object2(i, item, state, sections):
         state['label'] = result.get('label')
         state['body'] = result.get('body')
 
-    if state.get('body') and re.search(r"^(refer)", item.get('text').lower()) and match_section(item, sections):
+    text = item.get('text', '').strip().lower()
+
+    is_references_title = bool(re.fullmatch(
+        r"(?:referencias|references|referências)\s*[:.]?",
+        text
+    ))
+
+    if state.get('body') and is_references_title:  
         state['label'] = '<sec>'
         state['body'] = False
         state['back'] = True

diff --git a/markup_doc/tasks.py b/markup_doc/tasks.py
@@ -17,7 +17,8 @@
     create_labeled_object2,
     get_data_first_block,
     get_llm_model_name,
-    create_special_content_object
+    create_special_content_object,
+    comes_before_or_equal
 )
 
 from markup_doc.models import ProcessStatus
@@ -83,6 +84,8 @@ def get_labels(title, user_id):
 
     next_item  = None
     obj_reference = []
+    obj_postreference = []
+    last_obj = None
     llama_model = False
 
     for i, item in enumerate(content):
@@ -234,8 +237,7 @@ def get_labels(title, user_id):
 
         if item.get('text') is None or item.get('text') == '':
             state['label_next'] = state['label_next_reset'] if state['reset'] else state['label_next']
-            if state['back']:
-                state['back'] = False
+            if state['back'] and num_ref > 0:
                 state['body'] = False
                 state['references'] = True
         else:
@@ -262,13 +264,22 @@ def get_labels(title, user_id):
                     else:
                         stream_data_body.append(obj)
                 elif state['back']:
-                    if state['label'] == '<sec>':
+                    if state['references']:
+                        obj_postreference.append(obj)
+                    elif state['label'] == '<sec>':
                         stream_data_back.append(obj)
-                    if state['label'] == '<p>':
-                        num_ref = num_ref + 1
-                        #obj = {}#process_reference(num_ref, obj, user_id)
-                        obj_reference.append({"num_ref": num_ref, "obj": obj, "text": obj['value']['paragraph'],})
-                    #stream_data_back.append(obj)
+                    elif state['label'] == '<p>':
+                        if last_obj is not None and not re.search(r"^(refer)",last_obj.get('value', {}).get('paragraph', '').strip().lower()):
+                            if comes_before_or_equal(last_obj, obj):
+                                num_ref = num_ref + 1
+                                obj_reference.append({"num_ref": num_ref, "obj": obj, "text": obj['value']['paragraph'],})
+                            else:
+                                obj_postreference.append(obj)
+                                state['references'] = True
+                        else:
+                            num_ref = num_ref + 1
+                            obj_reference.append({"num_ref": num_ref, "obj": obj, "text": obj['value']['paragraph'],})
+                        last_obj = obj
                 else:
                     stream_data.append(obj)
 
@@ -296,6 +307,7 @@ def get_labels(title, user_id):
                     output.extend(parsed)  # Agrega a la lista de salida
 
         stream_data_back.extend(process_references(num_refs, output))
+        stream_data_back.extend(obj_postreference)
 
     article_docx_markup.content = stream_data
     article_docx_markup.content_body = stream_data_body

diff --git a/markup_doc/xml.py b/markup_doc/xml.py
@@ -723,6 +723,8 @@ def get_xml(article_docx, data_front, data, data_back):
             node_tit = etree.SubElement(node_reflist, 'title')
             append_fragment(node_tit, d['value']['paragraph'])
         if d['value']['label'] == '<p>':
+            if 'refid' not in d['value']:
+                continue
             values = d['value']
             node_ref = etree.SubElement(node_reflist, 'ref', attrib={"id": values['refid']})
             #node_label = etree.SubElement(node_ref, 'label')

diff --git a/markuplib/function_docx.py b/markuplib/function_docx.py
@@ -360,15 +360,12 @@ def extrae_Tabla(element, rels_map, namespaces):
                     obj['formula'] = etree.tostring(mathml_root, pretty_print=True, encoding='unicode')
 
 
-                # obtiene id y nivel
-                if is_numPr:
-                    numPr = paragraph.find('.//w:numPr', namespaces=paragraph.nsmap)
-                    numId = numPr.find('.//w:numId', namespaces=paragraph.nsmap).get(namespaces_p + 'val')
-                    type_matches = [
-                        (key, objt)
-                        for key, objt in list_types.items()
-                        if objt.get('numId') == numId
-                    ]
+                if not obj_image:
+                    paragraph = element
+                    text_paragraph = []
+
+                    # Determina si es parte de una lista
+                    is_numPr = paragraph.find('.//w:numPr', namespaces=paragraph.nsmap) is not None
 
                     # obtiene id y nivel
                     if is_numPr:
@@ -400,76 +397,6 @@ def extrae_Tabla(element, rels_map, namespaces):
                             objl['list'] = '\n'.join(current_list)
                             current_list = []
                             content.append(objl)
-                        list_type = 'bullet'
-                        if type_matches and type_matches[0][1].get(str(0)) == 'decimal':
-                            list_type = 'order'
-
-                        current_list.append(f'[list list-type="{list_type}"]')
-                else:
-                    #Se terminaron de agregar elementos a la lista
-                    if len(current_list) > 0:
-                        current_list.append('[/list]')
-                        objl = {}
-                        objl['type'] = 'list'
-                        objl['list'] = '\n'.join(current_list)
-                        current_list = []
-                        content.append(objl)
-
-                for child in paragraph:
-                    if child.tag == '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}hyperlink':
-                        for r in child.findall('w:r', namespaces=child.nsmap):
-                            t_elem = r.find('w:t', namespaces=child.nsmap)
-                            if t_elem is not None and t_elem.text:
-                                text_paragraph.append(t_elem.text)
-
-                    elif child.tag == '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}r':
-                        namespaces = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
-                        sz_element = child.find('.//w:sz', namespaces=child.nsmap)
-                        obj['font_size'] = 0
-
-                        if sz_element is None:
-                            p_pr = paragraph.find('.//w:rPr/w:sz', namespaces=child.nsmap)
-                            if p_pr is not None:
-                                sz_element = p_pr.find('.//w:pPr', namespaces=child.nsmap)
-
-                        if sz_element is not None:
-                            xml_string = etree.tostring(sz_element, pretty_print=True, encoding='unicode')
-                            size_element = objectify.fromstring(xml_string)
-                            font_size_value = size_element.get(namespaces+'val')
-                            obj['font_size'] = int(font_size_value)/2
-
-                        color_element = child.find('.//w:color', namespaces=child.nsmap)
-
-                        if color_element is None:
-                            p_pr = paragraph.find('.//w:pPr', namespaces=child.nsmap)
-                            if p_pr is not None:
-                                color_element = p_pr.find('.//w:rPr/w:color', namespaces=child.nsmap)
-
-                        if color_element is not None:
-                            xml_string_color = etree.tostring(color_element, pretty_print=True, encoding='unicode')
-                            object_element = objectify.fromstring(xml_string_color)
-                            color_value = object_element.get(namespaces + 'val')
-                            obj['color'] = color_value
-
-                        b_tag = child.find('.//w:b', namespaces=child.nsmap)
-
-                        if b_tag is None:
-                            p_pr = paragraph.find('.//w:rPr/w:b', namespaces=child.nsmap)
-                            if p_pr is not None:
-                                b_tag = p_pr.find('.//w:pPr', namespaces=child.nsmap)
-
-                        if b_tag is not None:
-                            val = b_tag.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val')
-                            obj['bold'] = (val is None or val in ['1', 'true', 'True'])
-                        else:
-                            obj['bold'] = False
-
-                        i_tag = child.find('.//w:i', namespaces=child.nsmap)
-
-                        if i_tag is None:
-                            p_pr = paragraph.find('.//w:rPr/w:i', namespaces=child.nsmap)
-                            if p_pr is not None:
-                                i_tag = p_pr.find('.//w:pPr', namespaces=child.nsmap)
 
                     for child in paragraph:
                         if child.tag == '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}hyperlink':
@@ -481,7 +408,7 @@ def extrae_Tabla(element, rels_map, namespaces):
                         elif child.tag == '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}r':
                             namespaces = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
                             sz_element = child.find('.//w:sz', namespaces=child.nsmap)
-                            obj['font_size'] = 0
+                            obj['font_size'] = 12
 
                             if sz_element is None:
                                 p_pr = paragraph.find('.//w:rPr/w:sz', namespaces=child.nsmap)
@@ -572,21 +499,35 @@ def extrae_Tabla(element, rels_map, namespaces):
                                 sections = [sections[-1]]
                                 first_block = ''
                                 tmp_content = []
-                                abstract_mode = False 
+                                abstract_mode = False
+                                abstract_started = False
 
                                 for c in content:
                                     if abstract_mode:
+                                        if not abstract_started:
+                                            if c['text'] == '' or c['spacing'] is True:
+                                                continue
+                                            else:
+                                                abstract_started = True
+                                                tmp_content.append(c)
+                                                continue
+
+                                        # empezó el abstract: sí encuentra vacío marca fin
                                         if c['text'] == '' or c['spacing'] is True:
                                             abstract_mode = False
+                                            abstract_started = False
+                                            continue
                                         else:
                                             tmp_content.append(c)
                                             continue
 
                                     if 'paraph' in c:
                                         tmp_content.append(c)
                                         abstract_mode = False
+                                        abstract_started = False
                                         if c['paraph'] == '<abstract>':
                                             abstract_mode = True
+                                            abstract_started = False
                                             continue                                        
                                     else:
                                         if 'text' in c:
@@ -648,17 +589,13 @@ def extrae_Tabla(element, rels_map, namespaces):
                     'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
                 }
 
-                if is_numPr:
-                    if 'font_size' in obj:
-                        del obj['font_size']
-                    current_list.append(f'[list-item]{obj["text"]}[/list-item]')
-                if isinstance(obj['text'], list) and len(text_paragraph) > 0:
-                    obj2 = {}
-                    obj2['type'] = 'text'
-                    obj2['value'] = ' '.join(text_paragraph)
-                    obj['text'].append(obj2)
-                    text_paragraph = []
-                if not is_numPr:
-                    content.append(obj)
+                table = element
+                table_data = extrae_Tabla(element, hiperlinks_info, namespaces)
+                obj = {}
+                obj['type'] = 'table'
+                obj['table'] = table_data
+
+            if not is_numPr:
+                content.append(obj)
         sections.sort(key=section_priority)
         return sections, content