Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 23 additions & 1 deletion markup_doc/labeling_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import json
import re
import requests
import unicodedata

from lxml import etree

Expand Down Expand Up @@ -674,6 +675,20 @@ def match_subsection(item, sections):
) else None


def normalize_text(text):
    """Return *text* reduced to a comparison-friendly form.

    Markup tags are removed, surrounding whitespace is trimmed, the text is
    lower-cased and combining accent marks are dropped, so ``"<i>Árbol</i>"``
    and ``"arbol"`` normalize to the same string.

    Args:
        text: Paragraph text, possibly containing inline tags. ``None`` is
            treated as an empty string instead of raising ``TypeError``.

    Returns:
        The normalized string (``""`` for ``None`` or empty input).
    """
    if text is None:
        return ''
    # Remove anything that looks like a tag (e.g. <i>, </b>).
    text = re.sub(r'<[^>]+>', '', text)
    text = text.strip().lower()
    # NFD splits an accented character into base letter + combining mark
    # (category "Mn"); dropping those marks leaves the unaccented letter.
    text = unicodedata.normalize('NFD', text)
    return ''.join(c for c in text if unicodedata.category(c) != 'Mn')


def comes_before_or_equal(obj1, obj2):
    """Tell whether the paragraph of *obj1* sorts at or before that of *obj2*.

    Each paragraph is read from ``obj['value']['paragraph']`` and passed
    through ``normalize_text`` before the string comparison.

    Args:
        obj1: Mapping that may hold a ``'value'`` mapping with a
            ``'paragraph'`` string.
        obj2: Same shape as *obj1*.

    Returns:
        ``True`` when the normalized paragraph of *obj1* is less than or
        equal to the normalized paragraph of *obj2*.
    """
    def _paragraph(obj):
        # dict.get's default only applies to a missing key; an explicit None
        # for 'value' or 'paragraph' must also count as empty text.
        value = obj.get('value') or {}
        return normalize_text(value.get('paragraph') or '')

    return _paragraph(obj1) <= _paragraph(obj2)


def create_labeled_object2(i, item, state, sections):
obj = {}
result = None
Expand All @@ -688,7 +703,14 @@ def create_labeled_object2(i, item, state, sections):
state['label'] = result.get('label')
state['body'] = result.get('body')

if state.get('body') and re.search(r"^(refer)", item.get('text').lower()) and match_section(item, sections):
text = item.get('text', '').strip().lower()

is_references_title = bool(re.fullmatch(
r"(?:referencias|references|referências)\s*[:.]?",
text
))

if state.get('body') and is_references_title:
state['label'] = '<sec>'
state['body'] = False
state['back'] = True
Expand Down
30 changes: 21 additions & 9 deletions markup_doc/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
create_labeled_object2,
get_data_first_block,
get_llm_model_name,
create_special_content_object
create_special_content_object,
comes_before_or_equal
)

from markup_doc.models import ProcessStatus
Expand Down Expand Up @@ -83,6 +84,8 @@ def get_labels(title, user_id):

next_item = None
obj_reference = []
obj_postreference = []
last_obj = None
llama_model = False

for i, item in enumerate(content):
Expand Down Expand Up @@ -234,8 +237,7 @@ def get_labels(title, user_id):

if item.get('text') is None or item.get('text') == '':
state['label_next'] = state['label_next_reset'] if state['reset'] else state['label_next']
if state['back']:
state['back'] = False
if state['back'] and num_ref > 0:
state['body'] = False
state['references'] = True
else:
Expand All @@ -262,13 +264,22 @@ def get_labels(title, user_id):
else:
stream_data_body.append(obj)
elif state['back']:
if state['label'] == '<sec>':
if state['references']:
obj_postreference.append(obj)
elif state['label'] == '<sec>':
stream_data_back.append(obj)
if state['label'] == '<p>':
num_ref = num_ref + 1
#obj = {}#process_reference(num_ref, obj, user_id)
obj_reference.append({"num_ref": num_ref, "obj": obj, "text": obj['value']['paragraph'],})
#stream_data_back.append(obj)
elif state['label'] == '<p>':
if last_obj is not None and not re.search(r"^(refer)",last_obj.get('value', {}).get('paragraph', '').strip().lower()):
if comes_before_or_equal(last_obj, obj):
num_ref = num_ref + 1
obj_reference.append({"num_ref": num_ref, "obj": obj, "text": obj['value']['paragraph'],})
else:
obj_postreference.append(obj)
state['references'] = True
else:
num_ref = num_ref + 1
obj_reference.append({"num_ref": num_ref, "obj": obj, "text": obj['value']['paragraph'],})
last_obj = obj
else:
stream_data.append(obj)

Expand Down Expand Up @@ -296,6 +307,7 @@ def get_labels(title, user_id):
output.extend(parsed) # Agrega a la lista de salida

stream_data_back.extend(process_references(num_refs, output))
stream_data_back.extend(obj_postreference)

article_docx_markup.content = stream_data
article_docx_markup.content_body = stream_data_body
Expand Down
2 changes: 2 additions & 0 deletions markup_doc/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -723,6 +723,8 @@ def get_xml(article_docx, data_front, data, data_back):
node_tit = etree.SubElement(node_reflist, 'title')
append_fragment(node_tit, d['value']['paragraph'])
if d['value']['label'] == '<p>':
if 'refid' not in d['value']:
continue
values = d['value']
node_ref = etree.SubElement(node_reflist, 'ref', attrib={"id": values['refid']})
#node_label = etree.SubElement(node_ref, 'label')
Expand Down
123 changes: 30 additions & 93 deletions markuplib/function_docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,15 +360,12 @@ def extrae_Tabla(element, rels_map, namespaces):
obj['formula'] = etree.tostring(mathml_root, pretty_print=True, encoding='unicode')


# obtiene id y nivel
if is_numPr:
numPr = paragraph.find('.//w:numPr', namespaces=paragraph.nsmap)
numId = numPr.find('.//w:numId', namespaces=paragraph.nsmap).get(namespaces_p + 'val')
type_matches = [
(key, objt)
for key, objt in list_types.items()
if objt.get('numId') == numId
]
if not obj_image:
paragraph = element
text_paragraph = []

# Determina si es parte de una lista
is_numPr = paragraph.find('.//w:numPr', namespaces=paragraph.nsmap) is not None

# obtiene id y nivel
if is_numPr:
Expand Down Expand Up @@ -400,76 +397,6 @@ def extrae_Tabla(element, rels_map, namespaces):
objl['list'] = '\n'.join(current_list)
current_list = []
content.append(objl)
list_type = 'bullet'
if type_matches and type_matches[0][1].get(str(0)) == 'decimal':
list_type = 'order'

current_list.append(f'[list list-type="{list_type}"]')
else:
#Se terminaron de agregar elementos a la lista
if len(current_list) > 0:
current_list.append('[/list]')
objl = {}
objl['type'] = 'list'
objl['list'] = '\n'.join(current_list)
current_list = []
content.append(objl)

for child in paragraph:
if child.tag == '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}hyperlink':
for r in child.findall('w:r', namespaces=child.nsmap):
t_elem = r.find('w:t', namespaces=child.nsmap)
if t_elem is not None and t_elem.text:
text_paragraph.append(t_elem.text)

elif child.tag == '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}r':
namespaces = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
sz_element = child.find('.//w:sz', namespaces=child.nsmap)
obj['font_size'] = 0

if sz_element is None:
p_pr = paragraph.find('.//w:rPr/w:sz', namespaces=child.nsmap)
if p_pr is not None:
sz_element = p_pr.find('.//w:pPr', namespaces=child.nsmap)

if sz_element is not None:
xml_string = etree.tostring(sz_element, pretty_print=True, encoding='unicode')
size_element = objectify.fromstring(xml_string)
font_size_value = size_element.get(namespaces+'val')
obj['font_size'] = int(font_size_value)/2

color_element = child.find('.//w:color', namespaces=child.nsmap)

if color_element is None:
p_pr = paragraph.find('.//w:pPr', namespaces=child.nsmap)
if p_pr is not None:
color_element = p_pr.find('.//w:rPr/w:color', namespaces=child.nsmap)

if color_element is not None:
xml_string_color = etree.tostring(color_element, pretty_print=True, encoding='unicode')
object_element = objectify.fromstring(xml_string_color)
color_value = object_element.get(namespaces + 'val')
obj['color'] = color_value

b_tag = child.find('.//w:b', namespaces=child.nsmap)

if b_tag is None:
p_pr = paragraph.find('.//w:rPr/w:b', namespaces=child.nsmap)
if p_pr is not None:
b_tag = p_pr.find('.//w:pPr', namespaces=child.nsmap)

if b_tag is not None:
val = b_tag.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val')
obj['bold'] = (val is None or val in ['1', 'true', 'True'])
else:
obj['bold'] = False

i_tag = child.find('.//w:i', namespaces=child.nsmap)

if i_tag is None:
p_pr = paragraph.find('.//w:rPr/w:i', namespaces=child.nsmap)
if p_pr is not None:
i_tag = p_pr.find('.//w:pPr', namespaces=child.nsmap)

for child in paragraph:
if child.tag == '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}hyperlink':
Expand All @@ -481,7 +408,7 @@ def extrae_Tabla(element, rels_map, namespaces):
elif child.tag == '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}r':
namespaces = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
sz_element = child.find('.//w:sz', namespaces=child.nsmap)
obj['font_size'] = 0
obj['font_size'] = 12

if sz_element is None:
p_pr = paragraph.find('.//w:rPr/w:sz', namespaces=child.nsmap)
Expand Down Expand Up @@ -572,21 +499,35 @@ def extrae_Tabla(element, rels_map, namespaces):
sections = [sections[-1]]
first_block = ''
tmp_content = []
abstract_mode = False
abstract_mode = False
abstract_started = False

for c in content:
if abstract_mode:
if not abstract_started:
if c['text'] == '' or c['spacing'] is True:
continue
else:
abstract_started = True
tmp_content.append(c)
continue

# empezó el abstract: sí encuentra vacío marca fin
if c['text'] == '' or c['spacing'] is True:
abstract_mode = False
abstract_started = False
continue
else:
tmp_content.append(c)
continue

if 'paraph' in c:
tmp_content.append(c)
abstract_mode = False
abstract_started = False
if c['paraph'] == '<abstract>':
abstract_mode = True
abstract_started = False
continue
else:
if 'text' in c:
Expand Down Expand Up @@ -648,17 +589,13 @@ def extrae_Tabla(element, rels_map, namespaces):
'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
}

if is_numPr:
if 'font_size' in obj:
del obj['font_size']
current_list.append(f'[list-item]{obj["text"]}[/list-item]')
if isinstance(obj['text'], list) and len(text_paragraph) > 0:
obj2 = {}
obj2['type'] = 'text'
obj2['value'] = ' '.join(text_paragraph)
obj['text'].append(obj2)
text_paragraph = []
if not is_numPr:
content.append(obj)
table = element
table_data = extrae_Tabla(element, hiperlinks_info, namespaces)
obj = {}
obj['type'] = 'table'
obj['table'] = table_data

if not is_numPr:
content.append(obj)
sections.sort(key=section_priority)
return sections, content
Loading