From b88d61a0df4e334f815e47103ea412c6feef6a48 Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Wed, 12 Jun 2024 17:05:55 +0800 Subject: [PATCH 01/39] feat(discpline_info.csv): remove this file --- data/discpline_info.csv | 154 ---------------------------------------- 1 file changed, 154 deletions(-) delete mode 100644 data/discpline_info.csv diff --git a/data/discpline_info.csv b/data/discpline_info.csv deleted file mode 100644 index 0a60b55..0000000 --- a/data/discpline_info.csv +++ /dev/null @@ -1,154 +0,0 @@ -discpline,num_papers,success,failure,processed,status -hep-ph,69164,0.0,0.0,0.0,TBD -quant-ph,66714,0.0,0.0,0.0,TBD -hep-th,54047,0.0,0.0,0.0,TBD -cs.CV,53498,0.0,0.0,0.0,TBD -cs.LG,49171,0.0,0.0,0.0,TBD -gr-qc,38174,0.0,0.0,0.0,TBD -cond-mat.mes-hall,37376,0.0,0.0,0.0,TBD -astro-ph.GA,37065,0.0,0.0,0.0,TBD -astro-ph.SR,36193,0.0,0.0,0.0,TBD -astro-ph.CO,34882,0.0,0.0,0.0,TBD -astro-ph.HE,31842,0.0,0.0,0.0,TBD -cond-mat.str-el,28712,0.0,0.0,0.0,TBD -cs.IT,26643,0.0,0.0,0.0,TBD -cs.CL,25202,0.0,0.0,0.0,TBD -cond-mat.mtrl-sci,24458,0.0,0.0,0.0,TBD -math-ph,22455,0.0,0.0,0.0,TBD -cond-mat.stat-mech,21981,0.0,0.0,0.0,TBD -nucl-th,18004,0.0,0.0,0.0,TBD -astro-ph,17200,0.0,0.0,0.0,TBD -astro-ph.EP,15999,0.0,0.0,0.0,TBD -cond-mat.soft,15697,0.0,0.0,0.0,TBD -cond-mat.supr-con,15066,0.0,0.0,0.0,TBD -hep-ex,13033,0.0,0.0,0.0,TBD -astro-ph.IM,12721,0.0,0.0,0.0,TBD -cond-mat.quant-gas,11842,0.0,0.0,0.0,TBD -cs.RO,11700,0.0,0.0,0.0,TBD -cs.CR,11659,0.0,0.0,0.0,TBD -cs.DS,11282,0.0,0.0,0.0,TBD -cs.AI,10798,0.0,0.0,0.0,TBD -stat.ML,10516,0.0,0.0,0.0,TBD -math.NA,9458,0.0,0.0,0.0,TBD -cs.NI,9055,0.0,0.0,0.0,TBD -hep-lat,8742,0.0,0.0,0.0,TBD -eess.SP,8584,0.0,0.0,0.0,TBD -cs.DC,8203,0.0,0.0,0.0,TBD -cs.LO,7619,0.0,0.0,0.0,TBD -nucl-ex,7077,0.0,0.0,0.0,TBD -eess.IV,6976,0.0,0.0,0.0,TBD -cs.SI,6638,0.0,0.0,0.0,TBD -math.OC,6442,0.0,0.0,0.0,TBD -eess.SY,6337,1378.0,806.0,2184.0,processing -cs.SE,6177,1379.0,783.0,2162.0,complete -cond-mat.dis-nn,4938,2102.0,1563.0,3665.0,complete -cs.IR,5350,1545.0,599.0,2144.0,complete -physics.ins-det,4967,1346.0,961.0,2307.0,processing -cs.GT,4808,1115.0,681.0,1796.0,complete -math.PR,3190,618.0,665.0,1283.0,complete -physics.soc-ph,3764,2119.0,1229.0,3348.0,processing -cs.SY,3822,2123.0,1258.0,3381.0,complete -math.CO,2687,850.0,479.0,1329.0,processing -math.AP,2036,490.0,570.0,1060.0,complete -cs.HC,4204,2817.0,1011.0,3828.0,complete -cs.NE,3690,2428.0,1074.0,3502.0,complete -cs.CC,2873,1750.0,1053.0,2803.0,processing -cs.CY,3550,2409.0,952.0,3361.0,complete -cs.DM,2667,1544.0,1030.0,2574.0,complete -eess.AS,3567,2692.0,718.0,3410.0,processing -cs.DB,3350,1538.0,1101.0,2639.0,processing -physics.optics,3130,1863.0,1124.0,2987.0,complete -cond-mat.other,2689,1514.0,1131.0,2645.0,complete -cs.PL,3234,841.0,1109.0,1950.0,processing -cs.SD,3329,2431.0,768.0,3199.0,complete -cs.CG,2963,1644.0,992.0,2636.0,processing -physics.atom-ph,2416,1411.0,926.0,2337.0,complete -physics.comp-ph,2357,1408.0,854.0,2262.0,complete -physics.chem-ph,2203,1323.0,741.0,2064.0,complete -physics.flu-dyn,2124,1185.0,874.0,2059.0,complete -cs.FL,1843,1000.0,804.0,1804.0,complete -math.DG,836,464.0,359.0,823.0,complete -cs.CE,1826,1102.0,655.0,1757.0,complete -cs.MA,1457,880.0,494.0,1374.0,complete -physics.bio-ph,1317,790.0,468.0,1258.0,complete -cs.GR,1446,827.0,539.0,1366.0,complete -math.AG,700,433.0,262.0,695.0,complete -econ.EM,1359,770.0,533.0,1303.0,complete -q-fin.ST,1164,732.0,427.0,1159.0,complete -nlin.SI,533,337.0,192.0,529.0,complete -cs.AR,1318,750.0,496.0,1246.0,complete 
-math.DS,869,530.0,329.0,859.0,complete -math.ST,1131,603.0,512.0,1115.0,complete -physics.plasm-ph,1037,606.0,397.0,1003.0,complete -math.SP,444,228.0,202.0,430.0,complete -stat.ME,1135,647.0,446.0,1093.0,complete -q-fin.MF,836,481.0,348.0,829.0,complete -physics.app-ph,1185,709.0,398.0,1107.0,complete -math.QA,351,211.0,140.0,351.0,complete -cs.MM,1069,758.0,257.0,1015.0,complete -cs.ET,1088,690.0,335.0,1025.0,complete -q-bio.NC,965,558.0,368.0,926.0,complete -q-fin.PR,690,355.0,332.0,687.0,complete -physics.gen-ph,432,254.0,175.0,429.0,complete -econ.GN,979,535.0,366.0,901.0,complete -nlin.CD,778,410.0,362.0,772.0,complete -cs.DL,893,543.0,314.0,857.0,complete -physics.data-an,851,443.0,385.0,828.0,complete -cs.SC,670,329.0,314.0,643.0,complete -q-bio.PE,830,422.0,384.0,806.0,complete -q-fin.RM,682,375.0,302.0,677.0,complete -econ.TH,759,374.0,345.0,719.0,complete -physics.hist-ph,501,280.0,210.0,490.0,complete -cs.NA,759,421.0,321.0,742.0,complete -q-fin.GN,647,418.0,228.0,646.0,complete -q-fin.CP,723,389.0,322.0,711.0,complete -q-fin.PM,609,194.0,411.0,605.0,complete -stat.AP,767,456.0,285.0,741.0,complete -q-fin.TR,685,390.0,285.0,675.0,complete -q-bio.QM,766,446.0,273.0,719.0,complete -cs.PF,732,435.0,278.0,713.0,complete -cs.MS,728,388.0,309.0,697.0,complete -math.CA,308,193.0,116.0,309.0,complete -math.NT,314,210.0,94.0,304.0,complete -physics.class-ph,439,269.0,167.0,436.0,complete -math.LO,458,269.0,180.0,449.0,complete -math.FA,225,131.0,91.0,222.0,complete -physics.space-ph,603,298.0,281.0,579.0,complete -math.RT,222,136.0,86.0,222.0,complete -cs.OH,534,350.0,165.0,515.0,complete -nlin.PS,522,340.0,175.0,515.0,complete -q-bio.BM,436,262.0,154.0,416.0,complete -nlin.AO,387,225.0,153.0,378.0,complete -physics.med-ph,429,262.0,141.0,403.0,complete -math.OA,119,69.0,48.0,117.0,complete -physics.acc-ph,372,197.0,160.0,357.0,complete -stat.CO,392,195.0,179.0,374.0,complete -physics.geo-ph,361,220.0,132.0,352.0,complete -math.RA,125,82.0,40.0,122.0,complete -math.AT,248,119.0,126.0,245.0,complete -math.GT,232,128.0,104.0,232.0,complete -q-bio.MN,277,160.0,103.0,263.0,complete -math.GR,175,91.0,57.0,148.0,complete -math.SG,139,66.0,70.0,136.0,complete -cs.OS,269,141.0,121.0,262.0,complete -math.MG,220,138.0,79.0,217.0,complete -physics.ao-ph,265,139.0,105.0,244.0,complete -q-fin.EC,187,121.0,65.0,186.0,complete -physics.pop-ph,189,112.0,72.0,184.0,complete -math.CT,233,88.0,110.0,198.0,complete -q-bio.GN,222,131.0,73.0,204.0,complete -physics.ed-ph,178,98.0,77.0,175.0,complete -math.CV,61,30.0,30.0,60.0,complete -physics.atm-clus,117,78.0,38.0,116.0,complete -q-bio.CB,114,65.0,44.0,109.0,complete -q-bio.SC,122,73.0,43.0,116.0,complete -math.HO,81,50.0,30.0,80.0,complete -math.AC,55,36.0,18.0,54.0,complete -nlin.CG,90,55.0,35.0,90.0,complete -stat.OT,46,28.0,18.0,46.0,complete -q-bio.TO,48,34.0,14.0,48.0,complete -math.KT,18,10.0,8.0,18.0,complete -cs.GL,53,21.0,7.0,28.0,complete -math.GN,25,17.0,8.0,25.0,complete -q-bio.OT,28,12.0,16.0,28.0,complete -math.GM,8,2.0,6.0,8.0,complete From d51d638d8ff88a3af466e2ddf9c0bf7c834b3684 Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Wed, 12 Jun 2024 17:16:09 +0800 Subject: [PATCH 02/39] feat(block.py): add type hint --- vrdu/block.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/vrdu/block.py b/vrdu/block.py index 0c46dcc..c0c228e 100644 --- a/vrdu/block.py +++ b/vrdu/block.py @@ -1,7 +1,7 @@ from dataclasses import dataclass -from typing import Dict, List, Tuple +from typing import 
Dict, List, Optional, Tuple, Union -from pyparsing import Any +from typing import Any @dataclass @@ -34,17 +34,17 @@ def __repr__(self) -> str: def __getitem__(self, index: int) -> float: return (self.x0, self.y0, self.x1, self.y1)[index] - def area(self): + def area(self) -> float: return abs((self.x1 - self.x0) * (self.y1 - self.y0)) - def overlap(self, other): + def overlap(self, other) -> float: if ( self.x0 > other.x1 or self.x1 < other.x0 or self.y0 > other.y1 or self.y1 < other.y0 ): - return 0 + return 0.0 x_overlap = max(0, min(self.x1, other.x1) - max(self.x0, other.x0)) y_overlap = max(0, min(self.y1, other.y1) - max(self.y0, other.y0)) return x_overlap * y_overlap @@ -75,16 +75,16 @@ class Block: def __init__( self, - block_id: int = None, - bounding_box: BoundingBox = None, - category: int = None, - page_index: int = None, - previous_block: int = None, - parent_block: int = None, - next_block: int = None, - source_code: str = None, - labels: List[str] = None, - references: List[str] = None, + bounding_box: Optional[BoundingBox] = None, + block_id: Optional[int] = None, + category: Optional[int] = None, + page_index: Optional[int] = None, + previous_block: Optional[int] = None, + parent_block: Optional[int] = None, + next_block: Optional[int] = None, + source_code: Optional[str] = None, + labels: Optional[List[str]] = None, + references: Optional[List[str]] = None, ) -> None: if not block_id: self.id = Block.current_id @@ -106,7 +106,7 @@ def __repr__(self) -> str: return f"Block(id={self.id}, category={self.category}, page_index={self.page_index}, bbox={self.bbox}), source_code={self.source_code}" @property - def bbox(self): + def bbox(self) -> Union[BoundingBox, None]: return self._bounding_box @bbox.setter @@ -152,7 +152,7 @@ def page_index(self, value: int) -> None: @property def source_code(self) -> str: return self._source_code - + @source_code.setter def source_code(self, value: str) -> None: self._source_code = value From bd2d2e78af0b51f86de0d7e150d650a8d6de1c28 Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Wed, 12 Jun 2024 17:16:45 +0800 Subject: [PATCH 03/39] refactor(block.py): make block field required instead of optional --- vrdu/block.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vrdu/block.py b/vrdu/block.py index c0c228e..cff84fd 100644 --- a/vrdu/block.py +++ b/vrdu/block.py @@ -75,7 +75,7 @@ class Block: def __init__( self, - bounding_box: Optional[BoundingBox] = None, + bounding_box: BoundingBox, block_id: Optional[int] = None, category: Optional[int] = None, page_index: Optional[int] = None, From bf0decee29c70f46a7463c54eb0164bcf65a015d Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Wed, 12 Jun 2024 17:30:40 +0800 Subject: [PATCH 04/39] refactor(batch_process.py, utils.py): move extract_tex_files to batch_process --- batch_process.py | 25 +++++++++++++------------ vrdu/utils.py | 21 --------------------- 2 files changed, 13 insertions(+), 33 deletions(-) diff --git a/batch_process.py b/batch_process.py index 2505a28..9185a72 100644 --- a/batch_process.py +++ b/batch_process.py @@ -2,12 +2,11 @@ import argparse import multiprocessing import shutil -from typing import List, Optional +from typing import List from uuid import uuid4 import pandas as pd from vrdu import logger -from vrdu import utils from main import process_one_file log_file = str(uuid4()) + ".log" @@ -16,19 +15,22 @@ database = "data/processed_paper_database.csv" -def filter_tex_files( - tex_files: List[str], main_path: Optional[str] = None -) -> List[str]: - 
"""extract all MAIN.tex files for processing, if main_path is not None, then - only extract MAIN.tex files in the main_path (not recursive) +def filter_tex_files(discipline_path: str) -> List[str]: + """extract all MAIN.tex files for processing, if discipline_path is not None, then + only extract MAIN.tex files in the discipline_path (not recursive) Args: - tex_files (List[str]): list of tex files - main_path (str): path to main directory. + discipline_path (str): path to main directory. Returns: List[str]: list of tex files that are compilable. """ + tex_files = [] + + for root, _, files in os.walk(discipline_path): + tex_files.extend( + [os.path.join(root, file) for file in files if file.endswith(".tex")] + ) # TODO: move this to config redundant_tex_files = [ @@ -47,7 +49,7 @@ def filter_tex_files( # ensure the tex files inside a subfolder is not included # ex: cs.AI/1234.4567/figs/draw.tex will be excluded - if main_path and os.path.dirname(os.path.dirname(tex_file)) != main_path: + if os.path.dirname(os.path.dirname(tex_file)) != discipline_path: continue # make sure the tex file is compilable (main document) @@ -89,8 +91,7 @@ def process_one_discipline(path: str, cpu_count: int, discipline: str) -> None: discipline_path = os.path.join(path, discipline) log.info(f"[VRDU] Path to raw data: {discipline_path}") log.info(f"[VRDU] Using cpu counts: {cpu_count}") - tex_files = utils.extract_all_tex_files(discipline_path) - tex_files = filter_tex_files(tex_files, discipline_path) + tex_files = filter_tex_files(discipline_path) try: with multiprocessing.Pool(cpu_count) as pool: diff --git a/vrdu/utils.py b/vrdu/utils.py index 22eda88..14c57f4 100755 --- a/vrdu/utils.py +++ b/vrdu/utils.py @@ -15,26 +15,6 @@ from vrdu.config import config -def extract_all_tex_files(path) -> List[str]: - """ - Given a path, this function extracts all the .tex files within the - specified directory and its subdirectories. - - Args: - path (str): The path to the directory where the .tex files are located. - - Returns: - List[str]: A list of paths to the .tex files found. - """ - tex_files = [] - - for root, _, files in os.walk(path): - tex_files.extend( - [os.path.join(root, file) for file in files if file.endswith(".tex")] - ) - return tex_files - - def export_to_json(data, file_path) -> None: """ Write the contents of a dictionary to a JSON file. @@ -221,7 +201,6 @@ def convert_eps_image_to_pdf_image(eps_image_path: str, pdf_image_path: str): subprocess.run(["epspdf", eps_image_path, pdf_image_path]) - def extract_macro_definitions(tex_file) -> List[str]: """ Extracts macro definitions from a given tex file. From 903f291e6ece2c6ee6fdbfbbd7b05db12ca2bcdb Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Wed, 12 Jun 2024 17:32:37 +0800 Subject: [PATCH 05/39] refactor(utils.py): optimize load and export logic --- vrdu/utils.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/vrdu/utils.py b/vrdu/utils.py index 14c57f4..5394b82 100755 --- a/vrdu/utils.py +++ b/vrdu/utils.py @@ -4,7 +4,7 @@ import re import subprocess import json -from typing import Any, Dict, List +from typing import Any, Dict, List, Union import uuid @@ -15,27 +15,27 @@ from vrdu.config import config -def export_to_json(data, file_path) -> None: +def export_to_json(data: Union[Dict, List], file_path: str) -> None: """ - Write the contents of a dictionary to a JSON file. + Write the contents of a dictionary or a list to a JSON file. Parameters: - data (dict): The dictionary to be written to the file. 
+ data (Union[Dict, List]): The dictionary to be written to the file. file_path (str): The path to the JSON file. """ with open(file_path, "w") as json_file: json.dump(data, json_file, indent=4) -def load_json(file_path) -> Any: +def load_json(file_path: str) -> Union[Dict, List]: """ - Load a JSON file into a dictionary. + Load a JSON file into a dictionary or a list. Parameters: file_path (str): The path to the JSON file. Returns: - dict: The loaded JSON data as a dictionary. + Union[Dict, List]: The loaded JSON data as a dictionary or a list. """ with open(file_path, "r") as json_file: data = json.load(json_file) From 912d10df03c091d9195284670c0b458e558ec385 Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Wed, 12 Jun 2024 17:38:00 +0800 Subject: [PATCH 06/39] refactor(utils.py, tests/): remove unused functions --- tests/test_compile_check.py | 19 ---- tests/test_extract_macro_definitions.py | 104 ------------------- tests/test_extract_title_name.py | 17 ---- vrdu/utils.py | 126 +----------------------- 4 files changed, 1 insertion(+), 265 deletions(-) delete mode 100644 tests/test_compile_check.py delete mode 100644 tests/test_extract_macro_definitions.py delete mode 100644 tests/test_extract_title_name.py diff --git a/tests/test_compile_check.py b/tests/test_compile_check.py deleted file mode 100644 index b1adbd8..0000000 --- a/tests/test_compile_check.py +++ /dev/null @@ -1,19 +0,0 @@ -import unittest -import unittest.mock -import os - -from vrdu.utils import compile_check - - -class TestGraphics(unittest.TestCase): - def test_equation1(self): - self.assertEqual(compile_check(r"\begin{equation}a \end{equation}"), True) - - temp_files = [file for file in os.listdir(".") if file.startswith("temp")] - self.assertEqual(len(temp_files), 0) - - def test_equation2(self): - self.assertEqual(compile_check(r"\begin{equation}\e\end{equation}"), False) - - temp_files = [file for file in os.listdir(".") if file.startswith("temp")] - self.assertEqual(len(temp_files), 0) diff --git a/tests/test_extract_macro_definitions.py b/tests/test_extract_macro_definitions.py deleted file mode 100644 index 72efeec..0000000 --- a/tests/test_extract_macro_definitions.py +++ /dev/null @@ -1,104 +0,0 @@ -import unittest -import unittest.mock - -from vrdu.utils import extract_macro_definitions - - -class TestExtractMacroDefinitions(unittest.TestCase): - def setUp(self) -> None: - self.mock_file_content1 = ( - """\\documentclass{article}\\begin{document}\\end{document}""" - ) - self.mock_file_content2 = r"""\documentclass{article} - \newcommand{\Sin}{\mathrm{sin}\,\theta} - \newcommand{\Cos}{\mathrm{cos}\,\theta} - \newcommand{\Tan}{\mathrm{tan}\,\theta} - \begin{document} - \[ \Tan = \frac{\Sin}{\Cos} \] \[ (\Sin)^2 + (\Cos)^2 =1 \] - \[ \cot\theta = \frac{\Cos}{\Sin} \] - \end{document} - """ - self.mock_file_content3 = r"""\documentclass{article} - \newcommand{\trig}[1]{\mathrm{\#1}\,\theta} - \begin{document} - \[ \trig{sin},\,\trig{cos},\,\trig{tan} \] - \[ \trig{tan} = \frac{\trig{sin}}{\trig{cos}} \] - \[ \trig{sin^2} + \trig{cos^2} =1 \] - \[ \int \frac{\trig{cos^3}}{1+\trig{sin^2}}d\theta \] - \end{document} - """ - self.mock_file_content4 = r"""\documentclass{article} - \newcommand{\trig}[2]{\mathrm{\#1}\left(\#2\right)} - \newcommand{\Int}[2]{\int_{\#2}^{\#1}} - \begin{document} - \[ \int\frac{du}{\sqrt{a^2 + u^2}}=\trig{sin^{\!-1}}{\frac{u}{a}} + C \] - \[ \int\trig{sec}{\frac{a}{x}}dx = \frac{1}{a} \log\trig{tan}{\frac{\pi}{4}+ \frac{a}{2x}} + C \] - \[ \Int{a}{b}f(x)dx = \sum_{k=1}^n 
\trig{sin}{5+\frac{3k}{n}} \] - \[ \Int{b}{a}f(x)dx = \lim_{n \to \infty} \sum_{i=1}^{n}f(x_i)\delta x \] - \end{document} - """ - self.mock_file_content5 = r"""\documentclass{article} - \usepackage{xcolor} - \newcommand{\trig}[3][]{\mathrm{\#2^{\#1}}\left(\#3\right)} - \newcommand{\trigx}[3][]{\mathrm{\#2}\left({\color{\#1}\#3}\right)} - \begin{document} - \[ \trig{sin}{\alpha}, \trig[n]{sin}{\beta},\trig[m]{sin}{\gamma} \] - \[ \trigx[red]{cos}{2\theta}-\trigx[blue]{sin}{2\theta}=\trigx[green]{cos}{4\theta} \] - \[ \theta=\trigx[red]{tan^{-1}}{\frac{x}{y}},\trigx[red]{tan}{\alpha+\beta}=\frac{\trigx[blue]{tan}{\alpha}+\trigx[blue]{tan}{\beta}}{1-\trigx[blue]{tan}{\alpha}\trigx[blue]{tan}{\beta}}\] - \end{document} - """ - - def test_no_macro(self): - with unittest.mock.patch( - "builtins.open", - new=unittest.mock.mock_open(read_data=self.mock_file_content1), - create=True, - ) as file_mock: - result = extract_macro_definitions(file_mock) - self.assertEqual(result, []) - - def test_no_arguments(self): - with unittest.mock.patch( - "builtins.open", - new=unittest.mock.mock_open(read_data=self.mock_file_content2), - create=True, - ) as file_mock: - result = extract_macro_definitions(file_mock) - self.assertEqual( - result, - [ - r"\newcommand{\Sin}{\mathrm{sin}\,\theta}", - r"\newcommand{\Cos}{\mathrm{cos}\,\theta}", - r"\newcommand{\Tan}{\mathrm{tan}\,\theta}", - ], - ) - - def test_more_than_one_arguments(self): - with unittest.mock.patch( - "builtins.open", - new=unittest.mock.mock_open(read_data=self.mock_file_content4), - create=True, - ) as file_mock: - result = extract_macro_definitions(file_mock) - self.assertEqual( - result, - [ - r"\newcommand{\trig}[2]{\mathrm{\#1}\left(\#2\right)}", - r"\newcommand{\Int}[2]{\int_{\#2}^{\#1}}", - ], - ) - - def test_optional_arguments(self): - with unittest.mock.patch( - "builtins.open", - new=unittest.mock.mock_open(read_data=self.mock_file_content5), - create=True, - ) as file_mock: - result = extract_macro_definitions(file_mock) - self.assertEqual( - result, - [ - r"\newcommand{\trig}[3][]{\mathrm{\#2^{\#1}}\left(\#3\right)}", - r"\newcommand{\trigx}[3][]{\mathrm{\#2}\left({\color{\#1}\#3}\right)}", - ], - ) diff --git a/tests/test_extract_title_name.py b/tests/test_extract_title_name.py deleted file mode 100644 index cf6093f..0000000 --- a/tests/test_extract_title_name.py +++ /dev/null @@ -1,17 +0,0 @@ -import unittest - - -from vrdu.utils import extract_title_name - - -class TestExtractTitleName(unittest.TestCase): - def test_title_name(self): - self.assertEqual(extract_title_name("\\section{Name}"), "section") - self.assertEqual(extract_title_name("\\subsection*{AnotherName}"), "subsection") - self.assertEqual(extract_title_name("No match"), "") - self.assertEqual( - extract_title_name("\\subsubsection{No match}"), "subsubsection" - ) - self.assertEqual( - extract_title_name("\\subsubsection*{No match}"), "subsubsection" - ) diff --git a/vrdu/utils.py b/vrdu/utils.py index 5394b82..b55e2b4 100755 --- a/vrdu/utils.py +++ b/vrdu/utils.py @@ -42,76 +42,6 @@ def load_json(file_path: str) -> Union[Dict, List]: return data -def compile_check(source_code: str) -> bool: - """ - check if the source code can be compiled, - used to check if there are macros in the source code. 
- """ - prefix = r""" - \documentclass{article} - \usepackage{amsmath} - \usepackage{amssymb} - \usepackage{amsfonts, bm} - \usepackage{amsthm} - \usepackage{array} - \usepackage{tabularx} - \usepackage{multirow} - \usepackage{booktabs} - \begin{document} - """ - - suffix = r""" - \end{document} - """ - temp_filename = str(uuid.uuid4()) - content = prefix + source_code + suffix - with open(f"{temp_filename}.tex", "w") as f: - f.write(content) - - result = True - try: - subprocess.run( - ["pdflatex", "-halt-on-error", f"{temp_filename}.tex"], check=True - ) - except subprocess.CalledProcessError: - result = False - finally: - # remove files - files = glob.glob(f"{os.getcwd()}/{temp_filename}.*") - for file in files: - os.remove(file) - - return result - - -def get_main_content(data): - """ - Generate the main content of a document. - - Parameters: - - data (list): A list of dictionaries representing the document. - - Returns: - - main_content (str): The main content of the document. - - Raises: - - Exception: If the document is not found in the data. - - """ - main_content = None - main_content_index = None - for index, item in enumerate(data): - if isinstance(item, dict) and "document" in item: - main_content = item["document"][1] - main_content_index = index - break - - if main_content is None: - raise Exception("document not found") - - return main_content, main_content_index - - def compile_latex(file: str): """ Compile a LaTeX file using either pdflatex or xelatex as the tex engine. @@ -201,35 +131,6 @@ def convert_eps_image_to_pdf_image(eps_image_path: str, pdf_image_path: str): subprocess.run(["epspdf", eps_image_path, pdf_image_path]) -def extract_macro_definitions(tex_file) -> List[str]: - """ - Extracts macro definitions from a given tex file. - - Args: - tex_file (str): The path to the tex file. - - Returns: - List[str]: A list of macro definitions extracted from the tex file. - """ - macro_patterns = [ - r"\\newcommand{[^}]+}", - r"\\renewcommand{[^}]+}", - r"\\newenvironment{[^}]+}", - r"\\renewenvironment{[^}]+}", - ] - - macros = [] - with open(tex_file, "r") as file: - text_lines = file.readlines() - macros = [ - line.strip() - for line in text_lines - if any(re.findall(pattern, line) for pattern in macro_patterns) - ] - - return macros - - def export_to_coco( layout_info: Dict[int, List[Block]], image_infos: Dict[int, Dict[str, Any]], @@ -290,32 +191,7 @@ def export_to_coco( } result["annotations"].append(annotation) - export_to_json(result, filename) - - -def extract_title_name(title) -> str: - """ - Extracts the name of a title from its format. - - Args: - title (str): The title string to extract the name from. - - Returns: - str: The extracted title environment name from the title. 
-
-    Example:
-    >>> extract_title_name("\\section{Name}")
-    'section'
-    >>> extract_title_name("\\subsection*{AnotherName}")
-    'subsection'
-    >>> extract_title_name("No match")
-    ''
-    """
-    match = re.search(r"\\(\w+)(\*?){(.*)}", title)
-    if match:
-        return match.group(1)
-
-    return ""
+    export_to_json(result, file_path)
 
 
 def colorize(text: str, category_name: str) -> str:
 
From 4d1363d383018fb7371b6255e718b11834d1f006 Mon Sep 17 00:00:00 2001
From: MaoSong2022 
Date: Wed, 12 Jun 2024 17:38:49 +0800
Subject: [PATCH 07/39] refactor(utils.py, layout_annotation.py): use consistent argument names

---
 vrdu/layout_annotation.py |  2 +-
 vrdu/utils.py             | 10 ++++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/vrdu/layout_annotation.py b/vrdu/layout_annotation.py
index b4a186b..3e42260 100644
--- a/vrdu/layout_annotation.py
+++ b/vrdu/layout_annotation.py
@@ -531,7 +531,7 @@ def annotate(self):
             self.result_directory, "layout_annotation.json"
         )
         utils.export_to_coco(
-            layout_info, image_annotation, filename=layout_annotation_file
+            layout_info, image_annotation, file_path=layout_annotation_file
         )
 
         # step3: generate reading annotation
diff --git a/vrdu/utils.py b/vrdu/utils.py
index b55e2b4..99cf7ad 100755
--- a/vrdu/utils.py
+++ b/vrdu/utils.py
@@ -134,15 +134,17 @@ def convert_eps_image_to_pdf_image(eps_image_path: str, pdf_image_path: str):
 def export_to_coco(
     layout_info: Dict[int, List[Block]],
     image_infos: Dict[int, Dict[str, Any]],
-    filename: str,
+    file_path: str,
 ) -> None:
     """
     Export the given layout information and image information to a COCO format JSON file.
 
     Args:
-        layout_info (Dict[int, List[Block]]): A dictionary mapping page indices to lists of Block objects.
-        image_infos (Dict[int, Dict[str, Any]]): A dictionary mapping page indices to dictionaries containing image information.
-        filename (str): The name of the output JSON file.
+        layout_info (Dict[int, List[Block]]):
+            A dictionary mapping page indices to lists of Block objects.
+        image_infos (Dict[int, Dict[str, Any]]):
+            A dictionary mapping page indices to dictionaries containing image information.
+        file_path (str): The path to the output JSON file.
 
     Returns:
     None
 
From a7e5c9509074e06d0073961b656e0f512d8458db Mon Sep 17 00:00:00 2001
From: MaoSong2022 
Date: Wed, 12 Jun 2024 17:40:25 +0800
Subject: [PATCH 08/39] fix(block.py): return type error

---
 vrdu/block.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vrdu/block.py b/vrdu/block.py
index cff84fd..00dd0d3 100644
--- a/vrdu/block.py
+++ b/vrdu/block.py
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Dict, List, Optional, Tuple
 
 from typing import Any
 
@@ -106,7 +106,7 @@ def __repr__(self) -> str:
         return f"Block(id={self.id}, category={self.category}, page_index={self.page_index}, bbox={self.bbox}), source_code={self.source_code}"
 
     @property
-    def bbox(self) -> Union[BoundingBox, None]:
+    def bbox(self) -> BoundingBox:
         return self._bounding_box
 
     @bbox.setter
From 4cb0b8f181d33413fab3cb78d517fb7eb024587a Mon Sep 17 00:00:00 2001
From: MaoSong2022 
Date: Wed, 12 Jun 2024 17:43:26 +0800
Subject: [PATCH 09/39] style(block.py): format f-string

---
 vrdu/block.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/vrdu/block.py b/vrdu/block.py
index 00dd0d3..e494ec5 100644
--- a/vrdu/block.py
+++ b/vrdu/block.py
@@ -103,7 +103,11 @@ def __init__(
         self._references = references
 
     def __repr__(self) -> str:
-        return f"Block(id={self.id}, category={self.category}, page_index={self.page_index}, bbox={self.bbox}), source_code={self.source_code}"
+        return (
+            f"Block(id={self.id}, category={self.category}, "
+            f"page_index={self.page_index}, bbox={self.bbox}), "
+            f"source_code={self.source_code}"
+        )
 
     @property
     def bbox(self) -> BoundingBox:
From e8be463a780da9cd9685eccb3d5ce2d40a9d6337 Mon Sep 17 00:00:00 2001
From: MaoSong2022 
Date: Wed, 12 Jun 2024 17:45:14 +0800
Subject: [PATCH 10/39] style(utils.py): remove unused imports

---
 vrdu/utils.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/vrdu/utils.py b/vrdu/utils.py
index 99cf7ad..e0bc388 100755
--- a/vrdu/utils.py
+++ b/vrdu/utils.py
@@ -1,11 +1,7 @@
-import csv
-import glob
 import os
-import re
 import subprocess
 import json
 from typing import Any, Dict, List, Union
-import uuid
 
 
 from pdf2image import pdf2image
From 3d84de9b4891ed6481b0654e118f6f1d2cd68aab Mon Sep 17 00:00:00 2001
From: MaoSong2022 
Date: Wed, 12 Jun 2024 17:45:49 +0800
Subject: [PATCH 11/39] docs(utils.py): fix wrong docstring

---
 vrdu/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vrdu/utils.py b/vrdu/utils.py
index e0bc388..aecefe8 100755
--- a/vrdu/utils.py
+++ b/vrdu/utils.py
@@ -38,9 +38,9 @@ def load_json(file_path: str) -> Union[Dict, List]:
     return data
 
 
-def compile_latex(file: str):
+def compile_latex(file: str) -> None:
     """
-    Compile a LaTeX file using either pdflatex or xelatex as the tex engine.
+    Compile a LaTeX file using the pdflatex engine.
 
     Parameters:
     file (str): The path to the LaTeX file to be compiled.
From ac67642ad713d04372dea858909512a8b59d2b3f Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Wed, 12 Jun 2024 17:52:56 +0800 Subject: [PATCH 12/39] feat(block.py, layout_annotation.py): use consistent argument type --- vrdu/block.py | 3 +++ vrdu/layout_annotation.py | 8 ++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/vrdu/block.py b/vrdu/block.py index e494ec5..90d9aaf 100644 --- a/vrdu/block.py +++ b/vrdu/block.py @@ -52,6 +52,9 @@ def overlap(self, other) -> float: def to_dict(self) -> Dict[str, Any]: return {"bbox": (self.x0, self.y0, self.x1, self.y1)} + def to_tuple(self) -> Tuple[float, float, float, float]: + return (self.x0, self.y0, self.x1, self.y1) + @classmethod def from_dict(cls, data: Dict[str, Any]): return cls( diff --git a/vrdu/layout_annotation.py b/vrdu/layout_annotation.py index 3e42260..b6b6ae2 100644 --- a/vrdu/layout_annotation.py +++ b/vrdu/layout_annotation.py @@ -492,7 +492,9 @@ def generate_image_annotation( for element in layout_info[page_index]: category = element.category draw.rectangle( - element.bbox, outline=config.colors_map[str(category)], width=3 + element.bbox.to_tuple(), + outline=config.colors_map[str(category)], + width=3, ) draw.text( (element.bbox[0], element.bbox[1]), @@ -609,7 +611,9 @@ def generate_geometry_annotation( for index, element in enumerate(layout_elements): category = element.category - draw.rectangle(element.bbox, outline=config.colors_map[str(category)], width=3) + draw.rectangle( + element.bbox.to_tuple(), outline=config.colors_map[str(category)], width=3 + ) draw.text( (element.bbox[0], element.bbox[1]), config.category2name[category], From 64facb6fc892b28d769b71e5123b7cf7d979d9a5 Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Wed, 12 Jun 2024 17:53:52 +0800 Subject: [PATCH 13/39] fix(layout_annotation.py): typo --- vrdu/layout_annotation.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/vrdu/layout_annotation.py b/vrdu/layout_annotation.py index b6b6ae2..31b7a1f 100644 --- a/vrdu/layout_annotation.py +++ b/vrdu/layout_annotation.py @@ -2,7 +2,7 @@ import os import glob import subprocess -from typing import Any, DefaultDict, Dict, List, Tuple +from typing import Any, DefaultDict, Dict, List import matplotlib.pyplot as plt import numpy as np from skimage.measure import label, regionprops @@ -143,7 +143,7 @@ def parse_metadata(self, pdf_layouts: List[LTPage]) -> None: self.layout_metadata = layout_metadata - def retrive_figure_source_code( + def retrieve_figure_source_code( self, figure_layout_info: Dict[int, List[Block]] ) -> None: """Retrieves the source code of a figure using synctex. @@ -209,7 +209,7 @@ def generate_figure_bb(self, pdf_layouts: List[LTPage]) -> Dict[int, List[Block] if not isinstance(element, LTFigure): continue # the coordinate system of Pdfminer is in contrast to the coordinate system of the image - # by fliping the y axis + # by flipping the y axis y0 = height - element.bbox[3] y1 = height - element.bbox[1] x0 = element.bbox[0] @@ -224,7 +224,7 @@ def generate_figure_bb(self, pdf_layouts: List[LTPage]) -> Dict[int, List[Block] ) # find the corresponding source code to figure bounding box - self.retrive_figure_source_code(layout_info) + self.retrieve_figure_source_code(layout_info) # convert bounding boxes from PDF coordinate system to image coordinate system self.transform(layout_info) @@ -304,11 +304,11 @@ def generate_non_figure_bb(self) -> Dict[int, List[Block]]: # We do not consider the cross column case for these envs. 
if category in envs.one_column_envs: - bboxes = [bb for bb in bounding_boxes] - if len(bboxes) == 0: + bounding_boxes = [bb for bb in bounding_boxes] + if len(bounding_boxes) == 0: continue element = Block( - bounding_box=BoundingBox.from_list(bboxes), + bounding_box=BoundingBox.from_list(bounding_boxes), source_code=self.text_info[category][index], category=config.name2category[category], page_index=page_index, From 61255a307c2eba2e76069385a44a68a27123e5ff Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Wed, 12 Jun 2024 17:55:32 +0800 Subject: [PATCH 14/39] fix(order_annotation.py, renderer.py): typo --- vrdu/order_annotation.py | 10 +++++----- vrdu/renderer.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vrdu/order_annotation.py b/vrdu/order_annotation.py index 3462d86..868873c 100644 --- a/vrdu/order_annotation.py +++ b/vrdu/order_annotation.py @@ -163,7 +163,7 @@ def generate_float_envs_order(self): block.labels = re.findall(label_pattern, block.source_code) # 2. add labels for float envs - # find the intetval of tables + # find the interval of tables category_to_patterns = { "Table": re.compile( r"\\begin\{table\*?\}(.*?)\\end\{table\*?\}", re.DOTALL @@ -176,18 +176,18 @@ def generate_float_envs_order(self): ), } - category_to_indicdes = {} + category_to_indices = {} for category, pattern in category_to_patterns.items(): - category_to_indicdes[category] = [] + category_to_indices[category] = [] indices = pattern.finditer(latex_content) # we add a uuid to match for float environments in case # there are no explicit cite for _match in indices: - category_to_indicdes[category].append( + category_to_indices[category].append( (_match.start(), _match.end(), str(uuid4())) ) - for category_name, indices in category_to_indicdes.items(): + for category_name, indices in category_to_indices.items(): # find labels for those float environments for block in self.annotations["annotations"]: if config.category2name[block.category] != category_name: diff --git a/vrdu/renderer.py b/vrdu/renderer.py index 73a0bf3..5451bb4 100644 --- a/vrdu/renderer.py +++ b/vrdu/renderer.py @@ -718,7 +718,7 @@ def _render_float_envs(self, content: str, pattern: str, category: str) -> str: result += content[indexes[i - 1][1] : indexes[i][0]] float_env = content[indexes[i][0] : indexes[i][1]] - # filter tablle of figures + # filter table of figures if category == "Table" and float_env.find("\\includegraphics") != -1: continue From b21f39d8ad996a611fc57c2e4dce64d62454a0d9 Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Thu, 13 Jun 2024 10:27:30 +0800 Subject: [PATCH 15/39] refactor(DocParser): enclose packages as a whole --- {TexSoup => DocParser/TexSoup}/LICENSE | 0 {TexSoup => DocParser/TexSoup}/MANIFEST.in | 0 {TexSoup => DocParser/TexSoup}/README.md | 0 {TexSoup => DocParser/TexSoup}/TexSoup/__init__.py | 0 {TexSoup => DocParser/TexSoup}/TexSoup/category.py | 0 {TexSoup => DocParser/TexSoup}/TexSoup/data.py | 0 {TexSoup => DocParser/TexSoup}/TexSoup/reader.py | 0 {TexSoup => DocParser/TexSoup}/TexSoup/tex.py | 0 {TexSoup => DocParser/TexSoup}/TexSoup/tokens.py | 0 {TexSoup => DocParser/TexSoup}/TexSoup/utils.py | 0 {TexSoup => DocParser/TexSoup}/__init__.py | 0 {TexSoup => DocParser/TexSoup}/app/__init__.py | 0 {TexSoup => DocParser/TexSoup}/app/conversion.py | 0 {TexSoup => DocParser/TexSoup}/app/resolve_imports.py | 0 {TexSoup => DocParser/TexSoup}/pytest.ini | 0 {TexSoup => DocParser/TexSoup}/setup.py | 0 {arxiv_cleaner => DocParser}/__init__.py | 0 {arxiv_cleaner => 
DocParser/arxiv_cleaner}/.gitignore | 0 {arxiv_cleaner => DocParser/arxiv_cleaner}/LICENSE.txt | 0 {arxiv_cleaner => DocParser/arxiv_cleaner}/README.md | 0 {vrdu => DocParser/arxiv_cleaner}/__init__.py | 0 {arxiv_cleaner => DocParser/arxiv_cleaner}/arguments.py | 0 {arxiv_cleaner => DocParser/arxiv_cleaner}/cleaner.py | 0 {arxiv_cleaner => DocParser/arxiv_cleaner}/cli.py | 0 {arxiv_cleaner => DocParser/arxiv_cleaner}/file_utils.py | 0 {arxiv_cleaner => DocParser/arxiv_cleaner}/latex.py | 0 {arxiv_cleaner => DocParser/arxiv_cleaner}/logger.py | 0 {arxiv_cleaner => DocParser/arxiv_cleaner}/main.py | 0 main.py => DocParser/main.py | 0 DocParser/vrdu/__init__.py | 0 {vrdu => DocParser/vrdu}/block.py | 0 {vrdu => DocParser/vrdu}/compile_latex.sh | 0 {vrdu => DocParser/vrdu}/config/config.json | 0 {vrdu => DocParser/vrdu}/config/config.py | 0 {vrdu => DocParser/vrdu}/config/envs.py | 0 {vrdu => DocParser/vrdu}/layout_annotation.py | 0 {vrdu => DocParser/vrdu}/logger.py | 0 {vrdu => DocParser/vrdu}/order_annotation.py | 0 {vrdu => DocParser/vrdu}/preprocess.py | 0 {vrdu => DocParser/vrdu}/quality_check.py | 0 {vrdu => DocParser/vrdu}/renderer.py | 0 {vrdu => DocParser/vrdu}/utils.py | 0 42 files changed, 0 insertions(+), 0 deletions(-) rename {TexSoup => DocParser/TexSoup}/LICENSE (100%) rename {TexSoup => DocParser/TexSoup}/MANIFEST.in (100%) rename {TexSoup => DocParser/TexSoup}/README.md (100%) rename {TexSoup => DocParser/TexSoup}/TexSoup/__init__.py (100%) rename {TexSoup => DocParser/TexSoup}/TexSoup/category.py (100%) rename {TexSoup => DocParser/TexSoup}/TexSoup/data.py (100%) rename {TexSoup => DocParser/TexSoup}/TexSoup/reader.py (100%) rename {TexSoup => DocParser/TexSoup}/TexSoup/tex.py (100%) rename {TexSoup => DocParser/TexSoup}/TexSoup/tokens.py (100%) rename {TexSoup => DocParser/TexSoup}/TexSoup/utils.py (100%) rename {TexSoup => DocParser/TexSoup}/__init__.py (100%) rename {TexSoup => DocParser/TexSoup}/app/__init__.py (100%) rename {TexSoup => DocParser/TexSoup}/app/conversion.py (100%) rename {TexSoup => DocParser/TexSoup}/app/resolve_imports.py (100%) rename {TexSoup => DocParser/TexSoup}/pytest.ini (100%) rename {TexSoup => DocParser/TexSoup}/setup.py (100%) rename {arxiv_cleaner => DocParser}/__init__.py (100%) rename {arxiv_cleaner => DocParser/arxiv_cleaner}/.gitignore (100%) rename {arxiv_cleaner => DocParser/arxiv_cleaner}/LICENSE.txt (100%) rename {arxiv_cleaner => DocParser/arxiv_cleaner}/README.md (100%) rename {vrdu => DocParser/arxiv_cleaner}/__init__.py (100%) rename {arxiv_cleaner => DocParser/arxiv_cleaner}/arguments.py (100%) rename {arxiv_cleaner => DocParser/arxiv_cleaner}/cleaner.py (100%) rename {arxiv_cleaner => DocParser/arxiv_cleaner}/cli.py (100%) rename {arxiv_cleaner => DocParser/arxiv_cleaner}/file_utils.py (100%) rename {arxiv_cleaner => DocParser/arxiv_cleaner}/latex.py (100%) rename {arxiv_cleaner => DocParser/arxiv_cleaner}/logger.py (100%) rename {arxiv_cleaner => DocParser/arxiv_cleaner}/main.py (100%) rename main.py => DocParser/main.py (100%) create mode 100644 DocParser/vrdu/__init__.py rename {vrdu => DocParser/vrdu}/block.py (100%) rename {vrdu => DocParser/vrdu}/compile_latex.sh (100%) rename {vrdu => DocParser/vrdu}/config/config.json (100%) rename {vrdu => DocParser/vrdu}/config/config.py (100%) rename {vrdu => DocParser/vrdu}/config/envs.py (100%) rename {vrdu => DocParser/vrdu}/layout_annotation.py (100%) rename {vrdu => DocParser/vrdu}/logger.py (100%) rename {vrdu => DocParser/vrdu}/order_annotation.py (100%) rename {vrdu => 
DocParser/vrdu}/preprocess.py (100%) rename {vrdu => DocParser/vrdu}/quality_check.py (100%) rename {vrdu => DocParser/vrdu}/renderer.py (100%) rename {vrdu => DocParser/vrdu}/utils.py (100%) diff --git a/TexSoup/LICENSE b/DocParser/TexSoup/LICENSE similarity index 100% rename from TexSoup/LICENSE rename to DocParser/TexSoup/LICENSE diff --git a/TexSoup/MANIFEST.in b/DocParser/TexSoup/MANIFEST.in similarity index 100% rename from TexSoup/MANIFEST.in rename to DocParser/TexSoup/MANIFEST.in diff --git a/TexSoup/README.md b/DocParser/TexSoup/README.md similarity index 100% rename from TexSoup/README.md rename to DocParser/TexSoup/README.md diff --git a/TexSoup/TexSoup/__init__.py b/DocParser/TexSoup/TexSoup/__init__.py similarity index 100% rename from TexSoup/TexSoup/__init__.py rename to DocParser/TexSoup/TexSoup/__init__.py diff --git a/TexSoup/TexSoup/category.py b/DocParser/TexSoup/TexSoup/category.py similarity index 100% rename from TexSoup/TexSoup/category.py rename to DocParser/TexSoup/TexSoup/category.py diff --git a/TexSoup/TexSoup/data.py b/DocParser/TexSoup/TexSoup/data.py similarity index 100% rename from TexSoup/TexSoup/data.py rename to DocParser/TexSoup/TexSoup/data.py diff --git a/TexSoup/TexSoup/reader.py b/DocParser/TexSoup/TexSoup/reader.py similarity index 100% rename from TexSoup/TexSoup/reader.py rename to DocParser/TexSoup/TexSoup/reader.py diff --git a/TexSoup/TexSoup/tex.py b/DocParser/TexSoup/TexSoup/tex.py similarity index 100% rename from TexSoup/TexSoup/tex.py rename to DocParser/TexSoup/TexSoup/tex.py diff --git a/TexSoup/TexSoup/tokens.py b/DocParser/TexSoup/TexSoup/tokens.py similarity index 100% rename from TexSoup/TexSoup/tokens.py rename to DocParser/TexSoup/TexSoup/tokens.py diff --git a/TexSoup/TexSoup/utils.py b/DocParser/TexSoup/TexSoup/utils.py similarity index 100% rename from TexSoup/TexSoup/utils.py rename to DocParser/TexSoup/TexSoup/utils.py diff --git a/TexSoup/__init__.py b/DocParser/TexSoup/__init__.py similarity index 100% rename from TexSoup/__init__.py rename to DocParser/TexSoup/__init__.py diff --git a/TexSoup/app/__init__.py b/DocParser/TexSoup/app/__init__.py similarity index 100% rename from TexSoup/app/__init__.py rename to DocParser/TexSoup/app/__init__.py diff --git a/TexSoup/app/conversion.py b/DocParser/TexSoup/app/conversion.py similarity index 100% rename from TexSoup/app/conversion.py rename to DocParser/TexSoup/app/conversion.py diff --git a/TexSoup/app/resolve_imports.py b/DocParser/TexSoup/app/resolve_imports.py similarity index 100% rename from TexSoup/app/resolve_imports.py rename to DocParser/TexSoup/app/resolve_imports.py diff --git a/TexSoup/pytest.ini b/DocParser/TexSoup/pytest.ini similarity index 100% rename from TexSoup/pytest.ini rename to DocParser/TexSoup/pytest.ini diff --git a/TexSoup/setup.py b/DocParser/TexSoup/setup.py similarity index 100% rename from TexSoup/setup.py rename to DocParser/TexSoup/setup.py diff --git a/arxiv_cleaner/__init__.py b/DocParser/__init__.py similarity index 100% rename from arxiv_cleaner/__init__.py rename to DocParser/__init__.py diff --git a/arxiv_cleaner/.gitignore b/DocParser/arxiv_cleaner/.gitignore similarity index 100% rename from arxiv_cleaner/.gitignore rename to DocParser/arxiv_cleaner/.gitignore diff --git a/arxiv_cleaner/LICENSE.txt b/DocParser/arxiv_cleaner/LICENSE.txt similarity index 100% rename from arxiv_cleaner/LICENSE.txt rename to DocParser/arxiv_cleaner/LICENSE.txt diff --git a/arxiv_cleaner/README.md b/DocParser/arxiv_cleaner/README.md similarity index 100% 
rename from arxiv_cleaner/README.md rename to DocParser/arxiv_cleaner/README.md diff --git a/vrdu/__init__.py b/DocParser/arxiv_cleaner/__init__.py similarity index 100% rename from vrdu/__init__.py rename to DocParser/arxiv_cleaner/__init__.py diff --git a/arxiv_cleaner/arguments.py b/DocParser/arxiv_cleaner/arguments.py similarity index 100% rename from arxiv_cleaner/arguments.py rename to DocParser/arxiv_cleaner/arguments.py diff --git a/arxiv_cleaner/cleaner.py b/DocParser/arxiv_cleaner/cleaner.py similarity index 100% rename from arxiv_cleaner/cleaner.py rename to DocParser/arxiv_cleaner/cleaner.py diff --git a/arxiv_cleaner/cli.py b/DocParser/arxiv_cleaner/cli.py similarity index 100% rename from arxiv_cleaner/cli.py rename to DocParser/arxiv_cleaner/cli.py diff --git a/arxiv_cleaner/file_utils.py b/DocParser/arxiv_cleaner/file_utils.py similarity index 100% rename from arxiv_cleaner/file_utils.py rename to DocParser/arxiv_cleaner/file_utils.py diff --git a/arxiv_cleaner/latex.py b/DocParser/arxiv_cleaner/latex.py similarity index 100% rename from arxiv_cleaner/latex.py rename to DocParser/arxiv_cleaner/latex.py diff --git a/arxiv_cleaner/logger.py b/DocParser/arxiv_cleaner/logger.py similarity index 100% rename from arxiv_cleaner/logger.py rename to DocParser/arxiv_cleaner/logger.py diff --git a/arxiv_cleaner/main.py b/DocParser/arxiv_cleaner/main.py similarity index 100% rename from arxiv_cleaner/main.py rename to DocParser/arxiv_cleaner/main.py diff --git a/main.py b/DocParser/main.py similarity index 100% rename from main.py rename to DocParser/main.py diff --git a/DocParser/vrdu/__init__.py b/DocParser/vrdu/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/vrdu/block.py b/DocParser/vrdu/block.py similarity index 100% rename from vrdu/block.py rename to DocParser/vrdu/block.py diff --git a/vrdu/compile_latex.sh b/DocParser/vrdu/compile_latex.sh similarity index 100% rename from vrdu/compile_latex.sh rename to DocParser/vrdu/compile_latex.sh diff --git a/vrdu/config/config.json b/DocParser/vrdu/config/config.json similarity index 100% rename from vrdu/config/config.json rename to DocParser/vrdu/config/config.json diff --git a/vrdu/config/config.py b/DocParser/vrdu/config/config.py similarity index 100% rename from vrdu/config/config.py rename to DocParser/vrdu/config/config.py diff --git a/vrdu/config/envs.py b/DocParser/vrdu/config/envs.py similarity index 100% rename from vrdu/config/envs.py rename to DocParser/vrdu/config/envs.py diff --git a/vrdu/layout_annotation.py b/DocParser/vrdu/layout_annotation.py similarity index 100% rename from vrdu/layout_annotation.py rename to DocParser/vrdu/layout_annotation.py diff --git a/vrdu/logger.py b/DocParser/vrdu/logger.py similarity index 100% rename from vrdu/logger.py rename to DocParser/vrdu/logger.py diff --git a/vrdu/order_annotation.py b/DocParser/vrdu/order_annotation.py similarity index 100% rename from vrdu/order_annotation.py rename to DocParser/vrdu/order_annotation.py diff --git a/vrdu/preprocess.py b/DocParser/vrdu/preprocess.py similarity index 100% rename from vrdu/preprocess.py rename to DocParser/vrdu/preprocess.py diff --git a/vrdu/quality_check.py b/DocParser/vrdu/quality_check.py similarity index 100% rename from vrdu/quality_check.py rename to DocParser/vrdu/quality_check.py diff --git a/vrdu/renderer.py b/DocParser/vrdu/renderer.py similarity index 100% rename from vrdu/renderer.py rename to DocParser/vrdu/renderer.py diff --git a/vrdu/utils.py b/DocParser/vrdu/utils.py similarity index 100% 
rename from vrdu/utils.py rename to DocParser/vrdu/utils.py From 8ddd76e40d8a78f55838b872364d43757a22fdaf Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Thu, 13 Jun 2024 10:27:55 +0800 Subject: [PATCH 16/39] refactor(batch_process.py): move batch_process.py to scripts/ --- batch_process.py => scripts/batch_process.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename batch_process.py => scripts/batch_process.py (99%) diff --git a/batch_process.py b/scripts/batch_process.py similarity index 99% rename from batch_process.py rename to scripts/batch_process.py index 9185a72..f27ca50 100644 --- a/batch_process.py +++ b/scripts/batch_process.py @@ -7,7 +7,7 @@ import pandas as pd from vrdu import logger -from main import process_one_file +from DocParser.main import process_one_file log_file = str(uuid4()) + ".log" log = logger.setup_app_level_logger(file_name=log_file, level="INFO") From 3ab4efbb7952cf40aea009033cbd79ae71e15184 Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Thu, 13 Jun 2024 10:28:23 +0800 Subject: [PATCH 17/39] docs(setup.py): update metadata --- setup.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index a993283..ad473aa 100644 --- a/setup.py +++ b/setup.py @@ -2,11 +2,11 @@ setup( name="vrdu_data_process", - version="0.5.0", + version="1.0.0", description="process the academic papers with .tex source files", author="Mao Song", author_email="maosong@pjlab.org.cn", - url="https://github.com/MaoSong2022/vrdu_data_process", + url="https://github.com/UniModal4Reasoning/DocParser.git", license="MIT", packages=find_packages(), install_requires=[ @@ -16,7 +16,6 @@ "numpy==1.24.3", "pdf2image==1.16.3", "pdfminer.six==20221105", - # "Pillow==9.4.0", "Pillow==10.1.0", "pyparsing==3.1.1", "pytest==7.4.2", @@ -24,9 +23,7 @@ "setuptools==68.0.0", "tqdm==4.66.1", ], - scripts=[ - "vrdu/compile_latex.sh", - ], + scripts=[], entry_points={ "console_scripts": [], }, From 54a261c5bbe9c9159ca671c157a3e846cddc4f83 Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Thu, 13 Jun 2024 11:22:25 +0800 Subject: [PATCH 18/39] test(tests/): change import path --- tests/test_add_definitions.py | 2 +- tests/test_extract_graphics.py | 2 +- tests/test_is_text_eq.py | 2 +- tests/test_remove_hyperref_color.py | 2 +- tests/test_render_abstract.py | 2 +- tests/test_render_algorithm.py | 2 +- tests/test_render_caption.py | 2 +- tests/test_render_code.py | 2 +- tests/test_render_footnote.py | 2 +- tests/test_render_tabular.py | 4 ++-- tests/test_render_title.py | 2 +- 11 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/test_add_definitions.py b/tests/test_add_definitions.py index f3ca221..096ca65 100644 --- a/tests/test_add_definitions.py +++ b/tests/test_add_definitions.py @@ -1,7 +1,7 @@ import unittest import unittest.mock -from vrdu.renderer import Renderer +from DocParser.vrdu.renderer import Renderer def test_add_color_definition1(): diff --git a/tests/test_extract_graphics.py b/tests/test_extract_graphics.py index 8335db3..14a2cd5 100644 --- a/tests/test_extract_graphics.py +++ b/tests/test_extract_graphics.py @@ -2,7 +2,7 @@ import unittest.mock -from vrdu.renderer import Renderer +from DocParser.vrdu.renderer import Renderer class TestGraphics(unittest.TestCase): diff --git a/tests/test_is_text_eq.py b/tests/test_is_text_eq.py index 6426411..3baa280 100644 --- a/tests/test_is_text_eq.py +++ b/tests/test_is_text_eq.py @@ -1,6 +1,6 @@ import unittest -from vrdu.renderer import is_text_eq +from DocParser.vrdu.renderer import 
is_text_eq class TestTextEq(unittest.TestCase): diff --git a/tests/test_remove_hyperref_color.py b/tests/test_remove_hyperref_color.py index 7db6f39..3b6a287 100644 --- a/tests/test_remove_hyperref_color.py +++ b/tests/test_remove_hyperref_color.py @@ -2,7 +2,7 @@ import unittest.mock -from vrdu.renderer import Renderer +from DocParser.vrdu.renderer import Renderer class TestHyperref(unittest.TestCase): diff --git a/tests/test_render_abstract.py b/tests/test_render_abstract.py index 16f2cb9..405f6da 100644 --- a/tests/test_render_abstract.py +++ b/tests/test_render_abstract.py @@ -2,7 +2,7 @@ import unittest.mock -from vrdu.renderer import Renderer +from DocParser.vrdu.renderer import Renderer class TestAbstract(unittest.TestCase): diff --git a/tests/test_render_algorithm.py b/tests/test_render_algorithm.py index c15821e..a4cf6ad 100644 --- a/tests/test_render_algorithm.py +++ b/tests/test_render_algorithm.py @@ -2,7 +2,7 @@ import unittest.mock -from vrdu.renderer import Renderer +from DocParser.vrdu.renderer import Renderer class TestAlgorithm(unittest.TestCase): diff --git a/tests/test_render_caption.py b/tests/test_render_caption.py index eb21de8..b526f60 100644 --- a/tests/test_render_caption.py +++ b/tests/test_render_caption.py @@ -2,7 +2,7 @@ import unittest.mock -from vrdu.renderer import Renderer +from DocParser.vrdu.renderer import Renderer class TestCaption(unittest.TestCase): diff --git a/tests/test_render_code.py b/tests/test_render_code.py index f6f39fd..55082de 100644 --- a/tests/test_render_code.py +++ b/tests/test_render_code.py @@ -2,7 +2,7 @@ import unittest.mock -from vrdu.renderer import Renderer +from DocParser.vrdu.renderer import Renderer class TestCode(unittest.TestCase): diff --git a/tests/test_render_footnote.py b/tests/test_render_footnote.py index e0fcebd..e81e0fd 100644 --- a/tests/test_render_footnote.py +++ b/tests/test_render_footnote.py @@ -2,7 +2,7 @@ import unittest.mock -from vrdu.renderer import Renderer +from DocParser.vrdu.renderer import Renderer class TestFootnote(unittest.TestCase): diff --git a/tests/test_render_tabular.py b/tests/test_render_tabular.py index 55f6f92..e57f363 100644 --- a/tests/test_render_tabular.py +++ b/tests/test_render_tabular.py @@ -2,7 +2,7 @@ import unittest.mock -from vrdu.renderer import Renderer +from DocParser.vrdu.renderer import Renderer class TestTabular(unittest.TestCase): @@ -62,4 +62,4 @@ def test_mix_tabulars(self): file_mock.assert_called_with(file_mock, "w") file_mock().write.assert_called_with( """\\documentclass{article}\\begin{document}Table \\ref{demo-table} has a caption:\\begin{table}[!h]\\begin{center}{\\color{Table_color}\\begin{tabular}{||c c c c||} \\hline Col1 & Col2 & Col2 & Col3 \\ [0.5ex] \\hline\\hline 1 & 6 & 87837 & 787 \\ \\hline 2 & 7 & 78 & 5415 \\ \\hline 3 & 545 & 778 & 7507 \\ \\hline 4 & 545 & 18744 & 7560 \\ \\hline 5 & 88 & 788 & 6344 \\ [1ex] \\hline\\end{tabular}}\\caption{\\label{demo-table}Your caption.}\\end{center}\\end{table} \\begin{table}[!h]\\begin{center}{\\color{Table_color}\\begin{tabularx}{||c c c c||} \\hline Col1 & Col2 & Col2 & Col3 \\ [0.5ex] \\hline\\hline 1 & 6 & 87837 & 787 \\ \\hline 2 & 7 & 78 & 5415 \\ \\hline 3 & 545 & 778 & 7507 \\ \\hline 4 & 545 & 18744 & 7560 \\ \\hline 5 & 88 & 788 & 6344 \\ [1ex] \\hline\\end{tabularx}}\\caption{\\label{demo-table}Your caption.}\\end{center}\\end{table}\\end{document}""" - ) \ No newline at end of file + ) diff --git a/tests/test_render_title.py b/tests/test_render_title.py index 343714e..122063b 100644 --- 
a/tests/test_render_title.py +++ b/tests/test_render_title.py @@ -2,7 +2,7 @@ import unittest.mock -from vrdu.renderer import Renderer +from DocParser.vrdu.renderer import Renderer class TestTitle(unittest.TestCase): From 357fe6b1e142fd0777617d08a736725e8e53b311 Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Thu, 13 Jun 2024 11:23:45 +0800 Subject: [PATCH 19/39] refactor(DocParser): update import path --- DocParser/TexSoup/TexSoup/__init__.py | 4 ++-- DocParser/TexSoup/app/conversion.py | 8 ++++---- DocParser/vrdu/renderer.py | 10 +++++----- DocParser/vrdu/utils.py | 4 ++-- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/DocParser/TexSoup/TexSoup/__init__.py b/DocParser/TexSoup/TexSoup/__init__.py index a20883d..37a55c0 100644 --- a/DocParser/TexSoup/TexSoup/__init__.py +++ b/DocParser/TexSoup/TexSoup/__init__.py @@ -4,8 +4,8 @@ tree with navigation, search, and modification utilities. """ -from TexSoup.TexSoup.tex import read -from TexSoup.TexSoup.data import TexNode +from .tex import read +from .data import TexNode __version__ = '0.3.1' diff --git a/DocParser/TexSoup/app/conversion.py b/DocParser/TexSoup/app/conversion.py index 3ffe746..474c228 100644 --- a/DocParser/TexSoup/app/conversion.py +++ b/DocParser/TexSoup/app/conversion.py @@ -1,11 +1,11 @@ import re -from TexSoup.TexSoup import TexSoup -from TexSoup.TexSoup.data import TexEnv, TexText, TexCmd, TexGroup +from DocParser.TexSoup.TexSoup import TexSoup +from DocParser.TexSoup.TexSoup.data import TexEnv, TexText, TexCmd, TexGroup -from vrdu import logger -from vrdu.config import envs +from DocParser.vrdu import logger +from DocParser.vrdu.config import envs log = logger.get_logger(__name__) diff --git a/DocParser/vrdu/renderer.py b/DocParser/vrdu/renderer.py index 5451bb4..5737bc7 100644 --- a/DocParser/vrdu/renderer.py +++ b/DocParser/vrdu/renderer.py @@ -5,12 +5,12 @@ import re -import vrdu.utils as utils -import vrdu.logger as logger -from vrdu.config import config, envs +import DocParser.vrdu.utils as utils +import DocParser.vrdu.logger as logger +from DocParser.vrdu.config import config, envs -from TexSoup.TexSoup import TexSoup -import TexSoup.app.conversion as conversion +from DocParser.TexSoup.TexSoup import TexSoup +import DocParser.TexSoup.app.conversion as conversion log = logger.get_logger(__name__) diff --git a/DocParser/vrdu/utils.py b/DocParser/vrdu/utils.py index aecefe8..be6fa51 100755 --- a/DocParser/vrdu/utils.py +++ b/DocParser/vrdu/utils.py @@ -7,8 +7,8 @@ from pdf2image import pdf2image from pdf2image import generators -from vrdu.block import Block -from vrdu.config import config +from DocParser.vrdu.block import Block +from DocParser.vrdu.config import config def export_to_json(data: Union[Dict, List], file_path: str) -> None: From caed5c497fff3df4d0f14d8c0b26a00711d0680d Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Thu, 13 Jun 2024 15:52:44 +0800 Subject: [PATCH 20/39] refactor(scripts): change package import path --- scripts/app.py | 4 ++-- scripts/arxiv_download.py | 4 ++-- scripts/batch_process.py | 2 +- scripts/export_to_dataset.py | 2 +- scripts/generate_reading_annotation.py | 4 ++-- scripts/retrive_metadata.py | 4 ++-- scripts/run_statistics.py | 6 +++--- scripts/visualize_order_annotations_single_page.py | 2 +- scripts/visualize_order_annotations_two_page.py | 2 +- 9 files changed, 15 insertions(+), 15 deletions(-) diff --git a/scripts/app.py b/scripts/app.py index 549d682..54b4a1c 100644 --- a/scripts/app.py +++ b/scripts/app.py @@ -3,8 +3,8 @@ import glob from PIL import 
Image, ImageDraw -from vrdu import utils -from vrdu.config import config +from DocParser.vrdu import utils +from DocParser.vrdu.config import config pn.extension() diff --git a/scripts/arxiv_download.py b/scripts/arxiv_download.py index 9b188ee..237119e 100644 --- a/scripts/arxiv_download.py +++ b/scripts/arxiv_download.py @@ -5,8 +5,8 @@ import tarfile -from vrdu import utils -from vrdu import logger +from DocParser.vrdu import utils +from DocParser.vrdu import logger log = logger.setup_app_level_logger(file_name="arxiv_download.log") diff --git a/scripts/batch_process.py b/scripts/batch_process.py index f27ca50..f146761 100644 --- a/scripts/batch_process.py +++ b/scripts/batch_process.py @@ -6,7 +6,7 @@ from uuid import uuid4 import pandas as pd -from vrdu import logger +from DocParser.vrdu import logger from DocParser.main import process_one_file log_file = str(uuid4()) + ".log" diff --git a/scripts/export_to_dataset.py b/scripts/export_to_dataset.py index fafb3d2..f8c41d8 100644 --- a/scripts/export_to_dataset.py +++ b/scripts/export_to_dataset.py @@ -6,7 +6,7 @@ import pandas as pd import multiprocessing -from vrdu import logger +from DocParser.vrdu import logger log = logger.setup_app_level_logger(file_name="export_to_dataset.log") diff --git a/scripts/generate_reading_annotation.py b/scripts/generate_reading_annotation.py index 72696a6..928c78a 100644 --- a/scripts/generate_reading_annotation.py +++ b/scripts/generate_reading_annotation.py @@ -3,8 +3,8 @@ import multiprocessing import os -from vrdu import utils -from vrdu import logger +from DocParser.vrdu import utils +from DocParser.vrdu import logger log = logger.setup_app_level_logger(file_name="generate_reading_annotation.log") diff --git a/scripts/retrive_metadata.py b/scripts/retrive_metadata.py index 0b9e4e5..f378ff2 100644 --- a/scripts/retrive_metadata.py +++ b/scripts/retrive_metadata.py @@ -7,8 +7,8 @@ import pandas as pd -from vrdu import utils -from vrdu import logger +from DocParser.vrdu import utils +from DocParser.vrdu import logger log = logger.setup_app_level_logger(file_name="retrieve_metadata.log") diff --git a/scripts/run_statistics.py b/scripts/run_statistics.py index a21b7e2..2f8a70a 100644 --- a/scripts/run_statistics.py +++ b/scripts/run_statistics.py @@ -4,10 +4,10 @@ import argparse from datetime import datetime -from vrdu import utils -from vrdu.config import config +from DocParser.vrdu import utils +from DocParser.vrdu.config import config -from vrdu import logger +from DocParser.vrdu import logger log = logger.setup_app_level_logger(file_name="statistics.log") diff --git a/scripts/visualize_order_annotations_single_page.py b/scripts/visualize_order_annotations_single_page.py index d532178..68a0cf2 100644 --- a/scripts/visualize_order_annotations_single_page.py +++ b/scripts/visualize_order_annotations_single_page.py @@ -6,7 +6,7 @@ from PIL import Image, ImageDraw from matplotlib import pyplot as plt -from vrdu import utils +from DocParser.vrdu import utils def arrowedLine( diff --git a/scripts/visualize_order_annotations_two_page.py b/scripts/visualize_order_annotations_two_page.py index fefce3b..d534ea7 100644 --- a/scripts/visualize_order_annotations_two_page.py +++ b/scripts/visualize_order_annotations_two_page.py @@ -6,7 +6,7 @@ from PIL import Image, ImageDraw from matplotlib import pyplot as plt -from vrdu import utils +from DocParser.vrdu import utils def arrowedLine( From 057dba1115f56497c0a1713462fe31da604919c4 Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Thu, 13 Jun 2024 15:55:04 +0800 
Subject: [PATCH 21/39] feat(scripts): remove unused scripts --- scripts/collect_coco_dataset.py | 191 ---------------------- scripts/visualize_dataset_distribution.py | 107 ------------ scripts/visuzliation.py | 41 ----- 3 files changed, 339 deletions(-) delete mode 100644 scripts/collect_coco_dataset.py delete mode 100644 scripts/visualize_dataset_distribution.py delete mode 100644 scripts/visuzliation.py diff --git a/scripts/collect_coco_dataset.py b/scripts/collect_coco_dataset.py deleted file mode 100644 index 81c80e7..0000000 --- a/scripts/collect_coco_dataset.py +++ /dev/null @@ -1,191 +0,0 @@ -import os -import re -import matplotlib.pyplot as plt -import time -import json -import shutil -import random -import argparse -from tqdm import tqdm - - -def extract_tex_files(path, target_pattern): - tex_files = [] - - for root, dirs, files in os.walk(path): - for file in files: - if not file.endswith(".tex"): - continue - if file.startswith("paper_"): - continue - - tex_file = os.path.join(root, file) - - try: - with open(tex_file) as f: - content = f.read() - except UnicodeDecodeError: - continue - - if "\\begin{document}" not in content: - continue - - if not any( - re.match(pattern, root.split("/")[-2]) for pattern in target_pattern - ): - continue - - if os.path.exists(f"{root}/output/result/layout_annotation.json"): - tex_files.append(tex_file) - return tex_files - - -def main(path, target_pattern, ratio): - now_time = time.strftime("%Y-%m-%d-%H_%M_%S", time.localtime(time.time())) - coco_dataset_name = f"COCO_datasets/Multi-modal_COCO_dataset_{now_time}" - - target_images_folder = f"{coco_dataset_name}/images" - os.makedirs(coco_dataset_name, exist_ok=True) - os.makedirs(target_images_folder, exist_ok=True) - - tex_files = sorted(extract_tex_files(path, target_pattern)) - tex_files_length = len(tex_files) - - random.seed(0) - random.shuffle(tex_files) - train_list = tex_files[: int(tex_files_length * ratio)] - val_list = tex_files[int(tex_files_length * ratio) :] - dataset_dict = {"train": train_list, "val": val_list} - - info = { - "year": 2023, - "version": "1.0", - "description": "COCO format dataset converted form document genome", - "contributor": "ADLab", - "url": "", - "date_created": f"{time.ctime()}", - } - licenses = [ - { - "url": "http://creativecommons.org/licenses/by/2.0/", - "id": 4, - "name": "Attribution License", - } - ] - images = [] - annotations = [] - categories = [ - {"id": 0, "name": "Algorithm", "supercategory": "Algorithm"}, - {"id": 1, "name": "Caption", "supercategory": "Caption"}, - {"id": 2, "name": "Equation", "supercategory": "Equation"}, - {"id": 3, "name": "Figure", "supercategory": "Figure"}, - {"id": 4, "name": "Footnote", "supercategory": "Footnote"}, - {"id": 5, "name": "List", "supercategory": "List"}, - {"id": 6, "name": "Others", "supercategory": "Others"}, - {"id": 7, "name": "Table", "supercategory": "Table"}, - {"id": 8, "name": "Text", "supercategory": "Text"}, - {"id": 9, "name": "Text-EQ", "supercategory": "Text"}, - {"id": 10, "name": "Title", "supercategory": "Title"}, - {"id": 11, "name": "Reference", "supercategory": "Reference"}, - {"id": 12, "name": "PaperTitle", "supercategory": "Title"}, - {"id": 13, "name": "Code", "supercategory": "Algorithm"}, - {"id": 14, "name": "Abstract", "supercategory": "Text"}, - ] - - anno_id = 0 - image_id = 0 - pattern = r"\d+\.\d+(v\d+)?" 
- for key, tex_files in dataset_dict.items(): - print(f"Processing {key} set...") - - images = [] - annotations = [] - - for tex_file in tqdm(tex_files): - coco_annotation_file = ( - f"{os.path.dirname(tex_file)}/output/result/layout_annotation.json" - ) - images_path = f"{os.path.dirname(tex_file)}/output/colored" - - if not re.search(pattern, tex_file): - raise NotImplementedError - arxiv_paper_id = re.search(pattern, tex_file).group() - - with open(coco_annotation_file, "r") as fp: - coco_annotation = json.load(fp) - sub_images = coco_annotation["images"] - sub_annotations_list = coco_annotation["annotations"] - - grouped_annotations = {} - for annotation in sub_annotations_list: - anno_image_id = annotation["image_id"] - # Check whether this image_id is already in the dictionary - if anno_image_id not in grouped_annotations: - # If not, create a new list for it - grouped_annotations[anno_image_id] = [] - # Append the annotation to the corresponding list - grouped_annotations[anno_image_id].append(annotation) - - grouped_annotations_key_list = sorted(grouped_annotations.keys()) - for idx in grouped_annotations_key_list: - file_name = arxiv_paper_id.replace(".", "_") + f"-page_{idx:04d}.png" - page_image = plt.imread(f"{images_path}/{idx}.png") - H, W, _ = page_image.shape - page_annotations = grouped_annotations[idx] - - images.append( - { - "id": image_id, - "width": W, - "height": H, - "file_name": file_name, - "coco_url": "https://github.com/MaoSong2022/vrdu_data_process", - "date_captured": now_time, - "flickr_url": "", - "licenses": 4, - } - ) - shutil.copyfile( - f"{images_path}/{idx}.png", f"{target_images_folder}/{file_name}" - ) - - for anno in page_annotations: - annotations.append( - { - "id": anno_id, - "image_id": image_id, - "category_id": anno["category_id"], - "segmentation": anno["segmentation"], - "bbox": anno["bbox"], - "area": anno["area"], - "iscrowd": anno["iscrowd"], - } - ) - anno_id += 1 - image_id += 1 - - coco_json_content = { - "info": info, - "licenses": licenses, - "images": images, - "annotations": annotations, - "categories": categories, - } - - with open(f"{coco_dataset_name}/{key}.json", "w") as fp: - json.dump(coco_json_content, fp, indent=4) - - -if __name__ == "__main__": - # parser = argparse.ArgumentParser() - # parser.add_argument("-p", "--path", type=str, required=True) - # parser.add_argument("-r", "--ratio", type=float, default=0.8) - # args = parser.parse_args() - # path = args.path - - target_pattern = [r"^cs\.\w+$"] - path = os.path.expanduser( - "/cpfs01/shared/ADLab/datasets/arxiv_source/arxiv_source_uncompressed" - ) - ratio = 0.8 - main(path, target_pattern, ratio) diff --git a/scripts/visualize_dataset_distribution.py b/scripts/visualize_dataset_distribution.py deleted file mode 100644 index 1418514..0000000 --- a/scripts/visualize_dataset_distribution.py +++ /dev/null @@ -1,107 +0,0 @@ -from collections import defaultdict -import os -import matplotlib.pyplot as plt -import numpy as np -import csv - - -def get_all_categories(): - """ - Retrieves all categories from the "category_count.csv" file. - - Returns: - categories (list): A list of all categories. 
- - Reference: - https://arxiv.org/category_taxonomy - """ - categories = [] - with open("scripts/category_count.csv", "r") as f: - reader = csv.DictReader(f) - for row in reader: - categories.append(row["categories"]) - - return categories - - -def visualize_distribution(dict1, dict2): - categories = list(dict1.keys()) # Get the list of categories - - # Get the number of files for each category from both dictionaries - files_dict1 = [dict1[category] for category in categories] - files_dict2 = [dict2[category] for category in categories] - - # normalize - files_dict1 = [x / sum(files_dict1) for x in files_dict1] - files_dict2 = [x / sum(files_dict2) for x in files_dict2] - - # Set up the plot - plt.figure(figsize=(10, 8)) - fig, ax = plt.subplots() - width = 1.2 # Width of the bars - - # Calculate the positions for the bars - positions = np.arange(0, len(categories) * width, width) - - # Plot the number of files for each category - ax.barh(positions, files_dict1, width, label="batch", align="center", color="blue") - ax.barh( - positions, - -np.array(files_dict2), - width, - label="original", - align="center", - color="red", - ) - - # Add labels and title to the plot - ax.set_yticks(positions) - ax.set_yticklabels(categories, fontsize=2) - ax.set_xlabel("Number of Files") - ax.set_title("Distribution of arxiv_source_uncompressed") - ax.legend() - - plt.subplots_adjust(left=0.4) - # Display the plot - plt.savefig("test.png", dpi=300) - - -def analyze_raw_data(path): - all_categories = get_all_categories() - - data = defaultdict(int) - for category in all_categories: - if os.path.exists(os.path.join(path, category)): - data[category] = len(os.listdir(os.path.join(path, category))) - - with open("scripts/batch_count.csv", mode="w") as f: - fieldnames = ["categories", "count"] - writer = csv.DictWriter(f, fieldnames=fieldnames) - writer.writeheader() - for key, value in data.items(): - writer.writerow( - { - "categories": key, - "count": value, - } - ) - - return data - - -def main(): - batch = analyze_raw_data( - "/cpfs01/shared/ADLab/datasets/arxiv_source/arxiv_source_uncompressed" - ) - - original = {} - with open("scripts/category_count.csv", newline="") as csvfile: - reader = csv.DictReader(csvfile) - for row in reader: - original[row["categories"]] = int(row["count"]) - - visualize_distribution(batch, original) - - -if __name__ == "__main__": - main() diff --git a/scripts/visuzliation.py b/scripts/visuzliation.py deleted file mode 100644 index fd23244..0000000 --- a/scripts/visuzliation.py +++ /dev/null @@ -1,41 +0,0 @@ -from graphviz import Digraph - -from vrdu.utils import load_json - - -def draw_dot(annotations, format="svg", rankdir="TB"): - """ - format: png | svg | ... 
- rankdir: TB (top to bottom graph) | LR (left to right) - """ - assert rankdir in ["LR", "TB"] - - nodes = set() - edges = [] - for annotation in annotations: - nodes.add(annotation["from"]) - nodes.add(annotation["to"]) - edges.append((annotation["from"], annotation["type"], annotation["to"])) - - dot = Digraph(format=format, graph_attr={"rankdir": rankdir}) - - for node in nodes: - dot.node( - name=str(id(node)), - label=str(node), - shape="record", - ) - - for node1, relation, node2 in edges: - dot.edge(str(id(node1)), str(id(node2)), label=relation) - - return dot - - -if __name__ == "__main__": - annotation_file = ( - "/home/PJLAB/maosong/vrdu_data/icml2022/output/result/order_annotation.json" - ) - annotations = load_json(annotation_file) - dot = draw_dot(annotations) - dot.render(filename="gout.dot", view=True) From 3d5f83299e107fd0715969056b09458d59423263 Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Thu, 13 Jun 2024 16:26:15 +0800 Subject: [PATCH 22/39] refactor(arxiv_download.py): simplify code logic --- scripts/arxiv_download.py | 130 ++++++++++++++++++++++++-------------- 1 file changed, 83 insertions(+), 47 deletions(-) diff --git a/scripts/arxiv_download.py b/scripts/arxiv_download.py index 237119e..971f779 100644 --- a/scripts/arxiv_download.py +++ b/scripts/arxiv_download.py @@ -1,84 +1,120 @@ +import argparse import arxiv import os -from typing import List, Dict -from tqdm import tqdm +from typing import List import tarfile -from DocParser.vrdu import utils from DocParser.vrdu import logger log = logger.setup_app_level_logger(file_name="arxiv_download.log") -def arxiv_download(data: List[Dict], path: str) -> None: - """Download papers from arXiv based on category data. - This function takes a list of category download tasks and a base - path. For each category item in the data, it will: - - 1. Create a subdirectory under the given path for that category - 2. Check if there are already enough papers in the subdir - 3. Search arXiv for the category - 4. Download up to the requested count of newest papers - 5. Save each paper to the category subdirectory +def download_papers_with_paper_id( + path: str, discipline: str, paper_ids: List[str] +) -> None: + """ + Downloads papers from the arXiv repository based on the specified paper IDs. - Arguments: - data (List[Dict]): List of dicts with keys "category" and "count" - path (str): Base directory path to save papers + Args: + - path (str): The path where the downloaded papers will be saved. + - discipline (str): The discipline of the papers to be downloaded. + - paper_ids (List[str]): A list of paper IDs to be downloaded. 
Returns: - None + None + + Raises: + - tarfile.ReadError: if the .tar.gz file cannot be unpacked + + Usage: + ```python + download_papers_with_paper_id(path, discipline, paper_ids) + ``` + """ client = arxiv.Client() - for row in tqdm(data): - if row["auto_annotated_paper_path"]: - continue - discipline = row["discipline"] - discipline_path = os.path.join(path, discipline) - os.makedirs(discipline_path, exist_ok=True) + discipline_path = os.path.join(path, discipline) + os.makedirs(discipline_path, exist_ok=True) - if os.path.exists(os.path.join(discipline_path, row["paper_id"])): - log.debug(f'{os.path.join(discipline_path, row["paper_id"])} exists') + search_results = client.results(arxiv.Search(id_list=paper_ids)) + + for result in search_results: + # extract {id} without version from http://arxiv.org/abs/{id} + paper_id = result.entry_id.split("/")[-1].split("v")[0] + log.info(f"Downloading paper {paper_id}") + + tar_file_path = result.download_source(dirpath=discipline_path) + log.info(f"Downloading tar file {tar_file_path}") + paper_path = os.path.join(discipline_path, paper_id) + if os.path.exists(paper_path): continue - if os.path.exists(os.path.join(discipline_path, row["paper_id"], ".tar.gz")): - log.debug( - f'{os.path.join(discipline_path, row["paper_id"], ".tar.gz")} exists' - ) + try: + with tarfile.open(tar_file_path, "r:gz") as tar: + tar.extractall(paper_path) + except tarfile.ReadError: + log.error(f"{tar_file_path} is not a tar.gz file") continue - search_results = client.results(arxiv.Search(id_list=[row["paper_id"]])) + +def download_batch_papers(path: str, discipline: str, num_papers: int) -> None: + """ + Downloads a batch of papers from the arXiv repository + based on the specified discipline and number of papers. + + Args: + - path (str): The path where the downloaded papers will be saved. + - discipline (str): The discipline of the papers to be downloaded. + - num_papers (int): The number of papers to be downloaded. 
+ + Returns: + None + + Raises: + None + + Usage: + ```python + download_batch_papers(output_path, discipline, num_papers) + ``` + + """ + client = arxiv.Client() + + paper_ids = [] + while num_papers > 0: + search_results = client.results( + arxiv.Search(query=discipline, max_results=num_papers) + ) for result in search_results: - tar_file_path = result.download_source(dirpath=discipline_path) - log.debug(f"Downloading tar file {tar_file_path}") - paper_path = os.path.join(discipline_path, row["paper_id"]) - try: - with tarfile.open(tar_file_path, "r:gz") as tar: - tar.extractall(paper_path) - except tarfile.ReadError: - log.error(f"{tar_file_path} is not a tar.gz file") - continue + paper_id = result.entry_id.split("/")[-1].split("v")[0] + log.debug(f"Downloading paper {paper_id}") + if paper_id not in paper_ids: + paper_ids.append(paper_id) + num_papers -= 1 + download_papers_with_paper_id(path, discipline, paper_ids) -def main(): +def main() -> None: parser = argparse.ArgumentParser() parser.add_argument( "-p", "--path", type=str, required=True, help="Path to save result" ) parser.add_argument( - "-f", "--file", type=str, required=True, help="json file for saving result" + "-d", "--discipline", type=str, default="cs.CV", help="discipline to download" + ) + parser.add_argument( + "-i", "--num_papers", type=int, default=1, help="Number of papers to download" ) args = parser.parse_args() - output_path, json_file = args.path, args.file - - json_data = utils.load_json(json_file) + output_path, discipline, num_papers = args.path, args.discipline, args.num_papers - arxiv_download(json_data, output_path) + download_batch_papers(output_path, discipline, num_papers) if __name__ == "__main__": From 08753af8b8c1fc2b4ac7ccad3032d93118b7e1c8 Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Thu, 13 Jun 2024 16:30:56 +0800 Subject: [PATCH 23/39] refactor(batch_process.py): simplify code logic --- scripts/batch_process.py | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/scripts/batch_process.py b/scripts/batch_process.py index f146761..78dbe8d 100644 --- a/scripts/batch_process.py +++ b/scripts/batch_process.py @@ -3,27 +3,31 @@ import multiprocessing import shutil from typing import List -from uuid import uuid4 import pandas as pd from DocParser.vrdu import logger from DocParser.main import process_one_file -log_file = str(uuid4()) + ".log" -log = logger.setup_app_level_logger(file_name=log_file, level="INFO") - -database = "data/processed_paper_database.csv" +log = logger.setup_app_level_logger(file_name="batch_process.log", level="INFO") def filter_tex_files(discipline_path: str) -> List[str]: - """extract all MAIN.tex files for processing, if discipline_path is not None, then - only extract MAIN.tex files in the discipline_path (not recursive) + """ + Filters the list of tex files in the given discipline path. Args: - discipline_path (str): path to main directory. + discipline_path (str): The path to the discipline directory containing tex files. Returns: - List[str]: list of tex files that are compilable. + List[str]: A list of filtered tex files that meet the specified criteria. + + Raises: + Exception: If the processing fails. + + 1. Exclude tex files with names "paper_colored.tex", "paper_white.tex", and "paper_original.tex". + 2. Exclude tex files that are inside a subfolder. + 3. Ensure that the tex file is a main document by checking if it contains "\\begin{document}". 
+ """ tex_files = [] @@ -32,7 +36,6 @@ def filter_tex_files(discipline_path: str) -> List[str]: [os.path.join(root, file) for file in files if file.endswith(".tex")] ) - # TODO: move this to config redundant_tex_files = [ "paper_colored.tex", "paper_white.tex", @@ -63,14 +66,6 @@ def filter_tex_files(discipline_path: str) -> List[str]: log.debug(f"failed to read tex file: {tex_file} due to UnicodeDecodeError") continue - # skip processed papers - log.info(f"[VRDU] Before filtering, found {len(result)} tex files") - if os.path.exists(database): - df = pd.read_csv(database) - processed_papers = set(df["path"]) - result = [x for x in result if os.path.dirname(x) not in processed_papers] - - log.info(f"[VRDU] After filtering, found {len(result)} tex files") return result @@ -101,7 +96,6 @@ def process_one_discipline(path: str, cpu_count: int, discipline: str) -> None: finally: # save the process log log.info(f"[VRDU] discipline: {discipline}, finished processing.") - shutil.move(log_file, f"data/batch_process_{discipline}.log") def main(): From 50f564156a4b9606eee7f691b7cee4e60d945aa3 Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Thu, 13 Jun 2024 16:49:32 +0800 Subject: [PATCH 24/39] feat(scripts/): remove unused script --- scripts/convert_coco_to_yolo.py | 134 -------------------------------- 1 file changed, 134 deletions(-) delete mode 100644 scripts/convert_coco_to_yolo.py diff --git a/scripts/convert_coco_to_yolo.py b/scripts/convert_coco_to_yolo.py deleted file mode 100644 index 869e052..0000000 --- a/scripts/convert_coco_to_yolo.py +++ /dev/null @@ -1,134 +0,0 @@ -import os -import json -import argparse -from shutil import copyfile - - -# parser = argparse.ArgumentParser(description='Test yolo data.') -# parser.add_argument('-j', help='JSON file', dest='json', required=True) -# parser.add_argument('-o', help='path to output folder', dest='out', required=True) -# -# args = parser.parse_args() -# -# json_file = args.json -# output = args.out - -class COCO2YOLO: - def __init__(self, json_file, output_path): - self.json_file = json_file - self.output_path = output_path - self.output_image_path = output_path.replace('labels', 'images') - self.output_folder = os.path.dirname(os.path.dirname(self.output_path)) - self._check_file_and_dir() - self.labels = json.load(open(json_file, 'r', encoding='utf-8')) - self.coco_id_name_map = self._categories() - self.coco_name_list = list(self.coco_id_name_map.values()) - print("total images", len(self.labels['images'])) - print("total categories", len(self.labels['categories'])) - print("total labels", len(self.labels['annotations'])) - - def _check_file_and_dir(self): - if not os.path.exists(self.json_file): - raise ValueError("file not found") - os.makedirs(self.output_path, exist_ok=True) - os.makedirs(self.output_image_path, exist_ok=True) - - def _categories(self): - categories = {} - for cls in self.labels['categories']: - categories[cls['id']] = cls['name'] - return categories - - def _load_images_info(self): - images_info = {} - for image in self.labels['images']: - id = image['id'] - file_name = image['file_name'] - if file_name.find('\\') > -1: - file_name = file_name[file_name.index('\\') + 1:] - w = image['width'] - h = image['height'] - images_info[id] = (file_name, w, h) - - return images_info - - def _bbox_2_yolo(self, bbox, img_w, img_h): - x, y, w, h = bbox[0], bbox[1], bbox[2], bbox[3] - centerx = bbox[0] + w / 2 - centery = bbox[1] + h / 2 - dw = 1 / img_w - dh = 1 / img_h - centerx *= dw - w *= dw - centery *= dh - h *= dh - return 
centerx, centery, w, h - - def _convert_anno(self, images_info): - anno_dict = dict() - for anno in self.labels['annotations']: - bbox = anno['bbox'] - image_id = anno['image_id'] - category_id = anno['category_id'] - - image_info = images_info.get(image_id) - image_name = image_info[0] - img_w = image_info[1] - img_h = image_info[2] - yolo_box = self._bbox_2_yolo(bbox, img_w, img_h) - - anno_info = (image_name, category_id, yolo_box) - anno_infos = anno_dict.get(image_id) - if not anno_infos: - anno_dict[image_id] = [anno_info] - else: - anno_infos.append(anno_info) - anno_dict[image_id] = anno_infos - return anno_dict - - def save_classes(self): - sorted_classes = list(map(lambda x: x['name'], sorted(self.labels['categories'], key=lambda x: x['id']))) - print('coco names', sorted_classes) - with open(f'{self.output_folder}/classes.txt', 'w', encoding='utf-8') as f: - for cls in sorted_classes: - f.write(cls + '\n') - f.close() - - def coco2yolo(self): - print("loading image info...") - images_info = self._load_images_info() - print("loading done, total images", len(images_info)) - - print("start converting...") - anno_dict = self._convert_anno(images_info) - print("converting done, total labels", len(anno_dict)) - - self.save_classes() - - print("saving txt file...") - self._save_txt(anno_dict) - print("saving done") - - def _save_txt(self, anno_dict): - raw_images_path = os.path.join(os.path.dirname(self.json_file), 'images') - for k, v in anno_dict.items(): - file_name = os.path.splitext(v[0][0])[0] + ".txt" - image_name = os.path.splitext(v[0][0])[0] + ".png" - copyfile(f'{raw_images_path}/{image_name}', f'{self.output_image_path}/{image_name}') - with open(os.path.join(self.output_path, file_name), 'w', encoding='utf-8') as f: - # print(k, v) - for obj in v: - cat_name = self.coco_id_name_map.get(obj[1]) - category_id = self.coco_name_list.index(cat_name) - box = ['{:.6f}'.format(x) for x in obj[2]] - box = ' '.join(box) - line = str(category_id) + ' ' + box - f.write(line + '\n') - - -if __name__ == '__main__': - mode = 'val' - json_file = f'COCO_datasets/Multi-modal_COCO_dataset_2023-12-14-13_52_07/{mode}.json' - output = f'YOLO_datasets/Multi-modal_COCO_dataset_2023-12-14-13_52_07/labels/{mode}' - c2y = COCO2YOLO(json_file, output) - c2y.coco2yolo() From 622faddcbd9b414eb6ecdc271a054385db30d207 Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Thu, 13 Jun 2024 16:50:16 +0800 Subject: [PATCH 25/39] refactor(generate_reading_annotation.py): simplify code logic and add docstrings --- scripts/generate_reading_annotation.py | 47 +++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 5 deletions(-) diff --git a/scripts/generate_reading_annotation.py b/scripts/generate_reading_annotation.py index 928c78a..f098d64 100644 --- a/scripts/generate_reading_annotation.py +++ b/scripts/generate_reading_annotation.py @@ -2,6 +2,7 @@ import glob import multiprocessing import os +from pathlib import Path from DocParser.vrdu import utils from DocParser.vrdu import logger @@ -9,7 +10,25 @@ log = logger.setup_app_level_logger(file_name="generate_reading_annotation.log") -def generate_annotation(paper_path) -> None: +def process_one_paper(paper_path: Path) -> None: + """ + Process a single paper by generating reading annotations from order annotation. + + Args: + paper_path (Path): The path to the paper directory. 
+ + Returns: + None + + Raises: + None + + Usage: + ```python + process_one_paper("/path/to/paper") + ``` + + """ log.debug(f"processing paper {paper_path}") order_json_file = os.path.join(paper_path, "order_annotation.json") @@ -44,7 +63,25 @@ def generate_annotation(paper_path) -> None: utils.export_to_json(result, reading_json_file) -def generate_reading_annotation(input_path) -> None: +def process_dataset(input_path: Path) -> None: + """ + Process a dataset by iterating over each discipline and paper within it. + + Args: + input_path (Path): The path to the dataset source. + + Returns: + None: This function does not return any value. + + Raises: + None: This function does not raise any exceptions. + + Usage: + ```python + process_dataset("/path/to/dataset") + ``` + + """ discipline_paths = glob.glob(os.path.join(input_path, "*/")) for discipline_path in discipline_paths: @@ -52,10 +89,10 @@ def generate_reading_annotation(input_path) -> None: paper_paths = glob.glob(os.path.join(discipline_path, "*/")) with multiprocessing.Pool(34) as pool: - pool.map(generate_annotation, paper_paths) + pool.map(process_one_paper, paper_paths) -def main(): +def main() -> None: parser = argparse.ArgumentParser() parser.add_argument( "-i", "--input_path", type=str, required=True, help="Path of dataset source" @@ -63,7 +100,7 @@ def main(): args = parser.parse_args() input_path = args.input_path - generate_reading_annotation(input_path) + process_dataset(input_path) if __name__ == "__main__": From 2c7f3eae1ddabf22158d5b28624bba92b0bb5ad4 Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Thu, 13 Jun 2024 17:07:32 +0800 Subject: [PATCH 26/39] refactor(retrieve_metadata.py): retrieve metadata for papers --- scripts/retrive_metadata.py | 65 ++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 29 deletions(-) diff --git a/scripts/retrive_metadata.py b/scripts/retrive_metadata.py index f378ff2..6897c67 100644 --- a/scripts/retrive_metadata.py +++ b/scripts/retrive_metadata.py @@ -1,11 +1,10 @@ import glob import os +from pathlib import Path from typing import Any, Dict, List import arxiv import argparse -import pandas as pd - from DocParser.vrdu import utils from DocParser.vrdu import logger @@ -13,15 +12,31 @@ log = logger.setup_app_level_logger(file_name="retrieve_metadata.log") -def retrieve_metadata(data: Dict) -> List[Dict[str, Any]]: +def retrieve_metadata(data: Dict[str, Path], slice_length=100) -> List[Dict[str, Any]]: + """ + Retrieves metadata for the given list of paper IDs. + + Args: + data (Dict[str, Path]): A dictionary where keys are paper IDs and values are the paths to the corresponding papers. + slice_length (int, optional): The number of paper IDs to retrieve metadata for in each iteration. Defaults to 100. + + Returns: + List[Dict[str, Any]]: A list of dictionaries containing metadata for each paper. 
+ + Raises: + None + + References: + https://info.arxiv.org/help/api/user-manual.html#_details_of_atom_results_returned + + """ paper_ids = list(data.keys()) client = arxiv.Client() - slice_length = 100 paper_metadata = [] for i in range(0, len(paper_ids), slice_length): slices = paper_ids[i : i + slice_length] search_results = client.results(arxiv.Search(id_list=slices)) @@ -49,38 +64,30 @@ def retrieve_metadata(data: Dict) -> List[Dict[str, Any]]: return paper_metadata -def main(): +def main() -> None: parser = argparse.ArgumentParser() - parser.add_argument( - "-i", "--input_path", type=str, default="data/discipline_info.csv" - ) + parser.add_argument("-i", "--input_path", type=str, required=True) args = parser.parse_args() path = args.input_path - discipline_info = pd.read_csv("data/discipline_info.csv") - disciplines = set(discipline_info["discipline"]) - - for discipline in disciplines: - target_discipline_path = os.path.join(path, discipline) - paper_paths = glob.glob(os.path.join(target_discipline_path, "*/")) + paper_paths = glob.glob(os.path.join(path, "*/")) + # paper_id to paper path + data = {os.path.basename(paper_path[:-1]): paper_path for paper_path in paper_paths} - data = { - os.path.basename(paper_path[:-1]): paper_path for paper_path in paper_paths - } + paper_metadata = retrieve_metadata(data) - paper_metadata = retrieve_metadata(data) + # use append mode + existed_json_file = os.path.join(path, "paper_metadata.json") + existed_json_data = [] + if os.path.exists(existed_json_file): + existed_json_data = utils.load_json(existed_json_file) - existed_json_file = os.path.join(target_discipline_path, "paper_metadata.json") - existed_json_data = [] - if os.path.exists(existed_json_file): - existed_json_data = utils.load_json(existed_json_file) - - existed_paper_ids = [x["paper_id"] for x in existed_json_data] - existed_json_data.extend( - [x for x in paper_metadata if x["paper_id"] not in existed_paper_ids] - ) + existed_paper_ids = [x["paper_id"] for x in existed_json_data] + existed_json_data.extend( + [x for x in paper_metadata if x["paper_id"] not in existed_paper_ids] + ) - utils.export_to_json(existed_json_data, existed_json_file) + utils.export_to_json(existed_json_data, existed_json_file) if __name__ == "__main__": From 12e689b5589373d16aaf3244656f7224a194298b Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Thu, 13 Jun 2024 17:08:05 +0800 Subject: [PATCH 27/39] fix(retrieve_metadata.py): typo --- scripts/{retrive_metadata.py => retrieve_metadata.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename scripts/{retrive_metadata.py => retrieve_metadata.py} (100%) diff --git a/scripts/retrive_metadata.py b/scripts/retrieve_metadata.py similarity index 100% rename from scripts/retrive_metadata.py rename to scripts/retrieve_metadata.py From 652aaf4e85098636a8e10d9ad49a82d3f350ea2d Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Thu, 13 Jun 2024 17:44:55 +0800 Subject: [PATCH 28/39] refactor(scripts): simplify code logic Most of the logic of the two visualize-order-annotation scripts is the same, so they are merged. 
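A minimal usage sketch of the merged entry points (the module import, paths, and page index below are placeholders for illustration, not values from this repo):

```python
# Sketch only: calling both modes of the merged script directly.
# Assumes it is run from the scripts/ directory so the module resolves.
from visualize_order_annotations import (
    visualize_order_annotation_single_page,
    visualize_order_annotation_across_pages,
)

# Draw relation arrows for one page of a processed paper (hypothetical path).
visualize_order_annotation_single_page("/path/to/paper/output/result", 3)

# Or draw relations spanning two adjacent pages side by side.
visualize_order_annotation_across_pages("/path/to/paper/output/result", 3)
```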
--- scripts/visualize_order_annotations.py | 300 ++++++++++++++++++ ...visualize_order_annotations_single_page.py | 141 -------- .../visualize_order_annotations_two_page.py | 179 ----------- 3 files changed, 300 insertions(+), 320 deletions(-) create mode 100644 scripts/visualize_order_annotations.py delete mode 100644 scripts/visualize_order_annotations_single_page.py delete mode 100644 scripts/visualize_order_annotations_two_page.py diff --git a/scripts/visualize_order_annotations.py b/scripts/visualize_order_annotations.py new file mode 100644 index 0000000..b59b365 --- /dev/null +++ b/scripts/visualize_order_annotations.py @@ -0,0 +1,300 @@ +import argparse +from collections import defaultdict +import math +import os +from pathlib import Path +from typing import Any, Dict, List, Tuple +from PIL import Image, ImageDraw +from matplotlib import pyplot as plt + +from DocParser.vrdu import utils + + +def draw_arrow_line( + image: Image.Image, + point_A: Tuple[float, float], + point_B: Tuple[float, float], + width: int = 1, + color: Tuple[int, int, int] = (0, 255, 0), +) -> Image.Image: + """ + Draws an arrow line between two points on an image. + + Args: + image (PIL.Image.Image): The image on which to draw the arrow line. + point_A (Tuple[float, float]): The first point of the arrow line. + point_B (Tuple[float, float]): The second point of the arrow line. + width (int, optional): The width of the arrow line. Defaults to 1. + color (Tuple[int, int, int], optional): The color of the arrow line. Defaults to (0, 255, 0). + + Returns: + PIL.Image.Image: The image with the arrow line drawn. + + """ + draw = ImageDraw.Draw(image) + draw.line((point_A, point_B), width=width, fill=color) + + # Calculate arrowhead vertices + x0, y0 = point_A + x1, y1 = point_B + xb = 0.95 * (x1 - x0) + x0 + yb = 0.95 * (y1 - y0) + y0 + alpha = math.atan2(y1 - y0, x1 - x0) - 90 * math.pi / 180 + a = 8 * math.cos(alpha) + b = 8 * math.sin(alpha) + vtx0 = (xb + a, yb + b) + vtx1 = (xb - a, yb - b) + + # Draw the arrowhead triangle + draw.polygon([vtx0, vtx1, point_B], fill=color) + return image + + +def extract_relations( + page_index: int, order_annotation_data: Dict[str, Any], width=None +) -> List[Tuple[Tuple[float, float], Tuple[float, float], str]]: + """ + Extracts relations between blocks on a given page or across two adjacent pages. + + Args: + page_index (int): The index of the page to extract relations for. + order_annotation_data (Dict[str, Any]): The JSON file containing the order annotation data. + width (int, optional): The width of the image. If not provided, it assumes a single page. + + Returns: + List[Tuple[Tuple[float, float], Tuple[float, float], str]]: A list of tuples containing the coordinates of the block centers and the relation type. + + Raises: + FileNotFoundError: If the order annotation JSON file or any of the image files are not found. 
+ + Usage: + ```python + relations = extract_relations(10, order_annotation_data, 1000) + ``` + """ + page_blocks = defaultdict(list) + id2blocks = {} + page2id2 = defaultdict(list) + for block in order_annotation_data["annotations"]: + page_blocks[block["page_index"]].append(block) + id2blocks[block["block_id"]] = block + page2id2[block["page_index"]].append(block["block_id"]) + + # single page + if width is None: + relation_tuples = [] + for relation in order_annotation_data["orders"]: + if relation["from"] not in page2id2[page_index]: + continue + if relation["to"] not in page2id2[page_index]: + continue + print(relation) + block_from = id2blocks[relation["from"]] + block_to = id2blocks[relation["to"]] + center_from = ( + (block_from["bbox"][0] + block_from["bbox"][2]) / 2, + (block_from["bbox"][1] + block_from["bbox"][3]) / 2, + ) + center_to = ( + (block_to["bbox"][0] + block_to["bbox"][2]) / 2, + (block_to["bbox"][1] + block_to["bbox"][3]) / 2, + ) + relation_tuples.append((center_from, center_to, relation["type"])) + + return relation_tuples + + # two page + relation_tuples = [] + for relation in order_annotation_data["orders"]: + if relation["from"] not in page2id2[page_index] + page2id2[page_index + 1]: + continue + if relation["to"] not in page2id2[page_index] + page2id2[page_index + 1]: + continue + block_from = id2blocks[relation["from"]] + block_to = id2blocks[relation["to"]] + + center_x = (block_from["bbox"][0] + block_from["bbox"][2]) / 2 + center_y = (block_from["bbox"][1] + block_from["bbox"][3]) / 2 + if block_from["page_index"] != page_index: + center_x += width + center_from = (center_x, center_y) + + center_x = (block_to["bbox"][0] + block_to["bbox"][2]) / 2 + center_y = (block_to["bbox"][1] + block_to["bbox"][3]) / 2 + if block_to["page_index"] != page_index: + center_x += width + center_to = (center_x, center_y) + + relation_tuples.append((center_from, center_to, relation["type"])) + return relation_tuples + + +def visualize_order_annotation_on_image( + relation_tuples: List[Tuple[Tuple[float, float], Tuple[float, float], str]], + image: Image.Image, +) -> None: + """ + Visualizes the order annotation on an image. + + Args: + relation_tuples (List[Tuple[Tuple[float, float], Tuple[float, float], str]]): + A list of tuples containing the coordinates of the block centers and the relation type. + image (PIL.Image.Image): The image on which to draw the arrow lines. + + Returns: + None + + Raises: + FileNotFoundError: If the order annotation JSON file or any of the image files are not found. 
+ + Usage: + ```python + relation_tuples = extract_relations(10, order_annotation_data, 1000) + visualize_order_annotation_on_image(relation_tuples, image) + ``` + + """ + fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(17, 22)) + ax1.imshow(image) + color_map = { + "identical": "green", + "adj": "blue", + "peer": "red", + "implicit-cite": "purple", + "explicit-cite": "brown", + "sub": "orange", + } + + for relation in relation_tuples: + center_from, center_to, relation_type = relation + ax1.arrow( + center_from[0], + center_from[1], + center_to[0] - center_from[0], + center_to[1] - center_from[1], + fc=color_map[relation_type], + ec=color_map[relation_type], + width=3, + ) + ax1.axis("off") + + legend_handles = [] + legend_labels = [] + relation_type_maps = { + "identical": "identical", + "adj": "non-title adjacent", + "peer": "title adjacent", + "implicit-cite": "implicitly-referred", + "explicit-cite": "explicitly-referred", + "sub": "subordinate", + } + for relation_type, color in color_map.items(): + legend_handles.append( + plt.Line2D( + [0], [0], color=color, marker="o", linestyle="", label=relation_type + ) + ) + legend_labels.append(relation_type_maps[relation_type]) + + # Add the legend to ax2 + ax2.legend( + handles=legend_handles, + labels=legend_labels, + loc="upper center", + ncol=len(legend_handles), + ) + ax2.axis("off") + plt.tight_layout() + + plt.savefig("output/order_annotation.png", dpi=200) + + +def visualize_order_annotation_across_pages(path: Path, page_index: int) -> None: + """ + Visualizes the order annotation across two adjacent pages. + + Args: + path (Path): The path to the directory containing the images and the order annotation JSON file. + page_index (int): The index of the first page to be visualized. + + Returns: + None + + Raises: + FileNotFoundError: If the order annotation JSON file or any of the image files are not found. + + Usage: + ```python + visualize_order_annotation_across_pages("/path/to/directory", 10) + ``` + """ + order_annotation_file = os.path.join(path, "order_annotation.json") + image_file1 = os.path.join(path, f"page_{page_index:04}.jpg") + image_file2 = os.path.join(path, f"page_{page_index+1:04}.jpg") + + # extract blocks in this page + order_annotation_data = utils.load_json(order_annotation_file) + + # visualize + image1 = Image.open(image_file1) + image2 = Image.open(image_file2) + + relation_tuples = extract_relations(page_index, order_annotation_data, image1.width) + + # concatenate adjacent pages + width = image1.width + image2.width + image = Image.new("RGB", (width, image1.height)) + image.paste(image1, (0, 0)) + image.paste(image2, (image1.width, 0)) + image.save("concatenated_image.png") + + visualize_order_annotation_on_image(relation_tuples, image) + + +def visualize_order_annotation_single_page(path: Path, page_index: int) -> None: + """ + Visualizes the order annotation on a single page. + + Args: + path (Path): The path to the directory containing the image and the order annotation JSON file. + page_index (int): The index of the page to be visualized. + + Returns: + None + + Raises: + FileNotFoundError: If the order annotation JSON file or any of the image files are not found. 
+ + Usage: + ```python + visualize_order_annotation_single_page("/path/to/directory", 10) + ``` + """ + order_annotation_file = os.path.join(path, "order_annotation.json") + order_annotation_data = utils.load_json(order_annotation_file) + + image_file = os.path.join(path, f"page_{page_index:04}.jpg") + image = Image.open(image_file) + + # extract blocks in this page + relation_tuples = extract_relations(page_index, order_annotation_data) + + # visualize + visualize_order_annotation_on_image(relation_tuples, image) + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("-p", "--path", help="path to the directory containing order_annotation.json and page images", type=str) + parser.add_argument("-i", "--page_index", help="page index", type=int) + args = parser.parse_args() + + path = args.path + page_index = args.page_index + + visualize_order_annotation_single_page(path, page_index) + # visualize_order_annotation_across_pages(path, page_index) + + +if __name__ == "__main__": + main() diff --git a/scripts/visualize_order_annotations_single_page.py b/scripts/visualize_order_annotations_single_page.py deleted file mode 100644 index 68a0cf2..0000000 --- a/scripts/visualize_order_annotations_single_page.py +++ /dev/null @@ -1,141 +0,0 @@ -import argparse -from collections import defaultdict -import math -import os -from typing import Tuple -from PIL import Image, ImageDraw -from matplotlib import pyplot as plt - -from DocParser.vrdu import utils - - -def arrowedLine( - image: Image.Image, - point_A: Tuple[float, float], - point_B: Tuple[float, float], - width=1, - color=(0, 255, 0), -) -> Image.Image: - """Draw a line from point_A to point_B with an arrow headed at ppoint_B.""" - draw = ImageDraw.Draw(image) - draw.line((point_A, point_B), width=width, fill=color) - - # Calculate arrowhead vertices - x0, y0 = point_A - x1, y1 = point_B - xb = 0.95 * (x1 - x0) + x0 - yb = 0.95 * (y1 - y0) + y0 - alpha = math.atan2(y1 - y0, x1 - x0) - 90 * math.pi / 180 - a = 8 * math.cos(alpha) - b = 8 * math.sin(alpha) - vtx0 = (xb + a, yb + b) - vtx1 = (xb - a, yb - b) - - # Draw the arrowhead triangle - draw.polygon([vtx0, vtx1, point_B], fill=color) - return image - - -def visualize_order_annotation_single_page(path: str, page_index: int) -> None: - order_annotation_file = os.path.join(path, "order_annotation.json") - image_file = os.path.join(path, f"page_{page_index:04}.jpg") - - # extract blocks in this page - order_annotation_data = utils.load_json(order_annotation_file) - page_blocks = defaultdict(list) - id2blocks = {} - page2id2 = defaultdict(list) - for block in order_annotation_data["annotations"]: - page_blocks[block["page_index"]].append(block) - id2blocks[block["block_id"]] = block - page2id2[block["page_index"]].append(block["block_id"]) - - page_relations = [] - for item in order_annotation_data["orders"]: - if item["from"] not in page2id2[page_index]: - continue - if item["to"] not in page2id2[page_index]: - continue - page_relations.append(item) - - # visualize - image = Image.open(image_file) - width, height = image.size - - fig, (ax1, ax2) = plt.subplots( - 2, 1, figsize=(16, 20), gridspec_kw={"height_ratios": [5, 1]} - ) - ax1.imshow(image, extent=[0, width, height, 0]) - ax1.set_xlim(0, width) - ax1.set_ylim(height, 0) - color_map = { - "identical": "green", - "adj": "blue", - "peer": "red", - "implicit-cite": "purple", - "explicit-cite": "brown", - "sub": "orange", - "unknown": "black", - } - - for relation in page_relations: - print(relation) - block_from = id2blocks[relation["from"]] - block_to = 
id2blocks[relation["to"]] - center_from = ( - (block_from["bbox"][0] + block_from["bbox"][2]) / 2, - (block_from["bbox"][1] + block_from["bbox"][3]) / 2, - ) - center_to = ( - (block_to["bbox"][0] + block_to["bbox"][2]) / 2, - (block_to["bbox"][1] + block_to["bbox"][3]) / 2, - ) - ax1.arrow( - center_from[0], - center_from[1], - center_to[0] - center_from[0], - center_to[1] - center_from[1], - fc=color_map[relation["type"]], - ec=color_map[relation["type"]], - width=3, - ) - ax1.axis("off") - - legend_handles = [] - legend_labels = [] - for relation_type, color in color_map.items(): - legend_handles.append( - plt.Line2D( - [0], [0], color=color, marker="o", linestyle="", label=relation_type - ) - ) - legend_labels.append(relation_type) - - # Add the legend to ax2 - ax2.legend( - handles=legend_handles, - labels=legend_labels, - loc="upper center", - ncol=len(legend_handles), - ) - ax2.axis("off") - plt.tight_layout() - - # plt.show() - plt.savefig("output/order_annotation.png", dpi=200) - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("-p", "--path", help="path to the path", type=str) - parser.add_argument("-i", "--page_index", help="page index", type=int) - args = parser.parse_args() - - path = args.path - page_index = args.page_index - - visualize_order_annotation_single_page(path, page_index) - - -if __name__ == "__main__": - main() diff --git a/scripts/visualize_order_annotations_two_page.py b/scripts/visualize_order_annotations_two_page.py deleted file mode 100644 index d534ea7..0000000 --- a/scripts/visualize_order_annotations_two_page.py +++ /dev/null @@ -1,179 +0,0 @@ -import argparse -from collections import defaultdict -import math -import os -from typing import Tuple -from PIL import Image, ImageDraw -from matplotlib import pyplot as plt - -from DocParser.vrdu import utils - - -def arrowedLine( - image: Image.Image, - point_A: Tuple[float, float], - point_B: Tuple[float, float], - width=1, - color=(0, 255, 0), -) -> Image.Image: - """Draw a line from point_A to point_B with an arrow headed at ppoint_B.""" - draw = ImageDraw.Draw(image) - draw.line((point_A, point_B), width=width, fill=color) - - # Calculate arrowhead vertices - x0, y0 = point_A - x1, y1 = point_B - xb = 0.95 * (x1 - x0) + x0 - yb = 0.95 * (y1 - y0) + y0 - alpha = math.atan2(y1 - y0, x1 - x0) - 90 * math.pi / 180 - a = 8 * math.cos(alpha) - b = 8 * math.sin(alpha) - vtx0 = (xb + a, yb + b) - vtx1 = (xb - a, yb - b) - - # Draw the arrowhead triangle - draw.polygon([vtx0, vtx1, point_B], fill=color) - return image - - -def visualize_order_annotation_across_pages(path: str, page_index: int) -> None: - order_annotation_file = os.path.join(path, "order_annotation.json") - image_file1 = os.path.join(path, f"page_{page_index:04}.jpg") - image_file2 = os.path.join(path, f"page_{page_index+1:04}.jpg") - - # extract blocks in this page - order_annotation_data = utils.load_json(order_annotation_file) - page_blocks = defaultdict(list) - id2blocks = {} - page2id2 = defaultdict(list) - for block in order_annotation_data["annotations"]: - page_blocks[block["page_index"]].append(block) - id2blocks[block["block_id"]] = block - page2id2[block["page_index"]].append(block["block_id"]) - - page_relations = [] - for item in order_annotation_data["orders"]: - if item["from"] not in page2id2[page_index] + page2id2[page_index + 1]: - continue - if item["to"] not in page2id2[page_index] + page2id2[page_index + 1]: - continue - page_relations.append(item) - - # visualize - image1 = Image.open(image_file1) 
- - image2 = Image.open(image_file2) - - # crop - margin = 150 - h_margin = margin - v_margin = margin * 17 / 22 - bbox = (v_margin, h_margin, 1700 - v_margin * 2.2, 2200 - h_margin * 0.9) - image1 = image1.crop(bbox) - image2 = image2.crop(bbox) - - width, height = image1.size - new_width = image1.width + image2.width - new_image = Image.new("RGB", (new_width, height)) - new_image.paste(image1, (0, 0)) - new_image.paste(image2, (image1.width, 0)) - new_image.save(f"concatenated_image_{page_index}_{page_index+1}.png") - - fig, (ax1, ax2) = plt.subplots( - 2, 1, figsize=(17, 22) # , gridspec_kw={"height_ratios": [4, 1]} - ) - ax1.imshow(new_image, extent=[0, new_width, height, 0]) - ax1.set_xlim(0, new_width) - ax1.set_ylim(height, 0) - color_map = { - "identical": "green", - "adj": "blue", - "peer": "red", - "implicit-cite": "purple", - "explicit-cite": "brown", - "sub": "orange", - } - - for relation in page_relations: - print(relation) - block_from = id2blocks[relation["from"]] - block_to = id2blocks[relation["to"]] - if block_from["page_index"] != page_index: - center_from = ( - width + (block_from["bbox"][0] + block_from["bbox"][2]) / 2, - (block_from["bbox"][1] + block_from["bbox"][3]) / 2, - ) - else: - center_from = ( - (block_from["bbox"][0] + block_from["bbox"][2]) / 2, - (block_from["bbox"][1] + block_from["bbox"][3]) / 2, - ) - - if block_to["page_index"] != page_index: - center_to = ( - width + (block_to["bbox"][0] + block_to["bbox"][2]) / 2, - (block_to["bbox"][1] + block_to["bbox"][3]) / 2, - ) - else: - center_to = ( - (block_to["bbox"][0] + block_to["bbox"][2]) / 2, - (block_to["bbox"][1] + block_to["bbox"][3]) / 2, - ) - ax1.arrow( - center_from[0] - margin, - center_from[1] - margin, - center_to[0] - center_from[0], - center_to[1] - center_from[1], - fc=color_map[relation["type"]], - ec=color_map[relation["type"]], - width=3, - ) - ax1.axis("off") - - legend_handles = [] - legend_labels = [] - relation_type_maps = { - "identical": "identical", - "adj": "non-title adjac", - "peer": "title adjacent", - "implicit-cite": "implicitly-referred", - "explicit-cite": "explicitly-referred", - "sub": "subordinate", - } - for relation_type, color in color_map.items(): - legend_handles.append( - plt.Line2D( - [0], [0], color=color, marker="o", linestyle="", label=relation_type - ) - ) - legend_labels.append(relation_type_maps[relation_type]) - - # Add the legend to ax2 - ax2.legend( - handles=legend_handles, - labels=legend_labels, - loc="upper center", - ncol=len(legend_handles), - ) - ax2.axis("off") - plt.tight_layout() - - # plt.show() - - plt.savefig(f"output/order_annotation_{page_index}_{page_index + 1}.png", dpi=200) - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("-p", "--path", help="path to the path", type=str) - parser.add_argument("-i", "--page_index", help="page index", type=int) - args = parser.parse_args() - - path = args.path - page_index = args.page_index - - visualize_order_annotation_across_pages(path, page_index) - - -if __name__ == "__main__": - main() From ced7f36d39a3164ebcdf1632e6edfd2b9083b23f Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Thu, 13 Jun 2024 17:45:15 +0800 Subject: [PATCH 29/39] feat(scripts): remove unused scripts --- scripts/run_statistics.py | 249 -------------------------------------- 1 file changed, 249 deletions(-) delete mode 100644 scripts/run_statistics.py diff --git a/scripts/run_statistics.py b/scripts/run_statistics.py deleted file mode 100644 index 2f8a70a..0000000 --- 
a/scripts/run_statistics.py +++ /dev/null @@ -1,249 +0,0 @@ -import glob -import os -import pandas as pd -import argparse -from datetime import datetime - -from DocParser.vrdu import utils -from DocParser.vrdu.config import config - -from DocParser.vrdu import logger - -log = logger.setup_app_level_logger(file_name="statistics.log") - - -database_file = "data/processed_paper_database.csv" -daily_overview_file = "data/daily_overview.csv" -discpline_info_file = "data/discpline_info.csv" - - -def extract_time(line: str) -> datetime: - time_format = "%Y-%m-%d %H:%M:%S,%f" - log_time = line.split(" - ")[0][1:] - return datetime.strptime(log_time, time_format) - - -def init_dataframe() -> pd.DataFrame: - if os.path.exists(database_file): - return pd.read_csv(database_file, dtype={"uuid": str}) - columns = [ - "uuid", - "discpline", - "path", - "status", - "start_time", - "end_time", - "error_type", - "error_info", - "date", - "pages", - "columns", - "blocks", - "overlap", - ] - df = pd.DataFrame(columns=columns) - return df - - -def update_processed_database(input_path: str): - """store the information of processed papers into a csv file""" - df = init_dataframe() - - log_files = glob.glob(os.path.join(input_path, "batch_process_*.log")) - for log_file in log_files: - log.info(f"processing log file: {log_file}") - - if not os.path.exists(log_file): - continue - with open(log_file, "r") as f: - lines = [line.strip() for line in f.readlines()] - - for line in lines: - if not line.startswith("["): - continue - - if line.find("start to process") != -1: - discpline = line.split("discpline: ")[1].split(", ")[0] - continue - - if line.find("[VRDU] file") == -1: - continue - tex_file = line.split("[VRDU] file: ")[1].split(", ")[0] - path = os.path.dirname(tex_file) - if os.path.basename(os.path.dirname(path)) != discpline: - log.debug(f"unknown discpline: {tex_file}") - continue - - # extract uuid and title - uuid = os.path.basename(path) - log.debug(f"uuid: {uuid}") - current_time = extract_time(line) - - # new file - if line.find("start processing") != -1: - if uuid in df["uuid"].values: - continue - - log.debug(f"new file: {tex_file} with uuid: {uuid}") - data_item = { - "index": len(df), - "uuid": uuid, - "discpline": discpline, - "path": path, - "status": "processing", - "start_time": str(current_time), - "end_time": "", - "error_type": "", - "error_info": "", - "date": "", - "pages": 0, - "columns": 0, - "blocks": 0, - "overlap": 0.0, - } - df.loc[len(df)] = data_item - continue - - # success processing file, update information - if ( - line.find("successfully processed") != -1 - or line.find("paper has been processed") != -1 - ): - if uuid in df["uuid"].values: - index = df[df["uuid"] == uuid].index[0] - if df.loc[index, "status"] == "success": - continue - df.loc[df["uuid"] == uuid, "status"] = "success" - df.loc[df["uuid"] == uuid, "end_time"] = current_time - continue - - # failed to process file, update status and eror information - if line.find("message: ") != -1: - if uuid in df["uuid"].values: - index = df[df["uuid"] == uuid].index[0] - if df.loc[index, "status"] == "failure": - continue - error_type = line.split("type: ")[1].split(", ")[0] - error_info = line.split("message: ")[1].strip() - - df.loc[df["uuid"] == uuid, "status"] = "failure" - df.loc[df["uuid"] == uuid, "error_type"] = error_type - df.loc[df["uuid"] == uuid, "error_info"] = error_info - df.loc[df["uuid"] == uuid, "end_time"] = current_time - continue - - category_names = list(config.category2name.values()) - for 
category_name in category_names: - df[category_name] = 0 - - for index in range(len(df)): - if df.loc[index, "status"] != "success": - continue - - if df.loc[index, "pages"] != 0: - continue - - # use output result to update information - path = df.loc[index, "path"] - quality_report = utils.load_json( - os.path.join(path, "output/result/quality_report.json") - ) - df.loc[index, "pages"] = quality_report["num_pages"] - df.loc[index, "columns"] = quality_report["num_columns"] - df.loc[index, "blocks"] = quality_report["category_quality"][-1][ - "geometry_count" - ] - df.loc[index, "overlap"] = quality_report["page_quality"][-1]["ratio"] - - for category_item in quality_report["category_quality"]: - df.loc[index, category_item["category"]] = category_item["geometry_count"] - - # remove processing files - df = df[~(df["status"] == "processing")] - df.to_csv(database_file, index=False) - - -def update_discpline_info(): - df = pd.read_csv(discpline_info_file) - - for log_file in glob.glob("data/batch_process_*.log"): - discpline = ( - os.path.basename(log_file).split("batch_process_")[1].split(".log")[0] - ) - with open(log_file) as f: - lines = f.readlines() - for line in lines: - if line.find("finished processing.") != -1: - df.loc[df["discpline"] == discpline, "status"] = "complete" - else: - df.loc[df["discpline"] == discpline, "status"] = "processing" - - database_df = pd.read_csv(database_file) - for index, row in df.iterrows(): - df.loc[index, "success"] = len( - database_df[ - (database_df["discpline"] == row["discpline"]) - & (database_df["status"] == "success") - ] - ) - df.loc[index, "failure"] = len( - database_df[ - (database_df["discpline"] == row["discpline"]) - & (database_df["status"] == "failure") - ] - ) - processed_papers = len( - database_df[(database_df["discpline"] == row["discpline"])] - ) - - df.loc[index, "processed"] = processed_papers - - df.to_csv(discpline_info_file, index=False) - - -def update_daily_overview() -> None: - daily_df = pd.read_csv(daily_overview_file) - database_df = pd.read_csv(database_file) - - num_total_papers = database_df.shape[0] - num_total_processed = database_df[database_df["status"] == "success"].shape[0] - - last_index = daily_df.index[-1] - num_daily_papers = num_total_papers - daily_df.loc[last_index, "#total papers"] - num_daily_processed = ( - num_total_processed - daily_df.loc[last_index, "#total processed"] - ) - - if num_total_papers == daily_df.loc[last_index, "#total papers"]: - log.info("Please update database file before running this script.") - - daily_df.loc[last_index + 1, "date"] = datetime.today().strftime("%Y-%m-%d") - daily_df.loc[last_index + 1, "#daily papers"] = num_daily_papers - daily_df.loc[last_index + 1, "#daily processed"] = num_daily_processed - daily_df.loc[last_index + 1, "#total papers"] = num_total_papers - daily_df.loc[last_index + 1, "#total processed"] = num_total_processed - daily_df.loc[last_index + 1, "#discplines"] = database_df["discpline"].nunique() - daily_df["daily pass ratio"] = ( - daily_df["#daily processed"] / daily_df["#daily papers"] - ) - daily_df["total pass ratio"] = ( - daily_df["#total processed"] / daily_df["#total papers"] - ) - - daily_df.to_csv("data/daily_overview.csv", index=False) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--input_path", type=str, default="output/") - args = parser.parse_args() - - update_processed_database(args.input_path) - - update_daily_overview() - - update_discpline_info() - - -if __name__ == "__main__": - main() From 
d564e08c3f0180240adbf1eed0085ce53e06c5f0 Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Thu, 13 Jun 2024 17:46:25 +0800 Subject: [PATCH 30/39] perf(settings.json): add some nouns used in this project --- .vscode/settings.json | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 03ba071..69cbc88 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,35 +1,59 @@ { "cSpell.words": [ "arxiv", + "autoref", + "colorlinks", "columnsep", "columnwidth", "definecolor", "documentclass", "dtype", + "epspdf", + "eqref", "flickr", + "footnotetext", + "graphicspath", + "headheight", + "headsep", "hoffset", + "hyperref", + "hypersetup", "imread", "includegraphics", "iscrowd", + "labelcref", "laparams", "latexpand", + "levelname", + "lstinputlisting", + "lstlisting", + "lstset", + "nonstopmode", "oddsidemargin", "opencv", + "pageref", "pdfcrop", "pdflatex", "pdfminer", + "psfig", "regionprops", "renewcommand", "rgbcolor", "scikit", "skimage", "subimport", + "synctex", + "tablefootnote", + "texlive", "Texsoup", "textcolor", "textwidth", + "topmargin", "tqdm", "usepackage", + "voffset", "vrdu", - "xcolor" + "xcolor", + "YOLO" ] } \ No newline at end of file From 85b080c4729ac773cae8d37e00b38895da4b2e31 Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Mon, 17 Jun 2024 11:29:25 +0800 Subject: [PATCH 31/39] refactor(layout_annotation.py): rm reading_annotation the reading annotation result is already contained in order annotation --- DocParser/vrdu/layout_annotation.py | 67 ----------------------------- 1 file changed, 67 deletions(-) diff --git a/DocParser/vrdu/layout_annotation.py b/DocParser/vrdu/layout_annotation.py index 31b7a1f..b886a2a 100644 --- a/DocParser/vrdu/layout_annotation.py +++ b/DocParser/vrdu/layout_annotation.py @@ -399,66 +399,6 @@ def generate_layout_info(self) -> Dict[int, List[Block]]: layout_info[page_index].extend(figure_layout_info[page_index]) return layout_info - def generate_reading_annotation( - self, layout_info: Dict[int, List[Block]] - ) -> DefaultDict[str, List]: - """Generate a reading annotation based on the layout information. - - Args: - layout_info (Dict[int, List[Block]]): A dictionary containing the layout information - for each page index. The keys are the page indices and the values are lists of - `Block` objects representing the blocks on each page. - - Returns: - DefaultDict[str, List]: A defaultdict containing the reading annotation. The keys - of the defaultdict are the page indices and the values are lists of dictionaries - representing the reading annotation for each block on the page. Each dictionary - contains the following keys: - - "source_code": The source code of the block. - - "image_path": The path to the saved image of the block. - - "category": The category of the block. - - The defaultdict also contains the following keys: - - "categories": A list of dictionaries representing the categories. Each - dictionary contains the following keys: - - "id": The ID of the category. - - "name": The name of the category. - - "macros": A dictionary containing the macro definitions extracted from - the original tex file. 
- """ - reading_annotation = defaultdict(list) - - # sort all images by page index, see utils.pdf2jpg for details - image_files = sorted( - glob.glob(os.path.join(self.pdf_images_path, "*.jpg")), - key=lambda x: x[-6:-4], - ) - count = 0 - for page_index in layout_info.keys(): - page_image = Image.open(image_files[page_index]) - for block in layout_info[page_index]: - cropped_image = page_image.crop(block.bbox) - - image_name = config.folder_prefix + str(count).zfill(4) + ".jpg" - count += 1 - image_path = os.path.join(self.result_directory, image_name) - cropped_image.save(image_path) - reading_annotation[page_index].append( - { - "source_code": block.source_code, - "image_path": image_name, - "category": block.category, - } - ) - page_image.close() - - reading_annotation["categories"] = [ - {"id": index, "name": category} - for index, category, _ in config.config["category_name"] - ] - - return reading_annotation - def generate_image_annotation( self, layout_info: Dict[int, List[Block]] ) -> Dict[int, Dict[str, Any]]: @@ -536,13 +476,6 @@ def annotate(self): layout_info, image_annotation, file_path=layout_annotation_file ) - # step3: generate reading annotation - reading_annotation = self.generate_reading_annotation(layout_info) - reading_annotation_file = os.path.join( - self.result_directory, "reading_annotation.json" - ) - utils.export_to_json(reading_annotation, reading_annotation_file) - def get_image_pairs(dir1: str, dir2: str): """ From 0588f2ce610e763f4c9b5ea8305c3ebb46241fbd Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Mon, 17 Jun 2024 15:12:56 +0800 Subject: [PATCH 32/39] fix(main.py): make dirs twice --- DocParser/main.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/DocParser/main.py b/DocParser/main.py index 2abedd8..3457e0c 100644 --- a/DocParser/main.py +++ b/DocParser/main.py @@ -103,10 +103,6 @@ def process_one_file(file_name: str) -> None: if os.path.exists(output_directory): shutil.rmtree(output_directory) - # output_directory stores the intermediate results - # result_directory stores the final results - os.makedirs(os.path.join(main_directory, "output/result")) - cwd = os.getcwd() try: From 5f3768c16335b7c8d5937cbbaab13d288d0b9f8d Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Mon, 17 Jun 2024 15:26:58 +0800 Subject: [PATCH 33/39] refactor(renderer.py): merge logic of processing predefined color --- DocParser/vrdu/renderer.py | 25 ++++--------------------- 1 file changed, 4 insertions(+), 21 deletions(-) diff --git a/DocParser/vrdu/renderer.py b/DocParser/vrdu/renderer.py index 5737bc7..1187586 100644 --- a/DocParser/vrdu/renderer.py +++ b/DocParser/vrdu/renderer.py @@ -362,30 +362,13 @@ def remove_hyperref_color(self, color_tex: str) -> None: if re.search(pattern, content[:preamble_loc]): content = content[:preamble_loc] + hyper_setup + content[preamble_loc:] - # Write the modified content back to the input file - with open(color_tex, "w") as file: - file.write(content) - - def remove_lstlisting_color(self, color_tex: str) -> None: - """Remove color definitions from a LaTeX file. - - Args: - color_tex (str): The path to the LaTeX file. 
- - Returns: - None - """ - # Read the content of the input file - with open(color_tex, "r") as file: - content = file.read() - - # delete the color definitions + # delete the lstlisting color definitions pattern = r"\\lstset\{.*?\}" - modified_content = re.sub(pattern, "", content) + content = re.sub(pattern, "", content) - # Write the modified content to the output file + # Write the modified content back to the input file with open(color_tex, "w") as file: - file.write(modified_content) + file.write(content) def modify_color_definitions(self, input_file: str, output_file: str) -> None: """Modify the pre-defined color definitions in the input file and write the modified content to the output file. From 0b5af51bd844b6bd917a83a6980dfc1dcf8060dc Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Mon, 17 Jun 2024 15:27:33 +0800 Subject: [PATCH 34/39] refactor(renderer.py, test/): use more meaningful name --- DocParser/vrdu/renderer.py | 7 +++---- ..._hyperref_color.py => test_remove_predefined_color.py} | 8 ++++---- 2 files changed, 7 insertions(+), 8 deletions(-) rename tests/{test_remove_hyperref_color.py => test_remove_predefined_color.py} (91%) diff --git a/DocParser/vrdu/renderer.py b/DocParser/vrdu/renderer.py index 1187586..b4da923 100644 --- a/DocParser/vrdu/renderer.py +++ b/DocParser/vrdu/renderer.py @@ -51,8 +51,7 @@ def render(self, origin_tex: str) -> None: self.add_layout_definition(color_tex) # remove color definitions to prevent conflict - self.remove_hyperref_color(color_tex) - self.remove_lstlisting_color(color_tex) + self.remove_predefined_color(color_tex) self.render_all_env(color_tex) @@ -330,8 +329,8 @@ def add_layout_definition(self, color_tex: str) -> None: with open(color_tex, "w") as f: f.write(content) - def remove_hyperref_color(self, color_tex: str) -> None: - """Removes hyperref color settings from a LaTeX file. + def remove_predefined_color(self, color_tex: str) -> None: + """Removes hyperref and lstlisting color settings from a LaTeX file. Args: color_tex (str): The path to the LaTeX file to modify. 
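For context, the merged remove_predefined_color performs both cleanups in a single read-modify-write pass: it injects a \hypersetup{colorlinks=false} override after the hyperref setup and strips any \lstset{...} options group with the non-greedy pattern shown in the hunk above. A minimal standalone sketch of the \lstset pass follows (the sample preamble is illustrative, not taken from the repository):

    import re

    # Illustrative preamble containing a flat \lstset options group.
    preamble = (
        r"\documentclass{article}"
        r"\lstset{language=Python, numbers=left}"
        r"\begin{document}\end{document}"
    )

    # Same pattern as the patched code: non-greedy, so the match stops at the
    # first closing brace after \lstset{.
    cleaned = re.sub(r"\\lstset\{.*?\}", "", preamble)
    print(cleaned)  # \documentclass{article}\begin{document}\end{document}

Because the match is non-greedy and stops at the first closing brace, an \lstset group whose option values themselves contain braces (e.g. commentstyle=\color{blue}) would only be partially removed, leaving a stray brace behind; the flat case above is the clean one.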
diff --git a/tests/test_remove_hyperref_color.py b/tests/test_remove_predefined_color.py similarity index 91% rename from tests/test_remove_hyperref_color.py rename to tests/test_remove_predefined_color.py index 3b6a287..356f378 100644 --- a/tests/test_remove_hyperref_color.py +++ b/tests/test_remove_predefined_color.py @@ -21,7 +21,7 @@ def test1(self): new=unittest.mock.mock_open(read_data=self.mock_file_content1), create=True, ) as file_mock: - self.renderer.remove_hyperref_color(file_mock) + self.renderer.remove_predefined_color(file_mock) file_mock.assert_called_with(file_mock, "w") file_mock().write.assert_called_with( """\\documentclass{article}\\begin{document}\\end{document}""" @@ -33,7 +33,7 @@ def test2(self): new=unittest.mock.mock_open(read_data=self.mock_file_content2), create=True, ) as file_mock: - self.renderer.remove_hyperref_color(file_mock) + self.renderer.remove_predefined_color(file_mock) file_mock.assert_called_with(file_mock, "w") file_mock().write.assert_called_with( """\\documentclass{article}\\usepackage{hyperref}\\hypersetup{colorlinks=false}\n\\begin{document}\\end{document}""" @@ -45,7 +45,7 @@ def test3(self): new=unittest.mock.mock_open(read_data=self.mock_file_content3), create=True, ) as file_mock: - self.renderer.remove_hyperref_color(file_mock) + self.renderer.remove_predefined_color(file_mock) file_mock.assert_called_with(file_mock, "w") file_mock().write.assert_called_with( """\\documentclass{article}\\usepackage[color_links=true]{hyperref}\\hypersetup{colorlinks=false}\n\\begin{document}\\end{document}""" @@ -57,7 +57,7 @@ def test4(self): new=unittest.mock.mock_open(read_data=self.mock_file_content4), create=True, ) as file_mock: - self.renderer.remove_hyperref_color(file_mock) + self.renderer.remove_predefined_color(file_mock) file_mock.assert_called_with(file_mock, "w") file_mock().write.assert_called_with( """\\documentclass{article}\\usepackage[color_links=true]{hyperref}\\usepackage{amsmath}\\hypersetup{colorlinks=false}\n\\begin{document}\\end{document}""" From b36c7981fff924ae5cb9a53b7282412c318ec06e Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Mon, 15 Jul 2024 16:55:15 +0800 Subject: [PATCH 35/39] fix(all): Module DocParser not found --- DocParser/TexSoup/app/conversion.py | 8 ++++---- DocParser/vrdu/renderer.py | 10 +++++----- DocParser/vrdu/utils.py | 4 ++-- scripts/app.py | 4 ++-- scripts/arxiv_download.py | 2 +- scripts/batch_process.py | 4 ++-- scripts/export_to_dataset.py | 2 +- scripts/generate_reading_annotation.py | 4 ++-- scripts/retrieve_metadata.py | 4 ++-- scripts/visualize_order_annotations.py | 2 +- setup.py | 2 +- tests/test_add_definitions.py | 2 +- tests/test_extract_graphics.py | 2 +- tests/test_is_text_eq.py | 2 +- tests/test_remove_predefined_color.py | 2 +- tests/test_render_abstract.py | 2 +- tests/test_render_algorithm.py | 2 +- tests/test_render_caption.py | 2 +- tests/test_render_code.py | 6 +++--- tests/test_render_footnote.py | 2 +- tests/test_render_tabular.py | 2 +- tests/test_render_title.py | 2 +- 22 files changed, 36 insertions(+), 36 deletions(-) diff --git a/DocParser/TexSoup/app/conversion.py b/DocParser/TexSoup/app/conversion.py index 474c228..3ffe746 100644 --- a/DocParser/TexSoup/app/conversion.py +++ b/DocParser/TexSoup/app/conversion.py @@ -1,11 +1,11 @@ import re -from DocParser.TexSoup.TexSoup import TexSoup -from DocParser.TexSoup.TexSoup.data import TexEnv, TexText, TexCmd, TexGroup +from TexSoup.TexSoup import TexSoup +from TexSoup.TexSoup.data import TexEnv, TexText, TexCmd, TexGroup -from 
DocParser.vrdu import logger -from DocParser.vrdu.config import envs +from vrdu import logger +from vrdu.config import envs log = logger.get_logger(__name__) diff --git a/DocParser/vrdu/renderer.py b/DocParser/vrdu/renderer.py index b4da923..d277ebd 100644 --- a/DocParser/vrdu/renderer.py +++ b/DocParser/vrdu/renderer.py @@ -5,12 +5,12 @@ import re -import DocParser.vrdu.utils as utils -import DocParser.vrdu.logger as logger -from DocParser.vrdu.config import config, envs +import vrdu.utils as utils +import vrdu.logger as logger +from vrdu.config import config, envs -from DocParser.TexSoup.TexSoup import TexSoup -import DocParser.TexSoup.app.conversion as conversion +from TexSoup.TexSoup import TexSoup +import TexSoup.app.conversion as conversion log = logger.get_logger(__name__) diff --git a/DocParser/vrdu/utils.py b/DocParser/vrdu/utils.py index be6fa51..aecefe8 100755 --- a/DocParser/vrdu/utils.py +++ b/DocParser/vrdu/utils.py @@ -7,8 +7,8 @@ from pdf2image import pdf2image from pdf2image import generators -from DocParser.vrdu.block import Block -from DocParser.vrdu.config import config +from vrdu.block import Block +from vrdu.config import config def export_to_json(data: Union[Dict, List], file_path: str) -> None: diff --git a/scripts/app.py b/scripts/app.py index 54b4a1c..549d682 100644 --- a/scripts/app.py +++ b/scripts/app.py @@ -3,8 +3,8 @@ import glob from PIL import Image, ImageDraw -from DocParser.vrdu import utils -from DocParser.vrdu.config import config +from vrdu import utils +from vrdu.config import config pn.extension() diff --git a/scripts/arxiv_download.py b/scripts/arxiv_download.py index 971f779..c7c9e10 100644 --- a/scripts/arxiv_download.py +++ b/scripts/arxiv_download.py @@ -5,7 +5,7 @@ import tarfile -from DocParser.vrdu import logger +from vrdu import logger log = logger.setup_app_level_logger(logger_name="arxiv_download.log") diff --git a/scripts/batch_process.py b/scripts/batch_process.py index 78dbe8d..e357ac4 100644 --- a/scripts/batch_process.py +++ b/scripts/batch_process.py @@ -5,8 +5,8 @@ from typing import List import pandas as pd -from DocParser.vrdu import logger -from DocParser.main import process_one_file +from vrdu import logger +from main import process_one_file log = logger.setup_app_level_logger(file_name="batch_process.log", level="INFO") diff --git a/scripts/export_to_dataset.py b/scripts/export_to_dataset.py index f8c41d8..fafb3d2 100644 --- a/scripts/export_to_dataset.py +++ b/scripts/export_to_dataset.py @@ -6,7 +6,7 @@ import pandas as pd import multiprocessing -from DocParser.vrdu import logger +from vrdu import logger log = logger.setup_app_level_logger(file_name="export_to_dataset.log") diff --git a/scripts/generate_reading_annotation.py b/scripts/generate_reading_annotation.py index f098d64..a4104b7 100644 --- a/scripts/generate_reading_annotation.py +++ b/scripts/generate_reading_annotation.py @@ -4,8 +4,8 @@ import os from pathlib import Path -from DocParser.vrdu import utils -from DocParser.vrdu import logger +from vrdu import utils +from vrdu import logger log = logger.setup_app_level_logger(file_name="generate_reading_annotation.log") diff --git a/scripts/retrieve_metadata.py b/scripts/retrieve_metadata.py index 6897c67..bf97df2 100644 --- a/scripts/retrieve_metadata.py +++ b/scripts/retrieve_metadata.py @@ -6,8 +6,8 @@ import argparse -from DocParser.vrdu import utils -from DocParser.vrdu import logger +from vrdu import utils +from vrdu import logger log = logger.setup_app_level_logger(file_name="retrieve_metadata.log") diff --git 
a/scripts/visualize_order_annotations.py b/scripts/visualize_order_annotations.py index b59b365..2f5bc5b 100644 --- a/scripts/visualize_order_annotations.py +++ b/scripts/visualize_order_annotations.py @@ -7,7 +7,7 @@ from PIL import Image, ImageDraw from matplotlib import pyplot as plt -from DocParser.vrdu import utils +from vrdu import utils def draw_arrow_line( diff --git a/setup.py b/setup.py index ad473aa..ba3749e 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages setup( - name="vrdu_data_process", + name="DocParser", version="1.0.0", description="process the academic papers with .tex source files", author="Mao Song", diff --git a/tests/test_add_definitions.py b/tests/test_add_definitions.py index 096ca65..f3ca221 100644 --- a/tests/test_add_definitions.py +++ b/tests/test_add_definitions.py @@ -1,7 +1,7 @@ import unittest import unittest.mock -from DocParser.vrdu.renderer import Renderer +from vrdu.renderer import Renderer def test_add_color_definition1(): diff --git a/tests/test_extract_graphics.py b/tests/test_extract_graphics.py index 14a2cd5..8335db3 100644 --- a/tests/test_extract_graphics.py +++ b/tests/test_extract_graphics.py @@ -2,7 +2,7 @@ import unittest.mock -from DocParser.vrdu.renderer import Renderer +from vrdu.renderer import Renderer class TestGraphics(unittest.TestCase): diff --git a/tests/test_is_text_eq.py b/tests/test_is_text_eq.py index 3baa280..6426411 100644 --- a/tests/test_is_text_eq.py +++ b/tests/test_is_text_eq.py @@ -1,6 +1,6 @@ import unittest -from DocParser.vrdu.renderer import is_text_eq +from vrdu.renderer import is_text_eq class TestTextEq(unittest.TestCase): diff --git a/tests/test_remove_predefined_color.py b/tests/test_remove_predefined_color.py index 356f378..fdc1b34 100644 --- a/tests/test_remove_predefined_color.py +++ b/tests/test_remove_predefined_color.py @@ -2,7 +2,7 @@ import unittest.mock -from DocParser.vrdu.renderer import Renderer +from vrdu.renderer import Renderer class TestHyperref(unittest.TestCase): diff --git a/tests/test_render_abstract.py b/tests/test_render_abstract.py index 405f6da..16f2cb9 100644 --- a/tests/test_render_abstract.py +++ b/tests/test_render_abstract.py @@ -2,7 +2,7 @@ import unittest.mock -from DocParser.vrdu.renderer import Renderer +from vrdu.renderer import Renderer class TestAbstract(unittest.TestCase): diff --git a/tests/test_render_algorithm.py b/tests/test_render_algorithm.py index a4cf6ad..c15821e 100644 --- a/tests/test_render_algorithm.py +++ b/tests/test_render_algorithm.py @@ -2,7 +2,7 @@ import unittest.mock -from DocParser.vrdu.renderer import Renderer +from vrdu.renderer import Renderer class TestAlgorithm(unittest.TestCase): diff --git a/tests/test_render_caption.py b/tests/test_render_caption.py index b526f60..eb21de8 100644 --- a/tests/test_render_caption.py +++ b/tests/test_render_caption.py @@ -2,7 +2,7 @@ import unittest.mock -from DocParser.vrdu.renderer import Renderer +from vrdu.renderer import Renderer class TestCaption(unittest.TestCase): diff --git a/tests/test_render_code.py b/tests/test_render_code.py index 55082de..79dae23 100644 --- a/tests/test_render_code.py +++ b/tests/test_render_code.py @@ -2,7 +2,7 @@ import unittest.mock -from DocParser.vrdu.renderer import Renderer +from vrdu.renderer import Renderer class TestCode(unittest.TestCase): @@ -71,7 +71,7 @@ def test_no_lstset(self): new=unittest.mock.mock_open(read_data=self.mock_file_content1), create=True, ) as file_mock: - self.renderer.remove_lstlisting_color(file_mock) + 
self.renderer.remove_predefined_color(file_mock) file_mock.assert_called_with(file_mock, "w") file_mock().write.assert_called_with( """\\documentclass{article}\\begin{document}\\end{document}""" ) @@ -83,7 +83,7 @@ def test_remove_lstset(self): new=unittest.mock.mock_open(read_data=self.mock_file_content5), create=True, ) as file_mock: - self.renderer.remove_lstlisting_color(file_mock) + self.renderer.remove_predefined_color(file_mock) file_mock.assert_called_with(file_mock, "w") file_mock().write.assert_called_with( r"""\documentclass{article}\n\usepackage{listings}\n\usepackage{xcolor}\n\n\definecolor{codegreen}{rgb}{0,0.6,0}\n\definecolor{codegray}{rgb}{0.5,0.5,0.5}\n\definecolor{codepurple}{rgb}{0.58,0,0.82}\n\definecolor{backcolour}{rgb}{0.95,0.95,0.92}\n\n\lstdefinestyle{mystyle}{\n backgroundcolor=\color{backcolour}, \n commentstyle=\color{codegreen},\n keywordstyle=\color{magenta},\n numberstyle=\tiny\color{codegray},\n stringstyle=\color{codepurple},\n basicstyle=\ttfamily\footnotesize,\n breakatwhitespace=false, \n breaklines=true, \n captionpos=b, \n keepspaces=true, \n numbers=left, \n numbersep=5pt, \n showspaces=false, \n showstringspaces=false,\n showtabs=false, \n tabsize=2\n}\n\n\n\n\begin{document}\nThe next code will be directly imported from a file\n\n\lstinputlisting[language=Octave]{BitXorMatrix.m}\n\end{document}""" diff --git a/tests/test_render_footnote.py b/tests/test_render_footnote.py index e81e0fd..e0fcebd 100644 --- a/tests/test_render_footnote.py +++ b/tests/test_render_footnote.py @@ -2,7 +2,7 @@ import unittest.mock -from DocParser.vrdu.renderer import Renderer +from vrdu.renderer import Renderer class TestFootnote(unittest.TestCase): diff --git a/tests/test_render_tabular.py b/tests/test_render_tabular.py index e57f363..7cb1e52 100644 --- a/tests/test_render_tabular.py +++ b/tests/test_render_tabular.py @@ -2,7 +2,7 @@ import unittest.mock -from DocParser.vrdu.renderer import Renderer +from vrdu.renderer import Renderer class TestTabular(unittest.TestCase): diff --git a/tests/test_render_title.py b/tests/test_render_title.py index 122063b..343714e 100644 --- a/tests/test_render_title.py +++ b/tests/test_render_title.py @@ -2,7 +2,7 @@ import unittest.mock -from DocParser.vrdu.renderer import Renderer +from vrdu.renderer import Renderer class TestTitle(unittest.TestCase): From 7ac0c6cc4305ec37a99f1ed4e1b530a49da05167 Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Wed, 17 Jul 2024 15:34:06 +0800 Subject: [PATCH 36/39] fix(all): use absolute import The previous version mixed relative and absolute imports, which led to import errors. This version uses absolute imports consistently to resolve the problem.
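To make the failure mode concrete, here is a sketch of the two import styles this patch reconciles (the module names are from this repository; whether the bare form resolves depends on sys.path, which is exactly the inconsistency being removed):

    # Before: bare imports rooted at the subpackage. These resolve only when
    # the directory containing vrdu/ (i.e. DocParser/) happens to be on
    # sys.path, e.g. when scripts are launched from inside the repository.
    from vrdu import utils
    from vrdu.config import config

    # After: imports rooted at the installed top-level package, which resolve
    # wherever the DocParser package is installed, regardless of the working
    # directory the entry point is launched from.
    from DocParser.vrdu import utils
    from DocParser.vrdu.config import config

Mixing the two styles can also load the same module twice under different names in sys.modules, so keeping one form everywhere is the safer choice.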
--- DocParser/TexSoup/app/conversion.py | 8 ++++---- DocParser/main.py | 27 +++++++++++++------------- DocParser/vrdu/layout_annotation.py | 8 ++++---- DocParser/vrdu/order_annotation.py | 8 ++++---- DocParser/vrdu/preprocess.py | 9 ++++----- DocParser/vrdu/quality_check.py | 6 +++--- DocParser/vrdu/renderer.py | 10 +++++----- DocParser/vrdu/utils.py | 4 ++-- scripts/app.py | 4 ++-- scripts/arxiv_download.py | 2 +- scripts/batch_process.py | 4 ++-- scripts/export_to_dataset.py | 2 +- scripts/generate_reading_annotation.py | 4 ++-- scripts/retrieve_metadata.py | 4 ++-- scripts/visualize_order_annotations.py | 2 +- tests/test_add_definitions.py | 2 +- tests/test_extract_graphics.py | 2 +- tests/test_is_text_eq.py | 2 +- tests/test_remove_predefined_color.py | 2 +- tests/test_render_abstract.py | 2 +- tests/test_render_algorithm.py | 2 +- tests/test_render_caption.py | 2 +- tests/test_render_code.py | 2 +- tests/test_render_footnote.py | 2 +- tests/test_render_tabular.py | 2 +- tests/test_render_title.py | 2 +- 26 files changed, 62 insertions(+), 62 deletions(-) diff --git a/DocParser/TexSoup/app/conversion.py b/DocParser/TexSoup/app/conversion.py index 3ffe746..474c228 100644 --- a/DocParser/TexSoup/app/conversion.py +++ b/DocParser/TexSoup/app/conversion.py @@ -1,11 +1,11 @@ import re -from TexSoup.TexSoup import TexSoup -from TexSoup.TexSoup.data import TexEnv, TexText, TexCmd, TexGroup +from DocParser.TexSoup.TexSoup import TexSoup +from DocParser.TexSoup.TexSoup.data import TexEnv, TexText, TexCmd, TexGroup -from vrdu import logger -from vrdu.config import envs +from DocParser.vrdu import logger +from DocParser.vrdu.config import envs log = logger.get_logger(__name__) diff --git a/DocParser/main.py b/DocParser/main.py index 3457e0c..2fffebf 100644 --- a/DocParser/main.py +++ b/DocParser/main.py @@ -5,14 +5,14 @@ from tqdm import tqdm -from vrdu import logger -from vrdu import utils -from vrdu import renderer -from vrdu import preprocess -from vrdu import layout_annotation as layout -from vrdu import order_annotation as order -from vrdu.config import config -from vrdu.quality_check import generate_quality_report +from DocParser.vrdu import logger +from DocParser.vrdu import utils +from DocParser.vrdu import renderer +from DocParser.vrdu import preprocess +from DocParser.vrdu import layout_annotation as layout +from DocParser.vrdu import order_annotation as order +from DocParser.vrdu.config import config +from DocParser.vrdu.quality_check import generate_quality_report log = logger.setup_app_level_logger(file_name="vrdu_debug.log") @@ -140,11 +140,12 @@ def process_one_file(file_name: str) -> None: log.info(f"[VRDU] file: {original_tex}, successfully processed.") except Exception as e: - error_type = e.__class__.__name__ - error_info = str(e) - log.error( - f"[VRDU] file: {file_name}, type: {error_type}, message: {error_info}" - ) + # error_type = e.__class__.__name__ + # error_info = str(e) + # log.error( + # f"[VRDU] file: {file_name}, type: {error_type}, message: {error_info}" + # ) + raise e finally: # remove redundant files diff --git a/DocParser/vrdu/layout_annotation.py b/DocParser/vrdu/layout_annotation.py index b886a2a..c7083c6 100644 --- a/DocParser/vrdu/layout_annotation.py +++ b/DocParser/vrdu/layout_annotation.py @@ -12,12 +12,12 @@ from pdfminer.high_level import extract_pages from pdfminer.layout import LTFigure, LTPage -from vrdu import utils -from vrdu.block import Block, BoundingBox -from vrdu.config import config, envs -from vrdu import logger +from DocParser.vrdu import 
utils +from DocParser.vrdu.block import Block, BoundingBox +from DocParser.vrdu.config import config, envs +from DocParser.vrdu import logger log = logger.get_logger(__name__) diff --git a/DocParser/vrdu/order_annotation.py b/DocParser/vrdu/order_annotation.py index 868873c..5d1e50e 100644 --- a/DocParser/vrdu/order_annotation.py +++ b/DocParser/vrdu/order_annotation.py @@ -2,10 +2,10 @@ import os from uuid import uuid4 -from vrdu.block import Block -from vrdu.config import config -from vrdu import utils -from vrdu import logger +from DocParser.vrdu.block import Block +from DocParser.vrdu.config import config +from DocParser.vrdu import utils +from DocParser.vrdu import logger log = logger.get_logger(__name__) diff --git a/DocParser/vrdu/preprocess.py b/DocParser/vrdu/preprocess.py index f4f4003..99f969a 100644 --- a/DocParser/vrdu/preprocess.py +++ b/DocParser/vrdu/preprocess.py @@ -1,11 +1,10 @@ import os import re -from arxiv_cleaner.cleaner import Cleaner - -from vrdu.config import envs, config -from vrdu import utils -import vrdu.logger as logger +from DocParser.arxiv_cleaner.cleaner import Cleaner +from DocParser.vrdu.config import envs, config +from DocParser.vrdu import utils +from DocParser.vrdu import logger log = logger.get_logger(__name__) diff --git a/DocParser/vrdu/quality_check.py b/DocParser/vrdu/quality_check.py index eee25f1..846ec8d 100644 --- a/DocParser/vrdu/quality_check.py +++ b/DocParser/vrdu/quality_check.py @@ -1,9 +1,9 @@ from typing import Dict, List import os -from vrdu.block import Block -from vrdu import utils -from vrdu.config import config +from DocParser.vrdu.block import Block +from DocParser.vrdu import utils +from DocParser.vrdu.config import config def generate_quality_report(main_directory: str) -> None: diff --git a/DocParser/vrdu/renderer.py b/DocParser/vrdu/renderer.py index d277ebd..db6164b 100644 --- a/DocParser/vrdu/renderer.py +++ b/DocParser/vrdu/renderer.py @@ -5,12 +5,12 @@ import re -import vrdu.utils as utils -import vrdu.logger as logger -from vrdu.config import config, envs +from DocParser.vrdu import utils +from DocParser.vrdu import logger +from DocParser.vrdu.config import config, envs -from TexSoup.TexSoup import TexSoup -import TexSoup.app.conversion as conversion +from DocParser.TexSoup.TexSoup import TexSoup +import DocParser.TexSoup.app.conversion as conversion log = logger.get_logger(__name__) diff --git a/DocParser/vrdu/utils.py b/DocParser/vrdu/utils.py index aecefe8..be6fa51 100755 --- a/DocParser/vrdu/utils.py +++ b/DocParser/vrdu/utils.py @@ -7,8 +7,8 @@ from pdf2image import pdf2image from pdf2image import generators -from vrdu.block import Block -from vrdu.config import config +from DocParser.vrdu.block import Block +from DocParser.vrdu.config import config def export_to_json(data: Union[Dict, List], file_path: str) -> None: diff --git a/scripts/app.py b/scripts/app.py index 549d682..54b4a1c 100644 --- a/scripts/app.py +++ b/scripts/app.py @@ -3,8 +3,8 @@ import glob from PIL import Image, ImageDraw -from vrdu import utils -from vrdu.config import config +from DocParser.vrdu import utils +from DocParser.vrdu.config import config pn.extension() diff --git a/scripts/arxiv_download.py b/scripts/arxiv_download.py index c7c9e10..971f779 100644 --- a/scripts/arxiv_download.py +++ b/scripts/arxiv_download.py @@ -5,7 +5,7 @@ import tarfile -from vrdu import logger +from DocParser.vrdu import logger log = logger.setup_app_level_logger(logger_name="arxiv_download.log") diff --git a/scripts/batch_process.py 
b/scripts/batch_process.py index e357ac4..78dbe8d 100644 --- a/scripts/batch_process.py +++ b/scripts/batch_process.py @@ -5,8 +5,8 @@ from typing import List import pandas as pd -from vrdu import logger -from main import process_one_file +from DocParser.vrdu import logger +from DocParser.main import process_one_file log = logger.setup_app_level_logger(file_name="batch_process.log", level="INFO") diff --git a/scripts/export_to_dataset.py b/scripts/export_to_dataset.py index fafb3d2..f8c41d8 100644 --- a/scripts/export_to_dataset.py +++ b/scripts/export_to_dataset.py @@ -6,7 +6,7 @@ import pandas as pd import multiprocessing -from vrdu import logger +from DocParser.vrdu import logger log = logger.setup_app_level_logger(file_name="export_to_dataset.log") diff --git a/scripts/generate_reading_annotation.py b/scripts/generate_reading_annotation.py index a4104b7..f098d64 100644 --- a/scripts/generate_reading_annotation.py +++ b/scripts/generate_reading_annotation.py @@ -4,8 +4,8 @@ import os from pathlib import Path -from vrdu import utils -from vrdu import logger +from DocParser.vrdu import utils +from DocParser.vrdu import logger log = logger.setup_app_level_logger(file_name="generate_reading_annotation.log") diff --git a/scripts/retrieve_metadata.py b/scripts/retrieve_metadata.py index bf97df2..6897c67 100644 --- a/scripts/retrieve_metadata.py +++ b/scripts/retrieve_metadata.py @@ -6,8 +6,8 @@ import argparse -from vrdu import utils -from vrdu import logger +from DocParser.vrdu import utils +from DocParser.vrdu import logger log = logger.setup_app_level_logger(file_name="retrieve_metadata.log") diff --git a/scripts/visualize_order_annotations.py b/scripts/visualize_order_annotations.py index 2f5bc5b..b59b365 100644 --- a/scripts/visualize_order_annotations.py +++ b/scripts/visualize_order_annotations.py @@ -7,7 +7,7 @@ from PIL import Image, ImageDraw from matplotlib import pyplot as plt -from vrdu import utils +from DocParser.vrdu import utils def draw_arrow_line( diff --git a/tests/test_add_definitions.py b/tests/test_add_definitions.py index f3ca221..096ca65 100644 --- a/tests/test_add_definitions.py +++ b/tests/test_add_definitions.py @@ -1,7 +1,7 @@ import unittest import unittest.mock -from vrdu.renderer import Renderer +from DocParser.vrdu.renderer import Renderer def test_add_color_definition1(): diff --git a/tests/test_extract_graphics.py b/tests/test_extract_graphics.py index 8335db3..14a2cd5 100644 --- a/tests/test_extract_graphics.py +++ b/tests/test_extract_graphics.py @@ -2,7 +2,7 @@ import unittest.mock -from vrdu.renderer import Renderer +from DocParser.vrdu.renderer import Renderer class TestGraphics(unittest.TestCase): diff --git a/tests/test_is_text_eq.py b/tests/test_is_text_eq.py index 6426411..3baa280 100644 --- a/tests/test_is_text_eq.py +++ b/tests/test_is_text_eq.py @@ -1,6 +1,6 @@ import unittest -from vrdu.renderer import is_text_eq +from DocParser.vrdu.renderer import is_text_eq class TestTextEq(unittest.TestCase): diff --git a/tests/test_remove_predefined_color.py b/tests/test_remove_predefined_color.py index fdc1b34..356f378 100644 --- a/tests/test_remove_predefined_color.py +++ b/tests/test_remove_predefined_color.py @@ -2,7 +2,7 @@ import unittest.mock -from vrdu.renderer import Renderer +from DocParser.vrdu.renderer import Renderer class TestHyperref(unittest.TestCase): diff --git a/tests/test_render_abstract.py b/tests/test_render_abstract.py index 16f2cb9..405f6da 100644 --- a/tests/test_render_abstract.py +++ b/tests/test_render_abstract.py @@ -2,7 +2,7 @@ 
import unittest.mock -from vrdu.renderer import Renderer +from DocParser.vrdu.renderer import Renderer class TestAbstract(unittest.TestCase): diff --git a/tests/test_render_algorithm.py b/tests/test_render_algorithm.py index c15821e..a4cf6ad 100644 --- a/tests/test_render_algorithm.py +++ b/tests/test_render_algorithm.py @@ -2,7 +2,7 @@ import unittest.mock -from vrdu.renderer import Renderer +from DocParser.vrdu.renderer import Renderer class TestAlgorithm(unittest.TestCase): diff --git a/tests/test_render_caption.py b/tests/test_render_caption.py index eb21de8..b526f60 100644 --- a/tests/test_render_caption.py +++ b/tests/test_render_caption.py @@ -2,7 +2,7 @@ import unittest.mock -from vrdu.renderer import Renderer +from DocParser.vrdu.renderer import Renderer class TestCaption(unittest.TestCase): diff --git a/tests/test_render_code.py b/tests/test_render_code.py index 79dae23..c71bd27 100644 --- a/tests/test_render_code.py +++ b/tests/test_render_code.py @@ -2,7 +2,7 @@ import unittest.mock -from vrdu.renderer import Renderer +from DocParser.vrdu.renderer import Renderer class TestCode(unittest.TestCase): diff --git a/tests/test_render_footnote.py b/tests/test_render_footnote.py index e0fcebd..e81e0fd 100644 --- a/tests/test_render_footnote.py +++ b/tests/test_render_footnote.py @@ -2,7 +2,7 @@ import unittest.mock -from vrdu.renderer import Renderer +from DocParser.vrdu.renderer import Renderer class TestFootnote(unittest.TestCase): diff --git a/tests/test_render_tabular.py b/tests/test_render_tabular.py index 7cb1e52..e57f363 100644 --- a/tests/test_render_tabular.py +++ b/tests/test_render_tabular.py @@ -2,7 +2,7 @@ import unittest.mock -from vrdu.renderer import Renderer +from DocParser.vrdu.renderer import Renderer class TestTabular(unittest.TestCase): diff --git a/tests/test_render_title.py b/tests/test_render_title.py index 343714e..122063b 100644 --- a/tests/test_render_title.py +++ b/tests/test_render_title.py @@ -2,7 +2,7 @@ import unittest.mock -from vrdu.renderer import Renderer +from DocParser.vrdu.renderer import Renderer class TestTitle(unittest.TestCase): From a12da9fcf4077d0537f5a39d5749b0d47385ed8f Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Wed, 17 Jul 2024 18:13:50 +0800 Subject: [PATCH 37/39] refactor(all): extract logger as a separate module --- DocParser/TexSoup/app/conversion.py | 2 +- DocParser/logger/__init__.py | 0 DocParser/{vrdu => logger}/logger.py | 0 DocParser/main.py | 2 +- DocParser/vrdu/layout_annotation.py | 2 +- DocParser/vrdu/order_annotation.py | 2 +- DocParser/vrdu/preprocess.py | 2 +- DocParser/vrdu/renderer.py | 2 +- scripts/arxiv_download.py | 2 +- scripts/batch_process.py | 2 +- scripts/export_to_dataset.py | 2 +- scripts/generate_reading_annotation.py | 2 +- scripts/retrieve_metadata.py | 2 +- 13 files changed, 11 insertions(+), 11 deletions(-) create mode 100644 DocParser/logger/__init__.py rename DocParser/{vrdu => logger}/logger.py (100%) diff --git a/DocParser/TexSoup/app/conversion.py b/DocParser/TexSoup/app/conversion.py index 474c228..c95cf2b 100644 --- a/DocParser/TexSoup/app/conversion.py +++ b/DocParser/TexSoup/app/conversion.py @@ -4,7 +4,7 @@ from DocParser.TexSoup.TexSoup.data import TexEnv, TexText, TexCmd, TexGroup -from DocParser.vrdu import logger +from DocParser.logger import logger from DocParser.vrdu.config import envs log = logger.get_logger(__name__) diff --git a/DocParser/logger/__init__.py b/DocParser/logger/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/DocParser/vrdu/logger.py 
b/DocParser/logger/logger.py similarity index 100% rename from DocParser/vrdu/logger.py rename to DocParser/logger/logger.py diff --git a/DocParser/main.py b/DocParser/main.py index 2fffebf..8a5881c 100644 --- a/DocParser/main.py +++ b/DocParser/main.py @@ -5,7 +5,7 @@ from tqdm import tqdm -from DocParser.vrdu import logger +from DocParser.logger import logger from DocParser.vrdu import utils from DocParser.vrdu import renderer from DocParser.vrdu import preprocess diff --git a/DocParser/vrdu/layout_annotation.py b/DocParser/vrdu/layout_annotation.py index c7083c6..1e70f8d 100644 --- a/DocParser/vrdu/layout_annotation.py +++ b/DocParser/vrdu/layout_annotation.py @@ -17,7 +17,7 @@ from DocParser.vrdu import utils from DocParser.vrdu.block import Block, BoundingBox from DocParser.vrdu.config import config, envs -from DocParser.vrdu import logger +from DocParser.logger import logger log = logger.get_logger(__name__) diff --git a/DocParser/vrdu/order_annotation.py b/DocParser/vrdu/order_annotation.py index 5d1e50e..e1dd5ee 100644 --- a/DocParser/vrdu/order_annotation.py +++ b/DocParser/vrdu/order_annotation.py @@ -5,7 +5,7 @@ from DocParser.vrdu.block import Block from DocParser.vrdu.config import config from DocParser.vrdu import utils -from DocParser.vrdu import logger +from DocParser.logger import logger log = logger.get_logger(__name__) diff --git a/DocParser/vrdu/preprocess.py b/DocParser/vrdu/preprocess.py index 99f969a..96f2f95 100644 --- a/DocParser/vrdu/preprocess.py +++ b/DocParser/vrdu/preprocess.py @@ -4,7 +4,7 @@ from DocParser.arxiv_cleaner.cleaner import Cleaner from DocParser.vrdu.config import envs, config from DocParser.vrdu import utils -from DocParser.vrdu import logger +from DocParser.logger import logger log = logger.get_logger(__name__) diff --git a/DocParser/vrdu/renderer.py b/DocParser/vrdu/renderer.py index db6164b..19e2b32 100644 --- a/DocParser/vrdu/renderer.py +++ b/DocParser/vrdu/renderer.py @@ -6,7 +6,7 @@ from DocParser.vrdu import utils -from DocParser.vrdu import logger +from DocParser.logger import logger from DocParser.vrdu.config import config, envs from DocParser.TexSoup.TexSoup import TexSoup diff --git a/scripts/arxiv_download.py b/scripts/arxiv_download.py index 971f779..d979787 100644 --- a/scripts/arxiv_download.py +++ b/scripts/arxiv_download.py @@ -5,7 +5,7 @@ import tarfile -from DocParser.vrdu import logger +from DocParser.logger import logger log = logger.setup_app_level_logger(logger_name="arxiv_download.log") diff --git a/scripts/batch_process.py b/scripts/batch_process.py index 78dbe8d..6499751 100644 --- a/scripts/batch_process.py +++ b/scripts/batch_process.py @@ -5,7 +5,7 @@ from typing import List import pandas as pd -from DocParser.vrdu import logger +from DocParser.logger import logger from DocParser.main import process_one_file log = logger.setup_app_level_logger(file_name="batch_process.log", level="INFO") diff --git a/scripts/export_to_dataset.py b/scripts/export_to_dataset.py index f8c41d8..f99afbc 100644 --- a/scripts/export_to_dataset.py +++ b/scripts/export_to_dataset.py @@ -6,7 +6,7 @@ import pandas as pd import multiprocessing -from DocParser.vrdu import logger +from DocParser.logger import logger log = logger.setup_app_level_logger(file_name="export_to_dataset.log") diff --git a/scripts/generate_reading_annotation.py b/scripts/generate_reading_annotation.py index f098d64..314bdb9 100644 --- a/scripts/generate_reading_annotation.py +++ b/scripts/generate_reading_annotation.py @@ -5,7 +5,7 @@ from pathlib import Path from 
DocParser.vrdu import utils -from DocParser.vrdu import logger +from DocParser.logger import logger log = logger.setup_app_level_logger(file_name="generate_reading_annotation.log") diff --git a/scripts/retrieve_metadata.py b/scripts/retrieve_metadata.py index 6897c67..8cc28f5 100644 --- a/scripts/retrieve_metadata.py +++ b/scripts/retrieve_metadata.py @@ -7,7 +7,7 @@ from DocParser.vrdu import utils -from DocParser.vrdu import logger +from DocParser.logger import logger log = logger.setup_app_level_logger(file_name="retrieve_metadata.log") From e4c3713b4124fd7962c750845f53b51c7e982294 Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Thu, 18 Jul 2024 10:53:34 +0800 Subject: [PATCH 38/39] fix(arxiv_cleaner): import path error use relative imports --- DocParser/arxiv_cleaner/cleaner.py | 6 +++--- DocParser/arxiv_cleaner/latex.py | 4 ++-- DocParser/arxiv_cleaner/main.py | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/DocParser/arxiv_cleaner/cleaner.py b/DocParser/arxiv_cleaner/cleaner.py index 4d5aa39..c0c9209 100644 --- a/DocParser/arxiv_cleaner/cleaner.py +++ b/DocParser/arxiv_cleaner/cleaner.py @@ -1,9 +1,9 @@ -from arxiv_cleaner.file_utils import ( +from .file_utils import ( build_relative_path, combine_paths, copy_files, create_temp_dir, does_file_exist, find_files, remove_temp_dir, remove_unnecessary_blank_lines) -from arxiv_cleaner.latex import LatexRunner -from arxiv_cleaner.logger import Logger +from .latex import LatexRunner +from .logger import Logger class Cleaner: diff --git a/DocParser/arxiv_cleaner/latex.py b/DocParser/arxiv_cleaner/latex.py index 2a2d264..f609550 100644 --- a/DocParser/arxiv_cleaner/latex.py +++ b/DocParser/arxiv_cleaner/latex.py @@ -1,8 +1,8 @@ import re import subprocess -from arxiv_cleaner.cli import run_command, check_command_results -from arxiv_cleaner.file_utils import ( +from .cli import run_command, check_command_results +from .file_utils import ( build_relative_path, change_extension, combine_paths, diff --git a/DocParser/arxiv_cleaner/main.py b/DocParser/arxiv_cleaner/main.py index f2f0182..14a1ba0 100644 --- a/DocParser/arxiv_cleaner/main.py +++ b/DocParser/arxiv_cleaner/main.py @@ -1,5 +1,5 @@ -from arxiv_cleaner.arguments import parse_args -from arxiv_cleaner.cleaner import Cleaner +from arguments import parse_args +from cleaner import Cleaner def main(): From 4c92883de757f4eecbfd45d86fdf68c8880488db Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Mon, 13 Jan 2025 18:29:43 +0800 Subject: [PATCH 39/39] refactor(all): refactor the whole projects for open-source --- DocParser/main.py | 113 ++-- DocParser/vrdu/block.py | 122 ++-- DocParser/vrdu/compile_latex.sh | 0 DocParser/vrdu/layout_annotation.py | 686 ++++++++++----------- DocParser/vrdu/order_annotation.py | 460 +++++++------- DocParser/vrdu/preprocess.py | 230 ++++--- DocParser/vrdu/quality_check.py | 216 ++++--- DocParser/vrdu/renderer.py | 904 ++++++++++------------------ DocParser/vrdu/utils.py | 427 ++++++++----- README.md | 327 +++++----- dataset_readme.md | 98 +++ scripts/app.py | 120 ---- scripts/arxiv_download.py | 32 +- setup.py | 10 +- 14 files changed, 1790 insertions(+), 1955 deletions(-) mode change 100755 => 100644 DocParser/vrdu/compile_latex.sh mode change 100755 => 100644 DocParser/vrdu/utils.py create mode 100644 dataset_readme.md delete mode 100644 scripts/app.py diff --git a/DocParser/main.py b/DocParser/main.py index 8a5881c..59ba33f 100644 --- a/DocParser/main.py +++ b/DocParser/main.py @@ -2,10 +2,11 @@ import glob import os import shutil +from 
pathlib import Path +from typing import List from tqdm import tqdm +from loguru import logger - -from DocParser.logger import logger from DocParser.vrdu import utils from DocParser.vrdu import renderer from DocParser.vrdu import preprocess @@ -14,65 +15,76 @@ from DocParser.vrdu.config import config from DocParser.vrdu.quality_check import generate_quality_report -log = logger.setup_app_level_logger(file_name="vrdu_debug.log") +logger.add("vrdu_debug.log", mode="w") -def transform_tex_to_images(main_directory: str) -> None: +def transform_tex_to_images(main_directory: Path) -> None: """ Transforms TeX files with pattern paper_*.tex in the specified directory into jpg images. Args: - main_directory (str): The main directory where the TeX files are located. + main_directory (Path): The main directory where the TeX files are located. Returns: None """ tex_files = glob.glob(f"{main_directory}/paper_*.tex") - output_directory = os.path.join(main_directory, "output") - for tex_file in tqdm(tex_files): - log.debug(f"[VRDU] file: {tex_file}, start transforming into images.") - utils.compile_latex(tex_file) + output_directory = Path(main_directory) / "output" + for tex_file in tqdm(tex_files, desc="Converting TeX files to images"): + logger.debug(f"[VRDU] file: {tex_file}, start transforming into images.") + # Set colored flag based on filename + colored = "paper_colored.tex" in tex_file + utils.compile_latex(tex_file, colored=colored) # get the pdf file name - filename_without_extension = os.path.splitext(os.path.basename(tex_file))[0] - pdf_file = os.path.join(main_directory, f"{filename_without_extension}.pdf") + filename_without_extension = Path(tex_file).stem + pdf_file = Path(main_directory) / f"{filename_without_extension}.pdf" # convert into images - image_directory = os.path.join(output_directory, filename_without_extension) - os.makedirs(image_directory) - utils.pdf2jpg(pdf_file, image_directory) + image_directory = output_directory / filename_without_extension + image_directory.mkdir(parents=True, exist_ok=True) + utils.pdf2jpg(str(pdf_file), str(image_directory)) + + +def get_redundant_folders(main_directory: Path) -> List[str]: + """Get list of redundant folders to remove.""" + pattern = f"{main_directory}/output/paper_{config.folder_prefix}*" + redundant_folders = glob.glob(pattern) + redundant_folders.extend( + [ + f"{main_directory}/output/paper_white", + f"{main_directory}/output/paper_original", + ] + ) + return redundant_folders -def remove_redundant_stuff(main_directory: str) -> None: +def remove_redundant_stuff(main_directory: Path) -> None: """ Remove redundant files and folders from the main directory. Args: - main_directory (str): The path of the main directory. + main_directory (Path): The path of the main directory. 
Returns: None """ # remove generated tex related files - redundant_files = glob.glob(f"{main_directory}/paper_*") - for file in redundant_files: + for file in glob.glob(f"{main_directory}/paper_*"): os.remove(file) # remove useless pdf and image files - # TODO: move this name pattern into config - redundant_folders = glob.glob( - f"{main_directory}/output/paper_{config.folder_prefix}*" - ) - redundant_folders += [ - f"{main_directory}/output/paper_white", - f"{main_directory}/output/paper_original", - ] - for folder in redundant_folders: + for folder in get_redundant_folders(main_directory): if os.path.exists(folder): shutil.rmtree(folder) -def process_one_file(file_name: str) -> None: +def check_if_already_processed(main_directory: Path) -> bool: + quality_report_file = main_directory / "output/result/quality_report.json" + return quality_report_file.exists() + + +def process_one_file(file_name: Path) -> None: """ Process a file through multiple steps including preprocessing, rendering, transforming into images, generating annotations, and handling exceptions. @@ -83,33 +95,32 @@ def process_one_file(file_name: str) -> None: Returns: None """ - main_directory = os.path.dirname(file_name) - log.info(f"[VRDU] file: {file_name}, start processing.") + main_directory = Path(file_name).parent + logger.info(f"[VRDU] file: {file_name}, start processing.") # check if this paper has been processed - quality_report_file = os.path.join( - main_directory, "output/result/quality_report.json" - ) - if os.path.exists(quality_report_file): - log.info(f"[VRDU] file: {file_name}, paper has been processed") + if check_if_already_processed(main_directory): + logger.info(f"[VRDU] file: {file_name}, paper has been processed") return # make a copy of the original tex file - original_tex = os.path.join(main_directory, "paper_original.tex") + original_tex = main_directory / "paper_original.tex" shutil.copyfile(file_name, original_tex) # remove the output folder if it exists - output_directory = os.path.join(main_directory, "output") - if os.path.exists(output_directory): + output_directory = main_directory / "output" + if output_directory.exists(): shutil.rmtree(output_directory) + # change the working directory to the main directory of the paper cwd = os.getcwd() try: # change the working directory to the main directory of the paper os.chdir(main_directory) - # create output folder - os.makedirs(os.path.join(main_directory, "output/result")) + # create output folder and output/result folder + result_dir = output_directory / "result" + result_dir.mkdir(parents=True) # step 1: preprocess the paper preprocess.run(original_tex) @@ -118,14 +129,14 @@ def process_one_file(file_name: str) -> None: vrdu_renderer = renderer.Renderer() vrdu_renderer.render(original_tex) - # step 2.2: compling tex into PDFs - log.info( + # step 2.2: compiling tex into PDFs + logger.info( f"[VRDU] file: {original_tex}, start transforming into images, this may take a while..." ) transform_tex_to_images(main_directory) # Step 3: generate annotations - log.info( + logger.info( f"[VRDU] file: {original_tex}, start generating annotations, this may take a while..." 
) vrdu_layout_annotation = layout.LayoutAnnotation(original_tex) @@ -137,12 +148,12 @@ def process_one_file(file_name: str) -> None: # generate quality report for simple debugging generate_quality_report(main_directory) - log.info(f"[VRDU] file: {original_tex}, successfully processed.") + logger.info(f"[VRDU] file: {original_tex}, successfully processed.") except Exception as e: # error_type = e.__class__.__name__ # error_info = str(e) - # log.error( + # logger.error( # f"[VRDU] file: {file_name}, type: {error_type}, message: {error_info}" # ) raise e @@ -180,18 +191,18 @@ def main() -> None: Returns: None """ - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser( + description="Process TeX files to generate annotations and images" + ) parser.add_argument( "-f", "--file_name", - type=str, + type=Path, required=True, - help="The name of the tex file will full path", + help="The path to the TeX file to process", ) args = parser.parse_args() - file_name = args.file_name - - process_one_file(file_name) + process_one_file(Path(args.file_name)) if __name__ == "__main__": diff --git a/DocParser/vrdu/block.py b/DocParser/vrdu/block.py index 90d9aaf..cd3e55a 100644 --- a/DocParser/vrdu/block.py +++ b/DocParser/vrdu/block.py @@ -1,16 +1,11 @@ from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple - -from typing import Any +from typing import Dict, List, Optional, Tuple, Any, cast @dataclass class BoundingBox: - """A simple bounding box representation. - The coordinates are in the form of (x0, y0, x1, y1) - The origin is in the top left and (x0, y0) is the top left corner, - (x1, y1) is the bottom right corner. - """ + """A bounding box representation with coordinates (x0,y0) for top-left and (x1,y1) for bottom-right. 
+ Origin is at top-left of the page.""" x0: float y0: float @@ -19,10 +14,12 @@ class BoundingBox: @property def width(self) -> float: + """Width of bounding box""" return self.x1 - self.x0 @property def height(self) -> float: + """Height of bounding box""" return self.y1 - self.y0 def __len__(self) -> int: @@ -35,9 +32,11 @@ def __getitem__(self, index: int) -> float: return (self.x0, self.y0, self.x1, self.y1)[index] def area(self) -> float: - return abs((self.x1 - self.x0) * (self.y1 - self.y0)) + """Calculate area of bounding box""" + return abs(self.width * self.height) - def overlap(self, other) -> float: + def overlap(self, other: "BoundingBox") -> float: + """Calculate overlap area with another bounding box""" if ( self.x0 > other.x1 or self.x1 < other.x0 @@ -45,27 +44,30 @@ def overlap(self, other) -> float: or self.y1 < other.y0 ): return 0.0 + x_overlap = max(0, min(self.x1, other.x1) - max(self.x0, other.x0)) y_overlap = max(0, min(self.y1, other.y1) - max(self.y0, other.y0)) return x_overlap * y_overlap - def to_dict(self) -> Dict[str, Any]: - return {"bbox": (self.x0, self.y0, self.x1, self.y1)} + def to_dict(self) -> Dict[str, Tuple[float, float, float, float]]: + """Convert to dictionary format""" + return {"bbox": self.to_tuple()} def to_tuple(self) -> Tuple[float, float, float, float]: + """Convert to tuple format""" return (self.x0, self.y0, self.x1, self.y1) @classmethod - def from_dict(cls, data: Dict[str, Any]): - return cls( - x0=data["bbox"][0], - y0=data["bbox"][1], - x1=data["bbox"][2], - y1=data["bbox"][3], - ) + def from_dict(cls, data: Dict[str, Any]) -> "BoundingBox": + """Create BoundingBox from dictionary""" + bbox = data["bbox"] + return cls(x0=bbox[0], y0=bbox[1], x1=bbox[2], y1=bbox[3]) @classmethod - def from_list(cls, data: List[Tuple[float, float, float, float, float, float]]): + def from_list( + cls, data: List[Tuple[float, float, float, float, float, float]] + ) -> "BoundingBox": + """Create bounding box that encompasses all points in list""" min_x = min(data, key=lambda x: x[1])[1] min_y = min(data, key=lambda x: x[0])[0] max_x = max(data, key=lambda x: x[4])[4] @@ -74,6 +76,8 @@ def from_list(cls, data: List[Tuple[float, float, float, float, float, float]]): class Block: + """Represents a block of content with position and metadata""" + current_id: int = 0 def __init__( @@ -89,27 +93,25 @@ def __init__( labels: Optional[List[str]] = None, references: Optional[List[str]] = None, ) -> None: - if not block_id: - self.id = Block.current_id + self.id = block_id if block_id is not None else Block.current_id + if block_id is None: Block.current_id += 1 - else: - self.id = block_id - - self._category = category - self._page_index = page_index - self._bounding_box = bounding_box - self._previous_block = previous_block - self._parent_block = parent_block - self._next_block = next_block - self._source_code = source_code - self._labels = labels - self._references = references + + self._category: Optional[int] = category + self._page_index: Optional[int] = page_index + self._bounding_box: BoundingBox = bounding_box + self._previous_block: Optional[int] = previous_block + self._parent_block: Optional[int] = parent_block + self._next_block: Optional[int] = next_block + self._source_code: Optional[str] = source_code + self._labels: Optional[List[str]] = labels or [] + self._references: Optional[List[str]] = references or [] def __repr__(self) -> str: return ( f"Block(id={self.id}, category={self.category}, " - f"page_index={self.page_index}, bbox={self.bbox}), " - 
f"source_code={self.source_code}" + f"page_index={self.page_index}, bbox={self.bbox}, " + f"source_code={self.source_code})" ) @property @@ -122,7 +124,7 @@ def bbox(self, value: BoundingBox) -> None: @property def labels(self) -> List[str]: - return self._labels + return cast(List[str], self._labels or []) @labels.setter def labels(self, value: List[str]) -> None: @@ -130,7 +132,7 @@ def labels(self, value: List[str]) -> None: @property def references(self) -> List[str]: - return self._references + return cast(List[str], self._references or []) @references.setter def references(self, value: List[str]) -> None: @@ -142,7 +144,7 @@ def block_id(self) -> int: @property def category(self) -> int: - return self._category + return cast(int, self._category or 0) @category.setter def category(self, value: int) -> None: @@ -150,7 +152,7 @@ def category(self, value: int) -> None: @property def page_index(self) -> int: - return self._page_index + return cast(int, self._page_index or 0) @page_index.setter def page_index(self, value: int) -> None: @@ -158,7 +160,7 @@ def page_index(self, value: int) -> None: @property def source_code(self) -> str: - return self._source_code + return cast(str, self._source_code or "") @source_code.setter def source_code(self, value: str) -> None: @@ -166,7 +168,7 @@ def source_code(self, value: str) -> None: @property def parent_block(self) -> int: - return self._parent_block + return cast(int, self._parent_block or -1) @parent_block.setter def parent_block(self, value: int) -> None: @@ -174,11 +176,11 @@ def parent_block(self, value: int) -> None: @property def previous_block(self) -> int: - return self._previous_block + return cast(int, self._previous_block or -1) @property def next_block(self) -> int: - return self._next_block + return cast(int, self._next_block or -1) @property def height(self) -> float: @@ -188,25 +190,25 @@ def height(self) -> float: def width(self) -> float: return self._bounding_box.width - def to_dict(self): - data = self._bounding_box.to_dict() - data.update( - { - "block_id": self.block_id, - "category": self.category, - "page_index": self.page_index, - "previous_block": self.previous_block, - "parent_block": self.parent_block, - "next_block": self.next_block, - "source_code": self.source_code, - "labels": self.labels, - "references": self.references, - } - ) + def to_dict(self) -> Dict[str, Any]: + """Convert block to dictionary format""" + data = { + "block_id": self.block_id, + "category": self.category, + "page_index": self.page_index, + "previous_block": self.previous_block, + "parent_block": self.parent_block, + "next_block": self.next_block, + "source_code": self.source_code, + "labels": self.labels, + "references": self.references, + "bbox": self._bounding_box.to_tuple(), + } return data @classmethod - def from_dict(cls, data: Dict[str, Any]): + def from_dict(cls, data: Dict[str, Any]) -> "Block": + """Create Block from dictionary""" return cls( block_id=data["block_id"], bounding_box=BoundingBox.from_dict(data), diff --git a/DocParser/vrdu/compile_latex.sh b/DocParser/vrdu/compile_latex.sh old mode 100755 new mode 100644 diff --git a/DocParser/vrdu/layout_annotation.py b/DocParser/vrdu/layout_annotation.py index 1e70f8d..72c1717 100644 --- a/DocParser/vrdu/layout_annotation.py +++ b/DocParser/vrdu/layout_annotation.py @@ -1,371 +1,367 @@ from collections import defaultdict -import os import glob import subprocess -from typing import Any, DefaultDict, Dict, List +from typing import Any, Dict, List, Tuple import matplotlib.pyplot as 
plt import numpy as np from skimage.measure import label, regionprops from PIL import Image, ImageDraw, ImageFont import re from tqdm import tqdm +from pathlib import Path from pdfminer.high_level import extract_pages from pdfminer.layout import LTFigure, LTPage - from DocParser.vrdu import utils from DocParser.vrdu.block import Block, BoundingBox from DocParser.vrdu.config import config, envs -from DocParser.logger import logger - -log = logger.get_logger(__name__) class LayoutAnnotation: - # https://www.overleaf.com/learn/latex/Lengths_in_LaTeX - ONE_INCH = 72.27 + """Class for extracting and annotating layout information from LaTeX documents.""" + + ONE_INCH = 72.27 # LaTeX point unit conversion - def __init__(self, tex_file: str) -> None: + def __init__(self, tex_file: Path) -> None: + """Initialize LayoutAnnotation with paths and load required data. + + Args: + tex_file: Path to the LaTeX source file + """ self.tex_file = tex_file - self.main_directory = os.path.dirname(tex_file) - self.output_directory = os.path.join(self.main_directory, "output") - self.result_directory = os.path.join(self.output_directory, "result") + self.main_directory = tex_file.parent + self.output_directory = self.main_directory / "output" + self.result_directory = self.output_directory / "result" + self.pdf_images_path = self.output_directory / "paper_colored" + + # Load text information + self.text_info = utils.load_json(self.result_directory / "texts.json") self.layout_metadata: Dict = {} - self.text_info = utils.load_json( - os.path.join(self.result_directory, "texts.json") - ) - self.pdf_images_path = os.path.join(self.output_directory, "paper_colored") def extract_pdf_layouts(self) -> List[LTPage]: - """Extracts layout information of each page from a rendered PDF. + """Extract layout information from rendered PDF. + + Returns: + List of page layout objects + """ + rendered_pdf = self.main_directory / "paper_colored.pdf" + return list(extract_pages(rendered_pdf)) - This method reads the rendered PDF file and extracts the layout information for each page. - The layout information includes the position, size, and other attributes of each element on the page. + def _parse_log_metadata(self, log_file: Path) -> Dict: + """Parse LaTeX log file for layout metadata. + + Args: + log_file: Path to LaTeX log file Returns: - List[LTPage]: A list of LTPage objects representing the layout of each page. - - Example: - >>> renderer = PDFRenderer() - >>> layouts = renderer.extract_pdf_layouts() - >>> for layout in layouts: - ... print(layout) - - - ... + Dictionary of extracted metadata values """ - rendered_pdf = os.path.join(self.main_directory, "paper_colored.pdf") - page_layouts = extract_pages(rendered_pdf) - return list(page_layouts) + metadata = {} + pattern = r"\[vrdu_data_process: The (.*) is: ([-+]?\d+\.\d+)pt\]" + + with open(log_file, "r", encoding="latin-1") as f: + for match in re.findall(pattern, f.read()): + key, value = match[0], float(match[1]) + metadata[key] = value + + # Calculate number of columns + textwidth = metadata["textwidth"] + columnsep = metadata["columnsep"] + columnwidth = metadata["columnwidth"] + metadata["num_columns"] = round( + (textwidth + columnsep) / (columnwidth + columnsep) + ) - def parse_metadata(self, pdf_layouts: List[LTPage]) -> None: - """Parse metadata from PDF layouts and store them in the layout_metadata attribute. + return metadata + + def _calculate_margins(self, metadata: Dict) -> Tuple[float, float]: + """Calculate page margins from metadata. 
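+
+        A sketch of the computation, following the standard LaTeX page
+        layout model (lengths are offsets from a driver origin one inch
+        inside the top-left paper corner)::
+
+            margin_width  = (1in + \hoffset) + \oddsidemargin
+            margin_height = (1in + \voffset) - (\topmargin - \headheight)
+                            + \headsep / 2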
Args: - - pdf_layouts (List[LTPage]): A list of LTPage objects representing the PDF layouts. + metadata: Dictionary of layout metadata Returns: - - None + Tuple of (margin_width, margin_height) + """ + margin_width = (self.ONE_INCH + metadata["hoffset"]) + metadata["oddsidemargin"] + + margin_height = ( + (self.ONE_INCH + metadata["voffset"]) + - (metadata["topmargin"] - metadata["headheight"]) + ) + metadata["headsep"] / 2 + + return margin_width, margin_height + + def parse_metadata(self, pdf_layouts: List[LTPage]) -> None: + """Parse and store layout metadata from PDF and log file. + + Args: + pdf_layouts: List of PDF page layouts """ pt2px = config.ppi / self.ONE_INCH + log_file = self.main_directory / "paper_colored.log" - layout_metadata = dict() - - # get metadata from log file - log_file = os.path.join(self.main_directory, "paper_colored.log") - # see renderer.py add_layout_definitions for details - regex_pattern = r"\[vrdu_data_process: The (.*) is: ([-+]?\d+\.\d+)pt\]" - - with open(log_file, "r", encoding="latin-1") as file: - log_content = file.read() - - for match in re.findall(regex_pattern, log_content): - key = match[0] - value = float(match[1]) - layout_metadata[key] = value - - textwidth = layout_metadata["textwidth"] - columnsep = layout_metadata["columnsep"] - columnwidth = layout_metadata["columnwidth"] - # textwidth = n * columnwidth + (n - 1) * columnsep - num_columns = round((textwidth + columnsep) / (columnwidth + columnsep)) - layout_metadata["num_columns"] = num_columns - - # https://www.overleaf.com/learn/latex/Page_size_and_margins - element1 = self.ONE_INCH + layout_metadata["hoffset"] - element2 = self.ONE_INCH + layout_metadata["voffset"] - element3 = layout_metadata["oddsidemargin"] - element4 = layout_metadata["topmargin"] - element5 = layout_metadata["headheight"] - element6 = layout_metadata["headsep"] - margin_width = element1 + element3 - margin_height = (element2 - (element4 - element5)) + element6 / 2 - layout_metadata["margin_width"] = margin_width - - # sort all images by page index, see utils.pdf2jpg for details - image_files = sorted( - glob.glob(f"{self.pdf_images_path}/*.jpg"), key=lambda x: x[-6:-4] - ) - for page_index, page_layout in enumerate(pdf_layouts): - layout_metadata[page_index] = {} - - pdf_width, pdf_height = page_layout.width, page_layout.height - layout_metadata[page_index]["pdf_width"] = pdf_width - layout_metadata[page_index]["pdf_height"] = pdf_height - - with Image.open(image_files[page_index]) as page_image: - image_width, image_height = page_image.size - layout_metadata[page_index]["image_width"] = image_width - layout_metadata[page_index]["image_height"] = image_height - - px2img = image_height / pdf_height - layout_metadata[page_index]["px2img"] = px2img - layout_metadata[page_index]["separations"] = [0] - - # x is initialize as left boundary of a column minus a half of column separation width - # this can make sure the separation is in the middle of two columns - x = margin_width - 0.5 * columnsep - for i in range(num_columns - 1): - separation = x + columnwidth + columnsep - layout_metadata[page_index]["separations"].append( - separation * pt2px * px2img - ) - x += separation - # TODO: consider the margin notes - layout_metadata[page_index]["separations"].append(pdf_width * px2img) - layout_metadata[page_index]["top_margin"] = margin_height + # Parse basic metadata + metadata = self._parse_log_metadata(log_file) + margin_width, margin_height = self._calculate_margins(metadata) + metadata["margin_width"] = margin_width - 
utils.export_to_json(
-            layout_metadata,
-            os.path.join(self.result_directory, "layout_metadata.json"),
+        # Get image files sorted by page number
+        image_files = sorted(
+            glob.glob(str(self.pdf_images_path / "*.jpg")), key=lambda x: x[-6:-4]
         )
-        self.layout_metadata = layout_metadata
 
+        # Process each page
+        for page_idx, page_layout in enumerate(pdf_layouts):
+            page_metadata = {
+                "pdf_width": page_layout.width,
+                "pdf_height": page_layout.height,
+                "top_margin": margin_height,
+            }
+
+            # Get image dimensions
+            with Image.open(image_files[page_idx]) as img:
+                page_metadata["image_width"], page_metadata["image_height"] = img.size
+
+            # Calculate scale factors
+            px2img = page_metadata["image_height"] / page_layout.height
+            page_metadata["px2img"] = px2img
+
+            # Calculate column separations; x starts half a column gap left of
+            # the first column's left edge so each separator falls mid-gutter
+            separations = [0]
+            x = margin_width - 0.5 * metadata["columnsep"]
+            for _ in range(metadata["num_columns"] - 1):
+                sep = x + metadata["columnwidth"] + metadata["columnsep"]
+                separations.append(sep * pt2px * px2img)
+                x = sep  # advance to this separator before computing the next
+            separations.append(page_layout.width * px2img)
+            page_metadata["separations"] = separations
+
+            metadata[page_idx] = page_metadata
+
+        self.layout_metadata = metadata
+        utils.export_to_json(metadata, self.result_directory / "layout_metadata.json")
 
     def retrieve_figure_source_code(
         self, figure_layout_info: Dict[int, List[Block]]
     ) -> None:
-        """Retrieves the source code of a figure using synctex.
+        """Retrieve LaTeX source code for figures using synctex.
 
         Args:
-            figure_layout_info (Dict[int, List[Block]]): A dictionary where the keys are page indices
-            and the values are lists of Block objects representing the bounding boxes of figures on each page.
-
-        Returns:
-            None
-
-        Note:
-            use `synctex help edit` to view usage of synctex
+            figure_layout_info: Dictionary mapping page numbers to figure blocks
         """
-        # paper_colored.tex is what we are working for
-        tex_filename = os.path.basename(self.tex_file).replace(
-            "paper_original", "paper_colored"
-        )
+        tex_filename = self.tex_file.name.replace("paper_original", "paper_colored")
         pdf_filename = tex_filename.replace(".tex", ".pdf")
-        with open(os.path.join(self.main_directory, tex_filename), "r") as file:
-            content_lines = file.readlines()
-        for page_index, blocks in figure_layout_info.items():
+        with open(self.main_directory / tex_filename, "r") as f:
+            content_lines = f.readlines()
+
+        for page_idx, blocks in figure_layout_info.items():
             for block in blocks:
-                bbox = block.bbox
-                center_x, center_y = (bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2
-                log.debug(
-                    f"page index: {page_index + 1}, center: ({center_x}, {center_y}), pdf filename: {pdf_filename}"
-                )
-                # use synctex to retrieve the line index corresponding to the center of the bounding box
+                # Get center point of figure
+                center_x = (block.bbox[0] + block.bbox[2]) / 2
+                center_y = (block.bbox[1] + block.bbox[3]) / 2
+
+                # Run synctex to get source line
                 result = subprocess.run(
                     [
                         "synctex",
                         "edit",
                         "-o",
-                        f"{page_index + 1}:{center_x:.2f}:{center_y:.2f}:{pdf_filename}",
+                        f"{page_idx + 1}:{center_x:.2f}:{center_y:.2f}:{pdf_filename}",
                         "-d",
-                        self.main_directory,
+                        str(self.main_directory),
                     ],
                     check=True,
                     capture_output=True,
                     text=True,
                 )
-                # parse the output of synctex to get the source code
-                line_index = result.stdout.split("\nLine:")[1].split("\n")[0]
-                block.source_code = content_lines[int(line_index) - 1]
-                log.debug(f"line index: {line_index}, source code: {block.source_code}")
+
+                # Extract line number and source
+                line_idx = 
int(result.stdout.split("\nLine:")[1].split("\n")[0]) + block.source_code = content_lines[line_idx - 1] def generate_figure_bb(self, pdf_layouts: List[LTPage]) -> Dict[int, List[Block]]: - """Generate bounding boxes for figures in a PDF layout using Pdfminer. + """Generate bounding boxes for figures in PDF layout. Args: - pdf_layouts (List[LTPage]): A list of LTPage objects representing the layout of a PDF. + pdf_layouts: List of PDF page layouts Returns: - Dict[int, List[Block]]: A dictionary where the keys are page indices and the values are lists of - Block objects representing the bounding boxes of figures on each page. + Dictionary mapping page numbers to figure blocks """ layout_info = defaultdict(list) - for page_index, page_layout in enumerate(pdf_layouts): - height = page_layout.height - for element in page_layout: + + for page_idx, page in enumerate(pdf_layouts): + for element in page: if not isinstance(element, LTFigure): continue - # the coordinate system of Pdfminer is in contrast to the coordinate system of the image - # by flipping the y axis - y0 = height - element.bbox[3] - y1 = height - element.bbox[1] - x0 = element.bbox[0] - x1 = element.bbox[2] - layout_info[page_index].append( + + # Convert coordinates (flip y-axis) + y0 = page.height - element.bbox[3] + y1 = page.height - element.bbox[1] + x0, x1 = element.bbox[0], element.bbox[2] + + layout_info[page_idx].append( Block( bounding_box=BoundingBox(x0, y0, x1, y1), - page_index=page_index, + page_index=page_idx, category=config.name2category["Figure"], source_code="", ) ) - # find the corresponding source code to figure bounding box self.retrieve_figure_source_code(layout_info) - - # convert bounding boxes from PDF coordinate system to image coordinate system self.transform(layout_info) return layout_info def transform(self, layout_info: Dict[int, List[Block]]) -> None: - """Transforms bounding boxes from PDF coordinate system to image coordinate system, - and change them in place. + """Transform bounding boxes from PDF to image coordinates. Args: - layout_info (Dict[int, List[Block]]): A dictionary containing the layout information of each page. - The keys represent the page indices, and the values are lists of Block objects - representing the elements in the layout. + layout_info: Dictionary mapping page numbers to blocks + """ + for page_idx, elements in layout_info.items(): + px2img = self.layout_metadata[page_idx]["px2img"] + + for element in elements: + x0, y0, x1, y1 = element.bbox + width = element.width + height = element.height + + # Scale coordinates + x0 *= px2img + y0 *= px2img + x1 = x0 + width * px2img + y1 = y0 + height * px2img + + element.bbox = BoundingBox(x0, y0, x1, y1) + + def _process_bounding_boxes( + self, + bounding_boxes: List[Tuple], + category: str, + page_idx: int, + source_code: str, + elements: List[Block], + ) -> List[Block]: + """Process and create blocks for bounding boxes. 
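+
+        Bounding boxes are assumed to follow ``skimage.measure.regionprops``
+        ordering for an RGB page image, ``(min_y, min_x, min_ch, max_y,
+        max_x, max_ch)``, so ``bb[1]`` below is a box's left x coordinate
+        and decides which column the box is assigned to.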
+ + Args: + bounding_boxes: List of bounding box coordinates + category: Block category + page_idx: Page number + source_code: Source code for block + elements: Existing block elements Returns: - None + Updated list of block elements """ - for page_index in layout_info.keys(): - px2img = self.layout_metadata[page_index]["px2img"] - for index, element in enumerate(layout_info[page_index]): - x0, y0, x1, y1 = element.bbox - # scale - width, height = element.width, element.height - x0, y0 = x0 * px2img, y0 * px2img - x1, y1 = x0 + width * px2img, y0 + height * px2img - layout_info[page_index][index].bbox = BoundingBox(x0, y0, x1, y1) + separations = self.layout_metadata[page_idx]["separations"] + + if category in envs.one_column_envs: + if bounding_boxes: + element = Block( + bounding_box=BoundingBox.from_list(bounding_boxes), + source_code=source_code, + category=config.name2category[category], + page_index=page_idx, + ) + if elements: + element.parent_block = elements[-1].block_id + elements.append(element) + return elements + + # Handle multi-column case + for col in range(self.layout_metadata["num_columns"]): + col_boxes = [ + bb + for bb in bounding_boxes + if separations[col] <= bb[1] <= separations[col + 1] + ] + if not col_boxes: + continue + + element = Block( + bounding_box=BoundingBox.from_list(col_boxes), + source_code=source_code, + category=config.name2category[category], + page_index=page_idx, + ) + + if elements: + element.parent_block = elements[-1].block_id + + # Merge overlapping blocks + if ( + elements + and elements[-1].category == element.category + and elements[-1].page_index == page_idx + and elements[-1].source_code == source_code + and elements[-1].bbox.overlap(element.bbox) + ): + + elements[-1].bbox = BoundingBox( + min(elements[-1].bbox.x0, element.bbox.x0), + min(elements[-1].bbox.y0, element.bbox.y0), + max(elements[-1].bbox.x1, element.bbox.x1), + max(elements[-1].bbox.y1, element.bbox.y1), + ) + else: + elements.append(element) + + return elements def generate_non_figure_bb(self) -> Dict[int, List[Block]]: - """Generates non-figure bounding boxes using the image pairs. + """Generate bounding boxes for non-figure elements. Returns: - Dict[int, List[Block]]: A dictionary containing the layout information of each page. - The keys represent the page indices, and the values are lists of Block objects - representing the elements in the layout. 
+ Dictionary mapping page numbers to block elements """ - background_directory = os.path.join(self.output_directory, "paper_white") - block_directories = glob.glob( - f"{self.output_directory}/paper_{config.folder_prefix}*" + background_dir = self.output_directory / "paper_white" + block_dirs = glob.glob( + str(self.output_directory / f"paper_{config.folder_prefix}*") ) layout_info = defaultdict(list) pattern = r"paper_(\w+)_(\d{5})_(.*?)_(\d{5})" - for block_directory in tqdm(sorted(block_directories)): - log.debug(f"Processing {block_directory}") - image_pairs = get_image_pairs(block_directory, background_directory) - matches = re.match(pattern, os.path.basename(block_directory)) + for block_dir in tqdm(sorted(block_dirs)): + matches = re.match(pattern, Path(block_dir).name) if not matches: - raise ValueError(f"Cannot find the matching pattern: {block_directory}") + raise ValueError(f"Invalid directory name pattern: {block_dir}") + category = matches.group(3) index = int(matches.group(4)) - log.debug(f"category: {category}, index: {index}") + elements: List[Block] = [] - elements = [] - for image_pair in image_pairs: - page_index = image_pair[0] + for page_idx, img1_path, img2_path in get_image_pairs( + Path(block_dir), background_dir + ): + # Compare images + img1 = np.array(plt.imread(img1_path), dtype=np.uint8) + img2 = np.array(plt.imread(img2_path), dtype=np.uint8) + diff = np.abs(img2 - img1, dtype=np.uint8) - image1_array = np.array(plt.imread(image_pair[1]), dtype=np.uint8) - image2_array = np.array(plt.imread(image_pair[2]), dtype=np.uint8) - - diff_image = np.abs(image2_array - image1_array, dtype=np.uint8) - if np.all(diff_image == 0): - continue - labeled_image, num = label( - diff_image > config.threshold, return_num=True - ) - if num == 0: + if np.all(diff == 0): continue - regions = regionprops(labeled_image) - bounding_boxes = [region.bbox for region in regions] - - if len(bounding_boxes) == 0: + # Get regions + labeled, num = label(diff > config.threshold, return_num=True) + if num == 0: continue - separations = self.layout_metadata[page_index]["separations"] - top_margin = self.layout_metadata[page_index]["top_margin"] - - # We do not consider the cross column case for these envs. 
- if category in envs.one_column_envs: - bounding_boxes = [bb for bb in bounding_boxes] - if len(bounding_boxes) == 0: - continue - element = Block( - bounding_box=BoundingBox.from_list(bounding_boxes), - source_code=self.text_info[category][index], - category=config.name2category[category], - page_index=page_index, - ) - if elements: - element.parent_block = elements[-1].block_id - elements.append(element) + bounding_boxes = [region.bbox for region in regionprops(labeled)] + if not bounding_boxes: continue - # consider possible cross column case - for column in range(self.layout_metadata["num_columns"]): - # min_x: bb[1], min_y: bb[0], max_x: bb[4], max_y: bb[3] - column_boxes = [ - bb - for bb in bounding_boxes - if bb[1] >= separations[column] - and bb[1] <= separations[column + 1] - ] - if not column_boxes: - continue - - element = Block( - bounding_box=BoundingBox.from_list(column_boxes), - source_code=self.text_info[category][index], - category=config.name2category[category], - page_index=page_index, - ) - if elements: - element.parent_block = elements[-1].block_id - - if ( - len(elements) > 0 - and elements[-1].category == element.category - and elements[-1].page_index == element.page_index - and elements[-1].source_code == element.source_code - and elements[-1].bbox.overlap(element.bbox) - ): - elements[-1].bbox = BoundingBox( - min( - elements[-1].bbox.x0, - element.bbox.x0, - ), - min( - elements[-1].bbox.y0, - element.bbox.y0, - ), - max( - elements[-1].bbox.x1, - element.bbox.x1, - ), - max( - elements[-1].bbox.y1, - element.bbox.y1, - ), - ) - continue - elements.append(element) + elements = self._process_bounding_boxes( + bounding_boxes, + category, + page_idx, + self.text_info[category][index], + elements, + ) for element in elements: layout_info[element.page_index].append(element) @@ -373,176 +369,126 @@ def generate_non_figure_bb(self) -> Dict[int, List[Block]]: return layout_info def generate_layout_info(self) -> Dict[int, List[Block]]: - """Generate layout information for the given PDF. - - This function extracts the PDF layouts using the `extract_pdf_layouts` method - and parses the metadata using the `parse_metadata` method. - Then, it generates non-figure bounding boxes using the `generate_non_figure_bb` method - and figure bounding boxes using the `generate_figure_bb` method. - - Args: - None + """Generate complete layout information. Returns: - Dict[int, List[Block]]: A dictionary containing the layout information for - each page of the PDF. - The keys represent the page indices, and the values are lists of `Block` objects - that represent the bounding boxes. - + Dictionary mapping page numbers to all block elements """ pdf_layouts = self.extract_pdf_layouts() self.parse_metadata(pdf_layouts) + layout_info = self.generate_non_figure_bb() figure_layout_info = self.generate_figure_bb(pdf_layouts) - for page_index in layout_info.keys(): - layout_info[page_index].extend(figure_layout_info[page_index]) + # Combine figure and non-figure info + for page_idx, figures in figure_layout_info.items(): + layout_info[page_idx].extend(figures) + return layout_info def generate_image_annotation( self, layout_info: Dict[int, List[Block]] ) -> Dict[int, Dict[str, Any]]: - """Generate image annotations based on the layout information. + """Generate annotated images with bounding boxes. Args: - layout_info (Dict[int, List[Block]]): A dictionary mapping page indices to a list of Block objects - representing the layout information. 
+            layout_info: Dictionary mapping page numbers to blocks
 
         Returns:
-            Dict[int, Dict[str, Any]]: A dictionary mapping page indices to annotated image info.
+            Dictionary of image annotation information
         """
-        # sort all images by page index, see utils.pdf2jpg for details
-        # FIXME: use more robust way
         image_files = sorted(
-            glob.glob(os.path.join(self.pdf_images_path, "*.jpg")),
+            glob.glob(str(self.pdf_images_path / "*.jpg")),
             key=lambda x: x[-6:-4],
         )
 
-        image_info = {}  # annotation image info member of COCO
-        for page_index in layout_info.keys():
-            image_info[page_index] = {}
-            page_image = Image.open(image_files[page_index])
-            draw = ImageDraw.Draw(page_image)
-            # use `locate .ttf` to find the available fonts
-            font = ImageFont.truetype(
-                config.config["annotation_image_font_type"],
-                config.config["annotation_image_font_size"],
-            )
-
-            for element in layout_info[page_index]:
-                category = element.category
-                draw.rectangle(
-                    element.bbox.to_tuple(),
-                    outline=config.colors_map[str(category)],
-                    width=3,
-                )
-                draw.text(
-                    (element.bbox[0], element.bbox[1]),
-                    config.category2name[category],
-                    fill=(255, 0, 0),
-                    font=font,
+        image_info = {}
+        for page_idx in layout_info:
+            with Image.open(image_files[page_idx]) as page_image:
+                annotated = generate_geometry_annotation(
+                    page_image, layout_info[page_idx]
                 )
 
-            image_name = "page_" + str(page_index).zfill(4) + ".jpg"
-            annotated_image_path = os.path.join(self.result_directory, image_name)
-            image_info[page_index]["file_name"] = image_name
-            image_info[page_index]["width"] = page_image.width
-            image_info[page_index]["height"] = page_image.height
-            page_image.save(annotated_image_path)
-            page_image.close()
+                # Save and read sizes while the image is still open; PIL
+                # rejects operations on an image closed by the `with` block
+                image_name = f"page_{str(page_idx).zfill(4)}.jpg"
+                annotated.save(self.result_directory / image_name)
+
+                image_info[page_idx] = {
+                    "file_name": image_name,
+                    "width": page_image.width,
+                    "height": page_image.height,
+                }
 
         return image_info
 
     def annotate(self):
-        """Annotates the layout, reading, order, and quality report of the given image.
-
-        Returns:
-            None
-        """
-        # step1: generate layout info
+        """Generate complete layout annotation."""
+        # Generate layout information
        layout_info = self.generate_layout_info()
         layout_info_data = {
-            key: [x.to_dict() for x in values] for key, values in layout_info.items()
+            key: [x.to_dict() for x in blocks] for key, blocks in layout_info.items()
        }
-        layout_info_file = os.path.join(self.result_directory, "layout_info.json")
-        utils.export_to_json(layout_info_data, layout_info_file)
+        utils.export_to_json(
+            layout_info_data, self.result_directory / "layout_info.json"
+        )
 
-        # step2: generate layout detection result
+        # Generate annotations
         image_annotation = self.generate_image_annotation(layout_info)
-        layout_annotation_file = os.path.join(
-            self.result_directory, "layout_annotation.json"
-        )
         utils.export_to_coco(
-            layout_info, image_annotation, file_path=layout_annotation_file
+            layout_info,
+            image_annotation,
+            self.result_directory / "layout_annotation.json",
         )
 
 
-def get_image_pairs(dir1: str, dir2: str):
-    """
-    Generate a list of image pairs based on the directories provided.
-
-    Parameters:
-    dir1 (str): The directory path to the first set of images.
-    dir2 (str): The directory path to the second set of images.
+def get_image_pairs(dir1: Path, dir2: Path) -> List[Tuple[int, str, str]]:
+    """Get matching pairs of images from two directories.
 
-    Raises:
-        FileNotFoundError: If the number of images in each directory does not
-        match or if the page index in the file names does not match. 
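+    Example (illustrative paths; assumes both directories hold page images
+    named like ``thread-0-page-1.jpg``, as produced by ``utils.pdf2jpg``)::
+
+        pairs = get_image_pairs(Path("output/paper_white"), Path("output/other"))
+        # pairs[0] == (0, "output/paper_white/thread-0-page-1.jpg",
+        #              "output/other/thread-0-page-1.jpg")
+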
+ Args: + dir1: First directory path + dir2: Second directory path Returns: - list: A list of tuples representing the image pairs. - Each tuple contains the page index, the path to the rendered image, - and the path to the changed image. + List of tuples containing (page_index, image1_path, image2_path) + + Raises: + FileNotFoundError: If image counts don't match + ValueError: If page index can't be extracted """ - file_pattern = os.path.join(dir1, "*.jpg") - rendered_jpg_files = sorted(glob.glob(file_pattern)) - file_pattern = os.path.join(dir2, "*.jpg") - changed_jpg_files = sorted(glob.glob(file_pattern)) + rendered_files = sorted(glob.glob(str(dir1 / "*.jpg"))) + changed_files = sorted(glob.glob(str(dir2 / "*.jpg"))) - if len(rendered_jpg_files) != len(changed_jpg_files): - raise FileNotFoundError("Wrong image path or file name or page index!") + if len(rendered_files) != len(changed_files): + raise FileNotFoundError("Mismatched image counts between directories") def extract_page_index(filename: str) -> int: - pattern = r"thread-\d+-page-(\d+)\.jpg" - - match = re.search(pattern, filename) - if match: - page_index = int(match.group(1)) - return page_index - 1 - else: - raise ValueError("Cannot found corresponding page index") + match = re.search(r"thread-\d+-page-(\d+)\.jpg", filename) + if not match: + raise ValueError(f"Cannot extract page index from {filename}") + return int(match.group(1)) - 1 - page_indices = [] - for i in range(len(rendered_jpg_files)): - file_name = os.path.basename(rendered_jpg_files[i]) - page_index = extract_page_index(file_name) - page_indices.append(int(page_index)) - - image_pairs = list(zip(page_indices, rendered_jpg_files, changed_jpg_files)) - return image_pairs + page_indices = [extract_page_index(Path(f).name) for f in rendered_files] + return list(zip(page_indices, rendered_files, changed_files)) def generate_geometry_annotation( page_image: Image.Image, layout_elements: List[Block] ) -> Image.Image: - """ - Generate an annotation for an image. + """Add geometric annotations to an image. Args: - page_image (Image.Image): The image to annotate. - page_elements (List[LTComponent]): A list of elements to be annotated. + page_image: Image to annotate + layout_elements: List of block elements to annotate Returns: - Image.Image: The annotated image. + Annotated image """ draw = ImageDraw.Draw(page_image) - # use `locate .ttf` to find the available fonts font = ImageFont.truetype( config.config["annotation_image_font_type"], config.config["annotation_image_font_size"], ) - for index, element in enumerate(layout_elements): + for element in layout_elements: category = element.category draw.rectangle( element.bbox.to_tuple(), outline=config.colors_map[str(category)], width=3 diff --git a/DocParser/vrdu/order_annotation.py b/DocParser/vrdu/order_annotation.py index e1dd5ee..da5e109 100644 --- a/DocParser/vrdu/order_annotation.py +++ b/DocParser/vrdu/order_annotation.py @@ -1,65 +1,70 @@ import re -import os from uuid import uuid4 +from pathlib import Path +from typing import Dict, List, Any from DocParser.vrdu.block import Block from DocParser.vrdu.config import config from DocParser.vrdu import utils -from DocParser.logger import logger - -log = logger.get_logger(__name__) class OrderAnnotation: - def __init__(self, tex_file: str) -> None: + """Handles annotation of reading order relationships between document elements.""" + + def __init__(self, tex_file: Path) -> None: + """Initialize order annotation for a LaTeX file. 
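+
+        Expects ``output/result/layout_info.json`` (written by
+        ``LayoutAnnotation.annotate``) to map page indices to lists of
+        serialized blocks, roughly::
+
+            {"0": [{"block_id": 0, "category": 1, "page_index": 0,
+                    "bbox": [x0, y0, x1, y1], ...}, ...]}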
+
+        Args:
+            tex_file: Path to the LaTeX file
+        """
         self.tex_file = tex_file
-        self.main_directory = os.path.dirname(tex_file)
-        self.result_directory = os.path.join(self.main_directory, "output/result")
-        layout_info_file = os.path.join(self.result_directory, "layout_info.json")
+        self.main_directory = tex_file.parent
+        self.result_directory = self.main_directory / "output/result"
+
+        # Load layout info
+        layout_info_file = self.result_directory / "layout_info.json"
         layout_info_data = utils.load_json(layout_info_file)
         layout_info = {
             int(key): [Block.from_dict(item) for item in values]
             for key, values in layout_info_data.items()
         }
 
-        # result
-        self.annotations = {}
-        self.annotations["annotations"] = [
-            _block
-            for page_index in layout_info.keys()
-            for _block in layout_info[page_index]
-        ]
+        # Initialize annotations
+        self.annotations: Dict[str, Any] = {
+            "annotations": [
+                block for page_blocks in layout_info.values() for block in page_blocks
+            ],
+            "orders": [],
+        }
 
-    def annotate(self):
-        self.annotations["orders"] = []
+    def annotate(self) -> None:
+        """Generate and save all order annotations."""
+        # Generate the different types of order relationships
         self.generate_sortable_envs_order()
         self.generate_float_envs_order()
         self.generate_cross_reference_order()
 
-        order_annotation_file = os.path.join(
-            self.result_directory, "order_annotation.json"
-        )
-
+        # Save annotations
+        order_annotation_file = self.result_directory / "order_annotation.json"
         transformed_annotations = {
             "annotations": [x.to_dict() for x in self.annotations["annotations"]],
             "orders": self.annotations["orders"],
         }
-
         utils.export_to_json(transformed_annotations, order_annotation_file)
 
-    def generate_cross_reference_order(self):
-        annotations = []
+    def generate_cross_reference_order(self) -> None:
+        """Generate order annotations for cross-references."""
+        annotations: List[Dict[str, Any]] = []
 
-        # map from label to block_id
-        label_to_block_id = {}
-        for block in self.annotations["annotations"]:
-            if not block.labels:
-                continue
-            for _label in block.labels:
-                label_to_block_id[_label] = block.block_id
+        # Build label to block ID mapping
+        label_to_block_id = {
+            label: block.block_id
+            for block in self.annotations["annotations"]
+            if block.labels
+            for label in block.labels
+        }
 
+        # Reference patterns to match
         ref_patterns = "|".join(
             [
                 r"\\ref\{(.*?)\}",
@@ -71,274 +76,241 @@ def generate_cross_reference_order(self):
                 r"\\labelcref\{(.*?)\}",
             ]
         )
-        # generate reference according to label
-        for block in self.annotations["annotations"]:
-            if config.category2name[block.category] not in ["Text", "Text-EQ"]:
-                continue
-            block.references = [
-                x
-                for group in re.findall(ref_patterns, block.source_code)
-                for x in group
-                if x
-            ]
-            for _label in block.references:
-                if _label in label_to_block_id:
-                    annotations.append(
-                        {
-                            "type": "explicit-cite",
-                            "from": block.block_id,
-                            "to": label_to_block_id[_label],
-                        }
-                    )
+        # Process text blocks
         for block in self.annotations["annotations"]:
-            if config.category2name[block.category] != "Caption":
-                continue
-            if not block.references:
-                continue
-            for _label in block.references:
-                if _label not in label_to_block_id:
-                    continue
-                annotations.append(
-                    {
-                        "type": "implicit-cite",
-                        "from": block.block_id,
-                        "to": label_to_block_id[_label],
-                    }
+            category = config.category2name[block.category]
+
+            # Handle text and equation references
+            if category in ["Text", "Text-EQ"]:
+                block.references = self._extract_references(
+                    block.source_code, ref_patterns
                 )
+
+                
self._add_reference_annotations( + block, label_to_block_id, annotations, "explicit-cite" ) - # generate reference for float environments - for block in self.annotations["annotations"]: - if config.category2name[block.category] not in ["Table", "Algorithm"]: - continue - block.references = [ - x - for group in re.findall(ref_patterns, block.source_code) - for x in group - if x - ] - for _label in block.references: - if _label in label_to_block_id: - annotations.append( - { - "type": "explicit-cite", - "from": block.block_id, - "to": label_to_block_id[_label], - } - ) + # Handle caption references + elif category == "Caption" and block.references: + self._add_reference_annotations( + block, label_to_block_id, annotations, "implicit-cite" + ) + + # Handle table and algorithm references + elif category in ["Table", "Algorithm"]: + block.references = self._extract_references( + block.source_code, ref_patterns + ) + self._add_reference_annotations( + block, label_to_block_id, annotations, "explicit-cite" + ) self.annotations["orders"].extend(annotations) - def generate_float_envs_order(self): - label_pattern = r"\\label\{(.*?)\}" + def _extract_references(self, text: str, pattern: str) -> List[str]: + """Extract reference labels from text using pattern.""" + return [x for group in re.findall(pattern, text) for x in group if x] + + def _add_reference_annotations( + self, + block: Block, + label_map: Dict[str, str], + annotations: List[Dict[str, str]], + ref_type: str, + ) -> None: + """Add reference annotations for a block.""" + for label in block.references: + if label in label_map: + annotations.append( + {"type": ref_type, "from": block.block_id, "to": label_map[label]} + ) + def generate_float_envs_order(self) -> None: + """Generate order annotations for floating environments.""" with open(self.tex_file, "r") as f: latex_content = f.read() - # 0, add labels for titles - # TODO: add labels for other types of titles + + # Process title labels + self._process_title_labels(latex_content) + + # Process equation labels + self._process_equation_labels() + + # Process float environment labels + category_patterns = { + "Table": r"\\begin\{table\*?\}(.*?)\\end\{table\*?\}", + "Figure": r"\\begin\{figure\*?\}(.*?)\\end\{figure\*?\}", + "Algorithm": r"\\begin\{algorithm\*?\}(.*?)\\end\{algorithm\*?\}", + } + + category_indices = { + category: [ + (match.start(), match.end(), str(uuid4())) + for match in re.finditer(pattern, latex_content, re.DOTALL) + ] + for category, pattern in category_patterns.items() + } + + label_pattern = r"\\label\{(.*?)\}" + + # Process each category + for category, indices in category_indices.items(): + self._process_float_env_labels( + category, indices, latex_content, label_pattern + ) + + def _process_title_labels(self, latex_content: str) -> None: + """Process and add labels for title blocks.""" + label_pattern = r"\\label\{(.*?)\}" + for block in self.annotations["annotations"]: if config.category2name[block.category] != "Title": continue + block.labels = re.findall(label_pattern, block.source_code) - start_index = latex_content.find(block.source_code) - if start_index == -1: + # Find additional labels after the title + start_idx = latex_content.find(block.source_code) + if start_idx == -1: continue - end_index = start_index + len(block.source_code) - _matches = re.finditer(label_pattern, latex_content[end_index:], re.DOTALL) - for _match in _matches: - label_start_index, label_end_index = ( - _match.start() + end_index, - _match.end() + end_index, - ) - label_content = 
latex_content[label_start_index:label_end_index] - if latex_content[end_index:label_start_index].isspace(): + + end_idx = start_idx + len(block.source_code) + matches = re.finditer(label_pattern, latex_content[end_idx:], re.DOTALL) + + for match in matches: + label_start = match.start() + end_idx + label_end = match.end() + end_idx + label_content = latex_content[label_start:label_end] + + if latex_content[end_idx:label_start].isspace(): block.labels.extend(re.findall(label_pattern, label_content)) break - # 1. add labels for equations + def _process_equation_labels(self) -> None: + """Process and add labels for equation blocks.""" + label_pattern = r"\\label\{(.*?)\}" + + for block in self.annotations["annotations"]: + if config.category2name[block.category] == "Equation": + block.labels = re.findall(label_pattern, block.source_code) + + def _process_float_env_labels( + self, + category: str, + indices: List[tuple], + latex_content: str, + label_pattern: str, + ) -> None: + """Process and add labels for floating environment blocks.""" for block in self.annotations["annotations"]: - if config.category2name[block.category] != "Equation": + if config.category2name[block.category] != category: continue - block.labels = re.findall(label_pattern, block.source_code) - # 2. add labels for float envs - # find the interval of tables - category_to_patterns = { - "Table": re.compile( - r"\\begin\{table\*?\}(.*?)\\end\{table\*?\}", re.DOTALL - ), - "Figure": re.compile( - r"\\begin\{figure\*?\}(.*?)\\end\{figure\*?\}", re.DOTALL - ), - "Algorithm": re.compile( - r"\\begin\{algorithm\*?\}(.*?)\\end\{algorithm\*?\}", re.DOTALL - ), - } + start_idx = latex_content.find(block.source_code) + if start_idx == -1: + continue - category_to_indices = {} - for category, pattern in category_to_patterns.items(): - category_to_indices[category] = [] - indices = pattern.finditer(latex_content) - # we add a uuid to match for float environments in case - # there are no explicit cite - for _match in indices: - category_to_indices[category].append( - (_match.start(), _match.end(), str(uuid4())) - ) + end_idx = start_idx + len(block.source_code) - for category_name, indices in category_to_indices.items(): - # find labels for those float environments - for block in self.annotations["annotations"]: - if config.category2name[block.category] != category_name: + for idx_start, idx_end, uuid in indices: + if not (start_idx >= idx_start and end_idx <= idx_end): continue - start_index = latex_content.find(block.source_code) - if start_index == -1: - continue - end_index = start_index + len(block.source_code) + labels = re.findall(label_pattern, latex_content[idx_start:idx_end]) + block.labels = labels + block.labels.append(uuid) - for index in indices: - if start_index < index[0] or end_index > index[1]: - continue + # Process caption references + for block in self.annotations["annotations"]: + if config.category2name[block.category] != "Caption": + continue - labels = re.findall( - label_pattern, latex_content[index[0] : index[1]] - ) - block.labels = labels - block.labels.append(index[2]) + start_idx = latex_content.find(block.source_code) + if start_idx == -1: + continue - # add references for captions to those float environments - for block in self.annotations["annotations"]: - if config.category2name[block.category] != "Caption": - continue - start_index = latex_content.find(block.source_code) - if start_index == -1: - continue - end_index = start_index + len(block.source_code) - for index in indices: - if start_index < 
index[0] or end_index > index[1]: - continue + end_idx = start_idx + len(block.source_code) + + for idx_start, idx_end, uuid in indices: + if start_idx >= idx_start and end_idx <= idx_end: + block.references = [uuid] - block.references = [index[2]] + def generate_sortable_envs_order(self) -> None: + """Generate order annotations for sortable environments.""" + annotations: List[Dict[str, str]] = [] - def generate_sortable_envs_order(self): - annotations = [] - sortable_categories = [ + # Get relevant category IDs + sortable_cats = [ config.name2category[name] for name in config.sortable_categories ] - - sortable_elements = [ - _block - for _block in self.annotations["annotations"] - if _block.category in sortable_categories + title_cats = [ + config.name2category[name] for name in ["Title", "PaperTitle", "Abstract"] ] - - title_categories = [ - config.name2category[x] for x in ["Title", "PaperTitle", "Abstract"] + text_cats = [ + config.name2category[name] + for name in ["Text", "Text-EQ", "Equation", "List"] ] - text_categories = [ - config.name2category[x] for x in ["Text", "Text-EQ", "Equation", "List"] + # Get sortable elements + sortable_elements = [ + block + for block in self.annotations["annotations"] + if block.category in sortable_cats ] - stack = [] - for index, element in enumerate(sortable_elements): - if index == 0 or not stack: + stack: List[Block] = [] + for idx, element in enumerate(sortable_elements): + if idx == 0 or not stack: stack.append(element) continue - # case 0: both corresponding to the same text, mark as identical + # Handle different cases if element.parent_block == stack[-1].block_id: - annotations.append( - { - "type": "identical", - "from": element.block_id, - "to": stack[-1].block_id, - } - ) + self._add_order_annotation(annotations, element, stack[-1], "identical") stack.pop() stack.append(element) - continue - # case 1: both in the text category, mark as adj - if ( - element.category in text_categories - and stack[-1].category in text_categories - ): - annotations.append( - { - "type": "adj", - "from": element.block_id, - "to": stack[-1].block_id, - } - ) + elif element.category in text_cats and stack[-1].category in text_cats: + self._add_order_annotation(annotations, element, stack[-1], "adj") stack.pop() stack.append(element) - continue - # case 2: current in text, prev in title, mark as sub - if ( - element.category in text_categories - and stack[-1].category in title_categories + elif ( + element.category in text_cats + and stack[-1].category in title_cats + and element.category != stack[-1].category ): - if element.category != stack[-1].category: - annotations.append( - { - "type": "sub", - "from": element.block_id, - "to": stack[-1].block_id, - } - ) - stack.append(element) - continue + self._add_order_annotation(annotations, element, stack[-1], "sub") + stack.append(element) - # case 3: current in title, prev in text, find the most recent title - if ( - element.category in title_categories - and stack[-1].category in text_categories - ): - while stack and stack[-1].category not in title_categories: + elif element.category in title_cats and stack[-1].category in text_cats: + while stack and stack[-1].category not in title_cats: stack.pop() - if not stack: - stack.append(element) - continue - - annotations.append( - { - "type": "peer", - "from": element.block_id, - "to": stack[-1].block_id, - } - ) + if stack: + self._add_order_annotation(annotations, element, stack[-1], "peer") stack.append(element) - continue - # case 4: both in titles, mark as 
peer - if ( - element.category in title_categories - and stack[-1].category in title_categories - ): - annotations.append( - { - "type": "peer", - "from": element.block_id, - "to": stack[-1].block_id, - } - ) + elif element.category in title_cats and stack[-1].category in title_cats: + self._add_order_annotation(annotations, element, stack[-1], "peer") stack.pop() stack.append(element) - continue - if element.category == config.name2category["Footnote"]: - annotations.append( - { - "type": "explicit-cite", - "from": element.block_id, - "to": stack[-1].block_id, - } + elif element.category == config.name2category["Footnote"]: + self._add_order_annotation( + annotations, element, stack[-1], "explicit-cite" ) - continue self.annotations["orders"].extend(annotations) + + def _add_order_annotation( + self, + annotations: List[Dict[str, str]], + from_block: Block, + to_block: Block, + rel_type: str, + ) -> None: + """Add an order annotation between two blocks.""" + annotations.append( + {"type": rel_type, "from": from_block.block_id, "to": to_block.block_id} + ) diff --git a/DocParser/vrdu/preprocess.py b/DocParser/vrdu/preprocess.py index 96f2f95..e5ea59a 100644 --- a/DocParser/vrdu/preprocess.py +++ b/DocParser/vrdu/preprocess.py @@ -1,189 +1,169 @@ -import os import re +from pathlib import Path +from typing import Optional +from loguru import logger from DocParser.arxiv_cleaner.cleaner import Cleaner from DocParser.vrdu.config import envs, config from DocParser.vrdu import utils -from DocParser.logger import logger -log = logger.get_logger(__name__) - - -def remove_comments(original_tex: str) -> None: +def remove_comments(tex_file: Path) -> None: """ - Removes comments from a TeX file. + Removes LaTeX comments from a TeX file. Args: - original_tex (str): The path to the original TeX file. - - Returns: - None + tex_file: Path to the TeX file """ - with open(original_tex, "r") as file: - content = file.read() + tex_file = Path(tex_file) + content = tex_file.read_text() # Remove LaTeX comments pattern = r"\\begin{comment}(.*?)\\end{comment}" - removed_comments = re.sub(pattern, "", content, flags=re.DOTALL) + content = re.sub(pattern, "", content, flags=re.DOTALL) - with open(original_tex, "w") as file: - file.write(removed_comments) + tex_file.write_text(content) -def clean_tex(original_tex: str) -> None: +def clean_tex(tex_file: Path) -> None: """ - Clean the given TeX file by creating a cleaner object and running the clean method. + Clean the given TeX file using arxiv-cleaner. Args: - original_tex (str): The path to the original TeX file. 
-
-    Returns:
-        None
+        tex_file: Path to the TeX file
     """
-    main_directory = os.path.dirname(original_tex)
-    tex = os.path.basename(original_tex)
+    tex_file = Path(tex_file)
+    main_directory = tex_file.parent
 
-    # Create the cleaner
+    # Create and run the cleaner
     cleaner = Cleaner(
-        input_dir=main_directory,
-        output_dir=main_directory,
-        tex=tex,
+        input_dir=str(main_directory),
+        output_dir=str(main_directory),
+        tex=tex_file.name,
         command_options=config.command_options,
         verbose=False,
     )
-
-    # Run the cleaner
     cleaner.clean()
 
-    # remove comments
-    remove_comments(original_tex)
+    # Remove any remaining comments
+    remove_comments(tex_file)
+
+
+def get_graphics_path(content: str) -> str:
+    """Extract graphics path from LaTeX content."""
+    pattern = r"\\graphicspath\{\{(.+?)}"
+    if match := re.search(pattern, content, re.DOTALL):
+        return match.group(1)
+    return ""
 

-def replace_pdf_ps_figures_with_png(original_tex: str) -> None:
+
+def convert_image(image_path: Path) -> Optional[str]:
+    """
+    Convert an image to a pdfminer-detectable format if needed.
+    Returns the new image name, or None if the source file is missing.
+    """
-    Replaces PDF, ps, eps figures with PNG figures in a TeX file
+    if not image_path.exists():
+        logger.error(f"File not found: {image_path}")
+        return None
+
+    if image_path.suffix in [".eps", ".ps"]:
+        # Convert eps/ps to pdf first
+        pdf_path = image_path.with_suffix(".pdf")
+        utils.convert_eps_image_to_pdf_image(image_path, pdf_path)
+        image_path = pdf_path
+
+    if image_path.suffix == ".pdf":
+        # Convert pdf to png
+        png_path = image_path.with_suffix(".png")
+        utils.convert_pdf_figure_to_png_image(image_path, png_path)
+        return png_path.name
+
+    return image_path.name
+
+
+def replace_pdf_ps_figures_with_png(tex_file: Path) -> None:
+    """
+    Replace PDF, PS, EPS figures with PNG figures in a TeX file
     to support pdfminer detecting bounding box.
 
     Args:
-        original_tex (str): The path to the original TeX file.
-
-    Returns:
-        None: This function does not return anything.
+        tex_file: Path to the TeX file
 
     Raises:
-        FileNotFoundError: If a PDF file specified in the TeX file is not found.
+        FileNotFoundError: If an image file is not found
     """
+    tex_file = Path(tex_file)
+    main_directory = tex_file.parent
+    content = tex_file.read_text()
 
-    # FIXME: use more robust way, since the path to images may not exists. 
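+    # Resolve any \graphicspath prefix up front so the image lookups below
+    # can find files referenced relative to it.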
-    main_directory = os.path.dirname(original_tex)
-    with open(original_tex) as f:
-        content = f.read()
+    graphics_path = get_graphics_path(content)
 
-    graphicspath_pattern = r"\\graphicspath\{\{(.+?)}"
-    match = re.search(graphicspath_pattern, content, re.DOTALL)
-    if match:
-        graphic_path = match.group(1)
-    else:
-        graphic_path = ""
-
-    # Replace \psfig{...} with \includegraphics{...}
+    # Replace \psfig and \epsfig with \includegraphics
     content = re.sub(r"\\psfig{([^}]*)}", r"\\includegraphics{\1}", content)
-
-    # Replace \epsfig{...} with \includegraphics{...}
     content = re.sub(r"\\epsfig{([^}]*)}", r"\\includegraphics{\1}", content)
 
-    # Regular expression pattern to match \includegraphics
-    # commands with PDF files
+    # Find all \includegraphics commands
     pattern = r"\\includegraphics(\[.*?\])?\{(.*?)\}"
-
-    # Find all matches of \includegraphics with PDF files
     matches = re.findall(pattern, content)
 
-    # Replace PDF paths with PNG paths
+    # Supported extensions
     ext_patterns = [".eps", ".ps", ".jpg", ".jpeg", ".png", ".pdf"]
-    for match in matches:
-        image_name = match[1]
+
+    # Process each image
+    for _, img_path in matches:
+        image_name = img_path
+
+        # Add extension if missing
         if not any(ext in image_name for ext in ext_patterns):
             for ext in ext_patterns:
-                image_file = os.path.join(main_directory, graphic_path, image_name, ext)
-                if os.path.exists(image_file):
-                    image_name = image_name + ext
+                test_path = Path(main_directory, graphics_path, image_name).with_suffix(
+                    ext
+                )
+                if test_path.exists():
+                    image_name = f"{image_name}{ext}"
                     break
 
-        # detectable image type, see pdfminer.six for details
-        if any(ext in image_name for ext in [".jpg", ".jpeg", "png"]):
-            content = content.replace(match[1], image_name)
+        # Skip if already in supported format
+        if any(ext in image_name for ext in [".jpg", ".jpeg", ".png"]):
+            content = content.replace(img_path, image_name)
             continue
 
-        # convert eps to pdf
-        if any(ext in image_name for ext in [".eps", ".ps"]):
-            eps_image = os.path.join(main_directory, graphic_path, image_name)
-            if not os.path.exists(eps_image):
-                log.error(f"File not found: {eps_image}")
-                continue
-            pdf_image = os.path.splitext(eps_image)[0] + ".pdf"
-            utils.convert_eps_image_to_pdf_image(eps_image, pdf_image)
-            image_name = os.path.basename(pdf_image)
-
-        # convert pdf to png
-        if image_name.endswith(".pdf"):
-            pdf_image = os.path.join(main_directory, graphic_path, image_name)
-            if not os.path.exists(pdf_image):
-                log.error(f"File not found: {pdf_image}")
-                continue
-            png_image = os.path.splitext(pdf_image)[0] + ".png"
-            utils.convert_pdf_figure_to_png_image(pdf_image, png_image)
-            image_name = os.path.splitext(image_name)[0] + ".png"
-
-        # replace the reference in tex file
-        content = content.replace(match[1], image_name)
-
-    with open(original_tex, "w") as f:
-        f.write(content)
-
-
-def delete_table_of_contents(original_tex: str) -> None:
-    """
-    Deletes the table of contents from the given original_tex file.
-    This includes table of contents, list of figures, list of tables, and list of algorithms.
+        # Convert image if needed
+        image_path = Path(main_directory, graphics_path, image_name)
+        if new_name := convert_image(image_path):
+            content = content.replace(img_path, new_name)
+
+    tex_file.write_text(content)
 

-    Parameters:
-    original_tex (str): The path to the original .tex file. 
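+
+# Note: envs.table_of_contents is expected to hold bare command names, e.g.
+# ["tableofcontents", "listoffigures", "listoftables", "listofalgorithms"],
+# which delete_table_of_contents() below joins into a single regex such as
+# r"\\(tableofcontents|listoffigures|...)".
+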
- Returns: - None +def delete_table_of_contents(tex_file: Path) -> None: """ - with open(original_tex, "r") as file: - latex_content = file.read() + Delete table of contents, list of figures/tables/algorithms. + + Args: + tex_file: Path to the TeX file + """ + tex_file = Path(tex_file) + content = tex_file.read_text() pattern = r"\\(" + "|".join(envs.table_of_contents) + r")" - modified_content = re.sub(pattern, "", latex_content) + content = re.sub(pattern, "", content) - with open(original_tex, "w") as file: - file.write(modified_content) + tex_file.write_text(content) -def run(original_tex: str) -> None: +def run(tex_file: Path) -> None: """ - Generates a modified version of the given LaTeX document by performing the following steps: - - Step 0: Clean the LaTeX document with arxiv_cleaner package. - Step 1: Replace EPS figures with PDF to make the LaTeX document compilable with pdflatex. - Step 2: Replace PDF figures with PNG to make pdfminer work. - Step 3: Delete the table of contents from the LaTeX document. + Preprocess a LaTeX document by: + 1. Cleaning with arxiv_cleaner + 2. Converting figures to PNG format + 3. Removing table of contents Args: - original_tex (str): The original LaTeX document. - - Returns: - None + tex_file: Path to the LaTeX document """ - # Step 0: clean tex - clean_tex(original_tex) - - # Step 2: process images - replace_pdf_ps_figures_with_png(original_tex) - - # Step 3: delete table of contents - delete_table_of_contents(original_tex) + clean_tex(tex_file) + replace_pdf_ps_figures_with_png(tex_file) + delete_table_of_contents(tex_file) diff --git a/DocParser/vrdu/quality_check.py b/DocParser/vrdu/quality_check.py index 846ec8d..49eedf2 100644 --- a/DocParser/vrdu/quality_check.py +++ b/DocParser/vrdu/quality_check.py @@ -1,127 +1,186 @@ -from typing import Dict, List -import os +"""Quality check module for analyzing layout and text information.""" + +from typing import Dict, List, Any +from pathlib import Path from DocParser.vrdu.block import Block from DocParser.vrdu import utils from DocParser.vrdu.config import config -def generate_quality_report(main_directory: str) -> None: - """Generates a quality report based on the provided layout information. +def generate_quality_report(main_directory: Path) -> None: + """Generate a quality report analyzing layout and text information. - Args: - layout_info (Dict[int, List[Block]]): A dictionary where the keys are page indices - and the values are lists of blocks on each page. + Analyzes layout metadata, text content, and block positioning to generate + a quality report with metrics like missing content rates and block overlaps. 
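+
+    The resulting ``quality_report.json`` is shaped roughly like::
+
+        {"num_pages": 8, "num_columns": 2,
+         "category_quality": [{"category": "Text", "geometry_count": 120,
+                               "reading_count": 125, "missing_rate": 0.04}, ...],
+         "page_quality": [{"page": 0, "num_blocks": 15, "area": 1.2e6,
+                           "overlap": 300.0, "ratio": 2.5e-4}, ...]}
+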
-
-    Returns:
-        None
+    Args:
+        main_directory: Base directory containing the input files
     """
-    result_directory = os.path.join(main_directory, "output/result")
+    result_dir = main_directory / "output" / "result"
+
+    # Load input files
+    layout_metadata = utils.load_json(result_dir / "layout_metadata.json")
+    text_info = utils.load_json(result_dir / "texts.json")
+    layout_info_data = utils.load_json(result_dir / "layout_info.json")
+
+    # Convert layout info to Block objects
+    layout_info = _convert_layout_info(layout_info_data)
+
+    # Generate report
+    result = {
+        "num_pages": max(layout_info.keys()) + 1,  # page indices are 0-based
+        "num_columns": layout_metadata["num_columns"],
+        "category_quality": _analyze_category_quality(layout_info, text_info),
+        "page_quality": _analyze_page_quality(layout_info),
+    }
+
+    # Save report
+    utils.export_to_json(result, result_dir / "quality_report.json")
 
-    layout_metadata_file = os.path.join(result_directory, "layout_metadata.json")
-    layout_metadata = utils.load_json(layout_metadata_file)
 
-    text_info_file = os.path.join(result_directory, "texts.json")
-    text_info = utils.load_json(text_info_file)
+def _convert_layout_info(layout_info_data: Dict) -> Dict[int, List[Block]]:
+    """Convert raw layout info data to Block objects.
 
-    layout_info_file = os.path.join(result_directory, "layout_info.json")
-    layout_info_data = utils.load_json(layout_info_file)
-    # order_annotation_file = os.path.join(result_directory, "order_annotation.json")
-    # order_annotation = utils.load_json(order_annotation_file)
-    # layout_info_data = order_annotation["annotation"]
-    layout_info = {
+    Args:
+        layout_info_data: Raw layout info dictionary from JSON
+
+    Returns:
+        Dictionary mapping page numbers to lists of Block objects
+    """
+    return {
         int(key): [Block.from_dict(item) for item in values]
         for key, values in layout_info_data.items()
     }
 
-    result = {}
-    result["num_pages"] = max(layout_info.keys())
-    result["num_columns"] = layout_metadata["num_columns"]
 
-    result["category_quality"] = []
-    total_reading, total_geometry = 0, 0
-    for key, value in text_info.items():
-        # currently, ignore graphics
-        if key == config.name2category["Figure"]:
+def _analyze_category_quality(
+    layout_info: Dict[int, List[Block]], text_info: Dict[str, List[Any]]
+) -> List[Dict[str, Any]]:
+    """Analyze quality metrics for each content category.
+
+    Compares text content vs geometric blocks to identify missing content.
+    Calculates metrics like counts and missing rates for each category. 
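+
+    For example, a category with 125 text entries but only 120 matched
+    top-level blocks gets ``missing_rate = 1 - 120 / 125 = 0.04``.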
+
+    Args:
+        layout_info: Page index to list of Block objects mapping
+        text_info: Category name to list of text content mapping
+
+    Returns:
+        List of quality metrics per category including totals
+    """
+    quality_metrics = []
+    total_reading = total_geometry = 0
+
+    for category, texts in text_info.items():
+        # Skip figures: text_info is keyed by category *name*, and figures
+        # are extracted geometrically rather than from texts.json
+        if category == "Figure":
             continue
 
-        reading_count = len(value)
-        geometry_count = 0
-        for page_index, blocks in layout_info.items():
-            for block in blocks:
-                # only major block is counted
-                if (
-                    block.category == config.name2category[key]
-                    and block.parent_block is None
-                ):
-                    geometry_count += 1
-        missing_rate = 0 if reading_count == 0 else 1 - geometry_count / reading_count
+        reading_count = len(texts)
+        geometry_count = _count_category_blocks(layout_info, category)
+
+        missing_rate = _calculate_missing_rate(reading_count, geometry_count)
+
+        quality_metrics.append(
             {
-                "category": key,
+                "category": category,
                 "geometry_count": geometry_count,
-                "reading_count": len(value),
+                "reading_count": reading_count,
                 "missing_rate": missing_rate,
             }
         )
         total_reading += reading_count
         total_geometry += geometry_count
-    result["category_quality"].append(
+
+    # Add aggregate metrics
+    quality_metrics.append(
         {
             "category": "Total",
             "geometry_count": total_geometry,
             "reading_count": total_reading,
-            "missing_rate": 1 - total_geometry / total_reading,
+            "missing_rate": _calculate_missing_rate(total_reading, total_geometry),
         }
     )
 
-    result["page_quality"] = compute_overlap(layout_info)
+    return quality_metrics
 
-    report_file = os.path.join(result_directory, "quality_report.json")
-    utils.export_to_json(result, report_file)
 
+def _calculate_missing_rate(reading_count: int, geometry_count: int) -> float:
+    """Calculate missing rate between reading and geometry counts.
 
-def compute_overlap(layout_info: Dict[int, List[Block]]) -> List[Dict]:
-    """Computes the overlap between blocks in a layout.
+    Args:
+        reading_count: Number of text elements found
+        geometry_count: Number of geometric blocks found
+
+    Returns:
+        Missing rate as a float between 0 and 1
+    """
+    return 0.0 if reading_count == 0 else 1 - geometry_count / reading_count
+
+
+def _count_category_blocks(layout_info: Dict[int, List[Block]], category: str) -> int:
+    """Count number of top-level blocks of a given category.
+
+    Only counts blocks that don't have a parent block (top-level blocks).
 
     Args:
-        layout_info (Dict[int, List[Block]]): A dictionary where the keys are page indices
-        and the values are lists of blocks on each page.
+        layout_info: Page index to list of Block objects mapping
+        category: Category to count
 
     Returns:
-        List[Dict]: A list of dictionaries containing the overlap information for each page and
-        the total overlap information.
+        Number of blocks found
+    """
+    count = 0
+    for blocks in layout_info.values():
+        count += sum(
+            1
+            for block in blocks
+            if block.category == config.name2category[category]
+            and block.parent_block in (None, -1)  # -1/None mark top-level blocks
+        )
+    return count
+
+
+def _analyze_page_quality(layout_info: Dict[int, List[Block]]) -> List[Dict[str, Any]]:
+    """Analyze quality metrics for each page.
+
+    Calculates area and overlap metrics for blocks on each page.
+    Includes total metrics across all pages. 
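+
+    The per-page ``ratio`` is ``overlap / area``, e.g. 300 px^2 of pairwise
+    overlap across 1.2e6 px^2 of total block area gives ``ratio = 2.5e-4``.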
+ + Args: + layout_info: Page index to list of Block objects mapping + Returns: + List of quality metrics per page including totals """ - result = [] - total_area, total_overlap, total_blocks = 0, 0, 0 - for page_index in layout_info.keys(): - blocks = layout_info[page_index] + metrics = [] + total_area = total_overlap = total_blocks = 0 + + for page_index, blocks in layout_info.items(): blocks.sort(key=lambda block: block.bbox.x0) - area, overlap = 0, 0 - for i in range(len(blocks)): - area += blocks[i].bbox.area() - for j in range(i + 1, len(blocks)): - if blocks[j].bbox.x0 > blocks[i].bbox.x1: - break - overlap += blocks[i].bbox.overlap(blocks[j].bbox) + area = sum(block.bbox.area() for block in blocks) + overlap = _calculate_page_overlap(blocks) + overlap_ratio = 0 if area == 0 else overlap / area - result.append( + metrics.append( { "page": page_index, "num_blocks": len(blocks), "area": area, "overlap": overlap, - "ratio": 0 if area == 0 else overlap / area, + "ratio": overlap_ratio, } ) + total_area += area total_overlap += overlap total_blocks += len(blocks) - result.append( + # Add aggregate metrics + metrics.append( { "page": "total", "num_blocks": total_blocks, @@ -131,4 +190,25 @@ def compute_overlap(layout_info: Dict[int, List[Block]]) -> List[Dict]: } ) - return result + return metrics + + +def _calculate_page_overlap(blocks: List[Block]) -> float: + """Calculate total overlap area between blocks on a page. + + Blocks must be sorted by x0 coordinate for early termination optimization. + + Args: + blocks: List of blocks sorted by x0 coordinate + + Returns: + Total overlap area between all blocks + """ + overlap = 0 + for i, block in enumerate(blocks[:-1]): + for other in blocks[i + 1 :]: + # Early termination - no more overlaps possible + if other.bbox.x0 > block.bbox.x1: + break + overlap += block.bbox.overlap(other.bbox) + return overlap diff --git a/DocParser/vrdu/renderer.py b/DocParser/vrdu/renderer.py index 19e2b32..bd0fe02 100644 --- a/DocParser/vrdu/renderer.py +++ b/DocParser/vrdu/renderer.py @@ -1,156 +1,144 @@ +"""LaTeX document rendering module for colorizing and processing semantic elements.""" + from collections import defaultdict -import os import shutil -from typing import List, Tuple, Union +from typing import List, Union, Dict import re - +from pathlib import Path +from loguru import logger from DocParser.vrdu import utils -from DocParser.logger import logger from DocParser.vrdu.config import config, envs +from DocParser.vrdu.utils import ( + data_from_tex_file, + tex_file_from_data, + is_text_eq, + find_env, + replace_nth, +) -from DocParser.TexSoup.TexSoup import TexSoup -import DocParser.TexSoup.app.conversion as conversion -log = logger.get_logger(__name__) +class Renderer: + """Handles rendering and colorizing of LaTeX documents. + This class provides functionality to: + - Parse and process LaTeX documents + - Add color definitions and styling + - Render different semantic elements with distinct colors + - Generate individual files for each element type + """ -class Renderer: def __init__(self) -> None: - self.texts = defaultdict(list) - - def render(self, origin_tex: str) -> None: - """Render the colored version of a LaTeX document. + """Initialize renderer with empty text storage.""" + self.texts: Dict[str, List[str]] = defaultdict(list) - This method performs the rendering process for generating the colored version of a LaTeX document. - It includes the following steps: - 1. Create a copy of the original LaTeX file with a new name. - 2. 
Add color definitions and layout definitions to the copied file. - 3. Remove color definitions that may cause conflicts. - 4. Render all environments in the copied file. - 5. Iterate over semantic elements and change their enclosing color, generating corresponding LaTeX files. - 6. Export the rendered texts to a JSON file. + def render(self, origin_tex: Path) -> None: + """Render a colored version of a LaTeX document. Args: - origin_tex (str): The path to the original LaTeX file. - - Returns: - None - - Examples: - >>> renderer = LaTeXRenderer() - >>> renderer.render("original.tex") + origin_tex: Path to original LaTeX file + + The rendering process: + 1. Creates a colored copy of the original file + 2. Adds required color and layout definitions + 3. Removes any conflicting color definitions + 4. Renders all semantic environments + 5. Generates individual files per element + 6. Exports the rendered text elements """ - main_directory = os.path.dirname(origin_tex) + main_directory = origin_tex.parent + color_tex = main_directory / "paper_colored.tex" - # copy the original tex file - color_tex = os.path.join(main_directory, "paper_colored.tex") + # Setup colored document shutil.copyfile(origin_tex, color_tex) + self._setup_document_styling(color_tex) - self.add_color_definition(color_tex) - self.add_layout_definition(color_tex) - - # remove color definitions to prevent conflict - self.remove_predefined_color(color_tex) - + # Process environments self.render_all_env(color_tex) - - # change the enclose color of semantic elements one by one and generate corresponding tex files self.render_one_env(main_directory) - text_file = os.path.join(main_directory, "output/result/texts.json") + # Export results + text_file = main_directory / "output/result/texts.json" utils.export_to_json(self.texts, text_file) - def render_all_env(self, color_tex: str) -> None: - """ - Render all environments, it includes simple environments and float environments. + def _setup_document_styling(self, color_tex: Path) -> None: + """Set up document styling by adding color and layout definitions. Args: - color_tex (str): The color texture. + color_tex: Path to LaTeX file to modify + """ + self.add_color_definition(color_tex) + self.add_layout_definition(color_tex) + self.remove_predefined_color(color_tex) - Returns: - None + def render_all_env(self, color_tex: Path) -> None: + """Render all environments in the document. + + Args: + color_tex: Path to colored LaTeX file """ self.render_simple_envs(color_tex) self.render_float_envs(color_tex) - def render_simple_envs(self, color_tex: str) -> None: - """Renders simple environments in a LaTeX file. - - This method modifies the content of a LaTeX file by rendering various simple environments, - such as sections, lists, equations, and text. - The modifications are done in-place, directly modifying the provided file. + def render_simple_envs(self, color_tex: Path) -> None: + """Render simple environments like sections, lists, equations and text. Args: - color_tex (str): The path to the LaTeX file to modify. - - Returns: - None + color_tex: Path to LaTeX file to modify Raises: - EOFError: If TexSoup failed to parse the input file due to runaway environments. - AssertionError: If TexSoup failed to parse the input file due to Command \\item invalid in math mode. 
- + EOFError: If TexSoup fails to parse due to runaway environments + AssertionError: If TexSoup fails due to invalid math mode commands """ data, start, end = data_from_tex_file(color_tex) - self.render_section(data) - self.render_list(data) - self.render_equation(data) - self.render_text(data) - # self.enclose_reference(data, color=name2color["Reference"]) + # Process each environment type + for renderer in [ + self.render_section, + self.render_list, + self.render_equation, + self.render_text, + ]: + renderer(data) - # Write the modified data back to the TeX file + # Write back to file tex_file_from_data(data, color_tex, start=start, end=end) - def render_float_envs(self, tex_file: str) -> None: - """Renders float environments in a LaTeX file. - - This method applies rendering to various float environments in the LaTeX file - by calling specific rendering methods for each type of environment. + def render_float_envs(self, tex_file: Path) -> None: + """Render floating environments like figures, tables, algorithms etc. Args: - tex_file (str): The path to the LaTeX file to modify. - - Returns: - None + tex_file: Path to LaTeX file to modify + + The environments are rendered in a specific order to handle dependencies: + 1. Algorithms + 2. Tables + 3. Code blocks + 4. Footnotes + 5. Graphics + 6. Captions + 7. Title + 8. Abstract """ + renderers = [ + self.render_algorithm, + self.render_tabular, + self.render_code, + self.render_footnote, + self.extract_graphics, + self.render_caption, + self.render_title, + self.render_abstract, + ] - # Step 1: Render algorithm environments - self.render_algorithm(tex_file) - - # Step 2: Render tabular environments - self.render_tabular(tex_file) - - # Step 3: Render code environments - self.render_code(tex_file) - - # Step 4: Render footnotes - self.render_footnote(tex_file) - - # Step 5: Extract graphics paths - self.extract_graphics(tex_file) - - # Step 6: Render captions - self.render_caption(tex_file) - - # the following two envs are placed here because they also use string regex to render - # Step 7: Render titles - self.render_title(tex_file) - - # Step 8: Render abstracts - self.render_abstract(tex_file) + for renderer in renderers: + renderer(tex_file) def render_section(self, data: List[Union[dict, str]]) -> None: - """Render sections in the given data with a configured color. - This function modifies the data in-place. + """Render section headings with configured color. Args: - data (List[Union[dict, str]]): The data to be enclosed. - color (str, optional): The color of the enclosed section. Defaults to 'red'. - - Returns: - None + data: LaTeX content as structured data """ for item in data: if not isinstance(item, dict): @@ -164,14 +152,10 @@ def render_section(self, data: List[Union[dict, str]]) -> None: item[env] = utils.colorize(item[env], "Title") def render_list(self, data: List[Union[dict, str]]) -> None: - """Render equations in the given data with a configured color. - This function modifies the data in-place. + """Render list environments with configured color. Args: - data (List[Union[dict, str]]): The list of items to be processed. 
- - Returns: - None + data: LaTeX content as structured data """ for item in data: if not isinstance(item, dict): @@ -179,93 +163,80 @@ def render_list(self, data: List[Union[dict, str]]) -> None: env = find_env(item, envs.list_envs) if env is None: + # Process nested lists recursively for value in item.values(): - if not isinstance(value, list): - continue - self.render_list(value[1]) + if isinstance(value, list): + self.render_list(value[1]) continue self.texts["List"].append(item[env]) item[env] = utils.colorize(item[env], "List") def render_equation(self, data: List[Union[dict, str]]) -> None: - """Render equations in the given data with a configured color. + """Render equation environments with configured color. Args: - - data (List[Union[dict, str]]): The data containing equations to enclose. - - Returns: - None + data: LaTeX content as structured data """ for item in data: if not isinstance(item, dict): continue env = find_env(item, envs.math_envs) - if env is None: + # Process nested equations for value in item.values(): - if not isinstance(value, list): - continue - self.render_equation(value[1]) + if isinstance(value, list): + self.render_equation(value[1]) continue self.texts["Equation"].append(item[env]) item[env] = utils.colorize(item[env], "Equation") def render_text(self, data: List[Union[dict, str]]) -> None: - """Render texts and text-eqs in the given data with a configured color. - This function modifies the data in-place. + """Render text content with configured colors. - Args: - data (List[Union[dict, str]]): The list of items to be processed. + Handles both regular text and text containing equations. - Returns: - None + Args: + data: LaTeX content as structured data """ for index, item in enumerate(data): if not isinstance(item, str): - if not isinstance(item, dict): - continue - for key, value in item.items(): - if key.lower() not in envs.text_envs: - continue - if not isinstance(value, list): - continue - self.render_text(value[1]) + if isinstance(item, dict): + for key, value in item.items(): + if key.lower() in envs.text_envs and isinstance(value, list): + self.render_text(value[1]) continue - if not item or item == "\n" or item == "\n\n" or item.isspace(): + if not item or item.isspace(): continue - if is_text_eq(item): - data[index] = utils.colorize(item, "Text-EQ") - self.texts["Text-EQ"].append(item) - else: - data[index] = utils.colorize(item, "Text") - self.texts["Text"].append(item) + # Determine text type and colorize + text_type = "Text-EQ" if is_text_eq(item) else "Text" + colored_text = utils.colorize(item, text_type) + self.texts[text_type].append(item) - # format + # Preserve whitespace if item[0] == "\n": - data[index] = "\n" + data[index] + colored_text = "\n" + colored_text if item[-1] == "\n": - data[index] += "\n" + colored_text += "\n" - def add_color_definition(self, color_tex: str) -> None: - """Adds color definitions to a LaTeX file. + data[index] = colored_text + + def add_color_definition(self, color_tex: Path) -> None: + """Add color package and definitions to LaTeX file. Args: - color_tex (str): The path to the LaTeX file to modify. + color_tex: Path to LaTeX file to modify Raises: - ValueError: If the beginning of the document is not found. 
- - Returns: - None + ValueError: If document begin tag not found """ - with open(color_tex, "r") as f: - content = f.read() + content = color_tex.read_text() + # Build color definitions definitions = ["\\usepackage{xcolor}"] for name, rgb_color in config.name2rgbcolor.items(): color_name = config.name2color[name] @@ -275,617 +246,370 @@ def add_color_definition(self, color_tex: str) -> None: color_definitions = "\n" + "\n".join(definitions) + "\n" - # Find location to insert package + # Insert at document begin preamble = re.search(r"\\begin{document}", content) if not preamble: - raise ValueError("begin of document not found") - preamble_loc = preamble.start() + raise ValueError("Document begin tag not found") - # Insert package line - content = content[:preamble_loc] + color_definitions + content[preamble_loc:] + content = ( + content[: preamble.start()] + + color_definitions + + content[preamble.start() :] + ) - # Write updated content - with open(color_tex, "w") as f: - f.write(content) + color_tex.write_text(content) - def add_layout_definition(self, color_tex: str) -> None: - """Adds layout definitions to a LaTeX file. + def add_layout_definition(self, color_tex: Path) -> None: + """Add layout definitions to LaTeX file. Args: - color_tex (str): The path to the LaTeX file to modify. + color_tex: Path to LaTeX file to modify Raises: - ValueError: If the end of the document is not found. - - Returns: - None + ValueError: If document end tag not found Reference: https://www.overleaf.com/learn/latex/Page_size_and_margins """ - with open(color_tex, "r") as f: - content = f.read() - - keys = config.layout_keys + content = color_tex.read_text() + # Build layout definitions definitions = ["\\message{[vrdu_data_process: Info]}"] - for key in keys: + for key in config.layout_keys: definition = f"\\message{{[vrdu_data_process: The {key} is: \\the\\{key}]}}" definitions.append(definition) layout_definitions = "\n" + "\n".join(definitions) + "\n" - package_re = r"\\end{document}" - match = re.search(package_re, content) - if not match: - raise ValueError("end of document not found") - - package_loc = match.start() + # Insert before document end + doc_end = re.search(r"\\end{document}", content) + if not doc_end: + raise ValueError("Document end tag not found") - # Insert package line - content = content[:package_loc] + layout_definitions + content[package_loc:] + content = ( + content[: doc_end.start()] + layout_definitions + content[doc_end.start() :] + ) - # Write updated content - with open(color_tex, "w") as f: - f.write(content) + color_tex.write_text(content) - def remove_predefined_color(self, color_tex: str) -> None: - """Removes hyperref and lstlisting color settings from a LaTeX file. + def remove_predefined_color(self, color_tex: Path) -> None: + """Remove hyperref and lstlisting color settings. Args: - color_tex (str): The path to the LaTeX file to modify. + color_tex: Path to LaTeX file to modify Raises: - ValueError: If the beginning of the document is not found. 
- - Returns: - None + ValueError: If document begin tag not found Reference: https://www.overleaf.com/learn/latex/Hyperlinks """ - # Read the content of the input file - with open(color_tex, "r") as file: - content = file.read() - - # Define the pattern to match the color definitions - pattern = r"\\usepackage{hyperref}|\\usepackage(\[)?\[.*?\]?(\])?{hyperref}" + content = color_tex.read_text() + # Find document begin preamble = re.search(r"\\begin{document}", content) if not preamble: - raise ValueError("begin of document not found") - preamble_loc = preamble.start() + raise ValueError("Document begin tag not found") - # forbidden the color used by hyperref - hyper_setup = "\\hypersetup{colorlinks=false}\n" - if re.search(pattern, content[:preamble_loc]): - content = content[:preamble_loc] + hyper_setup + content[preamble_loc:] + # Disable hyperref colors if present + hyperref_pattern = ( + r"\\usepackage{hyperref}|\\usepackage(\[)?\[.*?\]?(\])?{hyperref}" + ) + if re.search(hyperref_pattern, content[: preamble.start()]): + content = ( + content[: preamble.start()] + + "\\hypersetup{colorlinks=false}\n" + + content[preamble.start() :] + ) - # delete the lstlisting color definitions - pattern = r"\\lstset\{.*?\}" - content = re.sub(pattern, "", content) + # Remove lstlisting colors + content = re.sub(r"\\lstset\{.*?\}", "", content) - # Write the modified content back to the input file - with open(color_tex, "w") as file: - file.write(content) + color_tex.write_text(content) - def modify_color_definitions(self, input_file: str, output_file: str) -> None: - """Modify the pre-defined color definitions in the input file and write the modified content to the output file. + def modify_color_definitions(self, input_file: Path, output_file: Path) -> None: + """Modify color definitions to white in output file. Args: - input_file (str): The path to the input file. - output_file (str): The path to the output file. - - Returns: - None + input_file: Source LaTeX file path + output_file: Destination LaTeX file path """ - with open(input_file, "r") as file: - content = file.read() + content = input_file.read_text() - # Define the pattern to match the color definitions - for name in config.name2rgbcolor.keys(): + # Replace each color with white + for name in config.name2rgbcolor: color_name = config.name2color[name] - pattern = r"\\definecolor{" + color_name + r"}{RGB}{(\d+), (\d+), (\d+)}" - - # Replace the color definitions with pure white + pattern = rf"\\definecolor{{{color_name}}}{{RGB}}{{(\d+), (\d+), (\d+)}}" content = re.sub( pattern, - r"\\definecolor{" + color_name + r"}{RGB}{255, 255, 255}", + rf"\\definecolor{{{color_name}}}{{RGB}}{{255, 255, 255}}", content, ) - with open(output_file, "w") as file: - file.write(content) + output_file.write_text(content) - def get_env_orders(self, tex_file: str) -> List[str]: - """Returns a list of environment orders based on the contents of the given `tex_file`. + def get_env_orders(self, tex_file: Path) -> List[str]: + """Get ordered list of environments from file. Args: - tex_file (str): The path to the .tex file. + tex_file: Path to LaTeX file Returns: - List[str]: A list of environment orders. 
+ List of environment names in order of appearance """ - with open(tex_file) as f: - contents = f.read() - colors = list(config.name2color.values()) - matches = [] + contents = tex_file.read_text() + colors = list(config.name2color.values()) pattern = "|".join(rf"\b{re.escape(term)}\b" for term in colors) - for m in re.finditer(pattern, contents): - matches.append(m.group(0)) + matches = [m.group(0) for m in re.finditer(pattern, contents)] - # the definitions are discarded + # Skip color definitions at start return matches[len(colors) :] - def render_one_env(self, main_directory: str) -> None: - """Render one environment by modifying the corresponding rendering color to black. + def render_one_env(self, main_directory: Path) -> None: + """Render individual files with one environment highlighted. Args: - main_directory (str): The main directory. - - Returns: - None: This function does not return anything. + main_directory: Working directory path """ - color_tex_file = os.path.join(main_directory, "paper_colored.tex") - white_tex_file = os.path.join(main_directory, "paper_white.tex") - self.modify_color_definitions(color_tex_file, white_tex_file) - ordered_env_colors = self.get_env_orders(white_tex_file) - suffix = "_color" - index_map = defaultdict(int) + color_tex = main_directory / "paper_colored.tex" + white_tex = main_directory / "paper_white.tex" + + self.modify_color_definitions(color_tex, white_tex) + ordered_envs = self.get_env_orders(white_tex) + + content = white_tex.read_text() - with open(white_tex_file, "r") as f: - content = f.read() + index_map = defaultdict(int) + suffix = "_color" - for index, env_color in enumerate(ordered_env_colors): + for i, env_color in enumerate(ordered_envs): env = env_color[: -len(suffix)] - # the first one is the color definition, skip it + env_count = index_map[env] + + # Replace nth occurrence with black new_content = replace_nth( - content, "{" + env_color + "}", r"{black}", index_map[env] + 2 + content, "{" + env_color + "}", "{black}", env_count + 2 ) - output_file = os.path.join( - main_directory, - f"paper_{config.folder_prefix}_{str(index).zfill(5)}_{env}_{str(index_map[env]).zfill(5)}.tex", + # Generate output filename + output_file = ( + main_directory + / f"paper_{config.folder_prefix}_{str(i).zfill(5)}_{env}_{str(env_count).zfill(5)}.tex" ) - index_map[env] += 1 - with open(output_file, "w") as f: - f.write(new_content) - def render_caption(self, tex_file: str) -> None: - """Renders captions in a LaTeX file. + output_file.write_text(new_content) + + index_map[env] += 1 - This method modifies the content of a LaTeX file by rendering captions with a specified color. - It searches for caption commands in the file and applies colorization to their contents. + def render_caption(self, tex_file: Path) -> None: + """Render captions with color. Args: - tex_file (str): The path to the LaTeX file to modify. - - Returns: - None + tex_file: Path to LaTeX file """ - with open(tex_file) as f: - content = f.read() + content = tex_file.read_text() pattern = r"\\caption(?:\[[^\]]*\])?(?:\{[^}]*\})" result = self._render_simple_envs(content, pattern, "Caption") - with open(tex_file, "w") as f: - f.write(result) + tex_file.write_text(result) - def render_title(self, tex_file: str) -> None: - """Renders the title in a LaTeX file. - - This method modifies the content of a LaTeX file by rendering the title with a specified color. - It searches for the title command in the file and applies colorization to its content. 
+ def render_title(self, tex_file: Path) -> None: + """Render document title with color. Args: - tex_file (str): The path to the LaTeX file to modify. - - Returns: - None + tex_file: Path to LaTeX file """ - with open(tex_file) as f: - content = f.read() + content = tex_file.read_text() pattern = r"\\title(?:\{[^}]*\})" result = self._render_simple_envs(content, pattern, "PaperTitle") - with open(tex_file, "w") as f: - f.write(result) - - def render_footnote(self, tex_file: str) -> None: - """Renders footnotes in a LaTeX file. + tex_file.write_text(result) - This method modifies the content of a LaTeX file by rendering footnotes with a specified color. - It searches for various footnote environments and applies colorization to their contents. + def render_footnote(self, tex_file: Path) -> None: + """Render footnotes with color. Args: - tex_file (str): The path to the LaTeX file to modify. - - Returns: - None + tex_file: Path to LaTeX file """ - # \footnote{...}, \footnote[]{...}, \footnotetext{...}, \footnotetext[]{...}, \tablefootnote{} - with open(tex_file) as f: - content = f.read() + content = tex_file.read_text() for env_name in envs.footnote_envs: pattern = r"\\" + env_name + r"(?:\[[^\]]*\])?(?:\{[^}]*\})" - content = self._render_simple_envs(content, pattern, "Footnote") - with open(tex_file, "w") as f: - f.write(content) + tex_file.write_text(content) def _render_simple_envs(self, content: str, pattern: str, category: str) -> str: - """Renders specific environments in the content using replacement. - - This method searches for occurrences of a pattern in the content and replaces them with colored versions. - The replacement is based on the specified category for colorization. + """Render simple environments with color. Args: - content (str): The content of the LaTeX file. - pattern (str): The regular expression pattern to match. - category (str): The category of the environment for colorization. + content: LaTeX content + pattern: Regex pattern to match + category: Environment category name Returns: - str: The modified content with the rendered environments. + Modified content with colored environments """ matches = re.finditer(pattern, content) result = "" - index = 0 + last_end = 0 + for match in matches: start = match.start() end = match.end() - # the regex is greedy, iterate to find the end of footnote env - num_left_brackets = content[start:end].count("{") - num_right_brackets = content[start:end].count("}") - while num_right_brackets < num_left_brackets: + # Handle nested brackets + num_left = content[start:end].count("{") + num_right = content[start:end].count("}") + + while num_right < num_left: if content[end] == "{": - num_left_brackets += 1 + num_left += 1 elif content[end] == "}": - num_right_brackets += 1 + num_right += 1 end += 1 - category_content = content[start:end] + env_content = content[start:end] + self.texts[category].append(env_content) - self.texts[category].append(category_content) - colored_title = utils.colorize(category_content, category) - result += content[index:start] - result += colored_title - index = end + result += content[last_end:start] + result += utils.colorize(env_content, category) + last_end = end - result += content[index:] + result += content[last_end:] return result - def render_abstract(self, tex_file: str) -> None: - """Renders the abstract section in a LaTeX file. - - This method modifies the content of a LaTeX file by rendering the abstract section with a specified color. 
- It searches for the abstract section in the file and applies colorization to its contents. + def render_abstract(self, tex_file: Path) -> None: + """Render abstract with color. Args: - tex_file (str): The path to the LaTeX file to modify. - - Returns: - None + tex_file: Path to LaTeX file Raises: - ValueError: If more than one abstract section is found. + ValueError: If multiple abstracts found """ - with open(tex_file) as f: - content = f.read() + content = tex_file.read_text() pattern = r"\\begin{abstract}.*?\\end{abstract}" - indexes = [ - (m.start(), m.end()) for m in re.finditer(pattern, content, re.DOTALL) - ] + matches = list(re.finditer(pattern, content, re.DOTALL)) - if len(indexes) > 1: - raise ValueError("more than one abstract found") + if len(matches) > 1: + raise ValueError("Multiple abstracts found") - if not indexes: + if not matches: return - start, end = indexes[0] - abstract = content[start:end] + match = matches[0] + abstract = content[match.start() : match.end()] self.texts["Abstract"].append(abstract) - colored_abstract = utils.colorize(abstract, "Abstract") - result = content[:start] + colored_abstract + content[end:] - with open(tex_file, "w") as f: - f.write(result) + result = ( + content[: match.start()] + + utils.colorize(abstract, "Abstract") + + content[match.end() :] + ) - def render_tabular(self, tex_file: str) -> None: - """Renders tabular environments in a LaTeX file. + tex_file.write_text(result) - This method modifies the content of a LaTeX file by rendering tabular environments with a specified color. - It searches for tabular environments in the file and applies colorization to their contents. + def render_tabular(self, tex_file: Path) -> None: + """Render tables with color. Args: - tex_file (str): The path to the LaTeX file to modify. - - Returns: - None + tex_file: Path to LaTeX file """ - with open(tex_file) as f: - content = f.read() + content = tex_file.read_text() + pattern = r"\\begin{(tabular[*xy]?)}.*?\\end{\1}" result = self._render_float_envs(content, pattern, "Table") - with open(tex_file, "w") as f: - f.write(result) + tex_file.write_text(result) - def render_algorithm(self, tex_file: str) -> None: - """Renders algorithm environments in a LaTeX file. - - This method modifies the content of a LaTeX file by rendering algorithm environments with a specified color. - It searches for algorithm environments in the file and applies colorization to their contents. + def render_algorithm(self, tex_file: Path) -> None: + """Render algorithms with color. Args: - tex_file (str): The path to the LaTeX file to modify. - - Returns: - None + tex_file: Path to LaTeX file """ - with open(tex_file) as f: - content = f.read() + content = tex_file.read_text() pattern = r"\\begin{algorithm[*]?}(.*?)\\end{algorithm[*]?}" result = self._render_float_envs(content, pattern, "Algorithm") - with open(tex_file, "w") as f: - f.write(result) + tex_file.write_text(result) - def render_code(self, tex_file: str) -> None: - """Renders code environments in a LaTeX file. + def render_code(self, tex_file: Path) -> None: + """Render code blocks with color. - This method modifies the content of a LaTeX file by rendering code environments with a specified color. - It searches for code environments and `\\lstinputlisting` commands in the file and applies colorization to their contents. + Handles both code environments and lstinputlisting. Args: - tex_file (str): The path to the LaTeX file to modify. 
-
-        Returns:
-            None
-
-        Notes:
-            There are two types of code environments:
-            - pattern 1: code environment
-            - pattern 2: lstinputlisting to input a file
+            tex_file: Path to LaTeX file
 
         Reference:
             https://en.wikibooks.org/wiki/LaTeX/Source_Code_Listings
         """
-        with open(tex_file, "r") as file:
-            content = file.read()
+        content = tex_file.read_text()
+
+        patterns = [
+            r"\\begin{(verbatim|lstlisting|program)[*]?}(.*?)\\end{\1[*]?}",
+            r"\\lstinputlisting\[[^\]]*\]{[^\}]*}",
+        ]
+        pattern = "|".join(patterns)
 
-        pattern = (
-            r"\\begin{(verbatim|lstlisting|program)[*]?}(.*?)\\end{\1[*]?}"
-            + "|"
-            + r"\\lstinputlisting\[[^\]]*\]{[^\}]*}"
-        )
         result = self._render_float_envs(content, pattern, "Code")
-        with open(tex_file, "w") as f:
-            f.write(result)
+        tex_file.write_text(result)
 
     def _render_float_envs(self, content: str, pattern: str, category: str) -> str:
-        """Renders specific float environments in the content.
-
-        This method searches for occurrences of a pattern in the content and replaces them with colored versions.
-        The replacement is based on the specified category for colorization.
+        """Render floating environments with color.
 
         Args:
-            content (str): The content of the LaTeX file.
-            pattern (str): The regular expression pattern to match.
-            category (str): The category of the environment for colorization.
+            content: LaTeX content
+            pattern: Regex pattern to match
+            category: Environment category name
 
         Returns:
-            str: The modified content with the rendered float environments.
+            Modified content with colored environments
         """
-        indexes = [
-            (m.start(), m.end()) for m in re.finditer(pattern, content, re.DOTALL)
-        ]
+        matches = list(re.finditer(pattern, content, re.DOTALL))
 
-        if not indexes:
-            log.debug(f"no {category} found")
+        if not matches:
+            logger.debug(f"No {category} environments found")
             return content
 
-        result = content[: indexes[0][0]]
-        for i, _ in enumerate(indexes):
+        result = content[: matches[0].start()]
+
+        for i, match in enumerate(matches):
             if i > 0:
-                result += content[indexes[i - 1][1] : indexes[i][0]]
-            float_env = content[indexes[i][0] : indexes[i][1]]
+                result += content[matches[i - 1].end() : match.start()]
 
-            # filter table of figures
-            if category == "Table" and float_env.find("\\includegraphics") != -1:
-                continue
+            env_content = content[match.start() : match.end()]
 
-            # TODO: filter table in equation envs
+            # A tabular that embeds \includegraphics is treated as a figure,
+            # not a table; keep its text unchanged (rather than dropping it)
+            # so the rest of the document does not shift
+            if category == "Table" and "\\includegraphics" in env_content:
+                result += env_content
+                continue
 
-            self.texts[category].append(float_env)
-            colored_float_env = utils.colorize(float_env, category)
-            result += colored_float_env
+            self.texts[category].append(env_content)
+            result += utils.colorize(env_content, category)
 
-        result += content[indexes[-1][1] :]
+        result += content[matches[-1].end() :]
         return result
 
-    def extract_graphics(self, tex_file: str) -> None:
-        """Extracts graphics paths from a LaTeX file.
-
-        This method reads a LaTeX file and extracts the paths of graphics included using the `\\includegraphics` command.
-        The extracted graphics paths are stored in the `texts["Figure"]` list.
+    def extract_graphics(self, tex_file: Path) -> None:
+        """Extract graphics commands.
 
         Args:
-            tex_file (str): The path to the LaTeX file to extract graphics from.
- - Returns: - None + tex_file: Path to LaTeX file """ - with open(tex_file, "r") as file: - content = file.read() + content = tex_file.read_text() pattern = r"\\includegraphics(?:\[(.*?)\])?{(.*?)}" - matches = re.findall(pattern, content) - for match in matches: + for options, path in re.findall(pattern, content): graphic = "\\includegraphics" - if match[0]: - graphic += f"[{match[0]}]" - graphic += f"{{{match[1]}}}" + if options: + graphic += f"[{options}]" + graphic += f"{{{path}}}" self.texts["Figure"].append(graphic) - - -def extract_main_content(tex_file: str) -> Tuple[str, int, int]: - """Extracts the main content from a LaTeX file. - - Args: - tex_file (str): The path to the LaTeX file. - - Returns: - Tuple[str, int, int]: A tuple containing the main content of the LaTeX file, - the start position of the main content in the file, and the end position - of the main content in the file. - """ - with open(tex_file) as f: - content = f.read() - - start = content.find("\\begin{document}") - end = content.find("\\end{document}") - - if start == -1 or end == -1: - raise ValueError("Document tags not found") - - start += len("\\begin{document}") - main_content = content[start:end] - - return main_content, start, end - - -def data_from_tex_file(tex_file: str) -> Tuple[List[Union[dict, str]], int, int]: - """Extracts data from a Tex file using TexSoup. - - Args: - tex_file (str): The path to the Tex file. - - Returns: - Tuple[List, int, int]: A tuple containing the extracted data, the start - position of the extracted content, and the end position of the extracted - content. - """ - main_content, start, end = extract_main_content(tex_file) - tex_tree = TexSoup(main_content).expr.all - data = conversion.to_list(tex_tree) - - return data, start, end - - -def tex_file_from_data( - data: List[Union[dict, str]], - tex_file: str, - start: int = 0, - end: int = -1, -) -> None: - """Generate a TeX file from the given TexSoup data. - - Args: - data (List[Union[dict, str]]): The data to be converted into LaTeX. - tex_file (str): The path of the TeX file to be generated. - start (int, optional): The starting position in the TeX file to replace content. Defaults to 0. - end (int, optional): The ending position in the TeX file to replace content. Defaults to -1. - - Returns: - None: This function does not return any value. - """ - with open(tex_file, "r") as f: - content = f.read() - - # convert the data into latex - rendered_tex = conversion.to_latex(data) - - content = content[:start] + rendered_tex + content[end:] - - with open(tex_file, "w") as f: - f.write(content) - - -def replace_nth(string: str, old: str, new: str, n: int) -> str: - """ - Replace the n-th occurrence of a substring in a given string with a new substring. - - Args: - string (str): The original string to search and perform the replacement on. - old (str): The substring to be replaced. - new (str): The substring to replace the n-th occurrence of `old` in `string`. - n (int): The occurrence number of `old` to be replaced (1-based index). - - Returns: - str: The modified string with the n-th occurrence of `old` replaced by `new`. If the - occurrence is not found, the original string is returned. - - Example: - >>> replace_nth("Hello, hello, hello!", 'hello', 'hi', 2) - 'Hello, hello, hi!' 
- """ - index_of_occurrence = string.find(old) - occurrence = int(index_of_occurrence != -1) - - while index_of_occurrence != -1 and occurrence != n: - index_of_occurrence = string.find(old, index_of_occurrence + 1) - occurrence += 1 - - if occurrence == n: - return ( - string[:index_of_occurrence] - + new - + string[index_of_occurrence + len(old) :] - ) - - return string - - -def find_env(wrapped_env: dict, query: List[str]) -> Union[str, None]: - """ - Finds and returns the environment variable from the given query list - that exists in the wrapped_env dictionary. - - Args: - wrapped_env (dict): A dictionary containing environment variables as keys. - query (list): A list of environment variables to search for. - - Returns: - Union[str, None]: The environment variable found in the query list that exists in the wrapped_env dictionary, or None - if no matching environment variable is found. - """ - for env in query: - if env in wrapped_env: - return env - - return None - - -def is_text_eq(text: str) -> bool: - """Check if the given text contains any mathematical expressions. - - Args: - text (str): The text to be checked for mathematical expressions. - - Returns: - bool: True if the text contains mathematical expressions, False otherwise. - - Note: - This function uses a regular expression pattern to match mathematical expressions - - Reference: - https://www.overleaf.com/learn/latex/Mathematical_expressions - """ - pattern = r"(\\\(.*?\\\))|(\$.*?\$)|(\\begin\{math\}.*?\\end\{math\})" - matches = re.findall(pattern, text) - - for match in matches: - if not re.search(r"\\\$", match[0]): - return True - - return False diff --git a/DocParser/vrdu/utils.py b/DocParser/vrdu/utils.py old mode 100755 new mode 100644 index be6fa51..43eeb3f --- a/DocParser/vrdu/utils.py +++ b/DocParser/vrdu/utils.py @@ -1,170 +1,155 @@ -import os -import subprocess -import json -from typing import Any, Dict, List, Union +"""Utility functions for LaTeX document processing and file operations.""" +import re +import json +import subprocess +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Union -from pdf2image import pdf2image -from pdf2image import generators - +from pdf2image import pdf2image, generators +from DocParser.TexSoup.TexSoup import TexSoup +import DocParser.TexSoup.app.conversion as conversion from DocParser.vrdu.block import Block from DocParser.vrdu.config import config -def export_to_json(data: Union[Dict, List], file_path: str) -> None: - """ - Write the contents of a dictionary or a list to a JSON file. +def export_to_json(data: Union[Dict, List], file_path: Union[str, Path]) -> None: + """Write data to a JSON file with indentation. - Parameters: - data (Union[Dict, List]): The dictionary to be written to the file. - file_path (str): The path to the JSON file. + Args: + data: Dictionary or list to write + file_path: Output JSON file path """ - with open(file_path, "w") as json_file: - json.dump(data, json_file, indent=4) + with open(file_path, "w") as f: + json.dump(data, f, indent=4) -def load_json(file_path: str) -> Union[Dict, List]: - """ - Load a JSON file into a dictionary or a list. +def load_json(file_path: Union[str, Path]) -> Union[Dict, List]: + """Load data from a JSON file. - Parameters: - file_path (str): The path to the JSON file. + Args: + file_path: Input JSON file path Returns: - Union[Dict, List]: The loaded JSON data as a dictionary or a list. 
+ Loaded dictionary or list """ - with open(file_path, "r") as json_file: - data = json.load(json_file) - return data - + with open(file_path) as f: + return json.load(f) -def compile_latex(file: str) -> None: - """ - Compile a LaTeX file using pdflatex engine. - Parameters: - file (str): The path to the LaTeX file to be compiled. +def compile_latex(file: Union[str, Path], colored: bool = False) -> None: + """Compile a LaTeX file using pdflatex. - Returns: - None + Args: + file: Path to LaTeX file + colored: Whether this is the colored version requiring synctex """ - file_name = os.path.basename(file) + file_name = Path(file).name + base_cmd = ["pdflatex", "-interaction=nonstopmode"] - subprocess.run( - ["pdflatex", "-interaction=nonstopmode", file_name], - timeout=1000, - stdout=subprocess.DEVNULL, - ) + # Run twice for references + for _ in range(2): + subprocess.run(base_cmd + [file_name], timeout=1000, stdout=subprocess.DEVNULL) - subprocess.run( - ["pdflatex", "-interaction=nonstopmode", file_name], - timeout=1000, - stdout=subprocess.DEVNULL, - ) - - if file_name == "paper_colored.tex": + # Additional run with synctex for colored version + if colored: subprocess.run( - ["pdflatex", "-interaction=nonstopmode", "-synctex=1", file_name], + base_cmd + ["-synctex=1", file_name], timeout=1000, stdout=subprocess.DEVNULL, ) -def pdf2jpg(pdf_path: str, output_directory: str) -> None: - """ - Convert a PDF file into a series of jpg images. +def pdf2jpg(pdf_path: Union[str, Path], output_directory: Union[str, Path]) -> None: + """Convert PDF pages to JPG images. - Parameters: - pdf_path (str): The path of the PDF file to be converted. - output_directory (str): The directory where the converted images will be saved. - Returns: - None + Args: + pdf_path: Input PDF file path + output_directory: Output directory for JPG files - Reference: - https://pypi.org/project/pdf2image/ + Output files are named: thread-000x-yz.jpg + where x is thread index and yz is page number """ - os.makedirs(output_directory, exist_ok=True) - # the output images has name of format: thread-000x-yz.png - # where x is the thread index, yz is the index of pdf page start from 1 + output_dir = Path(output_directory) + output_dir.mkdir(parents=True, exist_ok=True) + pdf2image.convert_from_path( pdf_path, fmt="jpg", - output_folder=output_directory, + output_folder=str(output_dir), output_file=generators.counter_generator(prefix="thread-", suffix="-page"), ) -def convert_pdf_figure_to_png_image(pdf_image: str, png_image: str, dpi: int = 72): - """ - Convert a PDF to a PNG image. - - Parameters: - pdf_image (str): The filepath of the PDF image to convert. - png_image (str): The filepath where the PNG image will be saved. - dpi (int): The resolution for the conversion (default is 72). +def convert_pdf_figure_to_png_image( + pdf_image: Union[str, Path], png_image: Union[str, Path], dpi: int = 72 +) -> None: + """Convert PDF figure to PNG image. 
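+
+    The PDF is first cropped in place with pdfcrop, then rasterized at the
+    requested dpi; only the first page of the cropped PDF is saved.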
- Returns: - None + Args: + pdf_image: Input PDF file path + png_image: Output PNG file path + dpi: Resolution for conversion """ - # crop the pdf image + # Crop PDF subprocess.run( - ["pdfcrop", pdf_image, pdf_image], - stdout=subprocess.DEVNULL, + ["pdfcrop", str(pdf_image), str(pdf_image)], stdout=subprocess.DEVNULL ) - # convert the pdf image into png + + # Convert to PNG images = pdf2image.convert_from_path(pdf_image, dpi=dpi) images[0].save(png_image) -def convert_eps_image_to_pdf_image(eps_image_path: str, pdf_image_path: str): - """ - A function that converts an EPS image to a PDF image. +def convert_eps_image_to_pdf_image( + eps_image_path: Union[str, Path], pdf_image_path: Union[str, Path] +) -> None: + """Convert EPS image to PDF. Args: - eps_image_path (str): The file path of the EPS image to convert. - pdf_image_path (str): The file path where the PDF image will be saved. + eps_image_path: Input EPS file path + pdf_image_path: Output PDF file path """ - subprocess.run(["epspdf", eps_image_path, pdf_image_path]) + subprocess.run(["epspdf", str(eps_image_path), str(pdf_image_path)]) def export_to_coco( layout_info: Dict[int, List[Block]], image_infos: Dict[int, Dict[str, Any]], - file_path: str, + file_path: Union[str, Path], ) -> None: - """ - Export the given layout information and image information to a COCO format JSON file. + """Export layout and image info to COCO format JSON. Args: - layout_info (Dict[int, List[Block]]): - A dictionary mapping page indices to lists of Block objects. - image_infos (Dict[int, Dict[str, Any]]): - A dictionary mapping page indices to dictionaries containing image information. - file_path (str): The name of the output JSON file. - - Returns: - None + layout_info: Page index to list of Block objects mapping + image_infos: Page index to image info mapping + file_path: Output JSON file path - Reference: - https://cocodataset.org/#format-data + See: https://cocodataset.org/#format-data """ - category_info = [ - { - "id": index, - "name": category, - "supercategory": supercategory, - } - for index, category, supercategory in config.config["category_name"] - ] result = { "info": config.config["coco_info"], "licenses": config.config["coco_licenses"], - "images": [], - "annotations": [], - "categories": category_info, + "images": _build_coco_images(layout_info, image_infos), + "annotations": _build_coco_annotations(layout_info), + "categories": _build_coco_categories(), } + export_to_json(result, file_path) + + +def _build_coco_categories() -> List[Dict[str, Any]]: + """Build COCO format category information.""" + return [ + {"id": index, "name": category, "supercategory": supercategory} + for index, category, supercategory in config.config["category_name"] + ] - result["images"] = [ + +def _build_coco_images( + layout_info: Dict[int, List[Block]], image_infos: Dict[int, Dict[str, Any]] +) -> List[Dict[str, Any]]: + """Build COCO format image information.""" + return [ { "id": page_index, "width": image_infos[page_index]["width"], @@ -172,83 +157,209 @@ def export_to_coco( "file_name": image_infos[page_index]["file_name"], **config.config["coco_image_info"], } - for page_index in layout_info.keys() + for page_index in layout_info ] + +def _build_coco_annotations( + layout_info: Dict[int, List[Block]] +) -> List[Dict[str, Any]]: + """Build COCO format annotation information.""" + annotations = [] for page_index, page_elements in layout_info.items(): for index, element in enumerate(page_elements): width, height = element.width, element.height - 
annotation = { - "id": index, - "image_id": page_index, - "category_id": element.category, - "segmentation": [], - "bbox": [element.bbox[0], element.bbox[1], width, height], - "area": width * height, - "iscrowd": 0, - } - result["annotations"].append(annotation) - - export_to_json(result, file_path) + annotations.append( + { + "id": index, + "image_id": page_index, + "category_id": element.category, + "segmentation": [], + "bbox": [element.bbox[0], element.bbox[1], width, height], + "area": width * height, + "iscrowd": 0, + } + ) + return annotations def colorize(text: str, category_name: str) -> str: - """ - Given a piece of text and a category name, colorizes the text based on the category. + """Colorize text based on category. Args: - text (str): The text to be colorized. - category_name (str): The category name to determine the colorization. + text: Text to colorize + category_name: Category determining color Returns: - str: The colorized text based on the category. + Colorized LaTeX text + + Raises: + NotImplementedError: For unknown categories """ color = config.name2color[category_name] - if category_name == "Caption": - index = text.find("{") - return text[: index + 1] + "{\\color{" + color + "}" + text[index + 1 :] + "}" - if category_name == "Footnote": + + # Simple wrapping + if category_name in {"Table", "Title", "List", "Code"}: + return f"{{\\color{{{color}}}{text}}}" + + # Text coloring + if category_name in {"Text", "Text-EQ"}: + return f"{{\\textcolor{{{color}}}{{{text}}}}}" + + # Complex cases + if category_name in {"Caption", "Footnote"}: index = text.find("{") - return text[: index + 1] + "{\\color{" + color + "}" + text[index + 1 :] + "}" - if category_name == "Table": - return "{\\color{" + color + "}" + text + "}" + return f"{text[:index + 1]}{{\\color{{{color}}}{text[index + 1:]}}}" + if category_name == "Algorithm": - # skip the position arguments, like \\begin{algorithm}[hbt!] prefix = text.find("\\", len("\\begin{algorithm}")) suffix = text.find("\\end{algorithm}") return ( - text[:prefix] - + "{\\color{" - + color - + "}" - + text[prefix:suffix] - + "}" - + text[suffix:] + f"{text[:prefix]}{{\\color{{{color}}}{text[prefix:suffix]}}}{text[suffix:]}" ) - if category_name == "Title": - return "{\\color{" + color + "}" + text + "}" - if category_name == "List": - return "{\\color{" + color + "}" + text + "}" - if category_name == "Text": - return "{\\textcolor{" + color + "}{" + text + "}}" - if category_name == "Text-EQ": - return "{\\textcolor{" + color + "}{" + text + "}}" + if category_name == "PaperTitle": index = text.find("{") - return ( - text[: index + 1] - + "{\\textcolor{" - + color - + "}{" - + text[index + 1 :] - + "}}" - ) + return f"{text[:index + 1]}{{\\textcolor{{{color}}}{{{text[index + 1:]}}}}}" + if category_name == "Equation": - return "{\\color{" + color + "}{" + text + "}}" + return f"{{\\color{{{color}}}{{{text}}}}}" + if category_name == "Abstract": prefix = len("\\begin{abstract}") - return "{" + text[:prefix] + "\\color{" + color + "}" + text[prefix:] + "}" - if category_name == "Code": - return "{\\color{" + color + "}" + text + "}" + return f"{{{text[:prefix]}\\color{{{color}}}{text[prefix:]}}}" raise NotImplementedError(f"Invalid category name: {category_name}") + + +def extract_main_content(tex_file: str) -> Tuple[str, int, int]: + """Extract the main content from a LaTeX file. 
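+
+    The main content is everything between \\begin{document} and
+    \\end{document}; the returned offsets let callers splice modified
+    content back into the original file (see tex_file_from_data).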
+ + Args: + tex_file: Path to the LaTeX file + + Returns: + Tuple containing: + - Main content between document tags + - Start position of main content in file + - End position of main content in file + + Raises: + ValueError: If document tags not found + """ + with open(tex_file) as f: + content = f.read() + + start = content.find("\\begin{document}") + end = content.find("\\end{document}") + + if start == -1 or end == -1: + raise ValueError("Document tags not found") + + start += len("\\begin{document}") + main_content = content[start:end] + + return main_content, start, end + + +def data_from_tex_file(tex_file: str) -> Tuple[List[Union[dict, str]], int, int]: + """Extract data from a TeX file using TexSoup. + + Args: + tex_file: Path to the TeX file + + Returns: + Tuple containing: + - Extracted data as list + - Start position of main content in file + - End position of main content in file + """ + main_content, start, end = extract_main_content(tex_file) + tex_tree = TexSoup(main_content).expr.all + data = conversion.to_list(tex_tree) + + return data, start, end + + +def tex_file_from_data( + data: List[Union[dict, str]], + tex_file: Union[str, Path], + start: int = 0, + end: int = -1, +) -> None: + """Generate a TeX file from TexSoup data. + + Args: + data: Data to convert to LaTeX + tex_file: Output TeX file path + start: Start position for content replacement + end: End position for content replacement + """ + with open(tex_file, "r") as f: + content = f.read() + + rendered_tex = conversion.to_latex(data) + content = content[:start] + rendered_tex + content[end:] + + with open(tex_file, "w") as f: + f.write(content) + + +def replace_nth(string: str, old: str, new: str, n: int) -> str: + """Replace the n-th occurrence of a substring. + + Args: + string: Original string + old: Substring to replace + new: Replacement substring + n: Which occurrence to replace (1-based) + + Returns: + Modified string with n-th occurrence replaced + + Example: + >>> replace_nth("Hello, hello, hello!", 'hello', 'hi', 2) + 'Hello, hello, hi!' + """ + index = string.find(old) + count = int(index != -1) + + while index != -1 and count != n: + index = string.find(old, index + 1) + count += 1 + + if count == n: + return string[:index] + new + string[index + len(old) :] + + return string + + +def find_env(wrapped_env: dict, query: List[str]) -> Optional[str]: + """Find first matching environment variable from query list. + + Args: + wrapped_env: Dictionary of environment variables + query: List of environment variables to search for + + Returns: + First matching environment variable or None + """ + return next((env for env in query if env in wrapped_env), None) + + +def is_text_eq(text: str) -> bool: + """Check if text contains mathematical expressions. + + Args: + text: Text to check + + Returns: + True if contains math expressions, False otherwise + + Reference: + https://www.overleaf.com/learn/latex/Mathematical_expressions + """ + pattern = r"(\\\(.*?\\\))|(\$.*?\$)|(\\begin\{math\}.*?\\end\{math\})" + matches = re.findall(pattern, text) + + return any(not re.search(r"\\\$", match[0]) for match in matches) diff --git a/README.md b/README.md index f5b8a1d..e3764c1 100644 --- a/README.md +++ b/README.md @@ -1,185 +1,184 @@ -# vrdu_data_process -This repository is used to process paper with `.tex` source files to obtain: -1. object detection results -2. latex source code - visual bounding box pairs -3. layout reading orders. 
- - -# Installation -## Step 1 Install package -First create a conda environment (if Anaconda has not been installed, see [installation](https://docs.anaconda.com/free/anaconda/install/index.html)) -```shell -conda create --name vrdu python=3.8 -``` +# DocParser -Then activate the environment and install packages: -```shell -conda activate vrdu -pip install -e . -``` +A tool for processing academic papers with `.tex` source files to extract: + +1. Object detection results +2. LaTeX source code with visual bounding box pairs +3. Layout reading orders + +## Project Links + +- GitHub Repository: +- HuggingFace dataset: + +## Installation + +### Prerequisites + +1. **Python Environment** + - Python 3.8 or higher + - Anaconda (recommended) - [Installation Guide](https://docs.anaconda.com/free/anaconda/install/index.html) + +2. **TeX Live Distribution** + - Required for LaTeX compilation + - Installation guide available at [tug.org/texlive](https://www.tug.org/texlive/) + + For Ubuntu users: + + ```bash + sudo apt-get install texlive-full # Requires ~5.4GB disk space + ``` + + Note: `texlive-full` is recommended to avoid missing package errors. See [package differences](https://tex.stackexchange.com/a/504566). + +### Setup + +1. Create and activate conda environment: -## Step 2 Install TexLive -To compile latex, we need to install **Tex Live Distribution**, where you can find installation guide on [this page](https://www.tug.org/texlive/). + ```bash + conda create --name doc_parser python=3.8 + conda activate doc_parser + ``` -For Ubuntu, we recommend install `texlive-full` by running the following command on terminal (Requires ~5.4GB disk space) -```shell -sudo apt-get install texlive-full -``` -this version avoids missing package error, to see differences among versions, see [Differences between texlive packages in Linux](https://tex.stackexchange.com/a/504566) +2. Install the package: -# Usage -```python + ```bash + pip install -e . + ``` + +## Usage + +Run the parser on your LaTeX file: + +```bash python main.py --file_name path_to_paper/paper.tex ``` -the script then generates the bounding box of the following categories and their corresponding content (if there are text inside the bounding box): -1. layout annotation, with a bounding box around each semantic element, such as table, text paragraph, equation, etc. -2. reading annotation, which is a pair that links the bounding box and corresponding latex source code. -the result is stored in the `path_to_paper/output/result`, the folder structure is given as follows: -```shell -path_to_paper -├── output -│   └── result -│   ├── layout_annotation.json -│   ├── reading_annotation.json -│   ├── ordering_annotation.json -│   ├── quality_report.json -│   ├── texts.json -│   ├── env_orders.json -│   ├── layout_info.json -│   ├── layout_metadata.json -│   ├── raw_parsed_data.json -│   ├── page_0.jpg -| ├── page_1.jpg -| ├── block_0.jpg -└─ └── block_1.jpg +### Output Structure -``` -The result contains three parts: -1. Object detection result, which includes `layout_annotation.json` and `page_{n}.png`, the result is is represented as [COCO format](https://cocodataset.org/#format-data) -2. Reading detection result, which includes `reading_annotation.json` and `block_{n}.png`, it matches the bounding box and its original tex represented contents -3. Reading order result, which includes `ordering_annotation.json`. 
The reading order is represented via triple (`relationship`, `from`, `to`), indicates the relationship between the block with id `from` and the block with id `to`. -4. Debugging infos, this parts contains: - - `texts.json`, it contains the original tex contents - - `env_orders.json`, it is used to annotate reading orders - - `layout_info.json`, it is the raw content of object detection result - - `layout_metadata.json`, it contains the information about the paper layouts - - `raw_parsed_data.json`, it contains the result of main content of the tex file parsed by `TexSoup`. - -## Common issues -### 1. `latexpand` command running error -``` -ValueError: Failed to run the command "latexpand --output="/tmp/arxiv_cleaner.46fp5l_e.latexpand_output/paper_original.tex" --fatal --out-encoding="encoding(UTF-8)" "paper_original.tex"" -Return code: 2 -``` -if this error occurs, please check the version of installed `latexpand` with -``` -latexpand --help -``` -in the last line of output will print the version. If the version is below $1.6$, then we need to upgrade it to $\geq1.6$, the simplest way is -1. go to [latexpand v1.6](https://gitlab.com/latexpand/latexpand/-/tags/v1.6) download the source code -2. use `sudo vim $(which latexpand)` to edit the content of `latexpand` script (`sudo` is necessary since `latexpand` usually locates in `/usr/bin`) -3. copy the content of `v1.6/latexpand` to the old version of `latexpand` (opened with vim) +Results are stored in `path_to_paper/output/result`: -### 2. `pdf2image` error ``` -pdf2image.exceptions.PDFInfoNotInstalledError: Unable to get page count. Is poppler installed and in PATH? -``` -use the following command to install `poppler`: -``` -sudo apt-get install poppler-utils +path_to_paper +├── output +│ ├── paper_colored/ # Rendered paper images +│ │ ├── thread-0001-page-01.jpg +│ │ └── ... +│ └── result/ +│ ├── layout_annotation.json # Object detection results (COCO format) +│ ├── reading_annotation.json # Bounding box to LaTeX source mapping +│ ├── ordering_annotation.json # Reading order relationships +│ ├── quality_report.json +│ ├── texts.json # Original tex contents +│ ├── layout_info.json # Raw detection results +│ ├── layout_metadata.json # Paper layout information +│ ├── page_*.jpg # Pages with bounding boxes +│ └── block_*.jpg # Individual block images ``` -for details, see [reference](https://pdf2image.readthedocs.io/en/latest/installation.html#installing-poppler). -### 3. `path_to_paper/block_*****.pdf` not found -Usually, this means the rendering process destroys the original latex, therefore it is not compilable, the reason varies from case to case. +### Output Components +1. **Object Detection Results** + - `layout_annotation.json` and `page_*.jpg` + - Uses [COCO format](https://cocodataset.org/#format-data) -# Documentation -The documentation is built with [Sphinx](https://www.sphinx-doc.org/en/master/), to build documentation, run the following commands: -``` +2. **Reading Detection Results** + - `reading_annotation.json` + - Maps bounding boxes to original LaTeX content + +3. 
+
+## Categories
+
+Each bounding box is classified into one of these categories:
+
+| Category | Name | Super Category | Description |
+|----------|------|----------------|-------------|
+| 0 | Algorithm | Algorithm | Algorithm environments |
+| 1 | Caption | Caption | Figure, Table, Algorithm captions |
+| 2 | Equation | Equation | Display equations (equation, align) |
+| 3 | Figure | Figure | Figures |
+| 4 | Footnote | Footnote | Footnotes |
+| 5 | List | List | itemize, enumerate, description |
+| 6 | Others | Others | Currently unused |
+| 7 | Table | Table | Tables |
+| 8 | Text | Text | Plain text without equations |
+| 9 | Text-EQ | Text | Text with inline equations |
+| 10 | Title | Title | Section/subsection titles |
+| 11 | Reference | Reference | References |
+| 12 | PaperTitle | Title | Paper title |
+| 13 | Code | Algorithm | Code listings |
+| 14 | Abstract | Text | Paper abstract |
+
+## Troubleshooting
+
+### Common Issues
+
+1. **Latexpand Error**
+
+   ```bash
+   ValueError: Failed to run the command "latexpand..."
+   ```
+
+   Solution:
+
+   - Check latexpand version: `latexpand --help`
+   - If < 1.6, upgrade using:
+     1. Download from [latexpand v1.6](https://gitlab.com/latexpand/latexpand/-/tags/v1.6)
+     2. Update existing script: `sudo vim $(which latexpand)`
+
+2. **PDF2Image Error**
+
+   ```bash
+   PDFInfoNotInstalledError: Unable to get page count
+   ```
+
+   Solution:
+
+   ```bash
+   sudo apt-get install poppler-utils
+   ```
+
+3. **Missing Block PDF**
+   - If `block_*.pdf` is missing, the LaTeX rendering likely failed
+   - This is case-specific and requires manual investigation
+
+## Known Limitations
+
+1. **Custom Environments**: Some custom environments (e.g., `\newtheorem{defn}[thm]{Definition}`) require manual addition to `envs.text_envs` (see the sketch below)
+2. **Rendering Issues**: Some environments may fail during PDF compilation
+3. **Special Figures**: TikZ and similar formats may not be correctly classified
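+
+For limitation 1 the fix is a small edit. A minimal sketch, assuming `text_envs` in `config/envs.py` is a plain list of annotated environment names (the actual structure may differ):
+
+```python
+# config/envs.py (sketch)
+# Environments created via \newtheorem{defn}[thm]{Definition} are only
+# annotated if their names are registered here:
+text_envs = [
+    # ... built-in environments ...
+    "defn",  # add the name of the custom environment
+]
+```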
+
+## Documentation
+
+Build the documentation using Sphinx:
+
-# Documentation
-The documentation is built with [Sphinx](https://www.sphinx-doc.org/en/master/), to build documentation, run the following commands:
-```
+```bash
 cd docs
 sphinx-build . _build
 ```
-then the documentations are listed in `docs/_build`, which can be viewed by open `index.html` with a browser.
-
-# Category
-each bounding box is classified into one the following category.
-
-| Category | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14|
-| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |--- |--- |--- |---|
-| **Name** | Algorithm | Caption | Equation | Figure | Footnote | List | Others | Table | Text | Text-EQ | Title | Reference | PaperTitle | Code | Abstract|
-| **Super Category** | Algorithm | Caption | Equation | Figure | Footnote | List | Others | Table | Text | Text | Title | Reference | Title | Algorithm | Text|
-
-Explanation:
-- `Algorithm` contains Algorithm environment
-- `Code` contains listing environments
-- `Caption` contains Figure caption, Table caption and Algorithm caption
-- `Equation` contains all display equations such as `equation`, `align` environments.
-- `List` contains `itemize`, `enumerate` and `description`.
-- `Others` Currently there is no element that is classified into Others
-- `Text` refers to a paragraph of texts without inline equations,
-- `Text-EQ` refers to text with equations, such as `$a$`.
-- `Title` contains section title, subsection title. Others titles are ignored.
-- `PaperTitle` contains paper title.
-
-
-For more details, see `config/envs.py`.
-
-# Pipeline
-1. Preprocess the original tex file (copy), this includes two substeps:
-   - resolve inputs and clean comments with `arxiv_cleaner`
-   - convert all pdf figures into png format
-   - delete table of contents
-2. render tex file, this process first call `TexSoup` to parse tex files into a list, then add a color to each semantic element. This process generates a bunch of tex files, each tex file is different with the original colored tex file in a small part
-3. Compile these tex files into PDFs and further transform the PDFs into png images.
-4. Extract the layout metadata of PDF, so that one-column and multi-column can be classified.
-5. Generating bounding box for each semantic elements, generation is composed of two methods:
-   - For `Figure` elements, we use `PDFMiner` to get the bounding box
-   - For other semantic elements, we use the difference of two images to get the bounding box
-6. By linking the bounding box and its related latex source code, we obtain the reading annotations.
-7. After processing, we remove all redundant files.
-
-# Update log
-## 2023.12
-- [x] fix known bugs
-- [x] add new categories
-- [x] add quality report
-
-
-## 2023.11
-- [x] release v0.2 that correctly annotate all environments.
-  - [x] fix pdf figure bounding box generation error
-  - [x] fix cross column environments bounding box generation error
-  - [x] fix pdfminer cannot match source with bb error
-  - [x] fix pdfminer cannot accurately generate bounding box error
-
-  - [x] feat: add bb-source_code match algorithm
-
-
-## 2023.10
-- [x] release v0.1 that can handle algorithm, equation, table environments.
-
-## 2023.09
-- [x] extract elements in '.tex' files
-- [x] fix environment with argument parsing error.
-- [x] fix align environment rendering error
-- [x] fix list environment parsing error
-
-
-# Known Issues
-1. Some customized environments will not be annotated, for example, `\newtheorem{defn}[thm]{Definition}`. This can be solved by adding the customized environment to `envs.text_envs`, then the environment will be annotated.
-2. Rendering error, this happens when we render a environment successfully, but we cannot compile the rendered tex file into a PDF. This is still an open problem.
-3. Some figures such as `tikz` format, will not be correctly classified, this may cause further error.
-
-# Acknowledgements
-This project is based on the following python packages:
-- [Texsoup](https://texsoup.alvinwan.com/)
-- [pdf2image](https://pypi.org/project/pdf2image/)
-- [pdfminer.six](https://pdfminersix.readthedocs.io/en/latest/index.html)
+
+View the documentation by opening `docs/_build/index.html` in a browser.
+
+## Acknowledgements
+
+Built using:
+
+- [Texsoup](https://texsoup.alvinwan.com/)
+- [pdf2image](https://pypi.org/project/pdf2image/)
+- [pdfminer.six](https://pdfminersix.readthedocs.io/en/latest/index.html)
+- [arxiv_cleaner](https://github.com/elsa-lab/arxiv-cleaner.git)
+
+## Citation
+
+If you find this package useful, please cite:
-
-# License
+
+```bibtex
+@article{xia2024docgenome,
+  title={DocGenome: An Open Large-scale Scientific Document Benchmark for Training and Testing Multi-modal Large Language Models},
+  author={Xia, Renqiu and Mao, Song and Yan, Xiangchao and Zhou, Hongbin and Zhang, Bo and Peng, Haoyang and Pi, Jiahao and Fu, Daocheng and Wu, Wenjie and Ye, Hancheng and others},
+  journal={arXiv preprint arXiv:2406.11633},
+  year={2024}
+}
+```
diff --git a/dataset_readme.md b/dataset_readme.md
new file mode 100644
index 0000000..c5ab030
--- /dev/null
+++ b/dataset_readme.md
@@ -0,0 +1,98 @@
+# File structure
+
+Here is an example of the file structure of the dataset for discipline `math.GM`.
+
+```bash
+math.GM
+├── 0906.1099
+│   ├── layout_annotation.json
+│   ├── order_annotation.json
+│   ├── page_xxxx.jpg
+│   ├── quality_report.json
+│   └── reading_annotation.json
+└── 2103.02443
+    ├── layout_annotation.json
+    ├── order_annotation.json
+    ├── page_xxxx.jpg
+    ├── quality_report.json
+    └── reading_annotation.json
+```
+
+Each paper folder, for example `math.GM/2103.02443`, contains five parts:
+
+1. `page_xxxx.jpg`: one image per page of the paper, with the page index in the filename. Note that these rendered pages may differ from the original paper.
+2. `layout_annotation.json`: the layout annotation of each page, in COCO format.
+3. `reading_annotation.json`: the LaTeX source code of each block (except Figure). Note that the LaTeX source code may contain macros.
+4. `order_annotation.json`: the relationships between blocks, in triple format.
+5. `quality_report.json`: the quality-check results for each page and for the whole paper, for further use.
+
+# Layout annotation
+
+## Layout annotation category
+
+| **Index** | **Category** | **Notes** |
+|-----------|--------------|------------------------------------------|
+| 0 | Algorithm | |
+| 1 | Caption | Titles of Images, Tables, and Algorithms |
+| 2 | Equation | |
+| 3 | Figure | |
+| 4 | Footnote | |
+| 5 | List | |
+| 7 | Table | |
+| 8 | Text | |
+| 9 | Text-EQ | Text block with inline equations |
+| 10 | Title | Section titles |
+| 12 | PaperTitle | |
+| 13 | Code | |
+| 14 | Abstract | |
+
+## Known Issues
+
+1. The IoU of bounding boxes is too large. This happens when the paper template is too complex.
+2. The category of a bounding box is incorrect. This happens when user-defined macros are used. For example, some authors use `\newcommand{\beq}{\begin{equation}}` and `\newcommand{\eeq}{\end{equation}}`; in this case, the equation may be detected as the `Text` class.
+3. A bounding box is missing. This happens when rare packages are used; some rare packages may not be identified by our rule-based methods.
+4. Bounding boxes are correct but overlap slightly with adjacent bounding boxes. This happens due to layout adjustments, for example the `vspace` and `input` commands.
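+
+The category table above can be used directly when inspecting a paper. A minimal sketch that tallies blocks per category, assuming COCO-style `annotations` with a `category_id` field (the exact keys may differ):
+
+```python
+# Sketch: count blocks per layout category for one paper.
+import json
+from collections import Counter
+
+CATEGORY_NAMES = {0: "Algorithm", 1: "Caption", 2: "Equation", 3: "Figure",
+                  4: "Footnote", 5: "List", 7: "Table", 8: "Text", 9: "Text-EQ",
+                  10: "Title", 12: "PaperTitle", 13: "Code", 14: "Abstract"}
+
+with open("math.GM/2103.02443/layout_annotation.json") as f:
+    layout = json.load(f)
+
+counts = Counter(ann["category_id"] for ann in layout["annotations"])
+for index, count in sorted(counts.items()):
+    print(f"{CATEGORY_NAMES.get(index, index)}: {count}")
+```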
+
+# Order annotation
+
+## Category Definition
+
+| **Category** | **Description** | **Example** |
+|----------------|-------------------|------------------------------------------|
+| identical | two blocks correspond to the same LaTeX code chunk | paragraphs that cross columns or pages |
+| peer | both blocks belong to the Title category | `\section{introduction}`, `\section{method}` |
+| sub | one block is logically a child of another block | `\section{introduction}` and the first paragraph of the Introduction section |
+| adj | two adjacent Text blocks | Paragraph 1 and Paragraph 2 |
+| explicit-cite | one block cites another block with `\ref` | As shown in `\ref{Fig: 5}`. |
+| implicit-cite | the caption block and the corresponding float environment | in `\begin{table}\caption{A}\begin{tabular}B\end{tabular}\end{table}`, A implicit-cites B |
+
+## Order annotation representation
+
+Each `order_annotation.json` contains two fields:
+
+1. `annotations`: the block information of each block; the `block_id` of each block is used to represent the relationships.
+2. `orders`: a list of triples, where each triple consists of:
+   1. `type`: the category of the relationship; see the table above for details.
+   2. `from`: the `block_id` of the starting block of the relationship.
+   3. `to`: the `block_id` of the ending block of the relationship.
+
+## Known issues
+
+1. The `order_annotation.json` file of some papers may not contain the field `annotations`, for unknown reasons.
+2. `order_annotation.json` does not contain the `implicit-cite` relationship; `implicit-cite` is only used in the test dataset, for efficiency reasons.
+3. `explicit-cite` currently only supports `Equation`; support for `Table` and `Figure` will be developed after the training dataset is complete.
+
+# Quality report
+
+This file contains the rule-based quality checks, for further use. The fields are as follows:
+
+1. `num_pages`: the number of pages of the paper.
+2. `num_columns`: 1 (single column) or 2 (two columns), determined from the last page of the paper.
+3. `category_quality`: for each category, we record the number of rendered LaTeX code chunks (`reading_count`) and the number of detected bounding boxes (`geometry_count`); `missing_rate` is then computed as `(reading_count - geometry_count) / reading_count`. The `Total` category summarizes all other categories.
+4. `page_quality`: IoU information for each page and for the whole paper:
+   1. `page`: the page index.
+   2. `num_blocks`: the number of bounding boxes on this page.
+   3. `area`: the total area of all blocks, $\sum_i \text{area}(\text{bbox}_i)$.
+   4. `overlap`: the total pairwise intersection area, $\sum_i \sum_{j>i} \text{area}(\text{bbox}_i \cap \text{bbox}_j)$.
+   5. `ratio`: the ratio between `overlap` and `area`. Note that this ratio may be very large if there is a template issue.
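+
+The page-level metrics can be recomputed from the bounding boxes directly. A minimal sketch, assuming `[x0, y0, x1, y1]` boxes (COCO-style `[x, y, w, h]` boxes would need converting first):
+
+```python
+# Sketch: the overlap/area ratio described under page_quality.
+def area(box):
+    return max(0.0, box[2] - box[0]) * max(0.0, box[3] - box[1])
+
+def intersection(a, b):
+    x0, y0 = max(a[0], b[0]), max(a[1], b[1])
+    x1, y1 = min(a[2], b[2]), min(a[3], b[3])
+    return max(0.0, x1 - x0) * max(0.0, y1 - y0)
+
+def overlap_ratio(bboxes):
+    total = sum(area(b) for b in bboxes)  # sum_i area(bbox_i)
+    overlap = sum(
+        intersection(bboxes[i], bboxes[j])  # sum over pairs i < j
+        for i in range(len(bboxes))
+        for j in range(i + 1, len(bboxes))
+    )
+    return overlap / total if total else 0.0
+```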
+
diff --git a/scripts/app.py b/scripts/app.py
deleted file mode 100644
index 54b4a1c..0000000
--- a/scripts/app.py
+++ /dev/null
@@ -1,120 +0,0 @@
-import panel as pn
-import os
-import glob
-from PIL import Image, ImageDraw
-
-from DocParser.vrdu import utils
-from DocParser.vrdu.config import config
-
-pn.extension()
-
-
-data_path = (
-    "/cpfs01/shared/ADLab/ADLab_hdd/vrdu_arxiv/vrdu_autolabel_final_3_120w/nlin.AO"
-)
-default_path = os.path.expanduser(
-    "/cpfs01/shared/ADLab/ADLab_hdd/vrdu_arxiv/vrdu_autolabel_final_3_120w/nlin.AO/0809.2301"
-)
-
-# get all renderable paper paths
-paper_paths = []
-for root, dirs, files in os.walk(data_path):
-    if "order_annotation.json" in files:
-        paper_paths.append(root)
-
-# generate select widget from paper paths
-paper_select = pn.widgets.Select(value=default_path, options=paper_paths)
-
-# load layout info from a given paper
-layout_info = utils.load_json(os.path.join(default_path, "order_annotation.json"))
-layout_info = layout_info["annotations"]
-
-# get all image paths of a given paper
-image_paths = sorted(glob.glob(os.path.join(default_path, "original-page-*.jpg")))
-
-# generate select widget from image paths
-image_select = pn.widgets.Select(value=image_paths[0], options=image_paths)
-image_pane = pn.pane.PNG()
-image_pane.height = 800
-image_pane.width = 600
-
-# generate pane to display source code
-source_code_pane = pn.Column("# Source Code")
-
-# generate select widget to show annotations of different categories
-category_select = pn.widgets.Select(
-    value="All", options=["All"] + list(config.name2category.keys())
-)
-
-
-@pn.depends(paper_select.param.value)
-def update_paper(path):
-    global layout_info
-    layout_info = utils.load_json(os.path.join(path, "order_annotation.json"))
-    layout_info = layout_info["annotations"]
-    image_paths = sorted(glob.glob(os.path.join(path, "original-page-*.jpg")))
-    image_select.options = image_paths
-
-
-@pn.depends(image_select.param.value)
-def update_image(image_path):
-    image = Image.open(image_path)
-    image_pane.object = image
-    image_pane.width = image.size[0]
-    image_pane.height = image.size[1]
-
-
-@pn.depends(image_select.param.value, category_select.param.value)
-def update_annotation(image_path, category):
-    if not image_path:
-        return
-    source_code_pane.clear()
-    source_code_pane.append("# Source Code")
-    image = Image.open(image_path)
-    draw = ImageDraw.Draw(image)
-    image_id = int(os.path.splitext(os.path.basename(image_path))[0][-4:])
-    print(f"image_id={image_id}")
-
-    # filter blocks
-    if category == "All":
-        print(layout_info)
-        blocks = [block for block in layout_info if block["page_index"] == image_id]
-    else:
-        blocks = [
-            block
-            for block in layout_info
-            if block["page_index"] == image_id
-            if block["category"] == config.name2category[category]
-        ]
-
-    for index, block in enumerate(blocks):
-        bbox = (
-            block["bbox"][0],
-            block["bbox"][1],
-            block["bbox"][2],
-            block["bbox"][3],
-        )
-        draw.rectangle(bbox, outline="red", width=3)
-        if block["parent_block"] is None:
-            source_code_pane.append("* " + block["source_code"])
-
-    image_pane.object = image
-
-
-app = pn.Row(
-    image_pane,
-    pn.Column(
-        pn.Row("# Paper", paper_select),
-        pn.Row("# Image", image_select),
-        pn.Row("# Category", category_select),
-        source_code_pane,
-    ),
-    update_paper,
-    update_image,
-    update_annotation,
-)
-
-app.servable()
-
-# use the following command to visualize
-# panel serve app.py --show --autoreload
diff --git a/scripts/arxiv_download.py b/scripts/arxiv_download.py
index d979787..e6de1f5 100644
--- a/scripts/arxiv_download.py
+++ b/scripts/arxiv_download.py
@@ -11,7 +11,7 @@
 log = logger.setup_app_level_logger(logger_name="arxiv_download.log")
 
 
-def download_papers_with_paper_id(
+def download_papers_source_with_paper_id(
     path: str, discipline: str, paper_ids: List[str]
 ) -> None:
     """
@@ -59,6 +59,22 @@ def download_papers_with_paper_id(
             continue
 
 
+def download_papers_pdf_with_paper_id(path: str, discipline: str, paper_ids: List[str]) -> None:
+    client = arxiv.Client()
+    discipline_path = os.path.join(path, discipline)
+    os.makedirs(discipline_path, exist_ok=True)
+
+    search_results = client.results(arxiv.Search(id_list=paper_ids))
+
+    for result in search_results:
+        # extract {id} without version from entry ids like http://arxiv.org/abs/{id}v1
+        paper_id = result.entry_id.split("/")[-1].split("v")[0]
+        log.info(f"Downloading paper {paper_id}")
+
+        pdf_path = result.download_pdf(dirpath=discipline_path)
+        log.info(f"Downloaded pdf file {pdf_path}")
+
+
 def download_batch_papers(path: str, discipline: str, num_papers: int) -> None:
     """
     Downloads a batch of papers from the Arxiv repository
@@ -81,6 +97,7 @@ def download_batch_papers(path: str, discipline: str, num_papers: int) -> None:
 
     ```
     """
+    log.debug(f"path: {path}, discipline: {discipline}, num_papers: {num_papers}")
     client = arxiv.Client()
 
     paper_ids = []
@@ -96,7 +113,7 @@ def download_batch_papers(path: str, discipline: str, num_papers: int) -> None:
             paper_ids.append(paper_id)
             num_papers -= 1
 
-    download_papers_with_paper_id(path, discipline, paper_ids)
+    download_papers_pdf_with_paper_id(path, discipline, paper_ids)
 
 
 def main() -> None:
@@ -114,7 +131,16 @@ def main() -> None:
 
     args = parser.parse_args()
     output_path, discipline, num_papers = args.path, args.discipline, args.num_papers
-    download_batch_papers(output_path, discipline, num_papers)
+    import json
+
+    with open("/cpfs01/user/maosong/vrdu_data_process/data/discipline_map.json") as f:
+        discipline_map = json.load(f)
+
+    disciplines = [x for value in discipline_map.values() for x in value]
+
+    for discipline in disciplines:
+        log.debug("Downloading discipline %s", discipline)
+        download_batch_papers(output_path, discipline, num_papers)
 
 
 if __name__ == "__main__":
diff --git a/setup.py b/setup.py
index ba3749e..85834f1 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 setup(
     name="DocParser",
     version="1.0.0",
-    description="process the academic papers with .tex source files",
+    description="Process academic papers with .tex source files for layout analysis",
     author="Mao Song",
     author_email="maosong@pjlab.org.cn",
     url="https://github.com/UniModal4Reasoning/DocParser.git",
@@ -22,9 +22,15 @@
         "scikit_image==0.19.3",
         "setuptools==68.0.0",
         "tqdm==4.66.1",
+        "sphinx",
+        "arxiv-cleaner",
+        "texsoup",
     ],
+    python_requires=">=3.8",
     scripts=[],
     entry_points={
-        "console_scripts": [],
+        "console_scripts": [
+            "vrdu_process=vrdu_data_process.main:main",
+        ],
     },
 )