diff --git a/.vscode/settings.json b/.vscode/settings.json index 03ba071..69cbc88 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,35 +1,59 @@ { "cSpell.words": [ "arxiv", + "autoref", + "colorlinks", "columnsep", "columnwidth", "definecolor", "documentclass", "dtype", + "epspdf", + "eqref", "flickr", + "footnotetext", + "graphicspath", + "headheight", + "headsep", "hoffset", + "hyperref", + "hypersetup", "imread", "includegraphics", "iscrowd", + "labelcref", "laparams", "latexpand", + "levelname", + "lstinputlisting", + "lstlisting", + "lstset", + "nonstopmode", "oddsidemargin", "opencv", + "pageref", "pdfcrop", "pdflatex", "pdfminer", + "psfig", "regionprops", "renewcommand", "rgbcolor", "scikit", "skimage", "subimport", + "synctex", + "tablefootnote", + "texlive", "Texsoup", "textcolor", "textwidth", + "topmargin", "tqdm", "usepackage", + "voffset", "vrdu", - "xcolor" + "xcolor", + "YOLO" ] } \ No newline at end of file diff --git a/TexSoup/LICENSE b/DocParser/TexSoup/LICENSE similarity index 100% rename from TexSoup/LICENSE rename to DocParser/TexSoup/LICENSE diff --git a/TexSoup/MANIFEST.in b/DocParser/TexSoup/MANIFEST.in similarity index 100% rename from TexSoup/MANIFEST.in rename to DocParser/TexSoup/MANIFEST.in diff --git a/TexSoup/README.md b/DocParser/TexSoup/README.md similarity index 100% rename from TexSoup/README.md rename to DocParser/TexSoup/README.md diff --git a/TexSoup/TexSoup/__init__.py b/DocParser/TexSoup/TexSoup/__init__.py similarity index 96% rename from TexSoup/TexSoup/__init__.py rename to DocParser/TexSoup/TexSoup/__init__.py index a20883d..37a55c0 100644 --- a/TexSoup/TexSoup/__init__.py +++ b/DocParser/TexSoup/TexSoup/__init__.py @@ -4,8 +4,8 @@ tree with navigation, search, and modification utilities. 
""" -from TexSoup.TexSoup.tex import read -from TexSoup.TexSoup.data import TexNode +from .tex import read +from .data import TexNode __version__ = '0.3.1' diff --git a/TexSoup/TexSoup/category.py b/DocParser/TexSoup/TexSoup/category.py similarity index 100% rename from TexSoup/TexSoup/category.py rename to DocParser/TexSoup/TexSoup/category.py diff --git a/TexSoup/TexSoup/data.py b/DocParser/TexSoup/TexSoup/data.py similarity index 100% rename from TexSoup/TexSoup/data.py rename to DocParser/TexSoup/TexSoup/data.py diff --git a/TexSoup/TexSoup/reader.py b/DocParser/TexSoup/TexSoup/reader.py similarity index 100% rename from TexSoup/TexSoup/reader.py rename to DocParser/TexSoup/TexSoup/reader.py diff --git a/TexSoup/TexSoup/tex.py b/DocParser/TexSoup/TexSoup/tex.py similarity index 100% rename from TexSoup/TexSoup/tex.py rename to DocParser/TexSoup/TexSoup/tex.py diff --git a/TexSoup/TexSoup/tokens.py b/DocParser/TexSoup/TexSoup/tokens.py similarity index 100% rename from TexSoup/TexSoup/tokens.py rename to DocParser/TexSoup/TexSoup/tokens.py diff --git a/TexSoup/TexSoup/utils.py b/DocParser/TexSoup/TexSoup/utils.py similarity index 100% rename from TexSoup/TexSoup/utils.py rename to DocParser/TexSoup/TexSoup/utils.py diff --git a/TexSoup/__init__.py b/DocParser/TexSoup/__init__.py similarity index 100% rename from TexSoup/__init__.py rename to DocParser/TexSoup/__init__.py diff --git a/TexSoup/app/__init__.py b/DocParser/TexSoup/app/__init__.py similarity index 100% rename from TexSoup/app/__init__.py rename to DocParser/TexSoup/app/__init__.py diff --git a/TexSoup/app/conversion.py b/DocParser/TexSoup/app/conversion.py similarity index 95% rename from TexSoup/app/conversion.py rename to DocParser/TexSoup/app/conversion.py index 3ffe746..c95cf2b 100644 --- a/TexSoup/app/conversion.py +++ b/DocParser/TexSoup/app/conversion.py @@ -1,11 +1,11 @@ import re -from TexSoup.TexSoup import TexSoup -from TexSoup.TexSoup.data import TexEnv, TexText, TexCmd, TexGroup +from DocParser.TexSoup.TexSoup import TexSoup +from DocParser.TexSoup.TexSoup.data import TexEnv, TexText, TexCmd, TexGroup -from vrdu import logger -from vrdu.config import envs +from DocParser.logger import logger +from DocParser.vrdu.config import envs log = logger.get_logger(__name__) diff --git a/TexSoup/app/resolve_imports.py b/DocParser/TexSoup/app/resolve_imports.py similarity index 100% rename from TexSoup/app/resolve_imports.py rename to DocParser/TexSoup/app/resolve_imports.py diff --git a/TexSoup/pytest.ini b/DocParser/TexSoup/pytest.ini similarity index 100% rename from TexSoup/pytest.ini rename to DocParser/TexSoup/pytest.ini diff --git a/TexSoup/setup.py b/DocParser/TexSoup/setup.py similarity index 100% rename from TexSoup/setup.py rename to DocParser/TexSoup/setup.py diff --git a/arxiv_cleaner/__init__.py b/DocParser/__init__.py similarity index 100% rename from arxiv_cleaner/__init__.py rename to DocParser/__init__.py diff --git a/arxiv_cleaner/.gitignore b/DocParser/arxiv_cleaner/.gitignore similarity index 100% rename from arxiv_cleaner/.gitignore rename to DocParser/arxiv_cleaner/.gitignore diff --git a/arxiv_cleaner/LICENSE.txt b/DocParser/arxiv_cleaner/LICENSE.txt similarity index 100% rename from arxiv_cleaner/LICENSE.txt rename to DocParser/arxiv_cleaner/LICENSE.txt diff --git a/arxiv_cleaner/README.md b/DocParser/arxiv_cleaner/README.md similarity index 100% rename from arxiv_cleaner/README.md rename to DocParser/arxiv_cleaner/README.md diff --git a/vrdu/__init__.py b/DocParser/arxiv_cleaner/__init__.py 
similarity index 100% rename from vrdu/__init__.py rename to DocParser/arxiv_cleaner/__init__.py diff --git a/arxiv_cleaner/arguments.py b/DocParser/arxiv_cleaner/arguments.py similarity index 100% rename from arxiv_cleaner/arguments.py rename to DocParser/arxiv_cleaner/arguments.py diff --git a/arxiv_cleaner/cleaner.py b/DocParser/arxiv_cleaner/cleaner.py similarity index 98% rename from arxiv_cleaner/cleaner.py rename to DocParser/arxiv_cleaner/cleaner.py index 4d5aa39..c0c9209 100644 --- a/arxiv_cleaner/cleaner.py +++ b/DocParser/arxiv_cleaner/cleaner.py @@ -1,9 +1,9 @@ -from arxiv_cleaner.file_utils import ( +from .file_utils import ( build_relative_path, combine_paths, copy_files, create_temp_dir, does_file_exist, find_files, remove_temp_dir, remove_unnecessary_blank_lines) -from arxiv_cleaner.latex import LatexRunner -from arxiv_cleaner.logger import Logger +from .latex import LatexRunner +from .logger import Logger class Cleaner: diff --git a/arxiv_cleaner/cli.py b/DocParser/arxiv_cleaner/cli.py similarity index 100% rename from arxiv_cleaner/cli.py rename to DocParser/arxiv_cleaner/cli.py diff --git a/arxiv_cleaner/file_utils.py b/DocParser/arxiv_cleaner/file_utils.py similarity index 100% rename from arxiv_cleaner/file_utils.py rename to DocParser/arxiv_cleaner/file_utils.py diff --git a/arxiv_cleaner/latex.py b/DocParser/arxiv_cleaner/latex.py similarity index 98% rename from arxiv_cleaner/latex.py rename to DocParser/arxiv_cleaner/latex.py index 2a2d264..f609550 100644 --- a/arxiv_cleaner/latex.py +++ b/DocParser/arxiv_cleaner/latex.py @@ -1,8 +1,8 @@ import re import subprocess -from arxiv_cleaner.cli import run_command, check_command_results -from arxiv_cleaner.file_utils import ( +from .cli import run_command, check_command_results +from .file_utils import ( build_relative_path, change_extension, combine_paths, diff --git a/arxiv_cleaner/logger.py b/DocParser/arxiv_cleaner/logger.py similarity index 100% rename from arxiv_cleaner/logger.py rename to DocParser/arxiv_cleaner/logger.py diff --git a/arxiv_cleaner/main.py b/DocParser/arxiv_cleaner/main.py similarity index 89% rename from arxiv_cleaner/main.py rename to DocParser/arxiv_cleaner/main.py index f2f0182..14a1ba0 100644 --- a/arxiv_cleaner/main.py +++ b/DocParser/arxiv_cleaner/main.py @@ -1,5 +1,5 @@ -from arxiv_cleaner.arguments import parse_args -from arxiv_cleaner.cleaner import Cleaner +from arguments import parse_args +from cleaner import Cleaner def main(): diff --git a/DocParser/logger/__init__.py b/DocParser/logger/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/vrdu/logger.py b/DocParser/logger/logger.py similarity index 100% rename from vrdu/logger.py rename to DocParser/logger/logger.py diff --git a/main.py b/DocParser/main.py similarity index 50% rename from main.py rename to DocParser/main.py index 2abedd8..59ba33f 100644 --- a/main.py +++ b/DocParser/main.py @@ -2,77 +2,89 @@ import glob import os import shutil +from pathlib import Path +from typing import List from tqdm import tqdm +from loguru import logger +from DocParser.vrdu import utils +from DocParser.vrdu import renderer +from DocParser.vrdu import preprocess +from DocParser.vrdu import layout_annotation as layout +from DocParser.vrdu import order_annotation as order +from DocParser.vrdu.config import config +from DocParser.vrdu.quality_check import generate_quality_report -from vrdu import logger -from vrdu import utils -from vrdu import renderer -from vrdu import preprocess -from vrdu import layout_annotation as layout -from 
vrdu import order_annotation as order -from vrdu.config import config -from vrdu.quality_check import generate_quality_report +logger.add("vrdu_debug.log", mode="w") -log = logger.setup_app_level_logger(file_name="vrdu_debug.log") - -def transform_tex_to_images(main_directory: str) -> None: +def transform_tex_to_images(main_directory: Path) -> None: """ Transforms TeX files with pattern paper_*.tex in the specified directory into jpg images. Args: - main_directory (str): The main directory where the TeX files are located. + main_directory (Path): The main directory where the TeX files are located. Returns: None """ tex_files = glob.glob(f"{main_directory}/paper_*.tex") - output_directory = os.path.join(main_directory, "output") - for tex_file in tqdm(tex_files): - log.debug(f"[VRDU] file: {tex_file}, start transforming into images.") - utils.compile_latex(tex_file) + output_directory = Path(main_directory) / "output" + for tex_file in tqdm(tex_files, desc="Converting TeX files to images"): + logger.debug(f"[VRDU] file: {tex_file}, start transforming into images.") + # Set colored flag based on filename + colored = "paper_colored.tex" in tex_file + utils.compile_latex(tex_file, colored=colored) # get the pdf file name - filename_without_extension = os.path.splitext(os.path.basename(tex_file))[0] - pdf_file = os.path.join(main_directory, f"{filename_without_extension}.pdf") + filename_without_extension = Path(tex_file).stem + pdf_file = Path(main_directory) / f"{filename_without_extension}.pdf" # convert into images - image_directory = os.path.join(output_directory, filename_without_extension) - os.makedirs(image_directory) - utils.pdf2jpg(pdf_file, image_directory) + image_directory = output_directory / filename_without_extension + image_directory.mkdir(parents=True, exist_ok=True) + utils.pdf2jpg(str(pdf_file), str(image_directory)) + + +def get_redundant_folders(main_directory: Path) -> List[str]: + """Get list of redundant folders to remove.""" + pattern = f"{main_directory}/output/paper_{config.folder_prefix}*" + redundant_folders = glob.glob(pattern) + redundant_folders.extend( + [ + f"{main_directory}/output/paper_white", + f"{main_directory}/output/paper_original", + ] + ) + return redundant_folders -def remove_redundant_stuff(main_directory: str) -> None: +def remove_redundant_stuff(main_directory: Path) -> None: """ Remove redundant files and folders from the main directory. Args: - main_directory (str): The path of the main directory. + main_directory (Path): The path of the main directory. 
 
     Returns:
         None
     """
     # remove generated tex related files
-    redundant_files = glob.glob(f"{main_directory}/paper_*")
-    for file in redundant_files:
+    for file in glob.glob(f"{main_directory}/paper_*"):
         os.remove(file)
 
     # remove useless pdf and image files
-    # TODO: move this name pattern into config
-    redundant_folders = glob.glob(
-        f"{main_directory}/output/paper_{config.folder_prefix}*"
-    )
-    redundant_folders += [
-        f"{main_directory}/output/paper_white",
-        f"{main_directory}/output/paper_original",
-    ]
-    for folder in redundant_folders:
+    for folder in get_redundant_folders(main_directory):
         if os.path.exists(folder):
             shutil.rmtree(folder)
 
 
-def process_one_file(file_name: str) -> None:
+def check_if_already_processed(main_directory: Path) -> bool:
+    quality_report_file = main_directory / "output/result/quality_report.json"
+    return quality_report_file.exists()
+
+
+def process_one_file(file_name: Path) -> None:
     """
     Process a file through multiple steps including preprocessing, rendering,
     transforming into images, generating annotations, and handling exceptions.
@@ -83,37 +95,32 @@ def process_one_file(file_name: str) -> None:
 
     Returns:
         None
     """
-    main_directory = os.path.dirname(file_name)
-    log.info(f"[VRDU] file: {file_name}, start processing.")
+    main_directory = Path(file_name).parent
+    logger.info(f"[VRDU] file: {file_name}, start processing.")
 
     # check if this paper has been processed
-    quality_report_file = os.path.join(
-        main_directory, "output/result/quality_report.json"
-    )
-    if os.path.exists(quality_report_file):
-        log.info(f"[VRDU] file: {file_name}, paper has been processed")
+    if check_if_already_processed(main_directory):
+        logger.info(f"[VRDU] file: {file_name}, paper has been processed")
         return
 
     # make a copy of the original tex file
-    original_tex = os.path.join(main_directory, "paper_original.tex")
+    original_tex = main_directory / "paper_original.tex"
     shutil.copyfile(file_name, original_tex)
 
     # remove the output folder if it exists
-    output_directory = os.path.join(main_directory, "output")
-    if os.path.exists(output_directory):
+    output_directory = main_directory / "output"
+    if output_directory.exists():
         shutil.rmtree(output_directory)
 
-    # output_directory stores the intermediate results
-    # result_directory stores the final results
-    os.makedirs(os.path.join(main_directory, "output/result"))
-
+    # remember the current working directory so it can be restored afterwards
     cwd = os.getcwd()
     try:
         # change the working directory to the main directory of the paper
         os.chdir(main_directory)
-        # create output folder
-        os.makedirs(os.path.join(main_directory, "output/result"))
+        # create output folder and output/result folder
+        result_dir = output_directory / "result"
+        result_dir.mkdir(parents=True)
 
         # step 1: preprocess the paper
         preprocess.run(original_tex)
@@ -122,14 +129,14 @@ def process_one_file(file_name: str) -> None:
         vrdu_renderer = renderer.Renderer()
         vrdu_renderer.render(original_tex)
 
-        # step 2.2: compling tex into PDFs
-        log.info(
+        # step 2.2: compiling tex into PDFs
+        logger.info(
             f"[VRDU] file: {original_tex}, start transforming into images, this may take a while..."
         )
         transform_tex_to_images(main_directory)
 
         # Step 3: generate annotations
-        log.info(
+        logger.info(
             f"[VRDU] file: {original_tex}, start generating annotations, this may take a while..."
         )
         vrdu_layout_annotation = layout.LayoutAnnotation(original_tex)
@@ -141,14 +148,15 @@
         # generate quality report for simple debugging
         generate_quality_report(main_directory)
 
-        log.info(f"[VRDU] file: {original_tex}, successfully processed.")
+        logger.info(f"[VRDU] file: {original_tex}, successfully processed.")
 
     except Exception as e:
-        error_type = e.__class__.__name__
-        error_info = str(e)
-        log.error(
-            f"[VRDU] file: {file_name}, type: {error_type}, message: {error_info}"
-        )
+        logger.error(
+            f"[VRDU] file: {file_name}, type: {e.__class__.__name__}, message: {e}"
+        )
+        # re-raise so failures surface to the caller instead of being swallowed
+        raise
 
     finally:
         # remove redundant files
@@ -183,18 +191,18 @@
 
     Returns:
         None
     """
-    parser = argparse.ArgumentParser()
+    parser = argparse.ArgumentParser(
+        description="Process TeX files to generate annotations and images"
+    )
     parser.add_argument(
         "-f",
         "--file_name",
-        type=str,
+        type=Path,
         required=True,
-        help="The name of the tex file will full path",
+        help="The path to the TeX file to process",
     )
     args = parser.parse_args()
 
-    file_name = args.file_name
-
-    process_one_file(file_name)
+    # args.file_name is already a Path because of type=Path above
+    process_one_file(args.file_name)
 
 
 if __name__ == "__main__":
diff --git a/DocParser/vrdu/__init__.py b/DocParser/vrdu/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/DocParser/vrdu/block.py b/DocParser/vrdu/block.py
new file mode 100644
index 0000000..cd3e55a
--- /dev/null
+++ b/DocParser/vrdu/block.py
@@ -0,0 +1,221 @@
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Tuple, Any, cast
+
+
+@dataclass
+class BoundingBox:
+    """A bounding box representation with coordinates (x0,y0) for top-left and (x1,y1) for bottom-right.
+ Origin is at top-left of the page.""" + + x0: float + y0: float + x1: float + y1: float + + @property + def width(self) -> float: + """Width of bounding box""" + return self.x1 - self.x0 + + @property + def height(self) -> float: + """Height of bounding box""" + return self.y1 - self.y0 + + def __len__(self) -> int: + return 4 + + def __repr__(self) -> str: + return f"BoundingBox({self.x0}, {self.y0}, {self.x1}, {self.y1})" + + def __getitem__(self, index: int) -> float: + return (self.x0, self.y0, self.x1, self.y1)[index] + + def area(self) -> float: + """Calculate area of bounding box""" + return abs(self.width * self.height) + + def overlap(self, other: "BoundingBox") -> float: + """Calculate overlap area with another bounding box""" + if ( + self.x0 > other.x1 + or self.x1 < other.x0 + or self.y0 > other.y1 + or self.y1 < other.y0 + ): + return 0.0 + + x_overlap = max(0, min(self.x1, other.x1) - max(self.x0, other.x0)) + y_overlap = max(0, min(self.y1, other.y1) - max(self.y0, other.y0)) + return x_overlap * y_overlap + + def to_dict(self) -> Dict[str, Tuple[float, float, float, float]]: + """Convert to dictionary format""" + return {"bbox": self.to_tuple()} + + def to_tuple(self) -> Tuple[float, float, float, float]: + """Convert to tuple format""" + return (self.x0, self.y0, self.x1, self.y1) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "BoundingBox": + """Create BoundingBox from dictionary""" + bbox = data["bbox"] + return cls(x0=bbox[0], y0=bbox[1], x1=bbox[2], y1=bbox[3]) + + @classmethod + def from_list( + cls, data: List[Tuple[float, float, float, float, float, float]] + ) -> "BoundingBox": + """Create bounding box that encompasses all points in list""" + min_x = min(data, key=lambda x: x[1])[1] + min_y = min(data, key=lambda x: x[0])[0] + max_x = max(data, key=lambda x: x[4])[4] + max_y = max(data, key=lambda x: x[3])[3] + return cls(x0=min_x, y0=min_y, x1=max_x, y1=max_y) + + +class Block: + """Represents a block of content with position and metadata""" + + current_id: int = 0 + + def __init__( + self, + bounding_box: BoundingBox, + block_id: Optional[int] = None, + category: Optional[int] = None, + page_index: Optional[int] = None, + previous_block: Optional[int] = None, + parent_block: Optional[int] = None, + next_block: Optional[int] = None, + source_code: Optional[str] = None, + labels: Optional[List[str]] = None, + references: Optional[List[str]] = None, + ) -> None: + self.id = block_id if block_id is not None else Block.current_id + if block_id is None: + Block.current_id += 1 + + self._category: Optional[int] = category + self._page_index: Optional[int] = page_index + self._bounding_box: BoundingBox = bounding_box + self._previous_block: Optional[int] = previous_block + self._parent_block: Optional[int] = parent_block + self._next_block: Optional[int] = next_block + self._source_code: Optional[str] = source_code + self._labels: Optional[List[str]] = labels or [] + self._references: Optional[List[str]] = references or [] + + def __repr__(self) -> str: + return ( + f"Block(id={self.id}, category={self.category}, " + f"page_index={self.page_index}, bbox={self.bbox}, " + f"source_code={self.source_code})" + ) + + @property + def bbox(self) -> BoundingBox: + return self._bounding_box + + @bbox.setter + def bbox(self, value: BoundingBox) -> None: + self._bounding_box = value + + @property + def labels(self) -> List[str]: + return cast(List[str], self._labels or []) + + @labels.setter + def labels(self, value: List[str]) -> None: + self._labels = value + + 
@property + def references(self) -> List[str]: + return cast(List[str], self._references or []) + + @references.setter + def references(self, value: List[str]) -> None: + self._references = value + + @property + def block_id(self) -> int: + return self.id + + @property + def category(self) -> int: + return cast(int, self._category or 0) + + @category.setter + def category(self, value: int) -> None: + self._category = value + + @property + def page_index(self) -> int: + return cast(int, self._page_index or 0) + + @page_index.setter + def page_index(self, value: int) -> None: + self._page_index = value + + @property + def source_code(self) -> str: + return cast(str, self._source_code or "") + + @source_code.setter + def source_code(self, value: str) -> None: + self._source_code = value + + @property + def parent_block(self) -> int: + return cast(int, self._parent_block or -1) + + @parent_block.setter + def parent_block(self, value: int) -> None: + self._parent_block = value + + @property + def previous_block(self) -> int: + return cast(int, self._previous_block or -1) + + @property + def next_block(self) -> int: + return cast(int, self._next_block or -1) + + @property + def height(self) -> float: + return self._bounding_box.height + + @property + def width(self) -> float: + return self._bounding_box.width + + def to_dict(self) -> Dict[str, Any]: + """Convert block to dictionary format""" + data = { + "block_id": self.block_id, + "category": self.category, + "page_index": self.page_index, + "previous_block": self.previous_block, + "parent_block": self.parent_block, + "next_block": self.next_block, + "source_code": self.source_code, + "labels": self.labels, + "references": self.references, + "bbox": self._bounding_box.to_tuple(), + } + return data + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "Block": + """Create Block from dictionary""" + return cls( + block_id=data["block_id"], + bounding_box=BoundingBox.from_dict(data), + category=data["category"], + previous_block=data["previous_block"], + parent_block=data["parent_block"], + next_block=data["next_block"], + source_code=data["source_code"], + page_index=data["page_index"], + ) diff --git a/vrdu/compile_latex.sh b/DocParser/vrdu/compile_latex.sh old mode 100755 new mode 100644 similarity index 100% rename from vrdu/compile_latex.sh rename to DocParser/vrdu/compile_latex.sh diff --git a/vrdu/config/config.json b/DocParser/vrdu/config/config.json similarity index 100% rename from vrdu/config/config.json rename to DocParser/vrdu/config/config.json diff --git a/vrdu/config/config.py b/DocParser/vrdu/config/config.py similarity index 100% rename from vrdu/config/config.py rename to DocParser/vrdu/config/config.py diff --git a/vrdu/config/envs.py b/DocParser/vrdu/config/envs.py similarity index 100% rename from vrdu/config/envs.py rename to DocParser/vrdu/config/envs.py diff --git a/DocParser/vrdu/layout_annotation.py b/DocParser/vrdu/layout_annotation.py new file mode 100644 index 0000000..72c1717 --- /dev/null +++ b/DocParser/vrdu/layout_annotation.py @@ -0,0 +1,503 @@ +from collections import defaultdict +import glob +import subprocess +from typing import Any, Dict, List, Tuple +import matplotlib.pyplot as plt +import numpy as np +from skimage.measure import label, regionprops +from PIL import Image, ImageDraw, ImageFont +import re +from tqdm import tqdm +from pathlib import Path + +from pdfminer.high_level import extract_pages +from pdfminer.layout import LTFigure, LTPage + +from DocParser.vrdu import utils +from 
DocParser.vrdu.block import Block, BoundingBox +from DocParser.vrdu.config import config, envs + + +class LayoutAnnotation: + """Class for extracting and annotating layout information from LaTeX documents.""" + + ONE_INCH = 72.27 # LaTeX point unit conversion + + def __init__(self, tex_file: Path) -> None: + """Initialize LayoutAnnotation with paths and load required data. + + Args: + tex_file: Path to the LaTeX source file + """ + self.tex_file = tex_file + self.main_directory = tex_file.parent + self.output_directory = self.main_directory / "output" + self.result_directory = self.output_directory / "result" + self.pdf_images_path = self.output_directory / "paper_colored" + + # Load text information + self.text_info = utils.load_json(self.result_directory / "texts.json") + self.layout_metadata: Dict = {} + + def extract_pdf_layouts(self) -> List[LTPage]: + """Extract layout information from rendered PDF. + + Returns: + List of page layout objects + """ + rendered_pdf = self.main_directory / "paper_colored.pdf" + return list(extract_pages(rendered_pdf)) + + def _parse_log_metadata(self, log_file: Path) -> Dict: + """Parse LaTeX log file for layout metadata. + + Args: + log_file: Path to LaTeX log file + + Returns: + Dictionary of extracted metadata values + """ + metadata = {} + pattern = r"\[vrdu_data_process: The (.*) is: ([-+]?\d+\.\d+)pt\]" + + with open(log_file, "r", encoding="latin-1") as f: + for match in re.findall(pattern, f.read()): + key, value = match[0], float(match[1]) + metadata[key] = value + + # Calculate number of columns + textwidth = metadata["textwidth"] + columnsep = metadata["columnsep"] + columnwidth = metadata["columnwidth"] + metadata["num_columns"] = round( + (textwidth + columnsep) / (columnwidth + columnsep) + ) + + return metadata + + def _calculate_margins(self, metadata: Dict) -> Tuple[float, float]: + """Calculate page margins from metadata. + + Args: + metadata: Dictionary of layout metadata + + Returns: + Tuple of (margin_width, margin_height) + """ + margin_width = (self.ONE_INCH + metadata["hoffset"]) + metadata["oddsidemargin"] + + margin_height = ( + (self.ONE_INCH + metadata["voffset"]) + - (metadata["topmargin"] - metadata["headheight"]) + ) + metadata["headsep"] / 2 + + return margin_width, margin_height + + def parse_metadata(self, pdf_layouts: List[LTPage]) -> None: + """Parse and store layout metadata from PDF and log file. 
+
+        Args:
+            pdf_layouts: List of PDF page layouts
+        """
+        pt2px = config.ppi / self.ONE_INCH
+        log_file = self.main_directory / "paper_colored.log"
+
+        # Parse basic metadata
+        metadata = self._parse_log_metadata(log_file)
+        margin_width, margin_height = self._calculate_margins(metadata)
+        metadata["margin_width"] = margin_width
+
+        # Get image files sorted by page number
+        image_files = sorted(
+            glob.glob(str(self.pdf_images_path / "*.jpg")), key=lambda x: x[-6:-4]
+        )
+
+        # Process each page
+        for page_idx, page_layout in enumerate(pdf_layouts):
+            page_metadata = {
+                "pdf_width": page_layout.width,
+                "pdf_height": page_layout.height,
+                "top_margin": margin_height,
+            }
+
+            # Get image dimensions
+            with Image.open(image_files[page_idx]) as img:
+                page_metadata["image_width"], page_metadata["image_height"] = img.size
+
+            # Calculate scale factors
+            px2img = page_metadata["image_height"] / page_layout.height
+            page_metadata["px2img"] = px2img
+
+            # Calculate column separations
+            separations = [0]
+            x = margin_width - 0.5 * metadata["columnsep"]
+            for _ in range(metadata["num_columns"] - 1):
+                sep = x + metadata["columnwidth"] + metadata["columnsep"]
+                separations.append(sep * pt2px * px2img)
+                # advance to the boundary just computed; `x += sep` would
+                # double-count the offset for third and later columns
+                x = sep
+            separations.append(page_layout.width * px2img)
+            page_metadata["separations"] = separations
+
+            metadata[page_idx] = page_metadata
+
+        self.layout_metadata = metadata
+        utils.export_to_json(metadata, self.result_directory / "layout_metadata.json")
+
+    def retrieve_figure_source_code(
+        self, figure_layout_info: Dict[int, List[Block]]
+    ) -> None:
+        """Retrieve LaTeX source code for figures using synctex.
+
+        Args:
+            figure_layout_info: Dictionary mapping page numbers to figure blocks
+        """
+        tex_filename = self.tex_file.name.replace("paper_original", "paper_colored")
+        pdf_filename = tex_filename.replace(".tex", ".pdf")
+
+        with open(self.main_directory / tex_filename, "r") as f:
+            content_lines = f.readlines()
+
+        for page_idx, blocks in figure_layout_info.items():
+            for block in blocks:
+                # Get center point of figure
+                center_x = (block.bbox[0] + block.bbox[2]) / 2
+                center_y = (block.bbox[1] + block.bbox[3]) / 2
+
+                # Run synctex to get source line
+                result = subprocess.run(
+                    [
+                        "synctex",
+                        "edit",
+                        "-o",
+                        f"{page_idx + 1}:{center_x:.2f}:{center_y:.2f}:{pdf_filename}",
+                        "-d",
+                        str(self.main_directory),
+                    ],
+                    check=True,
+                    capture_output=True,
+                    text=True,
+                )
+
+                # Extract line number and source
+                line_idx = int(result.stdout.split("\nLine:")[1].split("\n")[0])
+                block.source_code = content_lines[line_idx - 1]
+
+    def generate_figure_bb(self, pdf_layouts: List[LTPage]) -> Dict[int, List[Block]]:
+        """Generate bounding boxes for figures in PDF layout.
+ + Args: + pdf_layouts: List of PDF page layouts + + Returns: + Dictionary mapping page numbers to figure blocks + """ + layout_info = defaultdict(list) + + for page_idx, page in enumerate(pdf_layouts): + for element in page: + if not isinstance(element, LTFigure): + continue + + # Convert coordinates (flip y-axis) + y0 = page.height - element.bbox[3] + y1 = page.height - element.bbox[1] + x0, x1 = element.bbox[0], element.bbox[2] + + layout_info[page_idx].append( + Block( + bounding_box=BoundingBox(x0, y0, x1, y1), + page_index=page_idx, + category=config.name2category["Figure"], + source_code="", + ) + ) + + self.retrieve_figure_source_code(layout_info) + self.transform(layout_info) + return layout_info + + def transform(self, layout_info: Dict[int, List[Block]]) -> None: + """Transform bounding boxes from PDF to image coordinates. + + Args: + layout_info: Dictionary mapping page numbers to blocks + """ + for page_idx, elements in layout_info.items(): + px2img = self.layout_metadata[page_idx]["px2img"] + + for element in elements: + x0, y0, x1, y1 = element.bbox + width = element.width + height = element.height + + # Scale coordinates + x0 *= px2img + y0 *= px2img + x1 = x0 + width * px2img + y1 = y0 + height * px2img + + element.bbox = BoundingBox(x0, y0, x1, y1) + + def _process_bounding_boxes( + self, + bounding_boxes: List[Tuple], + category: str, + page_idx: int, + source_code: str, + elements: List[Block], + ) -> List[Block]: + """Process and create blocks for bounding boxes. + + Args: + bounding_boxes: List of bounding box coordinates + category: Block category + page_idx: Page number + source_code: Source code for block + elements: Existing block elements + + Returns: + Updated list of block elements + """ + separations = self.layout_metadata[page_idx]["separations"] + + if category in envs.one_column_envs: + if bounding_boxes: + element = Block( + bounding_box=BoundingBox.from_list(bounding_boxes), + source_code=source_code, + category=config.name2category[category], + page_index=page_idx, + ) + if elements: + element.parent_block = elements[-1].block_id + elements.append(element) + return elements + + # Handle multi-column case + for col in range(self.layout_metadata["num_columns"]): + col_boxes = [ + bb + for bb in bounding_boxes + if separations[col] <= bb[1] <= separations[col + 1] + ] + if not col_boxes: + continue + + element = Block( + bounding_box=BoundingBox.from_list(col_boxes), + source_code=source_code, + category=config.name2category[category], + page_index=page_idx, + ) + + if elements: + element.parent_block = elements[-1].block_id + + # Merge overlapping blocks + if ( + elements + and elements[-1].category == element.category + and elements[-1].page_index == page_idx + and elements[-1].source_code == source_code + and elements[-1].bbox.overlap(element.bbox) + ): + + elements[-1].bbox = BoundingBox( + min(elements[-1].bbox.x0, element.bbox.x0), + min(elements[-1].bbox.y0, element.bbox.y0), + max(elements[-1].bbox.x1, element.bbox.x1), + max(elements[-1].bbox.y1, element.bbox.y1), + ) + else: + elements.append(element) + + return elements + + def generate_non_figure_bb(self) -> Dict[int, List[Block]]: + """Generate bounding boxes for non-figure elements. 
+
+        Returns:
+            Dictionary mapping page numbers to block elements
+        """
+        background_dir = self.output_directory / "paper_white"
+        block_dirs = glob.glob(
+            str(self.output_directory / f"paper_{config.folder_prefix}*")
+        )
+        layout_info = defaultdict(list)
+        pattern = r"paper_(\w+)_(\d{5})_(.*?)_(\d{5})"
+
+        for block_dir in tqdm(sorted(block_dirs)):
+            matches = re.match(pattern, Path(block_dir).name)
+            if not matches:
+                raise ValueError(f"Invalid directory name pattern: {block_dir}")
+
+            category = matches.group(3)
+            index = int(matches.group(4))
+            elements: List[Block] = []
+
+            for page_idx, img1_path, img2_path in get_image_pairs(
+                Path(block_dir), background_dir
+            ):
+                # Compare images; subtract in a signed dtype so pixel
+                # differences cannot wrap around as they would with uint8
+                img1 = np.array(plt.imread(img1_path), dtype=np.int16)
+                img2 = np.array(plt.imread(img2_path), dtype=np.int16)
+                diff = np.abs(img2 - img1)
+
+                if np.all(diff == 0):
+                    continue
+
+                # Get regions
+                labeled, num = label(diff > config.threshold, return_num=True)
+                if num == 0:
+                    continue
+
+                bounding_boxes = [region.bbox for region in regionprops(labeled)]
+                if not bounding_boxes:
+                    continue
+
+                elements = self._process_bounding_boxes(
+                    bounding_boxes,
+                    category,
+                    page_idx,
+                    self.text_info[category][index],
+                    elements,
+                )
+
+            for element in elements:
+                layout_info[element.page_index].append(element)
+
+        return layout_info
+
+    def generate_layout_info(self) -> Dict[int, List[Block]]:
+        """Generate complete layout information.
+
+        Returns:
+            Dictionary mapping page numbers to all block elements
+        """
+        pdf_layouts = self.extract_pdf_layouts()
+        self.parse_metadata(pdf_layouts)
+
+        layout_info = self.generate_non_figure_bb()
+        figure_layout_info = self.generate_figure_bb(pdf_layouts)
+
+        # Combine figure and non-figure info
+        for page_idx, figures in figure_layout_info.items():
+            layout_info[page_idx].extend(figures)
+
+        return layout_info
+
+    def generate_image_annotation(
+        self, layout_info: Dict[int, List[Block]]
+    ) -> Dict[int, Dict[str, Any]]:
+        """Generate annotated images with bounding boxes.
+
+        Args:
+            layout_info: Dictionary mapping page numbers to blocks
+
+        Returns:
+            Dictionary of image annotation information
+        """
+        image_files = sorted(
+            glob.glob(str(self.pdf_images_path / "*.jpg")),
+            key=lambda x: x[-6:-4],
+        )
+
+        image_info = {}
+        for page_idx in layout_info:
+            with Image.open(image_files[page_idx]) as page_image:
+                annotated = generate_geometry_annotation(
+                    page_image, layout_info[page_idx]
+                )
+
+                image_name = f"page_{str(page_idx).zfill(4)}.jpg"
+                annotated.save(self.result_directory / image_name)
+
+                image_info[page_idx] = {
+                    "file_name": image_name,
+                    "width": page_image.width,
+                    "height": page_image.height,
+                }
+
+        return image_info
+
+    def annotate(self):
+        """Generate complete layout annotation."""
+        # Generate layout information
+        layout_info = self.generate_layout_info()
+        layout_info_data = {
+            key: [x.to_dict() for x in blocks] for key, blocks in layout_info.items()
+        }
+        utils.export_to_json(
+            layout_info_data, self.result_directory / "layout_info.json"
+        )
+
+        # Generate annotations
+        image_annotation = self.generate_image_annotation(layout_info)
+        utils.export_to_coco(
+            layout_info,
+            image_annotation,
+            self.result_directory / "layout_annotation.json",
+        )
+
+
+def get_image_pairs(dir1: Path, dir2: Path) -> List[Tuple[int, str, str]]:
+    """Get matching pairs of images from two directories.
+ + Args: + dir1: First directory path + dir2: Second directory path + + Returns: + List of tuples containing (page_index, image1_path, image2_path) + + Raises: + FileNotFoundError: If image counts don't match + ValueError: If page index can't be extracted + """ + rendered_files = sorted(glob.glob(str(dir1 / "*.jpg"))) + changed_files = sorted(glob.glob(str(dir2 / "*.jpg"))) + + if len(rendered_files) != len(changed_files): + raise FileNotFoundError("Mismatched image counts between directories") + + def extract_page_index(filename: str) -> int: + match = re.search(r"thread-\d+-page-(\d+)\.jpg", filename) + if not match: + raise ValueError(f"Cannot extract page index from {filename}") + return int(match.group(1)) - 1 + + page_indices = [extract_page_index(Path(f).name) for f in rendered_files] + return list(zip(page_indices, rendered_files, changed_files)) + + +def generate_geometry_annotation( + page_image: Image.Image, layout_elements: List[Block] +) -> Image.Image: + """Add geometric annotations to an image. + + Args: + page_image: Image to annotate + layout_elements: List of block elements to annotate + + Returns: + Annotated image + """ + draw = ImageDraw.Draw(page_image) + font = ImageFont.truetype( + config.config["annotation_image_font_type"], + config.config["annotation_image_font_size"], + ) + + for element in layout_elements: + category = element.category + draw.rectangle( + element.bbox.to_tuple(), outline=config.colors_map[str(category)], width=3 + ) + draw.text( + (element.bbox[0], element.bbox[1]), + config.category2name[category], + fill=(255, 0, 0), + font=font, + ) + + return page_image diff --git a/DocParser/vrdu/order_annotation.py b/DocParser/vrdu/order_annotation.py new file mode 100644 index 0000000..da5e109 --- /dev/null +++ b/DocParser/vrdu/order_annotation.py @@ -0,0 +1,316 @@ +import re +from uuid import uuid4 +from pathlib import Path +from typing import Dict, List, Any + +from DocParser.vrdu.block import Block +from DocParser.vrdu.config import config +from DocParser.vrdu import utils + + +class OrderAnnotation: + """Handles annotation of reading order relationships between document elements.""" + + def __init__(self, tex_file: Path) -> None: + """Initialize order annotation for a LaTeX file. 
+ + Args: + tex_file: Path to the LaTeX file + """ + self.tex_file = tex_file + self.main_directory = tex_file.parent + self.result_directory = self.main_directory / "output/result" + + # Load layout info + layout_info_file = self.result_directory / "layout_info.json" + layout_info_data = utils.load_json(layout_info_file) + layout_info = { + int(key): [Block.from_dict(item) for item in values] + for key, values in layout_info_data.items() + } + + # Initialize annotations + self.annotations: Dict[str, Any] = { + "annotations": [ + block for page_blocks in layout_info.values() for block in page_blocks + ], + "orders": [], + } + + def annotate(self) -> None: + """Generate and save all order annotations.""" + # Generate different types of order relationships + self.generate_sortable_envs_order() + self.generate_float_envs_order() + self.generate_cross_reference_order() + + # Save annotations + order_annotation_file = self.result_directory / "order_annotation.json" + transformed_annotations = { + "annotations": [x.to_dict() for x in self.annotations["annotations"]], + "orders": self.annotations["orders"], + } + utils.export_to_json(transformed_annotations, order_annotation_file) + + def generate_cross_reference_order(self) -> None: + """Generate order annotations for cross-references.""" + annotations: List[Dict[str, str]] = [] + + # Build label to block ID mapping + label_to_block_id = { + label: block.block_id + for block in self.annotations["annotations"] + if block.labels + for label in block.labels + } + + # Reference patterns to match + ref_patterns = "|".join( + [ + r"\\ref\{(.*?)\}", + r"\\eqref\{(.*?)\}", + r"\\pageref\{(.*?)\}", + r"\\autoref\{(.*?)\}", + r"\\vref\{(.*?)\}", + r"\\cref\{(.*?)\}", + r"\\labelcref\{(.*?)\}", + ] + ) + + # Process text blocks + for block in self.annotations["annotations"]: + category = config.category2name[block.category] + + # Handle text and equation references + if category in ["Text", "Text-EQ"]: + block.references = self._extract_references( + block.source_code, ref_patterns + ) + self._add_reference_annotations( + block, label_to_block_id, annotations, "explicit-cite" + ) + + # Handle caption references + elif category == "Caption" and block.references: + self._add_reference_annotations( + block, label_to_block_id, annotations, "implicit-cite" + ) + + # Handle table and algorithm references + elif category in ["Table", "Algorithm"]: + block.references = self._extract_references( + block.source_code, ref_patterns + ) + self._add_reference_annotations( + block, label_to_block_id, annotations, "explicit-cite" + ) + + self.annotations["orders"].extend(annotations) + + def _extract_references(self, text: str, pattern: str) -> List[str]: + """Extract reference labels from text using pattern.""" + return [x for group in re.findall(pattern, text) for x in group if x] + + def _add_reference_annotations( + self, + block: Block, + label_map: Dict[str, str], + annotations: List[Dict[str, str]], + ref_type: str, + ) -> None: + """Add reference annotations for a block.""" + for label in block.references: + if label in label_map: + annotations.append( + {"type": ref_type, "from": block.block_id, "to": label_map[label]} + ) + + def generate_float_envs_order(self) -> None: + """Generate order annotations for floating environments.""" + with open(self.tex_file, "r") as f: + latex_content = f.read() + + # Process title labels + self._process_title_labels(latex_content) + + # Process equation labels + self._process_equation_labels() + + # Process float environment 
labels + category_patterns = { + "Table": r"\\begin\{table\*?\}(.*?)\\end\{table\*?\}", + "Figure": r"\\begin\{figure\*?\}(.*?)\\end\{figure\*?\}", + "Algorithm": r"\\begin\{algorithm\*?\}(.*?)\\end\{algorithm\*?\}", + } + + category_indices = { + category: [ + (match.start(), match.end(), str(uuid4())) + for match in re.finditer(pattern, latex_content, re.DOTALL) + ] + for category, pattern in category_patterns.items() + } + + label_pattern = r"\\label\{(.*?)\}" + + # Process each category + for category, indices in category_indices.items(): + self._process_float_env_labels( + category, indices, latex_content, label_pattern + ) + + def _process_title_labels(self, latex_content: str) -> None: + """Process and add labels for title blocks.""" + label_pattern = r"\\label\{(.*?)\}" + + for block in self.annotations["annotations"]: + if config.category2name[block.category] != "Title": + continue + + block.labels = re.findall(label_pattern, block.source_code) + + # Find additional labels after the title + start_idx = latex_content.find(block.source_code) + if start_idx == -1: + continue + + end_idx = start_idx + len(block.source_code) + matches = re.finditer(label_pattern, latex_content[end_idx:], re.DOTALL) + + for match in matches: + label_start = match.start() + end_idx + label_end = match.end() + end_idx + label_content = latex_content[label_start:label_end] + + if latex_content[end_idx:label_start].isspace(): + block.labels.extend(re.findall(label_pattern, label_content)) + break + + def _process_equation_labels(self) -> None: + """Process and add labels for equation blocks.""" + label_pattern = r"\\label\{(.*?)\}" + + for block in self.annotations["annotations"]: + if config.category2name[block.category] == "Equation": + block.labels = re.findall(label_pattern, block.source_code) + + def _process_float_env_labels( + self, + category: str, + indices: List[tuple], + latex_content: str, + label_pattern: str, + ) -> None: + """Process and add labels for floating environment blocks.""" + for block in self.annotations["annotations"]: + if config.category2name[block.category] != category: + continue + + start_idx = latex_content.find(block.source_code) + if start_idx == -1: + continue + + end_idx = start_idx + len(block.source_code) + + for idx_start, idx_end, uuid in indices: + if not (start_idx >= idx_start and end_idx <= idx_end): + continue + + labels = re.findall(label_pattern, latex_content[idx_start:idx_end]) + block.labels = labels + block.labels.append(uuid) + + # Process caption references + for block in self.annotations["annotations"]: + if config.category2name[block.category] != "Caption": + continue + + start_idx = latex_content.find(block.source_code) + if start_idx == -1: + continue + + end_idx = start_idx + len(block.source_code) + + for idx_start, idx_end, uuid in indices: + if start_idx >= idx_start and end_idx <= idx_end: + block.references = [uuid] + + def generate_sortable_envs_order(self) -> None: + """Generate order annotations for sortable environments.""" + annotations: List[Dict[str, str]] = [] + + # Get relevant category IDs + sortable_cats = [ + config.name2category[name] for name in config.sortable_categories + ] + title_cats = [ + config.name2category[name] for name in ["Title", "PaperTitle", "Abstract"] + ] + text_cats = [ + config.name2category[name] + for name in ["Text", "Text-EQ", "Equation", "List"] + ] + + # Get sortable elements + sortable_elements = [ + block + for block in self.annotations["annotations"] + if block.category in sortable_cats + ] + + stack: 
List[Block] = [] + for idx, element in enumerate(sortable_elements): + if idx == 0 or not stack: + stack.append(element) + continue + + # Handle different cases + if element.parent_block == stack[-1].block_id: + self._add_order_annotation(annotations, element, stack[-1], "identical") + stack.pop() + stack.append(element) + + elif element.category in text_cats and stack[-1].category in text_cats: + self._add_order_annotation(annotations, element, stack[-1], "adj") + stack.pop() + stack.append(element) + + elif ( + element.category in text_cats + and stack[-1].category in title_cats + and element.category != stack[-1].category + ): + self._add_order_annotation(annotations, element, stack[-1], "sub") + stack.append(element) + + elif element.category in title_cats and stack[-1].category in text_cats: + while stack and stack[-1].category not in title_cats: + stack.pop() + + if stack: + self._add_order_annotation(annotations, element, stack[-1], "peer") + stack.append(element) + + elif element.category in title_cats and stack[-1].category in title_cats: + self._add_order_annotation(annotations, element, stack[-1], "peer") + stack.pop() + stack.append(element) + + elif element.category == config.name2category["Footnote"]: + self._add_order_annotation( + annotations, element, stack[-1], "explicit-cite" + ) + + self.annotations["orders"].extend(annotations) + + def _add_order_annotation( + self, + annotations: List[Dict[str, str]], + from_block: Block, + to_block: Block, + rel_type: str, + ) -> None: + """Add an order annotation between two blocks.""" + annotations.append( + {"type": rel_type, "from": from_block.block_id, "to": to_block.block_id} + ) diff --git a/DocParser/vrdu/preprocess.py b/DocParser/vrdu/preprocess.py new file mode 100644 index 0000000..e5ea59a --- /dev/null +++ b/DocParser/vrdu/preprocess.py @@ -0,0 +1,169 @@ +import re +from pathlib import Path +from typing import Optional +from loguru import logger + +from DocParser.arxiv_cleaner.cleaner import Cleaner +from DocParser.vrdu.config import envs, config +from DocParser.vrdu import utils + + +def remove_comments(tex_file: Path) -> None: + """ + Removes LaTeX comments from a TeX file. + + Args: + tex_file: Path to the TeX file + """ + tex_file = Path(tex_file) + content = tex_file.read_text() + + # Remove LaTeX comments + pattern = r"\\begin{comment}(.*?)\\end{comment}" + content = re.sub(pattern, "", content, flags=re.DOTALL) + + tex_file.write_text(content) + + +def clean_tex(tex_file: Path) -> None: + """ + Clean the given TeX file using arxiv-cleaner. + + Args: + tex_file: Path to the TeX file + """ + tex_file = Path(tex_file) + main_directory = tex_file.parent + + # Create and run the cleaner + cleaner = Cleaner( + input_dir=str(main_directory), + output_dir=str(main_directory), + tex=tex_file.name, + command_options=config.command_options, + verbose=False, + ) + cleaner.clean() + + # Remove any remaining comments + remove_comments(tex_file) + + +def get_graphics_path(content: str) -> str: + """Extract graphics path from LaTeX content.""" + pattern = r"\\graphicspath\{\{(.+?)}" + if match := re.search(pattern, content, re.DOTALL): + return match.group(1) + return "" + + +def convert_image( + image_path: Path, main_dir: Path, graphics_path: str, target_ext: str = ".png" +) -> Optional[str]: + """ + Convert image to target format if needed. + Returns the new image name or None if conversion failed. 
+ """ + if not image_path.exists(): + logger.error(f"File not found: {image_path}") + return None + + if image_path.suffix in [".eps", ".ps"]: + # Convert eps/ps to pdf first + pdf_path = image_path.with_suffix(".pdf") + utils.convert_eps_image_to_pdf_image(image_path, pdf_path) + image_path = pdf_path + + if image_path.suffix == ".pdf": + # Convert pdf to png + png_path = image_path.with_suffix(".png") + utils.convert_pdf_figure_to_png_image(image_path, png_path) + return png_path.name + + return image_path.name + + +def replace_pdf_ps_figures_with_png(tex_file: Path) -> None: + """ + Replace PDF, PS, EPS figures with PNG figures in a TeX file + to support pdfminer detecting bounding box. + + Args: + tex_file: Path to the TeX file + + Raises: + FileNotFoundError: If an image file is not found + """ + tex_file = Path(tex_file) + main_directory = tex_file.parent + content = tex_file.read_text() + + graphics_path = get_graphics_path(content) + + # Replace \psfig and \epsfig with \includegraphics + content = re.sub(r"\\psfig{([^}]*)}", r"\\includegraphics{\1}", content) + content = re.sub(r"\\epsfig{([^}]*)}", r"\\includegraphics{\1}", content) + + # Find all \includegraphics commands + pattern = r"\\includegraphics(\[.*?\])?\{(.*?)\}" + matches = re.findall(pattern, content) + + # Supported extensions + ext_patterns = [".eps", ".ps", ".jpg", ".jpeg", ".png", ".pdf"] + + # Process each image + for _, img_path in matches: + image_name = img_path + + # Add extension if missing + if not any(ext in image_name for ext in ext_patterns): + for ext in ext_patterns: + test_path = Path(main_directory, graphics_path, image_name).with_suffix( + ext + ) + if test_path.exists(): + image_name = f"{image_name}{ext}" + break + + # Skip if already in supported format + if any(ext in image_name for ext in [".jpg", ".jpeg", ".png"]): + content = content.replace(img_path, image_name) + continue + + # Convert image if needed + image_path = Path(main_directory, graphics_path, image_name) + if new_name := convert_image(image_path, main_directory, graphics_path): + content = content.replace(img_path, new_name) + + tex_file.write_text(content) + + +def delete_table_of_contents(tex_file: Path) -> None: + """ + Delete table of contents, list of figures/tables/algorithms. + + Args: + tex_file: Path to the TeX file + """ + tex_file = Path(tex_file) + content = tex_file.read_text() + + pattern = r"\\(" + "|".join(envs.table_of_contents) + r")" + content = re.sub(pattern, "", content) + + tex_file.write_text(content) + + +def run(tex_file: Path) -> None: + """ + Preprocess a LaTeX document by: + 1. Cleaning with arxiv_cleaner + 2. Converting figures to PNG format + 3. Removing table of contents + + Args: + tex_file: Path to the LaTeX document + """ + clean_tex(tex_file) + replace_pdf_ps_figures_with_png(tex_file) + delete_table_of_contents(tex_file) diff --git a/DocParser/vrdu/quality_check.py b/DocParser/vrdu/quality_check.py new file mode 100644 index 0000000..49eedf2 --- /dev/null +++ b/DocParser/vrdu/quality_check.py @@ -0,0 +1,214 @@ +"""Quality check module for analyzing layout and text information.""" + +from typing import Dict, List, Any +from pathlib import Path + +from DocParser.vrdu.block import Block +from DocParser.vrdu import utils +from DocParser.vrdu.config import config + + +def generate_quality_report(main_directory: Path) -> None: + """Generate a quality report analyzing layout and text information. 
+
+    Analyzes layout metadata, text content, and block positioning to generate
+    a quality report with metrics like missing content rates and block overlaps.
+
+    Args:
+        main_directory: Base directory containing the input files
+    """
+    result_dir = main_directory / "output" / "result"
+
+    # Load input files
+    layout_metadata = utils.load_json(result_dir / "layout_metadata.json")
+    text_info = utils.load_json(result_dir / "texts.json")
+    layout_info_data = utils.load_json(result_dir / "layout_info.json")
+
+    # Convert layout info to Block objects
+    layout_info = _convert_layout_info(layout_info_data)
+
+    # Generate report
+    result = {
+        "num_pages": max(layout_info.keys()) + 1,  # page indices are zero-based
+        "num_columns": layout_metadata["num_columns"],
+        "category_quality": _analyze_category_quality(layout_info, text_info),
+        "page_quality": _analyze_page_quality(layout_info),
+    }
+
+    # Save report
+    utils.export_to_json(result, result_dir / "quality_report.json")
+
+
+def _convert_layout_info(layout_info_data: Dict) -> Dict[int, List[Block]]:
+    """Convert raw layout info data to Block objects.
+
+    Args:
+        layout_info_data: Raw layout info dictionary from JSON
+
+    Returns:
+        Dictionary mapping page numbers to lists of Block objects
+    """
+    return {
+        int(key): [Block.from_dict(item) for item in values]
+        for key, values in layout_info_data.items()
+    }
+
+
+def _analyze_category_quality(
+    layout_info: Dict[int, List[Block]], text_info: Dict[str, List[Any]]
+) -> List[Dict[str, Any]]:
+    """Analyze quality metrics for each content category.
+
+    Compares text content vs geometric blocks to identify missing content.
+    Calculates metrics like counts and missing rates for each category.
+
+    Args:
+        layout_info: Page index to list of Block objects mapping
+        text_info: Category name to list of text content mapping
+
+    Returns:
+        List of quality metrics per category including totals
+    """
+    quality_metrics = []
+    total_reading = total_geometry = 0
+
+    for category, texts in text_info.items():
+        # Skip figure analysis since they're handled differently;
+        # text_info is keyed by category *names*, not numeric ids
+        if category == "Figure":
+            continue
+
+        reading_count = len(texts)
+        geometry_count = _count_category_blocks(layout_info, category)
+
+        missing_rate = _calculate_missing_rate(reading_count, geometry_count)
+
+        quality_metrics.append(
+            {
+                "category": category,
+                "geometry_count": geometry_count,
+                "reading_count": reading_count,
+                "missing_rate": missing_rate,
+            }
+        )
+
+        total_reading += reading_count
+        total_geometry += geometry_count
+
+    # Add aggregate metrics
+    quality_metrics.append(
+        {
+            "category": "Total",
+            "geometry_count": total_geometry,
+            "reading_count": total_reading,
+            "missing_rate": _calculate_missing_rate(total_reading, total_geometry),
+        }
+    )
+
+    return quality_metrics
+
+
+def _calculate_missing_rate(reading_count: int, geometry_count: int) -> float:
+    """Calculate missing rate between reading and geometry counts.
+
+    Args:
+        reading_count: Number of text elements found
+        geometry_count: Number of geometric blocks found
+
+    Returns:
+        Missing rate as a float between 0 and 1
+    """
+    return 0 if reading_count == 0 else 1 - geometry_count / reading_count
+
+
+def _count_category_blocks(layout_info: Dict[int, List[Block]], category: str) -> int:
+    """Count number of top-level blocks of a given category.
+
+    Only counts blocks that don't have a parent block (top-level blocks).
+
+    Args:
+        layout_info: Page index to list of Block objects mapping
+        category: Category to count
+
+    Returns:
+        Number of blocks found
+    """
+    count = 0
+    for blocks in layout_info.values():
+        count += sum(
+            1
+            for block in blocks
+            if block.category == config.name2category[category]
+            # Block.parent_block returns -1 (never None) when there is no parent
+            and block.parent_block == -1
+        )
+    return count
+
+
+def _analyze_page_quality(layout_info: Dict[int, List[Block]]) -> List[Dict[str, Any]]:
+    """Analyze quality metrics for each page.
+
+    Calculates area and overlap metrics for blocks on each page.
+    Includes total metrics across all pages.
+
+    Args:
+        layout_info: Page index to list of Block objects mapping
+
+    Returns:
+        List of quality metrics per page including totals
+    """
+    metrics = []
+    total_area = total_overlap = total_blocks = 0
+
+    for page_index, blocks in layout_info.items():
+        blocks.sort(key=lambda block: block.bbox.x0)
+
+        area = sum(block.bbox.area() for block in blocks)
+        overlap = _calculate_page_overlap(blocks)
+        overlap_ratio = 0 if area == 0 else overlap / area
+
+        metrics.append(
+            {
+                "page": page_index,
+                "num_blocks": len(blocks),
+                "area": area,
+                "overlap": overlap,
+                "ratio": overlap_ratio,
+            }
+        )
+
+        total_area += area
+        total_overlap += overlap
+        total_blocks += len(blocks)
+
+    # Add aggregate metrics
+    metrics.append(
+        {
+            "page": "total",
+            "num_blocks": total_blocks,
+            "area": total_area,
+            "overlap": total_overlap,
+            "ratio": 0 if total_area == 0 else total_overlap / total_area,
+        }
+    )
+
+    return metrics
+
+
+def _calculate_page_overlap(blocks: List[Block]) -> float:
+    """Calculate total overlap area between blocks on a page.
+
+    Blocks must be sorted by x0 coordinate for early termination optimization.
+
+    Args:
+        blocks: List of blocks sorted by x0 coordinate
+
+    Returns:
+        Total overlap area between all blocks
+    """
+    overlap = 0
+    for i, block in enumerate(blocks[:-1]):
+        for other in blocks[i + 1 :]:
+            # Early termination - no more overlaps possible
+            if other.bbox.x0 > block.bbox.x1:
+                break
+            overlap += block.bbox.overlap(other.bbox)
+    return overlap
diff --git a/DocParser/vrdu/renderer.py b/DocParser/vrdu/renderer.py
new file mode 100644
index 0000000..bd0fe02
--- /dev/null
+++ b/DocParser/vrdu/renderer.py
@@ -0,0 +1,615 @@
+"""LaTeX document rendering module for colorizing and processing semantic elements."""
+
+from collections import defaultdict
+import shutil
+from typing import List, Union, Dict
+import re
+from pathlib import Path
+from loguru import logger
+
+from DocParser.vrdu import utils
+from DocParser.vrdu.config import config, envs
+from DocParser.vrdu.utils import (
+    data_from_tex_file,
+    tex_file_from_data,
+    is_text_eq,
+    find_env,
+    replace_nth,
+)
+
+
+class Renderer:
+    """Handles rendering and colorizing of LaTeX documents.
+
+    This class provides functionality to:
+    - Parse and process LaTeX documents
+    - Add color definitions and styling
+    - Render different semantic elements with distinct colors
+    - Generate individual files for each element type
+    """
+
+    def __init__(self) -> None:
+        """Initialize renderer with empty text storage."""
+        self.texts: Dict[str, List[str]] = defaultdict(list)
+
+    def render(self, origin_tex: Path) -> None:
+        """Render a colored version of a LaTeX document.
+
+        Args:
+            origin_tex: Path to original LaTeX file
+
+        The rendering process:
+        1. Creates a colored copy of the original file
+        2. Adds required color and layout definitions
+        3. Removes any conflicting color definitions
+        4. Renders all semantic environments
+        5.
Generates individual files per element + 6. Exports the rendered text elements + """ + main_directory = origin_tex.parent + color_tex = main_directory / "paper_colored.tex" + + # Setup colored document + shutil.copyfile(origin_tex, color_tex) + self._setup_document_styling(color_tex) + + # Process environments + self.render_all_env(color_tex) + self.render_one_env(main_directory) + + # Export results + text_file = main_directory / "output/result/texts.json" + utils.export_to_json(self.texts, text_file) + + def _setup_document_styling(self, color_tex: Path) -> None: + """Set up document styling by adding color and layout definitions. + + Args: + color_tex: Path to LaTeX file to modify + """ + self.add_color_definition(color_tex) + self.add_layout_definition(color_tex) + self.remove_predefined_color(color_tex) + + def render_all_env(self, color_tex: Path) -> None: + """Render all environments in the document. + + Args: + color_tex: Path to colored LaTeX file + """ + self.render_simple_envs(color_tex) + self.render_float_envs(color_tex) + + def render_simple_envs(self, color_tex: Path) -> None: + """Render simple environments like sections, lists, equations and text. + + Args: + color_tex: Path to LaTeX file to modify + + Raises: + EOFError: If TexSoup fails to parse due to runaway environments + AssertionError: If TexSoup fails due to invalid math mode commands + """ + data, start, end = data_from_tex_file(color_tex) + + # Process each environment type + for renderer in [ + self.render_section, + self.render_list, + self.render_equation, + self.render_text, + ]: + renderer(data) + + # Write back to file + tex_file_from_data(data, color_tex, start=start, end=end) + + def render_float_envs(self, tex_file: Path) -> None: + """Render floating environments like figures, tables, algorithms etc. + + Args: + tex_file: Path to LaTeX file to modify + + The environments are rendered in a specific order to handle dependencies: + 1. Algorithms + 2. Tables + 3. Code blocks + 4. Footnotes + 5. Graphics + 6. Captions + 7. Title + 8. Abstract + """ + renderers = [ + self.render_algorithm, + self.render_tabular, + self.render_code, + self.render_footnote, + self.extract_graphics, + self.render_caption, + self.render_title, + self.render_abstract, + ] + + for renderer in renderers: + renderer(tex_file) + + def render_section(self, data: List[Union[dict, str]]) -> None: + """Render section headings with configured color. + + Args: + data: LaTeX content as structured data + """ + for item in data: + if not isinstance(item, dict): + continue + + env = find_env(item, envs.section_envs) + if env is None: + continue + + self.texts["Title"].append(item[env]) + item[env] = utils.colorize(item[env], "Title") + + def render_list(self, data: List[Union[dict, str]]) -> None: + """Render list environments with configured color. + + Args: + data: LaTeX content as structured data + """ + for item in data: + if not isinstance(item, dict): + continue + + env = find_env(item, envs.list_envs) + if env is None: + # Process nested lists recursively + for value in item.values(): + if isinstance(value, list): + self.render_list(value[1]) + continue + + self.texts["List"].append(item[env]) + item[env] = utils.colorize(item[env], "List") + + def render_equation(self, data: List[Union[dict, str]]) -> None: + """Render equation environments with configured color. 
+ + Args: + data: LaTeX content as structured data + """ + for item in data: + if not isinstance(item, dict): + continue + + env = find_env(item, envs.math_envs) + if env is None: + # Process nested equations + for value in item.values(): + if isinstance(value, list): + self.render_equation(value[1]) + continue + + self.texts["Equation"].append(item[env]) + item[env] = utils.colorize(item[env], "Equation") + + def render_text(self, data: List[Union[dict, str]]) -> None: + """Render text content with configured colors. + + Handles both regular text and text containing equations. + + Args: + data: LaTeX content as structured data + """ + for index, item in enumerate(data): + if not isinstance(item, str): + if isinstance(item, dict): + for key, value in item.items(): + if key.lower() in envs.text_envs and isinstance(value, list): + self.render_text(value[1]) + continue + + if not item or item.isspace(): + continue + + # Determine text type and colorize + text_type = "Text-EQ" if is_text_eq(item) else "Text" + colored_text = utils.colorize(item, text_type) + self.texts[text_type].append(item) + + # Preserve whitespace + if item[0] == "\n": + colored_text = "\n" + colored_text + if item[-1] == "\n": + colored_text += "\n" + + data[index] = colored_text + + def add_color_definition(self, color_tex: Path) -> None: + """Add color package and definitions to LaTeX file. + + Args: + color_tex: Path to LaTeX file to modify + + Raises: + ValueError: If document begin tag not found + """ + content = color_tex.read_text() + + # Build color definitions + definitions = ["\\usepackage{xcolor}"] + for name, rgb_color in config.name2rgbcolor.items(): + color_name = config.name2color[name] + r, g, b = rgb_color + definition = f"\\definecolor{{{color_name}}}{{RGB}}{{{r}, {g}, {b}}}" + definitions.append(definition) + + color_definitions = "\n" + "\n".join(definitions) + "\n" + + # Insert at document begin + preamble = re.search(r"\\begin{document}", content) + if not preamble: + raise ValueError("Document begin tag not found") + + content = ( + content[: preamble.start()] + + color_definitions + + content[preamble.start() :] + ) + + color_tex.write_text(content) + + def add_layout_definition(self, color_tex: Path) -> None: + """Add layout definitions to LaTeX file. + + Args: + color_tex: Path to LaTeX file to modify + + Raises: + ValueError: If document end tag not found + + Reference: + https://www.overleaf.com/learn/latex/Page_size_and_margins + """ + content = color_tex.read_text() + + # Build layout definitions + definitions = ["\\message{[vrdu_data_process: Info]}"] + for key in config.layout_keys: + definition = f"\\message{{[vrdu_data_process: The {key} is: \\the\\{key}]}}" + definitions.append(definition) + + layout_definitions = "\n" + "\n".join(definitions) + "\n" + + # Insert before document end + doc_end = re.search(r"\\end{document}", content) + if not doc_end: + raise ValueError("Document end tag not found") + + content = ( + content[: doc_end.start()] + layout_definitions + content[doc_end.start() :] + ) + + color_tex.write_text(content) + + def remove_predefined_color(self, color_tex: Path) -> None: + """Remove hyperref and lstlisting color settings. 
+ + Args: + color_tex: Path to LaTeX file to modify + + Raises: + ValueError: If document begin tag not found + + Reference: + https://www.overleaf.com/learn/latex/Hyperlinks + """ + content = color_tex.read_text() + + # Find document begin + preamble = re.search(r"\\begin{document}", content) + if not preamble: + raise ValueError("Document begin tag not found") + + # Disable hyperref colors if present + hyperref_pattern = ( + r"\\usepackage{hyperref}|\\usepackage(\[)?\[.*?\]?(\])?{hyperref}" + ) + if re.search(hyperref_pattern, content[: preamble.start()]): + content = ( + content[: preamble.start()] + + "\\hypersetup{colorlinks=false}\n" + + content[preamble.start() :] + ) + + # Remove lstlisting colors + content = re.sub(r"\\lstset\{.*?\}", "", content) + + color_tex.write_text(content) + + def modify_color_definitions(self, input_file: Path, output_file: Path) -> None: + """Modify color definitions to white in output file. + + Args: + input_file: Source LaTeX file path + output_file: Destination LaTeX file path + """ + content = input_file.read_text() + + # Replace each color with white + for name in config.name2rgbcolor: + color_name = config.name2color[name] + pattern = rf"\\definecolor{{{color_name}}}{{RGB}}{{(\d+), (\d+), (\d+)}}" + content = re.sub( + pattern, + rf"\\definecolor{{{color_name}}}{{RGB}}{{255, 255, 255}}", + content, + ) + + output_file.write_text(content) + + def get_env_orders(self, tex_file: Path) -> List[str]: + """Get ordered list of environments from file. + + Args: + tex_file: Path to LaTeX file + + Returns: + List of environment names in order of appearance + """ + contents = tex_file.read_text() + + colors = list(config.name2color.values()) + pattern = "|".join(rf"\b{re.escape(term)}\b" for term in colors) + matches = [m.group(0) for m in re.finditer(pattern, contents)] + + # Skip color definitions at start + return matches[len(colors) :] + + def render_one_env(self, main_directory: Path) -> None: + """Render individual files with one environment highlighted. + + Args: + main_directory: Working directory path + """ + color_tex = main_directory / "paper_colored.tex" + white_tex = main_directory / "paper_white.tex" + + self.modify_color_definitions(color_tex, white_tex) + ordered_envs = self.get_env_orders(white_tex) + + content = white_tex.read_text() + + index_map = defaultdict(int) + suffix = "_color" + + for i, env_color in enumerate(ordered_envs): + env = env_color[: -len(suffix)] + env_count = index_map[env] + + # Replace nth occurrence with black + new_content = replace_nth( + content, "{" + env_color + "}", "{black}", env_count + 2 + ) + + # Generate output filename + output_file = ( + main_directory + / f"paper_{config.folder_prefix}_{str(i).zfill(5)}_{env}_{str(env_count).zfill(5)}.tex" + ) + + output_file.write_text(new_content) + + index_map[env] += 1 + + def render_caption(self, tex_file: Path) -> None: + """Render captions with color. + + Args: + tex_file: Path to LaTeX file + """ + content = tex_file.read_text() + + pattern = r"\\caption(?:\[[^\]]*\])?(?:\{[^}]*\})" + result = self._render_simple_envs(content, pattern, "Caption") + + tex_file.write_text(result) + + def render_title(self, tex_file: Path) -> None: + """Render document title with color. 
+ + Args: + tex_file: Path to LaTeX file + """ + content = tex_file.read_text() + + pattern = r"\\title(?:\{[^}]*\})" + result = self._render_simple_envs(content, pattern, "PaperTitle") + + tex_file.write_text(result) + + def render_footnote(self, tex_file: Path) -> None: + """Render footnotes with color. + + Args: + tex_file: Path to LaTeX file + """ + content = tex_file.read_text() + + for env_name in envs.footnote_envs: + pattern = r"\\" + env_name + r"(?:\[[^\]]*\])?(?:\{[^}]*\})" + content = self._render_simple_envs(content, pattern, "Footnote") + + tex_file.write_text(content) + + def _render_simple_envs(self, content: str, pattern: str, category: str) -> str: + """Render simple environments with color. + + Args: + content: LaTeX content + pattern: Regex pattern to match + category: Environment category name + + Returns: + Modified content with colored environments + """ + matches = re.finditer(pattern, content) + result = "" + last_end = 0 + + for match in matches: + start = match.start() + end = match.end() + + # Handle nested brackets + num_left = content[start:end].count("{") + num_right = content[start:end].count("}") + + while num_right < num_left: + if content[end] == "{": + num_left += 1 + elif content[end] == "}": + num_right += 1 + end += 1 + + env_content = content[start:end] + self.texts[category].append(env_content) + + result += content[last_end:start] + result += utils.colorize(env_content, category) + last_end = end + + result += content[last_end:] + return result + + def render_abstract(self, tex_file: Path) -> None: + """Render abstract with color. + + Args: + tex_file: Path to LaTeX file + + Raises: + ValueError: If multiple abstracts found + """ + content = tex_file.read_text() + + pattern = r"\\begin{abstract}.*?\\end{abstract}" + matches = list(re.finditer(pattern, content, re.DOTALL)) + + if len(matches) > 1: + raise ValueError("Multiple abstracts found") + + if not matches: + return + + match = matches[0] + abstract = content[match.start() : match.end()] + self.texts["Abstract"].append(abstract) + + result = ( + content[: match.start()] + + utils.colorize(abstract, "Abstract") + + content[match.end() :] + ) + + tex_file.write_text(result) + + def render_tabular(self, tex_file: Path) -> None: + """Render tables with color. + + Args: + tex_file: Path to LaTeX file + """ + content = tex_file.read_text() + + pattern = r"\\begin{(tabular[*xy]?)}.*?\\end{\1}" + result = self._render_float_envs(content, pattern, "Table") + + tex_file.write_text(result) + + def render_algorithm(self, tex_file: Path) -> None: + """Render algorithms with color. + + Args: + tex_file: Path to LaTeX file + """ + content = tex_file.read_text() + + pattern = r"\\begin{algorithm[*]?}(.*?)\\end{algorithm[*]?}" + result = self._render_float_envs(content, pattern, "Algorithm") + + tex_file.write_text(result) + + def render_code(self, tex_file: Path) -> None: + """Render code blocks with color. + + Handles both code environments and lstinputlisting. + + Args: + tex_file: Path to LaTeX file + + Reference: + https://en.wikibooks.org/wiki/LaTeX/Source_Code_Listings + """ + content = tex_file.read_text() + + patterns = [ + r"\\begin{(verbatim|lstlisting|program)[*]?}(.*?)\\end{\1[*]?}", + r"\\lstinputlisting\[[^\]]*\]{[^\}]*}", + ] + pattern = "|".join(patterns) + + result = self._render_float_envs(content, pattern, "Code") + + tex_file.write_text(result) + + def _render_float_envs(self, content: str, pattern: str, category: str) -> str: + """Render floating environments with color. 
+ + Args: + content: LaTeX content + pattern: Regex pattern to match + category: Environment category name + + Returns: + Modified content with colored environments + """ + matches = list(re.finditer(pattern, content, re.DOTALL)) + + if not matches: + logger.debug(f"No {category} environments found") + return content + + result = content[: matches[0].start()] + + for i, match in enumerate(matches): + if i > 0: + result += content[matches[i - 1].end() : match.start()] + + env_content = content[match.start() : match.end()] + + # Skip figures in tables + if category == "Table" and "\\includegraphics" in env_content: + continue + + self.texts[category].append(env_content) + result += utils.colorize(env_content, category) + + result += content[matches[-1].end() :] + return result + + def extract_graphics(self, tex_file: Path) -> None: + """Extract graphics commands. + + Args: + tex_file: Path to LaTeX file + """ + content = tex_file.read_text() + + pattern = r"\\includegraphics(?:\[(.*?)\])?{(.*?)}" + for options, path in re.findall(pattern, content): + graphic = "\\includegraphics" + if options: + graphic += f"[{options}]" + graphic += f"{{{path}}}" + self.texts["Figure"].append(graphic) diff --git a/DocParser/vrdu/utils.py b/DocParser/vrdu/utils.py new file mode 100644 index 0000000..43eeb3f --- /dev/null +++ b/DocParser/vrdu/utils.py @@ -0,0 +1,365 @@ +"""Utility functions for LaTeX document processing and file operations.""" + +import re +import json +import subprocess +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Union + +from pdf2image import pdf2image, generators +from DocParser.TexSoup.TexSoup import TexSoup +import DocParser.TexSoup.app.conversion as conversion +from DocParser.vrdu.block import Block +from DocParser.vrdu.config import config + + +def export_to_json(data: Union[Dict, List], file_path: Union[str, Path]) -> None: + """Write data to a JSON file with indentation. + + Args: + data: Dictionary or list to write + file_path: Output JSON file path + """ + with open(file_path, "w") as f: + json.dump(data, f, indent=4) + + +def load_json(file_path: Union[str, Path]) -> Union[Dict, List]: + """Load data from a JSON file. + + Args: + file_path: Input JSON file path + + Returns: + Loaded dictionary or list + """ + with open(file_path) as f: + return json.load(f) + + +def compile_latex(file: Union[str, Path], colored: bool = False) -> None: + """Compile a LaTeX file using pdflatex. + + Args: + file: Path to LaTeX file + colored: Whether this is the colored version requiring synctex + """ + file_name = Path(file).name + base_cmd = ["pdflatex", "-interaction=nonstopmode"] + + # Run twice for references + for _ in range(2): + subprocess.run(base_cmd + [file_name], timeout=1000, stdout=subprocess.DEVNULL) + + # Additional run with synctex for colored version + if colored: + subprocess.run( + base_cmd + ["-synctex=1", file_name], + timeout=1000, + stdout=subprocess.DEVNULL, + ) + + +def pdf2jpg(pdf_path: Union[str, Path], output_directory: Union[str, Path]) -> None: + """Convert PDF pages to JPG images. 
+ + Args: + pdf_path: Input PDF file path + output_directory: Output directory for JPG files + + Output files are named: thread-000x-yz.jpg + where x is thread index and yz is page number + """ + output_dir = Path(output_directory) + output_dir.mkdir(parents=True, exist_ok=True) + + pdf2image.convert_from_path( + pdf_path, + fmt="jpg", + output_folder=str(output_dir), + output_file=generators.counter_generator(prefix="thread-", suffix="-page"), + ) + + +def convert_pdf_figure_to_png_image( + pdf_image: Union[str, Path], png_image: Union[str, Path], dpi: int = 72 +) -> None: + """Convert PDF figure to PNG image. + + Args: + pdf_image: Input PDF file path + png_image: Output PNG file path + dpi: Resolution for conversion + """ + # Crop PDF + subprocess.run( + ["pdfcrop", str(pdf_image), str(pdf_image)], stdout=subprocess.DEVNULL + ) + + # Convert to PNG + images = pdf2image.convert_from_path(pdf_image, dpi=dpi) + images[0].save(png_image) + + +def convert_eps_image_to_pdf_image( + eps_image_path: Union[str, Path], pdf_image_path: Union[str, Path] +) -> None: + """Convert EPS image to PDF. + + Args: + eps_image_path: Input EPS file path + pdf_image_path: Output PDF file path + """ + subprocess.run(["epspdf", str(eps_image_path), str(pdf_image_path)]) + + +def export_to_coco( + layout_info: Dict[int, List[Block]], + image_infos: Dict[int, Dict[str, Any]], + file_path: Union[str, Path], +) -> None: + """Export layout and image info to COCO format JSON. + + Args: + layout_info: Page index to list of Block objects mapping + image_infos: Page index to image info mapping + file_path: Output JSON file path + + See: https://cocodataset.org/#format-data + """ + result = { + "info": config.config["coco_info"], + "licenses": config.config["coco_licenses"], + "images": _build_coco_images(layout_info, image_infos), + "annotations": _build_coco_annotations(layout_info), + "categories": _build_coco_categories(), + } + export_to_json(result, file_path) + + +def _build_coco_categories() -> List[Dict[str, Any]]: + """Build COCO format category information.""" + return [ + {"id": index, "name": category, "supercategory": supercategory} + for index, category, supercategory in config.config["category_name"] + ] + + +def _build_coco_images( + layout_info: Dict[int, List[Block]], image_infos: Dict[int, Dict[str, Any]] +) -> List[Dict[str, Any]]: + """Build COCO format image information.""" + return [ + { + "id": page_index, + "width": image_infos[page_index]["width"], + "height": image_infos[page_index]["height"], + "file_name": image_infos[page_index]["file_name"], + **config.config["coco_image_info"], + } + for page_index in layout_info + ] + + +def _build_coco_annotations( + layout_info: Dict[int, List[Block]] +) -> List[Dict[str, Any]]: + """Build COCO format annotation information.""" + annotations = [] + for page_index, page_elements in layout_info.items(): + for index, element in enumerate(page_elements): + width, height = element.width, element.height + annotations.append( + { + "id": index, + "image_id": page_index, + "category_id": element.category, + "segmentation": [], + "bbox": [element.bbox[0], element.bbox[1], width, height], + "area": width * height, + "iscrowd": 0, + } + ) + return annotations + + +def colorize(text: str, category_name: str) -> str: + """Colorize text based on category. 
+ + Args: + text: Text to colorize + category_name: Category determining color + + Returns: + Colorized LaTeX text + + Raises: + NotImplementedError: For unknown categories + """ + color = config.name2color[category_name] + + # Simple wrapping + if category_name in {"Table", "Title", "List", "Code"}: + return f"{{\\color{{{color}}}{text}}}" + + # Text coloring + if category_name in {"Text", "Text-EQ"}: + return f"{{\\textcolor{{{color}}}{{{text}}}}}" + + # Complex cases + if category_name in {"Caption", "Footnote"}: + index = text.find("{") + return f"{text[:index + 1]}{{\\color{{{color}}}{text[index + 1:]}}}" + + if category_name == "Algorithm": + prefix = text.find("\\", len("\\begin{algorithm}")) + suffix = text.find("\\end{algorithm}") + return ( + f"{text[:prefix]}{{\\color{{{color}}}{text[prefix:suffix]}}}{text[suffix:]}" + ) + + if category_name == "PaperTitle": + index = text.find("{") + return f"{text[:index + 1]}{{\\textcolor{{{color}}}{{{text[index + 1:]}}}}}" + + if category_name == "Equation": + return f"{{\\color{{{color}}}{{{text}}}}}" + + if category_name == "Abstract": + prefix = len("\\begin{abstract}") + return f"{{{text[:prefix]}\\color{{{color}}}{text[prefix:]}}}" + + raise NotImplementedError(f"Invalid category name: {category_name}") + + +def extract_main_content(tex_file: str) -> Tuple[str, int, int]: + """Extract the main content from a LaTeX file. + + Args: + tex_file: Path to the LaTeX file + + Returns: + Tuple containing: + - Main content between document tags + - Start position of main content in file + - End position of main content in file + + Raises: + ValueError: If document tags not found + """ + with open(tex_file) as f: + content = f.read() + + start = content.find("\\begin{document}") + end = content.find("\\end{document}") + + if start == -1 or end == -1: + raise ValueError("Document tags not found") + + start += len("\\begin{document}") + main_content = content[start:end] + + return main_content, start, end + + +def data_from_tex_file(tex_file: str) -> Tuple[List[Union[dict, str]], int, int]: + """Extract data from a TeX file using TexSoup. + + Args: + tex_file: Path to the TeX file + + Returns: + Tuple containing: + - Extracted data as list + - Start position of main content in file + - End position of main content in file + """ + main_content, start, end = extract_main_content(tex_file) + tex_tree = TexSoup(main_content).expr.all + data = conversion.to_list(tex_tree) + + return data, start, end + + +def tex_file_from_data( + data: List[Union[dict, str]], + tex_file: Union[str, Path], + start: int = 0, + end: int = -1, +) -> None: + """Generate a TeX file from TexSoup data. + + Args: + data: Data to convert to LaTeX + tex_file: Output TeX file path + start: Start position for content replacement + end: End position for content replacement + """ + with open(tex_file, "r") as f: + content = f.read() + + rendered_tex = conversion.to_latex(data) + content = content[:start] + rendered_tex + content[end:] + + with open(tex_file, "w") as f: + f.write(content) + + +def replace_nth(string: str, old: str, new: str, n: int) -> str: + """Replace the n-th occurrence of a substring. + + Args: + string: Original string + old: Substring to replace + new: Replacement substring + n: Which occurrence to replace (1-based) + + Returns: + Modified string with n-th occurrence replaced + + Example: + >>> replace_nth("Hello, hello, hello!", 'hello', 'hi', 2) + 'Hello, hello, hi!' 
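+
+        A second example, replacing the first occurrence:
+
+        >>> replace_nth("a-b-c", "-", "+", 1)
+        'a+b-c'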
+ """ + index = string.find(old) + count = int(index != -1) + + while index != -1 and count != n: + index = string.find(old, index + 1) + count += 1 + + if count == n: + return string[:index] + new + string[index + len(old) :] + + return string + + +def find_env(wrapped_env: dict, query: List[str]) -> Optional[str]: + """Find first matching environment variable from query list. + + Args: + wrapped_env: Dictionary of environment variables + query: List of environment variables to search for + + Returns: + First matching environment variable or None + """ + return next((env for env in query if env in wrapped_env), None) + + +def is_text_eq(text: str) -> bool: + """Check if text contains mathematical expressions. + + Args: + text: Text to check + + Returns: + True if contains math expressions, False otherwise + + Reference: + https://www.overleaf.com/learn/latex/Mathematical_expressions + """ + pattern = r"(\\\(.*?\\\))|(\$.*?\$)|(\\begin\{math\}.*?\\end\{math\})" + matches = re.findall(pattern, text) + + return any(not re.search(r"\\\$", match[0]) for match in matches) diff --git a/README.md b/README.md index f5b8a1d..e3764c1 100644 --- a/README.md +++ b/README.md @@ -1,185 +1,184 @@ -# vrdu_data_process -This repository is used to process paper with `.tex` source files to obtain: -1. object detection results -2. latex source code - visual bounding box pairs -3. layout reading orders. - - -# Installation -## Step 1 Install package -First create a conda environment (if Anaconda has not been installed, see [installation](https://docs.anaconda.com/free/anaconda/install/index.html)) -```shell -conda create --name vrdu python=3.8 -``` +# DocParser -Then activate the environment and install packages: -```shell -conda activate vrdu -pip install -e . -``` +A tool for processing academic papers with `.tex` source files to extract: + +1. Object detection results +2. LaTeX source code with visual bounding box pairs +3. Layout reading orders + +## Project Links + +- GitHub Repository: +- HuggingFace dataset: + +## Installation + +### Prerequisites + +1. **Python Environment** + - Python 3.8 or higher + - Anaconda (recommended) - [Installation Guide](https://docs.anaconda.com/free/anaconda/install/index.html) + +2. **TeX Live Distribution** + - Required for LaTeX compilation + - Installation guide available at [tug.org/texlive](https://www.tug.org/texlive/) + + For Ubuntu users: + + ```bash + sudo apt-get install texlive-full # Requires ~5.4GB disk space + ``` + + Note: `texlive-full` is recommended to avoid missing package errors. See [package differences](https://tex.stackexchange.com/a/504566). + +### Setup + +1. Create and activate conda environment: -## Step 2 Install TexLive -To compile latex, we need to install **Tex Live Distribution**, where you can find installation guide on [this page](https://www.tug.org/texlive/). + ```bash + conda create --name doc_parser python=3.8 + conda activate doc_parser + ``` -For Ubuntu, we recommend install `texlive-full` by running the following command on terminal (Requires ~5.4GB disk space) -```shell -sudo apt-get install texlive-full -``` -this version avoids missing package error, to see differences among versions, see [Differences between texlive packages in Linux](https://tex.stackexchange.com/a/504566) +2. Install the package: -# Usage -```python + ```bash + pip install -e . 
+ ``` + +## Usage + +Run the parser on your LaTeX file: + +```bash python main.py --file_name path_to_paper/paper.tex ``` -the script then generates the bounding box of the following categories and their corresponding content (if there are text inside the bounding box): -1. layout annotation, with a bounding box around each semantic element, such as table, text paragraph, equation, etc. -2. reading annotation, which is a pair that links the bounding box and corresponding latex source code. -the result is stored in the `path_to_paper/output/result`, the folder structure is given as follows: -```shell -path_to_paper -├── output -│   └── result -│   ├── layout_annotation.json -│   ├── reading_annotation.json -│   ├── ordering_annotation.json -│   ├── quality_report.json -│   ├── texts.json -│   ├── env_orders.json -│   ├── layout_info.json -│   ├── layout_metadata.json -│   ├── raw_parsed_data.json -│   ├── page_0.jpg -| ├── page_1.jpg -| ├── block_0.jpg -└─ └── block_1.jpg +### Output Structure -``` -The result contains three parts: -1. Object detection result, which includes `layout_annotation.json` and `page_{n}.png`, the result is is represented as [COCO format](https://cocodataset.org/#format-data) -2. Reading detection result, which includes `reading_annotation.json` and `block_{n}.png`, it matches the bounding box and its original tex represented contents -3. Reading order result, which includes `ordering_annotation.json`. The reading order is represented via triple (`relationship`, `from`, `to`), indicates the relationship between the block with id `from` and the block with id `to`. -4. Debugging infos, this parts contains: - - `texts.json`, it contains the original tex contents - - `env_orders.json`, it is used to annotate reading orders - - `layout_info.json`, it is the raw content of object detection result - - `layout_metadata.json`, it contains the information about the paper layouts - - `raw_parsed_data.json`, it contains the result of main content of the tex file parsed by `TexSoup`. - -## Common issues -### 1. `latexpand` command running error -``` -ValueError: Failed to run the command "latexpand --output="/tmp/arxiv_cleaner.46fp5l_e.latexpand_output/paper_original.tex" --fatal --out-encoding="encoding(UTF-8)" "paper_original.tex"" -Return code: 2 -``` -if this error occurs, please check the version of installed `latexpand` with -``` -latexpand --help -``` -in the last line of output will print the version. If the version is below $1.6$, then we need to upgrade it to $\geq1.6$, the simplest way is -1. go to [latexpand v1.6](https://gitlab.com/latexpand/latexpand/-/tags/v1.6) download the source code -2. use `sudo vim $(which latexpand)` to edit the content of `latexpand` script (`sudo` is necessary since `latexpand` usually locates in `/usr/bin`) -3. copy the content of `v1.6/latexpand` to the old version of `latexpand` (opened with vim) +Results are stored in `path_to_paper/output/result`: -### 2. `pdf2image` error ``` -pdf2image.exceptions.PDFInfoNotInstalledError: Unable to get page count. Is poppler installed and in PATH? -``` -use the following command to install `poppler`: -``` -sudo apt-get install poppler-utils +path_to_paper +├── output +│ ├── paper_colored/ # Rendered paper images +│ │ ├── thread-0001-page-01.jpg +│ │ └── ... 
+│ └── result/ +│ ├── layout_annotation.json # Object detection results (COCO format) +│ ├── reading_annotation.json # Bounding box to LaTeX source mapping +│ ├── ordering_annotation.json # Reading order relationships +│ ├── quality_report.json +│ ├── texts.json # Original tex contents +│ ├── layout_info.json # Raw detection results +│ ├── layout_metadata.json # Paper layout information +│ ├── page_*.jpg # Pages with bounding boxes +│ └── block_*.jpg # Individual block images ``` -for details, see [reference](https://pdf2image.readthedocs.io/en/latest/installation.html#installing-poppler). -### 3. `path_to_paper/block_*****.pdf` not found -Usually, this means the rendering process destroys the original latex, therefore it is not compilable, the reason varies from case to case. +### Output Components +1. **Object Detection Results** + - `layout_annotation.json` and `page_*.jpg` + - Uses [COCO format](https://cocodataset.org/#format-data) -# Documentation -The documentation is built with [Sphinx](https://www.sphinx-doc.org/en/master/), to build documentation, run the following commands: -``` +2. **Reading Detection Results** + - `reading_annotation.json` + - Maps bounding boxes to original LaTeX content + +3. **Reading Order Results** + - `ordering_annotation.json` + - Defines relationships between blocks using triples: (relationship, from, to) + +## Categories + +Each bounding box is classified into one of these categories: + +| Category | Name | Super Category | Description | +|----------|------|----------------|-------------| +| 0 | Algorithm | Algorithm | Algorithm environments | +| 1 | Caption | Caption | Figure, Table, Algorithm captions | +| 2 | Equation | Equation | Display equations (equation, align) | +| 3 | Figure | Figure | Figures | +| 4 | Footnote | Footnote | Footnotes | +| 5 | List | List | itemize, enumerate, description | +| 6 | Others | Others | Currently unused | +| 7 | Table | Table | Tables | +| 8 | Text | Text | Plain text without equations | +| 9 | Text-EQ | Text | Text with inline equations | +| 10 | Title | Title | Section/subsection titles | +| 11 | Reference | Reference | References | +| 12 | PaperTitle | Title | Paper title | +| 13 | Code | Algorithm | Code listings | +| 14 | Abstract | Text | Paper abstract | + +## Troubleshooting + +### Common Issues + +1. **Latexpand Error** + + ```bash + ValueError: Failed to run the command "latexpand..." + ``` + + Solution: + - Check latexpand version: `latexpand --help` + - If < 1.6, upgrade using: + 1. Download from [latexpand v1.6](https://gitlab.com/latexpand/latexpand/-/tags/v1.6) + 2. Update existing script: `sudo vim $(which latexpand)` + +2. **PDF2Image Error** + + ```bash + PDFInfoNotInstalledError: Unable to get page count + ``` + + Solution: + + ```bash + sudo apt-get install poppler-utils + ``` + +3. **Missing Block PDF** + - If `block_*.pdf` is missing, the LaTeX rendering likely failed + - This is case-specific and requires manual investigation + +## Known Limitations + +1. **Custom Environments**: Some custom environments (e.g., `\newtheorem{defn}[thm]{Definition}`) require manual addition to `envs.text_envs` +2. **Rendering Issues**: Some environments may fail during PDF compilation +3. **Special Figures**: TikZ and similar formats may not be correctly classified + +## Documentation + +Build the documentation using Sphinx: + +```bash cd docs sphinx-build . _build ``` -then the documentations are listed in `docs/_build`, which can be viewed by open `index.html` with a browser. 
- -# Category -each bounding box is classified into one the following category. - -| Category | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14| -| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |--- |--- |--- |---| -| **Name** | Algorithm | Caption | Equation | Figure | Footnote | List | Others | Table | Text | Text-EQ | Title | Reference | PaperTitle | Code | Abstract| -| **Super Category** | Algorithm | Caption | Equation | Figure | Footnote | List | Others | Table | Text | Text | Title | Reference | Title | Algorithm | Text| - -Explanation: -- `Algorithm` contains Algorithm environment -- `Code` contains listing environments -- `Caption` contains Figure caption, Table caption and Algorithm caption -- `Equation` contains all display equations such as `equation`, `align` environments. -- `List` contains `itemize`, `enumerate` and `description`. -- `Others` Currently there is no element that is classified into Others -- `Text` refers to a paragraph of texts without inline equations, -- `Text-EQ` refers to text with equations, such as `$a$`. -- `Title` contains section title, subsection title. Others titles are ignored. -- `PaperTitle` contains paper title. - - -For more details, see `config/envs.py`. - -# Pipeline -1. Preprocess the original tex file (copy), this includes two substeps: - - resolve inputs and clean comments with `arxiv_cleaner` - - convert all pdf figures into png format - - delete table of contents -2. render tex file, this process first call `TexSoup` to parse tex files into a list, then add a color to each semantic element. This process generates a bunch of tex files, each tex file is different with the original colored tex file in a small part -3. Compile these tex files into PDFs and further transform the PDFs into png images. -4. Extract the layout metadata of PDF, so that one-column and multi-column can be classified. -5. Generating bounding box for each semantic elements, generation is composed of two methods: - - For `Figure` elements, we use `PDFMiner` to get the bounding box - - For other semantic elements, we use the difference of two images to get the bounding box -6. By linking the bounding box and its related latex source code, we obtain the reading annotations. -7. After processing, we remove all redundant files. - -# Update log -## 2023.12 -- [x] fix known bugs -- [x] add new categories -- [x] add quality report - - - -## 2023.11 -- [x] release v0.2 that correctly annotate all environments. - - [x] fix pdf figure bounding box generation error - - [x] fix cross column environments bounding box generation error - - [x] fix pdfminer cannot match source with bb error - - [x] fix pdfminer cannot accurately generate bounding box error - - - [x] feat: add bb-source_code match algorithm - - -## 2023.10 -- [x] release v0.1 that can handle algorithm, equation, table environments. - -## 2023.09 -- [x] extract elements in '.tex' files -- [x] fix environment with argument parsing error. -- [x] fix align environment rendering error -- [x] fix list environment parsing error - - -# Known Issues -1. Some customized environments will not be annotated, for example, `\newtheorem{defn}[thm]{Definition}`. This can be solved by adding the customized environment to `envs.text_envs`, then the environment will be annotated. -2. Rendering error, this happens when we render a environment successfully, but we cannot compile the rendered tex file into a PDF. This is still an open problem. -3. 
Some figures such as `tikz` format, will not be correctly classified, this may cause further error. - -# Acknowledgements -This project is based on the following python packages: -- [Texsoup](https://texsoup.alvinwan.com/) -- [pdf2image](https://pypi.org/project/pdf2image/) -- [pdfminer.six](https://pdfminersix.readthedocs.io/en/latest/index.html) + +View the documentation by opening `docs/_build/index.html` in a browser. + +## Acknowledgements + +Built using: + +- [Texsoup](https://texsoup.alvinwan.com/) +- [pdf2image](https://pypi.org/project/pdf2image/) +- [pdfminer.six](https://pdfminersix.readthedocs.io/en/latest/index.html) - [arxiv_cleaner](https://github.com/elsa-lab/arxiv-cleaner.git) +# Citation +if you found this package useful, please cite: -# License +```bibtex +@article{xia2024docgenome, + title={DocGenome: An Open Large-scale Scientific Document Benchmark for Training and Testing Multi-modal Large Language Models}, + author={Xia, Renqiu and Mao, Song and Yan, Xiangchao and Zhou, Hongbin and Zhang, Bo and Peng, Haoyang and Pi, Jiahao and Fu, Daocheng and Wu, Wenjie and Ye, Hancheng and others}, + journal={arXiv preprint arXiv:2406.11633}, + year={2024} +} +``` diff --git a/data/discpline_info.csv b/data/discpline_info.csv deleted file mode 100644 index 0a60b55..0000000 --- a/data/discpline_info.csv +++ /dev/null @@ -1,154 +0,0 @@ -discpline,num_papers,success,failure,processed,status -hep-ph,69164,0.0,0.0,0.0,TBD -quant-ph,66714,0.0,0.0,0.0,TBD -hep-th,54047,0.0,0.0,0.0,TBD -cs.CV,53498,0.0,0.0,0.0,TBD -cs.LG,49171,0.0,0.0,0.0,TBD -gr-qc,38174,0.0,0.0,0.0,TBD -cond-mat.mes-hall,37376,0.0,0.0,0.0,TBD -astro-ph.GA,37065,0.0,0.0,0.0,TBD -astro-ph.SR,36193,0.0,0.0,0.0,TBD -astro-ph.CO,34882,0.0,0.0,0.0,TBD -astro-ph.HE,31842,0.0,0.0,0.0,TBD -cond-mat.str-el,28712,0.0,0.0,0.0,TBD -cs.IT,26643,0.0,0.0,0.0,TBD -cs.CL,25202,0.0,0.0,0.0,TBD -cond-mat.mtrl-sci,24458,0.0,0.0,0.0,TBD -math-ph,22455,0.0,0.0,0.0,TBD -cond-mat.stat-mech,21981,0.0,0.0,0.0,TBD -nucl-th,18004,0.0,0.0,0.0,TBD -astro-ph,17200,0.0,0.0,0.0,TBD -astro-ph.EP,15999,0.0,0.0,0.0,TBD -cond-mat.soft,15697,0.0,0.0,0.0,TBD -cond-mat.supr-con,15066,0.0,0.0,0.0,TBD -hep-ex,13033,0.0,0.0,0.0,TBD -astro-ph.IM,12721,0.0,0.0,0.0,TBD -cond-mat.quant-gas,11842,0.0,0.0,0.0,TBD -cs.RO,11700,0.0,0.0,0.0,TBD -cs.CR,11659,0.0,0.0,0.0,TBD -cs.DS,11282,0.0,0.0,0.0,TBD -cs.AI,10798,0.0,0.0,0.0,TBD -stat.ML,10516,0.0,0.0,0.0,TBD -math.NA,9458,0.0,0.0,0.0,TBD -cs.NI,9055,0.0,0.0,0.0,TBD -hep-lat,8742,0.0,0.0,0.0,TBD -eess.SP,8584,0.0,0.0,0.0,TBD -cs.DC,8203,0.0,0.0,0.0,TBD -cs.LO,7619,0.0,0.0,0.0,TBD -nucl-ex,7077,0.0,0.0,0.0,TBD -eess.IV,6976,0.0,0.0,0.0,TBD -cs.SI,6638,0.0,0.0,0.0,TBD -math.OC,6442,0.0,0.0,0.0,TBD -eess.SY,6337,1378.0,806.0,2184.0,processing -cs.SE,6177,1379.0,783.0,2162.0,complete -cond-mat.dis-nn,4938,2102.0,1563.0,3665.0,complete -cs.IR,5350,1545.0,599.0,2144.0,complete -physics.ins-det,4967,1346.0,961.0,2307.0,processing -cs.GT,4808,1115.0,681.0,1796.0,complete -math.PR,3190,618.0,665.0,1283.0,complete -physics.soc-ph,3764,2119.0,1229.0,3348.0,processing -cs.SY,3822,2123.0,1258.0,3381.0,complete -math.CO,2687,850.0,479.0,1329.0,processing -math.AP,2036,490.0,570.0,1060.0,complete -cs.HC,4204,2817.0,1011.0,3828.0,complete -cs.NE,3690,2428.0,1074.0,3502.0,complete -cs.CC,2873,1750.0,1053.0,2803.0,processing -cs.CY,3550,2409.0,952.0,3361.0,complete -cs.DM,2667,1544.0,1030.0,2574.0,complete -eess.AS,3567,2692.0,718.0,3410.0,processing -cs.DB,3350,1538.0,1101.0,2639.0,processing 
-physics.optics,3130,1863.0,1124.0,2987.0,complete -cond-mat.other,2689,1514.0,1131.0,2645.0,complete -cs.PL,3234,841.0,1109.0,1950.0,processing -cs.SD,3329,2431.0,768.0,3199.0,complete -cs.CG,2963,1644.0,992.0,2636.0,processing -physics.atom-ph,2416,1411.0,926.0,2337.0,complete -physics.comp-ph,2357,1408.0,854.0,2262.0,complete -physics.chem-ph,2203,1323.0,741.0,2064.0,complete -physics.flu-dyn,2124,1185.0,874.0,2059.0,complete -cs.FL,1843,1000.0,804.0,1804.0,complete -math.DG,836,464.0,359.0,823.0,complete -cs.CE,1826,1102.0,655.0,1757.0,complete -cs.MA,1457,880.0,494.0,1374.0,complete -physics.bio-ph,1317,790.0,468.0,1258.0,complete -cs.GR,1446,827.0,539.0,1366.0,complete -math.AG,700,433.0,262.0,695.0,complete -econ.EM,1359,770.0,533.0,1303.0,complete -q-fin.ST,1164,732.0,427.0,1159.0,complete -nlin.SI,533,337.0,192.0,529.0,complete -cs.AR,1318,750.0,496.0,1246.0,complete -math.DS,869,530.0,329.0,859.0,complete -math.ST,1131,603.0,512.0,1115.0,complete -physics.plasm-ph,1037,606.0,397.0,1003.0,complete -math.SP,444,228.0,202.0,430.0,complete -stat.ME,1135,647.0,446.0,1093.0,complete -q-fin.MF,836,481.0,348.0,829.0,complete -physics.app-ph,1185,709.0,398.0,1107.0,complete -math.QA,351,211.0,140.0,351.0,complete -cs.MM,1069,758.0,257.0,1015.0,complete -cs.ET,1088,690.0,335.0,1025.0,complete -q-bio.NC,965,558.0,368.0,926.0,complete -q-fin.PR,690,355.0,332.0,687.0,complete -physics.gen-ph,432,254.0,175.0,429.0,complete -econ.GN,979,535.0,366.0,901.0,complete -nlin.CD,778,410.0,362.0,772.0,complete -cs.DL,893,543.0,314.0,857.0,complete -physics.data-an,851,443.0,385.0,828.0,complete -cs.SC,670,329.0,314.0,643.0,complete -q-bio.PE,830,422.0,384.0,806.0,complete -q-fin.RM,682,375.0,302.0,677.0,complete -econ.TH,759,374.0,345.0,719.0,complete -physics.hist-ph,501,280.0,210.0,490.0,complete -cs.NA,759,421.0,321.0,742.0,complete -q-fin.GN,647,418.0,228.0,646.0,complete -q-fin.CP,723,389.0,322.0,711.0,complete -q-fin.PM,609,194.0,411.0,605.0,complete -stat.AP,767,456.0,285.0,741.0,complete -q-fin.TR,685,390.0,285.0,675.0,complete -q-bio.QM,766,446.0,273.0,719.0,complete -cs.PF,732,435.0,278.0,713.0,complete -cs.MS,728,388.0,309.0,697.0,complete -math.CA,308,193.0,116.0,309.0,complete -math.NT,314,210.0,94.0,304.0,complete -physics.class-ph,439,269.0,167.0,436.0,complete -math.LO,458,269.0,180.0,449.0,complete -math.FA,225,131.0,91.0,222.0,complete -physics.space-ph,603,298.0,281.0,579.0,complete -math.RT,222,136.0,86.0,222.0,complete -cs.OH,534,350.0,165.0,515.0,complete -nlin.PS,522,340.0,175.0,515.0,complete -q-bio.BM,436,262.0,154.0,416.0,complete -nlin.AO,387,225.0,153.0,378.0,complete -physics.med-ph,429,262.0,141.0,403.0,complete -math.OA,119,69.0,48.0,117.0,complete -physics.acc-ph,372,197.0,160.0,357.0,complete -stat.CO,392,195.0,179.0,374.0,complete -physics.geo-ph,361,220.0,132.0,352.0,complete -math.RA,125,82.0,40.0,122.0,complete -math.AT,248,119.0,126.0,245.0,complete -math.GT,232,128.0,104.0,232.0,complete -q-bio.MN,277,160.0,103.0,263.0,complete -math.GR,175,91.0,57.0,148.0,complete -math.SG,139,66.0,70.0,136.0,complete -cs.OS,269,141.0,121.0,262.0,complete -math.MG,220,138.0,79.0,217.0,complete -physics.ao-ph,265,139.0,105.0,244.0,complete -q-fin.EC,187,121.0,65.0,186.0,complete -physics.pop-ph,189,112.0,72.0,184.0,complete -math.CT,233,88.0,110.0,198.0,complete -q-bio.GN,222,131.0,73.0,204.0,complete -physics.ed-ph,178,98.0,77.0,175.0,complete -math.CV,61,30.0,30.0,60.0,complete -physics.atm-clus,117,78.0,38.0,116.0,complete -q-bio.CB,114,65.0,44.0,109.0,complete 
-q-bio.SC,122,73.0,43.0,116.0,complete
-math.HO,81,50.0,30.0,80.0,complete
-math.AC,55,36.0,18.0,54.0,complete
-nlin.CG,90,55.0,35.0,90.0,complete
-stat.OT,46,28.0,18.0,46.0,complete
-q-bio.TO,48,34.0,14.0,48.0,complete
-math.KT,18,10.0,8.0,18.0,complete
-cs.GL,53,21.0,7.0,28.0,complete
-math.GN,25,17.0,8.0,25.0,complete
-q-bio.OT,28,12.0,16.0,28.0,complete
-math.GM,8,2.0,6.0,8.0,complete
diff --git a/dataset_readme.md b/dataset_readme.md
new file mode 100644
index 0000000..c5ab030
--- /dev/null
+++ b/dataset_readme.md
@@ -0,0 +1,98 @@
+# File structure
+
+Here is an example of the file structure of the dataset for the discipline `math.GM`.
+
+```bash
+math.GM
+├── 0906.1099
+│   ├── layout_annotation.json
+│   ├── order_annotation.json
+│   ├── page_xxxx.jpg
+│   ├── quality_report.json
+│   └── reading_annotation.json
+└── 2103.02443
+    ├── layout_annotation.json
+    ├── order_annotation.json
+    ├── page_xxxx.jpg
+    ├── quality_report.json
+    └── reading_annotation.json
+```
+
+Each paper folder, for example `math.GM/2103.02443`, contains five parts:
+
+1. `page_xxxx.jpg`: these images represent the pages of the paper; the page index is contained in the filename. Notice that the pages might differ from the original paper.
+2. `layout_annotation.json`: this JSON file contains the layout annotation of each page in COCO format.
+3. `reading_annotation.json`: this JSON file contains the LaTeX source code for each block (except Figure). Notice that the LaTeX source code may contain macros.
+4. `order_annotation.json`: this JSON file contains the relationships between blocks in triple format.
+5. `quality_report.json`: this JSON file contains the quality metrics for each page and for the whole paper, for further use.
+
+# Layout annotation
+
+## Layout annotation category
+
+| **Index** | **Category** | **Notes** |
+|----------------|-------------------|------------------------------------------|
+| 0 | Algorithm | |
+| 1 | Caption | Titles of Images, Tables, and Algorithms |
+| 2 | Equation | |
+| 3 | Figure | |
+| 4 | Footnote | |
+| 5 | List | |
+| 7 | Table | |
+| 8 | Text | |
+| 9 | Text-EQ | Text block with inline equations |
+| 10 | Title | Section titles |
+| 12 | PaperTitle | |
+| 13 | Code | |
+| 14 | Abstract | |
+
+## Known Issues
+
+1. The IoU of bounding boxes is too large. This happens when the paper template is too complex.
+2. The category of a bounding box is incorrect. This happens when user-defined macros are used. For example, some authors may use `\newcommand{\beq}{\begin{equation}}` and `\newcommand{\eeq}{\end{equation}}`; in this case, the equation may be detected as the `Text` class.
+3. A bounding box is missing. This happens when rare packages are used; some rare packages may not be identified by our rule-based methods.
+4. Bounding boxes are correct but overlap slightly with adjacent bounding boxes. This happens due to layout adjustments, for example the `vspace` and `input` commands.
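+
+To spot-check the layout annotation, one can count the boxes per category straight from the COCO file. The snippet below is a minimal sketch, not part of the toolkit: the paper path reuses the example above, and only the Python standard library is assumed.
+
+```python
+import json
+from collections import Counter
+
+# Load one paper's layout annotation (COCO format).
+with open("math.GM/2103.02443/layout_annotation.json") as f:
+    coco = json.load(f)
+
+# Map category ids to names, then tally the annotations per category.
+id2name = {cat["id"]: cat["name"] for cat in coco["categories"]}
+counts = Counter(id2name[ann["category_id"]] for ann in coco["annotations"])
+
+for name, num in counts.most_common():
+    print(f"{name}: {num}")
+```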
+
+# Order annotation category
+
+## Category Definition
+
+| **Category** | **Description** | **Example** |
+|----------------|-------------------|------------------------------------------|
+| identical | two blocks corresponding to the same LaTeX code chunk | paragraphs that cross columns or pages |
+| peer | two blocks that both belong to the Title category | \section{introduction}, \section{method} |
+| sub | one block is logically a child of another block | \section{introduction} and the first paragraph of the Introduction section |
+| adj | two adjacent Text blocks | Paragraph1 and Paragraph2 |
+| explicit-cite | one block cites another block with `\ref` | As shown in \ref{Fig: 5}. |
+| implicit-cite | the caption block and its corresponding float environment | `\begin{table}\caption{A}\begin{tabular}B\end{tabular}\end{table}`, then A implicit-cites B |
+
+## Order annotation representation
+
+Each `order_annotation.json` contains two fields:
+
+1. `annotations`: the block information for each block; the `block_id` of each block is used to represent the relationships.
+2. `orders`: a list of triples, where each triple consists of:
+    1. `type`: the category of the relationship; see the table above for details.
+    2. `from`: the `block_id` of the starting block of the relationship.
+    3. `to`: the `block_id` of the ending block of the relationship.
+
+## Known Issues
+
+1. The `order_annotation.json` file of some papers may not contain the field `annotations`, for unknown reasons.
+2. `order_annotation.json` doesn't contain the `implicit-cite` relationship; the `implicit-cite` relationship is only used in the test dataset for efficiency considerations.
+3. `explicit-cite` only supports `Equation`; support for `Table` and `Figure` was developed after the training dataset was completed.
+
+# Quality report
+
+This file contains the rule-based quality checks for further use. The fields are explained as follows:
+
+1. `num_pages`: the number of pages of the paper.
+2. `num_columns`: 1 (single column) or 2 (two columns), determined from the last page of the paper.
+3. `category_quality`: for each category we record the number of rendered LaTeX code chunks (`reading_count`) and the number of detected bounding boxes (`geometry_count`); `missing_rate` is then computed as `(reading_count - geometry_count)/reading_count`. Finally, the `Total` category is the aggregate of all other categories.
+4. `page_quality`: overlap information for each page and for the whole paper:
+    1. `page`: page index
+    2. `num_blocks`: the number of bounding boxes on this page
+    3. `area`: the sum of the areas of all blocks, $\sum_i \text{area}(\text{bbox}_i)$
+    4. `overlap`: the sum of the pairwise intersection areas of all blocks, $\sum_i\sum_{j>i} \text{area}(\text{bbox}_i \cap \text{bbox}_j)$
+    5. `ratio`: the ratio between `overlap` and `area`. Note that this ratio may be very large if there is a template issue.
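+
+For downstream filtering, the quality report can be consumed directly. The snippet below is a minimal sketch; the paper path reuses the example above, and the 0.1 threshold is an illustrative assumption:
+
+```python
+import json
+
+with open("math.GM/2103.02443/quality_report.json") as f:
+    report = json.load(f)
+
+# Flag pages whose block overlap ratio looks suspicious.
+for page in report["page_quality"]:
+    if page["page"] != "total" and page["ratio"] > 0.1:
+        print(f"page {page['page']}: overlap ratio {page['ratio']:.3f}")
+```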
+ diff --git a/scripts/app.py b/scripts/app.py deleted file mode 100644 index 549d682..0000000 --- a/scripts/app.py +++ /dev/null @@ -1,120 +0,0 @@ -import panel as pn -import os -import glob -from PIL import Image, ImageDraw - -from vrdu import utils -from vrdu.config import config - -pn.extension() - - -data_path = ( - "/cpfs01/shared/ADLab/ADLab_hdd/vrdu_arxiv/vrdu_autolabel_final_3_120w/nlin.AO" -) -default_path = os.path.expanduser( - "/cpfs01/shared/ADLab/ADLab_hdd/vrdu_arxiv/vrdu_autolabel_final_3_120w/nlin.AO/0809.2301" -) - -# get all renderable paper paths -paper_paths = [] -for root, dirs, files in os.walk(data_path): - if "order_annotation.json" in files: - paper_paths.append(root) - -# generate select widget from paper paths -paper_select = pn.widgets.Select(value=default_path, options=paper_paths) - -# load layout info from a given paper -layout_info = utils.load_json(os.path.join(default_path, "order_annotation.json")) -layout_info = layout_info["annotations"] - -# get all image paths of a given paper -image_paths = sorted(glob.glob(os.path.join(default_path, "original-page-*.jpg"))) - -# generate select widget from image paths -image_select = pn.widgets.Select(value=image_paths[0], options=image_paths) -image_pane = pn.pane.PNG() -image_pane.height = 800 -image_pane.width = 600 - -# generate pane to display source code -source_code_pane = pn.Column("# Source Code") - -# generate select widget to show annotations of different categories -category_select = pn.widgets.Select( - value="All", options=["All"] + list(config.name2category.keys()) -) - - -@pn.depends(paper_select.param.value) -def update_paper(path): - global layout_info - layout_info = utils.load_json(os.path.join(path, "order_annotation.json")) - layout_info = layout_info["annotations"] - image_paths = sorted(glob.glob(os.path.join(path, "original-page-*.jpg"))) - image_select.options = image_paths - - -@pn.depends(image_select.param.value) -def update_image(image_path): - image = Image.open(image_path) - image_pane.object = image - image_pane.width = image.size[0] - image_pane.height = image.size[1] - - -@pn.depends(image_select.param.value, category_select.param.value) -def update_annotation(image_path, category): - if not image_path: - return - source_code_pane.clear() - source_code_pane.append("# Source Code") - image = Image.open(image_path) - draw = ImageDraw.Draw(image) - image_id = int(os.path.splitext(os.path.basename(image_path))[0][-4:]) - print(f"image_id={image_id}") - - # filter blocks - if category == "All": - print(layout_info) - blocks = [block for block in layout_info if block["page_index"] == image_id] - else: - blocks = [ - block - for block in layout_info - if block["page_index"] == image_id - if block["category"] == config.name2category[category] - ] - - for index, block in enumerate(blocks): - bbox = ( - block["bbox"][0], - block["bbox"][1], - block["bbox"][2], - block["bbox"][3], - ) - draw.rectangle(bbox, outline="red", width=3) - if block["parent_block"] is None: - source_code_pane.append("* " + block["source_code"]) - - image_pane.object = image - - -app = pn.Row( - image_pane, - pn.Column( - pn.Row("# Paper", paper_select), - pn.Row("# Image", image_select), - pn.Row("# Category", category_select), - source_code_pane, - ), - update_paper, - update_image, - update_annotation, -) - -app.servable() - -# use the following command to visualize -# panel serve app.py --show --autoreload diff --git a/scripts/arxiv_download.py b/scripts/arxiv_download.py index 9b188ee..e6de1f5 100644 --- 
a/scripts/arxiv_download.py
+++ b/scripts/arxiv_download.py
@@ -1,84 +1,146 @@
+import argparse
 import arxiv
 import os
-from typing import List, Dict
-from tqdm import tqdm
+from typing import List
 import tarfile
 
-from vrdu import utils
-from vrdu import logger
+from DocParser.logger import logger
 
-log = logger.setup_app_level_logger(file_name="arxiv_download.log")
+log = logger.setup_app_level_logger(logger_name="arxiv_download.log")
 
 
-def arxiv_download(data: List[Dict], path: str) -> None:
-    """Download papers from arXiv based on category data.
-    This function takes a list of category download tasks and a base
-    path. For each category item in the data, it will:
-
-    1. Create a subdirectory under the given path for that category
-    2. Check if there are already enough papers in the subdir
-    3. Search arXiv for the category
-    4. Download up to the requested count of newest papers
-    5. Save each paper to the category subdirectory
+def download_papers_source_with_paper_id(
+    path: str, discipline: str, paper_ids: List[str]
+) -> None:
+    """
+    Downloads papers from the Arxiv repository based on the specified paper IDs.
 
-    Arguments:
-        data (List[Dict]): List of dicts with keys "category" and "count"
-        path (str): Base directory path to save papers
+    Args:
+    - path (str): The path where the downloaded papers will be saved.
+    - discipline (str): The discipline of the papers to be downloaded.
+    - paper_ids (List[str]): A list of paper IDs to be downloaded.
 
     Returns:
-        None
+    None
+
+    Raises:
+    - tarfile.ReadError: cannot unpack the .tar.gz file
+
+    Usage:
+    ```python
+    download_papers_source_with_paper_id(path, discipline, paper_ids)
+    ```
+
+    """
    client = arxiv.Client()
-    for row in tqdm(data):
-        if row["auto_annotated_paper_path"]:
-            continue
-        discipline = row["discipline"]
-        discipline_path = os.path.join(path, discipline)
-        os.makedirs(discipline_path, exist_ok=True)
+    discipline_path = os.path.join(path, discipline)
+    os.makedirs(discipline_path, exist_ok=True)
+
+    search_results = client.results(arxiv.Search(id_list=paper_ids))
+
+    for result in search_results:
+        # extract {id} without version from http://arxiv.org/abs/{id},
+        # i.e. the last path segment of the entry URL
+        paper_id = result.entry_id.split("/")[-1].split("v")[0]
+        log.info(f"Downloading paper {paper_id}")
 
-        if os.path.exists(os.path.join(discipline_path, row["paper_id"])):
-            log.debug(f'{os.path.join(discipline_path, row["paper_id"])} exists')
+        tar_file_path = result.download_source(dirpath=discipline_path)
+        log.info(f"Downloading tar file {tar_file_path}")
+        paper_path = os.path.join(discipline_path, paper_id)
+        if os.path.exists(paper_path):
             continue
 
-        if os.path.exists(os.path.join(discipline_path, row["paper_id"], ".tar.gz")):
-            log.debug(
-                f'{os.path.join(discipline_path, row["paper_id"], ".tar.gz")} exists'
-            )
+        try:
+            with tarfile.open(tar_file_path, "r:gz") as tar:
+                tar.extractall(paper_path)
+        except tarfile.ReadError:
+            log.error(f"{tar_file_path} is not a tar.gz file")
             continue
 
-        search_results = client.results(arxiv.Search(id_list=[row["paper_id"]]))
+
+def download_papers_pdf_with_paper_id(path: str, discipline: str, paper_ids: List[str]):
+    client = arxiv.Client()
+    discipline_path = os.path.join(path, discipline)
+    os.makedirs(discipline_path, exist_ok=True)
+
+    search_results = client.results(arxiv.Search(id_list=paper_ids))
+
+    for result in search_results:
+        # extract {id} without version from http://arxiv.org/abs/{id}
+        paper_id = result.entry_id.split("/")[-1].split("v")[0]
+        log.info(f"Downloading paper {paper_id}")
+
+        pdf_path =
result.download_pdf(dirpath=discipline_path) + log.info(f"Downloaded pdf file {pdf_path}") + + +def download_batch_papers(path: str, discipline: str, num_papers: int) -> None: + """ + Downloads a batch of papers from the Arxiv repository + based on the specified discipline and number of papers. + + Args: + - path (str): The path where the downloaded papers will be saved. + - discipline (str): The discipline of the papers to be downloaded. + - num_papers (int): The number of papers to be downloaded. + + Returns: + None + + Raises: + None + + Usage: + ```python + download_batch_papers(output_path, discipline, num_papers) + ``` + + """ + log.debug(f"path: {path}, discipline: {discipline}, num_papers: {num_papers}") + client = arxiv.Client() + + paper_ids = [] + while num_papers > 0: + search_results = client.results( + arxiv.Search(query=discipline, max_results=num_papers) + ) for result in search_results: - tar_file_path = result.download_source(dirpath=discipline_path) - log.debug(f"Downloading tar file {tar_file_path}") - paper_path = os.path.join(discipline_path, row["paper_id"]) - try: - with tarfile.open(tar_file_path, "r:gz") as tar: - tar.extractall(paper_path) - except tarfile.ReadError: - log.error(f"{tar_file_path} is not a tar.gz file") - continue + paper_id = result.entry_id.split("/")[-1].split("v")[0] + log.debug(f"Downloading paper {paper_id}") + if paper_id not in paper_ids: + paper_ids.append(paper_id) + num_papers -= 1 + download_papers_pdf_with_paper_id(path, discipline, paper_ids) -def main(): - import argparse +def main() -> None: parser = argparse.ArgumentParser() parser.add_argument( "-p", "--path", type=str, required=True, help="Path to save result" ) parser.add_argument( - "-f", "--file", type=str, required=True, help="json file for saving result" + "-d", "--discipline", type=str, default="cs.CV", help="discipline to download" + ) + parser.add_argument( + "-i", "--num_papers", type=int, default=1, help="Number of papers to download" ) args = parser.parse_args() - output_path, json_file = args.path, args.file + output_path, discipline, num_papers = args.path, args.discipline, args.num_papers + + import json + + with open("/cpfs01/user/maosong/vrdu_data_process/data/discipline_map.json") as f: + discipline_map = json.load(f) - json_data = utils.load_json(json_file) + disciplines = [x for value in discipline_map.values() for x in value] - arxiv_download(json_data, output_path) + for discipline in disciplines: + log.debug("Downloading discipline %s", discipline) + download_batch_papers(output_path, discipline, num_papers) if __name__ == "__main__": diff --git a/batch_process.py b/scripts/batch_process.py similarity index 69% rename from batch_process.py rename to scripts/batch_process.py index 2505a28..6499751 100644 --- a/batch_process.py +++ b/scripts/batch_process.py @@ -2,35 +2,40 @@ import argparse import multiprocessing import shutil -from typing import List, Optional -from uuid import uuid4 +from typing import List import pandas as pd -from vrdu import logger -from vrdu import utils -from main import process_one_file +from DocParser.logger import logger +from DocParser.main import process_one_file -log_file = str(uuid4()) + ".log" -log = logger.setup_app_level_logger(file_name=log_file, level="INFO") +log = logger.setup_app_level_logger(file_name="batch_process.log", level="INFO") -database = "data/processed_paper_database.csv" - -def filter_tex_files( - tex_files: List[str], main_path: Optional[str] = None -) -> List[str]: - """extract all MAIN.tex files for
processing, if main_path is not None, then - only extract MAIN.tex files in the main_path (not recursive) +def filter_tex_files(discipline_path: str) -> List[str]: + """ + Filters the list of tex files in the given discipline path. Args: - tex_files (List[str]): list of tex files - main_path (str): path to main directory. + discipline_path (str): The path to the discipline directory containing tex files. Returns: - List[str]: list of tex files that are compilable. + List[str]: A list of filtered tex files that meet the specified criteria. + + Raises: + Exception: If the processing fails. + + 1. Exclude tex files with names "paper_colored.tex", "paper_white.tex", and "paper_original.tex". + 2. Exclude tex files that are inside a subfolder. + 3. Ensure that the tex file is a main document by checking if it contains "\\begin{document}". + """ + tex_files = [] + + for root, _, files in os.walk(discipline_path): + tex_files.extend( + [os.path.join(root, file) for file in files if file.endswith(".tex")] + ) - # TODO: move this to config redundant_tex_files = [ "paper_colored.tex", "paper_white.tex", @@ -47,7 +52,7 @@ def filter_tex_files( # ensure the tex files inside a subfolder is not included # ex: cs.AI/1234.4567/figs/draw.tex will be excluded - if main_path and os.path.dirname(os.path.dirname(tex_file)) != main_path: + if os.path.dirname(os.path.dirname(tex_file)) != discipline_path: continue # make sure the tex file is compilable (main document) @@ -61,14 +66,6 @@ def filter_tex_files( log.debug(f"failed to read tex file: {tex_file} due to UnicodeDecodeError") continue - # skip processed papers - log.info(f"[VRDU] Before filtering, found {len(result)} tex files") - if os.path.exists(database): - df = pd.read_csv(database) - processed_papers = set(df["path"]) - result = [x for x in result if os.path.dirname(x) not in processed_papers] - - log.info(f"[VRDU] After filtering, found {len(result)} tex files") return result @@ -89,8 +86,7 @@ def process_one_discipline(path: str, cpu_count: int, discipline: str) -> None: discipline_path = os.path.join(path, discipline) log.info(f"[VRDU] Path to raw data: {discipline_path}") log.info(f"[VRDU] Using cpu counts: {cpu_count}") - tex_files = utils.extract_all_tex_files(discipline_path) - tex_files = filter_tex_files(tex_files, discipline_path) + tex_files = filter_tex_files(discipline_path) try: with multiprocessing.Pool(cpu_count) as pool: @@ -100,7 +96,6 @@ def process_one_discipline(path: str, cpu_count: int, discipline: str) -> None: finally: # save the process log log.info(f"[VRDU] discipline: {discipline}, finished processing.") - shutil.move(log_file, f"data/batch_process_{discipline}.log") def main(): diff --git a/scripts/collect_coco_dataset.py b/scripts/collect_coco_dataset.py deleted file mode 100644 index 81c80e7..0000000 --- a/scripts/collect_coco_dataset.py +++ /dev/null @@ -1,191 +0,0 @@ -import os -import re -import matplotlib.pyplot as plt -import time -import json -import shutil -import random -import argparse -from tqdm import tqdm - - -def extract_tex_files(path, target_pattern): - tex_files = [] - - for root, dirs, files in os.walk(path): - for file in files: - if not file.endswith(".tex"): - continue - if file.startswith("paper_"): - continue - - tex_file = os.path.join(root, file) - - try: - with open(tex_file) as f: - content = f.read() - except UnicodeDecodeError: - continue - - if "\\begin{document}" not in content: - continue - - if not any( - re.match(pattern, root.split("/")[-2]) for pattern in target_pattern - ): - 
continue - - if os.path.exists(f"{root}/output/result/layout_annotation.json"): - tex_files.append(tex_file) - return tex_files - - -def main(path, target_pattern, ratio): - now_time = time.strftime("%Y-%m-%d-%H_%M_%S", time.localtime(time.time())) - coco_dataset_name = f"COCO_datasets/Multi-modal_COCO_dataset_{now_time}" - - target_images_folder = f"{coco_dataset_name}/images" - os.makedirs(coco_dataset_name, exist_ok=True) - os.makedirs(target_images_folder, exist_ok=True) - - tex_files = sorted(extract_tex_files(path, target_pattern)) - tex_files_length = len(tex_files) - - random.seed(0) - random.shuffle(tex_files) - train_list = tex_files[: int(tex_files_length * ratio)] - val_list = tex_files[int(tex_files_length * ratio) :] - dataset_dict = {"train": train_list, "val": val_list} - - info = { - "year": 2023, - "version": "1.0", - "description": "COCO format dataset converted form document genome", - "contributor": "ADLab", - "url": "", - "date_created": f"{time.ctime()}", - } - licenses = [ - { - "url": "http://creativecommons.org/licenses/by/2.0/", - "id": 4, - "name": "Attribution License", - } - ] - images = [] - annotations = [] - categories = [ - {"id": 0, "name": "Algorithm", "supercategory": "Algorithm"}, - {"id": 1, "name": "Caption", "supercategory": "Caption"}, - {"id": 2, "name": "Equation", "supercategory": "Equation"}, - {"id": 3, "name": "Figure", "supercategory": "Figure"}, - {"id": 4, "name": "Footnote", "supercategory": "Footnote"}, - {"id": 5, "name": "List", "supercategory": "List"}, - {"id": 6, "name": "Others", "supercategory": "Others"}, - {"id": 7, "name": "Table", "supercategory": "Table"}, - {"id": 8, "name": "Text", "supercategory": "Text"}, - {"id": 9, "name": "Text-EQ", "supercategory": "Text"}, - {"id": 10, "name": "Title", "supercategory": "Title"}, - {"id": 11, "name": "Reference", "supercategory": "Reference"}, - {"id": 12, "name": "PaperTitle", "supercategory": "Title"}, - {"id": 13, "name": "Code", "supercategory": "Algorithm"}, - {"id": 14, "name": "Abstract", "supercategory": "Text"}, - ] - - anno_id = 0 - image_id = 0 - pattern = r"\d+\.\d+(v\d+)?" 
- for key, tex_files in dataset_dict.items(): - print(f"Processing {key} set...") - - images = [] - annotations = [] - - for tex_file in tqdm(tex_files): - coco_annotation_file = ( - f"{os.path.dirname(tex_file)}/output/result/layout_annotation.json" - ) - images_path = f"{os.path.dirname(tex_file)}/output/colored" - - if not re.search(pattern, tex_file): - raise NotImplementedError - arxiv_paper_id = re.search(pattern, tex_file).group() - - with open(coco_annotation_file, "r") as fp: - coco_annotation = json.load(fp) - sub_images = coco_annotation["images"] - sub_annotations_list = coco_annotation["annotations"] - - grouped_annotations = {} - for annotation in sub_annotations_list: - anno_image_id = annotation["image_id"] - # check whether image_id is already in the dict - if anno_image_id not in grouped_annotations: - # if not, create a new list for it - grouped_annotations[anno_image_id] = [] - # append the annotation to the corresponding list - grouped_annotations[anno_image_id].append(annotation) - - grouped_annotations_key_list = sorted(grouped_annotations.keys()) - for idx in grouped_annotations_key_list: - file_name = arxiv_paper_id.replace(".", "_") + f"-page_{idx:04d}.png" - page_image = plt.imread(f"{images_path}/{idx}.png") - H, W, _ = page_image.shape - page_annotations = grouped_annotations[idx] - - images.append( - { - "id": image_id, - "width": W, - "height": H, - "file_name": file_name, - "coco_url": "https://github.com/MaoSong2022/vrdu_data_process", - "date_captured": now_time, - "flickr_url": "", - "licenses": 4, - } - ) - shutil.copyfile( - f"{images_path}/{idx}.png", f"{target_images_folder}/{file_name}" - ) - - for anno in page_annotations: - annotations.append( - { - "id": anno_id, - "image_id": image_id, - "category_id": anno["category_id"], - "segmentation": anno["segmentation"], - "bbox": anno["bbox"], - "area": anno["area"], - "iscrowd": anno["iscrowd"], - } - ) - anno_id += 1 - image_id += 1 - - coco_json_content = { - "info": info, - "licenses": licenses, - "images": images, - "annotations": annotations, - "categories": categories, - } - - with open(f"{coco_dataset_name}/{key}.json", "w") as fp: - json.dump(coco_json_content, fp, indent=4) - - -if __name__ == "__main__": - # parser = argparse.ArgumentParser() - # parser.add_argument("-p", "--path", type=str, required=True) - # parser.add_argument("-r", "--ratio", type=float, default=0.8) - # args = parser.parse_args() - # path = args.path - - target_pattern = [r"^cs\.\w+$"] - path = os.path.expanduser( - "/cpfs01/shared/ADLab/datasets/arxiv_source/arxiv_source_uncompressed" - ) - ratio = 0.8 - main(path, target_pattern, ratio) diff --git a/scripts/convert_coco_to_yolo.py b/scripts/convert_coco_to_yolo.py deleted file mode 100644 index 869e052..0000000 --- a/scripts/convert_coco_to_yolo.py +++ /dev/null @@ -1,134 +0,0 @@ -import os -import json -import argparse -from shutil import copyfile - - -# parser = argparse.ArgumentParser(description='Test yolo data.') -# parser.add_argument('-j', help='JSON file', dest='json', required=True) -# parser.add_argument('-o', help='path to output folder', dest='out', required=True) -# -# args = parser.parse_args() -# -# json_file = args.json -# output = args.out - -class COCO2YOLO: - def __init__(self, json_file, output_path): - self.json_file = json_file - self.output_path = output_path - self.output_image_path = output_path.replace('labels', 'images') - self.output_folder = os.path.dirname(os.path.dirname(self.output_path)) - self._check_file_and_dir() - self.labels = json.load(open(json_file, 'r', encoding='utf-8')) - self.coco_id_name_map =
self._categories() - self.coco_name_list = list(self.coco_id_name_map.values()) - print("total images", len(self.labels['images'])) - print("total categories", len(self.labels['categories'])) - print("total labels", len(self.labels['annotations'])) - - def _check_file_and_dir(self): - if not os.path.exists(self.json_file): - raise ValueError("file not found") - os.makedirs(self.output_path, exist_ok=True) - os.makedirs(self.output_image_path, exist_ok=True) - - def _categories(self): - categories = {} - for cls in self.labels['categories']: - categories[cls['id']] = cls['name'] - return categories - - def _load_images_info(self): - images_info = {} - for image in self.labels['images']: - id = image['id'] - file_name = image['file_name'] - if file_name.find('\\') > -1: - file_name = file_name[file_name.index('\\') + 1:] - w = image['width'] - h = image['height'] - images_info[id] = (file_name, w, h) - - return images_info - - def _bbox_2_yolo(self, bbox, img_w, img_h): - x, y, w, h = bbox[0], bbox[1], bbox[2], bbox[3] - centerx = bbox[0] + w / 2 - centery = bbox[1] + h / 2 - dw = 1 / img_w - dh = 1 / img_h - centerx *= dw - w *= dw - centery *= dh - h *= dh - return centerx, centery, w, h - - def _convert_anno(self, images_info): - anno_dict = dict() - for anno in self.labels['annotations']: - bbox = anno['bbox'] - image_id = anno['image_id'] - category_id = anno['category_id'] - - image_info = images_info.get(image_id) - image_name = image_info[0] - img_w = image_info[1] - img_h = image_info[2] - yolo_box = self._bbox_2_yolo(bbox, img_w, img_h) - - anno_info = (image_name, category_id, yolo_box) - anno_infos = anno_dict.get(image_id) - if not anno_infos: - anno_dict[image_id] = [anno_info] - else: - anno_infos.append(anno_info) - anno_dict[image_id] = anno_infos - return anno_dict - - def save_classes(self): - sorted_classes = list(map(lambda x: x['name'], sorted(self.labels['categories'], key=lambda x: x['id']))) - print('coco names', sorted_classes) - with open(f'{self.output_folder}/classes.txt', 'w', encoding='utf-8') as f: - for cls in sorted_classes: - f.write(cls + '\n') - f.close() - - def coco2yolo(self): - print("loading image info...") - images_info = self._load_images_info() - print("loading done, total images", len(images_info)) - - print("start converting...") - anno_dict = self._convert_anno(images_info) - print("converting done, total labels", len(anno_dict)) - - self.save_classes() - - print("saving txt file...") - self._save_txt(anno_dict) - print("saving done") - - def _save_txt(self, anno_dict): - raw_images_path = os.path.join(os.path.dirname(self.json_file), 'images') - for k, v in anno_dict.items(): - file_name = os.path.splitext(v[0][0])[0] + ".txt" - image_name = os.path.splitext(v[0][0])[0] + ".png" - copyfile(f'{raw_images_path}/{image_name}', f'{self.output_image_path}/{image_name}') - with open(os.path.join(self.output_path, file_name), 'w', encoding='utf-8') as f: - # print(k, v) - for obj in v: - cat_name = self.coco_id_name_map.get(obj[1]) - category_id = self.coco_name_list.index(cat_name) - box = ['{:.6f}'.format(x) for x in obj[2]] - box = ' '.join(box) - line = str(category_id) + ' ' + box - f.write(line + '\n') - - -if __name__ == '__main__': - mode = 'val' - json_file = f'COCO_datasets/Multi-modal_COCO_dataset_2023-12-14-13_52_07/{mode}.json' - output = f'YOLO_datasets/Multi-modal_COCO_dataset_2023-12-14-13_52_07/labels/{mode}' - c2y = COCO2YOLO(json_file, output) - c2y.coco2yolo() diff --git a/scripts/export_to_dataset.py 
b/scripts/export_to_dataset.py index fafb3d2..f99afbc 100644 --- a/scripts/export_to_dataset.py +++ b/scripts/export_to_dataset.py @@ -6,7 +6,7 @@ import pandas as pd import multiprocessing -from vrdu import logger +from DocParser.logger import logger log = logger.setup_app_level_logger(file_name="export_to_dataset.log") diff --git a/scripts/generate_reading_annotation.py b/scripts/generate_reading_annotation.py index 72696a6..314bdb9 100644 --- a/scripts/generate_reading_annotation.py +++ b/scripts/generate_reading_annotation.py @@ -2,14 +2,33 @@ import glob import multiprocessing import os +from pathlib import Path -from vrdu import utils -from vrdu import logger +from DocParser.vrdu import utils +from DocParser.logger import logger log = logger.setup_app_level_logger(file_name="generate_reading_annotation.log") -def generate_annotation(paper_path) -> None: +def process_one_paper(paper_path: Path) -> None: + """ + Process a single paper by generating reading annotations from order annotation. + + Args: + paper_path (Path): The path to the paper directory. + + Returns: + None + + Raises: + None + + Usage: + ```python + process_one_paper("/path/to/paper") + ``` + + """ log.debug(f"processing paper {paper_path}") order_json_file = os.path.join(paper_path, "order_annotation.json") @@ -44,7 +63,25 @@ def generate_annotation(paper_path) -> None: utils.export_to_json(result, reading_json_file) -def generate_reading_annotation(input_path) -> None: +def process_dataset(input_path: Path) -> None: + """ + Process a dataset by iterating over each discipline and paper within it. + + Args: + input_path (Path): The path to the dataset source. + + Returns: + None: This function does not return any value. + + Raises: + None: This function does not raise any exceptions. + + Usage: + ```python + process_dataset("/path/to/dataset") + ``` + + """ discipline_paths = glob.glob(os.path.join(input_path, "*/")) for discipline_path in discipline_paths: @@ -52,10 +89,10 @@ def generate_reading_annotation(input_path) -> None: paper_paths = glob.glob(os.path.join(discipline_path, "*/")) with multiprocessing.Pool(34) as pool: - pool.map(generate_annotation, paper_paths) + pool.map(process_one_paper, paper_paths) -def main(): +def main() -> None: parser = argparse.ArgumentParser() parser.add_argument( "-i", "--input_path", type=str, required=True, help="Path of dataset source" @@ -63,7 +100,7 @@ def main(): args = parser.parse_args() input_path = args.input_path - generate_reading_annotation(input_path) + process_dataset(input_path) if __name__ == "__main__": diff --git a/scripts/retrieve_metadata.py b/scripts/retrieve_metadata.py new file mode 100644 index 0000000..8cc28f5 --- /dev/null +++ b/scripts/retrieve_metadata.py @@ -0,0 +1,94 @@ +import glob +import os +from pathlib import Path +from typing import Any, Dict, List +import arxiv +import argparse + + +from DocParser.vrdu import utils +from DocParser.logger import logger + +log = logger.setup_app_level_logger(file_name="retrieve_metadata.log") + + +def retrieve_metadata(data: Dict[str, Path], slice_length=100) -> List[Dict[str, Any]]: + """ + Retrieves metadata for the given list of paper IDs. + + Args: + data (Dict[str, Path]): A dictionary where keys are paper IDs and values are the paths to the corresponding papers. + slice_length (int, optional): The number of paper IDs to retrieve metadata for in each iteration. Defaults to 100. + + Returns: + List[Dict[str, Any]]: A list of dictionaries containing metadata for each paper. 
+ + Raises: + None + + References: + https://info.arxiv.org/help/api/user-manual.html#_details_of_atom_results_returned + + """ + paper_ids = list(data.keys()) + + client = arxiv.Client() + + paper_metadata = [] + + for i in range(0, len(paper_ids), slice_length): + slices = paper_ids[i : i + slice_length] + search_results = client.results(arxiv.Search(id_list=slices)) + + for index, result in enumerate(search_results): + paper_metadata.append( + { + "entry_id": result.entry_id, + "updated": str(result.updated), + "published": str(result.published), + "title": result.title, + "doi": result.doi, + "authors": [str(author) for author in result.authors], + "summary": result.summary, + "journal_ref": result.journal_ref, + "primary_category": result.primary_category, + "categories": result.categories, + "links": [str(link) for link in result.links], + "pdf_url": result.pdf_url, + "paper_id": slices[index], + "paper_path": data[slices[index]], + "quality": "low", + } + ) + + return paper_metadata + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("-i", "--input_path", type=str, required=True) + args = parser.parse_args() + path = args.input_path + + paper_paths = glob.glob(os.path.join(path, "*/")) + # paper_id to paper path + data = {os.path.basename(paper_path[:-1]): paper_path for paper_path in paper_paths} + + paper_metadata = retrieve_metadata(data) + + # use append mode + existed_json_file = os.path.join(path, "paper_metadata.json") + existed_json_data = [] + if os.path.exists(existed_json_file): + existed_json_data = utils.load_json(existed_json_file) + + existed_paper_ids = [x["paper_id"] for x in existed_json_data] + existed_json_data.extend( + [x for x in paper_metadata if x["paper_id"] not in existed_paper_ids] + ) + + utils.export_to_json(existed_json_data, existed_json_file) + + +if __name__ == "__main__": + main() diff --git a/scripts/retrive_metadata.py b/scripts/retrive_metadata.py deleted file mode 100644 index 0b9e4e5..0000000 --- a/scripts/retrive_metadata.py +++ /dev/null @@ -1,87 +0,0 @@ -import glob -import os -from typing import Any, Dict, List -import arxiv -import argparse - -import pandas as pd - - -from vrdu import utils -from vrdu import logger - -log = logger.setup_app_level_logger(file_name="retrieve_metadata.log") - - -def retrieve_metadata(data: Dict) -> List[Dict[str, Any]]: - paper_ids = list(data.keys()) - - client = arxiv.Client() - - slice_length = 100 - paper_metadata = [] - - for i in range(0, len(paper_ids), slice_length): - slices = paper_ids[i : i + slice_length] - search_results = client.results(arxiv.Search(id_list=slices)) - - for index, result in enumerate(search_results): - paper_metadata.append( - { - "entry_id": result.entry_id, - "updated": str(result.updated), - "published": str(result.published), - "title": result.title, - "doi": result.doi, - "authors": [str(author) for author in result.authors], - "summary": result.summary, - "journal_ref": result.journal_ref, - "primary_category": result.primary_category, - "categories": result.categories, - "links": [str(link) for link in result.links], - "pdf_url": result.pdf_url, - "paper_id": slices[index], - "paper_path": data[slices[index]], - "quality": "low", - } - ) - - return paper_metadata - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "-i", "--input_path", type=str, default="data/discipline_info.csv" - ) - args = parser.parse_args() - path = args.input_path - - discipline_info = pd.read_csv("data/discipline_info.csv") - disciplines =
set(discipline_info["discipline"]) - - for discipline in disciplines: - target_discipline_path = os.path.join(path, discipline) - paper_paths = glob.glob(os.path.join(target_discipline_path, "*/")) - - data = { - os.path.basename(paper_path[:-1]): paper_path for paper_path in paper_paths - } - - paper_metadata = retrieve_metadata(data) - - existed_json_file = os.path.join(target_discipline_path, "paper_metadata.json") - existed_json_data = [] - if os.path.exists(existed_json_file): - existed_json_data = utils.load_json(existed_json_file) - - existed_paper_ids = [x["paper_id"] for x in existed_json_data] - existed_json_data.extend( - [x for x in paper_metadata if x["paper_id"] not in existed_paper_ids] - ) - - utils.export_to_json(existed_json_data, existed_json_file) - - -if __name__ == "__main__": - main() diff --git a/scripts/run_statistics.py b/scripts/run_statistics.py deleted file mode 100644 index a21b7e2..0000000 --- a/scripts/run_statistics.py +++ /dev/null @@ -1,249 +0,0 @@ -import glob -import os -import pandas as pd -import argparse -from datetime import datetime - -from vrdu import utils -from vrdu.config import config - -from vrdu import logger - -log = logger.setup_app_level_logger(file_name="statistics.log") - - -database_file = "data/processed_paper_database.csv" -daily_overview_file = "data/daily_overview.csv" -discpline_info_file = "data/discpline_info.csv" - - -def extract_time(line: str) -> datetime: - time_format = "%Y-%m-%d %H:%M:%S,%f" - log_time = line.split(" - ")[0][1:] - return datetime.strptime(log_time, time_format) - - -def init_dataframe() -> pd.DataFrame: - if os.path.exists(database_file): - return pd.read_csv(database_file, dtype={"uuid": str}) - columns = [ - "uuid", - "discpline", - "path", - "status", - "start_time", - "end_time", - "error_type", - "error_info", - "date", - "pages", - "columns", - "blocks", - "overlap", - ] - df = pd.DataFrame(columns=columns) - return df - - -def update_processed_database(input_path: str): - """store the information of processed papers into a csv file""" - df = init_dataframe() - - log_files = glob.glob(os.path.join(input_path, "batch_process_*.log")) - for log_file in log_files: - log.info(f"processing log file: {log_file}") - - if not os.path.exists(log_file): - continue - with open(log_file, "r") as f: - lines = [line.strip() for line in f.readlines()] - - for line in lines: - if not line.startswith("["): - continue - - if line.find("start to process") != -1: - discpline = line.split("discpline: ")[1].split(", ")[0] - continue - - if line.find("[VRDU] file") == -1: - continue - tex_file = line.split("[VRDU] file: ")[1].split(", ")[0] - path = os.path.dirname(tex_file) - if os.path.basename(os.path.dirname(path)) != discpline: - log.debug(f"unknown discpline: {tex_file}") - continue - - # extract uuid and title - uuid = os.path.basename(path) - log.debug(f"uuid: {uuid}") - current_time = extract_time(line) - - # new file - if line.find("start processing") != -1: - if uuid in df["uuid"].values: - continue - - log.debug(f"new file: {tex_file} with uuid: {uuid}") - data_item = { - "index": len(df), - "uuid": uuid, - "discpline": discpline, - "path": path, - "status": "processing", - "start_time": str(current_time), - "end_time": "", - "error_type": "", - "error_info": "", - "date": "", - "pages": 0, - "columns": 0, - "blocks": 0, - "overlap": 0.0, - } - df.loc[len(df)] = data_item - continue - - # success processing file, update information - if ( - line.find("successfully processed") != -1 - or line.find("paper has 
been processed") != -1 - ): - if uuid in df["uuid"].values: - index = df[df["uuid"] == uuid].index[0] - if df.loc[index, "status"] == "success": - continue - df.loc[df["uuid"] == uuid, "status"] = "success" - df.loc[df["uuid"] == uuid, "end_time"] = current_time - continue - - # failed to process file, update status and eror information - if line.find("message: ") != -1: - if uuid in df["uuid"].values: - index = df[df["uuid"] == uuid].index[0] - if df.loc[index, "status"] == "failure": - continue - error_type = line.split("type: ")[1].split(", ")[0] - error_info = line.split("message: ")[1].strip() - - df.loc[df["uuid"] == uuid, "status"] = "failure" - df.loc[df["uuid"] == uuid, "error_type"] = error_type - df.loc[df["uuid"] == uuid, "error_info"] = error_info - df.loc[df["uuid"] == uuid, "end_time"] = current_time - continue - - category_names = list(config.category2name.values()) - for category_name in category_names: - df[category_name] = 0 - - for index in range(len(df)): - if df.loc[index, "status"] != "success": - continue - - if df.loc[index, "pages"] != 0: - continue - - # use output result to update information - path = df.loc[index, "path"] - quality_report = utils.load_json( - os.path.join(path, "output/result/quality_report.json") - ) - df.loc[index, "pages"] = quality_report["num_pages"] - df.loc[index, "columns"] = quality_report["num_columns"] - df.loc[index, "blocks"] = quality_report["category_quality"][-1][ - "geometry_count" - ] - df.loc[index, "overlap"] = quality_report["page_quality"][-1]["ratio"] - - for category_item in quality_report["category_quality"]: - df.loc[index, category_item["category"]] = category_item["geometry_count"] - - # remove processing files - df = df[~(df["status"] == "processing")] - df.to_csv(database_file, index=False) - - -def update_discpline_info(): - df = pd.read_csv(discpline_info_file) - - for log_file in glob.glob("data/batch_process_*.log"): - discpline = ( - os.path.basename(log_file).split("batch_process_")[1].split(".log")[0] - ) - with open(log_file) as f: - lines = f.readlines() - for line in lines: - if line.find("finished processing.") != -1: - df.loc[df["discpline"] == discpline, "status"] = "complete" - else: - df.loc[df["discpline"] == discpline, "status"] = "processing" - - database_df = pd.read_csv(database_file) - for index, row in df.iterrows(): - df.loc[index, "success"] = len( - database_df[ - (database_df["discpline"] == row["discpline"]) - & (database_df["status"] == "success") - ] - ) - df.loc[index, "failure"] = len( - database_df[ - (database_df["discpline"] == row["discpline"]) - & (database_df["status"] == "failure") - ] - ) - processed_papers = len( - database_df[(database_df["discpline"] == row["discpline"])] - ) - - df.loc[index, "processed"] = processed_papers - - df.to_csv(discpline_info_file, index=False) - - -def update_daily_overview() -> None: - daily_df = pd.read_csv(daily_overview_file) - database_df = pd.read_csv(database_file) - - num_total_papers = database_df.shape[0] - num_total_processed = database_df[database_df["status"] == "success"].shape[0] - - last_index = daily_df.index[-1] - num_daily_papers = num_total_papers - daily_df.loc[last_index, "#total papers"] - num_daily_processed = ( - num_total_processed - daily_df.loc[last_index, "#total processed"] - ) - - if num_total_papers == daily_df.loc[last_index, "#total papers"]: - log.info("Please update database file before running this script.") - - daily_df.loc[last_index + 1, "date"] = datetime.today().strftime("%Y-%m-%d") - 
daily_df.loc[last_index + 1, "#daily papers"] = num_daily_papers - daily_df.loc[last_index + 1, "#daily processed"] = num_daily_processed - daily_df.loc[last_index + 1, "#total papers"] = num_total_papers - daily_df.loc[last_index + 1, "#total processed"] = num_total_processed - daily_df.loc[last_index + 1, "#discplines"] = database_df["discpline"].nunique() - daily_df["daily pass ratio"] = ( - daily_df["#daily processed"] / daily_df["#daily papers"] - ) - daily_df["total pass ratio"] = ( - daily_df["#total processed"] / daily_df["#total papers"] - ) - - daily_df.to_csv("data/daily_overview.csv", index=False) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--input_path", type=str, default="output/") - args = parser.parse_args() - - update_processed_database(args.input_path) - - update_daily_overview() - - update_discpline_info() - - -if __name__ == "__main__": - main() diff --git a/scripts/visualize_dataset_distribution.py b/scripts/visualize_dataset_distribution.py deleted file mode 100644 index 1418514..0000000 --- a/scripts/visualize_dataset_distribution.py +++ /dev/null @@ -1,107 +0,0 @@ -from collections import defaultdict -import os -import matplotlib.pyplot as plt -import numpy as np -import csv - - -def get_all_categories(): - """ - Retrieves all categories from the "category_count.csv" file. - - Returns: - categories (list): A list of all categories. - - Reference: - https://arxiv.org/category_taxonomy - """ - categories = [] - with open("scripts/category_count.csv", "r") as f: - reader = csv.DictReader(f) - for row in reader: - categories.append(row["categories"]) - - return categories - - -def visualize_distribution(dict1, dict2): - categories = list(dict1.keys()) # Get the list of categories - - # Get the number of files for each category from both dictionaries - files_dict1 = [dict1[category] for category in categories] - files_dict2 = [dict2[category] for category in categories] - - # normalize - files_dict1 = [x / sum(files_dict1) for x in files_dict1] - files_dict2 = [x / sum(files_dict2) for x in files_dict2] - - # Set up the plot - plt.figure(figsize=(10, 8)) - fig, ax = plt.subplots() - width = 1.2 # Width of the bars - - # Calculate the positions for the bars - positions = np.arange(0, len(categories) * width, width) - - # Plot the number of files for each category - ax.barh(positions, files_dict1, width, label="batch", align="center", color="blue") - ax.barh( - positions, - -np.array(files_dict2), - width, - label="original", - align="center", - color="red", - ) - - # Add labels and title to the plot - ax.set_yticks(positions) - ax.set_yticklabels(categories, fontsize=2) - ax.set_xlabel("Number of Files") - ax.set_title("Distribution of arxiv_source_uncompressed") - ax.legend() - - plt.subplots_adjust(left=0.4) - # Display the plot - plt.savefig("test.png", dpi=300) - - -def analyze_raw_data(path): - all_categories = get_all_categories() - - data = defaultdict(int) - for category in all_categories: - if os.path.exists(os.path.join(path, category)): - data[category] = len(os.listdir(os.path.join(path, category))) - - with open("scripts/batch_count.csv", mode="w") as f: - fieldnames = ["categories", "count"] - writer = csv.DictWriter(f, fieldnames=fieldnames) - writer.writeheader() - for key, value in data.items(): - writer.writerow( - { - "categories": key, - "count": value, - } - ) - - return data - - -def main(): - batch = analyze_raw_data( - "/cpfs01/shared/ADLab/datasets/arxiv_source/arxiv_source_uncompressed" - ) - - original = {} - 
with open("scripts/category_count.csv", newline="") as csvfile: - reader = csv.DictReader(csvfile) - for row in reader: - original[row["categories"]] = int(row["count"]) - - visualize_distribution(batch, original) - - -if __name__ == "__main__": - main() diff --git a/scripts/visualize_order_annotations.py b/scripts/visualize_order_annotations.py new file mode 100644 index 0000000..b59b365 --- /dev/null +++ b/scripts/visualize_order_annotations.py @@ -0,0 +1,300 @@ +import argparse +from collections import defaultdict +import math +import os +from pathlib import Path +from typing import Any, Dict, List, Tuple +from PIL import Image, ImageDraw +from matplotlib import pyplot as plt + +from DocParser.vrdu import utils + + +def draw_arrow_line( + image: Image.Image, + point_A: Tuple[float, float], + point_B: Tuple[float, float], + width: int = 1, + color: Tuple[int, int, int] = (0, 255, 0), +) -> Image.Image: + """ + Draws an arrow line between two points on an image. + + Args: + image (PIL.Image.Image): The image on which to draw the arrow line. + point_A (Tuple[float, float]): The first point of the arrow line. + point_B (Tuple[float, float]): The second point of the arrow line. + width (int, optional): The width of the arrow line. Defaults to 1. + color (Tuple[int, int, int], optional): The color of the arrow line. Defaults to (0, 255, 0). + + Returns: + PIL.Image.Image: The image with the arrow line drawn. + + """ + draw = ImageDraw.Draw(image) + draw.line((point_A, point_B), width=width, fill=color) + + # Calculate arrowhead vertices + x0, y0 = point_A + x1, y1 = point_B + xb = 0.95 * (x1 - x0) + x0 + yb = 0.95 * (y1 - y0) + y0 + alpha = math.atan2(y1 - y0, x1 - x0) - 90 * math.pi / 180 + a = 8 * math.cos(alpha) + b = 8 * math.sin(alpha) + vtx0 = (xb + a, yb + b) + vtx1 = (xb - a, yb - b) + + # Draw the arrowhead triangle + draw.polygon([vtx0, vtx1, point_B], fill=color) + return image + + +def extract_relations( + page_index: int, order_annotation_data: Dict[str, Any], width=None +) -> List[Tuple[Tuple[float, float], Tuple[float, float], str]]: + """ + Extracts relations between blocks on a given page or across two adjacent pages. + + Args: + page_index (int): The index of the page to extract relations for. + order_annotation_data (Dict[str, Any]): The JSON file containing the order annotation data. + width (int, optional): The width of the image. If not provided, it assumes a single page. + + Returns: + List[Tuple[Tuple[float, float], Tuple[float, float], str]]: A list of tuples containing the coordinates of the block centers and the relation type. + + Raises: + FileNotFoundError: If the order annotation JSON file or any of the image files are not found. 
+ + Usage: + ```python + relations = extract_relations(10, order_annotation_data, 1000) + ``` + """ + page_blocks = defaultdict(list) + id2blocks = {} + page2id2 = defaultdict(list) + for block in order_annotation_data["annotations"]: + page_blocks[block["page_index"]].append(block) + id2blocks[block["block_id"]] = block + page2id2[block["page_index"]].append(block["block_id"]) + + # single page + if width is None: + relation_tuples = [] + for relation in order_annotation_data["orders"]: + if relation["from"] not in page2id2[page_index]: + continue + if relation["to"] not in page2id2[page_index]: + continue + print(relation) + block_from = id2blocks[relation["from"]] + block_to = id2blocks[relation["to"]] + center_from = ( + (block_from["bbox"][0] + block_from["bbox"][2]) / 2, + (block_from["bbox"][1] + block_from["bbox"][3]) / 2, + ) + center_to = ( + (block_to["bbox"][0] + block_to["bbox"][2]) / 2, + (block_to["bbox"][1] + block_to["bbox"][3]) / 2, + ) + relation_tuples.append((center_from, center_to, relation["type"])) + + return relation_tuples + + # two pages + relation_tuples = [] + for relation in order_annotation_data["orders"]: + if relation["from"] not in page2id2[page_index] + page2id2[page_index + 1]: + continue + if relation["to"] not in page2id2[page_index] + page2id2[page_index + 1]: + continue + block_from = id2blocks[relation["from"]] + block_to = id2blocks[relation["to"]] + + center_x = (block_from["bbox"][0] + block_from["bbox"][2]) / 2 + center_y = (block_from["bbox"][1] + block_from["bbox"][3]) / 2 + if block_from["page_index"] != page_index: + center_x += width + center_from = (center_x, center_y) + + center_x = (block_to["bbox"][0] + block_to["bbox"][2]) / 2 + center_y = (block_to["bbox"][1] + block_to["bbox"][3]) / 2 + if block_to["page_index"] != page_index: + center_x += width + center_to = (center_x, center_y) + + relation_tuples.append((center_from, center_to, relation["type"])) + return relation_tuples + + +def visualize_order_annotation_on_image( + relation_tuples: List[Tuple[Tuple[float, float], Tuple[float, float], str]], + image: Image.Image, +) -> None: + """ + Visualizes the order annotation on an image. + + Args: + relation_tuples (List[Tuple[Tuple[float, float], Tuple[float, float], str]]): + A list of tuples containing the coordinates of the block centers and the relation type. + image (PIL.Image.Image): The image on which to draw the arrow lines. + + Returns: + None + + Raises: + FileNotFoundError: If the order annotation JSON file or any of the image files are not found.
+ + Usage: + ```python + relation_tuples = extract_relations(10, order_annotation_data, 1000) + visualize_order_annotation_on_image(relation_tuples, image) + ``` + + """ + fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(17, 22)) + ax1.imshow(image) + color_map = { + "identical": "green", + "adj": "blue", + "peer": "red", + "implicit-cite": "purple", + "explicit-cite": "brown", + "sub": "orange", + } + + for relation in relation_tuples: + center_from, center_to, relation_type = relation + ax1.arrow( + center_from[0], + center_from[1], + center_to[0] - center_from[0], + center_to[1] - center_from[1], + fc=color_map[relation_type], + ec=color_map[relation_type], + width=3, + ) + ax1.axis("off") + + legend_handles = [] + legend_labels = [] + relation_type_maps = { + "identical": "identical", + "adj": "non-title adjacent", + "peer": "title adjacent", + "implicit-cite": "implicitly-referred", + "explicit-cite": "explicitly-referred", + "sub": "subordinate", + } + for relation_type, color in color_map.items(): + legend_handles.append( + plt.Line2D( + [0], [0], color=color, marker="o", linestyle="", label=relation_type + ) + ) + legend_labels.append(relation_type_maps[relation_type]) + + # Add the legend to ax2 + ax2.legend( + handles=legend_handles, + labels=legend_labels, + loc="upper center", + ncol=len(legend_handles), + ) + ax2.axis("off") + plt.tight_layout() + + plt.savefig("output/order_annotation.png", dpi=200) + + +def visualize_order_annotation_across_pages(path: Path, page_index: int) -> None: + """ + Visualizes the order annotation across two adjacent pages. + + Args: + path (Path): The path to the directory containing the images and the order annotation JSON file. + page_index (int): The index of the first page to be visualized. + + Returns: + None + + Raises: + FileNotFoundError: If the order annotation JSON file or any of the image files are not found. + + Usage: + ```python + visualize_order_annotation_across_pages("/path/to/directory", 10) + ``` + """ + order_annotation_file = os.path.join(path, "order_annotation.json") + image_file1 = os.path.join(path, f"page_{page_index:04}.jpg") + image_file2 = os.path.join(path, f"page_{page_index+1:04}.jpg") + + # extract blocks in this page + order_annotation_data = utils.load_json(order_annotation_file) + + # visualize + image1 = Image.open(image_file1) + image2 = Image.open(image_file2) + + relation_tuples = extract_relations(page_index, order_annotation_data, image1.width) + + # concatenate adjacent pages + width = image1.width + image2.width + image = Image.new("RGB", (width, image1.height)) + image.paste(image1, (0, 0)) + image.paste(image2, (image1.width, 0)) + image.save("concatenated_image.png") + + visualize_order_annotation_on_image(relation_tuples, image) + + +def visualize_order_annotation_single_page(path: Path, page_index: int) -> None: + """ + Visualizes the order annotation on a single page. + + Args: + path (Path): The path to the directory containing the image and the order annotation JSON file. + page_index (int): The index of the page to be visualized. + + Returns: + None + + Raises: + FileNotFoundError: If the order annotation JSON file or any of the image files are not found.
+ + Usage: + ```python + visualize_order_annotation_single_page("/path/to/directory", 10) + ``` + """ + order_annotation_file = os.path.join(path, "order_annotation.json") + order_annotation_data = utils.load_json(order_annotation_file) + + image_file = os.path.join(path, f"page_{page_index:04}.jpg") + image = Image.open(image_file) + + # extract blocks in this page + relation_tuples = extract_relations(page_index, order_annotation_data) + + # visualize + visualize_order_annotation_on_image(relation_tuples, image) + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("-p", "--path", help="path to the paper directory", type=str) + parser.add_argument("-i", "--page_index", help="page index", type=int) + args = parser.parse_args() + + path = args.path + page_index = args.page_index + + visualize_order_annotation_single_page(path, page_index) + # visualize_order_annotation_across_pages(path, page_index) + + +if __name__ == "__main__": + main() diff --git a/scripts/visualize_order_annotations_single_page.py b/scripts/visualize_order_annotations_single_page.py deleted file mode 100644 index d532178..0000000 --- a/scripts/visualize_order_annotations_single_page.py +++ /dev/null @@ -1,141 +0,0 @@ -import argparse -from collections import defaultdict -import math -import os -from typing import Tuple -from PIL import Image, ImageDraw -from matplotlib import pyplot as plt - -from vrdu import utils - - -def arrowedLine( - image: Image.Image, - point_A: Tuple[float, float], - point_B: Tuple[float, float], - width=1, - color=(0, 255, 0), -) -> Image.Image: - """Draw a line from point_A to point_B with an arrow headed at ppoint_B.""" - draw = ImageDraw.Draw(image) - draw.line((point_A, point_B), width=width, fill=color) - - # Calculate arrowhead vertices - x0, y0 = point_A - x1, y1 = point_B - xb = 0.95 * (x1 - x0) + x0 - yb = 0.95 * (y1 - y0) + y0 - alpha = math.atan2(y1 - y0, x1 - x0) - 90 * math.pi / 180 - a = 8 * math.cos(alpha) - b = 8 * math.sin(alpha) - vtx0 = (xb + a, yb + b) - vtx1 = (xb - a, yb - b) - - # Draw the arrowhead triangle - draw.polygon([vtx0, vtx1, point_B], fill=color) - return image - - -def visualize_order_annotation_single_page(path: str, page_index: int) -> None: - order_annotation_file = os.path.join(path, "order_annotation.json") - image_file = os.path.join(path, f"page_{page_index:04}.jpg") - - # extract blocks in this page - order_annotation_data = utils.load_json(order_annotation_file) - page_blocks = defaultdict(list) - id2blocks = {} - page2id2 = defaultdict(list) - for block in order_annotation_data["annotations"]: - page_blocks[block["page_index"]].append(block) - id2blocks[block["block_id"]] = block - page2id2[block["page_index"]].append(block["block_id"]) - - page_relations = [] - for item in order_annotation_data["orders"]: - if item["from"] not in page2id2[page_index]: - continue - if item["to"] not in page2id2[page_index]: - continue - page_relations.append(item) - - # visualize - image = Image.open(image_file) - width, height = image.size - - fig, (ax1, ax2) = plt.subplots( - 2, 1, figsize=(16, 20), gridspec_kw={"height_ratios": [5, 1]} - ) - ax1.imshow(image, extent=[0, width, height, 0]) - ax1.set_xlim(0, width) - ax1.set_ylim(height, 0) - color_map = { - "identical": "green", - "adj": "blue", - "peer": "red", - "implicit-cite": "purple", - "explicit-cite": "brown", - "sub": "orange", - "unknown": "black", - } - - for relation in page_relations: - print(relation) - block_from = id2blocks[relation["from"]] - block_to =
id2blocks[relation["to"]] - center_from = ( - (block_from["bbox"][0] + block_from["bbox"][2]) / 2, - (block_from["bbox"][1] + block_from["bbox"][3]) / 2, - ) - center_to = ( - (block_to["bbox"][0] + block_to["bbox"][2]) / 2, - (block_to["bbox"][1] + block_to["bbox"][3]) / 2, - ) - ax1.arrow( - center_from[0], - center_from[1], - center_to[0] - center_from[0], - center_to[1] - center_from[1], - fc=color_map[relation["type"]], - ec=color_map[relation["type"]], - width=3, - ) - ax1.axis("off") - - legend_handles = [] - legend_labels = [] - for relation_type, color in color_map.items(): - legend_handles.append( - plt.Line2D( - [0], [0], color=color, marker="o", linestyle="", label=relation_type - ) - ) - legend_labels.append(relation_type) - - # Add the legend to ax2 - ax2.legend( - handles=legend_handles, - labels=legend_labels, - loc="upper center", - ncol=len(legend_handles), - ) - ax2.axis("off") - plt.tight_layout() - - # plt.show() - plt.savefig("output/order_annotation.png", dpi=200) - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("-p", "--path", help="path to the path", type=str) - parser.add_argument("-i", "--page_index", help="page index", type=int) - args = parser.parse_args() - - path = args.path - page_index = args.page_index - - visualize_order_annotation_single_page(path, page_index) - - -if __name__ == "__main__": - main() diff --git a/scripts/visualize_order_annotations_two_page.py b/scripts/visualize_order_annotations_two_page.py deleted file mode 100644 index fefce3b..0000000 --- a/scripts/visualize_order_annotations_two_page.py +++ /dev/null @@ -1,179 +0,0 @@ -import argparse -from collections import defaultdict -import math -import os -from typing import Tuple -from PIL import Image, ImageDraw -from matplotlib import pyplot as plt - -from vrdu import utils - - -def arrowedLine( - image: Image.Image, - point_A: Tuple[float, float], - point_B: Tuple[float, float], - width=1, - color=(0, 255, 0), -) -> Image.Image: - """Draw a line from point_A to point_B with an arrow headed at ppoint_B.""" - draw = ImageDraw.Draw(image) - draw.line((point_A, point_B), width=width, fill=color) - - # Calculate arrowhead vertices - x0, y0 = point_A - x1, y1 = point_B - xb = 0.95 * (x1 - x0) + x0 - yb = 0.95 * (y1 - y0) + y0 - alpha = math.atan2(y1 - y0, x1 - x0) - 90 * math.pi / 180 - a = 8 * math.cos(alpha) - b = 8 * math.sin(alpha) - vtx0 = (xb + a, yb + b) - vtx1 = (xb - a, yb - b) - - # Draw the arrowhead triangle - draw.polygon([vtx0, vtx1, point_B], fill=color) - return image - - -def visualize_order_annotation_across_pages(path: str, page_index: int) -> None: - order_annotation_file = os.path.join(path, "order_annotation.json") - image_file1 = os.path.join(path, f"page_{page_index:04}.jpg") - image_file2 = os.path.join(path, f"page_{page_index+1:04}.jpg") - - # extract blocks in this page - order_annotation_data = utils.load_json(order_annotation_file) - page_blocks = defaultdict(list) - id2blocks = {} - page2id2 = defaultdict(list) - for block in order_annotation_data["annotations"]: - page_blocks[block["page_index"]].append(block) - id2blocks[block["block_id"]] = block - page2id2[block["page_index"]].append(block["block_id"]) - - page_relations = [] - for item in order_annotation_data["orders"]: - if item["from"] not in page2id2[page_index] + page2id2[page_index + 1]: - continue - if item["to"] not in page2id2[page_index] + page2id2[page_index + 1]: - continue - page_relations.append(item) - - # visualize - image1 = Image.open(image_file1) - - 
image2 = Image.open(image_file2) - - # crop - margin = 150 - h_margin = margin - v_margin = margin * 17 / 22 - bbox = (v_margin, h_margin, 1700 - v_margin * 2.2, 2200 - h_margin * 0.9) - image1 = image1.crop(bbox) - image2 = image2.crop(bbox) - - width, height = image1.size - new_width = image1.width + image2.width - new_image = Image.new("RGB", (new_width, height)) - new_image.paste(image1, (0, 0)) - new_image.paste(image2, (image1.width, 0)) - new_image.save(f"concatenated_image_{page_index}_{page_index+1}.png") - - fig, (ax1, ax2) = plt.subplots( - 2, 1, figsize=(17, 22) # , gridspec_kw={"height_ratios": [4, 1]} - ) - ax1.imshow(new_image, extent=[0, new_width, height, 0]) - ax1.set_xlim(0, new_width) - ax1.set_ylim(height, 0) - color_map = { - "identical": "green", - "adj": "blue", - "peer": "red", - "implicit-cite": "purple", - "explicit-cite": "brown", - "sub": "orange", - } - - for relation in page_relations: - print(relation) - block_from = id2blocks[relation["from"]] - block_to = id2blocks[relation["to"]] - if block_from["page_index"] != page_index: - center_from = ( - width + (block_from["bbox"][0] + block_from["bbox"][2]) / 2, - (block_from["bbox"][1] + block_from["bbox"][3]) / 2, - ) - else: - center_from = ( - (block_from["bbox"][0] + block_from["bbox"][2]) / 2, - (block_from["bbox"][1] + block_from["bbox"][3]) / 2, - ) - - if block_to["page_index"] != page_index: - center_to = ( - width + (block_to["bbox"][0] + block_to["bbox"][2]) / 2, - (block_to["bbox"][1] + block_to["bbox"][3]) / 2, - ) - else: - center_to = ( - (block_to["bbox"][0] + block_to["bbox"][2]) / 2, - (block_to["bbox"][1] + block_to["bbox"][3]) / 2, - ) - ax1.arrow( - center_from[0] - margin, - center_from[1] - margin, - center_to[0] - center_from[0], - center_to[1] - center_from[1], - fc=color_map[relation["type"]], - ec=color_map[relation["type"]], - width=3, - ) - ax1.axis("off") - - legend_handles = [] - legend_labels = [] - relation_type_maps = { - "identical": "identical", - "adj": "non-title adjac", - "peer": "title adjacent", - "implicit-cite": "implicitly-referred", - "explicit-cite": "explicitly-referred", - "sub": "subordinate", - } - for relation_type, color in color_map.items(): - legend_handles.append( - plt.Line2D( - [0], [0], color=color, marker="o", linestyle="", label=relation_type - ) - ) - legend_labels.append(relation_type_maps[relation_type]) - - # Add the legend to ax2 - ax2.legend( - handles=legend_handles, - labels=legend_labels, - loc="upper center", - ncol=len(legend_handles), - ) - ax2.axis("off") - plt.tight_layout() - - # plt.show() - - plt.savefig(f"output/order_annotation_{page_index}_{page_index + 1}.png", dpi=200) - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("-p", "--path", help="path to the path", type=str) - parser.add_argument("-i", "--page_index", help="page index", type=int) - args = parser.parse_args() - - path = args.path - page_index = args.page_index - - visualize_order_annotation_across_pages(path, page_index) - - -if __name__ == "__main__": - main() diff --git a/scripts/visuzliation.py b/scripts/visuzliation.py deleted file mode 100644 index fd23244..0000000 --- a/scripts/visuzliation.py +++ /dev/null @@ -1,41 +0,0 @@ -from graphviz import Digraph - -from vrdu.utils import load_json - - -def draw_dot(annotations, format="svg", rankdir="TB"): - """ - format: png | svg | ... 
- rankdir: TB (top to bottom graph) | LR (left to right) - """ - assert rankdir in ["LR", "TB"] - - nodes = set() - edges = [] - for annotation in annotations: - nodes.add(annotation["from"]) - nodes.add(annotation["to"]) - edges.append((annotation["from"], annotation["type"], annotation["to"])) - - dot = Digraph(format=format, graph_attr={"rankdir": rankdir}) - - for node in nodes: - dot.node( - name=str(id(node)), - label=str(node), - shape="record", - ) - - for node1, relation, node2 in edges: - dot.edge(str(id(node1)), str(id(node2)), label=relation) - - return dot - - -if __name__ == "__main__": - annotation_file = ( - "/home/PJLAB/maosong/vrdu_data/icml2022/output/result/order_annotation.json" - ) - annotations = load_json(annotation_file) - dot = draw_dot(annotations) - dot.render(filename="gout.dot", view=True) diff --git a/setup.py b/setup.py index a993283..85834f1 100644 --- a/setup.py +++ b/setup.py @@ -1,12 +1,12 @@ from setuptools import setup, find_packages setup( - name="vrdu_data_process", - version="0.5.0", - description="process the academic papers with .tex source files", + name="DocParser", + version="1.0.0", + description="Process academic papers with .tex source files for layout analysis", author="Mao Song", author_email="maosong@pjlab.org.cn", - url="https://github.com/MaoSong2022/vrdu_data_process", + url="https://github.com/UniModal4Reasoning/DocParser.git", license="MIT", packages=find_packages(), install_requires=[ @@ -16,18 +16,21 @@ "numpy==1.24.3", "pdf2image==1.16.3", "pdfminer.six==20221105", - # "Pillow==9.4.0", "Pillow==10.1.0", "pyparsing==3.1.1", "pytest==7.4.2", "scikit_image==0.19.3", "setuptools==68.0.0", "tqdm==4.66.1", + "sphinx", + "arxiv-cleaner", + "texsoup", ], - scripts=[ - "vrdu/compile_latex.sh", - ], + python_requires=">=3.8", + scripts=[], entry_points={ - "console_scripts": [], + "console_scripts": [ + "vrdu_process=DocParser.main:main", + ], }, ) diff --git a/tests/test_add_definitions.py b/tests/test_add_definitions.py index f3ca221..096ca65 100644 --- a/tests/test_add_definitions.py +++ b/tests/test_add_definitions.py @@ -1,7 +1,7 @@ import unittest import unittest.mock -from vrdu.renderer import Renderer +from DocParser.vrdu.renderer import Renderer def test_add_color_definition1(): diff --git a/tests/test_compile_check.py b/tests/test_compile_check.py deleted file mode 100644 index b1adbd8..0000000 --- a/tests/test_compile_check.py +++ /dev/null @@ -1,19 +0,0 @@ -import unittest -import unittest.mock -import os - -from vrdu.utils import compile_check - - -class TestGraphics(unittest.TestCase): - def test_equation1(self): - self.assertEqual(compile_check(r"\begin{equation}a \end{equation}"), True) - - temp_files = [file for file in os.listdir(".") if file.startswith("temp")] - self.assertEqual(len(temp_files), 0) - - def test_equation2(self): - self.assertEqual(compile_check(r"\begin{equation}\e\end{equation}"), False) - - temp_files = [file for file in os.listdir(".") if file.startswith("temp")] - self.assertEqual(len(temp_files), 0) diff --git a/tests/test_extract_graphics.py b/tests/test_extract_graphics.py index 8335db3..14a2cd5 100644 --- a/tests/test_extract_graphics.py +++ b/tests/test_extract_graphics.py @@ -2,7 +2,7 @@ import unittest.mock -from vrdu.renderer import Renderer +from DocParser.vrdu.renderer import Renderer class TestGraphics(unittest.TestCase): diff --git a/tests/test_extract_macro_definitions.py b/tests/test_extract_macro_definitions.py deleted file mode 100644 index 72efeec..0000000 ---
a/tests/test_extract_macro_definitions.py +++ /dev/null @@ -1,104 +0,0 @@ -import unittest -import unittest.mock - -from vrdu.utils import extract_macro_definitions - - -class TestExtractMacroDefinitions(unittest.TestCase): - def setUp(self) -> None: - self.mock_file_content1 = ( - """\\documentclass{article}\\begin{document}\\end{document}""" - ) - self.mock_file_content2 = r"""\documentclass{article} - \newcommand{\Sin}{\mathrm{sin}\,\theta} - \newcommand{\Cos}{\mathrm{cos}\,\theta} - \newcommand{\Tan}{\mathrm{tan}\,\theta} - \begin{document} - \[ \Tan = \frac{\Sin}{\Cos} \] \[ (\Sin)^2 + (\Cos)^2 =1 \] - \[ \cot\theta = \frac{\Cos}{\Sin} \] - \end{document} - """ - self.mock_file_content3 = r"""\documentclass{article} - \newcommand{\trig}[1]{\mathrm{\#1}\,\theta} - \begin{document} - \[ \trig{sin},\,\trig{cos},\,\trig{tan} \] - \[ \trig{tan} = \frac{\trig{sin}}{\trig{cos}} \] - \[ \trig{sin^2} + \trig{cos^2} =1 \] - \[ \int \frac{\trig{cos^3}}{1+\trig{sin^2}}d\theta \] - \end{document} - """ - self.mock_file_content4 = r"""\documentclass{article} - \newcommand{\trig}[2]{\mathrm{\#1}\left(\#2\right)} - \newcommand{\Int}[2]{\int_{\#2}^{\#1}} - \begin{document} - \[ \int\frac{du}{\sqrt{a^2 + u^2}}=\trig{sin^{\!-1}}{\frac{u}{a}} + C \] - \[ \int\trig{sec}{\frac{a}{x}}dx = \frac{1}{a} \log\trig{tan}{\frac{\pi}{4}+ \frac{a}{2x}} + C \] - \[ \Int{a}{b}f(x)dx = \sum_{k=1}^n \trig{sin}{5+\frac{3k}{n}} \] - \[ \Int{b}{a}f(x)dx = \lim_{n \to \infty} \sum_{i=1}^{n}f(x_i)\delta x \] - \end{document} - """ - self.mock_file_content5 = r"""\documentclass{article} - \usepackage{xcolor} - \newcommand{\trig}[3][]{\mathrm{\#2^{\#1}}\left(\#3\right)} - \newcommand{\trigx}[3][]{\mathrm{\#2}\left({\color{\#1}\#3}\right)} - \begin{document} - \[ \trig{sin}{\alpha}, \trig[n]{sin}{\beta},\trig[m]{sin}{\gamma} \] - \[ \trigx[red]{cos}{2\theta}-\trigx[blue]{sin}{2\theta}=\trigx[green]{cos}{4\theta} \] - \[ \theta=\trigx[red]{tan^{-1}}{\frac{x}{y}},\trigx[red]{tan}{\alpha+\beta}=\frac{\trigx[blue]{tan}{\alpha}+\trigx[blue]{tan}{\beta}}{1-\trigx[blue]{tan}{\alpha}\trigx[blue]{tan}{\beta}}\] - \end{document} - """ - - def test_no_macro(self): - with unittest.mock.patch( - "builtins.open", - new=unittest.mock.mock_open(read_data=self.mock_file_content1), - create=True, - ) as file_mock: - result = extract_macro_definitions(file_mock) - self.assertEqual(result, []) - - def test_no_arguments(self): - with unittest.mock.patch( - "builtins.open", - new=unittest.mock.mock_open(read_data=self.mock_file_content2), - create=True, - ) as file_mock: - result = extract_macro_definitions(file_mock) - self.assertEqual( - result, - [ - r"\newcommand{\Sin}{\mathrm{sin}\,\theta}", - r"\newcommand{\Cos}{\mathrm{cos}\,\theta}", - r"\newcommand{\Tan}{\mathrm{tan}\,\theta}", - ], - ) - - def test_more_than_one_arguments(self): - with unittest.mock.patch( - "builtins.open", - new=unittest.mock.mock_open(read_data=self.mock_file_content4), - create=True, - ) as file_mock: - result = extract_macro_definitions(file_mock) - self.assertEqual( - result, - [ - r"\newcommand{\trig}[2]{\mathrm{\#1}\left(\#2\right)}", - r"\newcommand{\Int}[2]{\int_{\#2}^{\#1}}", - ], - ) - - def test_optional_arguments(self): - with unittest.mock.patch( - "builtins.open", - new=unittest.mock.mock_open(read_data=self.mock_file_content5), - create=True, - ) as file_mock: - result = extract_macro_definitions(file_mock) - self.assertEqual( - result, - [ - r"\newcommand{\trig}[3][]{\mathrm{\#2^{\#1}}\left(\#3\right)}", - 
r"\newcommand{\trigx}[3][]{\mathrm{\#2}\left({\color{\#1}\#3}\right)}", - ], - ) diff --git a/tests/test_extract_title_name.py b/tests/test_extract_title_name.py deleted file mode 100644 index cf6093f..0000000 --- a/tests/test_extract_title_name.py +++ /dev/null @@ -1,17 +0,0 @@ -import unittest - - -from vrdu.utils import extract_title_name - - -class TestExtractTitleName(unittest.TestCase): - def test_title_name(self): - self.assertEqual(extract_title_name("\\section{Name}"), "section") - self.assertEqual(extract_title_name("\\subsection*{AnotherName}"), "subsection") - self.assertEqual(extract_title_name("No match"), "") - self.assertEqual( - extract_title_name("\\subsubsection{No match}"), "subsubsection" - ) - self.assertEqual( - extract_title_name("\\subsubsection*{No match}"), "subsubsection" - ) diff --git a/tests/test_is_text_eq.py b/tests/test_is_text_eq.py index 6426411..3baa280 100644 --- a/tests/test_is_text_eq.py +++ b/tests/test_is_text_eq.py @@ -1,6 +1,6 @@ import unittest -from vrdu.renderer import is_text_eq +from DocParser.vrdu.renderer import is_text_eq class TestTextEq(unittest.TestCase): diff --git a/tests/test_remove_hyperref_color.py b/tests/test_remove_predefined_color.py similarity index 89% rename from tests/test_remove_hyperref_color.py rename to tests/test_remove_predefined_color.py index 7db6f39..356f378 100644 --- a/tests/test_remove_hyperref_color.py +++ b/tests/test_remove_predefined_color.py @@ -2,7 +2,7 @@ import unittest.mock -from vrdu.renderer import Renderer +from DocParser.vrdu.renderer import Renderer class TestHyperref(unittest.TestCase): @@ -21,7 +21,7 @@ def test1(self): new=unittest.mock.mock_open(read_data=self.mock_file_content1), create=True, ) as file_mock: - self.renderer.remove_hyperref_color(file_mock) + self.renderer.remove_predefined_color(file_mock) file_mock.assert_called_with(file_mock, "w") file_mock().write.assert_called_with( """\\documentclass{article}\\begin{document}\\end{document}""" @@ -33,7 +33,7 @@ def test2(self): new=unittest.mock.mock_open(read_data=self.mock_file_content2), create=True, ) as file_mock: - self.renderer.remove_hyperref_color(file_mock) + self.renderer.remove_predefined_color(file_mock) file_mock.assert_called_with(file_mock, "w") file_mock().write.assert_called_with( """\\documentclass{article}\\usepackage{hyperref}\\hypersetup{colorlinks=false}\n\\begin{document}\\end{document}""" @@ -45,7 +45,7 @@ def test3(self): new=unittest.mock.mock_open(read_data=self.mock_file_content3), create=True, ) as file_mock: - self.renderer.remove_hyperref_color(file_mock) + self.renderer.remove_predefined_color(file_mock) file_mock.assert_called_with(file_mock, "w") file_mock().write.assert_called_with( """\\documentclass{article}\\usepackage[color_links=true]{hyperref}\\hypersetup{colorlinks=false}\n\\begin{document}\\end{document}""" @@ -57,7 +57,7 @@ def test4(self): new=unittest.mock.mock_open(read_data=self.mock_file_content4), create=True, ) as file_mock: - self.renderer.remove_hyperref_color(file_mock) + self.renderer.remove_predefined_color(file_mock) file_mock.assert_called_with(file_mock, "w") file_mock().write.assert_called_with( """\\documentclass{article}\\usepackage[color_links=true]{hyperref}\\usepackage{amsmath}\\hypersetup{colorlinks=false}\n\\begin{document}\\end{document}""" diff --git a/tests/test_render_abstract.py b/tests/test_render_abstract.py index 16f2cb9..405f6da 100644 --- a/tests/test_render_abstract.py +++ b/tests/test_render_abstract.py @@ -2,7 +2,7 @@ import unittest.mock -from 
vrdu.renderer import Renderer +from DocParser.vrdu.renderer import Renderer class TestAbstract(unittest.TestCase): diff --git a/tests/test_render_algorithm.py b/tests/test_render_algorithm.py index c15821e..a4cf6ad 100644 --- a/tests/test_render_algorithm.py +++ b/tests/test_render_algorithm.py @@ -2,7 +2,7 @@ import unittest.mock -from vrdu.renderer import Renderer +from DocParser.vrdu.renderer import Renderer class TestAlgorithm(unittest.TestCase): diff --git a/tests/test_render_caption.py b/tests/test_render_caption.py index eb21de8..b526f60 100644 --- a/tests/test_render_caption.py +++ b/tests/test_render_caption.py @@ -2,7 +2,7 @@ import unittest.mock -from vrdu.renderer import Renderer +from DocParser.vrdu.renderer import Renderer class TestCaption(unittest.TestCase): diff --git a/tests/test_render_code.py b/tests/test_render_code.py index f6f39fd..c71bd27 100644 --- a/tests/test_render_code.py +++ b/tests/test_render_code.py @@ -2,7 +2,7 @@ import unittest.mock -from vrdu.renderer import Renderer +from DocParser.vrdu.renderer import Renderer class TestCode(unittest.TestCase): @@ -71,7 +71,7 @@ def test_no_lstset(self): new=unittest.mock.mock_open(read_data=self.mock_file_content1), create=True, ) as file_mock: - self.renderer.remove_lstlisting_color(file_mock) + self.renderer.remove_predefined_color(file_mock) file_mock.assert_called_with(file_mock, "w") file_mock().write.assert_called_with( """\\documentclass{article}\\begin{document}\\end{document}""" @@ -83,7 +83,7 @@ def test_remove_lstset(self): new=unittest.mock.mock_open(read_data=self.mock_file_content5), create=True, ) as file_mock: - self.renderer.remove_lstlisting_color(file_mock) + self.renderer.remove_predefined_color(file_mock) file_mock.assert_called_with(file_mock, "w") file_mock().write.assert_called_with( r"""\documentclass{article}\n\usepackage{listings}\n\usepackage{xcolor}\n\n\definecolor{codegreen}{rgb}{0,0.6,0}\n\definecolor{codegray}{rgb}{0.5,0.5,0.5}\n\definecolor{codepurple}{rgb}{0.58,0,0.82}\n\definecolor{backcolour}{rgb}{0.95,0.95,0.92}\n\n\lstdefinestyle{mystyle}{\n backgroundcolor=\color{backcolour}, \n commentstyle=\color{codegreen},\n keywordstyle=\color{magenta},\n numberstyle=\tiny\color{codegray},\n stringstyle=\color{codepurple},\n basicstyle=\ttfamily\footnotesize,\n breakatwhitespace=false, \n breaklines=true, \n captionpos=b, \n keepspaces=true, \n numbers=left, \n numbersep=5pt, \n showspaces=false, \n showstringspaces=false,\n showtabs=false, \n tabsize=2\n}\n\n\n\n\begin{document}\nThe next code will be directly imported from a file\n\n\lstinputlisting[language=Octave]{BitXorMatrix.m}\n\end{document}""" diff --git a/tests/test_render_footnote.py b/tests/test_render_footnote.py index e0fcebd..e81e0fd 100644 --- a/tests/test_render_footnote.py +++ b/tests/test_render_footnote.py @@ -2,7 +2,7 @@ import unittest.mock -from vrdu.renderer import Renderer +from DocParser.vrdu.renderer import Renderer class TestFootnote(unittest.TestCase): diff --git a/tests/test_render_tabular.py b/tests/test_render_tabular.py index 55f6f92..e57f363 100644 --- a/tests/test_render_tabular.py +++ b/tests/test_render_tabular.py @@ -2,7 +2,7 @@ import unittest.mock -from vrdu.renderer import Renderer +from DocParser.vrdu.renderer import Renderer class TestTabular(unittest.TestCase): @@ -62,4 +62,4 @@ def test_mix_tabulars(self): file_mock.assert_called_with(file_mock, "w") file_mock().write.assert_called_with( """\\documentclass{article}\\begin{document}Table \\ref{demo-table} has a 
caption:\\begin{table}[!h]\\begin{center}{\\color{Table_color}\\begin{tabular}{||c c c c||} \\hline Col1 & Col2 & Col2 & Col3 \\ [0.5ex] \\hline\\hline 1 & 6 & 87837 & 787 \\ \\hline 2 & 7 & 78 & 5415 \\ \\hline 3 & 545 & 778 & 7507 \\ \\hline 4 & 545 & 18744 & 7560 \\ \\hline 5 & 88 & 788 & 6344 \\ [1ex] \\hline\\end{tabular}}\\caption{\\label{demo-table}Your caption.}\\end{center}\\end{table} \\begin{table}[!h]\\begin{center}{\\color{Table_color}\\begin{tabularx}{||c c c c||} \\hline Col1 & Col2 & Col2 & Col3 \\ [0.5ex] \\hline\\hline 1 & 6 & 87837 & 787 \\ \\hline 2 & 7 & 78 & 5415 \\ \\hline 3 & 545 & 778 & 7507 \\ \\hline 4 & 545 & 18744 & 7560 \\ \\hline 5 & 88 & 788 & 6344 \\ [1ex] \\hline\\end{tabularx}}\\caption{\\label{demo-table}Your caption.}\\end{center}\\end{table}\\end{document}""" - ) \ No newline at end of file + ) diff --git a/tests/test_render_title.py b/tests/test_render_title.py index 343714e..122063b 100644 --- a/tests/test_render_title.py +++ b/tests/test_render_title.py @@ -2,7 +2,7 @@ import unittest.mock -from vrdu.renderer import Renderer +from DocParser.vrdu.renderer import Renderer class TestTitle(unittest.TestCase): diff --git a/vrdu/block.py b/vrdu/block.py deleted file mode 100644 index 0c46dcc..0000000 --- a/vrdu/block.py +++ /dev/null @@ -1,212 +0,0 @@ -from dataclasses import dataclass -from typing import Dict, List, Tuple - -from pyparsing import Any - - -@dataclass -class BoundingBox: - """A simple bounding box representation. - The coordinates are in the form of (x0, y0, x1, y1) - The origin is in the top left and (x0, y0) is the top left corner, - (x1, y1) is the bottom right corner. - """ - - x0: float - y0: float - x1: float - y1: float - - @property - def width(self) -> float: - return self.x1 - self.x0 - - @property - def height(self) -> float: - return self.y1 - self.y0 - - def __len__(self) -> int: - return 4 - - def __repr__(self) -> str: - return f"BoundingBox({self.x0}, {self.y0}, {self.x1}, {self.y1})" - - def __getitem__(self, index: int) -> float: - return (self.x0, self.y0, self.x1, self.y1)[index] - - def area(self): - return abs((self.x1 - self.x0) * (self.y1 - self.y0)) - - def overlap(self, other): - if ( - self.x0 > other.x1 - or self.x1 < other.x0 - or self.y0 > other.y1 - or self.y1 < other.y0 - ): - return 0 - x_overlap = max(0, min(self.x1, other.x1) - max(self.x0, other.x0)) - y_overlap = max(0, min(self.y1, other.y1) - max(self.y0, other.y0)) - return x_overlap * y_overlap - - def to_dict(self) -> Dict[str, Any]: - return {"bbox": (self.x0, self.y0, self.x1, self.y1)} - - @classmethod - def from_dict(cls, data: Dict[str, Any]): - return cls( - x0=data["bbox"][0], - y0=data["bbox"][1], - x1=data["bbox"][2], - y1=data["bbox"][3], - ) - - @classmethod - def from_list(cls, data: List[Tuple[float, float, float, float, float, float]]): - min_x = min(data, key=lambda x: x[1])[1] - min_y = min(data, key=lambda x: x[0])[0] - max_x = max(data, key=lambda x: x[4])[4] - max_y = max(data, key=lambda x: x[3])[3] - return cls(x0=min_x, y0=min_y, x1=max_x, y1=max_y) - - -class Block: - current_id: int = 0 - - def __init__( - self, - block_id: int = None, - bounding_box: BoundingBox = None, - category: int = None, - page_index: int = None, - previous_block: int = None, - parent_block: int = None, - next_block: int = None, - source_code: str = None, - labels: List[str] = None, - references: List[str] = None, - ) -> None: - if not block_id: - self.id = Block.current_id - Block.current_id += 1 - else: - self.id = block_id - - self._category = 
category - self._page_index = page_index - self._bounding_box = bounding_box - self._previous_block = previous_block - self._parent_block = parent_block - self._next_block = next_block - self._source_code = source_code - self._labels = labels - self._references = references - - def __repr__(self) -> str: - return f"Block(id={self.id}, category={self.category}, page_index={self.page_index}, bbox={self.bbox}), source_code={self.source_code}" - - @property - def bbox(self): - return self._bounding_box - - @bbox.setter - def bbox(self, value: BoundingBox) -> None: - self._bounding_box = value - - @property - def labels(self) -> List[str]: - return self._labels - - @labels.setter - def labels(self, value: List[str]) -> None: - self._labels = value - - @property - def references(self) -> List[str]: - return self._references - - @references.setter - def references(self, value: List[str]) -> None: - self._references = value - - @property - def block_id(self) -> int: - return self.id - - @property - def category(self) -> int: - return self._category - - @category.setter - def category(self, value: int) -> None: - self._category = value - - @property - def page_index(self) -> int: - return self._page_index - - @page_index.setter - def page_index(self, value: int) -> None: - self._page_index = value - - @property - def source_code(self) -> str: - return self._source_code - - @source_code.setter - def source_code(self, value: str) -> None: - self._source_code = value - - @property - def parent_block(self) -> int: - return self._parent_block - - @parent_block.setter - def parent_block(self, value: int) -> None: - self._parent_block = value - - @property - def previous_block(self) -> int: - return self._previous_block - - @property - def next_block(self) -> int: - return self._next_block - - @property - def height(self) -> float: - return self._bounding_box.height - - @property - def width(self) -> float: - return self._bounding_box.width - - def to_dict(self): - data = self._bounding_box.to_dict() - data.update( - { - "block_id": self.block_id, - "category": self.category, - "page_index": self.page_index, - "previous_block": self.previous_block, - "parent_block": self.parent_block, - "next_block": self.next_block, - "source_code": self.source_code, - "labels": self.labels, - "references": self.references, - } - ) - return data - - @classmethod - def from_dict(cls, data: Dict[str, Any]): - return cls( - block_id=data["block_id"], - bounding_box=BoundingBox.from_dict(data), - category=data["category"], - previous_block=data["previous_block"], - parent_block=data["parent_block"], - next_block=data["next_block"], - source_code=data["source_code"], - page_index=data["page_index"], - ) diff --git a/vrdu/layout_annotation.py b/vrdu/layout_annotation.py deleted file mode 100644 index b4a186b..0000000 --- a/vrdu/layout_annotation.py +++ /dev/null @@ -1,620 +0,0 @@ -from collections import defaultdict -import os -import glob -import subprocess -from typing import Any, DefaultDict, Dict, List, Tuple -import matplotlib.pyplot as plt -import numpy as np -from skimage.measure import label, regionprops -from PIL import Image, ImageDraw, ImageFont -import re -from tqdm import tqdm - -from pdfminer.high_level import extract_pages -from pdfminer.layout import LTFigure, LTPage -from vrdu import utils - - -from vrdu.block import Block, BoundingBox -from vrdu.config import config, envs -from vrdu import logger - -log = logger.get_logger(__name__) - - -class LayoutAnnotation: - # 
https://www.overleaf.com/learn/latex/Lengths_in_LaTeX - ONE_INCH = 72.27 - - def __init__(self, tex_file: str) -> None: - self.tex_file = tex_file - self.main_directory = os.path.dirname(tex_file) - self.output_directory = os.path.join(self.main_directory, "output") - self.result_directory = os.path.join(self.output_directory, "result") - self.layout_metadata: Dict = {} - self.text_info = utils.load_json( - os.path.join(self.result_directory, "texts.json") - ) - self.pdf_images_path = os.path.join(self.output_directory, "paper_colored") - - def extract_pdf_layouts(self) -> List[LTPage]: - """Extracts layout information of each page from a rendered PDF. - - This method reads the rendered PDF file and extracts the layout information for each page. - The layout information includes the position, size, and other attributes of each element on the page. - - Returns: - List[LTPage]: A list of LTPage objects representing the layout of each page. - - Example: - >>> annotator = LayoutAnnotation("paper_original.tex") - >>> layouts = annotator.extract_pdf_layouts() - >>> for layout in layouts: - ... print(layout) - """ - rendered_pdf = os.path.join(self.main_directory, "paper_colored.pdf") - page_layouts = extract_pages(rendered_pdf) - return list(page_layouts) - - def parse_metadata(self, pdf_layouts: List[LTPage]) -> None: - """Parse metadata from PDF layouts and store it in the layout_metadata attribute. - - Args: - - pdf_layouts (List[LTPage]): A list of LTPage objects representing the PDF layouts. - - Returns: - - None - """ - pt2px = config.ppi / self.ONE_INCH - - layout_metadata = dict() - - # get metadata from log file - log_file = os.path.join(self.main_directory, "paper_colored.log") - # see renderer.py add_layout_definition for details - regex_pattern = r"\[vrdu_data_process: The (.*) is: ([-+]?\d+\.\d+)pt\]" - - with open(log_file, "r", encoding="latin-1") as file: - log_content = file.read() - - for match in re.findall(regex_pattern, log_content): - key = match[0] - value = float(match[1]) - layout_metadata[key] = value - - textwidth = layout_metadata["textwidth"] - columnsep = layout_metadata["columnsep"] - columnwidth = layout_metadata["columnwidth"] - # textwidth = n * columnwidth + (n - 1) * columnsep - num_columns = round((textwidth + columnsep) / (columnwidth + columnsep)) - layout_metadata["num_columns"] = num_columns - - # https://www.overleaf.com/learn/latex/Page_size_and_margins - element1 = self.ONE_INCH + layout_metadata["hoffset"] - element2 = self.ONE_INCH + layout_metadata["voffset"] - element3 = layout_metadata["oddsidemargin"] - element4 = layout_metadata["topmargin"] - element5 = layout_metadata["headheight"] - element6 = layout_metadata["headsep"] - margin_width = element1 + element3 - margin_height = (element2 - (element4 - element5)) + element6 / 2 - layout_metadata["margin_width"] = margin_width - - # sort all images by page index, see utils.pdf2jpg for details - image_files = sorted( - glob.glob(f"{self.pdf_images_path}/*.jpg"), key=lambda x: x[-6:-4] - ) - for page_index, page_layout in enumerate(pdf_layouts): - layout_metadata[page_index] = {} - - pdf_width, pdf_height = page_layout.width, page_layout.height - layout_metadata[page_index]["pdf_width"] = pdf_width - layout_metadata[page_index]["pdf_height"] = pdf_height - - with Image.open(image_files[page_index]) as page_image: - image_width, image_height = page_image.size - layout_metadata[page_index]["image_width"] = image_width - layout_metadata[page_index]["image_height"] = image_height - - px2img = image_height / pdf_height
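- # Illustrative sanity check (hypothetical numbers, not from the repo): a - # US-letter page is 792pt tall, so a 1650px-tall exported JPEG gives - # px2img = 1650 / 792 ≈ 2.08 image pixels per PDF point; pt2px = - # config.ppi / 72.27 plays the same role for the TeX lengths read from - # the log file.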
- layout_metadata[page_index]["px2img"] = px2img - layout_metadata[page_index]["separations"] = [0] - - # x is initialized at the left boundary of a column minus half of the column separation width - # so that each separation falls in the middle of two adjacent columns - x = margin_width - 0.5 * columnsep - for i in range(num_columns - 1): - separation = x + columnwidth + columnsep - layout_metadata[page_index]["separations"].append( - separation * pt2px * px2img - ) - x += separation - # TODO: consider the margin notes - layout_metadata[page_index]["separations"].append(pdf_width * px2img) - layout_metadata[page_index]["top_margin"] = margin_height - - utils.export_to_json( - layout_metadata, - os.path.join(self.result_directory, "layout_metadata.json"), - ) - - self.layout_metadata = layout_metadata - - def retrieve_figure_source_code( - self, figure_layout_info: Dict[int, List[Block]] - ) -> None: - """Retrieves the source code of a figure using synctex. - - Args: - figure_layout_info (Dict[int, List[Block]]): A dictionary where the keys are page indices - and the values are lists of Block objects representing the bounding boxes of figures on each page. - - Returns: - None - - Note: - use `synctex help edit` to view the usage of synctex - """ - # paper_colored.tex is the file we operate on - tex_filename = os.path.basename(self.tex_file).replace( - "paper_original", "paper_colored" - ) - pdf_filename = tex_filename.replace(".tex", ".pdf") - with open(os.path.join(self.main_directory, tex_filename), "r") as file: - content_lines = file.readlines() - - for page_index, blocks in figure_layout_info.items(): - for block in blocks: - bbox = block.bbox - center_x, center_y = (bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2 - log.debug( - f"page index: {page_index + 1}, center: ({center_x}, {center_y}), pdf filename: {pdf_filename}" - ) - # use synctex to retrieve the line index corresponding to the center of the bounding box - result = subprocess.run( - [ - "synctex", - "edit", - "-o", - f"{page_index + 1}:{center_x:.2f}:{center_y:.2f}:{pdf_filename}", - "-d", - self.main_directory, - ], - check=True, - capture_output=True, - text=True, - ) - # parse the output of synctex to get the source code - line_index = result.stdout.split("\nLine:")[1].split("\n")[0] - block.source_code = content_lines[int(line_index) - 1] - log.debug(f"line index: {line_index}, source code: {block.source_code}") - - def generate_figure_bb(self, pdf_layouts: List[LTPage]) -> Dict[int, List[Block]]: - """Generate bounding boxes for figures in a PDF layout using Pdfminer. - - Args: - pdf_layouts (List[LTPage]): A list of LTPage objects representing the layout of a PDF. - - Returns: - Dict[int, List[Block]]: A dictionary where the keys are page indices and the values are lists of - Block objects representing the bounding boxes of figures on each page.
- """ - layout_info = defaultdict(list) - for page_index, page_layout in enumerate(pdf_layouts): - height = page_layout.height - for element in page_layout: - if not isinstance(element, LTFigure): - continue - # the coordinate system of Pdfminer is in contrast to the coordinate system of the image - # by fliping the y axis - y0 = height - element.bbox[3] - y1 = height - element.bbox[1] - x0 = element.bbox[0] - x1 = element.bbox[2] - layout_info[page_index].append( - Block( - bounding_box=BoundingBox(x0, y0, x1, y1), - page_index=page_index, - category=config.name2category["Figure"], - source_code="", - ) - ) - - # find the corresponding source code to figure bounding box - self.retrive_figure_source_code(layout_info) - - # convert bounding boxes from PDF coordinate system to image coordinate system - self.transform(layout_info) - return layout_info - - def transform(self, layout_info: Dict[int, List[Block]]) -> None: - """Transforms bounding boxes from PDF coordinate system to image coordinate system, - and change them in place. - - Args: - layout_info (Dict[int, List[Block]]): A dictionary containing the layout information of each page. - The keys represent the page indices, and the values are lists of Block objects - representing the elements in the layout. - - Returns: - None - """ - for page_index in layout_info.keys(): - px2img = self.layout_metadata[page_index]["px2img"] - for index, element in enumerate(layout_info[page_index]): - x0, y0, x1, y1 = element.bbox - # scale - width, height = element.width, element.height - x0, y0 = x0 * px2img, y0 * px2img - x1, y1 = x0 + width * px2img, y0 + height * px2img - layout_info[page_index][index].bbox = BoundingBox(x0, y0, x1, y1) - - def generate_non_figure_bb(self) -> Dict[int, List[Block]]: - """Generates non-figure bounding boxes using the image pairs. - - Returns: - Dict[int, List[Block]]: A dictionary containing the layout information of each page. - The keys represent the page indices, and the values are lists of Block objects - representing the elements in the layout. - """ - background_directory = os.path.join(self.output_directory, "paper_white") - block_directories = glob.glob( - f"{self.output_directory}/paper_{config.folder_prefix}*" - ) - layout_info = defaultdict(list) - pattern = r"paper_(\w+)_(\d{5})_(.*?)_(\d{5})" - - for block_directory in tqdm(sorted(block_directories)): - log.debug(f"Processing {block_directory}") - image_pairs = get_image_pairs(block_directory, background_directory) - matches = re.match(pattern, os.path.basename(block_directory)) - if not matches: - raise ValueError(f"Cannot find the matching pattern: {block_directory}") - category = matches.group(3) - index = int(matches.group(4)) - log.debug(f"category: {category}, index: {index}") - - elements = [] - for image_pair in image_pairs: - page_index = image_pair[0] - - image1_array = np.array(plt.imread(image_pair[1]), dtype=np.uint8) - image2_array = np.array(plt.imread(image_pair[2]), dtype=np.uint8) - - diff_image = np.abs(image2_array - image1_array, dtype=np.uint8) - if np.all(diff_image == 0): - continue - labeled_image, num = label( - diff_image > config.threshold, return_num=True - ) - if num == 0: - continue - - regions = regionprops(labeled_image) - bounding_boxes = [region.bbox for region in regions] - - if len(bounding_boxes) == 0: - continue - - separations = self.layout_metadata[page_index]["separations"] - top_margin = self.layout_metadata[page_index]["top_margin"] - - # We do not consider the cross column case for these envs. 
- if category in envs.one_column_envs: - bboxes = [bb for bb in bounding_boxes] - if len(bboxes) == 0: - continue - element = Block( - bounding_box=BoundingBox.from_list(bboxes), - source_code=self.text_info[category][index], - category=config.name2category[category], - page_index=page_index, - ) - if elements: - element.parent_block = elements[-1].block_id - elements.append(element) - continue - - # consider possible cross column case - for column in range(self.layout_metadata["num_columns"]): - # min_x: bb[1], min_y: bb[0], max_x: bb[4], max_y: bb[3] - column_boxes = [ - bb - for bb in bounding_boxes - if bb[1] >= separations[column] - and bb[1] <= separations[column + 1] - ] - if not column_boxes: - continue - - element = Block( - bounding_box=BoundingBox.from_list(column_boxes), - source_code=self.text_info[category][index], - category=config.name2category[category], - page_index=page_index, - ) - if elements: - element.parent_block = elements[-1].block_id - - if ( - len(elements) > 0 - and elements[-1].category == element.category - and elements[-1].page_index == element.page_index - and elements[-1].source_code == element.source_code - and elements[-1].bbox.overlap(element.bbox) - ): - elements[-1].bbox = BoundingBox( - min( - elements[-1].bbox.x0, - element.bbox.x0, - ), - min( - elements[-1].bbox.y0, - element.bbox.y0, - ), - max( - elements[-1].bbox.x1, - element.bbox.x1, - ), - max( - elements[-1].bbox.y1, - element.bbox.y1, - ), - ) - continue - elements.append(element) - - for element in elements: - layout_info[element.page_index].append(element) - - return layout_info - - def generate_layout_info(self) -> Dict[int, List[Block]]: - """Generate layout information for the given PDF. - - This function extracts the PDF layouts using the `extract_pdf_layouts` method - and parses the metadata using the `parse_metadata` method. - Then, it generates non-figure bounding boxes using the `generate_non_figure_bb` method - and figure bounding boxes using the `generate_figure_bb` method. - - Args: - None - - Returns: - Dict[int, List[Block]]: A dictionary containing the layout information for - each page of the PDF. - The keys represent the page indices, and the values are lists of `Block` objects - that represent the bounding boxes. - - """ - pdf_layouts = self.extract_pdf_layouts() - self.parse_metadata(pdf_layouts) - layout_info = self.generate_non_figure_bb() - figure_layout_info = self.generate_figure_bb(pdf_layouts) - - for page_index in layout_info.keys(): - layout_info[page_index].extend(figure_layout_info[page_index]) - return layout_info - - def generate_reading_annotation( - self, layout_info: Dict[int, List[Block]] - ) -> DefaultDict[str, List]: - """Generate a reading annotation based on the layout information. - - Args: - layout_info (Dict[int, List[Block]]): A dictionary containing the layout information - for each page index. The keys are the page indices and the values are lists of - `Block` objects representing the blocks on each page. - - Returns: - DefaultDict[str, List]: A defaultdict containing the reading annotation. The keys - of the defaultdict are the page indices and the values are lists of dictionaries - representing the reading annotation for each block on the page. Each dictionary - contains the following keys: - - "source_code": The source code of the block. - - "image_path": The path to the saved image of the block. - - "category": The category of the block. 
- - The defaultdict also contains the following keys: - - "categories": A list of dictionaries representing the categories. Each - dictionary contains the following keys: - - "id": The ID of the category. - - "name": The name of the category. - - "macros": A dictionary containing the macro definitions extracted from - the original tex file. - """ - reading_annotation = defaultdict(list) - - # sort all images by page index, see utils.pdf2jpg for details - image_files = sorted( - glob.glob(os.path.join(self.pdf_images_path, "*.jpg")), - key=lambda x: x[-6:-4], - ) - count = 0 - for page_index in layout_info.keys(): - page_image = Image.open(image_files[page_index]) - for block in layout_info[page_index]: - cropped_image = page_image.crop(block.bbox) - - image_name = config.folder_prefix + str(count).zfill(4) + ".jpg" - count += 1 - image_path = os.path.join(self.result_directory, image_name) - cropped_image.save(image_path) - reading_annotation[page_index].append( - { - "source_code": block.source_code, - "image_path": image_name, - "category": block.category, - } - ) - page_image.close() - - reading_annotation["categories"] = [ - {"id": index, "name": category} - for index, category, _ in config.config["category_name"] - ] - - return reading_annotation - - def generate_image_annotation( - self, layout_info: Dict[int, List[Block]] - ) -> Dict[int, Dict[str, Any]]: - """Generate image annotations based on the layout information. - - Args: - layout_info (Dict[int, List[Block]]): A dictionary mapping page indices to a list of Block objects - representing the layout information. - - Returns: - Dict[int, Dict[str, Any]]: A dictionary mapping page indices to annotated image info. - """ - # sort all images by page index, see utils.pdf2jpg for details - # FIXME: use more robust way - image_files = sorted( - glob.glob(os.path.join(self.pdf_images_path, "*.jpg")), - key=lambda x: x[-6:-4], - ) - - image_info = {} # annotation image info member of COCO - for page_index in layout_info.keys(): - image_info[page_index] = {} - page_image = Image.open(image_files[page_index]) - draw = ImageDraw.Draw(page_image) - # use `locate .ttf` to find the available fonts - font = ImageFont.truetype( - config.config["annotation_image_font_type"], - config.config["annotation_image_font_size"], - ) - - for element in layout_info[page_index]: - category = element.category - draw.rectangle( - element.bbox, outline=config.colors_map[str(category)], width=3 - ) - draw.text( - (element.bbox[0], element.bbox[1]), - config.category2name[category], - fill=(255, 0, 0), - font=font, - ) - - image_name = "page_" + str(page_index).zfill(4) + ".jpg" - annotated_image_path = os.path.join(self.result_directory, image_name) - image_info[page_index]["file_name"] = image_name - image_info[page_index]["width"] = page_image.width - image_info[page_index]["height"] = page_image.height - page_image.save(annotated_image_path) - page_image.close() - - return image_info - - def annotate(self): - """Annotates the layout, reading, order, and quality report of the given image. 
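- - Running the three steps below leaves layout_info.json, - layout_annotation.json (in COCO format) and reading_annotation.json in the - result directory.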
- - Returns: - None - """ - # step1: generate layout info - layout_info = self.generate_layout_info() - layout_info_data = { - key: [x.to_dict() for x in values] for key, values in layout_info.items() - } - layout_info_file = os.path.join(self.result_directory, "layout_info.json") - utils.export_to_json(layout_info_data, layout_info_file) - - # step2: generate layout detection result - image_annotation = self.generate_image_annotation(layout_info) - layout_annotation_file = os.path.join( - self.result_directory, "layout_annotation.json" - ) - utils.export_to_coco( - layout_info, image_annotation, filename=layout_annotation_file - ) - - # step3: generate reading annotation - reading_annotation = self.generate_reading_annotation(layout_info) - reading_annotation_file = os.path.join( - self.result_directory, "reading_annotation.json" - ) - utils.export_to_json(reading_annotation, reading_annotation_file) - - -def get_image_pairs(dir1: str, dir2: str): - """ - Generate a list of image pairs based on the directories provided. - - Parameters: - dir1 (str): The directory path to the first set of images. - dir2 (str): The directory path to the second set of images. - - Raises: - FileNotFoundError: If the number of images in each directory does not - match or if the page index in the file names does not match. - - Returns: - list: A list of tuples representing the image pairs. - Each tuple contains the page index, the path to the rendered image, - and the path to the changed image. - """ - file_pattern = os.path.join(dir1, "*.jpg") - rendered_jpg_files = sorted(glob.glob(file_pattern)) - file_pattern = os.path.join(dir2, "*.jpg") - changed_jpg_files = sorted(glob.glob(file_pattern)) - - if len(rendered_jpg_files) != len(changed_jpg_files): - raise FileNotFoundError("Wrong image path or file name or page index!") - - def extract_page_index(filename: str) -> int: - pattern = r"thread-\d+-page-(\d+)\.jpg" - - match = re.search(pattern, filename) - if match: - page_index = int(match.group(1)) - return page_index - 1 - else: - raise ValueError("Cannot found corresponding page index") - - page_indices = [] - for i in range(len(rendered_jpg_files)): - file_name = os.path.basename(rendered_jpg_files[i]) - page_index = extract_page_index(file_name) - page_indices.append(int(page_index)) - - image_pairs = list(zip(page_indices, rendered_jpg_files, changed_jpg_files)) - return image_pairs - - -def generate_geometry_annotation( - page_image: Image.Image, layout_elements: List[Block] -) -> Image.Image: - """ - Generate an annotation for an image. - - Args: - page_image (Image.Image): The image to annotate. - page_elements (List[LTComponent]): A list of elements to be annotated. - - Returns: - Image.Image: The annotated image. 
- """ - draw = ImageDraw.Draw(page_image) - # use `locate .ttf` to find the available fonts - font = ImageFont.truetype( - config.config["annotation_image_font_type"], - config.config["annotation_image_font_size"], - ) - - for index, element in enumerate(layout_elements): - category = element.category - draw.rectangle(element.bbox, outline=config.colors_map[str(category)], width=3) - draw.text( - (element.bbox[0], element.bbox[1]), - config.category2name[category], - fill=(255, 0, 0), - font=font, - ) - - return page_image diff --git a/vrdu/order_annotation.py b/vrdu/order_annotation.py deleted file mode 100644 index 3462d86..0000000 --- a/vrdu/order_annotation.py +++ /dev/null @@ -1,344 +0,0 @@ -import re -import os -from uuid import uuid4 - -from vrdu.block import Block -from vrdu.config import config -from vrdu import utils -from vrdu import logger - -log = logger.get_logger(__name__) - - -class OrderAnnotation: - def __init__(self, tex_file: str) -> None: - self.tex_file = tex_file - self.main_directory = os.path.dirname(tex_file) - self.result_directory = os.path.join(self.main_directory, "output/result") - layout_info_file = os.path.join(self.result_directory, "layout_info.json") - layout_info_data = utils.load_json(layout_info_file) - layout_info = { - int(key): [Block.from_dict(item) for item in values] - for key, values in layout_info_data.items() - } - - # result - self.annotations = {} - self.annotations["annotations"] = [ - _block - for page_index in layout_info.keys() - for _block in layout_info[page_index] - ] - - def annotate(self): - self.annotations["orders"] = [] - self.generate_sortable_envs_order() - - self.generate_float_envs_order() - - self.generate_cross_reference_order() - - order_annotation_file = os.path.join( - self.result_directory, "order_annotation.json" - ) - - transformed_annotations = { - "annotations": [x.to_dict() for x in self.annotations["annotations"]], - "orders": self.annotations["orders"], - } - - utils.export_to_json(transformed_annotations, order_annotation_file) - - def generate_cross_reference_order(self): - annotations = [] - - # map from label to block_id - label_to_block_id = {} - for block in self.annotations["annotations"]: - if not block.labels: - continue - for _label in block.labels: - label_to_block_id[_label] = block.block_id - - ref_patterns = "|".join( - [ - r"\\ref\{(.*?)\}", - r"\\eqref\{(.*?)\}", - r"\\pageref\{(.*?)\}", - r"\\autoref\{(.*?)\}", - r"\\vref\{(.*?)\}", - r"\\cref\{(.*?)\}", - r"\\labelcref\{(.*?)\}", - ] - ) - # generate reference according to label - for block in self.annotations["annotations"]: - if config.category2name[block.category] not in ["Text", "Text-EQ"]: - continue - block.references = [ - x - for group in re.findall(ref_patterns, block.source_code) - for x in group - if x - ] - for _label in block.references: - if _label in label_to_block_id: - annotations.append( - { - "type": "explicit-cite", - "from": block.block_id, - "to": label_to_block_id[_label], - } - ) - - for block in self.annotations["annotations"]: - if config.category2name[block.category] != "Caption": - continue - if not block.references: - continue - for _label in block.references: - if _label not in label_to_block_id: - continue - annotations.append( - { - "type": "implicit-cite", - "from": block.block_id, - "to": label_to_block_id[_label], - } - ) - - # generate reference for float environments - for block in self.annotations["annotations"]: - if config.category2name[block.category] not in ["Table", "Algorithm"]: - continue - 
block.references = [ - x - for group in re.findall(ref_patterns, block.source_code) - for x in group - if x - ] - for _label in block.references: - if _label in label_to_block_id: - annotations.append( - { - "type": "explicit-cite", - "from": block.block_id, - "to": label_to_block_id[_label], - } - ) - - self.annotations["orders"].extend(annotations) - - def generate_float_envs_order(self): - label_pattern = r"\\label\{(.*?)\}" - - with open(self.tex_file, "r") as f: - latex_content = f.read() - # 0. add labels for titles - # TODO: add labels for other types of titles - for block in self.annotations["annotations"]: - if config.category2name[block.category] != "Title": - continue - block.labels = re.findall(label_pattern, block.source_code) - - start_index = latex_content.find(block.source_code) - if start_index == -1: - continue - end_index = start_index + len(block.source_code) - _matches = re.finditer(label_pattern, latex_content[end_index:], re.DOTALL) - for _match in _matches: - label_start_index, label_end_index = ( - _match.start() + end_index, - _match.end() + end_index, - ) - label_content = latex_content[label_start_index:label_end_index] - if latex_content[end_index:label_start_index].isspace(): - block.labels.extend(re.findall(label_pattern, label_content)) - break - - # 1. add labels for equations - for block in self.annotations["annotations"]: - if config.category2name[block.category] != "Equation": - continue - block.labels = re.findall(label_pattern, block.source_code) - - # 2. add labels for float envs - # find the intervals of the float environments - category_to_patterns = { - "Table": re.compile( - r"\\begin\{table\*?\}(.*?)\\end\{table\*?\}", re.DOTALL - ), - "Figure": re.compile( - r"\\begin\{figure\*?\}(.*?)\\end\{figure\*?\}", re.DOTALL - ), - "Algorithm": re.compile( - r"\\begin\{algorithm\*?\}(.*?)\\end\{algorithm\*?\}", re.DOTALL - ), - } - - category_to_indices = {} - for category, pattern in category_to_patterns.items(): - category_to_indices[category] = [] - indices = pattern.finditer(latex_content) - # we attach a uuid to each float environment match in case - # there is no explicit cite - for _match in indices: - category_to_indices[category].append( - (_match.start(), _match.end(), str(uuid4())) - ) - - for category_name, indices in category_to_indices.items(): - # find labels for those float environments - for block in self.annotations["annotations"]: - if config.category2name[block.category] != category_name: - continue - - start_index = latex_content.find(block.source_code) - if start_index == -1: - continue - end_index = start_index + len(block.source_code) - - for index in indices: - if start_index < index[0] or end_index > index[1]: - continue - - labels = re.findall( - label_pattern, latex_content[index[0] : index[1]] - ) - block.labels = labels - block.labels.append(index[2]) - - # add references for captions to those float environments - for block in self.annotations["annotations"]: - if config.category2name[block.category] != "Caption": - continue - start_index = latex_content.find(block.source_code) - if start_index == -1: - continue - end_index = start_index + len(block.source_code) - for index in indices: - if start_index < index[0] or end_index > index[1]: - continue - - block.references = [index[2]] - - def generate_sortable_envs_order(self): - annotations = [] - sortable_categories = [ - config.name2category[name] for name in config.sortable_categories - ] - - sortable_elements = [ - _block - for _block in self.annotations["annotations"] - if
_block.category in sortable_categories - ] - - title_categories = [ - config.name2category[x] for x in ["Title", "PaperTitle", "Abstract"] - ] - - text_categories = [ - config.name2category[x] for x in ["Text", "Text-EQ", "Equation", "List"] - ] - - stack = [] - for index, element in enumerate(sortable_elements): - if index == 0 or not stack: - stack.append(element) - continue - - # case 0: both corresponding to the same text, mark as identical - if element.parent_block == stack[-1].block_id: - annotations.append( - { - "type": "identical", - "from": element.block_id, - "to": stack[-1].block_id, - } - ) - stack.pop() - stack.append(element) - continue - - # case 1: both in the text category, mark as adj - if ( - element.category in text_categories - and stack[-1].category in text_categories - ): - annotations.append( - { - "type": "adj", - "from": element.block_id, - "to": stack[-1].block_id, - } - ) - stack.pop() - stack.append(element) - continue - - # case 2: current in text, prev in title, mark as sub - if ( - element.category in text_categories - and stack[-1].category in title_categories - ): - if element.category != stack[-1].category: - annotations.append( - { - "type": "sub", - "from": element.block_id, - "to": stack[-1].block_id, - } - ) - stack.append(element) - continue - - # case 3: current in title, prev in text, find the most recent title - if ( - element.category in title_categories - and stack[-1].category in text_categories - ): - while stack and stack[-1].category not in title_categories: - stack.pop() - - if not stack: - stack.append(element) - continue - - annotations.append( - { - "type": "peer", - "from": element.block_id, - "to": stack[-1].block_id, - } - ) - stack.append(element) - continue - - # case 4: both in titles, mark as peer - if ( - element.category in title_categories - and stack[-1].category in title_categories - ): - annotations.append( - { - "type": "peer", - "from": element.block_id, - "to": stack[-1].block_id, - } - ) - stack.pop() - stack.append(element) - continue - - if element.category == config.name2category["Footnote"]: - annotations.append( - { - "type": "explicit-cite", - "from": element.block_id, - "to": stack[-1].block_id, - } - ) - continue - - self.annotations["orders"].extend(annotations) diff --git a/vrdu/preprocess.py b/vrdu/preprocess.py deleted file mode 100644 index f4f4003..0000000 --- a/vrdu/preprocess.py +++ /dev/null @@ -1,190 +0,0 @@ -import os -import re - -from arxiv_cleaner.cleaner import Cleaner - -from vrdu.config import envs, config -from vrdu import utils -import vrdu.logger as logger - - -log = logger.get_logger(__name__) - - -def remove_comments(original_tex: str) -> None: - """ - Removes comments from a TeX file. - - Args: - original_tex (str): The path to the original TeX file. - - Returns: - None - """ - with open(original_tex, "r") as file: - content = file.read() - - # Remove LaTeX comments - pattern = r"\\begin{comment}(.*?)\\end{comment}" - removed_comments = re.sub(pattern, "", content, flags=re.DOTALL) - - with open(original_tex, "w") as file: - file.write(removed_comments) - - -def clean_tex(original_tex: str) -> None: - """ - Clean the given TeX file by creating a cleaner object and running the clean method. - - Args: - original_tex (str): The path to the original TeX file. 
- - Returns: - None - """ - main_directory = os.path.dirname(original_tex) - tex = os.path.basename(original_tex) - - # Create the cleaner - cleaner = Cleaner( - input_dir=main_directory, - output_dir=main_directory, - tex=tex, - command_options=config.command_options, - verbose=False, - ) - - # Run the cleaner - cleaner.clean() - - # remove comments - remove_comments(original_tex) - - -def replace_pdf_ps_figures_with_png(original_tex: str) -> None: - """ - Replaces PDF, ps, eps figures with PNG figures in a TeX file - to support pdfminer detecting bounding box. - - Args: - original_tex (str): The path to the original TeX file. - - Returns: - None: This function does not return anything. - - Raises: - FileNotFoundError: If a PDF file specified in the TeX file is not found. - """ - - # FIXME: use more robust way, since the path to images may not exists. - main_directory = os.path.dirname(original_tex) - with open(original_tex) as f: - content = f.read() - - graphicspath_pattern = r"\\graphicspath\{\{(.+?)}" - match = re.search(graphicspath_pattern, content, re.DOTALL) - if match: - graphic_path = match.group(1) - else: - graphic_path = "" - - # Replace \psfig{...} with \includegraphics{...} - content = re.sub(r"\\psfig{([^}]*)}", r"\\includegraphics{\1}", content) - - # Replace \epsfig{...} with \includegraphics{...} - content = re.sub(r"\\epsfig{([^}]*)}", r"\\includegraphics{\1}", content) - - # Regular expression pattern to match \includegraphics - # commands with PDF files - pattern = r"\\includegraphics(\[.*?\])?\{(.*?)\}" - - # Find all matches of \includegraphics with PDF files - matches = re.findall(pattern, content) - - # Replace PDF paths with PNG paths - ext_patterns = [".eps", ".ps", ".jpg", ".jpeg", ".png", ".pdf"] - for match in matches: - image_name = match[1] - if not any(ext in image_name for ext in ext_patterns): - for ext in ext_patterns: - image_file = os.path.join(main_directory, graphic_path, image_name, ext) - if os.path.exists(image_file): - image_name = image_name + ext - break - - # detectable image type, see pdfminer.six for details - if any(ext in image_name for ext in [".jpg", ".jpeg", "png"]): - content = content.replace(match[1], image_name) - continue - - # convert eps to pdf - if any(ext in image_name for ext in [".eps", ".ps"]): - eps_image = os.path.join(main_directory, graphic_path, image_name) - if not os.path.exists(eps_image): - log.error(f"File not found: {eps_image}") - continue - pdf_image = os.path.splitext(eps_image)[0] + ".pdf" - utils.convert_eps_image_to_pdf_image(eps_image, pdf_image) - image_name = os.path.basename(pdf_image) - - # convert pdf to png - if image_name.endswith(".pdf"): - pdf_image = os.path.join(main_directory, graphic_path, image_name) - if not os.path.exists(pdf_image): - log.error(f"File not found: {pdf_image}") - continue - png_image = os.path.splitext(pdf_image)[0] + ".png" - utils.convert_pdf_figure_to_png_image(pdf_image, png_image) - image_name = os.path.splitext(image_name)[0] + ".png" - - # replace the reference in tex file - content = content.replace(match[1], image_name) - - with open(original_tex, "w") as f: - f.write(content) - - -def delete_table_of_contents(original_tex: str) -> None: - """ - Deletes the table of contents from the given original_tex file. - This includes table of contents, list of figures, list of tables, and list of algorithms. - - Parameters: - original_tex (str): The path to the original .tex file. 
- - Returns: - None - """ - with open(original_tex, "r") as file: - latex_content = file.read() - - pattern = r"\\(" + "|".join(envs.table_of_contents) + r")" - modified_content = re.sub(pattern, "", latex_content) - - with open(original_tex, "w") as file: - file.write(modified_content) - - -def run(original_tex: str) -> None: - """ - Generates a modified version of the given LaTeX document by performing the following steps: - - Step 0: Clean the LaTeX document with arxiv_cleaner package. - Step 1: Replace EPS figures with PDF to make the LaTeX document compilable with pdflatex. - Step 2: Replace PDF figures with PNG to make pdfminer work. - Step 3: Delete the table of contents from the LaTeX document. - - Args: - original_tex (str): The original LaTeX document. - - Returns: - None - """ - # Step 0: clean tex - clean_tex(original_tex) - - # Step 2: process images - replace_pdf_ps_figures_with_png(original_tex) - - # Step 3: delete table of contents - delete_table_of_contents(original_tex) diff --git a/vrdu/quality_check.py b/vrdu/quality_check.py deleted file mode 100644 index eee25f1..0000000 --- a/vrdu/quality_check.py +++ /dev/null @@ -1,134 +0,0 @@ -from typing import Dict, List -import os - -from vrdu.block import Block -from vrdu import utils -from vrdu.config import config - - -def generate_quality_report(main_directory: str) -> None: - """Generates a quality report based on the provided layout information. - - Args: - layout_info (Dict[int, List[Block]]): A dictionary where the keys are page indices - and the values are lists of blocks on each page. - - Returns: - None - """ - result_directory = os.path.join(main_directory, "output/result") - - layout_metadata_file = os.path.join(result_directory, "layout_metadata.json") - layout_metadata = utils.load_json(layout_metadata_file) - - text_info_file = os.path.join(result_directory, "texts.json") - text_info = utils.load_json(text_info_file) - - layout_info_file = os.path.join(result_directory, "layout_info.json") - layout_info_data = utils.load_json(layout_info_file) - # order_annotation_file = os.path.join(result_directory, "order_annotation.json") - # order_annotation = utils.load_json(order_annotation_file) - # layout_info_data = order_annotation["annotation"] - layout_info = { - int(key): [Block.from_dict(item) for item in values] - for key, values in layout_info_data.items() - } - - result = {} - result["num_pages"] = max(layout_info.keys()) - result["num_columns"] = layout_metadata["num_columns"] - result["category_quality"] = [] - - total_reading, total_geometry = 0, 0 - for key, value in text_info.items(): - # currently, ignore graphics - if key == config.name2category["Figure"]: - continue - - reading_count = len(value) - geometry_count = 0 - for page_index, blocks in layout_info.items(): - for block in blocks: - # only major block is counted - if ( - block.category == config.name2category[key] - and block.parent_block is None - ): - geometry_count += 1 - missing_rate = 0 if reading_count == 0 else 1 - geometry_count / reading_count - result["category_quality"].append( - { - "category": key, - "geometry_count": geometry_count, - "reading_count": len(value), - "missing_rate": missing_rate, - } - ) - - total_reading += reading_count - total_geometry += geometry_count - result["category_quality"].append( - { - "category": "Total", - "geometry_count": total_geometry, - "reading_count": total_reading, - "missing_rate": 1 - total_geometry / total_reading, - } - ) - - result["page_quality"] = compute_overlap(layout_info) - - 
report_file = os.path.join(result_directory, "quality_report.json") - utils.export_to_json(result, report_file) - - -def compute_overlap(layout_info: Dict[int, List[Block]]) -> List[Dict]: - """Computes the overlap between blocks in a layout. - - Args: - layout_info (Dict[int, List[Block]]): A dictionary where the keys are page indices - and the values are lists of blocks on each page. - - Returns: - List[Dict]: A list of dictionaries containing the overlap information for each page and - the total overlap information. - - """ - result = [] - total_area, total_overlap, total_blocks = 0, 0, 0 - for page_index in layout_info.keys(): - blocks = layout_info[page_index] - blocks.sort(key=lambda block: block.bbox.x0) - - area, overlap = 0, 0 - for i in range(len(blocks)): - area += blocks[i].bbox.area() - for j in range(i + 1, len(blocks)): - if blocks[j].bbox.x0 > blocks[i].bbox.x1: - break - overlap += blocks[i].bbox.overlap(blocks[j].bbox) - - result.append( - { - "page": page_index, - "num_blocks": len(blocks), - "area": area, - "overlap": overlap, - "ratio": 0 if area == 0 else overlap / area, - } - ) - total_area += area - total_overlap += overlap - total_blocks += len(blocks) - - result.append( - { - "page": "total", - "num_blocks": total_blocks, - "area": total_area, - "overlap": total_overlap, - "ratio": 0 if total_area == 0 else total_overlap / total_area, - } - ) - - return result diff --git a/vrdu/renderer.py b/vrdu/renderer.py deleted file mode 100644 index 73a0bf3..0000000 --- a/vrdu/renderer.py +++ /dev/null @@ -1,909 +0,0 @@ -from collections import defaultdict -import os -import shutil -from typing import List, Tuple, Union -import re - - -import vrdu.utils as utils -import vrdu.logger as logger -from vrdu.config import config, envs - -from TexSoup.TexSoup import TexSoup -import TexSoup.app.conversion as conversion - -log = logger.get_logger(__name__) - - -class Renderer: - def __init__(self) -> None: - self.texts = defaultdict(list) - - def render(self, origin_tex: str) -> None: - """Render the colored version of a LaTeX document. - - This method performs the rendering process for generating the colored version of a LaTeX document. - It includes the following steps: - 1. Create a copy of the original LaTeX file with a new name. - 2. Add color definitions and layout definitions to the copied file. - 3. Remove color definitions that may cause conflicts. - 4. Render all environments in the copied file. - 5. Iterate over semantic elements and change their enclosing color, generating corresponding LaTeX files. - 6. Export the rendered texts to a JSON file. - - Args: - origin_tex (str): The path to the original LaTeX file. 
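- - Side effects: - Writes paper_colored.tex, paper_white.tex, one - paper_<folder_prefix>_*.tex file per colored element, and - output/result/texts.json under the main directory.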
- - Returns: - None - - Examples: - >>> renderer = Renderer() - >>> renderer.render("original.tex") - """ - main_directory = os.path.dirname(origin_tex) - - # copy the original tex file - color_tex = os.path.join(main_directory, "paper_colored.tex") - shutil.copyfile(origin_tex, color_tex) - - self.add_color_definition(color_tex) - self.add_layout_definition(color_tex) - - # remove color definitions to prevent conflicts - self.remove_hyperref_color(color_tex) - self.remove_lstlisting_color(color_tex) - - self.render_all_env(color_tex) - - # change the enclosing color of semantic elements one by one and generate corresponding tex files - self.render_one_env(main_directory) - - text_file = os.path.join(main_directory, "output/result/texts.json") - utils.export_to_json(self.texts, text_file) - - def render_all_env(self, color_tex: str) -> None: - """ - Render all environments, including both simple environments and float environments. - - Args: - color_tex (str): The path to the colored LaTeX file. - - Returns: - None - """ - self.render_simple_envs(color_tex) - self.render_float_envs(color_tex) - - def render_simple_envs(self, color_tex: str) -> None: - """Renders simple environments in a LaTeX file. - - This method modifies the content of a LaTeX file by rendering various simple environments, - such as sections, lists, equations, and text. - The modifications are done in-place, directly modifying the provided file. - - Args: - color_tex (str): The path to the LaTeX file to modify. - - Returns: - None - - Raises: - EOFError: If TexSoup fails to parse the input file due to runaway environments. - AssertionError: If TexSoup fails to parse the input file due to "Command \\item invalid in math mode". - - """ - data, start, end = data_from_tex_file(color_tex) - - self.render_section(data) - self.render_list(data) - self.render_equation(data) - self.render_text(data) - # self.enclose_reference(data, color=name2color["Reference"]) - - # Write the modified data back to the TeX file - tex_file_from_data(data, color_tex, start=start, end=end) - - def render_float_envs(self, tex_file: str) -> None: - """Renders float environments in a LaTeX file. - - This method applies rendering to various float environments in the LaTeX file - by calling specific rendering methods for each type of environment. - - Args: - tex_file (str): The path to the LaTeX file to modify. - - Returns: - None - """ - - # Step 1: Render algorithm environments - self.render_algorithm(tex_file) - - # Step 2: Render tabular environments - self.render_tabular(tex_file) - - # Step 3: Render code environments - self.render_code(tex_file) - - # Step 4: Render footnotes - self.render_footnote(tex_file) - - # Step 5: Extract graphics paths - self.extract_graphics(tex_file) - - # Step 6: Render captions - self.render_caption(tex_file) - - # the following two envs are placed here because they also use string regexes to render - # Step 7: Render titles - self.render_title(tex_file) - - # Step 8: Render abstracts - self.render_abstract(tex_file) - - def render_section(self, data: List[Union[dict, str]]) -> None: - """Render sections in the given data with a configured color. - This function modifies the data in-place. - - Args: - data (List[Union[dict, str]]): The data to be enclosed.
- - Returns: - None - """ - for item in data: - if not isinstance(item, dict): - continue - - env = find_env(item, envs.section_envs) - if env is None: - continue - - self.texts["Title"].append(item[env]) - item[env] = utils.colorize(item[env], "Title") - - def render_list(self, data: List[Union[dict, str]]) -> None: - """Render equations in the given data with a configured color. - This function modifies the data in-place. - - Args: - data (List[Union[dict, str]]): The list of items to be processed. - - Returns: - None - """ - for item in data: - if not isinstance(item, dict): - continue - - env = find_env(item, envs.list_envs) - if env is None: - for value in item.values(): - if not isinstance(value, list): - continue - self.render_list(value[1]) - continue - - self.texts["List"].append(item[env]) - item[env] = utils.colorize(item[env], "List") - - def render_equation(self, data: List[Union[dict, str]]) -> None: - """Render equations in the given data with a configured color. - - Args: - - data (List[Union[dict, str]]): The data containing equations to enclose. - - Returns: - None - """ - for item in data: - if not isinstance(item, dict): - continue - - env = find_env(item, envs.math_envs) - - if env is None: - for value in item.values(): - if not isinstance(value, list): - continue - self.render_equation(value[1]) - continue - - self.texts["Equation"].append(item[env]) - item[env] = utils.colorize(item[env], "Equation") - - def render_text(self, data: List[Union[dict, str]]) -> None: - """Render texts and text-eqs in the given data with a configured color. - This function modifies the data in-place. - - Args: - data (List[Union[dict, str]]): The list of items to be processed. - - Returns: - None - """ - for index, item in enumerate(data): - if not isinstance(item, str): - if not isinstance(item, dict): - continue - for key, value in item.items(): - if key.lower() not in envs.text_envs: - continue - if not isinstance(value, list): - continue - self.render_text(value[1]) - continue - - if not item or item == "\n" or item == "\n\n" or item.isspace(): - continue - - if is_text_eq(item): - data[index] = utils.colorize(item, "Text-EQ") - self.texts["Text-EQ"].append(item) - else: - data[index] = utils.colorize(item, "Text") - self.texts["Text"].append(item) - - # format - if item[0] == "\n": - data[index] = "\n" + data[index] - if item[-1] == "\n": - data[index] += "\n" - - def add_color_definition(self, color_tex: str) -> None: - """Adds color definitions to a LaTeX file. - - Args: - color_tex (str): The path to the LaTeX file to modify. - - Raises: - ValueError: If the beginning of the document is not found. - - Returns: - None - """ - with open(color_tex, "r") as f: - content = f.read() - - definitions = ["\\usepackage{xcolor}"] - for name, rgb_color in config.name2rgbcolor.items(): - color_name = config.name2color[name] - r, g, b = rgb_color - definition = f"\\definecolor{{{color_name}}}{{RGB}}{{{r}, {g}, {b}}}" - definitions.append(definition) - - color_definitions = "\n" + "\n".join(definitions) + "\n" - - # Find location to insert package - preamble = re.search(r"\\begin{document}", content) - if not preamble: - raise ValueError("begin of document not found") - preamble_loc = preamble.start() - - # Insert package line - content = content[:preamble_loc] + color_definitions + content[preamble_loc:] - - # Write updated content - with open(color_tex, "w") as f: - f.write(content) - - def add_layout_definition(self, color_tex: str) -> None: - """Adds layout definitions to a LaTeX file. 
-
-        Args:
-            color_tex (str): The path to the LaTeX file to modify.
-
-        Raises:
-            ValueError: If the end of the document is not found.
-
-        Returns:
-            None
-
-        Reference:
-            https://www.overleaf.com/learn/latex/Page_size_and_margins
-        """
-        with open(color_tex, "r") as f:
-            content = f.read()
-
-        keys = config.layout_keys
-
-        definitions = ["\\message{[vrdu_data_process: Info]}"]
-        for key in keys:
-            definition = f"\\message{{[vrdu_data_process: The {key} is: \\the\\{key}]}}"
-            definitions.append(definition)
-
-        layout_definitions = "\n" + "\n".join(definitions) + "\n"
-
-        pattern = r"\\end{document}"
-        match = re.search(pattern, content)
-        if not match:
-            raise ValueError("\\end{document} not found")
-
-        insert_loc = match.start()
-
-        # Insert the layout messages right before \end{document}
-        content = content[:insert_loc] + layout_definitions + content[insert_loc:]
-
-        # Write updated content
-        with open(color_tex, "w") as f:
-            f.write(content)
-
-    def remove_hyperref_color(self, color_tex: str) -> None:
-        """Removes hyperref color settings from a LaTeX file.
-
-        Args:
-            color_tex (str): The path to the LaTeX file to modify.
-
-        Raises:
-            ValueError: If the beginning of the document is not found.
-
-        Returns:
-            None
-
-        Reference:
-            https://www.overleaf.com/learn/latex/Hyperlinks
-        """
-        # Read the content of the input file
-        with open(color_tex, "r") as file:
-            content = file.read()
-
-        # Match \usepackage{hyperref}, with or without package options
-        pattern = r"\\usepackage(?:\[[^\]]*\])?{hyperref}"
-
-        preamble = re.search(r"\\begin{document}", content)
-        if not preamble:
-            raise ValueError("\\begin{document} not found")
-        preamble_loc = preamble.start()
-
-        # disable the link colors used by hyperref
-        hyper_setup = "\\hypersetup{colorlinks=false}\n"
-        if re.search(pattern, content[:preamble_loc]):
-            content = content[:preamble_loc] + hyper_setup + content[preamble_loc:]
-
-        # Write the modified content back to the input file
-        with open(color_tex, "w") as file:
-            file.write(content)
-
-    def remove_lstlisting_color(self, color_tex: str) -> None:
-        """Removes \\lstset settings from a LaTeX file, since they may define
-        colors that conflict with the rendering colors.
-
-        Args:
-            color_tex (str): The path to the LaTeX file.
-
-        Returns:
-            None
-        """
-        # Read the content of the input file
-        with open(color_tex, "r") as file:
-            content = file.read()
-
-        # delete the \lstset configuration
-        pattern = r"\\lstset\{.*?\}"
-        modified_content = re.sub(pattern, "", content)
-
-        # Write the modified content to the output file
-        with open(color_tex, "w") as file:
-            file.write(modified_content)
-
-    def modify_color_definitions(self, input_file: str, output_file: str) -> None:
-        """Replace the pre-defined color definitions in the input file with pure
-        white and write the modified content to the output file.
-
-        Args:
-            input_file (str): The path to the input file.
-            output_file (str): The path to the output file.
-
-        Returns:
-            None
-        """
-        with open(input_file, "r") as file:
-            content = file.read()
-
-        # Define the pattern to match the color definitions
-        for name in config.name2rgbcolor.keys():
-            color_name = config.name2color[name]
-            pattern = r"\\definecolor{" + color_name + r"}{RGB}{(\d+), (\d+), (\d+)}"
-
-            # Replace the color definitions with pure white
-            content = re.sub(
-                pattern,
-                r"\\definecolor{" + color_name + r"}{RGB}{255, 255, 255}",
-                content,
-            )
-
-        with open(output_file, "w") as file:
-            file.write(content)
-
-    def get_env_orders(self, tex_file: str) -> List[str]:
-        """Returns the configured environment color names in the order they
-        appear in the given `tex_file`.
-
-        Args:
-            tex_file (str): The path to the .tex file.
-
-        Returns:
-            List[str]: The environment color names in order of appearance.
-        """
-        with open(tex_file) as f:
-            contents = f.read()
-        colors = list(config.name2color.values())
-        matches = []
-
-        pattern = "|".join(rf"\b{re.escape(term)}\b" for term in colors)
-        for m in re.finditer(pattern, contents):
-            matches.append(m.group(0))
-
-        # drop the first len(colors) matches: they are the \definecolor definitions
-        return matches[len(colors) :]
-
-    def render_one_env(self, main_directory: str) -> None:
-        """Render the environments one at a time: starting from the all-white copy,
-        set the rendering color of a single environment occurrence to black and
-        write a separate tex file for each occurrence.
-
-        Args:
-            main_directory (str): The main directory.
-
-        Returns:
-            None: This function does not return anything.
-        """
-        color_tex_file = os.path.join(main_directory, "paper_colored.tex")
-        white_tex_file = os.path.join(main_directory, "paper_white.tex")
-        self.modify_color_definitions(color_tex_file, white_tex_file)
-        ordered_env_colors = self.get_env_orders(white_tex_file)
-        suffix = "_color"
-        index_map = defaultdict(int)
-
-        with open(white_tex_file, "r") as f:
-            content = f.read()
-
-        for index, env_color in enumerate(ordered_env_colors):
-            env = env_color[: -len(suffix)]
-            # occurrence 1 of each color name is its \definecolor line, so the
-            # n-th use (0-based) is occurrence n + 2 for the 1-based replace_nth
-            new_content = replace_nth(
-                content, "{" + env_color + "}", r"{black}", index_map[env] + 2
-            )
-
-            output_file = os.path.join(
-                main_directory,
-                f"paper_{config.folder_prefix}_{str(index).zfill(5)}_{env}_{str(index_map[env]).zfill(5)}.tex",
-            )
-            index_map[env] += 1
-            with open(output_file, "w") as f:
-                f.write(new_content)
-
-    def render_caption(self, tex_file: str) -> None:
-        """Renders captions in a LaTeX file.
-
-        This method modifies the content of a LaTeX file by rendering captions with a specified color.
-        It searches for caption commands in the file and applies colorization to their contents.
-
-        Args:
-            tex_file (str): The path to the LaTeX file to modify.
-
-        Returns:
-            None
-        """
-        with open(tex_file) as f:
-            content = f.read()
-
-        pattern = r"\\caption(?:\[[^\]]*\])?(?:\{[^}]*\})"
-        result = self._render_simple_envs(content, pattern, "Caption")
-
-        with open(tex_file, "w") as f:
-            f.write(result)
-
-    def render_title(self, tex_file: str) -> None:
-        """Renders the title in a LaTeX file.
-
-        This method modifies the content of a LaTeX file by rendering the title with a specified color.
-        It searches for the title command in the file and applies colorization to its content.
-
-        Args:
-            tex_file (str): The path to the LaTeX file to modify.
-
-        Returns:
-            None
-        """
-        with open(tex_file) as f:
-            content = f.read()
-
-        pattern = r"\\title(?:\{[^}]*\})"
-        result = self._render_simple_envs(content, pattern, "PaperTitle")
-
-        with open(tex_file, "w") as f:
-            f.write(result)
-
-    def render_footnote(self, tex_file: str) -> None:
-        """Renders footnotes in a LaTeX file.
-
-        This method modifies the content of a LaTeX file by rendering footnotes with a specified color.
-        It searches for various footnote environments and applies colorization to their contents.
-
-        Args:
-            tex_file (str): The path to the LaTeX file to modify.
-
-        Returns:
-            None
-        """
-        # \footnote{...}, \footnote[]{...}, \footnotetext{...}, \footnotetext[]{...}, \tablefootnote{...}
-        with open(tex_file) as f:
-            content = f.read()
-
-        for env_name in envs.footnote_envs:
-            pattern = r"\\" + env_name + r"(?:\[[^\]]*\])?(?:\{[^}]*\})"
-
-            content = self._render_simple_envs(content, pattern, "Footnote")
-
-        with open(tex_file, "w") as f:
-            f.write(content)
-
-    def _render_simple_envs(self, content: str, pattern: str, category: str) -> str:
-        """Renders specific environments in the content using replacement.
-
-        This method searches for occurrences of a pattern in the content and replaces them with colored versions.
-        The replacement is based on the specified category for colorization.
-
-        Args:
-            content (str): The content of the LaTeX file.
-            pattern (str): The regular expression pattern to match.
-            category (str): The category of the environment for colorization.
-
-        Returns:
-            str: The modified content with the rendered environments.
-        """
-        matches = re.finditer(pattern, content)
-        result = ""
-        index = 0
-        for match in matches:
-            start = match.start()
-            end = match.end()
-
-            # the pattern only matches up to the first closing brace; extend the
-            # match until the braces are balanced, since arguments may nest groups
-            num_left_brackets = content[start:end].count("{")
-            num_right_brackets = content[start:end].count("}")
-            while num_right_brackets < num_left_brackets:
-                if content[end] == "{":
-                    num_left_brackets += 1
-                elif content[end] == "}":
-                    num_right_brackets += 1
-                end += 1
-
-            category_content = content[start:end]
-
-            self.texts[category].append(category_content)
-            colored_content = utils.colorize(category_content, category)
-            result += content[index:start]
-            result += colored_content
-            index = end
-
-        result += content[index:]
-        return result
-
-    def render_abstract(self, tex_file: str) -> None:
-        """Renders the abstract section in a LaTeX file.
-
-        This method modifies the content of a LaTeX file by rendering the abstract section with a specified color.
-        It searches for the abstract section in the file and applies colorization to its contents.
-
-        Args:
-            tex_file (str): The path to the LaTeX file to modify.
-
-        Returns:
-            None
-
-        Raises:
-            ValueError: If more than one abstract section is found.
-        """
-        with open(tex_file) as f:
-            content = f.read()
-
-        pattern = r"\\begin{abstract}.*?\\end{abstract}"
-        indexes = [
-            (m.start(), m.end()) for m in re.finditer(pattern, content, re.DOTALL)
-        ]
-
-        if len(indexes) > 1:
-            raise ValueError("more than one abstract found")
-
-        if not indexes:
-            return
-
-        start, end = indexes[0]
-        abstract = content[start:end]
-        self.texts["Abstract"].append(abstract)
-        colored_abstract = utils.colorize(abstract, "Abstract")
-        result = content[:start] + colored_abstract + content[end:]
-
-        with open(tex_file, "w") as f:
-            f.write(result)
-
-    def render_tabular(self, tex_file: str) -> None:
-        """Renders tabular environments in a LaTeX file.
-
-        This method modifies the content of a LaTeX file by rendering tabular environments with a specified color.
-        It searches for tabular environments in the file and applies colorization to their contents.
-
-        Args:
-            tex_file (str): The path to the LaTeX file to modify.
-
-        Returns:
-            None
-        """
-        with open(tex_file) as f:
-            content = f.read()
-        pattern = r"\\begin{(tabular[*xy]?)}.*?\\end{\1}"
-        result = self._render_float_envs(content, pattern, "Table")
-
-        with open(tex_file, "w") as f:
-            f.write(result)
-
-    def render_algorithm(self, tex_file: str) -> None:
-        """Renders algorithm environments in a LaTeX file.
-
-        This method modifies the content of a LaTeX file by rendering algorithm environments with a specified color.
-        It searches for algorithm environments in the file and applies colorization to their contents.
-
-        Args:
-            tex_file (str): The path to the LaTeX file to modify.
-
-        Returns:
-            None
-        """
-        with open(tex_file) as f:
-            content = f.read()
-
-        pattern = r"\\begin{algorithm[*]?}(.*?)\\end{algorithm[*]?}"
-        result = self._render_float_envs(content, pattern, "Algorithm")
-
-        with open(tex_file, "w") as f:
-            f.write(result)
-
-    def render_code(self, tex_file: str) -> None:
-        """Renders code environments in a LaTeX file.
-
-        This method modifies the content of a LaTeX file by rendering code environments with a specified color.
-        It searches for code environments and `\\lstinputlisting` commands in the file and applies colorization to their contents.
-
-        Args:
-            tex_file (str): The path to the LaTeX file to modify.
-
-        Returns:
-            None
-
-        Notes:
-            There are two types of code environments:
-            - pattern 1: a code environment (verbatim, lstlisting, or program)
-            - pattern 2: a \\lstinputlisting command that inputs a file
-
-        Reference:
-            https://en.wikibooks.org/wiki/LaTeX/Source_Code_Listings
-        """
-        with open(tex_file, "r") as file:
-            content = file.read()
-
-        pattern = (
-            r"\\begin{(verbatim|lstlisting|program)[*]?}(.*?)\\end{\1[*]?}"
-            + "|"
-            + r"\\lstinputlisting\[[^\]]*\]{[^\}]*}"
-        )
-        result = self._render_float_envs(content, pattern, "Code")
-
-        with open(tex_file, "w") as f:
-            f.write(result)
-
-    def _render_float_envs(self, content: str, pattern: str, category: str) -> str:
-        """Renders specific float environments in the content.
-
-        This method searches for occurrences of a pattern in the content and replaces them with colored versions.
-        The replacement is based on the specified category for colorization.
-
-        Args:
-            content (str): The content of the LaTeX file.
-            pattern (str): The regular expression pattern to match.
-            category (str): The category of the environment for colorization.
-
-        Returns:
-            str: The modified content with the rendered float environments.
-        """
-        indexes = [
-            (m.start(), m.end()) for m in re.finditer(pattern, content, re.DOTALL)
-        ]
-
-        if not indexes:
-            log.debug(f"no {category} found")
-            return content
-
-        result = content[: indexes[0][0]]
-        for i, _ in enumerate(indexes):
-            if i > 0:
-                result += content[indexes[i - 1][1] : indexes[i][0]]
-            float_env = content[indexes[i][0] : indexes[i][1]]
-
-            # skip tabular environments that actually wrap figures, keeping
-            # them in the output uncolored
-            if category == "Table" and float_env.find("\\includegraphics") != -1:
-                result += float_env
-                continue
-
-            # TODO: filter tables nested inside equation environments
-
-            self.texts[category].append(float_env)
-            colored_float_env = utils.colorize(float_env, category)
-            result += colored_float_env
-
-        result += content[indexes[-1][1] :]
-        return result
-
-    def extract_graphics(self, tex_file: str) -> None:
-        """Extracts graphics paths from a LaTeX file.
-
-        This method reads a LaTeX file and extracts the paths of graphics included using the `\\includegraphics` command.
-        The extracted graphics paths are stored in the `texts["Figure"]` list.
-
-        Args:
-            tex_file (str): The path to the LaTeX file to extract graphics from.
- - Returns: - None - """ - with open(tex_file, "r") as file: - content = file.read() - - pattern = r"\\includegraphics(?:\[(.*?)\])?{(.*?)}" - matches = re.findall(pattern, content) - for match in matches: - graphic = "\\includegraphics" - if match[0]: - graphic += f"[{match[0]}]" - graphic += f"{{{match[1]}}}" - self.texts["Figure"].append(graphic) - - -def extract_main_content(tex_file: str) -> Tuple[str, int, int]: - """Extracts the main content from a LaTeX file. - - Args: - tex_file (str): The path to the LaTeX file. - - Returns: - Tuple[str, int, int]: A tuple containing the main content of the LaTeX file, - the start position of the main content in the file, and the end position - of the main content in the file. - """ - with open(tex_file) as f: - content = f.read() - - start = content.find("\\begin{document}") - end = content.find("\\end{document}") - - if start == -1 or end == -1: - raise ValueError("Document tags not found") - - start += len("\\begin{document}") - main_content = content[start:end] - - return main_content, start, end - - -def data_from_tex_file(tex_file: str) -> Tuple[List[Union[dict, str]], int, int]: - """Extracts data from a Tex file using TexSoup. - - Args: - tex_file (str): The path to the Tex file. - - Returns: - Tuple[List, int, int]: A tuple containing the extracted data, the start - position of the extracted content, and the end position of the extracted - content. - """ - main_content, start, end = extract_main_content(tex_file) - tex_tree = TexSoup(main_content).expr.all - data = conversion.to_list(tex_tree) - - return data, start, end - - -def tex_file_from_data( - data: List[Union[dict, str]], - tex_file: str, - start: int = 0, - end: int = -1, -) -> None: - """Generate a TeX file from the given TexSoup data. - - Args: - data (List[Union[dict, str]]): The data to be converted into LaTeX. - tex_file (str): The path of the TeX file to be generated. - start (int, optional): The starting position in the TeX file to replace content. Defaults to 0. - end (int, optional): The ending position in the TeX file to replace content. Defaults to -1. - - Returns: - None: This function does not return any value. - """ - with open(tex_file, "r") as f: - content = f.read() - - # convert the data into latex - rendered_tex = conversion.to_latex(data) - - content = content[:start] + rendered_tex + content[end:] - - with open(tex_file, "w") as f: - f.write(content) - - -def replace_nth(string: str, old: str, new: str, n: int) -> str: - """ - Replace the n-th occurrence of a substring in a given string with a new substring. - - Args: - string (str): The original string to search and perform the replacement on. - old (str): The substring to be replaced. - new (str): The substring to replace the n-th occurrence of `old` in `string`. - n (int): The occurrence number of `old` to be replaced (1-based index). - - Returns: - str: The modified string with the n-th occurrence of `old` replaced by `new`. If the - occurrence is not found, the original string is returned. - - Example: - >>> replace_nth("Hello, hello, hello!", 'hello', 'hi', 2) - 'Hello, hello, hi!' 
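-
-        If the requested occurrence does not exist, the string is returned
-        unchanged:
-
-        >>> replace_nth("Hello, hello, hello!", 'hello', 'hi', 5)
-        'Hello, hello, hello!'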
-    """
-    index_of_occurrence = string.find(old)
-    # occurrence is 1 if a first match was found, 0 otherwise
-    occurrence = int(index_of_occurrence != -1)
-
-    while index_of_occurrence != -1 and occurrence != n:
-        index_of_occurrence = string.find(old, index_of_occurrence + 1)
-        occurrence += 1
-
-    if occurrence == n:
-        return (
-            string[:index_of_occurrence]
-            + new
-            + string[index_of_occurrence + len(old) :]
-        )
-
-    return string
-
-
-def find_env(wrapped_env: dict, query: List[str]) -> Union[str, None]:
-    """
-    Finds and returns the first environment name from the given query list
-    that exists as a key in the wrapped_env dictionary.
-
-    Args:
-        wrapped_env (dict): A dictionary whose keys are environment names.
-        query (list): A list of environment names to search for.
-
-    Returns:
-        Union[str, None]: The first environment name from the query list that
-        exists in the wrapped_env dictionary, or None if no match is found.
-    """
-    for env in query:
-        if env in wrapped_env:
-            return env
-
-    return None
-
-
-def is_text_eq(text: str) -> bool:
-    """Check if the given text contains any mathematical expressions.
-
-    Args:
-        text (str): The text to be checked for mathematical expressions.
-
-    Returns:
-        bool: True if the text contains mathematical expressions, False otherwise.
-
-    Note:
-        This function uses a regular expression pattern to match mathematical expressions.
-
-    Reference:
-        https://www.overleaf.com/learn/latex/Mathematical_expressions
-    """
-    pattern = r"(\\\(.*?\\\))|(\$.*?\$)|(\\begin\{math\}.*?\\end\{math\})"
-
-    # skip matches that merely contain escaped dollar signs rather than real math
-    for match in re.finditer(pattern, text):
-        if not re.search(r"\\\$", match.group(0)):
-            return True
-
-    return False
diff --git a/vrdu/utils.py b/vrdu/utils.py
deleted file mode 100755
index 22eda88..0000000
--- a/vrdu/utils.py
+++ /dev/null
@@ -1,401 +0,0 @@
-import csv
-import glob
-import os
-import re
-import subprocess
-import json
-from typing import Any, Dict, List
-import uuid
-
-
-from pdf2image import pdf2image
-from pdf2image import generators
-
-from vrdu.block import Block
-from vrdu.config import config
-
-
-def extract_all_tex_files(path) -> List[str]:
-    """
-    Given a path, this function extracts all the .tex files within the
-    specified directory and its subdirectories.
-
-    Args:
-        path (str): The path to the directory where the .tex files are located.
-
-    Returns:
-        List[str]: A list of paths to the .tex files found.
-    """
-    tex_files = []
-
-    for root, _, files in os.walk(path):
-        tex_files.extend(
-            [os.path.join(root, file) for file in files if file.endswith(".tex")]
-        )
-    return tex_files
-
-
-def export_to_json(data, file_path) -> None:
-    """
-    Write the contents of a dictionary to a JSON file.
-
-    Parameters:
-    data (dict): The dictionary to be written to the file.
-    file_path (str): The path to the JSON file.
-    """
-    with open(file_path, "w") as json_file:
-        json.dump(data, json_file, indent=4)
-
-
-def load_json(file_path) -> Any:
-    """
-    Load a JSON file into a dictionary.
-
-    Parameters:
-    file_path (str): The path to the JSON file.
-
-    Returns:
-    dict: The loaded JSON data as a dictionary.
-    """
-    with open(file_path, "r") as json_file:
-        data = json.load(json_file)
-    return data
-
-
-def compile_check(source_code: str) -> bool:
-    """
-    Check whether the source code can be compiled on its own;
-    used to detect whether the source code relies on undefined macros.
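-
-    Args:
-        source_code (str): The LaTeX fragment to test-compile.
-
-    Returns:
-        bool: True if pdflatex compiles the fragment without errors, False otherwise.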
-    """
-    prefix = r"""
-    \documentclass{article}
-    \usepackage{amsmath}
-    \usepackage{amssymb}
-    \usepackage{amsfonts, bm}
-    \usepackage{amsthm}
-    \usepackage{array}
-    \usepackage{tabularx}
-    \usepackage{multirow}
-    \usepackage{booktabs}
-    \begin{document}
-    """
-
-    suffix = r"""
-    \end{document}
-    """
-    temp_filename = str(uuid.uuid4())
-    content = prefix + source_code + suffix
-    with open(f"{temp_filename}.tex", "w") as f:
-        f.write(content)
-
-    result = True
-    try:
-        subprocess.run(
-            ["pdflatex", "-halt-on-error", f"{temp_filename}.tex"], check=True
-        )
-    except subprocess.CalledProcessError:
-        result = False
-    finally:
-        # remove the temporary files produced by pdflatex
-        files = glob.glob(f"{os.getcwd()}/{temp_filename}.*")
-        for file in files:
-            os.remove(file)
-
-    return result
-
-
-def get_main_content(data):
-    """
-    Extract the main content of a document.
-
-    Parameters:
-    - data (list): A list of dictionaries representing the document.
-
-    Returns:
-    - main_content (str): The main content of the document.
-    - main_content_index (int): The index of the "document" item in `data`.
-
-    Raises:
-    - Exception: If the document is not found in the data.
-    """
-    main_content = None
-    main_content_index = None
-    for index, item in enumerate(data):
-        if isinstance(item, dict) and "document" in item:
-            main_content = item["document"][1]
-            main_content_index = index
-            break
-
-    if main_content is None:
-        raise Exception("document not found")
-
-    return main_content, main_content_index
-
-
-def compile_latex(file: str):
-    """
-    Compile a LaTeX file with pdflatex.
-
-    Parameters:
-    file (str): The path to the LaTeX file to be compiled.
-
-    Returns:
-    None
-    """
-    file_name = os.path.basename(file)
-
-    # run pdflatex twice so that cross-references are resolved
-    subprocess.run(
-        ["pdflatex", "-interaction=nonstopmode", file_name],
-        timeout=1000,
-        stdout=subprocess.DEVNULL,
-    )
-
-    subprocess.run(
-        ["pdflatex", "-interaction=nonstopmode", file_name],
-        timeout=1000,
-        stdout=subprocess.DEVNULL,
-    )
-
-    # additionally generate a SyncTeX file for the colored paper
-    if file_name == "paper_colored.tex":
-        subprocess.run(
-            ["pdflatex", "-interaction=nonstopmode", "-synctex=1", file_name],
-            timeout=1000,
-            stdout=subprocess.DEVNULL,
-        )
-
-
-def pdf2jpg(pdf_path: str, output_directory: str) -> None:
-    """
-    Convert a PDF file into a series of jpg images.
-
-    Parameters:
-    pdf_path (str): The path of the PDF file to be converted.
-    output_directory (str): The directory where the converted images will be saved.
-
-    Returns:
-    None
-
-    Reference:
-    https://pypi.org/project/pdf2image/
-    """
-    os.makedirs(output_directory, exist_ok=True)
-    # the output images have names of the format thread-000x-yz.jpg,
-    # where x is the thread index and yz is the index of the pdf page, starting from 1
-    pdf2image.convert_from_path(
-        pdf_path,
-        fmt="jpg",
-        output_folder=output_directory,
-        output_file=generators.counter_generator(prefix="thread-", suffix="-page"),
-    )
-
-
-def convert_pdf_figure_to_png_image(pdf_image: str, png_image: str, dpi: int = 72):
-    """
-    Convert a PDF to a PNG image.
-
-    Parameters:
-    pdf_image (str): The filepath of the PDF image to convert.
-    png_image (str): The filepath where the PNG image will be saved.
-    dpi (int): The resolution for the conversion (default is 72).
-
-    Returns:
-    None
-    """
-    # crop the pdf image to its bounding box
-    subprocess.run(
-        ["pdfcrop", pdf_image, pdf_image],
-        stdout=subprocess.DEVNULL,
-    )
-    # convert the pdf image into png
-    images = pdf2image.convert_from_path(pdf_image, dpi=dpi)
-    images[0].save(png_image)
-
-
-def convert_eps_image_to_pdf_image(eps_image_path: str, pdf_image_path: str):
-    """
-    Converts an EPS image to a PDF image.
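-
-    Requires the `epspdf` command-line tool to be available on the PATH.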
-
-    Args:
-        eps_image_path (str): The file path of the EPS image to convert.
-        pdf_image_path (str): The file path where the PDF image will be saved.
-    """
-    subprocess.run(["epspdf", eps_image_path, pdf_image_path])
-
-
-def extract_macro_definitions(tex_file) -> List[str]:
-    """
-    Extracts macro definitions from a given tex file.
-
-    Args:
-        tex_file (str): The path to the tex file.
-
-    Returns:
-        List[str]: A list of macro definitions extracted from the tex file.
-    """
-    macro_patterns = [
-        r"\\newcommand{[^}]+}",
-        r"\\renewcommand{[^}]+}",
-        r"\\newenvironment{[^}]+}",
-        r"\\renewenvironment{[^}]+}",
-    ]
-
-    macros = []
-    with open(tex_file, "r") as file:
-        text_lines = file.readlines()
-        macros = [
-            line.strip()
-            for line in text_lines
-            if any(re.findall(pattern, line) for pattern in macro_patterns)
-        ]
-
-    return macros
-
-
-def export_to_coco(
-    layout_info: Dict[int, List[Block]],
-    image_infos: Dict[int, Dict[str, Any]],
-    filename: str,
-) -> None:
-    """
-    Export the given layout information and image information to a COCO format JSON file.
-
-    Args:
-        layout_info (Dict[int, List[Block]]): A dictionary mapping page indices to lists of Block objects.
-        image_infos (Dict[int, Dict[str, Any]]): A dictionary mapping page indices to dictionaries containing image information.
-        filename (str): The name of the output JSON file.
-
-    Returns:
-        None
-
-    Reference:
-        https://cocodataset.org/#format-data
-    """
-    category_info = [
-        {
-            "id": index,
-            "name": category,
-            "supercategory": supercategory,
-        }
-        for index, category, supercategory in config.config["category_name"]
-    ]
-    result = {
-        "info": config.config["coco_info"],
-        "licenses": config.config["coco_licenses"],
-        "images": [],
-        "annotations": [],
-        "categories": category_info,
-    }
-
-    result["images"] = [
-        {
-            "id": page_index,
-            "width": image_infos[page_index]["width"],
-            "height": image_infos[page_index]["height"],
-            "file_name": image_infos[page_index]["file_name"],
-            **config.config["coco_image_info"],
-        }
-        for page_index in layout_info.keys()
-    ]
-
-    annotation_id = 0
-    for page_index, page_elements in layout_info.items():
-        for element in page_elements:
-            width, height = element.width, element.height
-            annotation = {
-                # annotation ids must be unique across the whole dataset,
-                # so use a running counter instead of the per-page index
-                "id": annotation_id,
-                "image_id": page_index,
-                "category_id": element.category,
-                "segmentation": [],
-                "bbox": [element.bbox[0], element.bbox[1], width, height],
-                "area": width * height,
-                "iscrowd": 0,
-            }
-            result["annotations"].append(annotation)
-            annotation_id += 1
-
-    export_to_json(result, filename)
-
-
-def extract_title_name(title) -> str:
-    """
-    Extracts the command name from a title string.
-
-    Args:
-        title (str): The title string to extract the name from.
-
-    Returns:
-        str: The extracted title environment name, or an empty string if no match.
-
-    Example:
-        >>> extract_title_name("\\section{Name}")
-        'section'
-        >>> extract_title_name("\\subsection*{AnotherName}")
-        'subsection'
-        >>> extract_title_name("No match")
-        ''
-    """
-    match = re.search(r"\\(\w+)(\*?){(.*)}", title)
-    if match:
-        return match.group(1)
-
-    return ""
-
-
-def colorize(text: str, category_name: str) -> str:
-    """
-    Given a piece of text and a category name, colorizes the text based on the category.
-
-    Args:
-        text (str): The text to be colorized.
-        category_name (str): The category name to determine the colorization.
-
-    Returns:
-        str: The colorized text based on the category.
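-
-    Example:
-        Assuming config.name2color["PaperTitle"] is "PaperTitle_color" (the
-        "_color" suffix is implied by render_one_env, not shown in config here):
-
-        colorize("\\title{A Study}", "PaperTitle")
-        # -> "\\title{{\\textcolor{PaperTitle_color}{A Study}}}"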
-    """
-    color = config.name2color[category_name]
-    if category_name in ("Caption", "Footnote"):
-        # inject \color right after the opening brace of the mandatory argument
-        index = text.find("{")
-        return text[: index + 1] + "{\\color{" + color + "}" + text[index + 1 :] + "}"
-    if category_name in ("Table", "Title", "List", "Code"):
-        return "{\\color{" + color + "}" + text + "}"
-    if category_name == "Algorithm":
-        # skip the optional placement argument, like \\begin{algorithm}[hbt!]
-        prefix = text.find("\\", len("\\begin{algorithm}"))
-        suffix = text.find("\\end{algorithm}")
-        return (
-            text[:prefix]
-            + "{\\color{"
-            + color
-            + "}"
-            + text[prefix:suffix]
-            + "}"
-            + text[suffix:]
-        )
-    if category_name in ("Text", "Text-EQ"):
-        return "{\\textcolor{" + color + "}{" + text + "}}"
-    if category_name == "PaperTitle":
-        index = text.find("{")
-        return (
-            text[: index + 1]
-            + "{\\textcolor{"
-            + color
-            + "}{"
-            + text[index + 1 :]
-            + "}}"
-        )
-    if category_name == "Equation":
-        return "{\\color{" + color + "}{" + text + "}}"
-    if category_name == "Abstract":
-        prefix = len("\\begin{abstract}")
-        return "{" + text[:prefix] + "\\color{" + color + "}" + text[prefix:] + "}"
-
-    raise NotImplementedError(f"Invalid category name: {category_name}")
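
For context, a minimal sketch of how these deleted helpers were typically wired together. It is reconstructed from the code above, not from the repository: the `vrdu.render` module path for `LaTeXRenderer`, its no-argument constructor, and the `paper/` directory are assumptions.

    import os

    from vrdu import utils
    from vrdu.render import LaTeXRenderer  # assumed module path for the renderer class

    # colorize every semantic environment and emit one tex variant per
    # environment; render() also writes output/result/texts.json
    renderer = LaTeXRenderer()
    renderer.render("paper/main.tex")  # hypothetical input path

    # compile_latex() calls pdflatex on the file's basename, so run it from
    # inside the paper directory
    os.chdir("paper")
    utils.compile_latex("paper_colored.tex")

    # rasterize the compiled PDF into per-page jpg images
    utils.pdf2jpg("paper_colored.pdf", "output/images")  # hypothetical output directory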