Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit. Hold Shift + click to select a range.
b88d61a
feat(discpline_info.csv): remove this file
MaoSong2022 Jun 12, 2024
d51d638
feat(block.py): add type hint
MaoSong2022 Jun 12, 2024
bd2d2e7
refactor(block.py): make block field required instead of optional
MaoSong2022 Jun 12, 2024
bf0dece
refactor(batch_process.py, utils.py): move extract_tex_files to batch…
MaoSong2022 Jun 12, 2024
903f291
refactor(utils.py): optimize load and export logic
MaoSong2022 Jun 12, 2024
912d10d
refactor(utils.py, tests/): remove unused functions
MaoSong2022 Jun 12, 2024
4d1363d
refactor(utils.py, layout_annotation.py): use consistent argument names
MaoSong2022 Jun 12, 2024
a7e5c95
fix(block.py): return type error
MaoSong2022 Jun 12, 2024
4cb0b8f
style(block.py): format f-string
MaoSong2022 Jun 12, 2024
e8be463
style(utils.py): remove unused packages
MaoSong2022 Jun 12, 2024
3d84de9
docs(utils.py): remove wrong docstring
MaoSong2022 Jun 12, 2024
ac67642
feat(block.py, layout_annotation.py): use consistent argument type
MaoSong2022 Jun 12, 2024
64facb6
fix(layout_annotation.py): typo
MaoSong2022 Jun 12, 2024
61255a3
fix(order_annotation.py, renderer.py): typo
MaoSong2022 Jun 12, 2024
b21f39d
refactor(DocParser): enclose packages as a whole
MaoSong2022 Jun 13, 2024
8ddd76e
refactor(batch_process.py): move batch_process.py to scripts/
MaoSong2022 Jun 13, 2024
3ab4efb
docs(setup.py): update metadata
MaoSong2022 Jun 13, 2024
54a261c
test(tests/): change import path
MaoSong2022 Jun 13, 2024
357fe6b
refactor(DocParser): update import path
MaoSong2022 Jun 13, 2024
caed5c4
refactor(scripts): change package import path
MaoSong2022 Jun 13, 2024
057dba1
feat(scripts): remove unused scripts
MaoSong2022 Jun 13, 2024
3d5f832
refactor(arxiv_download.py): simplify code logic
MaoSong2022 Jun 13, 2024
08753af
refactor(batch_process.py): simplify code logic
MaoSong2022 Jun 13, 2024
50f5641
feat(scripts/): remove unused script
MaoSong2022 Jun 13, 2024
622fadd
refactor(generate_reading_annotation.py): simplify code logic and add…
MaoSong2022 Jun 13, 2024
2c7f3ea
refactor(retrieve_metadata.py): retrieve metadata for papers
MaoSong2022 Jun 13, 2024
12e689b
fix(retrieve_metadata.py): typo
MaoSong2022 Jun 13, 2024
652aaf4
refactor(scripts): simplify code logic
MaoSong2022 Jun 13, 2024
ced7f36
feat(scripts): remove unused scripts
MaoSong2022 Jun 13, 2024
d564e08
perf(settings.json): add some nouns used in this project
MaoSong2022 Jun 13, 2024
85b080c
refactor(layout_annotation.py): rm reading_annotation
MaoSong2022 Jun 17, 2024
0588f2c
fix(main.py): make dirs twice
MaoSong2022 Jun 17, 2024
5f3768c
refactor(renderer.py): merge logic of processing predefined color
MaoSong2022 Jun 17, 2024
0b5af51
refactor(renderer.py, test/): use more meaningful name
MaoSong2022 Jun 17, 2024
b36c798
fix(all): Module DocParser not found
MaoSong2022 Jul 15, 2024
7ac0c6c
fix(all): use absolute import
MaoSong2022 Jul 17, 2024
a12da9f
refactor(all): extract logger as a separate module
MaoSong2022 Jul 17, 2024
e4c3713
fix(arxiv_cleaner): import path error
MaoSong2022 Jul 18, 2024
4c92883
refactor(all): refactor the whole projects for open-source
MaoSong2022 Jan 13, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 25 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -1,35 +1,59 @@
{
"cSpell.words": [
"arxiv",
"autoref",
"colorlinks",
"columnsep",
"columnwidth",
"definecolor",
"documentclass",
"dtype",
"epspdf",
"eqref",
"flickr",
"footnotetext",
"graphicspath",
"headheight",
"headsep",
"hoffset",
"hyperref",
"hypersetup",
"imread",
"includegraphics",
"iscrowd",
"labelcref",
"laparams",
"latexpand",
"levelname",
"lstinputlisting",
"lstlisting",
"lstset",
"nonstopmode",
"oddsidemargin",
"opencv",
"pageref",
"pdfcrop",
"pdflatex",
"pdfminer",
"psfig",
"regionprops",
"renewcommand",
"rgbcolor",
"scikit",
"skimage",
"subimport",
"synctex",
"tablefootnote",
"texlive",
"Texsoup",
"textcolor",
"textwidth",
"topmargin",
"tqdm",
"usepackage",
"voffset",
"vrdu",
"xcolor"
"xcolor",
"YOLO"
]
}
File renamed without changes.
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
tree with navigation, search, and modification utilities.
"""

from TexSoup.TexSoup.tex import read
from TexSoup.TexSoup.data import TexNode
from .tex import read
from .data import TexNode

__version__ = '0.3.1'

Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import re

from TexSoup.TexSoup import TexSoup
from TexSoup.TexSoup.data import TexEnv, TexText, TexCmd, TexGroup
from DocParser.TexSoup.TexSoup import TexSoup
from DocParser.TexSoup.TexSoup.data import TexEnv, TexText, TexCmd, TexGroup


from vrdu import logger
from vrdu.config import envs
from DocParser.logger import logger
from DocParser.vrdu.config import envs

log = logger.get_logger(__name__)

Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from arxiv_cleaner.file_utils import (
from .file_utils import (
build_relative_path, combine_paths, copy_files, create_temp_dir,
does_file_exist, find_files, remove_temp_dir,
remove_unnecessary_blank_lines)
from arxiv_cleaner.latex import LatexRunner
from arxiv_cleaner.logger import Logger
from .latex import LatexRunner
from .logger import Logger


class Cleaner:
Expand Down
File renamed without changes.
File renamed without changes.
4 changes: 2 additions & 2 deletions arxiv_cleaner/latex.py → DocParser/arxiv_cleaner/latex.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import re
import subprocess

from arxiv_cleaner.cli import run_command, check_command_results
from arxiv_cleaner.file_utils import (
from .cli import run_command, check_command_results
from .file_utils import (
build_relative_path,
change_extension,
combine_paths,
Expand Down
File renamed without changes.
4 changes: 2 additions & 2 deletions arxiv_cleaner/main.py → DocParser/arxiv_cleaner/main.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from arxiv_cleaner.arguments import parse_args
from arxiv_cleaner.cleaner import Cleaner
from arguments import parse_args
from cleaner import Cleaner


def main():
Expand Down
Empty file added DocParser/logger/__init__.py
Empty file.
File renamed without changes.
140 changes: 74 additions & 66 deletions main.py → DocParser/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,77 +2,89 @@
import glob
import os
import shutil
from pathlib import Path
from typing import List
from tqdm import tqdm
from loguru import logger

from DocParser.vrdu import utils
from DocParser.vrdu import renderer
from DocParser.vrdu import preprocess
from DocParser.vrdu import layout_annotation as layout
from DocParser.vrdu import order_annotation as order
from DocParser.vrdu.config import config
from DocParser.vrdu.quality_check import generate_quality_report

from vrdu import logger
from vrdu import utils
from vrdu import renderer
from vrdu import preprocess
from vrdu import layout_annotation as layout
from vrdu import order_annotation as order
from vrdu.config import config
from vrdu.quality_check import generate_quality_report
logger.add("vrdu_debug.log", mode="w")

log = logger.setup_app_level_logger(file_name="vrdu_debug.log")


def transform_tex_to_images(main_directory: str) -> None:
def transform_tex_to_images(main_directory: Path) -> None:
"""
Transforms TeX files with pattern paper_*.tex in the specified directory into jpg images.

Args:
main_directory (str): The main directory where the TeX files are located.
main_directory (Path): The main directory where the TeX files are located.

Returns:
None
"""
tex_files = glob.glob(f"{main_directory}/paper_*.tex")
output_directory = os.path.join(main_directory, "output")
for tex_file in tqdm(tex_files):
log.debug(f"[VRDU] file: {tex_file}, start transforming into images.")
utils.compile_latex(tex_file)
output_directory = Path(main_directory) / "output"
for tex_file in tqdm(tex_files, desc="Converting TeX files to images"):
logger.debug(f"[VRDU] file: {tex_file}, start transforming into images.")
# Set colored flag based on filename
colored = "paper_colored.tex" in tex_file
utils.compile_latex(tex_file, colored=colored)

# get the pdf file name
filename_without_extension = os.path.splitext(os.path.basename(tex_file))[0]
pdf_file = os.path.join(main_directory, f"{filename_without_extension}.pdf")
filename_without_extension = Path(tex_file).stem
pdf_file = Path(main_directory) / f"{filename_without_extension}.pdf"

# convert into images
image_directory = os.path.join(output_directory, filename_without_extension)
os.makedirs(image_directory)
utils.pdf2jpg(pdf_file, image_directory)
image_directory = output_directory / filename_without_extension
image_directory.mkdir(parents=True, exist_ok=True)
utils.pdf2jpg(str(pdf_file), str(image_directory))


def get_redundant_folders(main_directory: Path) -> List[str]:
"""Get list of redundant folders to remove."""
pattern = f"{main_directory}/output/paper_{config.folder_prefix}*"
redundant_folders = glob.glob(pattern)
redundant_folders.extend(
[
f"{main_directory}/output/paper_white",
f"{main_directory}/output/paper_original",
]
)
return redundant_folders


def remove_redundant_stuff(main_directory: str) -> None:
def remove_redundant_stuff(main_directory: Path) -> None:
"""
Remove redundant files and folders from the main directory.

Args:
main_directory (str): The path of the main directory.
main_directory (Path): The path of the main directory.

Returns:
None
"""
# remove generated tex related files
redundant_files = glob.glob(f"{main_directory}/paper_*")
for file in redundant_files:
for file in glob.glob(f"{main_directory}/paper_*"):
os.remove(file)

# remove useless pdf and image files
# TODO: move this name pattern into config
redundant_folders = glob.glob(
f"{main_directory}/output/paper_{config.folder_prefix}*"
)
redundant_folders += [
f"{main_directory}/output/paper_white",
f"{main_directory}/output/paper_original",
]
for folder in redundant_folders:
for folder in get_redundant_folders(main_directory):
if os.path.exists(folder):
shutil.rmtree(folder)


def process_one_file(file_name: str) -> None:
def check_if_already_processed(main_directory: Path) -> bool:
quality_report_file = main_directory / "output/result/quality_report.json"
return quality_report_file.exists()


def process_one_file(file_name: Path) -> None:
"""
Process a file through multiple steps including preprocessing, rendering,
transforming into images, generating annotations, and handling exceptions.
Expand All @@ -83,37 +95,32 @@ def process_one_file(file_name: str) -> None:
Returns:
None
"""
main_directory = os.path.dirname(file_name)
log.info(f"[VRDU] file: {file_name}, start processing.")
main_directory = Path(file_name).parent
logger.info(f"[VRDU] file: {file_name}, start processing.")

# check if this paper has been processed
quality_report_file = os.path.join(
main_directory, "output/result/quality_report.json"
)
if os.path.exists(quality_report_file):
log.info(f"[VRDU] file: {file_name}, paper has been processed")
if check_if_already_processed(main_directory):
logger.info(f"[VRDU] file: {file_name}, paper has been processed")
return

# make a copy of the original tex file
original_tex = os.path.join(main_directory, "paper_original.tex")
original_tex = main_directory / "paper_original.tex"
shutil.copyfile(file_name, original_tex)

# remove the output folder if it exists
output_directory = os.path.join(main_directory, "output")
if os.path.exists(output_directory):
output_directory = main_directory / "output"
if output_directory.exists():
shutil.rmtree(output_directory)

# output_directory stores the intermediate results
# result_directory stores the final results
os.makedirs(os.path.join(main_directory, "output/result"))

# change the working directory to the main directory of the paper
cwd = os.getcwd()

try:
# change the working directory to the main directory of the paper
os.chdir(main_directory)
# create output folder
os.makedirs(os.path.join(main_directory, "output/result"))
# create output folder and output/result folder
result_dir = output_directory / "result"
result_dir.mkdir(parents=True)

# step 1: preprocess the paper
preprocess.run(original_tex)
Expand All @@ -122,14 +129,14 @@ def process_one_file(file_name: str) -> None:
vrdu_renderer = renderer.Renderer()
vrdu_renderer.render(original_tex)

# step 2.2: compling tex into PDFs
log.info(
# step 2.2: compiling tex into PDFs
logger.info(
f"[VRDU] file: {original_tex}, start transforming into images, this may take a while..."
)
transform_tex_to_images(main_directory)

# Step 3: generate annotations
log.info(
logger.info(
f"[VRDU] file: {original_tex}, start generating annotations, this may take a while..."
)
vrdu_layout_annotation = layout.LayoutAnnotation(original_tex)
Expand All @@ -141,14 +148,15 @@ def process_one_file(file_name: str) -> None:
# generate quality report for simple debugging
generate_quality_report(main_directory)

log.info(f"[VRDU] file: {original_tex}, successfully processed.")
logger.info(f"[VRDU] file: {original_tex}, successfully processed.")

except Exception as e:
error_type = e.__class__.__name__
error_info = str(e)
log.error(
f"[VRDU] file: {file_name}, type: {error_type}, message: {error_info}"
)
# error_type = e.__class__.__name__
# error_info = str(e)
# logger.error(
# f"[VRDU] file: {file_name}, type: {error_type}, message: {error_info}"
# )
raise e

finally:
# remove redundant files
Expand Down Expand Up @@ -183,18 +191,18 @@ def main() -> None:
Returns:
None
"""
parser = argparse.ArgumentParser()
parser = argparse.ArgumentParser(
description="Process TeX files to generate annotations and images"
)
parser.add_argument(
"-f",
"--file_name",
type=str,
type=Path,
required=True,
help="The name of the tex file will full path",
help="The path to the TeX file to process",
)
args = parser.parse_args()
file_name = args.file_name

process_one_file(file_name)
process_one_file(Path(args.file_name))


if __name__ == "__main__":
Expand Down
Empty file added DocParser/vrdu/__init__.py
Empty file.
Loading
Loading