-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathpdf2llm.py
More file actions
executable file
·295 lines (236 loc) · 10.2 KB
/
pdf2llm.py
File metadata and controls
executable file
·295 lines (236 loc) · 10.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "platformdirs",
# "markitdown[pdf]",
# "pymupdf4llm",
# "psutil",
# ]
# ///
# See https://docs.astral.sh/uv/guides/scripts/#using-a-shebang-to-create-an-executable-file
"""pdf2llm.py here.
https://github.com/wilsonmar/python-samples/blob/main/pdf2llm.py
This code uses several techniques to parse a PDF file for use by LLM.
Sample file "LDSPrep-V8.pdf" is 509 pages in 9.8 MB.
# Before running, on a Terminal
# Create a folder:
rm -rf .venv .pytest_cache __pycache__
rm pyproject.toml
uv init # create README.md, pyproject.toml, main.py, .python-version (latest), .gitignore, .git
python -m venv .venv # creates bin, include, lib, pyvenv.cfg
source .venv/bin/activate
#uv venv --python python3.12
uv add --frozen MarkItDown[pdf]
uv add --frozen pymupdf4llm
uv add --frozen requests
uv add platformdirs
uv pip install -e .
#uv sync
chmod +x pdf2llm.py
ruff check pdf2llm.py
uv run pdf2llm.py -v -vv -f "LDSPrep-V8.pdf"
AFTER RUN:
deactivate
rm -rf .venv .pytest_cache __pycache__
"""
__last_change__ = "25-11-17 v001 + new :pdf2llm.py"
__status__ = "NOT WORKING - new"
# Built-in libraries:
import argparse
from datetime import datetime, timezone
#from pathlib import Path
import platform
#import re
import psutil
import time
# External libraries defined in requirements.txt:
try:
    import pymupdf4llm
    from markitdown import MarkItDown
    #import matplotlib. pyplot
    #import requests
except Exception as e:
    # Broad on purpose: a third-party import can fail with more than
    # ImportError (e.g. a native library that cannot be loaded), and every
    # such failure gets the same recovery hint.
    print(f"Python module import failed: {e}")
    #print(" sys.prefix = ", sys.prefix)
    #print(" sys.base_prefix = ", sys.base_prefix)
    # "uv venv" is the uv subcommand that creates a virtual environment.
    print("Please activate your virtual environment:\n uv venv .venv\n source .venv/bin/activate")
    # SystemExit instead of exit(): exit() is injected by the site module
    # and is not guaranteed to exist. Exit status 9 is preserved.
    raise SystemExit(9)
# Global static variables default values for override by args:
# The three SHOW_* switches are rewritten by read_cmd_args() and are also
# forced to True by the __main__ section at the bottom of this file.
SHOW_VERBOSE = False
SHOW_DEBUG = False
SHOW_SUMMARY = False
# Default compute device name.
gpu_device = "cpu"
# MAX_LOOPS and SLEEP_SECS are not referenced elsewhere in the visible code.
MAX_LOOPS = 3 # 0 = infinite
SLEEP_SECS = 1
# Program Timings:
# For wall time measurements:
# Naive local start time, captured when the module is loaded; pgm_summary()
# uses it to compute elapsed wall time.
pgm_strt_datetimestamp = datetime.now()
# NOTE: this print runs whenever the module is loaded, not only when it is
# executed as a script.
print(platform.system()) # Darwin, Linux, Windows, etc.
def read_cmd_args() -> None:
    """Parse command-line arguments and update the module-level settings.

    Each flag only ever switches its setting on; ``-q/--quiet`` is applied
    last and switches all three ``SHOW_*`` settings off again.

    Globals written: SHOW_VERBOSE, SHOW_DEBUG, SHOW_SUMMARY, gpu_device.

    ``-g/--gpu`` may be given bare (gpu_device becomes ``True``, as the
    original store_true flag would have produced) or with a device name,
    e.g. ``-g mps`` (gpu_device becomes ``"mps"``).

    ``-r/--ray`` is accepted but its value is not used by this function.

    See https://realpython.com/command-line-interfaces-python-argparse/
    """
    # Declared up front so every assignment below updates the module-level
    # value. gpu_device was previously missing here, which made its
    # assignment a discarded local.
    global SHOW_VERBOSE, SHOW_DEBUG, SHOW_SUMMARY, gpu_device

    parser = argparse.ArgumentParser(allow_abbrev=True, description="pdf2llm.py")
    parser.add_argument("-q", "--quiet", action="store_true", help="Run without output")
    parser.add_argument("-v", "--verbose", action="store_true", help="Show inputs into functions")
    parser.add_argument("-vv", "--debug", action="store_true", help="Debug outputs from functions")
    parser.add_argument("-s", "--summary", action="store_true", help="Show summary stats")
    # nargs="?" with const=True keeps a bare "-g" working while also
    # accepting an explicit device name such as: -g "mps"
    parser.add_argument("-g", "--gpu", nargs="?", const=True, default=None,
                        metavar="DEVICE", help="gpu device")
    parser.add_argument("-r", "--ray", action="store_true", help="use ray")
    # Default -h = --help (list arguments)
    args = parser.parse_args()

    # Override defaults with run-time parameters, in sequence of workflow:
    if args.verbose:  # -v --verbose
        SHOW_VERBOSE = True
    if args.debug:  # -vv --debug
        SHOW_DEBUG = True
    if args.summary:  # -s --summary
        SHOW_SUMMARY = True
    if args.gpu:  # -g --gpu
        gpu_device = args.gpu
    if args.quiet:  # -q --quiet wins over every SHOW_* flag above.
        SHOW_VERBOSE = False
        SHOW_DEBUG = False
        SHOW_SUMMARY = False
    return None
### OS-level utilities:
def is_macos() -> bool:
    """Report whether the host operating system identifies itself as macOS."""
    # platform.system() reports "Darwin" on macOS.
    system_name = platform.system()
    return system_name == "Darwin"
# For custom GPU operations in Python, use pyobjc to interface with Apple Metal APIs, but it is more complex than using PyTorch/TensorFlow.
#### Utility Time Functions:
def day_of_week(local_time_obj) -> str:
    """Return the English weekday name for a date/datetime object.

    The name is looked up with ``local_time_obj.weekday()``, where
    Monday is 0 and Sunday is 6.
    """
    weekday_names = (
        "Monday",
        "Tuesday",
        "Wednesday",
        "Thursday",
        "Friday",
        "Saturday",
        "Sunday",
    )
    weekday_index = local_time_obj.weekday()
    return str(weekday_names[weekday_index])
def timestamp_local() -> str:
    """Generate a timestamp string containing the local time with AM/PM & time zone code."""
    # Date, 12-hour clock time, AM/PM marker, then zone name and UTC offset.
    stamp_format = "%Y-%m-%d_%I:%M:%S %p %Z%z"
    # astimezone() with no argument attaches the system's local time zone.
    return datetime.now().astimezone().strftime(stamp_format)
def timestamp_utc() -> str:
    """Generate a compact UTC "Z" timestamp string that is safe in file names.

    Returns:
        The current UTC time formatted as ``YYYYMMDDTHHMMSSZ``, for example
        ``20251118T025020Z``. The format deliberately contains no ":"
        because colons are not allowed in file names.
    """
    # Timezone-aware "now" in UTC. The former time.time() call was dead
    # code: its result was overwritten before being used.
    now_utc = datetime.now(timezone.utc)
    return now_utc.strftime('%Y%m%dT%H%M%SZ')
def func_timer_strt():
    """Return a time.perf_counter() reading to hand to func_timer_stop() later."""
    return time.perf_counter()
def func_timer_stop(strt_time):
    """Return the seconds elapsed since strt_time, a value from func_timer_strt()."""
    return time.perf_counter() - strt_time
def string_byte_count(string: str, encoding='utf-8') -> int:
    """Return how many bytes the string occupies once encoded (UTF-8 by default)."""
    return len(string.encode(encoding))
def user_gb_mem_avail() -> float:
    """Return the amount of currently available system RAM, using psutil.

    The value is psutil's ``virtual_memory().available`` byte count divided
    by 1024**3. psutil is used because it is cross-platform, unlike summing
    the "MemFree", "Buffers", and "Cached" kB values of Linux /proc/meminfo.
    """
    bytes_per_gb = 1024 ** 3
    return psutil.virtual_memory().available / bytes_per_gb
def pgm_memory_used() -> tuple[float, str]:
    """Return the memory in use by the current process, using psutil.

    Returns:
        A ``(mem, process_info)`` pair: ``mem`` is the process's RSS value
        converted from bytes to MiB, and ``process_info`` is ``str()`` of
        the ``psutil.Process`` object for the current process.
    """
    process = psutil.Process()  # No pid argument: the current process.
    process_info = str(process)
    mem = process.memory_info().rss / (1024 ** 2)  # bytes -> MiB
    return mem, process_info
def pgm_diskspace_free(path: str = '/') -> float:
    """Return the free disk space reported by psutil for ``path``.

    Args:
        path: Location handed to ``psutil.disk_usage()``. Defaults to
            ``'/'``, the value that used to be hard-coded, so existing
            zero-argument calls behave exactly as before.

    Returns:
        The ``free`` byte count divided by 1024**3.
    """
    disk = psutil.disk_usage(path)
    free_space_gb = disk.free / (1024 ** 3)
    return free_space_gb
def pgm_summary(std_strt_datetimestamp, loops_count):
    """Print an end-of-run summary line, plus resource deltas when SHOW_DEBUG is set.

    Args:
        std_strt_datetimestamp: Start time of the run as a ``datetime``.
            This argument used to be ignored in favour of the module-level
            ``pgm_strt_datetimestamp``; it is now honoured, and any value
            that is not a ``datetime`` falls back to that global.
        loops_count: Loop number reported in the summary line.

    When SHOW_DEBUG is true this also reads the module-level
    ``pgm_strt_mem_used`` and ``pgm_strt_disk_free`` values, which the
    ``__main__`` section assigns in its own SHOW_DEBUG branch.
    """
    if isinstance(std_strt_datetimestamp, datetime):
        strt_datetimestamp = std_strt_datetimestamp
    else:
        strt_datetimestamp = pgm_strt_datetimestamp
    # Match the start value's tz-awareness so the subtraction cannot raise
    # for an aware start; tzinfo is None for a naive datetime.now() value.
    pgm_stop_datetimestamp = datetime.now(strt_datetimestamp.tzinfo)
    # The message says "seconds", so print a number of seconds instead of
    # the timedelta's default "H:MM:SS.ffffff" rendering.
    pgm_elapsed_secs = (pgm_stop_datetimestamp - strt_datetimestamp).total_seconds()
    if SHOW_DEBUG:
        pgm_stop_mem_used, process_data = pgm_memory_used()
        pgm_stop_mem_diff = pgm_stop_mem_used - pgm_strt_mem_used
        print(f"{pgm_stop_mem_diff:.6f} MB memory consumed during run in {process_data}.")
        # Free space at start minus free space now = space consumed.
        pgm_stop_disk_diff = pgm_strt_disk_free - pgm_diskspace_free()
        print(f"{pgm_stop_disk_diff:.6f} GB disk space consumed during run.")
        print(f"SUMMARY: Ended while attempting loop {loops_count} in {pgm_elapsed_secs:.6f} seconds.")
    else:
        print(f"SUMMARY: Ended while attempting loop {loops_count}.")
def use_markitdown(input_pdf):
    """Convert input_pdf with MarkItDown and return the conversion result.

    The value returned is whatever ``MarkItDown().convert()`` produces; the
    caller in this file applies ``str()`` to it before writing it out.
    """
    # Original author's note: handles OCR and image descriptions when
    # integrated with LLMs like OpenAI models.
    converter = MarkItDown()
    return converter.convert(input_pdf)
def use_pymupdf4llm(input_pdf):
    """Convert input_pdf with pymupdf4llm and return the to_markdown() result."""
    return pymupdf4llm.to_markdown(input_pdf)
# Vision Parse: Uses Vision LLMs to convert PDFs, recognizing text, tables, and formatting into Markdown. https://www.reddit.com/r/MachineLearning/comments/1hg5d3p/p_vision_parse_parse_pdf_documents_into_markdown/
# llama-parse: Uses an API to convert PDFs to Markdown with structure preservation, requiring API key setup. https://stackoverflow.com/questions/77834102/converting-pdf-to-markdown-in-python-with-structure-preservation
if __name__ == '__main__':
    # Script entry point: convert one hard-coded PDF to markdown with the
    # selected library and write the result to a timestamped .md file.

    # NOTE: the output switches are forced on here; read_cmd_args() is not
    # called, so command-line flags currently have no effect.
    SHOW_VERBOSE = True
    SHOW_DEBUG = True
    SHOW_SUMMARY = True

    print("\n# Program command variables: ")
    print(f" -v SHOW_VERBOSE={SHOW_VERBOSE}")
    print(f" -vv SHOW_DEBUG={SHOW_DEBUG}")
    print(f" -s SHOW_SUMMARY={SHOW_SUMMARY}")

    local_timestamp = timestamp_local()

    if SHOW_DEBUG:
        # Starting values that pgm_summary() later compares against.
        pgm_strt_mem_used, pgm_process = pgm_memory_used()
        print(f"DEBUG: {pgm_process}")
        print("DEBUG: pgm_memory used()="+str(pgm_strt_mem_used)+" MiB being used.")
        # Stored under a different name so the user_gb_mem_avail() function
        # is not replaced by a float and stays callable.
        gb_mem_avail = user_gb_mem_avail()
        print(f"DEBUG: user_gb_mem_avail()={gb_mem_avail:.2f} GB")
        pgm_strt_disk_free = pgm_diskspace_free()
        print(f"DEBUG: pgm_diskspace_free()={pgm_strt_disk_free:.2f} GB")
        # list_disk_space_by_device()

    # From https://www.ldsavow.com/LDSPREP/LDSPrep-V8.pdf
    algo = "pymupdf4llm"
    #algo = "MarkItDown"
    input_pdf = "LDSPrep-V8.pdf"
    yymmdd = timestamp_utc()
    # Derive the output name from the input name, so changing input_pdf
    # does not leave a stale hard-coded stem in the output file name.
    md_out_filename = f"{input_pdf.removesuffix('.pdf')}-{algo}-{yymmdd}.md"

    if algo == "pymupdf4llm":
        md_out = use_pymupdf4llm(input_pdf)
    elif algo == "MarkItDown":
        md_out = use_markitdown(input_pdf)
        #TODO: Convert UTF-8 "" to "##" markdown
        #input_string = "\f\fFF"
        #output_string = input_string.replace("\f\fFF", "##")
        # print(output_string)
    else:
        print("Invalid algo.")
        # Non-zero status so callers can detect the failure; exit() with no
        # argument reported success.
        raise SystemExit(1)

    md_text = str(md_out)
    # Label the size with the algorithm that actually ran.
    print(f"{algo} produced {string_byte_count(md_text)} bytes.")
    #Too big to print(md_text)

    # Write as UTF-8 explicitly: the byte count above assumes UTF-8, and the
    # platform default encoding may be unable to encode the extracted text.
    with open(md_out_filename, "w", encoding="utf-8") as out_file:
        out_file.write(md_text)
    print(f"pdf2llm.py wrote {md_out_filename}") # 2MB