Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions docs/pymupdf4llm/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,29 @@ The PyMuPDF4LLM API
}
}

.. method:: markdown_to_pdf(md_path: str | pathlib.Path, \
user_css: str | None = None, \
page_rect: rect-like | None = None, \
margins: rect-like | None = None, \
archive: str | pathlib.Path | None = None, \
output_path: str | pathlib.Path | None = None) -> pymupdf.Document | None

Convert the markdown text content of the file specified by `md_path` into a PDF document.

The function is always available -- independently of whether you are using the PyMuPDF Layout module or not.

:arg str|Path md_path: the file path of the markdown file to be converted.

:arg str|None user_css: optional, a string of CSS code to be applied to the markdown content. This may be used to customize the appearance of the generated PDF document. If `None` (default), the built-in default CSS is used.

:arg rect-like|None page_rect: optional, the rectangle defining the page boundaries for the generated PDF document. If `None` (default), ISO A4 page dimensions are used. To use one of PyMuPDF's predefined page formats, use e.g. ``pymupdf.paper_rect("Letter")``.

:arg rect-like|None margins: optional, the margins (borders) for the generated pages. This must be a sequence of four floats ``[left, top, right, bottom]`` specifying the respective border width in points (1/72 inches). If `None` (default), the default ``[50, 50, 50, 50]`` margins are used.

:arg str|Archive|None archive: optional. This is required if the markdown source references images that are **not** stored in the same folder as the markdown file. In this case, `archive` must be a `pymupdf.Archive` object which provides access to the respective image files. If `None` (default), it is assumed that all referenced images are stored in the same folder as the markdown file. The parameter **may** also be required if a custom ``user_css`` references external resources like font files.

:arg str|Path|None output_path: optional, the file path where the generated PDF document will be saved. If specified, the generated PDF will be saved to that location. If `None` (default), the document is returned as a `pymupdf.Document` object.

.. note::

Please see `this site <https://github.com/pymupdf/pymupdf4llm/discussions/327>`_ for more background and the current status of further improvements regarding usage with :ref:`PyMuPDF Layout <pymupdf-layout>`.
Expand Down
83 changes: 63 additions & 20 deletions src/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2897,7 +2897,7 @@ def __getitem__(self, i=0):
raise IndexError(f"page {i} not in document")
return self.load_page(i)

def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0, height=0, fontsize=11):
def __init__(self, filename=None, stream=None, filetype=None, archive=None, rect=None, width=0, height=0, fontsize=11):
"""Creates a document. Use 'open' as a synonym.

Notes:
Expand Down Expand Up @@ -2943,7 +2943,14 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0

self._name = filename
self.stream = stream

if isinstance(archive, pathlib.Path):
archive = Archive(archive.name)
elif isinstance(archive, str):
archive = Archive(archive)
elif not archive:
archive = Archive()
elif not isinstance(archive, Archive):
raise TypeError(f"bad archive: {type(archive)=}.")
if stream is not None:
if filename is not None and filetype is None:
# 2025-05-06: Use <filename> as the filetype. This is
Expand All @@ -2958,6 +2965,8 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0
stream = stream.getvalue()
else:
raise TypeError(f"bad stream: {type(stream)=}.")

# this prevents bad things if original goes out of existence:
self.stream = stream

assert isinstance(stream, (bytes, memoryview))
Expand All @@ -2967,9 +2976,9 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0
# raise a specific exception.
raise EmptyFileError('Cannot open empty stream.')

stream2 = mupdf.fz_open_memory(mupdf.python_buffer_data(stream), len(stream))
fz_stream = mupdf.fz_open_memory(mupdf.python_buffer_data(stream), len(stream))
try:
doc = mupdf.fz_open_document_with_stream(filetype if filetype else '', stream2)
doc = mupdf.fz_open_document_with_stream_and_dir(filetype if filetype else '', fz_stream, archive.this)
except Exception as e:
if g_exceptions_verbose > 1: exception_info()
raise FileDataError('Failed to open stream') from e
Expand All @@ -2996,20 +3005,15 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0
raise EmptyFileError(f'Cannot open empty file: {filename=}.')

if filetype:
# Override the type implied by <filename>. MuPDF does not
# have a way to do this directly so we open via a stream.
try:
fz_stream = mupdf.fz_open_file(filename)
doc = mupdf.fz_open_document_with_stream(filetype, fz_stream)
except Exception as e:
if g_exceptions_verbose > 1: exception_info()
raise FileDataError(f'Failed to open file {filename!r} as type {filetype!r}.') from e
suffix = filetype
else:
try:
doc = mupdf.fz_open_document(filename)
except Exception as e:
if g_exceptions_verbose > 1: exception_info()
raise FileDataError(f'Failed to open file {filename!r}.') from e
suffix = pathlib.Path(filename).suffix.strip(".")
try:
fz_stream = mupdf.fz_open_file(filename)
doc = mupdf.fz_open_document_with_stream_and_dir(suffix, fz_stream, archive.this)
except Exception as e:
if g_exceptions_verbose > 1: exception_info()
raise FileDataError(f'Failed to open file {filename!r} as type {suffix}.') from e

else:
pdf = mupdf.PdfDocument()
Expand Down Expand Up @@ -19098,7 +19102,7 @@ def JM_copy_rectangle(page, area):
return s


def JM_convert_to_pdf(doc, fp, tp, rotate):
def JM_convert_to_pdf(doc, fp, tp, rotate) -> bytes:
'''
Convert any MuPDF document to a PDF
Returns bytes object containing the PDF, created via 'write' function.
Expand All @@ -19113,7 +19117,8 @@ def JM_convert_to_pdf(doc, fp, tp, rotate):
e = fp # ... range
rot = JM_norm_rotation(rotate)
i = fp
while 1: # interpret & write document pages as PDF pages
internal_links = [] # collect PDF-wide internal links here
while 1: # interpret & write document pages as PDF pages
if not _INRANGE(i, s, e):
break
page = mupdf.fz_load_page(doc, i)
Expand All @@ -19124,11 +19129,49 @@ def JM_convert_to_pdf(doc, fp, tp, rotate):
dev = None
page_obj = mupdf.pdf_add_page(pdfout, mediabox, rot, resources, contents)
mupdf.pdf_insert_page(pdfout, -1, page_obj)

# also copy links to the output PDF page
# get the PDF page we've just created
pdf_page = mupdf.pdf_load_page(pdfout, i)

# loop through source page links
link = mupdf.fz_load_links(page) # load first link
while link.m_internal: # break loop when link is None
uri = link.uri() # URI string
rect = mupdf.FzRect(link.rect()) # link "from" rectangle
is_external = mupdf.fz_is_external_link(uri)

if is_external: # external links can be copied directly
mupdf.pdf_create_link(pdf_page, rect, uri)
else: # internal links done when PDF is complete
# find target of internal link
ret, xp, yp = mupdf.fz_resolve_link(doc, uri)
ilink={"page": i, "ret": ret, "from": rect, "h": rect.y1-rect.y0, "w": rect.x1-rect.x0, "xp": xp, "yp": yp}
internal_links.append(ilink)
link = link.next()

i += incr

# PDF created - now write it to Python bytearray
# insert any internal links collected before:
for ilink in internal_links:
pdf_page = mupdf.pdf_load_page(pdfout, ilink["page"])
ret = ilink["ret"]
dest = mupdf.fz_link_dest()
dest.type = 7 # XYZ destination format
dest.loc.chapter = ret.chapter
dest.loc.page = ret.page
dest.h = ilink["h"]
dest.w = ilink["w"]
dest.x = ilink["xp"]
dest.y = ilink["yp"]
dest.zoom = 0
rect=ilink["from"]
uri = mupdf.pdf_new_uri_from_explicit_dest(mupdf.FzLinkDest(dest))
mupdf.pdf_create_link(pdf_page, rect, uri)
# prepare write options structure
opts = mupdf.PdfWriteOptions()
opts.do_garbage = 4
opts.do_garbage = 3
opts.do_compress = 1
opts.do_compress_images = 1
opts.do_compress_fonts = 1
Expand Down
37 changes: 26 additions & 11 deletions tests/test_general.py
Original file line number Diff line number Diff line change
Expand Up @@ -1444,8 +1444,8 @@ def check(filename=None, stream=None, filetype=None, exception=None):
etype = pymupdf.FileDataError
etype2 = 'FzErrorBase' if platform.system() == 'OpenBSD' else 'FzErrorUnsupported'
etext = (
re.escape(f'mupdf.{etype2}: code=6: cannot find document handler for file: {path}'),
re.escape(f'pymupdf.FileDataError: Failed to open file {path!r}.'),
re.escape(f'mupdf.{etype2}: code=6: cannot find document handler for file type: \'pickle\''),
re.escape(f'pymupdf.FileDataError: Failed to open file {path!r} as type pickle.'),
)
check(path, exception=(etype, etext))

Expand Down Expand Up @@ -1518,7 +1518,7 @@ def dict_set_path(dict_, *items):
results = dict()

for path in paths:
print(path)
print(f'{path=}')
for ext in extensions:
path2 = f'{root}/foo{ext}'
path3 = shutil.copy2(f'{root}/{path}', path2)
Expand Down Expand Up @@ -1548,7 +1548,7 @@ def dict_set_path(dict_, *items):
e = ee
wt = pymupdf.TOOLS.mupdf_warnings()
text = get_result(e, document)
print(f' fz_open_document_with_stream(magic={ext!r}) => {text}')
print(f' fz_open_document_with_stream() {path=} magic={ext!r}) => {text}')
dict_set_path(results, path, ext, 'stream', text)

finally:
Expand Down Expand Up @@ -1591,16 +1591,31 @@ def dict_set_path(dict_, *items):
with open(path_out, 'w') as f:
json.dump(results, f, indent=4, sort_keys=1)

with open(os.path.normpath(f'{__file__}/../../tests/resources/test_open2_expected.json')) as f:
path_expected = os.path.normpath(f'{__file__}/../../tests/resources/test_open2_expected.json')
with open(path_expected) as f:
results_expected = json.load(f)
if results != results_expected:
print(f'results != results_expected:')
def show(r, name):
text = json.dumps(r, indent=4, sort_keys=1)
print(f'{name}:')
print(textwrap.indent(text, ' '))
show(results_expected, 'results_expected')
show(results, 'results')
print(f'Expected: {path_expected}')
print(f'Actual: {path_out}')
if 0:
# Show entire json data. Very verbose.
def show(r, name):
text = json.dumps(r, indent=4, sort_keys=1)
print(f'{name}:')
print(textwrap.indent(text, ' '))
show(results_expected, 'results_expected')
show(results, 'results')
json_expected = json.dumps(results_expected, indent=4, sort_keys=1)
json_actual = json.dumps(results, indent=4, sort_keys=1)
import difflib
diff = difflib.unified_diff(
json_expected.split('\n'),
json_actual.split('\n'),
lineterm='',
)
print(f'Diff expected => actual:')
print(textwrap.indent('\n'.join(diff), ' '))
assert 0


Expand Down
Loading