pymupdf · JorjMcKie · May 21, 2026 · May 21, 2026 · May 21, 2026 · May 19, 2026
diff --git a/docs/pymupdf4llm/api.rst b/docs/pymupdf4llm/api.rst
@@ -279,6 +279,29 @@ The PyMuPDF4LLM API
             }
         }    
 
+.. method:: markdown_to_pdf(md_path: str | pathlib.Path, \
+    user_css: str | None = None, \
+    page_rect: rect-like | None = None, \
+    margins: rect-like | None = None, \
+    archive: str | pathlib.Path | None = None, \
+    output_path: str | pathlib.Path | None = None) -> pymupdf.Document | None
+
+    Convert the markdown text content of the file specified by `md_path` into a PDF document.
+
+    The function is always available -- independently of whether you are using the PyMuPDF Layout module or not.
+
+    :arg str|Path md_path: the file path of the markdown file to be converted.
+
+    :arg str|None user_css: optional, a string of CSS code to be applied to the markdown content. This may be used to customize the appearance of the generated PDF document. If `None` (default), the built-in default CSS is used.
+
+    :arg rect-like|None page_rect: optional, the rectangle defining the page boundaries for the generated PDF document. If `None` (default), ISO A4 page dimensions are used. To use one of PyMuPDF's predefined page formats, use e.g. ``pymupdf.paper_rect("Letter")``.
+
+    :arg rect-like|None margins: optional, the margins (borders) for the generated pages. This must be a sequence of four floats ``[left, top, right, bottom]`` specifying the respective border width in points (1/72 inches). If `None` (default), the default ``[50, 50, 50, 50]`` margins are used.
+
+    :arg str|Archive|None archive: optional. This is required if the markdown source references images that are **not** stored in the same folder as the markdown file. In this case, `archive` must be a `pymupdf.Archive` object which provides access to the respective image files. If `None` (default), it is assumed that all referenced images are stored in the same folder as the markdown file. The parameter **may** also be required if a custom ``user_css`` references external resources like font files.
+
+    :arg str|Path|None output_path: optional, the file path where the generated PDF document will be saved. If specified, the generated PDF will be saved to that location. If `None` (default), the document is returned as a `pymupdf.Document` object.
+
 .. note::
 
     Please see `this site <https://github.com/pymupdf/pymupdf4llm/discussions/327>`_ for more background and the current status of further improvements regarding usage with :ref:`PyMuPDF Layout <pymupdf-layout>`.

diff --git a/src/__init__.py b/src/__init__.py
@@ -2897,7 +2897,7 @@ def __getitem__(self, i=0):
             raise IndexError(f"page {i} not in document")
         return self.load_page(i)
 
-    def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0, height=0, fontsize=11):
+    def __init__(self, filename=None, stream=None, filetype=None, archive=None, rect=None, width=0, height=0, fontsize=11):
         """Creates a document. Use 'open' as a synonym.
 
         Notes:
@@ -2943,7 +2943,14 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0
 
             self._name = filename
             self.stream = stream
-
+            if isinstance(archive, pathlib.Path):
+                archive = Archive(archive.name)
+            elif isinstance(archive, str):
+                archive = Archive(archive)
+            elif not archive:
+                archive = Archive()
+            elif not isinstance(archive, Archive):
+                raise TypeError(f"bad archive: {type(archive)=}.")
             if stream is not None:
                 if filename is not None and filetype is None:
                     # 2025-05-06: Use <filename> as the filetype. This is
@@ -2958,6 +2965,8 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0
                     stream = stream.getvalue()
                 else:
                     raise TypeError(f"bad stream: {type(stream)=}.")
+
+                # this prevents bad things if original goes out of existence:
                 self.stream = stream
 
                 assert isinstance(stream, (bytes, memoryview))
@@ -2967,9 +2976,9 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0
                     # raise a specific exception.
                     raise EmptyFileError('Cannot open empty stream.')
 
-                stream2 = mupdf.fz_open_memory(mupdf.python_buffer_data(stream), len(stream))
+                fz_stream = mupdf.fz_open_memory(mupdf.python_buffer_data(stream), len(stream))
                 try:
-                    doc = mupdf.fz_open_document_with_stream(filetype if filetype else '', stream2)
+                    doc = mupdf.fz_open_document_with_stream_and_dir(filetype if filetype else '', fz_stream, archive.this)
                 except Exception as e:
                     if g_exceptions_verbose > 1:    exception_info()
                     raise FileDataError('Failed to open stream') from e
@@ -2996,20 +3005,15 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0
                     raise EmptyFileError(f'Cannot open empty file: {filename=}.')
 
                 if filetype:
-                    # Override the type implied by <filename>. MuPDF does not
-                    # have a way to do this directly so we open via a stream.
-                    try:
-                        fz_stream = mupdf.fz_open_file(filename)
-                        doc = mupdf.fz_open_document_with_stream(filetype, fz_stream)
-                    except Exception as e:
-                        if g_exceptions_verbose > 1:    exception_info()
-                        raise FileDataError(f'Failed to open file {filename!r} as type {filetype!r}.') from e
+                    suffix = filetype
                 else:
-                    try:
-                        doc = mupdf.fz_open_document(filename)
-                    except Exception as e:
-                        if g_exceptions_verbose > 1:    exception_info()
-                        raise FileDataError(f'Failed to open file {filename!r}.') from e
+                    suffix = pathlib.Path(filename).suffix.strip(".")
+                try:
+                    fz_stream = mupdf.fz_open_file(filename)
+                    doc = mupdf.fz_open_document_with_stream_and_dir(suffix, fz_stream, archive.this)
+                except Exception as e:
+                    if g_exceptions_verbose > 1:    exception_info()
+                    raise FileDataError(f'Failed to open file {filename!r} as type {suffix}.') from e
 
             else:
                 pdf = mupdf.PdfDocument()
@@ -19098,7 +19102,7 @@ def JM_copy_rectangle(page, area):
     return s
 
 
-def JM_convert_to_pdf(doc, fp, tp, rotate):
+def JM_convert_to_pdf(doc, fp, tp, rotate) -> bytes:
     '''
     Convert any MuPDF document to a PDF
     Returns bytes object containing the PDF, created via 'write' function.
@@ -19113,7 +19117,8 @@ def JM_convert_to_pdf(doc, fp, tp, rotate):
         e = fp      # ... range
     rot = JM_norm_rotation(rotate)
     i = fp
-    while 1:    # interpret & write document pages as PDF pages
+    internal_links = []  # collect PDF-wide internal links here
+    while 1:  # interpret & write document pages as PDF pages
         if not _INRANGE(i, s, e):
             break
         page = mupdf.fz_load_page(doc, i)
@@ -19124,11 +19129,49 @@ def JM_convert_to_pdf(doc, fp, tp, rotate):
         dev = None
         page_obj = mupdf.pdf_add_page(pdfout, mediabox, rot, resources, contents)
         mupdf.pdf_insert_page(pdfout, -1, page_obj)
+
+        # also copy links to the output PDF page
+        # get the PDF page we've just created
+        pdf_page = mupdf.pdf_load_page(pdfout, i)
+
+        # loop through source page links
+        link = mupdf.fz_load_links(page)  # load first link
+        while link.m_internal:  # break loop when link is None
+            uri = link.uri()  # URI string
+            rect = mupdf.FzRect(link.rect())  # link "from" rectangle
+            is_external = mupdf.fz_is_external_link(uri)
+
+            if is_external:  # external links can be copied directly
+                mupdf.pdf_create_link(pdf_page, rect, uri)
+            else:  # internal links done when PDF is complete
+                # find target of internal link
+                ret, xp, yp = mupdf.fz_resolve_link(doc, uri)
+                ilink={"page": i, "ret": ret, "from": rect, "h": rect.y1-rect.y0, "w": rect.x1-rect.x0, "xp": xp, "yp": yp}
+                internal_links.append(ilink)
+            link = link.next()
+
         i += incr
+
     # PDF created - now write it to Python bytearray
+    # insert any internal links collected before:
+    for ilink in internal_links:
+        pdf_page = mupdf.pdf_load_page(pdfout, ilink["page"])
+        ret = ilink["ret"]
+        dest = mupdf.fz_link_dest()
+        dest.type = 7  # XYZ destination format
+        dest.loc.chapter = ret.chapter
+        dest.loc.page = ret.page
+        dest.h = ilink["h"]
+        dest.w = ilink["w"]
+        dest.x = ilink["xp"]
+        dest.y = ilink["yp"]
+        dest.zoom = 0
+        rect=ilink["from"]
+        uri = mupdf.pdf_new_uri_from_explicit_dest(mupdf.FzLinkDest(dest))
+        mupdf.pdf_create_link(pdf_page, rect, uri)
     # prepare write options structure
     opts = mupdf.PdfWriteOptions()
-    opts.do_garbage         = 4
+    opts.do_garbage         = 3
     opts.do_compress        = 1
     opts.do_compress_images = 1
     opts.do_compress_fonts  = 1

diff --git a/tests/test_general.py b/tests/test_general.py
@@ -1444,8 +1444,8 @@ def check(filename=None, stream=None, filetype=None, exception=None):
     etype = pymupdf.FileDataError
     etype2 = 'FzErrorBase' if platform.system() == 'OpenBSD' else 'FzErrorUnsupported'
     etext = (
-            re.escape(f'mupdf.{etype2}: code=6: cannot find document handler for file: {path}'),
-            re.escape(f'pymupdf.FileDataError: Failed to open file {path!r}.'),
+            re.escape(f'mupdf.{etype2}: code=6: cannot find document handler for file type: \'pickle\''),
+            re.escape(f'pymupdf.FileDataError: Failed to open file {path!r} as type pickle.'),
             )
     check(path, exception=(etype, etext))
 
@@ -1518,7 +1518,7 @@ def dict_set_path(dict_, *items):
         results = dict()
 
         for path in paths:
-            print(path)
+            print(f'{path=}')
             for ext in extensions:
                 path2 = f'{root}/foo{ext}'
                 path3 = shutil.copy2(f'{root}/{path}', path2)
@@ -1548,7 +1548,7 @@ def dict_set_path(dict_, *items):
                     e = ee
                 wt = pymupdf.TOOLS.mupdf_warnings()
                 text = get_result(e, document)
-                print(f'    fz_open_document_with_stream(magic={ext!r}) => {text}')
+                print(f'    fz_open_document_with_stream() {path=} magic={ext!r}) => {text}')
                 dict_set_path(results, path, ext, 'stream', text)
 
     finally:
@@ -1591,16 +1591,31 @@ def dict_set_path(dict_, *items):
     with open(path_out, 'w') as f:
         json.dump(results, f, indent=4, sort_keys=1)
 
-    with open(os.path.normpath(f'{__file__}/../../tests/resources/test_open2_expected.json')) as f:
+    path_expected = os.path.normpath(f'{__file__}/../../tests/resources/test_open2_expected.json')
+    with open(path_expected) as f:
         results_expected = json.load(f)
     if results != results_expected:
         print(f'results != results_expected:')
-        def show(r, name):
-            text = json.dumps(r, indent=4, sort_keys=1)
-            print(f'{name}:')
-            print(textwrap.indent(text, '    '))
-        show(results_expected, 'results_expected')
-        show(results, 'results')
+        print(f'Expected: {path_expected}')
+        print(f'Actual: {path_out}')
+        if 0:
+            # Show entire json data. Very verbose.
+            def show(r, name):
+                text = json.dumps(r, indent=4, sort_keys=1)
+                print(f'{name}:')
+                print(textwrap.indent(text, '    '))
+            show(results_expected, 'results_expected')
+            show(results, 'results')
+        json_expected = json.dumps(results_expected, indent=4, sort_keys=1)
+        json_actual = json.dumps(results, indent=4, sort_keys=1)
+        import difflib
+        diff = difflib.unified_diff(
+                json_expected.split('\n'),
+                json_actual.split('\n'),
+                lineterm='',
+                )
+        print(f'Diff expected => actual:')
+        print(textwrap.indent('\n'.join(diff), '    '))
         assert 0