diff --git a/docs/pymupdf4llm/api.rst b/docs/pymupdf4llm/api.rst index afa88bede..8f6789203 100644 --- a/docs/pymupdf4llm/api.rst +++ b/docs/pymupdf4llm/api.rst @@ -279,6 +279,29 @@ The PyMuPDF4LLM API } } +.. method:: markdown_to_pdf(md_path: str | pathlib.Path, \ + user_css: str | None = None, \ + page_rect: rect-like | None = None, \ + margins: rect-like | None = None, \ + archive: str | pathlib.Path | None = None, \ + output_path: str | pathlib.Path | None = None) -> pymupdf.Document | None + + Convert the markdown text content of the file specified by `md_path` into a PDF document. + + The function is always available -- independently of whether you are using the PyMuPDF Layout module or not. + + :arg str|Path md_path: the file path of the markdown file to be converted. + + :arg str|None user_css: optional, a string of CSS code to be applied to the markdown content. This may be used to customize the appearance of the generated PDF document. If `None` (default), the built-in default CSS is used. + + :arg rect-like|None page_rect: optional, the rectangle defining the page boundaries for the generated PDF document. If `None` (default), ISO A4 page dimensions are used. To use one of PyMuPDF's predefined page formats, use e.g. ``pymupdf.paper_rect("Letter")``. + + :arg rect-like|None margins: optional, the margins (borders) for the generated pages. This must be a sequence of four floats ``[left, top, right, bottom]`` specifying the respective border width in points (1/72 inches). If `None` (default), the default ``[50, 50, 50, 50]`` margins are used. + + :arg str|Archive|None archive: optional. This is required if the markdown source references images that are **not** stored in the same folder as the markdown file. In this case, `archive` must be a `pymupdf.Archive` object which provides access to the respective image files. If `None` (default), it is assumed that all referenced images are stored in the same folder as the markdown file. 
The parameter **may** also be required if a custom ``user_css`` references external resources like font files. + + :arg str|Path|None output_path: optional, the file path where the generated PDF document will be saved. If specified, the generated PDF will be saved to that location. If `None` (default), the document is returned as a `pymupdf.Document` object. + .. note:: Please see `this site `_ for more background and the current status of further improvements regarding usage with :ref:`PyMuPDF Layout `. diff --git a/src/__init__.py b/src/__init__.py index c98164dc7..910862111 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -2897,7 +2897,7 @@ def __getitem__(self, i=0): raise IndexError(f"page {i} not in document") return self.load_page(i) - def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0, height=0, fontsize=11): + def __init__(self, filename=None, stream=None, filetype=None, archive=None, rect=None, width=0, height=0, fontsize=11): """Creates a document. Use 'open' as a synonym. Notes: @@ -2943,7 +2943,14 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0 self._name = filename self.stream = stream - + if isinstance(archive, pathlib.Path): + archive = Archive(archive.name) + elif isinstance(archive, str): + archive = Archive(archive) + elif not archive: + archive = Archive() + elif not isinstance(archive, Archive): + raise TypeError(f"bad archive: {type(archive)=}.") if stream is not None: if filename is not None and filetype is None: # 2025-05-06: Use as the filetype. 
This is @@ -2958,6 +2965,8 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0 stream = stream.getvalue() else: raise TypeError(f"bad stream: {type(stream)=}.") + + # this prevents bad things if original goes out of existence: self.stream = stream assert isinstance(stream, (bytes, memoryview)) @@ -2967,9 +2976,9 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0 # raise a specific exception. raise EmptyFileError('Cannot open empty stream.') - stream2 = mupdf.fz_open_memory(mupdf.python_buffer_data(stream), len(stream)) + fz_stream = mupdf.fz_open_memory(mupdf.python_buffer_data(stream), len(stream)) try: - doc = mupdf.fz_open_document_with_stream(filetype if filetype else '', stream2) + doc = mupdf.fz_open_document_with_stream_and_dir(filetype if filetype else '', fz_stream, archive.this) except Exception as e: if g_exceptions_verbose > 1: exception_info() raise FileDataError('Failed to open stream') from e @@ -2996,20 +3005,15 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0 raise EmptyFileError(f'Cannot open empty file: {filename=}.') if filetype: - # Override the type implied by . MuPDF does not - # have a way to do this directly so we open via a stream. 
- try: - fz_stream = mupdf.fz_open_file(filename) - doc = mupdf.fz_open_document_with_stream(filetype, fz_stream) - except Exception as e: - if g_exceptions_verbose > 1: exception_info() - raise FileDataError(f'Failed to open file {filename!r} as type {filetype!r}.') from e + suffix = filetype else: - try: - doc = mupdf.fz_open_document(filename) - except Exception as e: - if g_exceptions_verbose > 1: exception_info() - raise FileDataError(f'Failed to open file {filename!r}.') from e + suffix = pathlib.Path(filename).suffix.strip(".") + try: + fz_stream = mupdf.fz_open_file(filename) + doc = mupdf.fz_open_document_with_stream_and_dir(suffix, fz_stream, archive.this) + except Exception as e: + if g_exceptions_verbose > 1: exception_info() + raise FileDataError(f'Failed to open file {filename!r} as type {suffix}.') from e else: pdf = mupdf.PdfDocument() @@ -19098,7 +19102,7 @@ def JM_copy_rectangle(page, area): return s -def JM_convert_to_pdf(doc, fp, tp, rotate): +def JM_convert_to_pdf(doc, fp, tp, rotate) -> bytes: ''' Convert any MuPDF document to a PDF Returns bytes object containing the PDF, created via 'write' function. @@ -19113,7 +19117,8 @@ def JM_convert_to_pdf(doc, fp, tp, rotate): e = fp # ... 
range rot = JM_norm_rotation(rotate) i = fp - while 1: # interpret & write document pages as PDF pages + internal_links = [] # collect PDF-wide internal links here + while 1: # interpret & write document pages as PDF pages if not _INRANGE(i, s, e): break page = mupdf.fz_load_page(doc, i) @@ -19124,11 +19129,49 @@ def JM_convert_to_pdf(doc, fp, tp, rotate): dev = None page_obj = mupdf.pdf_add_page(pdfout, mediabox, rot, resources, contents) mupdf.pdf_insert_page(pdfout, -1, page_obj) + + # also copy links to the output PDF page + # get the PDF page we've just created + pdf_page = mupdf.pdf_load_page(pdfout, i) + + # loop through source page links + link = mupdf.fz_load_links(page) # load first link + while link.m_internal: # break loop when link is None + uri = link.uri() # URI string + rect = mupdf.FzRect(link.rect()) # link "from" rectangle + is_external = mupdf.fz_is_external_link(uri) + + if is_external: # external links can be copied directly + mupdf.pdf_create_link(pdf_page, rect, uri) + else: # internal links done when PDF is complete + # find target of internal link + ret, xp, yp = mupdf.fz_resolve_link(doc, uri) + ilink={"page": i, "ret": ret, "from": rect, "h": rect.y1-rect.y0, "w": rect.x1-rect.x0, "xp": xp, "yp": yp} + internal_links.append(ilink) + link = link.next() + i += incr + # PDF created - now write it to Python bytearray + # insert any internal links collected before: + for ilink in internal_links: + pdf_page = mupdf.pdf_load_page(pdfout, ilink["page"]) + ret = ilink["ret"] + dest = mupdf.fz_link_dest() + dest.type = 7 # XYZ destination format + dest.loc.chapter = ret.chapter + dest.loc.page = ret.page + dest.h = ilink["h"] + dest.w = ilink["w"] + dest.x = ilink["xp"] + dest.y = ilink["yp"] + dest.zoom = 0 + rect=ilink["from"] + uri = mupdf.pdf_new_uri_from_explicit_dest(mupdf.FzLinkDest(dest)) + mupdf.pdf_create_link(pdf_page, rect, uri) # prepare write options structure opts = mupdf.PdfWriteOptions() - opts.do_garbage = 4 + opts.do_garbage = 3 
opts.do_compress = 1 opts.do_compress_images = 1 opts.do_compress_fonts = 1 diff --git a/tests/test_general.py b/tests/test_general.py index 2f93b14d5..9a531faee 100644 --- a/tests/test_general.py +++ b/tests/test_general.py @@ -1444,8 +1444,8 @@ def check(filename=None, stream=None, filetype=None, exception=None): etype = pymupdf.FileDataError etype2 = 'FzErrorBase' if platform.system() == 'OpenBSD' else 'FzErrorUnsupported' etext = ( - re.escape(f'mupdf.{etype2}: code=6: cannot find document handler for file: {path}'), - re.escape(f'pymupdf.FileDataError: Failed to open file {path!r}.'), + re.escape(f'mupdf.{etype2}: code=6: cannot find document handler for file type: \'pickle\''), + re.escape(f'pymupdf.FileDataError: Failed to open file {path!r} as type pickle.'), ) check(path, exception=(etype, etext)) @@ -1518,7 +1518,7 @@ def dict_set_path(dict_, *items): results = dict() for path in paths: - print(path) + print(f'{path=}') for ext in extensions: path2 = f'{root}/foo{ext}' path3 = shutil.copy2(f'{root}/{path}', path2) @@ -1548,7 +1548,7 @@ def dict_set_path(dict_, *items): e = ee wt = pymupdf.TOOLS.mupdf_warnings() text = get_result(e, document) - print(f' fz_open_document_with_stream(magic={ext!r}) => {text}') + print(f' fz_open_document_with_stream() {path=} magic={ext!r}) => {text}') dict_set_path(results, path, ext, 'stream', text) finally: @@ -1591,16 +1591,31 @@ def dict_set_path(dict_, *items): with open(path_out, 'w') as f: json.dump(results, f, indent=4, sort_keys=1) - with open(os.path.normpath(f'{__file__}/../../tests/resources/test_open2_expected.json')) as f: + path_expected = os.path.normpath(f'{__file__}/../../tests/resources/test_open2_expected.json') + with open(path_expected) as f: results_expected = json.load(f) if results != results_expected: print(f'results != results_expected:') - def show(r, name): - text = json.dumps(r, indent=4, sort_keys=1) - print(f'{name}:') - print(textwrap.indent(text, ' ')) - show(results_expected, 
'results_expected') - show(results, 'results') + print(f'Expected: {path_expected}') + print(f'Actual: {path_out}') + if 0: + # Show entire json data. Very verbose. + def show(r, name): + text = json.dumps(r, indent=4, sort_keys=1) + print(f'{name}:') + print(textwrap.indent(text, ' ')) + show(results_expected, 'results_expected') + show(results, 'results') + json_expected = json.dumps(results_expected, indent=4, sort_keys=1) + json_actual = json.dumps(results, indent=4, sort_keys=1) + import difflib + diff = difflib.unified_diff( + json_expected.split('\n'), + json_actual.split('\n'), + lineterm='', + ) + print(f'Diff expected => actual:') + print(textwrap.indent('\n'.join(diff), ' ')) assert 0