From 328abc075d5799edb2ce742f9f718dc874124ec7 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Thu, 28 May 2026 06:48:14 -0700 Subject: [PATCH 1/3] geotiff: include GB allocation estimate in max_pixels error (#2553) The PixelSafetyLimitError raised by ``_check_dimensions`` and the VRT per-source resample-intermediate guard now report the byte cost of the request alongside the pixel count. Estimate uses 4 bytes/pixel (float32), matching the convention in MAX_PIXELS_DEFAULT's docstring. Existing assertions on "exceed the safety limit" are preserved. --- xrspatial/geotiff/_layout.py | 19 +++++++++++++++--- xrspatial/geotiff/_reader.py | 4 ++-- xrspatial/geotiff/_vrt.py | 9 ++++++--- xrspatial/geotiff/tests/test_security.py | 23 ++++++++++++++++++++++ xrspatial/geotiff/tests/vrt/test_window.py | 21 ++++++++++++++++++++ 5 files changed, 68 insertions(+), 8 deletions(-) diff --git a/xrspatial/geotiff/_layout.py b/xrspatial/geotiff/_layout.py index e0805391..15ebb2ff 100644 --- a/xrspatial/geotiff/_layout.py +++ b/xrspatial/geotiff/_layout.py @@ -32,6 +32,18 @@ #: Override per-call via the ``max_pixels`` keyword argument. MAX_PIXELS_DEFAULT = 1_000_000_000 +#: Bytes-per-pixel basis used for the GB hint in PixelSafetyLimitError. +#: float32 matches the constant docstring above and the dtype the +#: decoder produces for the common single-band float case. The hint is +#: approximate; the actual decoded dtype may differ. +_PIXEL_HINT_BYTES = 4 + + +def _gb_hint(pixels): + """Return a ``~X.XX GB at N bytes/pixel`` string for ``pixels``.""" + gb = (pixels * _PIXEL_HINT_BYTES) / (1024 ** 3) + return f"~{gb:.2f} GB at {_PIXEL_HINT_BYTES} bytes/pixel" + class PixelSafetyLimitError(ValueError): """Raised when a requested TIFF allocation exceeds max_pixels.""" @@ -43,9 +55,10 @@ def _check_dimensions(width, height, samples, max_pixels): if total > max_pixels: raise PixelSafetyLimitError( f"TIFF image dimensions ({width} x {height} x {samples} = " - f"{total:,} pixels) exceed the safety limit of " - f"{max_pixels:,} pixels. Pass a larger max_pixels value to " - f"read_to_array() if this file is legitimate." + f"{total:,} pixels, {_gb_hint(total)}) exceed the safety " + f"limit of {max_pixels:,} pixels ({_gb_hint(max_pixels)}). " + f"Pass a larger max_pixels value to read_to_array() if this " + f"file is legitimate." ) diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py index 23de809c..b2036cba 100644 --- a/xrspatial/geotiff/_reader.py +++ b/xrspatial/geotiff/_reader.py @@ -67,8 +67,8 @@ # preserved without churn. from ._layout import (_FULL_IMAGE_BUDGET_HEADER_SLACK, MAX_PIXELS_DEFAULT, # noqa: F401 PixelSafetyLimitError, _check_dimensions, _check_source_dimensions, - _compute_full_image_byte_budget, _has_sparse, _ifd_required_extent, - _sparse_fill_value) + _compute_full_image_byte_budget, _gb_hint, _has_sparse, + _ifd_required_extent, _sparse_fill_value) # The data-source layer (local mmap, HTTP with SSRF defences and DNS-rebind # pinning, fsspec cloud, BytesIO) lives in ``_sources``. It is imported back # here so that: diff --git a/xrspatial/geotiff/_vrt.py b/xrspatial/geotiff/_vrt.py index f7286066..62b9238a 100644 --- a/xrspatial/geotiff/_vrt.py +++ b/xrspatial/geotiff/_vrt.py @@ -1533,13 +1533,16 @@ def read_vrt(vrt_path: str, *, window=None, if (sub_dst_w > max_pixels or sub_dst_h > max_pixels or sub_dst_w * sub_dst_h > max_pixels): + from ._layout import _gb_hint + sub_total = sub_dst_w * sub_dst_h raise ValueError( f"VRT SimpleSource DstRect " f"(xSize={dr.x_size}, ySize={dr.y_size}) requires " f"a resample intermediate of " - f"{sub_dst_w * sub_dst_h:,} pixels for the " - f"requested window, which exceeds the safety " - f"limit of {max_pixels:,} pixels. Pass a larger " + f"{sub_total:,} pixels ({_gb_hint(sub_total)}) " + f"for the requested window, which exceeds the " + f"safety limit of {max_pixels:,} pixels " + f"({_gb_hint(max_pixels)}). Pass a larger " f"max_pixels= to read_vrt() if this file is " f"legitimate." ) diff --git a/xrspatial/geotiff/tests/test_security.py b/xrspatial/geotiff/tests/test_security.py index d1b2d253..6cd3f185 100644 --- a/xrspatial/geotiff/tests/test_security.py +++ b/xrspatial/geotiff/tests/test_security.py @@ -51,6 +51,29 @@ def test_custom_limit(self): # Relaxed: passes with large limit _check_dimensions(100_000, 100_000, 1, max_pixels=100_000_000_000) + def test_error_message_includes_gb_estimate(self): + """Error message reports both pixels and a GB allocation hint.""" + # 50000 x 50000 x 1 = 2.5e9 pixels = ~9.31 GB at 4 bytes/pixel + # MAX_PIXELS_DEFAULT = 1e9 pixels = ~3.73 GB at 4 bytes/pixel + with pytest.raises(ValueError) as exc: + _check_dimensions(50_000, 50_000, 1, MAX_PIXELS_DEFAULT) + msg = str(exc.value) + # Pixel count still present (preserves existing assertions). + assert "2,500,000,000 pixels" in msg + assert "1,000,000,000 pixels" in msg + # GB hint added for both the requested and the limit allocations. + assert "~9.31 GB at 4 bytes/pixel" in msg + assert "~3.73 GB at 4 bytes/pixel" in msg + + def test_gb_hint_helper_rounds_to_two_decimals(self): + """_gb_hint formats bytes/pixel * count as a ~X.XX GB string.""" + from xrspatial.geotiff._layout import _gb_hint + # 1 GiB worth of pixels at 4 bytes each -> 4.00 GB hint. + pixels = (1024 ** 3) # 1 GiB pixel count + assert _gb_hint(pixels) == "~4.00 GB at 4 bytes/pixel" + # Zero pixels still formats sensibly. + assert _gb_hint(0) == "~0.00 GB at 4 bytes/pixel" + def test_read_strips_rejects_huge_header(self): """_read_strips refuses to allocate when header claims huge dims.""" # Build a valid TIFF with small pixel data but huge header dimensions. diff --git a/xrspatial/geotiff/tests/vrt/test_window.py b/xrspatial/geotiff/tests/vrt/test_window.py index f346a320..dd4a62b0 100644 --- a/xrspatial/geotiff/tests/vrt/test_window.py +++ b/xrspatial/geotiff/tests/vrt/test_window.py @@ -437,6 +437,27 @@ def test_per_source_cap_inclusive_boundary(): assert arr.shape == (100, 100) +def test_per_source_cap_error_includes_gb_hint(): + """Both the per-source resample-intermediate guard and the output + dimension guard now mention a GB allocation hint. Whichever fires + first for this VRT, the error string must include the hint so + callers see the byte cost without doing the multiplication.""" + with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td: + _dstrect_cap_write_source(td) + vrt_path = _dstrect_cap_write_vrt( + td, + dst_x_size=2000, + dst_y_size=2000, + raster_x=2000, + raster_y=2000, + ) + with pytest.raises(ValueError) as exc: + _dstrect_cap_read_vrt_internal(vrt_path, max_pixels=1_000_000) + msg = str(exc.value) + assert 'GB at 4 bytes/pixel' in msg + assert 'pixels' in msg + + def test_negative_dstrect_rejected(): """Negative ``xSize`` / ``ySize`` must surface as ``ValueError`` rather than be silently skipped by the overlap check. The error From d1596840e32804960f051f3495c946d402817ae0 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Thu, 28 May 2026 06:58:52 -0700 Subject: [PATCH 2/3] geotiff: dtype-aware GB hint + decimal GB units (#2553) Addresses the /review-pr feedback on the initial commit: * Switch _gb_hint to decimal GB (10**9) so the printed number matches the "~4 GB" claim in MAX_PIXELS_DEFAULT's docstring (and what df / du / S3 report). * Plumb the decoded dtype through _check_dimensions and _gb_hint so the byte estimate uses dtype.itemsize. The hint now scales: f32 -> 4 bytes/pixel, f64 -> 8, u8 -> 1, etc. Real call sites in _decode.py and _vrt.py pass the dtype; dtype= remains optional with the float32 fallback for callers without a resolved dtype. * Add monkeypatched test isolating the VRT per-source error path, closing the nit about that string being covered only via the dimension guard's overlap. * New tests assert dtype scaling at the helper, layout-guard, and VRT-source-guard levels. --- xrspatial/geotiff/_decode.py | 6 +-- xrspatial/geotiff/_layout.py | 58 +++++++++++++++------- xrspatial/geotiff/_vrt.py | 20 +++++--- xrspatial/geotiff/tests/test_security.py | 46 ++++++++++++++--- xrspatial/geotiff/tests/vrt/test_window.py | 29 ++++++++--- 5 files changed, 116 insertions(+), 43 deletions(-) diff --git a/xrspatial/geotiff/_decode.py b/xrspatial/geotiff/_decode.py index 9a8c89e0..f7b7d1e0 100644 --- a/xrspatial/geotiff/_decode.py +++ b/xrspatial/geotiff/_decode.py @@ -383,7 +383,7 @@ def _read_strips(data: bytes, ifd: IFD, header: TIFFHeader, out_h = r1 - r0 out_w = c1 - c0 - _check_dimensions(out_w, out_h, samples, max_pixels) + _check_dimensions(out_w, out_h, samples, max_pixels, dtype=dtype) # StripByteCounts must have at least one entry per strip; a corrupt count # field can shrink it. Detect the mismatch after the dimension safety @@ -574,7 +574,7 @@ def _read_tiles(data: bytes, ifd: IFD, header: TIFFHeader, # what enforces the user's per-call memory cap. The source-read path # under ``read_vrt`` relies on that output check to honour a small # caller ``max_pixels`` against a normal-tile source. - _check_dimensions(tw, th, samples, MAX_PIXELS_DEFAULT) + _check_dimensions(tw, th, samples, MAX_PIXELS_DEFAULT, dtype=dtype) # Per-tile compressed-byte cap. Same env var as the # HTTP path. mmap slicing is bounded by the file size, but the slice @@ -608,7 +608,7 @@ def _read_tiles(data: bytes, ifd: IFD, header: TIFFHeader, out_h = r1 - r0 out_w = c1 - c0 - _check_dimensions(out_w, out_h, samples, max_pixels) + _check_dimensions(out_w, out_h, samples, max_pixels, dtype=dtype) # Reject malformed TIFFs whose declared tile grid exceeds the number of # supplied TileOffsets entries. Silent skipping in the CPU loop below diff --git a/xrspatial/geotiff/_layout.py b/xrspatial/geotiff/_layout.py index 15ebb2ff..107f5d20 100644 --- a/xrspatial/geotiff/_layout.py +++ b/xrspatial/geotiff/_layout.py @@ -32,33 +32,57 @@ #: Override per-call via the ``max_pixels`` keyword argument. MAX_PIXELS_DEFAULT = 1_000_000_000 -#: Bytes-per-pixel basis used for the GB hint in PixelSafetyLimitError. -#: float32 matches the constant docstring above and the dtype the -#: decoder produces for the common single-band float case. The hint is -#: approximate; the actual decoded dtype may differ. -_PIXEL_HINT_BYTES = 4 - - -def _gb_hint(pixels): - """Return a ``~X.XX GB at N bytes/pixel`` string for ``pixels``.""" - gb = (pixels * _PIXEL_HINT_BYTES) / (1024 ** 3) - return f"~{gb:.2f} GB at {_PIXEL_HINT_BYTES} bytes/pixel" +#: Fallback bytes-per-pixel used by ``_gb_hint`` when the caller has +#: not resolved the decoded dtype yet. float32 matches the constant +#: docstring above and the dtype the decoder produces for the common +#: single-band float case. Real call sites pass the actual dtype so +#: the GB hint reflects the allocation that would have happened. +_DEFAULT_HINT_BYTES = 4 + + +def _gb_hint(pixels, dtype=None): + """Return a ``~X.XX GB at N bytes/pixel`` string for ``pixels``. + + ``dtype`` is the numpy dtype of the would-be allocation. When + provided, the hint uses ``dtype.itemsize`` so f64 reports 8 + bytes/pixel, u8 reports 1, etc. When omitted, the hint falls + back to ``_DEFAULT_HINT_BYTES`` (float32) for older call sites + that have not threaded the dtype through yet. + + Uses decimal GB (10**9 bytes) so the printed number matches the + ``~4 GB`` figure quoted in the :data:`MAX_PIXELS_DEFAULT` + docstring above and the convention reported by ``df`` / ``du`` / + most cloud object stores. + """ + if dtype is None: + bpp = _DEFAULT_HINT_BYTES + else: + bpp = int(np.dtype(dtype).itemsize) + gb = (pixels * bpp) / 1_000_000_000 + return f"~{gb:.2f} GB at {bpp} bytes/pixel" class PixelSafetyLimitError(ValueError): """Raised when a requested TIFF allocation exceeds max_pixels.""" -def _check_dimensions(width, height, samples, max_pixels): - """Raise PixelSafetyLimitError if the request exceeds *max_pixels*.""" +def _check_dimensions(width, height, samples, max_pixels, dtype=None): + """Raise PixelSafetyLimitError if the request exceeds *max_pixels*. + + ``dtype`` is forwarded to :func:`_gb_hint` so the byte-allocation + estimate in the error message reflects the actual dtype that + would have been allocated. Optional for callers that do not have + the dtype resolved yet (the hint then falls back to float32). + """ total = width * height * samples if total > max_pixels: raise PixelSafetyLimitError( f"TIFF image dimensions ({width} x {height} x {samples} = " - f"{total:,} pixels, {_gb_hint(total)}) exceed the safety " - f"limit of {max_pixels:,} pixels ({_gb_hint(max_pixels)}). " - f"Pass a larger max_pixels value to read_to_array() if this " - f"file is legitimate." + f"{total:,} pixels, {_gb_hint(total, dtype)}) exceed the " + f"safety limit of {max_pixels:,} pixels " + f"({_gb_hint(max_pixels, dtype)}). Pass a larger " + f"max_pixels value to read_to_array() if this file is " + f"legitimate." ) diff --git a/xrspatial/geotiff/_vrt.py b/xrspatial/geotiff/_vrt.py index 62b9238a..dce88e70 100644 --- a/xrspatial/geotiff/_vrt.py +++ b/xrspatial/geotiff/_vrt.py @@ -1371,14 +1371,18 @@ def read_vrt(vrt_path: str, *, window=None, from ._reader import MAX_PIXELS_DEFAULT, _check_dimensions if max_pixels is None: max_pixels = MAX_PIXELS_DEFAULT - n_bands = len([vrt.bands[band]] if band is not None else vrt.bands) - _check_dimensions(out_w, out_h, n_bands, max_pixels) - # Select bands + # Select bands first so the dtype is known for the GB hint in the + # safety-limit error. if band is not None: selected_bands = [vrt.bands[band]] else: selected_bands = vrt.bands + n_bands = len(selected_bands) + # All selected bands typically share a dtype; if they differ the + # hint uses the first band's dtype as an approximation. + _check_dimensions(out_w, out_h, n_bands, max_pixels, + dtype=selected_bands[0].dtype) # Allocate output. # @@ -1535,16 +1539,18 @@ def read_vrt(vrt_path: str, *, window=None, or sub_dst_w * sub_dst_h > max_pixels): from ._layout import _gb_hint sub_total = sub_dst_w * sub_dst_h + band_dtype = vrt_band.dtype raise ValueError( f"VRT SimpleSource DstRect " f"(xSize={dr.x_size}, ySize={dr.y_size}) requires " f"a resample intermediate of " - f"{sub_total:,} pixels ({_gb_hint(sub_total)}) " + f"{sub_total:,} pixels " + f"({_gb_hint(sub_total, band_dtype)}) " f"for the requested window, which exceeds the " f"safety limit of {max_pixels:,} pixels " - f"({_gb_hint(max_pixels)}). Pass a larger " - f"max_pixels= to read_vrt() if this file is " - f"legitimate." + f"({_gb_hint(max_pixels, band_dtype)}). Pass a " + f"larger max_pixels= to read_vrt() if this file " + f"is legitimate." ) else: read_r0 = sr.y_off + (clip_r0 - dst_r0) diff --git a/xrspatial/geotiff/tests/test_security.py b/xrspatial/geotiff/tests/test_security.py index 6cd3f185..8d500366 100644 --- a/xrspatial/geotiff/tests/test_security.py +++ b/xrspatial/geotiff/tests/test_security.py @@ -52,9 +52,13 @@ def test_custom_limit(self): _check_dimensions(100_000, 100_000, 1, max_pixels=100_000_000_000) def test_error_message_includes_gb_estimate(self): - """Error message reports both pixels and a GB allocation hint.""" - # 50000 x 50000 x 1 = 2.5e9 pixels = ~9.31 GB at 4 bytes/pixel - # MAX_PIXELS_DEFAULT = 1e9 pixels = ~3.73 GB at 4 bytes/pixel + """Error message reports both pixels and a GB allocation hint. + + With no dtype passed the hint falls back to float32 + (4 bytes/pixel). Real call sites pass the actual dtype. + """ + # 50000 x 50000 x 1 = 2.5e9 pixels = 10.00 GB at 4 bytes/pixel + # MAX_PIXELS_DEFAULT = 1e9 pixels = 4.00 GB at 4 bytes/pixel with pytest.raises(ValueError) as exc: _check_dimensions(50_000, 50_000, 1, MAX_PIXELS_DEFAULT) msg = str(exc.value) @@ -62,17 +66,43 @@ def test_error_message_includes_gb_estimate(self): assert "2,500,000,000 pixels" in msg assert "1,000,000,000 pixels" in msg # GB hint added for both the requested and the limit allocations. - assert "~9.31 GB at 4 bytes/pixel" in msg - assert "~3.73 GB at 4 bytes/pixel" in msg + # Uses decimal GB (10**9 bytes) to match the docstring convention. + assert "~10.00 GB at 4 bytes/pixel" in msg + assert "~4.00 GB at 4 bytes/pixel" in msg + + def test_error_message_uses_passed_dtype_for_gb_hint(self): + """When the caller passes the decoded dtype, the GB hint reports + the exact byte width: f64 → 8 bytes/pixel (double the float32 + default), u8 → 1 byte/pixel (a quarter).""" + # float64: 1e9 pixels * 8 bytes = 8.00 GB + with pytest.raises(ValueError) as exc: + _check_dimensions(50_000, 50_000, 1, MAX_PIXELS_DEFAULT, + dtype=np.float64) + assert "20.00 GB at 8 bytes/pixel" in str(exc.value) + assert "8.00 GB at 8 bytes/pixel" in str(exc.value) + + # uint8: 1e9 pixels * 1 byte = 1.00 GB + with pytest.raises(ValueError) as exc: + _check_dimensions(50_000, 50_000, 1, MAX_PIXELS_DEFAULT, + dtype=np.uint8) + assert "2.50 GB at 1 bytes/pixel" in str(exc.value) + assert "1.00 GB at 1 bytes/pixel" in str(exc.value) def test_gb_hint_helper_rounds_to_two_decimals(self): """_gb_hint formats bytes/pixel * count as a ~X.XX GB string.""" from xrspatial.geotiff._layout import _gb_hint - # 1 GiB worth of pixels at 4 bytes each -> 4.00 GB hint. - pixels = (1024 ** 3) # 1 GiB pixel count - assert _gb_hint(pixels) == "~4.00 GB at 4 bytes/pixel" + # No dtype: 1 billion pixels * 4 bytes (float32 default) -> + # 4.00 GB hint, matching MAX_PIXELS_DEFAULT's docstring. + assert _gb_hint(1_000_000_000) == "~4.00 GB at 4 bytes/pixel" # Zero pixels still formats sensibly. assert _gb_hint(0) == "~0.00 GB at 4 bytes/pixel" + # With dtype: itemsize drives the byte multiplier. + assert _gb_hint(1_000_000_000, dtype=np.float64) == \ + "~8.00 GB at 8 bytes/pixel" + assert _gb_hint(1_000_000_000, dtype=np.uint8) == \ + "~1.00 GB at 1 bytes/pixel" + assert _gb_hint(1_000_000_000, dtype=np.int16) == \ + "~2.00 GB at 2 bytes/pixel" def test_read_strips_rejects_huge_header(self): """_read_strips refuses to allocate when header claims huge dims.""" diff --git a/xrspatial/geotiff/tests/vrt/test_window.py b/xrspatial/geotiff/tests/vrt/test_window.py index dd4a62b0..47f2a116 100644 --- a/xrspatial/geotiff/tests/vrt/test_window.py +++ b/xrspatial/geotiff/tests/vrt/test_window.py @@ -437,11 +437,22 @@ def test_per_source_cap_inclusive_boundary(): assert arr.shape == (100, 100) -def test_per_source_cap_error_includes_gb_hint(): - """Both the per-source resample-intermediate guard and the output - dimension guard now mention a GB allocation hint. Whichever fires - first for this VRT, the error string must include the hint so - callers see the byte cost without doing the multiplication.""" +def test_per_source_cap_error_includes_gb_hint(monkeypatch): + """The per-source resample-intermediate guard reports the GB + allocation hint alongside the pixel count. + + The output-dimension guard fires before the per-source guard at + the same ``max_pixels`` threshold, so to verify the per-source + error string in isolation we no-op the dimension guard for the + duration of the read. That keeps the test honest about which + branch's format string is being asserted. + """ + # ``read_vrt`` imports ``_check_dimensions`` from ``_reader`` at + # call time, so patching the reader module's binding takes effect + # on the next call. + from xrspatial.geotiff import _reader as reader_mod + monkeypatch.setattr(reader_mod, '_check_dimensions', + lambda *args, **kwargs: None) with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td: _dstrect_cap_write_source(td) vrt_path = _dstrect_cap_write_vrt( @@ -451,11 +462,13 @@ def test_per_source_cap_error_includes_gb_hint(): raster_x=2000, raster_y=2000, ) - with pytest.raises(ValueError) as exc: + with pytest.raises(ValueError, match='resample intermediate') as exc: _dstrect_cap_read_vrt_internal(vrt_path, max_pixels=1_000_000) msg = str(exc.value) - assert 'GB at 4 bytes/pixel' in msg - assert 'pixels' in msg + # Sub-window is 2000x2000 = 4_000_000 pixels. The VRT band's + # dataType is "Byte" (uint8), so the GB hint uses 1 byte/pixel. + assert '4,000,000 pixels' in msg + assert 'GB at 1 bytes/pixel' in msg def test_negative_dstrect_rejected(): From 86950835b9c3f8bca678b14f4ab647058f61f76d Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Thu, 28 May 2026 07:14:04 -0700 Subject: [PATCH 3/3] geotiff: suggest window= and chunks= when max_pixels gate fires (#2553) The PixelSafetyLimitError message now ends with a 'To read this file, try one of:' list with three recovery options: * Pass a larger max_pixels= if the allocation is acceptable. * Read a sub-region with window=(r0, c0, r1, c1), e.g. window=(0, 0, 1024, 1024). * Read lazily in chunks (dask-aware): - dask installed -> chunks=N where N is sized to fit max_pixels. - dask missing -> pip / conda install hint plus the same chunks= value to use once installed. The chunk-side suggestion is the largest square chunk whose pixel count fits under max_pixels for the band count, capped at 1024 to stay near common COG tile sizes (256 / 512 / 1024). For the default 1e9 / 1-band case that is exactly 1024. Detection uses ``importlib.util.find_spec`` so dask is not imported just to format an error. --- xrspatial/geotiff/_layout.py | 46 +++++++++++++++++-- xrspatial/geotiff/tests/test_security.py | 57 ++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 3 deletions(-) diff --git a/xrspatial/geotiff/_layout.py b/xrspatial/geotiff/_layout.py index 107f5d20..724c8cbc 100644 --- a/xrspatial/geotiff/_layout.py +++ b/xrspatial/geotiff/_layout.py @@ -14,6 +14,7 @@ """ from __future__ import annotations +import importlib.util import math import numpy as np @@ -62,6 +63,46 @@ def _gb_hint(pixels, dtype=None): return f"~{gb:.2f} GB at {bpp} bytes/pixel" +def _suggest_chunk_side(max_pixels, samples): + """Pick a square chunksize whose pixel count fits comfortably under + ``max_pixels`` for the given band count. Capped at 1024 so the hint + stays near common COG tile sizes (256 / 512 / 1024).""" + samples = max(1, int(samples)) + if max_pixels <= 0: + return 1 + side = int(math.isqrt(max_pixels // samples)) + return max(1, min(1024, side)) + + +def _recovery_hint(max_pixels, samples): + """Build the multi-line recovery hint appended to PixelSafetyLimitError. + + Always suggests ``window=`` for reading a sub-region. Suggests + ``chunks=`` when dask is installed; otherwise recommends installing + dask so chunked reads become available. ``importlib.util.find_spec`` + avoids importing dask just to format an error. + """ + chunk = _suggest_chunk_side(max_pixels, samples) + lines = [ + "To read this file, try one of:", + " * Pass a larger max_pixels= if the allocation is acceptable.", + f" * Read a sub-region with window=(r0, c0, r1, c1), " + f"e.g. window=(0, 0, {chunk}, {chunk}).", + ] + if importlib.util.find_spec('dask') is not None: + lines.append( + f" * Read lazily in chunks with chunks={chunk} so each " + f"decoded buffer stays under max_pixels." + ) + else: + lines.append( + " * Install dask (`pip install dask` or " + "`conda install -c conda-forge dask`) to read the file " + f"lazily via chunks={chunk}." + ) + return "\n".join(lines) + + class PixelSafetyLimitError(ValueError): """Raised when a requested TIFF allocation exceeds max_pixels.""" @@ -80,9 +121,8 @@ def _check_dimensions(width, height, samples, max_pixels, dtype=None): f"TIFF image dimensions ({width} x {height} x {samples} = " f"{total:,} pixels, {_gb_hint(total, dtype)}) exceed the " f"safety limit of {max_pixels:,} pixels " - f"({_gb_hint(max_pixels, dtype)}). Pass a larger " - f"max_pixels value to read_to_array() if this file is " - f"legitimate." + f"({_gb_hint(max_pixels, dtype)}).\n" + f"{_recovery_hint(max_pixels, samples)}" ) diff --git a/xrspatial/geotiff/tests/test_security.py b/xrspatial/geotiff/tests/test_security.py index 8d500366..31245922 100644 --- a/xrspatial/geotiff/tests/test_security.py +++ b/xrspatial/geotiff/tests/test_security.py @@ -88,6 +88,63 @@ def test_error_message_uses_passed_dtype_for_gb_hint(self): assert "2.50 GB at 1 bytes/pixel" in str(exc.value) assert "1.00 GB at 1 bytes/pixel" in str(exc.value) + def test_error_message_includes_window_suggestion(self): + """Error message suggests window= for reading a sub-region.""" + with pytest.raises(ValueError) as exc: + _check_dimensions(50_000, 50_000, 1, MAX_PIXELS_DEFAULT) + msg = str(exc.value) + assert "window=(r0, c0, r1, c1)" in msg + # Concrete example uses the same chunk-side as the chunks hint. + assert "window=(0, 0, 1024, 1024)" in msg + + def test_error_message_suggests_chunks_when_dask_installed(self, + monkeypatch): + """When dask is importable, suggest chunks=.""" + # Force the 'dask installed' branch regardless of test env. + monkeypatch.setattr( + 'xrspatial.geotiff._layout.importlib.util.find_spec', + lambda name: object() if name == 'dask' else None, + ) + with pytest.raises(ValueError) as exc: + _check_dimensions(50_000, 50_000, 1, MAX_PIXELS_DEFAULT) + msg = str(exc.value) + assert "chunks=1024" in msg + assert "lazily" in msg + # Should not recommend installing dask. + assert "pip install dask" not in msg + assert "conda install" not in msg + + def test_error_message_recommends_install_when_dask_missing(self, + monkeypatch): + """When dask is not importable, recommend installation.""" + monkeypatch.setattr( + 'xrspatial.geotiff._layout.importlib.util.find_spec', + lambda name: None, + ) + with pytest.raises(ValueError) as exc: + _check_dimensions(50_000, 50_000, 1, MAX_PIXELS_DEFAULT) + msg = str(exc.value) + assert "pip install dask" in msg + assert "conda install -c conda-forge dask" in msg + # The chunks= number is still surfaced so the user knows what to + # pass once dask is installed. + assert "chunks=1024" in msg + + def test_suggested_chunk_side_scales_with_max_pixels(self): + """The suggested chunk side fits under max_pixels for the given + band count and never exceeds 1024.""" + from xrspatial.geotiff._layout import _suggest_chunk_side + # Default budget: capped at 1024. + assert _suggest_chunk_side(1_000_000_000, 1) == 1024 + # Tight budget: 100 pixels, 1 band -> side 10. + assert _suggest_chunk_side(100, 1) == 10 + # Multi-band reduces the per-side budget. + # 1_000_000 budget, 4 bands -> sqrt(250_000) = 500. + assert _suggest_chunk_side(1_000_000, 4) == 500 + # Pathological inputs do not crash. + assert _suggest_chunk_side(0, 1) == 1 + assert _suggest_chunk_side(10, 0) == 3 + def test_gb_hint_helper_rounds_to_two_decimals(self): """_gb_hint formats bytes/pixel * count as a ~X.XX GB string.""" from xrspatial.geotiff._layout import _gb_hint