Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions xrspatial/geotiff/_decode.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,7 +383,7 @@ def _read_strips(data: bytes, ifd: IFD, header: TIFFHeader,
out_h = r1 - r0
out_w = c1 - c0

_check_dimensions(out_w, out_h, samples, max_pixels)
_check_dimensions(out_w, out_h, samples, max_pixels, dtype=dtype)

# StripByteCounts must have at least one entry per strip; a corrupt count
# field can shrink it. Detect the mismatch after the dimension safety
Expand Down Expand Up @@ -574,7 +574,7 @@ def _read_tiles(data: bytes, ifd: IFD, header: TIFFHeader,
# what enforces the user's per-call memory cap. The source-read path
# under ``read_vrt`` relies on that output check to honour a small
# caller ``max_pixels`` against a normal-tile source.
_check_dimensions(tw, th, samples, MAX_PIXELS_DEFAULT)
_check_dimensions(tw, th, samples, MAX_PIXELS_DEFAULT, dtype=dtype)

# Per-tile compressed-byte cap. Same env var as the
# HTTP path. mmap slicing is bounded by the file size, but the slice
Expand Down Expand Up @@ -608,7 +608,7 @@ def _read_tiles(data: bytes, ifd: IFD, header: TIFFHeader,
out_h = r1 - r0
out_w = c1 - c0

_check_dimensions(out_w, out_h, samples, max_pixels)
_check_dimensions(out_w, out_h, samples, max_pixels, dtype=dtype)

# Reject malformed TIFFs whose declared tile grid exceeds the number of
# supplied TileOffsets entries. Silent skipping in the CPU loop below
Expand Down
87 changes: 82 additions & 5 deletions xrspatial/geotiff/_layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
"""
from __future__ import annotations

import importlib.util
import math

import numpy as np
Expand All @@ -32,20 +33,96 @@
#: Override per-call via the ``max_pixels`` keyword argument.
MAX_PIXELS_DEFAULT = 1_000_000_000

#: Fallback bytes-per-pixel used by ``_gb_hint`` when the caller has
#: not resolved the decoded dtype yet. float32 matches the constant
#: docstring above and the dtype the decoder produces for the common
#: single-band float case. Real call sites pass the actual dtype so
#: the GB hint reflects the allocation that would have happened.
_DEFAULT_HINT_BYTES = 4


def _gb_hint(pixels, dtype=None):
"""Return a ``~X.XX GB at N bytes/pixel`` string for ``pixels``.

``dtype`` is the numpy dtype of the would-be allocation. When
provided, the hint uses ``dtype.itemsize`` so f64 reports 8
bytes/pixel, u8 reports 1, etc. When omitted, the hint falls
back to ``_DEFAULT_HINT_BYTES`` (float32) for older call sites
that have not threaded the dtype through yet.

Uses decimal GB (10**9 bytes) so the printed number matches the
``~4 GB`` figure quoted in the :data:`MAX_PIXELS_DEFAULT`
docstring above and the convention reported by ``df`` / ``du`` /
most cloud object stores.
"""
if dtype is None:
bpp = _DEFAULT_HINT_BYTES
else:
bpp = int(np.dtype(dtype).itemsize)
gb = (pixels * bpp) / 1_000_000_000
return f"~{gb:.2f} GB at {bpp} bytes/pixel"


def _suggest_chunk_side(max_pixels, samples):
"""Pick a square chunksize whose pixel count fits comfortably under
``max_pixels`` for the given band count. Capped at 1024 so the hint
stays near common COG tile sizes (256 / 512 / 1024)."""
samples = max(1, int(samples))
if max_pixels <= 0:
return 1
side = int(math.isqrt(max_pixels // samples))
return max(1, min(1024, side))


def _recovery_hint(max_pixels, samples):
"""Build the multi-line recovery hint appended to PixelSafetyLimitError.

Always suggests ``window=`` for reading a sub-region. Suggests
``chunks=`` when dask is installed; otherwise recommends installing
dask so chunked reads become available. ``importlib.util.find_spec``
avoids importing dask just to format an error.
"""
chunk = _suggest_chunk_side(max_pixels, samples)
lines = [
"To read this file, try one of:",
" * Pass a larger max_pixels= if the allocation is acceptable.",
f" * Read a sub-region with window=(r0, c0, r1, c1), "
f"e.g. window=(0, 0, {chunk}, {chunk}).",
]
if importlib.util.find_spec('dask') is not None:
lines.append(
f" * Read lazily in chunks with chunks={chunk} so each "
f"decoded buffer stays under max_pixels."
)
else:
lines.append(
" * Install dask (`pip install dask` or "
"`conda install -c conda-forge dask`) to read the file "
f"lazily via chunks={chunk}."
)
return "\n".join(lines)


class PixelSafetyLimitError(ValueError):
"""Raised when a requested TIFF allocation exceeds max_pixels."""


def _check_dimensions(width, height, samples, max_pixels):
"""Raise PixelSafetyLimitError if the request exceeds *max_pixels*."""
def _check_dimensions(width, height, samples, max_pixels, dtype=None):
"""Raise PixelSafetyLimitError if the request exceeds *max_pixels*.

``dtype`` is forwarded to :func:`_gb_hint` so the byte-allocation
estimate in the error message reflects the actual dtype that
would have been allocated. Optional for callers that do not have
the dtype resolved yet (the hint then falls back to float32).
"""
total = width * height * samples
if total > max_pixels:
raise PixelSafetyLimitError(
f"TIFF image dimensions ({width} x {height} x {samples} = "
f"{total:,} pixels) exceed the safety limit of "
f"{max_pixels:,} pixels. Pass a larger max_pixels value to "
f"read_to_array() if this file is legitimate."
f"{total:,} pixels, {_gb_hint(total, dtype)}) exceed the "
f"safety limit of {max_pixels:,} pixels "
f"({_gb_hint(max_pixels, dtype)}).\n"
f"{_recovery_hint(max_pixels, samples)}"
)


Expand Down
4 changes: 2 additions & 2 deletions xrspatial/geotiff/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,8 @@
# preserved without churn.
from ._layout import (_FULL_IMAGE_BUDGET_HEADER_SLACK, MAX_PIXELS_DEFAULT, # noqa: F401
PixelSafetyLimitError, _check_dimensions, _check_source_dimensions,
_compute_full_image_byte_budget, _has_sparse, _ifd_required_extent,
_sparse_fill_value)
_compute_full_image_byte_budget, _gb_hint, _has_sparse,
_ifd_required_extent, _sparse_fill_value)
# The data-source layer (local mmap, HTTP with SSRF defences and DNS-rebind
# pinning, fsspec cloud, BytesIO) lives in ``_sources``. It is imported back
# here so that:
Expand Down
25 changes: 17 additions & 8 deletions xrspatial/geotiff/_vrt.py
Original file line number Diff line number Diff line change
Expand Up @@ -1371,14 +1371,18 @@ def read_vrt(vrt_path: str, *, window=None,
from ._reader import MAX_PIXELS_DEFAULT, _check_dimensions
if max_pixels is None:
max_pixels = MAX_PIXELS_DEFAULT
n_bands = len([vrt.bands[band]] if band is not None else vrt.bands)
_check_dimensions(out_w, out_h, n_bands, max_pixels)

# Select bands
# Select bands first so the dtype is known for the GB hint in the
# safety-limit error.
if band is not None:
selected_bands = [vrt.bands[band]]
else:
selected_bands = vrt.bands
n_bands = len(selected_bands)
# All selected bands typically share a dtype; if they differ the
# hint uses the first band's dtype as an approximation.
_check_dimensions(out_w, out_h, n_bands, max_pixels,
dtype=selected_bands[0].dtype)

# Allocate output.
#
Expand Down Expand Up @@ -1533,15 +1537,20 @@ def read_vrt(vrt_path: str, *, window=None,
if (sub_dst_w > max_pixels
or sub_dst_h > max_pixels
or sub_dst_w * sub_dst_h > max_pixels):
from ._layout import _gb_hint
sub_total = sub_dst_w * sub_dst_h
band_dtype = vrt_band.dtype
raise ValueError(
f"VRT SimpleSource DstRect "
f"(xSize={dr.x_size}, ySize={dr.y_size}) requires "
f"a resample intermediate of "
f"{sub_dst_w * sub_dst_h:,} pixels for the "
f"requested window, which exceeds the safety "
f"limit of {max_pixels:,} pixels. Pass a larger "
f"max_pixels= to read_vrt() if this file is "
f"legitimate."
f"{sub_total:,} pixels "
f"({_gb_hint(sub_total, band_dtype)}) "
f"for the requested window, which exceeds the "
f"safety limit of {max_pixels:,} pixels "
f"({_gb_hint(max_pixels, band_dtype)}). Pass a "
f"larger max_pixels= to read_vrt() if this file "
f"is legitimate."
)
else:
read_r0 = sr.y_off + (clip_r0 - dst_r0)
Expand Down
110 changes: 110 additions & 0 deletions xrspatial/geotiff/tests/test_security.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,116 @@ def test_custom_limit(self):
# Relaxed: passes with large limit
_check_dimensions(100_000, 100_000, 1, max_pixels=100_000_000_000)

def test_error_message_includes_gb_estimate(self):
"""Error message reports both pixels and a GB allocation hint.

With no dtype passed the hint falls back to float32
(4 bytes/pixel). Real call sites pass the actual dtype.
"""
# 50000 x 50000 x 1 = 2.5e9 pixels = 10.00 GB at 4 bytes/pixel
# MAX_PIXELS_DEFAULT = 1e9 pixels = 4.00 GB at 4 bytes/pixel
with pytest.raises(ValueError) as exc:
_check_dimensions(50_000, 50_000, 1, MAX_PIXELS_DEFAULT)
msg = str(exc.value)
# Pixel count still present (preserves existing assertions).
assert "2,500,000,000 pixels" in msg
assert "1,000,000,000 pixels" in msg
# GB hint added for both the requested and the limit allocations.
# Uses decimal GB (10**9 bytes) to match the docstring convention.
assert "~10.00 GB at 4 bytes/pixel" in msg
assert "~4.00 GB at 4 bytes/pixel" in msg

def test_error_message_uses_passed_dtype_for_gb_hint(self):
"""When the caller passes the decoded dtype, the GB hint reports
the exact byte width: f64 → 8 bytes/pixel (double the float32
default), u8 → 1 byte/pixel (a quarter)."""
# float64: 1e9 pixels * 8 bytes = 8.00 GB
with pytest.raises(ValueError) as exc:
_check_dimensions(50_000, 50_000, 1, MAX_PIXELS_DEFAULT,
dtype=np.float64)
assert "20.00 GB at 8 bytes/pixel" in str(exc.value)
assert "8.00 GB at 8 bytes/pixel" in str(exc.value)

# uint8: 1e9 pixels * 1 byte = 1.00 GB
with pytest.raises(ValueError) as exc:
_check_dimensions(50_000, 50_000, 1, MAX_PIXELS_DEFAULT,
dtype=np.uint8)
assert "2.50 GB at 1 bytes/pixel" in str(exc.value)
assert "1.00 GB at 1 bytes/pixel" in str(exc.value)

def test_error_message_includes_window_suggestion(self):
"""Error message suggests window= for reading a sub-region."""
with pytest.raises(ValueError) as exc:
_check_dimensions(50_000, 50_000, 1, MAX_PIXELS_DEFAULT)
msg = str(exc.value)
assert "window=(r0, c0, r1, c1)" in msg
# Concrete example uses the same chunk-side as the chunks hint.
assert "window=(0, 0, 1024, 1024)" in msg

def test_error_message_suggests_chunks_when_dask_installed(self,
monkeypatch):
"""When dask is importable, suggest chunks=."""
# Force the 'dask installed' branch regardless of test env.
monkeypatch.setattr(
'xrspatial.geotiff._layout.importlib.util.find_spec',
lambda name: object() if name == 'dask' else None,
)
with pytest.raises(ValueError) as exc:
_check_dimensions(50_000, 50_000, 1, MAX_PIXELS_DEFAULT)
msg = str(exc.value)
assert "chunks=1024" in msg
assert "lazily" in msg
# Should not recommend installing dask.
assert "pip install dask" not in msg
assert "conda install" not in msg

def test_error_message_recommends_install_when_dask_missing(self,
monkeypatch):
"""When dask is not importable, recommend installation."""
monkeypatch.setattr(
'xrspatial.geotiff._layout.importlib.util.find_spec',
lambda name: None,
)
with pytest.raises(ValueError) as exc:
_check_dimensions(50_000, 50_000, 1, MAX_PIXELS_DEFAULT)
msg = str(exc.value)
assert "pip install dask" in msg
assert "conda install -c conda-forge dask" in msg
# The chunks= number is still surfaced so the user knows what to
# pass once dask is installed.
assert "chunks=1024" in msg

def test_suggested_chunk_side_scales_with_max_pixels(self):
"""The suggested chunk side fits under max_pixels for the given
band count and never exceeds 1024."""
from xrspatial.geotiff._layout import _suggest_chunk_side
# Default budget: capped at 1024.
assert _suggest_chunk_side(1_000_000_000, 1) == 1024
# Tight budget: 100 pixels, 1 band -> side 10.
assert _suggest_chunk_side(100, 1) == 10
# Multi-band reduces the per-side budget.
# 1_000_000 budget, 4 bands -> sqrt(250_000) = 500.
assert _suggest_chunk_side(1_000_000, 4) == 500
# Pathological inputs do not crash.
assert _suggest_chunk_side(0, 1) == 1
assert _suggest_chunk_side(10, 0) == 3

def test_gb_hint_helper_rounds_to_two_decimals(self):
"""_gb_hint formats bytes/pixel * count as a ~X.XX GB string."""
from xrspatial.geotiff._layout import _gb_hint
# No dtype: 1 billion pixels * 4 bytes (float32 default) ->
# 4.00 GB hint, matching MAX_PIXELS_DEFAULT's docstring.
assert _gb_hint(1_000_000_000) == "~4.00 GB at 4 bytes/pixel"
# Zero pixels still formats sensibly.
assert _gb_hint(0) == "~0.00 GB at 4 bytes/pixel"
# With dtype: itemsize drives the byte multiplier.
assert _gb_hint(1_000_000_000, dtype=np.float64) == \
"~8.00 GB at 8 bytes/pixel"
assert _gb_hint(1_000_000_000, dtype=np.uint8) == \
"~1.00 GB at 1 bytes/pixel"
assert _gb_hint(1_000_000_000, dtype=np.int16) == \
"~2.00 GB at 2 bytes/pixel"

def test_read_strips_rejects_huge_header(self):
"""_read_strips refuses to allocate when header claims huge dims."""
# Build a valid TIFF with small pixel data but huge header dimensions.
Expand Down
34 changes: 34 additions & 0 deletions xrspatial/geotiff/tests/vrt/test_window.py
Original file line number Diff line number Diff line change
Expand Up @@ -437,6 +437,40 @@ def test_per_source_cap_inclusive_boundary():
assert arr.shape == (100, 100)


def test_per_source_cap_error_includes_gb_hint(monkeypatch):
"""The per-source resample-intermediate guard reports the GB
allocation hint alongside the pixel count.

The output-dimension guard fires before the per-source guard at
the same ``max_pixels`` threshold, so to verify the per-source
error string in isolation we no-op the dimension guard for the
duration of the read. That keeps the test honest about which
branch's format string is being asserted.
"""
# ``read_vrt`` imports ``_check_dimensions`` from ``_reader`` at
# call time, so patching the reader module's binding takes effect
# on the next call.
from xrspatial.geotiff import _reader as reader_mod
monkeypatch.setattr(reader_mod, '_check_dimensions',
lambda *args, **kwargs: None)
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td:
_dstrect_cap_write_source(td)
vrt_path = _dstrect_cap_write_vrt(
td,
dst_x_size=2000,
dst_y_size=2000,
raster_x=2000,
raster_y=2000,
)
with pytest.raises(ValueError, match='resample intermediate') as exc:
_dstrect_cap_read_vrt_internal(vrt_path, max_pixels=1_000_000)
msg = str(exc.value)
# Sub-window is 2000x2000 = 4_000_000 pixels. The VRT band's
# dataType is "Byte" (uint8), so the GB hint uses 1 byte/pixel.
assert '4,000,000 pixels' in msg
assert 'GB at 1 bytes/pixel' in msg


def test_negative_dstrect_rejected():
"""Negative ``xSize`` / ``ySize`` must surface as ``ValueError``
rather than be silently skipped by the overlap check. The error
Expand Down
Loading