From 7eb92e5289285dba2a75b67a34b24f13dcfe6af6 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Sun, 7 Jun 2026 09:27:52 -0700 Subject: [PATCH 1/2] Thread sidecar_origin through the HTTP/fsspec sidecar georef path (#3023) Pass a sidecar_origin mapping into extract_geo_info_with_overview_inheritance on the HTTP/fsspec sidecar path in _cog_http.py so it matches the eager local and metadata-only readers. Add remote test coverage proving a sidecar that carries its own GeoKeys wins over the base file's georef on the HTTP eager, HTTP chunked, fsspec chunked, and direct _parse_cog_http_meta paths. --- xrspatial/geotiff/_cog_http.py | 22 ++- .../geotiff/tests/integration/test_sidecar.py | 134 ++++++++++++++++++ 2 files changed, 150 insertions(+), 6 deletions(-) diff --git a/xrspatial/geotiff/_cog_http.py b/xrspatial/geotiff/_cog_http.py index 97fa584d8..dd6d8c6c1 100644 --- a/xrspatial/geotiff/_cog_http.py +++ b/xrspatial/geotiff/_cog_http.py @@ -250,15 +250,25 @@ def _parse_cog_http_meta( # typically carry no out-of-line geokeys and inherit from level-0 # (which sits in the base buffer). # - # The ``sidecar_origin`` kwarg used by the eager local / fsspec - # paths is intentionally not threaded here; the HTTP / dask - # sidecar-byte-order case is handled separately. When that lands, - # this call should pick up the same mapping so an HTTP sidecar with - # its own geokeys is parsed against the sidecar bytes too. + # Map the sidecar's IFDs to their own (bytes, byte_order) so a + # sidecar that declares its own geokeys is parsed against the + # sidecar bytes, not the base file's. Sidecar IFDs without geokeys + # still inherit from the base level-0 via the overview path. This + # mirrors the ``georef_origin`` mapping the eager local / fsspec + # reader builds in ``_reader.py``; without it a big-endian ``.ovr`` + # with its own georef (or any sidecar whose byte order differs from + # the base file) is read with the wrong byte source and returns + # corrupt georeferencing. + georef_origin = ( + {id(sc_ifd): (sidecar.data, sidecar.header.byte_order) + for sc_ifd in sidecar.ifds} + if sidecar is not None else None + ) geo_info = extract_geo_info_with_overview_inheritance( ifd, ifds, header_bytes, header.byte_order, allow_rotated=allow_rotated, - allow_invalid_nodata=allow_invalid_nodata) + allow_invalid_nodata=allow_invalid_nodata, + sidecar_origin=georef_origin) # When the chosen IFD lives in the sidecar, return the sidecar's own # ``TIFFHeader`` so the per-chunk / eager decode step sees the byte # order of the file the bytes actually came from. A big-endian diff --git a/xrspatial/geotiff/tests/integration/test_sidecar.py b/xrspatial/geotiff/tests/integration/test_sidecar.py index 5ab8592ff..311a74456 100644 --- a/xrspatial/geotiff/tests/integration/test_sidecar.py +++ b/xrspatial/geotiff/tests/integration/test_sidecar.py @@ -1852,3 +1852,137 @@ def test_parse_cog_http_meta_returns_sidecar_header(tmp_path, monkeypatch, finally: httpd.shutdown() httpd.server_close() + + +# --------------------------------------------------------------------------- +# Section: sidecar_geokeys_remote +# +# Remote analog of ``test_sidecar_with_own_geokeys_wins_eager`` (issue +# #3023). The local eager / metadata readers thread a ``sidecar_origin`` +# mapping into ``extract_geo_info_with_overview_inheritance``; the +# HTTP / fsspec path in ``_cog_http.py`` now threads the same mapping so +# the two surfaces stay symmetric. These tests pin the observable +# contract on every remote read path: when the selected overview IFD +# lives in a sidecar that declares its own GeoKeys, the sidecar's +# transform and CRS win over the base file's. ``_write_pair`` builds a +# base + geokey-bearing sidecar whose georef differs from the base. +# --------------------------------------------------------------------------- +_GEOKEY_BASE_GEO_3023 = dict(origin_x=100.0, origin_y=200.0, + pixel_w=10.0, pixel_h=-10.0, epsg=4326) +_GEOKEY_SIDE_GEO_3023 = dict(origin_x=500.0, origin_y=800.0, + pixel_w=2.5, pixel_h=-2.5, epsg=3857) + + +def _assert_sidecar_geokeys_won_3023(da): + """Sidecar georef (transform + CRS) wins over the base file's.""" + t = da.attrs["transform"] + assert t[0] == pytest.approx(2.5) + assert t[2] == pytest.approx(500.0) + assert t[4] == pytest.approx(-2.5) + assert t[5] == pytest.approx(800.0) + assert da.attrs.get("crs") == 3857 + + +@requires_loopback +def test_http_eager_sidecar_with_own_geokeys_wins(tmp_path, monkeypatch): + monkeypatch.setenv("XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS", "1") + base_path, sidecar_path = _write_pair( + tmp_path, base_geo=_GEOKEY_BASE_GEO_3023, + sidecar_geo=_GEOKEY_SIDE_GEO_3023, sidecar_has_geokeys=True) + payloads = { + "/x.tif": base_path.read_bytes(), + "/x.tif.ovr": sidecar_path.read_bytes(), + } + httpd, port = _start_range_http_server_2314(payloads) + try: + url = f"http://127.0.0.1:{port}/x.tif" + da = open_geotiff(url, overview_level=1) + _assert_sidecar_geokeys_won_3023(da) + finally: + httpd.shutdown() + httpd.server_close() + + +@requires_loopback +def test_http_chunked_sidecar_with_own_geokeys_wins(tmp_path, monkeypatch): + monkeypatch.setenv("XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS", "1") + base_path, sidecar_path = _write_pair( + tmp_path, base_geo=_GEOKEY_BASE_GEO_3023, + sidecar_geo=_GEOKEY_SIDE_GEO_3023, sidecar_has_geokeys=True) + payloads = { + "/x.tif": base_path.read_bytes(), + "/x.tif.ovr": sidecar_path.read_bytes(), + } + httpd, port = _start_range_http_server_2314(payloads) + try: + url = f"http://127.0.0.1:{port}/x.tif" + # ``chunks`` drives the dask metadata path through + # ``_parse_cog_http_meta``; its ``geo_info`` is what coords are + # built from, so a wrong byte source surfaces in the attrs here. + da = open_geotiff(url, overview_level=1, chunks=16) + _assert_sidecar_geokeys_won_3023(da) + finally: + httpd.shutdown() + httpd.server_close() + + +def test_fsspec_chunked_sidecar_with_own_geokeys_wins(tmp_path): + fsspec = pytest.importorskip("fsspec") + base_path, sidecar_path = _write_pair( + tmp_path, base_geo=_GEOKEY_BASE_GEO_3023, + sidecar_geo=_GEOKEY_SIDE_GEO_3023, sidecar_has_geokeys=True) + fs = fsspec.filesystem("memory") + # memory:// is process-global; collide-proof the key. + key = f"issue3023-{os.getpid()}-{uuid.uuid4().hex[:8]}" + base_uri = f"memory://{key}/x.tif" + side_uri = base_uri + ".ovr" + fs.pipe_file(base_uri.replace("memory://", ""), base_path.read_bytes()) + fs.pipe_file(side_uri.replace("memory://", ""), sidecar_path.read_bytes()) + try: + da = open_geotiff(base_uri, overview_level=1, chunks=16) + _assert_sidecar_geokeys_won_3023(da) + finally: + for p in (base_uri, side_uri): + try: + fs.rm_file(p.replace("memory://", "")) + except Exception: + pass + + +@requires_loopback +def test_parse_cog_http_meta_geo_info_honours_sidecar_geokeys( + tmp_path, monkeypatch): + """``_parse_cog_http_meta`` returns sidecar-derived geo_info directly.""" + monkeypatch.setenv("XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS", "1") + from xrspatial.geotiff._cog_http import _parse_cog_http_meta + from xrspatial.geotiff._reader import _HTTPSource + from xrspatial.geotiff._sidecar import close_sidecar + base_path, sidecar_path = _write_pair( + tmp_path, base_geo=_GEOKEY_BASE_GEO_3023, + sidecar_geo=_GEOKEY_SIDE_GEO_3023, sidecar_has_geokeys=True) + payloads = { + "/x.tif": base_path.read_bytes(), + "/x.tif.ovr": sidecar_path.read_bytes(), + } + httpd, port = _start_range_http_server_2314(payloads) + try: + url = f"http://127.0.0.1:{port}/x.tif" + src = _HTTPSource(url) + try: + _, _, geo_info, _, sidecar_meta = _parse_cog_http_meta( + src, overview_level=1, source_path=url, return_sidecar=True) + finally: + src.close() + sidecar, _, used = sidecar_meta + try: + assert used is True + assert geo_info.transform.origin_x == pytest.approx(500.0) + assert geo_info.transform.origin_y == pytest.approx(800.0) + assert geo_info.transform.pixel_width == pytest.approx(2.5) + assert geo_info.transform.pixel_height == pytest.approx(-2.5) + assert geo_info.crs_epsg == 3857 + finally: + close_sidecar(sidecar) + finally: + httpd.shutdown() + httpd.server_close() From 4fbd09ac34085b78827b71132a77be9e83a8b9e3 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Sun, 7 Jun 2026 09:29:26 -0700 Subject: [PATCH 2/2] Address review: reword sidecar_origin comment to match mechanism (#3023) The previous comment claimed the HTTP path returned corrupt georeferencing without the mapping. On the current eager-parse IFD model that is not accurate: extract_geo_info only consumes data / byte_order via _parse_geokeys, which reads values already materialized on ifd.entries during parse_ifd. Reword to describe the change as a consistency alignment with the local reader that also future-proofs against lazy tag reads. --- xrspatial/geotiff/_cog_http.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/xrspatial/geotiff/_cog_http.py b/xrspatial/geotiff/_cog_http.py index dd6d8c6c1..bcbbe28ef 100644 --- a/xrspatial/geotiff/_cog_http.py +++ b/xrspatial/geotiff/_cog_http.py @@ -250,15 +250,19 @@ def _parse_cog_http_meta( # typically carry no out-of-line geokeys and inherit from level-0 # (which sits in the base buffer). # - # Map the sidecar's IFDs to their own (bytes, byte_order) so a - # sidecar that declares its own geokeys is parsed against the - # sidecar bytes, not the base file's. Sidecar IFDs without geokeys - # still inherit from the base level-0 via the overview path. This - # mirrors the ``georef_origin`` mapping the eager local / fsspec - # reader builds in ``_reader.py``; without it a big-endian ``.ovr`` - # with its own georef (or any sidecar whose byte order differs from - # the base file) is read with the wrong byte source and returns - # corrupt georeferencing. + # Map the sidecar's IFDs to their own (bytes, byte_order) and pass + # it as ``sidecar_origin`` so this path stays symmetric with the + # eager local / fsspec reader, which builds the same mapping in + # ``_reader.py``. On today's eager-parse IFD model the mapping does + # not change the result: ``extract_geo_info`` only forwards + # ``data`` / ``byte_order`` to ``_parse_geokeys``, which reads + # values already materialized on ``ifd.entries`` (every tag, + # including out-of-line georef arrays, is decoded in ``parse_ifd`` + # against its own file's byte order, and the sidecar IFDs are + # parsed with the sidecar header in ``discover_remote_sidecar``). + # We thread it anyway to match the local path and to stay correct + # if tag-value reads ever become lazy and start depending on the + # buffer / byte order handed to ``extract_geo_info``. georef_origin = ( {id(sc_ifd): (sidecar.data, sidecar.header.byte_order) for sc_ifd in sidecar.ifds}