Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changes/3781.feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Added `Struct` class (subclass of `Structured`) implementing the zarr-extensions `struct` dtype spec. Uses object-style field format and dict fill values. Legacy `Structured` remains available for backward compatibility.
31 changes: 31 additions & 0 deletions docs/user-guide/data_types.md
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,37 @@ here, it's possible to create it yourself: see [Adding New Data Types](#adding-n
#### Struct-like
- [Structured][zarr.dtype.Structured]

!!! note "Zarr V3 Structured Data Types"

In Zarr V3, structured data types are specified using the `struct` extension defined in the
[zarr-extensions repository](https://github.com/zarr-developers/zarr-extensions/tree/main/data-types/struct).
The JSON representation uses an object format for fields:

```json
{
"name": "struct",
"configuration": {
"fields": [
{"name": "x", "data_type": "float32"},
{"name": "y", "data_type": "int64"}
]
}
}
```

For backward compatibility, Zarr Python also accepts the legacy `structured` name with
tuple-format fields when reading existing data.

Fill values for structured types are represented as JSON objects mapping field names to values:

```json
{"x": 1.5, "y": 42}
```

When using structured types with multi-byte fields, the `bytes` codec must specify an
explicit `endian` parameter. If omitted, Zarr Python assumes little-endian for legacy
compatibility but emits a warning.

### Example Usage

This section demonstrates the basic usage of Zarr data types.
Expand Down
17 changes: 16 additions & 1 deletion src/zarr/codecs/bytes.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import sys
import warnings
from dataclasses import dataclass, replace
from enum import Enum
from typing import TYPE_CHECKING
Expand All @@ -9,6 +10,7 @@
from zarr.core.buffer import Buffer, NDBuffer
from zarr.core.common import JSON, parse_enum, parse_named_configuration
from zarr.core.dtype.common import HasEndianness
from zarr.core.dtype.npy.structured import Structured

if TYPE_CHECKING:
from typing import Self
Expand Down Expand Up @@ -56,7 +58,20 @@ def to_dict(self) -> dict[str, JSON]:
return {"name": "bytes", "configuration": {"endian": self.endian.value}}

def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
if not isinstance(array_spec.dtype, HasEndianness):
if isinstance(array_spec.dtype, Structured):
if array_spec.dtype.has_multi_byte_fields():
if self.endian is None:
warnings.warn(
"Missing 'endian' for structured dtype with multi-byte fields. "
"Assuming little-endian for legacy compatibility.",
UserWarning,
stacklevel=2,
)
return replace(self, endian=Endian.little)
else:
if self.endian is not None:
return replace(self, endian=None)
elif not isinstance(array_spec.dtype, HasEndianness):
if self.endian is not None:
return replace(self, endian=None)
elif self.endian is None:
Expand Down
6 changes: 5 additions & 1 deletion src/zarr/core/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
)
from zarr.core.config import config as zarr_config
from zarr.core.dtype import (
Structured,
VariableLengthBytes,
VariableLengthUTF8,
ZDType,
Expand Down Expand Up @@ -5064,10 +5065,13 @@ def default_serializer_v3(dtype: ZDType[Any, Any]) -> ArrayBytesCodec:
length strings and variable length bytes have hard-coded serializers -- ``VLenUTF8Codec`` and
``VLenBytesCodec``, respectively.
Structured data types with multi-byte fields use ``BytesCodec`` with little-endian encoding.
"""
serializer: ArrayBytesCodec = BytesCodec(endian=None)

if isinstance(dtype, HasEndianness):
if isinstance(dtype, HasEndianness) or (
isinstance(dtype, Structured) and dtype.has_multi_byte_fields()
):
serializer = BytesCodec(endian="little")
elif isinstance(dtype, HasObjectCodec):
if dtype.object_codec_id == "vlen-bytes":
Expand Down
18 changes: 15 additions & 3 deletions src/zarr/core/dtype/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,13 @@
from zarr.core.dtype.npy.complex import Complex64, Complex128
from zarr.core.dtype.npy.float import Float16, Float32, Float64
from zarr.core.dtype.npy.int import Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64
from zarr.core.dtype.npy.structured import Structured, StructuredJSON_V2, StructuredJSON_V3
from zarr.core.dtype.npy.structured import (
Struct,
StructJSON_V3,
Structured,
StructuredJSON_V2,
StructuredJSON_V3,
)
from zarr.core.dtype.npy.time import (
DateTime64,
DateTime64JSON_V2,
Expand Down Expand Up @@ -75,6 +81,8 @@
"RawBytes",
"RawBytesJSON_V2",
"RawBytesJSON_V3",
"Struct",
"StructJSON_V3",
"Structured",
"StructuredJSON_V2",
"StructuredJSON_V3",
Expand Down Expand Up @@ -124,7 +132,7 @@
| ComplexFloatDType
| StringDType
| BytesDType
| Structured
| Struct
| TimeDType
| VariableLengthBytes
)
Expand All @@ -137,7 +145,7 @@
*COMPLEX_FLOAT_DTYPE,
*STRING_DTYPE,
*BYTES_DTYPE,
Structured,
Struct,
*TIME_DTYPE,
VariableLengthBytes,
)
Expand All @@ -155,6 +163,10 @@
# mypy does not know that all the elements of ANY_DTYPE are subclasses of ZDType
data_type_registry.register(dtype._zarr_v3_name, dtype) # type: ignore[arg-type]

# Register Structured for reading legacy "structured" format JSON, but don't include it in
# ANY_DTYPE since it doesn't support native dtype matching (use Struct instead).
data_type_registry.register(Structured._zarr_v3_name, Structured)


# TODO: find a better name for this function
def get_data_type_from_native_dtype(dtype: npt.DTypeLike) -> ZDType[TBaseDType, TBaseScalar]:
Expand Down
Loading
Loading