From a5d57d201425ff87ac272a360ca72c680f52a0d7 Mon Sep 17 00:00:00 2001 From: Ollie Tooth Date: Fri, 26 Jun 2026 18:01:03 +0100 Subject: [PATCH 01/10] Refactor create_item_with_zarr/icechunk_asset() to populate Item schema using global CF attrs from asset & add group support to open_icechunk_store(). --- OceanDataStore/catalog/stac/utils.py | 420 ++++++++++++++++++--------- 1 file changed, 277 insertions(+), 143 deletions(-) diff --git a/OceanDataStore/catalog/stac/utils.py b/OceanDataStore/catalog/stac/utils.py index e47e5340..01581f61 100644 --- a/OceanDataStore/catalog/stac/utils.py +++ b/OceanDataStore/catalog/stac/utils.py @@ -9,46 +9,18 @@ - Ollie Tooth (oliver.tooth@noc.ac.uk) """ # -- Import Python Modules -- # -import sys import pystac -import logging import datetime import icechunk import xarray as xr from shapely.geometry import Polygon, mapping - -# -- Logging Functions -- # -def create_logging_banner(logger: logging.Logger) -> None: - """Add OceanDataStore banner to logger.""" - logger.info(r""" - .~~~. - .( ).~~~~~~. - ~( ).~~~. - .( OceanDataStore ). - (___________________________). - - STAC Catalog Creator - """, - extra={"simple": True}, - ) - - -def initialise_logging(logger: logging.Logger) -> None: - """Initialise logging configuration.""" - logging.basicConfig( - stream=sys.stdout, - format="🌐 OceanDataStore 🌐 | %(levelname)10s | %(asctime)s | %(message)s", - level=logging.INFO, - datefmt="%Y-%m-%d %H:%M:%S", - ) - - # -- I/O Functions -- # def open_icechunk_store( bucket: str, prefix: str, branch: str = "main", + group: str | None = None, endpoint_url: str = "https://noc-msm-o.s3-ext.jc.rl.ac.uk", ) -> xr.Dataset: """ @@ -62,6 +34,8 @@ def open_icechunk_store( Prefix for the Icechunk repository in the S3 bucket. branch : str, optional Branch of the Icechunk repository to open (default is "main"). + group : str, optional + Group within the Icechunk repository to open (default is None). 
endpoint_url : str, optional The S3 endpoint URL (default is "https://noc-msm-o.s3-ext.jc.rl.ac.uk"). """ @@ -79,57 +53,111 @@ def open_icechunk_store( repo = icechunk.Repository.open(storage=storage) # Open Dataset from Icechunk Store: - return xr.open_zarr(repo.readonly_session(branch=branch).store, consolidated=False) + return xr.open_zarr(repo.readonly_session(branch=branch).store, group=group, consolidated=False) # -- STAC Functions -- # -def create_item_with_icechunk_asset( - id: str, +def create_item_with_zarr_asset( + id : str, ds: xr.Dataset, bucket: str, - platform: str, prefix: str, + title: str, + dataset_type: str = "model", + product_type: str = "timeseries", + product_version: str = "1.0", + institution: str = "National Oceanography Centre, UK", + platform: str = "gn", + horizontal_grid_type: str = "curvilinear", + horizontal_grid_resolution: str = "1 degree", + vertical_grid_type: str = "zps", + vertical_grid_coordinate: str = "depth with partial steps", + vertical_grid_levels: int = 75, + operation: str ="annual-mean", + status: str = "completed", + update_frequency: str = "None", variant: str = "r1i1c1f1", + ocean_component: str = "NEMO v4.2.2", + sea_ice_component: str = "SI3 v4.0", + biogeochemistry_component: str = "None", + atmosphere_component: str = "None", + atmospheric_forcing: str = "JRA55-do", start_date: str = "1976-01-01", - end_date: str = "2024-12-31", + end_date: str = "2024-02-01", bbox: tuple = (-180.0, -90.0, 180.0, 90.0), - collection: str = "noc-npd-era5", - config: str = "eORCA1 ERA5v1 NPD", - operation: str ="annual-mean", + collection : str = "noc-npd-jra55", + variable_stores: bool = True, endpoint_url: str = "https://noc-msm-o.s3-ext.jc.rl.ac.uk", + zarr_format: int = 2, ) -> pystac.Item: """ - Create a STAC Item from an Icechunk Store asset. + Create a STAC Item from a Zarr Store asset. Parameters ---------- id : str Unique identifier for the STAC Item. 
ds : xr.Dataset - The xarray Dataset containing the data to be included in the STAC Item. + Dataset containing the data to be included in the STAC Item. bucket : str - The S3 bucket name where the data is stored. - platform : str - The platform name (e.g., "gn_global", "gr_global", etc.). + S3 bucket name where the data is stored. prefix : str - The prefix for the data in the S3 bucket (e.g., "U1y", "U1m", etc.). + Prefix for the data in the S3 bucket (e.g., "U1y", "U1m", etc.). + title : str + Title of the dataset. + dataset_type : str + Type of dataset (e.g., "model", "observation", etc.). + product_type : str + Type of product (e.g., "climatology", "timeseries", etc.). + product_version : str + Version of the product. + institution : str + Institution responsible for producing the dataset. + platform : str + Platform name (e.g., "gn_global", "gr_global", etc.). + horizontal_grid_type : str + Type of horizontal grid used in the dataset (e.g., "regular rectilinear", "irregular rectilinear", "curvilinear", etc.). + horizontal_grid_resolution : str + Horizontal grid resolution of the dataset (e.g., "1 degreee", "0.25 degree", etc.). + vertical_grid_type : str + Type of vertical grid used in the dataset (e.g., "z", "sigma", "hybrid", etc.). + vertical_grid_coordinate : str + Type of vertical coordinate used in the dataset (e.g., "depth", "sigma", etc.). + vertical_grid_levels : int + Number of vertical levels in the dataset. + operation : str, optional + Operation string indicating the type of operation performed on the dataset (default is "annual-mean"). + status : str, optional + Status of the dataset (e.g., "ongoing", "completed", etc.) (default is "completed"). + update_frequency : str, optional + Frequency at which the dataset is updated (e.g., "monthly", "biannually", etc.) (default is "None"). variant : str, optional - The simulation variant string for the dataset (default is "r1i1c1f1"). + Simulation variant string for the dataset (default is "r1i1c1f1"). 
+ ocean_component : str, optional + Ocean model component used to produce the dataset (e.g., "NEMO v4.2.2", etc.) (default is "NEMO v4.2.2"). + sea_ice_component : str, optional + Sea ice model component used to produce the dataset (e.g., "CICE v6.1", etc.) (default is "SI3 v4.0"). + biogeochemistry_component : str, optional + Biogeochemistry model component used to produce the dataset (e.g., "PISCES v2", etc.) (default is "None"). + atmosphere_component : str, optional + Atmospheric model component used to produce the dataset (e.g., "UKMO UM Global Atmosphere 7.1", etc.) (default is "None"). + atmospheric_forcing : str, optional + Atmospheric forcing used to produce the dataset (e.g., "ERA5", "JRA55-do", etc.) (default is "JRA55-do"). start_date : str, optional - The start date of the dataset in "YYYY-MM-DD" format (default is "1976-01-01"). + Start date of the dataset in "YYYY-MM-DD" format (default is "1976-01-01"). end_date : str, optional - The end date of the dataset in "YYYY-MM-DD" format (default is "2024-12-31"). + End date of the dataset in "YYYY-MM-DD" format (default is "2024-12-31"). bbox : tuple, optional Bounding box for the dataset in the format (min_lon, min_lat, max_lon, max_lat). (default is global coverage). collection : str, optional - The name of the STAC Collection to which this Item belongs (default is "noc-npd-era5"). - config : str, optional - The configuration string for the dataset (default is "eORCA1 ERA5v1 NPD"). - operation : str, optional - The operation string indicating the type of operation performed on the dataset (default is "annual-mean"). + STAC Collection to which this Item belongs (default is "noc-npd-jra55"). + variable_stores : bool, optional + Whether each variable is stored in a separate Zarr store (default is True). endpoint_url : str, optional - The S3 endpoint URL (default is "https://noc-msm-o.s3-ext.jc.rl.ac.uk"). + S3 endpoint URL (default is "https://noc-msm-o.s3-ext.jc.rl.ac.uk"). 
+ zarr_format: int, optional + Zarr format version (default is 2). Returns ------- @@ -137,16 +165,18 @@ def create_item_with_icechunk_asset( A STAC Item containing the dataset information and an asset pointing to the data. """ # Define the item description based on the prefix: + var = f"{prefix.split('/')[-1]} output" if variable_stores else "outputs" + if 'domain' in prefix: description = "**Global ocean model domain and mesh mask variables.**" elif 'I' in prefix: - description = f"**{operation.capitalize()} global sea-ice outputs defined at NEMO model T-points.**" + description = f"**{operation.capitalize()} global sea-ice {var} defined at NEMO model T-points.**" elif 'S' in prefix: - description = f"**{operation.capitalize()} global ocean scalar outputs.**" + description = f"**{operation.capitalize()} global ocean scalar {var}.**" elif 'M' in prefix: - description = f"**{operation.capitalize()} ocean physics transect outputs defined at {prefix.split('/')[-1]}.**" + description = f"**{operation.capitalize()} ocean physics transect {var} defined at {prefix.split('/')[-1]}.**" else: - description = f"**{operation.capitalize()} global ocean physics outputs defined at NEMO model {prefix[0]}-points.**" + description = f"**{operation.capitalize()} global ocean physics {var} defined at {prefix[0]}-points.**" # Add OceanDataCatalog Access Information to the description: description += f"\n\n**OceanDataCatalog Access:**\n`catalog.open_dataset(id='{id}')`" @@ -172,36 +202,46 @@ def create_item_with_icechunk_asset( start_datetime=datetime.datetime(year=int(start_date.split("-")[0]), month=int(start_date.split("-")[1]), day=int(start_date.split("-")[2])), end_datetime=datetime.datetime(year=int(end_date.split("-")[0]), month=int(end_date.split("-")[1]), day=int(end_date.split("-")[2])), properties={ - "title": f"{prefix} Icechunk repository", + "title": title, "description": description, - "variant": variant, + "dataset_type": dataset_type, + "product_type": product_type, + 
"product_version": product_version, + "institution": institution, "platform": platform, + "horizontal_grid_type": horizontal_grid_type, + "horizontal_grid_resolution": horizontal_grid_resolution, + "vertical_grid_type": vertical_grid_type, + "vertical_grid_coordinate": vertical_grid_coordinate, + "vertical_grid_levels": vertical_grid_levels, + "dimensions": list(ds.dims), "variables": list(ds.data_vars), "variable_standard_names": [ds[var].attrs.get('standard_name', var) for var in ds.data_vars], - "dimensions": list(ds.dims), - "operation": operation.split(" ")[1], - "operation_frequency": operation.split(" ")[0], - "ocean_component": "NEMO v4.2.2", - "sea_ice_component": "SI3 v4.0", - "biogeochemistry_component": None, - "atmosphere_component": None, - "status": "ongoing", - "update_frequency": "quarterly", + "aggregation": operation.split()[1].lower(), + "aggregation_frequency": operation.split()[0].lower(), + "status": status, + "update_frequency": update_frequency, "latest_data_update": datetime.datetime.now().isoformat(), + "variant": variant, + "ocean_component": ocean_component, + "sea_ice_component": sea_ice_component, + "biogeochemistry_component": biogeochemistry_component, + "atmosphere_component": atmosphere_component, + "atmospheric_forcing": atmospheric_forcing, }, collection=collection, ) item.add_asset(key=prefix.split('/')[-1], asset=pystac.Asset( href=f"https://noc-msm-o.s3-ext.jc.rl.ac.uk/{bucket}/{prefix}", - title=f"{config}: {prefix} Icechunk repository", + title=title, description=description, - media_type="application/vnd.zarr+icechunk", + media_type="application/vnd.zarr", extra_fields=dict( endpoint_url=endpoint_url, bucket=bucket, prefix=prefix, - variant=variant, + zarr_format=zarr_format, anonymous=True ) )) @@ -209,81 +249,135 @@ def create_item_with_icechunk_asset( return item -def create_item_with_zarr_asset( - id : str, +def create_item_with_icechunk_asset( ds: xr.Dataset, + id: str, bucket: str, - platform: str, prefix: str, - 
variant: str = "r1i1c1f1", - start_date: str = "1976-01-01", - end_date: str = "2024-02-01", - bbox: tuple = (-180.0, -90.0, 180.0, 90.0), - collection : str = "noc-npd-jra55", - config: str = "eORCA1 JRA55v1 NPD", - operation: str ="annual-mean", - variable_stores: bool = True, + title: str | None = None, + description: str | None = None, + dataset_type: str | None = None, + product_type: str | None = None, + product_version: str | None = None, + institution: str | None = None, + citation: str | None = None, + acknowledgement: str | None = None, + license: str | None = None, + doi: str | None = None, + platform: str | None = None, + horizontal_grid_type: str | None = None, + horizontal_grid_resolution: str | None = None, + vertical_grid_type: str | None = None, + vertical_grid_coordinate: str | None = None, + vertical_grid_levels: int | None = None, + aggregation: str | None = None, + aggregation_frequency: str | None = None, + status: str | None = None, + update_frequency: str | None = None, + ocean_component: str | None = None, + sea_ice_component: str | None = None, + biogeochemistry_component: str | None = None, + atmosphere_component: str | None = None, + atmospheric_forcing: str | None = None, + variant: str | None = None, + start_date: str | None = None, + end_date: str | None = None, + bbox: tuple | None = None, + collection: str = "noc-npd-era5", endpoint_url: str = "https://noc-msm-o.s3-ext.jc.rl.ac.uk", - zarr_format: int = 2, + group: str | None = None, + anonymous: bool = True, ) -> pystac.Item: """ - Create a STAC Item from a Zarr Store asset. + Create a STAC Item from an Icechunk Store. Parameters ---------- + ds : xr.Dataset + Dataset to be included in the STAC Item. id : str Unique identifier for the STAC Item. - ds : xr.Dataset - The xarray Dataset containing the data to be included in the STAC Item. bucket : str - The S3 bucket name where the data is stored. - platform : str - The platform name (e.g., "gn_global", "gr_global", etc.). 
+ S3 bucket name where the dataset is stored. prefix : str - The prefix for the data in the S3 bucket (e.g., "U1y", "U1m", etc.). + Prefix for the dataset in the S3 bucket (e.g., "U1y", "U1m", etc.). + title : str, optional + Title of the dataset (default is None, which will use the "title" attribute from the dataset if available). + description : str, optional + Description of the dataset (default is None, which will use the "description" attribute from the dataset if available). + dataset_type : str, optional + Type of dataset (e.g., "model", "observation", etc.) (default is None, which will use the "dataset_type" attribute from the dataset if available). + product_type : str, optional + Type of product (e.g., "climatology", "timeseries", etc.) (default is None, which will use the "product_type" attribute from the dataset if available). + product_version : str, optional + Version of the product (default is None, which will use the "product_version" attribute from the dataset if available). + institution : str, optional + Institution responsible for producing the dataset (default is None, which will use the "institution" attribute from the dataset if available). + citation : str, optional + Citation for the dataset (default is None, which will use the "citation" attribute from the dataset if available). + acknowledgement : str, optional + Acknowledgement for the dataset (default is None, which will use the "acknowledgement" attribute from the dataset if available). + license : str, optional + License for the dataset (default is None, which will use the "license" attribute from the dataset if available). + doi : str, optional + Digital Object Identifier (DOI) for the dataset (default is None, which will use the "doi" attribute from the dataset if available). + platform : str, optional + Platform string (e.g., "gn", "gr", "tn", etc.) (default is None, which will use the "platform" attribute from the dataset if available). 
+ horizontal_grid_type : str, optional + Type of horizontal grid used in the dataset (e.g., "regular rectilinear", "irregular rectilinear", "curvilinear", etc.) (default is None, which will use the "horizontal_grid_type" attribute from the dataset if available). + horizontal_grid_resolution : str, optional + Horizontal resolution of the dataset (e.g., "1 degree", "0.25 degree", etc.) (default is None, which will use the "horizontal_grid_resolution" attribute from the dataset if available). + vertical_grid_type : str, optional + Type of vertical grid used in the dataset (e.g., "z", "sigma", "hybrid", etc.) (default is None, which will use the "vertical_grid_type" attribute from the dataset if available). + vertical_grid_coordinate : str, optional + Type of vertical coordinate used in the dataset (e.g., "depth", "sigma", etc.) (default is None, which will use the "vertical_grid_coordinate" attribute from the dataset if available). + vertical_grid_levels : int, optional + Number of vertical levels in the dataset (default is None, which will use the "vertical_grid_levels" attribute from the dataset if available). + aggregation : str, optional + Type of aggregation used to produce the dataset (e.g., "mean", "max", etc.) (default is None, which will use the "aggregation" attribute from the dataset if available). + aggregation_frequency : str, optional + Frequency at which the aggregation is applied (e.g., "monthly", "biannually", etc.) (default is None, which will use the "aggregation_frequency" attribute from the dataset if available). + status : str, optional + Status of the dataset (e.g., "ongoing", "completed", etc.) (default is None, which will use the "status" attribute from the dataset if available). + update_frequency : str, optional + Frequency at which the dataset is updated (e.g., "monthly", "biannually", etc.) (default is None, which will use the "update_frequency" attribute from the dataset if available). 
+ ocean_component : str, optional + Ocean model component used to produce the dataset (e.g., "NEMO v4.2.2", etc.) (default is None, which will use the "ocean_component" attribute from the dataset if available). + sea_ice_component : str, optional + Sea ice model component used to produce the dataset (e.g., "CICE v6.1", etc.) (default is None, which will use the "sea_ice_component" attribute from the dataset if available). + biogeochemistry_component : str, optional + Biogeochemistry model component used to produce the dataset (e.g., "PISCES v2", etc.) (default is None, which will use the "biogeochemistry_component" attribute from the dataset if available). + atmosphere_component : str, optional + Atmospheric model component used to produce the dataset (e.g., "UKMO UM Global Atmosphere 7.1", etc.) (default is None, which will use the "atmosphere_component" attribute from the dataset if available). + atmospheric_forcing : str, optional + Atmospheric forcing used to produce the dataset (e.g., "ERA5", "JRA-55", etc.) (default is None, which will use the "atmospheric_forcing" attribute from the dataset if available). variant : str, optional - The simulation variant string for the dataset (default is "r1i1c1f + Configuration variant string for the dataset (default is "r1i1c1f1"). start_date : str, optional - The start date of the dataset in "YYYY-MM-DD" format (default is "1976-01-01"). + Start date of the dataset in "YYYY-MM-DD" format (default is "1976-01-01"). end_date : str, optional - The end date of the dataset in "YYYY-MM-DD" format (default is "2024-12-31"). + End date of the dataset in "YYYY-MM-DD" format (default is "2024-12-31"). bbox : tuple, optional Bounding box for the dataset in the format (min_lon, min_lat, max_lon, max_lat). (default is global coverage). collection : str, optional - The name of the STAC Collection to which this Item belongs (default is "noc-npd-jra55"). 
- config : str, optional - The configuration string for the dataset (default is "eORCA1 JRA55v1 NPD"). - operation : str, optional - The operation string indicating the type of operation performed on the dataset (default is "annual-mean"). - variable_stores : bool, optional - Whether each variable is stored in a separate Zarr store (default is True). + Collection to which this Item belongs (default is "noc-npd-era5"). endpoint_url : str, optional The S3 endpoint URL (default is "https://noc-msm-o.s3-ext.jc.rl.ac.uk"). - zarr_format: int, optional - The Zarr format version (default is 2). + group : str, optional + Group within the Icechunk repository to open (default is None). + anonymous : bool, optional + Whether anonymous access is supported for the S3 asset (default is True). Returns ------- pystac.Item - A STAC Item containing the dataset information and an asset pointing to the data. + STAC Item containing the dataset metadata and associated dataset asset. """ - # Define the item description based on the prefix: - var = f"{prefix.split('/')[-1]} output" if variable_stores else "outputs" - - if 'domain' in prefix: - description = "**Global ocean model domain and mesh mask variables.**" - elif 'I' in prefix: - description = f"**{operation.capitalize()} global sea-ice {var} outputs defined at NEMO model T-points.**" - elif 'S' in prefix: - description = f"**{operation.capitalize()} global ocean scalar {var} outputs.**" - elif 'M' in prefix: - description = f"**{operation.capitalize()} ocean physics transect {var} outputs defined at {prefix.split('/')[-1]}.**" - else: - description = f"**{operation.capitalize()} global ocean physics {var} outputs defined at {prefix[0]}-points.**" - - # Add OceanDataCatalog Access Information to the description: - description += f"\n\n**OceanDataCatalog Access:**\n`catalog.open_dataset(id='{id}')`" + # === Geometry === # + # Collect bounding box from dataset attributes if not provided: + bbox = ds.attrs.get("bbox", "[-180.0, -90.0, 
180.0, 90.0]") if bbox is None else bbox + bbox = [float(bound) for bound in bbox.replace("[", "").replace("]", "").split(",")] # Define Polygon geometry for the item: polygon = Polygon([ @@ -297,46 +391,86 @@ def create_item_with_zarr_asset( # Convert the Polygon to GeoJSON format: geometry = mapping(polygon) - # Create a STAC Item with Asset: + # === Properties === # + # Add OceanDataCatalog Access Information to description: + if description is None: + description = ds.attrs.get("description", "") + description += f"\n\n**OceanDataCatalog Access:**\n`catalog.open_dataset(id='{id}')`" + + # Define start and end datetimes for the Item: + if start_date is None: + start_date = ds.attrs.get("start_date", None) + if start_date is None: + raise ValueError("'start_date' must be provided either as a parameter or as a global dataset attribute.") + if end_date is None: + end_date = ds.attrs.get("end_date", None) + if end_date is None: + raise ValueError("'end_date' must be provided either as a parameter or as a global dataset attribute.") + + # Define standard properties dictionary: + properties={ + "title": ds.attrs.get("title", None) if title is None else title, + "description": description, + "dataset_type": ds.attrs.get("dataset_type", None) if dataset_type is None else dataset_type, + "product_type": ds.attrs.get("product_type", None) if product_type is None else product_type, + "product_version": ds.attrs.get("product_version", None) if product_version is None else product_version, + "institution": ds.attrs.get("institution", None) if institution is None else institution, + "citation": ds.attrs.get("citation", None) if citation is None else citation, + "acknowledgement": ds.attrs.get("acknowledgement", None) if acknowledgement is None else acknowledgement, + "license": ds.attrs.get("license", None) if license is None else license, + "doi": ds.attrs.get("doi", None) if doi is None else doi, + "platform": ds.attrs.get("platform", None) if platform is None else 
platform, + "horizontal_grid_type": ds.attrs.get("horizontal_grid_type", None) if horizontal_grid_type is None else horizontal_grid_type, + "horizontal_grid_resolution": ds.attrs.get("horizontal_grid_resolution", None) if horizontal_grid_resolution is None else horizontal_grid_resolution, + "vertical_grid_type": ds.attrs.get("vertical_grid_type", None) if vertical_grid_type is None else vertical_grid_type, + "vertical_grid_coordinate": ds.attrs.get("vertical_grid_coordinate", None) if vertical_grid_coordinate is None else vertical_grid_coordinate, + "vertical_grid_levels": ds.attrs.get("vertical_grid_levels", None) if vertical_grid_levels is None else vertical_grid_levels, + "dimensions": list(ds.dims), + "variables": list(ds.data_vars), + "variable_standard_names": [ds[var].attrs.get('standard_name', var) for var in ds.data_vars], + "aggregation": ds.attrs.get("aggregation", None) if aggregation is None else aggregation, + "aggregation_frequency": ds.attrs.get("aggregation_frequency", None) if aggregation_frequency is None else aggregation_frequency, + "status": ds.attrs.get("status", None) if status is None else status, + "update_frequency": ds.attrs.get("update_frequency", None) if update_frequency is None else update_frequency, + "latest_data_update": datetime.datetime.now().isoformat(), + } + + if properties["dataset_type"] == "model": + # Append numerical model specific properties: + properties.update({ + "variant": ds.attrs.get("variant", None) if variant is None else variant, + "ocean_component": ds.attrs.get("ocean_component", None) if ocean_component is None else ocean_component, + "sea_ice_component": ds.attrs.get("sea_ice_component", None) if sea_ice_component is None else sea_ice_component, + "biogeochemistry_component": ds.attrs.get("biogeochemistry_component", None) if biogeochemistry_component is None else biogeochemistry_component, + "atmosphere_component": ds.attrs.get("atmosphere_component", None) if atmosphere_component is None else 
atmosphere_component, + "atmospheric_forcing": ds.attrs.get("atmospheric_forcing", None) if atmospheric_forcing is None else atmospheric_forcing, + }) + + # === Create a STAC Item with Asset === # item = pystac.Item( id=id, geometry=geometry, - bbox=list(polygon.bounds), # [min_lon, min_lat, max_lon, max_lat] + bbox=list(polygon.bounds), datetime=datetime.datetime(year=(int(start_date.split("-")[0]) + int(end_date.split("-")[0])) // 2, month=1, day=1), start_datetime=datetime.datetime(year=int(start_date.split("-")[0]), month=int(start_date.split("-")[1]), day=int(start_date.split("-")[2])), end_datetime=datetime.datetime(year=int(end_date.split("-")[0]), month=int(end_date.split("-")[1]), day=int(end_date.split("-")[2])), - properties={ - "title": f"{prefix} Zarr store", - "description": description, - "platform": platform, - "variant": variant, - "variables": list(ds.data_vars), - "variable_standard_names": [ds[var].attrs.get('standard_name', var) for var in ds.data_vars], - "dimensions": list(ds.dims), - "operation": operation.split(" ")[1], - "operation_frequency": operation.split(" ")[0], - "ocean_component": "NEMO v4.2.2", - "sea_ice_component": "SI3 v4.0", - "biogeochemistry_component": None, - "atmosphere_component": None, - "status": "completed", - "latest_data_update": datetime.datetime.now().isoformat(), - }, + properties=properties, collection=collection, ) item.add_asset(key=prefix.split('/')[-1], asset=pystac.Asset( - href=f"https://noc-msm-o.s3-ext.jc.rl.ac.uk/{bucket}/{prefix}", - title=f"{config}: {prefix} Zarr store.", + href=f"{endpoint_url}/{bucket}/{prefix}", + title=ds.attrs.get("title", None) if title is None else title, description=description, - media_type="application/vnd.zarr", + media_type="application/vnd.zarr+icechunk", extra_fields=dict( endpoint_url=endpoint_url, bucket=bucket, prefix=prefix, - zarr_format=zarr_format, - anonymous=True + variant=ds.attrs.get("variant", None) if variant is None else variant, + group=group, + 
anonymous=anonymous ) )) - return item \ No newline at end of file + return item From 61800296a1f72ca1943cf0b4d4403ac083a16702 Mon Sep 17 00:00:00 2001 From: Ollie Tooth Date: Fri, 26 Jun 2026 18:23:16 +0100 Subject: [PATCH 02/10] Refactor noc-npd-era5 collection to remove legacy platform subcatalog & support Item creation from hierarchical Icechunk stores. --- .../catalog/stac/npd_era5_collection.py | 332 ++++++++---------- 1 file changed, 156 insertions(+), 176 deletions(-) diff --git a/OceanDataStore/catalog/stac/npd_era5_collection.py b/OceanDataStore/catalog/stac/npd_era5_collection.py index 45ad1906..61f7c872 100644 --- a/OceanDataStore/catalog/stac/npd_era5_collection.py +++ b/OceanDataStore/catalog/stac/npd_era5_collection.py @@ -9,12 +9,65 @@ - Ollie Tooth (oliver.tooth@noc.ac.uk) """ # -- Import Python Modules -- # -import logging -import pystac import datetime +import logging -from OceanDataStore.catalog.stac.utils import open_icechunk_store -from OceanDataStore.catalog.stac.utils import create_item_with_icechunk_asset +import pystac +import xarray as xr + +from OceanDataStore.catalog.stac.utils import ( + create_item_with_icechunk_asset, + open_icechunk_store, +) + + +def description_from_prefix(prefix: str, ds: xr.Dataset) -> str: + """ + Define Item description based on the prefix and Dataset attributes. + + Parameters: + ---------- + prefix : str + Prefix of Icechunk repository, + + ds : xr.Dataset + Dataset opened from Icechunk repository, which contains global attributes + 'aggregation_frequency' and 'aggregation' used to define Item description. + + Returns: + ------- + description : str + Description of NEMO model output Item. 
+ """ + # --- Validate input arguments --- # + if not isinstance(prefix, str): + raise TypeError("'prefix' must be a string.") + if not isinstance(ds, xr.Dataset): + raise TypeError("'ds' must be an xarray.Dataset.") + + + # --- --- Define the item description based on the prefix --- # + if 'domain' in prefix: + description = "**Global ocean model domain and mesh mask variables.**" + elif 'T' in prefix: + description = f"**{ds.attrs.get('aggregation_frequency', 'monthly').capitalize()} {ds.attrs.get('aggregation', 'mean')} global ocean scalar outputs defined at {prefix[0]}-points.**" + elif 'U' in prefix: + description = f"**{ds.attrs.get('aggregation_frequency', 'monthly').capitalize()} {ds.attrs.get('aggregation', 'mean')} global ocean zonal vector outputs defined at {prefix[0]}-points.**" + elif 'V' in prefix: + description = f"**{ds.attrs.get('aggregation_frequency', 'monthly').capitalize()} {ds.attrs.get('aggregation', 'mean')} global ocean meridional vector outputs defined at {prefix[0]}-points.**" + elif 'W' in prefix: + description = f"**{ds.attrs.get('aggregation_frequency', 'monthly').capitalize()} {ds.attrs.get('aggregation', 'mean')} global ocean vertical vector outputs defined at {prefix[0]}-points.**" + elif 'I' in prefix: + description = f"**{ds.attrs.get('aggregation_frequency', 'monthly').capitalize()} {ds.attrs.get('aggregation', 'mean')} global sea-ice outputs defined at T-points.**" + elif 'S' in prefix: + description = f"**{ds.attrs.get('aggregation_frequency', 'monthly').capitalize()} {ds.attrs.get('aggregation', 'mean')} global ocean scalar outputs.**" + elif 'M' in prefix: + description = f"**{ds.attrs.get('aggregation_frequency', 'monthly').capitalize()} {ds.attrs.get('aggregation', 'mean')} ocean physics transect outputs defined at {prefix.split('/')[-1]}.**" + else: + raise ValueError(f"Unable to determine variable type from prefix: {prefix}") + + + return description def create_npd_era5_collection() -> pystac.Collection: @@ -31,7 +84,7 
@@ def create_npd_era5_collection() -> pystac.Collection: spatial_extent = pystac.SpatialExtent(bboxes=[[-180.0, -90.0, 0, 180.0, 90.0, 6000]]) # Define the current temporal extent for the collection: - collection_interval = sorted([datetime.datetime(year=1976, month=1, day=1), datetime.datetime(year=2025, month=7, day=31)]) + collection_interval = sorted([datetime.datetime(year=1976, month=1, day=1), datetime.datetime(year=2026, month=5, day=15)]) temporal_extent = pystac.TemporalExtent(intervals=[collection_interval]) # Define the Near-Present Day Collection: @@ -42,11 +95,11 @@ def create_npd_era5_collection() -> pystac.Collection: extent=pystac.Extent(spatial=spatial_extent, temporal=temporal_extent), # Open Government License (OGL) - UK version 3.0 - http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/ license="OGL-UK-3.0", - extra_fields=dict(contact="Ollie Tooth (oliver.tooth@noc.ac.uk)", project="AtlantiS", status="ongoing", update_frequency="biannual", last_data_update="2025-07-31"), + extra_fields=dict(contact="Ollie Tooth (oliver.tooth@noc.ac.uk)", project="AtlantiS", status="ongoing", update_frequency="biannual", last_data_update="2026-06-10"), keywords=["NOC", "Near-Present Day", "AtlantiS", "hindcast", "global", "model", "ocean", "sea-ice"], providers=[ pystac.Provider( - name="National Oceanography Centre", + name="National Oceanography Centre (NOC)", description="National Oceanography Centre (United Kingdom) - Ocean Modelling Group.", roles=[pystac.ProviderRole.PRODUCER, pystac.ProviderRole.LICENSOR], url="https://noc-msm.github.io/NOC_Near_Present_Day/" @@ -60,7 +113,7 @@ def create_npd_era5_collection() -> pystac.Collection: ], ) - logging.info(f"Completed: Created NOC STAC Collection with ID: {npd_collection.id}") + logging.info(f"Completed: Created STAC Collection with ID: {npd_collection.id}") # ==== Define NOC Near-Present Day Model Configuration Catalogs ==== # npd_eorca1_era5v1 = pystac.Catalog( @@ -69,7 +122,7 @@ def 
create_npd_era5_collection() -> pystac.Collection: description="Catalog of eORCA1 ERA-5 Near-Present Day ocean sea-ice simulations performed by the National Oceanography Centre." ) - logging.info(f"Completed: Created NOC STAC Catalog with ID: {npd_eorca1_era5v1.id}") + logging.info(f"-> Completed: Created STAC Catalog with ID: {npd_eorca1_era5v1.id}") npd_eorca025_era5v1 = pystac.Catalog( id="npd-eorca025-era5v1", @@ -77,7 +130,7 @@ def create_npd_era5_collection() -> pystac.Collection: description="Catalog of eORCA025 ERA-5 Near-Present Day ocean sea-ice simulations performed by the National Oceanography Centre.", ) - logging.info(f"Completed: Created NOC STAC Catalog with ID: {npd_eorca025_era5v1.id}") + logging.info(f"-> Completed: Created STAC Catalog with ID: {npd_eorca025_era5v1.id}") npd_eorca12_era5v1 = pystac.Catalog( id="npd-eorca12-era5v1", @@ -85,7 +138,7 @@ def create_npd_era5_collection() -> pystac.Collection: description="Catalog of eORCA12 ERA-5 Near-Present Day ocean sea-ice simulations performed by the National Oceanography Centre.", ) - logging.info(f"Completed: Created NOC STAC Catalog with ID: {npd_eorca12_era5v1.id}") + logging.info(f"-> Completed: Created STAC Catalog with ID: {npd_eorca12_era5v1.id}") # ==== Define NOC Near-Present Day Model Variant Catalogs ==== # r1i1c1f1_eorca1_era5v1 = pystac.Catalog( @@ -94,7 +147,7 @@ def create_npd_era5_collection() -> pystac.Collection: description="Catalog of eORCA1 ERA-5 Near-Present Day ocean physics & sea-ice outputs for model variant: r1i1c1f1.\n\n**Variant Label:**\n\nRealisation=1, Initialisation=1, Configuration=1, Forcing=1." 
) - logging.info(f"Completed: Created NOC STAC Catalog with ID: {r1i1c1f1_eorca1_era5v1.id}") + logging.info(f"-> Completed: Created STAC Catalog with ID: {r1i1c1f1_eorca1_era5v1.id}") r1i1c1f1_eorca025_era5v1 = pystac.Catalog( id="r1i1c1f1", @@ -102,7 +155,7 @@ def create_npd_era5_collection() -> pystac.Collection: description="Catalog of eORCA025 ERA-5 Near-Present Day ocean physics & sea-ice outputs for model variant: r1i1c1f1.\n\n**Variant Label:**\n\nRealisation=1, Initialisation=1, Configuration=1, Forcing=1." ) - logging.info(f"Completed: Created NOC STAC Catalog with ID: {r1i1c1f1_eorca025_era5v1.id}") + logging.info(f"-> Completed: Created STAC Catalog with ID: {r1i1c1f1_eorca025_era5v1.id}") r1i1c1f1_eorca12_era5v1 = pystac.Catalog( id="r1i1c1f1", @@ -110,126 +163,59 @@ def create_npd_era5_collection() -> pystac.Collection: description="Catalog of eORCA12 ERA-5 Near-Present Day ocean physics & sea-ice outputs for model variant: r1i1c1f1.\n\n**Variant Label:**\n\nRealisation=1, Initialisation=1, Configuration=1, Forcing=1." ) - logging.info(f"Completed: Created NOC STAC Catalog with ID: {r1i1c1f1_eorca12_era5v1.id}") - - # ==== Define NOC Near-Present Day Platform Sub-Catalogs ==== # - # Note: Options for platforms are: "gn", "gr", "tn", "tr". - # where gn = native model grids, gr = regridded grids, tn = transects on native model grids, tr = transects on regridded grids. - - gn_eorca1_era5v1 = pystac.Catalog( - id="gn", - title="eORCA1 ERA5v1 NPD: Global Native Model Grid Catalog", - description="Catalog of global ocean physics & sea-ice outputs stored on the native eORCA1 curvilinear NEMO model grid." - ) - - logging.info(f"Completed: Created NOC STAC Nested Catalog with ID: {gn_eorca1_era5v1.id}") - - gn_eorca025_era5v1 = pystac.Catalog( - id="gn", - title="eORCA025 ERA5v1 NPD: Global Native Model Grid Catalog", - description="Catalog of global ocean physics & sea-ice outputs stored on the native eORCA025 curvilinear NEMO model grid." 
- ) - - logging.info(f"Completed: Created NOC STAC Nested Catalog with ID: {gn_eorca025_era5v1.id}") - - gn_eorca12_era5v1 = pystac.Catalog( - id="gn", - title="eORCA12 ERA5v1 NPD: Global Native Model Grid Catalog", - description="Catalog of global ocean physics & sea-ice outputs stored on the native eORCA12 curvilinear NEMO model grid." - ) - - logging.info(f"Completed: Created NOC STAC Nested Catalog with ID: {gn_eorca12_era5v1.id}") - - tn_eorca1_era5v1 = pystac.Catalog( - id="tn", - title="eORCA1 ERA5v1 NPD: Transect Catalog", - description="Catalog of ocean physics transect outputs defined on the native eORCA1 curvilinear NEMO model grid." - ) - - logging.info(f"Completed: Created NOC STAC Nested Catalog with ID: {tn_eorca1_era5v1.id}") - - tn_eorca025_era5v1 = pystac.Catalog( - id="tn", - title="eORCA025 ERA5v1 NPD: Transect Catalog", - description="Catalog of ocean physics transect outputs defined on the native eORCA025 curvilinear NEMO model grid." - ) - - logging.info(f"Completed: Created NOC STAC Nested Catalog with ID: {tn_eorca025_era5v1.id}") - - tn_eorca12_era5v1 = pystac.Catalog( - id="tn", - title="eORCA12 ERA5v1 NPD: Transect Catalog", - description="Catalog of ocean physics transect outputs defined on the native eORCA12 curvilinear NEMO model grid." 
- ) - - logging.info(f"Completed: Created NOC STAC Nested Catalog with ID: {tn_eorca12_era5v1.id}") + logging.info(f"-> Completed: Created STAC Catalog with ID: {r1i1c1f1_eorca12_era5v1.id}") # -- Add Items to NOC Near-Present Day eORCA1 ERA5v1 {gn} Sub-Catalog -- # # Define the store credentials for the eORCA1 ERA5v1 NPD data: bucket = "npd-eorca1-era5v1" - variant = "r1i1c1f1" for prefix in ["T1y", "U1y", "V1y", "W1y", "I1y", "S1y", "T1m", "U1m", "V1m", "W1m", "I1m", "S1m", "domain/domain_cfg" ]: # Open dataset from Icechunk repository: ds = open_icechunk_store(bucket=bucket, prefix=prefix, branch="main") - # Create item with asset for each eORCA1 ERA5v1 NPD prefix: - if 'domain' in prefix: - operation = "None None" - elif '1y' in prefix: - operation = "annual mean" - elif '1m' in prefix: - operation = "monthly mean" - elif '5d' in prefix: - operation = "5-day mean" - item = create_item_with_icechunk_asset( - id=f"noc-npd-era5/{bucket}/{variant}/gn/{prefix}", ds=ds, + id=f"noc-npd-era5/{bucket}/{ds.attrs.get('variant', 'r1i1c1f1')}/{prefix}", bucket=bucket, - platform="gn", prefix=prefix, - variant=variant, + title=f"NPD eORCA1 ERA5v1 {prefix}", + description=description_from_prefix(prefix=prefix, ds=ds), start_date="1976-01-01", - end_date="2025-07-31", - config="eORCA1 ERA5v1 NPD", - operation=operation, - ) + end_date="2026-05-15", + collection=bucket + ) # Add item to the eORCA1 ERA5v1 NPD global native model grid catalog: - gn_eorca1_era5v1.add_item(item) + r1i1c1f1_eorca1_era5v1.add_item(item) - logging.info(f"Completed: Added Items to NOC STAC Catalog with ID: {gn_eorca1_era5v1.id}") + logging.info(f"-> Completed: Added Items to STAC Catalog with ID: {r1i1c1f1_eorca1_era5v1.id}") # -- Add Items to NOC Near-Present Day eORCA1 ERA5v1 {tn} Sub-Catalog -- # # Define the store credentials for the eORCA1 ERA5v1 NPD data: for prefix in ["M1m/MOVE_16N", "M1m/SAMBA_34_5S", "M1m/RAPID_26N", "M1m/OSNAP"]: # Open dataset from Icechunk repository: ds = 
open_icechunk_store(bucket=bucket, prefix=prefix, branch="main") - - # Create item with asset for each eORCA1 ERA5v1 NPD prefix: - operation = "monthly mean" + item = create_item_with_icechunk_asset( - id=f"noc-npd-era5/{bucket}/{variant}/tn/{prefix}", ds=ds, + id=f"noc-npd-era5/{bucket}/{ds.attrs.get('variant', 'r1i1c1f1')}/{prefix}", bucket=bucket, - platform="tn", prefix=prefix, - variant=variant, + title=f"NPD eORCA1 ERA5v1 {prefix}", + description=description_from_prefix(prefix=prefix, ds=ds), + platform=ds.attrs.get('platform', 'tn'), start_date="1976-01-01", - end_date="2025-07-31", - config="eORCA1 ERA5v1 NPD", - operation=operation, - ) + end_date="2026-05-15", + collection=bucket + ) # Add item to the eORCA1 ERA5v1 NPD transect catalog: - tn_eorca1_era5v1.add_item(item) + r1i1c1f1_eorca1_era5v1.add_item(item) - logging.info(f"Completed: Added Items to NOC STAC Catalog with ID: {tn_eorca1_era5v1.id}") + logging.info(f"-> Completed: Added Items to STAC Catalog with ID: {r1i1c1f1_eorca1_era5v1.id}") # -- Add Items to NOC Near-Present Day eORCA025 ERA5v1 {gn} Sub-Catalog -- # # Define the store credentials for the eORCA025 ERA5v1 NPD data: bucket = "npd-eorca025-era5v1" - variant = "r1i1c1f1" for prefix in ["T1y_3d", "T1y_4d", "U1y_3d", "U1y_4d", "V1y_3d", "V1y_4d", "W1y_4d", "I1y_3d", "S1y_1d", "T1m_3d", "T1m_4d", "U1m_3d", "U1m_4d", "V1m_3d", "V1m_4d", "W1m_4d", "I1m_3d", "S1m_1d", "T5d_3d", "T5d_4d", "U5d_3d", "U5d_4d", "V5d_3d", "V5d_4d", "I5d_3d", "S5d_1d", @@ -238,32 +224,21 @@ def create_npd_era5_collection() -> pystac.Collection: # Open dataset from Icechunk repository: ds = open_icechunk_store(bucket=bucket, prefix=prefix, branch="main") - # Create item with asset for each eORCA025 ERA5v1 NPD prefix: - if 'domain' in prefix: - operation = "None None" - elif '1y' in prefix: - operation = "annual mean" - elif '1m' in prefix: - operation = "monthly mean" - elif '5d' in prefix: - operation = "5-day mean" - item = create_item_with_icechunk_asset( - 
id=f"noc-npd-era5/{bucket}/{variant}/gn/{prefix}", ds=ds, + id=f"noc-npd-era5/{bucket}/{ds.attrs.get('variant', 'r1i1c1f1')}/{prefix}", bucket=bucket, - platform="gn", prefix=prefix, - variant=variant, + title=f"NPD eORCA025 ERA5v1 {prefix}", + description=description_from_prefix(prefix=prefix, ds=ds), start_date="1976-01-01", - end_date="2025-07-31", - config="eORCA025 ERA5v1 NPD", - operation=operation, - ) + end_date="2026-05-15", + collection=bucket + ) # Add item to the eORCA025 ERA5v1 NPD global native model grid catalog: - gn_eorca025_era5v1.add_item(item) + r1i1c1f1_eorca025_era5v1.add_item(item) - logging.info(f"Completed: Added Items to NOC STAC Catalog with ID: {gn_eorca025_era5v1.id}") + logging.info(f"-> Completed: Added Items to STAC Catalog with ID: {r1i1c1f1_eorca025_era5v1.id}") # -- Add Items to NOC Near-Present Day eORCA025 ERA5v1 {tn} Sub-Catalog -- # # Define the store credentials for the eORCA025 ERA5v1 NPD data: @@ -271,69 +246,85 @@ def create_npd_era5_collection() -> pystac.Collection: # Open dataset from Icechunk repository: ds = open_icechunk_store(bucket=bucket, prefix=prefix, branch="main") - # Create item with asset for each eORCA025 ERA5v1 NPD prefix: - operation = "monthly mean" item = create_item_with_icechunk_asset( - id=f"noc-npd-era5/{bucket}/{variant}/tn/{prefix}", ds=ds, + id=f"noc-npd-era5/{bucket}/{ds.attrs.get('variant', 'r1i1c1f1')}/{prefix}", bucket=bucket, - platform="tn", prefix=prefix, - variant=variant, + title=f"NPD eORCA025 ERA5v1 {prefix}", + description=description_from_prefix(prefix=prefix, ds=ds), + platform=ds.attrs.get('platform', 'tn'), start_date="1976-01-01", - end_date="2025-07-31", - config="eORCA025 ERA5v1 NPD", - operation=operation, - ) + end_date="2026-05-15", + collection=bucket + ) # Add item to the eORCA025 ERA5v1 NPD transect catalog: - tn_eorca025_era5v1.add_item(item) + r1i1c1f1_eorca025_era5v1.add_item(item) - logging.info(f"Completed: Added Items to NOC STAC Catalog with ID: 
{tn_eorca025_era5v1.id}") + logging.info(f"-> Completed: Added Items to STAC Catalog with ID: {r1i1c1f1_eorca025_era5v1.id}") # -- Add Items to NOC Near-Present Day eORCA12 ERA5v1 Sub-Catalog -- # # Define the store credentials for the eORCA12 ERA5v1 NPD data: bucket = "npd-eorca12-era5v1" - variant = "r1i1c1f1" + + # Add annual-mean and monthly mean Icechunk repositories: for prefix in ["T1y_3d", "T1y_4d", "U1y_3d", "U1y_4d", "V1y_3d", "V1y_4d", "W1y_4d", "I1y_3d", "S1y_1d", "T1m_3d", "T1m_4d", "U1m_3d", "U1m_4d", "V1m_3d", "V1m_4d", "W1m_4d", "I1m_3d", "S1m_1d", - "T5d_3d", "T5d_4d", "U5d_3d", "U5d_4d", "V5d_3d", "V5d_4d", "I5d_3d", "S5d_1d", "domain/domain_cfg", ]: # Open dataset from Icechunk repository: ds = open_icechunk_store(bucket=bucket, prefix=prefix, branch="main") - # Create item with asset for each eORCA12 ERA5v1 NPD prefix: - if 'domain' in prefix: - operation = "None None" - elif '1y' in prefix: - operation = "annual mean" - start_date="1976-01-01" - end_date="2025-06-30" - elif '1m' in prefix: - operation = "monthly mean" - start_date="1976-01-01" - end_date="2025-06-30" - elif '5d' in prefix: - operation = "5-day mean" - start_date="1990-01-01" - end_date="2025-06-30" - item = create_item_with_icechunk_asset( - id=f"noc-npd-era5/{bucket}/{variant}/gn/{prefix}", ds=ds, + id=f"noc-npd-era5/{bucket}/{ds.attrs.get('variant', 'r1i1c1f1')}/{prefix}", bucket=bucket, - platform="gn", prefix=prefix, - variant=variant, - start_date=start_date, - end_date=end_date, - config="eORCA12 ERA5v1 NPD", - operation=operation, + title=f"NPD eORCA12 ERA5v1 {prefix}", + description=description_from_prefix(prefix=prefix, ds=ds), + start_date="1976-01-01", + end_date="2026-05-15", + collection=bucket + ) + # Add item to the eORCA12 ERA5v1 NPD global native model grid catalog: + r1i1c1f1_eorca12_era5v1.add_item(item) + + # Add root group for hierarchical Icechunk repository: + ds = open_icechunk_store(bucket=bucket, prefix="eorca12-era5v1-5d", branch="main") + item = 
create_item_with_icechunk_asset( + ds=ds, + id=f"noc-npd-era5/{bucket}/{ds.attrs.get('variant', 'r1i1c1f1')}/eorca12-era5v1-5d", + bucket=bucket, + prefix="eorca12-era5v1-5d", + title="NPD eORCA12 ERA5v1 5d", + description="**5-day mean global ocean outputs.**", + start_date="1990-01-01", + end_date="2024-12-31", + collection=bucket, ) + # Add item to the eORCA12 ERA5v1 NPD global native model grid catalog: + r1i1c1f1_eorca12_era5v1.add_item(item) + + # Add individual groups in hierarchical Icechunk repository: + for prefix in ["T5d", "U5d", "V5d"]: + # Open dataset from Icechunk repository: + ds = open_icechunk_store(bucket=bucket, prefix="eorca12-era5v1-5d", branch="main", group=f"grid{prefix[0]}") + item = create_item_with_icechunk_asset( + ds=ds, + id=f"noc-npd-era5/{bucket}/{ds.attrs.get('variant', 'r1i1c1f1')}/{prefix}", + bucket=bucket, + prefix="eorca12-era5v1-5d", + title=f"NPD eORCA12 ERA5v1 {prefix}", + description=f"**5-day mean global ocean {'scalar' if prefix[0] == 'T' else 'vector'} outputs defined at {prefix[0]}-points.**", + start_date="1990-01-01", + end_date="2024-12-31", + collection=bucket, + group=f"grid{prefix[0]}" + ) # Add item to the eORCA12 ERA5v1 NPD global native model grid catalog: - gn_eorca12_era5v1.add_item(item) + r1i1c1f1_eorca12_era5v1.add_item(item) - logging.info(f"Completed: Added Items to NOC STAC Catalog with ID: {gn_eorca12_era5v1.id}") + logging.info(f"-> Completed: Added Items to STAC Catalog with ID: {r1i1c1f1_eorca12_era5v1.id}") # -- Add Items to NOC Near-Present Day eORCA12 ERA5v1 {tn} Sub-Catalog -- # # Define the store credentials for the eORCA12 ERA5v1 NPD data: @@ -341,35 +332,24 @@ def create_npd_era5_collection() -> pystac.Collection: # Open dataset from Icechunk repository: ds = open_icechunk_store(bucket=bucket, prefix=prefix, branch="main") - # Create item with asset for each eORCA12 ERA5v1 NPD prefix: - operation = "monthly mean" item = create_item_with_icechunk_asset( - 
id=f"noc-npd-era5/{bucket}/{variant}/tn/{prefix}", ds=ds, + id=f"noc-npd-era5/{bucket}/{ds.attrs.get('variant', 'r1i1c1f1')}/{prefix}", bucket=bucket, - platform="tn", prefix=prefix, - variant=variant, + title=f"NPD eORCA12 ERA5v1 {prefix}", + description=description_from_prefix(prefix=prefix, ds=ds), + platform=ds.attrs.get('platform', 'tn'), start_date="1976-01-01", - end_date="2024-12-31", - config="eORCA12 ERA5v1 NPD", - operation=operation, - ) + end_date="2026-05-15", + collection=bucket + ) # Add item to the eORCA12 ERA5v1 NPD transect catalog: - tn_eorca12_era5v1.add_item(item) + r1i1c1f1_eorca12_era5v1.add_item(item) - logging.info(f"Completed: Added Items to NOC STAC Catalog with ID: {tn_eorca12_era5v1.id}") + logging.info(f"-> Completed: Added Items to STAC Catalog with ID: {r1i1c1f1_eorca12_era5v1.id}") # ==== Add Nested Catalogs to NOC Near-Present Day Collection ==== # - # Global Native Model Grid Catalogs -> Model Simulation Variant Catalogs: - r1i1c1f1_eorca1_era5v1.add_child(gn_eorca1_era5v1) - r1i1c1f1_eorca025_era5v1.add_child(gn_eorca025_era5v1) - r1i1c1f1_eorca12_era5v1.add_child(gn_eorca12_era5v1) - - # Transect Catalogs -> Model Simulation Variant Catalogs: - r1i1c1f1_eorca1_era5v1.add_child(tn_eorca1_era5v1) - r1i1c1f1_eorca025_era5v1.add_child(tn_eorca025_era5v1) - r1i1c1f1_eorca12_era5v1.add_child(tn_eorca12_era5v1) # Model Simulation Variant Catalogs -> Model Simulation Catalogs: npd_eorca1_era5v1.add_child(r1i1c1f1_eorca1_era5v1) From 4a3de03a67860128662c0fc7dc1541453a153327 Mon Sep 17 00:00:00 2001 From: Ollie Tooth Date: Fri, 26 Jun 2026 18:24:13 +0100 Subject: [PATCH 03/10] Refactor noc-npd-jra55 & noc-rapid-evolution collections to remove legacy platform subcatalogs. 
--- .../catalog/stac/npd_jra55_collection.py | 61 +++++------- .../catalog/stac/rapid_evo_collection.py | 92 +++++++------------ 2 files changed, 54 insertions(+), 99 deletions(-) diff --git a/OceanDataStore/catalog/stac/npd_jra55_collection.py b/OceanDataStore/catalog/stac/npd_jra55_collection.py index 22def9a4..b416756a 100644 --- a/OceanDataStore/catalog/stac/npd_jra55_collection.py +++ b/OceanDataStore/catalog/stac/npd_jra55_collection.py @@ -48,7 +48,7 @@ def create_npd_jra55_collection( keywords=["NOC", "JRA55-do", "Near-Present Day", "AtlantiS", "hindcast", "global", "model", "ocean", "sea-ice"], providers=[ pystac.Provider( - name="National Oceanography Centre", + name="National Oceanography Centre (NOC)", description="National Oceanography Centre (United Kingdom) - Ocean Modelling Group.", roles=[pystac.ProviderRole.PRODUCER, pystac.ProviderRole.LICENSOR], url="https://noc-msm.github.io/NOC_Near_Present_Day/" @@ -71,7 +71,7 @@ def create_npd_jra55_collection( description="Catalog of eORCA1 JRA55-do Near-Present Day ocean sea-ice simulations performed by the National Oceanography Centre." 
) - logging.info(f"Completed: Created NOC STAC Catalog with ID: {npd_eorca1_jra55v1.id}") + logging.info(f"Completed: Created STAC Catalog with ID: {npd_eorca1_jra55v1.id}") npd_eorca025_jra55v1 = pystac.Catalog( id="npd-eorca025-jra55v1", @@ -79,7 +79,7 @@ def create_npd_jra55_collection( description="Catalog of eORCA025 JRA55-do Near-Present Day ocean sea-ice simulations performed by the National Oceanography Centre.", ) - logging.info(f"Completed: Created NOC STAC Catalog with ID: {npd_eorca025_jra55v1.id}") + logging.info(f"Completed: Created STAC Catalog with ID: {npd_eorca025_jra55v1.id}") # ==== Define NOC Near-Present Day Model Variant Catalogs ==== # r1i1c1f1_eorca1_jra55v1 = pystac.Catalog( @@ -88,7 +88,7 @@ def create_npd_jra55_collection( description="Catalog of eORCA1 JRA55-do Near-Present Day ocean physics & sea-ice outputs for model variant: r1i1c1f1.\n\n**Variant Label:**\n\nRealisation=1, Initialisation=1, Configuration=1, Forcing=1." ) - logging.info(f"Completed: Created NOC STAC Catalog with ID: {r1i1c1f1_eorca1_jra55v1.id}") + logging.info(f"Completed: Created STAC Catalog with ID: {r1i1c1f1_eorca1_jra55v1.id}") r1i1c1f1_eorca025_jra55v1 = pystac.Catalog( id="r1i1c1f1", @@ -96,27 +96,7 @@ def create_npd_jra55_collection( description="Catalog of eORCA025 JRA55-do Near-Present Day ocean physics & sea-ice outputs for model variant: r1i1c1f1.\n\n**Variant Label:**\n\nRealisation=1, Initialisation=1, Configuration=1, Forcing=1.", ) - logging.info(f"Completed: Created NOC STAC Catalog with ID: {r1i1c1f1_eorca025_jra55v1.id}") - - # ==== Define NOC Near-Present Day Platform Sub-Catalogs ==== # - # Note: Options for platforms are: "gn", "gr", "tn", "tr". - # where gn = native model grids, gr = regridded grids, tn = transects on native model grids, tr = transects on regridded grids. 
- - gn_eorca1_jra55v1 = pystac.Catalog( - id="gn", - title="eORCA1 JRA55v1 NPD: Global Native Model Grid Catalog", - description="Catalog of global ocean physics & sea-ice outputs stored on the native eORCA1 curvilinear NEMO model grid." - ) - - logging.info(f"Completed: Created NOC STAC Nested Catalog with ID: {gn_eorca1_jra55v1.id}") - - gn_eorca025_jra55v1 = pystac.Catalog( - id="gn", - title="eORCA025 JRA55v1 NPD: Global Native Model Grid Catalog", - description="Catalog of global ocean physics & sea-ice outputs stored on the native eORCA025 curvilinear NEMO model grid." - ) - - logging.info(f"Completed: Created NOC STAC Nested Catalog with ID: {gn_eorca025_jra55v1.id}") + logging.info(f"Completed: Created STAC Catalog with ID: {r1i1c1f1_eorca025_jra55v1.id}") # -- Add Items to NOC Near-Present Day eORCA1 JRA55v1 {gn} Sub-Catalog -- # # Define url & bucket for eORCA1 JRA55v1 NPD data: @@ -141,22 +121,24 @@ def create_npd_jra55_collection( operation = "5-day mean" item = create_item_with_zarr_asset( - id=f"noc-npd-jra55/{bucket}/{variant}/gn/{prefix}", + id=f"noc-npd-jra55/{bucket}/{variant}/{prefix}", ds=ds, bucket=bucket, - platform="gn", prefix=prefix, + title=f"NPD eORCA1 JRA55v1 {prefix}", + platform="gn", + horizontal_grid_resolution="1 degree", variant=variant, start_date="1976-01-01", end_date="2024-01-31", - config="eORCA1 JRA55v1 NPD", operation=operation, - zarr_format=3 + zarr_format=3, + variable_stores=False, ) # Add item to the eORCA1 JRA55v1 NPD global native model grid catalog: - gn_eorca1_jra55v1.add_item(item) + r1i1c1f1_eorca1_jra55v1.add_item(item) - logging.info(f"Completed: Added Items to NOC STAC Catalog with ID: {gn_eorca1_jra55v1.id}") + logging.info(f"Completed: Added Items to STAC Catalog with ID: {r1i1c1f1_eorca1_jra55v1.id}") # -- Add Items to NOC Near-Present Day eORCA025 JRA55v1 {gn} Sub-Catalog -- # # Define url & bucket for eORCA025 JRA55v1 NPD data: @@ -182,27 +164,26 @@ def create_npd_jra55_collection( operation = "5-day mean" 
item = create_item_with_zarr_asset( - id=f"noc-npd-jra55/{bucket}/{variant}/gn/{prefix}", + id=f"noc-npd-jra55/{bucket}/{variant}/{prefix}", ds=ds, bucket=bucket, - platform="gn", prefix=prefix, + title=f"NPD eORCA025 JRA55v1 {prefix}", + platform="gn", + horizontal_grid_resolution="1/4 degree", variant=variant, start_date="1976-01-01", end_date="2024-01-31", - config="eORCA025 JRA55v1 NPD", operation=operation, - zarr_format=3 + zarr_format=3, + variable_stores=False ) # Add item to the eORCA025 JRA55v1 NPD global native model grid catalog: - gn_eorca025_jra55v1.add_item(item) + r1i1c1f1_eorca025_jra55v1.add_item(item) - logging.info(f"Completed: Added Items to NOC STAC Catalog with ID: {gn_eorca025_jra55v1.id}") + logging.info(f"Completed: Added Items to STAC Catalog with ID: {r1i1c1f1_eorca025_jra55v1.id}") # ==== Add Nested Catalogs to NOC Near-Present Day Collection ==== # - # Global Native Model Grid Catalogs -> Model Simulation Variant Catalogs: - r1i1c1f1_eorca1_jra55v1.add_child(gn_eorca1_jra55v1) - r1i1c1f1_eorca025_jra55v1.add_child(gn_eorca025_jra55v1) # Model Simulation Variant Catalogs -> Model Simulation Catalogs: npd_eorca1_jra55v1.add_child(r1i1c1f1_eorca1_jra55v1) diff --git a/OceanDataStore/catalog/stac/rapid_evo_collection.py b/OceanDataStore/catalog/stac/rapid_evo_collection.py index 9e356c1d..c7482f0d 100644 --- a/OceanDataStore/catalog/stac/rapid_evo_collection.py +++ b/OceanDataStore/catalog/stac/rapid_evo_collection.py @@ -19,7 +19,7 @@ from OceanDataStore.catalog.stac.utils import create_item_with_zarr_asset def create_rapid_evo_collection( - credentials_json: str = "/dssgfs01/working/otooth/AtlantiS/jasmin_os/credentials/rapid_evo_credentials.json" + credentials_json: str ) -> pystac.Collection: """ Create the NOC Rapid Evolution STAC Collection. 
@@ -63,7 +63,7 @@ def create_rapid_evo_collection( keywords=["NOC", "Rapid Evolution", "hindcast", "global", "nested", "model", "ocean", "sea-ice"], providers=[ pystac.Provider( - name="National Oceanography Centre", + name="National Oceanography Centre (NOC)", description="National Oceanography Centre (United Kingdom) - Ocean Modelling Group.", roles=[pystac.ProviderRole.PRODUCER, pystac.ProviderRole.LICENSOR], url="https://rapid.ac.uk/rapid-evolution" @@ -77,7 +77,7 @@ def create_rapid_evo_collection( ], ) - logging.info(f"Completed: Created NOC STAC Collection with ID: {rapid_evo_collection.id}") + logging.info(f"Completed: Created STAC Collection: {rapid_evo_collection}") # -- Define NOC RAPID-Evolution Model Configuration Catalogs -- # r_evo_eorca025 = pystac.Catalog( @@ -86,7 +86,7 @@ def create_rapid_evo_collection( description="Catalog of eORCA025 JRA55-do global parent domain outputs from the RAPID-Evolution ocean physics simulation performed by the National Oceanography Centre." 
) - logging.info(f"Completed: Created NOC STAC Catalog with ID: {r_evo_eorca025.id}") + logging.info(f"Completed: Created STAC Catalog: {r_evo_eorca025}") r_evo_rapid12 = pystac.Catalog( id="r_evo_rapid12", @@ -94,7 +94,7 @@ def create_rapid_evo_collection( description="Catalog of RAPID12 JRA55-do nested child domain outputs from the RAPID-Evolution ocean physics simulation performed by the National Oceanography Centre.", ) - logging.info(f"Completed: Created NOC STAC Catalog with ID: {r_evo_rapid12.id}") + logging.info(f"Completed: Created STAC Catalog: {r_evo_rapid12}") r_evo_rapid36 = pystac.Catalog( id="r_evo_rapid36", @@ -102,35 +102,7 @@ def create_rapid_evo_collection( description="Catalog of RAPID36 JRA55-do nested grandchild domain outputs from the RAPID-Evolution ocean physics simulation performed by the National Oceanography Centre.", ) - logging.info(f"Completed: Created NOC STAC Catalog with ID: {r_evo_rapid36.id}") - - # Define NOC RAPID-Evolution Platform Sub-Catalogs -- # - # Note: Options for platforms are: "gn", "gr", "tn", "tr". - # where gn = native model grids, gr = regridded grids, tn = transects on native model grids, tr = transects on regridded grids. - - gn_eorca025 = pystac.Catalog( - id="gn", - title="RAPID12 JRA55-do global parent domain native model grid Catalog", - description="Catalog of global ocean physics outputs stored on the native global eORCA025 curvilinear model grid." - ) - - logging.info(f"Completed: Created NOC STAC Nested Catalog with ID: {gn_eorca025.id}") - - gn_rapid12 = pystac.Catalog( - id="gn", - title="RAPID12 JRA55-do nested child domain native model grid Catalog", - description="Catalog of ocean physics outputs stored on the native nested RAPID12 curvilinear model grid." 
- ) - - logging.info(f"Completed: Created NOC STAC Nested Catalog with ID: {gn_rapid12.id}") - - gn_rapid36 = pystac.Catalog( - id="gn", - title="RAPID36 JRA55-do nested grandchild domain native model grid Catalog", - description="Catalog of ocean physics outputs stored on the native nested RAPID36 curvilinear model grid." - ) - - logging.info(f"Completed: Created NOC STAC Nested Catalog with ID: {gn_rapid36.id}") + logging.info(f"Completed: Created STAC Catalog: {r_evo_rapid36}") # -- Add Items to NOC RAPID-Evolution eORCA025 global parent Sub-Catalog -- # # Define url for eORCA025 RAPID-Evolution data: @@ -145,21 +117,22 @@ def create_rapid_evo_collection( # Open domain_cfg dataset from Zarr store: ds = xr.open_zarr(f"{endpoint_url}/{bucket}/{prefix}", consolidated=True) item = create_item_with_zarr_asset( - id="noc-rapid-evolution/r_evo_eorca025/gn/domain_cfg", + id="noc-rapid-evolution/r_evo_eorca025/domain_cfg", ds=ds, bucket=bucket, platform="gn", prefix=prefix, + title=f"RAPID-Evolution eORCA025 {prefix}", + horizontal_grid_resolution="1/4 degree", start_date="1976-01-01", end_date="2023-12-31", collection="noc-rapid-evolution", - config="eORCA025 RAPID-Evolution global parent", operation=operation, endpoint_url=endpoint_url, zarr_format=3, ) # Add item to the eORCA025 RAPID-Evolution global parent domain native model grid catalog: - gn_eorca025.add_item(item) + r_evo_eorca025.add_item(item) else: bucket="r-evo1-eorca025-rapid12-rapid36" @@ -179,23 +152,24 @@ def create_rapid_evo_collection( # Open dataset from Zarr store: ds = xr.open_zarr(f"{endpoint_url}/{bucket}/{prefix}/{var}", consolidated=True) item = create_item_with_zarr_asset( - id=f"noc-rapid-evolution/r_evo_eorca025/gn/{prefix}/{var}", + id=f"noc-rapid-evolution/r_evo_eorca025/{prefix}/{var}", ds=ds, bucket=bucket, platform="gn", prefix=f"{prefix}/{var}", + title=f"RAPID-Evolution eORCA025 {prefix}/{var}", + horizontal_grid_resolution="1/4 degree", start_date="1976-01-01", end_date="2023-12-31", 
collection="noc-rapid-evolution", - config="eORCA025 RAPID-Evolution global parent", operation=operation, endpoint_url=endpoint_url, zarr_format=3, ) # Add item to the eORCA025 RAPID-Evolution global parent domain native model grid catalog: - gn_eorca025.add_item(item) + r_evo_eorca025.add_item(item) - logging.info(f"Completed: Added Items to NOC STAC Catalog with ID: {gn_eorca025.id}") + logging.info(f"Completed: Added Items to STAC Catalog: {r_evo_eorca025}") # -- Add Items to NOC RAPID-Evolution RAPID12 nested child domain Sub-Catalog -- # for prefix in ["T1m", "U1m", "V1m", "W1m", "S1m", "eORCA025_RAPID12_domain_cfg"]: @@ -207,22 +181,23 @@ def create_rapid_evo_collection( # Open domain_cfg dataset from Zarr store: ds = xr.open_zarr(f"{endpoint_url}/{bucket}/{prefix}", consolidated=True) item = create_item_with_zarr_asset( - id="noc-rapid-evolution/r_evo_rapid12/gn/domain_cfg", + id="noc-rapid-evolution/r_evo_rapid12/domain_cfg", ds=ds, bucket=bucket, platform="gn", prefix=prefix, + title=f"RAPID-Evolution RAPID12 {prefix}", + horizontal_grid_resolution="1/12 degree", start_date="1976-01-01", end_date="2023-12-31", bbox=(-100.143814, 6.0719233, -1.8753614, 42.41955), collection="noc-rapid-evolution", - config="RAPID12 RAPID-Evolution child nest", operation=operation, endpoint_url=endpoint_url, zarr_format=3, ) # Add item to the RAPID12 RAPID-Evolution nested child domain native model grid catalog: - gn_rapid12.add_item(item) + r_evo_rapid12.add_item(item) else: bucket="r-evo1-eorca025-rapid12-rapid36" @@ -241,24 +216,25 @@ def create_rapid_evo_collection( # Open dataset from Zarr store: ds = xr.open_zarr(f"{endpoint_url}/{bucket}/{prefix}/{var}", consolidated=True) item = create_item_with_zarr_asset( - id=f"noc-rapid-evolution/r_evo_rapid12/gn/{prefix}/{var}", + id=f"noc-rapid-evolution/r_evo_rapid12/{prefix}/{var}", ds=ds, bucket=bucket, platform="gn", prefix=f"{prefix}/{var}", + title=f"RAPID-Evolution RAPID12 {prefix}/{var}", + 
horizontal_grid_resolution="1/12 degree", start_date="1976-01-01", end_date="2023-12-31", bbox=(-100.143814, 6.0719233, -1.8753614, 42.41955), collection="noc-rapid-evolution", - config="RAPID12 RAPID-Evolution child nest", operation=operation, endpoint_url=endpoint_url, zarr_format=3, ) # Add item to the RAPID12 RAPID-Evolution nested child domain native model grid catalog: - gn_rapid12.add_item(item) + r_evo_rapid12.add_item(item) - logging.info(f"Completed: Added Items to NOC STAC Catalog with ID: {gn_rapid12.id}") + logging.info(f"Completed: Added Items to STAC Catalog: {r_evo_rapid12}") # -- Add Items to NOC RAPID-Evolution RAPID36 nested grandchild domain Sub-Catalog -- # for prefix in ["T1m", "U1m", "V1m", "W1m", "S1m", "eORCA025_RAPID36_domain_cfg"]: @@ -270,22 +246,23 @@ def create_rapid_evo_collection( # Open domain_cfg dataset from Zarr store: ds = xr.open_zarr(f"{endpoint_url}/{bucket}/{prefix}", consolidated=True) item = create_item_with_zarr_asset( - id="noc-rapid-evolution/r_evo_rapid36/gn/domain_cfg", + id="noc-rapid-evolution/r_evo_rapid36/domain_cfg", ds=ds, bucket=bucket, platform="gn", prefix=prefix, + title=f"RAPID-Evolution RAPID36 {prefix}", + horizontal_grid_resolution="1/36 degree", start_date="1976-01-01", end_date="2023-12-31", bbox=(-98.530975, 17.34014, -8.879465, 30.447763), collection="noc-rapid-evolution", - config="RAPID36 RAPID-Evolution grandchild nest", operation=operation, endpoint_url=endpoint_url, zarr_format=3, ) # Add item to the RAPID36 RAPID-Evolution nested grandchild domain native model grid catalog: - gn_rapid36.add_item(item) + r_evo_rapid36.add_item(item) else: bucket="r-evo1-eorca025-rapid12-rapid36" @@ -304,30 +281,27 @@ def create_rapid_evo_collection( # Open dataset from Zarr store: ds = xr.open_zarr(f"{endpoint_url}/{bucket}/{prefix}/{var}", consolidated=True) item = create_item_with_zarr_asset( - id=f"noc-rapid-evolution/r_evo_rapid36/gn/{prefix}/{var}", + id=f"noc-rapid-evolution/r_evo_rapid36/{prefix}/{var}", 
ds=ds, bucket=bucket, platform="gn", + horizontal_grid_resolution="1/36 degree", start_date="1976-01-01", end_date="2023-12-31", prefix=f"{prefix}/{var}", bbox=(-98.530975, 17.34014, -8.879465, 30.447763), collection="noc-rapid-evolution", - config="RAPID36 RAPID-Evolution grandchild nest", + title=f"RAPID-Evolution RAPID36 {prefix}/{var}", operation=operation, endpoint_url=endpoint_url, zarr_format=3, ) # Add item to the RAPID36 RAPID-Evolution nested grandchild domain native model grid catalog: - gn_rapid36.add_item(item) + r_evo_rapid36.add_item(item) - logging.info(f"Completed: Added Items to NOC STAC Catalog with ID: {gn_rapid36.id}") + logging.info(f"Completed: Added Items to STAC Catalog: {r_evo_rapid36}") # -- Add Nested Catalogs to NOC RAPID-Evolution Collection -- # - r_evo_eorca025.add_child(gn_eorca025) - r_evo_rapid12.add_child(gn_rapid12) - r_evo_rapid36.add_child(gn_rapid36) - rapid_evo_collection.add_child(r_evo_eorca025) rapid_evo_collection.add_child(r_evo_rapid12) rapid_evo_collection.add_child(r_evo_rapid36) From ef912ebebfcaf37dd095ed61d03831d054a786ef Mon Sep 17 00:00:00 2001 From: Ollie Tooth Date: Fri, 26 Jun 2026 18:25:53 +0100 Subject: [PATCH 04/10] Add ods_obs_collection.py to define STAC collections from ocean observations - nsidc, woa23, oisst, en4.2.2., hadisst, era5, armor-3d. --- .../catalog/stac/ods_obs_collection.py | 534 ++++++++++++++++++ 1 file changed, 534 insertions(+) create mode 100644 OceanDataStore/catalog/stac/ods_obs_collection.py diff --git a/OceanDataStore/catalog/stac/ods_obs_collection.py b/OceanDataStore/catalog/stac/ods_obs_collection.py new file mode 100644 index 00000000..6808a5c3 --- /dev/null +++ b/OceanDataStore/catalog/stac/ods_obs_collection.py @@ -0,0 +1,534 @@ +""" +obs_collections.py + +Description: +Function to create Spatio-Temporal Access Catalog Collections +for ocean observation datasets. 
+ +Authors: + - Ollie Tooth (oliver.tooth@noc.ac.uk) +""" +# -- Import Python Modules -- # +import logging +import pystac +import datetime + +from OceanDataStore.catalog.stac.utils import open_icechunk_store, create_item_with_icechunk_asset + + +def create_nsidc_collection() -> pystac.Collection: + """ + Create the NSIDC Sea Ice Index, Version 4 STAC Collection. + + Returns: + ------- + nsidc_collection : pystac.Collection + NSIDC Sea Ice Index, Version 4 STAC Collection. + """ + # ==== Define NSIDC Sea Ice Index, Version 4 Collection ==== # + # Define the spatial extent for the collection: + spatial_extent = pystac.SpatialExtent(bboxes=[[-180.0, -90.0, 180.0, 90.0]]) + + # Define the current temporal extent for the collection: + collection_interval = sorted([datetime.datetime(year=1978, month=11, day=15), datetime.datetime(year=2025, month=12, day=15)]) + temporal_extent = pystac.TemporalExtent(intervals=[collection_interval]) + + # Define the NSIDC Sea Ice Index, Version 4 Collection: + nsidc_collection = pystac.Collection( + id="nsidc", + title="NSIDC Sea Ice Index, Version 4 Collection", + description="**About:**\n\nCollection of National Snow and Ice Data Center (NSIDC) Sea Ice Index, Version 4 datasets.\n\n**More Information:**\n - [NSIDC](https://nsidc.org/home)", + extent=pystac.Extent(spatial=spatial_extent, temporal=temporal_extent), + license="U.S. 
Government Works License", + extra_fields=dict(contact="Ollie Tooth (oliver.tooth@noc.ac.uk)", project="OceanDataStore", status="ongoing", update_frequency="quarterly", last_data_update="2025-12-15"), + keywords=["NSIDC", "arctic", "antarctic", "observation", "sea-ice"], + providers=[ + pystac.Provider( + name="National Snow and Ice Data Center (NSIDC)", + description="National Snow and Ice Data Center (NSIDC), Cooperative Institute for Research in Environmental Sciences, University of Colorado, United States.", + roles=[pystac.ProviderRole.PRODUCER, pystac.ProviderRole.LICENSOR], + url="https://nsidc.org/data/g02135/versions/4" + ), + pystac.Provider( + name="JASMIN", + description="JASMIN Environmental Data Analysis Facility (United Kingdom).", + roles=[pystac.ProviderRole.HOST], + url="https://jasmin.ac.uk" + ) + ], + ) + + logging.info(f"Completed: Created STAC Collection: {nsidc_collection}") + + # -- Add Items to NSIDC Sea Ice Index Collection -- # + bucket = "nsidc" + for prefix in ["nsidc_sea_ice_index_v4_antarctic_monthly", "nsidc_sea_ice_index_v4_arctic_monthly"]: + # Open dataset from Icechunk repository: + ds = open_icechunk_store(bucket=bucket, prefix=prefix, branch="main") + + item = create_item_with_icechunk_asset( + ds=ds, + id=f"{bucket}/{prefix}", + bucket=bucket, + prefix=prefix, + start_date="1978-11-15", + end_date="2025-12-15", + collection=bucket + ) + # Add Item to the NSIDC Sea Ice Index Collection: + nsidc_collection.add_item(item) + + logging.info(f"Completed: Added Items to STAC Collection with ID: {nsidc_collection.id}") + + return nsidc_collection + + +def create_woa23_collection() -> pystac.Collection: + """ + Create the World Ocean Atlas 2023 STAC Collection. + + Returns: + ------- + woa23_collection : pystac.Collection + World Ocean Atlas 2023 STAC Collection. 
+ """ + # ==== Define World Ocean Atlas 2023 Collection ==== # + # Define the spatial extent for the collection: + spatial_extent = pystac.SpatialExtent(bboxes=[[-180.0, -90.0, 180.0, 90.0]]) + + # Define the current temporal extent for the collection: + collection_interval = sorted([datetime.datetime(year=1971, month=1, day=1), datetime.datetime(year=2020, month=12, day=31)]) + temporal_extent = pystac.TemporalExtent(intervals=[collection_interval]) + + # Define the World Ocean Atlas 2023 Collection: + woa23_collection = pystac.Collection( + id="woa23", + title="World Ocean Atlas 2023 Collection", + description="**About:**\n\nCollection of World Ocean Atlas 2023 climatology datasets.\n\n**More Information:**\n - [World Ocean Atlas](https://www.ncei.noaa.gov/access/world-ocean-atlas-2023/)", + extent=pystac.Extent(spatial=spatial_extent, temporal=temporal_extent), + license="Creative Commons CC0 1.0 Universal License", + extra_fields=dict(contact="Ollie Tooth (oliver.tooth@noc.ac.uk)", project="OceanDataStore", status="ongoing", update_frequency="None", last_data_update="2024-02-01"), + keywords=["WOA23", "global", "observation", "temperature", "salinity"], + providers=[ + pystac.Provider( + name="NOAA National Centers for Environmental Information (NCEI)", + description="National Oceanic and Atmospheric Administration (NOAA) National Centers for Environmental Information (NCEI), United States.", + roles=[pystac.ProviderRole.PRODUCER, pystac.ProviderRole.LICENSOR], + url="https://www.ncei.noaa.gov", + ), + pystac.Provider( + name="JASMIN", + description="JASMIN Environmental Data Analysis Facility (United Kingdom).", + roles=[pystac.ProviderRole.HOST], + url="https://jasmin.ac.uk" + ) + ], + ) + + logging.info(f"Completed: Created STAC Collection: {woa23_collection}") + + # -- Add Items to World Ocean Atlas 2023 Collection -- # + bucket = "woa23" + for prefix in ["woa23_1971_2000_annual_climatology", + "woa23_1971_2000_monthly_climatology", + 
"woa23_1981_2010_annual_climatology", + "woa23_1981_2010_monthly_climatology", + "woa23_1991_2020_annual_climatology", + "woa23_1991_2020_monthly_climatology" + ]: + # Open dataset from Icechunk repository: + ds = open_icechunk_store(bucket=bucket, prefix=prefix, branch="main") + + item = create_item_with_icechunk_asset( + ds=ds, + id=f"{bucket}/{prefix}", + bucket=bucket, + prefix=prefix, + start_date=f"{prefix.split('_')[1]}-01-01", + end_date=f"{prefix.split('_')[2]}-12-31", + collection=bucket + ) + # Add item to the World Ocean Atlas 2023 Collection: + woa23_collection.add_item(item) + + logging.info(f"Completed: Added Items to STAC Collection with ID: {woa23_collection.id}") + + return woa23_collection + + +def create_oisst_collection() -> pystac.Collection: + """ + Create the OISST Version 2.1 STAC Collection. + + Returns: + ------- + oisst_collection : pystac.Collection + OISST Version 2.1 STAC Collection. + """ + # ==== Define OISST Version 2.1 Collection ==== # + # Define the spatial extent for the collection: + spatial_extent = pystac.SpatialExtent(bboxes=[[-180.0, -90.0, 180.0, 90.0]]) + + # Define the current temporal extent for the collection: + collection_interval = sorted([datetime.datetime(year=1981, month=9, day=1), datetime.datetime(year=2026, month=5, day=1)]) + temporal_extent = pystac.TemporalExtent(intervals=[collection_interval]) + + # Define the OISST Version 2.1 Collection: + oisst_collection = pystac.Collection( + id="oisst", + title="OISST Version 2.1 Collection", + description="**About:**\n\nCollection of OISST Version 2.1 datasets.\n\n**More Information:**\n - [OISST Version 2.1](https://psl.noaa.gov/data/gridded/data.noaa.oisst.v2.highres.html)", + extent=pystac.Extent(spatial=spatial_extent, temporal=temporal_extent), + license="Creative Commons CC0 1.0 Universal License", + extra_fields=dict(contact="Ollie Tooth (oliver.tooth@noc.ac.uk)", project="OceanDataStore", status="ongoing", update_frequency="quarterly", 
last_data_update="2026-05-01"), + keywords=["OISSTv2.1", "global", "observation", "sea surface temperature", "sea ice concentration"], + providers=[ + pystac.Provider( + name="NOAA National Centers for Environmental Information (NCEI)", + description="National Oceanic and Atmospheric Administration (NOAA) National Centers for Environmental Information (NCEI), United States.", + roles=[pystac.ProviderRole.PRODUCER, pystac.ProviderRole.LICENSOR], + url="https://www.ncei.noaa.gov", + ), + pystac.Provider( + name="JASMIN", + description="JASMIN Environmental Data Analysis Facility (United Kingdom).", + roles=[pystac.ProviderRole.HOST], + url="https://jasmin.ac.uk" + ) + ], + ) + + logging.info(f"Completed: Created STAC Collection: {oisst_collection}") + + # -- Add Items to OISST Version 2.1 Collection -- # + bucket = "oisst" + for prefix in ["oisst_v2.1_monthly", + "oisst_v2.1_1991_2020_daily_climatology", + "oisst_v2.1_1991_2020_monthly_climatology", + ]: + # Open dataset from Icechunk repository: + ds = open_icechunk_store(bucket=bucket, prefix=prefix, branch="main") + + if "1991_2020" in prefix: + start_date = "1991-01-01" + end_date = "2020-12-31" + else: + start_date = "1981-09-01" + end_date = "2026-05-01" + + item = create_item_with_icechunk_asset( + ds=ds, + id=f"{bucket}/{prefix}", + bucket=bucket, + prefix=prefix, + start_date=start_date, + end_date=end_date, + collection=bucket + ) + # Add item to the OISST Version 2.1 Collection: + oisst_collection.add_item(item) + + logging.info(f"Completed: Added Items to STAC Collection with ID: {oisst_collection.id}") + + return oisst_collection + + +def create_en4_collection() -> pystac.Collection: + """ + Create the EN4.2.2 STAC Collection. + + Returns: + ------- + en4_collection : pystac.Collection + EN4.2.2 STAC Collection. 
+ """ + # ==== Define EN4.2.2 Collection ==== # + # Define the spatial extent for the collection: + spatial_extent = pystac.SpatialExtent(bboxes=[[-180.0, -90.0, 180.0, 90.0]]) + + # Define the current temporal extent for the collection: + collection_interval = sorted([datetime.datetime(year=1950, month=1, day=1), datetime.datetime(year=2026, month=3, day=1)]) + temporal_extent = pystac.TemporalExtent(intervals=[collection_interval]) + + # Define the EN4.2.2 Collection: + en4_collection = pystac.Collection( + id="en4.2.2", + title="EN4.2.2 Collection", + description="**About:**\n\nCollection of EN4.2.2 quality Controlled Ocean datasets.\n\n**More Information:**\n - [EN4.2.2](https://www.metoffice.gov.uk/hadobs/en4/)", + extent=pystac.Extent(spatial=spatial_extent, temporal=temporal_extent), + license="Non-Commercial Government Licence", + extra_fields=dict(contact="Ollie Tooth (oliver.tooth@noc.ac.uk)", project="OceanDataStore", status="ongoing", update_frequency="quarterly", last_data_update="2026-05-01"), + keywords=["EN4.2.2", "global", "observation", "temperature", "salinity"], + providers=[ + pystac.Provider( + name="Met Office", + description="Met Office, United Kingdom.", + roles=[pystac.ProviderRole.PRODUCER, pystac.ProviderRole.LICENSOR], + url="https://www.metoffice.gov.uk", + ), + pystac.Provider( + name="JASMIN", + description="JASMIN Environmental Data Analysis Facility (United Kingdom).", + roles=[pystac.ProviderRole.HOST], + url="https://jasmin.ac.uk" + ) + ], + ) + + logging.info(f"Completed: Created STAC Collection: {en4_collection}") + + # -- Add Items to EN4.2.2 Collection -- # + bucket = "en4.2.2" + for prefix in ["en4.2.2_analysis_g10_monthly", + "en4.2.2_analysis_g10_1971_2000_monthly_climatology", + "en4.2.2_analysis_g10_1981_2010_monthly_climatology", + "en4.2.2_analysis_g10_1991_2020_monthly_climatology", + ]: + # Open dataset from Icechunk repository: + ds = open_icechunk_store(bucket=bucket, prefix=prefix, branch="main") + + if "19" in 
prefix: + start_date = f"{prefix.split('_')[3]}-01-01" + end_date = f"{prefix.split('_')[4]}-12-31" + else: + start_date = "1950-01-01" + end_date = "2026-03-12" + + item = create_item_with_icechunk_asset( + ds=ds, + id=f"{bucket}/{prefix}", + bucket=bucket, + prefix=prefix, + start_date=start_date, + end_date=end_date, + collection=bucket + ) + # Add item to the EN4.2.2 Collection: + en4_collection.add_item(item) + + logging.info(f"Completed: Added Items to STAC Collection with ID: {en4_collection.id}") + + return en4_collection + + +def create_armor3d_collection() -> pystac.Collection: + """ + Create the ARMOR3D STAC Collection. + + Returns: + ------- + armor3d_collection : pystac.Collection + ARMOR3D STAC Collection. + """ + # ==== Define ARMOR3D Collection ==== # + # Define the spatial extent for the collection: + spatial_extent = pystac.SpatialExtent(bboxes=[[-180.0, -90.0, 180.0, 90.0]]) + + # Define the current temporal extent for the collection: + collection_interval = sorted([datetime.datetime(year=1993, month=1, day=1), datetime.datetime(year=2024, month=12, day=31)]) + temporal_extent = pystac.TemporalExtent(intervals=[collection_interval]) + + # Define the ARMOR3D Collection: + armor3d_collection = pystac.Collection( + id="armor3d", + title="ARMOR3D Collection", + description="**About:**\n\nCollection of Multi Observation Global Ocean ARMOR3D Temperature Salinity Height Geostrophic Current and MLD.\n\n**More Information:**\n - [ARMOR3D](https://data.marine.copernicus.eu/product/MULTIOBS_GLO_PHY_TSUV_3D_MYNRT_015_012/description)", + extent=pystac.Extent(spatial=spatial_extent, temporal=temporal_extent), + license="Copernicus Marine Environment Monitoring Service Service Level Agreement (SLA)", + extra_fields=dict(contact="Ollie Tooth (oliver.tooth@noc.ac.uk)", project="OceanDataStore", status="ongoing", update_frequency="quarterly", last_data_update="2025-11-01"), + keywords=["ARMOR3D", "global", "observation", "temperature", "salinity", "dynamic 
height", "geostrophic current", "mixed layer depth"], + providers=[ + pystac.Provider( + name="Copernicus Marine Service", + description="Copernicus Marine Service, Mercator Ocean International, France.", + roles=[pystac.ProviderRole.PRODUCER, pystac.ProviderRole.LICENSOR], + url="https://marine.copernicus.eu", + ), + pystac.Provider( + name="JASMIN", + description="JASMIN Environmental Data Analysis Facility (United Kingdom).", + roles=[pystac.ProviderRole.HOST], + url="https://jasmin.ac.uk" + ) + ], + ) + + logging.info(f"Completed: Created STAC Collection: {armor3d_collection}") + + # -- Add Items to ARMOR3D Collection -- # + bucket = "armor3d" + for prefix in ["armor3d_global_my_monthly", + "armor3d_global_my_1971_2000_monthly_climatology", + "armor3d_global_my_1981_2010_monthly_climatology", + "armor3d_global_my_1991_2020_monthly_climatology", + ]: + # Open dataset from Icechunk repository: + ds = open_icechunk_store(bucket=bucket, prefix=prefix, branch="main") + + if "19" in prefix: + start_date = f"{prefix.split('_')[3]}-01-01" + end_date = f"{prefix.split('_')[4]}-12-31" + else: + start_date = "1993-01-01" + end_date = "2024-12-31" + + item = create_item_with_icechunk_asset( + ds=ds, + id=f"{bucket}/{prefix}", + bucket=bucket, + prefix=prefix, + start_date=start_date, + end_date=end_date, + collection=bucket + ) + # Add item to the ARMOR3D Collection: + armor3d_collection.add_item(item) + + logging.info(f"Completed: Added Items to STAC Collection with ID: {armor3d_collection.id}") + + return armor3d_collection + + +def create_hadisst_collection() -> pystac.Collection: + """ + Create the HadISST Version 1.1 STAC Collection. + + Returns: + ------- + hadisst_collection : pystac.Collection + HadISST Version 1.1 STAC Collection. 
+ """ + # ==== Define HadISST Version 1.1 Collection ==== # + # Define the spatial extent for the collection: + spatial_extent = pystac.SpatialExtent(bboxes=[[-180.0, -90.0, 180.0, 90.0]]) + + # Define the current temporal extent for the collection: + collection_interval = sorted([datetime.datetime(year=1870, month=1, day=16), datetime.datetime(year=2026, month=4, day=16)]) + temporal_extent = pystac.TemporalExtent(intervals=[collection_interval]) + + # Define the HadISST Version 1.1 Collection: + hadisst_collection = pystac.Collection( + id="hadisst", + title="HadISST Version 1.1 Collection", + description="**About:**\n\nCollection of HadISST Version 1.1 datasets.\n\n**More Information:**\n - [HadISST Version 1.1](https://www.metoffice.gov.uk/hadobs/hadisst/)", + extent=pystac.Extent(spatial=spatial_extent, temporal=temporal_extent), + license="Non-Commercial Government Licence", + extra_fields=dict(contact="Ollie Tooth (oliver.tooth@noc.ac.uk)", project="OceanDataStore", status="ongoing", update_frequency="quarterly", last_data_update="2026-05-01"), + keywords=["HadISSTv1.1", "global", "observation", "sea surface temperature", "sea ice concentration"], + providers=[ + pystac.Provider( + name="Met Office", + description="Met Office, United Kingdom.", + roles=[pystac.ProviderRole.PRODUCER, pystac.ProviderRole.LICENSOR], + url="https://www.metoffice.gov.uk", + ), + pystac.Provider( + name="JASMIN", + description="JASMIN Environmental Data Analysis Facility (United Kingdom).", + roles=[pystac.ProviderRole.HOST], + url="https://jasmin.ac.uk" + ) + ], + ) + + logging.info(f"Completed: Created STAC Collection: {hadisst_collection}") + + # -- Add Items to HadISST Version 1.1 Collection -- # + bucket = "hadisst" + for prefix in ["hadisst_v1.1_monthly"]: + # Open dataset from Icechunk repository: + ds = open_icechunk_store(bucket=bucket, prefix=prefix, branch="main") + + item = create_item_with_icechunk_asset( + ds=ds, + id=f"{bucket}/{prefix}", + bucket=bucket, + 
prefix=prefix, + start_date="1870-01-16", + end_date="2026-05-01", + collection=bucket + ) + # Add item to the HadISST Version 1.1 Collection: + hadisst_collection.add_item(item) + + logging.info(f"Completed: Added Items to STAC Collection with ID: {hadisst_collection.id}") + + return hadisst_collection + + +def create_era5_collection() -> pystac.Collection: + """ + Create the ERA5 STAC Collection. + + Returns: + ------- + era5_collection : pystac.Collection + ERA5 STAC Collection. + """ + # ==== Define ERA5 Collection ==== # + # Define the spatial extent for the collection: + spatial_extent = pystac.SpatialExtent(bboxes=[[-180.0, -90.0, 180.0, 90.0]]) + + # Define the current temporal extent for the collection: + collection_interval = sorted([datetime.datetime(year=1980, month=1, day=1), datetime.datetime(year=2026, month=6, day=20)]) + temporal_extent = pystac.TemporalExtent(intervals=[collection_interval]) + + # Define the ERA5 Collection: + era5_collection = pystac.Collection( + id="era5", + title="ERA5 Collection", + description="**About:**\n\nCollection of ERA5 datasets.\n\n**More Information:**\n - [ERA5](https://www.ecmwf.int/en/forecasts/datasets/reanalysis-datasets/era5)", + extent=pystac.Extent(spatial=spatial_extent, temporal=temporal_extent), + license="Creative Commons CC-BY-4.0 License", + extra_fields=dict(contact="Ollie Tooth (oliver.tooth@noc.ac.uk)", project="OceanDataStore", status="ongoing", update_frequency="quarterly", last_data_update="2026-06-20"), + keywords=["ERA5", "global", "reanalysis", "sea surface temperature", "sea ice concentration"], + providers=[ + pystac.Provider( + name="ECMWF", + description="European Centre for Medium-Range Weather Forecasts (ECMWF), EU.", + roles=[pystac.ProviderRole.PRODUCER, pystac.ProviderRole.LICENSOR], + url="https://www.ecmwf.int", + ), + pystac.Provider( + name="JASMIN", + description="JASMIN Environmental Data Analysis Facility (United Kingdom).", + roles=[pystac.ProviderRole.HOST], + 
url="https://jasmin.ac.uk" + ) + ], + ) + + logging.info(f"Completed: Created STAC Collection: {era5_collection.id}") + + # -- Add Items to ERA5 Collection -- # + bucket = "era5" + prefixes = ["era5_1991_2020_daily_climatology", + "era5_1996_2025_daily_climatology", + "era5_daily_timeseries", + "era5_monthly_timeseries" + ] + dates = [("1991-01-01", "2020-12-31"), + ("1996-01-01", "2025-12-31"), + ("1980-01-01", "2026-06-20"), + ("1980-01-01", "2026-06-20") + ] + + for prefix, date in zip(prefixes, dates): + # Open dataset from Icechunk repository: + ds = open_icechunk_store(bucket=bucket, prefix=prefix, branch="main") + + item = create_item_with_icechunk_asset( + ds=ds, + id=f"{bucket}/{prefix}", + bucket=bucket, + prefix=prefix, + start_date=date[0], + end_date=date[1], + collection=bucket + ) + # Add item to the ERA5 Collection: + era5_collection.add_item(item) + + logging.info(f"Completed: Added Items to STAC Collection with ID: {era5_collection.id}") + + return era5_collection \ No newline at end of file From d5f9c6cab6fff573c5c8fa8dcb87d9f17f82337c Mon Sep 17 00:00:00 2001 From: Ollie Tooth Date: Fri, 26 Jun 2026 18:26:17 +0100 Subject: [PATCH 05/10] Add example template for additions to NOC STAC. --- .../catalog/stac/template_collection.py | 85 +++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 OceanDataStore/catalog/stac/template_collection.py diff --git a/OceanDataStore/catalog/stac/template_collection.py b/OceanDataStore/catalog/stac/template_collection.py new file mode 100644 index 00000000..7600ab2c --- /dev/null +++ b/OceanDataStore/catalog/stac/template_collection.py @@ -0,0 +1,85 @@ +""" +template_collections.py + +Description: +Template Function to create Spatio-Temporal Access Catalog Collections +for ocean observation datasets. 
+ +Authors: + - Ollie Tooth (oliver.tooth@noc.ac.uk) +""" +# -- Import Python Modules -- # +import logging +import pystac +import datetime + +from OceanDataStore.catalog.stac.utils import open_icechunk_store, create_item_with_icechunk_asset + + +def create_example_collection() -> pystac.Collection: + """ + Create an example STAC Collection from an Icechunk repository + stored in the JASMIN cloud object store. + + Returns: + ------- + example_collection : pystac.Collection + Example STAC Collection. + + """ + # ==== Define Example Collection ==== # + # Define the spatial extent for the collection - this is the maximum extent of all datasets in the collection: + spatial_extent = pystac.SpatialExtent(bboxes=[[-180.0, -90.0, 180.0, 90.0]]) + + # Define the current temporal extent for the collection - this is the maximum temporal extent of all datasets in the collection: + collection_interval = sorted([datetime.datetime(year=1975, month=1, day=1), datetime.datetime(year=2026, month=6, day=1)]) + temporal_extent = pystac.TemporalExtent(intervals=[collection_interval]) + + # Define PySTAC Collection: + example_collection = pystac.Collection( + id="example", + title="Example STAC Collection", + description="**About:**\n\nExample STAC Collection from an Icechunk repository.\n\n**More Information:**\n - [Source](https://link-to-source.com)", + extent=pystac.Extent(spatial=spatial_extent, temporal=temporal_extent), + license="License String", # For example, UK Open Government License v3.0 + extra_fields=dict(contact="Name (email)", project="project_name", status="ongoing / completed", update_frequency="monthly/quarterly/annually", last_data_update="YYYY-MM-DD"), + keywords=["SOURCE", "global / arctic / antarctic", "model / observation", "temperature / salinity / sea ice concentration"], + providers=[ + pystac.Provider( + name="National Oceanography Centre", + description="National Oceanography Centre (NOC), United Kingdom.", + roles=[pystac.ProviderRole.PRODUCER, 
pystac.ProviderRole.LICENSOR], + url="https://www.noc.ac.uk" + ), + pystac.Provider( + name="JASMIN", + description="JASMIN Environmental Data Analysis Facility (United Kingdom).", + roles=[pystac.ProviderRole.HOST], + url="https://jasmin.ac.uk" + ) + ], + ) + + logging.info(f"Completed: Created STAC Collection with ID: {example_collection.id}") + + # -- Add Items to Example STAC Collection -- # + bucket = "bucket_name" + for prefix in ["example_dataset_1", "example_dataset_2"]: + # Open dataset from Icechunk repository: + ds = open_icechunk_store(bucket=bucket, prefix=prefix, branch="branch_name") + + item = create_item_with_icechunk_asset( + ds=ds, + id=f"{bucket}/{prefix}", + bucket=bucket, + prefix=prefix, + start_date="YYYY-MM-DD", + end_date="YYYY-MM-DD", + collection=bucket, + ) + # Add Item to the Example STAC Collection: + example_collection.add_item(item) + + logging.info(f"Completed: Added Items to STAC Collection with ID: {example_collection.id}") + + return example_collection From cc1966adda317006c62c43e78f8a5fbf1815574d Mon Sep 17 00:00:00 2001 From: Ollie Tooth Date: Fri, 26 Jun 2026 18:27:03 +0100 Subject: [PATCH 06/10] Update stac module to include obs catalog creators and refactor create_noc_stac.py to create latest noc-stac catalog. --- OceanDataStore/catalog/stac/__init__.py | 18 +++- .../catalog/stac/create_noc_stac.py | 86 +++++++++++++------ 2 files changed, 76 insertions(+), 28 deletions(-) diff --git a/OceanDataStore/catalog/stac/__init__.py b/OceanDataStore/catalog/stac/__init__.py index 8ffa268c..b0a789f9 100644 --- a/OceanDataStore/catalog/stac/__init__.py +++ b/OceanDataStore/catalog/stac/__init__.py @@ -3,14 +3,28 @@ Tools for constructing the NOC Spatio-Temporal Asset Catalog (STAC) to local JSON files. 
""" -from OceanDataStore.catalog.stac.create_noc_stac import create_noc_stac from OceanDataStore.catalog.stac.npd_era5_collection import create_npd_era5_collection from OceanDataStore.catalog.stac.npd_jra55_collection import create_npd_jra55_collection from OceanDataStore.catalog.stac.rapid_evo_collection import create_rapid_evo_collection +from OceanDataStore.catalog.stac.ods_obs_collection import ( + create_nsidc_collection, + create_woa23_collection, + create_oisst_collection, + create_en4_collection, + create_armor3d_collection, + create_hadisst_collection, + create_era5_collection +) __all__ = ( - "create_noc_stac", "create_npd_era5_collection", "create_npd_jra55_collection", "create_rapid_evo_collection", + "create_nsidc_collection", + "create_woa23_collection", + "create_oisst_collection", + "create_en4_collection", + "create_armor3d_collection", + "create_hadisst_collection", + "create_era5_collection" ) diff --git a/OceanDataStore/catalog/stac/create_noc_stac.py b/OceanDataStore/catalog/stac/create_noc_stac.py index bbf32e10..1a974290 100644 --- a/OceanDataStore/catalog/stac/create_noc_stac.py +++ b/OceanDataStore/catalog/stac/create_noc_stac.py @@ -9,67 +9,101 @@ - Ollie Tooth (oliver.tooth@noc.ac.uk) """ # -- Import Python Modules -- # -import os -import sys +import datetime import logging +import os + import pystac -import datetime -from OceanDataStore.catalog.stac import create_npd_era5_collection -from OceanDataStore.catalog.stac import create_npd_jra55_collection -from OceanDataStore.catalog.stac import create_rapid_evo_collection -from OceanDataStore.catalog.stac.utils import create_logging_banner, initialise_logging +from OceanDataStore.catalog.stac import ( + create_npd_era5_collection, + create_npd_jra55_collection, + create_rapid_evo_collection, + create_nsidc_collection, + create_woa23_collection, + create_oisst_collection, + create_en4_collection, + create_armor3d_collection, + create_hadisst_collection, + create_era5_collection +) +from 
OceanDataStore.cli import initialise_logging + +logger = logging.getLogger(__name__) def create_noc_stac(): """ Create the NOC STAC and write to JSON files. """ - # -- Define NOC STAC Base Catalog -- # + # =========== Configure OceanDataStore Logging =========== # + initialise_logging() + + # =========== Define NOC STAC =========== # noc_stac = pystac.Catalog(id="noc-stac", title="NOC STAC Catalog", description="National Oceanography Centre Spatio-Temporal Asset Catalog for Ocean Model and Observational Data.\n\n**About:**\n\nThe National Oceanography Centre (NOC) is one of the world's leading oceanographic institutions and has been in existence, in its various forms, for over six decades.\nWe undertake world-leading research from coastal seas to deep water, to enhance understanding of the ocean and to address critical environmental challenges.\n\n**Links:**\n- [Website](https://noc.ac.uk)\n- [OceanDataStore](https://noc-msm.github.io/OceanDataStore/)", stac_extensions=None, extra_fields={ "last_update": datetime.datetime.now().isoformat(timespec="hours"), - "catalog_version": "0.2.0", + "catalog_version": "0.3.0", "contacts": "Oliver Tooth (oliver.tooth@noc.ac.uk), Adam Blaker (atb299@noc.ac.uk), Andrew Coward (acc@noc.ac.uk)", }, ) logging.info(f"Completed: Created NOC STAC Catalog with ID: {noc_stac.id}") - # -- Create & Add Rapid-EVO Collection to NOC STAC Catalog -- # - rapid_evo_collection = create_rapid_evo_collection() + # -- Add Rapid-Evolution Collection to NOC STAC Catalog -- # + rapid_evo_collection = create_rapid_evo_collection(credentials_json="/dssgfs01/working/otooth/AtlantiS/credentials/rapid_evo_credentials.json") noc_stac.add_child(rapid_evo_collection) - # -- Create & Add NOC Near-Present Day JRA55-do Collection to NOC STAC Catalog -- # + # -- Add NOC Near-Present Day JRA55-do Collection to NOC STAC Catalog -- # npd_jra55v1_collection = create_npd_jra55_collection() noc_stac.add_child(npd_jra55v1_collection) - # -- Create & Add NOC Near-Present 
Day ERA5 Collection to NOC STAC Catalog -- # + # -- Add NOC Near-Present Day ERA5 Collection to NOC STAC Catalog -- # npd_era5v1_collection = create_npd_era5_collection() noc_stac.add_child(npd_era5v1_collection) - logging.info(f"Completed: Added NOC Near-Present Day Collection Catalogs to NOC STAC: {noc_stac.id}") + logging.info(f"Completed: Added NOC Near-Present Day Collections to NOC STAC: {noc_stac.id}") + + # -- Add NSIDC Sea Ice Index Collection to NOC STAC Catalog -- # + nsidc_collection = create_nsidc_collection() + noc_stac.add_child(nsidc_collection) + + # -- Add WOA23 Collection to NOC STAC Catalog -- # + woa23_collection = create_woa23_collection() + noc_stac.add_child(woa23_collection) + + # -- Add OISST Collection to NOC STAC Catalog -- # + oisst_collection = create_oisst_collection() + noc_stac.add_child(oisst_collection) + + # -- Add EN4.2.2 Collection to NOC STAC Catalog -- # + en4_collection = create_en4_collection() + noc_stac.add_child(en4_collection) + + # -- Add ARMOR3D Collection to NOC STAC Catalog -- # + armor3d_collection = create_armor3d_collection() + noc_stac.add_child(armor3d_collection) + + # -- Add HadISST1 Collection to NOC STAC Catalog -- # + hadisst_collection = create_hadisst_collection() + noc_stac.add_child(hadisst_collection) + + # -- Add ERA5 Collection to NOC STAC Catalog -- # + era5_collection = create_era5_collection() + noc_stac.add_child(era5_collection) + + logging.info(f"Completed: Added Ocean Observation Collections to NOC STAC: {noc_stac.id}") # -- Write NOC STAC Catalog to local filesystem -- # logging.info(f"NOC STAC {noc_stac.id} Summary:") print(noc_stac.describe()) noc_stac.normalize_hrefs(root_href="https://noc-msm-o.s3-ext.jc.rl.ac.uk/noc-stac/") - noc_stac.save(catalog_type=pystac.CatalogType.SELF_CONTAINED, dest_href=os.path.join(os.getcwd(), "noc-stac")) + noc_stac.save(catalog_type=pystac.CatalogType.RELATIVE_PUBLISHED, dest_href=os.path.join(os.getcwd(), "noc-stac")) logging.info(f"Completed: Write NOC 
STAC to -> {os.path.join(os.getcwd(), 'noc-stac')}") if __name__ == "__main__": - # -- Configure Logging -- # - logger = logging.getLogger(__name__) - - initialise_logging(logger) - create_logging_banner(logger) - # -- Create NOC STAC Catalog -- # - try: - create_noc_stac() - except Exception as e: - logger.error(f"An error occurred: {e}") - sys.exit(1) \ No newline at end of file + create_noc_stac() From ad864fd36df0bbcbc6f3a6ccad71620246da9d9a Mon Sep 17 00:00:00 2001 From: Ollie Tooth Date: Fri, 26 Jun 2026 19:25:56 +0100 Subject: [PATCH 07/10] Refactor OceanDataCatalog __repr_html__ to replace collection_summary() & add option to copy open_repo() access string from item_summary(). --- OceanDataStore/catalog/oceandatacatalog.py | 150 ++++++++------------- 1 file changed, 57 insertions(+), 93 deletions(-) diff --git a/OceanDataStore/catalog/oceandatacatalog.py b/OceanDataStore/catalog/oceandatacatalog.py index 1203d24c..f089fc63 100644 --- a/OceanDataStore/catalog/oceandatacatalog.py +++ b/OceanDataStore/catalog/oceandatacatalog.py @@ -395,6 +395,41 @@ def _repr_html_(self) -> str: if self.Items is not None else "no search yet" ) + + def _extent_dates(col): + try: + ext = col.extent.temporal.intervals + start = ext[0][0].strftime("%Y-%m-%d") if ext[0][0] else "—" + end = ext[0][1].strftime("%Y-%m-%d") if ext[0][1] else "present" + except Exception: + start, end = "—", "—" + return start, end + + rows_html = "" + for col in list(self.Catalog.get_all_collections()): + start, end = _extent_dates(col) + desc = col.description or "" + desc_cell = ( + f"
" + f"Summary" + f"
{desc.replace('**', '')}
" + f"
" + if desc else "" + ) + active = " active" if ( + self.Collection and col.id == self.Collection.id + ) else "" + col_title_cell = col.title if col.title else "" + rows_html += ( + f"" + f"{col.id}{active}" + f"{col_title_cell}" + f"{desc_cell}" + f"{start}" + f"{end}" + f"" + ) + return ( f"{_NOC_CSS}" f"
" @@ -404,11 +439,20 @@ def _repr_html_(self) -> str: f"
" f"
" f"
" + f"
Version {self.Catalog.extra_fields.get('catalog_version', 'None')}
" f"
Collections {n_collections}
" f"
Active collection {col_name}
" f"
Last search {n_items}
" f"
" - f" " + f" " + f" " + f" " + f" " + f" " + f" {rows_html}" + f"
Collection IDTitleDescriptionFromTo
" + f"
Source URL
" + f" " f"
" f"" ) @@ -431,9 +475,9 @@ def available_items(self) -> list[str]: # Return all Item IDs from the most recent search: return [item.id for item in self.Items] else: - # Return first 25 Item IDs from the current Collection or root Catalog: + # Return all Item IDs from the current Collection or root Catalog: scope = self.Collection if self.Collection else self.Catalog - return [next(scope.get_items(recursive=True), None).id for _ in range(25)] + return list(item.id for item in scope.get_items(recursive=True)) def summary(self) -> CatalogSummary: @@ -527,94 +571,9 @@ def summary(self) -> CatalogSummary: return CatalogSummary(display_text=text, display_html=html) - def collection_summary(self) -> CatalogSummary: - """ - Display a summary table of all Collections in the OceanDataCatalog: - - * In Jupyter / Marimo environments a styled HTML table is displayed. - * In plain Python / CLI environments a formatted text table is printed instead. - """ - collections = list(self.Catalog.get_all_collections()) - n = len(collections) - - def _extent_dates(col): - try: - ext = col.extent.temporal.intervals - start = ext[0][0].strftime("%Y-%m-%d") if ext[0][0] else "—" - end = ext[0][1].strftime("%Y-%m-%d") if ext[0][1] else "present" - except Exception: - start, end = "—", "—" - return start, end - - # ----- HTML Output ----- # - rows_html = "" - for col in collections: - start, end = _extent_dates(col) - desc = col.description or "" - desc_cell = ( - f"
" - f"Summary" - f"
{desc.replace('**', '')}
" - f"
" - if desc else "" - ) - active = " active" if ( - self.Collection and col.id == self.Collection.id - ) else "" - col_title_cell = col.title if col.title else "" - rows_html += ( - f"" - f"{col.id}{active}" - f"{col_title_cell}" - f"{desc_cell}" - f"{start}" - f"{end}" - f"" - ) - - html = ( - f"{_NOC_CSS}" - f"
" - f"
" - f" Collections" - f" {n} available" - f"
" - f"
" - f" " - f" " - f" " - f" " - f" " - f" {rows_html}" - f"
Collection IDTitleDescriptionFromTo
" - f"
" - f"
" - ) - - # ----- Plain-Text Output ----- # - col_w = [30, 42, 12, 12] - headers = ["Collection ID", "Title", "From", "To"] - sep = "+" + "+".join("-" * (w + 2) for w in col_w) + "+" - header_row = "| " + " | ".join(h.ljust(col_w[i]) for i, h in enumerate(headers)) + " |" - text_lines = [f"Collections — {n} available", sep, header_row, sep] - for col in collections: - start, end = _extent_dates(col) - row = [ - col.id[:col_w[0]], - (col.title or "")[:col_w[1]], - start[:col_w[2]], - end[:col_w[3]], - ] - text_lines.append("| " + " | ".join(v.ljust(col_w[i]) for i, v in enumerate(row)) + " |") - text_lines.append(sep) - text = "\n".join(text_lines) - - return CatalogSummary(display_text=text, display_html=html) - - def item_summary(self, id: str) -> CatalogSummary: """ - Display detailed metadata for a single OceanDataStore Item. + Display the detailed summary for a single OceanDataStore Item. Searches the current Items list first; if the Item is not found there it is fetched directly from the Catalog URL. @@ -754,7 +713,8 @@ def item_summary(self, id: str) -> CatalogSummary: f"" ) - access_str = f"catalog.open_dataset(id='{id}')" + access_ds_str = f"catalog.open_dataset(id='{id}')" + access_repo_str = f"catalog.open_repo(id='{id}')" _copy_js = ( "(function(b){" "var t=document.createElement('textarea');" @@ -770,8 +730,12 @@ def item_summary(self, id: str) -> CatalogSummary: access_section = ( f"
Access
" f"
" - f" {access_str}" - f" " + f" {access_ds_str}" + f" " + f"
" + f"
" + f" {access_repo_str}" + f" " f"
" ) @@ -820,7 +784,7 @@ def item_summary(self, id: str) -> CatalogSummary: af = asset.extra_fields loc = f"{af.get('endpoint_url', '')}/{af.get('bucket', '')}/{af.get('prefix', '')}" text_lines.append(f" {asset_key}: {asset.media_type or ''} — {loc}") - text_lines += ["", f" Access: {access_str}"] + text_lines += ["", f" Access: {access_ds_str}"] text = "\n".join(text_lines) return CatalogSummary(display_text=text, display_html=html) From 1778af0543e9cd37fe021bfd5cc6a71ea4aa73b4 Mon Sep 17 00:00:00 2001 From: Ollie Tooth Date: Fri, 26 Jun 2026 19:49:47 +0100 Subject: [PATCH 08/10] Refactor search() & _filter_items() to include dataset_type & product_type, add clear() method to clear search history, and refactor _open_item() to traverse noc-stac to find Items. --- OceanDataStore/catalog/oceandatacatalog.py | 84 ++++++++++++++-------- 1 file changed, 55 insertions(+), 29 deletions(-) diff --git a/OceanDataStore/catalog/oceandatacatalog.py b/OceanDataStore/catalog/oceandatacatalog.py index f089fc63..1b85ca1c 100644 --- a/OceanDataStore/catalog/oceandatacatalog.py +++ b/OceanDataStore/catalog/oceandatacatalog.py @@ -9,7 +9,6 @@ Authors: - Ollie Tooth """ -import os from typing import Optional import icechunk @@ -792,20 +791,24 @@ def item_summary(self, id: str) -> CatalogSummary: def _filter_items(self, items: list[pystac.Item], - platform: Optional[str] = None, + dataset_type: Optional[str] = None, + product_type: Optional[str] = None, variable_name: Optional[str] = None, standard_name: Optional[str] = None, item_name: Optional[str] = None ): """ - Filter Items based on specified platform and variable. + Filter Items based on specified dataset type, product type, + variable name, and standard name. Parameters ---------- items : list[pystac.Item] List of STAC Items to filter. - platform : str, optional - Platform name to filter Items by. + dataset_type : str, optional + Dataset type to filter Items by. + product_type : str, optional + Product type to filter Items by. 
variable_name : str, optional Variable name to filter Items by. standard_name : str, optional @@ -813,8 +816,10 @@ def _filter_items(self, item_name : str, optional Substring to filter Item IDs by. """ - if platform: - items = [item for item in items if platform in item.properties.get('platform', '')] + if dataset_type: + items = [item for item in items if dataset_type in str(item.properties.get('dataset_type', ''))] + if product_type: + items = [item for item in items if product_type in str(item.properties.get('product_type', ''))] if variable_name: items = [item for item in items if any(variable_name in var for var in item.properties.get('variables', []))] if standard_name: @@ -823,29 +828,42 @@ def _filter_items(self, items = [item for item in items if item_name in item.id] return items + + + def clear(self) -> None: + """ + Clear the Active Collection and Items returned from + the latest OceanDataCatalog search. + """ + self.Collection = None + self.Items = None def search(self, collection: Optional[str] = None, - platform: Optional[str] = None, + dataset_type: Optional[str] = None, + product_type: Optional[str] = None, variable_name: Optional[str] = None, standard_name: Optional[str] = None, item_name: Optional[str] = None ) -> None: """ - Search the NOC STAC Catalog for Items matching the specified criteria. + Search the OceanDataCatalog for Items matching the specified criteria. - When both a platform and a variable / standard name are provided, - the search returns all Items which match both criteria. + When both dataset_type / product_type and variable / standard names are + provided, the search returns all Items which match both criteria. Parameters ---------- collection : str, optional Collection name to search for. Default is None, which searches the entire root Catalog. - platform : str, optional - Platform name to search for. Default is None, - which retrieves Items from all platforms. 
+ dataset_type : str, optional + Dataset type to search for (e.g., 'model', 'observation'). + Default is None, which retrieves Items from all dataset types. + product_type : str, optional + Product type to search for (e.g., 'timeseries', 'climatology'). + Default is None, which retrieves Items from all product types. variable_name : str, optional Variable name to search for. Default is None, which retrieves all Items. @@ -865,10 +883,13 @@ def search(self, TypeError If any of the input parameters are of incorrect type. """ + # -- Validate Inputs -- # if not isinstance(collection, (type(None), str)): raise TypeError("'collection' must be a string or None.") - if not isinstance(platform, (type(None), str)): - raise TypeError("'platform' must be a string or None.") + if not isinstance(dataset_type, (type(None), str)): + raise TypeError("'dataset_type' must be a string or None.") + if not isinstance(product_type, (type(None), str)): + raise TypeError("'product_type' must be a string or None.") if not isinstance(variable_name, (type(None), str)): raise TypeError("'variable_name' must be a string or None.") if not isinstance(standard_name, (type(None), str)): @@ -890,7 +911,8 @@ def search(self, raise ValueError("Only one of 'variable_name' or 'standard_name' can be specified.") else: self.Items = self._filter_items(items=items, - platform=platform, + dataset_type=dataset_type, + product_type=product_type, variable_name=variable_name, standard_name=standard_name, item_name=item_name @@ -903,7 +925,7 @@ def _open_item( id: str, ) -> pystac.Item: """ - Open a STAC Item directly from URL using Item ID. + Open a STAC Item directly from the Item ID. Parameters ---------- @@ -915,17 +937,21 @@ def _open_item( pystac.Item STAC Item object. """ - # Define base URL to the root catalog: - base_url = os.path.dirname(self._stac_url) - - # Construct URL to the Item JSON file: - # Assumes Item IDs use path-like representation. 
- id_list = [f"{id_n}/" for id_n in id.split("/")] - id_prefix = "".join(id_list[:4]) - item_url = f"{base_url}/{id_prefix}{id}/{id}.json" - - # Open the Item from the constructed URL: - item = pystac.Item.from_file(item_url) + # Define components of Item ID path: + parts = id.split("/") + # Initialise node to root Catalog: + node = self.Catalog + + # Iterate over ID components: + for _, part in enumerate(parts): + # Traverse Catalog to child node containing Item: + child = node.get_child(part) + if child is not None: + node = child + continue + else: + # Collect STAC Item from child node: + item = node.get_item(id) return item From b635c728722f13bbf9467f34a1fd080636499f30 Mon Sep 17 00:00:00 2001 From: Ollie Tooth Date: Fri, 26 Jun 2026 20:34:09 +0100 Subject: [PATCH 09/10] Replace deprecated get_item() in _open_item() utility method. --- OceanDataStore/catalog/oceandatacatalog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/OceanDataStore/catalog/oceandatacatalog.py b/OceanDataStore/catalog/oceandatacatalog.py index 1b85ca1c..75e487c3 100644 --- a/OceanDataStore/catalog/oceandatacatalog.py +++ b/OceanDataStore/catalog/oceandatacatalog.py @@ -951,7 +951,7 @@ def _open_item( continue else: # Collect STAC Item from child node: - item = node.get_item(id) + item = next(node.get_items(id), None) return item From ac5550da7f38f4e5f338c808cdd911707507f3fe Mon Sep 17 00:00:00 2001 From: Ollie Tooth Date: Fri, 26 Jun 2026 20:35:36 +0100 Subject: [PATCH 10/10] Update OceanDataCatalog unit and integration tests & update fixture properties. 
--- tests/integration/catalog/test_catalog.py | 2 +- tests/unit/catalog/conftest.py | 4 ++ tests/unit/catalog/test_oceandatacatalog.py | 48 +++++++++++++-------- 3 files changed, 35 insertions(+), 19 deletions(-) diff --git a/tests/integration/catalog/test_catalog.py b/tests/integration/catalog/test_catalog.py index 975e9998..b03c5dec 100644 --- a/tests/integration/catalog/test_catalog.py +++ b/tests/integration/catalog/test_catalog.py @@ -47,7 +47,7 @@ def test_available_items(catalog): assert all(isinstance(i, str) for i in items) def test_available_search_items(catalog): - catalog.search(platform="gn") + catalog.search(item_name="domain_cfg") items = catalog.available_items assert isinstance(items, list) assert all(isinstance(i, str) for i in items) diff --git a/tests/unit/catalog/conftest.py b/tests/unit/catalog/conftest.py index c240f035..7e786336 100644 --- a/tests/unit/catalog/conftest.py +++ b/tests/unit/catalog/conftest.py @@ -87,6 +87,8 @@ def catalog_instance(mock_catalog): "sea_surface_salinity", ], "title": "Test ERA5 Item", + "dataset_type": "model", + "product_type": "timeseries", "start_datetime": "1976-01-01", "end_datetime": "2020-12-31", }, @@ -98,6 +100,8 @@ def catalog_instance(mock_catalog): "variables": [], "variable_standard_names": [], "title": "Domain Item", + "dataset_type": "model", + "product_type": "ancillary", }, ) catalog.Items = [item_era5, item_domain] diff --git a/tests/unit/catalog/test_oceandatacatalog.py b/tests/unit/catalog/test_oceandatacatalog.py index 267549fb..4fa6fab6 100644 --- a/tests/unit/catalog/test_oceandatacatalog.py +++ b/tests/unit/catalog/test_oceandatacatalog.py @@ -40,9 +40,13 @@ def test_search_collection_type_error(self, catalog_instance): with pytest.raises(TypeError, match="'collection' must be a string or None"): catalog_instance.search(collection=1234) - def test_search_platform_type_error(self, catalog_instance): - with pytest.raises(TypeError, match="'platform' must be a string or None"): - 
catalog_instance.search(platform=["gn"]) + def test_search_dataset_type_error(self, catalog_instance): + with pytest.raises(TypeError, match="'dataset_type' must be a string or None"): + catalog_instance.search(dataset_type=["observation"]) + + def test_search_product_type_error(self, catalog_instance): + with pytest.raises(TypeError, match="'product_type' must be a string or None"): + catalog_instance.search(product_type=["timeseries"]) def test_search_variable_name_type_error(self, catalog_instance): with pytest.raises(TypeError, match="'variable_name' must be a string or None"): @@ -72,15 +76,27 @@ def test_filter_no_criteria_returns_all(self, catalog_instance): result = catalog_instance._filter_items(items=catalog_instance.Items) assert result == catalog_instance.Items - def test_filter_by_platform_match(self, catalog_instance): + def test_filter_by_dataset_type_match(self, catalog_instance): result = catalog_instance._filter_items( - items=catalog_instance.Items, platform="gn" + items=catalog_instance.Items, dataset_type="model" ) assert len(result) == 2 - def test_filter_by_platform_no_match(self, catalog_instance): + def test_filter_by_dataset_type_no_match(self, catalog_instance): + result = catalog_instance._filter_items( + items=catalog_instance.Items, dataset_type="observation" + ) + assert result == [] + + def test_filter_by_product_type_match(self, catalog_instance): + result = catalog_instance._filter_items( + items=catalog_instance.Items, product_type="timeseries" + ) + assert len(result) == 1 + + def test_filter_by_product_type_no_match(self, catalog_instance): result = catalog_instance._filter_items( - items=catalog_instance.Items, platform="xy" + items=catalog_instance.Items, product_type="climatology" ) assert result == [] @@ -105,14 +121,14 @@ def test_filter_by_item_name(self, catalog_instance): assert len(result) == 1 assert "domain" in result[0].id - def test_filter_combined_platform_and_variable(self, catalog_instance): + def 
test_filter_combined_dataset_type_and_variable(self, catalog_instance): result = catalog_instance._filter_items( - items=catalog_instance.Items, platform="gn", variable_name="tos_con" + items=catalog_instance.Items, dataset_type="model", variable_name="tos_con" ) assert len(result) == 1 def test_filter_empty_items_list(self, catalog_instance): - result = catalog_instance._filter_items(items=[], platform="gn") + result = catalog_instance._filter_items(items=[], dataset_type="model") assert result == [] @@ -125,19 +141,15 @@ def test_summary_display_text_contains_item_count(self, catalog_instance): result = catalog_instance.summary() assert "2" in repr(result) + def test_summary_contains_collection_id(self, catalog_instance): + result = catalog_instance.summary() + assert "noc-npd-era5" in repr(result) + def test_summary_raises_without_items(self, catalog_instance): catalog_instance.Items = None with pytest.raises(ValueError, match="No Items returned"): catalog_instance.summary() - def test_collection_summary_returns_catalog_summary(self, catalog_instance): - result = catalog_instance.collection_summary() - assert isinstance(result, CatalogSummary) - - def test_collection_summary_contains_collection_id(self, catalog_instance): - result = catalog_instance.collection_summary() - assert "noc-npd-era5" in repr(result) - class TestOceanDataCatalogItemSummary: def test_item_summary_type_error(self, catalog_instance):