Skip to content
216 changes: 166 additions & 50 deletions openeo/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,6 @@ class DimensionAlreadyExistsException(MetadataException):


# TODO: make these dimension classes immutable data classes
# TODO: align better with STAC datacube extension
# TODO: align/adapt/integrate with pystac's datacube extension implementation?
class Dimension:
"""Base class for dimensions."""

Expand Down Expand Up @@ -71,7 +69,6 @@ def rename_labels(self, target, source) -> Dimension:

class SpatialDimension(Dimension):
# TODO: align better with STAC datacube extension: e.g. support "axis" (x or y)

DEFAULT_CRS = 4326

def __init__(
Expand Down Expand Up @@ -679,30 +676,16 @@ def metadata_from_stac(url: str) -> CubeMetadata:
"""
Reads the band metadata a static STAC catalog or a STAC API Collection and returns it as a :py:class:`CubeMetadata`

Policy:
- If cube:dimensions exists: treat it as source of truth (it may omit x/y/t/bands).
- Otherwise: apply openEO-style defaults (x, y, t) and (for Collection/Item) keep bands dimension even if empty.
Comment thread
suriyahgit marked this conversation as resolved.

:param url: The URL to a static STAC catalog (STAC Item, STAC Collection, or STAC Catalog) or a specific STAC API Collection
:return: A :py:class:`CubeMetadata` containing the DataCube band metadata from the url.
"""
stac_object = pystac.read_file(href=url)
bands = _StacMetadataParser().bands_from_stac_object(stac_object)

# At least assume there are spatial dimensions
# TODO #743: are there conditions in which we even should not assume the presence of spatial dimensions?
dimensions = [
SpatialDimension(name="x", extent=[None, None]),
SpatialDimension(name="y", extent=[None, None]),
]

# TODO #743: conditionally include band dimension when there was actual indication of band metadata?
band_dimension = BandDimension(name="bands", bands=bands)
dimensions.append(band_dimension)

# TODO: is it possible to derive the actual name of temporal dimension that the backend will use?
temporal_dimension = _StacMetadataParser().get_temporal_dimension(stac_object)
if temporal_dimension:
dimensions.append(temporal_dimension)

metadata = CubeMetadata(dimensions=dimensions)
return metadata
parser = _StacMetadataParser()
return parser.metadata_from_stac_object(stac_object)

# Sniff for PySTAC extension API since version 1.9.0 (which is not available below Python 3.9)
# TODO: remove this once support for Python 3.7 and 3.8 is dropped
Expand Down Expand Up @@ -760,39 +743,172 @@ def __init__(self, *, logger=_log, log_level=logging.DEBUG, supress_duplicate_wa
# Use caching trick to avoid duplicate warnings
self._warn = functools.lru_cache(maxsize=1000)(self._warn)

def metadata_from_stac_object(self, stac_object: pystac.STACObject) -> CubeMetadata:
"""
Build cube metadata from a STAC object.
"""
bands = self.bands_from_stac_object(stac_object)
dimensions = self.dimensions_from_stac_object(stac_object=stac_object, bands=bands)
return CubeMetadata(dimensions=dimensions)

def dimensions_from_stac_object(self, stac_object: pystac.STACObject, bands: _BandList) -> List[Dimension]:
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why having bands as argument here? Shouldn't it be the responsibility of this method to detect the bands? Note that this is a public method, so it's weird to have an argument for something you actually want to extract

"""
Build dimension metadata from a STAC object.

Philosophy:
- If cube:dimensions exists: treat it as source of truth (it may omit x/y/t/bands).
- Otherwise: apply openEO-style defaults (x, y, t) and (for Collection/Item) keep bands dimension even if empty.
"""
if self.has_cube_dimensions(stac_object):
dimensions = self.parse_declared_dimensions(stac_object=stac_object, bands=bands)
if not any(isinstance(d, BandDimension) for d in dimensions) and isinstance(
stac_object, (pystac.Collection, pystac.Item)
):
dimensions.append(BandDimension(name="bands", bands=list(bands)))
return dimensions

dimensions: List[Dimension] = [
SpatialDimension(name="x", extent=[None, None]),
SpatialDimension(name="y", extent=[None, None]),
TemporalDimension(name="t", extent=self.infer_temporal_extent(stac_object)),
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

reuse get_temporal_dimension here instead ?

]
if isinstance(stac_object, (pystac.Collection, pystac.Item)):
dimensions.append(BandDimension(name="bands", bands=list(bands)))
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why conditionally add the band dimension here?

return dimensions

def get_temporal_dimension(self, stac_obj: pystac.STACObject) -> Union[TemporalDimension, None]:
"""
Extract the temporal dimension from a STAC Collection/Item (if any)
"""
# TODO: also extract temporal dimension from assets?
if _PYSTAC_1_9_EXTENSION_INTERFACE:
if stac_obj.ext.has("cube") and hasattr(stac_obj.ext, "cube"):
temporal_dims = [
(n, d.extent or [None, None])
for (n, d) in stac_obj.ext.cube.dimensions.items()
if d.dim_type == pystac.extensions.datacube.DimensionType.TEMPORAL
if self.has_cube_dimensions(stac_obj):
temporal_dimensions = [
d
for d in self.parse_declared_dimensions(stac_object=stac_obj, bands=_BandList([]))
if isinstance(d, TemporalDimension)
]
if len(temporal_dimensions) == 1:
return temporal_dimensions[0]

if isinstance(stac_obj, (pystac.Collection, pystac.Item)):
return TemporalDimension(name="t", extent=self.infer_temporal_extent(stac_obj))

def has_cube_dimensions(self, stac_object: pystac.STACObject) -> bool:
cube_dimensions = self.cube_dimensions_dict(stac_object)
return isinstance(cube_dimensions, dict) and len(cube_dimensions) > 0

def cube_dimensions_dict(self, stac_object: pystac.STACObject) -> Dict[str, dict]:
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this has to be in the public interface

"""
Return raw cube:dimensions dict from a Collection/Item, or {}.
"""
if isinstance(stac_object, pystac.Item):
return stac_object.properties.get("cube:dimensions", {}) or {}
if isinstance(stac_object, pystac.Collection):
return stac_object.extra_fields.get("cube:dimensions", {}) or {}
return {}

def infer_temporal_extent(self, stac_object: pystac.STACObject) -> List[Optional[str]]:
"""
Best-effort temporal extent:
- Collection: extent.temporal interval
- Item: datetime or start/end
"""
if isinstance(stac_object, pystac.Collection) and stac_object.extent and stac_object.extent.temporal:
interval = stac_object.extent.temporal.intervals[0]
return [Rfc3339(propagate_none=True).normalize(d) for d in interval]

if isinstance(stac_object, pystac.Item):
props = getattr(stac_object, "properties", {}) or {}
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just wondering: why using getattr here? Isn't the presence of properties guaranteed?

dt_ = props.get("datetime")
if dt_:
norm = Rfc3339(propagate_none=True).normalize(dt_)
return [norm, norm]
start = props.get("start_datetime")
end = props.get("end_datetime")
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

isn't the "datetime" field a required field, so is this code even reachable?
Or to put differently: shouldn't "start_datetime"/"end_datetime" get higher precedence over "datetime"?

if start or end:
return [
Rfc3339(propagate_none=True).normalize(start),
Rfc3339(propagate_none=True).normalize(end),
]
if len(temporal_dims) == 1:
name, extent = temporal_dims[0]
return TemporalDimension(name=name, extent=extent)
elif isinstance(stac_obj, pystac.Collection) and stac_obj.extent.temporal:
# No explicit "cube:dimensions": build fallback from "extent.temporal",
# with dimension name "t" (openEO API recommendation).
extent = [Rfc3339(propagate_none=True).normalize(d) for d in stac_obj.extent.temporal.intervals[0]]
return TemporalDimension(name="t", extent=extent)
else:
if isinstance(stac_obj, pystac.Item):
cube_dimensions = stac_obj.properties.get("cube:dimensions", {})
elif isinstance(stac_obj, pystac.Collection):
cube_dimensions = stac_obj.extra_fields.get("cube:dimensions", {})

return [None, None]

@staticmethod
def _safe_extent_from_pystac_cube_dim(dim) -> list:
"""
PySTAC cube dimension wrapper may raise if 'extent' is missing.
Also, depending on serialization/version, extent might live in extra_fields.
"""
try:
ext = dim.extent
except Exception:
ext = None

if not ext:
extra = getattr(dim, "extra_fields", {}) or {}
ext = extra.get("extent")

return ext or [None, None]

def parse_declared_dimensions(self, stac_object: pystac.STACObject, bands: _BandList) -> List[Dimension]:
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

as noted above, it looks weird to have this required bands argument here without explanation

"""
Parse dimensions declared through cube:dimensions.
"""
if (
_PYSTAC_1_9_EXTENSION_INTERFACE
and getattr(stac_object, "ext", None) is not None
and stac_object.ext.has("cube")
and hasattr(stac_object.ext, "cube")
):
return self._parse_cube_dimensions_from_pystac_extension(stac_object=stac_object, bands=bands)
return self._parse_cube_dimensions_from_raw_dict(stac_object=stac_object, bands=bands)

def _parse_cube_dimensions_from_pystac_extension(
self, stac_object: pystac.STACObject, bands: _BandList
) -> List[Dimension]:
"""
Parse dimensions from PySTAC's cube extension wrapper (when present).
Important: PySTAC DimensionType only has SPATIAL + TEMPORAL.
Everything else is treated as band-like.
"""
dimensions = []
for name, dim in stac_object.ext.cube.dimensions.items():
dim_type = getattr(dim, "dim_type", None)
extent = self._safe_extent_from_pystac_cube_dim(dim)

if dim_type == pystac.extensions.datacube.DimensionType.SPATIAL:
dimensions.append(SpatialDimension(name=name, extent=extent))
elif dim_type == pystac.extensions.datacube.DimensionType.TEMPORAL:
dimensions.append(TemporalDimension(name=name, extent=extent))
else:
cube_dimensions = {}
temporal_dims = [
(n, d.get("extent", [None, None])) for (n, d) in cube_dimensions.items() if d.get("type") == "temporal"
]
if len(temporal_dims) == 1:
name, extent = temporal_dims[0]
return TemporalDimension(name=name, extent=extent)
dimensions.append(BandDimension(name=name, bands=list(bands)))

return dimensions

def _parse_cube_dimensions_from_raw_dict(self, stac_object: pystac.STACObject, bands: _BandList) -> List[Dimension]:
"""
Parse dimensions from raw cube:dimensions dict.
Supports 'spatial', 'temporal', and ('bands' or 'spectral' as an alias).
"""
dimensions = []
cube_dimensions = self.cube_dimensions_dict(stac_object)

for name, dim in cube_dimensions.items():
if not isinstance(dim, dict):
continue

dim_type = dim.get("type")
extent = dim.get("extent", [None, None])

if dim_type == "spatial":
dimensions.append(SpatialDimension(name=name, extent=extent))
elif dim_type == "temporal":
dimensions.append(TemporalDimension(name=name, extent=extent))
elif dim_type in ("bands", "spectral"):
dimensions.append(BandDimension(name=name, bands=list(bands)))
else:
dimensions.append(Dimension(name=name, type=dim_type))

return dimensions

def _band_from_eo_bands_metadata(self, band: Union[dict, pystac.extensions.eo.Band]) -> Band:
"""Construct band from metadata in eo v1.1 style"""
Expand Down
73 changes: 71 additions & 2 deletions tests/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -1156,7 +1156,10 @@ def test_metadata_from_stac_bands(tmp_path, test_stac, expected):
# TODO #738 real request mocking of STAC resources compatible with pystac?
path.write_text(json.dumps(test_stac))
metadata = metadata_from_stac(str(path))
assert metadata.band_names == expected
if expected:
assert metadata.band_names == expected
else:
assert not metadata.has_band_dimension()



Expand Down Expand Up @@ -1253,7 +1256,73 @@ def test_metadata_from_stac_temporal_dimension(tmp_path, stac_dict, expected):
assert isinstance(dim, TemporalDimension)
assert (dim.name, dim.extent) == expected
else:
assert not metadata.has_temporal_dimension()
# With openEO defaults, a temporal dimension name ('t') can exist even when extent is unknown.
Comment thread
suriyahgit marked this conversation as resolved.
# Depending on STAC input, pystac/datacube parsing can produce:
# - a degenerate extent [d, d] when a single `datetime` is present
# - an "unknown" extent [None, None] when there is no temporal info
assert metadata.has_temporal_dimension()
extent = metadata.temporal_dimension.extent
if extent is None:
pass
else:
assert isinstance(extent, list) and len(extent) == 2
# Allow "unknown" temporal extent representation
if extent == [None, None]:
pass
else:
# Allow degenerate interval when a single datetime is available
assert extent[0] == extent[1]



# Dimension name resolution policy (STAC cube:dimensions vs openEO defaults)
Comment thread
suriyahgit marked this conversation as resolved.
@pytest.mark.parametrize(
["stac_dict", "expected_dims"],
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can a STAC 1.1 style test case be added which uses the new 'bands' common metadata and no datacube extension?
Collection metadata example:
https://github.com/radiantearth/stac-spec/blob/master/examples/collection-only/collection.json#L152
Bands spec:
https://github.com/radiantearth/stac-spec/blob/master/commons/common-metadata.md#band-object

[
(
# No cube:dimensions -> fall back to openEO default naming convention
StacDummyBuilder.collection(summaries={"eo:bands": [{"name": "B01"}]}),
{"t", "bands", "y", "x"},
),
(
# No cube:dimensions (item) -> fall back to openEO default naming convention
StacDummyBuilder.item(
properties={"datetime": "2020-05-22T00:00:00Z", "eo:bands": [{"name": "B01"}]}
),
{"t", "bands", "y", "x"},
),
(
# cube:dimensions present -> use the dimension names as suggested by cube:dimensions keys
StacDummyBuilder.collection(
cube_dimensions={
"time": {"type": "temporal", "axis": "t", "extent": ["2024-04-04", "2024-06-06"]},
"band": {"type": "bands", "axis": "bands", "values": ["B01"]},
"y": {"type": "spatial", "axis": "y", "extent": [0, 1]},
"x": {"type": "spatial", "axis": "x", "extent": [0, 1]},
}
),
{"time", "band", "y", "x"},
),
],
)
def test_metadata_from_stac_dimension_policy_cube_dimensions_vs_default(tmp_path, stac_dict, expected_dims):
path = tmp_path / "stac.json"
# TODO #738 real request mocking of STAC resources compatible with pystac?
path.write_text(json.dumps(stac_dict))
metadata = metadata_from_stac(str(path))

got = tuple(metadata.dimension_names() or ())

# Order-insensitive check: names only
assert set(got) == expected_dims

# Ensure the policy logic is exercised correctly:
# cube:dimensions can be located at root (collection) or in properties (item)
cube_dims = stac_dict.get("cube:dimensions") or (stac_dict.get("properties") or {}).get("cube:dimensions")
if cube_dims is None:
assert set(got) == {"t", "bands", "y", "x"}
else:
assert set(got) == set(cube_dims.keys())


@pytest.mark.parametrize(
Expand Down
Loading