Skip to content

Metadata Extraction#

Functions for extracting, serializing, and deserializing NetCDF metadata using GDAL's Multidimensional API.

pyramids.netcdf.metadata.get_metadata(source, open_options=None) #

Read and normalize all NetCDF MDIM metadata.

Accepts several source types and delegates to MetadataBuilder to produce a NetCDFMetadata instance.

Parameters:

Name Type Description Default
source Dataset | str | object

The data source. Accepts a GDAL dataset directly, a file path (opened internally with OF_MULTIDIM_RASTER), or a pyramids NetCDF/Dataset instance whose internal _raster attribute is extracted automatically.

required
open_options dict[str, Any] | None

Optional dictionary of GDAL open-options. Stored in the resulting metadata for provenance but not used to open the file.

None

Returns:

Name Type Description
NetCDFMetadata NetCDFMetadata

Fully populated metadata dataclass.

Raises:

Type Description
ValueError

If source is a file path (str or Path) that cannot be opened as a multidimensional raster.

Examples:

Open from a file path:

>>> from osgeo import gdal
>>> import pyramids.netcdf.metadata as meta
>>> md = meta.get_metadata(
...     "precip.nc"
... )
>>> md.driver
'netCDF'
See Also

MetadataBuilder: The builder class used internally.

Source code in src/pyramids/netcdf/metadata.py
def get_metadata(
    source,
    open_options: dict[str, Any] | None = None,
) -> NetCDFMetadata:
    """Build a ``NetCDFMetadata`` description of a multidimensional source.

    The source is resolved to a dataset handle, which is handed to
    ``MetadataBuilder``; the value returned by its ``build()`` method
    is returned unchanged.

    Args:
        source (gdal.Dataset | str | Path | object): What to read.

            * ``str`` or ``Path``: opened here through
              ``gdal.OpenEx`` with ``gdal.OF_MULTIDIM_RASTER``.
            * An object exposing a ``_raster`` attribute (such as a
              pyramids ``NetCDF``/``Dataset`` instance): that
              attribute is used as the dataset.
            * Anything else: passed to the builder as-is.
        open_options: Optional dictionary of GDAL open-options. It is
            forwarded to ``MetadataBuilder`` only and is not used
            when a file path is opened here.

    Returns:
        NetCDFMetadata: The result of ``MetadataBuilder.build()``.

    Raises:
        ValueError: If *source* is a path and ``gdal.OpenEx`` returns
            ``None`` for it.

    Examples:
        Read metadata straight from a file path:

        >>> import pyramids.netcdf.metadata as meta  # doctest: +SKIP
        >>> md = meta.get_metadata("precip.nc")  # doctest: +SKIP
        >>> md.driver  # doctest: +SKIP
        'netCDF'

    See Also:
        MetadataBuilder: Performs the actual metadata collection.
    """
    if isinstance(source, (str, Path)):
        handle = gdal.OpenEx(str(source), gdal.OF_MULTIDIM_RASTER)
        if handle is None:
            raise ValueError(f"Could not open '{source}' as multidimensional raster")
        metadata = MetadataBuilder(handle, open_options).build()
        # Drop the local reference to the handle opened above.
        handle = None
        return metadata

    # Wrapper objects carry the dataset on ``_raster``; anything else is
    # treated as the dataset itself.
    dataset = source._raster if hasattr(source, "_raster") else source
    return MetadataBuilder(dataset, open_options).build()

pyramids.netcdf.metadata.to_json(metadata) #

Serialize NetCDFMetadata to a compact JSON string.

Converts the dataclass tree to plain dicts via to_dict and then encodes to JSON with no extra whitespace.

Parameters:

Name Type Description Default
metadata NetCDFMetadata

A NetCDFMetadata instance to serialize.

required

Returns:

Name Type Description
str str

JSON-encoded string with no ASCII escaping and compact separators (no spaces after , or :).

Examples:

Round-trip a minimal metadata object:

>>> import json
>>> from pyramids.netcdf.metadata import to_json
>>> from pyramids.netcdf.models import (
...     NetCDFMetadata, StructuralInfo,
... )
>>> md = NetCDFMetadata(
...     driver="netCDF",
...     root_group="/",
...     groups={},
...     variables={},
...     dimensions={},
...     global_attributes={},
...     structural=StructuralInfo(
...         driver_name="netCDF"
...     ),
...     created_with={"library": "GDAL"},
... )
>>> s = to_json(md)
>>> json.loads(s)["driver"]
'netCDF'
See Also

to_dict: Converts to plain dicts without JSON encoding.

from_json: Deserializes the string back to NetCDFMetadata.

Source code in src/pyramids/netcdf/metadata.py
def to_json(metadata: NetCDFMetadata) -> str:
    """Encode ``NetCDFMetadata`` as a compact JSON string.

    The metadata is first flattened with ``to_dict`` and the result
    is passed to ``json.dumps``.

    Args:
        metadata: The ``NetCDFMetadata`` instance to encode.

    Returns:
        str: JSON text written with ``ensure_ascii=False`` (non-ASCII
            characters are emitted as-is) and ``(",", ":")``
            separators (no whitespace after ``,`` or ``:``).

    Examples:
        Encode a minimal metadata object and read one field back:

        >>> import json
        >>> from pyramids.netcdf.metadata import to_json
        >>> from pyramids.netcdf.models import (
        ...     NetCDFMetadata, StructuralInfo,
        ... )
        >>> md = NetCDFMetadata(
        ...     driver="netCDF",
        ...     root_group="/",
        ...     groups={},
        ...     variables={},
        ...     dimensions={},
        ...     global_attributes={},
        ...     structural=StructuralInfo(
        ...         driver_name="netCDF"
        ...     ),
        ...     created_with={"library": "GDAL"},
        ... )
        >>> json.loads(to_json(md))["driver"]
        'netCDF'

    See Also:
        to_dict: Produces the plain-dict form without encoding it.
        from_json: Rebuilds ``NetCDFMetadata`` from the string.
    """
    plain = to_dict(metadata)
    compact_separators = (",", ":")
    return json.dumps(plain, ensure_ascii=False, separators=compact_separators)

pyramids.netcdf.metadata.from_json(s) #

Deserialize NetCDFMetadata from a JSON string.

Parses the JSON produced by to_json and manually reconstructs the dataclass hierarchy (GroupInfo, VariableInfo, DimensionInfo, StructuralInfo).

Only the schema produced by to_dict / to_json is supported; arbitrary JSON will likely raise KeyError.

Parameters:

Name Type Description Default
s str

A JSON string previously produced by to_json.

required

Returns:

Name Type Description
NetCDFMetadata NetCDFMetadata

Reconstructed metadata instance.

Raises:

Type Description
JSONDecodeError

If s is not valid JSON.

KeyError

If required fields are missing from the JSON payload.

Examples:

Round-trip through JSON:

>>> from pyramids.netcdf.metadata import (
...     to_json, from_json,
... )
>>> from pyramids.netcdf.models import (
...     NetCDFMetadata, StructuralInfo,
... )
>>> md = NetCDFMetadata(
...     driver="netCDF",
...     root_group="/",
...     groups={},
...     variables={},
...     dimensions={},
...     global_attributes={"history": "created"},
...     structural=StructuralInfo(
...         driver_name="netCDF"
...     ),
...     created_with={"library": "GDAL"},
... )
>>> s = to_json(md)
>>> restored = from_json(s)
>>> restored.driver
'netCDF'
>>> restored.global_attributes["history"]
'created'
See Also

to_json: The serialization counterpart.

Source code in src/pyramids/netcdf/metadata.py
def from_json(s: str) -> NetCDFMetadata:
    """Rebuild ``NetCDFMetadata`` from a JSON string.

    The string is decoded with ``json.loads`` and the dataclass
    hierarchy (``GroupInfo``, ``VariableInfo``, ``DimensionInfo``,
    ``StructuralInfo``) is reconstructed by hand from the resulting
    dictionaries.

    The expected layout is the one written by ``to_dict`` /
    ``to_json``. Every top-level key is optional and falls back to a
    default, but each group, variable, and dimension entry must carry
    ``name`` and ``full_name`` (dimensions also need ``size``).

    Args:
        s: JSON text, normally the output of ``to_json``.

    Returns:
        NetCDFMetadata: The reconstructed metadata instance.

    Raises:
        json.JSONDecodeError: If *s* is not valid JSON.
        KeyError: If a group, variable, or dimension entry lacks one
            of its required keys.

    Examples:
        Serialize and restore a minimal object:

        >>> from pyramids.netcdf.metadata import (
        ...     to_json, from_json,
        ... )
        >>> from pyramids.netcdf.models import (
        ...     NetCDFMetadata, StructuralInfo,
        ... )
        >>> md = NetCDFMetadata(
        ...     driver="netCDF",
        ...     root_group="/",
        ...     groups={},
        ...     variables={},
        ...     dimensions={},
        ...     global_attributes={"history": "created"},
        ...     structural=StructuralInfo(
        ...         driver_name="netCDF"
        ...     ),
        ...     created_with={"library": "GDAL"},
        ... )
        >>> restored = from_json(to_json(md))
        >>> restored.driver
        'netCDF'
        >>> restored.global_attributes["history"]
        'created'

    See Also:
        to_json: Produces the string this function consumes.
    """
    payload = json.loads(s)

    def _group_from(raw: dict[str, Any]) -> GroupInfo:
        # ``name`` and ``full_name`` are mandatory; the rest default to empty.
        return GroupInfo(
            name=raw["name"],
            full_name=raw["full_name"],
            attributes=raw.get("attributes", {}),
            children=raw.get("children", []),
            variables=raw.get("variables", []),
        )

    def _dimension_from(raw: dict[str, Any]) -> DimensionInfo:
        name = raw["name"]
        full_name = raw["full_name"]
        size = int(raw["size"])
        return DimensionInfo(
            name=name,
            full_name=full_name,
            size=size,
            type=raw.get("type"),
            direction=raw.get("direction"),
            indexing_variable=raw.get("indexing_variable"),
            attrs=raw.get("attrs", {}),
        )

    def _variable_from(raw: dict[str, Any]) -> VariableInfo:
        name = raw["name"]
        full_name = raw["full_name"]
        shape = [int(extent) for extent in raw.get("shape", [])]
        dim_names = [str(dim) for dim in raw.get("dimensions", [])]
        coord_names = [str(coord) for coord in raw.get("coordinate_variables", [])]

        # A missing or null ``block_size`` stays ``None``; otherwise each
        # entry is coerced to ``int``.
        raw_block = raw.get("block_size")
        if raw_block is None:
            block_size = None
        else:
            block_size = [int(extent) for extent in raw_block]

        return VariableInfo(
            name=name,
            full_name=full_name,
            dtype=raw.get("dtype", "unknown"),
            shape=shape,
            dimensions=dim_names,
            attributes=raw.get("attributes", {}),
            unit=raw.get("unit"),
            nodata=raw.get("nodata"),
            scale=raw.get("scale"),
            offset=raw.get("offset"),
            srs_wkt=raw.get("srs_wkt"),
            srs_projjson=raw.get("srs_projjson"),
            coordinate_variables=coord_names,
            structural_info=raw.get("structural_info"),
            block_size=block_size,
        )

    groups: dict[str, GroupInfo] = {}
    for key, raw_group in payload.get("groups", {}).items():
        groups[key] = _group_from(raw_group)

    variables: dict[str, VariableInfo] = {}
    for key, raw_variable in payload.get("variables", {}).items():
        variables[key] = _variable_from(raw_variable)

    dimensions: dict[str, DimensionInfo] = {}
    for key, raw_dimension in payload.get("dimensions", {}).items():
        dimensions[key] = _dimension_from(raw_dimension)

    # ``structural`` is optional; when absent or null it is left as ``None``.
    raw_structural = payload.get("structural")
    if raw_structural is None:
        structural_info = None
    else:
        structural_info = StructuralInfo(
            driver_name=raw_structural.get("driver_name", "UNKNOWN"),
            driver_metadata=raw_structural.get("driver_metadata"),
        )

    return NetCDFMetadata(
        driver=payload.get("driver", "UNKNOWN"),
        root_group=payload.get("root_group"),
        groups=groups,
        variables=variables,
        dimensions=dimensions,
        global_attributes=payload.get("global_attributes", {}),
        structural=structural_info,
        open_options_used=payload.get("open_options_used"),
        created_with=payload.get("created_with", {}),
    )

pyramids.netcdf.metadata.to_dict(metadata) #

Convert NetCDFMetadata to plain dicts suitable for JSON.

Recursively walks all dataclass fields and converts them to plain dict / list / scalar types so the result can be passed directly to json.dumps.

Parameters:

Name Type Description Default
metadata NetCDFMetadata

A NetCDFMetadata instance to convert.

required

Returns:

Name Type Description
dict dict[str, Any]

Nested dictionary with all dataclass fields converted to plain dicts.

Examples:

Convert a minimal metadata object:

>>> from pyramids.netcdf.metadata import to_dict
>>> from pyramids.netcdf.models import (
...     NetCDFMetadata, StructuralInfo,
... )
>>> md = NetCDFMetadata(
...     driver="netCDF",
...     root_group="/",
...     groups={},
...     variables={},
...     dimensions={},
...     global_attributes={"title": "test"},
...     structural=StructuralInfo(
...         driver_name="netCDF"
...     ),
...     created_with={"library": "GDAL"},
... )
>>> d = to_dict(md)
>>> d["driver"]
'netCDF'
>>> d["global_attributes"]["title"]
'test'
>>> d["structural"]["driver_name"]
'netCDF'
See Also

to_json: Serializes directly to a JSON string.

from_json: Deserializes a JSON string back to NetCDFMetadata.

Source code in src/pyramids/netcdf/metadata.py
def to_dict(metadata: NetCDFMetadata) -> dict[str, Any]:
    """Flatten ``NetCDFMetadata`` into plain dictionaries and lists.

    The value is walked recursively:

    * dataclass instances are expanded with ``dataclasses.asdict``;
    * dictionary keys are converted to ``str``;
    * list items are converted one by one;
    * any other value is returned unchanged.

    Args:
        metadata: The ``NetCDFMetadata`` instance to convert.

    Returns:
        dict: Nested dictionary holding the converted dataclass
            fields.

    Examples:
        Convert a minimal metadata object:

        >>> from pyramids.netcdf.metadata import to_dict
        >>> from pyramids.netcdf.models import (
        ...     NetCDFMetadata, StructuralInfo,
        ... )
        >>> md = NetCDFMetadata(
        ...     driver="netCDF",
        ...     root_group="/",
        ...     groups={},
        ...     variables={},
        ...     dimensions={},
        ...     global_attributes={"title": "test"},
        ...     structural=StructuralInfo(
        ...         driver_name="netCDF"
        ...     ),
        ...     created_with={"library": "GDAL"},
        ... )
        >>> d = to_dict(md)
        >>> d["driver"]
        'netCDF'
        >>> d["global_attributes"]["title"]
        'test'
        >>> d["structural"]["driver_name"]
        'netCDF'

    See Also:
        to_json: Encodes the same structure as a JSON string.
        from_json: Rebuilds ``NetCDFMetadata`` from that string.
    """

    def _plain(node: Any) -> Any:
        # Dataclass *instances* (not dataclass types) become a dict of
        # their fields, which is then handled by the dict case below.
        if is_dataclass(node) and not isinstance(node, type):
            node = asdict(node)

        if isinstance(node, dict):
            flattened: dict[str, Any] = {}
            for key, value in node.items():
                flattened[str(key)] = _plain(value)
            return flattened

        if isinstance(node, list):
            return [_plain(item) for item in node]

        return node

    return cast(dict[str, Any], _plain(metadata))