Source code for hydro.discoverability

"""Functions for adding ACDD/discoverability attributes to datasets based on their contents"""

import numpy as np
import xarray as xr


[docs] def flag_histogram(ds: xr.Dataset) -> xr.Dataset: """For flag variables, create a histogram of how many times each flag value appears""" ds_ = ds.copy() # TODO: switch to operator.is_not_none when python 3.13 dropped for var in ds_.filter_by_attrs(flag_values=lambda x: x is not None): da = ds_[var] unique, counts = np.unique(da.fillna(9), return_counts=True) flag_values = da.attrs["flag_values"] histogram = np.full_like(flag_values, 0, dtype=np.int64) histogram[np.searchsorted(flag_values, unique)] = counts da.attrs["flag_histogram"] = histogram return ds_
[docs] def min_max(ds: xr.Dataset) -> xr.Dataset: """Set min/max attributes on all the numeric data variables including flags This will use two "made up" attribute names of: * actual_max * actual_min The more well known attributes of "valid_min" and "valid_max" apply to the packed data, so if there are add_offset and scale_factor attributes, any valid_* attributes would apply to the data before scaling/multiplying. Additionally, generic applications should mask any values outside the "valid" ranges. CF defines an attribute "actual_range" that includes both min and max values of the unpacked data. For JSON-LD/schema.org reasons we need the min/max as separate values. """ ds_ = ds.copy() for var in ds_.variables: da = ds_[var] if da.dtype.kind not in "biuf": # numpy codes for the numeric types continue attrs = { "actual_min": da.min(skipna=True).item(), "actual_max": da.max(skipna=True).item(), } da.attrs.update(attrs) return ds_
[docs] def temporal(ds: xr.Dataset) -> xr.Dataset: """Set the temporal extent global attributes This includes: * time_coverage_start * time_coverage_end """ ds_ = ds.copy() fmt = "%Y-%m-%dT%H:%M:%SZ" attrs = { "time_coverage_start": ds_.time.min().dt.strftime(fmt).item(), "time_coverage_end": ds_.time.max().dt.strftime(fmt).item(), } ds_.attrs.update(attrs) return ds_
[docs] def geospatial(ds: xr.Dataset) -> xr.Dataset: """Set the geospatial extent global attributes These include: * geospatial_lat_min * geospatial_lat_max * geospatial_lon_min * geospatial_lon_max * geospatial_vertical_min * geospatial_vertical_max * geospatial_vertical_positive (always "down") * geospatial_vertical_units (always "dbar") Note that according to the "spec" of ACDD 1.3 if the lon min is greater than the lon max, this indicates that the geospatial extent crosses the discontinuity meridian. This function assumes that if the difference between the min/max of lon is larger than 300 degrees that crossing is occurring. The behavior if the ship goes to a longitude singularity such as the north pole not explicitly handled. The ACDD 1.3 "spec" also allows for the pressure units of bar to be used for the vertical units. """ ds_ = ds.copy() attrs = { "geospatial_lat_min": ds_.latitude.min(skipna=True).item(), "geospatial_lat_max": ds_.latitude.max(skipna=True).item(), "geospatial_lon_min": ds_.longitude.min(skipna=True).item(), "geospatial_lon_max": ds_.longitude.max(skipna=True).item(), "geospatial_vertical_min": ds_.pressure.min(skipna=True).item(), "geospatial_vertical_max": ds_.pressure.max(skipna=True).item(), "geospatial_vertical_positive": ds_.pressure.attrs["positive"], "geospatial_vertical_units": ds_.pressure.attrs["units"], } # Handle basic longitude crossing a discontinuity if (attrs["geospatial_lon_max"] - attrs["geospatial_lon_min"]) > 300: attrs["geospatial_lon_max"] = np.nanmax( ds_.longitude.values, where=ds_.longitude.values < 0, initial=-180 ).item() attrs["geospatial_lon_min"] = np.nanmin( ds_.longitude.values, where=ds_.longitude.values > 0, initial=180 ).item() ds_.attrs.update(attrs) return ds_
[docs] def add_discoverability_attributes(ds: xr.Dataset) -> xr.Dataset: """Add (or update) all the discoverability metadata from all the other functions in this module""" return ds.pipe(flag_histogram).pipe(min_max).pipe(temporal).pipe(geospatial)