Source code for argopy.utils.transform

"""
Manipulate/transform xarray objects or list of objects
"""

import numpy as np
import xarray as xr
import logging
from typing import List, Union

from ..errors import InvalidDatasetStructure
from .lists import list_core_parameters


log = logging.getLogger("argopy.utils.manip")


def drop_variables_not_in_all_datasets(
    ds_collection: List[xr.Dataset],
) -> List[xr.Dataset]:
    """Drop variables that are not in all datasets (the lowest common denominator)

    Parameters
    ----------
    ds_collection: List[xarray.Dataset]
        A list of :class:`xarray.Dataset`

    Returns
    -------
    List[xarray.Dataset]
    """
    # List all possible data variables:
    vlist = []
    for res in ds_collection:
        vlist.extend(list(res.data_vars))
    vlist = np.unique(vlist)

    # Check if each variable is in each dataset:
    ishere = np.zeros((len(vlist), len(ds_collection)))
    for ir, res in enumerate(ds_collection):
        for iv, v in enumerate(res.data_vars):
            for iu, u in enumerate(vlist):
                if v == u:
                    ishere[iu, ir] = 1

    # List of variables missing from at least one dataset:
    iv_missing = np.sum(ishere, axis=1) < len(ds_collection)
    if np.any(iv_missing):
        log.debug(
            "Dropping these variables that are missing from some dataset in this list: %s"
            % vlist[iv_missing]
        )

    # List of variables to keep:
    iv_tokeep = np.sum(ishere, axis=1) == len(ds_collection)
    for ir, res in enumerate(ds_collection):
        v_to_drop = [v for v in res.data_vars if v not in vlist[iv_tokeep]]
        ds_collection[ir] = ds_collection[ir].drop_vars(v_to_drop)

    return ds_collection
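
# Illustrative usage (not part of the module): a minimal sketch showing how two
# datasets with different variables are reduced to their common variables before
# concatenation. The variable names and shapes below are made up for the example.
#
#   import numpy as np
#   import xarray as xr
#   from argopy.utils.transform import drop_variables_not_in_all_datasets
#
#   ds1 = xr.Dataset({"TEMP": ("rows", np.zeros(3)), "PSAL": ("rows", np.zeros(3))})
#   ds2 = xr.Dataset({"TEMP": ("rows", np.zeros(2))})
#   out = drop_variables_not_in_all_datasets([ds1, ds2])
#   # Both datasets now expose only 'TEMP', so xr.concat(out, dim="rows") is safe.
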
def fill_variables_not_in_all_datasets(
    ds_collection: List[xr.Dataset], concat_dim: str = "rows"
) -> List[xr.Dataset]:
    """Add empty variables to datasets so that the whole collection has the same
    :attr:`xarray.Dataset.data_vars` and :attr:`xarray.Dataset.coords`

    This is to make sure that the collection of datasets can be concatenated

    Parameters
    ----------
    ds_collection: List[xarray.Dataset]
        A list of :class:`xarray.Dataset`
    concat_dim: str, default='rows'
        Name of the dimension to use to create new variables. Typically, this is the name of the
        dimension the collection will be concatenated along afterward.

    Returns
    -------
    List[xarray.Dataset]
    """

    def first_variable_with_concat_dim(this_ds, concat_dim="rows"):
        """Return the first variable in the dataset that has the concat_dim in its dims"""
        for v in this_ds.data_vars:
            if concat_dim in this_ds[v].dims:
                return v
        return None

    def fillvalue(da):
        """Return the fill value appropriate for a dataarray dtype"""
        # https://docs.scipy.org/doc/numpy/reference/generated/numpy.dtype.kind.html#numpy.dtype.kind
        if da.dtype.kind in ["U"]:
            fillvalue = " "
        elif da.dtype.kind == "i":
            fillvalue = 99999
        elif da.dtype.kind == "M":
            fillvalue = np.datetime64("NaT")
        else:
            fillvalue = np.nan
        return fillvalue

    # List all possible data variables:
    vlist = []
    for res in ds_collection:
        vlist.extend([v for v in res.variables if concat_dim in res[v].dims])
    vlist = np.unique(vlist)

    # List all possible coordinates:
    clist = []
    for res in ds_collection:
        clist.extend([c for c in res.coords if concat_dim in res[c].dims])
    clist = np.unique(clist)

    # Get the first occurrence of each variable, to be used as a template for attributes and dtype:
    meta = {}
    for ds in ds_collection:
        for v in vlist:
            if v in ds.variables and v not in meta:
                meta[v] = {
                    "attrs": ds[v].attrs,
                    "dtype": ds[v].dtype,
                    "fill_value": fillvalue(ds[v]),
                }

    # Add missing variables to each dataset:
    datasets = [ds.copy() for ds in ds_collection]
    for ir, ds in enumerate(datasets):
        for v in vlist:
            if v not in ds.variables:
                like = ds[first_variable_with_concat_dim(ds, concat_dim=concat_dim)]
                datasets[ir][v] = xr.full_like(
                    like, fill_value=meta[v]["fill_value"], dtype=meta[v]["dtype"]
                )
                datasets[ir][v].attrs = meta[v]["attrs"]

    # Make sure that all datasets have the same set of coordinates:
    results = []
    for ir, ds in enumerate(datasets):
        results.append(datasets[ir].set_coords(clist))

    return results
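
# Illustrative usage (not part of the module): a sketch of the complementary
# strategy to the function above, where missing variables are filled rather
# than dropped. Variable names are made up for the example.
#
#   import numpy as np
#   import xarray as xr
#   from argopy.utils.transform import fill_variables_not_in_all_datasets
#
#   ds1 = xr.Dataset({"TEMP": ("rows", np.zeros(3)), "DOXY": ("rows", np.zeros(3))})
#   ds2 = xr.Dataset({"TEMP": ("rows", np.zeros(2))})
#   out = fill_variables_not_in_all_datasets([ds1, ds2], concat_dim="rows")
#   # ds2 now carries a 'DOXY' variable filled with NaN, so the whole
#   # collection can be concatenated: xr.concat(out, dim="rows")
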
def merge_param_with_param_adjusted(
    ds: xr.Dataset, param: str, errors: str = "raise"
) -> xr.Dataset:
    """Copy <PARAM>_ADJUSTED values onto <PARAM> for points where the parameter data mode is 'A' or 'D'

    After values have been copied, all <PARAM>_ADJUSTED* variables are dropped to avoid confusion.

    For core and deep datasets (ds='phy'), we use the ``DATA_MODE`` variable. For the bgc dataset
    (ds='bgc'), we use the ``<PARAM>_DATA_MODE`` variables. The type of dataset is inferred
    automatically.

    Parameters
    ----------
    ds: :class:`xarray.Dataset`
        Dataset to transform
    param: str
        Name of the parameter to merge
    errors: str, optional, default='raise'
        If 'raise': raises an InvalidDatasetStructure error if any of the expected dataset
        variables is not found. If 'ignore', fails silently and returns the unmodified dataset.

    Returns
    -------
    :class:`xarray.Dataset`
    """
    if "%s_ADJUSTED" % param not in ds:
        if errors == "raise":
            raise InvalidDatasetStructure(
                "Parameter '%s_ADJUSTED' adjusted values not found in this dataset" % param
            )
        else:
            return ds

    if ds.argo._type != "point":
        raise InvalidDatasetStructure("Method only available to a collection of points")

    core_ds = False
    if "%s_DATA_MODE" % param not in ds and param in list_core_parameters():
        if "DATA_MODE" not in ds:
            if errors == "raise":
                raise InvalidDatasetStructure(
                    "Parameter '%s' data mode not found in this dataset (no 'DATA_MODE')" % param
                )
            else:
                return ds
        else:
            core_ds = True
            # Create a bgc-like parameter data mode variable
            # (it will be dropped at the end of the process):
            ds["%s_DATA_MODE" % param] = ds["DATA_MODE"].copy()

    if param not in ds:
        ds[param] = ds["%s_ADJUSTED" % param].copy()
    if "%s_QC" % param not in ds and "%s_ADJUSTED_QC" % param in ds:
        ds["%s_QC" % param] = ds["%s_ADJUSTED_QC" % param].copy()
    if "%s_ERROR" % param not in ds and "%s_ADJUSTED_ERROR" % param in ds:
        ds["%s_ERROR" % param] = ds["%s_ADJUSTED_ERROR" % param].copy()

    ii_measured = np.logical_or.reduce(
        (
            ds["%s_DATA_MODE" % param] == "R",
            ds["%s_DATA_MODE" % param] == "A",
            ds["%s_DATA_MODE" % param] == "D",
        )
    )
    ii_missing = np.logical_and.reduce(
        (
            ds["%s_DATA_MODE" % param] != "R",
            ds["%s_DATA_MODE" % param] != "A",
            ds["%s_DATA_MODE" % param] != "D",
        )
    )
    assert ii_measured.sum() + ii_missing.sum() == len(
        ds["N_POINTS"]
    ), "Unexpected data mode values !"

    ii_measured_adj = np.logical_and.reduce(
        (
            ii_measured,
            np.logical_or.reduce(
                (ds["%s_DATA_MODE" % param] == "A", ds["%s_DATA_MODE" % param] == "D")
            ),
        )
    )

    # Copy param_adjusted values onto param indexes where data_mode is 'A' or 'D':
    ds["%s" % param].loc[dict(N_POINTS=ii_measured_adj)] = ds[
        "%s_ADJUSTED" % param
    ].loc[dict(N_POINTS=ii_measured_adj)]
    ds = ds.drop_vars(["%s_ADJUSTED" % param])

    if "%s_QC" % param in ds and "%s_ADJUSTED_QC" % param in ds:
        ds["%s_QC" % param].loc[dict(N_POINTS=ii_measured_adj)] = ds[
            "%s_ADJUSTED_QC" % param
        ].loc[dict(N_POINTS=ii_measured_adj)]
        ds = ds.drop_vars(["%s_ADJUSTED_QC" % param])

    if "%s_ERROR" % param in ds and "%s_ADJUSTED_ERROR" % param in ds:
        ds["%s_ERROR" % param].loc[dict(N_POINTS=ii_measured_adj)] = ds[
            "%s_ADJUSTED_ERROR" % param
        ].loc[dict(N_POINTS=ii_measured_adj)]
        ds = ds.drop_vars(["%s_ADJUSTED_ERROR" % param])

    if core_ds:
        ds = ds.drop_vars(["%s_DATA_MODE" % param])

    return ds
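
# Illustrative usage (not part of the module): a hedged sketch assuming a
# point-indexed dataset fetched in 'expert' mode, so that DATA_MODE and
# PSAL_ADJUSTED* variables are present. The float WMO is only an example.
#
#   from argopy import DataFetcher
#   from argopy.utils.transform import merge_param_with_param_adjusted
#
#   ds = DataFetcher(mode="expert").float(6902746).to_xarray()
#   ds = merge_param_with_param_adjusted(ds, "PSAL")
#   # 'PSAL' now holds PSAL_ADJUSTED values wherever the data mode is 'A' or 'D',
#   # and all PSAL_ADJUSTED* variables have been dropped.
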
def filter_param_by_data_mode(
    ds: xr.Dataset,
    param: str,
    dm: Union[str, List[str]] = ["R", "A", "D"],
    mask: bool = False,
    errors: str = "raise",
) -> xr.Dataset:
    """Filter measurements according to a parameter data mode

    Filter the dataset to keep points where a parameter is in any of the data modes specified.

    This method can return the filtered dataset or the filter mask.

    Notes
    -----
    - Method compatible with core, deep and BGC datasets
    - Can be applied after :meth:`xarray.Dataset.transform_data_mode`

    Parameters
    ----------
    ds: :class:`xarray.Dataset`
        The dataset to filter
    param: str
        Name of the parameter to apply the filter to
    dm: str, list(str), optional, default=['R', 'A', 'D']
        List of DATA_MODE values (string) to keep
    mask: bool, optional, default=False
        Determine if we should return the filter mask or the filtered dataset
    errors: str, optional, default='raise'
        If ``raise``, raises an InvalidDatasetStructure error if any of the expected variables is
        not found. If ``ignore``, fails silently and returns the unmodified dataset.

    Returns
    -------
    :class:`xarray.Dataset`
    """
    core_ds = False
    if "%s_DATA_MODE" % param not in ds and param in list_core_parameters():
        if "DATA_MODE" not in ds:
            if errors == "raise":
                raise InvalidDatasetStructure(
                    "Parameter '%s' data mode not found in this dataset (no 'DATA_MODE')" % param
                )
            else:
                return ds
        else:
            core_ds = True
            # Create a bgc-like parameter data mode variable
            # (it will be dropped at the end of the process):
            ds["%s_DATA_MODE" % param] = ds["DATA_MODE"].copy()

    filter = []
    for this_dm in dm:
        vname = "%s_DATA_MODE" % param
        if vname not in ds:
            log.warning("The parameter '%s' has no associated data mode" % vname)
        else:
            filter.append(ds[vname] == "%s" % this_dm.upper())
    if len(filter) > 0:
        filter = np.logical_or.reduce(filter)

    if core_ds:
        ds = ds.drop_vars(["%s_DATA_MODE" % param])

    if mask:
        return filter
    else:
        return ds.loc[dict(N_POINTS=filter)] if len(filter) > 0 else ds
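
# Illustrative usage (not part of the module): a hedged sketch assuming 'ds' is
# a point-indexed dataset carrying a 'DATA_MODE' or 'PSAL_DATA_MODE' variable:
#
#   from argopy.utils.transform import filter_param_by_data_mode
#
#   # Keep only points where PSAL is in delayed mode:
#   ds_d = filter_param_by_data_mode(ds, "PSAL", dm=["D"])
#
#   # Or retrieve the boolean filter mask instead of the filtered dataset:
#   m = filter_param_by_data_mode(ds, "PSAL", dm=["D"], mask=True)
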
def split_data_mode(ds: xr.Dataset) -> xr.Dataset:
    """Convert PARAMETER_DATA_MODE(N_PROF, N_PARAM) into several <PARAM>_DATA_MODE(N_PROF) variables

    Using the list of *PARAM* found in ``STATION_PARAMETERS``, this method will create ``N_PARAM``
    new variables in the dataset: ``<PARAM>_DATA_MODE(N_PROF)``.

    The variable ``PARAMETER_DATA_MODE`` is dropped from the dataset at the end of the process.

    Parameters
    ----------
    ds: :class:`xarray.Dataset`
        Dataset to transform

    Returns
    -------
    :class:`xarray.Dataset`
    """
    if "STATION_PARAMETERS" in ds and "PARAMETER_DATA_MODE" in ds:
        u64 = lambda s: "%s%s" % (s, " " * (64 - len(s)))  # noqa: E731
        params = [p.strip() for p in np.unique(ds["STATION_PARAMETERS"])]
        for param in params:
            name = "%s_DATA_MODE" % param.replace("_PARAMETER", "").replace(
                "PARAMETER_", ""
            )
            mask = ds["STATION_PARAMETERS"] == xr.full_like(
                ds["STATION_PARAMETERS"],
                u64(param),
                dtype=ds["STATION_PARAMETERS"].dtype,
            )
            da = ds["PARAMETER_DATA_MODE"].where(mask, drop=True).isel(N_PARAM=0)
            da = da.rename(name)
            da = da.astype(ds["PARAMETER_DATA_MODE"].dtype)
            ds[name] = da

        ds = ds.drop_vars("PARAMETER_DATA_MODE")
        ds.argo.add_history("Transformed with 'split_data_mode'")

    return ds
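
# Illustrative usage (not part of the module): a hedged sketch assuming a
# multi-profile dataset with N_PROF/N_PARAM dimensions, e.g. loaded from a
# BGC 'Sprof' file. The file path below is hypothetical.
#
#   import xarray as xr
#   from argopy.utils.transform import split_data_mode
#
#   ds = xr.open_dataset("6902746_Sprof.nc")  # hypothetical local file
#   ds = split_data_mode(ds)
#   # 'PARAMETER_DATA_MODE' has been replaced by per-parameter variables,
#   # e.g. 'DOXY_DATA_MODE(N_PROF)' and 'CHLA_DATA_MODE(N_PROF)'.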