Source code for argopy.utils.transform

"""
Manipulate/transform xarray objects or lists of such objects
"""
import numpy as np
import xarray as xr
import logging
from typing import List


# Module-level logger used by the functions below for debug messages.
# NOTE(review): the logger name ("argopy.utils.manip") differs from the module
# path given in the page header (argopy.utils.transform) - confirm this is intentional.
log = logging.getLogger("argopy.utils.manip")


def drop_variables_not_in_all_datasets(
    ds_collection: List[xr.Dataset],
) -> List[xr.Dataset]:
    """Drop variables that are not in all datasets (the lowest common denominator)

    Each dataset of the collection is reduced to the data variables that are
    present in *every* dataset of the collection.

    Parameters
    ----------
    ds_collection: List[xarray.Dataset]
        A list of :class:`xarray.Dataset`. The list is updated in place: each
        item is replaced by the output of :meth:`xarray.Dataset.drop_vars`.

    Returns
    -------
    List[xarray.Dataset]
        The same list object as ``ds_collection``, holding the reduced datasets.
    """
    if not ds_collection:
        # Nothing to compare (and set.intersection() needs at least one operand)
        return ds_collection

    # Data variable names of each dataset:
    names_per_ds = [set(ds.data_vars) for ds in ds_collection]

    # Variables found in every dataset, and variables missing from at least one:
    common = set.intersection(*names_per_ds)
    missing = set.union(*names_per_ds) - common

    # Only report when something is actually going to be dropped
    if missing:
        log.debug(
            "Dropping these variables that are missing from some dataset in this list: %s",
            sorted(missing, key=str),
        )

    for ir, ds in enumerate(ds_collection):
        v_to_drop = [v for v in ds.data_vars if v not in common]
        # drop_vars is called even with an empty list, so that every item of
        # the collection is systematically replaced by a drop_vars output
        ds_collection[ir] = ds.drop_vars(v_to_drop)

    return ds_collection
def fill_variables_not_in_all_datasets(
    ds_collection: List[xr.Dataset], concat_dim: str = "rows"
) -> List[xr.Dataset]:
    """Add empty variables to dataset so that all the collection have the same
    :attr:`xarray.Dataset.data_vars` and :attr:`xarray.Dataset.coords`

    This is to make sure that the collection of dataset can be concatenated.

    Only variables having ``concat_dim`` among their dimensions in at least one
    dataset are considered. A variable missing from a dataset is created with
    :func:`xarray.full_like`, using the first data variable of that dataset
    having ``concat_dim`` in its dimensions as a template for the shape. The
    dtype, attributes and fill value come from the first dataset of the
    collection where the variable is found.

    Parameters
    ----------
    ds_collection: List[xarray.Dataset]
        A list of :class:`xarray.Dataset`. Input datasets are not modified,
        variables are added to copies.
    concat_dim: str, default='rows'
        Name of the dimension to use to create new variables. Typically, this is the name of the dimension the
        collection will be concatenated along afterward.

    Returns
    -------
    List[xarray.Dataset]
        A new list of datasets, in the same order as ``ds_collection``.

    Raises
    ------
    KeyError
        If a dataset misses a variable and has no data variable with
        ``concat_dim`` in its dimensions to be used as a template.
    """

    def first_variable_with_concat_dim(this_ds, concat_dim="rows"):
        """Return the name of the 1st data variable having concat_dim in dims, None if there is none"""
        for v in this_ds.data_vars:
            if concat_dim in this_ds[v].dims:
                return v
        return None

    def fillvalue(da):
        """Return fillvalue for a dataarray"""
        # https://docs.scipy.org/doc/numpy/reference/generated/numpy.dtype.kind.html#numpy.dtype.kind
        kind = da.dtype.kind
        if kind == "U":
            return " "
        if kind == "i":
            return 99999
        if kind == "M":
            return np.datetime64("NaT")
        return np.nan

    # List all possible variables and coordinates having the concat_dim:
    vnames = set()
    cnames = set()
    for ds in ds_collection:
        vnames.update(v for v in ds.variables if concat_dim in ds.variables[v].dims)
        cnames.update(c for c in ds.coords if concat_dim in ds.variables[c].dims)
    # Sorted to get a deterministic order of the variables to add
    vlist = sorted(vnames, key=str)
    clist = sorted(cnames, key=str)

    # Get the first occurrence of each variable, to be used as a template for attributes and dtype
    meta = {}
    for ds in ds_collection:
        for v in vlist:
            if v not in meta and v in ds.variables:
                meta[v] = {
                    "attrs": ds[v].attrs,
                    "dtype": ds[v].dtype,
                    "fill_value": fillvalue(ds[v]),
                }

    # Add missing variables to dataset (working on copies, to preserve the input)
    datasets = [ds.copy() for ds in ds_collection]
    for ds in datasets:
        absent = [v for v in vlist if v not in ds.variables]
        if not absent:
            continue

        # The template is looked up once per dataset, before adding anything,
        # so that all new variables of a dataset share the same template
        template = first_variable_with_concat_dim(ds, concat_dim=concat_dim)
        if template is None:
            raise KeyError(
                "No data variable with dimension '%s' to use as a template for "
                "missing variables: %s" % (concat_dim, absent)
            )
        like = ds[template]

        for v in absent:
            filler = xr.full_like(
                like, fill_value=meta[v]["fill_value"], dtype=meta[v]["dtype"]
            )
            # full_like copies the template attributes: replace them with a
            # copy of those from the variable being filled
            filler.attrs = dict(meta[v]["attrs"])
            ds[v] = filler

    # Make sure that all datasets have the same set of coordinates
    return [ds.set_coords(clist) for ds in datasets]