Source code for argopy.extensions.canyon_b

from pathlib import Path
from typing import Union, List, Optional
import pandas as pd
import numpy as np
import xarray as xr

try:
    import PyCO2SYS as pyco2

    HAS_PYCO2SYS = True
except ImportError:
    HAS_PYCO2SYS = False
    pyco2 = None

try:
    from numba import jit, prange

    HAS_NUMBA = True
except ImportError:
    HAS_NUMBA = False

    # Define dummy decorators (needed for tests when numba is not installed)
    def jit(*args, **kwargs):
        def decorator(func):
            return func

        return decorator

    prange = range

try:
    from joblib import Parallel, delayed

    HAS_JOBLIB = True
except ImportError:
    HAS_JOBLIB = False
    Parallel = None
    delayed = None

from ..errors import InvalidDatasetStructure, DataNotFound
from ..utils import path2assets, to_list, point_in_polygon
from . import register_argo_accessor, ArgoAccessorExtension


[docs] @register_argo_accessor("canyon_b") class CanyonB(ArgoAccessorExtension): """Nutrients and Carbonate System Variables predictor with CANYON-B This is an implementation of the CANYON-B method: a bayesian neural network approach that estimates water-column nutrient concentrations and carbonate system variables ([1]_). CANYON-B is based on the CANYON model ([2]_) and provides more robust neural networks, that include a local uncertainty estimate for each predicted parameter. When using this method, please cite the papers. See Also -------- :meth:`canyon_b.predict`, :attr:`canyon_b.input_list`, :attr:`canyon_b.output_list` Examples -------- Load data, they must contain oxygen measurements: .. code-block:: python from argopy import DataFetcher ArgoSet = DataFetcher(ds='bgc', mode='standard', params='DOXY', measured='DOXY').float(1902605) ds = ArgoSet.to_xarray() Once input data are loaded, make all or selected parameters predictions with or without specifying input errors on pressure (epres, in dbar), temperature (etemp, in degC), salinity (epsal, in PSU) and oxygen (edoxy, in micromole/kg). For interested users, uncertainties on predicted parameters can also be included. .. code-block:: python ds.argo.canyon_b.predict() ds.argo.canyon_b.predict('PO4') ds.argo.canyon_b.predict(['PO4', 'NO3']) ds.argo.canyon_b.predict(['PO4', 'NO3'], include_uncertainties=True) ds.argo.canyon_b.predict(['PO4', 'NO3'], epres=0.5, etemp=0.005, epsal=0.005, edoxy=0.01) ds.argo.canyon_b.predict(['PO4', 'NO3'], epres=0.5, etemp=0.005, epsal=0.005, edoxy=0.01, include_uncertainties=True) By default, if no input errors are specified, the following default values are used: - epres = 0.5 dbar - etemp = 0.005 degC - epsal = 0.005 PSU - edoxy = 1% of DOXY value Notes ----- This Python implementation is largely inspired by work from Raphael Bajon (https://github.com/RaphaelBajon) which is available at https://github.com/RaphaelBajon/canyonbpy and from the EuroGO-SHIP organization (https://github.com/EuroGO-SHIP/AtlantOS_QC/blob/master/atlantos_qc/data_models/extra/pycanyonb.py) References ---------- .. [1] Bittig, H. C., Steinhoff, T., Claustre, H., Fiedler, B., Williams, N. L., Sauzede, R., Kortzinger, A., and Gattuso, J. P. (2018). An alternative to static climatologies: Robust estimation of open ocean CO2 variables and nutrient concentrations from T, S, and O2 data using Bayesian neural networks. Frontiers in Marine Science, 5, 328. https://doi.org/10.3389/fmars.2018.00328 .. [2] Sauzede, R., Bittig, H. C., Claustre, H., Pasqueron de Fommervault, O., Gattuso, J. P., Legendre, L., and Johnson, K. S. (2017). Estimates of water-column nutrient concentrations and carbonate system parameters in the global ocean: A novel approach based on neural networks. Frontiers in Marine Science, 4, 128. https://doi.org/10.3389/fmars.2017.00128 """ n_inputs = ( 7 # Pressure, Temperature, Salinity, Oxygen, Latitude, Longitude, Decimal Year ) """Number of inputs variables for CANYON-B""" _input_list = ["LATITUDE", "LONGITUDE", "PRES", "TEMP", "PSAL", "DOXY"] """List of parameters required to make predictions""" _output_list = [ "PO4", "PO4_ci", "PO4_cim", "PO4_cin", "PO4_cii", "NO3", "NO3_ci", "NO3_cim", "NO3_cin", "NO3_cii", "SiOH4", "SiOH4_ci", "SiOH4_cim", "SiOH4_cin", "SiOH4_cii", "AT", "AT_ci", "AT_cim", "AT_cin", "AT_cii", "DIC", "DIC_ci", "DIC_cim", "DIC_cin", "DIC_cii", "pHT", "pHT_ci", "pHT_cim", "pHT_cin", "pHT_cii", "pCO2", "pCO2_ci", "pCO2_cim", "pCO2_cin", "pCO2_cii", ] # DIC = CT in Bittig et al., (2018), keep it that way to be consistent with the canyon-med extension. """List of all possible output variables for CANYON-B"""
[docs] def __init__(self, *args, **kwargs): if not HAS_PYCO2SYS: raise ImportError( "PyCO2SYS is required for the CANYON-B extension." "Install it with: pip install PyCO2SYS" ) if not HAS_NUMBA: raise ImportError( "numba is required for the CANYON-B extension." "Install it with: pip install numba" ) # Note: for performance reasons, numba is required now. if not HAS_JOBLIB: raise ImportError( "joblib is required for the CANYON-B extension." "Install it with: pip install joblib" ) # Note: for parallelization of predictions, joblib is required now. super().__init__(*args, **kwargs) if self._argo._type != "point": raise InvalidDatasetStructure( "Method only available for a collection of points" ) if self._argo.N_POINTS == 0: raise DataNotFound("Empty dataset, no data to transform !") self.path2coef = Path(path2assets).joinpath( "canyon-b" ) # Path to CANYON-B assets
def get_param_attrs(self, param: str) -> dict: """ Get attributes for a given predicted parameter. Parameters ---------- param : str Parameter name. Valid options are: - 'NO3': Nitrate - 'PO4': Phosphate - 'SiOH4': Silicate - 'AT': Total alkalinity - 'DIC': Dissolved inorganic carbon - 'pHT': Total pH - 'pCO2': Partial pressure of CO2 Returns ------- dict Attribute dictionary containing: - 'units': Measurement units - 'long_name': Descriptive parameter name - 'comment': Data provenance note - 'reference': CANYON-B digital object identifier (DOI) """ attrs = {} if param in ["NO3", "PO4", "SiOH4", "AT", "DIC"]: attrs.update({"units": "micromole/kg"}) if param == "NO3": attrs.update({"long_name": "Nitrate concentration"}) if param == "PO4": attrs.update({"long_name": "Phosphate concentration"}) if param == "SiOH4": attrs.update({"long_name": "Silicate concentration"}) if param == "AT": attrs.update({"long_name": "Total alkalinity"}) if param == "DIC": attrs.update({"long_name": "Total dissolved inorganic carbon"}) if param == "pHT": attrs.update({"long_name": "Total pH"}) attrs.update({"units": "insitu total scale"}) if param == "pCO2": attrs.update({"long_name": "Partial pressure of CO2"}) attrs.update({"units": "micro atm"}) attrs.update({"comment": "Synthetic variable predicted using CANYON-B"}) attrs.update({"reference": "https://doi.org/10.3389/fmars.2018.00328"}) return attrs @property def input_list(self) -> list[str]: """List of parameters required to make predictions with CANYON-B Returns ------- list[str], default = ["LATITUDE", "LONGITUDE", "PRES", "TEMP", "PSAL", "DOXY"] """ return self._input_list.copy() @property def output_list(self) -> list[str]: """List of all possible output variables for CANYON-B Returns ------- list[str], default = ["PO4", "NO3", "DIC", "SiOH4", "AT", "pHT", "pCO2"] Notes ----- DIC = CT in Bittig et al., (2018), keep it that way to be consistent with the canyon-med extension. """ return self._output_list.copy() @property def decimal_year(self): """ Return the decimal year representation of the dataset `TIME` variable. Returns ------- float or np.ndarray Decimal year value(s) """ time_array = self._obj[self._argo._TNAME] return time_array.dt.year + ( 86400 * time_array.dt.dayofyear + 3600 * time_array.dt.hour + time_array.dt.second ) / (365.0 * 24 * 60 * 60) def ds2df(self) -> pd.DataFrame: """ Convert xarray Dataset to CANYON-B neural network input format. Transforms the Argo xarray Dataset into a pandas DataFrame with the required input variables for the CANYON-B neural network. Applies Arctic latitude adjustments and modifies pressure values to account for the large range of pressure values (from the surface to 4000 m depth) and a non-homogeneous data distribution within this range, according to [1]_. Returns ------- pd.DataFrame DataFrame with columns: - 'lat': Latitude in degrees North (Arctic-adjusted if applicable) - 'lon': Longitude in degrees East - 'dec_year': Decimal year - 'temp': Temperature (degC) - 'psal': Salinity (PSU) - 'doxy': Dissolved oxygen (umol/kg) - 'pres': Modified pressure for CANYON-B input (dimensionless) References ---------- .. [1] Fourrier, M., Coppola, L., Claustre, H., D'Ortenzio, F., Sauzede, R., and Gattuso, J.-P. (2020). A Regional Neural Network Approach to Estimate Water-Column Nutrient Concentrations and Carbonate System Variables in the Mediterranean Sea: CANYON-MED. Frontiers in Marine Science 7. https://doi.org/10.3389/fmars.2020.00620 """ if self._obj.argo.N_POINTS > 1: df = pd.DataFrame( { "lat": self.adjust_arctic_latitude( self._obj["LATITUDE"].values, self._obj["LONGITUDE"].values ), "lon": self._obj["LONGITUDE"], "dec_year": self.decimal_year, "temp": self._obj["TEMP"], "psal": self._obj["PSAL"], "doxy": self._obj["DOXY"], "pres": self._obj["PRES"], } ) else: # Handle single point dataset: df = pd.DataFrame.from_dict( { "lat": self.adjust_arctic_latitude( self._obj["LATITUDE"].values, self._obj["LONGITUDE"].values ), "lon": self._obj["LONGITUDE"].values, "dec_year": self.decimal_year.values, "temp": self._obj["TEMP"].values, "psal": self._obj["PSAL"].values, "doxy": self._obj["DOXY"].values, "pres": self._obj["PRES"].values, }, orient="index", ).T # Modify pressure according to Eq. 3 in 10.3389/fmars.2020.00620 df["pres"] = df["pres"].apply( lambda x: (x / 2e4) + (1 / ((1 + np.exp(-x / 300)) ** 3)) ) return df def create_canyonb_input_matrix(self) -> np.ndarray: """ Create input matrix for CANYON-B neural network predictions. Converts the xarray Dataset into a numpy array formatted for CANYON-B neural network input. Returns ------- np.ndarray Input matrix containing columns: - Decimal year - Normalized latitude - Transformed longitude (see eq. (1) in [1]_) - Temperature (degC) - Practical salinity (PSU) - Dissolved oxygen (umol/kg) - Transformed pressure (dimensionless) References ---------- .. [1] Bittig, H. C., Steinhoff, T., Claustre, H., Fiedler, B., Williams, N. L., Sauzede, R., Kortzinger, A., and Gattuso, J. P. (2018). An alternative to static climatologies: Robust estimation of open ocean CO2 variables and nutrient concentrations from T, S, and O2 data using Bayesian neural networks. Frontiers in Marine Science, 5, 328. https://doi.org/10.3389/fmars.2018.00328 """ df = self.ds2df() # Create input matrix data = np.column_stack( [ df["dec_year"], df["lat"] / 90, np.abs(1 - np.mod(df["lon"] - 110, 360) / 180), np.abs(1 - np.mod(df["lon"] - 20, 360) / 180), df["temp"], df["psal"], df["doxy"], df["pres"], ] ) return data def adjust_arctic_latitude(self, lat: np.ndarray, lon: np.ndarray) -> np.ndarray: """ Adjust latitude for Arctic basin calculations. This methods adjusts the latitude of all points inside the Arctic, west of the Lomonosov ridge. This adjustment improves the predictions in the subpolar North Pacific by artificially increasing the "length" of the Bering Strait (see [1]_ for details). Parameters ---------- lat : np.ndarray Latitudes in degrees North lon : np.ndarray Longitudes in degrees East Returns ------- np.ndarray Adjusted latitudes in degrees North References ---------- .. [1] Bittig, H. C., Steinhoff, T., Claustre, H., Fiedler, B., Williams, N. L., Sauzede, R., Kortzinger, A., and Gattuso, J. P. (2018). An alternative to static climatologies: Robust estimation of open ocean CO2 variables and nutrient concentrations from T, S, and O2 data using Bayesian neural networks. Frontiers in Marine Science, 5, 328. https://doi.org/10.3389/fmars.2018.00328 """ # Points for Arctic basin 'West' of Lomonosov ridge plon = np.array( [-180, -170, -85, -80, -37, -37, 143, 143, 180, 180, -180, -180] ) plat = np.array([68, 66.5, 66.5, 80, 80, 90, 90, 68, 68, 90, 90, 68]) # Create polygon array polygon = np.column_stack((plon, plat)) # Flatten coordinates and create points array points = np.column_stack((lon.flatten(), lat.flatten())) # Get mask mask = np.array([point_in_polygon(p, polygon) for p in points]) mask = mask.reshape(lat.shape) # Adjust latitude adjusted_lat = lat.copy() adjusted_lat[mask] = ( lat[mask] - np.sin(np.radians(lon[mask] + 37)) * (90 - lat[mask]) * 0.5 ) return adjusted_lat def load_weights(self, param: str) -> pd.DataFrame: """ Load CANYON-B neural network weights for a specific parameter. Parameters ---------- param : str Parameter name. Valid options are: - 'NO3': Nitrate - 'PO4': Phosphate - 'SiOH4': Silicate - 'AT': Total alkalinity - 'DIC': Dissolved inorganic carbon - 'pHT': Total pH - 'pCO2': Partial pressure of CO2 Returns ------- pd.DataFrame DataFrame containing the neural network weights for the specified parameter. """ if param in ["AT", "pCO2", "NO3", "PO4", "SiOH4"]: weights = pd.read_csv( self.path2coef.joinpath(f"wgts_{param}.txt"), header=None, sep="\t" ) elif param == "DIC": weights = pd.read_csv( self.path2coef.joinpath("wgts_CT.txt"), header=None, sep="\t" ) else: weights = pd.read_csv( self.path2coef.joinpath("wgts_pH.txt"), header=None, sep="\t" ) return weights @staticmethod @jit(nopython=True, parallel=True, cache=True, fastmath=True) def _nn_forward_1layer(data_N, w1, b1, w2, b2): """Forward pass for 1-layer neural network (numba optimized with parallelization)""" nol = data_N.shape[0] # Number of data points ni = data_N.shape[1] # Number of inputs to the neural network nl1 = w1.shape[0] # Number of neurons in the hidden layer # Forward pass a = np.zeros((nol, nl1)) for i in prange( nol ): # Parallel over data points (needs to be an explicit loop for numba) for j in range(nl1): tmp = b1[j] for k in range(ni): tmp += data_N[i, k] * w1[j, k] a[i, j] = np.tanh(tmp) y = ( a @ w2.T + b2 ) # @ is matrix multiplication operator in numpy and numba optimizes it well # Calculate input effects in parallel inx = np.zeros((nol, ni)) for i in prange(nol): # Parallel loop tanh_a = a[i, :] dtanh = 1 - tanh_a * tanh_a for k in range(ni): tmp = 0.0 for j in range(nl1): tmp += w2[0, j] * w1[j, k] * dtanh[j] inx[i, k] = tmp return y.flatten(), inx @staticmethod @jit(nopython=True, parallel=True, cache=True, fastmath=True) def _nn_forward_2layer(data_N, w1, b1, w2, b2, w3, b3): """Forward pass for 2-layer neural network (numba optimized with parallelization)""" nol = data_N.shape[0] # Number of data points ni = data_N.shape[1] # Number of inputs (neural network) nl1 = w1.shape[0] # Number of neurons in the first hidden layer nl2 = w2.shape[0] # Number of neurons in the second hidden layer # First layer a = np.zeros((nol, nl1)) for i in prange(nol): for j in range(nl1): tmp = b1[j] for k in range(ni): tmp += data_N[i, k] * w1[j, k] a[i, j] = np.tanh(tmp) # Second layer b_layer = np.zeros((nol, nl2)) for i in prange(nol): for j in range(nl2): tmp = b2[j] for k in range(nl1): tmp += a[i, k] * w2[j, k] b_layer[i, j] = np.tanh(tmp) # Output layer y = b_layer @ w3.T + b3 # Calculate input effects in parallel inx = np.zeros((nol, ni)) for i in prange(nol): dtanh_a = 1 - a[i, :] * a[i, :] dtanh_b = 1 - b_layer[i, :] * b_layer[i, :] for m in range(ni): tmp = 0.0 for j in range(nl2): for k in range(nl1): tmp += w3[0, j] * dtanh_b[j] * w2[j, k] * dtanh_a[k] * w1[k, m] inx[i, m] = tmp return y.flatten(), inx def _predict( self, param: str, epres: Optional[float] = None, etemp: Optional[float] = None, epsal: Optional[float] = None, edoxy: Optional[Union[float, np.ndarray]] = None, data: Optional[np.ndarray] = None, ) -> dict: """ Predict a single biogeochemical parameter using CANYON-B neural networks. This private method implements the CANYON-B Bayesian neural network ensemble to estimate oceanic parameters from hydrographic data. It processes input data through multiple neural networks and combines their predictions with uncertainty estimates. Parameters ---------- param : str Parameter to predict. Must be one of: - 'AT': Total alkalinity (umol/kg) - 'DIC': Dissolved inorganic carbon (umol/kg) - 'pHT': Total pH - 'pCO2': Partial pressure of CO2 (uatm) - 'NO3': Nitrate concentration (umol/kg) - 'PO4': Phosphate concentration (umol/kg) - 'SiOH4': Silicate concentration (umol/kg) epres : float, optional Pressure measurement uncertainty in dbar (default: 0.5 dbar) etemp : float, optional Temperature measurement uncertainty in degC (default: 0.005 degC) epsal : float, optional Salinity measurement uncertainty (PSU, default: 0.005) edoxy : float or np.ndarray, optional Oxygen measurement uncertainty in umol/kg. If not provided, defaults to 1% of measured oxygen values. Can be a scalar applied to all points or an array matching data dimensions. data : np.ndarray, optional Precomputed input matrix from create_canyonb_input_matrix(). If not provided, it will be computed. Returns ------- dict Dictionary containing the predicted parameter and associated uncertainties: - param: Predicted parameter value - param_ci: Predicted parameter value uncertainty - param_cim: Parameter measurement uncertainty - param_cin: Parameter uncertainty for Bayesian neural network mapping - param_cii: Parameter uncertainty due to input errors - param_inx: Input effects on parameter """ # Get array shape and number of elements shape = self._obj[self._argo._TNAME].shape nol = self._argo.N_POINTS # Setting up default uncertainties if not provided epres = 0.5 if epres is None else epres etemp = 0.005 if etemp is None else etemp epsal = 0.005 if epsal is None else epsal edoxy = 0.01 * self._obj.DOXY.values if edoxy is None else edoxy # Expand scalar error values errors = [epres, etemp, epsal, edoxy] errors = [ np.full(nol, e) if np.isscalar(e) else np.asarray(e).flatten() for e in errors ] epres, etemp, epsal, edoxy = errors # Define parameters and their properties # ! ORDER MATTERS HERE ! # Order of parameters follows https://github.com/RaphaelBajon/canyonbpy/blob/main/canyonbpy/core.py paramnames = ["AT", "DIC", "pHT", "pCO2", "NO3", "PO4", "SiOH4"] i_idx = {p: i for i, p in enumerate(paramnames)} inputsigma = np.array([6, 4, 0.005, np.nan, 0.02, 0.02, 0.02]) betaipCO2 = np.array([-3.114e-05, 1.087e-01, -7.899e01]) # Adjust pH uncertainty (Orr systematic uncertainty) inputsigma[2] = np.sqrt(0.005**2 + 0.01**2) # Prepare input data if data is None: data = self.create_canyonb_input_matrix() # Output dictionary out = {} # # # Process through neural networks for the given parameter # # # # Get index depending on parameter (mostly important to decipher between nutrients and carbonate systems) i = i_idx[param] # Load weights and convert them to numpy array inwgts = self.load_weights(param) inwgts = inwgts.to_numpy() # Number of networks in committee noparsets = inwgts.shape[1] - 1 # Determine input normalization based on parameter type if i > 3: # nutrients ni = data[:, 1:].shape[1] # Number of inputs (excluding year) ioffset = -1 mw = inwgts[: ni + 1, -1] sw = inwgts[ni + 1 : 2 * ni + 2, -1] data_N = (data[:, 1:] - mw[:ni]) / sw[:ni] else: # carbonate system ni = data.shape[1] # Number of inputs ioffset = 0 mw = inwgts[: ni + 1, -1] sw = inwgts[ni + 1 : 2 * ni + 2, -1] data_N = (data - mw[:ni]) / sw[:ni] # Extract weights and prepare arrays wgts = inwgts[3, :noparsets] betaciw = inwgts[2 * ni + 2 :, -1] betaciw = betaciw[~np.isnan(betaciw)] # Preallocate arrays cval = np.full((nol, noparsets), np.nan) cvalcy = np.full(noparsets, np.nan) inval = np.full((nol, ni, noparsets), np.nan) # Process each network in committee for network in range(noparsets): nlayerflag = 1 + bool(inwgts[1, network]) nl1 = int(inwgts[0, network]) nl2 = int(inwgts[1, network]) beta = inwgts[2, network] # Weights and biases for the first layer idx = 4 w1 = inwgts[idx : idx + nl1 * ni, network].reshape( nl1, ni, order="F" ) # Here, order='F' is needed to make sure to proper do the calculation as in the Matlab version (https://github.com/HCBScienceProducts/CANYON-B/blob/master/CANYONB.m) ! idx += nl1 * ni b1 = inwgts[idx : idx + nl1, network] idx += nl1 # Weights and biases for the second layer w2 = inwgts[idx : idx + nl2 * nl1, network].reshape(nl2, nl1, order="F") idx += nl2 * nl1 b2 = inwgts[idx : idx + nl2, network] if nlayerflag == 2: # Weights and biases for the third layer (if it exists) idx += nl2 w3 = inwgts[idx : idx + nl2, network].reshape(1, nl2, order="F") idx += nl2 b3 = inwgts[idx : idx + 1, network] # Forward pass using numba-optimized functions if nlayerflag == 1: # One hidden layer y, inx = self._nn_forward_1layer(data_N, w1, b1, w2, b2) else: # Two hidden layers y, inx = self._nn_forward_2layer(data_N, w1, b1, w2, b2, w3, b3) # Store results cval[:, network] = y cvalcy[network] = 1 / beta # 'noise' variance inval[:, :, network] = inx # Denormalization cval = cval * sw[ni] + mw[ni] cvalcy = cvalcy * sw[ni] ** 2 # Calculate committee statistics V1 = np.sum(wgts) V2 = np.sum(wgts**2) pred = np.sum(wgts[None, :] * cval, axis=1) / V1 # Calculate uncertainties cvalcu = np.sum(wgts[None, :] * (cval - pred[:, None]) ** 2, axis=1) / ( V1 - V2 / V1 ) cvalcib = np.sum(wgts * cvalcy) / V1 cvalciw = np.polyval(betaciw, np.sqrt(cvalcu)) ** 2 # Calculate input effects inx = np.sum(wgts[None, None, :] * inval, axis=2) / V1 inx = np.tile((sw[ni] / sw[0:ni].T), (nol, 1)) * inx # Pressure scaling pres_original = self._obj["PRES"].values.flatten() ddp = ( 1 / 2e4 + 1 / ((1 + np.exp(-pres_original / 300)) ** 4) * np.exp(-pres_original / 300) / 100 ) inx[:, 7 + ioffset] *= ddp # Calculate input variance error_matrix = np.column_stack([etemp, epsal, edoxy, epres]) cvalcin = np.sum( inx[:, 4 + ioffset : 8 + ioffset] ** 2 * error_matrix**2, axis=1 ) # Calculate measurement uncertainty if i > 3: cvalcimeas = (inputsigma[i] * pred) ** 2 elif i == 3: cvalcimeas = np.polyval(betaipCO2, pred) ** 2 else: cvalcimeas = inputsigma[i] ** 2 # Calculate total uncertainty uncertainty = np.sqrt(cvalcimeas + cvalcib + cvalciw + cvalcu + cvalcin) # Create numpy arrays out[param] = np.reshape(pred, shape) out[f"{param}_ci"] = np.reshape(uncertainty, shape) out[f"{param}_cim"] = np.sqrt(cvalcimeas) out[f"{param}_cin"] = np.reshape(np.sqrt(cvalcib + cvalciw + cvalcu), shape) out[f"{param}_cii"] = np.reshape(np.sqrt(cvalcin), shape) out[f"{param}_inx"] = inx[:, 4 : 8 + ioffset] # Input effects # pCO2 if i == 3: # ipCO2 = 'DIC' / umol kg-1 -> pCO2 / uatm outcalc = pyco2.sys( par1=2300, par2=out[param], par1_type=1, par2_type=2, salinity=35.0, temperature=25.0, temperature_out=np.nan, pressure_out=0.0, pressure_atmosphere_out=np.nan, total_silicate=0.0, total_phosphate=0.0, opt_pH_scale=1.0, opt_k_carbonic=10.0, opt_k_bisulfate=1.0, grads_of=["pCO2"], grads_wrt=["par2"], ) out[f"{paramnames[i]}"] = outcalc["pCO2"] # epCO2 = dpCO2/dDIC * e'DIC' for unc in ["_ci", "_cin", "_cii"]: out[f"{param}{unc}"] = outcalc["d_pCO2__d_par2"] * out[f"{param}{unc}"] out[f"{param}_cim"] = outcalc["d_pCO2__d_par2"] * np.reshape( out[f"{param}_cim"], shape ) out[f"{param}_inx"] = ( outcalc["d_pCO2__d_par2"][:, None] * out[f"{param}_inx"] ) return out
[docs] def predict( self, params: Union[str, List[str]] = None, epres: Optional[float] = None, etemp: Optional[float] = None, epsal: Optional[float] = None, edoxy: Optional[Union[float, np.ndarray]] = None, include_uncertainties: Optional[bool] = False, n_jobs: Optional[int] = -1, ) -> xr.Dataset: """ Make predictions using the CANYON-B method. This method implements the CANYON-B Bayesian neural network ensemble to estimate oceanic parameters from hydrographic data. Parameters ---------- params : str, list of str, or None, optional Parameter(s) to predict. Valid options: - 'AT': Total alkalinity (umol/kg) - 'DIC': Dissolved inorganic carbon (umol/kg) - 'pHT': Total pH - 'pCO2': Partial pressure of CO2 (uatm) - 'NO3': Nitrate concentration (umol/kg) - 'PO4': Phosphate concentration (umol/kg) - 'SiOH4': Silicate concentration (umol/kg) If None (default), all seven parameters are predicted. epres : float, optional Pressure measurement uncertainty in dbar (default: 0.5 dbar) etemp : float, optional Temperature measurement uncertainty in degC (default: 0.005 degC) epsal : float, optional Salinity measurement uncertainty in PSU (default: 0.005) edoxy : float or np.ndarray, optional Oxygen measurement uncertainty in umol/kg. If not provided, defaults to 1% of measured oxygen values. Can be a scalar applied to all points or an array matching data dimensions. include_uncertainties : bool, optional If True, include uncertainty estimates for each predicted parameter n_jobs : int, optional Number of parallel jobs used for prediction (only used when there is more than one parameter to predict). Default is -1 (use all available CPUs). This option is directly passed to :class:`joblib.Parallel`. Returns ------- :class:`xr.Dataset` Input dataset augmented with predicted parameters. """ # Validation of requested parameters to predict: params_list = ["NO3", "PO4", "SiOH4", "AT", "DIC", "pHT", "pCO2"] if params is None: params = params_list else: params = to_list(params) for p in params: if p not in params_list: raise ValueError( "Invalid parameter ('%s') to predict, must be in [%s]" % (p, ",".join(params_list)) ) # Compute input matrix once for all parameters (optimization) data = self.create_canyonb_input_matrix() # Helper function to process a single parameter def process_param(param): """Process a single parameter prediction""" out = self._predict( param, epres=epres, etemp=etemp, epsal=epsal, edoxy=edoxy, data=data ) return param, out # Make predictions of each of the requested parameters if len(params) > 1: # Parallel execution with Parallel(n_jobs=n_jobs, prefer=None) as parallel: results = parallel(delayed(process_param)(param) for param in params) # Convert results list to dict for processing results_dict = {param: out for param, out in results} else: # Sequential execution results_dict = {param: process_param(param)[1] for param in params} # Add results to dataset for param in params: out = results_dict[param] # Add predicted parameter to xr.Dataset self._obj[param] = xr.zeros_like(self._obj["TEMP"]) self._obj[param].attrs = self.get_param_attrs(param) values = out[param].astype(np.float32).squeeze() self._obj[param].values = np.atleast_1d(values) # Add uncertainties if requested if include_uncertainties: # Uncertainty on parameter (ci) self._obj[f"{param}_ci"] = xr.zeros_like(self._obj[param]) self._obj[f"{param}_ci"].attrs = self.get_param_attrs(param) self._obj[f"{param}_ci"].attrs[ "long_name" ] = f"Uncertainty on {self.get_param_attrs(param)['long_name']}" values = out[f"{param}_ci"].astype(np.float32).squeeze() self._obj[f"{param}_ci"].values = np.atleast_1d(values) # Measurement uncertainty (cim) cim_value = out[f"{param}_cim"] if np.isscalar(cim_value): self._obj[f"{param}_cim"] = xr.full_like( self._obj[param], fill_value=float(cim_value), dtype=np.float32 ) else: self._obj[f"{param}_cim"] = xr.zeros_like(self._obj[param]) values = cim_value.astype(np.float32).squeeze() self._obj[f"{param}_cim"].values = np.atleast_1d(values) self._obj[f"{param}_cim"].attrs = self.get_param_attrs(param) self._obj[f"{param}_cim"].attrs[ "long_name" ] = f"Measurement uncertainty on {self.get_param_attrs(param)['long_name']}" # Uncertainty for Bayesian neural network mapping (cin) self._obj[f"{param}_cin"] = xr.zeros_like(self._obj[param]) self._obj[f"{param}_cin"].attrs = self.get_param_attrs(param) self._obj[f"{param}_cin"].attrs[ "long_name" ] = f"Uncertainty for Bayesian neural network mapping on {self.get_param_attrs(param)['long_name']}" values = out[f"{param}_cin"].astype(np.float32).squeeze() self._obj[f"{param}_cin"].values = np.atleast_1d(values) # Uncertainty due to input errors (cii) self._obj[f"{param}_cii"] = xr.zeros_like(self._obj[param]) self._obj[f"{param}_cii"].attrs = self.get_param_attrs(param) self._obj[f"{param}_cii"].attrs[ "long_name" ] = f"Uncertainty due to input errors on {self.get_param_attrs(param)['long_name']}" values = out[f"{param}_cii"].astype(np.float32).squeeze() self._obj[f"{param}_cii"].values = np.atleast_1d(values) # Return xr.Dataset with predicted variables: if self._argo: self._argo.add_history( "Added CANYON-B predictions for [%s]" % (",".join(params)) ) return self._obj