Source code for argopy.reference.mapping

import warnings
import pandas as pd
from typing import Any
from copy import deepcopy
import numpy as np

from argopy.options import OPTIONS
from argopy.stores.nvs import NVS
from argopy.stores.nvs.utils import bindings2df, id2urn, url2predicate
from argopy.stores.nvs.utils import known_mappings 
from argopy.utils.format import ppliststr, urnparser
from argopy.utils.locals import Asset


def id2concept(x):
    """Return the 'termid' field of an NVS concept identifier

    The identifier ``x`` is first converted with :func:`id2urn`, the result is
    parsed with :func:`urnparser` and the ``"termid"`` entry is returned.
    """
    return urnparser(id2urn(x))["termid"]


def predicate(x):
    """Return a predicate name without its namespace prefix

    ``x`` is converted with :func:`url2predicate` and only the part following
    the last ``":"`` is kept.
    """
    # Remove NVS jargon ('skos:', or 'owl:')
    return url2predicate(x).split(":")[-1]


[docs]
class ArgoReferenceMapping:
    """A class to work with Argo Reference Value Relationships, i.e. a NVS "mapping"

    More details from the AVTT documentation:
    https://github.com/OneArgo/ArgoVocabs?tab=readme-ov-file#ivb-mappings

    > Mappings are used to inform relationship between concepts. For instance, inform all the sensor_models manufactured by one sensor_maker, or all the platform_types manufactures by one platform_maker, etc.
    > They are used by the FileChecker to ensure the consistency between these metadata fields in the Argo dataset.

    Parameters
    ----------
    sub: str
        Subject reference table, given as a reference table identifier or as the
        Argo parameter name it documents.
    obj: str
        Object reference table, given as a reference table identifier or as the
        Argo parameter name it documents.
    nvs: optional
        Passed to the :class:`NVS` store used to load the mapping. Defaults to ``OPTIONS["nvs"]``.

    Raises
    ------
    ValueError
        If ``sub`` or ``obj`` is neither a known reference table identifier nor a known parameter name.

    Warns
    -----
    UserWarning
        If the (subject, object) pair of identifiers is not listed by ``known_mappings()``.

    Examples
    --------
    .. code-block:: python
        :caption: Creation

        from argopy import ArgoReferenceMapping

        # Use two Argo parameter names, documented by one of the Argo reference tables:
        ArgoReferenceMapping('PLATFORM_MAKER', 'PLATFORM_TYPE')

        # or reference table identifiers:
        ArgoReferenceMapping('R24', 'R23')

    .. code-block:: python
        :caption: Indexing and values

        from argopy import ArgoReferenceMapping
        arm = ArgoReferenceMapping('R24', 'R23')

        # Relationships within this reference mapping:
        len(arm)     # Number of relationships
        arm.subjects   # Ordered list of unique 'subject' reference values names
        arm.objects    # Ordered list of unique 'object' reference values names
        arm.predicates # Ordered list of unique 'predicate', aka relationships, in this mapping

        # Check if a reference value is in this mapping as a subject or an object:
        'SBE' in arm  # Return True

        # Indexing is by subject values:
        arm['SBE']  # Return a dict with predicate as keys and objects as values

        # Iterate over all relationships:
        for relation in arm:
            print(relation['subject'], relation['predicate'])

    .. code-block:: python
        :caption: Export method

        from argopy import ArgoReferenceMapping
        arm = ArgoReferenceMapping('R24', 'R23')

        # Export all mapping relationships in a DataFrame:
        arm.to_dataframe()

        # To export mapping using AVTT jargon:
        arm.to_dataframe(raw=True)

    """

    # Instances only carry the attributes listed here (no per-instance __dict__)
    __slots__ = (
        "_subjects",
        "_objects",
        "_predicates",
        "_nvs_store",
        "_d",
        "_Vocabulary2Parameter",
        "sub_id",
        "sub_parameter",
        "obj_id",
        "obj_parameter",
        "nvs",
    )


[docs]
    def __init__(self, sub: str, obj: str, **kwargs):
        """Create a mapping between a subject and an object Argo reference table

        Parameters
        ----------
        sub: str
            Subject reference table, given either as an identifier (e.g. 'R24')
            or as the Argo parameter name it documents (e.g. 'PLATFORM_MAKER').
        obj: str
            Object reference table, given either as an identifier (e.g. 'R23')
            or as the Argo parameter name it documents (e.g. 'PLATFORM_TYPE').
        **kwargs
            Only ``nvs`` is read: it is handed over to the :class:`NVS` store and
            defaults to ``OPTIONS["nvs"]``.

        Raises
        ------
        ValueError
            If ``sub`` or ``obj`` is neither a known reference table identifier
            nor a known parameter name.

        Warns
        -----
        UserWarning
            If the (subject, object) pair is not listed by ``known_mappings()``.
        """
        # Get an NVS store to retrieve data:
        self._nvs_store: NVS = NVS(nvs=kwargs.get("nvs", OPTIONS["nvs"]))

        # Validate subject and object:
        self._Vocabulary2Parameter: dict[str, str] = Asset.load("vocabulary:mapping")[
            "data"
        ]["Vocabulary2Parameter"]

        self.sub_id, self.sub_parameter = self._resolve_table(
            sub, "subject", self._Vocabulary2Parameter
        )
        self.obj_id, self.obj_parameter = self._resolve_table(
            obj, "object", self._Vocabulary2Parameter
        )

        # An unknown pair is only reported, loading is still attempted below
        mappings = known_mappings()
        if (self.sub_id, self.obj_id) not in mappings:
            warnings.warn(
                f"This mapping '{(self.sub_id, self.obj_id)}' is not known to the AVTT ! Known mappings are {mappings}",
                stacklevel=2,  # Point at the caller creating the instance
            )

        # Retrieve NVS raw data
        # We use a deepcopy because we will modify the nvs raw data with complementary data
        self.nvs: dict[str, Any] = deepcopy(
            self._nvs_store.load_mapping(self.sub_id, self.obj_id)
        )

        # Internal placeholders:
        self._subjects: list[str] | None = None
        self._objects: list[str] | None = None
        self._predicates: list[str] | None = None
        # Cache of __getitem__ results: {subject: {predicate: [objects]}}
        self._d: dict[str, dict[str, list[str]]] = {}

    @staticmethod
    def _resolve_table(name: str, role: str, vocabulary: dict[str, str]) -> tuple[str, str]:
        """Return the ``(identifier, parameter)`` pair of a reference table

        ``name`` is looked up among the keys (identifiers) of ``vocabulary`` first,
        then among its values (parameter names). ``role`` ('subject' or 'object')
        is only used to word the error message.
        """
        if name in vocabulary:
            return name, vocabulary[name]
        for table_id, parameter in vocabulary.items():
            if parameter == name:
                return table_id, name
        raise ValueError(
            f"Unknown {role} Reference Table '{name}'. Possible values are: \nIDs like: {ppliststr(list(vocabulary), last='or')}\nNames like: {ppliststr(list(vocabulary.values()), last='or')}"
        )


    def __repr__(self):
        summary = [
            f"<argo.reference.mapping> subject('{self.sub_id}'/'{self.sub_parameter}') vs object('{self.obj_id}'/'{self.obj_parameter}')"
        ]
        summary.append(f"{len(self)} relationships in this mapping")
        return "\n".join(summary)

    @property
    def subjects(self):
        if self._subjects is None:
            self._subjects = np.unique([
                id2concept(binding["subj"]["value"])
                for binding in self.nvs["results"]["bindings"]
            ]).tolist()
            self._subjects.sort()
        return self._subjects

    @property
    def objects(self):
        if self._objects is None:
            self._objects = np.unique([
                id2concept(binding["obj"]["value"])
                for binding in self.nvs["results"]["bindings"]
            ]).tolist()
            self._objects.sort()
        return self._objects

    @property
    def predicates(self):
        if self._predicates is None:
            self._predicates = np.unique([
                predicate(binding["pred"]["value"])
                for binding in self.nvs["results"]["bindings"]
            ]).tolist()
            self._predicates.sort()
        return self._predicates

    def __len__(self):
        return len(self.nvs["results"]["bindings"])

    def __iter__(self):
        for sub in self.subjects:
            results = {'subject': sub, 'predicate':self[sub]}
            yield results

    def __contains__(self, item):
        return item in self.subjects or item in self.objects

    def __getitem__(self, key: str):
        ref_value: str | None = None
        if key in self.subjects:
            ref_value = key
        if ref_value is not None:
            if self._d.get(ref_value, None) is None:
                data = [
                        b
                        for b in self.nvs["results"]["bindings"]
                        if id2concept(b["subj"]["value"]) == key
                    ]
                results = {}
                for b in data:
                    subj, pred, obj = id2concept(b["subj"]["value"]), predicate(b["pred"]["value"]), id2concept(
                        b["obj"]["value"])
                    if pred in results:
                        results[pred].append(obj)
                    else:
                        results[pred] = [obj]
                for v in results.values():
                    v.sort()
                self._d[ref_value] = results
            return self._d[ref_value]
        raise ValueError(f"Invalid subject mapping value '{key}'")


[docs]
    def to_dataframe(self, raw:bool = False) -> pd.DataFrame:
        """Return mapping as a :class:`pd.DataFrame`"""
        df = None
        if len(self.nvs["results"]["bindings"]) > 0:
            df = bindings2df(self.nvs["results"]["bindings"])
            if raw:
                df = df.drop(['subject', 'object'], axis=1)
                df = df.rename({'subject_uri': 'subject', 'object_uri': 'object'}, axis=1)
                return df[['subject', 'predicate', 'object']]
            else:
                df = df.drop(['subject_uri', 'object_uri'], axis=1)
                df['predicate'] = df['predicate'].map(lambda x: x.split(":")[-1]) # Remove NVS jargon ('skos:', or 'owl:')
        return df