# Source code for argopy.related.doi_snapshot

import pandas as pd
import numpy as np
import warnings
from typing import Union

# from matplotlib.colors import to_hex
# from IPython.display import IFrame

from ..stores import httpstore



# [docs]
class DOIrecord:
    """Meta-data holder for an Argo GDAC snapshot DOI record

    This is a low-level class that is not intended to be instantiated directly.

    Please use the :class:`ArgoDOI` instead.

    Examples
    --------
    .. code-block:: python
        :caption: API description

        d = DOIrecord()
        d = DOIrecord('42182')
        d = DOIrecord('42182#103075')
        d = DOIrecord(hashtag='103075')
        d = DOIrecord(hashtag='103088')

        d.doi
        d.hashtag
        d.dx
        d.uri
        d.isvalid()
        d.date
        d.network
        d.data
        d.file

    """

    # Not used by the active code of this class; kept so existing attribute access still works
    root = ""

    def __init__(
        self,
        doi: str = "10.17882/42182",
        hashtag: Union[str, int, None] = None,
        fs: "httpstore" = None,
        autoload: bool = True,
        api_root: str = "https://www.seanoe.org/api/",
    ):
        """Create a DOI record

        Parameters
        ----------
        doi: str, default: "10.17882/42182"
            DOI, possibly followed by ``#<hashtag>``. When a ``#`` is found, the hashtag
            taken from this string overrides the ``hashtag`` argument.
        hashtag: str or int, optional
            Hashtag (fragment ID) of a specific snapshot
        fs: httpstore, optional
            File system used to call the API. One is created by :meth:`load` if not provided.
        autoload: bool, default: True
            Should the record meta-data be fetched from the API at instantiation
        api_root: str
            Root url of the API to call
        """
        self.api_root = api_root
        self._fs = fs  # A httpstore will be created if necessary if self.load() is called
        self._data = None

        self._doi = doi
        self._hashtag = hashtag
        if "#" in doi:
            pieces = doi.split("#")
            self._doi = pieces[0]
            self._hashtag = pieces[-1]

        if autoload:
            self.load()

    @staticmethod
    def _network_from_title(title: str) -> str:
        """Return the network label ('BGC' or 'core+BGC+deep') matching a title"""
        return "BGC" if "BGC" in title else "core+BGC+deep"

    @property
    def doi(self) -> str:
        """DOI component (without hashtag)"""
        return self._doi

    @property
    def hashtag(self) -> Union[str, int, None]:
        """Hashtag of the full doi, None if the record has no hashtag"""
        return self._hashtag

    @property
    def dx(self) -> str:
        """DOI url"""
        return "https://dx.doi.org/%s" % str(self)

    def isvalid(self) -> bool:
        """Check that the DOI component contains the '42182' identifier

        This is a regular method, to be called as ``d.isvalid()``.
        """
        return "42182" in self.doi

    @property
    def data(self) -> dict:
        """DOI record meta-data holder

        Trigger data (down)load if not available
        """
        if self._data is None:
            self.load()
        return self._data

    @property
    def date(self) -> pd.Timestamp:
        """Date associated with the DOI record"""
        return self.data["date"]

    @property
    def network(self) -> str:
        """Network of the Argo data pointed by the DOI

        Returns
        -------
        str: 'core+BGC+deep' or 'BGC'
        """
        return self._network_from_title(self.data["title"])

    @property
    def file(self) -> list:
        """Return a pretty list of files properties associated with this DOI

        Each item is a dictionary with keys: 'openAccess', 'path', 'update', 'date',
        'size' and 'network'. The 'path' is None for files that are not open access.
        """
        results = []
        for f in self.data["files"]:
            is_open = bool(f["openAccess"])
            results.append(
                {
                    "openAccess": is_open,
                    # The file url is only read for open access files
                    "path": f["fileUrl"] if is_open else None,
                    "update": pd.to_datetime(f["lastUpdateDate"]),
                    "date": pd.to_datetime(f["fragment"]["date"]),
                    "size": f["size"],
                    "network": self._network_from_title(f["fragment"]["title"]),
                }
            )
        return results

    @property
    def uri(self) -> str:
        """url to API call to retrieve DOI data"""
        if self.hashtag is None:
            endpoint = "find-by-id/{id}"
        else:
            endpoint = "find-by-fragment/{id}?fragmentId={hashtag}"
        # Only the last component of the DOI (after the last '/') is sent to the API
        return self.api_root + endpoint.format(
            id=self.doi.split("/")[-1], hashtag=self.hashtag
        )

    def __str__(self):
        txt = "%s" % (self.doi)
        if self.hashtag is not None:
            txt = "%s#%s" % (txt, self._hashtag)
        return txt

    def _process_data(self, data: dict) -> dict:
        """Synthetic dict from data return by API

        Files are sorted by fragment date, most recent first. The input dictionary
        is left untouched: sorting is done on a copy of the files list.
        """

        def fragment_date(item):
            # Files without a fragment date are sorted last (empty string is the smallest key)
            return (item.get("fragment") or {}).get("date") or ""

        files = sorted(data["files"], key=fragment_date, reverse=True)

        return {
            "title": data["title"]["en"],
            "date": pd.to_datetime(data["date"]),
            "authors": data["authors"],
            "files": files,
            "Nfiles": len(files),
        }

    def load(self, cache: bool = False):
        """Load DOI record data from API call

        Nothing is done if data are already available.

        Parameters
        ----------
        cache: bool, default: False
            Cache option of the httpstore created when no file system was provided
            at instantiation

        Returns
        -------
        self
        """
        if self._data is None:
            if self._fs is None:
                self._fs = httpstore(cache=cache)

            data = self._fs.open_json(self.uri)
            self._data = self._process_data(data)

        return self

    def from_dict(self, d: dict):
        """Load DOI record data from a dictionary

        The dictionary is silently ignored if it does not have all of the 'title'
        (with an 'en' entry), 'date', 'authors' and 'files' keys.

        Returns
        -------
        self
        """
        has_title = "title" in d and "en" in d["title"]
        if has_title and all(key in d for key in ("date", "authors", "files")):
            self._data = self._process_data(d)
        return self

    def search(self, **kwargs):
        """Not available for a single DOI record

        Raises
        ------
        ValueError
            Always
        """
        raise ValueError(
            "DOIrecord does not support search, use ArgoDOI.search() instead"
        )

    def _repr_file(self, file, with_label=False) -> str:
        """Return a pretty string from a single file dict"""

        def sizeof_fmt(num, suffix="B"):
            # Human readable size, with binary (1024) prefixes
            for unit in ("", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"):
                if abs(num) < 1024.0:
                    return f"{num:3.1f}{unit}{suffix}"
                num /= 1024.0
            return f"{num:.1f}Yi{suffix}"

        summary = []
        if with_label:
            summary.append("%s" % file["label"]["en"])

        # Show the url only if the file can be accessed, otherwise fall back on its name
        if bool(file["openAccess"]):
            summary.append("%s" % file["fileUrl"])
        else:
            summary.append("%s" % file["fileName"])

        attrs = ["%s" % sizeof_fmt(file["size"]), "openAccess=%s" % file["openAccess"]]
        summary.append("(%s)" % (", ".join(attrs)))

        return " ".join(summary)

    def _repr_file_group(self, bgc: bool, limit: int = 10) -> list:
        """Return repr lines of the first ``limit`` files that are (or are not) BGC files"""
        lines = []
        for f in self.data["files"]:
            if len(lines) >= limit:
                break
            if ("BGC" in f["fragment"]["title"]) == bgc:
                lines.append(
                    "     - #%s %s" % (f["id"], self._repr_file(f, with_label=True))
                )
        return lines

    def __repr__(self):
        summary = ["<argopy.DOI.record>", "DOI: %s" % self.__str__()]
        # Do not trigger a download from repr: details are shown only if already loaded
        if self._data is not None:
            summary.append("Title: %s" % self.data["title"])
            summary.append("Date: %s" % self.date.strftime("%Y-%m-%d"))
            summary.append("Network: %s" % self.network)
            summary.append("Link: %s" % self.dx)

            if self.data["Nfiles"] == 1:
                summary.append("File: %s" % self._repr_file(self.data["files"][0]))
            else:
                summary.append("File: %i files in total" % (self.data["Nfiles"]))

                summary.append("Files for core+BGC+deep:")
                summary.extend(self._repr_file_group(bgc=False))

                summary.append("Files for BGC only:")
                summary.extend(self._repr_file_group(bgc=True))

        return "\n".join(summary)


    # @property
    # def html(self) -> str:
    #     fs = 12
    #
    #     def td_msg(bgcolor, txtcolor, txt):
    #         style = "background-color:%s;" % to_hex(bgcolor, keep_alpha=True)
    #         style += "border-width:0px;"
    #         style += "padding: 2px 2px 2px 0px;"
    #         style += "text-align:left;"
    #         style += "color:%s" % to_hex(txtcolor, keep_alpha=True)
    #         return "<td style='%s'>%s</td>" % (style, str(txt))
    #
    #     def td_a(bgcolor, txtcolor, txt, link):
    #         style = "background-color:%s;" % to_hex(bgcolor, keep_alpha=True)
    #         style += "border-width:0px;"
    #         style += "padding: 2px 0px 2px 5px;"
    #         style += "text-align:right;"
    #         style += "color:%s" % to_hex(txtcolor, keep_alpha=True)
    #         return "<td style='%s'><a href='%s'>%s</a></td>" % (style, link, str(txt))
    #
    #     td_empty = "<td style='border-width:0px;padding: 2px 5px 2px 5px;text-align:left'>&nbsp;</td>"
    #
    #     html = []
    #     html.append(
    #         "<table style='border-collapse:collapse;border-spacing:0;font-size:%ipx'>"
    #         % fs
    #     )
    #     html.append("<tbody>")
    #
    #     rows = []
    #
    #     # 1st row:
    #     cols = []
    #     cols.append(td_msg("dimgray", "w", "doi: "))
    #     cols.append(td_msg("green", "w", "%s/" % self.root))
    #     cols.append(td_msg("yellowgreen", "w", self.doi))
    #     if self.hashtag is not None:
    #         cols.append(td_msg("darkorange", "w", "#%s" % self.hashtag))
    #     cols.append(td_a("white", "w", "↗", self.dx))
    #     cols.append(td_empty)
    #     rows.append("<tr>%s</tr>" % "\n".join(cols))
    #
    #     #         # 2nd row (if data have been loaded):
    #     #         if self._data is not None:
    #     #             cols = []
    #     #             cols.append(td_msg('dimgray', 'w', "Title: "))
    #     #             cols.append(td_msg('white', 'w', "%s" % self.data['title']))
    #     #             # cols.append(td_msg('yellowgreen', 'w', self.doi))
    #     #             # if self.hashtag is not None:
    #     #             #     cols.append(td_msg("darkorange", 'w', "#%s" % self.hashtag))
    #     #             # cols.append(td_a("white", 'w', "↗", self.dx))
    #     #             # cols.append(td_empty)
    #     #             rows.append("<tr>%s</tr>" % "\n".join(cols))
    #
    #     #         print(rows)
    #     #         # Fix colspan:
    #     #         Nrows = np.max([len(r.split("<td ")) for r in rows])
    #     #         print(Nrows)
    #     #         rowss = []
    #     #         for r in rows:
    #     #             rowss.append(r.replace("<tr>", "<tr colspan='%i'>" % Nrows))
    #     #         print(rowss)
    #
    #     # Finalize
    #     html.append("\n".join(rows))
    #     html.append("</tbody>")
    #     html.append("</table>")
    #     html = "\n".join(html)
    #     return html

    # def _repr_html_(self):
    #     return self.html



# [docs]
class ArgoDOI:
    """Argo GDAC snapshot DOI access and discovery

    Examples
    --------
    .. code-block:: python
        :caption: Load DOI meta-data

        from argopy import ArgoDOI

        doi = ArgoDOI()  # If you don't know where to start, just load the primary Argo DOI record
        doi = ArgoDOI('95141')  # To point directly to a snapshot ID
        doi = ArgoDOI(hashtag='95141')
        doi = ArgoDOI(fs=httpstore(cache=True))

    .. code-block:: python
        :caption: Searching for a specific DOI snapshot

        # Return doi closest to a given date:
        ArgoDOI().search('2020-02')

        # Return doi closest to a given date for a specific network:
        ArgoDOI().search('2020-02', network='BGC')

    .. code-block:: python
        :caption: Working with DOIs

        doi = ArgoDOI('95141')

        doi.download()  # Trigger download of the DOI file
        doi.file  # Easy to read list of file(s) associated with a DOI record
        doi.dx  # http link toward the DOI snapshot webpage

    """

    def __init__(self, hashtag=None, fs=None, cache=True):
        """Create an ArgoDOI instance and load the corresponding DOI record

        Parameters
        ----------
        hashtag: str or int, optional
            Hashtag of a snapshot. A full '...42182#<hashtag>' string is also accepted,
            in which case only the part after '42182#' is used.
        fs: httpstore, optional
            File system to use. Anything that is not a httpstore instance is ignored and
            a new httpstore is created.
        cache: bool, default: True
            Cache option of the httpstore created when ``fs`` is not usable
        """
        self._fs = fs if isinstance(fs, httpstore) else httpstore(cache=cache)
        if hashtag is not None:
            # Integer hashtags (like the keys returned by self.dates) are accepted as well
            hashtag = str(hashtag)
            if "42182#" in hashtag:
                hashtag = hashtag.split("42182#")[-1]
        self._doi = DOIrecord(hashtag=hashtag, fs=self._fs, autoload=True)

    @property
    def doi(self) -> str:
        """Full DOI string, including the hashtag when the record has one"""
        return str(self._doi)

    def __repr__(self):
        # Same as the underlying record repr, with another header line
        summary = self._doi.__repr__().split("\n")
        summary[0] = "<argopy.DOI>"
        return "\n".join(summary)

    def dates(self, network: str = None) -> dict:
        """Mapping of DOI snapshot hashtag(s) to their publication date(s)

        Parameters
        ----------
        network: str, optional
            Allows to specify a network, like 'BGC'. By default, the network of the
            loaded DOI record is used.

        Returns
        -------
        dict
            Dictionary where keys are DOI hashtag and values are publication dates as :class:`pandas.Timestamp`
        """
        network = self._doi.network if network is None else network
        # Any value other than 'BGC' selects files without 'BGC' in their fragment title
        want_bgc = network == "BGC"
        return {
            int(f["id"]): pd.to_datetime(f["fragment"]["date"])
            for f in self._doi.data["files"]
            if ("BGC" in f["fragment"]["title"]) == want_bgc
        }

    @staticmethod
    def _closest(dates: dict, target: pd.Timestamp) -> tuple:
        """Return hashtag(s) of the snapshot date closest to target, and the absolute time gap

        All dates are compared in UTC. If several snapshots are equally close, the
        first one in ``dates`` is selected. More than one hashtag is returned only if
        several snapshots share the selected date.
        """
        gaps = {
            tag: abs(target - pd.to_datetime(stamp, utc=True))
            for tag, stamp in dates.items()
        }
        best = min(gaps, key=gaps.get)
        found = [tag for tag in dates if dates[tag] == dates[best]]
        return found, gaps[best]

    def search(
        self, date: Union[str, pd.Timestamp], network: str = None
    ) -> Union["DOIrecord", list]:
        """Search the DOI record the closest to a given date

        A warning is raised if the single snapshot found is more than 30 days away
        (before or after) from the date searched for.

        Parameters
        ----------
        date: str, :class:`pandas.Timestamp`
            Date to search a DOI for
        network: str, optional
            Allows to specify a network, like 'BGC'

        Returns
        -------
        :class:`argopy.related.doi_snapshot.DOIrecord`
            Or a list of them if more than one snapshot have the closest date

        Raises
        ------
        ValueError
            If there is no snapshot to search in
        """
        dates = self.dates(network=network)
        if not dates:
            raise ValueError("No DOI snapshot dates available to search in")

        target = pd.to_datetime(date, utc=True)
        found, gap = self._closest(dates, target)
        results = [DOIrecord(hashtag=tag, fs=self._fs) for tag in found]

        if len(results) == 1:
            # Absolute gap: the closest snapshot can be before or after the target date
            if gap.days > 30:
                warnings.warn(
                    "This snapshot is more than 30 days off your search dates !"
                )
            return results[0]
        return results

    @property
    def file(self) -> list:
        """DOI tar.gz file properties"""
        return self._doi.file

    @property
    def dx(self) -> str:
        """DOI url"""
        return self._doi.dx

    def download(self):
        """Trigger download of a DOI tar.gz file

        This will simply make the web browser to open the DOI file.

        Nothing is opened, and a warning is raised, if the DOI has more than one file,
        no file at all, or if its file is not open access.
        """
        flist = self.file
        if len(flist) > 1:
            warnings.warn("For safety reasons, we don't trigger download of a DOI when it has more than one file. This is probably happening because you did not specified a hashtag to your ArgoDOI instance.")
        elif len(flist) == 0 or flist[0]["path"] is None:
            # The path is None for files that are not open access
            warnings.warn("No open access file to download for this DOI.")
        else:
            import webbrowser
            webbrowser.open_new(flist[0]["path"])