Commit 1bf8d5d6 authored by Lukas Leufen

add docs to skill scores

parent 561070a5
@@ -13,13 +13,13 @@
 import os
 import sys
-sys.path.insert(0, os.path.abspath('../src'))
+sys.path.insert(0, os.path.abspath('../..'))


 # -- Project information -----------------------------------------------------

 project = 'machinelearningtools'
-copyright = '2020, Felix Kleinert, Lukas H Leufen'
-author = 'Felix Kleinert, Lukas H Leufen'
+copyright = '2020, Lukas H Leufen, Felix Kleinert'
+author = 'Lukas H Leufen, Felix Kleinert'

 # The short X.Y version
 version = 'v0.9.0'
@@ -38,7 +38,7 @@ extensions = [
     'sphinx.ext.coverage',
     'sphinx.ext.imgmath',
     'sphinx.ext.ifconfig',
-    'sphinx.ext.viewcode',
+    # 'sphinx.ext.viewcode',
     'sphinx.ext.autosummary',
     'autoapi.extension',
     'sphinx.ext.napoleon',
@@ -46,6 +46,7 @@ extensions = [
     'sphinx.ext.githubpages',
     'recommonmark',
     'sphinx.ext.autosectionlabel',
+    'sphinx_autodoc_typehints', # must be loaded after napoleon
 ]

 # 2020-02-19 Begin
......
"""Collection of stastical methods: Transformation and Skill Scores."""
from scipy import stats from scipy import stats
__author__ = 'Lukas Leufen, Felix Kleinert' __author__ = 'Lukas Leufen, Felix Kleinert'
...@@ -6,7 +8,7 @@ __date__ = '2019-10-23' ...@@ -6,7 +8,7 @@ __date__ = '2019-10-23'
import numpy as np import numpy as np
import xarray as xr import xarray as xr
import pandas as pd import pandas as pd
from typing import Union, Tuple from typing import Union, Tuple, Dict
Data = Union[xr.DataArray, pd.DataFrame] Data = Union[xr.DataArray, pd.DataFrame]
...@@ -112,11 +114,59 @@ def mean_squared_error(a, b): ...@@ -112,11 +114,59 @@ def mean_squared_error(a, b):
class SkillScores: class SkillScores:
r"""
Calculate different kinds of skill scores.
**Skill score on MSE**:
Calculate skill score based on MSE for given forecast, reference and observations.
.. math::
\text{SkillScore} = 1 - \frac{\text{MSE(obs, for)}}{\text{MSE(obs, ref)}}
To run:
.. code-block:: python
skill_scores = SkillScores(None).general_skill_score(data, observation_name, forecast_name, reference_name)
**Competitive skill score**:
Calculate skill scores to highlight differences between forecasts. This skill score is also based on the MSE.
Currently required forecasts are CNN, OLS and persi, as well as the observation obs.
.. code-block:: python
skill_scores_class = SkillScores(internal_data) # must contain columns CNN, OLS, persi and obs.
skill_scores = skill_scores_class.skill_scores(window_lead_time=3)
**Skill score according to Murphy**:
Follow climatological skill score definition of Murphy (1988). External data is data from another time period than
the internal data set on initialisation. In other terms, this should be the train and validation data whereas the
internal data is the test data. This sounds perhaps counter-intuitive, but if a skill score is evaluated to a model
to another, this must be performend test data set. Therefore, for this case the foreign data is train and val data.
.. code-block:: python
skill_scores_class = SkillScores(internal_data) # must contain columns obs and CNN.
skill_scores_clim = skill_scores_class.climatological_skill_scores(external_data, window_lead_time=3)
"""
     def __init__(self, internal_data):
+        """Set internal data."""
         self.internal_data = internal_data

     def skill_scores(self, window_lead_time):
+        """
+        Calculate skill scores for all combinations of CNN, persistence and OLS.
+
+        :param window_lead_time: length of forecast steps
+
+        :return: skill score for each comparison and forecast step
+        """
         ahead_names = list(range(1, window_lead_time + 1))
         skill_score = pd.DataFrame(index=['cnn-persi', 'ols-persi', 'cnn-ols'])
         for iahead in ahead_names:
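
Aside (not part of this commit): the MSE-based skill score defined in the class docstring above reduces to a few lines of plain numpy. The numbers below are made-up example data; general_skill_score itself operates on labelled xarray/pandas data as shown in the docstring.

    import numpy as np

    def mse(a, b):
        """Mean squared error between two arrays."""
        return np.mean((a - b) ** 2)

    obs = np.array([1.0, 2.0, 3.0, 4.0])        # observations
    forecast = np.array([1.1, 1.9, 3.2, 3.8])   # model forecast, e.g. CNN
    reference = np.array([1.0, 1.0, 2.0, 3.0])  # reference forecast, e.g. persistence

    # SkillScore = 1 - MSE(obs, for) / MSE(obs, ref), as in the docstring formula
    skill_score = 1 - mse(obs, forecast) / mse(obs, reference)  # > 0: forecast beats the reference
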
@@ -126,7 +176,18 @@ class SkillScores:
                                   self.general_skill_score(data, forecast_name="CNN", reference_name="OLS")]
         return skill_score

-    def climatological_skill_scores(self, external_data, window_lead_time):
+    def climatological_skill_scores(self, external_data: Data, window_lead_time: int) -> Data:
+        """
+        Calculate climatological skill scores according to Murphy (1988).
+
+        Calculate all CASES I - IV and terms [ABC][I-IV]. Internal data has to be set by initialisation, external
+        data is part of the parameters.
+
+        :param external_data: external data
+        :param window_lead_time: time step of the forecast horizon that is used to select data
+
+        :return: all CASES as well as all terms
+        """
         ahead_names = list(range(1, window_lead_time + 1))
         all_terms = ['AI', 'AII', 'AIII', 'AIV', 'BI', 'BII', 'BIV', 'CI', 'CIV', 'CASE I', 'CASE II', 'CASE III',
@@ -161,7 +222,18 @@ class SkillScores:
         return self.__getattribute__(f"skill_score_mu_case_{mu_type}")(data, observation_name, forecast_name, **kwargs)

     @staticmethod
-    def general_skill_score(data, observation_name="obs", forecast_name="CNN", reference_name="persi"):
+    def general_skill_score(data: Data, observation_name: str = "obs", forecast_name: str = "CNN",
+                            reference_name: str = "persi") -> np.ndarray:
+        r"""
+        Calculate general skill score based on mean squared error.
+
+        :param data: internal data containing data for observation, forecast and reference
+        :param observation_name: name of observation
+        :param forecast_name: name of forecast
+        :param reference_name: name of reference
+
+        :return: skill score of forecast
+        """
         data = data.dropna("index")
         observation = data.sel(type=observation_name)
         forecast = data.sel(type=forecast_name)
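
Orientation only (not part of the diff): general_skill_score selects its series via data.sel(type=...), so the expected input appears to be a DataArray with an "index" dimension and a "type" dimension holding observation, forecast and reference. The construction below is a hypothetical minimal example that recomputes the same quantity by hand.

    import numpy as np
    import pandas as pd
    import xarray as xr

    index = pd.date_range("2020-01-01", periods=4, freq="D")
    values = np.array([[1.0, 1.1, 1.0],
                       [2.0, 1.9, 1.0],
                       [3.0, 3.2, 2.0],
                       [4.0, 3.8, 3.0]])
    data = xr.DataArray(values, dims=("index", "type"),
                        coords={"index": index, "type": ["obs", "CNN", "persi"]})

    # mirrors 1 - MSE(obs, CNN) / MSE(obs, persi) from the docstring
    mse_cnn = ((data.sel(type="obs") - data.sel(type="CNN")) ** 2).mean("index")
    mse_persi = ((data.sel(type="obs") - data.sel(type="persi")) ** 2).mean("index")
    skill_score = 1 - mse_cnn / mse_persi
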
@@ -171,14 +243,28 @@ class SkillScores:
         return skill_score.values

     @staticmethod
-    def skill_score_pre_calculations(data, observation_name, forecast_name):
+    def skill_score_pre_calculations(data: Data, observation_name: str, forecast_name: str) -> Tuple[np.ndarray,
+                                                                                                      np.ndarray,
+                                                                                                      np.ndarray,
+                                                                                                      Data,
+                                                                                                      Dict[str, Data]]:
+        """
+        Calculate the terms AI, BI and CI, the mean, variance and Pearson's correlation, and clean up the data.
+
+        The additional information on mean, variance and Pearson's correlation (and the p-value) is returned as a
+        dictionary with the corresponding keys mean, sigma, r and p.
+
+        :param data: internal data to use for calculations
+        :param observation_name: name of observation
+        :param forecast_name: name of forecast
+
+        :returns: terms AI, BI and CI, the internal data without nans, and mean, variance, correlation and its p-value
+        """
         data = data.loc[..., [observation_name, forecast_name]].drop("ahead")
         data = data.dropna("index")

         mean = data.mean("index")
         sigma = np.sqrt(data.var("index"))
-        # r, p = stats.spearmanr(data.loc[..., [forecast_name, observation_name]])
         r, p = stats.pearsonr(data.loc[..., forecast_name], data.loc[..., observation_name])

         AI = np.array(r ** 2)
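
Illustration (not part of the commit): AI is visible above as the squared Pearson correlation; the BI and CI lines are cut off by the hunk, so the sketch below fills them in with the usual Murphy (1988) decomposition, which skill_score_mu_case_1 below combines as AI - BI - CI. Treat the exact BI and CI expressions as an assumption rather than a quote of the module.

    import numpy as np
    from scipy import stats

    obs = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
    forecast = np.array([1.2, 1.8, 3.3, 3.9, 4.8])

    mean_o, mean_f = obs.mean(), forecast.mean()
    sigma_o, sigma_f = obs.std(), forecast.std()
    r, p = stats.pearsonr(forecast, obs)

    AI = r ** 2                              # squared correlation, as in the line above
    BI = (r - sigma_f / sigma_o) ** 2        # assumed conditional-bias term
    CI = ((mean_f - mean_o) / sigma_o) ** 2  # assumed unconditional-bias term

    skill_score_case_1 = AI - BI - CI        # combined as in skill_score_mu_case_1
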
@@ -190,17 +276,18 @@ class SkillScores:
         return AI, BI, CI, data, suffix

     def skill_score_mu_case_1(self, data, observation_name="obs", forecast_name="CNN"):
+        """Calculate CASE I."""
         AI, BI, CI, data, _ = self.skill_score_pre_calculations(data, observation_name, forecast_name)
         skill_score = np.array(AI - BI - CI)
         return pd.DataFrame({"skill_score": [skill_score], "AI": [AI], "BI": [BI], "CI": [CI]}).to_xarray().to_array()

     def skill_score_mu_case_2(self, data, observation_name="obs", forecast_name="CNN"):
+        """Calculate CASE II."""
         AI, BI, CI, data, suffix = self.skill_score_pre_calculations(data, observation_name, forecast_name)
         monthly_mean = self.create_monthly_mean_from_daily_data(data)
         data = xr.concat([data, monthly_mean], dim="type")
         sigma = suffix["sigma"]
         sigma_monthly = np.sqrt(monthly_mean.var())
-        # r, p = stats.spearmanr(data.loc[..., [observation_name, observation_name + "X"]])
         r, p = stats.pearsonr(data.loc[..., observation_name], data.loc[..., observation_name + "X"])
         AII = np.array(r ** 2)
         BII = ((r - sigma_monthly / sigma.loc[observation_name]) ** 2).values
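
To make the CASE II terms concrete (illustration only, not part of the diff): the observation is correlated with its own monthly-mean counterpart (the "X" series produced by create_monthly_mean_from_daily_data), and AII and BII are built from that correlation and the two standard deviations, as in the two lines above. The toy series is invented.

    import numpy as np
    import pandas as pd
    from scipy import stats

    obs = pd.Series(np.arange(60, dtype=float),
                    index=pd.date_range("2020-01-01", periods=60, freq="D"))
    obs_monthly = obs.groupby(obs.index.to_period("M")).transform("mean")  # plays the role of "obsX"

    r, p = stats.pearsonr(obs, obs_monthly)
    AII = r ** 2
    BII = (r - obs_monthly.std(ddof=0) / obs.std(ddof=0)) ** 2
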
@@ -208,6 +295,7 @@ class SkillScores:
         return pd.DataFrame({"skill_score": [skill_score], "AII": [AII], "BII": [BII]}).to_xarray().to_array()

     def skill_score_mu_case_3(self, data, observation_name="obs", forecast_name="CNN", external_data=None):
+        """Calculate CASE III."""
         AI, BI, CI, data, suffix = self.skill_score_pre_calculations(data, observation_name, forecast_name)
         mean, sigma = suffix["mean"], suffix["sigma"]
         AIII = (((external_data.mean().values - mean.loc[observation_name]) / sigma.loc[observation_name]) ** 2).values
@@ -215,6 +303,7 @@ class SkillScores:
         return pd.DataFrame({"skill_score": [skill_score], "AIII": [AIII]}).to_xarray().to_array()

     def skill_score_mu_case_4(self, data, observation_name="obs", forecast_name="CNN", external_data=None):
+        """Calculate CASE IV."""
         AI, BI, CI, data, suffix = self.skill_score_pre_calculations(data, observation_name, forecast_name)
         monthly_mean_external = self.create_monthly_mean_from_daily_data(external_data, columns=data.type.values,
                                                                          index=data.index)
@@ -236,6 +325,15 @@ class SkillScores:

     @staticmethod
     def create_monthly_mean_from_daily_data(data, columns=None, index=None):
+        """
+        Calculate the average for each month and save it as daily values with the flag 'X'.
+
+        :param data: data to average
+        :param columns: columns to work on (all columns of the given data are used if empty)
+        :param index: index of the returned data (the index of the given data is used if empty)
+
+        :return: data containing monthly means in daily resolution
+        """
         if columns is None:
             columns = data.type.values
         if index is None:
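
The body of create_monthly_mean_from_daily_data is largely outside this hunk and the module works on xarray objects, so the following is only a pandas sketch of the idea described in the docstring: average per calendar month and write the result back at daily resolution under a column flagged with 'X'.

    import numpy as np
    import pandas as pd

    daily = pd.DataFrame({"obs": np.random.rand(90)},
                         index=pd.date_range("2020-01-01", periods=90, freq="D"))

    monthly_mean = daily.groupby(daily.index.to_period("M")).transform("mean")
    monthly_mean.columns = [col + "X" for col in daily.columns]  # flag monthly means with 'X'

    combined = daily.join(monthly_mean)  # daily resolution, columns: obs, obsX
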
......