From 1bf8d5d60f73455f28138e97f4905a31fd9ce383 Mon Sep 17 00:00:00 2001
From: lukas leufen <l.leufen@fz-juelich.de>
Date: Tue, 28 Apr 2020 08:15:23 +0200
Subject: [PATCH] add docs to skill scores

---
 docs/_source/conf.py      |   9 +--
 src/helpers/statistics.py | 112 +++++++++++++++++++++++++++++++++++---
 2 files changed, 110 insertions(+), 11 deletions(-)

diff --git a/docs/_source/conf.py b/docs/_source/conf.py
index 8a3181cb..6363f57e 100644
--- a/docs/_source/conf.py
+++ b/docs/_source/conf.py
@@ -13,13 +13,13 @@
 import os
 import sys
-sys.path.insert(0, os.path.abspath('../src'))
+sys.path.insert(0, os.path.abspath('../..'))
 
 
 # -- Project information -----------------------------------------------------
 
 project = 'machinelearningtools'
-copyright = '2020, Felix Kleinert, Lukas H Leufen'
-author = 'Felix Kleinert, Lukas H Leufen'
+copyright = '2020, Lukas H Leufen, Felix Kleinert'
+author = 'Lukas H Leufen, Felix Kleinert'
 
 # The short X.Y version
 version = 'v0.9.0'
@@ -38,7 +38,7 @@ extensions = [
     'sphinx.ext.coverage',
     'sphinx.ext.imgmath',
     'sphinx.ext.ifconfig',
-    'sphinx.ext.viewcode',
+    # 'sphinx.ext.viewcode',
     'sphinx.ext.autosummary',
     'autoapi.extension',
     'sphinx.ext.napoleon',
@@ -46,6 +46,7 @@ extensions = [
     'sphinx.ext.githubpages',
     'recommonmark',
     'sphinx.ext.autosectionlabel',
+    'sphinx_autodoc_typehints',  # must be loaded after napoleon
 ]
 
 # 2020-02-19 Begin
diff --git a/src/helpers/statistics.py b/src/helpers/statistics.py
index 99ae80d5..a38f8915 100644
--- a/src/helpers/statistics.py
+++ b/src/helpers/statistics.py
@@ -1,3 +1,5 @@
+"""Collection of statistical methods: Transformation and Skill Scores."""
+
 from scipy import stats
 
 __author__ = 'Lukas Leufen, Felix Kleinert'
@@ -6,7 +8,7 @@ __date__ = '2019-10-23'
 import numpy as np
 import xarray as xr
 import pandas as pd
-from typing import Union, Tuple
+from typing import Union, Tuple, Dict
 
 Data = Union[xr.DataArray, pd.DataFrame]
 
@@ -112,11 +114,59 @@ def mean_squared_error(a, b):
 
 
 class SkillScores:
+    r"""
+    Calculate different kinds of skill scores.
+
+    **Skill score on MSE**:
+
+        Calculate the skill score based on MSE for a given forecast, reference and observation.
+
+        .. math::
+
+            \text{SkillScore} = 1 - \frac{\text{MSE(obs, for)}}{\text{MSE(obs, ref)}}
+
+        To run:
+
+        .. code-block:: python
+
+            skill_scores = SkillScores(None).general_skill_score(data, observation_name, forecast_name, reference_name)
+
+    **Competitive skill score**:
+
+        Calculate skill scores to highlight differences between forecasts. This skill score is also based on the MSE.
+        The forecasts CNN, OLS and persi are currently required, as well as the observation obs.
+
+        .. code-block:: python
+
+            skill_scores_class = SkillScores(internal_data)  # must contain columns CNN, OLS, persi and obs.
+            skill_scores = skill_scores_class.skill_scores(window_lead_time=3)
+
+    **Skill score according to Murphy**:
+
+        Follow the climatological skill score definition of Murphy (1988). External data is data from a different time
+        period than the internal data given on initialisation. Typically, the external data is the train and validation
+        data, whereas the internal data is the test data. This may sound counter-intuitive, but a skill score comparing
+        one model with another has to be evaluated on the test data; hence, the external data here is train and val data.
+
+        .. code-block:: python
+
+            skill_scores_class = SkillScores(internal_data)  # must contain columns obs and CNN.
+            skill_scores_clim = skill_scores_class.climatological_skill_scores(external_data, window_lead_time=3)
+
+    """
 
     def __init__(self, internal_data):
+        """Set internal data."""
         self.internal_data = internal_data
 
     def skill_scores(self, window_lead_time):
+        """
+        Calculate skill scores for all combinations of CNN, persistence and OLS.
+
+        :param window_lead_time: number of forecast steps (lead times) to evaluate
+
+        :return: skill score for each comparison and forecast step
+        """
         ahead_names = list(range(1, window_lead_time + 1))
         skill_score = pd.DataFrame(index=['cnn-persi', 'ols-persi', 'cnn-ols'])
         for iahead in ahead_names:
@@ -126,7 +176,18 @@ class SkillScores:
                                           self.general_skill_score(data, forecast_name="CNN", reference_name="OLS")]
         return skill_score
 
-    def climatological_skill_scores(self, external_data, window_lead_time):
+    def climatological_skill_scores(self, external_data: Data, window_lead_time: int) -> Data:
+        """
+        Calculate climatological skill scores according to Murphy (1988).
+
+        Calculate all CASES I to IV and terms [ABC][I-IV]. Internal data has to be set on initialisation, external
+        data is passed as a parameter.
+
+        :param external_data: external data
+        :param window_lead_time: number of forecast steps to select data for
+
+        :return: all CASES as well as all terms
+        """
         ahead_names = list(range(1, window_lead_time + 1))
         all_terms = ['AI', 'AII', 'AIII', 'AIV', 'BI', 'BII', 'BIV', 'CI', 'CIV',
                      'CASE I', 'CASE II', 'CASE III',
@@ -161,7 +222,18 @@ class SkillScores:
         return self.__getattribute__(f"skill_score_mu_case_{mu_type}")(data, observation_name, forecast_name, **kwargs)
 
     @staticmethod
-    def general_skill_score(data, observation_name="obs", forecast_name="CNN", reference_name="persi"):
+    def general_skill_score(data: Data, observation_name: str = "obs", forecast_name: str = "CNN",
+                            reference_name: str = "persi") -> np.ndarray:
+        r"""
+        Calculate general skill score based on mean squared error.
+
+        :param data: internal data containing data for observation, forecast and reference
+        :param observation_name: name of observation
+        :param forecast_name: name of forecast
+        :param reference_name: name of reference
+
+        :return: skill score of forecast
+        """
         data = data.dropna("index")
         observation = data.sel(type=observation_name)
         forecast = data.sel(type=forecast_name)
@@ -171,14 +243,28 @@ class SkillScores:
         return skill_score.values
 
     @staticmethod
-    def skill_score_pre_calculations(data, observation_name, forecast_name):
-
+    def skill_score_pre_calculations(data: Data, observation_name: str, forecast_name: str) -> Tuple[np.ndarray,
+                                                                                                      np.ndarray,
+                                                                                                      np.ndarray,
+                                                                                                      Data,
+                                                                                                      Dict[str, Data]]:
+        """
+        Calculate terms AI, BI, and CI, as well as the mean, variance and Pearson's correlation, and clean up data.
+
+        The additional information on mean, variance and Pearson's correlation (and the p-value) is returned as a
+        dictionary with the corresponding keys mean, sigma, r and p.
+
+        :param data: internal data to use for calculations
+        :param observation_name: name of observation
+        :param forecast_name: name of forecast
+
+        :returns: terms AI, BI, and CI, the internal data without NaNs, and mean, variance, correlation and its p-value
+        """
         data = data.loc[..., [observation_name, forecast_name]].drop("ahead")
         data = data.dropna("index")
 
         mean = data.mean("index")
         sigma = np.sqrt(data.var("index"))
-        # r, p = stats.spearmanr(data.loc[..., [forecast_name, observation_name]])
         r, p = stats.pearsonr(data.loc[..., forecast_name], data.loc[..., observation_name])
 
         AI = np.array(r ** 2)
@@ -190,17 +276,18 @@ class SkillScores:
         return AI, BI, CI, data, suffix
 
     def skill_score_mu_case_1(self, data, observation_name="obs", forecast_name="CNN"):
+        """Calculate CASE I."""
         AI, BI, CI, data, _ = self.skill_score_pre_calculations(data, observation_name, forecast_name)
         skill_score = np.array(AI - BI - CI)
         return pd.DataFrame({"skill_score": [skill_score], "AI": [AI], "BI": [BI], "CI": [CI]}).to_xarray().to_array()
 
     def skill_score_mu_case_2(self, data, observation_name="obs", forecast_name="CNN"):
+        """Calculate CASE II."""
         AI, BI, CI, data, suffix = self.skill_score_pre_calculations(data, observation_name, forecast_name)
         monthly_mean = self.create_monthly_mean_from_daily_data(data)
         data = xr.concat([data, monthly_mean], dim="type")
         sigma = suffix["sigma"]
         sigma_monthly = np.sqrt(monthly_mean.var())
-        # r, p = stats.spearmanr(data.loc[..., [observation_name, observation_name + "X"]])
         r, p = stats.pearsonr(data.loc[..., observation_name], data.loc[..., observation_name + "X"])
         AII = np.array(r ** 2)
         BII = ((r - sigma_monthly / sigma.loc[observation_name]) ** 2).values
@@ -208,6 +295,7 @@ class SkillScores:
         return pd.DataFrame({"skill_score": [skill_score], "AII": [AII], "BII": [BII]}).to_xarray().to_array()
 
     def skill_score_mu_case_3(self, data, observation_name="obs", forecast_name="CNN", external_data=None):
+        """Calculate CASE III."""
         AI, BI, CI, data, suffix = self.skill_score_pre_calculations(data, observation_name, forecast_name)
         mean, sigma = suffix["mean"], suffix["sigma"]
         AIII = (((external_data.mean().values - mean.loc[observation_name]) / sigma.loc[observation_name]) ** 2).values
@@ -215,6 +303,7 @@ class SkillScores:
         skill_score = np.array(AI - BI - CI + AIII)
         return pd.DataFrame({"skill_score": [skill_score], "AIII": [AIII]}).to_xarray().to_array()
 
     def skill_score_mu_case_4(self, data, observation_name="obs", forecast_name="CNN", external_data=None):
+        """Calculate CASE IV."""
         AI, BI, CI, data, suffix = self.skill_score_pre_calculations(data, observation_name, forecast_name)
         monthly_mean_external = self.create_monthly_mean_from_daily_data(external_data, columns=data.type.values,
                                                                          index=data.index)
@@ -236,6 +325,15 @@ class SkillScores:
 
     @staticmethod
    def create_monthly_mean_from_daily_data(data, columns=None, index=None):
+        """
+        Calculate the average for each month and save it as daily values with the flag 'X'.
+
+        :param data: data to average
+        :param columns: columns to work on (all columns of the given data are used if empty)
+        :param index: index of the returned data (the index of the given data is used if empty)
+
+        :return: data containing monthly means in daily resolution
+        """
         if columns is None:
             columns = data.type.values
         if index is None:
-- 
GitLab
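
Usage note (illustrative only, not part of the patch): the sketch below shows how the skill scores documented above might be called. The import path, the dimension names (index, type, ahead) and the column labels (obs, CNN, OLS, persi) are assumptions taken from the docstrings and code in this diff; the random data is purely hypothetical.

    import numpy as np
    import pandas as pd
    import xarray as xr

    from src.helpers.statistics import SkillScores  # assumed import path

    # Hypothetical forecast data with dims (index, type, ahead): daily values,
    # one observation column, three forecast columns and three lead times.
    index = pd.date_range("2019-01-01", periods=100, freq="D")
    types = ["obs", "CNN", "OLS", "persi"]
    ahead = [1, 2, 3]
    values = np.random.rand(len(index), len(types), len(ahead))
    data = xr.DataArray(values, coords=[index, types, ahead], dims=["index", "type", "ahead"])

    # Competitive skill score: compare CNN, OLS and persistence for each lead time.
    skill_scores = SkillScores(data).skill_scores(window_lead_time=3)

    # Skill score on MSE for a single lead time, following the formula in the class docstring.
    score = SkillScores(None).general_skill_score(data.sel(ahead=1), observation_name="obs",
                                                  forecast_name="CNN", reference_name="persi")

For the climatological skill scores after Murphy (1988), the docstrings above additionally require external data from a different time period (train and validation data), passed to climatological_skill_scores.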