From 1bf8d5d60f73455f28138e97f4905a31fd9ce383 Mon Sep 17 00:00:00 2001
From: lukas leufen <l.leufen@fz-juelich.de>
Date: Tue, 28 Apr 2020 08:15:23 +0200
Subject: [PATCH] add docs to skill scores

---
 docs/_source/conf.py      |   9 +--
 src/helpers/statistics.py | 112 +++++++++++++++++++++++++++++++++++---
 2 files changed, 110 insertions(+), 11 deletions(-)

diff --git a/docs/_source/conf.py b/docs/_source/conf.py
index 8a3181cb..6363f57e 100644
--- a/docs/_source/conf.py
+++ b/docs/_source/conf.py
@@ -13,13 +13,13 @@
 import os
 import sys
-sys.path.insert(0, os.path.abspath('../src'))
+sys.path.insert(0, os.path.abspath('../..'))
 
 
 # -- Project information -----------------------------------------------------
 
 project = 'machinelearningtools'
-copyright = '2020, Felix Kleinert, Lukas H Leufen'
-author = 'Felix Kleinert, Lukas H Leufen'
+copyright = '2020, Lukas H Leufen, Felix Kleinert'
+author = 'Lukas H Leufen, Felix Kleinert'
 
 # The short X.Y version
 version = 'v0.9.0'
@@ -38,7 +38,7 @@ extensions = [
     'sphinx.ext.coverage',
     'sphinx.ext.imgmath',
     'sphinx.ext.ifconfig',
-    'sphinx.ext.viewcode',
+    # 'sphinx.ext.viewcode',
     'sphinx.ext.autosummary',
     'autoapi.extension',
     'sphinx.ext.napoleon',
@@ -46,6 +46,7 @@ extensions = [
     'sphinx.ext.githubpages',
     'recommonmark',
     'sphinx.ext.autosectionlabel',
+    'sphinx_autodoc_typehints',  # must be loaded after napoleon
 ]
 
 # 2020-02-19 Begin
diff --git a/src/helpers/statistics.py b/src/helpers/statistics.py
index 99ae80d5..a38f8915 100644
--- a/src/helpers/statistics.py
+++ b/src/helpers/statistics.py
@@ -1,3 +1,5 @@
+"""Collection of statistical methods: Transformation and Skill Scores."""
+
 from scipy import stats
 
 __author__ = 'Lukas Leufen, Felix Kleinert'
@@ -6,7 +8,7 @@ __date__ = '2019-10-23'
 import numpy as np
 import xarray as xr
 import pandas as pd
-from typing import Union, Tuple
+from typing import Union, Tuple, Dict
 
 Data = Union[xr.DataArray, pd.DataFrame]
 
@@ -112,11 +114,59 @@ def mean_squared_error(a, b):
 
 
 class SkillScores:
+    r"""
+    Calculate different kinds of skill scores.
+
+    **Skill score on MSE**:
+
+        Calculate the skill score based on MSE for a given forecast, reference and observation.
+
+        .. math::
+
+            \text{SkillScore} = 1 - \frac{\text{MSE(obs, for)}}{\text{MSE(obs, ref)}}
+
+        To run:
+
+        .. code-block:: python
+
+            skill_scores = SkillScores(None).general_skill_score(data, observation_name, forecast_name, reference_name)
+
+    **Competitive skill score**:
+
+        Calculate skill scores to highlight differences between forecasts. This skill score is also based on the MSE.
+        The forecasts CNN, OLS and persi are currently required, as well as the observation obs.
+
+        .. code-block:: python
+
+            skill_scores_class = SkillScores(internal_data)  # must contain columns CNN, OLS, persi and obs.
+            skill_scores = skill_scores_class.skill_scores(window_lead_time=3)
+
+    **Skill score according to Murphy**:
+
+        Follow the climatological skill score definition of Murphy (1988). External data is data from a different time
+        period than the internal data given on initialisation. Typically, the external data is the train and validation
+        data, whereas the internal data is the test data. This may sound counter-intuitive, but a skill score comparing
+        one model with another has to be evaluated on the test data; hence, the external data here is train and val data.
+
+        .. code-block:: python
+
+            skill_scores_class = SkillScores(internal_data)  # must contain columns obs and CNN.
+            skill_scores_clim = skill_scores_class.climatological_skill_scores(external_data, window_lead_time=3)
+
+    """
 
     def __init__(self, internal_data):
+        """Set internal data."""
         self.internal_data = internal_data
 
     def skill_scores(self, window_lead_time):
+        """
+        Calculate skill scores for all combinations of CNN, persistence and OLS.
+
+        :param window_lead_time: number of forecast steps (lead times) to evaluate
+
+        :return: skill score for each comparison and forecast step
+        """
         ahead_names = list(range(1, window_lead_time + 1))
         skill_score = pd.DataFrame(index=['cnn-persi', 'ols-persi', 'cnn-ols'])
         for iahead in ahead_names:
@@ -126,7 +176,18 @@ class SkillScores:
                                           self.general_skill_score(data, forecast_name="CNN", reference_name="OLS")]
         return skill_score
 
-    def climatological_skill_scores(self, external_data, window_lead_time):
+    def climatological_skill_scores(self, external_data: Data, window_lead_time: int) -> Data:
+        """
+        Calculate climatological skill scores according to Murphy (1988).
+
+        Calculate all CASES I to IV and terms [ABC][I-IV]. Internal data has to be set on initialisation, external
+        data is passed as a parameter.
+
+        :param external_data: external data
+        :param window_lead_time: number of forecast steps to select data for
+
+        :return: all CASES as well as all terms
+        """
         ahead_names = list(range(1, window_lead_time + 1))
         all_terms = ['AI', 'AII', 'AIII', 'AIV', 'BI', 'BII', 'BIV', 'CI', 'CIV',
                      'CASE I', 'CASE II', 'CASE III',
@@ -161,7 +222,18 @@ class SkillScores:
         return self.__getattribute__(f"skill_score_mu_case_{mu_type}")(data, observation_name, forecast_name, **kwargs)
 
     @staticmethod
-    def general_skill_score(data, observation_name="obs", forecast_name="CNN", reference_name="persi"):
+    def general_skill_score(data: Data, observation_name: str = "obs", forecast_name: str = "CNN",
+                            reference_name: str = "persi") -> np.ndarray:
+        r"""
+        Calculate general skill score based on mean squared error.
+
+        :param data: internal data containing data for observation, forecast and reference
+        :param observation_name: name of observation
+        :param forecast_name: name of forecast
+        :param reference_name: name of reference
+
+        :return: skill score of forecast
+        """
         data = data.dropna("index")
         observation = data.sel(type=observation_name)
         forecast = data.sel(type=forecast_name)
@@ -171,14 +243,28 @@ class SkillScores:
         return skill_score.values
 
     @staticmethod
-    def skill_score_pre_calculations(data, observation_name, forecast_name):
-
+    def skill_score_pre_calculations(data: Data, observation_name: str, forecast_name: str) -> Tuple[np.ndarray,
+                                                                                                      np.ndarray,
+                                                                                                      np.ndarray,
+                                                                                                      Data,
+                                                                                                      Dict[str, Data]]:
+        """
+        Calculate terms AI, BI, and CI, as well as the mean, variance and Pearson's correlation, and clean up data.
+
+        The additional information on mean, variance and Pearson's correlation (and the p-value) is returned as a
+        dictionary with the corresponding keys mean, sigma, r and p.
+
+        :param data: internal data to use for calculations
+        :param observation_name: name of observation
+        :param forecast_name: name of forecast
+
+        :returns: terms AI, BI, and CI, the internal data without NaNs, and mean, variance, correlation and its p-value
+        """
         data = data.loc[..., [observation_name, forecast_name]].drop("ahead")
         data = data.dropna("index")
 
         mean = data.mean("index")
         sigma = np.sqrt(data.var("index"))
-        # r, p = stats.spearmanr(data.loc[..., [forecast_name, observation_name]])
         r, p = stats.pearsonr(data.loc[..., forecast_name], data.loc[..., observation_name])
 
         AI = np.array(r ** 2)
@@ -190,17 +276,18 @@ class SkillScores:
         return AI, BI, CI, data, suffix
 
     def skill_score_mu_case_1(self, data, observation_name="obs", forecast_name="CNN"):
+        """Calculate CASE I."""
         AI, BI, CI, data, _ = self.skill_score_pre_calculations(data, observation_name, forecast_name)
         skill_score = np.array(AI - BI - CI)
         return pd.DataFrame({"skill_score": [skill_score], "AI": [AI], "BI": [BI], "CI": [CI]}).to_xarray().to_array()
 
     def skill_score_mu_case_2(self, data, observation_name="obs", forecast_name="CNN"):
+        """Calculate CASE II."""
         AI, BI, CI, data, suffix = self.skill_score_pre_calculations(data, observation_name, forecast_name)
         monthly_mean = self.create_monthly_mean_from_daily_data(data)
         data = xr.concat([data, monthly_mean], dim="type")
         sigma = suffix["sigma"]
         sigma_monthly = np.sqrt(monthly_mean.var())
-        # r, p = stats.spearmanr(data.loc[..., [observation_name, observation_name + "X"]])
         r, p = stats.pearsonr(data.loc[..., observation_name], data.loc[..., observation_name + "X"])
         AII = np.array(r ** 2)
         BII = ((r - sigma_monthly / sigma.loc[observation_name]) ** 2).values
@@ -208,6 +295,7 @@ class SkillScores:
         return pd.DataFrame({"skill_score": [skill_score], "AII": [AII], "BII": [BII]}).to_xarray().to_array()
 
     def skill_score_mu_case_3(self, data, observation_name="obs", forecast_name="CNN", external_data=None):
+        """Calculate CASE III."""
         AI, BI, CI, data, suffix = self.skill_score_pre_calculations(data, observation_name, forecast_name)
         mean, sigma = suffix["mean"], suffix["sigma"]
         AIII = (((external_data.mean().values - mean.loc[observation_name]) / sigma.loc[observation_name]) ** 2).values
@@ -215,6 +303,7 @@ class SkillScores:
         skill_score = np.array(AI - BI - CI + AIII)
         return pd.DataFrame({"skill_score": [skill_score], "AIII": [AIII]}).to_xarray().to_array()
 
     def skill_score_mu_case_4(self, data, observation_name="obs", forecast_name="CNN", external_data=None):
+        """Calculate CASE IV."""
         AI, BI, CI, data, suffix = self.skill_score_pre_calculations(data, observation_name, forecast_name)
         monthly_mean_external = self.create_monthly_mean_from_daily_data(external_data, columns=data.type.values,
                                                                          index=data.index)
@@ -236,6 +325,15 @@ class SkillScores:
 
     @staticmethod
    def create_monthly_mean_from_daily_data(data, columns=None, index=None):
+        """
+        Calculate the average for each month and save it as daily values with the flag 'X'.
+
+        :param data: data to average
+        :param columns: columns to work on (all columns of the given data are used if empty)
+        :param index: index of the returned data (the index of the given data is used if empty)
+
+        :return: data containing monthly means in daily resolution
+        """
         if columns is None:
             columns = data.type.values
         if index is None:
-- 
GitLab
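
Usage note (illustrative only, not part of the patch): the sketch below shows how the skill scores documented above might be called. The import path, the dimension names (index, type, ahead) and the column labels (obs, CNN, OLS, persi) are assumptions taken from the docstrings and code in this diff; the random data is purely hypothetical.

    import numpy as np
    import pandas as pd
    import xarray as xr

    from src.helpers.statistics import SkillScores  # assumed import path

    # Hypothetical forecast data with dims (index, type, ahead): daily values,
    # one observation column, three forecast columns and three lead times.
    index = pd.date_range("2019-01-01", periods=100, freq="D")
    types = ["obs", "CNN", "OLS", "persi"]
    ahead = [1, 2, 3]
    values = np.random.rand(len(index), len(types), len(ahead))
    data = xr.DataArray(values, coords=[index, types, ahead], dims=["index", "type", "ahead"])

    # Competitive skill score: compare CNN, OLS and persistence for each lead time.
    skill_scores = SkillScores(data).skill_scores(window_lead_time=3)

    # Skill score on MSE for a single lead time, following the formula in the class docstring.
    score = SkillScores(None).general_skill_score(data.sel(ahead=1), observation_name="obs",
                                                  forecast_name="CNN", reference_name="persi")

For the climatological skill scores after Murphy (1988), the docstrings above additionally require external data from a different time period (train and validation data), passed to climatological_skill_scores.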