Commit 1bf8d5d6 authored by Lukas Leufen

add docs to skill scores

parent 561070a5
@@ -13,13 +13,13 @@
 import os
 import sys
-sys.path.insert(0, os.path.abspath('../src'))
+sys.path.insert(0, os.path.abspath('../..'))


 # -- Project information -----------------------------------------------------

 project = 'machinelearningtools'
-copyright = '2020, Felix Kleinert, Lukas H Leufen'
-author = 'Felix Kleinert, Lukas H Leufen'
+copyright = '2020, Lukas H Leufen, Felix Kleinert'
+author = 'Lukas H Leufen, Felix Kleinert'

 # The short X.Y version
 version = 'v0.9.0'
@@ -38,7 +38,7 @@ extensions = [
     'sphinx.ext.coverage',
     'sphinx.ext.imgmath',
     'sphinx.ext.ifconfig',
-    'sphinx.ext.viewcode',
+    # 'sphinx.ext.viewcode',
     'sphinx.ext.autosummary',
     'autoapi.extension',
     'sphinx.ext.napoleon',
@@ -46,6 +46,7 @@ extensions = [
     'sphinx.ext.githubpages',
     'recommonmark',
     'sphinx.ext.autosectionlabel',
+    'sphinx_autodoc_typehints', # must be loaded after napoleon
 ]

 # 2020-02-19 Begin
......
"""Collection of stastical methods: Transformation and Skill Scores."""
from scipy import stats from scipy import stats
__author__ = 'Lukas Leufen, Felix Kleinert' __author__ = 'Lukas Leufen, Felix Kleinert'
...@@ -6,7 +8,7 @@ __date__ = '2019-10-23' ...@@ -6,7 +8,7 @@ __date__ = '2019-10-23'
import numpy as np import numpy as np
import xarray as xr import xarray as xr
import pandas as pd import pandas as pd
from typing import Union, Tuple from typing import Union, Tuple, Dict
Data = Union[xr.DataArray, pd.DataFrame] Data = Union[xr.DataArray, pd.DataFrame]
...@@ -112,11 +114,59 @@ def mean_squared_error(a, b): ...@@ -112,11 +114,59 @@ def mean_squared_error(a, b):
class SkillScores: class SkillScores:
r"""
Calculate different kinds of skill scores.
**Skill score on MSE**:
Calculate skill score based on MSE for given forecast, reference and observations.
.. math::
\text{SkillScore} = 1 - \frac{\text{MSE(obs, for)}}{\text{MSE(obs, ref)}}
To run:
.. code-block:: python
skill_scores = SkillScores(None).general_skill_score(data, observation_name, forecast_name, reference_name)
**Competitive skill score**:
Calculate skill scores to highlight differences between forecasts. This skill score is also based on the MSE.
Currently required forecasts are CNN, OLS and persi, as well as the observation obs.
.. code-block:: python
skill_scores_class = SkillScores(internal_data) # must contain columns CNN, OLS, persi and obs.
skill_scores = skill_scores_class.skill_scores(window_lead_time=3)
**Skill score according to Murphy**:
Follow climatological skill score definition of Murphy (1988). External data is data from another time period than
the internal data set on initialisation. In other terms, this should be the train and validation data whereas the
internal data is the test data. This sounds perhaps counter-intuitive, but if a skill score is evaluated to a model
to another, this must be performend test data set. Therefore, for this case the foreign data is train and val data.
.. code-block:: python
skill_scores_class = SkillScores(internal_data) # must contain columns obs and CNN.
skill_scores_clim = skill_scores_class.climatological_skill_scores(external_data, window_lead_time=3)
"""
     def __init__(self, internal_data):
+        """Set internal data."""
         self.internal_data = internal_data

     def skill_scores(self, window_lead_time):
+        """
+        Calculate skill scores for all combinations of CNN, persistence and OLS.
+
+        :param window_lead_time: length of forecast steps
+
+        :return: skill score for each comparison and forecast step
+        """
         ahead_names = list(range(1, window_lead_time + 1))
         skill_score = pd.DataFrame(index=['cnn-persi', 'ols-persi', 'cnn-ols'])
         for iahead in ahead_names:
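
Aside (not part of this commit): the MSE-based skill score defined in the class docstring above reduces to a few lines of plain numpy. The numbers below are made-up example data; general_skill_score itself operates on labelled xarray/pandas data as shown in the docstring.

    import numpy as np

    def mse(a, b):
        """Mean squared error between two arrays."""
        return np.mean((a - b) ** 2)

    obs = np.array([1.0, 2.0, 3.0, 4.0])        # observations
    forecast = np.array([1.1, 1.9, 3.2, 3.8])   # model forecast, e.g. CNN
    reference = np.array([1.0, 1.0, 2.0, 3.0])  # reference forecast, e.g. persistence

    # SkillScore = 1 - MSE(obs, for) / MSE(obs, ref), as in the docstring formula
    skill_score = 1 - mse(obs, forecast) / mse(obs, reference)  # > 0: forecast beats the reference
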
@@ -126,7 +176,18 @@ class SkillScores:
                                   self.general_skill_score(data, forecast_name="CNN", reference_name="OLS")]
         return skill_score

-    def climatological_skill_scores(self, external_data, window_lead_time):
+    def climatological_skill_scores(self, external_data: Data, window_lead_time: int) -> Data:
+        """
+        Calculate climatological skill scores according to Murphy (1988).
+
+        Calculate all CASES I - IV and terms [ABC][I-IV]. Internal data has to be set by initialisation, external
+        data is part of the parameters.
+
+        :param external_data: external data
+        :param window_lead_time: time step of the forecast horizon that is used to select data
+
+        :return: all CASES as well as all terms
+        """
         ahead_names = list(range(1, window_lead_time + 1))
         all_terms = ['AI', 'AII', 'AIII', 'AIV', 'BI', 'BII', 'BIV', 'CI', 'CIV', 'CASE I', 'CASE II', 'CASE III',
@@ -161,7 +222,18 @@ class SkillScores:
         return self.__getattribute__(f"skill_score_mu_case_{mu_type}")(data, observation_name, forecast_name, **kwargs)

     @staticmethod
-    def general_skill_score(data, observation_name="obs", forecast_name="CNN", reference_name="persi"):
+    def general_skill_score(data: Data, observation_name: str = "obs", forecast_name: str = "CNN",
+                            reference_name: str = "persi") -> np.ndarray:
+        r"""
+        Calculate general skill score based on mean squared error.
+
+        :param data: internal data containing data for observation, forecast and reference
+        :param observation_name: name of observation
+        :param forecast_name: name of forecast
+        :param reference_name: name of reference
+
+        :return: skill score of forecast
+        """
         data = data.dropna("index")
         observation = data.sel(type=observation_name)
         forecast = data.sel(type=forecast_name)
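
Orientation only (not part of the diff): general_skill_score selects its series via data.sel(type=...), so the expected input appears to be a DataArray with an "index" dimension and a "type" dimension holding observation, forecast and reference. The construction below is a hypothetical minimal example that recomputes the same quantity by hand.

    import numpy as np
    import pandas as pd
    import xarray as xr

    index = pd.date_range("2020-01-01", periods=4, freq="D")
    values = np.array([[1.0, 1.1, 1.0],
                       [2.0, 1.9, 1.0],
                       [3.0, 3.2, 2.0],
                       [4.0, 3.8, 3.0]])
    data = xr.DataArray(values, dims=("index", "type"),
                        coords={"index": index, "type": ["obs", "CNN", "persi"]})

    # mirrors 1 - MSE(obs, CNN) / MSE(obs, persi) from the docstring
    mse_cnn = ((data.sel(type="obs") - data.sel(type="CNN")) ** 2).mean("index")
    mse_persi = ((data.sel(type="obs") - data.sel(type="persi")) ** 2).mean("index")
    skill_score = 1 - mse_cnn / mse_persi
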
@@ -171,14 +243,28 @@ class SkillScores:
         return skill_score.values

     @staticmethod
-    def skill_score_pre_calculations(data, observation_name, forecast_name):
+    def skill_score_pre_calculations(data: Data, observation_name: str, forecast_name: str) -> Tuple[np.ndarray,
+                                                                                                      np.ndarray,
+                                                                                                      np.ndarray,
+                                                                                                      Data,
+                                                                                                      Dict[str, Data]]:
+        """
+        Calculate the terms AI, BI and CI, the mean, variance and Pearson's correlation, and clean up the data.
+
+        The additional information on mean, variance and Pearson's correlation (and the p-value) is returned as a
+        dictionary with the corresponding keys mean, sigma, r and p.
+
+        :param data: internal data to use for calculations
+        :param observation_name: name of observation
+        :param forecast_name: name of forecast
+
+        :returns: terms AI, BI and CI, the internal data without nans, and mean, variance, correlation and its p-value
+        """
         data = data.loc[..., [observation_name, forecast_name]].drop("ahead")
         data = data.dropna("index")

         mean = data.mean("index")
         sigma = np.sqrt(data.var("index"))
-        # r, p = stats.spearmanr(data.loc[..., [forecast_name, observation_name]])
         r, p = stats.pearsonr(data.loc[..., forecast_name], data.loc[..., observation_name])

         AI = np.array(r ** 2)
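
Illustration (not part of the commit): AI is visible above as the squared Pearson correlation; the BI and CI lines are cut off by the hunk, so the sketch below fills them in with the usual Murphy (1988) decomposition, which skill_score_mu_case_1 below combines as AI - BI - CI. Treat the exact BI and CI expressions as an assumption rather than a quote of the module.

    import numpy as np
    from scipy import stats

    obs = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
    forecast = np.array([1.2, 1.8, 3.3, 3.9, 4.8])

    mean_o, mean_f = obs.mean(), forecast.mean()
    sigma_o, sigma_f = obs.std(), forecast.std()
    r, p = stats.pearsonr(forecast, obs)

    AI = r ** 2                              # squared correlation, as in the line above
    BI = (r - sigma_f / sigma_o) ** 2        # assumed conditional-bias term
    CI = ((mean_f - mean_o) / sigma_o) ** 2  # assumed unconditional-bias term

    skill_score_case_1 = AI - BI - CI        # combined as in skill_score_mu_case_1
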
@@ -190,17 +276,18 @@ class SkillScores:
         return AI, BI, CI, data, suffix

     def skill_score_mu_case_1(self, data, observation_name="obs", forecast_name="CNN"):
+        """Calculate CASE I."""
         AI, BI, CI, data, _ = self.skill_score_pre_calculations(data, observation_name, forecast_name)
         skill_score = np.array(AI - BI - CI)
         return pd.DataFrame({"skill_score": [skill_score], "AI": [AI], "BI": [BI], "CI": [CI]}).to_xarray().to_array()

     def skill_score_mu_case_2(self, data, observation_name="obs", forecast_name="CNN"):
+        """Calculate CASE II."""
         AI, BI, CI, data, suffix = self.skill_score_pre_calculations(data, observation_name, forecast_name)
         monthly_mean = self.create_monthly_mean_from_daily_data(data)
         data = xr.concat([data, monthly_mean], dim="type")
         sigma = suffix["sigma"]
         sigma_monthly = np.sqrt(monthly_mean.var())
-        # r, p = stats.spearmanr(data.loc[..., [observation_name, observation_name + "X"]])
         r, p = stats.pearsonr(data.loc[..., observation_name], data.loc[..., observation_name + "X"])
         AII = np.array(r ** 2)
         BII = ((r - sigma_monthly / sigma.loc[observation_name]) ** 2).values
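
To make the CASE II terms concrete (illustration only, not part of the diff): the observation is correlated with its own monthly-mean counterpart (the "X" series produced by create_monthly_mean_from_daily_data), and AII and BII are built from that correlation and the two standard deviations, as in the two lines above. The toy series is invented.

    import numpy as np
    import pandas as pd
    from scipy import stats

    obs = pd.Series(np.arange(60, dtype=float),
                    index=pd.date_range("2020-01-01", periods=60, freq="D"))
    obs_monthly = obs.groupby(obs.index.to_period("M")).transform("mean")  # plays the role of "obsX"

    r, p = stats.pearsonr(obs, obs_monthly)
    AII = r ** 2
    BII = (r - obs_monthly.std(ddof=0) / obs.std(ddof=0)) ** 2
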
@@ -208,6 +295,7 @@ class SkillScores:
         return pd.DataFrame({"skill_score": [skill_score], "AII": [AII], "BII": [BII]}).to_xarray().to_array()

     def skill_score_mu_case_3(self, data, observation_name="obs", forecast_name="CNN", external_data=None):
+        """Calculate CASE III."""
         AI, BI, CI, data, suffix = self.skill_score_pre_calculations(data, observation_name, forecast_name)
         mean, sigma = suffix["mean"], suffix["sigma"]
         AIII = (((external_data.mean().values - mean.loc[observation_name]) / sigma.loc[observation_name]) ** 2).values
@@ -215,6 +303,7 @@ class SkillScores:
         return pd.DataFrame({"skill_score": [skill_score], "AIII": [AIII]}).to_xarray().to_array()

     def skill_score_mu_case_4(self, data, observation_name="obs", forecast_name="CNN", external_data=None):
+        """Calculate CASE IV."""
         AI, BI, CI, data, suffix = self.skill_score_pre_calculations(data, observation_name, forecast_name)
         monthly_mean_external = self.create_monthly_mean_from_daily_data(external_data, columns=data.type.values,
                                                                          index=data.index)
@@ -236,6 +325,15 @@ class SkillScores:

     @staticmethod
     def create_monthly_mean_from_daily_data(data, columns=None, index=None):
+        """
+        Calculate the average for each month and save it as daily values with the flag 'X'.
+
+        :param data: data to average
+        :param columns: columns to work on (all columns of the given data are used if empty)
+        :param index: index of the returned data (the index of the given data is used if empty)
+
+        :return: data containing monthly means in daily resolution
+        """
         if columns is None:
             columns = data.type.values
         if index is None:
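
The body of create_monthly_mean_from_daily_data is largely outside this hunk and the module works on xarray objects, so the following is only a pandas sketch of the idea described in the docstring: average per calendar month and write the result back at daily resolution under a column flagged with 'X'.

    import numpy as np
    import pandas as pd

    daily = pd.DataFrame({"obs": np.random.rand(90)},
                         index=pd.date_range("2020-01-01", periods=90, freq="D"))

    monthly_mean = daily.groupby(daily.index.to_period("M")).transform("mean")
    monthly_mean.columns = [col + "X" for col in daily.columns]  # flag monthly means with 'X'

    combined = daily.join(monthly_mean)  # daily resolution, columns: obs, obsX
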
......