From 27734c2e81f7df4cc7f5cb3a28cd6132f76d95af Mon Sep 17 00:00:00 2001
From: lukas leufen <l.leufen@fz-juelich.de>
Date: Fri, 18 Sep 2020 15:19:10 +0000
Subject: [PATCH] added table of contents to readme

---
 README.md                                 | 59 +----------------------
 mlair/data_handler/station_preparation.py | 53 ++++++++++++++++++++++
 2 files changed, 55 insertions(+), 57 deletions(-)

diff --git a/README.md b/README.md
index 12d2c13a..6d9dc21f 100644
--- a/README.md
+++ b/README.md
@@ -4,6 +4,8 @@ MLAir (Machine Learning on Air data) is an environment that simplifies and accel
 learning (ML) models for the analysis and forecasting of meteorological and air quality time series. You can find the
 docs [here](http://toar.pages.jsc.fz-juelich.de/mlair/docs/).
 
+[[_TOC_]]
+
 # Installation
 
 MLAir is based on several python frameworks. To work properly, you have to install all packages from the
@@ -375,60 +377,3 @@ add it to `src/join_settings.py` in the hourly data section. Replace the `TOAR_S
 value. To make sure, that this **sensitive** data is not uploaded to the remote server, use the following command to
 prevent git from tracking this file: `git update-index --assume-unchanged src/join_settings.py`
 
-
-# remaining things
-
-## Transformation
-
-There are two different approaches (called scopes) to transform the data:
-1) `station`: transform data for each station independently (somehow like batch normalisation)
-1) `data`: transform all data of each station with shared metrics
-
-Transformation must be set by the `transformation` attribute. If `transformation = None` is given to `ExperimentSetup`,
-data is not transformed at all. For all other setups, use the following dictionary structure to specify the
-transformation.
-```
-transformation = {"scope": <...>,
-                  "method": <...>,
-                  "mean": <...>,
-                  "std": <...>}
-ExperimentSetup(..., transformation=transformation, ...)
-```
-
-### scopes
-
-**station**: mean and std are not used
-
-**data**: either provide already calculated values for mean and std (if required by transformation method), or choose
-from different calculation schemes, explained in the mean and std section.
-
-### supported transformation methods
-Currently supported methods are:
-* standardise (default, if method is not given)
-* centre
-
-### mean and std
-`"mean"="accurate"`: calculate the accurate values of mean and std (depending on method) by using all data. Although,
-this method is accurate, it may take some time for the calculation. Furthermore, this could potentially lead to memory
-issue (not explored yet, but could appear for a very big amount of data)
-
-`"mean"="estimate"`: estimate mean and std (depending on method). For each station, mean and std are calculated and
-afterwards aggregated using the mean value over all station-wise metrics. This method is less accurate, especially
-regarding the std calculation but therefore much faster.
-
-We recommend to use the later method *estimate* because of following reasons:
-* much faster calculation
-* real accuracy of mean and std is less important, because it is "just" a transformation / scaling
-* accuracy of mean is almost as high as in the *accurate* case, because of
-$\bar{x_{ij}} = \bar{\left(\bar{x_i}\right)_j}$. The only difference is, that in the *estimate* case, each mean is
-equally weighted for each station independently of the actual data count of the station.
-* accuracy of std is lower for *estimate* because of $\var{x_{ij}} \ne \bar{\left(\var{x_i}\right)_j}$, but still the mean of all
-station-wise std is a decent estimate of the true std.
-
-`"mean"=<value, e.g. xr.DataArray>`: If mean and std are already calculated or shall be set manually, just add the
-scaling values instead of the calculation method. For method *centre*, std can still be None, but is required for the
-*standardise* method. **Important**: Format of given values **must** match internal data format of DataPreparation
-class: `xr.DataArray` with `dims=["variables"]` and one value for each variable.
-
-
-
diff --git a/mlair/data_handler/station_preparation.py b/mlair/data_handler/station_preparation.py
index 0f7dbd26..a278d0df 100644
--- a/mlair/data_handler/station_preparation.py
+++ b/mlair/data_handler/station_preparation.py
@@ -514,6 +514,59 @@ class DataHandlerSingleStation(AbstractDataHandlerSingleStation):
         :param transformation: the transformation dictionary as described above.
 
         :return: updated transformation dictionary
+
+        ## Transformation
+
+        There are two different approaches (called scopes) to transform the data:
+        1) `station`: transform the data of each station independently (similar to batch normalisation)
+        2) `data`: transform the data of all stations with shared metrics
+
+        The transformation is set via the `transformation` attribute. If `transformation = None` is given to `ExperimentSetup`,
+        the data is not transformed at all. For all other setups, use the following dictionary structure to specify the
+        transformation.
+        ```
+        transformation = {"scope": <...>,
+                          "method": <...>,
+                          "mean": <...>,
+                          "std": <...>}
+        ExperimentSetup(..., transformation=transformation, ...)
+        ```
+
+        ### scopes
+
+        **station**: mean and std are not used
+
+        **data**: either provide already calculated values for mean and std (if required by the transformation method),
+        or choose one of the calculation schemes explained in the mean and std section.
+
+        ### supported transformation methods
+        Currently supported methods are:
+        * standardise (default, if no method is given)
+        * centre
+
+        ### mean and std
+        `"mean"="accurate"`: calculate the exact values of mean and std (depending on the method) using all data. Although
+        this method is accurate, the calculation may take some time. Furthermore, it could potentially lead to memory
+        issues (not explored yet, but possible for very large amounts of data).
+
+        `"mean"="estimate"`: estimate mean and std (depending on the method). For each station, mean and std are calculated
+        and afterwards aggregated by taking the mean over all station-wise metrics. This method is less accurate, especially
+        regarding the std calculation, but therefore much faster.
+
+        We recommend using the latter method, *estimate*, for the following reasons:
+        * much faster calculation
+        * the exact accuracy of mean and std is less important, because they are "just" used for transformation / scaling
+        * the accuracy of the mean is almost as high as in the *accurate* case, because
+        $\overline{x_{ij}} = \overline{\left(\overline{x_i}\right)_j}$. The only difference is that in the *estimate* case
+        each station's mean is weighted equally, independently of the station's actual data count.
+        * the accuracy of the std is lower for *estimate*, because $\mathrm{Var}(x_{ij}) \ne \overline{\left(\mathrm{Var}(x_i)\right)_j}$;
+        still, the mean of all station-wise stds is a decent estimate of the true std.
+
+        `"mean"=<value, e.g. xr.DataArray>`: if mean and std have already been calculated or shall be set manually, just pass
+        the scaling values instead of a calculation method. For the *centre* method, std can still be None, but it is required
+        for the *standardise* method. **Important**: the format of the given values **must** match the internal data format of
+        the DataPreparation class: an `xr.DataArray` with `dims=["variables"]` and one value per variable.
+
         """
         if transformation is None:
             return
--
GitLab
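
To illustrate the transformation dictionary described above, here is a minimal sketch of a manually specified scaling. The keys (`scope`, `method`, `mean`, `std`) and the required `xr.DataArray` format with `dims=["variables"]` come from the documentation; the variable names and scaling values below are hypothetical placeholders.

```python
import xarray as xr

# Placeholder variables and scaling values (hypothetical, for illustration only).
variables = ["o3", "temp", "relhum"]
mean = xr.DataArray([45.0, 283.15, 70.0], dims=["variables"], coords={"variables": variables})
std = xr.DataArray([20.0, 8.5, 15.0], dims=["variables"], coords={"variables": variables})

# One value per variable, as required by the DataPreparation class.
transformation = {"scope": "data",          # shared metrics for all stations
                  "method": "standardise",  # default method if none is given
                  "mean": mean,
                  "std": std}               # std may be None for method "centre"

# Passed on as described above:
# ExperimentSetup(..., transformation=transformation, ...)
```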
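
The trade-off between the *accurate* and *estimate* schemes can be sketched with plain numpy on invented station data (the stations, sample sizes, and distributions below are made up for illustration): averaging station-wise means weights every station equally regardless of its data count, and averaging station-wise stds misses the spread between the station means, which is why the std is the less exact of the two estimates.

```python
import numpy as np

rng = np.random.default_rng(0)
# Two fictitious stations with different sample sizes and statistics.
station_a = rng.normal(loc=40.0, scale=20.0, size=1000)
station_b = rng.normal(loc=60.0, scale=5.0, size=200)

# "accurate": compute mean/std over the pooled data of all stations.
pooled = np.concatenate([station_a, station_b])
mean_accurate, std_accurate = pooled.mean(), pooled.std()

# "estimate": compute mean/std per station, then average the station-wise metrics.
mean_estimate = np.mean([station_a.mean(), station_b.mean()])
std_estimate = np.mean([station_a.std(), station_b.std()])

print(f"mean: accurate={mean_accurate:.2f}, estimate={mean_estimate:.2f}")  # differ only through equal station weighting
print(f"std:  accurate={std_accurate:.2f}, estimate={std_estimate:.2f}")    # estimate misses the spread between station means
```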