diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000000000000000000000000000000000000..a08bab0068246f8d57b3789e10f6f0f4105817ad --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,33 @@ +# Changelog +All notable changes to this project will be documented in this file. + +## v0.9.0 - 2020-04-15 - faster bootstraps, extreme value upsampling +### general +- improved and faster bootstrap workflow +- new plot PlotAvailability +- extreme values upsampling +- improved runtime environment + +### new features +- entire bootstrap workflow has been refactored and is much faster now, can be skipped with `evaluate_bootstraps=False`, #60 +- upsampling of extreme values, set with parameter `extreme_values=[your_values_standardised]` (e.g. `[1, 2]`) and + `extremes_on_right_tail_only=<True/False>` if only right tail of distribution is affected or both, #58, #87 +- minimal data length property (in total and for all subsets), #76 +- custom objects in model class to load customised model objects like padding class, loss, #72 +- new plot for data availability: `PlotAvailability`, #103 +- introduced (default) `plot_list` to specify which plots to draw +- latex and markdown information on sample sizes for each station, #90 + +### technical +- implemented tests on gpu and from scratch for develop, release and master branches, #95 +- usage of tensorflow 1.13.1 (gpu / cpu), separated in 2 different requirements, #81 +- new abstract plot class to have uniform plot class design +- New time tracking wrapper to use for functions or classes +- improved logger (info on display, debug into file), #73, #85, #88 +- improved run environment, especially for error handling, #86 +- prefix `general` in data store scope is now optional and can be skipped. If given scope is not `general`, it is + treated as subscope, #82 +- all 2D Padding classes are now selected by `Padding2D(padding_name=<padding_type>)` e.g. 
def get_version():
    """Build the package version string from ``__version_info__``.

    The entries ``major`` and ``minor`` are required. ``micro`` is optional and is
    appended as a third component only if it is present and truthy (non-zero), so
    ``{'major': 0, 'minor': 9, 'micro': 0}`` results in ``"0.9"``.

    :return: version string, either ``"<major>.<minor>"`` or ``"<major>.<minor>.<micro>"``
    :raises AssertionError: if ``major`` or ``minor`` is missing in ``__version_info__``
    """
    assert set(__version_info__.keys()) >= {"major", "minor"}, \
        "__version_info__ requires at least the keys 'major' and 'minor'"
    vers = [f"{__version_info__['major']}.{__version_info__['minor']}"]
    # The check above only guarantees major and minor, therefore micro must not
    # be indexed directly (a missing key would raise a KeyError).
    micro = __version_info__.get("micro")
    if micro:
        vers.append(f".{micro}")
    return "".join(vers)
# Immutable default for ``run(stations=...)``. It doubles as a sentinel so that a
# fresh list can be created on each call instead of sharing one mutable default.
_DEFAULT_STATIONS = ('DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087', 'DEBW001')


def run(stations=_DEFAULT_STATIONS,
        station_type='background',
        trainable=False, create_new_model=True,
        window_history_size=6,
        experiment_date="testrun",
        network=None,
        variables=None, statistics_per_var=None,
        start=None, end=None,
        target_var="o3", target_dim=None,
        window_lead_time=None,
        dimensions=None,
        interpolate_method=None, interpolate_dim=None, limit_nan_fill=None,
        train_start=None, train_end=None, val_start=None, val_end=None, test_start=None, test_end=None,
        use_all_stations_on_all_data_sets=True, fraction_of_train=None,
        experiment_path=None, plot_path=None, forecast_path=None, bootstrap_path=None, overwrite_local_data=None,
        sampling="daily",
        permute_data_on_training=False, extreme_values=None, extremes_on_right_tail_only=None,
        transformation=None,
        train_min_length=None, val_min_length=None, test_min_length=None,
        evaluate_bootstraps=True, number_of_bootstraps=None, create_new_bootstraps=False,
        plot_list=None):
    """Run the complete workflow from experiment setup to post-processing.

    Every argument whose name also appears in the signature of ``ExperimentSetup``
    is forwarded to it unchanged as keyword argument; the meaning of the individual
    parameters is defined there. In addition, the command line is parsed for the
    option ``--experiment_date`` and the resulting namespace is handed to
    ``ExperimentSetup`` as first positional argument.

    The stages ``ExperimentSetup``, ``PreProcessing``, ``PartitionCheck``,
    ``ModelSetup``, ``Training`` and ``PostProcessing`` are executed in this order
    inside a single ``RunEnvironment`` context.

    :param stations: station ids to use; if omitted, a new list with the six
        default stations is created for this call (an explicit ``None`` is passed
        through as is)
    """
    if stations is _DEFAULT_STATIONS:
        # Create a new list per call so that later stages can never alter the
        # default seen by subsequent calls of run().
        stations = list(_DEFAULT_STATIONS)

    # Collect only those arguments that ExperimentSetup accepts. locals() must be
    # evaluated before any further local name is introduced.
    params = inspect.getfullargspec(ExperimentSetup).args
    kwargs = {k: v for k, v in locals().items() if k in params}

    parser = argparse.ArgumentParser()
    parser.add_argument('--experiment_date', metavar='--exp_date', type=str, default="testrun",
                        help="set experiment date as string")
    args = parser.parse_args()

    with RunEnvironment():
        ExperimentSetup(args, **kwargs)
        PreProcessing()
        PartitionCheck()
        ModelSetup()
        Training()
        PostProcessing()
ExperimentSetup(RunEnvironment): sampling=sampling) self._set_param("experiment_name", exp_name) self._set_param("experiment_path", exp_path) + logging.info(f"Experiment path is: {exp_path}") path_config.check_path_and_create(self.data_store.get("experiment_path")) # set model path diff --git a/test/test_data_handling/test_bootstraps.py b/test/test_data_handling/test_bootstraps.py index 650c232314a351c148dcf906718e16e3f454a277..3d32a090e62294793bda3f2421de4f52205f427f 100644 --- a/test/test_data_handling/test_bootstraps.py +++ b/test/test_data_handling/test_bootstraps.py @@ -95,7 +95,7 @@ class TestCreateShuffledData: assert shuffled_data_clean.create_shuffled_data() is None assert caplog.record_tuples[0] == ('root', logging.INFO, "create / check shuffled bootstrap data") assert caplog.record_tuples[1] == ('root', logging.INFO, "create bootstap data for DEBW107") - assert caplog.record_tuples[5] == ('root', logging.INFO, "create bootstap data for DEBW013") + assert caplog.record_tuples[3] == ('root', logging.INFO, "create bootstap data for DEBW013") assert "DEBW107_o3_temp_hist7_nboots20_shuffled.nc" in os.listdir(data_path) assert "DEBW013_o3_temp_hist7_nboots20_shuffled.nc" in os.listdir(data_path)