diff --git a/mlair/configuration/defaults.py b/mlair/configuration/defaults.py
index d61146b61a5ade9118675fa7b895212f310acc71..0dd2a07b48a0a3789824391291978f98b216bb57 100644
--- a/mlair/configuration/defaults.py
+++ b/mlair/configuration/defaults.py
@@ -50,7 +50,8 @@ DEFAULT_BOOTSTRAP_TYPE = "singleinput"
 DEFAULT_BOOTSTRAP_METHOD = "shuffle"
 DEFAULT_PLOT_LIST = ["PlotMonthlySummary", "PlotStationMap", "PlotClimatologicalSkillScore", "PlotTimeSeries",
                      "PlotCompetitiveSkillScore", "PlotBootstrapSkillScore", "PlotConditionalQuantiles",
-                     "PlotAvailability", "PlotAvailabilityHistogram", "PlotDataHistogram", "PlotPeriodogram"]
+                     "PlotAvailability", "PlotAvailabilityHistogram", "PlotDataHistogram", "PlotPeriodogram",
+                     "PlotOversampling", "PlotContingency"]
 DEFAULT_SAMPLING = "daily"
 DEFAULT_DATA_ORIGIN = {"cloudcover": "REA", "humidity": "REA", "pblheight": "REA", "press": "REA", "relhum": "REA",
                        "temp": "REA", "totprecip": "REA", "u": "REA", "v": "REA", "no": "", "no2": "", "o3": "",
@@ -58,6 +59,11 @@ DEFAULT_DATA_ORIGIN = {"cloudcover": "REA", "humidity": "REA", "pblheight": "REA
 DEFAULT_USE_MULTIPROCESSING = True
 DEFAULT_USE_MULTIPROCESSING_ON_DEBUG = False
 DEFAULT_MAX_NUMBER_MULTIPROCESSING = 16
+DEFAULT_OVERSAMPLING_BINS = 10
+DEFAULT_OVERSAMPLING_RATES_CAP = 100
+DEFAULT_OVERSAMPLING_METHOD = None
+
+DEFAULT_EXTERNAL_WEIGHTS = None
 
 
 def get_defaults():
diff --git a/mlair/data_handler/default_data_handler.py b/mlair/data_handler/default_data_handler.py
index a17de95407a74d1504877fdce03a82d1c943e868..2ffdd49ec20c0e73c0f5ffd65176572f31f2a919 100644
--- a/mlair/data_handler/default_data_handler.py
+++ b/mlair/data_handler/default_data_handler.py
@@ -179,6 +179,49 @@ class DefaultDataHandler(AbstractDataHandler):
     def apply_transformation(self, data, base="target", dim=0, inverse=False):
         return self.id_class.apply_transformation(data, dim=dim, base=base, inverse=inverse)
 
+    def apply_oversampling(self, bin_edges, oversampling_rates, timedelta: Tuple[int, str] = (1, 's'),
+                           timedelta2: Tuple[int, str] = (1, 'ms')):
+        self._load()
+        self._X_extreme = None
+        self._Y_extreme = None
+        if (self._X is None) or (self._Y is None):
+            logging.debug(f"{str(self.id_class)} has no data for X or Y, skip oversampling")
+            return
+        Y = self._Y
+        X = self._X
+        for i_bin in range(len(bin_edges) - 1):
+            bin_start = bin_edges[i_bin]
+            if i_bin == len(bin_edges) - 2:
+                bin_end = bin_edges[i_bin + 1] + 1
+            else:
+                bin_end = bin_edges[i_bin + 1]
+            rate = oversampling_rates[i_bin]
+
+            # extract samples of the current bin based on occurrence in labels
+            other_dims = remove_items(list(Y.dims), self.time_dim)
+            extreme_idx = xr.concat([(Y >= bin_start).any(dim=other_dims[0]),
+                                     (Y < bin_end).any(dim=other_dims[0])],
+                                    dim=other_dims[0]).all(dim=other_dims[0])
+            extreme_idx = extreme_idx[extreme_idx]
+            sel = extreme_idx.coords[self.time_dim].values
+            if len(extreme_idx) > 0:
+                for i in range(np.ceil(rate).astype(int)):
+                    if rate - i < 1:
+                        rest = int(len(sel) * (rate - i)) + 1
+                        sel = np.random.choice(sel, rest, replace=False)
+                    extremes_X = list(map(lambda x: x.sel(**{self.time_dim: sel}), X))
+                    self._add_timedelta(extremes_X, self.time_dim, (i, timedelta[1]))
+                    self._add_timedelta(extremes_X, self.time_dim, (i_bin, timedelta2[1]))
+                    extremes_Y = Y.sel(**{self.time_dim: sel})
+                    extremes_Y.coords[self.time_dim] = extremes_Y.coords[self.time_dim].values \
+                        + i * np.timedelta64(*timedelta) + i_bin * np.timedelta64(*timedelta2)
+                    if (self._X_extreme is None) or (self._Y_extreme is None):
+                        self._X_extreme = extremes_X
+                        self._Y_extreme = extremes_Y
+                    else:
+                        self._X_extreme = list(map(lambda x1, x2: xr.concat([x1, x2], dim=self.time_dim),
+                                                   self._X_extreme, extremes_X))
+                        self._Y_extreme = xr.concat([self._Y_extreme, extremes_Y], dim=self.time_dim)
+        self._store(fresh_store=True)
+
     def multiply_extremes(self, extreme_values: num_or_list = 1., extremes_on_right_tail_only: bool = False,
                           timedelta: Tuple[int, str] = (1, 'm'), dim=DEFAULT_TIME_DIM):
         """
@@ -230,14 +273,16 @@ class DefaultDataHandler(AbstractDataHandler):
             else:
                 extreme_idx = xr.concat([(Y < -extr_val).any(dim=other_dims[0]),
                                          (Y > extr_val).any(dim=other_dims[0])],
-                                        dim=other_dims[1]).any(dim=other_dims[1])
+                                        dim=other_dims[0]).any(dim=other_dims[0])
 
-            extremes_X = list(map(lambda x: x.sel(**{dim: extreme_idx}), X))
+            sel = extreme_idx[extreme_idx].coords[dim].values
+            extremes_X = list(map(lambda x: x.sel(**{dim: sel}), X))
             self._add_timedelta(extremes_X, dim, timedelta)
             # extremes_X = list(map(lambda x: x.coords[dim].values + np.timedelta64(*timedelta), extremes_X))
 
             extremes_Y = Y.sel(**{dim: extreme_idx})
-            extremes_Y.coords[dim].values += np.timedelta64(*timedelta)
+            # extremes_Y.coords[dim].values += np.timedelta64(*timedelta)
+            self._add_timedelta([extremes_Y], dim, timedelta)
 
             self._Y_extreme = xr.concat([Y, extremes_Y], dim=dim)
             self._X_extreme = list(map(lambda x1, x2: xr.concat([x1, x2], dim=dim), X, extremes_X))
@@ -245,7 +290,7 @@ class DefaultDataHandler(AbstractDataHandler):
     @staticmethod
     def _add_timedelta(data, dim, timedelta):
         for d in data:
-            d.coords[dim].values += np.timedelta64(*timedelta)
+            d.coords[dim] = d.coords[dim].values + np.timedelta64(*timedelta)
 
     @classmethod
     def transformation(cls, set_stations, **kwargs):
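Aside (illustration, not part of the patch): the fractional-rate logic in `apply_oversampling` is easier to see on a plain pandas series with a `DatetimeIndex`. A bin with rate `r` contributes `floor(r) - 1` full extra copies plus one random subset covering the remaining fraction, and the duplicated timestamps are nudged by small offsets so the index stays unique, mirroring the `timedelta`/`timedelta2` trick above. All names here are hypothetical.

```python
import numpy as np
import pandas as pd

def oversample_bin(y: pd.Series, lo: float, hi: float, rate: float, i_bin: int) -> list:
    """Return extra copies of the samples in [lo, hi) so the bin ends up inflated by `rate`."""
    sel = y.index[(y >= lo) & (y < hi)].values
    extras, n_extra, i = [], rate - 1.0, 0
    while n_extra > 0 and len(sel) > 0:
        if n_extra < 1:  # fractional remainder: duplicate only a random subset
            sel = np.random.choice(sel, max(1, int(len(sel) * n_extra)), replace=False)
        dup = y.loc[sel].copy()
        # shift timestamps so the concatenated index stays unique (cf. _add_timedelta)
        dup.index = dup.index + pd.Timedelta(seconds=i + 1) + pd.Timedelta(milliseconds=i_bin)
        extras.append(dup)
        n_extra -= 1
        i += 1
    return extras

# e.g. rate=2.5 yields one full copy plus a ~50% random subset, i.e. ~2.5x the bin in total
```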
diff --git a/mlair/helpers/statistics.py b/mlair/helpers/statistics.py
index a1e713a8c135800d02ff7c27894485a5da7fae37..2f47bac598c89b3f40eee62b5010c76f54f21346 100644
--- a/mlair/helpers/statistics.py
+++ b/mlair/helpers/statistics.py
@@ -301,7 +301,7 @@ class SkillScores:
                                                              observation_name=self.observation_name)
                        for (first, second) in combinations]
         return skill_score
-
+    '''
     def climatological_skill_scores(self, internal_data: Data, forecast_name: str) -> xr.DataArray:
         """
         Calculate climatological skill scores according to Murphy (1988).
@@ -316,6 +316,47 @@ class SkillScores:
         """
         ahead_names = list(self.external_data[self.ahead_dim].data)
 
+        all_terms = ['AI', 'AII', 'AIII', 'AIV', 'BI', 'BII', 'BIV', 'CI', 'CIV', 'CASE I', 'CASE II', 'CASE III',
+                     'CASE IV']
+        skill_score = xr.DataArray(np.full((len(all_terms), len(ahead_names)), np.nan),
+                                   coords=[all_terms, ahead_names], dims=['terms', self.ahead_dim])
+
+        for iahead in ahead_names:
+            data = internal_data.sel({self.ahead_dim: iahead})
+
+            skill_score.loc[["CASE I", "AI", "BI", "CI"], iahead] = np.stack(self._climatological_skill_score(
+                data, mu_type=1, forecast_name=forecast_name, observation_name=self.observation_name).values.flatten())
+
+            skill_score.loc[["CASE II", "AII", "BII"], iahead] = np.stack(self._climatological_skill_score(
+                data, mu_type=2, forecast_name=forecast_name, observation_name=self.observation_name).values.flatten())
+
+            if self.external_data is not None and self.observation_name in self.external_data.coords["type"]:
+                external_data = self.external_data.sel({self.ahead_dim: iahead, "type": [self.observation_name]})
+                skill_score.loc[["CASE III", "AIII"], iahead] = np.stack(self._climatological_skill_score(
+                    data, mu_type=3, forecast_name=forecast_name, observation_name=self.observation_name,
+                    external_data=external_data).values.flatten())
+
+                skill_score.loc[["CASE IV", "AIV", "BIV", "CIV"], iahead] = np.stack(self._climatological_skill_score(
+                    data, mu_type=4, forecast_name=forecast_name, observation_name=self.observation_name,
+                    external_data=external_data).values.flatten())
+
+        return skill_score
+    '''
+
+    def climatological_skill_scores(self, internal_data: Data, forecast_name: str) -> xr.DataArray:
+        """
+        Calculate climatological skill scores according to Murphy (1988).
+
+        Calculate all CASES I - IV and terms [ABC][I-IV]. Internal data has to be set by initialisation, external data
+        is part of parameters.
+
+        :param internal_data: internal data
+        :param forecast_name: name of the forecast to use for this calculation (must be available in `data`)
+
+        :return: all CASES as well as all terms
+        """
+        ahead_names = list(internal_data[self.ahead_dim].data)
+
         all_terms = ['AI', 'AII', 'AIII', 'AIV', 'BI', 'BII', 'BIV', 'CI', 'CIV', 'CASE I', 'CASE II', 'CASE III',
                      'CASE IV']
         skill_score = xr.DataArray(np.full((len(all_terms), len(ahead_names)), np.nan), coords=[all_terms, ahead_names],
diff --git a/mlair/model_modules/model_class.py b/mlair/model_modules/model_class.py
index 9a0e97dbd1f3a3a52f5717c88d09702e5d0d7928..f7cae7ac640e2ab412096ff1859ed10b4c7c761b 100644
--- a/mlair/model_modules/model_class.py
+++ b/mlair/model_modules/model_class.py
@@ -364,23 +364,25 @@ class IntelliO3_ts_architecture(AbstractModelClass):
         assert len(output_shape) == 1
         super().__init__(input_shape[0], output_shape[0])
 
-        from mlair.model_modules.keras_extensions import LearningRateDecay
-
         # settings
         self.dropout_rate = .35
         self.regularizer = keras.regularizers.l2(0.01)
         self.initial_lr = 1e-4
-        self.lr_decay = LearningRateDecay(base_lr=self.initial_lr, drop=.94, epochs_drop=10)
         self.activation = keras.layers.ELU
         self.padding = "SymPad2D"
 
+        self.apply_to_model()
 
         # apply to model
+    def apply_to_model(self):
+        from mlair.model_modules.keras_extensions import LearningRateDecay
+        self.lr_decay = LearningRateDecay(base_lr=self.initial_lr, drop=.94, epochs_drop=10)
         self.set_model()
         self.set_compile_options()
-        self.set_custom_objects(loss=self.compile_options["loss"],
+        self.set_custom_objects(loss=self.compile_options["loss"][0],
                                 SymmetricPadding2D=SymmetricPadding2D,
                                 LearningRateDecay=LearningRateDecay)
 
+
     def set_model(self):
         """
         Build the model.
@@ -407,14 +409,14 @@ class IntelliO3_ts_architecture(AbstractModelClass):
         pool_settings_dict1 = {'pool_kernel': (3, 1), 'tower_filter': 16, 'activation': activation}
 
         conv_settings_dict2 = {
-            'tower_1': {'reduction_filter': 64, 'tower_filter': 32 * 2, 'tower_kernel': (3, 1),
+            'tower_1': {'reduction_filter': 64, 'tower_filter': 32 * 2 * 2, 'tower_kernel': (3, 1),
                         'activation': activation},
-            'tower_2': {'reduction_filter': 64, 'tower_filter': 32 * 2, 'tower_kernel': (5, 1),
+            'tower_2': {'reduction_filter': 64, 'tower_filter': 32 * 2 * 2, 'tower_kernel': (5, 1),
                         'activation': activation},
-            'tower_3': {'reduction_filter': 64, 'tower_filter': 32 * 2, 'tower_kernel': (1, 1),
+            'tower_3': {'reduction_filter': 64, 'tower_filter': 32 * 2 * 2, 'tower_kernel': (1, 1),
                         'activation': activation}
         }
-        pool_settings_dict2 = {'pool_kernel': (3, 1), 'tower_filter': 32, 'activation': activation}
+        pool_settings_dict2 = {'pool_kernel': (3, 1), 'tower_filter': 32 * 2, 'activation': activation}
 
         ##########################################
         inception_model = InceptionModelBase()
@@ -458,7 +460,49 @@ class IntelliO3_ts_architecture(AbstractModelClass):
 
     def set_compile_options(self):
         self.compile_options = {"optimizer": keras.optimizers.adam(lr=self.initial_lr, amsgrad=True),
-                                "loss": [l_p_loss(4), keras.losses.mean_squared_error],
+                                "loss": [keras.losses.mean_squared_error, keras.losses.mean_squared_error],
                                 "metrics": ['mse'],
                                 "loss_weights": [.01, .99]
-                                }
\ No newline at end of file
+                                }
+
+
+class IntelliO3_ts_architecture_finetune_all_dense(IntelliO3_ts_architecture):
+    def __init__(self, input_shape: list, output_shape: list):
+        super().__init__(input_shape, output_shape)
+
+        self.initial_lr = 1e-5
+        self.apply_to_model()
+        self.freeze_layers()
+        # self.lr_decay = None
+
+    def freeze_layers(self):
+        for layer in self.model.layers:
+            if not isinstance(layer, keras.layers.core.Dense):
+                layer.trainable = False
+
+
+class IntelliO3_ts_architecture_finetune_outputs(IntelliO3_ts_architecture):
+    def __init__(self, input_shape: list, output_shape: list):
+        super().__init__(input_shape, output_shape)
+
+        self.initial_lr = 1e-5
+        self.apply_to_model()
+        self.freeze_layers()
+        # self.lr_decay = None
+
+    def freeze_layers(self):
+        for layer in self.model.layers:
+            if layer.name not in ["minor_1_out_Dense", "Main_out_Dense"]:
+                layer.trainable = False
+
+
+class IntelliO3_ts_architecture_finetune_main_output(IntelliO3_ts_architecture):
+    def __init__(self, input_shape: list, output_shape: list):
+        super().__init__(input_shape, output_shape)
+
+        self.initial_lr = 1e-5
+        self.apply_to_model()
+        self.freeze_layers()
+        # self.lr_decay = None
+
+    def freeze_layers(self):
+        for layer in self.model.layers:
+            if layer.name not in ["Main_out_Dense"]:
+                layer.trainable = False
\ No newline at end of file
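Aside (illustration, not part of the patch): the three finetune subclasses differ only in which layers remain trainable. The shared pattern, sketched below with hypothetical helper names, is equivalent assuming all layers start out trainable; in Keras, `trainable` flags must be set before the model is compiled to take effect.

```python
import keras

def freeze_all_but_dense(model):
    # counterpart of IntelliO3_ts_architecture_finetune_all_dense.freeze_layers
    for layer in model.layers:
        layer.trainable = isinstance(layer, keras.layers.Dense)

def freeze_all_but_named(model, keep=("minor_1_out_Dense", "Main_out_Dense")):
    # counterpart of the two output-finetuning variants: keep only named output layers trainable
    for layer in model.layers:
        layer.trainable = layer.name in keep
```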
diff --git a/mlair/plotting/data_insight_plotting.py b/mlair/plotting/data_insight_plotting.py
index 513f64f2c174d94cb7230b141387c9a850d678cb..29693b3f1c5832a460925819da9cc7aeef66a54e 100644
--- a/mlair/plotting/data_insight_plotting.py
+++ b/mlair/plotting/data_insight_plotting.py
@@ -20,6 +20,62 @@ from mlair.data_handler import DataCollection
 from mlair.helpers import TimeTrackingWrapper, to_list, remove_items
 from mlair.plotting.abstract_plot_class import AbstractPlotClass
 
+
+@TimeTrackingWrapper
+class PlotOversampling(AbstractPlotClass):
+
+    def __init__(self, data, bin_edges, bin_edges_retransformed, oversampling_rates, plot_folder: str = ".",
+                 plot_names=["oversampling_histogram", "oversampling_density_histogram", "oversampling_rates",
+                             "oversampling_rates_deviation"]):
+
+        super().__init__(plot_folder, plot_names[0])
+
+        Y_hist, Y_extreme_hist, Y_hist_dens, Y_extreme_hist_dens = self._calculate_hist(data, bin_edges)
+        real_oversampling = Y_extreme_hist / Y_hist
+        self._plot_oversampling_histogram(Y_hist, Y_extreme_hist, bin_edges_retransformed)
+        self._save()
+        self.plot_name = plot_names[1]
+        self._plot_oversampling_histogram(Y_hist_dens, Y_extreme_hist_dens, bin_edges_retransformed)
+        self._save()
+        self.plot_name = plot_names[2]
+        self._plot_oversampling_rates(oversampling_rates, real_oversampling)
+        self._save()
+        self.plot_name = plot_names[3]
+        self._plot_oversampling_rates_deviation(oversampling_rates, real_oversampling)
+        self._save()
+
+    def _calculate_hist(self, data, bin_edges):
+        Y_hist = np.zeros(len(bin_edges) - 1)
+        Y_extreme_hist = np.zeros(len(bin_edges) - 1)
+        for station in data:
+            Y = station.get_Y(as_numpy=True, upsampling=False)
+            Y_extreme = station.get_Y(as_numpy=True, upsampling=True)
+            Y_hist = Y_hist + np.histogram(Y, bins=bin_edges)[0]
+            Y_extreme_hist = Y_extreme_hist + np.histogram(Y_extreme, bins=bin_edges)[0]
+        Y_hist_dens = Y_hist / np.sum(Y_hist)
+        Y_extreme_hist_dens = Y_extreme_hist / np.sum(Y_extreme_hist)
+        return Y_hist, Y_extreme_hist, Y_hist_dens, Y_extreme_hist_dens
+
+    def _plot_oversampling_histogram(self, Y_hist, Y_extreme_hist, bin_edges):
+        fig, ax = plt.subplots(1, 1)
+        ax.hist(bin_edges[:-1], bin_edges, weights=Y_hist, label="Before oversampling", histtype="step")
+        ax.hist(bin_edges[:-1], bin_edges, weights=Y_extreme_hist, label="After oversampling", histtype="step")
+        ax.set_title("Histogram before and after oversampling")
+        ax.legend()
+
+    def _plot_oversampling_rates(self, oversampling_rates, real_oversampling):
+        fig, ax = plt.subplots(1, 1)
+        ax.plot(range(len(real_oversampling)), oversampling_rates, label="Desired oversampling rates")
+        ax.plot(range(len(real_oversampling)), real_oversampling, label="Actual oversampling rates")
+        ax.set_title("Oversampling rates")
+        ax.legend()
+
+    def _plot_oversampling_rates_deviation(self, oversampling_rates, real_oversampling):
+        fig, ax = plt.subplots(1, 1)
+        ax.plot(range(len(real_oversampling)), real_oversampling / oversampling_rates,
+                label="Actual/Desired Rate")
+        ax.set_title("Deviation from desired oversampling rates")
+        ax.legend()
+
 
 @TimeTrackingWrapper
 class PlotStationMap(AbstractPlotClass):  # pragma: no cover
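Aside (illustration, not part of the patch): `_plot_oversampling_histogram` re-plots already-binned counts by passing one dummy sample per bin (the left edges) and the stored counts as `weights`. A standalone check of that matplotlib idiom:

```python
import numpy as np
import matplotlib.pyplot as plt

counts = np.array([5, 20, 50, 20, 5])              # pre-computed bin counts
edges = np.array([0., 10., 20., 30., 40., 50.])    # matching bin edges

fig, ax = plt.subplots()
# one dummy sample per bin (its left edge), weighted by the stored count
ax.hist(edges[:-1], bins=edges, weights=counts, histtype="step", label="pre-binned")
ax.legend()
```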
diff --git a/mlair/plotting/postprocessing_plotting.py b/mlair/plotting/postprocessing_plotting.py
index ecba8a4e0a3369fbb170a7427ef81365d531bc3b..29ef70a34ad0b1e78db15fb238e72f5c33decd07 100644
--- a/mlair/plotting/postprocessing_plotting.py
+++ b/mlair/plotting/postprocessing_plotting.py
@@ -18,7 +18,7 @@ from matplotlib.backends.backend_pdf import PdfPages
 
 from mlair import helpers
 from mlair.data_handler.iterator import DataCollection
-from mlair.helpers import TimeTrackingWrapper
+from mlair.helpers import TimeTrackingWrapper, to_list
 from mlair.plotting.abstract_plot_class import AbstractPlotClass
 
 logging.getLogger('matplotlib').setLevel(logging.WARNING)
@@ -28,6 +28,247 @@ logging.getLogger('matplotlib').setLevel(logging.WARNING)
 # matplotlib.use("TkAgg")
 # import matplotlib.pyplot as plt
 
+
+@TimeTrackingWrapper
+class PlotContingency(AbstractPlotClass):
+
+    def __init__(self, station_names, file_path, comp_path, file_name, plot_folder: str = ".", model_name: str = "nn",
+                 model_plot_name: str = "nn", obs_name: str = "obs", comp_names: str = "IntelliO3",
+                 plot_names=["contingency_gilbert_skill_score", "contingency_threat_score", "contingency_hit_rate",
+                             "contingency_false_alarm_rate", "contingency_bias", "contingency_all_scores",
+                             "contingency_table"]):
+
+        super().__init__(plot_folder, plot_names[0])
+        self._stations = station_names
+        self._file_path = file_path
+        self._comp_path = comp_path
+        self._file_name = file_name
+        self._obs_name = obs_name
+        self._model_name = model_name
+        self._model_plot_name = model_plot_name
+        self._comp_names = to_list(comp_names)
+        self._all_names = [self._model_name]
+        self._all_names.extend(self._comp_names)
+        self._plot_names = plot_names
+        self._min_threshold, self._max_threshold = self._min_max_threshold()
+        contingency_array = self._calculate_contingencies()
+        self._save_tables(contingency_array, "contingency")
+        self._scores = ["gss", "ts", "h", "f", "b"]
+        score_array = self._calculate_all_scores(contingency_array)
+        self._save_tables(score_array, "scores_contingency")
+        self._plot_counter = 0
+
+        self._plot(score_array, "gss")
+        self._save()
+        self._plot(score_array, "ts")
+        self._save()
+        self._plot(score_array, "h")
+        self._save()
+        self._plot(score_array, "f")
+        self._save()
+        self._plot(score_array, "b")
+        self._save()
+        self._plot(score_array, "all_scores")
+        self._save()
+        self._plot_contingency(contingency_array, self._model_name)
+        self._save()
+        for comp in self._comp_names:
+            self._plot_contingency(contingency_array, comp)
+            self._save()
+
+    def _save_tables(self, array, name):
+        for model in self._all_names:
+            type_array = array.sel(type=model).drop("type")
+            if name == "contingency":
+                df = pd.DataFrame({"a": type_array.sel(contingency_cell="ta").values,
+                                   "b": type_array.sel(contingency_cell="fa").values,
+                                   "c": type_array.sel(contingency_cell="fb").values,
+                                   "d": type_array.sel(contingency_cell="tb").values})
+            else:
+                df = pd.DataFrame({"gss": type_array.sel(scores="gss").values,
+                                   "ts": type_array.sel(scores="ts").values,
+                                   "h": type_array.sel(scores="h").values,
+                                   "f": type_array.sel(scores="f").values,
+                                   "b": type_array.sel(scores="b").values})
+
+            helpers.tables.save_to_tex(self.plot_folder, f"table_{name}_{model}",
+                                       helpers.tables.create_column_format_for_tex(df), df)
+
+    def _plot_contingency(self, contingency_array, type):
+        plt.plot(range(self._min_threshold, self._max_threshold),
+                 contingency_array.loc[dict(contingency_cell="ta", type=type)], label="a")
+        plt.plot(range(self._min_threshold, self._max_threshold),
+                 contingency_array.loc[dict(contingency_cell="fa", type=type)], label="b")
+        plt.plot(range(self._min_threshold, self._max_threshold),
+                 contingency_array.loc[dict(contingency_cell="fb", type=type)], label="c")
+        plt.plot(range(self._min_threshold, self._max_threshold),
+                 contingency_array.loc[dict(contingency_cell="tb", type=type)], label="d")
+        plt.title(f"contingency table {type}")
+        plt.legend()
+        self.plot_name = f"contingency_table_{type}"
+
+    def _plot(self, data, score):
+        if score == "all_scores":
+            for score_name in data.scores.values.tolist():
+                plt.plot(range(self._min_threshold, self._max_threshold),
+                         data.loc[dict(type=self._model_name, scores=score_name)], label=score_name)
+        else:
+            for type in data.type.values.tolist():
+                if type == self._model_name:
+                    plt.plot(range(self._min_threshold, self._max_threshold),
+                             data.loc[dict(type=type, scores=score)], label=self._model_plot_name)
+                else:
+                    plt.plot(range(self._min_threshold, self._max_threshold),
+                             data.loc[dict(type=type, scores=score)], label=type)
+        plt.title(self._plot_names[self._plot_counter])
+        plt.legend()
+        self.plot_name = self._plot_names[self._plot_counter]
+        self._plot_counter = self._plot_counter + 1
+
+    def _create_competitor_forecast(self, station_name: str, competitor_name: str) -> xr.DataArray:
+        """
+        Load and format the competing forecast of a distinct model indicated by `competitor_name` for a distinct
+        station indicated by `station_name`. The name of the competitor is set in the `type` axis as indicator. This
+        method will raise either a `FileNotFoundError` or `KeyError` if no competitor could be found for the given
+        station. Either there is no file provided in the expected path or no forecast for given `competitor_name` in
+        the forecast file.
+
+        :param station_name: name of the station to load data for
+        :param competitor_name: name of the model
+        :return: the forecast of the given competitor
+        """
+        path = os.path.join(self._comp_path, competitor_name)
+        file = os.path.join(path, f"forecasts_{station_name}_test.nc")
+        data = xr.open_dataarray(file)
+        # data = data.expand_dims(Stations=[station_name])  # ToDo: remove line
+        forecast = data.sel(type=[self._model_name])
+        forecast.coords["type"] = [competitor_name]
+        return forecast
+
+    def _load_competitors(self, station_name: str, comp) -> xr.DataArray:
+        """
+        Load all requested and available competitors for a given station. Forecasts must be available in the
+        competitor path like `<competitor_path>/<competitor_name>/forecasts_<station_name>_test.nc`. The naming style
+        is equal for all forecasts of MLAir, so that forecasts of a different experiment can easily be copied into
+        the competitor path without any change.
+
+        :param station_name: station indicator to load competitors for
+
+        :return: a single xarray with all competing forecasts
+        """
+        competing_predictions = []
+        for competitor_name in comp:
+            try:
+                prediction = self._create_competitor_forecast(station_name, competitor_name)
+                competing_predictions.append(prediction)
+            except (FileNotFoundError, KeyError):
+                logging.debug(f"No competitor found for combination '{station_name}' and '{competitor_name}'.")
+                continue
+        return xr.concat(competing_predictions, "type") if len(competing_predictions) > 0 else None
+
+    def _min_max_threshold(self):
+        min_threshold = 0
+        max_threshold = 0
+        for station in self._stations:
+            file = os.path.join(self._file_path, self._file_name % station)
+            forecast_file = xr.open_dataarray(file)
+            obs = forecast_file.sel(type=self._obs_name)
+            obs = obs.fillna(0)
+            min_threshold = np.minimum(min_threshold, int(np.min(obs.values.flatten())))
+            max_threshold = np.maximum(max_threshold, int(np.max(obs.values.flatten())))
+        return min_threshold, max_threshold
+
+    def _calculate_contingencies(self):
+        thresholds = np.arange(self._min_threshold, self._max_threshold)
+        contingency_cell = ["ta", "fa", "fb", "tb"]
+        contingency_array = xr.DataArray(dims=["thresholds", "contingency_cell", "type"],
+                                         coords=[thresholds, contingency_cell, self._all_names])
+        contingency_array = contingency_array.fillna(1)
+        for station in self._stations:
+            file = os.path.join(self._file_path, self._file_name % station)
+            forecast_file = xr.open_dataarray(file)
+            obs = forecast_file.sel(type=self._obs_name)
+            predictions = [forecast_file.sel(type=self._model_name)]
+            for comp in self._comp_names:
+                c = self._load_competitors(station, [comp])
+                if c is not None:
+                    predictions.append(c.sel(type=comp))
+            for threshold in range(self._min_threshold, self._max_threshold):
+                for pred in predictions:
+                    ta, fa, fb, tb = self._single_contingency(obs, pred, threshold)
+                    for cell, count in zip(contingency_cell, [ta, fa, fb, tb]):
+                        loc = dict(thresholds=threshold, contingency_cell=cell, type=pred.type.values)
+                        contingency_array.loc[loc] = contingency_array.loc[loc] + count
+        return contingency_array
+
+    def _single_contingency(self, obs, pred, threshold):
+        ta = 0
+        fa = 0
+        fb = 0
+        tb = 0
+        observations = obs.values.flatten()
+        predictions = pred.values.flatten()
+        for i in range(len(observations)):
+            if predictions[i] >= threshold:
+                if observations[i] >= threshold:
+                    ta += 1
+                else:
+                    fa += 1
+            else:
+                if observations[i] >= threshold:
+                    fb += 1
+                else:
+                    tb += 1
+        return ta, fa, fb, tb
+
+    def _calculate_all_scores(self, contingency_array):
+        score_array = xr.DataArray(dims=["scores", "thresholds", "type"],
+                                   coords=[self._scores, contingency_array.thresholds.values,
+                                           contingency_array.type.values])
+        for type in score_array.type.values.tolist():
+            for threshold in score_array.thresholds.values.tolist():
+                for score in score_array.scores.values.tolist():
+                    score_value = self._calculate_scores(contingency_array.loc[dict(type=type,
+                                                                                    thresholds=threshold)].values,
+                                                         score)
+                    score_array.loc[dict(type=type, thresholds=threshold, scores=score)] = score_value
+        return score_array
+
+    def _calculate_scores(self, contingency, score):
+        true_above = contingency[0]
+        false_above = contingency[1]
+        false_below = contingency[2]
+        true_below = contingency[3]
+        if score == "gss":
+            frequency_above_threshold = (true_above + false_below) / (true_above + false_above + false_below
+                                                                      + true_below)
+            forecasts_above_threshold = true_above + false_above
+            chance_hits = frequency_above_threshold * forecasts_above_threshold
+            if (true_above + false_above + false_below - chance_hits) == 0:
+                score_value = 1
+            else:
+                score_value = (true_above - chance_hits) / (true_above + false_above + false_below - chance_hits)
+        elif score == "ts":
+            if (true_above + false_above + false_below) == 0:
+                score_value = 1
+            else:
+                score_value = true_above / (true_above + false_above + false_below)
+        elif score == "h":
+            if (true_above + false_below) == 0:
+                score_value = 0
+            else:
+                score_value = true_above / (true_above + false_below)
+        elif score == "f":
+            if (false_above + true_below) == 0:
+                score_value = 0
+            else:
+                score_value = false_above / (false_above + true_below)
+        elif score == "b":
+            if (true_above + false_below) == 0:
+                score_value = 0
+            else:
+                score_value = (true_above + false_above) / (true_above + false_below)
+        return score_value
+
+
 @TimeTrackingWrapper
 class PlotMonthlySummary(AbstractPlotClass):
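Aside (illustrative reference, not part of the patch): `_calculate_scores` spells out the standard categorical verification scores. With hits `a`, false alarms `b`, misses `c` and correct negatives `d`, the same formulas condense to the sketch below, which omits the zero-denominator guards the patch carries.

```python
import numpy as np

def categorical_scores(a, b, c, d):
    """Standard 2x2 contingency-table scores (no zero-division guards)."""
    n = a + b + c + d
    a_random = (a + c) * (a + b) / n                     # hits expected by chance
    return {
        "gss": (a - a_random) / (a + b + c - a_random),  # Gilbert skill score (ETS)
        "ts": a / (a + b + c),                           # threat score / CSI
        "h": a / (a + c),                                # hit rate (POD)
        "f": b / (b + d),                                # false alarm rate (POFD)
        "b": (a + b) / (a + c),                          # frequency bias
    }
```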
diff --git a/mlair/run_modules/experiment_setup.py b/mlair/run_modules/experiment_setup.py
index 209859c1ff38efe2667c918aa5b79c96f2524be0..6d8bde1a9f5540ae5d304847b7239fffc21ba8d7 100644
--- a/mlair/run_modules/experiment_setup.py
+++ b/mlair/run_modules/experiment_setup.py
@@ -10,6 +10,7 @@ from dill.source import getsource
 
 from mlair.configuration import path_config
 from mlair import helpers
+from mlair.helpers import to_list
 from mlair.configuration.defaults import DEFAULT_STATIONS, DEFAULT_VAR_ALL_DICT, DEFAULT_NETWORK, DEFAULT_STATION_TYPE, \
     DEFAULT_START, DEFAULT_END, DEFAULT_WINDOW_HISTORY_SIZE, DEFAULT_OVERWRITE_LOCAL_DATA, \
     DEFAULT_HPC_LOGIN_LIST, DEFAULT_HPC_HOST_LIST, DEFAULT_CREATE_NEW_MODEL, DEFAULT_TRAIN_MODEL, \
@@ -20,8 +21,10 @@ from mlair.configuration.defaults import DEFAULT_STATIONS, DEFAULT_VAR_ALL_DICT,
     DEFAULT_VAL_MIN_LENGTH, DEFAULT_TEST_START, DEFAULT_TEST_END, DEFAULT_TEST_MIN_LENGTH, DEFAULT_TRAIN_VAL_MIN_LENGTH, \
     DEFAULT_USE_ALL_STATIONS_ON_ALL_DATA_SETS, DEFAULT_EVALUATE_BOOTSTRAPS, DEFAULT_CREATE_NEW_BOOTSTRAPS, \
     DEFAULT_NUMBER_OF_BOOTSTRAPS, DEFAULT_PLOT_LIST, DEFAULT_SAMPLING, DEFAULT_DATA_ORIGIN, DEFAULT_ITER_DIM, \
-    DEFAULT_USE_MULTIPROCESSING, DEFAULT_USE_MULTIPROCESSING_ON_DEBUG, DEFAULT_MAX_NUMBER_MULTIPROCESSING, \
-    DEFAULT_BOOTSTRAP_TYPE, DEFAULT_BOOTSTRAP_METHOD
+    DEFAULT_USE_MULTIPROCESSING, DEFAULT_USE_MULTIPROCESSING_ON_DEBUG, DEFAULT_OVERSAMPLING_BINS, \
+    DEFAULT_OVERSAMPLING_RATES_CAP, DEFAULT_OVERSAMPLING_METHOD, \
+    DEFAULT_MAX_NUMBER_MULTIPROCESSING, \
+    DEFAULT_BOOTSTRAP_TYPE, DEFAULT_BOOTSTRAP_METHOD, DEFAULT_EXTERNAL_WEIGHTS
 from mlair.data_handler import DefaultDataHandler
 from mlair.run_modules.run_environment import RunEnvironment
 from mlair.model_modules.fully_connected_networks import FCN_64_32_16 as VanillaModel
@@ -185,6 +188,9 @@ class ExperimentSetup(RunEnvironment):
     :param use_multiprocessing: Enable parallel preprocessing (postprocessing not implemented yet) by setting this
         parameter to `True` (default). If set to `False` the computation is performed in a serial approach.
         Multiprocessing is disabled when running in debug mode and cannot be switched on.
+    :param oversampling_bins: Sets the number of classes into which the training data is split. The training samples
+        are then oversampled according to the frequency of the different classes.
+    :param oversampling_rates_cap: Sets the maximum oversampling rate that is applied to a class.
 
     """
 
@@ -218,7 +224,9 @@ class ExperimentSetup(RunEnvironment):
                  hpc_hosts=None, model=None, batch_size=None, epochs=None, data_handler=None,
                  data_origin: Dict = None, competitors: list = None, competitor_path: str = None,
                  use_multiprocessing: bool = None, use_multiprocessing_on_debug: bool = None,
-                 max_number_multiprocessing: int = None, start_script: Union[Callable, str] = None, **kwargs):
+                 max_number_multiprocessing: int = None, start_script: Union[Callable, str] = None,
+                 oversampling_bins=None, oversampling_rates_cap=None, oversampling_method=None,
+                 external_weights=None, **kwargs):
 
         # create run framework
         super().__init__()
@@ -236,15 +244,23 @@ class ExperimentSetup(RunEnvironment):
         self._set_param("bootstrap_path", bootstrap_path)
         self._set_param("train_model", train_model, default=DEFAULT_TRAIN_MODEL)
         self._set_param("fraction_of_training", fraction_of_train, default=DEFAULT_FRACTION_OF_TRAINING)
+        self._set_param("batch_size", batch_size, default=DEFAULT_BATCH_SIZE)
+        self._set_param("epochs", epochs, default=DEFAULT_EPOCHS)
+
+        # set params for oversampling
+        self._set_param("oversampling_bins", oversampling_bins, default=DEFAULT_OVERSAMPLING_BINS)
+        self._set_param("oversampling_rates_cap", oversampling_rates_cap, default=DEFAULT_OVERSAMPLING_RATES_CAP)
+        self._set_param("oversampling_method", oversampling_method, default=DEFAULT_OVERSAMPLING_METHOD)
         self._set_param("extreme_values", extreme_values, default=DEFAULT_EXTREME_VALUES, scope="train")
         self._set_param("extremes_on_right_tail_only", extremes_on_right_tail_only,
                         default=DEFAULT_EXTREMES_ON_RIGHT_TAIL_ONLY, scope="train")
-        self._set_param("upsampling", extreme_values is not None, scope="train")
-        upsampling = self.data_store.get("upsampling", "train")
+        upsampling = (extreme_values is not None) or (oversampling_method is not None)
+        self._set_param("upsampling", upsampling, scope="train")
         permute_data = DEFAULT_PERMUTE_DATA if permute_data_on_training is None else permute_data_on_training
         self._set_param("permute_data", permute_data or upsampling, scope="train")
-        self._set_param("batch_size", batch_size, default=DEFAULT_BATCH_SIZE)
-        self._set_param("epochs", epochs, default=DEFAULT_EPOCHS)
+        if (extreme_values is not None) and (oversampling_method is not None):
+            logging.info("Parameters extreme_values and oversampling_method are set. In this case only "
+                         "oversampling_method is used.")
 
         # set experiment name
         sampling = self._set_param("sampling", sampling, default=DEFAULT_SAMPLING)  # always related to output sampling
@@ -272,6 +288,7 @@ class ExperimentSetup(RunEnvironment):
         # set model path
         self._set_param("model_path", None, os.path.join(experiment_path, "model"))
         path_config.check_path_and_create(self.data_store.get("model_path"))
+        self._set_param("external_weights", external_weights, default=DEFAULT_EXTERNAL_WEIGHTS)
 
         # set plot path
         default_plot_path = os.path.join(experiment_path, "plots")
@@ -357,7 +374,7 @@ class ExperimentSetup(RunEnvironment):
         # set competitors
         self._set_param("competitors", competitors, default=[])
         competitor_path_default = os.path.join(self.data_store.get("data_path"), "competitors",
-                                               "_".join(self.data_store.get("target_var")))
+                                               "_".join(to_list(self.data_store.get("target_var"))))
         self._set_param("competitor_path", competitor_path, default=competitor_path_default)
 
         # check variables, statistics and target variable
diff --git a/mlair/run_modules/model_setup.py b/mlair/run_modules/model_setup.py
index 83f4a2bd96314d6f8c53f8cc9407cbc12e7b9a16..28a4e99abe83102e632ec98000ae9233362dbe73 100644
--- a/mlair/run_modules/model_setup.py
+++ b/mlair/run_modules/model_setup.py
@@ -83,7 +83,10 @@ class ModelSetup(RunEnvironment):
             self.plot_model()
 
         # load weights if no training shall be performed
-        if not self._train_model and not self._create_new_model:
+        external_weights = self.data_store.get("external_weights")
+        if external_weights is not None:
+            self.load_weights(external_weights)
+        elif not self._train_model and not self._create_new_model:
             self.load_weights()
 
         # create checkpoint
@@ -131,11 +134,13 @@ class ModelSetup(RunEnvironment):
                                         save_best_only=True, mode='auto')
         self.data_store.set("callbacks", callbacks, self.scope)
 
-    def load_weights(self):
+    def load_weights(self, external_weight=None):
         """Try to load weights from existing model or skip if not possible."""
+        if external_weight is None:
+            external_weight = self.model_name
         try:
-            self.model.load_weights(self.model_name)
-            logging.info(f"reload weights from model {self.model_name} ...")
+            self.model.load_weights(external_weight)
+            logging.info(f"reload weights from model {external_weight} ...")
         except OSError:
             logging.info('no weights to reload...')
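Aside (usage sketch, not part of the patch): the new `external_weights` parameter threads from `ExperimentSetup` through to `ModelSetup.load_weights`. A minimal invocation might look as follows; the `.h5` path is a placeholder, not a file shipped with MLAir.

```python
from mlair.workflows import DefaultWorkflow

# start training from pretrained weights instead of the experiment's own checkpoint
workflow = DefaultWorkflow(stations=["DEBW013"], train_model=True, create_new_model=False,
                           external_weights="/path/to/pretrained_model-best.h5")
workflow.run()
```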
logging.debug(f"No competitor found for combination '{station_name}' and '{competitor_name}'.") continue - return xr.concat(competing_predictions, "type") if len(competing_predictions) > 0 else None + if len(competing_predictions) == 0: + return None + else: + comp_array = xr.concat(competing_predictions, "type") + if len(comp_array.coords[self.ahead_dim]) == self.window_lead_time: + return comp_array + else: + raise ValueError(f"Ahead dimensions of competitors do not match model." + f" Competitor ahead: {len(comp_array.coords[self.ahead_dim])}" + f" but window_lead_time is {self.window_lead_time}.") def bootstrap_postprocessing(self, create_new_bootstraps: bool, _iter: int = 0, bootstrap_type="singleinput", bootstrap_method="shuffle") -> None: @@ -291,11 +301,16 @@ class PostProcessing(RunEnvironment): :return: the model """ + external_weights = self.data_store.get("external_weights") try: model = self.data_store.get("best_model") except NameNotFoundInDataStore: logging.info("No model was saved in data store. Try to load model from experiment path.") - model_name = self.data_store.get("model_name", "model") + if external_weights is not None: + logging.info("load model from external_weights path") + model_name = external_weights + else: + model_name = self.data_store.get("model_name", "model") model_class: AbstractModelClass = self.data_store.get("model", "model") model = keras.models.load_model(model_name, custom_objects=model_class.custom_objects) return model @@ -329,6 +344,24 @@ class PostProcessing(RunEnvironment): target_dim = self.data_store.get("target_dim") iter_dim = self.data_store.get("iter_dim") + try: + if ("PlotContingency" in plot_list): + PlotContingency(station_names=self.test_data.keys(), file_path=path, + comp_path=self.competitor_path, comp_names=self.competitors, + file_name=r"forecasts_%s_test.nc", plot_folder=self.plot_path) + except Exception as e: + logging.error(f"Could not create plot OversamplingContingencyPlots due to the following error: {e}") + + try: + if (self.data_store.get('oversampling_method')=='bin_oversampling') and ( + "PlotOversampling" in plot_list): + bin_edges = self.data_store.get('oversampling_bin_edges') + bin_edges_retransformed = self.data_store.get('oversampling_bin_edges_retransformed') + oversampling_rates = self.data_store.get('oversampling_rates_capped', 'train') + PlotOversampling(self.train_data, bin_edges, bin_edges_retransformed, oversampling_rates, plot_folder=self.plot_path) + except Exception as e: + logging.error(f"Could not create plot OversamplingPlots due to the following error: {e}") + try: if ("filter" in self.test_data[0].get_X(as_numpy=False)[0].coords) and ( "PlotSeparationOfScales" in plot_list): diff --git a/mlair/run_modules/pre_processing.py b/mlair/run_modules/pre_processing.py index 08bff85c9c1fe06111ddb47e7a3952404e05c0ac..ef6f32552c5bc20107755d1f0fa5eff0f4171441 100644 --- a/mlair/run_modules/pre_processing.py +++ b/mlair/run_modules/pre_processing.py @@ -8,10 +8,14 @@ import os import traceback from typing import Tuple import multiprocessing + +import numpy as np import requests import psutil import pandas as pd +import xarray as xr +from matplotlib import pyplot as plt from mlair.data_handler import DataCollection, AbstractDataHandler from mlair.helpers import TimeTracking, to_list, tables @@ -65,9 +69,51 @@ class PreProcessing(RunEnvironment): raise ValueError("Couldn't find any valid data according to given parameters. 
Abort experiment run.") self.data_store.set("stations", valid_stations) self.split_train_val_test() + if self.data_store.get('oversampling_method')=='bin_oversampling': + logging.debug("Apply Oversampling") + self.apply_oversampling() + else: + logging.debug("No Oversampling") self.report_pre_processing() self.prepare_competitors() + def apply_oversampling(self): + data = self.data_store.get('data_collection', 'train') + bins = self.data_store.get('oversampling_bins') + rates_cap = self.data_store.get('oversampling_rates_cap') + histogram = np.array(bins) + #get min and max of the whole data + total_min = 0 + total_max = 0 + for station in data: + total_min = np.minimum(np.amin(station.get_Y(as_numpy=True)), total_min) + total_max = np.maximum(np.amax(station.get_Y(as_numpy=True)), total_max) + bin_edges = [] + for station in data: + # Create histogram for each station + hist, bin_edges = np.histogram(station.get_Y(as_numpy=True), bins=bins, range=(total_min, total_max)) + # Add up histograms + histogram = histogram + hist + # Scale down to most frequent class=1 + histogram = 1/np.amax(histogram) * histogram + # Get Oversampling rates (with and without cap) + oversampling_rates = 1 / histogram + oversampling_rates_capped = np.minimum(oversampling_rates, rates_cap) + # Get transformer variables + o3_mean = self.data_store.get("transformation")[0]["o3"]["mean"].values + o3_std = self.data_store.get("transformation")[0]["o3"]["std"].values + bin_edges_retransformed = np.floor(bin_edges*o3_std+o3_mean) + # Add to datastore + self.data_store.set('oversampling_rates', oversampling_rates, 'train') + self.data_store.set('oversampling_rates_capped', oversampling_rates_capped, 'train') + self.data_store.set('oversampling_bin_edges', bin_edges) + self.data_store.set('oversampling_bin_edges_retransformed', bin_edges_retransformed) + #Y = None + #Y_extreme = None + for station in data: + station.apply_oversampling(bin_edges, oversampling_rates_capped) + + def report_pre_processing(self): """Log some metrics on data and create latex report.""" logging.debug(20 * '##') diff --git a/package_licenses.md b/package_licenses.md new file mode 100644 index 0000000000000000000000000000000000000000..007e63bb54bf1ef38b05c6cd05e878620ec0e447 --- /dev/null +++ b/package_licenses.md @@ -0,0 +1,78 @@ +Package | License | Link +---|---|--- +absl-py==0.11.0|Apache 2.0|(https://pypi.org/project/absl-py/) +appdirs==1.4.4|MIT|(https://pypi.org/project/appdirs/) +astor==0.8.1|BSD 3-Clause|(https://pypi.org/project/astor/) +astropy==4.1|BSD 3-Clause|(https://pypi.org/project/astropy/) +attrs==20.3.0|MIT|(https://pypi.org/project/attrs/) +bottleneck==1.3.2|BSD Simplified|(https://pypi.org/project/Bottleneck/) +cached-property==1.5.2|BSD|(https://pypi.org/project/cached-property/) +certifi==2020.12.5|MPL 2.0|(https://pypi.org/project/certifi/) +cftime==1.4.1|MIT|(https://pypi.org/project/cftime/) +chardet==4.0.0|LPGL|(https://pypi.org/project/chardet/) +coverage==5.4|Apache 2.0|(https://pypi.org/project/coverage/) +cycler==0.10.0|BSD|(https://pypi.org/project/Cycler/) +dask==2021.2.0|BSD|(https://pypi.org/project/dask/) +dill==0.3.3|BSD 3-Clause|(https://pypi.org/project/dill/) +fsspec==0.8.5|BSD|(https://pypi.org/project/fsspec/) +gast==0.4.0|BSD 3-Clause|(https://pypi.org/project/gast/) +grpcio==1.35.0|Apache 2.0|(https://pypi.org/project/grpcio/) +h5py==2.10.0|BSD|(https://pypi.org/project/h5py/) +idna==2.10|BSD|(https://pypi.org/project/idna/) +importlib-metadata==3.4.0|Apache|(https://pypi.org/project/importlib-metadata/) 
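Aside (illustration, not part of the patch): the rate computation in `apply_oversampling` condenses to a few numpy lines, shown below with hypothetical names. The pooled label histogram is normalised so the most frequent bin is 1, inverted to get a rate per bin, and capped; the bin edges are mapped back to physical units via the stored mean/std.

```python
import numpy as np

def oversampling_rates(y_all, bins, rates_cap, mean=0.0, std=1.0):
    """Pooled histogram -> normalise by the most frequent bin -> invert -> cap."""
    hist, bin_edges = np.histogram(y_all, bins=bins)
    relative = hist / hist.max()                   # most frequent class -> 1
    # note: empty bins (hist == 0) would need guarding in real data
    rates = np.minimum(1.0 / relative, rates_cap)  # rare classes get large, capped rates
    edges_retransformed = np.floor(bin_edges * std + mean)  # back to physical units
    return rates, bin_edges, edges_retransformed
```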
diff --git a/package_licenses.md b/package_licenses.md
new file mode 100644
index 0000000000000000000000000000000000000000..007e63bb54bf1ef38b05c6cd05e878620ec0e447
--- /dev/null
+++ b/package_licenses.md
@@ -0,0 +1,78 @@
+Package | License | Link
+---|---|---
+absl-py==0.11.0|Apache 2.0|(https://pypi.org/project/absl-py/)
+appdirs==1.4.4|MIT|(https://pypi.org/project/appdirs/)
+astor==0.8.1|BSD 3-Clause|(https://pypi.org/project/astor/)
+astropy==4.1|BSD 3-Clause|(https://pypi.org/project/astropy/)
+attrs==20.3.0|MIT|(https://pypi.org/project/attrs/)
+bottleneck==1.3.2|BSD Simplified|(https://pypi.org/project/Bottleneck/)
+cached-property==1.5.2|BSD|(https://pypi.org/project/cached-property/)
+certifi==2020.12.5|MPL 2.0|(https://pypi.org/project/certifi/)
+cftime==1.4.1|MIT|(https://pypi.org/project/cftime/)
+chardet==4.0.0|LGPL|(https://pypi.org/project/chardet/)
+coverage==5.4|Apache 2.0|(https://pypi.org/project/coverage/)
+cycler==0.10.0|BSD|(https://pypi.org/project/Cycler/)
+dask==2021.2.0|BSD|(https://pypi.org/project/dask/)
+dill==0.3.3|BSD 3-Clause|(https://pypi.org/project/dill/)
+fsspec==0.8.5|BSD|(https://pypi.org/project/fsspec/)
+gast==0.4.0|BSD 3-Clause|(https://pypi.org/project/gast/)
+grpcio==1.35.0|Apache 2.0|(https://pypi.org/project/grpcio/)
+h5py==2.10.0|BSD|(https://pypi.org/project/h5py/)
+idna==2.10|BSD|(https://pypi.org/project/idna/)
+importlib-metadata==3.4.0|Apache|(https://pypi.org/project/importlib-metadata/)
+iniconfig==1.1.1|MIT|(https://pypi.org/project/iniconfig/)
+Keras==2.2.4|MIT|(https://pypi.org/project/keras/)
+Keras-Applications==1.0.8|MIT|(https://pypi.org/project/Keras-Applications/)
+Keras-Preprocessing==1.1.2|MIT|(https://pypi.org/project/Keras-Preprocessing/)
+kiwisolver==1.3.1|BSD|(https://pypi.org/project/kiwisolver/)
+locket==0.2.1|BSD 2-Clause|(https://pypi.org/project/locket/)
+Markdown==3.3.3|BSD|(https://pypi.org/project/Markdown/)
+matplotlib==3.3.4|PSF|(https://pypi.org/project/matplotlib/)
+mock==4.0.3|BSD|(https://pypi.org/project/mock/)
+netCDF4==1.5.5.1|MIT|(https://pypi.org/project/netCDF4/)
+numpy==1.19.5|BSD|(https://pypi.org/project/numpy/)
+ordered-set==4.0.2|MIT|(https://pypi.org/project/ordered-set/)
+packaging==20.9|BSD 2-Clause or Apache 2.0|(https://pypi.org/project/packaging/)
+pandas==1.1.5|BSD|(https://pypi.org/project/pandas/)
+partd==1.1.0|BSD|(https://pypi.org/project/partd/)
+patsy==0.5.1|BSD 2-Clause|(https://pypi.org/project/patsy/)
+Pillow==8.1.0|HPND|(https://pypi.org/project/Pillow/)
+pluggy==0.13.1|MIT|(https://pypi.org/project/pluggy/)
+protobuf==3.15.0|BSD 3-Clause|(https://pypi.org/project/protobuf/)
+psutil==5.8.0|BSD|(https://pypi.org/project/psutil/)
+py==1.10.0|MIT|(https://pypi.org/project/py/)
+pydot==1.4.2|MIT|(https://pypi.org/project/pydot/)
+pyparsing==2.4.7|MIT|(https://pypi.org/project/pyparsing/)
+pyshp==2.1.3|MIT|(https://pypi.org/project/pyshp/)
+pytest==6.2.2|MIT|(https://pypi.org/project/pytest/)
+pytest-cov==2.11.1|MIT|(https://pypi.org/project/pytest-cov/)
+pytest-html==3.1.1|MPL 2.0|(https://pypi.org/project/pytest-html/)
+pytest-lazy-fixture==0.6.3|MIT|(https://pypi.org/project/pytest-lazy-fixture/)
+pytest-metadata==1.11.0|MPL 2.0|(https://pypi.org/project/pytest-metadata/)
+pytest-sugar==0.9.4|BSD|(https://pypi.org/project/pytest-sugar/)
+python-dateutil==2.8.1|Apache or BSD|(https://pypi.org/project/python-dateutil/)
+pytz==2021.1|MIT|(https://pypi.org/project/pytz/)
+PyYAML==5.4.1|MIT|(https://pypi.org/project/PyYAML/)
+requests==2.25.1|Apache 2.0|(https://pypi.org/project/requests/)
+scipy==1.5.4|BSD|(https://pypi.org/project/scipy/)
+seaborn==0.11.1|BSD 3-Clause|(https://pypi.org/project/seaborn/)
+six==1.15.0|MIT|(https://pypi.org/project/six/)
+statsmodels==0.12.2|BSD|(https://pypi.org/project/statsmodels/)
+tabulate==0.8.8|MIT|(https://pypi.org/project/tabulate/)
+tensorboard==1.13.1|Apache 2.0|(https://pypi.org/project/tensorboard/)
+tensorflow==1.13.1|Apache 2.0|(https://pypi.org/project/tensorflow/)
+tensorflow-estimator==1.13.0|Apache 2.0|(https://pypi.org/project/tensorflow-estimator/)
+termcolor==1.1.0|MIT|(https://pypi.org/project/termcolor/)
+toml==0.10.2|MIT|(https://pypi.org/project/toml/)
+toolz==0.11.1|BSD|(https://pypi.org/project/toolz/)
+typing-extensions==3.7.4.3|PSF|(https://pypi.org/project/typing-extensions/)
+urllib3==1.26.3|MIT|(https://pypi.org/project/urllib3/)
+Werkzeug==1.0.1|BSD 3-Clause|(https://pypi.org/project/Werkzeug/)
+wget==3.2|Public Domain|(https://pypi.org/project/wget/)
+xarray==0.16.2|Apache|(https://pypi.org/project/xarray/)
+zipp==3.4.0|MIT|(https://pypi.org/project/zipp/)
+shapely==1.7.0|BSD|(https://pypi.org/project/Shapely/)
+Cartopy==0.18.0|LGPLv3+|(https://pypi.org/project/Cartopy/)
+
+## Different Licenses
+Apache, Apache 2.0, BSD, BSD Simplified, BSD 2-Clause, BSD 3-Clause, HPND, LGPL, LGPLv3+, MIT, MPL 2.0, PSF,
+Public Domain
diff --git a/run.py b/run.py
index eb703e11cf9a3028dda0368ac3f7b7ca1578bd2a..bd93db698c55bc8bae49c5d39a85f9d26cc49780 100644
--- a/run.py
+++ b/run.py
@@ -25,6 +25,7 @@ def main(parser_args):
         # stations=["DEBW087","DEBW013", "DEBW107", "DEBW076"],
         stations=["DEBW013", "DEBW087", "DEBW107", "DEBW076"],
         train_model=False, create_new_model=True, network="UBA",
+        oversampling_method="bin_oversampling", oversampling_bins=10, oversampling_rates_cap=100,
         window_lead_time=2, evaluate_bootstraps=False,  # plot_list=["PlotCompetitiveSkillScore"],
         competitors=["test_model", "test_model2"],
         competitor_path=os.path.join(os.getcwd(), "data", "comp_test"),
diff --git a/run_two_phase.py b/run_two_phase.py
new file mode 100644
index 0000000000000000000000000000000000000000..421c050e9813a02f71d8a2943b27226691691ecc
--- /dev/null
+++ b/run_two_phase.py
@@ -0,0 +1,51 @@
+__author__ = "Lukas Leufen"
+__date__ = '2020-06-29'
+
+import argparse
+from mlair.workflows import DefaultWorkflowHPC
+from mlair.helpers import remove_items
+from mlair.configuration.defaults import DEFAULT_PLOT_LIST
+from mlair.model_modules.model_class import IntelliO3_ts_architecture, IntelliO3_ts_architecture_freeze
+import os
+
+
+def load_stations(external_station_list=None):
+    import json
+    if external_station_list is None:
+        external_station_list = 'supplement/station_list_north_german_plain_rural.json'
+    try:
+        filename = external_station_list
+        with open(filename, 'r') as jfile:
+            stations = json.load(jfile)
+    except FileNotFoundError:
+        stations = None
+    return stations
+
+
+# open questions:
+# 1. How to load an existing model?
+#    https://www.tensorflow.org/tutorials/images/transfer_learning
+# 2. How many epochs?
+# 3. Full data set?
+# 4. Lower learning rate?
+
+
+def main(parser_args):
+    plots = remove_items(DEFAULT_PLOT_LIST, "PlotConditionalQuantiles")
+    workflow = DefaultWorkflowHPC(  # stations=load_stations('supplement/German_background_stations.json'),
+        stations=["DEBW013", "DEBW087", "DEBW107", "DEBW076"],
+        epochs=50, external_weights="/home/vincentgramlich/mlair/data/weights/test_weight.h5",
+        train_model=True, create_new_model=False, network="UBA",
+        model=IntelliO3_ts_architecture_freeze,
+        evaluate_bootstraps=False,  # plot_list=["PlotCompetitiveSkillScore"],
+        # competitors=["test_model", "test_model2"],
+        competitor_path=os.path.join(os.getcwd(), "data", "comp_test"),
+        window_lead_time=1,  # oversampling_bins=10, oversampling_rates_cap=100,
+        **parser_args.__dict__)
+    workflow.run()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--experiment_date', metavar='--exp_date', type=str, default="testrun",
+                        help="set experiment date as string")
+    args = parser.parse_args()
+    main(args)
diff --git a/run_with_finetuning.py b/run_with_finetuning.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc334a5cf111f3343ebf7d87ef7b8d4746febdf8
--- /dev/null
+++ b/run_with_finetuning.py
@@ -0,0 +1,46 @@
+__author__ = "Lukas Leufen"
+__date__ = '2020-06-29'
+
+import argparse
+from mlair.workflows import DefaultWorkflow
+from mlair.helpers import remove_items
+from mlair.configuration.defaults import DEFAULT_PLOT_LIST
+from mlair.model_modules.model_class import IntelliO3_ts_architecture, IntelliO3_ts_architecture_finetune_all_dense, \
+    IntelliO3_ts_architecture_finetune_outputs, IntelliO3_ts_architecture_finetune_main_output
+import os
+
+
+def load_stations():
+    import json
+    try:
+        filename = 'supplement/station_list_north_german_plain_rural.json'
+        with open(filename, 'r') as jfile:
+            stations = json.load(jfile)
+    except FileNotFoundError:
+        stations = None
+    return stations
+
+
+def main(parser_args):
+    plots = remove_items(DEFAULT_PLOT_LIST, "PlotConditionalQuantiles")
+    workflow = DefaultWorkflow(  # stations=load_stations(),
+        stations=["DEBW087", "DEBW013", "DEBW107", "DEBW076"],
+        # stations=["DEBW013", "DEBW087"],
+        epochs=1, external_weights="/home/vincentgramlich/mlair/data/weights/testrun_network_daily_model-best.h5",
+        train_model=True, create_new_model=True, network="UBA",
+        model=IntelliO3_ts_architecture_finetune_all_dense,
+        window_lead_time=1,
+        # oversampling_method="bin_oversampling", oversampling_bins=10, oversampling_rates_cap=100, window_lead_time=2,
+        evaluate_bootstraps=False, plot_list=["PlotContingency"],
+        competitors=["withoutfinetuning"],
+        competitor_path=os.path.join(os.getcwd(), "data", "competitors", "o3"),
+        **parser_args.__dict__, start_script=__file__)
+    workflow.run()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--experiment_date', metavar='--exp_date', type=str, default="testrun",
+                        help="set experiment date as string")
+    args = parser.parse_args()
+    main(args)
diff --git a/run_with_oversampling.py b/run_with_oversampling.py
new file mode 100644
index 0000000000000000000000000000000000000000..781885f0f1d7e55473abf1b745bb4e4651ce36ee
--- /dev/null
+++ b/run_with_oversampling.py
@@ -0,0 +1,46 @@
+__author__ = "Lukas Leufen"
+__date__ = '2020-06-29'
+
+import argparse
+from mlair.workflows import DefaultWorkflowHPC
+from mlair.helpers import remove_items
+from mlair.configuration.defaults import DEFAULT_PLOT_LIST
+from mlair.model_modules.model_class import IntelliO3_ts_architecture
+import os
+
+
+def load_stations(external_station_list=None):
+    import json
+    if external_station_list is None:
+        external_station_list = 'supplement/station_list_north_german_plain_rural.json'
+    try:
+        filename = external_station_list
+        with open(filename, 'r') as jfile:
+            stations = json.load(jfile)
+    except FileNotFoundError:
+        stations = None
+    return stations
+
+
+def main(parser_args):
+    plots = remove_items(DEFAULT_PLOT_LIST, "PlotConditionalQuantiles")
+    workflow = DefaultWorkflowHPC(  # stations=load_stations('supplement/German_background_stations.json'),
+        stations=["DEBW013", "DEBW087", "DEBW107", "DEBW076"],
+        epochs=1,
+        train_model=True, create_new_model=True, network="UBA",
+        # model=IntelliO3_ts_architecture,
+        oversampling_method="bin_oversampling",
+        evaluate_bootstraps=False, plot_list=["PlotOversampling", "PlotContingency"],
+        competitors=["intellitest"],
+        # competitor_path="/p/project/deepacf/intelliaq/gramlich1/mlair/competitors/o3",
+        window_lead_time=1, oversampling_bins=2, oversampling_rates_cap=2,
+        **parser_args.__dict__)
+    workflow.run()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--experiment_date', metavar='--exp_date', type=str, default="testrun",
+                        help="set experiment date as string")
+    args = parser.parse_args()
+    main(args)
diff --git a/run_without_finetuning.py b/run_without_finetuning.py
new file mode 100644
index 0000000000000000000000000000000000000000..61644a33195f8c5124b61f164c4e5831081ce0fb
--- /dev/null
+++ b/run_without_finetuning.py
@@ -0,0 +1,45 @@
+__author__ = "Lukas Leufen"
+__date__ = '2020-06-29'
+
+import argparse
+from mlair.workflows import DefaultWorkflow
+from mlair.helpers import remove_items
+from mlair.configuration.defaults import DEFAULT_PLOT_LIST
+from mlair.model_modules.model_class import IntelliO3_ts_architecture, IntelliO3_ts_architecture_freeze
+import os
+
+
+def load_stations():
+    import json
+    try:
+        filename = 'supplement/station_list_north_german_plain_rural.json'
+        with open(filename, 'r') as jfile:
+            stations = json.load(jfile)
+    except FileNotFoundError:
+        stations = None
+    return stations
+
+
+def main(parser_args):
+    plots = remove_items(DEFAULT_PLOT_LIST, "PlotConditionalQuantiles")
+    workflow = DefaultWorkflow(  # stations=load_stations(),
+        stations=["DEBW087", "DEBW013", "DEBW107", "DEBW076"],
+        # stations=["DEBW013", "DEBW087"],
+        epochs=5,  # external_weights="/home/vincentgramlich/mlair/data/weights/test_weight.h5",
+        train_model=True, create_new_model=True, network="UBA",
+        model=IntelliO3_ts_architecture,
+        window_lead_time=1,
+        # oversampling_method="bin_oversampling", oversampling_bins=10, oversampling_rates_cap=100, window_lead_time=2,
+        evaluate_bootstraps=False, plot_list=plots,
+        # competitors=["testcompetitor", "testcompetitor2"],
+        competitor_path=os.path.join(os.getcwd(), "data", "competitors"),
+        **parser_args.__dict__, start_script=__file__)
+    workflow.run()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--experiment_date', metavar='--exp_date', type=str, default="testrun",
+                        help="set experiment date as string")
+    args = parser.parse_args()
+    main(args)
\ No newline at end of file
diff --git a/run_without_oversampling.py b/run_without_oversampling.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1714e685abaf12d71522916c7e1daebb836d9f9
--- /dev/null
+++ b/run_without_oversampling.py
@@ -0,0 +1,45 @@
+__author__ = "Lukas Leufen"
+__date__ = '2020-06-29'
+
+import argparse
+from mlair.workflows import DefaultWorkflowHPC
+from mlair.helpers import remove_items
+from mlair.configuration.defaults import DEFAULT_PLOT_LIST
+from mlair.model_modules.model_class import IntelliO3_ts_architecture
+import os
+
+
+def load_stations(external_station_list=None):
+    import json
+    if external_station_list is None:
+        external_station_list = 'supplement/station_list_north_german_plain_rural.json'
+    try:
+        filename = external_station_list
+        with open(filename, 'r') as jfile:
+            stations = json.load(jfile)
+    except FileNotFoundError:
+        stations = None
+    return stations
+
+
+def main(parser_args):
+    plots = remove_items(DEFAULT_PLOT_LIST, "PlotConditionalQuantiles")
+    workflow = DefaultWorkflowHPC(  # stations=load_stations('supplement/German_background_stations.json'),
+        stations=["DEBW013", "DEBW087", "DEBW107", "DEBW076"],
+        epochs=1,
+        train_model=True, create_new_model=True, network="UBA",
+        model=IntelliO3_ts_architecture,
+        evaluate_bootstraps=False,  # plot_list=["PlotCompetitiveSkillScore"],
+        # competitors=["test_model", "test_model2"],
+        competitor_path=os.path.join(os.getcwd(), "data", "comp_test"),
+        window_lead_time=1,  # oversampling_bins=10, oversampling_rates_cap=100,
+        **parser_args.__dict__)
+    workflow.run()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--experiment_date', metavar='--exp_date', type=str, default="testrun",
+                        help="set experiment date as string")
+    args = parser.parse_args()
+    main(args)
diff --git a/test/test_configuration/test_defaults.py b/test/test_configuration/test_defaults.py
index b6bdd9556f73ff711003b01c3a2b65a1c20c66d3..97d80eb2e1192c3cc41d2af11cdec1f5fc8a0ca2 100644
--- a/test/test_configuration/test_defaults.py
+++ b/test/test_configuration/test_defaults.py
@@ -68,4 +68,5 @@ class TestAllDefaults:
         assert DEFAULT_PLOT_LIST == ["PlotMonthlySummary", "PlotStationMap", "PlotClimatologicalSkillScore",
                                      "PlotTimeSeries", "PlotCompetitiveSkillScore", "PlotBootstrapSkillScore",
                                      "PlotConditionalQuantiles", "PlotAvailability", "PlotAvailabilityHistogram",
-                                     "PlotDataHistogram", "PlotPeriodogram"]
+                                     "PlotDataHistogram", "PlotPeriodogram", "PlotOversampling",
+                                     "PlotContingency"]