def apply_oversampling(self, bins=10, rates_cap=20):
    """
    Derive per-bin oversampling rates from the target distribution of the train subset.

    The targets (``get_Y(as_numpy=True)``) of all elements in the train data collection are binned into a common
    histogram with `bins` equally wide classes spanning the overall minimum and maximum target value. Each class
    gets the rate ``count(most frequent class) / count(class)``, limited to `rates_cap`. Empty classes receive
    `rates_cap` directly.

    The rates are only computed and returned here; this method does not modify the data itself.

    :param bins: number of equally wide histogram classes (integer)
    :param rates_cap: upper limit for each oversampling rate
    :return: array of length `bins` with the capped oversampling rate of each class; all ones if the train
        collection holds no target values
    """
    # TODO: make this step conditional on an oversampling on/off setting.
    data = self.data_store.get('data_collection', 'train')

    # First pass: global value range over all stations. Seed with +/-inf (not 0) so that the range is the true
    # data range even if all targets share the same sign.
    lower, upper = np.inf, -np.inf
    has_values = False
    for station in data:
        targets = np.asarray(station.get_Y(as_numpy=True))
        if targets.size == 0:
            continue
        has_values = True
        lower = np.minimum(lower, np.amin(targets))
        upper = np.maximum(upper, np.amax(targets))
    if not has_values:
        # Nothing to balance: a rate of 1 means "keep every class as it is".
        return np.ones(bins)

    # Second pass: accumulate the per-station histograms on the shared bin edges. Start from zeros so that the
    # sum holds the plain counts.
    histogram = np.zeros(bins)
    for station in data:
        targets = np.asarray(station.get_Y(as_numpy=True))
        if targets.size == 0:
            continue
        counts, _ = np.histogram(targets, bins=bins, range=(lower, upper))
        histogram += counts

    # Frequency of each class relative to the most frequent class (most frequent class == 1). Dividing by the
    # total count first would cancel out, so it is omitted.
    relative_frequency = histogram / np.amax(histogram)

    # Rate is the inverse relative frequency. Empty classes are pre-filled with the cap instead of evaluating 1/0.
    oversampling_rates = np.full(bins, float(rates_cap))
    np.divide(1.0, relative_frequency, out=oversampling_rates, where=relative_frequency > 0)
    return np.minimum(oversampling_rates, rates_cap)