@staticmethod
def create_describe_df(df, percentiles=None, ignore_last_lines: int = 2):
    """
    Build a transposed summary table with descriptive statistics per column of `df`.

    The statistics (`DataFrame.describe`) are computed on all rows except the trailing
    `ignore_last_lines` rows. The row labelled '# Samples' is taken from `df` and placed in front of the
    statistics. After transposing, each column of `df` becomes one row of the result. The columns
    '# Samples' and 'count' are renamed to 'no. samples' and 'no. stations', and 'no. stations' is moved
    to the first position.

    :param df: data frame that must contain a row labelled '# Samples'
    :param percentiles: percentiles handed to `describe`; defaults to [.05, .1, .25, .5, .75, .9, .95]
    :param ignore_last_lines: number of trailing rows excluded from the statistics (must be >= 0). The
        default of 2 presumably matches the '# Stations' and '# Samples' rows that `create_info_df`
        moves to the end of the frame.
    :return: transposed data frame with columns 'no. stations', 'no. samples', followed by the statistics
    :raises ValueError: if `ignore_last_lines` is negative
    """
    if percentiles is None:
        percentiles = [.05, .1, .25, .5, .75, .9, .95]
    if ignore_last_lines < 0:
        raise ValueError(f"ignore_last_lines must be non-negative, got {ignore_last_lines}")
    # Use an explicit stop index: `df.iloc[:-0]` equals `df.iloc[:0]` and would select no rows at all.
    stop = max(len(df) - ignore_last_lines, 0)
    # Statistics are calculated in float32 and then truncated to int32 for a compact table.
    df_descr = df.iloc[:stop].astype('float32').describe(percentiles=percentiles).astype('int32')
    df_descr = pd.concat([df.loc[['# Samples']], df_descr]).T
    df_descr.rename(columns={"# Samples": "no. samples", "count": "no. stations"}, inplace=True)
    # Swap the first two columns so that the number of stations is listed before the number of samples.
    df_descr_colnames = list(df_descr.columns)
    df_descr_colnames = [df_descr_colnames[1]] + [df_descr_colnames[0]] + df_descr_colnames[2:]
    df_descr = df_descr[df_descr_colnames]
    return df_descr


@staticmethod
def save_to_tex(path, filename, column_format, df, na_rep='---'):
    """
    Store `df` as LaTeX table in the file `filename` inside directory `path`.

    :param path: directory to write into
    :param filename: name of the tex file
    :param column_format: column format string passed on to `DataFrame.to_latex`
    :param df: data frame to store
    :param na_rep: string used to represent missing values
    """
    df.to_latex(os.path.join(path, filename), na_rep=na_rep, column_format=column_format)


@staticmethod
def save_to_md(path, filename, df, mode="w", encoding='utf-8', tablefmt="github"):
    """
    Store `df` as markdown table in the file `filename` inside directory `path`.

    :param path: directory to write into
    :param filename: name of the markdown file
    :param df: data frame to store
    :param mode: mode used to open the file
    :param encoding: encoding used to open the file
    :param tablefmt: table format passed on to `DataFrame.to_markdown`
    """
    # Open the file in a context manager so the handle is closed (and flushed) even if writing fails.
    with open(os.path.join(path, filename), mode=mode, encoding=encoding) as f:
        df.to_markdown(f, tablefmt=tablefmt)
@pytest.fixture
def dummy_df(self):
    """Provide a small station overview table (meta data plus train/val/test sample counts)."""
    row_labels = ['DEBW013', 'DEBW076', 'DEBW087', 'DEBW107', 'DEBY081', '# Stations', '# Samples']
    # NOTE(review): only five stations are listed although '# Stations' is 6 - confirm this is intended.
    column_values = {
        'station_name': ['Stuttgart Bad Cannstatt', 'Baden-Baden', 'Schwäbische_Alb', 'Tübingen',
                         'Garmisch-Partenkirchen/Kreuzeckbahnstraße', np.nan, np.nan],
        'station_lon': [9.2297, 8.2202, 9.2076, 9.0512, 11.0631, np.nan, np.nan],
        'station_lat': [48.8088, 48.7731, 48.3458, 48.5077, 47.4764, np.nan, np.nan],
        'station_alt': [235.0, 148.0, 798.0, 325.0, 735.0, np.nan, np.nan],
        'train': [1413, 3002, 3016, 1782, 2837, 6, 12050],
        'val': [698, 715, 700, 701, 456, 6, 3270],
        'test': [1066, 696, 1080, 1080, 700, 6, 4622],
    }
    # Rebuild the nested {column: {row: value}} mapping expected by `DataFrame.from_dict`.
    nested = {column: dict(zip(row_labels, values)) for column, values in column_values.items()}
    return pd.DataFrame.from_dict(nested)


def test_create_column_format_for_tex(self):
    """The tex column format is 'l', followed by 'c' for inner columns, and ends with 'r'."""
    expected_formats = {1: 'lr', 2: 'lcr', 3: 'lccr'}
    for n_columns, expected in expected_formats.items():
        frame = pd.DataFrame(np.ones((2, n_columns)))
        col_format = PreProcessing.create_column_format_for_tex(frame)
        assert col_format == expected
        # one format character per data column plus one for the index
        assert len(col_format) == n_columns + 1