Skip to content
Snippets Groups Projects
Commit 1dba57c6 authored by Felix Kleinert
Browse files

update summary tables in preprocessing

parent 70670eb3
No related branches found
No related tags found
3 merge requests: !226 Develop, !225 Resolve "release v1.2.0", !194 Resolve "Inclue new IntelliO3 plots in MLAir"
Pipeline #52937 passed
......@@ -113,9 +113,47 @@ class PreProcessing(RunEnvironment):
precision = 4
path = os.path.join(self.data_store.get("experiment_path"), "latex_report")
path_config.check_path_and_create(path)
set_names = ["train", "val", "test"]
df = pd.DataFrame(columns=meta_data + set_names)
for set_name in set_names:
names_of_set = ["train", "val", "test"]
df = self.create_info_df(meta_data, meta_round, names_of_set, precision)
column_format = self.create_column_format_for_tex(df)
self.save_to_tex(path=path, filename="station_sample_size.tex", column_format=column_format, df=df)
self.save_to_md(path=path, filename="station_sample_size.md", df=df)
df_nometa = df.drop(meta_data, axis=1)
self.save_to_tex(path=path, filename="station_sample_size_short.tex", column_format=column_format, df=df_nometa)
self.save_to_md(path=path, filename="station_sample_size_short.md", df=df_nometa)
# df_nometa.to_latex(os.path.join(path, "station_sample_size_short.tex"), na_rep='---',
# column_format=column_format)
df_descr = self.create_describe_df(df_nometa)
column_format = self.create_column_format_for_tex(df_descr)
self.save_to_tex(path=path, filename="station_describe_short.tex", column_format=column_format, df=df_descr)
self.save_to_md(path=path, filename="station_describe_short.md", df=df_descr)
# df_descr.to_latex(os.path.join(path, "station_describe_short.tex"), na_rep='---', column_format=column_format)
@staticmethod
def create_describe_df(df, percentiles=None, ignore_last_lines: int = 2):
if percentiles is None:
percentiles = [.05, .1, .25, .5, .75, .9, .95]
df_descr = df.iloc[:-ignore_last_lines].astype('float32').describe(
percentiles=percentiles).astype('int32')
df_descr = pd.concat([df.loc[['# Samples']], df_descr]).T
df_descr.rename(columns={"# Samples": "no. samples", "count": "no. stations"}, inplace=True)
df_descr_colnames = list(df_descr.columns)
df_descr_colnames = [df_descr_colnames[1]] + [df_descr_colnames[0]] + df_descr_colnames[2:]
df_descr = df_descr[df_descr_colnames]
return df_descr
@staticmethod
def save_to_tex(path, filename, column_format, df, na_rep='---'):
df.to_latex(os.path.join(path, filename), na_rep=na_rep, column_format=column_format)
@staticmethod
def save_to_md(path, filename, df, mode="w", encoding='utf-8', tablefmt="github"):
df.to_markdown(open(os.path.join(path, filename), mode=mode, encoding=encoding),
tablefmt=tablefmt)
def create_info_df(self, meta_data, meta_round, names_of_set, precision):
df = pd.DataFrame(columns=meta_data + names_of_set)
for set_name in names_of_set:
data = self.data_store.get("data_collection", set_name)
for station in data:
station_name = str(station.id_class)
......@@ -128,22 +166,7 @@ class PreProcessing(RunEnvironment):
df.sort_index(inplace=True)
df = df.reindex(df.index.drop(["# Stations", "# Samples"]).to_list() + ["# Stations", "# Samples"], )
df.index.name = 'stat. ID'
column_format = self.create_column_format_for_tex(df)
df.to_latex(os.path.join(path, "station_sample_size.tex"), na_rep='---', column_format=column_format)
df.to_markdown(open(os.path.join(path, "station_sample_size.md"), mode="w", encoding='utf-8'), tablefmt="github")
df_nometa = df.drop(meta_data, axis=1)
df_nometa.to_latex(os.path.join(path, "station_sample_size_short.tex"), na_rep='---',
column_format=column_format)
df_descr = df_nometa.iloc[:-2].astype('float32').describe(
percentiles=[.05, .1, .25, .5, .75, .9, .95]).astype('int32')
df_descr = pd.concat([df_nometa.loc[['# Samples']], df_descr]).T
df_descr.rename(columns={"# Samples": "no. samples", "count": "no. stations"}, inplace=True)
df_descr_colnames = list(df_descr.columns)
df_descr_colnames = [df_descr_colnames[1]] + [df_descr_colnames[0]] + df_descr_colnames[2:]
df_descr = df_descr[df_descr_colnames]
column_format = self.create_column_format_for_tex(df_descr)
df_descr.to_latex(os.path.join(path, "station_describe_short.tex"), na_rep='---', column_format=column_format)
return df
@staticmethod
def create_column_format_for_tex(df: pd.DataFrame) -> str:
......
......@@ -8,6 +8,8 @@ from mlair.helpers import PyTestRegex
from mlair.run_modules.experiment_setup import ExperimentSetup
from mlair.run_modules.pre_processing import PreProcessing
from mlair.run_modules.run_environment import RunEnvironment
import pandas as pd
import numpy as np
class TestPreProcessing:
......@@ -115,3 +117,38 @@ class TestPreProcessing:
assert pre.transformation(data_preparation, stations) is None
class data_preparation_no_trans: pass
assert pre.transformation(data_preparation_no_trans, stations) is None
@pytest.fixture
def dummy_df(self):
    """Return a small sample-size table: five stations followed by the '# Stations' and '# Samples' rows."""
    row_labels = ['DEBW013', 'DEBW076', 'DEBW087', 'DEBW107', 'DEBY081', '# Stations', '# Samples']

    def as_column(values):
        # map each row label to its value, keeping the label order of the table
        return dict(zip(row_labels, values))

    table = {
        'station_name': as_column(['Stuttgart Bad Cannstatt', 'Baden-Baden', 'Schwäbische_Alb', 'Tübingen',
                                   'Garmisch-Partenkirchen/Kreuzeckbahnstraße', np.nan, np.nan]),
        'station_lon': as_column([9.2297, 8.2202, 9.2076, 9.0512, 11.0631, np.nan, np.nan]),
        'station_lat': as_column([48.8088, 48.7731, 48.3458, 48.5077, 47.4764, np.nan, np.nan]),
        'station_alt': as_column([235.0, 148.0, 798.0, 325.0, 735.0, np.nan, np.nan]),
        'train': as_column([1413, 3002, 3016, 1782, 2837, 6, 12050]),
        'val': as_column([698, 715, 700, 701, 456, 6, 3270]),
        'test': as_column([1066, 696, 1080, 1080, 700, 6, 4622]),
    }
    return pd.DataFrame.from_dict(table)
def test_create_column_format_for_tex(self):
    """The column format has one entry per data column plus one for the index."""
    expected_formats = {1: 'lr', 2: 'lcr', 3: 'lccr'}
    for n_columns, expected in expected_formats.items():
        frame = pd.DataFrame(np.ones((2, n_columns)))
        column_format = PreProcessing.create_column_format_for_tex(frame)
        assert column_format == expected
        assert len(column_format) == n_columns + 1  # data columns + index column
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment