diff --git a/src/run_modules/pre_processing.py b/src/run_modules/pre_processing.py index 147f480cda6c9f6466057bdc5cb152076e0f7132..551ea599a3114b7b97f5bcb146cf6e131e324eb5 100644 --- a/src/run_modules/pre_processing.py +++ b/src/run_modules/pre_processing.py @@ -62,37 +62,42 @@ class PreProcessing(RunEnvironment): def create_latex_report(self): """ - This function creates a latex table containing the Station IDs as index, and number of valid data points per - station per subset as well as used_meta_data: - could look like this - \begin{tabular}{llrrrlll} - \toprule - {} & station\_name & station\_lon & station\_lat & station\_alt & train & val & test \\ - \midrule - DENW094 & Aachen-Burtscheid & 6.0939 & 50.7547 & 205.0 & 1875 & 584 & 1032 \\ - DEBW029 & Aalen & 10.0963 & 48.8479 & 424.0 & 2958 & 715 & 1080 \\ - DENI052 & Allertal & 9.6230 & 52.8294 & 38.0 & 2790 & 497 & 1080 \\ + This function creates tables with information on the station meta data and a summary on subset sample sizes. + + * station_sample_size.md: see table below + * station_sample_size.tex: same as table below, but as latex table + * station_sample_size_short.tex: reduced size table without any meta data besides station ID, as latex table + + All tables are stored inside experiment_path inside the folder latex_report. The table format (e.g. which meta + data is highlighted) is currently hardcoded to have a stable table style. If further styles are needed, it is + better to add an additional style than modifying the existing table styles. + + | stat. ID | station_name | station_lon | station_lat | station_alt | train | val | test | + |------------|-------------------------------------------|---------------|---------------|---------------|---------|-------|--------| + | DEBW013 | Stuttgart Bad Cannstatt | 9.2297 | 48.8088 | 235 | 1434 | 712 | 1080 | + | DEBW076 | Baden-Baden | 8.2202 | 48.7731 | 148 | 3037 | 722 | 710 | + | DEBW087 | Schwäbische_Alb | 9.2076 | 48.3458 | 798 | 3044 | 714 | 1087 | + | DEBW107 | Tübingen | 9.0512 | 48.5077 | 325 | 1803 | 715 | 1087 | + | DEBY081 | Garmisch-Partenkirchen/Kreuzeckbahnstraße | 11.0631 | 47.4764 | 735 | 2935 | 525 | 714 | + | # Stations | nan | nan | nan | nan | 6 | 6 | 6 | + | # Samples | nan | nan | nan | nan | 12253 | 3388 | 4678 | + """ meta_data = ['station_name', 'station_lon', 'station_lat', 'station_alt'] meta_round = ["station_lon", "station_lat", "station_alt"] precision = 4 - path = self.data_store.get("experiment_path") - path = os.path.join(path, "latex_report") + path = os.path.join(self.data_store.get("experiment_path"), "latex_report") check_path_and_create(path) - table_name = "test.tex" - data_train: DataGenerator = self.data_store.get('generator', 'train') - data_val: DataGenerator = self.data_store.get('generator', 'val') - data_test: DataGenerator = self.data_store.get('generator', 'test') - - df = pd.DataFrame(columns=meta_data+["train", "val", "test"]) - for k, data in zip(["train", "val", "test"], [data_train, data_val, data_test]): - stations = data.stations - for station in stations: - df.loc[station, k] = data.get_data_generator(station).get_transposed_label().shape[0] + set_names = ["train", "val", "test"] + df = pd.DataFrame(columns=meta_data+set_names) + for set_name in set_names: + data: DataGenerator = self.data_store.get("generator", set_name) + for station in data.stations: + df.loc[station, set_name] = data.get_data_generator(station).get_transposed_label().shape[0] if df.loc[station, meta_data].isnull().any(): df.loc[station, meta_data] = data.get_data_generator(station).meta.loc[meta_data].values.flatten() - df.loc["# Samples", k] = df.loc[:, k].sum() - df.loc["# Stations", k] = df.loc[:, k].count() + df.loc["# Samples", set_name] = df.loc[:, set_name].sum() + df.loc["# Stations", set_name] = df.loc[:, set_name].count() df[meta_round] = df[meta_round].astype(float).round(precision) df.sort_index(inplace=True) df = df.reindex(df.index.drop(["# Stations", "# Samples"]).to_list() + ["# Stations", "# Samples"], ) @@ -101,9 +106,10 @@ class PreProcessing(RunEnvironment): column_format[0] = 'l' column_format[-1] = 'r' column_format = ''.join(column_format.tolist()) - df.to_latex(os.path.join(path, "test.tex"), na_rep='---', column_format=column_format) - df.to_markdown(open(os.path.join(path, "test.md"), mode="w", encoding='utf-8'), tablefmt="github") - df.drop(meta_data, axis=1).to_latex(os.path.join(path, "test_short.tex"), na_rep='---', column_format=column_format) + df.to_latex(os.path.join(path, "station_sample_size.tex"), na_rep='---', column_format=column_format) + df.to_markdown(open(os.path.join(path, "station_sample_size.md"), mode="w", encoding='utf-8'), tablefmt="github") + df.drop(meta_data, axis=1).to_latex(os.path.join(path, "station_sample_size_short.tex"), na_rep='---', + column_format=column_format) def split_train_val_test(self) -> None: """