diff --git a/src/run_modules/pre_processing.py b/src/run_modules/pre_processing.py index 551ea599a3114b7b97f5bcb146cf6e131e324eb5..5731b7a6291b146681e976aa40fcb3d87a464c3b 100644 --- a/src/run_modules/pre_processing.py +++ b/src/run_modules/pre_processing.py @@ -102,14 +102,32 @@ class PreProcessing(RunEnvironment): df.sort_index(inplace=True) df = df.reindex(df.index.drop(["# Stations", "# Samples"]).to_list() + ["# Stations", "# Samples"], ) df.index.name = 'stat. ID' - column_format = np.repeat('c', df.shape[1]+1) + column_format = self.create_column_format_for_tex(df) + df.to_latex(os.path.join(path, "station_sample_size.tex"), na_rep='---', column_format=column_format) + df.to_markdown(open(os.path.join(path, "station_sample_size.md"), mode="w", encoding='utf-8'), tablefmt="github") + df_nometa = df.drop(meta_data, axis=1) + df_nometa.to_latex(os.path.join(path, "station_sample_size_short.tex"), na_rep='---', + column_format=column_format) + df_descr = df_nometa.iloc[:-2].astype('float32').describe( + percentiles=[.05, .1, .25, .5, .75, .9, .95]).astype('int32') + df_descr = pd.concat([df_nometa.loc[['# Samples']], df_descr]).T + df_descr.rename(columns={"# Samples": "sum"}, inplace=True) + column_format = self.create_column_format_for_tex(df_descr) + df_descr.to_latex(os.path.join(path, "station_describe_short.tex"), na_rep='---', + column_format=column_format) + + @staticmethod + def create_column_format_for_tex(df: pd.DataFrame) -> str: + """ + Creates column format for latex table based on the shape of a given DataFrame. + + Calculates number of columns and uses 'c' as column position. First element is set to 'l', last to 'r' + """ + column_format = np.repeat('c', df.shape[1] + 1) column_format[0] = 'l' column_format[-1] = 'r' column_format = ''.join(column_format.tolist()) - df.to_latex(os.path.join(path, "station_sample_size.tex"), na_rep='---', column_format=column_format) - df.to_markdown(open(os.path.join(path, "station_sample_size.md"), mode="w", encoding='utf-8'), tablefmt="github") - df.drop(meta_data, axis=1).to_latex(os.path.join(path, "station_sample_size_short.tex"), na_rep='---', - column_format=column_format) + return column_format def split_train_val_test(self) -> None: """