diff --git a/test/run_pytest.sh b/test/run_pytest.sh
index 2f9fc13eb68a9e02a171834c11bd0efecac5fb7c..ac39b23e289cffe8cb7f8d5ba526d3d93dc7a128 100644
--- a/test/run_pytest.sh
+++ b/test/run_pytest.sh
@@ -21,11 +21,11 @@ fi
 #python -m pytest test_process_netCDF_v2.py
 source ../video_prediction_tools/env_setup/modules_train.sh
 # Test for preprocessing step 2
-#python -m pytest test_data_preprocess_step2.py
+python -m pytest test_data_preprocess_step2.py
 #python -m pytest test_era5_data.py
 # Test for training
 # First remove all the files in the test folder
 #rm /p/project/deepacf/deeprain/video_prediction_shared_folder/models/test/*
 #python -m pytest test_train_model_era5.py
 #python -m pytest test_visualize_postprocess.py
-python -m pytest test_meta_postprocess.py
+#python -m pytest test_meta_postprocess.py
diff --git a/test/test_data_preprocess_step2.py b/test/test_data_preprocess_step2.py
index 7f7f3bf30917371ccf283bcd6fb9be380f21d5d5..b8a71408c1870631d7e0aef396ba7870b0ece789 100644
--- a/test/test_data_preprocess_step2.py
+++ b/test/test_data_preprocess_step2.py
@@ -10,24 +10,17 @@ import json
 import datetime
 
 input_dir = "/p/project/deepacf/deeprain/video_prediction_shared_folder/preprocessedData/era5_test"
-output_dir = "/p/project/deepacf/deeprain/video_prediction_shared_folder/preprocessedData/era5_test"
-datasplit_config = "/p/project/deepacf/deeprain/bing/ambs/video_prediction_tools/data_split/cv_test.json"
-hparams_dict_config = "/p/project/deepacf/deeprain/bing/ambs/video_prediction_tools/hparams/era5/convLSTM/model_hparams.json"
 vars_in = ["T2","MSL","gph500"]
 sequences_per_file = 10
-
+sequence_length = 20
 
 # generate an instance of ERA5Pkl2Tfrecords
 @pytest.fixture(scope="module")
 def era5_dataset_case1():
-    return ERA5Pkl2Tfrecords(input_dir=input_dir,output_dir=output_dir,datasplit_config=datasplit_config,
-                             hparams_dict_config=hparams_dict_config,sequences_per_file=sequences_per_file)
-
-def test_get_datasplit(era5_dataset_case1):
-    assert input_dir == era5_dataset_case1.input_dir
-    d = era5_dataset_case1.get_datasplit()
+    return ERA5Pkl2Tfrecords(input_dir=input_dir, sequence_length=sequence_length,
+                             sequences_per_file=sequences_per_file)
 
 def test_get_months(era5_dataset_case1):
-    assert len(era5_dataset_case1.get_years_months()[1]) == 3
+    assert len(era5_dataset_case1.get_years_months()[1]) == 12
 
 def test_get_metadata(era5_dataset_case1):
     """
@@ -37,14 +30,14 @@ def test_get_metadata(era5_dataset_case1):
     assert era5_dataset_case1.width == 160
     assert era5_dataset_case1.vars_in == ["T2","MSL","gph500"]
 
-def test_parse_hparams(era5_dataset_case1):
-    """
-    Test the updated hparam is properly updated
-    """
-    print("hparmas:",era5_dataset_case1.hparams)
-    assert era5_dataset_case1.hparams.max_epochs == 20
-    assert era5_dataset_case1.sequence_length == 20
-    assert era5_dataset_case1.hparams.batch_size == 4
+#def test_parse_hparams(era5_dataset_case1):
+#    """
+#    Test that the hparams are properly updated
+#    """
+#    print("hparams:", era5_dataset_case1.hparams)
+#    assert era5_dataset_case1.hparams.max_epochs == 20
+#    assert era5_dataset_case1.sequence_length == 20
+#    assert era5_dataset_case1.hparams.batch_size == 4
 
 def test_save_tf_record(era5_dataset_case1):
     # create a sequence
@@ -69,7 +62,8 @@ def test_read_pkl_and_save_tfrecords(era5_dataset_case1):
     #assert era5_dataset_case1.input_file_year=="/p/project/deepacf/deeprain/video_prediction_shared_folder/preprocessedData/era5-Y2010toY2222M01to12-160x128-2970N1500W-T2_MSL_gph500/pickle/2017"
     # assert that the output tfrecords file is saved properly
     fname_case1 = "sequence_Y_2017_M_{}_0_to_{}.tfrecords".format(month_test,sequences_per_file-1)
-    if_file_exit = os.path.isfile(os.path.join(output_dir,"tfrecords",fname_case1))
-    print("file check:",os.path.join(output_dir,fname_case1))
+    # the module-level output_dir was removed above; the instance's output_dir already points to the tfrecords_seq_len_20 sub-directory
+    if_file_exit = os.path.isfile(os.path.join(era5_dataset_case1.output_dir, fname_case1))
+    print("file check:", os.path.join(era5_dataset_case1.output_dir, fname_case1))
     assert if_file_exit == True
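Aside (not part of the patch): the assertion above relies on the new naming scheme in which the tfrecords directory carries the sequence length. A minimal sketch of how the checked path is composed, with `month_test = 1` as a hypothetical value:

```python
import os

input_dir = "/p/project/deepacf/deeprain/video_prediction_shared_folder/preprocessedData/era5_test"
sequence_length = 20
sequences_per_file = 10
month_test = 1  # hypothetical month under test

# ERA5Pkl2Tfrecords derives its output directory from input_dir and sequence_length
output_dir = os.path.join(input_dir, "tfrecords_seq_len_" + str(sequence_length))
# tfrecord files are named by year, month and the contained sequence-index range
fname = "sequence_Y_2017_M_{}_0_to_{}.tfrecords".format(month_test, sequences_per_file - 1)
print(os.path.join(output_dir, fname))
```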
fname_case1 = "sequence_Y_2017_M_{}_0_to_{}.tfrecords".format(month_test,sequences_per_file-1) - if_file_exit = os.path.isfile(os.path.join(output_dir,"tfrecords",fname_case1)) + if_file_exit = os.path.isfile(os.path.join(output_dir,"tfrecords_seq_len_20",fname_case1)) print("file check:",os.path.join(output_dir,fname_case1)) assert if_file_exit == True diff --git a/video_prediction_tools/HPC_scripts/preprocess_data_era5_step2_template.sh b/video_prediction_tools/HPC_scripts/preprocess_data_era5_step2_template.sh index 85620bfac61a99701fca88ed5db29462e9948962..04ecea9a1cc7f0e76482103cfa89c416cb7e8075 100644 --- a/video_prediction_tools/HPC_scripts/preprocess_data_era5_step2_template.sh +++ b/video_prediction_tools/HPC_scripts/preprocess_data_era5_step2_template.sh @@ -33,17 +33,8 @@ if [ -z ${VIRTUAL_ENV} ]; then fi # declare directory-variables which will be modified by config_runscript.py -source_dir=/p/project/deepacf/deeprain/video_prediction_shared_folder/preprocessedData/ -destination_dir=/p/project/deepacf/deeprain/video_prediction_shared_folder/preprocessedData/ -# further settings -datasplit_dir=../data_split/cv_test.json -model=convLSTM -hparams_dict_config=../hparams/era5/${model}/model_hparams.json -sequences_per_file=10 +base_dir=/p/project/deepacf/deeprain/video_prediction_shared_folder/preprocessedData/ sequence_length=20 - -# run preprocessing (step 2 where Tf-records are generated) -srun python ../main_scripts/main_preprocess_data_step2.py -input_dir ${source_dir} -output_dir ${destination_dir} - -datasplit_config ${datasplit_dir} -hparams_dict_config ${hparams_dict_config} - -sequences_per_file ${sequences_per_file} - +sequences_per_file=10 +# run Preprocessing (step 2 where Tf-records are generated) +srun python ../main_scripts/main_preprocess_data_step2.py -base_dir ${base_dir} -sequence_length ${sequence_length} -sequences_per_file ${sequences_per_file} diff --git a/video_prediction_tools/data_preprocess/preprocess_data_step2.py b/video_prediction_tools/data_preprocess/preprocess_data_step2.py index 5bb503d20086d7fc016dab013bc6a42284c4336c..687c2639a22a72c53a1f3dbec30bc7c782947de0 100644 --- a/video_prediction_tools/data_preprocess/preprocess_data_step2.py +++ b/video_prediction_tools/data_preprocess/preprocess_data_step2.py @@ -2,12 +2,13 @@ Class and functions required for preprocessing ERA5 data (preprocessing substep 2) """ __email__ = "b.gong@fz-juelich.de" -__author__ = "Bing Gong, Scarlet Stadtler, Michael Langguth" -__date__ = "2020_11_10" +__author__ = "Bing Gong" +__date__ = "2020_12_29" # import modules import os +import glob import pickle import numpy as np import json @@ -18,35 +19,26 @@ from model_modules.video_prediction.datasets import ERA5Dataset class ERA5Pkl2Tfrecords(ERA5Dataset): - def __init__(self, input_dir=None, output_dir=None, datasplit_config=None, hparams_dict_config=None, \ - sequences_per_file=128, norm="minmax"): + def __init__(self, input_dir=None, sequence_length=20, sequences_per_file=128,norm="minmax"): """ This class is used for converting pkl files to tfrecords args: input_dir : str, the path to the PreprocessData directory which is parent directory of "Pickle" and "tfrecords" files directiory. 
diff --git a/video_prediction_tools/data_preprocess/preprocess_data_step2.py b/video_prediction_tools/data_preprocess/preprocess_data_step2.py
index 5bb503d20086d7fc016dab013bc6a42284c4336c..687c2639a22a72c53a1f3dbec30bc7c782947de0 100644
--- a/video_prediction_tools/data_preprocess/preprocess_data_step2.py
+++ b/video_prediction_tools/data_preprocess/preprocess_data_step2.py
@@ -2,12 +2,13 @@
 Class and functions required for preprocessing ERA5 data (preprocessing substep 2)
 """
 __email__ = "b.gong@fz-juelich.de"
-__author__ = "Bing Gong, Scarlet Stadtler, Michael Langguth"
-__date__ = "2020_11_10"
+__author__ = "Bing Gong"
+__date__ = "2020_12_29"
 
 # import modules
 import os
+import glob
 import pickle
 import numpy as np
 import json
@@ -18,35 +19,26 @@ from model_modules.video_prediction.datasets import ERA5Dataset
 
 class ERA5Pkl2Tfrecords(ERA5Dataset):
-    def __init__(self, input_dir=None, output_dir=None, datasplit_config=None, hparams_dict_config=None, \
-                 sequences_per_file=128, norm="minmax"):
+    def __init__(self, input_dir=None, sequence_length=20, sequences_per_file=128, norm="minmax"):
         """
         This class is used for converting pkl files to tfrecords
         args:
             input_dir           : str, the path to the PreprocessData directory, which is the parent directory of the
                                   "pickle" and "tfrecords" directories
-            outpout_dir : str, the one upper level of the path to save the tfrecords files
-            datasplit_config : the path pointing to the datasplit_config json file
-            hparams_dict_config : the path to the dict that contains hparameters,
+            sequence_length     : int, default is 20, the sequence length per sample
             sequences_per_file  : int, how many sequences/samples per tfrecord to be saved
             norm                : str, normalization methods from Norm_data class ("minmax" or "znorm"; default: "minmax")
         """
         self.input_dir = input_dir
-        # ML: Do not append paths inside the script (invisible even for advanced users)
-        #self.output_dir = os.path.join(output_dir, "tfrecords")
-        self.output_dir = output_dir
+        self.input_dir_pkl = os.path.join(input_dir, "pickle")
+        self.output_dir = os.path.join(input_dir, "tfrecords_seq_len_" + str(sequence_length))
         # if the output_dir does not exist, create it
         os.makedirs(self.output_dir, exist_ok=True)
         # get metadata, including var_in, image height, width etc.
         self.get_metadata()
-        # Get the data split informaiton
-        self.datasplit_dict_path = datasplit_config
-        self.data_dict = self.get_datasplit()
-        self.hparams_dict_config = hparams_dict_config
-        self.hparams_dict = self.get_model_hparams_dict()
-        self.hparams = self.parse_hparams()
-        self.sequence_length = self.hparams.sequence_length
+        # the sequence length is now passed in directly instead of being taken from the model hparams
+        self.sequence_length = sequence_length
         if norm == "minmax" or norm == "znorm":
             self.norm = norm
         else:
@@ -60,15 +52,21 @@ class ERA5Pkl2Tfrecords(ERA5Dataset):
-        Return : two elements: each contains 1-dim array with the months set from data_split_config json file
+        Return : three elements: the years, the set of unique months, and the months available per year,
+                 all derived from the directory structure of the "pickle" folder
         """
-        self.mode_list = []
-        self.years = []
         self.months = []
-        for mode, value in self.d.items():
-            self.mode_list.append(mode)
-            for year, month in value.items():
-                self.years.append(year)
-                self.months.extend(month)
-        return set(self.years), set(self.months)
+        self.years_months = []
+        # search the pickle folder for year sub-directories
+        self.years = [name for name in os.listdir(self.input_dir_pkl) if os.path.isdir(os.path.join(self.input_dir_pkl, name))]
+        # search each year directory for month files matching the pattern 'X_<MM>.pkl'
+        patt = "X_*.pkl"
+        for year in self.years:
+            print("path:", os.path.join(self.input_dir_pkl, year, patt))
+            months_pkl_list = glob.glob(os.path.join(self.input_dir_pkl, year, patt))
+            print("months_pkl_list", months_pkl_list)
+            months_list = [int(m[-6:-4]) for m in months_pkl_list]
+            self.months.extend(months_list)
+            self.years_months.append(months_list)
+        return self.years, list(set(self.months)), self.years_months
 
     def get_stats_file(self):
         """
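To make the rewritten directory scan concrete: given a pickle tree of the form `<input_dir>/pickle/<year>/X_<MM>.pkl`, the class can now be instantiated and queried as sketched below. The paths and return values are hypothetical and assume twelve month files exist for 2017:

```python
from data_preprocess.preprocess_data_step2 import ERA5Pkl2Tfrecords

# hypothetical input directory for illustration
ins = ERA5Pkl2Tfrecords(input_dir="/path/to/preprocessedData/era5_test",
                        sequence_length=20, sequences_per_file=10)
years, months, years_months = ins.get_years_months()
# with pickle/2017/X_01.pkl ... X_12.pkl on disk this yields:
#   years          == ["2017"]
#   sorted(months) == [1, 2, ..., 12]    (unique months over all years)
#   years_months   == [[1, 2, ..., 12]]  (months available per year)
```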
diff --git a/video_prediction_tools/data_split/bair_action_free/datasplit.json b/video_prediction_tools/data_split/bair_action_free/datasplit.json
new file mode 100644
index 0000000000000000000000000000000000000000..217b285d8e105debbe7841735eb50786762ace19
--- /dev/null
+++ b/video_prediction_tools/data_split/bair_action_free/datasplit.json
@@ -0,0 +1,14 @@
+{
+  "train": {
+    "index1": [0, 100],
+    "index2": [150, 200]
+  },
+  "val":
+  {
+    "index1": [110, 149]
+  },
+  "test":
+  {
+    "index1": [150, 200]
+  }
+}
diff --git a/video_prediction_tools/data_split/bair_action_free/datasplit_template.json b/video_prediction_tools/data_split/bair_action_free/datasplit_template.json
new file mode 100644
index 0000000000000000000000000000000000000000..11407a0439e7bd3d1397d6dfce9cce660786a866
--- /dev/null
+++ b/video_prediction_tools/data_split/bair_action_free/datasplit_template.json
@@ -0,0 +1,21 @@
+# NOTE: This json-file should not be processed and simply serves as an exemplary file to configure the datasplit for the bair action-free dataset.
+# If you would like to generate your own datasplit config file, you may copy this template and modify it to your personal needs.
+# However, remember to remove any comment lines (starting with #) from your config-file then!
+#
+# Explanation: In the following, the data is split based on indices; each index entry holds a list with two elements, namely the start and end indices into the
+# raw dataset.
+# Be aware that this is a pure data file, i.e. do not make use of any Python-functions such as np.range or similar here!
+{
+  "train": {
+    "index1": [0, 100],
+    "index2": [150, 200]
+  },
+  "val":
+  {
+    "index1": [110, 149]
+  },
+  "test":
+  {
+    "index1": [150, 200]
+  }
+}
diff --git a/video_prediction_tools/data_split/era5/datasplit.json b/video_prediction_tools/data_split/era5/datasplit.json
new file mode 100644
index 0000000000000000000000000000000000000000..5dafd53b7143c064beabf67b02c723805c4b52ef
--- /dev/null
+++ b/video_prediction_tools/data_split/era5/datasplit.json
@@ -0,0 +1,13 @@
+{
+  "train": {
+    "2017": [1]
+  },
+  "val":
+  {
+    "2017": [2]
+  },
+  "test":
+  {
+    "2017": [3]
+  }
+}
diff --git a/video_prediction_tools/data_split/datasplit_template.json b/video_prediction_tools/data_split/era5/datasplit_template.json
similarity index 100%
rename from video_prediction_tools/data_split/datasplit_template.json
rename to video_prediction_tools/data_split/era5/datasplit_template.json
diff --git a/video_prediction_tools/data_split/kth/datasplit.json b/video_prediction_tools/data_split/kth/datasplit.json
new file mode 100644
index 0000000000000000000000000000000000000000..217b285d8e105debbe7841735eb50786762ace19
--- /dev/null
+++ b/video_prediction_tools/data_split/kth/datasplit.json
@@ -0,0 +1,14 @@
+{
+  "train": {
+    "index1": [0, 100],
+    "index2": [150, 200]
+  },
+  "val":
+  {
+    "index1": [110, 149]
+  },
+  "test":
+  {
+    "index1": [150, 200]
+  }
+}
diff --git a/video_prediction_tools/data_split/kth/datasplit_template.json b/video_prediction_tools/data_split/kth/datasplit_template.json
new file mode 100644
index 0000000000000000000000000000000000000000..11407a0439e7bd3d1397d6dfce9cce660786a866
--- /dev/null
+++ b/video_prediction_tools/data_split/kth/datasplit_template.json
@@ -0,0 +1,21 @@
+# NOTE: This json-file should not be processed and simply serves as an exemplary file to configure the datasplit for the kth human action dataset.
+# If you would like to generate your own datasplit config file, you may copy this template and modify it to your personal needs.
+# However, remember to remove any comment lines (starting with #) from your config-file then!
+#
+# Explanation: In the following, the data is split based on indices; each index entry holds a list with two elements, namely the start and end indices into the
+# raw dataset.
+# Be aware that this is a pure data file, i.e. do not make use of any Python-functions such as np.range or similar here!
+{
+  "train": {
+    "index1": [0, 100],
+    "index2": [150, 200]
+  },
+  "val":
+  {
+    "index1": [110, 149]
+  },
+  "test":
+  {
+    "index1": [150, 200]
+  }
+}
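Since plain JSON has no comment syntax, the `#` lines in the templates above must be stripped before parsing, as the NOTE demands. A minimal, hypothetical helper (not part of the repository) illustrating one way to do this:

```python
import json

def load_datasplit_template(path):
    # drop the '#' comment lines the template carries; plain JSON cannot hold comments
    with open(path) as f:
        payload = "".join(line for line in f if not line.lstrip().startswith("#"))
    return json.loads(payload)

split = load_datasplit_template("datasplit_template.json")
print(split["train"])  # {'index1': [0, 100], 'index2': [150, 200]}
```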
diff --git a/video_prediction_tools/data_split/moving_mnist/datasplit.json b/video_prediction_tools/data_split/moving_mnist/datasplit.json
new file mode 100644
index 0000000000000000000000000000000000000000..217b285d8e105debbe7841735eb50786762ace19
--- /dev/null
+++ b/video_prediction_tools/data_split/moving_mnist/datasplit.json
@@ -0,0 +1,14 @@
+{
+  "train": {
+    "index1": [0, 100],
+    "index2": [150, 200]
+  },
+  "val":
+  {
+    "index1": [110, 149]
+  },
+  "test":
+  {
+    "index1": [150, 200]
+  }
+}
diff --git a/video_prediction_tools/data_split/moving_mnist/datasplit_template.json b/video_prediction_tools/data_split/moving_mnist/datasplit_template.json
new file mode 100644
index 0000000000000000000000000000000000000000..11407a0439e7bd3d1397d6dfce9cce660786a866
--- /dev/null
+++ b/video_prediction_tools/data_split/moving_mnist/datasplit_template.json
@@ -0,0 +1,21 @@
+# NOTE: This json-file should not be processed and simply serves as an exemplary file to configure the datasplit for the moving mnist dataset.
+# If you would like to generate your own datasplit config file, you may copy this template and modify it to your personal needs.
+# However, remember to remove any comment lines (starting with #) from your config-file then!
+#
+# Explanation: In the following, the data is split based on indices; each index entry holds a list with two elements, namely the start and end indices into the
+# raw dataset.
+# Be aware that this is a pure data file, i.e. do not make use of any Python-functions such as np.range or similar here!
+{
+  "train": {
+    "index1": [0, 100],
+    "index2": [150, 200]
+  },
+  "val":
+  {
+    "index1": [110, 149]
+  },
+  "test":
+  {
+    "index1": [150, 200]
+  }
+}
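For the index-based splits (bair_action_free, kth, moving_mnist), each entry is a `[start, end]` pair into the raw dataset. A short sketch of how such a file could be expanded into concrete sample indices; the templates do not state whether the end index is inclusive, so the sketch assumes it is:

```python
import json

with open("datasplit.json") as f:  # e.g. the moving_mnist split above
    split = json.load(f)

train_indices = []
for start, end in split["train"].values():  # e.g. "index1": [0, 100]
    train_indices.extend(range(start, end + 1))  # assumes an inclusive end index
print(len(train_indices))  # 101 + 51 samples for the split above
```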
diff --git a/video_prediction_tools/main_scripts/main_preprocess_data_step2.py b/video_prediction_tools/main_scripts/main_preprocess_data_step2.py
index 3695b90b30465b6698223bfcf9415149533be6d2..d5a57b8d173c65e5b8c7bda459053bc21eaf07ab 100644
--- a/video_prediction_tools/main_scripts/main_preprocess_data_step2.py
+++ b/video_prediction_tools/main_scripts/main_preprocess_data_step2.py
@@ -12,26 +12,20 @@ from mpi4py import MPI
 from general_utils import get_unique_vars
 from statistics import Calc_data_stat
 from data_preprocess.preprocess_data_step2 import *
+import warnings
 
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("-input_dir", type=str)
-    parser.add_argument("-output_dir", type=str)
-    parser.add_argument("-datasplit_config", type=str, \
-                        help="The path to the datasplit_config json file which contains the details of train/val/testing")
-    parser.add_argument("-hparams_dict_config", type=str, \
-                        help="The path to the dict that contains hparameters.", default="")
+    parser.add_argument("-sequence_length", type=int, default=20)
     parser.add_argument("-sequences_per_file", type=int, default=20)
     args = parser.parse_args()
-    ins = ERA5Pkl2Tfrecords(input_dir=args.input_dir, output_dir=args.output_dir,
-                            datasplit_config=args.datasplit_config,
-                            hparams_dict_config=args.hparams_dict_config,
-                            sequences_per_file=args.sequences_per_file)
+    ins = ERA5Pkl2Tfrecords(input_dir=args.input_dir,
+                            sequence_length=args.sequence_length,
+                            sequences_per_file=args.sequences_per_file)
 
-    partition = ins.data_dict
-    partition_data = partition.values()
-    years, months = ins.get_years_months()
+    years, months, years_months = ins.get_years_months()
     input_dir_pkl = os.path.join(args.input_dir, "pickle")
     # initialize MPI
     comm = MPI.COMM_WORLD
@@ -49,31 +43,29 @@ def main():
 
-        # loop over whole data set (training, dev and test set) to collect the intermediate statistics
+        # loop over the whole dataset to collect the intermediate statistics
         print("Start collecting statistics from the whole dataset to be processed...")
-        for split in partition.keys():
-            values = partition[split]
-            for year in values.keys():
-                file_dir = os.path.join(stat_dir_prefix, year)
-                for month in values[year]:
-                    if os.path.isfile(os.path.join(file_dir, "stat_" + '{0:02}'.format(month) + ".json")):
-                        # process stat-file:
-                        stat_obj.acc_stat_master(file_dir, int(month))  # process monthly statistic-file
-                    else:
-                        raise ("The stat file does not exist:", os.path.join(file_dir, "stat_" + '{0:02}'.\
-                               format(month) + ".json"))
+
+        for year in years:
+            file_dir = os.path.join(stat_dir_prefix, year)
+            for month in months:
+                if os.path.isfile(os.path.join(file_dir, "stat_" + '{0:02}'.format(month) + ".json")):
+                    # process the monthly statistic-file
+                    stat_obj.acc_stat_master(file_dir, int(month))
+                else:
+                    warnings.warn("The stat file for year {} month {} does not exist".format(year, month))
         # finalize statistics and write to json-file
         stat_obj.finalize_stat_master(vars_uni)
         stat_obj.write_stat_json(input_dir_pkl)
 
         # organize the parallelized partitioning
         real_years_months = []
-        for year_months in partition_data:
-            print("I am here year:", year_months)
-            for year in year_months.keys():
-                for month in year_months[year]:
-                    print("I am here month", month)
-                    year_month = "Y_{}_M_{}".format(year, month)
-                    real_years_months.append(year_month)
+        for i, year in enumerate(years):
+            # collect every year/month combination found in the pickle directory
+            for month in years_months[i]:
+                year_month = "Y_{}_M_{}".format(year, month)
+                real_years_months.append(year_month)
 
         broadcast_lists = [list(years), real_years_months]
@@ -95,14 +86,12 @@ def main():
         real_years_months = message_in[1]
 
     for year in years:
-        # loop over the years in the datasplit_dict which we want to process,
-        # while months that are not in the datasplit_dict are skipped
         year_rank = "Y_{}_M_{}".format(year, my_rank)
         if year_rank in real_years_months:
-            # Initilial instance
-            ins2 = ERA5Pkl2Tfrecords(input_dir=args.input_dir, output_dir=args.output_dir,
-                                     datasplit_config=args.datasplit_config,
-                                     hparams_dict_config=args.hparams_dict_config, sequences_per_file=args.sequences_per_file)
+            # initialize an instance for this rank
+            ins2 = ERA5Pkl2Tfrecords(input_dir=args.input_dir,
+                                     sequence_length=args.sequence_length,
+                                     sequences_per_file=args.sequences_per_file)
             # create the tfrecords-files
             ins2.read_pkl_and_save_tfrecords(year=year, month=my_rank)
-            print("Year {} finished", year)
+            print("Year {} finished".format(year))
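To summarize the parallelization above: rank 0 aggregates the monthly statistics and broadcasts the available `Y_<year>_M_<month>` combinations, after which every worker rank converts exactly the month that equals its rank number. A standalone sketch of that mapping (hypothetical values; no MPI needed to follow the logic):

```python
# what rank 0 broadcasts, given years == ["2017"] and twelve month files on disk
real_years_months = ["Y_2017_M_{}".format(m) for m in range(1, 13)]

my_rank = 3  # hypothetical worker rank
for year in ["2017"]:
    if "Y_{}_M_{}".format(year, my_rank) in real_years_months:
        # this rank would write the tfrecords for month 3 of 2017
        print("rank", my_rank, "processes year", year, "month", my_rank)
```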