diff --git a/video_prediction_savp/HPC_scripts/DataExtraction.sh b/video_prediction_savp/HPC_scripts/DataExtraction.sh index 50e655ecf88ce1c06723683bcb16a689ef919c2c..81d32b9d97872b1ce25963ae933dff7520f0fdfc 100755 --- a/video_prediction_savp/HPC_scripts/DataExtraction.sh +++ b/video_prediction_savp/HPC_scripts/DataExtraction.sh @@ -8,8 +8,8 @@ #SBATCH --cpus-per-task=1 #SBATCH --output=DataExtraction-out.%j #SBATCH --error=DataExtraction-err.%j -#SBATCH --time=00:20:00 -#SBATCH --partition=devel +#SBATCH --time=05:20:00 +#SBATCH --partition=batch #SBATCH --mail-type=ALL #SBATCH --mail-user=b.gong@fz-juelich.de @@ -23,8 +23,8 @@ module load h5py/2.9.0-Python-3.6.8 module load mpi4py/3.0.1-Python-3.6.8 module load netcdf4-python/1.5.0.1-Python-3.6.8 -#srun python ../../workflow_parallel_frame_prediction/DataExtraction/mpi_stager_v2.py --source_dir /p/fastdata/slmet/slmet111/met_data/ecmwf/era5/nc/2014/ --destination_dir ${SAVE_DIR}/extractedData/2014 +year=2012 +srun python ../../workflow_parallel_frame_prediction/DataExtraction/mpi_stager_v2.py --source_dir /p/fastdata/slmet/slmet111/met_data/ecmwf/era5/nc/${year}/ --destination_dir ${SAVE_DIR}/extractedData/${year} # 2tier pystager -srun python ../../workflow_parallel_frame_prediction/DataExtraction/main_single_master.py --source_dir /p/fastdata/slmet/slmet111/met_data/ecmwf/era5/nc/2013/ --destination_dir ${SAVE_DIR}/extractedData/2013 - +#srun python ../../workflow_parallel_frame_prediction/DataExtraction/main_single_master.py --source_dir /p/fastdata/slmet/slmet111/met_data/ecmwf/era5/nc/${year}/ --destination_dir ${SAVE_DIR}/extractedData/${year} diff --git a/video_prediction_savp/HPC_scripts/DataPreprocess_to_tf.sh b/video_prediction_savp/HPC_scripts/DataPreprocess_to_tf.sh index 35b9547554c62a7738cc03b4ebcf701f7e577286..f35bac2829063b1799fc3706f00ced3388ce0efa 100755 --- a/video_prediction_savp/HPC_scripts/DataPreprocess_to_tf.sh +++ b/video_prediction_savp/HPC_scripts/DataPreprocess_to_tf.sh @@ -6,7 +6,7 @@ #SBATCH --cpus-per-task=1 #SBATCH --output=DataPreprocess_to_tf-out.%j #SBATCH --error=DataPreprocess_to_tf-err.%j -#SBATCH --time=00:50:00 +#SBATCH --time=01:20:00 #SBATCH --partition=batch #SBATCH --mail-type=ALL #SBATCH --mail-user=b.gong@fz-juelich.de @@ -20,8 +20,8 @@ module load mpi4py/3.0.1-Python-3.6.8 module load TensorFlow/1.13.1-GPU-Python-3.6.8 -source_dir=/p/scratch/deepacf/video_prediction_shared_folder/preproceesedData -destination_dir=/p/scratch/deepacf/video_prediction_shared_folder/preproceesedData +source_dir=/p/scratch/deepacf/video_prediction_shared_folder/preprocessedData/ +destination_dir=/p/scratch/deepacf/video_prediction_shared_folder/preprocessedData/ -srun python ../video_prediction/datasets/era5_dataset_v2.py ${source_dir}/hickle/splits ${destination_dir}/tfrecords -vars T2 MSL gph500 -height 128 -width 160 -seq_length 20 +srun python ../video_prediction/datasets/era5_dataset_v2.py /${source_dir}/hickle/splits ${destination_dir}/tfrecords -vars T2 MSL gph500 -height 128 -width 160 -seq_length 20 diff --git a/video_prediction_savp/HPC_scripts/generate_era5.sh b/video_prediction_savp/HPC_scripts/generate_era5.sh index 977e965fd46f918297761e5393b3fa6d832f2dd2..e8208ba85379de5a8b7e061403b4bc5c34a5944e 100755 --- a/video_prediction_savp/HPC_scripts/generate_era5.sh +++ b/video_prediction_savp/HPC_scripts/generate_era5.sh @@ -22,9 +22,9 @@ module load netcdf4-python/1.5.0.1-Python-3.6.8 module load h5py/2.9.0-Python-3.6.8 -source_dir=/p/scratch/deepacf/video_prediction_shared_folder/preproceesedData -checkpoint_dir=/p/scratch/deepacf/video_prediction_shared_folder/models -results_dir=/p/scratch/deepacf/video_prediction_shared_folder/results +source_dir=/p/scratch/deepacf/video_prediction_shared_folder/preprocessedData/ +checkpoint_dir=/p/scratch/deepacf/video_prediction_shared_folder/models/ +results_dir=/p/scratch/deepacf/video_prediction_shared_folder/results/ diff --git a/video_prediction_savp/HPC_scripts/train_era5.sh b/video_prediction_savp/HPC_scripts/train_era5.sh index 34457faf63d2df46a03abab561fad00695fe4cd9..b17756d494c0b7048878936f8ff7cbd004738499 100755 --- a/video_prediction_savp/HPC_scripts/train_era5.sh +++ b/video_prediction_savp/HPC_scripts/train_era5.sh @@ -25,8 +25,8 @@ module load cuDNN/7.5.1.10-CUDA-10.1.105 # declare directory-variables which will be modified appropriately during Preprocessing (invoked by mpi_split_data_multi_years.py) -source_dir=/p/scratch/deepacf/video_prediction_shared_folder/preproceesedData -destination_dir=/p/scratch/deepacf/video_prediction_shared_folder/models +source_dir=/p/scratch/deepacf/video_prediction_shared_folder/preprocessedData/ +destination_dir=/p/scratch/deepacf/video_prediction_shared_folder/models/ #define model type, hyperparams setting up #source hyperparam_dir.sh diff --git a/video_prediction_savp/hparams/era5/convLSTM/model_hparams.json b/video_prediction_savp/hparams/era5/convLSTM/model_hparams.json index 56f54e7c5f143194bc8e1d597f20825fa6d3cc59..5d342f2fd598aa2ecf6c2de2bdb251fba4466c45 100644 --- a/video_prediction_savp/hparams/era5/convLSTM/model_hparams.json +++ b/video_prediction_savp/hparams/era5/convLSTM/model_hparams.json @@ -1,7 +1,7 @@ { "batch_size": 10, "lr": 0.001, - "max_epochs":100, + "max_epochs":30, "context_frames":10, "sequence_length":20 diff --git a/video_prediction_savp/scripts/train_dummy.py b/video_prediction_savp/scripts/train_dummy.py index f157b47507af84330b61eba2211280c92c9171ad..9c621bbfc46dcf8cebb77b006604ed2f6f50056f 100644 --- a/video_prediction_savp/scripts/train_dummy.py +++ b/video_prediction_savp/scripts/train_dummy.py @@ -12,6 +12,14 @@ import numpy as np import tensorflow as tf from video_prediction import datasets, models import matplotlib.pyplot as plt +from json import JSONEncoder +import pickle as pkl +class NumpyArrayEncoder(JSONEncoder): + def default(self, obj): + if isinstance(obj, np.ndarray): + return obj.tolist() + return JSONEncoder.default(self, obj) + def add_tag_suffix(summary, tag_suffix): summary_proto = tf.Summary() @@ -159,8 +167,16 @@ def plot_train(train_losses,val_losses,output_dir): plt.legend() plt.savefig(os.path.join(output_dir,'plot_train.png')) +def save_results_to_dict(results_dict,output_dir): + with open(os.path.join(output_dir,"results.json"),"w") as fp: + json.dump(results_dict,fp) - +def save_results_to_pkl(train_losses,val_losses, output_dir): + with open(os.path.join(output_dir,"train_losses.pkl"),"wb") as f: + pkl.dump(train_losses,f) + with open(os.path.join(output_dir,"val_losses.pkl"),"wb") as f: + pkl.dump(val_losses,f) + def main(): parser = argparse.ArgumentParser() @@ -237,7 +253,10 @@ def main(): print ("number of exmaples per epoch:",num_examples_per_epoch) steps_per_epoch = int(num_examples_per_epoch/batch_size) total_steps = steps_per_epoch * max_epochs + #mock total_steps only for fast debugging + #total_steps = 10 print ("Total steps for training:",total_steps) + results_dict = {} with tf.Session(config=config) as sess: print("parameter_count =", sess.run(parameter_count)) sess.run(tf.global_variables_initializer()) @@ -249,7 +268,7 @@ def main(): # step is relative to the start_step train_losses=[] val_losses=[] - + run_start_time = time.time() for step in range(total_steps): global_step = sess.run(model.global_step) print ("global_step:", global_step) @@ -294,12 +313,17 @@ def main(): else: print ("The model name does not exist") - print("saving model to", args.output_dir) + #print("saving model to", args.output_dir) saver.save(sess, os.path.join(args.output_dir, "model"), global_step=step)# + train_time = time.time() - run_start_time + results_dict = {"train_time":train_time, + "total_steps":total_steps} + save_results_to_dict(results_dict,args.output_dir) + save_results_to_pkl(train_losses, val_losses, args.output_dir) print("train_losses:",train_losses) print("val_losses:",val_losses) plot_train(train_losses,val_losses,args.output_dir) print("Done") - + if __name__ == '__main__': main()