diff --git a/video_prediction_savp/HPC_scripts/DataExtraction.sh b/video_prediction_savp/HPC_scripts/DataExtraction.sh index 50e655ecf88ce1c06723683bcb16a689ef919c2c..81d32b9d97872b1ce25963ae933dff7520f0fdfc 100755 --- a/video_prediction_savp/HPC_scripts/DataExtraction.sh +++ b/video_prediction_savp/HPC_scripts/DataExtraction.sh @@ -8,8 +8,8 @@ #SBATCH --cpus-per-task=1 #SBATCH --output=DataExtraction-out.%j #SBATCH --error=DataExtraction-err.%j -#SBATCH --time=00:20:00 -#SBATCH --partition=devel +#SBATCH --time=05:20:00 +#SBATCH --partition=batch #SBATCH --mail-type=ALL #SBATCH --mail-user=b.gong@fz-juelich.de @@ -23,8 +23,8 @@ module load h5py/2.9.0-Python-3.6.8 module load mpi4py/3.0.1-Python-3.6.8 module load netcdf4-python/1.5.0.1-Python-3.6.8 -#srun python ../../workflow_parallel_frame_prediction/DataExtraction/mpi_stager_v2.py --source_dir /p/fastdata/slmet/slmet111/met_data/ecmwf/era5/nc/2014/ --destination_dir ${SAVE_DIR}/extractedData/2014 +year=2012 +srun python ../../workflow_parallel_frame_prediction/DataExtraction/mpi_stager_v2.py --source_dir /p/fastdata/slmet/slmet111/met_data/ecmwf/era5/nc/${year}/ --destination_dir ${SAVE_DIR}/extractedData/${year} # 2tier pystager -srun python ../../workflow_parallel_frame_prediction/DataExtraction/main_single_master.py --source_dir /p/fastdata/slmet/slmet111/met_data/ecmwf/era5/nc/2013/ --destination_dir ${SAVE_DIR}/extractedData/2013 - +#srun python ../../workflow_parallel_frame_prediction/DataExtraction/main_single_master.py --source_dir /p/fastdata/slmet/slmet111/met_data/ecmwf/era5/nc/${year}/ --destination_dir ${SAVE_DIR}/extractedData/${year} diff --git a/video_prediction_savp/hparams/era5/convLSTM/model_hparams.json b/video_prediction_savp/hparams/era5/convLSTM/model_hparams.json index 56f54e7c5f143194bc8e1d597f20825fa6d3cc59..5d342f2fd598aa2ecf6c2de2bdb251fba4466c45 100644 --- a/video_prediction_savp/hparams/era5/convLSTM/model_hparams.json +++ b/video_prediction_savp/hparams/era5/convLSTM/model_hparams.json @@ -1,7 +1,7 @@ { "batch_size": 10, "lr": 0.001, - "max_epochs":100, + "max_epochs":30, "context_frames":10, "sequence_length":20 diff --git a/video_prediction_savp/scripts/train_dummy.py b/video_prediction_savp/scripts/train_dummy.py index f157b47507af84330b61eba2211280c92c9171ad..9c621bbfc46dcf8cebb77b006604ed2f6f50056f 100644 --- a/video_prediction_savp/scripts/train_dummy.py +++ b/video_prediction_savp/scripts/train_dummy.py @@ -12,6 +12,14 @@ import numpy as np import tensorflow as tf from video_prediction import datasets, models import matplotlib.pyplot as plt +from json import JSONEncoder +import pickle as pkl +class NumpyArrayEncoder(JSONEncoder): + def default(self, obj): + if isinstance(obj, np.ndarray): + return obj.tolist() + return JSONEncoder.default(self, obj) + def add_tag_suffix(summary, tag_suffix): summary_proto = tf.Summary() @@ -159,8 +167,16 @@ def plot_train(train_losses,val_losses,output_dir): plt.legend() plt.savefig(os.path.join(output_dir,'plot_train.png')) +def save_results_to_dict(results_dict,output_dir): + with open(os.path.join(output_dir,"results.json"),"w") as fp: + json.dump(results_dict,fp) - +def save_results_to_pkl(train_losses,val_losses, output_dir): + with open(os.path.join(output_dir,"train_losses.pkl"),"wb") as f: + pkl.dump(train_losses,f) + with open(os.path.join(output_dir,"val_losses.pkl"),"wb") as f: + pkl.dump(val_losses,f) + def main(): parser = argparse.ArgumentParser() @@ -237,7 +253,10 @@ def main(): print ("number of exmaples per epoch:",num_examples_per_epoch) steps_per_epoch = int(num_examples_per_epoch/batch_size) total_steps = steps_per_epoch * max_epochs + #mock total_steps only for fast debugging + #total_steps = 10 print ("Total steps for training:",total_steps) + results_dict = {} with tf.Session(config=config) as sess: print("parameter_count =", sess.run(parameter_count)) sess.run(tf.global_variables_initializer()) @@ -249,7 +268,7 @@ def main(): # step is relative to the start_step train_losses=[] val_losses=[] - + run_start_time = time.time() for step in range(total_steps): global_step = sess.run(model.global_step) print ("global_step:", global_step) @@ -294,12 +313,17 @@ def main(): else: print ("The model name does not exist") - print("saving model to", args.output_dir) + #print("saving model to", args.output_dir) saver.save(sess, os.path.join(args.output_dir, "model"), global_step=step)# + train_time = time.time() - run_start_time + results_dict = {"train_time":train_time, + "total_steps":total_steps} + save_results_to_dict(results_dict,args.output_dir) + save_results_to_pkl(train_losses, val_losses, args.output_dir) print("train_losses:",train_losses) print("val_losses:",val_losses) plot_train(train_losses,val_losses,args.output_dir) print("Done") - + if __name__ == '__main__': main()