Skip to content
Snippets Groups Projects
Commit 034e9458 authored by stadtler1's avatar stadtler1
Browse files

Merge branch 'bing_issue#001_issue#003_test_juwels' of...

Merge branch 'bing_issue#001_issue#003_test_juwels' of https://gitlab.version.fz-juelich.de/toar/ambs into bing_issue#001_issue#003_test_juwels
parents 10d7dac4 cd1c7255
No related tags found
No related merge requests found
Pipeline #42178 failed
...@@ -8,8 +8,8 @@ ...@@ -8,8 +8,8 @@
#SBATCH --cpus-per-task=1 #SBATCH --cpus-per-task=1
#SBATCH --output=DataExtraction-out.%j #SBATCH --output=DataExtraction-out.%j
#SBATCH --error=DataExtraction-err.%j #SBATCH --error=DataExtraction-err.%j
#SBATCH --time=00:20:00 #SBATCH --time=05:20:00
#SBATCH --partition=devel #SBATCH --partition=batch
#SBATCH --mail-type=ALL #SBATCH --mail-type=ALL
#SBATCH --mail-user=b.gong@fz-juelich.de #SBATCH --mail-user=b.gong@fz-juelich.de
...@@ -23,8 +23,8 @@ module load h5py/2.9.0-Python-3.6.8 ...@@ -23,8 +23,8 @@ module load h5py/2.9.0-Python-3.6.8
module load mpi4py/3.0.1-Python-3.6.8 module load mpi4py/3.0.1-Python-3.6.8
module load netcdf4-python/1.5.0.1-Python-3.6.8 module load netcdf4-python/1.5.0.1-Python-3.6.8
#srun python ../../workflow_parallel_frame_prediction/DataExtraction/mpi_stager_v2.py --source_dir /p/fastdata/slmet/slmet111/met_data/ecmwf/era5/nc/2014/ --destination_dir ${SAVE_DIR}/extractedData/2014 year=2012
srun python ../../workflow_parallel_frame_prediction/DataExtraction/mpi_stager_v2.py --source_dir /p/fastdata/slmet/slmet111/met_data/ecmwf/era5/nc/${year}/ --destination_dir ${SAVE_DIR}/extractedData/${year}
# 2tier pystager # 2tier pystager
srun python ../../workflow_parallel_frame_prediction/DataExtraction/main_single_master.py --source_dir /p/fastdata/slmet/slmet111/met_data/ecmwf/era5/nc/2013/ --destination_dir ${SAVE_DIR}/extractedData/2013 #srun python ../../workflow_parallel_frame_prediction/DataExtraction/main_single_master.py --source_dir /p/fastdata/slmet/slmet111/met_data/ecmwf/era5/nc/${year}/ --destination_dir ${SAVE_DIR}/extractedData/${year}
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
#SBATCH --cpus-per-task=1 #SBATCH --cpus-per-task=1
#SBATCH --output=DataPreprocess_to_tf-out.%j #SBATCH --output=DataPreprocess_to_tf-out.%j
#SBATCH --error=DataPreprocess_to_tf-err.%j #SBATCH --error=DataPreprocess_to_tf-err.%j
#SBATCH --time=00:50:00 #SBATCH --time=01:20:00
#SBATCH --partition=batch #SBATCH --partition=batch
#SBATCH --mail-type=ALL #SBATCH --mail-type=ALL
#SBATCH --mail-user=b.gong@fz-juelich.de #SBATCH --mail-user=b.gong@fz-juelich.de
...@@ -20,8 +20,8 @@ module load mpi4py/3.0.1-Python-3.6.8 ...@@ -20,8 +20,8 @@ module load mpi4py/3.0.1-Python-3.6.8
module load TensorFlow/1.13.1-GPU-Python-3.6.8 module load TensorFlow/1.13.1-GPU-Python-3.6.8
source_dir=/p/scratch/deepacf/video_prediction_shared_folder/preproceesedData source_dir=/p/scratch/deepacf/video_prediction_shared_folder/preprocessedData/
destination_dir=/p/scratch/deepacf/video_prediction_shared_folder/preproceesedData destination_dir=/p/scratch/deepacf/video_prediction_shared_folder/preprocessedData/
srun python ../video_prediction/datasets/era5_dataset_v2.py ${source_dir}/hickle/splits ${destination_dir}/tfrecords -vars T2 MSL gph500 -height 128 -width 160 -seq_length 20 srun python ../video_prediction/datasets/era5_dataset_v2.py /${source_dir}/hickle/splits ${destination_dir}/tfrecords -vars T2 MSL gph500 -height 128 -width 160 -seq_length 20
...@@ -22,9 +22,9 @@ module load netcdf4-python/1.5.0.1-Python-3.6.8 ...@@ -22,9 +22,9 @@ module load netcdf4-python/1.5.0.1-Python-3.6.8
module load h5py/2.9.0-Python-3.6.8 module load h5py/2.9.0-Python-3.6.8
source_dir=/p/scratch/deepacf/video_prediction_shared_folder/preproceesedData source_dir=/p/scratch/deepacf/video_prediction_shared_folder/preprocessedData/
checkpoint_dir=/p/scratch/deepacf/video_prediction_shared_folder/models checkpoint_dir=/p/scratch/deepacf/video_prediction_shared_folder/models/
results_dir=/p/scratch/deepacf/video_prediction_shared_folder/results results_dir=/p/scratch/deepacf/video_prediction_shared_folder/results/
......
...@@ -25,8 +25,8 @@ module load cuDNN/7.5.1.10-CUDA-10.1.105 ...@@ -25,8 +25,8 @@ module load cuDNN/7.5.1.10-CUDA-10.1.105
# declare directory-variables which will be modified appropriately during Preprocessing (invoked by mpi_split_data_multi_years.py) # declare directory-variables which will be modified appropriately during Preprocessing (invoked by mpi_split_data_multi_years.py)
source_dir=/p/scratch/deepacf/video_prediction_shared_folder/preproceesedData source_dir=/p/scratch/deepacf/video_prediction_shared_folder/preprocessedData/
destination_dir=/p/scratch/deepacf/video_prediction_shared_folder/models destination_dir=/p/scratch/deepacf/video_prediction_shared_folder/models/
#define model type, hyperparams setting up #define model type, hyperparams setting up
#source hyperparam_dir.sh #source hyperparam_dir.sh
......
{ {
"batch_size": 10, "batch_size": 10,
"lr": 0.001, "lr": 0.001,
"max_epochs":100, "max_epochs":30,
"context_frames":10, "context_frames":10,
"sequence_length":20 "sequence_length":20
......
...@@ -12,6 +12,14 @@ import numpy as np ...@@ -12,6 +12,14 @@ import numpy as np
import tensorflow as tf import tensorflow as tf
from video_prediction import datasets, models from video_prediction import datasets, models
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from json import JSONEncoder
import pickle as pkl
class NumpyArrayEncoder(JSONEncoder):
    """JSON encoder that additionally understands NumPy arrays and scalars.

    Use it via ``json.dump(obj, fp, cls=NumpyArrayEncoder)`` or
    ``json.dumps(obj, cls=NumpyArrayEncoder)``.
    """

    def default(self, obj):
        """Return a JSON-serializable replacement for ``obj``.

        Args:
            obj: A value the stock encoder could not serialize.

        Returns:
            A (nested) list for ``np.ndarray`` input, or the matching
            built-in Python scalar for a NumPy scalar.

        Raises:
            TypeError: If ``obj`` is neither a NumPy array nor a NumPy
                scalar (raised by ``JSONEncoder.default``).
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        # NumPy scalars such as np.float32 / np.int64 / np.bool_ are not
        # subclasses of the built-in types, so the stock encoder rejects them.
        if isinstance(obj, np.generic):
            return obj.item()
        return super().default(obj)
def add_tag_suffix(summary, tag_suffix): def add_tag_suffix(summary, tag_suffix):
summary_proto = tf.Summary() summary_proto = tf.Summary()
...@@ -159,7 +167,15 @@ def plot_train(train_losses,val_losses,output_dir): ...@@ -159,7 +167,15 @@ def plot_train(train_losses,val_losses,output_dir):
plt.legend() plt.legend()
plt.savefig(os.path.join(output_dir,'plot_train.png')) plt.savefig(os.path.join(output_dir,'plot_train.png'))
def save_results_to_dict(results_dict,output_dir):
    """Write ``results_dict`` as JSON to ``results.json`` inside ``output_dir``.

    Args:
        results_dict: Mapping of result names to values to be serialized.
        output_dir: Directory that receives ``results.json``; an existing
            file of that name is overwritten.
    """
    results_path = os.path.join(output_dir, "results.json")
    with open(results_path, "w") as fp:
        # Serialize through this module's NumpyArrayEncoder so that NumPy
        # values in the results do not make json.dump raise TypeError.
        json.dump(results_dict, fp, cls=NumpyArrayEncoder)
def save_results_to_pkl(train_losses,val_losses, output_dir):
    """Pickle the training and validation loss histories into ``output_dir``.

    ``train_losses`` is written to ``train_losses.pkl`` and ``val_losses``
    to ``val_losses.pkl``; existing files with those names are overwritten.
    """
    targets = (
        ("train_losses.pkl", train_losses),
        ("val_losses.pkl", val_losses),
    )
    # Same order as before: training losses first, then validation losses.
    for file_name, losses in targets:
        pkl_path = os.path.join(output_dir, file_name)
        with open(pkl_path, "wb") as handle:
            pkl.dump(losses, handle)
def main(): def main():
...@@ -237,7 +253,10 @@ def main(): ...@@ -237,7 +253,10 @@ def main():
print ("number of exmaples per epoch:",num_examples_per_epoch) print ("number of exmaples per epoch:",num_examples_per_epoch)
steps_per_epoch = int(num_examples_per_epoch/batch_size) steps_per_epoch = int(num_examples_per_epoch/batch_size)
total_steps = steps_per_epoch * max_epochs total_steps = steps_per_epoch * max_epochs
#mock total_steps only for fast debugging
#total_steps = 10
print ("Total steps for training:",total_steps) print ("Total steps for training:",total_steps)
results_dict = {}
with tf.Session(config=config) as sess: with tf.Session(config=config) as sess:
print("parameter_count =", sess.run(parameter_count)) print("parameter_count =", sess.run(parameter_count))
sess.run(tf.global_variables_initializer()) sess.run(tf.global_variables_initializer())
...@@ -249,7 +268,7 @@ def main(): ...@@ -249,7 +268,7 @@ def main():
# step is relative to the start_step # step is relative to the start_step
train_losses=[] train_losses=[]
val_losses=[] val_losses=[]
run_start_time = time.time()
for step in range(total_steps): for step in range(total_steps):
global_step = sess.run(model.global_step) global_step = sess.run(model.global_step)
print ("global_step:", global_step) print ("global_step:", global_step)
...@@ -294,8 +313,13 @@ def main(): ...@@ -294,8 +313,13 @@ def main():
else: else:
print ("The model name does not exist") print ("The model name does not exist")
print("saving model to", args.output_dir) #print("saving model to", args.output_dir)
saver.save(sess, os.path.join(args.output_dir, "model"), global_step=step)# saver.save(sess, os.path.join(args.output_dir, "model"), global_step=step)#
train_time = time.time() - run_start_time
results_dict = {"train_time":train_time,
"total_steps":total_steps}
save_results_to_dict(results_dict,args.output_dir)
save_results_to_pkl(train_losses, val_losses, args.output_dir)
print("train_losses:",train_losses) print("train_losses:",train_losses)
print("val_losses:",val_losses) print("val_losses:",val_losses)
plot_train(train_losses,val_losses,args.output_dir) plot_train(train_losses,val_losses,args.output_dir)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment