diff --git a/video_prediction_savp/HPC_scripts/train_movingmnist.sh b/video_prediction_savp/HPC_scripts/train_movingmnist.sh
index cb20b32c8e80cef704ae1efb7bc770991e381d0f..006ff73c30c4a53c80aef9371bfbe29fac39f973 100755
--- a/video_prediction_savp/HPC_scripts/train_movingmnist.sh
+++ b/video_prediction_savp/HPC_scripts/train_movingmnist.sh
@@ -8,9 +8,9 @@
 #SBATCH --error=train_moving_mnist-err.%j
 #SBATCH --time=00:20:00
 #SBATCH --gres=gpu:1
-#SBATCH --partition=develgpus
+#SBATCH --partition=gpus
 #SBATCH --mail-type=ALL
-#SBATCH --mail-user=s.stadtler@fz-juelich.de
+#SBATCH --mail-user=b.gong@fz-juelich.de
 
 ##jutil env activate -p cjjsc42
 
@@ -40,6 +40,8 @@ destination_dir=/p/project/deepacf/deeprain/video_prediction_shared_folder/model
 model=convLSTM
 dataset=moving_mnist
 model_hparams=../hparams/${dataset}/${model}/model_hparams.json
+destination_dir=${destination_dir}/${model}/"$(date +"%Y%m%dT%H%M")_"$USER""
 
 # rund training
-srun python ../scripts/train_dummy.py --input_dir ${source_dir}/tfrecords/ --dataset moving_mnist --model ${model} --model_hparams_dict ${model_hparams} --output_dir ${destination_dir}/${model}_bing_20200902/
+
+srun python ../scripts/train_dummy.py --input_dir ${source_dir}/tfrecords/ --dataset moving_mnist --model ${model} --model_hparams_dict ${model_hparams} --output_dir ${destination_dir}/
diff --git a/video_prediction_savp/hparams/era5/convLSTM/model_hparams.json b/video_prediction_savp/hparams/era5/convLSTM/model_hparams.json
index d4942bea2ab5d6af424844b74d3769ccf699502f..fde951edd2e6b41965fbdce6ce831c1e154cbd0e 100644
--- a/video_prediction_savp/hparams/era5/convLSTM/model_hparams.json
+++ b/video_prediction_savp/hparams/era5/convLSTM/model_hparams.json
@@ -1,8 +1,8 @@
 {
-    "batch_size": 10,
+    "batch_size": 4,
     "lr": 0.001,
-    "max_epochs":2,
+    "max_epochs":20,
     "context_frames":10,
     "sequence_length":20,
     "loss_fun":"rmse"
diff --git a/video_prediction_savp/hparams/moving_mnist/convLSTM/model_hparams.json b/video_prediction_savp/hparams/moving_mnist/convLSTM/model_hparams.json
index b07caa0f35f54c1d5007e9fbc6802fe24f1adac0..b59f6cb2ee96162b2eb6014d7ca6bd37f54d4218 100644
--- a/video_prediction_savp/hparams/moving_mnist/convLSTM/model_hparams.json
+++ b/video_prediction_savp/hparams/moving_mnist/convLSTM/model_hparams.json
@@ -2,7 +2,7 @@
 {
     "batch_size": 10,
     "lr": 0.001,
-    "max_epochs":2,
+    "max_epochs":20,
     "context_frames":10,
     "sequence_length":20,
     "loss_fun":"cross_entropy"
diff --git a/video_prediction_savp/scripts/generate_movingmnist.py b/video_prediction_savp/scripts/generate_movingmnist.py
index 0ec2af488c81dddeef6bff2deeb867c4e7b4ffed..d4fbf5eb5d8d8f4cad87ae26d15bc2787d9e6c0a 100644
--- a/video_prediction_savp/scripts/generate_movingmnist.py
+++ b/video_prediction_savp/scripts/generate_movingmnist.py
@@ -318,7 +318,7 @@ def main():
             print("gene_images_denorm:",gen_images_denorm[0][0])
 
             #Generate images inputs
-            plot_seq_imgs(imgs=input_images_denorm[:context_frames-1,:,:,0],idx = sample_ind + i, label="Ground Truth",output_png_dir=args.results_dir)
+            plot_seq_imgs(imgs=input_images_denorm[context_frames+1:,:,:,0],idx = sample_ind + i, label="Ground Truth",output_png_dir=args.results_dir)
 
             #Generate forecast images
             plot_seq_imgs(imgs=gen_images_denorm[context_frames:,:,:,0],idx = sample_ind + i,label="Forecast by Model " + args.model,output_png_dir=args.results_dir)
diff --git a/video_prediction_savp/scripts/train_dummy.py b/video_prediction_savp/scripts/train_dummy.py
index f693d0a6689890dd930c1dcb06338ff140c449a9..0417a36514fd6136fb9fbe934bfb396633fa6093 100644
--- a/video_prediction_savp/scripts/train_dummy.py
+++ b/video_prediction_savp/scripts/train_dummy.py
@@ -16,13 +16,6 @@
 from json import JSONEncoder
 import pickle as pkl
 
-class NumpyArrayEncoder(JSONEncoder):
-    def default(self, obj):
-        if isinstance(obj, np.ndarray):
-            return obj.tolist()
-        return JSONEncoder.default(self, obj)
-
-
 def add_tag_suffix(summary, tag_suffix):
     summary_proto = tf.Summary()
     summary_proto.ParseFromString(summary)
@@ -80,7 +73,6 @@ def set_seed(seed):
         random.seed(seed)
 
 def load_params_from_checkpoints_dir(model_hparams_dict,checkpoint,dataset,model):
-
     model_hparams_dict_load = {}
     if model_hparams_dict:
         with open(model_hparams_dict) as f:
@@ -159,8 +151,19 @@ def make_dataset_iterator(train_dataset, val_dataset, batch_size ):
     return inputs,train_handle, val_handle
 
 
-def plot_train(train_losses,val_losses,output_dir):
-    iterations = list(range(len(train_losses)))
+def plot_train(train_losses,val_losses,step,output_dir):
+    """
+    Plot the training and validation losses against the training steps
+    params:
+        train_losses/val_losses (list): training and validation losses; their length should equal the number of training steps
+        step (int): current training step
+        output_dir (str): the path where the plot is saved
+
+    return: None
+    """
+
+    iterations = list(range(len(train_losses)))
+    if len(train_losses) != len(val_losses) or len(train_losses) != step + 1: raise ValueError("The length of train_losses and val_losses must both be equal to step + 1!")
     plt.plot(iterations, train_losses, 'g', label='Training loss')
     plt.plot(iterations, val_losses, 'b', label='validation loss')
     plt.title('Training and Validation loss')
@@ -168,6 +171,8 @@ def plot_train(train_losses,val_losses,output_dir):
     plt.ylabel('Loss')
     plt.legend()
     plt.savefig(os.path.join(output_dir,'plot_train.png'))
+    plt.close()
+    return None
 
 def save_results_to_dict(results_dict,output_dir):
     with open(os.path.join(output_dir,"results.json"),"w") as fp:
@@ -257,6 +262,7 @@ def main():
     num_examples_per_epoch = train_dataset.num_examples_per_epoch()
     print ("number of exmaples per epoch:",num_examples_per_epoch)
     steps_per_epoch = int(num_examples_per_epoch/batch_size)
+    # the total number of steps equals the number of steps per epoch multiplied by the number of epochs
     total_steps = steps_per_epoch * max_epochs
     global_step = tf.train.get_or_create_global_step()
     #mock total_steps only for fast debugging
@@ -284,13 +290,12 @@
            # --- Scarlet 20200813
            print ("step:", step)
            val_handle_eval = sess.run(val_handle)
 
-            #Fetch variables in the graph
-
            fetches = {"train_op": model.train_op}
            #fetches["latent_loss"] = model.latent_loss
            fetches["summary"] = model.summary_op
-
+            fetches["global_step"] = model.global_step
+
            if model.__class__.__name__ == "McNetVideoPredictionModel" or model.__class__.__name__ == "VanillaConvLstmVideoPredictionModel" or model.__class__.__name__ == "VanillaVAEVideoPredictionModel":
                fetches["global_step"] = model.global_step
                fetches["total_loss"] = model.total_loss
@@ -326,8 +331,8 @@
            val_results = sess.run(val_fetches,feed_dict={train_handle: val_handle_eval})
            val_losses.append(val_results["total_loss"])
 
-            summary_writer.add_summary(results["summary"])
-            summary_writer.add_summary(val_results["summary"])
+            summary_writer.add_summary(results["summary"],results["global_step"])
+            summary_writer.add_summary(val_results["summary"],results["global_step"])
            summary_writer.flush()
 
            # global_step will have the correct step count if we resume from a checkpoint
@@ -346,19 +351,26 @@ def main():
                print ("The model name does not exist")
 
            #print("saving model to", args.output_dir)
+            saver.save(sess, os.path.join(args.output_dir, "model"), global_step=step)
            # +++ Scarlet 20200813
            timeit_end = time.time()
            # --- Scarlet 20200813
            print("time needed for this step", timeit_end - timeit_start, ' s')
 
+            if step % 20 == 0:
+                # save the pickle file and the plot inside the loop, in case the training process cannot finish before the job ends
+                save_results_to_pkl(train_losses,val_losses,args.output_dir)
+                plot_train(train_losses,val_losses,step,args.output_dir)
+
+
        train_time = time.time() - run_start_time
        results_dict = {"train_time":train_time,
                        "total_steps":total_steps}
        save_results_to_dict(results_dict,args.output_dir)
-        save_results_to_pkl(train_losses, val_losses, args.output_dir)
+        #save_results_to_pkl(train_losses, val_losses, args.output_dir)
        print("train_losses:",train_losses)
        print("val_losses:",val_losses)
-        plot_train(train_losses,val_losses,args.output_dir)
+        #plot_train(train_losses,val_losses,args.output_dir)
        print("Done")
        # +++ Scarlet 20200814
        print("Total training time:", train_time/60., "min")
diff --git a/video_prediction_savp/video_prediction/layers/BasicConvLSTMCell.py b/video_prediction_savp/video_prediction/layers/BasicConvLSTMCell.py
index 321f6cc7e05320cf83e1173d8004429edf07ec24..c4a095dc8fc3abdbd87c1eaf79adcd7dad99020b 100644
--- a/video_prediction_savp/video_prediction/layers/BasicConvLSTMCell.py
+++ b/video_prediction_savp/video_prediction/layers/BasicConvLSTMCell.py
@@ -88,10 +88,14 @@ class BasicConvLSTMCell(ConvRNNCell):
            else:
                c, h = tf.split(axis = 3, num_or_size_splits = 2, value = state)
            concat = _conv_linear([inputs, h], self.filter_size, self.num_features * 4, True)
-
+            print("concat1:",concat)
            # i = input_gate, j = new_input, f = forget_gate, o = output_gate
            i, j, f, o = tf.split(axis = 3, num_or_size_splits = 4, value = concat)
-
+            print("input gate i:",i)
+            print("new_input j:",j)
+            print("forget gate:",f)
+            print("output gate:",o)
+
            new_c = (c * tf.nn.sigmoid(f + self._forget_bias) + tf.nn.sigmoid(i) *
                     self._activation(j))
            new_h = self._activation(new_c) * tf.nn.sigmoid(o)
@@ -100,6 +104,8 @@ class BasicConvLSTMCell(ConvRNNCell):
                new_state = LSTMStateTuple(new_c, new_h)
            else:
                new_state = tf.concat(axis = 3, values = [new_c, new_h])
+            print("new h", new_h)
+            print("new state",new_state)
            return new_h, new_state
 
 
@@ -135,9 +141,14 @@ def _conv_linear(args, filter_size, num_features, bias, bias_start=0.0, scope=No
        matrix = tf.get_variable(
            "Matrix", [filter_size[0], filter_size[1], total_arg_size_depth, num_features], dtype = dtype)
        if len(args) == 1:
+            print("args[0]:",args[0])
            res = tf.nn.conv2d(args[0], matrix, strides = [1, 1, 1, 1], padding = 'SAME')
+            print("res1:",res)
        else:
+            print("matrix:",matrix)
+            print("tf.concat(axis = 3, values = args):",tf.concat(axis = 3, values = args))
            res = tf.nn.conv2d(tf.concat(axis = 3, values = args), matrix, strides = [1, 1, 1, 1], padding = 'SAME')
+            print("res2:",res)
        if not bias:
            return res
        bias_term = tf.get_variable(
@@ -146,3 +157,4 @@ def _conv_linear(args, filter_size, num_features, bias, bias_start=0.0, scope=No
            initializer = tf.constant_initializer(
                bias_start, dtype = dtype))
        return res + bias_term
+
diff --git a/video_prediction_savp/video_prediction/layers/layer_def.py b/video_prediction_savp/video_prediction/layers/layer_def.py
index 1ceac662136548fde65511815795d184fe91fac1..273b5eaee3cab703841b214ccc09ef190b6dd3ae 100644
--- a/video_prediction_savp/video_prediction/layers/layer_def.py
+++ b/video_prediction_savp/video_prediction/layers/layer_def.py
@@ -55,8 +55,7 @@ def _variable_with_weight_decay(name, shape, stddev, wd,initializer=tf.contrib.l
 
 
 def conv_layer(inputs, kernel_size, stride, num_features, idx, initializer=tf.contrib.layers.xavier_initializer() , activate="relu"):
-    print("conv_layer activation function",activate)
-
+    print("conv_layer activation function",activate)
     with tf.variable_scope('{0}_conv'.format(idx)) as scope:
         input_channels = inputs.get_shape()[-1]
 
@@ -75,7 +74,7 @@ def conv_layer(inputs, kernel_size, stride, num_features, idx, initializer=tf.co
         elif activate == "leaky_relu":
             conv_rect = tf.nn.leaky_relu(conv_biased, name = '{0}_conv'.format(idx))
         elif activate == "sigmoid":
-            conv_rect = tf.nn.sigmoid(conv_biased, name = '{0}_conv'.format(idx))
+            conv_rect = tf.nn.sigmoid(conv_biased, name = '{0}_conv'.format(idx))
         else:
             raise ("activation function is not correct")
         return conv_rect
diff --git a/video_prediction_savp/video_prediction/models/base_model.py b/video_prediction_savp/video_prediction/models/base_model.py
index df479968325946a9d61896d498428d65692c1848..0d3bf6e4b554c70671d4678b530688c44f999b77 100644
--- a/video_prediction_savp/video_prediction/models/base_model.py
+++ b/video_prediction_savp/video_prediction/models/base_model.py
@@ -3,12 +3,10 @@ import itertools
 import os
 import re
 from collections import OrderedDict
-
 import numpy as np
 import tensorflow as tf
 from tensorflow.contrib.training import HParams
 from tensorflow.python.util import nest
-
 import video_prediction as vp
 from video_prediction.utils import tf_utils
 from video_prediction.utils.tf_utils import compute_averaged_gradients, reduce_tensors, local_device_setter, \
@@ -244,7 +242,9 @@ class BaseVideoPredictionModel(object):
                 savers.append(saver)
             restore_op = [saver.saver_def.restore_op_name for saver in savers]
             sess.run(restore_op)
-
+            return True
+        else:
+            return False
 
 class VideoPredictionModel(BaseVideoPredictionModel):
     def __init__(self,
diff --git a/video_prediction_savp/video_prediction/models/vanilla_convLSTM_model.py b/video_prediction_savp/video_prediction/models/vanilla_convLSTM_model.py
index d3b3d4817faa10e6f5db5257fdf4cd526e6d01c7..796486a453f9dc6807928deeb2b8962e2908a4f2 100644
--- a/video_prediction_savp/video_prediction/models/vanilla_convLSTM_model.py
+++ b/video_prediction_savp/video_prediction/models/vanilla_convLSTM_model.py
@@ -30,15 +30,14 @@ class VanillaConvLstmVideoPredictionModel(BaseVideoPredictionModel):
         self.max_epochs = self.hparams.max_epochs
         self.loss_fun = self.hparams.loss_fun
 
+
     def get_default_hparams_dict(self):
         """
         The keys of this dict define valid hyperparameters for instances of
         this class. A class inheriting from this one should override this
         method if it has a different set of hyperparameters.
-
         Returns:
             A dict with the following hyperparameters.
-
             batch_size: batch size for training.
             lr: learning rate. if decay steps is non-zero, this is the
                 learning rate for steps <= decay_step.
@@ -80,7 +79,6 @@ class VanillaConvLstmVideoPredictionModel(BaseVideoPredictionModel):
         else:
             raise ValueError("Loss function is not selected properly, you should chose either 'rmse' or 'cross_entropy'")
-
         #This is the loss for only all the channels(temperature, geo500, pressure)
         #self.total_loss = tf.reduce_mean(
         #    tf.square(self.x[:, self.context_frames:,:,:,:] - self.x_hat_predict_frames[:,:,:,:,:]))
 
@@ -96,10 +94,8 @@ class VanillaConvLstmVideoPredictionModel(BaseVideoPredictionModel):
         self.saveable_variables = [self.global_step] + global_variables
         return None
 
-
     @staticmethod
     def convLSTM_cell(inputs, hidden):
-        y_0 = inputs
         #we only usd patch 1, but the original paper use patch 4 for the moving mnist case, but use 2 for Radar Echo Dataset
         channels = inputs.get_shape()[-1]
         # conv lstm cell
@@ -114,7 +110,6 @@ class VanillaConvLstmVideoPredictionModel(BaseVideoPredictionModel):
         z3 = tf.reshape(output, [-1, output_shape[1], output_shape[2], output_shape[3]])
         #we feed the learn representation into a 1 × 1 convolutional layer to generate the final prediction
         x_hat = ld.conv_layer(z3, 1, 1, channels, "decode_1", activate="sigmoid")
-
         return x_hat, hidden
 
     def convLSTM_network(self):
@@ -143,3 +138,4 @@ class VanillaConvLstmVideoPredictionModel(BaseVideoPredictionModel):
         x_hat = tf.stack(x_hat)
         self.x_hat= tf.transpose(x_hat, [1, 0, 2, 3, 4]) # change first dim with sec dim
         self.x_hat_predict_frames = self.x_hat[:,self.context_frames-1:,:,:,:]
+