diff --git a/video_prediction_savp/video_prediction/models/vanilla_convLSTM_model.py b/video_prediction_savp/video_prediction/models/vanilla_convLSTM_model.py
index 5a8a2e1f3fffe5c66d5b93e53137300bf792317e..744284fc6c5b52bcde249f1a58e04a41e80339fa 100644
--- a/video_prediction_savp/video_prediction/models/vanilla_convLSTM_model.py
+++ b/video_prediction_savp/video_prediction/models/vanilla_convLSTM_model.py
@@ -70,9 +70,14 @@ class VanillaConvLstmVideoPredictionModel(BaseVideoPredictionModel):
         #self.context_frames_loss = tf.reduce_mean(
         #    tf.square(self.x[:, :self.context_frames, :, :, 0] - self.x_hat_context_frames[:, :, :, :, 0]))
         # This is the loss function (RMSE):
+        #This is the loss function for one channel only (temperature RMSE)
         self.total_loss = tf.reduce_mean(
-            tf.square(self.x[:, self.context_frames:, :, :, 0] - self.x_hat_context_frames[:, (self.context_frames-1):-1, :, :, 0]))
-
+            tf.square(self.x[:, self.context_frames:,:,:,0] - self.x_hat_predict_frames[:,:,:,:,0]))
+
+        #This is the loss over all channels (temperature, geo500, pressure)
+        #self.total_loss = tf.reduce_mean(
+        #    tf.square(self.x[:, self.context_frames:,:,:,:] - self.x_hat_predict_frames[:,:,:,:,:]))
+
         self.train_op = tf.train.AdamOptimizer(
             learning_rate = self.learning_rate).minimize(self.total_loss, global_step = self.global_step)
         self.outputs = {}
@@ -88,35 +93,19 @@ class VanillaConvLstmVideoPredictionModel(BaseVideoPredictionModel):
     @staticmethod
     def convLSTM_cell(inputs, hidden):
-        conv1 = ld.conv_layer(inputs, 3, 2, 8, "encode_1", activate = "leaky_relu")
-
-        conv2 = ld.conv_layer(conv1, 3, 1, 8, "encode_2", activate = "leaky_relu")
-
-        conv3 = ld.conv_layer(conv2, 3, 2, 8, "encode_3", activate = "leaky_relu")
-
-        y_0 = conv3
+        y_0 = inputs #we only use patch size 1; the original paper uses patch size 4 for the Moving MNIST case and 2 for the Radar Echo dataset
         # conv lstm cell
         cell_shape = y_0.get_shape().as_list()
+        channels = cell_shape[-1]
         with tf.variable_scope('conv_lstm', initializer = tf.random_uniform_initializer(-.01, 0.1)):
-            cell = BasicConvLSTMCell(shape = [cell_shape[1], cell_shape[2]], filter_size = [3, 3], num_features = 8)
+            cell = BasicConvLSTMCell(shape = [cell_shape[1], cell_shape[2]], filter_size = [5, 5], num_features = 256)
             if hidden is None:
                 hidden = cell.zero_state(y_0, tf.float32)
             output, hidden = cell(y_0, hidden)
-
-            output_shape = output.get_shape().as_list()
-
-            z3 = tf.reshape(output, [-1, output_shape[1], output_shape[2], output_shape[3]])
-
-            conv5 = ld.transpose_conv_layer(z3, 3, 2, 8, "decode_5", activate = "leaky_relu")
-
-
-            conv6 = ld.transpose_conv_layer(conv5, 3, 1, 8, "decode_6", activate = "leaky_relu")
-
-
-            x_hat = ld.transpose_conv_layer(conv6, 3, 2, 3, "decode_7", activate = "sigmoid") # set activation to linear
+            output_shape = output.get_shape().as_list()
+            z3 = tf.reshape(output, [-1, output_shape[1], output_shape[2], output_shape[3]])
+            #we feed the learned representation into a 1 × 1 convolutional layer to generate the final prediction
+            x_hat = ld.conv_layer(z3, 1, 1, channels, "decode_1", activate="sigmoid")
 
         return x_hat, hidden
 
@@ -124,20 +113,18 @@ class VanillaConvLstmVideoPredictionModel(BaseVideoPredictionModel):
         network_template = tf.make_template('network', VanillaConvLstmVideoPredictionModel.convLSTM_cell) # make the template to share the variables
         # create network
-        x_hat_context = []
         x_hat = []
-        hidden = None
-        #This is for training
-        for i in range(self.sequence_length):
-            if i < self.context_frames:
-                x_1, hidden = network_template(self.x[:, i, :, :, :], hidden)
-            else:
-                x_1, hidden = network_template(x_1, hidden)
-            x_hat_context.append(x_1)
+
+        # for i in range(self.sequence_length-1):
+        #     if i < self.context_frames:
+        #         x_1, hidden = network_template(self.x[:, i, :, :, :], hidden)
+        #     else:
+        #         x_1, hidden = network_template(x_1, hidden)
+        #     x_hat_context.append(x_1)
 
-        #This is for generating video
+        #This is for training (optimization of the ConvLSTM layer)
         hidden_g = None
-        for i in range(self.sequence_length):
+        for i in range(self.sequence_length-1):
             if i < self.context_frames:
                 x_1_g, hidden_g = network_template(self.x[:, i, :, :, :], hidden_g)
             else:
@@ -145,8 +132,6 @@ class VanillaConvLstmVideoPredictionModel(BaseVideoPredictionModel):
             x_hat.append(x_1_g)
 
         # pack them all together
-        x_hat_context = tf.stack(x_hat_context)
         x_hat = tf.stack(x_hat)
-        self.x_hat_context_frames = tf.transpose(x_hat_context, [1, 0, 2, 3, 4]) # change first dim with sec dim
         self.x_hat= tf.transpose(x_hat, [1, 0, 2, 3, 4]) # change first dim with sec dim
-        self.x_hat_predict_frames = self.x_hat[:,self.context_frames:,:,:,:]
+        self.x_hat_predict_frames = self.x_hat[:,self.context_frames-1:,:,:,:]
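For reference, a minimal index-bookkeeping sketch in plain Python (not part of the patch; the sequence_length and context_frames values are hypothetical) of why the generation loop runs sequence_length - 1 steps while the stacked predictions are sliced from context_frames - 1: the prediction emitted at loop step i targets frame i + 1, so dropping the first context_frames - 1 predictions leaves exactly the frames that the loss compares against self.x[:, self.context_frames:, :, :, 0].

# index-alignment sketch, not part of the patch; values are hypothetical
sequence_length = 10   # total frames per sample
context_frames = 4     # frames fed as ground truth before free-running prediction

# the loop "for i in range(self.sequence_length-1)" emits at step i a prediction of frame i+1
predicted_frame_ids = [i + 1 for i in range(sequence_length - 1)]   # [1, 2, ..., 9]

# self.x_hat_predict_frames = self.x_hat[:, context_frames-1:, ...] keeps these predictions
kept_prediction_ids = predicted_frame_ids[context_frames - 1:]      # [4, 5, ..., 9]

# the loss targets self.x[:, context_frames:, ...], i.e. frames context_frames .. sequence_length-1
target_frame_ids = list(range(context_frames, sequence_length))     # [4, 5, ..., 9]

assert kept_prediction_ids == target_frame_ids   # predictions and targets line up frame by frame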