diff --git a/video_prediction/models/mocogan_model.py b/video_prediction/models/mocogan_model.py
index 22490c4d831e65286438bb66fdc4855f8349f91c..3d9189176e59a52b0269a3e0928b47845f814d0a 100644
--- a/video_prediction/models/mocogan_model.py
+++ b/video_prediction/models/mocogan_model.py
@@ -10,39 +10,53 @@ from video_prediction.ops import lrelu, conv2d, flatten, tile_concat, pool2d, de
 from video_prediction.utils import tf_utils
 
 
+def noise(x, use_noise, sigma=0.2):
+    if use_noise:
+        return x + sigma * tf.random_normal(x.shape, 0, 1)
+    return x
+
+
 def create_image_discriminator(images,
                                ndf=64,
-                               norm_layer='instance'):
+                               norm_layer='instance',
+                               use_noise=False,
+                               noise_sigma=None):
     norm_layer = ops.get_norm_layer(norm_layer)
     layers = []
     paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]
 
     with tf.variable_scope("image_layer_1"):
-        h1 = conv2d(tf.pad(images, paddings), ndf, kernel_size=4, strides=2, padding='VALID')
+        h1 = noise(images, use_noise, noise_sigma)
+        h1 = conv2d(tf.pad(h1, paddings), ndf, kernel_size=4, strides=2, padding='VALID', use_bias=False)
         h1 = lrelu(h1, 0.2)
         layers.append(h1)
 
     with tf.variable_scope("image_layer_2"):
-        h2 = conv2d(tf.pad(h1, paddings), ndf * 2, kernel_size=4, strides=2, padding='VALID')
+        h2 = noise(h1, use_noise, noise_sigma)
+        h2 = conv2d(tf.pad(h2, paddings), ndf * 2, kernel_size=4, strides=2, padding='VALID', use_bias=False)
         h2 = norm_layer(h2)
         h2 = lrelu(h2, 0.2)
         layers.append(h2)
 
     with tf.variable_scope("image_layer_3"):
-        h3 = conv2d(tf.pad(h2, paddings), ndf * 4, kernel_size=4, strides=2, padding='VALID')
+        h3 = noise(h2, use_noise, noise_sigma)
+        h3 = conv2d(tf.pad(h3, paddings), ndf * 4, kernel_size=4, strides=2, padding='VALID', use_bias=False)
         h3 = norm_layer(h3)
         h3 = lrelu(h3, 0.2)
         layers.append(h3)
 
     with tf.variable_scope("image_layer_4"):
-        logits = conv2d(h3, 1, kernel_size=4, strides=1, padding='VALID')
+        h4 = noise(h3, use_noise, noise_sigma)
+        logits = conv2d(tf.pad(h4, paddings), 1, kernel_size=4, strides=2, padding='VALID', use_bias=False)
         layers.append(logits)
     return layers
 
 
 def create_video_discriminator(clips,
                                ndf=64,
-                               norm_layer='instance'):
+                               norm_layer='instance',
+                               use_noise=False,
+                               noise_sigma=None):
     norm_layer = ops.get_norm_layer(norm_layer)
     layers = []
     paddings = [[0, 0], [0, 0], [1, 1], [1, 1], [0, 0]]
@@ -50,28 +64,27 @@ def create_video_discriminator(clips,
     clips = tf_utils.transpose_batch_time(clips)
 
     with tf.variable_scope("video_layer_1"):
-        h1 = conv3d(tf.pad(clips, paddings), ndf, kernel_size=4, strides=(1, 2, 2), padding='VALID')
+        h1 = noise(clips, use_noise, noise_sigma)
+        h1 = conv3d(tf.pad(h1, paddings), ndf, kernel_size=4, strides=(1, 2, 2), padding='VALID', use_bias=False)
         h1 = lrelu(h1, 0.2)
         layers.append(h1)
 
     with tf.variable_scope("video_layer_2"):
-        h2 = conv3d(tf.pad(h1, paddings), ndf * 2, kernel_size=4, strides=(1, 2, 2), padding='VALID')
+        h2 = noise(h1, use_noise, noise_sigma)
+        h2 = conv3d(tf.pad(h2, paddings), ndf * 2, kernel_size=4, strides=(1, 2, 2), padding='VALID', use_bias=False)
         h2 = norm_layer(h2)
         h2 = lrelu(h2, 0.2)
         layers.append(h2)
 
     with tf.variable_scope("video_layer_3"):
-        h3 = conv3d(tf.pad(h2, paddings), ndf * 4, kernel_size=4, strides=(1, 2, 2), padding='VALID')
+        h3 = noise(h2, use_noise, noise_sigma)
+        h3 = conv3d(tf.pad(h3, paddings), ndf * 4, kernel_size=4, strides=(1, 2, 2), padding='VALID', use_bias=False)
         h3 = norm_layer(h3)
         h3 = lrelu(h3, 0.2)
         layers.append(h3)
 
     with tf.variable_scope("video_layer_4"):
-        if h3.shape[1].value < 4:
-            kernel_size = (h3.shape[1].value, 4, 4)
-        else:
-            kernel_size = 4
-        logits = conv3d(h3, 1, kernel_size=kernel_size, strides=1, padding='VALID')
+        logits = conv3d(tf.pad(h3, paddings), 1, kernel_size=4, strides=(1, 2, 2), padding='VALID', use_bias=False)
         layers.append(logits)
     return nest.map_structure(tf_utils.transpose_batch_time, layers)
 
@@ -79,7 +92,9 @@
 def create_acvideo_discriminator(clips,
                                  actions,
                                  ndf=64,
-                                 norm_layer='instance'):
+                                 norm_layer='instance',
+                                 use_noise=False,
+                                 noise_sigma=None):
     norm_layer = ops.get_norm_layer(norm_layer)
     layers = []
     paddings = [[0, 0], [0, 0], [1, 1], [1, 1], [0, 0]]
@@ -89,28 +104,27 @@ def create_acvideo_discriminator(clips,
     clip_pairs = tf_utils.transpose_batch_time(clip_pairs)
 
     with tf.variable_scope("acvideo_layer_1"):
-        h1 = conv3d(tf.pad(clip_pairs, paddings), ndf, kernel_size=(3, 4, 4), strides=(1, 2, 2), padding='VALID')
+        h1 = noise(clip_pairs, use_noise, noise_sigma)
+        h1 = conv3d(tf.pad(h1, paddings), ndf, kernel_size=(3, 4, 4), strides=(1, 2, 2), padding='VALID', use_bias=False)
         h1 = lrelu(h1, 0.2)
         layers.append(h1)
 
     with tf.variable_scope("acvideo_layer_2"):
-        h2 = conv3d(tf.pad(h1, paddings), ndf * 2, kernel_size=(3, 4, 4), strides=(1, 2, 2), padding='VALID')
+        h2 = noise(h1, use_noise, noise_sigma)
+        h2 = conv3d(tf.pad(h2, paddings), ndf * 2, kernel_size=(3, 4, 4), strides=(1, 2, 2), padding='VALID', use_bias=False)
         h2 = norm_layer(h2)
         h2 = lrelu(h2, 0.2)
         layers.append(h2)
 
     with tf.variable_scope("acvideo_layer_3"):
-        h3 = conv3d(tf.pad(h2, paddings), ndf * 4, kernel_size=(3, 4, 4), strides=(1, 2, 2), padding='VALID')
+        h3 = noise(h2, use_noise, noise_sigma)
+        h3 = conv3d(tf.pad(h3, paddings), ndf * 4, kernel_size=(3, 4, 4), strides=(1, 2, 2), padding='VALID', use_bias=False)
         h3 = norm_layer(h3)
         h3 = lrelu(h3, 0.2)
         layers.append(h3)
 
     with tf.variable_scope("acvideo_layer_4"):
-        if h3.shape[1].value < 4:
-            kernel_size = (h3.shape[1].value, 4, 4)
-        else:
-            kernel_size = 4
-        logits = conv3d(h3, 1, kernel_size=kernel_size, strides=1, padding='VALID')
+        logits = conv3d(tf.pad(h3, paddings), 1, kernel_size=(3, 4, 4), strides=(1, 2, 2), padding='VALID', use_bias=False)
         layers.append(logits)
     return nest.map_structure(tf_utils.transpose_batch_time, layers)
 
@@ -138,17 +152,20 @@
 def discriminator_fn(targets, inputs=None, hparams=None):
     outputs = {}
     if hparams.image_gan_weight or hparams.image_vae_gan_weight:
-        image_features = create_image_discriminator(image_sample, ndf=hparams.ndf, norm_layer=hparams.norm_layer)
+        image_features = create_image_discriminator(image_sample, ndf=hparams.ndf, norm_layer=hparams.norm_layer,
+                                                    use_noise=hparams.use_noise, noise_sigma=hparams.noise_sigma)
         image_features, image_logits = image_features[:-1], image_features[-1]
         outputs['discrim_image_logits'] = tf.expand_dims(image_logits, axis=0)  # expand dims for the time dimension
         with tf.variable_scope(tf.get_variable_scope(), reuse=True):
-            images_features = create_image_discriminator(flatten(targets, 0, 1), ndf=hparams.ndf, norm_layer=hparams.norm_layer)
+            images_features = create_image_discriminator(flatten(targets, 0, 1), ndf=hparams.ndf, norm_layer=hparams.norm_layer,
+                                                         use_noise=hparams.use_noise, noise_sigma=hparams.noise_sigma)
         images_features = images_features[:-1]
         for i, images_feature in enumerate(images_features):
             images_feature = tf.reshape(images_feature, targets.shape[:2].as_list() + images_feature.shape[1:].as_list())
             outputs['discrim_image_feature%d' % i] = images_feature
     if hparams.video_gan_weight or hparams.video_vae_gan_weight:
-        video_features = create_video_discriminator(clip_sample, ndf=hparams.ndf, norm_layer=hparams.norm_layer)
+        video_features = create_video_discriminator(clip_sample, ndf=hparams.ndf, norm_layer=hparams.norm_layer,
+                                                    use_noise=hparams.use_noise, noise_sigma=hparams.noise_sigma)
         video_features, video_logits = video_features[:-1], video_features[-1]
         outputs['discrim_video_logits'] = video_logits
         for i, video_feature in enumerate(video_features):
@@ -158,7 +175,8 @@
         indices = tf.expand_dims(t_start_indices, axis=0) + tf.expand_dims(t_offset_indices, axis=1)
         actions = inputs['actions'][hparams.context_frames:]
         actions_sample = tf.reshape(tf.gather_nd(actions, flatten(indices, 0, 1)), [hparams.clip_length - 1] + actions.shape.as_list()[1:])
-        acvideo_features = create_acvideo_discriminator(clip_sample, actions_sample, ndf=hparams.ndf, norm_layer=hparams.norm_layer)
+        acvideo_features = create_acvideo_discriminator(clip_sample, actions_sample, ndf=hparams.ndf, norm_layer=hparams.norm_layer,
+                                                        use_noise=hparams.use_noise, noise_sigma=hparams.noise_sigma)
         acvideo_features, acvideo_logits = acvideo_features[:-1], acvideo_features[-1]
         outputs['discrim_acvideo_logits'] = acvideo_logits
         for i, acvideo_feature in enumerate(acvideo_features):
@@ -215,31 +233,31 @@ def create_generator(z,
     layers = []
 
     with tf.variable_scope("layer_1"):
-        h0 = deconv2d(z, ngf * 8, kernel_size=4, strides=1, padding='VALID')
+        h0 = deconv2d(z, ngf * 8, kernel_size=4, strides=1, padding='VALID', use_bias=False)
         h0 = norm_layer(h0)
         h0 = tf.nn.relu(h0)
         layers.append(h0)
 
     with tf.variable_scope("layer_2"):
-        h1 = deconv2d(h0, ngf * 4, kernel_size=4, strides=2)
+        h1 = deconv2d(h0, ngf * 4, kernel_size=4, strides=2, use_bias=False)
         h1 = norm_layer(h1)
         h1 = tf.nn.relu(h1)
         layers.append(h1)
 
     with tf.variable_scope("layer_3"):
-        h2 = deconv2d(h1, ngf * 2, kernel_size=4, strides=2)
+        h2 = deconv2d(h1, ngf * 2, kernel_size=4, strides=2, use_bias=False)
         h2 = norm_layer(h2)
         h2 = tf.nn.relu(h2)
         layers.append(h2)
 
     with tf.variable_scope("layer_4"):
-        h3 = deconv2d(h2, ngf, kernel_size=4, strides=2)
+        h3 = deconv2d(h2, ngf, kernel_size=4, strides=2, use_bias=False)
         h3 = norm_layer(h3)
         h3 = tf.nn.relu(h3)
         layers.append(h3)
 
     with tf.variable_scope("layer_5"):
-        h4 = deconv2d(h3, n_channels, kernel_size=4, strides=2)
+        h4 = deconv2d(h3, n_channels, kernel_size=4, strides=2, use_bias=False)
         h4 = tf.nn.tanh(h4)
         layers.append(h4)
     return h4
@@ -302,6 +320,8 @@ class MoCoGANVideoPredictionModel(VideoPredictionModel):
             dim_z_content=50,
             dim_z_motion=10,
             norm_layer='batch',
+            use_noise=False,
+            noise_sigma=0.0,
             clip_length=10,
             lr=0.0002,
             beta1=0.5,
diff --git a/video_prediction/ops.py b/video_prediction/ops.py
index 58c43e8fed3e4bc7d602a154d91780e6119ef62c..eea4985d9689733d3dbac56a2ede27d1fc532a84 100644
--- a/video_prediction/ops.py
+++ b/video_prediction/ops.py
@@ -759,19 +759,20 @@ def upsample_conv2d_v2(inputs, filters, kernel_size, strides=(1, 1), padding='SA
     return outputs
 
 
-def conv3d(inputs, filters, kernel_size, strides=(1, 1), padding='SAME', use_spectral_norm=False):
+def conv3d(inputs, filters, kernel_size, strides=(1, 1), padding='SAME', use_bias=True, use_spectral_norm=False):
+    kernel_size = list(kernel_size) if isinstance(kernel_size, (tuple, list)) else [kernel_size] * 3
+    strides = list(strides) if isinstance(strides, (tuple, list)) else [strides] * 3
+    input_shape = inputs.get_shape().as_list()
+    kernel_shape = list(kernel_size) + [input_shape[-1], filters]
     with tf.variable_scope('conv3d'):
-        kernel_size = list(kernel_size) if isinstance(kernel_size, (tuple, list)) else [kernel_size] * 3
-        strides = list(strides) if isinstance(strides, (tuple, list)) else [strides] * 3
-        input_shape = inputs.get_shape().as_list()
-        kernel_shape = list(kernel_size) + [input_shape[-1], filters]
         kernel = tf.get_variable('kernel', kernel_shape, dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.02))
         if use_spectral_norm:
             kernel = spectral_normed_weight(kernel)
-        outputs = tf.nn.conv3d(inputs, kernel, [1] + strides + [1], padding=padding)
-        bias = tf.get_variable('bias', [filters], dtype=tf.float32, initializer=tf.zeros_initializer())
-        outputs = tf.nn.bias_add(outputs, bias)
-    return outputs
+        outputs = tf.nn.conv3d(inputs, kernel, [1] + strides + [1], padding=padding)
+        if use_bias:
+            bias = tf.get_variable('bias', [filters], dtype=tf.float32, initializer=tf.zeros_initializer())
+            outputs = tf.nn.bias_add(outputs, bias)
+    return outputs
 
 
 def pool2d(inputs, pool_size, strides=(1, 1), padding='SAME', pool_mode='avg'):
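
For reference, a minimal sketch of how the `noise` helper introduced in this patch behaves. It assumes TensorFlow 1.x graph mode (tf.random_normal, tf.Session), matching the rest of the repo, and the input shape is invented for illustration. With the new hparams defaults (use_noise=False, noise_sigma=0.0) the helper is a pass-through, so the patch changes nothing unless noise injection is switched on.

import tensorflow as tf


def noise(x, use_noise, sigma=0.2):
    # same logic as the patched helper: add zero-mean Gaussian noise
    # scaled by sigma, or return the input unchanged when disabled
    if use_noise:
        return x + sigma * tf.random_normal(x.shape, 0, 1)
    return x


images = tf.zeros([4, 64, 64, 3])                    # illustrative image batch
clean = noise(images, use_noise=False)               # identity, returns `images`
jittered = noise(images, use_noise=True, sigma=0.1)  # adds N(0, 0.1**2) noise

with tf.Session() as sess:
    print(sess.run(jittered).shape)  # (4, 64, 64, 3)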
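
Likewise, a hedged usage sketch of the refactored ops.conv3d, assuming TensorFlow 1.x with the repo on the PYTHONPATH; the clip shape here is made up for the example. The discriminators above pass use_bias=False, presumably because most of those convolutions feed a norm layer whose learned offset subsumes a per-channel bias, making a separate bias variable redundant.

import tensorflow as tf
from video_prediction.ops import conv3d

clips = tf.zeros([8, 10, 64, 64, 3])  # batch, time, height, width, channels
with tf.variable_scope("demo"):
    # a scalar kernel_size broadcasts to (4, 4, 4); strides are ordered
    # (time, height, width); with use_bias=False no 'bias' variable is created
    h = conv3d(clips, 64, kernel_size=4, strides=(1, 2, 2),
               padding='SAME', use_bias=False)
print(h.shape)  # (8, 10, 32, 32, 64)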