|
|
@@ -22,7 +22,7 @@ import re
|
|
|
import tensorflow as tf
|
|
|
|
|
|
|
|
|
-def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, hvd=None, use_fp16=False, amp=False):
|
|
|
+def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, hvd=None, manual_fp16=False, use_fp16=False):
|
|
|
"""Creates an optimizer training op."""
|
|
|
global_step = tf.train.get_or_create_global_step()
|
|
|
|
|
|
@@ -72,7 +72,7 @@ def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, hvd=None,
|
|
|
if hvd is not None:
|
|
|
from horovod.tensorflow.compression import Compression
|
|
|
optimizer = hvd.DistributedOptimizer(optimizer, sparse_as_dense=True, compression=Compression.none)
|
|
|
- if use_fp16 or amp:
|
|
|
+ if manual_fp16 or use_fp16:
|
|
|
loss_scale_manager = tf.contrib.mixed_precision.ExponentialUpdateLossScaleManager(init_loss_scale=2**32, incr_every_n_steps=1000, decr_every_n_nan_or_inf=2, decr_ratio=0.5)
|
|
|
optimizer = tf.contrib.mixed_precision.LossScaleOptimizer(optimizer, loss_scale_manager)
|
|
|
|
|
|
@@ -80,7 +80,7 @@ def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, hvd=None,
|
|
|
grads_and_vars = optimizer.compute_gradients(loss, tvars)
|
|
|
grads_and_vars = [(g,v) for g,v in grads_and_vars if g is not None]
|
|
|
grads, tvars = list(zip(*grads_and_vars))
|
|
|
- all_are_finite = tf.reduce_all([tf.reduce_all(tf.is_finite(g)) for g in grads]) if use_fp16 or amp else tf.constant(True, dtype=tf.bool)
|
|
|
+ all_are_finite = tf.reduce_all([tf.reduce_all(tf.is_finite(g)) for g in grads]) if manual_fp16 or use_fp16 else tf.constant(True, dtype=tf.bool)
|
|
|
|
|
|
# This is how the model was pre-trained.
|
|
|
# ensure global norm is a finite number
|
|
|
@@ -126,7 +126,7 @@ class AdamWeightDecayOptimizer(tf.train.Optimizer):
|
|
|
self.exclude_from_weight_decay = exclude_from_weight_decay
|
|
|
|
|
|
def apply_gradients(self, grads_and_vars, global_step=None, name=None,
|
|
|
- use_fp16=False):
|
|
|
+ manual_fp16=False):
|
|
|
"""See base class."""
|
|
|
assignments = []
|
|
|
for (grad, param) in grads_and_vars:
|
|
|
@@ -134,7 +134,7 @@ class AdamWeightDecayOptimizer(tf.train.Optimizer):
|
|
|
continue
|
|
|
|
|
|
param_name = self._get_variable_name(param.name)
|
|
|
- has_shadow = use_fp16 and param.dtype.base_dtype != tf.float32
|
|
|
+ has_shadow = manual_fp16 and param.dtype.base_dtype != tf.float32
|
|
|
if has_shadow:
|
|
|
# create shadow fp32 weights for fp16 variable
|
|
|
param_fp32 = tf.get_variable(
|