|
@@ -12,26 +12,25 @@
|
|
|
# See the License for the specific language governing permissions and
|
|
# See the License for the specific language governing permissions and
|
|
|
# limitations under the License.
|
|
# limitations under the License.
|
|
|
|
|
|
|
|
-import time
|
|
|
|
|
import logging
|
|
import logging
|
|
|
-
|
|
|
|
|
|
|
+import time
|
|
|
from profile import Profiler
|
|
from profile import Profiler
|
|
|
|
|
+
|
|
|
|
|
+import dllogger
|
|
|
|
|
+import models
|
|
|
import numpy as np
|
|
import numpy as np
|
|
|
-from optimizer import build_optimizer
|
|
|
|
|
from lr_scheduler import build_lr_scheduler
|
|
from lr_scheduler import build_lr_scheduler
|
|
|
|
|
+from optimizer import build_optimizer
|
|
|
from utils.misc import AverageMeter
|
|
from utils.misc import AverageMeter
|
|
|
from utils.mode import Mode, RunScope
|
|
from utils.mode import Mode, RunScope
|
|
|
from utils.utility import get_num_trainers
|
|
from utils.utility import get_num_trainers
|
|
|
-import models
|
|
|
|
|
-
|
|
|
|
|
-import dllogger
|
|
|
|
|
|
|
|
|
|
import paddle
|
|
import paddle
|
|
|
import paddle.nn.functional as F
|
|
import paddle.nn.functional as F
|
|
|
from paddle.distributed import fleet
|
|
from paddle.distributed import fleet
|
|
|
from paddle.distributed.fleet import DistributedStrategy
|
|
from paddle.distributed.fleet import DistributedStrategy
|
|
|
-from paddle.static import sparsity
|
|
|
|
|
from paddle.distributed.fleet.meta_optimizers.common import CollectiveHelper
|
|
from paddle.distributed.fleet.meta_optimizers.common import CollectiveHelper
|
|
|
|
|
+from paddle.incubate import asp as sparsity
|
|
|
|
|
|
|
|
|
|
|
|
|
def create_feeds(image_shape):
|
|
def create_feeds(image_shape):
|
|
@@ -45,11 +44,13 @@ def create_feeds(image_shape):
|
|
|
key (string): Name of variable to feed.
|
|
key (string): Name of variable to feed.
|
|
|
Value (tuple): paddle.static.data.
|
|
Value (tuple): paddle.static.data.
|
|
|
"""
|
|
"""
|
|
|
- feeds = dict()
|
|
|
|
|
|
|
+ feeds = {}
|
|
|
feeds['data'] = paddle.static.data(
|
|
feeds['data'] = paddle.static.data(
|
|
|
- name="data", shape=[None] + image_shape, dtype="float32")
|
|
|
|
|
|
|
+ name="data", shape=[None] + image_shape, dtype="float32"
|
|
|
|
|
+ )
|
|
|
feeds['label'] = paddle.static.data(
|
|
feeds['label'] = paddle.static.data(
|
|
|
- name="label", shape=[None, 1], dtype="int64")
|
|
|
|
|
|
|
+ name="label", shape=[None, 1], dtype="int64"
|
|
|
|
|
+ )
|
|
|
|
|
|
|
|
return feeds
|
|
return feeds
|
|
|
|
|
|
|
@@ -70,7 +71,7 @@ def create_fetchs(out, feeds, class_num, label_smoothing=0, mode=Mode.TRAIN):
|
|
|
key (string): Name of variable to fetch.
|
|
key (string): Name of variable to fetch.
|
|
|
Value (tuple): (variable, AverageMeter).
|
|
Value (tuple): (variable, AverageMeter).
|
|
|
"""
|
|
"""
|
|
|
- fetchs = dict()
|
|
|
|
|
|
|
+ fetchs = {}
|
|
|
target = paddle.reshape(feeds['label'], [-1, 1])
|
|
target = paddle.reshape(feeds['label'], [-1, 1])
|
|
|
|
|
|
|
|
if mode == Mode.TRAIN:
|
|
if mode == Mode.TRAIN:
|
|
@@ -78,8 +79,7 @@ def create_fetchs(out, feeds, class_num, label_smoothing=0, mode=Mode.TRAIN):
|
|
|
loss = F.cross_entropy(out, target)
|
|
loss = F.cross_entropy(out, target)
|
|
|
else:
|
|
else:
|
|
|
label_one_hot = F.one_hot(target, class_num)
|
|
label_one_hot = F.one_hot(target, class_num)
|
|
|
- soft_target = F.label_smooth(
|
|
|
|
|
- label_one_hot, epsilon=label_smoothing)
|
|
|
|
|
|
|
+ soft_target = F.label_smooth(label_one_hot, epsilon=label_smoothing)
|
|
|
soft_target = paddle.reshape(soft_target, shape=[-1, class_num])
|
|
soft_target = paddle.reshape(soft_target, shape=[-1, class_num])
|
|
|
log_softmax = -F.log_softmax(out, axis=-1)
|
|
log_softmax = -F.log_softmax(out, axis=-1)
|
|
|
loss = paddle.sum(log_softmax * soft_target, axis=-1)
|
|
loss = paddle.sum(log_softmax * soft_target, axis=-1)
|
|
@@ -94,19 +94,23 @@ def create_fetchs(out, feeds, class_num, label_smoothing=0, mode=Mode.TRAIN):
|
|
|
|
|
|
|
|
acc_top1 = paddle.metric.accuracy(input=out, label=target, k=1)
|
|
acc_top1 = paddle.metric.accuracy(input=out, label=target, k=1)
|
|
|
acc_top5 = paddle.metric.accuracy(input=out, label=target, k=5)
|
|
acc_top5 = paddle.metric.accuracy(input=out, label=target, k=5)
|
|
|
- metric_dict = dict()
|
|
|
|
|
|
|
+ metric_dict = {}
|
|
|
metric_dict["top1"] = acc_top1
|
|
metric_dict["top1"] = acc_top1
|
|
|
metric_dict["top5"] = acc_top5
|
|
metric_dict["top5"] = acc_top5
|
|
|
|
|
|
|
|
for key in metric_dict:
|
|
for key in metric_dict:
|
|
|
if mode != Mode.TRAIN and paddle.distributed.get_world_size() > 1:
|
|
if mode != Mode.TRAIN and paddle.distributed.get_world_size() > 1:
|
|
|
paddle.distributed.all_reduce(
|
|
paddle.distributed.all_reduce(
|
|
|
- metric_dict[key], op=paddle.distributed.ReduceOp.SUM)
|
|
|
|
|
- metric_dict[key] = metric_dict[
|
|
|
|
|
- key] / paddle.distributed.get_world_size()
|
|
|
|
|
|
|
+ metric_dict[key], op=paddle.distributed.ReduceOp.SUM
|
|
|
|
|
+ )
|
|
|
|
|
+ metric_dict[key] = (
|
|
|
|
|
+ metric_dict[key] / paddle.distributed.get_world_size()
|
|
|
|
|
+ )
|
|
|
|
|
|
|
|
- fetchs[key] = (metric_dict[key], AverageMeter(
|
|
|
|
|
- key, '7.4f', need_avg=True))
|
|
|
|
|
|
|
+ fetchs[key] = (
|
|
|
|
|
+ metric_dict[key],
|
|
|
|
|
+ AverageMeter(key, '7.4f', need_avg=True),
|
|
|
|
|
+ )
|
|
|
|
|
|
|
|
return fetchs
|
|
return fetchs
|
|
|
|
|
|
|
@@ -127,13 +131,16 @@ def create_strategy(args, is_train=True):
|
|
|
exec_strategy = paddle.static.ExecutionStrategy()
|
|
exec_strategy = paddle.static.ExecutionStrategy()
|
|
|
|
|
|
|
|
exec_strategy.num_threads = 1
|
|
exec_strategy.num_threads = 1
|
|
|
- exec_strategy.num_iteration_per_drop_scope = (10000 if args.amp and
|
|
|
|
|
- args.use_pure_fp16 else 10)
|
|
|
|
|
-
|
|
|
|
|
- paddle.set_flags({
|
|
|
|
|
- 'FLAGS_cudnn_exhaustive_search': True,
|
|
|
|
|
- 'FLAGS_conv_workspace_size_limit': 4096
|
|
|
|
|
- })
|
|
|
|
|
|
|
+ exec_strategy.num_iteration_per_drop_scope = (
|
|
|
|
|
+ 10000 if args.amp and args.use_pure_fp16 else 10
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+ paddle.set_flags(
|
|
|
|
|
+ {
|
|
|
|
|
+ 'FLAGS_cudnn_exhaustive_search': True,
|
|
|
|
|
+ 'FLAGS_conv_workspace_size_limit': 4096,
|
|
|
|
|
+ }
|
|
|
|
|
+ )
|
|
|
|
|
|
|
|
if not is_train:
|
|
if not is_train:
|
|
|
build_strategy.fix_op_run_order = True
|
|
build_strategy.fix_op_run_order = True
|
|
@@ -177,7 +184,7 @@ def dist_optimizer(args, optimizer):
|
|
|
dist_strategy.amp_configs = {
|
|
dist_strategy.amp_configs = {
|
|
|
"init_loss_scaling": args.scale_loss,
|
|
"init_loss_scaling": args.scale_loss,
|
|
|
"use_dynamic_loss_scaling": args.use_dynamic_loss_scaling,
|
|
"use_dynamic_loss_scaling": args.use_dynamic_loss_scaling,
|
|
|
- "use_pure_fp16": args.use_pure_fp16
|
|
|
|
|
|
|
+ "use_pure_fp16": args.use_pure_fp16,
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
dist_strategy.asp = args.asp
|
|
dist_strategy.asp = args.asp
|
|
@@ -223,14 +230,16 @@ def build(args, main_prog, startup_prog, step_each_epoch, is_train=True):
|
|
|
input_image_channel=input_image_channel,
|
|
input_image_channel=input_image_channel,
|
|
|
data_format=data_format,
|
|
data_format=data_format,
|
|
|
use_pure_fp16=use_pure_fp16,
|
|
use_pure_fp16=use_pure_fp16,
|
|
|
- bn_weight_decay=bn_weight_decay)
|
|
|
|
|
|
|
+ bn_weight_decay=bn_weight_decay,
|
|
|
|
|
+ )
|
|
|
out = model(feeds["data"])
|
|
out = model(feeds["data"])
|
|
|
|
|
|
|
|
fetchs = create_fetchs(
|
|
fetchs = create_fetchs(
|
|
|
- out, feeds, class_num, args.label_smoothing, mode=mode)
|
|
|
|
|
|
|
+ out, feeds, class_num, args.label_smoothing, mode=mode
|
|
|
|
|
+ )
|
|
|
|
|
|
|
|
if args.asp:
|
|
if args.asp:
|
|
|
- sparsity.set_excluded_layers(main_prog, [model.fc.weight.name])
|
|
|
|
|
|
|
+ sparsity.set_excluded_layers(main_program=main_prog, param_names=[model.fc.weight.name])
|
|
|
|
|
|
|
|
lr_scheduler = None
|
|
lr_scheduler = None
|
|
|
optimizer = None
|
|
optimizer = None
|
|
@@ -244,10 +253,13 @@ def build(args, main_prog, startup_prog, step_each_epoch, is_train=True):
|
|
|
# This is a workaround to "Communicator of ring id 0 has not been initialized.".
|
|
# This is a workaround to "Communicator of ring id 0 has not been initialized.".
|
|
|
# Since Paddle's design, the initialization would be done inside train program,
|
|
# Since Paddle's design, the initialization would be done inside train program,
|
|
|
# eval_only need to manually call initialization.
|
|
# eval_only need to manually call initialization.
|
|
|
- if args.run_scope == RunScope.EVAL_ONLY and \
|
|
|
|
|
- paddle.distributed.get_world_size() > 1:
|
|
|
|
|
|
|
+ if (
|
|
|
|
|
+ args.run_scope == RunScope.EVAL_ONLY
|
|
|
|
|
+ and paddle.distributed.get_world_size() > 1
|
|
|
|
|
+ ):
|
|
|
collective_helper = CollectiveHelper(
|
|
collective_helper = CollectiveHelper(
|
|
|
- role_maker=fleet.PaddleCloudRoleMaker(is_collective=True))
|
|
|
|
|
|
|
+ role_maker=fleet.PaddleCloudRoleMaker(is_collective=True)
|
|
|
|
|
+ )
|
|
|
collective_helper.update_startup_program(startup_prog)
|
|
collective_helper.update_startup_program(startup_prog)
|
|
|
|
|
|
|
|
return fetchs, lr_scheduler, feeds, optimizer
|
|
return fetchs, lr_scheduler, feeds, optimizer
|
|
@@ -270,22 +282,22 @@ def compile_prog(args, program, loss_name=None, is_train=True):
|
|
|
build_strategy, exec_strategy = create_strategy(args, is_train)
|
|
build_strategy, exec_strategy = create_strategy(args, is_train)
|
|
|
|
|
|
|
|
compiled_program = paddle.static.CompiledProgram(
|
|
compiled_program = paddle.static.CompiledProgram(
|
|
|
- program).with_data_parallel(
|
|
|
|
|
- loss_name=loss_name,
|
|
|
|
|
- build_strategy=build_strategy,
|
|
|
|
|
- exec_strategy=exec_strategy)
|
|
|
|
|
|
|
+ program, build_strategy=build_strategy
|
|
|
|
|
+ )
|
|
|
|
|
|
|
|
return compiled_program
|
|
return compiled_program
|
|
|
|
|
|
|
|
|
|
|
|
|
-def run(args,
|
|
|
|
|
- dataloader,
|
|
|
|
|
- exe,
|
|
|
|
|
- program,
|
|
|
|
|
- fetchs,
|
|
|
|
|
- epoch,
|
|
|
|
|
- mode=Mode.TRAIN,
|
|
|
|
|
- lr_scheduler=None):
|
|
|
|
|
|
|
+def run(
|
|
|
|
|
+ args,
|
|
|
|
|
+ dataloader,
|
|
|
|
|
+ exe,
|
|
|
|
|
+ program,
|
|
|
|
|
+ fetchs,
|
|
|
|
|
+ epoch,
|
|
|
|
|
+ mode=Mode.TRAIN,
|
|
|
|
|
+ lr_scheduler=None,
|
|
|
|
|
+):
|
|
|
"""
|
|
"""
|
|
|
Execute program.
|
|
Execute program.
|
|
|
|
|
|
|
@@ -312,11 +324,11 @@ def run(args,
|
|
|
if fetchs[k][1] is not None:
|
|
if fetchs[k][1] is not None:
|
|
|
metric_dict[k] = fetchs[k][1]
|
|
metric_dict[k] = fetchs[k][1]
|
|
|
|
|
|
|
|
- metric_dict["batch_time"] = AverageMeter(
|
|
|
|
|
- 'batch_time', '.5f', postfix=" s,")
|
|
|
|
|
|
|
+ metric_dict["batch_time"] = AverageMeter('batch_time', '.5f', postfix=" s,")
|
|
|
metric_dict["data_time"] = AverageMeter('data_time', '.5f', postfix=" s,")
|
|
metric_dict["data_time"] = AverageMeter('data_time', '.5f', postfix=" s,")
|
|
|
metric_dict["compute_time"] = AverageMeter(
|
|
metric_dict["compute_time"] = AverageMeter(
|
|
|
- 'compute_time', '.5f', postfix=" s,")
|
|
|
|
|
|
|
+ 'compute_time', '.5f', postfix=" s,"
|
|
|
|
|
+ )
|
|
|
|
|
|
|
|
for m in metric_dict.values():
|
|
for m in metric_dict.values():
|
|
|
m.reset()
|
|
m.reset()
|
|
@@ -328,8 +340,7 @@ def run(args,
|
|
|
batch_size = None
|
|
batch_size = None
|
|
|
latency = []
|
|
latency = []
|
|
|
|
|
|
|
|
- total_benchmark_steps = \
|
|
|
|
|
- args.benchmark_steps + args.benchmark_warmup_steps
|
|
|
|
|
|
|
+ total_benchmark_steps = args.benchmark_steps + args.benchmark_warmup_steps
|
|
|
|
|
|
|
|
dataloader.reset()
|
|
dataloader.reset()
|
|
|
while True:
|
|
while True:
|
|
@@ -361,11 +372,12 @@ def run(args,
|
|
|
batch_size = batch[0]["data"].shape()[0]
|
|
batch_size = batch[0]["data"].shape()[0]
|
|
|
feed_dict = batch[0]
|
|
feed_dict = batch[0]
|
|
|
|
|
|
|
|
- with profiler.profile_tag(idx, "Training"
|
|
|
|
|
- if mode == Mode.TRAIN else "Evaluation"):
|
|
|
|
|
- results = exe.run(program=program,
|
|
|
|
|
- feed=feed_dict,
|
|
|
|
|
- fetch_list=fetch_list)
|
|
|
|
|
|
|
+ with profiler.profile_tag(
|
|
|
|
|
+ idx, "Training" if mode == Mode.TRAIN else "Evaluation"
|
|
|
|
|
+ ):
|
|
|
|
|
+ results = exe.run(
|
|
|
|
|
+ program=program, feed=feed_dict, fetch_list=fetch_list
|
|
|
|
|
+ )
|
|
|
|
|
|
|
|
for name, m in zip(fetchs.keys(), results):
|
|
for name, m in zip(fetchs.keys(), results):
|
|
|
if name in metric_dict:
|
|
if name in metric_dict:
|
|
@@ -382,15 +394,16 @@ def run(args,
|
|
|
tic = time.perf_counter()
|
|
tic = time.perf_counter()
|
|
|
|
|
|
|
|
if idx % args.print_interval == 0:
|
|
if idx % args.print_interval == 0:
|
|
|
- log_msg = dict()
|
|
|
|
|
|
|
+ log_msg = {}
|
|
|
log_msg['loss'] = metric_dict['loss'].val.item()
|
|
log_msg['loss'] = metric_dict['loss'].val.item()
|
|
|
log_msg['top1'] = metric_dict['top1'].val.item()
|
|
log_msg['top1'] = metric_dict['top1'].val.item()
|
|
|
log_msg['top5'] = metric_dict['top5'].val.item()
|
|
log_msg['top5'] = metric_dict['top5'].val.item()
|
|
|
log_msg['data_time'] = metric_dict['data_time'].val
|
|
log_msg['data_time'] = metric_dict['data_time'].val
|
|
|
log_msg['compute_time'] = metric_dict['compute_time'].val
|
|
log_msg['compute_time'] = metric_dict['compute_time'].val
|
|
|
log_msg['batch_time'] = metric_dict['batch_time'].val
|
|
log_msg['batch_time'] = metric_dict['batch_time'].val
|
|
|
- log_msg['ips'] = \
|
|
|
|
|
|
|
+ log_msg['ips'] = (
|
|
|
batch_size * num_trainers / metric_dict['batch_time'].val
|
|
batch_size * num_trainers / metric_dict['batch_time'].val
|
|
|
|
|
+ )
|
|
|
if mode == Mode.TRAIN:
|
|
if mode == Mode.TRAIN:
|
|
|
log_msg['lr'] = metric_dict['lr'].val
|
|
log_msg['lr'] = metric_dict['lr'].val
|
|
|
log_info((epoch, idx), log_msg, mode)
|
|
log_info((epoch, idx), log_msg, mode)
|
|
@@ -406,10 +419,10 @@ def run(args,
|
|
|
logging.info("Begin benchmark at step %d", idx + 1)
|
|
logging.info("Begin benchmark at step %d", idx + 1)
|
|
|
|
|
|
|
|
if idx == total_benchmark_steps:
|
|
if idx == total_benchmark_steps:
|
|
|
- benchmark_data = dict()
|
|
|
|
|
- benchmark_data[
|
|
|
|
|
- 'ips'] = batch_size * num_trainers / metric_dict[
|
|
|
|
|
- 'batch_time'].avg
|
|
|
|
|
|
|
+ benchmark_data = {}
|
|
|
|
|
+ benchmark_data['ips'] = (
|
|
|
|
|
+ batch_size * num_trainers / metric_dict['batch_time'].avg
|
|
|
|
|
+ )
|
|
|
if mode == mode.EVAL:
|
|
if mode == mode.EVAL:
|
|
|
latency = np.array(latency) * 1000
|
|
latency = np.array(latency) * 1000
|
|
|
quantile = np.quantile(latency, [0.9, 0.95, 0.99])
|
|
quantile = np.quantile(latency, [0.9, 0.95, 0.99])
|
|
@@ -422,15 +435,19 @@ def run(args,
|
|
|
logging.info("End benchmark at epoch step %d", idx)
|
|
logging.info("End benchmark at epoch step %d", idx)
|
|
|
return benchmark_data
|
|
return benchmark_data
|
|
|
|
|
|
|
|
- epoch_data = dict()
|
|
|
|
|
|
|
+ epoch_data = {}
|
|
|
epoch_data['loss'] = metric_dict['loss'].avg.item()
|
|
epoch_data['loss'] = metric_dict['loss'].avg.item()
|
|
|
epoch_data['epoch_time'] = metric_dict['batch_time'].total
|
|
epoch_data['epoch_time'] = metric_dict['batch_time'].total
|
|
|
- epoch_data['ips'] = batch_size * num_trainers * \
|
|
|
|
|
- metric_dict["batch_time"].count / metric_dict["batch_time"].sum
|
|
|
|
|
|
|
+ epoch_data['ips'] = (
|
|
|
|
|
+ batch_size
|
|
|
|
|
+ * num_trainers
|
|
|
|
|
+ * metric_dict["batch_time"].count
|
|
|
|
|
+ / metric_dict["batch_time"].sum
|
|
|
|
|
+ )
|
|
|
if mode == Mode.EVAL:
|
|
if mode == Mode.EVAL:
|
|
|
epoch_data['top1'] = metric_dict['top1'].avg.item()
|
|
epoch_data['top1'] = metric_dict['top1'].avg.item()
|
|
|
epoch_data['top5'] = metric_dict['top5'].avg.item()
|
|
epoch_data['top5'] = metric_dict['top5'].avg.item()
|
|
|
- log_info((epoch, ), epoch_data, mode)
|
|
|
|
|
|
|
+ log_info((epoch,), epoch_data, mode)
|
|
|
|
|
|
|
|
return epoch_data
|
|
return epoch_data
|
|
|
|
|
|
|
@@ -445,7 +462,7 @@ def log_info(step, metrics, mode):
|
|
|
mode(utils.Mode): Train or eval mode.
|
|
mode(utils.Mode): Train or eval mode.
|
|
|
"""
|
|
"""
|
|
|
prefix = 'train' if mode == Mode.TRAIN else 'val'
|
|
prefix = 'train' if mode == Mode.TRAIN else 'val'
|
|
|
- dllogger_iter_data = dict()
|
|
|
|
|
|
|
+ dllogger_iter_data = {}
|
|
|
for key in metrics:
|
|
for key in metrics:
|
|
|
dllogger_iter_data[f"{prefix}.{key}"] = metrics[key]
|
|
dllogger_iter_data[f"{prefix}.{key}"] = metrics[key]
|
|
|
dllogger.log(step=step, data=dllogger_iter_data)
|
|
dllogger.log(step=step, data=dllogger_iter_data)
|