Просмотр исходного кода

Merge pull request #676 from NVIDIA/gh/release

[DLRM/PyT] Triton updates
nv-kkudrynski 5 лет назад
Родитель
Commit
323005c443

+ 17 - 10
PyTorch/Recommendation/DLRM/dlrm/data/datasets.py

@@ -144,7 +144,8 @@ class SplitCriteoDataset(Dataset):
         numerical_features: bool = False,
         numerical_features: bool = False,
         categorical_features: Optional[Sequence[int]] = None,
         categorical_features: Optional[Sequence[int]] = None,
         categorical_feature_sizes: Optional[Sequence[int]] = None,
         categorical_feature_sizes: Optional[Sequence[int]] = None,
-        prefetch_depth: int = 10
+        prefetch_depth: int = 10,
+        drop_last_batch: bool = False,
     ):
     ):
         self._label_bytes_per_batch = np.dtype(np.bool).itemsize * batch_size
         self._label_bytes_per_batch = np.dtype(np.bool).itemsize * batch_size
         self._numerical_bytes_per_batch = 13 * np.dtype(np.float16).itemsize * batch_size if numerical_features else 0
         self._numerical_bytes_per_batch = 13 * np.dtype(np.float16).itemsize * batch_size if numerical_features else 0
@@ -156,25 +157,31 @@ class SplitCriteoDataset(Dataset):
         ]
         ]
         self._categorical_features = categorical_features
         self._categorical_features = categorical_features
         self._batch_size = batch_size
         self._batch_size = batch_size
-        self._label_file = os.open(os.path.join(data_path, F"label.bin"), os.O_RDONLY)
-        self._num_entries = int(math.ceil(os.fstat(self._label_file).st_size / self._label_bytes_per_batch))
+        self._label_file = os.open(os.path.join(data_path, f"label.bin"), os.O_RDONLY)
+        self._num_entries = int(math.ceil(os.fstat(self._label_file).st_size
+                                          / self._label_bytes_per_batch)) if not drop_last_batch \
+                            else int(math.floor(os.fstat(self._label_file).st_size / self._label_bytes_per_batch))
 
 
         if numerical_features:
         if numerical_features:
             self._numerical_features_file = os.open(os.path.join(data_path, "numerical.bin"), os.O_RDONLY)
             self._numerical_features_file = os.open(os.path.join(data_path, "numerical.bin"), os.O_RDONLY)
-            if math.ceil(os.fstat(self._numerical_features_file).st_size /
-                         self._numerical_bytes_per_batch) != self._num_entries:
-                raise ValueError("Size miss match in data files")
+            number_of_numerical_batches = math.ceil(os.fstat(self._numerical_features_file).st_size
+                                                    / self._numerical_bytes_per_batch) if not drop_last_batch \
+                                          else math.floor(os.fstat(self._numerical_features_file).st_size
+                                                          / self._numerical_bytes_per_batch)
+            if number_of_numerical_batches != self._num_entries:
+                raise ValueError("Size mismatch in data files")
         else:
         else:
             self._numerical_features_file = None
             self._numerical_features_file = None
 
 
         if categorical_features:
         if categorical_features:
             self._categorical_features_files = []
             self._categorical_features_files = []
             for cat_id in categorical_features:
             for cat_id in categorical_features:
-                cat_file = os.open(os.path.join(data_path, F"cat_{cat_id}.bin"), os.O_RDONLY)
+                cat_file = os.open(os.path.join(data_path, f"cat_{cat_id}.bin"), os.O_RDONLY)
                 cat_bytes = self._categorical_bytes_per_batch[cat_id]
                 cat_bytes = self._categorical_bytes_per_batch[cat_id]
-                if math.ceil(
-                        os.fstat(cat_file).st_size / cat_bytes) != self._num_entries:
-                    raise ValueError("Size miss match in data files")
+                number_of_categorical_batches = math.ceil(os.fstat(cat_file).st_size / cat_bytes) if not drop_last_batch \
+                                                else math.floor(os.fstat(cat_file).st_size / cat_bytes)
+                if number_of_categorical_batches != self._num_entries:
+                    raise ValueError("Size mismatch in data files")
                 self._categorical_features_files.append(cat_file)
                 self._categorical_features_files.append(cat_file)
         else:
         else:
             self._categorical_features_files = None
             self._categorical_features_files = None

+ 4 - 4
PyTorch/Recommendation/DLRM/dlrm/nn/embeddings.py

@@ -131,12 +131,12 @@ class JointEmbedding(Embeddings):
         if self.hash_indices:
         if self.hash_indices:
             for cat, size in enumerate(self._categorical_feature_sizes):
             for cat, size in enumerate(self._categorical_feature_sizes):
                 categorical_inputs[:, cat] %= size
                 categorical_inputs[:, cat] %= size
-                logging.log_first_n(logging.WARNING, F"Hashed indices out of range.", 1)
+                logging.log_first_n(logging.WARNING, f"Hashed indices out of range.", 1)
 
 
         return [self.embedding(categorical_inputs + self.offsets[:-1])]
         return [self.embedding(categorical_inputs + self.offsets[:-1])]
 
 
     def extra_repr(self):
     def extra_repr(self):
-        s = F"offsets={self.offsets.cpu().numpy()}"
+        s = f"offsets={self.offsets.cpu().numpy()}"
         return s
         return s
     # pylint:enable=missing-docstring
     # pylint:enable=missing-docstring
 
 
@@ -189,7 +189,7 @@ class FusedJointEmbedding(Embeddings):
         if self.hash_indices:
         if self.hash_indices:
             for cat, size in enumerate(self._categorical_feature_sizes):
             for cat, size in enumerate(self._categorical_feature_sizes):
                 categorical_inputs[:, cat] %= size
                 categorical_inputs[:, cat] %= size
-                logging.log_first_n(logging.WARNING, F"Hashed indices out of range.", 1)
+                logging.log_first_n(logging.WARNING, f"Hashed indices out of range.", 1)
 
 
         return [BuckleEmbeddingFusedGatherFunction.apply(self.weight, categorical_inputs, self.offsets, self.amp_train)]
         return [BuckleEmbeddingFusedGatherFunction.apply(self.weight, categorical_inputs, self.offsets, self.amp_train)]
 
 
@@ -228,7 +228,7 @@ class JointSparseEmbedding(Embeddings):
         if self.hash_indices:
         if self.hash_indices:
             for cat, size in enumerate(self._categorical_feature_sizes):
             for cat, size in enumerate(self._categorical_feature_sizes):
                 categorical_inputs[:, cat] %= size
                 categorical_inputs[:, cat] %= size
-                logging.log_first_n(logging.WARNING, F"Hashed indices out of range.", 1)
+                logging.log_first_n(logging.WARNING, f"Hashed indices out of range.", 1)
 
 
         return [
         return [
             self.embedding(categorical_inputs)
             self.embedding(categorical_inputs)

+ 11 - 11
PyTorch/Recommendation/DLRM/dlrm/scripts/dist_main.py

@@ -152,7 +152,7 @@ def main(argv):
         dllogger.log(data=results, step=tuple())
         dllogger.log(data=results, step=tuple())
 
 
         if auc is not None:
         if auc is not None:
-            print(F"Finished testing. Test auc {auc:.4f}")
+            print(f"Finished testing. Test auc {auc:.4f}")
         return
         return
 
 
     if FLAGS.save_checkpoint_path and not FLAGS.bottom_features_ordered and is_main_process():
     if FLAGS.save_checkpoint_path and not FLAGS.bottom_features_ordered and is_main_process():
@@ -209,7 +209,7 @@ def main(argv):
             global_step = steps_per_epoch * epoch + step
             global_step = steps_per_epoch * epoch + step
 
 
             if FLAGS.max_steps and global_step > FLAGS.max_steps:
             if FLAGS.max_steps and global_step > FLAGS.max_steps:
-                print(F"Reached max global steps of {FLAGS.max_steps}. Stopping.")
+                print(f"Reached max global steps of {FLAGS.max_steps}. Stopping.")
                 break
                 break
 
 
             lr_scheduler.step()
             lr_scheduler.step()
@@ -245,7 +245,7 @@ def main(argv):
                 continue
                 continue
 
 
             if step == 0:
             if step == 0:
-                print(F"Started epoch {epoch}...")
+                print(f"Started epoch {epoch}...")
             elif step % print_freq == 0:
             elif step % print_freq == 0:
                 torch.cuda.current_stream().wait_stream(moving_loss_stream)
                 torch.cuda.current_stream().wait_stream(moving_loss_stream)
                 # Averaging cross a print_freq period to reduce the error.
                 # Averaging cross a print_freq period to reduce the error.
@@ -264,7 +264,7 @@ def main(argv):
 
 
                 eta_str = datetime.timedelta(seconds=int(metric_logger.step_time.global_avg * (steps_per_epoch - step)))
                 eta_str = datetime.timedelta(seconds=int(metric_logger.step_time.global_avg * (steps_per_epoch - step)))
                 metric_logger.print(
                 metric_logger.print(
-                    header=F"Epoch:[{epoch}/{FLAGS.epochs}] [{step}/{steps_per_epoch}]  eta: {eta_str}")
+                    header=f"Epoch:[{epoch}/{FLAGS.epochs}] [{step}/{steps_per_epoch}]  eta: {eta_str}")
 
 
                 with torch.cuda.stream(moving_loss_stream):
                 with torch.cuda.stream(moving_loss_stream):
                     moving_loss = 0.
                     moving_loss = 0.
@@ -275,7 +275,7 @@ def main(argv):
                 if auc is None:
                 if auc is None:
                     continue
                     continue
 
 
-                print(F"Epoch {epoch} step {step}. auc {auc:.6f}")
+                print(f"Epoch {epoch} step {step}. auc {auc:.6f}")
                 stop_time = time()
                 stop_time = time()
 
 
                 if auc > best_auc:
                 if auc > best_auc:
@@ -284,15 +284,15 @@ def main(argv):
 
 
                 if FLAGS.auc_threshold and auc >= FLAGS.auc_threshold:
                 if FLAGS.auc_threshold and auc >= FLAGS.auc_threshold:
                     run_time_s = int(stop_time - start_time)
                     run_time_s = int(stop_time - start_time)
-                    print(F"Hit target accuracy AUC {FLAGS.auc_threshold} at epoch "
-                          F"{global_step/steps_per_epoch:.2f} in {run_time_s}s. "
-                          F"Average speed {global_step * FLAGS.batch_size / run_time_s:.1f} records/s.")
+                    print(f"Hit target accuracy AUC {FLAGS.auc_threshold} at epoch "
+                          f"{global_step/steps_per_epoch:.2f} in {run_time_s}s. "
+                          f"Average speed {global_step * FLAGS.batch_size / run_time_s:.1f} records/s.")
                     sys.exit()
                     sys.exit()
 
 
         epoch_stop_time = time()
         epoch_stop_time = time()
         epoch_time_s = epoch_stop_time - epoch_start_time
         epoch_time_s = epoch_stop_time - epoch_start_time
-        print(F"Finished epoch {epoch} in {datetime.timedelta(seconds=int(epoch_time_s))}. "
-              F"Average speed {steps_per_epoch * FLAGS.batch_size / epoch_time_s:.1f} records/s.")
+        print(f"Finished epoch {epoch} in {datetime.timedelta(seconds=int(epoch_time_s))}. "
+              f"Average speed {steps_per_epoch * FLAGS.batch_size / epoch_time_s:.1f} records/s.")
 
 
     avg_throughput = FLAGS.batch_size / metric_logger.step_time.avg
     avg_throughput = FLAGS.batch_size / metric_logger.step_time.avg
 
 
@@ -383,7 +383,7 @@ def dist_evaluate(model, data_loader):
             if timer.measured is not None:
             if timer.measured is not None:
                 metric_logger.update(step_time=timer.measured)
                 metric_logger.update(step_time=timer.measured)
                 if step % print_freq == 0 and step > 0:
                 if step % print_freq == 0 and step > 0:
-                    metric_logger.print(header=F"Test: [{step}/{steps_per_epoch}]")
+                    metric_logger.print(header=f"Test: [{step}/{steps_per_epoch}]")
 
 
         if is_main_process():
         if is_main_process():
             auc = utils.roc_auc_score(torch.cat(y_true), torch.sigmoid(torch.cat(y_score).float()))
             auc = utils.roc_auc_score(torch.cat(y_true), torch.sigmoid(torch.cat(y_score).float()))

+ 14 - 14
PyTorch/Recommendation/DLRM/dlrm/scripts/main.py

@@ -204,7 +204,7 @@ def main(argv):
                    'average_test_throughput': avg_test_throughput}
                    'average_test_throughput': avg_test_throughput}
         dllogger.log(data=results, step=tuple())
         dllogger.log(data=results, step=tuple())
 
 
-        print(F"Finished testing. Test Loss {loss:.4f}, auc {auc:.4f}")
+        print(f"Finished testing. Test Loss {loss:.4f}, auc {auc:.4f}")
         return
         return
 
 
     if FLAGS.mode == 'inference_benchmark':
     if FLAGS.mode == 'inference_benchmark':
@@ -227,12 +227,12 @@ def main(argv):
 
 
             mean_latency = np.mean(latencies)
             mean_latency = np.mean(latencies)
             mean_inference_throughput = batch_size / mean_latency
             mean_inference_throughput = batch_size / mean_latency
-            subresult = {F'mean_inference_latency_batch_{batch_size}': mean_latency,
-                         F'mean_inference_throughput_batch_{batch_size}': mean_inference_throughput}
+            subresult = {f'mean_inference_latency_batch_{batch_size}': mean_latency,
+                         f'mean_inference_throughput_batch_{batch_size}': mean_inference_throughput}
             results.update(subresult)
             results.update(subresult)
         dllogger.log(data=results, step=tuple())
         dllogger.log(data=results, step=tuple())
 
 
-        print(F"Finished inference benchmark.")
+        print(f"Finished inference benchmark.")
         return
         return
 
 
     if FLAGS.mode == 'train':
     if FLAGS.mode == 'train':
@@ -305,7 +305,7 @@ def train(model, loss_fn, optimizer, data_loader_train, data_loader_test, scaled
                           decay_steps=FLAGS.decay_steps, decay_start_step=FLAGS.decay_start_step)
                           decay_steps=FLAGS.decay_steps, decay_start_step=FLAGS.decay_start_step)
 
 
             if FLAGS.max_steps and global_step > FLAGS.max_steps:
             if FLAGS.max_steps and global_step > FLAGS.max_steps:
-                print(F"Reached max global steps of {FLAGS.max_steps}. Stopping.")
+                print(f"Reached max global steps of {FLAGS.max_steps}. Stopping.")
                 break
                 break
 
 
             if prefetching_enabled:
             if prefetching_enabled:
@@ -346,17 +346,17 @@ def train(model, loss_fn, optimizer, data_loader_train, data_loader_test, scaled
                     )
                     )
 
 
                 if global_step < FLAGS.benchmark_warmup_steps:
                 if global_step < FLAGS.benchmark_warmup_steps:
-                    print(F'Warming up, step [{global_step}/{FLAGS.benchmark_warmup_steps}]')
+                    print(f'Warming up, step [{global_step}/{FLAGS.benchmark_warmup_steps}]')
                     continue
                     continue
 
 
                 eta_str = datetime.timedelta(seconds=int(metric_logger.step_time.global_avg * (steps_per_epoch - step)))
                 eta_str = datetime.timedelta(seconds=int(metric_logger.step_time.global_avg * (steps_per_epoch - step)))
                 metric_logger.print(
                 metric_logger.print(
-                    header=F"Epoch:[{epoch}/{FLAGS.epochs}] [{step}/{steps_per_epoch}]  eta: {eta_str}")
+                    header=f"Epoch:[{epoch}/{FLAGS.epochs}] [{step}/{steps_per_epoch}]  eta: {eta_str}")
 
 
             if (global_step % test_freq == 0 and global_step > 0 and
             if (global_step % test_freq == 0 and global_step > 0 and
                     global_step / steps_per_epoch >= FLAGS.test_after):
                     global_step / steps_per_epoch >= FLAGS.test_after):
                 loss, auc, test_step_time = evaluate(model, loss_fn, data_loader_test)
                 loss, auc, test_step_time = evaluate(model, loss_fn, data_loader_test)
-                print(F"Epoch {epoch} step {step}. Test loss {loss:.5f}, auc {auc:.6f}")
+                print(f"Epoch {epoch} step {step}. Test loss {loss:.5f}, auc {auc:.6f}")
 
 
                 if auc > best_auc:
                 if auc > best_auc:
                     best_auc = auc
                     best_auc = auc
@@ -366,16 +366,16 @@ def train(model, loss_fn, optimizer, data_loader_train, data_loader_test, scaled
                 if FLAGS.auc_threshold and auc >= FLAGS.auc_threshold:
                 if FLAGS.auc_threshold and auc >= FLAGS.auc_threshold:
                     stop_time = time()
                     stop_time = time()
                     run_time_s = int(stop_time - start_time)
                     run_time_s = int(stop_time - start_time)
-                    print(F"Hit target accuracy AUC {FLAGS.auc_threshold} at epoch "
-                          F"{global_step/steps_per_epoch:.2f} in {run_time_s}s. "
-                          F"Average speed {global_step * FLAGS.batch_size / run_time_s:.1f} records/s.")
+                    print(f"Hit target accuracy AUC {FLAGS.auc_threshold} at epoch "
+                          f"{global_step/steps_per_epoch:.2f} in {run_time_s}s. "
+                          f"Average speed {global_step * FLAGS.batch_size / run_time_s:.1f} records/s.")
                     return
                     return
 
 
     stop_time = time()
     stop_time = time()
     run_time_s = int(stop_time - start_time)
     run_time_s = int(stop_time - start_time)
 
 
-    print(F"Finished training in {run_time_s}s. "
-          F"Average speed {global_step * FLAGS.batch_size / run_time_s:.1f} records/s.")
+    print(f"Finished training in {run_time_s}s. "
+          f"Average speed {global_step * FLAGS.batch_size / run_time_s:.1f} records/s.")
 
 
     avg_throughput = FLAGS.batch_size / metric_logger.step_time.avg
     avg_throughput = FLAGS.batch_size / metric_logger.step_time.avg
 
 
@@ -441,7 +441,7 @@ def evaluate(model, loss_fn, data_loader):
             if timer.measured is not None:
             if timer.measured is not None:
                 metric_logger.update(loss=loss_value, step_time=timer.measured)
                 metric_logger.update(loss=loss_value, step_time=timer.measured)
                 if step % print_freq == 0 and step > 0:
                 if step % print_freq == 0 and step > 0:
-                    metric_logger.print(header=F"Test: [{step}/{steps_per_epoch}]")
+                    metric_logger.print(header=f"Test: [{step}/{steps_per_epoch}]")
 
 
         y_true = torch.cat(y_true)
         y_true = torch.cat(y_true)
         y_score = torch.cat(y_score)
         y_score = torch.cat(y_score)

+ 3 - 3
PyTorch/Recommendation/DLRM/dlrm/scripts/utils.py

@@ -127,7 +127,7 @@ class MetricLogger(object):
             header = ''
             header = ''
         print_str = header
         print_str = header
         for name, meter in self.meters.items():
         for name, meter in self.meters.items():
-            print_str += F"  {name}: {meter}"
+            print_str += f"  {name}: {meter}"
         print(print_str)
         print(print_str)
 
 
 
 
@@ -282,13 +282,13 @@ def roc_auc_score(y_true, y_score):
     y_true.squeeze_()
     y_true.squeeze_()
     y_score.squeeze_()
     y_score.squeeze_()
     if y_true.shape != y_score.shape:
     if y_true.shape != y_score.shape:
-        raise TypeError(F"Shape of y_true and y_score must match. Got {y_true.shape()} and {y_score.shape()}.")
+        raise TypeError(f"Shape of y_true and y_score must match. Got {y_true.shape()} and {y_score.shape()}.")
 
 
     desc_score_indices = torch.argsort(y_score, descending=True)
     desc_score_indices = torch.argsort(y_score, descending=True)
     y_score = y_score[desc_score_indices]
     y_score = y_score[desc_score_indices]
     y_true = y_true[desc_score_indices]
     y_true = y_true[desc_score_indices]
 
 
-    distinct_value_indices = torch.nonzero(y_score[1:] - y_score[:-1]).squeeze()
+    distinct_value_indices = torch.nonzero(y_score[1:] - y_score[:-1], as_tuple=False).squeeze()
     threshold_idxs = torch.cat([distinct_value_indices, torch.tensor([y_true.numel() - 1], device=device)])
     threshold_idxs = torch.cat([distinct_value_indices, torch.tensor([y_true.numel() - 1], device=device)])
 
 
     tps = torch.cumsum(y_true, dim=0)[threshold_idxs]
     tps = torch.cumsum(y_true, dim=0)[threshold_idxs]

+ 1 - 1
PyTorch/Recommendation/DLRM/preproc/split_dataset.py

@@ -63,7 +63,7 @@ def split_binary_file(
 
 
         categorical_fs = []
         categorical_fs = []
         for i in range(len(categorical_feature_sizes)):
         for i in range(len(categorical_feature_sizes)):
-            fs = open(os.path.join(output_dir, F'cat_{i}.bin'), 'wb+')
+            fs = open(os.path.join(output_dir, f'cat_{i}.bin'), 'wb+')
             categorical_fs.append(fs)
             categorical_fs.append(fs)
             file_streams.append(fs)
             file_streams.append(fs)
 
 

+ 41 - 24
PyTorch/Recommendation/DLRM/triton/README.md

@@ -1,22 +1,33 @@
 # Deploying the DLRM model using Triton Inference Server
 # Deploying the DLRM model using Triton Inference Server
 
 
-The [NVIDIA Triton Inference Server](https://github.com/NVIDIA/triton-inference-server) provides a datacenter and cloud inferencing solution optimized for NVIDIA GPUs. The server provides an inference service via an HTTP or gRPC endpoint, allowing remote clients to request inferencing for any number of GPU or CPU models being managed by the server.
-
 This folder contains instructions for deploment and exemplary client application to run inference on
 This folder contains instructions for deploment and exemplary client application to run inference on
 Triton Inference Server as well as detailed performance analysis.
 Triton Inference Server as well as detailed performance analysis.
 
 
 ## Table Of Contents
 ## Table Of Contents
 
 
-- [Running Triton Inference Server and client](#running-triton-inference-server-and-client)
-- [Latency vs Throughput](#throughputlatency-results)
-- [Dynamic batching support](#dynamic-batching-support)
+  * [Solution Overview](#solution-overview)
+  * [Quick Start Guide](#quick-start-guide)
+     * [Running Triton Inference Server and client](#running-triton-inference-server-and-client)
+  * [Performance](#performance)
+     * [Latency vs Throughput](#throughputlatency-results)
+  * [Advanced](#advanced)
+     * [Dynamic batching support](#dynamic-batching-support)
+
+## Solution Overview
+
+The [NVIDIA Triton Inference Server](https://github.com/NVIDIA/triton-inference-server) provides a datacenter and cloud inferencing solution optimized for NVIDIA GPUs. The server provides an inference service via an HTTP or gRPC endpoint, allowing remote clients to request inferencing for any number of GPU or CPU models being managed by the server.
+
+## Quick Start Guide
 
 
-## Running Triton Inference Server and client
+### Running Triton Inference Server and client
 
 
 The very first step of deployment is to acquire trained checkpoint and model configuration for this
 The very first step of deployment is to acquire trained checkpoint and model configuration for this
 checkpoint. Default model configuration are stored inside `dlrm/config` directory.
 checkpoint. Default model configuration are stored inside `dlrm/config` directory.
 
 
-### Inference container
+**Currently, our implementation only supports TorchScript deployment for models that fit into the memory of a single GPU.**
+You can read more about training DLRM models on different dataset configurations based on frequency threshold in the preprocessing step in [README](https://gitlab-master.nvidia.com/dl/JoC/dlrm_pyt#preprocess-with-spark).
+
+#### Inference container
 
 
 Every command below is called from special inference container. To build that container go to main
 Every command below is called from special inference container. To build that container go to main
 repository folder and call
 repository folder and call
@@ -26,12 +37,12 @@ repository folder and call
 This command will download dependencies and build inference container. Then run shell inside the
 This command will download dependencies and build inference container. Then run shell inside the
 container:
 container:
 
 
-`docker run -it --rm --gpus device=0 --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 --net=host -v <PATH_TO_MODEL_REPOSITORY>:/repository dlrm-inference bash`
+`docker run -it --rm --gpus device=0 --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 --net=host -v <PATH_TO_MODEL_REPOSITORY>:/repository -v <PATH_TO_MODEL_CHECKPOINT>:/results/checkpoint -v <PATH_TO_DATASET>:/data dlrm-inference bash`
 
 
 Here `--gpus '"device=0,1,2,3"'` selects GPUs indexed by ordinals `0,1,2` and `3`, respectively. The server will see only these GPUs. If you write `device=all`, then the server will see all the available GPUs. `PATH_TO_MODEL_REPOSITORY` indicates location where
 Here `--gpus '"device=0,1,2,3"'` selects GPUs indexed by ordinals `0,1,2` and `3`, respectively. The server will see only these GPUs. If you write `device=all`, then the server will see all the available GPUs. `PATH_TO_MODEL_REPOSITORY` indicates location where
 deployed models were stored.
 deployed models were stored.
 
 
-### Deploying the model
+#### Deploying the model
 
 
 To deploy model into Triton compatible format, `deployer.py` script can by used. This script is
 To deploy model into Triton compatible format, `deployer.py` script can by used. This script is
 meant to be run from inside deployment docker container.
 meant to be run from inside deployment docker container.
@@ -104,15 +115,15 @@ Following model specific arguments have to be specified for model deployment:
   --cpu                 Export cpu model instead of gpu.
   --cpu                 Export cpu model instead of gpu.
 ```
 ```
 
 
-For example, to deploy model into onnx format, using half precision and max batch size 4096 called
-`dlrm-onnx-16` execute:
+For example, to deploy model into TorchScript format, using half precision and max batch size 4096 called
+`dlrm-ts-trace-16` execute:
 
 
 `python -m triton.deployer --ts-trace --triton-model-name dlrm-ts-trace-16 --triton-max-batch-size 4096 --save-dir /repository -- --model_checkpoint /results/checkpoint --fp16 --batch_size 4096 --num_numerical_features 13 --embedding_dim 128 --top_mlp_sizes 1024 1024 512 256 1 --bottom_mlp_sizes 512 256 128 --interaction_op dot --dataset /data`
 `python -m triton.deployer --ts-trace --triton-model-name dlrm-ts-trace-16 --triton-max-batch-size 4096 --save-dir /repository -- --model_checkpoint /results/checkpoint --fp16 --batch_size 4096 --num_numerical_features 13 --embedding_dim 128 --top_mlp_sizes 1024 1024 512 256 1 --bottom_mlp_sizes 512 256 128 --interaction_op dot --dataset /data`
 
 
 Where `model_checkpoint` is a checkpoint for a trained model with the same configuration as used during export and dataset (or at least dataset configuration)
 Where `model_checkpoint` is a checkpoint for a trained model with the same configuration as used during export and dataset (or at least dataset configuration)
 is mounted under `/data`
 is mounted under `/data`
 
 
-### Running the Triton server
+#### Running the Triton server
 **NOTE: This step is executed outside inference container**
 **NOTE: This step is executed outside inference container**
 
 
 1. `docker pull nvcr.io/nvidia/tritonserver:20.06-py3`
 1. `docker pull nvcr.io/nvidia/tritonserver:20.06-py3`
@@ -124,7 +135,7 @@ unload models. This is especially useful when dealing with numerous large models
 
 
 For models exported to onnx format and hosted inside onnx runtime it might be required to limit visible cpu to fully utlize gpu acceleration. Use `--cpuset-cpus` docker option for that.
 For models exported to onnx format and hosted inside onnx runtime it might be required to limit visible cpu to fully utlize gpu acceleration. Use `--cpuset-cpus` docker option for that.
 
 
-### Running client
+#### Running client
 
 
 Exemplary client `client.py` allows to check model performance against synthetic or real validation
 Exemplary client `client.py` allows to check model performance against synthetic or real validation
 data. Client connects to Triton server and perform inference.
 data. Client connects to Triton server and perform inference.
@@ -132,30 +143,33 @@ data. Client connects to Triton server and perform inference.
 ```
 ```
 usage: client.py [-h] --triton-server-url TRITON_SERVER_URL
 usage: client.py [-h] --triton-server-url TRITON_SERVER_URL
                  --triton-model-name TRITON_MODEL_NAME
                  --triton-model-name TRITON_MODEL_NAME
-                 [--triton-model-version TRITON_MODEL_VERSION]
-                 [-v] [-H HTTP_HEADER]
-                 --dataset_config DATASET_CONFIG
+                 [--triton-model-version TRITON_MODEL_VERSION] [-v]
+                 [-H HTTP_HEADER] --dataset_config DATASET_CONFIG
                  [--inference_data INFERENCE_DATA] [--batch_size BATCH_SIZE]
                  [--inference_data INFERENCE_DATA] [--batch_size BATCH_SIZE]
-                 [--fp16]
+                 [--drop_last_batch DROP_LAST_BATCH] [--fp16]
+                 [--test_batches TEST_BATCHES]
 
 
 optional arguments:
 optional arguments:
   -h, --help            show this help message and exit
   -h, --help            show this help message and exit
   --triton-server-url TRITON_SERVER_URL
   --triton-server-url TRITON_SERVER_URL
-                        URL adress of trtion server (with port)
+                        URL adress of triton server (with port)
   --triton-model-name TRITON_MODEL_NAME
   --triton-model-name TRITON_MODEL_NAME
                         Triton deployed model name
                         Triton deployed model name
   --triton-model-version TRITON_MODEL_VERSION
   --triton-model-version TRITON_MODEL_VERSION
                         Triton model version
                         Triton model version
-  -v, --verbose         Verbose mode.
+  -v, --verbose         Enable verbose output
   -H HTTP_HEADER        HTTP headers to add to inference server requests.
   -H HTTP_HEADER        HTTP headers to add to inference server requests.
                         Format is -H"Header:Value".
                         Format is -H"Header:Value".
   --dataset_config DATASET_CONFIG
   --dataset_config DATASET_CONFIG
-                        Configuration file describing categorical features
   --inference_data INFERENCE_DATA
   --inference_data INFERENCE_DATA
                         Path to file with inference data.
                         Path to file with inference data.
   --batch_size BATCH_SIZE
   --batch_size BATCH_SIZE
                         Inference request batch size
                         Inference request batch size
+  --drop_last_batch DROP_LAST_BATCH
+                        Drops the last batch size if it's not full
   --fp16                Use 16bit for numerical input
   --fp16                Use 16bit for numerical input
+  --test_batches TEST_BATCHES
+                        Specifies number of batches used in the inference
 ```
 ```
 
 
 To run inference on model exported in previous steps, using data located under
 To run inference on model exported in previous steps, using data located under
@@ -164,7 +178,7 @@ To run inference on model exported in previous steps, using data located under
 `python -m triton.client --triton-server-url localhost:8000 --triton-model-name dlrm-ts-trace-16 --dataset_config /data/model_size.json --inference_data /data/test --batch_size 4096 --fp16`
 `python -m triton.client --triton-server-url localhost:8000 --triton-model-name dlrm-ts-trace-16 --dataset_config /data/model_size.json --inference_data /data/test --batch_size 4096 --fp16`
 
 
 
 
-### Gathering performance data
+#### Gathering performance data
 Performance data can be gathered using `perf_client` tool. To use this tool, performance data needs
 Performance data can be gathered using `perf_client` tool. To use this tool, performance data needs
 to be dumped during deployment. To do that, use `--dump_perf_data` option for the deployer:
 to be dumped during deployment. To do that, use `--dump_perf_data` option for the deployer:
 
 
@@ -172,11 +186,13 @@ to be dumped during deployment. To do that, use `--dump_perf_data` option for th
 
 
 When perf data are dumped, `perf_client` can be used with following command:
 When perf data are dumped, `perf_client` can be used with following command:
 
 
-`/workspace/bin/perf_client --max-threads 10 -m dlrm-onnx-16 -x 1 -p 5000 -v -i gRPC -u localhost:8001 -b 4096 -l 5000 --concurrency-range 1 --input-data /location/for/perfdata -f result.csv`
+`/workspace/bin/perf_client --max-threads 10 -m dlrm-ts-trace-16 -x 1 -p 5000 -v -i gRPC -u localhost:8001 -b 4096 -l 5000 --concurrency-range 1 --input-data /location/for/perfdata -f result.csv`
 
 
 For more information about `perf_client` please refer to [official documentation](https://docs.nvidia.com/deeplearning/sdk/triton-inference-server-master-branch-guide/docs/optimization.html#perf-client).
 For more information about `perf_client` please refer to [official documentation](https://docs.nvidia.com/deeplearning/sdk/triton-inference-server-master-branch-guide/docs/optimization.html#perf-client).
 
 
-## Throughput/Latency results
+## Performance
+
+### Throughput/Latency results
 
 
 Throughput is measured in recommendations/second, and latency in milliseconds.
 Throughput is measured in recommendations/second, and latency in milliseconds.
 
 
@@ -258,8 +274,9 @@ Throughput is measured in recommendations/second, and latency in milliseconds.
 The plot above shows that the GPU is saturated with batch size 4096. However, running inference with larger batches
 The plot above shows that the GPU is saturated with batch size 4096. However, running inference with larger batches
 might be faster than running several inference requests. Therefore, we choose 65536 as the optimal batch size.
 might be faster than running several inference requests. Therefore, we choose 65536 as the optimal batch size.
 
 
+## Advanced
 
 
-## Dynamic batching support
+### Dynamic batching support
 The Triton server has a dynamic batching mechanism built in that can be enabled. When it is enabled, the server creates
 The Triton server has a dynamic batching mechanism built in that can be enabled. When it is enabled, the server creates
 inference batches from the received requests. Since the output of the model is a single probability, the batch size of a
 inference batches from the received requests. Since the output of the model is a single probability, the batch size of a
 single request may be large. Here it is assumed to be 4096. With dynamic batching enabled, the server will concatenate requests of this size into
 single request may be large. Here it is assumed to be 4096. With dynamic batching enabled, the server will concatenate requests of this size into

+ 10 - 2
PyTorch/Recommendation/DLRM/triton/client.py

@@ -48,7 +48,8 @@ def get_data_loader(batch_size, *, data_path, model_config):
             numerical_features=True,
             numerical_features=True,
             categorical_features=range(len(categorical_sizes)),
             categorical_features=range(len(categorical_sizes)),
             categorical_feature_sizes=categorical_sizes,
             categorical_feature_sizes=categorical_sizes,
-            prefetch_depth=1
+            prefetch_depth=1,
+            drop_last_batch=model_config.drop_last_batch
         )
         )
     else:
     else:
         data = SyntheticDataset(
         data = SyntheticDataset(
@@ -59,6 +60,9 @@ def get_data_loader(batch_size, *, data_path, model_config):
             device="cpu"
             device="cpu"
         )
         )
 
 
+    if model_config.test_batches > 0:
+        data = torch.utils.data.Subset(data, list(range(model_config.test_batches)))
+
     return torch.utils.data.DataLoader(data,
     return torch.utils.data.DataLoader(data,
                                        batch_size=None,
                                        batch_size=None,
                                        num_workers=0,
                                        num_workers=0,
@@ -111,8 +115,12 @@ if __name__ == '__main__':
                         help="Path to file with inference data.")
                         help="Path to file with inference data.")
     parser.add_argument("--batch_size", type=int, default=1,
     parser.add_argument("--batch_size", type=int, default=1,
                         help="Inference request batch size")
                         help="Inference request batch size")
+    parser.add_argument("--drop_last_batch", type=bool, default=True,
+                        help="Drops the last batch size if it's not full")
     parser.add_argument("--fp16", action="store_true", default=False,
     parser.add_argument("--fp16", action="store_true", default=False,
                         help="Use 16bit for numerical input")
                         help="Use 16bit for numerical input")
+    parser.add_argument("--test_batches", type=int, default=0,
+                        help="Specifies number of batches used in the inference")
 
 
     FLAGS = parser.parse_args()
     FLAGS = parser.parse_args()
     try:
     try:
@@ -152,7 +160,7 @@ if __name__ == '__main__':
     tgt_list = np.concatenate(tgt_list)
     tgt_list = np.concatenate(tgt_list)
 
 
     score = roc_auc_score(tgt_list, results)
     score = roc_auc_score(tgt_list, results)
-    print(F"Model score: {score}")
+    print(f"Model score: {score}")
 
 
     statistics = triton_client.get_inference_statistics(model_name=FLAGS.triton_model_name, headers=headers_dict)
     statistics = triton_client.get_inference_statistics(model_name=FLAGS.triton_model_name, headers=headers_dict)
     print(statistics)
     print(statistics)