Browse Source

[SE3Transformer/DGLPyT] 22.08 container update

Alexandre Milesi 3 years ago
parent
commit
fe337c5259
25 changed files with 497 additions and 294 deletions
  1. 14 5
      DGLPyTorch/DrugDiscovery/SE3Transformer/Dockerfile
  2. 1 1
      DGLPyTorch/DrugDiscovery/SE3Transformer/LICENSE
  3. 1 0
      DGLPyTorch/DrugDiscovery/SE3Transformer/NOTICE
  4. 51 47
      DGLPyTorch/DrugDiscovery/SE3Transformer/README.md
  5. 2 2
      DGLPyTorch/DrugDiscovery/SE3Transformer/se3_transformer/data_loading/data_module.py
  6. 2 2
      DGLPyTorch/DrugDiscovery/SE3Transformer/se3_transformer/data_loading/qm9.py
  7. 5 2
      DGLPyTorch/DrugDiscovery/SE3Transformer/se3_transformer/model/basis.py
  8. 2 2
      DGLPyTorch/DrugDiscovery/SE3Transformer/se3_transformer/model/fiber.py
  9. 2 2
      DGLPyTorch/DrugDiscovery/SE3Transformer/se3_transformer/model/layers/attention.py
  10. 2 2
      DGLPyTorch/DrugDiscovery/SE3Transformer/se3_transformer/model/layers/convolution.py
  11. 2 2
      DGLPyTorch/DrugDiscovery/SE3Transformer/se3_transformer/model/layers/linear.py
  12. 2 2
      DGLPyTorch/DrugDiscovery/SE3Transformer/se3_transformer/model/layers/norm.py
  13. 2 2
      DGLPyTorch/DrugDiscovery/SE3Transformer/se3_transformer/model/layers/pooling.py
  14. 2 2
      DGLPyTorch/DrugDiscovery/SE3Transformer/se3_transformer/model/transformer.py
  15. 2 2
      DGLPyTorch/DrugDiscovery/SE3Transformer/se3_transformer/runtime/arguments.py
  16. 11 20
      DGLPyTorch/DrugDiscovery/SE3Transformer/se3_transformer/runtime/callbacks.py
  17. 366 155
      DGLPyTorch/DrugDiscovery/SE3Transformer/se3_transformer/runtime/gpu_affinity.py
  18. 5 3
      DGLPyTorch/DrugDiscovery/SE3Transformer/se3_transformer/runtime/inference.py
  19. 2 20
      DGLPyTorch/DrugDiscovery/SE3Transformer/se3_transformer/runtime/loggers.py
  20. 2 2
      DGLPyTorch/DrugDiscovery/SE3Transformer/se3_transformer/runtime/metrics.py
  21. 10 10
      DGLPyTorch/DrugDiscovery/SE3Transformer/se3_transformer/runtime/training.py
  22. 3 3
      DGLPyTorch/DrugDiscovery/SE3Transformer/se3_transformer/runtime/utils.py
  23. 1 1
      DGLPyTorch/DrugDiscovery/SE3Transformer/setup.py
  24. 2 2
      DGLPyTorch/DrugDiscovery/SE3Transformer/tests/test_equivariance.py
  25. 3 3
      DGLPyTorch/DrugDiscovery/SE3Transformer/tests/utils.py

+ 14 - 5
DGLPyTorch/DrugDiscovery/SE3Transformer/Dockerfile

@@ -24,7 +24,7 @@
 # run docker daemon with --default-runtime=nvidia for GPU detection during build
 # multistage build for DGL with CUDA and FP16
 
-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.07-py3
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:22.08-py3
 
 FROM ${FROM_IMAGE_NAME} AS dgl_builder
 
@@ -33,11 +33,19 @@ RUN apt-get update \
     && apt-get install -y git build-essential python3-dev make cmake \
     && rm -rf /var/lib/apt/lists/*
 WORKDIR /dgl
-RUN git clone --branch v0.7.0 --recurse-submodules --depth 1 https://github.com/dmlc/dgl.git .
-RUN sed -i 's/"35 50 60 70"/"60 70 80"/g' cmake/modules/CUDA.cmake
+RUN git clone --branch 0.9.0 --recurse-submodules --depth 1 https://github.com/dmlc/dgl.git .
 WORKDIR build
-RUN cmake -DUSE_CUDA=ON -DUSE_FP16=ON ..
-RUN make -j8
+RUN export NCCL_ROOT=/usr \
+    && cmake .. -GNinja -DCMAKE_BUILD_TYPE=Release \
+        -DUSE_CUDA=ON -DCUDA_ARCH_BIN="60 70 80" -DCUDA_ARCH_PTX="80" \
+        -DCUDA_ARCH_NAME="Manual" \
+        -DUSE_FP16=ON \
+        -DBUILD_TORCH=ON \
+        -DUSE_NCCL=ON \
+        -DUSE_SYSTEM_NCCL=ON \
+        -DBUILD_WITH_SHARED_NCCL=ON \
+        -DUSE_AVX=ON \
+    && cmake --build .
 
 
 FROM ${FROM_IMAGE_NAME}
@@ -49,6 +57,7 @@ COPY --from=dgl_builder /dgl ./dgl
 RUN cd dgl/python && python setup.py install && cd ../.. && rm -rf dgl
 
 ADD requirements.txt .
+RUN pip install --no-cache-dir --upgrade --pre pip
 RUN pip install --no-cache-dir -r requirements.txt
 ADD . .
 

+ 1 - 1
DGLPyTorch/DrugDiscovery/SE3Transformer/LICENSE

@@ -1,4 +1,4 @@
-Copyright 2021 NVIDIA CORPORATION & AFFILIATES
+Copyright 2021-2022 NVIDIA CORPORATION & AFFILIATES
 
 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 

+ 1 - 0
DGLPyTorch/DrugDiscovery/SE3Transformer/NOTICE

@@ -1,3 +1,4 @@
+
 SE(3)-Transformer PyTorch
 
 This repository includes software from https://github.com/FabianFuchsML/se3-transformer-public

+ 51 - 47
DGLPyTorch/DrugDiscovery/SE3Transformer/README.md

@@ -161,11 +161,11 @@ Competitive training results and analysis are provided for the following hyperpa
 
 This model supports the following features:: 
 
-| Feature               | SE(3)-Transformer                
-|-----------------------|--------------------------
-|Automatic mixed precision (AMP)   |         Yes 
-|Distributed data parallel (DDP)   |         Yes 
-         
+| Feature                         | SE(3)-Transformer |
+|---------------------------------|-------------------|
+| Automatic mixed precision (AMP) | Yes               |
+| Distributed data parallel (DDP) | Yes               |
+
 #### Features
 
 
@@ -476,20 +476,20 @@ The following sections provide details on how we achieved our performance and ac
 
 Our results were obtained by running the `scripts/train.sh` training script in the PyTorch 21.07 NGC container on NVIDIA DGX A100 (8x A100 80GB) GPUs.
 
-| GPUs    | Batch size / GPU    | Absolute error - TF32  | Absolute error - mixed precision  |   Time to train - TF32  |  Time to train - mixed precision | Time to train speedup (mixed precision to TF32) |       
-|:------------------:|:----------------------:|:--------------------:|:------------------------------------:|:---------------------------------:|:----------------------:|:----------------------------------------------:|
-|  1                 |    240                   |           0.03456                            |        0.03460                                |        1h23min      |    1h03min                |    1.32x              |
-|  8                 |    240                   |           0.03417                            |        0.03424                                |        15min          |    12min                |    1.25x              |
+| GPUs | Batch size / GPU | Absolute error - TF32 | Absolute error - mixed precision | Time to train - TF32 | Time to train - mixed precision | Time to train speedup (mixed precision to TF32) |       
+|:----:|:----------------:|:---------------------:|:--------------------------------:|:--------------------:|:-------------------------------:|:-----------------------------------------------:|
+|  1   |       240        |        0.03038        |             0.02987              |       1h02min        |              50min              |                      1.24x                      |
+|  8   |       240        |        0.03466        |             0.03436              |        13min         |              10min              |                      1.27x                      |
 
 
 ##### Training accuracy: NVIDIA DGX-1 (8x V100 16GB)
 
 Our results were obtained by running the `scripts/train.sh` training script in the PyTorch 21.07 NGC container on NVIDIA DGX-1 with (8x V100 16GB) GPUs.
 
-| GPUs    | Batch size / GPU    | Absolute error - FP32  | Absolute error - mixed precision  |   Time to train - FP32  |  Time to train - mixed precision | Time to train speedup (mixed precision to FP32)  |      
-|:------------------:|:----------------------:|:--------------------:|:------------------------------------:|:---------------------------------:|:----------------------:|:----------------------------------------------:|
-|  1                 |    240                   |           0.03432                            |        0.03439                                |         2h25min         |    1h33min                |    1.56x              |
-|  8                 |    240                   |           0.03380                            |        0.03495                                |        29min          |    20min                |    1.45x              |
+| GPUs | Batch size / GPU | Absolute error - FP32 | Absolute error - mixed precision | Time to train - FP32 | Time to train - mixed precision | Time to train speedup (mixed precision to FP32) |      
+|:----:|:----------------:|:---------------------:|:--------------------------------:|:--------------------:|:-------------------------------:|:-----------------------------------------------:|
+|  1   |       240        |        0.03044        |             0.03076              |       2h07min        |             1h22min             |                      1.55x                      |
+|  8   |       240        |        0.03435        |             0.03495              |        27min         |              19min              |                      1.42x                      |
 
 
 
@@ -499,12 +499,12 @@ Our results were obtained by running the `scripts/train.sh` training script in t
 
 Our results were obtained by running the `scripts/benchmark_train.sh` and `scripts/benchmark_train_multi_gpu.sh` benchmarking scripts in the PyTorch 21.07 NGC container on NVIDIA DGX A100 with 8x A100 80GB GPUs. Performance numbers (in molecules per millisecond) were averaged over five  entire training epochs after a warmup epoch.
 
-| GPUs             | Batch size / GPU     | Throughput - TF32 [mol/ms]                             | Throughput - mixed precision [mol/ms]      | Throughput speedup (mixed precision - TF32)   | Weak scaling - TF32    | Weak scaling - mixed precision |
-|:------------------:|:----------------------:|:--------------------:|:------------------------------------:|:---------------------------------:|:----------------------:|:----------------------------------------------:|
-|   1              |     240             |   2.21                                       |   2.92                            |   1.32x                         |                      |                                              |
-|   1              |     120              |  1.81                                        |  2.04                             |  1.13x                          |                      |                                              |
-|   8              |     240             |   15.88                                      |     21.02                         |   1.32x                         |   7.18               |    7.20                                     |
-|   8              |     120              |  12.68                                       |    13.99                          |  1.10x                          |       7.00           |    6.86                                       |
+|       GPUs       |  Batch size / GPU   | Throughput - TF32 [mol/ms] | Throughput - mixed precision [mol/ms] | Throughput speedup (mixed precision - TF32) | Weak scaling - TF32 | Weak scaling - mixed precision |
+|:----------------:|:-------------------:|:--------------------------:|:-------------------------------------:|:-------------------------------------------:|:-------------------:|:------------------------------:|
+|        1         |         240         |            2.61            |                 3.35                  |                    1.28x                    |                     |                                |
+|        1         |         120         |            1.94            |                 2.07                  |                    1.07x                    |                     |                                |
+|        8         |         240         |           18.80            |                 23.90                 |                    1.27x                    |        7.20         |              7.13              |
+|        8         |         120         |           14.10            |                 14.52                 |                    1.03x                    |        7.27         |              7.01              |
 
 
 To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
@@ -514,12 +514,12 @@ To achieve these same results, follow the steps in the [Quick Start Guide](#quic
 
 Our results were obtained by running the `scripts/benchmark_train.sh` and `scripts/benchmark_train_multi_gpu.sh` benchmarking scripts in the PyTorch 21.07 NGC container on NVIDIA DGX-1 with 8x V100 16GB GPUs. Performance numbers (in molecules per millisecond) were averaged over five  entire training epochs after a warmup epoch.
 
-| GPUs             | Batch size / GPU     | Throughput - FP32 [mol/ms] | Throughput - mixed precision  [mol/ms]     | Throughput speedup (FP32 - mixed precision)   | Weak scaling - FP32    | Weak scaling - mixed precision |
-|:------------------:|:----------------------:|:--------------------:|:------------------------------------:|:---------------------------------:|:----------------------:|:----------------------------------------------:|
-|   1              |     240              |    1.25          |    1.88                           |  1.50x                          |                      |                                              |
-|   1              |     120              |    1.03           |   1.41                            |  1.37x                          |                      |                                              |
-|   8              |     240              |    8.68           |   12.75                           |  1.47x                          |      6.94            |      6.78                                    |
-|   8              |     120              |    6.64           |   8.58                           |   1.29x                         |        6.44          |        6.08                                  |
+|       GPUs       |   Batch size / GPU   | Throughput - FP32 [mol/ms] | Throughput - mixed precision  [mol/ms] | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision |
+|:----------------:|:--------------------:|:--------------------------:|:--------------------------------------:|:-------------------------------------------:|:-------------------:|:------------------------------:|
+|        1         |         240          |            1.33            |                  2.12                  |                    1.59x                    |                     |                                |
+|        1         |         120          |            1.11            |                  1.45                  |                    1.31x                    |                     |                                |
+|        8         |         240          |            9.32            |                 13.40                  |                    1.44x                    |        7.01         |              6.32              |
+|        8         |         120          |            6.90            |                  8.39                  |                    1.22x                    |        6.21         |              5.79              |
 
 
 To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
@@ -532,21 +532,21 @@ To achieve these same results, follow the steps in the [Quick Start Guide](#quic
 
 Our results were obtained by running the `scripts/benchmark_inference.sh` inferencing benchmarking script in the PyTorch 21.07 NGC container on NVIDIA DGX A100 with 1x A100 80GB GPU.
 
-FP16
+AMP
 
-| Batch size | Throughput Avg [mol/ms] | Latency Avg [ms] | Latency 90% [ms] |Latency 95% [ms] |Latency 99% [ms] |
-|:------------:|:------:|:-----:|:-----:|:-----:|:-----:|
-| 1600 | 11.60 | 140.94 | 138.29 | 140.12 | 386.40 |
-| 800 | 10.74 | 75.69 | 75.74 | 76.50 | 79.77 |
-| 400 | 8.86 | 45.57 | 46.11 | 46.60 | 49.97 |
+| Batch size | Throughput Avg [mol/ms] | Latency Avg [ms] | Latency 90% [ms] | Latency 95% [ms] | Latency 99% [ms] |
+|:----------:|:-----------------------:|:----------------:|:----------------:|:----------------:|:----------------:|
+|    1600    |          13.54          |      121.44      |      118.07      |      119.00      |      366.64      |
+|    800     |          12.63          |      64.11       |      63.78       |      64.37       |      68.19       |
+|    400     |          10.65          |      37.97       |      39.02       |      39.67       |      42.87       |
 
 TF32
 
-| Batch size | Throughput Avg [mol/ms] | Latency Avg [ms] | Latency 90% [ms] |Latency 95% [ms] |Latency 99% [ms] |
-|:------------:|:------:|:-----:|:-----:|:-----:|:-----:|
-| 1600 | 8.58 | 189.20 | 186.39 | 187.71 | 420.28 |
-| 800 | 8.28 | 97.56 | 97.20 | 97.73 | 101.13 |
-| 400 | 7.55 | 53.38 | 53.72 | 54.48 | 56.62 |
+| Batch size | Throughput Avg [mol/ms] | Latency Avg [ms] | Latency 90% [ms] | Latency 95% [ms] | Latency 99% [ms] |
+|:----------:|:-----------------------:|:----------------:|:----------------:|:----------------:|:----------------:|
+|    1600    |          8.97           |      180.85      |      178.31      |      178.92      |      375.33      |
+|    800     |          8.86           |      90.76       |      90.77       |      91.11       |      92.96       |
+|    400     |          8.49           |      47.42       |      47.65       |      48.15       |      50.74       |
 
 To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
 
@@ -556,21 +556,21 @@ To achieve these same results, follow the steps in the [Quick Start Guide](#quic
 
 Our results were obtained by running the `scripts/benchmark_inference.sh` inferencing benchmarking script in the PyTorch 21.07 NGC container on NVIDIA DGX-1 with 1x V100 16GB GPU.
 
-FP16
+AMP
 
-| Batch size | Throughput Avg [mol/ms] | Latency Avg [ms] | Latency 90% [ms] |Latency 95% [ms] |Latency 99% [ms] |
-|:------------:|:------:|:-----:|:-----:|:-----:|:-----:|
-| 1600 | 6.42 | 254.54 | 247.97 | 249.29 | 721.15 |
-| 800 | 6.13 | 132.07 | 131.90 | 132.70 | 140.15 |
-| 400 | 5.37 | 75.12 | 76.01 | 76.66 | 79.90 |
+| Batch size | Throughput Avg [mol/ms] | Latency Avg [ms] | Latency 90% [ms] | Latency 95% [ms] | Latency 99% [ms] |
+|:----------:|:-----------------------:|:----------------:|:----------------:|:----------------:|:----------------:|
+|    1600    |          6.59           |      248.02      |      242.11      |      242.62      |      674.60      |
+|    800     |          6.38           |      126.49      |      125.96      |      126.31      |      127.72      |
+|    400     |          5.90           |      68.24       |      68.53       |      69.02       |      70.87       |
 
 FP32
 
-| Batch size | Throughput Avg [mol/ms] | Latency Avg [ms] | Latency 90% [ms] |Latency 95% [ms] |Latency 99% [ms] |
-|:------------:|:------:|:-----:|:-----:|:-----:|:-----:|
-| 1600 | 3.39 | 475.86 | 473.82 | 475.64 | 891.18 |
-| 800 | 3.36 | 239.17 | 240.64 | 241.65 | 243.70 |
-| 400 | 3.17 | 126.67 | 128.19 | 128.82 | 130.54 |
+| Batch size | Throughput Avg [mol/ms] | Latency Avg [ms] | Latency 90% [ms] | Latency 95% [ms] | Latency 99% [ms] |
+|:----------:|:-----------------------:|:----------------:|:----------------:|:----------------:|:----------------:|
+|    1600    |          3.33           |      482.20      |      483.50      |      485.28      |      754.84      |
+|    800     |          3.35           |      239.09      |      242.21      |      243.13      |      244.91      |
+|    400     |          3.27           |      122.68      |      123.60      |      124.18      |      125.85      |
 
 
 To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
@@ -580,6 +580,10 @@ To achieve these same results, follow the steps in the [Quick Start Guide](#quic
 
 ### Changelog
 
+August 2022:
+- Slight performance improvements
+- Upgraded base container
+
 November 2021:
 - Improved low memory mode to give further 6x memory savings
 - Disabled W&B logging by default

+ 2 - 2
DGLPyTorch/DrugDiscovery/SE3Transformer/se3_transformer/data_loading/data_module.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
@@ -18,7 +18,7 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
 #
-# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES
+# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES
 # SPDX-License-Identifier: MIT
 
 import torch.distributed as dist

+ 2 - 2
DGLPyTorch/DrugDiscovery/SE3Transformer/se3_transformer/data_loading/qm9.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
@@ -18,7 +18,7 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
 #
-# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES
+# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES
 # SPDX-License-Identifier: MIT
 from typing import Tuple
 

+ 5 - 2
DGLPyTorch/DrugDiscovery/SE3Transformer/se3_transformer/model/basis.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
@@ -18,7 +18,7 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
 #
-# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES
+# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES
 # SPDX-License-Identifier: MIT
 
 
@@ -33,6 +33,9 @@ from torch.cuda.nvtx import range as nvtx_range
 
 from se3_transformer.runtime.utils import degree_to_dim
 
+torch._C._jit_set_profiling_executor(False)
+torch._C._jit_set_profiling_mode(False)
+
 
 @lru_cache(maxsize=None)
 def get_clebsch_gordon(J: int, d_in: int, d_out: int, device) -> Tensor:

+ 2 - 2
DGLPyTorch/DrugDiscovery/SE3Transformer/se3_transformer/model/fiber.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
@@ -18,7 +18,7 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
 #
-# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES
+# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES
 # SPDX-License-Identifier: MIT
 
 

+ 2 - 2
DGLPyTorch/DrugDiscovery/SE3Transformer/se3_transformer/model/layers/attention.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
@@ -18,7 +18,7 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
 #
-# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES
+# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES
 # SPDX-License-Identifier: MIT
 
 import dgl

+ 2 - 2
DGLPyTorch/DrugDiscovery/SE3Transformer/se3_transformer/model/layers/convolution.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
@@ -18,7 +18,7 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
 #
-# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES
+# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES
 # SPDX-License-Identifier: MIT
 
 from enum import Enum

+ 2 - 2
DGLPyTorch/DrugDiscovery/SE3Transformer/se3_transformer/model/layers/linear.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
@@ -18,7 +18,7 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
 #
-# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES
+# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES
 # SPDX-License-Identifier: MIT
 
 

+ 2 - 2
DGLPyTorch/DrugDiscovery/SE3Transformer/se3_transformer/model/layers/norm.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
@@ -18,7 +18,7 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
 #
-# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES
+# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES
 # SPDX-License-Identifier: MIT
 
 

+ 2 - 2
DGLPyTorch/DrugDiscovery/SE3Transformer/se3_transformer/model/layers/pooling.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
@@ -18,7 +18,7 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
 #
-# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES
+# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES
 # SPDX-License-Identifier: MIT
 
 from typing import Dict, Literal

+ 2 - 2
DGLPyTorch/DrugDiscovery/SE3Transformer/se3_transformer/model/transformer.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
@@ -18,7 +18,7 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
 #
-# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES
+# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES
 # SPDX-License-Identifier: MIT
 
 import logging

+ 2 - 2
DGLPyTorch/DrugDiscovery/SE3Transformer/se3_transformer/runtime/arguments.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
@@ -18,7 +18,7 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
 #
-# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES
+# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES
 # SPDX-License-Identifier: MIT
 
 import argparse

+ 11 - 20
DGLPyTorch/DrugDiscovery/SE3Transformer/se3_transformer/runtime/callbacks.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
@@ -18,7 +18,7 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
 #
-# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES
+# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES
 # SPDX-License-Identifier: MIT
 
 import logging
@@ -34,7 +34,7 @@ from se3_transformer.runtime.metrics import MeanAbsoluteError
 
 
 class BaseCallback(ABC):
-    def on_fit_start(self, optimizer, args):
+    def on_fit_start(self, optimizer, args, start_epoch):
         pass
 
     def on_fit_end(self):
@@ -64,17 +64,17 @@ class LRSchedulerCallback(BaseCallback):
         self.logger = logger
         self.scheduler = None
 
-        self.logger.log_metadata('learning rate', {'unit': None})
-
     @abstractmethod
-    def get_scheduler(self, optimizer, args):
+    def get_scheduler(self, optimizer, args, last_epoch):
         pass
 
-    def on_fit_start(self, optimizer, args):
-        self.scheduler = self.get_scheduler(optimizer, args)
+    def on_fit_start(self, optimizer, args, start_epoch):
+        self.scheduler = self.get_scheduler(optimizer, args, start_epoch - 1)
+        if hasattr(self, 'state_dict'):
+            self.scheduler.load_state_dict(self.state_dict)
 
     def on_checkpoint_load(self, checkpoint):
-        self.scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
+        self.state_dict = checkpoint['scheduler_state_dict']
 
     def on_checkpoint_save(self, checkpoint):
         checkpoint['scheduler_state_dict'] = self.scheduler.state_dict()
@@ -96,9 +96,6 @@ class QM9MetricCallback(BaseCallback):
         self.best_mae = float('inf')
         self.last_mae = None
 
-        self.logger.log_metadata(f'{self.prefix} MAE', {'unit': None})
-        self.logger.log_metadata(f'{self.prefix} best MAE', {'unit': None})
-
     def on_validation_step(self, input, target, pred):
         self.mae(pred.detach(), target.detach())
 
@@ -120,9 +117,9 @@ class QM9LRSchedulerCallback(LRSchedulerCallback):
         super().__init__(logger)
         self.epochs = epochs
 
-    def get_scheduler(self, optimizer, args):
+    def get_scheduler(self, optimizer, args, last_epoch):
         min_lr = args.min_learning_rate if args.min_learning_rate else args.learning_rate / 10.0
-        return torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, self.epochs, eta_min=min_lr)
+        return torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, self.epochs, eta_min=min_lr, last_epoch=last_epoch)
 
 
 class PerformanceCallback(BaseCallback):
@@ -134,12 +131,6 @@ class PerformanceCallback(BaseCallback):
         self.mode = mode
         self.logger = logger
 
-        logger.log_metadata(f"throughput_{self.mode}", {'unit': 'molecules/s'})
-        logger.log_metadata(f"total_time_{self.mode}", {'unit': 's'})
-        logger.log_metadata(f"latency_{self.mode}_mean", {'unit': 's'})
-        for level in [90, 95, 99]:
-            logger.log_metadata(f"latency_{self.mode}_{level}", {'unit': 's'})
-
     def on_batch_start(self):
         if self.epoch >= self.warmup_epochs:
             self.timestamps.append(time.time() * 1000.0)

+ 366 - 155
DGLPyTorch/DrugDiscovery/SE3Transformer/se3_transformer/runtime/gpu_affinity.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
@@ -18,12 +18,11 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
 #
-# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES
+# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES
 # SPDX-License-Identifier: MIT
 
 import collections
 import itertools
-import math
 import os
 import pathlib
 import re
@@ -32,8 +31,12 @@ import pynvml
 
 
 class Device:
-    # assumes nvml returns list of 64 bit ints
-    _nvml_affinity_elements = math.ceil(os.cpu_count() / 64)
+    # assume nvml returns list of 64 bit ints
+    _nvml_bit_affinity = 64
+
+    _nvml_affinity_elements = (
+        os.cpu_count() + _nvml_bit_affinity - 1
+    ) // _nvml_bit_affinity
 
     def __init__(self, device_idx):
         super().__init__()
@@ -45,11 +48,20 @@ class Device:
     def get_uuid(self):
         return pynvml.nvmlDeviceGetUUID(self.handle)
 
-    def get_cpu_affinity(self):
-        affinity_string = ""
-        for j in pynvml.nvmlDeviceGetCpuAffinity(self.handle, Device._nvml_affinity_elements):
+    def get_cpu_affinity(self, scope):
+        if scope == 'socket':
+            nvml_scope = pynvml.NVML_AFFINITY_SCOPE_SOCKET
+        elif scope == 'node':
+            nvml_scope = pynvml.NVML_AFFINITY_SCOPE_NODE
+        else:
+            raise RuntimeError('Unknown scope')
+
+        affinity_string = ''
+        for j in pynvml.nvmlDeviceGetCpuAffinityWithinScope(
+            self.handle, Device._nvml_affinity_elements, nvml_scope
+        ):
             # assume nvml returns list of 64 bit ints
-            affinity_string = "{:064b}".format(j) + affinity_string
+            affinity_string = '{:064b}'.format(j) + affinity_string
 
         affinity_list = [int(x) for x in affinity_string]
         affinity_list.reverse()  # so core 0 is in 0th element of list
@@ -63,213 +75,410 @@ def get_thread_siblings_list():
     Returns a list of 2-element integer tuples representing pairs of
     hyperthreading cores.
     """
-    path = "/sys/devices/system/cpu/cpu*/topology/thread_siblings_list"
+    path = '/sys/devices/system/cpu/cpu*/topology/thread_siblings_list'
     thread_siblings_list = []
-    pattern = re.compile(r"(\d+)\D(\d+)")
+    pattern = re.compile(r'(\d+)\D(\d+)')
     for fname in pathlib.Path(path[0]).glob(path[1:]):
         with open(fname) as f:
             content = f.read().strip()
             res = pattern.findall(content)
             if res:
-                pair = tuple(map(int, res[0]))
+                pair = tuple(sorted(map(int, res[0])))
                 thread_siblings_list.append(pair)
+    thread_siblings_list = list(set(thread_siblings_list))
     return thread_siblings_list
 
 
-def check_socket_affinities(socket_affinities):
+def build_thread_siblings_dict(siblings_list):
+    siblings_dict = {}
+    for siblings_tuple in siblings_list:
+        for core in siblings_tuple:
+            siblings_dict[core] = siblings_tuple
+
+    return siblings_dict
+
+
+def group_list_by_key(the_list, key):
+    sorted_list = sorted(the_list, key=key)
+    grouped = [
+        tuple(group) for key, group in itertools.groupby(sorted_list, key=key)
+    ]
+    return grouped
+
+
+def ungroup_affinities(affinities, scope, cores, min_cores=1, max_cores=None):
+    if scope == 'socket':
+        affinities = [
+            list(itertools.chain(*zip(*affinity))) for affinity in affinities
+        ]
+    elif scope == 'node':
+        affinities = [
+            [group[0] for group in affinity] for affinity in affinities
+        ]
+
+    for gpu_id, affinity in enumerate(affinities):
+        if len(affinity) < min_cores:
+            raise RuntimeError(
+                f'Number of available physical cores for GPU {gpu_id} is less '
+                f'than the predefined minimum, min_cores={min_cores}, available '
+                f'physical cores: {affinity} (count={len(affinity)})'
+            )
+
+    if max_cores is not None:
+        affinities = [affinity[:max_cores] for affinity in affinities]
+
+    if cores == 'all_logical':
+        affinities = [
+            list(itertools.chain(*affinity)) for affinity in affinities
+        ]
+    elif cores == 'single_logical':
+        affinities = [
+            [group[0] for group in affinity] for affinity in affinities
+        ]
+    else:
+        raise RuntimeError('Unknown cores mode')
+
+    return affinities
+
+
+def check_affinities(affinities):
     # sets of cores should be either identical or disjoint
-    for i, j in itertools.product(socket_affinities, socket_affinities):
+    for i, j in itertools.product(affinities, affinities):
         if not set(i) == set(j) and not set(i).isdisjoint(set(j)):
-            raise RuntimeError(f"Sets of cores should be either identical or disjoint, " f"but got {i} and {j}.")
+            raise RuntimeError(
+                f'Sets of cores should be either identical or disjoint, '
+                f'but got {i} and {j}.'
+            )
 
 
-def get_socket_affinities(nproc_per_node, exclude_unavailable_cores=True):
+def get_affinities(nproc_per_node, scope, exclude_unavailable_cores=True):
     devices = [Device(i) for i in range(nproc_per_node)]
-    socket_affinities = [dev.get_cpu_affinity() for dev in devices]
+    affinities = [dev.get_cpu_affinity(scope) for dev in devices]
 
     if exclude_unavailable_cores:
         available_cores = os.sched_getaffinity(0)
-        socket_affinities = [list(set(affinity) & available_cores) for affinity in socket_affinities]
+        affinities = [
+            sorted(list(set(affinity) & available_cores))
+            for affinity in affinities
+        ]
+
+    check_affinities(affinities)
+
+    return affinities
+
+
+def get_grouped_affinities(nproc_per_node, exclude_unavailable_cores=True):
+    siblings_list = get_thread_siblings_list()
+    siblings_dict = build_thread_siblings_dict(siblings_list)
+
+    socket_affinities = get_affinities(
+        nproc_per_node, 'socket', exclude_unavailable_cores
+    )
+    node_affinities = get_affinities(
+        nproc_per_node, 'node', exclude_unavailable_cores
+    )
+
+    siblings_key = lambda x: siblings_dict.get(x, (x,))
+
+    sibling_node_affinities = [
+        tuple(group_list_by_key(affinity, key=siblings_key))
+        for affinity in node_affinities
+    ]
+    sibling_socket_affinities = [
+        tuple(group_list_by_key(affinity, key=siblings_key))
+        for affinity in socket_affinities
+    ]
+
+    socket_node_assigned_cores = collections.defaultdict(list)
+    for socket, node_cores in zip(
+        sibling_socket_affinities, sibling_node_affinities
+    ):
+        socket_node_assigned_cores[socket].extend(node_cores)
+
+    socket_node_assigned_cores = {
+        key: tuple(sorted(set(value)))
+        for key, value in socket_node_assigned_cores.items()
+    }
+
+    node_grouping = collections.defaultdict(list)
+
+    for socket_cores, assigned_cores in socket_node_assigned_cores.items():
+        unassigned_cores = sorted(
+            list(set(socket_cores) - set(assigned_cores))
+        )
+
+        for assigned_core in assigned_cores:
+            node_grouping[assigned_core].append(assigned_core)
+
+        for assigned, unassigned in zip(
+            itertools.cycle(assigned_cores), unassigned_cores
+        ):
+            node_grouping[assigned].append(unassigned)
+
+    node_grouping = {key: tuple(value) for key, value in node_grouping.items()}
 
-    check_socket_affinities(socket_affinities)
+    grouped_affinities = [
+        tuple(node_grouping[item] for item in sibling_node_affinity)
+        for sibling_node_affinity in sibling_node_affinities
+    ]
 
-    return socket_affinities
+    return grouped_affinities
 
 
-def set_socket_affinity(gpu_id):
+def set_all(gpu_id, nproc_per_node, scope, cores, min_cores, max_cores):
     """
-    The process is assigned with all available logical CPU cores from the CPU
-    socket connected to the GPU with a given id.
+    The process is assigned with all available physical CPU cores recommended by
+    pynvml for the GPU with a given id.
+
+    Assignment automatically includes available hyperthreading siblings if
+    cores='all_logical'.
 
     Args:
         gpu_id: index of a GPU
+        nproc_per_node: number of processes per node
+        scope: scope for retrieving affinity from pynvml, 'node' or 'socket'
+        cores: 'all_logical' or 'single_logical'
     """
-    dev = Device(gpu_id)
-    affinity = dev.get_cpu_affinity()
-    os.sched_setaffinity(0, affinity)
+    grouped_affinities = get_grouped_affinities(nproc_per_node)
+    ungrouped_affinities = ungroup_affinities(
+        grouped_affinities, scope, cores, min_cores, max_cores
+    )
+    os.sched_setaffinity(0, ungrouped_affinities[gpu_id])
 
 
-def set_single_affinity(gpu_id):
+def set_single(gpu_id, nproc_per_node, scope, cores, min_cores=1, max_cores=1):
     """
-    The process is assigned with the first available logical CPU core from the
-    list of all CPU cores from the CPU socket connected to the GPU with a given
-    id.
+    The process is assigned with the first available physical CPU core from the
+    list of all physical CPU cores recommended by pynvml for the GPU with a
+    given id.
+
+    Assignment automatically includes available hyperthreading siblings if
+    cores='all_logical'.
 
     Args:
         gpu_id: index of a GPU
+        nproc_per_node: number of processes per node
+        scope: scope for retrieving affinity from pynvml, 'node' or 'socket'
+        cores: 'all_logical' or 'single_logical'
     """
-    dev = Device(gpu_id)
-    affinity = dev.get_cpu_affinity()
-
-    # exclude unavailable cores
-    available_cores = os.sched_getaffinity(0)
-    affinity = list(set(affinity) & available_cores)
-    os.sched_setaffinity(0, affinity[:1])
+    grouped_affinities = get_grouped_affinities(nproc_per_node)
+    single_grouped_affinities = [group[:1] for group in grouped_affinities]
+    ungrouped_affinities = ungroup_affinities(
+        single_grouped_affinities, scope, cores, min_cores, max_cores
+    )
+    os.sched_setaffinity(0, ungrouped_affinities[gpu_id])
 
 
-def set_single_unique_affinity(gpu_id, nproc_per_node):
+def set_single_unique(
+    gpu_id, nproc_per_node, scope, cores, min_cores=1, max_cores=1
+):
     """
     The process is assigned with a single unique available physical CPU core
-    from the list of all CPU cores from the CPU socket connected to the GPU with
-    a given id.
+    from the list of all physical CPU cores recommended by pynvml for the GPU
+    with a given id.
+
+    Assignment automatically includes available hyperthreading siblings if
+    cores='all_logical'.
 
     Args:
         gpu_id: index of a GPU
+        nproc_per_node: number of processes per node
+        scope: scope for retrieving affinity from pynvml, 'node' or 'socket'
+        cores: 'all_logical' or 'single_logical'
     """
-    socket_affinities = get_socket_affinities(nproc_per_node)
-
-    siblings_list = get_thread_siblings_list()
-    siblings_dict = dict(siblings_list)
-
-    # remove siblings
-    for idx, socket_affinity in enumerate(socket_affinities):
-        socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values()))
+    grouped_affinities = get_grouped_affinities(nproc_per_node)
 
     affinities = []
-    assigned = []
+    assigned_groups = set()
 
-    for socket_affinity in socket_affinities:
-        for core in socket_affinity:
-            if core not in assigned:
-                affinities.append([core])
-                assigned.append(core)
+    for grouped_affinity in grouped_affinities:
+        for group in grouped_affinity:
+            if group not in assigned_groups:
+                affinities.append([group])
+                assigned_groups.add(group)
                 break
-    os.sched_setaffinity(0, affinities[gpu_id])
+
+    ungrouped_affinities = ungroup_affinities(
+        affinities, scope, cores, min_cores, max_cores
+    )
+
+    os.sched_setaffinity(0, ungrouped_affinities[gpu_id])
 
 
-def set_socket_unique_affinity(gpu_id, nproc_per_node, mode, balanced=True):
+def set_unique(
+    gpu_id,
+    nproc_per_node,
+    scope,
+    cores,
+    mode,
+    min_cores,
+    max_cores,
+    balanced=True,
+):
     """
-    The process is assigned with an unique subset of available physical CPU
-    cores from the CPU socket connected to a GPU with a given id.
-    Assignment automatically includes hyperthreading siblings (if siblings are
-    available).
+    The process is assigned with a unique subset of available physical CPU
+    cores from the list of all CPU cores recommended by pynvml for the GPU with
+    a given id.
+
+    Assignment automatically includes available hyperthreading siblings if
+    cores='all_logical'.
 
     Args:
         gpu_id: index of a GPU
-        nproc_per_node: total number of processes per node
-        mode: mode
-        balanced: assign an equal number of physical cores to each process
+        nproc_per_node: number of processes per node
+        scope: scope for retrieving affinity from pynvml, 'node' or 'socket'
+        cores: 'all_logical' or 'single_logical'
+        mode: 'unique_contiguous' or 'unique_interleaved'
+        balanced: assign an equal number of physical cores to each process
     """
-    socket_affinities = get_socket_affinities(nproc_per_node)
-
-    siblings_list = get_thread_siblings_list()
-    siblings_dict = dict(siblings_list)
+    grouped_affinities = get_grouped_affinities(nproc_per_node)
 
-    # remove hyperthreading siblings
-    for idx, socket_affinity in enumerate(socket_affinities):
-        socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values()))
+    grouped_affinities_to_device_ids = collections.defaultdict(list)
 
-    socket_affinities_to_device_ids = collections.defaultdict(list)
-
-    for idx, socket_affinity in enumerate(socket_affinities):
-        socket_affinities_to_device_ids[tuple(socket_affinity)].append(idx)
+    for idx, grouped_affinity in enumerate(grouped_affinities):
+        grouped_affinities_to_device_ids[tuple(grouped_affinity)].append(idx)
 
     # compute minimal number of physical cores per GPU across all GPUs and
     # sockets, code assigns this number of cores per GPU if balanced == True
     min_physical_cores_per_gpu = min(
-        [len(cores) // len(gpus) for cores, gpus in socket_affinities_to_device_ids.items()]
+        [
+            len(cores) // len(gpus)
+            for cores, gpus in grouped_affinities_to_device_ids.items()
+        ]
     )
 
-    for socket_affinity, device_ids in socket_affinities_to_device_ids.items():
+    grouped_unique_affinities = [None] * nproc_per_node
+
+    for (
+        grouped_affinity,
+        device_ids,
+    ) in grouped_affinities_to_device_ids.items():
         devices_per_group = len(device_ids)
         if balanced:
             cores_per_device = min_physical_cores_per_gpu
-            socket_affinity = socket_affinity[: devices_per_group * min_physical_cores_per_gpu]
+            grouped_affinity = grouped_affinity[
+                : devices_per_group * min_physical_cores_per_gpu
+            ]
         else:
-            cores_per_device = len(socket_affinity) // devices_per_group
-
-        for group_id, device_id in enumerate(device_ids):
-            if device_id == gpu_id:
-
-                # In theory there should be no difference in performance between
-                # 'interleaved' and 'continuous' pattern on Intel-based DGX-1,
-                # but 'continuous' should be better for DGX A100 because on AMD
-                # Rome 4 consecutive cores are sharing L3 cache.
-                # TODO: code doesn't attempt to automatically detect layout of
-                # L3 cache, also external environment may already exclude some
-                # cores, this code makes no attempt to detect it and to align
-                # mapping to multiples of 4.
-
-                if mode == "interleaved":
-                    affinity = list(socket_affinity[group_id::devices_per_group])
-                elif mode == "continuous":
-                    affinity = list(socket_affinity[group_id * cores_per_device: (group_id + 1) * cores_per_device])
-                else:
-                    raise RuntimeError("Unknown set_socket_unique_affinity mode")
-
-                # unconditionally reintroduce hyperthreading siblings, this step
-                # may result in a different numbers of logical cores assigned to
-                # each GPU even if balanced == True (if hyperthreading siblings
-                # aren't available for a subset of cores due to some external
-                # constraints, siblings are re-added unconditionally, in the
-                # worst case unavailable logical core will be ignored by
-                # os.sched_setaffinity().
-                affinity += [siblings_dict[aff] for aff in affinity if aff in siblings_dict]
-                os.sched_setaffinity(0, affinity)
-
-
-def set_affinity(gpu_id, nproc_per_node, mode="socket_unique_continuous", balanced=True):
+            cores_per_device = len(grouped_affinity) // devices_per_group
+
+        for subgroup_id, device_id in enumerate(device_ids):
+            # In theory there should be no difference in performance between
+            # 'interleaved' and 'contiguous' pattern on Intel-based DGX-1,
+            # but 'contiguous' should be better for DGX A100 because on AMD
+            # Rome 4 consecutive cores are sharing L3 cache.
+            # TODO: code doesn't attempt to automatically detect layout of
+            # L3 cache, also external environment may already exclude some
+            # cores, this code makes no attempt to detect it and to align
+            # mapping to multiples of 4.
+
+            if mode == 'unique_interleaved':
+                unique_grouped_affinity = list(
+                    grouped_affinity[subgroup_id::devices_per_group]
+                )
+            elif mode == 'unique_contiguous':
+                unique_grouped_affinity = list(
+                    grouped_affinity[
+                        subgroup_id
+                        * cores_per_device : (subgroup_id + 1)
+                        * cores_per_device
+                    ]
+                )
+            else:
+                raise RuntimeError('Unknown set_unique mode')
+
+            grouped_unique_affinities[device_id] = unique_grouped_affinity
+
+    ungrouped_affinities = ungroup_affinities(
+        grouped_unique_affinities, scope, cores, min_cores, max_cores
+    )
+    os.sched_setaffinity(0, ungrouped_affinities[gpu_id])
+
+
+def set_affinity(
+    gpu_id,
+    nproc_per_node,
+    *,
+    mode='unique_contiguous',
+    scope='node',
+    cores='all_logical',
+    balanced=True,
+    min_cores=1,
+    max_cores=None,
+):
     """
-    The process is assigned with a proper CPU affinity which matches hardware
-    architecture on a given platform. Usually it improves and stabilizes
-    performance of deep learning training workloads.
+    The process is assigned with a proper CPU affinity that matches CPU-GPU
+    hardware architecture on a given platform. Usually, setting proper affinity
+    improves and stabilizes the performance of deep learning training workloads.
 
-    This function assumes that the workload is running in multi-process
-    single-device mode (there are multiple training processes and each process
-    is running on a single GPU), which is typical for multi-GPU training
-    workloads using `torch.nn.parallel.DistributedDataParallel`.
+    This function assumes that the workload runs in multi-process single-device
+    mode (there are multiple training processes, and each process is running on
+    a single GPU). This is typical for multi-GPU data-parallel training
+    workloads (e.g., using `torch.nn.parallel.DistributedDataParallel`).
 
     Available affinity modes:
-    * 'socket' - the process is assigned with all available logical CPU cores
-    from the CPU socket connected to the GPU with a given id.
-    * 'single' - the process is assigned with the first available logical CPU
-    core from the list of all CPU cores from the CPU socket connected to the GPU
-    with a given id (multiple GPUs could be assigned with the same CPU core).
-    * 'single_unique' - the process is assigned with a single unique available
-    physical CPU core from the list of all CPU cores from the CPU socket
-    connected to the GPU with a given id.
-    * 'socket_unique_interleaved' - the process is assigned with an unique
-    subset of available physical CPU cores from the CPU socket connected to a
-    GPU with a given id, hyperthreading siblings are included automatically,
-    cores are assigned with interleaved indexing pattern
-    * 'socket_unique_continuous' - (the default) the process is assigned with an
-    unique subset of available physical CPU cores from the CPU socket connected
-    to a GPU with a given id, hyperthreading siblings are included
-    automatically, cores are assigned with continuous indexing pattern
-
-    'socket_unique_continuous' is the recommended mode for deep learning
+    * 'all' - the process is assigned with all available physical CPU cores
+    recommended by pynvml for the GPU with a given id.
+    * 'single' - the process is assigned with the first available
+    physical CPU core from the list of all physical CPU cores recommended by
+    pynvml for the GPU with a given id (multiple GPUs could be assigned with
+    the same CPU core).
+    * 'single_unique' - the process is assigned with a single unique
+    available physical CPU core from the list of all CPU cores recommended by
+    pynvml for the GPU with a given id.
+    * 'unique_interleaved' - the process is assigned with a unique subset of
+    available physical CPU cores from the list of all physical CPU cores
+    recommended by pynvml for the GPU with a given id, cores are assigned with
+    interleaved indexing pattern
+    * 'unique_contiguous' - (the default mode) the process is assigned with a
+    unique subset of available physical CPU cores from the list of all physical
+    CPU cores recommended by pynvml for the GPU with a given id, cores are
+    assigned with contiguous indexing pattern
+
+    Available "scope" modes:
+    * 'node' - sets the scope for pynvml affinity queries to NUMA node
+    * 'socket' - sets the scope for pynvml affinity queries to processor socket
+
+    Available "cores" modes:
+    * 'all_logical' - assigns the process with all logical cores associated with
+    a given corresponding physical core (i.e., automatically includes all
+    available hyperthreading siblings)
+    * 'single_logical' - assigns the process with only one logical core
+    associated with a given corresponding physical core (i.e., excludes
+    hyperthreading siblings)
+
+    'unique_contiguous' is the recommended mode for deep learning
     training workloads on NVIDIA DGX machines.
 
     Args:
-        gpu_id: integer index of a GPU
+        gpu_id: integer index of a GPU, value from 0 to 'nproc_per_node' - 1
         nproc_per_node: number of processes per node
         mode: affinity mode
+        scope: scope for retrieving affinity from pynvml, 'node' or 'socket'
+        cores: 'all_logical' or 'single_logical'
         balanced: assign an equal number of physical cores to each process,
-            affects only 'socket_unique_interleaved' and
-            'socket_unique_continuous' affinity modes
+            affects only 'unique_interleaved' and
+            'unique_contiguous' affinity modes
+        min_cores: (default=1) the intended minimum number of physical cores per
+            process, code raises RuntimeError if the number of available cores
+            is less than 'min_cores'
+        max_cores: (default=None) the intended maximum number of physical cores
+            per process, the list of assigned cores is trimmed to the first
+            'max_cores' cores if max_cores is not None
 
     Returns a set of logical CPU cores on which the process is eligible to run.
 
+    WARNING: On DGX A100, only half of the CPU cores have direct access to GPUs.
+    set_affinity with scope='node' restricts execution only to the CPU cores
+    directly connected to GPUs. On DGX A100, it will limit the code to half of
+    the CPU cores and half of CPU memory bandwidth (which may be fine for many
+    DL models). Use scope='socket' to use all available DGX A100 CPU cores.
+
+    WARNING: Intel's OpenMP implementation resets affinity on the first call to
+    an OpenMP function after a fork. It's recommended to run with env variable:
+    `KMP_AFFINITY=disabled` if the affinity set by gpu_affinity should be
+    preserved after a fork (e.g. in PyTorch DataLoader workers).
+
     Example:
 
     import argparse
@@ -299,27 +508,29 @@ def set_affinity(gpu_id, nproc_per_node, mode="socket_unique_continuous", balanc
 
     Launch the example with:
     python -m torch.distributed.launch --nproc_per_node <#GPUs> example.py
-
-
-    WARNING: On DGX A100 only a half of CPU cores have direct access to GPUs.
-    This function restricts execution only to the CPU cores directly connected
-    to GPUs, so on DGX A100 it will limit the code to half of CPU cores and half
-    of CPU memory bandwidth (which may be fine for many DL models).
     """
     pynvml.nvmlInit()
 
-    if mode == "socket":
-        set_socket_affinity(gpu_id)
-    elif mode == "single":
-        set_single_affinity(gpu_id)
-    elif mode == "single_unique":
-        set_single_unique_affinity(gpu_id, nproc_per_node)
-    elif mode == "socket_unique_interleaved":
-        set_socket_unique_affinity(gpu_id, nproc_per_node, "interleaved", balanced)
-    elif mode == "socket_unique_continuous":
-        set_socket_unique_affinity(gpu_id, nproc_per_node, "continuous", balanced)
+    if mode == 'all':
+        set_all(gpu_id, nproc_per_node, scope, cores, min_cores, max_cores)
+    elif mode == 'single':
+        set_single(gpu_id, nproc_per_node, scope, cores)
+    elif mode == 'single_unique':
+        set_single_unique(gpu_id, nproc_per_node, scope, cores)
+    elif mode == 'unique_interleaved' or mode == 'unique_contiguous':
+        set_unique(
+            gpu_id,
+            nproc_per_node,
+            scope,
+            cores,
+            mode,
+            min_cores,
+            max_cores,
+            balanced,
+        )
     else:
-        raise RuntimeError("Unknown affinity mode")
+        raise RuntimeError('Unknown affinity mode')
 
     affinity = os.sched_getaffinity(0)
     return affinity
+

+ 5 - 3
DGLPyTorch/DrugDiscovery/SE3Transformer/se3_transformer/runtime/inference.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
@@ -18,7 +18,7 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
 #
-# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES
+# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES
 # SPDX-License-Identifier: MIT
 
 from typing import List
@@ -109,10 +109,12 @@ if __name__ == '__main__':
 
     if is_distributed:
         nproc_per_node = torch.cuda.device_count()
-        affinity = gpu_affinity.set_affinity(local_rank, nproc_per_node)
+        affinity = gpu_affinity.set_affinity(local_rank, nproc_per_node, scope='socket')
         model = DistributedDataParallel(model, device_ids=[local_rank], output_device=local_rank)
         model._set_static_graph()
 
+    torch.set_float32_matmul_precision('high')
+
     test_dataloader = datamodule.test_dataloader() if not args.benchmark else datamodule.train_dataloader()
     evaluate(model,
              test_dataloader,

+ 2 - 20
DGLPyTorch/DrugDiscovery/SE3Transformer/se3_transformer/runtime/loggers.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
@@ -18,7 +18,7 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
 #
-# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES
+# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES
 # SPDX-License-Identifier: MIT
 
 import pathlib
@@ -40,11 +40,6 @@ class Logger(ABC):
     def log_hyperparams(self, params):
         pass
 
-    @rank_zero_only
-    @abstractmethod
-    def log_metadata(self, metric, metadata):
-        pass
-
     @rank_zero_only
     @abstractmethod
     def log_metrics(self, metrics, step=None):
@@ -86,11 +81,6 @@ class LoggerCollection(Logger):
         for logger in self.loggers:
             logger.log_hyperparams(params)
 
-    @rank_zero_only
-    def log_metadata(self, metric, metadata):
-        for logger in self.loggers:
-            logger.log_metadata(metric, metadata)
-
 
 class DLLogger(Logger):
     def __init__(self, save_dir: pathlib.Path, filename: str):
@@ -105,10 +95,6 @@ class DLLogger(Logger):
         params = self._sanitize_params(params)
         dllogger.log(step="PARAMETER", data=params)
 
-    @rank_zero_only
-    def log_metadata(self, metric, metadata):
-        dllogger.metadata(metric, metadata)
-
     @rank_zero_only
     def log_metrics(self, metrics, step=None):
         if step is None:
@@ -140,10 +126,6 @@ class WandbLogger(Logger):
         params = self._sanitize_params(params)
         self.experiment.config.update(params, allow_val_change=True)
 
-    @rank_zero_only
-    def log_metadata(self, metric, metadata):
-        pass
-
     @rank_zero_only
     def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None:
         if step is not None:

+ 2 - 2
DGLPyTorch/DrugDiscovery/SE3Transformer/se3_transformer/runtime/metrics.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
@@ -18,7 +18,7 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
 #
-# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES
+# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES
 # SPDX-License-Identifier: MIT
 
 from abc import ABC, abstractmethod

+ 10 - 10
DGLPyTorch/DrugDiscovery/SE3Transformer/se3_transformer/runtime/training.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
@@ -18,7 +18,7 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
 #
-# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES
+# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES
 # SPDX-License-Identifier: MIT
 
 import logging
@@ -82,7 +82,7 @@ def load_state(model: nn.Module, optimizer: Optimizer, path: pathlib.Path, callb
 
 
 def train_epoch(model, train_dataloader, loss_fn, epoch_idx, grad_scaler, optimizer, local_rank, callbacks, args):
-    losses = []
+    loss_acc = torch.zeros((1,), device='cuda')
     for i, batch in tqdm(enumerate(train_dataloader), total=len(train_dataloader), unit='batch',
                          desc=f'Epoch {epoch_idx}', disable=(args.silent or local_rank != 0)):
         *inputs, target = to_cuda(batch)
@@ -94,6 +94,7 @@ def train_epoch(model, train_dataloader, loss_fn, epoch_idx, grad_scaler, optimi
             pred = model(*inputs)
             loss = loss_fn(pred, target) / args.accumulate_grad_batches
 
+        loss_acc += loss.detach()
         grad_scaler.scale(loss).backward()
 
         # gradient accumulation
@@ -106,9 +107,7 @@ def train_epoch(model, train_dataloader, loss_fn, epoch_idx, grad_scaler, optimi
             grad_scaler.update()
             model.zero_grad(set_to_none=True)
 
-        losses.append(loss.item())
-
-    return np.mean(losses)
+    return loss_acc / (i + 1)
 
 
 def train(model: nn.Module,
@@ -142,7 +141,7 @@ def train(model: nn.Module,
     epoch_start = load_state(model, optimizer, args.load_ckpt_path, callbacks) if args.load_ckpt_path else 0
 
     for callback in callbacks:
-        callback.on_fit_start(optimizer, args)
+        callback.on_fit_start(optimizer, args, epoch_start)
 
     for epoch_idx in range(epoch_start, args.epochs):
         if isinstance(train_dataloader.sampler, DistributedSampler):
@@ -151,10 +150,10 @@ def train(model: nn.Module,
         loss = train_epoch(model, train_dataloader, loss_fn, epoch_idx, grad_scaler, optimizer, local_rank, callbacks,
                            args)
         if dist.is_initialized():
-            loss = torch.tensor(loss, dtype=torch.float, device=device)
             torch.distributed.all_reduce(loss)
-            loss = (loss / world_size).item()
+            loss /= world_size
 
+        loss = loss.item()
         logging.info(f'Train loss: {loss}')
         logger.log_metrics({'train loss': loss}, epoch_idx)
 
@@ -228,8 +227,9 @@ if __name__ == '__main__':
                      QM9LRSchedulerCallback(logger, epochs=args.epochs)]
 
     if is_distributed:
-        gpu_affinity.set_affinity(gpu_id=get_local_rank(), nproc_per_node=torch.cuda.device_count())
+        gpu_affinity.set_affinity(gpu_id=get_local_rank(), nproc_per_node=torch.cuda.device_count(), scope='socket')
 
+    torch.set_float32_matmul_precision('high')
     print_parameters_count(model)
     logger.log_hyperparams(vars(args))
     increase_l2_fetch_granularity()

+ 3 - 3
DGLPyTorch/DrugDiscovery/SE3Transformer/se3_transformer/runtime/utils.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
@@ -18,7 +18,7 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
 #
-# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES
+# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES
 # SPDX-License-Identifier: MIT
 
 import argparse
@@ -76,7 +76,7 @@ def to_cuda(x):
         return {k: to_cuda(v) for k, v in x.items()}
     else:
         # DGLGraph or other objects
-        return x.to(device=torch.cuda.current_device())
+        return x.to(device=torch.cuda.current_device(), non_blocking=True)
 
 
 def get_local_rank() -> int:

+ 1 - 1
DGLPyTorch/DrugDiscovery/SE3Transformer/setup.py

@@ -4,7 +4,7 @@ setup(
     name='se3-transformer',
     packages=find_packages(exclude=['tests']),
     include_package_data=True,
-    version='1.1.0',
+    version='1.2.0',
     description='PyTorch + DGL implementation of SE(3)-Transformers',
     author='Alexandre Milesi',
     author_email='[email protected]',

+ 2 - 2
DGLPyTorch/DrugDiscovery/SE3Transformer/tests/test_equivariance.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
@@ -18,7 +18,7 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
 #
-# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES
+# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES
 # SPDX-License-Identifier: MIT
 
 import torch

+ 3 - 3
DGLPyTorch/DrugDiscovery/SE3Transformer/tests/utils.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
@@ -18,7 +18,7 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
 #
-# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES
+# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES
 # SPDX-License-Identifier: MIT
 
 import dgl
@@ -26,7 +26,7 @@ import torch
 
 
 def get_random_graph(N, num_edges_factor=18):
-    graph = dgl.transform.remove_self_loop(dgl.rand_graph(N, N * num_edges_factor))
+    graph = dgl.remove_self_loop(dgl.rand_graph(N, N * num_edges_factor))
     return graph