Răsfoiți Sursa

[EffDet/PyT] Base container changed back to 21.06

Mingyuan Ma 4 ani în urmă
părinte
comite
228277d643

+ 1 - 1
PyTorch/Detection/Efficientdet/Dockerfile

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-ARG FROM_IMAGE_NAME=gitlab-master.nvidia.com:5005/dl/dgx/pytorch:master-py3-devel
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.06-py3
 
 FROM ${FROM_IMAGE_NAME}
 

+ 3 - 2
PyTorch/Detection/Efficientdet/effdet/csrc/focal_loss/focal_loss_cuda_kernel.cu

@@ -15,6 +15,7 @@
 #include <ATen/ATen.h>
 #include <ATen/AccumulateType.h>
 #include <ATen/cuda/CUDAContext.h>
+#include <THC/THC.h>
 
 thread_local int multiProcessorCount=0;
 
@@ -255,7 +256,7 @@ std::vector<at::Tensor> focal_loss_forward_cuda(
         });
   }
 
-  C10_CUDA_CHECK(cudaGetLastError());
+  THCudaCheck(cudaGetLastError());
   return {loss, partial_grad};
 }
 
@@ -280,6 +281,6 @@ at::Tensor focal_loss_backward_cuda(const at::Tensor &grad_output,
                                          partial_grad.numel());
       });
 
-  C10_CUDA_CHECK(cudaGetLastError());
+  THCudaCheck(cudaGetLastError());
   return partial_grad;
 }

+ 11 - 10
PyTorch/Detection/Efficientdet/effdet/csrc/nms/cuda/nms.cu

@@ -14,8 +14,9 @@
 
 #include <ATen/ATen.h>
 #include <ATen/cuda/CUDAContext.h>
-#include <ATen/ceil_div.h>
-#include <c10/cuda/CUDACachingAllocator.h>
+
+#include <THC/THC.h>
+#include <THC/THCDeviceUtils.cuh>
 
 #include <vector>
 #include <iostream>
@@ -73,7 +74,7 @@ __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
         t |= 1ULL << i;
       }
     }
-    const int col_blocks = at::ceil_div(n_boxes, threadsPerBlock);
+    const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock);
     dev_mask[cur_box_idx * col_blocks + col_start] = t;
   }
 }
@@ -88,20 +89,20 @@ at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
 
   int boxes_num = boxes.size(0);
 
-  const int col_blocks = at::ceil_div(boxes_num, threadsPerBlock);
+  const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock);
 
   scalar_t* boxes_dev = boxes_sorted.data_ptr<scalar_t>();
 
-  at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState
+  THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState
 
   unsigned long long* mask_dev = NULL;
   //THCudaCheck(THCudaMalloc(state, (void**) &mask_dev,
   //                      boxes_num * col_blocks * sizeof(unsigned long long)));
 
-  mask_dev = (unsigned long long*) c10::cuda::CUDACachingAllocator::raw_alloc(boxes_num * col_blocks * sizeof(unsigned long long));
+  mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long));
 
-  dim3 blocks(at::ceil_div(boxes_num, threadsPerBlock),
-              at::ceil_div(boxes_num, threadsPerBlock));
+  dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock),
+              THCCeilDiv(boxes_num, threadsPerBlock));
   dim3 threads(threadsPerBlock);
   nms_kernel<<<blocks, threads>>>(boxes_num,
                                   nms_overlap_thresh,
@@ -109,7 +110,7 @@ at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
                                   mask_dev);
 
   std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
-  C10_CUDA_CHECK(cudaMemcpy(&mask_host[0],
+  THCudaCheck(cudaMemcpy(&mask_host[0],
                         mask_dev,
                         sizeof(unsigned long long) * boxes_num * col_blocks,
                         cudaMemcpyDeviceToHost));
@@ -134,7 +135,7 @@ at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
     }
   }
 
-  c10::cuda::CUDACachingAllocator::raw_delete(mask_dev);
+  THCudaFree(state, mask_dev);
   // TODO improve this part
   return std::get<0>(order_t.index({
                        keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to(