4 ani în urmă · 228277d643
--- a/PyTorch/Detection/Efficientdet/Dockerfile
+++ b/PyTorch/Detection/Efficientdet/Dockerfile
@@ -12,7 +12,7 @@
 
				 # See the License for the specific language governing permissions and
			
 
				 # limitations under the License.
			
 
				 
			
 
				-ARG FROM_IMAGE_NAME=gitlab-master.nvidia.com:5005/dl/dgx/pytorch:master-py3-devel
			
 
				+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.06-py3
			
 
				 
			
 
				 FROM ${FROM_IMAGE_NAME}
			
 
				 
			
--- a/PyTorch/Detection/Efficientdet/effdet/csrc/focal_loss/focal_loss_cuda_kernel.cu
+++ b/PyTorch/Detection/Efficientdet/effdet/csrc/focal_loss/focal_loss_cuda_kernel.cu
@@ -15,6 +15,7 @@
 
				 #include <ATen/ATen.h>
			
 
				 #include <ATen/AccumulateType.h>
			
 
				 #include <ATen/cuda/CUDAContext.h>
			
 
				+#include <THC/THC.h>
			
 
				 
			
 
				 thread_local int multiProcessorCount=0;
			
 
				 
			
@@ -255,7 +256,7 @@ std::vector<at::Tensor> focal_loss_forward_cuda(
 
				         });
			
 
				   }
			
 
				 
			
 
				-  C10_CUDA_CHECK(cudaGetLastError());
			
 
				+  THCudaCheck(cudaGetLastError());
			
 
				   return {loss, partial_grad};
			
 
				 }
			
 
				 
			
@@ -280,6 +281,6 @@ at::Tensor focal_loss_backward_cuda(const at::Tensor &grad_output,
 
				                                          partial_grad.numel());
			
 
				       });
			
 
				 
			
 
				-  C10_CUDA_CHECK(cudaGetLastError());
			
 
				+  THCudaCheck(cudaGetLastError());
			
 
				   return partial_grad;
			
 
				 }
			
--- a/PyTorch/Detection/Efficientdet/effdet/csrc/nms/cuda/nms.cu
+++ b/PyTorch/Detection/Efficientdet/effdet/csrc/nms/cuda/nms.cu
@@ -14,8 +14,9 @@
 
				 
			
 
				 #include <ATen/ATen.h>
			
 
				 #include <ATen/cuda/CUDAContext.h>
			
 
				-#include <ATen/ceil_div.h>
			
 
				-#include <c10/cuda/CUDACachingAllocator.h>
			
 
				+
			
 
				+#include <THC/THC.h>
			
 
				+#include <THC/THCDeviceUtils.cuh>
			
 
				 
			
 
				 #include <vector>
			
 
				 #include <iostream>
			
@@ -73,7 +74,7 @@ __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
 
				         t |= 1ULL << i;
			
 
				       }
			
 
				     }
			
 
				-    const int col_blocks = at::ceil_div(n_boxes, threadsPerBlock);
			
 
				+    const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock);
			
 
				     dev_mask[cur_box_idx * col_blocks + col_start] = t;
			
 
				   }
			
 
				 }
			
@@ -88,20 +89,20 @@ at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
 
				 
			
 
				   int boxes_num = boxes.size(0);
			
 
				 
			
 
				-  const int col_blocks = at::ceil_div(boxes_num, threadsPerBlock);
			
 
				+  const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock);
			
 
				 
			
 
				   scalar_t* boxes_dev = boxes_sorted.data_ptr<scalar_t>();
			
 
				 
			
 
				-  at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState
			
 
				+  THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState
			
 
				 
			
 
				   unsigned long long* mask_dev = NULL;
			
 
				   //THCudaCheck(THCudaMalloc(state, (void**) &mask_dev,
			
 
				   //                      boxes_num * col_blocks * sizeof(unsigned long long)));
			
 
				 
			
 
				-  mask_dev = (unsigned long long*) c10::cuda::CUDACachingAllocator::raw_alloc(boxes_num * col_blocks * sizeof(unsigned long long));
			
 
				+  mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long));
			
 
				 
			
 
				-  dim3 blocks(at::ceil_div(boxes_num, threadsPerBlock),
			
 
				-              at::ceil_div(boxes_num, threadsPerBlock));
			
 
				+  dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock),
			
 
				+              THCCeilDiv(boxes_num, threadsPerBlock));
			
 
				   dim3 threads(threadsPerBlock);
			
 
				   nms_kernel<<<blocks, threads>>>(boxes_num,
			
 
				                                   nms_overlap_thresh,
			
@@ -109,7 +110,7 @@ at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
 
				                                   mask_dev);
			
 
				 
			
 
				   std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
			
 
				-  C10_CUDA_CHECK(cudaMemcpy(&mask_host[0],
			
 
				+  THCudaCheck(cudaMemcpy(&mask_host[0],
			
 
				                         mask_dev,
			
 
				                         sizeof(unsigned long long) * boxes_num * col_blocks,
			
 
				                         cudaMemcpyDeviceToHost));
			
@@ -134,7 +135,7 @@ at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
 
				     }
			
 
				   }
			
 
				 
			
 
				-  c10::cuda::CUDACachingAllocator::raw_delete(mask_dev);
			
 
				+  THCudaFree(state, mask_dev);
			
 
				   // TODO improve this part
			
 
				   return std::get<0>(order_t.index({
			
 
				                        keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to(