|
|
@@ -14,8 +14,9 @@
|
|
|
|
|
|
#include <ATen/ATen.h>
|
|
|
#include <ATen/cuda/CUDAContext.h>
|
|
|
-#include <ATen/ceil_div.h>
|
|
|
-#include <c10/cuda/CUDACachingAllocator.h>
|
|
|
+
|
|
|
+#include <THC/THC.h>
|
|
|
+#include <THC/THCDeviceUtils.cuh>
|
|
|
|
|
|
#include <vector>
|
|
|
#include <iostream>
|
|
|
@@ -73,7 +74,7 @@ __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
|
|
|
t |= 1ULL << i;
|
|
|
}
|
|
|
}
|
|
|
- const int col_blocks = at::ceil_div(n_boxes, threadsPerBlock);
|
|
|
+ const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock);
|
|
|
dev_mask[cur_box_idx * col_blocks + col_start] = t;
|
|
|
}
|
|
|
}
|
|
|
@@ -88,20 +89,20 @@ at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
|
|
|
|
|
|
int boxes_num = boxes.size(0);
|
|
|
|
|
|
- const int col_blocks = at::ceil_div(boxes_num, threadsPerBlock);
|
|
|
+ const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock);
|
|
|
|
|
|
scalar_t* boxes_dev = boxes_sorted.data_ptr<scalar_t>();
|
|
|
|
|
|
- at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState
|
|
|
+ THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState
|
|
|
|
|
|
unsigned long long* mask_dev = NULL;
|
|
|
//THCudaCheck(THCudaMalloc(state, (void**) &mask_dev,
|
|
|
// boxes_num * col_blocks * sizeof(unsigned long long)));
|
|
|
|
|
|
- mask_dev = (unsigned long long*) c10::cuda::CUDACachingAllocator::raw_alloc(boxes_num * col_blocks * sizeof(unsigned long long));
|
|
|
+ mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long));
|
|
|
|
|
|
- dim3 blocks(at::ceil_div(boxes_num, threadsPerBlock),
|
|
|
- at::ceil_div(boxes_num, threadsPerBlock));
|
|
|
+ dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock),
|
|
|
+ THCCeilDiv(boxes_num, threadsPerBlock));
|
|
|
dim3 threads(threadsPerBlock);
|
|
|
nms_kernel<<<blocks, threads>>>(boxes_num,
|
|
|
nms_overlap_thresh,
|
|
|
@@ -109,7 +110,7 @@ at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
|
|
|
mask_dev);
|
|
|
|
|
|
std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
|
|
|
- C10_CUDA_CHECK(cudaMemcpy(&mask_host[0],
|
|
|
+ THCudaCheck(cudaMemcpy(&mask_host[0],
|
|
|
mask_dev,
|
|
|
sizeof(unsigned long long) * boxes_num * col_blocks,
|
|
|
cudaMemcpyDeviceToHost));
|
|
|
@@ -134,7 +135,7 @@ at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- c10::cuda::CUDACachingAllocator::raw_delete(mask_dev);
|
|
|
+ THCudaFree(state, mask_dev);
|
|
|
// TODO improve this part
|
|
|
return std::get<0>(order_t.index({
|
|
|
keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to(
|