2 #include <thrust/copy.h>
3 #include <thrust/device_vector.h>
4 #include <thrust/execution_policy.h>
5 #include <thrust/gather.h>
6 #include <thrust/sort.h>
17 #define checkCudaErrors(err) CHECK_EQ(err, CUDA_SUCCESS)
22 const uint64_t entry_count,
25 const int device_id) {
26 thrust::device_ptr<T> key_ptr(val_buff);
27 thrust::device_ptr<int32_t> idx_ptr(idx_buff);
28 thrust::sequence(idx_ptr, idx_ptr + entry_count);
31 thrust::sort_by_key(thrust::cuda::par(alloc).on(qe_cuda_stream),
33 key_ptr + entry_count,
35 thrust::greater<T>());
37 thrust::sort_by_key(thrust::cuda::par(alloc).on(qe_cuda_stream),
39 key_ptr + entry_count,
48 const uint64_t entry_count,
50 const int device_id) {
51 thrust::device_ptr<T> key_ptr(val_buff);
52 thrust::device_ptr<int32_t> idx_ptr(idx_buff);
53 const size_t buf_size = entry_count *
sizeof(
T);
54 T* raw_ptr =
reinterpret_cast<T*
>(alloc.
allocate(buf_size));
55 thrust::device_ptr<T> tmp_ptr(raw_ptr);
57 thrust::copy(thrust::cuda::par(alloc).on(qe_cuda_stream),
59 key_ptr + entry_count,
62 thrust::gather(thrust::cuda::par(alloc).on(qe_cuda_stream),
64 idx_ptr + entry_count,
68 alloc.
deallocate(reinterpret_cast<int8_t*>(raw_ptr), buf_size);
74 const uint64_t entry_count,
76 thrust::sequence(idx_buff, idx_buff + entry_count);
78 thrust::sort_by_key(val_buff, val_buff + entry_count, idx_buff, thrust::greater<T>());
80 thrust::sort_by_key(val_buff, val_buff + entry_count, idx_buff);
87 const uint64_t entry_count,
89 thrust::copy(val_buff, val_buff + entry_count, tmp_buff);
90 thrust::gather(idx_buff, idx_buff + entry_count, tmp_buff, val_buff);
96 const uint64_t entry_count,
98 const uint32_t chosen_bytes,
100 const int device_id) {
102 switch (chosen_bytes) {
128 sort_on_gpu(val_buff, idx_buff, entry_count, desc, alloc, device_id);
139 const uint64_t entry_count,
141 const uint32_t chosen_bytes) {
143 switch (chosen_bytes) {
145 sort_on_cpu(reinterpret_cast<int8_t*>(val_buff), idx_buff, entry_count, desc);
148 sort_on_cpu(reinterpret_cast<int16_t*>(val_buff), idx_buff, entry_count, desc);
151 sort_on_cpu(reinterpret_cast<int32_t*>(val_buff), idx_buff, entry_count, desc);
154 sort_on_cpu(val_buff, idx_buff, entry_count, desc);
165 const uint64_t entry_count,
166 const uint32_t chosen_bytes,
168 const int device_id) {
170 switch (chosen_bytes) {
173 reinterpret_cast<int8_t*>(val_buff), idx_buff, entry_count, alloc, device_id);
177 reinterpret_cast<int16_t*>(val_buff), idx_buff, entry_count, alloc, device_id);
181 reinterpret_cast<int32_t*>(val_buff), idx_buff, entry_count, alloc, device_id);
195 const uint64_t entry_count,
197 const uint32_t chosen_bytes) {
199 switch (chosen_bytes) {
204 reinterpret_cast<int8_t*>(tmp_buff));
210 reinterpret_cast<int16_t*>(tmp_buff));
216 reinterpret_cast<int32_t*>(tmp_buff));
void sort_on_gpu(int64_t *val_buff, int32_t *idx_buff, const uint64_t entry_count, const bool desc, const uint32_t chosen_bytes, ThrustAllocator &alloc, const int device_id)
int8_t * allocate(std::ptrdiff_t num_bytes)
void checkCudaErrors(CUresult err)
void sort_on_cpu(int64_t *val_buff, int32_t *idx_buff, const uint64_t entry_count, const bool desc, const uint32_t chosen_bytes)
void deallocate(int8_t *ptr, size_t num_bytes)
DEVICE auto copy(ARGS &&...args)
CUstream getQueryEngineCudaStreamForDevice(int device_num)
void apply_permutation_on_gpu(int64_t *val_buff, int32_t *idx_buff, const uint64_t entry_count, const uint32_t chosen_bytes, ThrustAllocator &alloc, const int device_id)
void apply_permutation_on_cpu(int64_t *val_buff, int32_t *idx_buff, const uint64_t entry_count, int64_t *tmp_buff, const uint32_t chosen_bytes)