OmniSciDB  a5dc49c757
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
InPlaceSortImpl.cu
Go to the documentation of this file.
1 #ifdef HAVE_CUDA
2 #include <thrust/copy.h>
3 #include <thrust/device_vector.h>
4 #include <thrust/execution_policy.h>
5 #include <thrust/gather.h>
6 #include <thrust/sort.h>
7 #endif
8 
10 #include "InPlaceSortImpl.h"
11 
12 #ifdef HAVE_CUDA
13 #include <cuda.h>
15 
16 #include "Logger/Logger.h"
17 #define checkCudaErrors(err) CHECK_EQ(err, CUDA_SUCCESS)
18 
19 template <typename T>
20 void sort_on_gpu(T* val_buff,
21  int32_t* idx_buff,
22  const uint64_t entry_count,
23  const bool desc,
24  ThrustAllocator& alloc,
25  const int device_id) {
26  thrust::device_ptr<T> key_ptr(val_buff);
27  thrust::device_ptr<int32_t> idx_ptr(idx_buff);
28  thrust::sequence(idx_ptr, idx_ptr + entry_count);
29  auto qe_cuda_stream = getQueryEngineCudaStreamForDevice(device_id);
30  if (desc) {
31  thrust::sort_by_key(thrust::cuda::par(alloc).on(qe_cuda_stream),
32  key_ptr,
33  key_ptr + entry_count,
34  idx_ptr,
35  thrust::greater<T>());
36  } else {
37  thrust::sort_by_key(thrust::cuda::par(alloc).on(qe_cuda_stream),
38  key_ptr,
39  key_ptr + entry_count,
40  idx_ptr);
41  }
42  checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));
43 }
44 
45 template <typename T>
46 void apply_permutation_on_gpu(T* val_buff,
47  int32_t* idx_buff,
48  const uint64_t entry_count,
49  ThrustAllocator& alloc,
50  const int device_id) {
51  thrust::device_ptr<T> key_ptr(val_buff);
52  thrust::device_ptr<int32_t> idx_ptr(idx_buff);
53  const size_t buf_size = entry_count * sizeof(T);
54  T* raw_ptr = reinterpret_cast<T*>(alloc.allocate(buf_size));
55  thrust::device_ptr<T> tmp_ptr(raw_ptr);
56  auto qe_cuda_stream = getQueryEngineCudaStreamForDevice(device_id);
57  thrust::copy(thrust::cuda::par(alloc).on(qe_cuda_stream),
58  key_ptr,
59  key_ptr + entry_count,
60  tmp_ptr);
61  checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));
62  thrust::gather(thrust::cuda::par(alloc).on(qe_cuda_stream),
63  idx_ptr,
64  idx_ptr + entry_count,
65  tmp_ptr,
66  key_ptr);
67  checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));
68  alloc.deallocate(reinterpret_cast<int8_t*>(raw_ptr), buf_size);
69 }
70 
71 template <typename T>
72 void sort_on_cpu(T* val_buff,
73  int32_t* idx_buff,
74  const uint64_t entry_count,
75  const bool desc) {
76  thrust::sequence(idx_buff, idx_buff + entry_count);
77  if (desc) {
78  thrust::sort_by_key(val_buff, val_buff + entry_count, idx_buff, thrust::greater<T>());
79  } else {
80  thrust::sort_by_key(val_buff, val_buff + entry_count, idx_buff);
81  }
82 }
83 
84 template <typename T>
85 void apply_permutation_on_cpu(T* val_buff,
86  int32_t* idx_buff,
87  const uint64_t entry_count,
88  T* tmp_buff) {
89  thrust::copy(val_buff, val_buff + entry_count, tmp_buff);
90  thrust::gather(idx_buff, idx_buff + entry_count, tmp_buff, val_buff);
91 }
92 #endif
93 
94 void sort_on_gpu(int64_t* val_buff,
95  int32_t* idx_buff,
96  const uint64_t entry_count,
97  const bool desc,
98  const uint32_t chosen_bytes,
99  ThrustAllocator& alloc,
100  const int device_id) {
101 #ifdef HAVE_CUDA
102  switch (chosen_bytes) {
103  case 1:
104  sort_on_gpu(reinterpret_cast<int8_t*>(val_buff),
105  idx_buff,
106  entry_count,
107  desc,
108  alloc,
109  device_id);
110  break;
111  case 2:
112  sort_on_gpu(reinterpret_cast<int16_t*>(val_buff),
113  idx_buff,
114  entry_count,
115  desc,
116  alloc,
117  device_id);
118  break;
119  case 4:
120  sort_on_gpu(reinterpret_cast<int32_t*>(val_buff),
121  idx_buff,
122  entry_count,
123  desc,
124  alloc,
125  device_id);
126  break;
127  case 8:
128  sort_on_gpu(val_buff, idx_buff, entry_count, desc, alloc, device_id);
129  break;
130  default:
131  // FIXME(miyu): CUDA linker doesn't accept assertion on GPU yet right now.
132  break;
133  }
134 #endif
135 }
136 
137 void sort_on_cpu(int64_t* val_buff,
138  int32_t* idx_buff,
139  const uint64_t entry_count,
140  const bool desc,
141  const uint32_t chosen_bytes) {
142 #ifdef HAVE_CUDA
143  switch (chosen_bytes) {
144  case 1:
145  sort_on_cpu(reinterpret_cast<int8_t*>(val_buff), idx_buff, entry_count, desc);
146  break;
147  case 2:
148  sort_on_cpu(reinterpret_cast<int16_t*>(val_buff), idx_buff, entry_count, desc);
149  break;
150  case 4:
151  sort_on_cpu(reinterpret_cast<int32_t*>(val_buff), idx_buff, entry_count, desc);
152  break;
153  case 8:
154  sort_on_cpu(val_buff, idx_buff, entry_count, desc);
155  break;
156  default:
157  // FIXME(miyu): CUDA linker doesn't accept assertion on GPU yet right now.
158  break;
159  }
160 #endif
161 }
162 
163 void apply_permutation_on_gpu(int64_t* val_buff,
164  int32_t* idx_buff,
165  const uint64_t entry_count,
166  const uint32_t chosen_bytes,
167  ThrustAllocator& alloc,
168  const int device_id) {
169 #ifdef HAVE_CUDA
170  switch (chosen_bytes) {
171  case 1:
173  reinterpret_cast<int8_t*>(val_buff), idx_buff, entry_count, alloc, device_id);
174  break;
175  case 2:
177  reinterpret_cast<int16_t*>(val_buff), idx_buff, entry_count, alloc, device_id);
178  break;
179  case 4:
181  reinterpret_cast<int32_t*>(val_buff), idx_buff, entry_count, alloc, device_id);
182  break;
183  case 8:
184  apply_permutation_on_gpu(val_buff, idx_buff, entry_count, alloc, device_id);
185  break;
186  default:
187  // FIXME(miyu): CUDA linker doesn't accept assertion on GPU yet right now.
188  break;
189  }
190 #endif
191 }
192 
193 void apply_permutation_on_cpu(int64_t* val_buff,
194  int32_t* idx_buff,
195  const uint64_t entry_count,
196  int64_t* tmp_buff,
197  const uint32_t chosen_bytes) {
198 #ifdef HAVE_CUDA
199  switch (chosen_bytes) {
200  case 1:
201  apply_permutation_on_cpu(reinterpret_cast<int8_t*>(val_buff),
202  idx_buff,
203  entry_count,
204  reinterpret_cast<int8_t*>(tmp_buff));
205  break;
206  case 2:
207  apply_permutation_on_cpu(reinterpret_cast<int16_t*>(val_buff),
208  idx_buff,
209  entry_count,
210  reinterpret_cast<int16_t*>(tmp_buff));
211  break;
212  case 4:
213  apply_permutation_on_cpu(reinterpret_cast<int32_t*>(val_buff),
214  idx_buff,
215  entry_count,
216  reinterpret_cast<int32_t*>(tmp_buff));
217  break;
218  case 8:
219  apply_permutation_on_cpu(val_buff, idx_buff, entry_count, tmp_buff);
220  break;
221  default:
222  // FIXME(miyu): CUDA linker doesn't accept assertion on GPU yet right now.
223  break;
224  }
225 #endif
226 }
void sort_on_gpu(int64_t *val_buff, int32_t *idx_buff, const uint64_t entry_count, const bool desc, const uint32_t chosen_bytes, ThrustAllocator &alloc, const int device_id)
int8_t * allocate(std::ptrdiff_t num_bytes)
void * CUstream
Definition: nocuda.h:23
void checkCudaErrors(CUresult err)
Definition: sample.cpp:38
void sort_on_cpu(int64_t *val_buff, int32_t *idx_buff, const uint64_t entry_count, const bool desc, const uint32_t chosen_bytes)
void deallocate(int8_t *ptr, size_t num_bytes)
DEVICE auto copy(ARGS &&...args)
Definition: gpu_enabled.h:51
CUstream getQueryEngineCudaStreamForDevice(int device_num)
Definition: QueryEngine.cpp:7
void apply_permutation_on_gpu(int64_t *val_buff, int32_t *idx_buff, const uint64_t entry_count, const uint32_t chosen_bytes, ThrustAllocator &alloc, const int device_id)
void apply_permutation_on_cpu(int64_t *val_buff, int32_t *idx_buff, const uint64_t entry_count, int64_t *tmp_buff, const uint32_t chosen_bytes)