OmniSciDB  a5dc49c757
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
anonymous_namespace{ResultSetSortImpl.cu} Namespace Reference

Functions

template<class K , class V , class I >
std::vector< uint32_t > do_radix_sort (const ExecutorDeviceType device_type, const int device_id, ThrustAllocator &thrust_allocator, const int8_t *groupby_buffer, V dev_oe_col_buffer_begin, V dev_oe_col_buffer_end, I dev_idx_buff_begin, const size_t dev_idx_buff_size, const PodOrderEntry &oe, const GroupByBufferLayoutInfo &layout, const size_t top_n)
 
void add_nulls (std::vector< uint32_t > &idx_buff, const std::vector< uint32_t > &null_idx_buff, const PodOrderEntry &oe)
 
template<typename T >
thrust::device_ptr< T > get_device_copy_ptr (const thrust::host_vector< T > &host_vec, ThrustAllocator &thrust_allocator)
 
template<class K >
std::vector< uint32_t > baseline_sort_fp (const ExecutorDeviceType device_type, const int device_id, Data_Namespace::DataMgr *data_mgr, const int8_t *groupby_buffer, const thrust::host_vector< int64_t > &oe_col_buffer, const PodOrderEntry &oe, const GroupByBufferLayoutInfo &layout, const size_t top_n, const size_t start, const size_t step)
 
template<class K >
std::vector< uint32_t > baseline_sort_int (const ExecutorDeviceType device_type, const int device_id, Data_Namespace::DataMgr *data_mgr, const int8_t *groupby_buffer, const thrust::host_vector< int64_t > &oe_col_buffer, const PodOrderEntry &oe, const GroupByBufferLayoutInfo &layout, const size_t top_n, const size_t start, const size_t step)
 
template<class K >
thrust::host_vector< int64_t > collect_order_entry_column (const int8_t *groupby_buffer, const GroupByBufferLayoutInfo &layout, const size_t start, const size_t step)
 

Function Documentation

void anonymous_namespace{ResultSetSortImpl.cu}::add_nulls ( std::vector< uint32_t > &  idx_buff,
const std::vector< uint32_t > &  null_idx_buff,
const PodOrderEntry oe 
)

Definition at line 97 of file ResultSetSortImpl.cu.

References PodOrderEntry::nulls_first.

Referenced by baseline_sort_fp(), and baseline_sort_int().

99  {
100  if (null_idx_buff.empty()) {
101  return;
102  }
103  const auto insertion_point = oe.nulls_first ? idx_buff.begin() : idx_buff.end();
104  idx_buff.insert(insertion_point, null_idx_buff.begin(), null_idx_buff.end());
105 }
bool nulls_first

+ Here is the caller graph for this function:

template<class K >
std::vector<uint32_t> anonymous_namespace{ResultSetSortImpl.cu}::baseline_sort_fp ( const ExecutorDeviceType  device_type,
const int  device_id,
Data_Namespace::DataMgr data_mgr,
const int8_t *  groupby_buffer,
const thrust::host_vector< int64_t > &  oe_col_buffer,
const PodOrderEntry oe,
const GroupByBufferLayoutInfo layout,
const size_t  top_n,
const size_t  start,
const size_t  step 
)

Definition at line 125 of file ResultSetSortImpl.cu.

References add_nulls(), TargetInfo::agg_kind, CHECK, CPU, GroupByBufferLayoutInfo::entry_count, get_device_copy_ptr(), GPU, PodOrderEntry::is_desc, kAVG, kDOUBLE, null_val_bit_pattern(), PodOrderEntry::nulls_first, GroupByBufferLayoutInfo::oe_target_info, GroupByBufferLayoutInfo::row_bytes, takes_float_argument(), and PodOrderEntry::tle_no.

134  {
135  thrust::host_vector<uint32_t> neg_idx_buff;
136  thrust::host_vector<uint32_t> pos_idx_buff;
137  std::vector<uint32_t> null_idx_buff;
138  thrust::host_vector<int64_t> neg_oe_col_buffer;
139  thrust::host_vector<int64_t> pos_oe_col_buffer;
140  const auto slice_entry_count =
141  layout.entry_count / step + (layout.entry_count % step ? 1 : 0);
142  neg_idx_buff.reserve(slice_entry_count);
143  pos_idx_buff.reserve(slice_entry_count);
144  null_idx_buff.reserve(slice_entry_count);
145  neg_oe_col_buffer.reserve(slice_entry_count);
146  pos_oe_col_buffer.reserve(slice_entry_count);
147  size_t oe_col_buffer_idx = 0;
148  const auto& oe_info = layout.oe_target_info;
149  const auto col_ti =
150  oe_info.agg_kind == kAVG ? SQLTypeInfo(kDOUBLE, false) : oe_info.sql_type;
151  // Execlude AVG b/c collect_order_entry_column already makes its pair collapse into a
152  // double
153  const bool float_argument_input =
154  takes_float_argument(oe_info) && oe_info.agg_kind != kAVG;
155 
156  auto is_negative =
157  float_argument_input ? [](const int64_t v) -> bool { return (v & (1 << 31)) != 0; }
158  : [](const int64_t v) -> bool { return v < 0; };
159 
160  for (size_t i = start; i < layout.entry_count; i += step, ++oe_col_buffer_idx) {
161  if (!is_empty_entry<K>(i, groupby_buffer, layout.row_bytes) &&
162  oe_col_buffer[oe_col_buffer_idx] ==
163  null_val_bit_pattern(col_ti, float_argument_input)) {
164  null_idx_buff.push_back(i);
165  continue;
166  }
167  if (is_negative(oe_col_buffer[oe_col_buffer_idx])) { // sign bit works the same for
168  // integer and floating point
169  neg_idx_buff.push_back(i);
170  neg_oe_col_buffer.push_back(oe_col_buffer[oe_col_buffer_idx]);
171  } else {
172  pos_idx_buff.push_back(i);
173  pos_oe_col_buffer.push_back(oe_col_buffer[oe_col_buffer_idx]);
174  }
175  }
176  std::vector<uint32_t> pos_result;
177  ThrustAllocator thrust_allocator(data_mgr, device_id);
178  if (device_type == ExecutorDeviceType::GPU) {
179  const auto dev_pos_idx_buff = get_device_copy_ptr(pos_idx_buff, thrust_allocator);
180  const auto dev_pos_oe_col_buffer =
181  get_device_copy_ptr(pos_oe_col_buffer, thrust_allocator);
182  pos_result = do_radix_sort<K>(device_type,
183  device_id,
184  thrust_allocator,
185  groupby_buffer,
186  dev_pos_oe_col_buffer,
187  dev_pos_oe_col_buffer + pos_oe_col_buffer.size(),
188  dev_pos_idx_buff,
189  pos_idx_buff.size(),
190  oe,
191  layout,
192  top_n);
193  } else {
194  CHECK(device_type == ExecutorDeviceType::CPU);
195  pos_result = do_radix_sort<K>(device_type,
196  device_id,
197  thrust_allocator,
198  groupby_buffer,
199  pos_oe_col_buffer.begin(),
200  pos_oe_col_buffer.end(),
201  pos_idx_buff.begin(),
202  pos_idx_buff.size(),
203  oe,
204  layout,
205  top_n);
206  }
207  std::vector<uint32_t> neg_result;
208  PodOrderEntry reverse_oe{oe.tle_no, !oe.is_desc, oe.nulls_first};
209  if (device_type == ExecutorDeviceType::GPU) {
210  const auto dev_neg_idx_buff = get_device_copy_ptr(neg_idx_buff, thrust_allocator);
211  const auto dev_neg_oe_col_buffer =
212  get_device_copy_ptr(neg_oe_col_buffer, thrust_allocator);
213  neg_result = do_radix_sort<K>(device_type,
214  device_id,
215  thrust_allocator,
216  groupby_buffer,
217  dev_neg_oe_col_buffer,
218  dev_neg_oe_col_buffer + neg_oe_col_buffer.size(),
219  dev_neg_idx_buff,
220  neg_idx_buff.size(),
221  reverse_oe,
222  layout,
223  top_n);
224  } else {
225  CHECK(device_type == ExecutorDeviceType::CPU);
226  neg_result = do_radix_sort<K>(device_type,
227  device_id,
228  thrust_allocator,
229  groupby_buffer,
230  neg_oe_col_buffer.begin(),
231  neg_oe_col_buffer.end(),
232  neg_idx_buff.begin(),
233  neg_idx_buff.size(),
234  reverse_oe,
235  layout,
236  top_n);
237  }
238  if (oe.is_desc) {
239  pos_result.insert(pos_result.end(), neg_result.begin(), neg_result.end());
240  add_nulls(pos_result, null_idx_buff, oe);
241  return pos_result;
242  }
243  neg_result.insert(neg_result.end(), pos_result.begin(), pos_result.end());
244  add_nulls(neg_result, null_idx_buff, oe);
245  return neg_result;
246 }
thrust::device_ptr< T > get_device_copy_ptr(const thrust::host_vector< T > &host_vec, ThrustAllocator &thrust_allocator)
bool takes_float_argument(const TargetInfo &target_info)
Definition: TargetInfo.h:106
int64_t null_val_bit_pattern(const SQLTypeInfo &ti, const bool float_argument_input)
bool nulls_first
SQLAgg agg_kind
Definition: TargetInfo.h:51
void add_nulls(std::vector< uint32_t > &idx_buff, const std::vector< uint32_t > &null_idx_buff, const PodOrderEntry &oe)
int tle_no
bool is_desc
const TargetInfo oe_target_info
#define CHECK(condition)
Definition: Logger.h:291
Definition: sqldefs.h:77

+ Here is the call graph for this function:

template<class K >
std::vector<uint32_t> anonymous_namespace{ResultSetSortImpl.cu}::baseline_sort_int ( const ExecutorDeviceType  device_type,
const int  device_id,
Data_Namespace::DataMgr data_mgr,
const int8_t *  groupby_buffer,
const thrust::host_vector< int64_t > &  oe_col_buffer,
const PodOrderEntry oe,
const GroupByBufferLayoutInfo layout,
const size_t  top_n,
const size_t  start,
const size_t  step 
)

Definition at line 249 of file ResultSetSortImpl.cu.

References add_nulls(), CHECK, CPU, GroupByBufferLayoutInfo::entry_count, get_compact_type(), get_device_copy_ptr(), GPU, null_val_bit_pattern(), GroupByBufferLayoutInfo::oe_target_info, and GroupByBufferLayoutInfo::row_bytes.

258  {
259  const auto& entry_ti = get_compact_type(layout.oe_target_info);
260  std::vector<uint32_t> null_idx_buff;
261  thrust::host_vector<uint32_t> notnull_idx_buff;
262  const auto slice_entry_count =
263  layout.entry_count / step + (layout.entry_count % step ? 1 : 0);
264  null_idx_buff.reserve(slice_entry_count);
265  notnull_idx_buff.reserve(slice_entry_count);
266  thrust::host_vector<int64_t> notnull_oe_col_buffer;
267  notnull_oe_col_buffer.reserve(slice_entry_count);
268  size_t oe_col_buffer_idx = 0;
269  for (size_t i = start; i < layout.entry_count; i += step, ++oe_col_buffer_idx) {
270  if (!is_empty_entry<K>(i, groupby_buffer, layout.row_bytes) &&
271  oe_col_buffer[oe_col_buffer_idx] == null_val_bit_pattern(entry_ti, false)) {
272  null_idx_buff.push_back(i);
273  } else {
274  notnull_idx_buff.push_back(i);
275  notnull_oe_col_buffer.push_back(oe_col_buffer[oe_col_buffer_idx]);
276  }
277  }
278  std::vector<uint32_t> notnull_result;
279  ThrustAllocator thrust_allocator(data_mgr, device_id);
280  if (device_type == ExecutorDeviceType::GPU) {
281  const auto dev_notnull_idx_buff =
282  get_device_copy_ptr(notnull_idx_buff, thrust_allocator);
283  const auto dev_notnull_oe_col_buffer =
284  get_device_copy_ptr(notnull_oe_col_buffer, thrust_allocator);
285  notnull_result =
286  do_radix_sort<K>(device_type,
287  device_id,
288  thrust_allocator,
289  groupby_buffer,
290  dev_notnull_oe_col_buffer,
291  dev_notnull_oe_col_buffer + notnull_oe_col_buffer.size(),
292  dev_notnull_idx_buff,
293  notnull_idx_buff.size(),
294  oe,
295  layout,
296  top_n);
297  } else {
298  CHECK(device_type == ExecutorDeviceType::CPU);
299  notnull_result = do_radix_sort<K>(device_type,
300  device_id,
301  thrust_allocator,
302  groupby_buffer,
303  notnull_oe_col_buffer.begin(),
304  notnull_oe_col_buffer.end(),
305  notnull_idx_buff.begin(),
306  notnull_idx_buff.size(),
307  oe,
308  layout,
309  top_n);
310  }
311  add_nulls(notnull_result, null_idx_buff, oe);
312  return notnull_result;
313 }
thrust::device_ptr< T > get_device_copy_ptr(const thrust::host_vector< T > &host_vec, ThrustAllocator &thrust_allocator)
int64_t null_val_bit_pattern(const SQLTypeInfo &ti, const bool float_argument_input)
const SQLTypeInfo get_compact_type(const TargetInfo &target)
void add_nulls(std::vector< uint32_t > &idx_buff, const std::vector< uint32_t > &null_idx_buff, const PodOrderEntry &oe)
const TargetInfo oe_target_info
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the call graph for this function:

template<class K >
thrust::host_vector<int64_t> anonymous_namespace{ResultSetSortImpl.cu}::collect_order_entry_column ( const int8_t *  groupby_buffer,
const GroupByBufferLayoutInfo layout,
const size_t  start,
const size_t  step 
)

Definition at line 316 of file ResultSetSortImpl.cu.

References GroupByBufferLayoutInfo::col_off, GroupByBufferLayoutInfo::row_bytes, and GroupByBufferLayoutInfo::target_groupby_index.

320  {
321  thrust::host_vector<int64_t> oe_col_buffer;
322  const auto row_ptr = groupby_buffer + start * layout.row_bytes;
323  auto crt_group_ptr1 = layout.target_groupby_index >= 0
324  ? row_ptr + layout.target_groupby_index * sizeof(K)
325  : row_ptr + layout.col_off;
326  const int8_t* crt_group_ptr2{nullptr};
327  if (layout.oe_target_info.agg_kind == kAVG) {
328  crt_group_ptr2 = crt_group_ptr1 + layout.col_bytes;
329  }
330  const auto entry_ti = get_compact_type(layout.oe_target_info);
331  const bool float_argument_input = takes_float_argument(layout.oe_target_info);
332  const auto step_bytes = layout.row_bytes * step;
333  const auto col_bytes = float_argument_input ? entry_ti.get_size() : layout.col_bytes;
334  for (size_t i = start; i < layout.entry_count; i += step) {
335  auto val1 = read_int_from_buff(crt_group_ptr1, col_bytes > 0 ? col_bytes : sizeof(K));
336  if (crt_group_ptr2) {
337  const auto val2 = read_int_from_buff(crt_group_ptr2, 8);
338  const auto avg_val = pair_to_double({val1, val2}, entry_ti, float_argument_input);
339  val1 = *reinterpret_cast<const int64_t*>(&avg_val);
340  }
341  oe_col_buffer.push_back(val1);
342  crt_group_ptr1 += step_bytes;
343  if (crt_group_ptr2) {
344  crt_group_ptr2 += step_bytes;
345  }
346  }
347  return oe_col_buffer;
348 }
int64_t read_int_from_buff(const int8_t *ptr, const int8_t compact_sz)
double pair_to_double(const std::pair< int64_t, int64_t > &fp_pair, const SQLTypeInfo &ti, const bool float_argument_input)
bool takes_float_argument(const TargetInfo &target_info)
Definition: TargetInfo.h:106
const SQLTypeInfo get_compact_type(const TargetInfo &target)
SQLAgg agg_kind
Definition: TargetInfo.h:51
const TargetInfo oe_target_info
Definition: sqldefs.h:77
const int64_t target_groupby_index
template<class K , class V , class I >
std::vector<uint32_t> anonymous_namespace{ResultSetSortImpl.cu}::do_radix_sort ( const ExecutorDeviceType  device_type,
const int  device_id,
ThrustAllocator thrust_allocator,
const int8_t *  groupby_buffer,
dev_oe_col_buffer_begin,
dev_oe_col_buffer_end,
dev_idx_buff_begin,
const size_t  dev_idx_buff_size,
const PodOrderEntry oe,
const GroupByBufferLayoutInfo layout,
const size_t  top_n 
)

Definition at line 25 of file ResultSetSortImpl.cu.

References checkCudaErrors, getQueryEngineCudaStreamForDevice(), GPU, PodOrderEntry::is_desc, run_benchmark_import::result, and GroupByBufferLayoutInfo::row_bytes.

35  {
36  if (dev_idx_buff_size == 0) {
37  return {};
38  }
39  if (oe.is_desc) {
40  if (device_type == ExecutorDeviceType::GPU) {
41  auto qe_cuda_stream = getQueryEngineCudaStreamForDevice(device_id);
42  thrust::sort_by_key(thrust::cuda::par(thrust_allocator).on(qe_cuda_stream),
43  dev_oe_col_buffer_begin,
44  dev_oe_col_buffer_end,
45  dev_idx_buff_begin,
46  thrust::greater<int64_t>());
47  checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));
48  } else {
49  thrust::sort_by_key(dev_oe_col_buffer_begin,
50  dev_oe_col_buffer_end,
51  dev_idx_buff_begin,
52  thrust::greater<int64_t>());
53  }
54  } else {
55  if (device_type == ExecutorDeviceType::GPU) {
56  auto qe_cuda_stream = getQueryEngineCudaStreamForDevice(device_id);
57  thrust::sort_by_key(thrust::cuda::par(thrust_allocator).on(qe_cuda_stream),
58  dev_oe_col_buffer_begin,
59  dev_oe_col_buffer_end,
60  dev_idx_buff_begin);
61  checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));
62  } else {
63  thrust::sort_by_key(
64  dev_oe_col_buffer_begin, dev_oe_col_buffer_end, dev_idx_buff_begin);
65  }
66  }
67  // Speculatively transfer only the top_n first, most of the time it'll be enough.
68  thrust::host_vector<uint32_t> host_vector_result(
69  dev_idx_buff_begin, dev_idx_buff_begin + std::min(top_n, dev_idx_buff_size));
70  // Sometimes, radix sort can bring to the front entries which are empty.
71  // For example, ascending sort on COUNT(*) will bring non-existent groups
72  // to the front of dev_idx_buff since they're 0 in our system. Re-do the
73  // transfer in that case to bring the entire dev_idx_buff; existing logic
74  // in row iteration will take care of skipping the empty rows.
75  for (size_t i = 0; i < host_vector_result.size(); ++i) {
76  const auto entry_idx = host_vector_result[i];
77  if (is_empty_entry<K>(entry_idx, groupby_buffer, layout.row_bytes)) {
78  host_vector_result = thrust::host_vector<uint32_t>(
79  dev_idx_buff_begin, dev_idx_buff_begin + dev_idx_buff_size);
80  break;
81  }
82  }
83  std::vector<uint32_t> result;
84  result.reserve(std::min(top_n, host_vector_result.size()));
85  for (size_t i = 0; i < host_vector_result.size(); ++i) {
86  const auto entry_idx = host_vector_result[i];
87  if (!is_empty_entry<K>(entry_idx, groupby_buffer, layout.row_bytes)) {
88  result.push_back(entry_idx);
89  if (result.size() >= top_n) {
90  break;
91  }
92  }
93  }
94  return result;
95 }
CUstream getQueryEngineCudaStreamForDevice(int device_num)
Definition: QueryEngine.cpp:7
bool is_desc
#define checkCudaErrors(err)
Definition: GpuInitGroups.cu:9

+ Here is the call graph for this function:

template<typename T >
thrust::device_ptr<T> anonymous_namespace{ResultSetSortImpl.cu}::get_device_copy_ptr ( const thrust::host_vector< T > &  host_vec,
ThrustAllocator thrust_allocator 
)

Definition at line 108 of file ResultSetSortImpl.cu.

References align_to_int64(), ThrustAllocator::allocateScopedBuffer(), copy_to_nvidia_gpu(), ThrustAllocator::getDataMgr(), ThrustAllocator::getDeviceId(), and heavydb.dtypes::T.

Referenced by baseline_sort(), baseline_sort_fp(), and baseline_sort_int().

109  {
110  if (host_vec.empty()) {
111  return thrust::device_ptr<T>(static_cast<T*>(nullptr));
112  }
113  const auto host_vec_bytes = host_vec.size() * sizeof(T);
114  T* dev_ptr = reinterpret_cast<T*>(
115  thrust_allocator.allocateScopedBuffer(align_to_int64(host_vec_bytes)));
116  copy_to_nvidia_gpu(thrust_allocator.getDataMgr(),
117  reinterpret_cast<CUdeviceptr>(dev_ptr),
118  &host_vec[0],
119  host_vec_bytes,
120  thrust_allocator.getDeviceId());
121  return thrust::device_ptr<T>(dev_ptr);
122 }
int getDeviceId() const
unsigned long long CUdeviceptr
Definition: nocuda.h:28
Data_Namespace::DataMgr * getDataMgr() const
int8_t * allocateScopedBuffer(std::ptrdiff_t num_bytes)
void copy_to_nvidia_gpu(Data_Namespace::DataMgr *data_mgr, CUdeviceptr dst, const void *src, const size_t num_bytes, const int device_id)
Definition: GpuMemUtils.cpp:35
FORCE_INLINE HOST DEVICE T align_to_int64(T addr)

+ Here is the call graph for this function:

+ Here is the caller graph for this function: