OmniSciDB  a5dc49c757
RuntimeFunctions.cpp
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifdef __CUDACC__
18 #error This code is not intended to be compiled with a CUDA C++ compiler
19 #endif // __CUDACC__
20 
21 #include "RuntimeFunctions.h"
22 #include "BufferCompaction.h"
23 #include "DecisionTreeEntry.h"
24 #include "HyperLogLogRank.h"
25 #include "MurmurHash.h"
26 #include "Shared/Datum.h"
27 #include "Shared/quantile.h"
28 #include "TypePunning.h"
29 #include "Utils/SegmentTreeUtils.h"
30 
31 #include <atomic>
32 #include <cfloat>
33 #include <chrono>
34 #include <cmath>
35 #include <cstring>
36 #include <functional>
37 #include <thread>
38 #include <tuple>
39 
40 // decoder implementations
41 
42 #include "DecodersImpl.h"
43 
44 // arithmetic operator implementations
45 
46 #define DEF_ARITH_NULLABLE(type, null_type, opname, opsym) \
47  extern "C" RUNTIME_EXPORT ALWAYS_INLINE type opname##_##type##_nullable( \
48  const type lhs, const type rhs, const null_type null_val) { \
49  if (lhs != null_val && rhs != null_val) { \
50  return lhs opsym rhs; \
51  } \
52  return null_val; \
53  }
54 
55 #define DEF_ARITH_NULLABLE_LHS(type, null_type, opname, opsym) \
56  extern "C" RUNTIME_EXPORT ALWAYS_INLINE type opname##_##type##_nullable_lhs( \
57  const type lhs, const type rhs, const null_type null_val) { \
58  if (lhs != null_val) { \
59  return lhs opsym rhs; \
60  } \
61  return null_val; \
62  }
63 
64 #define DEF_ARITH_NULLABLE_RHS(type, null_type, opname, opsym) \
65  extern "C" RUNTIME_EXPORT ALWAYS_INLINE type opname##_##type##_nullable_rhs( \
66  const type lhs, const type rhs, const null_type null_val) { \
67  if (rhs != null_val) { \
68  return lhs opsym rhs; \
69  } \
70  return null_val; \
71  }
72 
73 #define DEF_CMP_NULLABLE(type, null_type, opname, opsym) \
74  extern "C" RUNTIME_EXPORT ALWAYS_INLINE int8_t opname##_##type##_nullable( \
75  const type lhs, \
76  const type rhs, \
77  const null_type null_val, \
78  const int8_t null_bool_val) { \
79  if (lhs != null_val && rhs != null_val) { \
80  return lhs opsym rhs; \
81  } \
82  return null_bool_val; \
83  }
84 
85 #define DEF_CMP_NULLABLE_LHS(type, null_type, opname, opsym) \
86  extern "C" RUNTIME_EXPORT ALWAYS_INLINE int8_t opname##_##type##_nullable_lhs( \
87  const type lhs, \
88  const type rhs, \
89  const null_type null_val, \
90  const int8_t null_bool_val) { \
91  if (lhs != null_val) { \
92  return lhs opsym rhs; \
93  } \
94  return null_bool_val; \
95  }
96 
97 #define DEF_CMP_NULLABLE_RHS(type, null_type, opname, opsym) \
98  extern "C" RUNTIME_EXPORT ALWAYS_INLINE int8_t opname##_##type##_nullable_rhs( \
99  const type lhs, \
100  const type rhs, \
101  const null_type null_val, \
102  const int8_t null_bool_val) { \
103  if (rhs != null_val) { \
104  return lhs opsym rhs; \
105  } \
106  return null_bool_val; \
107  }
108 
109 #define DEF_SAFE_DIV_NULLABLE(type, null_type, opname) \
110  extern "C" RUNTIME_EXPORT ALWAYS_INLINE type safe_div_##type( \
111  const type lhs, const type rhs, const null_type null_val) { \
112  if (lhs != null_val && rhs != null_val && rhs != 0) { \
113  return lhs / rhs; \
114  } \
115  return null_val; \
116  }
117 
118 #define DEF_BINARY_NULLABLE_ALL_OPS(type, null_type) \
119  DEF_ARITH_NULLABLE(type, null_type, add, +) \
120  DEF_ARITH_NULLABLE(type, null_type, sub, -) \
121  DEF_ARITH_NULLABLE(type, null_type, mul, *) \
122  DEF_ARITH_NULLABLE(type, null_type, div, /) \
123  DEF_SAFE_DIV_NULLABLE(type, null_type, safe_div) \
124  DEF_ARITH_NULLABLE_LHS(type, null_type, add, +) \
125  DEF_ARITH_NULLABLE_LHS(type, null_type, sub, -) \
126  DEF_ARITH_NULLABLE_LHS(type, null_type, mul, *) \
127  DEF_ARITH_NULLABLE_LHS(type, null_type, div, /) \
128  DEF_ARITH_NULLABLE_RHS(type, null_type, add, +) \
129  DEF_ARITH_NULLABLE_RHS(type, null_type, sub, -) \
130  DEF_ARITH_NULLABLE_RHS(type, null_type, mul, *) \
131  DEF_ARITH_NULLABLE_RHS(type, null_type, div, /) \
132  DEF_CMP_NULLABLE(type, null_type, eq, ==) \
133  DEF_CMP_NULLABLE(type, null_type, ne, !=) \
134  DEF_CMP_NULLABLE(type, null_type, lt, <) \
135  DEF_CMP_NULLABLE(type, null_type, gt, >) \
136  DEF_CMP_NULLABLE(type, null_type, le, <=) \
137  DEF_CMP_NULLABLE(type, null_type, ge, >=) \
138  DEF_CMP_NULLABLE_LHS(type, null_type, eq, ==) \
139  DEF_CMP_NULLABLE_LHS(type, null_type, ne, !=) \
140  DEF_CMP_NULLABLE_LHS(type, null_type, lt, <) \
141  DEF_CMP_NULLABLE_LHS(type, null_type, gt, >) \
142  DEF_CMP_NULLABLE_LHS(type, null_type, le, <=) \
143  DEF_CMP_NULLABLE_LHS(type, null_type, ge, >=) \
144  DEF_CMP_NULLABLE_RHS(type, null_type, eq, ==) \
145  DEF_CMP_NULLABLE_RHS(type, null_type, ne, !=) \
146  DEF_CMP_NULLABLE_RHS(type, null_type, lt, <) \
147  DEF_CMP_NULLABLE_RHS(type, null_type, gt, >) \
148  DEF_CMP_NULLABLE_RHS(type, null_type, le, <=) \
149  DEF_CMP_NULLABLE_RHS(type, null_type, ge, >=)
150 
151 DEF_BINARY_NULLABLE_ALL_OPS(int8_t, int64_t)
152 DEF_BINARY_NULLABLE_ALL_OPS(int16_t, int64_t)
153 DEF_BINARY_NULLABLE_ALL_OPS(int32_t, int64_t)
154 DEF_BINARY_NULLABLE_ALL_OPS(int64_t, int64_t)
155 DEF_BINARY_NULLABLE_ALL_OPS(float, float)
156 DEF_BINARY_NULLABLE_ALL_OPS(double, double)
157 DEF_ARITH_NULLABLE(int8_t, int64_t, mod, %)
158 DEF_ARITH_NULLABLE(int16_t, int64_t, mod, %)
159 DEF_ARITH_NULLABLE(int32_t, int64_t, mod, %)
160 DEF_ARITH_NULLABLE(int64_t, int64_t, mod, %)
161 DEF_ARITH_NULLABLE_LHS(int8_t, int64_t, mod, %)
162 DEF_ARITH_NULLABLE_LHS(int16_t, int64_t, mod, %)
163 DEF_ARITH_NULLABLE_LHS(int32_t, int64_t, mod, %)
164 DEF_ARITH_NULLABLE_LHS(int64_t, int64_t, mod, %)
165 DEF_ARITH_NULLABLE_RHS(int8_t, int64_t, mod, %)
166 DEF_ARITH_NULLABLE_RHS(int16_t, int64_t, mod, %)
167 DEF_ARITH_NULLABLE_RHS(int32_t, int64_t, mod, %)
168 DEF_ARITH_NULLABLE_RHS(int64_t, int64_t, mod, %)
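// Annotation (not in the original source): each DEF_* invocation above stamps out a
// small extern "C" helper with a name of the form <op>_<type>_nullable[_lhs|_rhs].
// For example, DEF_ARITH_NULLABLE(int32_t, int64_t, add, +) expands to roughly:
//
//   extern "C" RUNTIME_EXPORT ALWAYS_INLINE int32_t add_int32_t_nullable(
//       const int32_t lhs, const int32_t rhs, const int64_t null_val) {
//     if (lhs != null_val && rhs != null_val) {
//       return lhs + rhs;
//     }
//     return null_val;
//   }
//
// i.e. the operation is applied only when neither operand is the null sentinel.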
169 
170 #undef DEF_BINARY_NULLABLE_ALL_OPS
171 #undef DEF_SAFE_DIV_NULLABLE
172 #undef DEF_CMP_NULLABLE_RHS
173 #undef DEF_CMP_NULLABLE_LHS
174 #undef DEF_CMP_NULLABLE
175 #undef DEF_ARITH_NULLABLE_RHS
176 #undef DEF_ARITH_NULLABLE_LHS
177 #undef DEF_ARITH_NULLABLE
178 
179 #define DEF_MAP_STRING_TO_DATUM(value_type, value_name) \
180  extern "C" ALWAYS_INLINE DEVICE value_type map_string_to_datum_##value_name( \
181  const int32_t string_id, \
182  const int64_t translation_map_handle, \
183  const int32_t min_source_id) { \
184  const Datum* translation_map = \
185  reinterpret_cast<const Datum*>(translation_map_handle); \
186  const Datum& out_datum = translation_map[string_id - min_source_id]; \
187  return out_datum.value_name##val; \
188  }
189 
190 DEF_MAP_STRING_TO_DATUM(int8_t, bool)
191 DEF_MAP_STRING_TO_DATUM(int8_t, tinyint)
192 DEF_MAP_STRING_TO_DATUM(int16_t, smallint)
193 DEF_MAP_STRING_TO_DATUM(int32_t, int)
194 DEF_MAP_STRING_TO_DATUM(int64_t, bigint)
195 DEF_MAP_STRING_TO_DATUM(float, float)
196 DEF_MAP_STRING_TO_DATUM(double, double)
197 
198 #undef DEF_MAP_STRING_TO_DATUM
199 
200 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t
201 scale_decimal_up(const int64_t operand,
202  const uint64_t scale,
203  const int64_t operand_null_val,
204  const int64_t result_null_val) {
205  return operand != operand_null_val ? operand * scale : result_null_val;
206 }
207 
208 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t
209 scale_decimal_down_nullable(const int64_t operand,
210  const int64_t scale,
211  const int64_t null_val) {
212  // rounded scale down of a decimal
213  if (operand == null_val) {
214  return null_val;
215  }
216 
217  int64_t tmp = scale >> 1;
218  tmp = operand >= 0 ? operand + tmp : operand - tmp;
219  return tmp / scale;
220 }
221 
222 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t
223 scale_decimal_down_not_nullable(const int64_t operand,
224  const int64_t scale,
225  const int64_t null_val) {
226  int64_t tmp = scale >> 1;
227  tmp = operand >= 0 ? operand + tmp : operand - tmp;
228  return tmp / scale;
229 }
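// Worked example (annotation, not in the original source): with scale = 100,
// scale_decimal_down_nullable(1264, 100, null) adds scale >> 1 = 50 before the
// integer division, so 1264 -> 1314 / 100 -> 13, i.e. 12.64 rounds to 13; for a
// negative operand, -1264 -> -1314 / 100 -> -13. scale_decimal_up simply
// multiplies, so scale_decimal_up(13, 100, ...) yields 1300.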
230 
231 // Return floor(dividend / divisor).
232 // Assumes 0 < divisor.
233 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t floor_div_lhs(const int64_t dividend,
234  const int64_t divisor) {
235  return (dividend < 0 ? dividend - (divisor - 1) : dividend) / divisor;
236 }
237 
238 // Return floor(dividend / divisor) or NULL if dividend IS NULL.
239 // Assumes 0 < divisor.
240 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t
241 floor_div_nullable_lhs(const int64_t dividend,
242  const int64_t divisor,
243  const int64_t null_val) {
244  return dividend == null_val ? null_val : floor_div_lhs(dividend, divisor);
245 }
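// Worked example (annotation): plain C++ division truncates toward zero, so
// -7 / 2 == -3, whereas floor_div_lhs(-7, 2) == (-7 - 1) / 2 == -4 == floor(-3.5).
// For non-negative dividends the adjustment is skipped: floor_div_lhs(7, 2) == 3.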
246 
247 #define DEF_UMINUS_NULLABLE(type, null_type) \
248  extern "C" RUNTIME_EXPORT ALWAYS_INLINE type uminus_##type##_nullable( \
249  const type operand, const null_type null_val) { \
250  return operand == null_val ? null_val : -operand; \
251  }
252 
253 DEF_UMINUS_NULLABLE(int8_t, int8_t)
254 DEF_UMINUS_NULLABLE(int16_t, int16_t)
255 DEF_UMINUS_NULLABLE(int32_t, int32_t)
256 DEF_UMINUS_NULLABLE(int64_t, int64_t)
257 DEF_UMINUS_NULLABLE(float, float)
258 DEF_UMINUS_NULLABLE(double, double)
259 
260 #undef DEF_UMINUS_NULLABLE
261 
262 #define DEF_CAST_NULLABLE(from_type, to_type) \
263  extern "C" RUNTIME_EXPORT ALWAYS_INLINE to_type \
264  cast_##from_type##_to_##to_type##_nullable(const from_type operand, \
265  const from_type from_null_val, \
266  const to_type to_null_val) { \
267  return operand == from_null_val ? to_null_val : operand; \
268  }
269 
270 #define DEF_CAST_SCALED_NULLABLE(from_type, to_type) \
271  extern "C" RUNTIME_EXPORT ALWAYS_INLINE to_type \
272  cast_##from_type##_to_##to_type##_scaled_nullable(const from_type operand, \
273  const from_type from_null_val, \
274  const to_type to_null_val, \
275  const to_type divider) { \
276  return operand == from_null_val ? to_null_val : operand / divider; \
277  }
278 
279 #define DEF_CAST_NULLABLE_BIDIR(type1, type2) \
280  DEF_CAST_NULLABLE(type1, type2) \
281  DEF_CAST_NULLABLE(type2, type1)
282 
283 #define DEF_ROUND_NULLABLE(from_type, to_type) \
284  extern "C" RUNTIME_EXPORT ALWAYS_INLINE to_type \
285  cast_##from_type##_to_##to_type##_nullable(const from_type operand, \
286  const from_type from_null_val, \
287  const to_type to_null_val) { \
288  return operand == from_null_val \
289  ? to_null_val \
290  : static_cast<to_type>(operand + (operand < from_type(0) \
291  ? from_type(-0.5) \
292  : from_type(0.5))); \
293  }
294 
295 DEF_CAST_NULLABLE_BIDIR(int8_t, int16_t)
296 DEF_CAST_NULLABLE_BIDIR(int8_t, int32_t)
297 DEF_CAST_NULLABLE_BIDIR(int8_t, int64_t)
298 DEF_CAST_NULLABLE_BIDIR(int16_t, int32_t)
299 DEF_CAST_NULLABLE_BIDIR(int16_t, int64_t)
300 DEF_CAST_NULLABLE_BIDIR(int32_t, int64_t)
301 DEF_CAST_NULLABLE_BIDIR(float, double)
302 
303 DEF_CAST_NULLABLE(int8_t, float)
304 DEF_CAST_NULLABLE(int16_t, float)
305 DEF_CAST_NULLABLE(int32_t, float)
306 DEF_CAST_NULLABLE(int64_t, float)
307 DEF_CAST_NULLABLE(int8_t, double)
308 DEF_CAST_NULLABLE(int16_t, double)
309 DEF_CAST_NULLABLE(int32_t, double)
310 DEF_CAST_NULLABLE(int64_t, double)
311 
312 DEF_ROUND_NULLABLE(float, int8_t)
313 DEF_ROUND_NULLABLE(float, int16_t)
314 DEF_ROUND_NULLABLE(float, int32_t)
315 DEF_ROUND_NULLABLE(float, int64_t)
316 DEF_ROUND_NULLABLE(double, int8_t)
317 DEF_ROUND_NULLABLE(double, int16_t)
318 DEF_ROUND_NULLABLE(double, int32_t)
319 DEF_ROUND_NULLABLE(double, int64_t)
320 
321 DEF_CAST_NULLABLE(uint8_t, int32_t)
322 DEF_CAST_NULLABLE(uint16_t, int32_t)
323 DEF_CAST_SCALED_NULLABLE(int64_t, float)
324 DEF_CAST_SCALED_NULLABLE(int64_t, double)
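// Annotation (not in the original source): the float/double -> integer casts
// generated by DEF_ROUND_NULLABLE round half away from zero rather than truncate,
// e.g. cast_double_to_int32_t_nullable(2.5, ...) yields 3 and (-2.5, ...) yields -3,
// while the DEF_CAST_NULLABLE variants only translate the null sentinel and pass
// non-null values through unchanged.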
325 
326 #undef DEF_ROUND_NULLABLE
327 #undef DEF_CAST_NULLABLE_BIDIR
328 #undef DEF_CAST_SCALED_NULLABLE
329 #undef DEF_CAST_NULLABLE
330 
331 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int8_t logical_not(const int8_t operand,
332  const int8_t null_val) {
333  return operand == null_val ? operand : (operand ? 0 : 1);
334 }
335 
336 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int8_t logical_and(const int8_t lhs,
337  const int8_t rhs,
338  const int8_t null_val) {
339  if (lhs == null_val) {
340  return rhs == 0 ? rhs : null_val;
341  }
342  if (rhs == null_val) {
343  return lhs == 0 ? lhs : null_val;
344  }
345  return (lhs && rhs) ? 1 : 0;
346 }
347 
348 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int8_t logical_or(const int8_t lhs,
349  const int8_t rhs,
350  const int8_t null_val) {
351  if (lhs == null_val) {
352  return rhs == 0 ? null_val : rhs;
353  }
354  if (rhs == null_val) {
355  return lhs == 0 ? null_val : lhs;
356  }
357  return (lhs || rhs) ? 1 : 0;
358 }
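// Annotation (not in the original source): these implement SQL three-valued logic
// with an int8_t null sentinel:
//   AND: FALSE AND NULL -> FALSE, TRUE AND NULL -> NULL, NULL AND NULL -> NULL
//   OR:  TRUE OR NULL   -> TRUE,  FALSE OR NULL -> NULL, NULL OR NULL  -> NULL
// When one side is NULL and the other side decides the result, the non-null
// operand is returned as-is, so any non-zero int8_t counts as TRUE.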
359 
360 // aggregator implementations
361 
362 extern "C" RUNTIME_EXPORT ALWAYS_INLINE uint64_t agg_count(uint64_t* agg, const int64_t) {
363  return (*agg)++;
364 }
365 
366 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_count_distinct_bitmap(
367  int64_t* agg,
368  const int64_t val,
369  const int64_t min_val,
370  const int64_t bucket_size) {
371  uint64_t bitmap_idx = val - min_val;
372  if (1 < bucket_size) {
373  bitmap_idx /= static_cast<uint64_t>(bucket_size);
374  }
375  reinterpret_cast<int8_t*>(*agg)[bitmap_idx >> 3] |= (1 << (bitmap_idx & 7));
376 }
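// Worked example (annotation): *agg holds a pointer to the bitmap buffer. With
// val = 21, min_val = 10 and bucket_size = 1 the bitmap index is 11, so the call
// sets bit (11 & 7) = 3 of byte (11 >> 3) = 1. With bucket_size = 4 the index
// collapses to 11 / 4 = 2, i.e. bit 2 of byte 0.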
377 
378 #ifdef _MSC_VER
379 #define GPU_RT_STUB NEVER_INLINE
380 #else
381 #define GPU_RT_STUB NEVER_INLINE __attribute__((optnone))
382 #endif
383 
384 extern "C" GPU_RT_STUB void agg_count_distinct_bitmap_gpu(int64_t*,
385  const int64_t,
386  const int64_t,
387  const int64_t,
388  const int64_t,
389  const int64_t,
390  const uint64_t,
391  const uint64_t) {}
392 
393 extern "C" RUNTIME_EXPORT NEVER_INLINE void
394 agg_approximate_count_distinct(int64_t* agg, const int64_t key, const uint32_t b) {
395  const uint64_t hash = MurmurHash64A(&key, sizeof(key), 0);
396  const uint32_t index = hash >> (64 - b);
397  const uint8_t rank = get_rank(hash << b, 64 - b);
398  uint8_t* M = reinterpret_cast<uint8_t*>(*agg);
399  M[index] = std::max(M[index], rank);
400 }
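// Annotation (not in the original source): this is the HyperLogLog register
// update. The top b bits of the 64-bit MurmurHash select one of 2^b registers,
// get_rank() scores the remaining hash bits by their leading-zero run, and the
// register keeps the maximum rank observed; the cardinality estimate itself is
// computed elsewhere from the register array.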
401 
402 extern "C" GPU_RT_STUB void agg_approximate_count_distinct_gpu(int64_t*,
403  const int64_t,
404  const uint32_t,
405  const int64_t,
406  const int64_t) {}
407 
408 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int8_t bit_is_set(const int8_t* bitset,
409  const int64_t val,
410  const int64_t min_val,
411  const int64_t max_val,
412  const int64_t null_val,
413  const int8_t null_bool_val) {
414  if (val == null_val) {
415  return null_bool_val;
416  }
417  if (val < min_val || val > max_val) {
418  return 0;
419  }
420  if (!bitset) {
421  return 0;
422  }
423  const uint64_t bitmap_idx = val - min_val;
424  return bitset[bitmap_idx >> 3] & (1 << (bitmap_idx & 7)) ? 1 : 0;
425 }
426 
427 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t
428 compute_int64_t_lower_bound(const int64_t entry_cnt,
429  const int64_t target_value,
430  const int64_t* col_buf) {
431  int64_t l = 0;
432  int64_t h = entry_cnt - 1;
433  while (l < h) {
434  int64_t mid = l + (h - l) / 2;
435  if (target_value < col_buf[mid]) {
436  h = mid;
437  } else {
438  l = mid + 1;
439  }
440  }
441  return l;
442 }
443 
444 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t
445 get_valid_buf_start_pos(const int64_t null_start_pos, const int64_t null_end_pos) {
446  return null_start_pos == 0 ? null_end_pos + 1 : 0;
447 }
448 
449 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t
450 get_valid_buf_end_pos(const int64_t num_elems,
451  const int64_t null_start_pos,
452  const int64_t null_end_pos) {
453  return null_end_pos == num_elems ? null_start_pos : num_elems;
454 }
455 
456 template <typename T, typename Comparator>
457 inline int64_t compute_current_row_idx_in_frame(const int64_t num_elems,
458  const int64_t cur_row_idx,
459  const T* col_buf,
460  const int32_t* partition_rowid_buf,
461  const int64_t* ordered_index_buf,
462  const T null_val,
463  const bool nulls_first,
464  const int64_t null_start_pos,
465  const int64_t null_end_pos,
466  Comparator cmp) {
467  const auto target_value = col_buf[cur_row_idx];
468  if (target_value == null_val) {
469  for (int64_t target_offset = null_start_pos; target_offset < null_end_pos;
470  target_offset++) {
471  const auto candidate_offset = partition_rowid_buf[ordered_index_buf[target_offset]];
472  if (candidate_offset == cur_row_idx) {
473  return target_offset;
474  }
475  }
476  }
477  auto const modified_null_end_pos = nulls_first ? null_end_pos - 1 : null_end_pos;
478  int64_t l = get_valid_buf_start_pos(null_start_pos, modified_null_end_pos);
479  int64_t h = get_valid_buf_end_pos(num_elems, null_start_pos, modified_null_end_pos);
480  while (l < h) {
481  int64_t mid = l + (h - l) / 2;
482  auto const target_row_idx = partition_rowid_buf[ordered_index_buf[mid]];
483  auto const cur_value = col_buf[target_row_idx];
484  if (cmp(target_value, cur_value)) {
485  h = mid;
486  } else {
487  l = mid + 1;
488  }
489  }
490  int64_t target_offset = l;
491  int64_t candidate_row_idx = partition_rowid_buf[ordered_index_buf[target_offset]];
492  while (col_buf[candidate_row_idx] == target_value && target_offset < num_elems) {
493  if (candidate_row_idx == cur_row_idx) {
494  return target_offset;
495  }
496  candidate_row_idx = partition_rowid_buf[ordered_index_buf[++target_offset]];
497  }
498  return -1;
499 }
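// Annotation (not in the original source): the search above works in three stages.
// Rows whose ordering key is NULL are scanned linearly inside the
// [null_start_pos, null_end_pos) block, non-NULL keys are located with a binary
// search over the ordered partition index, and a final linear scan over equal keys
// pins down the exact row; -1 signals that the row was not found.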
500 
501 #define DEF_COMPUTE_CURRENT_ROW_IDX_IN_FRAME(value_type, oper_name) \
502  extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t \
503  compute_##value_type##_##oper_name##_current_row_idx_in_frame( \
504  const int64_t num_elems, \
505  const int64_t cur_row_idx, \
506  const value_type* col_buf, \
507  const int32_t* partition_rowid_buf, \
508  const int64_t* ordered_index_buf, \
509  const value_type null_val, \
510  const bool nulls_first, \
511  const int64_t null_start_pos, \
512  const int64_t null_end_pos) { \
513  return compute_current_row_idx_in_frame<value_type>(num_elems, \
514  cur_row_idx, \
515  col_buf, \
516  partition_rowid_buf, \
517  ordered_index_buf, \
518  null_val, \
519  nulls_first, \
520  null_start_pos, \
521  null_end_pos, \
522  std::oper_name<value_type>{}); \
523  }
524 #define DEF_COMPUTE_CURRENT_ROW_IDX_IN_FRAME_ALL_TYPES(oper_name) \
525  DEF_COMPUTE_CURRENT_ROW_IDX_IN_FRAME(int8_t, oper_name) \
526  DEF_COMPUTE_CURRENT_ROW_IDX_IN_FRAME(int16_t, oper_name) \
527  DEF_COMPUTE_CURRENT_ROW_IDX_IN_FRAME(int32_t, oper_name) \
528  DEF_COMPUTE_CURRENT_ROW_IDX_IN_FRAME(int64_t, oper_name) \
529  DEF_COMPUTE_CURRENT_ROW_IDX_IN_FRAME(float, oper_name) \
530  DEF_COMPUTE_CURRENT_ROW_IDX_IN_FRAME(double, oper_name)
531 
534 
535 #undef DEF_COMPUTE_CURRENT_ROW_IDX_IN_FRAME_ALL_TYPES
536 #undef DEF_COMPUTE_CURRENT_ROW_IDX_IN_FRAME
537 
538 template <typename TARGET_VAL_TYPE, typename COL_TYPE, typename NULL_TYPE>
539 inline int64_t compute_lower_bound_from_ordered_partition_index(
540  const int64_t num_elems,
541  const TARGET_VAL_TYPE target_val,
542  const COL_TYPE* col_buf,
543  const int32_t* partition_rowid_buf,
544  const int64_t* ordered_index_buf,
545  const NULL_TYPE null_val,
546  const bool nulls_first,
547  const int64_t null_start_offset,
548  const int64_t null_end_offset) {
549  if (target_val == null_val) {
550  return null_start_offset;
551  }
552  auto const modified_null_end_pos = nulls_first ? null_end_offset - 1 : null_end_offset;
553  int64_t l = get_valid_buf_start_pos(null_start_offset, modified_null_end_pos);
554  int64_t h = get_valid_buf_end_pos(num_elems, null_start_offset, modified_null_end_pos);
555  while (l < h) {
556  int64_t mid = l + (h - l) / 2;
557  if (target_val <= col_buf[partition_rowid_buf[ordered_index_buf[mid]]]) {
558  h = mid;
559  } else {
560  l = mid + 1;
561  }
562  }
563  return l;
564 }
565 
566 #define DEF_RANGE_MODE_FRAME_LOWER_BOUND( \
567  target_val_type, col_type, null_type, opname, opsym) \
568  extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t \
569  range_mode_##target_val_type##_##col_type##_##null_type##_##opname##_frame_lower_bound( \
570  const int64_t num_elems, \
571  const target_val_type target_value, \
572  const col_type* col_buf, \
573  const int32_t* partition_rowid_buf, \
574  const int64_t* ordered_index_buf, \
575  const int64_t frame_bound_val, \
576  const null_type null_val, \
577  const bool nulls_first, \
578  const int64_t null_start_pos, \
579  const int64_t null_end_pos) { \
580  if (target_value == null_val) { \
581  return null_start_pos; \
582  } \
583  target_val_type new_val = target_value opsym frame_bound_val; \
584  return compute_lower_bound_from_ordered_partition_index<target_val_type, \
585  col_type, \
586  null_type>( \
587  num_elems, \
588  new_val, \
589  col_buf, \
590  partition_rowid_buf, \
591  ordered_index_buf, \
592  null_val, \
593  nulls_first, \
594  null_start_pos, \
595  null_end_pos); \
596  }
597 DEF_RANGE_MODE_FRAME_LOWER_BOUND(int8_t, int8_t, int8_t, add, +)
598 DEF_RANGE_MODE_FRAME_LOWER_BOUND(int8_t, int8_t, int8_t, sub, -)
599 DEF_RANGE_MODE_FRAME_LOWER_BOUND(int16_t, int16_t, int16_t, add, +)
600 DEF_RANGE_MODE_FRAME_LOWER_BOUND(int16_t, int16_t, int16_t, sub, -)
601 DEF_RANGE_MODE_FRAME_LOWER_BOUND(int16_t, int16_t, int64_t, add, +)
602 DEF_RANGE_MODE_FRAME_LOWER_BOUND(int16_t, int16_t, int64_t, sub, -)
603 DEF_RANGE_MODE_FRAME_LOWER_BOUND(int32_t, int32_t, int32_t, add, +)
604 DEF_RANGE_MODE_FRAME_LOWER_BOUND(int32_t, int32_t, int32_t, sub, -)
605 DEF_RANGE_MODE_FRAME_LOWER_BOUND(int32_t, int32_t, int64_t, add, +)
606 DEF_RANGE_MODE_FRAME_LOWER_BOUND(int32_t, int32_t, int64_t, sub, -)
607 DEF_RANGE_MODE_FRAME_LOWER_BOUND(int64_t, int16_t, int64_t, add, +)
608 DEF_RANGE_MODE_FRAME_LOWER_BOUND(int64_t, int16_t, int64_t, sub, -)
609 DEF_RANGE_MODE_FRAME_LOWER_BOUND(int64_t, int32_t, int64_t, add, +)
610 DEF_RANGE_MODE_FRAME_LOWER_BOUND(int64_t, int32_t, int64_t, sub, -)
611 DEF_RANGE_MODE_FRAME_LOWER_BOUND(int64_t, int64_t, int64_t, add, +)
612 DEF_RANGE_MODE_FRAME_LOWER_BOUND(int64_t, int64_t, int64_t, sub, -)
613 DEF_RANGE_MODE_FRAME_LOWER_BOUND(float, float, float, add, +)
614 DEF_RANGE_MODE_FRAME_LOWER_BOUND(float, float, float, sub, -)
615 DEF_RANGE_MODE_FRAME_LOWER_BOUND(double, double, double, add, +)
616 DEF_RANGE_MODE_FRAME_LOWER_BOUND(double, double, double, sub, -)
617 #undef DEF_RANGE_MODE_FRAME_LOWER_BOUND
618 
619 template <typename TARGET_VAL_TYPE, typename COL_TYPE, typename NULL_TYPE>
620 inline int64_t compute_upper_bound_from_ordered_partition_index(
621  const int64_t num_elems,
622  const TARGET_VAL_TYPE target_val,
623  const COL_TYPE* col_buf,
624  const int32_t* partition_rowid_buf,
625  const int64_t* ordered_index_buf,
626  const NULL_TYPE null_val,
627  const bool nulls_first,
628  const int64_t null_start_offset,
629  const int64_t null_end_offset) {
630  if (target_val == null_val) {
631  return null_end_offset;
632  }
633  auto const modified_null_end_pos = nulls_first ? null_end_offset - 1 : null_end_offset;
634  int64_t l = get_valid_buf_start_pos(null_start_offset, modified_null_end_pos);
635  int64_t h = get_valid_buf_end_pos(num_elems, null_start_offset, modified_null_end_pos);
636  while (l < h) {
637  int64_t mid = l + (h - l) / 2;
638  if (target_val >= col_buf[partition_rowid_buf[ordered_index_buf[mid]]]) {
639  l = mid + 1;
640  } else {
641  h = mid;
642  }
643  }
644  return l;
645 }
646 
647 #define DEF_RANGE_MODE_FRAME_UPPER_BOUND( \
648  target_val_type, col_type, null_type, opname, opsym) \
649  extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t \
650  range_mode_##target_val_type##_##col_type##_##null_type##_##opname##_frame_upper_bound( \
651  const int64_t num_elems, \
652  const target_val_type target_value, \
653  const col_type* col_buf, \
654  const int32_t* partition_rowid_buf, \
655  const int64_t* ordered_index_buf, \
656  const int64_t frame_bound_val, \
657  const null_type null_val, \
658  const bool nulls_first, \
659  const int64_t null_start_pos, \
660  const int64_t null_end_pos) { \
661  if (target_value == null_val) { \
662  return null_end_pos; \
663  } \
664  target_val_type new_val = target_value opsym frame_bound_val; \
665  return compute_upper_bound_from_ordered_partition_index<target_val_type, \
666  col_type, \
667  null_type>( \
668  num_elems, \
669  new_val, \
670  col_buf, \
671  partition_rowid_buf, \
672  ordered_index_buf, \
673  null_val, \
674  nulls_first, \
675  null_start_pos, \
676  null_end_pos); \
677  }
678 DEF_RANGE_MODE_FRAME_UPPER_BOUND(int8_t, int8_t, int8_t, add, +)
679 DEF_RANGE_MODE_FRAME_UPPER_BOUND(int8_t, int8_t, int8_t, sub, -)
680 DEF_RANGE_MODE_FRAME_UPPER_BOUND(int16_t, int16_t, int16_t, add, +)
681 DEF_RANGE_MODE_FRAME_UPPER_BOUND(int16_t, int16_t, int16_t, sub, -)
682 DEF_RANGE_MODE_FRAME_UPPER_BOUND(int16_t, int16_t, int64_t, add, +)
683 DEF_RANGE_MODE_FRAME_UPPER_BOUND(int16_t, int16_t, int64_t, sub, -)
684 DEF_RANGE_MODE_FRAME_UPPER_BOUND(int32_t, int32_t, int32_t, add, +)
685 DEF_RANGE_MODE_FRAME_UPPER_BOUND(int32_t, int32_t, int32_t, sub, -)
686 DEF_RANGE_MODE_FRAME_UPPER_BOUND(int32_t, int32_t, int64_t, add, +)
687 DEF_RANGE_MODE_FRAME_UPPER_BOUND(int32_t, int32_t, int64_t, sub, -)
688 DEF_RANGE_MODE_FRAME_UPPER_BOUND(int64_t, int16_t, int64_t, add, +)
689 DEF_RANGE_MODE_FRAME_UPPER_BOUND(int64_t, int16_t, int64_t, sub, -)
690 DEF_RANGE_MODE_FRAME_UPPER_BOUND(int64_t, int32_t, int64_t, add, +)
691 DEF_RANGE_MODE_FRAME_UPPER_BOUND(int64_t, int32_t, int64_t, sub, -)
692 DEF_RANGE_MODE_FRAME_UPPER_BOUND(int64_t, int64_t, int64_t, add, +)
693 DEF_RANGE_MODE_FRAME_UPPER_BOUND(int64_t, int64_t, int64_t, sub, -)
694 DEF_RANGE_MODE_FRAME_UPPER_BOUND(float, float, float, add, +)
695 DEF_RANGE_MODE_FRAME_UPPER_BOUND(float, float, float, sub, -)
696 DEF_RANGE_MODE_FRAME_UPPER_BOUND(double, double, double, add, +)
697 DEF_RANGE_MODE_FRAME_UPPER_BOUND(double, double, double, sub, -)
698 #undef DEF_RANGE_MODE_FRAME_UPPER_BOUND
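// Annotation (not in the original source): for RANGE-mode window frames the
// generated helpers shift the current row's ordering value by the frame bound
// (target_value +/- frame_bound_val) and then binary-search the ordered partition
// for the first position whose value is >= the shifted value (lower bound) or
// strictly greater than it (upper bound); rows with a NULL ordering key are mapped
// straight to the precomputed null segment.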
699 
700 template <typename COL_TYPE, typename LOGICAL_TYPE>
701 inline LOGICAL_TYPE get_value_in_window_frame(const int64_t target_row_idx_in_frame,
702  const int64_t frame_start_offset,
703  const int64_t frame_end_offset,
704  const COL_TYPE* col_buf,
705  const int32_t* partition_rowid_buf,
706  const int64_t* ordered_index_buf,
707  const LOGICAL_TYPE logical_null_val,
708  const LOGICAL_TYPE col_null_val) {
709  if (target_row_idx_in_frame < frame_start_offset ||
710  target_row_idx_in_frame > frame_end_offset) {
711  return logical_null_val;
712  }
713  const auto target_offset =
714  partition_rowid_buf[ordered_index_buf[target_row_idx_in_frame]];
715  LOGICAL_TYPE target_val = col_buf[target_offset];
716  if (target_val == col_null_val) {
717  return logical_null_val;
718  }
719  return target_val;
720 }
721 
722 #define DEF_GET_VALUE_IN_FRAME(col_type, logical_type) \
723  extern "C" RUNTIME_EXPORT ALWAYS_INLINE logical_type \
724  get_##col_type##_value_##logical_type##_type_in_frame( \
725  const int64_t target_row_idx_in_frame, \
726  const int64_t frame_start_offset, \
727  const int64_t frame_end_offset, \
728  const col_type* col_buf, \
729  const int32_t* partition_rowid_buf, \
730  const int64_t* ordered_index_buf, \
731  const logical_type logical_null_val, \
732  const logical_type col_null_val) { \
733  return get_value_in_window_frame<col_type, logical_type>(target_row_idx_in_frame, \
734  frame_start_offset, \
735  frame_end_offset, \
736  col_buf, \
737  partition_rowid_buf, \
738  ordered_index_buf, \
739  logical_null_val, \
740  col_null_val); \
741  }
742 DEF_GET_VALUE_IN_FRAME(int8_t, int8_t)
743 DEF_GET_VALUE_IN_FRAME(int8_t, int16_t)
744 DEF_GET_VALUE_IN_FRAME(int8_t, int32_t)
745 DEF_GET_VALUE_IN_FRAME(int8_t, int64_t)
746 DEF_GET_VALUE_IN_FRAME(int16_t, int16_t)
747 DEF_GET_VALUE_IN_FRAME(int16_t, int32_t)
748 DEF_GET_VALUE_IN_FRAME(int16_t, int64_t)
749 DEF_GET_VALUE_IN_FRAME(int32_t, int32_t)
750 DEF_GET_VALUE_IN_FRAME(int32_t, int64_t)
751 DEF_GET_VALUE_IN_FRAME(int64_t, int64_t)
752 DEF_GET_VALUE_IN_FRAME(float, float)
753 DEF_GET_VALUE_IN_FRAME(double, double)
754 #undef DEF_GET_VALUE_IN_FRAME
755 
756 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t encode_date(int64_t decoded_val,
757  int64_t null_val,
758  int64_t multiplier) {
759  return decoded_val == null_val ? decoded_val : decoded_val * multiplier;
760 }
761 
762 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t
763 compute_row_mode_start_index_sub(int64_t candidate_index,
764  int64_t current_partition_start_offset,
765  int64_t frame_bound) {
766  int64_t index = candidate_index - current_partition_start_offset - frame_bound;
767  return index < 0 ? 0 : index;
768 }
769 
770 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t
771 compute_row_mode_start_index_add(int64_t candidate_index,
772  int64_t current_partition_start_offset,
773  int64_t frame_bound,
774  int64_t num_current_partition_elem) {
775  int64_t index = candidate_index - current_partition_start_offset + frame_bound;
776  return index >= num_current_partition_elem ? num_current_partition_elem : index;
777 }
778 
779 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t
780 compute_row_mode_end_index_sub(int64_t candidate_index,
781  int64_t current_partition_start_offset,
782  int64_t frame_bound) {
783  int64_t index = candidate_index - current_partition_start_offset - frame_bound;
784  return index < 0 ? 0 : index + 1;
785 }
786 
787 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t
788 compute_row_mode_end_index_add(int64_t candidate_index,
789  int64_t current_partition_start_offset,
790  int64_t frame_bound,
791  int64_t num_current_partition_elem) {
792  int64_t index = candidate_index - current_partition_start_offset + frame_bound;
793  return index >= num_current_partition_elem ? num_current_partition_elem : index + 1;
794 }
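// Worked example (annotation): these helpers clamp ROWS-mode frame boundaries to
// the partition. For bounds of 2 preceding and 3 following, with candidate_index 5
// and partition start 0, compute_row_mode_start_index_sub(5, 0, 2) == 3 and
// compute_row_mode_end_index_add(5, 0, 3, num_elems) == 9 (assuming num_elems > 8),
// so the frame covers partition indices [3, 9).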
795 
796 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t* get_integer_aggregation_tree(
797  int64_t** aggregation_trees,
798  size_t partition_idx) {
799  return aggregation_trees[partition_idx];
800 }
801 
802 extern "C" RUNTIME_EXPORT ALWAYS_INLINE double* get_double_aggregation_tree(
803  int64_t** aggregation_trees,
804  size_t partition_idx) {
805  double** casted_aggregation_trees = reinterpret_cast<double**>(aggregation_trees);
806  return casted_aggregation_trees[partition_idx];
807 }
808 
809 extern "C" RUNTIME_EXPORT ALWAYS_INLINE SumAndCountPair<int64_t>*
810 get_integer_derived_aggregation_tree(int64_t** aggregation_trees, size_t partition_idx) {
811  SumAndCountPair<int64_t>** casted_aggregation_trees =
812  reinterpret_cast<SumAndCountPair<int64_t>**>(aggregation_trees);
813  return casted_aggregation_trees[partition_idx];
814 }
815 
816 extern "C" RUNTIME_EXPORT ALWAYS_INLINE SumAndCountPair<double>*
817 get_double_derived_aggregation_tree(int64_t** aggregation_trees, size_t partition_idx) {
818  SumAndCountPair<double>** casted_aggregation_trees =
819  reinterpret_cast<SumAndCountPair<double>**>(aggregation_trees);
820  return casted_aggregation_trees[partition_idx];
821 }
822 
823 extern "C" RUNTIME_EXPORT ALWAYS_INLINE size_t
824 getStartOffsetForSegmentTreeTraversal(size_t level, size_t tree_fanout) {
825  size_t offset = 0;
826  for (size_t i = 0; i < level; i++) {
827  offset += pow(tree_fanout, i);
828  }
829  return offset;
830 }
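// Worked example (annotation): the loop sums 1 + fanout + ... + fanout^(level-1),
// i.e. the number of nodes stored above `level` in the flattened tree. For
// tree_fanout = 2 and level = 3 the traversal therefore starts at offset
// 1 + 2 + 4 = 7.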
831 namespace {
832 enum class AggFuncType { MIN, MAX, SUM };
833 
834 template <AggFuncType AGG_FUNC_TYPE, typename AGG_TYPE>
835 inline AGG_TYPE agg_func(AGG_TYPE const lhs, AGG_TYPE const rhs) {
836  if constexpr (AGG_FUNC_TYPE == AggFuncType::MIN) {
837  return std::min(lhs, rhs);
838  } else if constexpr (AGG_FUNC_TYPE == AggFuncType::MAX) {
839  return std::max(lhs, rhs);
840  } else {
841  return lhs + rhs;
842  }
843 }
844 } // namespace
845 
846 template <AggFuncType AGG_FUNC_TYPE, typename AGG_TYPE>
847 inline AGG_TYPE compute_window_func_via_aggregation_tree(
848  AGG_TYPE* aggregation_tree_for_partition,
849  size_t query_range_start_idx,
850  size_t query_range_end_idx,
851  size_t leaf_level,
852  size_t tree_fanout,
853  AGG_TYPE init_val,
854  AGG_TYPE invalid_val,
855  AGG_TYPE null_val) {
856  size_t leaf_start_idx = getStartOffsetForSegmentTreeTraversal(leaf_level, tree_fanout);
857  size_t begin = leaf_start_idx + query_range_start_idx;
858  size_t end = leaf_start_idx + query_range_end_idx;
859  AGG_TYPE res = init_val;
860  bool all_nulls = true;
861  for (int level = leaf_level; level >= 0; level--) {
862  size_t parentBegin = begin / tree_fanout;
863  size_t parentEnd = (end - 1) / tree_fanout;
864  if (parentBegin == parentEnd) {
865  for (size_t pos = begin; pos < end; pos++) {
866  if (aggregation_tree_for_partition[pos] != null_val) {
867  all_nulls = false;
868  res = agg_func<AGG_FUNC_TYPE>(res, aggregation_tree_for_partition[pos]);
869  }
870  }
871  return all_nulls ? null_val : res;
872  } else if (parentBegin > parentEnd) {
873  return null_val;
874  }
875  size_t group_begin = (parentBegin * tree_fanout) + 1;
876  if (begin != group_begin) {
877  size_t limit = (parentBegin * tree_fanout) + tree_fanout + 1;
878  for (size_t pos = begin; pos < limit; pos++) {
879  if (aggregation_tree_for_partition[pos] != null_val) {
880  all_nulls = false;
881  res = agg_func<AGG_FUNC_TYPE>(res, aggregation_tree_for_partition[pos]);
882  }
883  }
884  parentBegin++;
885  }
886  size_t group_end = (parentEnd * tree_fanout) + 1;
887  if (end != group_end) {
888  for (size_t pos = group_end; pos < end; pos++) {
889  if (aggregation_tree_for_partition[pos] != null_val) {
890  all_nulls = false;
891  res = agg_func<AGG_FUNC_TYPE>(res, aggregation_tree_for_partition[pos]);
892  }
893  }
894  }
895  begin = parentBegin;
896  end = parentEnd;
897  }
898  return invalid_val;
899 }
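// Annotation (not in the original source): this is a standard bottom-up
// segment-tree range query. At each level the partially covered prefix and suffix
// of the query range are aggregated directly, the range is narrowed to the parent
// nodes, and the loop terminates once both endpoints share a parent; NULL leaves
// are skipped, so an all-NULL range yields null_val.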
900 
901 #define DEF_SEARCH_AGGREGATION_TREE(agg_value_type) \
902  extern "C" RUNTIME_EXPORT ALWAYS_INLINE agg_value_type \
903  search_##agg_value_type##_aggregation_tree( \
904  agg_value_type* aggregated_tree_for_partition, \
905  size_t query_range_start_idx, \
906  size_t query_range_end_idx, \
907  size_t leaf_level, \
908  size_t tree_fanout, \
909  bool decimal_type, \
910  size_t scale, \
911  agg_value_type invalid_val, \
912  agg_value_type null_val, \
913  int32_t agg_type) { \
914  if (!aggregated_tree_for_partition || query_range_start_idx > query_range_end_idx) { \
915  return null_val; \
916  } \
917  switch (static_cast<AggFuncType>(agg_type)) { \
918  case AggFuncType::MIN: \
919  return compute_window_func_via_aggregation_tree<AggFuncType::MIN>( \
920  aggregated_tree_for_partition, \
921  query_range_start_idx, \
922  query_range_end_idx, \
923  leaf_level, \
924  tree_fanout, \
925  std::numeric_limits<agg_value_type>::max(), \
926  invalid_val, \
927  null_val); \
928  case AggFuncType::MAX: \
929  return compute_window_func_via_aggregation_tree<AggFuncType::MAX>( \
930  aggregated_tree_for_partition, \
931  query_range_start_idx, \
932  query_range_end_idx, \
933  leaf_level, \
934  tree_fanout, \
935  std::numeric_limits<agg_value_type>::lowest(), \
936  invalid_val, \
937  null_val); \
938  default: \
939  return compute_window_func_via_aggregation_tree<AggFuncType::SUM>( \
940  aggregated_tree_for_partition, \
941  query_range_start_idx, \
942  query_range_end_idx, \
943  leaf_level, \
944  tree_fanout, \
945  static_cast<agg_value_type>(0), \
946  invalid_val, \
947  null_val); \
948  } \
949  }
950 
951 DEF_SEARCH_AGGREGATION_TREE(int64_t)
952 DEF_SEARCH_AGGREGATION_TREE(double)
953 #undef DEF_SEARCH_AGGREGATION_TREE
954 
955 template <typename AGG_VALUE_TYPE>
956 inline void compute_derived_aggregates(
957  SumAndCountPair<AGG_VALUE_TYPE>* aggregation_tree_for_partition,
958  SumAndCountPair<AGG_VALUE_TYPE>& res,
959  size_t query_range_start_idx,
960  size_t query_range_end_idx,
961  size_t leaf_level,
962  size_t tree_fanout,
963  AGG_VALUE_TYPE invalid_val,
964  AGG_VALUE_TYPE null_val) {
965  size_t leaf_start_idx = getStartOffsetForSegmentTreeTraversal(leaf_level, tree_fanout);
966  size_t begin = leaf_start_idx + query_range_start_idx;
967  size_t end = leaf_start_idx + query_range_end_idx;
968  SumAndCountPair<AGG_VALUE_TYPE> null_res{null_val, 0};
969  SumAndCountPair<AGG_VALUE_TYPE> invalid_res{invalid_val, 0};
970  bool all_nulls = true;
971  for (int level = leaf_level; level >= 0; level--) {
972  size_t parentBegin = begin / tree_fanout;
973  size_t parentEnd = (end - 1) / tree_fanout;
974  if (parentBegin == parentEnd) {
975  for (size_t pos = begin; pos < end; pos++) {
976  if (aggregation_tree_for_partition[pos].sum != null_val) {
977  all_nulls = false;
978  res.sum += aggregation_tree_for_partition[pos].sum;
979  res.count += aggregation_tree_for_partition[pos].count;
980  }
981  }
982  if (all_nulls) {
983  res = null_res;
984  }
985  return;
986  } else if (parentBegin > parentEnd) {
987  res = null_res;
988  return;
989  }
990  size_t group_begin = (parentBegin * tree_fanout) + 1;
991  if (begin != group_begin) {
992  size_t limit = (parentBegin * tree_fanout) + tree_fanout + 1;
993  for (size_t pos = begin; pos < limit; pos++) {
994  if (aggregation_tree_for_partition[pos].sum != null_val) {
995  all_nulls = false;
996  res.sum += aggregation_tree_for_partition[pos].sum;
997  res.count += aggregation_tree_for_partition[pos].count;
998  }
999  }
1000  parentBegin++;
1001  }
1002  size_t group_end = (parentEnd * tree_fanout) + 1;
1003  if (end != group_end) {
1004  for (size_t pos = group_end; pos < end; pos++) {
1005  if (aggregation_tree_for_partition[pos].sum != null_val) {
1006  all_nulls = false;
1007  res.sum += aggregation_tree_for_partition[pos].sum;
1008  res.count += aggregation_tree_for_partition[pos].count;
1009  }
1010  }
1011  }
1012  begin = parentBegin;
1013  end = parentEnd;
1014  }
1015  res = invalid_res;
1016  return;
1017 }
1018 
1019 #define DEF_SEARCH_DERIVED_AGGREGATION_TREE(agg_value_type) \
1020  extern "C" RUNTIME_EXPORT ALWAYS_INLINE double \
1021  search_##agg_value_type##_derived_aggregation_tree( \
1022  SumAndCountPair<agg_value_type>* aggregated_tree_for_partition, \
1023  size_t query_range_start_idx, \
1024  size_t query_range_end_idx, \
1025  size_t leaf_level, \
1026  size_t tree_fanout, \
1027  bool decimal_type, \
1028  size_t scale, \
1029  agg_value_type invalid_val, \
1030  agg_value_type null_val, \
1031  int32_t agg_type) { \
1032  if (!aggregated_tree_for_partition || query_range_start_idx > query_range_end_idx) { \
1033  return null_val; \
1034  } \
1035  SumAndCountPair<agg_value_type> res{0, 0}; \
1036  compute_derived_aggregates<agg_value_type>(aggregated_tree_for_partition, \
1037  res, \
1038  query_range_start_idx, \
1039  query_range_end_idx, \
1040  leaf_level, \
1041  tree_fanout, \
1042  invalid_val, \
1043  null_val); \
1044  if (res.sum == null_val) { \
1045  return null_val; \
1046  } else if (res.count > 0) { \
1047  if (decimal_type) { \
1048  return (static_cast<double>(res.sum) / pow(10, scale)) / res.count; \
1049  } \
1050  return (static_cast<double>(res.sum)) / res.count; \
1051  } else { \
1052  return invalid_val; \
1053  } \
1054  }
1055 
1056 DEF_SEARCH_DERIVED_AGGREGATION_TREE(int64_t)
1057 DEF_SEARCH_DERIVED_AGGREGATION_TREE(double)
1058 #undef DEF_SEARCH_DERIVED_AGGREGATION_TREE
1059 
1060 #define DEF_HANDLE_NULL_FOR_WINDOW_FRAMING_AGG(agg_type, null_type) \
1061  extern "C" RUNTIME_EXPORT ALWAYS_INLINE agg_type \
1062  handle_null_val_##agg_type##_##null_type##_window_framing_agg( \
1063  agg_type res, null_type agg_null_val, agg_type input_col_null_val) { \
1064  if (res == agg_null_val) { \
1065  return input_col_null_val; \
1066  } \
1067  return res; \
1068  }
1072 #undef DEF_HANDLE_NULL_FOR_WINDOW_FRAMING_AGG
1073 
1074 template <typename T>
1075 T fill_missing_value(int64_t const cur_idx,
1076  T const null_val,
1077  T* const col_buf,
1078  int64_t const num_elems_in_partition,
1079  int32_t* const partition_rowid_buf,
1080  int64_t* const ordered_index_buf,
1081  bool const is_forward_fill) {
1082  T const cur_val = col_buf[partition_rowid_buf[ordered_index_buf[cur_idx]]];
1083  if (cur_val == null_val) {
1084  if (is_forward_fill) {
1085  for (int64_t cand_idx = cur_idx - 1; cand_idx >= 0; --cand_idx) {
1086  T const candidate_val = col_buf[partition_rowid_buf[ordered_index_buf[cand_idx]]];
1087  if (candidate_val != null_val) {
1088  return candidate_val;
1089  }
1090  }
1091  } else {
1092  for (int64_t cand_idx = cur_idx + 1; cand_idx < num_elems_in_partition;
1093  ++cand_idx) {
1094  T const candidate_val = col_buf[partition_rowid_buf[ordered_index_buf[cand_idx]]];
1095  if (candidate_val != null_val) {
1096  return candidate_val;
1097  }
1098  }
1099  }
1100  }
1101  return cur_val;
1102 }
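// Annotation (not in the original source): when the current row's value is the
// NULL sentinel, this walks the ordered partition backwards (forward fill) or
// forwards (backward fill) and returns the nearest non-NULL value; if every
// candidate is NULL, the original sentinel is returned unchanged.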
1103 #define DEF_FILL_MISSING_VALUE(col_type) \
1104  extern "C" RUNTIME_EXPORT ALWAYS_INLINE col_type fill_##col_type##_missing_value( \
1105  int64_t const cur_row_idx_in_frame, \
1106  col_type const null_val, \
1107  col_type* const col_buf, \
1108  int64_t const num_elems_in_partition, \
1109  int32_t* const partition_rowid_buf, \
1110  int64_t* const ordered_index_buf, \
1111  bool const is_forward_fill) { \
1112  return fill_missing_value<col_type>(cur_row_idx_in_frame, \
1113  null_val, \
1114  col_buf, \
1115  num_elems_in_partition, \
1116  partition_rowid_buf, \
1117  ordered_index_buf, \
1118  is_forward_fill); \
1119  }
1120 DEF_FILL_MISSING_VALUE(int8_t)
1121 DEF_FILL_MISSING_VALUE(int16_t)
1122 DEF_FILL_MISSING_VALUE(int32_t)
1123 DEF_FILL_MISSING_VALUE(int64_t)
1124 DEF_FILL_MISSING_VALUE(float)
1125 DEF_FILL_MISSING_VALUE(double)
1126 #undef DEF_FILL_MISSING_VALUE
1127 
1128 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t agg_sum(int64_t* agg, const int64_t val) {
1129  const auto old = *agg;
1130  *agg += val;
1131  return old;
1132 }
1133 
1134 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t agg_sum_if(int64_t* agg,
1135  const int64_t val,
1136  const int8_t cond) {
1137  return cond ? agg_sum(agg, val) : *agg;
1138 }
1139 
1140 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_max(int64_t* agg, const int64_t val) {
1141  *agg = std::max(*agg, val);
1142 }
1143 
1144 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_min(int64_t* agg, const int64_t val) {
1145  *agg = std::min(*agg, val);
1146 }
1147 
1148 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_id(int64_t* agg, const int64_t val) {
1149  *agg = val;
1150 }
1151 
1152 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int8_t* agg_id_varlen(int8_t* varlen_buffer,
1153  const int64_t offset,
1154  const int8_t* value,
1155  const int64_t size_bytes) {
1156  for (auto i = 0; i < size_bytes; i++) {
1157  varlen_buffer[offset + i] = value[i];
1158  }
1159  return &varlen_buffer[offset];
1160 }
1161 
1162 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int32_t
1163 checked_single_agg_id(int64_t* agg, const int64_t val, const int64_t null_val) {
1164  if (val == null_val) {
1165  return 0;
1166  }
1167 
1168  if (*agg == val) {
1169  return 0;
1170  } else if (*agg == null_val) {
1171  *agg = val;
1172  return 0;
1173  } else {
1174  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
1175  return 15;
1176  }
1177 }
1178 
1179 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_count_distinct_bitmap_skip_val(
1180  int64_t* agg,
1181  const int64_t val,
1182  const int64_t min_val,
1183  const int64_t bucket_size,
1184  const int64_t skip_val) {
1185  if (val != skip_val) {
1186  agg_count_distinct_bitmap(agg, val, min_val, bucket_size);
1187  }
1188 }
1189 
1190 extern "C" GPU_RT_STUB void agg_count_distinct_bitmap_skip_val_gpu(int64_t*,
1191  const int64_t,
1192  const int64_t,
1193  const int64_t,
1194  const int64_t,
1195  const int64_t,
1196  const int64_t,
1197  const uint64_t,
1198  const uint64_t) {}
1199 
1200 extern "C" RUNTIME_EXPORT ALWAYS_INLINE uint32_t agg_count_int32(uint32_t* agg,
1201  const int32_t) {
1202  return (*agg)++;
1203 }
1204 
1205 extern "C" RUNTIME_EXPORT ALWAYS_INLINE uint32_t agg_count_if_int32(uint32_t* agg,
1206  const int32_t cond) {
1207  return cond ? (*agg)++ : *agg;
1208 }
1209 
1210 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int32_t agg_sum_int32(int32_t* agg,
1211  const int32_t val) {
1212  const auto old = *agg;
1213  *agg += val;
1214  return old;
1215 }
1216 
1217 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int32_t agg_sum_if_int32(int32_t* agg,
1218  const int32_t val,
1219  const int8_t cond) {
1220  return cond ? agg_sum_int32(agg, val) : *agg;
1221 }
1222 
1223 #define DEF_AGG_MAX_INT(n) \
1224  extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_max_int##n(int##n##_t* agg, \
1225  const int##n##_t val) { \
1226  *agg = std::max(*agg, val); \
1227  }
1228 
1229 DEF_AGG_MAX_INT(32)
1230 DEF_AGG_MAX_INT(16)
1231 DEF_AGG_MAX_INT(8)
1232 #undef DEF_AGG_MAX_INT
1233 
1234 #define DEF_AGG_MIN_INT(n) \
1235  extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_min_int##n(int##n##_t* agg, \
1236  const int##n##_t val) { \
1237  *agg = std::min(*agg, val); \
1238  }
1239 
1240 DEF_AGG_MIN_INT(32)
1241 DEF_AGG_MIN_INT(16)
1242 DEF_AGG_MIN_INT(8)
1243 #undef DEF_AGG_MIN_INT
1244 
1245 #define DEF_AGG_ID_INT(n) \
1246  extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_id_int##n(int##n##_t* agg, \
1247  const int##n##_t val) { \
1248  *agg = val; \
1249  }
1250 
1251 #define DEF_CHECKED_SINGLE_AGG_ID_INT(n) \
1252  extern "C" RUNTIME_EXPORT ALWAYS_INLINE int32_t checked_single_agg_id_int##n( \
1253  int##n##_t* agg, const int##n##_t val, const int##n##_t null_val) { \
1254  if (val == null_val) { \
1255  return 0; \
1256  } \
1257  if (*agg == val) { \
1258  return 0; \
1259  } else if (*agg == null_val) { \
1260  *agg = val; \
1261  return 0; \
1262  } else { \
1263  /* see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES*/ \
1264  return 15; \
1265  } \
1266  }
1267 
1268 DEF_AGG_ID_INT(32)
1269 DEF_AGG_ID_INT(16)
1270 DEF_AGG_ID_INT(8)
1271 
1272 DEF_CHECKED_SINGLE_AGG_ID_INT(32)
1273 DEF_CHECKED_SINGLE_AGG_ID_INT(16)
1274 DEF_CHECKED_SINGLE_AGG_ID_INT(8)
1275 
1276 #undef DEF_AGG_ID_INT
1277 #undef DEF_CHECKED_SINGLE_AGG_ID_INT
1278 
1279 #define DEF_WRITE_PROJECTION_INT(n) \
1280  extern "C" RUNTIME_EXPORT ALWAYS_INLINE void write_projection_int##n( \
1281  int8_t* slot_ptr, const int##n##_t val, const int64_t init_val) { \
1282  if (val != init_val) { \
1283  *reinterpret_cast<int##n##_t*>(slot_ptr) = val; \
1284  } \
1285  }
1286 
1287 DEF_WRITE_PROJECTION_INT(64)
1288 DEF_WRITE_PROJECTION_INT(32)
1289 #undef DEF_WRITE_PROJECTION_INT
1290 
1291 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t agg_sum_skip_val(int64_t* agg,
1292  const int64_t val,
1293  const int64_t skip_val) {
1294  const auto old = *agg;
1295  if (val != skip_val) {
1296  if (old != skip_val) {
1297  return agg_sum(agg, val);
1298  } else {
1299  *agg = val;
1300  }
1301  }
1302  return old;
1303 }
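// Annotation (not in the original source): skip_val is the column's NULL sentinel
// and also serves as the aggregate slot's initial value, so the first non-NULL
// input overwrites the slot and later inputs are added on top; NULL inputs leave
// the running sum untouched.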
1304 
1305 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int32_t
1306 agg_sum_int32_skip_val(int32_t* agg, const int32_t val, const int32_t skip_val) {
1307  const auto old = *agg;
1308  if (val != skip_val) {
1309  if (old != skip_val) {
1310  return agg_sum_int32(agg, val);
1311  } else {
1312  *agg = val;
1313  }
1314  }
1315  return old;
1316 }
1317 
1318 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t
1319 agg_sum_if_skip_val(int64_t* agg,
1320  const int64_t val,
1321  const int64_t skip_val,
1322  const int8_t cond) {
1323  return cond ? agg_sum_skip_val(agg, val, skip_val) : *agg;
1324 }
1325 
1326 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int32_t
1327 agg_sum_if_int32_skip_val(int32_t* agg,
1328  const int32_t val,
1329  const int32_t skip_val,
1330  const int8_t cond) {
1331  return cond ? agg_sum_int32_skip_val(agg, val, skip_val) : *agg;
1332 }
1333 
1334 extern "C" RUNTIME_EXPORT ALWAYS_INLINE uint64_t agg_count_if(uint64_t* agg,
1335  const int64_t cond) {
1336  return cond ? (*agg)++ : *agg;
1337 }
1338 
1339 extern "C" RUNTIME_EXPORT ALWAYS_INLINE uint64_t
1340 agg_count_skip_val(uint64_t* agg, const int64_t val, const int64_t skip_val) {
1341  if (val != skip_val) {
1342  return agg_count(agg, val);
1343  }
1344  return *agg;
1345 }
1346 
1347 extern "C" RUNTIME_EXPORT ALWAYS_INLINE uint64_t
1348 agg_count_if_skip_val(uint64_t* agg, const int64_t cond, const int64_t skip_val) {
1349  if (cond != skip_val) {
1350  return agg_count_if(agg, cond);
1351  }
1352  return *agg;
1353 }
1354 
1355 extern "C" RUNTIME_EXPORT ALWAYS_INLINE uint32_t
1356 agg_count_int32_skip_val(uint32_t* agg, const int32_t val, const int32_t skip_val) {
1357  if (val != skip_val) {
1358  return agg_count_int32(agg, val);
1359  }
1360  return *agg;
1361 }
1362 
1363 extern "C" RUNTIME_EXPORT ALWAYS_INLINE uint32_t
1364 agg_count_if_int32_skip_val(uint32_t* agg, const int32_t cond, const int32_t skip_val) {
1365  if (cond != skip_val) {
1366  return agg_count_if_int32(agg, cond);
1367  }
1368  return *agg;
1369 }
1370 
1371 #define DEF_SKIP_AGG_ADD(base_agg_func) \
1372  extern "C" RUNTIME_EXPORT ALWAYS_INLINE void base_agg_func##_skip_val( \
1373  DATA_T* agg, const DATA_T val, const DATA_T skip_val) { \
1374  if (val != skip_val) { \
1375  base_agg_func(agg, val); \
1376  } \
1377  }
1378 
1379 #define DEF_SKIP_AGG(base_agg_func) \
1380  extern "C" RUNTIME_EXPORT ALWAYS_INLINE void base_agg_func##_skip_val( \
1381  DATA_T* agg, const DATA_T val, const DATA_T skip_val) { \
1382  if (val != skip_val) { \
1383  const DATA_T old_agg = *agg; \
1384  if (old_agg != skip_val) { \
1385  base_agg_func(agg, val); \
1386  } else { \
1387  *agg = val; \
1388  } \
1389  } \
1390  }
1391 
1392 #define DATA_T int64_t
1395 #undef DATA_T
1396 
1397 #define DATA_T int32_t
1400 #undef DATA_T
1401 
1402 #define DATA_T int16_t
1405 #undef DATA_T
1406 
1407 #define DATA_T int8_t
1410 #undef DATA_T
1411 
1412 #undef DEF_SKIP_AGG_ADD
1413 #undef DEF_SKIP_AGG
1414 
1415 // TODO(alex): fix signature
1416 
1417 extern "C" RUNTIME_EXPORT ALWAYS_INLINE uint64_t agg_count_double(uint64_t* agg,
1418  const double val) {
1419  return (*agg)++;
1420 }
1421 
1422 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_sum_double(int64_t* agg,
1423  const double val) {
1424  const auto r = *reinterpret_cast<const double*>(agg) + val;
1425  *agg = *reinterpret_cast<const int64_t*>(may_alias_ptr(&r));
1426 }
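// Annotation (not in the original source): floating-point aggregates reuse the
// 64-bit integer slot, so the value is reinterpreted in place: the slot's bits are
// read back as a double, updated, and the result's bit pattern is stored through
// may_alias_ptr to sidestep strict-aliasing issues.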
1427 
1428 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_sum_if_double(int64_t* agg,
1429  const double val,
1430  const int8_t cond) {
1431  if (cond) {
1432  agg_sum_double(agg, val);
1433  }
1434 }
1435 
1436 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_max_double(int64_t* agg,
1437  const double val) {
1438  const auto r = std::max(*reinterpret_cast<const double*>(agg), val);
1439  *agg = *(reinterpret_cast<const int64_t*>(may_alias_ptr(&r)));
1440 }
1441 
1442 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_min_double(int64_t* agg,
1443  const double val) {
1444  const auto r = std::min(*reinterpret_cast<const double*>(agg), val);
1445  *agg = *(reinterpret_cast<const int64_t*>(may_alias_ptr(&r)));
1446 }
1447 
1448 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_id_double(int64_t* agg,
1449  const double val) {
1450  *agg = *(reinterpret_cast<const int64_t*>(may_alias_ptr(&val)));
1451 }
1452 
1453 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int32_t
1454 checked_single_agg_id_double(int64_t* agg, const double val, const double null_val) {
1455  if (val == null_val) {
1456  return 0;
1457  }
1458 
1459  if (*agg == *(reinterpret_cast<const int64_t*>(may_alias_ptr(&val)))) {
1460  return 0;
1461  } else if (*agg == *(reinterpret_cast<const int64_t*>(may_alias_ptr(&null_val)))) {
1462  *agg = *(reinterpret_cast<const int64_t*>(may_alias_ptr(&val)));
1463  return 0;
1464  } else {
1465  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
1466  return 15;
1467  }
1468 }
1469 
1470 extern "C" RUNTIME_EXPORT ALWAYS_INLINE uint32_t agg_count_float(uint32_t* agg,
1471  const float val) {
1472  return (*agg)++;
1473 }
1474 
1475 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_sum_float(int32_t* agg,
1476  const float val) {
1477  const auto r = *reinterpret_cast<const float*>(agg) + val;
1478  *agg = *reinterpret_cast<const int32_t*>(may_alias_ptr(&r));
1479 }
1480 
1481 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_if_sum_float(int32_t* agg,
1482  const float val,
1483  const int8_t cond) {
1484  if (cond) {
1485  agg_sum_float(agg, val);
1486  }
1487 }
1488 
1489 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_max_float(int32_t* agg,
1490  const float val) {
1491  const auto r = std::max(*reinterpret_cast<const float*>(agg), val);
1492  *agg = *(reinterpret_cast<const int32_t*>(may_alias_ptr(&r)));
1493 }
1494 
1495 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_min_float(int32_t* agg,
1496  const float val) {
1497  const auto r = std::min(*reinterpret_cast<const float*>(agg), val);
1498  *agg = *(reinterpret_cast<const int32_t*>(may_alias_ptr(&r)));
1499 }
1500 
1501 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_id_float(int32_t* agg, const float val) {
1502  *agg = *(reinterpret_cast<const int32_t*>(may_alias_ptr(&val)));
1503 }
1504 
1505 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int32_t
1506 checked_single_agg_id_float(int32_t* agg, const float val, const float null_val) {
1507  if (val == null_val) {
1508  return 0;
1509  }
1510 
1511  if (*agg == *(reinterpret_cast<const int32_t*>(may_alias_ptr(&val)))) {
1512  return 0;
1513  } else if (*agg == *(reinterpret_cast<const int32_t*>(may_alias_ptr(&null_val)))) {
1514  *agg = *(reinterpret_cast<const int32_t*>(may_alias_ptr(&val)));
1515  return 0;
1516  } else {
1517  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
1518  return 15;
1519  }
1520 }
1521 
1522 extern "C" RUNTIME_EXPORT ALWAYS_INLINE uint64_t
1523 agg_count_double_skip_val(uint64_t* agg, const double val, const double skip_val) {
1524  if (val != skip_val) {
1525  return agg_count_double(agg, val);
1526  }
1527  return *agg;
1528 }
1529 
1530 extern "C" RUNTIME_EXPORT ALWAYS_INLINE uint32_t
1531 agg_count_float_skip_val(uint32_t* agg, const float val, const float skip_val) {
1532  if (val != skip_val) {
1533  return agg_count_float(agg, val);
1534  }
1535  return *agg;
1536 }
1537 
1538 #define DEF_SKIP_AGG(base_agg_func) \
1539  extern "C" RUNTIME_EXPORT ALWAYS_INLINE void base_agg_func##_skip_val( \
1540  ADDR_T* agg, const DATA_T val, const DATA_T skip_val) { \
1541  if (val != skip_val) { \
1542  const ADDR_T old_agg = *agg; \
1543  if (old_agg != *reinterpret_cast<const ADDR_T*>(may_alias_ptr(&skip_val))) { \
1544  base_agg_func(agg, val); \
1545  } else { \
1546  *agg = *reinterpret_cast<const ADDR_T*>(may_alias_ptr(&val)); \
1547  } \
1548  } \
1549  }
1550 
1551 #define DEF_SKIP_IF_AGG(skip_agg_func, base_agg_func) \
1552  extern "C" RUNTIME_EXPORT ALWAYS_INLINE void skip_agg_func##_skip_val( \
1553  ADDR_T* agg, const DATA_T val, const DATA_T skip_val, const int8_t cond) { \
1554  if (cond) { \
1555  base_agg_func##_skip_val(agg, val, skip_val); \
1556  } \
1557  }
1558 
1559 #define DATA_T double
1560 #define ADDR_T int64_t
1565 #undef ADDR_T
1566 #undef DATA_T
1567 
1568 #define DATA_T float
1569 #define ADDR_T int32_t
1574 #undef ADDR_T
1575 #undef DATA_T
1576 
1577 #undef DEF_SKIP_AGG
1578 #undef DEF_SKIP_IF_AGG
1579 
1580 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t decimal_floor(const int64_t x,
1581  const int64_t scale) {
1582  if (x >= 0) {
1583  return x / scale * scale;
1584  }
1585  if (!(x % scale)) {
1586  return x;
1587  }
1588  return x / scale * scale - scale;
1589 }
1590 
1591 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t decimal_ceil(const int64_t x,
1592  const int64_t scale) {
1593  return decimal_floor(x, scale) + (x % scale ? scale : 0);
1594 }
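// Worked example (annotation): with scale = 100 (two decimal digits),
// decimal_floor(1234, 100) == 1200 and decimal_floor(-1234, 100) == -1300, while
// decimal_ceil(1234, 100) == 1300 and decimal_ceil(-1234, 100) == -1200; values
// already on a scale boundary are returned unchanged.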
1595 
1596 // Shared memory aggregators. Should never be called,
1597 // real implementations are in cuda_mapd_rt.cu.
1598 #define DEF_SHARED_AGG_RET_STUBS(base_agg_func) \
1599  extern "C" GPU_RT_STUB uint64_t base_agg_func##_shared(uint64_t* agg, \
1600  const int64_t val) { \
1601  return 0; \
1602  } \
1603  \
1604  extern "C" GPU_RT_STUB uint64_t base_agg_func##_skip_val_shared( \
1605  uint64_t* agg, const int64_t val, const int64_t skip_val) { \
1606  return 0; \
1607  } \
1608  extern "C" GPU_RT_STUB uint32_t base_agg_func##_int32_shared(uint32_t* agg, \
1609  const int32_t val) { \
1610  return 0; \
1611  } \
1612  \
1613  extern "C" GPU_RT_STUB uint32_t base_agg_func##_int32_skip_val_shared( \
1614  uint32_t* agg, const int32_t val, const int32_t skip_val) { \
1615  return 0; \
1616  } \
1617  \
1618  extern "C" GPU_RT_STUB uint64_t base_agg_func##_double_shared(uint64_t* agg, \
1619  const double val) { \
1620  return 0; \
1621  } \
1622  \
1623  extern "C" GPU_RT_STUB uint64_t base_agg_func##_double_skip_val_shared( \
1624  uint64_t* agg, const double val, const double skip_val) { \
1625  return 0; \
1626  } \
1627  extern "C" GPU_RT_STUB uint32_t base_agg_func##_float_shared(uint32_t* agg, \
1628  const float val) { \
1629  return 0; \
1630  } \
1631  \
1632  extern "C" GPU_RT_STUB uint32_t base_agg_func##_float_skip_val_shared( \
1633  uint32_t* agg, const float val, const float skip_val) { \
1634  return 0; \
1635  }
1636 
1637 #define DEF_SHARED_AGG_STUBS(base_agg_func) \
1638  extern "C" GPU_RT_STUB void base_agg_func##_shared(int64_t* agg, const int64_t val) {} \
1639  \
1640  extern "C" GPU_RT_STUB void base_agg_func##_skip_val_shared( \
1641  int64_t* agg, const int64_t val, const int64_t skip_val) {} \
1642  extern "C" GPU_RT_STUB void base_agg_func##_int32_shared(int32_t* agg, \
1643  const int32_t val) {} \
1644  extern "C" GPU_RT_STUB void base_agg_func##_int16_shared(int16_t* agg, \
1645  const int16_t val) {} \
1646  extern "C" GPU_RT_STUB void base_agg_func##_int8_shared(int8_t* agg, \
1647  const int8_t val) {} \
1648  \
1649  extern "C" GPU_RT_STUB void base_agg_func##_int32_skip_val_shared( \
1650  int32_t* agg, const int32_t val, const int32_t skip_val) {} \
1651  \
1652  extern "C" GPU_RT_STUB void base_agg_func##_double_shared(int64_t* agg, \
1653  const double val) {} \
1654  \
1655  extern "C" GPU_RT_STUB void base_agg_func##_double_skip_val_shared( \
1656  int64_t* agg, const double val, const double skip_val) {} \
1657  extern "C" GPU_RT_STUB void base_agg_func##_float_shared(int32_t* agg, \
1658  const float val) {} \
1659  \
1660  extern "C" GPU_RT_STUB void base_agg_func##_float_skip_val_shared( \
1661  int32_t* agg, const float val, const float skip_val) {}
1662 
1668 
1669 extern "C" GPU_RT_STUB int8_t* agg_id_varlen_shared(int8_t* varlen_buffer,
1670  const int64_t offset,
1671  const int8_t* value,
1672  const int64_t size_bytes) {
1673  return nullptr;
1674 }
1675 
1676 extern "C" GPU_RT_STUB int32_t checked_single_agg_id_shared(int64_t* agg,
1677  const int64_t val,
1678  const int64_t null_val) {
1679  return 0;
1680 }
1681 
1682 extern "C" GPU_RT_STUB int32_t
1683 checked_single_agg_id_int32_shared(int32_t* agg,
1684  const int32_t val,
1685  const int32_t null_val) {
1686  return 0;
1687 }
1688 extern "C" GPU_RT_STUB int32_t
1689 checked_single_agg_id_int16_shared(int16_t* agg,
1690  const int16_t val,
1691  const int16_t null_val) {
1692  return 0;
1693 }
1694 extern "C" GPU_RT_STUB int32_t checked_single_agg_id_int8_shared(int8_t* agg,
1695  const int8_t val,
1696  const int8_t null_val) {
1697  return 0;
1698 }
1699 
1700 extern "C" GPU_RT_STUB int32_t
1701 checked_single_agg_id_double_shared(int64_t* agg,
1702  const double val,
1703  const double null_val) {
1704  return 0;
1705 }
1706 
1707 extern "C" GPU_RT_STUB int32_t checked_single_agg_id_float_shared(int32_t* agg,
1708  const float val,
1709  const float null_val) {
1710  return 0;
1711 }
1712 
1713 extern "C" GPU_RT_STUB void agg_max_int16_skip_val_shared(int16_t* agg,
1714  const int16_t val,
1715  const int16_t skip_val) {}
1716 
1717 extern "C" GPU_RT_STUB void agg_max_int8_skip_val_shared(int8_t* agg,
1718  const int8_t val,
1719  const int8_t skip_val) {}
1720 
1721 extern "C" GPU_RT_STUB void agg_min_int16_skip_val_shared(int16_t* agg,
1722  const int16_t val,
1723  const int16_t skip_val) {}
1724 
1725 extern "C" GPU_RT_STUB void agg_min_int8_skip_val_shared(int8_t* agg,
1726  const int8_t val,
1727  const int8_t skip_val) {}
1728 
1729 extern "C" GPU_RT_STUB void agg_id_double_shared_slow(int64_t* agg, const double* val) {}
1730 
1731 extern "C" GPU_RT_STUB int64_t agg_sum_shared(int64_t* agg, const int64_t val) {
1732  return 0;
1733 }
1734 
1735 extern "C" GPU_RT_STUB int64_t agg_sum_if_shared(int64_t* agg,
1736  const int64_t val,
1737  const int8_t cond) {
1738  return 0;
1739 }
1740 
1741 extern "C" GPU_RT_STUB int64_t agg_sum_skip_val_shared(int64_t* agg,
1742  const int64_t val,
1743  const int64_t skip_val) {
1744  return 0;
1745 }
1746 
1747 extern "C" GPU_RT_STUB int64_t agg_sum_if_skip_val_shared(int64_t* agg,
1748  const int64_t val,
1749  const int64_t skip_val,
1750  const int8_t cond) {
1751  return 0;
1752 }
1753 extern "C" GPU_RT_STUB int32_t agg_sum_int32_shared(int32_t* agg, const int32_t val) {
1754  return 0;
1755 }
1756 
1757 extern "C" GPU_RT_STUB int32_t agg_sum_int32_skip_val_shared(int32_t* agg,
1758  const int32_t val,
1759  const int32_t skip_val) {
1760  return 0;
1761 }
1762 
1763 extern "C" GPU_RT_STUB void agg_sum_double_shared(int64_t* agg, const double val) {}
1764 
1765 extern "C" GPU_RT_STUB void agg_sum_double_skip_val_shared(int64_t* agg,
1766  const double val,
1767  const double skip_val) {}
1768 extern "C" GPU_RT_STUB void agg_sum_float_shared(int32_t* agg, const float val) {}
1769 
1770 extern "C" GPU_RT_STUB void agg_sum_float_skip_val_shared(int32_t* agg,
1771  const float val,
1772  const float skip_val) {}
1773 
1774 extern "C" GPU_RT_STUB int32_t agg_sum_if_int32_shared(int32_t* agg,
1775  const int32_t val,
1776  const int8_t cond) {
1777  return 0;
1778 }
1779 
1780 extern "C" GPU_RT_STUB int32_t agg_sum_if_int32_skip_val_shared(int32_t* agg,
1781  const int32_t val,
1782  const int32_t skip_val,
1783  const int8_t cond) {
1784  return 0;
1785 }
1786 
1787 extern "C" GPU_RT_STUB void agg_sum_if_double_shared(int64_t* agg,
1788  const double val,
1789  const int8_t cond) {}
1790 
1791 extern "C" GPU_RT_STUB void agg_sum_if_double_skip_val_shared(int64_t* agg,
1792  const double val,
1793  const double skip_val,
1794  const int8_t cond) {}
1795 extern "C" GPU_RT_STUB void agg_sum_if_float_shared(int32_t* agg,
1796  const float val,
1797  const int8_t cond) {}
1798 
1799 extern "C" GPU_RT_STUB void agg_sum_if_float_skip_val_shared(int32_t* agg,
1800  const float val,
1801  const float skip_val,
1802  const int8_t cond) {}
1803 
1804 extern "C" GPU_RT_STUB void force_sync() {}
1805 
1806 extern "C" GPU_RT_STUB void sync_warp() {}
1807 extern "C" GPU_RT_STUB void sync_warp_protected(int64_t thread_pos, int64_t row_count) {}
1808 extern "C" GPU_RT_STUB void sync_threadblock() {}
1809 
1810 extern "C" GPU_RT_STUB void write_back_non_grouped_agg(int64_t* input_buffer,
1811  int64_t* output_buffer,
1812  const int32_t num_agg_cols){};
1813 // x64 stride functions
1814 
1815 extern "C" RUNTIME_EXPORT NEVER_INLINE int32_t
1816 pos_start_impl(int32_t const* row_index_resume) {
1817  return row_index_resume ? *row_index_resume : 0;
1818 }
1819 
1820 extern "C" RUNTIME_EXPORT NEVER_INLINE int32_t group_buff_idx_impl() {
1821  return pos_start_impl(nullptr);
1822 }
1823 
1824 extern "C" RUNTIME_EXPORT NEVER_INLINE int32_t pos_step_impl() {
1825  return 1;
1826 }
1827 
1828 extern "C" GPU_RT_STUB int8_t thread_warp_idx(const int8_t warp_sz) {
1829  return 0;
1830 }
1831 
1832 extern "C" GPU_RT_STUB int64_t get_thread_index() {
1833  return 0;
1834 }
1835 
1836 extern "C" GPU_RT_STUB int64_t* declare_dynamic_shared_memory() {
1837  return nullptr;
1838 }
1839 
1840 extern "C" GPU_RT_STUB int64_t get_block_index() {
1841  return 0;
1842 }
1843 
1844 #undef GPU_RT_STUB
1845 
1846 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void record_error_code(const int32_t err_code,
1847  int32_t* error_codes) {
1848  // NB: never override persistent error codes (with code greater than zero).
1849  // On GPU, a projection query with a limit can run out of slots without it
1850  // being an actual error if the limit has been hit. If a persistent error
1851  // (division by zero, for example) occurs before running out of slots, we
1852  // have to avoid overriding it, because there's a risk that the query would
1853  // go through if we override with a potentially benign out-of-slots code.
1854  if (err_code && error_codes[pos_start_impl(nullptr)] <= 0) {
1855  error_codes[pos_start_impl(nullptr)] = err_code;
1856  }
1857 }
1858 
1859 // error_codes points to an array on GPU, but a single value on CPU.
1860 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int32_t get_error_code(int32_t* error_codes) {
1861  return error_codes[pos_start_impl(nullptr)];
1862 }
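// An illustrative sketch of the precedence rule implemented above; the error
// code values are made up for the example, not the engine's actual codes:
//
//   int32_t codes[1] = {0};
//   record_error_code(7, codes);   // persistent error recorded -> codes[0] == 7
//   record_error_code(-3, codes);  // ignored: codes[0] > 0 is never overridden
//   codes[0] = -3;                 // suppose only a non-persistent code is set
//   record_error_code(7, codes);   // allowed -> codes[0] == 7 again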
1863 
1864 // group by helpers
1865 
1866 extern "C" RUNTIME_EXPORT NEVER_INLINE const int64_t* init_shared_mem_nop(
1867  const int64_t* groups_buffer,
1868  const int32_t groups_buffer_size) {
1869  return groups_buffer;
1870 }
1871 
1872 extern "C" RUNTIME_EXPORT NEVER_INLINE void write_back_nop(int64_t* dest,
1873  int64_t* src,
1874  const int32_t sz) {
1875 #ifndef _WIN32
1876  // the body is not really needed, just make sure the call is not optimized away
1877  assert(dest);
1878 #endif
1879 }
1880 
1881 extern "C" RUNTIME_EXPORT int64_t* init_shared_mem(const int64_t* global_groups_buffer,
1882  const int32_t groups_buffer_size) {
1883  return nullptr;
1884 }
1885 
1886 extern "C" RUNTIME_EXPORT NEVER_INLINE void init_group_by_buffer_gpu(
1887  int64_t* groups_buffer,
1888  const int64_t* init_vals,
1889  const uint32_t groups_buffer_entry_count,
1890  const uint32_t key_qw_count,
1891  const uint32_t agg_col_count,
1892  const bool keyless,
1893  const int8_t warp_size) {
1894 #ifndef _WIN32
1895  // the body is not really needed, just make sure the call is not optimized away
1896  assert(groups_buffer);
1897 #endif
1898 }
1899 
1900 extern "C" RUNTIME_EXPORT NEVER_INLINE void init_columnar_group_by_buffer_gpu(
1901  int64_t* groups_buffer,
1902  const int64_t* init_vals,
1903  const uint32_t groups_buffer_entry_count,
1904  const uint32_t key_qw_count,
1905  const uint32_t agg_col_count,
1906  const bool keyless,
1907  const bool blocks_share_memory,
1908  const int32_t frag_idx) {
1909 #ifndef _WIN32
1910  // the body is not really needed, just make sure the call is not optimized away
1911  assert(groups_buffer);
1912 #endif
1913 }
1914 
1915 extern "C" RUNTIME_EXPORT NEVER_INLINE void init_group_by_buffer_impl(
1916  int64_t* groups_buffer,
1917  const int64_t* init_vals,
1918  const uint32_t groups_buffer_entry_count,
1919  const uint32_t key_qw_count,
1920  const uint32_t agg_col_count,
1921  const bool keyless,
1922  const int8_t warp_size) {
1923 #ifndef _WIN32
1924  // the body is not really needed, just make sure the call is not optimized away
1925  assert(groups_buffer);
1926 #endif
1927 }
1928 
1929 template <typename T>
1930 ALWAYS_INLINE int64_t* get_matching_group_value(int64_t* groups_buffer,
1931  const uint32_t h,
1932  const T* key,
1933  const uint32_t key_count,
1934  const uint32_t row_size_quad) {
1935  auto off = h * row_size_quad;
1936  auto row_ptr = reinterpret_cast<T*>(groups_buffer + off);
1937  if (*row_ptr == get_empty_key<T>()) {
1938  memcpy(row_ptr, key, key_count * sizeof(T));
1939  auto row_ptr_i8 = reinterpret_cast<int8_t*>(row_ptr + key_count);
1940  return reinterpret_cast<int64_t*>(align_to_int64(row_ptr_i8));
1941  }
1942  if (memcmp(row_ptr, key, key_count * sizeof(T)) == 0) {
1943  auto row_ptr_i8 = reinterpret_cast<int8_t*>(row_ptr + key_count);
1944  return reinterpret_cast<int64_t*>(align_to_int64(row_ptr_i8));
1945  }
1946  return nullptr;
1947 }
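// An illustrative sketch of the row-wise entry probed above, assuming
// T = int64_t, key_count = 2 and row_size_quad = 4:
//
//   entry h: | key[0] | key[1] | agg col 0 | agg col 1 |   (row_size_quad words)
//
// If the first key word still holds the empty key, the slot is claimed by
// copying the key and the aligned pointer just past the key is returned; on an
// exact key match the same aggregate pointer is returned; otherwise nullptr
// tells the caller to keep probing.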
1948 
1949 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t* get_matching_group_value(
1950  int64_t* groups_buffer,
1951  const uint32_t h,
1952  const int64_t* key,
1953  const uint32_t key_count,
1954  const uint32_t key_width,
1955  const uint32_t row_size_quad) {
1956  switch (key_width) {
1957  case 4:
1958  return get_matching_group_value(groups_buffer,
1959  h,
1960  reinterpret_cast<const int32_t*>(key),
1961  key_count,
1962  row_size_quad);
1963  case 8:
1964  return get_matching_group_value(groups_buffer, h, key, key_count, row_size_quad);
1965  default:;
1966  }
1967  return nullptr;
1968 }
1969 
1970 template <typename T>
1971 ALWAYS_INLINE int32_t get_matching_group_value_columnar_slot(int64_t* groups_buffer,
1972  const uint32_t entry_count,
1973  const uint32_t h,
1974  const T* key,
1975  const uint32_t key_count) {
1976  auto off = h;
1977  auto key_buffer = reinterpret_cast<T*>(groups_buffer);
1978  if (key_buffer[off] == get_empty_key<T>()) {
1979  for (size_t i = 0; i < key_count; ++i) {
1980  key_buffer[off] = key[i];
1981  off += entry_count;
1982  }
1983  return h;
1984  }
1985  off = h;
1986  for (size_t i = 0; i < key_count; ++i) {
1987  if (key_buffer[off] != key[i]) {
1988  return -1;
1989  }
1990  off += entry_count;
1991  }
1992  return h;
1993 }
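// An illustrative sketch of the columnar key layout probed above, assuming
// entry_count = 4 and key_count = 2:
//
//   key component 0: [ e0 e1 e2 e3 ]   <- key_buffer[h]
//   key component 1: [ e0 e1 e2 e3 ]   <- key_buffer[h + entry_count]
//
// Successive components of one entry's key sit entry_count slots apart, hence
// the `off += entry_count` stride; the function returns h when the slot is
// claimed or matches, and -1 on a mismatch.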
1994 
1995 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int32_t
1996 get_matching_group_value_columnar_slot(int64_t* groups_buffer,
1997  const uint32_t entry_count,
1998  const uint32_t h,
1999  const int64_t* key,
2000  const uint32_t key_count,
2001  const uint32_t key_width) {
2002  switch (key_width) {
2003  case 4:
2004  return get_matching_group_value_columnar_slot(groups_buffer,
2005  entry_count,
2006  h,
2007  reinterpret_cast<const int32_t*>(key),
2008  key_count);
2009  case 8:
2010  return get_matching_group_value_columnar_slot(
2011  groups_buffer, entry_count, h, key, key_count);
2012  default:
2013  return -1;
2014  }
2015  return -1;
2016 }
2017 
2018 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t* get_matching_group_value_columnar(
2019  int64_t* groups_buffer,
2020  const uint32_t h,
2021  const int64_t* key,
2022  const uint32_t key_qw_count,
2023  const size_t entry_count) {
2024  auto off = h;
2025  if (groups_buffer[off] == EMPTY_KEY_64) {
2026  for (size_t i = 0; i < key_qw_count; ++i) {
2027  groups_buffer[off] = key[i];
2028  off += entry_count;
2029  }
2030  return &groups_buffer[off];
2031  }
2032  off = h;
2033  for (size_t i = 0; i < key_qw_count; ++i) {
2034  if (groups_buffer[off] != key[i]) {
2035  return nullptr;
2036  }
2037  off += entry_count;
2038  }
2039  return &groups_buffer[off];
2040 }
2041 
2042 /*
2043  * For a particular hashed_index, returns the row-wise offset
2044  * to the first matching agg column in memory.
2045  * It also checks the corresponding group column and, if the entry is still
2046  * empty, initializes all of the key slots (all group columns are assumed to
2047  * be 64-bit wide).
2048  *
2049  * Memory layout:
2050  *
2051  * | prepended group columns (64-bit each) | agg columns |
2052  */
2053 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t* get_matching_group_value_perfect_hash(
2054  int64_t* groups_buffer,
2055  const uint32_t hashed_index,
2056  const int64_t* key,
2057  const uint32_t key_count,
2058  const uint32_t row_size_quad) {
2059  uint32_t off = hashed_index * row_size_quad;
2060  if (groups_buffer[off] == EMPTY_KEY_64) {
2061  for (uint32_t i = 0; i < key_count; ++i) {
2062  groups_buffer[off + i] = key[i];
2063  }
2064  }
2065  return groups_buffer + off + key_count;
2066 }
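// Worked example for the perfect-hash lookup above, assuming key_count = 2 and
// row_size_quad = 5: hashed_index = 3 starts at groups_buffer + 15, the two key
// words occupy offsets 15..16 (written from `key` only if offset 15 still holds
// EMPTY_KEY_64), and the returned pointer groups_buffer + 17 addresses the
// first aggregate column of that entry.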
2067 
2074 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t*
2075 get_matching_group_value_perfect_hash_keyless(int64_t* groups_buffer,
2076  const uint32_t hashed_index,
2077  const uint32_t row_size_quad) {
2078  return groups_buffer + row_size_quad * hashed_index;
2079 }
2080 
2081 /*
2082  * For a particular hashed_index, find and initialize (if necessary) all the group
2083  * columns corresponding to a key. It is assumed that all group columns are 64-bit wide.
2084  */
2085 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void
2086 set_matching_group_value_perfect_hash_columnar(int64_t* groups_buffer,
2087  const uint32_t hashed_index,
2088  const int64_t* key,
2089  const uint32_t key_count,
2090  const uint32_t entry_count) {
2091  if (groups_buffer[hashed_index] == EMPTY_KEY_64) {
2092  for (uint32_t i = 0; i < key_count; i++) {
2093  groups_buffer[i * entry_count + hashed_index] = key[i];
2094  }
2095  }
2096 }
2097 
2098 #include "GeoOpsRuntime.cpp"
2099 #include "GroupByRuntime.cpp"
2101 
2102 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t* get_group_value_fast_keyless(
2103  int64_t* groups_buffer,
2104  const int64_t key,
2105  const int64_t min_key,
2106  const int64_t /* bucket */,
2107  const uint32_t row_size_quad) {
2108  return groups_buffer + row_size_quad * (key - min_key);
2109 }
2110 
2111 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t* get_group_value_fast_keyless_semiprivate(
2112  int64_t* groups_buffer,
2113  const int64_t key,
2114  const int64_t min_key,
2115  const int64_t /* bucket */,
2116  const uint32_t row_size_quad,
2117  const uint8_t thread_warp_idx,
2118  const uint8_t warp_size) {
2119  return groups_buffer + row_size_quad * (warp_size * (key - min_key) + thread_warp_idx);
2120 }
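// Worked example for the keyless fast paths above, assuming min_key = 100 and
// row_size_quad = 4: key = 103 maps to groups_buffer + 4 * (103 - 100) =
// groups_buffer + 12. In the semiprivate variant with warp_size = 2 and
// thread_warp_idx = 1 the same key maps to groups_buffer + 4 * (2 * 3 + 1) =
// groups_buffer + 28, so each warp lane aggregates into its own copy of the row.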
2121 
2122 extern "C" RUNTIME_EXPORT ALWAYS_INLINE StringView string_pack(const int8_t* ptr,
2123  const int32_t len) {
2124  return {reinterpret_cast<char const*>(ptr), static_cast<uint64_t>(len)};
2125 }
2126 
2127 #ifdef __clang__
2128 #include "../Utils/StringLike.cpp"
2129 #endif
2130 
2131 #ifndef __CUDACC__
2132 #include "TopKRuntime.cpp"
2133 #endif
2134 
2135 extern "C" RUNTIME_EXPORT ALWAYS_INLINE DEVICE int32_t
2136 char_length(const char* str, const int32_t str_len) {
2137  return str_len;
2138 }
2139 
2140 extern "C" RUNTIME_EXPORT ALWAYS_INLINE DEVICE int32_t
2141 char_length_nullable(const char* str, const int32_t str_len, const int32_t int_null) {
2142  if (!str) {
2143  return int_null;
2144  }
2145  return str_len;
2146 }
2147 
2148 extern "C" RUNTIME_EXPORT ALWAYS_INLINE DEVICE int32_t
2149 key_for_string_encoded(const int32_t str_id) {
2150  return str_id;
2151 }
2152 
2153 extern "C" ALWAYS_INLINE DEVICE int32_t
2154 map_string_dict_id(const int32_t string_id,
2155  const int64_t translation_map_handle,
2156  const int32_t min_source_id) {
2157  const int32_t* translation_map =
2158  reinterpret_cast<const int32_t*>(translation_map_handle);
2159  return translation_map[string_id - min_source_id];
2160 }
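// Worked example for the dictionary id translation above, assuming a
// translation map built for source ids 10..14 (min_source_id = 10) holding
// {7, 8, 9, 3, 5}: string_id = 12 reads translation_map[12 - 10] == 9, the id
// of the same string in the destination dictionary.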
2161 
2162 extern "C" ALWAYS_INLINE DEVICE double tree_model_reg_predict(
2163  const double* regressor_inputs,
2164  const int64_t decision_tree_table_handle,
2165  const int64_t decision_tree_offsets_handle,
2166  const int32_t num_regressors,
2167  const int32_t num_trees,
2168  const bool compute_avg,
2169  const double null_value) {
2170  for (int32_t regressor_idx = 0; regressor_idx < num_regressors; ++regressor_idx) {
2171  if (regressor_inputs[regressor_idx] == null_value) {
2172  return null_value;
2173  }
2174  }
2175  const DecisionTreeEntry* decision_tree_table =
2176  reinterpret_cast<const DecisionTreeEntry*>(decision_tree_table_handle);
2177  const int64_t* decision_tree_offsets =
2178  reinterpret_cast<const int64_t*>(decision_tree_offsets_handle);
2179  double sum_tree_results{0};
2180  for (int32_t tree_idx = 0; tree_idx < num_trees; ++tree_idx) {
2181  int64_t row_idx = decision_tree_offsets[tree_idx];
2182  while (true) {
2183  const DecisionTreeEntry& current_entry = decision_tree_table[row_idx];
2184  if (!current_entry.isSplitNode()) {
2185  sum_tree_results += current_entry.value;
2186  break;
2187  }
2188  const auto regressor_input = regressor_inputs[current_entry.feature_index];
2189  row_idx = regressor_input <= current_entry.value
2190  ? current_entry.left_child_row_idx
2191  : current_entry.right_child_row_idx;
2192  }
2193  }
2194  return compute_avg ? sum_tree_results / num_trees : sum_tree_results;
2195 }
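// An illustrative single-tree walk through the loop above: with
// decision_tree_offsets = {0}, row 0 a split node on feature 0 with threshold
// 2.5 (left child 1, right child 2), and rows 1/2 leaves holding 10.0/20.0, an
// input of {3.0} takes the right branch (3.0 > 2.5) to the leaf at row 2 and
// contributes 20.0; with num_trees = 1 and compute_avg = true the prediction
// is 20.0.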
2196 
2197 extern "C" RUNTIME_EXPORT ALWAYS_INLINE DEVICE bool sample_ratio(
2198  const double proportion,
2199  const int64_t row_offset) {
2200  const int64_t threshold = 4294967296 * proportion;
2201  return (row_offset * 2654435761) % 4294967296 < threshold;
2202 }
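// The check above is a multiplicative-hash Bernoulli sample: row_offset is
// scrambled with the Knuth constant 2654435761, reduced modulo 2^32
// (4294967296), and the row is kept if the result falls below
// proportion * 2^32. For proportion = 0.25, for instance, only offsets whose
// scrambled value lands in the bottom quarter of the 32-bit range pass, so
// roughly one row in four is sampled.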
2203 
2204 extern "C" RUNTIME_EXPORT ALWAYS_INLINE DEVICE int32_t
2205 width_bucket(const double target_value,
2206  const double lower_bound,
2207  const double upper_bound,
2208  const double scale_factor,
2209  const int32_t partition_count) {
2210  if (target_value < lower_bound) {
2211  return 0;
2212  } else if (target_value >= upper_bound) {
2213  return partition_count + 1;
2214  }
2215  return ((target_value - lower_bound) * scale_factor) + 1;
2216 }
2217 
2218 extern "C" RUNTIME_EXPORT ALWAYS_INLINE DEVICE int32_t
2219 width_bucket_reversed(const double target_value,
2220  const double lower_bound,
2221  const double upper_bound,
2222  const double scale_factor,
2223  const int32_t partition_count) {
2224  if (target_value > lower_bound) {
2225  return 0;
2226  } else if (target_value <= upper_bound) {
2227  return partition_count + 1;
2228  }
2229  return ((lower_bound - target_value) * scale_factor) + 1;
2230 }
2231 
2232 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int32_t
2233 width_bucket_nullable(const double target_value,
2234  const double lower_bound,
2235  const double upper_bound,
2236  const double scale_factor,
2237  const int32_t partition_count,
2238  const double null_val) {
2239  if (target_value == null_val) {
2240  return INT32_MIN;
2241  }
2242  return width_bucket(
2243  target_value, lower_bound, upper_bound, scale_factor, partition_count);
2244 }
2245 
2246 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int32_t
2247 width_bucket_reversed_nullable(const double target_value,
2248  const double lower_bound,
2249  const double upper_bound,
2250  const double scale_factor,
2251  const int32_t partition_count,
2252  const double null_val) {
2253  if (target_value == null_val) {
2254  return INT32_MIN;
2255  }
2256  return width_bucket_reversed(
2257  target_value, lower_bound, upper_bound, scale_factor, partition_count);
2258 }
2259 
2260 // Version of width_bucket with no out-of-bounds check; it can be called when
2261 // we can guarantee the input target_value expression always resides in the
2262 // valid range (so we can also skip the null check).
2263 extern "C" RUNTIME_EXPORT ALWAYS_INLINE DEVICE int32_t
2264 width_bucket_no_oob_check(const double target_value,
2265  const double lower_bound,
2266  const double scale_factor) {
2267  int32_t calc = (target_value - lower_bound) * scale_factor;
2268  return calc + 1;
2269 }
2270 
2271 extern "C" RUNTIME_EXPORT ALWAYS_INLINE DEVICE int32_t
2272 width_bucket_reversed_no_oob_check(const double target_value,
2273  const double lower_bound,
2274  const double scale_factor) {
2275  int32_t calc = (lower_bound - target_value) * scale_factor;
2276  return calc + 1;
2277 }
2278 
2279 extern "C" RUNTIME_EXPORT ALWAYS_INLINE DEVICE int32_t
2280 width_bucket_expr(const double target_value,
2281  const bool reversed,
2282  const double lower_bound,
2283  const double upper_bound,
2284  const int32_t partition_count) {
2285  if (reversed) {
2286  return width_bucket_reversed(target_value,
2287  lower_bound,
2288  upper_bound,
2289  partition_count / (lower_bound - upper_bound),
2290  partition_count);
2291  }
2292  return width_bucket(target_value,
2293  lower_bound,
2294  upper_bound,
2295  partition_count / (upper_bound - lower_bound),
2296  partition_count);
2297 }
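// Worked example for width_bucket_expr above: lower_bound = 0, upper_bound = 10
// and partition_count = 5 give scale_factor = 5 / 10 = 0.5, so target_value 7.3
// falls in bucket (7.3 - 0) * 0.5 + 1 = 4 (after truncation); values below 0
// return 0 and values at or above 10 return partition_count + 1 = 6. The
// reversed form handles descending ranges where lower_bound > upper_bound.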
2298 
2299 extern "C" RUNTIME_EXPORT ALWAYS_INLINE DEVICE int32_t
2300 width_bucket_expr_nullable(const double target_value,
2301  const bool reversed,
2302  const double lower_bound,
2303  const double upper_bound,
2304  const int32_t partition_count,
2305  const double null_val) {
2306  if (target_value == null_val) {
2307  return INT32_MIN;
2308  }
2309  return width_bucket_expr(
2310  target_value, reversed, lower_bound, upper_bound, partition_count);
2311 }
2312 
2313 extern "C" RUNTIME_EXPORT ALWAYS_INLINE DEVICE int32_t
2314 width_bucket_expr_no_oob_check(const double target_value,
2315  const bool reversed,
2316  const double lower_bound,
2317  const double upper_bound,
2318  const int32_t partition_count) {
2319  if (reversed) {
2320  return width_bucket_reversed_no_oob_check(
2321  target_value, lower_bound, partition_count / (lower_bound - upper_bound));
2322  }
2323  return width_bucket_no_oob_check(
2324  target_value, lower_bound, partition_count / (upper_bound - lower_bound));
2325 }
2326 
2327 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t
2328 row_number_window_func(const int64_t output_buff, const int64_t pos) {
2329  return reinterpret_cast<const int64_t*>(output_buff)[pos];
2330 }
2331 
2332 extern "C" RUNTIME_EXPORT ALWAYS_INLINE double percent_window_func(
2333  const int64_t output_buff,
2334  const int64_t pos) {
2335  return reinterpret_cast<const double*>(output_buff)[pos];
2336 }
2337 
2338 extern "C" RUNTIME_EXPORT ALWAYS_INLINE double load_double(const int64_t* agg) {
2339  return *reinterpret_cast<const double*>(may_alias_ptr(agg));
2340 }
2341 
2342 extern "C" RUNTIME_EXPORT ALWAYS_INLINE float load_float(const int32_t* agg) {
2343  return *reinterpret_cast<const float*>(may_alias_ptr(agg));
2344 }
2345 
2346 extern "C" RUNTIME_EXPORT ALWAYS_INLINE double load_avg_int(const int64_t* sum,
2347  const int64_t* count,
2348  const double null_val) {
2349  return *count != 0 ? static_cast<double>(*sum) / *count : null_val;
2350 }
2351 
2352 extern "C" RUNTIME_EXPORT ALWAYS_INLINE double load_avg_decimal(const int64_t* sum,
2353  const int64_t* count,
2354  const double null_val,
2355  const uint32_t scale) {
2356  return *count != 0 ? (static_cast<double>(*sum) / pow(10, scale)) / *count : null_val;
2357 }
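// Worked example for load_avg_decimal above: a DECIMAL column with scale = 2
// stores 12.34 as the integer 1234, so *sum == 3702 and *count == 3 yield
// (3702 / 10^2) / 3 = 12.34, while a zero count yields null_val instead.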
2358 
2359 extern "C" RUNTIME_EXPORT ALWAYS_INLINE double load_avg_double(const int64_t* agg,
2360  const int64_t* count,
2361  const double null_val) {
2362  return *count != 0 ? *reinterpret_cast<const double*>(may_alias_ptr(agg)) / *count
2363  : null_val;
2364 }
2365 
2366 extern "C" RUNTIME_EXPORT ALWAYS_INLINE double load_avg_float(const int32_t* agg,
2367  const int32_t* count,
2368  const double null_val) {
2369  return *count != 0 ? *reinterpret_cast<const float*>(may_alias_ptr(agg)) / *count
2370  : null_val;
2371 }
2372 
2373 extern "C" RUNTIME_EXPORT NEVER_INLINE void linear_probabilistic_count(
2374  uint8_t* bitmap,
2375  const uint32_t bitmap_bytes,
2376  const uint8_t* key_bytes,
2377  const uint32_t key_len) {
2378  const uint32_t bit_pos = MurmurHash3(key_bytes, key_len, 0) % (bitmap_bytes * 8);
2379  const uint32_t word_idx = bit_pos / 32;
2380  const uint32_t bit_idx = bit_pos % 32;
2381  reinterpret_cast<uint32_t*>(bitmap)[word_idx] |= 1 << bit_idx;
2382 }
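// A sketch of how the bitmap written above can be used, assuming the caller
// sizes it for the expected cardinality: every key sets one of bitmap_bytes * 8
// bit positions chosen by MurmurHash3, and with m bits set out of n total the
// number of distinct keys can be estimated with the linear-counting formula
// n * ln(n / (n - m)).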
2383 
2384 // First 3 parameters are output, the rest are input.
2385 extern "C" RUNTIME_EXPORT NEVER_INLINE void query_stub_hoisted_literals(
2386  int32_t* error_codes,
2387  int32_t* total_matched,
2388  int64_t** out,
2389  const uint32_t frag_idx,
2390  const uint32_t* row_index_resume,
2391  const int8_t** col_buffers,
2392  const int8_t* literals,
2393  const int64_t* num_rows,
2394  const uint64_t* frag_row_offsets,
2395  const int32_t* max_matched,
2396  const int64_t* init_agg_value,
2397  const int64_t* join_hash_tables,
2398  const int8_t* row_func_mgr) {
2399 #ifndef _WIN32
2400  assert(error_codes || total_matched || out || frag_idx || row_index_resume ||
2401  col_buffers || literals || num_rows || frag_row_offsets || max_matched ||
2402  init_agg_value || join_hash_tables || row_func_mgr);
2403 #endif
2404 }
2405 
2406 // First 3 parameters are output, the rest are input.
2407 extern "C" RUNTIME_EXPORT void multifrag_query_hoisted_literals(
2408  int32_t* error_codes,
2409  int32_t* total_matched,
2410  int64_t** out,
2411  const uint32_t* num_fragments_ptr,
2412  const uint32_t* num_tables_ptr,
2413  const uint32_t* row_index_resume, // aka start_rowid
2414  const int8_t*** col_buffers,
2415  const int8_t* literals,
2416  const int64_t* num_rows,
2417  const uint64_t* frag_row_offsets,
2418  const int32_t* max_matched,
2419  const int64_t* init_agg_value,
2420  const int64_t* join_hash_tables,
2421  const int8_t* row_func_mgr) {
2422  uint32_t const num_fragments = *num_fragments_ptr;
2423  uint32_t const num_tables = *num_tables_ptr;
2424  // num_fragments_ptr and num_tables_ptr are replaced by frag_idx when passed below.
2425  for (uint32_t frag_idx = 0;
2426  frag_idx < num_fragments && get_error_code(error_codes) == 0;
2427  ++frag_idx) {
2428  query_stub_hoisted_literals(error_codes,
2429  total_matched,
2430  out,
2431  frag_idx,
2432  row_index_resume,
2433  col_buffers ? col_buffers[frag_idx] : nullptr,
2434  literals,
2435  &num_rows[frag_idx * num_tables],
2436  &frag_row_offsets[frag_idx * num_tables],
2437  max_matched,
2438  init_agg_value,
2439  join_hash_tables,
2440  row_func_mgr);
2441  }
2442 }
2443 
2444 // First 3 parameters are output, the rest are input.
2445 extern "C" RUNTIME_EXPORT NEVER_INLINE void query_stub(int32_t* error_codes,
2446  int32_t* total_matched,
2447  int64_t** out,
2448  const uint32_t frag_idx,
2449  const uint32_t* row_index_resume,
2450  const int8_t** col_buffers,
2451  const int64_t* num_rows,
2452  const uint64_t* frag_row_offsets,
2453  const int32_t* max_matched,
2454  const int64_t* init_agg_value,
2455  const int64_t* join_hash_tables,
2456  const int8_t* row_func_mgr) {
2457 #ifndef _WIN32
2458  assert(error_codes || total_matched || out || frag_idx || row_index_resume ||
2459  col_buffers || num_rows || frag_row_offsets || max_matched || init_agg_value ||
2460  join_hash_tables || row_func_mgr);
2461 #endif
2462 }
2463 
2464 // First 3 parameters are output, the rest are input.
2465 extern "C" RUNTIME_EXPORT void multifrag_query(int32_t* error_codes,
2466  int32_t* total_matched,
2467  int64_t** out,
2468  const uint32_t* num_fragments_ptr,
2469  const uint32_t* num_tables_ptr,
2470  const uint32_t* row_index_resume,
2471  const int8_t*** col_buffers,
2472  const int64_t* num_rows,
2473  const uint64_t* frag_row_offsets,
2474  const int32_t* max_matched,
2475  const int64_t* init_agg_value,
2476  const int64_t* join_hash_tables,
2477  const int8_t* row_func_mgr) {
2478  uint32_t const num_fragments = *num_fragments_ptr;
2479  uint32_t const num_tables = *num_tables_ptr;
2480  // num_fragments_ptr and num_tables_ptr are replaced by frag_idx when passed below.
2481  for (uint32_t frag_idx = 0;
2482  frag_idx < num_fragments && get_error_code(error_codes) == 0;
2483  ++frag_idx) {
2484  query_stub(error_codes,
2485  total_matched,
2486  out,
2487  frag_idx,
2488  row_index_resume,
2489  col_buffers ? col_buffers[frag_idx] : nullptr,
2490  &num_rows[frag_idx * num_tables],
2491  &frag_row_offsets[frag_idx * num_tables],
2492  max_matched,
2493  init_agg_value,
2494  join_hash_tables,
2495  row_func_mgr);
2496  }
2497 }
2498 
2499 // WARNING: Don't add #include "Shared/InlineNullValues.h" to this file.
2500 // It may build fine, but during runtime results in
2501 // CUDA_ERROR_INVALID_PTX (218): a PTX JIT compilation failed: ptxas application ptx
2502 // input, line 10; fatal : Parsing error near '.globl': syntax error
2503 
2504 // See spatial_type::Codegen::pointIsNullFunctionName() for selecting
2505 // which of the following two functions to use to determine point IS NULL.
2506 extern "C" RUNTIME_EXPORT ALWAYS_INLINE DEVICE bool point_int32_is_null(int32_t* point) {
2507  constexpr uint32_t null_array_compressed_32 = 0x80000000U; // Shared/InlineNullValues.h
2508  return point == nullptr || uint32_t(*point) == null_array_compressed_32;
2509 }
2510 
2511 extern "C" RUNTIME_EXPORT ALWAYS_INLINE DEVICE bool point_double_is_null(double* point) {
2512  constexpr double null_array_double = 2 * DBL_MIN; // Shared/InlineNullValues.h
2513  return point == nullptr || *point == null_array_double;
2514 }
2515 
2516 extern "C" RUNTIME_EXPORT bool check_interrupt() {
2517  if (check_interrupt_init(static_cast<unsigned>(INT_CHECK))) {
2518  return true;
2519  }
2520  return false;
2521 }
2522 
2523 extern "C" RUNTIME_EXPORT bool check_interrupt_init(unsigned command) {
2524  static std::atomic_bool runtime_interrupt_flag{false};
2525 
2526  if (command == static_cast<unsigned>(INT_CHECK)) {
2527  if (runtime_interrupt_flag.load()) {
2528  return true;
2529  }
2530  return false;
2531  }
2532  if (command == static_cast<unsigned>(INT_ABORT)) {
2533  runtime_interrupt_flag.store(true);
2534  return false;
2535  }
2536  if (command == static_cast<unsigned>(INT_RESET)) {
2537  runtime_interrupt_flag.store(false);
2538  return false;
2539  }
2540  return false;
2541 }
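// A minimal usage sketch for the interrupt flag managed above (the calling
// sites are illustrative):
//
//   check_interrupt_init(static_cast<unsigned>(INT_ABORT));  // request an abort
//   ...
//   if (check_interrupt()) { /* kernels bail out of their row loop */ }
//   ...
//   check_interrupt_init(static_cast<unsigned>(INT_RESET));  // clear before the next query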