#if CUDA_VERSION < 10000
static_assert(false, "CUDA v10.0 or later is required.");
#endif

#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 350)
static_assert(false, "CUDA Compute Capability of 3.5 or greater is required.");
#endif
extern "C" __device__ int32_t pos_start_impl(const int32_t* row_index_resume) {
  return blockIdx.x * blockDim.x + threadIdx.x;
}

extern "C" __device__ int32_t pos_step_impl() {
  return blockDim.x * gridDim.x;
}

extern "C" __device__ int8_t thread_warp_idx(const int8_t warp_sz) {
  return threadIdx.x % warp_sz;
}
extern "C" __device__ const int64_t* init_shared_mem_nop(const int64_t* groups_buffer,
                                                         const int32_t groups_buffer_size) {
  return groups_buffer;
}

extern "C" __device__ int64_t* declare_dynamic_shared_memory() {
  extern __shared__ int64_t shared_mem_buffer[];
  return shared_mem_buffer;
}
extern "C" __device__ const int64_t* init_shared_mem(const int64_t* global_groups_buffer,
                                                     const int32_t groups_buffer_size) {
  // dynamic shared memory, sized by the host at kernel launch
  extern __shared__ int64_t shared_groups_buffer[];

  // the buffer size is in bytes; copy it in 64-bit units
  const int32_t buffer_units = groups_buffer_size >> 3;

  for (int32_t pos = threadIdx.x; pos < buffer_units; pos += blockDim.x) {
    shared_groups_buffer[pos] = global_groups_buffer[pos];
  }
  __syncthreads();
  return shared_groups_buffer;
}
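
// --- Illustrative, standalone example (separate translation unit, not part of this
// file): how the host sizes the dynamic shared memory that init_shared_mem() and
// declare_dynamic_shared_memory() rely on. Kernel and names below are assumptions.
#include <cstdint>
#include <cuda_runtime.h>

__global__ void uses_dynamic_smem(const int64_t* global_buf, int64_t* out, int n) {
  extern __shared__ int64_t smem[];  // backed by the launch-time byte count
  for (int i = threadIdx.x; i < n; i += blockDim.x) {
    smem[i] = global_buf[i];
  }
  __syncthreads();
  if (threadIdx.x == 0) {
    int64_t sum = 0;
    for (int i = 0; i < n; ++i) {
      sum += smem[i];
    }
    *out = sum;
  }
}

// Host side: the third launch-configuration parameter plays the role of
// groups_buffer_size above (in bytes):
//   uses_dynamic_smem<<<1, 256, n * sizeof(int64_t)>>>(dev_in, dev_out, n);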
#define init_group_by_buffer_gpu_impl init_group_by_buffer_gpu
// (code that uses the _impl alias is elided in this excerpt)
#undef init_group_by_buffer_gpu_impl
__inline__ __device__ uint32_t get_smid(void) {
  uint32_t ret;
  asm("mov.u32 %0, %%smid;" : "=r"(ret));
  return ret;
}
extern "C" __device__ bool dynamic_watchdog() {
  // (function prologue elided in this excerpt)
  __shared__ volatile int64_t dw_block_cycle_start;  // block-shared watchdog start cycle
  __shared__ volatile bool dw_should_terminate;      // all threads in a block return together

  uint32_t smid = get_smid();
  // thread 0 either initializes the shared cycle start or compares it with the current cycle
  if (threadIdx.x == 0) {
    dw_block_cycle_start = 0LL;
    int64_t cycle_count = static_cast<int64_t>(clock64());
    dw_block_cycle_start = static_cast<int64_t>(
        atomicCAS(reinterpret_cast<unsigned long long*>(&dw_sm_cycle_start[smid]),
                  0ULL,
                  static_cast<unsigned long long>(cycle_count)));

    int64_t cycles = cycle_count - dw_block_cycle_start;
    if ((smid == get_smid()) && (dw_block_cycle_start > 0LL) &&
        (cycles > dw_cycle_budget)) {
      // this SM is out of time for this kernel
      dw_should_terminate = true;
    } else {
      dw_should_terminate = false;
    }
  }
  __syncthreads();
  return dw_should_terminate;
}
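
// --- Illustrative, standalone sketch (not from this file): how a generated kernel
// might consult the watchdog. The loop body and the check interval (every 1024
// iterations) are assumptions.
#include <cstdint>

extern "C" __device__ bool dynamic_watchdog();  // defined above

__global__ void process_rows(const int64_t* in, int64_t* out, int64_t row_count) {
  for (int64_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < row_count;
       pos += static_cast<int64_t>(blockDim.x) * gridDim.x) {
    out[pos] = in[pos] * 2;
    if ((pos & 0x3FF) == 0 && dynamic_watchdog()) {
      return;  // cycle budget exhausted or abort requested: give up early
    }
  }
}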
// get_empty_key<T>() returns the sentinel that marks an unused slot for key type T
template <typename T = unsigned long long>
template <typename T>
__device__ int64_t* get_matching_group_value(int64_t* groups_buffer,
                                             const uint32_t h,
                                             const T* key,
                                             const uint32_t key_count,
                                             const uint32_t row_size_quad) {
  const T empty_key = get_empty_key<T>();
  uint32_t off = h * row_size_quad;
  auto row_ptr = reinterpret_cast<T*>(groups_buffer + off);
  const T old = atomicCAS(row_ptr, empty_key, *key);
  if (empty_key == old && key_count > 1) {
    for (size_t i = 1; i <= key_count - 1; ++i) {
      atomicExch(row_ptr + i, key[i]);
    }
  }
  if (key_count > 1) {
    // spin until the winning thread has written the whole key
    while (atomicAdd(row_ptr + key_count - 1, 0) == empty_key) {}
  }
  bool match = true;
  for (uint32_t i = 0; i < key_count; ++i) {
    if (row_ptr[i] != key[i]) {
      match = false;
      break;
    }
  }
  if (match) {
    auto row_ptr_i8 = reinterpret_cast<int8_t*>(row_ptr + key_count);
    return reinterpret_cast<int64_t*>(align_to_int64(row_ptr_i8));
  }
  return nullptr;
}

extern "C" __device__ int64_t* get_matching_group_value(int64_t* groups_buffer,
                                                        const uint32_t h,
                                                        const int64_t* key,
                                                        const uint32_t key_count,
                                                        const uint32_t key_width,
                                                        const uint32_t row_size_quad) {
  switch (key_width) {
    case 4:
      return get_matching_group_value(
          groups_buffer, h, reinterpret_cast<const unsigned int*>(key), key_count, row_size_quad);
    case 8:
      return get_matching_group_value(
          groups_buffer, h, reinterpret_cast<const unsigned long long*>(key), key_count, row_size_quad);
    default:
      return nullptr;
  }
}
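
// --- Hedged sketch (not from this file): the core claim-or-match idea behind the
// probing above, reduced to a single slot. EMPTY_SLOT_SENTINEL and the helper name
// are assumptions for illustration only.
#include <cstdint>

#define EMPTY_SLOT_SENTINEL 0x7FFFFFFFFFFFFFFFLL  // assumed "empty" marker

// Returns the slot if this thread either claimed it or found its own key already
// stored; returns nullptr on a collision so the caller can probe the next bucket.
__device__ int64_t* claim_or_match(int64_t* slot, const int64_t key) {
  const unsigned long long old =
      atomicCAS(reinterpret_cast<unsigned long long*>(slot),
                static_cast<unsigned long long>(EMPTY_SLOT_SENTINEL),
                static_cast<unsigned long long>(key));
  if (old == static_cast<unsigned long long>(EMPTY_SLOT_SENTINEL) ||
      old == static_cast<unsigned long long>(key)) {
    return slot;
  }
  return nullptr;
}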
template <typename T>
__device__ int32_t get_matching_group_value_columnar_slot(int64_t* groups_buffer,
                                                          const uint32_t entry_count,
                                                          const uint32_t h,
                                                          const T* key,
                                                          const uint32_t key_count) {
  const T empty_key = get_empty_key<T>();
  const T old = atomicCAS(reinterpret_cast<T*>(groups_buffer + h), empty_key, *key);
  // the winning thread writes the remaining key components, one column (entry_count) apart
  if (old == empty_key) {
    uint32_t offset = h + entry_count;
    for (size_t i = 1; i < key_count; ++i) {
      *reinterpret_cast<T*>(groups_buffer + offset) = key[i];
      offset += entry_count;
    }
  }
  // every other thread verifies that the slot holds its key
  if (old != empty_key) {
    uint32_t offset = h;
    for (uint32_t i = 0; i < key_count; ++i) {
      if (*reinterpret_cast<T*>(groups_buffer + offset) != key[i]) {
        return -1;
      }
      offset += entry_count;
    }
  }
  return h;
}

extern "C" __device__ int32_t get_matching_group_value_columnar_slot(int64_t* groups_buffer,
                                                                     const uint32_t entry_count,
                                                                     const uint32_t h,
                                                                     const int64_t* key,
                                                                     const uint32_t key_count,
                                                                     const uint32_t key_width) {
  switch (key_width) {
    case 4:
      return get_matching_group_value_columnar_slot(
          groups_buffer, entry_count, h, reinterpret_cast<const unsigned int*>(key), key_count);
    case 8:
      return get_matching_group_value_columnar_slot(
          groups_buffer, entry_count, h, reinterpret_cast<const unsigned long long*>(key), key_count);
    default:
      return -1;
  }
}
extern "C" __device__ int64_t* get_matching_group_value_columnar(
    int64_t* groups_buffer,
    const uint32_t h,
    const int64_t* key,
    const uint32_t key_qw_count,
    const size_t entry_count) {
  uint32_t off = h;
  const uint64_t old = atomicCAS(
      reinterpret_cast<unsigned long long*>(groups_buffer + off), EMPTY_KEY_64, *key);
  if (old == EMPTY_KEY_64) {
    // the winning thread writes the key, one quadword per column
    for (size_t i = 0; i < key_qw_count; ++i) {
      groups_buffer[off] = key[i];
      off += entry_count;
    }
    return &groups_buffer[off];
  }
  // losing threads verify the existing key
  off = h;
  for (size_t i = 0; i < key_qw_count; ++i) {
    if (groups_buffer[off] != key[i]) {
      return NULL;
    }
    off += entry_count;
  }
  return &groups_buffer[off];
}
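
// --- Hedged helper (ours, not the file's) making the columnar key layout explicit:
// component i of the key owned by hash slot h lives entry_count slots apart.
#include <cstddef>
#include <cstdint>

__host__ __device__ inline size_t columnar_key_slot(const uint32_t h,
                                                    const size_t i,
                                                    const size_t entry_count) {
  return h + i * entry_count;
}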
__device__ int64_t atomicMax64(int64_t* address, int64_t val) {
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, max((long long)val, (long long)assumed));
  } while (assumed != old);
  return old;
}

__device__ int64_t atomicMin64(int64_t* address, int64_t val) {
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, min((long long)val, (long long)assumed));
  } while (assumed != old);
  return old;
}
#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600)
__device__ double atomicAdd(double* address, double val) {
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull,
                    assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
    // uses integer comparison to avoid hanging on NaN (NaN != NaN)
  } while (assumed != old);
  return __longlong_as_double(old);
}
#endif
__device__ double atomicMax(double* address, double val) {
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull,
                    assumed,
                    __double_as_longlong(max(val, __longlong_as_double(assumed))));
  } while (assumed != old);
  return __longlong_as_double(old);
}
__device__ float atomicMax(float* address, float val) {
  int* address_as_int = (int*)address;
  int old = *address_as_int, assumed;
  do {
    assumed = old;
    old = atomicCAS(
        address_as_int, assumed, __float_as_int(max(val, __int_as_float(assumed))));
  } while (assumed != old);
  return __int_as_float(old);
}
__device__ double atomicMin(double* address, double val) {
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull,
                    assumed,
                    __double_as_longlong(min(val, __longlong_as_double(assumed))));
  } while (assumed != old);
  return __longlong_as_double(old);
}
__device__ double atomicMin(float* address, float val) {
  int* address_as_ull = (int*)address;
  int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(
        address_as_ull, assumed, __float_as_int(min(val, __int_as_float(assumed))));
  } while (assumed != old);
  return __int_as_float(old);
}
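
// --- Standalone, illustrative test (separate translation unit, not part of this
// file): the same compare-and-swap emulation exercised end to end. The helper is
// renamed atomic_max_double so it does not collide with the overloads above.
#include <cstdio>
#include <cuda_runtime.h>

__device__ double atomic_max_double(double* address, double val) {
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull,
                    assumed,
                    __double_as_longlong(fmax(val, __longlong_as_double(assumed))));
  } while (assumed != old);
  return __longlong_as_double(old);
}

__global__ void block_max(const double* in, double* out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    atomic_max_double(out, in[i]);
  }
}

int main() {
  const int n = 1024;
  double host_in[n];
  for (int i = 0; i < n; ++i) {
    host_in[i] = (i * 37) % 1000;  // arbitrary test data; maximum is 999
  }
  double *dev_in, *dev_out;
  const double init = -1e300;
  cudaMalloc(&dev_in, n * sizeof(double));
  cudaMalloc(&dev_out, sizeof(double));
  cudaMemcpy(dev_in, host_in, n * sizeof(double), cudaMemcpyHostToDevice);
  cudaMemcpy(dev_out, &init, sizeof(double), cudaMemcpyHostToDevice);
  block_max<<<(n + 255) / 256, 256>>>(dev_in, dev_out, n);
  double result;
  cudaMemcpy(&result, dev_out, sizeof(double), cudaMemcpyDeviceToHost);
  printf("max = %f (expected 999)\n", result);
  cudaFree(dev_in);
  cudaFree(dev_out);
  return 0;
}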
extern "C" __device__ uint64_t agg_count_shared(uint64_t* agg, const int64_t val) {
  return static_cast<uint64_t>(atomicAdd(reinterpret_cast<uint32_t*>(agg), 1U));
}
extern "C" __device__ uint64_t agg_count_if_shared(uint64_t* agg, const int64_t cond) {
  return cond ? static_cast<uint64_t>(atomicAdd(reinterpret_cast<uint32_t*>(agg), 1U))
              : static_cast<uint64_t>(*(reinterpret_cast<uint32_t*>(agg)));
}
extern "C" __device__ uint32_t agg_count_int32_shared(uint32_t* agg, const int32_t val) {
  return atomicAdd(agg, 1U);
}
extern "C" __device__ uint32_t agg_count_if_int32_shared(uint32_t* agg, const int32_t cond) {
  return cond ? atomicAdd(agg, 1U) : *agg;
}
extern "C" __device__ int64_t agg_sum_shared(int64_t* agg, const int64_t val) {
  return atomicAdd(reinterpret_cast<unsigned long long*>(agg), val);
}
extern "C" __device__ int32_t agg_sum_int32_shared(int32_t* agg, const int32_t val) {
  return atomicAdd(agg, val);
}
extern "C" __device__ void agg_sum_float_shared(int32_t* agg, const float val) {
  atomicAdd(reinterpret_cast<float*>(agg), val);
}
extern "C" __device__ void agg_sum_double_shared(int64_t* agg, const double val) {
  atomicAdd(reinterpret_cast<double*>(agg), val);
}
// The conditional (_if_) sum variants perform the same atomic adds when cond is set;
// only their add statements survive in this excerpt:
//   agg_sum_if_shared:        static_assert(sizeof(int64_t) == sizeof(unsigned long long));
//                             return atomicAdd(reinterpret_cast<unsigned long long*>(agg), val);
//   agg_sum_if_int32_shared:  return atomicAdd(agg, val);
//   agg_sum_if_float_shared:  atomicAdd(reinterpret_cast<float*>(agg), val);
//   agg_sum_if_double_shared: atomicAdd(reinterpret_cast<double*>(agg), val);

extern "C" __device__ void agg_max_double_shared(int64_t* agg, const double val) {
  atomicMax(reinterpret_cast<double*>(agg), val);
}
extern "C" __device__ void agg_max_float_shared(int32_t* agg, const float val) {
  atomicMax(reinterpret_cast<float*>(agg), val);
}
#if CUDA_VERSION > 10000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
__device__ void atomicMax16(int16_t* agg, const int16_t val) {
  unsigned short int* address_as_us = reinterpret_cast<unsigned short int*>(agg);
  unsigned short int old = *address_as_us, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_us,
                    assumed,
                    static_cast<unsigned short>(max(static_cast<short int>(val),
                                                    static_cast<short int>(assumed))));
  } while (assumed != old);
}
#else
__device__ void atomicMax16(int16_t* agg, const int16_t val) {
  // operate on the aligned 32-bit word that contains the 16-bit slot
  unsigned int* base_address_u32 =
      reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
  unsigned int old_value = *base_address_u32;
  unsigned int swap_value, compare_value;
  do {
    compare_value = old_value;
    swap_value =
        (reinterpret_cast<size_t>(agg) & 0x2)
            ? static_cast<unsigned int>(max(static_cast<int16_t>(old_value >> 16), val))
                      << 16 |
                  (old_value & 0xFFFF)
            : (old_value & 0xFFFF0000) |
                  static_cast<unsigned int>(
                      max(static_cast<int16_t>(old_value & 0xFFFF), val));
    old_value = atomicCAS(base_address_u32, compare_value, swap_value);
  } while (old_value != compare_value);
}
#endif
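
// --- Host-side illustration (not from this file) of the aligned-word addressing the
// fallback uses: (address & ~0x3) locates the containing 32-bit word and
// (address & 0x2) selects the halfword. Assumes a little-endian host, matching the GPU.
#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
  alignas(4) int16_t pair[2] = {7, -3};
  for (int i = 0; i < 2; ++i) {
    const size_t addr = reinterpret_cast<size_t>(&pair[i]);
    const uint32_t* base = reinterpret_cast<const uint32_t*>(addr & ~size_t(0x3));
    const bool high_half = (addr & 0x2) != 0;
    const uint32_t word = *base;
    const int16_t extracted = high_half ? static_cast<int16_t>(word >> 16)
                                        : static_cast<int16_t>(word & 0xFFFF);
    printf("slot %d: high_half=%d value=%d\n", i, static_cast<int>(high_half), extracted);
  }
  return 0;
}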
__device__ void atomicMax8(int8_t* agg, const int8_t val) {
  // operate on the aligned 32-bit word that contains the byte
  unsigned int* base_address_u32 =
      reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
  constexpr unsigned int byte_permutations[] = {0x3214, 0x3240, 0x3410, 0x4210};
  unsigned int old_value = *base_address_u32;
  unsigned int swap_value, compare_value;
  do {
    compare_value = old_value;
    // extract the current byte, compare with val, and splice the max back in
    auto max_value = static_cast<unsigned int>(
        max(val,
            static_cast<int8_t>(__byte_perm(
                compare_value, 0, (reinterpret_cast<size_t>(agg) & 0x3) | 0x4440))));
    swap_value = __byte_perm(
        compare_value, max_value, byte_permutations[reinterpret_cast<size_t>(agg) & 0x3]);
    old_value = atomicCAS(base_address_u32, compare_value, swap_value);
  } while (compare_value != old_value);
}
#if CUDA_VERSION > 10000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
__device__ void atomicMin16(int16_t* agg, const int16_t val) {
  unsigned short int* address_as_us = reinterpret_cast<unsigned short int*>(agg);
  unsigned short int old = *address_as_us, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_us,
                    assumed,
                    static_cast<unsigned short>(min(static_cast<short int>(val),
                                                    static_cast<short int>(assumed))));
  } while (assumed != old);
}
#else
__device__ void atomicMin16(int16_t* agg, const int16_t val) {
  // operate on the aligned 32-bit word that contains the 16-bit slot
  unsigned int* base_address_u32 =
      reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
  unsigned int old_value = *base_address_u32;
  unsigned int swap_value, compare_value;
  do {
    compare_value = old_value;
    swap_value =
        (reinterpret_cast<size_t>(agg) & 0x2)
            ? static_cast<unsigned int>(min(static_cast<int16_t>(old_value >> 16), val))
                      << 16 |
                  (old_value & 0xFFFF)
            : (old_value & 0xFFFF0000) |
                  static_cast<unsigned int>(
                      min(static_cast<int16_t>(old_value & 0xFFFF), val));
    old_value = atomicCAS(base_address_u32, compare_value, swap_value);
  } while (old_value != compare_value);
}
#endif
__device__ void atomicMin16SkipVal(int16_t* agg,
                                   const int16_t val,
                                   const int16_t skip_val) {
  unsigned int* base_address_u32 =
      reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
  unsigned int old_value = *base_address_u32;
  unsigned int swap_value, compare_value;
  do {
    compare_value = old_value;
    int16_t selected_old_val = (reinterpret_cast<size_t>(agg) & 0x2)
                                   ? static_cast<int16_t>(old_value >> 16)
                                   : static_cast<int16_t>(old_value & 0xFFFF);
    swap_value =
        (reinterpret_cast<size_t>(agg) & 0x2)
            ? static_cast<unsigned int>(
                  selected_old_val == skip_val ? val : min(selected_old_val, val))
                      << 16 |
                  (old_value & 0xFFFF)
            : (old_value & 0xFFFF0000) |
                  static_cast<unsigned int>(
                      selected_old_val == skip_val ? val : min(selected_old_val, val));
    old_value = atomicCAS(base_address_u32, compare_value, swap_value);
  } while (old_value != compare_value);
}
__device__ void atomicMin8(int8_t* agg, const int8_t val) {
  unsigned int* base_address_u32 =
      reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
  constexpr unsigned int byte_permutations[] = {0x3214, 0x3240, 0x3410, 0x4210};
  unsigned int old_value = *base_address_u32;
  unsigned int swap_value, compare_value;
  do {
    compare_value = old_value;
    auto min_value = static_cast<unsigned int>(
        min(val,
            static_cast<int8_t>(__byte_perm(
                compare_value, 0, (reinterpret_cast<size_t>(agg) & 0x3) | 0x4440))));
    swap_value = __byte_perm(
        compare_value, min_value, byte_permutations[reinterpret_cast<size_t>(agg) & 0x3]);
    old_value = atomicCAS(base_address_u32, compare_value, swap_value);
  } while (compare_value != old_value);
}
__device__ void atomicMin8SkipVal(int8_t* agg, const int8_t val, const int8_t skip_val) {
  unsigned int* base_address_u32 =
      reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
  constexpr unsigned int byte_permutations[] = {0x3214, 0x3240, 0x3410, 0x4210};
  unsigned int old_value = *base_address_u32;
  unsigned int swap_value, compare_value;
  do {
    compare_value = old_value;
    int8_t selected_old_val = static_cast<int8_t>(
        __byte_perm(compare_value, 0, (reinterpret_cast<size_t>(agg) & 0x3) | 0x4440));
    auto min_value = static_cast<unsigned int>(
        selected_old_val == skip_val ? val : min(val, selected_old_val));
    swap_value = __byte_perm(
        compare_value, min_value, byte_permutations[reinterpret_cast<size_t>(agg) & 0x3]);
    old_value = atomicCAS(base_address_u32, compare_value, swap_value);
  } while (compare_value != old_value);
}

extern "C" __device__ void agg_min_double_shared(int64_t* agg, const double val) {
  atomicMin(reinterpret_cast<double*>(agg), val);
}
extern "C" __device__ void agg_min_float_shared(int32_t* agg, const float val) {
  atomicMin(reinterpret_cast<float*>(agg), val);
}
extern "C" __device__ int8_t* agg_id_varlen_shared(int8_t* varlen_buffer,
                                                   const int64_t offset,
                                                   const int8_t* value,
                                                   const int64_t size_bytes) {
  for (auto i = 0; i < size_bytes; i++) {
    varlen_buffer[offset + i] = value[i];
  }
  return &varlen_buffer[offset];
}
extern "C" __device__ int32_t checked_single_agg_id_shared(int64_t* agg,
                                                           const int64_t val,
                                                           const int64_t null_val) {
  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(agg);
  unsigned long long int old = *address_as_ull, assumed;
  if (val == null_val) {
    return 0;
  }
  do {
    if (static_cast<int64_t>(old) != null_val) {
      if (static_cast<int64_t>(old) != val) {
        // a different non-null value is already stored (error return elided in this excerpt)
      }
      break;
    }
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, val);
  } while (assumed != old);
  return 0;
}
#define DEF_AGG_ID_INT_SHARED(n)                                            \
  extern "C" __device__ void agg_id_int##n##_shared(int##n##_t* agg,        \
                                                    const int##n##_t val) { \
    *agg = val;                                                             \
  }

#undef DEF_AGG_ID_INT_SHARED
extern "C" __device__ void agg_id_double_shared(int64_t* agg, const double val) {
  *agg = *(reinterpret_cast<const int64_t*>(&val));
}
extern "C" __device__ int32_t checked_single_agg_id_double_shared(int64_t* agg,
                                                                  const double val,
                                                                  const double null_val) {
  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(agg);
  unsigned long long int old = *address_as_ull, assumed;
  if (val == null_val) {
    return 0;
  }
  do {
    if (static_cast<int64_t>(old) != __double_as_longlong(null_val)) {
      if (static_cast<int64_t>(old) != __double_as_longlong(val)) {
        // a different non-null value is already stored (error return elided in this excerpt)
      }
      break;
    }
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val));
  } while (assumed != old);
  return 0;
}
extern "C" __device__ void agg_id_double_shared_slow(int64_t* agg, const double* val) {
  *agg = *(reinterpret_cast<const int64_t*>(val));
}
extern "C" __device__ int32_t checked_single_agg_id_double_shared_slow(int64_t* agg,
                                                                       const double* valp,
                                                                       const double null_val) {
  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(agg);
  unsigned long long int old = *address_as_ull, assumed;
  double val = *valp;
  if (val == null_val) {
    return 0;
  }
  do {
    if (static_cast<int64_t>(old) != __double_as_longlong(null_val)) {
      if (static_cast<int64_t>(old) != __double_as_longlong(val)) {
        // a different non-null value is already stored (error return elided in this excerpt)
      }
      break;
    }
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val));
  } while (assumed != old);
  return 0;
}
extern "C" __device__ void agg_id_float_shared(int32_t* agg, const float val) {
  *agg = __float_as_int(val);
}
extern "C" __device__ int32_t checked_single_agg_id_float_shared(int32_t* agg,
                                                                 const float val,
                                                                 const float null_val) {
  int* address_as_ull = reinterpret_cast<int*>(agg);
  int old = *address_as_ull, assumed;
  if (val == null_val) {
    return 0;
  }
  do {
    if (old != __float_as_int(null_val)) {
      if (old != __float_as_int(val)) {
        // a different non-null value is already stored (error return elided in this excerpt)
      }
      break;
    }
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, __float_as_int(val));
  } while (assumed != old);
  return 0;
}
#define DEF_SKIP_AGG(base_agg_func)                                      \
  extern "C" __device__ ADDR_T base_agg_func##_skip_val_shared(          \
      ADDR_T* agg, const DATA_T val, const DATA_T skip_val) {            \
    if (val != skip_val) {                                               \
      return base_agg_func##_shared(agg, val);                           \
#define DATA_T int64_t
#define ADDR_T uint64_t

#define DATA_T int32_t
#define ADDR_T uint32_t
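
// --- Hedged sketch of what one DEF_SKIP_AGG instantiation presumably expands to.
// The instantiation list and the macro's final return are elided above, so the choice
// of base_agg_func (agg_count) and the skip-path return value are assumptions; the
// _sketch suffix marks this as illustration, not the real symbol.
extern "C" __device__ uint64_t agg_count_skip_val_shared_sketch(uint64_t* agg,
                                                                const int64_t val,
                                                                const int64_t skip_val) {
  if (val != skip_val) {
    return agg_count_shared(agg, val);  // forward to the unconditional aggregate
  }
  return *agg;  // assumed fallthrough when the value equals the sentinel
}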
// The narrow-width skip-val wrappers that follow (agg_max_int32_skip_val_shared,
// agg_max_int16_skip_val_shared, agg_min_int16_skip_val_shared,
// agg_max_int8_skip_val_shared, agg_min_int8_skip_val_shared) all share one shape:
//
//   extern "C" __device__ void agg_<op>_int<N>_skip_val_shared(
//       int<N>_t* agg, const int<N>_t val, const int<N>_t skip_val) {
//     if (val != skip_val) {
//       // forward to the matching atomic helper
//     }
//   }
__device__ int32_t atomicMin32SkipVal(int32_t* address, int32_t val, const int32_t skip_val) {
  int32_t old = atomicExch(address, INT_MAX);
  return atomicMin(address, old == skip_val ? val : min(old, val));
}

extern "C" __device__ void agg_min_int32_skip_val_shared(int32_t* agg, const int32_t val, const int32_t skip_val) {
  if (val != skip_val) {
    atomicMin32SkipVal(agg, val, skip_val);
  }
}

__device__ int32_t atomicSum32SkipVal(int32_t* address, const int32_t val, const int32_t skip_val) {
  unsigned int* address_as_int = (unsigned int*)address;
  int32_t old = atomicExch(address_as_int, 0);
  int32_t old2 = atomicAdd(address_as_int, old == skip_val ? val : (val + old));
  return old == skip_val ? old2 : (old2 + old);
}

extern "C" __device__ int32_t agg_sum_int32_skip_val_shared(int32_t* agg, const int32_t val, const int32_t skip_val) {
  if (val != skip_val) {
    return atomicSum32SkipVal(agg, val, skip_val);
  }
  return *agg;
}
// (agg_sum_if_int32_skip_val_shared: only its signature appears in this excerpt)

__device__ int64_t atomicSum64SkipVal(int64_t* address, const int64_t val, const int64_t skip_val) {
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  int64_t old = atomicExch(address_as_ull, 0);
  int64_t old2 = atomicAdd(address_as_ull, old == skip_val ? val : (val + old));
  return old == skip_val ? old2 : (old2 + old);
}

extern "C" __device__ int64_t agg_sum_skip_val_shared(int64_t* agg, const int64_t val, const int64_t skip_val) {
  if (val != skip_val) {
    return atomicSum64SkipVal(agg, val, skip_val);
  }
  return *agg;
}

// (agg_sum_if_skip_val_shared: only its signature appears in this excerpt)
__device__ int64_t atomicMin64SkipVal(int64_t* address, int64_t val, const int64_t skip_val) {
  unsigned long long int* address_as_ull =
      reinterpret_cast<unsigned long long int*>(address);
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull,
                    assumed,
                    assumed == skip_val ? val : min((long long)val, (long long)assumed));
  } while (assumed != old);
  return old;
}

extern "C" __device__ void agg_min_skip_val_shared(int64_t* agg, const int64_t val, const int64_t skip_val) {
  if (val != skip_val) {
    atomicMin64SkipVal(agg, val, skip_val);
  }
}
__device__ int64_t atomicMax64SkipVal(int64_t* address, int64_t val, const int64_t skip_val) {
  unsigned long long int* address_as_ull =
      reinterpret_cast<unsigned long long int*>(address);
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull,
                    assumed,
                    assumed == skip_val ? val : max((long long)val, (long long)assumed));
  } while (assumed != old);
  return old;
}

extern "C" __device__ void agg_max_skip_val_shared(int64_t* agg, const int64_t val, const int64_t skip_val) {
  if (val != skip_val) {
    atomicMax64SkipVal(agg, val, skip_val);
  }
}
#define DEF_SKIP_AGG(base_agg_func)                                      \
  extern "C" __device__ ADDR_T base_agg_func##_skip_val_shared(          \
      ADDR_T* agg, const DATA_T val, const DATA_T skip_val) {            \
    if (val != skip_val) {                                               \
      return base_agg_func##_shared(agg, val);                           \

#define DATA_T double
#define ADDR_T uint64_t

#define DATA_T float
#define ADDR_T uint32_t
extern "C" __device__ void agg_max_float_skip_val_shared(int32_t* agg, const float val, const float skip_val) {
  if (__float_as_int(val) != __float_as_int(skip_val)) {
    float old = atomicExch(reinterpret_cast<float*>(agg), -FLT_MAX);
    atomicMax(reinterpret_cast<float*>(agg),
              __float_as_int(old) == __float_as_int(skip_val) ? val : fmaxf(old, val));
  }
}

__device__ float atomicMinFltSkipVal(int32_t* address, float val, const float skip_val) {
  float old = atomicExch(reinterpret_cast<float*>(address), FLT_MAX);
  return atomicMin(
      reinterpret_cast<float*>(address),
      __float_as_int(old) == __float_as_int(skip_val) ? val : fminf(old, val));
}

extern "C" __device__ void agg_min_float_skip_val_shared(int32_t* agg, const float val, const float skip_val) {
  if (__float_as_int(val) != __float_as_int(skip_val)) {
    atomicMinFltSkipVal(agg, val, skip_val);
  }
}
__device__ void atomicSumFltSkipVal(float* address, const float val, const float skip_val) {
  float old = atomicExch(address, 0.f);
  atomicAdd(address, __float_as_int(old) == __float_as_int(skip_val) ? val : (val + old));
}

extern "C" __device__ void agg_sum_float_skip_val_shared(int32_t* agg, const float val, const float skip_val) {
  if (__float_as_int(val) != __float_as_int(skip_val)) {
    atomicSumFltSkipVal(reinterpret_cast<float*>(agg), val, skip_val);
  }
}

// (agg_sum_if_float_skip_val_shared: only its signature appears in this excerpt)
__device__ void atomicSumDblSkipVal(double* address, const double val, const double skip_val) {
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  double old = __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(0.)));
  atomicAdd(address,
            __double_as_longlong(old) == __double_as_longlong(skip_val) ? val : (val + old));
}

extern "C" __device__ void agg_sum_double_skip_val_shared(int64_t* agg, const double val, const double skip_val) {
  if (__double_as_longlong(val) != __double_as_longlong(skip_val)) {
    atomicSumDblSkipVal(reinterpret_cast<double*>(agg), val, skip_val);
  }
}

// (agg_sum_if_double_skip_val_shared: only its signature appears in this excerpt)
__device__ double atomicMinDblSkipVal(double* address, double val, const double skip_val) {
  unsigned long long int* address_as_ull =
      reinterpret_cast<unsigned long long int*>(address);
  unsigned long long int old = *address_as_ull;
  unsigned long long int skip_val_as_ull =
      *reinterpret_cast<const unsigned long long*>(&skip_val);
  unsigned long long int assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull,
                    assumed,
                    assumed == skip_val_as_ull
                        ? *reinterpret_cast<unsigned long long*>(&val)
                        : __double_as_longlong(min(val, __longlong_as_double(assumed))));
  } while (assumed != old);
  return __longlong_as_double(old);
}

extern "C" __device__ void agg_min_double_skip_val_shared(int64_t* agg, const double val, const double skip_val) {
  if (val != skip_val) {
    atomicMinDblSkipVal(reinterpret_cast<double*>(agg), val, skip_val);
  }
}
extern "C" __device__ void agg_max_double_skip_val_shared(int64_t* agg, const double val, const double skip_val) {
  if (__double_as_longlong(val) != __double_as_longlong(skip_val)) {
    double old = __longlong_as_double(atomicExch(
        reinterpret_cast<unsigned long long int*>(agg), __double_as_longlong(-DBL_MAX)));
    atomicMax(reinterpret_cast<double*>(agg),
              __double_as_longlong(old) == __double_as_longlong(skip_val) ? val : fmax(old, val));
  }
}
__device__ bool slotEmptyKeyCAS(int64_t* slot, int64_t new_val, int64_t init_val) {
  auto slot_address = reinterpret_cast<unsigned long long int*>(slot);
  const auto empty_key =
      static_cast<unsigned long long int*>(static_cast<void*>(&init_val));
  const auto new_val_cast =
      static_cast<unsigned long long int*>(static_cast<void*>(&new_val));

  const auto old_val = atomicCAS(slot_address, *empty_key, *new_val_cast);
  if (old_val == *empty_key) {
    return true;
  } else {
    return false;
  }
}
__device__ bool slotEmptyKeyCAS_int32(int32_t* slot, int32_t new_val, int32_t init_val) {
  unsigned int* slot_address = reinterpret_cast<unsigned int*>(slot);
  unsigned int compare_value = static_cast<unsigned int>(init_val);
  unsigned int swap_value = static_cast<unsigned int>(new_val);

  const unsigned int old_value = atomicCAS(slot_address, compare_value, swap_value);
  return old_value == compare_value;
}
__device__ bool slotEmptyKeyCAS_int16(int16_t* slot, int16_t new_val, int16_t init_val) {
  unsigned int* base_slot_address =
      reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(slot) & ~0x3);
  unsigned int old_value = *base_slot_address;
  unsigned int swap_value, compare_value;
  do {
    compare_value = old_value;
    // exit early if the slot no longer holds init_val
    if (static_cast<unsigned int>(init_val) !=
        __byte_perm(
            compare_value, 0, (reinterpret_cast<size_t>(slot) & 0x2 ? 0x3244 : 0x4410))) {
      return false;
    }
    swap_value = __byte_perm(compare_value,
                             static_cast<unsigned int>(new_val),
                             (reinterpret_cast<size_t>(slot) & 0x2) ? 0x5410 : 0x3254);
    old_value = atomicCAS(base_slot_address, compare_value, swap_value);
  } while (compare_value != old_value);
  return true;
}
__device__ bool slotEmptyKeyCAS_int8(int8_t* slot, int8_t new_val, int8_t init_val) {
  unsigned int* base_slot_address =
      reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(slot) & ~0x3);
  constexpr unsigned int byte_permutations[] = {0x3214, 0x3240, 0x3410, 0x4210};
  unsigned int old_value = *base_slot_address;
  unsigned int swap_value, compare_value;
  do {
    compare_value = old_value;
    // exit early if the slot no longer holds init_val
    if (static_cast<unsigned int>(init_val) !=
        __byte_perm(compare_value, 0, (reinterpret_cast<size_t>(slot) & 0x3) | 0x4440)) {
      return false;
    }
    swap_value = __byte_perm(compare_value,
                             static_cast<unsigned int>(new_val),
                             byte_permutations[reinterpret_cast<size_t>(slot) & 0x3]);
    old_value = atomicCAS(base_slot_address, compare_value, swap_value);
  } while (compare_value != old_value);
  return true;
}
#include "../Utils/ChunkIter.cpp"

#define EXECUTE_INCLUDE
#undef EXECUTE_INCLUDE

#include "../Utils/Regexp.cpp"
#include "../Utils/StringLike.cpp"
// string_decode() (body mostly elided in this excerpt):
//   auto chunk_iter = reinterpret_cast<ChunkIter*>(chunk_iter_);
extern "C" __device__ void linear_probabilistic_count(uint8_t* bitmap,
                                                      const uint32_t bitmap_bytes,
                                                      const uint8_t* key_bytes,
                                                      const uint32_t key_len) {
  const uint32_t bit_pos = MurmurHash3(key_bytes, key_len, 0) % (bitmap_bytes * 8);
  const uint32_t word_idx = bit_pos / 32;
  const uint32_t bit_idx = bit_pos % 32;
  atomicOr(((uint32_t*)bitmap) + word_idx, 1 << bit_idx);
}
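
// --- Hedged host-side sketch (not from this file): the textbook linear-counting
// estimate such a bitmap supports, n_hat = -m * ln(V), where V is the fraction of
// bits still zero. The executor's actual estimator may differ.
#include <cmath>
#include <cstdint>

double linear_counting_estimate(const uint8_t* bitmap, const uint32_t bitmap_bytes) {
  const double m = 8.0 * bitmap_bytes;  // total number of bits
  double zero_bits = 0.0;
  for (uint32_t i = 0; i < bitmap_bytes; ++i) {
    zero_bits += 8 - __builtin_popcount(bitmap[i]);  // GCC/Clang builtin assumed
  }
  if (zero_bits == 0.0) {
    return m;  // bitmap saturated; the estimate degenerates
  }
  return -m * std::log(zero_bits / m);
}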
extern "C" __device__ void agg_count_distinct_bitmap_gpu(int64_t* agg,
                                                         const int64_t val,
                                                         const int64_t min_val,
                                                         const int64_t bucket_size,
                                                         const int64_t base_dev_addr,
                                                         const int64_t base_host_addr,
                                                         const uint64_t sub_bitmap_count,
                                                         const uint64_t bitmap_bytes) {
  constexpr unsigned bitmap_element_size = 8 * sizeof(uint32_t);
  auto bitmap_idx = static_cast<uint64_t>(val - min_val);
  if (1 < bucket_size) {
    bitmap_idx /= static_cast<uint64_t>(bucket_size);
  }
  uint64_t const word_idx = bitmap_idx / bitmap_element_size;
  uint32_t const bit_idx = bitmap_idx % bitmap_element_size;
  int64_t const agg_offset = *agg - base_host_addr;
  int64_t const thread_offset = (threadIdx.x & (sub_bitmap_count - 1)) * bitmap_bytes;
  auto* bitmap =
      reinterpret_cast<uint32_t*>(base_dev_addr + agg_offset + thread_offset);
  atomicOr(bitmap + word_idx, 1u << bit_idx);
}
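
// --- Hedged host-side sketch (not from this file): because this bitmap is exact
// (one bit per bucketed value), the distinct count is simply the number of set bits
// once the per-thread sub-bitmaps have been OR-ed together.
#include <cstdint>
#include <vector>

uint64_t bitmap_cardinality(const std::vector<uint32_t>& merged_bitmap) {
  uint64_t n = 0;
  for (const uint32_t word : merged_bitmap) {
    n += __builtin_popcount(word);  // GCC/Clang builtin assumed
  }
  return n;
}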
extern "C" __device__ void agg_count_distinct_bitmap_skip_val_gpu(
    int64_t* agg, const int64_t val, const int64_t min_val, const int64_t bucket_size,
    const int64_t skip_val, const int64_t base_dev_addr, const int64_t base_host_addr,
    const uint64_t sub_bitmap_count, const uint64_t bitmap_bytes) {
  if (val != skip_val) {
    agg_count_distinct_bitmap_gpu(agg, val, min_val, bucket_size, base_dev_addr,
                                  base_host_addr, sub_bitmap_count, bitmap_bytes);
  }
}
extern "C" __device__ void agg_approximate_count_distinct_gpu(
    int64_t* agg,
    const int64_t key,
    const uint32_t b,
    const int64_t base_dev_addr,
    const int64_t base_host_addr) {
  const uint64_t hash = MurmurHash64A(&key, sizeof(key), 0);
  const uint32_t index = hash >> (64 - b);
  const int32_t rank = get_rank(hash << b, 64 - b);
  const int64_t host_addr = *agg;
  int32_t* M = (int32_t*)(base_dev_addr + host_addr - base_host_addr);
  atomicMax(M + index, rank);
}

extern "C" __device__ void force_sync() {
  __threadfence_block();
}
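
// --- Hedged host-side sketch (not from this file): the b-bit bucket registers feed
// the usual HyperLogLog estimate. This is the raw, bias-uncorrected formula; the
// executor's actual estimator may apply further corrections.
#include <cmath>
#include <cstdint>
#include <vector>

double hll_raw_estimate(const std::vector<int32_t>& M, const uint32_t b) {
  const double m = static_cast<double>(1u << b);    // number of registers
  const double alpha = 0.7213 / (1.0 + 1.079 / m);  // standard bias constant (large m)
  double harmonic = 0.0;
  for (const int32_t rank : M) {
    harmonic += std::ldexp(1.0, -rank);  // 2^(-rank)
  }
  return alpha * m * m / harmonic;
}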
// sync_warp_protected() (excerpt): only sync when not in the warp computing the final rows:
//   if ((((row_count - 1) | 0x1F) - thread_pos) >= 32) { __syncwarp(); }

extern "C" __device__ void write_back_non_grouped_agg(int64_t* input_buffer,
                                                      int64_t* output_buffer,
                                                      const int32_t agg_idx) {
  if (threadIdx.x == agg_idx) {
    agg_sum_shared(output_buffer + agg_idx, input_buffer[agg_idx]);
  }
}
__device__ void sync_warp_protected(int64_t thread_pos, int64_t row_count)
__device__ int32_t checked_single_agg_id_double_shared_slow(int64_t *agg, const double *valp, const double null_val)
__device__ void agg_max_float_shared(int32_t *agg, const float val)
__device__ uint32_t agg_count_float_shared(uint32_t *agg, const float val)
__device__ bool dynamic_watchdog()
__device__ int64_t * get_matching_group_value_columnar(int64_t *groups_buffer, const uint32_t h, const int64_t *key, const uint32_t key_qw_count, const size_t entry_count)
__device__ void agg_max_shared(int64_t *agg, const int64_t val)
RUNTIME_EXPORT ALWAYS_INLINE uint64_t agg_count_if(uint64_t *agg, const int64_t cond)
__device__ void write_back_nop(int64_t *dest, int64_t *src, const int32_t sz)
__device__ void agg_sum_float_skip_val_shared(int32_t *agg, const float val, const float skip_val)
__device__ StringView string_decode(int8_t *chunk_iter_, int64_t pos)
FORCE_INLINE uint8_t get_rank(uint64_t x, uint32_t b)
__device__ void agg_min_int32_shared(int32_t *agg, const int32_t val)
__device__ int8_t thread_warp_idx(const int8_t warp_sz)
__device__ int64_t dw_sm_cycle_start[128]
__device__ void agg_id_float_shared(int32_t *agg, const float val)
__device__ void agg_min_double_shared(int64_t *agg, const double val)
__device__ int64_t get_thread_index()
RUNTIME_EXPORT NEVER_INLINE DEVICE uint64_t MurmurHash64A(const void *key, int len, uint64_t seed)
__device__ int32_t atomicMin32SkipVal(int32_t *address, int32_t val, const int32_t skip_val)
__device__ void agg_sum_if_double_skip_val_shared(int64_t *agg, const double val, const double skip_val, const int8_t cond)
__device__ int32_t pos_step_impl()
__device__ void write_back_non_grouped_agg(int64_t *input_buffer, int64_t *output_buffer, const int32_t agg_idx)
Structures and runtime functions of streaming top-k heap.
__device__ void agg_min_int8_shared(int8_t *agg, const int8_t val)
__device__ int32_t checked_single_agg_id_double_shared(int64_t *agg, const double val, const double null_val)
__device__ float atomicMinFltSkipVal(int32_t *address, float val, const float skip_val)
__device__ const int64_t * init_shared_mem_nop(const int64_t *groups_buffer, const int32_t groups_buffer_size)
__device__ void agg_sum_if_float_shared(int32_t *agg, const float val, const int8_t cond)
__device__ double atomicMin(double *address, double val)
__device__ void agg_max_int8_shared(int8_t *agg, const int8_t val)
__device__ int32_t checked_single_agg_id_float_shared(int32_t *agg, const float val, const float null_val)
__device__ void atomicMin8SkipVal(int8_t *agg, const int8_t val, const int8_t skip_val)
Functions to support geospatial operations used by the executor.
Macros and functions for groupby buffer compaction.
__device__ int64_t * get_matching_group_value(int64_t *groups_buffer, const uint32_t h, const T *key, const uint32_t key_count, const uint32_t row_size_quad)
__device__ uint32_t agg_count_int32_shared(uint32_t *agg, const int32_t val)
__device__ void agg_sum_if_double_shared(int64_t *agg, const double val, const int8_t cond)
__device__ int64_t dw_cycle_budget
__device__ int64_t agg_sum_shared(int64_t *agg, const int64_t val)
__device__ void agg_id_double_shared_slow(int64_t *agg, const double *val)
__device__ int32_t agg_sum_if_int32_shared(int32_t *agg, const int32_t val, const int8_t cond)
DEVICE void ChunkIter_get_nth(ChunkIter *it, int n, bool uncompress, VarlenDatum *result, bool *is_end)
__device__ void agg_min_float_shared(int32_t *agg, const float val)
__device__ int8_t * agg_id_varlen_shared(int8_t *varlen_buffer, const int64_t offset, const int8_t *value, const int64_t size_bytes)
__device__ int64_t atomicMin64(int64_t *address, int64_t val)
__device__ int64_t * declare_dynamic_shared_memory()
__device__ void agg_max_double_shared(int64_t *agg, const double val)
__device__ void atomicSumDblSkipVal(double *address, const double val, const double skip_val)
__device__ int32_t agg_sum_int32_shared(int32_t *agg, const int32_t val)
__device__ int64_t agg_sum_skip_val_shared(int64_t *agg, const int64_t val, const int64_t skip_val)
__device__ void agg_sum_float_shared(int32_t *agg, const float val)
__device__ int64_t agg_sum_if_shared(int64_t *agg, const int64_t val, const int8_t cond)
__device__ void agg_id_double_shared(int64_t *agg, const double val)
__device__ void agg_max_skip_val_shared(int64_t *agg, const int64_t val, const int64_t skip_val)
__device__ void atomicMax16(int16_t *agg, const int16_t val)
#define DEF_SKIP_AGG(base_agg_func)
__device__ int64_t get_block_index()
__device__ void agg_min_float_skip_val_shared(int32_t *agg, const float val, const float skip_val)
__device__ bool check_interrupt()
__device__ bool slotEmptyKeyCAS_int32(int32_t *slot, int32_t new_val, int32_t init_val)
__device__ int64_t atomicSum64SkipVal(int64_t *address, const int64_t val, const int64_t skip_val)
__device__ int32_t agg_sum_int32_skip_val_shared(int32_t *agg, const int32_t val, const int32_t skip_val)
__device__ void agg_min_int32_skip_val_shared(int32_t *agg, const int32_t val, const int32_t skip_val)
__device__ void linear_probabilistic_count(uint8_t *bitmap, const uint32_t bitmap_bytes, const uint8_t *key_bytes, const uint32_t key_len)
RUNTIME_EXPORT ALWAYS_INLINE uint64_t agg_count_double(uint64_t *agg, const double val)
__device__ void atomicSumFltSkipVal(float *address, const float val, const float skip_val)
__device__ void agg_sum_double_shared(int64_t *agg, const double val)
__inline__ __device__ uint32_t get_smid(void)
__device__ int64_t agg_sum_if_skip_val_shared(int64_t *agg, const int64_t val, const int64_t skip_val, const int8_t cond)
__device__ void agg_min_skip_val_shared(int64_t *agg, const int64_t val, const int64_t skip_val)
__device__ uint64_t agg_count_shared(uint64_t *agg, const int64_t val)
__device__ int64_t atomicMax64(int64_t *address, int64_t val)
__device__ bool slotEmptyKeyCAS(int64_t *slot, int64_t new_val, int64_t init_val)
__device__ uint32_t agg_count_if_int32_shared(uint32_t *agg, const int32_t cond)
__device__ int32_t pos_start_impl(const int32_t *row_index_resume)
__device__ int64_t atomicMax64SkipVal(int64_t *address, int64_t val, const int64_t skip_val)
__device__ void atomicMin16(int16_t *agg, const int16_t val)
__device__ void agg_max_float_skip_val_shared(int32_t *agg, const float val, const float skip_val)
__device__ int32_t runtime_interrupt_flag
__device__ void agg_approximate_count_distinct_gpu(int64_t *agg, const int64_t key, const uint32_t b, const int64_t base_dev_addr, const int64_t base_host_addr)
__device__ void sync_warp()
__device__ void atomicMin16SkipVal(int16_t *agg, const int16_t val, const int16_t skip_val)
__device__ void agg_count_distinct_bitmap_skip_val_gpu(int64_t *agg, const int64_t val, const int64_t min_val, const int64_t bucket_size, const int64_t skip_val, const int64_t base_dev_addr, const int64_t base_host_addr, const uint64_t sub_bitmap_count, const uint64_t bitmap_bytes)
__device__ void agg_sum_double_skip_val_shared(int64_t *agg, const double val, const double skip_val)
__device__ void agg_max_int8_skip_val_shared(int8_t *agg, const int8_t val, const int8_t skip_val)
RUNTIME_EXPORT ALWAYS_INLINE uint32_t agg_count_if_int32(uint32_t *agg, const int32_t cond)
__device__ void agg_max_int16_skip_val_shared(int16_t *agg, const int16_t val, const int16_t skip_val)
__device__ void atomicMin8(int8_t *agg, const int8_t val)
RUNTIME_EXPORT NEVER_INLINE DEVICE uint32_t MurmurHash3(const void *key, int len, const uint32_t seed)
__device__ void agg_min_int16_shared(int16_t *agg, const int16_t val)
__device__ void agg_max_int16_shared(int16_t *agg, const int16_t val)
__device__ const int64_t * init_shared_mem(const int64_t *global_groups_buffer, const int32_t groups_buffer_size)
RUNTIME_EXPORT ALWAYS_INLINE uint32_t agg_count_int32(uint32_t *agg, const int32_t)
__device__ void agg_min_double_skip_val_shared(int64_t *agg, const double val, const double skip_val)
#define DEF_AGG_ID_INT_SHARED(n)
__device__ uint64_t agg_count_double_shared(uint64_t *agg, const double val)
__device__ uint64_t agg_count_if_shared(uint64_t *agg, const int64_t cond)
__device__ T get_empty_key()
__device__ void agg_min_int16_skip_val_shared(int16_t *agg, const int16_t val, const int16_t skip_val)
__device__ void sync_threadblock()
__device__ void agg_min_int8_skip_val_shared(int8_t *agg, const int8_t val, const int8_t skip_val)
RUNTIME_EXPORT ALWAYS_INLINE uint64_t agg_count(uint64_t *agg, const int64_t)
__device__ void atomicMax8(int8_t *agg, const int8_t val)
__device__ void agg_id_shared(int64_t *agg, const int64_t val)
__device__ double atomicMax(double *address, double val)
__device__ void agg_count_distinct_bitmap_gpu(int64_t *agg, const int64_t val, const int64_t min_val, const int64_t bucket_size, const int64_t base_dev_addr, const int64_t base_host_addr, const uint64_t sub_bitmap_count, const uint64_t bitmap_bytes)
__device__ int32_t atomicSum32SkipVal(int32_t *address, const int32_t val, const int32_t skip_val)
__device__ double atomicMinDblSkipVal(double *address, double val, const double skip_val)
__device__ int32_t get_matching_group_value_columnar_slot(int64_t *groups_buffer, const uint32_t entry_count, const uint32_t h, const T *key, const uint32_t key_count)
__device__ void agg_max_int32_shared(int32_t *agg, const int32_t val)
__device__ int32_t checked_single_agg_id_shared(int64_t *agg, const int64_t val, const int64_t null_val)
__device__ void agg_sum_if_float_skip_val_shared(int32_t *agg, const float val, const float skip_val, const int8_t cond)
__device__ void agg_max_int32_skip_val_shared(int32_t *agg, const int32_t val, const int32_t skip_val)
__device__ int32_t dw_abort
__device__ bool slotEmptyKeyCAS_int16(int16_t *slot, int16_t new_val, int16_t init_val)
__device__ void agg_max_double_skip_val_shared(int64_t *agg, const double val, const double skip_val)
FORCE_INLINE HOST DEVICE T align_to_int64(T addr)
__device__ int64_t atomicMin64SkipVal(int64_t *address, int64_t val, const int64_t skip_val)
Functions to support array operations used by the executor.
__device__ void force_sync()
__device__ int32_t agg_sum_if_int32_skip_val_shared(int32_t *agg, const int32_t val, const int32_t skip_val, const int8_t cond)
RUNTIME_EXPORT ALWAYS_INLINE uint32_t agg_count_float(uint32_t *agg, const float val)
__device__ void agg_min_shared(int64_t *agg, const int64_t val)
__device__ bool slotEmptyKeyCAS_int8(int8_t *slot, int8_t new_val, int8_t init_val)
__device__ int32_t group_buff_idx_impl()