OmniSciDB  a5dc49c757
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
CountDistinct.h File Reference

Functions used to work with (approximate) count distinct sets. More...

#include "Descriptors/CountDistinctDescriptor.h"
#include "HyperLogLog.h"
#include "ThirdParty/robin_hood/robin_hood.h"
#include <bitset>
#include <vector>
+ Include dependency graph for CountDistinct.h:
+ This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Typedefs

using CountDistinctDescriptors = std::vector< CountDistinctDescriptor >
 
using CountDistinctSet = robin_hood::unordered_set< int64_t >
 

Functions

size_t bitmap_set_size (const int8_t *bitmap, const size_t bitmap_byte_sz)
 
void bitmap_set_union (int8_t *lhs, int8_t *rhs, const size_t bitmap_sz)
 
void partial_bitmap_union (int8_t *set_vals, const CountDistinctDescriptor &count_distinct_desc)
 
int64_t count_distinct_set_size (const int64_t set_handle, const CountDistinctDescriptor &count_distinct_desc)
 
void count_distinct_set_union (const int64_t new_set_handle, const int64_t old_set_handle, const CountDistinctDescriptor &new_count_distinct_desc, const CountDistinctDescriptor &old_count_distinct_desc)
 

Detailed Description

Functions used to work with (approximate) count distinct sets.

Definition in file CountDistinct.h.

Typedef Documentation

Definition at line 34 of file CountDistinct.h.

using CountDistinctSet = robin_hood::unordered_set<int64_t>

Definition at line 35 of file CountDistinct.h.

Function Documentation

size_t bitmap_set_size ( const int8_t *  bitmap,
const size_t  bitmap_byte_sz 
)
inline

Definition at line 37 of file CountDistinct.h.

Referenced by count_distinct_set_size(), and ResultSet::getNDVEstimator().

37  {
38  const auto bitmap_word_count = bitmap_byte_sz >> 3;
39  const auto bitmap_rem_bytes = bitmap_byte_sz & 7;
40  const auto bitmap64 = reinterpret_cast<const int64_t*>(bitmap);
41  size_t set_size = 0;
42  for (size_t i = 0; i < bitmap_word_count; ++i) {
43  std::bitset<64> word_bitset(bitmap64[i]);
44  set_size += word_bitset.count();
45  }
46  const auto rem_bitmap = reinterpret_cast<const int8_t*>(&bitmap64[bitmap_word_count]);
47  for (size_t i = 0; i < bitmap_rem_bytes; ++i) {
48  std::bitset<8> byte_bitset(rem_bitmap[i]);
49  set_size += byte_bitset.count();
50  }
51  return set_size;
52 }

+ Here is the caller graph for this function:

void bitmap_set_union ( int8_t *  lhs,
int8_t *  rhs,
const size_t  bitmap_sz 
)
inline

Definition at line 54 of file CountDistinct.h.

Referenced by count_distinct_set_union(), and partial_bitmap_union().

54  {
55  for (size_t i = 0; i < bitmap_sz; ++i) {
56  lhs[i] = rhs[i] = lhs[i] | rhs[i];
57  }
58 }

+ Here is the caller graph for this function:

int64_t count_distinct_set_size ( const int64_t  set_handle,
const CountDistinctDescriptor count_distinct_desc 
)
inline

Definition at line 75 of file CountDistinct.h.

References CountDistinctDescriptor::approximate, Bitmap, bitmap_set_size(), CountDistinctDescriptor::bitmap_sz_bits, CountDistinctDescriptor::bitmapSizeBytes(), CHECK, CHECK_GT, CountDistinctDescriptor::device_type, GPU, hll_size(), CountDistinctDescriptor::impl_type_, partial_bitmap_union(), CountDistinctDescriptor::sub_bitmap_count, and UnorderedSet.

Referenced by ResultSet::makeTargetValue(), ResultSet::ResultSetComparator< BUFFER_ITERATOR_TYPE >::materializeCountDistinctColumn(), and ResultSetStorage::reduceSingleRow().

77  {
78  if (!set_handle) {
79  return 0;
80  }
81  if (count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap) {
82  auto set_vals = reinterpret_cast<int8_t*>(set_handle);
83  if (count_distinct_desc.approximate) {
84  CHECK_GT(count_distinct_desc.bitmap_sz_bits, 0);
85  return count_distinct_desc.device_type == ExecutorDeviceType::GPU
86  ? hll_size(reinterpret_cast<const int32_t*>(set_vals),
87  count_distinct_desc.bitmap_sz_bits)
88  : hll_size(reinterpret_cast<const int8_t*>(set_vals),
89  count_distinct_desc.bitmap_sz_bits);
90  }
91  if (count_distinct_desc.sub_bitmap_count > 1) {
92  partial_bitmap_union(set_vals, count_distinct_desc);
93  }
94  return bitmap_set_size(set_vals, count_distinct_desc.bitmapSizeBytes());
95  }
97  return reinterpret_cast<CountDistinctSet*>(set_handle)->size();
98 }
robin_hood::unordered_set< int64_t > CountDistinctSet
Definition: CountDistinct.h:35
CountDistinctImplType impl_type_
size_t hll_size(const T *M, const size_t bitmap_sz_bits)
Definition: HyperLogLog.h:88
#define CHECK_GT(x, y)
Definition: Logger.h:305
void partial_bitmap_union(int8_t *set_vals, const CountDistinctDescriptor &count_distinct_desc)
Definition: CountDistinct.h:61
#define CHECK(condition)
Definition: Logger.h:291
size_t bitmap_set_size(const int8_t *bitmap, const size_t bitmap_byte_sz)
Definition: CountDistinct.h:37

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void count_distinct_set_union ( const int64_t  new_set_handle,
const int64_t  old_set_handle,
const CountDistinctDescriptor new_count_distinct_desc,
const CountDistinctDescriptor old_count_distinct_desc 
)
inline

Definition at line 100 of file CountDistinct.h.

References CountDistinctDescriptor::approximate, Bitmap, bitmap_set_union(), CountDistinctDescriptor::bitmap_sz_bits, CountDistinctDescriptor::bitmapPaddedSizeBytes(), CountDistinctDescriptor::bitmapSizeBytes(), CHECK, CHECK_EQ, CHECK_GE, CPU, CountDistinctDescriptor::device_type, GPU, hll_unify(), CountDistinctDescriptor::impl_type_, CountDistinctDescriptor::sub_bitmap_count, and UnorderedSet.

Referenced by count_distinct_set_union_jit_rt(), and ResultSetStorage::reduceOneCountDistinctSlot().

104  {
105  if (new_count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap) {
106  auto new_set = reinterpret_cast<int8_t*>(new_set_handle);
107  auto old_set = reinterpret_cast<int8_t*>(old_set_handle);
108  if (new_count_distinct_desc.approximate) {
109  CHECK(old_count_distinct_desc.approximate);
110  if (new_count_distinct_desc.device_type == ExecutorDeviceType::GPU &&
111  old_count_distinct_desc.device_type == ExecutorDeviceType::GPU) {
112  hll_unify(reinterpret_cast<int32_t*>(new_set),
113  reinterpret_cast<int32_t*>(old_set),
114  size_t(1) << old_count_distinct_desc.bitmap_sz_bits);
115  } else if (new_count_distinct_desc.device_type == ExecutorDeviceType::GPU &&
116  old_count_distinct_desc.device_type == ExecutorDeviceType::CPU) {
117  hll_unify(reinterpret_cast<int32_t*>(new_set),
118  reinterpret_cast<int8_t*>(old_set),
119  size_t(1) << old_count_distinct_desc.bitmap_sz_bits);
120  } else if (new_count_distinct_desc.device_type == ExecutorDeviceType::CPU &&
121  old_count_distinct_desc.device_type == ExecutorDeviceType::GPU) {
122  hll_unify(reinterpret_cast<int8_t*>(new_set),
123  reinterpret_cast<int32_t*>(old_set),
124  size_t(1) << old_count_distinct_desc.bitmap_sz_bits);
125  } else {
126  CHECK(old_count_distinct_desc.device_type == ExecutorDeviceType::CPU &&
127  new_count_distinct_desc.device_type == ExecutorDeviceType::CPU);
128  hll_unify(reinterpret_cast<int8_t*>(new_set),
129  reinterpret_cast<int8_t*>(old_set),
130  size_t(1) << old_count_distinct_desc.bitmap_sz_bits);
131  }
132  } else {
133  CHECK_EQ(new_count_distinct_desc.sub_bitmap_count,
134  old_count_distinct_desc.sub_bitmap_count);
135  CHECK_GE(old_count_distinct_desc.sub_bitmap_count, size_t(1));
136  // NB: For low cardinality input and if the query ran on GPU the bitmap is
137  // composed of multiple padded sub-bitmaps. Treat them as if they are regular
138  // bitmaps and let count_distinct_set_size take care of additional reduction.
139  const auto bitmap_byte_sz = old_count_distinct_desc.sub_bitmap_count == 1
140  ? old_count_distinct_desc.bitmapSizeBytes()
141  : old_count_distinct_desc.bitmapPaddedSizeBytes();
142  bitmap_set_union(new_set, old_set, bitmap_byte_sz);
143  }
144  } else {
145  CHECK(old_count_distinct_desc.impl_type_ == CountDistinctImplType::UnorderedSet);
146  auto old_set = reinterpret_cast<CountDistinctSet*>(old_set_handle);
147  auto new_set = reinterpret_cast<CountDistinctSet*>(new_set_handle);
148  new_set->insert(old_set->begin(), old_set->end());
149  old_set->insert(new_set->begin(), new_set->end());
150  }
151 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
robin_hood::unordered_set< int64_t > CountDistinctSet
Definition: CountDistinct.h:35
void hll_unify(T1 *lhs, T2 *rhs, const size_t m)
Definition: HyperLogLog.h:107
#define CHECK_GE(x, y)
Definition: Logger.h:306
void bitmap_set_union(int8_t *lhs, int8_t *rhs, const size_t bitmap_sz)
Definition: CountDistinct.h:54
CountDistinctImplType impl_type_
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void partial_bitmap_union ( int8_t *  set_vals,
const CountDistinctDescriptor count_distinct_desc 
)
inline

Definition at line 61 of file CountDistinct.h.

References bitmap_set_union(), CountDistinctDescriptor::bitmapPaddedSizeBytes(), CountDistinctDescriptor::bitmapSizeBytes(), CHECK_EQ, and CountDistinctDescriptor::sub_bitmap_count.

Referenced by count_distinct_set_size().

62  {
63  auto partial_set_vals = set_vals;
64  CHECK_EQ(
65  size_t(0),
66  count_distinct_desc.bitmapPaddedSizeBytes() % count_distinct_desc.sub_bitmap_count);
67  const auto partial_padded_size =
68  count_distinct_desc.bitmapPaddedSizeBytes() / count_distinct_desc.sub_bitmap_count;
69  for (size_t i = 1; i < count_distinct_desc.sub_bitmap_count; ++i) {
70  partial_set_vals += partial_padded_size;
71  bitmap_set_union(set_vals, partial_set_vals, count_distinct_desc.bitmapSizeBytes());
72  }
73 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
void bitmap_set_union(int8_t *lhs, int8_t *rhs, const size_t bitmap_sz)
Definition: CountDistinct.h:54

+ Here is the call graph for this function:

+ Here is the caller graph for this function: