OmniSciDB  a5dc49c757
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
HashTable.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
18 
19 namespace {
20 
21 namespace perfect_hash {
22 
23 void to_set_one_to_one(const int32_t* const ptr4,
24  size_t entry_count,
26  const auto empty = -1;
27  auto ptr = ptr4;
28  for (size_t e = 0; e < entry_count; ++e, ++ptr) {
29  if (*ptr == empty) {
30  continue;
31  }
32 
33  decltype(DecodedJoinHashBufferEntry::key) key;
34  key.push_back(e);
35 
36  decltype(DecodedJoinHashBufferEntry::payload) payload;
37  payload.insert(*ptr);
38 
39  s.insert({std::move(key), std::move(payload)});
40  }
41 }
42 
43 void to_set_one_to_many(const int32_t* const ptr2,
44  const int32_t* const ptr3,
45  const int32_t* const ptr4,
46  size_t entry_count,
48  auto empty = -1;
49  auto ptr = ptr2;
50  for (size_t e = 0; e < entry_count; ++e, ++ptr) {
51  if (*ptr == empty) {
52  continue;
53  }
54 
55  decltype(DecodedJoinHashBufferEntry::key) key;
56  key.push_back(e);
57 
58  int32_t offset = ptr2[e];
59 
60  int32_t count = ptr3[e];
61 
62  decltype(DecodedJoinHashBufferEntry::payload) payload;
63  for (size_t j = 0; j < static_cast<size_t>(count); ++j) {
64  payload.insert(ptr4[offset + j]);
65  }
66 
67  s.insert({std::move(key), std::move(payload)});
68  }
69 }
70 
71 } // namespace perfect_hash
72 
73 namespace keyed_hash {
74 
75 template <typename T>
76 void to_set_one_to_one(const int8_t* ptr1,
77  size_t entry_count,
78  size_t key_component_count,
80  auto empty = get_empty_key<T>();
81  auto ptr = reinterpret_cast<const T*>(ptr1);
82  for (size_t e = 0; e < entry_count; ++e, ptr += key_component_count) {
83  if (*ptr == empty) {
84  continue;
85  }
86 
87  std::vector<int64_t> key;
88  size_t j = 0;
89  for (; j < key_component_count - 1; ++j) {
90  key.push_back(ptr[j]);
91  }
92 
93  std::set<int32_t> payload;
94  payload.insert(ptr[j]);
95 
96  s.insert({std::move(key), std::move(payload)});
97  }
98 }
99 
100 template <typename T>
101 void to_set_one_to_many(const int8_t* ptr1,
102  const int32_t* const ptr2,
103  const int32_t* const ptr3,
104  const int32_t* const ptr4,
105  size_t entry_count,
106  size_t key_component_count,
108  auto empty = get_empty_key<T>();
109  auto ptr = reinterpret_cast<const T*>(ptr1);
110  for (size_t e = 0; e < entry_count; ++e, ptr += key_component_count) {
111  if (*ptr == empty) {
112  continue;
113  }
114 
115  std::vector<int64_t> key;
116  size_t j = 0;
117  for (; j < key_component_count - 1; ++j) {
118  key.push_back(ptr[j]);
119  }
120 
121  int32_t offset = ptr2[e];
122 
123  int32_t count = ptr3[e];
124 
125  decltype(DecodedJoinHashBufferEntry::payload) payload;
126  for (size_t j = 0; j < static_cast<size_t>(count); ++j) {
127  payload.insert(ptr4[offset + j]);
128  }
129 
130  s.insert({std::move(key), std::move(payload)});
131  }
132 }
133 
134 } // namespace keyed_hash
135 
136 } // anonymous namespace
137 
140  size_t key_component_count, // number of key parts
141  size_t key_component_width, // width of a key part
142  size_t entry_count, // number of hashable entries
143  const int8_t* ptr1, // keys
144  const int8_t* ptr2, // offsets
145  const int8_t* ptr3, // counts
146  const int8_t* ptr4, // payloads (rowids)
147  size_t buffer_size) { // total memory size
149 
150  CHECK_LE(ptr1, ptr2);
151  CHECK_LE(ptr2, ptr3);
152  CHECK_LE(ptr3, ptr4);
153  CHECK_LE(ptr4, ptr1 + buffer_size);
154 
155  bool have_keys = ptr2 > ptr1;
156  bool have_offsets = ptr3 > ptr2;
157  bool have_counts = ptr4 > ptr3;
158  bool have_payloads = (ptr1 + buffer_size) > ptr4;
159 
160  auto i32ptr2 = reinterpret_cast<const int32_t*>(ptr2);
161  auto i32ptr3 = reinterpret_cast<const int32_t*>(ptr3);
162  auto i32ptr4 = reinterpret_cast<const int32_t*>(ptr4);
163 
164  if (have_keys) { // BaselineJoinHashTable or BoundingBoxIntersectJoinHashTable
165  CHECK(key_component_width == 8 || key_component_width == 4);
166  if (key_component_width == 8) {
167  if (!have_offsets && !have_counts) {
168  keyed_hash::to_set_one_to_one<int64_t>(ptr1, entry_count, key_component_count, s);
169  } else {
170  keyed_hash::to_set_one_to_many<int64_t>(
171  ptr1, i32ptr2, i32ptr3, i32ptr4, entry_count, key_component_count, s);
172  }
173  } else if (key_component_width == 4) {
174  if (!have_offsets && !have_counts) {
175  keyed_hash::to_set_one_to_one<int32_t>(ptr1, entry_count, key_component_count, s);
176  } else {
177  keyed_hash::to_set_one_to_many<int32_t>(
178  ptr1, i32ptr2, i32ptr3, i32ptr4, entry_count, key_component_count, s);
179  }
180  }
181  } else { // JoinHashTable
182  if (!have_offsets && !have_counts && have_payloads) {
183  perfect_hash::to_set_one_to_one(i32ptr4, entry_count, s);
184  } else {
185  perfect_hash::to_set_one_to_many(i32ptr2, i32ptr3, i32ptr4, entry_count, s);
186  }
187  }
188 
189  return s;
190 }
191 
192 namespace {
193 
194 template <typename T>
195 void inner_to_string(const int8_t* ptr1,
196  size_t entry_count,
197  size_t key_component_count,
198  bool raw,
199  std::string& txt) {
200  auto empty = get_empty_key<T>();
201  auto ptr = reinterpret_cast<const T*>(ptr1);
202  for (size_t e = 0; e < entry_count; ++e, ptr += key_component_count) {
203  if (e != 0) {
204  txt += " ";
205  }
206  if (*ptr == empty && !raw) {
207  txt += "*"; // null hash table entry
208  } else if (*ptr == empty - 1 && !raw) {
209  txt += "?"; // write_pending (should never happen here)
210  } else {
211  txt += "(";
212  for (size_t j = 0; j < key_component_count; ++j) {
213  if (j != 0) {
214  txt += ",";
215  }
216  txt += std::to_string(ptr[j]);
217  }
218  txt += ")";
219  }
220  }
221 }
222 
223 } // anonymous namespace
224 
227  const std::string& type, // perfect, keyed, or geo
228  const std::string& layout_type, // one-to-one, one-to-many, many-to-many
229  size_t key_component_count, // number of key parts
230  size_t key_component_width, // width of a key part
231  size_t entry_count, // number of hashable entries
232  const int8_t* ptr1, // keys
233  const int8_t* ptr2, // offsets
234  const int8_t* ptr3, // counts
235  const int8_t* ptr4, // payloads (rowids)
236  size_t buffer_size, // total memory size
237  bool raw) {
238  std::string txt;
239 
240  CHECK(ptr1 <= ptr2);
241  CHECK(ptr2 <= ptr3);
242  CHECK(ptr3 <= ptr4);
243  CHECK(ptr4 <= ptr1 + buffer_size);
244 
245  bool have_keys = ptr2 > ptr1;
246  bool have_offsets = ptr3 > ptr2;
247  bool have_counts = ptr4 > ptr3;
248  bool have_payloads = (ptr1 + buffer_size) > ptr4;
249 
250  // table heading
251  txt += "| " + type;
252  if (!have_offsets && !have_counts) {
253  txt += layout_type;
254  } else if (have_offsets && have_counts) {
255  txt += layout_type;
256  } else {
257  CHECK(false);
258  }
259 
260  // first section: keys
261  if (have_keys) {
262  CHECK(key_component_width == 8 || key_component_width == 4);
263 
264  if (!txt.empty()) {
265  txt += " ";
266  }
267  txt += "| keys ";
268 
269  if (key_component_width == 8) {
270  inner_to_string<int64_t>(ptr1, entry_count, key_component_count, raw, txt);
271  } else if (key_component_width == 4) {
272  inner_to_string<int32_t>(ptr1, entry_count, key_component_count, raw, txt);
273  }
274  }
275 
276  // second section: offsets
277  if (have_offsets) {
278  if (!txt.empty()) {
279  txt += " ";
280  }
281  txt += "| offsets ";
282 
283  auto i32ptr2 = reinterpret_cast<const int32_t*>(ptr2);
284  auto i32ptr3 = reinterpret_cast<const int32_t*>(ptr3);
285  for (size_t i = 0; &i32ptr2[i] < i32ptr3; ++i) {
286  if (i != 0) {
287  txt += " ";
288  }
289  if (i32ptr2[i] == -1 && !raw) {
290  txt += "*"; // null
291  } else {
292  txt += std::to_string(i32ptr2[i]);
293  }
294  }
295  }
296 
297  // third section: counts
298  if (have_counts) {
299  if (!txt.empty()) {
300  txt += " ";
301  }
302  txt += "| counts ";
303 
304  auto i32ptr3 = reinterpret_cast<const int32_t*>(ptr3);
305  auto i32ptr4 = reinterpret_cast<const int32_t*>(ptr4);
306  for (size_t i = 0; &i32ptr3[i] < i32ptr4; ++i) {
307  if (i != 0) {
308  txt += " ";
309  }
310  if (i32ptr3[i] == 0 && !raw) {
311  txt += "*"; // null
312  } else {
313  txt += std::to_string(i32ptr3[i]);
314  }
315  }
316  }
317 
318  // fourth section: payloads (rowids)
319  if (have_payloads) {
320  if (!txt.empty()) {
321  txt += " ";
322  }
323  txt += "| payloads ";
324 
325  auto i32ptr4 = reinterpret_cast<const int32_t*>(ptr4);
326  auto i32ptr5 = reinterpret_cast<const int32_t*>(ptr1 + buffer_size);
327  for (size_t i = 0; &i32ptr4[i] < i32ptr5; ++i) {
328  if (i != 0) {
329  txt += " ";
330  }
331  if (i32ptr4[i] == -1 && !raw) {
332  txt += "*"; // null
333  } else {
334  txt += std::to_string(i32ptr4[i]);
335  }
336  }
337  }
338 
339  if (!txt.empty()) {
340  txt += " |";
341  }
342  return txt;
343 }
void inner_to_string(const int8_t *ptr1, size_t entry_count, size_t key_component_count, bool raw, std::string &txt)
Definition: HashTable.cpp:195
std::string to_string(char const *&&v)
#define CHECK_LE(x, y)
Definition: Logger.h:304
std::set< DecodedJoinHashBufferEntry > DecodedJoinHashBufferSet
Definition: HashTable.h:72
static std::string toString(const std::string &type, const std::string &layout_type, size_t key_component_count, size_t key_component_width, size_t entry_count, const int8_t *ptr1, const int8_t *ptr2, const int8_t *ptr3, const int8_t *ptr4, size_t buffer_size, bool raw=false)
Decode hash table into a human-readable string.
Definition: HashTable.cpp:226
#define CHECK(condition)
Definition: Logger.h:291
std::set< int32_t > payload
Definition: HashTable.h:23
static DecodedJoinHashBufferSet toSet(size_t key_component_count, size_t key_component_width, size_t entry_count, const int8_t *ptr1, const int8_t *ptr2, const int8_t *ptr3, const int8_t *ptr4, size_t buffer_size)
Decode hash table into a std::set for easy inspection and validation.
Definition: HashTable.cpp:139
void to_set_one_to_many(const int32_t *const ptr2, const int32_t *const ptr3, const int32_t *const ptr4, size_t entry_count, DecodedJoinHashBufferSet &s)
Definition: HashTable.cpp:43
std::vector< int64_t > key
Definition: HashTable.h:22
void to_set_one_to_one(const int32_t *const ptr4, size_t entry_count, DecodedJoinHashBufferSet &s)
Definition: HashTable.cpp:23