OmniSciDB  a5dc49c757
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
StringDictionary.h
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <functional>
20 #include <future>
21 #include <map>
22 #include <shared_mutex>
23 #include <string>
24 #include <string_view>
25 #include <tuple>
26 #include <vector>
27 
28 #include "DictRef.h"
29 #include "DictionaryCache.hpp"
30 #include "Shared/DbObjectKeys.h"
31 #include "StringOps/StringOpInfo.h"
32 
34 
36 
// Forward declaration of StringOps_Namespace::StringOpInfo. In the visible part of this
// header the type is only named as the element type of `const std::vector<...>&`
// parameters.
// NOTE(review): "StringOps/StringOpInfo.h" is also included above, which presumably
// makes this forward declaration redundant - confirm before removing either one.
37 namespace StringOps_Namespace {
38 struct StringOpInfo;
39 }
40 
// Exception type derived from std::runtime_error. The conditions under which it is
// thrown are not visible in this header.
// NOTE(review): std::runtime_error is declared in <stdexcept>, which is not among the
// standard headers included directly above - presumably it arrives transitively; confirm.
41 class DictPayloadUnavailable : public std::runtime_error {
42  public:
   // Default form: what() reports the fixed text "DictPayloadUnavailable".
43  DictPayloadUnavailable() : std::runtime_error("DictPayloadUnavailable") {}
44 
   // Forwards the caller-supplied message `err` to std::runtime_error.
   // NOTE(review): this single-argument constructor is not `explicit`, so a std::string
   // converts implicitly to DictPayloadUnavailable - confirm that is intended.
45  DictPayloadUnavailable(const std::string& err) : std::runtime_error(err) {}
46 };
47 
48 class LeafHostInfo;
49 
// 32-bit unsigned type used for string hash values in this header (for example the
// `hashes` output of hashStrings() and the `hash` argument of computeBucket()).
50 using string_dict_hash_t = uint32_t;
51 
// Callable taking (std::string_view, int32_t string_id) and returning bool. It is passed
// as `dest_transient_lookup_callback` to the dictionary translation-map builders below.
// NOTE(review): the meaning of the bool result is not visible in this header - confirm
// against the implementation before relying on it.
52 using StringLookupCallback = std::function<bool(std::string_view, int32_t string_id)>;
53 
55  public:
57  const std::string& folder,
58  const bool isTemp,
59  const bool recover,
60  const bool materializeHashes = false,
61  size_t initial_capacity = 256);
62  StringDictionary(const LeafHostInfo& host, const shared::StringDictKey& dict_key);
63  ~StringDictionary() noexcept;
64 
65  const shared::StringDictKey& getDictKey() const noexcept;
66 
68  public:
69  virtual ~StringCallback() = default;
70  virtual void operator()(std::string const&, int32_t const string_id) = 0;
71  virtual void operator()(std::string_view const, int32_t const string_id) = 0;
72  };
73 
74  // Functors passed to eachStringSerially() must derive from StringCallback.
75  // Each std::string const& (if isClient()) or std::string_view (if !isClient())
76  // plus string_id is passed to the callback functor.
// NOTE(review): `generation` is an int64_t here, while other methods of this class take
// the generation as size_t; presumably it limits which string ids are visited - confirm.
77  void eachStringSerially(int64_t const generation, StringCallback&) const;
// Returns a std::function that maps a std::string const& to an int32_t.
// NOTE(review): presumably the int32_t is the string's id in this dictionary, and the
// returned callable may refer back to this object - confirm lifetime requirements.
78  std::function<int32_t(std::string const&)> makeLambdaStringToId() const;
79  friend class StringLocalCallback;
80 
// Lookup / insert API. The getOrAdd* members are non-const; getBulk, getIdOfString and
// the getString* accessors are const.
// Single-string form: takes a std::string and returns an int32_t; declared noexcept.
81  int32_t getOrAdd(const std::string& str) noexcept;
// Bulk lookup into a caller-provided T* buffer; returns a size_t.
// NOTE(review): no buffer length is passed, so presumably `encoded_vec` must hold at
// least string_vec.size() elements, and the meaning of the returned count is not visible
// here - confirm both against the implementation.
82  template <class T, class String>
83  size_t getBulk(const std::vector<String>& string_vec, T* encoded_vec) const;
// Overload of getBulk that additionally takes a `generation` (int64_t).
84  template <class T, class String>
85  size_t getBulk(const std::vector<String>& string_vec,
86  T* encoded_vec,
87  const int64_t generation) const;
// Non-const bulk form writing into `encoded_vec`; returns nothing.
88  template <class T, class String>
89  void getOrAddBulk(const std::vector<String>& string_vec, T* encoded_vec);
// Same signature as getOrAddBulk.
// NOTE(review): presumably a multi-threaded variant, judging by the name only - confirm.
90  template <class T, class String>
91  void getOrAddBulkParallel(const std::vector<String>& string_vec, T* encoded_vec);
// Nested-vector form: input is a vector of string vectors, output is a vector of
// int32_t vectors passed by non-const reference.
92  template <class String>
93  void getOrAddBulkArray(const std::vector<std::vector<String>>& string_array_vec,
94  std::vector<std::vector<int32_t>>& ids_array_vec);
// Const lookup of a single string; returns an int32_t.
// NOTE(review): presumably INVALID_STR_ID is returned when the string is absent - confirm.
95  template <class String>
96  int32_t getIdOfString(const String&) const;
// Accessors by string id. getString returns an owning std::string; getStringView and
// getStringBytes return non-owning views/pointers.
// NOTE(review): the lifetime of the memory behind the view/pointer is not visible in
// this header - confirm before holding on to it.
97  std::string getString(int32_t string_id) const;
98  std::string_view getStringView(int32_t string_id) const;
99  std::pair<char*, size_t> getStringBytes(int32_t string_id) const noexcept;
100  size_t storageEntryCount() const;
101 
102  template <typename T>
103  std::vector<T> getLike(const std::string& pattern,
104  const bool icase,
105  const bool is_simple,
106  const char escape,
107  const size_t generation) const;
108 
109  template <typename T>
110  std::vector<T> getLikeImpl(const std::string& pattern,
111  const bool icase,
112  const bool is_simple,
113  const char escape,
114  const size_t generation) const;
115 
// Returns a std::vector<int32_t> for a pattern and a comparison operator given as a
// string. Unlike getLike() and getRegexpLike(), this member is not const.
// NOTE(review): the accepted `comp_operator` spellings are not visible in this header -
// confirm against the implementation.
116  std::vector<int32_t> getCompare(const std::string& pattern,
117  const std::string& comp_operator,
118  const size_t generation);
119 
// Const query returning a std::vector<int32_t> for a pattern, an escape character and a
// generation.
// NOTE(review): the class has a cache keyed by std::pair<std::string, char>
// (regex_cache_), so presumably results are cached per (pattern, escape) - confirm.
120  std::vector<int32_t> getRegexpLike(const std::string& pattern,
121  const char escape,
122  const size_t generation) const;
123 
124  std::vector<std::string> copyStrings() const;
125 
126  std::vector<std::string_view> getStringViews() const;
127  std::vector<std::string_view> getStringViews(const size_t generation) const;
128 
129  std::vector<int32_t> buildDictionaryTranslationMap(
130  const std::shared_ptr<StringDictionary> dest_dict,
131  StringLookupCallback const& dest_transient_lookup_callback) const;
132 
134  const StringDictionary* dest_dict,
135  int32_t* translated_ids,
136  const int64_t source_generation,
137  const int64_t dest_generation,
138  const bool dest_has_transients,
139  StringLookupCallback const& dest_transient_lookup_callback,
140  const std::vector<StringOps_Namespace::StringOpInfo>& string_op_infos) const;
141 
143  Datum* translated_ids,
144  const int64_t source_generation,
145  const std::vector<StringOps_Namespace::StringOpInfo>& string_op_infos) const;
146 
147  bool checkpoint() noexcept;
148 
149  bool isClient() const noexcept;
150 
// Populates the provided dest_ids vector with string ids corresponding to the given
// source strings. Static; the destination dictionary is passed as a mutable pointer and
// the source dictionary as a pointer-to-const.
// `transient_string_vec` defaults to an empty vector of std::string const*.
// NOTE(review): how entries of `source_ids` select from `transient_string_vec` is not
// visible in this header - confirm against the implementation.
168  static void populate_string_ids(
169  std::vector<int32_t>& dest_ids,
170  StringDictionary* dest_dict,
171  const std::vector<int32_t>& source_ids,
172  const StringDictionary* source_dict,
173  const std::vector<std::string const*>& transient_string_vec = {});
174 
// Nested-vector counterpart of populate_string_ids(): ids come in and go out as vectors
// of int32_t vectors. It takes no transient-string argument.
// NOTE(review): presumably each inner vector is translated independently - confirm.
175  static void populate_string_array_ids(
176  std::vector<std::vector<int32_t>>& dest_array_ids,
177  StringDictionary* dest_dict,
178  const std::vector<std::vector<int32_t>>& source_array_ids,
179  const StringDictionary* source_dict);
180 
// Sentinel id value (-1).
181  static constexpr int32_t INVALID_STR_ID = -1;
// (1 << 15) - 1 == 32767. This fits in the 16-bit `size` bit-field of StringIdxEntry.
182  static constexpr size_t MAX_STRLEN = (1 << 15) - 1;
// (1U << 31) - 1 == 2147483647, the largest value an int32_t can hold.
183  static constexpr size_t MAX_STRCOUNT = (1U << 31) - 1;
184 
185  void update_leaf(const LeafHostInfo& host_info);
186  size_t computeCacheSize() const;
187 
188  private:
// Two bit-fields on uint64_t totalling 64 bits: 48 bits for `off` and 16 bits for `size`.
// NOTE(review): presumably `off` is the byte offset of a string in the payload storage
// and `size` its length; bit-field layout is implementation-defined, which matters if
// these entries are written to disk - confirm.
189  struct StringIdxEntry {
190  uint64_t off : 48;
191  uint64_t size : 16;
192  };
193 
194  // In compare_cache_value_t, index is the position in the sorted cache.
195  // The diff component records whether the string at that position is equal to
196  // the pattern it is cached for. We keep diff so we don't have to compare the string
197  // again when we are retrieving it from the cache.
199  int32_t index;
200  int32_t diff;
201  };
202 
// Plain aggregate returned by getStringFromStorage(): a raw char pointer, a byte count
// and a flag.
// NOTE(review): presumably `c_str_ptr` is non-owning and not necessarily
// NUL-terminated, and `canary` relates to the CANARY_BUFFER member of the enclosing
// class - confirm against the implementation.
203  struct PayloadString {
204  char* c_str_ptr;
205  size_t size;
206  bool canary;
207  };
208 
210  std::vector<std::future<std::vector<std::pair<string_dict_hash_t, unsigned int>>>>&
211  dictionary_futures);
212  size_t getNumStringsFromStorage(const size_t storage_slots) const noexcept;
213  bool fillRateIsHigh(const size_t num_strings) const noexcept;
214  void increaseHashTableCapacity() noexcept;
215  template <class String>
217  const size_t str_count,
218  const size_t storage_high_water_mark,
219  const std::vector<String>& input_strings,
220  const std::vector<size_t>& string_memory_ids,
221  const std::vector<string_dict_hash_t>& input_strings_hashes) noexcept;
222  int32_t getOrAddImpl(const std::string_view& str) noexcept;
223  template <class String>
224  void hashStrings(const std::vector<String>& string_vec,
225  std::vector<string_dict_hash_t>& hashes) const noexcept;
226 
227  int32_t getUnlocked(const std::string_view sv) const noexcept;
228  std::string getStringUnlocked(int32_t string_id) const noexcept;
229  std::string_view getStringViewUnlocked(int32_t string_id) const noexcept;
230  std::string getStringChecked(const int string_id) const noexcept;
231  std::string_view getStringViewChecked(const int string_id) const noexcept;
232  std::pair<char*, size_t> getStringBytesChecked(const int string_id) const noexcept;
233  template <class String>
234  uint32_t computeBucket(
235  const string_dict_hash_t hash,
236  const String& input_string,
237  const std::vector<int32_t>& string_id_string_dict_hash_table) const noexcept;
238  template <class String>
240  const string_dict_hash_t input_string_hash,
241  const String& input_string,
242  const std::vector<int32_t>& string_id_string_dict_hash_table,
243  const size_t storage_high_water_mark,
244  const std::vector<String>& input_strings,
245  const std::vector<size_t>& string_memory_ids) const noexcept;
247  const string_dict_hash_t hash,
248  const std::vector<int32_t>& string_id_string_dict_hash_table) noexcept;
249  void checkAndConditionallyIncreasePayloadCapacity(const size_t write_length);
250  void checkAndConditionallyIncreaseOffsetCapacity(const size_t write_length);
251 
252  template <class String>
253  void appendToStorage(const String str) noexcept;
254  template <class String>
255  void appendToStorageBulk(const std::vector<String>& input_strings,
256  const std::vector<size_t>& string_memory_ids,
257  const size_t sum_new_strings_lengths) noexcept;
258  PayloadString getStringFromStorage(const int string_id) const noexcept;
259  std::string_view getStringFromStorageFast(const int string_id) const noexcept;
260  void addPayloadCapacity(const size_t min_capacity_requested = 0) noexcept;
261  void addOffsetCapacity(const size_t min_capacity_requested = 0) noexcept;
262  size_t addStorageCapacity(int fd, const size_t min_capacity_requested = 0) noexcept;
263  void* addMemoryCapacity(void* addr,
264  size_t& mem_size,
265  const size_t min_capacity_requested = 0) noexcept;
266  void invalidateInvertedIndex() noexcept;
267  std::vector<int32_t> getEquals(std::string pattern,
268  std::string comp_operator,
269  size_t generation);
270  void buildSortedCache();
271  void insertInSortedCache(std::string str, int32_t str_id);
272  void sortCache(std::vector<int32_t>& cache);
273  void mergeSortedCache(std::vector<int32_t>& temp_sorted_cache);
274  compare_cache_value_t* binary_search_cache(const std::string& pattern) const;
275 
276  const shared::StringDictKey dict_key_;
277  const std::string folder_;
278  size_t str_count_;
279  size_t collisions_;
280  std::vector<int32_t> string_id_string_dict_hash_table_;
282  std::vector<int32_t> sorted_cache;
283  bool isTemp_;
285  std::string offsets_path_;
293  mutable std::shared_mutex rw_mutex_;
294  mutable std::map<std::tuple<std::string, bool, bool, char>, std::vector<int32_t>>
296  mutable std::map<std::tuple<std::string, bool, bool, char>, std::vector<int64_t>>
298  mutable size_t like_cache_size_;
299  mutable std::map<std::pair<std::string, char>, std::vector<int32_t>> regex_cache_;
300  mutable size_t regex_cache_size_;
301  mutable std::map<std::string, int32_t> equal_cache_;
302  mutable size_t equal_cache_size_;
304  mutable size_t compare_cache_size_;
305  mutable std::shared_ptr<std::vector<std::string>> strings_cache_;
306  mutable size_t strings_cache_size_;
307  mutable std::unique_ptr<StringDictionaryClient> client_;
308  mutable std::unique_ptr<StringDictionaryClient> client_no_timeout_;
309 
310  char* CANARY_BUFFER{nullptr};
311  size_t canary_buffer_size = 0;
312 };
313 
// Free function taking an int32_t id and a size_t generation and returning an int32_t.
// NOTE(review): presumably returns `id` unchanged when it belongs to `generation` and an
// invalid id otherwise - the definition is not visible here; confirm.
314 int32_t truncate_to_generation(const int32_t id, const size_t generation);
315 
// Free function that fills `dest_ids` from `source_ids`, identifying the source and
// destination dictionaries by shared::StringDictKey and taking a LeafHostInfo for the
// dictionary server host.
// NOTE(review): `dest_generation` is an int32_t here, whereas the class methods above
// take generations as int64_t or size_t - confirm the intended width.
316 void translate_string_ids(std::vector<int32_t>& dest_ids,
317  const LeafHostInfo& dict_server_host,
318  const shared::StringDictKey& dest_dict_key,
319  const std::vector<int32_t>& source_ids,
320  const shared::StringDictKey& source_dict_key,
321  const int32_t dest_generation);
StringIdxEntry * offset_map_
bool isClient() const noexcept
void increaseHashTableCapacity() noexcept
void checkAndConditionallyIncreasePayloadCapacity(const size_t write_length)
size_t addStorageCapacity(int fd, const size_t min_capacity_requested=0) noexcept
std::vector< int32_t > getRegexpLike(const std::string &pattern, const char escape, const size_t generation) const
size_t getBulk(const std::vector< String > &string_vec, T *encoded_vec) const
std::vector< int32_t > buildDictionaryTranslationMap(const std::shared_ptr< StringDictionary > dest_dict, StringLookupCallback const &dest_transient_lookup_callback) const
std::vector< std::string > copyStrings() const
void buildDictionaryNumericTranslationMap(Datum *translated_ids, const int64_t source_generation, const std::vector< StringOps_Namespace::StringOpInfo > &string_op_infos) const
uint64_t off
const shared::StringDictKey & getDictKey() const noexcept
std::map< std::tuple< std::string, bool, bool, char >, std::vector< int64_t > > like_i64_cache_
std::pair< char *, size_t > getStringBytesChecked(const int string_id) const noexcept
uint64_t size
size_t storageEntryCount() const
StringDictionary(const shared::StringDictKey &dict_key, const std::string &folder, const bool isTemp, const bool recover, const bool materializeHashes=false, size_t initial_capacity=256)
void addOffsetCapacity(const size_t min_capacity_requested=0) noexcept
uint32_t computeBucketFromStorageAndMemory(const string_dict_hash_t input_string_hash, const String &input_string, const std::vector< int32_t > &string_id_string_dict_hash_table, const size_t storage_high_water_mark, const std::vector< String > &input_strings, const std::vector< size_t > &string_memory_ids) const noexcept
std::string getStringChecked(const int string_id) const noexcept
std::vector< string_dict_hash_t > hash_cache_
DictionaryCache< std::string, compare_cache_value_t > compare_cache_
bool fillRateIsHigh(const size_t num_strings) const noexcept
void * addMemoryCapacity(void *addr, size_t &mem_size, const size_t min_capacity_requested=0) noexcept
std::map< std::tuple< std::string, bool, bool, char >, std::vector< int32_t > > like_i32_cache_
std::string offsets_path_
static void populate_string_ids(std::vector< int32_t > &dest_ids, StringDictionary *dest_dict, const std::vector< int32_t > &source_ids, const StringDictionary *source_dict, const std::vector< std::string const * > &transient_string_vec={})
Populates provided dest_ids vector with string ids corresponding to given source strings.
std::vector< T > getLikeImpl(const std::string &pattern, const bool icase, const bool is_simple, const char escape, const size_t generation) const
std::shared_mutex rw_mutex_
std::string_view getStringFromStorageFast(const int string_id) const noexcept
int32_t getOrAdd(const std::string &str) noexcept
std::map< std::pair< std::string, char >, std::vector< int32_t > > regex_cache_
size_t computeCacheSize() const
std::unique_ptr< StringDictionaryClient > client_
std::string getStringUnlocked(int32_t string_id) const noexcept
int32_t getIdOfString(const String &) const
std::string_view getStringView(int32_t string_id) const
static constexpr size_t MAX_STRCOUNT
std::vector< int32_t > getEquals(std::string pattern, std::string comp_operator, size_t generation)
uint32_t computeBucket(const string_dict_hash_t hash, const String &input_string, const std::vector< int32_t > &string_id_string_dict_hash_table) const noexcept
static constexpr int32_t INVALID_STR_ID
std::shared_ptr< std::vector< std::string > > strings_cache_
virtual void operator()(std::string const &, int32_t const string_id)=0
std::vector< int32_t > getCompare(const std::string &pattern, const std::string &comp_operator, const size_t generation)
const std::string folder_
void appendToStorageBulk(const std::vector< String > &input_strings, const std::vector< size_t > &string_memory_ids, const size_t sum_new_strings_lengths) noexcept
void addPayloadCapacity(const size_t min_capacity_requested=0) noexcept
std::map< std::string, int32_t > equal_cache_
uint32_t computeUniqueBucketWithHash(const string_dict_hash_t hash, const std::vector< int32_t > &string_id_string_dict_hash_table) noexcept
void getOrAddBulkArray(const std::vector< std::vector< String >> &string_array_vec, std::vector< std::vector< int32_t >> &ids_array_vec)
int32_t getUnlocked(const std::string_view sv) const noexcept
void appendToStorage(const String str) noexcept
void getOrAddBulk(const std::vector< String > &string_vec, T *encoded_vec)
const shared::StringDictKey dict_key_
std::pair< char *, size_t > getStringBytes(int32_t string_id) const noexcept
DictPayloadUnavailable(const std::string &err)
void processDictionaryFutures(std::vector< std::future< std::vector< std::pair< string_dict_hash_t, unsigned int >>>> &dictionary_futures)
void translate_string_ids(std::vector< int32_t > &dest_ids, const LeafHostInfo &dict_server_host, const shared::StringDictKey &dest_dict_key, const std::vector< int32_t > &source_ids, const shared::StringDictKey &source_dict_key, const int32_t dest_generation)
bool checkpoint() noexcept
void mergeSortedCache(std::vector< int32_t > &temp_sorted_cache)
void eachStringSerially(int64_t const generation, StringCallback &) const
size_t getNumStringsFromStorage(const size_t storage_slots) const noexcept
void update_leaf(const LeafHostInfo &host_info)
std::string getString(int32_t string_id) const
void hashStrings(const std::vector< String > &string_vec, std::vector< string_dict_hash_t > &hashes) const noexcept
std::vector< std::string_view > getStringViews() const
std::unique_ptr< StringDictionaryClient > client_no_timeout_
virtual ~StringCallback()=default
void increaseHashTableCapacityFromStorageAndMemory(const size_t str_count, const size_t storage_high_water_mark, const std::vector< String > &input_strings, const std::vector< size_t > &string_memory_ids, const std::vector< string_dict_hash_t > &input_strings_hashes) noexcept
std::function< int32_t(std::string const &)> makeLambdaStringToId() const
void checkAndConditionallyIncreaseOffsetCapacity(const size_t write_length)
static void populate_string_array_ids(std::vector< std::vector< int32_t >> &dest_array_ids, StringDictionary *dest_dict, const std::vector< std::vector< int32_t >> &source_array_ids, const StringDictionary *source_dict)
void invalidateInvertedIndex() noexcept
uint32_t string_dict_hash_t
std::vector< T > getLike(const std::string &pattern, const bool icase, const bool is_simple, const char escape, const size_t generation) const
std::string_view getStringViewUnlocked(int32_t string_id) const noexcept
std::vector< int32_t > string_id_string_dict_hash_table_
void sortCache(std::vector< int32_t > &cache)
std::string_view getStringViewChecked(const int string_id) const noexcept
static constexpr size_t MAX_STRLEN
void getOrAddBulkParallel(const std::vector< String > &string_vec, T *encoded_vec)
std::function< bool(std::string_view, int32_t string_id)> StringLookupCallback
bool g_enable_stringdict_parallel
PayloadString getStringFromStorage(const int string_id) const noexcept
std::shared_timed_mutex shared_mutex
compare_cache_value_t * binary_search_cache(const std::string &pattern) const
void insertInSortedCache(std::string str, int32_t str_id)
Definition: Datum.h:71
std::vector< int32_t > sorted_cache
int32_t getOrAddImpl(const std::string_view &str) noexcept
int32_t truncate_to_generation(const int32_t id, const size_t generation)
~StringDictionary() noexcept