OmniSciDB  a5dc49c757
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
StringDictionaryProxy.h
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef STRINGDICTIONARY_STRINGDICTIONARYPROXY_H
18 #define STRINGDICTIONARY_STRINGDICTIONARYPROXY_H
19 
20 #include "Logger/Logger.h" // For CHECK macros
21 #include "Shared/misc.h"
22 #include "StringDictionary.h"
23 
24 #include "ThirdParty/robin_hood/robin_hood.h"
25 
26 #include <optional>
27 #include <ostream>
28 #include <shared_mutex>
29 #include <string>
30 #include <string_view>
31 #include <tuple>
32 #include <vector>
33 
34 namespace StringOps_Namespace {
35 struct StringOpInfo;
36 }
37 
38 // used to access a StringDictionary when transient strings are involved
40  public:
43  StringDictionaryProxy(std::shared_ptr<StringDictionary> sd,
44  const shared::StringDictKey& string_dict_key,
45  const int64_t generation);
46 
47  const shared::StringDictKey& getDictKey() const noexcept { return string_dict_key_; };
48 
49  bool operator==(StringDictionaryProxy const&) const;
50  bool operator!=(StringDictionaryProxy const&) const;
51 
52  int32_t getOrAdd(const std::string& str) noexcept;
53  StringDictionary* getDictionary() const noexcept;
54  int64_t getGeneration() const noexcept;
55 
76  std::vector<int32_t> getTransientBulk(const std::vector<std::string>& strings) const;
77  int32_t getOrAddTransient(const std::string&);
78  int32_t getOrAddTransient(const std::string_view);
79  // Not currently used
80  std::vector<int32_t> getOrAddTransientBulk(const std::vector<std::string>& strings);
81  int32_t getIdOfString(const std::string& str) const;
83  const std::string& str) const; // disregard generation, only used by QueryRenderer
84  std::string getString(int32_t string_id) const;
85  std::vector<std::string> getStrings(const std::vector<int32_t>& string_ids) const;
86  std::pair<const char*, size_t> getStringBytes(int32_t string_id) const noexcept;
87 
88  template <typename T>
90  size_t const offset_;
91  std::vector<T> vector_map_;
92  int64_t num_untranslated_strings_{-1};
93  T range_start_{0};
94  T range_end_{0};
95 
96  public:
97  // +1 is added to skip string_id=-1 reserved for INVALID_STR_ID. id_map[-1]==-1.
98  TranslationMap(uint32_t const tran_size, uint32_t const dict_size)
99  : offset_(tran_size + 1), vector_map_(offset_ + dict_size) {}
100  TranslationMap(uint32_t const tran_size, uint32_t const dict_size, const T& init_val)
101  : offset_(tran_size + 1), vector_map_(offset_ + dict_size, init_val) {}
102  TranslationMap(TranslationMap const&) = delete;
103  TranslationMap(TranslationMap&&) = default;
104  bool empty() const { return vector_map_.size() == 1; }
105  inline size_t getIndex(int32_t const id) const { return offset_ + id; }
106  std::vector<T> const& getVectorMap() const { return vector_map_; }
107  size_t size() const { return vector_map_.size(); }
108  size_t numTransients() const { return offset_ - 1; }
109  size_t numNonTransients() const { return vector_map_.size() - offset_; }
110  T* data() { return vector_map_.data(); }
111  T const* data() const { return vector_map_.data(); }
112  int32_t domainStart() const { return -static_cast<int32_t>(offset_); }
113  int32_t domainEnd() const { return static_cast<int32_t>(numNonTransients()); }
114  void setRangeStart(const int32_t range_start) { range_start_ = range_start; }
115  void setRangeEnd(const int32_t range_end) { range_end_ = range_end; }
116  T rangeStart() const { return range_start_; }
117  T rangeEnd() const { return range_end_; }
118 
// The next two methods are used by buildUnionTranslationMapToOtherProxy to
// short-circuit iteration over ids after intersection translation when all ids
// were translated. num_untranslated_strings_ starts at a -1 sentinel meaning
// "not yet calculated"; the getter numUntranslatedStrings() CHECKs that the
// value has been set (is non-negative) before returning it.
125  size_t numUntranslatedStrings() const {
126  CHECK_GE(num_untranslated_strings_, 0L);
127  return static_cast<size_t>(num_untranslated_strings_);
128  }
129  void setNumUntranslatedStrings(const size_t num_untranslated_strings) {
130  num_untranslated_strings_ = static_cast<int64_t>(num_untranslated_strings);
131  }
132  T* storageData() { return vector_map_.data() + offset_; }
133  T& operator[](int32_t const id) { return vector_map_[getIndex(id)]; }
134  T operator[](int32_t const id) const { return vector_map_[getIndex(id)]; }
135  friend std::ostream& operator<<(std::ostream& os, TranslationMap<T> const& sdp_map) {
136  return os << "IdMap(offset_(" << sdp_map.offset_ << ") vector_map_"
137  << shared::printContainer(sdp_map.vector_map_) << ')';
138  }
139  };
140 
142 
143  IdMap initIdMap() const {
144  return IdMap(
146  }
147 
168  TranslationMap<Datum> buildNumericTranslationMap(
169  const std::vector<StringOps_Namespace::StringOpInfo>& string_op_infos) const;
170 
172  const StringDictionaryProxy* dest_proxy,
173  const std::vector<StringOps_Namespace::StringOpInfo>& string_op_infos) const;
174 
176  StringDictionaryProxy* dest_proxy,
177  const std::vector<StringOps_Namespace::StringOpInfo>& string_op_types) const;
178 
188  size_t storageEntryCount() const;
189 
196  size_t transientEntryCount() const;
197 
206  size_t entryCount() const;
207 
208  void updateGeneration(const int64_t generation) noexcept;
209 
210  template <typename T>
211  std::vector<T> getLike(const std::string& pattern,
212  const bool icase,
213  const bool is_simple,
214  const char escape) const;
215 
216  std::vector<int32_t> getCompare(const std::string& pattern,
217  const std::string& comp_operator) const;
218 
219  std::vector<int32_t> getRegexpLike(const std::string& pattern, const char escape) const;
220 
222  using is_transparent = void; // Used by robin_hood to activate heterogenous hashing
223  // std::string and char const* are implicitly cast to std::string_view.
224  size_t operator()(std::string_view const key) const {
225  return robin_hood::hash_bytes(key.data(), key.size());
226  }
227  };
229  using is_transparent = void; // Used by robin_hood to activate heterogenous equal
230  // std::string and char const* are implicitly cast to std::string_view.
231  bool operator()(std::string_view const lhs, std::string_view const rhs) const {
232  return lhs == rhs;
233  }
234  };
235 
236  // The std::string must live in the map, and std::string const* in the vector. As
237  // desirable as it might be to have it the other way, string addresses won't change
238  // in the robin_hood::unordered_node_map when new strings are added, but may change
239  // in a std::vector (and robin_hood::unordered_flat_map).
240  using TransientMap = robin_hood::unordered_node_map<std::string,
241  int32_t,
242  HeterogeneousStringHash,
244 
245  const std::vector<std::string const*>& getTransientVector() const {
246  return transient_string_vec_;
247  }
248 
249  // INVALID_STR_ID = -1 is reserved for invalid string_ids.
250  // Thus the greatest valid transient string_id is -2.
// Converts a transient string id to a zero-based index: -2 -> 0, -3 -> 1, and so on.
// The greatest transient id is -2 because -1 is reserved for INVALID_STR_ID.
static unsigned transientIdToIndex(int32_t const id) {
  constexpr int kGreatestTransientId = -2;
  int const distance_below_greatest = kGreatestTransientId - id;
  return static_cast<unsigned>(distance_below_greatest);
}
255 
// Converts a zero-based transient index back to its string id: 0 -> -2, 1 -> -3, and so
// on. The subtraction is carried out in unsigned arithmetic (as int - unsigned promotes
// to unsigned) and the result is then converted to int32_t.
static int32_t transientIndexToId(unsigned const index) {
  constexpr int kGreatestTransientId = -2;
  unsigned const wrapped_id = static_cast<unsigned>(kGreatestTransientId) - index;
  return static_cast<int32_t>(wrapped_id);
}
260 
261  // Iterate over transient strings, then non-transients.
263 
264  // Union strings from both StringDictionaryProxies into *this as transients.
265  // Return map of old string_ids to new string_ids.
267 
268  private:
269  std::string getStringUnlocked(const int32_t string_id) const;
270  size_t transientEntryCountUnlocked() const;
271  size_t entryCountUnlocked() const;
272  size_t persistedC() const;
273  template <typename String>
274  int32_t getOrAddTransientImpl(String);
275  template <typename String>
276  int32_t lookupTransientStringUnlocked(const String& lookup_string) const;
277  size_t getTransientBulkImpl(const std::vector<std::string>& strings,
278  int32_t* string_ids,
279  const bool take_read_lock) const;
280  template <typename String>
281  size_t transientLookupBulk(const std::vector<String>& lookup_strings,
282  int32_t* string_ids,
283  const bool take_read_lock) const;
284  template <typename String>
285  size_t transientLookupBulkUnlocked(const std::vector<String>& lookup_strings,
286  int32_t* string_ids) const;
287  template <typename String>
288  size_t transientLookupBulkParallelUnlocked(const std::vector<String>& lookup_strings,
289  int32_t* string_ids) const;
290 
292  const StringDictionaryProxy* dest_proxy,
293  const std::vector<StringOps_Namespace::StringOpInfo>& string_op_infos) const;
294 
295  std::shared_ptr<StringDictionary> string_dict_;
298  // Holds pointers into transient_str_to_int_
299  std::vector<std::string const*> transient_string_vec_;
300  int64_t generation_;
302 
303  // Return INVALID_STR_ID if not found on string_dict_. Don't lock or check transients.
304  template <typename String>
305  int32_t getIdOfStringFromClient(String const&) const;
306  template <typename String>
307  int32_t getOrAddTransientUnlocked(String const&);
308 
309  friend class StringLocalCallback;
310  friend class StringNetworkCallback;
311 };
312 #endif // STRINGDICTIONARY_STRINGDICTIONARYPROXY_H
void eachStringSerially(StringDictionary::StringCallback &) const
int32_t getOrAddTransientImpl(String)
void setNumUntranslatedStrings(const size_t num_untranslated_strings)
const shared::StringDictKey string_dict_key_
std::pair< const char *, size_t > getStringBytes(int32_t string_id) const noexcept
size_t transientEntryCountUnlocked() const
const std::vector< std::string const * > & getTransientVector() const
TranslationMap(uint32_t const tran_size, uint32_t const dict_size, const T &init_val)
size_t entryCount() const
Returns the number of total string entries for this proxy, both stored in the underlying dictionary a...
int32_t getIdOfStringNoGeneration(const std::string &str) const
std::string getStringUnlocked(const int32_t string_id) const
size_t storageEntryCount() const
Returns the number of string entries in the underlying string dictionary, at this proxy's generation_...
TranslationMap(uint32_t const tran_size, uint32_t const dict_size)
StringDictionary * getDictionary() const noexcept
#define CHECK_GE(x, y)
Definition: Logger.h:306
size_t transientLookupBulkUnlocked(const std::vector< String > &lookup_strings, int32_t *string_ids) const
StringDictionaryProxy const & operator=(StringDictionaryProxy const &)=delete
size_t transientLookupBulk(const std::vector< String > &lookup_strings, int32_t *string_ids, const bool take_read_lock) const
std::string getString(int32_t string_id) const
IdMap buildIntersectionTranslationMapToOtherProxyUnlocked(const StringDictionaryProxy *dest_proxy, const std::vector< StringOps_Namespace::StringOpInfo > &string_op_infos) const
size_t transientLookupBulkParallelUnlocked(const std::vector< String > &lookup_strings, int32_t *string_ids) const
int32_t getIdOfStringFromClient(String const &) const
std::vector< int32_t > getTransientBulk(const std::vector< std::string > &strings) const
Executes read-only lookup of a vector of strings and returns a vector of their integer ids...
TranslationMap< Datum > buildNumericTranslationMap(const std::vector< StringOps_Namespace::StringOpInfo > &string_op_infos) const
Builds a vectorized string_id translation map from this proxy to dest_proxy.
std::vector< int32_t > getCompare(const std::string &pattern, const std::string &comp_operator) const
TranslationMap< int32_t > IdMap
static constexpr int32_t INVALID_STR_ID
std::shared_ptr< StringDictionary > string_dict_
IdMap transientUnion(StringDictionaryProxy const &)
std::vector< std::string const * > transient_string_vec_
void setRangeEnd(const int32_t range_end)
int32_t lookupTransientStringUnlocked(const String &lookup_string) const
std::vector< std::string > getStrings(const std::vector< int32_t > &string_ids) const
size_t getTransientBulkImpl(const std::vector< std::string > &strings, int32_t *string_ids, const bool take_read_lock) const
size_t operator()(std::string_view const key) const
static int32_t transientIndexToId(unsigned const index)
void updateGeneration(const int64_t generation) noexcept
size_t transientEntryCount() const
Returns the number of transient string entries for this proxy.
IdMap buildUnionTranslationMapToOtherProxy(StringDictionaryProxy *dest_proxy, const std::vector< StringOps_Namespace::StringOpInfo > &string_op_types) const
StringDictionaryProxy(StringDictionaryProxy const &)=delete
void setRangeStart(const int32_t range_start)
int32_t getOrAddTransient(const std::string &)
std::vector< T > getLike(const std::string &pattern, const bool icase, const bool is_simple, const char escape) const
int32_t getOrAddTransientUnlocked(String const &)
bool operator!=(StringDictionaryProxy const &) const
std::vector< int32_t > getRegexpLike(const std::string &pattern, const char escape) const
int32_t getOrAdd(const std::string &str) noexcept
bool operator==(StringDictionaryProxy const &) const
size_t getIndex(int32_t const id) const
std::vector< T > const & getVectorMap() const
std::vector< int32_t > getOrAddTransientBulk(const std::vector< std::string > &strings)
IdMap buildIntersectionTranslationMapToOtherProxy(const StringDictionaryProxy *dest_proxy, const std::vector< StringOps_Namespace::StringOpInfo > &string_op_infos) const
robin_hood::unordered_node_map< std::string, int32_t, HeterogeneousStringHash, HeterogeneousStringEqual > TransientMap
nvtxRangeId_t range_start(const char *)
Definition: nvtx_helpers.h:247
PrintContainer< CONTAINER > printContainer(CONTAINER &container)
Definition: misc.h:108
std::shared_timed_mutex shared_mutex
const shared::StringDictKey & getDictKey() const noexcept
void range_end(nvtxRangeId_t)
Definition: nvtx_helpers.h:253
bool operator()(std::string_view const lhs, std::string_view const rhs) const
size_t persistedC() const
int32_t getIdOfString(const std::string &str) const
static unsigned transientIdToIndex(int32_t const id)
int64_t getGeneration() const noexcept