OmniSciDB  a5dc49c757
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
StringNoneEncoder.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
23 #include "StringNoneEncoder.h"
24 #include <algorithm>
25 #include <cstdlib>
26 #include <memory>
27 #include "MemoryLevel.h"
28 
30 
32  const std::vector<std::string>* srcData,
33  const int start_idx,
34  const size_t numAppendElems,
35  const size_t byteLimit,
36  const bool replicating) {
37  size_t dataSize = 0;
38  size_t n = start_idx;
39  for (; n < start_idx + numAppendElems; n++) {
40  size_t len = (*srcData)[replicating ? 0 : n].length();
41  if (dataSize + len > byteLimit) {
42  break;
43  }
44  dataSize += len;
45  }
46  return n - start_idx;
47 }
48 
50  const int8_t* index_data,
51  const std::vector<size_t>& selected_idx,
52  const size_t byte_limit) {
53  size_t num_elements = 0;
54  size_t data_size = 0;
55  for (const auto& offset_index : selected_idx) {
56  auto element_size = getStringSizeAtIndex(index_data, offset_index);
57  if (data_size + element_size > byte_limit) {
58  break;
59  }
60  data_size += element_size;
61  num_elements++;
62  }
63  return num_elements;
64 }
65 
66 std::shared_ptr<ChunkMetadata> StringNoneEncoder::appendEncodedDataAtIndices(
67  const int8_t* index_data,
68  int8_t* data,
69  const std::vector<size_t>& selected_idx) {
70  std::vector<std::string_view> data_subset;
71  data_subset.reserve(selected_idx.size());
72  for (const auto& offset_index : selected_idx) {
73  data_subset.emplace_back(getStringAtIndex(index_data, data, offset_index));
74  }
75  return appendData(&data_subset, 0, selected_idx.size(), false);
76 }
77 
78 std::shared_ptr<ChunkMetadata> StringNoneEncoder::appendEncodedData(
79  const int8_t* index_data,
80  int8_t* data,
81  const size_t start_idx,
82  const size_t num_elements) {
83  std::vector<std::string_view> data_subset;
84  data_subset.reserve(num_elements);
85  for (size_t count = 0; count < num_elements; ++count) {
86  auto current_index = start_idx + count;
87  data_subset.emplace_back(getStringAtIndex(index_data, data, current_index));
88  }
89  return appendData(&data_subset, 0, num_elements, false);
90 }
91 
92 template <typename StringType>
93 std::shared_ptr<ChunkMetadata> StringNoneEncoder::appendData(
94  const std::vector<StringType>* srcData,
95  const int start_idx,
96  const size_t numAppendElems,
97  const bool replicating) {
98  return appendData(srcData->data(), start_idx, numAppendElems, replicating);
99 }
100 
101 template <typename StringType>
102 std::shared_ptr<ChunkMetadata> StringNoneEncoder::appendData(const StringType* srcData,
103  const int start_idx,
104  const size_t numAppendElems,
105  const bool replicating) {
106  CHECK(index_buf); // index_buf must be set before this.
107  size_t append_index_size = numAppendElems * sizeof(StringOffsetT);
108  if (num_elems_ == 0) {
109  append_index_size += sizeof(StringOffsetT); // plus one for the initial offset of 0.
110  }
111  index_buf->reserve(index_buf->size() + append_index_size);
112  StringOffsetT offset = 0;
113  if (num_elems_ == 0) {
114  index_buf->append((int8_t*)&offset,
115  sizeof(StringOffsetT)); // write the inital 0 offset
116  last_offset = 0;
117  } else {
118  // always need to read a valid last offset from buffer/disk
119  // b/c now due to vacuum "last offset" may go backward and if
120  // index chunk was not reloaded last_offset would go way off!
121  index_buf->read((int8_t*)&last_offset,
122  sizeof(StringOffsetT),
123  index_buf->size() - sizeof(StringOffsetT),
125  CHECK_GE(last_offset, 0);
126  }
127  size_t append_data_size = 0;
128  for (size_t n = start_idx; n < start_idx + numAppendElems; n++) {
129  size_t len = (srcData)[replicating ? 0 : n].length();
130  append_data_size += len;
131  }
132  buffer_->reserve(buffer_->size() + append_data_size);
133 
134  size_t inbuf_size =
135  std::min(std::max(append_index_size, append_data_size), (size_t)MAX_INPUT_BUF_SIZE);
136  auto inbuf = std::make_unique<int8_t[]>(inbuf_size);
137  for (size_t num_appended = 0; num_appended < numAppendElems;) {
138  StringOffsetT* p = reinterpret_cast<StringOffsetT*>(inbuf.get());
139  size_t i;
140  for (i = 0; num_appended < numAppendElems && i < inbuf_size / sizeof(StringOffsetT);
141  i++, num_appended++) {
142  p[i] = last_offset + (srcData)[replicating ? 0 : num_appended + start_idx].length();
143  last_offset = p[i];
144  }
145  index_buf->append(inbuf.get(), i * sizeof(StringOffsetT));
146  }
147 
148  for (size_t num_appended = 0; num_appended < numAppendElems;) {
149  size_t size = 0;
150  for (int i = start_idx + num_appended;
151  num_appended < numAppendElems && size < inbuf_size;
152  i++, num_appended++) {
153  size_t len = (srcData)[replicating ? 0 : i].length();
154  if (len > inbuf_size) {
155  // for large strings, append on its own
156  if (size > 0) {
157  buffer_->append(inbuf.get(), size);
158  }
159  size = 0;
160  buffer_->append((int8_t*)(srcData)[replicating ? 0 : i].data(), len);
161  num_appended++;
162  break;
163  } else if (size + len > inbuf_size) {
164  break;
165  }
166  char* dest = reinterpret_cast<char*>(inbuf.get()) + size;
167  if (len > 0) {
168  (srcData)[replicating ? 0 : i].copy(dest, len);
169  size += len;
170  }
171  update_elem_stats((srcData)[replicating ? 0 : i]);
172  }
173  if (size > 0) {
174  buffer_->append(inbuf.get(), size);
175  }
176  }
177  // make sure buffer_ is flushed even if no new data is appended to it
178  // (e.g. empty strings) because the metadata needs to be flushed.
179  if (!buffer_->isDirty()) {
180  buffer_->setDirty();
181  }
182 
183  num_elems_ += numAppendElems;
184  auto chunk_metadata = std::make_shared<ChunkMetadata>();
185  getMetadata(chunk_metadata);
186  return chunk_metadata;
187 }
188 
189 void StringNoneEncoder::updateStats(const std::vector<std::string>* const src_data,
190  const size_t start_idx,
191  const size_t num_elements) {
192  for (size_t n = start_idx; n < start_idx + num_elements; n++) {
193  update_elem_stats((*src_data)[n]);
194  if (has_nulls) {
195  break;
196  }
197  }
198 }
199 
200 template <typename StringType>
201 void StringNoneEncoder::update_elem_stats(const StringType& elem) {
202  if (!has_nulls && elem.empty()) {
203  has_nulls = true;
204  }
205 }
206 
207 std::pair<StringOffsetT, StringOffsetT> StringNoneEncoder::getStringOffsets(
208  const int8_t* index_data,
209  size_t index) {
210  auto string_offsets = reinterpret_cast<const StringOffsetT*>(index_data);
211  auto current_index = index + 1;
212  auto offset = string_offsets[current_index];
213  CHECK(offset >= 0);
214  int64_t last_offset = string_offsets[current_index - 1];
215  CHECK(last_offset >= 0 && last_offset <= offset);
216  return {offset, last_offset};
217 }
218 
219 size_t StringNoneEncoder::getStringSizeAtIndex(const int8_t* index_data, size_t index) {
220  auto [offset, last_offset] = getStringOffsets(index_data, index);
221  size_t string_byte_size = offset - last_offset;
222  return string_byte_size;
223 }
224 
225 std::string_view StringNoneEncoder::getStringAtIndex(const int8_t* index_data,
226  const int8_t* data,
227  size_t index) {
228  auto [offset, last_offset] = getStringOffsets(index_data, index);
229  size_t string_byte_size = offset - last_offset;
230  auto current_data = reinterpret_cast<const char*>(data + last_offset);
231  return std::string_view{current_data, string_byte_size};
232 }
233 
234 template std::shared_ptr<ChunkMetadata> StringNoneEncoder::appendData<std::string>(
235  const std::vector<std::string>* srcData,
236  const int start_idx,
237  const size_t numAppendElems,
238  const bool replicating);
239 
240 template std::shared_ptr<ChunkMetadata> StringNoneEncoder::appendData<std::string_view>(
241  const std::vector<std::string_view>* srcData,
242  const int start_idx,
243  const size_t numAppendElems,
244  const bool replicating);
245 
246 template std::shared_ptr<ChunkMetadata> StringNoneEncoder::appendData<std::string>(
247  const std::string* srcData,
248  const int start_idx,
249  const size_t numAppendElems,
250  const bool replicating);
251 
252 template std::shared_ptr<ChunkMetadata> StringNoneEncoder::appendData<std::string_view>(
253  const std::string_view* srcData,
254  const int start_idx,
255  const size_t numAppendElems,
256  const bool replicating);
257 
258 template void StringNoneEncoder::update_elem_stats<std::string>(const std::string& elem);
259 template void StringNoneEncoder::update_elem_stats<std::string_view>(
260  const std::string_view& elem);
261 
262 void StringNoneEncoder::getMetadata(const std::shared_ptr<ChunkMetadata>& chunkMetadata) {
263  Encoder::getMetadata(chunkMetadata); // call on parent class
264  chunkMetadata->chunkStats.min.stringval = nullptr;
265  chunkMetadata->chunkStats.max.stringval = nullptr;
266  chunkMetadata->chunkStats.has_nulls = has_nulls;
267 }
268 
269 // Only called from the executor for synthesized meta-information.
270 std::shared_ptr<ChunkMetadata> StringNoneEncoder::getMetadata(const SQLTypeInfo& ti) {
271  auto chunk_stats = ChunkStats{};
272  chunk_stats.min.stringval = nullptr;
273  chunk_stats.max.stringval = nullptr;
274  chunk_stats.has_nulls = has_nulls;
275  return std::make_shared<ChunkMetadata>(ti, 0, 0, chunk_stats);
276 }
static std::string_view getStringAtIndex(const int8_t *index_data, const int8_t *data, size_t index)
size_t num_elems_
Definition: Encoder.h:288
void updateStats(const int64_t, const bool) override
#define MAX_INPUT_BUF_SIZE
Definition: Encoder.h:36
std::shared_ptr< ChunkMetadata > appendData(int8_t *&src_data, const size_t num_elems_to_append, const SQLTypeInfo &ti, const bool replicating=false, const int64_t offset=-1) override
#define CHECK_GE(x, y)
Definition: Logger.h:306
std::shared_ptr< ChunkMetadata > appendEncodedDataAtIndices(const int8_t *index_data, int8_t *data, const std::vector< size_t > &selected_idx) override
size_t getNumElemsForBytesInsertData(const std::vector< std::string > *srcData, const int start_idx, const size_t numAppendElems, const size_t byteLimit, const bool replicating=false)
int32_t StringOffsetT
Definition: sqltypes.h:1495
virtual void read(int8_t *const dst, const size_t num_bytes, const size_t offset=0, const MemoryLevel dst_buffer_type=CPU_LEVEL, const int dst_device_id=-1)=0
virtual void getMetadata(const std::shared_ptr< ChunkMetadata > &chunkMetadata)
Definition: Encoder.cpp:231
AbstractBuffer * index_buf
Data_Namespace::AbstractBuffer * buffer_
Definition: Encoder.h:290
size_t getNumElemsForBytesEncodedDataAtIndices(const int8_t *index_data, const std::vector< size_t > &selected_idx, const size_t byte_limit) override
DEVICE auto copy(ARGS &&...args)
Definition: gpu_enabled.h:51
An AbstractBuffer is a unit of data management for a data manager.
static size_t getStringSizeAtIndex(const int8_t *index_data, size_t index)
std::string * stringval
Definition: Datum.h:81
void update_elem_stats(const StringType &elem)
StringOffsetT last_offset
void getMetadata(const std::shared_ptr< ChunkMetadata > &chunkMetadata) override
std::shared_ptr< ChunkMetadata > appendEncodedData(const int8_t *index_data, int8_t *data, const size_t start_idx, const size_t num_elements) override
virtual void append(int8_t *src, const size_t num_bytes, const MemoryLevel src_buffer_type=CPU_LEVEL, const int device_id=-1)=0
#define CHECK(condition)
Definition: Logger.h:291
For unencoded strings.
constexpr double n
Definition: Utm.h:38
virtual void reserve(size_t num_bytes)=0
static std::pair< StringOffsetT, StringOffsetT > getStringOffsets(const int8_t *index_data, size_t index)