OmniSciDB  a5dc49c757
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
StringTestTableFunctions.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2021 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "TableFunctionsTesting.h"
18 
19 /*
20  This file contains string-related compile-time UDTFs used for testing.
21  */
22 
23 #ifndef __CUDACC__
24 
27  Column<TextEncodingDict>& out_str,
28  Column<int64_t>& out_size) {
29  const int64_t num_rows = input_str.size();
30  set_output_row_size(num_rows);
31  for (int64_t i = 0; i < num_rows; i++) {
32  out_str[i] = input_str[i];
33  const std::string str = input_str.getString(i);
34  out_size[i] = str.size();
35  }
36  return num_rows;
37 }
38 
41  Column<TextEncodingDict>& string_if_equal,
42  Column<bool>& strings_are_equal) {
43  const int64_t num_rows = input_strings.size();
44  const int64_t num_cols = input_strings.numCols();
45  set_output_row_size(num_rows);
46  for (int64_t r = 0; r < num_rows; r++) {
47  bool are_equal = true;
48  if (num_cols > 0) {
49  std::string first_str = input_strings[0].getString(r);
50  for (int64_t c = 1; c != num_cols; ++c) {
51  if (input_strings[c].getString(r) != first_str) {
52  are_equal = false;
53  break;
54  }
55  }
56  strings_are_equal[r] = are_equal;
57  if (are_equal && num_cols > 0) {
58  string_if_equal[r] = input_strings[0][r];
59  } else {
60  string_if_equal.setNull(r);
61  }
62  }
63  }
64  return num_rows;
65 }
66 
69  const Column<TextEncodingDict>& input_str,
70  const Column<int>& pos,
71  const Column<int>& len,
72  Column<TextEncodingDict>& output_substr) {
73  const int64_t num_rows = input_str.size();
74  mgr.set_output_row_size(num_rows);
75  for (int64_t row_idx = 0; row_idx < num_rows; row_idx++) {
76  const std::string input_string{input_str.getString(row_idx)};
77  const std::string substring = input_string.substr(pos[row_idx], len[row_idx]);
78  const TextEncodingDict substr_id = output_substr.getOrAddTransient(substring);
79  output_substr[row_idx] = substr_id;
80  }
81  return num_rows;
82 }
83 
86  const ColumnList<TextEncodingDict>& input_strings,
88  Column<TextEncodingDict>& concatted_string) {
89  const int64_t num_rows = input_strings.size();
90  const int64_t num_cols = input_strings.numCols();
91  const std::string separator_str{separator.getString()};
92  mgr.set_output_row_size(num_rows);
93  for (int64_t row_idx = 0; row_idx < num_rows; row_idx++) {
94  if (num_cols > 0) {
95  std::string concatted_output{input_strings[0].getString(row_idx)};
96  for (int64_t col_idx = 1; col_idx < num_cols; ++col_idx) {
97  concatted_output += separator_str;
98  concatted_output += input_strings[col_idx].getString(row_idx);
99  }
100  const TextEncodingDict concatted_str_id =
101  concatted_string.getOrAddTransient(concatted_output);
102  concatted_string[row_idx] = concatted_str_id;
103  } else {
104  concatted_string.setNull(row_idx);
105  }
106  }
107  return num_rows;
108 }
109 
112  const int64_t num_strings,
113  Column<TextEncodingDict>& new_dict_col) {
114  mgr.set_output_row_size(num_strings);
115  for (int32_t s = 0; s < num_strings; ++s) {
116  const std::string new_string = "String_" + std::to_string(s);
117  const int32_t string_id = new_dict_col.getOrAddTransient(new_string);
118  new_dict_col[s] = string_id;
119  }
120  return num_strings;
121 }
122 
123 #endif // #ifndef __CUDACC__
124 
126  const TextEncodingNone& str2,
127  Column<int32_t>& hamming_distance) {
128  const int32_t str_len = str1.size() <= str2.size() ? str1.size() : str2.size();
129 
130 #ifdef __CUDACC__
131  const int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
132  const int32_t step = blockDim.x * gridDim.x;
133  int32_t* output_ptr = hamming_distance.ptr_;
134 #else
135  const int32_t start = 0;
136  const int32_t step = 1;
137 #endif
138 
139  int32_t num_chars_unequal = 0;
140  for (int32_t i = start; i < str_len; i += step) {
141  num_chars_unequal += (str1[i] != str2[i]) ? 1 : 0;
142  }
143 #ifdef __CUDACC__
144  atomicAdd(output_ptr, num_chars_unequal);
145 #else
146  hamming_distance[0] = num_chars_unequal;
147 #endif
148  return 1;
149 }
150 
151 template <typename T>
153  const TextEncodingNone& str,
154  const int32_t multiplier,
155  Column<int32_t>& idx,
156  Column<int8_t>& char_bytes) {
157  const int32_t str_len = str.size();
158  // Note: we assume RowMultiplier is 1 for this test, was to make running on
159  // GPU easy Todo: Provide Constant RowMultiplier interface
160  if (multiplier != 1) {
161  return 0;
162  }
163  const int32_t num_input_rows = indices.size();
164  const int32_t num_output_rows = num_input_rows * multiplier;
165 
166 #ifdef __CUDACC__
167  const int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
168  const int32_t step = blockDim.x * gridDim.x;
169 #else
170  const int32_t start = 0;
171  const int32_t step = 1;
172 #endif
173 
174  for (int32_t i = start; i < num_output_rows; i += step) {
175  idx[i] = indices[i % num_output_rows];
176  char_bytes[i] = str[i % str_len]; // index < str_len ? str[i] : 0;
177  }
178  return num_output_rows;
179 }
180 
181 // explicit template instantiations
182 template TEMPLATE_NOINLINE int32_t
184  const TextEncodingNone& str,
185  const int32_t multiplier,
186  Column<int32_t>& idx,
187  Column<int8_t>& char_bytes);
188 template TEMPLATE_NOINLINE int32_t
190  const TextEncodingNone& str,
191  const int32_t multiplier,
192  Column<int32_t>& idx,
193  Column<int8_t>& char_bytes);
194 
195 #ifndef __CUDACC__
196 
197 #include <iostream>
198 #include <string>
199 
201  Column<int32_t>& char_idx,
202  Column<int8_t>& char_bytes) {
203  const std::string str{input.getString()};
204  const int64_t str_size(str.size());
205  set_output_row_size(str_size);
206  for (int32_t i = 0; i < str_size; ++i) {
207  char_idx[i] = i;
208  char_bytes[i] = str[i];
209  }
210  return str_size;
211 }
212 
213 #endif // #ifndef __CUDACC__
DEVICE const std::string getString(int64_t index) const
void set_output_row_size(int64_t num_rows)
Definition: heavydbTypes.h:373
EXTENSION_NOINLINE_HOST void set_output_row_size(int64_t num_rows)
#define EXTENSION_NOINLINE
Definition: heavydbTypes.h:58
TEMPLATE_NOINLINE int32_t ct_get_string_chars__template(const Column< T > &indices, const TextEncodingNone &str, const int32_t multiplier, Column< int32_t > &idx, Column< int8_t > &char_bytes)
std::string getString() const
Definition: heavydbTypes.h:641
DEVICE int64_t size() const
DEVICE int64_t size() const
DEVICE void setNull(int64_t index)
EXTENSION_NOINLINE_HOST int32_t ct_binding_str_length__cpu_(const Column< TextEncodingDict > &input_str, Column< TextEncodingDict > &out_str, Column< int64_t > &out_size)
std::string to_string(char const *&&v)
DEVICE ALWAYS_INLINE int64_t size() const
Definition: heavydbTypes.h:688
EXTENSION_NOINLINE_HOST int32_t ct_binding_str_equals__cpu_(const ColumnList< TextEncodingDict > &input_strings, Column< TextEncodingDict > &string_if_equal, Column< bool > &strings_are_equal)
EXTENSION_NOINLINE int32_t ct_hamming_distance(const TextEncodingNone &str1, const TextEncodingNone &str2, Column< int32_t > &hamming_distance)
#define EXTENSION_NOINLINE_HOST
Definition: heavydbTypes.h:55
EXTENSION_NOINLINE_HOST int32_t ct_synthesize_new_dict__cpu_(TableFunctionManager &mgr, const int64_t num_strings, Column< TextEncodingDict > &new_dict_col)
DEVICE int64_t numCols() const
EXTENSION_NOINLINE_HOST int32_t ct_string_concat__cpu_(TableFunctionManager &mgr, const ColumnList< TextEncodingDict > &input_strings, const TextEncodingNone &separator, Column< TextEncodingDict > &concatted_string)
DEVICE int64_t size() const
DEVICE const TextEncodingDict getOrAddTransient(const std::string &str)
EXTENSION_NOINLINE_HOST int32_t ct_string_to_chars__cpu_(const TextEncodingNone &input, Column< int32_t > &char_idx, Column< int8_t > &char_bytes)
EXTENSION_NOINLINE_HOST int32_t ct_substr__cpu_(TableFunctionManager &mgr, const Column< TextEncodingDict > &input_str, const Column< int > &pos, const Column< int > &len, Column< TextEncodingDict > &output_substr)
#define TEMPLATE_NOINLINE
Definition: heavydbTypes.h:60
std::pair< std::string_view, const char * > substring(const std::string &str, size_t substr_length)
return substring of str with postfix if str.size() > substr_length