OmniSciDB  a5dc49c757
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
ExtensionFunctionsText.hpp
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc., Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <cstring>
18 #include "Shared/toString.h"
19 #include "heavydbTypes.h"
20 
21 #ifdef __clang__
22 #pragma clang diagnostic push
23 #pragma clang diagnostic ignored "-Wreturn-type-c-linkage"
24 #endif
25 
26 // To-Do: strtok_to_array with default "delimiters" value
27 
28 #ifndef __CUDACC__
/// Splits `text` into tokens separated by any character found in `delimiters`.
///
/// Token rules mirror strtok(): a run of consecutive delimiter characters
/// counts as a single separator, so no empty tokens are produced, and leading
/// or trailing delimiters are ignored. An empty `delimiters` string yields the
/// whole of `text` as a single token (or no tokens when `text` is empty).
///
/// Unlike strtok(), this implementation never writes into `text` (the previous
/// version cast away constness and let strtok() overwrite delimiters with NUL
/// bytes inside the caller's string) and keeps no state between calls.
/// Embedded NUL characters are treated as ordinary characters instead of
/// terminating the scan.
///
/// @param text        string to tokenize; left unmodified
/// @param delimiters  set of single-character delimiters
/// @return the tokens, in the order they appear in `text`
std::vector<std::string> __strtok_to_array(const std::string& text,
                                           const std::string& delimiters) {
  std::vector<std::string> tokens;

  // Skip any leading delimiters to find the start of the first token.
  std::string::size_type token_begin = text.find_first_not_of(delimiters);
  while (token_begin != std::string::npos) {
    const std::string::size_type token_end =
        text.find_first_of(delimiters, token_begin);
    if (token_end == std::string::npos) {
      // Last token: it runs to the end of the input.
      tokens.emplace_back(text, token_begin);
      break;
    }
    tokens.emplace_back(text, token_begin, token_end - token_begin);
    // Skip the whole run of delimiters that follows this token.
    token_begin = text.find_first_not_of(delimiters, token_end + 1);
  }

  return tokens;
}
45 
48  TextEncodingNone& text,
49  TextEncodingNone& delimiters) {
50  /*
51  Rules
52  -----
53  * If either parameter is NULL => a NULL is returned
54  * An empty array is returned if tokenization produces no tokens
55 
56  Note
57  ----
58  <delimiters> argument is optional on Snowflake, but HeavyDB doesn't support
59  default values on UDFs at the moment. See:
60  https://github.com/heavyai/heavydb-internal/pull/6651
61 
62  Examples
63  --------
64  > select strtok_to_array('a.b.c', '.');
65  {a, b, c}
66 
67  > select strtok_to_array('user@gmail.com', '.@')
68  {user, gmail, com}
69 
70  > select strtok_to_array('', '.')
71  NULL
72 
73  > select strtok_to_array('a.b.c', '')
74  NULL
75  */
76 
77  if (text.isNull() || delimiters.isNull()) {
78  return Array<TextEncodingDict>(0, true);
79  }
80 
81  const auto& vec = __strtok_to_array(text.getString(), delimiters.getString());
82  Array<TextEncodingDict> out_arr(vec.size());
83  for (size_t i = 0; i < vec.size(); ++i) {
84  out_arr[i] = mgr.getOrAddTransient(TRANSIENT_DICT_DB_ID, TRANSIENT_DICT_ID, vec[i]);
85  }
86  return out_arr;
87 }
88 
91  TextEncodingDict text,
92  TextEncodingNone& delimiters) {
93  if (text.isNull() || delimiters.isNull()) {
94  return Array<TextEncodingDict>(0, true);
95  }
96 
97  std::string str = mgr.getString(GET_DICT_DB_ID(mgr, 0), GET_DICT_ID(mgr, 0), text);
98  const auto& vec = __strtok_to_array(str, delimiters.getString());
99  Array<TextEncodingDict> out_arr(vec.size());
100  for (size_t i = 0; i < vec.size(); ++i) {
101  out_arr[i] = mgr.getOrAddTransient(TRANSIENT_DICT_DB_ID, TRANSIENT_DICT_ID, vec[i]);
102  }
103  return out_arr;
104 }
105 #endif // #ifndef __CUDACC__
106 
107 #ifdef __clang__
108 #pragma clang diagnostic pop
109 #endif
#define EXTENSION_NOINLINE
Definition: heavydbTypes.h:58
std::string getString() const
Definition: heavydbTypes.h:641
#define TRANSIENT_DICT_DB_ID
Definition: DbObjectKeys.h:25
#define TRANSIENT_DICT_ID
Definition: DbObjectKeys.h:24
#define GET_DICT_ID(mgr, arg_idx)
Definition: heavydbTypes.h:141
DEVICE ALWAYS_INLINE bool isNull() const
Definition: heavydbTypes.h:232
#define GET_DICT_DB_ID(mgr, arg_idx)
Definition: heavydbTypes.h:139
int32_t getOrAddTransient(int32_t db_id, int32_t dict_id, std::string str)
Definition: heavydbTypes.h:314
DEVICE ALWAYS_INLINE bool isNull() const
Definition: heavydbTypes.h:691
EXTENSION_NOINLINE Array< TextEncodingDict > strtok_to_array(RowFunctionManager &mgr, TextEncodingNone &text, TextEncodingNone &delimiters)
std::string getString(int32_t db_id, int32_t dict_id, int32_t string_id)
Definition: heavydbTypes.h:299
EXTENSION_NOINLINE Array< TextEncodingDict > strtok_to_array__1(RowFunctionManager &mgr, TextEncodingDict text, TextEncodingNone &delimiters)
std::vector< std::string > __strtok_to_array(const std::string &text, const std::string &delimiters)