OmniSciDB  a5dc49c757
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
MLTableFunctions.cpp
Go to the documentation of this file.
1 #include "MLTableFunctions.hpp"
2 
3 using namespace TableFunctions_Namespace;
4 
5 #ifndef __CUDACC__
6 
9  Column<TextEncodingDict>& output_ml_frameworks,
10  Column<bool>& output_availability,
11  Column<bool>& output_default) {
12  const std::vector<std::string> ml_frameworks = {"oneapi", "onedal", "mlpack"};
13  const int32_t num_frameworks = ml_frameworks.size();
14  mgr.set_output_row_size(num_frameworks);
15  const std::vector<int32_t> ml_framework_string_ids =
16  output_ml_frameworks.string_dict_proxy_->getOrAddTransientBulk(ml_frameworks);
17 
18 #if defined(HAVE_ONEDAL) || defined(HAVE_MLPACK)
19  bool found_available_framework = false;
20  auto framework_found_actions = [&output_availability,
21  &output_default,
22  &found_available_framework](const int64_t out_row_idx) {
23  output_availability[out_row_idx] = true;
24  if (!found_available_framework) {
25  output_default[out_row_idx] = true;
26  found_available_framework = true;
27  } else {
28  output_default[out_row_idx] = false;
29  }
30  };
31 #endif
32 
33 #if !defined(HAVE_ONEDAL) || !defined(HAVE_MLPACK)
34  auto framework_not_found_actions = [&output_availability,
35  &output_default](const int64_t out_row_idx) {
36  output_availability[out_row_idx] = false;
37  output_default[out_row_idx] = false;
38  };
39 #endif
40 
41  for (int32_t out_row_idx = 0; out_row_idx < num_frameworks; ++out_row_idx) {
42  output_ml_frameworks[out_row_idx] = ml_framework_string_ids[out_row_idx];
43  if (ml_frameworks[out_row_idx] == "onedal" ||
44  ml_frameworks[out_row_idx] == "oneapi") {
45 #ifdef HAVE_ONEDAL
46  framework_found_actions(out_row_idx);
47 #else
48  framework_not_found_actions(out_row_idx);
49 #endif
50  } else if (ml_frameworks[out_row_idx] == "mlpack") {
51 #ifdef HAVE_MLPACK
52  framework_found_actions(out_row_idx);
53 #else
54  framework_not_found_actions(out_row_idx);
55 #endif
56  }
57  }
58  return num_frameworks;
59 }
60 
61 std::vector<std::string> get_model_features(
62  const std::string& model_name,
63  const std::shared_ptr<AbstractMLModel>& model) {
64  return model->getModelMetadata().getFeatures();
65 }
66 
69  const TextEncodingNone& model_name,
70  const ColumnList<TextEncodingDict>& input_cat_features,
71  const int32_t cat_top_k,
72  const float cat_min_fraction,
73  const TextEncodingNone& preferred_ml_framework_str,
74  const TextEncodingNone& model_metadata,
75  Column<TextEncodingDict>& output_model_name) {
76  CategoricalFeaturesBuilder<double> cat_features_builder(
77  input_cat_features, cat_top_k, cat_min_fraction, false /* cat_include_others */);
78  return pca_fit_impl(mgr,
79  model_name,
80  cat_features_builder.getFeatures(),
81  cat_features_builder.getCatFeatureKeys(),
82  preferred_ml_framework_str,
83  model_metadata,
84  output_model_name);
85 }
86 
89  const TextEncodingNone& model_name,
90  Column<int64_t>& output_coef_idx,
91  Column<TextEncodingDict>& output_feature,
92  Column<int64_t>& output_sub_coef_idx,
93  Column<TextEncodingDict>& output_sub_feature,
94  Column<double>& output_coef) {
95  try {
96  const auto linear_reg_model = std::dynamic_pointer_cast<LinearRegressionModel>(
97  g_ml_models.getModel(model_name));
98  if (!linear_reg_model) {
99  throw std::runtime_error("Model is not of type linear regression.");
100  }
101 
102  const auto& coefs = linear_reg_model->getCoefs();
103  const auto& cat_feature_keys = linear_reg_model->getCatFeatureKeys();
104  const int64_t num_sub_coefs = static_cast<int64_t>(coefs.size());
105  const int64_t num_cat_features = static_cast<int64_t>(cat_feature_keys.size());
106  mgr.set_output_row_size(num_sub_coefs);
107 
108  std::vector<std::string> feature_names =
109  get_model_features(model_name, linear_reg_model);
110  feature_names.insert(feature_names.begin(), "intercept");
111 
112  for (int64_t sub_coef_idx = 0, coef_idx = 0; sub_coef_idx < num_sub_coefs;
113  ++coef_idx) {
114  if (num_cat_features >= coef_idx && coef_idx > 0) {
115  const auto& col_cat_feature_keys = cat_feature_keys[coef_idx - 1];
116  int64_t col_cat_feature_idx = 1;
117  for (const auto& col_cat_feature_key : col_cat_feature_keys) {
118  output_coef_idx[sub_coef_idx] = coef_idx;
119  if (feature_names[coef_idx].empty()) {
120  output_feature[sub_coef_idx] = inline_null_value<TextEncodingDict>();
121  } else {
122  output_feature[sub_coef_idx] =
123  output_feature.getOrAddTransient(feature_names[coef_idx]);
124  }
125  output_sub_coef_idx[sub_coef_idx] = col_cat_feature_idx++;
126  output_sub_feature[sub_coef_idx] =
127  output_sub_feature.getOrAddTransient(col_cat_feature_key);
128  output_coef[sub_coef_idx] = coefs[sub_coef_idx];
129  ++sub_coef_idx;
130  }
131  } else {
132  output_coef_idx[sub_coef_idx] = coef_idx;
133  if (feature_names[coef_idx].empty()) {
134  output_feature[sub_coef_idx] = inline_null_value<TextEncodingDict>();
135  } else {
136  output_feature[sub_coef_idx] =
137  output_feature.getOrAddTransient(feature_names[coef_idx]);
138  }
139  output_sub_coef_idx[sub_coef_idx] = 1;
140  output_sub_feature[sub_coef_idx] = inline_null_value<TextEncodingDict>();
141  output_coef[sub_coef_idx] = coefs[sub_coef_idx];
142  ++sub_coef_idx;
143  }
144  }
145 
146  return num_sub_coefs;
147  } catch (std::runtime_error& e) {
148  return mgr.ERROR_MESSAGE(e.what());
149  }
150 }
151 
154  const Column<TextEncodingDict>& model_name,
155  Column<int64_t>& output_coef_idx,
156  Column<TextEncodingDict>& output_feature,
157  Column<int64_t>& output_sub_coef_idx,
158  Column<TextEncodingDict>& output_sub_feature,
159  Column<double>& output_coef) {
160  if (model_name.size() != 1) {
161  return mgr.ERROR_MESSAGE("Expected only one row in model name CURSOR.");
162  }
163  TextEncodingNone model_name_text_enc_none(mgr, model_name.getString(0));
164  return linear_reg_coefs__cpu_1(mgr,
165  model_name_text_enc_none,
166  output_coef_idx,
167  output_feature,
168  output_sub_coef_idx,
169  output_sub_feature,
170  output_coef);
171 }
172 
175  const TextEncodingNone& model_name,
176  Column<int64_t>& feature_id,
177  Column<TextEncodingDict>& feature,
178  Column<int64_t>& sub_feature_id,
179  Column<TextEncodingDict>& sub_feature,
180  Column<double>& importance_score) {
181 #ifndef HAVE_ONEDAL
182  return mgr.ERROR_MESSAGE(
183  "Only OneDAL framework supported for random forest regression.");
184 #endif
185  try {
186 #ifdef HAVE_ONEDAL
187  const auto base_model = g_ml_models.getModel(model_name);
188  const auto rand_forest_model =
189  std::dynamic_pointer_cast<AbstractRandomForestModel>(base_model);
190  if (!rand_forest_model) {
191  throw std::runtime_error("Model is not of type random forest.");
192  }
193  const auto& variable_importance_scores =
194  rand_forest_model->getVariableImportanceScores();
195  const int64_t num_features = variable_importance_scores.size();
196  mgr.set_output_row_size(num_features);
197  if (num_features == 0) {
198  return mgr.ERROR_MESSAGE("Variable importance not computed for this model.");
199  }
200  if (num_features != rand_forest_model->getNumFeatures()) {
201  return mgr.ERROR_MESSAGE(
202  "Mismatch in number of features and number of variable importance metrics.");
203  }
204  const auto num_logical_features = rand_forest_model->getNumLogicalFeatures();
205  std::vector<std::string> feature_names =
206  get_model_features(model_name, rand_forest_model);
207 
208  int64_t physical_feature_idx = 0;
209  const auto& cat_feature_keys = rand_forest_model->getCatFeatureKeys();
210  const auto num_cat_features = rand_forest_model->getNumCatFeatures();
211  for (int64_t feature_idx = 0; feature_idx < num_logical_features; ++feature_idx) {
212  // Make feature ids start at 1, not 0
213  if (feature_idx < num_cat_features) {
214  const auto& col_cat_feature_keys = cat_feature_keys[feature_idx];
215  int64_t sub_feature_idx = 1;
216  for (const auto& col_cat_feature_key : col_cat_feature_keys) {
217  feature_id[physical_feature_idx] = feature_idx + 1;
218  if (feature_names[feature_idx].empty()) {
219  feature[physical_feature_idx] = inline_null_value<TextEncodingDict>();
220  } else {
221  feature[physical_feature_idx] =
222  feature.getOrAddTransient(feature_names[feature_idx]);
223  }
224  sub_feature_id[physical_feature_idx] = sub_feature_idx++;
225  const TextEncodingDict feature_sub_key =
226  sub_feature.getOrAddTransient(col_cat_feature_key);
227  sub_feature[physical_feature_idx] = feature_sub_key;
228  importance_score[physical_feature_idx] =
229  variable_importance_scores[physical_feature_idx];
230  physical_feature_idx++;
231  }
232  } else {
233  feature_id[physical_feature_idx] = feature_idx + 1;
234  if (feature_names[feature_idx].empty()) {
235  feature[physical_feature_idx] = inline_null_value<TextEncodingDict>();
236  } else {
237  feature[physical_feature_idx] =
238  feature.getOrAddTransient(feature_names[feature_idx]);
239  }
240  sub_feature_id[physical_feature_idx] = 1;
241  sub_feature[physical_feature_idx] = inline_null_value<TextEncodingDict>();
242  importance_score[physical_feature_idx] =
243  variable_importance_scores[physical_feature_idx];
244  physical_feature_idx++;
245  }
246  }
247  return num_features;
248 #endif
249  } catch (std::runtime_error& e) {
250  return mgr.ERROR_MESSAGE(e.what());
251  }
252 }
253 
256  const Column<TextEncodingDict>& model_name,
257  Column<int64_t>& feature_id,
258  Column<TextEncodingDict>& feature,
259  Column<int64_t>& sub_feature_id,
260  Column<TextEncodingDict>& sub_feature,
261  Column<double>& importance_score) {
262  if (model_name.size() != 1) {
263  return mgr.ERROR_MESSAGE("Expected only one row in model name CURSOR.");
264  }
265  TextEncodingNone model_name_text_enc_none(mgr, model_name.getString(0));
267  model_name_text_enc_none,
268  feature_id,
269  feature,
270  sub_feature_id,
271  sub_feature,
272  importance_score);
273 }
274 
277  const TextEncodingNone& model_name,
278  Column<int64_t>& tree_id,
279  Column<int64_t>& entry_id,
280  Column<bool>& is_split_node,
281  Column<int64_t>& feature_id,
282  Column<int64_t>& left_child,
283  Column<int64_t>& right_child,
284  Column<double>& value) {
285 #ifdef HAVE_ONEDAL
286  try {
287  const auto model = g_ml_models.getModel(model_name);
288  const auto tree_model = std::dynamic_pointer_cast<AbstractTreeModel>(model);
289  if (!tree_model) {
290  throw std::runtime_error("Model not a tree-type model.");
291  }
292  const auto num_trees = tree_model->getNumTrees();
293  std::vector<std::vector<DecisionTreeEntry>> decision_trees(num_trees);
294  for (int64_t tree_idx = 0; tree_idx < num_trees; ++tree_idx) {
295  TreeModelVisitor tree_visitor(decision_trees[tree_idx]);
296  tree_model->traverseDF(tree_idx, tree_visitor);
297  }
298  std::vector<int64_t> decision_tree_offsets(num_trees + 1);
299  decision_tree_offsets[0] = 0;
300  for (int64_t tree_idx = 0; tree_idx < num_trees; ++tree_idx) {
301  decision_tree_offsets[tree_idx + 1] =
302  decision_tree_offsets[tree_idx] +
303  static_cast<int64_t>(decision_trees[tree_idx].size());
304  }
305  const auto num_entries = decision_tree_offsets[num_trees];
306  mgr.set_output_row_size(num_entries);
307  for (int64_t tree_idx = 0; tree_idx < num_trees; ++tree_idx) {
308  const auto& decision_tree = decision_trees[tree_idx];
309  const auto output_offset = decision_tree_offsets[tree_idx];
310  const int64_t num_tree_entries = decision_tree.size();
311  for (int64_t entry_idx = 0; entry_idx < num_tree_entries; ++entry_idx) {
312  const int64_t output_idx = output_offset + entry_idx;
313  const auto& tree_entry = decision_tree[entry_idx];
314  const bool entry_is_split_node = tree_entry.isSplitNode();
315  tree_id[output_idx] = tree_idx;
316  entry_id[output_idx] = entry_idx;
317  is_split_node[output_idx] = entry_is_split_node;
318  feature_id[output_idx] = !entry_is_split_node ? inline_null_value<int64_t>()
319  : tree_entry.feature_index;
320  left_child[output_idx] = !entry_is_split_node ? inline_null_value<int64_t>()
321  : tree_entry.left_child_row_idx;
322  right_child[output_idx] = !entry_is_split_node ? inline_null_value<int64_t>()
323  : tree_entry.right_child_row_idx;
324  value[output_idx] = tree_entry.value;
325  }
326  }
327  return num_entries;
328  } catch (std::runtime_error& e) {
329  const std::string error_str(e.what());
330  return mgr.ERROR_MESSAGE(error_str);
331  }
332 #else // Not HAVE_ONEDAL
333  return mgr.ERROR_MESSAGE("OneDAL library must be available for get_decision_trees.");
334 #endif
335 }
336 
339  const Column<TextEncodingDict>& model_name,
340  Column<int64_t>& tree_id,
341  Column<int64_t>& entry_id,
342  Column<bool>& is_split_node,
343  Column<int64_t>& feature_id,
344  Column<int64_t>& left_child,
345  Column<int64_t>& right_child,
346  Column<double>& value) {
347  if (model_name.size() != 1) {
348  return mgr.ERROR_MESSAGE("Expected only one row in model name CURSOR.");
349  }
350  TextEncodingNone model_name_text_enc_none(mgr, model_name.getString(0));
351  return get_decision_trees__cpu_1(mgr,
352  model_name_text_enc_none,
353  tree_id,
354  entry_id,
355  is_split_node,
356  feature_id,
357  left_child,
358  right_child,
359  value);
360 }
361 
363 void check_model_params(const std::shared_ptr<AbstractMLModel>& model,
364  const int64_t num_cat_features,
365  const int64_t num_numeric_features) {
366  if (model->getNumLogicalFeatures() != num_cat_features + num_numeric_features) {
367  std::ostringstream error_oss;
368  error_oss << "Model expects " << model->getNumLogicalFeatures() << " features but "
369  << num_cat_features + num_numeric_features << " were provided.";
370  throw std::runtime_error(error_oss.str());
371  }
372  if (model->getNumCatFeatures() != num_cat_features) {
373  std::ostringstream error_oss;
374  error_oss << "Model expects " << model->getNumCatFeatures()
375  << " categorical features but " << num_cat_features << " were provided.";
376  throw std::runtime_error(error_oss.str());
377  }
378 }
379 
380 #endif // #ifndef __CUDACC__
DEVICE const std::string getString(int64_t index) const
NEVER_INLINE HOST int32_t pca_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
void set_output_row_size(int64_t num_rows)
Definition: heavydbTypes.h:373
EXTENSION_NOINLINE_HOST int32_t get_decision_trees__cpu_2(TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, Column< int64_t > &tree_id, Column< int64_t > &entry_id, Column< bool > &is_split_node, Column< int64_t > &feature_id, Column< int64_t > &left_child, Column< int64_t > &right_child, Column< double > &value)
EXTENSION_NOINLINE_HOST int32_t pca_fit__cpu_1(TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< TextEncodingDict > &input_cat_features, const int32_t cat_top_k, const float cat_min_fraction, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
EXTENSION_NOINLINE_HOST int32_t supported_ml_frameworks__cpu_(TableFunctionManager &mgr, Column< TextEncodingDict > &output_ml_frameworks, Column< bool > &output_availability, Column< bool > &output_default)
EXTENSION_NOINLINE_HOST void check_model_params(const std::shared_ptr< AbstractMLModel > &model, const int64_t num_cat_features, const int64_t num_numeric_features)
EXTENSION_NOINLINE_HOST int32_t random_forest_reg_var_importance__cpu_2(TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, Column< int64_t > &feature_id, Column< TextEncodingDict > &feature, Column< int64_t > &sub_feature_id, Column< TextEncodingDict > &sub_feature, Column< double > &importance_score)
EXTENSION_NOINLINE_HOST int32_t linear_reg_coefs__cpu_1(TableFunctionManager &mgr, const TextEncodingNone &model_name, Column< int64_t > &output_coef_idx, Column< TextEncodingDict > &output_feature, Column< int64_t > &output_sub_coef_idx, Column< TextEncodingDict > &output_sub_feature, Column< double > &output_coef)
StringDictionaryProxy * string_dict_proxy_
EXTENSION_NOINLINE_HOST int32_t linear_reg_coefs__cpu_2(TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, Column< int64_t > &output_coef_idx, Column< TextEncodingDict > &output_feature, Column< int64_t > &output_sub_coef_idx, Column< TextEncodingDict > &output_sub_feature, Column< double > &output_coef)
#define EXTENSION_NOINLINE_HOST
Definition: heavydbTypes.h:55
std::shared_ptr< AbstractMLModel > getModel(const std::string &model_name) const
Definition: MLModel.h:51
EXTENSION_NOINLINE_HOST int32_t random_forest_reg_var_importance__cpu_1(TableFunctionManager &mgr, const TextEncodingNone &model_name, Column< int64_t > &feature_id, Column< TextEncodingDict > &feature, Column< int64_t > &sub_feature_id, Column< TextEncodingDict > &sub_feature, Column< double > &importance_score)
MLModelMap g_ml_models
Definition: MLModel.h:125
std::vector< std::string > get_model_features(const std::string &model_name, const std::shared_ptr< AbstractMLModel > &model)
std::vector< int32_t > getOrAddTransientBulk(const std::vector< std::string > &strings)
DEVICE int64_t size() const
DEVICE const TextEncodingDict getOrAddTransient(const std::string &str)
EXTENSION_NOINLINE_HOST int32_t get_decision_trees__cpu_1(TableFunctionManager &mgr, const TextEncodingNone &model_name, Column< int64_t > &tree_id, Column< int64_t > &entry_id, Column< bool > &is_split_node, Column< int64_t > &feature_id, Column< int64_t > &left_child, Column< int64_t > &right_child, Column< double > &value)
const std::vector< std::vector< std::string > > & getCatFeatureKeys() const