OmniSciDB  a5dc49c757
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
MLPredictCodegen.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2023 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "CodeGenerator.h"
19 #include "TreeModelPredictionMgr.h"
20 
21 #ifdef HAVE_CUDA
23 #include "GpuMemUtils.h"
24 #endif // HAVE_CUDA
25 
26 #include <tbb/parallel_for.h>
27 #include <stack>
28 #include <vector>
29 
30 std::vector<std::shared_ptr<Analyzer::Expr>> generated_encoded_and_casted_features(
31  const std::vector<std::shared_ptr<Analyzer::Expr>>& feature_exprs,
32  const std::vector<std::vector<std::string>>& cat_feature_keys,
33  const std::vector<int64_t>& feature_permutations,
34  Executor* executor) {
35  std::vector<std::shared_ptr<Analyzer::Expr>> casted_feature_exprs;
36  const size_t num_feature_exprs = feature_exprs.size();
37  const size_t num_cat_features = cat_feature_keys.size();
38 
39  if (num_cat_features > num_feature_exprs) {
40  throw std::runtime_error("More categorical keys than features.");
41  }
42 
43  auto get_int_constant_expr = [](int32_t const_val) {
44  Datum d;
45  d.intval = const_val;
46  return makeExpr<Analyzer::Constant>(SQLTypeInfo(kINT, false), false, d);
47  };
48 
49  for (size_t original_feature_idx = 0; original_feature_idx < num_feature_exprs;
50  ++original_feature_idx) {
51  const auto feature_idx = feature_permutations.empty()
52  ? original_feature_idx
53  : feature_permutations[original_feature_idx];
54  auto& feature_expr = feature_exprs[feature_idx];
55  const auto& feature_ti = feature_expr->get_type_info();
56  if (feature_ti.is_number()) {
57  // Don't conditionally cast to double iff type is not double
58  // as this was causing issues for the random forest function with
59  // mixed types. Need to troubleshoot more but always casting to double
60  // regardless of the underlying type always seems to be safe
61  casted_feature_exprs.emplace_back(makeExpr<Analyzer::UOper>(
62  SQLTypeInfo(kDOUBLE, false), false, kCAST, feature_expr));
63  } else {
64  CHECK(feature_ti.is_string()) << "Expected text type";
65  if (!feature_ti.is_text_encoding_dict()) {
66  throw std::runtime_error("Expected dictionary-encoded text column.");
67  }
68  if (original_feature_idx >= num_cat_features) {
69  throw std::runtime_error("Model not trained on text type for column.");
70  }
71  const auto& str_dict_key = feature_ti.getStringDictKey();
72  const auto str_dict_proxy = executor->getStringDictionaryProxy(str_dict_key, true);
73  for (const auto& cat_feature_key : cat_feature_keys[original_feature_idx]) {
74  // For one-hot encoded columns, null values will translate as a 0.0 and not a null
75  // We are computing the following:
76  // CASE WHEN str_val is NULL then 0.0 ELSE
77  // CAST(str_id = one_hot_encoded_str_id AS DOUBLE) END
78 
79  // Check if the expression is null
80  auto is_null_expr = makeExpr<Analyzer::UOper>(
81  SQLTypeInfo(kBOOLEAN, false), false, kISNULL, feature_expr);
82  Datum zero_datum;
83  zero_datum.doubleval = 0.0;
84  // If null then emit a 0.0 double constant as the THEN expr
85  auto is_null_then_expr =
86  makeExpr<Analyzer::Constant>(SQLTypeInfo(kDOUBLE, false), false, zero_datum);
87  std::list<
88  std::pair<std::shared_ptr<Analyzer::Expr>, std::shared_ptr<Analyzer::Expr>>>
89  when_then_exprs;
90  when_then_exprs.emplace_back(std::make_pair(is_null_expr, is_null_then_expr));
91  // The rest of/core string test logic goes in the ELSE statement
92  // Get the string id of the one-hot feature
93  const auto str_id = str_dict_proxy->getIdOfString(cat_feature_key);
94  auto str_id_expr = get_int_constant_expr(str_id);
95  // Get integer id for this row's string
96  auto key_for_string_expr = makeExpr<Analyzer::KeyForStringExpr>(feature_expr);
97 
98  // Check if this row's string id is equal to the search one-hot encoded id
99  std::shared_ptr<Analyzer::Expr> str_equality_expr =
100  makeExpr<Analyzer::BinOper>(SQLTypeInfo(kBOOLEAN, false),
101  false,
102  kEQ,
103  kONE,
104  key_for_string_expr,
105  str_id_expr);
106  // Cast the above boolean results to a double, 0.0 or 1.0
107  auto cast_expr = makeExpr<Analyzer::UOper>(
108  SQLTypeInfo(kDOUBLE, false), false, kCAST, str_equality_expr);
109 
110  // Generate the full CASE statement and add to the casted feature exprssions
111  casted_feature_exprs.emplace_back(makeExpr<Analyzer::CaseExpr>(
112  SQLTypeInfo(kDOUBLE, false), false, when_then_exprs, cast_expr));
113  }
114  }
115  }
116  return casted_feature_exprs;
117 }
118 
// Parameter list and body of the linear regression prediction codegen routine
// (the comment further down refers to it as `codegenLinRegPredict`).
// NOTE(review): the opening signature line and one statement right after the
// parameter list are absent from this listing (the embedded numbering skips
// 119 and 123).
//
// Builds an Analyzer expression tree for b0 + b1*x1 + ... + bn*xn from the
// model coefficients and the encoded/casted regressor expressions, then passes
// that tree to codegenArith and returns its result.
120  const Analyzer::MLPredictExpr* expr,
121  const std::shared_ptr<AbstractMLModel>& abstract_model,
122  const CompilationOptions& co) {
124  const auto linear_reg_model =
125  std::dynamic_pointer_cast<LinearRegressionModel>(abstract_model);
126  // The parent codegen function calls this function, `codegenLinRegPredict`,
127  // iff we had MLModelType::LINEAR_REG_PREDICT, so below is just a sanity
128  // check
129  CHECK(linear_reg_model);
130  const auto& model_coefs = linear_reg_model->getCoefs();
131  const auto& cat_feature_keys = linear_reg_model->getCatFeatureKeys();
132 
133  const auto& regressor_exprs = expr->get_regressor_values();
134 
// Cast numeric regressors to DOUBLE and expand text regressors into their
// one-hot encoded DOUBLE expressions, applying the model's feature
// permutations.
135  const auto casted_regressor_exprs = generated_encoded_and_casted_features(
136  regressor_exprs,
137  cat_feature_keys,
138  linear_reg_model->getModelMetadata().getFeaturePermutations(),
139  executor());
140 
// Helper that wraps a double value in a DOUBLE-typed constant expression.
141  auto get_double_constant_expr = [](double const_val) {
142  Datum d;
143  d.doubleval = const_val;
144  return makeExpr<Analyzer::Constant>(SQLTypeInfo(kDOUBLE, false), false, d);
145  };
146 
147  std::shared_ptr<Analyzer::Expr> result;
148 
149  // Linear regression models are of the form
150  // y = b0 + b1*x1 + b2*x2 + ... + bn*xn
151  // Where b0 is the constant y-intercept, x1..xn are the independent
152  // variables (aka regressors or predictors), and b1..bn are the
153  // regression coefficients
154 
155  for (size_t model_coef_idx = 0; model_coef_idx < model_coefs.size(); ++model_coef_idx) {
156  auto coef_value_expr = get_double_constant_expr(model_coefs[model_coef_idx]);
157  if (model_coef_idx == size_t(0)) {
158  // We have the y-intercept b0, this is not multiplied by any regressor
159  result = coef_value_expr;
160  } else {
161  // We have a term with a regressor (xi) and regression coefficient (bi)
// NOTE(review): no bounds check here; this assumes model_coefs.size() is at
// most casted_regressor_exprs.size() + 1 -- confirm.
162  const auto& casted_regressor_expr = casted_regressor_exprs[model_coef_idx - 1];
163  // Multiply regressor by coefficient
164  auto mul_expr = makeExpr<Analyzer::BinOper>(SQLTypeInfo(kDOUBLE, false),
165  false,
166  kMULTIPLY,
167  kONE,
168  coef_value_expr,
169  casted_regressor_expr);
170  // Add term to result
171  result = makeExpr<Analyzer::BinOper>(
172  SQLTypeInfo(kDOUBLE, false), false, kPLUS, kONE, result, mul_expr);
173  }
174  }
175 
176  // The following will codegen the expression tree we just created modeling
177  // the linear regression formula
// NOTE(review): with a single coefficient (intercept only) `result` is the
// Constant expression, so this dynamic_cast to BinOper presumably yields
// nullptr; with no coefficients `result` is null -- confirm both cases are
// ruled out upstream.
178  return codegenArith(dynamic_cast<Analyzer::BinOper*>(result.get()), co);
179 }
180 
// Parameter list and body of the tree-model prediction codegen routine (called
// as `codegenTreeRegPredict` elsewhere in this file).
// NOTE(review): this listing is missing the opening signature line, the line
// that opens the second tbb::parallel_for call, and two arguments of the
// TreeModelPredictionMgr constructor call (the embedded numbering skips 181,
// 242 and 273-274).
//
// When HAVE_ONEDAL is defined: codegens the encoded/casted regressor values,
// flattens every tree of the model into a vector of DecisionTreeEntry, rebases
// child indices so they are relative to the start of tree 0, and delegates the
// traversal codegen to a TreeModelPredictionMgr. Otherwise throws
// std::runtime_error("OneDAL not available.").
182  const Analyzer::MLPredictExpr* expr,
183  const std::shared_ptr<AbstractTreeModel>& tree_model,
184  const CompilationOptions& co) {
185 #ifdef HAVE_ONEDAL
186  const int64_t num_trees = static_cast<int64_t>(tree_model->getNumTrees());
187  const auto& regressor_exprs = expr->get_regressor_values();
188  const auto& cat_feature_keys = tree_model->getCatFeatureKeys();
189  const auto casted_regressor_exprs = generated_encoded_and_casted_features(
190  regressor_exprs,
191  cat_feature_keys,
192  tree_model->getModelMetadata().getFeaturePermutations(),
193  executor());
194  // We cast all regressors to double for simplicity and to match
195  // how feature filters are stored in the tree model.
196  // Null checks are handled further down in the generated kernel
197  // in the runtime function itself
198 
// Codegen each casted regressor expression and keep the first returned value.
199  std::vector<llvm::Value*> regressor_values;
200  for (const auto& casted_regressor_expr : casted_regressor_exprs) {
201  regressor_values.emplace_back(codegen(casted_regressor_expr.get(), false, co)[0]);
202  }
203 
204  // First build tables, i.e. vectors of DecisionTreeEntry, for each tree
205  std::vector<std::vector<DecisionTreeEntry>> decision_trees(num_trees);
206  {
207  auto tree_build_timer = DEBUG_TIMER("Tree Visitors Dispatched");
// Each tree index writes only to its own decision_trees[tree_idx] slot.
208  tbb::parallel_for(tbb::blocked_range<int64_t>(0, num_trees),
209  [&](const tbb::blocked_range<int64_t>& r) {
210  const auto start_tree_idx = r.begin();
211  const auto end_tree_idx = r.end();
212  for (int64_t tree_idx = start_tree_idx; tree_idx < end_tree_idx;
213  ++tree_idx) {
214  TreeModelVisitor tree_visitor(decision_trees[tree_idx]);
215  tree_model->traverseDF(tree_idx, tree_visitor);
216  }
217  });
218  }
219 
220  // Next, compute prefix-sum offsets such that decision_tree_offsets[k]
221  // specifies the starting offset of tree k relative to tree 0, and
222  // decision_tree_offsets[k+1] specifies the last entry + 1 of tree
223  // k relative to tree 0
224  std::vector<int64_t> decision_tree_offsets(num_trees + 1);
225  decision_tree_offsets[0] = 0;
226  for (int64_t tree_idx = 0; tree_idx < num_trees; ++tree_idx) {
227  decision_tree_offsets[tree_idx + 1] =
228  decision_tree_offsets[tree_idx] +
229  static_cast<int64_t>(decision_trees[tree_idx].size());
230  }
231 
232  VLOG(1) << tree_model->getModelTypeString() << " model has " << num_trees
233  << " trees and " << decision_tree_offsets[num_trees] << " total entries.";
234 
235  // Finally, go back through each tree and adjust all left and right child idx entries
236  // so that these values are global relative to the start of tree 0. This will allow
237  // the downstream code-generated kernel to treat these child idx entries
238  // as absolute offsets from the base pointer for all trees, rather than computing such
239  // an offset on the fly
240  {
241  auto tree_offset_correction_timer = DEBUG_TIMER("Tree Offsets Corrected");
// The range starts at tree 1: tree 0 has offset 0, so its child indices need
// no adjustment.
243  tbb::blocked_range<int64_t>(1, num_trees),
244  [&](const tbb::blocked_range<int64_t>& r) {
245  const auto start_tree_idx = r.begin();
246  const auto end_tree_idx = r.end();
247  for (int64_t tree_idx = start_tree_idx; tree_idx < end_tree_idx; ++tree_idx) {
248  const int64_t start_offset = decision_tree_offsets[tree_idx];
249  auto& decision_tree = decision_trees[tree_idx];
250  const int64_t num_tree_entries = static_cast<int64_t>(decision_tree.size());
251  CHECK_EQ(num_tree_entries,
252  decision_tree_offsets[tree_idx + 1] - start_offset);
253  for (int64_t decision_entry_idx = 0; decision_entry_idx < num_tree_entries;
254  ++decision_entry_idx) {
// Only split nodes have their child indices rebased.
255  if (decision_tree[decision_entry_idx].isSplitNode()) {
256  decision_tree[decision_entry_idx].left_child_row_idx += start_offset;
257  decision_tree[decision_entry_idx].right_child_row_idx += start_offset;
258  }
259  }
260  }
261  });
262  }
263 
264  {
265  auto tree_model_prediction_mgr_timer =
266  DEBUG_TIMER("TreeModelPredictionMgr generation and codegen");
267  // TreeModelPredictionMgr copies the decision trees and offsets to host
268  // buffers in RowSetMemoryOwner and onto each GPU if the query is running
269  // on GPU, and takes care of the tree traversal codegen itself
270 
// compute_avg is set only for random forest regression models.
271  const bool compute_avg = tree_model->getModelType() == MLModelType::RANDOM_FOREST_REG;
272  auto tree_model_prediction_mgr = std::make_unique<TreeModelPredictionMgr>(
275  executor(),
276  decision_trees,
277  decision_tree_offsets,
278  compute_avg);
279 
// Hand ownership of the manager to cgen_state_ and codegen through the pointer
// it returns.
280  return cgen_state_->moveTreeModelPredictionMgr(std::move(tree_model_prediction_mgr))
281  ->codegen(regressor_values, co);
282  }
283 #else
284  throw std::runtime_error("OneDAL not available.");
285 #endif
286 }
287 
// Tail of the parameter list and body of the ML_PREDICT codegen entry point.
// NOTE(review): the opening signature line and the `case` labels of the switch
// below are absent from this listing (the embedded numbering skips 288, 313
// and 316-318), so the function name and the exact model types handled by each
// branch cannot be read from here.
//
// Resolves the model named by the expression's constant model argument,
// verifies that the number of supplied regressors matches the model's logical
// feature count, and dispatches to the linear regression or tree model codegen
// routine. Throws std::runtime_error on a feature count mismatch, on a tree
// branch whose model is not an AbstractTreeModel, and on unsupported model
// types.
289  const CompilationOptions& co) {
290  auto timer = DEBUG_TIMER(__func__);
291  const auto& model_expr = expr->get_model_value();
292  CHECK(model_expr);
// The model argument must be a constant expression holding the model name.
293  auto model_constant_expr = dynamic_cast<const Analyzer::Constant*>(model_expr);
294  CHECK(model_constant_expr);
295  const auto model_datum = model_constant_expr->get_constval();
296  const auto model_name_ptr = model_datum.stringval;
297  CHECK(model_name_ptr);
298  const auto model_name = *model_name_ptr;
// NOTE(review): abstract_model is dereferenced without a null check;
// presumably getModel never returns null (e.g. throws for unknown names) --
// confirm.
299  const auto abstract_model = g_ml_models.getModel(model_name);
300  const auto model_type = abstract_model->getModelType();
301  const auto& regressor_exprs = expr->get_regressor_values();
302  if (abstract_model->getNumLogicalFeatures() !=
303  static_cast<int64_t>(regressor_exprs.size())) {
304  std::ostringstream error_oss;
305  error_oss << "ML_PREDICT: Model '" << model_name
306  << "' expects different number of predictor variables ("
307  << abstract_model->getNumLogicalFeatures() << ") than provided ("
308  << regressor_exprs.size() << ").";
309  throw std::runtime_error(error_oss.str());
310  }
311 
312  switch (model_type) {
// Linear regression branch (case label missing from this listing).
314  return codegenLinRegPredict(expr, abstract_model, co);
315  }
// Tree model branch (case labels missing from this listing): the model must
// downcast to AbstractTreeModel.
319  if (auto tree_model =
320  std::dynamic_pointer_cast<AbstractTreeModel>(abstract_model)) {
321  return codegenTreeRegPredict(expr, tree_model, co);
322  } else {
323  throw std::runtime_error(
324  "Invalid ML model codegen call. Input model is not of expected type "
325  "TreeModel.");
326  }
327  }
328  default: {
329  throw std::runtime_error("Unsupported model type.");
330  }
331  }
332 }
333 
// Tail of the parameter list and body of the PCA_PROJECT codegen entry point.
// NOTE(review): the opening signature line is absent from this listing (the
// embedded numbering skips 334), so the function name and the type of `expr`
// cannot be read from here.
//
// Resolves the named model, requires it to be a PCA model, validates the
// feature count and the requested principal component, then builds and
// codegens the expression
//   sum_i ((x_i - mean_i) / std_dev_i) * eigenvectors[pc_dimension][i]
// Throws std::runtime_error if the model is not a PCA model, if the feature
// count does not match, or if the requested dimension is out of range.
335  const CompilationOptions& co) {
336  auto timer = DEBUG_TIMER(__func__);
337  const auto& model_expr = expr->get_model_value();
338  CHECK(model_expr);
// The model argument must be a constant expression holding the model name.
339  auto model_constant_expr = dynamic_cast<const Analyzer::Constant*>(model_expr);
340  CHECK(model_constant_expr);
341  const auto model_datum = model_constant_expr->get_constval();
342  const auto model_name_ptr = model_datum.stringval;
343  CHECK(model_name_ptr);
344  const auto model_name = *model_name_ptr;
345  const auto abstract_model = g_ml_models.getModel(model_name);
346  const auto model_type = abstract_model->getModelType();
347  if (model_type != MLModelType::PCA) {
348  throw std::runtime_error("PCA_PROJECT: Model '" + model_name +
349  "' is not a PCA model.");
350  }
// NOTE(review): the cast result is used without a null check; presumably the
// model type test above guarantees it succeeds -- confirm.
351  const auto pca_model = std::dynamic_pointer_cast<PcaModel>(abstract_model);
352  const auto& feature_exprs = expr->get_feature_values();
353  if (pca_model->getNumLogicalFeatures() != static_cast<int64_t>(feature_exprs.size())) {
354  std::ostringstream error_oss;
355  error_oss << "PCA_PROJECT: Model '" << model_name
356  << "' expects different number of predictor variables ("
357  << pca_model->getNumLogicalFeatures() << ") than provided ("
358  << feature_exprs.size() << ").";
359  throw std::runtime_error(error_oss.str());
360  }
361 
362  const auto& pc_dimension_expr = expr->get_pc_dimension_value();
// NOTE(review): unlike model_constant_expr above, this dynamic_cast result is
// dereferenced without a CHECK; a non-constant dimension argument would be a
// null dereference unless it is rejected upstream -- confirm.
363  auto pc_dimension_const_expr =
364  dynamic_cast<const Analyzer::Constant*>(pc_dimension_expr);
365  const auto pc_dimension_datum = pc_dimension_const_expr->get_constval();
// The user-facing dimension is 1-based (see the error message below); convert
// it to a 0-based index.
366  const auto pc_dimension = pc_dimension_datum.intval - 1;
367  if (pc_dimension < 0 || pc_dimension >= pca_model->getNumFeatures()) {
368  std::ostringstream error_oss;
369  error_oss << "PCA_PROJECT: Invalid PC dimension (" << pc_dimension + 1
370  << ") provided. Valid range is [1, " << pca_model->getNumFeatures() << "].";
371  throw std::runtime_error(error_oss.str());
372  }
373 
374  const auto& column_means = pca_model->getColumnMeans();
375  const auto& column_std_devs = pca_model->getColumnStdDevs();
376  const auto& eigenvectors = pca_model->getEigenvectors();
377 
378  const auto& cat_feature_keys = pca_model->getCatFeatureKeys();
379 
// Cast numeric features to DOUBLE and expand text features into their one-hot
// encoded DOUBLE expressions, applying the model's feature permutations.
380  const auto casted_feature_exprs = generated_encoded_and_casted_features(
381  feature_exprs,
382  cat_feature_keys,
383  pca_model->getModelMetadata().getFeaturePermutations(),
384  executor());
385 
// Helper that wraps a double value in a DOUBLE-typed constant expression.
386  auto get_double_constant_expr = [](double const_val) {
387  Datum d;
388  d.doubleval = const_val;
389  return makeExpr<Analyzer::Constant>(SQLTypeInfo(kDOUBLE, false), false, d);
390  };
391 
392  std::shared_ptr<Analyzer::Expr> result;
393 
// NOTE(review): this loop covers feature_exprs.size() terms, whereas the range
// check above uses getNumFeatures() and generated_encoded_and_casted_features
// emits one expression per categorical key; if the model has one-hot encoded
// features the two counts may differ -- confirm which bound is intended.
394  for (size_t feature_idx = 0; feature_idx < feature_exprs.size(); ++feature_idx) {
395  auto mean_expr = get_double_constant_expr(column_means[feature_idx]);
396  const auto& casted_feature_expr = casted_feature_exprs[feature_idx];
397  // Subtract column mean from feature
398  auto mean_diff_expr = makeExpr<Analyzer::BinOper>(
399  SQLTypeInfo(kDOUBLE, false), false, kMINUS, kONE, casted_feature_expr, mean_expr);
// Divide by the column standard deviation to obtain the z-score
400  auto std_dev_expr = get_double_constant_expr(column_std_devs[feature_idx]);
401  auto z_score_expr = makeExpr<Analyzer::BinOper>(
402  SQLTypeInfo(kDOUBLE, false), false, kDIVIDE, kONE, mean_diff_expr, std_dev_expr);
// Weight the z-score by this feature's entry in the selected eigenvector
403  auto pc_term_expr = get_double_constant_expr(eigenvectors[pc_dimension][feature_idx]);
404  auto pca_mul_expr = makeExpr<Analyzer::BinOper>(
405  SQLTypeInfo(kDOUBLE, false), false, kMULTIPLY, kONE, z_score_expr, pc_term_expr);
406  if (feature_idx == 0) {
407  // There is no result yet, so set the result to the first term
408  result = pca_mul_expr;
409  } else {
410  // Add the term to the result
411  result = makeExpr<Analyzer::BinOper>(
412  SQLTypeInfo(kDOUBLE, false), false, kPLUS, kONE, result, pca_mul_expr);
413  }
414  }
415 
416  // The following will codegen the expression tree we just created modeling
417  // the PCA projection formula
// NOTE(review): if feature_exprs is empty, `result` is still null here --
// confirm this is ruled out upstream.
418  return codegenArith(dynamic_cast<Analyzer::BinOper*>(result.get()), co);
419 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
llvm::Value * codegenTreeRegPredict(const Analyzer::MLPredictExpr *, const std::shared_ptr< AbstractTreeModel > &tree_model, const CompilationOptions &)
llvm::Value * codegenArith(const Analyzer::BinOper *, const CompilationOptions &)
CgenState * cgen_state_
Definition: sqldefs.h:51
Definition: sqldefs.h:32
llvm::Value * codegen(const std::vector< llvm::Value * > &regressor_inputs, const CompilationOptions &co) const
Definition: sqldefs.h:43
llvm::Value * codegenLinRegPredict(const Analyzer::MLPredictExpr *, const std::shared_ptr< AbstractMLModel > &model, const CompilationOptions &)
int32_t intval
Definition: Datum.h:75
std::vector< std::shared_ptr< Analyzer::Expr > > generated_encoded_and_casted_features(const std::vector< std::shared_ptr< Analyzer::Expr >> &feature_exprs, const std::vector< std::vector< std::string >> &cat_feature_keys, const std::vector< int64_t > &feature_permutations, Executor *executor)
const Expr * get_pc_dimension_value() const
Definition: Analyzer.h:792
const TreeModelPredictionMgr * moveTreeModelPredictionMgr(std::unique_ptr< const TreeModelPredictionMgr > &&tree_model_prediction_mgr)
Definition: CgenState.h:205
#define AUTOMATIC_IR_METADATA(CGENSTATE)
std::shared_ptr< AbstractMLModel > getModel(const std::string &model_name) const
Definition: MLModel.h:51
ExecutorDeviceType device_type
MLModelMap g_ml_models
Definition: MLModel.h:125
std::vector< llvm::Value * > codegen(const Analyzer::Expr *, const bool fetch_columns, const CompilationOptions &)
Definition: IRCodegen.cpp:30
Definition: sqldefs.h:42
Definition: sqldefs.h:74
Datum get_constval() const
Definition: Analyzer.h:348
const Expr * get_model_value() const
Definition: Analyzer.h:788
void parallel_for(const blocked_range< Int > &range, const Body &body, const Partitioner &p=Partitioner())
const std::vector< std::shared_ptr< Analyzer::Expr > > & get_feature_values() const
Definition: Analyzer.h:789
#define CHECK(condition)
Definition: Logger.h:291
#define DEBUG_TIMER(name)
Definition: Logger.h:412
const Expr * get_model_value() const
Definition: Analyzer.h:713
Definition: sqltypes.h:72
Allocate GPU memory using GpuBuffers via DataMgr.
const std::vector< std::shared_ptr< Analyzer::Expr > > & get_regressor_values() const
Definition: Analyzer.h:714
Definition: Datum.h:71
#define VLOG(n)
Definition: Logger.h:388
double doubleval
Definition: Datum.h:78
Executor * executor() const