OmniSciDB  a5dc49c757
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
MLModel.h
Go to the documentation of this file.
1 /*
2  * Copyright 2023 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include "AbstractMLModel.h"
20 #include "MLModelMetadata.h"
22 
23 #include <iostream>
24 #include <map>
25 #include <memory>
26 #include <stack>
27 #include <vector>
28 
29 #ifndef __CUDACC__
30 
31 #ifdef HAVE_ONEDAL
32 #include "daal.h"
33 #include "oneapi/dal/algo/decision_forest.hpp"
34 #endif
35 
36 class MLModelMap {
37  public:
38  void addModel(const std::string& model_name, std::shared_ptr<AbstractMLModel> model) {
39  const auto upper_model_name = to_upper(model_name);
40  std::lock_guard<std::shared_mutex> model_map_write_lock(model_map_mutex_);
41  model_map_[upper_model_name] = model;
42  }
43 
44  bool modelExists(const std::string& model_name) const {
45  const auto upper_model_name = to_upper(model_name);
46  std::shared_lock<std::shared_mutex> model_map_read_lock(model_map_mutex_);
47  auto model_map_itr = model_map_.find(upper_model_name);
48  return model_map_itr != model_map_.end();
49  }
50 
51  std::shared_ptr<AbstractMLModel> getModel(const std::string& model_name) const {
52  const auto upper_model_name = to_upper(model_name);
53  std::shared_lock<std::shared_mutex> model_map_read_lock(model_map_mutex_);
54  auto model_map_itr = model_map_.find(upper_model_name);
55  if (model_map_itr != model_map_.end()) {
56  return model_map_itr->second;
57  }
58  const std::string error_str = "Model '" + upper_model_name + "' does not exist.";
59  throw std::runtime_error(error_str);
60  }
61 
62  void deleteModel(const std::string& model_name) {
63  const auto upper_model_name = to_upper(model_name);
64  std::lock_guard<std::shared_mutex> model_map_write_lock(model_map_mutex_);
65  auto const model_it = model_map_.find(upper_model_name);
66  if (model_it == model_map_.end()) {
67  std::ostringstream error_oss;
68  error_oss << "Cannot erase model " << upper_model_name
69  << ". No model by that name was found.";
70  throw std::runtime_error(error_oss.str());
71  }
72  model_map_.erase(model_it);
73  }
74 
75  std::vector<std::string> getModelNames() const {
76  std::shared_lock<std::shared_mutex> model_map_read_lock(model_map_mutex_);
77  std::vector<std::string> model_names;
78  model_names.reserve(model_map_.size());
79  for (auto const& model : model_map_) {
80  model_names.emplace_back(model.first);
81  }
82  return model_names;
83  }
84  std::vector<MLModelMetadata> getModelMetadata() const {
85  std::shared_lock<std::shared_mutex> model_map_read_lock(model_map_mutex_);
86  std::vector<MLModelMetadata> model_metadata;
87  for (auto const& model : model_map_) {
88  model_metadata.emplace_back(MLModelMetadata(
89  model.first,
90  model.second->getModelType(),
91  model.second->getModelTypeString(),
92  model.second->getNumLogicalFeatures(),
93  model.second->getNumFeatures(),
94  model.second->getNumCatFeatures(),
95  model.second->getNumLogicalFeatures() - model.second->getNumCatFeatures(),
96  model.second->getModelMetadataStr()));
97  }
98  return model_metadata;
99  }
100 
101  MLModelMetadata getModelMetadata(const std::string& model_name) const {
102  const auto upper_model_name = to_upper(model_name);
103  std::shared_lock<std::shared_mutex> model_map_read_lock(model_map_mutex_);
104  auto model_map_itr = model_map_.find(upper_model_name);
105  if (model_map_itr != model_map_.end()) {
106  return MLModelMetadata(model_map_itr->first,
107  model_map_itr->second->getModelType(),
108  model_map_itr->second->getModelTypeString(),
109  model_map_itr->second->getNumLogicalFeatures(),
110  model_map_itr->second->getNumFeatures(),
111  model_map_itr->second->getNumCatFeatures(),
112  model_map_itr->second->getNumLogicalFeatures() -
113  model_map_itr->second->getNumCatFeatures(),
114  model_map_itr->second->getModelMetadataStr());
115  }
116  const std::string error_str = "Model '" + upper_model_name + "' does not exist.";
117  throw std::runtime_error(error_str);
118  }
119 
120  private:
121  std::map<std::string, std::shared_ptr<AbstractMLModel>> model_map_;
123 };
124 
126 
128  public:
129  LinearRegressionModel(const std::vector<double>& coefs,
130  const std::string& model_metadata)
131  : AbstractMLModel(model_metadata), coefs_(coefs) {}
132 
133  LinearRegressionModel(const std::vector<double>& coefs,
134  const std::string& model_metadata,
135  const std::vector<std::vector<std::string>>& cat_feature_keys)
136  : AbstractMLModel(model_metadata, cat_feature_keys), coefs_(coefs) {}
137 
138  virtual MLModelType getModelType() const override { return MLModelType::LINEAR_REG; }
139 
140  virtual std::string getModelTypeString() const override { return "Linear Regression"; }
141 
142  virtual int64_t getNumFeatures() const override {
143  return static_cast<int64_t>(coefs_.size()) - 1;
144  }
145 
146  const std::vector<double>& getCoefs() const { return coefs_; }
147 
148  private:
149  std::vector<double> coefs_;
150 };
151 
152 // In scenarios where oneDAL is not available, users still need a full definition of
153 // AbstractTreeModel to compile.
154 class TreeModelVisitor;
155 
156 class AbstractTreeModel : public virtual AbstractMLModel {
157  public:
158  virtual MLModelType getModelType() const = 0;
159  virtual std::string getModelTypeString() const = 0;
160  virtual int64_t getNumFeatures() const = 0;
161  virtual int64_t getNumTrees() const = 0;
162  virtual ~AbstractTreeModel() = default;
163  virtual void traverseDF(const int64_t tree_idx,
164  TreeModelVisitor& tree_node_visitor) const = 0;
165 };
166 
167 #ifdef HAVE_ONEDAL
168 
169 using namespace daal::algorithms;
170 using namespace daal::data_management;
171 
172 namespace df = oneapi::dal::decision_forest;
173 
174 class TreeModelVisitor : public daal::algorithms::regression::TreeNodeVisitor {
175  public:
176  TreeModelVisitor(std::vector<DecisionTreeEntry>& decision_table)
177  : decision_table_(decision_table) {}
178 
179  const std::vector<DecisionTreeEntry>& getDecisionTable() const {
180  return decision_table_;
181  }
182 
183  bool onLeafNode(size_t level, double response) override {
184  decision_table_.emplace_back(DecisionTreeEntry(response));
185  if (last_node_leaf_) {
186  decision_table_[parent_nodes_.top()].right_child_row_idx =
187  static_cast<int64_t>(decision_table_.size() - 1);
188  parent_nodes_.pop();
189  }
190  last_node_leaf_ = true;
191  return true;
192  }
193 
194  bool onSplitNode(size_t level, size_t featureIndex, double featureValue) override {
195  decision_table_.emplace_back(
196  DecisionTreeEntry(featureValue,
197  static_cast<int64_t>(featureIndex),
198  static_cast<int64_t>(decision_table_.size() + 1)));
199  if (last_node_leaf_) {
200  decision_table_[parent_nodes_.top()].right_child_row_idx =
201  static_cast<int64_t>(decision_table_.size() - 1);
202  parent_nodes_.pop();
203  }
204  last_node_leaf_ = false;
205  parent_nodes_.emplace(decision_table_.size() - 1);
206  return true;
207  }
208 
209  bool operator()(const df::leaf_node_info<df::task::regression>& info) {
210  decision_table_.emplace_back(DecisionTreeEntry(info.get_response()));
211  if (last_node_leaf_) {
212  decision_table_[parent_nodes_.top()].right_child_row_idx =
213  static_cast<int64_t>(decision_table_.size() - 1);
214  parent_nodes_.pop();
215  }
216  last_node_leaf_ = true;
217  return true;
218  }
219 
220  bool operator()(const df::split_node_info<df::task::regression>& info) {
221  decision_table_.emplace_back(
222  DecisionTreeEntry(info.get_feature_value(),
223  static_cast<int64_t>(info.get_feature_index()),
224  static_cast<int64_t>(decision_table_.size() + 1)));
225  if (last_node_leaf_) {
226  decision_table_[parent_nodes_.top()].right_child_row_idx =
227  static_cast<int64_t>(decision_table_.size() - 1);
228  parent_nodes_.pop();
229  }
230  last_node_leaf_ = false;
231  parent_nodes_.emplace(decision_table_.size() - 1);
232  return true;
233  }
234 
235  private:
236  std::vector<DecisionTreeEntry>& decision_table_;
237  std::stack<size_t> parent_nodes_;
238  bool last_node_leaf_{false};
239 };
240 
241 class DecisionTreeRegressionModel : public virtual AbstractTreeModel {
242  public:
243  DecisionTreeRegressionModel(decision_tree::regression::interface1::ModelPtr& model_ptr,
244  const std::string& model_metadata)
245  : AbstractMLModel(model_metadata), model_ptr_(model_ptr) {}
246  DecisionTreeRegressionModel(
247  decision_tree::regression::interface1::ModelPtr& model_ptr,
248  const std::string& model_metadata,
249  const std::vector<std::vector<std::string>>& cat_feature_keys)
250  : AbstractMLModel(model_metadata, cat_feature_keys), model_ptr_(model_ptr) {}
251 
252  virtual MLModelType getModelType() const override {
254  }
255 
256  virtual std::string getModelTypeString() const override {
257  return "Decision Tree Regression";
258  }
259 
260  virtual int64_t getNumFeatures() const override {
261  return model_ptr_->getNumberOfFeatures();
262  }
263  virtual int64_t getNumTrees() const override { return 1; }
264  virtual void traverseDF(const int64_t tree_idx,
265  TreeModelVisitor& tree_node_visitor) const override {
266  CHECK_EQ(tree_idx, 0);
267  model_ptr_->traverseDF(tree_node_visitor);
268  }
269  const decision_tree::regression::interface1::ModelPtr getModelPtr() const {
270  return model_ptr_;
271  }
272 
273  private:
274  decision_tree::regression::interface1::ModelPtr model_ptr_;
275 };
276 
277 class GbtRegressionModel : public virtual AbstractTreeModel {
278  public:
279  GbtRegressionModel(gbt::regression::interface1::ModelPtr& model_ptr,
280  const std::string& model_metadata)
281  : AbstractMLModel(model_metadata), model_ptr_(model_ptr) {}
282 
283  GbtRegressionModel(gbt::regression::interface1::ModelPtr& model_ptr,
284  const std::string& model_metadata,
285  const std::vector<std::vector<std::string>>& cat_feature_keys)
286  : AbstractMLModel(model_metadata, cat_feature_keys), model_ptr_(model_ptr) {}
287 
288  virtual MLModelType getModelType() const override { return MLModelType::GBT_REG; }
289 
290  virtual std::string getModelTypeString() const override {
291  return "Gradient Boosted Trees Regression";
292  }
293 
294  virtual int64_t getNumFeatures() const override {
295  return model_ptr_->getNumberOfFeatures();
296  }
297  virtual int64_t getNumTrees() const override { return model_ptr_->getNumberOfTrees(); }
298  virtual void traverseDF(const int64_t tree_idx,
299  TreeModelVisitor& tree_node_visitor) const override {
300  model_ptr_->traverseDF(tree_idx, tree_node_visitor);
301  }
302  const gbt::regression::interface1::ModelPtr getModelPtr() const { return model_ptr_; }
303 
304  private:
305  gbt::regression::interface1::ModelPtr model_ptr_;
306 };
307 
308 class AbstractRandomForestModel : public virtual AbstractTreeModel {
309  public:
310  virtual const std::vector<double>& getVariableImportanceScores() const = 0;
311  virtual const double getOutOfBagError() const = 0;
312 };
313 
314 class RandomForestRegressionModel : public virtual AbstractRandomForestModel {
315  public:
316  RandomForestRegressionModel(
317  decision_forest::regression::interface1::ModelPtr& model_ptr,
318  const std::string& model_metadata,
319  const std::vector<double>& variable_importance,
320  const double out_of_bag_error)
321  : AbstractMLModel(model_metadata)
322  , model_ptr_(model_ptr)
323  , variable_importance_(variable_importance)
324  , out_of_bag_error_(out_of_bag_error) {}
325 
326  RandomForestRegressionModel(
327  decision_forest::regression::interface1::ModelPtr& model_ptr,
328  const std::string& model_metadata,
329  const std::vector<std::vector<std::string>>& cat_feature_keys,
330  const std::vector<double>& variable_importance,
331  const double out_of_bag_error)
332  : AbstractMLModel(model_metadata, cat_feature_keys)
333  , model_ptr_(model_ptr)
334  , variable_importance_(variable_importance)
335  , out_of_bag_error_(out_of_bag_error) {}
336 
337  virtual MLModelType getModelType() const override {
339  }
340 
341  virtual std::string getModelTypeString() const override {
342  return "Random Forest Regression";
343  }
344  virtual int64_t getNumFeatures() const override {
345  return model_ptr_->getNumberOfFeatures();
346  }
347  virtual int64_t getNumTrees() const override { return model_ptr_->getNumberOfTrees(); }
348  virtual void traverseDF(const int64_t tree_idx,
349  TreeModelVisitor& tree_node_visitor) const override {
350  model_ptr_->traverseDF(tree_idx, tree_node_visitor);
351  }
352 
353  virtual const std::vector<double>& getVariableImportanceScores() const override {
354  return variable_importance_;
355  }
356 
357  virtual const double getOutOfBagError() const override { return out_of_bag_error_; }
358 
359  const decision_forest::regression::interface1::ModelPtr getModelPtr() const {
360  return model_ptr_;
361  }
362 
363  private:
364  decision_forest::regression::interface1::ModelPtr model_ptr_;
365  std::vector<double> variable_importance_;
366  double out_of_bag_error_;
367 };
368 
369 class OneAPIRandomForestRegressionModel : public virtual AbstractRandomForestModel {
370  public:
371  OneAPIRandomForestRegressionModel(
372  const std::shared_ptr<const df::model<df::task::regression>> model,
373  const std::string& model_metadata,
374  const std::vector<double>& variable_importance,
375  const double out_of_bag_error,
376  const int64_t num_features)
377  : AbstractMLModel(model_metadata)
378  , model_(std::move(model))
379  , variable_importance_(variable_importance)
380  , out_of_bag_error_(out_of_bag_error)
381  , num_features_(num_features) {}
382 
383  OneAPIRandomForestRegressionModel(
384  const std::shared_ptr<const df::model<df::task::regression>> model,
385  const std::string& model_metadata,
386  const std::vector<std::vector<std::string>>& cat_feature_keys,
387  const std::vector<double>& variable_importance,
388  const double out_of_bag_error,
389  const int64_t num_features)
390  : AbstractMLModel(model_metadata, cat_feature_keys)
391  , model_(std::move(model))
392  , variable_importance_(variable_importance)
393  , out_of_bag_error_(out_of_bag_error)
394  , num_features_(num_features) {}
395 
396  virtual MLModelType getModelType() const override {
398  }
399 
400  virtual std::string getModelTypeString() const override {
401  return "Random Forest Regression";
402  }
403  virtual int64_t getNumFeatures() const override { return num_features_; }
404  virtual int64_t getNumTrees() const override { return model_->get_tree_count(); }
405  virtual void traverseDF(const int64_t tree_idx,
406  TreeModelVisitor& tree_node_visitor) const override {
407  model_->traverse_depth_first(tree_idx, tree_node_visitor);
408  }
409 
410  virtual const std::vector<double>& getVariableImportanceScores() const override {
411  return variable_importance_;
412  }
413 
414  virtual const double getOutOfBagError() const override { return out_of_bag_error_; }
415 
416  const std::shared_ptr<const df::model<df::task::regression>> getModel() const {
417  return model_;
418  }
419 
420  private:
421  const std::shared_ptr<const df::model<df::task::regression>> model_;
422  std::vector<double> variable_importance_;
423  double out_of_bag_error_;
424  int64_t num_features_; // oneapi::df::models do not store number of features
425 };
426 
427 #endif // #ifdef HAVE_ONEDAL
428 
429 class PcaModel : public AbstractMLModel {
430  public:
431  PcaModel(const std::vector<double>& col_means,
432  const std::vector<double>& col_std_devs,
433  const std::vector<std::vector<double>>& eigenvectors,
434  const std::vector<double>& eigenvalues,
435  const std::string& model_metadata)
436  : AbstractMLModel(model_metadata)
437  , col_means_(col_means)
438  , col_std_devs_(col_std_devs)
439  , eigenvectors_(eigenvectors)
440  , eigenvalues_(eigenvalues) {}
441 
442  PcaModel(const std::vector<double>& col_means,
443  const std::vector<double>& col_std_devs,
444  const std::vector<std::vector<double>>& eigenvectors,
445  const std::vector<double>& eigenvalues,
446  const std::string& model_metadata,
447  const std::vector<std::vector<std::string>>& cat_feature_keys)
448  : AbstractMLModel(model_metadata, cat_feature_keys)
449  , col_means_(col_means)
450  , col_std_devs_(col_std_devs)
451  , eigenvectors_(eigenvectors)
452  , eigenvalues_(eigenvalues) {}
453 
454  virtual MLModelType getModelType() const override { return MLModelType::PCA; }
455 
456  virtual std::string getModelTypeString() const override { return "PCA"; }
457 
458  virtual int64_t getNumFeatures() const override {
459  return static_cast<int64_t>(col_means_.size());
460  }
461 
462  const std::vector<double>& getColumnMeans() const { return col_means_; }
463  const std::vector<double>& getColumnStdDevs() const { return col_std_devs_; }
464  const std::vector<std::vector<double>>& getEigenvectors() const {
465  return eigenvectors_;
466  }
467  const std::vector<double>& getEigenvalues() const { return eigenvalues_; }
468 
469  private:
470  std::vector<double> col_means_;
471  std::vector<double> col_std_devs_;
472  std::vector<std::vector<double>> eigenvectors_;
473  std::vector<double> eigenvalues_;
474 };
475 
476 #endif // #ifndef __CUDACC__
PcaModel(const std::vector< double > &col_means, const std::vector< double > &col_std_devs, const std::vector< std::vector< double >> &eigenvectors, const std::vector< double > &eigenvalues, const std::string &model_metadata)
Definition: MLModel.h:431
const std::vector< double > & getColumnStdDevs() const
Definition: MLModel.h:463
#define CHECK_EQ(x, y)
Definition: Logger.h:301
virtual std::string getModelTypeString() const override
Definition: MLModel.h:140
virtual int64_t getNumTrees() const =0
virtual void traverseDF(const int64_t tree_idx, TreeModelVisitor &tree_node_visitor) const =0
virtual int64_t getNumFeatures() const override
Definition: MLModel.h:458
std::vector< MLModelMetadata > getModelMetadata() const
Definition: MLModel.h:84
virtual std::string getModelTypeString() const override
Definition: MLModel.h:456
std::vector< double > eigenvalues_
Definition: MLModel.h:473
LinearRegressionModel(const std::vector< double > &coefs, const std::string &model_metadata)
Definition: MLModel.h:129
virtual MLModelType getModelType() const =0
virtual MLModelType getModelType() const override
Definition: MLModel.h:454
MLModelType
Definition: MLModelType.h:25
std::vector< double > col_std_devs_
Definition: MLModel.h:471
std::vector< double > col_means_
Definition: MLModel.h:470
void addModel(const std::string &model_name, std::shared_ptr< AbstractMLModel > model)
Definition: MLModel.h:38
bool modelExists(const std::string &model_name) const
Definition: MLModel.h:44
std::shared_ptr< AbstractMLModel > getModel(const std::string &model_name) const
Definition: MLModel.h:51
std::string to_upper(const std::string &str)
virtual std::string getModelTypeString() const =0
LinearRegressionModel(const std::vector< double > &coefs, const std::string &model_metadata, const std::vector< std::vector< std::string >> &cat_feature_keys)
Definition: MLModel.h:133
MLModelMap g_ml_models
Definition: MLModel.h:125
MLModelMetadata getModelMetadata(const std::string &model_name) const
Definition: MLModel.h:101
std::shared_mutex model_map_mutex_
Definition: MLModel.h:122
PcaModel(const std::vector< double > &col_means, const std::vector< double > &col_std_devs, const std::vector< std::vector< double >> &eigenvectors, const std::vector< double > &eigenvalues, const std::string &model_metadata, const std::vector< std::vector< std::string >> &cat_feature_keys)
Definition: MLModel.h:442
void deleteModel(const std::string &model_name)
Definition: MLModel.h:62
std::vector< double > coefs_
Definition: MLModel.h:149
const std::vector< double > & getColumnMeans() const
Definition: MLModel.h:462
const std::vector< double > & getCoefs() const
Definition: MLModel.h:146
const std::vector< double > & getEigenvalues() const
Definition: MLModel.h:467
virtual MLModelType getModelType() const override
Definition: MLModel.h:138
std::vector< std::string > getModelNames() const
Definition: MLModel.h:75
virtual int64_t getNumFeatures() const override
Definition: MLModel.h:142
std::shared_timed_mutex shared_mutex
virtual ~AbstractTreeModel()=default
virtual int64_t getNumFeatures() const =0
std::map< std::string, std::shared_ptr< AbstractMLModel > > model_map_
Definition: MLModel.h:121
std::vector< std::vector< double > > eigenvectors_
Definition: MLModel.h:472
const std::vector< std::vector< double > > & getEigenvectors() const
Definition: MLModel.h:464