3 using namespace TableFunctions_Namespace;
// Names of the ML frameworks reported here; one output row is produced per
// entry (see the loop and the final return of num_frameworks below).
12 const std::vector<std::string> ml_frameworks = {
"oneapi",
"onedal",
"mlpack"};
13 const int32_t num_frameworks = ml_frameworks.size();
// Integer ids written to output_ml_frameworks, indexed in parallel with
// ml_frameworks. The initializer is on a line not visible in this excerpt.
15 const std::vector<int32_t> ml_framework_string_ids =
// The "found" helper is only compiled when at least one of HAVE_ONEDAL /
// HAVE_MLPACK is defined.
18 #if defined(HAVE_ONEDAL) || defined(HAVE_MLPACK)
// Records whether a previous row has already been flagged as the default.
19 bool found_available_framework =
false;
// Marks the given row as available. While found_available_framework is still
// false the row is also flagged as the default and the flag is set; the later
// assignment of false is presumably the else branch for subsequent rows (the
// branch line itself is not visible in this excerpt).
// NOTE(review): the body writes output_default, whose capture is presumably on
// a listing line missing from this excerpt.
20 auto framework_found_actions = [&output_availability,
22 &found_available_framework](
const int64_t out_row_idx) {
23 output_availability[out_row_idx] =
true;
24 if (!found_available_framework) {
25 output_default[out_row_idx] =
true;
26 found_available_framework =
true;
28 output_default[out_row_idx] =
false;
// The "not found" helper is only compiled when at least one of HAVE_ONEDAL /
// HAVE_MLPACK is undefined.
33 #if !defined(HAVE_ONEDAL) || !defined(HAVE_MLPACK)
// Marks the given row as neither available nor the default.
34 auto framework_not_found_actions = [&output_availability,
35 &output_default](
const int64_t out_row_idx) {
36 output_availability[out_row_idx] =
false;
37 output_default[out_row_idx] =
false;
// Emit one row per framework: write its string id, then apply the found /
// not-found action. "onedal" and "oneapi" share one branch, "mlpack" has its
// own.
// NOTE(review): both actions appear back to back in each branch below; the
// choice between them is presumably made by preprocessor conditionals on
// lines not visible in this excerpt - confirm against the full file.
41 for (int32_t out_row_idx = 0; out_row_idx < num_frameworks; ++out_row_idx) {
42 output_ml_frameworks[out_row_idx] = ml_framework_string_ids[out_row_idx];
43 if (ml_frameworks[out_row_idx] ==
"onedal" ||
44 ml_frameworks[out_row_idx] ==
"oneapi") {
46 framework_found_actions(out_row_idx);
48 framework_not_found_actions(out_row_idx);
50 }
else if (ml_frameworks[out_row_idx] ==
"mlpack") {
52 framework_found_actions(out_row_idx);
54 framework_not_found_actions(out_row_idx);
// The returned value equals the number of framework rows.
58 return num_frameworks;
// Returns the feature list stored in the model's metadata.
// NOTE(review): model_name is not used by the visible body; the first line of
// the signature is not visible in this excerpt.
62 const std::string& model_name,
63 const std::shared_ptr<AbstractMLModel>& model) {
64 return model->getModelMetadata().getFeatures();
// NOTE(review): isolated parameter and call-argument lines, presumably from
// the pca_fit overload that accepts categorical features. The enclosing
// signature and the call that receives cat_top_k / cat_min_fraction are not
// visible in this excerpt.
71 const int32_t cat_top_k,
72 const float cat_min_fraction,
77 input_cat_features, cat_top_k, cat_min_fraction,
false );
82 preferred_ml_framework_str,
// Reject models that are not linear regression models. linear_reg_model is
// presumably the result of a cast performed on a line not visible here.
98 if (!linear_reg_model) {
99 throw std::runtime_error(
"Model is not of type linear regression.");
102 const auto& coefs = linear_reg_model->getCoefs();
103 const auto& cat_feature_keys = linear_reg_model->getCatFeatureKeys();
// One output row is produced per entry of coefs (see the return below).
104 const int64_t num_sub_coefs =
static_cast<int64_t
>(coefs.size());
// Number of per-feature category key lists held by the model.
105 const int64_t num_cat_features =
static_cast<int64_t
>(cat_feature_keys.size());
108 std::vector<std::string> feature_names =
// Prepend "intercept" so that feature_names[0] labels coefficient index 0 and
// feature_names[coef_idx] lines up with coef_idx for the remaining entries.
110 feature_names.insert(feature_names.begin(),
"intercept");
// sub_coef_idx walks the flat coefficient array / output rows, coef_idx the
// logical coefficients. The increment expressions are on lines not visible in
// this excerpt.
112 for (int64_t sub_coef_idx = 0, coef_idx = 0; sub_coef_idx < num_sub_coefs;
// coef_idx values 1..num_cat_features are handled as categorical features:
// their key list is cat_feature_keys[coef_idx - 1] and one row is written per
// category key. coef_idx 0 never takes this path.
114 if (num_cat_features >= coef_idx && coef_idx > 0) {
115 const auto& col_cat_feature_keys = cat_feature_keys[coef_idx - 1];
// Sub-coefficient numbering within a categorical feature starts at 1.
116 int64_t col_cat_feature_idx = 1;
117 for (
const auto& col_cat_feature_key : col_cat_feature_keys) {
118 output_coef_idx[sub_coef_idx] = coef_idx;
// An empty feature name is emitted as a null dictionary-encoded value.
119 if (feature_names[coef_idx].empty()) {
120 output_feature[sub_coef_idx] = inline_null_value<TextEncodingDict>();
122 output_feature[sub_coef_idx] =
125 output_sub_coef_idx[sub_coef_idx] = col_cat_feature_idx++;
126 output_sub_feature[sub_coef_idx] =
128 output_coef[sub_coef_idx] = coefs[sub_coef_idx];
// Remaining path (presumably the else branch; the branch line is not visible):
// a single row with sub-coefficient index 1 and a null sub-feature.
132 output_coef_idx[sub_coef_idx] = coef_idx;
133 if (feature_names[coef_idx].empty()) {
134 output_feature[sub_coef_idx] = inline_null_value<TextEncodingDict>();
136 output_feature[sub_coef_idx] =
139 output_sub_coef_idx[sub_coef_idx] = 1;
140 output_sub_feature[sub_coef_idx] = inline_null_value<TextEncodingDict>();
141 output_coef[sub_coef_idx] = coefs[sub_coef_idx];
// The returned value equals the number of coefficient entries.
146 return num_sub_coefs;
147 }
// Runtime errors raised above are turned into a table-function error message.
catch (std::runtime_error& e) {
148 return mgr.ERROR_MESSAGE(e.what());
// The model-name column must contain exactly one row; otherwise an error
// message is returned.
160 if (model_name.
size() != 1) {
161 return mgr.ERROR_MESSAGE(
"Expected only one row in model name CURSOR.");
// NOTE(review): argument of a call whose remainder is not visible in this
// excerpt.
165 model_name_text_enc_none,
// Error return for a non-OneDAL framework; the guarding condition is on a line
// not visible in this excerpt.
182 return mgr.ERROR_MESSAGE(
183 "Only OneDAL framework supported for random forest regression.");
// Downcast the base model to the random forest interface; a null result means
// the model is of a different type.
188 const auto rand_forest_model =
189 std::dynamic_pointer_cast<AbstractRandomForestModel>(base_model);
190 if (!rand_forest_model) {
191 throw std::runtime_error(
"Model is not of type random forest.");
193 const auto& variable_importance_scores =
194 rand_forest_model->getVariableImportanceScores();
// Number of importance scores held by the model.
195 const int64_t num_features = variable_importance_scores.size();
// An empty score list is reported as "not computed".
197 if (num_features == 0) {
198 return mgr.ERROR_MESSAGE(
"Variable importance not computed for this model.");
// The number of scores must equal the model's feature count.
200 if (num_features != rand_forest_model->getNumFeatures()) {
201 return mgr.ERROR_MESSAGE(
202 "Mismatch in number of features and number of variable importance metrics.");
204 const auto num_logical_features = rand_forest_model->getNumLogicalFeatures();
205 std::vector<std::string> feature_names =
// Running index into the output columns and into variable_importance_scores;
// advanced once for every row written below.
208 int64_t physical_feature_idx = 0;
209 const auto& cat_feature_keys = rand_forest_model->getCatFeatureKeys();
210 const auto num_cat_features = rand_forest_model->getNumCatFeatures();
211 for (int64_t feature_idx = 0; feature_idx < num_logical_features; ++feature_idx) {
// Logical features with an index below num_cat_features are handled as
// categorical: one row is written per category key of that feature.
213 if (feature_idx < num_cat_features) {
214 const auto& col_cat_feature_keys = cat_feature_keys[feature_idx];
// Sub-feature numbering within a categorical feature starts at 1.
215 int64_t sub_feature_idx = 1;
216 for (
const auto& col_cat_feature_key : col_cat_feature_keys) {
// Reported feature ids are 1-based.
217 feature_id[physical_feature_idx] = feature_idx + 1;
// An empty feature name is emitted as a null dictionary-encoded value.
218 if (feature_names[feature_idx].empty()) {
219 feature[physical_feature_idx] = inline_null_value<TextEncodingDict>();
221 feature[physical_feature_idx] =
224 sub_feature_id[physical_feature_idx] = sub_feature_idx++;
// feature_sub_key is defined on a line not visible in this excerpt.
227 sub_feature[physical_feature_idx] = feature_sub_key;
228 importance_score[physical_feature_idx] =
229 variable_importance_scores[physical_feature_idx];
230 physical_feature_idx++;
// Remaining path (presumably the else branch; the branch line is not visible):
// a single row with sub-feature id 1 and a null sub-feature.
233 feature_id[physical_feature_idx] = feature_idx + 1;
234 if (feature_names[feature_idx].empty()) {
235 feature[physical_feature_idx] = inline_null_value<TextEncodingDict>();
237 feature[physical_feature_idx] =
240 sub_feature_id[physical_feature_idx] = 1;
241 sub_feature[physical_feature_idx] = inline_null_value<TextEncodingDict>();
242 importance_score[physical_feature_idx] =
243 variable_importance_scores[physical_feature_idx];
244 physical_feature_idx++;
249 }
// Runtime errors raised above are turned into a table-function error message.
catch (std::runtime_error& e) {
250 return mgr.ERROR_MESSAGE(e.what());
// The model-name column must contain exactly one row; otherwise an error
// message is returned.
262 if (model_name.
size() != 1) {
263 return mgr.ERROR_MESSAGE(
"Expected only one row in model name CURSOR.");
// NOTE(review): argument of a call whose remainder is not visible in this
// excerpt.
267 model_name_text_enc_none,
// Raised when the model is not a tree-type model; the guarding condition is on
// a line not visible in this excerpt.
290 throw std::runtime_error(
"Model not a tree-type model.");
292 const auto num_trees = tree_model->getNumTrees();
// Collect the entries of every tree into its own vector by running a visitor
// over the tree via traverseDF.
293 std::vector<std::vector<DecisionTreeEntry>> decision_trees(num_trees);
294 for (int64_t tree_idx = 0; tree_idx < num_trees; ++tree_idx) {
295 TreeModelVisitor tree_visitor(decision_trees[tree_idx]);
296 tree_model->traverseDF(tree_idx, tree_visitor);
// Running sum of per-tree entry counts: decision_tree_offsets[i] is the output
// row at which tree i starts, and the last element is the total entry count.
298 std::vector<int64_t> decision_tree_offsets(num_trees + 1);
299 decision_tree_offsets[0] = 0;
300 for (int64_t tree_idx = 0; tree_idx < num_trees; ++tree_idx) {
301 decision_tree_offsets[tree_idx + 1] =
302 decision_tree_offsets[tree_idx] +
303 static_cast<int64_t
>(decision_trees[tree_idx].size());
305 const auto num_entries = decision_tree_offsets[num_trees];
// Write one output row per tree entry, placed at the tree's offset plus the
// entry's position within the tree.
307 for (int64_t tree_idx = 0; tree_idx < num_trees; ++tree_idx) {
308 const auto& decision_tree = decision_trees[tree_idx];
309 const auto output_offset = decision_tree_offsets[tree_idx];
310 const int64_t num_tree_entries = decision_tree.size();
311 for (int64_t entry_idx = 0; entry_idx < num_tree_entries; ++entry_idx) {
312 const int64_t output_idx = output_offset + entry_idx;
313 const auto& tree_entry = decision_tree[entry_idx];
314 const bool entry_is_split_node = tree_entry.isSplitNode();
315 tree_id[output_idx] = tree_idx;
316 entry_id[output_idx] = entry_idx;
317 is_split_node[output_idx] = entry_is_split_node;
// feature_id, left_child and right_child are null for entries that are not
// split nodes; value is written for every entry.
318 feature_id[output_idx] = !entry_is_split_node ? inline_null_value<int64_t>()
319 : tree_entry.feature_index;
320 left_child[output_idx] = !entry_is_split_node ? inline_null_value<int64_t>()
321 : tree_entry.left_child_row_idx;
322 right_child[output_idx] = !entry_is_split_node ? inline_null_value<int64_t>()
323 : tree_entry.right_child_row_idx;
324 value[output_idx] = tree_entry.value;
328 }
// Runtime errors raised above are turned into a table-function error message.
catch (std::runtime_error& e) {
329 const std::string error_str(e.what());
330 return mgr.ERROR_MESSAGE(error_str);
// Fallback when OneDAL is not compiled in: the function only reports an error.
332 #else // Not HAVE_ONEDAL
333 return mgr.ERROR_MESSAGE(
"OneDAL library must be available for get_decision_trees.");
// The model-name column must contain exactly one row; otherwise an error
// message is returned.
347 if (model_name.
size() != 1) {
348 return mgr.ERROR_MESSAGE(
"Expected only one row in model name CURSOR.");
// NOTE(review): argument of a call whose remainder is not visible in this
// excerpt.
352 model_name_text_enc_none,
// Validates the caller-supplied feature counts against the model and throws
// std::runtime_error with a descriptive message on the first mismatch.
// (The first line of the signature is not visible in this excerpt.)
364 const int64_t num_cat_features,
365 const int64_t num_numeric_features) {
// The total of categorical plus numeric features must equal the model's
// logical feature count.
366 if (model->getNumLogicalFeatures() != num_cat_features + num_numeric_features) {
367 std::ostringstream error_oss;
368 error_oss <<
"Model expects " << model->getNumLogicalFeatures() <<
" features but "
369 << num_cat_features + num_numeric_features <<
" were provided.";
370 throw std::runtime_error(error_oss.str());
// The categorical feature count must match on its own as well.
372 if (model->getNumCatFeatures() != num_cat_features) {
373 std::ostringstream error_oss;
374 error_oss <<
"Model expects " << model->getNumCatFeatures()
375 <<
" categorical features but " << num_cat_features <<
" were provided.";
376 throw std::runtime_error(error_oss.str());
380 #endif // #ifndef __CUDACC__
DEVICE const std::string getString(int64_t index) const
ColumnList< T > getFeatures()
NEVER_INLINE HOST int32_t pca_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
void set_output_row_size(int64_t num_rows)
EXTENSION_NOINLINE_HOST int32_t get_decision_trees__cpu_2(TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, Column< int64_t > &tree_id, Column< int64_t > &entry_id, Column< bool > &is_split_node, Column< int64_t > &feature_id, Column< int64_t > &left_child, Column< int64_t > &right_child, Column< double > &value)
EXTENSION_NOINLINE_HOST int32_t pca_fit__cpu_1(TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< TextEncodingDict > &input_cat_features, const int32_t cat_top_k, const float cat_min_fraction, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
EXTENSION_NOINLINE_HOST int32_t supported_ml_frameworks__cpu_(TableFunctionManager &mgr, Column< TextEncodingDict > &output_ml_frameworks, Column< bool > &output_availability, Column< bool > &output_default)
EXTENSION_NOINLINE_HOST void check_model_params(const std::shared_ptr< AbstractMLModel > &model, const int64_t num_cat_features, const int64_t num_numeric_features)
EXTENSION_NOINLINE_HOST int32_t random_forest_reg_var_importance__cpu_2(TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, Column< int64_t > &feature_id, Column< TextEncodingDict > &feature, Column< int64_t > &sub_feature_id, Column< TextEncodingDict > &sub_feature, Column< double > &importance_score)
EXTENSION_NOINLINE_HOST int32_t linear_reg_coefs__cpu_1(TableFunctionManager &mgr, const TextEncodingNone &model_name, Column< int64_t > &output_coef_idx, Column< TextEncodingDict > &output_feature, Column< int64_t > &output_sub_coef_idx, Column< TextEncodingDict > &output_sub_feature, Column< double > &output_coef)
StringDictionaryProxy * string_dict_proxy_
EXTENSION_NOINLINE_HOST int32_t linear_reg_coefs__cpu_2(TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, Column< int64_t > &output_coef_idx, Column< TextEncodingDict > &output_feature, Column< int64_t > &output_sub_coef_idx, Column< TextEncodingDict > &output_sub_feature, Column< double > &output_coef)
#define EXTENSION_NOINLINE_HOST
std::shared_ptr< AbstractMLModel > getModel(const std::string &model_name) const
EXTENSION_NOINLINE_HOST int32_t random_forest_reg_var_importance__cpu_1(TableFunctionManager &mgr, const TextEncodingNone &model_name, Column< int64_t > &feature_id, Column< TextEncodingDict > &feature, Column< int64_t > &sub_feature_id, Column< TextEncodingDict > &sub_feature, Column< double > &importance_score)
std::vector< std::string > get_model_features(const std::string &model_name, const std::shared_ptr< AbstractMLModel > &model)
std::vector< int32_t > getOrAddTransientBulk(const std::vector< std::string > &strings)
DEVICE int64_t size() const
DEVICE const TextEncodingDict getOrAddTransient(const std::string &str)
EXTENSION_NOINLINE_HOST int32_t get_decision_trees__cpu_1(TableFunctionManager &mgr, const TextEncodingNone &model_name, Column< int64_t > &tree_id, Column< int64_t > &entry_id, Column< bool > &is_split_node, Column< int64_t > &feature_id, Column< int64_t > &left_child, Column< int64_t > &right_child, Column< double > &value)
const std::vector< std::vector< std::string > > & getCatFeatureKeys() const