37 #include <tbb/parallel_for.h>
38 #include <tbb/task_arena.h>
40 using namespace TableFunctions_Namespace;
// Builds a vector of read-only pointers, one per column index in
// [start_idx, end_idx), each pointing at the contiguous storage of the matching
// inner vector of `data` (via std::vector::data()). Pointers are appended in
// ascending column order. They alias `data`, so they are only usable while
// `data` is alive and its inner vectors are not resized.
// NOTE(review): the embedded listing numbers jump 46 -> 49 and stop at 51, so
// some statements of this function (and its template header) are not visible
// here; only the visible lines are documented.
43 std::vector<const T*>
pluck_ptrs(
const std::vector<std::vector<T>>& data,
44 const int64_t start_idx,
45 const int64_t end_idx) {
46 std::vector<const T*> raw_ptrs;
// Upper-bound guard: end_idx may equal data.size() (exclusive end) but not exceed it.
49 CHECK_LE(end_idx, static_cast<int64_t>(data.size()));
50 for (int64_t col_idx = start_idx; col_idx < end_idx; ++col_idx) {
51 raw_ptrs.emplace_back(data[col_idx].data());
// Overload of pluck_ptrs for input that is already a vector of raw column
// pointers: copies the pointers for column indexes in [start_idx, end_idx)
// into a new vector, adding const to the pointee type. No column data is copied.
// NOTE(review): the embedded listing numbers jump 60 -> 63 and stop at 65, so
// some statements of this function (and its template header) are not visible here.
57 std::vector<const T*>
pluck_ptrs(
const std::vector<T*>& data,
58 const int64_t start_idx,
59 const int64_t end_idx) {
60 std::vector<const T*> raw_ptrs;
// Upper-bound guard: end_idx may equal data.size() (exclusive end) but not exceed it.
63 CHECK_LE(end_idx, static_cast<int64_t>(data.size()));
64 for (int64_t col_idx = start_idx; col_idx < end_idx; ++col_idx) {
65 raw_ptrs.emplace_back(data[col_idx]);
84 const int64_t num_cat_features,
85 const int64_t num_numeric_features);
// Fragment of a K-means clustering table function templated on K and T.
// NOTE(review): this listing drops many lines (the embedded numbering jumps,
// e.g. 107 -> 113), so the function name, most parameters and the framework
// dispatch conditions are not visible. The comments below describe only the
// statements that are shown.
101 template <
typename K,
typename T>
106 const int num_clusters,
107 const int num_iterations,
// Ids are passed through unchanged to the output id column.
113 output_ids = input_ids;
// Error return for an unrecognized initialization strategy (condition not visible).
116 return mgr.ERROR_MESSAGE(
"Invalid KMeans initialization strategy: " +
120 const auto preferred_ml_framework =
get_ml_framework(preferred_ml_framework_str);
// Error return for an unrecognized framework string (condition not visible).
122 return mgr.ERROR_MESSAGE(
"Invalid ML Framework: " +
// presumably denull_data drops rows containing nulls; its result exposes the
// row counts before (unmasked) and after (masked) that step -- verify in helper.
127 const auto denulled_data =
denull_data(input_features);
128 const int64_t num_rows = denulled_data.masked_num_rows;
// True when fewer rows remain after denulling than were supplied.
129 const bool data_is_masked =
130 denulled_data.masked_num_rows < denulled_data.unmasked_num_rows;
// When rows were removed, results go to a temporary buffer of num_rows entries;
// otherwise they are written straight into the output column's storage.
131 std::vector<int32_t> denulled_output_allocation(data_is_masked ? num_rows : 0);
132 int32_t* denulled_output =
133 data_is_masked ? denulled_output_allocation.data() : output_clusters.
ptr_;
// normalized_data is produced on lines not shown in this listing.
137 const auto normalized_ptrs =
pluck_ptrs(normalized_data, 0L, normalized_data.size());
139 bool did_execute =
false;
// Three backend entry points are visible (oneAPI oneDAL, oneDAL, mlpack); the
// conditions that choose between them are on lines not shown.
143 onedal_oneapi_kmeans_impl(normalized_ptrs,
148 kmeans_init_strategy);
151 onedal_kmeans_impl(normalized_ptrs,
156 kmeans_init_strategy);
163 mlpack_kmeans_impl(normalized_ptrs,
168 kmeans_init_strategy);
// Error return naming the requested framework (guard not visible; presumably
// taken when did_execute stayed false -- confirm).
173 return mgr.ERROR_MESSAGE(
"Cannot find " + preferred_ml_framework_str.
getString() +
174 " ML library to support kmeans implementation.");
// For masked data a helper (name not visible) receives the reverse index map,
// the real output buffer, the original row count and the int32 null sentinel.
177 if (data_is_masked) {
179 denulled_data.reverse_index_map,
180 output_clusters.
ptr_,
181 denulled_data.unmasked_num_rows,
182 inline_null_value<int32_t>());
184 }
// std::runtime_error raised inside the try block is converted into a
// table-function error message.
catch (std::runtime_error& e) {
185 return mgr.ERROR_MESSAGE(e.what());
// The value returned on success is the size of the input id column.
187 return input_ids.
size();
// Fragment of a DBSCAN clustering table function templated on K and T.
// NOTE(review): this listing drops many lines (the embedded numbering jumps,
// e.g. 208 -> 213), so the function name, most parameters and the framework
// dispatch conditions are not visible. The comments below describe only the
// statements that are shown.
202 template <
typename K,
typename T>
207 const double epsilon,
208 const int32_t min_observations,
// Ids are passed through unchanged to the output id column.
213 output_ids = input_ids;
215 const auto preferred_ml_framework =
get_ml_framework(preferred_ml_framework_str);
// Error return for an unrecognized framework string (condition not visible).
217 return mgr.ERROR_MESSAGE(
"Invalid ML Framework: " +
// presumably denull_data drops rows containing nulls; its result exposes the
// row counts before (unmasked) and after (masked) that step -- verify in helper.
222 const auto denulled_data =
denull_data(input_features);
223 const int64_t num_rows = denulled_data.masked_num_rows;
// True when fewer rows remain after denulling than were supplied.
224 const bool data_is_masked =
225 denulled_data.masked_num_rows < denulled_data.unmasked_num_rows;
// When rows were removed, results go to a temporary buffer of num_rows entries;
// otherwise they are written straight into the output column's storage.
226 std::vector<int32_t> denulled_output_allocation(data_is_masked ? num_rows : 0);
227 int32_t* denulled_output =
228 data_is_masked ? denulled_output_allocation.data() : output_clusters.
ptr_;
// normalized_data is produced on lines not shown in this listing.
232 const auto normalized_ptrs =
pluck_ptrs(normalized_data, 0L, normalized_data.size());
234 bool did_execute =
false;
// Three backend calls share the same argument list; only the first callee name
// (onedal_oneapi_dbscan_impl) is visible, the other two are on dropped lines.
238 onedal_oneapi_dbscan_impl(
239 normalized_ptrs, denulled_output, num_rows, epsilon, min_observations);
243 normalized_ptrs, denulled_output, num_rows, epsilon, min_observations);
251 normalized_ptrs, denulled_output, num_rows, epsilon, min_observations);
// Error return naming the requested framework (guard not visible; presumably
// taken when did_execute stayed false -- confirm).
256 return mgr.ERROR_MESSAGE(
"Cannot find " + preferred_ml_framework_str.
getString() +
257 " ML library to support dbscan implementation.");
// For masked data a helper (name not visible) receives the reverse index map,
// the real output buffer, the original row count and the int32 null sentinel.
260 if (data_is_masked) {
262 denulled_data.reverse_index_map,
263 output_clusters.
ptr_,
264 denulled_data.unmasked_num_rows,
265 inline_null_value<int32_t>());
267 }
// std::runtime_error raised inside the try block is converted into a
// table-function error message.
catch (std::runtime_error& e) {
268 return mgr.ERROR_MESSAGE(e.what());
// The value returned on success is the size of the input id column.
270 return input_ids.
size();
273 template <
typename T>
279 const std::vector<std::vector<std::string>>& cat_feature_keys,
283 if (input_labels.
size() == 0) {
284 return mgr.ERROR_MESSAGE(
285 "No rows exist in training data. Training data must at least contain 1 row.");
287 const auto preferred_ml_framework =
get_ml_framework(preferred_ml_framework_str);
289 return mgr.ERROR_MESSAGE(
"Invalid ML Framework: " +
292 const auto denulled_data =
denull_data(input_labels, input_features);
293 const auto labels_ptrs =
pluck_ptrs(denulled_data.data, 0L, 1L);
294 const auto features_ptrs =
296 const int64_t num_coefs = input_features.
numCols() + 1;
298 std::vector<int64_t> coef_idxs(num_coefs);
299 std::vector<double> coefs(num_coefs);
301 bool did_execute =
false;
309 onedal_linear_reg_fit_impl(labels_ptrs[0],
313 denulled_data.masked_num_rows);
316 onedal_oneapi_linear_reg_fit_impl(labels_ptrs[0],
320 denulled_data.masked_num_rows);
327 mlpack_linear_reg_fit_impl(labels_ptrs[0],
331 denulled_data.masked_num_rows);
336 return mgr.ERROR_MESSAGE(
337 "Cannot find " + preferred_ml_framework_str.
getString() +
338 " ML library to support linear regression implementation.");
340 }
catch (std::runtime_error& e) {
341 return mgr.ERROR_MESSAGE(e.what());
344 std::make_shared<LinearRegressionModel>(coefs, model_metadata, cat_feature_keys);
346 const std::string model_name_str = model_name.
getString();
349 output_model_name[0] = model_name_str_id;
364 template <
typename T>
373 std::vector<std::vector<std::string>> empty_cat_feature_keys;
378 empty_cat_feature_keys,
379 preferred_ml_framework_str,
384 template <
typename T>
// Constructor fragment: one-hot encodes the categorical columns using top-k
// style settings and then appends the numeric feature columns, so col_ptrs_
// holds encoded columns first and numeric columns last.
// NOTE(review): the leading parameters and some statements are on lines this
// listing drops (embedded numbering starts at 389 and skips 393, 400, 408-409).
389 const int32_t cat_top_k,
390 const float cat_min_fraction,
391 const bool cat_include_others)
// Row count is taken from the numeric feature list.
392 : num_rows_(numeric_features.size()) {
394 one_hot_encoding_info(cat_top_k, cat_min_fraction, cat_include_others);
395 const size_t num_cat_features =
static_cast<size_t>(cat_features.
numCols());
396 std::vector<TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodingInfo>
397 one_hot_encoding_infos;
// Every categorical column gets a copy of the same encoding settings.
398 for (
size_t cat_idx = 0; cat_idx < num_cat_features; ++cat_idx) {
399 one_hot_encoding_infos.emplace_back(one_hot_encoding_info);
401 one_hot_encoded_cols_ =
402 TableFunctions_Namespace::OneHotEncoder_Namespace::one_hot_encode<T>(
403 cat_features, one_hot_encoding_infos);
// Record the category keys of each encoded column and collect a pointer to each
// encoded buffer. The pointers refer to storage owned by one_hot_encoded_cols_.
404 for (
auto& one_hot_encoded_col : one_hot_encoded_cols_) {
405 cat_feature_keys_.emplace_back(one_hot_encoded_col.cat_features);
406 for (
auto& one_hot_encoded_vec : one_hot_encoded_col.encoded_buffers) {
407 col_ptrs_.emplace_back(reinterpret_cast<int8_t*>(one_hot_encoded_vec.data()));
// Numeric feature column pointers are appended after the encoded ones.
410 const int64_t num_numeric_features = numeric_features.
numCols();
411 for (int64_t numeric_feature_idx = 0; numeric_feature_idx < num_numeric_features;
412 ++numeric_feature_idx) {
413 col_ptrs_.emplace_back(numeric_features.
ptrs_[numeric_feature_idx]);
// Constructor fragment for categorical-only input: one-hot encodes the
// categorical columns using top-k style settings; no numeric columns are added.
// NOTE(review): the leading parameters and some statements are on lines this
// listing drops (embedded numbering starts at 418 and skips 422, 429).
418 const int32_t cat_top_k,
419 const float cat_min_fraction,
420 const bool cat_include_others)
// Row count is taken from the categorical feature list.
421 : num_rows_(cat_features.size()) {
423 one_hot_encoding_info(cat_top_k, cat_min_fraction, cat_include_others);
424 const size_t num_cat_features =
static_cast<size_t>(cat_features.
numCols());
425 std::vector<TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodingInfo>
426 one_hot_encoding_infos;
// Every categorical column gets a copy of the same encoding settings.
427 for (
size_t cat_idx = 0; cat_idx < num_cat_features; ++cat_idx) {
428 one_hot_encoding_infos.emplace_back(one_hot_encoding_info);
430 one_hot_encoded_cols_ =
431 TableFunctions_Namespace::OneHotEncoder_Namespace::one_hot_encode<T>(
432 cat_features, one_hot_encoding_infos);
// Record the category keys of each encoded column and collect a pointer to each
// encoded buffer. The pointers refer to storage owned by one_hot_encoded_cols_.
433 for (
auto& one_hot_encoded_col : one_hot_encoded_cols_) {
434 cat_feature_keys_.emplace_back(one_hot_encoded_col.cat_features);
435 for (
auto& one_hot_encoded_vec : one_hot_encoded_col.encoded_buffers) {
436 col_ptrs_.emplace_back(reinterpret_cast<int8_t*>(one_hot_encoded_vec.data()));
// Constructor fragment: one-hot encodes the categorical columns against
// caller-supplied category keys (one key list per categorical column), then
// appends the numeric feature columns after the encoded ones.
// Throws std::runtime_error when the number of categorical columns differs from
// the number of key lists.
// NOTE(review): the leading parameters and some statements are on lines this
// listing drops (embedded numbering starts at 444 and skips 451, 456, 463-464).
444 const std::vector<std::vector<std::string>>& cat_feature_keys)
// Row count comes from the numeric features; the key lists are copied into the member.
445 : num_rows_(numeric_features.size()), cat_feature_keys_(cat_feature_keys) {
446 const size_t num_cat_features =
static_cast<size_t>(cat_features.
numCols());
447 if (num_cat_features != cat_feature_keys_.size()) {
448 throw std::runtime_error(
449 "Number of provided categorical features does not match number of categorical "
450 "features in the model.");
452 std::vector<TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodingInfo>
453 one_hot_encoding_infos;
// One encoding-info entry per categorical column, built from that column's key list.
454 for (
size_t cat_idx = 0; cat_idx < num_cat_features; ++cat_idx) {
455 one_hot_encoding_infos.emplace_back(cat_feature_keys_[cat_idx]);
457 one_hot_encoded_cols_ =
458 TableFunctions_Namespace::OneHotEncoder_Namespace::one_hot_encode<T>(
459 cat_features, one_hot_encoding_infos);
// Collect a pointer to each encoded buffer; the pointers refer to storage owned
// by one_hot_encoded_cols_. Unlike the top-k constructors, keys are not re-recorded.
460 for (
auto& one_hot_encoded_col : one_hot_encoded_cols_) {
461 for (
auto& one_hot_encoded_vec : one_hot_encoded_col.encoded_buffers) {
462 col_ptrs_.emplace_back(reinterpret_cast<int8_t*>(one_hot_encoded_vec.data()));
// Numeric feature column pointers are appended after the encoded ones.
465 const int64_t num_numeric_features = numeric_features.
numCols();
466 for (int64_t numeric_feature_idx = 0; numeric_feature_idx < num_numeric_features;
467 ++numeric_feature_idx) {
468 col_ptrs_.emplace_back(numeric_features.
ptrs_[numeric_feature_idx]);
// Constructor fragment for categorical-only input: one-hot encodes the
// categorical columns against caller-supplied category keys (one key list per
// categorical column); no numeric columns are added.
// Throws std::runtime_error when the number of categorical columns differs from
// the number of key lists.
// NOTE(review): the leading parameters and some statements are on lines this
// listing drops (embedded numbering starts at 474 and skips 481, 486).
474 const std::vector<std::vector<std::string>>& cat_feature_keys)
// Row count comes from the categorical features; the key lists are copied into the member.
475 : num_rows_(cat_features.size()), cat_feature_keys_(cat_feature_keys) {
476 const size_t num_cat_features =
static_cast<size_t>(cat_features.
numCols());
477 if (num_cat_features != cat_feature_keys_.size()) {
478 throw std::runtime_error(
479 "Number of provided categorical features does not match number of categorical "
480 "features in the model.");
482 std::vector<TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodingInfo>
483 one_hot_encoding_infos;
// One encoding-info entry per categorical column, built from that column's key list.
484 for (
size_t cat_idx = 0; cat_idx < num_cat_features; ++cat_idx) {
485 one_hot_encoding_infos.emplace_back(cat_feature_keys_[cat_idx]);
487 one_hot_encoded_cols_ =
488 TableFunctions_Namespace::OneHotEncoder_Namespace::one_hot_encode<T>(
489 cat_features, one_hot_encoding_infos);
// Collect a pointer to each encoded buffer; the pointers refer to storage owned
// by one_hot_encoded_cols_.
490 for (
auto& one_hot_encoded_col : one_hot_encoded_cols_) {
491 for (
auto& one_hot_encoded_vec : one_hot_encoded_col.encoded_buffers) {
492 col_ptrs_.emplace_back(reinterpret_cast<int8_t*>(one_hot_encoded_vec.data()));
499 col_ptrs_.data(),
static_cast<int64_t
>(col_ptrs_.size()), num_rows_);
503 return cat_feature_keys_;
508 std::vector<TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodedCol<T>>
528 template <
typename T>
535 const int32_t cat_top_k,
536 const float cat_min_fraction,
541 input_numeric_features,
551 preferred_ml_framework_str,
569 template <
typename T>
575 const int32_t cat_top_k,
576 const float cat_min_fraction,
581 input_cat_features, cat_top_k, cat_min_fraction,
false );
588 preferred_ml_framework_str,
593 template <
typename T>
595 Column<T> wrapper_col(col_vec.data(),
static_cast<int64_t
>(col_vec.size()));
637 template <
typename T>
643 const std::vector<std::vector<std::string>>& cat_feature_keys,
644 const int64_t max_tree_depth,
645 const int64_t min_observations_per_leaf_node,
649 if (input_labels.
size() == 0) {
650 return mgr.ERROR_MESSAGE(
651 "No rows exist in training data. Training data must at least contain 1 row.");
653 const auto preferred_ml_framework =
get_ml_framework(preferred_ml_framework_str);
655 return mgr.ERROR_MESSAGE(
"Invalid ML Framework: " +
659 return mgr.ERROR_MESSAGE(
660 "Only OneDAL framework supported for decision tree regression.");
663 return mgr.ERROR_MESSAGE(
664 "Only OneDAL framework supported for decision tree regression.");
667 const auto denulled_data =
denull_data(input_labels, input_features);
668 const auto labels_ptrs =
pluck_ptrs(denulled_data.data, 0L, 1L);
669 const auto features_ptrs =
673 bool did_execute =
false;
677 onedal_decision_tree_reg_fit_impl<T>(model_name,
682 denulled_data.masked_num_rows,
684 min_observations_per_leaf_node);
687 output_model_name[0] = model_name_str_id;
692 return mgr.ERROR_MESSAGE(
693 "Cannot find " + preferred_ml_framework_str.
getString() +
694 " ML library to support decision tree regression implementation.");
696 }
catch (std::runtime_error& e) {
697 return mgr.ERROR_MESSAGE(e.what());
715 template <
typename T>
721 const int64_t max_tree_depth,
722 const int64_t min_observations_per_leaf_node,
726 std::vector<std::vector<std::string>> empty_cat_feature_keys;
731 empty_cat_feature_keys,
733 min_observations_per_leaf_node,
734 preferred_ml_framework_str,
754 template <
typename T>
761 const int64_t max_tree_depth,
762 const int64_t min_observations_per_leaf_node,
763 const int32_t cat_top_k,
764 const float cat_min_fraction,
768 std::vector<std::vector<std::string>> empty_cat_feature_keys;
770 input_numeric_features,
780 min_observations_per_leaf_node,
781 preferred_ml_framework_str,
801 template <
typename T>
807 const int64_t max_tree_depth,
808 const int64_t min_observations_per_leaf_node,
809 const int32_t cat_top_k,
810 const float cat_min_fraction,
814 std::vector<std::vector<std::string>> empty_cat_feature_keys;
816 input_cat_features, cat_top_k, cat_min_fraction,
false );
823 min_observations_per_leaf_node,
824 preferred_ml_framework_str,
829 template <
typename T>
835 const std::vector<std::vector<std::string>>& cat_feature_keys,
836 const int64_t max_iterations,
837 const int64_t max_tree_depth,
838 const double shrinkage,
839 const double min_split_loss,
841 const double obs_per_tree_fraction,
842 const int64_t features_per_node,
843 const int64_t min_observations_per_leaf_node,
844 const int64_t max_bins,
845 const int64_t min_bin_size,
849 if (input_labels.
size() == 0) {
850 return mgr.ERROR_MESSAGE(
851 "No rows exist in training data. Training data must at least contain 1 row.");
853 const auto preferred_ml_framework =
get_ml_framework(preferred_ml_framework_str);
855 return mgr.ERROR_MESSAGE(
"Invalid ML Framework: " +
859 return mgr.ERROR_MESSAGE(
"Only OneDAL framework supported for GBT regression.");
862 return mgr.ERROR_MESSAGE(
"Only OneDAL framework supported for GBT regression.");
865 const auto denulled_data =
denull_data(input_labels, input_features);
866 const auto labels_ptrs =
pluck_ptrs(denulled_data.data, 0L, 1L);
867 const auto features_ptrs =
871 bool did_execute =
false;
875 onedal_gbt_reg_fit_impl<T>(model_name,
880 denulled_data.masked_num_rows,
886 obs_per_tree_fraction,
888 min_observations_per_leaf_node,
893 output_model_name[0] = model_name_str_id;
898 return mgr.ERROR_MESSAGE(
"Cannot find " + preferred_ml_framework_str.
getString() +
899 " ML library to support GBT regression implementation.");
901 }
catch (std::runtime_error& e) {
902 return mgr.ERROR_MESSAGE(e.what());
928 template <
typename T>
934 const int64_t max_iterations,
935 const int64_t max_tree_depth,
936 const double shrinkage,
937 const double min_split_loss,
939 const double obs_per_tree_fraction,
940 const int64_t features_per_node,
941 const int64_t min_observations_per_leaf_node,
942 const int64_t max_bins,
943 const int64_t min_bin_size,
947 std::vector<std::vector<std::string>> empty_cat_feature_keys;
952 empty_cat_feature_keys,
958 obs_per_tree_fraction,
960 min_observations_per_leaf_node,
963 preferred_ml_framework_str,
991 template <
typename T>
998 const int64_t max_iterations,
999 const int64_t max_tree_depth,
1000 const double shrinkage,
1001 const double min_split_loss,
1002 const double lambda,
1003 const double obs_per_tree_fraction,
1004 const int64_t features_per_node,
1005 const int64_t min_observations_per_leaf_node,
1006 const int64_t max_bins,
1007 const int64_t min_bin_size,
1008 const int32_t cat_top_k,
1009 const float cat_min_fraction,
1014 input_numeric_features,
1028 obs_per_tree_fraction,
1030 min_observations_per_leaf_node,
1033 preferred_ml_framework_str,
1061 template <
typename T>
1067 const int64_t max_iterations,
1068 const int64_t max_tree_depth,
1069 const double shrinkage,
1070 const double min_split_loss,
1071 const double lambda,
1072 const double obs_per_tree_fraction,
1073 const int64_t features_per_node,
1074 const int64_t min_observations_per_leaf_node,
1075 const int64_t max_bins,
1076 const int64_t min_bin_size,
1077 const int32_t cat_top_k,
1078 const float cat_min_fraction,
1083 input_cat_features, cat_top_k, cat_min_fraction,
false );
1094 obs_per_tree_fraction,
1096 min_observations_per_leaf_node,
1099 preferred_ml_framework_str,
1104 template <
typename T>
1110 const std::vector<std::vector<std::string>>& cat_feature_keys,
1111 const int64_t num_trees,
1112 const double obs_per_tree_fraction,
1113 const int64_t max_tree_depth,
1114 const int64_t features_per_node,
1115 const double impurity_threshold,
1116 const bool bootstrap,
1117 const int64_t min_obs_per_leaf_node,
1118 const int64_t min_obs_per_split_node,
1119 const double min_weight_fraction_in_leaf_node,
1120 const double min_impurity_decrease_in_split_node,
1121 const int64_t max_leaf_nodes,
1122 const bool use_histogram,
1127 if (input_labels.
size() == 0) {
1128 return mgr.ERROR_MESSAGE(
1129 "No rows exist in training data. Training data must at least contain 1 row.");
1131 const auto preferred_ml_framework =
get_ml_framework(preferred_ml_framework_str);
1133 return mgr.ERROR_MESSAGE(
"Invalid ML Framework: " +
1134 preferred_ml_framework_str.
getString());
1137 return mgr.ERROR_MESSAGE(
1138 "Only OneDAL framework supported for random forest regression.");
1141 return mgr.ERROR_MESSAGE(
1142 "Only OneDAL framework supported for random forest regression.");
1145 const auto denulled_data =
denull_data(input_labels, input_features);
1146 const auto labels_ptrs =
pluck_ptrs(denulled_data.data, 0L, 1L);
1147 const auto features_ptrs =
1151 bool did_execute =
false;
1152 const auto var_importance_metric =
1155 return mgr.ERROR_MESSAGE(
"Invalid variable importance metric: " +
1161 if (use_histogram) {
1162 onedal_oneapi_random_forest_reg_fit_impl<
1164 oneapi::dal::decision_forest::method::hist>(
1170 denulled_data.masked_num_rows,
1172 obs_per_tree_fraction,
1177 min_obs_per_leaf_node,
1178 min_obs_per_split_node,
1179 min_weight_fraction_in_leaf_node,
1180 min_impurity_decrease_in_split_node,
1182 var_importance_metric);
1184 onedal_oneapi_random_forest_reg_fit_impl<
1186 oneapi::dal::decision_forest::method::dense>(
1192 denulled_data.masked_num_rows,
1194 obs_per_tree_fraction,
1199 min_obs_per_leaf_node,
1200 min_obs_per_split_node,
1201 min_weight_fraction_in_leaf_node,
1202 min_impurity_decrease_in_split_node,
1204 var_importance_metric);
1208 output_model_name[0] = model_name_str_id;
1211 if (use_histogram) {
1212 onedal_random_forest_reg_fit_impl<T, decision_forest::regression::training::hist>(
1218 denulled_data.masked_num_rows,
1220 obs_per_tree_fraction,
1225 min_obs_per_leaf_node,
1226 min_obs_per_split_node,
1227 min_weight_fraction_in_leaf_node,
1228 min_impurity_decrease_in_split_node,
1230 var_importance_metric);
1232 onedal_random_forest_reg_fit_impl<
1234 decision_forest::regression::training::defaultDense>(
1240 denulled_data.masked_num_rows,
1242 obs_per_tree_fraction,
1247 min_obs_per_leaf_node,
1248 min_obs_per_split_node,
1249 min_weight_fraction_in_leaf_node,
1250 min_impurity_decrease_in_split_node,
1252 var_importance_metric);
1256 output_model_name[0] = model_name_str_id;
1261 return mgr.ERROR_MESSAGE(
1262 "Cannot find " + preferred_ml_framework_str.
getString() +
1263 " ML library to support random forest regression implementation.");
1265 }
catch (std::runtime_error& e) {
1266 return mgr.ERROR_MESSAGE(e.what());
1295 template <
typename T>
1301 const int64_t num_trees,
1302 const double obs_per_tree_fraction,
1303 const int64_t max_tree_depth,
1304 const int64_t features_per_node,
1305 const double impurity_threshold,
1306 const bool bootstrap,
1307 const int64_t min_obs_per_leaf_node,
1308 const int64_t min_obs_per_split_node,
1309 const double min_weight_fraction_in_leaf_node,
1310 const double min_impurity_decrease_in_split_node,
1311 const int64_t max_leaf_nodes,
1312 const bool use_histogram,
1317 std::vector<std::vector<std::string>> empty_cat_feature_keys;
1322 empty_cat_feature_keys,
1324 obs_per_tree_fraction,
1329 min_obs_per_leaf_node,
1330 min_obs_per_split_node,
1331 min_weight_fraction_in_leaf_node,
1332 min_impurity_decrease_in_split_node,
1335 var_importance_metric_str,
1336 preferred_ml_framework_str,
1367 template <
typename T>
1374 const int64_t num_trees,
1375 const double obs_per_tree_fraction,
1376 const int64_t max_tree_depth,
1377 const int64_t features_per_node,
1378 const double impurity_threshold,
1379 const bool bootstrap,
1380 const int64_t min_obs_per_leaf_node,
1381 const int64_t min_obs_per_split_node,
1382 const double min_weight_fraction_in_leaf_node,
1383 const double min_impurity_decrease_in_split_node,
1384 const int64_t max_leaf_nodes,
1385 const bool use_histogram,
1387 const int32_t cat_top_k,
1388 const float cat_min_fraction,
1393 input_numeric_features,
1403 obs_per_tree_fraction,
1408 min_obs_per_leaf_node,
1409 min_obs_per_split_node,
1410 min_weight_fraction_in_leaf_node,
1411 min_impurity_decrease_in_split_node,
1414 var_importance_metric_str,
1415 preferred_ml_framework_str,
1446 template <
typename T>
1452 const int64_t num_trees,
1453 const double obs_per_tree_fraction,
1454 const int64_t max_tree_depth,
1455 const int64_t features_per_node,
1456 const double impurity_threshold,
1457 const bool bootstrap,
1458 const int64_t min_obs_per_leaf_node,
1459 const int64_t min_obs_per_split_node,
1460 const double min_weight_fraction_in_leaf_node,
1461 const double min_impurity_decrease_in_split_node,
1462 const int64_t max_leaf_nodes,
1463 const bool use_histogram,
1465 const int32_t cat_top_k,
1466 const float cat_min_fraction,
1471 input_cat_features, cat_top_k, cat_min_fraction,
false );
1478 obs_per_tree_fraction,
1483 min_obs_per_leaf_node,
1484 min_obs_per_split_node,
1485 min_weight_fraction_in_leaf_node,
1486 min_impurity_decrease_in_split_node,
1489 var_importance_metric_str,
1490 preferred_ml_framework_str,
// Fragment of the PCA fitting implementation. Visible flow: reject empty input,
// validate the framework string, denull the features, reject input with no
// remaining rows, normalize, run a PCA backend, build a PcaModel and write the
// model name id into the first output row.
// NOTE(review): this listing drops many lines (the embedded numbering jumps,
// e.g. 1500 -> 1504, 1538 -> 1546), so the signature, the framework dispatch
// conditions and the model registration are not fully visible.
1495 template <
typename T>
1500 const std::vector<std::vector<std::string>>& cat_feature_keys,
// Guard: an input with zero rows is reported as an error before any work is done.
1504 if (input_features.
size() == 0) {
1505 return mgr.ERROR_MESSAGE(
1506 "No rows exist in training data. Training data must at least contain 1 row.");
1508 const auto preferred_ml_framework =
get_ml_framework(preferred_ml_framework_str);
// Error return for an unrecognized framework string (condition not visible).
1510 return mgr.ERROR_MESSAGE(
"Invalid ML Framework: " +
1511 preferred_ml_framework_str.
getString());
// presumably denull_data drops rows containing nulls -- verify in helper.
1514 const auto denulled_data =
denull_data(input_features);
1515 const int64_t num_rows = denulled_data.masked_num_rows;
// Second guard: the input had rows, but none are left after denulling.
1516 if (num_rows == 0) {
1517 return mgr.ERROR_MESSAGE(
1518 "No non-null rows exist in training data. Training data must at least contain "
// The initializers of features_ptrs and z_std_norm_summary_stats are on dropped
// lines; the latter exposes normalized_data, means and std_devs below.
1522 const auto features_ptrs =
1525 const auto z_std_norm_summary_stats =
1527 const auto normalized_ptrs =
1528 pluck_ptrs(z_std_norm_summary_stats.normalized_data,
1530 z_std_norm_summary_stats.normalized_data.size());
1531 bool did_execute =
false;
// Two backend paths are visible (oneAPI oneDAL and oneDAL). Each returns an
// (eigenvectors, eigenvalues) pair and builds a PcaModel from the normalization
// means and standard deviations; the remaining constructor arguments are not shown.
1535 const auto [eigenvectors, eigenvalues] =
1536 onedal_oneapi_pca_impl(normalized_ptrs, denulled_data.masked_num_rows);
1537 auto model = std::make_shared<PcaModel>(z_std_norm_summary_stats.means,
1538 z_std_norm_summary_stats.std_devs,
1546 const auto [eigenvectors, eigenvalues] =
1547 onedal_pca_impl(normalized_ptrs, denulled_data.masked_num_rows);
1548 auto model = std::make_shared<PcaModel>(z_std_norm_summary_stats.means,
1549 z_std_norm_summary_stats.std_devs,
// Error return naming the requested framework (guard not visible; presumably
// taken when did_execute stayed false -- confirm).
1559 return mgr.ERROR_MESSAGE(
"Cannot find " + preferred_ml_framework_str.
getString() +
1560 " ML library to support PCA implementation.");
// model_name_str_id is computed on a dropped line.
1565 output_model_name[0] = model_name_str_id;
1567 }
// std::runtime_error raised inside the try block is converted into a
// table-function error message.
catch (std::runtime_error& e) {
1568 return mgr.ERROR_MESSAGE(e.what());
1583 template <
typename T>
1591 std::vector<std::vector<std::string>> empty_cat_feature_keys;
1595 empty_cat_feature_keys,
1596 preferred_ml_framework_str,
1614 template <
typename T>
1620 const int32_t cat_top_k,
1621 const float cat_min_fraction,
1626 input_numeric_features,
1634 preferred_ml_framework_str,
1656 const int32_t cat_top_k,
1657 const float cat_min_fraction,
// Fragment of the regression prediction implementation that works on an
// already-resolved model object. Visible flow: validate the framework string,
// denull the features, pick an output buffer, dispatch on the model type to a
// backend predict call, then copy ids through and write results for masked data.
// NOTE(review): this listing drops many lines (the embedded numbering jumps,
// e.g. 1665 -> 1671, 1689 -> 1691), so the function name, the case labels and
// the framework dispatch conditions are not visible.
1662 template <
typename T,
typename K>
1665 const std::shared_ptr<AbstractMLModel>& model,
1671 const auto preferred_ml_framework =
get_ml_framework(preferred_ml_framework_str);
// Error return for an unrecognized framework string (condition not visible).
1673 return mgr.ERROR_MESSAGE(
"Invalid ML Framework: " +
1674 preferred_ml_framework_str.
getString());
// presumably denull_data drops rows containing nulls -- verify in helper.
1676 const auto denulled_data =
denull_data(input_features);
1677 const int64_t num_rows = denulled_data.masked_num_rows;
// True when fewer rows remain after denulling than were supplied.
1678 const bool data_is_masked =
1679 denulled_data.masked_num_rows < denulled_data.unmasked_num_rows;
// When rows were removed, predictions go to a temporary buffer of num_rows
// entries; otherwise they are written straight into the output column's storage.
1680 std::vector<T> denulled_output_allocation(data_is_masked ? num_rows : 0);
1682 T* denulled_output =
1683 data_is_masked ? denulled_output_allocation.data() : output_predictions.
ptr_;
// One pointer per input feature column, taken from the denulled data.
1684 const auto features_ptrs =
pluck_ptrs(denulled_data.data, 0L, input_features.
numCols());
1687 bool did_execute =
false;
1688 const auto model_type = model->getModelType();
1689 switch (model_type) {
// Linear regression: the downcast expression is on a dropped line; the CHECK
// aborts if the model pointer is null. oneAPI oneDAL, oneDAL and mlpack predict
// calls are visible.
1691 const auto linear_reg_model =
1693 CHECK(linear_reg_model);
1697 onedal_oneapi_linear_reg_predict_impl(
1698 linear_reg_model, features_ptrs, denulled_output, num_rows);
1701 onedal_linear_reg_predict_impl(
1702 linear_reg_model, features_ptrs, denulled_output, num_rows);
1709 mlpack_linear_reg_predict_impl(
1710 linear_reg_model, features_ptrs, denulled_output, num_rows);
// Decision tree regression: downcast, CHECK non-null, oneDAL predict.
1718 const auto decision_tree_reg_model =
1719 std::dynamic_pointer_cast<DecisionTreeRegressionModel>(model);
1720 CHECK(decision_tree_reg_model);
1723 onedal_decision_tree_reg_predict_impl(
1724 decision_tree_reg_model, features_ptrs, denulled_output, num_rows);
// Gradient-boosted tree regression: downcast, CHECK non-null, oneDAL predict.
1732 const auto gbt_reg_model = std::dynamic_pointer_cast<GbtRegressionModel>(model);
1733 CHECK(gbt_reg_model);
1736 onedal_gbt_reg_predict_impl(
1737 gbt_reg_model, features_ptrs, denulled_output, num_rows);
// Random forest regression: the model may be either of two concrete classes, so
// both downcasts are tried and at least one must succeed.
1745 const auto random_forest_reg_model =
1746 std::dynamic_pointer_cast<RandomForestRegressionModel>(model);
1747 const auto oneapi_random_forest_reg_model =
1748 std::dynamic_pointer_cast<OneAPIRandomForestRegressionModel>(model);
1749 CHECK(random_forest_reg_model || oneapi_random_forest_reg_model);
// The predict call is chosen by which downcast succeeded.
1753 if (random_forest_reg_model) {
1754 onedal_random_forest_reg_predict_impl(
1755 random_forest_reg_model, features_ptrs, denulled_output, num_rows);
1757 onedal_oneapi_random_forest_reg_predict_impl(
1758 oneapi_random_forest_reg_model, features_ptrs, denulled_output, num_rows);
// Thrown for a model type with no handler (label not visible; presumably the
// switch default). It is a std::runtime_error, matching the catch below.
1766 throw std::runtime_error(
"Unsupported model type");
// Error return naming the requested framework (guard not visible; presumably
// taken when did_execute stayed false -- confirm).
1770 return mgr.ERROR_MESSAGE(
"Cannot find " + preferred_ml_framework_str.
getString() +
1771 " ML library to support model implementation.");
1773 }
// std::runtime_error raised inside the try block is converted into a
// table-function error message.
catch (std::runtime_error& e) {
1774 const std::string error_str(e.what());
1775 return mgr.ERROR_MESSAGE(error_str);
// Ids are passed through unchanged to the output id column.
1777 output_ids = input_ids;
// For masked data a helper (name not visible) receives the reverse index map,
// the real output buffer, the original row count and the null sentinel for T.
1778 if (data_is_masked) {
1780 denulled_data.reverse_index_map,
1781 output_predictions.
ptr_,
1782 denulled_data.unmasked_num_rows,
1783 inline_null_value<T>());
// The value returned on success is the size of the input id column.
1785 return input_ids.
size();
1799 template <
typename T,
typename K>
1815 preferred_ml_framework_str,
1817 output_predictions);
1818 }
catch (std::runtime_error& e) {
1819 const std::string error_str(e.what());
1820 return mgr.ERROR_MESSAGE(error_str);
1835 template <
typename T,
typename K>
1848 model, input_cat_features.
numCols(), input_numeric_features.
numCols());
1850 input_cat_features, input_numeric_features, model->getCatFeatureKeys());
1854 cat_features_builder.getFeatures(),
1855 preferred_ml_framework_str,
1857 output_predictions);
1858 }
catch (std::runtime_error& e) {
1859 const std::string error_str(e.what());
1860 return mgr.ERROR_MESSAGE(error_str);
1875 template <
typename T,
typename K>
1888 model->getCatFeatureKeys());
1892 cat_features_builder.getFeatures(),
1893 preferred_ml_framework_str,
1895 output_predictions);
1896 }
catch (std::runtime_error& e) {
1897 const std::string error_str(e.what());
1898 return mgr.ERROR_MESSAGE(error_str);
1913 template <
typename T,
typename K>
1922 if (model_name.
size() != 1) {
1923 return mgr.ERROR_MESSAGE(
"Expected only one row in model CURSOR.");
1927 model_name_text_enc_none,
1930 preferred_ml_framework_str,
1932 output_predictions);
1946 template <
typename T,
typename K>
1956 if (model_name.
size() != 1) {
1957 return mgr.ERROR_MESSAGE(
"Expected only one row in model CURSOR.");
1961 model_name_text_enc_none,
1964 input_numeric_features,
1965 preferred_ml_framework_str,
1967 output_predictions);
1981 template <
typename T,
typename K>
1990 if (model_name.
size() != 1) {
1991 return mgr.ERROR_MESSAGE(
"Expected only one row in model CURSOR.");
1995 model_name_text_enc_none,
1998 preferred_ml_framework_str,
2000 output_predictions);
// Fragment of the R^2 score implementation. Visible flow: reject empty input,
// allocate prediction and id buffers, run prediction (call mostly on dropped
// lines), then accumulate two sums in parallel and combine them into
// output_r2[0] as 1 - (sum of squared prediction errors / sum of squared
// deviations from labels_mean).
// NOTE(review): this listing drops many lines (the embedded numbering jumps,
// e.g. 2017 -> 2028, 2038 -> 2045), so the prediction call, the computation of
// labels_mean and max_inputs_per_thread, and the return are not visible.
2003 template <
typename T>
2005 const std::shared_ptr<AbstractMLModel>& model,
2009 const int64_t num_rows = input_labels.
size();
// Guard: evaluation needs at least one row.
2010 if (num_rows == 0) {
2011 return mgr.ERROR_MESSAGE(
2012 "No rows exist in evaluation data. Evaluation data must at least contain 1 row.");
// Scratch buffers: predictions (wrapped in a Column) and id vectors, all of
// num_rows entries.
2014 std::vector<T> output_predictions_vec(num_rows);
2015 Column<T> output_predictions(output_predictions_vec);
2016 std::vector<int64_t> input_ids_vec(num_rows);
2017 std::vector<int64_t> output_ids_vec(num_rows);
2028 ml_framework_encoding_none,
2030 output_predictions);
2036 }
// std::runtime_error raised inside the try block is converted into a
// table-function error message.
catch (std::runtime_error& e) {
2038 return mgr.ERROR_MESSAGE(e.what());
// NOTE(review): std::thread::hardware_concurrency() is allowed to return 0 when
// the value is not computable; that would make num_threads 0 and the
// accumulator vectors empty. Confirm this cannot happen on supported platforms
// or is guarded on a line not shown.
2045 const size_t max_thread_count = std::thread::hardware_concurrency();
// Thread count is capped by the hardware count and by the number of
// max_inputs_per_thread-sized chunks needed to cover num_rows (rounded up).
2047 const size_t num_threads = std::min(
2048 max_thread_count, ((num_rows + max_inputs_per_thread - 1) / max_inputs_per_thread));
// One accumulator slot per arena thread, so tasks add to their own slot.
2050 std::vector<double> local_sum_squared_regressions(num_threads, 0.0);
2051 std::vector<double> local_sum_squares(num_threads, 0.0);
2053 tbb::task_arena limited_arena(num_threads);
// The parallel loop call itself is on a dropped line (presumably
// tbb::parallel_for, which the file includes -- confirm).
2055 limited_arena.execute([&] {
2057 tbb::blocked_range<int64_t>(0, num_rows),
2058 [&](
const tbb::blocked_range<int64_t>& r) {
2059 const int64_t start_idx = r.begin();
2060 const int64_t end_idx = r.end();
2061 double local_sum_squared_regression{0.0};
2062 double local_sum_square{0.0};
2063 for (int64_t row_idx = start_idx; row_idx < end_idx; ++row_idx) {
// Rows whose prediction is the null sentinel are left out of both sums.
2064 if (output_predictions[row_idx] != inline_null_value<T>()) {
// Squared difference between label and prediction.
2065 local_sum_squared_regression +=
2066 (input_labels[row_idx] - output_predictions[row_idx]) *
2067 (input_labels[row_idx] - output_predictions[row_idx]);
// Squared difference between label and labels_mean.
2068 local_sum_square += (input_labels[row_idx] - labels_mean) *
2069 (input_labels[row_idx] - labels_mean);
// Fold this range's partial sums into the slot of the executing arena thread.
2072 const size_t thread_idx = tbb::this_task_arena::current_thread_index();
2073 local_sum_squared_regressions[thread_idx] += local_sum_squared_regression;
2074 local_sum_squares[thread_idx] += local_sum_square;
// Serial reduction of the per-thread partial sums.
2077 double sum_squared_regression{0.0};
2078 double sum_squares{0.0};
2079 for (
size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
2080 sum_squared_regression += local_sum_squared_regressions[thread_idx];
2081 sum_squares += local_sum_squares[thread_idx];
// A zero denominator is reported as a score of 1.0 instead of dividing by zero.
2083 output_r2[0] = sum_squares == 0.0 ? 1.0 : 1.0 - (sum_squared_regression / sum_squares);
2096 template <
typename T>
2105 return r2_score_impl(mgr, model, input_labels, input_features, output_r2);
2106 }
catch (std::runtime_error& e) {
2107 const std::string error_str(e.what());
2108 return mgr.ERROR_MESSAGE(error_str);
2121 template <
typename T>
2128 if (model_name.
size() != 1) {
2129 return mgr.ERROR_MESSAGE(
"Expected only one row in model name CURSOR.");
2133 mgr, model_name_text_enc_none, input_labels, input_features, output_r2);
2144 template <
typename T>
2155 model, input_cat_features.
numCols(), input_numeric_features.
numCols());
2157 input_cat_features, input_numeric_features, model->getCatFeatureKeys());
2159 mgr, model, input_labels, cat_features_builder.getFeatures(), output_r2);
2160 }
catch (std::runtime_error& e) {
2161 const std::string error_str(e.what());
2162 return mgr.ERROR_MESSAGE(error_str);
2174 template <
typename T>
2185 model->getCatFeatureKeys());
2187 mgr, model, input_labels, cat_features_builder.getFeatures(), output_r2);
2188 }
catch (std::runtime_error& e) {
2189 const std::string error_str(e.what());
2190 return mgr.ERROR_MESSAGE(error_str);
2202 template <
typename T>
2210 if (model_name.
size() != 1) {
2211 return mgr.ERROR_MESSAGE(
"Expected only one row in model name CURSOR.");
2213 const std::string model_name_str{model_name.
getString(0)};
2217 model, input_cat_features.
numCols(), input_numeric_features.
numCols());
2219 input_cat_features, input_numeric_features, model->getCatFeatureKeys());
2221 mgr, model, input_labels, cat_features_builder.getFeatures(), output_r2);
2222 }
catch (std::runtime_error& e) {
2223 const std::string error_str(e.what());
2224 return mgr.ERROR_MESSAGE(error_str);
2314 #endif // #ifndef __CUDACC__
DEVICE const std::string getString(int64_t index) const
ColumnList< T > getFeatures()
NEVER_INLINE HOST int32_t pca_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
void set_output_row_size(int64_t num_rows)
CategoricalFeaturesBuilder(const ColumnList< TextEncodingDict > &cat_features, const ColumnList< T > &numeric_features, const int32_t cat_top_k, const float cat_min_fraction, const bool cat_include_others)
KMeansInitStrategy get_kmeans_init_type(const std::string &init_type_str)
CategoricalFeaturesBuilder(const ColumnList< TextEncodingDict > &cat_features, const int32_t cat_top_k, const float cat_min_fraction, const bool cat_include_others)
EXTENSION_NOINLINE_HOST int32_t get_decision_trees__cpu_2(TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, Column< int64_t > &tree_id, Column< int64_t > &entry_id, Column< bool > &is_split_node, Column< int64_t > &feature_id, Column< int64_t > &left_child, Column< int64_t > &right_child, Column< double > &value)
NEVER_INLINE HOST int32_t r2_score_impl(TableFunctionManager &mgr, const std::shared_ptr< AbstractMLModel > &model, const Column< T > &input_labels, const ColumnList< T > &input_features, Column< double > &output_r2)
std::string getString() const
NEVER_INLINE HOST int32_t decision_tree_reg_fit__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const int64_t max_tree_depth, const int64_t min_observations_per_leaf_node, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
DEVICE int64_t size() const
NEVER_INLINE HOST int32_t decision_tree_reg_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t max_tree_depth, const int64_t min_observations_per_leaf_node, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< K > &input_ids, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)
DEVICE int64_t numCols() const
EXTENSION_NOINLINE_HOST int32_t pca_fit__cpu_1(TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< TextEncodingDict > &input_cat_features, const int32_t cat_top_k, const float cat_min_fraction, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
EXTENSION_NOINLINE_HOST int32_t supported_ml_frameworks__cpu_(TableFunctionManager &mgr, Column< TextEncodingDict > &output_ml_frameworks, Column< bool > &output_availability, Column< bool > &output_default)
MaskedData< T > denull_data(const ColumnList< T > &features)
NEVER_INLINE HOST int32_t linear_reg_fit__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
NEVER_INLINE HOST int32_t kmeans__cpu_template(TableFunctionManager &mgr, const Column< K > &input_ids, const ColumnList< T > &input_features, const int num_clusters, const int num_iterations, const TextEncodingNone &init_type_str, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< int32_t > &output_clusters)
EXTENSION_NOINLINE_HOST void check_model_params(const std::shared_ptr< AbstractMLModel > &model, const int64_t num_cat_features, const int64_t num_numeric_features)
NEVER_INLINE HOST int32_t pca_fit__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
NEVER_INLINE HOST int32_t random_forest_reg_fit__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const int64_t num_trees, const double obs_per_tree_fraction, const int64_t max_tree_depth, const int64_t features_per_node, const double impurity_threshold, const bool bootstrap, const int64_t min_obs_per_leaf_node, const int64_t min_obs_per_split_node, const double min_weight_fraction_in_leaf_node, const double min_impurity_decrease_in_split_node, const int64_t max_leaf_nodes, const bool use_histogram, const TextEncodingNone &var_importance_metric_str, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
MLFramework get_ml_framework(const std::string &ml_framework_str)
EXTENSION_NOINLINE_HOST int32_t random_forest_reg_var_importance__cpu_2(TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, Column< int64_t > &feature_id, Column< TextEncodingDict > &feature, Column< int64_t > &sub_feature_id, Column< TextEncodingDict > &sub_feature, Column< double > &importance_score)
std::vector< const T * > pluck_ptrs(const std::vector< std::vector< T >> &data, const int64_t start_idx, const int64_t end_idx)
const size_t max_inputs_per_thread
NEVER_INLINE HOST int32_t random_forest_reg_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t num_trees, const double obs_per_tree_fraction, const int64_t max_tree_depth, const int64_t features_per_node, const double impurity_threshold, const bool bootstrap, const int64_t min_obs_per_leaf_node, const int64_t min_obs_per_split_node, const double min_weight_fraction_in_leaf_node, const double min_impurity_decrease_in_split_node, const int64_t max_leaf_nodes, const bool use_histogram, const TextEncodingNone &var_importance_metric_str, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
EXTENSION_NOINLINE_HOST int32_t linear_reg_coefs__cpu_1(TableFunctionManager &mgr, const TextEncodingNone &model_name, Column< int64_t > &output_coef_idx, Column< TextEncodingDict > &output_feature, Column< int64_t > &output_sub_coef_idx, Column< TextEncodingDict > &output_sub_feature, Column< double > &output_coef)
CategoricalFeaturesBuilder(const ColumnList< TextEncodingDict > &cat_features, const ColumnList< T > &numeric_features, const std::vector< std::vector< std::string >> &cat_feature_keys)
EXTENSION_NOINLINE_HOST int32_t linear_reg_coefs__cpu_2(TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, Column< int64_t > &output_coef_idx, Column< TextEncodingDict > &output_feature, Column< int64_t > &output_sub_coef_idx, Column< TextEncodingDict > &output_sub_feature, Column< double > &output_coef)
void addModel(const std::string &model_name, std::shared_ptr< AbstractMLModel > model)
VarImportanceMetric get_var_importance_metric(const std::string &var_importance_metric_str)
NEVER_INLINE HOST int32_t gbt_reg_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t max_iterations, const int64_t max_tree_depth, const double shrinkage, const double min_split_loss, const double lambda, const double obs_per_tree_fraction, const int64_t features_per_node, const int64_t min_observations_per_leaf_node, const int64_t max_bins, const int64_t min_bin_size, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
#define EXTENSION_NOINLINE_HOST
std::shared_ptr< AbstractMLModel > getModel(const std::string &model_name) const
void disable_output_allocations()
EXTENSION_NOINLINE_HOST int32_t random_forest_reg_var_importance__cpu_1(TableFunctionManager &mgr, const TextEncodingNone &model_name, Column< int64_t > &feature_id, Column< TextEncodingDict > &feature, Column< int64_t > &sub_feature_id, Column< TextEncodingDict > &sub_feature, Column< double > &importance_score)
DEVICE int64_t numCols() const
NEVER_INLINE HOST int32_t gbt_reg_fit__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const int64_t max_iterations, const int64_t max_tree_depth, const double shrinkage, const double min_split_loss, const double lambda, const double obs_per_tree_fraction, const int64_t features_per_node, const int64_t min_observations_per_leaf_node, const int64_t max_bins, const int64_t min_bin_size, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
std::vector< int8_t * > col_ptrs_
std::vector< TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodedCol< T > > one_hot_encoded_cols_
void unmask_data(const T *masked_input, const std::vector< int32_t > &reverse_index_map, T *unmasked_output, const int64_t num_unmasked_rows, const T null_val)
NEVER_INLINE HOST int32_t linear_reg_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
NEVER_INLINE HOST int32_t dbscan__cpu_template(TableFunctionManager &mgr, const Column< K > &input_ids, const ColumnList< T > &input_features, const double epsilon, const int32_t min_observations, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< int32_t > &output_clusters)
CategoricalFeaturesBuilder(const ColumnList< TextEncodingDict > &cat_features, const std::vector< std::vector< std::string >> &cat_feature_keys)
void parallel_for(const blocked_range< Int > &range, const Body &body, const Partitioner &p=Partitioner())
std::vector< std::vector< T > > z_std_normalize_data(const std::vector< T * > &input_data, const int64_t num_rows)
NEVER_INLINE HOST int32_t ml_reg_predict_impl(TableFunctionManager &mgr, const std::shared_ptr< AbstractMLModel > &model, const Column< K > &input_ids, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)
std::vector< std::vector< std::string > > cat_feature_keys_
DEVICE int64_t size() const
DEVICE int64_t size() const
NEVER_INLINE HOST int32_t r2_score__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, Column< double > &output_r2)
DEVICE const TextEncodingDict getOrAddTransient(const std::string &str)
EXTENSION_NOINLINE_HOST int32_t get_decision_trees__cpu_1(TableFunctionManager &mgr, const TextEncodingNone &model_name, Column< int64_t > &tree_id, Column< int64_t > &entry_id, Column< bool > &is_split_node, Column< int64_t > &feature_id, Column< int64_t > &left_child, Column< int64_t > &right_child, Column< double > &value)
ZStdNormalizationSummaryStats< T > z_std_normalize_data_with_summary_stats(const std::vector< T * > &input_data, const int64_t num_rows)
void enable_output_allocations()
Column< T > create_wrapper_col(std::vector< T > &col_vec)
NEVER_INLINE HOST double get_column_mean(const T *data, const int64_t num_rows)
const std::vector< std::vector< std::string > > & getCatFeatureKeys() const