OmniSciDB  a5dc49c757
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
MLTableFunctions.hpp File Reference
+ Include dependency graph for MLTableFunctions.hpp:
+ This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Classes

struct  CategoricalFeaturesBuilder< T >
 

Functions

template<typename T >
std::vector< const T * > pluck_ptrs (const std::vector< std::vector< T >> &data, const int64_t start_idx, const int64_t end_idx)
 
template<typename T >
std::vector< const T * > pluck_ptrs (const std::vector< T * > &data, const int64_t start_idx, const int64_t end_idx)
 
EXTENSION_NOINLINE_HOST int32_t supported_ml_frameworks__cpu_ (TableFunctionManager &mgr, Column< TextEncodingDict > &output_ml_frameworks, Column< bool > &output_availability, Column< bool > &output_default)
 
EXTENSION_NOINLINE_HOST void check_model_params (const std::shared_ptr< AbstractMLModel > &model, const int64_t num_cat_features, const int64_t num_numeric_features)
 
template<typename K , typename T >
NEVER_INLINE HOST int32_t kmeans__cpu_template (TableFunctionManager &mgr, const Column< K > &input_ids, const ColumnList< T > &input_features, const int num_clusters, const int num_iterations, const TextEncodingNone &init_type_str, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< int32_t > &output_clusters)
 
template<typename K , typename T >
NEVER_INLINE HOST int32_t dbscan__cpu_template (TableFunctionManager &mgr, const Column< K > &input_ids, const ColumnList< T > &input_features, const double epsilon, const int32_t min_observations, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< int32_t > &output_clusters)
 
template<typename T >
NEVER_INLINE HOST int32_t linear_reg_fit_impl (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
template<typename T >
NEVER_INLINE HOST int32_t linear_reg_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
template<typename T >
NEVER_INLINE HOST int32_t linear_reg_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< TextEncodingDict > &input_cat_features, const ColumnList< T > &input_numeric_features, const int32_t cat_top_k, const float cat_min_fraction, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
template<typename T >
NEVER_INLINE HOST int32_t linear_reg_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< TextEncodingDict > &input_cat_features, const int32_t cat_top_k, const float cat_min_fraction, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
template<typename T >
Column< T > create_wrapper_col (std::vector< T > &col_vec)
 
EXTENSION_NOINLINE_HOST int32_t linear_reg_coefs__cpu_1 (TableFunctionManager &mgr, const TextEncodingNone &model_name, Column< int64_t > &output_coef_idx, Column< TextEncodingDict > &output_feature, Column< int64_t > &output_sub_coef_idx, Column< TextEncodingDict > &output_sub_feature, Column< double > &output_coef)
 
EXTENSION_NOINLINE_HOST int32_t linear_reg_coefs__cpu_2 (TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, Column< int64_t > &output_coef_idx, Column< TextEncodingDict > &output_feature, Column< int64_t > &output_sub_coef_idx, Column< TextEncodingDict > &output_sub_feature, Column< double > &output_coef)
 
template<typename T >
NEVER_INLINE HOST int32_t decision_tree_reg_impl (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t max_tree_depth, const int64_t min_observations_per_leaf_node, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
template<typename T >
NEVER_INLINE HOST int32_t decision_tree_reg_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const int64_t max_tree_depth, const int64_t min_observations_per_leaf_node, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
template<typename T >
NEVER_INLINE HOST int32_t decision_tree_reg_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< TextEncodingDict > &input_cat_features, const ColumnList< T > &input_numeric_features, const int64_t max_tree_depth, const int64_t min_observations_per_leaf_node, const int32_t cat_top_k, const float cat_min_fraction, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
template<typename T >
NEVER_INLINE HOST int32_t decision_tree_reg_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< TextEncodingDict > &input_cat_features, const int64_t max_tree_depth, const int64_t min_observations_per_leaf_node, const int32_t cat_top_k, const float cat_min_fraction, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
template<typename T >
NEVER_INLINE HOST int32_t gbt_reg_fit_impl (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t max_iterations, const int64_t max_tree_depth, const double shrinkage, const double min_split_loss, const double lambda, const double obs_per_tree_fraction, const int64_t features_per_node, const int64_t min_observations_per_leaf_node, const int64_t max_bins, const int64_t min_bin_size, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
template<typename T >
NEVER_INLINE HOST int32_t gbt_reg_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const int64_t max_iterations, const int64_t max_tree_depth, const double shrinkage, const double min_split_loss, const double lambda, const double obs_per_tree_fraction, const int64_t features_per_node, const int64_t min_observations_per_leaf_node, const int64_t max_bins, const int64_t min_bin_size, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
template<typename T >
NEVER_INLINE HOST int32_t gbt_reg_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< TextEncodingDict > &input_cat_features, const ColumnList< T > &input_numeric_features, const int64_t max_iterations, const int64_t max_tree_depth, const double shrinkage, const double min_split_loss, const double lambda, const double obs_per_tree_fraction, const int64_t features_per_node, const int64_t min_observations_per_leaf_node, const int64_t max_bins, const int64_t min_bin_size, const int32_t cat_top_k, const float cat_min_fraction, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
template<typename T >
NEVER_INLINE HOST int32_t gbt_reg_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< TextEncodingDict > &input_cat_features, const int64_t max_iterations, const int64_t max_tree_depth, const double shrinkage, const double min_split_loss, const double lambda, const double obs_per_tree_fraction, const int64_t features_per_node, const int64_t min_observations_per_leaf_node, const int64_t max_bins, const int64_t min_bin_size, const int32_t cat_top_k, const float cat_min_fraction, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
template<typename T >
NEVER_INLINE HOST int32_t random_forest_reg_fit_impl (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t num_trees, const double obs_per_tree_fraction, const int64_t max_tree_depth, const int64_t features_per_node, const double impurity_threshold, const bool bootstrap, const int64_t min_obs_per_leaf_node, const int64_t min_obs_per_split_node, const double min_weight_fraction_in_leaf_node, const double min_impurity_decrease_in_split_node, const int64_t max_leaf_nodes, const bool use_histogram, const TextEncodingNone &var_importance_metric_str, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
template<typename T >
NEVER_INLINE HOST int32_t random_forest_reg_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const int64_t num_trees, const double obs_per_tree_fraction, const int64_t max_tree_depth, const int64_t features_per_node, const double impurity_threshold, const bool bootstrap, const int64_t min_obs_per_leaf_node, const int64_t min_obs_per_split_node, const double min_weight_fraction_in_leaf_node, const double min_impurity_decrease_in_split_node, const int64_t max_leaf_nodes, const bool use_histogram, const TextEncodingNone &var_importance_metric_str, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
template<typename T >
NEVER_INLINE HOST int32_t random_forest_reg_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< TextEncodingDict > &input_cat_features, const ColumnList< T > &input_numeric_features, const int64_t num_trees, const double obs_per_tree_fraction, const int64_t max_tree_depth, const int64_t features_per_node, const double impurity_threshold, const bool bootstrap, const int64_t min_obs_per_leaf_node, const int64_t min_obs_per_split_node, const double min_weight_fraction_in_leaf_node, const double min_impurity_decrease_in_split_node, const int64_t max_leaf_nodes, const bool use_histogram, const TextEncodingNone &var_importance_metric_str, const int32_t cat_top_k, const float cat_min_fraction, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
template<typename T >
NEVER_INLINE HOST int32_t random_forest_reg_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< TextEncodingDict > &input_cat_features, const int64_t num_trees, const double obs_per_tree_fraction, const int64_t max_tree_depth, const int64_t features_per_node, const double impurity_threshold, const bool bootstrap, const int64_t min_obs_per_leaf_node, const int64_t min_obs_per_split_node, const double min_weight_fraction_in_leaf_node, const double min_impurity_decrease_in_split_node, const int64_t max_leaf_nodes, const bool use_histogram, const TextEncodingNone &var_importance_metric_str, const int32_t cat_top_k, const float cat_min_fraction, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
template<typename T >
NEVER_INLINE HOST int32_t pca_fit_impl (TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
template<typename T >
NEVER_INLINE HOST int32_t pca_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
template<typename T >
NEVER_INLINE HOST int32_t pca_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< TextEncodingDict > &input_cat_features, const ColumnList< T > &input_numeric_features, const int32_t cat_top_k, const float cat_min_fraction, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
EXTENSION_NOINLINE_HOST int32_t pca_fit__cpu_1 (TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< TextEncodingDict > &input_cat_features, const int32_t cat_top_k, const float cat_min_fraction, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
template<typename T , typename K >
NEVER_INLINE HOST int32_t ml_reg_predict_impl (TableFunctionManager &mgr, const std::shared_ptr< AbstractMLModel > &model, const Column< K > &input_ids, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)
 
template<typename T , typename K >
NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< K > &input_ids, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)
 
template<typename T , typename K >
NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< K > &input_ids, const ColumnList< TextEncodingDict > &input_cat_features, const ColumnList< T > &input_numeric_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)
 
template<typename T , typename K >
NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< K > &input_ids, const ColumnList< TextEncodingDict > &input_cat_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)
 
template<typename T , typename K >
NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template (TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, const Column< K > &input_ids, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)
 
template<typename T , typename K >
NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template (TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, const Column< K > &input_ids, const ColumnList< TextEncodingDict > &input_cat_features, const ColumnList< T > &input_numeric_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)
 
template<typename T , typename K >
NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template (TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, const Column< K > &input_ids, const ColumnList< TextEncodingDict > &input_cat_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)
 
template<typename T >
NEVER_INLINE HOST int32_t r2_score_impl (TableFunctionManager &mgr, const std::shared_ptr< AbstractMLModel > &model, const Column< T > &input_labels, const ColumnList< T > &input_features, Column< double > &output_r2)
 
template<typename T >
NEVER_INLINE HOST int32_t r2_score__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, Column< double > &output_r2)
 
template<typename T >
NEVER_INLINE HOST int32_t r2_score__cpu_template (TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, Column< double > &output_r2)
 
template<typename T >
NEVER_INLINE HOST int32_t r2_score__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< TextEncodingDict > &input_cat_features, const ColumnList< T > &input_numeric_features, Column< double > &output_r2)
 
template<typename T >
NEVER_INLINE HOST int32_t r2_score__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< TextEncodingDict > &input_cat_features, Column< double > &output_r2)
 
template<typename T >
NEVER_INLINE HOST int32_t r2_score__cpu_template (TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, const Column< T > &input_labels, const ColumnList< TextEncodingDict > &input_cat_features, const ColumnList< T > &input_numeric_features, Column< double > &output_r2)
 
EXTENSION_NOINLINE_HOST int32_t random_forest_reg_var_importance__cpu_1 (TableFunctionManager &mgr, const TextEncodingNone &model_name, Column< int64_t > &feature_id, Column< TextEncodingDict > &feature, Column< int64_t > &sub_feature_id, Column< TextEncodingDict > &sub_feature, Column< double > &importance_score)
 
EXTENSION_NOINLINE_HOST int32_t random_forest_reg_var_importance__cpu_2 (TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, Column< int64_t > &feature_id, Column< TextEncodingDict > &feature, Column< int64_t > &sub_feature_id, Column< TextEncodingDict > &sub_feature, Column< double > &importance_score)
 
EXTENSION_NOINLINE_HOST int32_t get_decision_trees__cpu_1 (TableFunctionManager &mgr, const TextEncodingNone &model_name, Column< int64_t > &tree_id, Column< int64_t > &entry_id, Column< bool > &is_split_node, Column< int64_t > &feature_id, Column< int64_t > &left_child, Column< int64_t > &right_child, Column< double > &value)
 
EXTENSION_NOINLINE_HOST int32_t get_decision_trees__cpu_2 (TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, Column< int64_t > &tree_id, Column< int64_t > &entry_id, Column< bool > &is_split_node, Column< int64_t > &feature_id, Column< int64_t > &left_child, Column< int64_t > &right_child, Column< double > &value)
 

Function Documentation

EXTENSION_NOINLINE_HOST void check_model_params ( const std::shared_ptr< AbstractMLModel > &  model,
const int64_t  num_cat_features,
const int64_t  num_numeric_features 
)

Definition at line 363 of file MLTableFunctions.cpp.

Referenced by ml_reg_predict__cpu_template(), and r2_score__cpu_template().

365  {
366  if (model->getNumLogicalFeatures() != num_cat_features + num_numeric_features) {
367  std::ostringstream error_oss;
368  error_oss << "Model expects " << model->getNumLogicalFeatures() << " features but "
369  << num_cat_features + num_numeric_features << " were provided.";
370  throw std::runtime_error(error_oss.str());
371  }
372  if (model->getNumCatFeatures() != num_cat_features) {
373  std::ostringstream error_oss;
374  error_oss << "Model expects " << model->getNumCatFeatures()
375  << " categorical features but " << num_cat_features << " were provided.";
376  throw std::runtime_error(error_oss.str());
377  }
378 }

+ Here is the caller graph for this function:

template<typename T >
Column<T> create_wrapper_col ( std::vector< T > &  col_vec)

Definition at line 594 of file MLTableFunctions.hpp.

594  {
595  Column<T> wrapper_col(col_vec.data(), static_cast<int64_t>(col_vec.size()));
596  return wrapper_col;
597 }
template<typename K , typename T >
NEVER_INLINE HOST int32_t dbscan__cpu_template ( TableFunctionManager &  mgr,
const Column< K > &  input_ids,
const ColumnList< T > &  input_features,
const double  epsilon,
const int32_t  min_observations,
const TextEncodingNone &  preferred_ml_framework_str,
Column< K > &  output_ids,
Column< int32_t > &  output_clusters 
)

Definition at line 204 of file MLTableFunctions.hpp.

References DEFAULT, TableFunctions_Namespace::denull_data(), get_ml_framework(), TextEncodingNone::getString(), INVALID, MLPACK, ONEAPI, ONEDAL, pluck_ptrs(), Column< T >::ptr_, TableFunctionManager::set_output_row_size(), Column< T >::size(), TableFunctions_Namespace::unmask_data(), and z_std_normalize_data().

211  {
212  mgr.set_output_row_size(input_ids.size());
213  output_ids = input_ids;
214 
215  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
216  if (preferred_ml_framework == MLFramework::INVALID) {
217  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
218  preferred_ml_framework_str.getString());
219  }
220 
221  try {
222  const auto denulled_data = denull_data(input_features);
223  const int64_t num_rows = denulled_data.masked_num_rows;
224  const bool data_is_masked =
225  denulled_data.masked_num_rows < denulled_data.unmasked_num_rows;
226  std::vector<int32_t> denulled_output_allocation(data_is_masked ? num_rows : 0);
227  int32_t* denulled_output =
228  data_is_masked ? denulled_output_allocation.data() : output_clusters.ptr_;
229 
230  // z_std_normalize_data can throw if std dev is 0
231  const auto normalized_data = z_std_normalize_data(denulled_data.data, num_rows);
232  const auto normalized_ptrs = pluck_ptrs(normalized_data, 0L, normalized_data.size());
233 
234  bool did_execute = false;
235 #ifdef HAVE_ONEDAL
236  if (!did_execute && (preferred_ml_framework == MLFramework::ONEAPI ||
237  preferred_ml_framework == MLFramework::DEFAULT)) {
238  onedal_oneapi_dbscan_impl(
239  normalized_ptrs, denulled_output, num_rows, epsilon, min_observations);
240  did_execute = true;
241  } else if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL)) {
242  onedal_dbscan_impl(
243  normalized_ptrs, denulled_output, num_rows, epsilon, min_observations);
244  did_execute = true;
245  }
246 #endif
247 #ifdef HAVE_MLPACK
248  if (!did_execute && (preferred_ml_framework == MLFramework::MLPACK ||
249  preferred_ml_framework == MLFramework::DEFAULT)) {
250  mlpack_dbscan_impl(
251  normalized_ptrs, denulled_output, num_rows, epsilon, min_observations);
252  did_execute = true;
253  }
254 #endif
255  if (!did_execute) {
256  return mgr.ERROR_MESSAGE("Cannot find " + preferred_ml_framework_str.getString() +
257  " ML library to support dbscan implementation.");
258  }
259 
260  if (data_is_masked) {
261  unmask_data(denulled_output,
262  denulled_data.reverse_index_map,
263  output_clusters.ptr_,
264  denulled_data.unmasked_num_rows,
265  inline_null_value<int32_t>());
266  }
267  } catch (std::runtime_error& e) {
268  return mgr.ERROR_MESSAGE(e.what());
269  }
270  return input_ids.size();
271 }
void set_output_row_size(int64_t num_rows)
Definition: heavydbTypes.h:373
std::string getString() const
Definition: heavydbTypes.h:641
DEVICE int64_t size() const
MaskedData< T > denull_data(const ColumnList< T > &features)
MLFramework get_ml_framework(const std::string &ml_framework_str)
std::vector< const T * > pluck_ptrs(const std::vector< std::vector< T >> &data, const int64_t start_idx, const int64_t end_idx)
void unmask_data(const T *masked_input, const std::vector< int32_t > &reverse_index_map, T *unmasked_output, const int64_t num_unmasked_rows, const T null_val)
std::vector< std::vector< T > > z_std_normalize_data(const std::vector< T * > &input_data, const int64_t num_rows)

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t decision_tree_reg_fit__cpu_template ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const Column< T > &  input_labels,
const ColumnList< T > &  input_features,
const int64_t  max_tree_depth,
const int64_t  min_observations_per_leaf_node,
const TextEncodingNone &  preferred_ml_framework_str,
const TextEncodingNone &  model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 717 of file MLTableFunctions.hpp.

References decision_tree_reg_impl().

725  {
726  std::vector<std::vector<std::string>> empty_cat_feature_keys;
727  return decision_tree_reg_impl(mgr,
728  model_name,
729  input_labels,
730  input_features,
731  empty_cat_feature_keys,
732  max_tree_depth,
733  min_observations_per_leaf_node,
734  preferred_ml_framework_str,
735  model_metadata,
736  output_model_name);
737 }
NEVER_INLINE HOST int32_t decision_tree_reg_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t max_tree_depth, const int64_t min_observations_per_leaf_node, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t decision_tree_reg_fit__cpu_template ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const Column< T > &  input_labels,
const ColumnList< TextEncodingDict > &  input_cat_features,
const ColumnList< T > &  input_numeric_features,
const int64_t  max_tree_depth,
const int64_t  min_observations_per_leaf_node,
const int32_t  cat_top_k,
const float  cat_min_fraction,
const TextEncodingNone &  preferred_ml_framework_str,
const TextEncodingNone &  model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 755 of file MLTableFunctions.hpp.

References decision_tree_reg_impl(), CategoricalFeaturesBuilder< T >::getCatFeatureKeys(), and CategoricalFeaturesBuilder< T >::getFeatures().

767  {
768  std::vector<std::vector<std::string>> empty_cat_feature_keys;
769  CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
770  input_numeric_features,
771  cat_top_k,
772  cat_min_fraction,
773  false /* cat_include_others */);
774  return decision_tree_reg_impl(mgr,
775  model_name,
776  input_labels,
777  cat_features_builder.getFeatures(),
778  cat_features_builder.getCatFeatureKeys(),
779  max_tree_depth,
780  min_observations_per_leaf_node,
781  preferred_ml_framework_str,
782  model_metadata,
783  output_model_name);
784 }
NEVER_INLINE HOST int32_t decision_tree_reg_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t max_tree_depth, const int64_t min_observations_per_leaf_node, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t decision_tree_reg_fit__cpu_template ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const Column< T > &  input_labels,
const ColumnList< TextEncodingDict > &  input_cat_features,
const int64_t  max_tree_depth,
const int64_t  min_observations_per_leaf_node,
const int32_t  cat_top_k,
const float  cat_min_fraction,
const TextEncodingNone &  preferred_ml_framework_str,
const TextEncodingNone &  model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 802 of file MLTableFunctions.hpp.

References decision_tree_reg_impl(), CategoricalFeaturesBuilder< T >::getCatFeatureKeys(), and CategoricalFeaturesBuilder< T >::getFeatures().

813  {
814  std::vector<std::vector<std::string>> empty_cat_feature_keys;
815  CategoricalFeaturesBuilder<T> cat_features_builder(
816  input_cat_features, cat_top_k, cat_min_fraction, false /* cat_include_others */);
817  return decision_tree_reg_impl(mgr,
818  model_name,
819  input_labels,
820  cat_features_builder.getFeatures(),
821  cat_features_builder.getCatFeatureKeys(),
822  max_tree_depth,
823  min_observations_per_leaf_node,
824  preferred_ml_framework_str,
825  model_metadata,
826  output_model_name);
827 }
NEVER_INLINE HOST int32_t decision_tree_reg_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t max_tree_depth, const int64_t min_observations_per_leaf_node, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t decision_tree_reg_impl ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const Column< T > &  input_labels,
const ColumnList< T > &  input_features,
const std::vector< std::vector< std::string >> &  cat_feature_keys,
const int64_t  max_tree_depth,
const int64_t  min_observations_per_leaf_node,
const TextEncodingNone &  preferred_ml_framework_str,
const TextEncodingNone &  model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 639 of file MLTableFunctions.hpp.

References DEFAULT, TableFunctions_Namespace::denull_data(), get_ml_framework(), Column< TextEncodingDict >::getOrAddTransient(), TextEncodingNone::getString(), INVALID, MLPACK, ColumnList< T >::numCols(), ONEDAL, pluck_ptrs(), TableFunctionManager::set_output_row_size(), and Column< T >::size().

Referenced by decision_tree_reg_fit__cpu_template().

648  {
649  if (input_labels.size() == 0) {
650  return mgr.ERROR_MESSAGE(
651  "No rows exist in training data. Training data must at least contain 1 row.");
652  }
653  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
654  if (preferred_ml_framework == MLFramework::INVALID) {
655  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
656  preferred_ml_framework_str.getString());
657  }
658  if (preferred_ml_framework == MLFramework::MLPACK) {
659  return mgr.ERROR_MESSAGE(
660  "Only OneDAL framework supported for decision tree regression.");
661  }
662 #ifndef HAVE_ONEDAL
663  return mgr.ERROR_MESSAGE(
664  "Only OneDAL framework supported for decision tree regression.");
665 #endif
666 
667  const auto denulled_data = denull_data(input_labels, input_features);
668  const auto labels_ptrs = pluck_ptrs(denulled_data.data, 0L, 1L);
669  const auto features_ptrs =
670  pluck_ptrs(denulled_data.data, 1L, input_features.numCols() + 1);
671  mgr.set_output_row_size(1);
672  try {
673  bool did_execute = false;
674 #ifdef HAVE_ONEDAL
675  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
676  preferred_ml_framework == MLFramework::DEFAULT)) {
677  onedal_decision_tree_reg_fit_impl<T>(model_name,
678  labels_ptrs[0],
679  features_ptrs,
680  model_metadata,
681  cat_feature_keys,
682  denulled_data.masked_num_rows,
683  max_tree_depth,
684  min_observations_per_leaf_node);
685  const TextEncodingDict model_name_str_id =
686  output_model_name.getOrAddTransient(model_name);
687  output_model_name[0] = model_name_str_id;
688  did_execute = true;
689  }
690 #endif
691  if (!did_execute) {
692  return mgr.ERROR_MESSAGE(
693  "Cannot find " + preferred_ml_framework_str.getString() +
694  " ML library to support decision tree regression implementation.");
695  }
696  } catch (std::runtime_error& e) {
697  return mgr.ERROR_MESSAGE(e.what());
698  }
699  return 1;
700 }
void set_output_row_size(int64_t num_rows)
Definition: heavydbTypes.h:373
std::string getString() const
Definition: heavydbTypes.h:641
DEVICE int64_t size() const
DEVICE int64_t numCols() const
MaskedData< T > denull_data(const ColumnList< T > &features)
MLFramework get_ml_framework(const std::string &ml_framework_str)
std::vector< const T * > pluck_ptrs(const std::vector< std::vector< T >> &data, const int64_t start_idx, const int64_t end_idx)
DEVICE const TextEncodingDict getOrAddTransient(const std::string &str)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t gbt_reg_fit__cpu_template ( TableFunctionManager mgr,
const TextEncodingNone model_name,
const Column< T > &  input_labels,
const ColumnList< T > &  input_features,
const int64_t  max_iterations,
const int64_t  max_tree_depth,
const double  shrinkage,
const double  min_split_loss,
const double  lambda,
const double  obs_per_tree_fraction,
const int64_t  features_per_node,
const int64_t  min_observations_per_leaf_node,
const int64_t  max_bins,
const int64_t  min_bin_size,
const TextEncodingNone preferred_ml_framework_str,
const TextEncodingNone model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 930 of file MLTableFunctions.hpp.

References gbt_reg_fit_impl().

946  {
947  std::vector<std::vector<std::string>> empty_cat_feature_keys;
948  return gbt_reg_fit_impl(mgr,
949  model_name,
950  input_labels,
951  input_features,
952  empty_cat_feature_keys,
953  max_iterations,
954  max_tree_depth,
955  shrinkage,
956  min_split_loss,
957  lambda,
958  obs_per_tree_fraction,
959  features_per_node,
960  min_observations_per_leaf_node,
961  max_bins,
962  min_bin_size,
963  preferred_ml_framework_str,
964  model_metadata,
965  output_model_name);
966 }
NEVER_INLINE HOST int32_t gbt_reg_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t max_iterations, const int64_t max_tree_depth, const double shrinkage, const double min_split_loss, const double lambda, const double obs_per_tree_fraction, const int64_t features_per_node, const int64_t min_observations_per_leaf_node, const int64_t max_bins, const int64_t min_bin_size, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t gbt_reg_fit__cpu_template ( TableFunctionManager mgr,
const TextEncodingNone model_name,
const Column< T > &  input_labels,
const ColumnList< TextEncodingDict > &  input_cat_features,
const ColumnList< T > &  input_numeric_features,
const int64_t  max_iterations,
const int64_t  max_tree_depth,
const double  shrinkage,
const double  min_split_loss,
const double  lambda,
const double  obs_per_tree_fraction,
const int64_t  features_per_node,
const int64_t  min_observations_per_leaf_node,
const int64_t  max_bins,
const int64_t  min_bin_size,
const int32_t  cat_top_k,
const float  cat_min_fraction,
const TextEncodingNone preferred_ml_framework_str,
const TextEncodingNone model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 993 of file MLTableFunctions.hpp.

References gbt_reg_fit_impl(), CategoricalFeaturesBuilder< T >::getCatFeatureKeys(), and CategoricalFeaturesBuilder< T >::getFeatures().

1012  {
1013  CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
1014  input_numeric_features,
1015  cat_top_k,
1016  cat_min_fraction,
1017  false /* cat_include_others */);
1018  return gbt_reg_fit_impl(mgr,
1019  model_name,
1020  input_labels,
1021  cat_features_builder.getFeatures(),
1022  cat_features_builder.getCatFeatureKeys(),
1023  max_iterations,
1024  max_tree_depth,
1025  shrinkage,
1026  min_split_loss,
1027  lambda,
1028  obs_per_tree_fraction,
1029  features_per_node,
1030  min_observations_per_leaf_node,
1031  max_bins,
1032  min_bin_size,
1033  preferred_ml_framework_str,
1034  model_metadata,
1035  output_model_name);
1036 }
NEVER_INLINE HOST int32_t gbt_reg_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t max_iterations, const int64_t max_tree_depth, const double shrinkage, const double min_split_loss, const double lambda, const double obs_per_tree_fraction, const int64_t features_per_node, const int64_t min_observations_per_leaf_node, const int64_t max_bins, const int64_t min_bin_size, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t gbt_reg_fit__cpu_template ( TableFunctionManager mgr,
const TextEncodingNone model_name,
const Column< T > &  input_labels,
const ColumnList< TextEncodingDict > &  input_cat_features,
const int64_t  max_iterations,
const int64_t  max_tree_depth,
const double  shrinkage,
const double  min_split_loss,
const double  lambda,
const double  obs_per_tree_fraction,
const int64_t  features_per_node,
const int64_t  min_observations_per_leaf_node,
const int64_t  max_bins,
const int64_t  min_bin_size,
const int32_t  cat_top_k,
const float  cat_min_fraction,
const TextEncodingNone preferred_ml_framework_str,
const TextEncodingNone model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 1063 of file MLTableFunctions.hpp.

References gbt_reg_fit_impl(), CategoricalFeaturesBuilder< T >::getCatFeatureKeys(), and CategoricalFeaturesBuilder< T >::getFeatures().

1081  {
1082  CategoricalFeaturesBuilder<T> cat_features_builder(
1083  input_cat_features, cat_top_k, cat_min_fraction, false /* cat_include_others */);
1084  return gbt_reg_fit_impl(mgr,
1085  model_name,
1086  input_labels,
1087  cat_features_builder.getFeatures(),
1088  cat_features_builder.getCatFeatureKeys(),
1089  max_iterations,
1090  max_tree_depth,
1091  shrinkage,
1092  min_split_loss,
1093  lambda,
1094  obs_per_tree_fraction,
1095  features_per_node,
1096  min_observations_per_leaf_node,
1097  max_bins,
1098  min_bin_size,
1099  preferred_ml_framework_str,
1100  model_metadata,
1101  output_model_name);
1102 }
NEVER_INLINE HOST int32_t gbt_reg_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t max_iterations, const int64_t max_tree_depth, const double shrinkage, const double min_split_loss, const double lambda, const double obs_per_tree_fraction, const int64_t features_per_node, const int64_t min_observations_per_leaf_node, const int64_t max_bins, const int64_t min_bin_size, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t gbt_reg_fit_impl ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const Column< T > &  input_labels,
const ColumnList< T > &  input_features,
const std::vector< std::vector< std::string >> &  cat_feature_keys,
const int64_t  max_iterations,
const int64_t  max_tree_depth,
const double  shrinkage,
const double  min_split_loss,
const double  lambda,
const double  obs_per_tree_fraction,
const int64_t  features_per_node,
const int64_t  min_observations_per_leaf_node,
const int64_t  max_bins,
const int64_t  min_bin_size,
const TextEncodingNone &  preferred_ml_framework_str,
const TextEncodingNone &  model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 831 of file MLTableFunctions.hpp.

References DEFAULT, TableFunctions_Namespace::denull_data(), get_ml_framework(), Column< TextEncodingDict >::getOrAddTransient(), TextEncodingNone::getString(), INVALID, MLPACK, ColumnList< T >::numCols(), ONEDAL, pluck_ptrs(), TableFunctionManager::set_output_row_size(), and Column< T >::size().

Referenced by gbt_reg_fit__cpu_template().

848  {
849  if (input_labels.size() == 0) {
850  return mgr.ERROR_MESSAGE(
851  "No rows exist in training data. Training data must at least contain 1 row.");
852  }
853  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
854  if (preferred_ml_framework == MLFramework::INVALID) {
855  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
856  preferred_ml_framework_str.getString());
857  }
858  if (preferred_ml_framework == MLFramework::MLPACK) {
859  return mgr.ERROR_MESSAGE("Only OneDAL framework supported for GBT regression.");
860  }
861 #ifndef HAVE_ONEDAL
862  return mgr.ERROR_MESSAGE("Only OneDAL framework supported for GBT regression.");
863 #endif
864 
865  const auto denulled_data = denull_data(input_labels, input_features);
866  const auto labels_ptrs = pluck_ptrs(denulled_data.data, 0L, 1L);
867  const auto features_ptrs =
868  pluck_ptrs(denulled_data.data, 1L, input_features.numCols() + 1);
869  mgr.set_output_row_size(1);
870  try {
871  bool did_execute = false;
872 #ifdef HAVE_ONEDAL
873  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
874  preferred_ml_framework == MLFramework::DEFAULT)) {
875  onedal_gbt_reg_fit_impl<T>(model_name,
876  labels_ptrs[0],
877  features_ptrs,
878  model_metadata,
879  cat_feature_keys,
880  denulled_data.masked_num_rows,
881  max_iterations,
882  max_tree_depth,
883  shrinkage,
884  min_split_loss,
885  lambda,
886  obs_per_tree_fraction,
887  features_per_node,
888  min_observations_per_leaf_node,
889  max_bins,
890  min_bin_size);
891  const TextEncodingDict model_name_str_id =
892  output_model_name.getOrAddTransient(model_name);
893  output_model_name[0] = model_name_str_id;
894  did_execute = true;
895  }
896 #endif
897  if (!did_execute) {
898  return mgr.ERROR_MESSAGE("Cannot find " + preferred_ml_framework_str.getString() +
899  " ML library to support GBT regression implementation.");
900  }
901  } catch (std::runtime_error& e) {
902  return mgr.ERROR_MESSAGE(e.what());
903  }
904  return 1;
905 }
void set_output_row_size(int64_t num_rows)
Definition: heavydbTypes.h:373
std::string getString() const
Definition: heavydbTypes.h:641
DEVICE int64_t size() const
DEVICE int64_t numCols() const
MaskedData< T > denull_data(const ColumnList< T > &features)
MLFramework get_ml_framework(const std::string &ml_framework_str)
std::vector< const T * > pluck_ptrs(const std::vector< std::vector< T >> &data, const int64_t start_idx, const int64_t end_idx)
DEVICE const TextEncodingDict getOrAddTransient(const std::string &str)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

EXTENSION_NOINLINE_HOST int32_t get_decision_trees__cpu_1 ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
Column< int64_t > &  tree_id,
Column< int64_t > &  entry_id,
Column< bool > &  is_split_node,
Column< int64_t > &  feature_id,
Column< int64_t > &  left_child,
Column< int64_t > &  right_child,
Column< double > &  value 
)

Definition at line 276 of file MLTableFunctions.cpp.

References g_ml_models, MLModelMap::getModel(), and TableFunctionManager::set_output_row_size().

Referenced by get_decision_trees__cpu_2().

284  {
285 #ifdef HAVE_ONEDAL
286  try {
287  const auto model = g_ml_models.getModel(model_name);
288  const auto tree_model = std::dynamic_pointer_cast<AbstractTreeModel>(model);
289  if (!tree_model) {
290  throw std::runtime_error("Model not a tree-type model.");
291  }
292  const auto num_trees = tree_model->getNumTrees();
293  std::vector<std::vector<DecisionTreeEntry>> decision_trees(num_trees);
294  for (int64_t tree_idx = 0; tree_idx < num_trees; ++tree_idx) {
295  TreeModelVisitor tree_visitor(decision_trees[tree_idx]);
296  tree_model->traverseDF(tree_idx, tree_visitor);
297  }
298  std::vector<int64_t> decision_tree_offsets(num_trees + 1);
299  decision_tree_offsets[0] = 0;
300  for (int64_t tree_idx = 0; tree_idx < num_trees; ++tree_idx) {
301  decision_tree_offsets[tree_idx + 1] =
302  decision_tree_offsets[tree_idx] +
303  static_cast<int64_t>(decision_trees[tree_idx].size());
304  }
305  const auto num_entries = decision_tree_offsets[num_trees];
306  mgr.set_output_row_size(num_entries);
307  for (int64_t tree_idx = 0; tree_idx < num_trees; ++tree_idx) {
308  const auto& decision_tree = decision_trees[tree_idx];
309  const auto output_offset = decision_tree_offsets[tree_idx];
310  const int64_t num_tree_entries = decision_tree.size();
311  for (int64_t entry_idx = 0; entry_idx < num_tree_entries; ++entry_idx) {
312  const int64_t output_idx = output_offset + entry_idx;
313  const auto& tree_entry = decision_tree[entry_idx];
314  const bool entry_is_split_node = tree_entry.isSplitNode();
315  tree_id[output_idx] = tree_idx;
316  entry_id[output_idx] = entry_idx;
317  is_split_node[output_idx] = entry_is_split_node;
318  feature_id[output_idx] = !entry_is_split_node ? inline_null_value<int64_t>()
319  : tree_entry.feature_index;
320  left_child[output_idx] = !entry_is_split_node ? inline_null_value<int64_t>()
321  : tree_entry.left_child_row_idx;
322  right_child[output_idx] = !entry_is_split_node ? inline_null_value<int64_t>()
323  : tree_entry.right_child_row_idx;
324  value[output_idx] = tree_entry.value;
325  }
326  }
327  return num_entries;
328  } catch (std::runtime_error& e) {
329  const std::string error_str(e.what());
330  return mgr.ERROR_MESSAGE(error_str);
331  }
332 #else // Not HAVE_ONEDAL
333  return mgr.ERROR_MESSAGE("OneDAL library must be available for get_decision_trees.");
334 #endif
335 }
void set_output_row_size(int64_t num_rows)
Definition: heavydbTypes.h:373
std::shared_ptr< AbstractMLModel > getModel(const std::string &model_name) const
Definition: MLModel.h:51
MLModelMap g_ml_models
Definition: MLModel.h:125

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

EXTENSION_NOINLINE_HOST int32_t get_decision_trees__cpu_2 ( TableFunctionManager mgr,
const Column< TextEncodingDict > &  model_name,
Column< int64_t > &  tree_id,
Column< int64_t > &  entry_id,
Column< bool > &  is_split_node,
Column< int64_t > &  feature_id,
Column< int64_t > &  left_child,
Column< int64_t > &  right_child,
Column< double > &  value 
)

Definition at line 338 of file MLTableFunctions.cpp.

References get_decision_trees__cpu_1(), Column< TextEncodingDict >::getString(), and Column< TextEncodingDict >::size().

346  {
347  if (model_name.size() != 1) {
348  return mgr.ERROR_MESSAGE("Expected only one row in model name CURSOR.");
349  }
350  TextEncodingNone model_name_text_enc_none(mgr, model_name.getString(0));
351  return get_decision_trees__cpu_1(mgr,
352  model_name_text_enc_none,
353  tree_id,
354  entry_id,
355  is_split_node,
356  feature_id,
357  left_child,
358  right_child,
359  value);
360 }
DEVICE const std::string getString(int64_t index) const
DEVICE int64_t size() const
EXTENSION_NOINLINE_HOST int32_t get_decision_trees__cpu_1(TableFunctionManager &mgr, const TextEncodingNone &model_name, Column< int64_t > &tree_id, Column< int64_t > &entry_id, Column< bool > &is_split_node, Column< int64_t > &feature_id, Column< int64_t > &left_child, Column< int64_t > &right_child, Column< double > &value)

+ Here is the call graph for this function:

template<typename K , typename T >
NEVER_INLINE HOST int32_t kmeans__cpu_template ( TableFunctionManager &  mgr,
const Column< K > &  input_ids,
const ColumnList< T > &  input_features,
const int  num_clusters,
const int  num_iterations,
const TextEncodingNone &  init_type_str,
const TextEncodingNone &  preferred_ml_framework_str,
Column< K > &  output_ids,
Column< int32_t > &  output_clusters 
)

Definition at line 103 of file MLTableFunctions.hpp.

References DEFAULT, TableFunctions_Namespace::denull_data(), get_kmeans_init_type(), get_ml_framework(), TextEncodingNone::getString(), INVALID, MLPACK, ONEAPI, ONEDAL, pluck_ptrs(), Column< T >::ptr_, TableFunctionManager::set_output_row_size(), Column< T >::size(), TableFunctions_Namespace::unmask_data(), and z_std_normalize_data().

111  {
112  mgr.set_output_row_size(input_ids.size());
113  output_ids = input_ids;
114  const auto kmeans_init_strategy = get_kmeans_init_type(init_type_str);
115  if (kmeans_init_strategy == KMeansInitStrategy::INVALID) {
116  return mgr.ERROR_MESSAGE("Invalid KMeans initialization strategy: " +
117  init_type_str.getString());
118  }
119 
120  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
121  if (preferred_ml_framework == MLFramework::INVALID) {
122  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
123  preferred_ml_framework_str.getString());
124  }
125 
126  try {
127  const auto denulled_data = denull_data(input_features);
128  const int64_t num_rows = denulled_data.masked_num_rows;
129  const bool data_is_masked =
130  denulled_data.masked_num_rows < denulled_data.unmasked_num_rows;
131  std::vector<int32_t> denulled_output_allocation(data_is_masked ? num_rows : 0);
132  int32_t* denulled_output =
133  data_is_masked ? denulled_output_allocation.data() : output_clusters.ptr_;
134 
135  // z_std_normalize_data can throw if std dev is 0
136  const auto normalized_data = z_std_normalize_data(denulled_data.data, num_rows);
137  const auto normalized_ptrs = pluck_ptrs(normalized_data, 0L, normalized_data.size());
138 
139  bool did_execute = false;
140 #ifdef HAVE_ONEDAL
141  if (!did_execute && (preferred_ml_framework == MLFramework::ONEAPI ||
142  preferred_ml_framework == MLFramework::DEFAULT)) {
143  onedal_oneapi_kmeans_impl(normalized_ptrs,
144  denulled_output,
145  num_rows,
146  num_clusters,
147  num_iterations,
148  kmeans_init_strategy);
149  did_execute = true;
150  } else if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL)) {
151  onedal_kmeans_impl(normalized_ptrs,
152  denulled_output,
153  num_rows,
154  num_clusters,
155  num_iterations,
156  kmeans_init_strategy);
157  did_execute = true;
158  }
159 #endif
160 #ifdef HAVE_MLPACK
161  if (!did_execute && (preferred_ml_framework == MLFramework::MLPACK ||
162  preferred_ml_framework == MLFramework::DEFAULT)) {
163  mlpack_kmeans_impl(normalized_ptrs,
164  denulled_output,
165  num_rows,
166  num_clusters,
167  num_iterations,
168  kmeans_init_strategy);
169  did_execute = true;
170  }
171 #endif
172  if (!did_execute) {
173  return mgr.ERROR_MESSAGE("Cannot find " + preferred_ml_framework_str.getString() +
174  " ML library to support kmeans implementation.");
175  }
176 
177  if (data_is_masked) {
178  unmask_data(denulled_output,
179  denulled_data.reverse_index_map,
180  output_clusters.ptr_,
181  denulled_data.unmasked_num_rows,
182  inline_null_value<int32_t>());
183  }
184  } catch (std::runtime_error& e) {
185  return mgr.ERROR_MESSAGE(e.what());
186  }
187  return input_ids.size();
188 }
void set_output_row_size(int64_t num_rows)
Definition: heavydbTypes.h:373
KMeansInitStrategy get_kmeans_init_type(const std::string &init_type_str)
std::string getString() const
Definition: heavydbTypes.h:641
DEVICE int64_t size() const
MaskedData< T > denull_data(const ColumnList< T > &features)
MLFramework get_ml_framework(const std::string &ml_framework_str)
std::vector< const T * > pluck_ptrs(const std::vector< std::vector< T >> &data, const int64_t start_idx, const int64_t end_idx)
void unmask_data(const T *masked_input, const std::vector< int32_t > &reverse_index_map, T *unmasked_output, const int64_t num_unmasked_rows, const T null_val)
std::vector< std::vector< T > > z_std_normalize_data(const std::vector< T * > &input_data, const int64_t num_rows)

+ Here is the call graph for this function:

EXTENSION_NOINLINE_HOST int32_t linear_reg_coefs__cpu_1 ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
Column< int64_t > &  output_coef_idx,
Column< TextEncodingDict > &  output_feature,
Column< int64_t > &  output_sub_coef_idx,
Column< TextEncodingDict > &  output_sub_feature,
Column< double > &  output_coef 
)

Definition at line 88 of file MLTableFunctions.cpp.

References g_ml_models, get_model_features(), MLModelMap::getModel(), Column< TextEncodingDict >::getOrAddTransient(), and TableFunctionManager::set_output_row_size().

Referenced by linear_reg_coefs__cpu_2().

94  {
95  try {
96  const auto linear_reg_model = std::dynamic_pointer_cast<LinearRegressionModel>(
97  g_ml_models.getModel(model_name));
98  if (!linear_reg_model) {
99  throw std::runtime_error("Model is not of type linear regression.");
100  }
101 
102  const auto& coefs = linear_reg_model->getCoefs();
103  const auto& cat_feature_keys = linear_reg_model->getCatFeatureKeys();
104  const int64_t num_sub_coefs = static_cast<int64_t>(coefs.size());
105  const int64_t num_cat_features = static_cast<int64_t>(cat_feature_keys.size());
106  mgr.set_output_row_size(num_sub_coefs);
107 
108  std::vector<std::string> feature_names =
109  get_model_features(model_name, linear_reg_model);
110  feature_names.insert(feature_names.begin(), "intercept");
111 
112  for (int64_t sub_coef_idx = 0, coef_idx = 0; sub_coef_idx < num_sub_coefs;
113  ++coef_idx) {
114  if (num_cat_features >= coef_idx && coef_idx > 0) {
115  const auto& col_cat_feature_keys = cat_feature_keys[coef_idx - 1];
116  int64_t col_cat_feature_idx = 1;
117  for (const auto& col_cat_feature_key : col_cat_feature_keys) {
118  output_coef_idx[sub_coef_idx] = coef_idx;
119  if (feature_names[coef_idx].empty()) {
120  output_feature[sub_coef_idx] = inline_null_value<TextEncodingDict>();
121  } else {
122  output_feature[sub_coef_idx] =
123  output_feature.getOrAddTransient(feature_names[coef_idx]);
124  }
125  output_sub_coef_idx[sub_coef_idx] = col_cat_feature_idx++;
126  output_sub_feature[sub_coef_idx] =
127  output_sub_feature.getOrAddTransient(col_cat_feature_key);
128  output_coef[sub_coef_idx] = coefs[sub_coef_idx];
129  ++sub_coef_idx;
130  }
131  } else {
132  output_coef_idx[sub_coef_idx] = coef_idx;
133  if (feature_names[coef_idx].empty()) {
134  output_feature[sub_coef_idx] = inline_null_value<TextEncodingDict>();
135  } else {
136  output_feature[sub_coef_idx] =
137  output_feature.getOrAddTransient(feature_names[coef_idx]);
138  }
139  output_sub_coef_idx[sub_coef_idx] = 1;
140  output_sub_feature[sub_coef_idx] = inline_null_value<TextEncodingDict>();
141  output_coef[sub_coef_idx] = coefs[sub_coef_idx];
142  ++sub_coef_idx;
143  }
144  }
145 
146  return num_sub_coefs;
147  } catch (std::runtime_error& e) {
148  return mgr.ERROR_MESSAGE(e.what());
149  }
150 }
void set_output_row_size(int64_t num_rows)
Definition: heavydbTypes.h:373
std::shared_ptr< AbstractMLModel > getModel(const std::string &model_name) const
Definition: MLModel.h:51
MLModelMap g_ml_models
Definition: MLModel.h:125
std::vector< std::string > get_model_features(const std::string &model_name, const std::shared_ptr< AbstractMLModel > &model)
DEVICE const TextEncodingDict getOrAddTransient(const std::string &str)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

EXTENSION_NOINLINE_HOST int32_t linear_reg_coefs__cpu_2 ( TableFunctionManager mgr,
const Column< TextEncodingDict > &  model_name,
Column< int64_t > &  output_coef_idx,
Column< TextEncodingDict > &  output_feature,
Column< int64_t > &  output_sub_coef_idx,
Column< TextEncodingDict > &  output_sub_feature,
Column< double > &  output_coef 
)

Definition at line 153 of file MLTableFunctions.cpp.

References Column< TextEncodingDict >::getString(), linear_reg_coefs__cpu_1(), and Column< TextEncodingDict >::size().

159  {
160  if (model_name.size() != 1) {
161  return mgr.ERROR_MESSAGE("Expected only one row in model name CURSOR.");
162  }
163  TextEncodingNone model_name_text_enc_none(mgr, model_name.getString(0));
164  return linear_reg_coefs__cpu_1(mgr,
165  model_name_text_enc_none,
166  output_coef_idx,
167  output_feature,
168  output_sub_coef_idx,
169  output_sub_feature,
170  output_coef);
171 }
DEVICE const std::string getString(int64_t index) const
EXTENSION_NOINLINE_HOST int32_t linear_reg_coefs__cpu_1(TableFunctionManager &mgr, const TextEncodingNone &model_name, Column< int64_t > &output_coef_idx, Column< TextEncodingDict > &output_feature, Column< int64_t > &output_sub_coef_idx, Column< TextEncodingDict > &output_sub_feature, Column< double > &output_coef)
DEVICE int64_t size() const

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t linear_reg_fit__cpu_template ( TableFunctionManager mgr,
const TextEncodingNone model_name,
const Column< T > &  input_labels,
const ColumnList< T > &  input_features,
const TextEncodingNone preferred_ml_framework_str,
const TextEncodingNone model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 366 of file MLTableFunctions.hpp.

References linear_reg_fit_impl().

372  {
373  std::vector<std::vector<std::string>> empty_cat_feature_keys;
374  return linear_reg_fit_impl(mgr,
375  model_name,
376  input_labels,
377  input_features,
378  empty_cat_feature_keys,
379  preferred_ml_framework_str,
380  model_metadata,
381  output_model_name);
382 }
NEVER_INLINE HOST int32_t linear_reg_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t linear_reg_fit__cpu_template ( TableFunctionManager mgr,
const TextEncodingNone model_name,
const Column< T > &  input_labels,
const ColumnList< TextEncodingDict > &  input_cat_features,
const ColumnList< T > &  input_numeric_features,
const int32_t  cat_top_k,
const float  cat_min_fraction,
const TextEncodingNone preferred_ml_framework_str,
const TextEncodingNone model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 530 of file MLTableFunctions.hpp.

References CategoricalFeaturesBuilder< T >::getCatFeatureKeys(), CategoricalFeaturesBuilder< T >::getFeatures(), and linear_reg_fit_impl().

539  {
540  CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
541  input_numeric_features,
542  cat_top_k,
543  cat_min_fraction,
544  false /* cat_include_others */);
545 
546  return linear_reg_fit_impl(mgr,
547  model_name,
548  input_labels,
549  cat_features_builder.getFeatures(),
550  cat_features_builder.getCatFeatureKeys(),
551  preferred_ml_framework_str,
552  model_metadata,
553  output_model_name);
554 }
NEVER_INLINE HOST int32_t linear_reg_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t linear_reg_fit__cpu_template ( TableFunctionManager mgr,
const TextEncodingNone model_name,
const Column< T > &  input_labels,
const ColumnList< TextEncodingDict > &  input_cat_features,
const int32_t  cat_top_k,
const float  cat_min_fraction,
const TextEncodingNone preferred_ml_framework_str,
const TextEncodingNone model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 571 of file MLTableFunctions.hpp.

References CategoricalFeaturesBuilder< T >::getCatFeatureKeys(), CategoricalFeaturesBuilder< T >::getFeatures(), and linear_reg_fit_impl().

579  {
580  CategoricalFeaturesBuilder<T> cat_features_builder(
581  input_cat_features, cat_top_k, cat_min_fraction, false /* cat_include_others */);
582 
583  return linear_reg_fit_impl(mgr,
584  model_name,
585  input_labels,
586  cat_features_builder.getFeatures(),
587  cat_features_builder.getCatFeatureKeys(),
588  preferred_ml_framework_str,
589  model_metadata,
590  output_model_name);
591 }
NEVER_INLINE HOST int32_t linear_reg_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t linear_reg_fit_impl ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const Column< T > &  input_labels,
const ColumnList< T > &  input_features,
const std::vector< std::vector< std::string >> &  cat_feature_keys,
const TextEncodingNone &  preferred_ml_framework_str,
const TextEncodingNone &  model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 275 of file MLTableFunctions.hpp.

References MLModelMap::addModel(), DEFAULT, TableFunctions_Namespace::denull_data(), g_ml_models, get_ml_framework(), Column< TextEncodingDict >::getOrAddTransient(), TextEncodingNone::getString(), INVALID, MLPACK, ColumnList< T >::numCols(), ONEAPI, ONEDAL, pluck_ptrs(), TableFunctionManager::set_output_row_size(), and Column< T >::size().

Referenced by linear_reg_fit__cpu_template().

282  {
283  if (input_labels.size() == 0) {
284  return mgr.ERROR_MESSAGE(
285  "No rows exist in training data. Training data must at least contain 1 row.");
286  }
287  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
288  if (preferred_ml_framework == MLFramework::INVALID) {
289  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
290  preferred_ml_framework_str.getString());
291  }
292  const auto denulled_data = denull_data(input_labels, input_features);
293  const auto labels_ptrs = pluck_ptrs(denulled_data.data, 0L, 1L);
294  const auto features_ptrs =
295  pluck_ptrs(denulled_data.data, 1L, input_features.numCols() + 1);
296  const int64_t num_coefs = input_features.numCols() + 1;
297  mgr.set_output_row_size(num_coefs);
298  std::vector<int64_t> coef_idxs(num_coefs);
299  std::vector<double> coefs(num_coefs);
300  try {
301  bool did_execute = false;
302 #ifdef HAVE_ONEDAL
303  // FIXME: We default to legacy DAAL Linear Regression, as the oneAPI implementation
304  // seems to be experimental. It crashes on a few small toy models (such as datasets
305  // with 1 datapoint) and finds different coefficients for large models, when compared
306  // with the DAAL implementation. This should be revisited when oneDAL is updated.
307  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
308  preferred_ml_framework == MLFramework::DEFAULT)) {
309  onedal_linear_reg_fit_impl(labels_ptrs[0],
310  features_ptrs,
311  coef_idxs.data(),
312  coefs.data(),
313  denulled_data.masked_num_rows);
314  did_execute = true;
315  } else if (!did_execute && (preferred_ml_framework == MLFramework::ONEAPI)) {
316  onedal_oneapi_linear_reg_fit_impl(labels_ptrs[0],
317  features_ptrs,
318  coef_idxs.data(),
319  coefs.data(),
320  denulled_data.masked_num_rows);
321  did_execute = true;
322  }
323 #endif
324 #ifdef HAVE_MLPACK
325  if (!did_execute && (preferred_ml_framework == MLFramework::MLPACK ||
326  preferred_ml_framework == MLFramework::DEFAULT)) {
327  mlpack_linear_reg_fit_impl(labels_ptrs[0],
328  features_ptrs,
329  coef_idxs.data(),
330  coefs.data(),
331  denulled_data.masked_num_rows);
332  did_execute = true;
333  }
334 #endif
335  if (!did_execute) {
336  return mgr.ERROR_MESSAGE(
337  "Cannot find " + preferred_ml_framework_str.getString() +
338  " ML library to support linear regression implementation.");
339  }
340  } catch (std::runtime_error& e) {
341  return mgr.ERROR_MESSAGE(e.what());
342  }
343  auto model =
344  std::make_shared<LinearRegressionModel>(coefs, model_metadata, cat_feature_keys);
345  g_ml_models.addModel(model_name, model);
346  const std::string model_name_str = model_name.getString();
347  const TextEncodingDict model_name_str_id =
348  output_model_name.getOrAddTransient(model_name);
349  output_model_name[0] = model_name_str_id;
350  return 1;
351 }
void set_output_row_size(int64_t num_rows)
Definition: heavydbTypes.h:373
std::string getString() const
Definition: heavydbTypes.h:641
DEVICE int64_t size() const
DEVICE int64_t numCols() const
MaskedData< T > denull_data(const ColumnList< T > &features)
MLFramework get_ml_framework(const std::string &ml_framework_str)
std::vector< const T * > pluck_ptrs(const std::vector< std::vector< T >> &data, const int64_t start_idx, const int64_t end_idx)
void addModel(const std::string &model_name, std::shared_ptr< AbstractMLModel > model)
Definition: MLModel.h:38
MLModelMap g_ml_models
Definition: MLModel.h:125
DEVICE const TextEncodingDict getOrAddTransient(const std::string &str)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T , typename K >
NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const Column< K > &  input_ids,
const ColumnList< T > &  input_features,
const TextEncodingNone &  preferred_ml_framework_str,
Column< K > &  output_ids,
Column< T > &  output_predictions 
)

Definition at line 1801 of file MLTableFunctions.hpp.

References check_model_params(), g_ml_models, MLModelMap::getModel(), ml_reg_predict_impl(), and ColumnList< T >::numCols().

Referenced by ml_reg_predict__cpu_template().

1807  {
1808  try {
1809  const auto model = g_ml_models.getModel(model_name);
1810  check_model_params(model, 0, input_features.numCols());
1811  return ml_reg_predict_impl(mgr,
1812  model,
1813  input_ids,
1814  input_features,
1815  preferred_ml_framework_str,
1816  output_ids,
1817  output_predictions);
1818  } catch (std::runtime_error& e) {
1819  const std::string error_str(e.what());
1820  return mgr.ERROR_MESSAGE(error_str);
1821  }
1822 }
DEVICE int64_t numCols() const
EXTENSION_NOINLINE_HOST void check_model_params(const std::shared_ptr< AbstractMLModel > &model, const int64_t num_cat_features, const int64_t num_numeric_features)
std::shared_ptr< AbstractMLModel > getModel(const std::string &model_name) const
Definition: MLModel.h:51
MLModelMap g_ml_models
Definition: MLModel.h:125
NEVER_INLINE HOST int32_t ml_reg_predict_impl(TableFunctionManager &mgr, const std::shared_ptr< AbstractMLModel > &model, const Column< K > &input_ids, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T , typename K >
NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const Column< K > &  input_ids,
const ColumnList< TextEncodingDict > &  input_cat_features,
const ColumnList< T > &  input_numeric_features,
const TextEncodingNone &  preferred_ml_framework_str,
Column< K > &  output_ids,
Column< T > &  output_predictions 
)

Definition at line 1837 of file MLTableFunctions.hpp.

References check_model_params(), g_ml_models, MLModelMap::getModel(), ml_reg_predict_impl(), ColumnList< T >::numCols(), and ColumnList< TextEncodingDict >::numCols().

1844  {
1845  try {
1846  const auto model = g_ml_models.getModel(model_name);
1847  check_model_params(
1848  model, input_cat_features.numCols(), input_numeric_features.numCols());
1849  CategoricalFeaturesBuilder<T> cat_features_builder(
1850  input_cat_features, input_numeric_features, model->getCatFeatureKeys());
1851  return ml_reg_predict_impl(mgr,
1852  model,
1853  input_ids,
1854  cat_features_builder.getFeatures(),
1855  preferred_ml_framework_str,
1856  output_ids,
1857  output_predictions);
1858  } catch (std::runtime_error& e) {
1859  const std::string error_str(e.what());
1860  return mgr.ERROR_MESSAGE(error_str);
1861  }
1862 }
DEVICE int64_t numCols() const
EXTENSION_NOINLINE_HOST void check_model_params(const std::shared_ptr< AbstractMLModel > &model, const int64_t num_cat_features, const int64_t num_numeric_features)
std::shared_ptr< AbstractMLModel > getModel(const std::string &model_name) const
Definition: MLModel.h:51
DEVICE int64_t numCols() const
MLModelMap g_ml_models
Definition: MLModel.h:125
NEVER_INLINE HOST int32_t ml_reg_predict_impl(TableFunctionManager &mgr, const std::shared_ptr< AbstractMLModel > &model, const Column< K > &input_ids, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)

+ Here is the call graph for this function:

template<typename T , typename K >
NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const Column< K > &  input_ids,
const ColumnList< TextEncodingDict > &  input_cat_features,
const TextEncodingNone &  preferred_ml_framework_str,
Column< K > &  output_ids,
Column< T > &  output_predictions 
)

Definition at line 1877 of file MLTableFunctions.hpp.

References check_model_params(), g_ml_models, MLModelMap::getModel(), ml_reg_predict_impl(), and ColumnList< TextEncodingDict >::numCols().

1883  {
1884  try {
1885  const auto model = g_ml_models.getModel(model_name);
1886  check_model_params(model, input_cat_features.numCols(), 0);
1887  CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
1888  model->getCatFeatureKeys());
1889  return ml_reg_predict_impl(mgr,
1890  model,
1891  input_ids,
1892  cat_features_builder.getFeatures(),
1893  preferred_ml_framework_str,
1894  output_ids,
1895  output_predictions);
1896  } catch (std::runtime_error& e) {
1897  const std::string error_str(e.what());
1898  return mgr.ERROR_MESSAGE(error_str);
1899  }
1900 }
EXTENSION_NOINLINE_HOST void check_model_params(const std::shared_ptr< AbstractMLModel > &model, const int64_t num_cat_features, const int64_t num_numeric_features)
std::shared_ptr< AbstractMLModel > getModel(const std::string &model_name) const
Definition: MLModel.h:51
DEVICE int64_t numCols() const
MLModelMap g_ml_models
Definition: MLModel.h:125
NEVER_INLINE HOST int32_t ml_reg_predict_impl(TableFunctionManager &mgr, const std::shared_ptr< AbstractMLModel > &model, const Column< K > &input_ids, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)

+ Here is the call graph for this function:

template<typename T , typename K >
NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template ( TableFunctionManager &  mgr,
const Column< TextEncodingDict > &  model_name,
const Column< K > &  input_ids,
const ColumnList< T > &  input_features,
const TextEncodingNone &  preferred_ml_framework_str,
Column< K > &  output_ids,
Column< T > &  output_predictions 
)

Definition at line 1915 of file MLTableFunctions.hpp.

References Column< TextEncodingDict >::getString(), ml_reg_predict__cpu_template(), and Column< TextEncodingDict >::size().

1921  {
1922  if (model_name.size() != 1) {
1923  return mgr.ERROR_MESSAGE("Expected only one row in model CURSOR.");
1924  }
1925  TextEncodingNone model_name_text_enc_none(mgr, model_name.getString(0));
1926  return ml_reg_predict__cpu_template(mgr,
1927  model_name_text_enc_none,
1928  input_ids,
1929  input_features,
1930  preferred_ml_framework_str,
1931  output_ids,
1932  output_predictions);
1933 }
DEVICE const std::string getString(int64_t index) const
NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< K > &input_ids, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)
DEVICE int64_t size() const

+ Here is the call graph for this function:

template<typename T , typename K >
NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template ( TableFunctionManager &  mgr,
const Column< TextEncodingDict > &  model_name,
const Column< K > &  input_ids,
const ColumnList< TextEncodingDict > &  input_cat_features,
const ColumnList< T > &  input_numeric_features,
const TextEncodingNone &  preferred_ml_framework_str,
Column< K > &  output_ids,
Column< T > &  output_predictions 
)

Definition at line 1948 of file MLTableFunctions.hpp.

References Column< TextEncodingDict >::getString(), ml_reg_predict__cpu_template(), and Column< TextEncodingDict >::size().

1955  {
1956  if (model_name.size() != 1) {
1957  return mgr.ERROR_MESSAGE("Expected only one row in model CURSOR.");
1958  }
1959  TextEncodingNone model_name_text_enc_none(mgr, model_name.getString(0));
1960  return ml_reg_predict__cpu_template(mgr,
1961  model_name_text_enc_none,
1962  input_ids,
1963  input_cat_features,
1964  input_numeric_features,
1965  preferred_ml_framework_str,
1966  output_ids,
1967  output_predictions);
1968 }
DEVICE const std::string getString(int64_t index) const
NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< K > &input_ids, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)
DEVICE int64_t size() const

+ Here is the call graph for this function:

template<typename T , typename K >
NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template ( TableFunctionManager &  mgr,
const Column< TextEncodingDict > &  model_name,
const Column< K > &  input_ids,
const ColumnList< TextEncodingDict > &  input_cat_features,
const TextEncodingNone &  preferred_ml_framework_str,
Column< K > &  output_ids,
Column< T > &  output_predictions 
)

Definition at line 1983 of file MLTableFunctions.hpp.

References Column< TextEncodingDict >::getString(), ml_reg_predict__cpu_template(), and Column< TextEncodingDict >::size().

1989  {
1990  if (model_name.size() != 1) {
1991  return mgr.ERROR_MESSAGE("Expected only one row in model CURSOR.");
1992  }
1993  TextEncodingNone model_name_text_enc_none(mgr, model_name.getString(0));
1994  return ml_reg_predict__cpu_template(mgr,
1995  model_name_text_enc_none,
1996  input_ids,
1997  input_cat_features,
1998  preferred_ml_framework_str,
1999  output_ids,
2000  output_predictions);
2001 }
DEVICE const std::string getString(int64_t index) const
NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< K > &input_ids, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)
DEVICE int64_t size() const

+ Here is the call graph for this function:

template<typename T , typename K >
NEVER_INLINE HOST int32_t ml_reg_predict_impl ( TableFunctionManager &  mgr,
const std::shared_ptr< AbstractMLModel > &  model,
const Column< K > &  input_ids,
const ColumnList< T > &  input_features,
const TextEncodingNone &  preferred_ml_framework_str,
Column< K > &  output_ids,
Column< T > &  output_predictions 
)

Definition at line 1664 of file MLTableFunctions.hpp.

References CHECK, DECISION_TREE_REG, DEFAULT, TableFunctions_Namespace::denull_data(), GBT_REG, get_ml_framework(), TextEncodingNone::getString(), INVALID, LINEAR_REG, MLPACK, ColumnList< T >::numCols(), ONEAPI, ONEDAL, pluck_ptrs(), Column< T >::ptr_, RANDOM_FOREST_REG, TableFunctionManager::set_output_row_size(), Column< T >::size(), heavydb.dtypes::T, and TableFunctions_Namespace::unmask_data().

Referenced by ml_reg_predict__cpu_template(), and r2_score_impl().

1670  {
1671  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
1672  if (preferred_ml_framework == MLFramework::INVALID) {
1673  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
1674  preferred_ml_framework_str.getString());
1675  }
1676  const auto denulled_data = denull_data(input_features);
1677  const int64_t num_rows = denulled_data.masked_num_rows;
1678  const bool data_is_masked =
1679  denulled_data.masked_num_rows < denulled_data.unmasked_num_rows;
1680  std::vector<T> denulled_output_allocation(data_is_masked ? num_rows : 0);
1681  mgr.set_output_row_size(input_ids.size());
1682  T* denulled_output =
1683  data_is_masked ? denulled_output_allocation.data() : output_predictions.ptr_;
1684  const auto features_ptrs = pluck_ptrs(denulled_data.data, 0L, input_features.numCols());
1685 
1686  try {
1687  bool did_execute = false;
1688  const auto model_type = model->getModelType();
1689  switch (model_type) {
1690  case MLModelType::LINEAR_REG: {
1691  const auto linear_reg_model =
1692  std::dynamic_pointer_cast<LinearRegressionModel>(model);
1693  CHECK(linear_reg_model);
1694 #ifdef HAVE_ONEDAL
1695  if (!did_execute && (preferred_ml_framework == MLFramework::ONEAPI ||
1696  preferred_ml_framework == MLFramework::DEFAULT)) {
1697  onedal_oneapi_linear_reg_predict_impl(
1698  linear_reg_model, features_ptrs, denulled_output, num_rows);
1699  did_execute = true;
1700  } else if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL)) {
1701  onedal_linear_reg_predict_impl(
1702  linear_reg_model, features_ptrs, denulled_output, num_rows);
1703  did_execute = true;
1704  }
1705 #endif
1706 #ifdef HAVE_MLPACK
1707  if (!did_execute && (preferred_ml_framework == MLFramework::MLPACK ||
1708  preferred_ml_framework == MLFramework::DEFAULT)) {
1709  mlpack_linear_reg_predict_impl(
1710  linear_reg_model, features_ptrs, denulled_output, num_rows);
1711  did_execute = true;
1712  }
1713 #endif
1714  break;
1715  }
1716  case MLModelType::DECISION_TREE_REG: {
1717 #ifdef HAVE_ONEDAL
1718  const auto decision_tree_reg_model =
1719  std::dynamic_pointer_cast<DecisionTreeRegressionModel>(model);
1720  CHECK(decision_tree_reg_model);
1721  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
1722  preferred_ml_framework == MLFramework::DEFAULT)) {
1723  onedal_decision_tree_reg_predict_impl(
1724  decision_tree_reg_model, features_ptrs, denulled_output, num_rows);
1725  did_execute = true;
1726  }
1727 #endif
1728  break;
1729  }
1730  case MLModelType::GBT_REG: {
1731 #ifdef HAVE_ONEDAL
1732  const auto gbt_reg_model = std::dynamic_pointer_cast<GbtRegressionModel>(model);
1733  CHECK(gbt_reg_model);
1734  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
1735  preferred_ml_framework == MLFramework::DEFAULT)) {
1736  onedal_gbt_reg_predict_impl(
1737  gbt_reg_model, features_ptrs, denulled_output, num_rows);
1738  did_execute = true;
1739  }
1740 #endif
1741  break;
1742  }
1743  case MLModelType::RANDOM_FOREST_REG: {
1744 #ifdef HAVE_ONEDAL
1745  const auto random_forest_reg_model =
1746  std::dynamic_pointer_cast<RandomForestRegressionModel>(model);
1747  const auto oneapi_random_forest_reg_model =
1748  std::dynamic_pointer_cast<OneAPIRandomForestRegressionModel>(model);
1749  CHECK(random_forest_reg_model || oneapi_random_forest_reg_model);
1750  if (!did_execute && (preferred_ml_framework == MLFramework::ONEAPI ||
1751  preferred_ml_framework == MLFramework::ONEDAL ||
1752  preferred_ml_framework == MLFramework::DEFAULT)) {
1753  if (random_forest_reg_model) {
1754  onedal_random_forest_reg_predict_impl(
1755  random_forest_reg_model, features_ptrs, denulled_output, num_rows);
1756  } else {
1757  onedal_oneapi_random_forest_reg_predict_impl(
1758  oneapi_random_forest_reg_model, features_ptrs, denulled_output, num_rows);
1759  }
1760  did_execute = true;
1761  }
1762 #endif
1763  break;
1764  }
1765  default: {
1766  throw std::runtime_error("Unsupported model type");
1767  }
1768  }
1769  if (!did_execute) {
1770  return mgr.ERROR_MESSAGE("Cannot find " + preferred_ml_framework_str.getString() +
1771  " ML library to support model implementation.");
1772  }
1773  } catch (std::runtime_error& e) {
1774  const std::string error_str(e.what());
1775  return mgr.ERROR_MESSAGE(error_str);
1776  }
1777  output_ids = input_ids;
1778  if (data_is_masked) {
1779  unmask_data(denulled_output,
1780  denulled_data.reverse_index_map,
1781  output_predictions.ptr_,
1782  denulled_data.unmasked_num_rows,
1783  inline_null_value<T>());
1784  }
1785  return input_ids.size();
1786 }
void set_output_row_size(int64_t num_rows)
Definition: heavydbTypes.h:373
std::string getString() const
Definition: heavydbTypes.h:641
DEVICE int64_t size() const
DEVICE int64_t numCols() const
MaskedData< T > denull_data(const ColumnList< T > &features)
MLFramework get_ml_framework(const std::string &ml_framework_str)
std::vector< const T * > pluck_ptrs(const std::vector< std::vector< T >> &data, const int64_t start_idx, const int64_t end_idx)
void unmask_data(const T *masked_input, const std::vector< int32_t > &reverse_index_map, T *unmasked_output, const int64_t num_unmasked_rows, const T null_val)
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

EXTENSION_NOINLINE_HOST int32_t pca_fit__cpu_1 ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const ColumnList< TextEncodingDict > &  input_cat_features,
const int32_t  cat_top_k,
const float  cat_min_fraction,
const TextEncodingNone &  preferred_ml_framework_str,
const TextEncodingNone &  model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 68 of file MLTableFunctions.cpp.

References CategoricalFeaturesBuilder< T >::getCatFeatureKeys(), CategoricalFeaturesBuilder< T >::getFeatures(), and pca_fit_impl().

75  {
76  CategoricalFeaturesBuilder<double> cat_features_builder(
77  input_cat_features, cat_top_k, cat_min_fraction, false /* cat_include_others */);
78  return pca_fit_impl(mgr,
79  model_name,
80  cat_features_builder.getFeatures(),
81  cat_features_builder.getCatFeatureKeys(),
82  preferred_ml_framework_str,
83  model_metadata,
84  output_model_name);
85 }
NEVER_INLINE HOST int32_t pca_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t pca_fit__cpu_template ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const ColumnList< T > &  input_features,
const TextEncodingNone &  preferred_ml_framework_str,
const TextEncodingNone &  model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 1585 of file MLTableFunctions.hpp.

References pca_fit_impl().

1590  {
1591  std::vector<std::vector<std::string>> empty_cat_feature_keys;
1592  return pca_fit_impl(mgr,
1593  model_name,
1594  input_features,
1595  empty_cat_feature_keys,
1596  preferred_ml_framework_str,
1597  model_metadata,
1598  output_model_name);
1599 }
NEVER_INLINE HOST int32_t pca_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t pca_fit__cpu_template ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const ColumnList< TextEncodingDict > &  input_cat_features,
const ColumnList< T > &  input_numeric_features,
const int32_t  cat_top_k,
const float  cat_min_fraction,
const TextEncodingNone &  preferred_ml_framework_str,
const TextEncodingNone &  model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 1616 of file MLTableFunctions.hpp.

References CategoricalFeaturesBuilder< T >::getCatFeatureKeys(), CategoricalFeaturesBuilder< T >::getFeatures(), and pca_fit_impl().

1624  {
1625  CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
1626  input_numeric_features,
1627  cat_top_k,
1628  cat_min_fraction,
1629  false /* cat_include_others */);
1630  return pca_fit_impl(mgr,
1631  model_name,
1632  cat_features_builder.getFeatures(),
1633  cat_features_builder.getCatFeatureKeys(),
1634  preferred_ml_framework_str,
1635  model_metadata,
1636  output_model_name);
1637 }
NEVER_INLINE HOST int32_t pca_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t pca_fit_impl ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const ColumnList< T > &  input_features,
const std::vector< std::vector< std::string >> &  cat_feature_keys,
const TextEncodingNone &  preferred_ml_framework_str,
const TextEncodingNone &  model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 1497 of file MLTableFunctions.hpp.

References MLModelMap::addModel(), DEFAULT, TableFunctions_Namespace::denull_data(), g_ml_models, get_ml_framework(), Column< TextEncodingDict >::getOrAddTransient(), TextEncodingNone::getString(), INVALID, ColumnList< T >::numCols(), ONEAPI, ONEDAL, pluck_ptrs(), TableFunctionManager::set_output_row_size(), ColumnList< T >::size(), and z_std_normalize_data_with_summary_stats().

Referenced by pca_fit__cpu_1(), and pca_fit__cpu_template().

1503  {
1504  if (input_features.size() == 0) {
1505  return mgr.ERROR_MESSAGE(
1506  "No rows exist in training data. Training data must at least contain 1 row.");
1507  }
1508  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
1509  if (preferred_ml_framework == MLFramework::INVALID) {
1510  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
1511  preferred_ml_framework_str.getString());
1512  }
1513  try {
1514  const auto denulled_data = denull_data(input_features);
1515  const int64_t num_rows = denulled_data.masked_num_rows;
1516  if (num_rows == 0) {
1517  return mgr.ERROR_MESSAGE(
1518  "No non-null rows exist in training data. Training data must at least contain "
1519  "1 "
1520  "non-null row.");
1521  }
1522  const auto features_ptrs =
1523  pluck_ptrs(denulled_data.data, 0L, input_features.numCols());
1524  // z_std_normalize_data_with_summary_stats can throw if std dev is 0
1525  const auto z_std_norm_summary_stats =
1526  z_std_normalize_data_with_summary_stats(denulled_data.data, num_rows);
1527  const auto normalized_ptrs =
1528  pluck_ptrs(z_std_norm_summary_stats.normalized_data,
1529  0L,
1530  z_std_norm_summary_stats.normalized_data.size());
1531  bool did_execute = false;
1532 #ifdef HAVE_ONEDAL
1533  if (preferred_ml_framework == MLFramework::ONEAPI ||
1534  preferred_ml_framework == MLFramework::DEFAULT) {
1535  const auto [eigenvectors, eigenvalues] =
1536  onedal_oneapi_pca_impl(normalized_ptrs, denulled_data.masked_num_rows);
1537  auto model = std::make_shared<PcaModel>(z_std_norm_summary_stats.means,
1538  z_std_norm_summary_stats.std_devs,
1539  eigenvectors,
1540  eigenvalues,
1541  model_metadata,
1542  cat_feature_keys);
1543  g_ml_models.addModel(model_name, model);
1544  did_execute = true;
1545  } else if (preferred_ml_framework == MLFramework::ONEDAL) {
1546  const auto [eigenvectors, eigenvalues] =
1547  onedal_pca_impl(normalized_ptrs, denulled_data.masked_num_rows);
1548  auto model = std::make_shared<PcaModel>(z_std_norm_summary_stats.means,
1549  z_std_norm_summary_stats.std_devs,
1550  eigenvectors,
1551  eigenvalues,
1552  model_metadata,
1553  cat_feature_keys);
1554  g_ml_models.addModel(model_name, model);
1555  did_execute = true;
1556  }
1557 #endif
1558  if (!did_execute) {
1559  return mgr.ERROR_MESSAGE("Cannot find " + preferred_ml_framework_str.getString() +
1560  " ML library to support PCA implementation.");
1561  }
1562  mgr.set_output_row_size(1);
1563  const TextEncodingDict model_name_str_id =
1564  output_model_name.getOrAddTransient(model_name);
1565  output_model_name[0] = model_name_str_id;
1566  return 1;
1567  } catch (std::runtime_error& e) {
1568  return mgr.ERROR_MESSAGE(e.what());
1569  }
1570 }
void set_output_row_size(int64_t num_rows)
Definition: heavydbTypes.h:373
std::string getString() const
Definition: heavydbTypes.h:641
DEVICE int64_t numCols() const
MaskedData< T > denull_data(const ColumnList< T > &features)
MLFramework get_ml_framework(const std::string &ml_framework_str)
std::vector< const T * > pluck_ptrs(const std::vector< std::vector< T >> &data, const int64_t start_idx, const int64_t end_idx)
void addModel(const std::string &model_name, std::shared_ptr< AbstractMLModel > model)
Definition: MLModel.h:38
MLModelMap g_ml_models
Definition: MLModel.h:125
DEVICE int64_t size() const
DEVICE const TextEncodingDict getOrAddTransient(const std::string &str)
ZStdNormalizationSummaryStats< T > z_std_normalize_data_with_summary_stats(const std::vector< T * > &input_data, const int64_t num_rows)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
std::vector<const T*> pluck_ptrs ( const std::vector< std::vector< T >> &  data,
const int64_t  start_idx,
const int64_t  end_idx 
)

Definition at line 43 of file MLTableFunctions.hpp.

References CHECK_GE, CHECK_GT, and CHECK_LE.

Referenced by dbscan__cpu_template(), decision_tree_reg_impl(), gbt_reg_fit_impl(), kmeans__cpu_template(), linear_reg_fit_impl(), ml_reg_predict_impl(), pca_fit_impl(), and random_forest_reg_fit_impl().

45  {
46  std::vector<const T*> raw_ptrs;
47  CHECK_GE(start_idx, 0L);
48  CHECK_GT(end_idx, start_idx);
49  CHECK_LE(end_idx, static_cast<int64_t>(data.size()));
50  for (int64_t col_idx = start_idx; col_idx < end_idx; ++col_idx) {
51  raw_ptrs.emplace_back(data[col_idx].data());
52  }
53  return raw_ptrs;
54 }
#define CHECK_GE(x, y)
Definition: Logger.h:306
#define CHECK_GT(x, y)
Definition: Logger.h:305
#define CHECK_LE(x, y)
Definition: Logger.h:304

+ Here is the caller graph for this function:

template<typename T >
std::vector<const T*> pluck_ptrs ( const std::vector< T * > &  data,
const int64_t  start_idx,
const int64_t  end_idx 
)

Definition at line 57 of file MLTableFunctions.hpp.

References CHECK_GE, CHECK_GT, and CHECK_LE.

59  {
60  std::vector<const T*> raw_ptrs;
61  CHECK_GE(start_idx, 0L);
62  CHECK_GT(end_idx, start_idx);
63  CHECK_LE(end_idx, static_cast<int64_t>(data.size()));
64  for (int64_t col_idx = start_idx; col_idx < end_idx; ++col_idx) {
65  raw_ptrs.emplace_back(data[col_idx]);
66  }
67  return raw_ptrs;
68 }
#define CHECK_GE(x, y)
Definition: Logger.h:306
#define CHECK_GT(x, y)
Definition: Logger.h:305
#define CHECK_LE(x, y)
Definition: Logger.h:304
template<typename T >
NEVER_INLINE HOST int32_t r2_score__cpu_template ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const Column< T > &  input_labels,
const ColumnList< T > &  input_features,
Column< double > &  output_r2 
)

Definition at line 2097 of file MLTableFunctions.hpp.

References check_model_params(), g_ml_models, MLModelMap::getModel(), ColumnList< T >::numCols(), and r2_score_impl().

Referenced by r2_score__cpu_template().

2101  {
2102  try {
2103  const auto model = g_ml_models.getModel(model_name);
2104  check_model_params(model, 0, input_features.numCols());
2105  return r2_score_impl(mgr, model, input_labels, input_features, output_r2);
2106  } catch (std::runtime_error& e) {
2107  const std::string error_str(e.what());
2108  return mgr.ERROR_MESSAGE(error_str);
2109  }
2110 }
NEVER_INLINE HOST int32_t r2_score_impl(TableFunctionManager &mgr, const std::shared_ptr< AbstractMLModel > &model, const Column< T > &input_labels, const ColumnList< T > &input_features, Column< double > &output_r2)
DEVICE int64_t numCols() const
EXTENSION_NOINLINE_HOST void check_model_params(const std::shared_ptr< AbstractMLModel > &model, const int64_t num_cat_features, const int64_t num_numeric_features)
std::shared_ptr< AbstractMLModel > getModel(const std::string &model_name) const
Definition: MLModel.h:51
MLModelMap g_ml_models
Definition: MLModel.h:125

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t r2_score__cpu_template ( TableFunctionManager &  mgr,
const Column< TextEncodingDict > &  model_name,
const Column< T > &  input_labels,
const ColumnList< T > &  input_features,
Column< double > &  output_r2 
)

Definition at line 2123 of file MLTableFunctions.hpp.

References Column< TextEncodingDict >::getString(), r2_score__cpu_template(), and Column< TextEncodingDict >::size().

2127  {
2128  if (model_name.size() != 1) {
2129  return mgr.ERROR_MESSAGE("Expected only one row in model name CURSOR.");
2130  }
2131  TextEncodingNone model_name_text_enc_none(mgr, model_name.getString(0));
2132  return r2_score__cpu_template(
2133  mgr, model_name_text_enc_none, input_labels, input_features, output_r2);
2134 }
DEVICE const std::string getString(int64_t index) const
DEVICE int64_t size() const
NEVER_INLINE HOST int32_t r2_score__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, Column< double > &output_r2)

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t r2_score__cpu_template ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const Column< T > &  input_labels,
const ColumnList< TextEncodingDict > &  input_cat_features,
const ColumnList< T > &  input_numeric_features,
Column< double > &  output_r2 
)

Definition at line 2146 of file MLTableFunctions.hpp.

References check_model_params(), g_ml_models, MLModelMap::getModel(), ColumnList< T >::numCols(), ColumnList< TextEncodingDict >::numCols(), and r2_score_impl().

2151  {
2152  try {
2153  const auto model = g_ml_models.getModel(model_name);
2154  check_model_params(
2155  model, input_cat_features.numCols(), input_numeric_features.numCols());
2156  CategoricalFeaturesBuilder<T> cat_features_builder(
2157  input_cat_features, input_numeric_features, model->getCatFeatureKeys());
2158  return r2_score_impl(
2159  mgr, model, input_labels, cat_features_builder.getFeatures(), output_r2);
2160  } catch (std::runtime_error& e) {
2161  const std::string error_str(e.what());
2162  return mgr.ERROR_MESSAGE(error_str);
2163  }
2164 }
NEVER_INLINE HOST int32_t r2_score_impl(TableFunctionManager &mgr, const std::shared_ptr< AbstractMLModel > &model, const Column< T > &input_labels, const ColumnList< T > &input_features, Column< double > &output_r2)
DEVICE int64_t numCols() const
EXTENSION_NOINLINE_HOST void check_model_params(const std::shared_ptr< AbstractMLModel > &model, const int64_t num_cat_features, const int64_t num_numeric_features)
std::shared_ptr< AbstractMLModel > getModel(const std::string &model_name) const
Definition: MLModel.h:51
DEVICE int64_t numCols() const
MLModelMap g_ml_models
Definition: MLModel.h:125

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t r2_score__cpu_template ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const Column< T > &  input_labels,
const ColumnList< TextEncodingDict > &  input_cat_features,
Column< double > &  output_r2 
)

Definition at line 2176 of file MLTableFunctions.hpp.

References check_model_params(), g_ml_models, MLModelMap::getModel(), ColumnList< TextEncodingDict >::numCols(), and r2_score_impl().

2180  {
2181  try {
2182  const auto model = g_ml_models.getModel(model_name);
2183  check_model_params(model, input_cat_features.numCols(), 0);
2184  CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
2185  model->getCatFeatureKeys());
2186  return r2_score_impl(
2187  mgr, model, input_labels, cat_features_builder.getFeatures(), output_r2);
2188  } catch (std::runtime_error& e) {
2189  const std::string error_str(e.what());
2190  return mgr.ERROR_MESSAGE(error_str);
2191  }
2192 }
NEVER_INLINE HOST int32_t r2_score_impl(TableFunctionManager &mgr, const std::shared_ptr< AbstractMLModel > &model, const Column< T > &input_labels, const ColumnList< T > &input_features, Column< double > &output_r2)
EXTENSION_NOINLINE_HOST void check_model_params(const std::shared_ptr< AbstractMLModel > &model, const int64_t num_cat_features, const int64_t num_numeric_features)
std::shared_ptr< AbstractMLModel > getModel(const std::string &model_name) const
Definition: MLModel.h:51
DEVICE int64_t numCols() const
MLModelMap g_ml_models
Definition: MLModel.h:125

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t r2_score__cpu_template ( TableFunctionManager &  mgr,
const Column< TextEncodingDict > &  model_name,
const Column< T > &  input_labels,
const ColumnList< TextEncodingDict > &  input_cat_features,
const ColumnList< T > &  input_numeric_features,
Column< double > &  output_r2 
)

Definition at line 2204 of file MLTableFunctions.hpp.

References check_model_params(), g_ml_models, MLModelMap::getModel(), Column< TextEncodingDict >::getString(), ColumnList< T >::numCols(), ColumnList< TextEncodingDict >::numCols(), r2_score_impl(), and Column< TextEncodingDict >::size().

2209  {
2210  if (model_name.size() != 1) {
2211  return mgr.ERROR_MESSAGE("Expected only one row in model name CURSOR.");
2212  }
2213  const std::string model_name_str{model_name.getString(0)};
2214  try {
2215  const auto model = g_ml_models.getModel(model_name_str);
2216  check_model_params(
2217  model, input_cat_features.numCols(), input_numeric_features.numCols());
2218  CategoricalFeaturesBuilder<T> cat_features_builder(
2219  input_cat_features, input_numeric_features, model->getCatFeatureKeys());
2220  return r2_score_impl(
2221  mgr, model, input_labels, cat_features_builder.getFeatures(), output_r2);
2222  } catch (std::runtime_error& e) {
2223  const std::string error_str(e.what());
2224  return mgr.ERROR_MESSAGE(error_str);
2225  }
2226 }
DEVICE const std::string getString(int64_t index) const
NEVER_INLINE HOST int32_t r2_score_impl(TableFunctionManager &mgr, const std::shared_ptr< AbstractMLModel > &model, const Column< T > &input_labels, const ColumnList< T > &input_features, Column< double > &output_r2)
DEVICE int64_t numCols() const
EXTENSION_NOINLINE_HOST void check_model_params(const std::shared_ptr< AbstractMLModel > &model, const int64_t num_cat_features, const int64_t num_numeric_features)
std::shared_ptr< AbstractMLModel > getModel(const std::string &model_name) const
Definition: MLModel.h:51
DEVICE int64_t numCols() const
MLModelMap g_ml_models
Definition: MLModel.h:125
DEVICE int64_t size() const

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t r2_score_impl ( TableFunctionManager &  mgr,
const std::shared_ptr< AbstractMLModel > &  model,
const Column< T > &  input_labels,
const ColumnList< T > &  input_features,
Column< double > &  output_r2 
)

Definition at line 2004 of file MLTableFunctions.hpp.

References TableFunctionManager::disable_output_allocations(), TableFunctionManager::enable_output_allocations(), get_column_mean(), max_inputs_per_thread, ml_reg_predict_impl(), threading_serial::parallel_for(), TableFunctionManager::set_output_row_size(), and Column< T >::size().

Referenced by r2_score__cpu_template().

2008  {
2009  const int64_t num_rows = input_labels.size();
2010  if (num_rows == 0) {
2011  return mgr.ERROR_MESSAGE(
2012  "No rows exist in evaluation data. Evaluation data must at least contain 1 row.");
2013  }
2014  std::vector<T> output_predictions_vec(num_rows);
2015  Column<T> output_predictions(output_predictions_vec);
2016  std::vector<int64_t> input_ids_vec(num_rows);
2017  std::vector<int64_t> output_ids_vec(num_rows);
2018  Column<int64_t> input_ids(input_ids_vec);
2019  Column<int64_t> output_ids(output_ids_vec);
2020  mgr.disable_output_allocations();
2021  TextEncodingNone ml_framework_encoding_none("DEFAULT");
2022 
2023  try {
2024  auto ret = ml_reg_predict_impl(mgr,
2025  model,
2026  input_ids,
2027  input_features,
2028  ml_framework_encoding_none,
2029  output_ids,
2030  output_predictions);
2031 
2032  if (ret < 0) {
2033  // A return of less than 0 symbolizes an error
2034  return ret;
2035  }
2036  } catch (std::runtime_error& e) {
2037  mgr.enable_output_allocations();
2038  return mgr.ERROR_MESSAGE(e.what());
2039  }
2040 
2041  mgr.enable_output_allocations();
2042  mgr.set_output_row_size(1);
2043 
2044  const auto labels_mean = get_column_mean(input_labels);
2045  const size_t max_thread_count = std::thread::hardware_concurrency();
2046  const size_t max_inputs_per_thread = 20000;
2047  const size_t num_threads = std::min(
2048  max_thread_count, ((num_rows + max_inputs_per_thread - 1) / max_inputs_per_thread));
2049 
2050  std::vector<double> local_sum_squared_regressions(num_threads, 0.0);
2051  std::vector<double> local_sum_squares(num_threads, 0.0);
2052 
2053  tbb::task_arena limited_arena(num_threads);
2054 
2055  limited_arena.execute([&] {
2056  tbb::parallel_for(
2057  tbb::blocked_range<int64_t>(0, num_rows),
2058  [&](const tbb::blocked_range<int64_t>& r) {
2059  const int64_t start_idx = r.begin();
2060  const int64_t end_idx = r.end();
2061  double local_sum_squared_regression{0.0};
2062  double local_sum_square{0.0};
2063  for (int64_t row_idx = start_idx; row_idx < end_idx; ++row_idx) {
2064  if (output_predictions[row_idx] != inline_null_value<T>()) {
2065  local_sum_squared_regression +=
2066  (input_labels[row_idx] - output_predictions[row_idx]) *
2067  (input_labels[row_idx] - output_predictions[row_idx]);
2068  local_sum_square += (input_labels[row_idx] - labels_mean) *
2069  (input_labels[row_idx] - labels_mean);
2070  }
2071  }
2072  const size_t thread_idx = tbb::this_task_arena::current_thread_index();
2073  local_sum_squared_regressions[thread_idx] += local_sum_squared_regression;
2074  local_sum_squares[thread_idx] += local_sum_square;
2075  });
2076  });
2077  double sum_squared_regression{0.0};
2078  double sum_squares{0.0};
2079  for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
2080  sum_squared_regression += local_sum_squared_regressions[thread_idx];
2081  sum_squares += local_sum_squares[thread_idx];
2082  }
2083  output_r2[0] = sum_squares == 0.0 ? 1.0 : 1.0 - (sum_squared_regression / sum_squares);
2084  return 1;
2085 }
void set_output_row_size(int64_t num_rows)
Definition: heavydbTypes.h:373
DEVICE int64_t size() const
const size_t max_inputs_per_thread
void disable_output_allocations()
Definition: heavydbTypes.h:379
void parallel_for(const blocked_range< Int > &range, const Body &body, const Partitioner &p=Partitioner())
NEVER_INLINE HOST int32_t ml_reg_predict_impl(TableFunctionManager &mgr, const std::shared_ptr< AbstractMLModel > &model, const Column< K > &input_ids, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)
void enable_output_allocations()
Definition: heavydbTypes.h:381
NEVER_INLINE HOST double get_column_mean(const T *data, const int64_t num_rows)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t random_forest_reg_fit__cpu_template ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const Column< T > &  input_labels,
const ColumnList< T > &  input_features,
const int64_t  num_trees,
const double  obs_per_tree_fraction,
const int64_t  max_tree_depth,
const int64_t  features_per_node,
const double  impurity_threshold,
const bool  bootstrap,
const int64_t  min_obs_per_leaf_node,
const int64_t  min_obs_per_split_node,
const double  min_weight_fraction_in_leaf_node,
const double  min_impurity_decrease_in_split_node,
const int64_t  max_leaf_nodes,
const bool  use_histogram,
const TextEncodingNone &  var_importance_metric_str,
const TextEncodingNone &  preferred_ml_framework_str,
const TextEncodingNone &  model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 1297 of file MLTableFunctions.hpp.

References random_forest_reg_fit_impl().

1316  {
1317  std::vector<std::vector<std::string>> empty_cat_feature_keys;
1318  return random_forest_reg_fit_impl(mgr,
1319  model_name,
1320  input_labels,
1321  input_features,
1322  empty_cat_feature_keys,
1323  num_trees,
1324  obs_per_tree_fraction,
1325  max_tree_depth,
1326  features_per_node,
1327  impurity_threshold,
1328  bootstrap,
1329  min_obs_per_leaf_node,
1330  min_obs_per_split_node,
1331  min_weight_fraction_in_leaf_node,
1332  min_impurity_decrease_in_split_node,
1333  max_leaf_nodes,
1334  use_histogram,
1335  var_importance_metric_str,
1336  preferred_ml_framework_str,
1337  model_metadata,
1338  output_model_name);
1339 }
NEVER_INLINE HOST int32_t random_forest_reg_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t num_trees, const double obs_per_tree_fraction, const int64_t max_tree_depth, const int64_t features_per_node, const double impurity_threshold, const bool bootstrap, const int64_t min_obs_per_leaf_node, const int64_t min_obs_per_split_node, const double min_weight_fraction_in_leaf_node, const double min_impurity_decrease_in_split_node, const int64_t max_leaf_nodes, const bool use_histogram, const TextEncodingNone &var_importance_metric_str, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t random_forest_reg_fit__cpu_template ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const Column< T > &  input_labels,
const ColumnList< TextEncodingDict > &  input_cat_features,
const ColumnList< T > &  input_numeric_features,
const int64_t  num_trees,
const double  obs_per_tree_fraction,
const int64_t  max_tree_depth,
const int64_t  features_per_node,
const double  impurity_threshold,
const bool  bootstrap,
const int64_t  min_obs_per_leaf_node,
const int64_t  min_obs_per_split_node,
const double  min_weight_fraction_in_leaf_node,
const double  min_impurity_decrease_in_split_node,
const int64_t  max_leaf_nodes,
const bool  use_histogram,
const TextEncodingNone &  var_importance_metric_str,
const int32_t  cat_top_k,
const float  cat_min_fraction,
const TextEncodingNone &  preferred_ml_framework_str,
const TextEncodingNone &  model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 1368 of file MLTableFunctions.hpp.

References CategoricalFeaturesBuilder< T >::getCatFeatureKeys(), CategoricalFeaturesBuilder< T >::getFeatures(), and random_forest_reg_fit_impl().

1391  {
1392  CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
1393  input_numeric_features,
1394  cat_top_k,
1395  cat_min_fraction,
1396  false /* cat_include_others */);
1397  return random_forest_reg_fit_impl(mgr,
1398  model_name,
1399  input_labels,
1400  cat_features_builder.getFeatures(),
1401  cat_features_builder.getCatFeatureKeys(),
1402  num_trees,
1403  obs_per_tree_fraction,
1404  max_tree_depth,
1405  features_per_node,
1406  impurity_threshold,
1407  bootstrap,
1408  min_obs_per_leaf_node,
1409  min_obs_per_split_node,
1410  min_weight_fraction_in_leaf_node,
1411  min_impurity_decrease_in_split_node,
1412  max_leaf_nodes,
1413  use_histogram,
1414  var_importance_metric_str,
1415  preferred_ml_framework_str,
1416  model_metadata,
1417  output_model_name);
1418 }
NEVER_INLINE HOST int32_t random_forest_reg_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t num_trees, const double obs_per_tree_fraction, const int64_t max_tree_depth, const int64_t features_per_node, const double impurity_threshold, const bool bootstrap, const int64_t min_obs_per_leaf_node, const int64_t min_obs_per_split_node, const double min_weight_fraction_in_leaf_node, const double min_impurity_decrease_in_split_node, const int64_t max_leaf_nodes, const bool use_histogram, const TextEncodingNone &var_importance_metric_str, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t random_forest_reg_fit__cpu_template ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const Column< T > &  input_labels,
const ColumnList< TextEncodingDict > &  input_cat_features,
const int64_t  num_trees,
const double  obs_per_tree_fraction,
const int64_t  max_tree_depth,
const int64_t  features_per_node,
const double  impurity_threshold,
const bool  bootstrap,
const int64_t  min_obs_per_leaf_node,
const int64_t  min_obs_per_split_node,
const double  min_weight_fraction_in_leaf_node,
const double  min_impurity_decrease_in_split_node,
const int64_t  max_leaf_nodes,
const bool  use_histogram,
const TextEncodingNone &  var_importance_metric_str,
const int32_t  cat_top_k,
const float  cat_min_fraction,
const TextEncodingNone &  preferred_ml_framework_str,
const TextEncodingNone &  model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 1447 of file MLTableFunctions.hpp.

References CategoricalFeaturesBuilder< T >::getCatFeatureKeys(), CategoricalFeaturesBuilder< T >::getFeatures(), and random_forest_reg_fit_impl().

1469  {
1470  CategoricalFeaturesBuilder<T> cat_features_builder(
1471  input_cat_features, cat_top_k, cat_min_fraction, false /* cat_include_others */);
1472  return random_forest_reg_fit_impl(mgr,
1473  model_name,
1474  input_labels,
1475  cat_features_builder.getFeatures(),
1476  cat_features_builder.getCatFeatureKeys(),
1477  num_trees,
1478  obs_per_tree_fraction,
1479  max_tree_depth,
1480  features_per_node,
1481  impurity_threshold,
1482  bootstrap,
1483  min_obs_per_leaf_node,
1484  min_obs_per_split_node,
1485  min_weight_fraction_in_leaf_node,
1486  min_impurity_decrease_in_split_node,
1487  max_leaf_nodes,
1488  use_histogram,
1489  var_importance_metric_str,
1490  preferred_ml_framework_str,
1491  model_metadata,
1492  output_model_name);
1493 }
NEVER_INLINE HOST int32_t random_forest_reg_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t num_trees, const double obs_per_tree_fraction, const int64_t max_tree_depth, const int64_t features_per_node, const double impurity_threshold, const bool bootstrap, const int64_t min_obs_per_leaf_node, const int64_t min_obs_per_split_node, const double min_weight_fraction_in_leaf_node, const double min_impurity_decrease_in_split_node, const int64_t max_leaf_nodes, const bool use_histogram, const TextEncodingNone &var_importance_metric_str, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t random_forest_reg_fit_impl ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const Column< T > &  input_labels,
const ColumnList< T > &  input_features,
const std::vector< std::vector< std::string >> &  cat_feature_keys,
const int64_t  num_trees,
const double  obs_per_tree_fraction,
const int64_t  max_tree_depth,
const int64_t  features_per_node,
const double  impurity_threshold,
const bool  bootstrap,
const int64_t  min_obs_per_leaf_node,
const int64_t  min_obs_per_split_node,
const double  min_weight_fraction_in_leaf_node,
const double  min_impurity_decrease_in_split_node,
const int64_t  max_leaf_nodes,
const bool  use_histogram,
const TextEncodingNone &  var_importance_metric_str,
const TextEncodingNone &  preferred_ml_framework_str,
const TextEncodingNone &  model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 1106 of file MLTableFunctions.hpp.

References DEFAULT, TableFunctions_Namespace::denull_data(), get_ml_framework(), get_var_importance_metric(), Column< TextEncodingDict >::getOrAddTransient(), TextEncodingNone::getString(), INVALID, MLPACK, ColumnList< T >::numCols(), ONEAPI, ONEDAL, pluck_ptrs(), TableFunctionManager::set_output_row_size(), Column< T >::size(), and heavydb.dtypes::T.

Referenced by random_forest_reg_fit__cpu_template().

1126  {
1127  if (input_labels.size() == 0) {
1128  return mgr.ERROR_MESSAGE(
1129  "No rows exist in training data. Training data must at least contain 1 row.");
1130  }
1131  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
1132  if (preferred_ml_framework == MLFramework::INVALID) {
1133  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
1134  preferred_ml_framework_str.getString());
1135  }
1136  if (preferred_ml_framework == MLFramework::MLPACK) {
1137  return mgr.ERROR_MESSAGE(
1138  "Only OneDAL framework supported for random forest regression.");
1139  }
1140 #ifndef HAVE_ONEDAL
1141  return mgr.ERROR_MESSAGE(
1142  "Only OneDAL framework supported for random forest regression.");
1143 #endif
1144 
1145  const auto denulled_data = denull_data(input_labels, input_features);
1146  const auto labels_ptrs = pluck_ptrs(denulled_data.data, 0L, 1L);
1147  const auto features_ptrs =
1148  pluck_ptrs(denulled_data.data, 1L, input_features.numCols() + 1);
1149  mgr.set_output_row_size(1);
1150  try {
1151  bool did_execute = false;
1152  const auto var_importance_metric =
1153  get_var_importance_metric(var_importance_metric_str);
1154  if (var_importance_metric == VarImportanceMetric::INVALID) {
1155  return mgr.ERROR_MESSAGE("Invalid variable importance metric: " +
1156  var_importance_metric_str.getString());
1157  }
1158 #ifdef HAVE_ONEDAL
1159  if (!did_execute && (preferred_ml_framework == MLFramework::ONEAPI ||
1160  preferred_ml_framework == MLFramework::DEFAULT)) {
1161  if (use_histogram) {
1162  onedal_oneapi_random_forest_reg_fit_impl<
1163  T,
1164  oneapi::dal::decision_forest::method::hist>(
1165  model_name,
1166  labels_ptrs[0],
1167  features_ptrs,
1168  model_metadata,
1169  cat_feature_keys,
1170  denulled_data.masked_num_rows,
1171  num_trees,
1172  obs_per_tree_fraction,
1173  max_tree_depth,
1174  features_per_node,
1175  impurity_threshold,
1176  bootstrap,
1177  min_obs_per_leaf_node,
1178  min_obs_per_split_node,
1179  min_weight_fraction_in_leaf_node,
1180  min_impurity_decrease_in_split_node,
1181  max_leaf_nodes,
1182  var_importance_metric);
1183  } else {
1184  onedal_oneapi_random_forest_reg_fit_impl<
1185  T,
1186  oneapi::dal::decision_forest::method::dense>(
1187  model_name,
1188  labels_ptrs[0],
1189  features_ptrs,
1190  model_metadata,
1191  cat_feature_keys,
1192  denulled_data.masked_num_rows,
1193  num_trees,
1194  obs_per_tree_fraction,
1195  max_tree_depth,
1196  features_per_node,
1197  impurity_threshold,
1198  bootstrap,
1199  min_obs_per_leaf_node,
1200  min_obs_per_split_node,
1201  min_weight_fraction_in_leaf_node,
1202  min_impurity_decrease_in_split_node,
1203  max_leaf_nodes,
1204  var_importance_metric);
1205  }
1206  const TextEncodingDict model_name_str_id =
1207  output_model_name.getOrAddTransient(model_name);
1208  output_model_name[0] = model_name_str_id;
1209  did_execute = true;
1210  } else if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL)) {
1211  if (use_histogram) {
1212  onedal_random_forest_reg_fit_impl<T, decision_forest::regression::training::hist>(
1213  model_name,
1214  labels_ptrs[0],
1215  features_ptrs,
1216  model_metadata,
1217  cat_feature_keys,
1218  denulled_data.masked_num_rows,
1219  num_trees,
1220  obs_per_tree_fraction,
1221  max_tree_depth,
1222  features_per_node,
1223  impurity_threshold,
1224  bootstrap,
1225  min_obs_per_leaf_node,
1226  min_obs_per_split_node,
1227  min_weight_fraction_in_leaf_node,
1228  min_impurity_decrease_in_split_node,
1229  max_leaf_nodes,
1230  var_importance_metric);
1231  } else {
1232  onedal_random_forest_reg_fit_impl<
1233  T,
1234  decision_forest::regression::training::defaultDense>(
1235  model_name,
1236  labels_ptrs[0],
1237  features_ptrs,
1238  model_metadata,
1239  cat_feature_keys,
1240  denulled_data.masked_num_rows,
1241  num_trees,
1242  obs_per_tree_fraction,
1243  max_tree_depth,
1244  features_per_node,
1245  impurity_threshold,
1246  bootstrap,
1247  min_obs_per_leaf_node,
1248  min_obs_per_split_node,
1249  min_weight_fraction_in_leaf_node,
1250  min_impurity_decrease_in_split_node,
1251  max_leaf_nodes,
1252  var_importance_metric);
1253  }
1254  const TextEncodingDict model_name_str_id =
1255  output_model_name.getOrAddTransient(model_name);
1256  output_model_name[0] = model_name_str_id;
1257  did_execute = true;
1258  }
1259 #endif
1260  if (!did_execute) {
1261  return mgr.ERROR_MESSAGE(
1262  "Cannot find " + preferred_ml_framework_str.getString() +
1263  " ML library to support random forest regression implementation.");
1264  }
1265  } catch (std::runtime_error& e) {
1266  return mgr.ERROR_MESSAGE(e.what());
1267  }
1268  return 1;
1269 }
void set_output_row_size(int64_t num_rows)
Definition: heavydbTypes.h:373
std::string getString() const
Definition: heavydbTypes.h:641
DEVICE int64_t size() const
DEVICE int64_t numCols() const
MaskedData< T > denull_data(const ColumnList< T > &features)
MLFramework get_ml_framework(const std::string &ml_framework_str)
std::vector< const T * > pluck_ptrs(const std::vector< std::vector< T >> &data, const int64_t start_idx, const int64_t end_idx)
VarImportanceMetric get_var_importance_metric(const std::string &var_importance_metric_str)
DEVICE const TextEncodingDict getOrAddTransient(const std::string &str)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

EXTENSION_NOINLINE_HOST int32_t random_forest_reg_var_importance__cpu_1 ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
Column< int64_t > &  feature_id,
Column< TextEncodingDict > &  feature,
Column< int64_t > &  sub_feature_id,
Column< TextEncodingDict > &  sub_feature,
Column< double > &  importance_score 
)

Definition at line 174 of file MLTableFunctions.cpp.

References g_ml_models, get_model_features(), MLModelMap::getModel(), Column< TextEncodingDict >::getOrAddTransient(), and TableFunctionManager::set_output_row_size().

Referenced by random_forest_reg_var_importance__cpu_2().

180  {
181 #ifndef HAVE_ONEDAL
182  return mgr.ERROR_MESSAGE(
183  "Only OneDAL framework supported for random forest regression.");
184 #endif
185  try {
186 #ifdef HAVE_ONEDAL
187  const auto base_model = g_ml_models.getModel(model_name);
188  const auto rand_forest_model =
189  std::dynamic_pointer_cast<AbstractRandomForestModel>(base_model);
190  if (!rand_forest_model) {
191  throw std::runtime_error("Model is not of type random forest.");
192  }
193  const auto& variable_importance_scores =
194  rand_forest_model->getVariableImportanceScores();
195  const int64_t num_features = variable_importance_scores.size();
196  mgr.set_output_row_size(num_features);
197  if (num_features == 0) {
198  return mgr.ERROR_MESSAGE("Variable importance not computed for this model.");
199  }
200  if (num_features != rand_forest_model->getNumFeatures()) {
201  return mgr.ERROR_MESSAGE(
202  "Mismatch in number of features and number of variable importance metrics.");
203  }
204  const auto num_logical_features = rand_forest_model->getNumLogicalFeatures();
205  std::vector<std::string> feature_names =
206  get_model_features(model_name, rand_forest_model);
207 
208  int64_t physical_feature_idx = 0;
209  const auto& cat_feature_keys = rand_forest_model->getCatFeatureKeys();
210  const auto num_cat_features = rand_forest_model->getNumCatFeatures();
211  for (int64_t feature_idx = 0; feature_idx < num_logical_features; ++feature_idx) {
212  // Make feature ids start at 1, not 0
213  if (feature_idx < num_cat_features) {
214  const auto& col_cat_feature_keys = cat_feature_keys[feature_idx];
215  int64_t sub_feature_idx = 1;
216  for (const auto& col_cat_feature_key : col_cat_feature_keys) {
217  feature_id[physical_feature_idx] = feature_idx + 1;
218  if (feature_names[feature_idx].empty()) {
219  feature[physical_feature_idx] = inline_null_value<TextEncodingDict>();
220  } else {
221  feature[physical_feature_idx] =
222  feature.getOrAddTransient(feature_names[feature_idx]);
223  }
224  sub_feature_id[physical_feature_idx] = sub_feature_idx++;
225  const TextEncodingDict feature_sub_key =
226  sub_feature.getOrAddTransient(col_cat_feature_key);
227  sub_feature[physical_feature_idx] = feature_sub_key;
228  importance_score[physical_feature_idx] =
229  variable_importance_scores[physical_feature_idx];
230  physical_feature_idx++;
231  }
232  } else {
233  feature_id[physical_feature_idx] = feature_idx + 1;
234  if (feature_names[feature_idx].empty()) {
235  feature[physical_feature_idx] = inline_null_value<TextEncodingDict>();
236  } else {
237  feature[physical_feature_idx] =
238  feature.getOrAddTransient(feature_names[feature_idx]);
239  }
240  sub_feature_id[physical_feature_idx] = 1;
241  sub_feature[physical_feature_idx] = inline_null_value<TextEncodingDict>();
242  importance_score[physical_feature_idx] =
243  variable_importance_scores[physical_feature_idx];
244  physical_feature_idx++;
245  }
246  }
247  return num_features;
248 #endif
249  } catch (std::runtime_error& e) {
250  return mgr.ERROR_MESSAGE(e.what());
251  }
252 }
void set_output_row_size(int64_t num_rows)
Definition: heavydbTypes.h:373
std::shared_ptr< AbstractMLModel > getModel(const std::string &model_name) const
Definition: MLModel.h:51
MLModelMap g_ml_models
Definition: MLModel.h:125
std::vector< std::string > get_model_features(const std::string &model_name, const std::shared_ptr< AbstractMLModel > &model)
DEVICE const TextEncodingDict getOrAddTransient(const std::string &str)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

EXTENSION_NOINLINE_HOST int32_t random_forest_reg_var_importance__cpu_2 ( TableFunctionManager &  mgr,
const Column< TextEncodingDict > &  model_name,
Column< int64_t > &  feature_id,
Column< TextEncodingDict > &  feature,
Column< int64_t > &  sub_feature_id,
Column< TextEncodingDict > &  sub_feature,
Column< double > &  importance_score 
)

Definition at line 255 of file MLTableFunctions.cpp.

References Column< TextEncodingDict >::getString(), random_forest_reg_var_importance__cpu_1(), and Column< TextEncodingDict >::size().

261  {
262  if (model_name.size() != 1) {
263  return mgr.ERROR_MESSAGE("Expected only one row in model name CURSOR.");
264  }
265  TextEncodingNone model_name_text_enc_none(mgr, model_name.getString(0));
266  return random_forest_reg_var_importance__cpu_1(mgr,
267  model_name_text_enc_none,
268  feature_id,
269  feature,
270  sub_feature_id,
271  sub_feature,
272  importance_score);
273 }
DEVICE const std::string getString(int64_t index) const
EXTENSION_NOINLINE_HOST int32_t random_forest_reg_var_importance__cpu_1(TableFunctionManager &mgr, const TextEncodingNone &model_name, Column< int64_t > &feature_id, Column< TextEncodingDict > &feature, Column< int64_t > &sub_feature_id, Column< TextEncodingDict > &sub_feature, Column< double > &importance_score)
DEVICE int64_t size() const

+ Here is the call graph for this function:

EXTENSION_NOINLINE_HOST int32_t supported_ml_frameworks__cpu_ ( TableFunctionManager &  mgr,
Column< TextEncodingDict > &  output_ml_frameworks,
Column< bool > &  output_availability,
Column< bool > &  output_default 
)

Definition at line 8 of file MLTableFunctions.cpp.

References StringDictionaryProxy::getOrAddTransientBulk(), TableFunctionManager::set_output_row_size(), and Column< TextEncodingDict >::string_dict_proxy_.

11  {
12  const std::vector<std::string> ml_frameworks = {"oneapi", "onedal", "mlpack"};
13  const int32_t num_frameworks = ml_frameworks.size();
14  mgr.set_output_row_size(num_frameworks);
15  const std::vector<int32_t> ml_framework_string_ids =
16  output_ml_frameworks.string_dict_proxy_->getOrAddTransientBulk(ml_frameworks);
17 
18 #if defined(HAVE_ONEDAL) || defined(HAVE_MLPACK)
19  bool found_available_framework = false;
20  auto framework_found_actions = [&output_availability,
21  &output_default,
22  &found_available_framework](const int64_t out_row_idx) {
23  output_availability[out_row_idx] = true;
24  if (!found_available_framework) {
25  output_default[out_row_idx] = true;
26  found_available_framework = true;
27  } else {
28  output_default[out_row_idx] = false;
29  }
30  };
31 #endif
32 
33 #if !defined(HAVE_ONEDAL) || !defined(HAVE_MLPACK)
34  auto framework_not_found_actions = [&output_availability,
35  &output_default](const int64_t out_row_idx) {
36  output_availability[out_row_idx] = false;
37  output_default[out_row_idx] = false;
38  };
39 #endif
40 
41  for (int32_t out_row_idx = 0; out_row_idx < num_frameworks; ++out_row_idx) {
42  output_ml_frameworks[out_row_idx] = ml_framework_string_ids[out_row_idx];
43  if (ml_frameworks[out_row_idx] == "onedal" ||
44  ml_frameworks[out_row_idx] == "oneapi") {
45 #ifdef HAVE_ONEDAL
46  framework_found_actions(out_row_idx);
47 #else
48  framework_not_found_actions(out_row_idx);
49 #endif
50  } else if (ml_frameworks[out_row_idx] == "mlpack") {
51 #ifdef HAVE_MLPACK
52  framework_found_actions(out_row_idx);
53 #else
54  framework_not_found_actions(out_row_idx);
55 #endif
56  }
57  }
58  return num_frameworks;
59 }
void set_output_row_size(int64_t num_rows)
Definition: heavydbTypes.h:373
StringDictionaryProxy * string_dict_proxy_
std::vector< int32_t > getOrAddTransientBulk(const std::vector< std::string > &strings)

+ Here is the call graph for this function: