32 using namespace daal::algorithms;
33 using namespace daal::data_management;
// Prints the last Apriori large itemsets and their support values to std::cout.
//
// Both tables are read as int blocks holding (index, value) pairs: element [2 * i]
// selects the itemset and element [2 * i + 1] is the item (or the support value).
//   largeItemsetsTable        - one row per (itemset index, item) pair
//   largeItemsetsSupportTable - one row per (itemset index, support) pair
//   nItemsetToPrint           - how many trailing itemsets to print (default 20)
//
// NOTE(review): the embedded line numbers skip values (39 -> 41, 56 -> 59, 75 -> 77, ...),
// so closing braces and some statements appear to be missing from this listing -
// confirm against the original header before relying on it.
35 inline void printAprioriItemsets(
36 daal::data_management::NumericTablePtr largeItemsetsTable,
37 daal::data_management::NumericTablePtr largeItemsetsSupportTable,
38 size_t nItemsetToPrint = 20) {
39 using namespace daal::data_management;
41 size_t largeItemsetCount = largeItemsetsSupportTable->getNumberOfRows();
42 size_t nItemsInLargeItemsets = largeItemsetsTable->getNumberOfRows();
// Acquire read-only views over both tables; they are released at the end of the function.
44 BlockDescriptor<int> block1;
45 largeItemsetsTable->getBlockOfRows(0, nItemsInLargeItemsets, readOnly, block1);
46 int* largeItemsets = block1.getBlockPtr();
48 BlockDescriptor<int> block2;
49 largeItemsetsSupportTable->getBlockOfRows(0, largeItemsetCount, readOnly, block2);
50 int* largeItemsetsSupportData = block2.getBlockPtr();
// Regroup the flat (itemset index, item) pairs into one vector of items per itemset.
52 std::vector<std::vector<size_t>> largeItemsetsVector;
53 largeItemsetsVector.resize(largeItemsetCount);
55 for (
size_t i = 0; i < nItemsInLargeItemsets; i++) {
// NOTE(review): the itemset index read from the table is used unchecked as a vector index.
56 largeItemsetsVector[largeItemsets[2 * i]].push_back(largeItemsets[2 * i + 1]);
// Scatter the support values so that supportVector[k] belongs to itemset k.
59 std::vector<size_t> supportVector;
60 supportVector.resize(largeItemsetCount);
62 for (
size_t i = 0; i < largeItemsetCount; i++) {
63 supportVector[largeItemsetsSupportData[2 * i]] = largeItemsetsSupportData[2 * i + 1];
66 std::cout << std::endl <<
"Apriori example program results" << std::endl;
68 std::cout << std::endl
69 <<
"Last " << nItemsetToPrint <<
" large itemsets: " << std::endl;
70 std::cout << std::endl
72 <<
"\t\t\tSupport" << std::endl;
// First index to print: skip everything but the last nItemsetToPrint entries when there
// are more itemsets than requested and the request is non-zero.
// NOTE(review): the else-branch of this conditional is not visible here.
74 size_t iMin = (((largeItemsetCount > nItemsetToPrint) && (nItemsetToPrint != 0))
75 ? largeItemsetCount - nItemsetToPrint
77 for (
size_t i = iMin; i < largeItemsetCount; i++) {
// All items but the last are followed by ", "; the last one closes the set.
// NOTE(review): size() - 1 is unsigned and wraps if an itemset vector is empty -
// presumably every itemset has at least one item; verify.
79 for (
size_t l = 0; l < largeItemsetsVector[i].size() - 1; l++) {
80 std::cout << largeItemsetsVector[i][l] <<
", ";
82 std::cout << largeItemsetsVector[i][largeItemsetsVector[i].size() - 1] <<
"}\t\t";
84 std::cout << supportVector[i] << std::endl;
87 largeItemsetsTable->releaseBlockOfRows(block1);
88 largeItemsetsSupportTable->releaseBlockOfRows(block2);
// Prints the last Apriori association rules ("{left} => {right}") with their confidence.
//
// The left/right tables are read as int blocks of (rule index, item) pairs
// ([2 * i] is the rule, [2 * i + 1] the item); the confidence table is read as
// DAAL_DATA_TYPE with one value per rule.
//   leftItemsTable  - antecedent items
//   rightItemsTable - consequent items
//   confidenceTable - one confidence value per rule; its row count defines nRules
//   nRulesToPrint   - how many trailing rules to print (default 20)
//
// NOTE(review): the embedded line numbers skip values (114 -> 117 -> 121, 143 -> 145, ...),
// so braces, a condition and a declaration appear to be missing from this listing -
// confirm against the original header.
91 inline void printAprioriRules(daal::data_management::NumericTablePtr leftItemsTable,
92 daal::data_management::NumericTablePtr rightItemsTable,
93 daal::data_management::NumericTablePtr confidenceTable,
94 size_t nRulesToPrint = 20) {
95 using namespace daal::data_management;
97 size_t nRules = confidenceTable->getNumberOfRows();
98 size_t nLeftItems = leftItemsTable->getNumberOfRows();
99 size_t nRightItems = rightItemsTable->getNumberOfRows();
// Acquire read-only views over all three tables; they are released at the end.
101 BlockDescriptor<int> block1;
102 leftItemsTable->getBlockOfRows(0, nLeftItems, readOnly, block1);
103 int* leftItems = block1.getBlockPtr();
105 BlockDescriptor<int> block2;
106 rightItemsTable->getBlockOfRows(0, nRightItems, readOnly, block2);
107 int* rightItems = block2.getBlockPtr();
109 BlockDescriptor<DAAL_DATA_TYPE> block3;
110 confidenceTable->getBlockOfRows(0, nRules, readOnly, block3);
111 DAAL_DATA_TYPE* confidence = block3.getBlockPtr();
113 std::vector<std::vector<size_t>> leftItemsVector;
114 leftItemsVector.resize(nRules);
// NOTE(review): the condition guarding this message is not visible - presumably it is
// printed only when nRules is zero. If that path returns early, confirm that
// block1..block3 are still released.
117 std::cout << std::endl <<
"No association rules were found " << std::endl;
// Regroup the flat (rule index, item) pairs into per-rule item lists.
121 for (
size_t i = 0; i < nLeftItems; i++) {
122 leftItemsVector[leftItems[2 * i]].push_back(leftItems[2 * i + 1]);
125 std::vector<std::vector<size_t>> rightItemsVector;
126 rightItemsVector.resize(nRules);
128 for (
size_t i = 0; i < nRightItems; i++) {
129 rightItemsVector[rightItems[2 * i]].push_back(rightItems[2 * i + 1]);
// Copy the confidences out of the block so printing does not touch the raw pointer.
132 std::vector<DAAL_DATA_TYPE> confidenceVector;
133 confidenceVector.resize(nRules);
135 for (
size_t i = 0; i < nRules; i++) {
136 confidenceVector[i] = confidence[i];
139 std::cout << std::endl
140 <<
"Last " << nRulesToPrint <<
" association rules: " << std::endl;
141 std::cout << std::endl
143 <<
"\t\t\t\tConfidence" << std::endl;
// Start index for printing: nRules - nRulesToPrint when more rules exist than were
// requested and the request is non-zero, otherwise 0.
// NOTE(review): the left-hand side of this expression (presumably the iMin
// declaration used by the loop below) is not visible.
145 (((nRules > nRulesToPrint) && (nRulesToPrint != 0)) ? (nRules - nRulesToPrint) : 0);
147 for (
size_t i = iMin; i < nRules; i++) {
// NOTE(review): size() - 1 is unsigned and wraps if a rule has no left/right items -
// presumably both sides are never empty; verify.
149 for (
size_t l = 0; l < leftItemsVector[i].size() - 1; l++) {
150 std::cout << leftItemsVector[i][l] <<
", ";
152 std::cout << leftItemsVector[i][leftItemsVector[i].size() - 1] <<
"} => {";
154 for (
size_t l = 0; l < rightItemsVector[i].size() - 1; l++) {
155 std::cout << rightItemsVector[i][l] <<
", ";
157 std::cout << rightItemsVector[i][rightItemsVector[i].size() - 1] <<
"}\t\t";
159 std::cout << confidenceVector[i] << std::endl;
162 leftItemsTable->releaseBlockOfRows(block1);
163 rightItemsTable->releaseBlockOfRows(block2);
164 confidenceTable->releaseBlockOfRows(block3);
// Classifies a storage layout by testing its integer value against packed_mask.
// NOTE(review): the branch body and the return statements are not visible in this
// listing - presumably a layout with the packed bit set is reported as not full; confirm.
167 inline bool isFull(daal::data_management::NumericTableIface::StorageLayout layout) {
168 int layoutInt = (int)layout;
169 if (daal::data_management::packed_mask & layoutInt) {
// Checks whether the layout is one of the two upper-packed layouts
// (upperPackedSymmetricMatrix or upperPackedTriangularMatrix).
// NOTE(review): the return statements are not visible in this listing - presumably
// true when the condition matches and false otherwise; confirm.
175 inline bool isUpper(daal::data_management::NumericTableIface::StorageLayout layout) {
176 using daal::data_management::NumericTableIface;
178 if (layout == NumericTableIface::upperPackedSymmetricMatrix ||
179 layout == NumericTableIface::upperPackedTriangularMatrix) {
// Checks whether the layout is one of the two lower-packed layouts
// (lowerPackedSymmetricMatrix or lowerPackedTriangularMatrix).
// NOTE(review): the return statements are not visible in this listing - presumably
// true when the condition matches and false otherwise; confirm.
185 inline bool isLower(daal::data_management::NumericTableIface::StorageLayout layout) {
186 using daal::data_management::NumericTableIface;
188 if (layout == NumericTableIface::lowerPackedSymmetricMatrix ||
189 layout == NumericTableIface::lowerPackedTriangularMatrix) {
// Prints the top-left nPrintedRows x nPrintedCols corner of a row-major array to
// std::cout, preceded by `message` and followed by a blank line.
//   array        - element (i, j) is read from array[i * nCols + j]
//   nPrintedCols - number of columns to print per row
//   nPrintedRows - number of rows to print
//   interval     - field width passed to std::setw for every value (default 10)
// NOTE(review): `nCols` and `message` are used below but their parameter lines
// (between the embedded numbers 198 and 201) are not visible in this listing; the
// call in the sibling overload passes them after nPrintedRows, in that order.
195 template <
typename T>
196 inline void printArray(
T* array,
197 const size_t nPrintedCols,
198 const size_t nPrintedRows,
201 size_t interval = 10) {
// Left-align all subsequent output on std::cout (the flag is not reset afterwards).
202 std::cout << std::setiosflags(std::ios::left);
203 std::cout << message << std::endl;
204 for (
size_t i = 0; i < nPrintedRows; i++) {
205 for (
size_t j = 0; j < nPrintedCols; j++) {
// Fixed notation with 3 digits after the decimal point.
206 std::cout << std::setw(interval) << std::setiosflags(std::ios::fixed)
207 << std::setprecision(3);
// nCols is the row stride, so rows may be wider than what is printed.
208 std::cout << array[i * nCols + j];
210 std::cout << std::endl;
212 std::cout << std::endl;
// Convenience overload that prints the whole array: forwards to the strided
// printArray with nCols used both as the printed column count and as the row stride,
// and nRows as the printed row count.
// NOTE(review): the parameter lines for nCols, nRows and message (between the embedded
// numbers 216 and 220) are not visible in this listing.
215 template <
typename T>
216 inline void printArray(
T* array,
220 size_t interval = 10) {
221 printArray(array, nCols, nRows, nCols, message, interval);
// Prints a packed lower-triangular array: row i prints i + 1 consecutive values,
// consumed sequentially from `array` through the running index `ind`.
//   array        - packed values, read in order
//   nPrintedRows - number of rows to print
//   interval     - field width passed to std::setw (default 10)
// NOTE(review): the `message` parameter line and the declaration of `ind` are not
// visible in this listing - presumably ind starts at 0; confirm.
224 template <
typename T>
225 inline void printLowerArray(
T* array,
226 const size_t nPrintedRows,
228 size_t interval = 10) {
229 std::cout << std::setiosflags(std::ios::left);
230 std::cout << message << std::endl;
232 for (
size_t i = 0; i < nPrintedRows; i++) {
// Columns 0..i inclusive: the diagonal element is part of the row.
233 for (
size_t j = 0; j <= i; j++) {
234 std::cout << std::setw(interval) << std::setiosflags(std::ios::fixed)
235 << std::setprecision(3);
236 std::cout << array[ind++];
238 std::cout << std::endl;
240 std::cout << std::endl;
// Prints a packed upper-triangular array: for row i, values for columns
// i..nPrintedCols-1 are consumed sequentially from `array` through the running
// index `ind`.
//   array        - packed values, read in order
//   nPrintedCols - number of columns to print
//   nPrintedRows - number of rows to print
//   interval     - field width passed to std::setw (default 10)
// NOTE(review): the `nCols` and `message` parameter lines, the declaration of `ind`,
// and the bodies of the first and third inner loops are not visible in this listing.
243 template <
typename T>
244 inline void printUpperArray(
T* array,
245 const size_t nPrintedCols,
246 const size_t nPrintedRows,
249 size_t interval = 10) {
250 std::cout << std::setiosflags(std::ios::left);
251 std::cout << message << std::endl;
253 for (
size_t i = 0; i < nPrintedRows; i++) {
// Columns left of the diagonal (j < i).
// NOTE(review): loop body not visible - presumably emits blank padding; confirm.
254 for (
size_t j = 0; j < i; j++) {
// Stored part of the row: diagonal up to the printed column limit.
257 for (
size_t j = i; j < nPrintedCols; j++) {
258 std::cout << std::setw(interval) << std::setiosflags(std::ios::fixed)
259 << std::setprecision(3);
260 std::cout << array[ind++];
// Columns that exist in the table but are not printed.
// NOTE(review): loop body not visible - presumably advances ind past them; confirm.
262 for (
size_t j = nPrintedCols; j < nCols; j++) {
265 std::cout << std::endl;
267 std::cout << std::endl;
// Prints a numeric table to std::cout, choosing the printer from the table's layout.
//   dataTable    - table to print (dereferenced without a null check)
//   message      - header line passed through to the array printers (default "")
//   nPrintedRows - row limit; 0 means all rows, larger values are clamped to the row count
//   nPrintedCols - column limit; 0 means all columns, larger values are clamped likewise
//   interval     - field width passed through to the array printers (default 10)
// NOTE(review): the embedded line numbers skip values (282 -> 284, 298 -> 300, ...),
// so `else` keywords and closing braces appear to be missing from this listing.
270 inline void printNumericTable(daal::data_management::NumericTable* dataTable,
271 const char* message =
"",
272 size_t nPrintedRows = 0,
273 size_t nPrintedCols = 0,
274 size_t interval = 10) {
275 using namespace daal::data_management;
277 size_t nRows = dataTable->getNumberOfRows();
278 size_t nCols = dataTable->getNumberOfColumns();
279 NumericTableIface::StorageLayout layout = dataTable->getDataLayout();
// A non-zero request is clamped to the table size; the assignment that follows
// (presumably the else-branch) selects the full extent.
281 if (nPrintedRows != 0) {
282 nPrintedRows = std::min(nRows, nPrintedRows);
284 nPrintedRows = nRows;
287 if (nPrintedCols != 0) {
288 nPrintedCols = std::min(nCols, nPrintedCols);
290 nPrintedCols = nCols;
293 BlockDescriptor<DAAL_DATA_TYPE> block;
// Full and CSR layouts are read as a block of all rows and printed with nCols as stride.
294 if (isFull(layout) || layout == NumericTableIface::csrArray) {
295 dataTable->getBlockOfRows(0, nRows, readOnly, block);
296 printArray<DAAL_DATA_TYPE>(
297 block.getBlockPtr(), nPrintedCols, nPrintedRows, nCols, message, interval);
298 dataTable->releaseBlockOfRows(block);
// Remaining layouts are accessed through the packed-array interface.
// NOTE(review): the dynamic_cast result is used without a null check in the visible lines.
300 PackedArrayNumericTableIface* packedTable =
301 dynamic_cast<PackedArrayNumericTableIface*
>(dataTable);
302 packedTable->getPackedArray(readOnly, block);
303 if (isLower(layout)) {
304 printLowerArray<DAAL_DATA_TYPE>(
305 block.getBlockPtr(), nPrintedRows, message, interval);
306 }
else if (isUpper(layout)) {
307 printUpperArray<DAAL_DATA_TYPE>(
308 block.getBlockPtr(), nPrintedCols, nPrintedRows, nCols, message, interval);
310 packedTable->releasePackedArray(block);
314 inline void printNumericTable(daal::data_management::NumericTable& dataTable,
315 const char* message =
"",
316 size_t nPrintedRows = 0,
317 size_t nPrintedCols = 0,
318 size_t interval = 10) {
319 printNumericTable(&dataTable, message, nPrintedRows, nPrintedCols, interval);
322 inline void printNumericTable(
const daal::data_management::NumericTablePtr& dataTable,
323 const char* message =
"",
324 size_t nPrintedRows = 0,
325 size_t nPrintedCols = 0,
326 size_t interval = 10) {
327 printNumericTable(dataTable.get(), message, nPrintedRows, nPrintedCols, interval);
// Wraps a single caller-owned column buffer in a numeric table: `data` is registered
// as column 0 via setArray.
//   data     - pointer to the column values
//   num_rows - row count (not used in the visible lines)
// NOTE(review): the construction of `data_table` and the return statement are not
// visible in this listing (embedded numbers jump 331 -> 334).
330 template <
typename T>
331 const NumericTablePtr prepare_data_table(
const T* data,
const int64_t num_rows) {
// const is cast away because setArray is instantiated with a non-const T*.
// NOTE(review): presumably the table is only read afterwards - confirm.
334 data_table->setArray<
T>(
const_cast<T*
>(data), 0);
// Wraps several caller-owned column buffers in a numeric table: data[i] is registered
// as column i via setArray, for every entry of `data`.
//   data     - one pointer per column
//   num_rows - row count (not used in the visible lines)
// NOTE(review): the construction of `data_table` and the return statement are not
// visible in this listing (embedded numbers jump 343 -> 347).
339 template <
typename T>
340 const NumericTablePtr prepare_data_table(
const std::vector<const T*>& data,
341 const int64_t num_rows) {
343 const size_t num_columns = data.size();
347 for (
size_t i = 0; i < num_columns; ++i) {
// const is cast away because setArray is instantiated with a non-const T*.
// NOTE(review): presumably the table is only read afterwards - confirm.
348 data_table->setArray<
T>(
const_cast<T*
>(data[i]), i);
// Builds a table in which every element of `data` becomes its own column: column c is
// registered with the address data + c.
//   data      - contiguous values
//   num_elems - number of values, and therefore of columns
// NOTE(review): the construction of `data_table` (and its row count - presumably 1)
// and the return statement are not visible in this listing.
353 template <
typename T>
354 const NumericTablePtr prepare_pivoted_data_table(
const T* data,
const int64_t num_elems) {
358 for (
size_t c = 0; c < static_cast<size_t>(num_elems); ++c) {
// Each column starts one element further into the same buffer.
359 data_table->setArray<
T>(
const_cast<T*
>(data) + c, c);
// Lookup from KMeansInitStrategy to the oneDAL kmeans::init::Method.
// Throws std::runtime_error when `init_type` has no entry in the map.
// NOTE(review): the map entries, the enclosing function's signature and its return
// statement are not visible in this listing (embedded numbers jump 365 -> 371 and
// stop at 376); the error text suggests DETERMINISTIC, RANDOM and PLUS_PLUS are the
// accepted keys - confirm.
365 const static std::map<KMeansInitStrategy, kmeans::init::Method> kmeans_init_type_map = {
371 const auto itr = kmeans_init_type_map.find(init_type);
372 if (itr == kmeans_init_type_map.end()) {
373 std::ostringstream oss;
374 oss <<
"Invalid Kmeans cluster centroid initialization type. "
375 <<
"Was expecting one of DETERMINISTIC, RANDOM, or PLUS_PLUS.";
376 throw std::runtime_error(oss.str());
// Runs oneDAL k-means initialization with method M (a compile-time parameter) and
// returns the resulting centroids table.
//   input_features_table - feature table set as the initializer's data input
//   num_clusters         - cluster count passed to the initializer's constructor
// NOTE(review): embedded line 386 is missing from this listing - presumably the
// compute() call that populates the result; confirm.
381 template <
typename T, kmeans::init::Method M>
382 const NumericTablePtr init_centroids_for_type(
const NumericTablePtr& input_features_table,
383 const int32_t num_clusters) {
384 kmeans::init::Batch<T, M>
init(num_clusters);
385 init.input.set(kmeans::init::data, input_features_table);
387 return init.getResult()->get(kmeans::init::centroids);
// Bridges the runtime `init_type` value to the compile-time method parameter of
// init_centroids_for_type, one case per supported method.
//   input_features_table - feature table forwarded unchanged
//   init_type            - initialization method to dispatch on
//   num_clusters         - cluster count forwarded unchanged
// NOTE(review): the switch header (embedded line 394) and the lines around the final
// return (407-408) are not visible in this listing.
390 template <
typename T>
391 const NumericTablePtr init_centroids(
const NumericTablePtr& input_features_table,
392 const kmeans::init::Method& init_type,
393 const int32_t num_clusters) {
395 case kmeans::init::Method::deterministicDense:
396 return init_centroids_for_type<T, kmeans::init::Method::deterministicDense>(
397 input_features_table, num_clusters);
398 case kmeans::init::Method::randomDense:
399 return init_centroids_for_type<T, kmeans::init::Method::randomDense>(
400 input_features_table, num_clusters);
401 case kmeans::init::Method::plusPlusDense:
402 return init_centroids_for_type<T, kmeans::init::Method::plusPlusDense>(
403 input_features_table, num_clusters);
404 case kmeans::init::Method::parallelPlusDense:
405 return init_centroids_for_type<T, kmeans::init::Method::parallelPlusDense>(
406 input_features_table, num_clusters);
// Fallback: any other method value uses deterministic initialization.
// NOTE(review): presumably this sits in a default branch or after the switch - confirm.
409 return init_centroids_for_type<T, kmeans::init::Method::deterministicDense>(
410 input_features_table, num_clusters);
// Runs oneDAL batch k-means over column-wise features, with the result object
// pre-populated by the caller-side tables set below.
//   input_features  - one pointer per feature column
//   output_clusters - caller-provided assignment buffer (not referenced in the visible lines)
//   num_rows        - number of observations
//   num_clusters    - number of clusters
//   num_iterations  - iteration count passed to the algorithm constructor
// Any std::exception raised inside is rethrown as std::runtime_error with the same
// what() text (the original exception type is not preserved).
// NOTE(review): the remaining parameters (including `onedal_kmeans_init_type`), the
// `try` keyword, the right-hand sides of several statements, the compute call and the
// return statement are not visible in this listing.
415 template <
typename T>
416 NEVER_INLINE HOST int32_t onedal_kmeans_impl(
const std::vector<const T*>& input_features,
417 int32_t* output_clusters,
418 const int64_t num_rows,
419 const int num_clusters,
420 const int num_iterations,
423 const auto features_table = prepare_data_table(input_features, num_rows);
425 const auto centroids =
426 init_centroids<T>(features_table, onedal_kmeans_init_type, num_clusters);
// NOTE(review): initializer not visible - presumably a table wrapping output_clusters.
427 const auto assignments_table =
// Supply our own result object so the algorithm writes into the tables set on it.
429 const kmeans::ResultPtr
result(
new kmeans::Result);
430 result->set(kmeans::assignments, assignments_table);
431 result->set(kmeans::objectiveFunction,
433 result->set(kmeans::nIterations,
// NOTE(review): Batch<> uses the library's default template arguments rather than T,
// unlike kmeans::init::Batch<T, M> used for initialization - confirm this is intended.
435 kmeans::Batch<> algorithm(num_clusters, num_iterations);
436 algorithm.input.set(kmeans::data, features_table);
437 algorithm.input.set(kmeans::inputCentroids, centroids);
438 algorithm.parameter().resultsToEvaluate = kmeans::computeAssignments;
439 algorithm.setResult(
result);
441 }
catch (std::exception& e) {
442 throw std::runtime_error(e.what());
// Runs oneDAL batch DBSCAN over column-wise features, with the result object
// pre-populated by the caller-side tables set below.
//   input_features   - one pointer per feature column
//   output_clusters  - caller-provided assignment buffer (not referenced in the visible lines)
//   num_rows         - number of observations
//   epsilon          - first argument of the algorithm constructor
//   min_observations - second argument of the algorithm constructor
// Any std::exception raised inside is rethrown as std::runtime_error with the same
// what() text (the original exception type is not preserved).
// NOTE(review): the `try` keyword, the right-hand sides of some statements, the compute
// call and the return statement are not visible in this listing.
447 template <
typename T>
448 NEVER_INLINE HOST int32_t onedal_dbscan_impl(
const std::vector<const T*>& input_features,
449 int32_t* output_clusters,
450 const int64_t num_rows,
451 const double epsilon,
452 const int32_t min_observations) {
454 const auto features_table = prepare_data_table(input_features, num_rows);
// NOTE(review): initializer not visible - presumably a table wrapping output_clusters.
455 const auto assignments_table =
// Supply our own result object so the algorithm writes into the tables set on it.
457 const dbscan::ResultPtr
result(
new dbscan::Result);
458 result->set(dbscan::assignments, assignments_table);
459 result->set(dbscan::nClusters,
// NOTE(review): Batch<> uses the library's default template arguments rather than T.
461 dbscan::Batch<> algorithm(epsilon, min_observations);
462 algorithm.input.set(dbscan::data, features_table);
463 algorithm.parameter().resultsToCompute = dbscan::assignments;
464 algorithm.setResult(
result);
466 }
catch (std::exception& e) {
467 throw std::runtime_error(e.what());
// Runs oneDAL batch PCA over column-wise features and returns
// (eigenvectors, eigenvalues) copied into std::vector containers.
//   input_features - one pointer per feature column
//   num_rows       - number of observations
// Returns a pair: a num_dims x num_dims matrix of eigenvector values and a vector of
// num_dims eigenvalues, where num_dims is the row count of the eigenvectors table.
// Any std::exception raised inside is rethrown as std::runtime_error with the same
// what() text (the original exception type is not preserved).
// NOTE(review): the `try` keyword, the compute call (embedded lines 482-483) and the
// closing braces are not visible in this listing.
472 template <
typename T>
473 NEVER_INLINE HOST std::pair<std::vector<std::vector<T>>, std::vector<T>> onedal_pca_impl(
474 const std::vector<const T*>& input_features,
475 const int64_t num_rows) {
477 const auto features_table = prepare_data_table(input_features, num_rows);
// NOTE(review): Batch<> uses the library's default template arguments rather than T.
478 pca::Batch<> algorithm;
479 algorithm.input.set(pca::data, features_table);
480 algorithm.parameter.resultsToCompute = pca::mean | pca::variance | pca::eigenvalue;
481 algorithm.parameter.isDeterministic =
true;
484 pca::ResultPtr
result = algorithm.getResult();
485 const auto eigenvectors_table = result->get(pca::eigenvectors);
486 const int64_t num_dims = eigenvectors_table->getNumberOfRows();
// The copy below assumes a square table, so verify that first.
487 CHECK_EQ(num_dims, static_cast<int64_t>(eigenvectors_table->getNumberOfColumns()));
488 std::vector<std::vector<T>> eigenvectors(num_dims, std::vector<T>(num_dims));
489 for (int64_t row_idx = 0; row_idx < num_dims; ++row_idx) {
490 for (int64_t col_idx = 0; col_idx < num_dims; ++col_idx) {
// getValue is called with the column index first and the row index second.
492 eigenvectors[row_idx][col_idx] =
493 eigenvectors_table->getValue<
T>(col_idx, row_idx);
496 const auto eigenvalues_table = result->get(pca::eigenvalues);
497 std::vector<T> eigenvalues(num_dims);
498 for (int64_t dim_idx = 0; dim_idx < num_dims; ++dim_idx) {
499 eigenvalues[dim_idx] = eigenvalues_table->getValue<
T>(dim_idx, 0);
501 return std::make_pair(eigenvectors, eigenvalues);
502 }
catch (std::exception& e) {
503 throw std::runtime_error(e.what());
// Walks the columns of a coefficients table: for each column it writes the column
// index into coef_idxs and reads the value at (column coef_idx, second argument 0)
// as T.
//   coefs_table - table whose column count defines the number of coefficients
// NOTE(review): the output parameters (embedded lines 509-510, presumably `coef_idxs`
// and a coefficient buffer), the assignment target of the getValue call (line 514) and
// the return statement are not visible in this listing.
507 template <
typename T>
508 int32_t extract_model_coefs(
const NumericTablePtr& coefs_table,
511 const int64_t num_coefs = coefs_table->getNumberOfColumns();
512 for (int64_t coef_idx = 0; coef_idx < num_coefs; ++coef_idx) {
513 coef_idxs[coef_idx] = coef_idx;
// The NumericTable:: qualification selects the base-class getValue explicitly.
515 coefs_table->NumericTable::getValue<
T>(coef_idx,
static_cast<size_t>(0));
// Trains a oneDAL linear regression (qrDense method) and exports the fitted betas
// through extract_model_coefs.
//   input_labels     - dependent-variable column
//   input_features   - one pointer per feature column
//   output_coef_idxs - receives coefficient indices (passed to extract_model_coefs)
//   output_coefs     - receives coefficient values (passed to extract_model_coefs)
//   num_rows         - number of observations
// Returns whatever extract_model_coefs returns. Any std::exception raised inside is
// rethrown as std::runtime_error with the same what() text.
// NOTE(review): the return-type line (embedded 521), the `try` keyword, the name of
// the Batch variable (presumably `algorithm`) and the compute call are not visible in
// this listing.
520 template <
typename T>
522 onedal_linear_reg_fit_impl(
const T* input_labels,
523 const std::vector<const T*>& input_features,
524 int64_t* output_coef_idxs,
525 double* output_coefs,
526 const int64_t num_rows) {
528 const auto labels_table = prepare_data_table(input_labels, num_rows);
529 const auto features_table = prepare_data_table(input_features, num_rows);
531 linear_regression::training::Batch<T, linear_regression::training::Method::qrDense>
534 algorithm.input.set(linear_regression::training::data, features_table);
535 algorithm.input.set(linear_regression::training::dependentVariables, labels_table);
538 const auto training_result = algorithm.getResult();
539 const auto coefs_table =
540 training_result->get(linear_regression::training::model)->getBeta();
542 return extract_model_coefs<T>(coefs_table, output_coef_idxs, output_coefs);
543 }
catch (std::exception& e) {
544 throw std::runtime_error(e.what());
// Rebuilds a oneDAL linear-regression model from raw coefficients: the doubles are
// converted to T, wrapped in a pivoted (one column per coefficient) table, and handed
// to a ModelBuilder through setBeta.
//   model_coefs - coefficient values as doubles
//   num_coefs   - number of coefficients; the builder is constructed with num_coefs - 1
// Returns the model produced by the builder.
// NOTE(review): the function-name/return-type line (embedded 549 - the prediction code
// elsewhere in this file calls build_linear_reg_model<T> with these arguments), the
// second ModelBuilder constructor argument and the declaration of `num_betas` are not
// visible in this listing.
548 template <
typename T>
550 const double* model_coefs,
551 const int64_t num_coefs) {
// Element-wise conversion from double to T.
555 std::vector<T> casted_model_coefs(num_coefs);
556 for (int64_t coef_idx = 0; coef_idx < num_coefs; ++coef_idx) {
557 casted_model_coefs[coef_idx] = model_coefs[coef_idx];
559 const auto betas_table =
560 prepare_pivoted_data_table(casted_model_coefs.data(), num_coefs);
562 CHECK_EQ(betas_table->getNumberOfColumns(), num_coefs);
565 linear_regression::ModelBuilder<T> model_builder(num_coefs - 1,
569 BlockDescriptor<T> block_result;
573 betas_table->getBlockOfRows(0, betas_table->getNumberOfRows(), readOnly, block_result);
// Total element count of the betas table (rows * columns).
575 (betas_table->getNumberOfRows()) * (betas_table->getNumberOfColumns());
// setBeta takes an iterator range over the block's contiguous storage.
578 T* first_itr = block_result.getBlockPtr();
579 T* last_itr = first_itr + num_betas;
580 model_builder.setBeta(first_itr, last_itr);
581 betas_table->releaseBlockOfRows(block_result);
583 return model_builder.getModel();
// Runs oneDAL linear-regression prediction with a model rebuilt from stored coefficients.
//   model              - stored model providing getNumFeatures() and getCoefs()
//   input_features     - one pointer per feature column
//   output_predictions - caller-provided buffer (not referenced in the visible lines)
//   num_rows           - number of observations
// Throws std::runtime_error when the model's feature count differs from the number of
// input columns. Any std::exception raised inside is rethrown as std::runtime_error
// with the same what() text.
// NOTE(review): the return-type line (embedded 587), the `try` keyword, the
// initializer of predictions_table, the compute call and the return statement are not
// visible in this listing.
586 template <
typename T>
588 onedal_linear_reg_predict_impl(
const std::shared_ptr<LinearRegressionModel>& model,
589 const std::vector<const T*>& input_features,
590 T* output_predictions,
591 const int64_t num_rows) {
594 if (model->getNumFeatures() !=
static_cast<int64_t
>(input_features.size())) {
595 throw std::runtime_error(
596 "Number of model coefficients does not match number of input features.");
598 const auto features_table = prepare_data_table(input_features, num_rows);
// One more coefficient than features is passed - presumably the intercept; confirm.
599 const auto model_ptr =
600 build_linear_reg_model<T>(model->getCoefs().data(), input_features.size() + 1);
// NOTE(review): Batch<> uses the library's default template arguments rather than T.
602 linear_regression::prediction::Batch<> algorithm;
603 algorithm.input.set(linear_regression::prediction::data, features_table);
604 algorithm.input.set(linear_regression::prediction::model, model_ptr);
// NOTE(review): initializer not visible - presumably a table wrapping output_predictions.
606 const auto predictions_table =
// Supply our own result object so the algorithm writes into predictions_table.
609 const linear_regression::prediction::ResultPtr
result(
610 new linear_regression::prediction::Result);
611 result->set(linear_regression::prediction::prediction, predictions_table);
612 algorithm.setResult(result);
615 }
catch (std::exception& e) {
616 throw std::runtime_error(e.what());
// Trains a oneDAL decision-tree regression model and wraps the trained model in a
// DecisionTreeRegressionModel together with its metadata and categorical keys.
//   model_name                     - model identifier (not referenced in the visible lines)
//   input_labels                   - dependent-variable column
//   input_features                 - one pointer per feature column
//   model_metadata                 - passed to the model wrapper
//   cat_feature_keys               - passed to the model wrapper
//   num_rows                       - number of observations
//   max_tree_depth                 - assigned to parameter.maxTreeDepth
//   min_observations_per_leaf_node - assigned to parameter.minObservationsInLeafNodes
// Pruning is disabled. Any std::exception raised inside is rethrown as
// std::runtime_error with the same what() text.
// NOTE(review): the function-name/return-type line (embedded 621), the `try` keyword,
// the second argument of the dependentVariables input, the compute call and the use of
// `model` after construction (presumably registration under model_name) are not
// visible in this listing.
620 template <
typename T>
622 const std::string& model_name,
623 const T* input_labels,
624 const std::vector<const T*>& input_features,
625 const std::string& model_metadata,
626 const std::vector<std::vector<std::string>>& cat_feature_keys,
627 const int64_t num_rows,
628 const int64_t max_tree_depth,
629 const int64_t min_observations_per_leaf_node) {
631 const auto labels_table = prepare_data_table(input_labels, num_rows);
632 const auto features_table = prepare_data_table(input_features, num_rows);
633 decision_tree::regression::training::Batch<T> algorithm;
634 algorithm.input.set(decision_tree::regression::training::data, features_table);
635 algorithm.input.set(decision_tree::regression::training::dependentVariables,
638 algorithm.parameter.pruning = decision_tree::Pruning::none;
639 algorithm.parameter.maxTreeDepth = max_tree_depth;
640 algorithm.parameter.minObservationsInLeafNodes = min_observations_per_leaf_node;
643 decision_tree::regression::training::ResultPtr training_result =
644 algorithm.getResult();
646 auto model_ptr = training_result->get(decision_tree::regression::training::model);
647 auto model = std::make_shared<DecisionTreeRegressionModel>(
648 model_ptr, model_metadata, cat_feature_keys);
650 }
catch (std::exception& e) {
651 throw std::runtime_error(e.what());
// Trains a oneDAL gradient-boosted-trees regression model and wraps the trained model
// in a GbtRegressionModel together with its metadata and categorical keys.
//   model_name       - model identifier (not referenced in the visible lines)
//   input_labels     - dependent-variable column
//   input_features   - one pointer per feature column
//   model_metadata   - passed to the model wrapper
//   cat_feature_keys - passed to the model wrapper
//   num_rows         - number of observations
// The remaining parameters are copied one-to-one into algorithm.parameter():
// maxIterations, maxTreeDepth, shrinkage, minSplitLoss, lambda,
// observationsPerTreeFraction, featuresPerNode, minObservationsInLeafNode, maxBins
// and minBinSize.
// Any std::exception raised inside is rethrown as std::runtime_error with the same
// what() text.
// NOTE(review): the function-name/return-type line (embedded 656), the `lambda`
// parameter line (667), the `try` keyword, the compute call, the left-hand side of the
// make_shared expression and the use of the resulting model are not visible in this
// listing.
655 template <
typename T>
657 const std::string& model_name,
658 const T* input_labels,
659 const std::vector<const T*>& input_features,
660 const std::string& model_metadata,
661 const std::vector<std::vector<std::string>>& cat_feature_keys,
662 const int64_t num_rows,
663 const int64_t max_iterations,
664 const int64_t max_tree_depth,
665 const double shrinkage,
666 const double min_split_loss,
668 const double obs_per_tree_fraction,
669 const int64_t features_per_node,
670 const int64_t min_observations_per_leaf_node,
671 const int64_t max_bins,
672 const int64_t min_bin_size) {
674 const auto labels_table = prepare_data_table(input_labels, num_rows);
675 const auto features_table = prepare_data_table(input_features, num_rows);
676 gbt::regression::training::Batch<T> algorithm;
677 algorithm.input.set(gbt::regression::training::data, features_table);
678 algorithm.input.set(gbt::regression::training::dependentVariable, labels_table);
680 algorithm.parameter().maxIterations = max_iterations;
681 algorithm.parameter().maxTreeDepth = max_tree_depth;
682 algorithm.parameter().shrinkage = shrinkage;
683 algorithm.parameter().minSplitLoss = min_split_loss;
684 algorithm.parameter().lambda = lambda;
685 algorithm.parameter().observationsPerTreeFraction = obs_per_tree_fraction;
686 algorithm.parameter().featuresPerNode = features_per_node;
687 algorithm.parameter().minObservationsInLeafNode = min_observations_per_leaf_node;
688 algorithm.parameter().maxBins = max_bins;
689 algorithm.parameter().minBinSize = min_bin_size;
692 gbt::regression::training::ResultPtr training_result = algorithm.getResult();
694 auto model_ptr = training_result->get(gbt::regression::training::model);
696 std::make_shared<GbtRegressionModel>(model_ptr, model_metadata, cat_feature_keys);
698 }
catch (std::exception& e) {
699 throw std::runtime_error(e.what());
// Translates a variable-importance metric into the oneDAL
// decision_forest::training::VariableImportanceMode via a lookup map.
// Throws std::runtime_error when `var_importance_metric` has no entry in the map.
// NOTE(review): the parameter line, the map's key type and the key of every entry
// (embedded lines 704-705, 708, 710, 712, 714, 716) and the return statement are not
// visible in this listing. The visible values are, in order, MDI, none, MDI, MDA_Raw
// and MDA_Scaled; the error text lists DEFAULT, NONE, MDI, MDA, MDA_SCALED -
// presumably the keys in the same order; confirm.
703 inline decision_forest::training::VariableImportanceMode get_var_importance_metric_type(
706 decision_forest::training::VariableImportanceMode>
707 var_importance_mode_type_map = {
709 decision_forest::training::VariableImportanceMode::MDI},
711 decision_forest::training::VariableImportanceMode::none},
713 decision_forest::training::VariableImportanceMode::MDI},
715 decision_forest::training::VariableImportanceMode::MDA_Raw},
717 decision_forest::training::VariableImportanceMode::MDA_Scaled}};
719 const auto itr = var_importance_mode_type_map.find(var_importance_metric);
720 if (itr == var_importance_mode_type_map.end()) {
721 std::ostringstream oss;
722 oss <<
"Invalid variable importance mode type. "
723 <<
"Was expecting one of DEFAULT, NONE, MDI, MDA, or MDA_SCALED.";
724 throw std::runtime_error(oss.str());
// Trains a oneDAL decision-forest regression model with training method M, collects
// the per-feature variable importance, and wraps everything in a
// RandomForestRegressionModel.
//   model_name            - model identifier (not referenced in the visible lines)
//   input_labels          - dependent-variable column
//   input_features        - one pointer per feature column; its size is the feature count
//   model_metadata        - metadata string for the model wrapper
//   cat_feature_keys      - categorical keys for the model wrapper
//   num_rows              - number of observations
//   var_importance_metric - translated through get_var_importance_metric_type
// The remaining parameters are copied one-to-one into algorithm.parameter():
// nTrees, observationsPerTreeFraction, maxTreeDepth, featuresPerNode,
// impurityThreshold, bootstrap, minObservationsInLeafNode, minObservationsInSplitNode,
// minWeightFractionInLeafNode and minImpurityDecreaseInSplitNode.
// `max_leaf_nodes` is not referenced in the visible lines.
// Any std::exception raised inside is rethrown as std::runtime_error with the same
// what() text.
// NOTE(review): the function-name/return-type line (embedded 730), the `try` keyword,
// the second argument of the dependentVariable input, the compute call, the
// constructor arguments of variable_importance and of the model wrapper, and the use
// of `model` afterwards are not visible in this listing.
729 template <
typename T, decision_forest::regression::training::Method M>
731 const std::string& model_name,
732 const T* input_labels,
733 const std::vector<const T*>& input_features,
734 const std::string& model_metadata,
735 const std::vector<std::vector<std::string>>& cat_feature_keys,
736 const int64_t num_rows,
737 const int64_t num_trees,
738 const double obs_per_tree_fraction,
739 const int64_t max_tree_depth,
740 const int64_t features_per_node,
741 const double impurity_threshold,
742 const bool bootstrap,
743 const int64_t min_obs_per_leaf_node,
744 const int64_t min_obs_per_split_node,
745 const double min_weight_fraction_in_leaf_node,
746 const double min_impurity_decrease_in_split_node,
747 const int64_t max_leaf_nodes,
748 const VarImportanceMetric var_importance_metric) {
// Compile-time switch: out-of-bag error is never requested, so the branch that reads
// it further down does not execute.
749 constexpr
bool compute_out_of_bag_error{
false};
751 const auto labels_table = prepare_data_table(input_labels, num_rows);
752 const auto features_table = prepare_data_table(input_features, num_rows);
753 decision_forest::regression::training::Batch<T, M> algorithm;
754 algorithm.input.set(decision_forest::regression::training::data, features_table);
755 algorithm.input.set(decision_forest::regression::training::dependentVariable,
758 algorithm.parameter().nTrees = num_trees;
759 algorithm.parameter().observationsPerTreeFraction = obs_per_tree_fraction;
760 algorithm.parameter().maxTreeDepth = max_tree_depth;
761 algorithm.parameter().featuresPerNode = features_per_node;
762 algorithm.parameter().impurityThreshold = impurity_threshold;
763 algorithm.parameter().bootstrap = bootstrap;
764 algorithm.parameter().minObservationsInLeafNode = min_obs_per_leaf_node;
765 algorithm.parameter().minObservationsInSplitNode = min_obs_per_split_node;
766 algorithm.parameter().minWeightFractionInLeafNode = min_weight_fraction_in_leaf_node;
767 algorithm.parameter().minImpurityDecreaseInSplitNode =
768 min_impurity_decrease_in_split_node;
769 algorithm.parameter().varImportance =
770 get_var_importance_metric_type(var_importance_metric);
// With the flag above fixed to false this always assigns 0.
771 algorithm.parameter().resultsToCompute =
772 compute_out_of_bag_error ? decision_forest::training::computeOutOfBagError : 0;
775 decision_forest::regression::training::ResultPtr training_result =
776 algorithm.getResult();
778 auto model_ptr = training_result->get(decision_forest::regression::training::model);
779 auto variable_importance_table =
780 training_result->get(decision_forest::regression::training::variableImportance);
781 const size_t num_features = input_features.size();
782 std::vector<double> variable_importance(
// Copy one importance value per feature out of the result table (read as T, stored as double).
785 for (
size_t feature_idx = 0; feature_idx < num_features; ++feature_idx) {
786 variable_importance[feature_idx] =
787 variable_importance_table->NumericTable::getValue<
T>(feature_idx, size_t(0));
790 double out_of_bag_error{0};
791 if (compute_out_of_bag_error) {
792 auto out_of_bag_error_table =
793 training_result->get(decision_forest::regression::training::outOfBagError);
// NOTE(review): the assignment target (embedded line 794) is not visible -
// presumably out_of_bag_error; confirm.
795 out_of_bag_error_table->NumericTable::getValue<
T>(0,
static_cast<size_t>(0));
797 auto model = std::make_shared<RandomForestRegressionModel>(model_ptr,
803 }
catch (std::exception& e) {
804 throw std::runtime_error(e.what());
// Runs oneDAL decision-tree regression prediction with a stored model.
//   model              - stored model providing getNumFeatures() and getModelPtr()
//   input_features     - one pointer per feature column
//   output_predictions - caller-provided buffer (not referenced in the visible lines)
//   num_rows           - number of observations
// Throws std::runtime_error when the model's feature count differs from the number of
// input columns. Any std::exception raised inside is rethrown as std::runtime_error
// with the same what() text.
// NOTE(review): the function-name/return-type line (embedded 809), the `try` keyword,
// the initializer of predictions_table, the compute call and the return statement are
// not visible in this listing.
808 template <
typename T>
810 const std::shared_ptr<DecisionTreeRegressionModel>& model,
811 const std::vector<const T*>& input_features,
812 T* output_predictions,
813 const int64_t num_rows) {
816 if (model->getNumFeatures() !=
static_cast<int64_t
>(input_features.size())) {
817 throw std::runtime_error(
"Number of provided features does not match model.");
819 const auto features_table = prepare_data_table(input_features, num_rows);
820 decision_tree::regression::prediction::Batch<T> algorithm;
821 algorithm.input.set(decision_tree::regression::prediction::data, features_table);
822 algorithm.input.set(decision_tree::regression::prediction::model,
823 model->getModelPtr());
// NOTE(review): initializer not visible - presumably a table wrapping output_predictions.
825 const auto predictions_table =
// Supply our own result object so the algorithm writes into predictions_table.
828 const decision_tree::regression::prediction::ResultPtr
result(
829 new decision_tree::regression::prediction::Result);
830 result->set(decision_tree::regression::prediction::prediction, predictions_table);
831 algorithm.setResult(result);
834 }
catch (std::exception& e) {
835 throw std::runtime_error(e.what());
// Runs oneDAL gradient-boosted-trees regression prediction with a stored model.
//   model              - stored model providing getNumFeatures() and getModelPtr()
//   input_features     - one pointer per feature column
//   output_predictions - caller-provided buffer (not referenced in the visible lines)
//   num_rows           - number of observations
// Throws std::runtime_error when the model's feature count differs from the number of
// input columns. Any std::exception raised inside is rethrown as std::runtime_error
// with the same what() text.
// NOTE(review): the return-type line (embedded 840), the `try` keyword, the
// initializer of predictions_table, the compute call and the return statement are not
// visible in this listing.
839 template <
typename T>
841 onedal_gbt_reg_predict_impl(
const std::shared_ptr<GbtRegressionModel>& model,
842 const std::vector<const T*>& input_features,
843 T* output_predictions,
844 const int64_t num_rows) {
847 if (model->getNumFeatures() !=
static_cast<int64_t
>(input_features.size())) {
848 throw std::runtime_error(
"Number of provided features does not match model.");
850 const auto features_table = prepare_data_table(input_features, num_rows);
851 gbt::regression::prediction::Batch<T> algorithm;
852 algorithm.input.set(gbt::regression::prediction::data, features_table);
853 algorithm.input.set(gbt::regression::prediction::model, model->getModelPtr());
// NOTE(review): initializer not visible - presumably a table wrapping output_predictions.
855 const auto predictions_table =
// Supply our own result object so the algorithm writes into predictions_table.
858 const gbt::regression::prediction::ResultPtr
result(
859 new gbt::regression::prediction::Result);
860 result->set(gbt::regression::prediction::prediction, predictions_table);
861 algorithm.setResult(result);
864 }
catch (std::exception& e) {
865 throw std::runtime_error(e.what());
// Runs oneDAL decision-forest regression prediction with a stored model.
//   model              - stored model providing getNumFeatures() and getModelPtr()
//   input_features     - one pointer per feature column
//   output_predictions - caller-provided buffer (not referenced in the visible lines)
//   num_rows           - number of observations
// Throws std::runtime_error when the model's feature count differs from the number of
// input columns. Any std::exception raised inside is rethrown as std::runtime_error
// with the same what() text.
// NOTE(review): the function-name/return-type line (embedded 870), the `try` keyword,
// the initializer of predictions_table, the compute call and the return statement are
// not visible in this listing.
869 template <
typename T>
871 const std::shared_ptr<RandomForestRegressionModel>& model,
872 const std::vector<const T*>& input_features,
873 T* output_predictions,
874 const int64_t num_rows) {
877 if (model->getNumFeatures() !=
static_cast<int64_t
>(input_features.size())) {
878 throw std::runtime_error(
"Number of provided features does not match model.");
880 const auto features_table = prepare_data_table(input_features, num_rows);
881 decision_forest::regression::prediction::Batch<T> algorithm;
882 algorithm.input.set(decision_forest::regression::prediction::data, features_table);
883 algorithm.input.set(decision_forest::regression::prediction::model,
884 model->getModelPtr());
// NOTE(review): initializer not visible - presumably a table wrapping output_predictions.
886 const auto predictions_table =
// Supply our own result object so the algorithm writes into predictions_table.
889 const decision_forest::regression::prediction::ResultPtr
result(
890 new decision_forest::regression::prediction::Result);
891 result->set(decision_forest::regression::prediction::prediction, predictions_table);
892 algorithm.setResult(result);
896 }
catch (std::exception& e) {
897 throw std::runtime_error(e.what());
901 #endif // #ifdef HAVE_ONEDAL
902 #endif // #ifdef __CUDACC__
KMeansInitStrategy get_kmeans_init_type(const std::string &init_type_str)
std::pair< FILE *, std::string > create(const std::string &basePath, const int fileId, const size_t pageSize, const size_t numPages)
void init(LogOptions const &log_opts)
void addModel(const std::string &model_name, std::shared_ptr< AbstractMLModel > model)