OmniSciDB  a5dc49c757
OneDalFunctions.hpp
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #ifndef __CUDACC__
20 #ifdef HAVE_ONEDAL
21 
22 #include <cstring>
23 
24 #include "MLModel.h"
27 #include "daal.h"
28 
29 #include <iomanip>
30 #include <iostream>
31 
32 using namespace daal::algorithms;
33 using namespace daal::data_management;
34 
35 inline void printAprioriItemsets(
36  daal::data_management::NumericTablePtr largeItemsetsTable,
37  daal::data_management::NumericTablePtr largeItemsetsSupportTable,
38  size_t nItemsetToPrint = 20) {
39  using namespace daal::data_management;
40 
41  size_t largeItemsetCount = largeItemsetsSupportTable->getNumberOfRows();
42  size_t nItemsInLargeItemsets = largeItemsetsTable->getNumberOfRows();
43 
44  BlockDescriptor<int> block1;
45  largeItemsetsTable->getBlockOfRows(0, nItemsInLargeItemsets, readOnly, block1);
46  int* largeItemsets = block1.getBlockPtr();
47 
48  BlockDescriptor<int> block2;
49  largeItemsetsSupportTable->getBlockOfRows(0, largeItemsetCount, readOnly, block2);
50  int* largeItemsetsSupportData = block2.getBlockPtr();
51 
52  std::vector<std::vector<size_t>> largeItemsetsVector;
53  largeItemsetsVector.resize(largeItemsetCount);
54 
55  for (size_t i = 0; i < nItemsInLargeItemsets; i++) {
56  largeItemsetsVector[largeItemsets[2 * i]].push_back(largeItemsets[2 * i + 1]);
57  }
58 
59  std::vector<size_t> supportVector;
60  supportVector.resize(largeItemsetCount);
61 
62  for (size_t i = 0; i < largeItemsetCount; i++) {
63  supportVector[largeItemsetsSupportData[2 * i]] = largeItemsetsSupportData[2 * i + 1];
64  }
65 
66  std::cout << std::endl << "Apriori example program results" << std::endl;
67 
68  std::cout << std::endl
69  << "Last " << nItemsetToPrint << " large itemsets: " << std::endl;
70  std::cout << std::endl
71  << "Itemset"
72  << "\t\t\tSupport" << std::endl;
73 
74  size_t iMin = (((largeItemsetCount > nItemsetToPrint) && (nItemsetToPrint != 0))
75  ? largeItemsetCount - nItemsetToPrint
76  : 0);
77  for (size_t i = iMin; i < largeItemsetCount; i++) {
78  std::cout << "{";
79  for (size_t l = 0; l < largeItemsetsVector[i].size() - 1; l++) {
80  std::cout << largeItemsetsVector[i][l] << ", ";
81  }
82  std::cout << largeItemsetsVector[i][largeItemsetsVector[i].size() - 1] << "}\t\t";
83 
84  std::cout << supportVector[i] << std::endl;
85  }
86 
87  largeItemsetsTable->releaseBlockOfRows(block1);
88  largeItemsetsSupportTable->releaseBlockOfRows(block2);
89 }
90 
91 inline void printAprioriRules(daal::data_management::NumericTablePtr leftItemsTable,
92  daal::data_management::NumericTablePtr rightItemsTable,
93  daal::data_management::NumericTablePtr confidenceTable,
94  size_t nRulesToPrint = 20) {
95  using namespace daal::data_management;
96 
97  size_t nRules = confidenceTable->getNumberOfRows();
98  size_t nLeftItems = leftItemsTable->getNumberOfRows();
99  size_t nRightItems = rightItemsTable->getNumberOfRows();
100 
101  BlockDescriptor<int> block1;
102  leftItemsTable->getBlockOfRows(0, nLeftItems, readOnly, block1);
103  int* leftItems = block1.getBlockPtr();
104 
105  BlockDescriptor<int> block2;
106  rightItemsTable->getBlockOfRows(0, nRightItems, readOnly, block2);
107  int* rightItems = block2.getBlockPtr();
108 
109  BlockDescriptor<DAAL_DATA_TYPE> block3;
110  confidenceTable->getBlockOfRows(0, nRules, readOnly, block3);
111  DAAL_DATA_TYPE* confidence = block3.getBlockPtr();
112 
113  std::vector<std::vector<size_t>> leftItemsVector;
114  leftItemsVector.resize(nRules);
115 
116  if (nRules == 0) {
117  std::cout << std::endl << "No association rules were found " << std::endl;
118  return;
119  }
120 
121  for (size_t i = 0; i < nLeftItems; i++) {
122  leftItemsVector[leftItems[2 * i]].push_back(leftItems[2 * i + 1]);
123  }
124 
125  std::vector<std::vector<size_t>> rightItemsVector;
126  rightItemsVector.resize(nRules);
127 
128  for (size_t i = 0; i < nRightItems; i++) {
129  rightItemsVector[rightItems[2 * i]].push_back(rightItems[2 * i + 1]);
130  }
131 
132  std::vector<DAAL_DATA_TYPE> confidenceVector;
133  confidenceVector.resize(nRules);
134 
135  for (size_t i = 0; i < nRules; i++) {
136  confidenceVector[i] = confidence[i];
137  }
138 
139  std::cout << std::endl
140  << "Last " << nRulesToPrint << " association rules: " << std::endl;
141  std::cout << std::endl
142  << "Rule"
143  << "\t\t\t\tConfidence" << std::endl;
144  size_t iMin =
145  (((nRules > nRulesToPrint) && (nRulesToPrint != 0)) ? (nRules - nRulesToPrint) : 0);
146 
147  for (size_t i = iMin; i < nRules; i++) {
148  std::cout << "{";
149  for (size_t l = 0; l < leftItemsVector[i].size() - 1; l++) {
150  std::cout << leftItemsVector[i][l] << ", ";
151  }
152  std::cout << leftItemsVector[i][leftItemsVector[i].size() - 1] << "} => {";
153 
154  for (size_t l = 0; l < rightItemsVector[i].size() - 1; l++) {
155  std::cout << rightItemsVector[i][l] << ", ";
156  }
157  std::cout << rightItemsVector[i][rightItemsVector[i].size() - 1] << "}\t\t";
158 
159  std::cout << confidenceVector[i] << std::endl;
160  }
161 
162  leftItemsTable->releaseBlockOfRows(block1);
163  rightItemsTable->releaseBlockOfRows(block2);
164  confidenceTable->releaseBlockOfRows(block3);
165 }
166 
167 inline bool isFull(daal::data_management::NumericTableIface::StorageLayout layout) {
168  int layoutInt = (int)layout;
169  if (daal::data_management::packed_mask & layoutInt) {
170  return false;
171  }
172  return true;
173 }
174 
175 inline bool isUpper(daal::data_management::NumericTableIface::StorageLayout layout) {
176  using daal::data_management::NumericTableIface;
177 
178  if (layout == NumericTableIface::upperPackedSymmetricMatrix ||
179  layout == NumericTableIface::upperPackedTriangularMatrix) {
180  return true;
181  }
182  return false;
183 }
184 
185 inline bool isLower(daal::data_management::NumericTableIface::StorageLayout layout) {
186  using daal::data_management::NumericTableIface;
187 
188  if (layout == NumericTableIface::lowerPackedSymmetricMatrix ||
189  layout == NumericTableIface::lowerPackedTriangularMatrix) {
190  return true;
191  }
192  return false;
193 }
194 
195 template <typename T>
196 inline void printArray(T* array,
197  const size_t nPrintedCols,
198  const size_t nPrintedRows,
199  const size_t nCols,
200  std::string message,
201  size_t interval = 10) {
202  std::cout << std::setiosflags(std::ios::left);
203  std::cout << message << std::endl;
204  for (size_t i = 0; i < nPrintedRows; i++) {
205  for (size_t j = 0; j < nPrintedCols; j++) {
206  std::cout << std::setw(interval) << std::setiosflags(std::ios::fixed)
207  << std::setprecision(3);
208  std::cout << array[i * nCols + j];
209  }
210  std::cout << std::endl;
211  }
212  std::cout << std::endl;
213 }
214 
215 template <typename T>
216 inline void printArray(T* array,
217  const size_t nCols,
218  const size_t nRows,
219  std::string message,
220  size_t interval = 10) {
221  printArray(array, nCols, nRows, nCols, message, interval);
222 }
223 
224 template <typename T>
225 inline void printLowerArray(T* array,
226  const size_t nPrintedRows,
227  std::string message,
228  size_t interval = 10) {
229  std::cout << std::setiosflags(std::ios::left);
230  std::cout << message << std::endl;
231  int ind = 0;
232  for (size_t i = 0; i < nPrintedRows; i++) {
233  for (size_t j = 0; j <= i; j++) {
234  std::cout << std::setw(interval) << std::setiosflags(std::ios::fixed)
235  << std::setprecision(3);
236  std::cout << array[ind++];
237  }
238  std::cout << std::endl;
239  }
240  std::cout << std::endl;
241 }
242 
243 template <typename T>
244 inline void printUpperArray(T* array,
245  const size_t nPrintedCols,
246  const size_t nPrintedRows,
247  const size_t nCols,
248  std::string message,
249  size_t interval = 10) {
250  std::cout << std::setiosflags(std::ios::left);
251  std::cout << message << std::endl;
252  int ind = 0;
253  for (size_t i = 0; i < nPrintedRows; i++) {
254  for (size_t j = 0; j < i; j++) {
255  std::cout << " ";
256  }
257  for (size_t j = i; j < nPrintedCols; j++) {
258  std::cout << std::setw(interval) << std::setiosflags(std::ios::fixed)
259  << std::setprecision(3);
260  std::cout << array[ind++];
261  }
262  for (size_t j = nPrintedCols; j < nCols; j++) {
263  ind++;
264  }
265  std::cout << std::endl;
266  }
267  std::cout << std::endl;
268 }
269 
270 inline void printNumericTable(daal::data_management::NumericTable* dataTable,
271  const char* message = "",
272  size_t nPrintedRows = 0,
273  size_t nPrintedCols = 0,
274  size_t interval = 10) {
275  using namespace daal::data_management;
276 
277  size_t nRows = dataTable->getNumberOfRows();
278  size_t nCols = dataTable->getNumberOfColumns();
279  NumericTableIface::StorageLayout layout = dataTable->getDataLayout();
280 
281  if (nPrintedRows != 0) {
282  nPrintedRows = std::min(nRows, nPrintedRows);
283  } else {
284  nPrintedRows = nRows;
285  }
286 
287  if (nPrintedCols != 0) {
288  nPrintedCols = std::min(nCols, nPrintedCols);
289  } else {
290  nPrintedCols = nCols;
291  }
292 
293  BlockDescriptor<DAAL_DATA_TYPE> block;
294  if (isFull(layout) || layout == NumericTableIface::csrArray) {
295  dataTable->getBlockOfRows(0, nRows, readOnly, block);
296  printArray<DAAL_DATA_TYPE>(
297  block.getBlockPtr(), nPrintedCols, nPrintedRows, nCols, message, interval);
298  dataTable->releaseBlockOfRows(block);
299  } else {
300  PackedArrayNumericTableIface* packedTable =
301  dynamic_cast<PackedArrayNumericTableIface*>(dataTable);
302  packedTable->getPackedArray(readOnly, block);
303  if (isLower(layout)) {
304  printLowerArray<DAAL_DATA_TYPE>(
305  block.getBlockPtr(), nPrintedRows, message, interval);
306  } else if (isUpper(layout)) {
307  printUpperArray<DAAL_DATA_TYPE>(
308  block.getBlockPtr(), nPrintedCols, nPrintedRows, nCols, message, interval);
309  }
310  packedTable->releasePackedArray(block);
311  }
312 }
313 
314 inline void printNumericTable(daal::data_management::NumericTable& dataTable,
315  const char* message = "",
316  size_t nPrintedRows = 0,
317  size_t nPrintedCols = 0,
318  size_t interval = 10) {
319  printNumericTable(&dataTable, message, nPrintedRows, nPrintedCols, interval);
320 }
321 
322 inline void printNumericTable(const daal::data_management::NumericTablePtr& dataTable,
323  const char* message = "",
324  size_t nPrintedRows = 0,
325  size_t nPrintedCols = 0,
326  size_t interval = 10) {
327  printNumericTable(dataTable.get(), message, nPrintedRows, nPrintedCols, interval);
328 }
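// Illustrative usage sketch (hypothetical table variable; not part of the
// original header): print the first five rows of a NumericTablePtr with a label.
//
//   printNumericTable(centroids_table, "Cluster centroids:", /* nPrintedRows = */ 5);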
329 
330 template <typename T>
331 const NumericTablePtr prepare_data_table(const T* data, const int64_t num_rows) {
332  // Prepare input data as a structure-of-arrays (SOA) columnar table (zero-copy)
333  const auto data_table = SOANumericTable::create(1 /* num_columns */, num_rows);
334  data_table->setArray<T>(const_cast<T*>(data), 0);
335 
336  return data_table;
337 }
338 
339 template <typename T>
340 const NumericTablePtr prepare_data_table(const std::vector<const T*>& data,
341  const int64_t num_rows) {
342  // Data dimensions
343  const size_t num_columns = data.size();
344 
345  // Prepare input data as a structure-of-arrays (SOA) columnar table (zero-copy)
346  const auto data_table = SOANumericTable::create(num_columns, num_rows);
347  for (size_t i = 0; i < num_columns; ++i) {
348  data_table->setArray<T>(const_cast<T*>(data[i]), i);
349  }
350  return data_table;
351 }
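// Illustrative usage sketch (not part of the original header; `col_x` and
// `col_y` are hypothetical column buffers that must outlive the table):
//
//   std::vector<double> col_x = {1.0, 2.0, 3.0};
//   std::vector<double> col_y = {4.0, 5.0, 6.0};
//   std::vector<const double*> columns{col_x.data(), col_y.data()};
//   const auto table = prepare_data_table(columns, /* num_rows = */ 3);
//   // `table` is a zero-copy SOA NumericTable with 2 columns and 3 rows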
352 
353 template <typename T>
354 const NumericTablePtr prepare_pivoted_data_table(const T* data, const int64_t num_elems) {
355  // Prepare the pivoted input as a structure-of-arrays (SOA) columnar table
356  // (zero-copy), with one single-row column per input element
357  const auto data_table = SOANumericTable::create(num_elems, 1);
358  for (size_t c = 0; c < static_cast<size_t>(num_elems); ++c) {
359  data_table->setArray<T>(const_cast<T*>(data) + c, c);
360  }
361  return data_table;
362 }
363 
364 inline kmeans::init::Method get_kmeans_init_type(const KMeansInitStrategy init_type) {
365  const static std::map<KMeansInitStrategy, kmeans::init::Method> kmeans_init_type_map = {
366  {KMeansInitStrategy::DEFAULT, kmeans::init::Method::deterministicDense},
367  {KMeansInitStrategy::DETERMINISTIC, kmeans::init::Method::deterministicDense},
368  {KMeansInitStrategy::RANDOM, kmeans::init::Method::randomDense},
369  {KMeansInitStrategy::PLUS_PLUS, kmeans::init::Method::parallelPlusDense}};
370 
371  const auto itr = kmeans_init_type_map.find(init_type);
372  if (itr == kmeans_init_type_map.end()) {
373  std::ostringstream oss;
374  oss << "Invalid Kmeans cluster centroid initialization type. "
375  << "Was expecting one of DETERMINISTIC, RANDOM, or PLUS_PLUS.";
376  throw std::runtime_error(oss.str());
377  }
378  return itr->second;
379 }
380 
381 template <typename T, kmeans::init::Method M>
382 const NumericTablePtr init_centroids_for_type(const NumericTablePtr& input_features_table,
383  const int32_t num_clusters) {
384  kmeans::init::Batch<T, M> init(num_clusters);
385  init.input.set(kmeans::init::data, input_features_table);
386  init.compute();
387  return init.getResult()->get(kmeans::init::centroids);
388 }
389 
390 template <typename T>
391 const NumericTablePtr init_centroids(const NumericTablePtr& input_features_table,
392  const kmeans::init::Method& init_type,
393  const int32_t num_clusters) {
394  switch (init_type) {
395  case kmeans::init::Method::deterministicDense:
396  return init_centroids_for_type<T, kmeans::init::Method::deterministicDense>(
397  input_features_table, num_clusters);
398  case kmeans::init::Method::randomDense:
399  return init_centroids_for_type<T, kmeans::init::Method::randomDense>(
400  input_features_table, num_clusters);
401  case kmeans::init::Method::plusPlusDense:
402  return init_centroids_for_type<T, kmeans::init::Method::plusPlusDense>(
403  input_features_table, num_clusters);
404  case kmeans::init::Method::parallelPlusDense:
405  return init_centroids_for_type<T, kmeans::init::Method::parallelPlusDense>(
406  input_features_table, num_clusters);
407  default: {
408  UNREACHABLE();
409  return init_centroids_for_type<T, kmeans::init::Method::deterministicDense>(
410  input_features_table, num_clusters);
411  }
412  }
413 }
414 
415 template <typename T>
416 NEVER_INLINE HOST int32_t onedal_kmeans_impl(const std::vector<const T*>& input_features,
417  int32_t* output_clusters,
418  const int64_t num_rows,
419  const int num_clusters,
420  const int num_iterations,
421  const KMeansInitStrategy kmeans_init_type) {
422  try {
423  const auto features_table = prepare_data_table(input_features, num_rows);
424  const auto onedal_kmeans_init_type = get_kmeans_init_type(kmeans_init_type);
425  const auto centroids =
426  init_centroids<T>(features_table, onedal_kmeans_init_type, num_clusters);
427  const auto assignments_table =
428  HomogenNumericTable<int32_t>::create(output_clusters, 1, num_rows);
429  const kmeans::ResultPtr result(new kmeans::Result);
430  result->set(kmeans::assignments, assignments_table);
431  result->set(kmeans::objectiveFunction,
432  HomogenNumericTable<T>::create(1, 1, NumericTable::doAllocate));
433  result->set(kmeans::nIterations,
434  HomogenNumericTable<int>::create(1, 1, NumericTable::doAllocate));
435  kmeans::Batch<> algorithm(num_clusters, num_iterations);
436  algorithm.input.set(kmeans::data, features_table);
437  algorithm.input.set(kmeans::inputCentroids, centroids);
438  algorithm.parameter().resultsToEvaluate = kmeans::computeAssignments;
439  algorithm.setResult(result);
440  algorithm.compute();
441  } catch (std::exception& e) {
442  throw std::runtime_error(e.what());
443  }
444  return num_rows;
445 }
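// Illustrative usage sketch (hypothetical single-feature input; not part of
// the original header):
//
//   std::vector<float> col = {0.0f, 0.1f, 10.0f, 10.2f};
//   std::vector<const float*> features{col.data()};
//   std::vector<int32_t> assignments(col.size());
//   onedal_kmeans_impl<float>(features,
//                             assignments.data(),
//                             static_cast<int64_t>(col.size()),
//                             /* num_clusters = */ 2,
//                             /* num_iterations = */ 10,
//                             KMeansInitStrategy::DETERMINISTIC);
//   // `assignments` now holds one cluster id per input row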
446 
447 template <typename T>
448 NEVER_INLINE HOST int32_t onedal_dbscan_impl(const std::vector<const T*>& input_features,
449  int32_t* output_clusters,
450  const int64_t num_rows,
451  const double epsilon,
452  const int32_t min_observations) {
453  try {
454  const auto features_table = prepare_data_table(input_features, num_rows);
455  const auto assignments_table =
456  HomogenNumericTable<int32_t>::create(output_clusters, 1, num_rows);
457  const dbscan::ResultPtr result(new dbscan::Result);
458  result->set(dbscan::assignments, assignments_table);
459  result->set(dbscan::nClusters,
460  HomogenNumericTable<int>::create(1, 1, NumericTable::doAllocate));
461  dbscan::Batch<> algorithm(epsilon, min_observations);
462  algorithm.input.set(dbscan::data, features_table);
463  algorithm.parameter().resultsToCompute = dbscan::assignments;
464  algorithm.setResult(result);
465  algorithm.compute();
466  } catch (std::exception& e) {
467  throw std::runtime_error(e.what());
468  }
469  return num_rows;
470 }
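// Illustrative usage sketch (hypothetical two-feature input; not part of the
// original header):
//
//   std::vector<double> x = {0.0, 0.1, 5.0, 5.1};
//   std::vector<double> y = {0.0, 0.2, 5.0, 5.2};
//   std::vector<const double*> features{x.data(), y.data()};
//   std::vector<int32_t> assignments(x.size());
//   onedal_dbscan_impl<double>(features,
//                              assignments.data(),
//                              static_cast<int64_t>(x.size()),
//                              /* epsilon = */ 0.5,
//                              /* min_observations = */ 2);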
471 
472 template <typename T>
473 NEVER_INLINE HOST std::pair<std::vector<std::vector<T>>, std::vector<T>> onedal_pca_impl(
474  const std::vector<const T*>& input_features,
475  const int64_t num_rows) {
476  try {
477  const auto features_table = prepare_data_table(input_features, num_rows);
478  pca::Batch<> algorithm;
479  algorithm.input.set(pca::data, features_table);
480  algorithm.parameter.resultsToCompute = pca::mean | pca::variance | pca::eigenvalue;
481  algorithm.parameter.isDeterministic = true;
482 
483  algorithm.compute();
484  pca::ResultPtr result = algorithm.getResult();
485  const auto eigenvectors_table = result->get(pca::eigenvectors);
486  const int64_t num_dims = eigenvectors_table->getNumberOfRows();
487  CHECK_EQ(num_dims, static_cast<int64_t>(eigenvectors_table->getNumberOfColumns()));
488  std::vector<std::vector<T>> eigenvectors(num_dims, std::vector<T>(num_dims));
489  for (int64_t row_idx = 0; row_idx < num_dims; ++row_idx) {
490  for (int64_t col_idx = 0; col_idx < num_dims; ++col_idx) {
491  // eigenvectors_table is column major, so we need to flip the lookup indices
492  eigenvectors[row_idx][col_idx] =
493  eigenvectors_table->getValue<T>(col_idx, row_idx);
494  }
495  }
496  const auto eigenvalues_table = result->get(pca::eigenvalues);
497  std::vector<T> eigenvalues(num_dims);
498  for (int64_t dim_idx = 0; dim_idx < num_dims; ++dim_idx) {
499  eigenvalues[dim_idx] = eigenvalues_table->getValue<T>(dim_idx, 0);
500  }
501  return std::make_pair(eigenvectors, eigenvalues);
502  } catch (std::exception& e) {
503  throw std::runtime_error(e.what());
504  }
505 }
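// Illustrative usage sketch (hypothetical column buffers `col_a`/`col_b` and
// row count; not part of the original header). The returned pair holds the
// eigenvectors (a num_features x num_features matrix) and one eigenvalue per
// dimension:
//
//   std::vector<const double*> features{col_a.data(), col_b.data()};
//   const auto [eigenvectors, eigenvalues] =
//       onedal_pca_impl<double>(features, num_rows);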
506 
507 template <typename T>
508 int32_t extract_model_coefs(const NumericTablePtr& coefs_table,
509  int64_t* coef_idxs,
510  double* coefs) {
511  const int64_t num_coefs = coefs_table->getNumberOfColumns();
512  for (int64_t coef_idx = 0; coef_idx < num_coefs; ++coef_idx) {
513  coef_idxs[coef_idx] = coef_idx;
514  coefs[coef_idx] =
515  coefs_table->NumericTable::getValue<T>(coef_idx, static_cast<size_t>(0));
516  }
517  return num_coefs;
518 }
519 
520 template <typename T>
521 NEVER_INLINE HOST int32_t
522 onedal_linear_reg_fit_impl(const T* input_labels,
523  const std::vector<const T*>& input_features,
524  int64_t* output_coef_idxs,
525  double* output_coefs,
526  const int64_t num_rows) {
527  try {
528  const auto labels_table = prepare_data_table(input_labels, num_rows);
529  const auto features_table = prepare_data_table(input_features, num_rows);
530 
531  linear_regression::training::Batch<T, linear_regression::training::Method::qrDense>
532  algorithm;
533 
534  algorithm.input.set(linear_regression::training::data, features_table);
535  algorithm.input.set(linear_regression::training::dependentVariables, labels_table);
536 
537  algorithm.compute();
538  const auto training_result = algorithm.getResult();
539  const auto coefs_table =
540  training_result->get(linear_regression::training::model)->getBeta();
541 
542  return extract_model_coefs<T>(coefs_table, output_coef_idxs, output_coefs);
543  } catch (std::exception& e) {
544  throw std::runtime_error(e.what());
545  }
546 }
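// Illustrative usage sketch (hypothetical `labels`/`features` buffers; not
// part of the original header). The output arrays need room for
// input_features.size() + 1 entries (the intercept beta plus one coefficient
// per feature):
//
//   std::vector<int64_t> coef_idxs(features.size() + 1);
//   std::vector<double> coefs(features.size() + 1);
//   const int32_t num_coefs = onedal_linear_reg_fit_impl<double>(
//       labels.data(), features, coef_idxs.data(), coefs.data(), num_rows);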
547 
548 template <typename T>
549 NEVER_INLINE HOST linear_regression::ModelPtr build_linear_reg_model(
550  const double* model_coefs,
551  const int64_t num_coefs) {
552  // See onedal_linear_reg_fit_impl above: we need to unpivot the stored model
553  // coefficients back into the native format oneDAL expects, with one column
554  // per beta
555  std::vector<T> casted_model_coefs(num_coefs);
556  for (int64_t coef_idx = 0; coef_idx < num_coefs; ++coef_idx) {
557  casted_model_coefs[coef_idx] = model_coefs[coef_idx];
558  }
559  const auto betas_table =
560  prepare_pivoted_data_table(casted_model_coefs.data(), num_coefs);
561 
562  CHECK_EQ(betas_table->getNumberOfColumns(), num_coefs);
563 
564  // Create model builder with true intercept flag
565  linear_regression::ModelBuilder<T> model_builder(num_coefs - 1,
566  1 /* num_dependent_variables */);
567 
568  // Retrieve a pointer to the beginning of betas_table
569  BlockDescriptor<T> block_result;
570 
571  // Use generic code to get start and end iterators for the betas table, even though we
572  // currently only support the case of one dependent variable (i.e. one row in the betas table)
573  betas_table->getBlockOfRows(0, betas_table->getNumberOfRows(), readOnly, block_result);
574  size_t num_betas =
575  (betas_table->getNumberOfRows()) * (betas_table->getNumberOfColumns());
576 
577  // Initialize iterators for the beta array, including the intercept
578  T* first_itr = block_result.getBlockPtr();
579  T* last_itr = first_itr + num_betas;
580  model_builder.setBeta(first_itr, last_itr);
581  betas_table->releaseBlockOfRows(block_result);
582 
583  return model_builder.getModel();
584 }
585 
586 template <typename T>
587 NEVER_INLINE HOST int32_t
588 onedal_linear_reg_predict_impl(const std::shared_ptr<LinearRegressionModel>& model,
589  const std::vector<const T*>& input_features,
590  T* output_predictions,
591  const int64_t num_rows) {
592  CHECK(model->getModelType() == MLModelType::LINEAR_REG);
593  try {
594  if (model->getNumFeatures() != static_cast<int64_t>(input_features.size())) {
595  throw std::runtime_error(
596  "Number of model coefficients does not match number of input features.");
597  }
598  const auto features_table = prepare_data_table(input_features, num_rows);
599  const auto model_ptr =
600  build_linear_reg_model<T>(model->getCoefs().data(), input_features.size() + 1);
601 
602  linear_regression::prediction::Batch<> algorithm;
603  algorithm.input.set(linear_regression::prediction::data, features_table);
604  algorithm.input.set(linear_regression::prediction::model, model_ptr);
605 
606  const auto predictions_table =
607  HomogenNumericTable<T>::create(output_predictions, 1, num_rows);
608 
609  const linear_regression::prediction::ResultPtr result(
610  new linear_regression::prediction::Result);
611  result->set(linear_regression::prediction::prediction, predictions_table);
612  algorithm.setResult(result);
613  algorithm.compute();
614  return num_rows;
615  } catch (std::exception& e) {
616  throw std::runtime_error(e.what());
617  }
618 }
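// Illustrative usage sketch (hypothetical model and buffers; not part of the
// original header). The model's stored coefficients are re-packed into a
// oneDAL model and one prediction is written per input row:
//
//   std::vector<double> predictions(num_rows);
//   onedal_linear_reg_predict_impl<double>(
//       linear_reg_model, features, predictions.data(), num_rows);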
619 
620 template <typename T>
621 NEVER_INLINE HOST void onedal_decision_tree_reg_fit_impl(
622  const std::string& model_name,
623  const T* input_labels,
624  const std::vector<const T*>& input_features,
625  const std::string& model_metadata,
626  const std::vector<std::vector<std::string>>& cat_feature_keys,
627  const int64_t num_rows,
628  const int64_t max_tree_depth,
629  const int64_t min_observations_per_leaf_node) {
630  try {
631  const auto labels_table = prepare_data_table(input_labels, num_rows);
632  const auto features_table = prepare_data_table(input_features, num_rows);
633  decision_tree::regression::training::Batch<T> algorithm;
634  algorithm.input.set(decision_tree::regression::training::data, features_table);
635  algorithm.input.set(decision_tree::regression::training::dependentVariables,
636  labels_table);
637 
638  algorithm.parameter.pruning = decision_tree::Pruning::none;
639  algorithm.parameter.maxTreeDepth = max_tree_depth;
640  algorithm.parameter.minObservationsInLeafNodes = min_observations_per_leaf_node;
641  algorithm.compute();
642  /* Retrieve the algorithm results */
643  decision_tree::regression::training::ResultPtr training_result =
644  algorithm.getResult();
645 
646  auto model_ptr = training_result->get(decision_tree::regression::training::model);
647  auto model = std::make_shared<DecisionTreeRegressionModel>(
648  model_ptr, model_metadata, cat_feature_keys);
649  g_ml_models.addModel(model_name, model);
650  } catch (std::exception& e) {
651  throw std::runtime_error(e.what());
652  }
653 }
654 
655 template <typename T>
656 NEVER_INLINE HOST void onedal_gbt_reg_fit_impl(
657  const std::string& model_name,
658  const T* input_labels,
659  const std::vector<const T*>& input_features,
660  const std::string& model_metadata,
661  const std::vector<std::vector<std::string>>& cat_feature_keys,
662  const int64_t num_rows,
663  const int64_t max_iterations,
664  const int64_t max_tree_depth,
665  const double shrinkage,
666  const double min_split_loss,
667  const double lambda,
668  const double obs_per_tree_fraction,
669  const int64_t features_per_node,
670  const int64_t min_observations_per_leaf_node,
671  const int64_t max_bins,
672  const int64_t min_bin_size) {
673  try {
674  const auto labels_table = prepare_data_table(input_labels, num_rows);
675  const auto features_table = prepare_data_table(input_features, num_rows);
676  gbt::regression::training::Batch<T> algorithm;
677  algorithm.input.set(gbt::regression::training::data, features_table);
678  algorithm.input.set(gbt::regression::training::dependentVariable, labels_table);
679 
680  algorithm.parameter().maxIterations = max_iterations;
681  algorithm.parameter().maxTreeDepth = max_tree_depth;
682  algorithm.parameter().shrinkage = shrinkage;
683  algorithm.parameter().minSplitLoss = min_split_loss;
684  algorithm.parameter().lambda = lambda;
685  algorithm.parameter().observationsPerTreeFraction = obs_per_tree_fraction;
686  algorithm.parameter().featuresPerNode = features_per_node;
687  algorithm.parameter().minObservationsInLeafNode = min_observations_per_leaf_node;
688  algorithm.parameter().maxBins = max_bins;
689  algorithm.parameter().minBinSize = min_bin_size;
690  algorithm.compute();
691  /* Retrieve the algorithm results */
692  gbt::regression::training::ResultPtr training_result = algorithm.getResult();
693 
694  auto model_ptr = training_result->get(gbt::regression::training::model);
695  auto model =
696  std::make_shared<GbtRegressionModel>(model_ptr, model_metadata, cat_feature_keys);
697  g_ml_models.addModel(model_name, model);
698  } catch (std::exception& e) {
699  throw std::runtime_error(e.what());
700  }
701 }
702 
703 inline decision_forest::training::VariableImportanceMode get_var_importance_metric_type(
704  const VarImportanceMetric var_importance_metric) {
705  const static std::map<VarImportanceMetric,
706  decision_forest::training::VariableImportanceMode>
707  var_importance_mode_type_map = {
708  {VarImportanceMetric::DEFAULT,
709  decision_forest::training::VariableImportanceMode::MDI},
710  {VarImportanceMetric::NONE,
711  decision_forest::training::VariableImportanceMode::none},
712  {VarImportanceMetric::MDI,
713  decision_forest::training::VariableImportanceMode::MDI},
714  {VarImportanceMetric::MDA,
715  decision_forest::training::VariableImportanceMode::MDA_Raw},
716  {VarImportanceMetric::MDA_SCALED,
717  decision_forest::training::VariableImportanceMode::MDA_Scaled}};
718 
719  const auto itr = var_importance_mode_type_map.find(var_importance_metric);
720  if (itr == var_importance_mode_type_map.end()) {
721  std::ostringstream oss;
722  oss << "Invalid variable importance mode type. "
723  << "Was expecting one of DEFAULT, NONE, MDI, MDA, or MDA_SCALED.";
724  throw std::runtime_error(oss.str());
725  }
726  return itr->second;
727 }
728 
729 template <typename T, decision_forest::regression::training::Method M>
730 NEVER_INLINE HOST void onedal_random_forest_reg_fit_impl(
731  const std::string& model_name,
732  const T* input_labels,
733  const std::vector<const T*>& input_features,
734  const std::string& model_metadata,
735  const std::vector<std::vector<std::string>>& cat_feature_keys,
736  const int64_t num_rows,
737  const int64_t num_trees,
738  const double obs_per_tree_fraction,
739  const int64_t max_tree_depth,
740  const int64_t features_per_node,
741  const double impurity_threshold,
742  const bool bootstrap,
743  const int64_t min_obs_per_leaf_node,
744  const int64_t min_obs_per_split_node,
745  const double min_weight_fraction_in_leaf_node,
746  const double min_impurity_decrease_in_split_node,
747  const int64_t max_leaf_nodes,
748  const VarImportanceMetric var_importance_metric) {
749  constexpr bool compute_out_of_bag_error{false};
750  try {
751  const auto labels_table = prepare_data_table(input_labels, num_rows);
752  const auto features_table = prepare_data_table(input_features, num_rows);
753  decision_forest::regression::training::Batch<T, M> algorithm;
754  algorithm.input.set(decision_forest::regression::training::data, features_table);
755  algorithm.input.set(decision_forest::regression::training::dependentVariable,
756  labels_table);
757 
758  algorithm.parameter().nTrees = num_trees;
759  algorithm.parameter().observationsPerTreeFraction = obs_per_tree_fraction;
760  algorithm.parameter().maxTreeDepth = max_tree_depth;
761  algorithm.parameter().featuresPerNode = features_per_node;
762  algorithm.parameter().impurityThreshold = impurity_threshold;
763  algorithm.parameter().bootstrap = bootstrap;
764  algorithm.parameter().minObservationsInLeafNode = min_obs_per_leaf_node;
765  algorithm.parameter().minObservationsInSplitNode = min_obs_per_split_node;
766  algorithm.parameter().minWeightFractionInLeafNode = min_weight_fraction_in_leaf_node;
767  algorithm.parameter().minImpurityDecreaseInSplitNode =
768  min_impurity_decrease_in_split_node;
769  algorithm.parameter().varImportance =
770  get_var_importance_metric_type(var_importance_metric);
771  algorithm.parameter().resultsToCompute =
772  compute_out_of_bag_error ? decision_forest::training::computeOutOfBagError : 0;
773  algorithm.compute();
774  /* Retrieve the algorithm results */
775  decision_forest::regression::training::ResultPtr training_result =
776  algorithm.getResult();
777 
778  auto model_ptr = training_result->get(decision_forest::regression::training::model);
779  auto variable_importance_table =
780  training_result->get(decision_forest::regression::training::variableImportance);
781  const size_t num_features = input_features.size();
782  std::vector<double> variable_importance(
783  var_importance_metric != VarImportanceMetric::NONE ? num_features : 0);
784  if (var_importance_metric != VarImportanceMetric::NONE) {
785  for (size_t feature_idx = 0; feature_idx < num_features; ++feature_idx) {
786  variable_importance[feature_idx] =
787  variable_importance_table->NumericTable::getValue<T>(feature_idx, size_t(0));
788  }
789  }
790  double out_of_bag_error{0};
791  if (compute_out_of_bag_error) {
792  auto out_of_bag_error_table =
793  training_result->get(decision_forest::regression::training::outOfBagError);
794  out_of_bag_error =
795  out_of_bag_error_table->NumericTable::getValue<T>(0, static_cast<size_t>(0));
796  }
797  auto model = std::make_shared<RandomForestRegressionModel>(model_ptr,
798  model_metadata,
799  cat_feature_keys,
800  variable_importance,
801  out_of_bag_error);
802  g_ml_models.addModel(model_name, model);
803  } catch (std::exception& e) {
804  throw std::runtime_error(e.what());
805  }
806 }
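// Illustrative usage sketch (all numeric values below are placeholder
// assumptions, not recommended defaults; not part of the original header).
// The trained model is registered under `model_name` in g_ml_models:
//
//   onedal_random_forest_reg_fit_impl<double,
//       decision_forest::regression::training::defaultDense>(
//       "my_rf_model", labels.data(), features, /* model_metadata = */ "",
//       /* cat_feature_keys = */ {}, num_rows, /* num_trees = */ 10,
//       /* obs_per_tree_fraction = */ 1.0, /* max_tree_depth = */ 0,
//       /* features_per_node = */ 0, /* impurity_threshold = */ 0.0,
//       /* bootstrap = */ true, /* min_obs_per_leaf_node = */ 5,
//       /* min_obs_per_split_node = */ 2,
//       /* min_weight_fraction_in_leaf_node = */ 0.0,
//       /* min_impurity_decrease_in_split_node = */ 0.0,
//       /* max_leaf_nodes = */ 0, VarImportanceMetric::MDI);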
807 
808 template <typename T>
809 NEVER_INLINE HOST int32_t onedal_decision_tree_reg_predict_impl(
810  const std::shared_ptr<DecisionTreeRegressionModel>& model,
811  const std::vector<const T*>& input_features,
812  T* output_predictions,
813  const int64_t num_rows) {
814  CHECK(model->getModelType() == MLModelType::DECISION_TREE_REG);
815  try {
816  if (model->getNumFeatures() != static_cast<int64_t>(input_features.size())) {
817  throw std::runtime_error("Number of provided features does not match model.");
818  }
819  const auto features_table = prepare_data_table(input_features, num_rows);
820  decision_tree::regression::prediction::Batch<T> algorithm;
821  algorithm.input.set(decision_tree::regression::prediction::data, features_table);
822  algorithm.input.set(decision_tree::regression::prediction::model,
823  model->getModelPtr());
824 
825  const auto predictions_table =
826  HomogenNumericTable<T>::create(output_predictions, 1, num_rows);
827 
828  const decision_tree::regression::prediction::ResultPtr result(
829  new decision_tree::regression::prediction::Result);
830  result->set(decision_tree::regression::prediction::prediction, predictions_table);
831  algorithm.setResult(result);
832  algorithm.compute();
833  return num_rows;
834  } catch (std::exception& e) {
835  throw std::runtime_error(e.what());
836  }
837 }
838 
839 template <typename T>
840 NEVER_INLINE HOST int32_t
841 onedal_gbt_reg_predict_impl(const std::shared_ptr<GbtRegressionModel>& model,
842  const std::vector<const T*>& input_features,
843  T* output_predictions,
844  const int64_t num_rows) {
845  CHECK(model->getModelType() == MLModelType::GBT_REG);
846  try {
847  if (model->getNumFeatures() != static_cast<int64_t>(input_features.size())) {
848  throw std::runtime_error("Number of provided features does not match model.");
849  }
850  const auto features_table = prepare_data_table(input_features, num_rows);
851  gbt::regression::prediction::Batch<T> algorithm;
852  algorithm.input.set(gbt::regression::prediction::data, features_table);
853  algorithm.input.set(gbt::regression::prediction::model, model->getModelPtr());
854 
855  const auto predictions_table =
856  HomogenNumericTable<T>::create(output_predictions, 1, num_rows);
857 
858  const gbt::regression::prediction::ResultPtr result(
859  new gbt::regression::prediction::Result);
860  result->set(gbt::regression::prediction::prediction, predictions_table);
861  algorithm.setResult(result);
862  algorithm.compute();
863  return num_rows;
864  } catch (std::exception& e) {
865  throw std::runtime_error(e.what());
866  }
867 }
868 
869 template <typename T>
870 NEVER_INLINE HOST int32_t onedal_random_forest_reg_predict_impl(
871  const std::shared_ptr<RandomForestRegressionModel>& model,
872  const std::vector<const T*>& input_features,
873  T* output_predictions,
874  const int64_t num_rows) {
875  CHECK(model->getModelType() == MLModelType::RANDOM_FOREST_REG);
876  try {
877  if (model->getNumFeatures() != static_cast<int64_t>(input_features.size())) {
878  throw std::runtime_error("Number of provided features does not match model.");
879  }
880  const auto features_table = prepare_data_table(input_features, num_rows);
881  decision_forest::regression::prediction::Batch<T> algorithm;
882  algorithm.input.set(decision_forest::regression::prediction::data, features_table);
883  algorithm.input.set(decision_forest::regression::prediction::model,
884  model->getModelPtr());
885 
886  const auto predictions_table =
887  HomogenNumericTable<T>::create(output_predictions, 1, num_rows);
888 
889  const decision_forest::regression::prediction::ResultPtr result(
890  new decision_forest::regression::prediction::Result);
891  result->set(decision_forest::regression::prediction::prediction, predictions_table);
892  algorithm.setResult(result);
893  algorithm.compute();
894 
895  return num_rows;
896  } catch (std::exception& e) {
897  throw std::runtime_error(e.what());
898  }
899 }
900 
901 #endif // #ifdef HAVE_ONEDAL
902 #endif // #ifndef __CUDACC__