OmniSciDB  a5dc49c757
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
TableFunctionsCommon.hpp File Reference
#include <filesystem>
#include <mutex>
#include <shared_mutex>
#include <string>
#include <tuple>
#include <unordered_map>
#include <vector>
#include "QueryEngine/heavydbTypes.h"
+ Include dependency graph for TableFunctionsCommon.hpp:
+ This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Classes

struct  ZStdNormalizationSummaryStats< T >
 

Namespaces

 FileUtilities
 

Enumerations

enum  BoundsType { Min, Max }
 
enum  IntervalType { Inclusive, Exclusive }
 

Functions

template<typename T >
std::pair< T, T > get_column_min_max (Column< T > const &)
 
std::pair< int32_t, int32_t > get_column_min_max (Column< TextEncodingDict > const &)
 
template<typename T >
NEVER_INLINE HOST double get_column_mean (const T *data, const int64_t num_rows)
 
template<typename T >
NEVER_INLINE HOST double get_column_mean (const Column< T > &col)
 
template<typename T >
NEVER_INLINE HOST double get_column_std_dev (const Column< T > &col, const double mean)
 
template<typename T >
NEVER_INLINE HOST double get_column_std_dev (const T *data, const int64_t num_rows, const double mean)
 
template<typename T >
void z_std_normalize_col (const T *input_data, T *output_data, const int64_t num_rows, const double mean, const double std_dev)
 
template<typename T >
std::vector< std::vector< T > > z_std_normalize_data (const std::vector< T * > &input_data, const int64_t num_rows)
 
template<typename T >
ZStdNormalizationSummaryStats< T > z_std_normalize_data_with_summary_stats (const std::vector< T * > &input_data, const int64_t num_rows)
 
template<typename T >
NEVER_INLINE HOST std::tuple< T, T, bool > get_column_metadata (const Column< T > &col)
 
NEVER_INLINE HOST std::tuple< int32_t, int32_t, bool > get_column_metadata (const Column< TextEncodingDict > &col)
 
template<typename T1 , typename T2 >
NEVER_INLINE HOST T1 distance_in_meters (const T1 fromlon, const T1 fromlat, const T2 tolon, const T2 tolat)
 
int64_t x_y_bin_to_bin_index (const int64_t x_bin, const int64_t y_bin, const int64_t num_x_bins)
 
std::pair< int64_t, int64_t > bin_to_x_y_bin_indexes (const int64_t bin, const int64_t num_x_bins)
 
std::vector< std::filesystem::path > FileUtilities::get_fs_paths (const std::string &file_or_directory)
 
template<typename T >
NEVER_INLINE HOST bool is_valid_tf_input (const T input, const T bounds_val, const BoundsType bounds_type, const IntervalType interval_type)
 

Enumeration Type Documentation

enum BoundsType
Enumerator
Min 
Max 

Definition at line 121 of file TableFunctionsCommon.hpp.

enum IntervalType
Enumerator
Inclusive 
Exclusive 

Definition at line 123 of file TableFunctionsCommon.hpp.

Function Documentation

std::pair<int64_t, int64_t> bin_to_x_y_bin_indexes ( const int64_t  bin,
const int64_t  num_x_bins 
)
inline

Definition at line 112 of file TableFunctionsCommon.hpp.

Referenced by GeoRaster< T, Z >::get_xy_coords_for_bin_idx().

113  {
// Splits a flat bin index into a pair {x, y}, where x = bin % num_x_bins and
// y = bin / num_x_bins.
// NOTE(review): num_x_bins == 0 would divide by zero; no guard is visible here.
114  return std::make_pair(bin % num_x_bins, bin / num_x_bins);
115 }

+ Here is the caller graph for this function:

template<typename T1 , typename T2 >
NEVER_INLINE HOST T1 distance_in_meters ( const T1  fromlon,
const T1  fromlat,
const T2  tolon,
const T2  tolat 
)

Definition at line 452 of file TableFunctionsCommon.cpp.

452  {
// Haversine great-circle distance between (fromlon, fromlat) and (tolon, tolat).
// 0.0174532925199... is pi / 180: coordinates are treated as degrees and
// converted to radians. All intermediates are stored as T1.
453  T1 latitudeArc = (fromlat - tolat) * 0.017453292519943295769236907684886;
454  T1 longitudeArc = (fromlon - tolon) * 0.017453292519943295769236907684886;
// latitudeH and lontitudeH each end up holding sin^2(arc / 2).
455  T1 latitudeH = sin(latitudeArc * 0.5);
456  latitudeH *= latitudeH;
457  T1 lontitudeH = sin(longitudeArc * 0.5);
458  lontitudeH *= lontitudeH;
// tmp = cos(fromlat) * cos(tolat), with both latitudes in radians.
459  T1 tmp = cos(fromlat * 0.017453292519943295769236907684886) *
460  cos(tolat * 0.017453292519943295769236907684886);
// Central angle 2 * asin(sqrt(h)) scaled by 6372797.560856 — presumably the
// sphere radius in meters, going by the function name; confirm the constant.
461  return 6372797.560856 * (2.0 * asin(sqrt(latitudeH + tmp * lontitudeH)));
462 }
template<typename T >
NEVER_INLINE HOST double get_column_mean ( const T *  data,
const int64_t  num_rows 
)

Definition at line 116 of file TableFunctionsCommon.cpp.

References max_inputs_per_thread, threading_serial::parallel_for(), and heavydb.dtypes::T.

Referenced by get_column_mean(), r2_score_impl(), z_std_normalize_data(), and z_std_normalize_data_with_summary_stats().

116  {
// Mean of data[0 .. num_rows) over the entries that are counted: values equal to
// inline_null_value<T>() are skipped, and for float/double NaN and +/-inf are
// skipped as well. Returns 0 when nothing is counted.
// NOTE(review): max_inputs_per_thread is 20000 here but 200000 in the other
// parallel reductions on this page — confirm the difference is intentional.
117  // const int64_t num_rows = col.size();
118  const size_t max_thread_count = std::thread::hardware_concurrency();
119  const size_t max_inputs_per_thread = 20000;
// Thread count = ceil(num_rows / max_inputs_per_thread), capped at
// max_thread_count.
// NOTE(review): this is 0 when num_rows == 0 (or if hardware_concurrency()
// reports 0); confirm tbb::task_arena accepts a concurrency of 0.
120  const size_t num_threads = std::min(
121  max_thread_count, ((num_rows + max_inputs_per_thread - 1) / max_inputs_per_thread));
122 
// One accumulator slot per arena thread; slots are combined serially below.
123  std::vector<double> local_col_sums(num_threads, 0.);
124  std::vector<int64_t> local_col_non_null_counts(num_threads, 0L);
125  tbb::task_arena limited_arena(num_threads);
126  limited_arena.execute([&] {
127  tbb::parallel_for(
128  tbb::blocked_range<int64_t>(0, num_rows),
129  [&](const tbb::blocked_range<int64_t>& r) {
130  const int64_t start_idx = r.begin();
131  const int64_t end_idx = r.end();
// Accumulate into locals first so the shared slots are touched once per range.
132  double local_col_sum = 0.;
133  int64_t local_col_non_null_count = 0;
// The loop index r shadows the blocked_range parameter r.
134  for (int64_t r = start_idx; r < end_idx; ++r) {
135  const T val = data[r];
136  if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
137  if (std::isnan(val) || std::isinf(val)) {
138  continue;
139  }
140  }
141  if (val == inline_null_value<T>()) {
142  continue;
143  }
144  local_col_sum += data[r];
145  local_col_non_null_count++;
146  }
// Slots are indexed by the executing thread's arena index — assumes that index
// is always < num_threads; TODO confirm.
147  size_t thread_idx = tbb::this_task_arena::current_thread_index();
148  local_col_sums[thread_idx] += local_col_sum;
149  local_col_non_null_counts[thread_idx] += local_col_non_null_count;
150  });
151  });
152 
153  double col_sum = 0.0;
154  int64_t col_non_null_count = 0L;
155 
156  for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
157  col_sum += local_col_sums[thread_idx];
158  col_non_null_count += local_col_non_null_counts[thread_idx];
159  }
160 
// Guard against dividing by zero when every entry was skipped.
161  return col_non_null_count == 0 ? 0 : col_sum / col_non_null_count;
162 }
const size_t max_inputs_per_thread
void parallel_for(const blocked_range< Int > &range, const Body &body, const Partitioner &p=Partitioner())

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
NEVER_INLINE HOST double get_column_mean ( const Column< T > &  col)

Definition at line 183 of file TableFunctionsCommon.cpp.

References get_column_mean(), Column< T >::getPtr(), and Column< T >::size().

183  {
// Convenience overload: forwards the column's data pointer and size to the
// pointer-based get_column_mean.
184  return get_column_mean(col.getPtr(), col.size());
185 }
DEVICE int64_t size() const
DEVICE T * getPtr() const
NEVER_INLINE HOST double get_column_mean(const T *data, const int64_t num_rows)

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST std::tuple<T, T, bool> get_column_metadata ( const Column< T > &  col)

Definition at line 276 of file TableFunctionsCommon.cpp.

References Column< T >::isNull(), max_inputs_per_thread, threading_serial::parallel_for(), Column< T >::size(), and heavydb.dtypes::T.

Referenced by get_column_metadata().

276  {
277  T col_min = std::numeric_limits<T>::max();
278  T col_max = std::numeric_limits<T>::lowest();
279  bool has_nulls = false;
280  const int64_t num_rows = col.size();
281  const size_t max_thread_count = std::thread::hardware_concurrency();
282  const size_t max_inputs_per_thread = 200000;
283  const size_t num_threads = std::min(
284  max_thread_count, ((num_rows + max_inputs_per_thread - 1) / max_inputs_per_thread));
285 
286  std::vector<T> local_col_mins(num_threads, std::numeric_limits<T>::max());
287  std::vector<T> local_col_maxes(num_threads, std::numeric_limits<T>::lowest());
288  std::vector<bool> local_col_has_nulls(num_threads, false);
289  tbb::task_arena limited_arena(num_threads);
290 
291  limited_arena.execute([&] {
293  tbb::blocked_range<int64_t>(0, num_rows),
294  [&](const tbb::blocked_range<int64_t>& r) {
295  const int64_t start_idx = r.begin();
296  const int64_t end_idx = r.end();
297  T local_col_min = std::numeric_limits<T>::max();
298  T local_col_max = std::numeric_limits<T>::lowest();
299  bool local_has_nulls = false;
300  for (int64_t r = start_idx; r < end_idx; ++r) {
301  const T val = col[r];
302  if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
303  if (std::isnan(val) || std::isinf(val)) {
304  continue;
305  }
306  }
307  if (col.isNull(r)) {
308  local_has_nulls = true;
309  continue;
310  }
311  if (val < local_col_min) {
312  local_col_min = val;
313  }
314  if (val > local_col_max) {
315  local_col_max = val;
316  }
317  }
318  const size_t thread_idx = tbb::this_task_arena::current_thread_index();
319  if (local_has_nulls) {
320  local_col_has_nulls[thread_idx] = true;
321  }
322  if (local_col_min < local_col_mins[thread_idx]) {
323  local_col_mins[thread_idx] = local_col_min;
324  }
325  if (local_col_max > local_col_maxes[thread_idx]) {
326  local_col_maxes[thread_idx] = local_col_max;
327  }
328  });
329  });
330 
331  for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
332  if (local_col_has_nulls[thread_idx]) {
333  has_nulls = true;
334  }
335  if (local_col_mins[thread_idx] < col_min) {
336  col_min = local_col_mins[thread_idx];
337  }
338  if (local_col_maxes[thread_idx] > col_max) {
339  col_max = local_col_maxes[thread_idx];
340  }
341  }
342  return {col_min, col_max, has_nulls};
343 }
DEVICE int64_t size() const
const size_t max_inputs_per_thread
DEVICE bool isNull(int64_t index) const
void parallel_for(const blocked_range< Int > &range, const Body &body, const Partitioner &p=Partitioner())

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

NEVER_INLINE HOST std::tuple<int32_t, int32_t, bool> get_column_metadata ( const Column< TextEncodingDict > &  col)

Definition at line 358 of file TableFunctionsCommon.cpp.

References get_column_metadata(), Column< TextEncodingDict >::getPtr(), and Column< TextEncodingDict >::size().

359  {
// Views the dictionary-encoded column's buffer as int32_t values (same pointer,
// same row count) and delegates to the templated get_column_metadata.
// Assumes TextEncodingDict has the same layout as int32_t — TODO confirm.
360  Column<int32_t> int_alias_col(reinterpret_cast<int32_t*>(col.getPtr()), col.size());
361  return get_column_metadata(int_alias_col);
362 }
DEVICE TextEncodingDict * getPtr() const
NEVER_INLINE HOST std::tuple< T, T, bool > get_column_metadata(const Column< T > &col)
DEVICE int64_t size() const

+ Here is the call graph for this function:

template<typename T >
std::pair<T, T> get_column_min_max ( Column< T > const &  )

Definition at line 42 of file TableFunctionsCommon.hpp.

42  {
// Stub: throws unconditionally. Per the message, this is the definition used
// when the build has ENABLE_SYSTEM_TFS=off.
43  throw std::runtime_error("Table function called but built with ENABLE_SYSTEM_TFS=off.");
44 }
std::pair<int32_t, int32_t> get_column_min_max ( Column< TextEncodingDict > const &  )

Definition at line 46 of file TableFunctionsCommon.hpp.

46  {
// Stub for the Column<TextEncodingDict> overload: throws unconditionally. Per
// the message, this is the definition used when the build has
// ENABLE_SYSTEM_TFS=off.
47  throw std::runtime_error("Table function called but built with ENABLE_SYSTEM_TFS=off.");
48 }
template<typename T >
NEVER_INLINE HOST double get_column_std_dev ( const Column< T > &  col,
const double  mean 
)

Definition at line 195 of file TableFunctionsCommon.cpp.

References get_column_std_dev(), Column< T >::getPtr(), and Column< T >::size().

Referenced by get_column_std_dev(), z_std_normalize_data(), and z_std_normalize_data_with_summary_stats().

195  {
// Convenience overload: forwards the column's data pointer and size, plus the
// caller-supplied mean, to the pointer-based get_column_std_dev.
196  return get_column_std_dev(col.getPtr(), col.size(), mean);
197 }
DEVICE int64_t size() const
DEVICE T * getPtr() const
NEVER_INLINE HOST double get_column_std_dev(const Column< T > &col, const double mean)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
NEVER_INLINE HOST double get_column_std_dev ( const T *  data,
const int64_t  num_rows,
const double  mean 
)

Definition at line 209 of file TableFunctionsCommon.cpp.

References max_inputs_per_thread, threading_serial::parallel_for(), and heavydb.dtypes::T.

211  {
// Standard deviation of data[0 .. num_rows) around the caller-supplied mean:
// sqrt(sum((val - mean)^2) / count), i.e. the divisor is the count of included
// values, not count - 1. Values equal to inline_null_value<T>() are skipped,
// and for float/double NaN and +/-inf are skipped as well. Returns 0 when
// nothing is counted.
212  // const int64_t num_rows = col.size();
213  const size_t max_thread_count = std::thread::hardware_concurrency();
214  const size_t max_inputs_per_thread = 200000;
// Thread count = ceil(num_rows / max_inputs_per_thread), capped at
// max_thread_count.
// NOTE(review): this is 0 when num_rows == 0 (or if hardware_concurrency()
// reports 0); confirm tbb::task_arena accepts a concurrency of 0.
215  const size_t num_threads = std::min(
216  max_thread_count, ((num_rows + max_inputs_per_thread - 1) / max_inputs_per_thread));
217 
// One accumulator slot per arena thread; slots are combined serially below.
218  std::vector<double> local_col_squared_residuals(num_threads, 0.);
219  std::vector<int64_t> local_col_non_null_counts(num_threads, 0L);
220  tbb::task_arena limited_arena(num_threads);
221 
222  limited_arena.execute([&] {
223  tbb::parallel_for(
224  tbb::blocked_range<int64_t>(0, num_rows),
225  [&](const tbb::blocked_range<int64_t>& r) {
226  const int64_t start_idx = r.begin();
227  const int64_t end_idx = r.end();
228  double local_col_squared_residual = 0.;
229  int64_t local_col_non_null_count = 0;
// The loop index r shadows the blocked_range parameter r.
230  for (int64_t r = start_idx; r < end_idx; ++r) {
231  const T val = data[r];
232  if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
233  if (std::isnan(val) || std::isinf(val)) {
234  continue;
235  }
236  }
237  if (val == inline_null_value<T>()) {
238  continue;
239  }
240  const double residual = val - mean;
241  local_col_squared_residual += (residual * residual);
242  local_col_non_null_count++;
243  }
// Slots are indexed by the executing thread's arena index — assumes that index
// is always < num_threads; TODO confirm.
244  size_t thread_idx = tbb::this_task_arena::current_thread_index();
245  local_col_squared_residuals[thread_idx] += local_col_squared_residual;
246  local_col_non_null_counts[thread_idx] += local_col_non_null_count;
247  });
248  });
249 
250  double col_sum_squared_residual = 0.0;
251  int64_t col_non_null_count = 0;
252 
253  for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
254  col_sum_squared_residual += local_col_squared_residuals[thread_idx];
255  col_non_null_count += local_col_non_null_counts[thread_idx];
256  }
257 
// Guard against dividing by zero when every entry was skipped.
258  return col_non_null_count == 0 ? 0
259  : sqrt(col_sum_squared_residual / col_non_null_count);
260 }
const size_t max_inputs_per_thread
void parallel_for(const blocked_range< Int > &range, const Body &body, const Partitioner &p=Partitioner())

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST bool is_valid_tf_input ( const T  input,
const T  bounds_val,
const BoundsType  bounds_type,
const IntervalType  interval_type 
)

Definition at line 556 of file TableFunctionsCommon.cpp.

References Exclusive, Inclusive, Max, Min, and UNREACHABLE.

559  {
// Checks input against a single bound:
//   Min + Inclusive -> input >= bounds_val    Min + Exclusive -> input > bounds_val
//   Max + Inclusive -> input <= bounds_val    Max + Exclusive -> input < bounds_val
// Any other enum value reaches UNREACHABLE() (macro from Logger.h).
560  switch (bounds_type) {
561  case BoundsType::Min:
562  switch (interval_type) {
563  case IntervalType::Inclusive:
564  return input >= bounds_val;
565  case IntervalType::Exclusive:
566  return input > bounds_val;
567  default:
568  UNREACHABLE();
569  }
// NOTE(review): there is no break after the Min inner switch, so control would
// fall into the Max case if UNREACHABLE() ever returned — presumably it does
// not; confirm.
570  case BoundsType::Max:
571  switch (interval_type) {
572  case IntervalType::Inclusive:
573  return input <= bounds_val;
574  case IntervalType::Exclusive:
575  return input < bounds_val;
576  default:
577  UNREACHABLE();
578  }
579  break;
580  default:
581  UNREACHABLE();
582  }
583  UNREACHABLE();
584  return false; // To address compiler warning
585 }
#define UNREACHABLE()
Definition: Logger.h:338
template<typename T >
void z_std_normalize_col ( const T *  input_data,
T *  output_data,
const int64_t  num_rows,
const double  mean,
const double  std_dev 
)

Definition at line 365 of file TableFunctionsCommon.cpp.

References threading_serial::parallel_for().

Referenced by z_std_normalize_data(), and z_std_normalize_data_with_summary_stats().

369  {
// Writes the z-score (input - mean) / std_dev of each of the num_rows inputs to
// output_data. Throws std::runtime_error when std_dev <= 0.
// No null/NaN filtering happens here; every row is transformed.
// NOTE(review): the get_column_std_dev listing on this page returns 0 when no
// value is counted (and the residual sum is 0 for a constant column), so callers
// that pass its result straight through would hit this throw — confirm intended.
370  if (std_dev <= 0.0) {
371  throw std::runtime_error("Standard deviation cannot be <= 0");
372  }
// Precompute the reciprocal so the loop multiplies instead of dividing.
373  const double inv_std_dev = 1.0 / std_dev;
374 
375  tbb::parallel_for(tbb::blocked_range<int64_t>(0, num_rows),
376  [&](const tbb::blocked_range<int64_t>& r) {
377  const int64_t start_idx = r.begin();
378  const int64_t end_idx = r.end();
379  for (int64_t row_idx = start_idx; row_idx < end_idx; ++row_idx) {
// Computed in double, then converted to T on assignment.
380  output_data[row_idx] = (input_data[row_idx] - mean) * inv_std_dev;
381  }
382  });
383 }
void parallel_for(const blocked_range< Int > &range, const Body &body, const Partitioner &p=Partitioner())

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
std::vector<std::vector<T> > z_std_normalize_data ( const std::vector< T * > &  input_data,
const int64_t  num_rows 
)

Definition at line 397 of file TableFunctionsCommon.cpp.

References get_column_mean(), get_column_std_dev(), and z_std_normalize_col().

Referenced by dbscan__cpu_template(), and kmeans__cpu_template().

398  {
// Z-score normalizes each feature column independently. input_data holds one
// pointer per feature; each is read for num_rows entries. Returns one
// num_rows-long vector per feature, in the same order.
// Exceptions thrown by z_std_normalize_col are not caught here.
399  const int64_t num_features = input_data.size();
400  std::vector<std::vector<T>> normalized_data(num_features);
401  for (int64_t feature_idx = 0; feature_idx < num_features; ++feature_idx) {
// Per-feature statistics; the mean feeds the standard-deviation computation.
402  const auto mean = get_column_mean(input_data[feature_idx], num_rows);
403  const auto std_dev = get_column_std_dev(input_data[feature_idx], num_rows, mean);
404  normalized_data[feature_idx].resize(num_rows);
405  z_std_normalize_col(input_data[feature_idx],
406  normalized_data[feature_idx].data(),
407  num_rows,
408  mean,
409  std_dev);
410  }
411  return normalized_data;
412 }
void z_std_normalize_col(const T *input_data, T *output_data, const int64_t num_rows, const double mean, const double std_dev)
NEVER_INLINE HOST double get_column_std_dev(const Column< T > &col, const double mean)
NEVER_INLINE HOST double get_column_mean(const T *data, const int64_t num_rows)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
ZStdNormalizationSummaryStats<T> z_std_normalize_data_with_summary_stats ( const std::vector< T * > &  input_data,
const int64_t  num_rows 
)

Definition at line 422 of file TableFunctionsCommon.cpp.

References get_column_mean(), get_column_std_dev(), and z_std_normalize_col().

Referenced by pca_fit_impl().

424  {
// Same per-feature z-score normalization as z_std_normalize_data, but also
// keeps each feature's mean and standard deviation and returns all three in a
// ZStdNormalizationSummaryStats<T>.
// Exceptions thrown by z_std_normalize_col are not caught here.
425  const int64_t num_features = input_data.size();
426  std::vector<std::vector<T>> normalized_data(num_features);
// Stored as T, so the double results of get_column_mean / get_column_std_dev
// are converted to T before being used for normalization.
427  std::vector<T> means(num_features);
428  std::vector<T> std_devs(num_features);
429  for (int64_t feature_idx = 0; feature_idx < num_features; ++feature_idx) {
430  means[feature_idx] = get_column_mean(input_data[feature_idx], num_rows);
431  std_devs[feature_idx] =
432  get_column_std_dev(input_data[feature_idx], num_rows, means[feature_idx]);
433  normalized_data[feature_idx].resize(num_rows);
434  z_std_normalize_col(input_data[feature_idx],
435  normalized_data[feature_idx].data(),
436  num_rows,
437  means[feature_idx],
438  std_devs[feature_idx]);
439  }
440  return ZStdNormalizationSummaryStats<T>(normalized_data, means, std_devs);
441 }
void z_std_normalize_col(const T *input_data, T *output_data, const int64_t num_rows, const double mean, const double std_dev)
NEVER_INLINE HOST double get_column_std_dev(const Column< T > &col, const double mean)
NEVER_INLINE HOST double get_column_mean(const T *data, const int64_t num_rows)

+ Here is the call graph for this function:

+ Here is the caller graph for this function: