#include <filesystem>
#include <mutex>
#include <shared_mutex>
#include <string>
#include <tuple>
#include <unordered_map>
#include <vector>
#include "QueryEngine/heavydbTypes.h"

Include dependency graph for TableFunctionsCommon.hpp:

This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Classes
struct	ZStdNormalizationSummaryStats< T >

Namespaces
	FileUtilities

Enumerations
enum	BoundsType { Min, Max }

enum	IntervalType { Inclusive, Exclusive }

Functions
template<typename T >
std::pair< T, T >	get_column_min_max (Column< T > const &)

std::pair< int32_t, int32_t >	get_column_min_max (Column< TextEncodingDict > const &)

template<typename T >
NEVER_INLINE HOST double	get_column_mean (const T *data, const int64_t num_rows)

template<typename T >
NEVER_INLINE HOST double	get_column_mean (const Column< T > &col)

template<typename T >
NEVER_INLINE HOST double	get_column_std_dev (const Column< T > &col, const double mean)

template<typename T >
NEVER_INLINE HOST double	get_column_std_dev (const T *data, const int64_t num_rows, const double mean)

template<typename T >
void	z_std_normalize_col (const T *input_data, T *output_data, const int64_t num_rows, const double mean, const double std_dev)

template<typename T >
std::vector< std::vector< T > >	z_std_normalize_data (const std::vector< T * > &input_data, const int64_t num_rows)

template<typename T >
ZStdNormalizationSummaryStats< T >	z_std_normalize_data_with_summary_stats (const std::vector< T * > &input_data, const int64_t num_rows)

template<typename T >
NEVER_INLINE HOST std::tuple < T, T, bool >	get_column_metadata (const Column< T > &col)

NEVER_INLINE HOST std::tuple < int32_t, int32_t, bool >	get_column_metadata (const Column< TextEncodingDict > &col)

template<typename T1 , typename T2 >
NEVER_INLINE HOST T1	distance_in_meters (const T1 fromlon, const T1 fromlat, const T2 tolon, const T2 tolat)

int64_t	x_y_bin_to_bin_index (const int64_t x_bin, const int64_t y_bin, const int64_t num_x_bins)

std::pair< int64_t, int64_t >	bin_to_x_y_bin_indexes (const int64_t bin, const int64_t num_x_bins)

std::vector < std::filesystem::path >	FileUtilities::get_fs_paths (const std::string &file_or_directory)

template<typename T >
NEVER_INLINE HOST bool	is_valid_tf_input (const T input, const T bounds_val, const BoundsType bounds_type, const IntervalType interval_type)

Enumeration Type Documentation

enum BoundsType

Enumerator
Min
Max

Definition at line 121 of file TableFunctionsCommon.hpp.

121 { Min, Max };

Min

Definition: TableFunctionsCommon.hpp:121

Max

Definition: TableFunctionsCommon.hpp:121

enum IntervalType

Enumerator
Inclusive
Exclusive

Definition at line 123 of file TableFunctionsCommon.hpp.

123 { Inclusive, Exclusive };

Exclusive

Definition: TableFunctionsCommon.hpp:123

Inclusive

Definition: TableFunctionsCommon.hpp:123

Function Documentation

std::pair<int64_t, int64_t> bin_to_x_y_bin_indexes	(	const int64_t	bin,
		const int64_t	num_x_bins
	)

inline

Definition at line 112 of file TableFunctionsCommon.hpp.

Referenced by GeoRaster< T, Z >::get_xy_coords_for_bin_idx().

                                                                                     {
   return std::make_pair(bin % num_x_bins, bin / num_x_bins);
 }

Here is the caller graph for this function:

template<typename T1 , typename T2 >

NEVER_INLINE HOST T1 distance_in_meters	(	const T1	fromlon,
		const T1	fromlat,
		const T2	tolon,
		const T2	tolat
	)

Definition at line 452 of file TableFunctionsCommon.cpp.

                                                                                        {
   T1 latitudeArc = (fromlat - tolat) * 0.017453292519943295769236907684886;
   T1 longitudeArc = (fromlon - tolon) * 0.017453292519943295769236907684886;
   T1 latitudeH = sin(latitudeArc * 0.5);
   latitudeH *= latitudeH;
   T1 lontitudeH = sin(longitudeArc * 0.5);
   lontitudeH *= lontitudeH;
   T1 tmp = cos(fromlat * 0.017453292519943295769236907684886) *
            cos(tolat * 0.017453292519943295769236907684886);
   return 6372797.560856 * (2.0 * asin(sqrt(latitudeH + tmp * lontitudeH)));
 }

template<typename T >

NEVER_INLINE HOST double get_column_mean	(	const T *	data,
		const int64_t	num_rows
	)

Definition at line 116 of file TableFunctionsCommon.cpp.

References max_inputs_per_thread, threading_serial::parallel_for(), and heavydb.dtypes::T.

Referenced by get_column_mean(), r2_score_impl(), z_std_normalize_data(), and z_std_normalize_data_with_summary_stats().

                                                                                 {
   // const int64_t num_rows = col.size();
   const size_t max_thread_count = std::thread::hardware_concurrency();
   const size_t max_inputs_per_thread = 20000;
   const size_t num_threads = std::min(
       max_thread_count, ((num_rows + max_inputs_per_thread - 1) / max_inputs_per_thread));
 
   std::vector<double> local_col_sums(num_threads, 0.);
   std::vector<int64_t> local_col_non_null_counts(num_threads, 0L);
   tbb::task_arena limited_arena(num_threads);
   limited_arena.execute([&] {
     tbb::parallel_for(
         tbb::blocked_range<int64_t>(0, num_rows),
         [&](const tbb::blocked_range<int64_t>& r) {
           const int64_t start_idx = r.begin();
           const int64_t end_idx = r.end();
           double local_col_sum = 0.;
           int64_t local_col_non_null_count = 0;
           for (int64_t r = start_idx; r < end_idx; ++r) {
             const T val = data[r];
             if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
               if (std::isnan(val) || std::isinf(val)) {
                 continue;
               }
             }
             if (val == inline_null_value<T>()) {
               continue;
             }
             local_col_sum += data[r];
             local_col_non_null_count++;
           }
           size_t thread_idx = tbb::this_task_arena::current_thread_index();
           local_col_sums[thread_idx] += local_col_sum;
           local_col_non_null_counts[thread_idx] += local_col_non_null_count;
         });
   });
 
   double col_sum = 0.0;
   int64_t col_non_null_count = 0L;
 
   for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
     col_sum += local_col_sums[thread_idx];
     col_non_null_count += local_col_non_null_counts[thread_idx];
   }
 
   return col_non_null_count == 0 ? 0 : col_sum / col_non_null_count;
 }

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename T >

NEVER_INLINE HOST double get_column_mean ( const Column< T > & col )

Definition at line 183 of file TableFunctionsCommon.cpp.

References get_column_mean(), Column< T >::getPtr(), and Column< T >::size().

                                                                {
   return get_column_mean(col.getPtr(), col.size());
 }

Here is the call graph for this function:

template<typename T >

NEVER_INLINE HOST std::tuple<T, T, bool> get_column_metadata ( const Column< T > & col )

Definition at line 276 of file TableFunctionsCommon.cpp.

References Column< T >::isNull(), max_inputs_per_thread, threading_serial::parallel_for(), Column< T >::size(), and heavydb.dtypes::T.

Referenced by get_column_metadata().

                                                                                  {
   T col_min = std::numeric_limits<T>::max();
   T col_max = std::numeric_limits<T>::lowest();
   bool has_nulls = false;
   const int64_t num_rows = col.size();
   const size_t max_thread_count = std::thread::hardware_concurrency();
   const size_t max_inputs_per_thread = 200000;
   const size_t num_threads = std::min(
       max_thread_count, ((num_rows + max_inputs_per_thread - 1) / max_inputs_per_thread));
 
   std::vector<T> local_col_mins(num_threads, std::numeric_limits<T>::max());
   std::vector<T> local_col_maxes(num_threads, std::numeric_limits<T>::lowest());
   std::vector<bool> local_col_has_nulls(num_threads, false);
   tbb::task_arena limited_arena(num_threads);
 
   limited_arena.execute([&] {
     tbb::parallel_for(
         tbb::blocked_range<int64_t>(0, num_rows),
         [&](const tbb::blocked_range<int64_t>& r) {
           const int64_t start_idx = r.begin();
           const int64_t end_idx = r.end();
           T local_col_min = std::numeric_limits<T>::max();
           T local_col_max = std::numeric_limits<T>::lowest();
           bool local_has_nulls = false;
           for (int64_t r = start_idx; r < end_idx; ++r) {
             const T val = col[r];
             if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
               if (std::isnan(val) || std::isinf(val)) {
                 continue;
               }
             }
             if (col.isNull(r)) {
               local_has_nulls = true;
               continue;
             }
             if (val < local_col_min) {
               local_col_min = val;
             }
             if (val > local_col_max) {
               local_col_max = val;
             }
           }
           const size_t thread_idx = tbb::this_task_arena::current_thread_index();
           if (local_has_nulls) {
             local_col_has_nulls[thread_idx] = true;
           }
           if (local_col_min < local_col_mins[thread_idx]) {
             local_col_mins[thread_idx] = local_col_min;
           }
           if (local_col_max > local_col_maxes[thread_idx]) {
             local_col_maxes[thread_idx] = local_col_max;
           }
         });
   });
 
   for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
     if (local_col_has_nulls[thread_idx]) {
       has_nulls = true;
     }
     if (local_col_mins[thread_idx] < col_min) {
       col_min = local_col_mins[thread_idx];
     }
     if (local_col_maxes[thread_idx] > col_max) {
       col_max = local_col_maxes[thread_idx];
     }
   }
   return {col_min, col_max, has_nulls};
 }

Here is the call graph for this function:

Here is the caller graph for this function:

NEVER_INLINE HOST std::tuple<int32_t, int32_t, bool> get_column_metadata ( const Column< TextEncodingDict > & col )

Definition at line 358 of file TableFunctionsCommon.cpp.

References get_column_metadata(), Column< TextEncodingDict >::getPtr(), and Column< TextEncodingDict >::size().

                                          {
   // View the TextEncodingDict column's buffer as int32_t values (same pointer,
   // same row count) and reuse the generic overload, so the returned min/max are
   // computed on the raw 32-bit values (presumably dictionary ids; confirm).
   Column<int32_t> int_alias_col(reinterpret_cast<int32_t*>(col.getPtr()), col.size());
   return get_column_metadata(int_alias_col);
 }

Here is the call graph for this function:

template<typename T >

std::pair<T, T> get_column_min_max ( Column< T > const & )

Definition at line 42 of file TableFunctionsCommon.hpp.

                                                      {
   // Stub definition: it never computes anything and always throws. Per the
   // message, this is the variant used when ENABLE_SYSTEM_TFS is off.
   throw std::runtime_error("Table function called but built with ENABLE_SYSTEM_TFS=off.");
 }

std::pair<int32_t, int32_t> get_column_min_max ( Column< TextEncodingDict > const & )

Definition at line 46 of file TableFunctionsCommon.hpp.

                                                                                 {
   // Stub definition for TextEncodingDict columns: always throws. Per the
   // message, this is the variant used when ENABLE_SYSTEM_TFS is off.
   throw std::runtime_error("Table function called but built with ENABLE_SYSTEM_TFS=off.");
 }

template<typename T >

NEVER_INLINE HOST double get_column_std_dev	(	const Column< T > &	col,
		const double	mean
	)

Definition at line 195 of file TableFunctionsCommon.cpp.

References get_column_std_dev(), Column< T >::getPtr(), and Column< T >::size().

Referenced by get_column_std_dev(), z_std_normalize_data(), and z_std_normalize_data_with_summary_stats().

                                                                                      {
   return get_column_std_dev(col.getPtr(), col.size(), mean);
 }

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename T >

NEVER_INLINE HOST double get_column_std_dev	(	const T *	data,
		const int64_t	num_rows,
		const double	mean
	)

Definition at line 209 of file TableFunctionsCommon.cpp.

References max_inputs_per_thread, threading_serial::parallel_for(), and heavydb.dtypes::T.

                                                                {
   // const int64_t num_rows = col.size();
   const size_t max_thread_count = std::thread::hardware_concurrency();
   const size_t max_inputs_per_thread = 200000;
   const size_t num_threads = std::min(
       max_thread_count, ((num_rows + max_inputs_per_thread - 1) / max_inputs_per_thread));
 
   std::vector<double> local_col_squared_residuals(num_threads, 0.);
   std::vector<int64_t> local_col_non_null_counts(num_threads, 0L);
   tbb::task_arena limited_arena(num_threads);
 
   limited_arena.execute([&] {
     tbb::parallel_for(
         tbb::blocked_range<int64_t>(0, num_rows),
         [&](const tbb::blocked_range<int64_t>& r) {
           const int64_t start_idx = r.begin();
           const int64_t end_idx = r.end();
           double local_col_squared_residual = 0.;
           int64_t local_col_non_null_count = 0;
           for (int64_t r = start_idx; r < end_idx; ++r) {
             const T val = data[r];
             if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
               if (std::isnan(val) || std::isinf(val)) {
                 continue;
               }
             }
             if (val == inline_null_value<T>()) {
               continue;
             }
             const double residual = val - mean;
             local_col_squared_residual += (residual * residual);
             local_col_non_null_count++;
           }
           size_t thread_idx = tbb::this_task_arena::current_thread_index();
           local_col_squared_residuals[thread_idx] += local_col_squared_residual;
           local_col_non_null_counts[thread_idx] += local_col_non_null_count;
         });
   });
 
   double col_sum_squared_residual = 0.0;
   int64_t col_non_null_count = 0;
 
   for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
     col_sum_squared_residual += local_col_squared_residuals[thread_idx];
     col_non_null_count += local_col_non_null_counts[thread_idx];
   }
 
   return col_non_null_count == 0 ? 0
                                  : sqrt(col_sum_squared_residual / col_non_null_count);
 }

Here is the call graph for this function:

template<typename T >

NEVER_INLINE HOST bool is_valid_tf_input	(	const T	input,
		const T	bounds_val,
		const BoundsType	bounds_type,
		const IntervalType	interval_type
	)

Definition at line 556 of file TableFunctionsCommon.cpp.

References Exclusive, Inclusive, Max, Min, and UNREACHABLE.

                                                                            {
   switch (bounds_type) {
     case BoundsType::Min:
       switch (interval_type) {
         case IntervalType::Inclusive:
           return input >= bounds_val;
         case IntervalType::Exclusive:
           return input > bounds_val;
         default:
           UNREACHABLE();
       }
     case BoundsType::Max:
       switch (interval_type) {
         case IntervalType::Inclusive:
           return input <= bounds_val;
         case IntervalType::Exclusive:
           return input < bounds_val;
         default:
           UNREACHABLE();
       }
       break;
     default:
       UNREACHABLE();
   }
   UNREACHABLE();
   return false;  // To address compiler warning
 }

int64_t x_y_bin_to_bin_index	(	const int64_t	x_bin,
		const int64_t	y_bin,
		const int64_t	num_x_bins
	)

inline

Definition at line 106 of file TableFunctionsCommon.hpp.

                                                               {
   return y_bin * num_x_bins + x_bin;
 }

Here is the caller graph for this function:

template<typename T >

void z_std_normalize_col	(	const T *	input_data,
		T *	output_data,
		const int64_t	num_rows,
		const double	mean,
		const double	std_dev
	)

Definition at line 365 of file TableFunctionsCommon.cpp.

References threading_serial::parallel_for().

Referenced by z_std_normalize_data(), and z_std_normalize_data_with_summary_stats().

                                                {
   if (std_dev <= 0.0) {
     throw std::runtime_error("Standard deviation cannot be <= 0");
   }
   const double inv_std_dev = 1.0 / std_dev;
 
   tbb::parallel_for(tbb::blocked_range<int64_t>(0, num_rows),
                     [&](const tbb::blocked_range<int64_t>& r) {
                       const int64_t start_idx = r.begin();
                       const int64_t end_idx = r.end();
                       for (int64_t row_idx = start_idx; row_idx < end_idx; ++row_idx) {
                         output_data[row_idx] = (input_data[row_idx] - mean) * inv_std_dev;
                       }
                     });
 }

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename T >

std::vector<std::vector<T> > z_std_normalize_data	(	const std::vector< T * > &	input_data,
		const int64_t	num_rows
	)

Definition at line 397 of file TableFunctionsCommon.cpp.

References get_column_mean(), get_column_std_dev(), and z_std_normalize_col().

Referenced by dbscan__cpu_template(), and kmeans__cpu_template().

                                                                          {
   const int64_t num_features = input_data.size();
   std::vector<std::vector<T>> normalized_data(num_features);
   for (int64_t feature_idx = 0; feature_idx < num_features; ++feature_idx) {
     const auto mean = get_column_mean(input_data[feature_idx], num_rows);
     const auto std_dev = get_column_std_dev(input_data[feature_idx], num_rows, mean);
     normalized_data[feature_idx].resize(num_rows);
     z_std_normalize_col(input_data[feature_idx],
                         normalized_data[feature_idx].data(),
                         num_rows,
                         mean,
                         std_dev);
   }
   return normalized_data;
 }

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename T >

ZStdNormalizationSummaryStats<T> z_std_normalize_data_with_summary_stats	(	const std::vector< T * > &	input_data,
		const int64_t	num_rows
	)

Definition at line 422 of file TableFunctionsCommon.cpp.

References get_column_mean(), get_column_std_dev(), and z_std_normalize_col().

Referenced by pca_fit_impl().

                             {
   const int64_t num_features = input_data.size();
   std::vector<std::vector<T>> normalized_data(num_features);
   std::vector<T> means(num_features);
   std::vector<T> std_devs(num_features);
   for (int64_t feature_idx = 0; feature_idx < num_features; ++feature_idx) {
     means[feature_idx] = get_column_mean(input_data[feature_idx], num_rows);
     std_devs[feature_idx] =
         get_column_std_dev(input_data[feature_idx], num_rows, means[feature_idx]);
     normalized_data[feature_idx].resize(num_rows);
     z_std_normalize_col(input_data[feature_idx],
                         normalized_data[feature_idx].data(),
                         num_rows,
                         means[feature_idx],
                         std_devs[feature_idx]);
   }
   return ZStdNormalizationSummaryStats<T>(normalized_data, means, std_devs);
 }

Here is the call graph for this function:

Here is the caller graph for this function:

Classes

Namespaces

Enumerations

Functions

Enumeration Type Documentation

Function Documentation