OmniSciDB  a5dc49c757
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
TableFunctionsStats.hpp File Reference
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>
#include <rapidjson/document.h>
#include "QueryEngine/heavydbTypes.h"
+ Include dependency graph for TableFunctionsStats.hpp:
+ This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Classes

struct  StatsRequestPredicate
 
struct  ColumnStats< T >
 
struct  StatsRequest
 

Enumerations

enum  StatsRequestPredicateOp { StatsRequestPredicateOp::NONE, StatsRequestPredicateOp::LT, StatsRequestPredicateOp::GT }
 
enum  StatsRequestAggType {
  StatsRequestAggType::COUNT, StatsRequestAggType::MIN, StatsRequestAggType::MAX, StatsRequestAggType::SUM,
  StatsRequestAggType::AVG
}
 

Functions

std::vector< StatsRequest > parse_stats_requests_json (const std::string &stats_requests_json_str, const int64_t num_attrs)
 
template<typename TA >
ColumnStats< TA > get_column_stats (const ColumnList< TA > &attrs, StatsRequest &stats_request, std::unordered_map< std::string, ColumnStats< TA >> &stats_map)
 
template<typename TA >
void compute_stats_requests (const ColumnList< TA > &attrs, std::vector< StatsRequest > &stats_requests)
 
template<typename TA >
void populate_output_stats_cols (Column< TextEncodingDict > &stat_names, Column< TA > &stat_vals, const std::vector< StatsRequest > &stats_requests)
 
template<typename T >
NEVER_INLINE HOST ColumnStats< T > get_column_stats (const T *data, const int64_t num_rows, const StatsRequestPredicate &predicate=StatsRequestPredicate())
 
template<typename T >
NEVER_INLINE HOST ColumnStats< T > get_column_stats (const Column< T > &col, const StatsRequestPredicate &predicate=StatsRequestPredicate())
 

Enumeration Type Documentation

enum StatsRequestAggType
strong
Enumerator
COUNT 
MIN 
MAX 
SUM 
AVG 

Definition at line 104 of file TableFunctionsStats.hpp.

enum StatsRequestPredicateOp
strong
Enumerator
NONE 
LT 
GT 

Definition at line 30 of file TableFunctionsStats.hpp.

Function Documentation

template<typename TA >
void compute_stats_requests ( const ColumnList< TA > &  attrs,
std::vector< StatsRequest > &  stats_requests 
)

Definition at line 143 of file TableFunctionsStats.hpp.

References AVG, COUNT, get_column_stats(), MAX, MIN, and SUM.

144  {
145  std::unordered_map<std::string, ColumnStats<TA>> stats_map;
146 
147  for (auto& stats_request : stats_requests) {
148  const auto column_stats = get_column_stats(attrs, stats_request, stats_map);
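149  switch (stats_request.agg_type) {
150  case StatsRequestAggType::COUNT: {
151  stats_request.result = column_stats.non_null_or_filtered_count;
152  break;
153  }
154  case StatsRequestAggType::MIN: {
155  stats_request.result = column_stats.min;
156  break;
157  }
158  case StatsRequestAggType::MAX: {
159  stats_request.result = column_stats.max;
160  break;
161  }
162  case StatsRequestAggType::SUM: {
163  stats_request.result = column_stats.sum;
164  break;
165  }
166  case StatsRequestAggType::AVG: {
167  stats_request.result = column_stats.mean;
168  break;
169  }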
170  }
171  }
172 }
NEVER_INLINE HOST ColumnStats< T > get_column_stats(const T *data, const int64_t num_rows, const StatsRequestPredicate &predicate)

+ Here is the call graph for this function:

template<typename TA >
ColumnStats<TA> get_column_stats ( const ColumnList< TA > &  attrs,
StatsRequest &  stats_request,
std::unordered_map< std::string, ColumnStats< TA >> &  stats_map 
)

Definition at line 126 of file TableFunctionsStats.hpp.

References StatsRequest::attr_id, StatsRequest::filter_type, StatsRequest::filter_val, get_column_stats(), StatsRequestPredicate::to_string(), and to_string().

129  {
130  StatsRequestPredicate predicate(stats_request.filter_type, stats_request.filter_val);
131  const std::string request_str_key =
132  std::to_string(stats_request.attr_id) + "||" + predicate.to_string();
133  auto stats_map_itr = stats_map.find(request_str_key);
134  if (stats_map_itr != stats_map.end()) {
135  return stats_map_itr->second;
136  }
137  const auto column_stats = get_column_stats(attrs[stats_request.attr_id], predicate);
138  stats_map[request_str_key] = column_stats;
139  return column_stats;
140 }
NEVER_INLINE HOST ColumnStats< T > get_column_stats(const T *data, const int64_t num_rows, const StatsRequestPredicate &predicate)
std::string to_string(char const *&&v)
StatsRequestPredicateOp filter_type

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST ColumnStats<T> get_column_stats ( const T *  data,
const int64_t  num_rows,
const StatsRequestPredicate &  predicate = StatsRequestPredicate() 
)

Definition at line 22 of file TableFunctionsStats.cpp.

References ColumnStats< T >::max, max_inputs_per_thread, ColumnStats< T >::mean, ColumnStats< T >::min, ColumnStats< T >::non_null_or_filtered_count, threading_serial::parallel_for(), ColumnStats< T >::sum, heavydb.dtypes::T, and ColumnStats< T >::total_count.

Referenced by compute_stats_requests(), and get_column_stats().

25  {
26  // const int64_t num_rows = col.size();
27  const size_t max_thread_count = std::thread::hardware_concurrency();
28  const size_t max_inputs_per_thread = 20000;
29  const size_t num_threads = std::min(
30  max_thread_count, ((num_rows + max_inputs_per_thread - 1) / max_inputs_per_thread));
31 
32  std::vector<T> local_col_mins(num_threads, std::numeric_limits<T>::max());
33  std::vector<T> local_col_maxes(num_threads, std::numeric_limits<T>::lowest());
34  std::vector<double> local_col_sums(num_threads, 0.);
35  std::vector<int64_t> local_col_non_null_or_filtered_counts(num_threads, 0L);
36  tbb::task_arena limited_arena(num_threads);
37  limited_arena.execute([&] {
38  tbb::parallel_for(
39  tbb::blocked_range<int64_t>(0, num_rows),
40  [&](const tbb::blocked_range<int64_t>& r) {
41  const int64_t start_idx = r.begin();
42  const int64_t end_idx = r.end();
43  T local_col_min = std::numeric_limits<T>::max();
44  T local_col_max = std::numeric_limits<T>::lowest();
45  double local_col_sum = 0.;
46  int64_t local_col_non_null_or_filtered_count = 0;
47  for (int64_t r = start_idx; r < end_idx; ++r) {
48  const T val = data[r];
49  if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
50  if (std::isnan(val) || std::isinf(val)) {
51  continue;
52  }
53  }
54  if (val == inline_null_value<T>()) {
55  continue;
56  }
57  if (!predicate(val)) {
58  continue;
59  }
60  if (val < local_col_min) {
61  local_col_min = val;
62  }
63  if (val > local_col_max) {
64  local_col_max = val;
65  }
66  local_col_sum += data[r];
67  local_col_non_null_or_filtered_count++;
68  }
69  size_t thread_idx = tbb::this_task_arena::current_thread_index();
70  if (local_col_min < local_col_mins[thread_idx]) {
71  local_col_mins[thread_idx] = local_col_min;
72  }
73  if (local_col_max > local_col_maxes[thread_idx]) {
74  local_col_maxes[thread_idx] = local_col_max;
75  }
76  local_col_sums[thread_idx] += local_col_sum;
77  local_col_non_null_or_filtered_counts[thread_idx] +=
78  local_col_non_null_or_filtered_count;
79  });
80  });
81 
82  ColumnStats<T> column_stats;
83  // Use separate double col_sum instead of column_stats.sum to avoid fp imprecision if T
84  // is float
85  double col_sum = 0.0;
86  column_stats.total_count = num_rows;
87 
88  for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
89  if (local_col_mins[thread_idx] < column_stats.min) {
90  column_stats.min = local_col_mins[thread_idx];
91  }
92  if (local_col_maxes[thread_idx] > column_stats.max) {
93  column_stats.max = local_col_maxes[thread_idx];
94  }
95  col_sum += local_col_sums[thread_idx];
96  column_stats.non_null_or_filtered_count +=
97  local_col_non_null_or_filtered_counts[thread_idx];
98  }
99 
100  if (column_stats.non_null_or_filtered_count > 0) {
101  column_stats.sum = col_sum;
102  column_stats.mean = col_sum / column_stats.non_null_or_filtered_count;
103  }
104  return column_stats;
105 }
const size_t max_inputs_per_thread
int64_t non_null_or_filtered_count
void parallel_for(const blocked_range< Int > &range, const Body &body, const Partitioner &p=Partitioner())

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
NEVER_INLINE HOST ColumnStats<T> get_column_stats ( const Column< T > &  col,
const StatsRequestPredicate &  predicate = StatsRequestPredicate() 
)

Definition at line 133 of file TableFunctionsStats.cpp.

References get_column_stats(), Column< T >::getPtr(), and Column< T >::size().

135  {
136  return get_column_stats(col.getPtr(), col.size(), predicate);
137 }
DEVICE int64_t size() const
NEVER_INLINE HOST ColumnStats< T > get_column_stats(const T *data, const int64_t num_rows, const StatsRequestPredicate &predicate)
DEVICE T * getPtr() const

+ Here is the call graph for this function:

std::vector<StatsRequest> parse_stats_requests_json ( const std::string &  stats_requests_json_str,
const int64_t  num_attrs 
)

Definition at line 211 of file TableFunctionsStats.cpp.

References StatsRequest::agg_type, StatsRequest::attr_id, convert_string_to_stats_request_agg_type(), convert_string_to_stats_request_predicate_op(), StatsRequest::filter_type, StatsRequest::filter_val, StatsRequest::name, NONE, replace_substrings(), to_string(), and shared::transform().

213  {
214  std::vector<StatsRequest> stats_requests;
215  rapidjson::Document doc;
216 
217  // remove double double quotes our parser introduces
218  const auto fixed_stats_requests_json_str =
219  replace_substrings(stats_requests_json_str, "\"\"", "\"");
220 
221  if (doc.Parse(fixed_stats_requests_json_str.c_str()).HasParseError()) {
222  // Not valid JSON
223  std::cout << "DEBUG: Failed JSON: " << fixed_stats_requests_json_str << std::endl;
224  throw std::runtime_error("Could not parse Stats Requests JSON.");
225  }
226  // Todo (todd): Enforce Schema
227  if (!doc.IsArray()) {
228  throw std::runtime_error("Stats Request JSON did not contain valid root Array.");
229  }
230  const std::vector<std::string> required_keys = {
231  "name", "attr_id", "agg_type", "filter_type"};
232 
233  for (const auto& stat_request_obj : doc.GetArray()) {
234  for (const auto& required_key : required_keys) {
235  if (!stat_request_obj.HasMember(required_key)) {
236  throw std::runtime_error("Stats Request JSON missing key " + required_key + ".");
237  }
238  if (required_key == "attr_id") {
239  if (!stat_request_obj[required_key].IsUint()) {
240  throw std::runtime_error(required_key + " must be int type");
241  }
242  } else {
243  if (!stat_request_obj[required_key].IsString()) {
244  throw std::runtime_error(required_key + " must be string type");
245  }
246  }
247  }
248  StatsRequest stats_request;
249  stats_request.name = stat_request_obj["name"].GetString();
250  stats_request.attr_id = stat_request_obj["attr_id"].GetInt() - 1;
251  if (stats_request.attr_id < 0 || stats_request.attr_id >= num_attrs) {
252  throw std::runtime_error("Invalid attr_id: " +
253  std::to_string(stats_request.attr_id));
254  }
255 
256  std::string agg_type_str = stat_request_obj["agg_type"].GetString();
257  std::transform(
258  agg_type_str.begin(), agg_type_str.end(), agg_type_str.begin(), ::toupper);
259  stats_request.agg_type = convert_string_to_stats_request_agg_type(agg_type_str);
260 
261  std::string filter_type_str = stat_request_obj["filter_type"].GetString();
262  std::transform(filter_type_str.begin(),
263  filter_type_str.end(),
264  filter_type_str.begin(),
265  ::toupper);
266  stats_request.filter_type =
267  convert_string_to_stats_request_predicate_op(filter_type_str);
268  if (stats_request.filter_type != StatsRequestPredicateOp::NONE) {
269  if (!stat_request_obj.HasMember("filter_val")) {
270  throw std::runtime_error("Stats Request JSON missing expected filter_val");
271  }
272  if (!stat_request_obj["filter_val"].IsNumber()) {
273  throw std::runtime_error("Stats Request JSON filter_val should be numeric.");
274  }
275  stats_request.filter_val = stat_request_obj["filter_val"].GetDouble();
276  }
277  stats_requests.emplace_back(stats_request);
278  }
279  return stats_requests;
280 }
StatsRequestPredicateOp convert_string_to_stats_request_predicate_op(const std::string &str)
std::string to_string(char const *&&v)
std::string replace_substrings(const std::string &str, const std::string &pattern_str, const std::string &replacement_str)
OUTPUT transform(INPUT const &input, FUNC const &func)
Definition: misc.h:329
StatsRequestAggType convert_string_to_stats_request_agg_type(const std::string &str)
StatsRequestPredicateOp filter_type
StatsRequestAggType agg_type

+ Here is the call graph for this function:

template<typename TA >
void populate_output_stats_cols ( Column< TextEncodingDict > &  stat_names,
Column< TA > &  stat_vals,
const std::vector< StatsRequest > &  stats_requests 
)

Definition at line 175 of file TableFunctionsStats.hpp.

References StringDictionaryProxy::getOrAddTransient(), setup::name, logger::request_id(), and Column< TextEncodingDict >::string_dict_proxy_.

177  {
178  const int64_t num_requests = static_cast<int64_t>(stats_requests.size());
179  for (int64_t request_id = 0; request_id < num_requests; ++request_id) {
180  stat_names[request_id] =
181  stat_names.string_dict_proxy_->getOrAddTransient(stats_requests[request_id].name);
182  stat_vals[request_id] = stats_requests[request_id].result;
183  }
184 }
StringDictionaryProxy * string_dict_proxy_
int32_t getOrAddTransient(const std::string &)
RequestId request_id()
Definition: Logger.cpp:876
string name
Definition: setup.in.py:72

+ Here is the call graph for this function: