26 #include <tbb/parallel_for.h>
27 #include <tbb/task_arena.h>
29 #define NANOSECONDS_PER_SECOND 1000000000
33 T col_min = std::numeric_limits<T>::max();
34 T col_max = std::numeric_limits<T>::lowest();
35 const int64_t num_rows = col.
size();
36 const size_t max_thread_count = std::thread::hardware_concurrency();
38 const size_t num_threads = std::min(
39 max_thread_count, ((num_rows + max_inputs_per_thread - 1) / max_inputs_per_thread));
41 std::vector<T> local_col_mins(num_threads, std::numeric_limits<T>::max());
42 std::vector<T> local_col_maxes(num_threads, std::numeric_limits<T>::lowest());
43 tbb::task_arena limited_arena(num_threads);
45 limited_arena.execute([&] {
47 tbb::blocked_range<int64_t>(0, num_rows),
48 [&](
const tbb::blocked_range<int64_t>& r) {
49 const int64_t start_idx = r.begin();
50 const int64_t end_idx = r.end();
51 T local_col_min = std::numeric_limits<T>::max();
52 T local_col_max = std::numeric_limits<T>::lowest();
53 for (int64_t r = start_idx; r < end_idx; ++r) {
55 if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
56 if (std::isnan(val) || std::isinf(val)) {
60 if (val == inline_null_value<T>()) {
63 if (val < local_col_min) {
66 if (val > local_col_max) {
70 size_t thread_idx = tbb::this_task_arena::current_thread_index();
71 if (local_col_min < local_col_mins[thread_idx]) {
72 local_col_mins[thread_idx] = local_col_min;
74 if (local_col_max > local_col_maxes[thread_idx]) {
75 local_col_maxes[thread_idx] = local_col_max;
80 for (
size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
81 if (local_col_mins[thread_idx] < col_min) {
82 col_min = local_col_mins[thread_idx];
84 if (local_col_maxes[thread_idx] > col_max) {
85 col_max = local_col_maxes[thread_idx];
88 return std::make_pair(col_min, col_max);
115 template <
typename T>
118 const size_t max_thread_count = std::thread::hardware_concurrency();
120 const size_t num_threads = std::min(
121 max_thread_count, ((num_rows + max_inputs_per_thread - 1) / max_inputs_per_thread));
123 std::vector<double> local_col_sums(num_threads, 0.);
124 std::vector<int64_t> local_col_non_null_counts(num_threads, 0L);
125 tbb::task_arena limited_arena(num_threads);
126 limited_arena.execute([&] {
128 tbb::blocked_range<int64_t>(0, num_rows),
129 [&](
const tbb::blocked_range<int64_t>& r) {
130 const int64_t start_idx = r.begin();
131 const int64_t end_idx = r.end();
132 double local_col_sum = 0.;
133 int64_t local_col_non_null_count = 0;
134 for (int64_t r = start_idx; r < end_idx; ++r) {
135 const T val = data[r];
136 if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
137 if (std::isnan(val) || std::isinf(val)) {
141 if (val == inline_null_value<T>()) {
144 local_col_sum += data[r];
145 local_col_non_null_count++;
147 size_t thread_idx = tbb::this_task_arena::current_thread_index();
148 local_col_sums[thread_idx] += local_col_sum;
149 local_col_non_null_counts[thread_idx] += local_col_non_null_count;
153 double col_sum = 0.0;
154 int64_t col_non_null_count = 0L;
156 for (
size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
157 col_sum += local_col_sums[thread_idx];
158 col_non_null_count += local_col_non_null_counts[thread_idx];
161 return col_non_null_count == 0 ? 0 : col_sum / col_non_null_count;
165 const int64_t num_rows);
168 const int64_t num_rows);
171 const int64_t num_rows);
174 const int64_t num_rows);
177 const int64_t num_rows);
180 const int64_t num_rows);
182 template <
typename T>
194 template <
typename T>
208 template <
typename T>
210 const int64_t num_rows,
213 const size_t max_thread_count = std::thread::hardware_concurrency();
215 const size_t num_threads = std::min(
216 max_thread_count, ((num_rows + max_inputs_per_thread - 1) / max_inputs_per_thread));
218 std::vector<double> local_col_squared_residuals(num_threads, 0.);
219 std::vector<int64_t> local_col_non_null_counts(num_threads, 0L);
220 tbb::task_arena limited_arena(num_threads);
222 limited_arena.execute([&] {
224 tbb::blocked_range<int64_t>(0, num_rows),
225 [&](
const tbb::blocked_range<int64_t>& r) {
226 const int64_t start_idx = r.begin();
227 const int64_t end_idx = r.end();
228 double local_col_squared_residual = 0.;
229 int64_t local_col_non_null_count = 0;
230 for (int64_t r = start_idx; r < end_idx; ++r) {
231 const T val = data[r];
232 if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
233 if (std::isnan(val) || std::isinf(val)) {
237 if (val == inline_null_value<T>()) {
240 const double residual = val - mean;
241 local_col_squared_residual += (residual * residual);
242 local_col_non_null_count++;
244 size_t thread_idx = tbb::this_task_arena::current_thread_index();
245 local_col_squared_residuals[thread_idx] += local_col_squared_residual;
246 local_col_non_null_counts[thread_idx] += local_col_non_null_count;
250 double col_sum_squared_residual = 0.0;
251 int64_t col_non_null_count = 0;
253 for (
size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
254 col_sum_squared_residual += local_col_squared_residuals[thread_idx];
255 col_non_null_count += local_col_non_null_counts[thread_idx];
258 return col_non_null_count == 0 ? 0
259 : sqrt(col_sum_squared_residual / col_non_null_count);
263 const int64_t num_rows,
266 const int64_t num_rows,
269 const int64_t num_rows,
272 const int64_t num_rows,
275 template <
typename T>
277 T col_min = std::numeric_limits<T>::max();
278 T col_max = std::numeric_limits<T>::lowest();
279 bool has_nulls =
false;
280 const int64_t num_rows = col.
size();
281 const size_t max_thread_count = std::thread::hardware_concurrency();
283 const size_t num_threads = std::min(
284 max_thread_count, ((num_rows + max_inputs_per_thread - 1) / max_inputs_per_thread));
286 std::vector<T> local_col_mins(num_threads, std::numeric_limits<T>::max());
287 std::vector<T> local_col_maxes(num_threads, std::numeric_limits<T>::lowest());
288 std::vector<bool> local_col_has_nulls(num_threads,
false);
289 tbb::task_arena limited_arena(num_threads);
291 limited_arena.execute([&] {
293 tbb::blocked_range<int64_t>(0, num_rows),
294 [&](
const tbb::blocked_range<int64_t>& r) {
295 const int64_t start_idx = r.begin();
296 const int64_t end_idx = r.end();
297 T local_col_min = std::numeric_limits<T>::max();
298 T local_col_max = std::numeric_limits<T>::lowest();
299 bool local_has_nulls =
false;
300 for (int64_t r = start_idx; r < end_idx; ++r) {
301 const T val = col[r];
302 if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
303 if (std::isnan(val) || std::isinf(val)) {
308 local_has_nulls =
true;
311 if (val < local_col_min) {
314 if (val > local_col_max) {
318 const size_t thread_idx = tbb::this_task_arena::current_thread_index();
319 if (local_has_nulls) {
320 local_col_has_nulls[thread_idx] =
true;
322 if (local_col_min < local_col_mins[thread_idx]) {
323 local_col_mins[thread_idx] = local_col_min;
325 if (local_col_max > local_col_maxes[thread_idx]) {
326 local_col_maxes[thread_idx] = local_col_max;
331 for (
size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
332 if (local_col_has_nulls[thread_idx]) {
335 if (local_col_mins[thread_idx] < col_min) {
336 col_min = local_col_mins[thread_idx];
338 if (local_col_maxes[thread_idx] > col_max) {
339 col_max = local_col_maxes[thread_idx];
342 return {col_min, col_max, has_nulls};
364 template <
typename T>
367 const int64_t num_rows,
369 const double std_dev) {
370 if (std_dev <= 0.0) {
371 throw std::runtime_error(
"Standard deviation cannot be <= 0");
373 const double inv_std_dev = 1.0 / std_dev;
376 [&](
const tbb::blocked_range<int64_t>& r) {
377 const int64_t start_idx = r.begin();
378 const int64_t end_idx = r.end();
379 for (int64_t row_idx = start_idx; row_idx < end_idx; ++row_idx) {
380 output_data[row_idx] = (input_data[row_idx] - mean) * inv_std_dev;
387 const int64_t num_rows,
389 const double std_dev);
392 const int64_t num_rows,
394 const double std_dev);
396 template <
typename T>
398 const int64_t num_rows) {
399 const int64_t num_features = input_data.size();
400 std::vector<std::vector<T>> normalized_data(num_features);
401 for (int64_t feature_idx = 0; feature_idx < num_features; ++feature_idx) {
404 normalized_data[feature_idx].resize(num_rows);
406 normalized_data[feature_idx].data(),
411 return normalized_data;
415 const std::vector<float*>& input_data,
416 const int64_t num_rows);
418 const std::vector<double*>& input_data,
419 const int64_t num_rows);
421 template <
typename T>
423 const std::vector<T*>& input_data,
424 const int64_t num_rows) {
425 const int64_t num_features = input_data.size();
426 std::vector<std::vector<T>> normalized_data(num_features);
427 std::vector<T> means(num_features);
428 std::vector<T> std_devs(num_features);
429 for (int64_t feature_idx = 0; feature_idx < num_features; ++feature_idx) {
430 means[feature_idx] =
get_column_mean(input_data[feature_idx], num_rows);
431 std_devs[feature_idx] =
433 normalized_data[feature_idx].resize(num_rows);
435 normalized_data[feature_idx].data(),
438 std_devs[feature_idx]);
444 const std::vector<float*>& input_data,
445 const int64_t num_rows);
447 const std::vector<double*>& input_data,
448 const int64_t num_rows);
450 template <
typename T1,
typename T2>
453 T1 latitudeArc = (fromlat - tolat) * 0.017453292519943295769236907684886;
454 T1 longitudeArc = (fromlon - tolon) * 0.017453292519943295769236907684886;
455 T1 latitudeH = sin(latitudeArc * 0.5);
456 latitudeH *= latitudeH;
457 T1 lontitudeH = sin(longitudeArc * 0.5);
458 lontitudeH *= lontitudeH;
459 T1 tmp = cos(fromlat * 0.017453292519943295769236907684886) *
460 cos(tolat * 0.017453292519943295769236907684886);
461 return 6372797.560856 * (2.0 * asin(sqrt(latitudeH + tmp * lontitudeH)));
475 const double fromlat,
480 const double fromlat,
484 namespace FileUtilities {
/**
 * @brief Translates a file-name glob into an equivalent std::regex.
 *
 * Every character that has a special meaning in an ECMAScript regular
 * expression (`\ ^ . $ | ( ) { } [ ] + /`) is emitted with a leading backslash
 * so that it matches itself literally. The two glob wildcards are translated:
 * `?` becomes `.` (exactly one character) and `*` becomes `.*` (any run of
 * characters, including none). Because `[` and `]` are escaped, glob character
 * classes are not supported and brackets match literally.
 *
 * The translation is done in one pass over the input. The previous chain of
 * regex_replace calls escaped `{` twice and never escaped `}`, which turned
 * `{` into an escaped backslash followed by a bare `}`.
 *
 * @param glob            Glob pattern to translate.
 * @param case_sensitive  When false the resulting regex ignores case.
 *                        NOTE(review): the default value (false) is assumed to
 *                        be supplied by the function's declaration, so it is
 *                        not repeated on this definition - confirm.
 * @return A std::regex using the ECMAScript grammar.
 */
std::regex glob_to_regex(const std::string& glob, bool case_sensitive) {
  std::string regex_string;
  // Worst case every input character turns into two output characters.
  regex_string.reserve(glob.size() * 2);

  for (const char c : glob) {
    switch (c) {
      // Regex metacharacters: match them literally.
      case '\\':
      case '^':
      case '.':
      case '$':
      case '|':
      case '(':
      case ')':
      case '{':
      case '}':
      case '[':
      case ']':
      case '+':
      case '/':
        regex_string += '\\';
        regex_string += c;
        break;
      // Glob wildcards.
      case '?':
        regex_string += '.';
        break;
      case '*':
        regex_string += ".*";
        break;
      default:
        regex_string += c;
        break;
    }
  }

  // Name the grammar explicitly in both branches instead of relying on the
  // "no grammar given means ECMAScript" fallback when only icase is set.
  const std::regex_constants::syntax_option_type flags =
      case_sensitive
          ? std::regex_constants::ECMAScript
          : (std::regex_constants::ECMAScript | std::regex_constants::icase);
  return std::regex(regex_string, flags);
}
515 std::vector<std::filesystem::path>
get_fs_paths(
const std::string& file_or_directory) {
516 const std::filesystem::path file_or_directory_path(file_or_directory);
517 const auto file_status = std::filesystem::status(file_or_directory_path);
519 std::vector<std::filesystem::path> fs_paths;
520 if (std::filesystem::is_regular_file(file_status)) {
521 fs_paths.emplace_back(file_or_directory_path);
523 }
else if (std::filesystem::is_directory(file_status)) {
524 for (std::filesystem::directory_entry
const& entry :
525 std::filesystem::directory_iterator(file_or_directory_path)) {
526 if (std::filesystem::is_regular_file(std::filesystem::status(entry))) {
527 fs_paths.emplace_back(entry.path());
532 const auto parent_path = file_or_directory_path.parent_path();
533 const auto parent_status = std::filesystem::status(parent_path);
534 if (std::filesystem::is_directory(parent_status)) {
535 const auto file_glob = file_or_directory_path.filename();
536 const std::regex glob_regex{
glob_to_regex(file_glob.string(),
false)};
538 for (std::filesystem::directory_entry
const& entry :
539 std::filesystem::directory_iterator(parent_path)) {
540 if (std::filesystem::is_regular_file(std::filesystem::status(entry))) {
541 const auto entry_filename = entry.path().filename().string();
542 if (std::regex_match(entry_filename, glob_regex)) {
543 fs_paths.emplace_back(entry.path());
555 template <
typename T>
560 switch (bounds_type) {
562 switch (interval_type) {
564 return input >= bounds_val;
566 return input > bounds_val;
571 switch (interval_type) {
573 return input <= bounds_val;
575 return input < bounds_val;
588 const int32_t bounds_val,
593 const int64_t bounds_val,
598 const float bounds_val,
603 const double bounds_val,
607 #endif // #ifndef __CUDACC__
std::regex glob_to_regex(const std::string &glob, bool case_sensitive=false)
NEVER_INLINE HOST std::pair< T, T > get_column_min_max(const Column< T > &col)
DEVICE int64_t size() const
DEVICE T * getPtr() const
void z_std_normalize_col(const T *input_data, T *output_data, const int64_t num_rows, const double mean, const double std_dev)
std::vector< std::filesystem::path > get_fs_paths(const std::string &file_or_directory)
const size_t max_inputs_per_thread
DEVICE TextEncodingDict * getPtr() const
DEVICE bool isNull(int64_t index) const
void parallel_for(const blocked_range< Int > &range, const Body &body, const Partitioner &p=Partitioner())
EXTENSION_NOINLINE double distance_in_meters(const double fromlon, const double fromlat, const double tolon, const double tolat)
Computes the distance, in meters, between two WGS-84 positions.
NEVER_INLINE HOST std::tuple< T, T, bool > get_column_metadata(const Column< T > &col)
std::vector< std::vector< T > > z_std_normalize_data(const std::vector< T * > &input_data, const int64_t num_rows)
NEVER_INLINE HOST double get_column_std_dev(const Column< T > &col, const double mean)
DEVICE int64_t size() const
std::vector< std::string > glob(const std::string &pattern)
ZStdNormalizationSummaryStats< T > z_std_normalize_data_with_summary_stats(const std::vector< T * > &input_data, const int64_t num_rows)
NEVER_INLINE HOST bool is_valid_tf_input(const T input, const T bounds_val, const BoundsType bounds_type, const IntervalType interval_type)
NEVER_INLINE HOST double get_column_mean(const T *data, const int64_t num_rows)