23 #include <tbb/parallel_for.h>
24 #include <tbb/parallel_sort.h>
26 namespace TableFunctions_Namespace {
28 namespace OneHotEncoder_Namespace {
46 const double min_perc_col_total_per_key) {
// Visible part of get_top_k_keys: counts how often each key occurs, ranks the keys by
// that count and collects the leading ones. Returns (top_k_keys, has_other_keys).
// NOTE(review): the start of the signature and several statements are not visible in
// this view; the comments below describe only the lines that are shown.
49 const int32_t col_size = text_col.
size();
// Width of the inclusive key interval [col_min_max.first, col_min_max.second].
// col_min_max is defined on a line not shown — presumably the column's min/max key.
50 const int32_t col_range = col_min_max.second - col_min_max.first + 1;
// One counter per possible key, addressed by (key - col_min_max.first).
64 std::vector<int32_t> key_counts(col_range, 0);
65 for (int32_t idx = 0; idx < col_size; ++idx) {
// Null rows do not contribute to any key's count.
66 if (!text_col.
isNull(idx)) {
67 const int32_t key_idx = text_col[idx] - col_min_max.first;
68 key_counts[key_idx]++;
// Index array over key_counts; the lambda below writes the identity permutation
// (permutation_idxs[p] = p) for whatever sub-range it is handed.
72 std::vector<int32_t> permutation_idxs(col_range);
75 [&](
const tbb::blocked_range<int32_t>& r) {
76 const int32_t r_end = r.end();
77 for (int32_t p = r.begin(); p < r_end; ++p) {
78 permutation_idxs[p] = p;
// The comparator below places indices with larger counts first (descending frequency).
// The call receiving these arguments is not visible — presumably a parallel sort.
84 permutation_idxs.begin(),
85 permutation_idxs.begin() + col_range,
86 [&](
const int32_t&
a,
const int32_t& b) {
return key_counts[
a] > key_counts[b]; });
// Never collect more keys than the key interval contains.
87 int32_t actual_top_k = std::min(col_range, top_k);
88 std::vector<int32_t> top_k_keys;
89 top_k_keys.reserve(actual_top_k);
// NOTE(review): the per-key share is computed in float but compared against a double
// threshold, and float cannot represent every row count above 2^24 exactly — confirm
// this precision is acceptable.
90 const float col_size_fp =
static_cast<float>(col_size);
// k is declared on a line not shown; it is read again after the loop ends.
95 for (; k < actual_top_k; ++k) {
96 const int32_t key_counts_idx = permutation_idxs[k];
97 const int32_t key_count = key_counts[key_counts_idx];
// Share of this key relative to col_size (the value of text_col.size()).
98 const float key_count_perc = key_count / col_size_fp;
// The body of this branch is not visible — presumably it stops collecting keys.
99 if (key_count_perc < min_perc_col_total_per_key) {
// Convert the zero-based counter index back into the actual key value.
102 top_k_keys.emplace_back(key_counts_idx + col_min_max.first);
// True when the first key that was not collected still has a non-zero count.
104 const bool has_other_keys = k < col_range && key_counts[permutation_idxs[k]] > 0;
105 return std::make_pair(top_k_keys, has_other_keys);
// Visible part of allocate_one_hot_cols<F>: creates num_one_hot_cols buffers and sizes
// each one to col_size zero-valued elements, and returns the vector of buffers.
// NOTE(review): several lines of this definition are not visible in this view; the
// comments below describe only the lines that are shown.
119 template <
typename F>
121 const int64_t num_one_hot_cols,
122 const int64_t col_size) {
// Outer vector holds one (initially empty) buffer per one-hot column.
123 std::vector<std::vector<F>> one_hot_allocated_buffers(num_one_hot_cols);
// Target columns per task: ceil(100000 / (col_size + 1)), so longer columns mean fewer
// columns per task. The +1 keeps the divisor non-zero when col_size is 0.
124 const int64_t target_num_col_allocations_per_thread =
125 std::ceil(100000.0 / (col_size + 1));
// ThreadInfo's middle argument sits on a line not shown.
126 const ThreadInfo thread_info(std::thread::hardware_concurrency(),
128 target_num_col_allocations_per_thread);
// Futures for the allocation tasks; the launch call itself is not visible.
131 std::vector<std::future<void>> allocator_threads;
132 for (int64_t col_idx = 0; col_idx < num_one_hot_cols;
// Task body: the buffer vector and thread_info are captured by reference, the two
// sizes by value. NOTE(review): the reference captures require every task to finish
// before this function returns; the wait is not visible here — confirm.
136 [&one_hot_allocated_buffers, num_one_hot_cols, col_size, &thread_info](
137 const int64_t start_col_idx) {
138 const int64_t end_col_idx = std::min(
140 for (int64_t alloc_col_idx = start_col_idx; alloc_col_idx < end_col_idx;
// resize() both allocates the column and fills it with zeros.
142 one_hot_allocated_buffers[alloc_col_idx].resize(col_size, 0);
147 return one_hot_allocated_buffers;
// Parameter tails of two further declarations taking (num_one_hot_cols, col_size) —
// presumably explicit instantiations; their element types are not visible.
151 const int64_t num_one_hot_cols,
152 const int64_t col_size);
154 const int64_t num_one_hot_cols,
155 const int64_t col_size);
// Visible part of get_min_max_keys(top_k_keys): returns (min_key, max_key) as a pair.
// Both trackers start at the opposite extreme of int32_t — presumably so that the
// first key examined replaces them.
165 int32_t min_key = std::numeric_limits<int32_t>::max();
166 int32_t max_key = std::numeric_limits<int32_t>::lowest();
// The loop body is not visible — presumably it narrows min_key / max_key per key.
// NOTE(review): if that is all it does, an empty top_k_keys yields
// (INT32_MAX, INT32_MIN); confirm callers handle that case.
167 for (
const auto& key : top_k_keys) {
178 return std::make_pair(min_key, max_key);
// Tail of the KeyToOneHotColBytemap constructor's parameter list; the first parameter,
// the initializer list and the body are not visible in this view.
191 const int32_t min_key,
192 const int32_t max_key,
193 const bool has_other_key)
// init_bytemap: builds the table that maps (key - min_key) to a one-hot column index.
// Throws std::runtime_error when top_k_keys holds
// std::numeric_limits<int16_t>::max() or more entries.
200 static std::vector<int16_t>
init_bytemap(
const std::vector<int32_t>& top_k_keys,
201 const int32_t min_key,
202 const int32_t max_key,
203 const bool has_other_key) {
// Column indices are stored as int16_t, so the key count must stay below its maximum.
208 if (static_cast<int64_t>(top_k_keys.size()) >= std::numeric_limits<int16_t>::max()) {
209 std::ostringstream error_oss;
210 error_oss <<
"Error: More than " << std::numeric_limits<int16_t>::max() - 1
211 <<
" top k categorical keys not allowed.";
212 throw std::runtime_error(error_oss.str());
// One slot per value in [min_key, max_key]; the fill value is on a line not shown.
// NOTE(review): max_key - min_key + 1 is evaluated in int32_t and could overflow for
// a very wide key span — confirm callers bound the span before calling.
214 std::vector<int16_t> bytemap(max_key - min_key + 1,
// Keys receive consecutive column indices in top_k_keys order; offset is declared on
// a line not shown.
217 for (
const auto& key : top_k_keys) {
218 bytemap[key - min_key] = offset++;
// Range check from a lookup member (presumably get_col_idx_for_key): keys outside
// [min_key_, max_key_] have no slot in the table. The branch body is not visible.
224 if (key < min_key_ || key >
max_key_) {
// Visible part of the single-column one_hot_encode<F>: determines the keys to encode,
// allocates one zero-filled buffer per output column and walks the rows in chunks.
// Returns one_hot_encoded_col. Throws std::runtime_error when the key span exceeds
// max_bytemap_size.
// NOTE(review): many lines of this definition are not visible in this view; the
// comments below describe only the lines that are shown.
237 template <
typename F>
241 one_hot_encoding_info) {
245 bool include_others_key =
false;
// NOTE(review): declared as std::vector<int> while the key helpers in this file use
// std::vector<int32_t>; the two only coincide where int is 32 bits wide.
246 std::vector<int> top_k_keys;
// The right-hand side of this structured binding is not visible — presumably a call
// to get_top_k_keys. The bound vector is then copied into top_k_keys.
251 const auto [top_k_keys_temp, has_other_keys] =
255 top_k_keys = top_k_keys_temp;
// When the (partly hidden) condition holds and more than one key remains, the last
// key is dropped.
262 top_k_keys.size() > 1) {
263 top_k_keys.pop_back();
// The bodies of the next two loops are not visible.
265 for (
const auto top_k_key : top_k_keys) {
271 for (
const auto& cat_feature : one_hot_encoded_col.
cat_features) {
// One output column per selected key, plus one more when include_others_key is set.
276 const int64_t num_one_hot_cols = top_k_keys.size() + (include_others_key ? 1 : 0);
277 const int64_t col_size = text_col.
size();
279 allocate_one_hot_cols<F>(num_one_hot_cols, col_size);
// Largest key span (max_key - min_key + 1) accepted for the lookup table.
280 constexpr int64_t max_bytemap_size = 10000000L;
// NOTE(review): min_key / max_key are defined on a line not shown; if they are
// int32_t, the subtraction happens in 32 bits before widening to int64_t — confirm.
283 const int64_t key_range = max_key - min_key + 1;
284 if (key_range > max_bytemap_size) {
285 throw std::runtime_error(
286 "One-hot vectors currently can only be generated on string columns with less "
// Arguments in the KeyToOneHotColBytemap constructor's parameter order — presumably
// building the key-to-column lookup.
291 top_k_keys, min_key, max_key, include_others_key);
// Row-chunk worker: reads the key of every row in [r.begin(), r.end()); what it does
// with the key is not visible.
294 [&](
const tbb::blocked_range<int64_t>& r) {
295 const int64_t r_end = r.end();
296 for (int64_t row_idx = r.begin(); row_idx < r_end; ++row_idx) {
297 const int32_t key = text_col[row_idx];
306 return one_hot_encoded_col;
// Parameter tails of two further declarations — presumably explicit instantiations;
// their element types are not visible.
312 one_hot_encoding_info);
317 one_hot_encoding_info);
// Visible part of the multi-column one_hot_encode<F>: encodes each input column in
// turn with the single-column overload and returns the results in input-column order.
// NOTE(review): the start of the signature and parts of the loop body are not visible
// in this view.
319 template <
typename F>
324 one_hot_encoding_infos) {
325 const int64_t num_input_cols = text_cols.
numCols();
327 std::vector<OneHotEncodedCol<F>> one_hot_encoded_cols;
// Reserve up front: the visible loop appends one result per input column.
328 one_hot_encoded_cols.reserve(num_input_cols);
329 for (int64_t input_col_idx = 0; input_col_idx < num_input_cols; ++input_col_idx) {
// First argument of a construction whose start is not shown — presumably the
// single-column view dummy_text_col over column input_col_idx.
331 reinterpret_cast<TextEncodingDict*>(text_cols.
ptrs_[input_col_idx]),
// NOTE(review): one_hot_encoding_infos is indexed without a visible size check —
// presumably it holds one entry per input column; confirm against the caller.
334 one_hot_encoded_cols.emplace_back(
335 one_hot_encode<F>(dummy_text_col, one_hot_encoding_infos[input_col_idx]));
337 return one_hot_encoded_cols;
// Parameter tails of two further declarations — presumably explicit instantiations;
// their element types are not visible.
344 one_hot_encoding_infos);
350 one_hot_encoding_infos);
356 #endif // #ifndef __CUDACC__
std::pair< int32_t, int32_t > get_min_max_keys(const std::vector< int32_t > &top_k_keys)
Finds the minimum and maximum keys in a given vector of keys and returns them as a pair...
int64_t num_elems_per_thread
NEVER_INLINE HOST OneHotEncodedCol< F > one_hot_encode(const Column< TextEncodingDict > &text_col, const TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodingInfo &one_hot_encoding_info)
Takes a column of text-encoded data and one-hot encoding information as input. It performs the one-ho...
NEVER_INLINE HOST std::pair< T, T > get_column_min_max(const Column< T > &col)
std::string getString(int32_t string_id) const
float min_attr_proportion
future< Result > async(Fn &&fn, Args &&...args)
NEVER_INLINE HOST std::vector< std::vector< F > > allocate_one_hot_cols(const int64_t num_one_hot_cols, const int64_t col_size)
Allocates memory for the one-hot encoded columns and initializes them to zero. It takes the number of...
static constexpr int32_t INVALID_STR_ID
StringDictionaryProxy ** string_dict_proxies_
NEVER_INLINE HOST std::pair< std::vector< int32_t >, bool > get_top_k_keys(const Column< TextEncodingDict > &text_col, const int32_t top_k, const double min_perc_col_total_per_key)
This function calculates the top k most frequent keys (categories) in the provided column based on a ...
StringDictionaryProxy * string_dict_proxy_
std::vector< std::string > cat_features
const bool has_other_key_
const std::vector< int16_t > bytemap_
std::vector< std::vector< F > > encoded_buffers
DEVICE int64_t numCols() const
static std::vector< int16_t > init_bytemap(const std::vector< int32_t > &top_k_keys, const int32_t min_key, const int32_t max_key, const bool has_other_key)
std::vector< std::string > cat_features
DEVICE bool isNull(int64_t index) const
A struct that creates a bytemap to map each key to its corresponding one-hot column index...
void parallel_for(const blocked_range< Int > &range, const Body &body, const Partitioner &p=Partitioner())
#define DEBUG_TIMER(name)
DEVICE int64_t size() const
constexpr int16_t INVALID_COL_IDX
int16_t get_col_idx_for_key(const int32_t key) const
int32_t getIdOfString(const std::string &str) const
KeyToOneHotColBytemap(const std::vector< int32_t > &top_k_keys, const int32_t min_key, const int32_t max_key, const bool has_other_key)