#include <cstdint>
#include <map>
#include <optional>
#include <unordered_map>
#include <vector>
namespace Fragmenter_Namespace {
// FragmentInfo is used by the Fragmenter classes to store info about each
// fragment - the fragment id and the number of tuples (rows) currently stored
// by that fragment.
class FragmentInfo;
}  // namespace Fragmenter_Namespace

namespace Data_Namespace {
struct MemoryInfo;
}  // namespace Data_Namespace

class Executor;

using TableFragments = std::vector<Fragmenter_Namespace::FragmentInfo>;

struct FragmentsPerTable {
  shared::TableKey table_key;
  std::vector<size_t> fragment_ids;
};

using FragmentsList = std::vector<FragmentsPerTable>;

struct ExecutionKernelDescriptor {
  int device_id;
  FragmentsList fragments;
  std::optional<size_t> outer_tuple_count;
};
class QueryFragmentDescriptor {
 public:
  QueryFragmentDescriptor(const RelAlgExecutionUnit& ra_exe_unit,
                          const std::vector<InputTableInfo>& query_infos,
                          const std::vector<Data_Namespace::MemoryInfo>& gpu_mem_infos,
                          const double gpu_input_mem_limit_percent,
                          const std::vector<size_t> allowed_outer_fragment_indices);
  static void computeAllTablesFragments(
      std::map<shared::TableKey, const TableFragments*>& all_tables_fragments,
      const RelAlgExecutionUnit& ra_exe_unit,
      const std::vector<InputTableInfo>& query_infos);
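  // Usage sketch (illustrative only, not from this header): collect fragment
  // metadata for every input table of an execution unit before kernel
  // assignment; `ra_exe_unit` and `query_infos` are assumed to be in scope.
  //
  //   std::map<shared::TableKey, const TableFragments*> all_tables_fragments;
  //   QueryFragmentDescriptor::computeAllTablesFragments(
  //       all_tables_fragments, ra_exe_unit, query_infos);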
  void buildFragmentKernelMap(const RelAlgExecutionUnit& ra_exe_unit,
                              const std::vector<uint64_t>& frag_offsets,
                              const int device_count,
                              const ExecutorDeviceType& device_type,
                              const bool enable_multifrag_kernels,
                              const bool enable_inner_join_fragment_skipping,
                              Executor* executor);
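  // Note (inferred from the protected builders below, not original text): with
  // enable_multifrag_kernels set this entry point is expected to populate the
  // kernel map via buildMultifragKernelMap(), otherwise via the per-kernel
  // builders.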
  // Dispatch multifragment kernels: each device carries exactly one kernel,
  // with multiple fragments in its fragments list.
  template <typename DISPATCH_FCN>
  void assignFragsToMultiDispatch(DISPATCH_FCN f) const {
    for (const auto& device_itr : execution_kernels_per_device_) {
      const auto& execution_kernels = device_itr.second;
      CHECK_EQ(execution_kernels.size(), size_t(1));
      const auto& fragments_list = execution_kernels.front().fragments;
      f(device_itr.first, fragments_list, rowid_lookup_key_);
    }
  }
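  // Usage sketch (hypothetical, for illustration): the dispatch functor
  // receives the device id, the per-table fragment lists, and the rowid
  // lookup key, e.g.
  //
  //   descriptor.assignFragsToMultiDispatch(
  //       [&](int device_id, const FragmentsList& frag_list, int64_t rowid_key) {
  //         // enqueue a single multi-fragment kernel for this device
  //       });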
  // Dispatch one kernel per device per pass, iterating the devices
  // round-robin. This balances work across devices and allows early
  // termination once the dispatched fragments satisfy the query's scan limit.
  template <typename DISPATCH_FCN>
  void assignFragsToKernelDispatch(DISPATCH_FCN f,
                                   const RelAlgExecutionUnit& ra_exe_unit) const {
    size_t tuple_count = 0;

    std::unordered_map<int, size_t> execution_kernel_index;
    for (const auto& device_itr : execution_kernels_per_device_) {
      CHECK(execution_kernel_index.insert(std::make_pair(device_itr.first, size_t(0)))
                .second);
    }

    bool dispatch_finished = false;
    while (!dispatch_finished) {
      dispatch_finished = true;
      for (const auto& device_itr : execution_kernels_per_device_) {
        auto& kernel_idx = execution_kernel_index[device_itr.first];
        if (kernel_idx < device_itr.second.size()) {
          dispatch_finished = false;
          const auto& execution_kernel = device_itr.second[kernel_idx++];
          f(device_itr.first, execution_kernel.fragments, rowid_lookup_key_);
          if (terminateDispatchMaybe(tuple_count, ra_exe_unit, execution_kernel)) {
            return;
          }
        }
      }
    }
  }
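  // Usage sketch (hypothetical): per-kernel dispatch takes the same functor
  // shape as above; terminateDispatchMaybe() lets a satisfied scan limit stop
  // dispatch before every kernel has been issued, e.g.
  //
  //   descriptor.assignFragsToKernelDispatch(
  //       [&](int device_id, const FragmentsList& frag_list, int64_t rowid_key) {
  //         // enqueue one kernel covering frag_list on this device
  //       },
  //       ra_exe_unit);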
 protected:
  void buildFragmentPerKernelMapForUnion(const RelAlgExecutionUnit& ra_exe_unit,
                                         const std::vector<uint64_t>& frag_offsets,
                                         const int device_count,
                                         const size_t num_bytes_for_row,
                                         const ExecutorDeviceType& device_type,
                                         Executor* executor);
  void buildFragmentPerKernelMap(const RelAlgExecutionUnit& ra_exe_unit,
                                 const std::vector<uint64_t>& frag_offsets,
                                 const int device_count,
                                 const size_t num_bytes_for_row,
                                 const ExecutorDeviceType& device_type,
                                 Executor* executor);
  void buildMultifragKernelMap(const RelAlgExecutionUnit& ra_exe_unit,
                               const std::vector<uint64_t>& frag_offsets,
                               const int device_count,
                               const size_t num_bytes_for_row,
                               const ExecutorDeviceType& device_type,
                               const bool enable_inner_join_fragment_skipping,
                               Executor* executor);
  void buildFragmentPerKernelForTable(const TableFragments* fragments,
                                      const RelAlgExecutionUnit& ra_exe_unit,
                                      const InputDescriptor& table_desc,
                                      const bool is_temporary_table,
                                      const std::vector<uint64_t>& frag_offsets,
                                      const int device_count,
                                      const size_t num_bytes_for_row,
                                      const ChunkMetadataVector& deleted_chunk_metadata_vec,
                                      const std::optional<size_t> table_desc_offset,
                                      const ExecutorDeviceType& device_type,
                                      Executor* executor);
  void checkDeviceMemoryUsage(const Fragmenter_Namespace::FragmentInfo& fragment,
                              const int device_id,
                              const size_t num_cols);
  bool terminateDispatchMaybe(size_t& tuple_count,
                              const RelAlgExecutionUnit& ra_exe_unit,
                              const ExecutionKernelDescriptor& kernel) const;

  bool shouldCheckWorkUnitWatchdog() const;

  std::vector<size_t> allowed_outer_fragment_indices_;
  size_t outer_fragments_size_;
  int64_t rowid_lookup_key_;
  std::map<shared::TableKey, const TableFragments*> selected_tables_fragments_;
  std::map<int, std::vector<ExecutionKernelDescriptor>> execution_kernels_per_device_;
  double gpu_input_mem_limit_percent_;
  std::map<size_t, size_t> tuple_count_per_device_;
  std::map<size_t, size_t> available_gpu_mem_bytes_;
};
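// The sketch below is illustrative only (not part of this header): it shows
// the expected driving sequence for the class, assuming a caller already
// holds a RelAlgExecutionUnit, the input table infos, GPU memory infos, and
// an Executor. The name `dispatch_kernel` is a hypothetical functor taking
// (int device_id, const FragmentsList&, int64_t rowid_lookup_key).
//
//   QueryFragmentDescriptor descriptor(ra_exe_unit,
//                                      query_infos,
//                                      gpu_mem_infos,
//                                      gpu_input_mem_limit_percent,
//                                      allowed_outer_fragment_indices);
//   descriptor.buildFragmentKernelMap(ra_exe_unit,
//                                     frag_offsets,
//                                     device_count,
//                                     device_type,
//                                     enable_multifrag_kernels,
//                                     enable_inner_join_fragment_skipping,
//                                     executor);
//   if (enable_multifrag_kernels) {
//     descriptor.assignFragsToMultiDispatch(dispatch_kernel);
//   } else {
//     descriptor.assignFragsToKernelDispatch(dispatch_kernel, ra_exe_unit);
//   }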