OmniSciDB a5dc49c757
ResultSetSort.cpp
/*
 * Copyright 2022 HEAVY.AI, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifdef HAVE_CUDA
#include "Execute.h"
#include "ResultSet.h"
#include "ResultSetSortImpl.h"

#include "../Shared/thread_count.h"

#include <future>

std::unique_ptr<CudaMgr_Namespace::CudaMgr> g_cuda_mgr;  // for unit tests only

namespace {

void set_cuda_context(Data_Namespace::DataMgr* data_mgr, const int device_id) {
  if (data_mgr) {
    data_mgr->getCudaMgr()->setContext(device_id);
    return;
  }
  // for unit tests only
  CHECK(g_cuda_mgr);
  g_cuda_mgr->setContext(device_id);
}

}  // namespace

void ResultSet::doBaselineSort(const ExecutorDeviceType device_type,
                               const std::list<Analyzer::OrderEntry>& order_entries,
                               const size_t top_n,
                               const Executor* executor) {
  CHECK_EQ(size_t(1), order_entries.size());
  const auto& oe = order_entries.front();
  CHECK_GT(oe.tle_no, 0);
  CHECK_LE(static_cast<size_t>(oe.tle_no), targets_.size());
  // Compute the byte offset of the order-by column within each row of the
  // group-by buffer.
  size_t logical_slot_idx = 0;
  size_t physical_slot_off = 0;
  for (size_t i = 0; i < static_cast<size_t>(oe.tle_no - 1); ++i) {
    physical_slot_off += query_mem_desc_.getPaddedSlotWidthBytes(logical_slot_idx);
    logical_slot_idx =
        advance_slot(logical_slot_idx, targets_[i], separate_varlen_storage_valid_);
  }
  const auto col_off =
      get_slot_off_quad(query_mem_desc_) * sizeof(int64_t) + physical_slot_off;
  const size_t col_bytes = query_mem_desc_.getPaddedSlotWidthBytes(logical_slot_idx);
  const auto row_bytes = get_row_bytes(query_mem_desc_);
  const auto target_groupby_indices_sz = query_mem_desc_.targetGroupbyIndicesSize();
  CHECK(target_groupby_indices_sz == 0 ||
        static_cast<size_t>(oe.tle_no) <= target_groupby_indices_sz);
  const int64_t target_groupby_index{
      target_groupby_indices_sz == 0
          ? -1
          : query_mem_desc_.getTargetGroupbyIndex(oe.tle_no - 1)};
  const GroupByBufferLayoutInfo layout{query_mem_desc_.getEntryCount(),
                                       col_off,
                                       col_bytes,
                                       row_bytes,
                                       targets_[oe.tle_no - 1],
                                       target_groupby_index};
  PodOrderEntry pod_oe{oe.tle_no, oe.is_desc, oe.nulls_first};
  auto groupby_buffer = storage_->getUnderlyingBuffer();
  auto data_mgr = getDataManager();
  // One stride per GPU (or per CPU worker thread); each stride computes its own
  // top-n candidates.
  const auto step = static_cast<size_t>(
      device_type == ExecutorDeviceType::GPU ? getGpuCount() : cpu_threads());
  CHECK_GE(step, size_t(1));
  const auto key_bytewidth = query_mem_desc_.getEffectiveKeyWidth();
  if (step > 1) {
    std::vector<std::future<void>> top_futures;
    std::vector<Permutation> strided_permutations(step);
    for (size_t start = 0; start < step; ++start) {
      top_futures.emplace_back(std::async(
          std::launch::async,
          [&strided_permutations,
           data_mgr,
           device_type,
           groupby_buffer,
           pod_oe,
           key_bytewidth,
           layout,
           top_n,
           start,
           step] {
            if (device_type == ExecutorDeviceType::GPU) {
              set_cuda_context(data_mgr, start);
            }
            strided_permutations[start] = (key_bytewidth == 4)
                                              ? baseline_sort<int32_t>(device_type,
                                                                       start,
                                                                       data_mgr,
                                                                       groupby_buffer,
                                                                       pod_oe,
                                                                       layout,
                                                                       top_n,
                                                                       start,
                                                                       step)
                                              : baseline_sort<int64_t>(device_type,
                                                                       start,
                                                                       data_mgr,
                                                                       groupby_buffer,
                                                                       pod_oe,
                                                                       layout,
                                                                       top_n,
                                                                       start,
                                                                       step);
          }));
    }
    for (auto& top_future : top_futures) {
      top_future.wait();
    }
    for (auto& top_future : top_futures) {
      top_future.get();
    }
    // Concatenate the per-stride candidates, then select the global top-n.
    permutation_.reserve(strided_permutations.size() * top_n);
    for (const auto& strided_permutation : strided_permutations) {
      permutation_.insert(
          permutation_.end(), strided_permutation.begin(), strided_permutation.end());
    }
    auto pv = PermutationView(permutation_.data(), permutation_.size());
    topPermutation(pv, top_n, createComparator(order_entries, pv, executor, false));
    if (top_n < permutation_.size()) {
      permutation_.resize(top_n);
      permutation_.shrink_to_fit();
    }
    return;
  } else {
    permutation_ =
        (key_bytewidth == 4)
            ? baseline_sort<int32_t>(
                  device_type, 0, data_mgr, groupby_buffer, pod_oe, layout, top_n, 0, 1)
            : baseline_sort<int64_t>(
                  device_type, 0, data_mgr, groupby_buffer, pod_oe, layout, top_n, 0, 1);
  }
}

bool ResultSet::canUseFastBaselineSort(
    const std::list<Analyzer::OrderEntry>& order_entries,
    const size_t top_n) {
  if (order_entries.size() != 1 || query_mem_desc_.hasKeylessHash() ||
      query_mem_desc_.sortOnGpu() || query_mem_desc_.didOutputColumnar()) {
    return false;
  }
  const auto& order_entry = order_entries.front();
  CHECK_GE(order_entry.tle_no, 1);
  CHECK_LE(static_cast<size_t>(order_entry.tle_no), targets_.size());
  const auto& target_info = targets_[order_entry.tle_no - 1];
  if (!target_info.sql_type.is_number() || is_distinct_target(target_info)) {
    return false;
  }
  // Only single-column, numeric, non-distinct order-by targets over baseline hash
  // (or single-column perfect hash) group-by buffers are eligible.
  return (query_mem_desc_.getQueryDescriptionType() ==
              QueryDescriptionType::GroupByBaselineHash ||
          query_mem_desc_.isSingleColumnGroupByWithPerfectHash()) &&
         top_n;
}

Data_Namespace::DataMgr* ResultSet::getDataManager() const {
  return &Catalog_Namespace::SysCatalog::instance().getDataMgr();
}

int ResultSet::getGpuCount() const {
  const auto data_mgr = getDataManager();
  if (!data_mgr) {
    return g_cuda_mgr ? g_cuda_mgr->getDeviceCount() : 0;
  }
  return data_mgr->gpusPresent() ? data_mgr->getCudaMgr()->getDeviceCount() : 0;
}
#endif  // HAVE_CUDA
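
The listing above parallelizes a top-n sort by striding the group-by buffer across GPUs (or CPU threads), letting each stride produce its own top-n candidate permutation, and then running one final top-n selection over the concatenated candidates. Below is a minimal, self-contained sketch of that stride-partition-and-merge pattern over plain integers. The helper names topk_of_stride and topk_strided are hypothetical and used for illustration only; the real implementation goes through baseline_sort, topPermutation, and createComparator, and sorts rows of the group-by buffer rather than a vector of values.

// Sketch only: stride-partitioned top-n over toy integer values, not OmniSciDB code.
#include <algorithm>
#include <cstddef>
#include <functional>
#include <future>
#include <vector>

// Each worker visits entries start, start + step, start + 2 * step, ... and keeps
// the indices of its top-n values, mirroring one baseline_sort(start, step) call.
std::vector<size_t> topk_of_stride(const std::vector<int>& values,
                                   const size_t top_n,
                                   const size_t start,
                                   const size_t step) {
  std::vector<size_t> idx;
  for (size_t i = start; i < values.size(); i += step) {
    idx.push_back(i);
  }
  const auto cmp = [&values](const size_t l, const size_t r) {
    return values[l] > values[r];  // descending, as with is_desc = true
  };
  const size_t k = std::min(top_n, idx.size());
  std::partial_sort(idx.begin(), idx.begin() + k, idx.end(), cmp);
  idx.resize(k);
  return idx;
}

// Launch one async worker per stride, concatenate the partial results, then run a
// final top-n over the merged candidates -- the analogue of the topPermutation call
// over the concatenated permutation_ in doBaselineSort.
std::vector<size_t> topk_strided(const std::vector<int>& values,
                                 const size_t top_n,
                                 const size_t step) {
  std::vector<std::future<std::vector<size_t>>> futures;
  for (size_t start = 0; start < step; ++start) {
    futures.emplace_back(std::async(
        std::launch::async, topk_of_stride, std::cref(values), top_n, start, step));
  }
  std::vector<size_t> merged;
  for (auto& f : futures) {
    const auto part = f.get();
    merged.insert(merged.end(), part.begin(), part.end());
  }
  const auto cmp = [&values](const size_t l, const size_t r) {
    return values[l] > values[r];
  };
  const size_t k = std::min(top_n, merged.size());
  std::partial_sort(merged.begin(), merged.begin() + k, merged.end(), cmp);
  merged.resize(k);
  return merged;
}

The merge step is correct because any entry in the global top-n is necessarily in the top-n of the stride that contains it, so concatenating the per-stride candidates never discards a global winner; the final selection then only has to examine roughly step * top_n candidates instead of the whole buffer.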