OmniSciDB  a5dc49c757
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
DataMgr.cpp
Go to the documentation of this file.
1 
2 /*
3  * Copyright 2022 HEAVY.AI, Inc.
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
23 #include "DataMgr/DataMgr.h"
27 #include "CudaMgr/CudaMgr.h"
29 #include "FileMgr/GlobalFileMgr.h"
31 
32 #ifdef __APPLE__
33 #include <sys/sysctl.h>
34 #include <sys/types.h>
35 #endif
36 
37 #include <boost/container/small_vector.hpp>
38 #include <boost/filesystem.hpp>
39 
40 #include <algorithm>
41 #include <cctype>
42 #include <charconv>
43 #include <fstream>
44 #include <limits>
45 #include <numeric>
46 #include <string_view>
47 
48 extern bool g_enable_fsi;
49 
50 #ifdef ENABLE_MEMKIND
51 bool g_enable_tiered_cpu_mem{false};
52 std::string g_pmem_path{};
53 size_t g_pmem_size{0};
54 #endif
55 
57 
58 namespace Data_Namespace {
59 
60 namespace {
61 // Global pointer and function for atexit registration.
62 // Do NOT use this pointer for anything else.
63 static DataMgr* g_data_mgr_ptr = nullptr;
64 static bool at_exit_called = false;
65 } // namespace
66 
68  at_exit_called = true;
70  // safely destroy all gpu allocations explicitly to avoid unexpected
71  // `CUDA_ERROR_DEINITIALIZED` exception while trying to synchronize
72  // devices to destroy BufferMgr for GPU, i.e., 'GpuCudaBufferMgr` and `CudaMgr`
74  }
75 }
76 
77 DataMgr::DataMgr(const std::string& dataDir,
78  const SystemParameters& system_parameters,
79  std::unique_ptr<CudaMgr_Namespace::CudaMgr> cudaMgr,
80  const bool useGpus,
81  const size_t reservedGpuMem,
82  const size_t numReaderThreads,
83  const File_Namespace::DiskCacheConfig cache_config)
84  : cudaMgr_{std::move(cudaMgr)}
85  , dataDir_{dataDir}
86  , hasGpus_{false}
87  , reservedGpuMem_{reservedGpuMem} {
88  if (useGpus) {
89  if (cudaMgr_) {
90  hasGpus_ = true;
91 
92  // we register the `atExitHandler` if we create `DataMgr` having GPU
93  // to make sure we clear all allocated GPU memory when destructing this `DataMgr`
94  g_data_mgr_ptr = this;
95  std::atexit(atExitHandler);
96  } else {
97  LOG(ERROR) << "CudaMgr instance is invalid, falling back to CPU-only mode.";
98  hasGpus_ = false;
99  }
100  } else {
101  // NOTE: useGpus == false with a valid cudaMgr is a potentially valid configuration.
102  // i.e. QueryEngine can be set to cpu-only for a cuda-enabled build, but still have
103  // rendering enabled. The renderer would require a CudaMgr in this case, in addition
104  // to a GpuCudaBufferMgr for cuda-backed thrust allocations.
105  // We're still setting hasGpus_ to false in that case tho to enforce cpu-only query
106  // execution.
107  hasGpus_ = false;
108  }
109 
110  populateMgrs(system_parameters, numReaderThreads, cache_config);
111  createTopLevelMetadata();
112 }
113 
115  g_data_mgr_ptr = nullptr;
116 
117  // This duplicates atExitHandler so we still shut down in the case of a startup
118  // exception. We can request cleanup of GPU memory twice, so it's safe.
119  if (!at_exit_called && hasGpus_) {
121  }
122 
123  int numLevels = bufferMgrs_.size();
124  for (int level = numLevels - 1; level >= 0; --level) {
125  for (size_t device = 0; device < bufferMgrs_[level].size(); device++) {
126  delete bufferMgrs_[level][device];
127  }
128  }
129 }
130 
132  SystemMemoryUsage usage;
133 #ifdef __linux__
134 
135  // Determine Linux available memory and total memory.
136  // Available memory is different from free memory because
137  // when Linux sees free memory, it tries to use it for
138  // stuff like disk caching. However, the memory is not
139  // reserved and is still available to be allocated by
140  // user processes.
141  // Parsing /proc/meminfo for this info isn't very elegant
142  // but as a virtual file it should be reasonably fast.
143  // See also:
144  // https://github.com/torvalds/linux/commit/34e431b0ae398fc54ea69ff85ec700722c9da773
146  usage.free = mi["MemAvailable"];
147  usage.total = mi["MemTotal"];
148 
149  // Determine process memory in use.
150  // See also:
151  // https://stackoverflow.com/questions/669438/how-to-get-memory-usage-at-runtime-using-c
152  // http://man7.org/linux/man-pages/man5/proc.5.html
153  int64_t size = 0;
154  int64_t resident = 0;
155  int64_t shared = 0;
156 
157  std::ifstream fstatm("/proc/self/statm");
158  fstatm >> size >> resident >> shared;
159  fstatm.close();
160 
161  long page_size =
162  sysconf(_SC_PAGE_SIZE); // in case x86-64 is configured to use 2MB pages
163 
164  usage.resident = resident * page_size;
165  usage.vtotal = size * page_size;
166  usage.regular = (resident - shared) * page_size;
167  usage.shared = shared * page_size;
168 
169  ProcBuddyinfoParser bi{};
170  bi.parseBuddyinfo();
171  usage.frag = bi.getFragmentationPercent();
172  usage.avail_pages = bi.getSumAvailPages();
173  usage.high_blocks = bi.getSumHighestBlocks();
174 
175 #else
176 
177  usage.total = 0;
178  usage.free = 0;
179  usage.resident = 0;
180  usage.vtotal = 0;
181  usage.regular = 0;
182  usage.shared = 0;
183  usage.frag = 0.0;
184  usage.avail_pages = 0;
185  usage.high_blocks = 0;
186 
187 #endif
188 
189  return usage;
190 }
191 
193 #ifdef __APPLE__
194  int mib[2];
195  size_t physical_memory;
196  size_t length;
197  // Get the Physical memory size
198  mib[0] = CTL_HW;
199  mib[1] = HW_MEMSIZE;
200  length = sizeof(size_t);
201  sysctl(mib, 2, &physical_memory, &length, NULL, 0);
202  return physical_memory;
203 #elif defined(_MSC_VER)
204  MEMORYSTATUSEX status;
205  status.dwLength = sizeof(status);
206  GlobalMemoryStatusEx(&status);
207  return status.ullTotalPhys;
208 #else // Linux
209  long pages = sysconf(_SC_PHYS_PAGES);
210  long page_size = sysconf(_SC_PAGE_SIZE);
211  return pages * page_size;
212 #endif
213 }
214 
215 void DataMgr::allocateCpuBufferMgr(int32_t device_id,
216  size_t total_cpu_size,
217  size_t min_cpu_slab_size,
218  size_t max_cpu_slab_size,
219  size_t default_cpu_slab_size,
220  size_t page_size,
221  const CpuTierSizeVector& cpu_tier_sizes) {
222 #ifdef ENABLE_MEMKIND
223  if (g_enable_tiered_cpu_mem) {
224  bufferMgrs_[1].push_back(
226  total_cpu_size,
227  cudaMgr_.get(),
228  min_cpu_slab_size,
229  max_cpu_slab_size,
230  default_cpu_slab_size,
231  page_size,
232  cpu_tier_sizes,
233  bufferMgrs_[0][0]));
234  return;
235  }
236 #endif
237 
238  bufferMgrs_[1].push_back(new Buffer_Namespace::CpuBufferMgr(0,
239  total_cpu_size,
240  cudaMgr_.get(),
241  min_cpu_slab_size,
242  max_cpu_slab_size,
243  default_cpu_slab_size,
244  page_size,
245  bufferMgrs_[0][0]));
246 }
247 
248 // This function exists for testing purposes so that we can test a reset of the cache.
250  const size_t num_reader_threads,
251  const SystemParameters& sys_params) {
252  int numLevels = bufferMgrs_.size();
253  for (int level = numLevels - 1; level >= 0; --level) {
254  for (size_t device = 0; device < bufferMgrs_[level].size(); device++) {
255  delete bufferMgrs_[level][device];
256  }
257  }
258  bufferMgrs_.clear();
259  populateMgrs(sys_params, num_reader_threads, cache_config);
261 }
262 
263 namespace {
// Computes a usable slab size: the requested size is capped at the buffer pool size
// and then rounded down to a whole number of pages.
//
// initial_slab_size  requested slab size in bytes
// buffer_pool_size   size of the pool the slab lives in, in bytes
// page_size          page size in bytes; must be non-zero
// Returns the largest multiple of page_size that is <= min(initial_slab_size,
// buffer_pool_size). This is 0 when that minimum is smaller than one page.
size_t get_slab_size(size_t initial_slab_size,
                     size_t buffer_pool_size,
                     size_t page_size) {
  // A slab can never be larger than the pool that contains it.
  const size_t capped_size =
      initial_slab_size < buffer_pool_size ? initial_slab_size : buffer_pool_size;
  // Drop the trailing partial page.
  return capped_size - capped_size % page_size;
}
271 } // namespace
272 
273 void DataMgr::populateMgrs(const SystemParameters& system_parameters,
274  const size_t userSpecifiedNumReaderThreads,
275  const File_Namespace::DiskCacheConfig& cache_config) {
276  // no need for locking, as this is only called in the constructor
277  bufferMgrs_.resize(2);
278  bufferMgrs_[0].push_back(
279  new PersistentStorageMgr(dataDir_, userSpecifiedNumReaderThreads, cache_config));
280 
281  levelSizes_.push_back(1);
282  auto page_size = system_parameters.buffer_page_size;
283  CHECK_GT(page_size, size_t(0));
284  auto cpu_buffer_size = system_parameters.cpu_buffer_mem_bytes;
285  if (cpu_buffer_size == 0) { // if size is not specified
286  const auto total_system_memory = getTotalSystemMemory();
287  VLOG(1) << "Detected " << (float)total_system_memory / (1024 * 1024)
288  << "M of total system memory.";
289  cpu_buffer_size = total_system_memory *
290  0.8; // should get free memory instead of this ugly heuristic
291  }
292  auto min_cpu_slab_size =
293  get_slab_size(system_parameters.min_cpu_slab_size, cpu_buffer_size, page_size);
294  auto max_cpu_slab_size =
296  ? cpu_buffer_size
297  : get_slab_size(
298  system_parameters.max_cpu_slab_size, cpu_buffer_size, page_size);
299  auto default_cpu_slab_size =
300  get_slab_size(system_parameters.default_cpu_slab_size, cpu_buffer_size, page_size);
301  LOG(INFO) << "Min CPU Slab Size is " << float(min_cpu_slab_size) / (1024 * 1024)
302  << "MB";
303  LOG(INFO) << "Max CPU Slab Size is " << float(max_cpu_slab_size) / (1024 * 1024)
304  << "MB";
305  LOG(INFO) << "Default CPU Slab Size is " << float(default_cpu_slab_size) / (1024 * 1024)
306  << "MB";
307  LOG(INFO) << "Max memory pool size for CPU is "
308  << float(cpu_buffer_size) / (1024 * 1024) << "MB";
309 
310  size_t total_cpu_size = 0;
311 
312 #ifdef ENABLE_MEMKIND
313  CpuTierSizeVector cpu_tier_sizes(numCpuTiers, 0);
314  cpu_tier_sizes[CpuTier::DRAM] = cpuBufferSize;
315  if (g_enable_tiered_cpu_mem) {
316  cpu_tier_sizes[CpuTier::PMEM] = g_pmem_size;
317  LOG(INFO) << "Max memory pool size for PMEM is " << (float)g_pmem_size / (1024 * 1024)
318  << "MB";
319  }
320  for (auto cpu_tier_size : cpu_tier_sizes) {
321  total_cpu_size += cpu_tier_size;
322  }
323 #else
324  CpuTierSizeVector cpu_tier_sizes{};
325  total_cpu_size = cpu_buffer_size;
326 #endif
327 
328  if (hasGpus_ || cudaMgr_) {
329  LOG(INFO) << "Reserved GPU memory is " << (float)reservedGpuMem_ / (1024 * 1024)
330  << "MB includes render buffer allocation";
331  bufferMgrs_.resize(3);
333  total_cpu_size,
334  min_cpu_slab_size,
335  max_cpu_slab_size,
336  default_cpu_slab_size,
337  page_size,
338  cpu_tier_sizes);
339 
340  levelSizes_.push_back(1);
341  auto num_gpus = cudaMgr_->getDeviceCount();
342  for (int gpu_num = 0; gpu_num < num_gpus; ++gpu_num) {
343  auto gpu_max_mem_size =
344  system_parameters.gpu_buffer_mem_bytes != 0
345  ? system_parameters.gpu_buffer_mem_bytes
346  : (cudaMgr_->getDeviceProperties(gpu_num)->globalMem) - (reservedGpuMem_);
347  auto min_gpu_slab_size =
348  get_slab_size(system_parameters.min_gpu_slab_size, gpu_max_mem_size, page_size);
349  auto max_gpu_slab_size =
350  get_slab_size(system_parameters.max_gpu_slab_size, gpu_max_mem_size, page_size);
351  auto default_gpu_slab_size = get_slab_size(
352  system_parameters.default_gpu_slab_size, gpu_max_mem_size, page_size);
353  LOG(INFO) << "Min GPU Slab size for GPU " << gpu_num << " is "
354  << float(min_gpu_slab_size) / (1024 * 1024) << "MB";
355  LOG(INFO) << "Max GPU Slab size for GPU " << gpu_num << " is "
356  << float(max_gpu_slab_size) / (1024 * 1024) << "MB";
357  LOG(INFO) << "Default GPU Slab size for GPU " << gpu_num << " is "
358  << float(default_gpu_slab_size) / (1024 * 1024) << "MB";
359  LOG(INFO) << "Max memory pool size for GPU " << gpu_num << " is "
360  << float(gpu_max_mem_size) / (1024 * 1024) << "MB";
361  bufferMgrs_[2].push_back(
363  gpu_max_mem_size,
364  cudaMgr_.get(),
365  min_gpu_slab_size,
366  max_gpu_slab_size,
367  default_gpu_slab_size,
368  page_size,
369  bufferMgrs_[1][0]));
370  }
371  levelSizes_.push_back(num_gpus);
372  } else {
374  total_cpu_size,
375  min_cpu_slab_size,
376  max_cpu_slab_size,
377  default_cpu_slab_size,
378  page_size,
379  cpu_tier_sizes);
380  levelSizes_.push_back(1);
381  }
382 }
383 
384 void DataMgr::convertDB(const std::string basePath) {
385  // no need for locking, as this is only called in the constructor
386 
387  /* check that the data directory exists and it's empty */
388  std::string mapdDataPath(basePath + "/../" + shared::kDataDirectoryName + "/");
389  boost::filesystem::path path(mapdDataPath);
390  if (boost::filesystem::exists(path)) {
391  if (!boost::filesystem::is_directory(path)) {
392  LOG(FATAL) << "Path to directory \"" + shared::kDataDirectoryName +
393  "\" to convert DB is not a directory.";
394  }
395  } else { // data directory does not exist
396  LOG(FATAL) << "Path to directory \"" + shared::kDataDirectoryName +
397  "\" to convert DB does not exist.";
398  }
399 
400  File_Namespace::GlobalFileMgr* gfm{nullptr};
401  gfm = dynamic_cast<PersistentStorageMgr*>(bufferMgrs_[0][0])->getGlobalFileMgr();
402  CHECK(gfm);
403 
404  LOG(INFO) << "Database conversion started.";
405  // this call also copies data into new DB structure
406  File_Namespace::FileMgr* fm_base_db = new File_Namespace::FileMgr(gfm, basePath);
407  delete fm_base_db;
408 
409  /* write content of DB into newly created/converted DB structure & location */
410  checkpoint(); // outputs data files as well as metadata files
411  LOG(INFO) << "Database conversion completed.";
412 }
413 
415  const { // create metadata shared by all tables of all DBs
416  ChunkKey chunkKey(2);
417  chunkKey[0] = 0; // top level db_id
418  chunkKey[1] = 0; // top level tb_id
419 
420  File_Namespace::GlobalFileMgr* gfm{nullptr};
421  gfm = dynamic_cast<PersistentStorageMgr*>(bufferMgrs_[0][0])->getGlobalFileMgr();
422  CHECK(gfm);
423 
424  auto fm_top = gfm->getFileMgr(chunkKey);
425  if (auto fm = dynamic_cast<File_Namespace::FileMgr*>(fm_top)) {
426  fm->createOrMigrateTopLevelMetadata();
427  }
428 }
429 
430 std::vector<MemoryInfo> DataMgr::getMemoryInfo(const MemoryLevel mem_level) const {
431  std::lock_guard<std::mutex> buffer_lock(buffer_access_mutex_);
432  return getMemoryInfoUnlocked(mem_level);
433 }
434 
435 std::vector<MemoryInfo> DataMgr::getMemoryInfoUnlocked(
436  const MemoryLevel mem_level) const {
437  std::vector<MemoryInfo> mem_info;
438  if (mem_level == MemoryLevel::CPU_LEVEL) {
439  Buffer_Namespace::CpuBufferMgr* cpu_buffer =
440  dynamic_cast<Buffer_Namespace::CpuBufferMgr*>(
442  CHECK(cpu_buffer);
443  MemoryInfo mi;
444 
445  mi.pageSize = cpu_buffer->getPageSize();
446  mi.maxNumPages = cpu_buffer->getMaxSize() / mi.pageSize;
447  mi.isAllocationCapped = cpu_buffer->isAllocationCapped();
448  mi.numPageAllocated = cpu_buffer->getAllocated() / mi.pageSize;
449 
450  const auto& slab_segments = cpu_buffer->getSlabSegments();
451  for (size_t slab_num = 0; slab_num < slab_segments.size(); ++slab_num) {
452  for (auto const& segment : slab_segments[slab_num]) {
453  MemoryData md;
454  md.slabNum = slab_num;
455  md.startPage = segment.start_page;
456  md.numPages = segment.num_pages;
457  md.touch = segment.last_touched;
458  md.memStatus = segment.mem_status;
459  md.chunk_key.insert(
460  md.chunk_key.end(), segment.chunk_key.begin(), segment.chunk_key.end());
461  mi.nodeMemoryData.push_back(md);
462  }
463  }
464  mem_info.push_back(mi);
465  } else if (hasGpus_) {
466  int numGpus = cudaMgr_->getDeviceCount();
467  for (int gpuNum = 0; gpuNum < numGpus; ++gpuNum) {
469  dynamic_cast<Buffer_Namespace::GpuCudaBufferMgr*>(
471  CHECK(gpu_buffer);
472  MemoryInfo mi;
473 
474  mi.pageSize = gpu_buffer->getPageSize();
475  mi.maxNumPages = gpu_buffer->getMaxSize() / mi.pageSize;
476  mi.isAllocationCapped = gpu_buffer->isAllocationCapped();
477  mi.numPageAllocated = gpu_buffer->getAllocated() / mi.pageSize;
478 
479  const auto& slab_segments = gpu_buffer->getSlabSegments();
480  for (size_t slab_num = 0; slab_num < slab_segments.size(); ++slab_num) {
481  for (auto const& segment : slab_segments[slab_num]) {
482  MemoryData md;
483  md.slabNum = slab_num;
484  md.startPage = segment.start_page;
485  md.numPages = segment.num_pages;
486  md.touch = segment.last_touched;
487  md.chunk_key.insert(
488  md.chunk_key.end(), segment.chunk_key.begin(), segment.chunk_key.end());
489  md.memStatus = segment.mem_status;
490  mi.nodeMemoryData.push_back(md);
491  }
492  }
493  mem_info.push_back(mi);
494  }
495  }
496  return mem_info;
497 }
498 
499 std::string DataMgr::dumpLevel(const MemoryLevel memLevel) {
500  std::lock_guard<std::mutex> buffer_lock(buffer_access_mutex_);
501 
502  // if gpu we need to iterate through all the buffermanagers for each card
503  if (memLevel == MemoryLevel::GPU_LEVEL) {
504  int numGpus = cudaMgr_->getDeviceCount();
505  std::ostringstream tss;
506  for (int gpuNum = 0; gpuNum < numGpus; ++gpuNum) {
507  tss << bufferMgrs_[memLevel][gpuNum]->printSlabs();
508  }
509  return tss.str();
510  } else {
511  return bufferMgrs_[memLevel][0]->printSlabs();
512  }
513 }
514 
515 void DataMgr::clearMemory(const MemoryLevel memLevel) {
516  std::lock_guard<std::mutex> buffer_lock(buffer_access_mutex_);
517 
518  // if gpu we need to iterate through all the buffermanagers for each card
519  if (memLevel == MemoryLevel::GPU_LEVEL) {
520  if (cudaMgr_) {
521  int numGpus = cudaMgr_->getDeviceCount();
522  for (int gpuNum = 0; gpuNum < numGpus; ++gpuNum) {
523  auto buffer_mgr_for_gpu =
524  dynamic_cast<Buffer_Namespace::BufferMgr*>(bufferMgrs_[memLevel][gpuNum]);
525  CHECK(buffer_mgr_for_gpu);
526  buffer_mgr_for_gpu->clearSlabs();
527  }
528  } else {
529  LOG(WARNING) << "Unable to clear GPU memory: No GPUs detected";
530  }
531  } else {
532  auto buffer_mgr_for_cpu =
533  dynamic_cast<Buffer_Namespace::BufferMgr*>(bufferMgrs_[memLevel][0]);
534  CHECK(buffer_mgr_for_cpu);
535  buffer_mgr_for_cpu->clearSlabs();
536  }
537 }
538 
540  const MemoryLevel memLevel,
541  const int deviceId) {
542  std::lock_guard<std::mutex> buffer_lock(buffer_access_mutex_);
543  return bufferMgrs_[memLevel][deviceId]->isBufferOnDevice(key);
544 }
545 
547  const ChunkKey& keyPrefix) {
548  std::lock_guard<std::mutex> buffer_lock(buffer_access_mutex_);
549  bufferMgrs_[0][0]->getChunkMetadataVecForKeyPrefix(chunkMetadataVec, keyPrefix);
550 }
551 
553  const MemoryLevel memoryLevel,
554  const int deviceId,
555  const size_t page_size) {
556  std::lock_guard<std::mutex> buffer_lock(buffer_access_mutex_);
557  int level = static_cast<int>(memoryLevel);
558  return bufferMgrs_[level][deviceId]->createBuffer(key, page_size);
559 }
560 
562  const MemoryLevel memoryLevel,
563  const int deviceId,
564  const size_t numBytes) {
565  std::lock_guard<std::mutex> buffer_lock(buffer_access_mutex_);
566  const auto level = static_cast<size_t>(memoryLevel);
567  CHECK_LT(level, levelSizes_.size()); // make sure we have a legit buffermgr
568  CHECK_LT(deviceId, levelSizes_[level]); // make sure we have a legit buffermgr
569  return bufferMgrs_[level][deviceId]->getBuffer(key, numBytes);
570 }
571 
573  std::lock_guard<std::mutex> buffer_lock(buffer_access_mutex_);
574 
575  int numLevels = bufferMgrs_.size();
576  for (int level = numLevels - 1; level >= 0; --level) {
577  for (int device = 0; device < levelSizes_[level]; ++device) {
578  bufferMgrs_[level][device]->deleteBuffersWithPrefix(keyPrefix);
579  }
580  }
581 }
582 
583 // only deletes the chunks at the given memory level
585  const MemoryLevel memLevel) {
586  std::lock_guard<std::mutex> buffer_lock(buffer_access_mutex_);
587 
588  if (bufferMgrs_.size() <= memLevel) {
589  return;
590  }
591  for (int device = 0; device < levelSizes_[memLevel]; ++device) {
592  bufferMgrs_[memLevel][device]->deleteBuffersWithPrefix(keyPrefix);
593  }
594 }
595 
596 // only deletes the chunks at the given memory level
598  const MemoryLevel memLevel,
599  const int device_id) {
600  std::lock_guard<std::mutex> buffer_lock(buffer_access_mutex_);
601  CHECK_LT(memLevel, bufferMgrs_.size());
602  bufferMgrs_[memLevel][device_id]->deleteBuffer(key);
603 }
604 
606  const int deviceId,
607  const size_t numBytes) {
608  std::lock_guard<std::mutex> buffer_lock(buffer_access_mutex_);
609  const auto level = static_cast<int>(memoryLevel);
610  CHECK_LT(deviceId, levelSizes_[level]);
611  return bufferMgrs_[level][deviceId]->alloc(numBytes);
612 }
613 
615  std::lock_guard<std::mutex> buffer_lock(buffer_access_mutex_);
616  int level = static_cast<int>(buffer->getType());
617  bufferMgrs_[level][buffer->getDeviceId()]->free(buffer);
618 }
619 
620 void DataMgr::copy(AbstractBuffer* destBuffer, AbstractBuffer* srcBuffer) {
621  destBuffer->write(srcBuffer->getMemoryPtr(),
622  srcBuffer->size(),
623  0,
624  srcBuffer->getType(),
625  srcBuffer->getDeviceId());
626 }
627 
628 // could add function below to do arbitrary copies between buffers
629 
630 // void DataMgr::copy(AbstractBuffer *destBuffer, const AbstractBuffer *srcBuffer, const
631 // size_t numBytes, const size_t destOffset, const size_t srcOffset) {
632 //} /
633 
634 void DataMgr::checkpoint(const int db_id, const int tb_id) {
635  // TODO(adb): do we need a buffer mgr lock here?
636  // MAT Yes to reduce Parallel Executor TSAN issues (and correctness for now)
637  std::lock_guard<std::mutex> buffer_lock(buffer_access_mutex_);
638  for (auto levelIt = bufferMgrs_.rbegin(); levelIt != bufferMgrs_.rend(); ++levelIt) {
639  // use reverse iterator so we start at GPU level, then CPU then DISK
640  for (auto deviceIt = levelIt->begin(); deviceIt != levelIt->end(); ++deviceIt) {
641  (*deviceIt)->checkpoint(db_id, tb_id);
642  }
643  }
644 }
645 
646 void DataMgr::checkpoint(const int db_id,
647  const int table_id,
648  const MemoryLevel memory_level) {
649  std::lock_guard<std::mutex> buffer_lock(buffer_access_mutex_);
650  CHECK_LT(static_cast<size_t>(memory_level), bufferMgrs_.size());
651  CHECK_LT(static_cast<size_t>(memory_level), levelSizes_.size());
652  for (int device_id = 0; device_id < levelSizes_[memory_level]; device_id++) {
653  bufferMgrs_[memory_level][device_id]->checkpoint(db_id, table_id);
654  }
655 }
656 
658  // TODO(adb): SAA
659  // MAT Yes to reduce Parallel Executor TSAN issues (and correctness for now)
660  std::lock_guard<std::mutex> buffer_lock(buffer_access_mutex_);
661  for (auto levelIt = bufferMgrs_.rbegin(); levelIt != bufferMgrs_.rend(); ++levelIt) {
662  // use reverse iterator so we start at GPU level, then CPU then DISK
663  for (auto deviceIt = levelIt->begin(); deviceIt != levelIt->end(); ++deviceIt) {
664  (*deviceIt)->checkpoint();
665  }
666  }
667 }
668 
669 void DataMgr::removeTableRelatedDS(const int db_id, const int tb_id) {
670  std::lock_guard<std::mutex> buffer_lock(buffer_access_mutex_);
671  bufferMgrs_[0][0]->removeTableRelatedDS(db_id, tb_id);
672 }
673 
674 void DataMgr::removeMutableTableDiskCacheData(const int db_id, const int tb_id) const {
676 }
677 
678 void DataMgr::setTableEpoch(const int db_id, const int tb_id, const int start_epoch) {
679  File_Namespace::GlobalFileMgr* gfm{nullptr};
680  gfm = dynamic_cast<PersistentStorageMgr*>(bufferMgrs_[0][0])->getGlobalFileMgr();
681  CHECK(gfm);
682  gfm->setTableEpoch(db_id, tb_id, start_epoch);
683 }
684 
685 size_t DataMgr::getTableEpoch(const int db_id, const int tb_id) {
686  File_Namespace::GlobalFileMgr* gfm{nullptr};
687  gfm = dynamic_cast<PersistentStorageMgr*>(bufferMgrs_[0][0])->getGlobalFileMgr();
688  CHECK(gfm);
689  return gfm->getTableEpoch(db_id, tb_id);
690 }
691 
692 void DataMgr::resetTableEpochFloor(const int32_t db_id, const int32_t tb_id) {
693  File_Namespace::GlobalFileMgr* gfm{nullptr};
694  gfm = dynamic_cast<PersistentStorageMgr*>(bufferMgrs_[0][0])->getGlobalFileMgr();
695  CHECK(gfm);
696  gfm->resetTableEpochFloor(db_id, tb_id);
697 }
698 
700  File_Namespace::GlobalFileMgr* global_file_mgr{nullptr};
701  global_file_mgr =
702  dynamic_cast<PersistentStorageMgr*>(bufferMgrs_[0][0])->getGlobalFileMgr();
703  CHECK(global_file_mgr);
704  return global_file_mgr;
705 }
706 
707 std::shared_ptr<ForeignStorageInterface> DataMgr::getForeignStorageInterface() const {
708  return dynamic_cast<PersistentStorageMgr*>(bufferMgrs_[0][0])
710 }
711 
712 std::ostream& operator<<(std::ostream& os, const DataMgr::SystemMemoryUsage& mem_info) {
713  os << "jsonlog ";
714  os << "{";
715  os << " \"name\": \"CPU Memory Info\",";
716  os << " \"TotalMB\": " << mem_info.total / (1024. * 1024.) << ",";
717  os << " \"FreeMB\": " << mem_info.free / (1024. * 1024.) << ",";
718  os << " \"ProcessMB\": " << mem_info.resident / (1024. * 1024.) << ",";
719  os << " \"VirtualMB\": " << mem_info.vtotal / (1024. * 1024.) << ",";
720  os << " \"ProcessPlusSwapMB\": " << mem_info.regular / (1024. * 1024.) << ",";
721  os << " \"ProcessSharedMB\": " << mem_info.shared / (1024. * 1024.) << ",";
722  os << " \"FragmentationPercent\": " << mem_info.frag;
723  os << ", \"BuddyinfoHighBlocks\": " << mem_info.high_blocks;
724  os << ", \"BuddyinfoAvailPages\": " << mem_info.avail_pages;
725  os << " }";
726  return os;
727 }
728 
730  return dynamic_cast<PersistentStorageMgr*>(bufferMgrs_[MemoryLevel::DISK_LEVEL][0]);
731 }
732 
734  return getCpuBufferMgr()->getMaxSize();
735 }
736 
737 // following gets total size of all gpu buffer pools
739  if (bufferMgrs_.size() <= MemoryLevel::GPU_LEVEL) {
740  return static_cast<size_t>(0);
741  }
742  size_t total_gpu_buffer_pools_size{0};
743  for (auto const gpu_buffer_mgr : bufferMgrs_[MemoryLevel::GPU_LEVEL]) {
744  total_gpu_buffer_pools_size +=
745  dynamic_cast<Buffer_Namespace::GpuCudaBufferMgr*>(gpu_buffer_mgr)->getMaxSize();
746  }
747  return total_gpu_buffer_pools_size;
748 }
749 
752  return dynamic_cast<Buffer_Namespace::CpuBufferMgr*>(
754 }
755 
757  if (bufferMgrs_.size() > MemoryLevel::GPU_LEVEL) {
758  CHECK_GT(bufferMgrs_[MemoryLevel::GPU_LEVEL].size(), static_cast<size_t>(device_id));
759  return dynamic_cast<Buffer_Namespace::GpuCudaBufferMgr*>(
760  bufferMgrs_[MemoryLevel::GPU_LEVEL][device_id]);
761  } else {
762  return nullptr;
763  }
764 }
765 
766 namespace {
767 constexpr unsigned kMaxBuddyinfoBlocks = 32;
768 constexpr unsigned kMaxBuddyinfoTokens = kMaxBuddyinfoBlocks + 4;
769 constexpr double kErrorCodeUnableToOpenFile = -1.0;
770 constexpr double kErrorCodeOutOfMemory = -2.0;
771 template <typename T, std::size_t N>
772 using small_vector = boost::container::small_vector<T, N>;
773 
776 
// Fold step used to total the pages in BuddyinfoBlocks with Horner's method: when the
// block counts are visited from the highest order down to order 0, doubling the running
// sum before adding each count ends up weighting the count at order k by 2^k.
struct Horner {
  size_t operator()(size_t sum, size_t blocks) const {
    const size_t doubled = sum + sum;
    return doubled + blocks;
  }
};
781 
782  BuddyinfoBlocks() = default;
783 
784  // Set blocks from array of string_view tokens.
785  BuddyinfoBlocks(std::string_view const* const tokens, size_t const num_blocks) {
786  for (size_t i = 0; i < num_blocks; ++i) {
787  size_t block;
788  std::from_chars(tokens[i].data(), tokens[i].data() + tokens[i].size(), block);
789  blocks.push_back(block);
790  }
791  }
792 
793  void addBlocks(BuddyinfoBlocks const& rhs) {
794  if (blocks.size() < rhs.blocks.size()) {
795  blocks.resize(rhs.blocks.size(), 0u);
796  }
797  for (size_t i = 0; i < rhs.blocks.size(); ++i) {
798  blocks[i] += rhs.blocks[i];
799  }
800  }
801 
802  double fragPercent() const {
803  if (blocks.size() < 2u) {
804  return 0.0; // No fragmentation is possible with only one block column.
805  }
806  size_t scaled = 0;
807  size_t total = 0;
808  for (size_t order = 0; order < blocks.size(); ++order) {
809  size_t const pages = blocks[order] << order;
810  scaled += pages * (blocks.size() - 1 - order) / (blocks.size() - 1);
811  total += pages;
812  }
813  return total ? scaled * 100.0 / total : kErrorCodeOutOfMemory;
814  }
815 
816  size_t highestBlock() const { return blocks.empty() ? 0 : blocks.back(); }
817 
818  size_t sumAvailPages() const {
819  return std::accumulate(blocks.rbegin(), blocks.rend(), size_t(0), Horner{});
820  }
821 };
822 
823 // Split line on spaces into string_views.
826  size_t start = 0;
827  while (start < str.size()) {
828  // Find the start of the next token
829  start = str.find_first_not_of(' ', start);
830  // Check if we're at the end
831  if (start == std::string_view::npos) {
832  break;
833  }
834  // Find the end of the token. std::string_view::npos is ok.
835  size_t end = str.find(' ', start);
836  tokens.push_back(str.substr(start, end - start)); // Add the token to our list
837  start = end; // Set up for the next token
838  }
839  return tokens;
840 }
841 
842 } // namespace
843 
844 // Each row of /proc/buddyinfo is parsed into a BuddyinfoBlocks struct,
845 // from which the member variables are calculated.
847  std::ifstream file("/proc/buddyinfo");
848  if (!file.is_open()) {
851  return;
852  }
853 
854  constexpr unsigned max_line_size = 256;
855  char line[max_line_size];
856 
857  BuddyinfoBlocks frag; // Used to calculate frag_percent_.
858 
859  // Example: line = "Node 0, zone Normal 1 2 3 4 5 6 7 8 9 10 11"
860  // No CHECKs are done, and no exceptions are thrown. The worst that can happen is
861  // bad logs, which is not worth crashing the server or showing an error to the user.
862  while (file.getline(line, max_line_size)) {
863  auto tokens = tokenize(line); // Split on spaces.
864  // Sanity check on tokens.size() and known tokens.
865  if (5u <= tokens.size() && tokens[0] == "Node" && tokens[2] == "zone") {
866  BuddyinfoBlocks row(tokens.data() + 4, tokens.size() - 4);
867 
868  // Calculate member variables
869  frag.addBlocks(row);
870  if (tokens[3].substr(0, 3) != "DMA") {
871  sum_avail_pages_ += row.sumAvailPages();
872  sum_highest_blocks_ += row.highestBlock();
873  }
874  }
875  }
876  frag_percent_ = frag.fragPercent();
877 }
878 
879 } // namespace Data_Namespace
size_t getAllocated() override
Definition: BufferMgr.cpp:520
std::mutex buffer_access_mutex_
Definition: DataMgr.h:238
size_t default_gpu_slab_size
std::vector< int > ChunkKey
Definition: types.h:36
size_t g_pmem_size
const std::string kDataDirectoryName
std::vector< MemoryData > nodeMemoryData
Definition: DataMgr.h:75
Buffer_Namespace::MemStatus memStatus
Definition: DataMgr.h:67
void deleteChunk(const ChunkKey &key, const MemoryLevel mem_level, const int device_id)
Definition: DataMgr.cpp:597
size_t getMaxSize() override
Definition: BufferMgr.cpp:515
std::vector< std::vector< AbstractBufferMgr * > > bufferMgrs_
Definition: DataMgr.h:233
This file includes the class specification for the FILE manager (FileMgr), and related data structure...
size_t get_slab_size(size_t initial_slab_size, size_t buffer_pool_size, size_t page_size)
Definition: DataMgr.cpp:264
std::vector< int > levelSizes_
Definition: DataMgr.h:182
std::ostream & operator<<(std::ostream &os, const DataMgr::SystemMemoryUsage &mem_info)
Definition: DataMgr.cpp:712
#define LOG(tag)
Definition: Logger.h:285
SystemMemoryUsage getSystemMemoryUsage() const
Definition: DataMgr.cpp:131
PersistentStorageMgr * getPersistentStorageMgr() const
Definition: DataMgr.cpp:729
virtual int8_t * getMemoryPtr()=0
virtual MemoryLevel getType() const =0
void clearMemory(const MemoryLevel memLevel)
Definition: DataMgr.cpp:515
bool g_use_cpu_mem_pool_size_for_max_cpu_slab_size
Definition: DataMgr.cpp:56
std::vector< MemoryInfo > getMemoryInfoUnlocked(const MemoryLevel memLevel) const
Definition: DataMgr.cpp:435
void resetTableEpochFloor(const int32_t db_id, const int32_t tb_id)
Definition: DataMgr.cpp:692
std::string dumpLevel(const MemoryLevel memLevel)
Definition: DataMgr.cpp:499
size_t getCpuBufferPoolSize() const
Definition: DataMgr.cpp:733
void convertDB(const std::string basePath)
Definition: DataMgr.cpp:384
#define CHECK_GT(x, y)
Definition: Logger.h:305
size_t getGpuBufferPoolSize() const
Definition: DataMgr.cpp:738
constexpr size_t numCpuTiers
void allocateCpuBufferMgr(int32_t device_id, size_t total_cpu_size, size_t min_cpu_slab_size, size_t max_cpu_slab_size, size_t default_cpu_slab_size, size_t page_size, const std::vector< size_t > &cpu_tier_sizes)
Definition: DataMgr.cpp:215
static size_t getTotalSystemMemory()
Definition: DataMgr.cpp:192
Note(s): Forbid Copying Idiom 4.1.
Definition: BufferMgr.h:96
size_t getTableEpoch(const int db_id, const int tb_id)
Definition: DataMgr.cpp:685
Buffer_Namespace::GpuCudaBufferMgr * getGpuBufferMgr(int32_t device_id) const
Definition: DataMgr.cpp:756
std::shared_ptr< ForeignStorageInterface > getForeignStorageInterface() const
Definition: DataMgr.cpp:707
void createTopLevelMetadata() const
Definition: DataMgr.cpp:414
small_vector< size_t, kMaxBuddyinfoBlocks > blocks
Definition: DataMgr.cpp:775
bool isAllocationCapped() override
Definition: BufferMgr.cpp:525
static void atExitHandler()
Definition: DataMgr.cpp:67
void removeMutableTableDiskCacheData(const int db_id, const int tb_id) const
Definition: DataMgr.cpp:674
std::string g_pmem_path
std::unique_ptr< CudaMgr_Namespace::CudaMgr > cudaMgr_
Definition: DataMgr.h:234
void getChunkMetadataVecForKeyPrefix(ChunkMetadataVector &chunkMetadataVec, const ChunkKey &keyPrefix)
Definition: DataMgr.cpp:546
void populateMgrs(const SystemParameters &system_parameters, const size_t userSpecifiedNumReaderThreads, const File_Namespace::DiskCacheConfig &cache_config)
Definition: DataMgr.cpp:273
DEVICE auto accumulate(ARGS &&...args)
Definition: gpu_enabled.h:42
std::vector< std::pair< ChunkKey, std::shared_ptr< ChunkMetadata >>> ChunkMetadataVector
An AbstractBuffer is a unit of data management for a data manager.
virtual void write(int8_t *src, const size_t num_bytes, const size_t offset=0, const MemoryLevel src_buffer_type=CPU_LEVEL, const int src_device_id=-1)=0
boost::container::small_vector< T, N > small_vector
Definition: DataMgr.cpp:772
small_vector< std::string_view, kMaxBuddyinfoTokens > tokenize(std::string_view const str)
Definition: DataMgr.cpp:824
File_Namespace::GlobalFileMgr * getGlobalFileMgr() const
Definition: DataMgr.cpp:699
Parse /proc/meminfo into key/value pairs.
Definition: DataMgr.h:79
#define CHECK_LT(x, y)
Definition: Logger.h:303
void deleteChunksWithPrefix(const ChunkKey &keyPrefix)
Definition: DataMgr.cpp:572
BuddyinfoBlocks(std::string_view const *const tokens, size_t const num_blocks)
Definition: DataMgr.cpp:785
tuple line
Definition: parse_ast.py:10
const std::vector< BufferList > & getSlabSegments()
Definition: BufferMgr.cpp:931
bool isBufferOnDevice(const ChunkKey &key, const MemoryLevel memLevel, const int deviceId)
Definition: DataMgr.cpp:539
AbstractBuffer * getChunkBuffer(const ChunkKey &key, const MemoryLevel memoryLevel, const int deviceId=0, const size_t numBytes=0)
Definition: DataMgr.cpp:561
std::vector< MemoryInfo > getMemoryInfo(const MemoryLevel memLevel) const
Definition: DataMgr.cpp:430
void removeTableRelatedDS(const int db_id, const int tb_id)
Definition: DataMgr.cpp:669
DataMgr(const std::string &dataDir, const SystemParameters &system_parameters, std::unique_ptr< CudaMgr_Namespace::CudaMgr > cudaMgr, const bool useGpus, const size_t reservedGpuMem=(1<< 27), const size_t numReaderThreads=0, const File_Namespace::DiskCacheConfig cacheConfig=File_Namespace::DiskCacheConfig())
Definition: DataMgr.cpp:77
Buffer_Namespace::CpuBufferMgr * getCpuBufferMgr() const
Definition: DataMgr.cpp:750
#define CHECK(condition)
Definition: Logger.h:291
void copy(AbstractBuffer *destBuffer, AbstractBuffer *srcBuffer)
Definition: DataMgr.cpp:620
void removeMutableTableCacheData(const int db_id, const int table_id) const
void resetBufferMgrs(const File_Namespace::DiskCacheConfig &cache_config, const size_t num_reader_threads, const SystemParameters &sys_params)
Definition: DataMgr.cpp:249
std::vector< int32_t > chunk_key
Definition: DataMgr.h:66
AbstractBuffer * createChunkBuffer(const ChunkKey &key, const MemoryLevel memoryLevel, const int deviceId=0, const size_t page_size=0)
Definition: DataMgr.cpp:552
Allocate GPU memory using GpuBuffers via DataMgr.
void free(AbstractBuffer *buffer)
Definition: DataMgr.cpp:614
bool g_enable_fsi
Definition: Catalog.cpp:96
std::vector< size_t > CpuTierSizeVector
#define VLOG(n)
Definition: Logger.h:388
Parse /proc/buddyinfo into a few fragmentation-related data.
Definition: DataMgr.h:112
size_t default_cpu_slab_size
void setTableEpoch(const int db_id, const int tb_id, const int start_epoch)
Definition: DataMgr.cpp:678
AbstractBuffer * alloc(const MemoryLevel memoryLevel, const int deviceId, const size_t numBytes)
Definition: DataMgr.cpp:605
std::string dataDir_
Definition: DataMgr.h:235