OmniSciDB  a5dc49c757
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
ForeignStorageCache.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 /*
18  TODO(Misiu): A lot of methods here can be made asynchronous. It may be worth an
19  investigation to determine if it's worth adding async versions of them for performance
20  reasons.
21 */
22 
23 #include "ForeignStorageCache.h"
24 #include <boost/filesystem.hpp>
25 #include "Shared/File.h"
26 #include "Shared/measure.h"
27 
28 namespace foreign_storage {
29 namespace {
// Calls func on every element of chunk_collection that lies in the range
// [lower_bound(chunk_prefix), upper_bound(chunk_prefix + {INT_MAX})).
// chunk_collection must offer lower_bound()/upper_bound() lookups that accept a ChunkKey.
// NOTE(review): listing line 31 (function name and first parameter) is missing from this
// extract; the cross-reference index at the end of the page gives the signature as
// iterate_over_matching_prefix(Func func, T& chunk_collection, const ChunkKey& chunk_prefix).
30 template <typename Func, typename T>
32  T& chunk_collection,
33  const ChunkKey& chunk_prefix) {
34  ChunkKey upper_prefix(chunk_prefix);
   // Extend the prefix with the largest int to form the key used for the upper bound.
35  upper_prefix.push_back(std::numeric_limits<int>::max());
   // NOTE(review): the static_cast to a non-reference type makes a temporary copy of
   // upper_prefix for the lookup.
36  auto end_it = chunk_collection.upper_bound(static_cast<const ChunkKey>(upper_prefix));
37  for (auto chunk_it = chunk_collection.lower_bound(chunk_prefix); chunk_it != end_it;
38  ++chunk_it) {
39  func(*chunk_it);
40  }
41 }
42 
// Applies the values held in meta to buffer: initializes the encoder from meta->sqlType,
// sets the buffer size to meta->numBytes, copies the element count and chunk stats into
// the encoder, and finally flags the buffer as updated.
// NOTE(review): listing line 43 (the declaration) is missing from this extract; the
// cross-reference index at the end of the page lists it as
// set_metadata_for_buffer(AbstractBuffer* buffer, ChunkMetadata* meta).
44  buffer->initEncoder(meta->sqlType);
45  buffer->setSize(meta->numBytes);
46  buffer->getEncoder()->setNumElems(meta->numElements);
47  buffer->getEncoder()->resetChunkStats(meta->chunkStats);
48  buffer->setUpdated();
49 }
50 } // namespace
51 
// Constructor body: validates (and if needed creates) the cache directory at config.path,
// then builds the CachingFileMgr that backs this cache from the same config.
// NOTE(review): listing line 52 (the declaration) is missing from this extract; the
// cross-reference index lists ForeignStorageCache(const File_Namespace::DiskCacheConfig& config).
53  validatePath(config.path);
54  caching_file_mgr_ = std::make_unique<File_Namespace::CachingFileMgr>(config);
55 }
56 
// Forwards to CachingFileMgr::deleteBufferIfExists for chunk_key.
// NOTE(review): listing line 57 (the declaration) is missing from this extract; presumably
// this is ForeignStorageCache::deleteBufferIfExists(const ChunkKey& chunk_key) - confirm.
58  caching_file_mgr_->deleteBufferIfExists(chunk_key);
59 }
60 
// Hands buf (num_bytes of it) to the CachingFileMgr under key, then asserts via CHECK that
// the source buffer is no longer marked dirty.
// NOTE(review): listing line 61 (first line of the declaration) is missing from this
// extract; the cross-reference index lists
// putBuffer(const ChunkKey&, AbstractBuffer*, const size_t numBytes = 0).
62  AbstractBuffer* buf,
63  const size_t num_bytes) {
64  caching_file_mgr_->putBuffer(key, buf, num_bytes);
65  CHECK(!buf->isDirty());
66 }
67 
// Checkpoints the cache for one table by delegating to CachingFileMgr::checkpoint.
// db_id / tb_id: database and table identifiers passed straight through.
68 void ForeignStorageCache::checkpoint(const int32_t db_id, const int32_t tb_id) {
69  caching_file_mgr_->checkpoint(db_id, tb_id);
70 }
71 
// Returns the cached buffer for chunk_key, or nullptr when the chunk is not considered
// cached. A buffer counts as cached if it has data pages, or if it is an empty (size 0)
// varlen data buffer whose matching index buffer has data pages.
// NOTE(review): listing line 72 (first line of the declaration) is missing from this
// extract; the cross-reference index lists
// File_Namespace::FileBuffer* getCachedChunkIfExists(const ChunkKey&).
73  const ChunkKey& chunk_key) {
74  auto buf = caching_file_mgr_->getBufferIfExists(chunk_key);
75 
76  if (buf) {
77  if ((*buf)->hasDataPages()) {
78  // 1. If the buffer has data pages then must be in the cache.
79  return *buf;
80  }
81  if (is_varlen_data_key(chunk_key)) {
82  // 2. If the buffer is a varlen data buffer and the
83  // corresponding chunk contains only nulls, then even
84  // without data pages it will still have been cached
85  // if it has a corresponding index buffer which does
86  // have data pages
87  // Note the empty buffer proviso that the size be 0,
88  // corresponding to all nulls in the chunk
89  auto index_chunk_key = chunk_key;
   // Presumably 2 identifies the index buffer of a varlen chunk pair - TODO confirm.
90  index_chunk_key[CHUNK_KEY_VARLEN_IDX] = 2;
91  auto index_buffer = caching_file_mgr_->getBufferIfExists(index_chunk_key);
92  if (index_buffer && (*index_buffer)->hasDataPages() && (*buf)->size() == 0) {
93  return *buf;
94  }
95  }
96  }
97  // 3. Otherwise this chunk hasn't been cached.
98  return nullptr;
99 }
100 
// Reports whether metadata is cached for chunk_key: true only when a buffer exists for
// the key and that buffer has an encoder; false when no buffer exists.
101 bool ForeignStorageCache::isMetadataCached(const ChunkKey& chunk_key) const {
102  auto buf = caching_file_mgr_->getBufferIfExists(chunk_key);
103  if (buf) {
104  return (*buf)->hasEncoder();
105  }
106  return false;
107 }
108 
// Stores the metadata of every (chunk_key, metadata) pair in metadata_vec into the cache
// and checkpoints the table of the first key afterwards. An empty vector is a no-op.
// All keys must belong to the same table as the first key (enforced with CHECK).
// For each key the buffer is created if it is not on device, otherwise fetched; when the
// buffer's current encoder metadata already equals the new metadata nothing is changed,
// otherwise the metadata is overwritten and eraseChunk() is called for the key (and for
// the paired index key of a varlen chunk).
// NOTE(review): listing line 109 (the declaration) is missing from this extract; the
// cross-reference index lists cacheMetadataVec(const ChunkMetadataVector&).
110  auto timer = DEBUG_TIMER(__func__);
111  if (metadata_vec.empty()) {
112  return;
113  }
114  auto first_chunk_key = metadata_vec.begin()->first;
115  for (auto& [chunk_key, metadata] : metadata_vec) {
116  CHECK(in_same_table(chunk_key, first_chunk_key));
   // buf is assigned in both branches of the isBufferOnDevice() test below.
117  AbstractBuffer* buf;
118  AbstractBuffer* index_buffer = nullptr;
   // Stays empty for non-varlen keys; emptiness is used below as the "no index" marker.
119  ChunkKey index_chunk_key;
120  if (is_varlen_key(chunk_key)) {
121  // For variable length chunks, metadata is associated with the data chunk.
122  CHECK(is_varlen_data_key(chunk_key));
   // Same db/table/column/fragment with a final component of 2 - presumably the index
   // buffer id; TODO confirm.
123  index_chunk_key = {chunk_key[CHUNK_KEY_DB_IDX],
124  chunk_key[CHUNK_KEY_TABLE_IDX],
125  chunk_key[CHUNK_KEY_COLUMN_IDX],
126  chunk_key[CHUNK_KEY_FRAGMENT_IDX],
127  2};
128  }
   // Remains false for freshly created buffers, so their metadata is always written.
129  bool chunk_in_cache = false;
130  if (!caching_file_mgr_->isBufferOnDevice(chunk_key)) {
131  buf = caching_file_mgr_->createBuffer(chunk_key);
132 
133  if (!index_chunk_key.empty()) {
   // A missing data buffer is expected to imply a missing index buffer.
134  CHECK(!caching_file_mgr_->isBufferOnDevice(index_chunk_key));
135  index_buffer = caching_file_mgr_->createBuffer(index_chunk_key);
136  CHECK(index_buffer);
137  }
138  } else {
139  buf = caching_file_mgr_->getBuffer(chunk_key);
140 
141  if (!index_chunk_key.empty()) {
   // An existing data buffer is expected to have its index buffer on device as well.
142  CHECK(caching_file_mgr_->isBufferOnDevice(index_chunk_key));
143  index_buffer = caching_file_mgr_->getBuffer(index_chunk_key);
144  CHECK(index_buffer);
145  }
146 
147  // We should have already cleared the data unless we are appending
148  // If the buffer metadata has changed, we need to remove this chunk
149  if (buf->getEncoder() != nullptr) {
150  const std::shared_ptr<ChunkMetadata> buf_metadata =
151  std::make_shared<ChunkMetadata>();
152  buf->getEncoder()->getMetadata(buf_metadata);
153  chunk_in_cache = *metadata.get() == *buf_metadata;
154  }
155  }
156 
   // New or changed metadata: overwrite it on the buffer, then erase the chunk for the key.
157  if (!chunk_in_cache) {
158  set_metadata_for_buffer(buf, metadata.get());
159  eraseChunk(chunk_key);
160 
161  if (!index_chunk_key.empty()) {
162  CHECK(index_buffer);
163  index_buffer->setUpdated();
164  eraseChunk(index_chunk_key);
165  }
166  }
167  }
   // One checkpoint for the whole batch; valid because all keys share the first key's table.
168  caching_file_mgr_->checkpoint(first_chunk_key[CHUNK_KEY_DB_IDX],
169  first_chunk_key[CHUNK_KEY_TABLE_IDX]);
170 }
171 
// Passes metadata_vec and chunk_prefix to
// CachingFileMgr::getChunkMetadataVecForKeyPrefix; metadata_vec is a non-const reference,
// so it is presumably an output parameter - TODO confirm.
// NOTE(review): listing line 172 (first line of the declaration) is missing from this
// extract; the cross-reference index lists
// getCachedMetadataVecForKeyPrefix(ChunkMetadataVector&, const ChunkKey&) const.
173  ChunkMetadataVector& metadata_vec,
174  const ChunkKey& chunk_prefix) const {
175  caching_file_mgr_->getChunkMetadataVecForKeyPrefix(metadata_vec, chunk_prefix);
176 }
177 
// Returns the result of CachingFileMgr::hasChunkMetadataForKeyPrefix for chunk_prefix.
// NOTE(review): listing line 178 (first line of the declaration) is missing from this
// extract; the cross-reference index lists
// bool hasCachedMetadataForKeyPrefix(const ChunkKey&) const.
179  const ChunkKey& chunk_prefix) const {
180  return caching_file_mgr_->hasChunkMetadataForKeyPrefix(chunk_prefix);
181 }
182 
// Clears the cache for a single table: chunk_prefix must be a table key (enforced with
// CHECK); its database and table components are forwarded to CachingFileMgr::clearForTable.
// NOTE(review): listing line 183 (the declaration) is missing from this extract, so the
// method name and exact signature are not visible here.
184  CHECK(is_table_key(chunk_prefix));
185  auto timer = DEBUG_TIMER(__func__);
186  caching_file_mgr_->clearForTable(chunk_prefix[CHUNK_KEY_DB_IDX],
187  chunk_prefix[CHUNK_KEY_TABLE_IDX]);
188 }
189 
// Resets the whole cache: closes and removes the file manager's physical storage,
// re-creates its base directory, and replaces caching_file_mgr_ with the manager returned
// by reconstruct().
// NOTE(review): listing line 190 (the declaration) is missing from this extract, so the
// method name and exact signature are not visible here.
191  auto timer = DEBUG_TIMER(__func__);
192  // FileMgrs do not clean up after themselves nicely, so we need to close all their disk
193  // resources and then re-create the CachingFileMgr to reset it.
194  caching_file_mgr_->closeRemovePhysical();
195  boost::filesystem::create_directory(caching_file_mgr_->getFileMgrBasePath());
   // reconstruct() is invoked on the old manager; its result then replaces that manager.
196  caching_file_mgr_ = caching_file_mgr_->reconstruct();
197 }
198 
// Returns whatever CachingFileMgr::getChunkKeysForPrefix yields for chunk_prefix.
// NOTE(review): listing line 199 (first line of the declaration) is missing from this
// extract; the cross-reference index lists
// std::vector<ChunkKey> getCachedChunksForKeyPrefix(const ChunkKey&) const.
200  const ChunkKey& chunk_prefix) const {
201  return caching_file_mgr_->getChunkKeysForPrefix(chunk_prefix);
202 }
203 
// Forwards chunk_key to CachingFileMgr::removeChunkKeepMetadata.
// NOTE(review): listing line 204 (the declaration) is missing from this extract;
// presumably this is eraseChunk(const ChunkKey& chunk_key) from the cross-reference
// index - confirm.
205  caching_file_mgr_->removeChunkKeepMetadata(chunk_key);
206 }
207 
// Returns the result of CachingFileMgr::dumpKeysWithChunkData().
// NOTE(review): listing line 208 (the declaration) is missing from this extract, so the
// method name and return type are not visible here.
209  return caching_file_mgr_->dumpKeysWithChunkData();
210 }
211 
// Returns the result of CachingFileMgr::dumpKeysWithMetadata().
// NOTE(review): listing line 212 (the declaration) is missing from this extract, so the
// method name and return type are not visible here.
213  return caching_file_mgr_->dumpKeysWithMetadata();
214 }
215 
// Returns the result of CachingFileMgr::dumpEvictionQueue().
// NOTE(review): listing line 216 (the declaration) is missing from this extract, so the
// method name and return type are not visible here.
217  return caching_file_mgr_->dumpEvictionQueue();
218 }
219 
// Ensures base_path is usable as the cache directory.
// - If the path exists it must be a directory, otherwise std::runtime_error is thrown.
// - If the path does not exist it is created; std::runtime_error is thrown when
//   create_directory() reports that nothing was created.
// NOTE(review): boost::filesystem::create_directory creates only the final path component
// and reports other failures (e.g. a missing parent) through its own exception rather than
// the message below - confirm whether create_directories is wanted here.
220 void ForeignStorageCache::validatePath(const std::string& base_path) const {
221  // check if base_path already exists, and if not create one
222  boost::filesystem::path path(base_path);
223  if (boost::filesystem::exists(path)) {
224  if (!boost::filesystem::is_directory(path)) {
225  throw std::runtime_error{
226  "cache path \"" + base_path +
227  "\" is not a directory. Please specify a valid directory "
228  "with --disk_cache_path=<path>, or use the default location."};
229  }
230  } else { // data directory does not exist
231  if (!boost::filesystem::create_directory(path)) {
232  throw std::runtime_error{
233  "could not create directory at cache path \"" + base_path +
234  "\". Please specify a valid directory location "
235  "with --disk_cache_path=<path> or use the default location."};
236  }
237  }
238 }
239 
// Builds a key -> buffer map for the given keys. Every key must already have a buffer on
// device, that buffer must be a File_Namespace::FileBuffer, and it must not hold data
// pages (all enforced with CHECK). Each buffer is reset to empty before being returned.
// NOTE(review): listing line 240 (first line of the declaration) is missing from this
// extract; the cross-reference index lists
// ChunkToBufferMap getChunkBuffersForCaching(const std::set<ChunkKey>& chunk_keys) const.
241  const std::set<ChunkKey>& keys) const {
242  ChunkToBufferMap chunk_buffer_map;
243  for (const auto& key : keys) {
244  CHECK(caching_file_mgr_->isBufferOnDevice(key));
245  chunk_buffer_map[key] = caching_file_mgr_->getBuffer(key);
   // The map stores AbstractBuffer*; downcast to reach the FileBuffer-specific calls below.
246  auto file_buf = dynamic_cast<File_Namespace::FileBuffer*>(chunk_buffer_map[key]);
247  CHECK(file_buf);
248  CHECK(!file_buf->hasDataPages());
249 
250  // Clear all buffer metadata
251  file_buf->resetToEmpty();
252  }
253  return chunk_buffer_map;
254 }
255 
// Returns the buffer for chunk_key, either existing or newly created.
// - is_new_buffer == false: the buffer must already be on device (CHECK) and is fetched.
// - is_new_buffer == true: the buffer must not be on device yet (CHECK) and is created.
// NOTE(review): listing line 256 (first line of the declaration) is missing from this
// extract; the cross-reference index lists
// AbstractBuffer* getChunkBufferForPrecaching(const ChunkKey& chunk_key, bool is_new_buffer).
257  const ChunkKey& chunk_key,
258  bool is_new_buffer) {
259  if (!is_new_buffer) {
260  CHECK(caching_file_mgr_->isBufferOnDevice(chunk_key));
261  return caching_file_mgr_->getBuffer(chunk_key);
262  } else {
263  CHECK(!caching_file_mgr_->isBufferOnDevice(chunk_key));
264  return caching_file_mgr_->createBuffer(chunk_key);
265  }
266 }
267 
// Stores the data wrapper document for a table by passing doc, db_id and tb_id to
// CachingFileMgr::writeWrapperFile.
// doc: the document to store, forwarded unchanged.
268 void ForeignStorageCache::storeDataWrapper(const std::string& doc,
269  int32_t db_id,
270  int32_t tb_id) {
271  caching_file_mgr_->writeWrapperFile(doc, db_id, tb_id);
272 }
273 
// Returns the result of CachingFileMgr::hasWrapperFile for the given database and table.
// NOTE(review): listing line 274 (first line of the declaration) is missing from this
// extract; the cross-reference index lists
// bool hasStoredDataWrapperMetadata(int32_t db_id, int32_t table_id) const.
275  int32_t table_id) const {
276  return caching_file_mgr_->hasWrapperFile(db_id, table_id);
277 }
278 } // namespace foreign_storage
std::vector< int > ChunkKey
Definition: types.h:36
bool isMetadataCached(const ChunkKey &) const
bool is_table_key(const ChunkKey &key)
Definition: types.h:44
bool is_varlen_data_key(const ChunkKey &key)
Definition: types.h:75
void eraseChunk(const ChunkKey &chunk_key)
void storeDataWrapper(const std::string &doc, int32_t db_id, int32_t tb_id)
#define CHUNK_KEY_DB_IDX
Definition: types.h:38
#define CHUNK_KEY_FRAGMENT_IDX
Definition: types.h:41
std::map< ChunkKey, AbstractBuffer * > ChunkToBufferMap
This file includes the class specification for the cache used by the Foreign Storage Interface (FSI)...
Represents/provides access to contiguous data stored in the file system.
Definition: FileBuffer.h:57
void initEncoder(const SQLTypeInfo &tmp_sql_type)
void setNumElems(const size_t num_elems)
Definition: Encoder.h:285
ChunkStats chunkStats
Definition: ChunkMetadata.h:37
virtual bool resetChunkStats(const ChunkStats &)
Reset chunk-level stats (min, max, nulls) using new values from the argument.
Definition: Encoder.h:274
virtual void getMetadata(const std::shared_ptr< ChunkMetadata > &chunkMetadata)
Definition: Encoder.cpp:231
void getCachedMetadataVecForKeyPrefix(ChunkMetadataVector &, const ChunkKey &) const
bool hasStoredDataWrapperMetadata(int32_t db_id, int32_t table_id) const
#define CHUNK_KEY_TABLE_IDX
Definition: types.h:39
void iterate_over_matching_prefix(Func func, T &chunk_collection, const ChunkKey &chunk_prefix)
std::vector< std::pair< ChunkKey, std::shared_ptr< ChunkMetadata >>> ChunkMetadataVector
An AbstractBuffer is a unit of data management for a data manager.
ChunkToBufferMap getChunkBuffersForCaching(const std::set< ChunkKey > &chunk_keys) const
void putBuffer(const ChunkKey &, AbstractBuffer *, const size_t numBytes=0)
void cacheMetadataVec(const ChunkMetadataVector &)
std::unique_ptr< File_Namespace::CachingFileMgr > caching_file_mgr_
void deleteBufferIfExists(const ChunkKey &chunk_key)
void set_metadata_for_buffer(AbstractBuffer *buffer, ChunkMetadata *meta)
void validatePath(const std::string &) const
#define CHUNK_KEY_VARLEN_IDX
Definition: types.h:42
std::vector< ChunkKey > getCachedChunksForKeyPrefix(const ChunkKey &) const
AbstractBuffer * getChunkBufferForPrecaching(const ChunkKey &chunk_key, bool is_new_buffer)
void setSize(const size_t size)
void checkpoint(const int32_t db_id, const int32_t tb_id)
#define CHECK(condition)
Definition: Logger.h:291
#define DEBUG_TIMER(name)
Definition: Logger.h:412
File_Namespace::FileBuffer * getCachedChunkIfExists(const ChunkKey &)
bool hasCachedMetadataForKeyPrefix(const ChunkKey &) const
ForeignStorageCache(const File_Namespace::DiskCacheConfig &config)
#define CHUNK_KEY_COLUMN_IDX
Definition: types.h:40
bool in_same_table(const ChunkKey &left_key, const ChunkKey &right_key)
Definition: types.h:83
SQLTypeInfo sqlType
Definition: ChunkMetadata.h:34
A selection of helper methods for File I/O.
bool is_varlen_key(const ChunkKey &key)
Definition: types.h:71
size_t numElements
Definition: ChunkMetadata.h:36