OmniSciDB  a5dc49c757
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
anonymous_namespace{TableArchiver.cpp} Namespace Reference

Functions

std::string abs_path (const File_Namespace::GlobalFileMgr *global_file_mgr)
 
std::string run (const std::string &cmd, const std::string &chdir="", const bool log_failure=true)
 
std::string simple_file_cat (const std::string &archive_path, const std::string &file_name, const std::string &compression, const bool log_failure=true)
 
std::string get_table_schema (const std::string &archive_path, const std::string &table, const std::string &compression)
 
void update_or_drop_column_ids_in_page_headers (const boost::filesystem::path &path, const std::unordered_map< int, int > &column_ids_map, const int32_t table_epoch, const bool drop_not_update)
 
void update_or_drop_column_ids_in_table_files (const int32_t table_epoch, const std::string &temp_data_dir, const std::unordered_map< int, int > &column_ids_map, const bool drop_not_update)
 
void delete_old_symlinks (const std::string &table_data_dir)
 
void add_data_file_symlinks (const std::string &table_data_dir)
 
void rename_table_directories (const File_Namespace::GlobalFileMgr *global_file_mgr, const std::string &temp_data_dir, const std::vector< std::string > &target_paths, const std::string &name_prefix)
 
std::unordered_map< int, int > find_render_group_columns (const std::list< ColumnDescriptor > &src_columns, std::vector< std::string > &src_oldinfo_strs, const std::string &archive_path)
 
void drop_render_group_columns (const std::unordered_map< int, int > &render_group_column_ids, const std::string &archive_path, const std::string &temp_data_dir, const std::string &compression)
 

Variables

auto simple_file_closer = [](FILE* f) { std::fclose(f); }
 

Function Documentation

std::string anonymous_namespace{TableArchiver.cpp}::abs_path ( const File_Namespace::GlobalFileMgr *  global_file_mgr)
inline

Definition at line 82 of file TableArchiver.cpp.

References File_Namespace::GlobalFileMgr::getBasePath().

Referenced by TableArchiver::dumpTable(), rename_table_directories(), and TableArchiver::restoreTable().

82  {
83  return boost::filesystem::canonical(global_file_mgr->getBasePath()).string();
84 }
std::string getBasePath() const

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void anonymous_namespace{TableArchiver.cpp}::add_data_file_symlinks ( const std::string &  table_data_dir)

Definition at line 322 of file TableArchiver.cpp.

References DATA_FILE_EXT, and File_Namespace::kLegacyDataFileExtension.

Referenced by rename_table_directories().

322  {
323  std::map<boost::filesystem::path, boost::filesystem::path> old_to_new_paths;
324  for (boost::filesystem::directory_iterator it(table_data_dir), end_it; it != end_it;
325  it++) {
326  const auto path = boost::filesystem::canonical(it->path());
327  if (path.extension().string() == DATA_FILE_EXT) {
328  auto old_path = path;
329  old_path.replace_extension(File_Namespace::kLegacyDataFileExtension);
330  // Add a symlink to data file, if one does not exist.
331  if (!boost::filesystem::exists(old_path)) {
332  old_to_new_paths[old_path] = path;
333  }
334  }
335  }
336  for (const auto& [old_path, new_path] : old_to_new_paths) {
337  boost::filesystem::create_symlink(new_path.filename(), old_path);
338  }
339 }
#define DATA_FILE_EXT
Definition: File.h:25
constexpr auto kLegacyDataFileExtension
Definition: File.h:36

+ Here is the caller graph for this function:

void anonymous_namespace{TableArchiver.cpp}::delete_old_symlinks ( const std::string &  table_data_dir)

Definition at line 309 of file TableArchiver.cpp.

Referenced by rename_table_directories().

309  {
310  std::vector<boost::filesystem::path> symlinks;
311  for (boost::filesystem::directory_iterator it(table_data_dir), end_it; it != end_it;
312  it++) {
313  if (boost::filesystem::is_symlink(it->path())) {
314  symlinks.emplace_back(it->path());
315  }
316  }
317  for (const auto& symlink : symlinks) {
318  boost::filesystem::remove_all(symlink);
319  }
320 }

+ Here is the caller graph for this function:

void anonymous_namespace{TableArchiver.cpp}::drop_render_group_columns ( const std::unordered_map< int, int > &  render_group_column_ids,
const std::string &  archive_path,
const std::string &  temp_data_dir,
const std::string &  compression 
)

Definition at line 420 of file TableArchiver.cpp.

References measure< TimeT >::execution(), simple_file_cat(), table_epoch_filename, update_or_drop_column_ids_in_table_files(), and VLOG.

Referenced by TableArchiver::restoreTable().

424  {
425  // rewrite page files to drop the columns with IDs that are the keys of the map
426  if (render_group_column_ids.size()) {
427  const auto epoch = boost::lexical_cast<int32_t>(
428  simple_file_cat(archive_path, table_epoch_filename, compression));
429  const auto time_ms = measure<>::execution([&]() {
430  update_or_drop_column_ids_in_table_files(
431  epoch, temp_data_dir, render_group_column_ids, true /* drop */);
432  });
433  VLOG(3) << "drop render group columns: " << time_ms << " ms";
434  }
435 }
static TimeT::rep execution(F func, Args &&...args)
Definition: sample.cpp:29
std::string simple_file_cat(const std::string &archive_path, const std::string &file_name, const std::string &compression, const bool log_failure=true)
static constexpr char const * table_epoch_filename
void update_or_drop_column_ids_in_table_files(const int32_t table_epoch, const std::string &temp_data_dir, const std::unordered_map< int, int > &column_ids_map, const bool drop_not_update)
#define VLOG(n)
Definition: Logger.h:388

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::unordered_map<int, int> anonymous_namespace{TableArchiver.cpp}::find_render_group_columns ( const std::list< ColumnDescriptor > &  src_columns,
std::vector< std::string > &  src_oldinfo_strs,
const std::string &  archive_path 
)

Definition at line 370 of file TableArchiver.cpp.

References logger::INFO, kMULTIPOLYGON, kPOLYGON, LOG, and split().

Referenced by TableArchiver::restoreTable().

373  {
374  // scan for poly or mpoly columns and collect their names
375  std::vector<std::string> poly_column_names;
376  for (auto const& src_column : src_columns) {
377  auto const sqltype = src_column.columnType.get_type();
378  if (sqltype == kPOLYGON || sqltype == kMULTIPOLYGON) {
379  poly_column_names.push_back(src_column.columnName);
380  }
381  }
382 
383  // remove any matching render group columns from the source list
384  // and capture their IDs in the keys of a map (value is ignored)
385  std::unordered_map<int, int> column_ids_to_drop;
386  auto last_itr = std::remove_if(
387  src_oldinfo_strs.begin(),
388  src_oldinfo_strs.end(),
389  [&](const std::string& v) -> bool {
390  // tokenize
391  std::vector<std::string> tokens;
392  boost::split(
393  tokens, v, boost::is_any_of(":"), boost::token_compress_on);
394  // extract name and ID
395  if (tokens.size() < 2) {
396  throw std::runtime_error(
397  "Dump " + archive_path +
398  " has invalid oldinfo file contents. Dump may be corrupt.");
399  }
400  auto const& column_name = tokens[0];
401  auto const column_id = std::stoi(tokens[1]);
402  for (auto const& poly_column_name : poly_column_names) {
403  // is it a render group column?
404  auto const render_group_column_name = poly_column_name + "_render_group";
405  if (column_name == render_group_column_name) {
406  LOG(INFO) << "RESTORE TABLE dropping render group column '"
407  << render_group_column_name << "' from dump " << archive_path;
408  // add to "set"
409  column_ids_to_drop[column_id] = -1;
410  return true;
411  }
412  }
413  return false;
414  });
415  src_oldinfo_strs.erase(last_itr, src_oldinfo_strs.end());
416 
417  return column_ids_to_drop;
418 }
#define LOG(tag)
Definition: Logger.h:285
std::vector< std::string > split(std::string_view str, std::string_view delim, std::optional< size_t > maxsplit)
split apart a string into a vector of substrings

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::string anonymous_namespace{TableArchiver.cpp}::get_table_schema ( const std::string &  archive_path,
const std::string &  table,
const std::string &  compression 
)
inline

Definition at line 182 of file TableArchiver.cpp.

References simple_file_cat(), and table_schema_filename.

Referenced by TableArchiver::restoreTable().

184  {
185  const auto schema_str =
186  simple_file_cat(archive_path, table_schema_filename, compression);
187  std::regex regex("@T");
188  return std::regex_replace(schema_str, regex, table);
189 }
static constexpr char const * table_schema_filename
std::string simple_file_cat(const std::string &archive_path, const std::string &file_name, const std::string &compression, const bool log_failure=true)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void anonymous_namespace{TableArchiver.cpp}::rename_table_directories ( const File_Namespace::GlobalFileMgr *  global_file_mgr,
const std::string &  temp_data_dir,
const std::vector< std::string > &  target_paths,
const std::string &  name_prefix 
)

Definition at line 341 of file TableArchiver.cpp.

References abs_path(), add_data_file_symlinks(), delete_old_symlinks(), and File_Namespace::FileMgr::renameAndSymlinkLegacyFiles().

Referenced by TableArchiver::restoreTable().

344  {
345  boost::filesystem::path base_path(temp_data_dir);
346  boost::filesystem::directory_iterator end_it;
347  int target_path_index = 0;
348  for (boost::filesystem::directory_iterator fit(base_path); fit != end_it; ++fit) {
349  if (!boost::filesystem::is_regular_file(fit->status())) {
350  const std::string file_path = fit->path().string();
351  const std::string file_name = fit->path().filename().string();
352  if (boost::istarts_with(file_name, name_prefix)) {
353  const std::string target_path =
354  abs_path(global_file_mgr) + "/" + target_paths[target_path_index++];
355  if (std::rename(file_path.c_str(), target_path.c_str())) {
356  throw std::runtime_error("Failed to rename file " + file_path + " to " +
357  target_path + ": " + std::strerror(errno));
358  }
359  // Delete any old/invalid symlinks contained in table dump.
360  delete_old_symlinks(target_path);
361  File_Namespace::FileMgr::renameAndSymlinkLegacyFiles(target_path);
362  // For post-rebrand table dumps, symlinks need to be added here, since file mgr
363  // migration would already have been executed for the dumped table.
364  add_data_file_symlinks(target_path);
365  }
366  }
367  }
368 }
void delete_old_symlinks(const std::string &table_data_dir)
std::string abs_path(const File_Namespace::GlobalFileMgr *global_file_mgr)
static void renameAndSymlinkLegacyFiles(const std::string &table_data_dir)
Definition: FileMgr.cpp:1136
void add_data_file_symlinks(const std::string &table_data_dir)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::string anonymous_namespace{TableArchiver.cpp}::run ( const std::string &  cmd,
const std::string &  chdir = "",
const bool  log_failure = true 
)
inline

Definition at line 86 of file TableArchiver.cpp.

References logger::ERROR, report::error_code(), measure< TimeT >::execution(), LOG, to_lower(), to_string(), and VLOG.

88  {
89  VLOG(3) << "running cmd: " << cmd;
90  int rcode;
91  std::error_code ec;
92  std::string output, errors;
93  const auto time_ms = measure<>::execution([&]() {
94  using namespace boost::process;
95  ipstream stdout, stderr;
96  if (!chdir.empty()) {
97  rcode = system(cmd, std_out > stdout, std_err > stderr, ec, start_dir = chdir);
98  } else {
99  rcode = system(cmd, std_out > stdout, std_err > stderr, ec);
100  }
101  std::ostringstream ss_output, ss_errors;
102  stdout >> ss_output.rdbuf();
103  stderr >> ss_errors.rdbuf();
104  output = ss_output.str();
105  errors = ss_errors.str();
106  });
107  if (rcode || ec) {
108  if (log_failure) {
109  LOG(ERROR) << "failed cmd: " << cmd;
110  LOG(ERROR) << "exit code: " << rcode;
111  LOG(ERROR) << "error code: " << ec.value() << " - " << ec.message();
112  LOG(ERROR) << "stdout: " << output;
113  LOG(ERROR) << "stderr: " << errors;
114  }
115 #if defined(__APPLE__)
116  // osx bsdtar options "--use-compress-program" and "--fast-read" together
117  // run into pipe write error after tar extracts the first occurrence of a
118  // file and closes the read end while the decompression program still writes
119  // to the pipe. bsdtar doesn't handle this situation well like gnu tar does.
120  if (1 == rcode && cmd.find("--fast-read") &&
121  (errors.find("cannot write decoded block") != std::string::npos ||
122  errors.find("Broken pipe") != std::string::npos)) {
123  // ignore this error, or lose speed advantage of "--fast-read" on osx.
124  LOG(ERROR) << "tar error ignored on osx for --fast-read";
125  } else
126 #endif
127  // circumvent tar warning on reading file that is "changed as we read it".
128  // this warning results from reading a table file under concurrent inserts
129  if (1 == rcode && errors.find("changed as we read") != std::string::npos) {
130  LOG(ERROR) << "tar error ignored under concurrent inserts";
131  } else {
132  int error_code;
133  std::string error_message;
134  if (ec) {
135  error_code = ec.value();
136  error_message = ec.message();
137  } else {
138  error_code = rcode;
139  // Show a more concise message for permission errors instead of the default
140  // verbose message. Error logs will still contain all details.
141  if (to_lower(errors).find("permission denied") != std::string::npos) {
142  error_message = "Insufficient file read/write permission.";
143  } else {
144  error_message = errors;
145  }
146  }
147  throw std::runtime_error(
148  "An error occurred while executing an internal command. Error code: " +
149  std::to_string(error_code) + ", message: " + error_message);
150  }
151  } else {
152  VLOG(3) << "finished cmd: " << cmd;
153  VLOG(3) << "time: " << time_ms << " ms";
154  VLOG(3) << "stdout: " << output;
155  }
156  return output;
157 }
std::string to_lower(const std::string &str)
static TimeT::rep execution(F func, Args &&...args)
Definition: sample.cpp:29
#define LOG(tag)
Definition: Logger.h:285
std::string to_string(char const *&&v)
def error_code
Definition: report.py:234
#define VLOG(n)
Definition: Logger.h:388

+ Here is the call graph for this function:

std::string anonymous_namespace{TableArchiver.cpp}::simple_file_cat ( const std::string &  archive_path,
const std::string &  file_name,
const std::string &  compression,
const bool  log_failure = true 
)
inline

Definition at line 159 of file TableArchiver.cpp.

References get_quoted_string(), ddl_utils::IMPORT, run, and ddl_utils::validate_allowed_file_path().

Referenced by drop_render_group_columns(), get_table_schema(), and TableArchiver::restoreTable().

162  {
163  ddl_utils::validate_allowed_file_path(archive_path,
164  ddl_utils::DataTransferType::IMPORT);
165 #if defined(__APPLE__)
166  constexpr static auto opt_occurrence = "--fast-read";
167 #else
168  constexpr static auto opt_occurrence = "--occurrence=1";
169 #endif
170  boost::filesystem::path temp_dir =
171  boost::filesystem::temp_directory_path() / boost::filesystem::unique_path();
172  boost::filesystem::create_directories(temp_dir);
173  run("tar " + compression + " -xvf " + get_quoted_string(archive_path) + " " +
174  opt_occurrence + " " + file_name,
175  temp_dir.string(),
176  log_failure);
177  const auto output = run("cat " + (temp_dir / file_name).string());
178  boost::filesystem::remove_all(temp_dir);
179  return output;
180 }
std::string get_quoted_string(const std::string &filename, char quote, char escape)
Quote a string while escaping any existing quotes in the string.
void validate_allowed_file_path(const std::string &file_path, const DataTransferType data_transfer_type, const bool allow_wildcards)
Definition: DdlUtils.cpp:822
static bool run

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void anonymous_namespace{TableArchiver.cpp}::update_or_drop_column_ids_in_page_headers ( const boost::filesystem::path &  path,
const std::unordered_map< int, int > &  column_ids_map,
const int32_t  table_epoch,
const bool  drop_not_update 
)

Definition at line 193 of file TableArchiver.cpp.

References DATA_FILE_EXT, heavyai::file_size(), heavyai::fopen(), File_Namespace::is_page_deleted_with_checkpoint(), simple_file_closer, split(), and to_string().

Referenced by update_or_drop_column_ids_in_table_files().

197  {
198  const std::string file_path = path.string();
199  const std::string file_name = path.filename().string();
200  std::vector<std::string> tokens;
201  boost::split(tokens, file_name, boost::is_any_of("."));
202 
203  // ref. FileMgr::init for hint of data file name layout
204  if (tokens.size() <= 2 || !(DATA_FILE_EXT == "." + tokens[2] || tokens[2] == "mapd")) {
205  // We are only interested in files in the form <id>.<page_size>.<DATA_FILE_EXT>
206  return;
207  }
208 
209  const auto page_size = boost::lexical_cast<int64_t>(tokens[1]);
210  const auto file_size = boost::filesystem::file_size(file_path);
211  std::unique_ptr<FILE, decltype(simple_file_closer)> fp(
212  std::fopen(file_path.c_str(), "r+"), simple_file_closer);
213  if (!fp) {
214  throw std::runtime_error("Failed to open " + file_path +
215  " for update: " + std::strerror(errno));
216  }
217  // TODO(Misiu): Rather than reference an external layout we should de-duplicate this
218  // page-reading code in a single location. This will also reduce the need for comments
219  // below.
220  // ref. FileInfo::openExistingFile for hint of chunk header layout
221  for (size_t page = 0; page < file_size / page_size; ++page) {
222  int32_t header_info[8];
223  if (0 != std::fseek(fp.get(), page * page_size, SEEK_SET)) {
224  throw std::runtime_error("Failed to seek to page# " + std::to_string(page) +
225  file_path + " for read: " + std::strerror(errno));
226  }
227  if (1 != fread(header_info, sizeof header_info, 1, fp.get())) {
228  throw std::runtime_error("Failed to read " + file_path + ": " +
229  std::strerror(errno));
230  }
231  if (const auto header_size = header_info[0]; header_size > 0) {
232  // header_info[1] is the page's db_id; but can also be used as an "is deleted"
233  // indicator if negative.
234  auto& contingent = header_info[1];
235  // header_info[2] is the page's table_id; but can also be used to store the page's
236  // epoch since the FileMgr stores table_id information separately.
237  auto& epoch = header_info[2];
238  auto& col_id = header_info[3];
239  if (File_Namespace::is_page_deleted_with_checkpoint(
240  table_epoch, epoch, contingent)) {
241  continue;
242  }
243  auto column_map_it = column_ids_map.find(col_id);
244  bool rewrite_header = false;
245  if (drop_not_update) {
246  // if the header contains a column ID that is a key of the map
247  // erase the entire header so that column is effectively dropped
248  // the value of the map is ignored, thus allowing us to use the
249  // same function for both operations
250  if (column_map_it != column_ids_map.end()) {
251  // clear the entire header
252  std::memset(header_info, 0, sizeof(header_info));
253  rewrite_header = true;
254  }
255  } else {
256  if (column_map_it == column_ids_map.end()) {
257  throw std::runtime_error("Page " + std::to_string(page) + " in " + file_path +
258  " has unexpected Column ID " + std::to_string(col_id) +
259  ". Dump may be corrupt.");
260  }
261  // If a header contains a column id that is remapped to new location
262  // then write that change to the file.
263  if (const auto dest_col_id = column_map_it->second; col_id != dest_col_id) {
264  col_id = dest_col_id;
265  rewrite_header = true;
266  }
267  }
268  if (rewrite_header) {
269  if (0 != std::fseek(fp.get(), page * page_size, SEEK_SET)) {
270  throw std::runtime_error("Failed to seek to page# " + std::to_string(page) +
271  file_path + " for write: " + std::strerror(errno));
272  }
273  if (1 != fwrite(header_info, sizeof header_info, 1, fp.get())) {
274  throw std::runtime_error("Failed to write " + file_path + ": " +
275  std::strerror(errno));
276  }
277  }
278  }
279  }
280 }
#define DATA_FILE_EXT
Definition: File.h:25
std::string to_string(char const *&&v)
std::vector< std::string > split(std::string_view str, std::string_view delim, std::optional< size_t > maxsplit)
split apart a string into a vector of substrings
::FILE * fopen(const char *filename, const char *mode)
Definition: heavyai_fs.cpp:74
bool is_page_deleted_with_checkpoint(int32_t table_epoch, int32_t page_epoch, int32_t contingent)
Definition: FileInfo.cpp:259
size_t file_size(const int fd)
Definition: heavyai_fs.cpp:33

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void anonymous_namespace{TableArchiver.cpp}::update_or_drop_column_ids_in_table_files ( const int32_t  table_epoch,
const std::string &  temp_data_dir,
const std::unordered_map< int, int > &  column_ids_map,
const bool  drop_not_update 
)

Definition at line 286 of file TableArchiver.cpp.

References ThreadController_NS::SimpleThreadController< FutureReturnType >::checkThreadsStatus(), cpu_threads(), ThreadController_NS::SimpleThreadController< FutureReturnType >::finish(), ThreadController_NS::SimpleThreadController< FutureReturnType >::startThread(), and update_or_drop_column_ids_in_page_headers().

Referenced by drop_render_group_columns(), and TableArchiver::restoreTable().

290  {
291  boost::filesystem::path base_path(temp_data_dir);
292  boost::filesystem::recursive_directory_iterator end_it;
293  ThreadController_NS::SimpleThreadController<> thread_controller(cpu_threads());
294  for (boost::filesystem::recursive_directory_iterator fit(base_path); fit != end_it;
295  ++fit) {
296  if (!boost::filesystem::is_symlink(fit->path()) &&
297  boost::filesystem::is_regular_file(fit->status())) {
298  thread_controller.startThread(update_or_drop_column_ids_in_page_headers,
299  fit->path(),
300  column_ids_map,
301  table_epoch,
302  drop_not_update);
303  thread_controller.checkThreadsStatus();
304  }
305  }
306  thread_controller.finish();
307 }
void update_or_drop_column_ids_in_page_headers(const boost::filesystem::path &path, const std::unordered_map< int, int > &column_ids_map, const int32_t table_epoch, const bool drop_not_update)
int cpu_threads()
Definition: thread_count.h:25

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

Variable Documentation

auto anonymous_namespace{TableArchiver.cpp}::simple_file_closer = [](FILE* f) { std::fclose(f); }
inline