OmniSciDB  a5dc49c757
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
file_path_util.h
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
24 #pragma once
25 #include <array>
26 #include <optional>
27 #include <stdexcept>
28 #include <string>
29 #include <vector>
30 
31 #ifdef HAVE_AWS_S3
32 #include <arrow/filesystem/filesystem.h>
33 #endif // HAVE_AWS_S3
34 #include <boost/filesystem.hpp>
35 
36 #include "Shared/DateTimeParser.h"
37 #include "Shared/StringTransform.h"
39 
40 namespace shared {
41 
42 using LocalFileComparator = std::function<bool(const std::string&, const std::string&)>;
43 #ifdef HAVE_AWS_S3
44 using ArrowFsComparator =
45  std::function<bool(const arrow::fs::FileInfo&, const arrow::fs::FileInfo&)>;
46 #endif // HAVE_AWS_S3
47 
// Names of the two file-sort options (presumably used as option keys by
// callers outside this header -- confirm against usage).
inline const std::string FILE_SORT_ORDER_BY_KEY{"FILE_SORT_ORDER_BY"};
inline const std::string FILE_SORT_REGEX_KEY{"FILE_SORT_REGEX"};

// Names of the individual sort-order types.
inline const std::string PATHNAME_ORDER_TYPE{"PATHNAME"};
inline const std::string DATE_MODIFIED_ORDER_TYPE{"DATE_MODIFIED"};
inline const std::string REGEX_ORDER_TYPE{"REGEX"};
inline const std::string REGEX_DATE_ORDER_TYPE{"REGEX_DATE"};
inline const std::string REGEX_NUMBER_ORDER_TYPE{"REGEX_NUMBER"};
56 
57 inline const std::array<std::string, 5> supported_file_sort_order_types{
63 
64 inline const std::array<std::string, 2> non_regex_sort_order_types{
67 
68 inline const std::array<std::string, 3> regex_sort_order_types{REGEX_ORDER_TYPE,
71 
/// Runtime error type used to report a file or directory that does not exist.
class FileNotFoundException : public std::runtime_error {
 public:
  /// @param error_message text made available through what().
  // Kept non-explicit, as in the original, so implicit conversions from
  // std::string continue to compile.
  FileNotFoundException(const std::string& error_message)
      : std::runtime_error{error_message} {}
};
77 
78 inline void throw_file_not_found(const std::string& file_path) {
79  throw FileNotFoundException{"File or directory \"" + file_path + "\" does not exist."};
80 }
81 
/// Runtime error type used to report that a regex filter matched no files.
class NoRegexFilterMatchException : public std::runtime_error {
 public:
  /// @param error_message text made available through what().
  // Kept non-explicit, as in the original, so implicit conversions from
  // std::string continue to compile.
  NoRegexFilterMatchException(const std::string& error_message)
      : std::runtime_error{error_message} {}
};
87 
88 inline void throw_no_filter_match(const std::string& pattern) {
89  throw NoRegexFilterMatchException{"No files matched the regex file path \"" + pattern +
90  "\"."};
91 }
92 
/**
 * Optional filtering and sorting settings applied to a set of file paths.
 * Every field defaults to "not set" (std::nullopt).
 */
struct FilePathOptions {
  // Regex used to filter file paths, when set.
  std::optional<std::string> filter_regex{std::nullopt};
  // Name of the sort-order type; FileOrderBase::getSortBy() falls back to
  // PATHNAME_ORDER_TYPE when this is not set.
  std::optional<std::string> sort_by{std::nullopt};
  // Regex whose capture groups form the sort key for the regex-based orderings.
  std::optional<std::string> sort_regex{std::nullopt};
};
98 
99 void validate_sort_options(const FilePathOptions& options);
100 
101 std::vector<std::string> local_glob_filter_sort_files(const std::string& file_path,
102  const FilePathOptions& options,
103  const bool recurse = true);
104 
105 #ifdef HAVE_AWS_S3
106 std::vector<arrow::fs::FileInfo> arrow_fs_filter_sort_files(
107  const std::vector<arrow::fs::FileInfo>& file_paths,
108  const FilePathOptions& options);
109 #endif // HAVE_AWS_S3
110 
111 const std::function<bool(const std::string&, const std::string&)>
112  common_regex_date_comp_ = [](const std::string& lhs, const std::string& rhs) -> bool {
113  int64_t lhs_t;
114  int64_t rhs_t;
115  try {
116  lhs_t = dateTimeParse<kDATE>(lhs, 0);
117  } catch (const std::exception& e) {
118  lhs_t = 0;
119  }
120  try {
121  rhs_t = dateTimeParse<kDATE>(rhs, 0);
122  } catch (const std::exception& e) {
123  rhs_t = 0;
124  }
125  return lhs_t < rhs_t;
126 };
/**
 * "Less than" comparator that orders two strings by the integer each one
 * parses to with std::stoll (base 10).
 *
 * An operand for which std::stoll throws (for example no leading digits, or a
 * value out of range) is given the value 0, so all unparsable strings compare
 * equivalent to one another and to "0".
 *
 * Declared `inline` so that every translation unit including this header
 * shares one object, consistent with the other header-scope constants here.
 */
inline const std::function<bool(const std::string&, const std::string&)>
    common_regex_number_comp_ =
        [](const std::string& lhs, const std::string& rhs) -> bool {
  // Parse failures are deliberately mapped to 0 instead of propagating.
  const auto to_int_or_zero = [](const std::string& text) -> int64_t {
    try {
      return std::stoll(text, nullptr);
    } catch (const std::exception&) {
      return 0;
    }
  };
  const int64_t lhs_i = to_int_or_zero(lhs);
  const int64_t rhs_i = to_int_or_zero(rhs);
  return lhs_i < rhs_i;
};
144 
145 template <class T>
147  public:
148  inline FileOrderBase(const FilePathOptions& options)
149  : sort_regex_(options.sort_regex), sort_by_(options.sort_by) {}
150 
151  virtual inline std::string concatCaptureGroups(const std::string& file_name) const {
152  CHECK(sort_regex_.has_value());
153  boost::match_results<std::string::const_iterator> capture_groups;
154  boost::regex regex_pattern(sort_regex_.value());
155 
156  if (boost::regex_search(file_name, capture_groups, regex_pattern)) {
157  std::stringstream ss;
158  for (size_t i = 1; i < capture_groups.size(); i++) {
159  ss << capture_groups[i];
160  }
161  return ss.str();
162  }
163  return ""; // Empty strings sorted to beginning
164  }
165 
166  virtual inline std::string getSortBy() {
167  return to_upper(sort_by_.value_or(PATHNAME_ORDER_TYPE));
168  }
169 
170  virtual T getFileComparator() = 0;
171 
172  protected:
173  std::optional<std::string> sort_regex_;
174  std::optional<std::string> sort_by_;
175 };
176 
177 class FileOrderLocal : public FileOrderBase<LocalFileComparator> {
178  public:
180  : FileOrderBase<LocalFileComparator>(options) {}
181 
183  auto comparator_pair = comparator_map_.find(getSortBy());
184  CHECK(comparator_pair != comparator_map_.end());
185  return comparator_pair->second;
186  }
187 
188  protected:
189  const std::map<std::string, LocalFileComparator> comparator_map_{
191  [](const std::string& lhs, const std::string& rhs) -> bool { return lhs < rhs; }},
193  [](const std::string& lhs, const std::string& rhs) -> bool {
194  return boost::filesystem::last_write_time(lhs) <
195  boost::filesystem::last_write_time(rhs);
196  }},
198  [this](const std::string& lhs, const std::string& rhs) -> bool {
199  return this->concatCaptureGroups(lhs) < this->concatCaptureGroups(rhs);
200  }},
202  [this](const std::string& lhs, const std::string& rhs) -> bool {
204  this->concatCaptureGroups(rhs));
205  }},
207  [this](const std::string& lhs, const std::string& rhs) -> bool {
209  this->concatCaptureGroups(rhs));
210  }}};
211 };
212 
213 #ifdef HAVE_AWS_S3
214 
215 class FileOrderArrow : public FileOrderBase<ArrowFsComparator> {
216  public:
217  FileOrderArrow(const FilePathOptions& options)
218  : FileOrderBase<ArrowFsComparator>(options) {}
219 
220  inline ArrowFsComparator getFileComparator() override {
221  auto comparator_pair = comparator_map_.find(getSortBy());
222  CHECK(comparator_pair != comparator_map_.end());
223  return comparator_pair->second;
224  }
225 
226  protected:
227  const std::map<std::string, ArrowFsComparator> comparator_map_{
229  [](const arrow::fs::FileInfo& lhs, const arrow::fs::FileInfo& rhs) -> bool {
230  return lhs.path() < rhs.path();
231  }},
233  [](const arrow::fs::FileInfo& lhs, const arrow::fs::FileInfo& rhs) -> bool {
234  return lhs.mtime() < rhs.mtime();
235  }},
237  [this](const arrow::fs::FileInfo& lhs, const arrow::fs::FileInfo& rhs) -> bool {
238  auto lhs_name = lhs.path();
239  auto rhs_name = rhs.path();
240  return this->concatCaptureGroups(lhs_name) < this->concatCaptureGroups(rhs_name);
241  }},
243  [this](const arrow::fs::FileInfo& lhs, const arrow::fs::FileInfo& rhs) -> bool {
244  return common_regex_date_comp_(this->concatCaptureGroups(lhs.path()),
245  this->concatCaptureGroups(rhs.path()));
246  }},
248  [this](const arrow::fs::FileInfo& lhs, const arrow::fs::FileInfo& rhs) -> bool {
249  return common_regex_number_comp_(this->concatCaptureGroups(lhs.path()),
250  this->concatCaptureGroups(rhs.path()));
251  }}};
252 };
253 
254 #endif // HAVE_AWS_S3
255 
256 bool file_or_glob_path_exists(const std::string& path);
257 
258 std::set<std::string> check_for_rolled_off_file_paths(
259  const std::vector<std::string>& all_file_paths,
260  std::vector<std::string>& processed_file_paths);
261 
262 bool is_s3_uri(const std::string& file_path);
263 } // namespace shared
const std::array< std::string, 2 > non_regex_sort_order_types
std::function< bool(const std::string &, const std::string &)> LocalFileComparator
const std::string REGEX_NUMBER_ORDER_TYPE
LocalFileComparator getFileComparator() override
std::optional< std::string > filter_regex
void throw_no_filter_match(const std::string &pattern)
const std::string REGEX_ORDER_TYPE
const std::string FILE_SORT_REGEX_KEY
bool is_s3_uri(const std::string &file_path)
std::optional< std::string > sort_regex
void validate_sort_options(const FilePathOptions &options)
virtual std::string getSortBy()
FileOrderBase(const FilePathOptions &options)
NoRegexFilterMatchException(const std::string &error_message)
const std::string REGEX_DATE_ORDER_TYPE
void throw_file_not_found(const std::string &file_path)
std::set< std::string > check_for_rolled_off_file_paths(const std::vector< std::string > &all_file_paths, std::vector< std::string > &processed_file_paths)
FileNotFoundException(const std::string &error_message)
const std::string PATHNAME_ORDER_TYPE
const std::string FILE_SORT_ORDER_BY_KEY
const std::map< std::string, LocalFileComparator > comparator_map_
std::string to_upper(const std::string &str)
bool file_or_glob_path_exists(const std::string &path)
FileOrderLocal(const FilePathOptions &options)
std::optional< std::string > sort_regex_
const std::array< std::string, 5 > supported_file_sort_order_types
const std::string DATE_MODIFIED_ORDER_TYPE
virtual T getFileComparator()=0
virtual std::string concatCaptureGroups(const std::string &file_name) const
#define CHECK(condition)
Definition: Logger.h:291
std::vector< std::string > local_glob_filter_sort_files(const std::string &file_path, const FilePathOptions &options, const bool recurse)
const std::array< std::string, 3 > regex_sort_order_types
std::optional< std::string > sort_by_
std::optional< std::string > sort_by
const std::function< bool(const std::string &, const std::string &)> common_regex_number_comp_
const std::function< bool(const std::string &, const std::string &)> common_regex_date_comp_