OmniSciDB  a5dc49c757
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
file_path_util.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
24 #include "Shared/file_path_util.h"
25 
26 #include "Logger/Logger.h"
28 #include "Shared/misc.h"
29 
30 namespace shared {
31 
32 void validate_sort_options(const FilePathOptions& options) {
33  const auto sort_by_str = to_upper(options.sort_by.value_or(PATHNAME_ORDER_TYPE));
34 
36  throw std::runtime_error{FILE_SORT_ORDER_BY_KEY +
37  " must be one of the following options: " +
39  }
40 
41  if (shared::contains(non_regex_sort_order_types, sort_by_str) &&
42  options.sort_regex.has_value()) {
43  throw std::runtime_error{"Option \"" + FILE_SORT_REGEX_KEY +
44  "\" must not be set for selected option \"" +
45  FILE_SORT_ORDER_BY_KEY + "='" + sort_by_str + "'\"."};
46  }
47 
48  if (shared::contains(regex_sort_order_types, sort_by_str) &&
49  !options.sort_regex.has_value()) {
50  throw std::runtime_error{"Option \"" + FILE_SORT_REGEX_KEY +
51  "\" must be set for selected option \"" +
52  FILE_SORT_ORDER_BY_KEY + "='" + sort_by_str + "'\"."};
53  }
54 }
55 
56 namespace {
57 
58 std::vector<std::string> glob_local_recursive_files(const std::string& file_path,
59  const bool recurse) {
60 #if 107200 <= BOOST_VERSION
61  constexpr auto options = boost::filesystem::directory_options::follow_directory_symlink;
62 #else
63  constexpr auto options = boost::filesystem::symlink_option::recurse;
64 #endif
65  std::vector<std::string> file_paths;
66 
67  if (boost::filesystem::is_regular_file(file_path)) {
68  file_paths.emplace_back(file_path);
69  } else if (recurse && boost::filesystem::is_directory(file_path)) {
70  for (boost::filesystem::recursive_directory_iterator it(file_path, options), eit;
71  it != eit;
72  ++it) {
73  if (!boost::filesystem::is_directory(it->path())) {
74  file_paths.emplace_back(it->path().string());
75  }
76  }
77  // empty directories will not throw an error
78  } else {
79  auto glob_results = heavyai::glob(file_path);
80  for (const auto& path : glob_results) {
81  if (recurse && boost::filesystem::is_directory(path)) {
82  auto expanded_paths = glob_local_recursive_files(path, true);
83  file_paths.insert(file_paths.end(), expanded_paths.begin(), expanded_paths.end());
84  } else {
85  file_paths.emplace_back(path);
86  }
87  }
88  if (file_paths.empty()) {
89  throw_file_not_found(file_path);
90  }
91  }
92  return file_paths;
93 }
94 
95 std::vector<std::string> regex_file_filter(const std::string& pattern,
96  const std::vector<std::string>& file_paths) {
97  boost::regex regex_pattern(pattern);
98  std::vector<std::string> matched_file_paths;
99  for (const auto& path : file_paths) {
100  if (boost::regex_match(path, regex_pattern)) {
101  matched_file_paths.emplace_back(path);
102  }
103  }
104  if (matched_file_paths.empty()) {
105  throw_no_filter_match(pattern);
106  }
107  return matched_file_paths;
108 }
109 
110 } // namespace
111 
112 std::vector<std::string> local_glob_filter_sort_files(const std::string& file_path,
113  const FilePathOptions& options,
114  const bool recurse) {
115  auto result_files = glob_local_recursive_files(file_path, recurse);
116  if (options.filter_regex.has_value()) {
117  result_files = regex_file_filter(options.filter_regex.value(), result_files);
118  }
119  // initial lexicographical order ensures a determinisitc ordering for files not matching
120  // sort_regex
121  FilePathOptions temp_options;
122  temp_options.sort_by = PATHNAME_ORDER_TYPE;
123  auto initial_file_order = FileOrderLocal(temp_options);
124  auto lexi_comp = initial_file_order.getFileComparator();
125  std::stable_sort(result_files.begin(), result_files.end(), lexi_comp);
126 
127  auto file_order = FileOrderLocal(options);
128  auto comp = file_order.getFileComparator();
129  std::stable_sort(result_files.begin(), result_files.end(), comp);
130  return result_files;
131 }
132 
133 #ifdef HAVE_AWS_S3
134 namespace {
135 
136 std::vector<arrow::fs::FileInfo> arrow_fs_regex_file_filter(
137  const std::string& pattern,
138  const std::vector<arrow::fs::FileInfo>& file_info_list) {
139  boost::regex regex_pattern(pattern);
140  std::vector<arrow::fs::FileInfo> matched_file_info_list;
141  for (const auto& file_info : file_info_list) {
142  if (boost::regex_match(file_info.path(), regex_pattern)) {
143  matched_file_info_list.emplace_back(file_info);
144  }
145  }
146  if (matched_file_info_list.empty()) {
147  throw_no_filter_match(pattern);
148  }
149  return matched_file_info_list;
150 }
151 
152 } // namespace
153 
154 std::vector<arrow::fs::FileInfo> arrow_fs_filter_sort_files(
155  const std::vector<arrow::fs::FileInfo>& file_paths,
156  const FilePathOptions& options) {
157  auto result_files =
158  options.filter_regex.has_value()
159  ? arrow_fs_regex_file_filter(options.filter_regex.value(), file_paths)
160  : file_paths;
161  // initial lexicographical order ensures a determinisitc ordering for files not matching
162  // sort_regex
163  FilePathOptions temp_options;
164  temp_options.sort_by = PATHNAME_ORDER_TYPE;
165  auto initial_file_order = FileOrderArrow(temp_options);
166  auto lexi_comp = initial_file_order.getFileComparator();
167  std::stable_sort(result_files.begin(), result_files.end(), lexi_comp);
168 
169  auto file_order = FileOrderArrow(options);
170  auto comp = file_order.getFileComparator();
171  std::stable_sort(result_files.begin(), result_files.end(), comp);
172  return result_files;
173 }
174 
175 #endif // HAVE_AWS_S3
176 
177 bool file_or_glob_path_exists(const std::string& path) {
178  return boost::filesystem::exists(path) || !heavyai::glob(path).empty();
179 }
180 
/**
 * Determines which previously processed files have been rolled off, and removes them
 * from `processed_file_paths`.
 *
 * Every entry of `processed_file_paths` that precedes the first occurrence of
 * `all_file_paths[0]` counts as rolled off. If `all_file_paths` is empty, or its first
 * entry does not occur in `processed_file_paths`, every processed entry is rolled off.
 *
 * @param all_file_paths       file paths that currently exist
 * @param processed_file_paths previously processed file paths; the rolled-off leading
 *                             entries are erased from this vector
 * @return the set of rolled-off file paths
 */
std::set<std::string> check_for_rolled_off_file_paths(
    const std::vector<std::string>& all_file_paths,
    std::vector<std::string>& processed_file_paths) {
  // An empty all_file_paths vector implies that all files have been rolled off.
  const auto roll_off_end_it = all_file_paths.empty()
                                   ? processed_file_paths.end()
                                   : std::find(processed_file_paths.begin(),
                                               processed_file_paths.end(),
                                               all_file_paths.front());

  std::set<std::string> rolled_off_file_paths(processed_file_paths.begin(),
                                              roll_off_end_it);

  // Erase by iterator range rather than by the set's size: the set de-duplicates, so its
  // size can be smaller than the number of rolled-off entries when the processed list
  // contains repeated paths, which would leave stale entries behind.
  processed_file_paths.erase(processed_file_paths.begin(), roll_off_end_it);
  return rolled_off_file_paths;
}
203 
/**
 * Reports whether `file_path` is an S3 URI, i.e. whether it starts with "s3://".
 *
 * @param file_path path or URI to inspect
 * @return true only when the string begins with the "s3://" scheme prefix
 */
bool is_s3_uri(const std::string& file_path) {
  const std::string s3_prefix = "s3://";
  // rfind with pos = 0 can only match at index 0, so this is a true prefix test; a plain
  // find() would also accept "s3://" embedded later in a local path.
  return file_path.rfind(s3_prefix, 0) == 0;
}
208 } // namespace shared
bool contains(const T &container, const U &element)
Definition: misc.h:204
const std::array< std::string, 2 > non_regex_sort_order_types
std::optional< std::string > filter_regex
void throw_no_filter_match(const std::string &pattern)
const std::string FILE_SORT_REGEX_KEY
Shared utility for globbing files; paths can be specified as a single file, a directory, or a wildcard pattern.
bool is_s3_uri(const std::string &file_path)
std::optional< std::string > sort_regex
void validate_sort_options(const FilePathOptions &options)
std::string join(T const &container, std::string const &delim)
void throw_file_not_found(const std::string &file_path)
std::set< std::string > check_for_rolled_off_file_paths(const std::vector< std::string > &all_file_paths, std::vector< std::string > &processed_file_paths)
const std::string PATHNAME_ORDER_TYPE
std::vector< std::string > glob_local_recursive_files(const std::string &file_path, const bool recurse)
const std::string FILE_SORT_ORDER_BY_KEY
std::string to_upper(const std::string &str)
bool file_or_glob_path_exists(const std::string &path)
const std::array< std::string, 5 > supported_file_sort_order_types
std::vector< std::string > local_glob_filter_sort_files(const std::string &file_path, const FilePathOptions &options, const bool recurse)
const std::array< std::string, 3 > regex_sort_order_types
std::vector< std::string > glob(const std::string &pattern)
std::vector< std::string > regex_file_filter(const std::string &pattern, const std::vector< std::string > &file_paths)
std::optional< std::string > sort_by