37 " must be one of the following options: " +
44 "\" must not be set for selected option \"" +
51 "\" must be set for selected option \"" +
// Pick the "follow symlinked directories" iterator option for the Boost version in use.
// 107200 is BOOST_VERSION for Boost 1.72.0: from that version on the directory_options
// enum is used, otherwise the older symlink_option enum.
// NOTE(review): the #else / #endif lines separating the two definitions are not visible
// in this listing.
60 #if 107200 <= BOOST_VERSION
61 constexpr
auto options = boost::filesystem::directory_options::follow_directory_symlink;
63 constexpr
auto options = boost::filesystem::symlink_option::recurse;
// Accumulates every path found for file_path.
65 std::vector<std::string> file_paths;
// A regular file is added as-is. A directory is only walked when recurse is set.
67 if (boost::filesystem::is_regular_file(file_path)) {
68 file_paths.emplace_back(file_path);
69 }
else if (recurse && boost::filesystem::is_directory(file_path)) {
// Walk the directory tree and keep every entry that is not itself a directory.
70 for (boost::filesystem::recursive_directory_iterator it(file_path, options), eit;
73 if (!boost::filesystem::is_directory(it->path())) {
74 file_paths.emplace_back(it->path().string());
// Glob results: with recurse set, a directory contributes expanded_paths; the other
// visible statement appends the path itself.
// NOTE(review): the statement producing expanded_paths and the else separating the two
// appends are not visible in this listing — confirm against the full file.
80 for (
const auto& path : glob_results) {
81 if (recurse && boost::filesystem::is_directory(path)) {
83 file_paths.insert(file_paths.end(), expanded_paths.begin(), expanded_paths.end());
85 file_paths.emplace_back(path);
// NOTE(review): the handling of an empty result is elided here; presumably it reports
// the missing path (cf. throw_file_not_found) — confirm.
88 if (file_paths.empty()) {
96 const std::vector<std::string>& file_paths) {
// Keeps the entries of file_paths that match `pattern`, preserving their input order.
// boost::regex_match succeeds only when the whole path matches the expression,
// not when a substring does.
97 boost::regex regex_pattern(pattern);
98 std::vector<std::string> matched_file_paths;
99 for (
const auto& path : file_paths) {
100 if (boost::regex_match(path, regex_pattern)) {
101 matched_file_paths.emplace_back(path);
// NOTE(review): the body of this empty-result branch is not visible in this listing;
// presumably it reports the unmatched pattern (cf. throw_no_filter_match) — confirm.
104 if (matched_file_paths.empty()) {
107 return matched_file_paths;
114 const bool recurse) {
// Two-pass ordering of result_files (its declaration, and those of initial_file_order
// and file_order, are on lines not shown in this listing).
// First pass: order by the comparator obtained from initial_file_order.
124 auto lexi_comp = initial_file_order.getFileComparator();
125 std::stable_sort(result_files.begin(), result_files.end(), lexi_comp);
// Second pass: order by the comparator obtained from file_order. std::stable_sort keeps
// entries that compare equal under `comp` in the order left by the first pass.
128 auto comp = file_order.getFileComparator();
129 std::stable_sort(result_files.begin(), result_files.end(), comp);
// Returns the entries of file_info_list whose path() matches `pattern`, preserving
// their input order. boost::regex_match succeeds only when the whole path matches
// the expression, not when a substring does.
136 std::vector<arrow::fs::FileInfo> arrow_fs_regex_file_filter(
137 const std::string& pattern,
138 const std::vector<arrow::fs::FileInfo>& file_info_list) {
139 boost::regex regex_pattern(pattern);
140 std::vector<arrow::fs::FileInfo> matched_file_info_list;
141 for (
const auto& file_info : file_info_list) {
142 if (boost::regex_match(file_info.path(), regex_pattern)) {
143 matched_file_info_list.emplace_back(file_info);
// NOTE(review): the body of this empty-result branch is not visible in this listing;
// presumably it reports the unmatched pattern (cf. throw_no_filter_match) — confirm.
146 if (matched_file_info_list.empty()) {
149 return matched_file_info_list;
// Filters (optionally) and orders a listing of Arrow FileInfo entries according to
// `options`. Note that the parameter named file_paths holds FileInfo objects, not strings.
154 std::vector<arrow::fs::FileInfo> arrow_fs_filter_sort_files(
155 const std::vector<arrow::fs::FileInfo>& file_paths,
156 const FilePathOptions& options) {
// The regex filter is applied only when options.filter_regex holds a value.
// NOTE(review): the start of this statement and the alternative branch of the
// conditional are not visible in this listing.
158 options.filter_regex.has_value()
159 ? arrow_fs_regex_file_filter(options.filter_regex.value(), file_paths)
// First pass: order by the comparator built from a separate FilePathOptions object.
// NOTE(review): the line configuring temp_options is elided; the `lexi_comp` name
// suggests a lexicographic/pathname order — confirm.
163 FilePathOptions temp_options;
165 auto initial_file_order = FileOrderArrow(temp_options);
166 auto lexi_comp = initial_file_order.getFileComparator();
167 std::stable_sort(result_files.begin(), result_files.end(), lexi_comp);
// Second pass: order by the comparator built from the caller's options. std::stable_sort
// keeps entries that compare equal under `comp` in the order left by the first pass.
169 auto file_order = FileOrderArrow(options);
170 auto comp = file_order.getFileComparator();
171 std::stable_sort(result_files.begin(), result_files.end(), comp);
175 #endif // HAVE_AWS_S3
// True when `path` exists on the filesystem as given, or, failing that, when
// heavyai::glob(path) returns at least one entry. The glob call is skipped if the
// exists() check already succeeds (short-circuit ||).
178 return boost::filesystem::exists(path) || !
heavyai::glob(path).empty();
182 const std::vector<std::string>& all_file_paths,
183 std::vector<std::string>& processed_file_paths) {
// Collects the entries of processed_file_paths that are treated as rolled off, removes
// that many entries from the front of processed_file_paths (modified in place), and
// returns the collected set.
184 std::set<std::string> rolled_off_file_paths;
// No current files at all: every processed path goes into the result.
185 if (all_file_paths.empty()) {
187 rolled_off_file_paths.insert(processed_file_paths.begin(),
188 processed_file_paths.end());
// Otherwise every processed path located before the first occurrence of
// all_file_paths[0] goes into the result. If all_file_paths[0] is not found, std::find
// returns end() and all processed paths are collected.
// NOTE(review): the line joining the two branches (presumably `} else {`) is missing
// from this listing; all_file_paths[0] is only valid when the vector is non-empty.
190 auto roll_off_end_it = std::find(
191 processed_file_paths.begin(), processed_file_paths.end(), all_file_paths[0]);
192 for (
auto it = processed_file_paths.begin(); it != roll_off_end_it; it++) {
193 rolled_off_file_paths.emplace(*it);
// Drop as many leading entries as there are rolled-off paths.
// NOTE(review): the count is the size of a std::set, so duplicate entries in
// processed_file_paths would make it smaller than the number of entries scanned above —
// confirm that processed_file_paths cannot contain duplicates.
196 if (!rolled_off_file_paths.empty()) {
197 processed_file_paths.erase(
198 processed_file_paths.begin(),
199 processed_file_paths.begin() + rolled_off_file_paths.size());
201 return rolled_off_file_paths;
// Reports whether file_path contains the "s3://" scheme marker.
// NOTE(review): std::string::find matches the marker anywhere in file_path, not only at
// position 0, so a path that merely embeds "s3://" is also reported as an S3 URI —
// confirm whether a prefix-only check is intended.
205 const std::string s3_prefix =
"s3://";
206 return file_path.find(s3_prefix) != std::string::npos;
bool contains(const T &container, const U &element)
const std::array< std::string, 2 > non_regex_sort_order_types
std::optional< std::string > filter_regex
void throw_no_filter_match(const std::string &pattern)
const std::string FILE_SORT_REGEX_KEY
Shared utility for globbing files; paths can be specified as a single file, a directory, or a wildcard pattern.
bool is_s3_uri(const std::string &file_path)
std::optional< std::string > sort_regex
void validate_sort_options(const FilePathOptions &options)
void throw_file_not_found(const std::string &file_path)
std::set< std::string > check_for_rolled_off_file_paths(const std::vector< std::string > &all_file_paths, std::vector< std::string > &processed_file_paths)
const std::string PATHNAME_ORDER_TYPE
std::vector< std::string > glob_local_recursive_files(const std::string &file_path, const bool recurse)
const std::string FILE_SORT_ORDER_BY_KEY
bool file_or_glob_path_exists(const std::string &path)
const std::array< std::string, 5 > supported_file_sort_order_types
std::vector< std::string > local_glob_filter_sort_files(const std::string &file_path, const FilePathOptions &options, const bool recurse)
const std::array< std::string, 3 > regex_sort_order_types
std::vector< std::string > glob(const std::string &pattern)
std::vector< std::string > regex_file_filter(const std::string &pattern, const std::vector< std::string > &file_paths)
std::optional< std::string > sort_by