32 #include <arrow/filesystem/filesystem.h>
34 #include <boost/filesystem.hpp>
// Callable type that takes two arrow::fs::FileInfo entries by const reference
// and returns bool. FileOrderArrow instantiates FileOrderBase with this type
// and stores values of it in its comparator map.
44 using ArrowFsComparator =
45 std::function<bool(const arrow::fs::FileInfo&, const arrow::fs::FileInfo&)>;
75 : std::runtime_error(error_message) {}
85 : std::runtime_error(error_message) {}
95 std::optional<std::string>
sort_by{std::nullopt};
102 const FilePathOptions& options,
103 const bool recurse =
true);
// Declaration only; the definition is not part of this listing.
// Accepts a list of Arrow FileInfo entries together with FilePathOptions and
// returns a list of FileInfo by value.
// NOTE(review): the name suggests the result is `file_paths` filtered and
// sorted according to `options` -- confirm against the definition.
106 std::vector<arrow::fs::FileInfo> arrow_fs_filter_sort_files(
107 const std::vector<arrow::fs::FileInfo>& file_paths,
108 const FilePathOptions& options);
109 #endif // HAVE_AWS_S3
// String-pair comparator that orders its operands by the values obtained from
// dateTimeParse<kDATE>. Each operand is parsed inside its own try/catch.
// NOTE(review): the variable-name line, the lambda header and both catch
// bodies are not visible in this listing, so the values lhs_t / rhs_t hold
// after a failed parse cannot be confirmed from here.
111 const std::function<bool(const std::string&, const std::string&)>
// Parse the left operand.
116 lhs_t = dateTimeParse<kDATE>(lhs, 0);
117 }
catch (
const std::exception& e) {
// Parse the right operand.
121 rhs_t = dateTimeParse<kDATE>(rhs, 0);
122 }
catch (
const std::exception& e) {
// Order by the parsed values rather than by the raw strings.
125 return lhs_t < rhs_t;
// String-pair comparator that orders its operands by their integer value as
// parsed by stoll. Each operand is parsed inside its own try/catch.
// NOTE(review): the variable-name line and both catch bodies are not visible
// in this listing, so the values lhs_i / rhs_i hold after a failed parse
// cannot be confirmed from here.
127 const std::function<bool(const std::string&, const std::string&)>
129 [](
const std::string& lhs,
const std::string& rhs) ->
bool {
// The second stoll argument is the `pos` out-pointer; passing 0 (null) means
// the number of consumed characters is not reported. Base defaults to 10.
133 lhs_i = stoll(lhs, 0);
134 }
catch (
const std::exception& e) {
// Same parse for the right operand.
138 rhs_i = stoll(rhs, 0);
139 }
catch (
const std::exception& e) {
// Numeric ordering of the two parsed values.
142 return lhs_i < rhs_i;
// Receives the sub-matches produced by the regex search below.
153 boost::match_results<std::string::const_iterator> capture_groups;
// Only build a result when regex_pattern matches somewhere in file_name.
// NOTE(review): the no-match path is not visible in this listing.
156 if (boost::regex_search(file_name, capture_groups, regex_pattern)) {
157 std::stringstream ss;
// Start at index 1: element 0 of match_results is the whole match, so this
// loop streams only the parenthesised capture groups, in order, with no
// separator between them.
158 for (
size_t i = 1; i < capture_groups.size(); i++) {
159 ss << capture_groups[i];
// Hand back the comparator stored in the map entry that was looked up.
// NOTE(review): the lookup itself is not visible in this listing.
185 return comparator_pair->second;
// Comparator: plain std::string operator< on the two arguments.
// NOTE(review): the map keys for all entries below are not visible here.
191 [](
const std::string& lhs,
const std::string& rhs) ->
bool {
return lhs < rhs; }},
// Comparator: treats both strings as filesystem paths and orders them by
// boost::filesystem::last_write_time (earlier write time first).
193 [](
const std::string& lhs,
const std::string& rhs) ->
bool {
194 return boost::filesystem::last_write_time(lhs) <
195 boost::filesystem::last_write_time(rhs);
// The remaining three comparators capture `this`; their bodies are not
// visible in this listing.
// NOTE(review): because they hold `this`, a copy of the owning object would
// carry lambdas still bound to the source object -- confirm it is not copied.
198 [
this](
const std::string& lhs,
const std::string& rhs) ->
bool {
202 [
this](
const std::string& lhs,
const std::string& rhs) ->
bool {
207 [
this](
const std::string& lhs,
const std::string& rhs) ->
bool {
// File-ordering class whose comparators operate on arrow::fs::FileInfo:
// it derives from FileOrderBase instantiated with ArrowFsComparator.
215 class FileOrderArrow :
public FileOrderBase<ArrowFsComparator> {
// Forwards `options` unchanged to the FileOrderBase constructor.
217 FileOrderArrow(
const FilePathOptions& options)
218 : FileOrderBase<ArrowFsComparator>(options) {}
// Looks up the value of getSortBy() in comparator_map_ and returns the mapped
// comparator. The CHECK fires when the key is not present in the map.
220 inline ArrowFsComparator getFileComparator()
override {
221 auto comparator_pair = comparator_map_.find(getSortBy());
222 CHECK(comparator_pair != comparator_map_.end());
223 return comparator_pair->second;
// Table from a string key to the comparator used for it.
// NOTE(review): the key of each entry is not visible in this listing.
// NOTE(review): several entries capture `this`; a copy-constructed
// FileOrderArrow would copy this map and keep lambdas bound to the source
// object -- confirm instances are never copied.
227 const std::map<std::string, ArrowFsComparator> comparator_map_{
// Orders entries by FileInfo::path().
229 [](
const arrow::fs::FileInfo& lhs,
const arrow::fs::FileInfo& rhs) ->
bool {
230 return lhs.path() < rhs.path();
// Orders entries by FileInfo::mtime() (earlier modification time first).
233 [](
const arrow::fs::FileInfo& lhs,
const arrow::fs::FileInfo& rhs) ->
bool {
234 return lhs.mtime() < rhs.mtime();
// Orders entries by comparing the strings that concatCaptureGroups returns
// for each entry's path.
237 [
this](
const arrow::fs::FileInfo& lhs,
const arrow::fs::FileInfo& rhs) ->
bool {
238 auto lhs_name = lhs.path();
239 auto rhs_name = rhs.path();
240 return this->concatCaptureGroups(lhs_name) < this->concatCaptureGroups(rhs_name);
// Only the trailing argument of this comparator's call is visible: the
// concatenated capture groups of rhs.path().
// NOTE(review): presumably delegates to one of the shared regex date/number
// string comparators -- confirm in the full source.
243 [
this](
const arrow::fs::FileInfo& lhs,
const arrow::fs::FileInfo& rhs) ->
bool {
245 this->concatCaptureGroups(rhs.path()));
// Same visible shape as the previous entry; the callee is likewise not
// visible in this listing.
248 [
this](
const arrow::fs::FileInfo& lhs,
const arrow::fs::FileInfo& rhs) ->
bool {
250 this->concatCaptureGroups(rhs.path()));
254 #endif // HAVE_AWS_S3
259 const std::vector<std::string>& all_file_paths,
260 std::vector<std::string>& processed_file_paths);
// Declaration only; takes a file path string and returns bool.
// NOTE(review): presumably reports whether `file_path` is an S3 URI -- the
// definition is not part of this listing, so confirm the exact test there.
262 bool is_s3_uri(
const std::string& file_path);
const std::array< std::string, 2 > non_regex_sort_order_types
std::function< bool(const std::string &, const std::string &)> LocalFileComparator
const std::string REGEX_NUMBER_ORDER_TYPE
LocalFileComparator getFileComparator() override
std::optional< std::string > filter_regex
void throw_no_filter_match(const std::string &pattern)
const std::string REGEX_ORDER_TYPE
const std::string FILE_SORT_REGEX_KEY
bool is_s3_uri(const std::string &file_path)
std::optional< std::string > sort_regex
void validate_sort_options(const FilePathOptions &options)
virtual std::string getSortBy()
FileOrderBase(const FilePathOptions &options)
NoRegexFilterMatchException(const std::string &error_message)
const std::string REGEX_DATE_ORDER_TYPE
void throw_file_not_found(const std::string &file_path)
std::set< std::string > check_for_rolled_off_file_paths(const std::vector< std::string > &all_file_paths, std::vector< std::string > &processed_file_paths)
FileNotFoundException(const std::string &error_message)
const std::string PATHNAME_ORDER_TYPE
const std::string FILE_SORT_ORDER_BY_KEY
const std::map< std::string, LocalFileComparator > comparator_map_
bool file_or_glob_path_exists(const std::string &path)
FileOrderLocal(const FilePathOptions &options)
std::optional< std::string > sort_regex_
const std::array< std::string, 5 > supported_file_sort_order_types
const std::string DATE_MODIFIED_ORDER_TYPE
virtual T getFileComparator()=0
virtual std::string concatCaptureGroups(const std::string &file_name) const
std::vector< std::string > local_glob_filter_sort_files(const std::string &file_path, const FilePathOptions &options, const bool recurse)
const std::array< std::string, 3 > regex_sort_order_types
std::optional< std::string > sort_by_
std::optional< std::string > sort_by
const std::function< bool(const std::string &, const std::string &)> common_regex_number_comp_
const std::function< bool(const std::string &, const std::string &)> common_regex_date_comp_