21 #include <tbb/parallel_for.h>
27 namespace RasterFormat_Namespace {
29 template <
typename OutputType>
33 const int64_t tiles_x,
34 const int64_t tiles_y,
35 const int64_t halo_y_pixels_per_tile_boundary,
36 const int64_t x_pixels_per_tile,
37 const int64_t y_pixels_per_tile,
38 const int64_t num_raster_channels,
39 const int64_t source_tile_y_offset) {
40 if (tile_y + source_tile_y_offset >= tiles_y || tile_y + source_tile_y_offset < 0) {
43 const int64_t output_tile_slot = tile_y * tiles_x + tile_x;
44 const int64_t source_tile_slot = (tile_y + source_tile_y_offset) * tiles_x + tile_x;
45 const int64_t pixels_per_tile = x_pixels_per_tile * y_pixels_per_tile;
46 const int64_t channel_pixels_per_tile = pixels_per_tile * num_raster_channels;
48 int64_t source_y_start;
49 int64_t halo_y_increment;
50 int64_t source_y_increment;
51 if (source_tile_y_offset > 0) {
53 halo_y_start = y_pixels_per_tile - 1;
54 halo_y_increment = -1;
55 source_y_start = 2 * halo_y_pixels_per_tile_boundary - 1;
56 source_y_increment = -1;
61 source_y_start = y_pixels_per_tile - 2 * halo_y_pixels_per_tile_boundary + 1;
62 source_y_increment = 1;
66 tbb::blocked_range<int64_t>(0, halo_y_pixels_per_tile_boundary),
67 [&](
const tbb::blocked_range<int64_t>& halo_y_pixels_range) {
68 const int64_t start_halo_y_offset = halo_y_pixels_range.begin();
69 const int64_t end_halo_y_offset = halo_y_pixels_range.end();
70 for (int64_t halo_y_offset = start_halo_y_offset;
71 halo_y_offset < end_halo_y_offset;
73 const int64_t halo_y = halo_y_start + halo_y_offset * halo_y_increment;
74 const int64_t source_y = source_y_start + halo_y_offset * source_y_increment;
75 const int64_t output_starting_slot_for_halo_row =
76 output_tile_slot * channel_pixels_per_tile + halo_y * x_pixels_per_tile;
77 const int64_t source_starting_slot_for_halo_row =
78 source_tile_slot * channel_pixels_per_tile + source_y * x_pixels_per_tile;
79 for (int64_t local_x_pixel = 0; local_x_pixel < x_pixels_per_tile;
81 const int64_t output_starting_slot =
82 output_starting_slot_for_halo_row + local_x_pixel;
83 const int64_t source_starting_slot =
84 source_starting_slot_for_halo_row + local_x_pixel;
85 output_data[output_starting_slot] = output_data[source_starting_slot];
86 output_data[output_starting_slot + pixels_per_tile] =
87 output_data[source_starting_slot + pixels_per_tile];
88 output_data[output_starting_slot + 2 * pixels_per_tile] =
89 output_data[source_starting_slot + 2 * pixels_per_tile];
95 template <
typename OutputType>
99 const int64_t tiles_x,
100 const int64_t tiles_y,
101 const int64_t halo_x_pixels_per_tile_boundary,
102 const int64_t x_pixels_per_tile,
103 const int64_t y_pixels_per_tile,
104 const int64_t num_raster_channels,
105 const int64_t source_tile_x_offset) {
106 if (tile_x + source_tile_x_offset >= tiles_x || tile_x + source_tile_x_offset < 0) {
109 const int64_t output_tile_slot = tile_y * tiles_x + tile_x;
110 const int64_t source_tile_slot = tile_y * tiles_x + tile_x + source_tile_x_offset;
111 const int64_t pixels_per_tile = x_pixels_per_tile * y_pixels_per_tile;
112 const int64_t channel_pixels_per_tile = pixels_per_tile * num_raster_channels;
113 int64_t halo_x_start;
114 int64_t source_x_start;
115 int64_t halo_x_increment;
116 int64_t source_x_increment;
117 if (source_tile_x_offset > 0) {
119 halo_x_start = x_pixels_per_tile - 1;
120 halo_x_increment = -1;
121 source_x_start = 2 * halo_x_pixels_per_tile_boundary - 1;
122 source_x_increment = -1;
126 halo_x_increment = 1;
127 source_x_start = x_pixels_per_tile - 2 * halo_x_pixels_per_tile_boundary + 1;
128 source_x_increment = 1;
132 tbb::blocked_range<int64_t>(0, y_pixels_per_tile),
133 [&](
const tbb::blocked_range<int64_t>& y_pixels_range) {
134 const int64_t start_local_y_pixel = y_pixels_range.begin();
135 const int64_t end_local_y_pixel = y_pixels_range.end();
137 for (int64_t local_y_pixel = start_local_y_pixel;
138 local_y_pixel < end_local_y_pixel;
140 const int64_t output_starting_slot_for_halo_row =
141 output_tile_slot * channel_pixels_per_tile +
142 local_y_pixel * x_pixels_per_tile;
143 const int64_t source_starting_slot_for_halo_row =
144 source_tile_slot * channel_pixels_per_tile +
145 local_y_pixel * x_pixels_per_tile;
147 for (int64_t halo_x_offset = 0; halo_x_offset < halo_x_pixels_per_tile_boundary;
149 const int64_t halo_x = halo_x_start + halo_x_offset * halo_x_increment;
150 const int64_t source_x = source_x_start + halo_x_offset * source_x_increment;
151 const int64_t output_starting_slot =
152 output_starting_slot_for_halo_row + halo_x;
153 const int64_t source_starting_slot =
154 source_starting_slot_for_halo_row + source_x;
155 output_data[output_starting_slot] = output_data[source_starting_slot];
156 output_data[output_starting_slot + pixels_per_tile] =
157 output_data[source_starting_slot + pixels_per_tile];
158 output_data[output_starting_slot + 2 * pixels_per_tile] =
159 output_data[source_starting_slot + 2 * pixels_per_tile];
165 template <
typename PixelType,
typename ColorType,
typename OutputType>
170 const PixelType x_input_units_per_pixel,
171 const PixelType y_input_units_per_pixel,
172 const float max_color_val,
173 const int64_t x_pixels_per_tile,
174 const int64_t y_pixels_per_tile,
175 const int64_t tile_boundary_halo_pixels,
176 const int64_t target_batch_size_multiple,
177 std::shared_ptr<CpuTimer> timer) {
178 const double x_pixels_per_input_unit = 1.0 / x_input_units_per_pixel;
179 const double y_pixels_per_input_unit = 1.0 / y_input_units_per_pixel;
180 timer->start_event_timer(
"Compute raster metadata");
184 const int64_t logical_x_pixels =
185 (x_pixel_range.second - x_pixel_range.first) * x_pixels_per_input_unit + 1;
186 const auto min_x_input_unit = x_pixel_range.first;
188 const int64_t logical_y_pixels =
189 (y_pixel_range.second - y_pixel_range.first) * y_pixels_per_input_unit + 1;
190 const auto min_y_input_unit = y_pixel_range.first;
194 int64_t halo_x_pixels_per_tile_boundary = 0;
195 int64_t halo_y_pixels_per_tile_boundary = 0;
196 int64_t logical_x_pixels_per_tile = x_pixels_per_tile;
197 int64_t logical_y_pixels_per_tile = y_pixels_per_tile;
198 if (logical_x_pixels > x_pixels_per_tile || logical_y_pixels > y_pixels_per_tile) {
200 halo_x_pixels_per_tile_boundary =
201 std::min(tile_boundary_halo_pixels, (x_pixels_per_tile - 1) / 2);
202 halo_y_pixels_per_tile_boundary =
203 std::min(tile_boundary_halo_pixels, (y_pixels_per_tile - 1) / 2);
204 logical_x_pixels_per_tile -= halo_x_pixels_per_tile_boundary * 2;
205 logical_y_pixels_per_tile -= halo_y_pixels_per_tile_boundary * 2;
207 (logical_x_pixels + (logical_x_pixels_per_tile - 1)) / logical_x_pixels_per_tile;
209 (logical_y_pixels + (logical_y_pixels_per_tile - 1)) / logical_y_pixels_per_tile;
212 const int64_t num_input_pixels = raster_x.
size();
213 const int64_t num_raster_channels = raster_channels.
numCols();
215 const float normalize_ratio = 1.0f / max_color_val;
216 const int64_t num_tiles = x_tiles * y_tiles;
217 const int64_t batch_tiles =
219 : (num_tiles + target_batch_size_multiple - 1) /
220 target_batch_size_multiple * target_batch_size_multiple;
221 const int64_t batch_pixels = batch_tiles * x_pixels_per_tile * y_pixels_per_tile;
222 std::vector<OutputType> output_data(batch_pixels * num_raster_channels,
223 static_cast<OutputType>(0));
224 const RasterInfo raster_info{num_raster_channels,
227 halo_x_pixels_per_tile_boundary,
228 halo_y_pixels_per_tile_boundary,
229 logical_x_pixels_per_tile,
230 logical_y_pixels_per_tile,
234 static_cast<double>(x_input_units_per_pixel),
235 static_cast<double>(y_input_units_per_pixel),
236 static_cast<double>(min_x_input_unit),
237 static_cast<double>(min_y_input_unit)};
239 timer->start_event_timer(
"Format logical pixels");
240 const int64_t pixels_per_tile = x_pixels_per_tile * y_pixels_per_tile;
241 if (x_tiles == 1 && y_tiles == 1) {
243 tbb::blocked_range<int64_t>(0, num_input_pixels),
244 [&](
const tbb::blocked_range<int64_t>& input_pixel_range) {
245 const int64_t end_input_pixel_idx = input_pixel_range.end();
246 for (int64_t input_pixel_idx = input_pixel_range.begin();
247 input_pixel_idx < end_input_pixel_idx;
249 const int64_t output_pixel_x = round(
250 (raster_x[input_pixel_idx] - min_x_input_unit) * x_pixels_per_input_unit);
251 const int64_t output_pixel_y = round(
252 (raster_y[input_pixel_idx] - min_y_input_unit) * y_pixels_per_input_unit);
253 const int64_t output_starting_slot =
254 output_pixel_y * x_pixels_per_tile + output_pixel_x;
255 for (int64_t channel_idx = 0; channel_idx < num_raster_channels;
257 output_data[output_starting_slot + channel_idx * pixels_per_tile] =
259 static_cast<float>(raster_channels[channel_idx][input_pixel_idx]) *
266 const int64_t channel_pixels_per_tile = pixels_per_tile * num_raster_channels;
268 tbb::blocked_range<int64_t>(0, num_input_pixels),
269 [&](
const tbb::blocked_range<int64_t>& input_pixel_range) {
270 const int64_t end_input_pixel_idx = input_pixel_range.end();
271 for (int64_t input_pixel_idx = input_pixel_range.begin();
272 input_pixel_idx < end_input_pixel_idx;
274 const int64_t logical_output_pixel_x = round(
275 (raster_x[input_pixel_idx] - min_x_input_unit) * x_pixels_per_input_unit);
276 const int64_t logical_output_pixel_y = round(
277 (raster_y[input_pixel_idx] - min_y_input_unit) * y_pixels_per_input_unit);
278 const int64_t output_tile_x =
279 logical_output_pixel_x / logical_x_pixels_per_tile;
280 const int64_t output_tile_y =
281 logical_output_pixel_y / logical_y_pixels_per_tile;
282 const int64_t local_output_pixel_x =
283 (logical_output_pixel_x % logical_x_pixels_per_tile) +
284 halo_x_pixels_per_tile_boundary;
285 const int64_t local_output_pixel_y =
286 (logical_output_pixel_y % logical_y_pixels_per_tile) +
287 halo_y_pixels_per_tile_boundary;
288 const int64_t output_tile_slot = output_tile_y * x_tiles + output_tile_x;
289 const int64_t output_starting_slot =
290 output_tile_slot * channel_pixels_per_tile +
291 local_output_pixel_y * x_pixels_per_tile + local_output_pixel_x;
292 for (int64_t channel_idx = 0; channel_idx < num_raster_channels;
294 output_data[output_starting_slot + channel_idx * pixels_per_tile] =
296 static_cast<float>(raster_channels[channel_idx][input_pixel_idx]) *
304 if (halo_x_pixels_per_tile_boundary > 0 || halo_y_pixels_per_tile_boundary > 0) {
305 timer->start_event_timer(
"Add halos");
306 for (int64_t tile_y = 0; tile_y < y_tiles; ++tile_y) {
307 for (int64_t tile_x = 0; tile_x < x_tiles; ++tile_x) {
308 for (int64_t offset = -1; offset < 1; offset += 2) {
314 halo_y_pixels_per_tile_boundary,
322 for (int64_t tile_y = 0; tile_y < y_tiles; ++tile_y) {
323 for (int64_t tile_x = 0; tile_x < x_tiles; ++tile_x) {
329 halo_x_pixels_per_tile_boundary,
339 halo_y_pixels_per_tile_boundary,
348 return std::make_pair(output_data, raster_info);
353 #endif // #ifdef __CUDACC__
NEVER_INLINE HOST std::pair< T, T > get_column_min_max(const Column< T > &col)
DEVICE int64_t size() const
DEVICE int64_t numCols() const
torch::Tensor f(torch::Tensor x, torch::Tensor W_target, torch::Tensor b_target)
void parallel_for(const blocked_range< Int > &range, const Body &body, const Partitioner &p=Partitioner())
#define TEMPLATE_NOINLINE