// Wall-clock timing helper (fragment: several original lines are not visible here).
// TimeT selects the unit of the reported duration and defaults to milliseconds.
26 template <
typename TimeT = std::chrono::milliseconds>
// F is the type of the callable being timed; Args are the argument types forwarded to it.
28 template <
typename F,
typename... Args>
// Sample the monotonic clock before invoking the callable.
30 auto start = std::chrono::steady_clock::now();
// Invoke the callable, forwarding the caller's arguments with their value category.
31 func(std::forward<Args>(
args)...);
// Elapsed time since `start`, converted to TimeT. The left-hand side of this
// statement is outside the visible lines; presumably it initializes `duration`.
33 std::chrono::duration_cast<TimeT>(std::chrono::steady_clock::now() - start);
// Return the elapsed time as a raw tick count expressed in TimeT units.
34 return duration.count();
// CUDA driver-API status check (fragment): a non-success code is printed to
// stdout, and the assert below then fails in builds where assertions are enabled.
39 if (err != CUDA_SUCCESS) {
40 std::cout << err << std::endl;
// NOTE(review): assert() expands to nothing when NDEBUG is defined, so in such
// builds a failing call would only be logged above - confirm that is acceptable.
42 assert(err == CUDA_SUCCESS);
// Program entry point (fragment: many original lines are not visible here).
// Visible steps: read the PTX text, build and upload a large host input buffer,
// size the result buffers, stage kernel parameters, copy the results back,
// sum them and print the total.
46 int main(
int argc,
char** argv) {
// Read the whole of "kernel.ptx" into a string.
// NOTE(review): no check that the file opened is visible here; if it is missing,
// `str` ends up empty - confirm a later call reports that.
56 std::ifstream t(
"kernel.ptx");
57 std::string str((std::istreambuf_iterator<char>(t)), std::istreambuf_iterator<char>());
// Host input column: N one-byte elements (about 1 GB), every byte set to 42.
68 int64_t
N = 1000000000LL;
69 int8_t* byte_stream_col_0 =
new int8_t[
N];
70 memset(byte_stream_col_0, 42, N);
// Upload the input column to the device (devBufferA is declared outside the visible lines).
74 checkCudaErrors(cuMemcpyHtoD(devBufferA, byte_stream_col_0,
sizeof(int8_t) * N));
// Block and grid dimensions: 128 x 128 along X, 1 along Y and Z.
80 unsigned blockSizeX = 128;
81 unsigned blockSizeY = 1;
82 unsigned blockSizeZ = 1;
83 unsigned gridSizeX = 128;
84 unsigned gridSizeY = 1;
85 unsigned gridSizeZ = 1;
// Host result buffer: blockSizeX * gridSizeX int64_t slots. new[] takes an
// element count, not a byte count, so sizeof(int64_t) must not be multiplied in
// here; doing so allocated 8x more memory than the copy-back and the summation
// loop below ever touch.
88 int64_t* result_vec =
new int64_t[blockSizeX * gridSizeX];
// Device result buffer: cuMemAlloc takes a size in bytes, hence the sizeof factor.
89 checkCudaErrors(cuMemAlloc(&devBufferB, blockSizeX * gridSizeX *
sizeof(int64_t)));
// Row count handed to the kernel equals the number of input bytes.
92 int64_t row_count =
N;
// Initial aggregate value (0), uploaded to the device buffer devBufferI.
97 int64_t init_agg_val = 0;
99 checkCudaErrors(cuMemcpyHtoD(devBufferI, &init_agg_val,
sizeof(int64_t)));
// Addresses of the device pointers, collected into the kernel argument array.
// devBufferAA and devBufferN are declared outside the visible lines.
101 void* KernelParams[] = {&devBufferAA, &devBufferN, &devBufferI, &devBufferB};
// Copy the results back to the host. The opening of this statement (presumably
// a checkCudaErrors( wrapper) is outside the visible lines.
116 cuMemcpyDtoH(result_vec, devBufferB, blockSizeX * gridSizeX *
sizeof(int64_t)));
// Sum all result slots into `result` (declared outside the visible lines).
120 for (
size_t i = 0; i < blockSizeX * gridSizeX; ++i) {
121 result += result_vec[i];
123 std::cout << result << std::endl;
// Release the host input buffer.
// NOTE(review): no matching delete[] for result_vec is visible here - confirm
// it is released on a line outside this view.
126 delete[] byte_stream_col_0;
// NOTE(review): the three lines below look like declaration summaries rather than
// definitions; their bodies are not part of this span - confirm their origin.
static TimeT::rep execution(F func, Args &&...args)
void checkCudaErrors(CUresult err)
unsigned long long CUdeviceptr