6eb06cf201
Squashed commit of the following: commit f029195705a15700380c6f832ba5d15d46fd6de7 Author: Jonathan R. Madsen <jrmadsen@users.noreply.github.com> Date: Thu Jul 13 14:38:56 2023 -0500 Formatting workflows for source (clang-format) and cmake (cmake-format) (#4) * Add .cmake-format.yaml file * Add formatting workflow * provide base input for creating PR * Update scheme for extracting branch name - disable running formatting on push to amd-staging branch * patch .cmake-format.yaml for find_package signature - apparently cmake-format doesn't format the full signature of find_package * run formatting (clang-format v11) (#7) Co-authored-by: jrmadsen <jrmadsen@users.noreply.github.com> * run cmake formatting (cmake-format) (#6) Co-authored-by: jrmadsen <jrmadsen@users.noreply.github.com> --------- Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> commit bc4d135fdd8a1a9e51235f18a5d575fd2b3735e6 Author: Ammar ELWazir <aelwazir@amd.com> Date: Thu Jul 13 12:55:17 2023 -0500 Removing Build cache for potential issues with auto-generated header files (#5) Change-Id: I9e2319f4335e2f88585ffa6fac2bd88a1c952e6e commit ce86dea6a311d44d880fa684eb78f3329295e2a4 Author: Jonathan R. Madsen <jrmadsen@users.noreply.github.com> Date: Thu Jul 13 11:08:58 2023 -0500 Fix decltype(<hsa-function>) function pointer usage (#3) - the following is done in several places: decltype(hsa_memory_allocate)* hsa_memory_allocate - above can cause compiler errors - replace decltype(<hsa-function>) with decltype(::<hsa-function>) - this ensures that the type within the decltype is recognized as the global scope HSA function, not the variable - in many places, the variable has a "_fn" suffix to prevent this issue but added '::' anyway for consistency commit ac49fdd92a72e9c99394253a02da413a6c2e3b3a Merge: a07946a 03a0855 Author: Ammar ELWazir <aelwazir@amd.com> Date: Wed Jul 12 11:36:24 2023 -0500 Merge pull request #2 from ROCm-Developer-Tools/gerrit-amd-staging Pull from gerrit commit 03a085588cffe863e8f466de67be1cfb205b675a Merge:c26b32ba07946a Author: Ammar ELWazir <aelwazir@amd.com> Date: Wed Jul 12 10:57:30 2023 -0500 Merge branch 'amd-staging' into gerrit-amd-staging commit a07946a5cd4c670c83c27ad1a076a9d4567ce6d7 Author: Ammar ELWazir <Ammar.ELWazir@amd.com> Date: Wed Jul 12 15:46:04 2023 +0000 Enabling Cached Builds commit 525e494a7f13941077a8fd4ad6840904db4d27d4 Author: Ammar ELWazir <Ammar.ELWazir@amd.com> Date: Wed Jul 12 04:53:54 2023 +0000 Updating missed GPU Targets commit 42c75862f628c9bee7cfb7dc04dff2619430efbc Author: Ammar ELWazir <Ammar.ELWazir@amd.com> Date: Wed Jul 12 04:43:02 2023 +0000 Adding V1 Testing commit 9d72fd4aee85e4b0c12e717060d2730fa5b73be1 Author: Ammar ELWazir <Ammar.ELWazir@amd.com> Date: Wed Jul 12 03:34:31 2023 +0000 Fixing Artifacts directory path commit f4000cc558b3b2e4676f7994f7ce8c8e6f94518e Author: Ammar ELWazir <Ammar.ELWazir@amd.com> Date: Wed Jul 12 03:27:26 2023 +0000 Fixing CMake for test build job commit 2ce8115d4c33948c3c8f957f545a95a04e1d6cd2 Author: Ammar ELWazir <Ammar.ELWazir@amd.com> Date: Wed Jul 12 03:16:18 2023 +0000 Fixing Ubuntu CMake for ubuntu test build commit 6d0ed439191be900748d0c025157f9d689a73ec7 Author: Ammar ELWazir <Ammar.ELWazir@amd.com> Date: Wed Jul 12 01:28:41 2023 +0000 Removing Navi21 commit e349a7642e5ae5eb03ab9fcd0a0f74f09f78cab5 Author: Ammar ELWazir <Ammar.ELWazir@amd.com> Date: Wed Jul 12 01:14:14 2023 +0000 Removing Navi21 commit fefd02fe68d2a4bca7ec2e381960ad004ee9fc5b Author: Ammar ELWazir <Ammar.ELWazir@amd.com> Date: Wed Jul 12 00:42:48 2023 +0000 Fixing CMake Job commit 2ea46abf7bf92643efa8c549fa70346ffbd79d65 Author: Ammar ELWazir <Ammar.ELWazir@amd.com> Date: Wed Jul 12 00:35:13 2023 +0000 Fixing CMake Job commit d99d681ed1999c5fcf291dc678b11a77205fb0f3 Author: Ammar ELWazir <Ammar.ELWazir@amd.com> Date: Wed Jul 12 00:32:13 2023 +0000 Fixing Pull Latest Dockers and CMake Jobs commit dfc4498072d13b4a1df3a63047d34c682c3d9a29 Author: Ammar ELWazir <Ammar.ELWazir@amd.com> Date: Tue Jul 11 23:54:21 2023 +0000 Fixing CMake job commit 919efe04de707f7c702031be15c3e2c5f8442cbb Author: Ammar ELWazir <Ammar.ELWazir@amd.com> Date: Tue Jul 11 23:52:13 2023 +0000 Adding Pull Last dockers job commit be1b1256e8b0e05308e8f7e7e69bee3acca55281 Author: Ammar ELWazir <aelwazir@amd.com> Date: Tue Jul 11 18:25:40 2023 -0500 Update cmake.yml commit 212299fa4355ae6ec18f9aaacbb79c51ea6c6f97 Author: Ammar ELWazir <aelwazir@amd.com> Date: Tue Jul 11 18:23:35 2023 -0500 Update cmake.yml commit 7c2c1327086a61466cc6cac39f70865c051a8bc7 Author: Ammar ELWazir <aelwazir@amd.com> Date: Tue Jul 11 18:18:53 2023 -0500 Update cmake.yml commit 191b5ce007e612e814c1d7a3afb4ad398f3852e1 Author: Ammar ELWazir <aelwazir@amd.com> Date: Tue Jul 11 16:03:22 2023 -0500 Update cmake.yml commit 8824113d95f3e13c7ce4d0af8e0d9d8f522a6c4a Author: Ammar ELWazir <Ammar.ELWazir@amd.com> Date: Tue Jul 11 16:28:09 2023 +0000 Fixing Pull from Gerrit job name Change-Id: I9e7ed9a27a13ca49d62c93bdadb30f0057e4d385 commit cc3d5e4b02ffb439e8cc2b3efa53527c376f9982 Author: Ammar ELWazir <Ammar.ELWazir@amd.com> Date: Tue Jul 11 16:21:43 2023 +0000 Adding Staging sync job Change-Id: I0551f43878b0678ce4b3e74e27d62357cf95ad95 commit b9be2eee71380a2e6dd34d520e92d0c4209277a0 Author: Ammar ELWazir <Ammar.ELWazir@amd.com> Date: Tue Jul 11 15:57:11 2023 +0000 Fixing build.sh Change-Id: Ia987b0244f0875370d5fe69907b3f5e9cea914de commit 9eee33a95a1abd656a7ac5ca10a9f245e9825431 Author: Ammar ELWazir <aelwazir@amd.com> Date: Mon Jul 10 21:39:46 2023 -0500 Update cmake.yml commit 7093b85a78497140e8b52632ca2a002bdaeacd62 Author: Ammar ELWazir <aelwazir@amd.com> Date: Mon Jul 10 21:33:29 2023 -0500 Update cmake.yml commit f54697172c72a67740f9fdfa0c217b6ea6931576 Author: Ammar ELWazir <aelwazir@amd.com> Date: Mon Jul 10 21:01:26 2023 -0500 Update cmake.yml commit 1b6620e16f8940386b0f4f04e69e2410d21c0e26 Author: Ammar ELWazir <aelwazir@amd.com> Date: Mon Jul 10 20:21:02 2023 -0500 Update cmake.yml commit a94bec740c6b42c4b79c87bca20fa87b99bf060d Author: Ammar ELWazir <aelwazir@amd.com> Date: Mon Jul 10 19:46:35 2023 -0500 Update cmake.yml commit 85d6b29d4375a69d575c18ece8542c50f2ddfcc3 Author: Ammar ELWazir <aelwazir@amd.com> Date: Mon Jul 10 19:34:39 2023 -0500 Update cmake.yml commit 8c004887cf1435f1a6214c3d2455299a8a27bd4c Author: Ammar ELWazir <aelwazir@amd.com> Date: Mon Jul 10 19:31:17 2023 -0500 Update cmake.yml commit a14a9168e17d9348a53c6e9c9a47ba1edb4c4509 Author: Ammar ELWazir <aelwazir@amd.com> Date: Mon Jul 10 19:25:46 2023 -0500 Update cmake.yml commit 000f2f40b84e6a2f7d4becdbf5aed01436ca4c83 Author: Ammar ELWazir <aelwazir@amd.com> Date: Mon Jul 10 19:08:18 2023 -0500 Update cmake.yml commit a28a53d56731cad848fa9133d1c4dbaa8fc7afa7 Author: Ammar ELWazir <aelwazir@amd.com> Date: Mon Jul 10 19:03:39 2023 -0500 Update cmake.yml commit a6a2db01027f0b01fdfbb5997ddb772c7f51b649 Author: Ammar ELWazir <aelwazir@amd.com> Date: Mon Jul 10 18:21:53 2023 -0500 Update cmake.yml commit 118ef2a88b2d44e3207c31c343da3e5e5ec6f176 Author: Ammar ELWazir <aelwazir@amd.com> Date: Mon Jul 10 17:55:57 2023 -0500 Update cmake.yml commit 03c4c232396440cd0be6d2dd7baf4ceea1c2589d Author: Ammar ELWazir <aelwazir@amd.com> Date: Mon Jul 10 17:48:49 2023 -0500 Create cmake.yml Change-Id: I77992f15694e77cbae49c56f9ff02f4f9079235d [ROCm/rocprofiler commit:d4a33cf33a]
384 rivejä
12 KiB
C++
384 rivejä
12 KiB
C++
/* Copyright (c) 2022 Advanced Micro Devices, Inc.
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE. */
|
|
|
|
#include <algorithm>
|
|
#include <chrono>
|
|
#include <memory>
|
|
#include <numeric>
|
|
#include <vector>
|
|
#include <cfloat>
|
|
#include <cinttypes>
|
|
#include <cstdint>
|
|
#include <cstdlib>
|
|
|
|
#include <unistd.h>
|
|
|
|
#include <hip/hip_runtime.h>
|
|
#include <hsa/hsa.h>
|
|
|
|
#include <rocprofiler/v2/rocprofiler.h>
|
|
|
|
#include "program.hpp"
|
|
#include "program_options.hpp"
|
|
#include "disassembly.hpp"
|
|
|
|
#define XSTR(x) STR(x)
|
|
#define STR(x) #x
|
|
#define DBL_FMT "." XSTR(DBL_DECIMAL_DIG) "f"
|
|
|
|
namespace util {
|
|
|
|
struct hipMalloc_freer {
|
|
void operator()(void* const ptr) { (void)hipFree(ptr); }
|
|
};
|
|
|
|
} // namespace util
|
|
|
|
namespace prng {
|
|
|
|
static uint64_t splitmix64_next(uint64_t* const sm64_state) {
|
|
uint64_t z = (*sm64_state += 0x9e3779b97f4a7c15);
|
|
z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9;
|
|
z = (z ^ (z >> 27)) * 0x94d049bb133111eb;
|
|
return z ^ (z >> 31);
|
|
}
|
|
|
|
static inline uint64_t rotl64(const uint64_t x, int k) { return (x << k) | (x >> (64 - k)); }
|
|
|
|
static uint64_t xrs_next(uint64_t* const xrs_state) {
|
|
const uint64_t result = rotl64(xrs_state[0] + xrs_state[3], 23) + xrs_state[0];
|
|
|
|
const uint64_t t = xrs_state[1] << 17;
|
|
|
|
xrs_state[2] ^= xrs_state[0];
|
|
xrs_state[3] ^= xrs_state[1];
|
|
xrs_state[1] ^= xrs_state[2];
|
|
xrs_state[0] ^= xrs_state[3];
|
|
|
|
xrs_state[2] ^= t;
|
|
|
|
xrs_state[3] = rotl64(xrs_state[3], 45);
|
|
|
|
return result;
|
|
}
|
|
|
|
} // namespace prng
|
|
|
|
namespace kernel {
|
|
|
|
template <typename T> __global__ static void memset_gpu(T* const s, T const c, size_t const n) {
|
|
size_t i_start = threadIdx.x + blockIdx.x * blockDim.x;
|
|
size_t i_shift = blockDim.x * gridDim.x;
|
|
for (size_t i = i_start; i < n; i += i_shift) {
|
|
s[i] = c;
|
|
}
|
|
}
|
|
|
|
template <typename T>
|
|
__global__ static void count_gpu(T const* const xs, T* const out, size_t const n,
|
|
size_t const nblocks, T const gt) {
|
|
size_t i_start = threadIdx.x + blockIdx.x * blockDim.x;
|
|
size_t i_shift = blockDim.x * gridDim.x;
|
|
for (size_t i = i_start; i < n; i += i_shift) {
|
|
if (xs[i] > gt) {
|
|
atomicAdd(&out[i % nblocks], 1);
|
|
}
|
|
}
|
|
}
|
|
|
|
} // namespace kernel
|
|
|
|
static char const GETOPT_ARGS[] = "cd:mn:DP";
|
|
|
|
static void usage() {
|
|
fputs("usage: " PROGNAME
|
|
" [OPTION]... MIN [SEED]\n"
|
|
" -d DEV\tHIP device number\n"
|
|
" -n LEN\tLength of random integer array\n"
|
|
" -D\t\tPrint kernel disassembly\n"
|
|
" -P\t\tPrint source and disassembly of sampled PC locations\n"
|
|
"where\n"
|
|
" DEV : i32\n"
|
|
" MIN : u64\n"
|
|
" LEN : u64\n"
|
|
" SEED : u64\n",
|
|
stderr);
|
|
}
|
|
|
|
static int get_options(int argc, char** argv, program_options* const opts) {
|
|
int opt;
|
|
|
|
while (-1 != (opt = getopt(argc, argv, GETOPT_ARGS))) {
|
|
switch (opt) {
|
|
case 'd':
|
|
// TODO error checking
|
|
opts->device = strtol(optarg, nullptr, 10);
|
|
break;
|
|
case 'n':
|
|
// TODO error checking
|
|
opts->rands_len = strtoul(optarg, nullptr, 10);
|
|
break;
|
|
case 'D':
|
|
opts->disassemble = true;
|
|
break;
|
|
case 'P':
|
|
opts->pc_sampling = true;
|
|
break;
|
|
default:
|
|
usage();
|
|
return EXIT_FAILURE;
|
|
}
|
|
}
|
|
|
|
auto const optcount = argc - optind;
|
|
if (!(1 == optcount || 2 == optcount)) {
|
|
usage();
|
|
return EXIT_FAILURE;
|
|
}
|
|
|
|
// TODO error checking
|
|
opts->gt = strtoul(argv[optind], nullptr, 10);
|
|
if (2 == argc - optind) {
|
|
opts->seed = strtoull(argv[optind + 1], nullptr, 10);
|
|
}
|
|
|
|
return EXIT_SUCCESS;
|
|
}
|
|
|
|
static program_options g_opts;
|
|
|
|
static void callback_flush_fn(rocprofiler_record_header_t const* record,
|
|
rocprofiler_record_header_t const* end_record,
|
|
rocprofiler_session_id_t session_id,
|
|
rocprofiler_buffer_id_t buffer_id) {
|
|
while (record < end_record) {
|
|
if (nullptr == record) {
|
|
break;
|
|
}
|
|
if (ROCPROFILER_PC_SAMPLING_RECORD == record->kind) {
|
|
auto const& pcr = (rocprofiler_record_pc_sample_t&)*record;
|
|
printf("dispatch[%" PRIu64 "] timestamp(%" PRIu64 ") gpu_id(%#" PRIx64 ") pc-sample(%#" PRIx64
|
|
") se(%" PRIu32 ")\n",
|
|
pcr.pc_sample.dispatch_id.value, pcr.pc_sample.timestamp.value,
|
|
pcr.pc_sample.gpu_id.handle, pcr.pc_sample.pc, pcr.pc_sample.se);
|
|
if (g_opts.pc_sampling) {
|
|
disassembly_print_pc_sample_context(pcr.pc_sample.pc);
|
|
}
|
|
}
|
|
rocprofiler_next_record(record, &record, session_id, buffer_id);
|
|
}
|
|
}
|
|
|
|
static int run_kernel(program_options const& opts) {
|
|
rocprofiler_session_id_t sid;
|
|
rocprofiler_filter_id_t fid, fid2;
|
|
rocprofiler_buffer_id_t bid;
|
|
auto rocprofiler_ok = ROCPROFILER_STATUS_SUCCESS;
|
|
|
|
if (opts.pc_sampling) {
|
|
ROCPROFILER_CHECK(rocprofiler_create_session(ROCPROFILER_NONE_REPLAY_MODE, &sid),
|
|
rocprofiler_ok);
|
|
if (ROCPROFILER_STATUS_SUCCESS != rocprofiler_ok) {
|
|
fputs("error: failed to create rocprofiler session\n", stderr);
|
|
return EXIT_FAILURE;
|
|
}
|
|
|
|
rocprofiler_filter_property_t property{};
|
|
|
|
ROCPROFILER_CHECK(
|
|
rocprofiler_create_buffer(sid, callback_flush_fn, static_cast<size_t>(0x1000), &bid),
|
|
rocprofiler_ok);
|
|
if (ROCPROFILER_STATUS_SUCCESS != rocprofiler_ok) {
|
|
fputs("error: failed to add PC sampling session mode\n", stderr);
|
|
goto out;
|
|
}
|
|
|
|
ROCPROFILER_CHECK(rocprofiler_create_filter(sid, ROCPROFILER_PC_SAMPLING_COLLECTION,
|
|
rocprofiler_filter_data_t{}, 0, &fid, property),
|
|
rocprofiler_ok);
|
|
if (ROCPROFILER_STATUS_SUCCESS != rocprofiler_ok) {
|
|
goto cleanup;
|
|
}
|
|
|
|
ROCPROFILER_CHECK(rocprofiler_create_filter(sid, ROCPROFILER_DISPATCH_TIMESTAMPS_COLLECTION,
|
|
rocprofiler_filter_data_t{}, 0, &fid2, property),
|
|
rocprofiler_ok);
|
|
if (ROCPROFILER_STATUS_SUCCESS != rocprofiler_ok) {
|
|
goto cleanup;
|
|
}
|
|
|
|
ROCPROFILER_CHECK(rocprofiler_set_filter_buffer(sid, fid, bid), rocprofiler_ok);
|
|
if (ROCPROFILER_STATUS_SUCCESS != rocprofiler_ok) {
|
|
goto cleanup;
|
|
}
|
|
|
|
ROCPROFILER_CHECK(rocprofiler_set_filter_buffer(sid, fid2, bid), rocprofiler_ok);
|
|
if (ROCPROFILER_STATUS_SUCCESS != rocprofiler_ok) {
|
|
goto cleanup;
|
|
}
|
|
|
|
ROCPROFILER_CHECK(rocprofiler_start_session(sid), rocprofiler_ok);
|
|
if (ROCPROFILER_STATUS_SUCCESS != rocprofiler_ok) {
|
|
goto cleanup;
|
|
}
|
|
}
|
|
|
|
{
|
|
printf("seed = %" PRIu64 "\n", opts.seed);
|
|
|
|
std::vector<uint64_t> rands(opts.rands_len);
|
|
using rands_elt_t = decltype(rands)::value_type;
|
|
|
|
uint64_t sm64_state = opts.seed, xrs_state[4];
|
|
|
|
{
|
|
using prng::splitmix64_next;
|
|
using prng::xrs_next;
|
|
|
|
// Initialize the Xoroshiro PRNG
|
|
xrs_state[0] = splitmix64_next(&sm64_state);
|
|
xrs_state[1] = splitmix64_next(&sm64_state);
|
|
xrs_state[2] = splitmix64_next(&sm64_state);
|
|
xrs_state[3] = splitmix64_next(&sm64_state);
|
|
|
|
// Fill rands with random integers
|
|
for (auto& i : rands) {
|
|
i = xrs_next(xrs_state);
|
|
}
|
|
}
|
|
|
|
struct tm {
|
|
using monoclk = std::chrono::steady_clock;
|
|
using dur = std::chrono::duration<double>;
|
|
};
|
|
|
|
using util::hipMalloc_freer;
|
|
|
|
auto const begin_time = tm::monoclk::now();
|
|
|
|
auto hip_ok = hipSuccess;
|
|
do {
|
|
HIP_CHECK_BREAK(hipSetDevice(opts.device), hip_ok);
|
|
|
|
auto const rands_nbytes = rands.size() * sizeof(rands_elt_t);
|
|
std::unique_ptr<rands_elt_t, hipMalloc_freer> rands_gpu;
|
|
{
|
|
rands_elt_t* rands_gpu_ptr;
|
|
HIP_CHECK_BREAK(hipMalloc(&rands_gpu_ptr, rands_nbytes), hip_ok);
|
|
rands_gpu.reset(rands_gpu_ptr);
|
|
}
|
|
|
|
HIP_CHECK_BREAK(hipMemcpy(rands_gpu.get(), rands.data(), rands_nbytes, hipMemcpyHostToDevice),
|
|
hip_ok);
|
|
(void)hipDeviceSynchronize();
|
|
|
|
uint32_t constexpr nthreads = 256U;
|
|
uint32_t const nblocks = (rands.size() + nthreads - 1) / nthreads;
|
|
|
|
using count_elt_t = size_t;
|
|
|
|
auto const count_subtotals_nbytes = nblocks * sizeof(count_elt_t);
|
|
std::unique_ptr<count_elt_t, hipMalloc_freer> count_subtotals_gpu;
|
|
{
|
|
count_elt_t* count_subtotals_gpu_ptr;
|
|
HIP_CHECK_BREAK(hipMalloc(&count_subtotals_gpu_ptr, count_subtotals_nbytes), hip_ok);
|
|
count_subtotals_gpu.reset(count_subtotals_gpu_ptr);
|
|
}
|
|
|
|
hipLaunchKernelGGL(kernel::memset_gpu, nblocks, nthreads, 0, 0, count_subtotals_gpu.get(),
|
|
0UL, static_cast<size_t>(nblocks));
|
|
HIP_CHECK_BREAK(hipGetLastError(), hip_ok);
|
|
(void)hipDeviceSynchronize();
|
|
|
|
auto const kernel_begin_time = tm::monoclk::now();
|
|
|
|
hipLaunchKernelGGL(kernel::count_gpu, nblocks, nthreads, 0, 0, rands_gpu.get(),
|
|
count_subtotals_gpu.get(), rands.size(), static_cast<size_t>(nblocks),
|
|
opts.gt);
|
|
HIP_CHECK_BREAK(hipGetLastError(), hip_ok);
|
|
(void)hipDeviceSynchronize();
|
|
|
|
auto const kernel_end_time = tm::monoclk::now();
|
|
|
|
std::vector<size_t> count_subtotals(nblocks);
|
|
HIP_CHECK_BREAK(hipMemcpy(count_subtotals.data(), count_subtotals_gpu.get(),
|
|
count_subtotals_nbytes, hipMemcpyDeviceToHost),
|
|
hip_ok);
|
|
(void)hipDeviceSynchronize();
|
|
|
|
// TODO parallel sum on GPU
|
|
auto const total =
|
|
std::accumulate(count_subtotals.cbegin(), count_subtotals.cend(), static_cast<size_t>(0));
|
|
|
|
auto const all_end_time = tm::monoclk::now();
|
|
|
|
tm::dur const kernel_time(kernel_end_time - kernel_begin_time);
|
|
auto total_time(all_end_time - begin_time);
|
|
tm::dur const total_time_without_tool_init(total_time);
|
|
printf(
|
|
"len(rands) = %zu; gt = %zu; count(rands, gt) = %zu\n"
|
|
"main kernel time elapsed: %" DBL_FMT
|
|
"\n"
|
|
"full time elapsed: %" DBL_FMT "\n",
|
|
rands.size(), opts.gt, total, kernel_time.count(), total_time_without_tool_init.count());
|
|
} while (false);
|
|
|
|
if (opts.disassemble) {
|
|
disassembly_disassemble_kernels(false);
|
|
}
|
|
}
|
|
|
|
cleanup:
|
|
if (opts.pc_sampling) {
|
|
rocprofiler_terminate_session(sid);
|
|
rocprofiler_flush_data(sid, bid);
|
|
rocprofiler_destroy_session(sid);
|
|
}
|
|
|
|
out:
|
|
return ROCPROFILER_STATUS_SUCCESS == rocprofiler_ok ? EXIT_SUCCESS : EXIT_FAILURE;
|
|
}
|
|
|
|
int main(int argc, char** argv) {
|
|
if (auto const ret = get_options(argc, argv, &g_opts); EXIT_SUCCESS != ret) {
|
|
return ret;
|
|
}
|
|
|
|
if (hsa_init() != HSA_STATUS_SUCCESS) {
|
|
return EXIT_FAILURE;
|
|
}
|
|
|
|
int ret = EXIT_FAILURE;
|
|
auto ok = ROCPROFILER_STATUS_SUCCESS;
|
|
|
|
ROCPROFILER_CHECK(rocprofiler_initialize(), ok);
|
|
if (ROCPROFILER_STATUS_SUCCESS == ok) {
|
|
ret = run_kernel(g_opts);
|
|
} else {
|
|
goto out;
|
|
}
|
|
|
|
rocprofiler_finalize();
|
|
|
|
out:
|
|
hsa_shut_down();
|
|
return ROCPROFILER_STATUS_SUCCESS == ok && EXIT_FAILURE != ret ? EXIT_SUCCESS : EXIT_FAILURE;
|
|
}
|