Rework clock based unit tests (#2646)

This commit is contained in:
Jatin Chaudhary
2026-01-21 10:55:33 +00:00
committed by GitHub
parent d94185c5b2
commit 0590a72d4b
@@ -17,13 +17,13 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <cstring>
#include <numeric>
#include <vector>
#include <hip_test_common.hh>
#include <hip_test_checkers.hh>
#include <hip/hip_ext.h>
#include <cstring>
#ifndef _WIN32
#include <dlfcn.h>
#endif
/**
* @addtogroup clock clock
@@ -32,302 +32,125 @@ THE SOFTWARE.
* Contains unit tests for clock, clock64 and wall_clock64 APIs
*/
__global__ void kernel_c64(int clock_rate, uint64_t wait_t) {
uint64_t start = clock64() / clock_rate, cur = 0; // in ms
do {
cur = clock64() / clock_rate - start;
} while (cur < wait_t);
// Any sort of wait based on clock cycles will be inaccurate give how modern GPUs clock themselves.
// What clock functions should exhibit is forward progress of the clock ticks.
// What we measure here is the start tick should be smaller than the end tick.
// We do some primitive math in the middle.
__device__ float reduce_32_elements(float* in) {
auto val = in[threadIdx.x];
val += __shfl_down(val, 16);
val += __shfl_down(val, 8);
val += __shfl_down(val, 4);
val += __shfl_down(val, 2);
val += __shfl_down(val, 1);
return val;
}
__global__ void kernel_c(int clock_rate, uint64_t wait_t) {
uint64_t start = clock() / clock_rate, cur = 0; // in ms
do {
cur = clock() / clock_rate - start;
} while (cur < wait_t);
}
__global__ void kernel_wc64(int clock_rate, uint64_t wait_t) {
uint64_t start = wall_clock64() / clock_rate, cur = 0; // in ms
do {
cur = wall_clock64() / clock_rate - start;
} while (cur < wait_t);
}
bool verify_time_execution(float ratio, float time1, float time2, float expected_time1,
float expected_time2) {
bool test_status = false;
if (fabs(time1 - expected_time1) < (ratio * expected_time1) &&
fabs(time2 - expected_time2) < (ratio * expected_time2)) {
INFO("Succeeded: Expected Vs Actual: Kernel1 - " << expected_time1 << " Vs " << time1
<< ", Kernel2 - " << expected_time2 << " Vs "
<< time2);
test_status = true;
} else {
INFO("Failed: Expected Vs Actual: Kernel1 -" << expected_time1 << " Vs " << time1
<< ", Kernel2 - " << expected_time2 << " Vs "
<< time2);
test_status = false;
}
return test_status;
}
/*
* Launching kernel1 and kernel2 and then we try to
* get the event elapsed time of each kernel using the start and
* end events.The event elapsed time should return us the kernel
* execution time for that particular kernel
*/
bool kernel_time_execution(void (*kernel)(int, uint64_t), int clock_rate, uint64_t expected_time1,
uint64_t expected_time2) {
hipStream_t stream;
hipEvent_t start_event1, end_event1, start_event2, end_event2;
float time1 = 0, time2 = 0;
HIP_CHECK(hipEventCreate(&start_event1));
HIP_CHECK(hipEventCreate(&end_event1));
HIP_CHECK(hipEventCreate(&start_event2));
HIP_CHECK(hipEventCreate(&end_event2));
HIP_CHECK(hipStreamCreate(&stream));
hipExtLaunchKernelGGL(kernel, dim3(1), dim3(1), 0, stream, start_event1, end_event1, 0,
clock_rate, expected_time1);
hipExtLaunchKernelGGL(kernel, dim3(1), dim3(1), 0, stream, start_event2, end_event2, 0,
clock_rate, expected_time2);
HIP_CHECK(hipStreamSynchronize(stream));
HIP_CHECK(hipEventElapsedTime(&time1, start_event1, end_event1));
HIP_CHECK(hipEventElapsedTime(&time2, start_event2, end_event2));
HIP_CHECK(hipStreamDestroy(stream));
HIP_CHECK(hipEventDestroy(start_event1));
HIP_CHECK(hipEventDestroy(end_event1));
HIP_CHECK(hipEventDestroy(start_event2));
HIP_CHECK(hipEventDestroy(end_event2));
float ratio = kernel == kernel_wc64 ? 0.01 : 0.5;
return verify_time_execution(ratio, time1, time2, expected_time1, expected_time2);
}
template <class T> void loadSym(T& symbol, const char* symbolName, void* handle) {
using namespace std::string_literals;
void* fnsym = dlsym(handle, symbolName);
if (!fnsym)
throw std::runtime_error("Failure while trying to dynamically load symbol: "s + symbolName);
symbol = reinterpret_cast<T>(fnsym);
}
void getCurrentDeviceUUID(hipUUID& uuid) {
hipDeviceProp_t props;
int deviceId;
HIP_CHECK(hipGetDevice(&deviceId));
HIP_CHECK(hipGetDeviceProperties(&props, deviceId));
std::memcpy(uuid.bytes, props.uuid.bytes, sizeof(hipUUID::bytes));
}
#ifndef _WIN32
// Gets the maximum engine frequency of the GPU by dynamically loading amdsmi
// @uuid the id of the GPU to query the frequency for
// @return the maximum engine frequency of the GPU (MHz) or -1 if error
int getEngineFreq(const hipUUID& uuid) {
static constexpr unsigned int AMDSMI_MAX_STRING_LENGTH = 256;
typedef void* amdsmi_processor_handle;
typedef void* amdsmi_socket_handle;
typedef enum {
AMDSMI_STATUS_SUCCESS = 0, //!< Call succeeded
} amdsmi_status_t;
typedef struct {
uint32_t clk; //!< In MHz
uint32_t min_clk; //!< In MHz
uint32_t max_clk; //!< In MHz
uint8_t clk_locked; //!< True/False
uint8_t clk_deep_sleep; //!< True/False
uint32_t reserved[4];
} amdsmi_clk_info_t;
typedef struct {
uint32_t drm_render; //!< the render node under /sys/class/drm/renderD*
uint32_t drm_card; //!< the graphic card device under /sys/class/drm/card*
uint32_t hsa_id; //!< the HSA enumeration ID
uint32_t hip_id; //!< the HIP enumeration ID
char hip_uuid[AMDSMI_MAX_STRING_LENGTH]; //!< the HIP unique identifer
} amdsmi_enumeration_info_t;
typedef enum {
AMDSMI_CLK_TYPE_GFX = 0x0
} amdsmi_clk_type_t;
amdsmi_clk_info_t clk_info;
uint32_t gpu_count = 0;
uint32_t num_processor = 0;
amdsmi_status_t (*fninit)(uint64_t);
amdsmi_status_t (*fnget_socket_handles)(uint32_t*, amdsmi_socket_handle*);
amdsmi_status_t (*fnget_processor_handles)(amdsmi_socket_handle, uint32_t*,
amdsmi_processor_handle*);
amdsmi_status_t (*fnget_gpu_enumeration_info)(amdsmi_processor_handle,
amdsmi_enumeration_info_t*);
amdsmi_status_t (*fnget_clock_info)(amdsmi_processor_handle, amdsmi_clk_type_t,
amdsmi_clk_info_t*);
amdsmi_status_t (*fnshut_down)();
int result = -1;
bool smi_initialized = false;
auto cleanUp = [&smi_initialized, &fnshut_down](void* handle) {
if (smi_initialized)
fnshut_down();
if (handle)
dlclose(handle);
};
std::unique_ptr<void, decltype(cleanUp)> lib_hdl(nullptr, cleanUp);
lib_hdl.reset(dlopen("libamd_smi.so", RTLD_LAZY));
if (!lib_hdl) {
return -1;
__global__ void reduce_c64(long long* start, long long* end, float* in /* 32 sized input */,
float* out /* single sized output*/) {
if (threadIdx.x == 0) {
*start = clock64();
}
try {
loadSym(fninit, "amdsmi_init", lib_hdl.get());
loadSym(fnget_socket_handles, "amdsmi_get_socket_handles", lib_hdl.get());
loadSym(fnget_processor_handles, "amdsmi_get_processor_handles", lib_hdl.get());
loadSym(fnget_gpu_enumeration_info, "amdsmi_get_gpu_enumeration_info", lib_hdl.get());
loadSym(fnget_clock_info, "amdsmi_get_clock_info", lib_hdl.get());
loadSym(fnshut_down, "amdsmi_shut_down", lib_hdl.get());
} catch (std::runtime_error&) {
return -1;
// do not reorder
__threadfence();
auto val = reduce_32_elements(in);
__threadfence();
if (threadIdx.x == 0) {
*out = val;
*end = clock64();
}
if (fninit(1ul << 1)) {
return -1;
} else
smi_initialized = true;
uint32_t socket_count = 0;
uint32_t num_socket = 0;
// get the socket count available in the system
if (fnget_socket_handles(&socket_count, nullptr)) {
return -1;
}
std::vector<amdsmi_socket_handle> sockets(socket_count);
if (fnget_socket_handles(&socket_count, &sockets[0])) {
return -1;
}
while (num_socket < socket_count && result == -1) {
// just get number of processors first
if (fnget_processor_handles(sockets[num_socket], &gpu_count, nullptr)) {
return -1;
}
std::vector<amdsmi_processor_handle> processors(gpu_count);
if (fnget_processor_handles(sockets[num_socket], &gpu_count, &processors[0])) {
return -1;
}
while (num_processor < gpu_count && result == -1) {
amdsmi_enumeration_info_t info;
int offset = 0;
const char* prefix = "GPU-";
if (fnget_gpu_enumeration_info(processors[num_processor], &info)) {
return -1;
}
if (!std::strncmp(info.hip_uuid, "GPU-", std::strlen(prefix))) {
// amd-smi adds "GPU-" in front of the hip_uuid; whereas HIP doesn't
offset = strlen(prefix);
}
if (!std::memcmp(uuid.bytes, info.hip_uuid + offset, sizeof(hipUUID::bytes) - offset)) {
if (fnget_clock_info(processors[num_processor], AMDSMI_CLK_TYPE_GFX, &clk_info)) {
return -1;
}
result = clk_info.max_clk;
}
num_processor++;
}
num_socket++;
num_processor = 0;
}
return result;
}
#endif
// @max_clock_rate will be set to the maximum clock rate as reported by hipDeviceGetAttribute()
// @return maximum engine clock rate obtained via amdsmi or -1 if querying via amdsmi fails
int getClockRate(int& max_clock_rate) {
max_clock_rate = 0; // in kHz
HIP_CHECK(hipDeviceGetAttribute(&max_clock_rate, hipDeviceAttributeClockRate, 0));
#ifdef _WIN32
return -1;
#else
hipUUID uuid;
int smi_clock_rate = 0; // in kHz
getCurrentDeviceUUID(uuid);
smi_clock_rate = getEngineFreq(uuid);
return smi_clock_rate;
#endif
}
/**
* Test Description
* ------------------------
* - Launches two kernels that run for a specified amount of time passed as a kernel argument by
* using device function clock64. Kernel execution time is calculated through elapsed time between
* the start and end event, and calculated time is compared with passed time values.
* Test source
* ------------------------
* - catch/unit/clock/hipClockCheck.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
__global__ void reduce_c(long long* start, long long* end, float* in /* 32 sized input */,
float* out /* single sized output*/) {
if (threadIdx.x == 0) {
*start = clock();
}
// do not reorder
__threadfence();
auto val = reduce_32_elements(in);
__threadfence();
if (threadIdx.x == 0) {
*out = val;
*end = clock();
}
}
__global__ void reduce_wc64(long long* start, long long* end, float* in /* 32 sized input */,
float* out /* single sized output*/) {
if (threadIdx.x == 0) {
*start = wall_clock64();
}
// do not reorder
__threadfence();
auto val = reduce_32_elements(in);
__threadfence();
if (threadIdx.x == 0) {
*out = val;
*end = wall_clock64();
}
}
void execute_clock_kernels(void (*kernel)(long long*, long long*, float*, float*)) {
constexpr size_t size = 32; /* Do not change this, the math in kernel is done for 32 elements */
float *d_in{}, *d_out{}, out{};
long long *d_clock_start{}, *d_clock_end{}, clock_start{}, clock_end{};
std::vector<float> in(size, 0.0f);
for (size_t i = 0; i < size; i++) {
in[i] = i + 1;
}
auto cpu_result = std::accumulate(in.begin(), in.end(), 0.0f);
HIP_CHECK(hipMalloc(&d_in, sizeof(float) * size));
HIP_CHECK(hipMalloc(&d_out, sizeof(float)));
HIP_CHECK(hipMalloc(&d_clock_start, sizeof(long long)));
HIP_CHECK(hipMalloc(&d_clock_end, sizeof(long long)));
HIP_CHECK(hipMemcpy(d_in, in.data(), sizeof(float) * in.size(), hipMemcpyHostToDevice));
HIP_CHECK(hipMemset(d_out, 0, sizeof(float)));
HIP_CHECK(hipMemset(d_clock_start, 0, sizeof(long long)));
HIP_CHECK(hipMemset(d_clock_end, 0, sizeof(long long)));
hipLaunchKernelGGL(kernel, 1, size, 0, nullptr, d_clock_start, d_clock_end, d_in, d_out);
HIP_CHECK(hipDeviceSynchronize());
HIP_CHECK(hipMemcpy(&clock_start, d_clock_start, sizeof(long long), hipMemcpyDeviceToHost));
HIP_CHECK(hipMemcpy(&clock_end, d_clock_end, sizeof(long long), hipMemcpyDeviceToHost));
HIP_CHECK(hipMemcpy(&out, d_out, sizeof(float), hipMemcpyDeviceToHost));
HIP_CHECK(hipFree(d_in));
HIP_CHECK(hipFree(d_out));
HIP_CHECK(hipFree(d_clock_start));
HIP_CHECK(hipFree(d_clock_end));
// Make sure the math happenned correctly
INFO("sum(1.0f, 2.0f, ..., 32.0f) gpu result: " << out << " cpu: " << cpu_result);
REQUIRE(out == cpu_result);
// Measure the clock progress
// There can be two scenarios:
// 1) clock_start < clock_end : which we expect
// 2) clock_start > clock_end : which means clock warped around, but chances of that happening is
// really low
INFO("Clock start: " << clock_start << " end: " << clock_end);
REQUIRE(clock_start < clock_end);
}
TEST_CASE("Unit_hipClock64_Positive_Basic") {
HIP_CHECK(hipSetDevice(0));
int max_clock_rate;
int clock_rate = getClockRate(max_clock_rate);
if (max_clock_rate == 0) {
HipTest::HIP_SKIP_TEST("hipDeviceAttributeClockRate returns 0");
return;
}
if (IsGfx11()) {
HipTest::HIP_SKIP_TEST("Issue with clock64() function on gfx11 devices!");
return;
}
if (clock_rate == -1) {
// libamd_smi.so might not be present depending on some systems, so we load it dynamically
// and use it if it is, otherwise we use the attribute
UNSCOPED_INFO(
"Failed to get clock rate via amdsmi (is libamd_smi.so in the library search path?)");
clock_rate = max_clock_rate;
} else {
clock_rate *= 1000;
if (clock_rate != max_clock_rate) {
UNSCOPED_INFO("clock rate: " << clock_rate << "kHz is not set to maximum: " << max_clock_rate
<< "kHz");
} else {
UNSCOPED_INFO("clock rate: " << clock_rate << "kHz");
}
}
const auto expected_time1 = GENERATE(1000, 1500, 2000);
const auto expected_time2 = expected_time1 / 2;
REQUIRE(kernel_time_execution(kernel_c64, clock_rate, expected_time1, expected_time2));
execute_clock_kernels(reduce_c64);
}
/**
@@ -344,71 +167,17 @@ TEST_CASE("Unit_hipClock64_Positive_Basic") {
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Unit_hipClock_Positive_Basic") {
HIP_CHECK(hipSetDevice(0));
int max_clock_rate;
int clock_rate = getClockRate(max_clock_rate);
if (max_clock_rate == 0) {
HipTest::HIP_SKIP_TEST("hipDeviceAttributeClockRate returns 0");
return;
}
if (IsGfx11()) {
HipTest::HIP_SKIP_TEST("Issue with clock64() function on gfx11 devices!");
HipTest::HIP_SKIP_TEST("Issue with clock() function on gfx11 devices!");
return;
}
if (clock_rate == -1) {
// libamd_smi.so might not be present depending on some systems, so we load it dynamically
// and use it if it is, otherwise we use the attribute
UNSCOPED_INFO(
"Failed to get clock rate via amdsmi (is libamd_smi.so in the library search path?)");
clock_rate = max_clock_rate;
} else {
clock_rate *= 1000;
if (clock_rate != max_clock_rate) {
UNSCOPED_INFO("clock rate: " << clock_rate << "kHz is not set to maximum: " << max_clock_rate
<< "kHz");
}
}
const auto expected_time1 = GENERATE(1000, 1500, 2000);
const auto expected_time2 = expected_time1 / 2;
REQUIRE(kernel_time_execution(kernel_c, clock_rate, expected_time1, expected_time2));
execute_clock_kernels(reduce_c);
}
/**
* Test Description
* ------------------------
* - Launches two kernels that run for a specified amount of time passed as a kernel argument by
* using device function wall_clock64. Kernel execution time is calculated through elapsed time
* between the start and end event, and calculated time is compared with passed time values.
* Test source
* ------------------------
* - catch/unit/clock/hipClockCheck.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Unit_hipWallClock64_Positive_Basic") {
HIP_CHECK(hipSetDevice(0));
int clock_rate = 0; // in kHz
HIP_CHECK(hipDeviceGetAttribute(&clock_rate, hipDeviceAttributeWallClockRate, 0));
if (!clock_rate) {
HipTest::HIP_SKIP_TEST("hipDeviceAttributeWallClockRate returns 0");
return;
}
const auto expected_time1 = GENERATE(1000, 1500, 2000);
const auto expected_time2 = expected_time1 / 2;
REQUIRE(kernel_time_execution(kernel_wc64, clock_rate, expected_time1, expected_time2));
}
TEST_CASE("Unit_hipWallClock64_Positive_Basic") { execute_clock_kernels(reduce_wc64); }
/**
* End doxygen group DeviceLanguageTest.
* @}
*/
*/