Rework clock based unit tests (#2646)

2026-01-21 10:55:33 +00:00
parent d94185c5b2
commit 0590a72d4b
1 changed files with 114 additions and 345 deletions
@@ -17,13 +17,13 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */

+#include <cstring>
+#include <numeric>
+#include <vector>
+
 #include <hip_test_common.hh>
 #include <hip_test_checkers.hh>
 #include <hip/hip_ext.h>
-#include <cstring>
-#ifndef _WIN32
-#include <dlfcn.h>
-#endif

 /**
 * @addtogroup clock clock
@@ -32,302 +32,125 @@ THE SOFTWARE.
 * Contains unit tests for clock, clock64 and wall_clock64 APIs
 */

-__global__ void kernel_c64(int clock_rate, uint64_t wait_t) {
-  uint64_t start = clock64() / clock_rate, cur = 0;  // in ms
-  do {
-    cur = clock64() / clock_rate - start;
-  } while (cur < wait_t);
+// Any sort of wait based on clock cycles will be inaccurate give how modern GPUs clock themselves.
+// What clock functions should exhibit is forward progress of the clock ticks.
+// What we measure here is the start tick should be smaller than the end tick.
+// We do some primitive math in the middle.
+
+__device__ float reduce_32_elements(float* in) {
+  auto val = in[threadIdx.x];
+  val += __shfl_down(val, 16);
+  val += __shfl_down(val, 8);
+  val += __shfl_down(val, 4);
+  val += __shfl_down(val, 2);
+  val += __shfl_down(val, 1);
+  return val;
 }

-__global__ void kernel_c(int clock_rate, uint64_t wait_t) {
-  uint64_t start = clock() / clock_rate, cur = 0;  // in ms
-  do {
-    cur = clock() / clock_rate - start;
-  } while (cur < wait_t);
-}
-
-__global__ void kernel_wc64(int clock_rate, uint64_t wait_t) {
-  uint64_t start = wall_clock64() / clock_rate, cur = 0;  // in ms
-  do {
-    cur = wall_clock64() / clock_rate - start;
-  } while (cur < wait_t);
-}
-
-bool verify_time_execution(float ratio, float time1, float time2, float expected_time1,
-                           float expected_time2) {
-  bool test_status = false;
-
-  if (fabs(time1 - expected_time1) < (ratio * expected_time1) &&
-      fabs(time2 - expected_time2) < (ratio * expected_time2)) {
-    INFO("Succeeded: Expected Vs Actual: Kernel1 - " << expected_time1 << " Vs " << time1
-                                                     << ", Kernel2 - " << expected_time2 << " Vs "
-                                                     << time2);
-    test_status = true;
-  } else {
-    INFO("Failed: Expected Vs Actual: Kernel1 -" << expected_time1 << " Vs " << time1
-                                                 << ", Kernel2 - " << expected_time2 << " Vs "
-                                                 << time2);
-    test_status = false;
-  }
-  return test_status;
-}
-
-/*
- * Launching kernel1 and kernel2 and then we try to
- * get the event elapsed time of each kernel using the start and
- * end events.The event elapsed time should return us the kernel
- * execution time for that particular kernel
- */
-bool kernel_time_execution(void (*kernel)(int, uint64_t), int clock_rate, uint64_t expected_time1,
-                           uint64_t expected_time2) {
-  hipStream_t stream;
-  hipEvent_t start_event1, end_event1, start_event2, end_event2;
-  float time1 = 0, time2 = 0;
-  HIP_CHECK(hipEventCreate(&start_event1));
-  HIP_CHECK(hipEventCreate(&end_event1));
-  HIP_CHECK(hipEventCreate(&start_event2));
-  HIP_CHECK(hipEventCreate(&end_event2));
-  HIP_CHECK(hipStreamCreate(&stream));
-  hipExtLaunchKernelGGL(kernel, dim3(1), dim3(1), 0, stream, start_event1, end_event1, 0,
-                        clock_rate, expected_time1);
-  hipExtLaunchKernelGGL(kernel, dim3(1), dim3(1), 0, stream, start_event2, end_event2, 0,
-                        clock_rate, expected_time2);
-  HIP_CHECK(hipStreamSynchronize(stream));
-  HIP_CHECK(hipEventElapsedTime(&time1, start_event1, end_event1));
-  HIP_CHECK(hipEventElapsedTime(&time2, start_event2, end_event2));
-
-  HIP_CHECK(hipStreamDestroy(stream));
-  HIP_CHECK(hipEventDestroy(start_event1));
-  HIP_CHECK(hipEventDestroy(end_event1));
-  HIP_CHECK(hipEventDestroy(start_event2));
-  HIP_CHECK(hipEventDestroy(end_event2));
-
-  float ratio = kernel == kernel_wc64 ? 0.01 : 0.5;
-
-  return verify_time_execution(ratio, time1, time2, expected_time1, expected_time2);
-}
-
-template <class T> void loadSym(T& symbol, const char* symbolName, void* handle) {
-  using namespace std::string_literals;
-  void* fnsym = dlsym(handle, symbolName);
-
-  if (!fnsym)
-    throw std::runtime_error("Failure while trying to dynamically load symbol: "s + symbolName);
-
-  symbol = reinterpret_cast<T>(fnsym);
-}
-
-void getCurrentDeviceUUID(hipUUID& uuid) {
-  hipDeviceProp_t props;
-  int deviceId;
-
-  HIP_CHECK(hipGetDevice(&deviceId));
-  HIP_CHECK(hipGetDeviceProperties(&props, deviceId));
-  std::memcpy(uuid.bytes, props.uuid.bytes, sizeof(hipUUID::bytes));
-}
-
-#ifndef _WIN32
-// Gets the maximum engine frequency of the GPU by dynamically loading amdsmi
-// @uuid the id of the GPU to query the frequency for
-// @return the maximum engine frequency of the GPU (MHz) or -1 if error
-int getEngineFreq(const hipUUID& uuid) {
-  static constexpr unsigned int AMDSMI_MAX_STRING_LENGTH = 256;
-  typedef void* amdsmi_processor_handle;
-  typedef void* amdsmi_socket_handle;
-  typedef enum {
-    AMDSMI_STATUS_SUCCESS = 0,  //!< Call succeeded
-  } amdsmi_status_t;
-
-  typedef struct {
-    uint32_t clk;            //!< In MHz
-    uint32_t min_clk;        //!< In MHz
-    uint32_t max_clk;        //!< In MHz
-    uint8_t clk_locked;      //!< True/False
-    uint8_t clk_deep_sleep;  //!< True/False
-    uint32_t reserved[4];
-  } amdsmi_clk_info_t;
-
-  typedef struct {
-    uint32_t drm_render;  //!< the render node under /sys/class/drm/renderD*
-    uint32_t drm_card;    //!< the graphic card device under /sys/class/drm/card*
-    uint32_t hsa_id;      //!< the HSA enumeration ID
-    uint32_t hip_id;      //!< the HIP enumeration ID
-    char hip_uuid[AMDSMI_MAX_STRING_LENGTH];  //!< the HIP unique identifer
-  } amdsmi_enumeration_info_t;
-
-  typedef enum {
-    AMDSMI_CLK_TYPE_GFX = 0x0
-  } amdsmi_clk_type_t;
-
-  amdsmi_clk_info_t clk_info;
-  uint32_t gpu_count = 0;
-  uint32_t num_processor = 0;
-  amdsmi_status_t (*fninit)(uint64_t);
-  amdsmi_status_t (*fnget_socket_handles)(uint32_t*, amdsmi_socket_handle*);
-  amdsmi_status_t (*fnget_processor_handles)(amdsmi_socket_handle, uint32_t*,
-                                             amdsmi_processor_handle*);
-  amdsmi_status_t (*fnget_gpu_enumeration_info)(amdsmi_processor_handle,
-                                                amdsmi_enumeration_info_t*);
-  amdsmi_status_t (*fnget_clock_info)(amdsmi_processor_handle, amdsmi_clk_type_t,
-                                      amdsmi_clk_info_t*);
-  amdsmi_status_t (*fnshut_down)();
-  int result = -1;
-  bool smi_initialized = false;
-  auto cleanUp = [&smi_initialized, &fnshut_down](void* handle) {
-    if (smi_initialized)
-      fnshut_down();
-
-    if (handle)
-      dlclose(handle);
-  };
-  std::unique_ptr<void, decltype(cleanUp)> lib_hdl(nullptr, cleanUp);
-
-  lib_hdl.reset(dlopen("libamd_smi.so", RTLD_LAZY));
-
-  if (!lib_hdl) {
-    return -1;
+__global__ void reduce_c64(long long* start, long long* end, float* in /* 32 sized input */,
+                           float* out /* single sized output*/) {
+  if (threadIdx.x == 0) {
+    *start = clock64();
  }

-  try {
-    loadSym(fninit, "amdsmi_init", lib_hdl.get());
-    loadSym(fnget_socket_handles, "amdsmi_get_socket_handles", lib_hdl.get());
-    loadSym(fnget_processor_handles, "amdsmi_get_processor_handles", lib_hdl.get());
-    loadSym(fnget_gpu_enumeration_info, "amdsmi_get_gpu_enumeration_info", lib_hdl.get());
-    loadSym(fnget_clock_info, "amdsmi_get_clock_info", lib_hdl.get());
-    loadSym(fnshut_down, "amdsmi_shut_down", lib_hdl.get());
-  } catch (std::runtime_error&) {
-    return -1;
+  // do not reorder
+  __threadfence();
+  auto val = reduce_32_elements(in);
+  __threadfence();
+
+  if (threadIdx.x == 0) {
+    *out = val;
+    *end = clock64();
  }
-
-  if (fninit(1ul << 1)) {
-    return -1;
-  } else
-    smi_initialized = true;
-
-  uint32_t socket_count = 0;
-  uint32_t num_socket = 0;
-
-  // get the socket count available in the system
-  if (fnget_socket_handles(&socket_count, nullptr)) {
-    return -1;
-  }
-
-  std::vector<amdsmi_socket_handle> sockets(socket_count);
-  if (fnget_socket_handles(&socket_count, &sockets[0])) {
-    return -1;
-  }
-
-  while (num_socket < socket_count && result == -1) {
-    // just get number of processors first
-    if (fnget_processor_handles(sockets[num_socket], &gpu_count, nullptr)) {
-      return -1;
-    }
-
-    std::vector<amdsmi_processor_handle> processors(gpu_count);
-    if (fnget_processor_handles(sockets[num_socket], &gpu_count, &processors[0])) {
-      return -1;
-    }
-
-    while (num_processor < gpu_count && result == -1) {
-      amdsmi_enumeration_info_t info;
-      int offset = 0;
-      const char* prefix = "GPU-";
-
-      if (fnget_gpu_enumeration_info(processors[num_processor], &info)) {
-        return -1;
-      }
-
-      if (!std::strncmp(info.hip_uuid, "GPU-", std::strlen(prefix))) {
-        // amd-smi adds "GPU-" in front of the hip_uuid; whereas HIP doesn't
-        offset = strlen(prefix);
-      }
-
-      if (!std::memcmp(uuid.bytes, info.hip_uuid + offset, sizeof(hipUUID::bytes) - offset)) {
-        if (fnget_clock_info(processors[num_processor], AMDSMI_CLK_TYPE_GFX, &clk_info)) {
-          return -1;
-        }
-
-        result = clk_info.max_clk;
-      }
-
-      num_processor++;
-    }
-
-    num_socket++;
-    num_processor = 0;
-  }
-
-  return result;
-}
-#endif
-
-// @max_clock_rate will be set to the maximum clock rate as reported by hipDeviceGetAttribute()
-// @return         maximum engine clock rate obtained via amdsmi or -1 if querying via amdsmi fails
-int getClockRate(int& max_clock_rate) {
-  max_clock_rate = 0;  // in kHz
-  HIP_CHECK(hipDeviceGetAttribute(&max_clock_rate, hipDeviceAttributeClockRate, 0));
-
-#ifdef _WIN32
-  return -1;
-#else
-  hipUUID uuid;
-  int smi_clock_rate = 0;  // in kHz
-
-  getCurrentDeviceUUID(uuid);
-  smi_clock_rate = getEngineFreq(uuid);
-  return smi_clock_rate;
-#endif
 }

-/**
- * Test Description
- * ------------------------
- *  - Launches two kernels that run for a specified amount of time passed as a kernel argument by
- * using device function clock64. Kernel execution time is calculated through elapsed time between
- * the start and end event, and calculated time is compared with passed time values.
- * Test source
- * ------------------------
- *  - catch/unit/clock/hipClockCheck.cc
- * Test requirements
- * ------------------------
- *  - HIP_VERSION >= 5.2
- */
+__global__ void reduce_c(long long* start, long long* end, float* in /* 32 sized input */,
+                         float* out /* single sized output*/) {
+  if (threadIdx.x == 0) {
+    *start = clock();
+  }
+
+  // do not reorder
+  __threadfence();
+  auto val = reduce_32_elements(in);
+  __threadfence();
+
+  if (threadIdx.x == 0) {
+    *out = val;
+    *end = clock();
+  }
+}
+
+__global__ void reduce_wc64(long long* start, long long* end, float* in /* 32 sized input */,
+                            float* out /* single sized output*/) {
+  if (threadIdx.x == 0) {
+    *start = wall_clock64();
+  }
+
+  // do not reorder
+  __threadfence();
+  auto val = reduce_32_elements(in);
+  __threadfence();
+
+  if (threadIdx.x == 0) {
+    *out = val;
+    *end = wall_clock64();
+  }
+}
+
+void execute_clock_kernels(void (*kernel)(long long*, long long*, float*, float*)) {
+  constexpr size_t size = 32; /* Do not change this, the math in kernel is done for 32 elements */
+  float *d_in{}, *d_out{}, out{};
+  long long *d_clock_start{}, *d_clock_end{}, clock_start{}, clock_end{};
+  std::vector<float> in(size, 0.0f);
+
+  for (size_t i = 0; i < size; i++) {
+    in[i] = i + 1;
+  }
+  auto cpu_result = std::accumulate(in.begin(), in.end(), 0.0f);
+
+  HIP_CHECK(hipMalloc(&d_in, sizeof(float) * size));
+  HIP_CHECK(hipMalloc(&d_out, sizeof(float)));
+  HIP_CHECK(hipMalloc(&d_clock_start, sizeof(long long)));
+  HIP_CHECK(hipMalloc(&d_clock_end, sizeof(long long)));
+
+  HIP_CHECK(hipMemcpy(d_in, in.data(), sizeof(float) * in.size(), hipMemcpyHostToDevice));
+  HIP_CHECK(hipMemset(d_out, 0, sizeof(float)));
+  HIP_CHECK(hipMemset(d_clock_start, 0, sizeof(long long)));
+  HIP_CHECK(hipMemset(d_clock_end, 0, sizeof(long long)));
+
+  hipLaunchKernelGGL(kernel, 1, size, 0, nullptr, d_clock_start, d_clock_end, d_in, d_out);
+  HIP_CHECK(hipDeviceSynchronize());
+
+  HIP_CHECK(hipMemcpy(&clock_start, d_clock_start, sizeof(long long), hipMemcpyDeviceToHost));
+  HIP_CHECK(hipMemcpy(&clock_end, d_clock_end, sizeof(long long), hipMemcpyDeviceToHost));
+  HIP_CHECK(hipMemcpy(&out, d_out, sizeof(float), hipMemcpyDeviceToHost));
+
+  HIP_CHECK(hipFree(d_in));
+  HIP_CHECK(hipFree(d_out));
+  HIP_CHECK(hipFree(d_clock_start));
+  HIP_CHECK(hipFree(d_clock_end));
+
+  // Make sure the math happenned correctly
+  INFO("sum(1.0f, 2.0f, ..., 32.0f) gpu result: " << out << " cpu: " << cpu_result);
+  REQUIRE(out == cpu_result);
+
+  // Measure the clock progress
+  // There can be two scenarios:
+  // 1) clock_start < clock_end : which we expect
+  // 2) clock_start > clock_end : which means clock warped around, but chances of that happening is
+  // really low
+  INFO("Clock start: " << clock_start << " end: " << clock_end);
+  REQUIRE(clock_start < clock_end);
+}
+
 TEST_CASE("Unit_hipClock64_Positive_Basic") {
-  HIP_CHECK(hipSetDevice(0));
-
-  int max_clock_rate;
-  int clock_rate = getClockRate(max_clock_rate);
-
-  if (max_clock_rate == 0) {
-    HipTest::HIP_SKIP_TEST("hipDeviceAttributeClockRate returns 0");
-    return;
-  }
  if (IsGfx11()) {
    HipTest::HIP_SKIP_TEST("Issue with clock64() function on gfx11 devices!");
    return;
  }

-  if (clock_rate == -1) {
-    // libamd_smi.so might not be present depending on some systems, so we load it dynamically
-    // and use it if it is, otherwise we use the attribute
-    UNSCOPED_INFO(
-        "Failed to get clock rate via amdsmi (is libamd_smi.so in the library search path?)");
-    clock_rate = max_clock_rate;
-  } else {
-    clock_rate *= 1000;
-
-    if (clock_rate != max_clock_rate) {
-      UNSCOPED_INFO("clock rate: " << clock_rate << "kHz is not set to maximum: " << max_clock_rate
-                                   << "kHz");
-    } else {
-      UNSCOPED_INFO("clock rate: " << clock_rate << "kHz");
-    }
-  }
-
-  const auto expected_time1 = GENERATE(1000, 1500, 2000);
-  const auto expected_time2 = expected_time1 / 2;
-
-  REQUIRE(kernel_time_execution(kernel_c64, clock_rate, expected_time1, expected_time2));
+  execute_clock_kernels(reduce_c64);
 }

 /**
@@ -344,71 +167,17 @@ TEST_CASE("Unit_hipClock64_Positive_Basic") {
 *  - HIP_VERSION >= 5.2
 */
 TEST_CASE("Unit_hipClock_Positive_Basic") {
-  HIP_CHECK(hipSetDevice(0));
-
-  int max_clock_rate;
-  int clock_rate = getClockRate(max_clock_rate);
-
-  if (max_clock_rate == 0) {
-    HipTest::HIP_SKIP_TEST("hipDeviceAttributeClockRate returns 0");
-    return;
-  }
  if (IsGfx11()) {
-    HipTest::HIP_SKIP_TEST("Issue with clock64() function on gfx11 devices!");
+    HipTest::HIP_SKIP_TEST("Issue with clock() function on gfx11 devices!");
    return;
  }

-  if (clock_rate == -1) {
-    // libamd_smi.so might not be present depending on some systems, so we load it dynamically
-    // and use it if it is, otherwise we use the attribute
-    UNSCOPED_INFO(
-        "Failed to get clock rate via amdsmi (is libamd_smi.so in the library search path?)");
-    clock_rate = max_clock_rate;
-  } else {
-    clock_rate *= 1000;
-
-    if (clock_rate != max_clock_rate) {
-      UNSCOPED_INFO("clock rate: " << clock_rate << "kHz is not set to maximum: " << max_clock_rate
-                                   << "kHz");
-    }
-  }
-
-  const auto expected_time1 = GENERATE(1000, 1500, 2000);
-  const auto expected_time2 = expected_time1 / 2;
-
-  REQUIRE(kernel_time_execution(kernel_c, clock_rate, expected_time1, expected_time2));
+  execute_clock_kernels(reduce_c);
 }

-/**
- * Test Description
- * ------------------------
- *  - Launches two kernels that run for a specified amount of time passed as a kernel argument by
- * using device function wall_clock64. Kernel execution time is calculated through elapsed time
- * between the start and end event, and calculated time is compared with passed time values.
- * Test source
- * ------------------------
- *  - catch/unit/clock/hipClockCheck.cc
- * Test requirements
- * ------------------------
- *  - HIP_VERSION >= 5.2
- */
-TEST_CASE("Unit_hipWallClock64_Positive_Basic") {
-  HIP_CHECK(hipSetDevice(0));
-  int clock_rate = 0;  // in kHz
-  HIP_CHECK(hipDeviceGetAttribute(&clock_rate, hipDeviceAttributeWallClockRate, 0));
-
-  if (!clock_rate) {
-    HipTest::HIP_SKIP_TEST("hipDeviceAttributeWallClockRate returns 0");
-    return;
-  }
-
-  const auto expected_time1 = GENERATE(1000, 1500, 2000);
-  const auto expected_time2 = expected_time1 / 2;
-
-  REQUIRE(kernel_time_execution(kernel_wc64, clock_rate, expected_time1, expected_time2));
-}
+TEST_CASE("Unit_hipWallClock64_Positive_Basic") { execute_clock_kernels(reduce_wc64); }

 /**
 * End doxygen group DeviceLanguageTest.
 * @}
- */
+ */