415 Zeilen
14 KiB
C++
415 Zeilen
14 KiB
C++
/*
|
|
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
*/
|
|
|
|
#include <hip_test_common.hh>
|
|
#include <hip_test_checkers.hh>
|
|
#include <hip/hip_ext.h>
|
|
#include <cstring>
|
|
#ifndef _WIN32
|
|
#include <dlfcn.h>
|
|
#endif
|
|
|
|
/**
|
|
* @addtogroup clock clock
|
|
* @{
|
|
* @ingroup DeviceLanguageTest
|
|
* Contains unit tests for clock, clock64 and wall_clock64 APIs
|
|
*/
|
|
|
|
__global__ void kernel_c64(int clock_rate, uint64_t wait_t) {
|
|
uint64_t start = clock64() / clock_rate, cur = 0; // in ms
|
|
do {
|
|
cur = clock64() / clock_rate - start;
|
|
} while (cur < wait_t);
|
|
}
|
|
|
|
__global__ void kernel_c(int clock_rate, uint64_t wait_t) {
|
|
uint64_t start = clock() / clock_rate, cur = 0; // in ms
|
|
do {
|
|
cur = clock() / clock_rate - start;
|
|
} while (cur < wait_t);
|
|
}
|
|
|
|
__global__ void kernel_wc64(int clock_rate, uint64_t wait_t) {
|
|
uint64_t start = wall_clock64() / clock_rate, cur = 0; // in ms
|
|
do {
|
|
cur = wall_clock64() / clock_rate - start;
|
|
} while (cur < wait_t);
|
|
}
|
|
|
|
bool verify_time_execution(float ratio, float time1, float time2, float expected_time1,
|
|
float expected_time2) {
|
|
bool test_status = false;
|
|
|
|
if (fabs(time1 - expected_time1) < (ratio * expected_time1) &&
|
|
fabs(time2 - expected_time2) < (ratio * expected_time2)) {
|
|
INFO("Succeeded: Expected Vs Actual: Kernel1 - " << expected_time1 << " Vs " << time1
|
|
<< ", Kernel2 - " << expected_time2 << " Vs "
|
|
<< time2);
|
|
test_status = true;
|
|
} else {
|
|
INFO("Failed: Expected Vs Actual: Kernel1 -" << expected_time1 << " Vs " << time1
|
|
<< ", Kernel2 - " << expected_time2 << " Vs "
|
|
<< time2);
|
|
test_status = false;
|
|
}
|
|
return test_status;
|
|
}
|
|
|
|
/*
|
|
* Launching kernel1 and kernel2 and then we try to
|
|
* get the event elapsed time of each kernel using the start and
|
|
* end events.The event elapsed time should return us the kernel
|
|
* execution time for that particular kernel
|
|
*/
|
|
bool kernel_time_execution(void (*kernel)(int, uint64_t), int clock_rate, uint64_t expected_time1,
|
|
uint64_t expected_time2) {
|
|
hipStream_t stream;
|
|
hipEvent_t start_event1, end_event1, start_event2, end_event2;
|
|
float time1 = 0, time2 = 0;
|
|
HIP_CHECK(hipEventCreate(&start_event1));
|
|
HIP_CHECK(hipEventCreate(&end_event1));
|
|
HIP_CHECK(hipEventCreate(&start_event2));
|
|
HIP_CHECK(hipEventCreate(&end_event2));
|
|
HIP_CHECK(hipStreamCreate(&stream));
|
|
hipExtLaunchKernelGGL(kernel, dim3(1), dim3(1), 0, stream, start_event1, end_event1, 0,
|
|
clock_rate, expected_time1);
|
|
hipExtLaunchKernelGGL(kernel, dim3(1), dim3(1), 0, stream, start_event2, end_event2, 0,
|
|
clock_rate, expected_time2);
|
|
HIP_CHECK(hipStreamSynchronize(stream));
|
|
HIP_CHECK(hipEventElapsedTime(&time1, start_event1, end_event1));
|
|
HIP_CHECK(hipEventElapsedTime(&time2, start_event2, end_event2));
|
|
|
|
HIP_CHECK(hipStreamDestroy(stream));
|
|
HIP_CHECK(hipEventDestroy(start_event1));
|
|
HIP_CHECK(hipEventDestroy(end_event1));
|
|
HIP_CHECK(hipEventDestroy(start_event2));
|
|
HIP_CHECK(hipEventDestroy(end_event2));
|
|
|
|
float ratio = kernel == kernel_wc64 ? 0.01 : 0.5;
|
|
|
|
return verify_time_execution(ratio, time1, time2, expected_time1, expected_time2);
|
|
}
|
|
|
|
template <class T> void loadSym(T& symbol, const char* symbolName, void* handle) {
|
|
using namespace std::string_literals;
|
|
void* fnsym = dlsym(handle, symbolName);
|
|
|
|
if (!fnsym)
|
|
throw std::runtime_error("Failure while trying to dynamically load symbol: "s + symbolName);
|
|
|
|
symbol = reinterpret_cast<T>(fnsym);
|
|
}
|
|
|
|
void getCurrentDeviceUUID(hipUUID& uuid) {
|
|
hipDeviceProp_t props;
|
|
int deviceId;
|
|
|
|
HIP_CHECK(hipGetDevice(&deviceId));
|
|
HIP_CHECK(hipGetDeviceProperties(&props, deviceId));
|
|
std::memcpy(uuid.bytes, props.uuid.bytes, sizeof(hipUUID::bytes));
|
|
}
|
|
|
|
#ifndef _WIN32
|
|
// Gets the maximum engine frequency of the GPU by dynamically loading amdsmi
|
|
// @uuid the id of the GPU to query the frequency for
|
|
// @return the maximum engine frequency of the GPU (MHz) or -1 if error
|
|
int getEngineFreq(const hipUUID& uuid) {
|
|
static constexpr unsigned int AMDSMI_MAX_STRING_LENGTH = 256;
|
|
typedef void* amdsmi_processor_handle;
|
|
typedef void* amdsmi_socket_handle;
|
|
typedef enum {
|
|
AMDSMI_STATUS_SUCCESS = 0, //!< Call succeeded
|
|
} amdsmi_status_t;
|
|
|
|
typedef struct {
|
|
uint32_t clk; //!< In MHz
|
|
uint32_t min_clk; //!< In MHz
|
|
uint32_t max_clk; //!< In MHz
|
|
uint8_t clk_locked; //!< True/False
|
|
uint8_t clk_deep_sleep; //!< True/False
|
|
uint32_t reserved[4];
|
|
} amdsmi_clk_info_t;
|
|
|
|
typedef struct {
|
|
uint32_t drm_render; //!< the render node under /sys/class/drm/renderD*
|
|
uint32_t drm_card; //!< the graphic card device under /sys/class/drm/card*
|
|
uint32_t hsa_id; //!< the HSA enumeration ID
|
|
uint32_t hip_id; //!< the HIP enumeration ID
|
|
char hip_uuid[AMDSMI_MAX_STRING_LENGTH]; //!< the HIP unique identifer
|
|
} amdsmi_enumeration_info_t;
|
|
|
|
typedef enum {
|
|
AMDSMI_CLK_TYPE_GFX = 0x0
|
|
} amdsmi_clk_type_t;
|
|
|
|
amdsmi_clk_info_t clk_info;
|
|
uint32_t gpu_count = 0;
|
|
uint32_t num_processor = 0;
|
|
amdsmi_status_t (*fninit)(uint64_t);
|
|
amdsmi_status_t (*fnget_socket_handles)(uint32_t*, amdsmi_socket_handle*);
|
|
amdsmi_status_t (*fnget_processor_handles)(amdsmi_socket_handle, uint32_t*,
|
|
amdsmi_processor_handle*);
|
|
amdsmi_status_t (*fnget_gpu_enumeration_info)(amdsmi_processor_handle,
|
|
amdsmi_enumeration_info_t*);
|
|
amdsmi_status_t (*fnget_clock_info)(amdsmi_processor_handle, amdsmi_clk_type_t,
|
|
amdsmi_clk_info_t*);
|
|
amdsmi_status_t (*fnshut_down)();
|
|
int result = -1;
|
|
bool smi_initialized = false;
|
|
auto cleanUp = [&smi_initialized, &fnshut_down](void* handle) {
|
|
if (smi_initialized)
|
|
fnshut_down();
|
|
|
|
if (handle)
|
|
dlclose(handle);
|
|
};
|
|
std::unique_ptr<void, decltype(cleanUp)> lib_hdl(nullptr, cleanUp);
|
|
|
|
lib_hdl.reset(dlopen("libamd_smi.so", RTLD_LAZY));
|
|
|
|
if (!lib_hdl) {
|
|
return -1;
|
|
}
|
|
|
|
try {
|
|
loadSym(fninit, "amdsmi_init", lib_hdl.get());
|
|
loadSym(fnget_socket_handles, "amdsmi_get_socket_handles", lib_hdl.get());
|
|
loadSym(fnget_processor_handles, "amdsmi_get_processor_handles", lib_hdl.get());
|
|
loadSym(fnget_gpu_enumeration_info, "amdsmi_get_gpu_enumeration_info", lib_hdl.get());
|
|
loadSym(fnget_clock_info, "amdsmi_get_clock_info", lib_hdl.get());
|
|
loadSym(fnshut_down, "amdsmi_shut_down", lib_hdl.get());
|
|
} catch (std::runtime_error&) {
|
|
return -1;
|
|
}
|
|
|
|
if (fninit(1ul << 1)) {
|
|
return -1;
|
|
} else
|
|
smi_initialized = true;
|
|
|
|
uint32_t socket_count = 0;
|
|
uint32_t num_socket = 0;
|
|
|
|
// get the socket count available in the system
|
|
if (fnget_socket_handles(&socket_count, nullptr)) {
|
|
return -1;
|
|
}
|
|
|
|
std::vector<amdsmi_socket_handle> sockets(socket_count);
|
|
if (fnget_socket_handles(&socket_count, &sockets[0])) {
|
|
return -1;
|
|
}
|
|
|
|
while (num_socket < socket_count && result == -1) {
|
|
// just get number of processors first
|
|
if (fnget_processor_handles(sockets[num_socket], &gpu_count, nullptr)) {
|
|
return -1;
|
|
}
|
|
|
|
std::vector<amdsmi_processor_handle> processors(gpu_count);
|
|
if (fnget_processor_handles(sockets[num_socket], &gpu_count, &processors[0])) {
|
|
return -1;
|
|
}
|
|
|
|
while (num_processor < gpu_count && result == -1) {
|
|
amdsmi_enumeration_info_t info;
|
|
int offset = 0;
|
|
const char* prefix = "GPU-";
|
|
|
|
if (fnget_gpu_enumeration_info(processors[num_processor], &info)) {
|
|
return -1;
|
|
}
|
|
|
|
if (!std::strncmp(info.hip_uuid, "GPU-", std::strlen(prefix))) {
|
|
// amd-smi adds "GPU-" in front of the hip_uuid; whereas HIP doesn't
|
|
offset = strlen(prefix);
|
|
}
|
|
|
|
if (!std::memcmp(uuid.bytes, info.hip_uuid + offset, sizeof(hipUUID::bytes) - offset)) {
|
|
if (fnget_clock_info(processors[num_processor], AMDSMI_CLK_TYPE_GFX, &clk_info)) {
|
|
return -1;
|
|
}
|
|
|
|
result = clk_info.max_clk;
|
|
}
|
|
|
|
num_processor++;
|
|
}
|
|
|
|
num_socket++;
|
|
num_processor = 0;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
#endif
|
|
|
|
// @max_clock_rate will be set to the maximum clock rate as reported by hipDeviceGetAttribute()
|
|
// @return maximum engine clock rate obtained via amdsmi or -1 if querying via amdsmi fails
|
|
int getClockRate(int& max_clock_rate) {
|
|
max_clock_rate = 0; // in kHz
|
|
HIP_CHECK(hipDeviceGetAttribute(&max_clock_rate, hipDeviceAttributeClockRate, 0));
|
|
|
|
#ifdef _WIN32
|
|
return -1;
|
|
#else
|
|
hipUUID uuid;
|
|
int smi_clock_rate = 0; // in kHz
|
|
|
|
getCurrentDeviceUUID(uuid);
|
|
smi_clock_rate = getEngineFreq(uuid);
|
|
return smi_clock_rate;
|
|
#endif
|
|
}
|
|
|
|
/**
|
|
* Test Description
|
|
* ------------------------
|
|
* - Launches two kernels that run for a specified amount of time passed as a kernel argument by
|
|
* using device function clock64. Kernel execution time is calculated through elapsed time between
|
|
* the start and end event, and calculated time is compared with passed time values.
|
|
* Test source
|
|
* ------------------------
|
|
* - catch/unit/clock/hipClockCheck.cc
|
|
* Test requirements
|
|
* ------------------------
|
|
* - HIP_VERSION >= 5.2
|
|
*/
|
|
TEST_CASE("Unit_hipClock64_Positive_Basic") {
|
|
HIP_CHECK(hipSetDevice(0));
|
|
|
|
int max_clock_rate;
|
|
int clock_rate = getClockRate(max_clock_rate);
|
|
|
|
if (max_clock_rate == 0) {
|
|
HipTest::HIP_SKIP_TEST("hipDeviceAttributeClockRate returns 0");
|
|
return;
|
|
}
|
|
if (IsGfx11()) {
|
|
HipTest::HIP_SKIP_TEST("Issue with clock64() function on gfx11 devices!");
|
|
return;
|
|
}
|
|
|
|
if (clock_rate == -1) {
|
|
// libamd_smi.so might not be present depending on some systems, so we load it dynamically
|
|
// and use it if it is, otherwise we use the attribute
|
|
UNSCOPED_INFO(
|
|
"Failed to get clock rate via amdsmi (is libamd_smi.so in the library search path?)");
|
|
clock_rate = max_clock_rate;
|
|
} else {
|
|
clock_rate *= 1000;
|
|
|
|
if (clock_rate != max_clock_rate) {
|
|
UNSCOPED_INFO("clock rate: " << clock_rate << "kHz is not set to maximum: " << max_clock_rate
|
|
<< "kHz");
|
|
} else {
|
|
UNSCOPED_INFO("clock rate: " << clock_rate << "kHz");
|
|
}
|
|
}
|
|
|
|
const auto expected_time1 = GENERATE(1000, 1500, 2000);
|
|
const auto expected_time2 = expected_time1 / 2;
|
|
|
|
REQUIRE(kernel_time_execution(kernel_c64, clock_rate, expected_time1, expected_time2));
|
|
}
|
|
|
|
/**
|
|
* Test Description
|
|
* ------------------------
|
|
* - Launches two kernels that run for a specified amount of time passed as a kernel argument by
|
|
* using device function clock. Kernel execution time is calculated through elapsed time between
|
|
* the start and end event, and calculated time is compared with passed time values.
|
|
* Test source
|
|
* ------------------------
|
|
* - catch/unit/clock/hipClockCheck.cc
|
|
* Test requirements
|
|
* ------------------------
|
|
* - HIP_VERSION >= 5.2
|
|
*/
|
|
TEST_CASE("Unit_hipClock_Positive_Basic") {
|
|
HIP_CHECK(hipSetDevice(0));
|
|
|
|
int max_clock_rate;
|
|
int clock_rate = getClockRate(max_clock_rate);
|
|
|
|
if (max_clock_rate == 0) {
|
|
HipTest::HIP_SKIP_TEST("hipDeviceAttributeClockRate returns 0");
|
|
return;
|
|
}
|
|
if (IsGfx11()) {
|
|
HipTest::HIP_SKIP_TEST("Issue with clock64() function on gfx11 devices!");
|
|
return;
|
|
}
|
|
|
|
if (clock_rate == -1) {
|
|
// libamd_smi.so might not be present depending on some systems, so we load it dynamically
|
|
// and use it if it is, otherwise we use the attribute
|
|
UNSCOPED_INFO(
|
|
"Failed to get clock rate via amdsmi (is libamd_smi.so in the library search path?)");
|
|
clock_rate = max_clock_rate;
|
|
} else {
|
|
clock_rate *= 1000;
|
|
|
|
if (clock_rate != max_clock_rate) {
|
|
UNSCOPED_INFO("clock rate: " << clock_rate << "kHz is not set to maximum: " << max_clock_rate
|
|
<< "kHz");
|
|
}
|
|
}
|
|
|
|
const auto expected_time1 = GENERATE(1000, 1500, 2000);
|
|
const auto expected_time2 = expected_time1 / 2;
|
|
|
|
REQUIRE(kernel_time_execution(kernel_c, clock_rate, expected_time1, expected_time2));
|
|
}
|
|
|
|
/**
|
|
* Test Description
|
|
* ------------------------
|
|
* - Launches two kernels that run for a specified amount of time passed as a kernel argument by
|
|
* using device function wall_clock64. Kernel execution time is calculated through elapsed time
|
|
* between the start and end event, and calculated time is compared with passed time values.
|
|
* Test source
|
|
* ------------------------
|
|
* - catch/unit/clock/hipClockCheck.cc
|
|
* Test requirements
|
|
* ------------------------
|
|
* - HIP_VERSION >= 5.2
|
|
*/
|
|
TEST_CASE("Unit_hipWallClock64_Positive_Basic") {
|
|
HIP_CHECK(hipSetDevice(0));
|
|
int clock_rate = 0; // in kHz
|
|
HIP_CHECK(hipDeviceGetAttribute(&clock_rate, hipDeviceAttributeWallClockRate, 0));
|
|
|
|
if (!clock_rate) {
|
|
HipTest::HIP_SKIP_TEST("hipDeviceAttributeWallClockRate returns 0");
|
|
return;
|
|
}
|
|
|
|
const auto expected_time1 = GENERATE(1000, 1500, 2000);
|
|
const auto expected_time2 = expected_time1 / 2;
|
|
|
|
REQUIRE(kernel_time_execution(kernel_wc64, clock_rate, expected_time1, expected_time2));
|
|
}
|
|
|
|
/**
|
|
* End doxygen group DeviceLanguageTest.
|
|
* @}
|
|
*/
|