Rework clock based unit tests (#2646)
This commit is contained in:
@@ -17,13 +17,13 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <cstring>
|
||||
#include <numeric>
|
||||
#include <vector>
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip_test_checkers.hh>
|
||||
#include <hip/hip_ext.h>
|
||||
#include <cstring>
|
||||
#ifndef _WIN32
|
||||
#include <dlfcn.h>
|
||||
#endif
|
||||
|
||||
/**
|
||||
* @addtogroup clock clock
|
||||
@@ -32,302 +32,125 @@ THE SOFTWARE.
|
||||
* Contains unit tests for clock, clock64 and wall_clock64 APIs
|
||||
*/
|
||||
|
||||
__global__ void kernel_c64(int clock_rate, uint64_t wait_t) {
|
||||
uint64_t start = clock64() / clock_rate, cur = 0; // in ms
|
||||
do {
|
||||
cur = clock64() / clock_rate - start;
|
||||
} while (cur < wait_t);
|
||||
// Any sort of wait based on clock cycles will be inaccurate give how modern GPUs clock themselves.
|
||||
// What clock functions should exhibit is forward progress of the clock ticks.
|
||||
// What we measure here is the start tick should be smaller than the end tick.
|
||||
// We do some primitive math in the middle.
|
||||
|
||||
__device__ float reduce_32_elements(float* in) {
|
||||
auto val = in[threadIdx.x];
|
||||
val += __shfl_down(val, 16);
|
||||
val += __shfl_down(val, 8);
|
||||
val += __shfl_down(val, 4);
|
||||
val += __shfl_down(val, 2);
|
||||
val += __shfl_down(val, 1);
|
||||
return val;
|
||||
}
|
||||
|
||||
__global__ void kernel_c(int clock_rate, uint64_t wait_t) {
|
||||
uint64_t start = clock() / clock_rate, cur = 0; // in ms
|
||||
do {
|
||||
cur = clock() / clock_rate - start;
|
||||
} while (cur < wait_t);
|
||||
}
|
||||
|
||||
__global__ void kernel_wc64(int clock_rate, uint64_t wait_t) {
|
||||
uint64_t start = wall_clock64() / clock_rate, cur = 0; // in ms
|
||||
do {
|
||||
cur = wall_clock64() / clock_rate - start;
|
||||
} while (cur < wait_t);
|
||||
}
|
||||
|
||||
bool verify_time_execution(float ratio, float time1, float time2, float expected_time1,
|
||||
float expected_time2) {
|
||||
bool test_status = false;
|
||||
|
||||
if (fabs(time1 - expected_time1) < (ratio * expected_time1) &&
|
||||
fabs(time2 - expected_time2) < (ratio * expected_time2)) {
|
||||
INFO("Succeeded: Expected Vs Actual: Kernel1 - " << expected_time1 << " Vs " << time1
|
||||
<< ", Kernel2 - " << expected_time2 << " Vs "
|
||||
<< time2);
|
||||
test_status = true;
|
||||
} else {
|
||||
INFO("Failed: Expected Vs Actual: Kernel1 -" << expected_time1 << " Vs " << time1
|
||||
<< ", Kernel2 - " << expected_time2 << " Vs "
|
||||
<< time2);
|
||||
test_status = false;
|
||||
}
|
||||
return test_status;
|
||||
}
|
||||
|
||||
/*
|
||||
* Launching kernel1 and kernel2 and then we try to
|
||||
* get the event elapsed time of each kernel using the start and
|
||||
* end events.The event elapsed time should return us the kernel
|
||||
* execution time for that particular kernel
|
||||
*/
|
||||
bool kernel_time_execution(void (*kernel)(int, uint64_t), int clock_rate, uint64_t expected_time1,
|
||||
uint64_t expected_time2) {
|
||||
hipStream_t stream;
|
||||
hipEvent_t start_event1, end_event1, start_event2, end_event2;
|
||||
float time1 = 0, time2 = 0;
|
||||
HIP_CHECK(hipEventCreate(&start_event1));
|
||||
HIP_CHECK(hipEventCreate(&end_event1));
|
||||
HIP_CHECK(hipEventCreate(&start_event2));
|
||||
HIP_CHECK(hipEventCreate(&end_event2));
|
||||
HIP_CHECK(hipStreamCreate(&stream));
|
||||
hipExtLaunchKernelGGL(kernel, dim3(1), dim3(1), 0, stream, start_event1, end_event1, 0,
|
||||
clock_rate, expected_time1);
|
||||
hipExtLaunchKernelGGL(kernel, dim3(1), dim3(1), 0, stream, start_event2, end_event2, 0,
|
||||
clock_rate, expected_time2);
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
HIP_CHECK(hipEventElapsedTime(&time1, start_event1, end_event1));
|
||||
HIP_CHECK(hipEventElapsedTime(&time2, start_event2, end_event2));
|
||||
|
||||
HIP_CHECK(hipStreamDestroy(stream));
|
||||
HIP_CHECK(hipEventDestroy(start_event1));
|
||||
HIP_CHECK(hipEventDestroy(end_event1));
|
||||
HIP_CHECK(hipEventDestroy(start_event2));
|
||||
HIP_CHECK(hipEventDestroy(end_event2));
|
||||
|
||||
float ratio = kernel == kernel_wc64 ? 0.01 : 0.5;
|
||||
|
||||
return verify_time_execution(ratio, time1, time2, expected_time1, expected_time2);
|
||||
}
|
||||
|
||||
template <class T> void loadSym(T& symbol, const char* symbolName, void* handle) {
|
||||
using namespace std::string_literals;
|
||||
void* fnsym = dlsym(handle, symbolName);
|
||||
|
||||
if (!fnsym)
|
||||
throw std::runtime_error("Failure while trying to dynamically load symbol: "s + symbolName);
|
||||
|
||||
symbol = reinterpret_cast<T>(fnsym);
|
||||
}
|
||||
|
||||
void getCurrentDeviceUUID(hipUUID& uuid) {
|
||||
hipDeviceProp_t props;
|
||||
int deviceId;
|
||||
|
||||
HIP_CHECK(hipGetDevice(&deviceId));
|
||||
HIP_CHECK(hipGetDeviceProperties(&props, deviceId));
|
||||
std::memcpy(uuid.bytes, props.uuid.bytes, sizeof(hipUUID::bytes));
|
||||
}
|
||||
|
||||
#ifndef _WIN32
|
||||
// Gets the maximum engine frequency of the GPU by dynamically loading amdsmi
|
||||
// @uuid the id of the GPU to query the frequency for
|
||||
// @return the maximum engine frequency of the GPU (MHz) or -1 if error
|
||||
int getEngineFreq(const hipUUID& uuid) {
|
||||
static constexpr unsigned int AMDSMI_MAX_STRING_LENGTH = 256;
|
||||
typedef void* amdsmi_processor_handle;
|
||||
typedef void* amdsmi_socket_handle;
|
||||
typedef enum {
|
||||
AMDSMI_STATUS_SUCCESS = 0, //!< Call succeeded
|
||||
} amdsmi_status_t;
|
||||
|
||||
typedef struct {
|
||||
uint32_t clk; //!< In MHz
|
||||
uint32_t min_clk; //!< In MHz
|
||||
uint32_t max_clk; //!< In MHz
|
||||
uint8_t clk_locked; //!< True/False
|
||||
uint8_t clk_deep_sleep; //!< True/False
|
||||
uint32_t reserved[4];
|
||||
} amdsmi_clk_info_t;
|
||||
|
||||
typedef struct {
|
||||
uint32_t drm_render; //!< the render node under /sys/class/drm/renderD*
|
||||
uint32_t drm_card; //!< the graphic card device under /sys/class/drm/card*
|
||||
uint32_t hsa_id; //!< the HSA enumeration ID
|
||||
uint32_t hip_id; //!< the HIP enumeration ID
|
||||
char hip_uuid[AMDSMI_MAX_STRING_LENGTH]; //!< the HIP unique identifer
|
||||
} amdsmi_enumeration_info_t;
|
||||
|
||||
typedef enum {
|
||||
AMDSMI_CLK_TYPE_GFX = 0x0
|
||||
} amdsmi_clk_type_t;
|
||||
|
||||
amdsmi_clk_info_t clk_info;
|
||||
uint32_t gpu_count = 0;
|
||||
uint32_t num_processor = 0;
|
||||
amdsmi_status_t (*fninit)(uint64_t);
|
||||
amdsmi_status_t (*fnget_socket_handles)(uint32_t*, amdsmi_socket_handle*);
|
||||
amdsmi_status_t (*fnget_processor_handles)(amdsmi_socket_handle, uint32_t*,
|
||||
amdsmi_processor_handle*);
|
||||
amdsmi_status_t (*fnget_gpu_enumeration_info)(amdsmi_processor_handle,
|
||||
amdsmi_enumeration_info_t*);
|
||||
amdsmi_status_t (*fnget_clock_info)(amdsmi_processor_handle, amdsmi_clk_type_t,
|
||||
amdsmi_clk_info_t*);
|
||||
amdsmi_status_t (*fnshut_down)();
|
||||
int result = -1;
|
||||
bool smi_initialized = false;
|
||||
auto cleanUp = [&smi_initialized, &fnshut_down](void* handle) {
|
||||
if (smi_initialized)
|
||||
fnshut_down();
|
||||
|
||||
if (handle)
|
||||
dlclose(handle);
|
||||
};
|
||||
std::unique_ptr<void, decltype(cleanUp)> lib_hdl(nullptr, cleanUp);
|
||||
|
||||
lib_hdl.reset(dlopen("libamd_smi.so", RTLD_LAZY));
|
||||
|
||||
if (!lib_hdl) {
|
||||
return -1;
|
||||
__global__ void reduce_c64(long long* start, long long* end, float* in /* 32 sized input */,
|
||||
float* out /* single sized output*/) {
|
||||
if (threadIdx.x == 0) {
|
||||
*start = clock64();
|
||||
}
|
||||
|
||||
try {
|
||||
loadSym(fninit, "amdsmi_init", lib_hdl.get());
|
||||
loadSym(fnget_socket_handles, "amdsmi_get_socket_handles", lib_hdl.get());
|
||||
loadSym(fnget_processor_handles, "amdsmi_get_processor_handles", lib_hdl.get());
|
||||
loadSym(fnget_gpu_enumeration_info, "amdsmi_get_gpu_enumeration_info", lib_hdl.get());
|
||||
loadSym(fnget_clock_info, "amdsmi_get_clock_info", lib_hdl.get());
|
||||
loadSym(fnshut_down, "amdsmi_shut_down", lib_hdl.get());
|
||||
} catch (std::runtime_error&) {
|
||||
return -1;
|
||||
// do not reorder
|
||||
__threadfence();
|
||||
auto val = reduce_32_elements(in);
|
||||
__threadfence();
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
*out = val;
|
||||
*end = clock64();
|
||||
}
|
||||
|
||||
if (fninit(1ul << 1)) {
|
||||
return -1;
|
||||
} else
|
||||
smi_initialized = true;
|
||||
|
||||
uint32_t socket_count = 0;
|
||||
uint32_t num_socket = 0;
|
||||
|
||||
// get the socket count available in the system
|
||||
if (fnget_socket_handles(&socket_count, nullptr)) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
std::vector<amdsmi_socket_handle> sockets(socket_count);
|
||||
if (fnget_socket_handles(&socket_count, &sockets[0])) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
while (num_socket < socket_count && result == -1) {
|
||||
// just get number of processors first
|
||||
if (fnget_processor_handles(sockets[num_socket], &gpu_count, nullptr)) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
std::vector<amdsmi_processor_handle> processors(gpu_count);
|
||||
if (fnget_processor_handles(sockets[num_socket], &gpu_count, &processors[0])) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
while (num_processor < gpu_count && result == -1) {
|
||||
amdsmi_enumeration_info_t info;
|
||||
int offset = 0;
|
||||
const char* prefix = "GPU-";
|
||||
|
||||
if (fnget_gpu_enumeration_info(processors[num_processor], &info)) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (!std::strncmp(info.hip_uuid, "GPU-", std::strlen(prefix))) {
|
||||
// amd-smi adds "GPU-" in front of the hip_uuid; whereas HIP doesn't
|
||||
offset = strlen(prefix);
|
||||
}
|
||||
|
||||
if (!std::memcmp(uuid.bytes, info.hip_uuid + offset, sizeof(hipUUID::bytes) - offset)) {
|
||||
if (fnget_clock_info(processors[num_processor], AMDSMI_CLK_TYPE_GFX, &clk_info)) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
result = clk_info.max_clk;
|
||||
}
|
||||
|
||||
num_processor++;
|
||||
}
|
||||
|
||||
num_socket++;
|
||||
num_processor = 0;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
#endif
|
||||
|
||||
// @max_clock_rate will be set to the maximum clock rate as reported by hipDeviceGetAttribute()
|
||||
// @return maximum engine clock rate obtained via amdsmi or -1 if querying via amdsmi fails
|
||||
int getClockRate(int& max_clock_rate) {
|
||||
max_clock_rate = 0; // in kHz
|
||||
HIP_CHECK(hipDeviceGetAttribute(&max_clock_rate, hipDeviceAttributeClockRate, 0));
|
||||
|
||||
#ifdef _WIN32
|
||||
return -1;
|
||||
#else
|
||||
hipUUID uuid;
|
||||
int smi_clock_rate = 0; // in kHz
|
||||
|
||||
getCurrentDeviceUUID(uuid);
|
||||
smi_clock_rate = getEngineFreq(uuid);
|
||||
return smi_clock_rate;
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Launches two kernels that run for a specified amount of time passed as a kernel argument by
|
||||
* using device function clock64. Kernel execution time is calculated through elapsed time between
|
||||
* the start and end event, and calculated time is compared with passed time values.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - catch/unit/clock/hipClockCheck.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
__global__ void reduce_c(long long* start, long long* end, float* in /* 32 sized input */,
|
||||
float* out /* single sized output*/) {
|
||||
if (threadIdx.x == 0) {
|
||||
*start = clock();
|
||||
}
|
||||
|
||||
// do not reorder
|
||||
__threadfence();
|
||||
auto val = reduce_32_elements(in);
|
||||
__threadfence();
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
*out = val;
|
||||
*end = clock();
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void reduce_wc64(long long* start, long long* end, float* in /* 32 sized input */,
|
||||
float* out /* single sized output*/) {
|
||||
if (threadIdx.x == 0) {
|
||||
*start = wall_clock64();
|
||||
}
|
||||
|
||||
// do not reorder
|
||||
__threadfence();
|
||||
auto val = reduce_32_elements(in);
|
||||
__threadfence();
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
*out = val;
|
||||
*end = wall_clock64();
|
||||
}
|
||||
}
|
||||
|
||||
void execute_clock_kernels(void (*kernel)(long long*, long long*, float*, float*)) {
|
||||
constexpr size_t size = 32; /* Do not change this, the math in kernel is done for 32 elements */
|
||||
float *d_in{}, *d_out{}, out{};
|
||||
long long *d_clock_start{}, *d_clock_end{}, clock_start{}, clock_end{};
|
||||
std::vector<float> in(size, 0.0f);
|
||||
|
||||
for (size_t i = 0; i < size; i++) {
|
||||
in[i] = i + 1;
|
||||
}
|
||||
auto cpu_result = std::accumulate(in.begin(), in.end(), 0.0f);
|
||||
|
||||
HIP_CHECK(hipMalloc(&d_in, sizeof(float) * size));
|
||||
HIP_CHECK(hipMalloc(&d_out, sizeof(float)));
|
||||
HIP_CHECK(hipMalloc(&d_clock_start, sizeof(long long)));
|
||||
HIP_CHECK(hipMalloc(&d_clock_end, sizeof(long long)));
|
||||
|
||||
HIP_CHECK(hipMemcpy(d_in, in.data(), sizeof(float) * in.size(), hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipMemset(d_out, 0, sizeof(float)));
|
||||
HIP_CHECK(hipMemset(d_clock_start, 0, sizeof(long long)));
|
||||
HIP_CHECK(hipMemset(d_clock_end, 0, sizeof(long long)));
|
||||
|
||||
hipLaunchKernelGGL(kernel, 1, size, 0, nullptr, d_clock_start, d_clock_end, d_in, d_out);
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
|
||||
HIP_CHECK(hipMemcpy(&clock_start, d_clock_start, sizeof(long long), hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipMemcpy(&clock_end, d_clock_end, sizeof(long long), hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipMemcpy(&out, d_out, sizeof(float), hipMemcpyDeviceToHost));
|
||||
|
||||
HIP_CHECK(hipFree(d_in));
|
||||
HIP_CHECK(hipFree(d_out));
|
||||
HIP_CHECK(hipFree(d_clock_start));
|
||||
HIP_CHECK(hipFree(d_clock_end));
|
||||
|
||||
// Make sure the math happenned correctly
|
||||
INFO("sum(1.0f, 2.0f, ..., 32.0f) gpu result: " << out << " cpu: " << cpu_result);
|
||||
REQUIRE(out == cpu_result);
|
||||
|
||||
// Measure the clock progress
|
||||
// There can be two scenarios:
|
||||
// 1) clock_start < clock_end : which we expect
|
||||
// 2) clock_start > clock_end : which means clock warped around, but chances of that happening is
|
||||
// really low
|
||||
INFO("Clock start: " << clock_start << " end: " << clock_end);
|
||||
REQUIRE(clock_start < clock_end);
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipClock64_Positive_Basic") {
|
||||
HIP_CHECK(hipSetDevice(0));
|
||||
|
||||
int max_clock_rate;
|
||||
int clock_rate = getClockRate(max_clock_rate);
|
||||
|
||||
if (max_clock_rate == 0) {
|
||||
HipTest::HIP_SKIP_TEST("hipDeviceAttributeClockRate returns 0");
|
||||
return;
|
||||
}
|
||||
if (IsGfx11()) {
|
||||
HipTest::HIP_SKIP_TEST("Issue with clock64() function on gfx11 devices!");
|
||||
return;
|
||||
}
|
||||
|
||||
if (clock_rate == -1) {
|
||||
// libamd_smi.so might not be present depending on some systems, so we load it dynamically
|
||||
// and use it if it is, otherwise we use the attribute
|
||||
UNSCOPED_INFO(
|
||||
"Failed to get clock rate via amdsmi (is libamd_smi.so in the library search path?)");
|
||||
clock_rate = max_clock_rate;
|
||||
} else {
|
||||
clock_rate *= 1000;
|
||||
|
||||
if (clock_rate != max_clock_rate) {
|
||||
UNSCOPED_INFO("clock rate: " << clock_rate << "kHz is not set to maximum: " << max_clock_rate
|
||||
<< "kHz");
|
||||
} else {
|
||||
UNSCOPED_INFO("clock rate: " << clock_rate << "kHz");
|
||||
}
|
||||
}
|
||||
|
||||
const auto expected_time1 = GENERATE(1000, 1500, 2000);
|
||||
const auto expected_time2 = expected_time1 / 2;
|
||||
|
||||
REQUIRE(kernel_time_execution(kernel_c64, clock_rate, expected_time1, expected_time2));
|
||||
execute_clock_kernels(reduce_c64);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -344,71 +167,17 @@ TEST_CASE("Unit_hipClock64_Positive_Basic") {
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Unit_hipClock_Positive_Basic") {
|
||||
HIP_CHECK(hipSetDevice(0));
|
||||
|
||||
int max_clock_rate;
|
||||
int clock_rate = getClockRate(max_clock_rate);
|
||||
|
||||
if (max_clock_rate == 0) {
|
||||
HipTest::HIP_SKIP_TEST("hipDeviceAttributeClockRate returns 0");
|
||||
return;
|
||||
}
|
||||
if (IsGfx11()) {
|
||||
HipTest::HIP_SKIP_TEST("Issue with clock64() function on gfx11 devices!");
|
||||
HipTest::HIP_SKIP_TEST("Issue with clock() function on gfx11 devices!");
|
||||
return;
|
||||
}
|
||||
|
||||
if (clock_rate == -1) {
|
||||
// libamd_smi.so might not be present depending on some systems, so we load it dynamically
|
||||
// and use it if it is, otherwise we use the attribute
|
||||
UNSCOPED_INFO(
|
||||
"Failed to get clock rate via amdsmi (is libamd_smi.so in the library search path?)");
|
||||
clock_rate = max_clock_rate;
|
||||
} else {
|
||||
clock_rate *= 1000;
|
||||
|
||||
if (clock_rate != max_clock_rate) {
|
||||
UNSCOPED_INFO("clock rate: " << clock_rate << "kHz is not set to maximum: " << max_clock_rate
|
||||
<< "kHz");
|
||||
}
|
||||
}
|
||||
|
||||
const auto expected_time1 = GENERATE(1000, 1500, 2000);
|
||||
const auto expected_time2 = expected_time1 / 2;
|
||||
|
||||
REQUIRE(kernel_time_execution(kernel_c, clock_rate, expected_time1, expected_time2));
|
||||
execute_clock_kernels(reduce_c);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Launches two kernels that run for a specified amount of time passed as a kernel argument by
|
||||
* using device function wall_clock64. Kernel execution time is calculated through elapsed time
|
||||
* between the start and end event, and calculated time is compared with passed time values.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - catch/unit/clock/hipClockCheck.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Unit_hipWallClock64_Positive_Basic") {
|
||||
HIP_CHECK(hipSetDevice(0));
|
||||
int clock_rate = 0; // in kHz
|
||||
HIP_CHECK(hipDeviceGetAttribute(&clock_rate, hipDeviceAttributeWallClockRate, 0));
|
||||
|
||||
if (!clock_rate) {
|
||||
HipTest::HIP_SKIP_TEST("hipDeviceAttributeWallClockRate returns 0");
|
||||
return;
|
||||
}
|
||||
|
||||
const auto expected_time1 = GENERATE(1000, 1500, 2000);
|
||||
const auto expected_time2 = expected_time1 / 2;
|
||||
|
||||
REQUIRE(kernel_time_execution(kernel_wc64, clock_rate, expected_time1, expected_time2));
|
||||
}
|
||||
TEST_CASE("Unit_hipWallClock64_Positive_Basic") { execute_clock_kernels(reduce_wc64); }
|
||||
|
||||
/**
|
||||
* End doxygen group DeviceLanguageTest.
|
||||
* @}
|
||||
*/
|
||||
*/
|
||||
Reference in New Issue
Block a user