diff --git a/test/MatrixTranspose_test/MatrixTranspose.cpp b/test/MatrixTranspose_test/MatrixTranspose.cpp index 34007cf20f..76ce261fda 100644 --- a/test/MatrixTranspose_test/MatrixTranspose.cpp +++ b/test/MatrixTranspose_test/MatrixTranspose.cpp @@ -20,34 +20,39 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include +#include -// roctracer extension API -#include - -// hip header file -#include +#ifdef __cplusplus +#include +using namespace std; +#else +#include +#endif // roctx header file #include +// roctracer extension API +#include -// kfd header file -#ifdef KFD_WRAPPER -#include +#if HIP_TEST +// hip header file +#include +// Macro to call HIP API +#define HIP_CALL(call) do { call; } while(0) +#else +#define HIP_CALL(call) do {} while(0) #endif #ifndef ITERATIONS # define ITERATIONS 101 #endif #define WIDTH 1024 - - #define NUM (WIDTH * WIDTH) - #define THREADS_PER_BLOCK_X 4 #define THREADS_PER_BLOCK_Y 4 #define THREADS_PER_BLOCK_Z 1 +#if HIP_TEST // Device (Kernel) function, it must be void __global__ void matrixTranspose(float* out, float* in, const int width) { int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; @@ -55,6 +60,7 @@ __global__ void matrixTranspose(float* out, float* in, const int width) { out[y * width + x] = in[x * width + y]; } +#endif // CPU implementation of matrix transpose void matrixTransposeCPUReference(float* output, float* input, const unsigned int width) { @@ -78,19 +84,33 @@ int main() { float* gpuMatrix; float* gpuTransposeMatrix; - hipDeviceProp_t devProp; - hipGetDeviceProperties(&devProp, 0); - - std::cout << "Device name " << devProp.name << std::endl; - int i; int errors; init_tracing(); +#if HIP_TEST + int gpuCount = 1; +#if MGPU_TEST + hipGetDeviceCount(&gpuCount); + printf("Number of GPUs: %d\n", gpuCount); +#endif + iterations *= gpuCount; +#endif + while (iterations-- > 0) { start_tracing(); +#if HIP_TEST + // set GPU + const int devIndex = iterations % gpuCount; + hipSetDevice(devIndex); + + hipDeviceProp_t devProp; + HIP_CALL(hipGetDeviceProperties(&devProp, 0)); + printf("Device %d name: %s\n", devIndex, devProp.name); +#endif + Matrix = (float*)malloc(NUM * sizeof(float)); TransposeMatrix = (float*)malloc(NUM * sizeof(float)); cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float)); @@ -101,8 +121,8 @@ int main() { } // allocate the memory on the device side - hipMalloc((void**)&gpuMatrix, NUM * sizeof(float)); - hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float)); + HIP_CALL(hipMalloc((void**)&gpuMatrix, NUM * sizeof(float))); + HIP_CALL(hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float))); // correlation reagion32 roctracer_activity_push_external_correlation_id(31); @@ -110,7 +130,7 @@ int main() { roctracer_activity_push_external_correlation_id(32); // Memory transfer from host to device - hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice); + HIP_CALL(hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice)); // correlation reagion33 roctracer_activity_push_external_correlation_id(33); @@ -119,9 +139,9 @@ int main() { roctxRangePush("hipLaunchKernel"); // Lauching kernel from host - hipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y), - dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix, - gpuMatrix, WIDTH); + HIP_CALL(hipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y), + dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix, + gpuMatrix, WIDTH)); roctxMark("after hipLaunchKernel"); @@ -131,39 +151,40 @@ int main() { // Memory transfer from device to host roctxRangePush("hipMemcpy"); - hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM * sizeof(float), hipMemcpyDeviceToHost); + HIP_CALL(hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM * sizeof(float), hipMemcpyDeviceToHost)); roctxRangePop(); // for "hipMemcpy" roctxRangePop(); // for "hipLaunchKernel" // correlation reagion end - roctracer_activity_pop_external_correlation_id(); + roctracer_activity_pop_external_correlation_id(NULL); // CPU MatrixTranspose computation - matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH); + HIP_CALL(matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH)); // verify the results errors = 0; double eps = 1.0E-6; for (i = 0; i < NUM; i++) { - if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) { + if (abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) { errors++; } } - if (errors != 0) { + if ((HIP_TEST != 0) && (errors != 0)) { printf("FAILED: %d errors\n", errors); } else { + errors = 0; printf("PASSED!\n"); } // free the resources on device side - hipFree(gpuMatrix); - hipFree(gpuTransposeMatrix); + HIP_CALL(hipFree(gpuMatrix)); + HIP_CALL(hipFree(gpuTransposeMatrix)); // correlation reagion end - roctracer_activity_pop_external_correlation_id(); + roctracer_activity_pop_external_correlation_id(NULL); // correlation reagion end - roctracer_activity_pop_external_correlation_id(); + roctracer_activity_pop_external_correlation_id(NULL); // free the resources on host side free(Matrix); @@ -182,18 +203,27 @@ int main() { #if 1 #include #include +#include +#include #include +#include +#include /* For SYS_xxx definitions */ + // Macro to check ROC-tracer calls status #define ROCTRACER_CALL(call) \ do { \ int err = call; \ if (err != 0) { \ - std::cerr << roctracer_error_string() << std::endl << std::flush; \ + fprintf(stderr, "%s\n", roctracer_error_string()); \ abort(); \ } \ } while (0) +static inline uint32_t GetTid() { return syscall(__NR_gettid); } +static inline uint32_t GetPid() { return syscall(__NR_getpid); } + + // Runtime API callback function void api_callback( uint32_t domain, @@ -204,47 +234,60 @@ void api_callback( (void)arg; if (domain == ACTIVITY_DOMAIN_ROCTX) { - const roctx_api_data_t* data = reinterpret_cast(callback_data); - fprintf(stdout, "ROCTX: \"%s\"\n", data->args.message); + const roctx_api_data_t* data = (const roctx_api_data_t*)(callback_data); + fprintf(stdout, "\n", data->args.message); return; } -#ifdef KFD_WRAPPER + if (domain == ACTIVITY_DOMAIN_KFD_API) { - const kfd_api_data_t* data = reinterpret_cast(callback_data); - fprintf(stdout, "KFD: <%s id(%u)\tcorrelation_id(%lu) %s> \n", + const kfd_api_data_t* data = (const kfd_api_data_t*)(callback_data); + fprintf(stdout, "<%s id(%u)\tcorrelation_id(%lu) %s pid(%u) tid(%u)> \n", roctracer_op_string(ACTIVITY_DOMAIN_KFD_API, cid, 0), cid, data->correlation_id, - (data->phase == ACTIVITY_API_PHASE_ENTER) ? "on-enter" : "on-exit"); + (data->phase == ACTIVITY_API_PHASE_ENTER) ? "on-enter" : "on-exit", + GetPid(), + GetTid() + ); return; } -#endif - const hip_api_data_t* data = reinterpret_cast(callback_data); - fprintf(stdout, "<%s id(%u)\tcorrelation_id(%lu) %s> ", - roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, cid, 0), - cid, - data->correlation_id, - (data->phase == ACTIVITY_API_PHASE_ENTER) ? "on-enter" : "on-exit"); + const hip_api_data_t* data = (const hip_api_data_t*)(callback_data); if (data->phase == ACTIVITY_API_PHASE_ENTER) { switch (cid) { case HIP_API_ID_hipMemcpy: - fprintf(stdout, "dst(%p) src(%p) size(0x%x) kind(%u)", + fprintf(stdout, "<%s id(%u)\tcorrelation_id(%lu) %s>\n dst(%p) src(%p) size(0x%x) kind(%u)\n", + roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, cid, 0), + cid, + data->correlation_id, + "on-enter", data->args.hipMemcpy.dst, data->args.hipMemcpy.src, (uint32_t)(data->args.hipMemcpy.sizeBytes), (uint32_t)(data->args.hipMemcpy.kind)); break; case HIP_API_ID_hipMalloc: - fprintf(stdout, "ptr(%p) size(0x%x)", + fprintf(stdout, "<%s id(%u)\tcorrelation_id(%lu) %s>\n ptr(%p) size(0x%x)\n", + roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, cid, 0), + cid, + data->correlation_id, + "on-enter", data->args.hipMalloc.ptr, (uint32_t)(data->args.hipMalloc.size)); break; case HIP_API_ID_hipFree: - fprintf(stdout, "ptr(%p)", + fprintf(stdout, "<%s id(%u)\tcorrelation_id(%lu) %s>\n ptr(%p)\n", + roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, cid, 0), + cid, + data->correlation_id, + "on-enter", data->args.hipFree.ptr); break; case HIP_API_ID_hipModuleLaunchKernel: - fprintf(stdout, "kernel(\"%s\") stream(%p)", + fprintf(stdout, "<%s id(%u)\tcorrelation_id(%lu) %s>\n kernel(\"%s\") stream(%p)\n", + roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, cid, 0), + cid, + data->correlation_id, + "on-enter", hipKernelNameRef(data->args.hipModuleLaunchKernel.f), data->args.hipModuleLaunchKernel.stream); break; @@ -254,21 +297,25 @@ void api_callback( } else { switch (cid) { case HIP_API_ID_hipMalloc: - fprintf(stdout, "*ptr(0x%p)", + fprintf(stdout, "<%s id(%u)\tcorrelation_id(%lu) %s>\n *ptr(0x%p)\n", + roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, cid, 0), + cid, + data->correlation_id, + "on-exit", *(data->args.hipMalloc.ptr)); break; default: break; } } - fprintf(stdout, "\n"); fflush(stdout); + //fprintf(stdout, "\n"); + fflush(stdout); } - // Activity tracing callback // hipMalloc id(3) correlation_id(1): begin_ns(1525888652762640464) end_ns(1525888652762877067) void activity_callback(const char* begin, const char* end, void* arg) { - const roctracer_record_t* record = reinterpret_cast(begin); - const roctracer_record_t* end_record = reinterpret_cast(end); + const roctracer_record_t* record = (const roctracer_record_t*)(begin); + const roctracer_record_t* end_record = (const roctracer_record_t*)(end); fprintf(stdout, "\tActivity records:\n"); fflush(stdout); while (record < end_record) { const char * name = roctracer_op_string(record->domain, record->op, record->kind); @@ -278,26 +325,25 @@ void activity_callback(const char* begin, const char* end, void* arg) { record->begin_ns, record->end_ns ); - if (record->domain == ACTIVITY_DOMAIN_HIP_API or record->domain == ACTIVITY_DOMAIN_KFD_API) { - fprintf(stdout, " process_id(%u) thread_id(%u)", + if ((record->domain == ACTIVITY_DOMAIN_HIP_API) || (record->domain == ACTIVITY_DOMAIN_KFD_API)) { + fprintf(stdout, " process_id(%u) thread_id(%u)\n", record->process_id, record->thread_id ); } else if (record->domain == ACTIVITY_DOMAIN_HCC_OPS) { - fprintf(stdout, " device_id(%d) queue_id(%lu)", + fprintf(stdout, " device_id(%d) queue_id(%lu)\n", record->device_id, record->queue_id ); - if (record->op == HIP_OP_ID_COPY) fprintf(stdout, " bytes(0x%zx)", record->bytes); + if (record->op == HIP_OP_ID_COPY) fprintf(stdout, " bytes(0x%zx)\n", record->bytes); } else if (record->domain == ACTIVITY_DOMAIN_EXT_API) { - fprintf(stdout, " external_id(%lu)", + fprintf(stdout, " external_id(%lu)\n", record->external_id ); } else { - fprintf(stderr, "Bad domain %d\n", record->domain); + fprintf(stderr, "Bad domain %d\n\n", record->domain); abort(); } - fprintf(stdout, "\n"); fflush(stdout); ROCTRACER_CALL(roctracer_next_record(record, &record)); } @@ -305,11 +351,12 @@ void activity_callback(const char* begin, const char* end, void* arg) { // Init tracing routine void init_tracing() { - std::cout << "# INIT #############################" << std::endl << std::flush; + printf("# INIT #############################\n"); // roctracer properties roctracer_set_properties(ACTIVITY_DOMAIN_HIP_API, NULL); // Allocating tracing pool - roctracer_properties_t properties{}; + roctracer_properties_t properties; + memset(&properties, 0, sizeof(roctracer_properties_t)); properties.buffer_size = 0x1000; properties.buffer_callback_fun = activity_callback; ROCTRACER_CALL(roctracer_open_pool(&properties)); @@ -318,11 +365,16 @@ void init_tracing() { // Enable HIP activity tracing ROCTRACER_CALL(roctracer_enable_domain_activity(ACTIVITY_DOMAIN_HIP_API)); ROCTRACER_CALL(roctracer_enable_domain_activity(ACTIVITY_DOMAIN_HCC_OPS)); + // Enable KFD API tracing + ROCTRACER_CALL(roctracer_enable_domain_callback(ACTIVITY_DOMAIN_KFD_API, api_callback, NULL)); + ROCTRACER_CALL(roctracer_enable_domain_activity(ACTIVITY_DOMAIN_KFD_API)); + // Enable rocTX + ROCTRACER_CALL(roctracer_enable_domain_callback(ACTIVITY_DOMAIN_ROCTX, api_callback, NULL)); } // Start tracing routine void start_tracing() { - std::cout << "# START (" << iterations << ") #############################" << std::endl << std::flush; + printf("# START (%d) #############################\n", iterations); // Start if ((iterations & 1) == 1) roctracer_start(); else roctracer_stop(); @@ -333,8 +385,9 @@ void stop_tracing() { ROCTRACER_CALL(roctracer_disable_domain_callback(ACTIVITY_DOMAIN_HIP_API)); ROCTRACER_CALL(roctracer_disable_domain_activity(ACTIVITY_DOMAIN_HIP_API)); ROCTRACER_CALL(roctracer_disable_domain_activity(ACTIVITY_DOMAIN_HCC_OPS)); + ROCTRACER_CALL(roctracer_disable_domain_activity(ACTIVITY_DOMAIN_KFD_API)); ROCTRACER_CALL(roctracer_flush_activity()); - std::cout << "# STOP #############################" << std::endl << std::flush; + printf("# STOP #############################\n"); } #else void init_tracing() {}