diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index ef0a8ea633..05593efedd 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -35,8 +35,10 @@ add_custom_target( mytest COMMAND sh -xc "cp ${TEST_DIR}/MatrixTranspose/MatrixTranspose ${PROJECT_BINARY_DIR}/test" COMMAND HIP_VDI=${HIP_VDI} make -C "${TEST_DIR}/MatrixTranspose_test" COMMAND sh -xc "cp ${TEST_DIR}/MatrixTranspose_test/MatrixTranspose ${PROJECT_BINARY_DIR}/test/MatrixTranspose_test" + COMMAND MGPU_TEST=1 HIP_VDI=${HIP_VDI} make -C "${TEST_DIR}/MatrixTranspose_test" + COMMAND sh -xc "cp ${TEST_DIR}/MatrixTranspose_test/MatrixTranspose ${PROJECT_BINARY_DIR}/test/MatrixTranspose_mgpu" COMMAND C_TEST=1 HIP_VDI=${HIP_VDI} make -C "${TEST_DIR}/MatrixTranspose_test" - COMMAND sh -xc "cp ${TEST_DIR}/MatrixTranspose_test/MatrixTranspose_ctest ${PROJECT_BINARY_DIR}/test/MatrixTranspose_ctest" + COMMAND sh -xc "cp ${TEST_DIR}/MatrixTranspose_test/MatrixTranspose ${PROJECT_BINARY_DIR}/test/MatrixTranspose_ctest" ) ## Util sources diff --git a/test/MatrixTranspose_test/Makefile b/test/MatrixTranspose_test/Makefile index c59c497af1..dc5c42aa65 100644 --- a/test/MatrixTranspose_test/Makefile +++ b/test/MatrixTranspose_test/Makefile @@ -11,24 +11,24 @@ ifeq (,$(HIP_PATH)) endif HIPCC=$(HIP_PATH)/bin/hipcc - TARGET=hcc - - +EXECUTABLE=./MatrixTranspose +OBJECTS = MatrixTranspose.o FLAGS =-g -I$(ROOT_PATH) -I$(ROOT_PATH)/inc -I${HSA_KMT_INC_PATH} -DLOCAL_BUILD=1 -DHIP_VDI=${HIP_VDI} -DITERATIONS=$(ITERATIONS) + ifeq ($(C_TEST), 1) COMP=gcc SOURCES = MatrixTranspose.c FLAGS += -DHIP_TEST=0 -D__HIP_PLATFORM_HCC__=1 -I/opt/rocm/hcc/include - EXECUTABLE=./MatrixTranspose_ctest else COMP=$(HIPCC) - FLAGS += -DHIP_TEST=1 SOURCES = MatrixTranspose.cpp - EXECUTABLE=./MatrixTranspose + FLAGS += -DHIP_TEST=1 +endif +ifeq ($(MGPU_TEST), 1) + FLAGS += -DMGPU_TEST=1 endif -OBJECTS = MatrixTranspose.o .PHONY: test diff --git a/test/MatrixTranspose_test/MatrixTranspose.cpp b/test/MatrixTranspose_test/MatrixTranspose.cpp index 11ad71709a..c1189e9387 100644 --- a/test/MatrixTranspose_test/MatrixTranspose.cpp +++ b/test/MatrixTranspose_test/MatrixTranspose.cpp @@ -84,21 +84,33 @@ int main() { float* gpuMatrix; float* gpuTransposeMatrix; -#if HIP_TEST - hipDeviceProp_t devProp; - HIP_CALL(hipGetDeviceProperties(&devProp, 0)); - - printf("Device name %s\n", devProp.name); -#endif - int i; int errors; init_tracing(); +#if HIP_TEST + int gpuCount = 1; +#if MGPU_TEST + hipGetDeviceCount(&gpuCount); + printf("Number of GPUs: %d\n", gpuCount); +#endif + iterations *= gpuCount; +#endif + while (iterations-- > 0) { start_tracing(); +#if HIP_TEST + // set GPU + const int devIndex = iterations % gpuCount; + hipSetDevice(devIndex); + + hipDeviceProp_t devProp; + HIP_CALL(hipGetDeviceProperties(&devProp, 0)); + printf("Device %d name: %s\n", devIndex, devProp.name); +#endif + Matrix = (float*)malloc(NUM * sizeof(float)); TransposeMatrix = (float*)malloc(NUM * sizeof(float)); cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float)); diff --git a/test/MatrixTranspose_test_mgpu/Makefile b/test/MatrixTranspose_test_mgpu/Makefile deleted file mode 100644 index da9971b371..0000000000 --- a/test/MatrixTranspose_test_mgpu/Makefile +++ /dev/null @@ -1,40 +0,0 @@ -ROOT_PATH = ../.. -LIB_PATH = $(ROOT_PATH)/build -ROC_LIBS = -L$(LIB_PATH) -lroctracer64 -export LD_LIBRARY_PATH=$(LIB_PATH) -HIP_VDI ?= 0 -ITERATIONS ?= 1 - -HIP_PATH?= $(wildcard /opt/rocm/hip) -ifeq (,$(HIP_PATH)) - HIP_PATH=../../.. -endif - -HIPCC=$(HIP_PATH)/bin/hipcc - -TARGET=hcc - -SOURCES = MatrixTranspose.cpp -OBJECTS = $(SOURCES:.cpp=.o) - -EXECUTABLE=./MatrixTranspose - -.PHONY: test - - -all: clean $(EXECUTABLE) - -CXXFLAGS =-g -I$(ROOT_PATH) -I$(ROOT_PATH)/inc -DLOCAL_BUILD=1 -DHIP_VDI=${HIP_VDI} -DITERATIONS=$(ITERATIONS) -CXX=$(HIPCC) - -$(EXECUTABLE): $(OBJECTS) - $(HIPCC) $(OBJECTS) -o $@ $(ROC_LIBS) - -test: $(EXECUTABLE) - $(EXECUTABLE) - -clean: - rm -f $(EXECUTABLE) - rm -f $(OBJECTS) - rm -f $(HIP_PATH)/src/*.o - diff --git a/test/MatrixTranspose_test_mgpu/MatrixTranspose.cpp b/test/MatrixTranspose_test_mgpu/MatrixTranspose.cpp deleted file mode 100644 index ffd4c88109..0000000000 --- a/test/MatrixTranspose_test_mgpu/MatrixTranspose.cpp +++ /dev/null @@ -1,313 +0,0 @@ -/* -Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#include - -// roctracer extension API -#include - -// hip header file -#include - -#ifndef ITERATIONS -# define ITERATIONS 1 -#endif -#define WIDTH 1024 - - -#define NUM (WIDTH * WIDTH) - -#define THREADS_PER_BLOCK_X 4 -#define THREADS_PER_BLOCK_Y 4 -#define THREADS_PER_BLOCK_Z 1 - -// Device (Kernel) function, it must be void -__global__ void matrixTranspose(float* out, float* in, const int width) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; - - out[y * width + x] = in[x * width + y]; -} - -// CPU implementation of matrix transpose -void matrixTransposeCPUReference(float* output, float* input, const unsigned int width) { - for (unsigned int j = 0; j < width; j++) { - for (unsigned int i = 0; i < width; i++) { - output[i * width + j] = input[j * width + i]; - } - } -} - -int iterations = ITERATIONS; -void init_tracing(); -void start_tracing(); -void stop_tracing(); - -int main() { - float* Matrix; - float* TransposeMatrix; - float* cpuTransposeMatrix; - - float* gpuMatrix; - float* gpuTransposeMatrix; - - int i; - int errors; - - int gpuCount = 0; - hipGetDeviceCount(&gpuCount); - std::cout << "Number of GPUs: " << gpuCount << std::endl; - - init_tracing(); - - while (iterations-- > 0) { - start_tracing(); - - Matrix = (float*)malloc(NUM * sizeof(float)); - TransposeMatrix = (float*)malloc(NUM * sizeof(float)); - cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float)); - - // initialize the input data - for (i = 0; i < NUM; i++) { - Matrix[i] = (float)i * 10.0f; - } - - for (i = 0; i < gpuCount; ++i) { - // switch GPU. - hipSetDevice(i); - - hipDeviceProp_t devProp; - hipGetDeviceProperties(&devProp, 0); - std::cout << "Device name " << devProp.name << std::endl; - - // allocate the memory on the device side - hipMalloc((void**)&gpuMatrix, NUM * sizeof(float)); - hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float)); - - // Memory transfer from host to device - hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice); - - // Lauching kernel from host - hipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y), - dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix, - gpuMatrix, WIDTH); - - hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM * sizeof(float), hipMemcpyDeviceToHost); - - hipStreamSynchronize(0); - - // free the resources on device side - hipFree(gpuMatrix); - hipFree(gpuTransposeMatrix); - } - - // CPU MatrixTranspose computation - matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH); - - // verify the results - errors = 0; - double eps = 1.0E-6; - for (i = 0; i < NUM; i++) { - if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) { - errors++; - } - } - if (errors != 0) { - printf("FAILED: %d errors\n", errors); - } else { - printf("PASSED!\n"); - } - - // free the resources on host side - free(Matrix); - free(TransposeMatrix); - free(cpuTransposeMatrix); - - stop_tracing(); - } - - return errors; -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// HIP Callbacks/Activity tracing -// -#if 1 -#include -#include - -// Macro to check ROC-tracer calls status -#define ROCTRACER_CALL(call) \ - do { \ - int err = call; \ - if (err != 0) { \ - std::cerr << roctracer_error_string() << std::endl << std::flush; \ - abort(); \ - } \ - } while (0) - -// Runtime API callback function -void api_callback( - uint32_t domain, - uint32_t cid, - const void* callback_data, - void* arg) -{ - std::cout << "### api_callback IN\n"; - (void)arg; - - //if (domain == ACTIVITY_DOMAIN_ROCTX) { - // const roctx_api_data_t* data = reinterpret_cast(callback_data); - // fprintf(stdout, "ROCTX: \"%s\"\n", data->args.message); - // return; - //} - - if (domain == ACTIVITY_DOMAIN_HCC_OPS) { - fprintf(stdout, "HCC OPS\n"); - return; - } - - if (domain == ACTIVITY_DOMAIN_HSA_API) { - fprintf(stdout, "HSA API\n"); - return; - } - - const hip_api_data_t* data = reinterpret_cast(callback_data); - fprintf(stdout, "<%s id(%u)\tcorrelation_id(%lu) %s> ", - roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, cid, 0), - cid, - data->correlation_id, - (data->phase == ACTIVITY_API_PHASE_ENTER) ? "on-enter" : "on-exit"); - if (data->phase == ACTIVITY_API_PHASE_ENTER) { - switch (cid) { - case HIP_API_ID_hipMemcpy: - fprintf(stdout, "dst(%p) src(%p) size(0x%x) kind(%u)", - data->args.hipMemcpy.dst, - data->args.hipMemcpy.src, - (uint32_t)(data->args.hipMemcpy.sizeBytes), - (uint32_t)(data->args.hipMemcpy.kind)); - break; - case HIP_API_ID_hipMalloc: - fprintf(stdout, "ptr(%p) size(0x%x)", - data->args.hipMalloc.ptr, - (uint32_t)(data->args.hipMalloc.size)); - break; - case HIP_API_ID_hipFree: - fprintf(stdout, "ptr(%p)", - data->args.hipFree.ptr); - break; - case HIP_API_ID_hipModuleLaunchKernel: - fprintf(stdout, "kernel(\"%s\") stream(%p)", - hipKernelNameRef(data->args.hipModuleLaunchKernel.f), - data->args.hipModuleLaunchKernel.stream); - break; - default: - break; - } - } else { - switch (cid) { - case HIP_API_ID_hipMalloc: - fprintf(stdout, "*ptr(0x%p)", - *(data->args.hipMalloc.ptr)); - break; - default: - break; - } - } - fprintf(stdout, "\n"); fflush(stdout); -} - -// Activity tracing callback -// hipMalloc id(3) correlation_id(1): begin_ns(1525888652762640464) end_ns(1525888652762877067) -void activity_callback(const char* begin, const char* end, void* arg) { - std::cout << "### activity_callback IN\n"; - const roctracer_record_t* record = reinterpret_cast(begin); - const roctracer_record_t* end_record = reinterpret_cast(end); - fprintf(stdout, "\tActivity records:\n"); fflush(stdout); - while (record < end_record) { - const char * name = roctracer_op_string(record->domain, record->op, record->kind); - fprintf(stdout, "\tdomain(%u)", record->domain); - fprintf(stdout, "\t%s\tcorrelation_id(%lu) time_ns(%lu:%lu)", - name, - record->correlation_id, - record->begin_ns, - record->end_ns - ); - if (record->domain == ACTIVITY_DOMAIN_HIP_API) { - fprintf(stdout, " process_id(%u) thread_id(%u)", - record->process_id, - record->thread_id - ); - } else if (record->domain == ACTIVITY_DOMAIN_HCC_OPS) { - fprintf(stdout, " device_id(%d) queue_id(%lu)", - record->device_id, - record->queue_id - ); - if (record->op == HIP_OP_ID_COPY) fprintf(stdout, " bytes(0x%zx)", record->bytes); - } else if (record->domain == ACTIVITY_DOMAIN_EXT_API) { - fprintf(stdout, " external_id(%lu)", - record->external_id - ); - } else { - fprintf(stderr, "Bad domain %d\n", record->domain); - //abort(); - } - fprintf(stdout, "\n"); - fflush(stdout); - ROCTRACER_CALL(roctracer_next_record(record, &record)); - } -} - -// Init tracing routine -void init_tracing() { - std::cout << "# INIT #############################" << std::endl << std::flush; - // Allocating tracing pool - roctracer_properties_t properties{}; - properties.buffer_size = 0x1000; - properties.buffer_callback_fun = activity_callback; - properties.buffer_callback_arg = &properties; - ROCTRACER_CALL(roctracer_open_pool(&properties)); - // Enable API callbacks - ROCTRACER_CALL(roctracer_enable_callback(api_callback, NULL)); - // Enable activity tracing - ROCTRACER_CALL(roctracer_enable_activity()); -} - -// Start tracing routine -void start_tracing() { - std::cout << "# START (" << iterations << ") #############################" << std::endl << std::flush; -} - -// Stop tracing routine -void stop_tracing() { - ROCTRACER_CALL(roctracer_disable_callback()); - - ROCTRACER_CALL(roctracer_disable_activity()); - ROCTRACER_CALL(roctracer_flush_activity()); - std::cout << "# STOP #############################" << std::endl << std::flush; -} -#else -void init_tracing() {} -void start_tracing() {} -void stop_tracing() {} -#endif -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/test/run.sh b/test/run.sh index d634357516..7b7d5109b3 100755 --- a/test/run.sh +++ b/test/run.sh @@ -67,6 +67,7 @@ eval_test() { # rocTrecer is used explicitely by test eval_test "standalone C test" "LD_PRELOAD=libkfdwrapper64.so ./test/MatrixTranspose_ctest" eval_test "standalone HIP test" "LD_PRELOAD=libkfdwrapper64.so ./test/MatrixTranspose_test" +eval_test "standalone HIP MGPU test" "LD_PRELOAD=libkfdwrapper64.so ./test/MatrixTranspose_mgpu" # Tool test # rocTracer/tool is loaded by HSA runtime