HSA tracing domain with HIP MatrixTranspose sample test

Этот коммит содержится в:
Evgeny
2018-12-21 10:53:00 -06:00
родитель 7092fb92f5
Коммит d75eb95472
10 изменённых файлов: 559 добавлений и 238 удалений
+1 -1
Просмотреть файл
@@ -43,7 +43,7 @@ include ( env )
## Set test target
add_custom_target( mytest
COMMAND make -C "${CMAKE_CURRENT_SOURCE_DIR}/test/MatrixTranspose"
COMMAND ${PROJECT_BINARY_DIR}/run.sh)
COMMAND sh -xc "cp ${CMAKE_CURRENT_SOURCE_DIR}/test/MatrixTranspose/MatrixTranspose ${PROJECT_BINARY_DIR}/test" )
## Setup the package version.
get_version ( "1.0.0" )
+11 -8
Просмотреть файл
@@ -23,11 +23,6 @@
cmake_minimum_required ( VERSION 2.8.12 )
set ( CMAKE_VERBOSE_MAKEFILE TRUE CACHE BOOL "Verbose Output" FORCE )
set ( RUN_SCRIPT "${TEST_DIR}/run.sh" )
set ( TEST_HSA "${TEST_DIR}/hsa/test" )
execute_process ( COMMAND sh -xc "if [ ! -e ${TEST_DIR}/hsa ] ; then git clone https://github.com/ROCmSoftwarePlatform/hsa-class.git ${TEST_DIR}/hsa; fi" )
execute_process ( COMMAND sh -xc "if [ -e ${TEST_DIR}/hsa ] ; then cd ${TEST_DIR}/hsa && git pull --rebase; fi" )
## Util sources
file( GLOB UTIL_SRC "${TEST_HSA}/util/*.cpp" )
@@ -35,9 +30,17 @@ file( GLOB UTIL_SRC "${TEST_HSA}/util/*.cpp" )
set ( TEST_LIB "tracer_tool" )
set ( TEST_LIB_SRC ${TEST_DIR}/tool/tracer_tool.cpp ${UTIL_SRC} )
add_library ( ${TEST_LIB} SHARED ${TEST_LIB_SRC} )
target_include_directories ( ${TEST_LIB} PRIVATE ${TEST_HSA} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} )
target_include_directories ( ${TEST_LIB} PRIVATE ${TEST_HSA} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ${HIP_INC_DIR} )
target_link_libraries ( ${TEST_LIB} ${ROCTRACER_TARGET} ${HSA_RUNTIME_LIB} c stdc++ dl pthread rt )
set ( TEST_DIR ${TEST_HSA} )
add_subdirectory ( ${TEST_DIR} ${PROJECT_BINARY_DIR}/test/hsa )
# HSA test
#set ( TEST_HSA "${TEST_DIR}/hsa/test" )
#set ( TEST_DIR ${TEST_HSA} )
#execute_process ( COMMAND sh -xc "if [ ! -e ${TEST_DIR}/hsa ] ; then git clone https://github.com/ROCmSoftwarePlatform/hsa-class.git ${TEST_DIR}/hsa; fi" )
#execute_process ( COMMAND sh -xc "if [ -e ${TEST_DIR}/hsa ] ; then cd ${TEST_DIR}/hsa && git pull --rebase; fi" )
#add_subdirectory ( ${TEST_DIR} ${PROJECT_BINARY_DIR}/test/hsa )
# test run script
set ( RUN_SCRIPT "${TEST_DIR}/run.sh" )
execute_process ( COMMAND sh -xc "cp ${RUN_SCRIPT} ${PROJECT_BINARY_DIR}" )
+21 -17
Просмотреть файл
@@ -1,32 +1,36 @@
ROOT_PATH = ../..
LIB_PATH = $(ROOT_PATH)/build
LIB_NAME = roctracer64
ROC_LIBS = -L$(LIB_PATH) -l$(LIB_NAME)
HIP_PATH?= $(wildcard /opt/rocm/hip)
ifeq (,$(HIP_PATH))
HIP_PATH=../../..
endif
EXECUTABLE = ./MatrixTranspose.exe
SOURCES = MatrixTranspose.cpp
OBJECTS = $(SOURCES:.cpp=.o)
HIPCC=$(HIP_PATH)/bin/hipcc
ITERATIONS ?= 100
TARGET=hcc
HIP_PATH ?= /opt/rocm/hip
HIPCC = $(HIP_PATH)/bin/hipcc
SOURCES = MatrixTranspose.cpp
OBJECTS = $(SOURCES:.cpp=.o)
EXECUTABLE=./MatrixTranspose
.PHONY: test
all: $(EXECUTABLE) test
CXXFLAGS =-g
CXX=$(HIPCC)
CXXFLAGS =-g -I$(ROOT_PATH) -I$(ROOT_PATH)/inc -DLOCAL_BUILD=1 -DITERATIONS=$(ITERATIONS)
export LD_LIBRARY_PATH=$(LIB_PATH)
all: clean $(EXECUTABLE) test
$(EXECUTABLE): $(OBJECTS)
$(HIPCC) $(OBJECTS) -o $@ $(HCC_LIBS) $(ROC_LIBS)
$(HIPCC) $(OBJECTS) -o $@
test: $(EXECUTABLE)
HCC_PROFILE=1 LD_PRELOAD=$(HCC_HOME)/lib/libmcwamp_hsa.so $(EXECUTABLE)
LD_PRELOAD=$(HCC_HOME)/lib/libmcwamp_hsa.so $(EXECUTABLE)
clean:
rm -f $(EXECUTABLE)
rm -f $(OBJECTS)
rm -f $(HIP_PATH)/src/*.o
.PHONY: all test clean
+50 -195
Просмотреть файл
@@ -23,11 +23,9 @@ THE SOFTWARE.
#include <iostream>
// hip header file
#include <hip/hip_runtime.h>
#include "hip/hip_runtime.h"
#ifndef ITERATIONS
# define ITERATIONS 100
#endif
#define WIDTH 1024
@@ -38,8 +36,7 @@ THE SOFTWARE.
#define THREADS_PER_BLOCK_Z 1
// Device (Kernel) function, it must be void
// hipLaunchParm provides the execution configuration
__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
__global__ void matrixTranspose(float* out, float* in, const int width) {
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
@@ -55,10 +52,6 @@ void matrixTransposeCPUReference(float* output, float* input, const unsigned int
}
}
int iterations = ITERATIONS;
void start_tracing();
void stop_tracing();
int main() {
float* Matrix;
float* TransposeMatrix;
@@ -75,193 +68,55 @@ int main() {
int i;
int errors;
while (iterations-- > 0) {
start_tracing();
Matrix = (float*)malloc(NUM * sizeof(float));
TransposeMatrix = (float*)malloc(NUM * sizeof(float));
cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
Matrix = (float*)malloc(NUM * sizeof(float));
TransposeMatrix = (float*)malloc(NUM * sizeof(float));
cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
// initialize the input data
for (i = 0; i < NUM; i++) {
Matrix[i] = (float)i * 10.0f;
}
// allocate the memory on the device side
hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
// Memory transfer from host to device
hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);
// Lauching kernel from host
hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
gpuMatrix, WIDTH);
// Memory transfer from device to host
hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM * sizeof(float), hipMemcpyDeviceToHost);
// CPU MatrixTranspose computation
matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
// verify the results
errors = 0;
double eps = 1.0E-6;
for (i = 0; i < NUM; i++) {
if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {
errors++;
}
}
if (errors != 0) {
printf("FAILED: %d errors\n", errors);
} else {
printf("PASSED!\n");
}
// free the resources on device side
hipFree(gpuMatrix);
hipFree(gpuTransposeMatrix);
// free the resources on host side
free(Matrix);
free(TransposeMatrix);
free(cpuTransposeMatrix);
stop_tracing();
// initialize the input data
for (i = 0; i < NUM; i++) {
Matrix[i] = (float)i * 10.0f;
}
// allocate the memory on the device side
hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
// Memory transfer from host to device
hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);
// Lauching kernel from host
hipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
gpuMatrix, WIDTH);
// Memory transfer from device to host
hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM * sizeof(float), hipMemcpyDeviceToHost);
// CPU MatrixTranspose computation
matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
// verify the results
errors = 0;
double eps = 1.0E-6;
for (i = 0; i < NUM; i++) {
if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {
errors++;
}
}
if (errors != 0) {
printf("FAILED: %d errors\n", errors);
} else {
printf("PASSED!\n");
}
// free the resources on device side
hipFree(gpuMatrix);
hipFree(gpuTransposeMatrix);
// free the resources on host side
free(Matrix);
free(TransposeMatrix);
free(cpuTransposeMatrix);
return errors;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// HIP Callbacks/Activity tracing
//
#if 1
#include <inc/roctracer_hip.h>
#include <inc/roctracer_hcc.h>
// Macro to check ROC-tracer calls status
#define ROCTRACER_CALL(call) \
do { \
int err = call; \
if (err != 0) { \
std::cerr << roctracer_error_string() << std::endl << std::flush; \
abort(); \
} \
} while (0)
// HIP API callback function
void hip_api_callback(
uint32_t domain,
uint32_t cid,
const void* callback_data,
void* arg)
{
(void)arg;
const hip_api_data_t* data = reinterpret_cast<const hip_api_data_t*>(callback_data);
fprintf(stdout, "<%s id(%u)\tcorrelation_id(%lu) %s> ",
roctracer_id_string(ACTIVITY_DOMAIN_HIP_API, cid, 0),
cid,
data->correlation_id,
(data->phase == ACTIVITY_API_PHASE_ENTER) ? "on-enter" : "on-exit");
if (data->phase == ACTIVITY_API_PHASE_ENTER) {
switch (cid) {
case HIP_API_ID_hipMemcpy:
fprintf(stdout, "dst(%p) src(%p) size(0x%x) kind(%u)",
data->args.hipMemcpy.dst,
data->args.hipMemcpy.src,
(uint32_t)(data->args.hipMemcpy.sizeBytes),
(uint32_t)(data->args.hipMemcpy.kind));
break;
case HIP_API_ID_hipMalloc:
fprintf(stdout, "ptr(%p) size(0x%x)",
data->args.hipMalloc.ptr,
(uint32_t)(data->args.hipMalloc.size));
break;
case HIP_API_ID_hipFree:
fprintf(stdout, "ptr(%p)",
data->args.hipFree.ptr);
break;
case HIP_API_ID_hipModuleLaunchKernel:
fprintf(stdout, "kernel(\"%s\") stream(%p)",
hipKernelNameRef(data->args.hipModuleLaunchKernel.f),
data->args.hipModuleLaunchKernel.stream);
break;
default:
break;
}
} else {
switch (cid) {
case HIP_API_ID_hipMalloc:
fprintf(stdout, "*ptr(0x%p)",
*(data->args.hipMalloc.ptr));
break;
default:
break;
}
}
fprintf(stdout, "\n"); fflush(stdout);
}
// Activity tracing callback
// hipMalloc id(3) correlation_id(1): begin_ns(1525888652762640464) end_ns(1525888652762877067)
void activity_callback(const char* begin, const char* end, void* arg) {
const roctracer_record_t* record = reinterpret_cast<const roctracer_record_t*>(begin);
const roctracer_record_t* end_record = reinterpret_cast<const roctracer_record_t*>(end);
fprintf(stdout, "\tActivity records:\n"); fflush(stdout);
while (record < end_record) {
const char * name = roctracer_id_string(record->domain, record->activity_id, record->kind);
fprintf(stdout, "\t%s\tcorrelation_id(%lu) time_ns(%lu:%lu)",
name,
record->correlation_id,
record->begin_ns,
record->end_ns
);
if (record->domain == ACTIVITY_DOMAIN_HIP_API) {
fprintf(stdout, " process_id(%u) thread_id(%u)",
record->process_id,
record->thread_id
);
} else if (record->domain == ACTIVITY_DOMAIN_HCC_OPS) {
fprintf(stdout, " device_id(%d) queue_id(%lu)",
record->device_id,
record->queue_id
);
} else {
fprintf(stderr, "Bad domain %d\n", record->domain);
abort();
}
if (record->activity_id == hc::HSA_OP_ID_COPY) fprintf(stdout, " bytes(0x%zx)", record->bytes);
fprintf(stdout, "\n");
fflush(stdout);
ROCTRACER_CALL(roctracer_next_record(record, &record));
}
}
// Start tracing routine
void start_tracing() {
std::cout << "# START #############################" << std::endl << std::flush;
// Allocating tracing pool
roctracer_properties_t properties{};
properties.buffer_size = 12;
properties.buffer_callback_fun = activity_callback;
ROCTRACER_CALL(roctracer_open_pool(&properties));
// Enable HIP API callbacks
ROCTRACER_CALL(roctracer_enable_callback(ACTIVITY_DOMAIN_ANY, 0, hip_api_callback, NULL));
// Enable HIP activity tracing
ROCTRACER_CALL(roctracer_enable_activity(ACTIVITY_DOMAIN_ANY, 0));
}
// Stop tracing routine
void stop_tracing() {
ROCTRACER_CALL(roctracer_disable_callback(ACTIVITY_DOMAIN_ANY, 0));
ROCTRACER_CALL(roctracer_disable_activity(ACTIVITY_DOMAIN_ANY, 0));
ROCTRACER_CALL(roctracer_close_pool());
std::cout << "# STOP #############################" << std::endl << std::flush;
}
#else
void start_tracing() {}
void stop_tracing() {}
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+6 -9
Просмотреть файл
@@ -21,8 +21,7 @@ In order to use the HIP framework, we need to add the "hip_runtime.h" header fil
## Device-side code
We will work on device side code first, Here is simple example showing a snippet of HIP device side code:
`__global__ void matrixTranspose(hipLaunchParm lp, `
` float *out, `
`__global__ void matrixTranspose(float *out, `
` float *in, `
` const int width, `
` const int height) `
@@ -41,11 +40,9 @@ other function-type qualifiers are:
`__host__` can combine with `__device__`, in which case the function compiles for both the host and device. These functions cannot use the HIP grid coordinate functions (for example, "hipThreadIdx_x", will talk about it latter). A possible workaround is to pass the necessary coordinate info as an argument to the function.
`__host__` cannot combine with `__global__`.
`__global__` functions are often referred to as *kernels, and calling one is termed *launching the kernel*.
`__global__` functions are often referred to as *kernels*, and calling one is termed *launching the kernel*.
Next keyword is `void`. HIP `__global__` functions must have a `void` return type, and the first parameter to a HIP `__global__` function must have the type `hipLaunchParm`, which is for execution configuration. Global functions require the caller to specify an "execution configuration" that includes the grid and block dimensions. The execution configuration can also include other information for the launch, such as the amount of additional shared memory to allocate and the stream where the kernel should execute.
After `hipLaunchParm`, Kernel arguments follows next(i.e., `float *out, float *in, const int width, const int height`).
Next keyword is `void`. HIP `__global__` functions must have a `void` return type. Global functions require the caller to specify an "execution configuration" that includes the grid and block dimensions. The execution configuration can also include other information for the launch, such as the amount of additional shared memory to allocate and the stream where the kernel should execute.
The kernel function begins with
` int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;`
@@ -63,15 +60,15 @@ We allocated memory to the Matrix on host side by using malloc and initiallized
here the first parameter is the destination pointer, second is the source pointer, third is the size of memory copy and the last specify the direction on memory copy(which is in this case froom host to device). While in order to transfer memory from device to host, use `hipMemcpyDeviceToHost` and for device to device memory copy use `hipMemcpyDeviceToDevice`.
Now, we'll see how to launch the kernel.
` hipLaunchKernel(matrixTranspose, `
` hipLaunchKernelGGL(matrixTranspose, `
` dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), `
` dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), `
` 0, 0, `
` gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT); `
HIP introduces a standard C++ calling convention to pass the execution configuration to the kernel (this convention replaces the `Cuda <<< >>>` syntax). In HIP,
- Kernels launch with the `"hipLaunchKernel"` function
- The first five parameters to hipLaunchKernel are the following:
- Kernels launch with the `"hipLaunchKernelGGL"` function
- The first five parameters to hipLaunchKernelGGL are the following:
- **symbol kernelName**: the name of the kernel to launch. To support template kernels which contains "," use the HIP_KERNEL_NAME macro. In current application it's "matrixTranspose".
- **dim3 gridDim**: 3D-grid dimensions specifying the number of blocks to launch. In MatrixTranspose sample, it's "dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y)".
- **dim3 blockDim**: 3D-block dimensions specifying the number of threads in each block.In MatrixTranspose sample, it's "dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y)".
+32
Просмотреть файл
@@ -0,0 +1,32 @@
ROOT_PATH = ../..
LIB_PATH = $(ROOT_PATH)/build
LIB_NAME = roctracer64
ROC_LIBS = -L$(LIB_PATH) -l$(LIB_NAME)
EXECUTABLE = ./MatrixTranspose.exe
SOURCES = MatrixTranspose.cpp
OBJECTS = $(SOURCES:.cpp=.o)
ITERATIONS ?= 100
HIP_PATH ?= /opt/rocm/hip
HIPCC = $(HIP_PATH)/bin/hipcc
CXX=$(HIPCC)
CXXFLAGS =-g -I$(ROOT_PATH) -I$(ROOT_PATH)/inc -DLOCAL_BUILD=1 -DITERATIONS=$(ITERATIONS)
export LD_LIBRARY_PATH=$(LIB_PATH)
all: clean $(EXECUTABLE) test
$(EXECUTABLE): $(OBJECTS)
$(HIPCC) $(OBJECTS) -o $@ $(HCC_LIBS) $(ROC_LIBS)
test: $(EXECUTABLE)
HCC_PROFILE=1 LD_PRELOAD=$(HCC_HOME)/lib/libmcwamp_hsa.so $(EXECUTABLE)
clean:
rm -f $(EXECUTABLE)
rm -f $(OBJECTS)
.PHONY: all test clean
+267
Просмотреть файл
@@ -0,0 +1,267 @@
/*
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <iostream>
// hip header file
#include <hip/hip_runtime.h>
#ifndef ITERATIONS
# define ITERATIONS 100
#endif
#define WIDTH 1024
#define NUM (WIDTH * WIDTH)
#define THREADS_PER_BLOCK_X 4
#define THREADS_PER_BLOCK_Y 4
#define THREADS_PER_BLOCK_Z 1
// Device (Kernel) function, it must be void
// hipLaunchParm provides the execution configuration
__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
out[y * width + x] = in[x * width + y];
}
// CPU implementation of matrix transpose
void matrixTransposeCPUReference(float* output, float* input, const unsigned int width) {
for (unsigned int j = 0; j < width; j++) {
for (unsigned int i = 0; i < width; i++) {
output[i * width + j] = input[j * width + i];
}
}
}
int iterations = ITERATIONS;
void start_tracing();
void stop_tracing();
int main() {
float* Matrix;
float* TransposeMatrix;
float* cpuTransposeMatrix;
float* gpuMatrix;
float* gpuTransposeMatrix;
hipDeviceProp_t devProp;
hipGetDeviceProperties(&devProp, 0);
std::cout << "Device name " << devProp.name << std::endl;
int i;
int errors;
while (iterations-- > 0) {
start_tracing();
Matrix = (float*)malloc(NUM * sizeof(float));
TransposeMatrix = (float*)malloc(NUM * sizeof(float));
cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
// initialize the input data
for (i = 0; i < NUM; i++) {
Matrix[i] = (float)i * 10.0f;
}
// allocate the memory on the device side
hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
// Memory transfer from host to device
hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);
// Lauching kernel from host
hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
gpuMatrix, WIDTH);
// Memory transfer from device to host
hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM * sizeof(float), hipMemcpyDeviceToHost);
// CPU MatrixTranspose computation
matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
// verify the results
errors = 0;
double eps = 1.0E-6;
for (i = 0; i < NUM; i++) {
if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {
errors++;
}
}
if (errors != 0) {
printf("FAILED: %d errors\n", errors);
} else {
printf("PASSED!\n");
}
// free the resources on device side
hipFree(gpuMatrix);
hipFree(gpuTransposeMatrix);
// free the resources on host side
free(Matrix);
free(TransposeMatrix);
free(cpuTransposeMatrix);
stop_tracing();
}
return errors;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// HIP Callbacks/Activity tracing
//
#if 1
#include <inc/roctracer_hip.h>
#include <inc/roctracer_hcc.h>
// Macro to check ROC-tracer calls status
#define ROCTRACER_CALL(call) \
do { \
int err = call; \
if (err != 0) { \
std::cerr << roctracer_error_string() << std::endl << std::flush; \
abort(); \
} \
} while (0)
// HIP API callback function
void hip_api_callback(
uint32_t domain,
uint32_t cid,
const void* callback_data,
void* arg)
{
(void)arg;
const hip_api_data_t* data = reinterpret_cast<const hip_api_data_t*>(callback_data);
fprintf(stdout, "<%s id(%u)\tcorrelation_id(%lu) %s> ",
roctracer_id_string(ACTIVITY_DOMAIN_HIP_API, cid, 0),
cid,
data->correlation_id,
(data->phase == ACTIVITY_API_PHASE_ENTER) ? "on-enter" : "on-exit");
if (data->phase == ACTIVITY_API_PHASE_ENTER) {
switch (cid) {
case HIP_API_ID_hipMemcpy:
fprintf(stdout, "dst(%p) src(%p) size(0x%x) kind(%u)",
data->args.hipMemcpy.dst,
data->args.hipMemcpy.src,
(uint32_t)(data->args.hipMemcpy.sizeBytes),
(uint32_t)(data->args.hipMemcpy.kind));
break;
case HIP_API_ID_hipMalloc:
fprintf(stdout, "ptr(%p) size(0x%x)",
data->args.hipMalloc.ptr,
(uint32_t)(data->args.hipMalloc.size));
break;
case HIP_API_ID_hipFree:
fprintf(stdout, "ptr(%p)",
data->args.hipFree.ptr);
break;
case HIP_API_ID_hipModuleLaunchKernel:
fprintf(stdout, "kernel(\"%s\") stream(%p)",
hipKernelNameRef(data->args.hipModuleLaunchKernel.f),
data->args.hipModuleLaunchKernel.stream);
break;
default:
break;
}
} else {
switch (cid) {
case HIP_API_ID_hipMalloc:
fprintf(stdout, "*ptr(0x%p)",
*(data->args.hipMalloc.ptr));
break;
default:
break;
}
}
fprintf(stdout, "\n"); fflush(stdout);
}
// Activity tracing callback
// hipMalloc id(3) correlation_id(1): begin_ns(1525888652762640464) end_ns(1525888652762877067)
void activity_callback(const char* begin, const char* end, void* arg) {
const roctracer_record_t* record = reinterpret_cast<const roctracer_record_t*>(begin);
const roctracer_record_t* end_record = reinterpret_cast<const roctracer_record_t*>(end);
fprintf(stdout, "\tActivity records:\n"); fflush(stdout);
while (record < end_record) {
const char * name = roctracer_id_string(record->domain, record->activity_id, record->kind);
fprintf(stdout, "\t%s\tcorrelation_id(%lu) time_ns(%lu:%lu)",
name,
record->correlation_id,
record->begin_ns,
record->end_ns
);
if (record->domain == ACTIVITY_DOMAIN_HIP_API) {
fprintf(stdout, " process_id(%u) thread_id(%u)",
record->process_id,
record->thread_id
);
} else if (record->domain == ACTIVITY_DOMAIN_HCC_OPS) {
fprintf(stdout, " device_id(%d) queue_id(%lu)",
record->device_id,
record->queue_id
);
} else {
fprintf(stderr, "Bad domain %d\n", record->domain);
abort();
}
if (record->activity_id == hc::HSA_OP_ID_COPY) fprintf(stdout, " bytes(0x%zx)", record->bytes);
fprintf(stdout, "\n");
fflush(stdout);
ROCTRACER_CALL(roctracer_next_record(record, &record));
}
}
// Start tracing routine
void start_tracing() {
std::cout << "# START #############################" << std::endl << std::flush;
// Allocating tracing pool
roctracer_properties_t properties{};
properties.buffer_size = 12;
properties.buffer_callback_fun = activity_callback;
ROCTRACER_CALL(roctracer_open_pool(&properties));
// Enable HIP API callbacks
ROCTRACER_CALL(roctracer_enable_callback(ACTIVITY_DOMAIN_ANY, 0, hip_api_callback, NULL));
// Enable HIP activity tracing
ROCTRACER_CALL(roctracer_enable_activity(ACTIVITY_DOMAIN_ANY, 0));
}
// Stop tracing routine
void stop_tracing() {
ROCTRACER_CALL(roctracer_disable_callback(ACTIVITY_DOMAIN_ANY, 0));
ROCTRACER_CALL(roctracer_disable_activity(ACTIVITY_DOMAIN_ANY, 0));
ROCTRACER_CALL(roctracer_close_pool());
std::cout << "# STOP #############################" << std::endl << std::flush;
}
#else
void start_tracing() {}
void stop_tracing() {}
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+100
Просмотреть файл
@@ -0,0 +1,100 @@
## Writing first HIP program ###
This tutorial shows how to get write simple HIP application. We will write the simplest Matrix Transpose program.
## HIP Introduction:
HIP is a C++ runtime API and kernel language that allows developers to create portable applications that can run on AMD and other GPUs. Our goal was to rise above the lowest-common-denominator paths and deliver a solution that allows you, the developer, to use essential hardware features and maximize your applications performance on GPU hardware.
## Requirement:
For hardware requirement and software installation [Installation](https://github.com/ROCm-Developer-Tools/HIP/INSTALL.md)
## prerequiste knowledge:
Programmers familiar with CUDA, OpenCL will be able to quickly learn and start coding with the HIP API. In case you are not, don't worry. You choose to start with the best one. We'll be explaining everything assuming you are completely new to gpgpu programming.
## Simple Matrix Transpose
Here is simple example showing how to write your first program in HIP.
In order to use the HIP framework, we need to add the "hip_runtime.h" header file. SInce its c++ api you can add any header file you have been using earlier while writing your c/c++ program. For gpgpu programming, we have host(microprocessor) and the device(gpu).
## Device-side code
We will work on device side code first, Here is simple example showing a snippet of HIP device side code:
`__global__ void matrixTranspose(hipLaunchParm lp, `
` float *out, `
` float *in, `
` const int width, `
` const int height) `
`{ `
` int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; `
` int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; `
` `
` out[y * width + x] = in[x * height + y]; `
`} `
`__global__` keyword is the Function-Type Qualifiers, it is used with functions that are executed on device and are called/launched from the hosts.
other function-type qualifiers are:
`__device__` functions are Executed on the device and Called from the device only
`__host__` functions are Executed on the host and Called from the host
`__host__` can combine with `__device__`, in which case the function compiles for both the host and device. These functions cannot use the HIP grid coordinate functions (for example, "hipThreadIdx_x", will talk about it latter). A possible workaround is to pass the necessary coordinate info as an argument to the function.
`__host__` cannot combine with `__global__`.
`__global__` functions are often referred to as *kernels, and calling one is termed *launching the kernel*.
Next keyword is `void`. HIP `__global__` functions must have a `void` return type, and the first parameter to a HIP `__global__` function must have the type `hipLaunchParm`, which is for execution configuration. Global functions require the caller to specify an "execution configuration" that includes the grid and block dimensions. The execution configuration can also include other information for the launch, such as the amount of additional shared memory to allocate and the stream where the kernel should execute.
After `hipLaunchParm`, Kernel arguments follows next(i.e., `float *out, float *in, const int width, const int height`).
The kernel function begins with
` int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;`
` int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;`
here the keyword hipBlockIdx_x, hipBlockIdx_y and hipBlockIdx_z(not used here) are the built-in functions to identify the threads in a block. The keyword hipBlockDim_x, hipBlockDim_y and hipBlockDim_z(not used here) are to identify the dimensions of the block.
We are familiar with rest of the code on device-side.
## Host-side code
Now, we'll see how to call the kernel from the host. Inside the main() function, we first defined the pointers(for both, the host-side as well as device). The declaration of device pointer is similar to that of the host. Next, we have `hipDeviceProp_t`, it is the pre-defined struct for hip device properties. This is followed by `hipGetDeviceProperties(&devProp, 0)` It is used to extract the device information. The first parameter is the struct, second parameter is the device number to get properties for. Next line print the name of the device.
We allocated memory to the Matrix on host side by using malloc and initiallized it. While in order to allocate memory on device side we will be using `hipMalloc`, it's quiet similar to that of malloc instruction. After this, we will copy the data to the allocated memory on device-side using `hipMemcpy`.
` hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);`
here the first parameter is the destination pointer, second is the source pointer, third is the size of memory copy and the last specify the direction on memory copy(which is in this case froom host to device). While in order to transfer memory from device to host, use `hipMemcpyDeviceToHost` and for device to device memory copy use `hipMemcpyDeviceToDevice`.
Now, we'll see how to launch the kernel.
` hipLaunchKernel(matrixTranspose, `
` dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), `
` dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), `
` 0, 0, `
` gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT); `
HIP introduces a standard C++ calling convention to pass the execution configuration to the kernel (this convention replaces the `Cuda <<< >>>` syntax). In HIP,
- Kernels launch with the `"hipLaunchKernel"` function
- The first five parameters to hipLaunchKernel are the following:
- **symbol kernelName**: the name of the kernel to launch. To support template kernels which contains "," use the HIP_KERNEL_NAME macro. In current application it's "matrixTranspose".
- **dim3 gridDim**: 3D-grid dimensions specifying the number of blocks to launch. In MatrixTranspose sample, it's "dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y)".
- **dim3 blockDim**: 3D-block dimensions specifying the number of threads in each block.In MatrixTranspose sample, it's "dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y)".
- **size_t dynamicShared**: amount of additional shared memory to allocate when launching the kernel. In MatrixTranspose sample, it's '0'.
- **hipStream_t**: stream where the kernel should execute. A value of 0 corresponds to the NULL stream.In MatrixTranspose sample, it's '0'.
- Kernel arguments follow these first five parameters. Here, these are "gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT".
Next, we'll copy the computed values/data back to the device using the `hipMemcpy`. Here the last parameter will be `hipMemcpyDeviceToHost`
After, copying the data from device to memory, we will verify it with the one we computed with the cpu reference funtion.
Finally, we will free the memory allocated earlier by using free() for host while for devices we will use `hipFree`.
## How to build and run:
Use the make command and execute it using ./exe
Use hipcc to build the application, which is using hcc on AMD and nvcc on nvidia.
## More Info:
- [HIP FAQ](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_faq.md)
- [HIP Kernel Language](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_kernel_language.md)
- [HIP Runtime API (Doxygen)](http://rocm-developer-tools.github.io/HIP)
- [HIP Porting Guide](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_porting_guide.md)
- [HIP Terminology](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL)
- [hipify-clang](https://github.com/ROCm-Developer-Tools/HIP/hipify-clang/README.md)
- [Developer/CONTRIBUTING Info](https://github.com/ROCm-Developer-Tools/HIP/CONTRIBUTING.md)
- [Release Notes](https://github.com/ROCm-Developer-Tools/HIP/RELEASE.md)
+2 -3
Просмотреть файл
@@ -38,15 +38,14 @@ export ROCP_AGENTS=1
# each thread creates a queue pre GPU agent
export ROCP_THRS=1
#eval ./test_hsa/ctrl
# paths to ROC profiler and oher libraries
export LD_LIBRARY_PATH=$PWD
# ROC profiler library loaded by HSA runtime
export HSA_TOOLS_LIB="test/libtracer_tool.so libroctracer64.so"
export LD_PRELOAD="$HSA_TOOLS_LIB"
eval ./test/hsa/ctrl
export ROCTRACER_DOMAIN="hsa"
HCC_PROFILE=1 LD_PRELOAD=$HCC_HOME/lib/libmcwamp_hsa.so ./test/MatrixTranspose
#valgrind --leak-check=full $tbin
#valgrind --tool=massif $tbin
+69 -5
Просмотреть файл
@@ -26,15 +26,27 @@ THE SOFTWARE.
#include <stdio.h>
#include <inc/roctracer_hsa.h>
#include <inc/roctracer_hip.h>
#include <inc/ext/hsa_rt_utils.hpp>
#define PUBLIC_API __attribute__((visibility("default")))
#define CONSTRUCTOR_API __attribute__((constructor))
#define DESTRUCTOR_API __attribute__((destructor))
// Macro to check ROC-tracer calls status
#define ROCTRACER_CALL(call) \
do { \
int err = call; \
if (err != 0) { \
std::cerr << roctracer_error_string() << std::endl << std::flush; \
abort(); \
} \
} while (0)
typedef hsa_rt_utils::Timer::timestamp_t timestamp_t;
hsa_rt_utils::Timer timer(NULL);
thread_local timestamp_t begin_timestamp = 0;
thread_local timestamp_t hsa_begin_timestamp = 0;
thread_local timestamp_t hip_begin_timestamp = 0;
// HSA API callback function
void hsa_api_callback(
@@ -48,21 +60,73 @@ void hsa_api_callback(
const hsa_api_data_t* data = reinterpret_cast<const hsa_api_data_t*>(callback_data);
if (data->phase == ACTIVITY_API_PHASE_ENTER) {
begin_timestamp = timer.timestamp_fn_ns();
hsa_begin_timestamp = timer.timestamp_fn_ns();
} else {
const timestamp_t end_timestamp = (cid == HSA_API_ID_hsa_shut_down) ? begin_timestamp : timer.timestamp_fn_ns();
const timestamp_t end_timestamp = (cid == HSA_API_ID_hsa_shut_down) ? hsa_begin_timestamp : timer.timestamp_fn_ns();
std::ostringstream os;
os << '(' << begin_timestamp << ":" << end_timestamp << ") " << hsa_api_data_pair_t(cid, *data);
os << '(' << hsa_begin_timestamp << ":" << end_timestamp << ") " << hsa_api_data_pair_t(cid, *data);
fprintf(stdout, "%s\n", os.str().c_str());
}
}
void hip_api_callback(
uint32_t domain,
uint32_t cid,
const void* callback_data,
void* arg)
{
(void)arg;
const hip_api_data_t* data = reinterpret_cast<const hip_api_data_t*>(callback_data);
if (data->phase == ACTIVITY_API_PHASE_ENTER) {
hsa_begin_timestamp = timer.timestamp_fn_ns();
} else {
const timestamp_t end_timestamp = timer.timestamp_fn_ns();
fprintf(stdout, "(%lu:%lu) %s(", hsa_begin_timestamp, end_timestamp, roctracer_id_string(ACTIVITY_DOMAIN_HIP_API, cid, 0));
switch (cid) {
case HIP_API_ID_hipMemcpy:
fprintf(stdout, "dst(%p) src(%p) size(0x%x) kind(%u)",
data->args.hipMemcpy.dst,
data->args.hipMemcpy.src,
(uint32_t)(data->args.hipMemcpy.sizeBytes),
(uint32_t)(data->args.hipMemcpy.kind));
break;
case HIP_API_ID_hipMalloc:
fprintf(stdout, "ptr(0x%p) size(0x%x)",
*(data->args.hipMalloc.ptr),
(uint32_t)(data->args.hipMalloc.size));
break;
case HIP_API_ID_hipFree:
fprintf(stdout, "ptr(%p)",
data->args.hipFree.ptr);
break;
case HIP_API_ID_hipModuleLaunchKernel:
fprintf(stdout, "kernel(\"%s\") stream(%p)",
hipKernelNameRef(data->args.hipModuleLaunchKernel.f),
data->args.hipModuleLaunchKernel.stream);
break;
default:
break;
}
fprintf(stdout, ")\n"); fflush(stdout);
}
}
extern "C" {
// HSA-runtime tool on-load method
PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t failed_tool_count,
const char* const* failed_tool_names) {
timer.init(table->core_->hsa_system_get_info_fn);
roctracer_enable_callback(ACTIVITY_DOMAIN_HSA_API, HSA_API_ID_ANY, hsa_api_callback, NULL);
const char* trace_domain = getenv("ROCTRACER_DOMAIN");
const bool trace_hsa = (trace_domain == NULL) || (strncmp(trace_domain, "hsa", 3) == 0);
const bool trace_hip = (trace_domain == NULL) || (strncmp(trace_domain, "hip", 3) == 0);
// Enable HSA API callbacks
if (trace_hsa) {
ROCTRACER_CALL(roctracer_enable_callback(ACTIVITY_DOMAIN_HSA_API, HSA_API_ID_ANY, hsa_api_callback, NULL));
}
// Enable HIP API callbacks
if (trace_hip) {
ROCTRACER_CALL(roctracer_enable_callback(ACTIVITY_DOMAIN_HIP_API, HIP_API_ID_ANY, hip_api_callback, NULL));
}
return true;
}
}