show how to use variety of HIP_PROFILE features

Change-Id: I6edd66ac4c068b64e1dc3787d7f1f69ab3238469
This commit is contained in:
Ben Sander
2016-10-27 21:26:28 -05:00
rodzic f8c4fa982a
commit e2fbab109d
@@ -34,6 +34,8 @@ THE SOFTWARE.
#define THREADS_PER_BLOCK_Y 4
#define THREADS_PER_BLOCK_Z 1
#define ITERATIONS 10
// Device (Kernel) function, it must be void
// hipLaunchParm provides the execution configuration
__global__ void matrixTranspose(hipLaunchParm lp,
@@ -62,10 +64,72 @@ void matrixTransposeCPUReference(
}
}
int main() {
//HIP_SCOPED_MARKER(__func__, "MainFunc");
HIP_BEGIN_MARKER(__func__, "MainFunc");
// Use a separate function to demonstrate how to use function name as part of scoped marker:
void runGPU(float *Matrix, float *TransposeMatrix,
float* gpuMatrix, float* gpuTransposeMatrix) {
// __func__ is a standard C++ macro which expands to the name of the function, in this case "runGPU"
HIP_SCOPED_MARKER(__func__, "MyGroup");
for (int i=0; i<ITERATIONS; i++) {
float eventMs = 0.0f;
hipEvent_t start, stop;
hipEventCreate(&start);
hipEventCreate(&stop);
// Record the start event
hipEventRecord(start, NULL);
// Memory transfer from host to device
hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
// Record the stop event
hipEventRecord(stop, NULL);
hipEventSynchronize(stop);
hipEventElapsedTime(&eventMs, start, stop);
printf ("hipMemcpyHostToDevice time taken = %6.3fms\n", eventMs);
// Record the start event
hipEventRecord(start, NULL);
// Lauching kernel from host
hipLaunchKernel(matrixTranspose,
dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
0, 0,
gpuTransposeMatrix , gpuMatrix, WIDTH);
// Record the stop event
hipEventRecord(stop, NULL);
hipEventSynchronize(stop);
hipEventElapsedTime(&eventMs, start, stop);
printf ("kernel Execution time = %6.3fms\n", eventMs);
// Record the start event
hipEventRecord(start, NULL);
// Memory transfer from device to host
hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
// Record the stop event
hipEventRecord(stop, NULL);
hipEventSynchronize(stop);
hipEventElapsedTime(&eventMs, start, stop);
printf ("hipMemcpyDeviceToHost time taken = %6.3fms\n", eventMs);
}
};
int main() {
float* Matrix;
float* TransposeMatrix;
@@ -79,18 +143,14 @@ int main() {
std::cout << "Device name " << devProp.name << std::endl;
hipEvent_t start, stop;
float eventMs = 1.0f;
{
// Show example of how to create a "scoped marker".
// The scoped marker records the time spent inside the { scope } of the marker - the begin timestamp is at the
// beginning of the code scope, and the end is recorded when the SCOPE exits. This can be viewed in CodeXL
// timeline relative to other GPU and CPU events.
// This marker captures the time spent in setup including host allocation, initialization, and device memory allocation.
HIP_SCOPED_MARKER("Setup", "App");
HIP_SCOPED_MARKER("Setup", "MyGroup");
hipEventCreate(&start);
hipEventCreate(&stop);
Matrix = (float*)malloc(NUM * sizeof(float));
@@ -103,91 +163,46 @@ int main() {
}
// allocate the memory on the device side
hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
}
{
HIP_SCOPED_MARKER("Loop", "App");
runGPU(Matrix, TransposeMatrix, gpuMatrix, gpuTransposeMatrix);
// Record the start event
hipEventRecord(start, NULL);
// Memory transfer from host to device
hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
// Record the stop event
hipEventRecord(stop, NULL);
hipEventSynchronize(stop);
hipEventElapsedTime(&eventMs, start, stop);
printf ("hipMemcpyHostToDevice time taken = %6.3fms\n", eventMs);
// Record the start event
hipEventRecord(start, NULL);
// Lauching kernel from host
hipLaunchKernel(matrixTranspose,
dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
0, 0,
gpuTransposeMatrix , gpuMatrix, WIDTH);
// Record the stop event
hipEventRecord(stop, NULL);
hipEventSynchronize(stop);
hipEventElapsedTime(&eventMs, start, stop);
}
// show how to use explicit begin/end markers:
// We begin the timed region with HIP_BEGIN_MARKER, passing in the markerName and group:
// The region will stop when HIP_END_MARKER is called
// This is another way to mark begin/end - as an alternative to scoped markers.
HIP_BEGIN_MARKER("Check&TearDown", "MyGroup");
int errors = 0;
{
HIP_SCOPED_MARKER("Teardown", "App");
// CPU MatrixTranspose computation
matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
printf ("kernel Execution time = %6.3fms\n", eventMs);
// Record the start event
hipEventRecord(start, NULL);
// Memory transfer from device to host
hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
// Record the stop event
hipEventRecord(stop, NULL);
hipEventSynchronize(stop);
hipEventElapsedTime(&eventMs, start, stop);
printf ("hipMemcpyDeviceToHost time taken = %6.3fms\n", eventMs);
// CPU MatrixTranspose computation
matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
// verify the results
double eps = 1.0E-6;
for (int i = 0; i < NUM; i++) {
if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
errors++;
}
}
if (errors!=0) {
printf("FAILED: %d errors\n",errors);
} else {
printf ("PASSED!\n");
}
//free the resources on device side
hipFree(gpuMatrix);
hipFree(gpuTransposeMatrix);
//free the resources on host side
free(Matrix);
free(TransposeMatrix);
free(cpuTransposeMatrix);
// verify the results
double eps = 1.0E-6;
for (int i = 0; i < NUM; i++) {
if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
errors++;
}
}
if (errors!=0) {
printf("FAILED: %d errors\n",errors);
} else {
printf ("PASSED!\n");
}
//free the resources on device side
hipFree(gpuMatrix);
hipFree(gpuTransposeMatrix);
//free the resources on host side
free(Matrix);
free(TransposeMatrix);
free(cpuTransposeMatrix);
HIP_END_MARKER();