4d98a0169f
* Updated stream code to handle special cases when stream value is 0x01 or 0x02 * Removed extra definitions and updated tests to account for special case * Modified stream.cpp so that each thread assigned a unique stream ID when hipStreamPerThread is used as stream value. Modified tests to check that threads are assigned unique, repeated values when hipStreamPerThread is called * Updated idx_offset, stream_map, and thread counter to be in one struct. * Update stream.cpp to only use add_stream() and update tests for seperate unit test for hipStreamPerThread * Remove unecessary comment * Removed unecessary line * Updated tests and stream.cpp to update stream ID correctly * Updated test structure
164 строки
5.9 KiB
C++
164 строки
5.9 KiB
C++
// MIT License
|
|
//
|
|
// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
|
|
//
|
|
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
// of this software and associated documentation files (the "Software"), to deal
|
|
// in the Software without restriction, including without limitation the rights
|
|
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
// copies of the Software, and to permit persons to whom the Software is
|
|
// furnished to do so, subject to the following conditions:
|
|
//
|
|
// The above copyright notice and this permission notice shall be included in
|
|
// all copies or substantial portions of the Software.
|
|
//
|
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
// THE SOFTWARE.
|
|
|
|
#include <math.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include "hip/hip_runtime.h"
|
|
|
|
#define BLOCKDIM 64
|
|
|
|
/* Macro for checking GPU API return values */
|
|
#define HIP_ASSERT(call) \
|
|
do \
|
|
{ \
|
|
hipError_t gpuErr = call; \
|
|
if(hipSuccess != gpuErr) \
|
|
{ \
|
|
printf( \
|
|
"GPU API Error - %s:%d: '%s'\n", __FILE__, __LINE__, hipGetErrorString(gpuErr)); \
|
|
exit(1); \
|
|
} \
|
|
} while(0)
|
|
|
|
// HIP kernel. Each thread takes care of one element of input
|
|
__global__ void
|
|
cube(double* input, double* output, size_t offset, size_t elements_per_stream)
|
|
{
|
|
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
|
|
size_t gstride = blockDim.x * gridDim.x;
|
|
|
|
// Span all elements assigned to this stream
|
|
for(size_t id = tid + offset; id < offset + elements_per_stream; id += gstride)
|
|
for(size_t i = 0; i < 1000; ++i)
|
|
output[id] = input[id] * input[id] * input[id];
|
|
}
|
|
|
|
int
|
|
main()
|
|
{
|
|
// number of streams
|
|
const int num_streams = 8;
|
|
|
|
// Number of threads in each thread block
|
|
const int blockSize = 512;
|
|
|
|
// Size of vectors
|
|
int n = 100000000;
|
|
int elements_per_stream = n / num_streams;
|
|
int bytes_per_stream = elements_per_stream * sizeof(double);
|
|
|
|
// Host input vectors
|
|
double* h_input1{nullptr};
|
|
// Host output vector
|
|
double* h_output1{nullptr};
|
|
|
|
// Device input vectors
|
|
double* d_input1{nullptr};
|
|
// Device output vector
|
|
double* d_output1{nullptr};
|
|
|
|
// Creating events for timers
|
|
hipEvent_t start{}, stop{};
|
|
HIP_ASSERT(hipEventCreate(&start));
|
|
HIP_ASSERT(hipEventCreate(&stop));
|
|
|
|
// Creating streams
|
|
hipStream_t streams[num_streams];
|
|
for(int i = 0; i < num_streams; ++i)
|
|
{
|
|
HIP_ASSERT(hipStreamCreate(&streams[i]));
|
|
}
|
|
|
|
// Size, in bytes, of each vector
|
|
size_t bytes = n * sizeof(double);
|
|
|
|
// Allocate page locked memory for these vectors on host
|
|
HIP_ASSERT(hipHostMalloc(&h_input1, bytes));
|
|
HIP_ASSERT(hipHostMalloc(&h_output1, bytes));
|
|
|
|
// Allocate memory for each vector on GPU
|
|
HIP_ASSERT(hipMalloc(&d_input1, bytes));
|
|
HIP_ASSERT(hipMalloc(&d_output1, bytes));
|
|
|
|
// Initialize vectors on host
|
|
for(int i = 0; i < n; i++)
|
|
{
|
|
h_input1[i] = sin(i);
|
|
}
|
|
|
|
// Number of thread blocks in grid
|
|
const int gridSizePerStream = 104; //(int)ceil((float)elements_per_stream/blockSize);
|
|
|
|
HIP_ASSERT(hipEventRecord(start));
|
|
// Extra copy with null stream
|
|
HIP_ASSERT(hipMemcpyAsync(
|
|
&d_input1[0], &h_input1[0], bytes_per_stream, hipMemcpyHostToDevice, nullptr));
|
|
// split H2D copies and kernel calls into separate loops
|
|
for(int i = 0; i < num_streams; i++)
|
|
{
|
|
int offset = i * elements_per_stream;
|
|
HIP_ASSERT(hipMemcpyAsync(&d_input1[offset],
|
|
&h_input1[offset],
|
|
bytes_per_stream,
|
|
hipMemcpyHostToDevice,
|
|
streams[i]));
|
|
}
|
|
for(int i = 0; i < num_streams; i++)
|
|
{
|
|
int offset = i * elements_per_stream;
|
|
cube<<<gridSizePerStream, blockSize, 0, streams[i]>>>(
|
|
d_input1, d_output1, offset, elements_per_stream);
|
|
}
|
|
for(int i = 0; i < num_streams; i++)
|
|
{
|
|
int offset = i * elements_per_stream;
|
|
HIP_ASSERT(hipMemcpyAsync(&h_output1[offset],
|
|
&d_output1[offset],
|
|
bytes_per_stream,
|
|
hipMemcpyDeviceToHost,
|
|
streams[i]));
|
|
}
|
|
|
|
HIP_ASSERT(hipEventRecord(stop));
|
|
HIP_ASSERT(hipEventSynchronize(stop));
|
|
|
|
float milliseconds = 0;
|
|
HIP_ASSERT(hipEventElapsedTime(&milliseconds, start, stop));
|
|
|
|
// Release device memory
|
|
HIP_ASSERT(hipFree(d_input1));
|
|
HIP_ASSERT(hipFree(d_output1));
|
|
|
|
// Release host memory
|
|
HIP_ASSERT(hipHostFree(h_input1));
|
|
HIP_ASSERT(hipHostFree(h_output1));
|
|
|
|
// Destroy streams
|
|
for(int i = 0; i < num_streams; ++i)
|
|
{
|
|
HIP_ASSERT(hipStreamDestroy(streams[i]));
|
|
}
|
|
|
|
return 0;
|
|
}
|