Files
itrowbri e7a26594b7 [rocprofiler-sdk] Fix Stream ID Error for Attachment (#1142)
* Changed stream error warning, remove regex search from attach execute test

* Formatting

* Revert accidental change

* Fix stream hang error due to grabbing same lock twice

* Updated add stream code, need to update tests

* Update attachment tests to use streams, threads, and multiple devices

* Update tests and fix stream issues

* Updated error messages to be more explicit, updated json to csv code in conftest to include streams and threads

* Formatting

* Add attachment label to attachment tests and update validation to fix errors

* Fix attach twice conftest

* Disabled thread san tests for attachment since they no longer work with bin file changes

* Updated for comment

* Added null check for getting attach status
2025-10-17 16:34:05 -05:00

211 строки
7.6 KiB
C++

// MIT License
//
// Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#include <hip/hip_runtime.h>
#include <rocprofiler-sdk-roctx/roctx.h>
#include <unistd.h>
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <string>
#include <thread>
#include <vector>
/* Macro for checking GPU API return values */
#define HIP_ASSERT(call) \
do \
{ \
hipError_t gpuErr = call; \
if(hipSuccess != gpuErr) \
{ \
printf( \
"GPU API Error - %s:%d: '%s'\n", __FILE__, __LINE__, hipGetErrorString(gpuErr)); \
exit(1); \
} \
} while(0)
__global__ void
simple_kernel(float* data, int size)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if(idx < size)
{
data[idx] = data[idx] * 2.0f + 1.0f;
}
}
void
execute_kernels(const size_t tid,
const hipStream_t stream,
const size_t stream_id,
const size_t device_id)
{
// Set device
HIP_ASSERT(hipSetDevice(device_id));
// Allocate memory
const int size = 1024 * 1024; // 1M elements
const size_t bytes = size * sizeof(float);
float* h_data = new float[size];
float* d_data = nullptr;
HIP_ASSERT(hipMalloc(&d_data, bytes));
// Initialize data
for(int i = 0; i < size; ++i)
{
h_data[i] = static_cast<float>(i);
}
// Run kernels in a loop for a while
std::cout << "Starting kernel execution loop for thread " << tid << " with stream " << stream_id
<< " on device " << device_id << "...\n";
const int num_iterations = 30;
for(int iter = 0; iter < num_iterations; ++iter)
{
// Add ROCTX markers for better profiling
std::string range_name = "Iteration_" + std::to_string(iter + 1);
roctxRangePush(range_name.c_str()); // Removed - ROCTx not linked
// Copy data to device
roctxMark("Start_H2D_Copy");
auto err = hipMemcpyAsync(d_data, h_data, bytes, hipMemcpyHostToDevice, stream);
if(err != hipSuccess)
{
std::cerr << "Failed to copy data for thread " << tid << " with stream " << stream_id
<< " on device " << device_id << "...\n";
roctxRangePop(); // Removed - ROCTx not linked
break;
}
// Launch kernel
roctxMark("Launch_Kernel");
int threads_per_block = 256;
int blocks_per_grid = (size + threads_per_block - 1) / threads_per_block;
hipLaunchKernelGGL(
simple_kernel, dim3(blocks_per_grid), dim3(threads_per_block), 0, stream, d_data, size);
// Copy data back
roctxMark("Start_D2H_Copy");
err = hipMemcpyAsync(h_data, d_data, bytes, hipMemcpyDeviceToHost, stream);
if(err != hipSuccess)
{
std::cerr << "Failed to copy data for thread " << tid << " with stream " << stream_id
<< " on device " << device_id << "...\n";
roctxRangePop(); // Removed - ROCTx not linked
break;
}
// Wait for completion
roctxMark("Stream_Synchronize");
err = hipStreamSynchronize(stream);
if(err != hipSuccess)
{
std::cerr << "Failed to synchronize stream " << stream_id << " with thread " << tid
<< " on device " << device_id << "...\n";
roctxRangePop(); // Removed - ROCTx not linked
break;
}
roctxRangePop(); // Removed - ROCTx not linked
// Small delay between iterations
std::this_thread::sleep_for(std::chrono::milliseconds(500));
}
std::cout << "Kernel execution loop completed for thread " << tid << " with stream "
<< stream_id << " on device " << device_id << "...\n";
// Cleanup
HIP_ASSERT(hipFree(d_data));
delete[] h_data;
}
int
main(int argc, char** argv)
{
size_t nthreads{32};
size_t nstreams{8};
int ndevices{0};
for(int i = 1; i < argc; ++i)
{
auto _arg = std::string{argv[i]};
if(_arg == "?" || _arg == "-h" || _arg == "--help")
{
fprintf(stderr,
"usage: attachment-test [NUM_THREADS (%zu)] [NUM_STREAMS (%zu)] "
"[NUM_DEVICES (%d)]\n",
nthreads,
nstreams,
ndevices);
exit(EXIT_SUCCESS);
}
}
if(argc > 1) nthreads = std::atoll(argv[1]);
if(argc > 2) nstreams = std::atoll(argv[2]);
if(argc > 3) ndevices = std::stoi(argv[3]);
std::cout << "Attachment test app started with PID: " << getpid() << std::endl;
// Initialize HIP
int device_count = 0;
HIP_ASSERT(hipGetDeviceCount(&device_count));
if(device_count == 0)
{
std::cerr << "No HIP devices found or error getting device count" << std::endl;
return 1;
}
// Default ndecives to device_count. Ensure that we do not use more devices than are available
ndevices = ndevices == 0 ? device_count : ndevices;
if(ndevices > device_count)
{
std::cout << "Using " << device_count << " HIP devices instead of the requested "
<< ndevices << "\n";
ndevices = device_count;
}
std::cout << "After first call " << getpid() << std::endl;
auto _threads = std::vector<std::thread>{};
auto _streams = std::vector<hipStream_t>(nstreams);
_threads.reserve(nthreads);
for(auto& itr : _streams)
HIP_ASSERT(hipStreamCreate(&itr));
for(size_t i = 0; i < nthreads; ++i)
_threads.emplace_back(
execute_kernels, i, _streams.at(i % nstreams), i % nstreams, i % ndevices);
for(auto& itr : _threads)
itr.join();
// Destroy streams
for(auto itr : _streams)
HIP_ASSERT(hipStreamDestroy(itr));
std::cout << "Attachment test app finished" << std::endl;
return 0;
}