SWDEV-549309 - Ensure that the kernel is issued to a stream that is ass… (#552)

* SWDEV-549309 - Ensure that the kernel is issued to a stream that is associated to the current device

* SWDEV-549309 - Ensure that the kernel is issued to a stream that is associated to the current device in atomic tests

* SWDEV-549309 - Added fix for Unit_hipMallocFromPoolAsync_Multidevice_Concurrent test

* SWDEV-549309 - Added fix for Unit_hipMemcpyPeerAsync_Positive_Default

* SWDEV-549309 - Added fix for Unit_hipStreamAttachMemAsync_Positive_AttachGlobal and Unit_hipGetProcAddress_GraphAPIs_AddMemsetMemcpyNodes

[ROCm/hip-tests commit: d21a95cea8]
Этот коммит содержится в:
Assiouras, Ioannis
2025-08-14 07:20:56 +01:00
коммит произвёл GitHub
родитель 17d12dff14
Коммит 5283a114b2
15 изменённых файлов: 38 добавлений и 21 удалений
+1
Просмотреть файл
@@ -446,6 +446,7 @@ void TestCore(const TestParams& p) {
// Launch Kernel
for (auto i = 0u; i < p.num_devices; ++i) {
HIP_CHECK(hipSetDevice(i));
for (auto j = 0u; j < p.kernel_count; ++j) {
const auto& stream = streams[i * p.kernel_count + j].stream();
const auto old_vals = old_vals_devs[i].ptr() + j * p.ThreadCount();
+1
Просмотреть файл
@@ -272,6 +272,7 @@ void TestCore(const TestParams& p) {
}
// Launch Kernel and get back old vals
for (auto i = 0u; i < p.num_devices; ++i) {
HIP_CHECK(hipSetDevice(i));
for (auto j = 0u; j < p.kernel_count; ++j) {
const auto& stream = streams[i * p.kernel_count + j].stream();
const auto old_vals = old_vals_devs[i].ptr() + j * p.ThreadCount();
+1
Просмотреть файл
@@ -302,6 +302,7 @@ void TestCore(const TestParams& p) {
// Launch kernel
for (auto i = 0u; i < p.num_devices; ++i) {
HIP_CHECK(hipSetDevice(i));
for (auto j = 0u; j < p.kernel_count; ++j) {
const auto& stream = streams[i * p.kernel_count + j].stream();
const auto old_vals = old_vals_devs[i].ptr() + j * p.ThreadCount();
-2
Просмотреть файл
@@ -375,9 +375,7 @@ TEST_CASE("Unit_hipGetProcAddress_GraphAPIs_AddMemsetMemcpyNodes") {
hipGraphExec_t graphExec;
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
HIP_CHECK(hipGraphLaunch(graphExec, 0));
#ifdef _WIN32
HIP_CHECK(hipStreamSynchronize(0));
#endif
REQUIRE(validateArrayT<char>(hostMemDst, N, value) == true);
+7 -7
Просмотреть файл
@@ -319,8 +319,8 @@ TEST_CASE("Unit_hipMallocAsync_Multidevice") {
* - HIP_VERSION >= 6.2
*/
#if HT_AMD
static void threadQAsyncCommands(streamMemAllocTest* testObj,
hipStream_t strm) {
static void threadQAsyncCommands(streamMemAllocTest* testObj, hipStream_t strm, int idx) {
HIP_CHECK(hipSetDevice(idx));
// Create host buffer with test data.
testObj->createHostBufferWithData();
// Allocate device memory and transfer data to it asyncronously on stream.
@@ -350,7 +350,7 @@ TEST_CASE("Unit_hipMallocAsync_Multidevice_Concurrent") {
// Queue commands in each device
for (int idx = 0; idx < num_devices; idx++) {
HIP_CHECK(hipSetDevice(idx));
std::thread test(threadQAsyncCommands, tesObjBuf[idx], stream_buf[idx]);
std::thread test(threadQAsyncCommands, tesObjBuf[idx], stream_buf[idx], idx);
test.join();
}
// Wait for the streams
@@ -405,10 +405,10 @@ TEST_CASE("Unit_hipMallocAsync_Multidevice_MultiStream") {
// Queue commands in each device
for (int idx = 0; idx < num_devices; idx++) {
HIP_CHECK(hipSetDevice(idx));
std::thread test1(threadQAsyncCommands, tesObjBuf[streamPerAsic*idx],
stream_buf[streamPerAsic*idx]);
std::thread test2(threadQAsyncCommands, tesObjBuf[streamPerAsic*idx + 1],
stream_buf[streamPerAsic*idx + 1]);
std::thread test1(threadQAsyncCommands, tesObjBuf[streamPerAsic * idx],
stream_buf[streamPerAsic * idx], idx);
std::thread test2(threadQAsyncCommands, tesObjBuf[streamPerAsic * idx + 1],
stream_buf[streamPerAsic * idx + 1], idx);
test1.join();
test2.join();
}
+7 -7
Просмотреть файл
@@ -371,8 +371,8 @@ TEST_CASE("Unit_hipMallocFromPoolAsync_ReleaseThreshold_Mgpu") {
/**
* Local Thread Functions
*/
static void threadQAsyncCommands(streamMemAllocTest* testObj,
hipStream_t strm) {
static void threadQAsyncCommands(streamMemAllocTest* testObj, hipStream_t strm, int idx) {
HIP_CHECK(hipSetDevice(idx));
// Create host buffer with test data.
testObj->createHostBufferWithData();
// Allocate device memory and transfer data to it asyncronously on stream.
@@ -616,7 +616,7 @@ TEST_CASE("Unit_hipMallocFromPoolAsync_Multidevice_Concurrent") {
// Queue commands in each device
for (int idx = 0; idx < num_devices; idx++) {
HIP_CHECK(hipSetDevice(idx));
std::thread test(threadQAsyncCommands, tesObjBuf[idx], stream_buf[idx]);
std::thread test(threadQAsyncCommands, tesObjBuf[idx], stream_buf[idx], idx);
test.join();
}
// Wait for the streams
@@ -675,10 +675,10 @@ TEST_CASE("Unit_hipMallocFromPoolAsync_Multidevice_MultiStream") {
// Queue commands in each device
for (int idx = 0; idx < num_devices; idx++) {
HIP_CHECK(hipSetDevice(idx));
std::thread test1(threadQAsyncCommands, tesObjBuf[streamPerAsic*idx],
stream_buf[streamPerAsic*idx]);
std::thread test2(threadQAsyncCommands, tesObjBuf[streamPerAsic*idx + 1],
stream_buf[streamPerAsic*idx + 1]);
std::thread test1(threadQAsyncCommands, tesObjBuf[streamPerAsic * idx],
stream_buf[streamPerAsic * idx], idx);
std::thread test2(threadQAsyncCommands, tesObjBuf[streamPerAsic * idx + 1],
stream_buf[streamPerAsic * idx + 1], idx);
test1.join();
test2.join();
}
+5 -2
Просмотреть файл
@@ -836,7 +836,6 @@ TEST_CASE("Unit_hipMemAdvise_ReadMosltyMgpuTst") {
int *Hmm = NULL, NumElms = (1024 * 1024), InitVal = 123, blockSize = 64;
int *Hmm1 = NULL, DataMismatch = 0;
hipStream_t strm;
HIP_CHECK(hipStreamCreate(&strm));
HIP_CHECK(hipMallocManaged(&Hmm, (NumElms * sizeof(int))));
// Initializing memory
for (int i = 0; i < NumElms; ++i) {
@@ -852,6 +851,7 @@ TEST_CASE("Unit_hipMemAdvise_ReadMosltyMgpuTst") {
for (int i = 1; i < Ngpus; ++i) {
DataMismatch = 0;
HIP_CHECK(hipSetDevice(i));
HIP_CHECK(hipStreamCreate(&strm));
HIP_CHECK(hipMallocManaged(&Hmm1, (NumElms * sizeof(int))));
MemAdvise3<<<dimGrid, dimBlock, 0, strm>>>(Hmm, Hmm1, NumElms);
HIP_CHECK(hipStreamSynchronize(strm));
@@ -865,6 +865,7 @@ TEST_CASE("Unit_hipMemAdvise_ReadMosltyMgpuTst") {
WARN("DataMismatch is observed with the gpu: " << i);
REQUIRE(false);
}
HIP_CHECK(hipStreamDestroy(strm));
HIP_CHECK(hipFree(Hmm1));
}
}
@@ -873,10 +874,12 @@ TEST_CASE("Unit_hipMemAdvise_ReadMosltyMgpuTst") {
for (int i = 0; i < Ngpus; ++i) {
DataMismatch = 0;
HIP_CHECK(hipSetDevice(i));
HIP_CHECK(hipStreamCreate(&strm));
HIP_CHECK(hipMemAdvise(Hmm, (NumElms * sizeof(int)),
hipMemAdviseSetReadMostly, i));
MemAdvise2<<<dimGrid, dimBlock, 0, strm>>>(Hmm, NumElms);
HIP_CHECK(hipStreamSynchronize(strm));
HIP_CHECK(hipStreamDestroy(strm));
}
// verifying the final result
for (int i = 0; i < NumElms; ++i) {
@@ -892,7 +895,7 @@ TEST_CASE("Unit_hipMemAdvise_ReadMosltyMgpuTst") {
}
#endif
HIP_CHECK(hipFree(Hmm));
HIP_CHECK(hipStreamDestroy(strm));
} else {
SUCCEED("GPU 0 doesn't support hipDeviceAttributeManagedMemory "
"attribute. Hence skipping the testing with Pass result.\n");
+5 -3
Просмотреть файл
@@ -51,9 +51,6 @@ TEST_CASE("Unit_hipMemcpyPeerAsync_Positive_Default") {
HipTest::HIP_SKIP_TEST("Skipping because devices < 2");
return;
}
const auto stream_type = GENERATE(Streams::nullstream, Streams::perThread, Streams::created);
const StreamGuard stream_guard(stream_type);
const hipStream_t stream = stream_guard.stream();
const auto allocation_size = GENERATE(kPageSize / 2, kPageSize, kPageSize * 2);
@@ -64,6 +61,11 @@ TEST_CASE("Unit_hipMemcpyPeerAsync_Positive_Default") {
INFO("Src device: " << src_device << ", Dst device: " << dst_device);
HIP_CHECK(hipSetDevice(src_device));
const auto stream_type = GENERATE(Streams::nullstream, Streams::perThread, Streams::created);
const StreamGuard stream_guard(stream_type);
const hipStream_t stream = stream_guard.stream();
HIP_CHECK(hipDeviceCanAccessPeer(&can_access_peer, src_device, dst_device));
if (can_access_peer) {
HIP_CHECK(hipDeviceEnablePeerAccess(dst_device, 0));
+1
Просмотреть файл
@@ -510,6 +510,7 @@ void HipMemcpyWithStreamMultiThreadtests::TestkindDefaultForDtoD(bool& val_res)
}
for (int i = 0; i < numDevices; ++i) {
HIP_CHECK_THREAD(hipSetDevice(i));
hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, stream[i],
static_cast<const int*>(A_d[i]), static_cast<const int*>(B_d[i]), C_d[i], N);
HIP_CHECK_THREAD(hipGetLastError());
+1
Просмотреть файл
@@ -475,6 +475,7 @@ void TestkindDefaultForDtoD(void) {
}
for (int i=0; i < NumDevices; ++i) {
HIP_CHECK(hipSetDevice(i));
hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks),
dim3(threadsPerBlock),
0, stream[i], static_cast<const int*>(A_d[i]),
+2
Просмотреть файл
@@ -109,6 +109,7 @@ TEST_CASE("test_svm_byte_granularity") {
// get all the devices going simultaneously
for(unsigned int d = 0; d < num_devices; d++) // device ids starting at 1.
{
HIP_CHECK(hipSetDevice(d));
write_owned_locations<<<num_elements, 1, 0, streams[d]>>>(pA, num_devices_plus_host, d);
HIP_CHECK(hipGetLastError());
}
@@ -125,6 +126,7 @@ TEST_CASE("test_svm_byte_granularity") {
size_t adjusted_num_elements = num_elements - num_devices;
for(unsigned int d = 0; d < num_devices; d++)
{
HIP_CHECK(hipSetDevice(d));
sum_neighbor_locations<<<adjusted_num_elements, 1, 0, streams[d]>>>(pA, num_devices_plus_host,
error_counts[d]);
HIP_CHECK(hipGetLastError());
+1
Просмотреть файл
@@ -129,6 +129,7 @@ void launch_kernels_and_verify(std::vector<hipStream_t> &streams, unsigned int n
// all the pixels.
for(unsigned int d=0; d < num_devices; d++)
{
HIP_CHECK(hipSetDevice(d));
build_hash_table_on_device<<<(num_pixels + 255) / 256, 256, 0, streams[d]>>>(
pInputImage, num_pixels, pNodes, pNumNodes, numBins, d);
HIP_CHECK(hipGetLastError());
+2
Просмотреть файл
@@ -208,6 +208,7 @@ TEST_CASE("test_svm_shared_address_space_fine_grain_buffers") {
}
else
{
HIP_CHECK(hipSetDevice(ci));
create_linked_lists_on_device(streams[ci], pNodes, pAllocator, numLists,
ListLength);
}
@@ -218,6 +219,7 @@ TEST_CASE("test_svm_shared_address_space_fine_grain_buffers") {
}
else
{
HIP_CHECK(hipSetDevice(vi));
verify_linked_lists_on_device(streams[vi], pNodes, pNumCorrect, numLists,
ListLength);
}
+3
Просмотреть файл
@@ -87,6 +87,9 @@ TEST_CASE("Unit_hipStreamAttachMemAsync_Positive_AttachGlobal") {
HIP_CHECK(hipStreamSynchronize(nullptr));
for (int i = 0; i < stream_count; ++i) {
if (device_count > 1) {
HIP_CHECK(hipSetDevice(i));
}
HipTest::launchKernel(Set, 1, 1, 0, streams.at(i)->stream(), managed_global.ptr() + i, i);
}
+1
Просмотреть файл
@@ -407,6 +407,7 @@ class streamMemAllocTest {
dim3(THREADS_PER_BLOCK), 0, stream,
static_cast<const int*>(A_d),
static_cast<const int*>(B_d), C_d, size);
HIP_CHECK(hipGetLastError());
}
// Transfer data from device to host asynchronously.
void transferFromMempool(hipStream_t stream) {