SWDEV-546631 - Fix hipLaunchHostFunction in stream capture for windows (#654)

Este commit está contenido en:
Ioannis Assiouras
2025-08-28 07:51:50 +01:00
cometido por GitHub
padre 4d98a0169f
commit 1017532916
Se han modificado 2 ficheros con 80 adiciones y 0 borrados
@@ -2231,6 +2231,7 @@ class GraphHostNode : public GraphNode {
ClPrint(amd::LOG_ERROR, amd::LOG_CODE, "[hipGraph] Failed during block command creation");
}
block_command->enqueue();
block_command->notifyCmdQueue();
block_command->release();
commands_[0]->release();
}
@@ -173,6 +173,85 @@ TEST_CASE("Unit_hipLaunchHostFunc_Positive_Thread") {
HIP_CHECK(hipGraphExecDestroy(graphExec));
HIP_CHECK(hipGraphDestroy(graph));
}
namespace {
__global__ void kernelA(double* arrayA, size_t size) {
const size_t x = threadIdx.x + blockDim.x * blockIdx.x;
if (x < size) {
arrayA[x] *= 2.0;
}
}
struct set_vector_args {
std::vector<double>& h_array;
double value;
};
static void set_vector(void* args) {
set_vector_args h_args{*(reinterpret_cast<set_vector_args*>(args))};
std::vector<double>& vec{h_args.h_array};
vec.assign(vec.size(), h_args.value);
}
} // namespace
TEST_CASE("Unit_hipLaunchHostFunc_H2D_Kernel_D2H_Capture") {
constexpr int numOfBlocks = 1024;
constexpr int threadsPerBlock = 1024;
constexpr size_t arraySize = 1U << 20; // 1,048,576
constexpr double initValue = 2.0;
double* d_arrayA = nullptr;
std::vector<double> h_array(arraySize);
hipStream_t captureStream{};
HIP_CHECK(hipStreamCreate(&captureStream));
// Begin stream capture
HIP_CHECK(hipStreamBeginCapture(captureStream, hipStreamCaptureModeGlobal));
// Device alloc (async so it belongs to the captured stream)
HIP_CHECK(hipMallocAsync(&d_arrayA, arraySize * sizeof(double), captureStream));
// Initialize host data via a host function in the stream
set_vector_args args{h_array, initValue};
HIP_CHECK(hipLaunchHostFunc(captureStream, set_vector, &args));
// HtoD copy
HIP_CHECK(hipMemcpyAsync(d_arrayA, h_array.data(), arraySize * sizeof(double),
hipMemcpyHostToDevice, captureStream));
// KernelA only
kernelA<<<numOfBlocks, threadsPerBlock, 0, captureStream>>>(d_arrayA, arraySize);
HIP_CHECK(hipGetLastError());
// DtoH copy
HIP_CHECK(hipMemcpyAsync(h_array.data(), d_arrayA, arraySize * sizeof(double),
hipMemcpyDeviceToHost, captureStream));
// Free device memory inside the graph
HIP_CHECK(hipFreeAsync(d_arrayA, captureStream));
// End capture -> graph
hipGraph_t graph{};
HIP_CHECK(hipStreamEndCapture(captureStream, &graph));
// Instantiate and launch
hipGraphExec_t graphExec{};
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
HIP_CHECK(hipGraphDestroy(graph));
HIP_CHECK(hipGraphLaunch(graphExec, captureStream));
HIP_CHECK(hipStreamSynchronize(captureStream));
// Validate: each element should be initValue * 2.0
const double expected = initValue * 2.0;
for (size_t i = 0; i < arraySize; ++i) {
REQUIRE(h_array[i] == expected);
}
// Cleanup
HIP_CHECK(hipGraphExecDestroy(graphExec));
HIP_CHECK(hipStreamDestroy(captureStream));
}
/**
* End doxygen group GraphTest.