From 741b4b9fdf52b502a447eb0b3f58c11f633dfd17 Mon Sep 17 00:00:00 2001 From: German Andryeyev <56892148+gandryey@users.noreply.github.com> Date: Mon, 29 Dec 2025 08:35:22 -0500 Subject: [PATCH] SWDEV-558849 - Fix Windows build for ROCR backend (#2368) --- projects/clr/rocclr/device/rocm/rocprintf.hpp | 6 ------ projects/clr/rocclr/device/rocm/rocvirtual.cpp | 10 ++++++++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/projects/clr/rocclr/device/rocm/rocprintf.hpp b/projects/clr/rocclr/device/rocm/rocprintf.hpp index 7f21305762..4c094f26f2 100644 --- a/projects/clr/rocclr/device/rocm/rocprintf.hpp +++ b/projects/clr/rocclr/device/rocm/rocprintf.hpp @@ -24,12 +24,6 @@ * @{ */ -#ifndef copysign -#ifdef _MSC_VER -#define copysign(X, Y) (_copysign(X, Y)) -#endif //_MSC_VER -#endif // copysign - //! GPU Device Implementation namespace amd::roc { diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index 23ca5559b4..ff8354e350 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -1273,7 +1273,7 @@ bool VirtualGPU::dispatchGenericAqlPacketBatch(const std::vector& pa const uint32_t queueMask = queueSize - 1; const uint32_t sw_queue_size = queueMask; const size_t numPackets = packets.size(); - size_t kMaxBatchSize = DEBUG_HIP_GRAPH_BATCH_SIZE; + const size_t kMaxBatchSize = DEBUG_HIP_GRAPH_BATCH_SIZE; const size_t kGpuLagPackets = 16; // Staggered copy pattern: powers of 2 (1, 2, 4, 8.. 
to DEBUG_HIP_GRAPH_BATCH_SIZE @@ -1281,9 +1281,15 @@ bool VirtualGPU::dispatchGenericAqlPacketBatch(const std::vector& pa size_t batchSize = 1; // Allocate arrays once outside the loop to avoid repeated stack allocations +#if IS_LINUX uint16_t validHeaders[kMaxBatchSize]; uint16_t validSetups[kMaxBatchSize]; - +#else + // Ensure we don't exceed reasonable stack allocation size on Windows + assert(kMaxBatchSize <= 1024 && "Batch size too large for stack allocation"); + uint16_t* validHeaders = static_cast<uint16_t*>(_alloca(kMaxBatchSize * sizeof(uint16_t))); + uint16_t* validSetups = static_cast<uint16_t*>(_alloca(kMaxBatchSize * sizeof(uint16_t))); +#endif while (processedPackets < numPackets) { uint64_t currentReadIndex = Hsa::queue_load_read_index_scacquire(gpu_queue_); uint64_t currentWriteIndex = Hsa::queue_load_write_index_relaxed(gpu_queue_);