diff --git a/projects/clr/rocclr/device/blitcl.cpp b/projects/clr/rocclr/device/blitcl.cpp index 5454ffb404..6c899c49c5 100644 --- a/projects/clr/rocclr/device/blitcl.cpp +++ b/projects/clr/rocclr/device/blitcl.cpp @@ -39,7 +39,7 @@ const char* BlitLinearSourceCode = BLIT_KERNELS( extern void __amd_copyBufferRectAligned(__global uint*, __global uint*, ulong4, ulong4, ulong4); - extern void __amd_streamOpsWrite(__global uint*, __global ulong*, ulong, ulong); + extern void __amd_streamOpsWrite(__global uint*, __global ulong*, ulong); extern void __amd_streamOpsWait(__global uint*, __global ulong*, ulong, ulong, ulong); @@ -165,8 +165,8 @@ const char* BlitLinearSourceCode = BLIT_KERNELS( const char* HipExtraSourceCode = BLIT_KERNELS( __kernel void __amd_rocclr_streamOpsWrite(__global uint* ptrInt, __global ulong* ptrUlong, - ulong value, ulong sizeBytes) { - __amd_streamOpsWrite(ptrInt, ptrUlong, value, sizeBytes); + ulong value) { + __amd_streamOpsWrite(ptrInt, ptrUlong, value); } __kernel void __amd_rocclr_streamOpsWait(__global uint* ptrInt, __global ulong* ptrUlong, @@ -186,8 +186,8 @@ const char* HipExtraSourceCode = BLIT_KERNELS( const char* HipExtraSourceCodeNoGWS = BLIT_KERNELS( __kernel void __amd_rocclr_streamOpsWrite(__global uint* ptrInt, __global ulong* ptrUlong, - ulong value, ulong sizeBytes) { - __amd_streamOpsWrite(ptrInt, ptrUlong, value, sizeBytes); + ulong value) { + __amd_streamOpsWrite(ptrInt, ptrUlong, value); } __kernel void __amd_rocclr_streamOpsWait(__global uint* ptrInt, __global ulong* ptrUlong, diff --git a/projects/clr/rocclr/device/pal/palblit.cpp b/projects/clr/rocclr/device/pal/palblit.cpp index d0e91884ab..14b77c83eb 100644 --- a/projects/clr/rocclr/device/pal/palblit.cpp +++ b/projects/clr/rocclr/device/pal/palblit.cpp @@ -2520,13 +2520,11 @@ bool KernelBlitManager::streamOpsWrite(device::Memory& memory, uint64_t value, if (is32BitWrite) { setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem, offset); setArgument(kernels_[blitType], 1, sizeof(cl_mem), nullptr); - setArgument(kernels_[blitType], 2, sizeof(uint32_t), &value); } else { setArgument(kernels_[blitType], 0, sizeof(cl_mem), nullptr); setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem, offset); - setArgument(kernels_[blitType], 2, sizeof(uint64_t), &value); } - setArgument(kernels_[blitType], 3, sizeof(size_t), &sizeBytes); + setArgument(kernels_[blitType], 2, sizeof(uint64_t), &value); // Create ND range object for the kernel's execution amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize); // Execute the blit @@ -2555,16 +2553,13 @@ bool KernelBlitManager::streamOpsWait(device::Memory& memory, uint64_t value, si if (is32BitWait) { setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem, offset); setArgument(kernels_[blitType], 1, sizeof(cl_mem), nullptr); - setArgument(kernels_[blitType], 2, sizeof(uint32_t), &value); - setArgument(kernels_[blitType], 3, sizeof(uint32_t), &flags); - setArgument(kernels_[blitType], 4, sizeof(uint32_t), &mask); } else { setArgument(kernels_[blitType], 0, sizeof(cl_mem), nullptr); setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem, offset); - setArgument(kernels_[blitType], 2, sizeof(uint64_t), &value); - setArgument(kernels_[blitType], 3, sizeof(uint64_t), &flags); - setArgument(kernels_[blitType], 4, sizeof(uint64_t), &mask); } + setArgument(kernels_[blitType], 2, sizeof(uint64_t), &value); + setArgument(kernels_[blitType], 3, sizeof(uint64_t), &flags); + setArgument(kernels_[blitType], 4, sizeof(uint64_t), &mask); // Create ND range object for the kernel's execution amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize); diff --git a/projects/clr/rocclr/device/pal/paldevice.cpp b/projects/clr/rocclr/device/pal/paldevice.cpp index 8dd007b09a..3b2b8e6429 100644 --- a/projects/clr/rocclr/device/pal/paldevice.cpp +++ b/projects/clr/rocclr/device/pal/paldevice.cpp @@ -660,6 +660,8 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp, info_.cooperativeGroups_ = settings().enableCoopGroups_; info_.cooperativeMultiDeviceGroups_ = settings().enableCoopMultiDeviceGroups_; + // Enable StreamWrite and StreamWait for all devices + info_.aqlBarrierValue_ = true; if (amd::IS_HIP) { info_.largeBar_ = false; diff --git a/projects/clr/rocclr/device/pal/palvirtual.cpp b/projects/clr/rocclr/device/pal/palvirtual.cpp index 0ae8b85afa..1378f2696c 100644 --- a/projects/clr/rocclr/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/device/pal/palvirtual.cpp @@ -1061,14 +1061,17 @@ VirtualGPU::~VirtualGPU() { amd::ScopedLock k(dev().lockAsyncOps()); amd::ScopedLock lock(dev().vgpusAccess()); - // Clear all timestamps, associated with this virtual GPU - auto& mgmt = *queues_[MainEngine]->aql_mgmt_; - for (uint32_t i = 0; i < AqlPacketMgmt::kAqlPacketsListSize; ++i) { - if (mgmt.aql_vgpus_[i] == this) { - mgmt.aql_vgpus_[i] = nullptr; - mgmt.aql_events_[i].invalidate(); + if (queues_[MainEngine] != nullptr) { + // Clear all timestamps, associated with this virtual GPU + auto& mgmt = *queues_[MainEngine]->aql_mgmt_; + for (uint32_t i = 0; i < AqlPacketMgmt::kAqlPacketsListSize; ++i) { + if (mgmt.aql_vgpus_[i] == this) { + mgmt.aql_vgpus_[i] = nullptr; + mgmt.aql_events_[i].invalidate(); + } } } + // Destroy RGP trace if (rgpCaptureEna()) { dev().rgpCaptureMgr()->FinishRGPTrace(this, true); diff --git a/projects/clr/rocclr/device/rocm/rocblit.cpp b/projects/clr/rocclr/device/rocm/rocblit.cpp index 4c6ac4e733..86bdff7c1f 100644 --- a/projects/clr/rocclr/device/rocm/rocblit.cpp +++ b/projects/clr/rocclr/device/rocm/rocblit.cpp @@ -2567,7 +2567,6 @@ bool KernelBlitManager::streamOpsWrite(device::Memory& memory, uint64_t value, setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem, offset); setArgument(kernels_[blitType], 2, sizeof(uint64_t), &value); } - setArgument(kernels_[blitType], 3, sizeof(size_t), &sizeBytes); // Create ND range object for the kernel's execution amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize); // Execute the blit