SWDEV-347670 - StreamWait and StreamWrite on Windows

The __amd_streamOpsWrite blit kernel in device-libs takes only 3 arguments,
so remove the unused 4th argument (sizeBytes).

Change-Id: I81cc1107f8b424bf58558c93a2495a1b878aef91


[ROCm/clr commit: e643406caa]
Este commit está contenido en:
Ajay
2024-01-15 17:12:45 -08:00
cometido por Ajay GunaShekar
padre d30e344ae0
commit e8a077dc68
Se han modificado 5 ficheros con 20 adiciones y 21 borrados
+5 -5
Ver fichero
@@ -39,7 +39,7 @@ const char* BlitLinearSourceCode = BLIT_KERNELS(
extern void __amd_copyBufferRectAligned(__global uint*, __global uint*, ulong4, ulong4, ulong4);
extern void __amd_streamOpsWrite(__global uint*, __global ulong*, ulong, ulong);
extern void __amd_streamOpsWrite(__global uint*, __global ulong*, ulong);
extern void __amd_streamOpsWait(__global uint*, __global ulong*, ulong, ulong, ulong);
@@ -165,8 +165,8 @@ const char* BlitLinearSourceCode = BLIT_KERNELS(
const char* HipExtraSourceCode = BLIT_KERNELS(
__kernel void __amd_rocclr_streamOpsWrite(__global uint* ptrInt, __global ulong* ptrUlong,
ulong value, ulong sizeBytes) {
__amd_streamOpsWrite(ptrInt, ptrUlong, value, sizeBytes);
ulong value) {
__amd_streamOpsWrite(ptrInt, ptrUlong, value);
}
__kernel void __amd_rocclr_streamOpsWait(__global uint* ptrInt, __global ulong* ptrUlong,
@@ -186,8 +186,8 @@ const char* HipExtraSourceCode = BLIT_KERNELS(
const char* HipExtraSourceCodeNoGWS = BLIT_KERNELS(
__kernel void __amd_rocclr_streamOpsWrite(__global uint* ptrInt, __global ulong* ptrUlong,
ulong value, ulong sizeBytes) {
__amd_streamOpsWrite(ptrInt, ptrUlong, value, sizeBytes);
ulong value) {
__amd_streamOpsWrite(ptrInt, ptrUlong, value);
}
__kernel void __amd_rocclr_streamOpsWait(__global uint* ptrInt, __global ulong* ptrUlong,
+4 -9
Ver fichero
@@ -2520,13 +2520,11 @@ bool KernelBlitManager::streamOpsWrite(device::Memory& memory, uint64_t value,
if (is32BitWrite) {
setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem, offset);
setArgument(kernels_[blitType], 1, sizeof(cl_mem), nullptr);
setArgument(kernels_[blitType], 2, sizeof(uint32_t), &value);
} else {
setArgument(kernels_[blitType], 0, sizeof(cl_mem), nullptr);
setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem, offset);
setArgument(kernels_[blitType], 2, sizeof(uint64_t), &value);
}
setArgument(kernels_[blitType], 3, sizeof(size_t), &sizeBytes);
setArgument(kernels_[blitType], 2, sizeof(uint64_t), &value);
// Create ND range object for the kernel's execution
amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize);
// Execute the blit
@@ -2555,16 +2553,13 @@ bool KernelBlitManager::streamOpsWait(device::Memory& memory, uint64_t value, si
if (is32BitWait) {
setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem, offset);
setArgument(kernels_[blitType], 1, sizeof(cl_mem), nullptr);
setArgument(kernels_[blitType], 2, sizeof(uint32_t), &value);
setArgument(kernels_[blitType], 3, sizeof(uint32_t), &flags);
setArgument(kernels_[blitType], 4, sizeof(uint32_t), &mask);
} else {
setArgument(kernels_[blitType], 0, sizeof(cl_mem), nullptr);
setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem, offset);
setArgument(kernels_[blitType], 2, sizeof(uint64_t), &value);
setArgument(kernels_[blitType], 3, sizeof(uint64_t), &flags);
setArgument(kernels_[blitType], 4, sizeof(uint64_t), &mask);
}
setArgument(kernels_[blitType], 2, sizeof(uint64_t), &value);
setArgument(kernels_[blitType], 3, sizeof(uint64_t), &flags);
setArgument(kernels_[blitType], 4, sizeof(uint64_t), &mask);
// Create ND range object for the kernel's execution
amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize);
@@ -660,6 +660,8 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,
info_.cooperativeGroups_ = settings().enableCoopGroups_;
info_.cooperativeMultiDeviceGroups_ = settings().enableCoopMultiDeviceGroups_;
// Enable StreamWrite and StreamWait for all devices
info_.aqlBarrierValue_ = true;
if (amd::IS_HIP) {
info_.largeBar_ = false;
+9 -6
Ver fichero
@@ -1061,14 +1061,17 @@ VirtualGPU::~VirtualGPU() {
amd::ScopedLock k(dev().lockAsyncOps());
amd::ScopedLock lock(dev().vgpusAccess());
// Clear all timestamps, associated with this virtual GPU
auto& mgmt = *queues_[MainEngine]->aql_mgmt_;
for (uint32_t i = 0; i < AqlPacketMgmt::kAqlPacketsListSize; ++i) {
if (mgmt.aql_vgpus_[i] == this) {
mgmt.aql_vgpus_[i] = nullptr;
mgmt.aql_events_[i].invalidate();
if (queues_[MainEngine] != nullptr) {
// Clear all timestamps, associated with this virtual GPU
auto& mgmt = *queues_[MainEngine]->aql_mgmt_;
for (uint32_t i = 0; i < AqlPacketMgmt::kAqlPacketsListSize; ++i) {
if (mgmt.aql_vgpus_[i] == this) {
mgmt.aql_vgpus_[i] = nullptr;
mgmt.aql_events_[i].invalidate();
}
}
}
// Destroy RGP trace
if (rgpCaptureEna()) {
dev().rgpCaptureMgr()->FinishRGPTrace(this, true);
@@ -2567,7 +2567,6 @@ bool KernelBlitManager::streamOpsWrite(device::Memory& memory, uint64_t value,
setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem, offset);
setArgument(kernels_[blitType], 2, sizeof(uint64_t), &value);
}
setArgument(kernels_[blitType], 3, sizeof(size_t), &sizeBytes);
// Create ND range object for the kernel's execution
amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize);
// Execute the blit