diff --git a/rocclr/device/rocm/rocblit.cpp b/rocclr/device/rocm/rocblit.cpp
index 3a7285c2ad..7d12d50dc4 100755
--- a/rocclr/device/rocm/rocblit.cpp
+++ b/rocclr/device/rocm/rocblit.cpp
@@ -1658,8 +1658,8 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
   if ((srcMemory.owner()->getHostMem() == nullptr) && (srcMemory.owner()->getSvmPtr() != nullptr)) {
     // CPU read ahead, hence release GPU memory
     gpu().releaseGpuMemoryFence();
-    void* src = srcMemory.owner()->getSvmPtr();
-    std::memcpy(dstHost, src, size[0]);
+    char* src = reinterpret_cast<char*>(srcMemory.owner()->getSvmPtr());
+    std::memcpy(dstHost, src + origin[0], size[0]);
     // Set HASPENDINGDISPATCH_ FLAG. That will force L2 invalidation on flush
     gpu().hasPendingDispatch();
     return true;
@@ -1763,8 +1763,8 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
   if ((dstMemory.owner()->getHostMem() == nullptr) && (dstMemory.owner()->getSvmPtr() != nullptr)) {
     // CPU read ahead, hence release GPU memory
     gpu().releaseGpuMemoryFence();
-    void* dst = dstMemory.owner()->getSvmPtr();
-    std::memcpy(dst, srcHost, size[0]);
+    char* dst = reinterpret_cast<char*>(dstMemory.owner()->getSvmPtr());
+    std::memcpy(dst + origin[0], srcHost, size[0]);
     // Set HASPENDINGDISPATCH_ FLAG. Then releaseGpuMemoryFence() will use barrier to invalidate cache
     gpu().hasPendingDispatch();
     gpu().releaseGpuMemoryFence();
diff --git a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp
index f15fc0449f..aea8c403c7 100755
--- a/rocclr/device/rocm/rocdevice.cpp
+++ b/rocclr/device/rocm/rocdevice.cpp
@@ -906,8 +906,7 @@ hsa_status_t Device::iterateGpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, vo
         if (tmp == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) {
           dev->info_.largeBar_ = false;
         } else {
-          // Disable smallCopy optimization for now
-          dev->info_.largeBar_ = false;
+          dev->info_.largeBar_ = true;
         }
       }
 