From fcf63c43c8ab213e7a6385118bd6e00a147100d2 Mon Sep 17 00:00:00 2001 From: foreman Date: Thu, 2 Feb 2017 18:23:12 -0500 Subject: [PATCH] P4 to Git Change 1368703 by todli@todli-win-opencl-kv1 on 2017/02/02 18:12:31 SWDEV-96241 - Support SDI on PAL (runtime changes) Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.cpp#10 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#23 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.cpp#16 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#42 edit [ROCm/clr commit: 1eda21898892da65786c260991678fe7737133a8] --- .../rocclr/runtime/device/pal/palmemory.cpp | 1 - .../rocclr/runtime/device/pal/palresource.cpp | 16 +++- .../rocclr/runtime/device/pal/palsettings.cpp | 7 +- .../rocclr/runtime/device/pal/palvirtual.cpp | 78 ++++++++----------- 4 files changed, 49 insertions(+), 53 deletions(-) diff --git a/projects/clr/rocclr/runtime/device/pal/palmemory.cpp b/projects/clr/rocclr/runtime/device/pal/palmemory.cpp index 8e1ab0a7ff..eb87409aef 100644 --- a/projects/clr/rocclr/runtime/device/pal/palmemory.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palmemory.cpp @@ -168,7 +168,6 @@ Memory::create( if (result) { switch (memoryType()) { case Resource::Pinned: - case Resource::ExternalPhysical: // Marks memory object for direct GPU access to the host memory flags_ |= HostMemoryDirectAccess; break; diff --git a/projects/clr/rocclr/runtime/device/pal/palresource.cpp b/projects/clr/rocclr/runtime/device/pal/palresource.cpp index b6557bf473..85663e5689 100644 --- a/projects/clr/rocclr/runtime/device/pal/palresource.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palresource.cpp @@ -399,9 +399,9 @@ Resource::memTypeToHeap(Pal::GpuMemoryCreateInfo* createInfo) createInfo->heaps[0] = Pal::GpuHeapGartCacheable; desc_.cardMemory_ = false; break; - case Shader: - case BusAddressable: case ExternalPhysical: + desc_.cardMemory_ = false; + case Shader: // Fall through to process the memory allocation ... case Local: createInfo->heapCount = 2; @@ -1075,6 +1075,18 @@ Resource::create(MemoryType memType, CreateParams* params) createInfo.alignment = MaxGpuAlignment; createInfo.vaRange = Pal::VaRange::Default; createInfo.priority = Pal::GpuMemPriority::Normal; + + if (memoryType() == ExternalPhysical){ + cl_bus_address_amd bus_address = + (reinterpret_cast(params->owner_))->busAddress(); + createInfo.surfaceBusAddr = bus_address.surface_bus_address; + createInfo.markerBusAddr = bus_address.marker_bus_address; + createInfo.flags.sdiExternal = true; + } + else if (memoryType() == BusAddressable){ + createInfo.flags.busAddressable = true; + } + memTypeToHeap(&createInfo); // createInfo.priority; memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment); diff --git a/projects/clr/rocclr/runtime/device/pal/palsettings.cpp b/projects/clr/rocclr/runtime/device/pal/palsettings.cpp index 284e14706d..54468881ee 100644 --- a/projects/clr/rocclr/runtime/device/pal/palsettings.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palsettings.cpp @@ -353,13 +353,12 @@ Settings::create( } #endif // !defined(WITH_LIGHTNING_COMPILER) -//! @todo -/* - if (calAttr.totalSDIHeap > 0) { + if (palProp.gpuMemoryProperties.busAddressableMemSize > 0) { //Enable bus addressable memory extension enableExtension(ClAMDBusAddressableMemory); } - +//! @todo +/* if (calAttr.longIdleDetect) { // KMD is unable to detect if we map the visible memory for CPU access, so // accessing persistent staged buffer may fail if LongIdleDetct is enabled. diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp index 54bee9dbfa..caac438bcb 100644 --- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp @@ -1264,6 +1264,7 @@ VirtualGPU::copyMemory(cl_command_type type // Check if HW can be used for memory copy switch (type) { + case CL_COMMAND_MAKE_BUFFERS_RESIDENT_AMD: case CL_COMMAND_SVM_MEMCPY: case CL_COMMAND_COPY_BUFFER: { amd::Coord3D realSrcOrigin(srcOrigin[0]); @@ -2731,28 +2732,28 @@ VirtualGPU::submitSignal(amd::SignalCommand & vcmd) { amd::ScopedLock lock(execution()); profilingBegin(vcmd); - pal::Memory* gpuMemory = dev().getGpuMemory(&vcmd.memory()); - Unimplemented(); -/* + pal::Memory* pGpuMemory = dev().getGpuMemory(&vcmd.memory()); + + GpuEvent gpuEvent; + eventBegin(MainEngine); + + uint32_t value = vcmd.markerValue(); + uint32_t size = vcmd.memory().getSize(); + + addVmMemory(pGpuMemory); + if (vcmd.type() == CL_COMMAND_WAIT_SIGNAL_AMD) { - uint64_t surfAddr = gpuMemory->iMem()->getPhysicalAddress(cs()); - uint64_t markerAddr = gpuMemory->iMem()->getMarkerAddress(cs()); - uint64_t markerOffset = markerAddr - surfAddr; - cs()->p2pMarkerOp(gpuMemory->iMem(), vcmd.markerValue(), - markerOffset, false); + iCmd()->CmdWaitMemoryValue(*(pGpuMemory->iMem()), size, value, 0xFFFFFFFF, Pal::CompareFunc::GreaterEqual); } else if (vcmd.type() == CL_COMMAND_WRITE_SIGNAL_AMD) { - GpuEvent gpuEvent; - eventBegin(MainEngine); - cs()->p2pMarkerOp(gpuMemory->iMem(), vcmd.markerValue(), vcmd.markerOffset(), true); - //! @todo We don't need flush if an event is tracked. - cs()->Flush(); - eventEnd(MainEngine, gpuEvent); - gpuMemory->setBusy(*this, gpuEvent); - // Update the global GPU event - setGpuEvent(gpuEvent); + iCmd()->CmdUpdateMemory(*(pGpuMemory->iMem()), size, 4, &value); } -*/ + + eventEnd(MainEngine, gpuEvent); + pGpuMemory->setBusy(*this, gpuEvent); + // Update the global GPU event + setGpuEvent(gpuEvent); + profilingEnd(vcmd); } @@ -2761,39 +2762,24 @@ VirtualGPU::submitMakeBuffersResident(amd::MakeBuffersResidentCommand & vcmd) { amd::ScopedLock lock(execution()); profilingBegin(vcmd); + std::vector memObjects = vcmd.memObjects(); - cl_uint numObjects = memObjects.size(); - Pal::IGpuMemory** pGpuMemObjects = new Pal::IGpuMemory*[numObjects]; + uint32_t numObjects = memObjects.size(); - for(cl_uint i = 0; i < numObjects; ++i) + for (int i = 0; i < numObjects; i++) { - pal::Memory* gpuMemory = dev().getGpuMemory(memObjects[i]); - pGpuMemObjects[i] = gpuMemory->iMem(); - gpuMemory->syncCacheFromHost(*this); - } + // dummy render into the SDI surfaces so that KMD will be able to provide the bus addresses + uint dummy = 0; + static_cast(dev().xferMgr()) + .writeRawData(*(dev().getGpuMemory(memObjects[i])), sizeof(dummy), &dummy); - uint64_t* surfBusAddr = new uint64_t[numObjects]; - uint64_t* markerBusAddr = new uint64_t[numObjects]; - Unimplemented(); -/* - gslErrorCode res = cs()->makeBuffersResident(numObjects, pGpuMemObjects, - surfBusAddr, markerBusAddr); - if(res != GSL_NO_ERROR) { - LogError("MakeBuffersResident failed"); - vcmd.setStatus(CL_INVALID_OPERATION); + pal::Memory* pGpuMemory = dev().getGpuMemory(memObjects[i]); + + pGpuMemory->syncCacheFromHost(*this); + + vcmd.busAddress()[i].surface_bus_address = pGpuMemory->iMem()->Desc().surfaceBusAddr; + vcmd.busAddress()[i].marker_bus_address = pGpuMemory->iMem()->Desc().markerBusAddr; } - else { - cl_bus_address_amd* busAddr = vcmd.busAddress(); - for(cl_uint i = 0; i < numObjects; ++i) - { - busAddr[i].surface_bus_address = surfBusAddr[i]; - busAddr[i].marker_bus_address = markerBusAddr[i]; - } - } -*/ - delete[] pGpuMemObjects; - delete[] surfBusAddr; - delete[] markerBusAddr; profilingEnd(vcmd); }