P4 to Git Change 1410373 by gandryey@gera-w8 on 2017/05/16 17:16:52
SWDEV-120180 - [amdgpu-pro] OpenCL support for SSG - Add initial support of DGMA memory under ROCr backend. - The implementation requires amdgpu-pro stack initialization and memory allocation. - An interop with HSA device is created for ROCr access Affected files ... ... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/Makefile#10 edit ... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/build/Makefile.api#153 edit ... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_lqdflash_amd.cpp#18 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#285 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusettings.cpp#351 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/build/Makefile.oclrocm#16 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/pro/lnxheaders.h#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/pro/prodevice.cpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/pro/prodevice.hpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/pro/prodriver.hpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#51 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.hpp#21 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocmemory.cpp#19 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.cpp#18 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#38 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#11 edit ... //depot/stg/opencl/drivers/opencl/runtime/platform/memory.hpp#101 edit ... //depot/stg/opencl/drivers/opencl/runtime/runtimedefs#41 edit
Этот коммит содержится в:
@@ -157,7 +157,7 @@ static const char* OclExtensionsString[] = {"cl_khr_fp64 ",
|
||||
"cl_khr_mipmap_image ",
|
||||
"cl_khr_mipmap_image_writes ",
|
||||
"",
|
||||
(IS_LINUX) ? "" : "cl_amd_liquid_flash ",
|
||||
"cl_amd_liquid_flash ",
|
||||
NULL};
|
||||
|
||||
namespace device {
|
||||
|
||||
@@ -313,6 +313,7 @@ bool Settings::create(const CALdeviceattribs& calAttr, bool reportAsOCL12Device,
|
||||
maxWorkloadTime_ = modifyMaxWorkload.time;
|
||||
}
|
||||
}
|
||||
enableExtension(ClAMDLiquidFlash);
|
||||
#endif // defined(_WIN32)
|
||||
|
||||
// Enable atomics support
|
||||
@@ -332,7 +333,6 @@ bool Settings::create(const CALdeviceattribs& calAttr, bool reportAsOCL12Device,
|
||||
// Enable some platform extensions
|
||||
enableExtension(ClAmdDeviceAttributeQuery);
|
||||
enableExtension(ClKhrSpir);
|
||||
enableExtension(ClAMDLiquidFlash);
|
||||
|
||||
hwLDSSize_ = 32 * Ki;
|
||||
|
||||
|
||||
@@ -0,0 +1,30 @@
|
||||
//
|
||||
// Copyright (c) 2017 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
// NOTE: Some of the Linux driver stack's headers don't wrap their C-style interface names in 'extern "C" { ... }'
|
||||
// blocks when building with a C++ compiler, so we need to add that ourselves.
|
||||
#if __cplusplus
|
||||
extern "C"
|
||||
{
|
||||
#endif
|
||||
|
||||
#include <amdgpu.h>
|
||||
#include <amdgpu_drm.h>
|
||||
#include <amdgpu_shared.h>
|
||||
#include <xf86drm.h>
|
||||
#include <xf86drmMode.h>
|
||||
|
||||
#include <errno.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
|
||||
constexpr int32_t InvalidFd = -1; // value representing a invalid file descriptor for Linux
|
||||
|
||||
#if __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
@@ -0,0 +1,162 @@
|
||||
//
|
||||
// Copyright (c) 2017 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
|
||||
#ifndef WITHOUT_HSA_BACKEND
|
||||
|
||||
#include "hsa_ext_amd.h"
|
||||
#include "lnxheaders.h"
|
||||
#include "prodevice.hpp"
|
||||
#include "amdgpu_drm.h"
|
||||
|
||||
namespace roc {
|
||||
|
||||
constexpr uint32_t kMaxDevices = 32;
|
||||
constexpr uint32_t kAtiVendorId = 0x1002;
|
||||
|
||||
// Factory for the amdgpu-pro device interface.
//
// Builds a ProDevice and binds it to the GPU found at the given PCI location
// (bus / device / function). Returns the new object on success; returns
// nullptr (after destroying the partially constructed object) when either the
// allocation or ProDevice::Create() fails.
IProDevice* IProDevice::Init(uint32_t bus, uint32_t dev, uint32_t func)
{
  ProDevice* instance = new ProDevice();

  // Hand the object out only when it exists and was bound successfully.
  if (instance != nullptr && instance->Create(bus, dev, func)) {
    return instance;
  }

  // Deleting nullptr is a no-op, so one cleanup path covers both failures.
  delete instance;
  return nullptr;
}
|
||||
|
||||
// Releases everything Create() may have acquired: the amdgpu device handle,
// the file descriptor it was initialized on, and the allocation monitor.
ProDevice::~ProDevice() {
  // Tear the amdgpu device down before closing the descriptor it sits on.
  if (dev_handle_ != nullptr) {
    amdgpu_device_deinitialize(dev_handle_);
  }

  // Only strictly positive descriptors are treated as "open" by this class.
  if (file_desc_ > 0) {
    close(file_desc_);
  }

  delete alloc_ops_;
}
|
||||
|
||||
#ifndef AMDGPU_CAPABILITY_SSG_FLAG
|
||||
#define AMDGPU_CAPABILITY_SSG_FLAG 4
|
||||
#endif
|
||||
|
||||
// ================================================================================================
|
||||
// Open drm device and initialize it. And also get the drm information.
|
||||
bool ProDevice::Create(uint32_t bus, uint32_t device, uint32_t func) {
|
||||
drmDevicePtr devices[kMaxDevices] = { };
|
||||
int32_t device_count = drmGetDevices(devices, kMaxDevices);
|
||||
bool result = false;
|
||||
|
||||
for (int32_t i = 0; i < device_count; i++) {
|
||||
// Check if the device vendor is AMD
|
||||
if (devices[i]->deviceinfo.pci->vendor_id != kAtiVendorId) {
|
||||
continue;
|
||||
}
|
||||
if ((devices[i]->businfo.pci->bus == bus) &&
|
||||
(devices[i]->businfo.pci->dev == device) &&
|
||||
(devices[i]->businfo.pci->func == func)) {
|
||||
|
||||
// pDevices[i]->nodes[DRM_NODE_PRIMARY];
|
||||
// Using render node here so that we can do the off-screen rendering without authentication
|
||||
file_desc_ = open(devices[i]->nodes[DRM_NODE_RENDER], O_RDWR, 0);
|
||||
|
||||
if (file_desc_ > 0) {
|
||||
void* data, *file, *cap;
|
||||
|
||||
// Initialize the admgpu device.
|
||||
if (amdgpu_device_initialize(file_desc_, &major_ver_,
|
||||
&minor_ver_, &dev_handle_) == 0) {
|
||||
uint32_t version = 0;
|
||||
// amdgpu_query_gpu_info will never fail only if it is initialized
|
||||
amdgpu_query_gpu_info(dev_handle_, &gpu_info_);
|
||||
|
||||
drm_amdgpu_capability cap = {};
|
||||
amdgpu_query_info(dev_handle_, AMDGPU_INFO_CAPABILITY, sizeof(drm_amdgpu_capability), &cap);
|
||||
|
||||
// Check if DGMA and SSG are available
|
||||
if ((cap.flag & (AMDGPU_CAPABILITY_DIRECT_GMA_FLAG | AMDGPU_CAPABILITY_SSG_FLAG)) ==
|
||||
(AMDGPU_CAPABILITY_DIRECT_GMA_FLAG | AMDGPU_CAPABILITY_SSG_FLAG)) {
|
||||
result = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (result) {
|
||||
alloc_ops_ = new amd::Monitor("DGMA mem alloc lock", true);
|
||||
if (nullptr == alloc_ops_) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// Allocates a buffer in the DGMA heap and makes it visible to both the HSA
// agent and the CPU.
//
// agent    - HSA agent the buffer is mapped to through the interop API.
// size     - requested allocation size in bytes.
// host_ptr - out: receives the CPU mapping of the buffer; set to nullptr on failure.
//
// Returns the pointer produced by hsa_amd_interop_map_buffer() on success and
// records the allocation so FreeDmaBuffer() can release it. Returns nullptr if
// any step fails; everything acquired up to that step is released first.
void* ProDevice::AllocDmaBuffer(hsa_agent_t agent, size_t size, void** host_ptr) const
{
  amd::ScopedLock l(alloc_ops_);
  *host_ptr = nullptr;

  amdgpu_bo_alloc_request req = {};
  req.alloc_size = size;
  req.phys_alignment = 64 * Ki;
  req.preferred_heap = AMDGPU_GEM_DOMAIN_DGMA;

  // Allocate buffer in DGMA heap
  amdgpu_bo_handle buf_handle = nullptr;
  if (0 != amdgpu_bo_alloc(dev_handle_, &req, &buf_handle)) {
    return nullptr;
  }

  // Export the buffer object as a dma-buf file descriptor
  uint32_t shared_handle = 0;
  if (0 != amdgpu_bo_export(buf_handle, amdgpu_bo_handle_type_dma_buf_fd, &shared_handle)) {
    amdgpu_bo_free(buf_handle);
    return nullptr;
  }

  // Map memory object to HSA device
  void* ptr = nullptr;
  uint32_t flags = 0;
  size_t buf_size = 0;
  if (HSA_STATUS_SUCCESS != hsa_amd_interop_map_buffer(1, &agent, shared_handle,
                                                       flags, &buf_size, &ptr, nullptr, nullptr)) {
    close(shared_handle);
    amdgpu_bo_free(buf_handle);
    return nullptr;
  }

  // Ask GPUPro driver to provide CPU access to allocation
  if (0 != amdgpu_bo_cpu_map(buf_handle, host_ptr)) {
    hsa_amd_interop_unmap_buffer(ptr);
    close(shared_handle);
    amdgpu_bo_free(buf_handle);
    // The interop mapping is gone: never hand the stale device pointer (or a
    // half-written host pointer) back to the caller.
    *host_ptr = nullptr;
    return nullptr;
  }

  allocs_.insert(std::pair<void*, std::pair<amdgpu_bo_handle, uint32_t>>(
    ptr, std::pair<amdgpu_bo_handle, uint32_t>(buf_handle, shared_handle)));

  return ptr;
}
|
||||
|
||||
void ProDevice::FreeDmaBuffer(void* ptr) const
|
||||
{
|
||||
amd::ScopedLock l(alloc_ops_);
|
||||
auto it = allocs_.find(ptr);
|
||||
if (it != allocs_.end()) {
|
||||
amdgpu_bo_cpu_unmap(it->second.first);
|
||||
// Unmap memory from HSA device
|
||||
hsa_amd_interop_unmap_buffer(ptr);
|
||||
// Close shared handle
|
||||
close(it->second.second);
|
||||
int error = amdgpu_bo_free(it->second.first);
|
||||
allocs_.erase(it);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif // WITHOUT_HSA_BACKEND
|
||||
|
||||
@@ -0,0 +1,53 @@
|
||||
//
|
||||
// Copyright (c) 2017 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifndef WITHOUT_HSA_BACKEND
|
||||
|
||||
#include "prodriver.hpp"
|
||||
#include "thread/monitor.hpp"
|
||||
#include <map>
|
||||
|
||||
/*! \addtogroup HSA
|
||||
* @{
|
||||
*/
|
||||
|
||||
//! HSA Device Implementation
|
||||
namespace roc {
|
||||
|
||||
class ProDevice : public IProDevice {
|
||||
public:
|
||||
ProDevice()
|
||||
: file_desc_(0)
|
||||
, major_ver_(0)
|
||||
, minor_ver_(0)
|
||||
, cp_ver_(0)
|
||||
, alloc_ops_(nullptr) {}
|
||||
virtual ~ProDevice() override;
|
||||
|
||||
bool Create(uint32_t bus, uint32_t device, uint32_t func);
|
||||
|
||||
virtual void* AllocDmaBuffer(
|
||||
hsa_agent_t agent, size_t size, void** host_ptr) const override;
|
||||
virtual void FreeDmaBuffer(void* ptr) const override;
|
||||
|
||||
private:
|
||||
int32_t file_desc_; //!< File descriptor for the device
|
||||
uint32_t major_ver_; //!< Major driver version
|
||||
uint32_t minor_ver_; //!< Minor driver version
|
||||
uint32_t cp_ver_; //!< CP ucode version
|
||||
amdgpu_device_handle dev_handle_; //!< AMD gpu device handle
|
||||
amdgpu_gpu_info gpu_info_; //!< GPU info structure
|
||||
amdgpu_heap_info heap_info_; //!< Information about memory
|
||||
mutable std::map<void*, std::pair<amdgpu_bo_handle, uint32_t>> allocs_; //!< Alloced memory mapping
|
||||
amd::Monitor* alloc_ops_; //!< Serializes memory allocations/destructions
|
||||
};
|
||||
|
||||
} // namespace roc
|
||||
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
#endif /*WITHOUT_HSA_BACKEND*/
|
||||
@@ -0,0 +1,35 @@
|
||||
//
|
||||
// Copyright (c) 2017 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifndef WITHOUT_HSA_BACKEND
|
||||
|
||||
#include "top.hpp"
|
||||
#include "hsa.h"
|
||||
|
||||
/*! \addtogroup HSA
|
||||
* @{
|
||||
*/
|
||||
|
||||
namespace roc {
|
||||
|
||||
//! Pro Device Interface
|
||||
class IProDevice : public amd::HeapObject {
|
||||
public:
|
||||
static IProDevice* Init(uint32_t bus, uint32_t device, uint32_t func);
|
||||
|
||||
virtual void* AllocDmaBuffer(hsa_agent_t agent, size_t size, void** host_ptr) const = 0;
|
||||
virtual void FreeDmaBuffer(void* ptr) const = 0;
|
||||
|
||||
IProDevice() {}
|
||||
virtual ~IProDevice() {}
|
||||
};
|
||||
|
||||
} // namespace roc
|
||||
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
#endif /*WITHOUT_HSA_BACKEND*/
|
||||
@@ -25,6 +25,7 @@
|
||||
#endif // !defined(WITH_LIGHTNING_COMPILER)
|
||||
#include "device/rocm/rocmemory.hpp"
|
||||
#include "device/rocm/rocglinterop.hpp"
|
||||
#include "pro/prodriver.hpp"
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
@@ -111,16 +112,18 @@ bool NullDevice::create(const AMDDeviceInfo& deviceInfo) {
|
||||
}
|
||||
|
||||
Device::Device(hsa_agent_t bkendDevice)
|
||||
: mapCacheOps_(nullptr),
|
||||
mapCache_(nullptr),
|
||||
_bkendDevice(bkendDevice),
|
||||
gpuvm_segment_max_alloc_(0),
|
||||
alloc_granularity_(0),
|
||||
context_(nullptr),
|
||||
xferQueue_(nullptr),
|
||||
xferRead_(nullptr),
|
||||
xferWrite_(nullptr),
|
||||
numOfVgpus_(0) {
|
||||
: mapCacheOps_(nullptr)
|
||||
, mapCache_(nullptr)
|
||||
, _bkendDevice(bkendDevice)
|
||||
, gpuvm_segment_max_alloc_(0)
|
||||
, alloc_granularity_(0)
|
||||
, context_(nullptr)
|
||||
, xferQueue_(nullptr)
|
||||
, xferRead_(nullptr)
|
||||
, xferWrite_(nullptr)
|
||||
, pro_device_(nullptr)
|
||||
, pro_ena_(false)
|
||||
, numOfVgpus_(0) {
|
||||
group_segment_.handle = 0;
|
||||
system_segment_.handle = 0;
|
||||
system_coarse_segment_.handle = 0;
|
||||
@@ -128,6 +131,10 @@ Device::Device(hsa_agent_t bkendDevice)
|
||||
}
|
||||
|
||||
Device::~Device() {
|
||||
#ifdef WITH_AMDGPU_PRO
|
||||
delete pro_device_;
|
||||
#endif
|
||||
|
||||
// Release cached map targets
|
||||
for (uint i = 0; mapCache_ != nullptr && i < mapCache_->size(); ++i) {
|
||||
if ((*mapCache_)[i] != nullptr) {
|
||||
@@ -474,16 +481,22 @@ bool Device::init() {
|
||||
|
||||
roc_device->deviceInfo_.gfxipVersion_ = major * 100 + minor * 10 + stepping;
|
||||
|
||||
if (!roc_device->mapHSADeviceToOpenCLDevice(agent)) {
|
||||
LogError("Failed mapping of HsaDevice to Device.");
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!roc_device->create()) {
|
||||
LogError("Error creating new instance of Device.");
|
||||
continue;
|
||||
}
|
||||
|
||||
// Setup System Memory to be Non-Coherent per user
|
||||
// request via environment variable. By default the
|
||||
// System Memory is setup to be Coherent
|
||||
if (roc_device->settings().enableNCMode_) {
|
||||
hsa_status_t err = hsa_amd_coherency_set_type(agent, HSA_AMD_COHERENCY_TYPE_NONCOHERENT);
|
||||
if (err != HSA_STATUS_SUCCESS) {
|
||||
LogError("Unable to set NC memory policy!");
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (selectedDevices[ordinal++] &&
|
||||
(flagIsDefault(GPU_DEVICE_NAME) || GPU_DEVICE_NAME == 0 || GPU_DEVICE_NAME[0] == '\0' ||
|
||||
!strcmp(GPU_DEVICE_NAME, roc_device->info_.name_))) {
|
||||
@@ -500,10 +513,71 @@ void Device::tearDown() {
|
||||
}
|
||||
|
||||
bool Device::create() {
|
||||
if (HSA_STATUS_SUCCESS !=
|
||||
hsa_agent_get_info(_bkendDevice, HSA_AGENT_INFO_PROFILE, &agent_profile_)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Create HSA settings
|
||||
settings_ = new Settings();
|
||||
roc::Settings* hsaSettings = static_cast<roc::Settings*>(settings_);
|
||||
if ((hsaSettings == nullptr) ||
|
||||
!hsaSettings->create((agent_profile_ == HSA_PROFILE_FULL), deviceInfo_.gfxipVersion_)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!amd::Device::create()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
uint32_t hsa_bdf_id = 0;
|
||||
if (HSA_STATUS_SUCCESS !=
|
||||
hsa_agent_get_info(_bkendDevice, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_BDFID, &hsa_bdf_id)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
info_.deviceTopology_.pcie.type = CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD;
|
||||
info_.deviceTopology_.pcie.bus = (hsa_bdf_id & (0xFF << 8)) >> 8;
|
||||
info_.deviceTopology_.pcie.device = (hsa_bdf_id & (0x1F << 3)) >> 3;
|
||||
info_.deviceTopology_.pcie.function = (hsa_bdf_id & 0x07);
|
||||
|
||||
#ifdef WITH_AMDGPU_PRO
|
||||
// Create amdgpu-pro device interface for SSG support
|
||||
pro_device_ = IProDevice::Init(
|
||||
info_.deviceTopology_.pcie.bus,
|
||||
info_.deviceTopology_.pcie.device,
|
||||
info_.deviceTopology_.pcie.function);
|
||||
if (pro_device_ != nullptr) {
|
||||
pro_ena_ = true;
|
||||
settings_->enableExtension(ClAMDLiquidFlash);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (populateOCLDeviceConstants() == false) {
|
||||
return false;
|
||||
}
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
// create compilation object with cache support
|
||||
int gfxipMajor = deviceInfo_.gfxipVersion_ / 100;
|
||||
int gfxipMinor = deviceInfo_.gfxipVersion_ / 10 % 10;
|
||||
int gfxipStepping = deviceInfo_.gfxipVersion_ % 10;
|
||||
|
||||
// Use compute capability as target (AMD:AMDGPU:major:minor:stepping)
|
||||
// with dash as delimiter to be compatible with Windows directory name
|
||||
std::ostringstream cacheTarget;
|
||||
cacheTarget << "AMD-AMDGPU-" << gfxipMajor << "-" << gfxipMinor << "-" << gfxipStepping;
|
||||
|
||||
amd::CacheCompilation* compObj = new amd::CacheCompilation(
|
||||
cacheTarget.str(), "_rocm", OCL_CODE_CACHE_ENABLE, OCL_CODE_CACHE_RESET);
|
||||
if (!compObj) {
|
||||
LogError("Unable to create cache compilation object!");
|
||||
return false;
|
||||
}
|
||||
|
||||
cacheCompilation_.reset(compObj);
|
||||
#endif
|
||||
|
||||
amd::Context::Info info = {0};
|
||||
std::vector<amd::Device*> devices;
|
||||
devices.push_back(this);
|
||||
@@ -568,59 +642,6 @@ device::Program* Device::createProgram(amd::option::Options* options) {
|
||||
return new roc::HSAILProgram(*this);
|
||||
}
|
||||
|
||||
bool Device::mapHSADeviceToOpenCLDevice(hsa_agent_t dev) {
|
||||
if (HSA_STATUS_SUCCESS !=
|
||||
hsa_agent_get_info(_bkendDevice, HSA_AGENT_INFO_PROFILE, &agent_profile_)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Create HSA settings
|
||||
settings_ = new Settings();
|
||||
roc::Settings* hsaSettings = static_cast<roc::Settings*>(settings_);
|
||||
if ((hsaSettings == nullptr) ||
|
||||
!hsaSettings->create((agent_profile_ == HSA_PROFILE_FULL), deviceInfo_.gfxipVersion_)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (populateOCLDeviceConstants() == false) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Setup System Memory to be Non-Coherent per user
|
||||
// request via environment variable. By default the
|
||||
// System Memory is setup to be Coherent
|
||||
if (hsaSettings->enableNCMode_) {
|
||||
hsa_status_t err = hsa_amd_coherency_set_type(dev, HSA_AMD_COHERENCY_TYPE_NONCOHERENT);
|
||||
if (err != HSA_STATUS_SUCCESS) {
|
||||
LogError("Unable to set NC memory policy!");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
// create compilation object with cache support
|
||||
int gfxipMajor = deviceInfo_.gfxipVersion_ / 100;
|
||||
int gfxipMinor = deviceInfo_.gfxipVersion_ / 10 % 10;
|
||||
int gfxipStepping = deviceInfo_.gfxipVersion_ % 10;
|
||||
|
||||
// Use compute capability as target (AMD:AMDGPU:major:minor:stepping)
|
||||
// with dash as delimiter to be compatible with Windows directory name
|
||||
std::ostringstream cacheTarget;
|
||||
cacheTarget << "AMD-AMDGPU-" << gfxipMajor << "-" << gfxipMinor << "-" << gfxipStepping;
|
||||
|
||||
amd::CacheCompilation* compObj = new amd::CacheCompilation(
|
||||
cacheTarget.str(), "_rocm", OCL_CODE_CACHE_ENABLE, OCL_CODE_CACHE_RESET);
|
||||
if (!compObj) {
|
||||
LogError("Unable to create cache compilation object!");
|
||||
return false;
|
||||
}
|
||||
|
||||
cacheCompilation_.reset(compObj);
|
||||
#endif
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
hsa_status_t Device::iterateGpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, void* data) {
|
||||
if (data == nullptr) {
|
||||
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
@@ -734,16 +755,6 @@ bool Device::populateOCLDeviceConstants() {
|
||||
|
||||
info_.type_ = CL_DEVICE_TYPE_GPU;
|
||||
|
||||
uint32_t hsa_bdf_id = 0;
|
||||
if (HSA_STATUS_SUCCESS !=
|
||||
hsa_agent_get_info(_bkendDevice, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_BDFID, &hsa_bdf_id)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
info_.deviceTopology_.pcie.type = CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD;
|
||||
info_.deviceTopology_.pcie.bus = (hsa_bdf_id & (0xFF << 8)) >> 8;
|
||||
info_.deviceTopology_.pcie.device = (hsa_bdf_id & (0x1F << 3)) >> 3;
|
||||
info_.deviceTopology_.pcie.function = (hsa_bdf_id & 0x07);
|
||||
info_.extensions_ = getExtensionString();
|
||||
info_.nativeVectorWidthDouble_ = info_.preferredVectorWidthDouble_ =
|
||||
(settings().doublePrecision_) ? 1 : 0;
|
||||
|
||||
@@ -58,6 +58,7 @@ class Memory;
|
||||
class Resource;
|
||||
class VirtualDevice;
|
||||
class PrintfDbg;
|
||||
class IProDevice;
|
||||
|
||||
// A NULL Device type used only for offline compilation
|
||||
// Only functions that are used for compilation will be in this device
|
||||
@@ -276,8 +277,6 @@ class Device : public NullDevice {
|
||||
//! Destructor for the physical HSA device
|
||||
virtual ~Device();
|
||||
|
||||
bool mapHSADeviceToOpenCLDevice(hsa_agent_t hsadevice);
|
||||
|
||||
// Temporary, delete it later when HSA Runtime and KFD are fully functional.
|
||||
void fake_device();
|
||||
|
||||
@@ -388,6 +387,10 @@ class Device : public NullDevice {
|
||||
|
||||
amd::Context& context() const { return *context_; }
|
||||
|
||||
// Returns AMD GPU Pro interfaces
|
||||
const IProDevice& iPro() const { return *pro_device_; }
|
||||
bool ProEna() const { return pro_ena_; }
|
||||
|
||||
private:
|
||||
static hsa_ven_amd_loader_1_00_pfn_t amd_loader_ext_table;
|
||||
|
||||
@@ -415,6 +418,8 @@ class Device : public NullDevice {
|
||||
|
||||
XferBuffers* xferRead_; //!< Transfer buffers read
|
||||
XferBuffers* xferWrite_; //!< Transfer buffers write
|
||||
const IProDevice* pro_device_; //!< AMDGPUPro device
|
||||
bool pro_ena_; //!< Extra functionality with AMDGPUPro device, beyond ROCr
|
||||
|
||||
public:
|
||||
amd::Atomic<uint> numOfVgpus_; //!< Virtual gpu unique index
|
||||
|
||||
@@ -20,6 +20,7 @@
|
||||
#include "platform/memory.hpp"
|
||||
#include "platform/sampler.hpp"
|
||||
#include "amdocl/cl_gl_amd.hpp"
|
||||
#include "pro/prodriver.hpp"
|
||||
|
||||
namespace roc {
|
||||
|
||||
@@ -548,7 +549,12 @@ void Buffer::destroy() {
|
||||
}
|
||||
|
||||
const cl_mem_flags memFlags = owner()->getMemFlags();
|
||||
|
||||
#ifdef WITH_AMDGPU_PRO
|
||||
if ((memFlags & CL_MEM_USE_PERSISTENT_MEM_AMD) && dev().ProEna()) {
|
||||
dev().iPro().FreeDmaBuffer(deviceMemory_);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
if ((deviceMemory_ != nullptr) && (deviceMemory_ != owner()->getHostMem())) {
|
||||
// if they are identical, the host pointer will be
|
||||
// deallocated later on => avoid double deallocation
|
||||
@@ -611,6 +617,20 @@ bool Buffer::create() {
|
||||
|
||||
// Allocate backing storage in device local memory unless UHP or AHP are set
|
||||
const cl_mem_flags memFlags = owner()->getMemFlags();
|
||||
|
||||
#ifdef WITH_AMDGPU_PRO
|
||||
if ((memFlags & CL_MEM_USE_PERSISTENT_MEM_AMD) && dev().ProEna()) {
|
||||
void* host_ptr = nullptr;
|
||||
deviceMemory_ = dev().iPro().AllocDmaBuffer(dev().getGpuAgents()[0], size(), &host_ptr);
|
||||
if (deviceMemory_ == nullptr) {
|
||||
return false;
|
||||
}
|
||||
flags_ |= HostMemoryDirectAccess;
|
||||
owner()->setHostMem(host_ptr);
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (!(memFlags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR))) {
|
||||
deviceMemory_ = dev().deviceLocalAlloc(size());
|
||||
|
||||
|
||||
@@ -95,6 +95,7 @@ bool Settings::create(bool fullProfile, int gfxipVersion) {
|
||||
enableExtension(ClKhr3DImageWrites);
|
||||
enableExtension(ClAmdMediaOps);
|
||||
enableExtension(ClAmdMediaOps2);
|
||||
enableExtension(ClAMDLiquidFlash);
|
||||
if (MesaInterop::Supported()) {
|
||||
enableExtension(ClKhrGlSharing);
|
||||
}
|
||||
|
||||
@@ -1757,4 +1757,56 @@ amd::Memory* VirtualGPU::findPinnedMem(void* addr, size_t size) {
|
||||
}
|
||||
|
||||
void VirtualGPU::enableSyncBlit() const { blitMgr_->enableSynchronization(); }
|
||||
|
||||
//! Transfers data between a liquid-flash (SSG) file and a device buffer.
//!
//! The transfer is split into chunks of at most
//! amd::TransferBufferFileCommand::StagingBufferSize bytes, each of which is
//! bounced through a staging buffer:
//!   - CL_COMMAND_READ_SSG_FILE_AMD:  file -> staging (CPU mapped) -> device buffer
//!   - CL_COMMAND_WRITE_SSG_FILE_AMD: device buffer -> staging -> file
//!
//! On any file or blit failure the command status is set to
//! CL_INVALID_OPERATION and the transfer stops.
void VirtualGPU::submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd) {
  size_t copySize = cmd.size()[0];
  size_t fileOffset = cmd.fileOffset();
  Memory* mem = dev().getRocMemory(&cmd.memory());
  // Only the first staging buffer is used; chunks are processed one at a time.
  uint idx = 0;

  assert((cmd.type() == CL_COMMAND_READ_SSG_FILE_AMD) ||
         (cmd.type() == CL_COMMAND_WRITE_SSG_FILE_AMD));
  // Reading the file means writing into the device buffer
  const bool writeBuffer(cmd.type() == CL_COMMAND_READ_SSG_FILE_AMD);

  size_t bufOffset = cmd.origin()[0];
  while (copySize > 0) {
    Memory* staging = dev().getRocMemory(&cmd.staging(idx));
    // Copy the constant into a local first: std::min takes its arguments by reference
    size_t chunkSize = amd::TransferBufferFileCommand::StagingBufferSize;
    chunkSize = std::min(chunkSize, copySize);

    if (!writeBuffer) {
      // Bring the next chunk of the device buffer into the staging buffer
      if (!blitMgr().copyBuffer(*mem, *staging, bufOffset, 0, chunkSize, false)) {
        cmd.setStatus(CL_INVALID_OPERATION);
        return;
      }
    }

    void* hostPtr = staging->cpuMap(*this);
    const bool transferred = cmd.file()->transferBlock(writeBuffer, hostPtr, staging->size(),
                                                       fileOffset, 0, chunkSize);
    // Always drop the CPU mapping, including on failure, so it is not leaked
    staging->cpuUnmap(*this);
    if (!transferred) {
      cmd.setStatus(CL_INVALID_OPERATION);
      return;
    }

    if (writeBuffer) {
      // Push the chunk that was just read from the file into the device buffer
      if (!blitMgr().copyBuffer(*staging, *mem, 0, bufOffset, chunkSize, false)) {
        cmd.setStatus(CL_INVALID_OPERATION);
        return;
      }
    }

    fileOffset += chunkSize;
    bufOffset += chunkSize;
    copySize -= chunkSize;
  }
}
|
||||
} // End of roc namespace
|
||||
|
||||
@@ -190,6 +190,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
virtual void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd);
|
||||
virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd);
|
||||
virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd);
|
||||
virtual void submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd);
|
||||
|
||||
void submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd) {}
|
||||
void submitThreadTrace(amd::ThreadTraceCommand& vcmd) {}
|
||||
|
||||
@@ -595,7 +595,7 @@ class SvmBuffer : AllStatic {
|
||||
//! Liquid flash extension
|
||||
class LiquidFlashFile : public RuntimeObject {
|
||||
private:
|
||||
const wchar_t* name_;
|
||||
std::wstring name_;
|
||||
cl_file_flags_amd flags_;
|
||||
void* handle_;
|
||||
uint32_t blockSize_;
|
||||
|
||||
Ссылка в новой задаче
Block a user