rocm-systems/rocclr/runtime/device/gpu/gpudevice.cpp

//
// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
//

#include "platform/program.hpp"
#include "platform/kernel.hpp"
#include "os/os.hpp"
#include "device/device.hpp"
#include "device/gpu/gpudefs.hpp"
#include "device/gpu/gpumemory.hpp"
#include "device/gpu/gpudevice.hpp"
#include "utils/flags.hpp"
#include "utils/versions.hpp"
#include "thread/monitor.hpp"
#include "device/gpu/gpuprogram.hpp"
#include "device/gpu/gpubinary.hpp"
#include "device/gpu/gpusettings.hpp"
#include "device/gpu/gpublit.hpp"

#include "acl.h"

#include "amdocl/cl_common.hpp"
#include "CL/cl_gl.h"

#ifdef _WIN32
#include <d3d9.h>
#include <d3d10_1.h>
#include "CL/cl_d3d10.h"
#include "CL/cl_d3d11.h"
#include "CL/cl_dx9_media_sharing.h"
#endif // _WIN32

#include "os_if.h" // for osInit()

#include <cstring>
#include <fstream>
#include <sstream>
#include <iostream>
#include <ctype.h>

//! Creates the GPU devices exposed by this backend.
//!
//! Both initializers are always called, in this order: online devices
//! first, offline devices second.
//!
//! \return true if at least one of the two initializers returned true
bool DeviceLoad()
{
    // Online devices
    const bool onlineCreated = gpu::Device::init();
    // Offline GPU devices
    const bool offlineCreated = gpu::NullDevice::init();

    return onlineCreated || offlineCreated;
}

//! Counterpart of DeviceLoad(): forwards to gpu::Device::tearDown().
void DeviceUnload()
{
    gpu::Device::tearDown();
}

namespace gpu {

//! Definition of the static compiler handle declared in NullDevice
aclCompiler* NullDevice::compiler_;
//! Definition of the static HSA compiler handle; Device::create() calls
//! aclCompilerInit() for it when it is still NULL
aclCompiler* NullDevice::hsaCompiler_;
//! Definition of the static application profile; Device::create() calls init() on it
AppProfile Device::appProfile_;

//! Constructs an empty device object: the amd::Device base receives NULL,
//! the CAL target is set to the zero enumerator and no HW info is attached.
//! create() fills in the real target and HW info.
NullDevice::NullDevice()
    : amd::Device(NULL)
    , calTarget_(static_cast<CALtarget>(0))
    , hwInfo_(NULL)
{
}

bool
NullDevice::init()
{
    bool result = false;
    std::vector<Device*> devices;

    devices = getDevices(CL_DEVICE_TYPE_GPU, false);

    // Loop through all supported devices and create each of them
    for (uint id = CAL_TARGET_CYPRESS; id <= CAL_TARGET_LAST; ++id) {
        bool    foundActive = false;

        if (gpu::DeviceInfo[id].targetName_[0] == '\0') {
            continue;
        }

        // Loop through all active devices and see if we match one
        for (uint i = 0; i < devices.size(); ++i) {
            if (static_cast<NullDevice*>(devices[i])->calTarget() ==
                static_cast<CALtarget>(id)) {
                foundActive = true;
                break;
            }
        }

        // Don't report an offline device if it's active
        if (foundActive) {
            continue;
        }

        NullDevice*  dev = new NullDevice();
        if (NULL != dev) {
            if (!dev->create(static_cast<CALtarget>(id))) {
                delete dev;
            }
            else {
                result |= true;
                dev->registerDevice();
            }
        }
    }

    return result;
}

//! Initializes this object as an offline (not online) GPU device for the
//! given CAL target: records the target and its HW info, creates the
//! settings object from synthesized attributes and fills the basic
//! name/vendor/version fields of the device info.
//!
//! \param target  CAL target this offline device represents
//! \return true on success, false if the settings object could not be
//!         allocated or created
bool
NullDevice::create(CALtarget target)
{
    CALdeviceattribs      calAttr = {0};
    CALdeviceVideoAttribs calVideoAttr = {0};

    online_ = false;

    // Mark the device as GPU type
    info_.type_     = CL_DEVICE_TYPE_GPU;
    info_.vendorId_ = 0x1002;

    calTarget_ = calAttr.target = target;
    hwInfo_ = &DeviceInfo[calTarget_];

    // Report the device name
    ::strcpy(info_.name_, hwInfo()->targetName_);

    // Force double if it could be supported
    switch (target) {
    case CAL_TARGET_CAYMAN:
    case CAL_TARGET_CYPRESS:
    case CAL_TARGET_PITCAIRN:
    case CAL_TARGET_CAPEVERDE:
    case CAL_TARGET_TAHITI:
    case CAL_TARGET_OLAND:
    case CAL_TARGET_HAINAN:
    case CAL_TARGET_DEVASTATOR:
    case CAL_TARGET_SCRAPPER:
    case CAL_TARGET_BONAIRE:
    case CAL_TARGET_SPECTRE:
    case CAL_TARGET_SPOOKY:
    case CAL_TARGET_KALINDI:
    case CAL_TARGET_HAWAII:
    case CAL_TARGET_ICELAND:
    case CAL_TARGET_TONGA:
    case CAL_TARGET_BERMUDA:
    case CAL_TARGET_FIJI:
    case CAL_TARGET_GODAVARI:
    case CAL_TARGET_CARRIZO:
        calAttr.doublePrecision = CAL_TRUE;
        break;
    default:
        break;
    }

    settings_ = new gpu::Settings();
    gpu::Settings* gpuSettings = reinterpret_cast<gpu::Settings*>(settings_);
    // Create setting for the offline target
    if ((gpuSettings == NULL) || !gpuSettings->create(calAttr
#if cl_amd_open_video
        , calVideoAttr
#endif //cl_amd_open_video
        )) {
        return false;
    }

    info_.maxWorkGroupSize_ = settings().maxWorkGroupSize_;

    // Initialize the extension string for offline devices
    info_.extensions_   = getExtensionString();

    // Fill the version info (the device name was already copied above)
    ::strcpy(info_.vendor_, "Advanced Micro Devices, Inc.");
    // The build string is passed as data, not as the format, so a '%' in it
    // can never be interpreted as a conversion specification
    ::snprintf(info_.driverVersion_, sizeof(info_.driverVersion_) - 1,
        "%s", AMD_BUILD_STRING);
    info_.version_ = "OpenCL 1.2 " AMD_PLATFORM_INFO;
    info_.oclcVersion_ = "OpenCL C 1.2 ";

    return true;
}

//! Allocates a NullProgram bound to this offline device.
//!
//! \param oclVer  Not used by this implementation
//! \return the new program, or NULL (after logging an error) if the
//!         allocation yielded NULL
device::Program*
NullDevice::createProgram(int oclVer)
{
    NullProgram* const program = new NullProgram(*this);

    if (NULL == program) {
        LogError("Memory allocation has failed!");
    }
    return program;
}

void
Device::Engines::create(uint num, gslEngineDescriptor* desc, uint maxNumComputeRings)
{
    numComputeRings_ = 0;

    for (uint i = 0; i < num; ++i) {
        desc_[desc[i].id] = desc[i];
        desc_[desc[i].id].priority = GSL_ENGINEPRIORITY_NEUTRAL;

        if (desc[i].id >= GSL_ENGINEID_COMPUTE0 &&
            desc[i].id <= GSL_ENGINEID_COMPUTE7) {
            numComputeRings_++;
        }
    }

    numComputeRings_ = std::min(numComputeRings_, maxNumComputeRings);
}

//! Collects the stored descriptors for every engine selected in a bit mask.
//!
//! \param engines  Bit mask of requested engines (bits as produced by getMask())
//! \param desc     Output array; receives one descriptor per matched engine,
//!                 in ascending engine id order
//! \return the number of descriptors written, or 0 if at least one requested
//!         engine has no stored descriptor
uint
Device::Engines::getRequested(uint engines, gslEngineDescriptor* desc) const
{
    uint written = 0;

    for (uint e = 0; e < GSL_ENGINEID_MAX; ++e) {
        const gslEngineID engineId = static_cast<gslEngineID>(e);

        // Not requested
        if (!(engines & getMask(engineId))) {
            continue;
        }
        // Requested, but the slot doesn't hold a descriptor for this id
        if (desc_[e].id != engineId) {
            continue;
        }

        desc[written] = desc_[e];
        ++written;
        // Mark the request as satisfied
        engines &= ~getMask(engineId);
    }

    // Any bit still set means a requested engine wasn't found
    if (engines != 0) {
        return 0;
    }
    return written;
}

//! Destroys every staging buffer that is currently on the free list.
Device::XferBuffers::~XferBuffers()
{
    typedef std::list<Resource*>::const_iterator BufferIter;

    for (BufferIter it = freeBuffers_.begin(); it != freeBuffers_.end(); ++it) {
        Resource* const staging = *it;
        // Buffers not flagged as card memory were mapped once at creation
        // time, so they get unmapped once here
        if (!staging->cal()->cardMemory_) {
            staging->unmap(NULL);
        }
        delete staging;
    }
    freeBuffers_.clear();
}

bool
Device::XferBuffers::create()
{
    Resource*   xferBuf = NULL;
    bool        result = false;
    // Note: create a 1D resource
    xferBuf = new Resource(dev(), bufSize_ / Heap::ElementSize,
        Heap::ElementType);

    // We will try to creat a CAL resource for the transfer buffer
    if ((NULL == xferBuf) || !xferBuf->create(type_)) {
        delete xferBuf;
        xferBuf = NULL;
        LogError("Couldn't allocate a transfer buffer!");
    }
    else {
        result = true;
        freeBuffers_.push_back(xferBuf);
        // CPU optimization: map staging buffer just once
        if (!xferBuf->cal()->cardMemory_) {
            xferBuf->map(NULL);
        }
    }

    return result;
}

//! Hands out a staging buffer: a freshly allocated one when the free list
//! is empty, otherwise the first entry of the free list. The acquired
//! counter is incremented for every buffer handed out.
//!
//! NOTE(review): when the free list is empty AND the allocation fails, the
//! code falls through and takes the first element of an empty list. The
//! reference return type leaves no way to report the failure - confirm how
//! callers are expected to cope.
//!
//! \return reference to the acquired staging buffer
Resource&
Device::XferBuffers::acquire()
{
    // Lock the operations with the staged buffer list
    amd::ScopedLock  l(lock_);

    // Nothing to recycle, so attempt to allocate a new staged buffer
    if (freeBuffers_.size() == 0) {
        // Note: create a 1D resource
        Resource* fresh = new Resource(dev(), bufSize_ / Heap::ElementSize,
            Heap::ElementType);

        // Try to create a CAL resource for the transfer buffer
        if ((NULL != fresh) && fresh->create(type_)) {
            ++acquiredCnt_;
            // CPU optimization: map staging buffer just once
            if (!fresh->cal()->cardMemory_) {
                fresh->map(NULL);
            }
            return *fresh;
        }

        delete fresh;
        LogError("Couldn't allocate a transfer buffer!");
    }

    // Recycle the first buffer of the free list
    Resource* const recycled = freeBuffers_.front();
    freeBuffers_.pop_front();
    ++acquiredCnt_;

    return *recycled;
}

//! Returns a staging buffer to the free list and decrements the acquired
//! counter.
//!
//! \param gpu     Virtual GPU the buffer is waited on before it is recycled
//! \param buffer  Staging buffer to put back on the free list
void
Device::XferBuffers::release(VirtualGPU& gpu, Resource& buffer)
{
    // Lock the operations with the staged buffer list
    amd::ScopedLock  l(lock_);
    // Make sure buffer isn't busy on the current VirtualGPU, because
    // the next acquire can come from a different queue
    buffer.wait(gpu);
    freeBuffers_.push_back(&buffer);
    --acquiredCnt_;
}


//! Locks the device's virtual GPU list and then the execution lock of
//! every virtual GPU on that list, in list order.
//!
//! \param dev  Device whose virtual GPUs are locked
Device::ScopedLockVgpus::ScopedLockVgpus(const Device& dev)
    : dev_(dev)
{
    // The list itself is locked first
    dev_.vgpusAccess()->lock();

    // Then every virtual GPU is locked from the execution of commands
    uint queue = 0;
    while (queue < dev_.vgpus().size()) {
        dev_.vgpus()[queue]->execution().lock();
        ++queue;
    }
}

//! Unlocks the execution lock of every virtual GPU, in list order, and
//! then unlocks the virtual GPU list itself.
Device::ScopedLockVgpus::~ScopedLockVgpus()
{
    // Every virtual GPU is unlocked for the execution of commands
    uint queue = 0;
    while (queue < dev_.vgpus().size()) {
        dev_.vgpus()[queue]->execution().unlock();
        ++queue;
    }

    // Unlock the virtual GPU list
    dev_.vgpusAccess()->unlock();
}

//! Constructs an online device object with every owned pointer cleared.
//! No GSL device is opened and nothing is allocated here; create() does
//! that work.
Device::Device()
    : NullDevice()
    , CALGSLDevice()
    , numOfVgpus_(0)
    , context_(NULL)
    , heap_(NULL)
    , dummyPage_(NULL)
    , lockAsyncOps_(NULL)
    , lockAsyncOpsForInitHeap_(NULL)
    , vgpusAccess_(NULL)
    , xferRead_(NULL)
    , xferWrite_(NULL)
    , vaCacheAccess_(NULL)
    , vaCacheList_(NULL)
    , mapCache_(NULL)
    , resourceCache_(NULL)
    , heapInitComplete_(false)
    , xferQueue_(NULL)
    , srdManager_(NULL)
{
    // The destructor deletes blitProgram_ unconditionally, while create()
    // can fail before the pointer is ever assigned. Clear it here so that
    // destroying a partially created device never deletes a garbage pointer.
    // (Assigned in the body rather than the init list, because the member
    // declaration order isn't visible from this file.)
    blitProgram_ = NULL;
}

//! Destroys everything the device owns and finally closes the device.
//! Every member is either deleted (delete on NULL is a no-op) or
//! NULL-checked before release(), so the destructor also handles a device
//! whose create() failed part way through.
Device::~Device()
{
    // Only complain about leftover mappings if the VA cache list exists;
    // a list that was never created can't hold unmapped host memory
    CondLog((vaCacheList_ != NULL) &&
        (vaCacheList_->size() != 0), "Application didn't unmap all host memory!");

    delete srdManager_;

    for (uint s = 0; s < scratch_.size(); ++s) {
        delete scratch_[s];
        scratch_[s] = NULL;
    }

    // Destroy transfer queue
    delete xferQueue_;

    // Destroy blit program
    delete blitProgram_;

    // Release cached map targets
    for (uint i = 0; mapCache_ != NULL && i < mapCache_->size(); ++i) {
        if ((*mapCache_)[i] != NULL) {
            (*mapCache_)[i]->release();
        }
    }
    delete mapCache_;

    // Destroy temporary buffers for read/write
    delete xferRead_;
    delete xferWrite_;

    if (dummyPage_ != NULL) {
        dummyPage_->release();
    }

    // Destroy global heap
    if (heap_ != NULL) {
        delete heap_;
    }

    // Destroy resource cache
    delete resourceCache_;

    delete lockAsyncOps_;
    delete lockAsyncOpsForInitHeap_;
    delete vgpusAccess_;
    delete vaCacheAccess_;
    delete vaCacheList_;

    if (context_ != NULL) {
        context_->release();
    }

    // Close the active device
    close();
}

//! Fills the device info structure (info_) from the CAL attributes, the
//! CAL status and the already created device settings.
//!
//! Sizes reported by CAL (localRAM, heap block sizes, ...) are treated as
//! megabytes: they are multiplied by Mi to obtain bytes. Those products are
//! evaluated in cl_ulong so that they can't wrap where size_t is 32 bits.
//!
//! \param calAttr       CAL device attributes
//! \param calStatus     CAL device status (available/largest heap blocks)
//! \param calVideoAttr  CAL video attributes (only if cl_amd_open_video)
void Device::fillDeviceInfo(
    const CALdeviceattribs& calAttr,
    const CALdevicestatus& calStatus
#if cl_amd_open_video
    ,
    const CALdeviceVideoAttribs& calVideoAttr
#endif // cl_amd_open_video
    )
{
    info_.type_     = CL_DEVICE_TYPE_GPU;
    info_.vendorId_ = 0x1002;
    info_.maxComputeUnits_          = calAttr.numberOfSIMD;
    info_.maxWorkItemDimensions_    = 3;
    info_.numberOfShaderEngines     = calAttr.numberOfShaderEngines;

    if (settings().siPlus_) {
        // SI parts are scalar.  Also, reads don't need to be 128-bits to get peak rates.
        // For example, float4 is not faster than float as long as all threads fetch the same
        // amount of data and the reads are coalesced.  This is from the H/W team and confirmed
        // through experimentation.  May also be true on EG/NI, but no point in confusing
        // developers now.
        info_.nativeVectorWidthChar_    = info_.preferredVectorWidthChar_   = 4;
        info_.nativeVectorWidthShort_   = info_.preferredVectorWidthShort_  = 2;
        info_.nativeVectorWidthInt_     = info_.preferredVectorWidthInt_    = 1;
        info_.nativeVectorWidthLong_    = info_.preferredVectorWidthLong_   = 1;
        info_.nativeVectorWidthFloat_   = info_.preferredVectorWidthFloat_  = 1;
        info_.nativeVectorWidthDouble_  = info_.preferredVectorWidthDouble_ =
            (settings().checkExtension(ClKhrFp64)) ?  1 : 0;
        info_.nativeVectorWidthHalf_    = info_.preferredVectorWidthHalf_ = 0; // no half support
    }
    else {
        info_.nativeVectorWidthChar_    = info_.preferredVectorWidthChar_   = 16;
        info_.nativeVectorWidthShort_   = info_.preferredVectorWidthShort_  = 8;
        info_.nativeVectorWidthInt_     = info_.preferredVectorWidthInt_    = 4;
        info_.nativeVectorWidthLong_    = info_.preferredVectorWidthLong_   = 2;
        info_.nativeVectorWidthFloat_   = info_.preferredVectorWidthFloat_  = 4;
        info_.nativeVectorWidthDouble_  = info_.preferredVectorWidthDouble_ =
            (settings().checkExtension(ClKhrFp64)) ?  2 : 0;
        info_.nativeVectorWidthHalf_    = info_.preferredVectorWidthHalf_ = 0; // no half support
    }
    // Fall back to 555 if CAL doesn't report an engine clock
    info_.maxClockFrequency_    = (calAttr.engineClock != 0) ? calAttr.engineClock : 555;
    info_.maxParameterSize_ = 1024;
    info_.minDataTypeAlignSize_ = sizeof(cl_long16);
    info_.singleFPConfig_       = CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO
        | CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_FMA;

    // Note: the double config is derived before the optional
    // CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT bit is added to the single config
    if (settings().checkExtension(ClKhrFp64)) {
        info_.doubleFPConfig_   = info_.singleFPConfig_ | CL_FP_DENORM;
    }

    if (settings().reportFMA_) {
        info_.singleFPConfig_ |= CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT;
    }

    info_.globalMemCacheLineSize_   = settings().cacheLineSize_;
    info_.globalMemCacheSize_       = settings().cacheSize_;
    if ((settings().cacheLineSize_ != 0) || (settings().cacheSize_ != 0)) {
        info_.globalMemCacheType_   = CL_READ_WRITE_CACHE;
    }
    else {
        info_.globalMemCacheType_   = CL_NONE;
    }

    if (heap()->isVirtual()) {
#if defined(ATI_OS_LINUX)
        info_.globalMemSize_   =
            (static_cast<cl_ulong>(std::min(GPU_MAX_HEAP_SIZE, 100u)) *
            // globalMemSize is the actual available size for app on Linux
            // Because Linux base driver doesn't support paging
            static_cast<cl_ulong>(calStatus.availVisibleHeap +
            calStatus.availInvisibleHeap) / 100u) * Mi;
#else
        info_.globalMemSize_   =
            (static_cast<cl_ulong>(std::min(GPU_MAX_HEAP_SIZE, 100u)) *
            static_cast<cl_ulong>(calAttr.localRAM) / 100u) * Mi;
#endif
        if (settings().apuSystem_) {
            info_.globalMemSize_   +=
                (static_cast<cl_ulong>(calAttr.uncachedRemoteRAM) * Mi) / 2;
        }
        // Check if runtime has to reserve address space for testing
        if (settings().use64BitPtr_ && settings().preallocAddrSpace_ &&
            (info_.globalMemSize_ > ReservedAdressSpaceSize)) {
            info_.globalMemSize_ -= ReservedAdressSpaceSize;
        }
        else {
            reinterpret_cast<gpu::Settings*>(settings_)->preallocAddrSpace_ = false;
        }

        // We try to calculate the largest available memory size from
        // the largest available block in either heap.  In theory this
        // should be the size we can actually allocate at application
        // start.  Note that it may not be a guarantee still as the
        // application progresses.
        // Widen to cl_ulong BEFORE multiplying, otherwise the product is
        // computed in a narrower type and can wrap for blocks of 4GB or more
        info_.maxMemAllocSize_ = std::max(
            static_cast<cl_ulong>(calStatus.largestBlockVisibleHeap) *
                static_cast<cl_ulong>(Mi),
            static_cast<cl_ulong>(calStatus.largestBlockInvisibleHeap) *
                static_cast<cl_ulong>(Mi));

        info_.maxMemAllocSize_ = cl_ulong(info_.maxMemAllocSize_ *
            std::min(GPU_MAX_ALLOC_PERCENT, 100u) / 100u);

        //! \note Force max single allocation size.
        //! 4GB limit for the blit kernels and 64 bit optimizations.
        info_.maxMemAllocSize_ = std::min(info_.maxMemAllocSize_,
                static_cast<cl_ulong>(settings().maxAllocSize_));
    }
    else {
        uint    maxHeapSize = flagIsDefault(GPU_MAX_HEAP_SIZE) ? 50 : GPU_MAX_HEAP_SIZE;
        // Evaluate in cl_ulong for the same reason as above
        info_.globalMemSize_   =
            (static_cast<cl_ulong>(std::min(maxHeapSize, 100u)) *
            static_cast<cl_ulong>(calAttr.localRAM) / 100u) *
            static_cast<cl_ulong>(Mi);

        uint    maxAllocSize = flagIsDefault(GPU_MAX_ALLOC_PERCENT) ? 25 : GPU_MAX_ALLOC_PERCENT;
        info_.maxMemAllocSize_ = cl_ulong(info_.globalMemSize_ *
            std::min(maxAllocSize, 100u) / 100u);
    }

    if (info_.maxMemAllocSize_ < cl_ulong(128 * Mi)) {
        LogError("We are unable to get a heap large enough to support the OpenCL minimum "\
            "requirement for FULL_PROFILE");
    }

    info_.maxMemAllocSize_ = std::max(cl_ulong(128 * Mi),  info_.maxMemAllocSize_);

    // Clamp max single alloc size to the globalMemSize since it's
    // reduced by default
    info_.maxMemAllocSize_ = std::min(info_.maxMemAllocSize_, info_.globalMemSize_);

    // We need to verify that we are not reporting more global memory
    // than 4x single alloc
    info_.globalMemSize_ = std::min( 4 * info_.maxMemAllocSize_, info_.globalMemSize_);

    // Use 64 bit pointers
    if (settings().use64BitPtr_) {
        info_.addressBits_  = 64;
    }
    else {
        info_.addressBits_  = 32;
        // Limit total size with 3GB for 32 bit
        info_.globalMemSize_ = std::min(info_.globalMemSize_, cl_ulong(3 * Gi));
    }

    // Alignment in BYTES of the base address of any allocated memory object;
    // the value reported below is converted to bits
    static const size_t MemBaseAlignment = 256;
    //! @note Force 256 bytes alignment, since currently
    //! calAttr.surface_alignment returns 4KB. For pinned memory runtime
    //! should be able to create a view with 256 bytes alignment
    info_.memBaseAddrAlign_ = 8 * MemBaseAlignment;

    info_.maxConstantBufferSize_ = 64 * Ki;
    info_.maxConstantArgs_       = MaxConstArguments;

    // Image support fields
    if (settings().imageSupport_) {
        info_.imageSupport_      = CL_TRUE;
        info_.maxSamplers_       = MaxSamplers;
        info_.maxReadImageArgs_  = MaxReadImage;
        info_.maxWriteImageArgs_ = MaxWriteImage;
        info_.image2DMaxWidth_   = static_cast<size_t>(getMaxTextureSize());
        info_.image2DMaxHeight_  = static_cast<size_t>(getMaxTextureSize());
        info_.image3DMaxWidth_   = std::min(2 * Ki, static_cast<size_t>(getMaxTextureSize()));
        info_.image3DMaxHeight_  = std::min(2 * Ki, static_cast<size_t>(getMaxTextureSize()));
        info_.image3DMaxDepth_   = std::min(2 * Ki, static_cast<size_t>(getMaxTextureSize()));

        info_.imagePitchAlignment_       = 256; // XXX: 256 pixel pitch alignment for now
        info_.imageBaseAddressAlignment_ = 256; // XXX: 256 byte base address alignment for now

        info_.bufferFromImageSupport_ = (heap()->isVirtual()) ? CL_TRUE : CL_FALSE;
    }

    info_.errorCorrectionSupport_    = CL_FALSE;

    if (settings().apuSystem_) {
        info_.hostUnifiedMemory_ = CL_TRUE;
    }

    info_.profilingTimerResolution_  = 1;
    info_.profilingTimerOffset_      = amd::Os::offsetToEpochNanos();
    info_.littleEndian_              = CL_TRUE;
    info_.available_                 = CL_TRUE;
    info_.compilerAvailable_         = CL_TRUE;
    info_.linkerAvailable_           = CL_TRUE;

    info_.executionCapabilities_     = CL_EXEC_KERNEL;
    if (settings().oclVersion_ >= OpenCL20) {
        info_.svmCapabilities_       = CL_DEVICE_SVM_COARSE_GRAIN_BUFFER;
        if (settings().svmAtomics_) {
            info_.svmCapabilities_ |= CL_DEVICE_SVM_ATOMICS;
        }
    }
    info_.preferredPlatformAtomicAlignment_ = 0;
    info_.preferredGlobalAtomicAlignment_ = 0;
    info_.preferredLocalAtomicAlignment_ = 0;
    info_.queueProperties_           = CL_QUEUE_PROFILING_ENABLE;

    info_.platform_ = AMD_PLATFORM;

#if cl_amd_open_video
    // Open Video support
    // Decoder
    info_.openVideo_ = settings().openVideo_;
    info_.maxVideoSessions_ = calVideoAttr.max_decode_sessions;
    info_.numVideoAttribs_ = (calVideoAttr.data_size - 2 * sizeof(CALuint))
        / sizeof(CALvideoAttrib);
    info_.videoAttribs_ = const_cast<cl_video_attrib_amd*>(
        reinterpret_cast<const cl_video_attrib_amd*>(calVideoAttr.video_attribs));

    // Encoder
    info_.numVideoEncAttribs_ = (calVideoAttr.data_size - 2 * sizeof(CALuint))
        / sizeof(CALvideoEncAttrib);
    info_.videoEncAttribs_ = const_cast<cl_video_attrib_encode_amd*>(
        reinterpret_cast<const cl_video_attrib_encode_amd*>(calVideoAttr.video_enc_attribs));
#endif // cl_amd_open_video

    ::strcpy(info_.name_, hwInfo()->targetName_);
    ::strcpy(info_.vendor_, "Advanced Micro Devices, Inc.");
    ::snprintf(info_.driverVersion_, sizeof(info_.driverVersion_) - 1,
         AMD_BUILD_STRING "%s", (heap()->isVirtual()) ? " (VM)": "");

    info_.profile_ = "FULL_PROFILE";
    if (settings().oclVersion_ == OpenCL20) {
        info_.version_ = "OpenCL 2.0 " AMD_PLATFORM_INFO;
        info_.oclcVersion_ = "OpenCL C 2.0 ";
        info_.spirVersions_ = "1.2";
    }
    else if (settings().oclVersion_ == OpenCL12) {
        info_.version_ = "OpenCL 1.2 " AMD_PLATFORM_INFO;
        info_.oclcVersion_ = "OpenCL C 1.2 ";
        info_.spirVersions_ = "1.2";
    }
    else {
        info_.version_ = "OpenCL 1.0 " AMD_PLATFORM_INFO;
        info_.oclcVersion_ = "OpenCL C 1.0 ";
        info_.spirVersions_ = "";
        LogError("Unknown version for support");
    }

    // Fill workgroup info size
    info_.maxWorkGroupSize_     = settings().maxWorkGroupSize_;
    info_.maxWorkItemSizes_[0]  = info_.maxWorkGroupSize_;
    info_.maxWorkItemSizes_[1]  = info_.maxWorkGroupSize_;
    info_.maxWorkItemSizes_[2]  = info_.maxWorkGroupSize_;

    if (settings().hwLDSSize_ != 0) {
        info_.localMemType_ = CL_LOCAL;
        info_.localMemSize_ = settings().hwLDSSize_;
    }
    else {
        info_.localMemType_ = CL_GLOBAL;
        info_.localMemSize_ = 16 * Ki;
    }

    info_.extensions_   = getExtensionString();

    if (settings().checkExtension(ClExtAtomicCounters32)) {
        info_.maxAtomicCounters_    = MaxAtomicCounters;
    }

    // Unpack bus (bits 8..15), device (bits 3..7) and function (bits 0..2)
    info_.deviceTopology_.pcie.type = CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD;
    info_.deviceTopology_.pcie.bus = (calAttr.pciTopologyInformation&(0xFF<<8))>>8;
    info_.deviceTopology_.pcie.device = (calAttr.pciTopologyInformation&(0x1F<<3))>>3;
    info_.deviceTopology_.pcie.function = (calAttr.pciTopologyInformation&0x07);

    // strncpy() doesn't terminate the destination when the source fills it
    // completely, so copy one character less and terminate explicitly
    ::strncpy(info_.boardName_, calAttr.boardName, sizeof(info_.boardName_) - 1);
    info_.boardName_[sizeof(info_.boardName_) - 1] = '\0';

    // OpenCL1.2 device info fields
    info_.builtInKernels_ = "";
    info_.imageMaxBufferSize_ = MaxImageBufferSize;
    info_.imageMaxArraySize_ = MaxImageArraySize;
    info_.preferredInteropUserSync_ = true;
    info_.printfBufferSize_ = PrintfDbg::WorkitemDebugSize * info().maxWorkGroupSize_;

    if (settings().oclVersion_ >= OpenCL20) {
        // OpenCL2.0 device info fields
        info_.maxWriteImageArgs_        = MaxReadWriteImage;    //!< For compatibility
        info_.maxReadWriteImageArgs_    = MaxReadWriteImage;

        info_.maxPipePacketSize_ = info_.maxMemAllocSize_;
        info_.maxPipeActiveReservations_ = 16;
        info_.maxPipeArgs_ = 16;

        info_.queueOnDeviceProperties_ =
            CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE;
        info_.queueOnDevicePreferredSize_ = 16 * Ki;
        info_.queueOnDeviceMaxSize_ = 256 * Ki;
        info_.maxOnDeviceQueues_ = 1;
        info_.maxOnDeviceEvents_ = settings().numDeviceEvents_;
        info_.globalVariablePreferredTotalSize_ = static_cast<size_t>(info_.globalMemSize_);
        info_.maxGlobalVariableSize_ = static_cast<size_t>(info_.maxMemAllocSize_);
    }

    if (settings().checkExtension(ClAmdDeviceAttributeQuery)) {
        info_.simdPerCU_            = hwInfo()->simdPerCU_;
        info_.simdWidth_            = hwInfo()->simdWidth_;
        info_.simdInstructionWidth_ = hwInfo()->simdInstructionWidth_;
        info_.wavefrontWidth_       = calAttr.wavefrontSize;
        info_.globalMemChannels_    = calAttr.memBusWidth / 32;
        info_.globalMemChannelBanks_    = calAttr.numMemBanks;
        info_.globalMemChannelBankWidth_ = hwInfo()->memChannelBankWidth_;
        info_.localMemSizePerCU_    = hwInfo()->localMemSizePerCU_;
        info_.localMemBanks_        = hwInfo()->localMemBanks_;
        info_.gfxipVersion_         = hwInfo()->gfxipVersion_;
        info_.threadTraceEnable_    = settings().threadTraceEnable_;
    }
}

extern const char* SchedulerSourceCode;

//! Opens the GSL device with the given ordinal and builds the host-side
//! objects of this device: settings, a dummy context, the locks, the VA and
//! map caches, the heap object, the resource cache, the device info, the
//! HSA compiler handle or the blit program, and the SRD manager.
//!
//! The heap object is only allocated here; Heap::create() is called later
//! from initializeHeapResources().
//!
//! \param ordinal  Device ordinal passed to open()
//! \return true on success; false as soon as one step fails. Members that
//!         were already assigned stay assigned (the destructor deletes or
//!         releases them).
bool
Device::create(CALuint ordinal)
{
    appProfile_.init();

    // Open GSL device
    if (!open(ordinal, appProfile_.enableHighPerformanceState(), appProfile_.reportAsOCL12Device())) {
        return false;
    }

    // Update CAL target
    calTarget_ = getAttribs().target;
    hwInfo_ = &DeviceInfo[calTarget_];

    // Creates device settings
    settings_ = new gpu::Settings();
    gpu::Settings* gpuSettings = reinterpret_cast<gpu::Settings*>(settings_);
    if ((gpuSettings == NULL) || !gpuSettings->create(getAttribs()
#if cl_amd_open_video
          , getVideoAttribs()
#endif // cl_amd_open_video
          , appProfile_.reportAsOCL12Device()
        )) {
        return false;
    }

    amd::Context::Info  info = {0};
    std::vector<amd::Device*> devices;
    devices.push_back(this);

    // Create a dummy context that contains only this device
    context_ = new amd::Context(devices, info);
    if (context_ == NULL) {
        return false;
    }

    // Create the locks
    lockAsyncOps_ = new amd::Monitor("Device Async Ops Lock", true);
    if (NULL == lockAsyncOps_) {
        return false;
    }

    lockAsyncOpsForInitHeap_ = new amd::Monitor("Async Ops Lock For Initialization of Heap Resource", true);
    if (NULL == lockAsyncOpsForInitHeap_) {
        return false;
    }

    vgpusAccess_ = new amd::Monitor("Virtual GPU List Ops Lock", true);
    if (NULL == vgpusAccess_) {
        return false;
    }
    vaCacheAccess_ = new amd::Monitor("VA Cache Ops Lock", true);
    if (NULL == vaCacheAccess_) {
        return false;
    }
    vaCacheList_ = new std::list<VACacheEntry*>();
    if (NULL == vaCacheList_) {
        return false;
    }

    mapCache_ = new std::vector<amd::Memory*>();
    if (mapCache_ == NULL) {
        return false;
    }
    // Use just 1 entry by default for the map cache
    mapCache_->push_back(NULL);

    size_t  resourceCacheSize = settings().resourceCacheSize_;

    // Remember the requested heap size; the heap itself is created later
    heapSize_ = settings().heapSize_;

    // Check if BE supports virtual addressing mode
    if (isVmMode()) {
        heap_ = new VirtualHeap(*this);
        gpuSettings->largeHostMemAlloc_ = (NULL != heap_) ? true : false;
    }

    // If virtual heap allocation failed, then try static allocation
    if (heap_ == NULL) {
        heap_ = new Heap(*this);
        // Disable resource cache if VM is disabled
        resourceCacheSize = 0;
        if (NULL == heap_) {
            return false;
        }
    }

#ifdef DEBUG
    // Debug builds log which memory type and heap sizes are in use
    std::stringstream  message;
    if (settings().remoteAlloc_) {
        message << "Using *Remote* memory";
    }
    else {
        message << "Using *Local* memory";
    }
    if (!heap()->isVirtual()) {
        message << ": " << settings().heapSize_ / Mi << "MB, growth: " <<  \
            settings().heapSizeGrowth_ / Mi << "MB";
    }
    message << std::endl;
    LogInfo(message.str().c_str());
#endif // DEBUG

    // Create resource cache.
    // \note Cache must be created before any resource creation to avoid NULL check
    resourceCache_ = new ResourceCache(resourceCacheSize);
    if (NULL == resourceCache_) {
        return false;
    }

    // Fill the device info structure
    fillDeviceInfo(getAttribs(), getStatus()
#if cl_amd_open_video
        , getVideoAttribs()
#endif //cl_amd_open_video
    );

    // HSAIL / OpenCL 2.0 path: only the shared HSA compiler handle is
    // initialized here; the blit program for this path is created in
    // initializeHeapResources()
    if (settings().hsail_ || (settings().oclVersion_ == OpenCL20)) {
        if (NULL == hsaCompiler_) {
            // Optional override of the compiler library from the environment
            const char* library = getenv("HSA_COMPILER_LIBRARY");
            aclCompilerOptions opts = {
                sizeof(aclCompilerOptions_0_8),
                library,
                NULL,
                NULL,
                NULL,
                NULL,
                NULL,
                NULL,
                &::malloc,
                &::free
            };
            // Initialize the compiler handle
            acl_error   error;
            hsaCompiler_ = aclCompilerInit(&opts, &error);
            if (error != ACL_SUCCESS) {
                 LogError("Error initializing the compiler");
                 return false;
            }
        }
    }
    else {
        // Other configurations create the blit program right away
        blitProgram_ = new BlitProgram(context_);
        // Create blit programs
        if (blitProgram_ == NULL || !blitProgram_->create(this)) {
            delete blitProgram_;
            blitProgram_ = NULL;
            LogError("Couldn't create blit kernels!");
            return false;
        }
    }

    // Allocate SRD manager
    srdManager_ = new SrdManager(*this,
        std::max(HSA_IMAGE_OBJECT_SIZE, HSA_SAMPLER_OBJECT_SIZE), 64 * Ki);
    if (srdManager_ == NULL) {
        return false;
    }

    return true;
}

//! Performs the deferred, one-time creation of the heap and of the
//! resources that depend on it: engine table, scratch buffers, the heap
//! itself, the dummy page, the staged transfer buffers, the blit program
//! (HSAIL / OpenCL 2.0 only) and the synchronized transfer queue.
//!
//! The work runs under lockAsyncOpsForInitHeap_ and only on the first call;
//! later calls return true immediately.
//!
//! NOTE(review): heapInitComplete_ is set before the work is done, so a
//! call that follows a FAILED initialization also returns true - confirm
//! that callers never retry after a failure.
//!
//! \return true if the resources are available, false if any step failed
bool
Device::initializeHeapResources()
{
    amd::ScopedLock k(lockAsyncOpsForInitHeap_);
    if (!heapInitComplete_) {
        heapInitComplete_ = true;
        uint nEngines;
        gslEngineDescriptor engines[GSL_ENGINEID_MAX];
        queryDeviceEngines(&nEngines, engines);
        engines_.create(nEngines, engines, settings().numComputeRings_);

        // One scratch buffer per compute ring (at least one), or a single
        // shared one if the settings ask for it
        uint numComputeRings = engines_.numComputeRings();
        scratch_.resize((settings().useSingleScratch_) ? 1 : (numComputeRings ? numComputeRings : 1));

        // Initialize the number of mem object for the scratch buffer
        for (uint s = 0; s < scratch_.size(); ++s) {
            scratch_[s] = new ScratchBuffer((settings().siPlus_) ? 1 : info_.numberOfShaderEngines);
            if (NULL == scratch_[s]) {
                return false;
            }
        }

        // Complete initialization of the heap and other buffers
        if ((heap_ == NULL) || !heap_->create(heapSize_, settings().remoteAlloc_)) {
            LogError("Failed GPU heap creation");
            return false;
        }

        size_t dummySize = amd::Os::pageSize();
        if (heap()->isVirtual() && settings().preallocAddrSpace_) {
            dummySize = static_cast<size_t>(ReservedAdressSpaceSize - Mi);
        }

        // Allocate a dummy page for NULL pointer processing
        dummyPage_ = new(*context_) amd::Buffer(*context_, 0, dummySize);
        if (dummyPage_ == NULL) {
            // Nothing to release; must not fall through and dereference it
            return false;
        }
        if (!dummyPage_->create()) {
            dummyPage_->release();
            // Clear the pointer so that the destructor doesn't release the
            // object a second time
            dummyPage_ = NULL;
            return false;
        }

        Memory* devMemory = reinterpret_cast<Memory*>(dummyPage_->getDeviceMemory(*this));
        if (devMemory == NULL) {
            // Release memory
            dummyPage_->release();
            dummyPage_ = NULL;
            return false;
        }

        if (settings().stagedXferSize_ != 0) {
            // Initialize staged write buffers
            if (settings().stagedXferWrite_) {
                Resource::MemoryType type;
                if (settings().stagingWritePersistent_ && !settings().disablePersistent_) {
                    type = Resource::Persistent;
                } else {
                    type = Resource::RemoteUSWC;
                }
                xferWrite_ = new XferBuffers(*this, type,
                    amd::alignUp(settings().stagedXferSize_, heap()->granularityB()));
                if ((xferWrite_ == NULL) || !xferWrite_->create()) {
                    LogError("Couldn't allocate transfer buffer objects for write");
                    return false;
                }
            }

            // Initialize staged read buffers
            if (settings().stagedXferRead_) {
                xferRead_ = new XferBuffers(*this, Resource::Remote,
                    amd::alignUp(settings().stagedXferSize_, heap()->granularityB()));
                if ((xferRead_ == NULL) || !xferRead_->create()) {
                    LogError("Couldn't allocate transfer buffer objects for read");
                    return false;
                }
            }
        }

        // Delay compilation due to brig_loader memory allocation
        if (settings().hsail_ || (settings().oclVersion_ == OpenCL20)) {
            const char* scheduler = NULL;
            const char* ocl20 = NULL;
            if (settings().oclVersion_ == OpenCL20) {
                scheduler = SchedulerSourceCode;
                ocl20 = "-cl-std=CL2.0";
            }
            blitProgram_ = new BlitProgram(context_);
            // Create blit programs
            if (blitProgram_ == NULL ||
                !blitProgram_->create(this, scheduler, ocl20)) {
                delete blitProgram_;
                blitProgram_ = NULL;
                LogError("Couldn't create blit kernels!");
                return false;
            }
        }

        // Create a synchronized transfer queue.
        // The comma belongs to the optional video argument, so the call
        // stays well formed when cl_amd_open_video is disabled
        xferQueue_ = new VirtualGPU(*this);
        if (!(xferQueue_ && xferQueue_->create(
            false
    #if cl_amd_open_video
            , NULL
    #endif // cl_amd_open_video
            ))) {
            delete xferQueue_;
            xferQueue_ = NULL;
        }
        if (NULL == xferQueue_) {
            LogError("Couldn't create the device transfer manager!");
            return false;
        }
        xferQueue_->enableSyncedBlit();
    }
    return true;
}

//! Creates a new virtual GPU (command queue back-end) on this device.
//! The heap and its dependent resources are initialized first if that
//! hasn't happened yet.
//!
//! \param profiling           Forwarded to VirtualGPU::create()
//! \param interopQueue        Not used by this implementation
//! \param calVideoProperties  Forwarded to VirtualGPU::create()
//!                            (only if cl_amd_open_video)
//! \param deviceQueueSize     Forwarded to VirtualGPU::create()
//! \return the new virtual device, or NULL on any failure
device::VirtualDevice*
Device::createVirtualDevice(
    bool    profiling,
    bool    interopQueue
#if cl_amd_open_video
    , void* calVideoProperties
#endif // cl_amd_open_video
    , uint  deviceQueueSize
    )
{
    // Not safe to add a queue. So lock the device
    amd::ScopedLock k(lockAsyncOps());
    amd::ScopedLock lock(vgpusAccess());

    // Initialization of heap and other resources occur during the command queue creation time.
    if (!initializeHeapResources()) {
        return NULL;
    }

    VirtualGPU* queue = new VirtualGPU(*this);
    const bool  created = (queue != NULL) && queue->create(
        profiling
#if cl_amd_open_video
        , calVideoProperties
#endif // cl_amd_open_video
        , deviceQueueSize
        );

    if (!created) {
        delete queue;
        return NULL;
    }
    return queue;
}

bool
Device::reallocHeap(size_t size, bool remoteAlloc)
{
    size_t  heapSize    =  heapSize_ + ((size != 0) ?
        amd::alignUp(size, settings().heapSizeGrowth_) : 0);
    Heap*   oldHeap     = heap_;
    // Maximum heap limit size = reported size + internal memory
    size_t  maxHeapLimit = static_cast<size_t>(info().globalMemSize_) +
        // an extra 10MB for the alignments of allocations,
        // since the conformance test doesn't expect any
        10 * Mi;

    if ((settings().heapSizeGrowth_ == 0) ||
        // Allow the heap growth up to the global memory limit
        (heapSize_ + size > maxHeapLimit)) {
        return false;
    }
    heapSize = std::min(maxHeapLimit, heapSize);

    heap_ = new Heap(*this);

    // Make sure we have allocated a new global heap
    if (NULL == heap_) {
        heap_ = oldHeap;
        return false;
    }

    if (!heap_->create(heapSize, remoteAlloc)) {
        delete heap_;
        heap_ = oldHeap;
        return false;
    }

    // Copy the old heap to the new one
    if (!oldHeap->copyTo(heap_)) {
        delete heap_;
        heap_ = oldHeap;
        return false;
    }

    delete oldHeap;
    heapSize_ = heapSize;

    return true;
}

//! Creates a device program object.
//!
//! An HSAILProgram is built when the HSAIL setting is on or
//! the requested OpenCL version is 200, a Program otherwise.
//!
//! \return the new program object (an error is logged if it's NULL)
device::Program*
Device::createProgram(int oclVer)
{
    const bool useHsail = settings().hsail_ || (oclVer == 200);

    device::Program* program = useHsail
        ? static_cast<device::Program*>(new HSAILProgram(*this))
        : static_cast<device::Program*>(new Program(*this));

    if (NULL == program) {
        LogError("We failed memory allocation for program!");
    }

    return program;
}

//! Requested devices list as configured by the GPU_DEVICE_ORDINAL
typedef std::map<int, bool> requestedDevices_t;

//! Parses the requested list of devices to be exposed to the user.
//!
//! GPU_DEVICE_ORDINAL is read as a comma separated list of device indices.
//! A token is accepted only if it consists of decimal digits exclusively
//! and its value fits into an int. Empty tokens and invalid tokens are
//! skipped without any error.
//!
//! \param requestedDevices receives an entry (index -> true)
//!                         for every valid token
//!
//! \note The flag string is only read. The scan doesn't use strtok(),
//! which would overwrite the separators inside the flag storage
//! and depends on hidden global state.
static void
parseRequestedDeviceList(requestedDevices_t &requestedDevices) {
    const char* cursor = GPU_DEVICE_ORDINAL;
    if (cursor == NULL) {
        return;
    }

    // The largest int value, used to reject tokens that would overflow
    const int maxIndex = static_cast<int>(~0u >> 1);

    while (*cursor != '\0') {
        int     currentDeviceIndex = 0;
        // A token that starts at a separator is empty and thus ignored
        bool    deviceIdValid = (*cursor != ',');

        // Consume the token up to the next separator or the end of the list
        for (; (*cursor != '\0') && (*cursor != ','); ++cursor) {
            if (!deviceIdValid) {
                continue;
            }
            // isdigit() requires a value representable as unsigned char
            if (!isdigit(static_cast<unsigned char>(*cursor))) {
                deviceIdValid = false;
                continue;
            }
            const int digit = *cursor - '0';
            if (currentDeviceIndex > (maxIndex - digit) / 10) {
                // The index doesn't fit into int
                deviceIdValid = false;
                continue;
            }
            currentDeviceIndex = currentDeviceIndex * 10 + digit;
        }

        // Step over the separator
        if (*cursor == ',') {
            ++cursor;
        }

        if (deviceIdValid) {
            // Requested device is valid.
            requestedDevices[currentDeviceIndex] = true;
        }
    }
}

#if defined(_WIN32) && defined (DEBUG)
#include <cstdio>
#include <crtdbg.h>
//! CRT report hook for Windows debug builds: prints the report message
//! to stderr and terminates the process with exit code 3.
//! The report type and return value arguments are not used.
static int reportHook(int reportType, char *message, int *returnValue)
{
    fprintf(stderr, "%s", message);
    ::exit(3);
    // Not reached, the process exits above
    return 1;
}
#endif // _WIN32 & DEBUG

//! Initializes the compiler library, the OS layer and CAL, then creates
//! a Device object for every CAL device ordinal. A device is registered only
//! if its creation succeeded and it passes the optional GPU_DEVICE_ORDINAL
//! or GPU_DEVICE_NAME filter; otherwise the object is destroyed.
//!
//! \return the result computed for the last processed ordinal
//!         (false if CAL reports no devices)
//!
//! NOTE(review): the return value doesn't accumulate over all ordinals,
//! so an earlier registered device is not reflected if the last one fails
//! or is filtered out by the ordinal list - confirm this is intended.
bool
Device::init()
{
    CALuint     numDevices = 0;
    bool        result = false;
    bool	useDeviceList = false;
    requestedDevices_t requestedDevices;

    // Optional compiler library override, taken from the environment
    const char *library = getenv("COMPILER_LIBRARY");
    aclCompilerOptions opts = {
        sizeof(aclCompilerOptions_0_8),
        library,
        NULL,
        NULL,
        NULL,
        NULL,
        NULL,
        NULL,
        &::malloc,
        &::free
    };

    hsaCompiler_ = NULL;
    // NOTE(review): the result of aclCompilerInit() is not checked here
    compiler_ = aclCompilerInit(&opts, NULL);

#if defined(_WIN32) && !defined(_WIN64)
    // @todo: FIXME: remove this when CAL is fixed!!!
    // Save the current floating point control word, it's restored after calInit()
    unsigned int old, ignored;
    _controlfp_s(&old, 0, 0);
#endif // _WIN32 && !_WIN64
    // FIXME_lmoriche: needs cleanup
    osInit();
#if defined(_WIN32)
    //osAssertSetStyle(OSASSERT_STYLE_LOGANDEXIT);
#endif // WIN32

#if defined(_WIN32) && defined (DEBUG)
    // Route CRT reports to stderr instead of a message box on request
    if (::getenv("AMD_OCL_SUPPRESS_MESSAGE_BOX"))
    {
        _CrtSetReportHook(reportHook);
        _set_error_mode(_OUT_TO_STDERR);
   }
#endif // _WIN32 & DEBUG

    calInit();

#if defined(_WIN32) && !defined(_WIN64)
    // Restore the rounding and precision control saved before osInit()
    _controlfp_s(&ignored, old, _MCW_RC | _MCW_PC);
#endif // _WIN32 && !_WIN64

    // Get the total number of active devices
    // Count up all the devices in the system.
    numDevices = calGetDeviceCount();

    CALuint ordinal = 0;
    const char* selectDeviceByName = NULL;
    // The ordinal list takes precedence over the selection by name
    if (!flagIsDefault(GPU_DEVICE_ORDINAL)) {
        useDeviceList = true;
        parseRequestedDeviceList(requestedDevices);
    }
    else if (!flagIsDefault(GPU_DEVICE_NAME)) {
        selectDeviceByName = GPU_DEVICE_NAME;
    }

    // Loop through all active devices and initialize the device info structure
    for (; ordinal < numDevices; ++ordinal) {
        // Create the GPU device object
        Device *d = new Device();
        result = (NULL != d) && d->create(ordinal);
        // The device is created even if the ordinal list rejects it below
        if (useDeviceList) {
            result &= (requestedDevices.find(ordinal) != requestedDevices.end());
        }
        // Register the device if no name filter is active or
        // the device name is found inside of the filter string
        if (result &&
            ((NULL == selectDeviceByName) || ('\0' == selectDeviceByName[0]) ||
             (strstr(selectDeviceByName, d->info().name_) != NULL))) {
            d->registerDevice();
        }
        else {
            delete d;
        }
    }
    return result;
}

//! Shuts down the OS layer and CAL, then finalizes the compiler objects.
//! The HSA compiler is finalized only if it was created.
void
Device::tearDown()
{
    osExit();
    calShutdown();

    aclCompilerFini(compiler_);
    if (NULL == hsaCompiler_) {
        return;
    }
    aclCompilerFini(hsaCompiler_);
}

//! Allocates a block of the requested size from the global heap.
//! If the allocation fails on a non-virtual heap, all queues are stalled and
//! the heap is reallocated (first with the default memory type, then with
//! the reversed one as a temporary storage) before the allocation is retried.
//!
//! \param size the requested block size
//! \return the heap block or NULL if the heap couldn't satisfy the request
//!
//! @note This function must be lock protected from a caller
HeapBlock*
Device::allocHeapBlock(size_t size) const
{
    HeapBlock* hb = NULL;

    // Allocate the underlying heap block
    hb = heap_->alloc(size);

    // Virtual heap should never fail allocation
    if ((hb == NULL) && (!heap_->isVirtual())) {
        // Queues can't process commands,
        // while the global heap reallocation occurs.
        // So stall all queues and then reallocate the global heap
        ScopedLockVgpus lock(*this);

        // Wait for idle
        for (uint idx = 0; idx < vgpus().size(); ++idx) {
            vgpus()[idx]->waitAllEngines();
        }

        // Account memory alignment for the new allocation
        size_t  extraSpace = heap_->granularityB();
        if (size >= heap_->freeSpace()) {
            // Required extra space = requested size - free space
            extraSpace += size - heap_->freeSpace();
        }

        //! @note the const cast here looks bad, but the device object
        //  is lock protected above. The rest of the code
        //  doesn't change the device object.
        //  So the const methods can be safely used everywhere else.
        //  In general we should avoid changing the device object after initialization

        // Try to reallocate the heap with the same memory type
        if (const_cast<Device*>(this)->reallocHeap(extraSpace, settings().remoteAlloc_)) {
            hb = heap_->alloc(size);
        }

        if (hb == NULL) {
            // Use reversed memory type as a temporary storage
            bool    remoteAlloc = settings().remoteAlloc_ ^ true;

            // Try to reallocate the heap
            if (const_cast<Device*>(this)->reallocHeap(extraSpace, remoteAlloc)) {
                // Back to the default location of the global heap
                remoteAlloc ^= true;
                // A zero size keeps the heap size and only moves it back
                if (!const_cast<Device*>(this)->reallocHeap(0, remoteAlloc)) {
                    // NOTE(review): the line continuation embeds the indentation
                    // of the next line into the logged message
                    LogWarning("New memory type for the \
                        global heap after reallocation!");
                }
                hb = heap_->alloc(size);
            }
        }
    }

    return hb;
}

//! Returns the GPU memory object that backs the runtime memory on this device
gpu::Memory*
Device::getGpuMemory(amd::Memory* mem) const
{
    device::Memory* const devMemory = mem->getDeviceMemory(*this);
    return static_cast<gpu::Memory*>(devMemory);
}


//! Translates an OpenCL image format into the CAL format.
//! Asserts and falls back to the first entry of MemoryFormatMap
//! if the format isn't found.
CalFormat
Device::getCalFormat(const amd::Image::Format& format) const
{
    const size_t entryCount = sizeof(MemoryFormatMap) / sizeof(MemoryFormat);

    // Find CAL format
    for (size_t idx = 0; idx != entryCount; ++idx) {
        const MemoryFormat& entry = MemoryFormatMap[idx];
        if (format.image_channel_data_type !=
            entry.clFormat_.image_channel_data_type) {
            continue;
        }
        if (format.image_channel_order !=
            entry.clFormat_.image_channel_order) {
            continue;
        }
        return entry.calFormat_;
    }

    osAssert(0 && "We didn't find CAL resource format!");
    return MemoryFormatMap[0].calFormat_;
}

//! Translates a CAL format into the OpenCL image format.
//! Asserts and falls back to the first entry of MemoryFormatMap
//! if the format isn't found.
amd::Image::Format
Device::getOclFormat(const CalFormat& format) const
{
    const size_t entryCount = sizeof(MemoryFormatMap) / sizeof(MemoryFormat);

    // Find CL format
    for (size_t idx = 0; idx != entryCount; ++idx) {
        const MemoryFormat& entry = MemoryFormatMap[idx];
        if (format.type_ != entry.calFormat_.type_) {
            continue;
        }
        if (format.channelOrder_ != entry.calFormat_.channelOrder_) {
            continue;
        }
        return entry.clFormat_;
    }

    osAssert(0 && "We didn't find OCL resource format!");
    return MemoryFormatMap[0].clFormat_;
}

//! Creates a buffer without an owner (merge common code with createBuffer() ?)
//!
//! With a virtual heap the buffer is a standalone local resource, otherwise
//! it's a view into the global memory on top of a heap block.
//!
//! \return the new memory object or NULL on failure
gpu::Memory*
Device::createScratchBuffer(size_t size) const
{
    // Use virtual heap allocation
    if (heap()->isVirtual()) {
        Memory* scratch = new gpu::Memory(*this, size);
        if ((scratch != NULL) && scratch->create(Resource::Local)) {
            return scratch;
        }
        delete scratch;
        return NULL;
    }

    // The heap block allocation has to be locked, so a possible
    // reallocation won't occur twice and another thread can't destroy
    // a heap block before this allocation is finished
    amd::ScopedLock heapGuard(lockAsyncOps());

    HeapBlock* block = allocHeapBlock(size);
    if (NULL == block) {
        return NULL;
    }

    // Wrap the heap block
    Memory* scratch = new gpu::Memory(*this, *block);
    if (NULL == scratch) {
        return NULL;
    }

    // Create the resource as a view into the global memory
    Resource::ViewParams   viewParams;
    viewParams.offset_   = block->offset_;
    viewParams.size_     = block->size_;
    viewParams.resource_ = &(globalMem());
    viewParams.memory_   = NULL;
    if (!scratch->create(Resource::View, &viewParams)) {
        delete scratch;
        return NULL;
    }

    return scratch;
}

//! Creates the device memory for the owner as a view into the global heap.
//!
//! \return the new memory object, or NULL if there is no heap space left,
//!         the view creation failed or the HW interop emulation failed
gpu::Memory*
Device::createBufferFromHeap(amd::Memory& owner) const
{
    const size_t bytes = owner.getSize();

    // The heap block allocation has to be locked, so a possible
    // reallocation won't occur twice and another thread can't destroy
    // a heap block before this allocation is finished
    amd::ScopedLock heapGuard(lockAsyncOps());

    HeapBlock* block = allocHeapBlock(bytes);
    if (NULL == block) {
        LogError("We don't have enough video memory!");
        return NULL;
    }

    // Create a memory object on top of the heap block
    gpu::Memory* heapBuffer = new gpu::Memory(*this, owner, block);
    if (heapBuffer == NULL) {
        // Give the block back to the heap
        block->setMemory(NULL);
        block->free();
        return NULL;
    }

    Resource::ViewParams viewParams;
    viewParams.memory_      = NULL;
    viewParams.resource_    = &(globalMem());
    viewParams.size_        = block->size_;
    viewParams.offset_      = block->offset_;
    viewParams.owner_       = &owner;

    const bool viewReady = heapBuffer->create(Resource::View, &viewParams);
    if (!viewReady) {
        delete heapBuffer;
        return NULL;
    }

    // Interop memory needs the HW emulation path
    if (owner.isInterop() &&
        !heapBuffer->createInterop(Memory::InteropHwEmulation)) {
        LogError("HW interop creation failed!");
        delete heapBuffer;
        return NULL;
    }

    return heapBuffer;
}

//! Creates the device memory object for a buffer (or pipe) owner.
//!
//! Subbuffers are created as views of the parent's device memory.
//! Otherwise a gpu::Buffer resource is created when bufferAlloc is set or
//! the memory type is remote, trying in order: interop, persistent memory,
//! pinned host memory and finally a regular allocation of the selected type.
//! If no resource was created, the buffer is allocated from the global heap.
//!
//! \param owner        the runtime memory object
//! \param directAccess allows direct access (pinning) of the host memory
//! \param bufferAlloc  requests a standalone buffer resource
//!                     instead of a global heap block
//! \return the new memory object or NULL on failure
gpu::Memory*
Device::createBuffer(
    amd::Memory&    owner,
    bool            directAccess,
    bool            bufferAlloc) const
{
    // NOTE(review): 'size' isn't used below, the code calls owner.getSize()
    size_t  size = owner.getSize();
    gpu::Memory* gpuMemory;

    // Create resource
    bool result = false;

    if (owner.getType() == CL_MEM_OBJECT_PIPE) {
        // directAccess isn't needed as pipes shouldn't be host accessible for GPU
        directAccess = false;
    }

    // Subbuffer: create a view of the parent's device memory
    if (NULL != owner.parent()) {
        gpu::Memory*    gpuParent = getGpuMemory(owner.parent());
        if (NULL == gpuParent) {
            LogError("Can't get the owner object for subbuffer allocation");
            return NULL;
        }

        if (!heap()->isVirtual()) {
            bool    uhpAlloc =
                (owner.parent()->getMemFlags() & CL_MEM_USE_HOST_PTR) ? true : false;

            if (owner.parent()->getType() != CL_MEM_OBJECT_IMAGE1D_BUFFER) {
                //! \note This extra line is necessary to make sure that subbuffer
                //! allocation is a synch operation,
                //! due to a possible realloc of heap(no VM) or parent(UHP)
                amd::ScopedLock k(lockAsyncOps());

                //! @note: For now make sure the parent is allocated in the global heap
                //! or if it's the UHP optimization for prepinned memory
                if (((gpuParent->hb() == NULL) || uhpAlloc) &&
                    !owner.parent()->reallocedDeviceMemory(this)) {
                    if (reallocMemory(*owner.parent())) {
                        // Reallocation may have replaced the parent's device memory
                        gpuParent = getGpuMemory(owner.parent());
                    }
                    else {
                        LogError("Can't reallocate the owner object for subbuffer allocation");
                        return NULL;
                    }
                }

                return gpuParent->createBufferView(owner);
            }
            else {
                // The parent is an image1D buffer: the view is created
                // on the grandparent object.
                // NOTE(review): neither the grandparent nor its device memory
                // is checked for NULL here - confirm the callers guarantee both
                gpuParent = getGpuMemory(owner.parent()->parent());
                return gpuParent->createBufferView(*owner.parent()->parent());
            }
        }
        else {
            return gpuParent->createBufferView(owner);
        }
    }

    // Forced system memory and fine grain SVM buffers use remote memory
    Resource::MemoryType    type = (owner.forceSysMemAlloc() || (owner.getMemFlags() & CL_MEM_SVM_FINE_GRAIN_BUFFER)) ?
        Resource::Remote : Resource::Local;

    if (owner.getMemFlags() & CL_MEM_BUS_ADDRESSABLE_AMD) {
        type = Resource::BusAddressable;
    }
    else if (owner.getMemFlags() & CL_MEM_EXTERNAL_PHYSICAL_AMD) {
        type = Resource::ExternalPhysical;
    }

    // Use direct access if it's possible
    if (bufferAlloc || (type == Resource::Remote)) {
        bool    forceHeapAlloc = false;
        bool    remoteAlloc = false;
        // Internal means VirtualDevice!=NULL
        bool    internalAlloc = ((owner.getMemFlags() & CL_MEM_USE_HOST_PTR) &&
              (owner.getVirtualDevice() != NULL)) ? true : false;

        // Create a memory object
        gpuMemory = new gpu::Buffer(*this, owner, owner.getSize());
        if (NULL == gpuMemory) {
            return NULL;
        }

        // Check if owner is interop memory
        if (owner.isInterop()) {
            result = gpuMemory->createInterop(Memory::InteropDirectAccess);
        }
        else if (owner.getMemFlags() & CL_MEM_USE_PERSISTENT_MEM_AMD) {
            // Attempt to allocate from persistent heap
            result = gpuMemory->create(Resource::Persistent);
        }
        else if (directAccess || (type == Resource::Remote)) {
            // Check for system memory allocations
            if (owner.getMemFlags() & (CL_MEM_ALLOC_HOST_PTR | CL_MEM_USE_HOST_PTR)) {
                // Allocate remote memory if AHP allocation and context has just 1 device
                if ((owner.getMemFlags() & CL_MEM_ALLOC_HOST_PTR) &&
                    (owner.getContext().devices().size() == 1)) {
                    if (owner.getMemFlags() & (CL_MEM_READ_ONLY |
                        CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) {
                        // GPU will be reading from this host memory buffer,
                        // so assume Host write into it
                        type = Resource::RemoteUSWC;
                        remoteAlloc = true;
                    }
                }
                // Make sure owner has a valid hostmem pointer and it's not COPY
                if (!remoteAlloc && (owner.getHostMem() != NULL)) {
                    Resource::PinnedParams params;
                    params.owner_ = &owner;
                    params.gpu_ =
                        reinterpret_cast<VirtualGPU*>(owner.getVirtualDevice());

                    params.hostMemRef_  = owner.getHostMemRef();
                    params.size_        = owner.getHostMemRef()->size();
                    // Fall back to the object size if the host reference has none
                    if (0 == params.size_) {
                        params.size_ = owner.getSize();
                    }
                    // Create memory object
                    result = gpuMemory->create(Resource::Pinned, &params);

                    // If direct access failed
                    if (!result) {
                        // and VM off, then force a heap allocation
                        if (!heap()->isVirtual()) {
                            // Internal pinning doesn't need a heap allocation
                            if (!internalAlloc) {
                                forceHeapAlloc = true;
                            }
                        }
                        // Don't use cached allocation
                        // if size is bigger than max single alloc
                        if (owner.getSize() > info().maxMemAllocSize_) {
                            delete gpuMemory;
                            return NULL;
                        }
                    }
                }
            }
        }

        // Regular allocation of the selected memory type,
        // if none of the special paths above produced a resource
        if (!result && !forceHeapAlloc &&
            // Make sure it's not internal alloc
            !internalAlloc) {
            Resource::CreateParams  params;
            params.owner_ = &owner;

            // Create memory object
            result = gpuMemory->create(type, &params);

            // If allocation was successful
            if (result) {
                // Initialize if the memory is a pipe object
                if (owner.getType() == CL_MEM_OBJECT_PIPE) {
                    // Pipe initialize in order read_idx, write_idx, end_idx. Refer clk_pipe_t structure.
                    // Init with 3 DWORDS for 32bit addressing and 6 DWORDS for 64bit
                    size_t pipeInit[3] = {0 , 0, owner.asPipe()->getMaxNumPackets()};
                    gpuMemory->writeRawData(*xferQueue_, sizeof(pipeInit), pipeInit, true);
                }
                // If memory has direct access from host, then get CPU address
                if (gpuMemory->isHostMemDirectAccess() &&
                   (type != Resource::ExternalPhysical)) {
                    void* address = gpuMemory->map(NULL);
                    if (address != NULL) {
                        // Copy saved memory
                        if (owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) {
                            memcpy(address, owner.getHostMem(), owner.getSize());
                        }
                        // It should be safe to change the host memory pointer,
                        // because it's lock protected from the upper caller
                        owner.setHostMem(address);
                    }
                    else {
                        result = false;
                    }
                }
                // An optimization for CHP. Copy memory and destroy sysmem allocation
                else if ((gpuMemory->memoryType() != Resource::Pinned) &&
                         (owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) &&
                         (owner.getContext().devices().size() == 1)) {
                    amd::Coord3D    origin(0, 0, 0);
                    amd::Coord3D    region(owner.getSize());
                    static const bool Entire  = true;
                    if (xferMgr().writeBuffer(owner.getHostMem(),
                        *gpuMemory, origin, region, Entire)) {
                        // Clear CHP memory
                        owner.setHostMem(NULL);
                    }
                }
            }
        }

        // Nothing was created and the heap fallback isn't requested
        if (!result && !forceHeapAlloc) {
            delete gpuMemory;
            return NULL;
        }
    }

    // Fall back to an allocation from the global heap
    // NOTE(review): when forceHeapAlloc is set, the gpu::Buffer created above
    // is overwritten here without a delete - confirm it isn't leaked
    if (!result) {
        assert(!heap()->isVirtual() && "Can't have static heap allocation with VM");
        gpuMemory = createBufferFromHeap(owner);
    }

    return gpuMemory;
}

//! Creates the device memory object for an image owner.
//!
//! An image with an image parent becomes a view of the parent's device memory.
//! Otherwise a gpu::Image is created, trying in order: interop, image buffer
//! on top of the parent buffer, pinned host memory (AHP with direct access)
//! and finally a persistent or regular allocation. On success the byte pitch
//! of the runtime image is updated from the created resource.
//!
//! \param owner        the runtime memory object, must be an image
//! \param directAccess allows pinning of the host memory for AHP images
//! \return the new memory object or NULL on failure
gpu::Memory*
Device::createImage(amd::Memory& owner, bool directAccess) const
{
    // NOTE(review): 'size' isn't used below
    size_t  size = owner.getSize();
    amd::Image& image = *owner.asImage();
    gpu::Memory* gpuImage = NULL;
    CalFormat   format = getCalFormat(image.getImageFormat());

    // Image created from another image: make a view of the parent
    if ((NULL != owner.parent()) && (owner.parent()->asImage() != NULL)) {
        device::Memory* devParent = owner.parent()->getDeviceMemory(*this);
        if (NULL == devParent) {
            LogError("Can't get the owner object for image view allocation");
            return NULL;
        }
        // Create a view on the specified device
        return (gpu::Memory*)createView(owner, *devParent);
    }

    gpuImage = new gpu::Image(*this, owner,
        image.getWidth(),
        image.getHeight(),
        image.getDepth(),
        format.type_,
        format.channelOrder_,
        image.getType());

    // Create resource
    if (NULL != gpuImage) {
        // Image1D buffer or a 2D image created on top of a buffer
        const bool imageBuffer =
            ((owner.getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER) ||
             ((owner.getType() == CL_MEM_OBJECT_IMAGE2D) &&
              (owner.parent() != NULL) &&
              (owner.parent()->asBuffer() != NULL)));
        bool result = false;

        // Check if owner is interop memory
        if (owner.isInterop()) {
            result = gpuImage->createInterop(Memory::InteropDirectAccess);
        }
        else if (imageBuffer) {
            Resource::ImageBufferParams  params;
            gpu::Memory* buffer = reinterpret_cast<gpu::Memory*>
                (image.parent()->getDeviceMemory(*this));
            if (buffer == NULL) {
                LogError("Buffer creation for ImageBuffer failed!");
                delete gpuImage;
                return NULL;
            }
            params.owner_       = &owner;
            params.resource_    = buffer;
            params.memory_      = buffer;

            // Create memory object
            result = gpuImage->create(Resource::ImageBuffer, &params);
        }
        else if (directAccess && (owner.getMemFlags() & CL_MEM_ALLOC_HOST_PTR)) {
            Resource::PinnedParams  params;
            params.owner_       = &owner;
            params.hostMemRef_  = owner.getHostMemRef();
            params.size_        = owner.getHostMemRef()->size();

            // Create memory object
            result = gpuImage->create(Resource::Pinned, &params);
        }

        // Regular allocation if no special path succeeded.
        // A failed interop creation has no fallback
        if (!result && !owner.isInterop()) {
            if (owner.getMemFlags() & CL_MEM_USE_PERSISTENT_MEM_AMD) {
                // Attempt to allocate from persistent heap
                result = gpuImage->create(Resource::Persistent);
            }
            else {
                Resource::MemoryType    type = (owner.forceSysMemAlloc()) ?
                    Resource::RemoteUSWC : Resource::Local;
                // Create memory object
                result = gpuImage->create(type);
            }
        }

        if (!result) {
            delete gpuImage;
            return NULL;
        }
        // An optimization for CHP with a single device context:
        // upload the host data and drop the system memory copy
        else if ((gpuImage->memoryType() != Resource::Pinned) &&
                 (owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) &&
                 (owner.getContext().devices().size() == 1)) {
            // Ignore copy for image1D_buffer, since it was already done for buffer
            if (heap()->isVirtual() && imageBuffer) {
                // Clear CHP memory
                owner.setHostMem(NULL);
            }
            else if (!imageBuffer) {
                amd::Coord3D    origin(0, 0, 0);
                static const bool Entire  = true;
                if (xferMgr().writeImage(owner.getHostMem(),
                    *gpuImage, origin, image.getRegion(), 0, 0, Entire)) {
                    // Clear CHP memory
                    owner.setHostMem(NULL);
                }
            }
        }

        // Always true at this point, the failure case returned above.
        // Report the byte pitch of the created resource to the runtime image
        if (result) {
            gslMemObject temp = gpuImage->gslResource();
            size_t bytePitch = gpuImage->elementSize() * temp->getPitch();
            image.setBytePitch(bytePitch);
        }
    }

    return gpuImage;
}

//! Allocates cache memory on the card
//!
//! Dispatches to createBuffer() or createImage() depending on the owner
//! type and then attempts to pin the owner's host memory, unless
//! the created memory is pinned, remote or external physical already.
//!
//! \return the new device memory or NULL on failure
device::Memory*
Device::createMemory(
    amd::Memory&    owner) const
{
    //!@todo Remove the host pointer check when VM is always on.
    // Without VM use zero-copy transfers for sysmem allocations only
    const bool bufferAlloc = heap()->isVirtual() ||
        ((owner.getMemFlags() &
          (CL_MEM_ALLOC_HOST_PTR | CL_MEM_USE_HOST_PTR)) != 0);

    gpu::Memory* created = NULL;
    if (owner.asBuffer()) {
        const bool directAccess =
            (settings().hostMemDirectAccess_ & Settings::HostMemBuffer) != 0;
        created = createBuffer(owner, directAccess, bufferAlloc);
    }
    else if (owner.asImage()) {
        const bool directAccess =
            (settings().hostMemDirectAccess_ & Settings::HostMemImage) != 0;
        created = createImage(owner, directAccess);
    }
    else {
        LogError("Unknown memory type!");
    }

    if (NULL == created) {
        return created;
    }

    // These memory types don't require any pinning of system memory
    switch (created->memoryType()) {
    case Resource::Pinned:
    case Resource::Remote:
    case Resource::RemoteUSWC:
    case Resource::ExternalPhysical:
        return created;
    default:
        break;
    }

    // NOTE(review): the second probe repeats the owner's host memory check
    // for objects with a parent and can't change the result
    bool hasHostMem = (owner.getHostMem() != NULL);
    if (!hasHostMem && (NULL != owner.parent())) {
        hasHostMem = (owner.getHostMem() != NULL);
    }

    // Attempt to pin system memory if runtime didn't use direct access
    if (hasHostMem) {
        const size_t refSize = owner.getHostMemRef()->size();
        const size_t pinSize = (refSize != 0) ? refSize : owner.getSize();
        bool ok = created->pinSystemMemory(owner.getHostMem(), pinSize);
        //! \note: Ignore the pinning result for now
    }

    return created;
}

//! Creates a device sampler for the runtime sampler object.
//!
//! A device sampler is created only when the HSAIL setting is on or
//! the OpenCL version is 2.0 or higher; otherwise *sampler stays NULL.
//!
//! \return false if the device sampler creation failed, true otherwise
bool
Device::createSampler(const amd::Sampler& owner, device::Sampler** sampler) const
{
    *sampler = NULL;

    const bool needsDeviceSampler =
        settings().hsail_ || (settings().oclVersion_ >= OpenCL20);
    if (!needsDeviceSampler) {
        return true;
    }

    Sampler* created = new Sampler(*this);
    if ((created != NULL) && created->create(owner.state())) {
        *sampler = created;
        return true;
    }

    delete created;
    return false;
}

//! Reallocates the device memory of a buffer owner: a new buffer is created,
//! the old memory object is moved into it and the owner's host memory is
//! pinned again if possible. Images and memory that already sits in a heap
//! block are left untouched.
//!
//! \return true if the memory was reallocated or no reallocation was needed
//!
//! \note reallocMemory() must be called only from outside of
//! VirtualGPU submit commands methods.
//! Otherwise a deadlock in lockVgpus() is possible

bool
Device::reallocMemory(amd::Memory& owner) const
{
    bool directAccess   = false;
    bool bufferAlloc    = heap()->isVirtual();

    // For now we have to serialize reallocation code
    amd::ScopedLock lk(*lockAsyncOps_);

    // Read device memory after the lock,
    // since realloc from another thread can replace the pointer
    gpu::Memory*  gpuMemory = getGpuMemory(&owner);
    if (gpuMemory == NULL) {
        return false;
    }
    // Memory with a heap block doesn't need any reallocation
    if (gpuMemory->hb() != NULL) {
        return true;
    }

    if (bufferAlloc) {
        // Nothing to do with a zero pin offset
        if (gpuMemory->pinOffset() == 0) {
            return true;
        }
        // Reallocate the parent first
        else if (NULL != owner.parent()) {
            if (!reallocMemory(*owner.parent())) {
                return false;
            }
        }
    }

    if (owner.asBuffer()) {
        // Disable remote allocation if no VM
        if ((gpuMemory != NULL) &&
            ((gpuMemory->memoryType() == Resource::Remote) ||
             (gpuMemory->memoryType() == Resource::RemoteUSWC)) && !bufferAlloc) {
            // Make sure we don't have a stale memory in VA cache before reallocation
            // of system memory.
            // \note: the app must unmap() memory before kernel launch
            removeVACache(gpuMemory);
            static const bool forceAllocHostMem = true;
            static const bool forceCopy = true;
            owner.allocHostMemory(owner.getHostMem(), forceAllocHostMem, forceCopy);
        }
        // From here gpuMemory refers to the new buffer (NULL on failure)
        gpuMemory = createBuffer(owner, directAccess, bufferAlloc);
    }
    else if (owner.asImage()) {
        // Images aren't reallocated
        return true;
    }
    else {
        // NOTE(review): gpuMemory still refers to the current device memory
        // on this path, so the move below would target the same object
        LogError("Unknown memory type!");
    }

    if (gpuMemory != NULL) {
        gpu::Memory* newMemory = gpuMemory;
        gpu::Memory* oldMemory = getGpuMemory(&owner);

        // Transfer the object
        if (oldMemory != NULL) {
            if (!oldMemory->moveTo(*newMemory)) {
                delete newMemory;
                return false;
            }
        }

        // Attempt to pin system memory
        // NOTE(review): the parent branch of the condition repeats
        // the owner's host memory check and can't change the result
        if ((newMemory->memoryType() != Resource::Pinned) &&
            ((owner.getHostMem() != NULL) ||
             ((NULL != owner.parent()) && (owner.getHostMem() != NULL)))) {
            bool ok = newMemory->pinSystemMemory(
                owner.getHostMem(), (owner.getHostMemRef()->size()) ?
                owner.getHostMemRef()->size() : owner.getSize());
            //! \note: Ignore the pinning result for now
        }

        return true;
    }

    return false;
}

//! Creates an image view of the parent device memory for the owner image.
//!
//! The view covers level 0 and layer 0 of the parent resource.
//!
//! \param owner  the runtime image object of the view (images only)
//! \param parent the device memory the view is created on
//! \return the new image view or NULL on failure
device::Memory*
Device::createView(amd::Memory& owner, const device::Memory& parent) const
{
    assert((owner.asImage() != NULL) && "View supports images only");
    const amd::Image& image = *owner.asImage();
    CalFormat   calFormat = getCalFormat(image.getImageFormat());

    gpu::Memory* view = new gpu::Image(*this, owner,
        image.getWidth(),
        image.getHeight(),
        image.getDepth(),
        calFormat.type_,
        calFormat.channelOrder_,
        image.getType());
    if (NULL == view) {
        return view;
    }

    const gpu::Memory& parentMemory = static_cast<const gpu::Memory&>(parent);

    // The view shares the resource and the memory of the parent
    Resource::ImageViewParams   viewParams;
    viewParams.owner_       = &owner;
    viewParams.level_       = 0;
    viewParams.layer_       = 0;
    viewParams.resource_    = &parentMemory;
    viewParams.memory_      = &parentMemory;
    viewParams.gpu_ = reinterpret_cast<VirtualGPU*>(owner.getVirtualDevice());

    // Create memory object
    if (!view->create(Resource::ImageView, &viewParams)) {
        delete view;
        return NULL;
    }

    return view;
}


//! Attempt to bind with external graphics API's device/context
bool
Device::bindExternalDevice(
    intptr_t type, void* pDevice, void* pContext, bool validateOnly)
{
    assert(pDevice);

    switch (type) {
#ifdef _WIN32
    case CL_CONTEXT_D3D10_DEVICE_KHR:
        // There is no need to perform full initialization here
        // if the GSLDevice is still uninitialized.
        // Only adapter initialization is required
        // to validate D3D10 interoperability.
        PerformAdapterInitialization();

        // Associate GSL-D3D
        if (!associateD3D10Device(
            reinterpret_cast<ID3D10Device*>(pDevice))) {
            LogError("Failed gslD3D10Associate()");
            return false;
        }
        break;
    case CL_CONTEXT_D3D11_DEVICE_KHR:
        // There is no need to perform full initialization here
        // if the GSLDevice is still uninitialized.
        // Only adapter initialization is required to validate
        // D3D11 interoperability.
        PerformAdapterInitialization();

        // Associate GSL-D3D
        if (!associateD3D11Device(
            reinterpret_cast<ID3D11Device*>(pDevice))) {
            LogError("Failed gslD3D11Associate()");
            return false;
        }
        break;
    case CL_CONTEXT_ADAPTER_D3D9_KHR:
        PerformAdapterInitialization();

        // Associate GSL-D3D
        if (!associateD3D9Device(
            reinterpret_cast<IDirect3DDevice9*>(pDevice))) {
            LogWarning("D3D9<->OpenCL adapter mismatch or D3D9Associate() failure");
            return false;
        }
        break;
    case CL_CONTEXT_ADAPTER_D3D9EX_KHR:
        PerformAdapterInitialization();

        // Associate GSL-D3D
        if (!associateD3D9Device(
            reinterpret_cast<IDirect3DDevice9Ex*>(pDevice))) {
            LogWarning("D3D9<->OpenCL adapter mismatch or D3D9Associate() failure");
            return false;
        }
        break;
    case CL_CONTEXT_ADAPTER_DXVA_KHR:
        break;
#endif //_WIN32
    case CL_GL_CONTEXT_KHR:
    {

        // There is no need to perform full initialization here
        // if the GSLDevice is still uninitialized.
        // Only adapter initialization is required to validate
        // GL interoperability.
        PerformAdapterInitialization();

        // Attempt to associate GSL-OGL
        if (!glAssociate((CALvoid*)pContext, pDevice)) {
            if (!validateOnly) {
                LogError("Failed gslGLAssociate()");
            }
            return false;
        }
    }
        break;
    default:
        LogError("Unknown external device!");
        return false;
        break;
    }

    return true;
}

//! Undoes the association made for an external GL device/context.
//!
//! Only CL_GL_CONTEXT_KHR requires any work; every other \a type and
//! a NULL \a pDevice succeed trivially.
//!
//! \return false only if the GL dissociation failed.
bool
Device::unbindExternalDevice(intptr_t type, void* pDevice, void* pContext, bool validateOnly)
{
    // Nothing to dissociate for non-GL interop or without a device handle
    if ((type != CL_GL_CONTEXT_KHR) || (pDevice == NULL)) {
        return true;
    }

    // Dissociate GSL-OGL
    if (true == glDissociate(pContext, pDevice)) {
        return true;
    }

    // NOTE(review): the warning is emitted when validateOnly is set, which
    // is the opposite of the logging condition in bindExternalDevice();
    // confirm that this is intended.
    if (validateOnly) {
        LogWarning("Failed gslGLDiassociate()");
    }
    return false;
}

//! Forwards a map target allocation request to the GPU memory object
//! that backs \a mem on this device.
//!
//! \return The pointer produced by gpu::Memory::allocMapTarget(), or NULL
//!         if no GPU memory object could be obtained for \a mem.
void*
Device::allocMapTarget(
    amd::Memory&        mem,
    const amd::Coord3D& origin,
    const amd::Coord3D& region,
    size_t*             rowPitch,
    size_t*             slicePitch)
{
    // Look up the device memory that corresponds to the abstraction object
    gpu::Memory* gpuMem = getGpuMemory(&mem);

    if (NULL != gpuMem) {
        // The memory object itself knows how to provide the map target
        return gpuMem->allocMapTarget(origin, region, rowPitch, slicePitch);
    }

    LogError("allocMapTarget failed. Can't allocate video memory");
    return NULL;
}

bool
Device::globalFreeMemory(size_t* freeMemory) const
{
    const uint  TotalFreeMemory = 0;
    const uint  LargestFreeBlock = 1;

    // Initialization of heap and other resources because getMemInfo needs it.
    if (!(const_cast<Device*>(this)->initializeHeapResources())) {
        return false;
    }
    if (heap()->isVirtual()) {
        gslMemInfo memInfo = {0};
        getMemInfo(&memInfo);

         // Fill free memory info
        freeMemory[TotalFreeMemory] = (memInfo.cardMemAvailableBytes +
            memInfo.cardExtMemAvailableBytes) / Ki;
        freeMemory[LargestFreeBlock] = std::max(memInfo.cardLargestFreeBlockBytes,
           memInfo.cardExtLargestFreeBlockBytes) / Ki;
    }
    else {
        freeMemory[TotalFreeMemory] = static_cast<size_t>((info().globalMemSize_ -
            static_cast<cl_ulong>(heapSize_) + heap()->freeSpace()) / Ki);
        freeMemory[LargestFreeBlock] = freeMemory[TotalFreeMemory];
    }

    return true;
}

//! Registers the host address range of \a memory in the VA cache.
//!
//! Memory without direct host access is ignored. A range whose start
//! address is already covered by a cached entry isn't added again and
//! is reported as an error.
void
Device::addVACache(Memory* memory) const
{
    // Only system memory with direct access is tracked
    if (!memory->isHostMemDirectAccess()) {
        return;
    }

    // VA cache access must be serialised
    amd::ScopedLock lk(*vaCacheAccess_);

    void*   rangeStart = memory->owner()->getHostMem();
    void*   rangeEnd = reinterpret_cast<address>(rangeStart) +
        memory->owner()->getSize();

    // The offset is a mandatory output of the lookup, but isn't used here
    size_t  ignoredOffset;
    if (findMemoryFromVA(rangeStart, &ignoredOffset) != NULL) {
        LogError("Unexpected double map() call from the app!");
        return;
    }

    // The range is unknown so far, remember it
    VACacheEntry*   newEntry = new VACacheEntry(rangeStart, rangeEnd, memory);
    if (newEntry != NULL) {
        vaCacheList_->push_back(newEntry);
    }
}

void
Device::removeVACache(const Memory* memory) const
{
    // Make sure system memory has direct access
    if (memory->isHostMemDirectAccess() && memory->owner()) {
        // VA cache access must be serialised
        amd::ScopedLock lk(*vaCacheAccess_);
        void*   start = memory->owner()->getHostMem();
        void*   end = reinterpret_cast<address>(start) + memory->owner()->getSize();

        // Find VA cache entry for the specified memory
        std::list<VACacheEntry*>::const_iterator    it;
        for (it = vaCacheList_->begin(); it != vaCacheList_->end(); ++it) {
            VACacheEntry*   entry = *it;
            if (entry->startAddress_ == start) {
                CondLog((entry->endAddress_ != end), "Incorrect VA range");
                vaCacheList_->remove(entry);
                delete entry;
                break;
            }
        }
    }
}

//! Finds the cached memory object whose host address range contains \a ptr.
//!
//! \param ptr     Host address to look up.
//! \param offset  Receives the byte distance of \a ptr from the start of
//!                the matching range; left untouched if nothing matches.
//! \return The memory object of the matching entry or NULL.
Memory*
Device::findMemoryFromVA(const void* ptr, size_t* offset) const
{
    // VA cache access must be serialised
    amd::ScopedLock lk(*vaCacheAccess_);

    std::list<VACacheEntry*>::const_iterator    pos = vaCacheList_->begin();
    while (pos != vaCacheList_->end()) {
        VACacheEntry*   candidate = *pos;
        ++pos;

        // The range is half-open: [startAddress_, endAddress_)
        if ((ptr < candidate->startAddress_) || (ptr >= candidate->endAddress_)) {
            continue;
        }

        *offset = static_cast<size_t>(reinterpret_cast<const char*>(ptr) -
            reinterpret_cast<char*>(candidate->startAddress_));
        return candidate->memory_;
    }

    return NULL;
}

//! Takes a map target of at least \a size bytes out of the map cache.
//!
//! An entry with exactly the requested size is preferred, otherwise the
//! smallest entry that is bigger than \a size is used. The selected entry
//! is removed from the cache and its device memory is mapped. If no entry
//! is big enough, the biggest of the too small entries is released to make
//! room in the cache.
//!
//! \return The map target, or NULL if no suitable entry was found or the
//!         selected entry couldn't be mapped (it is released in that case).
amd::Memory*
Device::findMapTarget(size_t size) const
{
    // Must be serialised. Global async is too conservative
    amd::ScopedLock lk(*lockAsyncOps_);

    amd::Memory*    map = NULL;
    size_t          minSize = 0;
    size_t          maxSize = 0;
    // mapCache_->size() means "not found" for both indices
    uint            mapId = mapCache_->size();
    uint            releaseId = mapCache_->size();

    // Find if the list has a map target of appropriate size
    for (uint i = 0; i < mapCache_->size(); i++) {
        if ((*mapCache_)[i] == NULL) {
            continue;
        }
        const size_t entrySize = (*mapCache_)[i]->getSize();

        if (size < entrySize) {
            // The entry is bigger than requested: track the smallest one
            if ((minSize == 0) || (minSize > entrySize)) {
                minSize = entrySize;
                mapId = i;
            }
        }
        else if (size == entrySize) {
            // Requested size matches the entry size, stop searching
            mapId = i;
            break;
        }
        else if (maxSize < entrySize) {
            // The entry is too small: track the biggest one for release
            maxSize = entrySize;
            releaseId = i;
        }
    }

    // Check if we found any map target
    if (mapId < mapCache_->size()) {
        map = (*mapCache_)[mapId];
        (*mapCache_)[mapId] = NULL;
        Memory*     gpuMemory = reinterpret_cast<Memory*>
            (map->getDeviceMemory(*this));

        // Get the base pointer for the map resource
        if ((gpuMemory == NULL) || (NULL == gpuMemory->map(NULL))) {
            // The cache slot was cleared above, so the object has to be
            // released through the local pointer and not through the slot
            map->release();
            map = NULL;
        }
    }
    // Nothing suitable was found, release the biggest unsuitable map target
    else if (releaseId < mapCache_->size()) {
        (*mapCache_)[releaseId]->release();
        (*mapCache_)[releaseId] = NULL;
    }

    return map;
}

//! Puts \a memory into the map cache for a later reuse as a map target.
//!
//! The first empty slot of the cache is reused; the cache grows only if
//! all slots are occupied.
//!
//! \return false if the memory object can't be cached, true otherwise.
bool
Device::addMapTarget(amd::Memory* memory) const
{
    // Must be serialised. Global async is too conservative
    amd::ScopedLock lk(*lockAsyncOps_);

    // Memory that can't be cached (for example SVM) is rejected
    if (!memory->canBeCached()) {
        return false;
    }

    // Walk to the first free slot, if there is one
    uint slot = 0;
    while ((slot < mapCache_->size()) && ((*mapCache_)[slot] != NULL)) {
        ++slot;
    }

    if (slot < mapCache_->size()) {
        (*mapCache_)[slot] = memory;
    }
    else {
        // All slots are in use, append a new one
        mapCache_->push_back(memory);
    }

    return true;
}

//! Destroys the scratch buffer object together with the memory objects
//! it still holds.
Device::ScratchBuffer::~ScratchBuffer()
{
    // Deletes every entry of memObjs_ and resets regNum_
    destroyMemory();
}

void
Device::ScratchBuffer::destroyMemory()
{
    for (uint i = 0; i < memObjs_.size(); ++i) {
        // Release memory object
        delete memObjs_[i];
        memObjs_[i] = NULL;
    }
    regNum_ = 0;
}

//! Makes sure that the scratch buffer of the HW ring used by \a vgpu
//! can hold \a regNum scratch registers.
//!
//! The buffer is only ever grown: if the current one is too small, all
//! its memory objects are destroyed and reallocated with the new size
//! while all virtual GPUs are locked.
//!
//! \return false if a new scratch memory object couldn't be created.
bool
Device::allocScratch(uint regNum, const VirtualGPU* vgpu)
{
    // Kernels without scratch registers need nothing
    if (regNum == 0) {
        return true;
    }

    // Serialize the scratch buffer allocation code
    amd::ScopedLock lk(*lockAsyncOps_);
    uint    ring = vgpu->hwRing();

    // The current buffer is already big enough
    if (regNum <= scratch_[ring]->regNum_) {
        return true;
    }

    // Stall all command queues, since runtime will reallocate memory
    ScopedLockVgpus lock(*this);
    std::vector<Memory*>& mems = scratch_[ring]->memObjs_;

    // Calculate the size of the new buffer +
    // (64 Ki) for alignment with generic address space
    size_t size = calcScratchBufferSize(regNum) + 64 * Ki;

    // Drop the old objects; this also resets regNum_ to 0
    scratch_[ring]->destroyMemory();

    // Reallocate every memory object with the new size
    for (std::vector<Memory*>::iterator slot = mems.begin();
         slot != mems.end(); ++slot) {
        *slot = new gpu::Memory(*this, size);
        if ((*slot == NULL) || !(*slot)->create(Resource::Scratch)) {
            LogError("Couldn't allocate scratch memory");
            scratch_[ring]->regNum_ = 0;
            return false;
        }
    }

    scratch_[ring]->regNum_ = regNum;
    return true;
}

//! Prepares the device for the execution of \a kernel by allocating the
//! scratch memory the kernel needs.
//!
//! \param kernel  Kernel that is about to be executed.
//! \param vdev    Virtual device (a VirtualGPU) the kernel will run on.
//! \return false if scratch memory couldn't be allocated or if a kernel
//!         with dynamic parallelism has no default device queue to run
//!         its child kernels on.
bool
Device::validateKernel(const amd::Kernel& kernel, const device::VirtualDevice* vdev)
{
    // Find the number of scratch registers used in the kernel
    const device::Kernel* devKernel = kernel.getDeviceKernel(*this);
    uint regNum = static_cast<uint>(devKernel->workGroupInfo()->scratchRegs_);
    const VirtualGPU* vgpu = static_cast<const VirtualGPU*>(vdev);

    if (!allocScratch(regNum, vgpu)) {
        return false;
    }

    if (devKernel->hsa()) {
        const HSAILKernel* hsaKernel = static_cast<const HSAILKernel*>(devKernel);
        if (hsaKernel->dynamicParallelism()) {
            // Scratch for the device queue has to cover the biggest
            // scratch usage in the whole program
            amd::DeviceQueue*  defQueue =
                kernel.program().context().defDeviceQueue(*this);
            // The context may have no default device queue for this device
            if (defQueue == NULL) {
                return false;
            }
            vgpu = static_cast<VirtualGPU*>(defQueue->vDev());
            if (!allocScratch(hsaKernel->prog().maxScratchRegs(), vgpu)) {
                return false;
            }
        }
    }

    return true;
}

void
Device::destroyScratchBuffers()
{
    for (uint s = 0; s < scratch_.size(); ++s) {
        scratch_[s]->destroyMemory();
    }
}

//! Translates an OpenCL sampler state word into GSL sampler parameters
//! and writes the HW sampler state with fillSamplerHwState().
//!
//! \param state        Sampler state bits (amd::Sampler::State*).
//! \param hwState      Destination for the HW sampler state.
//! \param hwStateSize  Size of the destination, passed through unchanged.
void
Device::fillHwSampler(uint32_t state, void* hwState, uint32_t hwStateSize) const
{
    // Split the state word into its fields. The address field is taken
    // after the normalized-coords bits are removed and the filter field is
    // whatever remains once both other fields are cleared.
    const bool      unnormalized =
        (state & amd::Sampler::StateNormalizedCoordsMask) == 0;
    const uint32_t  noNormBits =
        state & ~amd::Sampler::StateNormalizedCoordsMask;
    const uint32_t  addressBits = noNormBits & amd::Sampler::StateAddressMask;
    const uint32_t  filterBits = noNormBits & ~amd::Sampler::StateAddressMask;

    // Address mode: clamp, none and any unknown value use the border clamp
    uint32_t    gslAddress = GSL_CLAMP_TO_BORDER;
    if (addressBits == amd::Sampler::StateAddressRepeat) {
        gslAddress = GSL_REPEAT;
    }
    else if (addressBits == amd::Sampler::StateAddressClampToEdge) {
        gslAddress = GSL_CLAMP_TO_EDGE;
    }
    else if (addressBits == amd::Sampler::StateAddressMirroredRepeat) {
        gslAddress = GSL_MIRRORED_REPEAT;
    }

    // Filter mode: nearest unless linear filtering is requested
    uint32_t    gslMinFilter = GSL_MIN_NEAREST;
    uint32_t    gslMagFilter = GSL_MAG_NEAREST;
    if (filterBits == amd::Sampler::StateFilterLinear) {
        gslMinFilter = GSL_MIN_LINEAR;
        gslMagFilter = GSL_MAG_LINEAR;
    }

    fillSamplerHwState(unnormalized, gslMinFilter, gslMagFilter,
        gslAddress, hwState, hwStateSize);
}

//! Provides host address space for an allocation of \a size bytes.
//!
//! \param size       Size of the range to reserve.
//! \param alignment  Alignment passed to the OS reservation call.
//! \param atomics    Not used by this implementation.
//! \return The result of amd::Os::reserveMemory().
void*
Device::hostAlloc(size_t size, size_t alignment, bool atomics) const
{
    // For a discrete GPU the address range is only reserved here (with no
    // access protection requested); nothing is committed yet.
    return amd::Os::reserveMemory(NULL, size, alignment, amd::Os::MEM_PROT_NONE);
}

//! Returns a host address range to the OS.
//!
//! \param ptr   Start of the range.
//! \param size  Size of the range in the same units as for hostAlloc().
void
Device::hostFree(void* ptr, size_t size) const
{
    // hostAlloc() obtains the range with amd::Os::reserveMemory(), so it
    // is handed back with the matching release call.
    amd::Os::releaseMemory(ptr, size);
}

//! Allocates a shared virtual memory (SVM) buffer.
//!
//! The allocation is backed by a hidden amd::Buffer, which is registered
//! in amd::SvmManager under its SVM pointer.
//!
//! \param context    Context the hidden buffer is created in.
//! \param size       Requested size; rounded up to the final alignment.
//! \param alignment  Requested alignment; raised to the device base address
//!                   alignment and to 64 Ki if it is smaller.
//! \param flags      SVM memory flags passed to the hidden buffer.
//! \return The SVM pointer of the buffer, or NULL if the buffer couldn't
//!         be created.
void*
Device::svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags) const
{
    // VAM for GPU needs 64K alignment for Tahiti and CI+,
    // will pull the info from gsl later
    const size_t vamAlignment = 64 * Ki;

    alignment = std::max(alignment, static_cast<size_t>(info_.memBaseAddrAlign_));
    if (alignment < vamAlignment) {
        alignment = vamAlignment;
    }
    size = amd::alignUp(size, alignment);

    // Create a hidden buffer, which will be allocated on the device later
    amd::Memory* hidden = new (context) amd::Buffer(context, flags, size, reinterpret_cast<void*>(1));
    if (NULL == hidden) {
        LogError("failed to create a svm mem object!");
        return NULL;
    }

    if (!hidden->create(NULL, false)) {
        LogError("failed to create a svm hidden buffer!");
        hidden->release();
        return NULL;
    }

    // Request the device memory for the buffer.
    // The result of the call isn't inspected here.
    getGpuMemory(hidden);

    // Register the buffer, so it can be found by its SVM pointer later
    amd::SvmManager::AddSvmBuffer(hidden->getSvmPtr(), hidden);

    return hidden->getSvmPtr();
}

void
Device::svmFree(void *ptr) const
{
    amd::Memory * svmMem = NULL;
    svmMem = amd::SvmManager::FindSvmBuffer(ptr);
    if (NULL != svmMem) {
        svmMem->release();
        amd::SvmManager::RemoveSvmBuffer(ptr);
    }
}


//! Destroys all chunks of the SRD pool: the chunk buffer is unmapped and
//! deleted, then the slot flags of the chunk are freed.
Device::SrdManager::~SrdManager()
{
    for (uint i = 0; i < pool_.size(); ++i) {
        // The buffer stays mapped for the whole lifetime of the chunk
        pool_[i].buf_->unmap(NULL);
        delete pool_[i].buf_;
        // flags_ is allocated with new[] in allocSrdSlot(),
        // hence it must be freed with the array form of delete
        delete [] pool_[i].flags_;
    }
}

//! Allocates a HW SRD slot for the sampler and fills it with the HW state
//! that corresponds to \a oclSamplerState.
//!
//! \return false if no SRD slot could be allocated.
bool
Sampler::create(
    uint32_t oclSamplerState)
{
    // A zero address means that the SRD manager has no slot for us
    hwSrd_ = dev_.srds().allocSrdSlot(&hwState_);
    const bool  haveSlot = (hwSrd_ != 0);

    if (haveSlot) {
        // Program the slot through its CPU address
        dev_.fillHwSampler(oclSamplerState, hwState_, HSA_SAMPLER_OBJECT_SIZE);
    }

    return haveSlot;
}

//! Returns the SRD slot of the sampler to the SRD manager of the device.
Sampler::~Sampler()
{
    // NOTE(review): create() leaves hwSrd_ at 0 when the slot allocation
    // fails - confirm that freeSrdSlot() is fine with a zero address.
    dev_.srds().freeSrdSlot(hwSrd_);
}

//! Allocates one SRD slot from the pool of chunks.
//!
//! Every chunk keeps a bit mask per group of slots, where a set bit marks
//! a free slot. The first free slot of the first chunk that has one is
//! taken; a new chunk is allocated when all existing chunks are full.
//!
//! \param cpuAddr  Receives the CPU address of the slot.
//! \return The GPU virtual address of the slot, or 0 if a new chunk
//!         couldn't be allocated (\a cpuAddr is left untouched then).
uint64_t
Device::SrdManager::allocSrdSlot(address* cpuAddr)
{
    amd::ScopedLock lock(ml_);

    // Look for a free slot in the chunks allocated so far
    for (uint c = 0; c < pool_.size(); ++c) {
        const Chunk&    chunk = pool_[c];

        for (uint group = 0; group < numFlags_; ++group) {
            uint    freeBits = chunk.flags_[group];
            if (freeBits == 0) {
                // All slots of this group are busy
                continue;
            }

            // Position of the lowest set bit, i.e. the first free slot
            uint    bit = 0;
            while ((freeBits & 0x1) == 0) {
                freeBits >>= 1;
                ++bit;
            }

            // Mark the slot as busy
            chunk.flags_[group] &= ~(1 << bit);

            // Calculate SRD offset in the buffer
            uint    offset = (group * MaskBits + bit) * srdSize_;
            *cpuAddr = chunk.buf_->data() + offset;
            return chunk.buf_->vmAddress() + offset;
        }
    }

    // At this point the manager doesn't have empty slots
    // and has to allocate a new chunk
    Chunk   fresh;
    fresh.flags_ = new uint[numFlags_];
    if (NULL == fresh.flags_) {
        return 0;
    }

    // The chunk buffer must be created and mapped to be usable
    fresh.buf_ = new Memory(dev_, bufSize_);
    const bool  usable = (fresh.buf_ != NULL) &&
        fresh.buf_->create(Resource::Remote) &&
        (NULL != fresh.buf_->map(NULL));
    if (!usable) {
        delete [] fresh.flags_;
        delete fresh.buf_;
        return 0;
    }

    // All slots in the chunk are in "free" state,
    // except for the very first one, which is returned to the caller
    memset(fresh.flags_, 0xff, numFlags_ * sizeof(uint));
    fresh.flags_[0] &= ~0x1;
    pool_.push_back(fresh);

    *cpuAddr = fresh.buf_->data();
    return fresh.buf_->vmAddress();
}

//! Marks the SRD slot with the GPU virtual address \a addr as free again.
//!
//! \param addr  Address returned by allocSrdSlot(). The value 0, which
//!              allocSrdSlot() returns on failure, is accepted and
//!              ignored, so owners of a never allocated slot (see
//!              Sampler::~Sampler()) can call this unconditionally.
void
Device::SrdManager::freeSrdSlot(uint64_t addr) {
    // Nothing was allocated for a zero address
    if (addr == 0) {
        return;
    }

    amd::ScopedLock lock(ml_);
    // Check all buffers in the pool of chunks
    for (uint i = 0; i < pool_.size(); ++i) {
        Chunk* ch = &pool_[i];
        // Find the offset
        int64_t offs = static_cast<int64_t>(addr) -
            static_cast<int64_t>(ch->buf_->vmAddress());
        // Check if the offset inside the chunk buffer
        if ((offs >= 0) && (offs < bufSize_)) {
            // Find the index in the chunk
            uint idx  = offs / srdSize_;
            uint s = idx / MaskBits;
            // Free the slot. The unsigned literal keeps the shift well
            // defined for the highest bit of the mask.
            ch->flags_[s] |= 1u << (idx % MaskBits);
            return;
        }
    }
    assert(false && "Wrong slot address!");
}

//! Appends the buffer of every chunk in the SRD pool to \a memList.
void
Device::SrdManager::fillResourceList(std::vector<const Resource*>&   memList)
{
    const uint  chunkCount = pool_.size();
    for (uint c = 0; c != chunkCount; ++c) {
        const Chunk&    chunk = pool_[c];
        memList.push_back(chunk.buf_);
    }
}

} // namespace gpu