rocm-systems/hipamd/src/hip_hcc.cpp

/*
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/**
 * @file hip_hcc.cpp
 *
 * Contains definitions for functions that are large enough that we don't want to inline them everywhere.
 * This file is compiled and linked into apps running HIP / HCC path.
 */
#include <assert.h>
#include <stdint.h>
#include <iostream>
#include <sstream>
#include <list>
#include <sys/types.h>
#include <unistd.h>
#include <deque>
#include <vector>
#include <algorithm>

#include <hc.hpp>
#include <hc_am.hpp>

#include "hip_runtime.h"
#include "hcc_detail/hip_hcc.h"
#include "hsa_ext_amd.h"

// TODO, re-org header order.
extern const char *ihipErrorString(hipError_t hip_error);
#include "hcc_detail/trace_helper.h"

const int release = 1;


int HIP_LAUNCH_BLOCKING = 0;

int HIP_PRINT_ENV = 0;
int HIP_TRACE_API= 0;
int HIP_ATP_MARKER= 0;
int HIP_DB= 0;
int HIP_STAGING_SIZE = 64;   /* size of staging buffers, in KB */
int HIP_STAGING_BUFFERS = 2;    // TODO - remove, two buffers should be enough.
int HIP_PININPLACE = 0;
int HIP_STREAM_SIGNALS = 2;  /* number of signals to allocate at stream creation */
int HIP_VISIBLE_DEVICES = 0; /* Contains a comma-separated sequence of GPU identifiers */


//---
// Chicken bits for disabling functionality to work around potential issues:
int HIP_DISABLE_HW_KERNEL_DEP = 0;
int HIP_DISABLE_HW_COPY_DEP = 0;

thread_local int tls_defaultDevice = 0;
thread_local hipError_t tls_lastHipError = hipSuccess;


//=================================================================================================
//Forward Declarations:
//=================================================================================================
bool ihipIsValidDevice(unsigned deviceIndex);

std::once_flag hip_initialized;
ihipDevice_t *g_devices;
bool g_visible_device = false;
unsigned g_deviceCnt;
std::vector<int> g_hip_visible_devices;
hsa_agent_t g_cpu_agent;


//=================================================================================================
// Implementation:
//=================================================================================================


//=================================================================================================
// ihipSignal_t:
//=================================================================================================
//
//---
ihipSignal_t::ihipSignal_t() :  _sig_id(0)
{
    if (hsa_signal_create(0/*value*/, 0, NULL, &_hsa_signal) != HSA_STATUS_SUCCESS) {
        throw ihipException(hipErrorRuntimeMemory);
    }
    //tprintf (DB_SIGNAL, "  allocated hsa_signal=%lu\n", (_hsa_signal.handle));
}

//---
ihipSignal_t::~ihipSignal_t()
{
    tprintf (DB_SIGNAL, "  destroy hsa_signal #%lu (#%lu)\n", (_hsa_signal.handle), _sig_id);
    if (hsa_signal_destroy(_hsa_signal) != HSA_STATUS_SUCCESS) {
       throw ihipException(hipErrorRuntimeOther);
    }
};


//=================================================================================================
// ihipStream_t:
//=================================================================================================
//---
ihipStream_t::ihipStream_t(unsigned device_index, hc::accelerator_view av, unsigned int flags) :
    _id(0), // will be set by add function.
    _av(av),
    _flags(flags),
    _device_index(device_index)
{
    tprintf(DB_SYNC, " streamCreate: stream=%p\n", this);
};


//---
ihipStream_t::~ihipStream_t()
{
}


//---
void ihipStream_t::locked_reclaimSignals(SIGSEQNUM sigNum)
{
    LockedAccessor_StreamCrit_t crit(_criticalData);

    tprintf(DB_SIGNAL, "reclaim signal #%lu\n", sigNum);
    // Mark all signals older and including this one as available for re-allocation.
    crit->_oldest_live_sig_id = sigNum+1;
}


//---
void ihipStream_t::waitCopy(LockedAccessor_StreamCrit_t &crit, ihipSignal_t *signal)
{
    SIGSEQNUM sigNum = signal->_sig_id;
    tprintf(DB_SYNC, "waitCopy signal:#%lu\n", sigNum);

    hsa_signal_wait_acquire(signal->_hsa_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE);


    tprintf(DB_SIGNAL, "waitCopy reclaim signal #%lu\n", sigNum);
    // Mark all signals older and including this one as available for reclaim
    if (sigNum > crit->_oldest_live_sig_id) {
        crit->_oldest_live_sig_id = sigNum+1; // TODO, +1 here seems dangerous.
    }

}

//Wait for all kernel and data copy commands in this stream to complete.
//This signature should be used in routines that already have locked the stream mutex
void ihipStream_t::wait(LockedAccessor_StreamCrit_t &crit, bool assertQueueEmpty)
{
    if (! assertQueueEmpty) {
        tprintf (DB_SYNC, "stream %p wait for queue-empty..\n", this);
        _av.wait();
    }
    if (crit->_last_copy_signal) {
        tprintf (DB_SYNC, "stream %p wait for lastCopy:#%lu...\n", this, lastCopySeqId(crit) );
        this->waitCopy(crit, crit->_last_copy_signal);
    }

    // Reset the stream to "empty" - next command will not set up an inpute dependency on any older signal.
    crit->_last_command_type = ihipCommandCopyH2D;
    crit->_last_copy_signal = NULL;

    _depFutures.clear();
}


//---
//Wait for all kernel and data copy commands in this stream to complete.
void ihipStream_t::locked_wait(bool assertQueueEmpty)
{
    LockedAccessor_StreamCrit_t crit(_criticalData);

    wait(crit, assertQueueEmpty);

};


// Recompute the peercnt and the packed _peerAgents whenever a peer is added or deleted.
// The packed _peerAgents can efficiently be used on each memory allocation.
template<>
void ihipDeviceCriticalBase_t<DeviceMutex>::recomputePeerAgents()
{
    _peerCnt = 0;
    std::for_each (_peers.begin(), _peers.end(), [this](ihipDevice_t* device) {
        _peerAgents[_peerCnt++] = device->_hsa_agent;
    });
}


template<>
bool ihipDeviceCriticalBase_t<DeviceMutex>::isPeer(const ihipDevice_t *peer)
{
    auto match = std::find(_peers.begin(), _peers.end(), peer);
    return (match != std::end(_peers));
}


template<>
bool ihipDeviceCriticalBase_t<DeviceMutex>::addPeer(ihipDevice_t *peer)
{
    auto match = std::find(_peers.begin(), _peers.end(), peer);
    if (match == std::end(_peers)) {
        // Not already a peer, let's update the list:
        _peers.push_back(peer);
        recomputePeerAgents();
        return true;
    }

    // If we get here - peer was already on list, silently ignore.
    return false;
}


template<>
bool ihipDeviceCriticalBase_t<DeviceMutex>::removePeer(ihipDevice_t *peer)
{
    auto match = std::find(_peers.begin(), _peers.end(), peer);
    if (match != std::end(_peers)) {
        // Found a valid peer, let's remove it.
        _peers.remove(peer);
        recomputePeerAgents();
        return true;
    } else {
        return false;
    }
}


template<>
void ihipDeviceCriticalBase_t<DeviceMutex>::resetPeers(ihipDevice_t *thisDevice)
{
    _peers.clear();
    _peerCnt = 0;
    addPeer(thisDevice); // peer-list always contains self agent.
}


template<>
void ihipDeviceCriticalBase_t<DeviceMutex>::addStream(ihipStream_t *stream)
{
    _streams.push_back(stream);
    stream->_id = incStreamId();
}

//-------------------------------------------------------------------------------------------------

//---
//Flavor that takes device index.
ihipDevice_t * getDevice(unsigned deviceIndex)
{
    if (ihipIsValidDevice(deviceIndex)) {
        return &g_devices[deviceIndex];
    } else {
        return NULL;
    }
};


//---
ihipDevice_t * ihipStream_t::getDevice() const
{
    return ::getDevice(_device_index);
};


//---
// Allocate a new signal from the signal pool.
// Returned signals have value of 0.
// Signals are intended for use in this stream and are always reclaimed "in-order".
ihipSignal_t *ihipStream_t::allocSignal(LockedAccessor_StreamCrit_t &crit)
{
    int numToScan = crit->_signalPool.size();
    do {
        auto thisCursor = crit->_signalCursor;
        if (++crit->_signalCursor == crit->_signalPool.size()) {
            crit->_signalCursor = 0;
        }

        if (crit->_signalPool[thisCursor]._sig_id < crit->_oldest_live_sig_id) {
            SIGSEQNUM oldSigId = crit->_signalPool[thisCursor]._sig_id;
            crit->_signalPool[thisCursor]._index = thisCursor;
            crit->_signalPool[thisCursor]._sig_id  =  ++crit->_stream_sig_id;  // allocate it.
            tprintf(DB_SIGNAL, "allocatSignal #%lu at pos:%i (old sigId:%lu < oldest_live:%lu)\n",
                    crit->_signalPool[thisCursor]._sig_id,
                    thisCursor, oldSigId, crit->_oldest_live_sig_id);


            return &crit->_signalPool[thisCursor];
        }

    } while (--numToScan) ;

    assert(numToScan == 0);

    // Have to grow the pool:
    crit->_signalCursor = crit->_signalPool.size(); // set to the beginning of the new entries:
    if (crit->_signalCursor > 10000) {
        fprintf (stderr, "warning: signal pool size=%d, may indicate runaway number of inflight commands\n", crit->_signalCursor);
    }
    crit->_signalPool.resize(crit->_signalPool.size() * 2);
    tprintf (DB_SIGNAL, "grow signal pool to %zu entries, cursor=%d\n", crit->_signalPool.size(), crit->_signalCursor);
    return allocSignal(crit);  // try again,

    // Should never reach here.
    assert(0);
}


//---
void ihipStream_t::enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal)
{

    // Obtain the write index for the command queue
    uint64_t index = hsa_queue_load_write_index_relaxed(queue);
    const uint32_t queueMask = queue->size - 1;

    // Define the barrier packet to be at the calculated queue index address
    hsa_barrier_and_packet_t* barrier = &(((hsa_barrier_and_packet_t*)(queue->base_address))[index&queueMask]);
    memset(barrier, 0, sizeof(hsa_barrier_and_packet_t));

    // setup header
    uint16_t header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
    header |= 1 << HSA_PACKET_HEADER_BARRIER;
    //header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
    //header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
    barrier->header = header;

    barrier->dep_signal[0] = depSignal->_hsa_signal;

    barrier->completion_signal.handle = 0;

    // TODO - check queue overflow, return error:
    // Increment write index and ring doorbell to dispatch the kernel
    hsa_queue_store_write_index_relaxed(queue, index+1);
    hsa_signal_store_relaxed(queue->doorbell_signal, index);
}


//--
//When the commands in a stream change types (ie kernel command follows a data command,
//or data command follows a kernel command), then we need to add a barrier packet
//into the stream to mimic CUDA stream semantics.  (some hardware uses separate
//queues for data commands and kernel commands, and no implicit ordering is provided).
//
bool ihipStream_t::lockopen_preKernelCommand()
{
    LockedAccessor_StreamCrit_t crit(_criticalData, false/*no unlock at destruction*/);

    bool addedSync = false;
    // If switching command types, we need to add a barrier packet to synchronize things.
    if (crit->_last_command_type != ihipCommandKernel) {
        if (crit->_last_copy_signal) {
            addedSync = true;

            hsa_queue_t * q =  (hsa_queue_t*)_av.get_hsa_queue();
            if (HIP_DISABLE_HW_KERNEL_DEP == 0) {
                this->enqueueBarrier(q, crit->_last_copy_signal);
                tprintf (DB_SYNC, "stream %p switch %s to %s (barrier pkt inserted with wait on #%lu)\n",
                        this, ihipCommandName[crit->_last_command_type], ihipCommandName[ihipCommandKernel], crit->_last_copy_signal->_sig_id)

            } else if (HIP_DISABLE_HW_KERNEL_DEP>0) {
                    tprintf (DB_SYNC, "stream %p switch %s to %s (HOST wait for previous...)\n",
                            this, ihipCommandName[crit->_last_command_type], ihipCommandName[ihipCommandKernel]);
                    this->waitCopy(crit, crit->_last_copy_signal);
            } else if (HIP_DISABLE_HW_KERNEL_DEP==-1) {
                tprintf (DB_SYNC, "stream %p switch %s to %s (IGNORE dependency)\n",
                        this, ihipCommandName[crit->_last_command_type], ihipCommandName[ihipCommandKernel]);
            }
        }
        crit->_last_command_type = ihipCommandKernel;
    }

    return addedSync;
}


//---
// Must be called after kernel finishes, this releases the lock on the stream so other commands can submit.
void ihipStream_t::lockclose_postKernelCommand(hc::completion_future &kernelFuture)
{
    // We locked _criticalData in the lockopen_preKernelCommand() so OK to access here:
    _criticalData._last_kernel_future = kernelFuture;

    _criticalData.unlock(); // paired with lock from lockopen_preKernelCommand.
};


//---
// Called whenever a copy command is set to the stream.
// Examines the last command sent to this stream and returns a signal to wait on, if required.
int ihipStream_t::preCopyCommand(LockedAccessor_StreamCrit_t &crit, ihipSignal_t *lastCopy, hsa_signal_t *waitSignal, ihipCommand_t copyType)
{
    int needSync = 0;

    waitSignal->handle = 0;

    //_mutex.lock(); // will be unlocked in postCopyCommand

    // If switching command types, we need to add a barrier packet to synchronize things.
    if (FORCE_SAMEDIR_COPY_DEP || (crit->_last_command_type != copyType)) {


        if (crit->_last_command_type == ihipCommandKernel) {
            tprintf (DB_SYNC, "stream %p switch %s to %s (async copy dep on prev kernel)\n",
                    this, ihipCommandName[crit->_last_command_type], ihipCommandName[copyType]);
            needSync = 1;
            hsa_signal_t *hsaSignal = (static_cast<hsa_signal_t*> (crit->_last_kernel_future.get_native_handle()));
            if (hsaSignal) {
                // Keep reference to the kernel future in order to keep the
                // dependent signal alive.
                _depFutures.push_back(crit->_last_kernel_future);
                *waitSignal = * hsaSignal;
            } else {
                assert(0); // if NULL signal, and we return 1, hsa_amd_memory_copy_async will fail.  Confirm this never happens.
            }
        } else if (crit->_last_copy_signal) {
            needSync = 1;
            tprintf (DB_SYNC, "stream %p switch %s to %s (async copy dep on other copy #%lu)\n",
                    this, ihipCommandName[crit->_last_command_type], ihipCommandName[copyType], crit->_last_copy_signal->_sig_id);
            *waitSignal = crit->_last_copy_signal->_hsa_signal;
        }

        if (HIP_DISABLE_HW_COPY_DEP && needSync) {
            if (HIP_DISABLE_HW_COPY_DEP == -1) {
                tprintf (DB_SYNC, "IGNORE copy dependency\n")

            } else {
                tprintf (DB_SYNC, "HOST-wait for copy dependency\n")
                // do the wait here on the host, and disable the device-side command resolution.
                hsa_signal_wait_acquire(*waitSignal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE);
                needSync = 0;
            }
        }

        crit->_last_command_type = copyType;
    }

    crit->_last_copy_signal = lastCopy;

    return needSync;
}


//=================================================================================================
//
//Reset the device - this is called from hipDeviceReset.
//Device may be reset multiple times, and may be reset after init.
void ihipDevice_t::locked_reset()
{
    // Obtain mutex access to the device critical data, release by destructor
    LockedAccessor_DeviceCrit_t  crit(_criticalData);


    //---
    //Wait for pending activity to complete?  TODO - check if this is required behavior:
    tprintf(DB_SYNC, "locked_reset waiting for activity to complete.\n");

    // Reset and remove streams:
    // Delete all created streams including the default one.
    for (auto streamI=crit->const_streams().begin(); streamI!=crit->const_streams().end(); streamI++) {
        ihipStream_t *stream = *streamI;
        (*streamI)->locked_wait();
        tprintf(DB_SYNC, " delete stream=%p\n", stream);

        delete stream;
    }
    // Clear the list.
    crit->streams().clear();


    // Create a fresh default stream and add it:
    _default_stream = new ihipStream_t(_device_index, _acc.get_default_view(), hipStreamDefault);
    crit->addStream(_default_stream);


    // This resest peer list to just me:
    crit->resetPeers(this);

    // Reset and release all memory stored in the tracker:
    // Reset will remove peer mapping so don't need to do this explicitly.
    am_memtracker_reset(_acc);

};


//---
void ihipDevice_t::init(unsigned device_index, unsigned deviceCnt, hc::accelerator &acc, unsigned flags)
{
    _device_index = device_index;
    _device_flags = flags;
    _acc = acc;

    hsa_agent_t *agent = static_cast<hsa_agent_t*> (acc.get_hsa_agent());
    if (agent) {
        int err = hsa_agent_get_info(*agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &_compute_units);
        if (err != HSA_STATUS_SUCCESS) {
            _compute_units = 1;
        }

        _hsa_agent = *agent;
    } else {
        _hsa_agent.handle = static_cast<uint64_t> (-1);
    }

    getProperties(&_props);

    _criticalData.init(deviceCnt);

    locked_reset();


    tprintf(DB_SYNC, "created device with default_stream=%p\n", _default_stream);

    hsa_region_t *pinnedHostRegion;
    pinnedHostRegion = static_cast<hsa_region_t*>(_acc.get_hsa_am_system_region());
    _staging_buffer[0] = new StagingBuffer(_hsa_agent, *pinnedHostRegion, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS);
    _staging_buffer[1] = new StagingBuffer(_hsa_agent, *pinnedHostRegion, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS);

};


ihipDevice_t::~ihipDevice_t()
{
    if (_default_stream) {
        delete _default_stream;
        _default_stream = NULL;
    }

    for (int i=0; i<2; i++) {
        if (_staging_buffer[i]) {
            delete _staging_buffer[i];
            _staging_buffer[i] = NULL;
        }
    }
}

//----


//=================================================================================================
// Utility functions, these are not part of the public HIP API
//=================================================================================================

//=================================================================================================

#define DeviceErrorCheck(x) if (x != HSA_STATUS_SUCCESS) { return hipErrorInvalidDevice; }

#define ErrorCheck(x) error_check(x, __LINE__, __FILE__)

void error_check(hsa_status_t hsa_error_code, int line_num, std::string str) {
  if (hsa_error_code != HSA_STATUS_SUCCESS) {
    printf("HSA reported error!\n In file: %s\nAt line: %d\n", str.c_str(),line_num);
  }
}

hsa_status_t get_region_info(hsa_region_t region, void* data)
{
    hsa_status_t err;
    hipDeviceProp_t* p_prop = reinterpret_cast<hipDeviceProp_t*>(data);
    uint32_t region_segment;

    // Get region segment
    err = hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &region_segment);
    ErrorCheck(err);

    switch(region_segment) {
    case HSA_REGION_SEGMENT_READONLY:
        err = hsa_region_get_info(region, HSA_REGION_INFO_SIZE, &(p_prop->totalConstMem)); break;
    /* case HSA_REGION_SEGMENT_PRIVATE:
        cout<<"PRIVATE"<<endl; private segment cannot be queried */
    case HSA_REGION_SEGMENT_GROUP:
        err = hsa_region_get_info(region, HSA_REGION_INFO_SIZE, &(p_prop->sharedMemPerBlock)); break;
    default: break;
    }
    return HSA_STATUS_SUCCESS;
}

// Determines if the given agent is of type HSA_DEVICE_TYPE_GPU and counts it.
static hsa_status_t countGpuAgents(hsa_agent_t agent, void *data) {
    if (data == NULL) {
        return HSA_STATUS_ERROR_INVALID_ARGUMENT;
    }
    hsa_device_type_t device_type;
    hsa_status_t status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type);
    if (status != HSA_STATUS_SUCCESS) {
        return status;
    }
    if (device_type == HSA_DEVICE_TYPE_GPU) {
        (*static_cast<int*>(data))++;
    }
    return HSA_STATUS_SUCCESS;
}

// Internal version,
hipError_t ihipDevice_t::getProperties(hipDeviceProp_t* prop)
{
    hipError_t e = hipSuccess;
    hsa_status_t err;

    // Set some defaults in case we don't find the appropriate regions:
    prop->totalGlobalMem = 0;
    prop->totalConstMem = 0;
    prop->sharedMemPerBlock = 0;
    prop-> maxThreadsPerMultiProcessor = 0;
    prop->regsPerBlock = 0;

    if (_hsa_agent.handle == -1) {
        return hipErrorInvalidDevice;
    }

    // Iterates over the agents to determine Multiple GPU devices
    // using the countGpuAgents callback.
    //! @bug : on HCC, isMultiGpuBoard returns True if system contains multiple GPUS (rather than if GPU is on a multi-ASIC board)
    int gpuAgentsCount = 0;
    err = hsa_iterate_agents(countGpuAgents, &gpuAgentsCount);
    if (err == HSA_STATUS_INFO_BREAK) { err = HSA_STATUS_SUCCESS; }
    DeviceErrorCheck(err);
    prop->isMultiGpuBoard = 0 ? gpuAgentsCount < 2 : 1;

    // Get agent name
    err = hsa_agent_get_info(_hsa_agent, HSA_AGENT_INFO_NAME, &(prop->name));
    DeviceErrorCheck(err);

    // Get agent node
    uint32_t node;
    err = hsa_agent_get_info(_hsa_agent, HSA_AGENT_INFO_NODE, &node);
    DeviceErrorCheck(err);

    // Get wavefront size
    err = hsa_agent_get_info(_hsa_agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,&prop->warpSize);
    DeviceErrorCheck(err);

    // Get max total number of work-items in a workgroup
    err = hsa_agent_get_info(_hsa_agent, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &prop->maxThreadsPerBlock );
    DeviceErrorCheck(err);

    // Get max number of work-items of each dimension of a work-group
    uint16_t work_group_max_dim[3];
    err = hsa_agent_get_info(_hsa_agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM, work_group_max_dim);
    DeviceErrorCheck(err);
    for( int i =0; i< 3 ; i++) {
        prop->maxThreadsDim[i]= work_group_max_dim[i];
    }

    hsa_dim3_t grid_max_dim;
    err = hsa_agent_get_info(_hsa_agent, HSA_AGENT_INFO_GRID_MAX_DIM, &grid_max_dim);
    DeviceErrorCheck(err);
    prop->maxGridSize[0]= (int) ((grid_max_dim.x == UINT32_MAX) ? (INT32_MAX) : grid_max_dim.x);
    prop->maxGridSize[1]= (int) ((grid_max_dim.y == UINT32_MAX) ? (INT32_MAX) : grid_max_dim.y);
    prop->maxGridSize[2]= (int) ((grid_max_dim.z == UINT32_MAX) ? (INT32_MAX) : grid_max_dim.z);

    // Get Max clock frequency
    err = hsa_agent_get_info(_hsa_agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, &prop->clockRate);
    prop->clockRate *= 1000.0;   // convert Mhz to Khz.
    DeviceErrorCheck(err);

    //uint64_t counterHz;
    //err = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &counterHz);
    //DeviceErrorCheck(err);
    //prop->clockInstructionRate = counterHz / 1000;
    prop->clockInstructionRate = 100*1000; /* TODO-RT - hard-code until HSART has function to properly report clock */

    // Get Agent BDFID (bus/device/function ID)
    uint16_t bdf_id = 1;
    err = hsa_agent_get_info(_hsa_agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_BDFID, &bdf_id);
    DeviceErrorCheck(err);

    // BDFID is 16bit uint: [8bit - BusID | 5bit - Device ID | 3bit - Function/DomainID]
    // TODO/Clarify: cudaDeviceProp::pciDomainID how to report?
    // prop->pciDomainID =  bdf_id & 0x7;
    prop->pciDeviceID =  (bdf_id>>3) & 0x1F;
    prop->pciBusID =  (bdf_id>>8) & 0xFF;

    // Masquerade as a 3.0-level device. This will change as more HW functions are properly supported.
    // Application code should use the arch.has* to do detailed feature detection.
    prop->major = 2;
    prop->minor = 0;

    // Get number of Compute Unit
    err = hsa_agent_get_info(_hsa_agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &(prop->multiProcessorCount));
    DeviceErrorCheck(err);

    // TODO-hsart - this appears to return 0?
    uint32_t cache_size[4];
    err = hsa_agent_get_info(_hsa_agent, HSA_AGENT_INFO_CACHE_SIZE, cache_size);
    DeviceErrorCheck(err);
    prop->l2CacheSize = cache_size[1];

    /* Computemode for HSA Devices is always : cudaComputeModeDefault */
    prop->computeMode = 0;

    // Get Max Threads Per Multiprocessor
/*
    HsaSystemProperties props;
    hsaKmtReleaseSystemProperties();
    if(HSAKMT_STATUS_SUCCESS == hsaKmtAcquireSystemProperties(&props)) {
        HsaNodeProperties node_prop = {0};
        if(HSAKMT_STATUS_SUCCESS == hsaKmtGetNodeProperties(node, &node_prop)) {
            uint32_t waves_per_cu = node_prop.MaxWavesPerSIMD;
            prop-> maxThreadsPerMultiProcessor = prop->warpsize*waves_per_cu;
        }
    }
*/

    // Get memory properties
    err = hsa_agent_iterate_regions(_hsa_agent, get_region_info, prop);
    DeviceErrorCheck(err);

    // Get the size of the region we are using for Accelerator Memory allocations:
    hsa_region_t *am_region = static_cast<hsa_region_t*>(_acc.get_hsa_am_region());
    err = hsa_region_get_info(*am_region, HSA_REGION_INFO_SIZE, &prop->totalGlobalMem);
    DeviceErrorCheck(err);
    // maxSharedMemoryPerMultiProcessor should be as the same as group memory size.
    // Group memory will not be paged out, so, the physical memory size is the total shared memory size, and also equal to the group region size.
    prop->maxSharedMemoryPerMultiProcessor = prop->totalGlobalMem;

    // Get Max memory clock frequency
    err = hsa_region_get_info(*am_region, (hsa_region_info_t)HSA_AMD_REGION_INFO_MAX_CLOCK_FREQUENCY, &prop->memoryClockRate);
    DeviceErrorCheck(err);
    prop->memoryClockRate *= 1000.0;   // convert Mhz to Khz.

    // Get global memory bus width in bits
    err = hsa_region_get_info(*am_region, (hsa_region_info_t)HSA_AMD_REGION_INFO_BUS_WIDTH, &prop->memoryBusWidth);
    DeviceErrorCheck(err);

    // Set feature flags - these are all mandatory for HIP on HCC path:
    // Some features are under-development and future revs may support flags that are currently 0.
    // Reporting of these flags should be synchronized with the HIP_ARCH* compile-time defines in hip_runtime.h

    prop->arch.hasGlobalInt32Atomics       = 1;
    prop->arch.hasGlobalFloatAtomicExch    = 1;
    prop->arch.hasSharedInt32Atomics       = 1;
    prop->arch.hasSharedFloatAtomicExch    = 1;
    prop->arch.hasFloatAtomicAdd           = 0;
    prop->arch.hasGlobalInt64Atomics       = 1;
    prop->arch.hasSharedInt64Atomics       = 1;
    prop->arch.hasDoubles                  = 1; // TODO - true for Fiji.
    prop->arch.hasWarpVote                 = 1;
    prop->arch.hasWarpBallot               = 1;
    prop->arch.hasWarpShuffle              = 1;
    prop->arch.hasFunnelShift              = 0; // TODO-hcc
    prop->arch.hasThreadFenceSystem        = 0; // TODO-hcc
    prop->arch.hasSyncThreadsExt           = 0; // TODO-hcc
    prop->arch.hasSurfaceFuncs             = 0; // TODO-hcc
    prop->arch.has3dGrid                   = 1;
    prop->arch.hasDynamicParallelism       = 0;

    prop->concurrentKernels = 1; // All ROCR hardware supports executing multiple kernels concurrently
    if ( _device_flags | hipDeviceMapHost) {
        prop->canMapHostMemory = 1;
    } else {
        prop->canMapHostMemory = 0;
    }
    return e;
}


// Implement "default" stream syncronization
//   This waits for all other streams to drain before continuing.
//   If waitOnSelf is set, this additionally waits for the default stream to empty.
void ihipDevice_t::locked_syncDefaultStream(bool waitOnSelf)
{
    LockedAccessor_DeviceCrit_t  crit(_criticalData);

    tprintf(DB_SYNC, "syncDefaultStream\n");

    for (auto streamI=crit->const_streams().begin(); streamI!=crit->const_streams().end(); streamI++) {
        ihipStream_t *stream = *streamI;

        // Don't wait for streams that have "opted-out" of syncing with NULL stream.
        // And - don't wait for the NULL stream
        if (!(stream->_flags & hipStreamNonBlocking)) {

            if (waitOnSelf || (stream != _default_stream)) {
                // TODO-hcc - use blocking or active wait here?
                // TODO-sync - cudaDeviceBlockingSync
                stream->locked_wait();
            }
        }
    }
}

//---
void ihipDevice_t::locked_addStream(ihipStream_t *s)
{
    LockedAccessor_DeviceCrit_t  crit(_criticalData);

    crit->addStream(s);
}

//---
void ihipDevice_t::locked_removeStream(ihipStream_t *s)
{
    LockedAccessor_DeviceCrit_t  crit(_criticalData);

    crit->streams().remove(s);
}


//---
//Heavyweight synchronization that waits on all streams, ignoring hipStreamNonBlocking flag.
void ihipDevice_t::locked_waitAllStreams()
{
    LockedAccessor_DeviceCrit_t  crit(_criticalData);

    tprintf(DB_SYNC, "waitAllStream\n");
    for (auto streamI=crit->const_streams().begin(); streamI!=crit->const_streams().end(); streamI++) {
        (*streamI)->locked_wait();
    }
}


// Read environment variables.
void ihipReadEnv_I(int *var_ptr, const char *var_name1, const char *var_name2, const char *description)
{
    char * env = getenv(var_name1);

    // Check second name if first not defined, used to allow HIP_ or CUDA_ env vars.
    if ((env == NULL) && strcmp(var_name2, "0")) {
        env = getenv(var_name2);
    }

    // Check if the environment variable is either HIP_VISIBLE_DEVICES or CUDA_LAUNCH_BLOCKING, which
    // contains a sequence of comma-separated device IDs
    if (!(strcmp(var_name1,"HIP_VISIBLE_DEVICES") && strcmp(var_name2, "CUDA_VISIBLE_DEVICES")) && env){
        // Parse the string stream of env and store the device ids to g_hip_visible_devices global variable
        std::string str = env;
        std::istringstream ss(str);
        std::string device_id;
        // Clean up the defult value
        g_hip_visible_devices.clear();
        g_visible_device = true;
        // Read the visible device numbers
        while (std::getline(ss, device_id, ',')) {
            if (atoi(device_id.c_str()) >= 0) {
                g_hip_visible_devices.push_back(atoi(device_id.c_str()));
            }else// Any device number after invalid number will not present
                break;
        }
        // Print out the number of ids
        if (HIP_PRINT_ENV) {
            printf ("%-30s = ", var_name1);
            for(int i=0;i<g_hip_visible_devices.size();i++)
                printf ("%2d ", g_hip_visible_devices[i]);
            printf (": %s\n", description);
        }
    }
    else { // Parse environment variables with sigle value
        // Default is set when variable is initialized (at top of this file), so only override if we find
        // an environment variable.
        if (env) {
            long int v = strtol(env, NULL, 0);
            *var_ptr = (int) (v);
        }
        if (HIP_PRINT_ENV) {
            printf ("%-30s = %2d : %s\n", var_name1, *var_ptr, description);
        }
    }

}

#if defined (DEBUG)

#define READ_ENV_I(_build, _ENV_VAR, _ENV_VAR2, _description) \
    if ((_build == release) || (_build == debug) {\
        ihipReadEnv_I(&_ENV_VAR, #_ENV_VAR, #_ENV_VAR2, _description);\
    };

#else

#define READ_ENV_I(_build, _ENV_VAR, _ENV_VAR2, _description) \
    if (_build == release) {\
        ihipReadEnv_I(&_ENV_VAR, #_ENV_VAR, #_ENV_VAR2, _description);\
    };

#endif

// Determines if the given agent is of type HSA_DEVICE_TYPE_GPU and counts it.
static hsa_status_t findCpuAgent(hsa_agent_t agent, void *data)
{
    hsa_device_type_t device_type;
    hsa_status_t status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type);
    if (status != HSA_STATUS_SUCCESS) {
        return status;
    }
    if (device_type == HSA_DEVICE_TYPE_CPU) {
        (*static_cast<hsa_agent_t*>(data)) = agent;
        return HSA_STATUS_INFO_BREAK;
    }

    return HSA_STATUS_SUCCESS;
}


//---
//Function called one-time at initialization time to construct a table of all GPU devices.
//HIP/CUDA uses integer "deviceIds" - these are indexes into this table.
//AMP maintains a table of accelerators, but some are emulated - ie for debug or CPU.
//This function creates a vector with only the GPU accelerators.
//It is called with C++11 call_once, which provided thread-safety.
void ihipInit()
{

#if COMPILE_TRACE_MARKER
    amdtInitializeActivityLogger();
    amdtScopedMarker("ihipInit", "HIP", NULL);
#endif
    /*
     * Environment variables
     */
    g_hip_visible_devices.push_back(0); /* Set the default value of visible devices */
    READ_ENV_I(release, HIP_PRINT_ENV, 0,  "Print HIP environment variables.");
    //-- READ HIP_PRINT_ENV env first, since it has impact on later env var reading

    READ_ENV_I(release, HIP_LAUNCH_BLOCKING, CUDA_LAUNCH_BLOCKING, "Make HIP APIs 'host-synchronous', so they block until any kernel launches or data copy commands complete. Alias: CUDA_LAUNCH_BLOCKING." );
    READ_ENV_I(release, HIP_DB, 0,  "Print various debug info.  Bitmask, see hip_hcc.cpp for more information.");
    if ((HIP_DB & (1<<DB_API))  && (HIP_TRACE_API == 0)) {
        // Set HIP_TRACE_API default before we read it, so it is printed correctly.
        HIP_TRACE_API = 1;
    }

    READ_ENV_I(release, HIP_TRACE_API, 0,  "Trace each HIP API call.  Print function name and return code to stderr as program executes.");
    READ_ENV_I(release, HIP_ATP_MARKER, 0,  "Add HIP function begin/end to ATP file generated with CodeXL");
    READ_ENV_I(release, HIP_STAGING_SIZE, 0, "Size of each staging buffer (in KB)" );
    READ_ENV_I(release, HIP_STAGING_BUFFERS, 0, "Number of staging buffers to use in each direction. 0=use hsa_memory_copy.");
    READ_ENV_I(release, HIP_PININPLACE, 0, "For unpinned transfers, pin the memory in-place in chunks before doing the copy. Under development.");
    READ_ENV_I(release, HIP_STREAM_SIGNALS, 0, "Number of signals to allocate when new stream is created (signal pool will grow on demand)");
    READ_ENV_I(release, HIP_VISIBLE_DEVICES, CUDA_VISIBLE_DEVICES, "Only devices whose index is present in the secquence are visible to HIP applications and they are enumerated in the order of secquence" );

    READ_ENV_I(release, HIP_DISABLE_HW_KERNEL_DEP, 0, "Disable HW dependencies before kernel commands  - instead wait for dependency on host. -1 means ignore these dependencies. (debug mode)");
    READ_ENV_I(release, HIP_DISABLE_HW_COPY_DEP, 0, "Disable HW dependencies before copy commands  - instead wait for dependency on host. -1 means ifnore these dependencies (debug mode)");


    // Some flags have both compile-time and runtime flags - generate a warning if user enables the runtime flag but the compile-time flag is disabled.
    if (HIP_DB && !COMPILE_HIP_DB) {
        fprintf (stderr, "warning: env var HIP_DB=0x%x but COMPILE_HIP_DB=0.  (perhaps enable COMPILE_HIP_DB in src code before compiling?)", HIP_DB);
    }

    if (HIP_TRACE_API && !COMPILE_HIP_TRACE_API) {
        fprintf (stderr, "warning: env var HIP_TRACE_API=0x%x but COMPILE_HIP_TRACE_API=0.  (perhaps enable COMPILE_HIP_DB in src code before compiling?)", HIP_DB);
    }

    if (HIP_ATP_MARKER && !COMPILE_HIP_ATP_MARKER) {
        fprintf (stderr, "warning: env var HIP_ATP_MARKER=0x%x but COMPILE_HIP_ATP_MARKER=0.  (perhaps enable COMPILE_HIP_DB in src code before compiling?)", HIP_ATP_MARKER);
    }


    /*
     * Build a table of valid compute devices.
     */
    auto accs = hc::accelerator::get_all();

    int deviceCnt = 0;
    for (int i=0; i<accs.size(); i++) {
        if (! accs[i].get_is_emulated()) {
            deviceCnt++;
        }
    };

    // Make sure the hip visible devices are within the deviceCnt range
    for (int i = 0; i < g_hip_visible_devices.size(); i++) {
        if(g_hip_visible_devices[i] >= deviceCnt){
            // Make sure any DeviceID after invalid DeviceID will be erased.
            g_hip_visible_devices.resize(i);
            break;
        }
    }

    g_devices = new ihipDevice_t[deviceCnt];
    g_deviceCnt = 0;
    for (int i=0; i<accs.size(); i++) {
        // check if the device id is included in the HIP_VISIBLE_DEVICES env variable
        if (! accs[i].get_is_emulated()) {
            if (std::find(g_hip_visible_devices.begin(), g_hip_visible_devices.end(), (i-1)) == g_hip_visible_devices.end() && g_visible_device)
            {
                //If device is not in visible devices list, ignore
                continue;
            }
            g_devices[g_deviceCnt].init(g_deviceCnt, deviceCnt, accs[i], hipDeviceMapHost);
            g_deviceCnt++;
        }
    }

    // If HIP_VISIBLE_DEVICES is not set, make sure all devices are initialized
    if(!g_visible_device) {
        assert(deviceCnt == g_deviceCnt);
    }


    hsa_status_t err = hsa_iterate_agents(findCpuAgent, &g_cpu_agent);
    if (err != HSA_STATUS_INFO_BREAK) {
        // didn't find a CPU.
        throw ihipException(hipErrorRuntimeOther);
    }


    tprintf(DB_SYNC, "pid=%u %-30s\n", getpid(), "<ihipInit>");
}


bool ihipIsValidDevice(unsigned deviceIndex)
{
    // deviceIndex is unsigned so always > 0
    return (deviceIndex < g_deviceCnt);
}

//---
ihipDevice_t *ihipGetTlsDefaultDevice()
{
    // If this is invalid, the TLS state is corrupt.
    // This can fire if called before devices are initialized.
    // TODO - consider replacing assert with error code
    assert (ihipIsValidDevice(tls_defaultDevice));

    return &g_devices[tls_defaultDevice];
}


//---
ihipDevice_t *ihipGetDevice(int deviceId)
{
    if ((deviceId >= 0) && (deviceId < g_deviceCnt)) {
        return &g_devices[deviceId];
    } else {
        return NULL;
    }

}

//---
// Get the stream to use for a command submission.
//
// If stream==NULL synchronize appropriately with other streams and return the default av for the device.
// If stream is valid, return the AV to use.
hipStream_t ihipSyncAndResolveStream(hipStream_t stream)
{
    if (stream == hipStreamNull ) {
        ihipDevice_t *device = ihipGetTlsDefaultDevice();

#ifndef HIP_API_PER_THREAD_DEFAULT_STREAM
        device->locked_syncDefaultStream(false);
#endif
        return device->_default_stream;
    } else {
        // Have to wait for legacy default stream to be empty:
        if (!(stream->_flags & hipStreamNonBlocking))  {
            tprintf(DB_SYNC, "stream %p wait default stream\n", stream);
            stream->getDevice()->_default_stream->locked_wait();
        }

        return stream;
    }
}


// TODO - data-up to data-down:
// Called just before a kernel is launched from hipLaunchKernel.
// Allows runtime to track some information about the stream.
hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_launch_parm *lp)
{
	std::call_once(hip_initialized, ihipInit);
    stream = ihipSyncAndResolveStream(stream);
    lp->gridDim.x = grid.x;
    lp->gridDim.y = grid.y;
    lp->gridDim.z = grid.z;
    lp->groupDim.x = block.x;
    lp->groupDim.y = block.y;
    lp->groupDim.z = block.z;
    stream->lockopen_preKernelCommand();
//    *av = &stream->_av;
    lp->av = &stream->_av;
    lp->cf = new hc::completion_future;
//    lp->av = static_cast<void*>(av);
//    lp->cf = static_cast<void*>(malloc(sizeof(hc::completion_future)));
    return (stream);
}
hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, grid_launch_parm *lp)
{
	std::call_once(hip_initialized, ihipInit);
    stream = ihipSyncAndResolveStream(stream);
    lp->gridDim.x = grid;
    lp->gridDim.y = 0;
    lp->gridDim.z = 0;
    lp->groupDim.x = block.x;
    lp->groupDim.y = block.y;
    lp->groupDim.z = block.z;
    stream->lockopen_preKernelCommand();
//    *av = &stream->_av;
    lp->av = &stream->_av;
    lp->cf = new hc::completion_future;
//    lp->av = static_cast<void*>(av);
//    lp->cf = static_cast<void*>(malloc(sizeof(hc::completion_future)));
    return (stream);
}

hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, grid_launch_parm *lp)
{
	std::call_once(hip_initialized, ihipInit);
    stream = ihipSyncAndResolveStream(stream);
    lp->gridDim.x = grid.x;
    lp->gridDim.y = grid.y;
    lp->gridDim.z = grid.z;
    lp->groupDim.x = block;
    lp->groupDim.y = 0;
    lp->groupDim.z = 0;
    stream->lockopen_preKernelCommand();
//    *av = &stream->_av;
    lp->av = &stream->_av;
    lp->cf = new hc::completion_future;
//    lp->av = static_cast<void*>(av);
//    lp->cf = static_cast<void*>(malloc(sizeof(hc::completion_future)));
    return (stream);
}

hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, grid_launch_parm *lp)
{
	std::call_once(hip_initialized, ihipInit);
    stream = ihipSyncAndResolveStream(stream);
    lp->gridDim.x = grid;
    lp->gridDim.y = 0;
    lp->gridDim.z = 0;
    lp->groupDim.x = block;
    lp->groupDim.y = 0;
    lp->groupDim.z = 0;
    stream->lockopen_preKernelCommand();
//    *av = &stream->_av;
    lp->av = &stream->_av;
    lp->cf = new hc::completion_future;
//    lp->av = static_cast<void*>(av);
//    lp->cf = static_cast<void*>(malloc(sizeof(hc::completion_future)));
    return (stream);
}


//---
//Called after kernel finishes execution.
void ihipPostLaunchKernel(hipStream_t stream, grid_launch_parm &lp)
{
//    stream->lockclose_postKernelCommand(cf);
    stream->lockclose_postKernelCommand(*lp.cf);
    if (HIP_LAUNCH_BLOCKING) {
        tprintf(DB_SYNC, " stream:%p LAUNCH_BLOCKING for kernel completion\n", stream);
    }
}


//
//=================================================================================================
// HIP API Implementation
//
// Implementor notes:
// _ All functions should call ihipInit as first action:
//    std::call_once(hip_initialized, ihipInit);
//
// - ALl functions should use ihipLogStatus to return error code (not return error directly).
//=================================================================================================
//
//---

//-------------------------------------------------------------------------------------------------


const char *ihipErrorString(hipError_t hip_error)
{
    switch (hip_error) {
        case hipSuccess                         : return "hipSuccess";
        case hipErrorMemoryAllocation           : return "hipErrorMemoryAllocation";
        case hipErrorLaunchOutOfResources       : return "hipErrorLaunchOutOfResources";
        case hipErrorInvalidValue               : return "hipErrorInvalidValue";
        case hipErrorInvalidResourceHandle      : return "hipErrorInvalidResourceHandle";
        case hipErrorInvalidDevice              : return "hipErrorInvalidDevice";
        case hipErrorInvalidMemcpyDirection     : return "hipErrorInvalidMemcpyDirection";
        case hipErrorNoDevice                   : return "hipErrorNoDevice";
        case hipErrorNotReady                   : return "hipErrorNotReady";
        case hipErrorPeerAccessNotEnabled       : return "hipErrorPeerAccessNotEnabled";
        case hipErrorPeerAccessAlreadyEnabled   : return "hipErrorPeerAccessAlreadyEnabled";

        case hipErrorRuntimeMemory              : return "hipErrorRuntimeMemory";
        case hipErrorRuntimeOther               : return "hipErrorRuntimeOther";
        case hipErrorUnknown                    : return "hipErrorUnknown";
        case hipErrorTbd                        : return "hipErrorTbd";
        default                                 : return "hipErrorUnknown";
    };
};


void ihipSetTs(hipEvent_t e)
{
    ihipEvent_t *eh = e._handle;
    if (eh->_state == hipEventStatusRecorded) {
        // already recorded, done:
        return;
    } else {
        // TODO - use completion-future functions to obtain ticks and timestamps:
        hsa_signal_t *sig  = static_cast<hsa_signal_t*> (eh->_marker.get_native_handle());
        if (sig) {
            if (hsa_signal_load_acquire(*sig) == 0) {
                eh->_timestamp = eh->_marker.get_end_tick();
                eh->_state = hipEventStatusRecorded;
            }
        }
    }
}


// Resolve hipMemcpyDefault to a known type.
unsigned ihipStream_t::resolveMemcpyDirection(bool srcTracked, bool dstTracked, bool srcInDeviceMem, bool dstInDeviceMem)
{
    hipMemcpyKind kind = hipMemcpyDefault;
    if(!srcTracked && !dstTracked)
    {
        kind = hipMemcpyHostToHost;
    }
    if(!srcTracked && dstTracked)
    {
        if(dstInDeviceMem) { kind = hipMemcpyHostToDevice; }
        else{ kind = hipMemcpyHostToHost; }
    }
    if (srcTracked && !dstTracked) {
        if(srcInDeviceMem) { kind = hipMemcpyDeviceToHost; }
        else { kind = hipMemcpyHostToHost; }
    }
    if (srcTracked && dstTracked) {
        if(srcInDeviceMem && dstInDeviceMem) { kind = hipMemcpyDeviceToDevice; }
        if(srcInDeviceMem && !dstInDeviceMem) { kind = hipMemcpyDeviceToHost; }
        if(!srcInDeviceMem && !dstInDeviceMem) { kind = hipMemcpyHostToHost; }
        if(!srcInDeviceMem && dstInDeviceMem) { kind = hipMemcpyHostToDevice; }
    }

    assert (kind != hipMemcpyDefault);
    return kind;
}


// Setup the copyCommandType and the copy agents (for hsa_amd_memory_async_copy)
// srcPhysAcc is the physical location of the src data.  For many copies this is the
void ihipStream_t::setAsyncCopyAgents(unsigned kind, ihipCommand_t *commandType, hsa_agent_t *srcAgent, hsa_agent_t *dstAgent)
{
    // current* represents the device associated with the specified stream.
    ihipDevice_t *streamDevice = this->getDevice();
    hsa_agent_t streamAgent = streamDevice->_hsa_agent;

    // ROCR runtime logic is :
    // -    If both src and dst are cpu agent, launch thread and memcpy.  We want to avoid this.
    // -    If either/both src or dst is a gpu agent, use the first gpu agent’s DMA engine to perform the copy.

    switch (kind) {
        //case hipMemcpyHostToHost     : *commandType = ihipCommandCopyH2H; *srcAgent=streamAgent; *dstAgent=streamAgent; break;  // TODO - enable me, for async copy use SDMA.
        case hipMemcpyHostToHost     : *commandType = ihipCommandCopyH2H; *srcAgent=g_cpu_agent; *dstAgent=g_cpu_agent; break;
        case hipMemcpyHostToDevice   : *commandType = ihipCommandCopyH2D; *srcAgent=g_cpu_agent; *dstAgent=streamAgent; break;
        case hipMemcpyDeviceToHost   : *commandType = ihipCommandCopyD2H; *srcAgent=streamAgent; *dstAgent=g_cpu_agent; break;
        case hipMemcpyDeviceToDevice : *commandType = ihipCommandCopyD2D; *srcAgent=streamAgent; *dstAgent=streamAgent; break;
        default: throw ihipException(hipErrorInvalidMemcpyDirection);
    };
}


void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const void* src, size_t sizeBytes, unsigned kind)
{
    ihipDevice_t *device = this->getDevice();

    if (device == NULL) {
        throw ihipException(hipErrorInvalidDevice);
    }

    hc::accelerator acc;
    hc::AmPointerInfo dstPtrInfo(NULL, NULL, 0, acc, 0, 0);
    hc::AmPointerInfo srcPtrInfo(NULL, NULL, 0, acc, 0, 0);

    bool dstTracked = (hc::am_memtracker_getinfo(&dstPtrInfo, dst) == AM_SUCCESS);
    bool srcTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) == AM_SUCCESS);
    bool srcInDeviceMem = srcPtrInfo._isInDeviceMem;
    bool dstInDeviceMem = dstPtrInfo._isInDeviceMem;

    // Resolve default to a specific Kind so we know which algorithm to use:
    if (kind == hipMemcpyDefault) {
        kind = resolveMemcpyDirection(srcTracked, dstTracked, srcInDeviceMem, dstInDeviceMem);
    };

    hsa_signal_t depSignal;

    bool copyEngineCanSeeSrcAndDest = false;
    if (kind == hipMemcpyDeviceToDevice) {
#if USE_PEER_TO_PEER>=2
        // TODO - consider refactor.  Do we need to support simul access of enable/disable peers with access?
        LockedAccessor_DeviceCrit_t  dcrit(device->criticalData());
        if (dcrit->isPeer(::getDevice(dstPtrInfo._appId)) && (dcrit->isPeer(::getDevice(srcPtrInfo._appId)))) {
            copyEngineCanSeeSrcAndDest = true;
        }
#endif
    }

    if (kind == hipMemcpyHostToDevice) {
        int depSignalCnt = preCopyCommand(crit, NULL, &depSignal, ihipCommandCopyH2D);
        if(!srcTracked){
            if (HIP_STAGING_BUFFERS) {
                tprintf(DB_COPY1, "D2H && !dstTracked: staged copy H2D dst=%p src=%p sz=%zu\n", dst, src, sizeBytes);

                if (HIP_PININPLACE) {
                    device->_staging_buffer[0]->CopyHostToDevicePinInPlace(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL);
                } else  {
                    device->_staging_buffer[0]->CopyHostToDevice(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL);
                }

                // The copy waits for inputs and then completes before returning so can reset queue to empty:
                this->wait(crit, true);
            } else {
                // TODO - remove, slow path.
                tprintf(DB_COPY1, "H2D && ! srcTracked: am_copy dst=%p src=%p sz=%zu\n", dst, src, sizeBytes);
#if USE_AV_COPY
                _av.copy(src,dst,sizeBytes);
#else
                hc::am_copy(dst, src, sizeBytes);
#endif
            }
        }else{
            hsa_agent_t dstAgent = *(static_cast<hsa_agent_t*>(dstPtrInfo._acc.get_hsa_agent()));
            hsa_agent_t srcAgent = *(static_cast<hsa_agent_t*>(srcPtrInfo._acc.get_hsa_agent()));

            ihipSignal_t *ihipSignal = allocSignal(crit);
            hsa_signal_t copyCompleteSignal = ihipSignal->_hsa_signal;

            hsa_signal_store_relaxed(copyCompleteSignal, 1);
            void *devPtrSrc = srcPtrInfo._devicePointer;
            tprintf(DB_COPY1, "HSA Async_copy dst=%p src=%p sz=%zu\n", dst, src, sizeBytes);

            hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, dstAgent, devPtrSrc, srcAgent, sizeBytes, depSignalCnt, depSignalCnt ? &depSignal:0x0, copyCompleteSignal);

        // This is sync copy, so let's wait for copy right here:
            if (hsa_status == HSA_STATUS_SUCCESS) {
                waitCopy(crit, ihipSignal); // wait for copy, and return to pool.
            } else {
                throw ihipException(hipErrorInvalidValue);
            }
        }
    } else if (kind == hipMemcpyDeviceToHost) {
        int depSignalCnt = preCopyCommand(crit, NULL, &depSignal, ihipCommandCopyD2H);
        if (!dstTracked){
            if (HIP_STAGING_BUFFERS) {
                tprintf(DB_COPY1, "D2H && !dstTracked: staged copy D2H dst=%p src=%p sz=%zu\n", dst, src, sizeBytes);
                //printf ("staged-copy- read dep signals\n");
                device->_staging_buffer[1]->CopyDeviceToHost(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL);

                // The copy completes before returning so can reset queue to empty:
                this->wait(crit, true);

            } else {
            // TODO - remove, slow path.
                tprintf(DB_COPY1, "D2H && !dstTracked: am_copy dst=%p src=%p sz=%zu\n", dst, src, sizeBytes);
#if USE_AV_COPY
                _av.copy(src, dst, sizeBytes);
#else
                hc::am_copy(dst, src, sizeBytes);
#endif
            }
        }else{
            hsa_agent_t dstAgent = *(static_cast<hsa_agent_t*>(dstPtrInfo._acc.get_hsa_agent()));
            hsa_agent_t srcAgent = *(static_cast<hsa_agent_t*>(srcPtrInfo._acc.get_hsa_agent()));

            ihipSignal_t *ihipSignal = allocSignal(crit);
            hsa_signal_t copyCompleteSignal = ihipSignal->_hsa_signal;

            hsa_signal_store_relaxed(copyCompleteSignal, 1);
            void *devPtrDst = dstPtrInfo._devicePointer;
            tprintf(DB_COPY1, "HSA Async_copy dst=%p src=%p sz=%zu\n", dst, src, sizeBytes);

            hsa_status_t hsa_status = hsa_amd_memory_async_copy(devPtrDst, dstAgent, src, srcAgent, sizeBytes, depSignalCnt, depSignalCnt ? &depSignal:0x0, copyCompleteSignal);

        // This is sync copy, so let's wait for copy right here:
            if (hsa_status == HSA_STATUS_SUCCESS) {
                waitCopy(crit, ihipSignal); // wait for copy, and return to pool.
            } else {
                throw ihipException(hipErrorInvalidValue);
            }
        }
    } else if (kind == hipMemcpyHostToHost)  {
        int depSignalCnt = preCopyCommand(crit, NULL, &depSignal, ihipCommandCopyH2H);

        if (depSignalCnt) {
            // host waits before doing host memory copy.
            hsa_signal_wait_acquire(depSignal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE);
        }
        memcpy(dst, src, sizeBytes);
    } else if ((kind == hipMemcpyDeviceToDevice) && !copyEngineCanSeeSrcAndDest)  {
        int depSignalCnt = preCopyCommand(crit, NULL, &depSignal, ihipCommandCopyP2P);
        if (HIP_STAGING_BUFFERS) {
            tprintf(DB_COPY1, "P2P but engine can't see both pointers: staged copy P2P dst=%p src=%p sz=%zu\n", dst, src, sizeBytes);
            //printf ("staged-copy- read dep signals\n");
            hsa_agent_t dstAgent = * (static_cast<hsa_agent_t*> (dstPtrInfo._acc.get_hsa_agent()));
            hsa_agent_t srcAgent = * (static_cast<hsa_agent_t*> (srcPtrInfo._acc.get_hsa_agent()));

            device->_staging_buffer[1]->CopyPeerToPeer(dst, dstAgent, src, srcAgent, sizeBytes, depSignalCnt ? &depSignal : NULL);

            // The copy completes before returning so can reset queue to empty:
            this->wait(crit, true);

        } else {
            assert(0); // currently no fallback for this path.
        }

    } else {
        // If not special case - these can all be handled by the hsa async copy:
        ihipCommand_t commandType;
        hsa_agent_t srcAgent, dstAgent;
        setAsyncCopyAgents(kind, &commandType, &srcAgent, &dstAgent);

        int depSignalCnt = preCopyCommand(crit, NULL, &depSignal, commandType);

        // Get a completion signal:
        ihipSignal_t *ihipSignal = allocSignal(crit);
        hsa_signal_t copyCompleteSignal = ihipSignal->_hsa_signal;

        hsa_signal_store_relaxed(copyCompleteSignal, 1);

        tprintf(DB_COPY1, "HSA Async_copy dst=%p src=%p sz=%zu\n", dst, src, sizeBytes);

        hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent, sizeBytes, depSignalCnt, depSignalCnt ? &depSignal:0x0, copyCompleteSignal);

        // This is sync copy, so let's wait for copy right here:
        if (hsa_status == HSA_STATUS_SUCCESS) {
            waitCopy(crit, ihipSignal); // wait for copy, and return to pool.
        } else {
            throw ihipException(hipErrorInvalidValue);
        }
    }
}


// Sync copy that acquires lock:
void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, unsigned kind)
{
    LockedAccessor_StreamCrit_t crit (_criticalData);
    copySync(crit, dst, src, sizeBytes, kind);
}


void ihipStream_t::copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind)
{
    LockedAccessor_StreamCrit_t crit(_criticalData);

    ihipDevice_t *device = this->getDevice();

    if (device == NULL) {
        throw ihipException(hipErrorInvalidDevice);
    }

    if (kind == hipMemcpyHostToHost) {
        tprintf (DB_COPY2, "Asyc: H2H with memcpy");

        // TODO - consider if we want to perhaps use the GPU SDMA engines anyway, to avoid the host-side sync here and keep everything flowing on the GPU.
        /* As this is a CPU op, we need to wait until all
        the commands in current stream are finished.
        */
        this->wait(crit);

        memcpy(dst, src, sizeBytes);

    } else {
        bool trueAsync = true;

        hc::accelerator acc;
        hc::AmPointerInfo dstPtrInfo(NULL, NULL, 0, acc, 0, 0);
        hc::AmPointerInfo srcPtrInfo(NULL, NULL, 0, acc, 0, 0);
        bool dstTracked = (hc::am_memtracker_getinfo(&dstPtrInfo, dst) == AM_SUCCESS);
        bool srcTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) == AM_SUCCESS);


        // "tracked" really indicates if the pointer's virtual address is available in the GPU address space.
        // If both pointers are not tracked, we need to fall back to a sync copy.
        if (!dstTracked || !srcTracked) {
            trueAsync = false;
        }

        if (kind == hipMemcpyDefault) {
            bool srcInDeviceMem = (srcTracked && srcPtrInfo._isInDeviceMem);
            bool dstInDeviceMem = (dstTracked && dstPtrInfo._isInDeviceMem);
            kind = resolveMemcpyDirection(srcTracked, dstTracked, srcPtrInfo._isInDeviceMem, dstPtrInfo._isInDeviceMem);
        }


        ihipSignal_t *ihip_signal = allocSignal(crit);
        hsa_signal_store_relaxed(ihip_signal->_hsa_signal, 1);


        if(trueAsync == true){

            ihipCommand_t commandType;
            hsa_agent_t srcAgent, dstAgent;
            setAsyncCopyAgents(kind, &commandType, &srcAgent, &dstAgent);

            hsa_signal_t depSignal;
            int depSignalCnt = preCopyCommand(crit, ihip_signal, &depSignal, commandType);

            tprintf (DB_SYNC, " copy-async, waitFor=%lu completion=#%lu(%lu)\n", depSignalCnt? depSignal.handle:0x0, ihip_signal->_sig_id, ihip_signal->_hsa_signal.handle);

            hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent, sizeBytes, depSignalCnt, depSignalCnt ? &depSignal:0x0, ihip_signal->_hsa_signal);


            if (hsa_status == HSA_STATUS_SUCCESS) {
                if (HIP_LAUNCH_BLOCKING) {
                    tprintf(DB_SYNC, "LAUNCH_BLOCKING for completion of hipMemcpyAsync(%zu)\n", sizeBytes);
                    this->wait(crit);
                }
            } else {
                // This path can be hit if src or dst point to unpinned host memory.
                // TODO-stream - does async-copy fall back to sync if input pointers are not pinned?
                throw ihipException(hipErrorInvalidValue);
            }
        } else {
            copySync(crit, dst, src, sizeBytes, kind);
        }
    }
}

//-------------------------------------------------------------------------------------------------
//-------------------------------------------------------------------------------------------------
// HCC-specific accessor functions:

/**
 * @return #hipSuccess, #hipErrorInvalidDevice
 */
//---
hipError_t hipHccGetAccelerator(int deviceId, hc::accelerator *acc)
{
    std::call_once(hip_initialized, ihipInit);

    ihipDevice_t *d = ihipGetDevice(deviceId);
    hipError_t err;
    if (d == NULL) {
        err =  hipErrorInvalidDevice;
    } else {
        *acc = d->_acc;
        err = hipSuccess;
    }
    return ihipLogStatus(err);
}


/**
 * @return #hipSuccess
 */
//---
hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **av)
{
    std::call_once(hip_initialized, ihipInit);

    if (stream == hipStreamNull ) {
        ihipDevice_t *device = ihipGetTlsDefaultDevice();
        stream = device->_default_stream;
    }

    *av = &(stream->_av);

    hipError_t err = hipSuccess;
    return ihipLogStatus(err);
}

// TODO - review signal / error reporting code.
// TODO - describe naming convention. ihip _.  No accessors.  No early returns from functions. Set status to success at top, only set error codes in implementation.  No tabs.
//        Caps convention _ or camelCase
//        if { }
//        Should use ihip* data structures inside code rather than app-facing hip.  For example, use ihipDevice_t (rather than hipDevice_t), ihipStream_t (rather than hipStream_t).
//        locked_
// TODO - describe MT strategy
//
//// TODO - add identifier numbers for streams and devices to help with debugging.

#if ONE_OBJECT_FILE
#include "staging_buffer.cpp"
#endif