2016-01-26 20:14:33 -06:00
/*
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/**
* @file hip_hcc.cpp
*
* Contains definitions for functions that are large enough that we don't want to inline them everywhere.
* This file is compiled and linked into apps running HIP / HCC path.
*/
2016-03-23 11:13:02 -05:00
# include <assert.h>
# include <stdint.h>
# include <iostream>
# include <sstream>
# include <list>
# include <sys/types.h>
# include <unistd.h>
# include <deque>
# include <vector>
# include <algorithm>
2016-07-09 19:29:55 +05:30
# include <atomic>
2016-10-04 22:17:18 +05:30
2016-03-23 11:13:02 -05:00
# include <hc.hpp>
# include <hc_am.hpp>
2016-10-04 22:17:18 +05:30
# include "hsa/hsa_ext_amd.h"
# include "libhsakmt/hsakmt.h"
2016-03-23 11:13:02 -05:00
2016-10-04 22:17:18 +05:30
# include "hip/hip_runtime.h"
# include "hip/hcc_detail/hip_hcc.h"
# include "hip/hcc_detail/trace_helper.h"
2016-02-10 11:52:42 -06:00
2016-08-08 11:55:57 -05:00
//=================================================================================================
//Global variables:
//=================================================================================================
2016-03-24 04:57:30 -05:00
const int release = 1 ;
2016-09-02 15:49:22 -05:00
const char * API_COLOR = KGRN ;
const char * API_COLOR_END = KNRM ;
2016-03-24 04:57:30 -05:00
int HIP_LAUNCH_BLOCKING = 0 ;
int HIP_PRINT_ENV = 0 ;
int HIP_TRACE_API = 0 ;
2016-09-02 15:49:22 -05:00
std : : string HIP_TRACE_API_COLOR ( " green " ) ;
2016-03-29 17:27:30 -05:00
int HIP_ATP_MARKER = 0 ;
2016-03-24 04:57:30 -05:00
int HIP_DB = 0 ;
int HIP_VISIBLE_DEVICES = 0 ; /* Contains a comma-separated sequence of GPU identifiers */
2016-09-19 17:49:25 -05:00
int HIP_NUM_KERNELS_INFLIGHT = 128 ;
2016-10-05 11:47:12 -05:00
int HIP_BLOCKING_SYNC = 0 ;
2016-03-24 04:57:30 -05:00
2016-10-05 12:18:16 -05:00
//#define DISABLE_COPY_EXT 1
2016-02-16 01:59:13 -06:00
2016-03-24 04:57:30 -05:00
std : : once_flag hip_initialized ;
2016-08-07 21:46:51 -05:00
2016-08-09 21:29:42 +05:30
// Array of pointers to devices.
2016-08-08 11:55:57 -05:00
ihipDevice_t * * g_deviceArray ;
2016-03-24 04:57:30 -05:00
bool g_visible_device = false ;
unsigned g_deviceCnt ;
std : : vector < int > g_hip_visible_devices ;
hsa_agent_t g_cpu_agent ;
2016-02-16 01:59:13 -06:00
//=================================================================================================
2016-08-08 17:49:02 -05:00
// Thread-local storage:
//=================================================================================================
// Per-thread HIP error code; every thread starts out with hipSuccess.
// NOTE(review): the comment previously attached here described "the implicit context used by
// all HIP commands", settable by hipSetDevice or by the CTX manipulation commands. That text
// appears to describe tls_defaultCtx rather than this error variable - confirm.
thread_local hipError_t tls_lastHipError = hipSuccess ;
2016-08-07 21:46:51 -05:00
2016-08-08 17:49:02 -05:00
//=================================================================================================
// Top-level "free" functions:
//=================================================================================================
2016-08-07 21:46:51 -05:00
// Returns true when deviceIndex is smaller than the number of known devices.
// Only the upper bound is tested: an unsigned index can never be negative.
static inline bool ihipIsValidDevice(unsigned deviceIndex)
{
    const bool inRange = deviceIndex < g_deviceCnt;
    return inRange;
}
2016-08-08 11:55:57 -05:00
// Looks up the device object for deviceIndex.
// Returns NULL when the index is out of range. A negative index converts to a
// large unsigned value in the validity check and is therefore rejected as well.
ihipDevice_t* ihipGetDevice(int deviceIndex)
{
    if (!ihipIsValidDevice(deviceIndex)) {
        return NULL;
    }

    return g_deviceArray[deviceIndex];
}
// Returns the primary context of the device at deviceIndex, or NULL when no
// such device exists.
ihipCtx_t* ihipGetPrimaryCtx(unsigned deviceIndex)
{
    ihipDevice_t* device = ihipGetDevice(deviceIndex);
    if (device == NULL) {
        return NULL;
    }

    return device->getPrimaryCtx();
}
2016-09-02 12:47:25 -05:00
static thread_local ihipCtx_t * tls_defaultCtx = nullptr ;
2016-08-09 15:37:19 -05:00
void ihipSetTlsDefaultCtx ( ihipCtx_t * ctx )
{
tls_defaultCtx = ctx ;
}
2016-08-07 21:46:51 -05:00
//---
2016-08-09 15:37:19 -05:00
//TODO - review the context creation strategy here. Really should be:
// - first "non-device" runtime call creates the context for this thread. Allowed to call setDevice first.
// - hipDeviceReset destroys the primary context for device?
// - Then context is created again for next usage.
2016-08-07 21:46:51 -05:00
ihipCtx_t * ihipGetTlsDefaultCtx ( )
{
2016-08-09 15:37:19 -05:00
// Per-thread initialization of the TLS:
if ( ( tls_defaultCtx = = nullptr ) & & ( g_deviceCnt > 0 ) ) {
ihipSetTlsDefaultCtx ( ihipGetPrimaryCtx ( 0 ) ) ;
}
return tls_defaultCtx ;
2016-08-07 21:46:51 -05:00
}
2016-02-16 01:59:13 -06:00
2016-08-22 16:15:27 +05:30
// Waits for the streams of the calling thread's default context to go idle.
//
// Returns hipSuccess once the wait has finished, or hipErrorInvalidDevice when
// the thread has no default context (ihipGetTlsDefaultCtx() returns nullptr when
// no device is available). Previously that case dereferenced a null pointer.
hipError_t ihipSynchronize(void)
{
    ihipCtx_t* ctx = ihipGetTlsDefaultCtx();
    if (ctx == nullptr) {
        return hipErrorInvalidDevice;
    }

    ctx->locked_waitAllStreams(); // ignores non-blocking streams, this waits for all activity to finish.

    return hipSuccess;
}
2016-08-08 11:55:57 -05:00
2016-02-22 23:15:24 -06:00
2016-02-16 01:59:13 -06:00
//=================================================================================================
// ihipStream_t:
//=================================================================================================
//---
2016-08-07 21:46:51 -05:00
// Builds a stream owned by ctx on top of the accelerator_view av.
//   ctx   - owning context, stored as-is.
//   av    - accelerator view handed to the stream's critical data.
//   flags - stream creation flags, stored as-is.
// The id starts at 0; ihipCtxCriticalBase_t::addStream() assigns the real one.
ihipStream_t::ihipStream_t(ihipCtx_t* ctx, hc::accelerator_view av, unsigned int flags)
    : _id(0),  // will be set by add function.
      _flags(flags),
      _ctx(ctx),
      _criticalData(av)
{
    tprintf(DB_SYNC, " streamCreate: stream=%p \n ", this);
}
2016-02-20 11:01:43 -06:00
2016-02-16 01:59:13 -06:00
//---
// Destructor: has no statements of its own; members are destroyed implicitly.
ihipStream_t::~ihipStream_t() {}
2016-03-16 21:16:29 -05:00
//Wait for all kernel and data copy commands in this stream to complete.
2016-03-28 09:46:40 -05:00
//This signature should be used in routines that already have locked the stream mutex
2016-03-28 21:41:47 -05:00
void ihipStream_t : : wait ( LockedAccessor_StreamCrit_t & crit , bool assertQueueEmpty )
2016-02-22 23:15:24 -06:00
{
2016-03-18 03:02:00 -05:00
if ( ! assertQueueEmpty ) {
2016-03-23 01:17:53 -05:00
tprintf ( DB_SYNC , " stream %p wait for queue-empty.. \n " , this ) ;
2016-10-05 11:47:12 -05:00
crit - > _av . wait ( HIP_BLOCKING_SYNC ? hc : : hcWaitModeBlocked : hc : : hcWaitModeActive ) ;
2016-08-05 15:05:57 +05:30
}
2016-07-27 18:31:11 -05:00
crit - > _kernelCnt = 0 ;
2016-03-28 09:46:40 -05:00
}
//---
//Wait for all kernel and data copy commands in this stream to complete.
void ihipStream_t : : locked_wait ( bool assertQueueEmpty )
{
2016-03-28 21:41:47 -05:00
LockedAccessor_StreamCrit_t crit ( _criticalData ) ;
2016-03-28 09:46:40 -05:00
wait ( crit , assertQueueEmpty ) ;
2016-01-26 20:14:33 -06:00
} ;
2016-09-01 13:59:55 -05:00
// Causes current stream to wait for specified event to complete:
void ihipStream_t : : locked_waitEvent ( hipEvent_t event )
{
LockedAccessor_StreamCrit_t crit ( _criticalData ) ;
// TODO - check state of event here:
crit - > _av . create_blocking_marker ( event - > _marker ) ;
}
2016-08-30 17:29:50 -05:00
// Create a marker in this stream.
// Save state in the event so it can track the status of the event.
void ihipStream_t : : locked_recordEvent ( hipEvent_t event )
{
// Lock the stream to prevent simultaneous access
LockedAccessor_StreamCrit_t crit ( _criticalData ) ;
event - > _marker = crit - > _av . create_marker ( ) ;
}
2016-08-08 11:55:57 -05:00
//=============================================================================
2016-01-26 20:14:33 -06:00
2016-04-16 10:18:56 -05:00
2016-08-08 11:55:57 -05:00
//-------------------------------------------------------------------------------------------------
2016-04-11 12:52:18 -05:00
2016-04-18 20:49:33 -05:00
2016-08-08 11:55:57 -05:00
//---
// Returns the device of the context that owns this stream.
const ihipDevice_t* ihipStream_t::getDevice() const
{
    const ihipDevice_t* owningDevice = _ctx->getDevice();
    return owningDevice;
}
2016-03-28 09:46:40 -05:00
2016-03-23 11:13:02 -05:00
2016-02-16 01:59:13 -06:00
2016-08-08 11:55:57 -05:00
// Returns the context this stream was created with.
ihipCtx_t* ihipStream_t::getCtx() const
{
    return this->_ctx;
}
2016-08-08 11:55:57 -05:00
2016-07-23 14:54:20 -05:00
2016-02-22 23:15:24 -06:00
//--
2016-09-19 17:09:50 -05:00
// Lock the stream to prevent other threads from intervening.
2016-08-30 17:29:50 -05:00
LockedAccessor_StreamCrit_t ihipStream_t : : lockopen_preKernelCommand ( )
2016-02-22 23:15:24 -06:00
{
2016-03-28 21:41:47 -05:00
LockedAccessor_StreamCrit_t crit ( _criticalData , false /*no unlock at destruction*/ ) ;
2016-02-25 19:17:28 -06:00
2016-07-26 17:09:27 -05:00
if ( crit - > _kernelCnt > HIP_NUM_KERNELS_INFLIGHT ) {
2016-09-19 17:09:50 -05:00
this - > wait ( crit ) ;
2016-07-26 17:09:27 -05:00
crit - > _kernelCnt = 0 ;
2016-07-26 14:03:51 -05:00
}
2016-07-26 17:09:27 -05:00
crit - > _kernelCnt + + ;
2016-02-22 23:15:24 -06:00
2016-08-30 17:29:50 -05:00
return crit ;
2016-02-22 23:15:24 -06:00
}
//---
2016-03-28 21:41:47 -05:00
// Must be called after kernel finishes, this releases the lock on the stream so other commands can submit.
2016-09-26 16:32:35 -05:00
void ihipStream_t : : lockclose_postKernelCommand ( hc : : accelerator_view * av )
2016-02-16 01:59:13 -06:00
{
2016-02-25 19:17:28 -06:00
2016-08-30 17:29:50 -05:00
if ( HIP_LAUNCH_BLOCKING ) {
2016-10-05 11:47:12 -05:00
// TODO - fix this so it goes through proper stream::wait() call.// direct wait OK since we know the stream is locked.
av - > wait ( hc : : hcWaitModeActive ) ;
2016-08-30 17:29:50 -05:00
tprintf ( DB_SYNC , " %s LAUNCH_BLOCKING for kernel completion \n " , ToString ( this ) . c_str ( ) ) ;
}
2016-03-28 21:41:47 -05:00
_criticalData . unlock ( ) ; // paired with lock from lockopen_preKernelCommand.
2016-02-22 23:15:24 -06:00
} ;
2016-09-02 12:47:25 -05:00
// Precursor: the stream is already locked,specifically so this routine can enqueue work into the specified av.
void ihipStream_t : : launchModuleKernel (
hc : : accelerator_view av ,
hsa_signal_t signal ,
2016-08-22 14:17:55 -05:00
uint32_t blockDimX ,
uint32_t blockDimY ,
uint32_t blockDimZ ,
uint32_t gridDimX ,
uint32_t gridDimY ,
uint32_t gridDimZ ,
2016-09-07 12:57:18 -05:00
uint32_t groupSegmentSize ,
2016-09-19 11:20:51 -05:00
uint32_t privateSegmentSize ,
2016-08-22 14:17:55 -05:00
void * kernarg ,
size_t kernSize ,
uint64_t kernel ) {
hsa_status_t status ;
void * kern ;
2016-08-30 17:29:50 -05:00
hsa_amd_memory_pool_t * pool = reinterpret_cast < hsa_amd_memory_pool_t * > ( av . get_hsa_kernarg_region ( ) ) ;
2016-08-22 14:17:55 -05:00
status = hsa_amd_memory_pool_allocate ( * pool , kernSize , 0 , & kern ) ;
2016-08-30 17:29:50 -05:00
status = hsa_amd_agents_allow_access ( 1 , ( hsa_agent_t * ) av . get_hsa_agent ( ) , 0 , kern ) ;
2016-08-22 14:17:55 -05:00
memcpy ( kern , kernarg , kernSize ) ;
2016-08-30 17:29:50 -05:00
hsa_queue_t * Queue = ( hsa_queue_t * ) av . get_hsa_queue ( ) ;
2016-08-22 14:17:55 -05:00
const uint32_t queue_mask = Queue - > size - 1 ;
uint32_t packet_index = hsa_queue_load_write_index_relaxed ( Queue ) ;
hsa_kernel_dispatch_packet_t * dispatch_packet = & ( ( ( hsa_kernel_dispatch_packet_t * ) ( Queue - > base_address ) ) [ packet_index & queue_mask ] ) ;
dispatch_packet - > completion_signal = signal ;
dispatch_packet - > workgroup_size_x = blockDimX ;
dispatch_packet - > workgroup_size_y = blockDimY ;
dispatch_packet - > workgroup_size_z = blockDimZ ;
dispatch_packet - > grid_size_x = blockDimX * gridDimX ;
dispatch_packet - > grid_size_y = blockDimY * gridDimY ;
dispatch_packet - > grid_size_z = blockDimZ * gridDimZ ;
2016-09-07 12:57:18 -05:00
dispatch_packet - > group_segment_size = groupSegmentSize ;
dispatch_packet - > private_segment_size = privateSegmentSize ;
2016-08-22 14:17:55 -05:00
dispatch_packet - > kernarg_address = kern ;
dispatch_packet - > kernel_object = kernel ;
uint16_t header = ( HSA_PACKET_TYPE_KERNEL_DISPATCH < < HSA_PACKET_HEADER_TYPE ) |
( 1 < < HSA_PACKET_HEADER_BARRIER ) |
( HSA_FENCE_SCOPE_SYSTEM < < HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE ) |
( HSA_FENCE_SCOPE_SYSTEM < < HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE ) ;
uint16_t setup = 1 < < HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS ;
uint32_t header32 = header | ( setup < < 16 ) ;
__atomic_store_n ( ( uint32_t * ) ( dispatch_packet ) , header32 , __ATOMIC_RELEASE ) ;
hsa_queue_store_write_index_relaxed ( Queue , packet_index + 1 ) ;
hsa_signal_store_relaxed ( Queue - > doorbell_signal , packet_index ) ;
}
2016-03-16 21:16:29 -05:00
2016-08-08 11:55:57 -05:00
//=============================================================================
// Rebuilds _peerCnt and the packed _peerAgents array from the current peer
// list. Call whenever a peer is added or deleted; the packed array can then be
// used efficiently on each memory allocation.
template<>
void ihipCtxCriticalBase_t<CtxMutex>::recomputePeerAgents()
{
    _peerCnt = 0;

    for (ihipCtx_t* ctx : _peers) {
        _peerAgents[_peerCnt++] = ctx->getDevice()->_hsaAgent;
    }
}
2016-03-26 10:46:20 -05:00
2016-04-18 20:49:33 -05:00
2016-08-08 11:55:57 -05:00
// Returns true when 'peer' is present in this context's peer list.
template<>
bool ihipCtxCriticalBase_t<CtxMutex>::isPeer(const ihipCtx_t* peer)
{
    return std::find(_peers.begin(), _peers.end(), peer) != std::end(_peers);
}
2016-04-18 20:49:33 -05:00
2016-08-05 15:05:57 +05:30
2016-08-08 11:55:57 -05:00
// Adds 'peer' to the peer list and refreshes the packed agent array.
// Returns true when the peer was added, false when it was already on the list
// (in which case nothing changes).
template<>
bool ihipCtxCriticalBase_t<CtxMutex>::addPeer(ihipCtx_t* peer)
{
    const bool alreadyPeer = std::find(_peers.begin(), _peers.end(), peer) != std::end(_peers);
    if (alreadyPeer) {
        // Peer was already on the list, silently ignore.
        return false;
    }

    // Not already a peer, let's update the list:
    _peers.push_back(peer);
    recomputePeerAgents();
    return true;
}
2016-04-18 20:49:33 -05:00
2016-08-08 11:55:57 -05:00
// Removes 'peer' from the peer list and refreshes the packed agent array.
// Returns true when the peer was found and removed, false otherwise.
template<>
bool ihipCtxCriticalBase_t<CtxMutex>::removePeer(ihipCtx_t* peer)
{
    const bool isKnownPeer = std::find(_peers.begin(), _peers.end(), peer) != std::end(_peers);
    if (!isKnownPeer) {
        return false;
    }

    // Found a valid peer, let's remove it.
    _peers.remove(peer);
    recomputePeerAgents();
    return true;
}
2016-04-11 12:52:18 -05:00
2016-08-08 11:55:57 -05:00
// Empties the peer list and then re-adds 'thisDevice', because the peer list
// always contains the self agent.
template<>
void ihipCtxCriticalBase_t<CtxMutex>::resetPeers(ihipCtx_t* thisDevice)
{
    _peerCnt = 0;
    _peers.clear();

    addPeer(thisDevice);
}
2016-02-12 17:39:44 -06:00
2016-09-22 10:39:17 -05:00
// Writes the toString() form of every peer context to 'f', in list order.
template<>
void ihipCtxCriticalBase_t<CtxMutex>::printPeers(FILE* f) const
{
    for (const auto& peerCtx : _peers) {
        fprintf(f, " %s ", peerCtx->toString().c_str());
    }
}
2016-08-08 11:55:57 -05:00
// Registers 'stream' with this context. The stream's id becomes the number of
// streams registered before it, and the stream is appended to the stream list.
template<>
void ihipCtxCriticalBase_t<CtxMutex>::addStream(ihipStream_t* stream)
{
    const auto streamsSoFar = _streams.size();
    stream->_id = streamsSoFar;

    _streams.push_back(stream);
}
//=============================================================================
2016-03-26 11:45:25 -05:00
2016-08-08 14:54:38 -05:00
//=================================================================================================
// ihipDevice_t
//=================================================================================================
// Builds the device object for accelerator 'acc'.
//   deviceId  - index of this device, stored in _deviceId.
//   deviceCnt - total number of devices, forwarded to the primary context.
//   acc       - HCC accelerator backing this device.
//
// Steps: query the compute-unit count from the HSA agent, remember the agent,
// fill in the device properties, and create the primary context.
//
// When the accelerator exposes no HSA agent, the agent handle is set to the
// all-ones sentinel (initProperties() then reports hipErrorInvalidDevice).
// _computeUnits now falls back to 1 in that case as well; it was previously
// left uninitialised.
ihipDevice_t::ihipDevice_t(unsigned deviceId, unsigned deviceCnt, hc::accelerator& acc)
    : _deviceId(deviceId),
      _acc(acc)
{
    hsa_agent_t* agent = static_cast<hsa_agent_t*>(acc.get_hsa_agent());
    if (agent) {
        int err = hsa_agent_get_info(*agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &_computeUnits);
        if (err != HSA_STATUS_SUCCESS) {
            _computeUnits = 1;
        }

        _hsaAgent = *agent;
    } else {
        _hsaAgent.handle = static_cast<uint64_t>(-1);
        _computeUnits = 1;  // same fallback as a failed query
    }

    // NOTE(review): the status returned by initProperties() is not examined here.
    initProperties(&_props);

    _primaryCtx = new ihipCtx_t(this, deviceCnt, hipDeviceMapHost);
}
2016-03-26 11:45:25 -05:00
2016-08-08 11:55:57 -05:00
// Destroys the primary context created by the constructor and clears the pointer.
ihipDevice_t::~ihipDevice_t()
{
    ihipCtx_t* doomedCtx = _primaryCtx;
    _primaryCtx = NULL;
    delete doomedCtx;
}
# define ErrorCheck(x) error_check(x, __LINE__, __FILE__)
void error_check ( hsa_status_t hsa_error_code , int line_num , std : : string str ) {
2016-07-09 19:29:55 +05:30
if ( ( hsa_error_code ! = HSA_STATUS_SUCCESS ) & & ( hsa_error_code ! = HSA_STATUS_INFO_BREAK ) ) {
2016-01-26 20:14:33 -06:00
printf ( " HSA reported error! \n In file: %s \n At line: %d \n " , str . c_str ( ) , line_num ) ;
}
}
2016-08-08 11:55:57 -05:00
//---
// Helper for initProperties; callback for hsa_iterate_agents.
// 'data' must point to an int counter, which is incremented for every agent
// whose device type is HSA_DEVICE_TYPE_GPU.
// Returns HSA_STATUS_ERROR_INVALID_ARGUMENT for a NULL 'data', the query's
// status when the device-type query fails, and HSA_STATUS_SUCCESS otherwise.
static hsa_status_t countGpuAgents(hsa_agent_t agent, void* data)
{
    int* gpuCount = static_cast<int*>(data);
    if (gpuCount == NULL) {
        return HSA_STATUS_ERROR_INVALID_ARGUMENT;
    }

    hsa_device_type_t kind;
    const hsa_status_t queryStatus = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &kind);
    if (queryStatus != HSA_STATUS_SUCCESS) {
        return queryStatus;
    }

    if (kind == HSA_DEVICE_TYPE_GPU) {
        ++(*gpuCount);
    }
    return HSA_STATUS_SUCCESS;
}
2016-07-09 19:29:55 +05:30
// Agent-iteration callback that captures the first GPU agent.
// 'data' must point to an hsa_agent_t; when the visited agent is a GPU it is
// copied there and HSA_STATUS_INFO_BREAK is returned.
// Returns HSA_STATUS_ERROR_INVALID_ARGUMENT for a NULL 'data', the query's
// status when the device-type query fails, and HSA_STATUS_SUCCESS for non-GPU agents.
hsa_status_t FindGpuDevice(hsa_agent_t agent, void* data)
{
    hsa_agent_t* foundAgent = (hsa_agent_t*)data;
    if (foundAgent == NULL) {
        return HSA_STATUS_ERROR_INVALID_ARGUMENT;
    }

    hsa_device_type_t kind;
    const hsa_status_t queryStatus = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &kind);
    if (queryStatus != HSA_STATUS_SUCCESS) {
        return queryStatus;
    }

    if (kind != HSA_DEVICE_TYPE_GPU) {
        return HSA_STATUS_SUCCESS;
    }

    *foundAgent = agent;
    return HSA_STATUS_INFO_BREAK;
}
2016-08-05 15:05:57 +05:30
// Memory-pool callback that records global-segment pools.
// 'data' must point to an hsa_amd_memory_pool_t. Every pool whose segment is
// HSA_AMD_SEGMENT_GLOBAL is written there; HSA_STATUS_SUCCESS is returned for
// every pool, so a later matching pool overwrites an earlier one.
// Query failures are reported through ErrorCheck but do not stop the callback.
// The global flags are queried but not otherwise used.
// Returns HSA_STATUS_ERROR_INVALID_ARGUMENT for a NULL 'data'.
hsa_status_t GetDevicePool(hsa_amd_memory_pool_t pool, void* data)
{
    if (data == NULL) {
        return HSA_STATUS_ERROR_INVALID_ARGUMENT;
    }

    hsa_amd_segment_t segment;
    hsa_status_t err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment);
    ErrorCheck(err);
    if (segment != HSA_AMD_SEGMENT_GLOBAL) {
        return HSA_STATUS_SUCCESS;
    }

    uint32_t flag;
    err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag);
    ErrorCheck(err);

    hsa_amd_memory_pool_t* out = (hsa_amd_memory_pool_t*)data;
    *out = pool;
    return HSA_STATUS_SUCCESS;
}
// Queries how 'agent' may access 'pool'.
// Returns the hsa_amd_memory_pool_access_t value reported by the runtime,
// converted to int. A failed query is reported through ErrorCheck and yields
// HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED; previously the returned value was
// indeterminate in that case.
int checkAccess(hsa_agent_t agent, hsa_amd_memory_pool_t pool)
{
    hsa_amd_memory_pool_access_t access = HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED;

    hsa_status_t err = hsa_amd_agent_memory_pool_get_info(agent, pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access);
    ErrorCheck(err);

    return access;
}
2016-01-26 20:14:33 -06:00
// Region-iteration callback used to fill memory sizes in a hipDeviceProp_t.
// 'data' must point to the hipDeviceProp_t to update:
//   - a READONLY-segment region sets totalConstMem to the region size;
//   - a GROUP-segment region sets sharedMemPerBlock to the region size;
//   - other segments are ignored.
// Returns HSA_STATUS_ERROR_INVALID_ARGUMENT for a NULL 'data' (as the other
// callbacks in this file do). Otherwise always returns HSA_STATUS_SUCCESS so the
// iteration continues. Query failures are reported through ErrorCheck; a region
// whose segment cannot be determined is skipped rather than being classified
// from an uninitialised value.
hsa_status_t get_region_info(hsa_region_t region, void* data)
{
    if (data == NULL) {
        return HSA_STATUS_ERROR_INVALID_ARGUMENT;
    }
    hipDeviceProp_t* p_prop = reinterpret_cast<hipDeviceProp_t*>(data);

    // Get region segment
    uint32_t region_segment = 0;
    hsa_status_t err = hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &region_segment);
    ErrorCheck(err);
    if (err != HSA_STATUS_SUCCESS) {
        return HSA_STATUS_SUCCESS;  // skip this region, keep iterating
    }

    switch (region_segment) {
    case HSA_REGION_SEGMENT_READONLY:
        err = hsa_region_get_info(region, HSA_REGION_INFO_SIZE, &(p_prop->totalConstMem));
        break;
    /* case HSA_REGION_SEGMENT_PRIVATE:
        cout<<"PRIVATE"<<endl; private segment cannot be queried */
    case HSA_REGION_SEGMENT_GROUP:
        err = hsa_region_get_info(region, HSA_REGION_INFO_SIZE, &(p_prop->sharedMemPerBlock));
        break;
    default:
        break;
    }
    // Surface a failed size query instead of dropping it silently.
    ErrorCheck(err);

    return HSA_STATUS_SUCCESS;
}
2016-08-08 11:55:57 -05:00
2016-02-25 23:44:39 +03:00
// Determines if the given agent is of type HSA_DEVICE_TYPE_GPU and counts it.
2016-08-08 11:55:57 -05:00
static hsa_status_t findCpuAgent ( hsa_agent_t agent , void * data )
{
2016-02-25 23:44:39 +03:00
hsa_device_type_t device_type ;
hsa_status_t status = hsa_agent_get_info ( agent , HSA_AGENT_INFO_DEVICE , & device_type ) ;
if ( status ! = HSA_STATUS_SUCCESS ) {
return status ;
}
2016-08-08 11:55:57 -05:00
if ( device_type = = HSA_DEVICE_TYPE_CPU ) {
( * static_cast < hsa_agent_t * > ( data ) ) = agent ;
return HSA_STATUS_INFO_BREAK ;
2016-02-25 23:44:39 +03:00
}
2016-08-08 11:55:57 -05:00
2016-02-25 23:44:39 +03:00
return HSA_STATUS_SUCCESS ;
}
2016-01-26 20:14:33 -06:00
2016-08-08 11:55:57 -05:00
// Returns hipErrorInvalidDevice from the enclosing function when x is not HSA_STATUS_SUCCESS.
#define DeviceErrorCheck(x) if ((x) != HSA_STATUS_SUCCESS) { return hipErrorInvalidDevice; }

//---
// Initialize properties for the device.
// Call this once when the ihipDevice_t is created.
//
//   prop - structure to fill in.
//
// Returns hipSuccess when every mandatory query succeeded, and
// hipErrorInvalidDevice when the device has no valid HSA agent, when any HSA
// query fails, or when the accelerator memory region is unavailable. On an
// error return, fields after the failing query are left untouched.
//
// Fixes relative to the previous implementation:
//   * isMultiGpuBoard was computed as `0 ? (count < 2) : 1`, which is always 1.
//     It is now 1 only when at least two GPU agents are present.
//   * clockRate is scaled only after its query has been checked.
//   * The accelerator memory region pointer is checked before it is dereferenced.
hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop)
{
    hsa_status_t err;

    // Set some defaults in case we don't find the appropriate regions:
    prop->totalGlobalMem = 0;
    prop->totalConstMem = 0;
    prop->sharedMemPerBlock = 0;
    prop->maxThreadsPerMultiProcessor = 0;
    prop->regsPerBlock = 0;

    // The constructor stores an all-ones handle when no HSA agent is available.
    if (_hsaAgent.handle == static_cast<uint64_t>(-1)) {
        return hipErrorInvalidDevice;
    }

    // Iterates over the agents to determine Multiple GPU devices
    // using the countGpuAgents callback.
    //! @bug : on HCC, isMultiGpuBoard returns True if system contains multiple GPUS (rather than if GPU is on a multi-ASIC board)
    int gpuAgentsCount = 0;
    err = hsa_iterate_agents(countGpuAgents, &gpuAgentsCount);
    if (err == HSA_STATUS_INFO_BREAK) {
        err = HSA_STATUS_SUCCESS;
    }
    DeviceErrorCheck(err);
    prop->isMultiGpuBoard = (gpuAgentsCount < 2) ? 0 : 1;

    // Get agent name
    err = hsa_agent_get_info(_hsaAgent, HSA_AGENT_INFO_NAME, &(prop->name));
    DeviceErrorCheck(err);

    // Get agent node
    uint32_t node;
    err = hsa_agent_get_info(_hsaAgent, HSA_AGENT_INFO_NODE, &node);
    DeviceErrorCheck(err);

    // Get wavefront size
    err = hsa_agent_get_info(_hsaAgent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &prop->warpSize);
    DeviceErrorCheck(err);

    // Get max total number of work-items in a workgroup
    err = hsa_agent_get_info(_hsaAgent, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &prop->maxThreadsPerBlock);
    DeviceErrorCheck(err);

    // Get max number of work-items of each dimension of a work-group
    uint16_t work_group_max_dim[3];
    err = hsa_agent_get_info(_hsaAgent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM, work_group_max_dim);
    DeviceErrorCheck(err);
    for (int i = 0; i < 3; i++) {
        prop->maxThreadsDim[i] = work_group_max_dim[i];
    }

    // Get max grid size; UINT32_MAX is clamped to INT32_MAX so it fits in an int.
    hsa_dim3_t grid_max_dim;
    err = hsa_agent_get_info(_hsaAgent, HSA_AGENT_INFO_GRID_MAX_DIM, &grid_max_dim);
    DeviceErrorCheck(err);
    prop->maxGridSize[0] = (int)((grid_max_dim.x == UINT32_MAX) ? (INT32_MAX) : grid_max_dim.x);
    prop->maxGridSize[1] = (int)((grid_max_dim.y == UINT32_MAX) ? (INT32_MAX) : grid_max_dim.y);
    prop->maxGridSize[2] = (int)((grid_max_dim.z == UINT32_MAX) ? (INT32_MAX) : grid_max_dim.z);

    // Get Max clock frequency
    err = hsa_agent_get_info(_hsaAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, &prop->clockRate);
    DeviceErrorCheck(err);
    prop->clockRate *= 1000.0;   // convert Mhz to Khz.

    //uint64_t counterHz;
    //err = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &counterHz);
    //DeviceErrorCheck(err);
    //prop->clockInstructionRate = counterHz / 1000;
    prop->clockInstructionRate = 100*1000; /* TODO-RT - hard-code until HSART has function to properly report clock */

    // Get Agent BDFID (bus/device/function ID)
    uint16_t bdf_id = 1;
    err = hsa_agent_get_info(_hsaAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_BDFID, &bdf_id);
    DeviceErrorCheck(err);

    // BDFID is 16bit uint: [8bit - BusID | 5bit - Device ID | 3bit - Function/DomainID]
    // TODO/Clarify: cudaDeviceProp::pciDomainID how to report?
    // prop->pciDomainID =  bdf_id & 0x7;
    prop->pciDeviceID = (bdf_id >> 3) & 0x1F;
    prop->pciBusID    = (bdf_id >> 8) & 0xFF;

    // Masquerade as a 2.0-level device. This will change as more HW functions are properly supported.
    // Application code should use the arch.has* to do detailed feature detection.
    // NOTE(review): this comment used to say "3.0-level" while the code reports 2.0 - confirm the intended value.
    prop->major = 2;
    prop->minor = 0;

    // Get number of Compute Unit
    err = hsa_agent_get_info(_hsaAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &(prop->multiProcessorCount));
    DeviceErrorCheck(err);

    // TODO-hsart - this appears to return 0?
    uint32_t cache_size[4];
    err = hsa_agent_get_info(_hsaAgent, HSA_AGENT_INFO_CACHE_SIZE, cache_size);
    DeviceErrorCheck(err);
    prop->l2CacheSize = cache_size[1];

    /* Computemode for HSA Devices is always : cudaComputeModeDefault */
    prop->computeMode = 0;

    _isLargeBar = _acc.has_cpu_accessible_am();

    // Get Max Threads Per Multiprocessor.
    // Best effort: when the KFD queries fail, the default of 0 set above is kept.
    HsaSystemProperties props;
    hsaKmtReleaseSystemProperties();
    if (HSAKMT_STATUS_SUCCESS == hsaKmtAcquireSystemProperties(&props)) {
        HsaNodeProperties node_prop = {0};
        if (HSAKMT_STATUS_SUCCESS == hsaKmtGetNodeProperties(node, &node_prop)) {
            uint32_t waves_per_cu = node_prop.MaxWavesPerSIMD;
            uint32_t simd_per_cu = node_prop.NumSIMDPerCU;
            prop->maxThreadsPerMultiProcessor = prop->warpSize * waves_per_cu * simd_per_cu;
        }
    }

    // Get memory properties
    err = hsa_agent_iterate_regions(_hsaAgent, get_region_info, prop);
    DeviceErrorCheck(err);

    // Get the size of the region we are using for Accelerator Memory allocations:
    hsa_region_t* am_region = static_cast<hsa_region_t*>(_acc.get_hsa_am_region());
    if (am_region == NULL) {
        return hipErrorInvalidDevice;
    }
    err = hsa_region_get_info(*am_region, HSA_REGION_INFO_SIZE, &prop->totalGlobalMem);
    DeviceErrorCheck(err);

    // maxSharedMemoryPerMultiProcessor should be as the same as group memory size.
    // Group memory will not be paged out, so, the physical memory size is the total shared memory size, and also equal to the group region size.
    // NOTE(review): the value assigned is the global (accelerator) memory size, not the
    // group-region size the comment above describes - confirm which one is intended.
    prop->maxSharedMemoryPerMultiProcessor = prop->totalGlobalMem;

    // Get Max memory clock frequency
    err = hsa_region_get_info(*am_region, (hsa_region_info_t)HSA_AMD_REGION_INFO_MAX_CLOCK_FREQUENCY, &prop->memoryClockRate);
    DeviceErrorCheck(err);
    prop->memoryClockRate *= 1000.0;   // convert Mhz to Khz.

    // Get global memory bus width in bits
    err = hsa_region_get_info(*am_region, (hsa_region_info_t)HSA_AMD_REGION_INFO_BUS_WIDTH, &prop->memoryBusWidth);
    DeviceErrorCheck(err);

    // Set feature flags - these are all mandatory for HIP on HCC path:
    // Some features are under-development and future revs may support flags that are currently 0.
    // Reporting of these flags should be synchronized with the HIP_ARCH* compile-time defines in hip_runtime.h
    prop->arch.hasGlobalInt32Atomics       = 1;
    prop->arch.hasGlobalFloatAtomicExch    = 1;
    prop->arch.hasSharedInt32Atomics       = 1;
    prop->arch.hasSharedFloatAtomicExch    = 1;
    prop->arch.hasFloatAtomicAdd           = 0;
    prop->arch.hasGlobalInt64Atomics       = 1;
    prop->arch.hasSharedInt64Atomics       = 1;
    prop->arch.hasDoubles                  = 1; // TODO - true for Fiji.
    prop->arch.hasWarpVote                 = 1;
    prop->arch.hasWarpBallot               = 1;
    prop->arch.hasWarpShuffle              = 1;
    prop->arch.hasFunnelShift              = 0; // TODO-hcc
    prop->arch.hasThreadFenceSystem        = 0; // TODO-hcc
    prop->arch.hasSyncThreadsExt           = 0; // TODO-hcc
    prop->arch.hasSurfaceFuncs             = 0; // TODO-hcc
    prop->arch.has3dGrid                   = 1;
    prop->arch.hasDynamicParallelism       = 0;

    prop->concurrentKernels = 1; // All ROCm hardware supports executing multiple kernels concurrently
    prop->canMapHostMemory = 1;  // All ROCm devices can map host memory
#if 0
    // TODO - code broken below since it always returns 1.
    // Are the flags part of the context or part of the device?
    if ( _device_flags | hipDeviceMapHost) {
        prop->canMapHostMemory = 1;
    } else {
        prop->canMapHostMemory = 0;
    }
#endif

    return hipSuccess;
}
2016-03-19 02:44:26 -05:00
2016-08-08 11:55:57 -05:00
//=================================================================================================
2016-08-08 14:54:38 -05:00
// ihipCtx_t
2016-08-08 11:55:57 -05:00
//=================================================================================================
2016-08-08 14:54:38 -05:00
// Construct a context bound to 'device'.
//   device    : device this context belongs to.
//   deviceCnt : forwarded to the constructor of the context's critical data.
//   flags     : stored as the context flags.
// The default stream is not created here directly; locked_reset() allocates it.
ihipCtx_t::ihipCtx_t(ihipDevice_t *device, unsigned deviceCnt, unsigned flags)
    : _ctxFlags(flags), _device(device), _criticalData(deviceCnt)
{
    // locked_reset() assigns _defaultStream and registers it with the critical data.
    locked_reset();

    tprintf(DB_SYNC, " created ctx with defaultStream=%p \n ", _defaultStream);
}
// Destroy the context: releases the default stream and clears the pointer.
ihipCtx_t::~ihipCtx_t()
{
    // Deleting a null pointer is a no-op, so no explicit guard is needed.
    delete _defaultStream;
    _defaultStream = NULL;
}
//Reset the device - this is called from hipDeviceReset.
//Device may be reset multiple times, and may be reset after init.
void ihipCtx_t : : locked_reset ( )
{
// Obtain mutex access to the device critical data, release by destructor
LockedAccessor_CtxCrit_t crit ( _criticalData ) ;
//---
//Wait for pending activity to complete? TODO - check if this is required behavior:
tprintf ( DB_SYNC , " locked_reset waiting for activity to complete. \n " ) ;
// Reset and remove streams:
// Delete all created streams including the default one.
for ( auto streamI = crit - > const_streams ( ) . begin ( ) ; streamI ! = crit - > const_streams ( ) . end ( ) ; streamI + + ) {
ihipStream_t * stream = * streamI ;
( * streamI ) - > locked_wait ( ) ;
tprintf ( DB_SYNC , " delete stream=%p \n " , stream ) ;
delete stream ;
}
// Clear the list.
crit - > streams ( ) . clear ( ) ;
// Create a fresh default stream and add it:
2016-08-08 14:54:38 -05:00
_defaultStream = new ihipStream_t ( this , getDevice ( ) - > _acc . get_default_view ( ) , hipStreamDefault ) ;
crit - > addStream ( _defaultStream ) ;
2016-08-08 11:55:57 -05:00
// Reset peer list to just me:
crit - > resetPeers ( this ) ;
// Reset and release all memory stored in the tracker:
// Reset will remove peer mapping so don't need to do this explicitly.
// FIXME - This is clearly a non-const action! Is this a context reset or a device reset - maybe should reference count?
2016-08-08 14:54:38 -05:00
ihipDevice_t * device = getWriteableDevice ( ) ;
2016-08-08 11:55:57 -05:00
am_memtracker_reset ( device - > _acc ) ;
} ;
2016-09-22 10:39:17 -05:00
//---
std : : string ihipCtx_t : : toString ( ) const
{
std : : ostringstream ss ;
ss < < this ;
return ss . str ( ) ;
} ;
2016-08-08 11:55:57 -05:00
//----
//=================================================================================================
// Utility functions, these are not part of the public HIP API
//=================================================================================================
//=================================================================================================
2016-03-19 02:44:26 -05:00
// Implement "default" stream syncronization
// This waits for all other streams to drain before continuing.
// If waitOnSelf is set, this additionally waits for the default stream to empty.
2016-08-07 21:46:51 -05:00
void ihipCtx_t : : locked_syncDefaultStream ( bool waitOnSelf )
2016-03-19 02:44:26 -05:00
{
2016-08-08 11:55:57 -05:00
LockedAccessor_CtxCrit_t crit ( _criticalData ) ;
2016-03-26 10:46:20 -05:00
2016-03-19 02:44:26 -05:00
tprintf ( DB_SYNC , " syncDefaultStream \n " ) ;
2016-03-28 21:41:47 -05:00
for ( auto streamI = crit - > const_streams ( ) . begin ( ) ; streamI ! = crit - > const_streams ( ) . end ( ) ; streamI + + ) {
2016-03-19 02:44:26 -05:00
ihipStream_t * stream = * streamI ;
2016-03-25 09:24:08 -05:00
2016-03-19 02:44:26 -05:00
// Don't wait for streams that have "opted-out" of syncing with NULL stream.
// And - don't wait for the NULL stream
if ( ! ( stream - > _flags & hipStreamNonBlocking ) ) {
2016-08-08 14:54:38 -05:00
if ( waitOnSelf | | ( stream ! = _defaultStream ) ) {
2016-03-19 02:44:26 -05:00
// TODO-hcc - use blocking or active wait here?
// TODO-sync - cudaDeviceBlockingSync
2016-03-28 09:46:40 -05:00
stream - > locked_wait ( ) ;
2016-03-19 02:44:26 -05:00
}
}
}
}
2016-03-26 10:46:20 -05:00
//---
2016-08-07 21:46:51 -05:00
void ihipCtx_t : : locked_addStream ( ihipStream_t * s )
2016-03-26 10:46:20 -05:00
{
2016-08-08 11:55:57 -05:00
LockedAccessor_CtxCrit_t crit ( _criticalData ) ;
2016-03-26 10:46:20 -05:00
2016-04-18 20:49:33 -05:00
crit - > addStream ( s ) ;
2016-03-26 10:46:20 -05:00
}
//---
2016-08-07 21:46:51 -05:00
void ihipCtx_t : : locked_removeStream ( ihipStream_t * s )
2016-03-26 10:46:20 -05:00
{
2016-08-08 11:55:57 -05:00
LockedAccessor_CtxCrit_t crit ( _criticalData ) ;
2016-03-26 10:46:20 -05:00
2016-03-28 21:41:47 -05:00
crit - > streams ( ) . remove ( s ) ;
2016-03-26 10:46:20 -05:00
}
2016-03-19 02:44:26 -05:00
2016-03-19 05:42:19 -05:00
//---
//Heavyweight synchronization that waits on all streams, ignoring hipStreamNonBlocking flag.
2016-08-07 21:46:51 -05:00
void ihipCtx_t : : locked_waitAllStreams ( )
2016-03-19 05:42:19 -05:00
{
2016-08-08 11:55:57 -05:00
LockedAccessor_CtxCrit_t crit ( _criticalData ) ;
2016-03-26 10:46:20 -05:00
2016-03-19 05:42:19 -05:00
tprintf ( DB_SYNC , " waitAllStream \n " ) ;
2016-03-28 21:41:47 -05:00
for ( auto streamI = crit - > const_streams ( ) . begin ( ) ; streamI ! = crit - > const_streams ( ) . end ( ) ; streamI + + ) {
2016-03-28 09:46:40 -05:00
( * streamI ) - > locked_wait ( ) ;
2016-03-19 05:42:19 -05:00
}
}
2016-03-19 02:44:26 -05:00
2016-09-02 15:49:22 -05:00
//---
2016-01-26 20:14:33 -06:00
// Read environment variables.
void ihipReadEnv_I ( int * var_ptr , const char * var_name1 , const char * var_name2 , const char * description )
{
char * env = getenv ( var_name1 ) ;
// Check second name if first not defined, used to allow HIP_ or CUDA_ env vars.
if ( ( env = = NULL ) & & strcmp ( var_name2 , " 0 " ) ) {
env = getenv ( var_name2 ) ;
}
2016-09-02 15:49:22 -05:00
// TODO: Refactor this code so it is a separate call rather than being part of ihipReadEnv_I, which should only read integers.
2016-02-16 07:39:04 -06:00
// Check if the environment variable is either HIP_VISIBLE_DEVICES or CUDA_LAUNCH_BLOCKING, which
// contains a sequence of comma-separated device IDs
if ( ! ( strcmp ( var_name1 , " HIP_VISIBLE_DEVICES " ) & & strcmp ( var_name2 , " CUDA_VISIBLE_DEVICES " ) ) & & env ) {
2016-02-16 10:00:05 -06:00
// Parse the string stream of env and store the device ids to g_hip_visible_devices global variable
2016-02-16 07:39:04 -06:00
std : : string str = env ;
std : : istringstream ss ( str ) ;
std : : string device_id ;
2016-02-17 06:59:18 -06:00
// Clean up the defult value
g_hip_visible_devices . clear ( ) ;
2016-02-27 14:14:08 -06:00
g_visible_device = true ;
2016-02-17 06:59:18 -06:00
// Read the visible device numbers
2016-02-16 07:39:04 -06:00
while ( std : : getline ( ss , device_id , ' , ' ) ) {
2016-02-16 10:00:05 -06:00
if ( atoi ( device_id . c_str ( ) ) > = 0 ) {
g_hip_visible_devices . push_back ( atoi ( device_id . c_str ( ) ) ) ;
2016-08-30 17:29:50 -05:00
} else { // Any device number after invalid number will not present
2016-02-16 10:00:05 -06:00
break ;
2016-08-30 17:29:50 -05:00
}
2016-02-16 07:39:04 -06:00
}
2016-02-17 06:59:18 -06:00
// Print out the number of ids
2016-02-16 07:39:04 -06:00
if ( HIP_PRINT_ENV ) {
2016-02-16 10:00:05 -06:00
printf ( " %-30s = " , var_name1 ) ;
for ( int i = 0 ; i < g_hip_visible_devices . size ( ) ; i + + )
printf ( " %2d " , g_hip_visible_devices [ i ] ) ;
printf ( " : %s \n " , description ) ;
2016-02-16 07:39:04 -06:00
}
2016-01-26 20:14:33 -06:00
}
2016-02-16 07:39:04 -06:00
else { // Parse environment variables with sigle value
// Default is set when variable is initialized (at top of this file), so only override if we find
// an environment variable.
if ( env ) {
long int v = strtol ( env , NULL , 0 ) ;
* var_ptr = ( int ) ( v ) ;
}
if ( HIP_PRINT_ENV ) {
printf ( " %-30s = %2d : %s \n " , var_name1 , * var_ptr , description ) ;
}
2016-01-26 20:14:33 -06:00
}
2016-09-02 15:49:22 -05:00
}
void ihipReadEnv_S ( std : : string * var_ptr , const char * var_name1 , const char * var_name2 , const char * description )
{
char * env = getenv ( var_name1 ) ;
// Check second name if first not defined, used to allow HIP_ or CUDA_ env vars.
if ( ( env = = NULL ) & & strcmp ( var_name2 , " 0 " ) ) {
env = getenv ( var_name2 ) ;
}
2016-02-16 07:39:04 -06:00
2016-09-02 15:49:22 -05:00
if ( env ) {
* var_ptr = env ;
}
if ( HIP_PRINT_ENV ) {
printf ( " %-30s = %s : %s \n " , var_name1 , var_ptr - > c_str ( ) , description ) ;
}
2016-01-26 20:14:33 -06:00
}
2016-09-02 15:49:22 -05:00
2016-01-26 20:14:33 -06:00
// READ_ENV_I / READ_ENV_S: read the environment variable whose name is the spelling of
// _ENV_VAR (alternate name: the spelling of _ENV_VAR2, pass 0 for "none") into the
// variable _ENV_VAR itself, via ihipReadEnv_I / ihipReadEnv_S.
// _build selects which builds honor the variable: 'release' variables are always read,
// 'debug' variables are only read when DEBUG is defined.
#if defined (DEBUG)
#define READ_ENV_I(_build, _ENV_VAR, _ENV_VAR2, _description) \
    if ((_build == release) || (_build == debug)) {\
        ihipReadEnv_I(&_ENV_VAR, #_ENV_VAR, #_ENV_VAR2, _description);\
    };

#define READ_ENV_S(_build, _ENV_VAR, _ENV_VAR2, _description) \
    if ((_build == release) || (_build == debug)) {\
        ihipReadEnv_S(&_ENV_VAR, #_ENV_VAR, #_ENV_VAR2, _description);\
    };
#else
#define READ_ENV_I(_build, _ENV_VAR, _ENV_VAR2, _description) \
    if (_build == release) {\
        ihipReadEnv_I(&_ENV_VAR, #_ENV_VAR, #_ENV_VAR2, _description);\
    };

#define READ_ENV_S(_build, _ENV_VAR, _ENV_VAR2, _description) \
    if (_build == release) {\
        ihipReadEnv_S(&_ENV_VAR, #_ENV_VAR, #_ENV_VAR2, _description);\
    };
#endif
//---
//Function called one-time at initialization time to construct a table of all GPU devices.
//HIP/CUDA uses integer "deviceIds" - these are indexes into this table.
//AMP maintains a table of accelerators, but some are emulated - ie for debug or CPU.
//This function creates a vector with only the GPU accelerators.
//It is called with C++11 call_once, which provided thread-safety.
void ihipInit ( )
{
2016-03-23 10:29:44 -05:00
2016-07-21 16:02:51 +05:30
# if COMPILE_HIP_ATP_MARKER
2016-03-23 01:17:53 -05:00
amdtInitializeActivityLogger ( ) ;
2016-03-25 09:24:08 -05:00
amdtScopedMarker ( " ihipInit " , " HIP " , NULL ) ;
2016-03-23 01:17:53 -05:00
# endif
2016-02-12 04:30:09 -06:00
/*
* Environment variables
*/
2016-02-17 06:59:18 -06:00
g_hip_visible_devices . push_back ( 0 ) ; /* Set the default value of visible devices */
2016-02-26 09:50:00 -06:00
READ_ENV_I ( release , HIP_PRINT_ENV , 0 , " Print HIP environment variables. " ) ;
2016-02-17 00:59:12 -06:00
//-- READ HIP_PRINT_ENV env first, since it has impact on later env var reading
2016-09-19 17:09:50 -05:00
// TODO: In HIP/hcc, this variable blocks after both kernel commmands and data transfer. Maybe should be bit-mask for each command type?
2016-02-12 04:30:09 -06:00
READ_ENV_I ( release , HIP_LAUNCH_BLOCKING , CUDA_LAUNCH_BLOCKING , " Make HIP APIs 'host-synchronous', so they block until any kernel launches or data copy commands complete. Alias: CUDA_LAUNCH_BLOCKING. " ) ;
2016-03-19 02:44:26 -05:00
READ_ENV_I ( release , HIP_DB , 0 , " Print various debug info. Bitmask, see hip_hcc.cpp for more information. " ) ;
2016-04-18 20:49:33 -05:00
if ( ( HIP_DB & ( 1 < < DB_API ) ) & & ( HIP_TRACE_API = = 0 ) ) {
2016-03-29 17:27:30 -05:00
// Set HIP_TRACE_API default before we read it, so it is printed correctly.
2016-03-06 23:50:52 -06:00
HIP_TRACE_API = 1 ;
}
2016-02-17 00:59:12 -06:00
READ_ENV_I ( release , HIP_TRACE_API , 0 , " Trace each HIP API call. Print function name and return code to stderr as program executes. " ) ;
2016-09-02 15:49:22 -05:00
READ_ENV_S ( release , HIP_TRACE_API_COLOR , 0 , " Color to use for HIP_API. None/Red/Green/Yellow/Blue/Magenta/Cyan/White " ) ;
2016-03-29 17:27:30 -05:00
READ_ENV_I ( release , HIP_ATP_MARKER , 0 , " Add HIP function begin/end to ATP file generated with CodeXL " ) ;
2016-02-16 07:39:04 -06:00
READ_ENV_I ( release , HIP_VISIBLE_DEVICES , CUDA_VISIBLE_DEVICES , " Only devices whose index is present in the secquence are visible to HIP applications and they are enumerated in the order of secquence " ) ;
2016-01-26 20:14:33 -06:00
2016-03-06 23:50:52 -06:00
2016-10-05 11:47:12 -05:00
READ_ENV_I ( release , HIP_BLOCKING_SYNC , 0 , " Use blocking synchronization for stream waits. This may increase latency but is friendlier to other processes. If 0, spin-wait. " ) ;
READ_ENV_I ( release , HIP_NUM_KERNELS_INFLIGHT , 128 , " Max number of inflight kernels per stream before active synchronization is forced. " ) ;
2016-07-26 17:09:27 -05:00
2016-03-29 17:27:30 -05:00
// Some flags have both compile-time and runtime flags - generate a warning if user enables the runtime flag but the compile-time flag is disabled.
2016-03-23 01:17:53 -05:00
if ( HIP_DB & & ! COMPILE_HIP_DB ) {
fprintf ( stderr , " warning: env var HIP_DB=0x%x but COMPILE_HIP_DB=0. (perhaps enable COMPILE_HIP_DB in src code before compiling?) " , HIP_DB ) ;
}
if ( HIP_TRACE_API & & ! COMPILE_HIP_TRACE_API ) {
fprintf ( stderr , " warning: env var HIP_TRACE_API=0x%x but COMPILE_HIP_TRACE_API=0. (perhaps enable COMPILE_HIP_DB in src code before compiling?) " , HIP_DB ) ;
}
2016-03-29 17:27:30 -05:00
if ( HIP_ATP_MARKER & & ! COMPILE_HIP_ATP_MARKER ) {
fprintf ( stderr , " warning: env var HIP_ATP_MARKER=0x%x but COMPILE_HIP_ATP_MARKER=0. (perhaps enable COMPILE_HIP_DB in src code before compiling?) " , HIP_ATP_MARKER ) ;
}
2016-09-02 15:49:22 -05:00
std : : transform ( HIP_TRACE_API_COLOR . begin ( ) , HIP_TRACE_API_COLOR . end ( ) , HIP_TRACE_API_COLOR . begin ( ) , : : tolower ) ;
if ( HIP_TRACE_API_COLOR = = " none " ) {
API_COLOR = " " ;
API_COLOR_END = " " ;
} else if ( HIP_TRACE_API_COLOR = = " red " ) {
API_COLOR = KRED ;
} else if ( HIP_TRACE_API_COLOR = = " green " ) {
API_COLOR = KGRN ;
} else if ( HIP_TRACE_API_COLOR = = " yellow " ) {
API_COLOR = KYEL ;
} else if ( HIP_TRACE_API_COLOR = = " blue " ) {
API_COLOR = KBLU ;
} else if ( HIP_TRACE_API_COLOR = = " magenta " ) {
API_COLOR = KMAG ;
} else if ( HIP_TRACE_API_COLOR = = " cyan " ) {
API_COLOR = KCYN ;
} else if ( HIP_TRACE_API_COLOR = = " white " ) {
API_COLOR = KWHT ;
} else {
fprintf ( stderr , " warning: env var HIP_TRACE_API_COLOR=%s must be None/Red/Green/Yellow/Blue/Magenta/Cyan/White " , HIP_TRACE_API_COLOR . c_str ( ) ) ;
} ;
2016-03-06 23:50:52 -06:00
2016-01-26 20:14:33 -06:00
/*
* Build a table of valid compute devices.
*/
auto accs = hc : : accelerator : : get_all ( ) ;
2016-03-24 04:57:30 -05:00
2016-02-12 04:30:09 -06:00
int deviceCnt = 0 ;
2016-01-26 20:14:33 -06:00
for ( int i = 0 ; i < accs . size ( ) ; i + + ) {
if ( ! accs [ i ] . get_is_emulated ( ) ) {
2016-02-12 04:30:09 -06:00
deviceCnt + + ;
}
} ;
2016-02-17 06:59:18 -06:00
// Make sure the hip visible devices are within the deviceCnt range
2016-02-16 10:00:05 -06:00
for ( int i = 0 ; i < g_hip_visible_devices . size ( ) ; i + + ) {
2016-02-17 06:59:18 -06:00
if ( g_hip_visible_devices [ i ] > = deviceCnt ) {
2016-02-16 10:00:05 -06:00
// Make sure any DeviceID after invalid DeviceID will be erased.
g_hip_visible_devices . resize ( i ) ;
break ;
}
}
2016-01-26 20:14:33 -06:00
2016-08-05 15:05:57 +05:30
hsa_status_t err = hsa_iterate_agents ( findCpuAgent , & g_cpu_agent ) ;
if ( err ! = HSA_STATUS_INFO_BREAK ) {
// didn't find a CPU.
throw ihipException ( hipErrorRuntimeOther ) ;
}
2016-08-08 11:55:57 -05:00
g_deviceArray = new ihipDevice_t * [ deviceCnt ] ;
2016-02-12 04:30:09 -06:00
g_deviceCnt = 0 ;
2016-01-26 20:14:33 -06:00
for ( int i = 0 ; i < accs . size ( ) ; i + + ) {
2016-02-17 06:59:18 -06:00
// check if the device id is included in the HIP_VISIBLE_DEVICES env variable
2016-01-26 20:14:33 -06:00
if ( ! accs [ i ] . get_is_emulated ( ) ) {
2016-02-27 14:48:00 -06:00
if ( std : : find ( g_hip_visible_devices . begin ( ) , g_hip_visible_devices . end ( ) , ( i - 1 ) ) = = g_hip_visible_devices . end ( ) & & g_visible_device )
2016-02-17 09:24:39 -06:00
{
2016-02-27 14:48:00 -06:00
//If device is not in visible devices list, ignore
2016-02-17 09:24:39 -06:00
continue ;
2016-02-17 06:59:18 -06:00
}
2016-08-08 11:55:57 -05:00
g_deviceArray [ g_deviceCnt ] = new ihipDevice_t ( g_deviceCnt , deviceCnt , accs [ i ] ) ;
2016-02-17 09:24:39 -06:00
g_deviceCnt + + ;
2016-01-26 20:14:33 -06:00
}
}
2016-02-17 06:59:18 -06:00
2016-02-27 14:14:08 -06:00
// If HIP_VISIBLE_DEVICES is not set, make sure all devices are initialized
2016-03-23 10:29:44 -05:00
if ( ! g_visible_device ) {
2016-02-27 14:14:08 -06:00
assert ( deviceCnt = = g_deviceCnt ) ;
2016-03-23 10:29:44 -05:00
}
2016-01-26 20:14:33 -06:00
2016-08-09 15:37:19 -05:00
2016-03-23 10:29:44 -05:00
tprintf ( DB_SYNC , " pid=%u %-30s \n " , getpid ( ) , " <ihipInit> " ) ;
2016-01-26 20:14:33 -06:00
}
2016-03-14 14:40:41 -05:00
2016-03-23 11:13:02 -05:00
//---
// Get the stream to use for a command submission.
//
// If stream==NULL synchronize appropriately with other streams and return the default av for the device.
// If stream is valid, return the AV to use.
2016-03-23 12:59:52 -05:00
hipStream_t ihipSyncAndResolveStream ( hipStream_t stream )
2016-03-23 11:13:02 -05:00
{
if ( stream = = hipStreamNull ) {
2016-08-07 21:46:51 -05:00
ihipCtx_t * device = ihipGetTlsDefaultCtx ( ) ;
2016-03-23 11:13:02 -05:00
# ifndef HIP_API_PER_THREAD_DEFAULT_STREAM
2016-03-26 10:46:20 -05:00
device - > locked_syncDefaultStream ( false ) ;
2016-03-23 11:13:02 -05:00
# endif
2016-08-08 14:54:38 -05:00
return device - > _defaultStream ;
2016-03-23 11:13:02 -05:00
} else {
2016-09-02 15:49:22 -05:00
// ALl streams have to wait for legacy default stream to be empty:
2016-03-23 11:13:02 -05:00
if ( ! ( stream - > _flags & hipStreamNonBlocking ) ) {
tprintf ( DB_SYNC , " stream %p wait default stream \n " , stream ) ;
2016-08-08 14:54:38 -05:00
stream - > getCtx ( ) - > _defaultStream - > locked_wait ( ) ;
2016-03-23 11:13:02 -05:00
}
2016-03-25 09:24:08 -05:00
2016-03-23 11:13:02 -05:00
return stream ;
}
}
2016-09-02 12:47:25 -05:00
void ihipPrintKernelLaunch ( const char * kernelName , const grid_launch_parm * lp , const hipStream_t stream )
2016-08-29 18:37:57 -05:00
{
2016-09-04 07:00:59 -05:00
if ( HIP_ATP_MARKER | | ( COMPILE_HIP_DB & & HIP_TRACE_API ) ) {
std : : stringstream os ;
os < < " <<hip-api: hipLaunchKernel ' " < < kernelName < < " ' "
< < " gridDim: " < < lp - > grid_dim
< < " groupDim: " < < lp - > group_dim
< < " sharedMem:+ " < < lp - > dynamic_group_mem_bytes
< < " " < < * stream ;
if ( COMPILE_HIP_DB & & HIP_TRACE_API ) {
std : : cerr < < API_COLOR < < os . str ( ) < < API_COLOR_END < < std : : endl ;
}
SCOPED_MARKER ( os . str ( ) . c_str ( ) , " HIP " , NULL ) ;
}
2016-08-29 18:37:57 -05:00
}
2016-01-26 20:14:33 -06:00
// Called just before a kernel is launched from hipLaunchKernel.
// Allows runtime to track some information about the stream.
2016-09-02 15:49:22 -05:00
hipStream_t ihipPreLaunchKernel ( hipStream_t stream , dim3 grid , dim3 block , grid_launch_parm * lp , const char * kernelNameStr )
2016-01-26 20:14:33 -06:00
{
2016-08-29 18:37:57 -05:00
HIP_INIT ( ) ;
2016-01-26 20:14:33 -06:00
stream = ihipSyncAndResolveStream ( stream ) ;
2016-06-20 23:46:42 -05:00
lp - > grid_dim . x = grid . x ;
lp - > grid_dim . y = grid . y ;
lp - > grid_dim . z = grid . z ;
lp - > group_dim . x = block . x ;
lp - > group_dim . y = block . y ;
lp - > group_dim . z = block . z ;
lp - > barrier_bit = barrier_bit_queue_default ;
lp - > launch_fence = - 1 ;
2016-08-30 17:29:50 -05:00
auto crit = stream - > lockopen_preKernelCommand ( ) ;
lp - > av = & ( crit - > _av ) ;
2016-09-26 16:32:35 -05:00
lp - > cf = nullptr ;
2016-09-04 07:00:59 -05:00
ihipPrintKernelLaunch ( kernelNameStr , lp , stream ) ;
2016-09-02 15:49:22 -05:00
2016-06-18 11:28:20 -05:00
return ( stream ) ;
}
2016-08-30 17:29:50 -05:00
2016-09-04 07:00:59 -05:00
hipStream_t ihipPreLaunchKernel ( hipStream_t stream , size_t grid , dim3 block , grid_launch_parm * lp , const char * kernelNameStr )
2016-06-18 11:28:20 -05:00
{
2016-08-30 17:29:50 -05:00
HIP_INIT ( ) ;
2016-06-18 11:28:20 -05:00
stream = ihipSyncAndResolveStream ( stream ) ;
2016-06-20 23:46:42 -05:00
lp - > grid_dim . x = grid ;
lp - > grid_dim . y = 1 ;
lp - > grid_dim . z = 1 ;
lp - > group_dim . x = block . x ;
lp - > group_dim . y = block . y ;
lp - > group_dim . z = block . z ;
lp - > barrier_bit = barrier_bit_queue_default ;
lp - > launch_fence = - 1 ;
2016-08-30 17:29:50 -05:00
auto crit = stream - > lockopen_preKernelCommand ( ) ;
lp - > av = & ( crit - > _av ) ;
2016-09-26 16:32:35 -05:00
lp - > cf = nullptr ;
2016-09-04 07:00:59 -05:00
ihipPrintKernelLaunch ( kernelNameStr , lp , stream ) ;
2016-06-18 11:28:20 -05:00
return ( stream ) ;
}
2016-01-26 20:14:33 -06:00
2016-08-30 17:29:50 -05:00
2016-09-04 07:00:59 -05:00
hipStream_t ihipPreLaunchKernel ( hipStream_t stream , dim3 grid , size_t block , grid_launch_parm * lp , const char * kernelNameStr )
2016-06-18 11:28:20 -05:00
{
2016-08-30 17:29:50 -05:00
HIP_INIT ( ) ;
2016-06-18 11:28:20 -05:00
stream = ihipSyncAndResolveStream ( stream ) ;
2016-06-20 23:46:42 -05:00
lp - > grid_dim . x = grid . x ;
lp - > grid_dim . y = grid . y ;
lp - > grid_dim . z = grid . z ;
lp - > group_dim . x = block ;
lp - > group_dim . y = 1 ;
lp - > group_dim . z = 1 ;
lp - > barrier_bit = barrier_bit_queue_default ;
lp - > launch_fence = - 1 ;
2016-08-30 17:29:50 -05:00
auto crit = stream - > lockopen_preKernelCommand ( ) ;
lp - > av = & ( crit - > _av ) ;
2016-09-26 16:32:35 -05:00
lp - > cf = nullptr ;
2016-09-04 07:00:59 -05:00
ihipPrintKernelLaunch ( kernelNameStr , lp , stream ) ;
2016-06-18 11:28:20 -05:00
return ( stream ) ;
}
2016-03-28 21:41:47 -05:00
2016-08-30 17:29:50 -05:00
2016-09-04 07:00:59 -05:00
hipStream_t ihipPreLaunchKernel ( hipStream_t stream , size_t grid , size_t block , grid_launch_parm * lp , const char * kernelNameStr )
2016-06-18 11:28:20 -05:00
{
2016-08-30 17:29:50 -05:00
HIP_INIT ( ) ;
2016-06-18 11:28:20 -05:00
stream = ihipSyncAndResolveStream ( stream ) ;
2016-06-20 23:46:42 -05:00
lp - > grid_dim . x = grid ;
lp - > grid_dim . y = 1 ;
lp - > grid_dim . z = 1 ;
lp - > group_dim . x = block ;
lp - > group_dim . y = 1 ;
lp - > group_dim . z = 1 ;
lp - > barrier_bit = barrier_bit_queue_default ;
lp - > launch_fence = - 1 ;
2016-08-30 17:29:50 -05:00
auto crit = stream - > lockopen_preKernelCommand ( ) ;
lp - > av = & ( crit - > _av ) ;
2016-09-26 16:32:35 -05:00
lp - > cf = nullptr ;
2016-09-08 14:52:51 -05:00
2016-09-04 07:00:59 -05:00
ihipPrintKernelLaunch ( kernelNameStr , lp , stream ) ;
2016-02-20 11:01:43 -06:00
return ( stream ) ;
2016-01-26 20:14:33 -06:00
}
2016-02-20 11:01:43 -06:00
//---
//Called after kernel finishes execution.
2016-08-30 17:29:50 -05:00
//This releases the lock on the stream.
2016-04-15 11:21:45 -05:00
void ihipPostLaunchKernel ( hipStream_t stream , grid_launch_parm & lp )
2016-01-26 20:14:33 -06:00
{
2016-09-26 16:32:35 -05:00
stream - > lockclose_postKernelCommand ( lp . av ) ;
2016-01-26 20:14:33 -06:00
}
//=================================================================================================
// HIP API Implementation
//
// Implementor notes:
2016-07-22 15:46:55 +05:30
// - All functions should call HIP_INIT_API as first action:
// HIP_INIT_API(<function_arguments>);
2016-01-26 20:14:33 -06:00
//
// - All functions should use ihipLogStatus to return error code (not return error directly).
//=================================================================================================
//
//---
2016-03-24 04:57:30 -05:00
//-------------------------------------------------------------------------------------------------
// Map a hipError_t value to a string literal carrying the enumerator's name.
// Values without an explicit case yield the same string as hipErrorUnknown.
// The returned pointer refers to a string literal and must not be freed.
const char *ihipErrorString(hipError_t hip_error)
{
    const char *name = " hipErrorUnknown ";

    switch (hip_error) {
        case hipSuccess:                          name = " hipSuccess "; break;
        case hipErrorOutOfMemory:                 name = " hipErrorOutOfMemory "; break;
        case hipErrorNotInitialized:              name = " hipErrorNotInitialized "; break;
        case hipErrorDeinitialized:               name = " hipErrorDeinitialized "; break;
        case hipErrorProfilerDisabled:            name = " hipErrorProfilerDisabled "; break;
        case hipErrorProfilerNotInitialized:      name = " hipErrorProfilerNotInitialized "; break;
        case hipErrorProfilerAlreadyStarted:      name = " hipErrorProfilerAlreadyStarted "; break;
        case hipErrorProfilerAlreadyStopped:      name = " hipErrorProfilerAlreadyStopped "; break;
        case hipErrorInvalidImage:                name = " hipErrorInvalidImage "; break;
        case hipErrorInvalidContext:              name = " hipErrorInvalidContext "; break;
        case hipErrorContextAlreadyCurrent:       name = " hipErrorContextAlreadyCurrent "; break;
        case hipErrorMapFailed:                   name = " hipErrorMapFailed "; break;
        case hipErrorUnmapFailed:                 name = " hipErrorUnmapFailed "; break;
        case hipErrorArrayIsMapped:               name = " hipErrorArrayIsMapped "; break;
        case hipErrorAlreadyMapped:               name = " hipErrorAlreadyMapped "; break;
        case hipErrorNoBinaryForGpu:              name = " hipErrorNoBinaryForGpu "; break;
        case hipErrorAlreadyAcquired:             name = " hipErrorAlreadyAcquired "; break;
        case hipErrorNotMapped:                   name = " hipErrorNotMapped "; break;
        case hipErrorNotMappedAsArray:            name = " hipErrorNotMappedAsArray "; break;
        case hipErrorNotMappedAsPointer:          name = " hipErrorNotMappedAsPointer "; break;
        case hipErrorECCNotCorrectable:           name = " hipErrorECCNotCorrectable "; break;
        case hipErrorUnsupportedLimit:            name = " hipErrorUnsupportedLimit "; break;
        case hipErrorContextAlreadyInUse:         name = " hipErrorContextAlreadyInUse "; break;
        case hipErrorPeerAccessUnsupported:       name = " hipErrorPeerAccessUnsupported "; break;
        case hipErrorInvalidKernelFile:           name = " hipErrorInvalidKernelFile "; break;
        case hipErrorInvalidGraphicsContext:      name = " hipErrorInvalidGraphicsContext "; break;
        case hipErrorInvalidSource:               name = " hipErrorInvalidSource "; break;
        case hipErrorFileNotFound:                name = " hipErrorFileNotFound "; break;
        case hipErrorSharedObjectSymbolNotFound:  name = " hipErrorSharedObjectSymbolNotFound "; break;
        case hipErrorSharedObjectInitFailed:      name = " hipErrorSharedObjectInitFailed "; break;
        case hipErrorOperatingSystem:             name = " hipErrorOperatingSystem "; break;
        case hipErrorInvalidHandle:               name = " hipErrorInvalidHandle "; break;
        case hipErrorNotFound:                    name = " hipErrorNotFound "; break;
        case hipErrorIllegalAddress:              name = " hipErrorIllegalAddress "; break;
        case hipErrorMissingConfiguration:        name = " hipErrorMissingConfiguration "; break;
        case hipErrorMemoryAllocation:            name = " hipErrorMemoryAllocation "; break;
        case hipErrorInitializationError:         name = " hipErrorInitializationError "; break;
        case hipErrorLaunchFailure:               name = " hipErrorLaunchFailure "; break;
        case hipErrorPriorLaunchFailure:          name = " hipErrorPriorLaunchFailure "; break;
        case hipErrorLaunchTimeOut:               name = " hipErrorLaunchTimeOut "; break;
        case hipErrorLaunchOutOfResources:        name = " hipErrorLaunchOutOfResources "; break;
        case hipErrorInvalidDeviceFunction:       name = " hipErrorInvalidDeviceFunction "; break;
        case hipErrorInvalidConfiguration:        name = " hipErrorInvalidConfiguration "; break;
        case hipErrorInvalidDevice:               name = " hipErrorInvalidDevice "; break;
        case hipErrorInvalidValue:                name = " hipErrorInvalidValue "; break;
        case hipErrorInvalidDevicePointer:        name = " hipErrorInvalidDevicePointer "; break;
        case hipErrorInvalidMemcpyDirection:      name = " hipErrorInvalidMemcpyDirection "; break;
        case hipErrorUnknown:                     name = " hipErrorUnknown "; break;
        case hipErrorInvalidResourceHandle:       name = " hipErrorInvalidResourceHandle "; break;
        case hipErrorNotReady:                    name = " hipErrorNotReady "; break;
        case hipErrorNoDevice:                    name = " hipErrorNoDevice "; break;
        case hipErrorPeerAccessAlreadyEnabled:    name = " hipErrorPeerAccessAlreadyEnabled "; break;
        case hipErrorPeerAccessNotEnabled:        name = " hipErrorPeerAccessNotEnabled "; break;
        case hipErrorRuntimeMemory:               name = " hipErrorRuntimeMemory "; break;
        case hipErrorRuntimeOther:                name = " hipErrorRuntimeOther "; break;
        case hipErrorHostMemoryAlreadyRegistered: name = " hipErrorHostMemoryAlreadyRegistered "; break;
        case hipErrorHostMemoryNotRegistered:     name = " hipErrorHostMemoryNotRegistered "; break;
        case hipErrorTbd:                         name = " hipErrorTbd "; break;
        default:                                  break;  // keeps the hipErrorUnknown string
    }

    return name;
}
// Update the timestamp of event 'e' if its marker has completed.
// No-op when the event is already in hipEventStatusRecorded state, when the marker
// has no native signal handle, or when the signal value is still non-zero.
void ihipSetTs(hipEvent_t e)
{
    ihipEvent_t *event = e;

    // already recorded, done:
    if (event->_state == hipEventStatusRecorded) {
        return;
    }

    // TODO - use completion-future functions to obtain ticks and timestamps:
    hsa_signal_t *signal = static_cast<hsa_signal_t *>(event->_marker.get_native_handle());

    // A signal value of 0 is treated as "marker finished".
    if (signal && (hsa_signal_load_acquire(*signal) == 0)) {
        event->_timestamp = event->_marker.get_end_tick();
        event->_state = hipEventStatusRecorded;
    }
}
2016-09-22 10:39:17 -05:00
// Returns true if thisCtx can see the memory allocated on dstCtx and srcCtx.
// The peer-list for a context controls which contexts have access to the memory allocated on that context.
// So we check dstCtx's and srcCtx's peerList to see if the booth include thisCtx.
bool ihipStream_t : : canSeePeerMemory ( const ihipCtx_t * thisCtx , ihipCtx_t * dstCtx , ihipCtx_t * srcCtx )
{
tprintf ( DB_COPY1 , " Checking if direct copy can be used. thisCtx:%s; dstCtx:%s ; srcCtx:%s \n " ,
thisCtx - > toString ( ) . c_str ( ) , dstCtx - > toString ( ) . c_str ( ) , srcCtx - > toString ( ) . c_str ( ) ) ;
// Use blocks to control scope of critical sections.
{
LockedAccessor_CtxCrit_t ctxCrit ( dstCtx - > criticalData ( ) ) ;
2016-09-27 14:53:13 -05:00
tprintf ( DB_SYNC , " dstCrit lock succeeded \n " ) ;
2016-09-22 10:39:17 -05:00
if ( ! ctxCrit - > isPeer ( thisCtx ) ) {
return false ;
} ;
}
{
LockedAccessor_CtxCrit_t ctxCrit ( srcCtx - > criticalData ( ) ) ;
2016-09-27 14:53:13 -05:00
tprintf ( DB_SYNC , " srcCrit lock succeeded \n " ) ;
2016-09-22 10:39:17 -05:00
if ( ! ctxCrit - > isPeer ( thisCtx ) ) {
return false ;
} ;
}
return true ;
} ;
2016-03-24 04:57:30 -05:00
// Resolve hipMemcpyDefault to a known type.
2016-09-22 10:39:17 -05:00
// TODO - review why is this so complicated, does this need srcTracked and dstTracked?
2016-04-16 17:10:13 -05:00
unsigned ihipStream_t : : resolveMemcpyDirection ( bool srcTracked , bool dstTracked , bool srcInDeviceMem , bool dstInDeviceMem )
2016-03-24 04:57:30 -05:00
{
hipMemcpyKind kind = hipMemcpyDefault ;
2016-04-16 17:10:13 -05:00
if ( ! srcTracked & & ! dstTracked )
{
2016-03-24 04:57:30 -05:00
kind = hipMemcpyHostToHost ;
2016-04-16 17:10:13 -05:00
}
if ( ! srcTracked & & dstTracked )
{
if ( dstInDeviceMem ) { kind = hipMemcpyHostToDevice ; }
else { kind = hipMemcpyHostToHost ; }
}
if ( srcTracked & & ! dstTracked ) {
if ( srcInDeviceMem ) { kind = hipMemcpyDeviceToHost ; }
else { kind = hipMemcpyHostToHost ; }
}
if ( srcTracked & & dstTracked ) {
if ( srcInDeviceMem & & dstInDeviceMem ) { kind = hipMemcpyDeviceToDevice ; }
if ( srcInDeviceMem & & ! dstInDeviceMem ) { kind = hipMemcpyDeviceToHost ; }
if ( ! srcInDeviceMem & & ! dstInDeviceMem ) { kind = hipMemcpyHostToHost ; }
if ( ! srcInDeviceMem & & dstInDeviceMem ) { kind = hipMemcpyHostToDevice ; }
2016-03-24 04:57:30 -05:00
}
assert ( kind ! = hipMemcpyDefault ) ;
return kind ;
}
2016-09-22 10:39:17 -05:00
// TODO - remove kind parm from here or use it below?
2016-09-27 15:27:21 -05:00
void ihipStream_t : : locked_copySync ( void * dst , const void * src , size_t sizeBytes , unsigned kind , bool resolveOn )
2016-03-24 04:57:30 -05:00
{
2016-08-08 11:55:57 -05:00
ihipCtx_t * ctx = this - > getCtx ( ) ;
const ihipDevice_t * device = ctx - > getDevice ( ) ;
2016-03-24 04:57:30 -05:00
if ( device = = NULL ) {
throw ihipException ( hipErrorInvalidDevice ) ;
}
2016-09-22 10:39:17 -05:00
hc : : accelerator acc ;
hc : : AmPointerInfo dstPtrInfo ( NULL , NULL , 0 , acc , 0 , 0 ) ;
hc : : AmPointerInfo srcPtrInfo ( NULL , NULL , 0 , acc , 0 , 0 ) ;
bool dstTracked = ( hc : : am_memtracker_getinfo ( & dstPtrInfo , dst ) = = AM_SUCCESS ) ;
bool srcTracked = ( hc : : am_memtracker_getinfo ( & srcPtrInfo , src ) = = AM_SUCCESS ) ;
if ( kind = = hipMemcpyDefault ) {
kind = resolveMemcpyDirection ( srcTracked , dstTracked , srcPtrInfo . _isInDeviceMem , dstPtrInfo . _isInDeviceMem ) ;
}
hc : : hcCommandKind hcCopyDir ;
switch ( kind ) {
case hipMemcpyHostToHost : hcCopyDir = hc : : hcMemcpyHostToHost ; break ;
case hipMemcpyHostToDevice : hcCopyDir = hc : : hcMemcpyHostToDevice ; break ;
case hipMemcpyDeviceToHost : hcCopyDir = hc : : hcMemcpyDeviceToHost ; break ;
case hipMemcpyDeviceToDevice : hcCopyDir = hc : : hcMemcpyDeviceToDevice ; break ;
} ;
2016-09-27 15:27:21 -05:00
// If this is P2P access, we need to check to see if the copy agent (specified by the stream where the copy is enqueued)
2016-09-22 10:39:17 -05:00
// has peer access enabled to both the source and dest. If this is true, then the copy agent can see both pointers
// and we can perform the access with the copy engine from the current stream. If not true, then we will copy through the host. (forceHostCopyEngine=true).
bool forceHostCopyEngine = false ;
if ( hcCopyDir = = hc : : hcMemcpyDeviceToDevice ) {
if ( ! canSeePeerMemory ( ctx , ihipGetPrimaryCtx ( dstPtrInfo . _appId ) , ihipGetPrimaryCtx ( srcPtrInfo . _appId ) ) ) {
forceHostCopyEngine = true ;
tprintf ( DB_COPY1 , " Forcing use of host copy engine. \n " ) ;
} else {
tprintf ( DB_COPY1 , " Will use SDMA engine on streamDevice=%s. \n " , ctx - > toString ( ) . c_str ( ) ) ;
}
} ;
2016-09-27 15:27:21 -05:00
{
LockedAccessor_StreamCrit_t crit ( _criticalData ) ;
2016-10-05 12:18:16 -05:00
# if DISABLE_COPY_EXT
# warning ("Disabled copy_ext path, P2P host staging copies will not work")
// Note - peer-to-peer copies which require host staging will not work in this path.
crit - > _av . copy ( src , dst , sizeBytes ) ;
# else
2016-09-27 15:27:21 -05:00
crit - > _av . copy_ext ( src , dst , sizeBytes , hcCopyDir , srcPtrInfo , dstPtrInfo , forceHostCopyEngine ) ;
2016-10-05 12:18:16 -05:00
# endif
2016-09-27 15:27:21 -05:00
}
2016-03-24 04:57:30 -05:00
}
2016-09-27 15:27:21 -05:00
void ihipStream_t : : locked_copyAsync ( void * dst , const void * src , size_t sizeBytes , unsigned kind )
2016-03-28 09:46:40 -05:00
{
2016-08-08 11:55:57 -05:00
const ihipCtx_t * ctx = this - > getCtx ( ) ;
2016-03-24 04:57:30 -05:00
2016-08-08 11:55:57 -05:00
if ( ( ctx = = nullptr ) | | ( ctx - > getDevice ( ) = = nullptr ) ) {
2016-03-24 04:57:30 -05:00
throw ihipException ( hipErrorInvalidDevice ) ;
}
if ( kind = = hipMemcpyHostToHost ) {
tprintf ( DB_COPY2 , " Asyc: H2H with memcpy " ) ;
// TODO - consider if we want to perhaps use the GPU SDMA engines anyway, to avoid the host-side sync here and keep everything flowing on the GPU.
/* As this is a CPU op, we need to wait until all
the commands in current stream are finished.
*/
2016-09-27 15:27:21 -05:00
LockedAccessor_StreamCrit_t crit ( _criticalData ) ;
2016-03-28 09:46:40 -05:00
this - > wait ( crit ) ;
2016-03-24 04:57:30 -05:00
memcpy ( dst , src , sizeBytes ) ;
} else {
hc : : accelerator acc ;
hc : : AmPointerInfo dstPtrInfo ( NULL , NULL , 0 , acc , 0 , 0 ) ;
hc : : AmPointerInfo srcPtrInfo ( NULL , NULL , 0 , acc , 0 , 0 ) ;
bool dstTracked = ( hc : : am_memtracker_getinfo ( & dstPtrInfo , dst ) = = AM_SUCCESS ) ;
bool srcTracked = ( hc : : am_memtracker_getinfo ( & srcPtrInfo , src ) = = AM_SUCCESS ) ;
2016-09-22 10:39:17 -05:00
bool copyEngineCanSeeSrcAndDest = true ;
if ( kind = = hipMemcpyDeviceToDevice ) {
copyEngineCanSeeSrcAndDest = canSeePeerMemory ( ctx , ihipGetPrimaryCtx ( dstPtrInfo . _appId ) , ihipGetPrimaryCtx ( srcPtrInfo . _appId ) ) ;
}
2016-03-25 09:24:08 -05:00
// "tracked" really indicates if the pointer's virtual address is available in the GPU address space.
2016-03-24 04:57:30 -05:00
// If both pointers are not tracked, we need to fall back to a sync copy.
2016-09-27 15:27:21 -05:00
if ( dstTracked & & srcTracked & & copyEngineCanSeeSrcAndDest ) {
LockedAccessor_StreamCrit_t crit ( _criticalData ) ;
2016-03-24 04:57:30 -05:00
2016-09-27 15:27:21 -05:00
// Perform asynchronous copy:
2016-09-19 11:45:07 -05:00
try {
crit - > _av . copy_async ( src , dst , sizeBytes ) ;
} catch ( Kalmar : : runtime_exception ) {
throw ihipException ( hipErrorRuntimeOther ) ;
} ;
2016-03-24 04:57:30 -05:00
2016-09-19 11:45:07 -05:00
if ( HIP_LAUNCH_BLOCKING ) {
tprintf ( DB_SYNC , " LAUNCH_BLOCKING for completion of hipMemcpyAsync(%zu) \n " , sizeBytes ) ;
this - > wait ( crit ) ;
}
2016-09-27 15:27:21 -05:00
2016-03-24 04:57:30 -05:00
} else {
2016-09-27 15:27:21 -05:00
locked_copySync ( dst , src , sizeBytes , kind ) ;
2016-03-24 04:57:30 -05:00
}
}
}
//-------------------------------------------------------------------------------------------------
//-------------------------------------------------------------------------------------------------
// HCC-specific accessor functions:
//---
// Return the hc::accelerator associated with a HIP device.
//
// @param deviceId  HIP device index to look up.
// @param acc       output; receives a copy of the device's accelerator on success.
// @return hipSuccess on success,
//         hipErrorInvalidDevice if deviceId does not name a device,
//         hipErrorInvalidValue if acc is NULL.
hipError_t hipHccGetAccelerator(int deviceId, hc::accelerator *acc)
{
    HIP_INIT_API(deviceId, acc);

    const ihipDevice_t *device = ihipGetDevice(deviceId);
    hipError_t err;

    // The device check comes first so existing error results are unchanged.
    if (device == NULL) {
        err = hipErrorInvalidDevice;
    } else if (acc == NULL) {
        // Nowhere to store the result; do not dereference a null output pointer.
        err = hipErrorInvalidValue;
    } else {
        *acc = device->_acc;
        err = hipSuccess;
    }

    return ihipLogStatus(err);
}
//---
// Return the hc::accelerator_view used by a HIP stream.
//
// @param stream  stream to query; hipStreamNull selects the default stream of
//                the calling thread's default context.
// @param av      output; receives the pointer returned by the stream's locked_getAv().
// @return hipSuccess on success,
//         hipErrorInvalidValue if av is NULL,
//         hipErrorInvalidDevice if the null stream is requested but no default
//         context is available.
hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **av)
{
    HIP_INIT_API(stream, av);

    hipError_t err = hipSuccess;

    if (av == NULL) {
        // Nowhere to store the result; do not dereference a null output pointer.
        err = hipErrorInvalidValue;
    } else if (stream != hipStreamNull) {
        *av = stream->locked_getAv();
    } else {
        ihipCtx_t *ctx = ihipGetTlsDefaultCtx();
        if (ctx == NULL) {
            // NOTE(review): assumes ihipGetTlsDefaultCtx() can return NULL when no
            // context exists for this thread - confirm against its definition.
            err = hipErrorInvalidDevice;
        } else {
            *av = ctx->_defaultStream->locked_getAv();
        }
    }

    return ihipLogStatus(err);
}
2016-02-16 01:59:13 -06:00
2016-03-19 02:44:26 -05:00
//// TODO - add identifier numbers for streams and devices to help with debugging.
2016-09-22 10:39:17 -05:00
//TODO - add a context sequence number for debug. Print operator<< ctx:0.1 (device.ctx)