Refactor for stream->_av.
- move _av into stream critical section. ( HCC accelerator_view is not
thread-safe but HIP steram is. )
- Refactored many places in code that need to acquire critical section.
some were previously thread races, ie enqueueing marker.
-remove support for GRID_LAUNCH_VERSION < 20
-Enable USE_AV_COPY based on HCC work-week.
- Review hipModule docs, some calrity/editing.
Change-Id: I3ce7c25ece048c3504f55ecd4683e506bb1fc8b6
[ROCm/hip commit: e76a272d48]
This commit is contained in:
@@ -32,7 +32,7 @@ THE SOFTWARE.
|
||||
// #define USE_MEMCPYTOSYMBOL
|
||||
//
|
||||
//Use the new HCC accelerator_view::copy instead of am_copy
|
||||
#define USE_AV_COPY 0
|
||||
#define USE_AV_COPY (__hcc_workweek__ >= 16351)
|
||||
|
||||
// Compile peer-to-peer support.
|
||||
// >= 2 : use HCC hc:accelerator::get_is_peer
|
||||
@@ -353,18 +353,36 @@ struct LockedBase {
|
||||
};
|
||||
|
||||
|
||||
class ihipModule_t{
|
||||
public:
|
||||
hsa_executable_t executable;
|
||||
hsa_code_object_t object;
|
||||
std::string fileName;
|
||||
void *ptr;
|
||||
size_t size;
|
||||
};
|
||||
|
||||
|
||||
class ihipFunction_t{
|
||||
public:
|
||||
hsa_executable_symbol_t kernel_symbol;
|
||||
uint64_t kernel;
|
||||
};
|
||||
|
||||
|
||||
template <typename MUTEX_TYPE>
|
||||
class ihipStreamCriticalBase_t : public LockedBase<MUTEX_TYPE>
|
||||
{
|
||||
public:
|
||||
ihipStreamCriticalBase_t() :
|
||||
ihipStreamCriticalBase_t(hc::accelerator_view av) :
|
||||
_last_command_type(ihipCommandCopyH2H),
|
||||
_last_copy_signal(NULL),
|
||||
_signalCursor(0),
|
||||
_oldest_live_sig_id(1),
|
||||
_streamSigId(0),
|
||||
_kernelCnt(0),
|
||||
_signalCnt(0)
|
||||
_signalCnt(0),
|
||||
_av(av)
|
||||
{
|
||||
_signalPool.resize(HIP_STREAM_SIGNALS > 0 ? HIP_STREAM_SIGNALS : 1);
|
||||
};
|
||||
@@ -395,27 +413,14 @@ public:
|
||||
// 2 are required if a barrier packet is inserted.
|
||||
uint32_t _kernelCnt; // Count of inflight kernels in this stream. Reset at ::wait().
|
||||
SIGSEQNUM _streamSigId; // Monotonically increasing unique signal id.
|
||||
|
||||
hc::accelerator_view _av;
|
||||
};
|
||||
|
||||
|
||||
typedef ihipStreamCriticalBase_t<StreamMutex> ihipStreamCritical_t;
|
||||
typedef LockedAccessor<ihipStreamCritical_t> LockedAccessor_StreamCrit_t;
|
||||
|
||||
class ihipModule_t{
|
||||
public:
|
||||
hsa_executable_t executable;
|
||||
hsa_code_object_t object;
|
||||
std::string fileName;
|
||||
void *ptr;
|
||||
size_t size;
|
||||
};
|
||||
|
||||
|
||||
class ihipFunction_t{
|
||||
public:
|
||||
hsa_executable_symbol_t kernel_symbol;
|
||||
uint64_t kernel;
|
||||
};
|
||||
|
||||
// Internal stream structure.
|
||||
class ihipStream_t {
|
||||
@@ -431,16 +436,23 @@ typedef uint64_t SeqNum_t ;
|
||||
|
||||
void copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind);
|
||||
|
||||
int preCopyCommand(LockedAccessor_StreamCrit_t &crit, ihipSignal_t *lastCopy, hsa_signal_t *waitSignal, ihipCommand_t copyType);
|
||||
|
||||
//---
|
||||
// Thread-safe accessors - these acquire / release mutex:
|
||||
bool lockopen_preKernelCommand();
|
||||
// Member functions that begin with locked_ are thread-safe accessors - these acquire / release the critical mutex.
|
||||
LockedAccessor_StreamCrit_t lockopen_preKernelCommand();
|
||||
void lockclose_postKernelCommand(hc::completion_future &kernel_future);
|
||||
|
||||
int preCopyCommand(LockedAccessor_StreamCrit_t &crit, ihipSignal_t *lastCopy, hsa_signal_t *waitSignal, ihipCommand_t copyType);
|
||||
|
||||
void locked_reclaimSignals(SIGSEQNUM sigNum);
|
||||
void locked_wait(bool assertQueueEmpty=false);
|
||||
SIGSEQNUM locked_lastCopySeqId() {LockedAccessor_StreamCrit_t crit(_criticalData); return lastCopySeqId(crit); };
|
||||
|
||||
hc::accelerator_view* locked_getAv() { LockedAccessor_StreamCrit_t crit(_criticalData); return &(crit->_av); };
|
||||
|
||||
void locked_waitEvent(hipEvent_t event);
|
||||
void locked_recordEvent(hipEvent_t event);
|
||||
|
||||
//---
|
||||
|
||||
// Use this if we already have the stream critical data mutex:
|
||||
void wait(LockedAccessor_StreamCrit_t &crit, bool assertQueueEmpty=false);
|
||||
@@ -462,7 +474,6 @@ public:
|
||||
//---
|
||||
//Public member vars - these are set at initialization and never change:
|
||||
SeqNum_t _id; // monotonic sequence ID
|
||||
hc::accelerator_view _av;
|
||||
unsigned _flags;
|
||||
|
||||
|
||||
|
||||
@@ -50,9 +50,8 @@ THE SOFTWARE.
|
||||
|
||||
#if defined (GRID_LAUNCH_VERSION) and (GRID_LAUNCH_VERSION >= 20)
|
||||
// Use field names for grid_launch 2.0 structure, if HCC supports GL 2.0.
|
||||
#define USE_GRID_LAUNCH_20 1
|
||||
#else
|
||||
#define USE_GRID_LAUNCH_20 0
|
||||
#error (HCC must support GRID_LAUNCH_20)
|
||||
#endif
|
||||
|
||||
#define HIP_LAUNCH_PARAM_BUFFER_POINTER ((void*) 0x01)
|
||||
@@ -633,7 +632,6 @@ extern void ihipPostLaunchKernel(hipStream_t stream, grid_launch_parm &lp);
|
||||
#define KNRM "\x1B[0m"
|
||||
#define KGRN "\x1B[32m"
|
||||
|
||||
#if USE_GRID_LAUNCH_20
|
||||
#define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
|
||||
do {\
|
||||
grid_launch_parm lp;\
|
||||
@@ -645,21 +643,6 @@ do {\
|
||||
_kernelName (lp, ##__VA_ARGS__);\
|
||||
ihipPostLaunchKernel(trueStream, lp);\
|
||||
} while(0)
|
||||
#else
|
||||
#define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
|
||||
do {\
|
||||
grid_launch_parm lp;\
|
||||
lp.groupMemBytes = _groupMemBytes; \
|
||||
hipStream_t trueStream = (ihipPreLaunchKernel(_stream, _numBlocks3D, _blockDim3D, &lp)); \
|
||||
if (HIP_TRACE_API) {\
|
||||
fprintf(stderr, KGRN "<<hip-api: hipLaunchKernel '%s' gridDim:(%d,%d,%d) groupDim:(%d,%d,%d) groupMem:+%d stream=%p\n" KNRM, \
|
||||
#_kernelName, lp.gridDim.x, lp.gridDim.y, lp.gridDim.z, lp.groupDim.x, lp.groupDim.y, lp.groupDim.z, lp.groupMemBytes, (void*)(_stream));\
|
||||
}\
|
||||
_kernelName (lp, ##__VA_ARGS__);\
|
||||
ihipPostLaunchKernel(trueStream, lp);\
|
||||
} while(0)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#elif defined (__HCC_C__)
|
||||
|
||||
@@ -1307,7 +1307,7 @@ hipError_t hipDriverGetVersion(int *driverVersion) ;
|
||||
hipError_t hipModuleLoad(hipModule_t *module, const char *fname);
|
||||
|
||||
/**
|
||||
* @brief Freeing the module
|
||||
* @brief Frees the module
|
||||
*
|
||||
* @param [in] module
|
||||
*
|
||||
@@ -1319,7 +1319,7 @@ hipError_t hipModuleLoad(hipModule_t *module, const char *fname);
|
||||
hipError_t hipModuleUnload(hipModule_t module);
|
||||
|
||||
/**
|
||||
* @brief Function with kname will be extracted present in module
|
||||
* @brief Function with kname will be extracted if present in module
|
||||
*
|
||||
* @param [in] module
|
||||
* @param [in] kname
|
||||
@@ -1330,19 +1330,20 @@ hipError_t hipModuleUnload(hipModule_t module);
|
||||
hipError_t hipModuleGetFunction(hipFunction_t *function, hipModule_t module, const char *kname);
|
||||
|
||||
/**
|
||||
* @brief returns device memory pointer and size of the kernel present in the module with symbol - name
|
||||
* @brief returns device memory pointer and size of the kernel present in the module with symbol @p name
|
||||
*
|
||||
* @param [in] moodule
|
||||
* @param [in] name
|
||||
* @param [out] dptr
|
||||
* @param [out[ bytes
|
||||
* @param [in] hmod
|
||||
* @param [in] name
|
||||
*
|
||||
* @returns hipSuccess, hipErrorInvalidValue, hipErrorNotInitialized
|
||||
*/
|
||||
hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, hipModule_t hmod, const char *name);
|
||||
|
||||
|
||||
/**
|
||||
* @brief builds module from code object which resides in host memory. And image is pointer to that location.
|
||||
* @brief builds module from code object which resides in host memory. Image is pointer to that location.
|
||||
*
|
||||
* @param [in] image
|
||||
* @param [out] module
|
||||
@@ -1351,8 +1352,9 @@ hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, hipModule_t h
|
||||
*/
|
||||
hipError_t hipModuleLoadData(hipModule_t *module, const void *image);
|
||||
|
||||
|
||||
/**
|
||||
* @brief launches kernel f with launch parameters and shared memory on stream with arguments passed to kerneelparams or extra
|
||||
* @brief launches kernel f with launch parameters and shared memory on stream with arguments passed to kernelparams or extra
|
||||
*
|
||||
* @param [in[ f
|
||||
* @param [in] gridDimX
|
||||
|
||||
@@ -72,9 +72,8 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream)
|
||||
{
|
||||
HIP_INIT_API(event, stream);
|
||||
|
||||
ihipEvent_t *eh = event;
|
||||
if (eh && eh->_state != hipEventStatusUnitialized) {
|
||||
eh->_stream = stream;
|
||||
if (event && event->_state != hipEventStatusUnitialized) {
|
||||
event->_stream = stream;
|
||||
|
||||
if (stream == NULL) {
|
||||
// If stream == NULL, wait on all queues.
|
||||
@@ -83,16 +82,16 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream)
|
||||
ihipCtx_t *ctx = ihipGetTlsDefaultCtx();
|
||||
ctx->locked_syncDefaultStream(true);
|
||||
|
||||
eh->_timestamp = hc::get_system_ticks();
|
||||
eh->_state = hipEventStatusRecorded;
|
||||
event->_timestamp = hc::get_system_ticks();
|
||||
event->_state = hipEventStatusRecorded;
|
||||
return ihipLogStatus(hipSuccess);
|
||||
} else {
|
||||
eh->_state = hipEventStatusRecording;
|
||||
event->_state = hipEventStatusRecording;
|
||||
// Clear timestamps
|
||||
eh->_timestamp = 0;
|
||||
eh->_marker = stream->_av.create_marker();
|
||||
|
||||
eh->_copySeqId = stream->locked_lastCopySeqId();
|
||||
event->_timestamp = 0;
|
||||
|
||||
// Record the event in the stream:
|
||||
stream->locked_recordEvent(event);
|
||||
|
||||
return ihipLogStatus(hipSuccess);
|
||||
}
|
||||
|
||||
@@ -195,9 +195,9 @@ ihipSignal_t::~ihipSignal_t()
|
||||
//---
|
||||
ihipStream_t::ihipStream_t(ihipCtx_t *ctx, hc::accelerator_view av, unsigned int flags) :
|
||||
_id(0), // will be set by add function.
|
||||
_av(av),
|
||||
_flags(flags),
|
||||
_ctx(ctx)
|
||||
_ctx(ctx),
|
||||
_criticalData(av)
|
||||
{
|
||||
tprintf(DB_SYNC, " streamCreate: stream=%p\n", this);
|
||||
};
|
||||
@@ -246,7 +246,7 @@ void ihipStream_t::wait(LockedAccessor_StreamCrit_t &crit, bool assertQueueEmpty
|
||||
{
|
||||
if (! assertQueueEmpty) {
|
||||
tprintf (DB_SYNC, "stream %p wait for queue-empty..\n", this);
|
||||
_av.wait();
|
||||
crit->_av.wait();
|
||||
}
|
||||
|
||||
if (crit->_last_copy_signal) {
|
||||
@@ -273,6 +273,18 @@ void ihipStream_t::locked_wait(bool assertQueueEmpty)
|
||||
|
||||
};
|
||||
|
||||
|
||||
// Create a marker in this stream.
|
||||
// Save state in the event so it can track the status of the event.
|
||||
void ihipStream_t::locked_recordEvent(hipEvent_t event)
|
||||
{
|
||||
// Lock the stream to prevent simultaneous access
|
||||
LockedAccessor_StreamCrit_t crit(_criticalData);
|
||||
|
||||
event->_marker = crit->_av.create_marker();
|
||||
event->_copySeqId = lastCopySeqId(crit);
|
||||
}
|
||||
|
||||
//=============================================================================
|
||||
|
||||
|
||||
@@ -388,11 +400,10 @@ int HIP_NUM_KERNELS_INFLIGHT = 128;
|
||||
//into the stream to mimic CUDA stream semantics. (some hardware uses separate
|
||||
//queues for data commands and kernel commands, and no implicit ordering is provided).
|
||||
//
|
||||
bool ihipStream_t::lockopen_preKernelCommand()
|
||||
LockedAccessor_StreamCrit_t ihipStream_t::lockopen_preKernelCommand()
|
||||
{
|
||||
LockedAccessor_StreamCrit_t crit(_criticalData, false/*no unlock at destruction*/);
|
||||
|
||||
bool addedSync = false;
|
||||
|
||||
if(crit->_kernelCnt > HIP_NUM_KERNELS_INFLIGHT){
|
||||
this->wait(crit);
|
||||
@@ -402,9 +413,8 @@ bool ihipStream_t::lockopen_preKernelCommand()
|
||||
// If switching command types, we need to add a barrier packet to synchronize things.
|
||||
if (crit->_last_command_type != ihipCommandKernel) {
|
||||
if (crit->_last_copy_signal) {
|
||||
addedSync = true;
|
||||
|
||||
hsa_queue_t * q = (hsa_queue_t*)_av.get_hsa_queue();
|
||||
hsa_queue_t * q = (hsa_queue_t*) (crit->_av.get_hsa_queue());
|
||||
if (HIP_DISABLE_HW_KERNEL_DEP == 0) {
|
||||
this->enqueueBarrier(q, crit->_last_copy_signal, NULL);
|
||||
tprintf (DB_SYNC, "stream %p switch %s to %s (barrier pkt inserted with wait on #%lu)\n",
|
||||
@@ -422,7 +432,7 @@ bool ihipStream_t::lockopen_preKernelCommand()
|
||||
crit->_last_command_type = ihipCommandKernel;
|
||||
}
|
||||
|
||||
return addedSync;
|
||||
return crit;
|
||||
}
|
||||
|
||||
|
||||
@@ -433,6 +443,11 @@ void ihipStream_t::lockclose_postKernelCommand(hc::completion_future &kernelFutu
|
||||
// We locked _criticalData in the lockopen_preKernelCommand() so OK to access here:
|
||||
_criticalData._last_kernel_future = kernelFuture;
|
||||
|
||||
if (HIP_LAUNCH_BLOCKING) {
|
||||
kernelFuture.wait();
|
||||
tprintf(DB_SYNC, " %s LAUNCH_BLOCKING for kernel completion\n", ToString(this).c_str());
|
||||
}
|
||||
|
||||
_criticalData.unlock(); // paired with lock from lockopen_preKernelCommand.
|
||||
};
|
||||
|
||||
@@ -457,7 +472,7 @@ int ihipStream_t::preCopyCommand(LockedAccessor_StreamCrit_t &crit, ihipSignal_t
|
||||
needSync = 1;
|
||||
ihipSignal_t *depSignal = allocSignal(crit);
|
||||
hsa_signal_store_relaxed(depSignal->_hsaSignal,1);
|
||||
this->enqueueBarrier(static_cast<hsa_queue_t*>(_av.get_hsa_queue()), NULL, depSignal);
|
||||
this->enqueueBarrier(static_cast<hsa_queue_t*>(crit->_av.get_hsa_queue()), NULL, depSignal);
|
||||
*waitSignal = depSignal->_hsaSignal;
|
||||
} else if (crit->_last_copy_signal) {
|
||||
needSync = 1;
|
||||
@@ -500,11 +515,17 @@ void ihipStream_t::launchModuleKernel(hsa_signal_t signal,
|
||||
uint64_t kernel){
|
||||
hsa_status_t status;
|
||||
void *kern;
|
||||
hsa_amd_memory_pool_t *pool = reinterpret_cast<hsa_amd_memory_pool_t*>(_av.get_hsa_kernarg_region());
|
||||
|
||||
// Lock stream to prevent other threads from enqueueing kernels at same time.
|
||||
LockedAccessor_StreamCrit_t crit (_criticalData);
|
||||
|
||||
hc::accelerator_view av = crit->_av;
|
||||
|
||||
hsa_amd_memory_pool_t *pool = reinterpret_cast<hsa_amd_memory_pool_t*>(av.get_hsa_kernarg_region());
|
||||
status = hsa_amd_memory_pool_allocate(*pool, kernSize, 0, &kern);
|
||||
status = hsa_amd_agents_allow_access(1, (hsa_agent_t*)_av.get_hsa_agent(), 0, kern);
|
||||
status = hsa_amd_agents_allow_access(1, (hsa_agent_t*)av.get_hsa_agent(), 0, kern);
|
||||
memcpy(kern, kernarg, kernSize);
|
||||
hsa_queue_t *Queue = (hsa_queue_t*)_av.get_hsa_queue();
|
||||
hsa_queue_t *Queue = (hsa_queue_t*)av.get_hsa_queue();
|
||||
const uint32_t queue_mask = Queue->size-1;
|
||||
uint32_t packet_index = hsa_queue_load_write_index_relaxed(Queue);
|
||||
hsa_kernel_dispatch_packet_t *dispatch_packet = &(((hsa_kernel_dispatch_packet_t*)(Queue->base_address))[packet_index & queue_mask]);
|
||||
@@ -1117,8 +1138,9 @@ void ihipReadEnv_I(int *var_ptr, const char *var_name1, const char *var_name2, c
|
||||
while (std::getline(ss, device_id, ',')) {
|
||||
if (atoi(device_id.c_str()) >= 0) {
|
||||
g_hip_visible_devices.push_back(atoi(device_id.c_str()));
|
||||
}else// Any device number after invalid number will not present
|
||||
} else { // Any device number after invalid number will not present
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Print out the number of ids
|
||||
if (HIP_PRINT_ENV) {
|
||||
@@ -1327,7 +1349,6 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_
|
||||
{
|
||||
HIP_INIT();
|
||||
stream = ihipSyncAndResolveStream(stream);
|
||||
#if USE_GRID_LAUNCH_20
|
||||
lp->grid_dim.x = grid.x;
|
||||
lp->grid_dim.y = grid.y;
|
||||
lp->grid_dim.z = grid.z;
|
||||
@@ -1336,27 +1357,18 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_
|
||||
lp->group_dim.z = block.z;
|
||||
lp->barrier_bit = barrier_bit_queue_default;
|
||||
lp->launch_fence = -1;
|
||||
#else
|
||||
lp->gridDim.x = grid.x;
|
||||
lp->gridDim.y = grid.y;
|
||||
lp->gridDim.z = grid.z;
|
||||
lp->groupDim.x = block.x;
|
||||
lp->groupDim.y = block.y;
|
||||
lp->groupDim.z = block.z;
|
||||
#endif
|
||||
stream->lockopen_preKernelCommand();
|
||||
// *av = &stream->_av;
|
||||
lp->av = &stream->_av;
|
||||
|
||||
auto crit = stream->lockopen_preKernelCommand();
|
||||
lp->av = &(crit->_av);
|
||||
lp->cf = new hc::completion_future;
|
||||
// lp->av = static_cast<void*>(av);
|
||||
// lp->cf = static_cast<void*>(malloc(sizeof(hc::completion_future)));
|
||||
return (stream);
|
||||
}
|
||||
|
||||
|
||||
hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, grid_launch_parm *lp)
|
||||
{
|
||||
HIP_INIT_API(stream, grid, block, lp);
|
||||
HIP_INIT();
|
||||
stream = ihipSyncAndResolveStream(stream);
|
||||
#if USE_GRID_LAUNCH_20
|
||||
lp->grid_dim.x = grid;
|
||||
lp->grid_dim.y = 1;
|
||||
lp->grid_dim.z = 1;
|
||||
@@ -1365,28 +1377,18 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, gri
|
||||
lp->group_dim.z = block.z;
|
||||
lp->barrier_bit = barrier_bit_queue_default;
|
||||
lp->launch_fence = -1;
|
||||
#else
|
||||
lp->gridDim.x = grid;
|
||||
lp->gridDim.y = 1;
|
||||
lp->gridDim.z = 1;
|
||||
lp->groupDim.x = block.x;
|
||||
lp->groupDim.y = block.y;
|
||||
lp->groupDim.z = block.z;
|
||||
#endif
|
||||
stream->lockopen_preKernelCommand();
|
||||
// *av = &stream->_av;
|
||||
lp->av = &stream->_av;
|
||||
|
||||
auto crit = stream->lockopen_preKernelCommand();
|
||||
lp->av = &(crit->_av);
|
||||
lp->cf = new hc::completion_future;
|
||||
// lp->av = static_cast<void*>(av);
|
||||
// lp->cf = static_cast<void*>(malloc(sizeof(hc::completion_future)));
|
||||
return (stream);
|
||||
}
|
||||
|
||||
|
||||
hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, grid_launch_parm *lp)
|
||||
{
|
||||
HIP_INIT_API(stream, grid, block, lp);
|
||||
HIP_INIT();
|
||||
stream = ihipSyncAndResolveStream(stream);
|
||||
#if USE_GRID_LAUNCH_20
|
||||
lp->grid_dim.x = grid.x;
|
||||
lp->grid_dim.y = grid.y;
|
||||
lp->grid_dim.z = grid.z;
|
||||
@@ -1395,28 +1397,18 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, gri
|
||||
lp->group_dim.z = 1;
|
||||
lp->barrier_bit = barrier_bit_queue_default;
|
||||
lp->launch_fence = -1;
|
||||
#else
|
||||
lp->gridDim.x = grid.x;
|
||||
lp->gridDim.y = grid.y;
|
||||
lp->gridDim.z = grid.z;
|
||||
lp->groupDim.x = block;
|
||||
lp->groupDim.y = 1;
|
||||
lp->groupDim.z = 1;
|
||||
#endif
|
||||
stream->lockopen_preKernelCommand();
|
||||
// *av = &stream->_av;
|
||||
lp->av = &stream->_av;
|
||||
|
||||
auto crit = stream->lockopen_preKernelCommand();
|
||||
lp->av = &(crit->_av);
|
||||
lp->cf = new hc::completion_future;
|
||||
// lp->av = static_cast<void*>(av);
|
||||
// lp->cf = static_cast<void*>(malloc(sizeof(hc::completion_future)));
|
||||
return (stream);
|
||||
}
|
||||
|
||||
|
||||
hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, grid_launch_parm *lp)
|
||||
{
|
||||
HIP_INIT_API(stream, grid, block, lp);
|
||||
HIP_INIT();
|
||||
stream = ihipSyncAndResolveStream(stream);
|
||||
#if USE_GRID_LAUNCH_20
|
||||
lp->grid_dim.x = grid;
|
||||
lp->grid_dim.y = 1;
|
||||
lp->grid_dim.z = 1;
|
||||
@@ -1425,37 +1417,23 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, g
|
||||
lp->group_dim.z = 1;
|
||||
lp->barrier_bit = barrier_bit_queue_default;
|
||||
lp->launch_fence = -1;
|
||||
#else
|
||||
lp->gridDim.x = grid;
|
||||
lp->gridDim.y = 1;
|
||||
lp->gridDim.z = 1;
|
||||
lp->groupDim.x = block;
|
||||
lp->groupDim.y = 1;
|
||||
lp->groupDim.z = 1;
|
||||
#endif
|
||||
stream->lockopen_preKernelCommand();
|
||||
// *av = &stream->_av;
|
||||
lp->av = &stream->_av;
|
||||
lp->cf = new hc::completion_future;
|
||||
// lp->av = static_cast<void*>(av);
|
||||
// lp->cf = static_cast<void*>(malloc(sizeof(hc::completion_future)));
|
||||
|
||||
auto crit = stream->lockopen_preKernelCommand();
|
||||
lp->av = &(crit->_av);
|
||||
lp->cf = new hc::completion_future; // TODO, is this necessary?
|
||||
return (stream);
|
||||
}
|
||||
|
||||
|
||||
//---
|
||||
//Called after kernel finishes execution.
|
||||
//This releases the lock on the stream.
|
||||
void ihipPostLaunchKernel(hipStream_t stream, grid_launch_parm &lp)
|
||||
{
|
||||
// stream->lockclose_postKernelCommand(cf);
|
||||
stream->lockclose_postKernelCommand(*lp.cf);
|
||||
if (HIP_LAUNCH_BLOCKING) {
|
||||
tprintf(DB_SYNC, " stream:%p LAUNCH_BLOCKING for kernel completion\n", stream);
|
||||
}
|
||||
stream->lockclose_postKernelCommand(*(lp.cf));
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
//=================================================================================================
|
||||
// HIP API Implementation
|
||||
//
|
||||
@@ -1629,7 +1607,7 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const
|
||||
// TODO - remove, slow path.
|
||||
tprintf(DB_COPY1, "H2D && ! srcTracked: am_copy dst=%p src=%p sz=%zu\n", dst, src, sizeBytes);
|
||||
#if USE_AV_COPY
|
||||
_av.copy(src,dst,sizeBytes);
|
||||
crit->_av.copy(src,dst,sizeBytes);
|
||||
#else
|
||||
hc::am_copy(dst, src, sizeBytes);
|
||||
#endif
|
||||
@@ -1677,7 +1655,7 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const
|
||||
// TODO - remove, slow path.
|
||||
tprintf(DB_COPY1, "D2H && !dstTracked: am_copy dst=%p src=%p sz=%zu\n", dst, src, sizeBytes);
|
||||
#if USE_AV_COPY
|
||||
_av.copy(src, dst, sizeBytes);
|
||||
crit->_av.copy(src, dst, sizeBytes);
|
||||
#else
|
||||
hc::am_copy(dst, src, sizeBytes);
|
||||
#endif
|
||||
@@ -1879,7 +1857,7 @@ hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **a
|
||||
stream = device->_defaultStream;
|
||||
}
|
||||
|
||||
*av = &(stream->_av);
|
||||
*av = stream->locked_getAv();
|
||||
|
||||
hipError_t err = hipSuccess;
|
||||
return ihipLogStatus(err);
|
||||
|
||||
@@ -678,9 +678,12 @@ hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset,
|
||||
|
||||
|
||||
|
||||
// TODO - make member function of stream?
|
||||
template <typename T>
|
||||
hc::completion_future
|
||||
ihipMemsetKernel(hipStream_t stream, T * ptr, T val, size_t sizeBytes)
|
||||
ihipMemsetKernel(hipStream_t stream,
|
||||
LockedAccessor_StreamCrit_t &crit,
|
||||
T * ptr, T val, size_t sizeBytes)
|
||||
{
|
||||
int wg = std::min((unsigned)8, stream->getDevice()->_computeUnits);
|
||||
const int threads_per_wg = 256;
|
||||
@@ -696,7 +699,7 @@ ihipMemsetKernel(hipStream_t stream, T * ptr, T val, size_t sizeBytes)
|
||||
|
||||
hc::completion_future cf =
|
||||
hc::parallel_for_each(
|
||||
stream->_av,
|
||||
crit->_av,
|
||||
ext_tile,
|
||||
[=] (hc::tiled_index<1> idx)
|
||||
__attribute__((hc))
|
||||
@@ -713,41 +716,6 @@ ihipMemsetKernel(hipStream_t stream, T * ptr, T val, size_t sizeBytes)
|
||||
return cf;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
hc::completion_future
|
||||
ihipMemcpyKernel(hipStream_t stream, T * c, const T * a, size_t sizeBytes)
|
||||
{
|
||||
int wg = std::min((unsigned)8, stream->getDevice()->_computeUnits);
|
||||
const int threads_per_wg = 256;
|
||||
|
||||
int threads = wg * threads_per_wg;
|
||||
if (threads > sizeBytes) {
|
||||
threads = ((sizeBytes + threads_per_wg - 1) / threads_per_wg) * threads_per_wg;
|
||||
}
|
||||
|
||||
|
||||
hc::extent<1> ext(threads);
|
||||
auto ext_tile = ext.tile(threads_per_wg);
|
||||
|
||||
hc::completion_future cf =
|
||||
hc::parallel_for_each(
|
||||
stream->_av,
|
||||
ext_tile,
|
||||
[=] (hc::tiled_index<1> idx)
|
||||
__attribute__((hc))
|
||||
{
|
||||
int offset = amp_get_global_id(0);
|
||||
// TODO-HCC - change to hc_get_local_size()
|
||||
int stride = amp_get_local_size(0) * hc_get_num_groups(0) ;
|
||||
|
||||
for (int i=offset; i<sizeBytes; i+=stride) {
|
||||
c[i] = a[i];
|
||||
}
|
||||
});
|
||||
|
||||
return cf;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// TODO-sync: function is async unless target is pinned host memory - then these are fully sync.
|
||||
@@ -762,7 +730,7 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s
|
||||
stream = ihipSyncAndResolveStream(stream);
|
||||
|
||||
if (stream) {
|
||||
stream->lockopen_preKernelCommand();
|
||||
auto crit = stream->lockopen_preKernelCommand();
|
||||
|
||||
hc::completion_future cf ;
|
||||
|
||||
@@ -771,7 +739,7 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s
|
||||
try {
|
||||
value = value & 0xff;
|
||||
unsigned value32 = (value << 24) | (value << 16) | (value << 8) | (value) ;
|
||||
cf = ihipMemsetKernel<unsigned> (stream, static_cast<unsigned*> (dst), value32, sizeBytes/sizeof(unsigned));
|
||||
cf = ihipMemsetKernel<unsigned> (stream, crit, static_cast<unsigned*> (dst), value32, sizeBytes/sizeof(unsigned));
|
||||
}
|
||||
catch (std::exception &ex) {
|
||||
e = hipErrorInvalidValue;
|
||||
@@ -779,7 +747,7 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s
|
||||
} else {
|
||||
// use a slow byte-per-workitem copy:
|
||||
try {
|
||||
cf = ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, sizeBytes);
|
||||
cf = ihipMemsetKernel<char> (stream, crit, static_cast<char*> (dst), value, sizeBytes);
|
||||
}
|
||||
catch (std::exception &ex) {
|
||||
e = hipErrorInvalidValue;
|
||||
@@ -814,7 +782,7 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes )
|
||||
stream = ihipSyncAndResolveStream(stream);
|
||||
|
||||
if (stream) {
|
||||
stream->lockopen_preKernelCommand();
|
||||
auto crit = stream->lockopen_preKernelCommand();
|
||||
|
||||
hc::completion_future cf ;
|
||||
|
||||
@@ -823,7 +791,7 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes )
|
||||
try {
|
||||
value = value & 0xff;
|
||||
unsigned value32 = (value << 24) | (value << 16) | (value << 8) | (value) ;
|
||||
cf = ihipMemsetKernel<unsigned> (stream, static_cast<unsigned*> (dst), value32, sizeBytes/sizeof(unsigned));
|
||||
cf = ihipMemsetKernel<unsigned> (stream, crit, static_cast<unsigned*> (dst), value32, sizeBytes/sizeof(unsigned));
|
||||
}
|
||||
catch (std::exception &ex) {
|
||||
e = hipErrorInvalidValue;
|
||||
@@ -831,7 +799,7 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes )
|
||||
} else {
|
||||
// use a slow byte-per-workitem copy:
|
||||
try {
|
||||
cf = ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, sizeBytes);
|
||||
cf = ihipMemsetKernel<char> (stream, crit, static_cast<char*> (dst), value, sizeBytes);
|
||||
}
|
||||
catch (std::exception &ex) {
|
||||
e = hipErrorInvalidValue;
|
||||
|
||||
@@ -84,6 +84,10 @@ hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int
|
||||
|
||||
hipError_t e = hipSuccess;
|
||||
|
||||
#ifdef USE_AV_COPY
|
||||
printf ("USE_AV_COPY\n");
|
||||
#endif
|
||||
|
||||
{
|
||||
// TODO-hcc Convert to use create_blocking_marker(...) functionality.
|
||||
// Currently we have a super-conservative version of this - block on host, and drain the queue.
|
||||
|
||||
مرجع در شماره جدید
Block a user