Refactor for stream->_av.

- move _av into stream critical section.  ( HCC accelerator_view is not
  thread-safe but HIP steram is. )
- Refactored many places in code that need to acquire critical section.
some were previously thread races, ie enqueueing marker.

-remove support for GRID_LAUNCH_VERSION < 20
-Enable USE_AV_COPY based on HCC work-week.
- Review hipModule docs, some calrity/editing.

Change-Id: I3ce7c25ece048c3504f55ecd4683e506bb1fc8b6


[ROCm/hip commit: e76a272d48]
This commit is contained in:
Ben Sander
2016-08-30 17:29:50 -05:00
والد 1ef87ac5f2
کامیت bb0afa4e38
7فایلهای تغییر یافته به همراه128 افزوده شده و 183 حذف شده
@@ -32,7 +32,7 @@ THE SOFTWARE.
// #define USE_MEMCPYTOSYMBOL
//
//Use the new HCC accelerator_view::copy instead of am_copy
#define USE_AV_COPY 0
#define USE_AV_COPY (__hcc_workweek__ >= 16351)
// Compile peer-to-peer support.
// >= 2 : use HCC hc:accelerator::get_is_peer
@@ -353,18 +353,36 @@ struct LockedBase {
};
class ihipModule_t{
public:
hsa_executable_t executable;
hsa_code_object_t object;
std::string fileName;
void *ptr;
size_t size;
};
class ihipFunction_t{
public:
hsa_executable_symbol_t kernel_symbol;
uint64_t kernel;
};
template <typename MUTEX_TYPE>
class ihipStreamCriticalBase_t : public LockedBase<MUTEX_TYPE>
{
public:
ihipStreamCriticalBase_t() :
ihipStreamCriticalBase_t(hc::accelerator_view av) :
_last_command_type(ihipCommandCopyH2H),
_last_copy_signal(NULL),
_signalCursor(0),
_oldest_live_sig_id(1),
_streamSigId(0),
_kernelCnt(0),
_signalCnt(0)
_signalCnt(0),
_av(av)
{
_signalPool.resize(HIP_STREAM_SIGNALS > 0 ? HIP_STREAM_SIGNALS : 1);
};
@@ -395,27 +413,14 @@ public:
// 2 are required if a barrier packet is inserted.
uint32_t _kernelCnt; // Count of inflight kernels in this stream. Reset at ::wait().
SIGSEQNUM _streamSigId; // Monotonically increasing unique signal id.
hc::accelerator_view _av;
};
typedef ihipStreamCriticalBase_t<StreamMutex> ihipStreamCritical_t;
typedef LockedAccessor<ihipStreamCritical_t> LockedAccessor_StreamCrit_t;
class ihipModule_t{
public:
hsa_executable_t executable;
hsa_code_object_t object;
std::string fileName;
void *ptr;
size_t size;
};
class ihipFunction_t{
public:
hsa_executable_symbol_t kernel_symbol;
uint64_t kernel;
};
// Internal stream structure.
class ihipStream_t {
@@ -431,16 +436,23 @@ typedef uint64_t SeqNum_t ;
void copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind);
int preCopyCommand(LockedAccessor_StreamCrit_t &crit, ihipSignal_t *lastCopy, hsa_signal_t *waitSignal, ihipCommand_t copyType);
//---
// Thread-safe accessors - these acquire / release mutex:
bool lockopen_preKernelCommand();
// Member functions that begin with locked_ are thread-safe accessors - these acquire / release the critical mutex.
LockedAccessor_StreamCrit_t lockopen_preKernelCommand();
void lockclose_postKernelCommand(hc::completion_future &kernel_future);
int preCopyCommand(LockedAccessor_StreamCrit_t &crit, ihipSignal_t *lastCopy, hsa_signal_t *waitSignal, ihipCommand_t copyType);
void locked_reclaimSignals(SIGSEQNUM sigNum);
void locked_wait(bool assertQueueEmpty=false);
SIGSEQNUM locked_lastCopySeqId() {LockedAccessor_StreamCrit_t crit(_criticalData); return lastCopySeqId(crit); };
hc::accelerator_view* locked_getAv() { LockedAccessor_StreamCrit_t crit(_criticalData); return &(crit->_av); };
void locked_waitEvent(hipEvent_t event);
void locked_recordEvent(hipEvent_t event);
//---
// Use this if we already have the stream critical data mutex:
void wait(LockedAccessor_StreamCrit_t &crit, bool assertQueueEmpty=false);
@@ -462,7 +474,6 @@ public:
//---
//Public member vars - these are set at initialization and never change:
SeqNum_t _id; // monotonic sequence ID
hc::accelerator_view _av;
unsigned _flags;
@@ -50,9 +50,8 @@ THE SOFTWARE.
#if defined (GRID_LAUNCH_VERSION) and (GRID_LAUNCH_VERSION >= 20)
// Use field names for grid_launch 2.0 structure, if HCC supports GL 2.0.
#define USE_GRID_LAUNCH_20 1
#else
#define USE_GRID_LAUNCH_20 0
#error (HCC must support GRID_LAUNCH_20)
#endif
#define HIP_LAUNCH_PARAM_BUFFER_POINTER ((void*) 0x01)
@@ -633,7 +632,6 @@ extern void ihipPostLaunchKernel(hipStream_t stream, grid_launch_parm &lp);
#define KNRM "\x1B[0m"
#define KGRN "\x1B[32m"
#if USE_GRID_LAUNCH_20
#define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
do {\
grid_launch_parm lp;\
@@ -645,21 +643,6 @@ do {\
_kernelName (lp, ##__VA_ARGS__);\
ihipPostLaunchKernel(trueStream, lp);\
} while(0)
#else
#define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
do {\
grid_launch_parm lp;\
lp.groupMemBytes = _groupMemBytes; \
hipStream_t trueStream = (ihipPreLaunchKernel(_stream, _numBlocks3D, _blockDim3D, &lp)); \
if (HIP_TRACE_API) {\
fprintf(stderr, KGRN "<<hip-api: hipLaunchKernel '%s' gridDim:(%d,%d,%d) groupDim:(%d,%d,%d) groupMem:+%d stream=%p\n" KNRM, \
#_kernelName, lp.gridDim.x, lp.gridDim.y, lp.gridDim.z, lp.groupDim.x, lp.groupDim.y, lp.groupDim.z, lp.groupMemBytes, (void*)(_stream));\
}\
_kernelName (lp, ##__VA_ARGS__);\
ihipPostLaunchKernel(trueStream, lp);\
} while(0)
#endif
#elif defined (__HCC_C__)
@@ -1307,7 +1307,7 @@ hipError_t hipDriverGetVersion(int *driverVersion) ;
hipError_t hipModuleLoad(hipModule_t *module, const char *fname);
/**
* @brief Freeing the module
* @brief Frees the module
*
* @param [in] module
*
@@ -1319,7 +1319,7 @@ hipError_t hipModuleLoad(hipModule_t *module, const char *fname);
hipError_t hipModuleUnload(hipModule_t module);
/**
* @brief Function with kname will be extracted present in module
* @brief Function with kname will be extracted if present in module
*
* @param [in] module
* @param [in] kname
@@ -1330,19 +1330,20 @@ hipError_t hipModuleUnload(hipModule_t module);
hipError_t hipModuleGetFunction(hipFunction_t *function, hipModule_t module, const char *kname);
/**
* @brief returns device memory pointer and size of the kernel present in the module with symbol - name
* @brief returns device memory pointer and size of the kernel present in the module with symbol @p name
*
* @param [in] moodule
* @param [in] name
* @param [out] dptr
* @param [out[ bytes
* @param [in] hmod
* @param [in] name
*
* @returns hipSuccess, hipErrorInvalidValue, hipErrorNotInitialized
*/
hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, hipModule_t hmod, const char *name);
/**
* @brief builds module from code object which resides in host memory. And image is pointer to that location.
* @brief builds module from code object which resides in host memory. Image is pointer to that location.
*
* @param [in] image
* @param [out] module
@@ -1351,8 +1352,9 @@ hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, hipModule_t h
*/
hipError_t hipModuleLoadData(hipModule_t *module, const void *image);
/**
* @brief launches kernel f with launch parameters and shared memory on stream with arguments passed to kerneelparams or extra
* @brief launches kernel f with launch parameters and shared memory on stream with arguments passed to kernelparams or extra
*
* @param [in[ f
* @param [in] gridDimX
+9 -10
مشاهده پرونده
@@ -72,9 +72,8 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream)
{
HIP_INIT_API(event, stream);
ihipEvent_t *eh = event;
if (eh && eh->_state != hipEventStatusUnitialized) {
eh->_stream = stream;
if (event && event->_state != hipEventStatusUnitialized) {
event->_stream = stream;
if (stream == NULL) {
// If stream == NULL, wait on all queues.
@@ -83,16 +82,16 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream)
ihipCtx_t *ctx = ihipGetTlsDefaultCtx();
ctx->locked_syncDefaultStream(true);
eh->_timestamp = hc::get_system_ticks();
eh->_state = hipEventStatusRecorded;
event->_timestamp = hc::get_system_ticks();
event->_state = hipEventStatusRecorded;
return ihipLogStatus(hipSuccess);
} else {
eh->_state = hipEventStatusRecording;
event->_state = hipEventStatusRecording;
// Clear timestamps
eh->_timestamp = 0;
eh->_marker = stream->_av.create_marker();
eh->_copySeqId = stream->locked_lastCopySeqId();
event->_timestamp = 0;
// Record the event in the stream:
stream->locked_recordEvent(event);
return ihipLogStatus(hipSuccess);
}
+60 -82
مشاهده پرونده
@@ -195,9 +195,9 @@ ihipSignal_t::~ihipSignal_t()
//---
ihipStream_t::ihipStream_t(ihipCtx_t *ctx, hc::accelerator_view av, unsigned int flags) :
_id(0), // will be set by add function.
_av(av),
_flags(flags),
_ctx(ctx)
_ctx(ctx),
_criticalData(av)
{
tprintf(DB_SYNC, " streamCreate: stream=%p\n", this);
};
@@ -246,7 +246,7 @@ void ihipStream_t::wait(LockedAccessor_StreamCrit_t &crit, bool assertQueueEmpty
{
if (! assertQueueEmpty) {
tprintf (DB_SYNC, "stream %p wait for queue-empty..\n", this);
_av.wait();
crit->_av.wait();
}
if (crit->_last_copy_signal) {
@@ -273,6 +273,18 @@ void ihipStream_t::locked_wait(bool assertQueueEmpty)
};
// Create a marker in this stream.
// Save state in the event so it can track the status of the event.
void ihipStream_t::locked_recordEvent(hipEvent_t event)
{
// Lock the stream to prevent simultaneous access
LockedAccessor_StreamCrit_t crit(_criticalData);
event->_marker = crit->_av.create_marker();
event->_copySeqId = lastCopySeqId(crit);
}
//=============================================================================
@@ -388,11 +400,10 @@ int HIP_NUM_KERNELS_INFLIGHT = 128;
//into the stream to mimic CUDA stream semantics. (some hardware uses separate
//queues for data commands and kernel commands, and no implicit ordering is provided).
//
bool ihipStream_t::lockopen_preKernelCommand()
LockedAccessor_StreamCrit_t ihipStream_t::lockopen_preKernelCommand()
{
LockedAccessor_StreamCrit_t crit(_criticalData, false/*no unlock at destruction*/);
bool addedSync = false;
if(crit->_kernelCnt > HIP_NUM_KERNELS_INFLIGHT){
this->wait(crit);
@@ -402,9 +413,8 @@ bool ihipStream_t::lockopen_preKernelCommand()
// If switching command types, we need to add a barrier packet to synchronize things.
if (crit->_last_command_type != ihipCommandKernel) {
if (crit->_last_copy_signal) {
addedSync = true;
hsa_queue_t * q = (hsa_queue_t*)_av.get_hsa_queue();
hsa_queue_t * q = (hsa_queue_t*) (crit->_av.get_hsa_queue());
if (HIP_DISABLE_HW_KERNEL_DEP == 0) {
this->enqueueBarrier(q, crit->_last_copy_signal, NULL);
tprintf (DB_SYNC, "stream %p switch %s to %s (barrier pkt inserted with wait on #%lu)\n",
@@ -422,7 +432,7 @@ bool ihipStream_t::lockopen_preKernelCommand()
crit->_last_command_type = ihipCommandKernel;
}
return addedSync;
return crit;
}
@@ -433,6 +443,11 @@ void ihipStream_t::lockclose_postKernelCommand(hc::completion_future &kernelFutu
// We locked _criticalData in the lockopen_preKernelCommand() so OK to access here:
_criticalData._last_kernel_future = kernelFuture;
if (HIP_LAUNCH_BLOCKING) {
kernelFuture.wait();
tprintf(DB_SYNC, " %s LAUNCH_BLOCKING for kernel completion\n", ToString(this).c_str());
}
_criticalData.unlock(); // paired with lock from lockopen_preKernelCommand.
};
@@ -457,7 +472,7 @@ int ihipStream_t::preCopyCommand(LockedAccessor_StreamCrit_t &crit, ihipSignal_t
needSync = 1;
ihipSignal_t *depSignal = allocSignal(crit);
hsa_signal_store_relaxed(depSignal->_hsaSignal,1);
this->enqueueBarrier(static_cast<hsa_queue_t*>(_av.get_hsa_queue()), NULL, depSignal);
this->enqueueBarrier(static_cast<hsa_queue_t*>(crit->_av.get_hsa_queue()), NULL, depSignal);
*waitSignal = depSignal->_hsaSignal;
} else if (crit->_last_copy_signal) {
needSync = 1;
@@ -500,11 +515,17 @@ void ihipStream_t::launchModuleKernel(hsa_signal_t signal,
uint64_t kernel){
hsa_status_t status;
void *kern;
hsa_amd_memory_pool_t *pool = reinterpret_cast<hsa_amd_memory_pool_t*>(_av.get_hsa_kernarg_region());
// Lock stream to prevent other threads from enqueueing kernels at same time.
LockedAccessor_StreamCrit_t crit (_criticalData);
hc::accelerator_view av = crit->_av;
hsa_amd_memory_pool_t *pool = reinterpret_cast<hsa_amd_memory_pool_t*>(av.get_hsa_kernarg_region());
status = hsa_amd_memory_pool_allocate(*pool, kernSize, 0, &kern);
status = hsa_amd_agents_allow_access(1, (hsa_agent_t*)_av.get_hsa_agent(), 0, kern);
status = hsa_amd_agents_allow_access(1, (hsa_agent_t*)av.get_hsa_agent(), 0, kern);
memcpy(kern, kernarg, kernSize);
hsa_queue_t *Queue = (hsa_queue_t*)_av.get_hsa_queue();
hsa_queue_t *Queue = (hsa_queue_t*)av.get_hsa_queue();
const uint32_t queue_mask = Queue->size-1;
uint32_t packet_index = hsa_queue_load_write_index_relaxed(Queue);
hsa_kernel_dispatch_packet_t *dispatch_packet = &(((hsa_kernel_dispatch_packet_t*)(Queue->base_address))[packet_index & queue_mask]);
@@ -1117,8 +1138,9 @@ void ihipReadEnv_I(int *var_ptr, const char *var_name1, const char *var_name2, c
while (std::getline(ss, device_id, ',')) {
if (atoi(device_id.c_str()) >= 0) {
g_hip_visible_devices.push_back(atoi(device_id.c_str()));
}else// Any device number after invalid number will not present
} else { // Any device number after invalid number will not present
break;
}
}
// Print out the number of ids
if (HIP_PRINT_ENV) {
@@ -1327,7 +1349,6 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_
{
HIP_INIT();
stream = ihipSyncAndResolveStream(stream);
#if USE_GRID_LAUNCH_20
lp->grid_dim.x = grid.x;
lp->grid_dim.y = grid.y;
lp->grid_dim.z = grid.z;
@@ -1336,27 +1357,18 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_
lp->group_dim.z = block.z;
lp->barrier_bit = barrier_bit_queue_default;
lp->launch_fence = -1;
#else
lp->gridDim.x = grid.x;
lp->gridDim.y = grid.y;
lp->gridDim.z = grid.z;
lp->groupDim.x = block.x;
lp->groupDim.y = block.y;
lp->groupDim.z = block.z;
#endif
stream->lockopen_preKernelCommand();
// *av = &stream->_av;
lp->av = &stream->_av;
auto crit = stream->lockopen_preKernelCommand();
lp->av = &(crit->_av);
lp->cf = new hc::completion_future;
// lp->av = static_cast<void*>(av);
// lp->cf = static_cast<void*>(malloc(sizeof(hc::completion_future)));
return (stream);
}
hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, grid_launch_parm *lp)
{
HIP_INIT_API(stream, grid, block, lp);
HIP_INIT();
stream = ihipSyncAndResolveStream(stream);
#if USE_GRID_LAUNCH_20
lp->grid_dim.x = grid;
lp->grid_dim.y = 1;
lp->grid_dim.z = 1;
@@ -1365,28 +1377,18 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, gri
lp->group_dim.z = block.z;
lp->barrier_bit = barrier_bit_queue_default;
lp->launch_fence = -1;
#else
lp->gridDim.x = grid;
lp->gridDim.y = 1;
lp->gridDim.z = 1;
lp->groupDim.x = block.x;
lp->groupDim.y = block.y;
lp->groupDim.z = block.z;
#endif
stream->lockopen_preKernelCommand();
// *av = &stream->_av;
lp->av = &stream->_av;
auto crit = stream->lockopen_preKernelCommand();
lp->av = &(crit->_av);
lp->cf = new hc::completion_future;
// lp->av = static_cast<void*>(av);
// lp->cf = static_cast<void*>(malloc(sizeof(hc::completion_future)));
return (stream);
}
hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, grid_launch_parm *lp)
{
HIP_INIT_API(stream, grid, block, lp);
HIP_INIT();
stream = ihipSyncAndResolveStream(stream);
#if USE_GRID_LAUNCH_20
lp->grid_dim.x = grid.x;
lp->grid_dim.y = grid.y;
lp->grid_dim.z = grid.z;
@@ -1395,28 +1397,18 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, gri
lp->group_dim.z = 1;
lp->barrier_bit = barrier_bit_queue_default;
lp->launch_fence = -1;
#else
lp->gridDim.x = grid.x;
lp->gridDim.y = grid.y;
lp->gridDim.z = grid.z;
lp->groupDim.x = block;
lp->groupDim.y = 1;
lp->groupDim.z = 1;
#endif
stream->lockopen_preKernelCommand();
// *av = &stream->_av;
lp->av = &stream->_av;
auto crit = stream->lockopen_preKernelCommand();
lp->av = &(crit->_av);
lp->cf = new hc::completion_future;
// lp->av = static_cast<void*>(av);
// lp->cf = static_cast<void*>(malloc(sizeof(hc::completion_future)));
return (stream);
}
hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, grid_launch_parm *lp)
{
HIP_INIT_API(stream, grid, block, lp);
HIP_INIT();
stream = ihipSyncAndResolveStream(stream);
#if USE_GRID_LAUNCH_20
lp->grid_dim.x = grid;
lp->grid_dim.y = 1;
lp->grid_dim.z = 1;
@@ -1425,37 +1417,23 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, g
lp->group_dim.z = 1;
lp->barrier_bit = barrier_bit_queue_default;
lp->launch_fence = -1;
#else
lp->gridDim.x = grid;
lp->gridDim.y = 1;
lp->gridDim.z = 1;
lp->groupDim.x = block;
lp->groupDim.y = 1;
lp->groupDim.z = 1;
#endif
stream->lockopen_preKernelCommand();
// *av = &stream->_av;
lp->av = &stream->_av;
lp->cf = new hc::completion_future;
// lp->av = static_cast<void*>(av);
// lp->cf = static_cast<void*>(malloc(sizeof(hc::completion_future)));
auto crit = stream->lockopen_preKernelCommand();
lp->av = &(crit->_av);
lp->cf = new hc::completion_future; // TODO, is this necessary?
return (stream);
}
//---
//Called after kernel finishes execution.
//This releases the lock on the stream.
void ihipPostLaunchKernel(hipStream_t stream, grid_launch_parm &lp)
{
// stream->lockclose_postKernelCommand(cf);
stream->lockclose_postKernelCommand(*lp.cf);
if (HIP_LAUNCH_BLOCKING) {
tprintf(DB_SYNC, " stream:%p LAUNCH_BLOCKING for kernel completion\n", stream);
}
stream->lockclose_postKernelCommand(*(lp.cf));
}
//
//=================================================================================================
// HIP API Implementation
//
@@ -1629,7 +1607,7 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const
// TODO - remove, slow path.
tprintf(DB_COPY1, "H2D && ! srcTracked: am_copy dst=%p src=%p sz=%zu\n", dst, src, sizeBytes);
#if USE_AV_COPY
_av.copy(src,dst,sizeBytes);
crit->_av.copy(src,dst,sizeBytes);
#else
hc::am_copy(dst, src, sizeBytes);
#endif
@@ -1677,7 +1655,7 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const
// TODO - remove, slow path.
tprintf(DB_COPY1, "D2H && !dstTracked: am_copy dst=%p src=%p sz=%zu\n", dst, src, sizeBytes);
#if USE_AV_COPY
_av.copy(src, dst, sizeBytes);
crit->_av.copy(src, dst, sizeBytes);
#else
hc::am_copy(dst, src, sizeBytes);
#endif
@@ -1879,7 +1857,7 @@ hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **a
stream = device->_defaultStream;
}
*av = &(stream->_av);
*av = stream->locked_getAv();
hipError_t err = hipSuccess;
return ihipLogStatus(err);
+11 -43
مشاهده پرونده
@@ -678,9 +678,12 @@ hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset,
// TODO - make member function of stream?
template <typename T>
hc::completion_future
ihipMemsetKernel(hipStream_t stream, T * ptr, T val, size_t sizeBytes)
ihipMemsetKernel(hipStream_t stream,
LockedAccessor_StreamCrit_t &crit,
T * ptr, T val, size_t sizeBytes)
{
int wg = std::min((unsigned)8, stream->getDevice()->_computeUnits);
const int threads_per_wg = 256;
@@ -696,7 +699,7 @@ ihipMemsetKernel(hipStream_t stream, T * ptr, T val, size_t sizeBytes)
hc::completion_future cf =
hc::parallel_for_each(
stream->_av,
crit->_av,
ext_tile,
[=] (hc::tiled_index<1> idx)
__attribute__((hc))
@@ -713,41 +716,6 @@ ihipMemsetKernel(hipStream_t stream, T * ptr, T val, size_t sizeBytes)
return cf;
}
template <typename T>
hc::completion_future
ihipMemcpyKernel(hipStream_t stream, T * c, const T * a, size_t sizeBytes)
{
int wg = std::min((unsigned)8, stream->getDevice()->_computeUnits);
const int threads_per_wg = 256;
int threads = wg * threads_per_wg;
if (threads > sizeBytes) {
threads = ((sizeBytes + threads_per_wg - 1) / threads_per_wg) * threads_per_wg;
}
hc::extent<1> ext(threads);
auto ext_tile = ext.tile(threads_per_wg);
hc::completion_future cf =
hc::parallel_for_each(
stream->_av,
ext_tile,
[=] (hc::tiled_index<1> idx)
__attribute__((hc))
{
int offset = amp_get_global_id(0);
// TODO-HCC - change to hc_get_local_size()
int stride = amp_get_local_size(0) * hc_get_num_groups(0) ;
for (int i=offset; i<sizeBytes; i+=stride) {
c[i] = a[i];
}
});
return cf;
}
// TODO-sync: function is async unless target is pinned host memory - then these are fully sync.
@@ -762,7 +730,7 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s
stream = ihipSyncAndResolveStream(stream);
if (stream) {
stream->lockopen_preKernelCommand();
auto crit = stream->lockopen_preKernelCommand();
hc::completion_future cf ;
@@ -771,7 +739,7 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s
try {
value = value & 0xff;
unsigned value32 = (value << 24) | (value << 16) | (value << 8) | (value) ;
cf = ihipMemsetKernel<unsigned> (stream, static_cast<unsigned*> (dst), value32, sizeBytes/sizeof(unsigned));
cf = ihipMemsetKernel<unsigned> (stream, crit, static_cast<unsigned*> (dst), value32, sizeBytes/sizeof(unsigned));
}
catch (std::exception &ex) {
e = hipErrorInvalidValue;
@@ -779,7 +747,7 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s
} else {
// use a slow byte-per-workitem copy:
try {
cf = ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, sizeBytes);
cf = ihipMemsetKernel<char> (stream, crit, static_cast<char*> (dst), value, sizeBytes);
}
catch (std::exception &ex) {
e = hipErrorInvalidValue;
@@ -814,7 +782,7 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes )
stream = ihipSyncAndResolveStream(stream);
if (stream) {
stream->lockopen_preKernelCommand();
auto crit = stream->lockopen_preKernelCommand();
hc::completion_future cf ;
@@ -823,7 +791,7 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes )
try {
value = value & 0xff;
unsigned value32 = (value << 24) | (value << 16) | (value << 8) | (value) ;
cf = ihipMemsetKernel<unsigned> (stream, static_cast<unsigned*> (dst), value32, sizeBytes/sizeof(unsigned));
cf = ihipMemsetKernel<unsigned> (stream, crit, static_cast<unsigned*> (dst), value32, sizeBytes/sizeof(unsigned));
}
catch (std::exception &ex) {
e = hipErrorInvalidValue;
@@ -831,7 +799,7 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes )
} else {
// use a slow byte-per-workitem copy:
try {
cf = ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, sizeBytes);
cf = ihipMemsetKernel<char> (stream, crit, static_cast<char*> (dst), value, sizeBytes);
}
catch (std::exception &ex) {
e = hipErrorInvalidValue;
@@ -84,6 +84,10 @@ hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int
hipError_t e = hipSuccess;
#ifdef USE_AV_COPY
printf ("USE_AV_COPY\n");
#endif
{
// TODO-hcc Convert to use create_blocking_marker(...) functionality.
// Currently we have a super-conservative version of this - block on host, and drain the queue.