From 6f95ad0a8de06e3bf102c2f3d07d7d2ff0001b9c Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Sat, 23 Jul 2016 14:54:20 -0500 Subject: [PATCH] Partial fix async after kernel launch signal issue Change-Id: Ib48d6564379160035bded9493b93663fba361710 [ROCm/hip commit: c756bb3398fa256365635d6d04e0fd221ae1f3ba] --- projects/hip/include/hcc_detail/hip_hcc.h | 1 + projects/hip/src/hip_hcc.cpp | 56 ++++++++++++++-- .../hip/tests/src/stress/hipStressAsync.cpp | 64 +++++++++++++++++++ .../hip/tests/src/stress/hipStressChain.cpp | 64 +++++++++++++++++++ .../hip/tests/src/stress/hipStressSync.cpp | 63 ++++++++++++++++++ 5 files changed, 243 insertions(+), 5 deletions(-) create mode 100644 projects/hip/tests/src/stress/hipStressAsync.cpp create mode 100644 projects/hip/tests/src/stress/hipStressChain.cpp create mode 100644 projects/hip/tests/src/stress/hipStressSync.cpp diff --git a/projects/hip/include/hcc_detail/hip_hcc.h b/projects/hip/include/hcc_detail/hip_hcc.h index a3f9c73a13..10c6cf5ff0 100644 --- a/projects/hip/include/hcc_detail/hip_hcc.h +++ b/projects/hip/include/hcc_detail/hip_hcc.h @@ -445,6 +445,7 @@ private: private: void enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal); + void enqueueBarrier(hsa_queue_t* queue, hsa_signal_t *depSignal); void waitCopy(LockedAccessor_StreamCrit_t &crit, ihipSignal_t *signal); // The unsigned return is hipMemcpyKind diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 5bdbefcdfe..097cd702e4 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -359,6 +359,42 @@ void ihipStream_t::enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal) hsa_signal_store_relaxed(queue->doorbell_signal, index); } +void ihipStream_t::enqueueBarrier(hsa_queue_t* queue, hsa_signal_t *depSignal) +{ + + // Obtain the write index for the command queue + uint64_t index = hsa_queue_load_write_index_relaxed(queue); + const uint32_t queueMask = queue->size - 1; + + // Define the barrier packet to be at the calculated queue index address + hsa_barrier_and_packet_t* barrier = &(((hsa_barrier_and_packet_t*)(queue->base_address))[index&queueMask]); + memset(barrier, 0, sizeof(hsa_barrier_and_packet_t)); + + // setup header + uint16_t header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE; + header |= 1 << HSA_PACKET_HEADER_BARRIER; + //header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; + //header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; + barrier->header = header; + + barrier->dep_signal[0].handle = 0; + barrier->dep_signal[1].handle = 0; + barrier->dep_signal[2].handle = 0; + barrier->dep_signal[3].handle = 0; + barrier->dep_signal[4].handle = 0; + + hsa_signal_t signal; + hsa_signal_create(1, 0, NULL, &signal); + *depSignal = signal; + barrier->completion_signal = signal; + + // TODO - check queue overflow, return error: + // Increment write index and ring doorbell to dispatch the kernel + hsa_queue_store_write_index_relaxed(queue, index+1); + hsa_signal_store_relaxed(queue->doorbell_signal, index); +} + + //-- //When the commands in a stream change types (ie kernel command follows a data command, @@ -429,15 +465,16 @@ int ihipStream_t::preCopyCommand(LockedAccessor_StreamCrit_t &crit, ihipSignal_t tprintf (DB_SYNC, "stream %p switch %s to %s (async copy dep on prev kernel)\n", this, ihipCommandName[crit->_last_command_type], ihipCommandName[copyType]); needSync = 1; - hsa_signal_t *hsaSignal = (static_cast (crit->_last_kernel_future.get_native_handle())); - if (hsaSignal) { + this->enqueueBarrier(static_cast(_av.get_hsa_queue()), waitSignal); +// hsa_signal_t *hsaSignal = (static_cast (crit->_last_kernel_future.get_native_handle())); +/* if (hsaSignal) { // Keep reference to the kernel future in order to keep the // dependent signal alive. _depFutures.push_back(crit->_last_kernel_future); *waitSignal = * hsaSignal; } else { assert(0); // if NULL signal, and we return 1, hsa_amd_memory_copy_async will fail. Confirm this never happens. - } + }*/ } else if (crit->_last_copy_signal) { needSync = 1; tprintf (DB_SYNC, "stream %p switch %s to %s (async copy dep on other copy #%lu)\n", @@ -1364,7 +1401,6 @@ void ihipStream_t::setAsyncCopyAgents(unsigned kind, ihipCommand_t *commandType, void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const void* src, size_t sizeBytes, unsigned kind) { ihipDevice_t *device = this->getDevice(); - if (device == NULL) { throw ihipException(hipErrorInvalidDevice); } @@ -1446,7 +1482,10 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const tprintf(DB_COPY1, "D2H && !dstTracked: staged copy D2H dst=%p src=%p sz=%zu\n", dst, src, sizeBytes); //printf ("staged-copy- read dep signals\n"); device->_staging_buffer[1]->CopyDeviceToHost(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); - + if(crit->_last_command_type == ihipCommandKernel){ + std::cout<<"Destroying depSignal MemcpySync"<wait(crit, true); @@ -1529,6 +1568,10 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const throw ihipException(hipErrorInvalidValue); } } + if(crit->_last_command_type == ihipCommandKernel){ + hsa_signal_destroy(depSignal); + } + } @@ -1602,6 +1645,9 @@ void ihipStream_t::copyAsync(void* dst, const void* src, size_t sizeBytes, unsig hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent, sizeBytes, depSignalCnt, depSignalCnt ? &depSignal:0x0, ihip_signal->_hsa_signal); + if (crit->_last_command_type == ihipCommandKernel) { + hsa_signal_destroy(depSignal); + } if (hsa_status == HSA_STATUS_SUCCESS) { if (HIP_LAUNCH_BLOCKING) { diff --git a/projects/hip/tests/src/stress/hipStressAsync.cpp b/projects/hip/tests/src/stress/hipStressAsync.cpp new file mode 100644 index 0000000000..3b8acb40a6 --- /dev/null +++ b/projects/hip/tests/src/stress/hipStressAsync.cpp @@ -0,0 +1,64 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR +IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* + * Test for checking the functionality of + * hipError_t hipDeviceSynchronize(); +*/ + +#include"hip_runtime.h" +#include + +#define _SIZE sizeof(int)*1024*1024 +#define NUM_STREAMS 20 +#define ITER 1<<10 + +__global__ void Iter(hipLaunchParm lp, int *Ad, int num){ + int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + if(tx == 0){ + for(int i = 0; i +#include + +#define NUM_SIZE 8 +#define NUM_ITER 1 << 30 +static size_t size[NUM_SIZE]; + +__global__ void Add(hipLaunchParm lp, int *Ad){ + int tx = hipThreadIdx_x; + Ad[tx] = Ad[tx] + tx; +} + +void setup(){ + for(int i=0;i + +#define _SIZE sizeof(int)*1024*1024 +#define NUM_STREAMS 20 +#define ITER 1<<10 + +__global__ void Iter(hipLaunchParm lp, int *Ad, int num){ + int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + if(tx == 0){ + for(int i = 0; i