From 1ab22946571926169bf10d7baea742b34ec6c17e Mon Sep 17 00:00:00 2001 From: gargrahul Date: Wed, 10 Feb 2016 04:29:55 +0530 Subject: [PATCH 01/94] Removed atomicInc and atomicDec support from HIP [ROCm/hip commit: 8c40a4ace4032964c9349f51295809a8a063756f] --- .../hip/docs/markdown/hip_kernel_language.md | 4 +- projects/hip/include/hcc_detail/hip_runtime.h | 40 +++++++------------ 2 files changed, 16 insertions(+), 28 deletions(-) diff --git a/projects/hip/docs/markdown/hip_kernel_language.md b/projects/hip/docs/markdown/hip_kernel_language.md index e7a6baa1a9..582cb8788a 100644 --- a/projects/hip/docs/markdown/hip_kernel_language.md +++ b/projects/hip/docs/markdown/hip_kernel_language.md @@ -435,8 +435,8 @@ HIP supports the following atomic operations. | int atomicMax(int* address, int val) | ✓ | ✓ | | unsigned int atomicMax(unsigned int* address,unsigned int val) | ✓ | ✓ | | unsigned long long int atomicMax(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | -| unsigned int atomicInc(unsigned int* address)| ✓
Takes one argument. | ✓
Wrapping increment,takes two arguments. | -| unsigned int atomicDec(unsigned int* address)| ✓
Takes one argument. | ✓
Wrapping decrement,takes two arguments. | +| unsigned int atomicInc(unsigned int* address)| ✗ | ✓ | +| unsigned int atomicDec(unsigned int* address)| ✗ | ✓ | | int atomicCAS(int* address, int compare, int val) | ✓ | ✓ | | unsigned int atomicCAS(unsigned int* address,unsigned int compare,unsigned int val) | ✓ | ✓ | | unsigned long long int atomicCAS(unsigned long long int* address,unsigned long long int compare,unsigned long long int val) | ✓ | ✓ | diff --git a/projects/hip/include/hcc_detail/hip_runtime.h b/projects/hip/include/hcc_detail/hip_runtime.h index d7e4cba328..8474f066df 100644 --- a/projects/hip/include/hcc_detail/hip_runtime.h +++ b/projects/hip/include/hcc_detail/hip_runtime.h @@ -195,18 +195,6 @@ __device__ inline unsigned long long int atomicMax(unsigned long long int* addre return (long long int)hc::atomic_fetch_max((uint64_t*)address,(uint64_t)val); } -//atomicInc() -__device__ inline unsigned int atomicInc(unsigned int* address) -{ - return hc::atomic_fetch_inc(address); -} - -//atomicDec() -__device__ inline unsigned int atomicDec(unsigned int* address) -{ - return hc::atomic_fetch_dec(address); -} - //atomicCAS() __device__ inline int atomicCAS(int* address, int compare, int val) { @@ -318,17 +306,17 @@ __device__ inline unsigned int __ffsll(unsigned long long int input) return hc::__lastbit_u32_u64( input)+1; } -__device__ inline unsigned int __ffs(int input) +__device__ inline unsigned int __ffs(int input) { return hc::__lastbit_u32_s32( input)+1; } -__device__ inline unsigned int __ffsll(long long int input) +__device__ inline unsigned int __ffsll(long long int input) { return hc::__lastbit_u32_s64( input)+1; } -__device__ inline unsigned int __brev( unsigned int input) +__device__ inline unsigned int __brev( unsigned int input) { return hc::__bitrev_b32( input); } @@ -339,59 +327,59 @@ __device__ inline unsigned long long int __brevll( unsigned long long int input) } // warp vote function __all __any __ballot -__device__ inline int __all( 
int input) +__device__ inline int __all( int input) { return hc::__all( input); } -__device__ inline int __any( int input) +__device__ inline int __any( int input) { if( hc::__any( input)!=0) return 1; else return 0; } -__device__ inline unsigned long long int __ballot( int input) +__device__ inline unsigned long long int __ballot( int input) { return hc::__ballot( input); } // warp shuffle functions -__device__ inline int __shfl(int input, int lane, int width) +__device__ inline int __shfl(int input, int lane, int width) { return hc::__shfl(input,lane,width); } -__device__ inline int __shfl_up(int input, unsigned int lane_delta, int width) +__device__ inline int __shfl_up(int input, unsigned int lane_delta, int width) { return hc::__shfl_up(input,lane_delta,width); } -__device__ inline int __shfl_down(int input, unsigned int lane_delta, int width) +__device__ inline int __shfl_down(int input, unsigned int lane_delta, int width) { return hc::__shfl_down(input,lane_delta,width); } -__device__ inline int __shfl_xor(int input, int lane_mask, int width) +__device__ inline int __shfl_xor(int input, int lane_mask, int width) { return hc::__shfl_xor(input,lane_mask,width); } -__device__ inline float __shfl(float input, int lane, int width) +__device__ inline float __shfl(float input, int lane, int width) { return hc::__shfl(input,lane,width); } -__device__ inline float __shfl_up(float input, unsigned int lane_delta, int width) +__device__ inline float __shfl_up(float input, unsigned int lane_delta, int width) { return hc::__shfl_up(input,lane_delta,width); } -__device__ inline float __shfl_down(float input, unsigned int lane_delta, int width) +__device__ inline float __shfl_down(float input, unsigned int lane_delta, int width) { return hc::__shfl_down(input,lane_delta,width); } -__device__ inline float __shfl_xor(float input, int lane_mask, int width) +__device__ inline float __shfl_xor(float input, int lane_mask, int width) { return hc::__shfl_xor(input,lane_mask,width); 
} From 837875df37413ab421ee23cc16fa46152843cf3b Mon Sep 17 00:00:00 2001 From: streamhsa Date: Wed, 10 Feb 2016 20:01:16 +0800 Subject: [PATCH 02/94] Resolved test issues [ROCm/hip commit: 4035b71df4d2d8ed0ab774a4969f99b61b7757a1] --- projects/hip/tests/src/CMakeLists.txt | 2 +- projects/hip/tests/src/hipGridLaunch.cpp | 2 +- projects/hip/tests/src/hip_anyall.cpp | 18 ++++++++++++------ projects/hip/tests/src/hip_ballot.cpp | 21 ++++++++++++--------- 4 files changed, 26 insertions(+), 17 deletions(-) diff --git a/projects/hip/tests/src/CMakeLists.txt b/projects/hip/tests/src/CMakeLists.txt index ba4be66a22..0ec287b334 100644 --- a/projects/hip/tests/src/CMakeLists.txt +++ b/projects/hip/tests/src/CMakeLists.txt @@ -126,7 +126,7 @@ make_test(hipEventRecord --iterations 10) make_test(hipMemset " " ) make_test(hipMemset --N 10 --memsetval 0x42 ) # small copy, just 10 bytes. make_test(hipMemset --N 10013 --memsetval 0x5a ) # oddball size. -make_test(hipMemset --N 500M --memsetval 0xa6 ) # big copy +make_test(hipMemset --N 256M --memsetval 0xa6 ) # big copy make_test(hipGridLaunch " " ) make_test(hipMemcpy " " ) diff --git a/projects/hip/tests/src/hipGridLaunch.cpp b/projects/hip/tests/src/hipGridLaunch.cpp index 4502446b3b..f13781362e 100644 --- a/projects/hip/tests/src/hipGridLaunch.cpp +++ b/projects/hip/tests/src/hipGridLaunch.cpp @@ -37,7 +37,7 @@ __device__ int foo(int i) //Syntax we would like to support with GRID_LAUNCH enabled: template __global__ void -vectorADD2( grid_launch_parm lp, +vectorADD2( hipLaunchParm lp, T *A_d, T *B_d, T *C_d, diff --git a/projects/hip/tests/src/hip_anyall.cpp b/projects/hip/tests/src/hip_anyall.cpp index e126541766..c6ac36e2e3 100644 --- a/projects/hip/tests/src/hip_anyall.cpp +++ b/projects/hip/tests/src/hip_anyall.cpp @@ -39,12 +39,17 @@ __global__ void int main(int argc, char *argv[]) -{ - +{ int warpSize; + hipDeviceProp_t devProp; + hipDeviceGetProperties(&devProp, 0); + if(strncmp(devProp.name,"Fiji",1)==0) warpSize =64; + 
else warpSize =32; + int anycount =0; + int allcount =0; int Num_Threads_per_Block = 1024; int Num_Blocks_per_Grid = 1; - int Num_Warps_per_Block = Num_Threads_per_Block/64; - int Num_Warps_per_Grid = (Num_Threads_per_Block*Num_Blocks_per_Grid)/64; + int Num_Warps_per_Block = Num_Threads_per_Block/warpSize; + int Num_Warps_per_Grid = (Num_Threads_per_Block*Num_Blocks_per_Grid)/warpSize; int * host_any = ( int*)malloc(Num_Warps_per_Grid*sizeof(int)); int * host_all = ( int*)malloc(Num_Warps_per_Grid*sizeof(int)); @@ -69,10 +74,11 @@ for (int i=0; i Date: Wed, 10 Feb 2016 20:05:59 +0800 Subject: [PATCH 03/94] Updated readme for test [ROCm/hip commit: 6f2a94fb93ff93a7b777be7f094e43dbb0259d0d] --- projects/hip/tests/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/projects/hip/tests/README.md b/projects/hip/tests/README.md index 48b88505ef..9c6929761c 100644 --- a/projects/hip/tests/README.md +++ b/projects/hip/tests/README.md @@ -2,9 +2,9 @@ Tests uses CMAKE as teh build infrastructure. Use : -> mkdir build -> cd build -> cmake ../src +> cd src +> cmake . 
+> make > make test From 4c861f028401edd481f1a3f3ed3682a5a36c047f Mon Sep 17 00:00:00 2001 From: streamhsa Date: Wed, 10 Feb 2016 21:02:52 +0800 Subject: [PATCH 04/94] Remove test for atomicInc and atomicDec [ROCm/hip commit: 51bff8757cd57bd437b552da64f583d052f5080e] --- .../hip/tests/src/hipSimpleAtomicsTest.cpp | 4 ++-- projects/hip/tests/src/hip_anyall.cpp | 19 +++++++++++-------- projects/hip/tests/src/hip_ballot.cpp | 14 +++++++------- 3 files changed, 20 insertions(+), 17 deletions(-) diff --git a/projects/hip/tests/src/hipSimpleAtomicsTest.cpp b/projects/hip/tests/src/hipSimpleAtomicsTest.cpp index f492643e41..f0ae0f582f 100644 --- a/projects/hip/tests/src/hipSimpleAtomicsTest.cpp +++ b/projects/hip/tests/src/hipSimpleAtomicsTest.cpp @@ -216,11 +216,11 @@ __global__ void testKernel(hipLaunchParm lp,int *g_odata) // Atomic increment (modulo 17+1) //atomicInc((unsigned int *)&g_odata[5], 17); - atomicInc((unsigned int *)&g_odata[5]); + //atomicInc((unsigned int *)&g_odata[5]); // Atomic decrement // atomicDec((unsigned int *)&g_odata[6], 137); - atomicDec((unsigned int *)&g_odata[6]); + //atomicDec((unsigned int *)&g_odata[6]); // Atomic compare-and-swap atomicCAS(&g_odata[7], tid-1, tid); diff --git a/projects/hip/tests/src/hip_anyall.cpp b/projects/hip/tests/src/hip_anyall.cpp index c6ac36e2e3..52a2a13db9 100644 --- a/projects/hip/tests/src/hip_anyall.cpp +++ b/projects/hip/tests/src/hip_anyall.cpp @@ -28,22 +28,25 @@ THE SOFTWARE. 
#define HIP_ASSERT(x) (assert((x)==hipSuccess)) __global__ void - warpvote(hipLaunchParm lp, int* device_any, int* device_all , int Num_Warps_per_Block) + warpvote(hipLaunchParm lp, int* device_any, int* device_all , int Num_Warps_per_Block, int pshift) { int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; - device_any[hipThreadIdx_x>>6] = __any(tid -77); - device_all[hipThreadIdx_x>>6] = __all(tid -77); + device_any[hipThreadIdx_x>>pshift] = __any(tid -77); + device_all[hipThreadIdx_x>>pshift] = __all(tid -77); } int main(int argc, char *argv[]) -{ int warpSize; +{ int warpSize, pshift; hipDeviceProp_t devProp; hipDeviceGetProperties(&devProp, 0); - if(strncmp(devProp.name,"Fiji",1)==0) warpSize =64; - else warpSize =32; + if(strncmp(devProp.name,"Fiji",1)==0) +{ warpSize =64; + pshift =6; +} + else {warpSize =32; pshift=5;} int anycount =0; int allcount =0; int Num_Threads_per_Block = 1024; @@ -65,7 +68,7 @@ for (int i=0; i> 6; + const unsigned int warp_num = hipThreadIdx_x >> pshift; atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popcll(__ballot(tid - 245))); } int main(int argc, char *argv[]) -{ int warpSize; +{ int warpSize, pshift; hipDeviceProp_t devProp; hipDeviceGetProperties(&devProp, 0); - if(strncmp(devProp.name,"Fiji",1)==0) warpSize =64; - else warpSize =32; + if(strncmp(devProp.name,"Fiji",1)==0) {warpSize =64; pshift =6;} + else {warpSize =32; pshift =5;} unsigned int Num_Threads_per_Block = 512; unsigned int Num_Blocks_per_Grid = 1; unsigned int Num_Warps_per_Block = Num_Threads_per_Block/warpSize; @@ -33,7 +33,7 @@ int main(int argc, char *argv[]) HIP_ASSERT(hipMemcpy(device_ballot, host_ballot, Num_Warps_per_Grid*sizeof(unsigned int), hipMemcpyHostToDevice)); - hipLaunchKernel(gpu_ballot, dim3(Num_Blocks_per_Grid),dim3(Num_Threads_per_Block),0,0, device_ballot,Num_Warps_per_Block); + hipLaunchKernel(gpu_ballot, dim3(Num_Blocks_per_Grid),dim3(Num_Threads_per_Block),0,0, device_ballot,Num_Warps_per_Block,pshift); 
HIP_ASSERT(hipMemcpy(host_ballot, device_ballot, Num_Warps_per_Grid*sizeof(unsigned int), hipMemcpyDeviceToHost)); @@ -45,7 +45,7 @@ int main(int argc, char *argv[]) divergent_count++;} } -if (divergent_count==1) printf("PASSED"); else printf("FAILED"); +if (divergent_count==1) printf("PASSED\n"); else printf("FAILED\n"); return EXIT_SUCCESS; } From 3a032ff31760e56562bfd3e56e2741ccbe31606e Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Wed, 10 Feb 2016 17:21:18 +0300 Subject: [PATCH 05/94] Formatting, no functional changes [ROCm/hip commit: 254da4ec538fcaecb60f0a465cbe527ee6d998da] --- projects/hip/include/hip_runtime_api.h | 142 ++++++++++++------------- 1 file changed, 69 insertions(+), 73 deletions(-) diff --git a/projects/hip/include/hip_runtime_api.h b/projects/hip/include/hip_runtime_api.h index 75f5807b6e..9e86bfc034 100644 --- a/projects/hip/include/hip_runtime_api.h +++ b/projects/hip/include/hip_runtime_api.h @@ -33,34 +33,34 @@ THE SOFTWARE. #include typedef struct { - // 32-bit Atomics: - unsigned hasGlobalInt32Atomics : 1; ///< 32-bit integer atomics for global memory - unsigned hasGlobalFloatAtomicExch : 1; ///< 32-bit float atomic exch for global memory - unsigned hasSharedInt32Atomics : 1; ///< 32-bit integer atomics for shared memory - unsigned hasSharedFloatAtomicExch : 1; ///< 32-bit float atomic exch for shared memory - unsigned hasFloatAtomicAdd : 1; ///< 32-bit float atomic add in global and shared memory + // 32-bit Atomics + unsigned hasGlobalInt32Atomics : 1; ///< 32-bit integer atomics for global memory. + unsigned hasGlobalFloatAtomicExch : 1; ///< 32-bit float atomic exch for global memory. + unsigned hasSharedInt32Atomics : 1; ///< 32-bit integer atomics for shared memory. + unsigned hasSharedFloatAtomicExch : 1; ///< 32-bit float atomic exch for shared memory. + unsigned hasFloatAtomicAdd : 1; ///< 32-bit float atomic add in global and shared memory. 
- // 64-bit Atomics: - unsigned hasGlobalInt64Atomics : 1; ///< 64-bit integer atomics for global memory - unsigned hasSharedInt64Atomics : 1; ///< 64-bit integer atomics for shared memory + // 64-bit Atomics + unsigned hasGlobalInt64Atomics : 1; ///< 64-bit integer atomics for global memory. + unsigned hasSharedInt64Atomics : 1; ///< 64-bit integer atomics for shared memory. // Doubles - unsigned hasDoubles : 1; ///< double-precision floating point. + unsigned hasDoubles : 1; ///< Double-precision floating point. - // Warp cross-lane operations: - unsigned hasWarpVote : 1; ///< warp vote instructions (__any, __all) - unsigned hasWarpBallot : 1; ///< warp ballot instructions (__ballot) - unsigned hasWarpShuffle : 1; ///< warp shuffle operations. (__shfl_*) - unsigned hasFunnelShift : 1; ///< funnel two words into one, with shift&mask caps + // Warp cross-lane operations + unsigned hasWarpVote : 1; ///< Warp vote instructions (__any, __all). + unsigned hasWarpBallot : 1; ///< Warp ballot instructions (__ballot). + unsigned hasWarpShuffle : 1; ///< Warp shuffle operations. (__shfl_*). + unsigned hasFunnelShift : 1; ///< Funnel two words into one with shift&mask caps. // Sync - unsigned hasThreadFenceSystem : 1; ///< __threadfence_system - unsigned hasSyncThreadsExt : 1; ///< __syncthreads_count, syncthreads_and, syncthreads_or + unsigned hasThreadFenceSystem : 1; ///< __threadfence_system. + unsigned hasSyncThreadsExt : 1; ///< __syncthreads_count, syncthreads_and, syncthreads_or. // Misc - unsigned hasSurfaceFuncs : 1; ///< Surface functions - unsigned has3dGrid : 1; ///< Grid and group dims are 3D (rather than 2D) - unsigned hasDynamicParallelism : 1; ///< Dynamic parallelism + unsigned hasSurfaceFuncs : 1; ///< Surface functions. + unsigned has3dGrid : 1; ///< Grid and group dims are 3D (rather than 2D). + unsigned hasDynamicParallelism : 1; ///< Dynamic parallelism. 
} hipDeviceArch_t; @@ -72,28 +72,25 @@ typedef struct { * */ typedef struct hipDeviceProp_t { - char name[256]; ///< Device name - size_t totalGlobalMem; ///< Size of global memory region (in bytes) - size_t sharedMemPerBlock; ///< Size of shared memory region (in bytes) - int regsPerBlock ; ///< registers per block - int warpSize ; ///< warp size - int maxThreadsPerBlock; ///< max work items per work group or workgroup max size - int maxThreadsDim[3]; ///< max number of threads in each dimension (XYZ) of a block - int maxGridSize[3]; ///< max grid dimensions (XYZ) - int clockRate ; ///< max clock frequency of the multiProcessors, in khz. - - size_t totalConstMem; ///< Size of shared memory region (in bytes) - int major ; ///< Major compute capability. On HCC, this is an approximation and features may differ from CUDA CC. See the arch feature flags for portable ways to query feature caps. - int minor; ///< Minor compute capability. On HCC, this is an approximation and features may differ from CUDA CC. See the arch feature flags for portable ways to query feature caps. - int multiProcessorCount; ///< number of multi-processors (compute units) - int l2CacheSize; ///< L2 cache size - int maxThreadsPerMultiProcessor; ///< Maximum resident threads per multi-processor - int computeMode; ///< Compute mode - - int clockInstructionRate ; ///< Frequency in khz of the timer used by the device-side "clock*" instructions. New for HIP. - - hipDeviceArch_t arch; ///< Architectural feature flags. New for HIP. - int concurrentKernels; ///< Device can possibly execute multiple kernels concurrently + char name[256]; ///< Device name. + size_t totalGlobalMem; ///< Size of global memory region (in bytes). + size_t sharedMemPerBlock; ///< Size of shared memory region (in bytes). + int regsPerBlock; ///< Registers per block. + int warpSize; ///< Warp size. + int maxThreadsPerBlock; ///< Max work items per work group or workgroup max size. 
+ int maxThreadsDim[3]; ///< Max number of threads in each dimension (XYZ) of a block. + int maxGridSize[3]; ///< Max grid dimensions (XYZ). + int clockRate; ///< Max clock frequency of the multiProcessors, in khz. + size_t totalConstMem; ///< Size of shared memory region (in bytes). + int major; ///< Major compute capability. On HCC, this is an approximation and features may differ from CUDA CC. See the arch feature flags for portable ways to query feature caps. + int minor; ///< Minor compute capability. On HCC, this is an approximation and features may differ from CUDA CC. See the arch feature flags for portable ways to query feature caps. + int multiProcessorCount; ///< Number of multi-processors (compute units). + int l2CacheSize; ///< L2 cache size. + int maxThreadsPerMultiProcessor; ///< Maximum resident threads per multi-processor. + int computeMode; ///< Compute mode. + int clockInstructionRate; ///< Frequency in khz of the timer used by the device-side "clock*" instructions. New for HIP. + hipDeviceArch_t arch; ///< Architectural feature flags. New for HIP. + int concurrentKernels; ///< Device can possibly execute multiple kernels concurrently. } hipDeviceProp_t; @@ -111,19 +108,18 @@ typedef struct hipDeviceProp_t { * @ingroup Enumerations */ typedef enum hipError_t { - hipSuccess = 0 ///< Successful completion. - ,hipErrorMemoryAllocation ///< Memory allocation error. - ,hipErrorMemoryFree ///< Memory free error. - ,hipErrorUnknownSymbol ///< Unknown symbol - ,hipErrorOutOfResources ///< Out of resources error - ,hipErrorInvalidValue ///< One or more of the parameters passed to the API call is NULL or not in an acceptable range. - ,hipErrorInvalidResourceHandle ///< Resource handle (hipEvent_t or hipStream_t) invalid. - ,hipErrorInvalidDevice ///< DeviceID must be in range 0...#compute-devices. 
- ,hipErrorNoDevice ///< Call to hipGetDeviceCount returned 0 devices - ,hipErrorNotReady ///< indicates that asynchronous operations enqueued earlier are not ready. This is not actually an error, but is used to distinguish from hipSuccess (which indicates completion). APIs that return this error include hipEventQuery and hipStreamQuery. - - ,hipErrorUnknown ///< Unknown error - ,hipErrorTbd ///< Marker that more error codes are needed. + hipSuccess = 0 ///< Successful completion. + ,hipErrorMemoryAllocation ///< Memory allocation error. + ,hipErrorMemoryFree ///< Memory free error. + ,hipErrorUnknownSymbol ///< Unknown symbol. + ,hipErrorOutOfResources ///< Out of resources error. + ,hipErrorInvalidValue ///< One or more of the parameters passed to the API call is NULL or not in an acceptable range. + ,hipErrorInvalidResourceHandle ///< Resource handle (hipEvent_t or hipStream_t) invalid. + ,hipErrorInvalidDevice ///< DeviceID must be in range 0...#compute-devices. + ,hipErrorNoDevice ///< Call to hipGetDeviceCount returned 0 devices + ,hipErrorNotReady ///< Indicates that asynchronous operations enqueued earlier are not ready. This is not actually an error, but is used to distinguish from hipSuccess (which indicates completion). APIs that return this error include hipEventQuery and hipStreamQuery. + ,hipErrorUnknown ///< Unknown error. + ,hipErrorTbd ///< Marker that more error codes are needed. } hipError_t; /* @@ -132,24 +128,24 @@ typedef enum hipError_t { * @ingroup Enumerations */ typedef enum hipDeviceAttribute_t { - hipDeviceAttributeMaxThreadsPerBlock, ///< Maximum number of threads per block. - hipDeviceAttributeMaxBlockDimX, ///< Maximum x-dimension of a block. - hipDeviceAttributeMaxBlockDimY, ///< Maximum y-dimension of a block. - hipDeviceAttributeMaxBlockDimZ, ///< Maximum z-dimension of a block. - hipDeviceAttributeMaxGridDimX, ///< Maximum x-dimension of a grid. - hipDeviceAttributeMaxGridDimY, ///< Maximum y-dimension of a grid. 
- hipDeviceAttributeMaxGridDimZ, ///< Maximum z-dimension of a grid. - hipDeviceAttributeMaxSharedMemoryPerBlock, ///< Maximum shared memory available per block in bytes. - hipDeviceAttributeTotalConstantMemory, ///< Constant memory size in bytes. - hipDeviceAttributeWarpSize, ///< Warp size in threads. - hipDeviceAttributeMaxRegistersPerBlock, ///< Maximum number of 32-bit registers available to a thread block. This number is shared by all thread blocks simultaneously resident on a multiprocessor. - hipDeviceAttributeClockRate, ///< Peak clock frequency in kilohertz. - hipDeviceAttributeMultiprocessorCount, ///< Number of multiprocessors on the device. - hipDeviceAttributeComputeMode, ///< Compute mode that device is currently in. - hipDeviceAttributeL2CacheSize, ///< Size of L2 cache in bytes. 0 if the device doesn't have L2 cache. - hipDeviceAttributeMaxThreadsPerMultiProcessor, ///< Maximum resident threads per multiprocessor. - hipDeviceAttributeComputeCapabilityMajor, ///< Major compute capability version number. - hipDeviceAttributeComputeCapabilityMinor, ///< Minor compute capability version number. + hipDeviceAttributeMaxThreadsPerBlock, ///< Maximum number of threads per block. + hipDeviceAttributeMaxBlockDimX, ///< Maximum x-dimension of a block. + hipDeviceAttributeMaxBlockDimY, ///< Maximum y-dimension of a block. + hipDeviceAttributeMaxBlockDimZ, ///< Maximum z-dimension of a block. + hipDeviceAttributeMaxGridDimX, ///< Maximum x-dimension of a grid. + hipDeviceAttributeMaxGridDimY, ///< Maximum y-dimension of a grid. + hipDeviceAttributeMaxGridDimZ, ///< Maximum z-dimension of a grid. + hipDeviceAttributeMaxSharedMemoryPerBlock, ///< Maximum shared memory available per block in bytes. + hipDeviceAttributeTotalConstantMemory, ///< Constant memory size in bytes. + hipDeviceAttributeWarpSize, ///< Warp size in threads. + hipDeviceAttributeMaxRegistersPerBlock, ///< Maximum number of 32-bit registers available to a thread block. 
This number is shared by all thread blocks simultaneously resident on a multiprocessor. + hipDeviceAttributeClockRate, ///< Peak clock frequency in kilohertz. + hipDeviceAttributeMultiprocessorCount, ///< Number of multiprocessors on the device. + hipDeviceAttributeComputeMode, ///< Compute mode that device is currently in. + hipDeviceAttributeL2CacheSize, ///< Size of L2 cache in bytes. 0 if the device doesn't have L2 cache. + hipDeviceAttributeMaxThreadsPerMultiProcessor, ///< Maximum resident threads per multiprocessor. + hipDeviceAttributeComputeCapabilityMajor, ///< Major compute capability version number. + hipDeviceAttributeComputeCapabilityMinor, ///< Minor compute capability version number. } hipDeviceAttribute_t; /** From f8290141b16c675764f0a75542c89a4dc16ab314 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Thu, 11 Feb 2016 13:06:58 +0530 Subject: [PATCH 06/94] Updated readme for test [ROCm/hip commit: a87c7988e898f452cb966f778067d6290fcef6aa] --- projects/hip/tests/README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/projects/hip/tests/README.md b/projects/hip/tests/README.md index 9c6929761c..30c7173b35 100644 --- a/projects/hip/tests/README.md +++ b/projects/hip/tests/README.md @@ -2,8 +2,9 @@ Tests uses CMAKE as teh build infrastructure. Use : -> cd src -> cmake . 
+> mkdir build +> cd build +> cmake ../src > make > make test From 286f4e783c5c1e62c0ae9d20cfff362e457c7aa6 Mon Sep 17 00:00:00 2001 From: sunway513 Date: Thu, 11 Feb 2016 22:22:00 +0530 Subject: [PATCH 07/94] Add reminder to keep ROCR runtime on the system library path [ROCm/hip commit: d15f22113f9967547d1489945e859741152a93d1] --- projects/hip/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/projects/hip/README.md b/projects/hip/README.md index 309951ddb8..9e52b7cd20 100644 --- a/projects/hip/README.md +++ b/projects/hip/README.md @@ -30,6 +30,7 @@ HIP code can be developed either on AMD HSA or Boltzmann platform using hcc comp * Install [hcc](https://bitbucket.org/multicoreware/hcc/wiki/Home) including supporting HSA kernel and runtime driver stack * By default HIP looks for hcc in /opt/hcc (can be overridden by setting HCC_HOME environment variable) * By default HIP looks for HSA in /opt/hsa (can be overridden by setting HSA_PATH environment variable) +* Ensure that ROCR runtime is installed and added to LD_LIBRARY_PATH #### NVIDIA (nvcc) * Install CUDA SDK from manufacturer website From 4eade0ce83845ffe97515fb453608a8ef9876f09 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Thu, 11 Feb 2016 22:26:01 +0300 Subject: [PATCH 08/94] BDFID (BusID/DeviceID/FunctionID) support. Except FunctionID (or DomainID in CUDA) support, because cudaDeviceProp::pciDomainID is not reported by CUDA. 
[ROCm/hip commit: 33f60c300d19ac05be030d6141e85de4fd300d69] --- projects/hip/include/hip_runtime_api.h | 4 ++++ projects/hip/include/nvcc_detail/hip_runtime_api.h | 6 ++++-- projects/hip/src/hip_hcc.cpp | 13 +++++++++++++ projects/hip/tests/src/hipGetDeviceAttribute.cpp | 3 ++- 4 files changed, 23 insertions(+), 3 deletions(-) diff --git a/projects/hip/include/hip_runtime_api.h b/projects/hip/include/hip_runtime_api.h index 9e86bfc034..ca02197ac1 100644 --- a/projects/hip/include/hip_runtime_api.h +++ b/projects/hip/include/hip_runtime_api.h @@ -91,6 +91,8 @@ typedef struct hipDeviceProp_t { int clockInstructionRate; ///< Frequency in khz of the timer used by the device-side "clock*" instructions. New for HIP. hipDeviceArch_t arch; ///< Architectural feature flags. New for HIP. int concurrentKernels; ///< Device can possibly execute multiple kernels concurrently. + int pciBusID; ///< PCI Bus ID. + int pciDeviceID; ///< PCI Device ID. } hipDeviceProp_t; @@ -146,6 +148,8 @@ typedef enum hipDeviceAttribute_t { hipDeviceAttributeMaxThreadsPerMultiProcessor, ///< Maximum resident threads per multiprocessor. hipDeviceAttributeComputeCapabilityMajor, ///< Major compute capability version number. hipDeviceAttributeComputeCapabilityMinor, ///< Minor compute capability version number. + hipDeviceAttributePciBusId, ///< PCI Bus ID. + hipDeviceAttributePciDeviceId, ///< PCI Device ID. 
} hipDeviceAttribute_t; /** diff --git a/projects/hip/include/nvcc_detail/hip_runtime_api.h b/projects/hip/include/nvcc_detail/hip_runtime_api.h index 4c9b35cab8..63e3c9983b 100644 --- a/projects/hip/include/nvcc_detail/hip_runtime_api.h +++ b/projects/hip/include/nvcc_detail/hip_runtime_api.h @@ -252,8 +252,10 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att cdattr = cudaDevAttrMaxThreadsPerMultiProcessor; break; case hipDeviceAttributeComputeCapabilityMajor: cdattr = cudaDevAttrComputeCapabilityMajor; break; - case hipDeviceAttributeComputeCapabilityMinor: - cdattr = cudaDevAttrComputeCapabilityMinor; break; + case hipDeviceAttributePciBusId: + cdattr = cudaDevAttrPciBusId; break; + case hipDeviceAttributePciDeviceId: + cdattr = cudaDevAttrPciDeviceId; break; default: cerror = cudaErrorInvalidValue; break; } diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 4b7b53550d..e367fe308b 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -310,7 +310,16 @@ hipError_t ihipDevice_t::getProperties(hipDeviceProp_t* prop) //prop->clockInstructionRate = counterHz / 1000; prop->clockInstructionRate = 100*1000; /* TODO-RT - hard-code until HSART has function to properly report clock */ + // Get Agent BDFID (bus/device/function ID) + uint16_t bdf_id = 1; + err = hsa_agent_get_info(_hsa_agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_BDFID, &bdf_id); + DeviceErrorCheck(err); + // BDFID is 16bit uint: [8bit - BusID | 5bit - Device ID | 3bit - Function/DomainID] + // TODO/Clarify: cudaDeviceProp::pciDomainID how to report? + // prop->pciDomainID = bdf_id & 0x7; + prop->pciDeviceID = (bdf_id>>3) & 0x1F; + prop->pciBusID = (bdf_id>>8) & 0xFF; // Masquerade as a 3.0-level device. This will change as more HW functions are properly supported. // Application code should use the arch.has* to do detailed feature detection. 
@@ -839,6 +848,10 @@ hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device) *pi = prop->major; break; case hipDeviceAttributeComputeCapabilityMinor: *pi = prop->minor; break; + case hipDeviceAttributePciBusId: + *pi = prop->pciBusID; break; + case hipDeviceAttributePciDeviceId: + *pi = prop->pciDeviceID; break; default: e = hipErrorInvalidValue; break; } diff --git a/projects/hip/tests/src/hipGetDeviceAttribute.cpp b/projects/hip/tests/src/hipGetDeviceAttribute.cpp index 62b6d432a0..6dc8861159 100644 --- a/projects/hip/tests/src/hipGetDeviceAttribute.cpp +++ b/projects/hip/tests/src/hipGetDeviceAttribute.cpp @@ -73,7 +73,8 @@ int main(int argc, char *argv[]) CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMaxThreadsPerMultiProcessor, props.maxThreadsPerMultiProcessor)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeComputeCapabilityMajor, props.major)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeComputeCapabilityMinor, props.minor)); - + CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributePciBusId, props.pciBusID)); + CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributePciDeviceId, props.pciDeviceID)); passed(); }; From c587f89de7cb0970daaf66d754455d2c18dffce6 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Fri, 12 Feb 2016 00:04:14 +0300 Subject: [PATCH 09/94] Device property maxThreadsPerMultiProcessor set equal to totalGlobalMem (HIP path). Reason: maxThreadsPerMultiProcessor should be as the same as group memory size. Group memory will not be paged out, so, the physical memory size = total shared memory size = group region size. NVCC path remains untouched: CUDA's device property maxThreadsPerMultiProcessor is reported. 
[ROCm/hip commit: 9f05a52c74048b58fe7463ca0514b3255adf2165] --- projects/hip/src/hip_hcc.cpp | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index e367fe308b..47273328bf 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -342,29 +342,18 @@ hipError_t ihipDevice_t::getProperties(hipDeviceProp_t* prop) Default compute mode (Multiple threads can use cudaSetDevice() with this device) */ prop->computeMode = 0; - - -/* HsaSystemProperties props; - hsaKmtReleaseSystemProperties(); - if(HSAKMT_STATUS_SUCCESS == hsaKmtAcquireSystemProperties(&props)) - { - HsaNodeProperties node_prop = {0}; - if(HSAKMT_STATUS_SUCCESS == hsaKmtGetNodeProperties(node, &node_prop)) - { - uint32_t waves_per_cu = node_prop.MaxWavesPerSIMD; - prop-> maxThreadsPerMultiProcessor = prop->warpsize*waves_per_cu; - } - } */ - - // get memory properties */ + // Get memory properties err = hsa_agent_iterate_regions(_hsa_agent,get_region_info,prop); DeviceErrorCheck(err); - // Get the size of the region we are using for Accelerator Memory allocations: hsa_region_t *am_region = static_cast (_acc.get_hsa_am_region()); err = hsa_region_get_info(*am_region, HSA_REGION_INFO_SIZE, &(prop->totalGlobalMem)); + DeviceErrorCheck(err); + // maxThreadsPerMultiProcessor should be as the same as group memory size. + // Group memory will not be paged out, so, the physical memory size is the total shared memory size, and also equal to the group region size. + prop->maxThreadsPerMultiProcessor = prop->totalGlobalMem; // Set feature flags - these are all mandatory for HIP on HCC path: // Some features are under-development and future revs may support flags that are currently 0. 
From fcd154097f11e08704499139870d5b6891ce88f1 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Fri, 12 Feb 2016 01:29:20 +0300 Subject: [PATCH 10/94] Fix typo: maxThreadsPerMultiProcessor -> MaxSharedMemoryPerMultiprocessor Device property MaxSharedMemoryPerMultiprocessor set equal to totalGlobalMem (HIP path). Reason: MaxSharedMemoryPerMultiprocessor should be as the same as group memory size. Group memory will not be paged out, so, the physical memory size = total shared memory size = group region size. NVCC path remains untouched: CUDA's device property MaxSharedMemoryPerMultiprocessor is reported. hipify is updated as well. [ROCm/hip commit: ea8f99702d473ee5c495783e80d6ac6be539302f] --- projects/hip/bin/hipify | 3 + projects/hip/include/hip_runtime_api.h | 84 ++++++++++--------- .../hip/include/nvcc_detail/hip_runtime_api.h | 2 + projects/hip/src/hip_hcc.cpp | 19 ++++- .../hip/tests/src/hipGetDeviceAttribute.cpp | 1 + 5 files changed, 66 insertions(+), 43 deletions(-) diff --git a/projects/hip/bin/hipify b/projects/hip/bin/hipify index f4de89aab2..d143bdff37 100755 --- a/projects/hip/bin/hipify +++ b/projects/hip/bin/hipify @@ -364,6 +364,9 @@ while (@ARGV) { $ft{'err'} += s/\bcudaDevAttrMaxThreadsPerMultiProcessor\b/hipDeviceAttributeMaxThreadsPerMultiProcessor/g; $ft{'err'} += s/\bcudaDevAttrComputeCapabilityMajor\b/hipDeviceAttributeComputeCapabilityMajor/g; $ft{'err'} += s/\bcudaDevAttrComputeCapabilityMinor\b/hipDeviceAttributeComputeCapabilityMinor/g; + $ft{'err'} += s/\bcudaDevAttrPciBusId\b/hipDeviceAttributePciBusId/g; + $ft{'err'} += s/\bcudaDevAttrPciDeviceId\b/hipDeviceAttributePciDeviceId/g; + $ft{'err'} += s/\bcudaDevAttrMaxSharedMemoryPerMultiprocessor\b/hipDeviceAttributeMaxSharedMemoryPerMultiprocessor/g; $ft{'dev'} += s/\bcudaDeviceAttr\b/hipDeviceAttribute_t/g; $ft{'dev'} += s/\bcudaDeviceGetAttribute\b/hipDeviceGetAttribute/g; diff --git a/projects/hip/include/hip_runtime_api.h b/projects/hip/include/hip_runtime_api.h index 
ca02197ac1..882103a1f4 100644 --- a/projects/hip/include/hip_runtime_api.h +++ b/projects/hip/include/hip_runtime_api.h @@ -72,27 +72,28 @@ typedef struct { * */ typedef struct hipDeviceProp_t { - char name[256]; ///< Device name. - size_t totalGlobalMem; ///< Size of global memory region (in bytes). - size_t sharedMemPerBlock; ///< Size of shared memory region (in bytes). - int regsPerBlock; ///< Registers per block. - int warpSize; ///< Warp size. - int maxThreadsPerBlock; ///< Max work items per work group or workgroup max size. - int maxThreadsDim[3]; ///< Max number of threads in each dimension (XYZ) of a block. - int maxGridSize[3]; ///< Max grid dimensions (XYZ). - int clockRate; ///< Max clock frequency of the multiProcessors, in khz. - size_t totalConstMem; ///< Size of shared memory region (in bytes). - int major; ///< Major compute capability. On HCC, this is an approximation and features may differ from CUDA CC. See the arch feature flags for portable ways to query feature caps. - int minor; ///< Minor compute capability. On HCC, this is an approximation and features may differ from CUDA CC. See the arch feature flags for portable ways to query feature caps. - int multiProcessorCount; ///< Number of multi-processors (compute units). - int l2CacheSize; ///< L2 cache size. - int maxThreadsPerMultiProcessor; ///< Maximum resident threads per multi-processor. - int computeMode; ///< Compute mode. - int clockInstructionRate; ///< Frequency in khz of the timer used by the device-side "clock*" instructions. New for HIP. - hipDeviceArch_t arch; ///< Architectural feature flags. New for HIP. - int concurrentKernels; ///< Device can possibly execute multiple kernels concurrently. - int pciBusID; ///< PCI Bus ID. - int pciDeviceID; ///< PCI Device ID. + char name[256]; ///< Device name. + size_t totalGlobalMem; ///< Size of global memory region (in bytes). + size_t sharedMemPerBlock; ///< Size of shared memory region (in bytes). 
+ int regsPerBlock; ///< Registers per block. + int warpSize; ///< Warp size. + int maxThreadsPerBlock; ///< Max work items per work group or workgroup max size. + int maxThreadsDim[3]; ///< Max number of threads in each dimension (XYZ) of a block. + int maxGridSize[3]; ///< Max grid dimensions (XYZ). + int clockRate; ///< Max clock frequency of the multiProcessors, in khz. + size_t totalConstMem; ///< Size of shared memory region (in bytes). + int major; ///< Major compute capability. On HCC, this is an approximation and features may differ from CUDA CC. See the arch feature flags for portable ways to query feature caps. + int minor; ///< Minor compute capability. On HCC, this is an approximation and features may differ from CUDA CC. See the arch feature flags for portable ways to query feature caps. + int multiProcessorCount; ///< Number of multi-processors (compute units). + int l2CacheSize; ///< L2 cache size. + int maxThreadsPerMultiProcessor; ///< Maximum resident threads per multi-processor. + int computeMode; ///< Compute mode. + int clockInstructionRate; ///< Frequency in khz of the timer used by the device-side "clock*" instructions. New for HIP. + hipDeviceArch_t arch; ///< Architectural feature flags. New for HIP. + int concurrentKernels; ///< Device can possibly execute multiple kernels concurrently. + int pciBusID; ///< PCI Bus ID. + int pciDeviceID; ///< PCI Device ID. + size_t maxSharedMemoryPerMultiProcessor; ///< Maximum Shared Memory Per Multiprocessor. } hipDeviceProp_t; @@ -130,26 +131,27 @@ typedef enum hipError_t { * @ingroup Enumerations */ typedef enum hipDeviceAttribute_t { - hipDeviceAttributeMaxThreadsPerBlock, ///< Maximum number of threads per block. - hipDeviceAttributeMaxBlockDimX, ///< Maximum x-dimension of a block. - hipDeviceAttributeMaxBlockDimY, ///< Maximum y-dimension of a block. - hipDeviceAttributeMaxBlockDimZ, ///< Maximum z-dimension of a block. - hipDeviceAttributeMaxGridDimX, ///< Maximum x-dimension of a grid. 
- hipDeviceAttributeMaxGridDimY, ///< Maximum y-dimension of a grid. - hipDeviceAttributeMaxGridDimZ, ///< Maximum z-dimension of a grid. - hipDeviceAttributeMaxSharedMemoryPerBlock, ///< Maximum shared memory available per block in bytes. - hipDeviceAttributeTotalConstantMemory, ///< Constant memory size in bytes. - hipDeviceAttributeWarpSize, ///< Warp size in threads. - hipDeviceAttributeMaxRegistersPerBlock, ///< Maximum number of 32-bit registers available to a thread block. This number is shared by all thread blocks simultaneously resident on a multiprocessor. - hipDeviceAttributeClockRate, ///< Peak clock frequency in kilohertz. - hipDeviceAttributeMultiprocessorCount, ///< Number of multiprocessors on the device. - hipDeviceAttributeComputeMode, ///< Compute mode that device is currently in. - hipDeviceAttributeL2CacheSize, ///< Size of L2 cache in bytes. 0 if the device doesn't have L2 cache. - hipDeviceAttributeMaxThreadsPerMultiProcessor, ///< Maximum resident threads per multiprocessor. - hipDeviceAttributeComputeCapabilityMajor, ///< Major compute capability version number. - hipDeviceAttributeComputeCapabilityMinor, ///< Minor compute capability version number. - hipDeviceAttributePciBusId, ///< PCI Bus ID. - hipDeviceAttributePciDeviceId, ///< PCI Device ID. + hipDeviceAttributeMaxThreadsPerBlock, ///< Maximum number of threads per block. + hipDeviceAttributeMaxBlockDimX, ///< Maximum x-dimension of a block. + hipDeviceAttributeMaxBlockDimY, ///< Maximum y-dimension of a block. + hipDeviceAttributeMaxBlockDimZ, ///< Maximum z-dimension of a block. + hipDeviceAttributeMaxGridDimX, ///< Maximum x-dimension of a grid. + hipDeviceAttributeMaxGridDimY, ///< Maximum y-dimension of a grid. + hipDeviceAttributeMaxGridDimZ, ///< Maximum z-dimension of a grid. + hipDeviceAttributeMaxSharedMemoryPerBlock, ///< Maximum shared memory available per block in bytes. + hipDeviceAttributeTotalConstantMemory, ///< Constant memory size in bytes. 
+ hipDeviceAttributeWarpSize, ///< Warp size in threads. + hipDeviceAttributeMaxRegistersPerBlock, ///< Maximum number of 32-bit registers available to a thread block. This number is shared by all thread blocks simultaneously resident on a multiprocessor. + hipDeviceAttributeClockRate, ///< Peak clock frequency in kilohertz. + hipDeviceAttributeMultiprocessorCount, ///< Number of multiprocessors on the device. + hipDeviceAttributeComputeMode, ///< Compute mode that device is currently in. + hipDeviceAttributeL2CacheSize, ///< Size of L2 cache in bytes. 0 if the device doesn't have L2 cache. + hipDeviceAttributeMaxThreadsPerMultiProcessor, ///< Maximum resident threads per multiprocessor. + hipDeviceAttributeComputeCapabilityMajor, ///< Major compute capability version number. + hipDeviceAttributeComputeCapabilityMinor, ///< Minor compute capability version number. + hipDeviceAttributePciBusId, ///< PCI Bus ID. + hipDeviceAttributePciDeviceId, ///< PCI Device ID. + hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, ///< Maximum Shared Memory Per Multiprocessor. 
} hipDeviceAttribute_t; /** diff --git a/projects/hip/include/nvcc_detail/hip_runtime_api.h b/projects/hip/include/nvcc_detail/hip_runtime_api.h index 63e3c9983b..f84de73872 100644 --- a/projects/hip/include/nvcc_detail/hip_runtime_api.h +++ b/projects/hip/include/nvcc_detail/hip_runtime_api.h @@ -256,6 +256,8 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att cdattr = cudaDevAttrPciBusId; break; case hipDeviceAttributePciDeviceId: cdattr = cudaDevAttrPciDeviceId; break; + case hipDeviceAttributeMaxSharedMemoryPerMultiprocessor: + cdattr = cudaDevAttrMaxSharedMemoryPerMultiprocessor; break; default: cerror = cudaErrorInvalidValue; break; } diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 47273328bf..bdfbdb230b 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -342,6 +342,19 @@ hipError_t ihipDevice_t::getProperties(hipDeviceProp_t* prop) Default compute mode (Multiple threads can use cudaSetDevice() with this device) */ prop->computeMode = 0; + // Get Max Threads Per Multiprocessor +/* + HsaSystemProperties props; + hsaKmtReleaseSystemProperties(); + if(HSAKMT_STATUS_SUCCESS == hsaKmtAcquireSystemProperties(&props)) { + HsaNodeProperties node_prop = {0}; + if(HSAKMT_STATUS_SUCCESS == hsaKmtGetNodeProperties(node, &node_prop)) { + uint32_t waves_per_cu = node_prop.MaxWavesPerSIMD; + prop-> maxThreadsPerMultiProcessor = prop->warpsize*waves_per_cu; + } + } +*/ + // Get memory properties err = hsa_agent_iterate_regions(_hsa_agent,get_region_info,prop); @@ -351,9 +364,9 @@ hipError_t ihipDevice_t::getProperties(hipDeviceProp_t* prop) hsa_region_t *am_region = static_cast (_acc.get_hsa_am_region()); err = hsa_region_get_info(*am_region, HSA_REGION_INFO_SIZE, &(prop->totalGlobalMem)); DeviceErrorCheck(err); - // maxThreadsPerMultiProcessor should be as the same as group memory size. + // maxSharedMemoryPerMultiProcessor should be as the same as group memory size. 
// Group memory will not be paged out, so, the physical memory size is the total shared memory size, and also equal to the group region size. - prop->maxThreadsPerMultiProcessor = prop->totalGlobalMem; + prop->maxSharedMemoryPerMultiProcessor = prop->totalGlobalMem; // Set feature flags - these are all mandatory for HIP on HCC path: // Some features are under-development and future revs may support flags that are currently 0. @@ -841,6 +854,8 @@ hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device) *pi = prop->pciBusID; break; case hipDeviceAttributePciDeviceId: *pi = prop->pciDeviceID; break; + case hipDeviceAttributeMaxSharedMemoryPerMultiprocessor: + *pi = prop->maxSharedMemoryPerMultiProcessor; break; default: e = hipErrorInvalidValue; break; } diff --git a/projects/hip/tests/src/hipGetDeviceAttribute.cpp b/projects/hip/tests/src/hipGetDeviceAttribute.cpp index 6dc8861159..30fac8c1b4 100644 --- a/projects/hip/tests/src/hipGetDeviceAttribute.cpp +++ b/projects/hip/tests/src/hipGetDeviceAttribute.cpp @@ -75,6 +75,7 @@ int main(int argc, char *argv[]) CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeComputeCapabilityMinor, props.minor)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributePciBusId, props.pciBusID)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributePciDeviceId, props.pciDeviceID)); + CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, props.maxSharedMemoryPerMultiProcessor)); passed(); }; From 99052abbdb2613bb97dca4479518dfc47ed76f84 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 12 Feb 2016 13:58:35 +0530 Subject: [PATCH 11/94] Updated integer intrinsics documentation [ROCm/hip commit: 2c0148013186ecdba6b0ae4181ff8facb4f9a68e] --- .../hip/docs/markdown/hip_kernel_language.md | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/projects/hip/docs/markdown/hip_kernel_language.md 
b/projects/hip/docs/markdown/hip_kernel_language.md index 582cb8788a..87215e2369 100644 --- a/projects/hip/docs/markdown/hip_kernel_language.md +++ b/projects/hip/docs/markdown/hip_kernel_language.md @@ -363,27 +363,27 @@ Following is the list of supported double precision mathematical functions. [1] __RETURN_TYPE is dependent on compiler. It is usually 'int' for C compilers and 'bool' for C++ compilers. [↩](#a2) ### Integer Intrinsics -Following is the list of supported integer intrinsics. +Following is the list of supported integer intrinsics. Note that intrinsics are supported on device only. -| **Function** | **Supported on Host** | **Supported on Device** | -| --- | --- | --- | -| unsigned int __brev ( unsigned int x )
Reverse the bit order of a 32 bit unsigned integer. | ✓ | ✓ | -| unsigned long long int __brevll ( unsigned long long int x )
Reverse the bit order of a 64 bit unsigned integer. | ✓ | ✓ | -| int __clz ( int x )
Return the number of consecutive high-order zero bits in a 32 bit integer. | ✓ | ✓ | -| unsigned int __clz(unsigned int x)
Return the number of consecutive high-order zero bits in 32 bit unsigned integer. | ✓ | ✗ | -| int __clzll ( long long int x )
Count the number of consecutive high-order zero bits in a 64 bit integer. | ✓ | ✓ | -| unsigned int __clzll(long long int x)
Return the number of consecutive high-order zero bits in 64 bit signed integer. | ✓ | ✗ | -| unsigned int __ffs(unsigned int x)
Find the position of least signigicant bit set to 1 in a 32 bit unsigned integer.[1](#f3) | ✓ | ✓| -| unsigned int __ffs(int x)
Find the position of least signigicant bit set to 1 in a 32 bit signed integer. | ✗ | ✓ | -| unsigned int __ffsll(unsigned long long int x)
Find the position of least signigicant bit set to 1 in a 64 bit unsigned integer.[1](#f3) | ✓ | ✓ | -| unsigned int __ffsll(long long int x)
Find the position of least signigicant bit set to 1 in a 64 bit signed integer. | ✗ | ✓ | -| unsigned int __popc ( unsigned int x )
Count the number of bits that are set to 1 in a 32 bit integer. | ✓ | ✓ | -| int __popcll ( unsigned long long int x )
Count the number of bits that are set to 1 in a 64 bit integer. | ✓ | ✓ | +| **Function** | +| --- | +| unsigned int __brev ( unsigned int x )
Reverse the bit order of a 32 bit unsigned integer. | +| unsigned long long int __brevll ( unsigned long long int x )
Reverse the bit order of a 64 bit unsigned integer. | +| int __clz ( int x )
Return the number of consecutive high-order zero bits in a 32 bit integer. | +| unsigned int __clz(unsigned int x)
Return the number of consecutive high-order zero bits in 32 bit unsigned integer. | +| int __clzll ( long long int x )
Count the number of consecutive high-order zero bits in a 64 bit integer. | +| unsigned int __clzll(long long int x)
Return the number of consecutive high-order zero bits in 64 bit signed integer. | +| unsigned int __ffs(unsigned int x)
Find the position of least significant bit set to 1 in a 32 bit unsigned integer.[1](#f3) | +| unsigned int __ffs(int x)
Find the position of least significant bit set to 1 in a 32 bit signed integer. | +| unsigned int __ffsll(unsigned long long int x)
Find the position of least significant bit set to 1 in a 64 bit unsigned integer.[1](#f3) | +| unsigned int __ffsll(long long int x)
Find the position of least significant bit set to 1 in a 64 bit signed integer. | +| unsigned int __popc ( unsigned int x )
Count the number of bits that are set to 1 in a 32 bit integer. | +| int __popcll ( unsigned long long int x )
Count the number of bits that are set to 1 in a 64 bit integer. | [1] The hcc implementation of __ffs() and __ffsll() contains code to add a constant +1 to produce the ffs result format. For the cases where this overhead is not acceptable and programmer is willing to specialize for the platform, hcc provides hc::__lastbit_u32_u32(unsigned int input) and hc::__lastbit_u32_u64(unsigned long long int input). -The index returned by __lastbit_ instructions starts at 0, while for ffs the index starts at 1. +The index returned by __lastbit_ instructions starts at -1, while for ffs the index starts at 0. ## Texture Functions From 5657d7bae7f111778d79b71ef69cf80a45272cbf Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 12 Feb 2016 14:21:58 +0530 Subject: [PATCH 12/94] Documented supported fastmath functions [ROCm/hip commit: 447201a6a0179add6159e1083525e5bbecab2be6] --- .../hip/docs/markdown/hip_kernel_language.md | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/projects/hip/docs/markdown/hip_kernel_language.md b/projects/hip/docs/markdown/hip_kernel_language.md index 87215e2369..9d20fe82a3 100644 --- a/projects/hip/docs/markdown/hip_kernel_language.md +++ b/projects/hip/docs/markdown/hip_kernel_language.md @@ -385,6 +385,28 @@ For the cases where this overhead is not acceptable and programmer is willing to hcc provides hc::__lastbit_u32_u32(unsigned int input) and hc::__lastbit_u32_u64(unsigned long long int input). The index returned by __lastbit_ instructions starts at -1, while for ffs the index starts at 0. +### Floating-point Intrinsics +Following is the list of supported floating-point intrinsics. Note that intrinsics are supported on device only. + +| **Function** | +| --- | +| float __cosf ( float x )
Calculate the fast approximate cosine of the input argument. | +| float __expf ( float x )
Calculate the fast approximate base e exponential of the input argument. | +| float __frsqrt_rn ( float x )
Compute `1 / √x` in round-to-nearest-even mode. | +| float __fsqrt_rd ( float x )
Compute `√x` in round-down mode. | +| float __fsqrt_rn ( float x )
Compute `√x` in round-to-nearest-even mode. | +| float __fsqrt_ru ( float x )
Compute `√x` in round-up mode. | +| float __fsqrt_rz ( float x )
Compute `√x` in round-towards-zero mode. | +| float __log10f ( float x )
Calculate the fast approximate base 10 logarithm of the input argument. | +| float __log2f ( float x )
Calculate the fast approximate base 2 logarithm of the input argument. | +| float __logf ( float x )
Calculate the fast approximate base e logarithm of the input argument. | +| float __powf ( float x, float y )
Calculate the fast approximate of x<sup>y</sup>. | +| float __sinf ( float x )
Calculate the fast approximate sine of the input argument. | +| float __tanf ( float x )
Calculate the fast approximate tangent of the input argument. | +| double __dsqrt_rd ( double x )
Compute `√x` in round-down mode. | +| double __dsqrt_rn ( double x )
Compute `√x` in round-to-nearest-even mode. | +| double __dsqrt_ru ( double x )
Compute `√x` in round-up mode. | +| double __dsqrt_rz ( double x )
Compute `√x` in round-towards-zero mode. | ## Texture Functions Texture functions are not supported. From 0a6e6e3b7ec4e5fdf81a9acad6db4fde6ce92a42 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 9 Feb 2016 08:39:08 -0600 Subject: [PATCH 13/94] Fix bug in device bounds comparison. Shows up in multi-GPU. [ROCm/hip commit: e483eea85b51a19d4fdbbaf310c95b9409cb6236] --- projects/hip/src/hip_hcc.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index bdfbdb230b..f9f3b1b1f5 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -40,7 +40,7 @@ THE SOFTWARE. #define USE_PINNED_HOST (__hcc_workweek__ >= 1601) -//#define USE_ASYNC_COPY +#define USE_ASYNC_COPY 0 #define INLINE static inline @@ -338,8 +338,7 @@ hipError_t ihipDevice_t::getProperties(hipDeviceProp_t* prop) DeviceErrorCheck(err); prop->l2CacheSize = cache_size[1]; - /* Computemode for HSA Devices is always : cudaComputeModeDefault :/ - Default compute mode (Multiple threads can use cudaSetDevice() with this device) */ + /* Computemode for HSA Devices is always : cudaComputeModeDefault */ prop->computeMode = 0; // Get Max Threads Per Multiprocessor @@ -760,7 +759,7 @@ hipError_t hipSetDevice(int device) { std::call_once(hip_initialized, ihipInit); - if ((device < 0) || (device > g_devices.size())) { + if ((device < 0) || (device >= g_devices.size())) { return ihipLogStatus(hipErrorInvalidDevice); } else { tls_defaultDevice = device; @@ -1428,7 +1427,7 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind hipError_t e = hipSuccess; -#ifdef USE_ASYNC_COPY +#if USE_ASYNC_COPY if (ihipIsValidDevice(stream->_device_index)) { ihipDevice_t *device = &g_devices[stream->_device_index]; From d4a90f8afd82b411e03affb9b06ba92dddb72de5 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Wed, 10 Feb 2016 11:52:42 -0600 Subject: [PATCH 14/94] Create address tracker for am_alloc. 
Tracks device where memory is allocated, pinned-host or device, and more. Uses memory-range-based lookups - so pointers that exist anywhere in the range of hostPtr + size will find the associated AmPointerInfo. The insertions and lookups use a self-balancing binary tree and should support O(logN) lookup speed. [ROCm/hip commit: 4ee2a5229b68df89ef1b82315057dce8c45cd1d2] --- projects/hip/bin/hipcc | 1 + projects/hip/bin/hipify | 2 + projects/hip/include/hcc_detail/AM.h | 92 +++++ .../hip/include/hcc_detail/hip_runtime_api.h | 8 + projects/hip/include/hip_runtime_api.h | 24 ++ projects/hip/src/hc_AM.cpp | 219 ++++++++++++ projects/hip/src/hip_hcc.cpp | 71 +++- projects/hip/tests/src/CMakeLists.txt | 2 + projects/hip/tests/src/hipPointerAttrib.cpp | 319 ++++++++++++++++++ projects/hip/tests/src/test_common.cpp | 14 +- projects/hip/tests/src/test_common.h | 2 + 11 files changed, 743 insertions(+), 11 deletions(-) create mode 100644 projects/hip/include/hcc_detail/AM.h create mode 100644 projects/hip/src/hc_AM.cpp create mode 100644 projects/hip/tests/src/hipPointerAttrib.cpp diff --git a/projects/hip/bin/hipcc b/projects/hip/bin/hipcc index d001c6febe..7537750ff6 100755 --- a/projects/hip/bin/hipcc +++ b/projects/hip/bin/hipcc @@ -71,6 +71,7 @@ if ($HIP_PLATFORM eq "hcc") { $HIPLDFLAGS .= " -L$HSA_PATH/lib -lhsa-runtime64 -lhc_am"; # Add C++ libs for GCC. 
$HIPLDFLAGS .= " -lstdc++"; + $HIPLDFLAGS .= " -lm"; if ($verbose & 0x2) { print ("HSA_PATH=$HSA_PATH\n"); diff --git a/projects/hip/bin/hipify b/projects/hip/bin/hipify index d143bdff37..e3b6c64c88 100755 --- a/projects/hip/bin/hipify +++ b/projects/hip/bin/hipify @@ -277,6 +277,8 @@ while (@ARGV) { $ft{'mem'} += s/\bcudaMemcpyKind\b/hipMemcpyKind/g; + $ft{'mem'} += s/\bcudaPointerAttributes\b/hipPointerAttribute_t/g; + #-------- # Memory management: diff --git a/projects/hip/include/hcc_detail/AM.h b/projects/hip/include/hcc_detail/AM.h new file mode 100644 index 0000000000..1cfcf2dfb2 --- /dev/null +++ b/projects/hip/include/hcc_detail/AM.h @@ -0,0 +1,92 @@ +#pragma once + +#include + +typedef int am_status_t; +#define AM_SUCCESS 0 +// TODO - provide better mapping of HSA error conditions to HC error codes. +#define AM_ERROR_MISC -1 /** Misellaneous error */ + +// Flags for am_alloc API: +#define amHostPinned 0x1 + + +namespace hc { + +// This is the data that is maintained for each pointer: +struct AmPointerInfo { + bool _isDeviceMem; + void * _hostPointer; + void * _devicePointer; + size_t _sizeBytes; + hc::accelerator _acc; + unsigned _allocationFlags; + + AmPointerInfo() {}; + + AmPointerInfo(bool isDeviceMem, void *hostPointer, void *devicePointer, size_t sizeBytes, hc::accelerator acc, unsigned allocationFlags) : + _isDeviceMem(isDeviceMem), + _hostPointer(hostPointer), + _devicePointer(devicePointer), + _sizeBytes(sizeBytes), + _acc(acc), + _allocationFlags(allocationFlags) {}; +}; +} + + + +namespace hc { + + +/** + * Allocates a block of @p size bytes of memory on the specified @p acc. + * + * The contents of the newly allocated block of memory are not initialized. + * + * If @p size == 0, 0 is returned. + * + * Flags must be 0. + * + * @returns : On success, pointer to the newly allocated memory is returned. + * The pointer is typecast to the desired return type. + * + * If an error occurred trying to allocate the requested memory, 0 is returned. 
+ * + * @see am_free, am_copy + */ +auto_voidp AM_alloc(size_t size, hc::accelerator acc, unsigned flags); + +/** + * Frees a block of memory previously allocated with am_alloc. + * + * @see am_alloc, am_copy + */ +am_status_t AM_free(void* ptr); + + +/** + * Copies @p size bytes of memory from @p src to @p dst. The memory areas (src+size and dst+size) must not overlap. + * + * @returns AM_SUCCESS on success or AM_ERROR_MISC if an error occurs. + * @see am_alloc, am_free + */ +am_status_t AM_copy(void* dst, const void* src, size_t size); + +am_status_t AM_get_pointer_info(hc::AmPointerInfo *info, void *ptr); + + +// TODO-implement these: +//am_status_t AM_track_pointer(void* ptr, size_t size, bool isDeviceMem=false, unsigned allocationFlags=0x0); +//am_status_t AM_untrack_pointer(void* ptr); + +/** + * Prints the contents of the memory tracker table to stderr + * + * Intended primarily for debug purposes. + **/ +void AM_print_tracker(); + + +}; // namespace hc + diff --git a/projects/hip/include/hcc_detail/hip_runtime_api.h b/projects/hip/include/hcc_detail/hip_runtime_api.h index 225b065654..a0c676987b 100644 --- a/projects/hip/include/hcc_detail/hip_runtime_api.h +++ b/projects/hip/include/hcc_detail/hip_runtime_api.h @@ -105,6 +105,8 @@ enum hipMemcpyKind { } ; + + // Doxygen end group GlobalDefs /** @} */ @@ -128,6 +130,7 @@ typedef struct hipEvent_t { + #ifdef __cplusplus } /* extern "C" */ #endif @@ -634,6 +637,11 @@ hipError_t hipEventQuery(hipEvent_t event) ; */ +/** + * @brief Return attributes for the specified pointer + */ +hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr) ; + /** * @brief Allocate memory on the default accelerator diff --git a/projects/hip/include/hip_runtime_api.h b/projects/hip/include/hip_runtime_api.h index 882103a1f4..41ad338d6d 100644 --- a/projects/hip/include/hip_runtime_api.h +++ b/projects/hip/include/hip_runtime_api.h @@ -97,6 +97,30 @@ typedef struct hipDeviceProp_t { } hipDeviceProp_t; +/** + * 
Memory type (for pointer attributes) + */ +enum hipMemoryType { + hipMemoryTypeHost, ///< Memory is physically located on host + hipMemoryTypeDevice ///< Memory is physically located on device. (see deviceId for specific device) +}; + + + +/** + * Pointer attributes + */ +typedef struct hipPointerAttribute_t { + enum hipMemoryType memoryType; + int device; + void *devicePointer; + void *hostPointer; + int isManaged; + unsigned allocationFlags; /* flags specified when memory was allocated*/ + /* peers? */ +} hipPointerAttribute_t; + + // hack to get these to show up in Doxygen: /** * @defgroup GlobalDefs Global enum and defines diff --git a/projects/hip/src/hc_AM.cpp b/projects/hip/src/hc_AM.cpp new file mode 100644 index 0000000000..87e29e4bcc --- /dev/null +++ b/projects/hip/src/hc_AM.cpp @@ -0,0 +1,219 @@ + +#include "hc_am.hpp" +#include "hsa.h" + + +#include "hcc_detail/AM.h" // TODO - Remove me. + +#define DB_TRACKER 1 + +#if DB_TRACKER +#define mprintf( ...) {\ + fprintf (stderr, __VA_ARGS__);\ + }; +#else +#define mprintf( ...) 
+#endif + +//========================================================================================================= +// Pointer Tracker Structures: +//========================================================================================================= +#include +#include +//#include + +struct AmMemoryRange { + void * _basePointer; + void * _endPointer; + AmMemoryRange(void *basePointer, size_t sizeBytes) : + _basePointer(basePointer), _endPointer((unsigned char*)basePointer + sizeBytes - 1) {}; +}; + +// Functor to compare ranges: +struct AmMemoryRangeCompare { + // Return true is LHS range is less than RHS - used to order the + bool operator()(const AmMemoryRange &lhs, const AmMemoryRange &rhs) const + { + return lhs._endPointer < rhs._basePointer; + } + +}; + + +std::ostream &operator<<(std::ostream &os, const hc::AmPointerInfo &ap) +{ + os << "hostPointer:" << ap._hostPointer << " devicePointer:"<< ap._devicePointer << " sizeBytes:" << ap._sizeBytes + << " isDeviceMem:" << ap._isDeviceMem << " allocFlags:" << ap._allocationFlags; + return os; +} + + + +// This structure tracks information for each pointer. +// Uses memory-range-based lookups - so pointers that exist anywhere in the range of hostPtr + size will find the associated AmPointerInfo. +// The insertions and lookups use a self-balancing binary tree and should support O(logN) lookup speed. +// The structure is thread-safe - writers obtain a mutex before modifying the tree. Multiple simulatenous readers are supported. 
+class AmPointerTracker { +typedef std::map MapTrackerType; +public: + + void insert(void *pointer, const hc::AmPointerInfo &p); + int remove(void *pointer); + + MapTrackerType::iterator find(void *hostPtr); + + MapTrackerType::iterator end() { return _tracker.end(); }; + + std::ostream & print (std::ostream &os); +private: + MapTrackerType _tracker; + //std::shared_timed_mutex _mut; +}; + + +//--- +void AmPointerTracker::insert (void *pointer, const hc::AmPointerInfo &p) +{ + // TODO-mutex - write lock. + mprintf ("insert: %p + %zu\n", pointer, p._sizeBytes); + _tracker.insert(std::make_pair(AmMemoryRange(pointer, p._sizeBytes), p)); + + +} + + +//--- +// Return 1 if removed or 0 if not found. +int AmPointerTracker::remove (void *pointer) +{ + // TODO-mutex - write lock. + mprintf ("remove: %p\n", pointer); + return _tracker.erase(AmMemoryRange(pointer,1)); +} + + +//--- +AmPointerTracker::MapTrackerType::iterator AmPointerTracker::find (void *pointer) +{ + // TODO-mutex- read lock + auto iter = _tracker.find(AmMemoryRange(pointer,1)); + mprintf ("find: %p\n", pointer); + return iter; +} + + +std::ostream & AmPointerTracker::print (std::ostream &os) +{ + for (auto iter = _tracker.begin() ; iter != _tracker.end(); iter++) { + os << " " << iter->first._basePointer << "..." << iter->first._endPointer << ":: "; + os << iter->second << std::endl; + } + + return os; +} + + +//========================================================================================================= +// Global var defs: +//========================================================================================================= +AmPointerTracker g_amPointerTracker; // Track all am pointer allocations. + + +//========================================================================================================= +// API Definitions. 
+//========================================================================================================= +// +// + +namespace hc { + +// Allocate accelerator memory, return NULL if memory could not be allocated: +auto_voidp AM_alloc(size_t sizeBytes, hc::accelerator acc, unsigned flags) +{ + + void *ptr = NULL; + + if (sizeBytes != 0 ) { + if (acc.is_hsa_accelerator()) { + hsa_agent_t *hsa_agent = static_cast (acc.get_default_view().get_hsa_agent()); + hsa_region_t *alloc_region; + if (flags & amHostPinned) { + alloc_region = static_cast(acc.get_hsa_am_system_region()); + } else { + alloc_region = static_cast(acc.get_hsa_am_region()); + } + + if (alloc_region->handle != -1) { + + hsa_status_t s1 = hsa_memory_allocate(*alloc_region, sizeBytes, &ptr); + hsa_status_t s2 = hsa_memory_assign_agent(ptr, *hsa_agent, HSA_ACCESS_PERMISSION_RW); + + if ((s1 != HSA_STATUS_SUCCESS) || (s2 != HSA_STATUS_SUCCESS)) { + ptr = NULL; + } else { + if (flags & amHostPinned) { + g_amPointerTracker.insert(ptr, + hc::AmPointerInfo(false/*isDevice*/, ptr/*hostPointer*/, ptr /*devicePointer*/, sizeBytes, acc, flags)); + } else { + g_amPointerTracker.insert(ptr, + hc::AmPointerInfo(true/*isDevice*/, NULL/*hostPointer*/, ptr /*devicePointer*/, sizeBytes, acc, flags)); + } + } + } + } + } + + return ptr; +}; + + +am_status_t AM_free(void* ptr) +{ + am_status_t status = AM_SUCCESS; + + if (ptr != NULL) { + hsa_memory_free(ptr); + + size_t numRemoved = g_amPointerTracker.remove(ptr) ; + if (numRemoved == 0) { + status = AM_ERROR_MISC; + } + } + return status; +} + + + +am_status_t AM_copy(void* dst, const void* src, size_t sizeBytes) +{ + am_status_t am_status = AM_ERROR_MISC; + hsa_status_t err = hsa_memory_copy(dst, src, sizeBytes); + + if (err == HSA_STATUS_SUCCESS) { + am_status = AM_SUCCESS; + } else { + am_status = AM_ERROR_MISC; + } + + return am_status; +} + + +am_status_t AM_get_pointer_info(hc::AmPointerInfo *info, void *ptr) +{ + auto infoI = g_amPointerTracker.find(ptr); + if 
(infoI != g_amPointerTracker.end()) { + *info = infoI->second; + return AM_SUCCESS; + } else { + return AM_ERROR_MISC; + } +} + +void AM_print_tracker() +{ + g_amPointerTracker.print(std::cerr); +} + + +} // end namespace hc. diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index f9f3b1b1f5..e9ee4c41dc 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -31,6 +31,8 @@ THE SOFTWARE. #include #include #include +#include + #include #include @@ -38,6 +40,9 @@ THE SOFTWARE. #include "hsa_ext_amd.h" + +#include "hc_AM.cpp" + #define USE_PINNED_HOST (__hcc_workweek__ >= 1601) #define USE_ASYNC_COPY 0 @@ -466,7 +471,8 @@ void ihipInit() g_devices.reserve(accs.size()); for (int i=0; imemoryType = amPointerInfo._isDeviceMem ? hipMemoryTypeDevice: hipMemoryTypeHost; + attributes->hostPointer = amPointerInfo._hostPointer; + attributes->devicePointer = amPointerInfo._devicePointer; + attributes->isManaged = 0; + attributes->allocationFlags = amPointerInfo._allocationFlags; + + + attributes->device = -1; + e = hipErrorInvalidDevice; + for (int i=0; idevice = i; + e = hipSuccess; + break; + } + } + } else { + attributes->memoryType = hipMemoryTypeDevice; + attributes->hostPointer = 0; + attributes->devicePointer = 0; + attributes->device = -1; + attributes->isManaged = 0; + attributes->allocationFlags = 0; + + e = hipErrorInvalidValue; + } + + return ihipLogStatus(e); +} + + // kernel for launching memcpy operations: template @@ -1345,9 +1398,9 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) hipError_t hip_status = hipSuccess; const unsigned am_flags = 0; - *ptr = hc::am_alloc(sizeBytes, ihipGetTlsDefaultDevice()->_acc, am_flags); + *ptr = hc::AM_alloc(sizeBytes, ihipGetTlsDefaultDevice()->_acc, am_flags); - if (*ptr == NULL) { + if (sizeBytes && (*ptr == NULL)) { hip_status = hipErrorMemoryAllocation; } else { hip_status = hipSuccess; @@ -1367,9 +1420,9 @@ hipError_t hipMallocHost(void** ptr, size_t sizeBytes) const 
unsigned am_flags = amHostPinned; - *ptr = hc::am_alloc(sizeBytes, ihipGetTlsDefaultDevice()->_acc, am_flags); + *ptr = hc::AM_alloc(sizeBytes, ihipGetTlsDefaultDevice()->_acc, am_flags); hipError_t hip_status = hipSuccess; - if (*ptr == NULL) { + if (sizeBytes && (*ptr == NULL)) { hip_status = hipErrorMemoryAllocation; } else { hip_status = hipSuccess; @@ -1444,7 +1497,7 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind #else // TODO-hsart - what synchronization does hsa_copy provide? - hc::am_copy(dst, src, sizeBytes); + hc::AM_copy(dst, src, sizeBytes); e = hipSuccess; #endif @@ -1475,7 +1528,7 @@ hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcp // TODO-hsart This routine needs to ensure that dst and src are mapped on the GPU. // This is a synchronous copy - remove and replace with code below when we have appropriate LOCK APIs. - hc::am_copy(dst, src, sizeBytes); + hc::AM_copy(dst, src, sizeBytes); #if 0 @@ -1592,7 +1645,7 @@ hipError_t hipFree(void* ptr) ihipWaitAllStreams(ihipGetTlsDefaultDevice()); if (ptr) { - hc::am_free(ptr); + hc::AM_free(ptr); } return ihipLogStatus(hipSuccess); @@ -1606,7 +1659,7 @@ hipError_t hipFreeHost(void* ptr) if (ptr) { #if USE_PINNED_HOST tprintf (TRACE_MEM, " %s: %p\n", __func__, ptr); - hc::am_free(ptr); + hc::AM_free(ptr); #else free(ptr); #endif diff --git a/projects/hip/tests/src/CMakeLists.txt b/projects/hip/tests/src/CMakeLists.txt index 0ec287b334..bf05fc8407 100644 --- a/projects/hip/tests/src/CMakeLists.txt +++ b/projects/hip/tests/src/CMakeLists.txt @@ -114,6 +114,7 @@ make_hip_executable (hipSimpleAtomicsTest hipSimpleAtomicsTest.cpp) make_hip_executable (hipMathFunctionsHost hipMathFunctions.cpp hipSinglePrecisionMathHost.cpp hipDoublePrecisionMathHost.cpp) make_hip_executable (hipMathFunctionsDevice hipMathFunctions.cpp hipSinglePrecisionMathDevice.cpp hipDoublePrecisionMathDevice.cpp) make_hip_executable (hipIntrinsics hipMathFunctions.cpp 
hipSinglePrecisionIntrinsics.cpp hipDoublePrecisionIntrinsics.cpp hipIntegerIntrinsics.cpp) +make_hip_executable (hipPointerAttrib hipPointerAttrib.cpp) target_link_libraries(hipMathFunctionsHost m) make_test(hip_ballot " " ) @@ -128,6 +129,7 @@ make_test(hipMemset --N 10 --memsetval 0x42 ) # small copy, just 10 bytes. make_test(hipMemset --N 10013 --memsetval 0x5a ) # oddball size. make_test(hipMemset --N 256M --memsetval 0xa6 ) # big copy make_test(hipGridLaunch " " ) +make_test(hipPointerAttrib " " ) make_test(hipMemcpy " " ) diff --git a/projects/hip/tests/src/hipPointerAttrib.cpp b/projects/hip/tests/src/hipPointerAttrib.cpp new file mode 100644 index 0000000000..9d147d8183 --- /dev/null +++ b/projects/hip/tests/src/hipPointerAttrib.cpp @@ -0,0 +1,319 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + + +// Test pointer tracking logic: allocate memory and retrieve stats with hipPointerGetAttributes + +#include "hip_runtime.h" +#include "test_common.h" + +#ifdef __HIP_PLATFORM_HCC__ +#include "hcc_detail/AM.h" +#endif + +size_t Nbytes = 0; + +//================================================================================================= +// Utility Functions: +//================================================================================================= + +bool operator==(const hipPointerAttribute_t &lhs, const hipPointerAttribute_t &rhs) +{ + return ((lhs.hostPointer == rhs.hostPointer) && + (lhs.devicePointer == rhs.devicePointer) && + (lhs.memoryType == rhs.memoryType) && + (lhs.device == rhs.device) && + (lhs.allocationFlags == rhs.allocationFlags) + ) ; + +}; + + +bool operator!=(const hipPointerAttribute_t &lhs, const hipPointerAttribute_t &rhs) +{ + return ! (lhs == rhs); +} + + +const char *memoryTypeToString(hipMemoryType memoryType) +{ + switch (memoryType) { + case hipMemoryTypeHost : return "[Host]"; + case hipMemoryTypeDevice : return "[Device]"; + default: return "[Unknown]"; + }; +} + + +void resetAttribs(hipPointerAttribute_t *attribs) +{ + attribs->hostPointer = (void*) (-1); + attribs->devicePointer = (void*) (-1); + attribs->memoryType = hipMemoryTypeHost; + attribs->device = -2; + attribs->isManaged = -1; + attribs->allocationFlags = 0xffff; +}; + + +void printAttribs(hipPointerAttribute_t *attribs) +{ + printf ("hostPointer:%p devicePointer:%p memoryType:%s deviceId:%d isManaged:%d allocationFlags:%u\n", + attribs->hostPointer, + attribs->devicePointer, + memoryTypeToString(attribs->memoryType), + attribs->device, + attribs->isManaged, + attribs->allocationFlags + ); +}; + + +inline int zrand(int max) +{ + return rand() % max; +} + + +//================================================================================================= +// Functins to run tests 
+//================================================================================================= +// +//Run through a couple simple cases to test lookups and hostd pointer arithmetic: +void simpleTests() +{ + char *A_d; + char *A_Pinned_h; + char *A_OSAlloc_h; + hipError_t e; + + HIPCHECK ( hipMalloc(&A_d, Nbytes) ); + HIPCHECK ( hipMallocHost(&A_Pinned_h, Nbytes) ); + A_OSAlloc_h = (char*)malloc(Nbytes); + + + hipPointerAttribute_t attribs; + hipPointerAttribute_t attribs2; + + // Device memory + printf ("\nDevice memory (hipMalloc)\n"); + HIPCHECK( hipPointerGetAttributes(&attribs, A_d)); + printf("getAttr:%-20s", "A_d"); printAttribs(&attribs); + + // Check pointer arithmetic cases: + resetAttribs(&attribs2); + HIPCHECK( hipPointerGetAttributes(&attribs2, A_d+100)); + printf("getAttr:%-20s", "A_d+100"); printAttribs(&attribs2); + HIPASSERT(attribs == attribs2); + + // Corner case at end of array: + resetAttribs(&attribs2); + HIPCHECK( hipPointerGetAttributes(&attribs2, A_d+Nbytes-1)); + printf("getAttr:%-20s", "A_d+NBytes-1"); printAttribs(&attribs2); + HIPASSERT(attribs == attribs2); + + // Pointer just beyond array - must be invalid or at least a different pointer + resetAttribs(&attribs2); + e = hipPointerGetAttributes(&attribs2, A_d+Nbytes+1); + printf("getAttr:%-20s err=%d (%s), neg-test expected\n", "A_d+NBytes", e, hipGetErrorString(e)); + if (e != hipErrorInvalidValue) { + // We might have strayed into another pointer area. + printf("getAttr:%-20s", "A_d+NBytes"); printAttribs(&attribs2); + HIPASSERT(attribs.devicePointer != attribs2.devicePointer); + } + + + resetAttribs(&attribs2); + e = hipPointerGetAttributes(&attribs2, A_d+Nbytes); + if (e != hipErrorInvalidValue) { + printf("%-20s", "A_d+Nbytes"); printAttribs(&attribs2); + HIPASSERT(attribs.devicePointer != attribs2.devicePointer); + } + + hipFree(A_d); + e = hipPointerGetAttributes(&attribs, A_d); + HIPASSERT(e == hipErrorInvalidValue); // Just freed the pointer, this should return an error. 
+ + + // Device-visible host memory + printf ("\nDevice-visible host memory (hipMallocHost)\n"); + HIPCHECK( hipPointerGetAttributes(&attribs, A_Pinned_h)); + printf("getAttr:%-20s", "A_pinned_h"); printAttribs(&attribs); + + resetAttribs(&attribs2); + HIPCHECK( hipPointerGetAttributes(&attribs2, A_Pinned_h+Nbytes/2)); + printf("getAttr:%-20s", "A_pinned_h+NBytes/2"); printAttribs(&attribs2); + HIPASSERT(attribs == attribs2); + + + hipFreeHost(A_Pinned_h); + e = hipPointerGetAttributes(&attribs, A_Pinned_h); + HIPASSERT(e == hipErrorInvalidValue); // Just freed the pointer, this should return an error. + printf("getAttr:%-20s err=%d (%s), neg-test expected\n", "A_d+NBytes", e, hipGetErrorString(e)); + + + // OS memory + printf ("\nOS-allocated memory (malloc)\n"); + e = hipPointerGetAttributes(&attribs, A_OSAlloc_h); + printf("getAttr:%-20s err=%d (%s), neg-test expected\n", "A_OSAlloc_h", e, hipGetErrorString(e)); + HIPASSERT(e == hipErrorInvalidValue); // OS-allocated pointers should return hipErrorInvalidValue. 
+} + + + + +struct SuperPointerAttribute { + void * _pointer; + size_t _sizeBytes; + hipPointerAttribute_t _attrib; +}; + + + +void checkPointer(SuperPointerAttribute &ref, int major, int minor, void *pointer) +{ + hipPointerAttribute_t attribs; + resetAttribs(&attribs); + + HIPCHECK(hipPointerGetAttributes(&attribs, pointer)); + if (attribs != ref._attrib) { + printf("Test %d.%d", major, minor); + printf(" ref :: "); printAttribs(&ref._attrib); + printf(" getattr:: "); printAttribs(&attribs); + + HIPASSERT(attribs == ref._attrib); + } else { + if (p_verbose & 0x1) { + printf("#%4d.%d GOOD:%p getattr :: ",major, minor, pointer); printAttribs(&attribs); + } + } +} + + +void clusterAllocs(int numAllocs, size_t minSize, size_t maxSize) +{ + printf ("===========================================================================\n"); + printf ("clusterAllocs numAllocs=%d size=%lu..%lu\n", numAllocs, minSize, maxSize); + printf ("===========================================================================\n"); + std::vector reference(numAllocs); + + HIPASSERT(minSize > 0); + HIPASSERT(maxSize >= minSize); + + int numDevices; + HIPCHECK(hipGetDeviceCount(&numDevices)); + + //--- + //Populate with device and host allocations. 
+ for (int i=0; i 1) { + checkPointer(ref, i, 2, (char *)ref._pointer + ref._sizeBytes-1); + } + + } +} + + +void testMultiThreaded() +{ + std::thread t1(clusterAllocs, 1000, 101, 1000); + std::thread t2(clusterAllocs, 1000, 11, 100); + std::thread t3(clusterAllocs, 1000, 5, 10); + std::thread t4(clusterAllocs, 1000, 1, 4); + + t1.join(); + t2.join(); + t3.join(); + t4.join(); +} + + +int main(int argc, char *argv[]) +{ + + N= 1000000; + HipTest::parseStandardArguments(argc, argv, true); + + HIPCHECK(hipSetDevice(p_gpuDevice)); + + Nbytes = N*sizeof(char); + + printf ("N=%zu (%6.2f MB) device=%d\n", N, Nbytes/(1024.0*1024.0), p_gpuDevice); + + + if (p_tests & 0x1) { + simpleTests(); + } + + if (p_tests & 0x2) { + srand(0x100); + clusterAllocs(100, 1024*1, 1024*1024); + } + + if (p_tests & 0x4) { + srand(0x200); + clusterAllocs(1000, 1, 10); // Many tiny allocations; + } + + if (p_tests & 0x8) { + testMultiThreaded(); + } + + printf ("\n"); + passed(); +} diff --git a/projects/hip/tests/src/test_common.cpp b/projects/hip/tests/src/test_common.cpp index d7a108a11b..02deb51c85 100644 --- a/projects/hip/tests/src/test_common.cpp +++ b/projects/hip/tests/src/test_common.cpp @@ -28,6 +28,8 @@ int iterations = 1; unsigned blocksPerCU = 6; // to hide latency unsigned threadsPerBlock = 256; int p_gpuDevice = 0; +unsigned p_verbose = 0; +int p_tests = -1; /*which tests to run. Interpretation is left to each test. 
default:all*/ @@ -114,8 +116,16 @@ int parseStandardArguments(int argc, char *argv[], bool failOnUndefinedArg) failed("Bad gpuDevice argument"); } - } - else { + } else if (!strcmp(arg, "--verbose") || (!strcmp(arg, "-v"))) { + if (++i >= argc || !HipTest::parseUInt(argv[i], &p_verbose)) { + failed("Bad verbose argument"); + } + } else if (!strcmp(arg, "--tests") || (!strcmp(arg, "-t"))) { + if (++i >= argc || !HipTest::parseInt(argv[i], &p_tests)) { + failed("Bad tests argument"); + } + + } else { if (failOnUndefinedArg) { failed("Bad argument '%s'", arg); } else { diff --git a/projects/hip/tests/src/test_common.h b/projects/hip/tests/src/test_common.h index fee052c1ad..57d2ebc831 100644 --- a/projects/hip/tests/src/test_common.h +++ b/projects/hip/tests/src/test_common.h @@ -53,6 +53,8 @@ extern int iterations; extern unsigned blocksPerCU; extern unsigned threadsPerBlock; extern int p_gpuDevice; +extern unsigned p_verbose; +extern int p_tests; namespace HipTest { From 680b600b4a8ad9a506f75655319d03fbd7fd6880 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 11 Feb 2016 22:03:01 -0600 Subject: [PATCH 15/94] Tracker improvements - add API to add / remove user-pointers from the tracker. - test for thread-safety with MultiThreadtest_2 - rapid insertions/removal. - add mutex to provide thread-safety. - rename tracker interface to "memtracker_..." for consistency. - add am_memtracker_reset, connect to hipDeviceReset. 
- [ROCm/hip commit: de45e2291e720169fddb585db92de608dcb1761b] --- projects/hip/include/hcc_detail/AM.h | 60 ++++-- projects/hip/src/hc_AM.cpp | 135 ++++++++++++-- projects/hip/src/hip_hcc.cpp | 15 +- projects/hip/tests/src/hipPointerAttrib.cpp | 197 +++++++++++++++++--- 4 files changed, 353 insertions(+), 54 deletions(-) diff --git a/projects/hip/include/hcc_detail/AM.h b/projects/hip/include/hcc_detail/AM.h index 1cfcf2dfb2..d41fed317a 100644 --- a/projects/hip/include/hcc_detail/AM.h +++ b/projects/hip/include/hcc_detail/AM.h @@ -15,22 +15,27 @@ namespace hc { // This is the data that is maintained for each pointer: struct AmPointerInfo { - bool _isDeviceMem; - void * _hostPointer; - void * _devicePointer; - size_t _sizeBytes; - hc::accelerator _acc; - unsigned _allocationFlags; + void * _hostPointer; ///< Host pointer. If host access is not allowed, NULL. + void * _devicePointer; ///< Device pointer. + size_t _sizeBytes; ///< Size of allocation. + hc::accelerator _acc; ///< Device / Accelerator to use. + bool _isInDeviceMem; ///< Memory is physically resident on a device (if false, memory is located on host) + bool _isAmManaged; ///< Memory was allocated by AM and should be freed when am_reset is called. + + int _appId; ///< App-specific storage. Used by HIP to store deviceID. + unsigned _appAllocationFlags; ///< App-specific allocation flags. Used by HIP to store allocation flags. 
AmPointerInfo() {}; - AmPointerInfo(bool isDeviceMem, void *hostPointer, void *devicePointer, size_t sizeBytes, hc::accelerator acc, unsigned allocationFlags) : - _isDeviceMem(isDeviceMem), + AmPointerInfo(void *hostPointer, void *devicePointer, size_t sizeBytes, hc::accelerator acc, bool isInDeviceMem, bool isAmManaged) : _hostPointer(hostPointer), _devicePointer(devicePointer), _sizeBytes(sizeBytes), _acc(acc), - _allocationFlags(allocationFlags) {}; + _isInDeviceMem(isInDeviceMem), + _isAmManaged(isAmManaged), + _appId(-1), + _appAllocationFlags(0) {}; }; } @@ -73,19 +78,46 @@ am_status_t AM_free(void* ptr); */ am_status_t AM_copy(void* dst, const void* src, size_t size); -am_status_t AM_get_pointer_info(hc::AmPointerInfo *info, void *ptr); +/** + * Return information about tracked pointer. + * + * AM tracks pointers when they are allocated or added to tracker with am_track_pointer. + * The tracker tracks the base pointer as well as the size of the allocation, and will + * find the information for a pointer anywhere in the tracked range. + * + * @returns AM_ERROR_MISC if pointer is not currently being tracked. + * @returns AM_SUCCESS if pointer is tracked and writes info to @p info. + * + * @see AM_memtracker_add, + */ +am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, void *ptr); +//TODO-doc +am_status_t am_memtracker_update(void* ptr, int appId, unsigned allocationFlags); -// TODO-implement these: -//am_status_t AM_track_pointer(void* ptr, size_t size, bool isDeviceMem=false, unsigned allocationFlags=0x0); -//am_status_t AM_untrack_pointer(void* ptr); +am_status_t am_memtracker_add(void* ptr, size_t sizeBytes, hc::accelerator acc, bool isDeviceMem=false); + +/** + * Remove the pointer from the tracker structure. + * + * @p ptr may be anywhere in a tracked memory range. + * + * @returns AM_ERROR_MISC if pointer is not found in tracker. + * @returns AM_SUCCESS if pointer is not found in tracker. 
+ */ +am_status_t am_memtracker_remove(void* ptr); + +/** + * Remove all memory allocations associated with specified accelerator. + */ +size_t am_memtracker_reset(hc::accelerator acc); /** * Prints the contents of the memory tracker table to stderr * * Intended primarily for debug purposes. **/ -void AM_print_tracker(); +void am_memtracker_print(); }; // namespace hc diff --git a/projects/hip/src/hc_AM.cpp b/projects/hip/src/hc_AM.cpp index 87e29e4bcc..36c8abf193 100644 --- a/projects/hip/src/hc_AM.cpp +++ b/projects/hip/src/hc_AM.cpp @@ -5,7 +5,8 @@ #include "hcc_detail/AM.h" // TODO - Remove me. -#define DB_TRACKER 1 +#define DB_TRACKER 0 +#define MUTEX_LOCK 1 #if DB_TRACKER #define mprintf( ...) {\ @@ -43,14 +44,16 @@ struct AmMemoryRangeCompare { std::ostream &operator<<(std::ostream &os, const hc::AmPointerInfo &ap) { os << "hostPointer:" << ap._hostPointer << " devicePointer:"<< ap._devicePointer << " sizeBytes:" << ap._sizeBytes - << " isDeviceMem:" << ap._isDeviceMem << " allocFlags:" << ap._allocationFlags; + << " isInDeviceMem:" << ap._isInDeviceMem << " isAmManaged:" << ap._isAmManaged + << " appId:" << ap._appId << " appAllocFlags:" << ap._appAllocationFlags; return os; } - +//------------------------------------------------------------------------------------------------- // This structure tracks information for each pointer. -// Uses memory-range-based lookups - so pointers that exist anywhere in the range of hostPtr + size will find the associated AmPointerInfo. +// Uses memory-range-based lookups - so pointers that exist anywhere in the range of hostPtr + size +// will find the associated AmPointerInfo. // The insertions and lookups use a self-balancing binary tree and should support O(logN) lookup speed. // The structure is thread-safe - writers obtain a mutex before modifying the tree. Multiple simulatenous readers are supported. 
class AmPointerTracker { @@ -64,9 +67,18 @@ public: MapTrackerType::iterator end() { return _tracker.end(); }; + size_t reset (hc::accelerator acc); + std::ostream & print (std::ostream &os); private: + // TODO - use or remove. + inline void writeLock(); + inline void writeUnlock(); + inline void readLock(); + inline void readUnlock(); + MapTrackerType _tracker; + std::mutex _mutex; //std::shared_timed_mutex _mut; }; @@ -74,11 +86,10 @@ private: //--- void AmPointerTracker::insert (void *pointer, const hc::AmPointerInfo &p) { - // TODO-mutex - write lock. + std::lock_guard l (_mutex); + mprintf ("insert: %p + %zu\n", pointer, p._sizeBytes); _tracker.insert(std::make_pair(AmMemoryRange(pointer, p._sizeBytes), p)); - - } @@ -87,6 +98,7 @@ void AmPointerTracker::insert (void *pointer, const hc::AmPointerInfo &p) int AmPointerTracker::remove (void *pointer) { // TODO-mutex - write lock. + std::lock_guard l (_mutex); mprintf ("remove: %p\n", pointer); return _tracker.erase(AmMemoryRange(pointer,1)); } @@ -96,14 +108,17 @@ int AmPointerTracker::remove (void *pointer) AmPointerTracker::MapTrackerType::iterator AmPointerTracker::find (void *pointer) { // TODO-mutex- read lock + std::lock_guard l (_mutex); auto iter = _tracker.find(AmMemoryRange(pointer,1)); mprintf ("find: %p\n", pointer); return iter; } +//--- std::ostream & AmPointerTracker::print (std::ostream &os) { + std::lock_guard l (_mutex); for (auto iter = _tracker.begin() ; iter != _tracker.end(); iter++) { os << " " << iter->first._basePointer << "..." << iter->first._endPointer << ":: "; os << iter->second << std::endl; @@ -112,6 +127,65 @@ std::ostream & AmPointerTracker::print (std::ostream &os) return os; } +//--- +// Remove all tracked locations, and free the associated memory (if the range was originally allocated by AM). +// Returns count of ranges removed. 
+size_t AmPointerTracker::reset (hc::accelerator acc) +{ + std::lock_guard l (_mutex); + mprintf ("reset: \n"); + + size_t count = 0; + // relies on C++11 (erase returns iterator) + for (auto iter = _tracker.begin() ; iter != _tracker.end(); ) { + if (iter->second._acc == acc) { + if (iter->second._isAmManaged) { + hsa_memory_free(iter->first._basePointer); + } + count++; + + iter = _tracker.erase(iter); + } else { + iter++; + } + } + + return count; +} + + + +//--- +void AmPointerTracker::writeLock () +{ + _mutex.lock(); +} + + +//--- +void AmPointerTracker::writeUnlock () +{ + _mutex.unlock(); +} + + +//--- +// TODO - support multiple concurrent reader +void AmPointerTracker::readLock () +{ + _mutex.lock(); +} + + +//--- +// TODO - support multiple concurrent reader +void AmPointerTracker::readUnlock () +{ + _mutex.unlock(); +} + + + //========================================================================================================= // Global var defs: @@ -153,10 +227,10 @@ auto_voidp AM_alloc(size_t sizeBytes, hc::accelerator acc, unsigned flags) } else { if (flags & amHostPinned) { g_amPointerTracker.insert(ptr, - hc::AmPointerInfo(false/*isDevice*/, ptr/*hostPointer*/, ptr /*devicePointer*/, sizeBytes, acc, flags)); + hc::AmPointerInfo(ptr/*hostPointer*/, ptr /*devicePointer*/, sizeBytes, acc, false/*isDevice*/, true /*isAMManaged*/)); } else { g_amPointerTracker.insert(ptr, - hc::AmPointerInfo(true/*isDevice*/, NULL/*hostPointer*/, ptr /*devicePointer*/, sizeBytes, acc, flags)); + hc::AmPointerInfo(NULL/*hostPointer*/, ptr /*devicePointer*/, sizeBytes, acc, true/*isDevice*/, true /*isAMManaged*/)); } } } @@ -172,9 +246,10 @@ am_status_t AM_free(void* ptr) am_status_t status = AM_SUCCESS; if (ptr != NULL) { + // See also tracker::reset which can free memory. 
hsa_memory_free(ptr); - size_t numRemoved = g_amPointerTracker.remove(ptr) ; + int numRemoved = g_amPointerTracker.remove(ptr) ; if (numRemoved == 0) { status = AM_ERROR_MISC; } @@ -199,7 +274,7 @@ am_status_t AM_copy(void* dst, const void* src, size_t sizeBytes) } -am_status_t AM_get_pointer_info(hc::AmPointerInfo *info, void *ptr) +am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, void *ptr) { auto infoI = g_amPointerTracker.find(ptr); if (infoI != g_amPointerTracker.end()) { @@ -210,10 +285,46 @@ am_status_t AM_get_pointer_info(hc::AmPointerInfo *info, void *ptr) } } -void AM_print_tracker() + +am_status_t am_memtracker_update(void* ptr, int appId, unsigned allocationFlags); + + +am_status_t am_memtracker_add(void* ptr, size_t sizeBytes, hc::accelerator acc, bool isDeviceMem) +{ + if (isDeviceMem) { + g_amPointerTracker.insert(ptr, hc::AmPointerInfo(ptr/*hostPointer*/, ptr /*devicePointer*/, sizeBytes, acc, true/*isDevice*/, false /*isAMManaged*/)); + } else { + g_amPointerTracker.insert(ptr, hc::AmPointerInfo(NULL/*hostPointer*/, ptr /*devicePointer*/, sizeBytes, acc, false/*isDevice*/, false /*isAMManaged*/)); + } + + return AM_SUCCESS; +} + + +am_status_t am_memtracker_remove(void* ptr) +{ + am_status_t status = AM_SUCCESS; + + int numRemoved = g_amPointerTracker.remove(ptr) ; + if (numRemoved == 0) { + status = AM_ERROR_MISC; + } + + return status; +} + +//--- +void am_memtracker_print() { g_amPointerTracker.print(std::cerr); } +//--- +size_t am_memtracker_reset(hc::accelerator acc) +{ + return g_amPointerTracker.reset(acc); +} + + } // end namespace hc. diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index e9ee4c41dc..a4246dc9cb 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -44,8 +44,8 @@ THE SOFTWARE. 
#include "hc_AM.cpp" #define USE_PINNED_HOST (__hcc_workweek__ >= 1601) - #define USE_ASYNC_COPY 0 +#define USE_AM_TRACKER 1 /* use new AM memory tracker features */ #define INLINE static inline @@ -802,6 +802,13 @@ hipError_t hipDeviceReset(void) // It should destroy and clean up all resources allocated with the default device in the current process. // and needs to destroy all queues as well. // +#if USE_AM_TRACKER + // TODO - remove bug above. + ihipDevice_t *device = ihipGetTlsDefaultDevice(); + if (device) { + am_memtracker_reset(device->_acc); + } +#endif return ihipLogStatus(hipSuccess); } @@ -1281,14 +1288,14 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr) hipError_t e = hipSuccess; hc::AmPointerInfo amPointerInfo; - am_status_t status = hc::AM_get_pointer_info(&amPointerInfo, ptr); + am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, ptr); if (status == AM_SUCCESS) { - attributes->memoryType = amPointerInfo._isDeviceMem ? hipMemoryTypeDevice: hipMemoryTypeHost; + attributes->memoryType = amPointerInfo._isInDeviceMem ? 
hipMemoryTypeDevice: hipMemoryTypeHost; attributes->hostPointer = amPointerInfo._hostPointer; attributes->devicePointer = amPointerInfo._devicePointer; attributes->isManaged = 0; - attributes->allocationFlags = amPointerInfo._allocationFlags; + attributes->allocationFlags = amPointerInfo._appAllocationFlags; attributes->device = -1; diff --git a/projects/hip/tests/src/hipPointerAttrib.cpp b/projects/hip/tests/src/hipPointerAttrib.cpp index 9d147d8183..93d503af65 100644 --- a/projects/hip/tests/src/hipPointerAttrib.cpp +++ b/projects/hip/tests/src/hipPointerAttrib.cpp @@ -75,7 +75,7 @@ void resetAttribs(hipPointerAttribute_t *attribs) }; -void printAttribs(hipPointerAttribute_t *attribs) +void printAttribs(const hipPointerAttribute_t *attribs) { printf ("hostPointer:%p devicePointer:%p memoryType:%s deviceId:%d isManaged:%d allocationFlags:%u\n", attribs->hostPointer, @@ -99,8 +99,13 @@ inline int zrand(int max) //================================================================================================= // //Run through a couple simple cases to test lookups and hostd pointer arithmetic: -void simpleTests() +void testSimple() { + printf ("\n"); + printf ("===========================================================================\n"); + printf ("Simple Tests\n"); + printf ("===========================================================================\n"); + char *A_d; char *A_Pinned_h; char *A_OSAlloc_h; @@ -179,8 +184,24 @@ void simpleTests() } +void resetTracker () +{ + if (p_verbose & 0x1) { + printf ("info: reset tracker for all devices in platform\n"); + } + + int numDevices; + HIPCHECK(hipGetDeviceCount(&numDevices)); + + // Clean up: + for (int i=0; i reference(numAllocs); HIPASSERT(minSize > 0); @@ -244,14 +264,15 @@ void clusterAllocs(int numAllocs, size_t minSize, size_t maxSize) reference[i]._attrib.memoryType = hipMemoryTypeHost; reference[i]._attrib.devicePointer = ptr; reference[i]._attrib.hostPointer = ptr; - reference[i]._attrib.allocationFlags = 
1; // TODO-randomize these. + reference[i]._attrib.allocationFlags = 0; // TODO-randomize these. } reference[i]._pointer = ptr; } #ifdef __HIP_PLATFORM_HCC__ if (p_verbose & 0x2) { - hc::AM_print_tracker(); + printf ("Tracker after insertions:\n"); + hc::am_memtracker_print(); } #endif @@ -265,27 +286,143 @@ void clusterAllocs(int numAllocs, size_t minSize, size_t maxSize) checkPointer(ref, i, 2, (char *)ref._pointer + ref._sizeBytes-1); } + if (ref._attrib.memoryType == hipMemoryTypeDevice) { + hipFree(ref._pointer); + } else { + hipFreeHost(ref._pointer); + } + + } + + + +#ifdef __HIP_PLATFORM_HCC__ + if (p_verbose & 0x2) { + printf ("Tracker after cleanup:\n"); + hc::am_memtracker_print(); + } +#endif +} + + +void testMultiThreaded_1(bool serialize=false) +{ + printf ("\n===========================================================================\n"); + printf ("MultiThreaded_1\n"); + if (serialize) printf ("[SERIALIZE]\n"); + printf ("===========================================================================\n"); + std::thread t1(clusterAllocs, 1000, 101, 1000); + if (serialize) t1.join(); + + std::thread t2(clusterAllocs, 1000, 11, 100); + if (serialize) t2.join(); + + std::thread t3(clusterAllocs, 1000, 5, 10); + if (serialize) t3.join(); + + std::thread t4(clusterAllocs, 1000, 1, 4); + if (serialize) t4.join(); + + if (!serialize) { + t1.join(); + t2.join(); + t3.join(); + t4.join(); + } + + resetTracker(); +} + + +///================================================================================================ + + +// Add pointers to tracker very quickly. 
+void thread_query(void *ptr, const hipPointerAttribute_t *refAttrib) +{ + int count = 0; + + for (int count=0; count< 1000000; count++) { + hipPointerAttribute_t a; + hipError_t e = hipPointerGetAttributes(&a, ptr); + if ((e != hipSuccess) || (a!= *refAttrib)) { + printf("Test %d (err=%d)\n", count, e); + HIPCHECK(e); + + printf(" ref :: "); printAttribs(refAttrib); + printf(" getattr:: "); printAttribs(&a); + } } } -void testMultiThreaded() +enum Dir {Up, Down}; +void thread_noise_generator(int iters, size_t numBuffers, Dir addDir, Dir removeDir) { - std::thread t1(clusterAllocs, 1000, 101, 1000); - std::thread t2(clusterAllocs, 1000, 11, 100); - std::thread t3(clusterAllocs, 1000, 5, 10); - std::thread t4(clusterAllocs, 1000, 1, 4); + const size_t bufferSize = 16; + size_t maxSize = numBuffers*bufferSize; + HIPASSERT((maxSize % bufferSize) == 0); // loop logic assumes this is true + + + for (int i=0; i inflight(2); + + printf ("\n===========================================================================\n"); + printf ("MultiThreaded_2\n"); + printf ("===========================================================================\n"); + + hipSetDevice(0); + hipDeviceReset(); + + // Create some entries in the tracker: + for (int i=0; i<1000; i++) { + void *C_d; + HIPCHECK(hipMalloc(&C_d, 32)); + } + + + // Allocate a pointer that we will repeatedly lookup: + void *A_d; + HIPCHECK(hipMalloc(&A_d, 10000)); + hipPointerAttribute_t attrib1; + HIPCHECK(hipPointerGetAttributes(&attrib1, A_d)); + std::thread t1(thread_query, A_d, &attrib1); + + std::thread t2(thread_noise_generator, 10000, 1000, Up, Up); t1.join(); t2.join(); - t3.join(); - t4.join(); + + hipSetDevice(0); + hipDeviceReset(); } int main(int argc, char *argv[]) { - N= 1000000; HipTest::parseStandardArguments(argc, argv, true); @@ -296,22 +433,34 @@ int main(int argc, char *argv[]) printf ("N=%zu (%6.2f MB) device=%d\n", N, Nbytes/(1024.0*1024.0), p_gpuDevice); - if (p_tests & 0x1) { - simpleTests(); + if 
(p_tests & 0x01) { + testSimple(); } - if (p_tests & 0x2) { + if (p_tests & 0x02) { srand(0x100); + printf ("\n===========================================================================\n"); clusterAllocs(100, 1024*1, 1024*1024); + resetTracker(); } - if (p_tests & 0x4) { + if (p_tests & 0x04) { srand(0x200); + printf ("\n===========================================================================\n"); clusterAllocs(1000, 1, 10); // Many tiny allocations; + resetTracker(); } - if (p_tests & 0x8) { - testMultiThreaded(); + if (p_tests & 0x08) { + srand(0x300); + testMultiThreaded_1(true); + testMultiThreaded_1(false); + } + + if (p_tests & 0x10) { + srand(0x400); + testMultiThreaded_2(); + resetTracker(); } printf ("\n"); From 305076d78fb4b59fedbaf7e1a61e8c748a4d3730 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 11 Feb 2016 23:07:19 -0600 Subject: [PATCH 16/94] Use memtracker 'appID' to store deviceID associated with ptr [ROCm/hip commit: 00fd172c64999cdb83d060ac2b2deadc1e04f592] --- projects/hip/src/hc_AM.cpp | 12 ++++- projects/hip/src/hip_hcc.cpp | 87 ++++++++++++++++++++++++------------ 2 files changed, 70 insertions(+), 29 deletions(-) diff --git a/projects/hip/src/hc_AM.cpp b/projects/hip/src/hc_AM.cpp index 36c8abf193..92310164c0 100644 --- a/projects/hip/src/hc_AM.cpp +++ b/projects/hip/src/hc_AM.cpp @@ -286,7 +286,17 @@ am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, void *ptr) } -am_status_t am_memtracker_update(void* ptr, int appId, unsigned allocationFlags); +am_status_t am_memtracker_update(void* ptr, int appId, unsigned allocationFlags) +{ + auto iter = g_amPointerTracker.find(ptr); + if (iter != g_amPointerTracker.end()) { + iter->second._appId = appId; + iter->second._appAllocationFlags = allocationFlags; + return AM_SUCCESS; + } else { + return AM_ERROR_MISC; + } +} am_status_t am_memtracker_add(void* ptr, size_t sizeBytes, hc::accelerator acc, bool isDeviceMem) diff --git a/projects/hip/src/hip_hcc.cpp 
b/projects/hip/src/hip_hcc.cpp index a4246dc9cb..21da73d1da 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -1295,18 +1295,11 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr) attributes->hostPointer = amPointerInfo._hostPointer; attributes->devicePointer = amPointerInfo._devicePointer; attributes->isManaged = 0; + attributes->allocationFlags = amPointerInfo._appAllocationFlags; + attributes->device = amPointerInfo._appId; - attributes->device = -1; - e = hipErrorInvalidDevice; - for (int i=0; idevice = i; - e = hipSuccess; - break; - } - } } else { attributes->memoryType = hipMemoryTypeDevice; attributes->hostPointer = 0; @@ -1322,6 +1315,36 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr) } +// TODO - test this function: +/** + * @returns #hipSuccess, + * @returns #hipErrorInvalidValue if flags are not 0 + * @returns #hipErrorMemoryAllocation if hostPointer is not a tracked allocation. 
+ */ +hipError_t hipHostGetDevicePointer(void **devicePointer, void *hostPointer, unsigned flags) +{ + std::call_once(hip_initialized, ihipInit); + + hipError_t e = hipSuccess; + + // Flags must be 0: + if (flags == 0) { + e = hipErrorInvalidValue; + } else { + hc::AmPointerInfo amPointerInfo; + am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, hostPointer); + if (status == AM_SUCCESS) { + *devicePointer = amPointerInfo._devicePointer; + } else { + e = hipErrorMemoryAllocation; + *devicePointer = NULL; + } + } + + return ihipLogStatus(e); +} + + // kernel for launching memcpy operations: template @@ -1398,24 +1421,31 @@ ihipMemsetKernel(hipStream_t stream, T * ptr, T val, size_t sizeBytes) } //--- +/** + * @returns #hipSuccess #hipErrorMemoryAllocation + */ hipError_t hipMalloc(void** ptr, size_t sizeBytes) { std::call_once(hip_initialized, ihipInit); hipError_t hip_status = hipSuccess; - const unsigned am_flags = 0; - *ptr = hc::AM_alloc(sizeBytes, ihipGetTlsDefaultDevice()->_acc, am_flags); + auto device = ihipGetTlsDefaultDevice(); - if (sizeBytes && (*ptr == NULL)) { - hip_status = hipErrorMemoryAllocation; + if (device) { + const unsigned am_flags = 0; + *ptr = hc::AM_alloc(sizeBytes, device->_acc, am_flags); + + if (sizeBytes && (*ptr == NULL)) { + hip_status = hipErrorMemoryAllocation; + } else { + hc::am_memtracker_update(*ptr, device->_device_index, 0); + } } else { - hip_status = hipSuccess; + hip_status = hipErrorMemoryAllocation; } - ihipLogStatus(hip_status); - - return hip_status; + return ihipLogStatus(hip_status); } @@ -1423,23 +1453,24 @@ hipError_t hipMallocHost(void** ptr, size_t sizeBytes) { std::call_once(hip_initialized, ihipInit); + hipError_t hip_status = hipSuccess; #if USE_PINNED_HOST const unsigned am_flags = amHostPinned; + auto device = ihipGetTlsDefaultDevice(); - *ptr = hc::AM_alloc(sizeBytes, ihipGetTlsDefaultDevice()->_acc, am_flags); - hipError_t hip_status = hipSuccess; - if (sizeBytes && (*ptr == NULL)) { - 
hip_status = hipErrorMemoryAllocation; - } else { - hip_status = hipSuccess; + if (device) { + *ptr = hc::AM_alloc(sizeBytes, device->_acc, am_flags); + if (sizeBytes && (*ptr == NULL)) { + hip_status = hipErrorMemoryAllocation; + } else { + hc::am_memtracker_update(*ptr, device->_device_index, 0); + } + + tprintf (TRACE_MEM, " %s: pinned ptr=%p\n", __func__, *ptr); } - tprintf (TRACE_MEM, " %s: pinned ptr=%p\n", __func__, *ptr); - - ihipLogStatus(hip_status); - - return hip_status; + return ihipLogStatus(hip_status); #else // TODO-hcc remove-me From c1fd0e16083a5dc8da10d7ce309c230a15642779 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 11 Feb 2016 23:13:43 -0600 Subject: [PATCH 17/94] Remove ! USE_PINNED_HOST support [ROCm/hip commit: 0370cd1cfcf96877ad86ea7d49a418be5218f2f3] --- projects/hip/src/hip_hcc.cpp | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 21da73d1da..e63186692c 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -43,7 +43,6 @@ THE SOFTWARE. #include "hc_AM.cpp" -#define USE_PINNED_HOST (__hcc_workweek__ >= 1601) #define USE_ASYNC_COPY 0 #define USE_AM_TRACKER 1 /* use new AM memory tracker features */ @@ -1454,7 +1453,6 @@ hipError_t hipMallocHost(void** ptr, size_t sizeBytes) std::call_once(hip_initialized, ihipInit); hipError_t hip_status = hipSuccess; -#if USE_PINNED_HOST const unsigned am_flags = amHostPinned; auto device = ihipGetTlsDefaultDevice(); @@ -1472,20 +1470,6 @@ hipError_t hipMallocHost(void** ptr, size_t sizeBytes) return ihipLogStatus(hip_status); -#else - // TODO-hcc remove-me - - // This code only works on Kaveri: - *ptr = malloc(sizeBytes); // TODO - call am_alloc for device memory, this will only on KV HSA. - if (*ptr != NULL) { - //TODO-hsart : need memory pin APIs to implement this correctly. 
- // FOr now do our best to allocate the memory, but return an error since - // the returned pointer can only be used on the HOST not the GPU. - return ihipLogStatus(hipErrorMemoryAllocation); - } else { - return ihipLogStatus(hipErrorMemoryAllocation); - } -#endif } hipError_t hipMemcpyToSymbol(const char* symbolName, const void *src, size_t count, size_t offset, hipMemcpyKind kind) @@ -1695,12 +1679,8 @@ hipError_t hipFreeHost(void* ptr) std::call_once(hip_initialized, ihipInit); if (ptr) { -#if USE_PINNED_HOST tprintf (TRACE_MEM, " %s: %p\n", __func__, ptr); hc::AM_free(ptr); -#else - free(ptr); -#endif } return ihipLogStatus(hipSuccess); From b9dc0e9497db38f211542fb5047736e665c2f148 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 12 Feb 2016 00:08:52 -0600 Subject: [PATCH 18/94] Query tracked memory sizes. Support more accurate hipMemGetInfo. Add test to hipPointerAttrib. [ROCm/hip commit: d7396b5af39b0ac8b7eedf68f30a4dcff396fbe8] --- projects/hip/include/hcc_detail/AM.h | 13 ++++++- projects/hip/src/hc_AM.cpp | 43 +++++++++++++++++++-- projects/hip/src/hip_hcc.cpp | 29 ++++++++++---- projects/hip/tests/src/hipPointerAttrib.cpp | 36 ++++++++++++++--- 4 files changed, 103 insertions(+), 18 deletions(-) diff --git a/projects/hip/include/hcc_detail/AM.h b/projects/hip/include/hcc_detail/AM.h index d41fed317a..c183844869 100644 --- a/projects/hip/include/hcc_detail/AM.h +++ b/projects/hip/include/hcc_detail/AM.h @@ -78,6 +78,7 @@ am_status_t AM_free(void* ptr); */ am_status_t AM_copy(void* dst, const void* src, size_t size); + /** * Return information about tracked pointer. 
* @@ -92,10 +93,14 @@ am_status_t AM_copy(void* dst, const void* src, size_t size); */ am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, void *ptr); + +//TODO-doc +am_status_t am_memtracker_add(void* ptr, size_t sizeBytes, hc::accelerator acc, bool isDeviceMem=false); + + //TODO-doc am_status_t am_memtracker_update(void* ptr, int appId, unsigned allocationFlags); -am_status_t am_memtracker_add(void* ptr, size_t sizeBytes, hc::accelerator acc, bool isDeviceMem=false); /** * Remove the pointer from the tracker structure. @@ -109,16 +114,20 @@ am_status_t am_memtracker_remove(void* ptr); /** * Remove all memory allocations associated with specified accelerator. + * + * @returns Number of entries reset. */ size_t am_memtracker_reset(hc::accelerator acc); /** - * Prints the contents of the memory tracker table to stderr + * Prints info about the memory tracker table. * * Intended primarily for debug purposes. **/ void am_memtracker_print(); +void am_memtracker_sizeinfo(hc::accelerator acc, size_t *deviceMemSize, size_t *hostMemSize, size_t *userMemSize); + }; // namespace hc diff --git a/projects/hip/src/hc_AM.cpp b/projects/hip/src/hc_AM.cpp index 92310164c0..3a6d116261 100644 --- a/projects/hip/src/hc_AM.cpp +++ b/projects/hip/src/hc_AM.cpp @@ -63,13 +63,15 @@ public: void insert(void *pointer, const hc::AmPointerInfo &p); int remove(void *pointer); - MapTrackerType::iterator find(void *hostPtr); + MapTrackerType::iterator find(void *hostPtr) ; + + MapTrackerType::iterator readerLockBegin() { _mutex.lock(); return _tracker.begin(); } ; + MapTrackerType::iterator end() { return _tracker.end(); } ; + void readerUnlock() { _mutex.unlock(); }; - MapTrackerType::iterator end() { return _tracker.end(); }; size_t reset (hc::accelerator acc); - std::ostream & print (std::ostream &os); private: // TODO - use or remove. 
inline void writeLock(); @@ -115,6 +117,7 @@ AmPointerTracker::MapTrackerType::iterator AmPointerTracker::find (void *pointe } +#if 0 //--- std::ostream & AmPointerTracker::print (std::ostream &os) { @@ -126,6 +129,7 @@ std::ostream & AmPointerTracker::print (std::ostream &os) return os; } +#endif //--- // Remove all tracked locations, and free the associated memory (if the range was originally allocated by AM). @@ -326,7 +330,38 @@ am_status_t am_memtracker_remove(void* ptr) //--- void am_memtracker_print() { - g_amPointerTracker.print(std::cerr); + std::ostream &os = std::cerr; + + //g_amPointerTracker.print(std::cerr); + for (auto iter = g_amPointerTracker.readerLockBegin() ; iter != g_amPointerTracker.end(); iter++) { + os << " " << iter->first._basePointer << "..." << iter->first._endPointer << ":: "; + os << iter->second << std::endl; + } + + g_amPointerTracker.readerUnlock(); +} + + +//--- +void am_memtracker_sizeinfo(hc::accelerator acc, size_t *deviceMemSize, size_t *hostMemSize, size_t *userMemSize) +{ + *deviceMemSize = *hostMemSize = *userMemSize = 0; + for (auto iter = g_amPointerTracker.readerLockBegin() ; iter != g_amPointerTracker.end(); iter++) { + if (iter->second._acc == acc) { + size_t sizeBytes = iter->second._sizeBytes; + if (iter->second._isAmManaged) { + if (iter->second._isInDeviceMem) { + *deviceMemSize += sizeBytes; + } else { + *hostMemSize += sizeBytes; + } + } else { + *userMemSize += sizeBytes; + } + } + } + + g_amPointerTracker.readerUnlock(); } diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index e63186692c..fe273aa21c 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -1286,6 +1286,7 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr) hipError_t e = hipSuccess; +#if USE_AM_TRACKER hc::AmPointerInfo amPointerInfo; am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, ptr); if (status == AM_SUCCESS) { @@ -1309,11 +1310,15 @@ hipError_t 
hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr) e = hipErrorInvalidValue; } +#else + e = hipErrorInvalidValue; +#endif return ihipLogStatus(e); } +#if USE_AM_TRACKER // TODO - test this function: /** * @returns #hipSuccess, @@ -1342,6 +1347,7 @@ hipError_t hipHostGetDevicePointer(void **devicePointer, void *hostPointer, unsi return ihipLogStatus(e); } +#endif @@ -1438,7 +1444,9 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) if (sizeBytes && (*ptr == NULL)) { hip_status = hipErrorMemoryAllocation; } else { +#ifdef USE_AM_TRACKER hc::am_memtracker_update(*ptr, device->_device_index, 0); +#endif } } else { hip_status = hipErrorMemoryAllocation; @@ -1462,7 +1470,9 @@ hipError_t hipMallocHost(void** ptr, size_t sizeBytes) if (sizeBytes && (*ptr == NULL)) { hip_status = hipErrorMemoryAllocation; } else { +#ifdef USE_AM_TRACKER hc::am_memtracker_update(*ptr, device->_device_index, 0); +#endif } tprintf (TRACE_MEM, " %s: pinned ptr=%p\n", __func__, *ptr); @@ -1627,10 +1637,10 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes ) /* - * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue (if free != NULL due to bug) - * @bug - on hcc free always returns 50% of peak regardless of current allocations. hipMemGetInfo returns hipErrorInvalidValue to indicate this. + * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue (if free != NULL due to bug)S + * @warning On HCC, the free memory only accounts for memory allocated by this process and may be optimistic. 
*/ -hipError_t hipMemGetInfo ( size_t * free, size_t * total ) +hipError_t hipMemGetInfo (size_t *free, size_t *total) { std::call_once(hip_initialized, ihipInit); @@ -1643,17 +1653,22 @@ hipError_t hipMemGetInfo ( size_t * free, size_t * total ) } if (free) { - *free = hipDevice->_props.totalGlobalMem * 0.5; // TODO +#if USE_AM_TRACKER + // TODO - replace with kernel-level for reporting free memory: + size_t deviceMemSize, hostMemSize, userMemSize; + hc::am_memtracker_sizeinfo(hipDevice->_acc, &deviceMemSize, &hostMemSize, &userMemSize); + *free = hipDevice->_props.totalGlobalMem - deviceMemSize; +#else + *free = hipDevice->_props.totalGlobalMem * 0.5; // TODO e=hipErrorInvalidValue; +#endif } } else { e = hipErrorInvalidDevice; } - // TODO-runtime - when we fix the 50% bug. - //return ihipLogStatus(hipErrorSuccess); - return ihipLogStatus(hipErrorInvalidValue); + return ihipLogStatus(e); } diff --git a/projects/hip/tests/src/hipPointerAttrib.cpp b/projects/hip/tests/src/hipPointerAttrib.cpp index 93d503af65..1418997274 100644 --- a/projects/hip/tests/src/hipPointerAttrib.cpp +++ b/projects/hip/tests/src/hipPointerAttrib.cpp @@ -115,6 +115,11 @@ void testSimple() HIPCHECK ( hipMallocHost(&A_Pinned_h, Nbytes) ); A_OSAlloc_h = (char*)malloc(Nbytes); + size_t free, total; + HIPCHECK(hipMemGetInfo(&free, &total)); + printf ("hipMemGetInfo: free=%zu (%4.2f) Nbytes=%lu total=%zu (%4.2f)\n", free, (float)(free/1024.0/1024.0), Nbytes, total, (float)(total/1024.0/1024.0)); + HIPASSERT(free + Nbytes <= total); + hipPointerAttribute_t attribs; hipPointerAttribute_t attribs2; @@ -244,6 +249,10 @@ void clusterAllocs(int numAllocs, size_t minSize, size_t maxSize) //--- //Populate with device and host allocations. 
+ size_t totalDeviceAllocated[numDevices]; + for (int i =0; i=0; p-=bufferSize) { hc::am_memtracker_add(p, bufferSize, acc, false); } } if (removeDir == Up) { - for (char *p = basePtr; p=0; p-=bufferSize) { + hc::am_memtracker_remove(p); + } + } } } From c441d5ec29b8d4e59a41dedace57118165371e9a Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 12 Feb 2016 04:30:09 -0600 Subject: [PATCH 19/94] Step1 in staging buffer copy. - use StagingBuffer class for copies. - refactor g_device to use array rather than vector. (keeps pointers from moving). [ROCm/hip commit: 24c1fdb8643d25be2218fac99095c1ed0e4ea059] --- projects/hip/include/hcc_detail/AM.h | 8 +- projects/hip/src/hc_AM.cpp | 18 +-- projects/hip/src/hip_hcc.cpp | 200 +++++++++++++++++++++---- projects/hip/tests/src/hipMemcpy.cpp | 2 +- projects/hip/tests/src/test_common.cpp | 2 +- projects/hip/util/vim/hip.vim | 3 + 6 files changed, 188 insertions(+), 45 deletions(-) diff --git a/projects/hip/include/hcc_detail/AM.h b/projects/hip/include/hcc_detail/AM.h index c183844869..04804ffaa5 100644 --- a/projects/hip/include/hcc_detail/AM.h +++ b/projects/hip/include/hcc_detail/AM.h @@ -22,8 +22,8 @@ struct AmPointerInfo { bool _isInDeviceMem; ///< Memory is physically resident on a device (if false, memory is located on host) bool _isAmManaged; ///< Memory was allocated by AM and should be freed when am_reset is called. - int _appId; ///< App-specific storage. Used by HIP to store deviceID. - unsigned _appAllocationFlags; ///< App-specific allocation flags. Used by HIP to store allocation flags. + int _appId; ///< App-specific storage. (Used by HIP to store deviceID.) + unsigned _appAllocationFlags; ///< App-specific allocation flags. (Used by HIP to store allocation flags.) 
AmPointerInfo() {}; @@ -91,7 +91,7 @@ am_status_t AM_copy(void* dst, const void* src, size_t size); * * @see AM_memtracker_add, */ -am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, void *ptr); +am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, const void *ptr); //TODO-doc @@ -99,7 +99,7 @@ am_status_t am_memtracker_add(void* ptr, size_t sizeBytes, hc::accelerator acc, //TODO-doc -am_status_t am_memtracker_update(void* ptr, int appId, unsigned allocationFlags); +am_status_t am_memtracker_update(const void* ptr, int appId, unsigned allocationFlags); /** diff --git a/projects/hip/src/hc_AM.cpp b/projects/hip/src/hc_AM.cpp index 3a6d116261..2d22b49fd4 100644 --- a/projects/hip/src/hc_AM.cpp +++ b/projects/hip/src/hc_AM.cpp @@ -24,10 +24,10 @@ //#include struct AmMemoryRange { - void * _basePointer; - void * _endPointer; - AmMemoryRange(void *basePointer, size_t sizeBytes) : - _basePointer(basePointer), _endPointer((unsigned char*)basePointer + sizeBytes - 1) {}; + const void * _basePointer; + const void * _endPointer; + AmMemoryRange(const void *basePointer, size_t sizeBytes) : + _basePointer(basePointer), _endPointer((const unsigned char*)basePointer + sizeBytes - 1) {}; }; // Functor to compare ranges: @@ -63,7 +63,7 @@ public: void insert(void *pointer, const hc::AmPointerInfo &p); int remove(void *pointer); - MapTrackerType::iterator find(void *hostPtr) ; + MapTrackerType::iterator find(const void *hostPtr) ; MapTrackerType::iterator readerLockBegin() { _mutex.lock(); return _tracker.begin(); } ; MapTrackerType::iterator end() { return _tracker.end(); } ; @@ -107,7 +107,7 @@ int AmPointerTracker::remove (void *pointer) //--- -AmPointerTracker::MapTrackerType::iterator AmPointerTracker::find (void *pointer) +AmPointerTracker::MapTrackerType::iterator AmPointerTracker::find (const void *pointer) { // TODO-mutex- read lock std::lock_guard l (_mutex); @@ -144,7 +144,7 @@ size_t AmPointerTracker::reset (hc::accelerator acc) for (auto iter = 
_tracker.begin() ; iter != _tracker.end(); ) { if (iter->second._acc == acc) { if (iter->second._isAmManaged) { - hsa_memory_free(iter->first._basePointer); + hsa_memory_free(const_cast (iter->first._basePointer)); } count++; @@ -278,7 +278,7 @@ am_status_t AM_copy(void* dst, const void* src, size_t sizeBytes) } -am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, void *ptr) +am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, const void *ptr) { auto infoI = g_amPointerTracker.find(ptr); if (infoI != g_amPointerTracker.end()) { @@ -290,7 +290,7 @@ am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, void *ptr) } -am_status_t am_memtracker_update(void* ptr, int appId, unsigned allocationFlags) +am_status_t am_memtracker_update(const void* ptr, int appId, unsigned allocationFlags) { auto iter = g_amPointerTracker.find(ptr); if (iter != g_amPointerTracker.end()) { diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index fe273aa21c..08f7859271 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -43,7 +43,7 @@ THE SOFTWARE. #include "hc_AM.cpp" -#define USE_ASYNC_COPY 0 +#define USE_ASYNC_COPY 1 #define USE_AM_TRACKER 1 /* use new AM memory tracker features */ #define INLINE static inline @@ -60,10 +60,12 @@ static const int release = 1; int HIP_PRINT_ENV = 0; int HIP_TRACE_API= 0; int HIP_LAUNCH_BLOCKING = 0; +int HIP_STAGING_SIZE = 64; /* size of staging buffers, in KB */ -#define TRACE_API 0x1 /* trace API calls and return values */ -#define TRACE_SYNC 0x2 /* trace synchronization pieces */ -#define TRACE_MEM 0x4 /* trace memory allocation / deallocation */ +#define TRACE_API 0x1 /* trace API calls and return values */ +#define TRACE_SYNC 0x2 /* trace synchronization pieces */ +#define TRACE_MEM 0x4 /* trace memory allocation / deallocation */ +#define TRACE_COPY2 0x8 /* trace memory copy commands. Detailed. */ #define tprintf(trace_level, ...) 
{\ if (HIP_TRACE_API & trace_level) {\ @@ -119,6 +121,28 @@ struct ihipEvent_t { } ; +//------------------------------------------------------------------------------------------------- +struct StagingBuffer { + static const int numBuffers = 2; + + int _bufferIndex; // Operating on buffer 0 or 1? + + ihipDevice_t *_device; + size_t _bufferSize; // Size of the buffers. + + + StagingBuffer(ihipDevice_t *device, size_t bufferSize) ; + + void CopyHostToDevice(void* dst, const void* src, size_t sizeBytes); + +private: + char *_pinnedStagingBuffer[numBuffers]; + hsa_signal_t _completion_signal[numBuffers]; +}; + + + +//------------------------------------------------------------------------------------------------- struct ihipDevice_t { unsigned _device_index; // index into g_devices. @@ -135,8 +159,11 @@ struct ihipDevice_t unsigned _compute_units; + StagingBuffer *_staging_host2device; + StagingBuffer *_staging_device2host; + public: - ihipDevice_t(unsigned device_index, hc::accelerator acc); + void init(unsigned device_index, hc::accelerator acc); hipError_t getProperties(hipDeviceProp_t* prop); // TODO- create a copy constructor. 
@@ -145,10 +172,10 @@ public: //================================================================================================= -ihipDevice_t::ihipDevice_t(unsigned device_index, hc::accelerator acc) - : _device_index(device_index), - _acc(acc) +void ihipDevice_t::init(unsigned device_index, hc::accelerator acc) { + _device_index = device_index; + _acc = acc; hsa_agent_t *agent = static_cast (acc.get_default_view().get_hsa_agent()); if (agent) { int err = hsa_agent_get_info(*agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &_compute_units); @@ -166,6 +193,9 @@ ihipDevice_t::ihipDevice_t(unsigned device_index, hc::accelerator acc) _null_stream = new ihipStream_t(device_index, acc.get_default_view(), hipStreamDefault); this->_streams.push_back(_null_stream); tprintf(TRACE_SYNC, "created device with null_stream=%p\n", _null_stream); + + _staging_host2device = new StagingBuffer(this, HIP_STAGING_SIZE*1024); + _staging_device2host = NULL; }; #if 0 @@ -187,7 +217,8 @@ thread_local int tls_defaultDevice = 0; // Global initialization. std::once_flag hip_initialized; -std::vector g_devices; // Vector of all non-emulated (ie GPU) accelerators in the system. +ihipDevice_t *g_devices; // Array of all non-emulated (ie GPU) accelerators in the system. +unsigned g_deviceCnt; //================================================================================================= @@ -462,25 +493,36 @@ void ihipReadEnv_I(int *var_ptr, const char *var_name1, const char *var_name2, c //It is called with C++11 call_once, which provided thread-safety. void ihipInit() { - - /* - * Build a table of valid compute devices. 
- */ - auto accs = hc::accelerator::get_all(); - g_devices.reserve(accs.size()); - for (int i=0; i"); @@ -489,7 +531,7 @@ void ihipInit() INLINE bool ihipIsValidDevice(unsigned deviceIndex) { // deviceIndex is unsigned so always > 0 - return (deviceIndex < g_devices.size()); + return (deviceIndex < g_deviceCnt); } @@ -508,7 +550,7 @@ INLINE ihipDevice_t *ihipGetTlsDefaultDevice() //--- INLINE ihipDevice_t *ihipGetDevice(int deviceId) { - if ((deviceId >= 0) && (deviceId < g_devices.size())) { + if ((deviceId >= 0) && (deviceId < g_deviceCnt)) { return &g_devices[deviceId]; } else { return NULL; @@ -675,7 +717,7 @@ hipError_t hipGetDeviceCount(int *count) { std::call_once(hip_initialized, ihipInit); - *count = g_devices.size(); + *count = g_deviceCnt; if (*count > 0) { return ihipLogStatus(hipSuccess); @@ -764,7 +806,7 @@ hipError_t hipSetDevice(int device) { std::call_once(hip_initialized, ihipInit); - if ((device < 0) || (device >= g_devices.size())) { + if ((device < 0) || (device >= g_deviceCnt)) { return ihipLogStatus(hipErrorInvalidDevice); } else { tls_defaultDevice = device; @@ -1299,6 +1341,10 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr) attributes->allocationFlags = amPointerInfo._appAllocationFlags; attributes->device = amPointerInfo._appId; + if (attributes->device < 0) { + e = hipErrorInvalidDevice; + } + } else { attributes->memoryType = hipMemoryTypeDevice; @@ -1482,6 +1528,7 @@ hipError_t hipMallocHost(void** ptr, size_t sizeBytes) } +//--- hipError_t hipMemcpyToSymbol(const char* symbolName, const void *src, size_t count, size_t offset, hipMemcpyKind kind) { #ifdef USE_MEMCPYTOSYMBOL @@ -1500,6 +1547,102 @@ hipError_t hipMemcpyToSymbol(const char* symbolName, const void *src, size_t cou } +//------------------------------------------------------------------------------------------------- +StagingBuffer::StagingBuffer(ihipDevice_t *device, size_t bufferSize) : + _bufferIndex(0), + _device(device), + 
_bufferSize(bufferSize) +{ + for (int i=0; i_acc, amHostPinned); + if (_pinnedStagingBuffer[i] == NULL) { + throw; + } + hsa_signal_create(0, 0, NULL, &_completion_signal[i]); + } +}; + + +//--- +void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeBytes) { + const char *srcp = static_cast (src); + char *dstp = static_cast (dst); + + assert(sizeBytes < UINT64_MAX/2); // TODO + for (int64_t bytesRemaining=sizeBytes; bytesRemaining>0; bytesRemaining -= _bufferSize) { + + // TODO - double-buffer these guys. + size_t theseBytes = (bytesRemaining > _bufferSize) ? _bufferSize : bytesRemaining; + + tprintf (TRACE_COPY2, "copy %zu bytes %p to stagingBuf[%d]:%p\n", theseBytes, srcp, _bufferIndex, _pinnedStagingBuffer[_bufferIndex]); + + memcpy(_pinnedStagingBuffer[_bufferIndex], srcp, theseBytes); + + tprintf (TRACE_COPY2, "async_copy %zu bytes %p to %p\n", theseBytes, _pinnedStagingBuffer[_bufferIndex], dstp); + + hsa_signal_store_relaxed(_completion_signal[_bufferIndex], 1); + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _pinnedStagingBuffer[_bufferIndex], theseBytes, _device->_hsa_agent, 0, NULL, _completion_signal[_bufferIndex]); + + tprintf (TRACE_COPY2, "waiting... 
status=%d\n", hsa_status); + if (hsa_status == HSA_STATUS_SUCCESS) { + hsa_signal_wait_acquire(_completion_signal[_bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + } + + srcp += theseBytes; + dstp += theseBytes; + } +} + + + + +#if USE_AM_TRACKER +// TODO - add mutex to limit in/out: +void ihipAsyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) +{ + hc::AmPointerInfo dstPtrInfo, srcPtrInfo; + + bool dstNotTracked = (hc::am_memtracker_getinfo(&dstPtrInfo, dst) != AM_SUCCESS); + bool srcNotTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) != AM_SUCCESS); + + bool useStagingBuffer = true; + + // Resolve default to a specific Kind, since we use different algorithms: + if (kind == hipMemcpyDefault) { + bool dstIsHost = (dstNotTracked || dstPtrInfo._isInDeviceMem); + bool srcIsHost = (srcNotTracked || srcPtrInfo._isInDeviceMem); + if (srcIsHost && !dstIsHost) { + kind = hipMemcpyHostToDevice; + } else if (!srcIsHost && dstIsHost) { + kind = hipMemcpyDeviceToHost; + } else if (srcIsHost && dstIsHost) { + kind = hipMemcpyHostToHost; + } else if (srcIsHost && dstIsHost) { + kind = hipMemcpyDeviceToDevice; + } + } + + switch (kind) { + case hipMemcpyHostToDevice: + if (srcNotTracked) { + device->_staging_host2device->CopyHostToDevice(dst, src, sizeBytes); + } else { + assert(0); // TODO + //hsa_signal_wait_relaxed(completion_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + } + break; + case hipMemcpyDeviceToHost: + // TODO - optimize the copy here. + hc::AM_copy(dst, src, sizeBytes); + break; + default: + assert(0); // TODO + } +} +#endif + + + //--- hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) { @@ -1517,11 +1660,8 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind ihipDevice_t *device = &g_devices[stream->_device_index]; - hsa_signal_t completion_signal; // init/obtain from pool. 
+ ihipAsyncCopy(device, dst, src, sizeBytes, kind); - hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, src, size, device->_hsa_agent, 0, NULL, &completion_signal); - - e = (hsa_status == HSA_STATUS_SUCCESS) ? hipSuccess : hipErrorTbd; } else { e = hipErrorInvalidResourceHandle; } diff --git a/projects/hip/tests/src/hipMemcpy.cpp b/projects/hip/tests/src/hipMemcpy.cpp index 5db2b270d6..7664cfb581 100644 --- a/projects/hip/tests/src/hipMemcpy.cpp +++ b/projects/hip/tests/src/hipMemcpy.cpp @@ -30,7 +30,7 @@ int main(int argc, char *argv[]) size_t Nbytes = N*sizeof(int); - printf ("N=%zu \n", N); + printf ("N=%zu Nbytes=%6.2fMB\n", N, Nbytes/1024.0/1024.0); int *A_d, *B_d, *C_d; int *A_h, *B_h, *C_h; diff --git a/projects/hip/tests/src/test_common.cpp b/projects/hip/tests/src/test_common.cpp index 02deb51c85..3da5568b7c 100644 --- a/projects/hip/tests/src/test_common.cpp +++ b/projects/hip/tests/src/test_common.cpp @@ -88,7 +88,7 @@ int parseStandardArguments(int argc, char *argv[], bool failOnUndefinedArg) if (!strcmp(arg, " ")) { // skip NULL args. 
- } else if (!strcmp(arg, "--N")) { + } else if (!strcmp(arg, "--N") || (!strcmp(arg, "-N"))) { if (++i >= argc || !HipTest::parseSize(argv[i], &N)) { failed("Bad N size argument"); } diff --git a/projects/hip/util/vim/hip.vim b/projects/hip/util/vim/hip.vim index 01f3b3f2ad..e2236f4fbc 100644 --- a/projects/hip/util/vim/hip.vim +++ b/projects/hip/util/vim/hip.vim @@ -151,6 +151,9 @@ syn keyword hipFunctionName hipUnbindTexture syn keyword hipFlags hipFilterModePoint syn keyword hipFlags hipMemcpyHostToDevice syn keyword hipFlags hipMemcpyDeviceToHost +syn keyword hipFlags hipMemcpyHostToHost +syn keyword hipFlags hipMemcpyDeviceToDevice +syn keyword hipFlags hipMemcpyDefault syn keyword hipFlags hipReadModeElementType syn keyword hipFlags hipSuccess syn keyword hipFlags hipTextureType1D From 83eb66eb8ea87f4b7deb491ae79386efe23d7dc8 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 12 Feb 2016 17:39:44 -0600 Subject: [PATCH 20/94] Improve copy testing implementation. - add tests for (unpinned/pinned) x H2H x D2D. - Free memory at end of test. 
[ROCm/hip commit: 134d7975ce0710cdf5db0693010853a464a9dbd6] --- projects/hip/src/hip_hcc.cpp | 72 +++++++++++++----- projects/hip/tests/src/hipMemcpy.cpp | 105 ++++++++++++++++++++++++--- projects/hip/tests/src/test_common.h | 70 ++++++++++++++++-- projects/hip/util/vim/hip.vim | 1 + 4 files changed, 214 insertions(+), 34 deletions(-) diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 08f7859271..f397b02cbe 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -132,6 +132,7 @@ struct StagingBuffer { StagingBuffer(ihipDevice_t *device, size_t bufferSize) ; + ~StagingBuffer(); void CopyHostToDevice(void* dst, const void* src, size_t sizeBytes); @@ -163,6 +164,7 @@ struct ihipDevice_t StagingBuffer *_staging_device2host; public: + void reset(); void init(unsigned device_index, hc::accelerator acc); hipError_t getProperties(hipDeviceProp_t* prop); @@ -172,6 +174,17 @@ public: //================================================================================================= +// +//Reset the device - this is called from hipDeviceReset. +//Device may be reset multiple times, and may be reset after init. 
+void ihipDevice_t::reset() +{ + _staging_host2device = new StagingBuffer(this, HIP_STAGING_SIZE*1024); + _staging_device2host = NULL; +}; + + +//--- void ihipDevice_t::init(unsigned device_index, hc::accelerator acc) { _device_index = device_index; @@ -194,8 +207,7 @@ void ihipDevice_t::init(unsigned device_index, hc::accelerator acc) this->_streams.push_back(_null_stream); tprintf(TRACE_SYNC, "created device with null_stream=%p\n", _null_stream); - _staging_host2device = new StagingBuffer(this, HIP_STAGING_SIZE*1024); - _staging_device2host = NULL; + this->reset(); }; #if 0 @@ -205,6 +217,13 @@ ihipDevice_t::~ihipDevice_t() delete _null_stream; _null_stream = NULL; } + + if (_staging_device2host) { + delete _staging_device2host; + } + if (_staging_host2device){ + delete _staging_host2device; + } } #endif @@ -848,6 +867,7 @@ hipError_t hipDeviceReset(void) ihipDevice_t *device = ihipGetTlsDefaultDevice(); if (device) { am_memtracker_reset(device->_acc); + device->reset(); // re-allocate required resources. } #endif @@ -1562,6 +1582,18 @@ StagingBuffer::StagingBuffer(ihipDevice_t *device, size_t bufferSize) : } }; +//--- +StagingBuffer::~StagingBuffer() +{ + for (int i=0; i_staging_host2device->CopyHostToDevice(dst, src, sizeBytes); - } else { - assert(0); // TODO - //hsa_signal_wait_relaxed(completion_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - } - break; - case hipMemcpyDeviceToHost: - // TODO - optimize the copy here. - hc::AM_copy(dst, src, sizeBytes); - break; - default: - assert(0); // TODO + if ((kind == hipMemcpyHostToDevice) && (srcNotTracked)) { + if (useStagingBuffer) { + device->_staging_host2device->CopyHostToDevice(dst, src, sizeBytes); + } + } else if ((kind == hipMemcpyDeviceToHost) && (dstNotTracked)) { + // TODO - optimize the copy here. 
+ hc::AM_copy(dst, src, sizeBytes); + } else { + // Let HSA runtime handle it: + // TODO - need buffer pool for the signals: + hsa_signal_t completion_signal; + hsa_signal_create(1, 0, NULL, &completion_signal); + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, src, sizeBytes, device->_hsa_agent, 0, NULL, completion_signal); + + if (hsa_status == HSA_STATUS_SUCCESS) { + hsa_signal_wait_relaxed(completion_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + } + + hsa_signal_destroy(completion_signal); } } #endif @@ -1815,6 +1851,7 @@ hipError_t hipMemGetInfo (size_t *free, size_t *total) //--- hipError_t hipFree(void* ptr) { + // TODO - ensure this pointer was created by hipMalloc and not hipMallocHost std::call_once(hip_initialized, ihipInit); @@ -1831,6 +1868,7 @@ hipError_t hipFree(void* ptr) hipError_t hipFreeHost(void* ptr) { + // TODO - ensure this pointer was created by hipMallocHost and not hipMalloc std::call_once(hip_initialized, ihipInit); if (ptr) { diff --git a/projects/hip/tests/src/hipMemcpy.cpp b/projects/hip/tests/src/hipMemcpy.cpp index 7664cfb581..241c39c2ad 100644 --- a/projects/hip/tests/src/hipMemcpy.cpp +++ b/projects/hip/tests/src/hipMemcpy.cpp @@ -23,24 +23,21 @@ THE SOFTWARE. #include "test_common.h" - -int main(int argc, char *argv[]) +// Test simple H2D copies and back. 
+void simpleTest1() { - HipTest::parseStandardArguments(argc, argv, true); - + printf ("test: %s\n", __func__); size_t Nbytes = N*sizeof(int); - printf ("N=%zu Nbytes=%6.2fMB\n", N, Nbytes/1024.0/1024.0); int *A_d, *B_d, *C_d; int *A_h, *B_h, *C_h; - HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N); - + HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); + printf ("A_d=%p B_d=%p C_d=%p A_h=%p B_h=%p C_h=%p\n", A_d, B_d, C_d, A_h, B_d, C_h); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); - HIPCHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); HIPCHECK ( hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); @@ -50,8 +47,98 @@ int main(int argc, char *argv[]) HIPCHECK (hipDeviceSynchronize()); - HipTest::checkVectorADD(A_h, B_h, C_h, N); + + HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, false); + HIPCHECK (hipDeviceReset()); + + printf (" %s success\n", __func__); +} + + +// Test many different kinds of memory copies: + +template +void memcpyKind(bool usePinnedHost, bool useHostToHost, bool useMemkindDefault) +{ + printf ("test: %s\n", __func__); + + + T *A_d, *B_d, *C_d; + T *A_h, *B_h, *C_h; + + size_t Nbytes = N*sizeof(T); + + HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, usePinnedHost); + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); + + T *A_hh = NULL; + T *B_hh = NULL; + T *C_dd = NULL; + + // Allocate some extra arrays: + + HIPCHECK ( hipMalloc(&C_dd, Nbytes) ); + + + if (useHostToHost) { + if (usePinnedHost) { + HIPCHECK ( hipMallocHost(&A_hh, Nbytes) ); + HIPCHECK ( hipMallocHost(&B_hh, Nbytes) ); + } else { + A_hh = (T*)malloc(Nbytes); + B_hh = (T*)malloc(Nbytes); + } + + + // Do some extra host copies here to mix things up: + HIPCHECK ( hipMemcpy(A_hh, A_h, Nbytes, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); + HIPCHECK ( hipMemcpy(B_hh, B_h, Nbytes, useMemkindDefault? 
hipMemcpyDefault : hipMemcpyHostToHost)); + + + HIPCHECK ( hipMemcpy(A_d, A_hh, Nbytes, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(B_d, B_hh, Nbytes, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + } else { + HIPCHECK ( hipMemcpy(A_d, A_h, Nbytes, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(B_d, B_h, Nbytes, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + } + + hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N); + +#if 0 + // Do some extra host copies here to mix things up: + HIPCHECK ( hipMemcpy(C_dd, C_d, Nbytes, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); + + //Destroy the original C_d: + HIPCHECK ( hipMemset(C_d, 0x5A, Nbytes)); + + HIPCHECK ( hipMemcpy(C_h, C_dd, Nbytes, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); +#else + HIPCHECK ( hipMemcpy(C_h, C_d, Nbytes, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); +#endif + + HIPCHECK ( hipDeviceSynchronize() ); + HipTest::checkVectorADD(A_h, B_h, C_h, N); + + HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, usePinnedHost); + HIPCHECK ( hipDeviceReset() ); + + printf (" %s success\n", __func__); +} + + + +int main(int argc, char *argv[]) +{ + HipTest::parseStandardArguments(argc, argv, true); + + + simpleTest1(); + + memcpyKind(false, false, false); + memcpyKind(true, false, false); + //memcpyKind(true); + passed(); } diff --git a/projects/hip/tests/src/test_common.h b/projects/hip/tests/src/test_common.h index 57d2ebc831..5b631d2c3a 100644 --- a/projects/hip/tests/src/test_common.h +++ b/projects/hip/tests/src/test_common.h @@ -96,7 +96,7 @@ vectorADD(hipLaunchParm lp, template void initArrays(T **A_d, T **B_d, T **C_d, T **A_h, T **B_h, T **C_h, - size_t N) + size_t N, bool usePinnedHost=false) { size_t Nbytes = N*sizeof(T); @@ -110,14 +110,32 @@ void initArrays(T **A_d, T **B_d, T **C_d, HIPCHECK ( 
hipMalloc(C_d, Nbytes) ); } - if (A_h) - *A_h = (T*)malloc(Nbytes); - - if (B_h) - *B_h = (T*)malloc(Nbytes); + if (usePinnedHost) { + if (A_h) { + HIPCHECK ( hipMallocHost(A_h, Nbytes) ); + } + if (B_h) { + HIPCHECK ( hipMallocHost(B_h, Nbytes) ); + } + if (C_h) { + HIPCHECK ( hipMallocHost(C_h, Nbytes) ); + } + } else { + if (A_h) { + *A_h = (T*)malloc(Nbytes); + HIPASSERT(*A_h != NULL); + } + + if (B_h) { + *B_h = (T*)malloc(Nbytes); + HIPASSERT(*B_h != NULL); + } - if (C_h) - *C_h = (T*)malloc(Nbytes); + if (C_h) { + *C_h = (T*)malloc(Nbytes); + HIPASSERT(*C_h != NULL); + } + } // Initialize the host data: @@ -130,7 +148,43 @@ void initArrays(T **A_d, T **B_d, T **C_d, } +template +void freeArrays(T *A_d, T *B_d, T *C_d, + T *A_h, T *B_h, T *C_h, bool usePinnedHost) +{ + if (A_d) { + HIPCHECK ( hipFree(A_d) ); + } + if (B_d) { + HIPCHECK ( hipFree(B_d) ); + } + if (C_d) { + HIPCHECK ( hipFree(C_d) ); + } + if (usePinnedHost) { + if (A_h) { + HIPCHECK (hipFreeHost(A_h)); + } + if (B_h) { + HIPCHECK (hipFreeHost(B_h)); + } + if (C_h) { + HIPCHECK (hipFreeHost(C_h)); + } + } else { + if (A_h) { + free (A_h); + } + if (B_h) { + free (B_h); + } + if (C_h) { + free (C_h); + } + } + +} // Assumes C_h contains vector add of A_h + B_h diff --git a/projects/hip/util/vim/hip.vim b/projects/hip/util/vim/hip.vim index e2236f4fbc..e4ea0a4a9e 100644 --- a/projects/hip/util/vim/hip.vim +++ b/projects/hip/util/vim/hip.vim @@ -91,6 +91,7 @@ syn keyword hipFunctionName hipD3D9UnmapResources syn keyword hipFunctionName hipD3D9UnregisterResource syn keyword hipFunctionName hipDeviceGetProperties syn keyword hipFunctionName hipDeviceSynchronize +syn keyword hipFunctionName hipDeviceReset syn keyword hipFunctionName hipEventCreate syn keyword hipFunctionName hipEventDestroy syn keyword hipFunctionName hipEventElapsedTime From a835134f8eaac9676e74de260ee90d97b2b7984c Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 12 Feb 2016 18:23:55 -0600 Subject: [PATCH 21/94] Improve copy 
testing [ROCm/hip commit: 1bfd3cdbd07b6cd232e8e7f6ba837e3c04cc60e4] --- projects/hip/src/hip_hcc.cpp | 4 +- projects/hip/tests/src/hipMemcpy.cpp | 63 ++++++++++++++++++++-------- 2 files changed, 47 insertions(+), 20 deletions(-) diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index f397b02cbe..0b7c81a365 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -1641,8 +1641,8 @@ void ihipAsyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t size // Resolve default to a specific Kind, since we use different algorithms: if (kind == hipMemcpyDefault) { - bool dstIsHost = (dstNotTracked || dstPtrInfo._isInDeviceMem); - bool srcIsHost = (srcNotTracked || srcPtrInfo._isInDeviceMem); + bool dstIsHost = (dstNotTracked || !dstPtrInfo._isInDeviceMem); + bool srcIsHost = (srcNotTracked || !srcPtrInfo._isInDeviceMem); if (srcIsHost && !dstIsHost) { kind = hipMemcpyHostToDevice; } else if (!srcIsHost && dstIsHost) { diff --git a/projects/hip/tests/src/hipMemcpy.cpp b/projects/hip/tests/src/hipMemcpy.cpp index 241c39c2ad..0de1b0b7a0 100644 --- a/projects/hip/tests/src/hipMemcpy.cpp +++ b/projects/hip/tests/src/hipMemcpy.cpp @@ -22,6 +22,8 @@ THE SOFTWARE. #include "hip_runtime.h" #include "test_common.h" +//:w #include + // Test simple H2D copies and back. 
void simpleTest1() @@ -59,9 +61,12 @@ void simpleTest1() // Test many different kinds of memory copies: template -void memcpyKind(bool usePinnedHost, bool useHostToHost, bool useMemkindDefault) +void memcpytest2(bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault) { - printf ("test: %s\n", __func__); + printf ("test: %s<%s> usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d\n", + __func__, + typeid(T).name(), + usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); T *A_d, *B_d, *C_d; @@ -76,9 +81,6 @@ void memcpyKind(bool usePinnedHost, bool useHostToHost, bool useMemkindDefault) T *B_hh = NULL; T *C_dd = NULL; - // Allocate some extra arrays: - - HIPCHECK ( hipMalloc(&C_dd, Nbytes) ); if (useHostToHost) { @@ -91,7 +93,7 @@ void memcpyKind(bool usePinnedHost, bool useHostToHost, bool useMemkindDefault) } - // Do some extra host copies here to mix things up: + // Do some extra host-to-host copies here to mix things up: HIPCHECK ( hipMemcpy(A_hh, A_h, Nbytes, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); HIPCHECK ( hipMemcpy(B_hh, B_h, Nbytes, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); @@ -105,17 +107,19 @@ void memcpyKind(bool usePinnedHost, bool useHostToHost, bool useMemkindDefault) hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N); -#if 0 - // Do some extra host copies here to mix things up: - HIPCHECK ( hipMemcpy(C_dd, C_d, Nbytes, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); + if (useDeviceToDevice) { + HIPCHECK ( hipMalloc(&C_dd, Nbytes) ); - //Destroy the original C_d: - HIPCHECK ( hipMemset(C_d, 0x5A, Nbytes)); + // Do an extra device-to-device copies here to mix things up: + HIPCHECK ( hipMemcpy(C_dd, C_d, Nbytes, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice)); - HIPCHECK ( hipMemcpy(C_h, C_dd, Nbytes, useMemkindDefault? 
hipMemcpyDefault:hipMemcpyDeviceToHost)); -#else - HIPCHECK ( hipMemcpy(C_h, C_d, Nbytes, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); -#endif + //Destroy the original C_d: + HIPCHECK ( hipMemset(C_d, 0x5A, Nbytes)); + + HIPCHECK ( hipMemcpy(C_h, C_dd, Nbytes, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); + } else { + HIPCHECK ( hipMemcpy(C_h, C_d, Nbytes, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); + } HIPCHECK ( hipDeviceSynchronize() ); HipTest::checkVectorADD(A_h, B_h, C_h, N); @@ -127,6 +131,25 @@ void memcpyKind(bool usePinnedHost, bool useHostToHost, bool useMemkindDefault) } +template +void memcpytest2_loop() +{ + for (int usePinnedHost =0; usePinnedHost<=1; usePinnedHost++) { +#define USE_HOST_2_HOST +#ifdef USE_HOST_2_HOST + for (int useHostToHost =0; useHostToHost<=1; useHostToHost++) { // TODO +#else + for (int useHostToHost =0; useHostToHost<=0; useHostToHost++) { // TODO +#endif + for (int useDeviceToDevice =0; useDeviceToDevice<=1; useDeviceToDevice++) { + for (int useMemkindDefault =0; useMemkindDefault<=1; useMemkindDefault++) { + memcpytest2(usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); + } + } + } + } +} + int main(int argc, char *argv[]) { @@ -135,9 +158,13 @@ int main(int argc, char *argv[]) simpleTest1(); - memcpyKind(false, false, false); - memcpyKind(true, false, false); - //memcpyKind(true); + //memcpytest2(0/*usePinnedHost*/, 0/*useHostToHost*/, 0/*useDeviceToDevice*/, 1/*useMemkindDefault*/); + + memcpytest2_loop(); + memcpytest2_loop(); + memcpytest2_loop(); + memcpytest2_loop(); + passed(); From b21c2ecfa470b6cf0daf1f3332622c4ecead4420 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 12 Feb 2016 21:30:43 -0600 Subject: [PATCH 22/94] Add Bus Bandwidth test, leveraged from SHOC. 
[ROCm/hip commit: f3fd6476ebd57c8c511e6412ee5a035a04fd647f] --- .../1_Utils/hipBusBandwidth/LICENSE.txt | 27 + .../samples/1_Utils/hipBusBandwidth/Makefile | 16 + .../hipBusBandwidth/ResultDatabase.cpp | 520 ++++++++++++++++++ .../1_Utils/hipBusBandwidth/ResultDatabase.h | 100 ++++ .../hipBusBandwidth/hipBusBandwidth.cpp | 170 ++++++ 5 files changed, 833 insertions(+) create mode 100644 projects/hip/samples/1_Utils/hipBusBandwidth/LICENSE.txt create mode 100644 projects/hip/samples/1_Utils/hipBusBandwidth/Makefile create mode 100644 projects/hip/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp create mode 100644 projects/hip/samples/1_Utils/hipBusBandwidth/ResultDatabase.h create mode 100644 projects/hip/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp diff --git a/projects/hip/samples/1_Utils/hipBusBandwidth/LICENSE.txt b/projects/hip/samples/1_Utils/hipBusBandwidth/LICENSE.txt new file mode 100644 index 0000000000..5d0d603232 --- /dev/null +++ b/projects/hip/samples/1_Utils/hipBusBandwidth/LICENSE.txt @@ -0,0 +1,27 @@ + +Copyright (c) 2011, UT-Battelle, LLC +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. +* Neither the name of Oak Ridge National Laboratory, nor UT-Battelle, LLC, nor + the names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/projects/hip/samples/1_Utils/hipBusBandwidth/Makefile b/projects/hip/samples/1_Utils/hipBusBandwidth/Makefile new file mode 100644 index 0000000000..d233216313 --- /dev/null +++ b/projects/hip/samples/1_Utils/hipBusBandwidth/Makefile @@ -0,0 +1,16 @@ +HIP_PATH?=$(shell hipconfig -p) +HIPCC=$(HIP_PATH)/bin/hipcc + +EXE=hipBusBandwidth + +all: install + +$(EXE): hipBusBandwidth.cpp ResultDatabase.cpp + $(HIPCC) $^ -o $@ + +install: $(EXE) + cp $(EXE) $(HIP_PATH)/bin + + +clean: + rm -f *.o $(EXE) diff --git a/projects/hip/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp b/projects/hip/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp new file mode 100644 index 0000000000..f57aed11be --- /dev/null +++ b/projects/hip/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp @@ -0,0 +1,520 @@ +#include "ResultDatabase.h" + +#include +#include +#include +#include + +using namespace std; + +bool ResultDatabase::Result::operator<(const Result &rhs) const +{ + if (test < rhs.test) + return true; + if (test > rhs.test) + return false; + if (atts < rhs.atts) + return true; + if (atts > rhs.atts) + return false; + return false; // less-operator returns false on equal +} + +double ResultDatabase::Result::GetMin() 
const +{ + double r = FLT_MAX; + for (int i=0; i= 100) + return value[n-1]; + + double index = ((n + 1.) * q / 100.) - 1; + + vector sorted = value; + sort(sorted.begin(), sorted.end()); + + if (n == 2) + return (sorted[0] * (1 - q/100.) + sorted[1] * (q/100.)); + + int index_lo = int(index); + double frac = index - index_lo; + if (frac == 0) + return sorted[index_lo]; + + double lo = sorted[index_lo]; + double hi = sorted[index_lo + 1]; + return lo + (hi-lo)*frac; +} + +double ResultDatabase::Result::GetMean() const +{ + double r = 0; + for (int i=0; i &values) +{ + for (int i=0; i= results.size()) + { + Result r; + r.test = test; + r.atts = atts; + r.unit = unit; + results.push_back(r); + } + + results[index].value.push_back(value); +} + +// **************************************************************************** +// Method: ResultDatabase::DumpDetailed +// +// Purpose: +// Writes the full results, including all trials. +// +// Arguments: +// out where to print +// +// Programmer: Jeremy Meredith +// Creation: August 14, 2009 +// +// Modifications: +// Jeremy Meredith, Wed Nov 10 14:25:17 EST 2010 +// Renamed to DumpDetailed to make room for a DumpSummary. +// +// Jeremy Meredith, Thu Nov 11 11:39:57 EST 2010 +// Added note about (*) missing value tag. +// +// Jeremy Meredith, Tue Nov 23 13:57:02 EST 2010 +// Changed note about missing values to be worded a little better. +// +// **************************************************************************** +void ResultDatabase::DumpDetailed(ostream &out) +{ + vector sorted(results); + + sort(sorted.begin(), sorted.end()); + + int maxtrials = 1; + for (int i=0; i maxtrials) + maxtrials = sorted[i].value.size(); + } + + // TODO: in big parallel runs, the "trials" are the procs + // and we really don't want to print them all out.... 
+ out << "test\t" + << "atts\t" + << "units\t" + << "median\t" + << "mean\t" + << "stddev\t" + << "min\t" + << "max\t"; + for (int i=0; i sorted(results); + + sort(sorted.begin(), sorted.end()); + + out << std::fixed << right << std::setw(9) << std::setprecision(4); + + // TODO: in big parallel runs, the "trials" are the procs + // and we really don't want to print them all out.... + out << "test\t" + << "atts\t" + << "units\t" + << "median\t" + << "mean\t" + << "stddev\t" + << "min\t" + << "max\t"; + out << endl; + + for (int i=0; i sorted(results); + + sort(sorted.begin(), sorted.end()); + + //Check to see if the file is empty - if so, add the headers + emptyFile = this->IsFileEmpty(fileName); + + //Open file and append by default + ofstream out; + out.open(fileName.c_str(), std::ofstream::out | std::ofstream::app); + + //Add headers only for empty files + if(emptyFile) + { + // TODO: in big parallel runs, the "trials" are the procs + // and we really don't want to print them all out.... 
+ out << "test, " + << "atts, " + << "units, " + << "median, " + << "mean, " + << "stddev, " + << "min, " + << "max, "; + out << endl; + } + + for (int i=0; i +ResultDatabase::GetResultsForTest(const string &test) +{ + // get only the given test results + vector retval; + for (int i=0; i & +ResultDatabase::GetResults() const +{ + return results; +} diff --git a/projects/hip/samples/1_Utils/hipBusBandwidth/ResultDatabase.h b/projects/hip/samples/1_Utils/hipBusBandwidth/ResultDatabase.h new file mode 100644 index 0000000000..4b63a02a1f --- /dev/null +++ b/projects/hip/samples/1_Utils/hipBusBandwidth/ResultDatabase.h @@ -0,0 +1,100 @@ +#ifndef RESULT_DATABASE_H +#define RESULT_DATABASE_H + +#include +#include +#include +#include +#include +using std::string; +using std::vector; +using std::ostream; +using std::ofstream; +using std::ifstream; + + +// **************************************************************************** +// Class: ResultDatabase +// +// Purpose: +// Track numerical results as they are generated. +// Print statistics of raw results. +// +// Programmer: Jeremy Meredith +// Creation: June 12, 2009 +// +// Modifications: +// Jeremy Meredith, Wed Nov 10 14:20:47 EST 2010 +// Split timing reports into detailed and summary. E.g. for serial code, +// we might report all trial values, but skip them in parallel. +// +// Jeremy Meredith, Thu Nov 11 11:40:18 EST 2010 +// Added check for missing value tag. +// +// Jeremy Meredith, Mon Nov 22 13:37:10 EST 2010 +// Added percentile statistic. +// +// Jeremy Meredith, Fri Dec 3 16:30:31 EST 2010 +// Added a method to extract a subset of results based on test name. Also, +// the Result class is now public, so that clients can use them directly. +// Added a GetResults method as well, and made several functions const. +// +// **************************************************************************** +class ResultDatabase +{ + public: + // + // A performance result for a single SHOC benchmark run. 
+ // + struct Result + { + string test; // e.g. "readback" + string atts; // e.g. "pagelocked 4k^2" + string unit; // e.g. "MB/sec" + vector value; // e.g. "837.14" + double GetMin() const; + double GetMax() const; + double GetMedian() const; + double GetPercentile(double q) const; + double GetMean() const; + double GetStdDev() const; + + bool operator<(const Result &rhs) const; + + bool HadAnyFLTMAXValues() const + { + for (int i=0; i= FLT_MAX) + return true; + } + return false; + } + }; + + protected: + vector results; + + public: + void AddResult(const string &test, + const string &atts, + const string &unit, + double value); + void AddResults(const string &test, + const string &atts, + const string &unit, + const vector &values); + vector GetResultsForTest(const string &test); + const vector &GetResults() const; + void ClearAllResults(); + void DumpDetailed(ostream&); + void DumpSummary(ostream&); + void DumpCsv(string fileName); + + private: + bool IsFileEmpty(string fileName); + +}; + + +#endif diff --git a/projects/hip/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp b/projects/hip/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp new file mode 100644 index 0000000000..8481476fc8 --- /dev/null +++ b/projects/hip/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp @@ -0,0 +1,170 @@ +#include +#include +#include + +#include "ResultDatabase.h" + +// Cmdline parms: +const bool p_verbose = false; +const bool p_pinned = true; +const unsigned int p_iters = 10; + +#define CHECK_HIP_ERROR() \ +{ \ + hipError_t err = hipGetLastError(); \ + if (err != hipSuccess) \ + { \ + printf("error=%d name=%s at " \ + "ln: %d\n ",err,hipGetErrorString(err),__LINE__); \ + exit(EXIT_FAILURE); \ + } \ +} + + +// **************************************************************************** +// Function: runBenchmark +// +// Purpose: +// Measures the bandwidth of the bus connecting the host processor to the +// OpenCL device. 
This benchmark repeatedly transfers data chunks of various +// sizes across the bus to the OpenCL device, and calculates the bandwidth. +// +// +// Arguments: +// +// Returns: nothing +// +// Programmer: Jeremy Meredith +// Creation: September 08, 2009 +// +// Modifications: +// Jeremy Meredith, Wed Dec 1 17:05:27 EST 2010 +// Added calculation of latency estimate. +// Ben Sander - moved to standalone test +// +// **************************************************************************** +void RunBenchmark(ResultDatabase &resultDB) +{ + // Sizes are in kb + int nSizes = 20; + int sizes[20] = {1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384, + 32768,65536,131072,262144,524288}; + long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; + + // Create some host memory pattern + float *hostMem = NULL; + if (p_pinned) + { + hipMallocHost((void**)&hostMem, sizeof(float) * numMaxFloats); + while (hipGetLastError() != hipSuccess) + { + // drop the size and try again + if (p_verbose) std::cout << " - dropping size allocating pinned mem\n"; + --nSizes; + if (nSizes < 1) + { + std::cerr << "Error: Couldn't allocated any pinned buffer\n"; + return; + } + numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; + hipMallocHost((void**)&hostMem, sizeof(float) * numMaxFloats); + } + } + else + { + hostMem = new float[numMaxFloats]; + } + + for (int i = 0; i < numMaxFloats; i++) + { + hostMem[i] = i % 77; + } + + float *device; + hipMalloc((void**)&device, sizeof(float) * numMaxFloats); + while (hipGetLastError() != hipSuccess) + { + // drop the size and try again + if (p_verbose) std::cout << " - dropping size allocating device mem\n"; + --nSizes; + if (nSizes < 1) + { + std::cerr << "Error: Couldn't allocated any device buffer\n"; + return; + } + numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; + hipMalloc((void**)&device, sizeof(float) * numMaxFloats); + } + + + hipEvent_t start, stop; + hipEventCreate(&start); + hipEventCreate(&stop); + CHECK_HIP_ERROR(); + + // Three passes, forward 
and backward both + for (int pass = 0; pass < p_iters; pass++) + { + // store the times temporarily to estimate latency + //float times[nSizes]; + // Step through sizes forward on even passes and backward on odd + for (int i = 0; i < nSizes; i++) + { + int sizeIndex; + if ((pass % 2) == 0) + sizeIndex = i; + else + sizeIndex = (nSizes - 1) - i; + + int nbytes = sizes[sizeIndex] * 1024; + + hipEventRecord(start, 0); + hipMemcpy(device, hostMem, nbytes, hipMemcpyHostToDevice); + hipEventRecord(stop, 0); + hipEventSynchronize(stop); + float t = 0; + hipEventElapsedTime(&t, start, stop); + //times[sizeIndex] = t; + + // Convert to GB/sec + if (p_verbose) + { + std::cerr << "size " << sizes[sizeIndex] << "k took " << t << + " ms\n"; + } + + double speed = (double(sizes[sizeIndex]) * 1024. / (1000*1000)) / t; + char sizeStr[256]; + sprintf(sizeStr, "% 7dkB", sizes[sizeIndex]); + resultDB.AddResult("DownloadSpeed", sizeStr, "GB/sec", speed); + resultDB.AddResult("DownloadTime", sizeStr, "ms", t); + } + } + + // Cleanup + hipFree((void*)device); + CHECK_HIP_ERROR(); + if (p_pinned) + { + hipFreeHost((void*)hostMem); + CHECK_HIP_ERROR(); + } + else + { + delete[] hostMem; + } + hipEventDestroy(start); + hipEventDestroy(stop); +} + + + +int main(int argc, char *argv[]) +{ + ResultDatabase resultDB; + RunBenchmark(resultDB); + + resultDB.DumpSummary(std::cout); + + resultDB.DumpDetailed(std::cout); +} From 05e9d398f4f38b0b6dacb9afb98deacd58b0bc6a Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 12 Feb 2016 22:46:34 -0600 Subject: [PATCH 23/94] Add D2H test [ROCm/hip commit: 559db057d54589ee46effa8140a051932729d347] --- .../hipBusBandwidth/hipBusBandwidth.cpp | 235 +++++++++++++++++- 1 file changed, 226 insertions(+), 9 deletions(-) diff --git a/projects/hip/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp b/projects/hip/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp index 8481476fc8..c908fa655e 100644 --- 
a/projects/hip/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp +++ b/projects/hip/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp @@ -5,9 +5,15 @@ #include "ResultDatabase.h" // Cmdline parms: -const bool p_verbose = false; -const bool p_pinned = true; -const unsigned int p_iters = 10; +bool p_verbose = false; +bool p_pinned = true; +int p_iterations = 10; +int p_device = 0; +int p_detailed = 0; + +bool p_h2d = true; +bool p_d2h = true; + #define CHECK_HIP_ERROR() \ { \ @@ -43,7 +49,7 @@ const unsigned int p_iters = 10; // Ben Sander - moved to standalone test // // **************************************************************************** -void RunBenchmark(ResultDatabase &resultDB) +void RunBenchmark_H2D(ResultDatabase &resultDB) { // Sizes are in kb int nSizes = 20; @@ -51,6 +57,8 @@ void RunBenchmark(ResultDatabase &resultDB) 32768,65536,131072,262144,524288}; long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; + hipSetDevice(p_device); + // Create some host memory pattern float *hostMem = NULL; if (p_pinned) @@ -103,7 +111,7 @@ void RunBenchmark(ResultDatabase &resultDB) CHECK_HIP_ERROR(); // Three passes, forward and backward both - for (int pass = 0; pass < p_iters; pass++) + for (int pass = 0; pass < p_iterations; pass++) { // store the times temporarily to estimate latency //float times[nSizes]; @@ -158,13 +166,222 @@ void RunBenchmark(ResultDatabase &resultDB) } +void RunBenchmark_D2H(ResultDatabase &resultDB) +{ + + // Sizes are in kb + int nSizes = 20; + int sizes[20] = {1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384, + 32768,65536,131072,262144,524288}; + long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; + + // Create some host memory pattern + float *hostMem1; + float *hostMem2; + if (p_pinned) + { + hipMallocHost((void**)&hostMem1, sizeof(float)*numMaxFloats); + hipError_t err1 = hipGetLastError(); + hipMallocHost((void**)&hostMem2, sizeof(float)*numMaxFloats); + hipError_t err2 = hipGetLastError(); + while (err1 != 
hipSuccess || err2 != hipSuccess) + { + // free the first buffer if only the second failed + if (err1 == hipSuccess) + hipFreeHost((void*)hostMem1); + + // drop the size and try again + if (p_verbose) std::cout << " - dropping size allocating pinned mem\n"; + --nSizes; + if (nSizes < 1) + { + std::cerr << "Error: Couldn't allocated any pinned buffer\n"; + return; + } + numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; + hipMallocHost((void**)&hostMem1, sizeof(float)*numMaxFloats); + err1 = hipGetLastError(); + hipMallocHost((void**)&hostMem2, sizeof(float)*numMaxFloats); + err2 = hipGetLastError(); + } + } + else + { + hostMem1 = new float[numMaxFloats]; + hostMem2 = new float[numMaxFloats]; + } + for (int i=0; i= argc || !parseInt(argv[i], &p_iterations)) { + failed("Bad iterations argument"); + } + } else if (!strcmp(arg, "--device") || (!strcmp(arg, "-d"))) { + if (++i >= argc || !parseInt(argv[i], &p_device)) { + failed("Bad device argument"); + } + } else if (!strcmp(arg, "--unpinned")) { + p_pinned = 0; + } else if (!strcmp(arg, "--h2d")) { + p_h2d = true; + p_d2h = false; + + } else if (!strcmp(arg, "--d2h")) { + p_h2d = false; + p_d2h = true; + + } else if (!strcmp(arg, "--help") || (!strcmp(arg, "-h"))) { + help(); + + } else if (!strcmp(arg, "--verbose")) { + p_verbose = 1; + } else if (!strcmp(arg, "--detailed")) { + p_detailed = 1; + } else { + failed("Bad argument '%s'", arg); + } + } + + return 0; +}; + + int main(int argc, char *argv[]) { - ResultDatabase resultDB; - RunBenchmark(resultDB); + parseStandardArguments(argc, argv); - resultDB.DumpSummary(std::cout); + if (p_h2d) { + ResultDatabase resultDB; + RunBenchmark_H2D(resultDB); - resultDB.DumpDetailed(std::cout); + resultDB.DumpSummary(std::cout); + + if (p_detailed) { + resultDB.DumpDetailed(std::cout); + } + } + + if (p_d2h) { + ResultDatabase resultDB; + RunBenchmark_D2H(resultDB); + + resultDB.DumpSummary(std::cout); + + if (p_detailed) { + resultDB.DumpDetailed(std::cout); + } + } } From 
2498ca10d1d0302a5a8018b88ada13f2e54726bd Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 12 Feb 2016 22:47:26 -0600 Subject: [PATCH 24/94] Add D2H test [ROCm/hip commit: bcb5953d6efaf92b71c5b8a9c152585bac1fcff8] --- .../samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/projects/hip/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp b/projects/hip/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp index c908fa655e..b847f8db40 100644 --- a/projects/hip/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp +++ b/projects/hip/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp @@ -144,8 +144,8 @@ void RunBenchmark_H2D(ResultDatabase &resultDB) double speed = (double(sizes[sizeIndex]) * 1024. / (1000*1000)) / t; char sizeStr[256]; sprintf(sizeStr, "% 7dkB", sizes[sizeIndex]); - resultDB.AddResult("DownloadSpeed", sizeStr, "GB/sec", speed); - resultDB.AddResult("DownloadTime", sizeStr, "ms", t); + resultDB.AddResult("H2D_Bandwidth", sizeStr, "GB/sec", speed); + resultDB.AddResult("H2D_Time", sizeStr, "ms", t); } } @@ -273,8 +273,8 @@ void RunBenchmark_D2H(ResultDatabase &resultDB) double speed = (double(sizes[sizeIndex]) * 1024. 
/ (1000*1000)) / t; char sizeStr[256]; sprintf(sizeStr, "% 7dkB", sizes[sizeIndex]); - resultDB.AddResult("ReadbackSpeed", sizeStr, "GB/sec", speed); - resultDB.AddResult("ReadbackTime", sizeStr, "ms", t); + resultDB.AddResult("D2H_Bandwidth", sizeStr, "GB/sec", speed); + resultDB.AddResult("D2H_Time", sizeStr, "ms", t); } //resultDB.AddResult("ReadbackLatencyEstimate", "1-2kb", "ms", times[0]-(times[1]-times[0])/1.); //resultDB.AddResult("ReadbackLatencyEstimate", "1-4kb", "ms", times[0]-(times[2]-times[0])/3.); From 7a633dc4b8a082632e1b439da5ce5f4bd346216d Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Sat, 13 Feb 2016 01:14:01 -0600 Subject: [PATCH 25/94] Result formatting [ROCm/hip commit: c3720c19a8e09b551bd55613d3f31d4ac40c63bc] --- .../samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/projects/hip/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp b/projects/hip/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp index f57aed11be..7d2f3aef84 100644 --- a/projects/hip/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp +++ b/projects/hip/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp @@ -278,13 +278,16 @@ void ResultDatabase::DumpSummary(ostream &out) { vector sorted(results); + int testW = 15 ; + const int fieldW = 9; + sort(sorted.begin(), sorted.end()); - out << std::fixed << right << std::setw(9) << std::setprecision(4); + out << std::fixed << right << std::setprecision(4); // TODO: in big parallel runs, the "trials" are the procs // and we really don't want to print them all out.... 
- out << "test\t" + out << setw(testW) << "test\t" << setw(fieldW) << "atts\t" << "units\t" << "median\t" @@ -297,7 +300,7 @@ void ResultDatabase::DumpSummary(ostream &out) for (int i=0; i Date: Sat, 13 Feb 2016 01:15:23 -0600 Subject: [PATCH 26/94] D2H multi-buffer [ROCm/hip commit: b314777bc10e17274d91a1fa75e8706187f71266] --- projects/hip/src/hip_hcc.cpp | 138 +++++++++++++++++++++------ projects/hip/tests/src/hipMemcpy.cpp | 136 ++++++++++++++++++++------ 2 files changed, 212 insertions(+), 62 deletions(-) diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 0b7c81a365..4921a61c72 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -31,7 +31,6 @@ THE SOFTWARE. #include #include #include -#include #include #include @@ -61,6 +60,7 @@ int HIP_PRINT_ENV = 0; int HIP_TRACE_API= 0; int HIP_LAUNCH_BLOCKING = 0; int HIP_STAGING_SIZE = 64; /* size of staging buffers, in KB */ +int HIP_STAGING_DOUBLE_BUFFER = 1; #define TRACE_API 0x1 /* trace API calls and return values */ #define TRACE_SYNC 0x2 /* trace synchronization pieces */ @@ -123,22 +123,23 @@ struct ihipEvent_t { //------------------------------------------------------------------------------------------------- struct StagingBuffer { - static const int numBuffers = 2; - - int _bufferIndex; // Operating on buffer 0 or 1? - - ihipDevice_t *_device; - size_t _bufferSize; // Size of the buffers. + static const int _numBuffers = 2; - StagingBuffer(ihipDevice_t *device, size_t bufferSize) ; + + StagingBuffer(ihipDevice_t *device, size_t bufferSize, bool doubleBuffer) ; ~StagingBuffer(); + void CopyDeviceToHost(void* dst, const void* src, size_t sizeBytes); void CopyHostToDevice(void* dst, const void* src, size_t sizeBytes); private: - char *_pinnedStagingBuffer[numBuffers]; - hsa_signal_t _completion_signal[numBuffers]; + ihipDevice_t *_device; + size_t _bufferSize; // Size of the buffers. 
+ bool _double_buffer; + + char *_pinnedStagingBuffer[_numBuffers]; + hsa_signal_t _completion_signal[_numBuffers]; }; @@ -179,7 +180,7 @@ public: //Device may be reset multiple times, and may be reset after init. void ihipDevice_t::reset() { - _staging_host2device = new StagingBuffer(this, HIP_STAGING_SIZE*1024); + _staging_host2device = new StagingBuffer(this, HIP_STAGING_SIZE*1024, HIP_STAGING_DOUBLE_BUFFER); _staging_device2host = NULL; }; @@ -519,6 +520,7 @@ void ihipInit() READ_ENV_I(release, HIP_TRACE_API, 0, "Trace each HIP API call. Print function name and return code to stderr as program executes."); READ_ENV_I(release, HIP_LAUNCH_BLOCKING, CUDA_LAUNCH_BLOCKING, "Make HIP APIs 'host-synchronous', so they block until any kernel launches or data copy commands complete. Alias: CUDA_LAUNCH_BLOCKING." ); READ_ENV_I(release, HIP_STAGING_SIZE, 0, "Size of staging buffer, in KB" ); + READ_ENV_I(release, HIP_STAGING_DOUBLE_BUFFER, 0, "Double-buffer copies to device" ); /* * Build a table of valid compute devices. 
@@ -1568,12 +1570,13 @@ hipError_t hipMemcpyToSymbol(const char* symbolName, const void *src, size_t cou //------------------------------------------------------------------------------------------------- -StagingBuffer::StagingBuffer(ihipDevice_t *device, size_t bufferSize) : - _bufferIndex(0), +StagingBuffer::StagingBuffer(ihipDevice_t *device, size_t bufferSize, bool doubleBuffer) : _device(device), - _bufferSize(bufferSize) + _bufferSize(bufferSize), + _double_buffer(doubleBuffer) { - for (int i=0; i_acc, amHostPinned); if (_pinnedStagingBuffer[i] == NULL) { throw; @@ -1585,7 +1588,7 @@ StagingBuffer::StagingBuffer(ihipDevice_t *device, size_t bufferSize) : //--- StagingBuffer::~StagingBuffer() { - for (int i=0; i (src); char *dstp = static_cast (dst); - assert(sizeBytes < UINT64_MAX/2); // TODO - for (int64_t bytesRemaining=sizeBytes; bytesRemaining>0; bytesRemaining -= _bufferSize) { + for (int i=0; i<_numBuffers; i++) { + hsa_signal_store_relaxed(_completion_signal[i], 0); + } + + assert(sizeBytes < UINT64_MAX/2); // TODO + int bufferIndex = 0; + for (int64_t bytesRemaining=sizeBytes; bytesRemaining>0 ; bytesRemaining -= _bufferSize) { - // TODO - double-buffer these guys. size_t theseBytes = (bytesRemaining > _bufferSize) ? _bufferSize : bytesRemaining; - tprintf (TRACE_COPY2, "copy %zu bytes %p to stagingBuf[%d]:%p\n", theseBytes, srcp, _bufferIndex, _pinnedStagingBuffer[_bufferIndex]); + tprintf (TRACE_COPY2, "waiting... on completion signal\n"); + hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - memcpy(_pinnedStagingBuffer[_bufferIndex], srcp, theseBytes); + tprintf (TRACE_COPY2, "copy %zu bytes %p to stagingBuf[%d]:%p\n", theseBytes, srcp, bufferIndex, _pinnedStagingBuffer[bufferIndex]); + // TODO - use uncached memcpy, someday. 
+ memcpy(_pinnedStagingBuffer[bufferIndex], srcp, theseBytes); - tprintf (TRACE_COPY2, "async_copy %zu bytes %p to %p\n", theseBytes, _pinnedStagingBuffer[_bufferIndex], dstp); + tprintf (TRACE_COPY2, "async_copy %zu bytes %p to %p\n", theseBytes, _pinnedStagingBuffer[bufferIndex], dstp); - hsa_signal_store_relaxed(_completion_signal[_bufferIndex], 1); - hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _pinnedStagingBuffer[_bufferIndex], theseBytes, _device->_hsa_agent, 0, NULL, _completion_signal[_bufferIndex]); + hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _pinnedStagingBuffer[bufferIndex], theseBytes, _device->_hsa_agent, 0, NULL, _completion_signal[bufferIndex]); - tprintf (TRACE_COPY2, "waiting... status=%d\n", hsa_status); - if (hsa_status == HSA_STATUS_SUCCESS) { - hsa_signal_wait_acquire(_completion_signal[_bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - } + assert(hsa_status == HSA_STATUS_SUCCESS); // TODO - throw srcp += theseBytes; dstp += theseBytes; + if (_double_buffer) { + bufferIndex = (bufferIndex + 1) % _numBuffers; + } } + + + for (int i=0; i<_numBuffers; i++) { + hsa_signal_wait_acquire(_completion_signal[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + } +} + +//--- +void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeBytes) +{ + const char *srcp0 = static_cast (src); + char *dstp1 = static_cast (dst); + + int numBuffers = _double_buffer ? _numBuffers : 1; + + for (int i=0; i 0) { + // First launch the async copies to copy from dest to host + for (int bufferIndex = 0; (bytesRemaining0>0) && (bufferIndex < numBuffers); bytesRemaining0 -= _bufferSize, bufferIndex++) { + + size_t theseBytes = (bytesRemaining0 > _bufferSize) ? 
_bufferSize : bytesRemaining0; + + tprintf (TRACE_COPY2, "D2H: async_copy %zu bytes src:%p to staging:%p\n", theseBytes, srcp0, _pinnedStagingBuffer[bufferIndex]); + hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); + hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], srcp0, theseBytes, _device->_hsa_agent, 0, NULL, _completion_signal[bufferIndex]); + assert(hsa_status == HSA_STATUS_SUCCESS); // TODO - throw + + srcp0 += theseBytes; + } + + // Now unload the staging buffers: + for (int bufferIndex=0; (bytesRemaining1>0) && (bufferIndex < numBuffers); bytesRemaining1 -= _bufferSize, bufferIndex++) { + + size_t theseBytes = (bytesRemaining1 > _bufferSize) ? _bufferSize : bytesRemaining1; + + tprintf (TRACE_COPY2, "D2H: wait_completion[%d] bytesRemaining=%zu\n", bufferIndex, bytesRemaining1); + hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + + tprintf (TRACE_COPY2, "D2H: copy %zu bytes stagingBuf[%d]:%p to dst:%p\n", theseBytes, bufferIndex, _pinnedStagingBuffer[bufferIndex], dstp1); + memcpy(dstp1, _pinnedStagingBuffer[bufferIndex], theseBytes); + + dstp1 += theseBytes; + } + } + + + //for (int i=0; i<_numBuffers; i++) { + // hsa_signal_wait_acquire(_completion_signal[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + //} } @@ -1657,10 +1725,18 @@ void ihipAsyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t size if ((kind == hipMemcpyHostToDevice) && (srcNotTracked)) { if (useStagingBuffer) { device->_staging_host2device->CopyHostToDevice(dst, src, sizeBytes); + } else { + hc::AM_copy(dst, src, sizeBytes); } } else if ((kind == hipMemcpyDeviceToHost) && (dstNotTracked)) { - // TODO - optimize the copy here. 
- hc::AM_copy(dst, src, sizeBytes); + if (useStagingBuffer) { + device->_staging_host2device->CopyDeviceToHost(dst, src, sizeBytes); + } else { + hc::AM_copy(dst, src, sizeBytes); + } + } else if (kind == hipMemcpyHostToHost) { + memcpy(dst, src, sizeBytes); + } else { // Let HSA runtime handle it: // TODO - need buffer pool for the signals: diff --git a/projects/hip/tests/src/hipMemcpy.cpp b/projects/hip/tests/src/hipMemcpy.cpp index 0de1b0b7a0..509f4a1177 100644 --- a/projects/hip/tests/src/hipMemcpy.cpp +++ b/projects/hip/tests/src/hipMemcpy.cpp @@ -22,8 +22,11 @@ THE SOFTWARE. #include "hip_runtime.h" #include "test_common.h" -//:w #include +void printSep() +{ + printf ("======================================================================================\n"); +} // Test simple H2D copies and back. void simpleTest1() @@ -61,21 +64,22 @@ void simpleTest1() // Test many different kinds of memory copies: template -void memcpytest2(bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault) +void memcpytest2(size_t numElements, bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault) { - printf ("test: %s<%s> usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d\n", + size_t sizeElements = numElements * sizeof(T); + printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d\n", __func__, typeid(T).name(), + sizeElements, sizeElements/1024.0/1024.0, usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); T *A_d, *B_d, *C_d; T *A_h, *B_h, *C_h; - size_t Nbytes = N*sizeof(T); - HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, usePinnedHost); - unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); + HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, numElements, usePinnedHost); + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); T *A_hh = 
NULL; T *B_hh = NULL; @@ -85,44 +89,44 @@ void memcpytest2(bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, if (useHostToHost) { if (usePinnedHost) { - HIPCHECK ( hipMallocHost(&A_hh, Nbytes) ); - HIPCHECK ( hipMallocHost(&B_hh, Nbytes) ); + HIPCHECK ( hipMallocHost(&A_hh, sizeElements) ); + HIPCHECK ( hipMallocHost(&B_hh, sizeElements) ); } else { - A_hh = (T*)malloc(Nbytes); - B_hh = (T*)malloc(Nbytes); + A_hh = (T*)malloc(sizeElements); + B_hh = (T*)malloc(sizeElements); } // Do some extra host-to-host copies here to mix things up: - HIPCHECK ( hipMemcpy(A_hh, A_h, Nbytes, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); - HIPCHECK ( hipMemcpy(B_hh, B_h, Nbytes, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); + HIPCHECK ( hipMemcpy(A_hh, A_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); + HIPCHECK ( hipMemcpy(B_hh, B_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); - HIPCHECK ( hipMemcpy(A_d, A_hh, Nbytes, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); - HIPCHECK ( hipMemcpy(B_d, B_hh, Nbytes, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(A_d, A_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(B_d, B_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); } else { - HIPCHECK ( hipMemcpy(A_d, A_h, Nbytes, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); - HIPCHECK ( hipMemcpy(B_d, B_h, Nbytes, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(A_d, A_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(B_d, B_h, sizeElements, useMemkindDefault ? 
hipMemcpyDefault : hipMemcpyHostToDevice)); } - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N); + hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements); if (useDeviceToDevice) { - HIPCHECK ( hipMalloc(&C_dd, Nbytes) ); + HIPCHECK ( hipMalloc(&C_dd, sizeElements) ); // Do an extra device-to-device copies here to mix things up: - HIPCHECK ( hipMemcpy(C_dd, C_d, Nbytes, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice)); + HIPCHECK ( hipMemcpy(C_dd, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice)); //Destroy the original C_d: - HIPCHECK ( hipMemset(C_d, 0x5A, Nbytes)); + HIPCHECK ( hipMemset(C_d, 0x5A, sizeElements)); - HIPCHECK ( hipMemcpy(C_h, C_dd, Nbytes, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); + HIPCHECK ( hipMemcpy(C_h, C_dd, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); } else { - HIPCHECK ( hipMemcpy(C_h, C_d, Nbytes, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); + HIPCHECK ( hipMemcpy(C_h, C_d, sizeElements, useMemkindDefault? 
hipMemcpyDefault:hipMemcpyDeviceToHost)); } HIPCHECK ( hipDeviceSynchronize() ); - HipTest::checkVectorADD(A_h, B_h, C_h, N); + HipTest::checkVectorADD(A_h, B_h, C_h, numElements); HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, usePinnedHost); HIPCHECK ( hipDeviceReset() ); @@ -132,8 +136,10 @@ void memcpytest2(bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, template -void memcpytest2_loop() +void memcpytest2_loop(size_t numElements) { + printSep(); + for (int usePinnedHost =0; usePinnedHost<=1; usePinnedHost++) { #define USE_HOST_2_HOST #ifdef USE_HOST_2_HOST @@ -143,7 +149,7 @@ void memcpytest2_loop() #endif for (int useDeviceToDevice =0; useDeviceToDevice<=1; useDeviceToDevice++) { for (int useMemkindDefault =0; useMemkindDefault<=1; useMemkindDefault++) { - memcpytest2(usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); + memcpytest2(numElements, usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); } } } @@ -151,20 +157,88 @@ void memcpytest2_loop() } +template +void memcpytest2_sizes(size_t maxElem=0, size_t offset=0) +{ + printSep(); + printf ("test: %s<%s>\n", __func__, typeid(T).name()); + + int deviceId; + HIPCHECK(hipGetDevice(&deviceId)); + + size_t free, total; + HIPCHECK(hipMemGetInfo(&free, &total)); + + if (maxElem == 0) { + maxElem = free/sizeof(T)/5; + } + + printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB offset=%lu\n", + deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0, offset); + + for (size_t elem=64; elem+offset<=maxElem; elem*=2) { + memcpytest2(elem+offset, 0, 1, 1, 0); // unpinned host + memcpytest2(elem+offset, 1, 1, 1, 0); // pinned host + } +} + + +template +void multiThread_1(bool serialize) +{ + printSep(); + printf ("test: %s<%s> serialize=%d\n", __func__, typeid(T).name(), serialize); + std::thread t1 (memcpytest2,N, 0,0,0,0); + if (serialize) { + t1.join(); + } + + + 
std::thread t2 (memcpytest2,N, 0,0,0,0); + if (serialize) { + t2.join(); + } + + if (!serialize) { + t1.join(); + t2.join(); + } +} + + + int main(int argc, char *argv[]) { HipTest::parseStandardArguments(argc, argv, true); - simpleTest1(); + if (p_tests & 0x1) { + simpleTest1(); + } - //memcpytest2(0/*usePinnedHost*/, 0/*useHostToHost*/, 0/*useDeviceToDevice*/, 1/*useMemkindDefault*/); + if (p_tests & 0x2) { + memcpytest2_loop(N); + memcpytest2_loop(N); + memcpytest2_loop(N); + memcpytest2_loop(N); + } - memcpytest2_loop(); - memcpytest2_loop(); - memcpytest2_loop(); - memcpytest2_loop(); + if (p_tests & 0x4) { + printSep(); + memcpytest2_sizes(0,0); + printSep(); + memcpytest2_sizes(0,64); + printSep(); + memcpytest2_sizes(1024*1024, 13); + printSep(); + memcpytest2_sizes(1024*1024, 50); + } + if (p_tests & 0x8) { + printSep(); + multiThread_1(true); + multiThread_1(false); + } passed(); From 565d0fb96861d9de59ce57c28acbdc1323a52702 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Sat, 13 Feb 2016 03:17:42 -0600 Subject: [PATCH 27/94] Enable -O3, style points on array size [ROCm/hip commit: 928996fec7158a782e71f6e5d11a885bb3ad720d] --- projects/hip/samples/1_Utils/hipBusBandwidth/Makefile | 3 ++- .../hip/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/projects/hip/samples/1_Utils/hipBusBandwidth/Makefile b/projects/hip/samples/1_Utils/hipBusBandwidth/Makefile index d233216313..77a92fb1a6 100644 --- a/projects/hip/samples/1_Utils/hipBusBandwidth/Makefile +++ b/projects/hip/samples/1_Utils/hipBusBandwidth/Makefile @@ -2,11 +2,12 @@ HIP_PATH?=$(shell hipconfig -p) HIPCC=$(HIP_PATH)/bin/hipcc EXE=hipBusBandwidth +CXXFLAGS = -O3 -g all: install $(EXE): hipBusBandwidth.cpp ResultDatabase.cpp - $(HIPCC) $^ -o $@ + $(HIPCC) $(CXXFLAGS) $^ -o $@ install: $(EXE) cp $(EXE) $(HIP_PATH)/bin diff --git a/projects/hip/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp 
b/projects/hip/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp index b847f8db40..d276725921 100644 --- a/projects/hip/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp +++ b/projects/hip/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp @@ -52,9 +52,9 @@ bool p_d2h = true; void RunBenchmark_H2D(ResultDatabase &resultDB) { // Sizes are in kb - int nSizes = 20; - int sizes[20] = {1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384, - 32768,65536,131072,262144,524288}; + int sizes[] = {1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384, 32768,65536,131072,262144,524288}; + int nSizes = sizeof(sizes) / sizeof(int); + long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; hipSetDevice(p_device); From 9e60e519d42fe4b95e582ce57180b50ee94e94ee Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Sat, 13 Feb 2016 03:18:01 -0600 Subject: [PATCH 28/94] Add multi-threading synchonization on staging buffers and signals. Also pre-allocate a couple signals for copies. [ROCm/hip commit: 6420655dc893eff1b9b27ca965be818847c90ec1] --- projects/hip/src/hip_hcc.cpp | 85 +++++++++++++++------------- projects/hip/tests/src/hipMemcpy.cpp | 20 ++++--- 2 files changed, 60 insertions(+), 45 deletions(-) diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 4921a61c72..4f95320ac3 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -60,7 +60,7 @@ int HIP_PRINT_ENV = 0; int HIP_TRACE_API= 0; int HIP_LAUNCH_BLOCKING = 0; int HIP_STAGING_SIZE = 64; /* size of staging buffers, in KB */ -int HIP_STAGING_DOUBLE_BUFFER = 1; +int HIP_STAGING_BUFFERS = 2; #define TRACE_API 0x1 /* trace API calls and return values */ #define TRACE_SYNC 0x2 /* trace synchronization pieces */ @@ -123,11 +123,10 @@ struct ihipEvent_t { //------------------------------------------------------------------------------------------------- struct StagingBuffer { - static const int _numBuffers = 2; + static const int _max_buffers = 4; - - StagingBuffer(ihipDevice_t 
*device, size_t bufferSize, bool doubleBuffer) ; + StagingBuffer(ihipDevice_t *device, size_t bufferSize, int numBuffers) ; ~StagingBuffer(); void CopyDeviceToHost(void* dst, const void* src, size_t sizeBytes); @@ -136,10 +135,10 @@ struct StagingBuffer { private: ihipDevice_t *_device; size_t _bufferSize; // Size of the buffers. - bool _double_buffer; + int _numBuffers; - char *_pinnedStagingBuffer[_numBuffers]; - hsa_signal_t _completion_signal[_numBuffers]; + char *_pinnedStagingBuffer[_max_buffers]; + hsa_signal_t _completion_signal[_max_buffers]; }; @@ -161,8 +160,9 @@ struct ihipDevice_t unsigned _compute_units; - StagingBuffer *_staging_host2device; - StagingBuffer *_staging_device2host; + hsa_signal_t _copy_signal; // signal to use for copies + std::mutex _copy_lock[2]; // mutex for each direction. + StagingBuffer *_staging_buffer[2]; // one buffer for each direction. public: void reset(); @@ -170,7 +170,7 @@ public: hipError_t getProperties(hipDeviceProp_t* prop); // TODO- create a copy constructor. - //~ihipDevice_t(); + ~ihipDevice_t(); }; @@ -180,8 +180,8 @@ public: //Device may be reset multiple times, and may be reset after init. 
void ihipDevice_t::reset() { - _staging_host2device = new StagingBuffer(this, HIP_STAGING_SIZE*1024, HIP_STAGING_DOUBLE_BUFFER); - _staging_device2host = NULL; + _staging_buffer[0] = new StagingBuffer(this, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS); + _staging_buffer[1] = new StagingBuffer(this, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS); }; @@ -208,10 +208,13 @@ void ihipDevice_t::init(unsigned device_index, hc::accelerator acc) this->_streams.push_back(_null_stream); tprintf(TRACE_SYNC, "created device with null_stream=%p\n", _null_stream); + hsa_signal_create(0, 0, NULL, &_copy_signal); + this->reset(); }; -#if 0 +#if 1 +// TODO-remove #ifdef ihipDevice_t::~ihipDevice_t() { if (_null_stream) { @@ -219,12 +222,12 @@ ihipDevice_t::~ihipDevice_t() _null_stream = NULL; } - if (_staging_device2host) { - delete _staging_device2host; - } - if (_staging_host2device){ - delete _staging_host2device; + for (int i=0; i<2; i++) { + if (_staging_buffer[i]) { + delete _staging_buffer[i]; + } } + hsa_signal_destroy(_copy_signal); } #endif @@ -519,8 +522,8 @@ void ihipInit() READ_ENV_I(release, HIP_PRINT_ENV, 0, "Print HIP environment variables."); READ_ENV_I(release, HIP_TRACE_API, 0, "Trace each HIP API call. Print function name and return code to stderr as program executes."); READ_ENV_I(release, HIP_LAUNCH_BLOCKING, CUDA_LAUNCH_BLOCKING, "Make HIP APIs 'host-synchronous', so they block until any kernel launches or data copy commands complete. Alias: CUDA_LAUNCH_BLOCKING." ); - READ_ENV_I(release, HIP_STAGING_SIZE, 0, "Size of staging buffer, in KB" ); - READ_ENV_I(release, HIP_STAGING_DOUBLE_BUFFER, 0, "Double-buffer copies to device" ); + READ_ENV_I(release, HIP_STAGING_SIZE, 0, "Size of each staging buffer (in KB)." ); + READ_ENV_I(release, HIP_STAGING_BUFFERS, 0, "Number of staging buffers to use in each direction."); /* * Build a table of valid compute devices. 
@@ -1570,11 +1573,14 @@ hipError_t hipMemcpyToSymbol(const char* symbolName, const void *src, size_t cou //------------------------------------------------------------------------------------------------- -StagingBuffer::StagingBuffer(ihipDevice_t *device, size_t bufferSize, bool doubleBuffer) : +StagingBuffer::StagingBuffer(ihipDevice_t *device, size_t bufferSize, int numBuffers) : _device(device), _bufferSize(bufferSize), - _double_buffer(doubleBuffer) + _numBuffers(numBuffers > _max_buffers ? _max_buffers : numBuffers) { + + + for (int i=0; i<_numBuffers; i++) { // TODO - experiment with alignment here. _pinnedStagingBuffer[i] = hc::AM_alloc(_bufferSize, device->_acc, amHostPinned); @@ -1630,8 +1636,8 @@ void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeByte srcp += theseBytes; dstp += theseBytes; - if (_double_buffer) { - bufferIndex = (bufferIndex + 1) % _numBuffers; + if (++bufferIndex >= _numBuffers) { + bufferIndex = 0; } } @@ -1647,9 +1653,7 @@ void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeByte const char *srcp0 = static_cast (src); char *dstp1 = static_cast (dst); - int numBuffers = _double_buffer ? _numBuffers : 1; - - for (int i=0; i 0) { // First launch the async copies to copy from dest to host - for (int bufferIndex = 0; (bytesRemaining0>0) && (bufferIndex < numBuffers); bytesRemaining0 -= _bufferSize, bufferIndex++) { + for (int bufferIndex = 0; (bytesRemaining0>0) && (bufferIndex < _numBuffers); bytesRemaining0 -= _bufferSize, bufferIndex++) { size_t theseBytes = (bytesRemaining0 > _bufferSize) ? 
_bufferSize : bytesRemaining0; @@ -1673,7 +1677,7 @@ void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeByte } // Now unload the staging buffers: - for (int bufferIndex=0; (bytesRemaining1>0) && (bufferIndex < numBuffers); bytesRemaining1 -= _bufferSize, bufferIndex++) { + for (int bufferIndex=0; (bytesRemaining1>0) && (bufferIndex < _numBuffers); bytesRemaining1 -= _bufferSize, bufferIndex++) { size_t theseBytes = (bytesRemaining1 > _bufferSize) ? _bufferSize : bytesRemaining1; @@ -1705,7 +1709,7 @@ void ihipAsyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t size bool dstNotTracked = (hc::am_memtracker_getinfo(&dstPtrInfo, dst) != AM_SUCCESS); bool srcNotTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) != AM_SUCCESS); - bool useStagingBuffer = true; + bool useStagingBuffer = true; // TODO - remove when new copy bakes a bit. // Resolve default to a specific Kind, since we use different algorithms: if (kind == hipMemcpyDefault) { @@ -1724,31 +1728,36 @@ void ihipAsyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t size if ((kind == hipMemcpyHostToDevice) && (srcNotTracked)) { if (useStagingBuffer) { - device->_staging_host2device->CopyHostToDevice(dst, src, sizeBytes); + std::lock_guard l (device->_copy_lock[0]); + device->_staging_buffer[0]->CopyHostToDevice(dst, src, sizeBytes); } else { hc::AM_copy(dst, src, sizeBytes); } } else if ((kind == hipMemcpyDeviceToHost) && (dstNotTracked)) { if (useStagingBuffer) { - device->_staging_host2device->CopyDeviceToHost(dst, src, sizeBytes); + std::lock_guard l (device->_copy_lock[1]); + device->_staging_buffer[1]->CopyDeviceToHost(dst, src, sizeBytes); } else { hc::AM_copy(dst, src, sizeBytes); } } else if (kind == hipMemcpyHostToHost) { - memcpy(dst, src, sizeBytes); + memcpy(dst, src, sizeBytes); // TODO - not async. 
} else { // Let HSA runtime handle it: // TODO - need buffer pool for the signals: - hsa_signal_t completion_signal; - hsa_signal_create(1, 0, NULL, &completion_signal); - hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, src, sizeBytes, device->_hsa_agent, 0, NULL, completion_signal); + + device->_copy_lock[1].lock(); + + hsa_signal_store_relaxed(device->_copy_signal, 1); + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, src, sizeBytes, device->_hsa_agent, 0, NULL, device->_copy_signal); if (hsa_status == HSA_STATUS_SUCCESS) { - hsa_signal_wait_relaxed(completion_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + hsa_signal_wait_relaxed(device->_copy_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); } - hsa_signal_destroy(completion_signal); + device->_copy_lock[1].unlock(); + } } #endif diff --git a/projects/hip/tests/src/hipMemcpy.cpp b/projects/hip/tests/src/hipMemcpy.cpp index 509f4a1177..3502b81e9d 100644 --- a/projects/hip/tests/src/hipMemcpy.cpp +++ b/projects/hip/tests/src/hipMemcpy.cpp @@ -129,7 +129,6 @@ void memcpytest2(size_t numElements, bool usePinnedHost, bool useHostToHost, boo HipTest::checkVectorADD(A_h, B_h, C_h, numElements); HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, usePinnedHost); - HIPCHECK ( hipDeviceReset() ); printf (" %s success\n", __func__); } @@ -177,24 +176,26 @@ void memcpytest2_sizes(size_t maxElem=0, size_t offset=0) deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0, offset); for (size_t elem=64; elem+offset<=maxElem; elem*=2) { + HIPCHECK ( hipDeviceReset() ); memcpytest2(elem+offset, 0, 1, 1, 0); // unpinned host + HIPCHECK ( hipDeviceReset() ); memcpytest2(elem+offset, 1, 1, 1, 0); // pinned host } } template -void multiThread_1(bool serialize) +void multiThread_1(bool serialize, bool usePinnedHost) { printSep(); - printf ("test: %s<%s> serialize=%d\n", __func__, typeid(T).name(), serialize); - 
std::thread t1 (memcpytest2,N, 0,0,0,0); + printf ("test: %s<%s> serialize=%d usePinnedHost=%d\n", __func__, typeid(T).name(), serialize, usePinnedHost); + std::thread t1 (memcpytest2,N, usePinnedHost,0,0,0); if (serialize) { t1.join(); } - std::thread t2 (memcpytest2,N, 0,0,0,0); + std::thread t2 (memcpytest2,N, usePinnedHost,0,0,0); if (serialize) { t2.join(); } @@ -213,10 +214,12 @@ int main(int argc, char *argv[]) if (p_tests & 0x1) { + HIPCHECK ( hipDeviceReset() ); simpleTest1(); } if (p_tests & 0x2) { + HIPCHECK ( hipDeviceReset() ); memcpytest2_loop(N); memcpytest2_loop(N); memcpytest2_loop(N); @@ -224,6 +227,7 @@ int main(int argc, char *argv[]) } if (p_tests & 0x4) { + HIPCHECK ( hipDeviceReset() ); printSep(); memcpytest2_sizes(0,0); printSep(); @@ -235,9 +239,11 @@ int main(int argc, char *argv[]) } if (p_tests & 0x8) { + HIPCHECK ( hipDeviceReset() ); printSep(); - multiThread_1(true); - multiThread_1(false); + multiThread_1(true, true); + multiThread_1(false, true); + multiThread_1(false, false); // TODO } passed(); From 189e652a5e5d35916f3d06c32631717e4b523956 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Mon, 15 Feb 2016 13:16:05 +0300 Subject: [PATCH 29/94] Formatting, no functional changes. [ROCm/hip commit: 072d649d8ddb2675144539ff7a568cf8bd0f1da6] --- .../hip/samples/1_Utils/hipInfo/hipInfo.cpp | 55 ++++++++----------- 1 file changed, 22 insertions(+), 33 deletions(-) diff --git a/projects/hip/samples/1_Utils/hipInfo/hipInfo.cpp b/projects/hip/samples/1_Utils/hipInfo/hipInfo.cpp index bff2114f96..de73aababd 100644 --- a/projects/hip/samples/1_Utils/hipInfo/hipInfo.cpp +++ b/projects/hip/samples/1_Utils/hipInfo/hipInfo.cpp @@ -41,9 +41,9 @@ THE SOFTWARE. 
#define HIPCHECK(error) \ if (error != hipSuccess) { \ - printf("%serror: '%s'(%d) at %s:%d%s\n", \ - KRED,hipGetErrorString(error), error,\ - __FILE__, __LINE__,KNRM); \ + printf("%serror: '%s'(%d) at %s:%d%s\n", \ + KRED, hipGetErrorString(error), error,\ + __FILE__, __LINE__,KNRM);\ failed("API returned error code.");\ } @@ -53,12 +53,11 @@ void printCompilerInfo () printf ("compiler: hcc version=%s, workweek (YYWWD) = %u\n", __hcc_version__, __hcc_workweek__); #endif #ifdef __NVCC__ - printf ("compiler: nvcc\n"); + printf ("compiler: nvcc\n"); #endif } - -double bytesToGB(size_t s) +double bytesToGB(size_t s) { return (double)s / (1024.0*1024.0*1024.0); } @@ -66,7 +65,6 @@ double bytesToGB(size_t s) void printDeviceProp (int deviceId) { using namespace std; - const int w1 = 30; cout << left; @@ -77,33 +75,27 @@ void printDeviceProp (int deviceId) hipDeviceProp_t props; HIPCHECK(hipDeviceGetProperties(&props, deviceId)); - cout << setw(w1) << "Name: " << props.name << endl; cout << setw(w1) << "multiProcessorCount: " << props.multiProcessorCount << endl; cout << setw(w1) << "clockRate: " << (float)props.clockRate / 1000.0 << " Mhz" << endl; cout << setw(w1) << "clockInstructionRate: " << (float)props.clockInstructionRate / 1000.0<< " Mhz" << endl; - cout << setw(w1) << "totalGlobalMem" << fixed << setprecision(2) << bytesToGB(props.totalGlobalMem) << " GB" << endl; - - cout << setw(w1) << "sharedMemPerBlock" << (float)props.sharedMemPerBlock / 1024.0 << " KB" << endl; - cout << setw(w1) << "regsPerBlock" << props.regsPerBlock << endl; - cout << setw(w1) << "warpSize" << props.warpSize << endl; - cout << setw(w1) << "maxThreadsPerBlock" << props.maxThreadsPerBlock << endl; - cout << setw(w1) << "maxThreadsDim.x" << props.maxThreadsDim[0] << endl; - cout << setw(w1) << "maxThreadsDim.y" << props.maxThreadsDim[1] << endl; - cout << setw(w1) << "maxThreadsDim.z" << props.maxThreadsDim[2] << endl; - - cout << setw(w1) << "maxGridSize.x" << props.maxGridSize[0] << 
endl; - cout << setw(w1) << "maxGridSize.y" << props.maxGridSize[1] << endl; - cout << setw(w1) << "maxGridSize.z" << props.maxGridSize[2] << endl; - - - cout << setw(w1) << "totalConstMem" << props.totalConstMem << endl; - cout << setw(w1) << "major" << props.major << endl; - cout << setw(w1) << "minor" << props.minor << endl; - cout << setw(w1) << "l2CacheSize" << props.l2CacheSize << endl; - cout << setw(w1) << "maxThreadsPerMultiProcessor" << props.maxThreadsPerMultiProcessor << endl; - cout << setw(w1) << "computeMode" << props.computeMode << endl; - + cout << setw(w1) << "totalGlobalMem" << fixed << setprecision(2) << bytesToGB(props.totalGlobalMem) << " GB" << endl; + cout << setw(w1) << "sharedMemPerBlock" << (float)props.sharedMemPerBlock / 1024.0 << " KB" << endl; + cout << setw(w1) << "regsPerBlock" << props.regsPerBlock << endl; + cout << setw(w1) << "warpSize" << props.warpSize << endl; + cout << setw(w1) << "maxThreadsPerBlock" << props.maxThreadsPerBlock << endl; + cout << setw(w1) << "maxThreadsDim.x" << props.maxThreadsDim[0] << endl; + cout << setw(w1) << "maxThreadsDim.y" << props.maxThreadsDim[1] << endl; + cout << setw(w1) << "maxThreadsDim.z" << props.maxThreadsDim[2] << endl; + cout << setw(w1) << "maxGridSize.x" << props.maxGridSize[0] << endl; + cout << setw(w1) << "maxGridSize.y" << props.maxGridSize[1] << endl; + cout << setw(w1) << "maxGridSize.z" << props.maxGridSize[2] << endl; + cout << setw(w1) << "totalConstMem" << props.totalConstMem << endl; + cout << setw(w1) << "major" << props.major << endl; + cout << setw(w1) << "minor" << props.minor << endl; + cout << setw(w1) << "l2CacheSize" << props.l2CacheSize << endl; + cout << setw(w1) << "maxThreadsPerMultiProcessor" << props.maxThreadsPerMultiProcessor << endl; + cout << setw(w1) << "computeMode" << props.computeMode << endl; cout << setw(w1) << "arch.hasGlobalInt32Atomics" << props.arch.hasGlobalInt32Atomics << endl; cout << setw(w1) << "arch.hasGlobalFloatAtomicExch" << 
props.arch.hasGlobalFloatAtomicExch << endl; cout << setw(w1) << "arch.hasSharedInt32Atomics" << props.arch.hasSharedInt32Atomics << endl; @@ -121,17 +113,14 @@ void printDeviceProp (int deviceId) cout << setw(w1) << "arch.hasSurfaceFuncs" << props.arch.hasSurfaceFuncs << endl; cout << setw(w1) << "arch.has3dGrid" << props.arch.has3dGrid << endl; cout << setw(w1) << "arch.hasDynamicParallelism" << props.arch.hasDynamicParallelism << endl; - cout << endl; size_t free, total; - hipMemGetInfo(&free, &total); cout << fixed << setprecision(2); cout << setw(w1) << "memInfo.total " << bytesToGB(total) << " GB" << endl; cout << setw(w1) << "memInfo.free " << bytesToGB(free) << " GB (" << setprecision(0) << (float)free/total * 100.0 << "%)" << endl; - } int main(int argc, char *argv[]) From 9ccdbdca713c1ae60b755eef14ff4f3f94254091 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 15 Feb 2016 05:19:52 -0600 Subject: [PATCH 30/94] Fix tests to account for multi-gpu [ROCm/hip commit: 3ecd1b153953f987abd93a2ea51d947505820a6a] --- projects/hip/tests/src/hipMemcpy.cpp | 3 +++ projects/hip/tests/src/hipPointerAttrib.cpp | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/projects/hip/tests/src/hipMemcpy.cpp b/projects/hip/tests/src/hipMemcpy.cpp index 3502b81e9d..b76f98c687 100644 --- a/projects/hip/tests/src/hipMemcpy.cpp +++ b/projects/hip/tests/src/hipMemcpy.cpp @@ -212,6 +212,9 @@ int main(int argc, char *argv[]) { HipTest::parseStandardArguments(argc, argv, true); + printf ("info: set device to %d\n", p_gpuDevice); + HIPCHECK(hipSetDevice(p_gpuDevice)); + if (p_tests & 0x1) { HIPCHECK ( hipDeviceReset() ); diff --git a/projects/hip/tests/src/hipPointerAttrib.cpp b/projects/hip/tests/src/hipPointerAttrib.cpp index 1418997274..12856da984 100644 --- a/projects/hip/tests/src/hipPointerAttrib.cpp +++ b/projects/hip/tests/src/hipPointerAttrib.cpp @@ -289,6 +289,7 @@ void clusterAllocs(int numAllocs, size_t minSize, size_t maxSize) for (int i =0; i Date: Mon, 15 
Feb 2016 05:40:12 -0600 Subject: [PATCH 31/94] Update docs, cleanup [ROCm/hip commit: 4637e19da4d09e69416135422f1a2215244d55d2] --- projects/hip/include/hcc_detail/AM.h | 31 ++++++++++-- projects/hip/src/hc_AM.cpp | 76 ++++------------------------ 2 files changed, 38 insertions(+), 69 deletions(-) diff --git a/projects/hip/include/hcc_detail/AM.h b/projects/hip/include/hcc_detail/AM.h index 04804ffaa5..40d9ea8382 100644 --- a/projects/hip/include/hcc_detail/AM.h +++ b/projects/hip/include/hcc_detail/AM.h @@ -79,6 +79,7 @@ am_status_t AM_free(void* ptr); am_status_t AM_copy(void* dst, const void* src, size_t size); + /** * Return information about tracked pointer. * @@ -94,11 +95,23 @@ am_status_t AM_copy(void* dst, const void* src, size_t size); am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, const void *ptr); -//TODO-doc +/** + * Adds a pointer to the memory tracker. + * + * @return AM_SUCCESS + * @see am_memtracker_getinfo + */ am_status_t am_memtracker_add(void* ptr, size_t sizeBytes, hc::accelerator acc, bool isDeviceMem=false); -//TODO-doc +/* + * Updates infor for an existing pointer in the memory tracker. + * + * @returns AM_ERROR_MISC if pointer is not found in tracker. + * @returns AM_SUCCESS if pointer is not found in tracker. + * + * @see am_memtracker_getinfo, am_memtracker_add + */ am_status_t am_memtracker_update(const void* ptr, int appId, unsigned allocationFlags); @@ -109,23 +122,33 @@ am_status_t am_memtracker_update(const void* ptr, int appId, unsigned allocation * * @returns AM_ERROR_MISC if pointer is not found in tracker. * @returns AM_SUCCESS if pointer is not found in tracker. + * + * @see am_memtracker_getinfo, am_memtracker_add */ am_status_t am_memtracker_remove(void* ptr); /** - * Remove all memory allocations associated with specified accelerator. + * Remove all memory allocations associated with specified accelerator from the memory tracker. * * @returns Number of entries reset. 
+ * @see am_memtracker_getinfo */ size_t am_memtracker_reset(hc::accelerator acc); /** - * Prints info about the memory tracker table. + * Prints the entries in the memory tracker table. * * Intended primarily for debug purposes. + * @see am_memtracker_getinfo **/ void am_memtracker_print(); + +/** + * Returns total sizes of device, host, and user memory allocated by the application + * + * User memory is registered with am_tracker_add. + **/ void am_memtracker_sizeinfo(hc::accelerator acc, size_t *deviceMemSize, size_t *hostMemSize, size_t *userMemSize); diff --git a/projects/hip/src/hc_AM.cpp b/projects/hip/src/hc_AM.cpp index 2d22b49fd4..221322f4b0 100644 --- a/projects/hip/src/hc_AM.cpp +++ b/projects/hip/src/hc_AM.cpp @@ -73,12 +73,6 @@ public: size_t reset (hc::accelerator acc); private: - // TODO - use or remove. - inline void writeLock(); - inline void writeUnlock(); - inline void readLock(); - inline void readUnlock(); - MapTrackerType _tracker; std::mutex _mutex; //std::shared_timed_mutex _mut; @@ -117,20 +111,6 @@ AmPointerTracker::MapTrackerType::iterator AmPointerTracker::find (const void * } -#if 0 -//--- -std::ostream & AmPointerTracker::print (std::ostream &os) -{ - std::lock_guard l (_mutex); - for (auto iter = _tracker.begin() ; iter != _tracker.end(); iter++) { - os << " " << iter->first._basePointer << "..." << iter->first._endPointer << ":: "; - os << iter->second << std::endl; - } - - return os; -} -#endif - //--- // Remove all tracked locations, and free the associated memory (if the range was originally allocated by AM). // Returns count of ranges removed. 
@@ -158,39 +138,6 @@ size_t AmPointerTracker::reset (hc::accelerator acc) } - -//--- -void AmPointerTracker::writeLock () -{ - _mutex.lock(); -} - - -//--- -void AmPointerTracker::writeUnlock () -{ - _mutex.unlock(); -} - - -//--- -// TODO - support multiple concurrent reader -void AmPointerTracker::readLock () -{ - _mutex.lock(); -} - - -//--- -// TODO - support multiple concurrent reader -void AmPointerTracker::readUnlock () -{ - _mutex.unlock(); -} - - - - //========================================================================================================= // Global var defs: //========================================================================================================= @@ -289,6 +236,17 @@ am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, const void *ptr) } } +am_status_t am_memtracker_add(void* ptr, size_t sizeBytes, hc::accelerator acc, bool isDeviceMem) +{ + if (isDeviceMem) { + g_amPointerTracker.insert(ptr, hc::AmPointerInfo(ptr/*hostPointer*/, ptr /*devicePointer*/, sizeBytes, acc, true/*isDevice*/, false /*isAMManaged*/)); + } else { + g_amPointerTracker.insert(ptr, hc::AmPointerInfo(NULL/*hostPointer*/, ptr /*devicePointer*/, sizeBytes, acc, false/*isDevice*/, false /*isAMManaged*/)); + } + + return AM_SUCCESS; +} + am_status_t am_memtracker_update(const void* ptr, int appId, unsigned allocationFlags) { @@ -303,18 +261,6 @@ am_status_t am_memtracker_update(const void* ptr, int appId, unsigned allocation } -am_status_t am_memtracker_add(void* ptr, size_t sizeBytes, hc::accelerator acc, bool isDeviceMem) -{ - if (isDeviceMem) { - g_amPointerTracker.insert(ptr, hc::AmPointerInfo(ptr/*hostPointer*/, ptr /*devicePointer*/, sizeBytes, acc, true/*isDevice*/, false /*isAMManaged*/)); - } else { - g_amPointerTracker.insert(ptr, hc::AmPointerInfo(NULL/*hostPointer*/, ptr /*devicePointer*/, sizeBytes, acc, false/*isDevice*/, false /*isAMManaged*/)); - } - - return AM_SUCCESS; -} - - am_status_t am_memtracker_remove(void* ptr) { 
am_status_t status = AM_SUCCESS; From 79a99f48f98efa5cb46191a56b0d4baf69842b8c Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 15 Feb 2016 05:40:30 -0600 Subject: [PATCH 32/94] Remove old include path. [ROCm/hip commit: 57274850f964364d87a17011a72b2fbefda34ec6] --- projects/hip/tests/src/CMakeLists.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/projects/hip/tests/src/CMakeLists.txt b/projects/hip/tests/src/CMakeLists.txt index bf05fc8407..7e4736a99a 100644 --- a/projects/hip/tests/src/CMakeLists.txt +++ b/projects/hip/tests/src/CMakeLists.txt @@ -19,7 +19,6 @@ MESSAGE ("HIP_PATH=" ${HIP_PATH}) if (${HIP_PLATFORM} STREQUAL "hcc") MESSAGE ("HIP_PLATFORM=hcc") - set (HC_PATH ${HIP_PATH}/hc) set (HSA_PATH /opt/hsa) #--- @@ -30,7 +29,7 @@ if (${HIP_PLATFORM} STREQUAL "hcc") #These includes are used for all files. #Include HIP and HC since the tests need both of these: #Note below HSA path is surgically included only where necessary. - include_directories(${HIP_PATH}/include ${HC_PATH}/include) + include_directories(${HIP_PATH}/include) # hip_hcc.o: add_library(hip_hcc OBJECT ${HIP_PATH}/src/hip_hcc.cpp) From d58eab17066c0ec2bfacf01661f7ac13d3f6855f Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 15 Feb 2016 05:41:09 -0600 Subject: [PATCH 33/94] Move warpSize to header, have shuffles use default warpsize. [ROCm/hip commit: db3a63360bd11260ab7870dee5bbddff0957ce36] --- projects/hip/include/hcc_detail/hip_runtime.h | 23 +++++++++++-------- projects/hip/src/hip_hcc.cpp | 6 ----- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/projects/hip/include/hcc_detail/hip_runtime.h b/projects/hip/include/hcc_detail/hip_runtime.h index 8474f066df..7c5a2f2e36 100644 --- a/projects/hip/include/hcc_detail/hip_runtime.h +++ b/projects/hip/include/hcc_detail/hip_runtime.h @@ -108,6 +108,12 @@ THE SOFTWARE. #define __HCC_C__ #endif + +// TODO - hipify-clang - change to use the function call. 
+//#define warpSize hc::__wavesize() +const int warpSize = 64; + + #define clock_t long long int __device__ inline long long int clock64() { return (long long int)hc::__clock_u64(); }; __device__ inline clock_t clock() { return (clock_t)hc::__clock_u64(); }; @@ -344,42 +350,42 @@ __device__ inline unsigned long long int __ballot( int input) } // warp shuffle functions -__device__ inline int __shfl(int input, int lane, int width) +__device__ inline int __shfl(int input, int lane, int width=warpSize) { return hc::__shfl(input,lane,width); } -__device__ inline int __shfl_up(int input, unsigned int lane_delta, int width) +__device__ inline int __shfl_up(int input, unsigned int lane_delta, int width=warpSize) { return hc::__shfl_up(input,lane_delta,width); } -__device__ inline int __shfl_down(int input, unsigned int lane_delta, int width) +__device__ inline int __shfl_down(int input, unsigned int lane_delta, int width=warpSize) { return hc::__shfl_down(input,lane_delta,width); } -__device__ inline int __shfl_xor(int input, int lane_mask, int width) +__device__ inline int __shfl_xor(int input, int lane_mask, int width=warpSize) { return hc::__shfl_xor(input,lane_mask,width); } -__device__ inline float __shfl(float input, int lane, int width) +__device__ inline float __shfl(float input, int lane, int width=warpSize) { return hc::__shfl(input,lane,width); } -__device__ inline float __shfl_up(float input, unsigned int lane_delta, int width) +__device__ inline float __shfl_up(float input, unsigned int lane_delta, int width=warpSize) { return hc::__shfl_up(input,lane_delta,width); } -__device__ inline float __shfl_down(float input, unsigned int lane_delta, int width) +__device__ inline float __shfl_down(float input, unsigned int lane_delta, int width=warpSize) { return hc::__shfl_down(input,lane_delta,width); } -__device__ inline float __shfl_xor(float input, int lane_mask, int width) +__device__ inline float __shfl_xor(float input, int lane_mask, int width=warpSize) { return 
hc::__shfl_xor(input,lane_mask,width); } @@ -438,7 +444,6 @@ __device__ inline float __dsqrt_rz(double x) {return hc::fast_math::sqrt(x); }; #define hipGridDim_z (hc_get_num_groups(0)) -extern int warpSize ; #define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE) diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 4f95320ac3..d4a6857559 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -49,8 +49,6 @@ THE SOFTWARE. //--- // Environment variables: -// TODO-HCC - map this to the HC instruction that uses HSAIL to get the wave size. -int warpSize = 64; // Intended to distinguish whether an environment variable should be visible only in debug mode, or in debug+release. //static const int debug = 0; @@ -169,7 +167,6 @@ public: void init(unsigned device_index, hc::accelerator acc); hipError_t getProperties(hipDeviceProp_t* prop); - // TODO- create a copy constructor. ~ihipDevice_t(); }; @@ -213,8 +210,6 @@ void ihipDevice_t::init(unsigned device_index, hc::accelerator acc) this->reset(); }; -#if 1 -// TODO-remove #ifdef ihipDevice_t::~ihipDevice_t() { if (_null_stream) { @@ -229,7 +224,6 @@ ihipDevice_t::~ihipDevice_t() } hsa_signal_destroy(_copy_signal); } -#endif //---- From 9ab5b92173726a13badce266f5496e4ab21e926a Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 15 Feb 2016 21:16:00 -0600 Subject: [PATCH 34/94] Update before checkin to HCC. Add support for USE_AM_TRACKER=2 (HCC version). Add AM_ALLOC, AM_FREE indirection to ease swapping AM implementations. 
[ROCm/hip commit: 38c735fd1d83f5a5c58fc9416ce654bed5356f7a] --- projects/hip/include/hcc_detail/AM.h | 23 +++++----- projects/hip/src/hc_AM.cpp | 2 - projects/hip/src/hip_hcc.cpp | 69 ++++++++++++++++++++++------ 3 files changed, 66 insertions(+), 28 deletions(-) diff --git a/projects/hip/include/hcc_detail/AM.h b/projects/hip/include/hcc_detail/AM.h index 40d9ea8382..74542789af 100644 --- a/projects/hip/include/hcc_detail/AM.h +++ b/projects/hip/include/hcc_detail/AM.h @@ -13,7 +13,7 @@ typedef int am_status_t; namespace hc { -// This is the data that is maintained for each pointer: +// Info for each pointer in the memtry tracker: struct AmPointerInfo { void * _hostPointer; ///< Host pointer. If host access is not allowed, NULL. void * _devicePointer; ///< Device pointer. @@ -45,7 +45,7 @@ namespace hc { /** - * Allocates a block of @p size bytes of memory on the specified @p acc. + * Allocate a block of @p size bytes of memory on the specified @p acc. * * The contents of the newly allocated block of memory are not initialized. * @@ -53,7 +53,7 @@ namespace hc { * * Flags must be 0. * - * @returns : On success, pointer to the newly allocated memory is returned. + * @return : On success, pointer to the newly allocated memory is returned. * The pointer is typecast to the desired return type. * * If an error occurred trying to allocate the requested memory, 0 is returned. @@ -63,17 +63,18 @@ namespace hc { auto_voidp AM_alloc(size_t size, hc::accelerator acc, unsigned flags); /** - * Frees a block of memory previously allocated with am_alloc. + * Free a block of memory previously allocated with am_alloc. * + * @return AM_SUCCESS * @see am_alloc, am_copy */ am_status_t AM_free(void* ptr); /** - * Copies @p size bytes of memory from @p src to @ dst. The memory areas (src+size and dst+size) must not overlap. + * Copy @p size bytes of memory from @p src to @ dst. The memory areas (src+size and dst+size) must not overlap. 
* - * @returns AM_SUCCESS on error or AM_ERROR_MISC if an error occurs. + * @return AM_SUCCESS on error or AM_ERROR_MISC if an error occurs. * @see am_alloc, am_free */ am_status_t AM_copy(void* dst, const void* src, size_t size); @@ -96,7 +97,7 @@ am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, const void *ptr); /** - * Adds a pointer to the memory tracker. + * Add a pointer to the memory tracker. * * @return AM_SUCCESS * @see am_memtracker_getinfo @@ -105,7 +106,7 @@ am_status_t am_memtracker_add(void* ptr, size_t sizeBytes, hc::accelerator acc, /* - * Updates infor for an existing pointer in the memory tracker. + * Update info for an existing pointer in the memory tracker. * * @returns AM_ERROR_MISC if pointer is not found in tracker. * @returns AM_SUCCESS if pointer is not found in tracker. @@ -116,7 +117,7 @@ am_status_t am_memtracker_update(const void* ptr, int appId, unsigned allocation /** - * Remove the pointer from the tracker structure. + * Remove @ptr from the tracker structure. * * @p ptr may be anywhere in a tracked memory range. * @@ -136,7 +137,7 @@ am_status_t am_memtracker_remove(void* ptr); size_t am_memtracker_reset(hc::accelerator acc); /** - * Prints the entries in the memory tracker table. + * Print the entries in the memory tracker table. * * Intended primarily for debug purposes. * @see am_memtracker_getinfo @@ -145,7 +146,7 @@ void am_memtracker_print(); /** - * Returns total sizes of device, host, and user memory allocated by the application + * Return total sizes of device, host, and user memory allocated by the application * * User memory is registered with am_tracker_add. **/ diff --git a/projects/hip/src/hc_AM.cpp b/projects/hip/src/hc_AM.cpp index 221322f4b0..272024cfe7 100644 --- a/projects/hip/src/hc_AM.cpp +++ b/projects/hip/src/hc_AM.cpp @@ -93,7 +93,6 @@ void AmPointerTracker::insert (void *pointer, const hc::AmPointerInfo &p) // Return 1 if removed or 0 if not found. 
int AmPointerTracker::remove (void *pointer) { - // TODO-mutex - write lock. std::lock_guard l (_mutex); mprintf ("remove: %p\n", pointer); return _tracker.erase(AmMemoryRange(pointer,1)); @@ -103,7 +102,6 @@ int AmPointerTracker::remove (void *pointer) //--- AmPointerTracker::MapTrackerType::iterator AmPointerTracker::find (const void *pointer) { - // TODO-mutex- read lock std::lock_guard l (_mutex); auto iter = _tracker.find(AmMemoryRange(pointer,1)); mprintf ("find: %p\n", pointer); diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index d4a6857559..30a0d993e0 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -40,10 +40,18 @@ THE SOFTWARE. #include "hsa_ext_amd.h" -#include "hc_AM.cpp" #define USE_ASYNC_COPY 1 -#define USE_AM_TRACKER 1 /* use new AM memory tracker features */ +#define USE_AM_TRACKER 2 /* >0 = use new AM memory tracker features. 1= use HIP impl, 2=use HCC impl */ + +#if USE_AM_TRACKER==1 +#include "hc_AM.cpp" +#define AM_ALLOC hc::AM_alloc +#define AM_FREE hc::AM_free +#else +#define AM_ALLOC hc::am_alloc +#define AM_FREE hc::am_free +#endif #define INLINE static inline @@ -1504,7 +1512,7 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) if (device) { const unsigned am_flags = 0; - *ptr = hc::AM_alloc(sizeBytes, device->_acc, am_flags); + *ptr = AM_ALLOC(sizeBytes, device->_acc, am_flags); if (sizeBytes && (*ptr == NULL)) { hip_status = hipErrorMemoryAllocation; @@ -1531,7 +1539,7 @@ hipError_t hipMallocHost(void** ptr, size_t sizeBytes) auto device = ihipGetTlsDefaultDevice(); if (device) { - *ptr = hc::AM_alloc(sizeBytes, device->_acc, am_flags); + *ptr = AM_ALLOC(sizeBytes, device->_acc, am_flags); if (sizeBytes && (*ptr == NULL)) { hip_status = hipErrorMemoryAllocation; } else { @@ -1577,7 +1585,7 @@ StagingBuffer::StagingBuffer(ihipDevice_t *device, size_t bufferSize, int numBuf for (int i=0; i<_numBuffers; i++) { // TODO - experiment with alignment here. 
- _pinnedStagingBuffer[i] = hc::AM_alloc(_bufferSize, device->_acc, amHostPinned); + _pinnedStagingBuffer[i] = AM_ALLOC(_bufferSize, device->_acc, amHostPinned); if (_pinnedStagingBuffer[i] == NULL) { throw; } @@ -1590,7 +1598,7 @@ StagingBuffer::~StagingBuffer() { for (int i=0; i<_numBuffers; i++) { if (_pinnedStagingBuffer[i]) { - hc::AM_free(_pinnedStagingBuffer[i]); + AM_FREE(_pinnedStagingBuffer[i]); _pinnedStagingBuffer[i] = NULL; } hsa_signal_destroy(_completion_signal[i]); @@ -1695,8 +1703,7 @@ void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeByte #if USE_AM_TRACKER -// TODO - add mutex to limit in/out: -void ihipAsyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) +void ihipSyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) { hc::AmPointerInfo dstPtrInfo, srcPtrInfo; @@ -1725,14 +1732,16 @@ void ihipAsyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t size std::lock_guard l (device->_copy_lock[0]); device->_staging_buffer[0]->CopyHostToDevice(dst, src, sizeBytes); } else { - hc::AM_copy(dst, src, sizeBytes); + // TODO - remove, slow path. + hc::am_copy(dst, src, sizeBytes); } } else if ((kind == hipMemcpyDeviceToHost) && (dstNotTracked)) { if (useStagingBuffer) { std::lock_guard l (device->_copy_lock[1]); device->_staging_buffer[1]->CopyDeviceToHost(dst, src, sizeBytes); } else { - hc::AM_copy(dst, src, sizeBytes); + // TODO - remove, slow path. + hc::am_copy(dst, src, sizeBytes); } } else if (kind == hipMemcpyHostToHost) { memcpy(dst, src, sizeBytes); // TODO - not async. @@ -1757,6 +1766,36 @@ void ihipAsyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t size #endif +#if 0 // USE_AM_TRACKER +void ihipAsyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) +{ + bool useStagingBuffer = true; // TODO - remove when new copy bakes a bit. 
+ + hipStatus_t e = hipSuccess; + + // TODO - check kind is not default. + if (kind == hipMemcpyDefault) { + e = hipErrorInvalidMemoryDirection; + } else { + // Let HSA runtime handle it: + // TODO - need buffer pool for the signals: + + device->_copy_lock[1].lock(); + + hsa_signal_store_relaxed(device->_copy_signal, 1); + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, src, sizeBytes, device->_hsa_agent, 0, NULL, device->_copy_signal); + + if (hsa_status == HSA_STATUS_SUCCESS) { + hsa_signal_wait_relaxed(device->_copy_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + } + + device->_copy_lock[1].unlock(); + + } +} +#endif + + //--- hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) @@ -1775,7 +1814,7 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind ihipDevice_t *device = &g_devices[stream->_device_index]; - ihipAsyncCopy(device, dst, src, sizeBytes, kind); + ihipSyncCopy(device, dst, src, sizeBytes, kind); } else { e = hipErrorInvalidResourceHandle; @@ -1784,7 +1823,7 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind #else // TODO-hsart - what synchronization does hsa_copy provide? - hc::AM_copy(dst, src, sizeBytes); + hc::am_copy(dst, src, sizeBytes); e = hipSuccess; #endif @@ -1815,7 +1854,7 @@ hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcp // TODO-hsart This routine needs to ensure that dst and src are mapped on the GPU. // This is a synchronous copy - remove and replace with code below when we have appropriate LOCK APIs. 
- hc::AM_copy(dst, src, sizeBytes); + hc::am_copy(dst, src, sizeBytes); #if 0 @@ -1938,7 +1977,7 @@ hipError_t hipFree(void* ptr) ihipWaitAllStreams(ihipGetTlsDefaultDevice()); if (ptr) { - hc::AM_free(ptr); + AM_FREE(ptr); } return ihipLogStatus(hipSuccess); @@ -1952,7 +1991,7 @@ hipError_t hipFreeHost(void* ptr) if (ptr) { tprintf (TRACE_MEM, " %s: %p\n", __func__, ptr); - hc::AM_free(ptr); + AM_FREE(ptr); } return ihipLogStatus(hipSuccess); From 787078f0b3359b0fd8822e792afd4eb78b4f3a19 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 16 Feb 2016 01:58:24 -0600 Subject: [PATCH 35/94] Add comments to tests [ROCm/hip commit: fb883d9d03d9c3035056430f7f933a3c4aaa6a91] --- projects/hip/tests/src/hipMemcpy.cpp | 25 ++++++++--- projects/hip/tests/src/hipPointerAttrib.cpp | 48 ++++++++++++++++----- 2 files changed, 56 insertions(+), 17 deletions(-) diff --git a/projects/hip/tests/src/hipMemcpy.cpp b/projects/hip/tests/src/hipMemcpy.cpp index 3502b81e9d..97a372304a 100644 --- a/projects/hip/tests/src/hipMemcpy.cpp +++ b/projects/hip/tests/src/hipMemcpy.cpp @@ -28,7 +28,9 @@ void printSep() printf ("======================================================================================\n"); } +//--- // Test simple H2D copies and back. +// Designed to stress a small number of simple smoke tests void simpleTest1() { printf ("test: %s\n", __func__); @@ -61,8 +63,16 @@ void simpleTest1() } -// Test many different kinds of memory copies: - +//--- +// Test many different kinds of memory copies. +// THe subroutine allocates memory , copies to device, runs a vector add kernel, copies back, and checks the result. +// +// IN: numElements controls the number of elements used for allocations. +// IN: usePinnedHost : If true, allocate host with hipMallocHost and is pinned ; else allocate host memory with malloc. +// IN: useHostToHost : If true, add an extra host-to-host copy. +// IN: useDeviceToDevice : If true, add an extra deviceto-device copy after result is produced. 
+// IN: useMemkindDefault : If true, use memkinddefault (runtime figures out direction). if false, use explicit memcpy direction. +// template void memcpytest2(size_t numElements, bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault) { @@ -134,18 +144,15 @@ void memcpytest2(size_t numElements, bool usePinnedHost, bool useHostToHost, boo } +//--- +//Try all the 16 possible combinations to memcpytest2 - usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault template void memcpytest2_loop(size_t numElements) { printSep(); for (int usePinnedHost =0; usePinnedHost<=1; usePinnedHost++) { -#define USE_HOST_2_HOST -#ifdef USE_HOST_2_HOST for (int useHostToHost =0; useHostToHost<=1; useHostToHost++) { // TODO -#else - for (int useHostToHost =0; useHostToHost<=0; useHostToHost++) { // TODO -#endif for (int useDeviceToDevice =0; useDeviceToDevice<=1; useDeviceToDevice++) { for (int useMemkindDefault =0; useMemkindDefault<=1; useMemkindDefault++) { memcpytest2(numElements, usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); @@ -156,6 +163,8 @@ void memcpytest2_loop(size_t numElements) } +//--- +//Try many different sizes to memory copy. template void memcpytest2_sizes(size_t maxElem=0, size_t offset=0) { @@ -184,6 +193,8 @@ void memcpytest2_sizes(size_t maxElem=0, size_t offset=0) } +//--- +//Create multiple threads to stress multi-thread locking behavior in the allocation/deallocation/tracking logic: template void multiThread_1(bool serialize, bool usePinnedHost) { diff --git a/projects/hip/tests/src/hipPointerAttrib.cpp b/projects/hip/tests/src/hipPointerAttrib.cpp index 1418997274..586b2af5b5 100644 --- a/projects/hip/tests/src/hipPointerAttrib.cpp +++ b/projects/hip/tests/src/hipPointerAttrib.cpp @@ -27,7 +27,9 @@ THE SOFTWARE. 
#include "test_common.h" #ifdef __HIP_PLATFORM_HCC__ -#include "hcc_detail/AM.h" +//#include "hcc_detail/AM.h" +#include "hc_am.hpp" + #endif size_t Nbytes = 0; @@ -97,8 +99,8 @@ inline int zrand(int max) //================================================================================================= // Functins to run tests //================================================================================================= -// -//Run through a couple simple cases to test lookups and hostd pointer arithmetic: +//-- +//Run through a couple simple cases to test lookups and host pointer arithmetic: void testSimple() { printf ("\n"); @@ -188,7 +190,10 @@ void testSimple() HIPASSERT(e == hipErrorInvalidValue); // OS-allocated pointers should return hipErrorInvalidValue. } - +//--- +//Reset the memory tracker (remove allocations from all known devices): +//This frees any memory allocated through the runtime. +//The routine will not release any void resetTracker () { if (p_verbose & 0x1) { @@ -214,7 +219,8 @@ struct SuperPointerAttribute { }; - +//--- +//Support function to check result against a reference: void checkPointer(SuperPointerAttribute &ref, int major, int minor, void *pointer) { hipPointerAttribute_t attribs; @@ -236,6 +242,12 @@ void checkPointer(SuperPointerAttribute &ref, int major, int minor, void *pointe } +//--- +//Test that allocates memory across all 4 devices withing the specified size range (minSize...maxSize). +//Then does lookups to make sure the info reported by the tracker matches expecations +//Then deallocates it all. +// +//Multiple threads can call this funtion and in fact we do this in the testMultiThreaded_1 test. 
void clusterAllocs(int numAllocs, size_t minSize, size_t maxSize) { printf (" clusterAllocs numAllocs=%d size=%lu..%lu\n", numAllocs, minSize, maxSize); @@ -313,9 +325,6 @@ void clusterAllocs(int numAllocs, size_t minSize, size_t maxSize) } - - - #ifdef __HIP_PLATFORM_HCC__ if (p_verbose & 0x2) { printf ("Tracker after cleanup:\n"); @@ -325,6 +334,10 @@ void clusterAllocs(int numAllocs, size_t minSize, size_t maxSize) } +//--- +// Multi-threaded test with many simul allocs. +// IN : serialize will force the test to run in serial fashion. +// Seems like this does not hit MT corner cases in the tracker very often - testMultiThreaded_2 below seems more effective. void testMultiThreaded_1(bool serialize=false) { printf ("\n===========================================================================\n"); @@ -356,8 +369,8 @@ void testMultiThreaded_1(bool serialize=false) ///================================================================================================ - -// Add pointers to tracker very quickly. +//--- +//Repeatedly query a single entry: void thread_query(void *ptr, const hipPointerAttribute_t *refAttrib) { int count = 0; @@ -376,6 +389,9 @@ void thread_query(void *ptr, const hipPointerAttribute_t *refAttrib) } +#ifdef __HIP_PLATFORM_HCC__ +//--- +// Add pointers to tracker very quickly, then remove them quickly: enum Dir {Up, Down}; void thread_noise_generator(int iters, size_t numBuffers, Dir addDir, Dir removeDir) { @@ -412,6 +428,13 @@ void thread_noise_generator(int iters, size_t numBuffers, Dir addDir, Dir remove } +//--- +//Multi-thread test that is effective at catching locking errors in the alloc/dealloc/tracker. +//The query thread repeately requests information on the same block of memory. +//Meanwhile, the thread_noise_generator registers a large number of blocks, and +//then unregisters them. 
This causes a large amount of rebalancing in the tree +//structure and will generate errors unless the locks in the tracker are preventing reading +//while writing. void testMultiThreaded_2() { std::atomic inflight(2); @@ -445,6 +468,8 @@ void testMultiThreaded_2() hipSetDevice(0); hipDeviceReset(); } +#endif + int main(int argc, char *argv[]) @@ -483,11 +508,14 @@ int main(int argc, char *argv[]) testMultiThreaded_1(false); } + +#ifdef __HIP_PLATFORM_HCC__ if (p_tests & 0x10) { srand(0x400); testMultiThreaded_2(); resetTracker(); } +#endif printf ("\n"); passed(); From 512163b88988ec9a7af60a0581d96cf6f7bca250 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 16 Feb 2016 01:59:13 -0600 Subject: [PATCH 36/94] Add per-stream pool for hsa_signals. [ROCm/hip commit: caef9b5ced84a71d8c76cdc134ed2cb5869ad91e] --- .../hip/include/hcc_detail/hip_runtime_api.h | 2 +- projects/hip/include/hip_runtime_api.h | 3 + projects/hip/src/hip_hcc.cpp | 209 +++++++++++++----- 3 files changed, 154 insertions(+), 60 deletions(-) diff --git a/projects/hip/include/hcc_detail/hip_runtime_api.h b/projects/hip/include/hcc_detail/hip_runtime_api.h index a0c676987b..5fe398b84c 100644 --- a/projects/hip/include/hcc_detail/hip_runtime_api.h +++ b/projects/hip/include/hcc_detail/hip_runtime_api.h @@ -115,7 +115,7 @@ enum hipMemcpyKind { // The handle allows the async commands to use the stream even if the parent hipStream_t goes out-of-scope. 
-typedef struct ihipStream_t * hipStream_t; +typedef class ihipStream_t * hipStream_t; /* diff --git a/projects/hip/include/hip_runtime_api.h b/projects/hip/include/hip_runtime_api.h index 41ad338d6d..de6d175039 100644 --- a/projects/hip/include/hip_runtime_api.h +++ b/projects/hip/include/hip_runtime_api.h @@ -134,6 +134,7 @@ typedef struct hipPointerAttribute_t { * @enum * @ingroup Enumerations */ +// Developer note - when updating these, update the hipErrorName and hipErrorString functions typedef enum hipError_t { hipSuccess = 0 ///< Successful completion. ,hipErrorMemoryAllocation ///< Memory allocation error. @@ -143,6 +144,8 @@ typedef enum hipError_t { ,hipErrorInvalidValue ///< One or more of the parameters passed to the API call is NULL or not in an acceptable range. ,hipErrorInvalidResourceHandle ///< Resource handle (hipEvent_t or hipStream_t) invalid. ,hipErrorInvalidDevice ///< DeviceID must be in range 0...#compute-devices. + ,hipErrorInvalidMemcpyDirection ///< Invalid memory copy direction + ,hipErrorNoDevice ///< Call to hipGetDeviceCount returned 0 devices ,hipErrorNotReady ///< Indicates that asynchronous operations enqueued earlier are not ready. This is not actually an error, but is used to distinguish from hipSuccess (which indicates completion). APIs that return this error include hipEventQuery and hipStreamQuery. ,hipErrorUnknown ///< Unknown error. 
diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 30a0d993e0..ff1f39d780 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -67,6 +67,7 @@ int HIP_TRACE_API= 0; int HIP_LAUNCH_BLOCKING = 0; int HIP_STAGING_SIZE = 64; /* size of staging buffers, in KB */ int HIP_STAGING_BUFFERS = 2; +int HIP_STREAM_SIGNALS = 2; /* number of signals to use when stream is created */ #define TRACE_API 0x1 /* trace API calls and return values */ #define TRACE_SYNC 0x2 /* trace synchronization pieces */ @@ -90,18 +91,50 @@ enum ihipCommand_t { ihipCommandData, }; + +// Small wrapper around signals. +// Designed to be used from stream. +struct ihipSignal_t { + hsa_signal_t _hsa_signal; + int _refCnt; + + ihipSignal_t() : _refCnt(0) { + if (hsa_signal_create(1, 0, NULL, &_hsa_signal) != HSA_STATUS_SUCCESS) { + throw; + } + } + + ~ihipSignal_t() { + if (hsa_signal_destroy(_hsa_signal) != HSA_STATUS_SUCCESS) { + throw; + } + // _refCnt should be 0, unless we are shutting down... + _refCnt = 0; + }; +}; + + // Internal stream structure. 
-struct ihipStream_t { +class ihipStream_t { +public: unsigned _device_index; hc::accelerator_view _av; unsigned _flags; ihipCommand_t _last_command; //ihipStream_t() : _av(){ }; - ihipStream_t(unsigned device_index, hc::accelerator_view av, unsigned int flags) : - _device_index(device_index), _av(av), _flags(flags), _last_command(ihipCommandKernel) - {}; -} ; + ihipStream_t(unsigned device_index, hc::accelerator_view av, unsigned int flags); + ~ihipStream_t(); + + inline ihipDevice_t * getDevice() const; + + hsa_signal_t getSignal() ; + void releaseSignal(ihipSignal_t *signal) ; + +private: + int _signalCursor; + std::vector _signalPool; +}; @@ -179,6 +212,91 @@ public: }; +//================================================================================================= +// Global Data Structures: +//================================================================================================= +//TLS - must be initialized here. +thread_local hipError_t tls_lastHipError = hipSuccess; +thread_local int tls_defaultDevice = 0; + +// Global initialization. +std::once_flag hip_initialized; +ihipDevice_t *g_devices; // Array of all non-emulated (ie GPU) accelerators in the system. 
+unsigned g_deviceCnt; +//================================================================================================= + + +//================================================================================================= +// Implementation: +//================================================================================================= + + +//================================================================================================= +// ihipStream_t: +//================================================================================================= +//--- +ihipStream_t::ihipStream_t(unsigned device_index, hc::accelerator_view av, unsigned int flags) : + _device_index(device_index), _av(av), _flags(flags), _last_command(ihipCommandKernel), + _signalCursor(0) +{ + _signalPool.resize(HIP_STREAM_SIGNALS > 0 ? HIP_STREAM_SIGNALS : 1); + +}; + +//--- +ihipStream_t::~ihipStream_t() +{ + _signalPool.clear(); +} + + +//--- +inline ihipDevice_t * ihipStream_t::getDevice() const +{ + return &g_devices[_device_index]; +}; + + +// Allocate a new signal from the signal pool. +// Returned signals are initialized to a value of "1". +hsa_signal_t ihipStream_t::getSignal() +{ + int numToScan = _signalPool.size(); + do { + auto thisCursor = _signalCursor; + if (++_signalCursor > _signalPool.size()) { + _signalCursor = 0; + } + + if (_signalPool[thisCursor]._refCnt == 0) { + _signalPool[thisCursor]._refCnt ++; // allocate it + return _signalPool[thisCursor]._hsa_signal; + } + + numToScan--; + } while (numToScan) ; + + assert(numToScan == 0); + + // Have to grow the pool: + printf ("Grow signal pool\n"); + _signalCursor = _signalPool.size(); // set to the beginning of the new entries: + _signalPool.resize(_signalPool.size() * 2); + return getSignal(); // try again, + + // Shouldnever reach here. 
+ assert(0); +} + + +void ihipStream_t::releaseSignal(ihipSignal_t *signal) +{ + if (--signal->_refCnt <= 0) { + // restore signal to the initial value 1 + hsa_signal_store_release(signal->_hsa_signal, 1); + } +} + //================================================================================================= // //Reset the device - this is called from hipDeviceReset. @@ -235,17 +353,6 @@ ihipDevice_t::~ihipDevice_t() //---- -//================================================================================================= -//TLS - must be initialized here. -thread_local hipError_t tls_lastHipError = hipSuccess; -thread_local int tls_defaultDevice = 0; - -// Global initialization. -std::once_flag hip_initialized; -ihipDevice_t *g_devices; // Array of all non-emulated (ie GPU) accelerators in the system. -unsigned g_deviceCnt; - -//================================================================================================= @@ -524,8 +631,9 @@ void ihipInit() READ_ENV_I(release, HIP_PRINT_ENV, 0, "Print HIP environment variables."); READ_ENV_I(release, HIP_TRACE_API, 0, "Trace each HIP API call. Print function name and return code to stderr as program executes."); READ_ENV_I(release, HIP_LAUNCH_BLOCKING, CUDA_LAUNCH_BLOCKING, "Make HIP APIs 'host-synchronous', so they block until any kernel launches or data copy commands complete. Alias: CUDA_LAUNCH_BLOCKING." ); - READ_ENV_I(release, HIP_STAGING_SIZE, 0, "Size of each staging buffer (in KB)." ); - READ_ENV_I(release, HIP_STAGING_BUFFERS, 0, "Number of staging buffers to use in each direction."); + READ_ENV_I(release, HIP_STAGING_SIZE, 0, "Size of each staging buffer (in KB)" ); + READ_ENV_I(release, HIP_STAGING_BUFFERS, 0, "Number of staging buffers to use in each direction"); + READ_ENV_I(release, HIP_STREAM_SIGNALS, 0, "Number of signals to use when creating a new stream (pool can later grow)"); /* * Build a table of valid compute devices. 
@@ -1012,6 +1120,7 @@ const char *hipGetErrorName(hipError_t hip_error) case hipErrorInvalidValue : return "hipErrorInvalidValue"; case hipErrorInvalidResourceHandle : return "hipErrorInvalidResourceHandle"; case hipErrorInvalidDevice : return "hipErrorInvalidDevice"; + case hipErrorInvalidMemcpyDirection : return "hipErrorInvalidMemcpyDirection"; case hipErrorNoDevice : return "hipErrorNoDevice"; case hipErrorNotReady : return "hipErrorNotReady"; case hipErrorUnknown : return "hipErrorUnknown"; @@ -1744,7 +1853,7 @@ void ihipSyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t sizeB hc::am_copy(dst, src, sizeBytes); } } else if (kind == hipMemcpyHostToHost) { - memcpy(dst, src, sizeBytes); // TODO - not async. + memcpy(dst, src, sizeBytes); } else { // Let HSA runtime handle it: @@ -1766,37 +1875,6 @@ void ihipSyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t sizeB #endif -#if 0 // USE_AM_TRACKER -void ihipAsyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) -{ - bool useStagingBuffer = true; // TODO - remove when new copy bakes a bit. - - hipStatus_t e = hipSuccess; - - // TODO - check kind is not default. 
- if (kind == hipMemcpyDefault) { - e = hipErrorInvalidMemoryDirection; - } else { - // Let HSA runtime handle it: - // TODO - need buffer pool for the signals: - - device->_copy_lock[1].lock(); - - hsa_signal_store_relaxed(device->_copy_signal, 1); - hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, src, sizeBytes, device->_hsa_agent, 0, NULL, device->_copy_signal); - - if (hsa_status == HSA_STATUS_SUCCESS) { - hsa_signal_wait_relaxed(device->_copy_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - } - - device->_copy_lock[1].unlock(); - - } -} -#endif - - - //--- hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) { @@ -1822,13 +1900,10 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind #else - // TODO-hsart - what synchronization does hsa_copy provide? hc::am_copy(dst, src, sizeBytes); e = hipSuccess; #endif - // TODO - when am_copy becomes async, and we have HIP_LAUNCH_BLOCKING set, then we would wait for copy operation to complete here. - return ihipLogStatus(e); } @@ -1856,20 +1931,34 @@ hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcp // This is a synchronous copy - remove and replace with code below when we have appropriate LOCK APIs. 
hc::am_copy(dst, src, sizeBytes); -#if 0 - - hipStream_t s =ihipGetStream(stream); +#if USE_ASYNC_COPY + hipStream_t s = ihipSyncAndResolveStream(stream); if (s) { - hc::completion_future cf = ihipMemcpyKernel (s, static_cast (dst), static_cast (src), sizeBytes); + ihipDevice_t *device = s->getDevice(); - //cf.wait(); + if (kind == hipMemcpyDefault) { + e = hipErrorInvalidMemcpyDirection; + } else { + // Let HSA runtime handle it: + // TODO - need buffer pool for the signals rather than lock: + device->_copy_lock[1].lock(); - e = hipSuccess; + hsa_signal_store_relaxed(device->_copy_signal, 1); + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, src, sizeBytes, device->_hsa_agent, 0, NULL, device->_copy_signal); + + if (hsa_status == HSA_STATUS_SUCCESS) { + hsa_signal_wait_relaxed(device->_copy_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + } + + device->_copy_lock[1].unlock(); + + } } else { e = hipErrorInvalidValue; } + #endif // TODO - if am_copy becomes async, and we have HIP_LAUNCH_BLOCKING set, then we would wait for copy operation to complete here. @@ -2113,3 +2202,5 @@ hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **a hipError_t err = hipSuccess; return ihipLogStatus(err); } + +// TODO - review signal / error reporting code. From 8c230eab736541ad6361e1444e3ffee46ec566d3 Mon Sep 17 00:00:00 2001 From: pensun Date: Tue, 16 Feb 2016 07:39:04 -0600 Subject: [PATCH 37/94] Implement to read HIP_VISIBLE_DEVICES to internal global variable [ROCm/hip commit: d40cbef2af85d85a26df2d0846bd97428c9ba208] --- projects/hip/src/hip_hcc.cpp | 91 +++++++++++++++++++++++------------- 1 file changed, 59 insertions(+), 32 deletions(-) diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 4f95320ac3..24d6c20dfd 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -28,6 +28,7 @@ THE SOFTWARE. 
#include #include #include +#include #include #include #include @@ -60,7 +61,10 @@ int HIP_PRINT_ENV = 0; int HIP_TRACE_API= 0; int HIP_LAUNCH_BLOCKING = 0; int HIP_STAGING_SIZE = 64; /* size of staging buffers, in KB */ -int HIP_STAGING_BUFFERS = 2; +int HIP_STAGING_BUFFERS = 2; +int HIP_VISIBLE_DEVICES = 0; /* Contains a comma-separated sequence of GPU identifiers */ +// vector of integers that contains the visible device IDs +std::vector HIP_VISIBLE_DEVICES_IDS; #define TRACE_API 0x1 /* trace API calls and return values */ #define TRACE_SYNC 0x2 /* trace synchronization pieces */ @@ -176,9 +180,9 @@ public: //================================================================================================= // -//Reset the device - this is called from hipDeviceReset. +//Reset the device - this is called from hipDeviceReset. //Device may be reset multiple times, and may be reset after init. -void ihipDevice_t::reset() +void ihipDevice_t::reset() { _staging_buffer[0] = new StagingBuffer(this, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS); _staging_buffer[1] = new StagingBuffer(this, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS); @@ -477,17 +481,39 @@ void ihipReadEnv_I(int *var_ptr, const char *var_name1, const char *var_name2, c env = getenv(var_name2); } - // Default is set when variable is initialized (at top of this file), so only override if we find - // an environment variable. 
- if (env) { - long int v = strtol(env, NULL, 0); - *var_ptr = (int) (v); + // Check if the environment variable is either HIP_VISIBLE_DEVICES or CUDA_LAUNCH_BLOCKING, which + // contains a sequence of comma-separated device IDs + if (!(strcmp(var_name1,"HIP_VISIBLE_DEVICES") && strcmp(var_name2, "CUDA_VISIBLE_DEVICES")) && env){ + // Parse the string stream of env and store the device ids to HIP_VISIBLE_DEVICES_IDS global variable + std::string str = env; + std::istringstream ss(str); + std::string device_id; + + while (std::getline(ss, device_id, ',')) { + HIP_VISIBLE_DEVICES_IDS.push_back(atoi(device_id.c_str())); + } + // Print out the number of ids for debugging + if (HIP_PRINT_ENV) { + std::cout << "HIP visible device id is set to be: "; + for(int i=0;i _max_buffers ? _max_buffers : numBuffers) { - - + + for (int i=0; i<_numBuffers; i++) { // TODO - experiment with alignment here. _pinnedStagingBuffer[i] = hc::AM_alloc(_bufferSize, device->_acc, amHostPinned); @@ -1605,10 +1632,10 @@ StagingBuffer::~StagingBuffer() //--- -void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeBytes) +void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeBytes) { - const char *srcp = static_cast (src); - char *dstp = static_cast (dst); + const char *srcp = static_cast (src); + char *dstp = static_cast (dst); for (int i=0; i<_numBuffers; i++) { hsa_signal_store_relaxed(_completion_signal[i], 0); @@ -1621,7 +1648,7 @@ void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeByte size_t theseBytes = (bytesRemaining > _bufferSize) ? _bufferSize : bytesRemaining; tprintf (TRACE_COPY2, "waiting... 
on completion signal\n"); - hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); tprintf (TRACE_COPY2, "copy %zu bytes %p to stagingBuf[%d]:%p\n", theseBytes, srcp, bufferIndex, _pinnedStagingBuffer[bufferIndex]); // TODO - use uncached memcpy, someday. @@ -1632,7 +1659,7 @@ void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeByte hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _pinnedStagingBuffer[bufferIndex], theseBytes, _device->_hsa_agent, 0, NULL, _completion_signal[bufferIndex]); - assert(hsa_status == HSA_STATUS_SUCCESS); // TODO - throw + assert(hsa_status == HSA_STATUS_SUCCESS); // TODO - throw srcp += theseBytes; dstp += theseBytes; @@ -1643,15 +1670,15 @@ void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeByte for (int i=0; i<_numBuffers; i++) { - hsa_signal_wait_acquire(_completion_signal[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + hsa_signal_wait_acquire(_completion_signal[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); } } //--- -void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeBytes) +void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeBytes) { - const char *srcp0 = static_cast (src); - char *dstp1 = static_cast (dst); + const char *srcp0 = static_cast (src); + char *dstp1 = static_cast (dst); for (int i=0; i<_numBuffers; i++) { hsa_signal_store_relaxed(_completion_signal[i], 0); @@ -1671,7 +1698,7 @@ void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeByte tprintf (TRACE_COPY2, "D2H: async_copy %zu bytes src:%p to staging:%p\n", theseBytes, srcp0, _pinnedStagingBuffer[bufferIndex]); 
hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], srcp0, theseBytes, _device->_hsa_agent, 0, NULL, _completion_signal[bufferIndex]); - assert(hsa_status == HSA_STATUS_SUCCESS); // TODO - throw + assert(hsa_status == HSA_STATUS_SUCCESS); // TODO - throw srcp0 += theseBytes; } @@ -1682,7 +1709,7 @@ void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeByte size_t theseBytes = (bytesRemaining1 > _bufferSize) ? _bufferSize : bytesRemaining1; tprintf (TRACE_COPY2, "D2H: wait_completion[%d] bytesRemaining=%zu\n", bufferIndex, bytesRemaining1); - hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); tprintf (TRACE_COPY2, "D2H: copy %zu bytes stagingBuf[%d]:%p to dst:%p\n", theseBytes, bufferIndex, _pinnedStagingBuffer[bufferIndex], dstp1); memcpy(dstp1, _pinnedStagingBuffer[bufferIndex], theseBytes); @@ -1693,7 +1720,7 @@ void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeByte //for (int i=0; i<_numBuffers; i++) { - // hsa_signal_wait_acquire(_completion_signal[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + // hsa_signal_wait_acquire(_completion_signal[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); //} } @@ -1709,7 +1736,7 @@ void ihipAsyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t size bool dstNotTracked = (hc::am_memtracker_getinfo(&dstPtrInfo, dst) != AM_SUCCESS); bool srcNotTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) != AM_SUCCESS); - bool useStagingBuffer = true; // TODO - remove when new copy bakes a bit. + bool useStagingBuffer = true; // TODO - remove when new copy bakes a bit. 
// Resolve default to a specific Kind, since we use different algorithms: if (kind == hipMemcpyDefault) { @@ -1753,7 +1780,7 @@ void ihipAsyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t size hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, src, sizeBytes, device->_hsa_agent, 0, NULL, device->_copy_signal); if (hsa_status == HSA_STATUS_SUCCESS) { - hsa_signal_wait_relaxed(device->_copy_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + hsa_signal_wait_relaxed(device->_copy_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); } device->_copy_lock[1].unlock(); @@ -1786,7 +1813,7 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind } else { e = hipErrorInvalidResourceHandle; } - + #else // TODO-hsart - what synchronization does hsa_copy provide? From 66df1ff930cb85e7a215874c2263168594b88d86 Mon Sep 17 00:00:00 2001 From: pensun Date: Tue, 16 Feb 2016 10:00:05 -0600 Subject: [PATCH 38/94] modify to add remove invalid devices numbers [ROCm/hip commit: 060439b6ab068c29e7f5653c6683ef761f1e4382] --- projects/hip/src/hip_hcc.cpp | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 24d6c20dfd..c955629ed6 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -32,6 +32,7 @@ THE SOFTWARE. 
#include #include #include +#include #include #include @@ -64,7 +65,7 @@ int HIP_STAGING_SIZE = 64; /* size of staging buffers, in KB */ int HIP_STAGING_BUFFERS = 2; int HIP_VISIBLE_DEVICES = 0; /* Contains a comma-separated sequence of GPU identifiers */ // vector of integers that contains the visible device IDs -std::vector HIP_VISIBLE_DEVICES_IDS; +std::vector g_hip_visible_devices; #define TRACE_API 0x1 /* trace API calls and return values */ #define TRACE_SYNC 0x2 /* trace synchronization pieces */ @@ -484,20 +485,23 @@ void ihipReadEnv_I(int *var_ptr, const char *var_name1, const char *var_name2, c // Check if the environment variable is either HIP_VISIBLE_DEVICES or CUDA_LAUNCH_BLOCKING, which // contains a sequence of comma-separated device IDs if (!(strcmp(var_name1,"HIP_VISIBLE_DEVICES") && strcmp(var_name2, "CUDA_VISIBLE_DEVICES")) && env){ - // Parse the string stream of env and store the device ids to HIP_VISIBLE_DEVICES_IDS global variable + // Parse the string stream of env and store the device ids to g_hip_visible_devices global variable std::string str = env; std::istringstream ss(str); std::string device_id; - while (std::getline(ss, device_id, ',')) { - HIP_VISIBLE_DEVICES_IDS.push_back(atoi(device_id.c_str())); + if (atoi(device_id.c_str()) >= 0) { + g_hip_visible_devices.push_back(atoi(device_id.c_str())); + }else// Any device number after invalid number will not present + break; } - // Print out the number of ids for debugging + + // Print out the number of ids if (HIP_PRINT_ENV) { - std::cout << "HIP visible device id is set to be: "; - for(int i=0;i= g_deviceCnt){ + // Make sure any DeviceID after invalid DeviceID will be erased. 
+ g_hip_visible_devices.resize(i); + break; + } + } tprintf(TRACE_API, "pid=%u %-30s\n", getpid(), ""); From bb8a1fe72dd97dc9aa58ce733ad998da38b7056d Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Wed, 17 Feb 2016 00:59:12 -0600 Subject: [PATCH 39/94] more work on async copies [ROCm/hip commit: 59379ffb44842173226df885511750280801010d] --- projects/hip/bin/hipcc | 2 +- projects/hip/src/hip_hcc.cpp | 70 ++++++---- projects/hip/tests/src/CMakeLists.txt | 2 + projects/hip/tests/src/hipMemcpyAsync.cpp | 149 ++++++++++++++++++++++ 4 files changed, 198 insertions(+), 25 deletions(-) create mode 100644 projects/hip/tests/src/hipMemcpyAsync.cpp diff --git a/projects/hip/bin/hipcc b/projects/hip/bin/hipcc index 7537750ff6..1ab4cf2759 100755 --- a/projects/hip/bin/hipcc +++ b/projects/hip/bin/hipcc @@ -164,7 +164,7 @@ if ($needHipHcc) { if ((not -e $object) or ((stat($source))[9] > (stat($object))[9])) { my $CMD = "$HCC $HCCFLAGS -I$HSA_PATH/include -I$HIP_PATH/include -Wall -c $source -o $object"; if ($verbose & 0x10) { - $CMD .= " -g" ; + $CMD .= " -g -O2" ; } else { $CMD .= " -O3" ; } diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index ff1f39d780..ae28947ef3 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -62,12 +62,13 @@ THE SOFTWARE. 
//static const int debug = 0; static const int release = 1; +int HIP_LAUNCH_BLOCKING = 0; + int HIP_PRINT_ENV = 0; int HIP_TRACE_API= 0; -int HIP_LAUNCH_BLOCKING = 0; int HIP_STAGING_SIZE = 64; /* size of staging buffers, in KB */ int HIP_STAGING_BUFFERS = 2; -int HIP_STREAM_SIGNALS = 2; /* number of signals to use when stream is created */ +int HIP_STREAM_SIGNALS = 2; /* number of signals to allocate at stream creation */ #define TRACE_API 0x1 /* trace API calls and return values */ #define TRACE_SYNC 0x2 /* trace synchronization pieces */ @@ -128,7 +129,7 @@ public: inline ihipDevice_t * getDevice() const; - hsa_signal_t getSignal() ; + ihipSignal_t * getSignal() ; void releaseSignal(ihipSignal_t *signal) ; private: @@ -241,6 +242,13 @@ ihipStream_t::ihipStream_t(unsigned device_index, hc::accelerator_view av, unsig { _signalPool.resize(HIP_STREAM_SIGNALS > 0 ? HIP_STREAM_SIGNALS : 1); + auto s = this; + + std::for_each(_signalPool.begin(), _signalPool.end(), + [s](ihipSignal_t &iter) { + printf (" stream:%p allocated hsa_signal=%p\n", s, (iter._hsa_signal)); + }); + }; //--- @@ -259,18 +267,18 @@ inline ihipDevice_t * ihipStream_t::getDevice() const // Allocate a new signal from the signal pool. // Returned signals are initialized to a value of "1". 
-hsa_signal_t ihipStream_t::getSignal() +ihipSignal_t *ihipStream_t::getSignal() { int numToScan = _signalPool.size(); do { auto thisCursor = _signalCursor; - if (++_signalCursor > _signalPool.size()) { + if (++_signalCursor == _signalPool.size()) { _signalCursor = 0; } if (_signalPool[thisCursor]._refCnt == 0) { _signalPool[thisCursor]._refCnt ++; // allocate it - return _signalPool[thisCursor]._hsa_signal; + return &_signalPool[thisCursor]; } numToScan--; @@ -336,6 +344,7 @@ void ihipDevice_t::init(unsigned device_index, hc::accelerator acc) this->reset(); }; + ihipDevice_t::~ihipDevice_t() { if (_null_stream) { @@ -628,12 +637,14 @@ void ihipInit() /* * Environment variables */ - READ_ENV_I(release, HIP_PRINT_ENV, 0, "Print HIP environment variables."); - READ_ENV_I(release, HIP_TRACE_API, 0, "Trace each HIP API call. Print function name and return code to stderr as program executes."); + READ_ENV_I(release, HIP_PRINT_ENV, 0, "Print HIP environment variables."); + //-- READ HIP_PRINT_ENV env first, since it has impact on later env var reading + READ_ENV_I(release, HIP_LAUNCH_BLOCKING, CUDA_LAUNCH_BLOCKING, "Make HIP APIs 'host-synchronous', so they block until any kernel launches or data copy commands complete. Alias: CUDA_LAUNCH_BLOCKING." ); + READ_ENV_I(release, HIP_TRACE_API, 0, "Trace each HIP API call. Print function name and return code to stderr as program executes."); READ_ENV_I(release, HIP_STAGING_SIZE, 0, "Size of each staging buffer (in KB)" ); READ_ENV_I(release, HIP_STAGING_BUFFERS, 0, "Number of staging buffers to use in each direction"); - READ_ENV_I(release, HIP_STREAM_SIGNALS, 0, "Number of signals to use when creating a new stream (pool can later grow)"); + READ_ENV_I(release, HIP_STREAM_SIGNALS, 0, "Number of signals to allocate when new stream is created (signal pool will grow on demand)"); /* * Build a table of valid compute devices. 
@@ -791,7 +802,10 @@ inline bool ihipCheckCommandSwitchSync(hipStream_t stream, ihipCommand_t new_com addedSync = true; *marker = stream->_av.create_marker(); - tprintf (TRACE_SYNC, "stream %p switch to %s (barrier pkt inserted)\n", (void*)stream, new_command == ihipCommandKernel ? "Kernel" : "Data"); + tprintf (TRACE_SYNC, "stream %p switch %s to %s (barrier pkt inserted)\n", + (void*)stream, + stream->_last_command == ihipCommandKernel ? "Kernel" : "Data", + new_command == ihipCommandKernel ? "Kernel" : "Data"); stream->_last_command = new_command; } @@ -1908,10 +1922,12 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind } -//--- -/* +#if USE_ASYNC_COPY==0 +/** * @warning on HCC hipMemcpyAsync uses a synchronous copy. */ +#endif +//--- hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream) { std::call_once(hip_initialized, ihipInit); @@ -1927,9 +1943,6 @@ hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcp // Async - need to set up dependency on the last command queued to the device? - // TODO-hsart This routine needs to ensure that dst and src are mapped on the GPU. - // This is a synchronous copy - remove and replace with code below when we have appropriate LOCK APIs. 
- hc::am_copy(dst, src, sizeBytes); #if USE_ASYNC_COPY @@ -1943,25 +1956,33 @@ hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcp } else { // Let HSA runtime handle it: // TODO - need buffer pool for the signals rather than lock: - device->_copy_lock[1].lock(); + ihipSignal_t *ihip_signal = stream->getSignal(); - hsa_signal_store_relaxed(device->_copy_signal, 1); - hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, src, sizeBytes, device->_hsa_agent, 0, NULL, device->_copy_signal); + //stream->saveLastSignal(ihipSignal); + + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, src, sizeBytes, device->_hsa_agent, 0, NULL, ihip_signal->_hsa_signal); if (hsa_status == HSA_STATUS_SUCCESS) { - hsa_signal_wait_relaxed(device->_copy_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + + if (HIP_LAUNCH_BLOCKING) { + hsa_signal_wait_relaxed(ihip_signal->_hsa_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + stream->releaseSignal(ihip_signal); + } + } else { + // This path can be hit if src or dst point to unpinned host memory. + // TODO - does async-copy fall back to sync if input pointers are not pinned? + e = hipErrorInvalidValue; } - - device->_copy_lock[1].unlock(); - } } else { e = hipErrorInvalidValue; } - +#else + // TODO-hsart This routine needs to ensure that dst and src are mapped on the GPU. + // This is a synchronous copy - remove and replace with code below when we have appropriate LOCK APIs. + hc::am_copy(dst, src, sizeBytes); #endif - // TODO - if am_copy becomes async, and we have HIP_LAUNCH_BLOCKING set, then we would wait for copy operation to complete here. return ihipLogStatus(e); } @@ -2015,6 +2036,7 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s hipError_t hipMemset(void* dst, int value, size_t sizeBytes ) { + // TODO - call an ihip memset so HIP_TRACE is correct. 
return hipMemsetAsync(dst, value, sizeBytes, hipStreamNull); } diff --git a/projects/hip/tests/src/CMakeLists.txt b/projects/hip/tests/src/CMakeLists.txt index 7e4736a99a..ec0b15ad62 100644 --- a/projects/hip/tests/src/CMakeLists.txt +++ b/projects/hip/tests/src/CMakeLists.txt @@ -104,6 +104,7 @@ make_hip_executable (hip_brev hip_brev.cpp) make_hip_executable (hip_ffs hip_ffs.cpp) make_hip_executable (hipGetDeviceAttribute hipGetDeviceAttribute.cpp) make_hip_executable (hipMemcpy hipMemcpy.cpp) +make_hip_executable (hipMemcpyAsync hipMemcpyAsync.cpp) make_hip_executable (hipMemset hipMemset.cpp) make_hip_executable (hipEventRecord hipEventRecord.cpp) make_hip_executable (hipLanguageExtensions hipLanguageExtensions.cpp) @@ -131,6 +132,7 @@ make_test(hipGridLaunch " " ) make_test(hipPointerAttrib " " ) make_test(hipMemcpy " " ) +make_test(hipMemcpyAsync " " ) make_test(hipHcc " " ) diff --git a/projects/hip/tests/src/hipMemcpyAsync.cpp b/projects/hip/tests/src/hipMemcpyAsync.cpp new file mode 100644 index 0000000000..e2968af2f2 --- /dev/null +++ b/projects/hip/tests/src/hipMemcpyAsync.cpp @@ -0,0 +1,149 @@ +// Test under-development. Calls async mem-copy API, experiment with functionality. + +#include "hip_runtime.h" +#include "test_common.h" + +unsigned p_streams = 2; + + +void simpleNegTest() +{ + printf ("testing: %s\n",__func__); + hipError_t e; + float *A_malloc, *A_pinned, *A_d; + + size_t Nbytes = N*sizeof(float); + A_malloc = (float*)malloc(Nbytes); + HIPCHECK(hipMallocHost(&A_pinned, Nbytes)); + HIPCHECK(hipMalloc(&A_d, Nbytes)); + + + // Can't use default with async copy + e = hipMemcpyAsync(A_pinned, A_d, Nbytes, hipMemcpyDefault, NULL); + HIPASSERT (e==hipErrorInvalidMemcpyDirection); + + + // Not sure what happens here, the memory must be pinned. + e = hipMemcpyAsync(A_malloc, A_d, Nbytes, hipMemcpyHostToDevice, NULL); + HIPASSERT (e==hipErrorInvalidValue); + + +} + +//--- +//Classic example showing how to overlap data transfer with compute. 
+//We divide the work into "chunks" and create a stream for each chunk. +//Each chunk then runs a H2D copy, followed by kernel execution, followed by D2H copyback. +//Work in separate streams is independent which enables concurrency. + +// IN: nStreams : number of streams to use for the test +// IN :useNullStream - use NULL stream. Synchronizes everything. +// IN: useSyncMemcpyH2D - use sync memcpy (no overlap) for H2D +// IN: useSyncMemcpyD2H - use sync memcpy (no overlap) for D2H +void chunkedAsyncExample(int nStreams, bool useNullStream, bool useSyncMemcpyH2D, bool useSyncMemcpyD2H) +{ + + size_t Nbytes = N*sizeof(int); + printf ("testing: %s(useNullStream=%d, useSyncMemcpyH2D=%d, useSyncMemcpyD2H=%d) ",__func__, useNullStream, useSyncMemcpyH2D, useSyncMemcpyD2H); + printf ("Nbytes=%zu (%6.1f MB)\n", Nbytes, (double)(Nbytes)/1024.0/1024.0); + + int *A_d, *B_d, *C_d; + int *A_h, *B_h, *C_h; + + HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, true); + + + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); + + + hipStream_t *stream = (hipStream_t*)malloc(sizeof(hipStream_t) * nStreams); + if (useNullStream) { + nStreams = 1; + stream[0] = NULL; + } else { + for (int i = 0; i < nStreams; ++i) { + HIPCHECK (hipStreamCreate(&stream[i])); + } + } + + + size_t workLeft = N; + size_t workPerStream = N / nStreams; + for (int i = 0; i < nStreams; ++i) { + size_t work = (workLeft < workPerStream) ? 
workLeft : workPerStream; + size_t workBytes = work * sizeof(int); + + size_t offset = i*workPerStream; + + if (useSyncMemcpyH2D) { + HIPCHECK ( hipMemcpy(&A_d[offset], &A_h[offset], workBytes, hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(&B_d[offset], &B_h[offset], workBytes, hipMemcpyHostToDevice)); + } else { + HIPCHECK ( hipMemcpyAsync(&A_d[offset], &A_h[offset], workBytes, hipMemcpyHostToDevice, stream[i])); + HIPCHECK ( hipMemcpyAsync(&B_d[offset], &B_h[offset], workBytes, hipMemcpyHostToDevice, stream[i])); + }; + + hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, stream[i], &A_d[offset], &B_d[offset], &C_d[offset], work); + + if (useSyncMemcpyD2H) { + HIPCHECK ( hipMemcpy(&C_h[offset], &C_d[offset], workBytes, hipMemcpyDeviceToHost)); + } else { + HIPCHECK ( hipMemcpyAsync(&C_h[offset], &C_d[offset], workBytes, hipMemcpyDeviceToHost, stream[i])); + } + } + + + HIPCHECK (hipDeviceSynchronize()); + + + HipTest::checkVectorADD(A_h, B_h, C_h, N); + + HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, true); +}; + + +//--- +//Parse arguments specific to this test. 
+void parseMyArguments(int argc, char *argv[]) +{ + int more_argc = HipTest::parseStandardArguments(argc, argv, false); + + // parse args for this test: + for (int i = 1; i < more_argc; i++) { + const char *arg = argv[i]; + + if (!strcmp(arg, "--streams")) { + if (++i >= argc || !HipTest::parseUInt(argv[i], &p_streams)) { + failed("Bad streams argument"); + } + } else { + failed("Bad argument '%s'", arg); + } + }; +}; + + + + +int main(int argc, char *argv[]) +{ + HipTest::parseStandardArguments(argc, argv, true); + parseMyArguments(argc, argv); + + + printf ("info: set device to %d\n", p_gpuDevice); + HIPCHECK(hipSetDevice(p_gpuDevice)); + + simpleNegTest(); + + + chunkedAsyncExample(p_streams, true, true, true); // Easy sync version + chunkedAsyncExample(p_streams, false, true, true); // Easy sync version + chunkedAsyncExample(p_streams, false, false, true); // Some async + chunkedAsyncExample(p_streams, false, false, false); // All async + + + + passed(); + +} From e5ff38e42176033e645fc5d1a7deddc8d6dbea5f Mon Sep 17 00:00:00 2001 From: pensun Date: Wed, 17 Feb 2016 06:59:18 -0600 Subject: [PATCH 40/94] Implementation of HIP_VISIBLE_DEVICES in runtime [ROCm/hip commit: c1e120fb1b37cff29d2f78bbda30966cb1dca864] --- projects/hip/src/hip_hcc.cpp | 54 ++++++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 18 deletions(-) diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index c955629ed6..ae0f00320f 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -64,8 +64,7 @@ int HIP_LAUNCH_BLOCKING = 0; int HIP_STAGING_SIZE = 64; /* size of staging buffers, in KB */ int HIP_STAGING_BUFFERS = 2; int HIP_VISIBLE_DEVICES = 0; /* Contains a comma-separated sequence of GPU identifiers */ -// vector of integers that contains the visible device IDs -std::vector g_hip_visible_devices; +std::vector g_hip_visible_devices; /* vector of integers that contains the visible device IDs */ #define TRACE_API 0x1 /* trace API 
calls and return values */ #define TRACE_SYNC 0x2 /* trace synchronization pieces */ @@ -489,14 +488,16 @@ void ihipReadEnv_I(int *var_ptr, const char *var_name1, const char *var_name2, c std::string str = env; std::istringstream ss(str); std::string device_id; + // Clean up the defult value + g_hip_visible_devices.clear(); + // Read the visible device numbers while (std::getline(ss, device_id, ',')) { if (atoi(device_id.c_str()) >= 0) { g_hip_visible_devices.push_back(atoi(device_id.c_str())); }else// Any device number after invalid number will not present break; } - - // Print out the number of ids + // Print out the number of ids if (HIP_PRINT_ENV) { printf ("%-30s = ", var_name1); for(int i=0;i= g_deviceCnt){ + if(g_hip_visible_devices[i] >= deviceCnt){ // Make sure any DeviceID after invalid DeviceID will be erased. g_hip_visible_devices.resize(i); break; } } + g_devices = new ihipDevice_t[deviceCnt]; + g_deviceCnt = 0; + for (int i=0; i"); } @@ -596,6 +608,12 @@ INLINE bool ihipIsValidDevice(unsigned deviceIndex) return (deviceIndex < g_deviceCnt); } +// check if the device ID is set as visible +INLINE bool ihipIsVisibleDevice(unsigned deviceIndex) +{ + return std::find(g_hip_visible_devices.begin(), g_hip_visible_devices.end(), + (int)deviceIndex) != g_hip_visible_devices.end(); +} //--- INLINE ihipDevice_t *ihipGetTlsDefaultDevice() From b3d70ca2716f8fe7d5b32056074fc91f62b89f9d Mon Sep 17 00:00:00 2001 From: pensun Date: Wed, 17 Feb 2016 09:24:39 -0600 Subject: [PATCH 41/94] 1. Bug fix 2. passed initial tests on different sets of HIP_VISIBLE_DEVICES: (0),(1),(0,1),(1,2),(2,3),(1,2,3),(2,3,4),(1,5,2,3) and achieved expected choice of GPU devices at the runtime. 3. Passed HIP test suite. 
[ROCm/hip commit: 8aa4bfce57c281de63f6a2b692123673ee368c51] --- projects/hip/src/hip_hcc.cpp | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index ae0f00320f..1f2f6f0b3b 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -33,6 +33,7 @@ THE SOFTWARE. #include #include #include +#include #include #include @@ -580,15 +581,19 @@ void ihipInit() g_deviceCnt = 0; for (int i=0; i Date: Wed, 17 Feb 2016 21:22:31 -0600 Subject: [PATCH 43/94] Tweak full formatting [ROCm/hip commit: dffe573d4918c6922ab3dd107afb05454efb60cb] --- projects/hip/bin/hipconfig | 3 +++ 1 file changed, 3 insertions(+) diff --git a/projects/hip/bin/hipconfig b/projects/hip/bin/hipconfig index db53d6014e..1687983330 100755 --- a/projects/hip/bin/hipconfig +++ b/projects/hip/bin/hipconfig @@ -82,6 +82,7 @@ if ($p_full) { system("$HCC_HOME/bin/hcc-config --cxxflags"); print ("HCC-ldflags : "); system("$HCC_HOME/bin/hcc-config --ldflags"); + printf("\n"); } if ($HIP_PLATFORM eq "nvcc") { print "\n" ; @@ -98,6 +99,8 @@ if ($p_full) { print "\n" ; print "== Linux Kernel\n"; system ("uname -a"); + + print "\n" ; $printed = 1; } From 5ab45e9be79ef16a155c8856ffd0f257321fbd74 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Wed, 17 Feb 2016 21:23:36 -0600 Subject: [PATCH 44/94] USE_AM_TRACKER=0 works [ROCm/hip commit: 44f40e171a6f4e08303e7e3c0150763c09f193a2] --- projects/hip/src/hip_hcc.cpp | 42 ++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index ae28947ef3..83083022c2 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -41,8 +41,13 @@ THE SOFTWARE. -#define USE_ASYNC_COPY 1 -#define USE_AM_TRACKER 2 /* >0 = use new AM memory tracker features. 1= use HIP impl, 2=use HCC impl */ +#define USE_AM_TRACKER 0 /* >0 = use new AM memory tracker features. 
2=use HCC impl */ +#define USE_ROCR_V2 0 + +#if ((USE_AM_TRACKER!=0) && (USE_AM_TRACKER!=2)) +#error (USE_AM_TRACKER must be 0 or 2) +#endif + #if USE_AM_TRACKER==1 #include "hc_AM.cpp" @@ -244,10 +249,12 @@ ihipStream_t::ihipStream_t(unsigned device_index, hc::accelerator_view av, unsig auto s = this; +#if 0 std::for_each(_signalPool.begin(), _signalPool.end(), [s](ihipSignal_t &iter) { - printf (" stream:%p allocated hsa_signal=%p\n", s, (iter._hsa_signal)); + printf (" stream:%p allocated hsa_signal=%lu\n", s, (iter._hsa_signal.handle)); }); +#endif }; @@ -1640,7 +1647,7 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) if (sizeBytes && (*ptr == NULL)) { hip_status = hipErrorMemoryAllocation; } else { -#ifdef USE_AM_TRACKER +#if USE_AM_TRACKER hc::am_memtracker_update(*ptr, device->_device_index, 0); #endif } @@ -1666,7 +1673,7 @@ hipError_t hipMallocHost(void** ptr, size_t sizeBytes) if (sizeBytes && (*ptr == NULL)) { hip_status = hipErrorMemoryAllocation; } else { -#ifdef USE_AM_TRACKER +#if USE_AM_TRACKER hc::am_memtracker_update(*ptr, device->_device_index, 0); #endif } @@ -1752,10 +1759,15 @@ void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeByte // TODO - use uncached memcpy, someday. 
memcpy(_pinnedStagingBuffer[bufferIndex], srcp, theseBytes); - tprintf (TRACE_COPY2, "async_copy %zu bytes %p to %p\n", theseBytes, _pinnedStagingBuffer[bufferIndex], dstp); hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); + +#if USE_ROCR_V2 + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _device->_hsa_agent, _pinnedStagingBuffer[bufferIndex], _device->_hsa_agent, theseBytes, 0, NULL, _completion_signal[bufferIndex]); +#else hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _pinnedStagingBuffer[bufferIndex], theseBytes, _device->_hsa_agent, 0, NULL, _completion_signal[bufferIndex]); +#endif + tprintf (TRACE_COPY2, "async_copy %zu bytes %p to %p status=%x\n", theseBytes, _pinnedStagingBuffer[bufferIndex], dstp, hsa_status); assert(hsa_status == HSA_STATUS_SUCCESS); // TODO - throw @@ -1795,7 +1807,11 @@ void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeByte tprintf (TRACE_COPY2, "D2H: async_copy %zu bytes src:%p to staging:%p\n", theseBytes, srcp0, _pinnedStagingBuffer[bufferIndex]); hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); +#if USE_ROCR_V2 + hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], _device->_hsa_agent, srcp0, _device->_hsa_agent, theseBytes, 0, NULL, _completion_signal[bufferIndex]); +#else hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], srcp0, theseBytes, _device->_hsa_agent, 0, NULL, _completion_signal[bufferIndex]); +#endif assert(hsa_status == HSA_STATUS_SUCCESS); // TODO - throw srcp0 += theseBytes; @@ -1876,7 +1892,11 @@ void ihipSyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t sizeB device->_copy_lock[1].lock(); hsa_signal_store_relaxed(device->_copy_signal, 1); +#if USE_ROCR_V2 + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, device->_hsa_agent, src, device->_hsa_agent, sizeBytes, 0, NULL, device->_copy_signal); +#else hsa_status_t hsa_status = 
hsa_amd_memory_async_copy(dst, src, sizeBytes, device->_hsa_agent, 0, NULL, device->_copy_signal); +#endif if (hsa_status == HSA_STATUS_SUCCESS) { hsa_signal_wait_relaxed(device->_copy_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); @@ -1901,7 +1921,7 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind hipError_t e = hipSuccess; -#if USE_ASYNC_COPY +#if USE_AM_TRACKER if (ihipIsValidDevice(stream->_device_index)) { ihipDevice_t *device = &g_devices[stream->_device_index]; @@ -1922,7 +1942,7 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind } -#if USE_ASYNC_COPY==0 +#if USE_AM_TRACKER==0 /** * @warning on HCC hipMemcpyAsync uses a synchronous copy. */ @@ -1944,7 +1964,7 @@ hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcp // Async - need to set up dependency on the last command queued to the device? -#if USE_ASYNC_COPY +#if USE_AM_TRACKER hipStream_t s = ihipSyncAndResolveStream(stream); @@ -1960,7 +1980,11 @@ hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcp //stream->saveLastSignal(ihipSignal); +#if USE_ROCR_V2 + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, device->_hsa_agent, src, device->_hsa_agent, sizeBytes, 0, NULL, ihip_signal->_hsa_signal); +#else hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, src, sizeBytes, device->_hsa_agent, 0, NULL, ihip_signal->_hsa_signal); +#endif if (hsa_status == HSA_STATUS_SUCCESS) { From a2d8f9d98ecc5d872cd0b4f7d92f370a725b3a13 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Wed, 17 Feb 2016 21:33:32 -0600 Subject: [PATCH 45/94] Remove HIP-local AM tracker (now in HCC) [ROCm/hip commit: d653782d9d92518507fcb58d6f7aa3bfb316bac9] --- projects/hip/include/hcc_detail/AM.h | 157 ------------- projects/hip/src/hc_AM.cpp | 319 --------------------------- projects/hip/src/hip_hcc.cpp | 28 +-- 3 files changed, 10 insertions(+), 494 deletions(-) delete mode 100644 
projects/hip/include/hcc_detail/AM.h delete mode 100644 projects/hip/src/hc_AM.cpp diff --git a/projects/hip/include/hcc_detail/AM.h b/projects/hip/include/hcc_detail/AM.h deleted file mode 100644 index 74542789af..0000000000 --- a/projects/hip/include/hcc_detail/AM.h +++ /dev/null @@ -1,157 +0,0 @@ -#pragma once - -#include - -typedef int am_status_t; -#define AM_SUCCESS 0 -// TODO - provide better mapping of HSA error conditions to HC error codes. -#define AM_ERROR_MISC -1 /** Misellaneous error */ - -// Flags for am_alloc API: -#define amHostPinned 0x1 - - -namespace hc { - -// Info for each pointer in the memtry tracker: -struct AmPointerInfo { - void * _hostPointer; ///< Host pointer. If host access is not allowed, NULL. - void * _devicePointer; ///< Device pointer. - size_t _sizeBytes; ///< Size of allocation. - hc::accelerator _acc; ///< Device / Accelerator to use. - bool _isInDeviceMem; ///< Memory is physically resident on a device (if false, memory is located on host) - bool _isAmManaged; ///< Memory was allocated by AM and should be freed when am_reset is called. - - int _appId; ///< App-specific storage. (Used by HIP to store deviceID.) - unsigned _appAllocationFlags; ///< App-specific allocation flags. (Used by HIP to store allocation flags.) - - AmPointerInfo() {}; - - AmPointerInfo(void *hostPointer, void *devicePointer, size_t sizeBytes, hc::accelerator acc, bool isInDeviceMem, bool isAmManaged) : - _hostPointer(hostPointer), - _devicePointer(devicePointer), - _sizeBytes(sizeBytes), - _acc(acc), - _isInDeviceMem(isInDeviceMem), - _isAmManaged(isAmManaged), - _appId(-1), - _appAllocationFlags(0) {}; -}; -} - - - -namespace hc { - - -/** - * Allocate a block of @p size bytes of memory on the specified @p acc. - * - * The contents of the newly allocated block of memory are not initialized. - * - * If @p size == 0, 0 is returned. - * - * Flags must be 0. - * - * @return : On success, pointer to the newly allocated memory is returned. 
- * The pointer is typecast to the desired return type. - * - * If an error occurred trying to allocate the requested memory, 0 is returned. - * - * @see am_free, am_copy - */ -auto_voidp AM_alloc(size_t size, hc::accelerator acc, unsigned flags); - -/** - * Free a block of memory previously allocated with am_alloc. - * - * @return AM_SUCCESS - * @see am_alloc, am_copy - */ -am_status_t AM_free(void* ptr); - - -/** - * Copy @p size bytes of memory from @p src to @ dst. The memory areas (src+size and dst+size) must not overlap. - * - * @return AM_SUCCESS on error or AM_ERROR_MISC if an error occurs. - * @see am_alloc, am_free - */ -am_status_t AM_copy(void* dst, const void* src, size_t size); - - - -/** - * Return information about tracked pointer. - * - * AM tracks pointers when they are allocated or added to tracker with am_track_pointer. - * The tracker tracks the base pointer as well as the size of the allocation, and will - * find the information for a pointer anywhere in the tracked range. - * - * @returns AM_ERROR_MISC if pointer is not currently being tracked. - * @returns AM_SUCCESS if pointer is tracked and writes info to @p info. - * - * @see AM_memtracker_add, - */ -am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, const void *ptr); - - -/** - * Add a pointer to the memory tracker. - * - * @return AM_SUCCESS - * @see am_memtracker_getinfo - */ -am_status_t am_memtracker_add(void* ptr, size_t sizeBytes, hc::accelerator acc, bool isDeviceMem=false); - - -/* - * Update info for an existing pointer in the memory tracker. - * - * @returns AM_ERROR_MISC if pointer is not found in tracker. - * @returns AM_SUCCESS if pointer is not found in tracker. - * - * @see am_memtracker_getinfo, am_memtracker_add - */ -am_status_t am_memtracker_update(const void* ptr, int appId, unsigned allocationFlags); - - -/** - * Remove @ptr from the tracker structure. - * - * @p ptr may be anywhere in a tracked memory range. 
- * - * @returns AM_ERROR_MISC if pointer is not found in tracker. - * @returns AM_SUCCESS if pointer is not found in tracker. - * - * @see am_memtracker_getinfo, am_memtracker_add - */ -am_status_t am_memtracker_remove(void* ptr); - -/** - * Remove all memory allocations associated with specified accelerator from the memory tracker. - * - * @returns Number of entries reset. - * @see am_memtracker_getinfo - */ -size_t am_memtracker_reset(hc::accelerator acc); - -/** - * Print the entries in the memory tracker table. - * - * Intended primarily for debug purposes. - * @see am_memtracker_getinfo - **/ -void am_memtracker_print(); - - -/** - * Return total sizes of device, host, and user memory allocated by the application - * - * User memory is registered with am_tracker_add. - **/ -void am_memtracker_sizeinfo(hc::accelerator acc, size_t *deviceMemSize, size_t *hostMemSize, size_t *userMemSize); - - -}; // namespace hc - diff --git a/projects/hip/src/hc_AM.cpp b/projects/hip/src/hc_AM.cpp deleted file mode 100644 index 272024cfe7..0000000000 --- a/projects/hip/src/hc_AM.cpp +++ /dev/null @@ -1,319 +0,0 @@ - -#include "hc_am.hpp" -#include "hsa.h" - - -#include "hcc_detail/AM.h" // TODO - Remove me. - -#define DB_TRACKER 0 -#define MUTEX_LOCK 1 - -#if DB_TRACKER -#define mprintf( ...) {\ - fprintf (stderr, __VA_ARGS__);\ - }; -#else -#define mprintf( ...) 
-#endif - -//========================================================================================================= -// Pointer Tracker Structures: -//========================================================================================================= -#include -#include -//#include - -struct AmMemoryRange { - const void * _basePointer; - const void * _endPointer; - AmMemoryRange(const void *basePointer, size_t sizeBytes) : - _basePointer(basePointer), _endPointer((const unsigned char*)basePointer + sizeBytes - 1) {}; -}; - -// Functor to compare ranges: -struct AmMemoryRangeCompare { - // Return true is LHS range is less than RHS - used to order the - bool operator()(const AmMemoryRange &lhs, const AmMemoryRange &rhs) const - { - return lhs._endPointer < rhs._basePointer; - } - -}; - - -std::ostream &operator<<(std::ostream &os, const hc::AmPointerInfo &ap) -{ - os << "hostPointer:" << ap._hostPointer << " devicePointer:"<< ap._devicePointer << " sizeBytes:" << ap._sizeBytes - << " isInDeviceMem:" << ap._isInDeviceMem << " isAmManaged:" << ap._isAmManaged - << " appId:" << ap._appId << " appAllocFlags:" << ap._appAllocationFlags; - return os; -} - - -//------------------------------------------------------------------------------------------------- -// This structure tracks information for each pointer. -// Uses memory-range-based lookups - so pointers that exist anywhere in the range of hostPtr + size -// will find the associated AmPointerInfo. -// The insertions and lookups use a self-balancing binary tree and should support O(logN) lookup speed. -// The structure is thread-safe - writers obtain a mutex before modifying the tree. Multiple simulatenous readers are supported. 
-class AmPointerTracker { -typedef std::map MapTrackerType; -public: - - void insert(void *pointer, const hc::AmPointerInfo &p); - int remove(void *pointer); - - MapTrackerType::iterator find(const void *hostPtr) ; - - MapTrackerType::iterator readerLockBegin() { _mutex.lock(); return _tracker.begin(); } ; - MapTrackerType::iterator end() { return _tracker.end(); } ; - void readerUnlock() { _mutex.unlock(); }; - - - size_t reset (hc::accelerator acc); - -private: - MapTrackerType _tracker; - std::mutex _mutex; - //std::shared_timed_mutex _mut; -}; - - -//--- -void AmPointerTracker::insert (void *pointer, const hc::AmPointerInfo &p) -{ - std::lock_guard l (_mutex); - - mprintf ("insert: %p + %zu\n", pointer, p._sizeBytes); - _tracker.insert(std::make_pair(AmMemoryRange(pointer, p._sizeBytes), p)); -} - - -//--- -// Return 1 if removed or 0 if not found. -int AmPointerTracker::remove (void *pointer) -{ - std::lock_guard l (_mutex); - mprintf ("remove: %p\n", pointer); - return _tracker.erase(AmMemoryRange(pointer,1)); -} - - -//--- -AmPointerTracker::MapTrackerType::iterator AmPointerTracker::find (const void *pointer) -{ - std::lock_guard l (_mutex); - auto iter = _tracker.find(AmMemoryRange(pointer,1)); - mprintf ("find: %p\n", pointer); - return iter; -} - - -//--- -// Remove all tracked locations, and free the associated memory (if the range was originally allocated by AM). -// Returns count of ranges removed. 
-size_t AmPointerTracker::reset (hc::accelerator acc) -{ - std::lock_guard l (_mutex); - mprintf ("reset: \n"); - - size_t count = 0; - // relies on C++11 (erase returns iterator) - for (auto iter = _tracker.begin() ; iter != _tracker.end(); ) { - if (iter->second._acc == acc) { - if (iter->second._isAmManaged) { - hsa_memory_free(const_cast (iter->first._basePointer)); - } - count++; - - iter = _tracker.erase(iter); - } else { - iter++; - } - } - - return count; -} - - -//========================================================================================================= -// Global var defs: -//========================================================================================================= -AmPointerTracker g_amPointerTracker; // Track all am pointer allocations. - - -//========================================================================================================= -// API Definitions. -//========================================================================================================= -// -// - -namespace hc { - -// Allocate accelerator memory, return NULL if memory could not be allocated: -auto_voidp AM_alloc(size_t sizeBytes, hc::accelerator acc, unsigned flags) -{ - - void *ptr = NULL; - - if (sizeBytes != 0 ) { - if (acc.is_hsa_accelerator()) { - hsa_agent_t *hsa_agent = static_cast (acc.get_default_view().get_hsa_agent()); - hsa_region_t *alloc_region; - if (flags & amHostPinned) { - alloc_region = static_cast(acc.get_hsa_am_system_region()); - } else { - alloc_region = static_cast(acc.get_hsa_am_region()); - } - - if (alloc_region->handle != -1) { - - hsa_status_t s1 = hsa_memory_allocate(*alloc_region, sizeBytes, &ptr); - hsa_status_t s2 = hsa_memory_assign_agent(ptr, *hsa_agent, HSA_ACCESS_PERMISSION_RW); - - if ((s1 != HSA_STATUS_SUCCESS) || (s2 != HSA_STATUS_SUCCESS)) { - ptr = NULL; - } else { - if (flags & amHostPinned) { - g_amPointerTracker.insert(ptr, - hc::AmPointerInfo(ptr/*hostPointer*/, ptr /*devicePointer*/, 
sizeBytes, acc, false/*isDevice*/, true /*isAMManaged*/)); - } else { - g_amPointerTracker.insert(ptr, - hc::AmPointerInfo(NULL/*hostPointer*/, ptr /*devicePointer*/, sizeBytes, acc, true/*isDevice*/, true /*isAMManaged*/)); - } - } - } - } - } - - return ptr; -}; - - -am_status_t AM_free(void* ptr) -{ - am_status_t status = AM_SUCCESS; - - if (ptr != NULL) { - // See also tracker::reset which can free memory. - hsa_memory_free(ptr); - - int numRemoved = g_amPointerTracker.remove(ptr) ; - if (numRemoved == 0) { - status = AM_ERROR_MISC; - } - } - return status; -} - - - -am_status_t AM_copy(void* dst, const void* src, size_t sizeBytes) -{ - am_status_t am_status = AM_ERROR_MISC; - hsa_status_t err = hsa_memory_copy(dst, src, sizeBytes); - - if (err == HSA_STATUS_SUCCESS) { - am_status = AM_SUCCESS; - } else { - am_status = AM_ERROR_MISC; - } - - return am_status; -} - - -am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, const void *ptr) -{ - auto infoI = g_amPointerTracker.find(ptr); - if (infoI != g_amPointerTracker.end()) { - *info = infoI->second; - return AM_SUCCESS; - } else { - return AM_ERROR_MISC; - } -} - -am_status_t am_memtracker_add(void* ptr, size_t sizeBytes, hc::accelerator acc, bool isDeviceMem) -{ - if (isDeviceMem) { - g_amPointerTracker.insert(ptr, hc::AmPointerInfo(ptr/*hostPointer*/, ptr /*devicePointer*/, sizeBytes, acc, true/*isDevice*/, false /*isAMManaged*/)); - } else { - g_amPointerTracker.insert(ptr, hc::AmPointerInfo(NULL/*hostPointer*/, ptr /*devicePointer*/, sizeBytes, acc, false/*isDevice*/, false /*isAMManaged*/)); - } - - return AM_SUCCESS; -} - - -am_status_t am_memtracker_update(const void* ptr, int appId, unsigned allocationFlags) -{ - auto iter = g_amPointerTracker.find(ptr); - if (iter != g_amPointerTracker.end()) { - iter->second._appId = appId; - iter->second._appAllocationFlags = allocationFlags; - return AM_SUCCESS; - } else { - return AM_ERROR_MISC; - } -} - - -am_status_t am_memtracker_remove(void* ptr) -{ - 
am_status_t status = AM_SUCCESS; - - int numRemoved = g_amPointerTracker.remove(ptr) ; - if (numRemoved == 0) { - status = AM_ERROR_MISC; - } - - return status; -} - -//--- -void am_memtracker_print() -{ - std::ostream &os = std::cerr; - - //g_amPointerTracker.print(std::cerr); - for (auto iter = g_amPointerTracker.readerLockBegin() ; iter != g_amPointerTracker.end(); iter++) { - os << " " << iter->first._basePointer << "..." << iter->first._endPointer << ":: "; - os << iter->second << std::endl; - } - - g_amPointerTracker.readerUnlock(); -} - - -//--- -void am_memtracker_sizeinfo(hc::accelerator acc, size_t *deviceMemSize, size_t *hostMemSize, size_t *userMemSize) -{ - *deviceMemSize = *hostMemSize = *userMemSize = 0; - for (auto iter = g_amPointerTracker.readerLockBegin() ; iter != g_amPointerTracker.end(); iter++) { - if (iter->second._acc == acc) { - size_t sizeBytes = iter->second._sizeBytes; - if (iter->second._isAmManaged) { - if (iter->second._isInDeviceMem) { - *deviceMemSize += sizeBytes; - } else { - *hostMemSize += sizeBytes; - } - } else { - *userMemSize += sizeBytes; - } - } - } - - g_amPointerTracker.readerUnlock(); -} - - -//--- -size_t am_memtracker_reset(hc::accelerator acc) -{ - return g_amPointerTracker.reset(acc); -} - - -} // end namespace hc. diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 83083022c2..7a32f91747 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -42,21 +42,13 @@ THE SOFTWARE. #define USE_AM_TRACKER 0 /* >0 = use new AM memory tracker features. 
2=use HCC impl */ -#define USE_ROCR_V2 0 +#define USE_ROCR_V2 0 /* use the ROCR v2 async copy API with dst and src agents */ -#if ((USE_AM_TRACKER!=0) && (USE_AM_TRACKER!=2)) -#error (USE_AM_TRACKER must be 0 or 2) +#if (USE_AM_TRACKER) and (__hcc_workweek__ < 16074) +#error (USE_AM_TRACKER requries HCC version of 16074 or newer) #endif -#if USE_AM_TRACKER==1 -#include "hc_AM.cpp" -#define AM_ALLOC hc::AM_alloc -#define AM_FREE hc::AM_free -#else -#define AM_ALLOC hc::am_alloc -#define AM_FREE hc::am_free -#endif #define INLINE static inline @@ -247,9 +239,9 @@ ihipStream_t::ihipStream_t(unsigned device_index, hc::accelerator_view av, unsig { _signalPool.resize(HIP_STREAM_SIGNALS > 0 ? HIP_STREAM_SIGNALS : 1); - auto s = this; #if 0 + auto s = this; std::for_each(_signalPool.begin(), _signalPool.end(), [s](ihipSignal_t &iter) { printf (" stream:%p allocated hsa_signal=%lu\n", s, (iter._hsa_signal.handle)); @@ -1642,7 +1634,7 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) if (device) { const unsigned am_flags = 0; - *ptr = AM_ALLOC(sizeBytes, device->_acc, am_flags); + *ptr = hc::am_alloc(sizeBytes, device->_acc, am_flags); if (sizeBytes && (*ptr == NULL)) { hip_status = hipErrorMemoryAllocation; @@ -1669,7 +1661,7 @@ hipError_t hipMallocHost(void** ptr, size_t sizeBytes) auto device = ihipGetTlsDefaultDevice(); if (device) { - *ptr = AM_ALLOC(sizeBytes, device->_acc, am_flags); + *ptr = hc::am_alloc(sizeBytes, device->_acc, am_flags); if (sizeBytes && (*ptr == NULL)) { hip_status = hipErrorMemoryAllocation; } else { @@ -1715,7 +1707,7 @@ StagingBuffer::StagingBuffer(ihipDevice_t *device, size_t bufferSize, int numBuf for (int i=0; i<_numBuffers; i++) { // TODO - experiment with alignment here. 
- _pinnedStagingBuffer[i] = AM_ALLOC(_bufferSize, device->_acc, amHostPinned); + _pinnedStagingBuffer[i] = hc::am_alloc(_bufferSize, device->_acc, amHostPinned); if (_pinnedStagingBuffer[i] == NULL) { throw; } @@ -1728,7 +1720,7 @@ StagingBuffer::~StagingBuffer() { for (int i=0; i<_numBuffers; i++) { if (_pinnedStagingBuffer[i]) { - AM_FREE(_pinnedStagingBuffer[i]); + hc::am_free(_pinnedStagingBuffer[i]); _pinnedStagingBuffer[i] = NULL; } hsa_signal_destroy(_completion_signal[i]); @@ -2112,7 +2104,7 @@ hipError_t hipFree(void* ptr) ihipWaitAllStreams(ihipGetTlsDefaultDevice()); if (ptr) { - AM_FREE(ptr); + hc::am_free(ptr); } return ihipLogStatus(hipSuccess); @@ -2126,7 +2118,7 @@ hipError_t hipFreeHost(void* ptr) if (ptr) { tprintf (TRACE_MEM, " %s: %p\n", __func__, ptr); - AM_FREE(ptr); + hc::am_free(ptr); } return ihipLogStatus(hipSuccess); From 4e0ba0604499618b3c97b9a9e994df83b5b12332 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Wed, 17 Feb 2016 23:03:37 -0600 Subject: [PATCH 46/94] Enable Tracker and ROCR by default, verify with HCC [ROCm/hip commit: c6f8883b0d965da15241d2911cc1f4ee4ff00952] --- projects/hip/src/hip_hcc.cpp | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 7a32f91747..f814f99ddf 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -41,14 +41,19 @@ THE SOFTWARE. -#define USE_AM_TRACKER 0 /* >0 = use new AM memory tracker features. 2=use HCC impl */ -#define USE_ROCR_V2 0 /* use the ROCR v2 async copy API with dst and src agents */ +#define USE_AM_TRACKER 1 /* >0 = use new AM memory tracker features. 
*/ +#define USE_ROCR_V2 1 /* use the ROCR v2 async copy API with dst and src agents */ #if (USE_AM_TRACKER) and (__hcc_workweek__ < 16074) #error (USE_AM_TRACKER requries HCC version of 16074 or newer) #endif +#if (USE_ROCR_V2) and (USE_AM_TRACKER == 0) +#error (USE_ROCR_V2 requires USE_AM_TRACKER>0) +#endif + + #define INLINE static inline @@ -1478,7 +1483,8 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr) hipError_t e = hipSuccess; #if USE_AM_TRACKER - hc::AmPointerInfo amPointerInfo; + hc::accelerator acc; + hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0); am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, ptr); if (status == AM_SUCCESS) { @@ -1530,7 +1536,8 @@ hipError_t hipHostGetDevicePointer(void **devicePointer, void *hostPointer, unsi if (flags == 0) { e = hipErrorInvalidValue; } else { - hc::AmPointerInfo amPointerInfo; + hc::accelerator acc; + hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0); am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, hostPointer); if (status == AM_SUCCESS) { *devicePointer = amPointerInfo._devicePointer; @@ -1836,7 +1843,9 @@ void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeByte #if USE_AM_TRACKER void ihipSyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) { - hc::AmPointerInfo dstPtrInfo, srcPtrInfo; + hc::accelerator acc; + hc::AmPointerInfo dstPtrInfo(NULL, NULL, 0, acc, 0, 0); + hc::AmPointerInfo srcPtrInfo(NULL, NULL, 0, acc, 0, 0); bool dstNotTracked = (hc::am_memtracker_getinfo(&dstPtrInfo, dst) != AM_SUCCESS); bool srcNotTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) != AM_SUCCESS); From 033914d8024c8981be035306eadbdd17e2ca0e07 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 18 Feb 2016 21:29:51 -0600 Subject: [PATCH 47/94] Tweak version numbers [ROCm/hip commit: b12ec2180667f0231eb7009b0782b338e9d553a3] --- projects/hip/RELEASE.md | 6 +++--- 1 file changed, 3 
insertions(+), 3 deletions(-) diff --git a/projects/hip/RELEASE.md b/projects/hip/RELEASE.md index 9fea5d4c78..ae0a0d2b4e 100644 --- a/projects/hip/RELEASE.md +++ b/projects/hip/RELEASE.md @@ -17,17 +17,17 @@ Stay tuned - the work for many of these features is already in-flight. ## Revision History: =================================================================================================== -Release:0.80.01.00 +Release:0.80.01 Date: 2016.02.18 - Improve reporting and support for device-side math functions. - Update Runtime Documentation. - Improve implementations of cross-lane operations (_ballot, _any, _all). - Provide shuffle intrinsics (performance optimization in-progress). - Support hipDeviceAttribute for querying "one-shot" device attributes, as an alternative to hipDeviceGetProperties. -- + =================================================================================================== -Release:0.80.00.00 : +Release:0.80.00 : Date: 2016.01.25 Initial release with GPUOpen Launch. 
From d064a446d0b37b4a82efc9a74927fbd1ea51f83f Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 18 Feb 2016 03:05:53 -0600 Subject: [PATCH 48/94] remove extra : [ROCm/hip commit: b63470f4cc5d0cc8a7cf3d8a409e0dda012c923f] --- projects/hip/include/nvcc_detail/hip_runtime_api.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/hip/include/nvcc_detail/hip_runtime_api.h b/projects/hip/include/nvcc_detail/hip_runtime_api.h index 7a1e9bc6e9..0ef4b38c67 100644 --- a/projects/hip/include/nvcc_detail/hip_runtime_api.h +++ b/projects/hip/include/nvcc_detail/hip_runtime_api.h @@ -243,7 +243,7 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att case hipDeviceAttributeClockRate: cdattr = cudaDevAttrClockRate; break; case hipDeviceAttributeMemoryClockRate: - cdattr = cudaDevAttrMemoryClockRate:; break; + cdattr = cudaDevAttrMemoryClockRate; break; case hipDeviceAttributeMultiprocessorCount: cdattr = cudaDevAttrMultiProcessorCount; break; case hipDeviceAttributeComputeMode: From 8c1a0d19242e5777edcaacd31ddba2db0e4cb326 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Thu, 18 Feb 2016 14:34:18 +0300 Subject: [PATCH 49/94] Attribute hipDevAttrConcurrentKernels for obtaining Device property concurrentKernels is added. [ROCm/hip commit: 2b6fda77caf86bb404a9cf1e0e802ce9bf730596] --- projects/hip/include/hip_runtime_api.h | 1 + projects/hip/include/nvcc_detail/hip_runtime_api.h | 2 ++ projects/hip/src/hip_hcc.cpp | 2 ++ projects/hip/tests/src/hipGetDeviceAttribute.cpp | 1 + 4 files changed, 6 insertions(+) diff --git a/projects/hip/include/hip_runtime_api.h b/projects/hip/include/hip_runtime_api.h index 882103a1f4..d754862544 100644 --- a/projects/hip/include/hip_runtime_api.h +++ b/projects/hip/include/hip_runtime_api.h @@ -149,6 +149,7 @@ typedef enum hipDeviceAttribute_t { hipDeviceAttributeMaxThreadsPerMultiProcessor, ///< Maximum resident threads per multiprocessor. 
hipDeviceAttributeComputeCapabilityMajor, ///< Major compute capability version number. hipDeviceAttributeComputeCapabilityMinor, ///< Minor compute capability version number. + hipDevAttrConcurrentKernels, ///< Device can possibly execute multiple kernels concurrently. hipDeviceAttributePciBusId, ///< PCI Bus ID. hipDeviceAttributePciDeviceId, ///< PCI Device ID. hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, ///< Maximum Shared Memory Per Multiprocessor. diff --git a/projects/hip/include/nvcc_detail/hip_runtime_api.h b/projects/hip/include/nvcc_detail/hip_runtime_api.h index f84de73872..83f2d59646 100644 --- a/projects/hip/include/nvcc_detail/hip_runtime_api.h +++ b/projects/hip/include/nvcc_detail/hip_runtime_api.h @@ -252,6 +252,8 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att cdattr = cudaDevAttrMaxThreadsPerMultiProcessor; break; case hipDeviceAttributeComputeCapabilityMajor: cdattr = cudaDevAttrComputeCapabilityMajor; break; + case hipDevAttrConcurrentKernels: + cdattr = cudaDevAttrConcurrentKernels; break; case hipDeviceAttributePciBusId: cdattr = cudaDevAttrPciBusId; break; case hipDeviceAttributePciDeviceId: diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index bdfbdb230b..1ce3f4a5bb 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -852,6 +852,8 @@ hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device) *pi = prop->minor; break; case hipDeviceAttributePciBusId: *pi = prop->pciBusID; break; + case hipDevAttrConcurrentKernels: + *pi = prop->concurrentKernels; break; case hipDeviceAttributePciDeviceId: *pi = prop->pciDeviceID; break; case hipDeviceAttributeMaxSharedMemoryPerMultiprocessor: diff --git a/projects/hip/tests/src/hipGetDeviceAttribute.cpp b/projects/hip/tests/src/hipGetDeviceAttribute.cpp index 30fac8c1b4..7f37e816d2 100644 --- a/projects/hip/tests/src/hipGetDeviceAttribute.cpp +++ 
b/projects/hip/tests/src/hipGetDeviceAttribute.cpp @@ -73,6 +73,7 @@ int main(int argc, char *argv[]) CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMaxThreadsPerMultiProcessor, props.maxThreadsPerMultiProcessor)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeComputeCapabilityMajor, props.major)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeComputeCapabilityMinor, props.minor)); + CHECK(test_hipDeviceGetAttribute(deviceId, hipDevAttrConcurrentKernels, props.concurrentKernels)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributePciBusId, props.pciBusID)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributePciDeviceId, props.pciDeviceID)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, props.maxSharedMemoryPerMultiProcessor)); From 801ae992d6801bc2a7249339925489f86759b590 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Thu, 18 Feb 2016 15:08:55 +0300 Subject: [PATCH 50/94] hipInfo sample update with new Device Properties. 
[ROCm/hip commit: 5b05a9fef10b13d082d74f05efef6bd43b0f24cb] --- .../hip/samples/1_Utils/hipInfo/hipInfo.cpp | 78 ++++++++++--------- 1 file changed, 41 insertions(+), 37 deletions(-) diff --git a/projects/hip/samples/1_Utils/hipInfo/hipInfo.cpp b/projects/hip/samples/1_Utils/hipInfo/hipInfo.cpp index de73aababd..c8979b1cc1 100644 --- a/projects/hip/samples/1_Utils/hipInfo/hipInfo.cpp +++ b/projects/hip/samples/1_Utils/hipInfo/hipInfo.cpp @@ -76,51 +76,55 @@ void printDeviceProp (int deviceId) HIPCHECK(hipDeviceGetProperties(&props, deviceId)); cout << setw(w1) << "Name: " << props.name << endl; + cout << setw(w1) << "pciBusID: " << props.pciBusID << endl; + cout << setw(w1) << "pciDeviceID: " << props.pciDeviceID << endl; cout << setw(w1) << "multiProcessorCount: " << props.multiProcessorCount << endl; + cout << setw(w1) << "maxThreadsPerMultiProcessor: " << props.maxThreadsPerMultiProcessor << endl; cout << setw(w1) << "clockRate: " << (float)props.clockRate / 1000.0 << " Mhz" << endl; - cout << setw(w1) << "clockInstructionRate: " << (float)props.clockInstructionRate / 1000.0<< " Mhz" << endl; - cout << setw(w1) << "totalGlobalMem" << fixed << setprecision(2) << bytesToGB(props.totalGlobalMem) << " GB" << endl; - cout << setw(w1) << "sharedMemPerBlock" << (float)props.sharedMemPerBlock / 1024.0 << " KB" << endl; - cout << setw(w1) << "regsPerBlock" << props.regsPerBlock << endl; - cout << setw(w1) << "warpSize" << props.warpSize << endl; - cout << setw(w1) << "maxThreadsPerBlock" << props.maxThreadsPerBlock << endl; - cout << setw(w1) << "maxThreadsDim.x" << props.maxThreadsDim[0] << endl; - cout << setw(w1) << "maxThreadsDim.y" << props.maxThreadsDim[1] << endl; - cout << setw(w1) << "maxThreadsDim.z" << props.maxThreadsDim[2] << endl; - cout << setw(w1) << "maxGridSize.x" << props.maxGridSize[0] << endl; - cout << setw(w1) << "maxGridSize.y" << props.maxGridSize[1] << endl; - cout << setw(w1) << "maxGridSize.z" << props.maxGridSize[2] << endl; - cout << 
setw(w1) << "totalConstMem" << props.totalConstMem << endl; - cout << setw(w1) << "major" << props.major << endl; - cout << setw(w1) << "minor" << props.minor << endl; - cout << setw(w1) << "l2CacheSize" << props.l2CacheSize << endl; - cout << setw(w1) << "maxThreadsPerMultiProcessor" << props.maxThreadsPerMultiProcessor << endl; - cout << setw(w1) << "computeMode" << props.computeMode << endl; - cout << setw(w1) << "arch.hasGlobalInt32Atomics" << props.arch.hasGlobalInt32Atomics << endl; - cout << setw(w1) << "arch.hasGlobalFloatAtomicExch" << props.arch.hasGlobalFloatAtomicExch << endl; - cout << setw(w1) << "arch.hasSharedInt32Atomics" << props.arch.hasSharedInt32Atomics << endl; - cout << setw(w1) << "arch.hasSharedFloatAtomicExch" << props.arch.hasSharedFloatAtomicExch << endl; - cout << setw(w1) << "arch.hasFloatAtomicAdd" << props.arch.hasFloatAtomicAdd << endl; - cout << setw(w1) << "arch.hasGlobalInt64Atomics" << props.arch.hasGlobalInt64Atomics << endl; - cout << setw(w1) << "arch.hasSharedInt64Atomics" << props.arch.hasSharedInt64Atomics << endl; - cout << setw(w1) << "arch.hasDoubles" << props.arch.hasDoubles << endl; - cout << setw(w1) << "arch.hasWarpVote" << props.arch.hasWarpVote << endl; - cout << setw(w1) << "arch.hasWarpBallot" << props.arch.hasWarpBallot << endl; - cout << setw(w1) << "arch.hasWarpShuffle" << props.arch.hasWarpShuffle << endl; - cout << setw(w1) << "arch.hasFunnelShift" << props.arch.hasFunnelShift << endl; - cout << setw(w1) << "arch.hasThreadFenceSystem" << props.arch.hasThreadFenceSystem << endl; - cout << setw(w1) << "arch.hasSyncThreadsExt" << props.arch.hasSyncThreadsExt << endl; - cout << setw(w1) << "arch.hasSurfaceFuncs" << props.arch.hasSurfaceFuncs << endl; - cout << setw(w1) << "arch.has3dGrid" << props.arch.has3dGrid << endl; - cout << setw(w1) << "arch.hasDynamicParallelism" << props.arch.hasDynamicParallelism << endl; + cout << setw(w1) << "clockInstructionRate: " << (float)props.clockInstructionRate / 1000.0<< " 
Mhz" << endl; + cout << setw(w1) << "totalGlobalMem: " << fixed << setprecision(2) << bytesToGB(props.totalGlobalMem) << " GB" << endl; + cout << setw(w1) << "maxSharedMemoryPerMultiProcessor: " << fixed << setprecision(2) << bytesToGB(props.maxSharedMemoryPerMultiProcessor) << " GB" << endl; + cout << setw(w1) << "totalConstMem: " << props.totalConstMem << endl; + cout << setw(w1) << "sharedMemPerBlock: " << (float)props.sharedMemPerBlock / 1024.0 << " KB" << endl; + cout << setw(w1) << "regsPerBlock: " << props.regsPerBlock << endl; + cout << setw(w1) << "warpSize: " << props.warpSize << endl; + cout << setw(w1) << "l2CacheSize: " << props.l2CacheSize << endl; + cout << setw(w1) << "computeMode: " << props.computeMode << endl; + cout << setw(w1) << "maxThreadsPerBlock: " << props.maxThreadsPerBlock << endl; + cout << setw(w1) << "maxThreadsDim.x: " << props.maxThreadsDim[0] << endl; + cout << setw(w1) << "maxThreadsDim.y: " << props.maxThreadsDim[1] << endl; + cout << setw(w1) << "maxThreadsDim.z: " << props.maxThreadsDim[2] << endl; + cout << setw(w1) << "maxGridSize.x: " << props.maxGridSize[0] << endl; + cout << setw(w1) << "maxGridSize.y: " << props.maxGridSize[1] << endl; + cout << setw(w1) << "maxGridSize.z: " << props.maxGridSize[2] << endl; + cout << setw(w1) << "major: " << props.major << endl; + cout << setw(w1) << "minor: " << props.minor << endl; + cout << setw(w1) << "concurrentKernels: " << props.concurrentKernels << endl; + cout << setw(w1) << "arch.hasGlobalInt32Atomics: " << props.arch.hasGlobalInt32Atomics << endl; + cout << setw(w1) << "arch.hasGlobalFloatAtomicExch: " << props.arch.hasGlobalFloatAtomicExch << endl; + cout << setw(w1) << "arch.hasSharedInt32Atomics: " << props.arch.hasSharedInt32Atomics << endl; + cout << setw(w1) << "arch.hasSharedFloatAtomicExch: " << props.arch.hasSharedFloatAtomicExch << endl; + cout << setw(w1) << "arch.hasFloatAtomicAdd: " << props.arch.hasFloatAtomicAdd << endl; + cout << setw(w1) << 
"arch.hasGlobalInt64Atomics: " << props.arch.hasGlobalInt64Atomics << endl; + cout << setw(w1) << "arch.hasSharedInt64Atomics: " << props.arch.hasSharedInt64Atomics << endl; + cout << setw(w1) << "arch.hasDoubles: " << props.arch.hasDoubles << endl; + cout << setw(w1) << "arch.hasWarpVote: " << props.arch.hasWarpVote << endl; + cout << setw(w1) << "arch.hasWarpBallot: " << props.arch.hasWarpBallot << endl; + cout << setw(w1) << "arch.hasWarpShuffle: " << props.arch.hasWarpShuffle << endl; + cout << setw(w1) << "arch.hasFunnelShift: " << props.arch.hasFunnelShift << endl; + cout << setw(w1) << "arch.hasThreadFenceSystem: " << props.arch.hasThreadFenceSystem << endl; + cout << setw(w1) << "arch.hasSyncThreadsExt: " << props.arch.hasSyncThreadsExt << endl; + cout << setw(w1) << "arch.hasSurfaceFuncs: " << props.arch.hasSurfaceFuncs << endl; + cout << setw(w1) << "arch.has3dGrid: " << props.arch.has3dGrid << endl; + cout << setw(w1) << "arch.hasDynamicParallelism: " << props.arch.hasDynamicParallelism << endl; cout << endl; size_t free, total; hipMemGetInfo(&free, &total); cout << fixed << setprecision(2); - cout << setw(w1) << "memInfo.total " << bytesToGB(total) << " GB" << endl; - cout << setw(w1) << "memInfo.free " << bytesToGB(free) << " GB (" << setprecision(0) << (float)free/total * 100.0 << "%)" << endl; + cout << setw(w1) << "memInfo.total: " << bytesToGB(total) << " GB" << endl; + cout << setw(w1) << "memInfo.free: " << bytesToGB(free) << " GB (" << setprecision(0) << (float)free/total * 100.0 << "%)" << endl; } int main(int argc, char *argv[]) From a0cc7134e3de081c25b36f2102580e171a5c37ef Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Thu, 18 Feb 2016 17:25:28 +0300 Subject: [PATCH 51/94] Device property memoryClockRate implementation. + Device property memoryClockRate is added to hipDeviceProp_t struct. + Device attribute hipDeviceAttributeMemoryClockRate is added to hipDeviceAttribute_t struct. + Tests update. 
+ Rename hipDevAttrConcurrentKernels to hipDeviceAttributeConcurrentKernels. [ROCm/hip commit: 5ea8543d2eab4ea7ef9964b724d37669384d4423] --- projects/hip/include/hip_runtime_api.h | 6 ++++-- projects/hip/include/nvcc_detail/hip_runtime_api.h | 4 +++- projects/hip/samples/1_Utils/hipInfo/hipInfo.cpp | 3 ++- projects/hip/src/hip_hcc.cpp | 11 +++++++++-- projects/hip/tests/src/hipGetDeviceAttribute.cpp | 3 ++- 5 files changed, 20 insertions(+), 7 deletions(-) diff --git a/projects/hip/include/hip_runtime_api.h b/projects/hip/include/hip_runtime_api.h index d754862544..5191bc5d54 100644 --- a/projects/hip/include/hip_runtime_api.h +++ b/projects/hip/include/hip_runtime_api.h @@ -80,7 +80,8 @@ typedef struct hipDeviceProp_t { int maxThreadsPerBlock; ///< Max work items per work group or workgroup max size. int maxThreadsDim[3]; ///< Max number of threads in each dimension (XYZ) of a block. int maxGridSize[3]; ///< Max grid dimensions (XYZ). - int clockRate; ///< Max clock frequency of the multiProcessors, in khz. + int clockRate; ///< Max clock frequency of the multiProcessors in khz. + int memoryClockRate; ///< Max memory clock frequency in khz. size_t totalConstMem; ///< Size of shared memory region (in bytes). int major; ///< Major compute capability. On HCC, this is an approximation and features may differ from CUDA CC. See the arch feature flags for portable ways to query feature caps. int minor; ///< Minor compute capability. On HCC, this is an approximation and features may differ from CUDA CC. See the arch feature flags for portable ways to query feature caps. @@ -143,13 +144,14 @@ typedef enum hipDeviceAttribute_t { hipDeviceAttributeWarpSize, ///< Warp size in threads. hipDeviceAttributeMaxRegistersPerBlock, ///< Maximum number of 32-bit registers available to a thread block. This number is shared by all thread blocks simultaneously resident on a multiprocessor. hipDeviceAttributeClockRate, ///< Peak clock frequency in kilohertz. 
+ hipDeviceAttributeMemoryClockRate, ///< Peak memory clock frequency in kilohertz. hipDeviceAttributeMultiprocessorCount, ///< Number of multiprocessors on the device. hipDeviceAttributeComputeMode, ///< Compute mode that device is currently in. hipDeviceAttributeL2CacheSize, ///< Size of L2 cache in bytes. 0 if the device doesn't have L2 cache. hipDeviceAttributeMaxThreadsPerMultiProcessor, ///< Maximum resident threads per multiprocessor. hipDeviceAttributeComputeCapabilityMajor, ///< Major compute capability version number. hipDeviceAttributeComputeCapabilityMinor, ///< Minor compute capability version number. - hipDevAttrConcurrentKernels, ///< Device can possibly execute multiple kernels concurrently. + hipDeviceAttributeConcurrentKernels, ///< Device can possibly execute multiple kernels concurrently. hipDeviceAttributePciBusId, ///< PCI Bus ID. hipDeviceAttributePciDeviceId, ///< PCI Device ID. hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, ///< Maximum Shared Memory Per Multiprocessor. 
diff --git a/projects/hip/include/nvcc_detail/hip_runtime_api.h b/projects/hip/include/nvcc_detail/hip_runtime_api.h index 83f2d59646..7a1e9bc6e9 100644 --- a/projects/hip/include/nvcc_detail/hip_runtime_api.h +++ b/projects/hip/include/nvcc_detail/hip_runtime_api.h @@ -242,6 +242,8 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att cdattr = cudaDevAttrMaxRegistersPerBlock; break; case hipDeviceAttributeClockRate: cdattr = cudaDevAttrClockRate; break; + case hipDeviceAttributeMemoryClockRate: + cdattr = cudaDevAttrMemoryClockRate:; break; case hipDeviceAttributeMultiprocessorCount: cdattr = cudaDevAttrMultiProcessorCount; break; case hipDeviceAttributeComputeMode: @@ -252,7 +254,7 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att cdattr = cudaDevAttrMaxThreadsPerMultiProcessor; break; case hipDeviceAttributeComputeCapabilityMajor: cdattr = cudaDevAttrComputeCapabilityMajor; break; - case hipDevAttrConcurrentKernels: + case hipDeviceAttributeConcurrentKernels: cdattr = cudaDevAttrConcurrentKernels; break; case hipDeviceAttributePciBusId: cdattr = cudaDevAttrPciBusId; break; diff --git a/projects/hip/samples/1_Utils/hipInfo/hipInfo.cpp b/projects/hip/samples/1_Utils/hipInfo/hipInfo.cpp index c8979b1cc1..18d9176a07 100644 --- a/projects/hip/samples/1_Utils/hipInfo/hipInfo.cpp +++ b/projects/hip/samples/1_Utils/hipInfo/hipInfo.cpp @@ -81,7 +81,8 @@ void printDeviceProp (int deviceId) cout << setw(w1) << "multiProcessorCount: " << props.multiProcessorCount << endl; cout << setw(w1) << "maxThreadsPerMultiProcessor: " << props.maxThreadsPerMultiProcessor << endl; cout << setw(w1) << "clockRate: " << (float)props.clockRate / 1000.0 << " Mhz" << endl; - cout << setw(w1) << "clockInstructionRate: " << (float)props.clockInstructionRate / 1000.0<< " Mhz" << endl; + cout << setw(w1) << "memoryClockRate: " << (float)props.memoryClockRate / 1000.0 << " Mhz" << endl; + cout << setw(w1) << "clockInstructionRate: " 
<< (float)props.clockInstructionRate / 1000.0 << " Mhz" << endl; cout << setw(w1) << "totalGlobalMem: " << fixed << setprecision(2) << bytesToGB(props.totalGlobalMem) << " GB" << endl; cout << setw(w1) << "maxSharedMemoryPerMultiProcessor: " << fixed << setprecision(2) << bytesToGB(props.maxSharedMemoryPerMultiProcessor) << " GB" << endl; cout << setw(w1) << "totalConstMem: " << props.totalConstMem << endl; diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 1ce3f4a5bb..beba7c2775 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -362,12 +362,17 @@ hipError_t ihipDevice_t::getProperties(hipDeviceProp_t* prop) // Get the size of the region we are using for Accelerator Memory allocations: hsa_region_t *am_region = static_cast (_acc.get_hsa_am_region()); - err = hsa_region_get_info(*am_region, HSA_REGION_INFO_SIZE, &(prop->totalGlobalMem)); + err = hsa_region_get_info(*am_region, HSA_REGION_INFO_SIZE, &prop->totalGlobalMem); DeviceErrorCheck(err); // maxSharedMemoryPerMultiProcessor should be as the same as group memory size. // Group memory will not be paged out, so, the physical memory size is the total shared memory size, and also equal to the group region size. prop->maxSharedMemoryPerMultiProcessor = prop->totalGlobalMem; + // Get Max memory clock frequency + err = hsa_region_get_info(*am_region, (hsa_region_info_t)HSA_AMD_REGION_INFO_MAX_CLOCK_FREQUENCY, &prop->memoryClockRate); + prop->memoryClockRate *= 1000.0; // convert Mhz to Khz. + DeviceErrorCheck(err); + // Set feature flags - these are all mandatory for HIP on HCC path: // Some features are under-development and future revs may support flags that are currently 0. 
// Reporting of these flags should be synchronized with the HIP_ARCH* compile-time defines in hip_runtime.h @@ -838,6 +843,8 @@ hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device) *pi = prop->regsPerBlock; break; case hipDeviceAttributeClockRate: *pi = prop->clockRate; break; + case hipDeviceAttributeMemoryClockRate: + *pi = prop->memoryClockRate; break; case hipDeviceAttributeMultiprocessorCount: *pi = prop->multiProcessorCount; break; case hipDeviceAttributeComputeMode: @@ -852,7 +859,7 @@ hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device) *pi = prop->minor; break; case hipDeviceAttributePciBusId: *pi = prop->pciBusID; break; - case hipDevAttrConcurrentKernels: + case hipDeviceAttributeConcurrentKernels: *pi = prop->concurrentKernels; break; case hipDeviceAttributePciDeviceId: *pi = prop->pciDeviceID; break; diff --git a/projects/hip/tests/src/hipGetDeviceAttribute.cpp b/projects/hip/tests/src/hipGetDeviceAttribute.cpp index 7f37e816d2..33b5e2ba03 100644 --- a/projects/hip/tests/src/hipGetDeviceAttribute.cpp +++ b/projects/hip/tests/src/hipGetDeviceAttribute.cpp @@ -67,13 +67,14 @@ int main(int argc, char *argv[]) CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeWarpSize, props.warpSize)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMaxRegistersPerBlock, props.regsPerBlock)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeClockRate, props.clockRate)); + CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMemoryClockRate, props.memoryClockRate)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMultiprocessorCount, props.multiProcessorCount)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeComputeMode, props.computeMode)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeL2CacheSize, props.l2CacheSize)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMaxThreadsPerMultiProcessor, 
props.maxThreadsPerMultiProcessor)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeComputeCapabilityMajor, props.major)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeComputeCapabilityMinor, props.minor)); - CHECK(test_hipDeviceGetAttribute(deviceId, hipDevAttrConcurrentKernels, props.concurrentKernels)); + CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeConcurrentKernels, props.concurrentKernels)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributePciBusId, props.pciBusID)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributePciDeviceId, props.pciDeviceID)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, props.maxSharedMemoryPerMultiProcessor)); From 5cb2d5fc9a1f35d97ba9abec287332f4cc553605 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 18 Feb 2016 20:43:03 -0600 Subject: [PATCH 52/94] Update doxygen HTML [ROCm/hip commit: 129088992b42ec7f4bee66064d54930e9c6d31a1] --- .../docs/RuntimeAPI/html/Synchonization.html | 6 +- .../hip/docs/RuntimeAPI/html/annotated.html | 2 +- projects/hip/docs/RuntimeAPI/html/bug.html | 2 +- .../hip/docs/RuntimeAPI/html/classes.html | 2 +- .../dir_68267d1309a1af8e8297ef4c3efbcdba.html | 4 +- .../dir_6d8604cb65fa6b83549668eb0ce09cac.html | 17 +- .../dir_d44c64559bbebec7f509842c48db8b23.html | 4 +- projects/hip/docs/RuntimeAPI/html/files.html | 12 +- .../hip/docs/RuntimeAPI/html/functions.html | 22 +- .../docs/RuntimeAPI/html/functions_vars.html | 22 +- .../hip/docs/RuntimeAPI/html/globals.html | 247 ++++- .../docs/RuntimeAPI/html/globals_defs.html | 29 +- .../docs/RuntimeAPI/html/globals_func.html | 118 ++- .../docs/RuntimeAPI/html/globals_vars.html | 5 +- .../hip/docs/RuntimeAPI/html/group__API.html | 2 +- .../docs/RuntimeAPI/html/group__Device.html | 61 +- .../docs/RuntimeAPI/html/group__Error.html | 2 +- .../docs/RuntimeAPI/html/group__Event.html | 8 +- .../RuntimeAPI/html/group__GlobalDefs.html | 134 ++- 
.../RuntimeAPI/html/group__HCC__Specific.html | 10 +- .../docs/RuntimeAPI/html/group__HIP-ENV.html | 2 +- .../docs/RuntimeAPI/html/group__Memory.html | 209 +++- .../RuntimeAPI/html/group__PeerToPeer.html | 100 +- .../docs/RuntimeAPI/html/group__Profiler.html | 4 +- .../docs/RuntimeAPI/html/group__Stream.html | 6 +- .../docs/RuntimeAPI/html/group__Texture.html | 2 +- .../docs/RuntimeAPI/html/group__Version.html | 8 +- .../html/hcc__detail_2hip__runtime_8h.html | 134 ++- .../hcc__detail_2hip__runtime_8h_source.html | 912 +++++++++--------- ...__detail_2hip__runtime__api_8h_source.html | 655 +++++++------ ..._detail_2hip__vector__types_8h_source.html | 306 +++--- .../hip/docs/RuntimeAPI/html/hierarchy.html | 2 +- .../html/hip__common_8h_source.html | 106 +- .../docs/RuntimeAPI/html/hip__hcc_8cpp.html | 38 +- .../html/hip__runtime_8h_source.html | 12 +- .../html/hip__runtime__api_8h_source.html | 262 +++-- .../html/hip__texture_8h_source.html | 54 +- .../html/hip__vector__types_8h_source.html | 7 +- .../html/host__defines_8h_source.html | 77 +- projects/hip/docs/RuntimeAPI/html/index.html | 2 +- .../hip/docs/RuntimeAPI/html/modules.html | 2 +- projects/hip/docs/RuntimeAPI/html/pages.html | 2 +- .../hip/docs/RuntimeAPI/html/search/all_0.js | 2 +- .../hip/docs/RuntimeAPI/html/search/all_1.js | 2 +- .../hip/docs/RuntimeAPI/html/search/all_10.js | 5 +- .../hip/docs/RuntimeAPI/html/search/all_11.js | 2 +- .../hip/docs/RuntimeAPI/html/search/all_2.js | 5 +- .../hip/docs/RuntimeAPI/html/search/all_3.js | 8 +- .../hip/docs/RuntimeAPI/html/search/all_4.js | 5 +- .../hip/docs/RuntimeAPI/html/search/all_5.js | 3 +- .../hip/docs/RuntimeAPI/html/search/all_6.js | 105 +- .../hip/docs/RuntimeAPI/html/search/all_7.js | 144 ++- .../hip/docs/RuntimeAPI/html/search/all_8.js | 4 +- .../hip/docs/RuntimeAPI/html/search/all_9.js | 10 +- .../hip/docs/RuntimeAPI/html/search/all_a.js | 12 +- .../hip/docs/RuntimeAPI/html/search/all_b.js | 2 +- .../hip/docs/RuntimeAPI/html/search/all_c.js | 3 +- 
.../hip/docs/RuntimeAPI/html/search/all_d.js | 6 +- .../hip/docs/RuntimeAPI/html/search/all_e.js | 2 +- .../hip/docs/RuntimeAPI/html/search/all_f.js | 3 +- .../docs/RuntimeAPI/html/search/defines_0.js | 2 +- .../docs/RuntimeAPI/html/search/enums_0.js | 5 +- .../RuntimeAPI/html/search/enumvalues_0.js | 25 + .../docs/RuntimeAPI/html/search/files_0.js | 6 +- .../RuntimeAPI/html/search/functions_0.js | 5 + .../hip/docs/RuntimeAPI/html/search/search.js | 6 +- .../RuntimeAPI/html/search/variables_1.js | 3 +- .../RuntimeAPI/html/search/variables_4.js | 2 + .../RuntimeAPI/html/search/variables_6.js | 3 +- .../RuntimeAPI/html/search/variables_7.js | 2 +- .../RuntimeAPI/html/search/variables_8.js | 3 +- .../RuntimeAPI/html/search/variables_9.js | 3 +- .../RuntimeAPI/html/search/variables_a.js | 2 +- .../RuntimeAPI/html/search/variables_b.js | 2 +- .../RuntimeAPI/html/search/variables_c.js | 2 +- .../RuntimeAPI/html/structdim3-members.html | 2 +- .../hip/docs/RuntimeAPI/html/structdim3.html | 4 +- .../structhipChannelFormatDesc-members.html | 2 +- .../html/structhipChannelFormatDesc.html | 4 +- .../html/structhipDeviceArch__t-members.html | 2 +- .../html/structhipDeviceArch__t.html | 36 +- .../html/structhipDeviceProp__t-members.html | 29 +- .../html/structhipDeviceProp__t.html | 44 +- .../html/structhipEvent__t-members.html | 2 +- .../RuntimeAPI/html/structhipEvent__t.html | 4 +- .../html/structihipDevice__t-members.html | 2 +- .../RuntimeAPI/html/structihipDevice__t.html | 4 +- .../html/structihipEvent__t-members.html | 2 +- .../RuntimeAPI/html/structihipEvent__t.html | 4 +- .../html/structihipStream__t-members.html | 2 +- .../RuntimeAPI/html/structihipStream__t.html | 4 +- .../html/structtexture-members.html | 2 +- .../docs/RuntimeAPI/html/structtexture.html | 6 +- .../html/structtextureReference-members.html | 2 +- .../html/structtextureReference.html | 6 +- 95 files changed, 2624 insertions(+), 1544 deletions(-) diff --git 
a/projects/hip/docs/RuntimeAPI/html/Synchonization.html b/projects/hip/docs/RuntimeAPI/html/Synchonization.html index c21879b667..f60f0cc1e4 100644 --- a/projects/hip/docs/RuntimeAPI/html/Synchonization.html +++ b/projects/hip/docs/RuntimeAPI/html/Synchonization.html @@ -79,8 +79,8 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');

The following commands are "host-asynchronous" - meaning they do not wait for any preceding commands to complete, and may return control to the host thread before the requested operation completes:

  • Kernel launches (hipLaunchKernel() )
  • -
  • Asynchronous memory copies - any memory copy API which contains "Async", such as hipMemcpyAsync())
  • -
  • Any memory set (for example, hipMemset());
  • +
  • Asynchronous memory copies - any memory copy API which contains "Async", such as hipMemcpyAsync()
  • +
  • Any memory set (for example, hipMemset());
  • TODO

"Host-synchronous" commands have the following properties:

@@ -109,7 +109,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search'); diff --git a/projects/hip/docs/RuntimeAPI/html/annotated.html b/projects/hip/docs/RuntimeAPI/html/annotated.html index 49ef4eac76..4a61698f75 100644 --- a/projects/hip/docs/RuntimeAPI/html/annotated.html +++ b/projects/hip/docs/RuntimeAPI/html/annotated.html @@ -104,7 +104,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search'); diff --git a/projects/hip/docs/RuntimeAPI/html/bug.html b/projects/hip/docs/RuntimeAPI/html/bug.html index bdc31de0f8..d9c64d7eb8 100644 --- a/projects/hip/docs/RuntimeAPI/html/bug.html +++ b/projects/hip/docs/RuntimeAPI/html/bug.html @@ -95,7 +95,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search'); diff --git a/projects/hip/docs/RuntimeAPI/html/classes.html b/projects/hip/docs/RuntimeAPI/html/classes.html index 6bd01f36c7..70491ef32f 100644 --- a/projects/hip/docs/RuntimeAPI/html/classes.html +++ b/projects/hip/docs/RuntimeAPI/html/classes.html @@ -105,7 +105,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search'); diff --git a/projects/hip/docs/RuntimeAPI/html/dir_68267d1309a1af8e8297ef4c3efbcdba.html b/projects/hip/docs/RuntimeAPI/html/dir_68267d1309a1af8e8297ef4c3efbcdba.html index fd08fac040..7d6a9d0e72 100644 --- a/projects/hip/docs/RuntimeAPI/html/dir_68267d1309a1af8e8297ef4c3efbcdba.html +++ b/projects/hip/docs/RuntimeAPI/html/dir_68267d1309a1af8e8297ef4c3efbcdba.html @@ -4,7 +4,7 @@ -HIP: Heterogenous-computing Interface for Portability: /home/fpadmin/ben/HIP6/src Directory Reference +HIP: Heterogenous-computing Interface for Portability: /home/bensander/HIP.public/src Directory Reference @@ -92,7 +92,7 @@ Files diff --git a/projects/hip/docs/RuntimeAPI/html/dir_6d8604cb65fa6b83549668eb0ce09cac.html b/projects/hip/docs/RuntimeAPI/html/dir_6d8604cb65fa6b83549668eb0ce09cac.html index 7d2d877c9b..9fb345b393 100644 --- 
a/projects/hip/docs/RuntimeAPI/html/dir_6d8604cb65fa6b83549668eb0ce09cac.html +++ b/projects/hip/docs/RuntimeAPI/html/dir_6d8604cb65fa6b83549668eb0ce09cac.html @@ -4,7 +4,7 @@ -HIP: Heterogenous-computing Interface for Portability: /home/fpadmin/ben/HIP6/include/hcc_detail Directory Reference +HIP: Heterogenous-computing Interface for Portability: /home/bensander/HIP.public/include/hcc_detail Directory Reference @@ -87,20 +87,25 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');

Files

file  hip_runtime.h [code] + Contains definitions of APIs for HIP runtime.
  -file  hip_runtime_api.h [code] +file  hip_runtime_api.h [code] + Contains C function APIs for HIP runtime. This file does not use any HCC builtin or special language extensions (-hc mode) ; those functions in hip_runtime.h.
  -file  hip_texture.h [code] +file  hip_texture.h [code] + HIP C++ Texture API for hcc compiler.
  -file  hip_vector_types.h [code] +file  hip_vector_types.h [code] + Defines the different newt vector types for HIP runtime.
  -file  host_defines.h [code] +file  host_defines.h [code] + TODO-doc.
  diff --git a/projects/hip/docs/RuntimeAPI/html/dir_d44c64559bbebec7f509842c48db8b23.html b/projects/hip/docs/RuntimeAPI/html/dir_d44c64559bbebec7f509842c48db8b23.html index 1de6328c9c..34ef4a1333 100644 --- a/projects/hip/docs/RuntimeAPI/html/dir_d44c64559bbebec7f509842c48db8b23.html +++ b/projects/hip/docs/RuntimeAPI/html/dir_d44c64559bbebec7f509842c48db8b23.html @@ -4,7 +4,7 @@ -HIP: Heterogenous-computing Interface for Portability: /home/fpadmin/ben/HIP6/include Directory Reference +HIP: Heterogenous-computing Interface for Portability: /home/bensander/HIP.public/include Directory Reference @@ -103,7 +103,7 @@ Files diff --git a/projects/hip/docs/RuntimeAPI/html/files.html b/projects/hip/docs/RuntimeAPI/html/files.html index 06f026ba5f..bbb7037a0e 100644 --- a/projects/hip/docs/RuntimeAPI/html/files.html +++ b/projects/hip/docs/RuntimeAPI/html/files.html @@ -89,11 +89,11 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
[detail level 123]
- - - - - + + + + + @@ -105,7 +105,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search'); diff --git a/projects/hip/docs/RuntimeAPI/html/functions.html b/projects/hip/docs/RuntimeAPI/html/functions.html index 5f2b3f24eb..ce1f2c7bf7 100644 --- a/projects/hip/docs/RuntimeAPI/html/functions.html +++ b/projects/hip/docs/RuntimeAPI/html/functions.html @@ -81,6 +81,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
  • l
  • m
  • n
  • +
  • p
  • r
  • s
  • t
  • @@ -125,6 +126,9 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
  • computeMode : hipDeviceProp_t
  • +
  • concurrentKernels +: hipDeviceProp_t +
  • @@ -197,6 +201,9 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
  • maxGridSize : hipDeviceProp_t
  • +
  • maxSharedMemoryPerMultiProcessor +: hipDeviceProp_t +
  • maxThreadsDim : hipDeviceProp_t
  • @@ -206,6 +213,9 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
  • maxThreadsPerMultiProcessor : hipDeviceProp_t
  • +
  • memoryClockRate +: hipDeviceProp_t +
  • minor : hipDeviceProp_t
  • @@ -222,6 +232,16 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search'); +

    - p -

    + +

    - r -

    • regsPerBlock : hipDeviceProp_t @@ -275,7 +295,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search'); diff --git a/projects/hip/docs/RuntimeAPI/html/functions_vars.html b/projects/hip/docs/RuntimeAPI/html/functions_vars.html index d3740b0b60..9bc08863c4 100644 --- a/projects/hip/docs/RuntimeAPI/html/functions_vars.html +++ b/projects/hip/docs/RuntimeAPI/html/functions_vars.html @@ -81,6 +81,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
    • l
    • m
    • n
    • +
    • p
    • r
    • s
    • t
    • @@ -125,6 +126,9 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
    • computeMode : hipDeviceProp_t
    • +
    • concurrentKernels +: hipDeviceProp_t +
    @@ -197,6 +201,9 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
  • maxGridSize : hipDeviceProp_t
  • +
  • maxSharedMemoryPerMultiProcessor +: hipDeviceProp_t +
  • maxThreadsDim : hipDeviceProp_t
  • @@ -206,6 +213,9 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
  • maxThreadsPerMultiProcessor : hipDeviceProp_t
  • +
  • memoryClockRate +: hipDeviceProp_t +
  • minor : hipDeviceProp_t
  • @@ -222,6 +232,16 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search'); +

    - p -

    + +

    - r -

    @@ -96,141 +102,298 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
    Here is a list of all documented file members with links to the documentation:
    +

    - _ -

    + + +

    - d -

    + +

    - h -

    + + +

    - o -

    diff --git a/projects/hip/docs/RuntimeAPI/html/globals_defs.html b/projects/hip/docs/RuntimeAPI/html/globals_defs.html index 35a768bc8c..1d2aada594 100644 --- a/projects/hip/docs/RuntimeAPI/html/globals_defs.html +++ b/projects/hip/docs/RuntimeAPI/html/globals_defs.html @@ -70,6 +70,9 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
  • All
  • Functions
  • Variables
  • +
  • Typedefs
  • +
  • Enumerations
  • +
  • Enumerator
  • Macros
  • @@ -90,14 +93,38 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
     
    diff --git a/projects/hip/docs/RuntimeAPI/html/globals_func.html b/projects/hip/docs/RuntimeAPI/html/globals_func.html index dff1f0b16e..17fe7c5f66 100644 --- a/projects/hip/docs/RuntimeAPI/html/globals_func.html +++ b/projects/hip/docs/RuntimeAPI/html/globals_func.html @@ -70,6 +70,9 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
  • All
  • Functions
  • Variables
  • +
  • Typedefs
  • +
  • Enumerations
  • +
  • Enumerator
  • Macros
  • @@ -98,124 +101,185 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');

    - h -

    diff --git a/projects/hip/docs/RuntimeAPI/html/globals_vars.html b/projects/hip/docs/RuntimeAPI/html/globals_vars.html index 8117f6041c..eb20781be3 100644 --- a/projects/hip/docs/RuntimeAPI/html/globals_vars.html +++ b/projects/hip/docs/RuntimeAPI/html/globals_vars.html @@ -70,6 +70,9 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
  • All
  • Functions
  • Variables
  • +
  • Typedefs
  • +
  • Enumerations
  • +
  • Enumerator
  • Macros
  • @@ -106,7 +109,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search'); diff --git a/projects/hip/docs/RuntimeAPI/html/group__API.html b/projects/hip/docs/RuntimeAPI/html/group__API.html index ad2ed4ea23..895c153259 100644 --- a/projects/hip/docs/RuntimeAPI/html/group__API.html +++ b/projects/hip/docs/RuntimeAPI/html/group__API.html @@ -110,7 +110,7 @@ Modules diff --git a/projects/hip/docs/RuntimeAPI/html/group__Device.html b/projects/hip/docs/RuntimeAPI/html/group__Device.html index c5b4996e69..5f42bdb840 100644 --- a/projects/hip/docs/RuntimeAPI/html/group__Device.html +++ b/projects/hip/docs/RuntimeAPI/html/group__Device.html @@ -99,6 +99,9 @@ Functions + + + @@ -121,6 +124,48 @@ Functions

    Detailed Description

    ----------------------------------------------------------------------------------------------—

    Function Documentation

    + +
    +
    +
    o-include
    |o-hcc_detail
    ||o*hip_runtime.h
    ||o*hip_runtime_api.h
    ||o*hip_texture.h
    ||o*hip_vector_types.h
    ||\*host_defines.h
    ||o*hip_runtime.hContains definitions of APIs for HIP runtime
    ||o*hip_runtime_api.hContains C function APIs for HIP runtime. This file does not use any HCC builtin or special language extensions (-hc mode); those functions are in hip_runtime.h
    ||o*hip_texture.hHIP C++ Texture API for hcc compiler
    ||o*hip_vector_types.hDefines the different vector types for HIP runtime
    ||\*host_defines.hTODO-doc
    |o*hip_common.h
    |o*hip_runtime.h
    |o*hip_runtime_api.h
    hipError_t hipGetDeviceCount (int *count)
     Return number of compute-capable devices. More...
     
    hipError_t hipDeviceGetAttribute (int *pi, hipDeviceAttribute_t attr, int device)
     Query device attribute. More...
     
    hipError_t hipDeviceGetProperties (hipDeviceProp_t *prop, int device)
     Returns device properties. More...
     
    + + + + + + + + + + + + + + + + + + + + + + + +
    hipError_t hipDeviceGetAttribute (int * pi,
    hipDeviceAttribute_t attr,
    int device 
    )
    +
    + +

    Query device attribute.

    +
    Parameters
    + + + + +
    [out]pipointer to value to return
    [in]attrattribute to query
    [in]devicewhich device to query for information
    +
    +
    + +
    +
    @@ -136,7 +181,7 @@ Functions

    Set Cache configuration for a specific function.

    -

    Note: AMD devices and recent NVIDIA GPUS do not support reconfigurable cache. This hint is ignored on those architectures.

    +

    Note: AMD devices and recent Nvidia GPUs do not support reconfigurable cache. This hint is ignored on those architectures.

    Returns
    hipSuccess
    @@ -197,7 +242,7 @@ Functions

    Get Shared memory bank configuration.

    -

    Note: AMD devices and recent NVIDIA GPUS do not support shared cache banking, and the hint is ignored on those architectures.

    +

    Note: AMD devices and recent Nvidia GPUs do not support shared cache banking, and the hint is ignored on those architectures.

    Returns
    hipSuccess
    @@ -240,7 +285,7 @@ Functions

    Set L1/Shared cache partition.

    -

    Note: AMD devices and recent NVIDIA GPUS do not support reconfigurable cache. This hint is ignored on those architectures.

    +

    Note: AMD devices and recent Nvidia GPUs do not support reconfigurable cache. This hint is ignored on those architectures.

    Returns
    hipSuccess
    @@ -260,7 +305,7 @@ Functions

    Set Shared memory bank configuration.

    -

    Note: AMD devices and recent NVIDIA GPUS do not support shared cache banking, and the hint is ignored on those architectures.

    +

    Note: AMD devices and recent Nvidia GPUs do not support shared cache banking, and the hint is ignored on those architectures.

    Returns
    hipSuccess
    @@ -301,7 +346,7 @@ Functions

    Set Cache configuration for a specific function.

    -

    Note: AMD devices and recent NVIDIA GPUS do not support reconfigurable cache. This hint is ignored on those architectures.

    +

    Note: AMD devices and recent Nvidia GPUs do not support reconfigurable cache. This hint is ignored on those architectures.

    Returns
    hipSuccess
    @@ -354,8 +399,8 @@ Functions -

    Returns in *count the number of devices that have ability to run compute commands. If there are no such devices, then hipGetDeviceCount will return hipErrorNoDevice. If 1 or more devices can be found, then hipGetDeviceCount returns hipSuccess.

    -
    Returns
    hipSuccess, hipErrorNoDevice
    +

    Returns in *count the number of devices that have the ability to run compute commands. If there are no such devices, then hipGetDeviceCount will return hipErrorNoDevice. If 1 or more devices can be found, then hipGetDeviceCount returns hipSuccess.

    +
    Returns
    hipSuccess, hipErrorNoDevice
    @@ -397,7 +442,7 @@ Functions diff --git a/projects/hip/docs/RuntimeAPI/html/group__Error.html b/projects/hip/docs/RuntimeAPI/html/group__Error.html index cea57daed9..f6aeb04eb5 100644 --- a/projects/hip/docs/RuntimeAPI/html/group__Error.html +++ b/projects/hip/docs/RuntimeAPI/html/group__Error.html @@ -197,7 +197,7 @@ Functions diff --git a/projects/hip/docs/RuntimeAPI/html/group__Event.html b/projects/hip/docs/RuntimeAPI/html/group__Event.html index 07c053b451..3b54300dd3 100644 --- a/projects/hip/docs/RuntimeAPI/html/group__Event.html +++ b/projects/hip/docs/RuntimeAPI/html/group__Event.html @@ -209,11 +209,11 @@ Functions -
    Returns
    : hipSuccess, hipErrorInvalidResourceHandle, hipErrorNotReady,
    +
    Returns
    hipSuccess, hipErrorInvalidResourceHandle, hipErrorNotReady

    Computes the elapsed time between two events. Time is computed in ms, with a resolution of approximately 1 us.

    Events which are recorded in a NULL stream will block until all commands on all other streams complete execution, and then record the timestamp.

    Events which are recorded in a non-NULL stream will record their timestamp when they reach the head of the specified stream, after all previous commands in that stream have completed executing. Thus the time that the event recorded may be significantly after the host calls hipEventRecord.

    -

    If hipEventRecord has not been called on either event, then hipErrorInvalidResourceHandle is returned. If hipEventRecord has been called on both events, but the timestamp has not yet been recorded on one or both events (that is, hipEventQuery would return hipErrorNotReady on at least one of the events), then hipErrorNotReady is returned.

    +

    If hipEventRecord has not been called on either event, then hipErrorInvalidResourceHandle is returned. If hipEventRecord has been called on both events, but the timestamp has not yet been recorded on one or both events (that is, hipEventQuery would return hipErrorNotReady on at least one of the events), then hipErrorNotReady is returned.

    @@ -239,7 +239,7 @@ Functions
    Returns
    hipSuccess, hipErrorNotReady
    -

    Query the status of the specified event. This function will return hipErrorNotReady if all commands in the appropriate stream (specified to hipEventRecord) have completed. If that work has not completed, or if hipEventRecord was not called on the event, then cudaSuccess is returned.

    +

    Query the status of the specified event. This function will return hipSuccess if all commands in the appropriate stream (specified to hipEventRecord) have completed, or if hipEventRecord was not called on the event. If that work has not completed, then hipErrorNotReady is returned.

    @@ -315,7 +315,7 @@ Functions diff --git a/projects/hip/docs/RuntimeAPI/html/group__GlobalDefs.html b/projects/hip/docs/RuntimeAPI/html/group__GlobalDefs.html index 7ec1058aed..53d57a58c1 100644 --- a/projects/hip/docs/RuntimeAPI/html/group__GlobalDefs.html +++ b/projects/hip/docs/RuntimeAPI/html/group__GlobalDefs.html @@ -119,6 +119,9 @@ Typedefs typedef enum hipError_t hipError_t   + +typedef enum hipDeviceAttribute_t hipDeviceAttribute_t +  typedef enum hipFuncCache hipFuncCache   typedef enum hipSharedMemConfig hipSharedMemConfig @@ -146,6 +149,38 @@ Enumerations
    }   +enum  hipDeviceAttribute_t {
    +  hipDeviceAttributeMaxThreadsPerBlock, +hipDeviceAttributeMaxBlockDimX, +hipDeviceAttributeMaxBlockDimY, +hipDeviceAttributeMaxBlockDimZ, +
    +  hipDeviceAttributeMaxGridDimX, +hipDeviceAttributeMaxGridDimY, +hipDeviceAttributeMaxGridDimZ, +hipDeviceAttributeMaxSharedMemoryPerBlock, +
    +  hipDeviceAttributeTotalConstantMemory, +hipDeviceAttributeWarpSize, +hipDeviceAttributeMaxRegistersPerBlock, +hipDeviceAttributeClockRate, +
    +  hipDeviceAttributeMemoryClockRate, +hipDeviceAttributeMultiprocessorCount, +hipDeviceAttributeComputeMode, +hipDeviceAttributeL2CacheSize, +
    +  hipDeviceAttributeMaxThreadsPerMultiProcessor, +hipDeviceAttributeComputeCapabilityMajor, +hipDeviceAttributeComputeCapabilityMinor, +hipDeviceAttributeConcurrentKernels, +
    +  hipDeviceAttributePciBusId, +hipDeviceAttributePciDeviceId, +hipDeviceAttributeMaxSharedMemoryPerMultiprocessor +
    + } +  enum  hipFuncCache { hipFuncCachePreferNone, hipFuncCachePreferShared, hipFuncCachePreferL1, @@ -238,7 +273,7 @@ Enumerations
    -
    Warning
    On AMD devices and recent NVIDIA devices, these hints and controls are ignored.
    +
    Warning
    On AMD devices and recent Nvidia devices, these hints and controls are ignored.
    @@ -251,11 +286,94 @@ Enumerations
    -
    Warning
    On AMD devices and recent NVIDIA devices, these hints and controls are ignored.
    +
    Warning
    On AMD devices and recent Nvidia devices, these hints and controls are ignored.

    Enumeration Type Documentation

    + +
    +
    + + + + +
    enum hipDeviceAttribute_t
    +
    + + + + + + + + + + + + + + + + + + + + + + + + +
    Enumerator
    hipDeviceAttributeMaxThreadsPerBlock  +

    Maximum number of threads per block.

    +
    hipDeviceAttributeMaxBlockDimX  +

    Maximum x-dimension of a block.

    +
    hipDeviceAttributeMaxBlockDimY  +

    Maximum y-dimension of a block.

    +
    hipDeviceAttributeMaxBlockDimZ  +

    Maximum z-dimension of a block.

    +
    hipDeviceAttributeMaxGridDimX  +

    Maximum x-dimension of a grid.

    +
    hipDeviceAttributeMaxGridDimY  +

    Maximum y-dimension of a grid.

    +
    hipDeviceAttributeMaxGridDimZ  +

    Maximum z-dimension of a grid.

    +
    hipDeviceAttributeMaxSharedMemoryPerBlock  +

    Maximum shared memory available per block in bytes.

    +
    hipDeviceAttributeTotalConstantMemory  +

    Constant memory size in bytes.

    +
    hipDeviceAttributeWarpSize  +

    Warp size in threads.

    +
    hipDeviceAttributeMaxRegistersPerBlock  +

    Maximum number of 32-bit registers available to a thread block. This number is shared by all thread blocks simultaneously resident on a multiprocessor.

    +
    hipDeviceAttributeClockRate  +

    Peak clock frequency in kilohertz.

    +
    hipDeviceAttributeMemoryClockRate  +

    Peak memory clock frequency in kilohertz.

    +
    hipDeviceAttributeMultiprocessorCount  +

    Number of multiprocessors on the device.

    +
    hipDeviceAttributeComputeMode  +

    Compute mode that device is currently in.

    +
    hipDeviceAttributeL2CacheSize  +

    Size of L2 cache in bytes. 0 if the device doesn't have L2 cache.

    +
    hipDeviceAttributeMaxThreadsPerMultiProcessor  +

    Maximum resident threads per multiprocessor.

    +
    hipDeviceAttributeComputeCapabilityMajor  +

    Major compute capability version number.

    +
    hipDeviceAttributeComputeCapabilityMinor  +

    Minor compute capability version number.

    +
    hipDeviceAttributeConcurrentKernels  +

    Device can possibly execute multiple kernels concurrently.

    +
    hipDeviceAttributePciBusId  +

    PCI Bus ID.

    +
    hipDeviceAttributePciDeviceId  +

    PCI Device ID.

    +
    hipDeviceAttributeMaxSharedMemoryPerMultiprocessor  +

    Maximum Shared Memory Per Multiprocessor.

    +
    + +
    +
    @@ -282,7 +400,7 @@ Enumerations

    Out of resources error.

    hipErrorInvalidValue  -

    One or more of the paramters passed to the API call is NULL or not in an acceptable range.

    +

    One or more of the parameters passed to the API call is NULL or not in an acceptable range.

    hipErrorInvalidResourceHandle 

    Resource handle (hipEvent_t or hipStream_t) invalid.

    @@ -291,10 +409,10 @@ Enumerations

    DeviceID must be in range 0...#compute-devices.

    hipErrorNoDevice  -

    Call to cudaGetDeviceCount returned 0 devices.

    +

    Call to hipGetDeviceCount returned 0 devices.

    hipErrorNotReady  -

    indicates that asynchronous operations enqueued earlier are not ready. This is not actually an error, but is used to distinguish from hipSuccess (which indicates completion). APIs that return this error include hipEventQuery and hipStreamQuery.

    +

    Indicates that asynchronous operations enqueued earlier are not ready. This is not actually an error, but is used to distinguish from hipSuccess (which indicates completion). APIs that return this error include hipEventQuery and hipStreamQuery.

    hipErrorUnknown 

    Unknown error.

    @@ -315,7 +433,7 @@ Enumerations
    -
    Warning
    On AMD devices and recent NVIDIA devices, these hints and controls are ignored.
    +
    Warning
    On AMD devices and recent Nvidia devices, these hints and controls are ignored.
    Enumerator
    hipFuncCachePreferNone 

    no preference for shared memory or L1 (default)

    @@ -372,7 +490,7 @@ Enumerations
    -
    Warning
    On AMD devices and recent NVIDIA devices, these hints and controls are ignored.
    +
    Warning
    On AMD devices and recent Nvidia devices, these hints and controls are ignored.
    diff --git a/projects/hip/docs/RuntimeAPI/html/group__HCC__Specific.html b/projects/hip/docs/RuntimeAPI/html/group__HCC__Specific.html index dda77995f2..10eb0d5298 100644 --- a/projects/hip/docs/RuntimeAPI/html/group__HCC__Specific.html +++ b/projects/hip/docs/RuntimeAPI/html/group__HCC__Specific.html @@ -85,10 +85,10 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search'); - + - +
    Enumerator
    hipSharedMemBankSizeDefault 

    The compiler selects a device-specific value for the banking.

    @@ -390,7 +508,7 @@ Enumerations

    Functions

    hipError_t hipHccGetAccelerator (int deviceId, hc::accelerator *acc)
     Return hc::acclerator associated with the specified deviceId. More...
     Return hc::accelerator associated with the specified deviceId. More...
     
    hipError_t hipHccGetAcceleratorView (hipStream_t stream, hc::accelerator_view **av)
     Return hc::acclerator_view associated with the specified stream. More...
     Return hc::accelerator_view associated with the specified stream. More...
     

    Detailed Description

    @@ -123,7 +123,7 @@ Functions
    -

    Return hc::acclerator associated with the specified deviceId.

    +

    Return hc::accelerator associated with the specified deviceId.

    Returns
    hipSuccess, hipErrorInvalidDevice
    @@ -152,7 +152,7 @@ Functions
    -

    Return hc::acclerator_view associated with the specified stream.

    +

    Return hc::accelerator_view associated with the specified stream.

    Returns
    hipSuccess
    @@ -160,7 +160,7 @@ Functions diff --git a/projects/hip/docs/RuntimeAPI/html/group__HIP-ENV.html b/projects/hip/docs/RuntimeAPI/html/group__HIP-ENV.html index 49583a5127..b4f0537b8b 100644 --- a/projects/hip/docs/RuntimeAPI/html/group__HIP-ENV.html +++ b/projects/hip/docs/RuntimeAPI/html/group__HIP-ENV.html @@ -101,7 +101,7 @@ int  diff --git a/projects/hip/docs/RuntimeAPI/html/group__Memory.html b/projects/hip/docs/RuntimeAPI/html/group__Memory.html index f2d06c6a49..362fb6a8d8 100644 --- a/projects/hip/docs/RuntimeAPI/html/group__Memory.html +++ b/projects/hip/docs/RuntimeAPI/html/group__Memory.html @@ -85,25 +85,31 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');

    Functions

    hipError_t hipMalloc (void **ptr, size_t size) + Allocate memory on the default accelerator. More...
      hipError_t hipMallocHost (void **ptr, size_t size) + Allocate pinned host memory. More...
      hipError_t hipFree (void *ptr) + Free memory allocated by the hcc hip memory allocation API. This API performs an implicit hipDeviceSynchronize() call. More...
      hipError_t hipFreeHost (void *ptr) + Free memory allocated by the hcc hip host memory allocation API. More...
      hipError_t hipMemcpy (void *dst, const void *src, size_t sizeBytes, hipMemcpyKind kind) + Copy data from src to dst. More...
      - -hipError_t hipMemcpyToSymbol (const char *symbolName, const void *src, size_t sizeBytes, size_t offset, hipMemcpyKind kind) +hipError_t hipMemcpyToSymbol (const char *symbolName, const void *src, size_t sizeBytes, size_t offset, hipMemcpyKind kind) + Copies sizeBytes bytes from the memory area pointed to by src to the memory area pointed to by offset bytes from the start of symbol symbol. More...
      hipError_t hipMemcpyAsync (void *dst, const void *src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream=0) + Copy data from src to dst asynchronously. More...
      - -hipError_t hipMemset (void *dst, int value, size_t sizeBytes) +hipError_t hipMemset (void *dst, int value, size_t sizeBytes) + Copy data from src to dst asynchronously. More...
      - -hipError_t hipMemsetAsync (void *dst, int value, size_t sizeBytes, hipStream_t=0) +hipError_t hipMemsetAsync (void *dst, int value, size_t sizeBytes, hipStream_t=0) + Fills the first sizeBytes bytes of the memory area pointed to by dev with the constant byte value value. More...
      hipError_t hipMemGetInfo (size_t *free, size_t *total) @@ -131,14 +137,15 @@ Functions
    -

    Free memory allocated by the hcc hip memory allocation API. This API performs an implicit hipDeviceSynchronize() call.

    + +

    Free memory allocated by the hcc hip memory allocation API. This API performs an implicit hipDeviceSynchronize() call.

    Parameters
    [in]ptrPointer to memory to be freed
    -
    Returns
    Error code
    +
    Returns
    hipSuccess, hipErrorMemoryFree
    @@ -155,14 +162,15 @@ Functions
    -

    Free memory allocated by the hcc hip host memory allocation API

    + +

    Free memory allocated by the hcc hip host memory allocation API.

    Parameters
    [in]ptrPointer to memory to be freed
    -
    Returns
    Error code
    +
    Returns
    hipSuccess, hipErrorMemoryFree
    @@ -189,7 +197,8 @@ Functions
    -

    Allocate memory on the default accelerator

    + +

    Allocate memory on the default accelerator.

    Parameters
    @@ -224,11 +233,12 @@ Functions
    [out]ptrPointer to the allocated memory
    -

    Allocate pinned host memory

    + +

    Allocate pinned host memory.

    Parameters
    - - + +
    [in]ptrPointer to the allocated host pinned memory
    [out]sizeRequested memory size
    [out]ptrPointer to the allocated host pinned memory
    [in]sizeRequested memory size
    @@ -271,18 +281,19 @@ Functions
    -

    Copy data from src to dst. It supports memory from host to device, device to host, device to device and host to host The src and dst must not overlap. If the

    -

    This function is host-synchronous for most inputs. It uses the default NULL stream and will synchronize with other blocking streams on the same device.

    + +

    Copy data from src to dst.

    +

    It supports memory from host to device, device to host, device to device and host to host. The src and dst must not overlap. TODO: the cudaErrorInvalidMemcpyDirection error code is not supported right now; use hipErrorUnknown for now.

    Parameters
    - +
    being copy to
    [out]dstData being copy to
    [in]srcData being copy from
    [in]sizeBytesData size in bytes
    [in]copyTypeMemory copy type
    -
    Returns
    Error code
    +
    Returns
    hipSuccess, hipErrorInvalidValue, hipErrorMemoryFree, hipErrorUnknown
    @@ -327,7 +338,9 @@ Functions
    -

    Copy data from src to dst asynchronously. It supports memory from host to device, device to host, device to device and host to host.

    + +

    Copy data from src to dst asynchronously.

    +

    TODO: cudaErrorInvalidMemcpyDirection error code is not supported right now, use hipErrorUnknown for now

    Parameters
    @@ -337,14 +350,168 @@ Functions
    [out]dstData being copy to
    -
    Returns
    Error code
    +
    Returns
    hipSuccess, hipErrorInvalidValue, hipErrorMemoryFree, hipErrorUnknown
    + +
    + + +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    hipError_t hipMemcpyToSymbol (const char * symbolName,
    const void * src,
    size_t sizeBytes,
    size_t offset,
    hipMemcpyKind kind 
    )
    +
    + +

    Copies sizeBytes bytes from the memory area pointed to by src to the memory area pointed to by offset bytes from the start of symbol symbol.

    +

    The memory areas may not overlap. Symbol can either be a variable that resides in global or constant memory space, or it can be a character string, naming a variable that resides in global or constant memory space. Kind can be either hipMemcpyHostToDevice or hipMemcpyDeviceToDevice. TODO: cudaErrorInvalidSymbol and cudaErrorInvalidMemcpyDirection are not supported; use hipErrorUnknown for now.

    +
    Parameters
    + + + + + + +
    [in]symbolName- Symbol destination on device
    [in]src- Data being copy from
    [in]sizeBytes- Data size in bytes
    [in]offset- Offset from start of symbol in bytes
    [in]kind- Type of transfer
    +
    +
    +
    Returns
    hipSuccess, hipErrorInvalidValue, hipErrorMemoryFree, hipErrorUnknown
    + +
    +
    + +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + +
    hipError_t hipMemset (void * dst,
    int value,
    size_t sizeBytes 
    )
    +
    + +

    Fills the first sizeBytes bytes of the memory area pointed to by dst with the constant byte value value.

    +

    It supports memory from host to device, device to host, device to device and host to host.

    +
    Parameters
    + + + + + +
    [out]dstData being copy to
    [in]srcData being copy from
    [in]sizeBytesData size in bytes
    [in]accelerator_viewAccelerator view which the copy is being enqueued
    +
    +
    +
    Returns
    hipSuccess, hipErrorInvalidValue, hipErrorMemoryFree
    + +
    +
    + +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    hipError_t hipMemsetAsync (void * dst,
    int value,
    size_t sizeBytes,
    hipStream_t  = 0 
    )
    +
    + +

    Fills the first sizeBytes bytes of the memory area pointed to by dst with the constant byte value value.

    +

    hipMemsetAsync() is asynchronous with respect to the host, so the call may return before the memset is complete. The operation can optionally be associated with a stream by passing a non-zero stream argument. If stream is non-zero, the operation may overlap with operations in other streams.

    +
    Parameters
    + + + + + +
    [out]dstPointer to device memory
    [in]value- Value to set for each byte of specified memory
    [in]sizeBytes- Size in bytes to set
    [in]stream- Stream identifier
    +
    +
    +
    Returns
    hipSuccess, hipErrorInvalidValue, hipErrorMemoryFree
    diff --git a/projects/hip/docs/RuntimeAPI/html/group__PeerToPeer.html b/projects/hip/docs/RuntimeAPI/html/group__PeerToPeer.html index 9b2174c2bd..c9417b25c0 100644 --- a/projects/hip/docs/RuntimeAPI/html/group__PeerToPeer.html +++ b/projects/hip/docs/RuntimeAPI/html/group__PeerToPeer.html @@ -88,13 +88,16 @@ Functions  Determine if a device can access a peer's memory. More...
      hipError_t hipDeviceDisablePeerAccess (int peerDevice) + Disables registering memory on peerDevice for direct access from the current device. More...
      hipError_t hipDeviceEnablePeerAccess (int peerDevice, unsigned int flags) + Enables registering memory on peerDevice for direct access from the current device. More...
      - -hipError_t hipMemcpyPeer (void *dst, int dstDevice, const void *src, int srcDevice, size_t sizeBytes) +hipError_t hipMemcpyPeer (void *dst, int dstDevice, const void *src, int srcDevice, size_t sizeBytes) + Copies memory from one device to memory on another device. More...
      hipError_t hipMemcpyPeerAsync (void *dst, int dstDevice, const void *src, int srcDevice, size_t sizeBytes, hipStream_t stream=0) + Copies memory from one device to memory on another device. More...
     

    Detailed Description

    @@ -135,7 +138,7 @@ Functions - +
    [out]canAccessPeerreturns true if specified devices are peers.
    [in]device
    [in]peerDeviceReturns "1" in canAccessPeer if the specified device is capable of directly accessing memory phyically located on peerDevice , or "0" if not.
    [in]peerDeviceReturns "1" in canAccessPeer if the specified device is capable of directly accessing memory physically located on peerDevice , or "0" if not.
    @@ -156,6 +159,15 @@ Functions
    + +

    Disables registering memory on peerDevice for direct access from the current device.

    +

    If there are any allocations on peerDevice which were registered in the current device using hipPeerRegister() then these allocations will be automatically unregistered. Returns hipErrorPeerAccessNotEnabled if direct access to memory on peerDevice has not yet been enabled from the current device.

    +
    Parameters
    + + +
    [in]peerDevice TODO: cudaErrorPeerAccessNotEnabled and cudaErrorInvalidDevice errors are not supported in HIP; hipErrorUnknown is returned instead. Returns hipSuccess, hipErrorUnknown
    +
    +
    Warning
    Need to update this function when RT supports P2P
    @@ -183,8 +195,74 @@ Functions
    + +

    Enables registering memory on peerDevice for direct access from the current device.

    +
    Parameters
    + + + +
    [in]peerDevice
    [in]flags TODO: the cudaErrorInvalidDevice error is not supported in HIP; hipErrorUnknown is returned instead. Returns hipSuccess, hipErrorInvalidDevice, hipErrorInvalidValue, hipErrorUnknown
    +
    +
    Warning
    Need to update this function when RT supports P2P
    +
    + + +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    hipError_t hipMemcpyPeer (void * dst,
    int dstDevice,
    const void * src,
    int srcDevice,
    size_t sizeBytes 
    )
    +
    + +

    Copies memory from one device to memory on another device.

    +
    Parameters
    + + + + + + +
    [out]dst- Destination device pointer.
    [in]dstDevice- Destination device
    [in]src- Source device pointer
    [in]srcDevice- Source device
    [in]sizeBytes- Size of memory copy in bytes
    +
    +
    +

    Returns hipSuccess, hipErrorInvalidValue, hipErrorInvalidDevice

    +
    @@ -234,6 +312,20 @@ Functions
    + +

    Copies memory from one device to memory on another device.

    +
    Parameters
    + + + + + + + +
    [out]dst- Destination device pointer.
    [in]dstDevice- Destination device
    [in]src- Source device pointer
    [in]srcDevice- Source device
    [in]sizeBytes- Size of memory copy in bytes
    [in]stream- Stream identifier
    +
    +
    +

    Returns hipSuccess, hipErrorInvalidValue, hipErrorInvalidDevice

    Bug:
    This function uses a synchronous copy
    @@ -241,7 +333,7 @@ Functions diff --git a/projects/hip/docs/RuntimeAPI/html/group__Profiler.html b/projects/hip/docs/RuntimeAPI/html/group__Profiler.html index 354d17569d..30e28d3ac5 100644 --- a/projects/hip/docs/RuntimeAPI/html/group__Profiler.html +++ b/projects/hip/docs/RuntimeAPI/html/group__Profiler.html @@ -80,12 +80,12 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');

    ----------------------------------------------------------------------------------------------—

    -

    The cudaProfilerInitialize API format for "configFile" is not supported.

    +
    Warning
    The cudaProfilerInitialize API format for "configFile" is not supported.

    On AMD platforms, hipProfilerStart and hipProfilerStop require installation of AMD's GPU perf counter API and defining GPU_PERF

    diff --git a/projects/hip/docs/RuntimeAPI/html/group__Stream.html b/projects/hip/docs/RuntimeAPI/html/group__Stream.html index 6f2efc9edc..7f54af7b00 100644 --- a/projects/hip/docs/RuntimeAPI/html/group__Stream.html +++ b/projects/hip/docs/RuntimeAPI/html/group__Stream.html @@ -140,7 +140,7 @@ Functions -
    Returns
    hipSuccess, hipErrorInvalidValue
    +
    Returns
    hipSuccess, hipErrorInvalidValue

    Create a new asynchronous stream. Flags controls behavior of the stream. See hipStreamDefault, hipStreamNonBlocking. hipStream_t are under development - with current HIP use the NULL stream.

    @@ -206,7 +206,7 @@ Functions -
    Returns
    hipSuccess, hipErrorInvalidValue, hipErrorInvalidResourceHandle
    +
    Returns
    hipSuccess, hipErrorInvalidValue, hipErrorInvalidResourceHandle

    Return flags associated with this stream in *flags.

    See Also
    hipStreamCreateWithFlags
    Returns
    hipSuccess
    @@ -283,7 +283,7 @@ Functions diff --git a/projects/hip/docs/RuntimeAPI/html/group__Texture.html b/projects/hip/docs/RuntimeAPI/html/group__Texture.html index a19d108f0a..97808148e7 100644 --- a/projects/hip/docs/RuntimeAPI/html/group__Texture.html +++ b/projects/hip/docs/RuntimeAPI/html/group__Texture.html @@ -121,7 +121,7 @@ template<class T , int dim, enum hipTextureReadMode readMode> diff --git a/projects/hip/docs/RuntimeAPI/html/group__Version.html b/projects/hip/docs/RuntimeAPI/html/group__Version.html index f5cb9f0063..21f2a17ee4 100644 --- a/projects/hip/docs/RuntimeAPI/html/group__Version.html +++ b/projects/hip/docs/RuntimeAPI/html/group__Version.html @@ -85,7 +85,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');

    Functions

    hipError_t hipDriverGetVersion (int *driverVersion) - Returns the approximate HIP driver versin. More...
    + Returns the approximate HIP driver version. More...
     

    Detailed Description

    @@ -105,8 +105,8 @@ Functions
    -

    Returns the approximate HIP driver versin.

    -
    Warning
    The HIP feature set does not correpond to an exact CUDA SDK driver revision. This function always set *driverVersion to 4 as an approximation though HIP supports some features which were introduced in later CUDA SDK revisions. HIP apps code should not rely on the driver revision number here and should use arch feature flags to test device capabiliies or conditional compilation.
    +

    Returns the approximate HIP driver version.

    +
    Warning
    The HIP feature set does not correspond to an exact CUDA SDK driver revision. This function always sets *driverVersion to 4 as an approximation, though HIP supports some features which were introduced in later CUDA SDK revisions. HIP application code should not rely on the driver revision number here and should use arch feature flags to test device capabilities or conditional compilation.
    Returns
    hipSuccess
    @@ -114,7 +114,7 @@ Functions diff --git a/projects/hip/docs/RuntimeAPI/html/hcc__detail_2hip__runtime_8h.html b/projects/hip/docs/RuntimeAPI/html/hcc__detail_2hip__runtime_8h.html index 2b02e80742..984fe07ceb 100644 --- a/projects/hip/docs/RuntimeAPI/html/hcc__detail_2hip__runtime_8h.html +++ b/projects/hip/docs/RuntimeAPI/html/hcc__detail_2hip__runtime_8h.html @@ -4,7 +4,7 @@ -HIP: Heterogenous-computing Interface for Portability: /home/fpadmin/ben/HIP6/include/hcc_detail/hip_runtime.h File Reference +HIP: Heterogenous-computing Interface for Portability: /home/bensander/HIP.public/include/hcc_detail/hip_runtime.h File Reference @@ -93,15 +93,19 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
    hip_runtime.h File Reference
    + +

    Contains definitions of APIs for HIP runtime. +More...

    #include <cstring>
    #include <cmath>
    #include <string.h>
    #include <stddef.h>
    -#include <hip_runtime_api.h>
    +#include <hip_runtime_api.h>
    #include <hc.hpp>
    #include <grid_launch.h>
    -#include <hcc_detail/hip_texture.h>
    -#include <hcc_detail/host_defines.h>
    +#include <hcc_detail/hip_texture.h>
    +#include <hcc_detail/host_defines.h>
    +#include <hc_math.hpp>

    Go to the source code of this file.

    @@ -113,9 +117,6 @@ Macros - - @@ -224,12 +225,6 @@ __device__ unsigned int  - - - - @@ -290,6 +285,12 @@ __device__ unsigned int  + + + + @@ -305,9 +306,95 @@ __device__ int  + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    #define hipLaunchParm   grid_launch_parm
     
    -#define __HIP_DEVICE_COMPILE__   0
     
    #define __launch_bounds__(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor)
     
    at
    __device__ unsigned long long int atomicMax (unsigned long long int *address, unsigned long long int val)
     
    -__device__ unsigned int atomicInc (unsigned int *address)
     
    -__device__ unsigned int atomicDec (unsigned int *address)
     
    __device__ int atomicCAS (int *address, int compare, int val)
     
    __
    __device__ unsigned int __ffsll (unsigned long long int input)
     
    +__device__ unsigned int __ffs (int input)
     
    +__device__ unsigned int __ffsll (long long int input)
     
    __device__ unsigned int __brev (unsigned int input)
     
    __any (
    __device__ unsigned long long int __ballot (int input)
     
    +__device__ int __shfl (int input, int lane, int width)
     
    +__device__ int __shfl_up (int input, unsigned int lane_delta, int width)
     
    +__device__ int __shfl_down (int input, unsigned int lane_delta, int width)
     
    +__device__ int __shfl_xor (int input, int lane_mask, int width)
     
    +__device__ float __shfl (float input, int lane, int width)
     
    +__device__ float __shfl_up (float input, unsigned int lane_delta, int width)
     
    +__device__ float __shfl_down (float input, unsigned int lane_delta, int width)
     
    +__device__ float __shfl_xor (float input, int lane_mask, int width)
     
    +int min (int arg1, int arg2) __attribute((hc
     
    +int max (int arg1, int arg2) __attribute((hc
     
    +__device__ float __cosf (float x)
     
    +__device__ float __expf (float x)
     
    +__device__ float __frsqrt_rn (float x)
     
    +__device__ float __fsqrt_rd (float x)
     
    +__device__ float __fsqrt_rn (float x)
     
    +__device__ float __fsqrt_ru (float x)
     
    +__device__ float __fsqrt_rz (float x)
     
    +__device__ float __log10f (float x)
     
    +__device__ float __log2f (float x)
     
    +__device__ float __logf (float x)
     
    +__device__ float __powf (float base, float exponent)
     
    +__device__ void __sincosf (float x, float *s, float *c)
     
    +__device__ float __sinf (float x)
     
    +__device__ float __tanf (float x)
     
    +__device__ float __dsqrt_rd (double x)
     
    +__device__ float __dsqrt_rn (double x)
     
    +__device__ float __dsqrt_ru (double x)
     
    +__device__ float __dsqrt_rz (double x)
     
    + + @@ -324,7 +411,9 @@ int 

    Variables

    int cpu
     
    int warpSize
     
     Make all HIP APIs host-synchronous.
     
    -

    Macro Definition Documentation

    +

    Detailed Description

    +

    Contains definitions of APIs for HIP runtime.

    +

    Macro Definition Documentation

    @@ -384,10 +473,25 @@ int Variable Documentation + +
    +
    + + + + +
    int cpu
    +
    +Initial value:
    {
    +
    return (int)(hc::precise_math::fmin((float)arg1, (float)arg2))
    +
    +
    +
    diff --git a/projects/hip/docs/RuntimeAPI/html/hcc__detail_2hip__runtime_8h_source.html b/projects/hip/docs/RuntimeAPI/html/hcc__detail_2hip__runtime_8h_source.html index 667d8a38e2..a6c1b5210d 100644 --- a/projects/hip/docs/RuntimeAPI/html/hcc__detail_2hip__runtime_8h_source.html +++ b/projects/hip/docs/RuntimeAPI/html/hcc__detail_2hip__runtime_8h_source.html @@ -4,7 +4,7 @@ -HIP: Heterogenous-computing Interface for Portability: /home/fpadmin/ben/HIP6/include/hcc_detail/hip_runtime.h Source File +HIP: Heterogenous-computing Interface for Portability: /home/bensander/HIP.public/include/hcc_detail/hip_runtime.h Source File @@ -124,7 +124,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
    38 
    39 #define CUDA_SUCCESS hipSuccess
    40 
    -
    41 #include <hip_runtime_api.h>
    +
    41 #include <hip_runtime_api.h>
    42 
    43 //---
    44 // Remainder of this file only compiles with HCC
    @@ -136,8 +136,8 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
    50 //typedef grid_launch_parm hipLaunchParm ;
    51 #define hipLaunchParm grid_launch_parm
    52 
    -
    53 #include <hcc_detail/hip_texture.h>
    -
    54 #include <hcc_detail/host_defines.h>
    +
    53 #include <hcc_detail/hip_texture.h>
    +
    55 
    56 // TODO-HCC remove old definitions ; ~1602 hcc supports __HCC_ACCELERATOR__ define.
    57 #if defined (__KALMAR_ACCELERATOR__) && not defined (__HCC_ACCELERATOR__)
    @@ -148,488 +148,530 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
    62 #if defined(__HCC_ACCELERATOR__) and (__HCC_ACCELERATOR__ != 0)
    63 // Device compile and not host compile:
    64 
    -
    65 
    -
    66 #define __HIP_DEVICE_COMPILE__ 1
    -
    67 
    -
    68 //TODO-HCC enable __HIP_ARCH_HAS_ATOMICS__ when HCC supports these.
    -
    69  // 32-bit Atomics:
    -
    70 #define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1)
    -
    71 #define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1)
    -
    72 #define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (0)
    -
    73 #define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (0)
    -
    74 #define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (0)
    -
    75 
    -
    76 // 64-bit Atomics:
    -
    77 #define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (1)
    -
    78 #define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (0)
    +
    65 //TODO-HCC enable __HIP_ARCH_HAS_ATOMICS__ when HCC supports these.
    +
    66  // 32-bit Atomics:
    +
    67 #define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1)
    +
    68 #define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1)
    +
    69 #define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (0)
    +
    70 #define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (0)
    +
    71 #define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (0)
    +
    72 
    +
    73 // 64-bit Atomics:
    +
    74 #define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (1)
    +
    75 #define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (0)
    +
    76 
    +
    77 // Doubles
    +
    78 #define __HIP_ARCH_HAS_DOUBLES__ (1)
    79 
    -
    80 // Doubles
    -
    81 #define __HIP_ARCH_HAS_DOUBLES__ (1)
    -
    82 
    -
    83 //warp cross-lane operations:
    -
    84 #define __HIP_ARCH_HAS_WARP_VOTE__ (1)
    -
    85 #define __HIP_ARCH_HAS_WARP_BALLOT__ (1)
    -
    86 #define __HIP_ARCH_HAS_WARP_SHUFFLE__ (1)
    -
    87 #define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)
    -
    88 
    -
    89 //sync
    -
    90 #define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (0)
    -
    91 #define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)
    -
    92 
    -
    93 // misc
    -
    94 #define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
    -
    95 #define __HIP_ARCH_HAS_3DGRID__ (1)
    -
    96 #define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)
    -
    97 
    -
    98 #else
    -
    99 // Host compile and not device compile:
    -
    100 #define __HIP_DEVICE_COMPILE__ 0
    -
    101 
    -
    102 #endif
    +
    80 //warp cross-lane operations:
    +
    81 #define __HIP_ARCH_HAS_WARP_VOTE__ (1)
    +
    82 #define __HIP_ARCH_HAS_WARP_BALLOT__ (1)
    +
    83 #define __HIP_ARCH_HAS_WARP_SHUFFLE__ (1)
    +
    84 #define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)
    +
    85 
    +
    86 //sync
    +
    87 #define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (0)
    +
    88 #define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)
    +
    89 
    +
    90 // misc
    +
    91 #define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
    +
    92 #define __HIP_ARCH_HAS_3DGRID__ (1)
    +
    93 #define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)
    +
    94 
    +
    95 #endif
    +
    96 
    +
    97 
    +
    98 
    +
    99 
    +
    100 
    +
    101 //TODO-HCC this is currently ignored by HCC target of HIP
    +
    102 #define __launch_bounds__(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor)
    103 
    -
    104 
    -
    105 
    -
    106 
    -
    107 
    -
    108 //TODO-HCC this is currently ignored by HCC target of HIP
    -
    109 #define __launch_bounds__(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor)
    +
    104 // Detect if we are compiling C++ mode or C mode
    +
    105 #if defined(__cplusplus)
    +
    106 #define __HCC_CPP__
    +
    107 #elif defined(__STDC_VERSION__)
    +
    108 #define __HCC_C__
    +
    109 #endif
    110 
    -
    111 // Detect if we are compiling C++ mode or C mode
    -
    112 #if defined(__cplusplus)
    -
    113 #define __HCC_CPP__
    -
    114 #elif defined(__STDC_VERSION__)
    -
    115 #define __HCC_C__
    -
    116 #endif
    -
    117 
    -
    118 #define clock_t long long int
    -
    119 __device__ inline long long int clock64() { return (long long int)hc::__clock_u64(); };
    -
    120 __device__ inline clock_t clock() { return (clock_t)hc::__clock_u64(); };
    -
    121 
    -
    122 //atomicAdd()
    -
    123 __device__ inline int atomicAdd(int* address, int val)
    -
    124 {
    -
    125  return hc::atomic_fetch_add(address,val);
    -
    126 }
    -
    127 __device__ inline unsigned int atomicAdd(unsigned int* address,
    -
    128  unsigned int val)
    -
    129 {
    -
    130  return hc::atomic_fetch_add(address,val);
    -
    131 }
    -
    132 __device__ inline unsigned long long int atomicAdd(unsigned long long int* address,
    -
    133  unsigned long long int val)
    -
    134 {
    -
    135  return (long long int)hc::atomic_fetch_add((uint64_t*)address,(uint64_t)val);
    -
    136 }
    -
    137 __device__ inline float atomicAdd(float* address, float val)
    -
    138 {
    -
    139  return hc::atomic_fetch_add(address,val);
    -
    140 }
    -
    141 
    -
    142 //atomicSub()
    -
    143 __device__ inline int atomicSub(int* address, int val)
    -
    144 {
    -
    145  return hc::atomic_fetch_sub(address,val);
    -
    146 }
    -
    147 __device__ inline unsigned int atomicSub(unsigned int* address,
    -
    148  unsigned int val)
    -
    149 {
    -
    150  return hc::atomic_fetch_sub(address,val);
    -
    151 }
    -
    152 
    -
    153 //atomicExch()
    -
    154 __device__ inline int atomicExch(int* address, int val)
    -
    155 {
    -
    156  return hc::atomic_exchange(address,val);
    -
    157 }
    -
    158 __device__ inline unsigned int atomicExch(unsigned int* address,
    -
    159  unsigned int val)
    -
    160 {
    -
    161  return hc::atomic_exchange(address,val);
    -
    162 }
    -
    163 __device__ inline unsigned long long int atomicExch(unsigned long long int* address,
    -
    164  unsigned long long int val)
    -
    165 {
    -
    166  return (long long int)hc::atomic_exchange((uint64_t*)address,(uint64_t)val);
    -
    167 }
    -
    168 __device__ inline float atomicExch(float* address, float val)
    -
    169 {
    -
    170  return hc::atomic_exchange(address,val);
    -
    171 }
    -
    172 
    -
    173 //atomicMin()
    -
    174 __device__ inline int atomicMin(int* address, int val)
    -
    175 {
    -
    176  return hc::atomic_fetch_min(address,val);
    -
    177 }
    -
    178 __device__ inline unsigned int atomicMin(unsigned int* address,
    -
    179  unsigned int val)
    -
    180 {
    -
    181  return hc::atomic_fetch_min(address,val);
    -
    182 }
    -
    183 __device__ inline unsigned long long int atomicMin(unsigned long long int* address,
    -
    184  unsigned long long int val)
    -
    185 {
    -
    186  return (long long int)hc::atomic_fetch_min((uint64_t*)address,(uint64_t)val);
    -
    187 }
    -
    188 
    -
    189 //atomicMax()
    -
    190 __device__ inline int atomicMax(int* address, int val)
    -
    191 {
    -
    192  return hc::atomic_fetch_max(address,val);
    -
    193 }
    -
    194 __device__ inline unsigned int atomicMax(unsigned int* address,
    -
    195  unsigned int val)
    -
    196 {
    -
    197  return hc::atomic_fetch_max(address,val);
    -
    198 }
    -
    199 __device__ inline unsigned long long int atomicMax(unsigned long long int* address,
    -
    200  unsigned long long int val)
    -
    201 {
    -
    202  return (long long int)hc::atomic_fetch_max((uint64_t*)address,(uint64_t)val);
    +
    111 #define clock_t long long int
    +
    112 __device__ inline long long int clock64() { return (long long int)hc::__clock_u64(); };
    +
    113 __device__ inline clock_t clock() { return (clock_t)hc::__clock_u64(); };
    +
    114 
    +
    115 //atomicAdd()
    +
    116 __device__ inline int atomicAdd(int* address, int val)
    +
    117 {
    +
    118  return hc::atomic_fetch_add(address,val);
    +
    119 }
    +
    120 __device__ inline unsigned int atomicAdd(unsigned int* address,
    +
    121  unsigned int val)
    +
    122 {
    +
    123  return hc::atomic_fetch_add(address,val);
    +
    124 }
    +
    125 __device__ inline unsigned long long int atomicAdd(unsigned long long int* address,
    +
    126  unsigned long long int val)
    +
    127 {
    +
    128  return (long long int)hc::atomic_fetch_add((uint64_t*)address,(uint64_t)val);
    +
    129 }
    +
    130 __device__ inline float atomicAdd(float* address, float val)
    +
    131 {
    +
    132  return hc::atomic_fetch_add(address,val);
    +
    133 }
    +
    134 
    +
    135 //atomicSub()
    +
    136 __device__ inline int atomicSub(int* address, int val)
    +
    137 {
    +
    138  return hc::atomic_fetch_sub(address,val);
    +
    139 }
    +
    140 __device__ inline unsigned int atomicSub(unsigned int* address,
    +
    141  unsigned int val)
    +
    142 {
    +
    143  return hc::atomic_fetch_sub(address,val);
    +
    144 }
    +
    145 
    +
    146 //atomicExch()
    +
    147 __device__ inline int atomicExch(int* address, int val)
    +
    148 {
    +
    149  return hc::atomic_exchange(address,val);
    +
    150 }
    +
    151 __device__ inline unsigned int atomicExch(unsigned int* address,
    +
    152  unsigned int val)
    +
    153 {
    +
    154  return hc::atomic_exchange(address,val);
    +
    155 }
    +
    156 __device__ inline unsigned long long int atomicExch(unsigned long long int* address,
    +
    157  unsigned long long int val)
    +
    158 {
    +
    159  return (long long int)hc::atomic_exchange((uint64_t*)address,(uint64_t)val);
    +
    160 }
    +
    161 __device__ inline float atomicExch(float* address, float val)
    +
    162 {
    +
    163  return hc::atomic_exchange(address,val);
    +
    164 }
    +
    165 
    +
    166 //atomicMin()
    +
    167 __device__ inline int atomicMin(int* address, int val)
    +
    168 {
    +
    169  return hc::atomic_fetch_min(address,val);
    +
    170 }
    +
    171 __device__ inline unsigned int atomicMin(unsigned int* address,
    +
    172  unsigned int val)
    +
    173 {
    +
    174  return hc::atomic_fetch_min(address,val);
    +
    175 }
    +
    176 __device__ inline unsigned long long int atomicMin(unsigned long long int* address,
    +
    177  unsigned long long int val)
    +
    178 {
    +
    179  return (long long int)hc::atomic_fetch_min((uint64_t*)address,(uint64_t)val);
    +
    180 }
    +
    181 
    +
    182 //atomicMax()
    +
    183 __device__ inline int atomicMax(int* address, int val)
    +
    184 {
    +
    185  return hc::atomic_fetch_max(address,val);
    +
    186 }
    +
    187 __device__ inline unsigned int atomicMax(unsigned int* address,
    +
    188  unsigned int val)
    +
    189 {
    +
    190  return hc::atomic_fetch_max(address,val);
    +
    191 }
    +
    192 __device__ inline unsigned long long int atomicMax(unsigned long long int* address,
    +
    193  unsigned long long int val)
    +
    194 {
    +
    195  return (long long int)hc::atomic_fetch_max((uint64_t*)address,(uint64_t)val);
    +
    196 }
    +
    197 
    +
    198 //atomicCAS()
    +
    199 __device__ inline int atomicCAS(int* address, int compare, int val)
    +
    200 {
    +
    201  hc::atomic_compare_exchange(address,&compare,val);
    +
    202  return *address;
    203 }
    -
    204 
    -
    205 //atomicInc()
    -
    206 __device__ inline unsigned int atomicInc(unsigned int* address)
    +
    204 __device__ inline unsigned int atomicCAS(unsigned int* address,
    +
    205  unsigned int compare,
    +
    206  unsigned int val)
    207 {
    -
    208  return hc::atomic_fetch_inc(address);
    -
    209 }
    -
    210 
    -
    211 //atomicDec()
    -
    212 __device__ inline unsigned int atomicDec(unsigned int* address)
    -
    213 {
    -
    214  return hc::atomic_fetch_dec(address);
    -
    215 }
    -
    216 
    -
    217 //atomicCAS()
    -
    218 __device__ inline int atomicCAS(int* address, int compare, int val)
    -
    219 {
    -
    220  hc::atomic_compare_exchange(address,&compare,val);
    -
    221  return *address;
    -
    222 }
    -
    223 __device__ inline unsigned int atomicCAS(unsigned int* address,
    -
    224  unsigned int compare,
    +
    208  hc::atomic_compare_exchange(address,&compare,val);
    +
    209  return *address;
    +
    210 }
    +
    211 __device__ inline unsigned long long int atomicCAS(unsigned long long int* address,
    +
    212  unsigned long long int compare,
    +
    213  unsigned long long int val)
    +
    214 {
    +
    215  hc::atomic_compare_exchange((uint64_t*)address,(uint64_t*)&compare,(uint64_t)val);
    +
    216  return *address;
    +
    217 }
    +
    218 
    +
    219 //atomicAnd()
    +
    220 __device__ inline int atomicAnd(int* address, int val)
    +
    221 {
    +
    222  return hc::atomic_fetch_and(address,val);
    +
    223 }
    +
    224 __device__ inline unsigned int atomicAnd(unsigned int* address,
    225  unsigned int val)
    226 {
    -
    227  hc::atomic_compare_exchange(address,&compare,val);
    -
    228  return *address;
    -
    229 }
    -
    230 __device__ inline unsigned long long int atomicCAS(unsigned long long int* address,
    -
    231  unsigned long long int compare,
    -
    232  unsigned long long int val)
    -
    233 {
    -
    234  hc::atomic_compare_exchange((uint64_t*)address,(uint64_t*)&compare,(uint64_t)val);
    -
    235  return *address;
    -
    236 }
    -
    237 
    -
    238 //atomicAnd()
    -
    239 __device__ inline int atomicAnd(int* address, int val)
    -
    240 {
    -
    241  return hc::atomic_fetch_and(address,val);
    -
    242 }
    -
    243 __device__ inline unsigned int atomicAnd(unsigned int* address,
    -
    244  unsigned int val)
    -
    245 {
    -
    246  return hc::atomic_fetch_and(address,val);
    -
    247 }
    -
    248 __device__ inline unsigned long long int atomicAnd(unsigned long long int* address,
    -
    249  unsigned long long int val)
    -
    250 {
    -
    251  return (long long int)hc::atomic_fetch_and((uint64_t*)address,(uint64_t)val);
    -
    252 }
    -
    253 
    -
    254 //atomicOr()
    -
    255 __device__ inline int atomicOr(int* address, int val)
    -
    256 {
    -
    257  return hc::atomic_fetch_or(address,val);
    -
    258 }
    -
    259 __device__ inline unsigned int atomicOr(unsigned int* address,
    -
    260  unsigned int val)
    -
    261 {
    -
    262  return hc::atomic_fetch_or(address,val);
    -
    263 }
    -
    264 __device__ inline unsigned long long int atomicOr(unsigned long long int* address,
    -
    265  unsigned long long int val)
    -
    266 {
    -
    267  return (long long int)hc::atomic_fetch_or((uint64_t*)address,(uint64_t)val);
    -
    268 }
    -
    269 
    -
    270 //atomicXor()
    -
    271 __device__ inline int atomicXor(int* address, int val)
    -
    272 {
    -
    273  return hc::atomic_fetch_xor(address,val);
    -
    274 }
    -
    275 __device__ inline unsigned int atomicXor(unsigned int* address,
    -
    276  unsigned int val)
    -
    277 {
    -
    278  return hc::atomic_fetch_xor(address,val);
    -
    279 }
    -
    280 __device__ inline unsigned long long int atomicXor(unsigned long long int* address,
    -
    281  unsigned long long int val)
    -
    282 {
    -
    283  return (long long int)hc::atomic_fetch_xor((uint64_t*)address,(uint64_t)val);
    -
    284 }
    -
    285 
    -
    286 #ifdef __HCC__
    -
    287 #include <hc.hpp>
    -
    288 // integer intrinsic function __poc __clz __ffs __brev
    -
    289 __device__ inline unsigned int __popc( unsigned int input)
    +
    227  return hc::atomic_fetch_and(address,val);
    +
    228 }
    +
    229 __device__ inline unsigned long long int atomicAnd(unsigned long long int* address,
    +
    230  unsigned long long int val)
    +
    231 {
    +
    232  return (long long int)hc::atomic_fetch_and((uint64_t*)address,(uint64_t)val);
    +
    233 }
    +
    234 
    +
    235 //atomicOr()
    +
    236 __device__ inline int atomicOr(int* address, int val)
    +
    237 {
    +
    238  return hc::atomic_fetch_or(address,val);
    +
    239 }
    +
    240 __device__ inline unsigned int atomicOr(unsigned int* address,
    +
    241  unsigned int val)
    +
    242 {
    +
    243  return hc::atomic_fetch_or(address,val);
    +
    244 }
    +
    245 __device__ inline unsigned long long int atomicOr(unsigned long long int* address,
    +
    246  unsigned long long int val)
    +
    247 {
    +
    248  return (long long int)hc::atomic_fetch_or((uint64_t*)address,(uint64_t)val);
    +
    249 }
    +
    250 
    +
    251 //atomicXor()
    +
    252 __device__ inline int atomicXor(int* address, int val)
    +
    253 {
    +
    254  return hc::atomic_fetch_xor(address,val);
    +
    255 }
    +
    256 __device__ inline unsigned int atomicXor(unsigned int* address,
    +
    257  unsigned int val)
    +
    258 {
    +
    259  return hc::atomic_fetch_xor(address,val);
    +
    260 }
    +
    261 __device__ inline unsigned long long int atomicXor(unsigned long long int* address,
    +
    262  unsigned long long int val)
    +
    263 {
    +
    264  return (long long int)hc::atomic_fetch_xor((uint64_t*)address,(uint64_t)val);
    +
    265 }
    +
    266 
    +
    267 #include <hc.hpp>
    +
    268 // integer intrinsic function __poc __clz __ffs __brev
    +
    269 __device__ inline unsigned int __popc( unsigned int input)
    +
    270 {
    +
    271  return hc::__popcount_u32_b32( input);
    +
    272 }
    +
    273 
    +
    274 __device__ inline unsigned int __popcll( unsigned long long int input)
    +
    275 {
    +
    276  return hc::__popcount_u32_b64(input);
    +
    277 }
    +
    278 
    +
    279 __device__ inline unsigned int __clz(unsigned int input)
    +
    280 {
    +
    281  return hc::__firstbit_u32_u32( input);
    +
    282 }
    +
    283 
    +
    284 __device__ inline unsigned int __clzll(unsigned long long int input)
    +
    285 {
    +
    286  return hc::__firstbit_u32_u64( input);
    +
    287 }
    +
    288 
    +
    289 __device__ inline unsigned int __clz(int input)
    290 {
    -
    291  return hc::__popcount_u32_b32( input);
    +
    291  return hc::__firstbit_u32_s32( input);
    292 }
    293 
    -
    294 __device__ inline unsigned int __popcll( unsigned long long int input)
    +
    294 __device__ inline unsigned int __clzll(long long int input)
    295 {
    -
    296  return hc::__popcount_u32_b64(input);
    +
    296  return hc::__firstbit_u32_s64( input);
    297 }
    298 
    -
    299 __device__ inline unsigned int __clz(unsigned int input)
    +
    299 __device__ inline unsigned int __ffs(unsigned int input)
    300 {
    -
    301  return hc::__firstbit_u32_u32( input);
    +
    301  return hc::__lastbit_u32_u32( input)+1;
    302 }
    303 
    -
    304 __device__ inline unsigned int __clzll(unsigned long long int input)
    +
    304 __device__ inline unsigned int __ffsll(unsigned long long int input)
    305 {
    -
    306  return hc::__firstbit_u32_u64( input);
    +
    306  return hc::__lastbit_u32_u64( input)+1;
    307 }
    308 
    -
    309 __device__ inline unsigned int __clz(int input)
    +
    309 __device__ inline unsigned int __ffs(int input)
    310 {
    -
    311  return hc::__firstbit_u32_s32( input);
    +
    311  return hc::__lastbit_u32_s32( input)+1;
    312 }
    313 
    -
    314 __device__ inline unsigned int __clzll(long long int input)
    +
    314 __device__ inline unsigned int __ffsll(long long int input)
    315 {
    -
    316  return hc::__firstbit_u32_s64( input);
    +
    316  return hc::__lastbit_u32_s64( input)+1;
    317 }
    318 
    -
    319 __device__ inline unsigned int __ffs(unsigned int input)
    +
    319 __device__ inline unsigned int __brev( unsigned int input)
    320 {
    -
    321  return hc::__lastbit_u32_u32( input)+1;
    +
    321  return hc::__bitrev_b32( input);
    322 }
    323 
    -
    324 __device__ inline unsigned int __ffsll(unsigned long long int input)
    +
    324 __device__ inline unsigned long long int __brevll( unsigned long long int input)
    325 {
    -
    326  return hc::__lastbit_u32_u64( input)+1;
    +
    326  return hc::__bitrev_b64( input);
    327 }
    328 
    -
    329 __device__ inline unsigned int __brev( unsigned int input)
    -
    330 {
    -
    331  return hc::__bitrev_b32( input);
    -
    332 }
    -
    333 
    -
    334 __device__ inline unsigned long long int __brevll( unsigned long long int input)
    -
    335 {
    -
    336  return hc::__bitrev_b64( input);
    -
    337 }
    -
    338 
    -
    339 // warp vote function __all __any __ballot
    +
    329 // warp vote function __all __any __ballot
    +
    330 __device__ inline int __all( int input)
    +
    331 {
    +
    332  return hc::__all( input);
    +
    333 }
    +
    334 
    +
    335 __device__ inline int __any( int input)
    +
    336 {
    +
    337  if( hc::__any( input)!=0) return 1;
    +
    338  else return 0;
    +
    339 }
    340 
    -
    341 __device__ inline int __all( int input)
    +
    341 __device__ inline unsigned long long int __ballot( int input)
    342 {
    -
    343  return hc::__all( input);
    +
    343  return hc::__ballot( input);
    344 }
    345 
    -
    346 
    -
    347 __device__ inline int __any( int input)
    +
    346 // warp shuffle functions
    +
    347 __device__ inline int __shfl(int input, int lane, int width)
    348 {
    -
    349  return hc::__any( input);
    +
    349  return hc::__shfl(input,lane,width);
    350 }
    351 
    -
    352 
    -
    353 __device__ inline unsigned long long int __ballot( int input)
    -
    354 {
    -
    355  return hc::__ballot( input);
    -
    356 }
    -
    357 
    -
    358 #endif
    -
    359 
    -
    360 
    +
    352 __device__ inline int __shfl_up(int input, unsigned int lane_delta, int width)
    +
    353 {
    +
    354  return hc::__shfl_up(input,lane_delta,width);
    +
    355 }
    +
    356 
    +
    357 __device__ inline int __shfl_down(int input, unsigned int lane_delta, int width)
    +
    358 {
    +
    359  return hc::__shfl_down(input,lane_delta,width);
    +
    360 }
    361 
    -
    362 #ifdef __HCC_ACCELERATOR__
    -
    363 #include <hc_math.hpp>
    -
    364 // TODO: Choose whether default is precise math or fast math based on compilation flag.
    -
    365 using namespace hc::precise_math;
    +
    362 __device__ inline int __shfl_xor(int input, int lane_mask, int width)
    +
    363 {
    +
    364  return hc::__shfl_xor(input,lane_mask,width);
    +
    365 }
    366 
    -
    367 //TODO: Undo this once min/max functions are supported by hc
    -
    368 inline int min(int arg1, int arg2) __attribute((hc,cpu)) { \
    -
    369  return (int)(hc::precise_math::fmin((float)arg1, (float)arg2));}
    -
    370 inline int max(int arg1, int arg2) __attribute((hc,cpu)) { \
    -
    371  return (int)(hc::precise_math::fmax((float)arg1, (float)arg2));}
    -
    372 
    -
    373 
    -
    374 //TODO - add a couple fast math operations here, the set here will grow :
    -
    375 __device__ inline float __log2f(float x) {return hc::fast_math::log2(x); };
    -
    376 __device__ inline float __powf(float base, float exponent) {return hc::fast_math::powf(base, exponent); };
    -
    377 
    -
    378 #endif
    -
    379 
    -
    380 
    +
    367 __device__ inline float __shfl(float input, int lane, int width)
    +
    368 {
    +
    369  return hc::__shfl(input,lane,width);
    +
    370 }
    +
    371 
    +
    372 __device__ inline float __shfl_up(float input, unsigned int lane_delta, int width)
    +
    373 {
    +
    374  return hc::__shfl_up(input,lane_delta,width);
    +
    375 }
    +
    376 
    +
    377 __device__ inline float __shfl_down(float input, unsigned int lane_delta, int width)
    +
    378 {
    +
    379  return hc::__shfl_down(input,lane_delta,width);
    +
    380 }
    381 
    -
    385 #define hipThreadIdx_x (amp_get_local_id(2))
    -
    386 #define hipThreadIdx_y (amp_get_local_id(1))
    -
    387 #define hipThreadIdx_z (amp_get_local_id(0))
    -
    388 
    -
    389 #define hipBlockIdx_x (hc_get_group_id(2))
    -
    390 #define hipBlockIdx_y (hc_get_group_id(1))
    -
    391 #define hipBlockIdx_z (hc_get_group_id(0))
    -
    392 
    -
    393 #define hipBlockDim_x (amp_get_local_size(2))
    -
    394 #define hipBlockDim_y (amp_get_local_size(1))
    -
    395 #define hipBlockDim_z (amp_get_local_size(0))
    -
    396 
    -
    397 #define hipGridDim_x (hc_get_num_groups(2))
    -
    398 #define hipGridDim_y (hc_get_num_groups(1))
    -
    399 #define hipGridDim_z (hc_get_num_groups(0))
    -
    400 
    -
    401 
    -
    402 extern int warpSize ;
    -
    403 
    -
    404 
    -
    405 #define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE)
    -
    406 
    -
    407 
    -
    408 #if 0
    -
    409 #define KALMAR_PFE_BEGIN() \
    -
    410  hc::extent<3> ext(lp.gridDim.x, lp.gridDim.y, lp.gridDim.z);\
    -
    411  auto __hipExtTile = ext.tile(lp.groupDim.x, lp.groupDim.y, lp.groupDim.z);\
    -
    412  __hipExtTile.set_dynamic_group_segment_size(lp.groupMemBytes);\
    -
    413  \
    -
    414  hc::completion_future cf = hc::parallel_for_each (\
    -
    415  *lp.av,\
    -
    416  __hipExtTile,\
    -
    417  [=] (hc::tiled_index<3> __hipIdx) mutable [[hc]]
    -
    418 
    -
    419 
    +
    382 __device__ inline float __shfl_xor(float input, int lane_mask, int width)
    +
    383 {
    +
    384  return hc::__shfl_xor(input,lane_mask,width);
    +
    385 }
    +
    386 
    +
    387 
    +
    388 #include <hc_math.hpp>
    +
    389 // TODO: Choose whether default is precise math or fast math based on compilation flag.
    +
    390 #ifdef __HCC_ACCELERATOR__
    +
    391 using namespace hc::precise_math;
    +
    392 #endif
    +
    393 
    +
    394 //TODO: Undo this once min/max functions are supported by hc
    +
    395 inline int min(int arg1, int arg2) __attribute((hc,cpu)) { \
    +
    396  return (int)(hc::precise_math::fmin((float)arg1, (float)arg2));}
    +
    397 inline int max(int arg1, int arg2) __attribute((hc,cpu)) { \
    +
    398  return (int)(hc::precise_math::fmax((float)arg1, (float)arg2));}
    +
    399 
    +
    400 
    +
    401 //TODO - add a couple fast math operations here, the set here will grow :
    +
    402 __device__ inline float __cosf(float x) {return hc::fast_math::cosf(x); };
    +
    403 __device__ inline float __expf(float x) {return hc::fast_math::expf(x); };
    +
    404 __device__ inline float __frsqrt_rn(float x) {return hc::fast_math::rsqrt(x); };
    +
    405 __device__ inline float __fsqrt_rd(float x) {return hc::fast_math::sqrt(x); };
    +
    406 __device__ inline float __fsqrt_rn(float x) {return hc::fast_math::sqrt(x); };
    +
    407 __device__ inline float __fsqrt_ru(float x) {return hc::fast_math::sqrt(x); };
    +
    408 __device__ inline float __fsqrt_rz(float x) {return hc::fast_math::sqrt(x); };
    +
    409 __device__ inline float __log10f(float x) {return hc::fast_math::log10f(x); };
    +
    410 __device__ inline float __log2f(float x) {return hc::fast_math::log2f(x); };
    +
    411 __device__ inline float __logf(float x) {return hc::fast_math::logf(x); };
    +
    412 __device__ inline float __powf(float base, float exponent) {return hc::fast_math::powf(base, exponent); };
    +
    413 __device__ inline void __sincosf(float x, float *s, float *c) {return hc::fast_math::sincosf(x, s, c); };
    +
    414 __device__ inline float __sinf(float x) {return hc::fast_math::sinf(x); };
    +
    415 __device__ inline float __tanf(float x) {return hc::fast_math::tanf(x); };
    +
    416 __device__ inline float __dsqrt_rd(double x) {return hc::fast_math::sqrt(x); };
    +
    417 __device__ inline float __dsqrt_rn(double x) {return hc::fast_math::sqrt(x); };
    +
    418 __device__ inline float __dsqrt_ru(double x) {return hc::fast_math::sqrt(x); };
    +
    419 __device__ inline float __dsqrt_rz(double x) {return hc::fast_math::sqrt(x); };
    420 
    -
    421 #define KALMAR_PFE_END \
    -
    422  ); \
    -
    423  if (HIP_LAUNCH_BLOCKING) {\
    -
    424  if (HIP_TRACE_API) {\
    -
    425  fprintf(stderr, "hiptrace1: HIP_LAUNCH_BLOCKING ...\n");\
    -
    426  }\
    -
    427  cf.wait(); \
    -
    428  if (HIP_TRACE_API) {\
    -
    429  fprintf(stderr, "hiptrace1: ...completed.\n");\
    -
    430  }\
    -
    431  }
    -
    432 #endif
    -
    433 
    -
    434 
    -
    435 
    -
    436 #define HIP_KERNEL_NAME(...) __VA_ARGS__
    -
    437 
    -
    438 
    -
    439 #ifdef __HCC_CPP__
    -
    440 hc::accelerator_view *ihipLaunchKernel(hipStream_t stream);
    -
    441 
    -
    442 #if not defined(DISABLE_GRID_LAUNCH)
    -
    443 #define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
    -
    444 do {\
    -
    445  grid_launch_parm lp;\
    -
    446  lp.gridDim.x = _numBlocks3D.x; \
    -
    447  lp.gridDim.y = _numBlocks3D.y; \
    -
    448  lp.gridDim.z = _numBlocks3D.z; \
    -
    449  lp.groupDim.x = _blockDim3D.x; \
    -
    450  lp.groupDim.y = _blockDim3D.y; \
    -
    451  lp.groupDim.z = _blockDim3D.z; \
    -
    452  lp.groupMemBytes = _groupMemBytes;\
    -
    453  hc::completion_future cf;\
    -
    454  lp.cf = &cf; \
    -
    455  lp.av = (ihipLaunchKernel(_stream)); \
    -
    456  if (HIP_TRACE_API) {\
    -
    457  fprintf(stderr, "hiptrace1: launch '%s' gridDim:[%d.%d.%d] groupDim:[%d.%d.%d] groupMem:+%d stream=%p\n", \
    -
    458  #_kernelName, lp.gridDim.z, lp.gridDim.y, lp.gridDim.x, lp.groupDim.z, lp.groupDim.y, lp.groupDim.x, lp.groupMemBytes, (void*)(_stream));\
    -
    459  }\
    -
    460  _kernelName (lp, __VA_ARGS__);\
    -
    461 } while(0)
    -
    462 
    -
    463 #else
    -
    464 #warning(DISABLE_GRID_LAUNCH set)
    -
    465 
    -
    466 #define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
    -
    467 do {\
    -
    468  grid_launch_parm lp;\
    -
    469  lp.gridDim.x = _numBlocks3D.x * _blockDim3D.x;/*Convert from #blocks to #threads*/ \
    -
    470  lp.gridDim.y = _numBlocks3D.y * _blockDim3D.y;/*Convert from #blocks to #threads*/ \
    -
    471  lp.gridDim.z = _numBlocks3D.z * _blockDim3D.z;/*Convert from #blocks to #threads*/ \
    -
    472  lp.groupDim.x = _blockDim3D.x; \
    -
    473  lp.groupDim.y = _blockDim3D.y; \
    -
    474  lp.groupDim.z = _blockDim3D.z; \
    -
    475  lp.groupMemBytes = _groupMemBytes;\
    -
    476  hc::completion_future cf;\
    -
    477  lp.cf = &cf; \
    -
    478  lp.av = (ihipLaunchKernel(_stream)); \
    -
    479  if (HIP_TRACE_API) {\
    -
    480  fprintf(stderr, "hiptrace1: launch '%s' gridDim:[%d.%d.%d] groupDim:[%d.%d.%d] groupMem:+%d stream=%p\n", \
    -
    481  #_kernelName, lp.gridDim.z, lp.gridDim.y, lp.gridDim.x, lp.groupDim.z, lp.groupDim.y, lp.groupDim.x, lp.groupMemBytes, (void*)(_stream));\
    -
    482  }\
    -
    483  _kernelName (lp, __VA_ARGS__);\
    -
    484 } while(0)
    -
    485 /*end hipLaunchKernel */
    -
    486 #endif
    -
    487 
    -
    488 #elif defined (__HCC_C__)
    -
    489 
    -
    490 //TODO - develop C interface.
    -
    491 
    -
    492 #endif
    -
    493 
    -
    494 
    -
    495 #if not defined(DISABLE_GRID_LAUNCH)
    -
    496 // TODO -In GL these are no-ops and can be removed:
    -
    497 // Keep them around for a little while as a fallback.
    -
    498 #define KERNELBEGIN
    -
    499 #define KERNELEND
    -
    500 
    -
    501 #else
    -
    502 
    -
    503 // TODO-GL:
    -
    504 // These wrap the kernel in a PFE loop with macros.
    -
    505 // Not required with GL but exist here as a fallback.
    -
    506 #define KERNELBEGIN \
    -
    507  hc::extent<3> ext(lp.gridDim.x, lp.gridDim.y, lp.gridDim.z);\
    -
    508  auto __hipExtTile = ext.tile(lp.groupDim.x, lp.groupDim.y, lp.groupDim.z);\
    -
    509  __hipExtTile.set_dynamic_group_segment_size(lp.groupMemBytes);\
    -
    510  \
    -
    511  hc::completion_future cf = \
    -
    512  hc::parallel_for_each (\
    -
    513  *lp.av,\
    -
    514  __hipExtTile,\
    -
    515  [=] (hc::tiled_index<3> __hipIdx) mutable [[hc]] \
    -
    516  {
    -
    517 
    -
    518 
    -
    519 #define KERNELEND \
    -
    520  }); \
    -
    521  if (HIP_LAUNCH_BLOCKING) {\
    -
    522  if (HIP_TRACE_API) {\
    -
    523  fprintf(stderr, "hiptrace1: HIP_LAUNCH_BLOCKING ...\n");\
    -
    524  }\
    -
    525  cf.wait(); \
    -
    526  if (HIP_TRACE_API) {\
    -
    527  fprintf(stderr, "hiptrace1: ...completed.\n");\
    -
    528  }\
    -
    529  }
    -
    530 
    -
    531 #endif /*DISABLE_GRID_LAUNCH*/
    +
    424 #define hipThreadIdx_x (amp_get_local_id(2))
    +
    425 #define hipThreadIdx_y (amp_get_local_id(1))
    +
    426 #define hipThreadIdx_z (amp_get_local_id(0))
    +
    427 
    +
    428 #define hipBlockIdx_x (hc_get_group_id(2))
    +
    429 #define hipBlockIdx_y (hc_get_group_id(1))
    +
    430 #define hipBlockIdx_z (hc_get_group_id(0))
    +
    431 
    +
    432 #define hipBlockDim_x (amp_get_local_size(2))
    +
    433 #define hipBlockDim_y (amp_get_local_size(1))
    +
    434 #define hipBlockDim_z (amp_get_local_size(0))
    +
    435 
    +
    436 #define hipGridDim_x (hc_get_num_groups(2))
    +
    437 #define hipGridDim_y (hc_get_num_groups(1))
    +
    438 #define hipGridDim_z (hc_get_num_groups(0))
    +
    439 
    +
    440 
    +
    441 extern int warpSize ;
    +
    442 
    +
    443 
    +
    444 #define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE)
    +
    445 
    +
    446 
    +
    447 #if 0
    +
    448 #define KALMAR_PFE_BEGIN() \
    +
    449  hc::extent<3> ext(lp.gridDim.x, lp.gridDim.y, lp.gridDim.z);\
    +
    450  auto __hipExtTile = ext.tile(lp.groupDim.x, lp.groupDim.y, lp.groupDim.z);\
    +
    451  __hipExtTile.set_dynamic_group_segment_size(lp.groupMemBytes);\
    +
    452  \
    +
    453  hc::completion_future cf = hc::parallel_for_each (\
    +
    454  *lp.av,\
    +
    455  __hipExtTile,\
    +
    456  [=] (hc::tiled_index<3> __hipIdx) mutable [[hc]]
    +
    457 
    +
    458 
    +
    459 
    +
    460 #define KALMAR_PFE_END \
    +
    461  ); \
    +
    462  if (HIP_LAUNCH_BLOCKING) {\
    +
    463  if (HIP_TRACE_API) {\
    +
    464  fprintf(stderr, "hiptrace1: HIP_LAUNCH_BLOCKING ...\n");\
    +
    465  }\
    +
    466  cf.wait(); \
    +
    467  if (HIP_TRACE_API) {\
    +
    468  fprintf(stderr, "hiptrace1: ...completed.\n");\
    +
    469  }\
    +
    470  }
    +
    471 #endif
    +
    472 
    +
    473 
    +
    474 
    +
    475 #define HIP_KERNEL_NAME(...) __VA_ARGS__
    +
    476 
    +
    477 
    +
    478 #ifdef __HCC_CPP__
    +
    479 hc::accelerator_view *ihipLaunchKernel(hipStream_t stream);
    +
    480 
    +
    481 #if not defined(DISABLE_GRID_LAUNCH)
    +
    482 #define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
    +
    483 do {\
    +
    484  grid_launch_parm lp;\
    +
    485  lp.gridDim.x = _numBlocks3D.x; \
    +
    486  lp.gridDim.y = _numBlocks3D.y; \
    +
    487  lp.gridDim.z = _numBlocks3D.z; \
    +
    488  lp.groupDim.x = _blockDim3D.x; \
    +
    489  lp.groupDim.y = _blockDim3D.y; \
    +
    490  lp.groupDim.z = _blockDim3D.z; \
    +
    491  lp.groupMemBytes = _groupMemBytes;\
    +
    492  hc::completion_future cf;\
    +
    493  lp.cf = &cf; \
    +
    494  lp.av = (ihipLaunchKernel(_stream)); \
    +
    495  if (HIP_TRACE_API) {\
    +
    496  fprintf(stderr, "hiptrace1: launch '%s' gridDim:[%d.%d.%d] groupDim:[%d.%d.%d] groupMem:+%d stream=%p\n", \
    +
    497  #_kernelName, lp.gridDim.z, lp.gridDim.y, lp.gridDim.x, lp.groupDim.z, lp.groupDim.y, lp.groupDim.x, lp.groupMemBytes, (void*)(_stream));\
    +
    498  }\
    +
    499  _kernelName (lp, __VA_ARGS__);\
    +
    500 } while(0)
    +
    501 
    +
    502 #else
    +
    503 #warning(DISABLE_GRID_LAUNCH set)
    +
    504 
    +
    505 #define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
    +
    506 do {\
    +
    507  grid_launch_parm lp;\
    +
    508  lp.gridDim.x = _numBlocks3D.x * _blockDim3D.x;/*Convert from #blocks to #threads*/ \
    +
    509  lp.gridDim.y = _numBlocks3D.y * _blockDim3D.y;/*Convert from #blocks to #threads*/ \
    +
    510  lp.gridDim.z = _numBlocks3D.z * _blockDim3D.z;/*Convert from #blocks to #threads*/ \
    +
    511  lp.groupDim.x = _blockDim3D.x; \
    +
    512  lp.groupDim.y = _blockDim3D.y; \
    +
    513  lp.groupDim.z = _blockDim3D.z; \
    +
    514  lp.groupMemBytes = _groupMemBytes;\
    +
    515  hc::completion_future cf;\
    +
    516  lp.cf = &cf; \
    +
    517  lp.av = (ihipLaunchKernel(_stream)); \
    +
    518  if (HIP_TRACE_API) {\
    +
    519  fprintf(stderr, "hiptrace1: launch '%s' gridDim:[%d.%d.%d] groupDim:[%d.%d.%d] groupMem:+%d stream=%p\n", \
    +
    520  #_kernelName, lp.gridDim.z, lp.gridDim.y, lp.gridDim.x, lp.groupDim.z, lp.groupDim.y, lp.groupDim.x, lp.groupMemBytes, (void*)(_stream));\
    +
    521  }\
    +
    522  _kernelName (lp, __VA_ARGS__);\
    +
    523 } while(0)
    +
    524 /*end hipLaunchKernel */
    +
    525 #endif
    +
    526 
    +
    527 #elif defined (__HCC_C__)
    +
    528 
    +
    529 //TODO - develop C interface.
    +
    530 
    +
    531 #endif
    532 
    533 
    -
    534 #endif // __HCC__
    -
    535 
    -
    536 
    -
    541 extern int HIP_PRINT_ENV ;
    -
    542 extern int HIP_TRACE_API;
    -
    543 extern int HIP_LAUNCH_BLOCKING ;
    -
    544 
    -
    550 // End doxygen API:
    +
    534 #if not defined(DISABLE_GRID_LAUNCH)
    +
    535 // TODO -In GL these are no-ops and can be removed:
    +
    536 // Keep them around for a little while as a fallback.
    +
    537 #define KERNELBEGIN
    +
    538 #define KERNELEND
    +
    539 
    +
    540 #else
    +
    541 
    +
    542 // TODO-GL:
    +
    543 // These wrap the kernel in a PFE loop with macros.
    +
    544 // Not required with GL but exist here as a fallback.
    +
    545 #define KERNELBEGIN \
    +
    546  hc::extent<3> ext(lp.gridDim.x, lp.gridDim.y, lp.gridDim.z);\
    +
    547  auto __hipExtTile = ext.tile(lp.groupDim.x, lp.groupDim.y, lp.groupDim.z);\
    +
    548  __hipExtTile.set_dynamic_group_segment_size(lp.groupMemBytes);\
    +
    549  \
    +
    550  hc::completion_future cf = \
    +
    551  hc::parallel_for_each (\
    +
    552  *lp.av,\
    +
    553  __hipExtTile,\
    +
    554  [=] (hc::tiled_index<3> __hipIdx) mutable [[hc]] \
    +
    555  {
    +
    556 
    +
    557 
    +
    558 #define KERNELEND \
    +
    559  }); \
    +
    560  if (HIP_LAUNCH_BLOCKING) {\
    +
    561  if (HIP_TRACE_API) {\
    +
    562  fprintf(stderr, "hiptrace1: HIP_LAUNCH_BLOCKING ...\n");\
    +
    563  }\
    +
    564  cf.wait(); \
    +
    565  if (HIP_TRACE_API) {\
    +
    566  fprintf(stderr, "hiptrace1: ...completed.\n");\
    +
    567  }\
    +
    568  }
    +
    569 
    +
    570 #endif /*DISABLE_GRID_LAUNCH*/
    +
    571 
    +
    572 
    +
    573 #endif // __HCC__
    +
    574 
    +
    575 
    +
    580 extern int HIP_PRINT_ENV ;
    +
    581 extern int HIP_TRACE_API;
    +
    582 extern int HIP_LAUNCH_BLOCKING ;
    +
    583 
    +
    589 // End doxygen API:
    int HIP_TRACE_API
    Trace HIP APIs.
    Definition: hip_hcc.cpp:57
    +
    TODO-doc.
    Definition: hip_hcc.cpp:82
    +
    HIP C++ Texture API for hcc compiler.
    int HIP_PRINT_ENV
    Print all HIP-related environment variables.
    Definition: hip_hcc.cpp:56
    +
    Contains C function APIs for HIP runtime. This file does not use any HCC builtin or special language ...
    int HIP_LAUNCH_BLOCKING
    Make all HIP APIs host-synchronous.
    Definition: hip_hcc.cpp:58
    diff --git a/projects/hip/docs/RuntimeAPI/html/hcc__detail_2hip__runtime__api_8h_source.html b/projects/hip/docs/RuntimeAPI/html/hcc__detail_2hip__runtime__api_8h_source.html index 45bbf45deb..876afac7f7 100644 --- a/projects/hip/docs/RuntimeAPI/html/hcc__detail_2hip__runtime__api_8h_source.html +++ b/projects/hip/docs/RuntimeAPI/html/hcc__detail_2hip__runtime__api_8h_source.html @@ -4,7 +4,7 @@ -HIP: Heterogenous-computing Interface for Portability: /home/fpadmin/ben/HIP6/include/hcc_detail/hip_runtime_api.h Source File +HIP: Heterogenous-computing Interface for Portability: /home/bensander/HIP.public/include/hcc_detail/hip_runtime_api.h Source File @@ -89,7 +89,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
    hip_runtime_api.h
    -
    1 /*
    +Go to the documentation of this file.
    1 /*
    2 Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
    3 
    4 Permission is hereby granted, free of charge, to any person obtaining a copy
    @@ -112,356 +112,345 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
    21 */
    22 #pragma once
    23 
    -
    24 
    -
    25 #include <stdint.h>
    -
    26 #include <stddef.h>
    -
    27 
    -
    28 #include <hcc_detail/host_defines.h>
    -
    29 
    -
    30 #if defined (__HCC__) && (__hcc_workweek__ < 1602)
    -
    31 #error("This version of HIP requires a newer version of HCC.");
    -
    32 #endif
    -
    33 
    -
    34 
    -
    35 // hip_api_hcc.h
    -
    36 // Contains C function APIs for HIP runtime.
    -
    37 // This file does not use any HCC builtins or special language extensions (-hc mode) ; those functions in hip_hcc.h.
    -
    38 
    -
    39 
    -
    40 // Structure definitions:
    -
    41 #ifdef __cplusplus
    -
    42 extern "C" {
    -
    43 #endif
    -
    44 
    -
    49 #define hipStreamDefault 0x00
    -
    51 #define hipStreamNonBlocking 0x01
    -
    52 
    -
    53 
    -
    55 #define hipEventDefault 0x0
    -
    56 #define hipEventBlockingSync 0x1
    -
    57 #define hipEventDisableTiming 0x2
    -
    58 #define hipEventInterprocess 0x4
    -
    59 
    -
    60 
    -
    64 typedef enum hipFuncCache {
    - - - - -
    69 } hipFuncCache;
    -
    70 
    -
    71 
    -
    75 typedef enum hipSharedMemConfig {
    - - - - +
    29 #include <stdint.h>
    +
    30 #include <stddef.h>
    +
    31 
    + +
    33 
    +
    34 #if defined (__HCC__) && (__hcc_workweek__ < 1602)
    +
    35 #error("This version of HIP requires a newer version of HCC.");
    +
    36 #endif
    +
    37 
    +
    38 // Structure definitions:
    +
    39 #ifdef __cplusplus
    +
    40 extern "C" {
    +
    41 #endif
    +
    42 
    +
    47 #define hipStreamDefault 0x00
    +
    49 #define hipStreamNonBlocking 0x01
    +
    50 
    +
    51 
    +
    53 #define hipEventDefault 0x0
    +
    54 #define hipEventBlockingSync 0x1
    +
    55 #define hipEventDisableTiming 0x2
    +
    56 #define hipEventInterprocess 0x4
    +
    57 
    +
    58 
    +
    62 typedef enum hipFuncCache {
    + + + + +
    67 } hipFuncCache;
    +
    68 
    +
    69 
    +
    73 typedef enum hipSharedMemConfig {
    + + + + +
    78 
    +
    79 
    80 
    -
    81 
    -
    82 
    -
    87 typedef struct dim3 {
    -
    88  uint32_t x;
    -
    89  uint32_t y;
    -
    90  uint32_t z;
    -
    91 
    -
    92  dim3(uint32_t _x=1, uint32_t _y=1, uint32_t _z=1) : x(_x), y(_y), z(_z) {};
    -
    93 } dim3;
    +
    85 typedef struct dim3 {
    +
    86  uint32_t x;
    +
    87  uint32_t y;
    +
    88  uint32_t z;
    +
    89 
    +
    90  dim3(uint32_t _x=1, uint32_t _y=1, uint32_t _z=1) : x(_x), y(_y), z(_z) {};
    +
    91 } dim3;
    +
    92 
    +
    93 
    94 
    -
    95 
    -
    96 
    - - - - - - -
    107 } ;
    -
    108 
    -
    109 
    -
    110 // Doxygen end group GlobalDefs
    -
    114 //-------------------------------------------------------------------------------------------------
    -
    115 
    -
    116 
    -
    117 // The handle allows the async commands to use the stream even if the parent hipStream_t goes out-of-scope.
    -
    118 typedef struct ihipStream_t * hipStream_t;
    -
    119 
    -
    120 
    -
    121 /*
    -
    122  * Opaque structure allows the true event (pointed at by the handle) to remain "live" even if the surrounding hipEvent_t goes out-of-scope.
    -
    123  * This is handy for cases where the hipEvent_t goes out-of-scope but the true event is being written by some async queue or device */
    -
    124 typedef struct hipEvent_t {
    -
    125  struct ihipEvent_t *_handle;
    -
    126 } hipEvent_t;
    + + + + + + +
    105 } ;
    +
    106 
    +
    107 
    +
    108 // Doxygen end group GlobalDefs
    +
    112 //-------------------------------------------------------------------------------------------------
    +
    113 
    +
    114 
    +
    115 // The handle allows the async commands to use the stream even if the parent hipStream_t goes out-of-scope.
    +
    116 typedef struct ihipStream_t * hipStream_t;
    +
    117 
    +
    118 
    +
    119 /*
    +
    120  * Opaque structure allows the true event (pointed at by the handle) to remain "live" even if the surrounding hipEvent_t goes out-of-scope.
    +
    121  * This is handy for cases where the hipEvent_t goes out-of-scope but the true event is being written by some async queue or device */
    +
    122 typedef struct hipEvent_t {
    +
    123  struct ihipEvent_t *_handle;
    +
    124 } hipEvent_t;
    +
    125 
    +
    126 
    127 
    128 
    129 
    130 
    -
    131 
    -
    132 
    -
    133 #ifdef __cplusplus
    -
    134 } /* extern "C" */
    -
    135 #endif
    -
    136 
    +
    131 #ifdef __cplusplus
    +
    132 } /* extern "C" */
    +
    133 #endif
    +
    134 
    +
    135 
    +
    136 
    137 
    -
    138 
    -
    139 
    -
    140 //==================================================================================================
    -
    141 #ifdef __cplusplus
    -
    142 extern "C" {
    -
    143 #endif
    -
    144 
    - +
    138 //==================================================================================================
    +
    139 #ifdef __cplusplus
    +
    140 extern "C" {
    +
    141 #endif
    +
    142 
    + +
    167 
    +
    168 
    169 
    -
    170 
    -
    171 
    - -
    183 
    -
    184 
    -
    209 hipError_t hipSetDevice(int device);
    -
    210 
    -
    211 
    -
    223 hipError_t hipGetDevice(int *device);
    -
    224 
    -
    225 
    -
    233 hipError_t hipGetDeviceCount(int *count);
    -
    234 
    -
    235 
    - -
    245 
    -
    246 
    -
    247 
    -
    248 //Cache partitioning functions:
    -
    249 
    - -
    257 
    -
    258 
    - -
    266 
    -
    267 
    - -
    275 
    -
    276 //---
    -
    277 //Shared bank config functions:
    -
    278 
    - -
    286 
    -
    287 
    - -
    295 
    -
    296 
    -
    297 // end doxygen Device
    - -
    318 
    -
    319 
    - -
    332 
    -
    333 
    -
    334 
    -
    343 const char *hipGetErrorName(hipError_t hip_error);
    -
    344 
    -
    345 
    -
    356 const char *hipGetErrorString(hipError_t hip_error);
    -
    357 
    -
    358 // end doxygen Error
    -
    388 hipError_t hipStreamCreateWithFlags(hipStream_t *stream, unsigned int flags);
    -
    389 
    -
    390 
    -
    391 
    -
    401 static inline hipError_t hipStreamCreate(hipStream_t *stream)
    -
    402 {
    - -
    404 }
    -
    405 
    -
    406 
    -
    422 hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int flags);
    -
    423 
    -
    424 
    - -
    437 
    -
    438 
    - -
    453 
    -
    454 
    -
    468 hipError_t hipStreamGetFlags(hipStream_t stream, unsigned int *flags);
    -
    469 
    -
    470 
    -
    471 // end doxygen Stream
    -
    496 hipError_t hipEventCreateWithFlags(hipEvent_t* event, unsigned flags);
    -
    497 
    -
    498 
    -
    505 static inline hipError_t hipEventCreate(hipEvent_t* event)
    -
    506 {
    -
    507  return hipEventCreateWithFlags(event, 0);
    -
    508 }
    -
    509 
    -
    510 
    -
    536 hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream = NULL);
    -
    537 
    -
    538 
    - -
    550 
    -
    551 
    - -
    566 
    -
    567 
    -
    592 hipError_t hipEventElapsedTime(float *ms, hipEvent_t start, hipEvent_t stop);
    -
    593 
    -
    594 
    - -
    608 
    -
    609 
    -
    610 // end doxygen Events
    -
    640 hipError_t hipMalloc(void** ptr, size_t size) ;
    -
    641 
    -
    642 
    -
    650 hipError_t hipMallocHost(void** ptr, size_t size) ;
    -
    651 
    -
    652 
    -
    653 
    -
    654 // TODO-doc (error codes)
    -
    662 hipError_t hipFree(void* ptr);
    -
    663 
    -
    664 
    -
    665 
    -
    666 // TODO-doc (error codes)
    -
    673 hipError_t hipFreeHost(void* ptr);
    -
    674 
    -
    675 
    + +
    181 
    +
    182 
    +
    207 hipError_t hipSetDevice(int device);
    +
    208 
    +
    209 
    +
    221 hipError_t hipGetDevice(int *device);
    +
    222 
    +
    223 
    +
    231 hipError_t hipGetDeviceCount(int *count);
    +
    232 
    +
    239 hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device);
    +
    240 
    + +
    250 
    +
    251 
    +
    252 
    +
    253 //Cache partitioning functions:
    +
    254 
    + +
    262 
    +
    263 
    + +
    271 
    +
    272 
    + +
    280 
    +
    281 //---
    +
    282 //Shared bank config functions:
    +
    283 
    + +
    291 
    +
    292 
    + +
    300 
    +
    301 
    +
    302 // end doxygen Device
    + +
    323 
    +
    324 
    + +
    337 
    +
    338 
    +
    339 
    +
    348 const char *hipGetErrorName(hipError_t hip_error);
    +
    349 
    +
    350 
    +
    361 const char *hipGetErrorString(hipError_t hip_error);
    +
    362 
    +
    363 // end doxygen Error
    +
    393 hipError_t hipStreamCreateWithFlags(hipStream_t *stream, unsigned int flags);
    +
    394 
    +
    395 
    +
    396 
    +
    406 static inline hipError_t hipStreamCreate(hipStream_t *stream)
    +
    407 {
    + +
    409 }
    +
    410 
    +
    411 
    +
    427 hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int flags);
    +
    428 
    +
    429 
    + +
    442 
    +
    443 
    + +
    458 
    +
    459 
    +
    473 hipError_t hipStreamGetFlags(hipStream_t stream, unsigned int *flags);
    +
    474 
    +
    475 
    +
    476 // end doxygen Stream
    +
    501 hipError_t hipEventCreateWithFlags(hipEvent_t* event, unsigned flags);
    +
    502 
    +
    503 
    +
    510 static inline hipError_t hipEventCreate(hipEvent_t* event)
    +
    511 {
    +
    512  return hipEventCreateWithFlags(event, 0);
    +
    513 }
    +
    514 
    +
    515 
    +
    541 hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream = NULL);
    +
    542 
    +
    543 
    + +
    555 
    +
    556 
    + +
    571 
    +
    572 
    +
    597 hipError_t hipEventElapsedTime(float *ms, hipEvent_t start, hipEvent_t stop);
    +
    598 
    +
    599 
    + +
    613 
    +
    614 
    +
    615 // end doxygen Events
    +
    645 hipError_t hipMalloc(void** ptr, size_t size) ;
    +
    646 
    +
    647 
    +
    655 hipError_t hipMallocHost(void** ptr, size_t size) ;
    +
    656 
    +
    657 
    +
    665 hipError_t hipFree(void* ptr);
    +
    666 
    +
    667 
    +
    668 
    +
    675 hipError_t hipFreeHost(void* ptr);
    676 
    677 
    -
    678 // TODO-doc (error codes)
    -
    694 hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind);
    +
    678 
    +
    693 hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind);
    +
    694 
    695 
    -
    696 
    -
    697 hipError_t hipMemcpyToSymbol(const char* symbolName, const void *src, size_t sizeBytes, size_t offset, hipMemcpyKind kind);
    -
    698 
    -
    699 
    -
    700 
    -
    701 // TODO-doc (error codes)
    -
    712 hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream=0);
    -
    713 
    -
    714 
    -
    715 // TODO-doc
    -
    716 /*
    -
    717  * This function is host-asynchronous and may return before the memset operation completes.
    -
    718  * Same as hipMemsetAsync with null stream.
    -
    719  *
    -
    720  * */
    -
    721 hipError_t hipMemset(void* dst, int value, size_t sizeBytes );
    -
    722 hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t = 0 );
    -
    723 
    -
    724 
    -
    725 /*
    -
    726  * @brief Query memory info. Return snapshot of free memory, and total allocatable memory on the device.
    -
    727  *
    -
    728  * Returns in *free a snapshot of the current free memory o
    -
    729  **/
    -
    730 hipError_t hipMemGetInfo (size_t * free, size_t * total) ;
    -
    731 
    -
    732 // doxygen end Memory
    -
    757 hipError_t hipDeviceCanAccessPeer ( int* canAccessPeer, int device, int peerDevice );
    -
    758 
    -
    759 // TODO-DOC
    -
    760 hipError_t hipDeviceDisablePeerAccess ( int peerDevice );
    -
    761 
    -
    762 // TODO-DOC
    -
    763 hipError_t hipDeviceEnablePeerAccess ( int peerDevice, unsigned int flags );
    -
    764 
    -
    765 // TODO-DOC
    -
    766 hipError_t hipMemcpyPeer ( void* dst, int dstDevice, const void* src, int srcDevice, size_t sizeBytes );
    -
    767 
    -
    768 // TODO-DOC
    -
    769 hipError_t hipMemcpyPeerAsync ( void* dst, int dstDevice, const void* src, int srcDevice, size_t sizeBytes, hipStream_t stream=0 );
    -
    770 // doxygen end PeerToPeer
    -
    794 hipError_t hipDriverGetVersion(int *driverVersion) ;
    -
    795 
    -
    796 
    -
    797 
    -
    798 // doxygen end Version Management
    -
    825 #ifdef __cplusplus
    -
    826 } /* extern "c" */
    -
    827 #endif
    -
    828 
    -
    829 
    -
    847 #ifdef __HCC__
    -
    848 #include <hc.hpp>
    -
    852 hipError_t hipHccGetAccelerator(int deviceId, hc::accelerator *acc);
    -
    853 
    -
    857 hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **av);
    -
    858 #endif
    -
    859 
    -
    860 
    -
    861 // end-group HCC_Specific
    -
    868 // doxygen end HIP API
    -
    hipError_t hipDeviceEnablePeerAccess(int peerDevice, unsigned int flags)
    Definition: hip_hcc.cpp:1575
    -
    hipError_t hipDeviceCanAccessPeer(int *canAccessPeer, int device, int peerDevice)
    Determine if a device can access a peer's memory.
    Definition: hip_hcc.cpp:1551
    +
    710 hipError_t hipMemcpyToSymbol(const char* symbolName, const void *src, size_t sizeBytes, size_t offset, hipMemcpyKind kind);
    +
    711 
    +
    712 
    +
    724 hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream=0);
    +
    725 
    +
    726 
    +
    739 hipError_t hipMemset(void* dst, int value, size_t sizeBytes );
    +
    740 
    +
    741 
    +
    755 hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t = 0 );
    +
    756 
    +
    757 
    +
    758 /*
    +
    759  * @brief Query memory info.
    +
    760  * Return snapshot of free memory, and total allocatable memory on the device.
    +
    761  *
    +
    762  * Returns in *free a snapshot of the current free memory o
    +
    763  **/
    +
    764 hipError_t hipMemGetInfo (size_t * free, size_t * total) ;
    +
    765 
    +
    766 // doxygen end Memory
    +
    791 hipError_t hipDeviceCanAccessPeer ( int* canAccessPeer, int device, int peerDevice );
    +
    792 
    +
    793 
    +
    794 
    +
    805 hipError_t hipDeviceDisablePeerAccess ( int peerDevice );
    +
    806 
    +
    816 hipError_t hipDeviceEnablePeerAccess ( int peerDevice, unsigned int flags );
    +
    817 
    +
    829 hipError_t hipMemcpyPeer ( void* dst, int dstDevice, const void* src, int srcDevice, size_t sizeBytes );
    +
    830 
    +
    843 hipError_t hipMemcpyPeerAsync ( void* dst, int dstDevice, const void* src, int srcDevice, size_t sizeBytes, hipStream_t stream=0 );
    +
    844 // doxygen end PeerToPeer
    +
    868 hipError_t hipDriverGetVersion(int *driverVersion) ;
    +
    869 
    +
    870 
    +
    871 
    +
    872 // doxygen end Version Management
    +
    899 #ifdef __cplusplus
    +
    900 } /* extern "c" */
    +
    901 #endif
    +
    902 
    +
    903 
    +
    921 #ifdef __HCC__
    +
    922 #include <hc.hpp>
    +
    926 hipError_t hipHccGetAccelerator(int deviceId, hc::accelerator *acc);
    +
    927 
    +
    931 hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **av);
    +
    932 #endif
    +
    933 
    +
    934 
    +
    935 // end-group HCC_Specific
    +
    942 // doxygen end HIP API
    +
    hipError_t hipDeviceEnablePeerAccess(int peerDevice, unsigned int flags)
    Enables registering memory on peerDevice for direct access from the current device.
    Definition: hip_hcc.cpp:1658
    +
    hipError_t hipDeviceCanAccessPeer(int *canAccessPeer, int device, int peerDevice)
    Determine if a device can access a peer's memory.
    Definition: hip_hcc.cpp:1634
    hipError_t hipPeekAtLastError(void)
    Return last error returned by any HIP runtime API call.
    -
    hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **av)
    Return hc::acclerator_view associated with the specified stream.
    Definition: hip_hcc.cpp:1647
    +
    hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **av)
    Return hc::accelerator_view associated with the specified stream.
    Definition: hip_hcc.cpp:1730
    struct dim3 dim3
    -
    hipError_t hipFreeHost(void *ptr)
    Definition: hip_hcc.cpp:1529
    -
    hipError_t hipFuncSetCacheConfig(hipFuncCache config)
    Set Cache configuration for a specific function.
    Definition: hip_hcc.cpp:704
    -
    no preference for shared memory or L1 (default)
    Definition: hip_runtime_api.h:65
    -
    uint32_t x
    x
    Definition: hip_runtime_api.h:88
    -
    Host-to-Device Copy.
    Definition: hip_runtime_api.h:103
    -
    hipError_t hipDeviceGetSharedMemConfig(hipSharedMemConfig *pConfig)
    Get Shared memory bank configuration.
    Definition: hip_hcc.cpp:734
    -
    hipError_t hipSetDevice(int device)
    Set default device to be used for subsequent hip API calls from this thread.
    Definition: hip_hcc.cpp:747
    -
    Device-to-Host Copy.
    Definition: hip_runtime_api.h:104
    -
    hipError_t hipEventSynchronize(hipEvent_t event)
    : Wait for an event to complete.
    Definition: hip_hcc.cpp:1074
    -
    hipError_t hipDeviceGetProperties(hipDeviceProp_t *prop, int device)
    Returns device properties.
    Definition: hip_hcc.cpp:801
    -
    hipFuncCache
    Definition: hip_runtime_api.h:64
    -
    hipError_t hipEventQuery(hipEvent_t event)
    Query event status.
    Definition: hip_hcc.cpp:1169
    -
    hipError_t hipDeviceGetCacheConfig(hipFuncCache *cacheConfig)
    Set Cache configuration for a specific function.
    Definition: hip_hcc.cpp:690
    -
    hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream=NULL)
    Record an event in the specified stream.
    Definition: hip_hcc.cpp:1025
    -
    hipError_t hipGetDevice(int *device)
    Return the default device id for the calling host thread.
    Definition: hip_hcc.cpp:645
    -
    hipError_t hipEventDestroy(hipEvent_t event)
    Destroy the specified event.
    Definition: hip_hcc.cpp:1059
    -
    hipError_t hipStreamCreateWithFlags(hipStream_t *stream, unsigned int flags)
    Create an asynchronous stream.
    Definition: hip_hcc.cpp:891
    -
    hipError_t hipDeviceDisablePeerAccess(int peerDevice)
    Definition: hip_hcc.cpp:1563
    -
    Definition: hip_runtime_api.h:87
    -
    uint32_t y
    y
    Definition: hip_runtime_api.h:89
    -
    prefer equal size L1 cache and shared memory
    Definition: hip_runtime_api.h:68
    -
    hipError_t hipEventCreateWithFlags(hipEvent_t *event, unsigned flags)
    Create an event with the specified flags.
    Definition: hip_hcc.cpp:1002
    -
    hipError_t hipMallocHost(void **ptr, size_t size)
    Definition: hip_hcc.cpp:1289
    -
    hipError_t hipEventElapsedTime(float *ms, hipEvent_t start, hipEvent_t stop)
    Return the elapsed time between two events.
    Definition: hip_hcc.cpp:1124
    -
    hipError_t hipMemcpyPeerAsync(void *dst, int dstDevice, const void *src, int srcDevice, size_t sizeBytes, hipStream_t stream=0)
    Definition: hip_hcc.cpp:1596
    -
    hipError_t hipGetDeviceCount(int *count)
    Return number of compute-capable devices.
    Definition: hip_hcc.cpp:658
    -
    hipError_t hipStreamDestroy(hipStream_t stream)
    Destroys the specified stream.
    Definition: hip_hcc.cpp:955
    -
    hipError_t hipStreamSynchronize(hipStream_t stream)
    Wait for all commands in stream to complete.
    Definition: hip_hcc.cpp:932
    +
    TODO-doc.
    +
    hipError_t hipMemsetAsync(void *dst, int value, size_t sizeBytes, hipStream_t=0)
    Fills the first sizeBytes bytes of the memory area pointed to by dev with the constant byte value val...
    Definition: hip_hcc.cpp:1513
    +
    hipError_t hipFreeHost(void *ptr)
    Free memory allocated by the hcc hip host memory allocation API.
    Definition: hip_hcc.cpp:1612
    +
    hipError_t hipMemcpyToSymbol(const char *symbolName, const void *src, size_t sizeBytes, size_t offset, hipMemcpyKind kind)
    Copies sizeBytes bytes from the memory area pointed to by src to the memory area pointed to by offset...
    Definition: hip_hcc.cpp:1410
    +
    hipError_t hipFuncSetCacheConfig(hipFuncCache config)
    Set Cache configuration for a specific function.
    Definition: hip_hcc.cpp:721
    +
    no preference for shared memory or L1 (default)
    Definition: hip_runtime_api.h:63
    +
    uint32_t x
    x
    Definition: hip_runtime_api.h:86
    +
    Host-to-Device Copy.
    Definition: hip_runtime_api.h:101
    +
    hipError_t hipDeviceGetSharedMemConfig(hipSharedMemConfig *pConfig)
    Get Shared memory bank configuration.
    Definition: hip_hcc.cpp:751
    +
    hipError_t hipSetDevice(int device)
    Set default device to be used for subsequent hip API calls from this thread.
    Definition: hip_hcc.cpp:764
    +
    Device-to-Host Copy.
    Definition: hip_runtime_api.h:102
    +
    hipError_t hipEventSynchronize(hipEvent_t event)
    : Wait for an event to complete.
    Definition: hip_hcc.cpp:1157
    +
    hipError_t hipDeviceGetProperties(hipDeviceProp_t *prop, int device)
    Returns device properties.
    Definition: hip_hcc.cpp:884
    +
    hipFuncCache
    Definition: hip_runtime_api.h:62
    +
    hipError_t hipEventQuery(hipEvent_t event)
    Query event status.
    Definition: hip_hcc.cpp:1252
    +
    hipError_t hipDeviceGetCacheConfig(hipFuncCache *cacheConfig)
    Set Cache configuration for a specific function.
    Definition: hip_hcc.cpp:707
    +
    hipError_t hipMemcpyPeer(void *dst, int dstDevice, const void *src, int srcDevice, size_t sizeBytes)
    Copies memory from one device to memory on another device.
    Definition: hip_hcc.cpp:1667
    +
    hipError_t hipDeviceGetAttribute(int *pi, hipDeviceAttribute_t attr, int device)
    Query device attribute.
    Definition: hip_hcc.cpp:812
    +
    hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream=NULL)
    Record an event in the specified stream.
    Definition: hip_hcc.cpp:1108
    +
    hipError_t hipGetDevice(int *device)
    Return the default device id for the calling host thread.
    Definition: hip_hcc.cpp:662
    +
    hipDeviceAttribute_t
    Definition: hip_runtime_api.h:134
    +
    hipError_t hipEventDestroy(hipEvent_t event)
    Destroy the specified event.
    Definition: hip_hcc.cpp:1142
    +
    hipError_t hipStreamCreateWithFlags(hipStream_t *stream, unsigned int flags)
    Create an asynchronous stream.
    Definition: hip_hcc.cpp:974
    +
    hipError_t hipDeviceDisablePeerAccess(int peerDevice)
    Disables registering memory on peerDevice for direct access from the current device.
    Definition: hip_hcc.cpp:1646
    +
    Definition: hip_runtime_api.h:85
    +
    uint32_t y
    y
    Definition: hip_runtime_api.h:87
    +
    prefer equal size L1 cache and shared memory
    Definition: hip_runtime_api.h:66
    +
    hipError_t hipEventCreateWithFlags(hipEvent_t *event, unsigned flags)
    Create an event with the specified flags.
    Definition: hip_hcc.cpp:1085
    +
    hipError_t hipMallocHost(void **ptr, size_t size)
    Allocate pinned host memory.
    Definition: hip_hcc.cpp:1372
    +
    hipError_t hipEventElapsedTime(float *ms, hipEvent_t start, hipEvent_t stop)
    Return the elapsed time between two events.
    Definition: hip_hcc.cpp:1207
    +
    hipError_t hipMemcpyPeerAsync(void *dst, int dstDevice, const void *src, int srcDevice, size_t sizeBytes, hipStream_t stream=0)
    Copies memory from one device to memory on another device.
    Definition: hip_hcc.cpp:1679
    +
    hipError_t hipGetDeviceCount(int *count)
    Return number of compute-capable devices.
    Definition: hip_hcc.cpp:675
    +
    hipError_t hipMemset(void *dst, int value, size_t sizeBytes)
    Copy data from src to dst asynchronously.
    Definition: hip_hcc.cpp:1558
    +
    hipError_t hipStreamDestroy(hipStream_t stream)
    Destroys the specified stream.
    Definition: hip_hcc.cpp:1038
    +
    hipError_t hipStreamSynchronize(hipStream_t stream)
    Wait for all commands in stream to complete.
    Definition: hip_hcc.cpp:1015
    Definition: hip_hcc.cpp:82
    -
    Shared mem is banked at 4-bytes intervals and performs best when adjacent threads access data 4 bytes...
    Definition: hip_runtime_api.h:77
    -
    hipError_t
    Definition: hip_runtime_api.h:112
    -
    hipMemcpyKind
    Definition: hip_runtime_api.h:101
    -
    prefer larger L1 cache and smaller shared memory
    Definition: hip_runtime_api.h:67
    -
    hipError_t hipDriverGetVersion(int *driverVersion)
    Returns the approximate HIP driver versin.
    Definition: hip_hcc.cpp:1608
    -
    hipError_t hipDeviceSynchronize(void)
    Blocks until the default device has completed all preceding requested tasks.
    Definition: hip_hcc.cpp:764
    -
    Definition: hip_runtime_api.h:124
    -
    hipError_t hipDeviceSetCacheConfig(hipFuncCache cacheConfig)
    Set L1/Shared cache partition.
    Definition: hip_hcc.cpp:676
    -
    hipError_t hipMalloc(void **ptr, size_t size)
    Definition: hip_hcc.cpp:1268
    -
    const char * hipGetErrorName(hipError_t hip_error)
    Return name of the specified error code in text form.
    Definition: hip_hcc.cpp:848
    -
    hipError_t hipGetLastError(void)
    Return last error returned by any HIP runtime API call and resets the stored error code to hipSuccess...
    Definition: hip_hcc.cpp:829
    -
    hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int flags)
    Make the specified compute stream wait for an event.
    Definition: hip_hcc.cpp:915
    -
    hipError_t hipStreamGetFlags(hipStream_t stream, unsigned int *flags)
    Return flags associated with this stream.
    Definition: hip_hcc.cpp:979
    -
    #define hipStreamDefault
    Flags that can be used with hipStreamCreateWithFlags.
    Definition: hip_runtime_api.h:50
    -
    hipError_t hipFree(void *ptr)
    Definition: hip_hcc.cpp:1513
    -
    uint32_t z
    z
    Definition: hip_runtime_api.h:90
    -
    hipError_t hipDeviceReset(void)
    Destroy all resources and reset all state on the default device in the current process.
    Definition: hip_hcc.cpp:779
    +
    Shared mem is banked at 4-bytes intervals and performs best when adjacent threads access data 4 bytes...
    Definition: hip_runtime_api.h:75
    +
    hipError_t
    Definition: hip_runtime_api.h:114
    +
    hipMemcpyKind
    Definition: hip_runtime_api.h:99
    +
    prefer larger L1 cache and smaller shared memory
    Definition: hip_runtime_api.h:65
    +
    hipError_t hipDriverGetVersion(int *driverVersion)
    Returns the approximate HIP driver version.
    Definition: hip_hcc.cpp:1691
    +
    hipError_t hipDeviceSynchronize(void)
    Blocks until the default device has completed all preceding requested tasks.
    Definition: hip_hcc.cpp:781
    +
    Definition: hip_runtime_api.h:122
    +
    hipError_t hipDeviceSetCacheConfig(hipFuncCache cacheConfig)
    Set L1/Shared cache partition.
    Definition: hip_hcc.cpp:693
    +
    hipError_t hipMalloc(void **ptr, size_t size)
    Allocate memory on the default accelerator.
    Definition: hip_hcc.cpp:1351
    +
    const char * hipGetErrorName(hipError_t hip_error)
    Return name of the specified error code in text form.
    Definition: hip_hcc.cpp:931
    +
    hipError_t hipGetLastError(void)
    Return last error returned by any HIP runtime API call and resets the stored error code to hipSuccess...
    Definition: hip_hcc.cpp:912
    +
    hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int flags)
    Make the specified compute stream wait for an event.
    Definition: hip_hcc.cpp:998
    +
    hipError_t hipStreamGetFlags(hipStream_t stream, unsigned int *flags)
    Return flags associated with this stream.
    Definition: hip_hcc.cpp:1062
    +
    #define hipStreamDefault
    Flags that can be used with hipStreamCreateWithFlags.
    Definition: hip_runtime_api.h:48
    +
    hipError_t hipFree(void *ptr)
    Free memory allocated by the hcc hip memory allocation API. This API performs an implicit hipDeviceSy...
    Definition: hip_hcc.cpp:1596
    +
    uint32_t z
    z
    Definition: hip_runtime_api.h:88
    +
    hipError_t hipDeviceReset(void)
    Destroy all resources and reset all state on the default device in the current process.
    Definition: hip_hcc.cpp:796
    Definition: hip_runtime_api.h:74
    -
    hipError_t hipMemcpyAsync(void *dst, const void *src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream=0)
    Definition: hip_hcc.cpp:1388
    -
    The compiler selects a device-specific value for the banking.
    Definition: hip_runtime_api.h:76
    -
    Device-to-Device Copy.
    Definition: hip_runtime_api.h:105
    +
    hipError_t hipMemcpyAsync(void *dst, const void *src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream=0)
    Copy data from src to dst asynchronously.
    Definition: hip_hcc.cpp:1471
    +
    The compiler selects a device-specific value for the banking.
    Definition: hip_runtime_api.h:74
    +
    Device-to-Device Copy.
    Definition: hip_runtime_api.h:103
    Definition: hip_hcc.cpp:107
    -
    Runtime will automatically determine copy-kind based on virtual addresses.
    Definition: hip_runtime_api.h:106
    -
    hipSharedMemConfig
    Definition: hip_runtime_api.h:75
    -
    hipError_t hipMemcpy(void *dst, const void *src, size_t sizeBytes, hipMemcpyKind kind)
    Definition: hip_hcc.cpp:1346
    -
    hipError_t hipDeviceSetSharedMemConfig(hipSharedMemConfig config)
    Set Shared memory bank configuration.
    Definition: hip_hcc.cpp:719
    -
    prefer larger shared memory and smaller L1 cache
    Definition: hip_runtime_api.h:66
    -
    Host-to-Host Copy.
    Definition: hip_runtime_api.h:102
    -
    hipError_t hipHccGetAccelerator(int deviceId, hc::accelerator *acc)
    Return hc::acclerator associated with the specified deviceId.
    Definition: hip_hcc.cpp:1627
    -
    Shared mem is banked at 8-byte intervals and performs best when adjacent threads access data 4 bytes ...
    Definition: hip_runtime_api.h:78
    -
    const char * hipGetErrorString(hipError_t hip_error)
    Return handy text string message to explain the error which occurred.
    Definition: hip_hcc.cpp:875
    +
    Runtime will automatically determine copy-kind based on virtual addresses.
    Definition: hip_runtime_api.h:104
    +
    hipSharedMemConfig
    Definition: hip_runtime_api.h:73
    +
    hipError_t hipMemcpy(void *dst, const void *src, size_t sizeBytes, hipMemcpyKind kind)
    Copy data from src to dst.
    Definition: hip_hcc.cpp:1429
    +
    hipError_t hipDeviceSetSharedMemConfig(hipSharedMemConfig config)
    Set Shared memory bank configuration.
    Definition: hip_hcc.cpp:736
    +
    prefer larger shared memory and smaller L1 cache
    Definition: hip_runtime_api.h:64
    +
    Host-to-Host Copy.
    Definition: hip_runtime_api.h:100
    +
    hipError_t hipHccGetAccelerator(int deviceId, hc::accelerator *acc)
    Return hc::accelerator associated with the specified deviceId.
    Definition: hip_hcc.cpp:1710
    +
    Shared mem is banked at 8-byte intervals and performs best when adjacent threads access data 4 bytes ...
    Definition: hip_runtime_api.h:76
    +
    const char * hipGetErrorString(hipError_t hip_error)
    Return handy text string message to explain the error which occurred.
    Definition: hip_hcc.cpp:958
    diff --git a/projects/hip/docs/RuntimeAPI/html/hcc__detail_2hip__vector__types_8h_source.html b/projects/hip/docs/RuntimeAPI/html/hcc__detail_2hip__vector__types_8h_source.html index 73911b9082..9bd0f090ab 100644 --- a/projects/hip/docs/RuntimeAPI/html/hcc__detail_2hip__vector__types_8h_source.html +++ b/projects/hip/docs/RuntimeAPI/html/hcc__detail_2hip__vector__types_8h_source.html @@ -4,7 +4,7 @@ -HIP: Heterogenous-computing Interface for Portability: /home/fpadmin/ben/HIP6/include/hcc_detail/hip_vector_types.h Source File +HIP: Heterogenous-computing Interface for Portability: /home/bensander/HIP.public/include/hcc_detail/hip_vector_types.h Source File @@ -89,7 +89,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
    hip_vector_types.h
    -
    1 /*
    +Go to the documentation of this file.
    1 /*
    2 Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
    3 
    4 Permission is hereby granted, free of charge, to any person obtaining a copy
    @@ -110,147 +110,177 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
    19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    20 THE SOFTWARE.
    21 */
    -
    22 #if defined (__HCC__) && (__hcc_workweek__ < 16032)
    -
    23 #error("This version of HIP requires a newer version of HCC.");
    -
    24 #endif
    -
    25 
    -
    26 #include <hc_short_vector.hpp>
    -
    27 
    -
    28 // Define char vector types
    -
    29 typedef hc::short_vector::char1 char1;
    -
    30 typedef hc::short_vector::char2 char2;
    -
    31 typedef hc::short_vector::char3 char3;
    -
    32 typedef hc::short_vector::char4 char4;
    +
    22 
    +
    28 #if defined (__HCC__) && (__hcc_workweek__ < 16032)
    +
    29 #error("This version of HIP requires a newer version of HCC.");
    +
    30 #endif
    +
    31 
    +
    32 #include <hc_short_vector.hpp>
    33 
    -
    34 // Define uchar vector types
    -
    35 typedef hc::short_vector::uchar1 uchar1;
    -
    36 typedef hc::short_vector::uchar2 uchar2;
    -
    37 typedef hc::short_vector::uchar3 uchar3;
    -
    38 typedef hc::short_vector::uchar4 uchar4;
    -
    39 
    -
    40 // Define short vector types
    -
    41 typedef hc::short_vector::short1 short1;
    -
    42 typedef hc::short_vector::short2 short2;
    -
    43 typedef hc::short_vector::short3 short3;
    -
    44 typedef hc::short_vector::short4 short4;
    -
    45 
    -
    46 // Define ushort vector types
    -
    47 typedef hc::short_vector::ushort1 ushort1;
    -
    48 typedef hc::short_vector::ushort2 ushort2;
    -
    49 typedef hc::short_vector::ushort3 ushort3;
    -
    50 typedef hc::short_vector::ushort4 ushort4;
    -
    51 
    -
    52 // Define int vector types
    -
    53 typedef hc::short_vector::int1 int1;
    -
    54 typedef hc::short_vector::int2 int2;
    -
    55 typedef hc::short_vector::int3 int3;
    -
    56 typedef hc::short_vector::int4 int4;
    -
    57 
    -
    58 // Define uint vector types
    -
    59 typedef hc::short_vector::uint1 uint1;
    -
    60 typedef hc::short_vector::uint2 uint2;
    -
    61 typedef hc::short_vector::uint3 uint3;
    -
    62 typedef hc::short_vector::uint4 uint4;
    -
    63 
    -
    64 // Define long vector types
    -
    65 typedef hc::short_vector::long1 long1;
    -
    66 typedef hc::short_vector::long2 long2;
    -
    67 typedef hc::short_vector::long3 long3;
    -
    68 typedef hc::short_vector::long4 long4;
    -
    69 
    -
    70 // Define ulong vector types
    -
    71 typedef hc::short_vector::ulong1 ulong1;
    -
    72 typedef hc::short_vector::ulong2 ulong2;
    -
    73 typedef hc::short_vector::ulong3 ulong3;
    -
    74 typedef hc::short_vector::ulong4 ulong4;
    -
    75 
    -
    76 // Define longlong vector types
    -
    77 typedef hc::short_vector::longlong1 longlong1;
    -
    78 typedef hc::short_vector::longlong2 longlong2;
    -
    79 typedef hc::short_vector::longlong3 longlong3;
    -
    80 typedef hc::short_vector::longlong4 longlong4;
    -
    81 
    -
    82 // Define ulonglong vector types
    -
    83 typedef hc::short_vector::ulonglong1 ulonglong1;
    -
    84 typedef hc::short_vector::ulonglong2 ulonglong2;
    -
    85 typedef hc::short_vector::ulonglong3 ulonglong3;
    -
    86 typedef hc::short_vector::ulonglong4 ulonglong4;
    -
    87 
    -
    88 // Define float vector types
    -
    89 typedef hc::short_vector::float1 float1;
    -
    90 typedef hc::short_vector::float2 float2;
    -
    91 typedef hc::short_vector::float3 float3;
    -
    92 typedef hc::short_vector::float4 float4;
    -
    93 
    -
    94 // Define double vector types
    -
    95 typedef hc::short_vector::double1 double1;
    -
    96 typedef hc::short_vector::double2 double2;
    -
    97 typedef hc::short_vector::double3 double3;
    -
    98 typedef hc::short_vector::double4 double4;
    -
    99 
    -
    100 // Inline functions for creating vector types from basic types
    -
    101 inline char1 make_char1(signed char x) { char1 t; t.x = x; return t; };
    -
    102 inline char2 make_char2(signed char x, signed char y) { char2 t; t.x = x; t.y = y; return t; };
    -
    103 inline char3 make_char3(signed char x, signed char y, signed char z) { char3 t; t.x = x; t.y = y; t.z = z; return t; };
    -
    104 inline char4 make_char4(signed char x, signed char y, signed char z, signed char w) { char4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; };
    -
    105 
    -
    106 inline uchar1 make_uchar1(unsigned char x) { uchar1 t; t.x = x; return t; };
    -
    107 inline uchar2 make_uchar2(unsigned char x, unsigned char y) { uchar2 t; t.x = x; t.y = y; return t; };
    -
    108 inline uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z) { uchar3 t; t.x = x; t.y = y; t.z = z; return t; };
    -
    109 inline uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w) { uchar4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; };
    +
    34 //-- Signed
    +
    35 // Define char vector types
    +
    36 typedef hc::short_vector::char1 char1;
    +
    37 typedef hc::short_vector::char2 char2;
    +
    38 typedef hc::short_vector::char3 char3;
    +
    39 typedef hc::short_vector::char4 char4;
    +
    40 
    +
    41 // Define short vector types
    +
    42 typedef hc::short_vector::short1 short1;
    +
    43 typedef hc::short_vector::short2 short2;
    +
    44 typedef hc::short_vector::short3 short3;
    +
    45 typedef hc::short_vector::short4 short4;
    +
    46 
    +
    47 // Define int vector types
    +
    48 typedef hc::short_vector::int1 int1;
    +
    49 typedef hc::short_vector::int2 int2;
    +
    50 typedef hc::short_vector::int3 int3;
    +
    51 typedef hc::short_vector::int4 int4;
    +
    52 
    +
    53 // Define long vector types
    +
    54 typedef hc::short_vector::long1 long1;
    +
    55 typedef hc::short_vector::long2 long2;
    +
    56 typedef hc::short_vector::long3 long3;
    +
    57 typedef hc::short_vector::long4 long4;
    +
    58 
    +
    59 // Define longlong vector types
    +
    60 typedef hc::short_vector::longlong1 longlong1;
    +
    61 typedef hc::short_vector::longlong2 longlong2;
    +
    62 typedef hc::short_vector::longlong3 longlong3;
    +
    63 typedef hc::short_vector::longlong4 longlong4;
    +
    64 
    +
    65 
    +
    66 //-- Unsigned
    +
    67 // Define uchar vector types
    +
    68 typedef hc::short_vector::uchar1 uchar1;
    +
    69 typedef hc::short_vector::uchar2 uchar2;
    +
    70 typedef hc::short_vector::uchar3 uchar3;
    +
    71 typedef hc::short_vector::uchar4 uchar4;
    +
    72 
    +
    73 // Define ushort vector types
    +
    74 typedef hc::short_vector::ushort1 ushort1;
    +
    75 typedef hc::short_vector::ushort2 ushort2;
    +
    76 typedef hc::short_vector::ushort3 ushort3;
    +
    77 typedef hc::short_vector::ushort4 ushort4;
    +
    78 
    +
    79 // Define uint vector types
    +
    80 typedef hc::short_vector::uint1 uint1;
    +
    81 typedef hc::short_vector::uint2 uint2;
    +
    82 typedef hc::short_vector::uint3 uint3;
    +
    83 typedef hc::short_vector::uint4 uint4;
    +
    84 
    +
    85 // Define ulong vector types
    +
    86 typedef hc::short_vector::ulong1 ulong1;
    +
    87 typedef hc::short_vector::ulong2 ulong2;
    +
    88 typedef hc::short_vector::ulong3 ulong3;
    +
    89 typedef hc::short_vector::ulong4 ulong4;
    +
    90 
    +
    91 // Define ulonglong vector types
    +
    92 typedef hc::short_vector::ulonglong1 ulonglong1;
    +
    93 typedef hc::short_vector::ulonglong2 ulonglong2;
    +
    94 typedef hc::short_vector::ulonglong3 ulonglong3;
    +
    95 typedef hc::short_vector::ulonglong4 ulonglong4;
    +
    96 
    +
    97 
    +
    98 //-- Floating point
    +
    99 // Define float vector types
    +
    100 typedef hc::short_vector::float1 float1;
    +
    101 typedef hc::short_vector::float2 float2;
    +
    102 typedef hc::short_vector::float3 float3;
    +
    103 typedef hc::short_vector::float4 float4;
    +
    104 
    +
    105 // Define double vector types
    +
    106 typedef hc::short_vector::double1 double1;
    +
    107 typedef hc::short_vector::double2 double2;
    +
    108 typedef hc::short_vector::double3 double3;
    +
    109 typedef hc::short_vector::double4 double4;
    110 
    -
    111 inline short1 make_short1(short x) { short1 t; t.x = x; return t; };
    -
    112 inline short2 make_short2(short x, short y) { short2 t; t.x = x; t.y = y; return t; };
    -
    113 inline short3 make_short3(short x,short y, short z) { short3 t; t.x = x; t.y = y; t.z = z; return t; };
    -
    114 inline short4 make_short4(short x, short y, short z, short w) { short4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; };
    -
    115 
    -
    116 inline ushort1 make_ushort1(unsigned short x) { ushort1 t; t.x = x; return t; };
    -
    117 inline ushort2 make_ushort2(unsigned short x, unsigned short y) { ushort2 t; t.x = x; t.y = y; return t; };
    -
    118 inline ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z) { ushort3 t; t.x = x; t.y = y; t.z = z; return t; };
    -
    119 inline ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w) { ushort4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; };
    -
    120 
    -
    121 inline int1 make_int1(int x) { int1 t; t.x = x; return t; };
    -
    122 inline int2 make_int2(int x, int y) { int2 t; t.x = x; t.y = y; return t; };
    -
    123 inline int3 make_int3(int x, int y, int z) { int3 t; t.x = x; t.y = y; t.z = z; return t; };
    -
    124 inline int4 make_int4(int x, int y, int z, int w) { int4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; };
    -
    125 inline uint1 make_uint1(unsigned int x) { uint1 t; t.x = x; return t; };
    -
    126 inline uint2 make_uint2(unsigned int x, unsigned int y) { uint2 t; t.x = x; t.y = y; return t; };
    -
    127 inline uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z) { uint3 t; t.x = x; t.y = y; t.z = z; return t; };
    -
    128 inline uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w) { uint4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; };
    -
    129 
    -
    130 inline long1 make_long1(long int x) { long1 t; t.x = x; return t; };
    -
    131 inline long2 make_long2(long int x, long int y) { long2 t; t.x = x; t.y = y; return t; };
    -
    132 inline long3 make_long3(long int x, long int y, long int z) { long3 t; t.x = x; t.y = y; t.z = z; return t; };
    -
    133 inline long4 make_long4(long int x, long int y, long int z, long int w) { long4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; };
    -
    134 
    -
    135 inline ulong1 make_ulong1(unsigned long int x) { ulong1 t; t.x = x; return t; };
    -
    136 inline ulong2 make_ulong2(unsigned long int x, unsigned long int y) { ulong2 t; t.x = x; t.y = y; return t; };
    -
    137 inline ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z) { ulong3 t; t.x = x; t.y = y; t.z = z; return t; };
    -
    138 inline ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w) { ulong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; };
    -
    139 
    -
    140 inline longlong1 make_longlong1(long long int x) { longlong1 t; t.x = x; return t; };
    -
    141 inline longlong2 make_longlong2(long long int x, long long int y) { longlong2 t; t.x = x; t.y = y; return t; };
    -
    142 inline longlong3 make_longlong3(long long int x, long long int y, long long int z) { longlong3 t; t.x = x; t.y = y; t.z = z; return t; };
    -
    143 inline longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w) { longlong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; };
    -
    144 
    -
    145 inline ulonglong1 make_ulonglong1(unsigned long long int x) { ulonglong1 t; t.x = x; return t; };
    -
    146 inline ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y) { ulonglong2 t; t.x = x; t.y = y; return t; };
    -
    147 inline ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z) { ulonglong3 t; t.x = x; t.y = y; t.z = z; return t; };
    -
    148 inline ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w) { ulonglong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; };
    -
    149 
    -
    150 inline float1 make_float1(float x) { float1 t; t.x = x; return t; };
    -
    151 inline float2 make_float2(float x, float y) { float2 t; t.x = x; t.y = y; return t; };
    -
    152 inline float3 make_float3(float x, float y, float z) { float3 t; t.x = x; t.y = y; t.z = z; return t; };
    -
    153 inline float4 make_float4(float x, float y, float z, float w) { float4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; };
    -
    154 
    -
    155 inline double1 make_double1(double x) { double1 t; t.x = x; return t; };
    -
    156 inline double2 make_double2(double x, double y) { double2 t; t.x = x; t.y = y; return t; };
    -
    157 inline double3 make_double3(double x, double y, double z) { double3 t; t.x = x; t.y = y; t.z = z; return t; };
    -
    158 inline double4 make_double4(double x, double y, double z, double w) { double4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; };
    +
    111 
    +
    113 // Inline functions for creating vector types from basic types
    +
    114 #define ONE_COMPONENT_ACCESS(T, VT) inline VT make_ ##VT (T x) { VT t; t.x = x; return t; };
    +
    115 #define TWO_COMPONENT_ACCESS(T, VT) inline VT make_ ##VT (T x, T y) { VT t; t.x=x; t.y=y; return t; };
    +
    116 #define THREE_COMPONENT_ACCESS(T, VT) inline VT make_ ##VT (T x, T y, T z) { VT t; t.x=x; t.y=y; t.z=z; return t; };
    +
    117 #define FOUR_COMPONENT_ACCESS(T, VT) inline VT make_ ##VT (T x, T y, T z, T w) { VT t; t.x=x; t.y=y; t.z=z; t.w=w; return t; };
    +
    118 
    +
    119 
    +
    120 //signed:
    +
    121 ONE_COMPONENT_ACCESS (signed char, char1);
    +
    122 TWO_COMPONENT_ACCESS (signed char, char2);
    +
    123 THREE_COMPONENT_ACCESS(signed char, char3);
    +
    124 FOUR_COMPONENT_ACCESS (signed char, char4);
    +
    125 
    +
    126 ONE_COMPONENT_ACCESS (short, short1);
    +
    127 TWO_COMPONENT_ACCESS (short, short2);
    +
    128 THREE_COMPONENT_ACCESS(short, short3);
    +
    129 FOUR_COMPONENT_ACCESS (short, short4);
    +
    130 
    +
    131 ONE_COMPONENT_ACCESS (int, int1);
    +
    132 TWO_COMPONENT_ACCESS (int, int2);
    +
    133 THREE_COMPONENT_ACCESS(int, int3);
    +
    134 FOUR_COMPONENT_ACCESS (int, int4);
    +
    135 
    +
    136 ONE_COMPONENT_ACCESS (long int, long1);
    +
    137 TWO_COMPONENT_ACCESS (long int, long2);
    +
    138 THREE_COMPONENT_ACCESS(long int, long3);
    +
    139 FOUR_COMPONENT_ACCESS (long int, long4);
    +
    140 
    +
    141 ONE_COMPONENT_ACCESS (long long int, ulong1);
    +
    142 TWO_COMPONENT_ACCESS (long long int, ulong2);
    +
    143 THREE_COMPONENT_ACCESS(long long int, ulong3);
    +
    144 FOUR_COMPONENT_ACCESS (long long int, ulong4);
    +
    145 
    +
    146 ONE_COMPONENT_ACCESS (long long int, longlong1);
    +
    147 TWO_COMPONENT_ACCESS (long long int, longlong2);
    +
    148 THREE_COMPONENT_ACCESS(long long int, longlong3);
    +
    149 FOUR_COMPONENT_ACCESS (long long int, longlong4);
    +
    150 
    +
    151 
    +
    152 // unsigned:
    +
    153 ONE_COMPONENT_ACCESS (unsigned char, uchar1);
    +
    154 TWO_COMPONENT_ACCESS (unsigned char, uchar2);
    +
    155 THREE_COMPONENT_ACCESS(unsigned char, uchar3);
    +
    156 FOUR_COMPONENT_ACCESS (unsigned char, uchar4);
    +
    157 
    +
    158 ONE_COMPONENT_ACCESS (unsigned short, ushort1);
    +
    159 TWO_COMPONENT_ACCESS (unsigned short, ushort2);
    +
    160 THREE_COMPONENT_ACCESS(unsigned short, ushort3);
    +
    161 FOUR_COMPONENT_ACCESS (unsigned short, ushort4);
    +
    162 
    +
    163 ONE_COMPONENT_ACCESS (unsigned int, uint1);
    +
    164 TWO_COMPONENT_ACCESS (unsigned int, uint2);
    +
    165 THREE_COMPONENT_ACCESS(unsigned int, uint3);
    +
    166 FOUR_COMPONENT_ACCESS (unsigned int, uint4);
    +
    167 
    +
    168 ONE_COMPONENT_ACCESS (unsigned long int, ulong1);
    +
    169 TWO_COMPONENT_ACCESS (unsigned long int, ulong2);
    +
    170 THREE_COMPONENT_ACCESS(unsigned long int, ulong3);
    +
    171 FOUR_COMPONENT_ACCESS (unsigned long int, ulong4);
    +
    172 
    +
    173 ONE_COMPONENT_ACCESS (unsigned long long int, ulong1);
    +
    174 TWO_COMPONENT_ACCESS (unsigned long long int, ulong2);
    +
    175 THREE_COMPONENT_ACCESS(unsigned long long int, ulong3);
    +
    176 FOUR_COMPONENT_ACCESS (unsigned long long int, ulong4);
    +
    177 
    +
    178 ONE_COMPONENT_ACCESS (unsigned long long int, ulonglong1);
    +
    179 TWO_COMPONENT_ACCESS (unsigned long long int, ulonglong2);
    +
    180 THREE_COMPONENT_ACCESS(unsigned long long int, ulonglong3);
    +
    181 FOUR_COMPONENT_ACCESS (unsigned long long int, ulonglong4);
    +
    182 
    +
    183 
    +
    184 //Floating point
    +
    185 ONE_COMPONENT_ACCESS (float, float1);
    +
    186 TWO_COMPONENT_ACCESS (float, float2);
    +
    187 THREE_COMPONENT_ACCESS(float, float3);
    +
    188 FOUR_COMPONENT_ACCESS (float, float4);
    +
    189 
    +
    190 ONE_COMPONENT_ACCESS (double, double1);
    +
    191 TWO_COMPONENT_ACCESS (double, double2);
    +
    192 THREE_COMPONENT_ACCESS(double, double3);
    +
    193 FOUR_COMPONENT_ACCESS (double, double4);
    +
    #define ONE_COMPONENT_ACCESS(T, VT)
    Definition: hip_vector_types.h:114
    diff --git a/projects/hip/docs/RuntimeAPI/html/hierarchy.html b/projects/hip/docs/RuntimeAPI/html/hierarchy.html index f10211f59f..801ddcd197 100644 --- a/projects/hip/docs/RuntimeAPI/html/hierarchy.html +++ b/projects/hip/docs/RuntimeAPI/html/hierarchy.html @@ -104,7 +104,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
    diff --git a/projects/hip/docs/RuntimeAPI/html/hip__common_8h_source.html b/projects/hip/docs/RuntimeAPI/html/hip__common_8h_source.html index aa62aa5504..5a5a586bdf 100644 --- a/projects/hip/docs/RuntimeAPI/html/hip__common_8h_source.html +++ b/projects/hip/docs/RuntimeAPI/html/hip__common_8h_source.html @@ -4,7 +4,7 @@ -HIP: Heterogenous-computing Interface for Portability: /home/fpadmin/ben/HIP6/include/hip_common.h Source File +HIP: Heterogenous-computing Interface for Portability: /home/bensander/HIP.public/include/hip_common.h Source File @@ -123,61 +123,65 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
    32 #define __HIP_PLATFORM_HCC__
    33 #define __HIPCC__
    34 
    -
    35 # if defined __HCC_ACCELERATOR__
    -
    36 # define __HIP_DEVICE_COMPILE__ 1
    -
    37 # endif
    -
    38 #endif
    -
    39 
    -
    40 // Auto enable __HIP_PLATFORM_NVCC__ if compiling with NVCC
    -
    41 #if defined(__NVCC__)
    -
    42 #define __HIP_PLATFORM_NVCC__
    -
    43 # ifdef __CUDACC__
    -
    44 # define __HIPCC__
    -
    45 # endif
    -
    46 
    -
    47 # ifdef __CUDA_ARCH__
    -
    48 # define __HIP_DEVICE_COMPILE__ 1
    -
    49 # endif
    -
    50 
    -
    51 #endif
    -
    52 
    -
    53 
    -
    54 
    -
    55 
    -
    56 #if __HIP_DEVICE_COMPILE__ == 0
    -
    57 // 32-bit Atomics
    -
    58 #define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (0)
    -
    59 #define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (0)
    -
    60 #define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (0)
    -
    61 #define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (0)
    -
    62 #define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (0)
    -
    63 
    -
    64 // 64-bit Atomics
    -
    65 #define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (0)
    -
    66 #define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (0)
    +
    35 #if defined(__HCC_ACCELERATOR__) and (__HCC_ACCELERATOR__ != 0)
    +
    36 #define __HIP_DEVICE_COMPILE__ 1
    +
    37 #else
    +
    38 #define __HIP_DEVICE_COMPILE__ 0
    +
    39 #endif
    +
    40 #endif
    +
    41 
    +
    42 // Auto enable __HIP_PLATFORM_NVCC__ if compiling with NVCC
    +
    43 #if defined(__NVCC__)
    +
    44 #define __HIP_PLATFORM_NVCC__
    +
    45 # ifdef __CUDACC__
    +
    46 # define __HIPCC__
    +
    47 # endif
    +
    48 
    +
    49 #if defined(__CUDA_ARCH__) and (__CUDA_ARCH__ != 0)
    +
    50 #define __HIP_DEVICE_COMPILE__ 1
    +
    51 #else
    +
    52 #define __HIP_DEVICE_COMPILE__ 0
    +
    53 #endif
    +
    54 
    +
    55 #endif
    +
    56 
    +
    57 
    +
    58 
    +
    59 
    +
    60 #if __HIP_DEVICE_COMPILE__ == 0
    +
    61 // 32-bit Atomics
    +
    62 #define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (0)
    +
    63 #define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (0)
    +
    64 #define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (0)
    +
    65 #define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (0)
    +
    66 #define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (0)
    67 
    -
    68 // Doubles
    -
    69 #define __HIP_ARCH_HAS_DOUBLES__ (0)
    -
    70 
    -
    71 // Warp cross-lane operations
    -
    72 #define __HIP_ARCH_HAS_WARP_VOTE__ (0)
    -
    73 #define __HIP_ARCH_HAS_WARP_BALLOT__ (0)
    -
    74 #define __HIP_ARCH_HAS_WARP_SHUFFLE__ (0)
    -
    75 #define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)
    -
    76 
    -
    77 // Sync
    -
    78 #define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (0)
    -
    79 #define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)
    +
    68 // 64-bit Atomics
    +
    69 #define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (0)
    +
    70 #define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (0)
    +
    71 
    +
    72 // Doubles
    +
    73 #define __HIP_ARCH_HAS_DOUBLES__ (0)
    +
    74 
    +
    75 // Warp cross-lane operations
    +
    76 #define __HIP_ARCH_HAS_WARP_VOTE__ (0)
    +
    77 #define __HIP_ARCH_HAS_WARP_BALLOT__ (0)
    +
    78 #define __HIP_ARCH_HAS_WARP_SHUFFLE__ (0)
    +
    79 #define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)
    80 
    -
    81 // Misc
    -
    82 #define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
    -
    83 #define __HIP_ARCH_HAS_3DGRID__ (0)
    -
    84 #define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)
    -
    85 #endif
    +
    81 // Sync
    +
    82 #define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (0)
    +
    83 #define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)
    +
    84 
    +
    85 // Misc
    +
    86 #define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
    +
    87 #define __HIP_ARCH_HAS_3DGRID__ (0)
    +
    88 #define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)
    +
    89 #endif
    diff --git a/projects/hip/docs/RuntimeAPI/html/hip__hcc_8cpp.html b/projects/hip/docs/RuntimeAPI/html/hip__hcc_8cpp.html index 733d1a79a7..adfc13a1df 100644 --- a/projects/hip/docs/RuntimeAPI/html/hip__hcc_8cpp.html +++ b/projects/hip/docs/RuntimeAPI/html/hip__hcc_8cpp.html @@ -4,7 +4,7 @@ -HIP: Heterogenous-computing Interface for Portability: /home/fpadmin/ben/HIP6/src/hip_hcc.cpp File Reference +HIP: Heterogenous-computing Interface for Portability: /home/bensander/HIP.public/src/hip_hcc.cpp File Reference @@ -223,6 +223,9 @@ hc::accelerator_view * ihi hipError_t hipDeviceReset (void)  Destroy all resources and reset all state on the default device in the current process. More...
      +hipError_t hipDeviceGetAttribute (int *pi, hipDeviceAttribute_t attr, int device) + Query device attribute. More...
    hipError_t hipDeviceGetProperties (hipDeviceProp_t *props, int device)  Returns device properties. More...
      @@ -283,49 +286,58 @@ template<typename T > hc::completion_future ihipMemsetKernel (hipStream_t stream, T *ptr, T val, size_t sizeBytes)   hipError_t hipMalloc (void **ptr, size_t sizeBytes) + Allocate memory on the default accelerator. More...
      hipError_t hipMallocHost (void **ptr, size_t sizeBytes) + Allocate pinned host memory. More...
      - -hipError_t hipMemcpyToSymbol (const char *symbolName, const void *src, size_t count, size_t offset, hipMemcpyKind kind) +hipError_t hipMemcpyToSymbol (const char *symbolName, const void *src, size_t count, size_t offset, hipMemcpyKind kind) + Copies sizeBytes bytes from the memory area pointed to by src to the memory area pointed to by offset bytes from the start of symbol symbol. More...
      hipError_t hipMemcpy (void *dst, const void *src, size_t sizeBytes, hipMemcpyKind kind) + Copy data from src to dst. More...
      hipError_t hipMemcpyAsync (void *dst, const void *src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream) + Copy data from src to dst asynchronously. More...
      - -hipError_t hipMemsetAsync (void *dst, int value, size_t sizeBytes, hipStream_t stream) +hipError_t hipMemsetAsync (void *dst, int value, size_t sizeBytes, hipStream_t stream) + Fills the first sizeBytes bytes of the memory area pointed to by dev with the constant byte value value. More...
      - -hipError_t hipMemset (void *dst, int value, size_t sizeBytes) +hipError_t hipMemset (void *dst, int value, size_t sizeBytes) + Copy data from src to dst asynchronously. More...
      hipError_t hipMemGetInfo (size_t *free, size_t *total)   hipError_t hipFree (void *ptr) + Free memory allocated by the hcc hip memory allocation API. This API performs an implicit hipDeviceSynchronize() call. More...
      hipError_t hipFreeHost (void *ptr) + Free memory allocated by the hcc hip host memory allocation API. More...
      hipError_t hipDeviceCanAccessPeer (int *canAccessPeer, int device, int peerDevice)  Determine if a device can access a peer's memory. More...
      hipError_t hipDeviceDisablePeerAccess (int peerDevice) + Disables registering memory on peerDevice for direct access from the current device. More...
      hipError_t hipDeviceEnablePeerAccess (int peerDevice, unsigned int flags) + Enables registering memory on peerDevice for direct access from the current device. More...
      - -hipError_t hipMemcpyPeer (void *dst, int dstDevice, const void *src, int srcDevice, size_t sizeBytes) +hipError_t hipMemcpyPeer (void *dst, int dstDevice, const void *src, int srcDevice, size_t sizeBytes) + Copies memory from one device to memory on another device. More...
      hipError_t hipMemcpyPeerAsync (void *dst, int dstDevice, const void *src, int srcDevice, size_t sizeBytes, hipStream_t stream) + Copies memory from one device to memory on another device. More...
      hipError_t hipDriverGetVersion (int *driverVersion) - Returns the approximate HIP driver versin. More...
    + Returns the approximate HIP driver version. More...
      hipError_t hipHccGetAccelerator (int deviceId, hc::accelerator *acc) - Return hc::acclerator associated with the specified deviceId. More...
    + Return hc::accelerator associated with the specified deviceId. More...
      hipError_t hipHccGetAcceleratorView (hipStream_t stream, hc::accelerator_view **av) - Return hc::acclerator_view associated with the specified stream. More...
    + Return hc::accelerator_view associated with the specified stream. More...
     

    @@ -466,7 +478,7 @@ std::vector< ihipDevice_t & diff --git a/projects/hip/docs/RuntimeAPI/html/hip__runtime_8h_source.html b/projects/hip/docs/RuntimeAPI/html/hip__runtime_8h_source.html index d79618630c..40c09acfeb 100644 --- a/projects/hip/docs/RuntimeAPI/html/hip__runtime_8h_source.html +++ b/projects/hip/docs/RuntimeAPI/html/hip__runtime_8h_source.html @@ -4,7 +4,7 @@ -HIP: Heterogenous-computing Interface for Portability: /home/fpadmin/ben/HIP6/include/hip_runtime.h Source File +HIP: Heterogenous-computing Interface for Portability: /home/bensander/HIP.public/include/hip_runtime.h Source File @@ -135,14 +135,16 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
    54 #endif
    55 
    56 
    -
    57 #include <hip_runtime_api.h>
    -
    58 #include <hip_vector_types.h>
    +
    57 #include <hip_runtime_api.h>
    +
    58 #include <hip_vector_types.h>
    59 
    - + +
    Contains definitions of APIs for HIP runtime.
    + diff --git a/projects/hip/docs/RuntimeAPI/html/hip__runtime__api_8h_source.html b/projects/hip/docs/RuntimeAPI/html/hip__runtime__api_8h_source.html index 66d3cc5cfa..4ef99226aa 100644 --- a/projects/hip/docs/RuntimeAPI/html/hip__runtime__api_8h_source.html +++ b/projects/hip/docs/RuntimeAPI/html/hip__runtime__api_8h_source.html @@ -4,7 +4,7 @@ -HIP: Heterogenous-computing Interface for Portability: /home/fpadmin/ben/HIP6/include/hip_runtime_api.h Source File +HIP: Heterogenous-computing Interface for Portability: /home/bensander/HIP.public/include/hip_runtime_api.h Source File @@ -117,21 +117,21 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
    33 #include <hip_common.h>
    34 
    35 typedef struct {
    -
    36  // 32-bit Atomics:
    +
    36  // 32-bit Atomics
    37  unsigned hasGlobalInt32Atomics : 1;
    38  unsigned hasGlobalFloatAtomicExch : 1;
    39  unsigned hasSharedInt32Atomics : 1;
    40  unsigned hasSharedFloatAtomicExch : 1;
    41  unsigned hasFloatAtomicAdd : 1;
    42 
    -
    43  // 64-bit Atomics:
    +
    43  // 64-bit Atomics
    44  unsigned hasGlobalInt64Atomics : 1;
    45  unsigned hasSharedInt64Atomics : 1;
    46 
    47  // Doubles
    48  unsigned hasDoubles : 1;
    49 
    -
    50  // Warp cross-lane operations:
    +
    50  // Warp cross-lane operations
    51  unsigned hasWarpVote : 1;
    52  unsigned hasWarpBallot : 1;
    53  unsigned hasWarpShuffle : 1;
    @@ -152,115 +152,175 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
    68 // Common headers for both NVCC and HCC paths:
    69 
    74 typedef struct hipDeviceProp_t {
    -
    75  char name[256];
    -
    76  size_t totalGlobalMem;
    - -
    78  int regsPerBlock ;
    -
    79  int warpSize ;
    - -
    81  int maxThreadsDim[3];
    -
    82  int maxGridSize[3];
    -
    83  int clockRate ;
    -
    84 
    -
    85  size_t totalConstMem;
    -
    86  int major ;
    -
    87  int minor;
    - - - - -
    92 
    - -
    94 
    - - -
    97 
    -
    98 
    -
    99 // hack to get these to show up in Doxygen:
    -
    107 /*
    -
    108  * @brief hipError_t
    -
    109  * @enum
    -
    110  * @ingroup Enumerations
    -
    111  */
    -
    112 typedef enum hipError_t {
    - - - - - - - - - - -
    123 
    - - -
    126 } hipError_t;
    -
    127 
    +
    75  char name[256];
    +
    76  size_t totalGlobalMem;
    + + +
    79  int warpSize;
    + +
    81  int maxThreadsDim[3];
    +
    82  int maxGridSize[3];
    +
    83  int clockRate;
    + +
    85  size_t totalConstMem;
    +
    86  int major;
    +
    87  int minor;
    + + + + + + + +
    95  int pciBusID;
    + + + +
    99 
    +
    100 
    +
    101 // hack to get these to show up in Doxygen:
    +
    109 /*
    +
    110  * @brief hipError_t
    +
    111  * @enum
    +
    112  * @ingroup Enumerations
    +
    113  */
    +
    114 typedef enum hipError_t {
    + + + + + + + + + + + + +
    127 } hipError_t;
    128 
    -
    129 
    -
    134 #if defined(__HIP_PLATFORM_HCC__) and not defined (__HIP_PLATFORM_NVCC__)
    -
    135 #include "hcc_detail/hip_runtime_api.h"
    -
    136 #elif defined(__HIP_PLATFORM_NVCC__) and not defined (__HIP_PLATFORM_HCC__)
    -
    137 #include "nvcc_detail/hip_runtime_api.h"
    -
    138 #else
    -
    139 #error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
    -
    140 #endif
    -
    141 
    -
    142 
    -
    150 #ifdef __cplusplus
    -
    151 template<class T>
    -
    152 static inline hipError_t hipMalloc ( T** devPtr, size_t size)
    -
    153 {
    -
    154  return hipMalloc((void**)devPtr, size);
    -
    155 }
    -
    156 
    -
    157 template<class T>
    -
    158 static inline hipError_t hipMallocHost ( T** ptr, size_t size)
    -
    159 {
    -
    160  return hipMallocHost((void**)ptr, size);
    -
    161 }
    -
    162 #endif
    -
    Call to cudaGetDeviceCount returned 0 devices.
    Definition: hip_runtime_api.h:121
    -
    size_t totalConstMem
    Size of shared memory region (in bytes)
    Definition: hip_runtime_api.h:85
    -
    Unknown symbol.
    Definition: hip_runtime_api.h:116
    -
    Successful completion.
    Definition: hip_runtime_api.h:113
    +
    129 /*
    +
    130  * @brief hipDeviceAttribute_t
    +
    131  * @enum
    +
    132  * @ingroup Enumerations
    +
    133  */
    +
    134 typedef enum hipDeviceAttribute_t {
    + + + + + + + + + + + + + + + + + + + + + + + + +
    159 
    +
    164 #if defined(__HIP_PLATFORM_HCC__) and not defined (__HIP_PLATFORM_NVCC__)
    + +
    166 #elif defined(__HIP_PLATFORM_NVCC__) and not defined (__HIP_PLATFORM_HCC__)
    +
    167 #include "nvcc_detail/hip_runtime_api.h"
    +
    168 #else
    +
    169 #error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
    +
    170 #endif
    +
    171 
    +
    172 
    +
    180 #ifdef __cplusplus
    +
    181 template<class T>
    +
    182 static inline hipError_t hipMalloc ( T** devPtr, size_t size)
    +
    183 {
    +
    184  return hipMalloc((void**)devPtr, size);
    +
    185 }
    +
    186 
    +
    187 template<class T>
    +
    188 static inline hipError_t hipMallocHost ( T** ptr, size_t size)
    +
    189 {
    +
    190  return hipMallocHost((void**)ptr, size);
    +
    191 }
    +
    192 #endif
    +
    Call to hipGetDeviceCount returned 0 devices.
    Definition: hip_runtime_api.h:123
    +
    size_t totalConstMem
    Size of shared memory region (in bytes).
    Definition: hip_runtime_api.h:85
    +
    Maximum Shared Memory Per Multiprocessor.
    Definition: hip_runtime_api.h:157
    +
    Maximum x-dimension of a block.
    Definition: hip_runtime_api.h:136
    +
    Maximum x-dimension of a grid.
    Definition: hip_runtime_api.h:139
    +
    Unknown symbol.
    Definition: hip_runtime_api.h:118
    +
    Successful completion.
    Definition: hip_runtime_api.h:115
    int minor
    Minor compute capability. On HCC, this is an approximation and features may differ from CUDA CC...
    Definition: hip_runtime_api.h:87
    -
    int regsPerBlock
    registers per block
    Definition: hip_runtime_api.h:78
    -
    DeviceID must be in range 0...#compute-devices.
    Definition: hip_runtime_api.h:120
    -
    int clockRate
    max clock frequency of the multiProcessors, in khz.
    Definition: hip_runtime_api.h:83
    -
    Out of resources error.
    Definition: hip_runtime_api.h:117
    -
    Unknown error.
    Definition: hip_runtime_api.h:124
    -
    int maxThreadsPerBlock
    max work items per work group or workgroup max size
    Definition: hip_runtime_api.h:80
    -
    size_t sharedMemPerBlock
    Size of shared memory region (in bytes)
    Definition: hip_runtime_api.h:77
    +
    Maximum number of 32-bit registers available to a thread block. This number is shared by all thread b...
    Definition: hip_runtime_api.h:145
    +
    int regsPerBlock
    Registers per block.
    Definition: hip_runtime_api.h:78
    +
    Size of L2 cache in bytes. 0 if the device doesn't have L2 cache.
    Definition: hip_runtime_api.h:150
    +
    DeviceID must be in range 0...#compute-devices.
    Definition: hip_runtime_api.h:122
    +
    Peak clock frequency in kilohertz.
    Definition: hip_runtime_api.h:146
    +
    int clockRate
    Max clock frequency of the multiProcessors in khz.
    Definition: hip_runtime_api.h:83
    +
    Maximum z-dimension of a grid.
    Definition: hip_runtime_api.h:141
    +
    Out of resources error.
    Definition: hip_runtime_api.h:119
    +
    Minor compute capability version number.
    Definition: hip_runtime_api.h:153
    +
    Maximum shared memory available per block in bytes.
    Definition: hip_runtime_api.h:142
    +
    int pciBusID
    PCI Bus ID.
    Definition: hip_runtime_api.h:95
    +
    Maximum y-dimension of a grid.
    Definition: hip_runtime_api.h:140
    +
    Unknown error.
    Definition: hip_runtime_api.h:125
    +
    int maxThreadsPerBlock
    Max work items per work group or workgroup max size.
    Definition: hip_runtime_api.h:80
    +
    Maximum y-dimension of a block.
    Definition: hip_runtime_api.h:137
    +
    size_t sharedMemPerBlock
    Size of shared memory region (in bytes).
    Definition: hip_runtime_api.h:77
    int maxThreadsPerMultiProcessor
    Maximum resident threads per multi-processor.
    Definition: hip_runtime_api.h:90
    int l2CacheSize
    L2 cache size.
    Definition: hip_runtime_api.h:89
    -
    Resource handle (hipEvent_t or hipStream_t) invalid.
    Definition: hip_runtime_api.h:119
    -
    Memory allocation error.
    Definition: hip_runtime_api.h:114
    -
    hipDeviceArch_t arch
    Architectural feature flags. New for HIP.
    Definition: hip_runtime_api.h:95
    -
    int maxGridSize[3]
    max grid dimensions (XYZ)
    Definition: hip_runtime_api.h:82
    +
    hipDeviceAttribute_t
    Definition: hip_runtime_api.h:134
    +
    Major compute capability version number.
    Definition: hip_runtime_api.h:152
    +
    Maximum number of threads per block.
    Definition: hip_runtime_api.h:135
    +
    Resource handle (hipEvent_t or hipStream_t) invalid.
    Definition: hip_runtime_api.h:121
    +
    Memory allocation error.
    Definition: hip_runtime_api.h:116
    +
    hipDeviceArch_t arch
    Architectural feature flags. New for HIP.
    Definition: hip_runtime_api.h:93
    +
    int maxGridSize[3]
    Max grid dimensions (XYZ).
    Definition: hip_runtime_api.h:82
    int computeMode
    Compute mode.
    Definition: hip_runtime_api.h:91
    -
    hipError_t hipMallocHost(void **ptr, size_t size)
    Definition: hip_hcc.cpp:1289
    -
    Marker that more error codes are needed.
    Definition: hip_runtime_api.h:125
    +
    Maximum z-dimension of a block.
    Definition: hip_runtime_api.h:138
    +
    PCI Bus ID.
    Definition: hip_runtime_api.h:155
    +
    hipError_t hipMallocHost(void **ptr, size_t size)
    Allocate pinned host memory.
    Definition: hip_hcc.cpp:1372
    +
    Marker that more error codes are needed.
    Definition: hip_runtime_api.h:126
    +
    Warp size in threads.
    Definition: hip_runtime_api.h:144
    int major
    Major compute capability. On HCC, this is an approximation and features may differ from CUDA CC...
    Definition: hip_runtime_api.h:86
    -
    hipError_t
    Definition: hip_runtime_api.h:112
    -
    int clockInstructionRate
    Frequency in khz of the timer used by the device-side "clock*" instructions. New for HIP...
    Definition: hip_runtime_api.h:93
    -
    Memory free error.
    Definition: hip_runtime_api.h:115
    -
    int warpSize
    warp size
    Definition: hip_runtime_api.h:79
    -
    size_t totalGlobalMem
    Size of global memory region (in bytes)
    Definition: hip_runtime_api.h:76
    -
    hipError_t hipMalloc(void **ptr, size_t size)
    Definition: hip_hcc.cpp:1268
    -
    int maxThreadsDim[3]
    max number of threads in each dimension (XYZ) of a block
    Definition: hip_runtime_api.h:81
    -
    One or more of the paramters passed to the API call is NULL or not in an acceptable range...
    Definition: hip_runtime_api.h:118
    +
    Peak memory clock frequency in kilohertz.
    Definition: hip_runtime_api.h:147
    +
    Maximum resident threads per multiprocessor.
    Definition: hip_runtime_api.h:151
    +
    hipError_t
    Definition: hip_runtime_api.h:114
    +
    int clockInstructionRate
    Frequency in khz of the timer used by the device-side "clock*" instructions. New for HIP...
    Definition: hip_runtime_api.h:92
    +
    Constant memory size in bytes.
    Definition: hip_runtime_api.h:143
    +
    Memory free error.
    Definition: hip_runtime_api.h:117
    +
    int warpSize
    Warp size.
    Definition: hip_runtime_api.h:79
    +
    int concurrentKernels
    Device can possibly execute multiple kernels concurrently.
    Definition: hip_runtime_api.h:94
    +
    size_t totalGlobalMem
    Size of global memory region (in bytes).
    Definition: hip_runtime_api.h:76
    +
    hipError_t hipMalloc(void **ptr, size_t size)
    Allocate memory on the default accelerator.
    Definition: hip_hcc.cpp:1351
    +
    Compute mode that device is currently in.
    Definition: hip_runtime_api.h:149
    +
    PCI Device ID.
    Definition: hip_runtime_api.h:156
    +
    int maxThreadsDim[3]
    Max number of threads in each dimension (XYZ) of a block.
    Definition: hip_runtime_api.h:81
    +
    Number of multiprocessors on the device.
    Definition: hip_runtime_api.h:148
    +
    One or more of the parameters passed to the API call is NULL or not in an acceptable range...
    Definition: hip_runtime_api.h:120
    Definition: hip_runtime_api.h:74
    -
    indicates that asynchronous operations enqueued earlier are not ready. This is not actually an error...
    Definition: hip_runtime_api.h:122
    +
    Indicates that asynchronous operations enqueued earlier are not ready. This is not actually an error...
    Definition: hip_runtime_api.h:124
    +
    size_t maxSharedMemoryPerMultiProcessor
    Maximum Shared Memory Per Multiprocessor.
    Definition: hip_runtime_api.h:97
    +
    int pciDeviceID
    PCI Device ID.
    Definition: hip_runtime_api.h:96
    char name[256]
    Device name.
    Definition: hip_runtime_api.h:75
    Definition: hip_runtime_api.h:35
    -
    int multiProcessorCount
    number of multi-processors (compute units)
    Definition: hip_runtime_api.h:88
    +
    Contains C function APIs for HIP runtime. This file does not use any HCC builtin or special language ...
    +
    int memoryClockRate
    Max memory clock frequency in khz.
    Definition: hip_runtime_api.h:84
    +
    Device can possibly execute multiple kernels concurrently.
    Definition: hip_runtime_api.h:154
    +
    int multiProcessorCount
    Number of multi-processors (compute units).
    Definition: hip_runtime_api.h:88
    diff --git a/projects/hip/docs/RuntimeAPI/html/hip__texture_8h_source.html b/projects/hip/docs/RuntimeAPI/html/hip__texture_8h_source.html index 9888a5c9a5..cdf660b728 100644 --- a/projects/hip/docs/RuntimeAPI/html/hip__texture_8h_source.html +++ b/projects/hip/docs/RuntimeAPI/html/hip__texture_8h_source.html @@ -4,7 +4,7 @@ -HIP: Heterogenous-computing Interface for Portability: /home/fpadmin/ben/HIP6/include/hcc_detail/hip_texture.h Source File +HIP: Heterogenous-computing Interface for Portability: /home/bensander/HIP.public/include/hcc_detail/hip_texture.h Source File @@ -89,7 +89,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
    hip_texture.h
    -
    1 /*
    +Go to the documentation of this file.
    1 /*
    2 Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
    3 
    4 Permission is hereby granted, free of charge, to any person obtaining a copy
    @@ -112,7 +112,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
    21 */
    22 #pragma once
    23 
    -
    28 #include <limits.h>
    +
    28 #include <limits.h>
    29 
    30 #include <hip_runtime.h>
    31 
    @@ -125,18 +125,18 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
    38  int _dummy;
    40 
    -
    41 typedef enum hipTextureReadMode
    +
    41 typedef enum hipTextureReadMode
    42 {
    -
    43  hipReadModeElementType,
    -
    44 } hipTextureReadMode;
    + +
    46 
    -
    47 typedef enum hipTextureFilterMode
    +
    48 {
    -
    49  hipFilterModePoint,
    -
    50 } hipTextureFilterMode;
    + +
    52 
    -
    54  hipTextureFilterMode filterMode;
    +
    54  hipTextureFilterMode filterMode;
    55  bool normalized;
    56  hipChannelFormatDesc channelDesc;
    57 };
    @@ -160,7 +160,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
    82 // These are C++ APIs - maybe belong in separate file.
    106 // C API:
    107 #if 0
    -
    108 hipChannelFormatDesc hipBindTexture(size_t *offset, struct textureReference *tex, const void *devPtr, const struct hipChannelFormatDesc *desc, size_t size=UINT_MAX)
    +
    108 hipChannelFormatDesc hipBindTexture(size_t *offset, struct textureReference *tex, const void *devPtr, const struct hipChannelFormatDesc *desc, size_t size=UINT_MAX)
    109 {
    110  tex->_dataPtr = devPtr;
    111 }
    @@ -171,7 +171,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
    116  **/
    117 // TODO
    118 template <class T>
    -
    119 hipChannelFormatDesc hipCreateChannelDesc()
    +
    119 hipChannelFormatDesc hipCreateChannelDesc()
    120 {
    122  return desc;
    @@ -182,11 +182,11 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
    127  **/
    128 // TODO-doc
    129 template <class T, int dim, enum hipTextureReadMode readMode>
    -
    130 hipError_t hipBindTexture(size_t *offset,
    -
    131  struct texture<T, dim, readMode> &tex,
    -
    132  const void *devPtr,
    -
    133  const struct hipChannelFormatDesc *desc,
    -
    134  size_t size=UINT_MAX)
    +
    130 hipError_t hipBindTexture(size_t *offset,
    +
    131  struct texture<T, dim, readMode> &tex,
    +
    132  const void *devPtr,
    +
    133  const struct hipChannelFormatDesc *desc,
    +
    134  size_t size=UINT_MAX)
    135 {
    136  tex._dataPtr = static_cast<const T*>(devPtr);
    137 
    @@ -199,10 +199,10 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
    144  **/
    145 // TODO-doc
    146 template <class T, int dim, enum hipTextureReadMode readMode>
    -
    147 hipError_t hipBindTexture(size_t *offset,
    -
    148  struct texture<T, dim, readMode> &tex,
    -
    149  const void *devPtr,
    -
    150  size_t size=UINT_MAX)
    +
    147 hipError_t hipBindTexture(size_t *offset,
    +
    148  struct texture<T, dim, readMode> &tex,
    +
    149  const void *devPtr,
    +
    150  size_t size=UINT_MAX)
    151 {
    152  return hipBindTexture(offset, tex, devPtr, &tex.channelDesc, size);
    153 }
    @@ -224,16 +224,20 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
    169 
    170 // doxygen end Texture
    176 // End doxygen API:
    -
    Successful completion.
    Definition: hip_runtime_api.h:113
    +
    Definition: hip_texture.h:43
    +
    Successful completion.
    Definition: hip_runtime_api.h:115
    +
    Definition: hip_texture.h:49
    Definition: hip_texture.h:53
    - -
    hipError_t
    Definition: hip_runtime_api.h:112
    +
    Contains definitions of APIs for HIP runtime.
    +
    hipError_t
    Definition: hip_runtime_api.h:114
    +
    hipTextureReadMode
    Definition: hip_texture.h:41
    +
    hipTextureFilterMode
    Definition: hip_texture.h:47
    Definition: hip_texture.h:36
    Definition: hip_texture.h:60
    diff --git a/projects/hip/docs/RuntimeAPI/html/hip__vector__types_8h_source.html b/projects/hip/docs/RuntimeAPI/html/hip__vector__types_8h_source.html index 15eba01435..b12e96e36d 100644 --- a/projects/hip/docs/RuntimeAPI/html/hip__vector__types_8h_source.html +++ b/projects/hip/docs/RuntimeAPI/html/hip__vector__types_8h_source.html @@ -4,7 +4,7 @@ -HIP: Heterogenous-computing Interface for Portability: /home/fpadmin/ben/HIP6/include/hip_vector_types.h Source File +HIP: Heterogenous-computing Interface for Portability: /home/bensander/HIP.public/include/hip_vector_types.h Source File @@ -117,16 +117,17 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
    27 
    28 
    29 #if defined(__HIP_PLATFORM_HCC__) and not defined (__HIP_PLATFORM_NVCC__)
    -
    30 #include <hcc_detail/hip_vector_types.h>
    +
    31 #elif defined(__HIP_PLATFORM_NVCC__) and not defined (__HIP_PLATFORM_HCC__)
    32 #include <vector_types.h>
    33 #else
    34 #error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
    35 #endif
    +
    Defines the different newt vector types for HIP runtime.
    diff --git a/projects/hip/docs/RuntimeAPI/html/host__defines_8h_source.html b/projects/hip/docs/RuntimeAPI/html/host__defines_8h_source.html index c10f6c5202..f4430d5190 100644 --- a/projects/hip/docs/RuntimeAPI/html/host__defines_8h_source.html +++ b/projects/hip/docs/RuntimeAPI/html/host__defines_8h_source.html @@ -4,7 +4,7 @@ -HIP: Heterogenous-computing Interface for Portability: /home/fpadmin/ben/HIP6/include/hcc_detail/host_defines.h Source File +HIP: Heterogenous-computing Interface for Portability: /home/bensander/HIP.public/include/hcc_detail/host_defines.h Source File @@ -89,7 +89,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
    host_defines.h
    -
    1 /*
    +Go to the documentation of this file.
    1 /*
    2 Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
    3 
    4 Permission is hereby granted, free of charge, to any person obtaining a copy
    @@ -110,47 +110,48 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
    19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    20 THE SOFTWARE.
    21 */
    -
    22 #ifdef __HCC__
    -
    23 
    -
    26 #define __host__ __attribute__((cpu))
    -
    27 #define __device__ __attribute__((hc))
    -
    28 
    -
    29 #ifndef DISABLE_GRID_LAUNCH
    -
    30 #define __global__ __attribute__((hc_grid_launch))
    -
    31 #else
    -
    32 #define __global__
    -
    33 #endif
    +
    22 
    +
    28 #ifdef __HCC__
    +
    29 
    +
    32 #define __host__ __attribute__((cpu))
    +
    33 #define __device__ __attribute__((hc))
    34 
    -
    35 #define __noinline__ __attribute__((noinline))
    -
    36 #define __forceinline__ __attribute__((always_inline))
    -
    37 
    -
    38 
    -
    39 
    -
    40 /*
    -
    41  * Variable Type Qualifiers:
    -
    42  */
    -
    43 // _restrict is supported by the compiler
    -
    44 #define __shared__ tile_static
    -
    45 #define __constant__ __attribute__((address_space(2)))
    -
    46 
    -
    47 #else
    -
    48 // Non-HCC compiler
    -
    52 #define __host__
    -
    53 #define __device__
    -
    54 
    -
    55 #define __global__
    -
    56 
    -
    57 #define __noinline__
    -
    58 #define __forceinline__
    -
    59 
    -
    60 #define __shared__
    -
    61 #define __constant__
    +
    35 #ifndef DISABLE_GRID_LAUNCH
    +
    36 #define __global__ __attribute__((hc_grid_launch))
    +
    37 #else
    +
    38 #define __global__
    +
    39 #endif
    +
    40 
    +
    41 #define __noinline__ __attribute__((noinline))
    +
    42 #define __forceinline__ __attribute__((always_inline))
    +
    43 
    +
    44 
    +
    45 
    +
    46 /*
    +
    47  * Variable Type Qualifiers:
    +
    48  */
    +
    49 // _restrict is supported by the compiler
    +
    50 #define __shared__ tile_static
    +
    51 #define __constant__ __attribute__((address_space(2)))
    +
    52 
    +
    53 #else
    +
    54 // Non-HCC compiler
    +
    58 #define __host__
    +
    59 #define __device__
    +
    60 
    +
    61 #define __global__
    62 
    -
    63 #endif
    +
    63 #define __noinline__
    +
    64 #define __forceinline__
    +
    65 
    +
    66 #define __shared__
    +
    67 #define __constant__
    +
    68 
    +
    69 #endif
    diff --git a/projects/hip/docs/RuntimeAPI/html/index.html b/projects/hip/docs/RuntimeAPI/html/index.html index 7fc6c60b4d..8ba01822a2 100644 --- a/projects/hip/docs/RuntimeAPI/html/index.html +++ b/projects/hip/docs/RuntimeAPI/html/index.html @@ -91,7 +91,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
    diff --git a/projects/hip/docs/RuntimeAPI/html/modules.html b/projects/hip/docs/RuntimeAPI/html/modules.html index d2d046519d..6c65a6cbdd 100644 --- a/projects/hip/docs/RuntimeAPI/html/modules.html +++ b/projects/hip/docs/RuntimeAPI/html/modules.html @@ -99,7 +99,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search'); diff --git a/projects/hip/docs/RuntimeAPI/html/pages.html b/projects/hip/docs/RuntimeAPI/html/pages.html index 1ac5403b3f..ecccd28e6d 100644 --- a/projects/hip/docs/RuntimeAPI/html/pages.html +++ b/projects/hip/docs/RuntimeAPI/html/pages.html @@ -88,7 +88,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search'); diff --git a/projects/hip/docs/RuntimeAPI/html/search/all_0.js b/projects/hip/docs/RuntimeAPI/html/search/all_0.js index 2fba32867b..00fe08ccff 100644 --- a/projects/hip/docs/RuntimeAPI/html/search/all_0.js +++ b/projects/hip/docs/RuntimeAPI/html/search/all_0.js @@ -1,4 +1,4 @@ var searchData= [ - ['arch',['arch',['../structhipDeviceProp__t.html#afc58158e44bef6ad26f2be401434b049',1,'hipDeviceProp_t']]] + ['_5f_5fhost_5f_5f',['__host__',['../host__defines_8h.html#a803050db3c78e0db3ea59a0c35499622',1,'host_defines.h']]] ]; diff --git a/projects/hip/docs/RuntimeAPI/html/search/all_1.js b/projects/hip/docs/RuntimeAPI/html/search/all_1.js index 8d42650d49..2fba32867b 100644 --- a/projects/hip/docs/RuntimeAPI/html/search/all_1.js +++ b/projects/hip/docs/RuntimeAPI/html/search/all_1.js @@ -1,4 +1,4 @@ var searchData= [ - ['bug_20list',['Bug List',['../bug.html',1,'']]] + ['arch',['arch',['../structhipDeviceProp__t.html#afc58158e44bef6ad26f2be401434b049',1,'hipDeviceProp_t']]] ]; diff --git a/projects/hip/docs/RuntimeAPI/html/search/all_10.js b/projects/hip/docs/RuntimeAPI/html/search/all_10.js index 133dd9dc6e..9043ae6945 100644 --- a/projects/hip/docs/RuntimeAPI/html/search/all_10.js +++ b/projects/hip/docs/RuntimeAPI/html/search/all_10.js @@ -1,4 +1,7 @@ var searchData= [ - 
['y',['y',['../structdim3.html#a83e60e072f7e8bdfde6ac05053cbb370',1,'dim3']]] + ['texture',['texture',['../structtexture.html',1,'texture< T, texType, hipTextureReadMode >'],['../group__Texture.html',1,'(Global Namespace)']]], + ['texturereference',['textureReference',['../structtextureReference.html',1,'']]], + ['totalconstmem',['totalConstMem',['../structhipDeviceProp__t.html#a29880232c56120be3455ce00d5379665',1,'hipDeviceProp_t']]], + ['totalglobalmem',['totalGlobalMem',['../structhipDeviceProp__t.html#acedd6a2d23423441e4bf51c4a1b719f9',1,'hipDeviceProp_t']]] ]; diff --git a/projects/hip/docs/RuntimeAPI/html/search/all_11.js b/projects/hip/docs/RuntimeAPI/html/search/all_11.js index e8bf38b99c..46a1400a7b 100644 --- a/projects/hip/docs/RuntimeAPI/html/search/all_11.js +++ b/projects/hip/docs/RuntimeAPI/html/search/all_11.js @@ -1,4 +1,4 @@ var searchData= [ - ['z',['z',['../structdim3.html#a866e38993ecc4e76fd47311236c16b04',1,'dim3']]] + ['warpsize',['warpSize',['../structhipDeviceProp__t.html#af3357d33c004608bf05bc21a352be81b',1,'hipDeviceProp_t']]] ]; diff --git a/projects/hip/docs/RuntimeAPI/html/search/all_2.js b/projects/hip/docs/RuntimeAPI/html/search/all_2.js index f928b2e2ec..8d42650d49 100644 --- a/projects/hip/docs/RuntimeAPI/html/search/all_2.js +++ b/projects/hip/docs/RuntimeAPI/html/search/all_2.js @@ -1,7 +1,4 @@ var searchData= [ - ['clockinstructionrate',['clockInstructionRate',['../structhipDeviceProp__t.html#a6fbf3b08a1a08ae700f1a06265f6666b',1,'hipDeviceProp_t']]], - ['clockrate',['clockRate',['../structhipDeviceProp__t.html#a1dd15bee43692b8649dfbdc1adbaaf96',1,'hipDeviceProp_t']]], - ['computemode',['computeMode',['../structhipDeviceProp__t.html#ae7d9216f8583a703359d0b9373823f5d',1,'hipDeviceProp_t']]], - ['control',['Control',['../group__Profiler.html',1,'']]] + ['bug_20list',['Bug List',['../bug.html',1,'']]] ]; diff --git a/projects/hip/docs/RuntimeAPI/html/search/all_3.js b/projects/hip/docs/RuntimeAPI/html/search/all_3.js index 
d715da215b..e6f0e7edea 100644 --- a/projects/hip/docs/RuntimeAPI/html/search/all_3.js +++ b/projects/hip/docs/RuntimeAPI/html/search/all_3.js @@ -1,6 +1,8 @@ var searchData= [ - ['device_20management',['Device Management',['../group__Device.html',1,'']]], - ['dim3',['dim3',['../structdim3.html',1,'dim3'],['../group__GlobalDefs.html#gacb37281795c3567d0b10a61c056d512b',1,'dim3(): hip_runtime_api.h']]], - ['device_20memory_20access',['Device Memory Access',['../group__PeerToPeer.html',1,'']]] + ['clockinstructionrate',['clockInstructionRate',['../structhipDeviceProp__t.html#a6fbf3b08a1a08ae700f1a06265f6666b',1,'hipDeviceProp_t']]], + ['clockrate',['clockRate',['../structhipDeviceProp__t.html#a1dd15bee43692b8649dfbdc1adbaaf96',1,'hipDeviceProp_t']]], + ['computemode',['computeMode',['../structhipDeviceProp__t.html#ae7d9216f8583a703359d0b9373823f5d',1,'hipDeviceProp_t']]], + ['concurrentkernels',['concurrentKernels',['../structhipDeviceProp__t.html#ad8461a28caf9c38c58cf358583b5bee3',1,'hipDeviceProp_t']]], + ['control',['Control',['../group__Profiler.html',1,'']]] ]; diff --git a/projects/hip/docs/RuntimeAPI/html/search/all_4.js b/projects/hip/docs/RuntimeAPI/html/search/all_4.js index c9fd17e83e..d715da215b 100644 --- a/projects/hip/docs/RuntimeAPI/html/search/all_4.js +++ b/projects/hip/docs/RuntimeAPI/html/search/all_4.js @@ -1,5 +1,6 @@ var searchData= [ - ['error_20handling',['Error Handling',['../group__Error.html',1,'']]], - ['event_20management',['Event Management',['../group__Event.html',1,'']]] + ['device_20management',['Device Management',['../group__Device.html',1,'']]], + ['dim3',['dim3',['../structdim3.html',1,'dim3'],['../group__GlobalDefs.html#gacb37281795c3567d0b10a61c056d512b',1,'dim3(): hip_runtime_api.h']]], + ['device_20memory_20access',['Device Memory Access',['../group__PeerToPeer.html',1,'']]] ]; diff --git a/projects/hip/docs/RuntimeAPI/html/search/all_5.js b/projects/hip/docs/RuntimeAPI/html/search/all_5.js index 529e9394c3..c9fd17e83e 100644 
--- a/projects/hip/docs/RuntimeAPI/html/search/all_5.js +++ b/projects/hip/docs/RuntimeAPI/html/search/all_5.js @@ -1,4 +1,5 @@ var searchData= [ - ['global_20enum_20and_20defines',['Global enum and defines',['../group__GlobalDefs.html',1,'']]] + ['error_20handling',['Error Handling',['../group__Error.html',1,'']]], + ['event_20management',['Event Management',['../group__Event.html',1,'']]] ]; diff --git a/projects/hip/docs/RuntimeAPI/html/search/all_6.js b/projects/hip/docs/RuntimeAPI/html/search/all_6.js index 84510111be..529e9394c3 100644 --- a/projects/hip/docs/RuntimeAPI/html/search/all_6.js +++ b/projects/hip/docs/RuntimeAPI/html/search/all_6.js @@ -1,107 +1,4 @@ var searchData= [ - ['hip_20api',['HIP API',['../group__API.html',1,'']]], - ['has3dgrid',['has3dGrid',['../structhipDeviceArch__t.html#aa5e22d295cce0d9a34ee6e7e7e378c26',1,'hipDeviceArch_t']]], - ['hasdoubles',['hasDoubles',['../structhipDeviceArch__t.html#a2d624e3d85e615b71d1182f8912893b4',1,'hipDeviceArch_t']]], - ['hasdynamicparallelism',['hasDynamicParallelism',['../structhipDeviceArch__t.html#a9114bfc718bf0648b54ff9a319a36b35',1,'hipDeviceArch_t']]], - ['hasfloatatomicadd',['hasFloatAtomicAdd',['../structhipDeviceArch__t.html#a2321d6ef74aac91c044f3289d25b2d41',1,'hipDeviceArch_t']]], - ['hasfunnelshift',['hasFunnelShift',['../structhipDeviceArch__t.html#aaec4f2d983d4602858fae8e9ddeee3ff',1,'hipDeviceArch_t']]], - ['hasglobalfloatatomicexch',['hasGlobalFloatAtomicExch',['../structhipDeviceArch__t.html#a8f213ae9a4729dff1c636ac5de0e2fa2',1,'hipDeviceArch_t']]], - ['hasglobalint32atomics',['hasGlobalInt32Atomics',['../structhipDeviceArch__t.html#a8d00c3ab98869b602c714fe7abe68e93',1,'hipDeviceArch_t']]], - ['hasglobalint64atomics',['hasGlobalInt64Atomics',['../structhipDeviceArch__t.html#ad5aa54dbab22dbcd8cf98f57a96c6636',1,'hipDeviceArch_t']]], - ['hassharedfloatatomicexch',['hasSharedFloatAtomicExch',['../structhipDeviceArch__t.html#aff005558b4edabd27b27f286ac5b2f2b',1,'hipDeviceArch_t']]], - 
['hassharedint32atomics',['hasSharedInt32Atomics',['../structhipDeviceArch__t.html#a1596330b1cb9cc73f142aee11b2ab853',1,'hipDeviceArch_t']]], - ['hassharedint64atomics',['hasSharedInt64Atomics',['../structhipDeviceArch__t.html#a00c2b930fcdcad9ea7b54b449db13966',1,'hipDeviceArch_t']]], - ['hassurfacefuncs',['hasSurfaceFuncs',['../structhipDeviceArch__t.html#a9eb2462148686d4c048b69b6e09f835e',1,'hipDeviceArch_t']]], - ['hassyncthreadsext',['hasSyncThreadsExt',['../structhipDeviceArch__t.html#ade6a3b21ad5f344dcd92c52102c274ba',1,'hipDeviceArch_t']]], - ['hasthreadfencesystem',['hasThreadFenceSystem',['../structhipDeviceArch__t.html#ac2818e3b91cba8beb36741e9867bb887',1,'hipDeviceArch_t']]], - ['haswarpballot',['hasWarpBallot',['../structhipDeviceArch__t.html#af1e934a8a5106995bcc256287585564c',1,'hipDeviceArch_t']]], - ['haswarpshuffle',['hasWarpShuffle',['../structhipDeviceArch__t.html#a3d922e8fc97ca1e8ecc39600b138fa2d',1,'hipDeviceArch_t']]], - ['haswarpvote',['hasWarpVote',['../structhipDeviceArch__t.html#a35bde017352eca1d4e0eceb3bf79f274',1,'hipDeviceArch_t']]], - ['hcc_2dspecific_20accessors',['HCC-Specific Accessors',['../group__HCC__Specific.html',1,'']]], - ['hip_20environment_20variables',['HIP Environment Variables',['../group__HIP-ENV.html',1,'']]], - ['hip_5fhcc_2ecpp',['hip_hcc.cpp',['../hip__hcc_8cpp.html',1,'']]], - ['hip_5flaunch_5fblocking',['HIP_LAUNCH_BLOCKING',['../group__HIP-ENV.html#ga8049b329f2663b4572d81e7a9aa8a155',1,'HIP_LAUNCH_BLOCKING(): hip_hcc.cpp'],['../group__HIP-ENV.html#ga8049b329f2663b4572d81e7a9aa8a155',1,'HIP_LAUNCH_BLOCKING(): hip_hcc.cpp']]], - ['hip_5fprint_5fenv',['HIP_PRINT_ENV',['../group__HIP-ENV.html#ga1e1c85dbb250f1acfb484c1be1f3b28a',1,'HIP_PRINT_ENV(): hip_hcc.cpp'],['../group__HIP-ENV.html#ga1e1c85dbb250f1acfb484c1be1f3b28a',1,'HIP_PRINT_ENV(): hip_hcc.cpp']]], - ['hip_5fruntime_2eh',['hip_runtime.h',['../hcc__detail_2hip__runtime_8h.html',1,'']]], - 
['hip_5ftrace_5fapi',['HIP_TRACE_API',['../group__HIP-ENV.html#gaae9c541f3e25b8f002762337a03fec28',1,'HIP_TRACE_API(): hip_hcc.cpp'],['../group__HIP-ENV.html#gaae9c541f3e25b8f002762337a03fec28',1,'HIP_TRACE_API(): hip_hcc.cpp']]], - ['hipchannelformatdesc',['hipChannelFormatDesc',['../structhipChannelFormatDesc.html',1,'']]], - ['hipdevicearch_5ft',['hipDeviceArch_t',['../structhipDeviceArch__t.html',1,'']]], - ['hipdevicecanaccesspeer',['hipDeviceCanAccessPeer',['../group__PeerToPeer.html#gab53a55dbc087ff659918fd04287de3d3',1,'hipDeviceCanAccessPeer(int *canAccessPeer, int device, int peerDevice): hip_hcc.cpp'],['../group__PeerToPeer.html#gab53a55dbc087ff659918fd04287de3d3',1,'hipDeviceCanAccessPeer(int *canAccessPeer, int device, int peerDevice): hip_hcc.cpp']]], - ['hipdevicedisablepeeraccess',['hipDeviceDisablePeerAccess',['../group__PeerToPeer.html#ga41e60c01f63597529da1cd77bdd55379',1,'hipDeviceDisablePeerAccess(int peerDevice): hip_hcc.cpp'],['../group__PeerToPeer.html#ga41e60c01f63597529da1cd77bdd55379',1,'hipDeviceDisablePeerAccess(int peerDevice): hip_hcc.cpp']]], - ['hipdeviceenablepeeraccess',['hipDeviceEnablePeerAccess',['../group__PeerToPeer.html#ga098e0d626edbfb69b66d141a5a8b7dc6',1,'hipDeviceEnablePeerAccess(int peerDevice, unsigned int flags): hip_hcc.cpp'],['../group__PeerToPeer.html#ga098e0d626edbfb69b66d141a5a8b7dc6',1,'hipDeviceEnablePeerAccess(int peerDevice, unsigned int flags): hip_hcc.cpp']]], - ['hipdevicegetcacheconfig',['hipDeviceGetCacheConfig',['../group__Device.html#gaeeffa2456c5430400bea75ecd6ad1e68',1,'hipDeviceGetCacheConfig(hipFuncCache *cacheConfig): hip_hcc.cpp'],['../group__Device.html#gaeeffa2456c5430400bea75ecd6ad1e68',1,'hipDeviceGetCacheConfig(hipFuncCache *cacheConfig): hip_hcc.cpp']]], - ['hipdevicegetproperties',['hipDeviceGetProperties',['../group__Device.html#gad9ee6822e3e55431811fb6a00f7a1c10',1,'hipDeviceGetProperties(hipDeviceProp_t *prop, int device): 
hip_hcc.cpp'],['../group__Device.html#gad9ee6822e3e55431811fb6a00f7a1c10',1,'hipDeviceGetProperties(hipDeviceProp_t *props, int device): hip_hcc.cpp']]], - ['hipdevicegetsharedmemconfig',['hipDeviceGetSharedMemConfig',['../group__Device.html#ga1bb08f774a34a468d969a8a04791c9bb',1,'hipDeviceGetSharedMemConfig(hipSharedMemConfig *pConfig): hip_hcc.cpp'],['../group__Device.html#ga1bb08f774a34a468d969a8a04791c9bb',1,'hipDeviceGetSharedMemConfig(hipSharedMemConfig *pConfig): hip_hcc.cpp']]], - ['hipdeviceprop_5ft',['hipDeviceProp_t',['../structhipDeviceProp__t.html',1,'']]], - ['hipdevicereset',['hipDeviceReset',['../group__Device.html#ga8d57161ae56a8edc46eeda447417bf6c',1,'hipDeviceReset(void): hip_hcc.cpp'],['../group__Device.html#ga8d57161ae56a8edc46eeda447417bf6c',1,'hipDeviceReset(void): hip_hcc.cpp']]], - ['hipdevicesetcacheconfig',['hipDeviceSetCacheConfig',['../group__Device.html#gac2b282179f29c4c0ca7b5391242c6a4c',1,'hipDeviceSetCacheConfig(hipFuncCache cacheConfig): hip_hcc.cpp'],['../group__Device.html#gac2b282179f29c4c0ca7b5391242c6a4c',1,'hipDeviceSetCacheConfig(hipFuncCache cacheConfig): hip_hcc.cpp']]], - ['hipdevicesetsharedmemconfig',['hipDeviceSetSharedMemConfig',['../group__Device.html#ga9b1f279084e76691cedfbfadf9c717ee',1,'hipDeviceSetSharedMemConfig(hipSharedMemConfig config): hip_hcc.cpp'],['../group__Device.html#ga9b1f279084e76691cedfbfadf9c717ee',1,'hipDeviceSetSharedMemConfig(hipSharedMemConfig config): hip_hcc.cpp']]], - ['hipdevicesynchronize',['hipDeviceSynchronize',['../group__Device.html#gaefdc2847fb1d6c3fb1354e827a191ebd',1,'hipDeviceSynchronize(void): hip_hcc.cpp'],['../group__Device.html#gaefdc2847fb1d6c3fb1354e827a191ebd',1,'hipDeviceSynchronize(void): hip_hcc.cpp']]], - ['hipdrivergetversion',['hipDriverGetVersion',['../group__Version.html#gaf6c342f52d2a29a0aca5cdd89b4dd47c',1,'hipDriverGetVersion(int *driverVersion): hip_hcc.cpp'],['../group__Version.html#gaf6c342f52d2a29a0aca5cdd89b4dd47c',1,'hipDriverGetVersion(int *driverVersion): 
hip_hcc.cpp']]], - ['hiperror_5ft',['hipError_t',['../group__GlobalDefs.html#gadf5010f6e140a53ecbdf949e73e87594',1,'hip_runtime_api.h']]], - ['hiperrorinvaliddevice',['hipErrorInvalidDevice',['../group__GlobalDefs.html#ggadf5010f6e140a53ecbdf949e73e87594a07ab9b704ea693c1781a52741c60cd0d',1,'hip_runtime_api.h']]], - ['hiperrorinvalidresourcehandle',['hipErrorInvalidResourceHandle',['../group__GlobalDefs.html#ggadf5010f6e140a53ecbdf949e73e87594a88e525a7c8f35552dfada58e9f2f6d3a',1,'hip_runtime_api.h']]], - ['hiperrorinvalidvalue',['hipErrorInvalidValue',['../group__GlobalDefs.html#ggadf5010f6e140a53ecbdf949e73e87594a1e8215fe1108a508bad3944bce7b4d83',1,'hip_runtime_api.h']]], - ['hiperrormemoryallocation',['hipErrorMemoryAllocation',['../group__GlobalDefs.html#ggadf5010f6e140a53ecbdf949e73e87594a8293288a10109874749afe2562db09f2',1,'hip_runtime_api.h']]], - ['hiperrormemoryfree',['hipErrorMemoryFree',['../group__GlobalDefs.html#ggadf5010f6e140a53ecbdf949e73e87594a813b3f014e2a3932d1f0e3e712cf9d3c',1,'hip_runtime_api.h']]], - ['hiperrornodevice',['hipErrorNoDevice',['../group__GlobalDefs.html#ggadf5010f6e140a53ecbdf949e73e87594ad4406972c318df36d231310a15131c24',1,'hip_runtime_api.h']]], - ['hiperrornotready',['hipErrorNotReady',['../group__GlobalDefs.html#ggadf5010f6e140a53ecbdf949e73e87594aa9638063c8746a9d1fda2b2069a0a9f1',1,'hip_runtime_api.h']]], - ['hiperroroutofresources',['hipErrorOutOfResources',['../group__GlobalDefs.html#ggadf5010f6e140a53ecbdf949e73e87594a60c1c080b79bdde9ef5e808f974ac9ed',1,'hip_runtime_api.h']]], - ['hiperrortbd',['hipErrorTbd',['../group__GlobalDefs.html#ggadf5010f6e140a53ecbdf949e73e87594ab556409e11ddb0c4cf77a2f4fc91ea9e',1,'hip_runtime_api.h']]], - ['hiperrorunknown',['hipErrorUnknown',['../group__GlobalDefs.html#ggadf5010f6e140a53ecbdf949e73e87594aa74e64c5b2f5fb0d6a92681f5b234073',1,'hip_runtime_api.h']]], - 
['hiperrorunknownsymbol',['hipErrorUnknownSymbol',['../group__GlobalDefs.html#ggadf5010f6e140a53ecbdf949e73e87594a45b297e6c3b2029dce1348658421481b',1,'hip_runtime_api.h']]], - ['hipevent_5ft',['hipEvent_t',['../structhipEvent__t.html',1,'']]], - ['hipeventblockingsync',['hipEventBlockingSync',['../group__GlobalDefs.html#gafa1c076a5b991763a98695063f1ea11d',1,'hip_runtime_api.h']]], - ['hipeventcreatewithflags',['hipEventCreateWithFlags',['../group__Event.html#gae86a5acb1b22b61bc9ecb9c28fc71b75',1,'hipEventCreateWithFlags(hipEvent_t *event, unsigned flags): hip_hcc.cpp'],['../group__Event.html#gae86a5acb1b22b61bc9ecb9c28fc71b75',1,'hipEventCreateWithFlags(hipEvent_t *event, unsigned flags): hip_hcc.cpp']]], - ['hipeventdefault',['hipEventDefault',['../group__GlobalDefs.html#ga122a5853359eba97cf047ddd153740f0',1,'hip_runtime_api.h']]], - ['hipeventdestroy',['hipEventDestroy',['../group__Event.html#ga83260357dce0c39e8c6a3c74ec97484c',1,'hipEventDestroy(hipEvent_t event): hip_hcc.cpp'],['../group__Event.html#ga83260357dce0c39e8c6a3c74ec97484c',1,'hipEventDestroy(hipEvent_t event): hip_hcc.cpp']]], - ['hipeventdisabletiming',['hipEventDisableTiming',['../group__GlobalDefs.html#ga3c0f44a85e36a4c67671da6bcdad0351',1,'hip_runtime_api.h']]], - ['hipeventelapsedtime',['hipEventElapsedTime',['../group__Event.html#gad4128b815cb475c8e13c7e66ff6250b7',1,'hipEventElapsedTime(float *ms, hipEvent_t start, hipEvent_t stop): hip_hcc.cpp'],['../group__Event.html#gad4128b815cb475c8e13c7e66ff6250b7',1,'hipEventElapsedTime(float *ms, hipEvent_t start, hipEvent_t stop): hip_hcc.cpp']]], - ['hipeventinterprocess',['hipEventInterprocess',['../group__GlobalDefs.html#ga0f01d74059baa704e42aeff8222166bb',1,'hip_runtime_api.h']]], - ['hipeventquery',['hipEventQuery',['../group__Event.html#ga5d12d7b798b5ceb5932d1ac21f5ac776',1,'hipEventQuery(hipEvent_t event): hip_hcc.cpp'],['../group__Event.html#ga5d12d7b798b5ceb5932d1ac21f5ac776',1,'hipEventQuery(hipEvent_t event): hip_hcc.cpp']]], - 
['hipeventrecord',['hipEventRecord',['../group__Event.html#gace88ebd8c7ec42a6c2cebda2e8b0cb38',1,'hipEventRecord(hipEvent_t event, hipStream_t stream=NULL): hip_hcc.cpp'],['../group__Event.html#gace88ebd8c7ec42a6c2cebda2e8b0cb38',1,'hipEventRecord(hipEvent_t event, hipStream_t stream): hip_hcc.cpp']]], - ['hipeventsynchronize',['hipEventSynchronize',['../group__Event.html#ga1f72d98ba5d6f7dc3da54e0c41fe38b1',1,'hipEventSynchronize(hipEvent_t event): hip_hcc.cpp'],['../group__Event.html#ga1f72d98ba5d6f7dc3da54e0c41fe38b1',1,'hipEventSynchronize(hipEvent_t event): hip_hcc.cpp']]], - ['hipfree',['hipFree',['../group__Memory.html#ga740d08da65cae1441ba32f8fedb863d1',1,'hipFree(void *ptr): hip_hcc.cpp'],['../group__Memory.html#ga740d08da65cae1441ba32f8fedb863d1',1,'hipFree(void *ptr): hip_hcc.cpp']]], - ['hipfreehost',['hipFreeHost',['../group__Memory.html#ga28d7d92836116dfadeb62e416ee887d3',1,'hipFreeHost(void *ptr): hip_hcc.cpp'],['../group__Memory.html#ga28d7d92836116dfadeb62e416ee887d3',1,'hipFreeHost(void *ptr): hip_hcc.cpp']]], - ['hipfunccache',['hipFuncCache',['../group__GlobalDefs.html#gac7e4bfd88340fc06642136c839a3d822',1,'hipFuncCache(): hip_runtime_api.h'],['../group__GlobalDefs.html#gaad15dc7939a0a25b16e4aa161fb41eee',1,'hipFuncCache(): hip_runtime_api.h']]], - ['hipfunccachepreferequal',['hipFuncCachePreferEqual',['../group__GlobalDefs.html#ggac7e4bfd88340fc06642136c839a3d822a0ddab0e840107634a152033103be44d7',1,'hip_runtime_api.h']]], - ['hipfunccachepreferl1',['hipFuncCachePreferL1',['../group__GlobalDefs.html#ggac7e4bfd88340fc06642136c839a3d822a636a3c140db6b9d4a8bf7d5a61c398c5',1,'hip_runtime_api.h']]], - ['hipfunccacheprefernone',['hipFuncCachePreferNone',['../group__GlobalDefs.html#ggac7e4bfd88340fc06642136c839a3d822a0813fbaa008ce1231ff9fed3911eb3af',1,'hip_runtime_api.h']]], - ['hipfunccacheprefershared',['hipFuncCachePreferShared',['../group__GlobalDefs.html#ggac7e4bfd88340fc06642136c839a3d822a9b34337dfbadba25ed2aa270bbcabc43',1,'hip_runtime_api.h']]], 
- ['hipfuncsetcacheconfig',['hipFuncSetCacheConfig',['../group__Device.html#gadd94a910c2b840833cc325b1e5425702',1,'hipFuncSetCacheConfig(hipFuncCache config): hip_hcc.cpp'],['../group__Device.html#gadd94a910c2b840833cc325b1e5425702',1,'hipFuncSetCacheConfig(hipFuncCache cacheConfig): hip_hcc.cpp']]], - ['hipgetdevice',['hipGetDevice',['../group__Device.html#gaffc83567f2df3bbe2d37a19872d60f24',1,'hipGetDevice(int *device): hip_hcc.cpp'],['../group__Device.html#gaffc83567f2df3bbe2d37a19872d60f24',1,'hipGetDevice(int *device): hip_hcc.cpp']]], - ['hipgetdevicecount',['hipGetDeviceCount',['../group__Device.html#ga8555d5c76d88c50ddbf54ae70b568394',1,'hipGetDeviceCount(int *count): hip_hcc.cpp'],['../group__Device.html#ga8555d5c76d88c50ddbf54ae70b568394',1,'hipGetDeviceCount(int *count): hip_hcc.cpp']]], - ['hipgeterrorname',['hipGetErrorName',['../group__Error.html#ga88c474d77635523dbf6ca67be7b56999',1,'hipGetErrorName(hipError_t hip_error): hip_hcc.cpp'],['../group__Error.html#ga88c474d77635523dbf6ca67be7b56999',1,'hipGetErrorName(hipError_t hip_error): hip_hcc.cpp']]], - ['hipgeterrorstring',['hipGetErrorString',['../group__Error.html#ga5959779a654bbc98ffe6d36ab536740a',1,'hipGetErrorString(hipError_t hip_error): hip_hcc.cpp'],['../group__Error.html#ga5959779a654bbc98ffe6d36ab536740a',1,'hipGetErrorString(hipError_t hip_error): hip_hcc.cpp']]], - ['hipgetlasterror',['hipGetLastError',['../group__Error.html#ga533daeb9114d7fc2db8d867adf9e419b',1,'hipGetLastError(void): hip_hcc.cpp'],['../group__Error.html#ga533daeb9114d7fc2db8d867adf9e419b',1,'hipGetLastError(): hip_hcc.cpp']]], - ['hiphccgetaccelerator',['hipHccGetAccelerator',['../group__HCC__Specific.html#ga0d24b3157fd1b16d38672bb157ec4cd4',1,'hipHccGetAccelerator(int deviceId, hc::accelerator *acc): hip_hcc.cpp'],['../group__HCC__Specific.html#ga0d24b3157fd1b16d38672bb157ec4cd4',1,'hipHccGetAccelerator(int deviceId, hc::accelerator *acc): hip_hcc.cpp']]], - 
['hiphccgetacceleratorview',['hipHccGetAcceleratorView',['../group__HCC__Specific.html#ga1a7087ea9c3c3323270d7cce73650b44',1,'hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **av): hip_hcc.cpp'],['../group__HCC__Specific.html#ga1a7087ea9c3c3323270d7cce73650b44',1,'hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **av): hip_hcc.cpp']]], - ['hipmalloc',['hipMalloc',['../group__Memory.html#ga4c6fcfe80010069d2792780d00dcead2',1,'hipMalloc(void **ptr, size_t size): hip_hcc.cpp'],['../group__Memory.html#ga4c6fcfe80010069d2792780d00dcead2',1,'hipMalloc(void **ptr, size_t sizeBytes): hip_hcc.cpp']]], - ['hipmallochost',['hipMallocHost',['../group__Memory.html#ga66399e729223ff5b66ffc16297c0710e',1,'hipMallocHost(void **ptr, size_t size): hip_hcc.cpp'],['../group__Memory.html#ga66399e729223ff5b66ffc16297c0710e',1,'hipMallocHost(void **ptr, size_t sizeBytes): hip_hcc.cpp']]], - ['hipmemcpy',['hipMemcpy',['../group__Memory.html#gac1a055d288302edd641c6d7416858e1e',1,'hipMemcpy(void *dst, const void *src, size_t sizeBytes, hipMemcpyKind kind): hip_hcc.cpp'],['../group__Memory.html#gac1a055d288302edd641c6d7416858e1e',1,'hipMemcpy(void *dst, const void *src, size_t sizeBytes, hipMemcpyKind kind): hip_hcc.cpp']]], - ['hipmemcpyasync',['hipMemcpyAsync',['../group__Memory.html#ga8ad5a0b13458917e1b9437732b21af54',1,'hipMemcpyAsync(void *dst, const void *src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream=0): hip_hcc.cpp'],['../group__Memory.html#ga8ad5a0b13458917e1b9437732b21af54',1,'hipMemcpyAsync(void *dst, const void *src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream): hip_hcc.cpp']]], - ['hipmemcpydefault',['hipMemcpyDefault',['../group__GlobalDefs.html#gga232e222db36b1fc672ba98054d036a18a4e37107e416f79a2edf2b6534163c823',1,'hip_runtime_api.h']]], - ['hipmemcpydevicetodevice',['hipMemcpyDeviceToDevice',['../group__GlobalDefs.html#gga232e222db36b1fc672ba98054d036a18abd05a09d3105e0ce25b34dd91cf83f88',1,'hip_runtime_api.h']]], 
- ['hipmemcpydevicetohost',['hipMemcpyDeviceToHost',['../group__GlobalDefs.html#gga232e222db36b1fc672ba98054d036a18aba2505e9ce1e5382f17730bc670917d1',1,'hip_runtime_api.h']]], - ['hipmemcpyhosttodevice',['hipMemcpyHostToDevice',['../group__GlobalDefs.html#gga232e222db36b1fc672ba98054d036a18aff32175ecb0c7113200286eff8211008',1,'hip_runtime_api.h']]], - ['hipmemcpyhosttohost',['hipMemcpyHostToHost',['../group__GlobalDefs.html#gga232e222db36b1fc672ba98054d036a18a9d66b705aa85a9c83f0f533cef70d0af',1,'hip_runtime_api.h']]], - ['hipmemcpykind',['hipMemcpyKind',['../group__GlobalDefs.html#ga232e222db36b1fc672ba98054d036a18',1,'hip_runtime_api.h']]], - ['hipmemcpypeerasync',['hipMemcpyPeerAsync',['../group__PeerToPeer.html#gab6211c18ca1e23252ef080cd6be855ca',1,'hipMemcpyPeerAsync(void *dst, int dstDevice, const void *src, int srcDevice, size_t sizeBytes, hipStream_t stream=0): hip_hcc.cpp'],['../group__PeerToPeer.html#gab6211c18ca1e23252ef080cd6be855ca',1,'hipMemcpyPeerAsync(void *dst, int dstDevice, const void *src, int srcDevice, size_t sizeBytes, hipStream_t stream): hip_hcc.cpp']]], - ['hippeekatlasterror',['hipPeekAtLastError',['../group__Error.html#ga1dd660bc739f7e13edd34615660f0148',1,'hip_runtime_api.h']]], - ['hipsetdevice',['hipSetDevice',['../group__Device.html#ga8ec0b093af0adadc7fe98bf33fa21620',1,'hipSetDevice(int device): hip_hcc.cpp'],['../group__Device.html#ga8ec0b093af0adadc7fe98bf33fa21620',1,'hipSetDevice(int device): hip_hcc.cpp']]], - ['hipsharedmembanksizedefault',['hipSharedMemBankSizeDefault',['../group__GlobalDefs.html#gga2e17b71d94ac350f2ccd914fd49d104eaf5b325c9b7bde878913f768eaba5014d',1,'hip_runtime_api.h']]], - ['hipsharedmembanksizeeightbyte',['hipSharedMemBankSizeEightByte',['../group__GlobalDefs.html#gga2e17b71d94ac350f2ccd914fd49d104ea64518b4f5a25f536c883330167e79258',1,'hip_runtime_api.h']]], - 
['hipsharedmembanksizefourbyte',['hipSharedMemBankSizeFourByte',['../group__GlobalDefs.html#gga2e17b71d94ac350f2ccd914fd49d104ea0a95a6e0c33106c42d66ab9476ff954a',1,'hip_runtime_api.h']]], - ['hipsharedmemconfig',['hipSharedMemConfig',['../group__GlobalDefs.html#ga2e17b71d94ac350f2ccd914fd49d104e',1,'hipSharedMemConfig(): hip_runtime_api.h'],['../group__GlobalDefs.html#ga6b1ca424fa26a5fb718937d662eaee7f',1,'hipSharedMemConfig(): hip_runtime_api.h']]], - ['hipstreamcreatewithflags',['hipStreamCreateWithFlags',['../group__Stream.html#gaf2382e3cc6632332a8983a0f58e43494',1,'hipStreamCreateWithFlags(hipStream_t *stream, unsigned int flags): hip_hcc.cpp'],['../group__Stream.html#gaf2382e3cc6632332a8983a0f58e43494',1,'hipStreamCreateWithFlags(hipStream_t *stream, unsigned int flags): hip_hcc.cpp']]], - ['hipstreamdefault',['hipStreamDefault',['../group__GlobalDefs.html#ga6df5f70eb976836ab3598cacf0ffcdf9',1,'hip_runtime_api.h']]], - ['hipstreamdestroy',['hipStreamDestroy',['../group__Stream.html#ga3076a3499ed2c7821311006100bb95ec',1,'hipStreamDestroy(hipStream_t stream): hip_hcc.cpp'],['../group__Stream.html#ga3076a3499ed2c7821311006100bb95ec',1,'hipStreamDestroy(hipStream_t stream): hip_hcc.cpp']]], - ['hipstreamgetflags',['hipStreamGetFlags',['../group__Stream.html#ga3249555a26439591b8873f70b39bb116',1,'hipStreamGetFlags(hipStream_t stream, unsigned int *flags): hip_hcc.cpp'],['../group__Stream.html#ga3249555a26439591b8873f70b39bb116',1,'hipStreamGetFlags(hipStream_t stream, unsigned int *flags): hip_hcc.cpp']]], - ['hipstreamnonblocking',['hipStreamNonBlocking',['../group__GlobalDefs.html#gaaba9ae995d9b43b7d1ee70c6fa12c57d',1,'hip_runtime_api.h']]], - ['hipstreamsynchronize',['hipStreamSynchronize',['../group__Stream.html#gabbfb9f573a6ebe8c478605ecb5504a74',1,'hipStreamSynchronize(hipStream_t stream): hip_hcc.cpp'],['../group__Stream.html#gabbfb9f573a6ebe8c478605ecb5504a74',1,'hipStreamSynchronize(hipStream_t stream): hip_hcc.cpp']]], - 
['hipstreamwaitevent',['hipStreamWaitEvent',['../group__Stream.html#gacdd84c8f8ef1539c96c57c1d5bcae633',1,'hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int flags): hip_hcc.cpp'],['../group__Stream.html#gacdd84c8f8ef1539c96c57c1d5bcae633',1,'hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int flags): hip_hcc.cpp']]], - ['hipsuccess',['hipSuccess',['../group__GlobalDefs.html#ggadf5010f6e140a53ecbdf949e73e87594aadfbdb847b149723c684ebd764556063',1,'hip_runtime_api.h']]], - ['hipthreadidx_5fx',['hipThreadIdx_x',['../hcc__detail_2hip__runtime_8h.html#a48f5f9da77c5fab1fbcf0205bb347d89',1,'hip_runtime.h']]], - ['heterogeneous_2dcomputing_20interface_20for_20portability_20_28hip_29',['Heterogeneous-computing Interface for Portability (HIP)',['../index.html',1,'']]] + ['global_20enum_20and_20defines',['Global enum and defines',['../group__GlobalDefs.html',1,'']]] ]; diff --git a/projects/hip/docs/RuntimeAPI/html/search/all_7.js b/projects/hip/docs/RuntimeAPI/html/search/all_7.js index 69beb95adf..be2e145b56 100644 --- a/projects/hip/docs/RuntimeAPI/html/search/all_7.js +++ b/projects/hip/docs/RuntimeAPI/html/search/all_7.js @@ -1,6 +1,144 @@ var searchData= [ - ['ihipdevice_5ft',['ihipDevice_t',['../structihipDevice__t.html',1,'']]], - ['ihipevent_5ft',['ihipEvent_t',['../structihipEvent__t.html',1,'']]], - ['ihipstream_5ft',['ihipStream_t',['../structihipStream__t.html',1,'']]] + ['hip_20api',['HIP API',['../group__API.html',1,'']]], + ['has3dgrid',['has3dGrid',['../structhipDeviceArch__t.html#aa5e22d295cce0d9a34ee6e7e7e378c26',1,'hipDeviceArch_t']]], + ['hasdoubles',['hasDoubles',['../structhipDeviceArch__t.html#a2d624e3d85e615b71d1182f8912893b4',1,'hipDeviceArch_t']]], + ['hasdynamicparallelism',['hasDynamicParallelism',['../structhipDeviceArch__t.html#a9114bfc718bf0648b54ff9a319a36b35',1,'hipDeviceArch_t']]], + 
['hasfloatatomicadd',['hasFloatAtomicAdd',['../structhipDeviceArch__t.html#a2321d6ef74aac91c044f3289d25b2d41',1,'hipDeviceArch_t']]], + ['hasfunnelshift',['hasFunnelShift',['../structhipDeviceArch__t.html#aaec4f2d983d4602858fae8e9ddeee3ff',1,'hipDeviceArch_t']]], + ['hasglobalfloatatomicexch',['hasGlobalFloatAtomicExch',['../structhipDeviceArch__t.html#a8f213ae9a4729dff1c636ac5de0e2fa2',1,'hipDeviceArch_t']]], + ['hasglobalint32atomics',['hasGlobalInt32Atomics',['../structhipDeviceArch__t.html#a8d00c3ab98869b602c714fe7abe68e93',1,'hipDeviceArch_t']]], + ['hasglobalint64atomics',['hasGlobalInt64Atomics',['../structhipDeviceArch__t.html#ad5aa54dbab22dbcd8cf98f57a96c6636',1,'hipDeviceArch_t']]], + ['hassharedfloatatomicexch',['hasSharedFloatAtomicExch',['../structhipDeviceArch__t.html#aff005558b4edabd27b27f286ac5b2f2b',1,'hipDeviceArch_t']]], + ['hassharedint32atomics',['hasSharedInt32Atomics',['../structhipDeviceArch__t.html#a1596330b1cb9cc73f142aee11b2ab853',1,'hipDeviceArch_t']]], + ['hassharedint64atomics',['hasSharedInt64Atomics',['../structhipDeviceArch__t.html#a00c2b930fcdcad9ea7b54b449db13966',1,'hipDeviceArch_t']]], + ['hassurfacefuncs',['hasSurfaceFuncs',['../structhipDeviceArch__t.html#a9eb2462148686d4c048b69b6e09f835e',1,'hipDeviceArch_t']]], + ['hassyncthreadsext',['hasSyncThreadsExt',['../structhipDeviceArch__t.html#ade6a3b21ad5f344dcd92c52102c274ba',1,'hipDeviceArch_t']]], + ['hasthreadfencesystem',['hasThreadFenceSystem',['../structhipDeviceArch__t.html#ac2818e3b91cba8beb36741e9867bb887',1,'hipDeviceArch_t']]], + ['haswarpballot',['hasWarpBallot',['../structhipDeviceArch__t.html#af1e934a8a5106995bcc256287585564c',1,'hipDeviceArch_t']]], + ['haswarpshuffle',['hasWarpShuffle',['../structhipDeviceArch__t.html#a3d922e8fc97ca1e8ecc39600b138fa2d',1,'hipDeviceArch_t']]], + ['haswarpvote',['hasWarpVote',['../structhipDeviceArch__t.html#a35bde017352eca1d4e0eceb3bf79f274',1,'hipDeviceArch_t']]], + ['hcc_2dspecific_20accessors',['HCC-Specific 
Accessors',['../group__HCC__Specific.html',1,'']]], + ['hip_20environment_20variables',['HIP Environment Variables',['../group__HIP-ENV.html',1,'']]], + ['hip_5fhcc_2ecpp',['hip_hcc.cpp',['../hip__hcc_8cpp.html',1,'']]], + ['hip_5flaunch_5fblocking',['HIP_LAUNCH_BLOCKING',['../group__HIP-ENV.html#ga8049b329f2663b4572d81e7a9aa8a155',1,'HIP_LAUNCH_BLOCKING(): hip_hcc.cpp'],['../group__HIP-ENV.html#ga8049b329f2663b4572d81e7a9aa8a155',1,'HIP_LAUNCH_BLOCKING(): hip_hcc.cpp']]], + ['hip_5fprint_5fenv',['HIP_PRINT_ENV',['../group__HIP-ENV.html#ga1e1c85dbb250f1acfb484c1be1f3b28a',1,'HIP_PRINT_ENV(): hip_hcc.cpp'],['../group__HIP-ENV.html#ga1e1c85dbb250f1acfb484c1be1f3b28a',1,'HIP_PRINT_ENV(): hip_hcc.cpp']]], + ['hip_5fruntime_2eh',['hip_runtime.h',['../hcc__detail_2hip__runtime_8h.html',1,'']]], + ['hip_5fruntime_5fapi_2eh',['hip_runtime_api.h',['../hcc__detail_2hip__runtime__api_8h.html',1,'']]], + ['hip_5ftexture_2eh',['hip_texture.h',['../hip__texture_8h.html',1,'']]], + ['hip_5ftrace_5fapi',['HIP_TRACE_API',['../group__HIP-ENV.html#gaae9c541f3e25b8f002762337a03fec28',1,'HIP_TRACE_API(): hip_hcc.cpp'],['../group__HIP-ENV.html#gaae9c541f3e25b8f002762337a03fec28',1,'HIP_TRACE_API(): hip_hcc.cpp']]], + ['hip_5fvector_5ftypes_2eh',['hip_vector_types.h',['../hcc__detail_2hip__vector__types_8h.html',1,'']]], + ['hipchannelformatdesc',['hipChannelFormatDesc',['../structhipChannelFormatDesc.html',1,'']]], + ['hipdevicearch_5ft',['hipDeviceArch_t',['../structhipDeviceArch__t.html',1,'']]], + ['hipdeviceattribute_5ft',['hipDeviceAttribute_t',['../group__GlobalDefs.html#gacc0acd7b9bda126c6bb3dfd6e2796d7c',1,'hip_runtime_api.h']]], + ['hipdeviceattributeclockrate',['hipDeviceAttributeClockRate',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7ca2300e077e020e7967592065561373b00',1,'hip_runtime_api.h']]], + 
['hipdeviceattributecomputecapabilitymajor',['hipDeviceAttributeComputeCapabilityMajor',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7ca2735739cf977b7d303266f6781131e8d',1,'hip_runtime_api.h']]], + ['hipdeviceattributecomputecapabilityminor',['hipDeviceAttributeComputeCapabilityMinor',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7ca38edc4fcae456e47160d349da3249b85',1,'hip_runtime_api.h']]], + ['hipdeviceattributecomputemode',['hipDeviceAttributeComputeMode',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7ca4d0369a6ef7bd7890fdcabc16ed3385d',1,'hip_runtime_api.h']]], + ['hipdeviceattributeconcurrentkernels',['hipDeviceAttributeConcurrentKernels',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7cad9f45254d0d048677f560032532d5504',1,'hip_runtime_api.h']]], + ['hipdeviceattributel2cachesize',['hipDeviceAttributeL2CacheSize',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7ca582ae5a26a7148504878890028e4b64c',1,'hip_runtime_api.h']]], + ['hipdeviceattributemaxblockdimx',['hipDeviceAttributeMaxBlockDimX',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7cac1e4ac589db0d8adbbc241e3d0fcd594',1,'hip_runtime_api.h']]], + ['hipdeviceattributemaxblockdimy',['hipDeviceAttributeMaxBlockDimY',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7ca187dbffe12db09a56c0f75c340d879c9',1,'hip_runtime_api.h']]], + ['hipdeviceattributemaxblockdimz',['hipDeviceAttributeMaxBlockDimZ',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7caf811f51e03d1ffb025d80ac1da088675',1,'hip_runtime_api.h']]], + ['hipdeviceattributemaxgriddimx',['hipDeviceAttributeMaxGridDimX',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7ca03db8df0e7a9fbdaae683d97e8ac9c87',1,'hip_runtime_api.h']]], + ['hipdeviceattributemaxgriddimy',['hipDeviceAttributeMaxGridDimY',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7ca5b5cc49972679c5ccf62b79425ee99df',1,'hip_runtime_api.h']]], + 
['hipdeviceattributemaxgriddimz',['hipDeviceAttributeMaxGridDimZ',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7ca6c206ac083999caf4640e5d91dae24f7',1,'hip_runtime_api.h']]], + ['hipdeviceattributemaxregistersperblock',['hipDeviceAttributeMaxRegistersPerBlock',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7ca82289b170192b6ea742be0efc6f95107',1,'hip_runtime_api.h']]], + ['hipdeviceattributemaxsharedmemoryperblock',['hipDeviceAttributeMaxSharedMemoryPerBlock',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7ca7bca3aa18b26d40eba043ae93e15c7e5',1,'hip_runtime_api.h']]], + ['hipdeviceattributemaxsharedmemorypermultiprocessor',['hipDeviceAttributeMaxSharedMemoryPerMultiprocessor',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7cad3e7f3d01533b32e12211172fcf410ba',1,'hip_runtime_api.h']]], + ['hipdeviceattributemaxthreadsperblock',['hipDeviceAttributeMaxThreadsPerBlock',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7ca8327aa23782d9c994bdef33a6d62e02e',1,'hip_runtime_api.h']]], + ['hipdeviceattributemaxthreadspermultiprocessor',['hipDeviceAttributeMaxThreadsPerMultiProcessor',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7caddc08922b491eb1f6a583833cbf4e2f0',1,'hip_runtime_api.h']]], + ['hipdeviceattributememoryclockrate',['hipDeviceAttributeMemoryClockRate',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7ca6b68deafd65f036b30dc8051573eb000',1,'hip_runtime_api.h']]], + ['hipdeviceattributemultiprocessorcount',['hipDeviceAttributeMultiprocessorCount',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7ca5c1519870733ccf0b83f722678240e5f',1,'hip_runtime_api.h']]], + ['hipdeviceattributepcibusid',['hipDeviceAttributePciBusId',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7ca572b29c44f1322aa7657fdd784832f88',1,'hip_runtime_api.h']]], + 
['hipdeviceattributepcideviceid',['hipDeviceAttributePciDeviceId',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7ca955d90286e87be9e3528f0b817ab32ff',1,'hip_runtime_api.h']]], + ['hipdeviceattributetotalconstantmemory',['hipDeviceAttributeTotalConstantMemory',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7cac6089ac3a0f9c77cc382fb0eaa73ae9c',1,'hip_runtime_api.h']]], + ['hipdeviceattributewarpsize',['hipDeviceAttributeWarpSize',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7caffd94133e823247a6f1215343232f6ec',1,'hip_runtime_api.h']]], + ['hipdevicecanaccesspeer',['hipDeviceCanAccessPeer',['../group__PeerToPeer.html#gab53a55dbc087ff659918fd04287de3d3',1,'hipDeviceCanAccessPeer(int *canAccessPeer, int device, int peerDevice): hip_hcc.cpp'],['../group__PeerToPeer.html#gab53a55dbc087ff659918fd04287de3d3',1,'hipDeviceCanAccessPeer(int *canAccessPeer, int device, int peerDevice): hip_hcc.cpp']]], + ['hipdevicedisablepeeraccess',['hipDeviceDisablePeerAccess',['../group__PeerToPeer.html#ga41e60c01f63597529da1cd77bdd55379',1,'hipDeviceDisablePeerAccess(int peerDevice): hip_hcc.cpp'],['../group__PeerToPeer.html#ga41e60c01f63597529da1cd77bdd55379',1,'hipDeviceDisablePeerAccess(int peerDevice): hip_hcc.cpp']]], + ['hipdeviceenablepeeraccess',['hipDeviceEnablePeerAccess',['../group__PeerToPeer.html#ga098e0d626edbfb69b66d141a5a8b7dc6',1,'hipDeviceEnablePeerAccess(int peerDevice, unsigned int flags): hip_hcc.cpp'],['../group__PeerToPeer.html#ga098e0d626edbfb69b66d141a5a8b7dc6',1,'hipDeviceEnablePeerAccess(int peerDevice, unsigned int flags): hip_hcc.cpp']]], + ['hipdevicegetattribute',['hipDeviceGetAttribute',['../group__Device.html#gac49518ff2b26b98ea2ec9e9268761a24',1,'hipDeviceGetAttribute(int *pi, hipDeviceAttribute_t attr, int device): hip_hcc.cpp'],['../group__Device.html#gac49518ff2b26b98ea2ec9e9268761a24',1,'hipDeviceGetAttribute(int *pi, hipDeviceAttribute_t attr, int device): hip_hcc.cpp']]], + 
['hipdevicegetcacheconfig',['hipDeviceGetCacheConfig',['../group__Device.html#gaeeffa2456c5430400bea75ecd6ad1e68',1,'hipDeviceGetCacheConfig(hipFuncCache *cacheConfig): hip_hcc.cpp'],['../group__Device.html#gaeeffa2456c5430400bea75ecd6ad1e68',1,'hipDeviceGetCacheConfig(hipFuncCache *cacheConfig): hip_hcc.cpp']]], + ['hipdevicegetproperties',['hipDeviceGetProperties',['../group__Device.html#gad9ee6822e3e55431811fb6a00f7a1c10',1,'hipDeviceGetProperties(hipDeviceProp_t *prop, int device): hip_hcc.cpp'],['../group__Device.html#gad9ee6822e3e55431811fb6a00f7a1c10',1,'hipDeviceGetProperties(hipDeviceProp_t *props, int device): hip_hcc.cpp']]], + ['hipdevicegetsharedmemconfig',['hipDeviceGetSharedMemConfig',['../group__Device.html#ga1bb08f774a34a468d969a8a04791c9bb',1,'hipDeviceGetSharedMemConfig(hipSharedMemConfig *pConfig): hip_hcc.cpp'],['../group__Device.html#ga1bb08f774a34a468d969a8a04791c9bb',1,'hipDeviceGetSharedMemConfig(hipSharedMemConfig *pConfig): hip_hcc.cpp']]], + ['hipdeviceprop_5ft',['hipDeviceProp_t',['../structhipDeviceProp__t.html',1,'']]], + ['hipdevicereset',['hipDeviceReset',['../group__Device.html#ga8d57161ae56a8edc46eeda447417bf6c',1,'hipDeviceReset(void): hip_hcc.cpp'],['../group__Device.html#ga8d57161ae56a8edc46eeda447417bf6c',1,'hipDeviceReset(void): hip_hcc.cpp']]], + ['hipdevicesetcacheconfig',['hipDeviceSetCacheConfig',['../group__Device.html#gac2b282179f29c4c0ca7b5391242c6a4c',1,'hipDeviceSetCacheConfig(hipFuncCache cacheConfig): hip_hcc.cpp'],['../group__Device.html#gac2b282179f29c4c0ca7b5391242c6a4c',1,'hipDeviceSetCacheConfig(hipFuncCache cacheConfig): hip_hcc.cpp']]], + ['hipdevicesetsharedmemconfig',['hipDeviceSetSharedMemConfig',['../group__Device.html#ga9b1f279084e76691cedfbfadf9c717ee',1,'hipDeviceSetSharedMemConfig(hipSharedMemConfig config): hip_hcc.cpp'],['../group__Device.html#ga9b1f279084e76691cedfbfadf9c717ee',1,'hipDeviceSetSharedMemConfig(hipSharedMemConfig config): hip_hcc.cpp']]], + 
['hipdevicesynchronize',['hipDeviceSynchronize',['../group__Device.html#gaefdc2847fb1d6c3fb1354e827a191ebd',1,'hipDeviceSynchronize(void): hip_hcc.cpp'],['../group__Device.html#gaefdc2847fb1d6c3fb1354e827a191ebd',1,'hipDeviceSynchronize(void): hip_hcc.cpp']]], + ['hipdrivergetversion',['hipDriverGetVersion',['../group__Version.html#gaf6c342f52d2a29a0aca5cdd89b4dd47c',1,'hipDriverGetVersion(int *driverVersion): hip_hcc.cpp'],['../group__Version.html#gaf6c342f52d2a29a0aca5cdd89b4dd47c',1,'hipDriverGetVersion(int *driverVersion): hip_hcc.cpp']]], + ['hiperror_5ft',['hipError_t',['../group__GlobalDefs.html#gadf5010f6e140a53ecbdf949e73e87594',1,'hip_runtime_api.h']]], + ['hiperrorinvaliddevice',['hipErrorInvalidDevice',['../group__GlobalDefs.html#ggadf5010f6e140a53ecbdf949e73e87594a07ab9b704ea693c1781a52741c60cd0d',1,'hip_runtime_api.h']]], + ['hiperrorinvalidresourcehandle',['hipErrorInvalidResourceHandle',['../group__GlobalDefs.html#ggadf5010f6e140a53ecbdf949e73e87594a88e525a7c8f35552dfada58e9f2f6d3a',1,'hip_runtime_api.h']]], + ['hiperrorinvalidvalue',['hipErrorInvalidValue',['../group__GlobalDefs.html#ggadf5010f6e140a53ecbdf949e73e87594a1e8215fe1108a508bad3944bce7b4d83',1,'hip_runtime_api.h']]], + ['hiperrormemoryallocation',['hipErrorMemoryAllocation',['../group__GlobalDefs.html#ggadf5010f6e140a53ecbdf949e73e87594a8293288a10109874749afe2562db09f2',1,'hip_runtime_api.h']]], + ['hiperrormemoryfree',['hipErrorMemoryFree',['../group__GlobalDefs.html#ggadf5010f6e140a53ecbdf949e73e87594a813b3f014e2a3932d1f0e3e712cf9d3c',1,'hip_runtime_api.h']]], + ['hiperrornodevice',['hipErrorNoDevice',['../group__GlobalDefs.html#ggadf5010f6e140a53ecbdf949e73e87594ad4406972c318df36d231310a15131c24',1,'hip_runtime_api.h']]], + ['hiperrornotready',['hipErrorNotReady',['../group__GlobalDefs.html#ggadf5010f6e140a53ecbdf949e73e87594aa9638063c8746a9d1fda2b2069a0a9f1',1,'hip_runtime_api.h']]], + 
['hiperroroutofresources',['hipErrorOutOfResources',['../group__GlobalDefs.html#ggadf5010f6e140a53ecbdf949e73e87594a60c1c080b79bdde9ef5e808f974ac9ed',1,'hip_runtime_api.h']]], + ['hiperrortbd',['hipErrorTbd',['../group__GlobalDefs.html#ggadf5010f6e140a53ecbdf949e73e87594ab556409e11ddb0c4cf77a2f4fc91ea9e',1,'hip_runtime_api.h']]], + ['hiperrorunknown',['hipErrorUnknown',['../group__GlobalDefs.html#ggadf5010f6e140a53ecbdf949e73e87594aa74e64c5b2f5fb0d6a92681f5b234073',1,'hip_runtime_api.h']]], + ['hiperrorunknownsymbol',['hipErrorUnknownSymbol',['../group__GlobalDefs.html#ggadf5010f6e140a53ecbdf949e73e87594a45b297e6c3b2029dce1348658421481b',1,'hip_runtime_api.h']]], + ['hipevent_5ft',['hipEvent_t',['../structhipEvent__t.html',1,'']]], + ['hipeventblockingsync',['hipEventBlockingSync',['../group__GlobalDefs.html#gafa1c076a5b991763a98695063f1ea11d',1,'hip_runtime_api.h']]], + ['hipeventcreatewithflags',['hipEventCreateWithFlags',['../group__Event.html#gae86a5acb1b22b61bc9ecb9c28fc71b75',1,'hipEventCreateWithFlags(hipEvent_t *event, unsigned flags): hip_hcc.cpp'],['../group__Event.html#gae86a5acb1b22b61bc9ecb9c28fc71b75',1,'hipEventCreateWithFlags(hipEvent_t *event, unsigned flags): hip_hcc.cpp']]], + ['hipeventdefault',['hipEventDefault',['../group__GlobalDefs.html#ga122a5853359eba97cf047ddd153740f0',1,'hip_runtime_api.h']]], + ['hipeventdestroy',['hipEventDestroy',['../group__Event.html#ga83260357dce0c39e8c6a3c74ec97484c',1,'hipEventDestroy(hipEvent_t event): hip_hcc.cpp'],['../group__Event.html#ga83260357dce0c39e8c6a3c74ec97484c',1,'hipEventDestroy(hipEvent_t event): hip_hcc.cpp']]], + ['hipeventdisabletiming',['hipEventDisableTiming',['../group__GlobalDefs.html#ga3c0f44a85e36a4c67671da6bcdad0351',1,'hip_runtime_api.h']]], + ['hipeventelapsedtime',['hipEventElapsedTime',['../group__Event.html#gad4128b815cb475c8e13c7e66ff6250b7',1,'hipEventElapsedTime(float *ms, hipEvent_t start, hipEvent_t stop): 
hip_hcc.cpp'],['../group__Event.html#gad4128b815cb475c8e13c7e66ff6250b7',1,'hipEventElapsedTime(float *ms, hipEvent_t start, hipEvent_t stop): hip_hcc.cpp']]], + ['hipeventinterprocess',['hipEventInterprocess',['../group__GlobalDefs.html#ga0f01d74059baa704e42aeff8222166bb',1,'hip_runtime_api.h']]], + ['hipeventquery',['hipEventQuery',['../group__Event.html#ga5d12d7b798b5ceb5932d1ac21f5ac776',1,'hipEventQuery(hipEvent_t event): hip_hcc.cpp'],['../group__Event.html#ga5d12d7b798b5ceb5932d1ac21f5ac776',1,'hipEventQuery(hipEvent_t event): hip_hcc.cpp']]], + ['hipeventrecord',['hipEventRecord',['../group__Event.html#gace88ebd8c7ec42a6c2cebda2e8b0cb38',1,'hipEventRecord(hipEvent_t event, hipStream_t stream=NULL): hip_hcc.cpp'],['../group__Event.html#gace88ebd8c7ec42a6c2cebda2e8b0cb38',1,'hipEventRecord(hipEvent_t event, hipStream_t stream): hip_hcc.cpp']]], + ['hipeventsynchronize',['hipEventSynchronize',['../group__Event.html#ga1f72d98ba5d6f7dc3da54e0c41fe38b1',1,'hipEventSynchronize(hipEvent_t event): hip_hcc.cpp'],['../group__Event.html#ga1f72d98ba5d6f7dc3da54e0c41fe38b1',1,'hipEventSynchronize(hipEvent_t event): hip_hcc.cpp']]], + ['hipfiltermodepoint',['hipFilterModePoint',['../hip__texture_8h.html#aa2f0b6002b81d0a43a808cb880bb21e6a56ede038ab7c805ec4b5b61d2b678dfc',1,'hip_texture.h']]], + ['hipfree',['hipFree',['../group__Memory.html#ga740d08da65cae1441ba32f8fedb863d1',1,'hipFree(void *ptr): hip_hcc.cpp'],['../group__Memory.html#ga740d08da65cae1441ba32f8fedb863d1',1,'hipFree(void *ptr): hip_hcc.cpp']]], + ['hipfreehost',['hipFreeHost',['../group__Memory.html#ga28d7d92836116dfadeb62e416ee887d3',1,'hipFreeHost(void *ptr): hip_hcc.cpp'],['../group__Memory.html#ga28d7d92836116dfadeb62e416ee887d3',1,'hipFreeHost(void *ptr): hip_hcc.cpp']]], + ['hipfunccache',['hipFuncCache',['../group__GlobalDefs.html#gac7e4bfd88340fc06642136c839a3d822',1,'hipFuncCache(): hip_runtime_api.h'],['../group__GlobalDefs.html#gaad15dc7939a0a25b16e4aa161fb41eee',1,'hipFuncCache(): 
hip_runtime_api.h']]], + ['hipfunccachepreferequal',['hipFuncCachePreferEqual',['../group__GlobalDefs.html#ggac7e4bfd88340fc06642136c839a3d822a0ddab0e840107634a152033103be44d7',1,'hip_runtime_api.h']]], + ['hipfunccachepreferl1',['hipFuncCachePreferL1',['../group__GlobalDefs.html#ggac7e4bfd88340fc06642136c839a3d822a636a3c140db6b9d4a8bf7d5a61c398c5',1,'hip_runtime_api.h']]], + ['hipfunccacheprefernone',['hipFuncCachePreferNone',['../group__GlobalDefs.html#ggac7e4bfd88340fc06642136c839a3d822a0813fbaa008ce1231ff9fed3911eb3af',1,'hip_runtime_api.h']]], + ['hipfunccacheprefershared',['hipFuncCachePreferShared',['../group__GlobalDefs.html#ggac7e4bfd88340fc06642136c839a3d822a9b34337dfbadba25ed2aa270bbcabc43',1,'hip_runtime_api.h']]], + ['hipfuncsetcacheconfig',['hipFuncSetCacheConfig',['../group__Device.html#gadd94a910c2b840833cc325b1e5425702',1,'hipFuncSetCacheConfig(hipFuncCache config): hip_hcc.cpp'],['../group__Device.html#gadd94a910c2b840833cc325b1e5425702',1,'hipFuncSetCacheConfig(hipFuncCache cacheConfig): hip_hcc.cpp']]], + ['hipgetdevice',['hipGetDevice',['../group__Device.html#gaffc83567f2df3bbe2d37a19872d60f24',1,'hipGetDevice(int *device): hip_hcc.cpp'],['../group__Device.html#gaffc83567f2df3bbe2d37a19872d60f24',1,'hipGetDevice(int *device): hip_hcc.cpp']]], + ['hipgetdevicecount',['hipGetDeviceCount',['../group__Device.html#ga8555d5c76d88c50ddbf54ae70b568394',1,'hipGetDeviceCount(int *count): hip_hcc.cpp'],['../group__Device.html#ga8555d5c76d88c50ddbf54ae70b568394',1,'hipGetDeviceCount(int *count): hip_hcc.cpp']]], + ['hipgeterrorname',['hipGetErrorName',['../group__Error.html#ga88c474d77635523dbf6ca67be7b56999',1,'hipGetErrorName(hipError_t hip_error): hip_hcc.cpp'],['../group__Error.html#ga88c474d77635523dbf6ca67be7b56999',1,'hipGetErrorName(hipError_t hip_error): hip_hcc.cpp']]], + ['hipgeterrorstring',['hipGetErrorString',['../group__Error.html#ga5959779a654bbc98ffe6d36ab536740a',1,'hipGetErrorString(hipError_t hip_error): 
hip_hcc.cpp'],['../group__Error.html#ga5959779a654bbc98ffe6d36ab536740a',1,'hipGetErrorString(hipError_t hip_error): hip_hcc.cpp']]], + ['hipgetlasterror',['hipGetLastError',['../group__Error.html#ga533daeb9114d7fc2db8d867adf9e419b',1,'hipGetLastError(void): hip_hcc.cpp'],['../group__Error.html#ga533daeb9114d7fc2db8d867adf9e419b',1,'hipGetLastError(): hip_hcc.cpp']]], + ['hiphccgetaccelerator',['hipHccGetAccelerator',['../group__HCC__Specific.html#ga0d24b3157fd1b16d38672bb157ec4cd4',1,'hipHccGetAccelerator(int deviceId, hc::accelerator *acc): hip_hcc.cpp'],['../group__HCC__Specific.html#ga0d24b3157fd1b16d38672bb157ec4cd4',1,'hipHccGetAccelerator(int deviceId, hc::accelerator *acc): hip_hcc.cpp']]], + ['hiphccgetacceleratorview',['hipHccGetAcceleratorView',['../group__HCC__Specific.html#ga1a7087ea9c3c3323270d7cce73650b44',1,'hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **av): hip_hcc.cpp'],['../group__HCC__Specific.html#ga1a7087ea9c3c3323270d7cce73650b44',1,'hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **av): hip_hcc.cpp']]], + ['hipmalloc',['hipMalloc',['../group__Memory.html#ga4c6fcfe80010069d2792780d00dcead2',1,'hipMalloc(void **ptr, size_t size): hip_hcc.cpp'],['../group__Memory.html#ga4c6fcfe80010069d2792780d00dcead2',1,'hipMalloc(void **ptr, size_t sizeBytes): hip_hcc.cpp']]], + ['hipmallochost',['hipMallocHost',['../group__Memory.html#ga66399e729223ff5b66ffc16297c0710e',1,'hipMallocHost(void **ptr, size_t size): hip_hcc.cpp'],['../group__Memory.html#ga66399e729223ff5b66ffc16297c0710e',1,'hipMallocHost(void **ptr, size_t sizeBytes): hip_hcc.cpp']]], + ['hipmemcpy',['hipMemcpy',['../group__Memory.html#gac1a055d288302edd641c6d7416858e1e',1,'hipMemcpy(void *dst, const void *src, size_t sizeBytes, hipMemcpyKind kind): hip_hcc.cpp'],['../group__Memory.html#gac1a055d288302edd641c6d7416858e1e',1,'hipMemcpy(void *dst, const void *src, size_t sizeBytes, hipMemcpyKind kind): hip_hcc.cpp']]], + 
['hipmemcpyasync',['hipMemcpyAsync',['../group__Memory.html#ga8ad5a0b13458917e1b9437732b21af54',1,'hipMemcpyAsync(void *dst, const void *src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream=0): hip_hcc.cpp'],['../group__Memory.html#ga8ad5a0b13458917e1b9437732b21af54',1,'hipMemcpyAsync(void *dst, const void *src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream): hip_hcc.cpp']]], + ['hipmemcpydefault',['hipMemcpyDefault',['../group__GlobalDefs.html#gga232e222db36b1fc672ba98054d036a18a4e37107e416f79a2edf2b6534163c823',1,'hip_runtime_api.h']]], + ['hipmemcpydevicetodevice',['hipMemcpyDeviceToDevice',['../group__GlobalDefs.html#gga232e222db36b1fc672ba98054d036a18abd05a09d3105e0ce25b34dd91cf83f88',1,'hip_runtime_api.h']]], + ['hipmemcpydevicetohost',['hipMemcpyDeviceToHost',['../group__GlobalDefs.html#gga232e222db36b1fc672ba98054d036a18aba2505e9ce1e5382f17730bc670917d1',1,'hip_runtime_api.h']]], + ['hipmemcpyhosttodevice',['hipMemcpyHostToDevice',['../group__GlobalDefs.html#gga232e222db36b1fc672ba98054d036a18aff32175ecb0c7113200286eff8211008',1,'hip_runtime_api.h']]], + ['hipmemcpyhosttohost',['hipMemcpyHostToHost',['../group__GlobalDefs.html#gga232e222db36b1fc672ba98054d036a18a9d66b705aa85a9c83f0f533cef70d0af',1,'hip_runtime_api.h']]], + ['hipmemcpykind',['hipMemcpyKind',['../group__GlobalDefs.html#ga232e222db36b1fc672ba98054d036a18',1,'hip_runtime_api.h']]], + ['hipmemcpypeer',['hipMemcpyPeer',['../group__PeerToPeer.html#ga72ae9e7f498ab5684580892a5d7d8e2d',1,'hipMemcpyPeer(void *dst, int dstDevice, const void *src, int srcDevice, size_t sizeBytes): hip_hcc.cpp'],['../group__PeerToPeer.html#ga72ae9e7f498ab5684580892a5d7d8e2d',1,'hipMemcpyPeer(void *dst, int dstDevice, const void *src, int srcDevice, size_t sizeBytes): hip_hcc.cpp']]], + ['hipmemcpypeerasync',['hipMemcpyPeerAsync',['../group__PeerToPeer.html#gab6211c18ca1e23252ef080cd6be855ca',1,'hipMemcpyPeerAsync(void *dst, int dstDevice, const void *src, int srcDevice, size_t sizeBytes, hipStream_t 
stream=0): hip_hcc.cpp'],['../group__PeerToPeer.html#gab6211c18ca1e23252ef080cd6be855ca',1,'hipMemcpyPeerAsync(void *dst, int dstDevice, const void *src, int srcDevice, size_t sizeBytes, hipStream_t stream): hip_hcc.cpp']]], + ['hipmemcpytosymbol',['hipMemcpyToSymbol',['../group__Memory.html#ga131ac5c1ba04e186112491cb9bf964bc',1,'hipMemcpyToSymbol(const char *symbolName, const void *src, size_t sizeBytes, size_t offset, hipMemcpyKind kind): hip_hcc.cpp'],['../group__Memory.html#ga131ac5c1ba04e186112491cb9bf964bc',1,'hipMemcpyToSymbol(const char *symbolName, const void *src, size_t count, size_t offset, hipMemcpyKind kind): hip_hcc.cpp']]], + ['hipmemset',['hipMemset',['../group__Memory.html#gac7441e74affcce4b8b69dba996c5ebc4',1,'hipMemset(void *dst, int value, size_t sizeBytes): hip_hcc.cpp'],['../group__Memory.html#gac7441e74affcce4b8b69dba996c5ebc4',1,'hipMemset(void *dst, int value, size_t sizeBytes): hip_hcc.cpp']]], + ['hipmemsetasync',['hipMemsetAsync',['../group__Memory.html#gaee4ed665ce0a60c661a809c175320a0c',1,'hipMemsetAsync(void *dst, int value, size_t sizeBytes, hipStream_t=0): hip_hcc.cpp'],['../group__Memory.html#gaee4ed665ce0a60c661a809c175320a0c',1,'hipMemsetAsync(void *dst, int value, size_t sizeBytes, hipStream_t stream): hip_hcc.cpp']]], + ['hippeekatlasterror',['hipPeekAtLastError',['../group__Error.html#ga1dd660bc739f7e13edd34615660f0148',1,'hip_runtime_api.h']]], + ['hipreadmodeelementtype',['hipReadModeElementType',['../hip__texture_8h.html#a442e950774f7306dc33692e358c92c94a829645801202174d052d667ffa4e1b8d',1,'hip_texture.h']]], + ['hipsetdevice',['hipSetDevice',['../group__Device.html#ga8ec0b093af0adadc7fe98bf33fa21620',1,'hipSetDevice(int device): hip_hcc.cpp'],['../group__Device.html#ga8ec0b093af0adadc7fe98bf33fa21620',1,'hipSetDevice(int device): hip_hcc.cpp']]], + 
['hipsharedmembanksizedefault',['hipSharedMemBankSizeDefault',['../group__GlobalDefs.html#gga2e17b71d94ac350f2ccd914fd49d104eaf5b325c9b7bde878913f768eaba5014d',1,'hip_runtime_api.h']]], + ['hipsharedmembanksizeeightbyte',['hipSharedMemBankSizeEightByte',['../group__GlobalDefs.html#gga2e17b71d94ac350f2ccd914fd49d104ea64518b4f5a25f536c883330167e79258',1,'hip_runtime_api.h']]], + ['hipsharedmembanksizefourbyte',['hipSharedMemBankSizeFourByte',['../group__GlobalDefs.html#gga2e17b71d94ac350f2ccd914fd49d104ea0a95a6e0c33106c42d66ab9476ff954a',1,'hip_runtime_api.h']]], + ['hipsharedmemconfig',['hipSharedMemConfig',['../group__GlobalDefs.html#ga2e17b71d94ac350f2ccd914fd49d104e',1,'hipSharedMemConfig(): hip_runtime_api.h'],['../group__GlobalDefs.html#ga6b1ca424fa26a5fb718937d662eaee7f',1,'hipSharedMemConfig(): hip_runtime_api.h']]], + ['hipstreamcreatewithflags',['hipStreamCreateWithFlags',['../group__Stream.html#gaf2382e3cc6632332a8983a0f58e43494',1,'hipStreamCreateWithFlags(hipStream_t *stream, unsigned int flags): hip_hcc.cpp'],['../group__Stream.html#gaf2382e3cc6632332a8983a0f58e43494',1,'hipStreamCreateWithFlags(hipStream_t *stream, unsigned int flags): hip_hcc.cpp']]], + ['hipstreamdefault',['hipStreamDefault',['../group__GlobalDefs.html#ga6df5f70eb976836ab3598cacf0ffcdf9',1,'hip_runtime_api.h']]], + ['hipstreamdestroy',['hipStreamDestroy',['../group__Stream.html#ga3076a3499ed2c7821311006100bb95ec',1,'hipStreamDestroy(hipStream_t stream): hip_hcc.cpp'],['../group__Stream.html#ga3076a3499ed2c7821311006100bb95ec',1,'hipStreamDestroy(hipStream_t stream): hip_hcc.cpp']]], + ['hipstreamgetflags',['hipStreamGetFlags',['../group__Stream.html#ga3249555a26439591b8873f70b39bb116',1,'hipStreamGetFlags(hipStream_t stream, unsigned int *flags): hip_hcc.cpp'],['../group__Stream.html#ga3249555a26439591b8873f70b39bb116',1,'hipStreamGetFlags(hipStream_t stream, unsigned int *flags): hip_hcc.cpp']]], + 
['hipstreamnonblocking',['hipStreamNonBlocking',['../group__GlobalDefs.html#gaaba9ae995d9b43b7d1ee70c6fa12c57d',1,'hip_runtime_api.h']]], + ['hipstreamsynchronize',['hipStreamSynchronize',['../group__Stream.html#gabbfb9f573a6ebe8c478605ecb5504a74',1,'hipStreamSynchronize(hipStream_t stream): hip_hcc.cpp'],['../group__Stream.html#gabbfb9f573a6ebe8c478605ecb5504a74',1,'hipStreamSynchronize(hipStream_t stream): hip_hcc.cpp']]], + ['hipstreamwaitevent',['hipStreamWaitEvent',['../group__Stream.html#gacdd84c8f8ef1539c96c57c1d5bcae633',1,'hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int flags): hip_hcc.cpp'],['../group__Stream.html#gacdd84c8f8ef1539c96c57c1d5bcae633',1,'hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int flags): hip_hcc.cpp']]], + ['hipsuccess',['hipSuccess',['../group__GlobalDefs.html#ggadf5010f6e140a53ecbdf949e73e87594aadfbdb847b149723c684ebd764556063',1,'hip_runtime_api.h']]], + ['hiptexturefiltermode',['hipTextureFilterMode',['../hip__texture_8h.html#aa2f0b6002b81d0a43a808cb880bb21e6',1,'hip_texture.h']]], + ['hiptexturereadmode',['hipTextureReadMode',['../hip__texture_8h.html#a442e950774f7306dc33692e358c92c94',1,'hip_texture.h']]], + ['hipthreadidx_5fx',['hipThreadIdx_x',['../hcc__detail_2hip__runtime_8h.html#a48f5f9da77c5fab1fbcf0205bb347d89',1,'hip_runtime.h']]], + ['host_5fdefines_2eh',['host_defines.h',['../host__defines_8h.html',1,'']]], + ['heterogeneous_2dcomputing_20interface_20for_20portability_20_28hip_29',['Heterogeneous-computing Interface for Portability (HIP)',['../index.html',1,'']]] ]; diff --git a/projects/hip/docs/RuntimeAPI/html/search/all_8.js b/projects/hip/docs/RuntimeAPI/html/search/all_8.js index 41a7c59602..69beb95adf 100644 --- a/projects/hip/docs/RuntimeAPI/html/search/all_8.js +++ b/projects/hip/docs/RuntimeAPI/html/search/all_8.js @@ -1,4 +1,6 @@ var searchData= [ - 
['l2cachesize',['l2CacheSize',['../structhipDeviceProp__t.html#a24404decccc16833973c803ced6f3a51',1,'hipDeviceProp_t']]] + ['ihipdevice_5ft',['ihipDevice_t',['../structihipDevice__t.html',1,'']]], + ['ihipevent_5ft',['ihipEvent_t',['../structihipEvent__t.html',1,'']]], + ['ihipstream_5ft',['ihipStream_t',['../structihipStream__t.html',1,'']]] ]; diff --git a/projects/hip/docs/RuntimeAPI/html/search/all_9.js b/projects/hip/docs/RuntimeAPI/html/search/all_9.js index fd7cb073be..41a7c59602 100644 --- a/projects/hip/docs/RuntimeAPI/html/search/all_9.js +++ b/projects/hip/docs/RuntimeAPI/html/search/all_9.js @@ -1,12 +1,4 @@ var searchData= [ - ['major',['major',['../structhipDeviceProp__t.html#aec9e4173c2e34cc232300c415dbd5e4f',1,'hipDeviceProp_t']]], - ['maxgridsize',['maxGridSize',['../structhipDeviceProp__t.html#ae529c23929f592120081fed31d877a55',1,'hipDeviceProp_t']]], - ['maxthreadsdim',['maxThreadsDim',['../structhipDeviceProp__t.html#a8ebba6fc12f80c9a9cf9b9193f0da465',1,'hipDeviceProp_t']]], - ['maxthreadsperblock',['maxThreadsPerBlock',['../structhipDeviceProp__t.html#af971cf1ca3ec1f68ad09036c0cc672e0',1,'hipDeviceProp_t']]], - ['maxthreadspermultiprocessor',['maxThreadsPerMultiProcessor',['../structhipDeviceProp__t.html#a23a39f4fd795addb3b125e9c3f6295ea',1,'hipDeviceProp_t']]], - ['memory_20management',['Memory Management',['../group__Memory.html',1,'']]], - ['minor',['minor',['../structhipDeviceProp__t.html#abb51208e2509a7a1d107f0da69108938',1,'hipDeviceProp_t']]], - ['multiprocessorcount',['multiProcessorCount',['../structhipDeviceProp__t.html#add8d9d2ad52aece9fd1dbe25c18d9d57',1,'hipDeviceProp_t']]], - ['management',['Management',['../group__Version.html',1,'']]] + ['l2cachesize',['l2CacheSize',['../structhipDeviceProp__t.html#a24404decccc16833973c803ced6f3a51',1,'hipDeviceProp_t']]] ]; diff --git a/projects/hip/docs/RuntimeAPI/html/search/all_a.js b/projects/hip/docs/RuntimeAPI/html/search/all_a.js index 124bf0ddb8..ea8ffca597 100644 --- 
a/projects/hip/docs/RuntimeAPI/html/search/all_a.js +++ b/projects/hip/docs/RuntimeAPI/html/search/all_a.js @@ -1,4 +1,14 @@ var searchData= [ - ['name',['name',['../structhipDeviceProp__t.html#a5b44bf8fa46faefcde989942b1d11a5e',1,'hipDeviceProp_t']]] + ['major',['major',['../structhipDeviceProp__t.html#aec9e4173c2e34cc232300c415dbd5e4f',1,'hipDeviceProp_t']]], + ['maxgridsize',['maxGridSize',['../structhipDeviceProp__t.html#ae529c23929f592120081fed31d877a55',1,'hipDeviceProp_t']]], + ['maxsharedmemorypermultiprocessor',['maxSharedMemoryPerMultiProcessor',['../structhipDeviceProp__t.html#aa1a32a7f387f6da845db7b228711fce8',1,'hipDeviceProp_t']]], + ['maxthreadsdim',['maxThreadsDim',['../structhipDeviceProp__t.html#a8ebba6fc12f80c9a9cf9b9193f0da465',1,'hipDeviceProp_t']]], + ['maxthreadsperblock',['maxThreadsPerBlock',['../structhipDeviceProp__t.html#af971cf1ca3ec1f68ad09036c0cc672e0',1,'hipDeviceProp_t']]], + ['maxthreadspermultiprocessor',['maxThreadsPerMultiProcessor',['../structhipDeviceProp__t.html#a23a39f4fd795addb3b125e9c3f6295ea',1,'hipDeviceProp_t']]], + ['memory_20management',['Memory Management',['../group__Memory.html',1,'']]], + ['memoryclockrate',['memoryClockRate',['../structhipDeviceProp__t.html#a6db0ab8e7e8cc13c84d7bb7f70226d5e',1,'hipDeviceProp_t']]], + ['minor',['minor',['../structhipDeviceProp__t.html#abb51208e2509a7a1d107f0da69108938',1,'hipDeviceProp_t']]], + ['multiprocessorcount',['multiProcessorCount',['../structhipDeviceProp__t.html#add8d9d2ad52aece9fd1dbe25c18d9d57',1,'hipDeviceProp_t']]], + ['management',['Management',['../group__Version.html',1,'']]] ]; diff --git a/projects/hip/docs/RuntimeAPI/html/search/all_b.js b/projects/hip/docs/RuntimeAPI/html/search/all_b.js index 44ba50e0b7..124bf0ddb8 100644 --- a/projects/hip/docs/RuntimeAPI/html/search/all_b.js +++ b/projects/hip/docs/RuntimeAPI/html/search/all_b.js @@ -1,4 +1,4 @@ var searchData= [ - 
['regsperblock',['regsPerBlock',['../structhipDeviceProp__t.html#a73c1c21648a901799ff6bef83c11135b',1,'hipDeviceProp_t']]] + ['name',['name',['../structhipDeviceProp__t.html#a5b44bf8fa46faefcde989942b1d11a5e',1,'hipDeviceProp_t']]] ]; diff --git a/projects/hip/docs/RuntimeAPI/html/search/all_c.js b/projects/hip/docs/RuntimeAPI/html/search/all_c.js index 559f8252a3..3eaae3688b 100644 --- a/projects/hip/docs/RuntimeAPI/html/search/all_c.js +++ b/projects/hip/docs/RuntimeAPI/html/search/all_c.js @@ -1,5 +1,4 @@ var searchData= [ - ['sharedmemperblock',['sharedMemPerBlock',['../structhipDeviceProp__t.html#a3b9138678a0795c2677eddcfb1c67156',1,'hipDeviceProp_t']]], - ['stream_20management',['Stream Management',['../group__Stream.html',1,'']]] + ['one_5fcomponent_5faccess',['ONE_COMPONENT_ACCESS',['../hcc__detail_2hip__vector__types_8h.html#add5d9c0f058c5a52c2b9165a66035d0e',1,'hip_vector_types.h']]] ]; diff --git a/projects/hip/docs/RuntimeAPI/html/search/all_d.js b/projects/hip/docs/RuntimeAPI/html/search/all_d.js index 9043ae6945..71b6a5df56 100644 --- a/projects/hip/docs/RuntimeAPI/html/search/all_d.js +++ b/projects/hip/docs/RuntimeAPI/html/search/all_d.js @@ -1,7 +1,5 @@ var searchData= [ - ['texture',['texture',['../structtexture.html',1,'texture< T, texType, hipTextureReadMode >'],['../group__Texture.html',1,'(Global Namespace)']]], - ['texturereference',['textureReference',['../structtextureReference.html',1,'']]], - ['totalconstmem',['totalConstMem',['../structhipDeviceProp__t.html#a29880232c56120be3455ce00d5379665',1,'hipDeviceProp_t']]], - ['totalglobalmem',['totalGlobalMem',['../structhipDeviceProp__t.html#acedd6a2d23423441e4bf51c4a1b719f9',1,'hipDeviceProp_t']]] + ['pcibusid',['pciBusID',['../structhipDeviceProp__t.html#a1350f64d49b717ed3a06458f7549ccb0',1,'hipDeviceProp_t']]], + ['pcideviceid',['pciDeviceID',['../structhipDeviceProp__t.html#ae6aa845dc2d540f85098ea30be35f4eb',1,'hipDeviceProp_t']]] ]; diff --git 
a/projects/hip/docs/RuntimeAPI/html/search/all_e.js b/projects/hip/docs/RuntimeAPI/html/search/all_e.js index 46a1400a7b..44ba50e0b7 100644 --- a/projects/hip/docs/RuntimeAPI/html/search/all_e.js +++ b/projects/hip/docs/RuntimeAPI/html/search/all_e.js @@ -1,4 +1,4 @@ var searchData= [ - ['warpsize',['warpSize',['../structhipDeviceProp__t.html#af3357d33c004608bf05bc21a352be81b',1,'hipDeviceProp_t']]] + ['regsperblock',['regsPerBlock',['../structhipDeviceProp__t.html#a73c1c21648a901799ff6bef83c11135b',1,'hipDeviceProp_t']]] ]; diff --git a/projects/hip/docs/RuntimeAPI/html/search/all_f.js b/projects/hip/docs/RuntimeAPI/html/search/all_f.js index 250c203caf..559f8252a3 100644 --- a/projects/hip/docs/RuntimeAPI/html/search/all_f.js +++ b/projects/hip/docs/RuntimeAPI/html/search/all_f.js @@ -1,4 +1,5 @@ var searchData= [ - ['x',['x',['../structdim3.html#ac866c05f83a28dac20a153fc65b3b16c',1,'dim3']]] + ['sharedmemperblock',['sharedMemPerBlock',['../structhipDeviceProp__t.html#a3b9138678a0795c2677eddcfb1c67156',1,'hipDeviceProp_t']]], + ['stream_20management',['Stream Management',['../group__Stream.html',1,'']]] ]; diff --git a/projects/hip/docs/RuntimeAPI/html/search/defines_0.js b/projects/hip/docs/RuntimeAPI/html/search/defines_0.js index c7c61558ca..00fe08ccff 100644 --- a/projects/hip/docs/RuntimeAPI/html/search/defines_0.js +++ b/projects/hip/docs/RuntimeAPI/html/search/defines_0.js @@ -1,4 +1,4 @@ var searchData= [ - ['hipthreadidx_5fx',['hipThreadIdx_x',['../hcc__detail_2hip__runtime_8h.html#a48f5f9da77c5fab1fbcf0205bb347d89',1,'hip_runtime.h']]] + ['_5f_5fhost_5f_5f',['__host__',['../host__defines_8h.html#a803050db3c78e0db3ea59a0c35499622',1,'host_defines.h']]] ]; diff --git a/projects/hip/docs/RuntimeAPI/html/search/enums_0.js b/projects/hip/docs/RuntimeAPI/html/search/enums_0.js index c47574066e..8258fd4f05 100644 --- a/projects/hip/docs/RuntimeAPI/html/search/enums_0.js +++ b/projects/hip/docs/RuntimeAPI/html/search/enums_0.js @@ -1,7 +1,10 @@ var searchData= 
[ + ['hipdeviceattribute_5ft',['hipDeviceAttribute_t',['../group__GlobalDefs.html#gacc0acd7b9bda126c6bb3dfd6e2796d7c',1,'hip_runtime_api.h']]], ['hiperror_5ft',['hipError_t',['../group__GlobalDefs.html#gadf5010f6e140a53ecbdf949e73e87594',1,'hip_runtime_api.h']]], ['hipfunccache',['hipFuncCache',['../group__GlobalDefs.html#gac7e4bfd88340fc06642136c839a3d822',1,'hip_runtime_api.h']]], ['hipmemcpykind',['hipMemcpyKind',['../group__GlobalDefs.html#ga232e222db36b1fc672ba98054d036a18',1,'hip_runtime_api.h']]], - ['hipsharedmemconfig',['hipSharedMemConfig',['../group__GlobalDefs.html#ga2e17b71d94ac350f2ccd914fd49d104e',1,'hip_runtime_api.h']]] + ['hipsharedmemconfig',['hipSharedMemConfig',['../group__GlobalDefs.html#ga2e17b71d94ac350f2ccd914fd49d104e',1,'hip_runtime_api.h']]], + ['hiptexturefiltermode',['hipTextureFilterMode',['../hip__texture_8h.html#aa2f0b6002b81d0a43a808cb880bb21e6',1,'hip_texture.h']]], + ['hiptexturereadmode',['hipTextureReadMode',['../hip__texture_8h.html#a442e950774f7306dc33692e358c92c94',1,'hip_texture.h']]] ]; diff --git a/projects/hip/docs/RuntimeAPI/html/search/enumvalues_0.js b/projects/hip/docs/RuntimeAPI/html/search/enumvalues_0.js index bb82e15f9b..5fadbd65af 100644 --- a/projects/hip/docs/RuntimeAPI/html/search/enumvalues_0.js +++ b/projects/hip/docs/RuntimeAPI/html/search/enumvalues_0.js @@ -1,5 +1,28 @@ var searchData= [ + ['hipdeviceattributeclockrate',['hipDeviceAttributeClockRate',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7ca2300e077e020e7967592065561373b00',1,'hip_runtime_api.h']]], + ['hipdeviceattributecomputecapabilitymajor',['hipDeviceAttributeComputeCapabilityMajor',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7ca2735739cf977b7d303266f6781131e8d',1,'hip_runtime_api.h']]], + ['hipdeviceattributecomputecapabilityminor',['hipDeviceAttributeComputeCapabilityMinor',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7ca38edc4fcae456e47160d349da3249b85',1,'hip_runtime_api.h']]], + 
['hipdeviceattributecomputemode',['hipDeviceAttributeComputeMode',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7ca4d0369a6ef7bd7890fdcabc16ed3385d',1,'hip_runtime_api.h']]], + ['hipdeviceattributeconcurrentkernels',['hipDeviceAttributeConcurrentKernels',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7cad9f45254d0d048677f560032532d5504',1,'hip_runtime_api.h']]], + ['hipdeviceattributel2cachesize',['hipDeviceAttributeL2CacheSize',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7ca582ae5a26a7148504878890028e4b64c',1,'hip_runtime_api.h']]], + ['hipdeviceattributemaxblockdimx',['hipDeviceAttributeMaxBlockDimX',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7cac1e4ac589db0d8adbbc241e3d0fcd594',1,'hip_runtime_api.h']]], + ['hipdeviceattributemaxblockdimy',['hipDeviceAttributeMaxBlockDimY',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7ca187dbffe12db09a56c0f75c340d879c9',1,'hip_runtime_api.h']]], + ['hipdeviceattributemaxblockdimz',['hipDeviceAttributeMaxBlockDimZ',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7caf811f51e03d1ffb025d80ac1da088675',1,'hip_runtime_api.h']]], + ['hipdeviceattributemaxgriddimx',['hipDeviceAttributeMaxGridDimX',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7ca03db8df0e7a9fbdaae683d97e8ac9c87',1,'hip_runtime_api.h']]], + ['hipdeviceattributemaxgriddimy',['hipDeviceAttributeMaxGridDimY',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7ca5b5cc49972679c5ccf62b79425ee99df',1,'hip_runtime_api.h']]], + ['hipdeviceattributemaxgriddimz',['hipDeviceAttributeMaxGridDimZ',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7ca6c206ac083999caf4640e5d91dae24f7',1,'hip_runtime_api.h']]], + ['hipdeviceattributemaxregistersperblock',['hipDeviceAttributeMaxRegistersPerBlock',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7ca82289b170192b6ea742be0efc6f95107',1,'hip_runtime_api.h']]], + 
['hipdeviceattributemaxsharedmemoryperblock',['hipDeviceAttributeMaxSharedMemoryPerBlock',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7ca7bca3aa18b26d40eba043ae93e15c7e5',1,'hip_runtime_api.h']]], + ['hipdeviceattributemaxsharedmemorypermultiprocessor',['hipDeviceAttributeMaxSharedMemoryPerMultiprocessor',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7cad3e7f3d01533b32e12211172fcf410ba',1,'hip_runtime_api.h']]], + ['hipdeviceattributemaxthreadsperblock',['hipDeviceAttributeMaxThreadsPerBlock',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7ca8327aa23782d9c994bdef33a6d62e02e',1,'hip_runtime_api.h']]], + ['hipdeviceattributemaxthreadspermultiprocessor',['hipDeviceAttributeMaxThreadsPerMultiProcessor',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7caddc08922b491eb1f6a583833cbf4e2f0',1,'hip_runtime_api.h']]], + ['hipdeviceattributememoryclockrate',['hipDeviceAttributeMemoryClockRate',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7ca6b68deafd65f036b30dc8051573eb000',1,'hip_runtime_api.h']]], + ['hipdeviceattributemultiprocessorcount',['hipDeviceAttributeMultiprocessorCount',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7ca5c1519870733ccf0b83f722678240e5f',1,'hip_runtime_api.h']]], + ['hipdeviceattributepcibusid',['hipDeviceAttributePciBusId',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7ca572b29c44f1322aa7657fdd784832f88',1,'hip_runtime_api.h']]], + ['hipdeviceattributepcideviceid',['hipDeviceAttributePciDeviceId',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7ca955d90286e87be9e3528f0b817ab32ff',1,'hip_runtime_api.h']]], + ['hipdeviceattributetotalconstantmemory',['hipDeviceAttributeTotalConstantMemory',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7cac6089ac3a0f9c77cc382fb0eaa73ae9c',1,'hip_runtime_api.h']]], + 
['hipdeviceattributewarpsize',['hipDeviceAttributeWarpSize',['../group__GlobalDefs.html#ggacc0acd7b9bda126c6bb3dfd6e2796d7caffd94133e823247a6f1215343232f6ec',1,'hip_runtime_api.h']]], ['hiperrorinvaliddevice',['hipErrorInvalidDevice',['../group__GlobalDefs.html#ggadf5010f6e140a53ecbdf949e73e87594a07ab9b704ea693c1781a52741c60cd0d',1,'hip_runtime_api.h']]], ['hiperrorinvalidresourcehandle',['hipErrorInvalidResourceHandle',['../group__GlobalDefs.html#ggadf5010f6e140a53ecbdf949e73e87594a88e525a7c8f35552dfada58e9f2f6d3a',1,'hip_runtime_api.h']]], ['hiperrorinvalidvalue',['hipErrorInvalidValue',['../group__GlobalDefs.html#ggadf5010f6e140a53ecbdf949e73e87594a1e8215fe1108a508bad3944bce7b4d83',1,'hip_runtime_api.h']]], @@ -11,6 +34,7 @@ var searchData= ['hiperrortbd',['hipErrorTbd',['../group__GlobalDefs.html#ggadf5010f6e140a53ecbdf949e73e87594ab556409e11ddb0c4cf77a2f4fc91ea9e',1,'hip_runtime_api.h']]], ['hiperrorunknown',['hipErrorUnknown',['../group__GlobalDefs.html#ggadf5010f6e140a53ecbdf949e73e87594aa74e64c5b2f5fb0d6a92681f5b234073',1,'hip_runtime_api.h']]], ['hiperrorunknownsymbol',['hipErrorUnknownSymbol',['../group__GlobalDefs.html#ggadf5010f6e140a53ecbdf949e73e87594a45b297e6c3b2029dce1348658421481b',1,'hip_runtime_api.h']]], + ['hipfiltermodepoint',['hipFilterModePoint',['../hip__texture_8h.html#aa2f0b6002b81d0a43a808cb880bb21e6a56ede038ab7c805ec4b5b61d2b678dfc',1,'hip_texture.h']]], ['hipfunccachepreferequal',['hipFuncCachePreferEqual',['../group__GlobalDefs.html#ggac7e4bfd88340fc06642136c839a3d822a0ddab0e840107634a152033103be44d7',1,'hip_runtime_api.h']]], ['hipfunccachepreferl1',['hipFuncCachePreferL1',['../group__GlobalDefs.html#ggac7e4bfd88340fc06642136c839a3d822a636a3c140db6b9d4a8bf7d5a61c398c5',1,'hip_runtime_api.h']]], ['hipfunccacheprefernone',['hipFuncCachePreferNone',['../group__GlobalDefs.html#ggac7e4bfd88340fc06642136c839a3d822a0813fbaa008ce1231ff9fed3911eb3af',1,'hip_runtime_api.h']]], @@ -20,6 +44,7 @@ var searchData= 
['hipmemcpydevicetohost',['hipMemcpyDeviceToHost',['../group__GlobalDefs.html#gga232e222db36b1fc672ba98054d036a18aba2505e9ce1e5382f17730bc670917d1',1,'hip_runtime_api.h']]], ['hipmemcpyhosttodevice',['hipMemcpyHostToDevice',['../group__GlobalDefs.html#gga232e222db36b1fc672ba98054d036a18aff32175ecb0c7113200286eff8211008',1,'hip_runtime_api.h']]], ['hipmemcpyhosttohost',['hipMemcpyHostToHost',['../group__GlobalDefs.html#gga232e222db36b1fc672ba98054d036a18a9d66b705aa85a9c83f0f533cef70d0af',1,'hip_runtime_api.h']]], + ['hipreadmodeelementtype',['hipReadModeElementType',['../hip__texture_8h.html#a442e950774f7306dc33692e358c92c94a829645801202174d052d667ffa4e1b8d',1,'hip_texture.h']]], ['hipsharedmembanksizedefault',['hipSharedMemBankSizeDefault',['../group__GlobalDefs.html#gga2e17b71d94ac350f2ccd914fd49d104eaf5b325c9b7bde878913f768eaba5014d',1,'hip_runtime_api.h']]], ['hipsharedmembanksizeeightbyte',['hipSharedMemBankSizeEightByte',['../group__GlobalDefs.html#gga2e17b71d94ac350f2ccd914fd49d104ea64518b4f5a25f536c883330167e79258',1,'hip_runtime_api.h']]], ['hipsharedmembanksizefourbyte',['hipSharedMemBankSizeFourByte',['../group__GlobalDefs.html#gga2e17b71d94ac350f2ccd914fd49d104ea0a95a6e0c33106c42d66ab9476ff954a',1,'hip_runtime_api.h']]], diff --git a/projects/hip/docs/RuntimeAPI/html/search/files_0.js b/projects/hip/docs/RuntimeAPI/html/search/files_0.js index c6c9f7ce28..c60cd7e29f 100644 --- a/projects/hip/docs/RuntimeAPI/html/search/files_0.js +++ b/projects/hip/docs/RuntimeAPI/html/search/files_0.js @@ -1,5 +1,9 @@ var searchData= [ ['hip_5fhcc_2ecpp',['hip_hcc.cpp',['../hip__hcc_8cpp.html',1,'']]], - ['hip_5fruntime_2eh',['hip_runtime.h',['../hcc__detail_2hip__runtime_8h.html',1,'']]] + ['hip_5fruntime_2eh',['hip_runtime.h',['../hcc__detail_2hip__runtime_8h.html',1,'']]], + ['hip_5fruntime_5fapi_2eh',['hip_runtime_api.h',['../hcc__detail_2hip__runtime__api_8h.html',1,'']]], + ['hip_5ftexture_2eh',['hip_texture.h',['../hip__texture_8h.html',1,'']]], + 
['hip_5fvector_5ftypes_2eh',['hip_vector_types.h',['../hcc__detail_2hip__vector__types_8h.html',1,'']]], + ['host_5fdefines_2eh',['host_defines.h',['../host__defines_8h.html',1,'']]] ]; diff --git a/projects/hip/docs/RuntimeAPI/html/search/functions_0.js b/projects/hip/docs/RuntimeAPI/html/search/functions_0.js index 4b4091e9d3..9e6f4be60d 100644 --- a/projects/hip/docs/RuntimeAPI/html/search/functions_0.js +++ b/projects/hip/docs/RuntimeAPI/html/search/functions_0.js @@ -3,6 +3,7 @@ var searchData= ['hipdevicecanaccesspeer',['hipDeviceCanAccessPeer',['../group__PeerToPeer.html#gab53a55dbc087ff659918fd04287de3d3',1,'hipDeviceCanAccessPeer(int *canAccessPeer, int device, int peerDevice): hip_hcc.cpp'],['../group__PeerToPeer.html#gab53a55dbc087ff659918fd04287de3d3',1,'hipDeviceCanAccessPeer(int *canAccessPeer, int device, int peerDevice): hip_hcc.cpp']]], ['hipdevicedisablepeeraccess',['hipDeviceDisablePeerAccess',['../group__PeerToPeer.html#ga41e60c01f63597529da1cd77bdd55379',1,'hipDeviceDisablePeerAccess(int peerDevice): hip_hcc.cpp'],['../group__PeerToPeer.html#ga41e60c01f63597529da1cd77bdd55379',1,'hipDeviceDisablePeerAccess(int peerDevice): hip_hcc.cpp']]], ['hipdeviceenablepeeraccess',['hipDeviceEnablePeerAccess',['../group__PeerToPeer.html#ga098e0d626edbfb69b66d141a5a8b7dc6',1,'hipDeviceEnablePeerAccess(int peerDevice, unsigned int flags): hip_hcc.cpp'],['../group__PeerToPeer.html#ga098e0d626edbfb69b66d141a5a8b7dc6',1,'hipDeviceEnablePeerAccess(int peerDevice, unsigned int flags): hip_hcc.cpp']]], + ['hipdevicegetattribute',['hipDeviceGetAttribute',['../group__Device.html#gac49518ff2b26b98ea2ec9e9268761a24',1,'hipDeviceGetAttribute(int *pi, hipDeviceAttribute_t attr, int device): hip_hcc.cpp'],['../group__Device.html#gac49518ff2b26b98ea2ec9e9268761a24',1,'hipDeviceGetAttribute(int *pi, hipDeviceAttribute_t attr, int device): hip_hcc.cpp']]], 
['hipdevicegetcacheconfig',['hipDeviceGetCacheConfig',['../group__Device.html#gaeeffa2456c5430400bea75ecd6ad1e68',1,'hipDeviceGetCacheConfig(hipFuncCache *cacheConfig): hip_hcc.cpp'],['../group__Device.html#gaeeffa2456c5430400bea75ecd6ad1e68',1,'hipDeviceGetCacheConfig(hipFuncCache *cacheConfig): hip_hcc.cpp']]], ['hipdevicegetproperties',['hipDeviceGetProperties',['../group__Device.html#gad9ee6822e3e55431811fb6a00f7a1c10',1,'hipDeviceGetProperties(hipDeviceProp_t *prop, int device): hip_hcc.cpp'],['../group__Device.html#gad9ee6822e3e55431811fb6a00f7a1c10',1,'hipDeviceGetProperties(hipDeviceProp_t *props, int device): hip_hcc.cpp']]], ['hipdevicegetsharedmemconfig',['hipDeviceGetSharedMemConfig',['../group__Device.html#ga1bb08f774a34a468d969a8a04791c9bb',1,'hipDeviceGetSharedMemConfig(hipSharedMemConfig *pConfig): hip_hcc.cpp'],['../group__Device.html#ga1bb08f774a34a468d969a8a04791c9bb',1,'hipDeviceGetSharedMemConfig(hipSharedMemConfig *pConfig): hip_hcc.cpp']]], @@ -31,7 +32,11 @@ var searchData= ['hipmallochost',['hipMallocHost',['../group__Memory.html#ga66399e729223ff5b66ffc16297c0710e',1,'hipMallocHost(void **ptr, size_t size): hip_hcc.cpp'],['../group__Memory.html#ga66399e729223ff5b66ffc16297c0710e',1,'hipMallocHost(void **ptr, size_t sizeBytes): hip_hcc.cpp']]], ['hipmemcpy',['hipMemcpy',['../group__Memory.html#gac1a055d288302edd641c6d7416858e1e',1,'hipMemcpy(void *dst, const void *src, size_t sizeBytes, hipMemcpyKind kind): hip_hcc.cpp'],['../group__Memory.html#gac1a055d288302edd641c6d7416858e1e',1,'hipMemcpy(void *dst, const void *src, size_t sizeBytes, hipMemcpyKind kind): hip_hcc.cpp']]], ['hipmemcpyasync',['hipMemcpyAsync',['../group__Memory.html#ga8ad5a0b13458917e1b9437732b21af54',1,'hipMemcpyAsync(void *dst, const void *src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream=0): hip_hcc.cpp'],['../group__Memory.html#ga8ad5a0b13458917e1b9437732b21af54',1,'hipMemcpyAsync(void *dst, const void *src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t 
stream): hip_hcc.cpp']]], + ['hipmemcpypeer',['hipMemcpyPeer',['../group__PeerToPeer.html#ga72ae9e7f498ab5684580892a5d7d8e2d',1,'hipMemcpyPeer(void *dst, int dstDevice, const void *src, int srcDevice, size_t sizeBytes): hip_hcc.cpp'],['../group__PeerToPeer.html#ga72ae9e7f498ab5684580892a5d7d8e2d',1,'hipMemcpyPeer(void *dst, int dstDevice, const void *src, int srcDevice, size_t sizeBytes): hip_hcc.cpp']]], ['hipmemcpypeerasync',['hipMemcpyPeerAsync',['../group__PeerToPeer.html#gab6211c18ca1e23252ef080cd6be855ca',1,'hipMemcpyPeerAsync(void *dst, int dstDevice, const void *src, int srcDevice, size_t sizeBytes, hipStream_t stream=0): hip_hcc.cpp'],['../group__PeerToPeer.html#gab6211c18ca1e23252ef080cd6be855ca',1,'hipMemcpyPeerAsync(void *dst, int dstDevice, const void *src, int srcDevice, size_t sizeBytes, hipStream_t stream): hip_hcc.cpp']]], + ['hipmemcpytosymbol',['hipMemcpyToSymbol',['../group__Memory.html#ga131ac5c1ba04e186112491cb9bf964bc',1,'hipMemcpyToSymbol(const char *symbolName, const void *src, size_t sizeBytes, size_t offset, hipMemcpyKind kind): hip_hcc.cpp'],['../group__Memory.html#ga131ac5c1ba04e186112491cb9bf964bc',1,'hipMemcpyToSymbol(const char *symbolName, const void *src, size_t count, size_t offset, hipMemcpyKind kind): hip_hcc.cpp']]], + ['hipmemset',['hipMemset',['../group__Memory.html#gac7441e74affcce4b8b69dba996c5ebc4',1,'hipMemset(void *dst, int value, size_t sizeBytes): hip_hcc.cpp'],['../group__Memory.html#gac7441e74affcce4b8b69dba996c5ebc4',1,'hipMemset(void *dst, int value, size_t sizeBytes): hip_hcc.cpp']]], + ['hipmemsetasync',['hipMemsetAsync',['../group__Memory.html#gaee4ed665ce0a60c661a809c175320a0c',1,'hipMemsetAsync(void *dst, int value, size_t sizeBytes, hipStream_t=0): hip_hcc.cpp'],['../group__Memory.html#gaee4ed665ce0a60c661a809c175320a0c',1,'hipMemsetAsync(void *dst, int value, size_t sizeBytes, hipStream_t stream): hip_hcc.cpp']]], 
['hippeekatlasterror',['hipPeekAtLastError',['../group__Error.html#ga1dd660bc739f7e13edd34615660f0148',1,'hip_runtime_api.h']]], ['hipsetdevice',['hipSetDevice',['../group__Device.html#ga8ec0b093af0adadc7fe98bf33fa21620',1,'hipSetDevice(int device): hip_hcc.cpp'],['../group__Device.html#ga8ec0b093af0adadc7fe98bf33fa21620',1,'hipSetDevice(int device): hip_hcc.cpp']]], ['hipstreamcreatewithflags',['hipStreamCreateWithFlags',['../group__Stream.html#gaf2382e3cc6632332a8983a0f58e43494',1,'hipStreamCreateWithFlags(hipStream_t *stream, unsigned int flags): hip_hcc.cpp'],['../group__Stream.html#gaf2382e3cc6632332a8983a0f58e43494',1,'hipStreamCreateWithFlags(hipStream_t *stream, unsigned int flags): hip_hcc.cpp']]], diff --git a/projects/hip/docs/RuntimeAPI/html/search/search.js b/projects/hip/docs/RuntimeAPI/html/search/search.js index ef361551ef..57684da009 100644 --- a/projects/hip/docs/RuntimeAPI/html/search/search.js +++ b/projects/hip/docs/RuntimeAPI/html/search/search.js @@ -7,15 +7,15 @@ var indexSectionsWithContent = { - 0: "abcdeghilmnrstwxyz", + 0: "_abcdeghilmnoprstwxyz", 1: "dhit", 2: "h", 3: "h", - 4: "achlmnrstwxyz", + 4: "achlmnprstwxyz", 5: "dh", 6: "h", 7: "h", - 8: "h", + 8: "_ho", 9: "cdeghmst", 10: "bh" }; diff --git a/projects/hip/docs/RuntimeAPI/html/search/variables_1.js b/projects/hip/docs/RuntimeAPI/html/search/variables_1.js index 1ba67673c4..06748f6da1 100644 --- a/projects/hip/docs/RuntimeAPI/html/search/variables_1.js +++ b/projects/hip/docs/RuntimeAPI/html/search/variables_1.js @@ -2,5 +2,6 @@ var searchData= [ ['clockinstructionrate',['clockInstructionRate',['../structhipDeviceProp__t.html#a6fbf3b08a1a08ae700f1a06265f6666b',1,'hipDeviceProp_t']]], ['clockrate',['clockRate',['../structhipDeviceProp__t.html#a1dd15bee43692b8649dfbdc1adbaaf96',1,'hipDeviceProp_t']]], - ['computemode',['computeMode',['../structhipDeviceProp__t.html#ae7d9216f8583a703359d0b9373823f5d',1,'hipDeviceProp_t']]] + 
['computemode',['computeMode',['../structhipDeviceProp__t.html#ae7d9216f8583a703359d0b9373823f5d',1,'hipDeviceProp_t']]], + ['concurrentkernels',['concurrentKernels',['../structhipDeviceProp__t.html#ad8461a28caf9c38c58cf358583b5bee3',1,'hipDeviceProp_t']]] ]; diff --git a/projects/hip/docs/RuntimeAPI/html/search/variables_4.js b/projects/hip/docs/RuntimeAPI/html/search/variables_4.js index 037341d156..12f7d72a4a 100644 --- a/projects/hip/docs/RuntimeAPI/html/search/variables_4.js +++ b/projects/hip/docs/RuntimeAPI/html/search/variables_4.js @@ -2,9 +2,11 @@ var searchData= [ ['major',['major',['../structhipDeviceProp__t.html#aec9e4173c2e34cc232300c415dbd5e4f',1,'hipDeviceProp_t']]], ['maxgridsize',['maxGridSize',['../structhipDeviceProp__t.html#ae529c23929f592120081fed31d877a55',1,'hipDeviceProp_t']]], + ['maxsharedmemorypermultiprocessor',['maxSharedMemoryPerMultiProcessor',['../structhipDeviceProp__t.html#aa1a32a7f387f6da845db7b228711fce8',1,'hipDeviceProp_t']]], ['maxthreadsdim',['maxThreadsDim',['../structhipDeviceProp__t.html#a8ebba6fc12f80c9a9cf9b9193f0da465',1,'hipDeviceProp_t']]], ['maxthreadsperblock',['maxThreadsPerBlock',['../structhipDeviceProp__t.html#af971cf1ca3ec1f68ad09036c0cc672e0',1,'hipDeviceProp_t']]], ['maxthreadspermultiprocessor',['maxThreadsPerMultiProcessor',['../structhipDeviceProp__t.html#a23a39f4fd795addb3b125e9c3f6295ea',1,'hipDeviceProp_t']]], + ['memoryclockrate',['memoryClockRate',['../structhipDeviceProp__t.html#a6db0ab8e7e8cc13c84d7bb7f70226d5e',1,'hipDeviceProp_t']]], ['minor',['minor',['../structhipDeviceProp__t.html#abb51208e2509a7a1d107f0da69108938',1,'hipDeviceProp_t']]], ['multiprocessorcount',['multiProcessorCount',['../structhipDeviceProp__t.html#add8d9d2ad52aece9fd1dbe25c18d9d57',1,'hipDeviceProp_t']]] ]; diff --git a/projects/hip/docs/RuntimeAPI/html/search/variables_6.js b/projects/hip/docs/RuntimeAPI/html/search/variables_6.js index 44ba50e0b7..71b6a5df56 100644 --- 
a/projects/hip/docs/RuntimeAPI/html/search/variables_6.js +++ b/projects/hip/docs/RuntimeAPI/html/search/variables_6.js @@ -1,4 +1,5 @@ var searchData= [ - ['regsperblock',['regsPerBlock',['../structhipDeviceProp__t.html#a73c1c21648a901799ff6bef83c11135b',1,'hipDeviceProp_t']]] + ['pcibusid',['pciBusID',['../structhipDeviceProp__t.html#a1350f64d49b717ed3a06458f7549ccb0',1,'hipDeviceProp_t']]], + ['pcideviceid',['pciDeviceID',['../structhipDeviceProp__t.html#ae6aa845dc2d540f85098ea30be35f4eb',1,'hipDeviceProp_t']]] ]; diff --git a/projects/hip/docs/RuntimeAPI/html/search/variables_7.js b/projects/hip/docs/RuntimeAPI/html/search/variables_7.js index a582d60c6a..44ba50e0b7 100644 --- a/projects/hip/docs/RuntimeAPI/html/search/variables_7.js +++ b/projects/hip/docs/RuntimeAPI/html/search/variables_7.js @@ -1,4 +1,4 @@ var searchData= [ - ['sharedmemperblock',['sharedMemPerBlock',['../structhipDeviceProp__t.html#a3b9138678a0795c2677eddcfb1c67156',1,'hipDeviceProp_t']]] + ['regsperblock',['regsPerBlock',['../structhipDeviceProp__t.html#a73c1c21648a901799ff6bef83c11135b',1,'hipDeviceProp_t']]] ]; diff --git a/projects/hip/docs/RuntimeAPI/html/search/variables_8.js b/projects/hip/docs/RuntimeAPI/html/search/variables_8.js index b356d6e142..a582d60c6a 100644 --- a/projects/hip/docs/RuntimeAPI/html/search/variables_8.js +++ b/projects/hip/docs/RuntimeAPI/html/search/variables_8.js @@ -1,5 +1,4 @@ var searchData= [ - ['totalconstmem',['totalConstMem',['../structhipDeviceProp__t.html#a29880232c56120be3455ce00d5379665',1,'hipDeviceProp_t']]], - ['totalglobalmem',['totalGlobalMem',['../structhipDeviceProp__t.html#acedd6a2d23423441e4bf51c4a1b719f9',1,'hipDeviceProp_t']]] + ['sharedmemperblock',['sharedMemPerBlock',['../structhipDeviceProp__t.html#a3b9138678a0795c2677eddcfb1c67156',1,'hipDeviceProp_t']]] ]; diff --git a/projects/hip/docs/RuntimeAPI/html/search/variables_9.js b/projects/hip/docs/RuntimeAPI/html/search/variables_9.js index 46a1400a7b..b356d6e142 100644 --- 
a/projects/hip/docs/RuntimeAPI/html/search/variables_9.js +++ b/projects/hip/docs/RuntimeAPI/html/search/variables_9.js @@ -1,4 +1,5 @@ var searchData= [ - ['warpsize',['warpSize',['../structhipDeviceProp__t.html#af3357d33c004608bf05bc21a352be81b',1,'hipDeviceProp_t']]] + ['totalconstmem',['totalConstMem',['../structhipDeviceProp__t.html#a29880232c56120be3455ce00d5379665',1,'hipDeviceProp_t']]], + ['totalglobalmem',['totalGlobalMem',['../structhipDeviceProp__t.html#acedd6a2d23423441e4bf51c4a1b719f9',1,'hipDeviceProp_t']]] ]; diff --git a/projects/hip/docs/RuntimeAPI/html/search/variables_a.js b/projects/hip/docs/RuntimeAPI/html/search/variables_a.js index 250c203caf..46a1400a7b 100644 --- a/projects/hip/docs/RuntimeAPI/html/search/variables_a.js +++ b/projects/hip/docs/RuntimeAPI/html/search/variables_a.js @@ -1,4 +1,4 @@ var searchData= [ - ['x',['x',['../structdim3.html#ac866c05f83a28dac20a153fc65b3b16c',1,'dim3']]] + ['warpsize',['warpSize',['../structhipDeviceProp__t.html#af3357d33c004608bf05bc21a352be81b',1,'hipDeviceProp_t']]] ]; diff --git a/projects/hip/docs/RuntimeAPI/html/search/variables_b.js b/projects/hip/docs/RuntimeAPI/html/search/variables_b.js index 133dd9dc6e..250c203caf 100644 --- a/projects/hip/docs/RuntimeAPI/html/search/variables_b.js +++ b/projects/hip/docs/RuntimeAPI/html/search/variables_b.js @@ -1,4 +1,4 @@ var searchData= [ - ['y',['y',['../structdim3.html#a83e60e072f7e8bdfde6ac05053cbb370',1,'dim3']]] + ['x',['x',['../structdim3.html#ac866c05f83a28dac20a153fc65b3b16c',1,'dim3']]] ]; diff --git a/projects/hip/docs/RuntimeAPI/html/search/variables_c.js b/projects/hip/docs/RuntimeAPI/html/search/variables_c.js index e8bf38b99c..133dd9dc6e 100644 --- a/projects/hip/docs/RuntimeAPI/html/search/variables_c.js +++ b/projects/hip/docs/RuntimeAPI/html/search/variables_c.js @@ -1,4 +1,4 @@ var searchData= [ - ['z',['z',['../structdim3.html#a866e38993ecc4e76fd47311236c16b04',1,'dim3']]] + 
['y',['y',['../structdim3.html#a83e60e072f7e8bdfde6ac05053cbb370',1,'dim3']]] ]; diff --git a/projects/hip/docs/RuntimeAPI/html/structdim3-members.html b/projects/hip/docs/RuntimeAPI/html/structdim3-members.html index 429ba0bf78..a11da0ba47 100644 --- a/projects/hip/docs/RuntimeAPI/html/structdim3-members.html +++ b/projects/hip/docs/RuntimeAPI/html/structdim3-members.html @@ -97,7 +97,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');

    diff --git a/projects/hip/docs/RuntimeAPI/html/structdim3.html b/projects/hip/docs/RuntimeAPI/html/structdim3.html index e305c9dfdc..9c5a6733d9 100644 --- a/projects/hip/docs/RuntimeAPI/html/structdim3.html +++ b/projects/hip/docs/RuntimeAPI/html/structdim3.html @@ -118,12 +118,12 @@ uint32_t 

    Detailed Description

    Struct for data in 3D


    The documentation for this struct was generated from the following file: diff --git a/projects/hip/docs/RuntimeAPI/html/structhipChannelFormatDesc-members.html b/projects/hip/docs/RuntimeAPI/html/structhipChannelFormatDesc-members.html index 87eaf5aa59..2670d6f1d7 100644 --- a/projects/hip/docs/RuntimeAPI/html/structhipChannelFormatDesc-members.html +++ b/projects/hip/docs/RuntimeAPI/html/structhipChannelFormatDesc-members.html @@ -94,7 +94,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search'); diff --git a/projects/hip/docs/RuntimeAPI/html/structhipChannelFormatDesc.html b/projects/hip/docs/RuntimeAPI/html/structhipChannelFormatDesc.html index d292794181..1ac6f4cde8 100644 --- a/projects/hip/docs/RuntimeAPI/html/structhipChannelFormatDesc.html +++ b/projects/hip/docs/RuntimeAPI/html/structhipChannelFormatDesc.html @@ -98,12 +98,12 @@ int _dummy  
    The documentation for this struct was generated from the following file: diff --git a/projects/hip/docs/RuntimeAPI/html/structhipDeviceArch__t-members.html b/projects/hip/docs/RuntimeAPI/html/structhipDeviceArch__t-members.html index 1b606771ad..5d14abe998 100644 --- a/projects/hip/docs/RuntimeAPI/html/structhipDeviceArch__t-members.html +++ b/projects/hip/docs/RuntimeAPI/html/structhipDeviceArch__t-members.html @@ -110,7 +110,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search'); diff --git a/projects/hip/docs/RuntimeAPI/html/structhipDeviceArch__t.html b/projects/hip/docs/RuntimeAPI/html/structhipDeviceArch__t.html index 7e28d4ee8e..8bd9d10bc6 100644 --- a/projects/hip/docs/RuntimeAPI/html/structhipDeviceArch__t.html +++ b/projects/hip/docs/RuntimeAPI/html/structhipDeviceArch__t.html @@ -95,59 +95,59 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search'); Public Attributes unsigned hasGlobalInt32Atomics: 1 - 32-bit integer atomics for global memory
    + 32-bit integer atomics for global memory.
      unsigned hasGlobalFloatAtomicExch: 1 - 32-bit float atomic exch for global memory
    + 32-bit float atomic exch for global memory.
      unsigned hasSharedInt32Atomics: 1 - 32-bit integer atomics for shared memory
    + 32-bit integer atomics for shared memory.
      unsigned hasSharedFloatAtomicExch: 1 - 32-bit float atomic exch for shared memory
    + 32-bit float atomic exch for shared memory.
      unsigned hasFloatAtomicAdd: 1 - 32-bit float atomic add in global and shared memory
    + 32-bit float atomic add in global and shared memory.
      unsigned hasGlobalInt64Atomics: 1 - 64-bit integer atomics for global memory
    + 64-bit integer atomics for global memory.
      unsigned hasSharedInt64Atomics: 1 - 64-bit integer atomics for shared memory
    + 64-bit integer atomics for shared memory.
      unsigned hasDoubles: 1 - double-precision floating point.
    + Double-precision floating point.
      unsigned hasWarpVote: 1 - warp vote instructions (__any, __all)
    + Warp vote instructions (__any, __all).
      unsigned hasWarpBallot: 1 - warp ballot instructions (__ballot)
    + Warp ballot instructions (__ballot).
      unsigned hasWarpShuffle: 1 - warp shuffle operations. (__shfl_*)
    + Warp shuffle operations. (__shfl_*).
      unsigned hasFunnelShift: 1 - funnel two words into one, with shift&mask caps
    + Funnel two words into one with shift&mask caps.
      unsigned hasThreadFenceSystem: 1 - __threadfence_system
    + __threadfence_system.
      unsigned hasSyncThreadsExt: 1 - __syncthreads_count, syncthreads_and, syncthreads_or
    + __syncthreads_count, syncthreads_and, syncthreads_or.
      unsigned hasSurfaceFuncs: 1 @@ -155,20 +155,20 @@ unsigned   unsigned has3dGrid: 1 - Grid and group dims are 3D (rather than 2D)
    + Grid and group dims are 3D (rather than 2D).
      unsigned hasDynamicParallelism: 1 - Dynamic parallellism.
    + Dynamic parallelism.
     
    The documentation for this struct was generated from the following file: diff --git a/projects/hip/docs/RuntimeAPI/html/structhipDeviceProp__t-members.html b/projects/hip/docs/RuntimeAPI/html/structhipDeviceProp__t-members.html index d37974762e..4df537b6cc 100644 --- a/projects/hip/docs/RuntimeAPI/html/structhipDeviceProp__t-members.html +++ b/projects/hip/docs/RuntimeAPI/html/structhipDeviceProp__t-members.html @@ -94,24 +94,29 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search'); clockInstructionRatehipDeviceProp_t clockRatehipDeviceProp_t computeModehipDeviceProp_t - l2CacheSizehipDeviceProp_t - majorhipDeviceProp_t - maxGridSizehipDeviceProp_t + concurrentKernelshipDeviceProp_t + l2CacheSizehipDeviceProp_t + majorhipDeviceProp_t + maxGridSizehipDeviceProp_t + maxSharedMemoryPerMultiProcessorhipDeviceProp_t maxThreadsDimhipDeviceProp_t maxThreadsPerBlockhipDeviceProp_t maxThreadsPerMultiProcessorhipDeviceProp_t - minorhipDeviceProp_t - multiProcessorCounthipDeviceProp_t - namehipDeviceProp_t - regsPerBlockhipDeviceProp_t - sharedMemPerBlockhipDeviceProp_t - totalConstMemhipDeviceProp_t - totalGlobalMemhipDeviceProp_t - warpSizehipDeviceProp_t + memoryClockRatehipDeviceProp_t + minorhipDeviceProp_t + multiProcessorCounthipDeviceProp_t + namehipDeviceProp_t + pciBusIDhipDeviceProp_t + pciDeviceIDhipDeviceProp_t + regsPerBlockhipDeviceProp_t + sharedMemPerBlockhipDeviceProp_t + totalConstMemhipDeviceProp_t + totalGlobalMemhipDeviceProp_t + warpSizehipDeviceProp_t diff --git a/projects/hip/docs/RuntimeAPI/html/structhipDeviceProp__t.html b/projects/hip/docs/RuntimeAPI/html/structhipDeviceProp__t.html index 09483a9577..06e419e325 100644 --- a/projects/hip/docs/RuntimeAPI/html/structhipDeviceProp__t.html +++ b/projects/hip/docs/RuntimeAPI/html/structhipDeviceProp__t.html @@ -101,39 +101,43 @@ char   size_t totalGlobalMem - Size of global memory region (in bytes)
    + Size of global memory region (in bytes).
      size_t sharedMemPerBlock - Size of shared memory region (in bytes)
    + Size of shared memory region (in bytes).
      int regsPerBlock - registers per block
    + Registers per block.
      int warpSize - warp size
    + Warp size.
      int maxThreadsPerBlock - max work items per work group or workgroup max size
    + Max work items per work group or workgroup max size.
      int maxThreadsDim [3] - max number of threads in each dimension (XYZ) of a block
    + Max number of threads in each dimension (XYZ) of a block.
      int maxGridSize [3] - max grid dimensions (XYZ)
    + Max grid dimensions (XYZ).
      int clockRate - max clock frequency of the multiProcessors, in khz.
    + Max clock frequency of the multiProcessors in khz.
      + +int memoryClockRate + Max memory clock frequency in khz.
    size_t totalConstMem - Size of shared memory region (in bytes)
    + Size of shared memory region (in bytes).
      int major @@ -145,7 +149,7 @@ int   int multiProcessorCount - number of multi-processors (compute units)
    + Number of multi-processors (compute units).
      int l2CacheSize @@ -167,16 +171,32 @@ int hipDeviceArch_t arch  Architectural feature flags. New for HIP.
      + +int concurrentKernels + Device can possibly execute multiple kernels concurrently.
    +  + +int pciBusID + PCI Bus ID.
    +  + +int pciDeviceID + PCI Device ID.
    +  + +size_t maxSharedMemoryPerMultiProcessor + Maximum Shared Memory Per Multiprocessor.

    Detailed Description

    hipDeviceProp


    The documentation for this struct was generated from the following file: diff --git a/projects/hip/docs/RuntimeAPI/html/structhipEvent__t-members.html b/projects/hip/docs/RuntimeAPI/html/structhipEvent__t-members.html index 22a528a157..1f7daca251 100644 --- a/projects/hip/docs/RuntimeAPI/html/structhipEvent__t-members.html +++ b/projects/hip/docs/RuntimeAPI/html/structhipEvent__t-members.html @@ -94,7 +94,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search'); diff --git a/projects/hip/docs/RuntimeAPI/html/structhipEvent__t.html b/projects/hip/docs/RuntimeAPI/html/structhipEvent__t.html index e904fd699b..202c20a0bb 100644 --- a/projects/hip/docs/RuntimeAPI/html/structhipEvent__t.html +++ b/projects/hip/docs/RuntimeAPI/html/structhipEvent__t.html @@ -98,12 +98,12 @@ struct ihipEvent_t *   
    The documentation for this struct was generated from the following file: diff --git a/projects/hip/docs/RuntimeAPI/html/structihipDevice__t-members.html b/projects/hip/docs/RuntimeAPI/html/structihipDevice__t-members.html index 5018e17cdd..5bc69024fc 100644 --- a/projects/hip/docs/RuntimeAPI/html/structihipDevice__t-members.html +++ b/projects/hip/docs/RuntimeAPI/html/structihipDevice__t-members.html @@ -102,7 +102,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search'); diff --git a/projects/hip/docs/RuntimeAPI/html/structihipDevice__t.html b/projects/hip/docs/RuntimeAPI/html/structihipDevice__t.html index 9b98f8364e..0a91addd72 100644 --- a/projects/hip/docs/RuntimeAPI/html/structihipDevice__t.html +++ b/projects/hip/docs/RuntimeAPI/html/structihipDevice__t.html @@ -126,12 +126,12 @@ unsigned _compute_units 
    The documentation for this struct was generated from the following file: diff --git a/projects/hip/docs/RuntimeAPI/html/structihipEvent__t-members.html b/projects/hip/docs/RuntimeAPI/html/structihipEvent__t-members.html index 0776e7e875..16bad989ee 100644 --- a/projects/hip/docs/RuntimeAPI/html/structihipEvent__t-members.html +++ b/projects/hip/docs/RuntimeAPI/html/structihipEvent__t-members.html @@ -98,7 +98,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search'); diff --git a/projects/hip/docs/RuntimeAPI/html/structihipEvent__t.html b/projects/hip/docs/RuntimeAPI/html/structihipEvent__t.html index 9c01e12246..a378a2a2d4 100644 --- a/projects/hip/docs/RuntimeAPI/html/structihipEvent__t.html +++ b/projects/hip/docs/RuntimeAPI/html/structihipEvent__t.html @@ -110,12 +110,12 @@ uint64_t _timestamp
     
    The documentation for this struct was generated from the following file: diff --git a/projects/hip/docs/RuntimeAPI/html/structihipStream__t-members.html b/projects/hip/docs/RuntimeAPI/html/structihipStream__t-members.html index d46a3b9656..57df3ae1ba 100644 --- a/projects/hip/docs/RuntimeAPI/html/structihipStream__t-members.html +++ b/projects/hip/docs/RuntimeAPI/html/structihipStream__t-members.html @@ -98,7 +98,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search'); diff --git a/projects/hip/docs/RuntimeAPI/html/structihipStream__t.html b/projects/hip/docs/RuntimeAPI/html/structihipStream__t.html index ba671e68af..264d5ffd81 100644 --- a/projects/hip/docs/RuntimeAPI/html/structihipStream__t.html +++ b/projects/hip/docs/RuntimeAPI/html/structihipStream__t.html @@ -114,12 +114,12 @@ ihipCommand_t _last_comman  
    The documentation for this struct was generated from the following file: diff --git a/projects/hip/docs/RuntimeAPI/html/structtexture-members.html b/projects/hip/docs/RuntimeAPI/html/structtexture-members.html index 12b5319739..42ea0039e3 100644 --- a/projects/hip/docs/RuntimeAPI/html/structtexture-members.html +++ b/projects/hip/docs/RuntimeAPI/html/structtexture-members.html @@ -97,7 +97,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search'); diff --git a/projects/hip/docs/RuntimeAPI/html/structtexture.html b/projects/hip/docs/RuntimeAPI/html/structtexture.html index bafbc0f8e3..4f8d592be9 100644 --- a/projects/hip/docs/RuntimeAPI/html/structtexture.html +++ b/projects/hip/docs/RuntimeAPI/html/structtexture.html @@ -107,7 +107,7 @@ const T * _dataPtr  - Public Attributes inherited from textureReference -hipTextureFilterMode filterMode +hipTextureFilterMode filterMode   bool normalized @@ -117,12 +117,12 @@ bool normalized 
    The documentation for this struct was generated from the following file: diff --git a/projects/hip/docs/RuntimeAPI/html/structtextureReference-members.html b/projects/hip/docs/RuntimeAPI/html/structtextureReference-members.html index 8b3c890f40..89df171ea7 100644 --- a/projects/hip/docs/RuntimeAPI/html/structtextureReference-members.html +++ b/projects/hip/docs/RuntimeAPI/html/structtextureReference-members.html @@ -96,7 +96,7 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search'); diff --git a/projects/hip/docs/RuntimeAPI/html/structtextureReference.html b/projects/hip/docs/RuntimeAPI/html/structtextureReference.html index d5fd3910fc..46cb4d44e7 100644 --- a/projects/hip/docs/RuntimeAPI/html/structtextureReference.html +++ b/projects/hip/docs/RuntimeAPI/html/structtextureReference.html @@ -103,7 +103,7 @@ Inheritance diagram for textureReference:

    Public Attributes

    -hipTextureFilterMode filterMode +hipTextureFilterMode filterMode   bool normalized @@ -113,12 +113,12 @@ bool normalized 
    The documentation for this struct was generated from the following file: From 6b6d71743a21acbcdbd69a37c075f22f3cba6216 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 18 Feb 2016 21:01:14 -0600 Subject: [PATCH 53/94] Update doxygen HTML [ROCm/hip commit: ac5957320eb08fbe418f466780861c47143821c6] --- .../docs/RuntimeAPI/html/globals_enum.html | 120 +++++ .../docs/RuntimeAPI/html/globals_eval.html | 147 ++++++ .../docs/RuntimeAPI/html/globals_type.html | 114 +++++ .../hcc__detail_2hip__runtime__api_8h.html | 326 +++++++++++++ .../hcc__detail_2hip__vector__types_8h.html | 452 ++++++++++++++++++ .../docs/RuntimeAPI/html/hip__texture_8h.html | 209 ++++++++ .../RuntimeAPI/html/host__defines_8h.html | 147 ++++++ .../RuntimeAPI/html/search/defines_1.html | 26 + .../docs/RuntimeAPI/html/search/defines_1.js | 4 + .../RuntimeAPI/html/search/defines_2.html | 26 + .../docs/RuntimeAPI/html/search/defines_2.js | 4 + .../RuntimeAPI/html/search/variables_d.html | 26 + .../RuntimeAPI/html/search/variables_d.js | 4 + 13 files changed, 1605 insertions(+) create mode 100644 projects/hip/docs/RuntimeAPI/html/globals_enum.html create mode 100644 projects/hip/docs/RuntimeAPI/html/globals_eval.html create mode 100644 projects/hip/docs/RuntimeAPI/html/globals_type.html create mode 100644 projects/hip/docs/RuntimeAPI/html/hcc__detail_2hip__runtime__api_8h.html create mode 100644 projects/hip/docs/RuntimeAPI/html/hcc__detail_2hip__vector__types_8h.html create mode 100644 projects/hip/docs/RuntimeAPI/html/hip__texture_8h.html create mode 100644 projects/hip/docs/RuntimeAPI/html/host__defines_8h.html create mode 100644 projects/hip/docs/RuntimeAPI/html/search/defines_1.html create mode 100644 projects/hip/docs/RuntimeAPI/html/search/defines_1.js create mode 100644 projects/hip/docs/RuntimeAPI/html/search/defines_2.html create mode 100644 projects/hip/docs/RuntimeAPI/html/search/defines_2.js create mode 100644 projects/hip/docs/RuntimeAPI/html/search/variables_d.html create mode 100644 
projects/hip/docs/RuntimeAPI/html/search/variables_d.js diff --git a/projects/hip/docs/RuntimeAPI/html/globals_enum.html b/projects/hip/docs/RuntimeAPI/html/globals_enum.html new file mode 100644 index 0000000000..76cd3b9e6f --- /dev/null +++ b/projects/hip/docs/RuntimeAPI/html/globals_enum.html @@ -0,0 +1,120 @@ + + + + + + +HIP: Heterogenous-computing Interface for Portability: File Members + + + + + + + + + +
    +
    + + + + + + +
    +
    HIP: Heterogenous-computing Interface for Portability +
    +
    +
    + + + + + + +
    + + + + +
    + +
    + +
    +
    + + + + diff --git a/projects/hip/docs/RuntimeAPI/html/globals_eval.html b/projects/hip/docs/RuntimeAPI/html/globals_eval.html new file mode 100644 index 0000000000..5e5e2c4e57 --- /dev/null +++ b/projects/hip/docs/RuntimeAPI/html/globals_eval.html @@ -0,0 +1,147 @@ + + + + + + +HIP: Heterogenous-computing Interface for Portability: File Members + + + + + + + + + +
    +
    + + + + + + +
    +
    HIP: Heterogenous-computing Interface for Portability +
    +
    +
    + + + + + + +
    + + + + +
    + +
    + +
    +
    + + + + diff --git a/projects/hip/docs/RuntimeAPI/html/globals_type.html b/projects/hip/docs/RuntimeAPI/html/globals_type.html new file mode 100644 index 0000000000..2c2504da8c --- /dev/null +++ b/projects/hip/docs/RuntimeAPI/html/globals_type.html @@ -0,0 +1,114 @@ + + + + + + +HIP: Heterogenous-computing Interface for Portability: File Members + + + + + + + + + +
    +
    + + + + + + +
    +
    HIP: Heterogenous-computing Interface for Portability +
    +
    +
    + + + + + + +
    + + + + +
    + +
    + +
    +
    + + + + diff --git a/projects/hip/docs/RuntimeAPI/html/hcc__detail_2hip__runtime__api_8h.html b/projects/hip/docs/RuntimeAPI/html/hcc__detail_2hip__runtime__api_8h.html new file mode 100644 index 0000000000..374a22be9b --- /dev/null +++ b/projects/hip/docs/RuntimeAPI/html/hcc__detail_2hip__runtime__api_8h.html @@ -0,0 +1,326 @@ + + + + + + +HIP: Heterogenous-computing Interface for Portability: /home/bensander/HIP.public/include/hcc_detail/hip_runtime_api.h File Reference + + + + + + + + + +
    +
    + + + + + + +
    +
    HIP: Heterogenous-computing Interface for Portability +
    +
    +
    + + + + + + + + + +
    + +
    + + +
    +
    + +
    +
    hip_runtime_api.h File Reference
    +
    +
    + +

    Contains C function APIs for HIP runtime. This file does not use any HCC builtin or special language extensions (-hc mode) ; those functions in hip_runtime.h. +More...

    +
    #include <stdint.h>
    +#include <stddef.h>
    +#include <hcc_detail/host_defines.h>
    +#include <hc.hpp>
    +
    +

    Go to the source code of this file.

    + + + + + + +

    +Classes

    struct  dim3
     
    struct  hipEvent_t
     
    + + + + + + + + + + + + + + + + + + + +

    +Macros

    #define hipStreamDefault   0x00
     Flags that can be used with hipStreamCreateWithFlags. More...
     
    +#define hipStreamNonBlocking   0x01
     Stream does not implicitly synchronize with null stream.
     
    #define hipEventDefault   0x0
     Flags that can be used with hipEventCreateWithFlags: More...
     
    +#define hipEventBlockingSync   0x1
     Waiting will yield CPU. Power-friendly and usage-friendly but may increase latency.
     
    +#define hipEventDisableTiming   0x2
     Disable event's capability to record timing information. May improve performance.
     
    #define hipEventInterprocess   0x4
     Event can support IPC. More...
     
    + + + + + + + + + + + +

    +Typedefs

    typedef enum hipFuncCache hipFuncCache
     
    typedef enum hipSharedMemConfig hipSharedMemConfig
     
    typedef struct dim3 dim3
     
    +typedef struct ihipStream_thipStream_t
     
    +typedef struct hipEvent_t hipEvent_t
     
    + + + + + + + +

    +Enumerations

    enum  hipFuncCache { hipFuncCachePreferNone, +hipFuncCachePreferShared, +hipFuncCachePreferL1, +hipFuncCachePreferEqual + }
     
    enum  hipSharedMemConfig { hipSharedMemBankSizeDefault, +hipSharedMemBankSizeFourByte, +hipSharedMemBankSizeEightByte + }
     
    enum  hipMemcpyKind {
    +  hipMemcpyHostToHost = 0, +hipMemcpyHostToDevice = 1, +hipMemcpyDeviceToHost = 2, +hipMemcpyDeviceToDevice =3, +
    +  hipMemcpyDefault = 4 +
    + }
     
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    +Functions

    hipError_t hipDeviceSynchronize (void)
     Blocks until the default device has completed all preceding requested tasks. More...
     
    hipError_t hipDeviceReset (void)
     Destroy all resources and reset all state on the default device in the current process. More...
     
    hipError_t hipSetDevice (int device)
     Set default device to be used for subsequent hip API calls from this thread. More...
     
    hipError_t hipGetDevice (int *device)
     Return the default device id for the calling host thread. More...
     
    hipError_t hipGetDeviceCount (int *count)
     Return number of compute-capable devices. More...
     
    hipError_t hipDeviceGetAttribute (int *pi, hipDeviceAttribute_t attr, int device)
     Query device attribute. More...
     
    hipError_t hipDeviceGetProperties (hipDeviceProp_t *prop, int device)
     Returns device properties. More...
     
    hipError_t hipDeviceSetCacheConfig (hipFuncCache cacheConfig)
     Set L1/Shared cache partition. More...
     
    hipError_t hipDeviceGetCacheConfig (hipFuncCache *cacheConfig)
     Get the L1/Shared cache partition configuration. More...
     
    hipError_t hipFuncSetCacheConfig (hipFuncCache config)
     Set Cache configuration for a specific function. More...
     
    hipError_t hipDeviceGetSharedMemConfig (hipSharedMemConfig *pConfig)
     Get Shared memory bank configuration. More...
     
    hipError_t hipDeviceSetSharedMemConfig (hipSharedMemConfig config)
     Set Shared memory bank configuration. More...
     
    hipError_t hipGetLastError (void)
     Return last error returned by any HIP runtime API call and resets the stored error code to hipSuccess. More...
     
    hipError_t hipPeekAtLastError (void)
     Return last error returned by any HIP runtime API call. More...
     
    const char * hipGetErrorName (hipError_t hip_error)
     Return name of the specified error code in text form. More...
     
    const char * hipGetErrorString (hipError_t hip_error)
     Return handy text string message to explain the error which occurred. More...
     
    hipError_t hipStreamCreateWithFlags (hipStream_t *stream, unsigned int flags)
     Create an asynchronous stream. More...
     
    hipError_t hipStreamWaitEvent (hipStream_t stream, hipEvent_t event, unsigned int flags)
     Make the specified compute stream wait for an event. More...
     
    hipError_t hipStreamSynchronize (hipStream_t stream)
     Wait for all commands in stream to complete. More...
     
    hipError_t hipStreamDestroy (hipStream_t stream)
     Destroys the specified stream. More...
     
    hipError_t hipStreamGetFlags (hipStream_t stream, unsigned int *flags)
     Return flags associated with this stream. More...
     
    hipError_t hipEventCreateWithFlags (hipEvent_t *event, unsigned flags)
     Create an event with the specified flags. More...
     
    hipError_t hipEventRecord (hipEvent_t event, hipStream_t stream=NULL)
     Record an event in the specified stream. More...
     
    hipError_t hipEventDestroy (hipEvent_t event)
     Destroy the specified event. More...
     
    hipError_t hipEventSynchronize (hipEvent_t event)
     Wait for an event to complete. More...
     
    hipError_t hipEventElapsedTime (float *ms, hipEvent_t start, hipEvent_t stop)
     Return the elapsed time between two events. More...
     
    hipError_t hipEventQuery (hipEvent_t event)
     Query event status. More...
     
    hipError_t hipMalloc (void **ptr, size_t size)
     Allocate memory on the default accelerator. More...
     
    hipError_t hipMallocHost (void **ptr, size_t size)
     Allocate pinned host memory. More...
     
    hipError_t hipFree (void *ptr)
     Free memory allocated by the hcc hip memory allocation API. This API performs an implicit hipDeviceSynchronize() call. More...
     
    hipError_t hipFreeHost (void *ptr)
     Free memory allocated by the hcc hip host memory allocation API. More...
     
    hipError_t hipMemcpy (void *dst, const void *src, size_t sizeBytes, hipMemcpyKind kind)
     Copy data from src to dst. More...
     
    hipError_t hipMemcpyToSymbol (const char *symbolName, const void *src, size_t sizeBytes, size_t offset, hipMemcpyKind kind)
     Copies sizeBytes bytes from the memory area pointed to by src to the memory area located offset bytes from the start of the symbol named symbolName. More...
     
    hipError_t hipMemcpyAsync (void *dst, const void *src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream=0)
     Copy data from src to dst asynchronously. More...
     
    hipError_t hipMemset (void *dst, int value, size_t sizeBytes)
     Fills the first sizeBytes bytes of the memory area pointed to by dst with the constant byte value value. More...
     
    hipError_t hipMemsetAsync (void *dst, int value, size_t sizeBytes, hipStream_t=0)
     Fills the first sizeBytes bytes of the memory area pointed to by dst with the constant byte value value. More...
     
    +hipError_t hipMemGetInfo (size_t *free, size_t *total)
     
    hipError_t hipDeviceCanAccessPeer (int *canAccessPeer, int device, int peerDevice)
     Determine if a device can access a peer's memory. More...
     
    hipError_t hipDeviceDisablePeerAccess (int peerDevice)
     Disables registering memory on peerDevice for direct access from the current device. More...
     
    hipError_t hipDeviceEnablePeerAccess (int peerDevice, unsigned int flags)
     Enables registering memory on peerDevice for direct access from the current device. More...
     
    hipError_t hipMemcpyPeer (void *dst, int dstDevice, const void *src, int srcDevice, size_t sizeBytes)
     Copies memory from one device to memory on another device. More...
     
    hipError_t hipMemcpyPeerAsync (void *dst, int dstDevice, const void *src, int srcDevice, size_t sizeBytes, hipStream_t stream=0)
     Copies memory from one device to memory on another device. More...
     
    hipError_t hipDriverGetVersion (int *driverVersion)
     Returns the approximate HIP driver version. More...
     
    hipError_t hipHccGetAccelerator (int deviceId, hc::accelerator *acc)
     Return hc::accelerator associated with the specified deviceId. More...
     
    hipError_t hipHccGetAcceleratorView (hipStream_t stream, hc::accelerator_view **av)
     Return hc::accelerator_view associated with the specified stream. More...
     
    +

    Detailed Description

    +

    Contains C function APIs for HIP runtime. This file does not use any HCC builtin or special language extensions (-hc mode); those functions are in hip_runtime.h.

    +
    + + + + diff --git a/projects/hip/docs/RuntimeAPI/html/hcc__detail_2hip__vector__types_8h.html b/projects/hip/docs/RuntimeAPI/html/hcc__detail_2hip__vector__types_8h.html new file mode 100644 index 0000000000..7e73a981b1 --- /dev/null +++ b/projects/hip/docs/RuntimeAPI/html/hcc__detail_2hip__vector__types_8h.html @@ -0,0 +1,452 @@ + + + + + + +HIP: Heterogenous-computing Interface for Portability: /home/bensander/HIP.public/include/hcc_detail/hip_vector_types.h File Reference + + + + + + + + + +
    +
    + + + + + + +
    +
    HIP: Heterogenous-computing Interface for Portability +
    +
    +
    + + + + + + + + + +
    + +
    + + +
    +
    + +
    +
    hip_vector_types.h File Reference
    +
    +
    + +

    Defines the different new vector types for HIP runtime. +More...

    +
    #include <hc_short_vector.hpp>
    +
    +

    Go to the source code of this file.

    + + + + + + + + + + + +

    +Macros

    +#define ONE_COMPONENT_ACCESS(T, VT)   inline VT make_ ##VT (T x) { VT t; t.x = x; return t; };
     
    +
     
    +#define TWO_COMPONENT_ACCESS(T, VT)   inline VT make_ ##VT (T x, T y) { VT t; t.x=x; t.y=y; return t; };
     
    +#define THREE_COMPONENT_ACCESS(T, VT)   inline VT make_ ##VT (T x, T y, T z) { VT t; t.x=x; t.y=y; t.z=z; return t; };
     
    +#define FOUR_COMPONENT_ACCESS(T, VT)   inline VT make_ ##VT (T x, T y, T z, T w) { VT t; t.x=x; t.y=y; t.z=z; t.w=w; return t; };
     
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    +Typedefs

    +typedef hc::short_vector::char1 char1
     
    +typedef hc::short_vector::char2 char2
     
    +typedef hc::short_vector::char3 char3
     
    +typedef hc::short_vector::char4 char4
     
    +typedef hc::short_vector::short1 short1
     
    +typedef hc::short_vector::short2 short2
     
    +typedef hc::short_vector::short3 short3
     
    +typedef hc::short_vector::short4 short4
     
    +typedef hc::short_vector::int1 int1
     
    +typedef hc::short_vector::int2 int2
     
    +typedef hc::short_vector::int3 int3
     
    +typedef hc::short_vector::int4 int4
     
    +typedef hc::short_vector::long1 long1
     
    +typedef hc::short_vector::long2 long2
     
    +typedef hc::short_vector::long3 long3
     
    +typedef hc::short_vector::long4 long4
     
    +typedef hc::short_vector::longlong1 longlong1
     
    +typedef hc::short_vector::longlong2 longlong2
     
    +typedef hc::short_vector::longlong3 longlong3
     
    +typedef hc::short_vector::longlong4 longlong4
     
    +typedef hc::short_vector::uchar1 uchar1
     
    +typedef hc::short_vector::uchar2 uchar2
     
    +typedef hc::short_vector::uchar3 uchar3
     
    +typedef hc::short_vector::uchar4 uchar4
     
    +typedef hc::short_vector::ushort1 ushort1
     
    +typedef hc::short_vector::ushort2 ushort2
     
    +typedef hc::short_vector::ushort3 ushort3
     
    +typedef hc::short_vector::ushort4 ushort4
     
    +typedef hc::short_vector::uint1 uint1
     
    +typedef hc::short_vector::uint2 uint2
     
    +typedef hc::short_vector::uint3 uint3
     
    +typedef hc::short_vector::uint4 uint4
     
    +typedef hc::short_vector::ulong1 ulong1
     
    +typedef hc::short_vector::ulong2 ulong2
     
    +typedef hc::short_vector::ulong3 ulong3
     
    +typedef hc::short_vector::ulong4 ulong4
     
    +typedef
    +hc::short_vector::ulonglong1 
    ulonglong1
     
    +typedef
    +hc::short_vector::ulonglong2 
    ulonglong2
     
    +typedef
    +hc::short_vector::ulonglong3 
    ulonglong3
     
    +typedef
    +hc::short_vector::ulonglong4 
    ulonglong4
     
    +typedef hc::short_vector::float1 float1
     
    +typedef hc::short_vector::float2 float2
     
    +typedef hc::short_vector::float3 float3
     
    +typedef hc::short_vector::float4 float4
     
    +typedef hc::short_vector::double1 double1
     
    +typedef hc::short_vector::double2 double2
     
    +typedef hc::short_vector::double3 double3
     
    +typedef hc::short_vector::double4 double4
     
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    +Functions

    ONE_COMPONENT_ACCESS (signed char, char1)
     
    TWO_COMPONENT_ACCESS (signed char, char2)
     
    THREE_COMPONENT_ACCESS (signed char, char3)
     
    FOUR_COMPONENT_ACCESS (signed char, char4)
     
    ONE_COMPONENT_ACCESS (short, short1)
     
    TWO_COMPONENT_ACCESS (short, short2)
     
    THREE_COMPONENT_ACCESS (short, short3)
     
    FOUR_COMPONENT_ACCESS (short, short4)
     
    ONE_COMPONENT_ACCESS (int, int1)
     
    TWO_COMPONENT_ACCESS (int, int2)
     
    THREE_COMPONENT_ACCESS (int, int3)
     
    FOUR_COMPONENT_ACCESS (int, int4)
     
    ONE_COMPONENT_ACCESS (long int, long1)
     
    TWO_COMPONENT_ACCESS (long int, long2)
     
    THREE_COMPONENT_ACCESS (long int, long3)
     
    FOUR_COMPONENT_ACCESS (long int, long4)
     
    ONE_COMPONENT_ACCESS (long long int, ulong1)
     
    TWO_COMPONENT_ACCESS (long long int, ulong2)
     
    THREE_COMPONENT_ACCESS (long long int, ulong3)
     
    FOUR_COMPONENT_ACCESS (long long int, ulong4)
     
    ONE_COMPONENT_ACCESS (long long int, longlong1)
     
    TWO_COMPONENT_ACCESS (long long int, longlong2)
     
    THREE_COMPONENT_ACCESS (long long int, longlong3)
     
    FOUR_COMPONENT_ACCESS (long long int, longlong4)
     
    ONE_COMPONENT_ACCESS (unsigned char, uchar1)
     
    TWO_COMPONENT_ACCESS (unsigned char, uchar2)
     
    THREE_COMPONENT_ACCESS (unsigned char, uchar3)
     
    FOUR_COMPONENT_ACCESS (unsigned char, uchar4)
     
    ONE_COMPONENT_ACCESS (unsigned short, ushort1)
     
    TWO_COMPONENT_ACCESS (unsigned short, ushort2)
     
    THREE_COMPONENT_ACCESS (unsigned short, ushort3)
     
    FOUR_COMPONENT_ACCESS (unsigned short, ushort4)
     
    ONE_COMPONENT_ACCESS (unsigned int, uint1)
     
    TWO_COMPONENT_ACCESS (unsigned int, uint2)
     
    THREE_COMPONENT_ACCESS (unsigned int, uint3)
     
    FOUR_COMPONENT_ACCESS (unsigned int, uint4)
     
    ONE_COMPONENT_ACCESS (unsigned long int, ulong1)
     
    TWO_COMPONENT_ACCESS (unsigned long int, ulong2)
     
    THREE_COMPONENT_ACCESS (unsigned long int, ulong3)
     
    FOUR_COMPONENT_ACCESS (unsigned long int, ulong4)
     
    ONE_COMPONENT_ACCESS (unsigned long long int, ulong1)
     
    TWO_COMPONENT_ACCESS (unsigned long long int, ulong2)
     
    THREE_COMPONENT_ACCESS (unsigned long long int, ulong3)
     
    FOUR_COMPONENT_ACCESS (unsigned long long int, ulong4)
     
    ONE_COMPONENT_ACCESS (unsigned long long int, ulonglong1)
     
    TWO_COMPONENT_ACCESS (unsigned long long int, ulonglong2)
     
    THREE_COMPONENT_ACCESS (unsigned long long int, ulonglong3)
     
    FOUR_COMPONENT_ACCESS (unsigned long long int, ulonglong4)
     
    ONE_COMPONENT_ACCESS (float, float1)
     
    TWO_COMPONENT_ACCESS (float, float2)
     
    THREE_COMPONENT_ACCESS (float, float3)
     
    FOUR_COMPONENT_ACCESS (float, float4)
     
    ONE_COMPONENT_ACCESS (double, double1)
     
    TWO_COMPONENT_ACCESS (double, double2)
     
    THREE_COMPONENT_ACCESS (double, double3)
     
    FOUR_COMPONENT_ACCESS (double, double4)
     
    +

    Detailed Description

    +

    Defines the different new vector types for HIP runtime.

    +
    + + + + diff --git a/projects/hip/docs/RuntimeAPI/html/hip__texture_8h.html b/projects/hip/docs/RuntimeAPI/html/hip__texture_8h.html new file mode 100644 index 0000000000..d95b127f85 --- /dev/null +++ b/projects/hip/docs/RuntimeAPI/html/hip__texture_8h.html @@ -0,0 +1,209 @@ + + + + + + +HIP: Heterogenous-computing Interface for Portability: /home/bensander/HIP.public/include/hcc_detail/hip_texture.h File Reference + + + + + + + + + +
    +
    + + + + + + +
    +
    HIP: Heterogenous-computing Interface for Portability +
    +
    +
    + + + + + + + + + +
    + +
    + + +
    +
    + +
    +
    hip_texture.h File Reference
    +
    +
    + +

    HIP C++ Texture API for hcc compiler. +More...

    +
    #include <limits.h>
    +#include <hip_runtime.h>
    +
    +

    Go to the source code of this file.

    + + + + + + + + +

    +Classes

    struct  hipChannelFormatDesc
     
    struct  textureReference
     
    struct  texture< T, texType, hipTextureReadMode >
     
    + + + + + +

    +Macros

    +#define hipTextureType1D   1
     
    +#define tex1Dfetch(_tex, _addr)   (_tex._dataPtr[_addr])
     
    + + + + + + + +

    +Typedefs

    +typedef struct hipChannelFormatDesc hipChannelFormatDesc
     
    +typedef enum hipTextureReadMode hipTextureReadMode
     
    +typedef enum hipTextureFilterMode hipTextureFilterMode
     
    + + + + + +

    +Enumerations

    enum  hipTextureReadMode { hipReadModeElementType + }
     
    enum  hipTextureFilterMode { hipFilterModePoint + }
     
    + + + + + + + + + + + + + +

    +Functions

    +template<class T >
    hipChannelFormatDesc hipCreateChannelDesc ()
     
    +template<class T , int dim, enum hipTextureReadMode readMode>
    hipError_t hipBindTexture (size_t *offset, struct texture< T, dim, readMode > &tex, const void *devPtr, const struct hipChannelFormatDesc *desc, size_t size=UINT_MAX)
     
    +template<class T , int dim, enum hipTextureReadMode readMode>
    hipError_t hipBindTexture (size_t *offset, struct texture< T, dim, readMode > &tex, const void *devPtr, size_t size=UINT_MAX)
     
    +template<class T , int dim, enum hipTextureReadMode readMode>
    hipError_t hipUnbindTexture (struct texture< T, dim, readMode > *tex)
     
    +

    Detailed Description

    +

    HIP C++ Texture API for hcc compiler.

    +

    Enumeration Type Documentation

    + +
    +
    + + + + +
    enum hipTextureFilterMode
    +
    + + +
    Enumerator
    hipFilterModePoint  +

    Point filter mode.

    +
    + +
    +
    + +
    +
    + + + + +
    enum hipTextureReadMode
    +
    + + +
    Enumerator
    hipReadModeElementType  +

    Read texture as specified element type

    +
    + +
    +
    +
    + + + + diff --git a/projects/hip/docs/RuntimeAPI/html/host__defines_8h.html b/projects/hip/docs/RuntimeAPI/html/host__defines_8h.html new file mode 100644 index 0000000000..9d7b976271 --- /dev/null +++ b/projects/hip/docs/RuntimeAPI/html/host__defines_8h.html @@ -0,0 +1,147 @@ + + + + + + +HIP: Heterogenous-computing Interface for Portability: /home/bensander/HIP.public/include/hcc_detail/host_defines.h File Reference + + + + + + + + + +
    +
    + + + + + + +
    +
    HIP: Heterogenous-computing Interface for Portability +
    +
    +
    + + + + + + + + + +
    + +
    + + +
    +
    + +
    +
    host_defines.h File Reference
    +
    +
    + +

    TODO-doc. +More...

    + +

    Go to the source code of this file.

    + + + + + + + + + + + + + + + + +

    +Macros

    #define __host__   __attribute__((cpu))
     
    +#define __device__   __attribute__((hc))
     
    +#define __global__   __attribute__((hc_grid_launch))
     
    +#define __noinline__   __attribute__((noinline))
     
    +#define __forceinline__   __attribute__((always_inline))
     
    +#define __shared__   tile_static
     
    +#define __constant__   __attribute__((address_space(2)))
     
    +

    Detailed Description

    +

    TODO-doc.

    +

    Macro Definition Documentation

    + +
    +
    + + + + +
    #define __host__   __attribute__((cpu))
    +
    +

    Function and kernel markers

    + +
    +
    +
    + + + + diff --git a/projects/hip/docs/RuntimeAPI/html/search/defines_1.html b/projects/hip/docs/RuntimeAPI/html/search/defines_1.html new file mode 100644 index 0000000000..9e1d9fa116 --- /dev/null +++ b/projects/hip/docs/RuntimeAPI/html/search/defines_1.html @@ -0,0 +1,26 @@ + + + + + + + + + +
    +
    Loading...
    +
    + +
    Searching...
    +
    No Matches
    + +
    + + diff --git a/projects/hip/docs/RuntimeAPI/html/search/defines_1.js b/projects/hip/docs/RuntimeAPI/html/search/defines_1.js new file mode 100644 index 0000000000..c7c61558ca --- /dev/null +++ b/projects/hip/docs/RuntimeAPI/html/search/defines_1.js @@ -0,0 +1,4 @@ +var searchData= +[ + ['hipthreadidx_5fx',['hipThreadIdx_x',['../hcc__detail_2hip__runtime_8h.html#a48f5f9da77c5fab1fbcf0205bb347d89',1,'hip_runtime.h']]] +]; diff --git a/projects/hip/docs/RuntimeAPI/html/search/defines_2.html b/projects/hip/docs/RuntimeAPI/html/search/defines_2.html new file mode 100644 index 0000000000..6ef4b980d7 --- /dev/null +++ b/projects/hip/docs/RuntimeAPI/html/search/defines_2.html @@ -0,0 +1,26 @@ + + + + + + + + + +
    +
    Loading...
    +
    + +
    Searching...
    +
    No Matches
    + +
    + + diff --git a/projects/hip/docs/RuntimeAPI/html/search/defines_2.js b/projects/hip/docs/RuntimeAPI/html/search/defines_2.js new file mode 100644 index 0000000000..3eaae3688b --- /dev/null +++ b/projects/hip/docs/RuntimeAPI/html/search/defines_2.js @@ -0,0 +1,4 @@ +var searchData= +[ + ['one_5fcomponent_5faccess',['ONE_COMPONENT_ACCESS',['../hcc__detail_2hip__vector__types_8h.html#add5d9c0f058c5a52c2b9165a66035d0e',1,'hip_vector_types.h']]] +]; diff --git a/projects/hip/docs/RuntimeAPI/html/search/variables_d.html b/projects/hip/docs/RuntimeAPI/html/search/variables_d.html new file mode 100644 index 0000000000..f47799968f --- /dev/null +++ b/projects/hip/docs/RuntimeAPI/html/search/variables_d.html @@ -0,0 +1,26 @@ + + + + + + + + + +
    +
    Loading...
    +
    + +
    Searching...
    +
    No Matches
    + +
    + + diff --git a/projects/hip/docs/RuntimeAPI/html/search/variables_d.js b/projects/hip/docs/RuntimeAPI/html/search/variables_d.js new file mode 100644 index 0000000000..e8bf38b99c --- /dev/null +++ b/projects/hip/docs/RuntimeAPI/html/search/variables_d.js @@ -0,0 +1,4 @@ +var searchData= +[ + ['z',['z',['../structdim3.html#a866e38993ecc4e76fd47311236c16b04',1,'dim3']]] +]; From f54ada2a14ab4294a163df71568f88d8d0b8b348 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 15 Feb 2016 19:46:51 -0600 Subject: [PATCH 54/94] Search multiple dirs. [ROCm/hip commit: 311a728d259d7ee1c2c6733647c0877907d92a46] --- projects/hip/bin/findcode.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/hip/bin/findcode.sh b/projects/hip/bin/findcode.sh index 087e4873c0..a2334b3e2d 100755 --- a/projects/hip/bin/findcode.sh +++ b/projects/hip/bin/findcode.sh @@ -1,5 +1,5 @@ #!/bin/bash -SEARCH_DIR=$1 +SEARCH_DIRS=$@ -find $SEARCH_DIR -name '*.cpp' -o -name '*.h' -o -name '*.cu' -o -name '*.cuh' -o -name '*.c' -o -name '*.hpp' +find $SEARCH_DIRS -name '*.cpp' -o -name '*.h' -o -name '*.cu' -o -name '*.cuh' -o -name '*.c' -o -name '*.hpp' From bd83583d40eb0112a334932445f1754e63b03c27 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 18 Feb 2016 20:25:03 -0600 Subject: [PATCH 55/94] Update release notes [ROCm/hip commit: 617e7d8a7d55264f8fd1b28eb91e09de0a78a012] --- projects/hip/CONTRIBUTING.md | 5 ----- projects/hip/RELEASE.md | 23 ++++++++++++++++++++++- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/projects/hip/CONTRIBUTING.md b/projects/hip/CONTRIBUTING.md index 7bc946c576..685683e697 100644 --- a/projects/hip/CONTRIBUTING.md +++ b/projects/hip/CONTRIBUTING.md @@ -110,11 +110,6 @@ executing: ../../test/myocyte/run0.cmd... PASSED! executing: ../../test/nn/run0.cmd... PASSED! --TESTING: nw executing: ../../test/nw/run0.cmd... PASSED! ---TESTING: particlefilter -executing: ../../test/particlefilter/run0.cmd... 
*** Error in `./particlefilter_naive': free(): invalid next size (fast): 0x0000000001ad89d0 *** - FAILED! -executing: ../../test/particlefilter/run1.cmd... *** Error in `./particlefilter_float': free(): invalid next size (fast): 0x0000000001a7e890 *** - FAILED! --TESTING: pathfinder executing: ../../test/pathfinder/run0.cmd... PASSED! --TESTING: srad diff --git a/projects/hip/RELEASE.md b/projects/hip/RELEASE.md index 605c2ee69f..9fea5d4c78 100644 --- a/projects/hip/RELEASE.md +++ b/projects/hip/RELEASE.md @@ -6,10 +6,31 @@ We have attempted to document known bugs and limitations - in particular the [HI - Async memory copies. - hipStream support. - Multi-GPU -- Shared-scope atomic operations. (due to compiler limitation, shared-scope map atomics map to global scope) +- Shared-scope atomic operations. (due to compiler limitation, shared-scope map atomics map to global) - Tuning built-in functions, including shfl. - Performance optimization. Stay tuned - the work for many of these features is already in-flight. + +## Revision History: + +=================================================================================================== +Release:0.80.01.00 +Date: 2016.02.18 +- Improve reporting and support for device-side math functions. +- Update Runtime Documentation. +- Improve implementations of cross-lane operations (_ballot, _any, _all). +- Provide shuffle intrinsics (performance optimization in-progress). +- Support hipDeviceAttribute for querying "one-shot" device attributes, as an alternative to hipDeviceGetProperties. +- + +=================================================================================================== +Release:0.80.00.00 : +Date: 2016.01.25 + +Initial release with GPUOpen Launch. + + + From 4fcd9f2542cfc6ee13d2aad774a398a83d412bfb Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Thu, 18 Feb 2016 18:15:01 +0300 Subject: [PATCH 56/94] Device property memoryBusWidth implementation. 
+ Device property memoryBusWidth is added to hipDeviceProp_t struct. + Device attribute hipDeviceAttributeMemoryBusWidth is added to hipDeviceAttribute_t struct. + Tests update. [ROCm/hip commit: 1c19dbb80709273754dcb5e96e467388eb8af904] --- projects/hip/include/hip_runtime_api.h | 4 +++- projects/hip/include/nvcc_detail/hip_runtime_api.h | 4 +++- projects/hip/samples/1_Utils/hipInfo/hipInfo.cpp | 1 + projects/hip/src/hip_hcc.cpp | 13 +++++++++---- projects/hip/tests/src/hipGetDeviceAttribute.cpp | 1 + 5 files changed, 17 insertions(+), 6 deletions(-) diff --git a/projects/hip/include/hip_runtime_api.h b/projects/hip/include/hip_runtime_api.h index 5191bc5d54..dcec805be4 100644 --- a/projects/hip/include/hip_runtime_api.h +++ b/projects/hip/include/hip_runtime_api.h @@ -81,7 +81,8 @@ typedef struct hipDeviceProp_t { int maxThreadsDim[3]; ///< Max number of threads in each dimension (XYZ) of a block. int maxGridSize[3]; ///< Max grid dimensions (XYZ). int clockRate; ///< Max clock frequency of the multiProcessors in khz. - int memoryClockRate; ///< Max memory clock frequency in khz. + int memoryClockRate; ///< Max global memory clock frequency in khz. + int memoryBusWidth; ///< Global memory bus width in bits. size_t totalConstMem; ///< Size of shared memory region (in bytes). int major; ///< Major compute capability. On HCC, this is an approximation and features may differ from CUDA CC. See the arch feature flags for portable ways to query feature caps. int minor; ///< Minor compute capability. On HCC, this is an approximation and features may differ from CUDA CC. See the arch feature flags for portable ways to query feature caps. @@ -145,6 +146,7 @@ typedef enum hipDeviceAttribute_t { hipDeviceAttributeMaxRegistersPerBlock, ///< Maximum number of 32-bit registers available to a thread block. This number is shared by all thread blocks simultaneously resident on a multiprocessor. hipDeviceAttributeClockRate, ///< Peak clock frequency in kilohertz. 
hipDeviceAttributeMemoryClockRate, ///< Peak memory clock frequency in kilohertz. + hipDeviceAttributeMemoryBusWidth, ///< Global memory bus width in bits. hipDeviceAttributeMultiprocessorCount, ///< Number of multiprocessors on the device. hipDeviceAttributeComputeMode, ///< Compute mode that device is currently in. hipDeviceAttributeL2CacheSize, ///< Size of L2 cache in bytes. 0 if the device doesn't have L2 cache. diff --git a/projects/hip/include/nvcc_detail/hip_runtime_api.h b/projects/hip/include/nvcc_detail/hip_runtime_api.h index 7a1e9bc6e9..85befff24f 100644 --- a/projects/hip/include/nvcc_detail/hip_runtime_api.h +++ b/projects/hip/include/nvcc_detail/hip_runtime_api.h @@ -243,7 +243,9 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att case hipDeviceAttributeClockRate: cdattr = cudaDevAttrClockRate; break; case hipDeviceAttributeMemoryClockRate: - cdattr = cudaDevAttrMemoryClockRate:; break; + cdattr = cudaDevAttrMemoryClockRate; break; + case hipDeviceAttributeMemoryBusWidth: + cdattr = cudaDevAttrGlobalMemoryBusWidth; break; case hipDeviceAttributeMultiprocessorCount: cdattr = cudaDevAttrMultiProcessorCount; break; case hipDeviceAttributeComputeMode: diff --git a/projects/hip/samples/1_Utils/hipInfo/hipInfo.cpp b/projects/hip/samples/1_Utils/hipInfo/hipInfo.cpp index 18d9176a07..9c3d2c1b53 100644 --- a/projects/hip/samples/1_Utils/hipInfo/hipInfo.cpp +++ b/projects/hip/samples/1_Utils/hipInfo/hipInfo.cpp @@ -80,6 +80,7 @@ void printDeviceProp (int deviceId) cout << setw(w1) << "pciDeviceID: " << props.pciDeviceID << endl; cout << setw(w1) << "multiProcessorCount: " << props.multiProcessorCount << endl; cout << setw(w1) << "maxThreadsPerMultiProcessor: " << props.maxThreadsPerMultiProcessor << endl; + cout << setw(w1) << "memoryBusWidth: " << props.memoryBusWidth << endl; cout << setw(w1) << "clockRate: " << (float)props.clockRate / 1000.0 << " Mhz" << endl; cout << setw(w1) << "memoryClockRate: " << 
(float)props.memoryClockRate / 1000.0 << " Mhz" << endl; cout << setw(w1) << "clockInstructionRate: " << (float)props.clockInstructionRate / 1000.0 << " Mhz" << endl; diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index beba7c2775..f52aa467f4 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -300,7 +300,7 @@ hipError_t ihipDevice_t::getProperties(hipDeviceProp_t* prop) // Get Max clock frequency - err = hsa_agent_get_info(_hsa_agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY,&prop->clockRate); + err = hsa_agent_get_info(_hsa_agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, &prop->clockRate); prop->clockRate *= 1000.0; // convert Mhz to Khz. DeviceErrorCheck(err); @@ -356,12 +356,11 @@ hipError_t ihipDevice_t::getProperties(hipDeviceProp_t* prop) */ // Get memory properties - - err = hsa_agent_iterate_regions(_hsa_agent,get_region_info,prop); + err = hsa_agent_iterate_regions(_hsa_agent, get_region_info, prop); DeviceErrorCheck(err); // Get the size of the region we are using for Accelerator Memory allocations: - hsa_region_t *am_region = static_cast (_acc.get_hsa_am_region()); + hsa_region_t *am_region = static_cast(_acc.get_hsa_am_region()); err = hsa_region_get_info(*am_region, HSA_REGION_INFO_SIZE, &prop->totalGlobalMem); DeviceErrorCheck(err); // maxSharedMemoryPerMultiProcessor should be as the same as group memory size. @@ -370,7 +369,11 @@ hipError_t ihipDevice_t::getProperties(hipDeviceProp_t* prop) // Get Max memory clock frequency err = hsa_region_get_info(*am_region, (hsa_region_info_t)HSA_AMD_REGION_INFO_MAX_CLOCK_FREQUENCY, &prop->memoryClockRate); + DeviceErrorCheck(err); prop->memoryClockRate *= 1000.0; // convert Mhz to Khz. 
+ + // Get global memory bus width in bits + err = hsa_region_get_info(*am_region, (hsa_region_info_t)HSA_AMD_REGION_INFO_BUS_WIDTH, &prop->memoryBusWidth); DeviceErrorCheck(err); // Set feature flags - these are all mandatory for HIP on HCC path: @@ -845,6 +848,8 @@ hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device) *pi = prop->clockRate; break; case hipDeviceAttributeMemoryClockRate: *pi = prop->memoryClockRate; break; + case hipDeviceAttributeMemoryBusWidth: + *pi = prop->memoryBusWidth; break; case hipDeviceAttributeMultiprocessorCount: *pi = prop->multiProcessorCount; break; case hipDeviceAttributeComputeMode: diff --git a/projects/hip/tests/src/hipGetDeviceAttribute.cpp b/projects/hip/tests/src/hipGetDeviceAttribute.cpp index 33b5e2ba03..3cd88e3ed7 100644 --- a/projects/hip/tests/src/hipGetDeviceAttribute.cpp +++ b/projects/hip/tests/src/hipGetDeviceAttribute.cpp @@ -68,6 +68,7 @@ int main(int argc, char *argv[]) CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMaxRegistersPerBlock, props.regsPerBlock)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeClockRate, props.clockRate)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMemoryClockRate, props.memoryClockRate)); + CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMemoryBusWidth, props.memoryBusWidth)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMultiprocessorCount, props.multiProcessorCount)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeComputeMode, props.computeMode)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeL2CacheSize, props.l2CacheSize)); From 68621c6702ae18e6d2f937a8cbb8396958e99c06 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Thu, 18 Feb 2016 18:54:19 +0300 Subject: [PATCH 57/94] Formatting, no functional changes. 
[ROCm/hip commit: fbdeee39cda43389784dcd94228cdd6e82366062] --- projects/hip/src/hip_hcc.cpp | 77 +++++++++++------------------------- 1 file changed, 22 insertions(+), 55 deletions(-) diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index f52aa467f4..d613d3a01b 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -207,36 +207,24 @@ void error_check(hsa_status_t hsa_error_code, int line_num, std::string str) { hsa_status_t get_region_info(hsa_region_t region, void* data) { - hsa_status_t err; hipDeviceProp_t* p_prop = reinterpret_cast(data); - uint32_t region_segment ; + uint32_t region_segment; + // Get region segment + err = hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, ®ion_segment); + ErrorCheck(err); - // Get region segment - err = hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, ®ion_segment); - ErrorCheck(err); - - switch(region_segment) - { - case HSA_REGION_SEGMENT_READONLY: - err = hsa_region_get_info(region, HSA_REGION_INFO_SIZE, &(p_prop->totalConstMem)); - - break; - - /* case HSA_REGION_SEGMENT_PRIVATE: - cout<<"PRIVATE"<sharedMemPerBlock)); - - break; - - default: - break; - } + switch(region_segment) { + case HSA_REGION_SEGMENT_READONLY: + err = hsa_region_get_info(region, HSA_REGION_INFO_SIZE, &(p_prop->totalConstMem)); break; + /* case HSA_REGION_SEGMENT_PRIVATE: + cout<<"PRIVATE"<sharedMemPerBlock)); break; + default: break; + } return HSA_STATUS_SUCCESS; - } @@ -244,29 +232,23 @@ hsa_status_t get_region_info(hsa_region_t region, void* data) hipError_t ihipDevice_t::getProperties(hipDeviceProp_t* prop) { hipError_t e = hipSuccess; + hsa_status_t err; // Set some defaults in case we don't find the appropriate regions: prop->totalGlobalMem = 0; prop->totalConstMem = 0; prop->sharedMemPerBlock = 0; prop-> maxThreadsPerMultiProcessor = 0; - // - // prop->regsPerBlock = 0; - - hsa_status_t err; - if (_hsa_agent.handle == -1) { return hipErrorInvalidDevice; } - // Get agent name err = 
hsa_agent_get_info(_hsa_agent, HSA_AGENT_INFO_NAME, &(prop->name)); DeviceErrorCheck(err); - // Get agent node uint32_t node; err = hsa_agent_get_info(_hsa_agent, HSA_AGENT_INFO_NODE, &node); @@ -276,12 +258,10 @@ hipError_t ihipDevice_t::getProperties(hipDeviceProp_t* prop) err = hsa_agent_get_info(_hsa_agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,&prop->warpSize); DeviceErrorCheck(err); - // Get max total number of work-items in a workgroup err = hsa_agent_get_info(_hsa_agent, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &prop->maxThreadsPerBlock ); DeviceErrorCheck(err); - // Get max number of work-items of each dimension of a work-group uint16_t work_group_max_dim[3]; err = hsa_agent_get_info(_hsa_agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM, work_group_max_dim); @@ -290,7 +270,6 @@ hipError_t ihipDevice_t::getProperties(hipDeviceProp_t* prop) prop->maxThreadsDim[i]= work_group_max_dim[i]; } - hsa_dim3_t grid_max_dim; err = hsa_agent_get_info(_hsa_agent, HSA_AGENT_INFO_GRID_MAX_DIM, &grid_max_dim); DeviceErrorCheck(err); @@ -298,7 +277,6 @@ hipError_t ihipDevice_t::getProperties(hipDeviceProp_t* prop) prop->maxGridSize[1]= (int) ((grid_max_dim.y == UINT32_MAX) ? (INT32_MAX) : grid_max_dim.y); prop->maxGridSize[2]= (int) ((grid_max_dim.z == UINT32_MAX) ? (INT32_MAX) : grid_max_dim.z); - // Get Max clock frequency err = hsa_agent_get_info(_hsa_agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, &prop->clockRate); prop->clockRate *= 1000.0; // convert Mhz to Khz. @@ -326,12 +304,10 @@ hipError_t ihipDevice_t::getProperties(hipDeviceProp_t* prop) prop->major = 2; prop->minor = 0; - // Get number of Compute Unit err = hsa_agent_get_info(_hsa_agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &(prop->multiProcessorCount)); DeviceErrorCheck(err); - // TODO-hsart - this appears to return 0? 
uint32_t cache_size[4]; err = hsa_agent_get_info(_hsa_agent, HSA_AGENT_INFO_CACHE_SIZE, cache_size); @@ -387,17 +363,13 @@ hipError_t ihipDevice_t::getProperties(hipDeviceProp_t* prop) prop->arch.hasFloatAtomicAdd = 0; prop->arch.hasGlobalInt64Atomics = 1; prop->arch.hasSharedInt64Atomics = 0; // TODO-hcc-atomics - prop->arch.hasDoubles = 1; // TODO - true for Fiji. - prop->arch.hasWarpVote = 1; prop->arch.hasWarpBallot = 1; prop->arch.hasWarpShuffle = 1; prop->arch.hasFunnelShift = 0; // TODO-hcc - prop->arch.hasThreadFenceSystem = 0; // TODO-hcc prop->arch.hasSyncThreadsExt = 0; // TODO-hcc - prop->arch.hasSurfaceFuncs = 0; // TODO-hcc prop->arch.has3dGrid = 1; prop->arch.hasDynamicParallelism = 0; @@ -1399,7 +1371,7 @@ hipError_t hipMallocHost(void** ptr, size_t sizeBytes) #else // TODO-hcc remove-me - // This code only works on Kaveri: + // This code only works on Kaveri: *ptr = malloc(sizeBytes); // TODO - call am_alloc for device memory, this will only on KV HSA. if (*ptr != NULL) { //TODO-hsart : need memory pin APIs to implement this correctly. 
@@ -1415,18 +1387,15 @@ hipError_t hipMallocHost(void** ptr, size_t sizeBytes) hipError_t hipMemcpyToSymbol(const char* symbolName, const void *src, size_t count, size_t offset, hipMemcpyKind kind) { #ifdef USE_MEMCPYTOSYMBOL - if(kind != hipMemcpyHostToDevice) - { - return ihipLogStatus(hipErrorInvalidValue); - } - auto device = ihipGetTlsDefaultDevice(); - + if(kind != hipMemcpyHostToDevice) { + return ihipLogStatus(hipErrorInvalidValue); + } + auto device = ihipGetTlsDefaultDevice(); hc::completion_future marker; ihipCheckCommandSwitchSync(device._null_stream, ihipCommandData, &marker); - - device->_acc.memcpy_symbol(symbolName, (void*) src,count, offset); + device->_acc.memcpy_symbol(symbolName, (void*) src,count, offset); #endif - return ihipLogStatus(hipSuccess); + return ihipLogStatus(hipSuccess); } @@ -1696,10 +1665,8 @@ hipError_t hipMemcpyPeerAsync ( void* dst, int dstDevice, const void* src, int hipError_t hipDriverGetVersion(int *driverVersion) { std::call_once(hip_initialized, ihipInit); - *driverVersion = 4; - - return ihipLogStatus(hipSuccess); + return ihipLogStatus(hipSuccess); } From 7f6001cb753602fa721aa80e821702ae89fc76f8 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 19 Feb 2016 01:56:17 -0600 Subject: [PATCH 58/94] Describe how to update HTML docs [ROCm/hip commit: d8cf65edacac5bac920af261f3a57b4962c3cf70] --- projects/hip/tests/src/hipMemcpyAsync.cpp | 40 +++++++++++++++++++---- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/projects/hip/tests/src/hipMemcpyAsync.cpp b/projects/hip/tests/src/hipMemcpyAsync.cpp index 6192940270..19f1a94761 100644 --- a/projects/hip/tests/src/hipMemcpyAsync.cpp +++ b/projects/hip/tests/src/hipMemcpyAsync.cpp @@ -20,7 +20,8 @@ void simpleNegTest() // Can't use default with async copy e = hipMemcpyAsync(A_pinned, A_d, Nbytes, hipMemcpyDefault, NULL); - HIPASSERT (e==hipErrorInvalidMemcpyDirection); + HIPASSERT (e==hipErrorInvalidMemcpyDirection); // TODO + HIPASSERT (e!= hipSuccess); // Not 
sure what happens here, the memory must be pinned. @@ -30,6 +31,33 @@ void simpleNegTest() //HIPASSERT (e==hipErrorInvalidValue); } + +//--- +//Send many async copies to the same stream. +//This requires runtime to keep track of many outstanding commands, and in the case of HCC requires growing/tracking the signal pool: +template +void test_manyCopies(int nElements, size_t numCopies, int nStreams) +{ + size_t Nbytes = nElements*sizeof(T); + printf ("Nbytes=%zu (%6.1f MB)\n", Nbytes, (double)(Nbytes)/1024.0/1024.0); + + int *A_d, *B_d, *C_d; + int *A_h, *B_h, *C_h; + + HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, true); + + size_t eachCopyBytes = Nbytes / numCopies; + + for (size_t i=0; i Date: Fri, 19 Feb 2016 13:27:03 +0300 Subject: [PATCH 59/94] Guard #ifdef USE_ROCR_20 is added for ROCR_20 device properties (memoryClockRate, memoryBusWidth) By default isn't defined. To add ROCR_20 support HIP have to be compiled as follows: make CXX_DEFINES+=-DUSE_ROCR_20 [ROCm/hip commit: 833c9e52ad9c75bad8b4c2f188f5af3fbb20a9d6] --- projects/hip/include/hip_runtime_api.h | 4 ++++ projects/hip/include/nvcc_detail/hip_runtime_api.h | 2 ++ projects/hip/samples/1_Utils/hipInfo/hipInfo.cpp | 4 +++- projects/hip/src/hip_hcc.cpp | 4 ++++ projects/hip/tests/src/hipGetDeviceAttribute.cpp | 2 ++ 5 files changed, 15 insertions(+), 1 deletion(-) diff --git a/projects/hip/include/hip_runtime_api.h b/projects/hip/include/hip_runtime_api.h index dcec805be4..2b5c98bc2a 100644 --- a/projects/hip/include/hip_runtime_api.h +++ b/projects/hip/include/hip_runtime_api.h @@ -81,8 +81,10 @@ typedef struct hipDeviceProp_t { int maxThreadsDim[3]; ///< Max number of threads in each dimension (XYZ) of a block. int maxGridSize[3]; ///< Max grid dimensions (XYZ). int clockRate; ///< Max clock frequency of the multiProcessors in khz. +#ifdef USE_ROCR_20 int memoryClockRate; ///< Max global memory clock frequency in khz. int memoryBusWidth; ///< Global memory bus width in bits. 
+#endif size_t totalConstMem; ///< Size of shared memory region (in bytes). int major; ///< Major compute capability. On HCC, this is an approximation and features may differ from CUDA CC. See the arch feature flags for portable ways to query feature caps. int minor; ///< Minor compute capability. On HCC, this is an approximation and features may differ from CUDA CC. See the arch feature flags for portable ways to query feature caps. @@ -145,8 +147,10 @@ typedef enum hipDeviceAttribute_t { hipDeviceAttributeWarpSize, ///< Warp size in threads. hipDeviceAttributeMaxRegistersPerBlock, ///< Maximum number of 32-bit registers available to a thread block. This number is shared by all thread blocks simultaneously resident on a multiprocessor. hipDeviceAttributeClockRate, ///< Peak clock frequency in kilohertz. +#ifdef USE_ROCR_20 hipDeviceAttributeMemoryClockRate, ///< Peak memory clock frequency in kilohertz. hipDeviceAttributeMemoryBusWidth, ///< Global memory bus width in bits. +#endif hipDeviceAttributeMultiprocessorCount, ///< Number of multiprocessors on the device. hipDeviceAttributeComputeMode, ///< Compute mode that device is currently in. hipDeviceAttributeL2CacheSize, ///< Size of L2 cache in bytes. 0 if the device doesn't have L2 cache. 
diff --git a/projects/hip/include/nvcc_detail/hip_runtime_api.h b/projects/hip/include/nvcc_detail/hip_runtime_api.h index 85befff24f..fac9380bfd 100644 --- a/projects/hip/include/nvcc_detail/hip_runtime_api.h +++ b/projects/hip/include/nvcc_detail/hip_runtime_api.h @@ -242,10 +242,12 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att cdattr = cudaDevAttrMaxRegistersPerBlock; break; case hipDeviceAttributeClockRate: cdattr = cudaDevAttrClockRate; break; +#ifdef USE_ROCR_20 case hipDeviceAttributeMemoryClockRate: cdattr = cudaDevAttrMemoryClockRate; break; case hipDeviceAttributeMemoryBusWidth: cdattr = cudaDevAttrGlobalMemoryBusWidth; break; +#endif case hipDeviceAttributeMultiprocessorCount: cdattr = cudaDevAttrMultiProcessorCount; break; case hipDeviceAttributeComputeMode: diff --git a/projects/hip/samples/1_Utils/hipInfo/hipInfo.cpp b/projects/hip/samples/1_Utils/hipInfo/hipInfo.cpp index 9c3d2c1b53..c7b298705b 100644 --- a/projects/hip/samples/1_Utils/hipInfo/hipInfo.cpp +++ b/projects/hip/samples/1_Utils/hipInfo/hipInfo.cpp @@ -80,9 +80,11 @@ void printDeviceProp (int deviceId) cout << setw(w1) << "pciDeviceID: " << props.pciDeviceID << endl; cout << setw(w1) << "multiProcessorCount: " << props.multiProcessorCount << endl; cout << setw(w1) << "maxThreadsPerMultiProcessor: " << props.maxThreadsPerMultiProcessor << endl; - cout << setw(w1) << "memoryBusWidth: " << props.memoryBusWidth << endl; cout << setw(w1) << "clockRate: " << (float)props.clockRate / 1000.0 << " Mhz" << endl; +#ifdef USE_ROCR_20 cout << setw(w1) << "memoryClockRate: " << (float)props.memoryClockRate / 1000.0 << " Mhz" << endl; + cout << setw(w1) << "memoryBusWidth: " << props.memoryBusWidth << endl; +#endif cout << setw(w1) << "clockInstructionRate: " << (float)props.clockInstructionRate / 1000.0 << " Mhz" << endl; cout << setw(w1) << "totalGlobalMem: " << fixed << setprecision(2) << bytesToGB(props.totalGlobalMem) << " GB" << endl; cout << setw(w1) << 
"maxSharedMemoryPerMultiProcessor: " << fixed << setprecision(2) << bytesToGB(props.maxSharedMemoryPerMultiProcessor) << " GB" << endl; diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index d613d3a01b..91c9c7ed55 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -343,6 +343,7 @@ hipError_t ihipDevice_t::getProperties(hipDeviceProp_t* prop) // Group memory will not be paged out, so, the physical memory size is the total shared memory size, and also equal to the group region size. prop->maxSharedMemoryPerMultiProcessor = prop->totalGlobalMem; +#ifdef USE_ROCR_20 // Get Max memory clock frequency err = hsa_region_get_info(*am_region, (hsa_region_info_t)HSA_AMD_REGION_INFO_MAX_CLOCK_FREQUENCY, &prop->memoryClockRate); DeviceErrorCheck(err); @@ -351,6 +352,7 @@ hipError_t ihipDevice_t::getProperties(hipDeviceProp_t* prop) // Get global memory bus width in bits err = hsa_region_get_info(*am_region, (hsa_region_info_t)HSA_AMD_REGION_INFO_BUS_WIDTH, &prop->memoryBusWidth); DeviceErrorCheck(err); +#endif // Set feature flags - these are all mandatory for HIP on HCC path: // Some features are under-development and future revs may support flags that are currently 0. 
@@ -818,10 +820,12 @@ hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device) *pi = prop->regsPerBlock; break; case hipDeviceAttributeClockRate: *pi = prop->clockRate; break; +#ifdef USE_ROCR_20 case hipDeviceAttributeMemoryClockRate: *pi = prop->memoryClockRate; break; case hipDeviceAttributeMemoryBusWidth: *pi = prop->memoryBusWidth; break; +#endif case hipDeviceAttributeMultiprocessorCount: *pi = prop->multiProcessorCount; break; case hipDeviceAttributeComputeMode: diff --git a/projects/hip/tests/src/hipGetDeviceAttribute.cpp b/projects/hip/tests/src/hipGetDeviceAttribute.cpp index 3cd88e3ed7..dfd5c28f99 100644 --- a/projects/hip/tests/src/hipGetDeviceAttribute.cpp +++ b/projects/hip/tests/src/hipGetDeviceAttribute.cpp @@ -67,8 +67,10 @@ int main(int argc, char *argv[]) CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeWarpSize, props.warpSize)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMaxRegistersPerBlock, props.regsPerBlock)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeClockRate, props.clockRate)); +#ifdef USE_ROCR_20 CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMemoryClockRate, props.memoryClockRate)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMemoryBusWidth, props.memoryBusWidth)); +#endif CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMultiprocessorCount, props.multiProcessorCount)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeComputeMode, props.computeMode)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeL2CacheSize, props.l2CacheSize)); From d8df47383d67c0bfef573fcebf66297f85ede0e4 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Fri, 19 Feb 2016 13:36:37 +0300 Subject: [PATCH 60/94] A support of the following device properties is added to legacy hipify.pl: hipDeviceAttributeConcurrentKernels, hipDeviceAttributeMemoryClockRate & hipDeviceAttributeMemoryBusWidth. 
[ROCm/hip commit: b23f9776ca82abfbd9c54a3afd93d5af84848228] --- projects/hip/bin/hipify | 3 +++ 1 file changed, 3 insertions(+) diff --git a/projects/hip/bin/hipify b/projects/hip/bin/hipify index d143bdff37..7886c5e9eb 100755 --- a/projects/hip/bin/hipify +++ b/projects/hip/bin/hipify @@ -364,9 +364,12 @@ while (@ARGV) { $ft{'err'} += s/\bcudaDevAttrMaxThreadsPerMultiProcessor\b/hipDeviceAttributeMaxThreadsPerMultiProcessor/g; $ft{'err'} += s/\bcudaDevAttrComputeCapabilityMajor\b/hipDeviceAttributeComputeCapabilityMajor/g; $ft{'err'} += s/\bcudaDevAttrComputeCapabilityMinor\b/hipDeviceAttributeComputeCapabilityMinor/g; + $ft{'err'} += s/\bcudaDevAttrConcurrentKernels\b/hipDeviceAttributeConcurrentKernels/g; $ft{'err'} += s/\bcudaDevAttrPciBusId\b/hipDeviceAttributePciBusId/g; $ft{'err'} += s/\bcudaDevAttrPciDeviceId\b/hipDeviceAttributePciDeviceId/g; $ft{'err'} += s/\bcudaDevAttrMaxSharedMemoryPerMultiprocessor\b/hipDeviceAttributeMaxSharedMemoryPerMultiprocessor/g; + $ft{'err'} += s/\bcudaDevAttrMemoryClockRate\b/hipDeviceAttributeMemoryClockRate/g; + $ft{'err'} += s/\bcudaDevAttrGlobalMemoryBusWidth\b/hipDeviceAttributeMemoryBusWidth/g; $ft{'dev'} += s/\bcudaDeviceAttr\b/hipDeviceAttribute_t/g; $ft{'dev'} += s/\bcudaDeviceGetAttribute\b/hipDeviceGetAttribute/g; From 0389b292a64a00190e4f21d7d37a534b8dac6d4b Mon Sep 17 00:00:00 2001 From: streamhsa Date: Fri, 19 Feb 2016 20:18:03 +0800 Subject: [PATCH 61/94] Resolve issues for hip_popc and hip_ballot on nvcc [ROCm/hip commit: a7232c7f9e90cd6e76aa039740aac679078bc672] --- projects/hip/tests/src/hip_ballot.cpp | 10 +++++++--- projects/hip/tests/src/hip_popc.cpp | 23 ++++++++++------------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/projects/hip/tests/src/hip_ballot.cpp b/projects/hip/tests/src/hip_ballot.cpp index 1f3012e689..76fb673ecc 100644 --- a/projects/hip/tests/src/hip_ballot.cpp +++ b/projects/hip/tests/src/hip_ballot.cpp @@ -9,7 +9,8 @@ __global__ void int tid = hipThreadIdx_x + 
hipBlockIdx_x * hipBlockDim_x; const unsigned int warp_num = hipThreadIdx_x >> pshift; - atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popcll(__ballot(tid - 245))); + if (pshift ==6) {atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popcll(__ballot(tid - 245)));} + else {atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popc(__ballot(tid - 245)));} } @@ -18,8 +19,11 @@ int main(int argc, char *argv[]) { int warpSize, pshift; hipDeviceProp_t devProp; hipDeviceGetProperties(&devProp, 0); - if(strncmp(devProp.name,"Fiji",1)==0) {warpSize =64; pshift =6;} + + if(strncmp(devProp.name,"Fiji",1)==0) + {warpSize = 64; pshift =6;} else {warpSize =32; pshift =5;} + unsigned int Num_Threads_per_Block = 512; unsigned int Num_Blocks_per_Grid = 1; unsigned int Num_Warps_per_Block = Num_Threads_per_Block/warpSize; @@ -41,7 +45,7 @@ int main(int argc, char *argv[]) if ((host_ballot[i] == 0)||(host_ballot[i]/warpSize == warpSize)) std::cout << "Warp " << i << " IS convergent- Predicate true for " << host_ballot[i]/warpSize << " threads\n"; - else {std::cout << "Warp " << i << " IS divergent - Predicate true for " << host_ballot[i]/warpSize<< " threads\n"; + else {std::cout << " Warp " << i << " IS divergent - Predicate true for " << host_ballot[i]/warpSize<< " threads\n"; divergent_count++;} } diff --git a/projects/hip/tests/src/hip_popc.cpp b/projects/hip/tests/src/hip_popc.cpp index d03c11f32a..0227bdb97c 100644 --- a/projects/hip/tests/src/hip_popc.cpp +++ b/projects/hip/tests/src/hip_popc.cpp @@ -31,8 +31,8 @@ THE SOFTWARE. 
#define HIP_ASSERT(x) (assert((x)==hipSuccess)) -#define WIDTH 32 -#define HEIGHT 32 +#define WIDTH 16 +#define HEIGHT 16 #define NUM (WIDTH*HEIGHT) @@ -52,10 +52,10 @@ unsigned int popcountCPU( T value) { } return ret; } - + __global__ void -HIP_kernel(hipLaunchParm lp, - unsigned int* a, unsigned int* b,unsigned int* c, unsigned long long int* d, int width, int height) +HIP_kernel(hipLaunchParm lp, + unsigned int* a, unsigned int* b,unsigned int* c, unsigned long long int* d, int width, int height) { int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; @@ -71,20 +71,18 @@ HIP_kernel(hipLaunchParm lp, - - using namespace std; int main() { unsigned int* hostA; unsigned int* hostB; - unsigned int* hostC; + unsigned int* hostC; unsigned long long int* hostD; unsigned int* deviceA; unsigned int* deviceB; - unsigned int* deviceC; + unsigned int* deviceC; unsigned long long int* deviceD; hipDeviceProp_t devProp; @@ -107,7 +105,7 @@ int main() { // initialize the input data for (i = 0; i < NUM; i++) { hostB[i] = i; - hostD[i] = 1099511627776-i; + hostD[i] = 1099511627776-i; } HIP_ASSERT(hipMalloc((void**)&deviceA, NUM * sizeof(unsigned int))); @@ -128,7 +126,7 @@ int main() { HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(unsigned int), hipMemcpyDeviceToHost)); HIP_ASSERT(hipMemcpy(hostC, deviceC, NUM*sizeof(unsigned int), hipMemcpyDeviceToHost)); - // verify the results + // verify the results errors = 0; for (i = 0; i < NUM; i++) { printf("gpu_popc =%d, cpu_popc =%d \n",hostA[i],popcountCPU(hostB[i])); @@ -149,11 +147,10 @@ int main() { } } if (errors!=0) { - printf("FAILED: %d errors\n",errors); + printf("FAILED: %d errors\n",errors); } else { printf ("__popcll() PASSED!\n"); } - HIP_ASSERT(hipFree(deviceA)); HIP_ASSERT(hipFree(deviceB)); HIP_ASSERT(hipFree(deviceC)); From ebf270093659a2748958c5bb6ce1ec45ce7e41d7 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Sat, 20 Feb 2016 11:01:43 -0600 Subject: [PATCH 62/94] Track last command to a stream. 
Passing simple tests. [ROCm/hip commit: d33d806a5be638f060b2eab982b56810d4f5c273] --- projects/hip/include/hcc_detail/hip_runtime.h | 9 +- projects/hip/src/hip_hcc.cpp | 345 ++++++++++++------ projects/hip/tests/src/hipMemcpyAsync.cpp | 80 +++- projects/hip/tests/src/test_common.h | 2 +- 4 files changed, 301 insertions(+), 135 deletions(-) diff --git a/projects/hip/include/hcc_detail/hip_runtime.h b/projects/hip/include/hcc_detail/hip_runtime.h index 7c5a2f2e36..aeed53348e 100644 --- a/projects/hip/include/hcc_detail/hip_runtime.h +++ b/projects/hip/include/hcc_detail/hip_runtime.h @@ -481,7 +481,8 @@ __device__ inline float __dsqrt_rz(double x) {return hc::fast_math::sqrt(x); }; #ifdef __HCC_CPP__ -hc::accelerator_view *ihipLaunchKernel(hipStream_t stream); +hipStream_t ihipPreLaunchKernel(hipStream_t stream, hc::accelerator_view **av); +void ihipPostLaunchKernel(hipStream_t stream, hc::completion_future &cf); #if not defined(DISABLE_GRID_LAUNCH) #define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) 
\ @@ -496,12 +497,13 @@ do {\ lp.groupMemBytes = _groupMemBytes;\ hc::completion_future cf;\ lp.cf = &cf; \ - lp.av = (ihipLaunchKernel(_stream)); \ + hipStream_t trueStream = (ihipPreLaunchKernel(_stream, &lp.av)); \ if (HIP_TRACE_API) {\ fprintf(stderr, "hiptrace1: launch '%s' gridDim:[%d.%d.%d] groupDim:[%d.%d.%d] groupMem:+%d stream=%p\n", \ #_kernelName, lp.gridDim.z, lp.gridDim.y, lp.gridDim.x, lp.groupDim.z, lp.groupDim.y, lp.groupDim.x, lp.groupMemBytes, (void*)(_stream));\ }\ _kernelName (lp, __VA_ARGS__);\ + ihipPostLaunchKernel(trueStream, cf);\ } while(0) #else @@ -519,12 +521,13 @@ do {\ lp.groupMemBytes = _groupMemBytes;\ hc::completion_future cf;\ lp.cf = &cf; \ - lp.av = (ihipLaunchKernel(_stream)); \ + hipStream_t trueStream = (ihipPreLaunchKernel(_stream, &lp.av)); \ if (HIP_TRACE_API) {\ fprintf(stderr, "hiptrace1: launch '%s' gridDim:[%d.%d.%d] groupDim:[%d.%d.%d] groupMem:+%d stream=%p\n", \ #_kernelName, lp.gridDim.z, lp.gridDim.y, lp.gridDim.x, lp.groupDim.z, lp.groupDim.y, lp.groupDim.x, lp.groupMemBytes, (void*)(_stream));\ }\ _kernelName (lp, __VA_ARGS__);\ + ihipPostLaunchKernel(trueStream, cf);\ } while(0) /*end hipLaunchKernel */ #endif diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index f814f99ddf..2d7650b6ed 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -31,6 +31,7 @@ THE SOFTWARE. #include #include #include +#include #include #include @@ -91,28 +92,35 @@ struct ihipDevice_t; enum ihipCommand_t { ihipCommandKernel, - ihipCommandData, + ihipCommandCopyH2D, + ihipCommandCopyD2H, +}; + +const char* ihipCommandName[] = { + "Kernel", "CopyH2D", "CopyD2H" }; // Small wrapper around signals. // Designed to be used from stream. struct ihipSignal_t { - hsa_signal_t _hsa_signal; - int _refCnt; + hsa_signal_t _hsa_signal; // hsa signal handle + int _ref_cnt; // reference count, 0 == signal is free. + uint64_t _seq_id; // unique sequentially increasig ID. 
- ihipSignal_t() : _refCnt(0) { + ihipSignal_t() : _ref_cnt(0), _seq_id(0) { if (hsa_signal_create(1, 0, NULL, &_hsa_signal) != HSA_STATUS_SUCCESS) { throw; } + tprintf (TRACE_SYNC, " allocated hsa_signal=%lu\n", (_hsa_signal.handle)); } ~ihipSignal_t() { if (hsa_signal_destroy(_hsa_signal) != HSA_STATUS_SUCCESS) { throw; } - // _refCnt should be 0, unless we are shutting down... - _refCnt = 0; + // _ref_cnt should be 0, unless we are shutting down... + _ref_cnt = 0; }; }; @@ -120,23 +128,35 @@ struct ihipSignal_t { // Internal stream structure. class ihipStream_t { public: - unsigned _device_index; - hc::accelerator_view _av; - unsigned _flags; - ihipCommand_t _last_command; - //ihipStream_t() : _av(){ }; ihipStream_t(unsigned device_index, hc::accelerator_view av, unsigned int flags); ~ihipStream_t(); + inline void wait(); + inline ihipDevice_t * getDevice() const; - ihipSignal_t * getSignal() ; + ihipSignal_t * getSignal() ; void releaseSignal(ihipSignal_t *signal) ; + inline bool preKernelCommand(); + inline void postKernelCommand(hc::completion_future &kernel_future); + inline int copyCommand(ihipSignal_t *lastCopy, hsa_signal_t *waitSignal, ihipCommand_t copyType); + + //--- + unsigned _device_index; + hc::accelerator_view _av; + unsigned _flags; private: + void enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal); + + uint64_t _seq_signal_id; // Monotonically increasing unique signal id. + ihipCommand_t _last_command_type; // type of the last command + ihipSignal_t *_last_copy_signal; // signal of last copy command sent to the stream. Copy can be either H2D or D2H. + hc::completion_future _last_kernel_future; // Completion future of last kernel command sent to GPU. 
+ int _signalCursor; - std::vector _signalPool; + std::deque _signalPool; }; @@ -239,22 +259,20 @@ unsigned g_deviceCnt; //================================================================================================= //--- ihipStream_t::ihipStream_t(unsigned device_index, hc::accelerator_view av, unsigned int flags) : - _device_index(device_index), _av(av), _flags(flags), _last_command(ihipCommandKernel), + _device_index(device_index), + _av(av), + _flags(flags), + _seq_signal_id(0), + _last_command_type(ihipCommandCopyH2D), + _last_copy_signal (NULL), _signalCursor(0) { + tprintf(TRACE_SYNC, " streamCreate: stream=%p\n", this); _signalPool.resize(HIP_STREAM_SIGNALS > 0 ? HIP_STREAM_SIGNALS : 1); - -#if 0 - auto s = this; - std::for_each(_signalPool.begin(), _signalPool.end(), - [s](ihipSignal_t &iter) { - printf (" stream:%p allocated hsa_signal=%lu\n", s, (iter._hsa_signal.handle)); - }); -#endif - }; + //--- ihipStream_t::~ihipStream_t() { @@ -262,6 +280,17 @@ ihipStream_t::~ihipStream_t() } +void ihipStream_t::wait() { + tprintf (TRACE_SYNC, "stream %p wait for queue-empty and lastCopy:#%lu...\n", this, _last_copy_signal ? _last_copy_signal->_seq_id: 0x0 ); + _av.wait(); + if (_last_copy_signal) { + hsa_signal_wait_acquire(_last_copy_signal->_hsa_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + // TODO-stream : reset ? 
+ } + +}; + + //--- inline ihipDevice_t * ihipStream_t::getDevice() const { @@ -280,8 +309,9 @@ ihipSignal_t *ihipStream_t::getSignal() _signalCursor = 0; } - if (_signalPool[thisCursor]._refCnt == 0) { - _signalPool[thisCursor]._refCnt ++; // allocate it + if (_signalPool[thisCursor]._ref_cnt == 0) { + _signalPool[thisCursor]._ref_cnt ++; // allocate it + _signalPool[thisCursor]._seq_id = ++_seq_signal_id; // allocate it return &_signalPool[thisCursor]; } @@ -291,19 +321,19 @@ ihipSignal_t *ihipStream_t::getSignal() assert(numToScan == 0); // Have to grow the pool: - printf ("Grow signal pool\n"); _signalCursor = _signalPool.size(); // set to the beginning of the new entries: _signalPool.resize(_signalPool.size() * 2); + tprintf (TRACE_SYNC, "grow signal pool to %zu entries, cursor=%d\n", _signalPool.size(), _signalCursor); return getSignal(); // try again, - // Shouldnever reach here. + // Should never reach here. assert(0); } void ihipStream_t::releaseSignal(ihipSignal_t *signal) { - if (--signal->_refCnt <= 0) { + if (--signal->_ref_cnt <= 0) { // restore signal to the initial value 1 hsa_signal_store_release(signal->_hsa_signal, 1); } @@ -714,7 +744,7 @@ static inline void ihipWaitAllStreams(ihipDevice_t *device) { tprintf(TRACE_SYNC, "waitAllStream\n"); for (auto streamI=device->_streams.begin(); streamI!=device->_streams.end(); streamI++) { - (*streamI)->_av.wait(); + (*streamI)->wait(); } } @@ -730,7 +760,7 @@ inline void ihipWaitNullStream(ihipDevice_t *device) if (!(stream->_flags & hipStreamNonBlocking)) { // TODO-hcc - use blocking or active wait here? 
// TODO-sync - cudaDeviceBlockingSync - stream->_av.wait(); + stream->wait(); } } } @@ -753,17 +783,9 @@ inline hipStream_t ihipSyncAndResolveStream(hipStream_t stream) } } -#if 0 -inline hsa_status_t -HSABarrier::enqueueBarrier(hsa_queue_t* queue) { - hsa_status_t status = HSA_STATUS_SUCCESS; - hc::completion_future marker = stream->_av.create_marker(); - - // Create a signal to wait for the barrier to finish. - std::pair ret = Kalmar::ctx.getSignal(); - signal = ret.first; - signalIndex = ret.second; +void +ihipStream_t::enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal) { // Obtain the write index for the command queue uint64_t index = hsa_queue_load_write_index_relaxed(queue); @@ -776,21 +798,20 @@ HSABarrier::enqueueBarrier(hsa_queue_t* queue) { // setup header uint16_t header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE; header |= 1 << HSA_PACKET_HEADER_BARRIER; - header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; - header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; + //header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; + //header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; barrier->header = header; - barrier->completion_signal = signal; + barrier->dep_signal[0] = depSignal->_hsa_signal; + barrier->completion_signal.handle = 0; + + // TODO - check queue overflow, return error: // Increment write index and ring doorbell to dispatch the kernel hsa_queue_store_write_index_relaxed(queue, index+1); hsa_signal_store_relaxed(queue->doorbell_signal, index); - - isDispatched = true; - - return status; } -#endif + //-- //When the commands in a stream change types (ie kernel command follows a data command, @@ -798,36 +819,93 @@ HSABarrier::enqueueBarrier(hsa_queue_t* queue) { //into the stream to mimic CUDA stream semantics. (some hardware uses separate //queues for data commands and kernel commands, and no implicit ordering is provided). 
// -inline bool ihipCheckCommandSwitchSync(hipStream_t stream, ihipCommand_t new_command, hc::completion_future *marker) +inline bool ihipStream_t::preKernelCommand() { bool addedSync = false; // If switching command types, we need to add a barrier packet to synchronize things. - if (stream->_last_command != new_command) { - addedSync = true; - *marker = stream->_av.create_marker(); + if (_last_command_type != ihipCommandKernel) { + if (_last_copy_signal) { + addedSync = true; - tprintf (TRACE_SYNC, "stream %p switch %s to %s (barrier pkt inserted)\n", - (void*)stream, - stream->_last_command == ihipCommandKernel ? "Kernel" : "Data", - new_command == ihipCommandKernel ? "Kernel" : "Data"); - stream->_last_command = new_command; + hsa_queue_t * q = (hsa_queue_t*)_av.get_hsa_queue(); + this->enqueueBarrier(q, _last_copy_signal); + + tprintf (TRACE_SYNC, "stream %p switch %s to %s (barrier pkt inserted)\n", + this, + ihipCommandName[_last_command_type], ihipCommandName[ihipCommandKernel]) + } + _last_command_type = ihipCommandKernel; } return addedSync; } +//--- +inline void ihipStream_t::postKernelCommand(hc::completion_future &kernelFuture) +{ + _last_kernel_future = kernelFuture; +}; + + + +//--- +// Called whenever a copy command is set to the stream. +// Examines the last command sent to this stream and returns a signal to wait on, if required. +inline int ihipStream_t::copyCommand(ihipSignal_t *lastCopy, hsa_signal_t *waitSignal, ihipCommand_t copyType) +{ + int needSync = 0; + // If switching command types, we need to add a barrier packet to synchronize things. 
+ if (_last_command_type != copyType) { + needSync = 1; + + + if (_last_command_type == ihipCommandKernel) { + tprintf (TRACE_SYNC, "stream %p switch %s to %s (async copy dep on prev kernel)\n", + this, ihipCommandName[_last_command_type], ihipCommandName[copyType]); + hsa_signal_t *hsaSignal = (static_cast (_last_kernel_future.get_native_handle())); + if (hsaSignal) { + *waitSignal = * hsaSignal; + } + } else if (_last_copy_signal) { + tprintf (TRACE_SYNC, "stream %p switch %s to %s (async copy dep on other copy)\n", + this, ihipCommandName[_last_command_type], ihipCommandName[copyType]); + assert (_last_copy_signal->_ref_cnt > 0); + *waitSignal = _last_copy_signal->_hsa_signal; + } + + _last_command_type = copyType; + } + + _last_copy_signal = lastCopy; + + return needSync; +} + + + + + +// TODO - data-up to data-down: // Called just before a kernel is launched from hipLaunchKernel. // Allows runtime to track some information about the stream. -hc::accelerator_view *ihipLaunchKernel(hipStream_t stream) +hipStream_t ihipPreLaunchKernel(hipStream_t stream, hc::accelerator_view **av) { - stream = ihipSyncAndResolveStream(stream); - hc::completion_future marker; - ihipCheckCommandSwitchSync(stream, ihipCommandKernel, &marker); + stream->preKernelCommand(); - return &(stream->_av); + *av = &stream->_av; + + return (stream); +} + + +//--- +//Called after kernel finishes execution. 
+void ihipPostLaunchKernel(hipStream_t stream, hc::completion_future &kernelFuture) +{ + stream->postKernelCommand(kernelFuture); } @@ -1202,7 +1280,7 @@ hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int { // Super-conservative version of this - TODO - remove me: - stream->_av.wait(); + stream->wait(); e = hipSuccess; } @@ -1220,7 +1298,7 @@ hipError_t hipStreamSynchronize(hipStream_t stream) ihipDevice_t *device = ihipGetTlsDefaultDevice(); ihipWaitNullStream(device); } else { - stream->_av.wait(); + stream->wait(); e = hipSuccess; } @@ -1389,7 +1467,7 @@ void ihipSetTs(hipEvent_t e) // already recorded, done: return; } else { - // Test this code: + // TODO - use completion-future functions to obtain ticks and timestamps: hsa_signal_t *sig = static_cast (eh->_marker.get_native_handle()); if (sig) { if (hsa_signal_load_acquire(*sig) == 0) { @@ -1694,8 +1772,9 @@ hipError_t hipMemcpyToSymbol(const char* symbolName, const void *src, size_t cou } auto device = ihipGetTlsDefaultDevice(); - hc::completion_future marker; - ihipCheckCommandSwitchSync(device._null_stream, ihipCommandData, &marker); + //hsa_signal_t depSignal; + //int depSignalCnt = device._null_stream->copyCommand(NULL, &depSignal, ihipCommandCopyH2D); + assert(0); // Need to properly synchronize the copy - do something with depSignal if != NULL. 
device->_acc.memcpy_symbol(symbolName, (void*) src,count, offset); #endif @@ -1762,7 +1841,12 @@ void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeByte hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); #if USE_ROCR_V2 - hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _device->_hsa_agent, _pinnedStagingBuffer[bufferIndex], _device->_hsa_agent, theseBytes, 0, NULL, _completion_signal[bufferIndex]); + hsa_signal_t depSignal; + int depSignalCnt = 0; //stream->copyCommand(_completion_signal[bufferIndex], &depSignal, copyType); + + printf ("need sync\n"); + + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _device->_hsa_agent, _pinnedStagingBuffer[bufferIndex], _device->_hsa_agent, theseBytes, depSignalCnt, depSignalCnt ? &depSignal:0x0, _completion_signal[bufferIndex]); #else hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _pinnedStagingBuffer[bufferIndex], theseBytes, _device->_hsa_agent, 0, NULL, _completion_signal[bufferIndex]); #endif @@ -1807,7 +1891,13 @@ void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeByte tprintf (TRACE_COPY2, "D2H: async_copy %zu bytes src:%p to staging:%p\n", theseBytes, srcp0, _pinnedStagingBuffer[bufferIndex]); hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); #if USE_ROCR_V2 - hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], _device->_hsa_agent, srcp0, _device->_hsa_agent, theseBytes, 0, NULL, _completion_signal[bufferIndex]); + hsa_signal_t depSignal; + // TODO + int depSignalCnt = 0; //stream->copyCommand(_completion_signal[bufferIndex], &depSignal, copyType); + + printf ("need sync\n"); + + hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], _device->_hsa_agent, srcp0, _device->_hsa_agent, theseBytes, depSignalCnt, depSignalCnt ? 
&depSignal:0, _completion_signal[bufferIndex]); #else hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], srcp0, theseBytes, _device->_hsa_agent, 0, NULL, _completion_signal[bufferIndex]); #endif @@ -1867,6 +1957,14 @@ void ihipSyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t sizeB } } +#if 0 + + //TODO + hsa_signal_t depSignal; + int dep_signals = stream->commandCopy(&depSignal, ); + pass to CopyHostToDevice +#endif + if ((kind == hipMemcpyHostToDevice) && (srcNotTracked)) { if (useStagingBuffer) { std::lock_guard l (device->_copy_lock[0]); @@ -1918,7 +2016,7 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull); hc::completion_future marker; - ihipCheckCommandSwitchSync(stream, ihipCommandData, &marker); + hipError_t e = hipSuccess; @@ -1955,44 +2053,52 @@ hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcp hipError_t e = hipSuccess; - stream = ihipSyncAndResolveStream(stream); - - hc::completion_future marker; - ihipCheckCommandSwitchSync(stream, ihipCommandData, &marker); - - // Dispatch async memory copy to synchronize with items in the specified stream. - - // Async - need to set up dependency on the last command queued to the device? 
- + stream = ihipSyncAndResolveStream(stream); #if USE_AM_TRACKER - - hipStream_t s = ihipSyncAndResolveStream(stream); - - if (s) { - ihipDevice_t *device = s->getDevice(); + if (stream) { + ihipDevice_t *device = stream->getDevice(); if (kind == hipMemcpyDefault) { e = hipErrorInvalidMemcpyDirection; + + } else if (kind == hipMemcpyHostToHost) { + tprintf (TRACE_COPY2, "H2H copy with memcpy"); + + memcpy(dst, src, sizeBytes); + } else { - // Let HSA runtime handle it: - // TODO - need buffer pool for the signals rather than lock: ihipSignal_t *ihip_signal = stream->getSignal(); - //stream->saveLastSignal(ihipSignal); + ihipCommand_t copyType; + if ((kind == hipMemcpyHostToDevice) || (kind == hipMemcpyDeviceToDevice)) { + copyType = ihipCommandCopyH2D; + } else if (kind == hipMemcpyDeviceToHost) { + copyType = ihipCommandCopyD2H; + } else { + e = hipErrorInvalidMemcpyDirection; + copyType = ihipCommandCopyD2H; + } #if USE_ROCR_V2 - hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, device->_hsa_agent, src, device->_hsa_agent, sizeBytes, 0, NULL, ihip_signal->_hsa_signal); + hsa_signal_t depSignal; + int depSignalCnt = stream->copyCommand(ihip_signal, &depSignal, copyType); + + tprintf (TRACE_SYNC, " copy-async, waitFor=%d(%lu) completion=%lu\n", depSignalCnt, depSignal.handle, ihip_signal->_seq_id); + + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, device->_hsa_agent, src, device->_hsa_agent, sizeBytes, depSignalCnt, depSignalCnt ? 
&depSignal:0x0, ihip_signal->_hsa_signal); #else hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, src, sizeBytes, device->_hsa_agent, 0, NULL, ihip_signal->_hsa_signal); #endif - if (hsa_status == HSA_STATUS_SUCCESS) { + if (hsa_status == HSA_STATUS_SUCCESS) { if (HIP_LAUNCH_BLOCKING) { hsa_signal_wait_relaxed(ihip_signal->_hsa_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); stream->releaseSignal(ihip_signal); - } + } else { + //stream->releaseSignal(ihip_signal); + } } else { // This path can be hit if src or dst point to unpinned host memory. // TODO - does async-copy fall back to sync if input pointers are not pinned? @@ -2014,6 +2120,8 @@ hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcp // TODO-sync: function is async unless target is pinned host memory - then these are fully sync. +/** @return #hipErrorInvalidValue + */ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t stream ) { std::call_once(hip_initialized, ihipInit); @@ -2021,37 +2129,42 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s hipError_t e = hipSuccess; stream = ihipSyncAndResolveStream(stream); - hc::completion_future marker; - ihipCheckCommandSwitchSync(stream, ihipCommandData, &marker); + stream->preKernelCommand(); + if (stream) { - hc::completion_future cf ; + hc::completion_future cf ; - if ((sizeBytes & 0x3) == 0) { - // use a faster word-per-workitem copy: - try { - value = value & 0xff; - unsigned value32 = (value << 24) | (value << 16) | (value << 8) | (value) ; - cf = ihipMemsetKernel (stream, static_cast (dst), value32, sizeBytes/sizeof(unsigned)); - } - catch (std::exception &ex) { - e = hipErrorInvalidValue; + if ((sizeBytes & 0x3) == 0) { + // use a faster word-per-workitem copy: + try { + value = value & 0xff; + unsigned value32 = (value << 24) | (value << 16) | (value << 8) | (value) ; + cf = ihipMemsetKernel (stream, static_cast (dst), value32, 
sizeBytes/sizeof(unsigned)); + } + catch (std::exception &ex) { + e = hipErrorInvalidValue; + } + } else { + // use a slow byte-per-workitem copy: + try { + cf = ihipMemsetKernel (stream, static_cast (dst), value, sizeBytes); + } + catch (std::exception &ex) { + e = hipErrorInvalidValue; + } } + + stream->postKernelCommand(cf); + + + if (HIP_LAUNCH_BLOCKING) { + tprintf (TRACE_SYNC, "'%s' LAUNCH_BLOCKING wait for completion [stream:%p].\n", __func__, (void*)stream); + cf.wait(); + tprintf (TRACE_SYNC, "'%s' LAUNCH_BLOCKING completed [stream:%p].\n", __func__, (void*)stream); + } } else { - // use a slow byte-per-workitem copy: - try { - cf = ihipMemsetKernel (stream, static_cast (dst), value, sizeBytes); - } - catch (std::exception &ex) { - e = hipErrorInvalidValue; - } - } - - - if (HIP_LAUNCH_BLOCKING) { - tprintf (TRACE_SYNC, "'%s' LAUNCH_BLOCKING wait for completion [stream:%p].\n", __func__, (void*)stream); - cf.wait(); - tprintf (TRACE_SYNC, "'%s' LAUNCH_BLOCKING completed [stream:%p].\n", __func__, (void*)stream); + e = hipErrorInvalidValue; } diff --git a/projects/hip/tests/src/hipMemcpyAsync.cpp b/projects/hip/tests/src/hipMemcpyAsync.cpp index 19f1a94761..8669b986d8 100644 --- a/projects/hip/tests/src/hipMemcpyAsync.cpp +++ b/projects/hip/tests/src/hipMemcpyAsync.cpp @@ -36,24 +36,62 @@ void simpleNegTest() //Send many async copies to the same stream. 
//This requires runtime to keep track of many outstanding commands, and in the case of HCC requires growing/tracking the signal pool: template -void test_manyCopies(int nElements, size_t numCopies, int nStreams) +void test_manyCopies(int nElements, int numCopies) { size_t Nbytes = nElements*sizeof(T); - printf ("Nbytes=%zu (%6.1f MB)\n", Nbytes, (double)(Nbytes)/1024.0/1024.0); + size_t eachCopyElements = nElements / numCopies; + size_t eachCopyBytes = eachCopyElements * sizeof(T); - int *A_d, *B_d, *C_d; - int *A_h, *B_h, *C_h; + printf ("-----------------------------------------------------------------------------------------------\n"); + printf ("testing: %s Nbytes=%zu (%6.1f MB) numCopies=%d eachCopyElements=%zu eachCopyBytes=%zu\n", + __func__, Nbytes, (double)(Nbytes)/1024.0/1024.0, numCopies, eachCopyElements, eachCopyBytes); - HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, true); + T *A_d; + T *A_h1, *A_h2; - size_t eachCopyBytes = Nbytes / numCopies; - - for (size_t i=0; i (i); } - HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, true); + + hipStream_t stream; + HIPCHECK (hipStreamCreate(&stream)); + + //stream=0; // fixme TODO + + + for (int i=0; i(1024, 16); + test_manyCopies(1024, 4); + test_manyCopies(1024*4, 64); + } - test_chunkedAsyncExample(p_streams, true, true, true); // Easy sync version - test_chunkedAsyncExample(p_streams, false, true, true); // Easy sync version - test_chunkedAsyncExample(p_streams, false, false, true); // Some async - test_chunkedAsyncExample(p_streams, false, false, false); // All async + if (p_tests & 0x4) { + test_chunkedAsyncExample(p_streams, true, true, true); // Easy sync version + test_chunkedAsyncExample(p_streams, false, true, true); // Easy sync version + test_chunkedAsyncExample(p_streams, false, false, true); // Some async + test_chunkedAsyncExample(p_streams, false, false, false); // All async + } diff --git a/projects/hip/tests/src/test_common.h b/projects/hip/tests/src/test_common.h index 
5b631d2c3a..f133696d78 100644 --- a/projects/hip/tests/src/test_common.h +++ b/projects/hip/tests/src/test_common.h @@ -25,7 +25,7 @@ printf (__VA_ARGS__);\ printf ("\n");\ printf ("error: TEST FAILED\n%s", KNRM );\ - exit(EXIT_FAILURE); + abort(); #define HIPCHECK(error) \ From d40b8d8fb0b1db0062f7e62c4c76f03c186f2249 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Mon, 22 Feb 2016 15:00:53 +0530 Subject: [PATCH 63/94] Enable cospi,rsqrt,sinpi tests for HCC newer than 16073 [ROCm/hip commit: c5c11d370745f3ef0ed9c0bbca169c16273b0e4d] --- .../hip/tests/src/hipDoublePrecisionMathDevice.cpp | 12 +++++++++--- .../hip/tests/src/hipDoublePrecisionMathHost.cpp | 12 +++++++++--- .../hip/tests/src/hipSinglePrecisionMathDevice.cpp | 12 +++++++++--- .../hip/tests/src/hipSinglePrecisionMathHost.cpp | 12 +++++++++--- 4 files changed, 36 insertions(+), 12 deletions(-) diff --git a/projects/hip/tests/src/hipDoublePrecisionMathDevice.cpp b/projects/hip/tests/src/hipDoublePrecisionMathDevice.cpp index 4f36b91eb7..7e1d862392 100644 --- a/projects/hip/tests/src/hipDoublePrecisionMathDevice.cpp +++ b/projects/hip/tests/src/hipDoublePrecisionMathDevice.cpp @@ -42,7 +42,9 @@ __device__ void double_precision_math_functions() copysign(1.0, -2.0); cos(0.0); cosh(0.0); - //cospi(0.0); +#if __hcc_workweek__ >= 16073 + cospi(0.0); +#endif //cyl_bessel_i0(0.0); //cyl_bessel_i1(0.0); erf(0.0); @@ -100,7 +102,9 @@ __device__ void double_precision_math_functions() //rnorm3d(0.0, 0.0, 1.0); //rnorm4d(0.0, 0.0, 0.0, 1.0); round(0.0); - //rsqrt(1.0); +#if __hcc_workweek__ >= 16073 + rsqrt(1.0); +#endif //scalbln(0.0, 1); scalbn(0.0, 1); signbit(1.0); @@ -108,7 +112,9 @@ __device__ void double_precision_math_functions() //sincos(0.0, &fX, &fY); //sincospi(0.0, &fX, &fY); sinh(0.0); - //sinpi(0.0); +#if __hcc_workweek__ >= 16073 + sinpi(0.0); +#endif sqrt(0.0); tan(0.0); tanh(0.0); diff --git a/projects/hip/tests/src/hipDoublePrecisionMathHost.cpp 
b/projects/hip/tests/src/hipDoublePrecisionMathHost.cpp index 9e4c43e2be..d45423a879 100644 --- a/projects/hip/tests/src/hipDoublePrecisionMathHost.cpp +++ b/projects/hip/tests/src/hipDoublePrecisionMathHost.cpp @@ -42,7 +42,9 @@ __host__ void double_precision_math_functions() copysign(1.0, -2.0); cos(0.0); cosh(0.0); - //cospi(0.0); +#if __hcc_workweek__ >= 16073 + cospi(0.0); +#endif //cyl_bessel_i0(0.0); //cyl_bessel_i1(0.0); erf(0.0); @@ -100,7 +102,9 @@ __host__ void double_precision_math_functions() //rnorm3d(0.0, 0.0, 1.0); //rnorm4d(0.0, 0.0, 0.0, 1.0); round(0.0); - //rsqrt(1.0); +#if __hcc_workweek__ >= 16073 + rsqrt(1.0); +#endif ///scalbln(0.0, 1); scalbn(0.0, 1); signbit(1.0); @@ -108,7 +112,9 @@ __host__ void double_precision_math_functions() sincos(0.0, &fX, &fY); //sincospi(0.0, &fX, &fY); sinh(0.0); - //sinpi(0.0); +#if __hcc_workweek__ >= 16073 + sinpi(0.0); +#endif sqrt(0.0); tan(0.0); tanh(0.0); diff --git a/projects/hip/tests/src/hipSinglePrecisionMathDevice.cpp b/projects/hip/tests/src/hipSinglePrecisionMathDevice.cpp index 8413c37b77..acb74d3f2d 100644 --- a/projects/hip/tests/src/hipSinglePrecisionMathDevice.cpp +++ b/projects/hip/tests/src/hipSinglePrecisionMathDevice.cpp @@ -42,7 +42,9 @@ __device__ void single_precision_math_functions() copysignf(1.0f, -2.0f); cosf(0.0f); coshf(0.0f); - //cospif(0.0f); +#if __hcc_workweek__ >= 16073 + cospif(0.0f); +#endif //cyl_bessel_i0f(0.0f); //cyl_bessel_i1f(0.0f); erfcf(0.0f); @@ -101,7 +103,9 @@ __device__ void single_precision_math_functions() //rnorm4df(0.0f, 0.0f, 0.0f, 1.0f); //fX = 1.0f; rnormf(1, &fX); roundf(0.0f); - //rsqrtf(1.0f); +#if __hcc_workweek__ >= 16073 + rsqrtf(1.0f); +#endif //scalblnf(0.0f, 1); scalbnf(0.0f, 1); signbit(1.0f); @@ -109,7 +113,9 @@ __device__ void single_precision_math_functions() //sincospif(0.0f, &fX, &fY); sinf(0.0f); sinhf(0.0f); - //sinpif(0.0f); +#if __hcc_workweek__ >= 16073 + sinpif(0.0f); +#endif sqrtf(0.0f); tanf(0.0f); tanhf(0.0f); diff --git 
a/projects/hip/tests/src/hipSinglePrecisionMathHost.cpp b/projects/hip/tests/src/hipSinglePrecisionMathHost.cpp index 6dd1c07f1b..c12b553e0f 100644 --- a/projects/hip/tests/src/hipSinglePrecisionMathHost.cpp +++ b/projects/hip/tests/src/hipSinglePrecisionMathHost.cpp @@ -42,7 +42,9 @@ __host__ void single_precision_math_functions() copysignf(1.0f, -2.0f); cosf(0.0f); coshf(0.0f); - //cospif(0.0f); +#if __hcc_workweek__ >= 16073 + cospif(0.0f); +#endif //cyl_bessel_i0f(0.0f); //cyl_bessel_i1f(0.0f); erfcf(0.0f); @@ -101,7 +103,9 @@ __host__ void single_precision_math_functions() //rnorm4df(0.0f, 0.0f, 0.0f, 1.0f); //fX = 1.0f; rnormf(1, &fX); roundf(0.0f); - //rsqrtf(1.0f); +#if __hcc_workweek__ >= 16073 + rsqrtf(1.0f); +#endif ///scalblnf(0.0f, 1); scalbnf(0.0f, 1); signbit(1.0f); @@ -109,7 +113,9 @@ __host__ void single_precision_math_functions() //sincospif(0.0f, &fX, &fY); sinf(0.0f); sinhf(0.0f); - //sinpif(0.0f); +#if __hcc_workweek__ >= 16073 + sinpif(0.0f); +#endif sqrtf(0.0f); tanf(0.0f); tanhf(0.0f); From ccd1ed0a97820f430dc86c8690233b53ab5871b1 Mon Sep 17 00:00:00 2001 From: gargrahul Date: Mon, 22 Feb 2016 16:21:52 +0530 Subject: [PATCH 64/94] Update for shared atomics support [ROCm/hip commit: a2fbf0612995d360ed1216e9557e3700fe1cd1e9] --- projects/hip/docs/markdown/hip_kernel_language.md | 1 - projects/hip/include/hcc_detail/hip_runtime.h | 4 ++-- projects/hip/src/hip_hcc.cpp | 12 ++++++------ 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/projects/hip/docs/markdown/hip_kernel_language.md b/projects/hip/docs/markdown/hip_kernel_language.md index 9d20fe82a3..4d3e72ce65 100644 --- a/projects/hip/docs/markdown/hip_kernel_language.md +++ b/projects/hip/docs/markdown/hip_kernel_language.md @@ -475,7 +475,6 @@ HIP supports the following atomic operations. ### Caveats and Features Under-Development: - HIP enables atomic operations on 32-bit integers. Additionally, it supports an atomic float add. 
AMD hardware, however, implements the float add using a CAS loop, so this function may not perform efficiently. -- hcc currently maps `__shared__` atomics to `__device__` atomics. Optimal support is under development. - wrapping increment and decrement are under development. ## Warp Cross-Lane Functions diff --git a/projects/hip/include/hcc_detail/hip_runtime.h b/projects/hip/include/hcc_detail/hip_runtime.h index 8474f066df..75bc40aade 100644 --- a/projects/hip/include/hcc_detail/hip_runtime.h +++ b/projects/hip/include/hcc_detail/hip_runtime.h @@ -66,8 +66,8 @@ THE SOFTWARE. // 32-bit Atomics: #define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1) #define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1) -#define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (0) -#define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (0) +#define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (1) +#define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (1) #define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (0) // 64-bit Atomics: diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 91c9c7ed55..ee208bcced 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -40,7 +40,7 @@ THE SOFTWARE. #define USE_PINNED_HOST (__hcc_workweek__ >= 1601) -//#define USE_ASYNC_COPY +//#define USE_ASYNC_COPY #define INLINE static inline @@ -360,11 +360,11 @@ hipError_t ihipDevice_t::getProperties(hipDeviceProp_t* prop) prop->arch.hasGlobalInt32Atomics = 1; prop->arch.hasGlobalFloatAtomicExch = 1; - prop->arch.hasSharedInt32Atomics = 0; // TODO-hcc-atomics - prop->arch.hasSharedFloatAtomicExch = 0; // TODO-hcc-atomics + prop->arch.hasSharedInt32Atomics = 1; + prop->arch.hasSharedFloatAtomicExch = 1; prop->arch.hasFloatAtomicAdd = 0; prop->arch.hasGlobalInt64Atomics = 1; - prop->arch.hasSharedInt64Atomics = 0; // TODO-hcc-atomics + prop->arch.hasSharedInt64Atomics = 1; prop->arch.hasDoubles = 1; // TODO - true for Fiji. 
prop->arch.hasWarpVote = 1; prop->arch.hasWarpBallot = 1; @@ -476,7 +476,7 @@ INLINE ihipDevice_t *ihipGetTlsDefaultDevice() { // If this is invalid, the TLS state is corrupt. // This can fire if called before devices are initialized. - // TODO - consider replacing assert with error code + // TODO - consider replacing assert with error code assert (ihipIsValidDevice(tls_defaultDevice)); return &g_devices[tls_defaultDevice]; @@ -1428,7 +1428,7 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind } else { e = hipErrorInvalidResourceHandle; } - + #else // TODO-hsart - what synchronization does hsa_copy provide? From 1d027bcaea8036df2310445b1bb68ed67decf6b2 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 22 Feb 2016 15:09:23 -0600 Subject: [PATCH 65/94] Fix memcpy for Titan. Add to common includes [ROCm/hip commit: c2d66a48a70cc0ce053f971f0033916709c5cfe1] --- projects/hip/include/hip_runtime.h | 5 +++++ projects/hip/tests/src/CMakeLists.txt | 4 ++++ projects/hip/tests/src/hipMemcpy.cpp | 13 ++++++++++--- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/projects/hip/include/hip_runtime.h b/projects/hip/include/hip_runtime.h index 59d3d6c4c9..0594726c90 100644 --- a/projects/hip/include/hip_runtime.h +++ b/projects/hip/include/hip_runtime.h @@ -43,6 +43,11 @@ THE SOFTWARE. #include #include +#ifdef __cplusplus +#include +#endif + + #include #if defined(__HIP_PLATFORM_HCC__) and not defined (__HIP_PLATFORM_NVCC__) diff --git a/projects/hip/tests/src/CMakeLists.txt b/projects/hip/tests/src/CMakeLists.txt index 09c0ca7162..c1582ecf88 100644 --- a/projects/hip/tests/src/CMakeLists.txt +++ b/projects/hip/tests/src/CMakeLists.txt @@ -41,6 +41,10 @@ if (${HIP_PLATFORM} STREQUAL "hcc") elseif (${HIP_PLATFORM} STREQUAL "nvcc") MESSAGE ("HIP_PLATFORM=nvcc") + + #Need C++11 for threads in some of the tests. 
+ add_definitions(-std=c++11) + # NVCC does not not support -rdynamic option set(CMAKE_SHARED_LIBRARY_LINK_CXX_FLAGS ) set(CMAKE_SHARED_LIBRARY_LINK_C_FLAGS ) diff --git a/projects/hip/tests/src/hipMemcpy.cpp b/projects/hip/tests/src/hipMemcpy.cpp index 8286454098..f9bde2df9f 100644 --- a/projects/hip/tests/src/hipMemcpy.cpp +++ b/projects/hip/tests/src/hipMemcpy.cpp @@ -63,6 +63,13 @@ void simpleTest1() } +#ifdef __HIP_PLATFORM_HCC +#define TYPENAME(T) typeid(T).name() +#else +#define TYPENAME(T) "?" +#endif + + //--- // Test many different kinds of memory copies. // THe subroutine allocates memory , copies to device, runs a vector add kernel, copies back, and checks the result. @@ -79,7 +86,7 @@ void memcpytest2(size_t numElements, bool usePinnedHost, bool useHostToHost, boo size_t sizeElements = numElements * sizeof(T); printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d\n", __func__, - typeid(T).name(), + TYPENAME(T), sizeElements, sizeElements/1024.0/1024.0, usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); @@ -169,7 +176,7 @@ template void memcpytest2_sizes(size_t maxElem=0, size_t offset=0) { printSep(); - printf ("test: %s<%s>\n", __func__, typeid(T).name()); + printf ("test: %s<%s>\n", __func__, TYPENAME(T)); int deviceId; HIPCHECK(hipGetDevice(&deviceId)); @@ -199,7 +206,7 @@ template void multiThread_1(bool serialize, bool usePinnedHost) { printSep(); - printf ("test: %s<%s> serialize=%d usePinnedHost=%d\n", __func__, typeid(T).name(), serialize, usePinnedHost); + printf ("test: %s<%s> serialize=%d usePinnedHost=%d\n", __func__, TYPENAME(T), serialize, usePinnedHost); std::thread t1 (memcpytest2,N, usePinnedHost,0,0,0); if (serialize) { t1.join(); From f0c734e256a9f177f9e8ddbef262fa069db04ab7 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 22 Feb 2016 23:15:24 -0600 Subject: [PATCH 66/94] Improve async copy implementation. 
- Add device-side signal waits when transitioning between command classes (Kernel, H2D copy, D2H copy). - Support waiting in staged memory copies as well. - Add several chicken bits to control implementation: - HIP_DISABLE_ENQ_BARRIER - HIP_DISABLE_BIDIR_MEMCPY - HIP_ONESHOT_COPY_DEP - Refactor signal pool to support efficient deallocation based on sigseqnum. - Deallocate copy signals on eventSynchronize. - Improve copy tests, add pingpong. [ROCm/hip commit: 549b18ce774abe4931c9958eb361f5b3e2deb1c9] --- projects/hip/src/hip_hcc.cpp | 524 +++++++++++++--------- projects/hip/tests/src/hipMemcpyAsync.cpp | 162 ++++++- projects/hip/tests/src/test_common.h | 2 +- 3 files changed, 462 insertions(+), 226 deletions(-) diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 2d7650b6ed..3ba578d52c 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -65,6 +65,8 @@ THE SOFTWARE. //static const int debug = 0; static const int release = 1; +#define ENABLE_CHECKS 1 + int HIP_LAUNCH_BLOCKING = 0; int HIP_PRINT_ENV = 0; @@ -73,14 +75,25 @@ int HIP_STAGING_SIZE = 64; /* size of staging buffers, in KB */ int HIP_STAGING_BUFFERS = 2; int HIP_STREAM_SIGNALS = 2; /* number of signals to allocate at stream creation */ -#define TRACE_API 0x1 /* trace API calls and return values */ -#define TRACE_SYNC 0x2 /* trace synchronization pieces */ -#define TRACE_MEM 0x4 /* trace memory allocation / deallocation */ -#define TRACE_COPY2 0x8 /* trace memory copy commands. Detailed. 
*/ + +//--- +// Chicken bits for disabling functionality to work around potential issues: +int HIP_DISABLE_ENQ_BARRIER = 1; +int HIP_DISABLE_BIDIR_MEMCPY = 1; +int HIP_ONESHOT_COPY_DEP = 1; // this is a good thing + + +//--- +//Debug flags: +#define TRACE_API 0x01 /* trace API calls and return values */ +#define TRACE_SYNC 0x02 /* trace synchronization pieces */ +#define TRACE_MEM 0x04 /* trace memory allocation / deallocation */ +#define TRACE_COPY2 0x08 /* trace memory copy commands. Detailed. */ +#define TRACE_SIGNAL 0x10 /* trace signal pool commands */ #define tprintf(trace_level, ...) {\ if (HIP_TRACE_API & trace_level) {\ - fprintf (stderr, "hiptrace%d: ", trace_level); \ + fprintf (stderr, "hiptrace%x: ", trace_level); \ fprintf (stderr, __VA_ARGS__);\ }\ } @@ -101,30 +114,28 @@ const char* ihipCommandName[] = { }; + +typedef uint64_t SIGSEQNUM; + +//--- // Small wrapper around signals. // Designed to be used from stream. +// TODO-someday refactor this class so it can be stored in a vector<> +// we already store the index here so we can use for garbage collection. struct ihipSignal_t { hsa_signal_t _hsa_signal; // hsa signal handle - int _ref_cnt; // reference count, 0 == signal is free. - uint64_t _seq_id; // unique sequentially increasig ID. + int _index; // Index in pool, used for garbage collection. + SIGSEQNUM _sig_id; // unique sequentially increasing ID. - ihipSignal_t() : _ref_cnt(0), _seq_id(0) { - if (hsa_signal_create(1, 0, NULL, &_hsa_signal) != HSA_STATUS_SUCCESS) { - throw; - } - tprintf (TRACE_SYNC, " allocated hsa_signal=%lu\n", (_hsa_signal.handle)); - } + ihipSignal_t(); + ~ihipSignal_t(); - ~ihipSignal_t() { - if (hsa_signal_destroy(_hsa_signal) != HSA_STATUS_SUCCESS) { - throw; - } - // _ref_cnt should be 0, unless we are shutting down... - _ref_cnt = 0; - }; + inline void release(); }; + + // Internal stream structure. 
class ihipStream_t { public: @@ -132,31 +143,38 @@ public: ihipStream_t(unsigned device_index, hc::accelerator_view av, unsigned int flags); ~ihipStream_t(); + inline void reclaimSignals(SIGSEQNUM sigNum); + inline void waitAndReclaimOlder(ihipSignal_t *signal); inline void wait(); inline ihipDevice_t * getDevice() const; ihipSignal_t * getSignal() ; - void releaseSignal(ihipSignal_t *signal) ; inline bool preKernelCommand(); inline void postKernelCommand(hc::completion_future &kernel_future); inline int copyCommand(ihipSignal_t *lastCopy, hsa_signal_t *waitSignal, ihipCommand_t copyType); + inline void resetToEmpty(); + + inline SIGSEQNUM lastCopySeqId() { return _last_copy_signal ? _last_copy_signal->_sig_id : 0; }; + //--- - unsigned _device_index; hc::accelerator_view _av; unsigned _flags; private: void enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal); - uint64_t _seq_signal_id; // Monotonically increasing unique signal id. + unsigned _device_index; ihipCommand_t _last_command_type; // type of the last command ihipSignal_t *_last_copy_signal; // signal of last copy command sent to the stream. Copy can be either H2D or D2H. hc::completion_future _last_kernel_future; // Completion future of last kernel command sent to GPU. int _signalCursor; - std::deque _signalPool; + + SIGSEQNUM _stream_sig_id; // Monotonically increasing unique signal id. + SIGSEQNUM _oldest_live_sig_id; // oldest live seq_id, anything < this can be allocated. + std::deque _signalPool; // Pool of signals for use by this stream. }; @@ -180,6 +198,8 @@ struct ihipEvent_t { hc::completion_future _marker; uint64_t _timestamp; // store timestamp, may be set on host or by marker. 
+ + SIGSEQNUM _copy_seq_id; } ; @@ -191,8 +211,8 @@ struct StagingBuffer { StagingBuffer(ihipDevice_t *device, size_t bufferSize, int numBuffers) ; ~StagingBuffer(); - void CopyDeviceToHost(void* dst, const void* src, size_t sizeBytes); - void CopyHostToDevice(void* dst, const void* src, size_t sizeBytes); + void CopyDeviceToHost(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); + void CopyHostToDevice(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); private: ihipDevice_t *_device; @@ -249,27 +269,57 @@ unsigned g_deviceCnt; //================================================================================================= +//================================================================================================= +//Forward Declarations: +//================================================================================================= +INLINE bool ihipIsValidDevice(unsigned deviceIndex); + //================================================================================================= // Implementation: //================================================================================================= +//================================================================================================= +// ihipSignal_t: +//================================================================================================= +// +//--- +ihipSignal_t::ihipSignal_t() : _sig_id(0) +{ + if (hsa_signal_create(0/*value*/, 0, NULL, &_hsa_signal) != HSA_STATUS_SUCCESS) { + throw; + } + tprintf (TRACE_SIGNAL, " allocated hsa_signal=%lu\n", (_hsa_signal.handle)); +} + +//--- +ihipSignal_t::~ihipSignal_t() +{ + tprintf (TRACE_SIGNAL, " destroy hsa_signal #%lu (#%lu)\n", (_hsa_signal.handle), _sig_id); + if (hsa_signal_destroy(_hsa_signal) != HSA_STATUS_SUCCESS) { + throw; // TODO + } +}; + + + //================================================================================================= // ihipStream_t: 
//================================================================================================= //--- ihipStream_t::ihipStream_t(unsigned device_index, hc::accelerator_view av, unsigned int flags) : - _device_index(device_index), _av(av), _flags(flags), - _seq_signal_id(0), - _last_command_type(ihipCommandCopyH2D), - _last_copy_signal (NULL), - _signalCursor(0) + _device_index(device_index), + _last_copy_signal(0), + _signalCursor(0), + _stream_sig_id(0), + _oldest_live_sig_id(1) { tprintf(TRACE_SYNC, " streamCreate: stream=%p\n", this); _signalPool.resize(HIP_STREAM_SIGNALS > 0 ? HIP_STREAM_SIGNALS : 1); + resetToEmpty(); }; @@ -280,26 +330,62 @@ ihipStream_t::~ihipStream_t() } -void ihipStream_t::wait() { - tprintf (TRACE_SYNC, "stream %p wait for queue-empty and lastCopy:#%lu...\n", this, _last_copy_signal ? _last_copy_signal->_seq_id: 0x0 ); +//--- +// Reset the stream to "empty" - next command will not set up an input dependency on any older signal. +void ihipStream_t::resetToEmpty() +{ + _last_command_type = ihipCommandCopyH2D; + _last_copy_signal = NULL; +} + +//--- +void ihipStream_t::reclaimSignals(SIGSEQNUM sigNum) +{ + tprintf(TRACE_SIGNAL, "reclaim signal #%lu\n", sigNum); + // Mark all signals older and including this one as available for allocation. + _oldest_live_sig_id = sigNum+1; +} + + +//--- +void ihipStream_t::waitAndReclaimOlder(ihipSignal_t *signal) +{ + hsa_signal_wait_acquire(_last_copy_signal->_hsa_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + + reclaimSignals(_last_copy_signal->_sig_id); + +} + + +//--- +//Wait for all queued kernels in the associated accelerator_view to complete. +void ihipStream_t::wait() +{ + tprintf (TRACE_SYNC, "stream %p wait for queue-empty and lastCopy:#%lu...\n", this, _last_copy_signal ? 
_last_copy_signal->_sig_id: 0x0 ); _av.wait(); if (_last_copy_signal) { - hsa_signal_wait_acquire(_last_copy_signal->_hsa_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - // TODO-stream : reset ? + this->waitAndReclaimOlder(_last_copy_signal); } + resetToEmpty(); }; //--- inline ihipDevice_t * ihipStream_t::getDevice() const { - return &g_devices[_device_index]; + if (ihipIsValidDevice(_device_index)) { + return &g_devices[_device_index]; + } else { + return NULL; + } }; +//--- // Allocate a new signal from the signal pool. -// Returned signals are initialized to a value of "1". +// Returned signals have value of 0. +// Signals are intended for use in this stream and are always reclaimed "in-order". ihipSignal_t *ihipStream_t::getSignal() { int numToScan = _signalPool.size(); @@ -309,21 +395,22 @@ ihipSignal_t *ihipStream_t::getSignal() _signalCursor = 0; } - if (_signalPool[thisCursor]._ref_cnt == 0) { - _signalPool[thisCursor]._ref_cnt ++; // allocate it - _signalPool[thisCursor]._seq_id = ++_seq_signal_id; // allocate it + if (_signalPool[thisCursor]._sig_id < _oldest_live_sig_id) { + _signalPool[thisCursor]._index = thisCursor; + _signalPool[thisCursor]._sig_id = ++_stream_sig_id; // allocate it. + + return &_signalPool[thisCursor]; } - numToScan--; - } while (numToScan) ; + } while (--numToScan) ; assert(numToScan == 0); // Have to grow the pool: _signalCursor = _signalPool.size(); // set to the beginning of the new entries: _signalPool.resize(_signalPool.size() * 2); - tprintf (TRACE_SYNC, "grow signal pool to %zu entries, cursor=%d\n", _signalPool.size(), _signalCursor); + tprintf (TRACE_SIGNAL, "grow signal pool to %zu entries, cursor=%d\n", _signalPool.size(), _signalCursor); return getSignal(); // try again, // Should never reach here. 
@@ -331,14 +418,113 @@ ihipSignal_t *ihipStream_t::getSignal() } -void ihipStream_t::releaseSignal(ihipSignal_t *signal) +//--- +void ihipStream_t::enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal) { - if (--signal->_ref_cnt <= 0) { - // restore signal to the initial value 1 - hsa_signal_store_release(signal->_hsa_signal, 1); - } + + // Obtain the write index for the command queue + uint64_t index = hsa_queue_load_write_index_relaxed(queue); + const uint32_t queueMask = queue->size - 1; + + // Define the barrier packet to be at the calculated queue index address + hsa_barrier_and_packet_t* barrier = &(((hsa_barrier_and_packet_t*)(queue->base_address))[index&queueMask]); + memset(barrier, 0, sizeof(hsa_barrier_and_packet_t)); + + // setup header + uint16_t header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE; + header |= 1 << HSA_PACKET_HEADER_BARRIER; + //header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; + //header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; + barrier->header = header; + + barrier->dep_signal[0] = depSignal->_hsa_signal; + + barrier->completion_signal.handle = 0; + + // TODO - check queue overflow, return error: + // Increment write index and ring doorbell to dispatch the kernel + hsa_queue_store_write_index_relaxed(queue, index+1); + hsa_signal_store_relaxed(queue->doorbell_signal, index); } + +//-- +//When the commands in a stream change types (ie kernel command follows a data command, +//or data command follows a kernel command), then we need to add a barrier packet +//into the stream to mimic CUDA stream semantics. (some hardware uses separate +//queues for data commands and kernel commands, and no implicit ordering is provided). +// +inline bool ihipStream_t::preKernelCommand() +{ + bool addedSync = false; + // If switching command types, we need to add a barrier packet to synchronize things. 
+ if (_last_command_type != ihipCommandKernel) { + if (_last_copy_signal) { + addedSync = true; + + hsa_queue_t * q = (hsa_queue_t*)_av.get_hsa_queue(); + if (! HIP_DISABLE_ENQ_BARRIER) { + this->enqueueBarrier(q, _last_copy_signal); + tprintf (TRACE_SYNC, "stream %p switch %s to %s (barrier pkt inserted with wait on #%lu)\n", + this, ihipCommandName[_last_command_type], ihipCommandName[ihipCommandKernel], _last_copy_signal->_sig_id) + + } else { + tprintf (TRACE_SYNC, "stream %p switch %s to %s (wait for previous...)\n", + this, ihipCommandName[_last_command_type], ihipCommandName[ihipCommandKernel]); + this->waitAndReclaimOlder(_last_copy_signal); + } + } + _last_command_type = ihipCommandKernel; + } + + return addedSync; +} + + +//--- +inline void ihipStream_t::postKernelCommand(hc::completion_future &kernelFuture) +{ + _last_kernel_future = kernelFuture; +}; + + + +//--- +// Called whenever a copy command is set to the stream. +// Examines the last command sent to this stream and returns a signal to wait on, if required. +inline int ihipStream_t::copyCommand(ihipSignal_t *lastCopy, hsa_signal_t *waitSignal, ihipCommand_t copyType) +{ + int needSync = 0; + + waitSignal->handle = 0; + // If switching command types, we need to add a barrier packet to synchronize things. 
+ if (_last_command_type != copyType) { + + + if (_last_command_type == ihipCommandKernel) { + tprintf (TRACE_SYNC, "stream %p switch %s to %s (async copy dep on prev kernel)\n", + this, ihipCommandName[_last_command_type], ihipCommandName[copyType]); + needSync = 1; + hsa_signal_t *hsaSignal = (static_cast (_last_kernel_future.get_native_handle())); + if (hsaSignal) { + *waitSignal = * hsaSignal; + } + } else if (_last_copy_signal) { + needSync = 1; + tprintf (TRACE_SYNC, "stream %p switch %s to %s (async copy dep on other copy #%lu)\n", + this, ihipCommandName[_last_command_type], ihipCommandName[copyType], _last_copy_signal->_sig_id); + *waitSignal = _last_copy_signal->_hsa_signal; + } + + _last_command_type = copyType; + } + + _last_copy_signal = lastCopy; + + return needSync; +} + + //================================================================================================= // //Reset the device - this is called from hipDeviceReset. @@ -680,6 +866,10 @@ void ihipInit() READ_ENV_I(release, HIP_STAGING_BUFFERS, 0, "Number of staging buffers to use in each direction"); READ_ENV_I(release, HIP_STREAM_SIGNALS, 0, "Number of signals to allocate when new stream is created (signal pool will grow on demand)"); + READ_ENV_I(release, HIP_DISABLE_ENQ_BARRIER, 0, "Disable enqueue of barrier packet - instead wait for copy completion on host."); + READ_ENV_I(release, HIP_DISABLE_BIDIR_MEMCPY, 0, "Disable simultaneous H2D memcpy and D2H memcpy to same device"); + READ_ENV_I(release, HIP_ONESHOT_COPY_DEP, 0, "If set, only set the copy input dependency for the first copy command in a staged copy. If clear, set the dep for each copy."); + /* * Build a table of valid compute devices. 
*/ @@ -784,103 +974,6 @@ inline hipStream_t ihipSyncAndResolveStream(hipStream_t stream) } -void -ihipStream_t::enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal) { - - // Obtain the write index for the command queue - uint64_t index = hsa_queue_load_write_index_relaxed(queue); - const uint32_t queueMask = queue->size - 1; - - // Define the barrier packet to be at the calculated queue index address - hsa_barrier_and_packet_t* barrier = &(((hsa_barrier_and_packet_t*)(queue->base_address))[index&queueMask]); - memset(barrier, 0, sizeof(hsa_barrier_and_packet_t)); - - // setup header - uint16_t header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE; - header |= 1 << HSA_PACKET_HEADER_BARRIER; - //header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; - //header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; - barrier->header = header; - - barrier->dep_signal[0] = depSignal->_hsa_signal; - - barrier->completion_signal.handle = 0; - - // TODO - check queue overflow, return error: - // Increment write index and ring doorbell to dispatch the kernel - hsa_queue_store_write_index_relaxed(queue, index+1); - hsa_signal_store_relaxed(queue->doorbell_signal, index); -} - - -//-- -//When the commands in a stream change types (ie kernel command follows a data command, -//or data command follows a kernel command), then we need to add a barrier packet -//into the stream to mimic CUDA stream semantics. (some hardware uses separate -//queues for data commands and kernel commands, and no implicit ordering is provided). -// -inline bool ihipStream_t::preKernelCommand() -{ - bool addedSync = false; - // If switching command types, we need to add a barrier packet to synchronize things. 
- if (_last_command_type != ihipCommandKernel) { - if (_last_copy_signal) { - addedSync = true; - - hsa_queue_t * q = (hsa_queue_t*)_av.get_hsa_queue(); - this->enqueueBarrier(q, _last_copy_signal); - - tprintf (TRACE_SYNC, "stream %p switch %s to %s (barrier pkt inserted)\n", - this, - ihipCommandName[_last_command_type], ihipCommandName[ihipCommandKernel]) - } - _last_command_type = ihipCommandKernel; - } - - return addedSync; -} - - -//--- -inline void ihipStream_t::postKernelCommand(hc::completion_future &kernelFuture) -{ - _last_kernel_future = kernelFuture; -}; - - - -//--- -// Called whenever a copy command is set to the stream. -// Examines the last command sent to this stream and returns a signal to wait on, if required. -inline int ihipStream_t::copyCommand(ihipSignal_t *lastCopy, hsa_signal_t *waitSignal, ihipCommand_t copyType) -{ - int needSync = 0; - // If switching command types, we need to add a barrier packet to synchronize things. - if (_last_command_type != copyType) { - needSync = 1; - - - if (_last_command_type == ihipCommandKernel) { - tprintf (TRACE_SYNC, "stream %p switch %s to %s (async copy dep on prev kernel)\n", - this, ihipCommandName[_last_command_type], ihipCommandName[copyType]); - hsa_signal_t *hsaSignal = (static_cast (_last_kernel_future.get_native_handle())); - if (hsaSignal) { - *waitSignal = * hsaSignal; - } - } else if (_last_copy_signal) { - tprintf (TRACE_SYNC, "stream %p switch %s to %s (async copy dep on other copy)\n", - this, ihipCommandName[_last_command_type], ihipCommandName[copyType]); - assert (_last_copy_signal->_ref_cnt > 0); - *waitSignal = _last_copy_signal->_hsa_signal; - } - - _last_command_type = copyType; - } - - _last_copy_signal = lastCopy; - - return needSync; -} @@ -906,6 +999,9 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, hc::accelerator_view **av) void ihipPostLaunchKernel(hipStream_t stream, hc::completion_future &kernelFuture) { stream->postKernelCommand(kernelFuture); + if 
(HIP_LAUNCH_BLOCKING) { + tprintf(TRACE_SYNC, " stream:%p LAUNCH_BLOCKING for kernel completion\n", stream); + } } @@ -1317,20 +1413,16 @@ hipError_t hipStreamDestroy(hipStream_t stream) hipError_t e = hipSuccess; - if (ihipIsValidDevice(stream->_device_index)) { - - ihipDevice_t *device = &g_devices[stream->_device_index]; + ihipDevice_t *device = stream->getDevice(); + if (device) { device->_streams.remove(stream); - delete stream; - - e = hipSuccess; } else { e = hipErrorInvalidResourceHandle; } - return ihipLogStatus(hipSuccess); + return ihipLogStatus(e); } @@ -1371,6 +1463,8 @@ hipError_t hipEventCreateWithFlags(hipEvent_t* event, unsigned flags) eh->_state = hipEventStatusCreated; eh->_stream = NULL; eh->_flags = flags; + eh->_timestamp = 0; + eh->_copy_seq_id = 0; } else { e = hipErrorInvalidValue; } @@ -1405,6 +1499,7 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream) // Clear timestamps eh->_timestamp = 0; eh->_marker = stream->_av.create_marker(); + eh->_copy_seq_id = stream->lastCopySeqId(); return ihipLogStatus(hipSuccess); } @@ -1452,6 +1547,8 @@ hipError_t hipEventSynchronize(hipEvent_t event) #else eh->_marker.wait(); #endif + eh->_stream->reclaimSignals(eh->_copy_seq_id); + return ihipLogStatus(hipSuccess); } } else { @@ -1636,7 +1733,7 @@ template hc::completion_future ihipMemcpyKernel(hipStream_t stream, T * c, const T * a, size_t sizeBytes) { - int wg = std::min((unsigned)8, g_devices[stream->_device_index]._compute_units); + int wg = std::min((unsigned)8, stream->getDevice()->_compute_units); const int threads_per_wg = 256; int threads = wg * threads_per_wg; @@ -1673,7 +1770,7 @@ template hc::completion_future ihipMemsetKernel(hipStream_t stream, T * ptr, T val, size_t sizeBytes) { - int wg = std::min((unsigned)8, g_devices[stream->_device_index]._compute_units); + int wg = std::min((unsigned)8, stream->getDevice()->_compute_units); const int threads_per_wg = 256; int threads = wg * threads_per_wg; @@ -1815,7 +1912,11 @@ 
StagingBuffer::~StagingBuffer() //--- -void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeBytes) +//Copies sizeBytes from src to dst, using either a copy to a staging buffer or a staged pin-in-place strategy +//IN: dst - dest pointer - must be accessible from agent this buffer is associated with (via _device). +//IN: src - src pointer for copy. Must be accessible from host CPU. +//IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency. +void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) { const char *srcp = static_cast (src); char *dstp = static_cast (dst); @@ -1830,10 +1931,10 @@ void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeByte size_t theseBytes = (bytesRemaining > _bufferSize) ? _bufferSize : bytesRemaining; - tprintf (TRACE_COPY2, "waiting... on completion signal\n"); + tprintf (TRACE_COPY2, "H2D: waiting... on completion signal handle=%lu\n", _completion_signal[bufferIndex].handle); hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - tprintf (TRACE_COPY2, "copy %zu bytes %p to stagingBuf[%d]:%p\n", theseBytes, srcp, bufferIndex, _pinnedStagingBuffer[bufferIndex]); + tprintf (TRACE_COPY2, "H2D: bytesRemaining=%zu: copy %zu bytes %p to stagingBuf[%d]:%p\n", bytesRemaining, theseBytes, srcp, bufferIndex, _pinnedStagingBuffer[bufferIndex]); // TODO - use uncached memcpy, someday. 
memcpy(_pinnedStagingBuffer[bufferIndex], srcp, theseBytes); @@ -1841,16 +1942,11 @@ void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeByte hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); #if USE_ROCR_V2 - hsa_signal_t depSignal; - int depSignalCnt = 0; //stream->copyCommand(_completion_signal[bufferIndex], &depSignal, copyType); - - printf ("need sync\n"); - - hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _device->_hsa_agent, _pinnedStagingBuffer[bufferIndex], _device->_hsa_agent, theseBytes, depSignalCnt, depSignalCnt ? &depSignal:0x0, _completion_signal[bufferIndex]); + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _device->_hsa_agent, _pinnedStagingBuffer[bufferIndex], _device->_hsa_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); #else hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _pinnedStagingBuffer[bufferIndex], theseBytes, _device->_hsa_agent, 0, NULL, _completion_signal[bufferIndex]); #endif - tprintf (TRACE_COPY2, "async_copy %zu bytes %p to %p status=%x\n", theseBytes, _pinnedStagingBuffer[bufferIndex], dstp, hsa_status); + tprintf (TRACE_COPY2, "H2D: bytesRemaining=%zu: async_copy %zu bytes %p to %p status=%x\n", bytesRemaining, theseBytes, _pinnedStagingBuffer[bufferIndex], dstp, hsa_status); assert(hsa_status == HSA_STATUS_SUCCESS); // TODO - throw @@ -1859,6 +1955,10 @@ void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeByte if (++bufferIndex >= _numBuffers) { bufferIndex = 0; } + + if (HIP_ONESHOT_COPY_DEP) { + waitFor = NULL; // TODO - don't need dependency after first copy submitted? 
+ } } @@ -1868,7 +1968,11 @@ void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeByte } //--- -void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeBytes) +//Copies sizeBytes from src to dst, using either a copy to a staging buffer or a staged pin-in-place strategy +//IN: dst - dest pointer - must be accessible from host CPU. +//IN: src - src pointer for copy. Must be accessible from agent this buffer is associated with (via _device). +//IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency. +void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) { const char *srcp0 = static_cast (src); char *dstp1 = static_cast (dst); @@ -1888,22 +1992,21 @@ void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeByte size_t theseBytes = (bytesRemaining0 > _bufferSize) ? _bufferSize : bytesRemaining0; - tprintf (TRACE_COPY2, "D2H: async_copy %zu bytes src:%p to staging:%p\n", theseBytes, srcp0, _pinnedStagingBuffer[bufferIndex]); + tprintf (TRACE_COPY2, "D2H: bytesRemaining0=%zu async_copy %zu bytes src:%p to staging:%p\n", bytesRemaining0, theseBytes, srcp0, _pinnedStagingBuffer[bufferIndex]); hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); #if USE_ROCR_V2 - hsa_signal_t depSignal; - // TODO - int depSignalCnt = 0; //stream->copyCommand(_completion_signal[bufferIndex], &depSignal, copyType); - - printf ("need sync\n"); - - hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], _device->_hsa_agent, srcp0, _device->_hsa_agent, theseBytes, depSignalCnt, depSignalCnt ? &depSignal:0, _completion_signal[bufferIndex]); + hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], _device->_hsa_agent, srcp0, _device->_hsa_agent, theseBytes, waitFor ? 
1:0, waitFor, _completion_signal[bufferIndex]); #else hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], srcp0, theseBytes, _device->_hsa_agent, 0, NULL, _completion_signal[bufferIndex]); #endif assert(hsa_status == HSA_STATUS_SUCCESS); // TODO - throw srcp0 += theseBytes; + + + if (HIP_ONESHOT_COPY_DEP) { + waitFor = NULL; // TODO - don't need dependency after first copy submitted? + } } // Now unload the staging buffers: @@ -1914,7 +2017,7 @@ void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeByte tprintf (TRACE_COPY2, "D2H: wait_completion[%d] bytesRemaining=%zu\n", bufferIndex, bytesRemaining1); hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - tprintf (TRACE_COPY2, "D2H: copy %zu bytes stagingBuf[%d]:%p to dst:%p\n", theseBytes, bufferIndex, _pinnedStagingBuffer[bufferIndex], dstp1); + tprintf (TRACE_COPY2, "D2H: bytesRemaining1=%zu copy %zu bytes stagingBuf[%d]:%p to dst:%p\n", bytesRemaining1, theseBytes, bufferIndex, _pinnedStagingBuffer[bufferIndex], dstp1); memcpy(dstp1, _pinnedStagingBuffer[bufferIndex], theseBytes); dstp1 += theseBytes; @@ -1931,8 +2034,14 @@ void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeByte #if USE_AM_TRACKER -void ihipSyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) +void ihipSyncCopy(ihipStream_t *stream, void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) { + ihipDevice_t *device = stream->getDevice(); + + if (device == NULL) { + throw; + } + hc::accelerator acc; hc::AmPointerInfo dstPtrInfo(NULL, NULL, 0, acc, 0, 0); hc::AmPointerInfo srcPtrInfo(NULL, NULL, 0, acc, 0, 0); @@ -1940,9 +2049,9 @@ void ihipSyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t sizeB bool dstNotTracked = (hc::am_memtracker_getinfo(&dstPtrInfo, dst) != AM_SUCCESS); bool srcNotTracked = 
(hc::am_memtracker_getinfo(&srcPtrInfo, src) != AM_SUCCESS); - bool useStagingBuffer = true; // TODO - remove when new copy bakes a bit. + bool useStagingBuffer = true; - // Resolve default to a specific Kind, since we use different algorithms: + // Resolve default to a specific Kind so we know which algorithm to use: if (kind == hipMemcpyDefault) { bool dstIsHost = (dstNotTracked || !dstPtrInfo._isInDeviceMem); bool srcIsHost = (srcNotTracked || !srcPtrInfo._isInDeviceMem); @@ -1957,26 +2066,29 @@ void ihipSyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t sizeB } } -#if 0 - - //TODO - hsa_signal_t depSignal; - int dep_signals = stream->commandCopy(&depSignal, ); - pass to CopyHostToDevice -#endif if ((kind == hipMemcpyHostToDevice) && (srcNotTracked)) { if (useStagingBuffer) { std::lock_guard l (device->_copy_lock[0]); - device->_staging_buffer[0]->CopyHostToDevice(dst, src, sizeBytes); + //printf ("staged-copy- read dep signals\n"); + + hsa_signal_t depSignal; + int depSignalCnt = stream->copyCommand(NULL, &depSignal, ihipCommandCopyH2D); + device->_staging_buffer[0]->CopyHostToDevice(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); + + // The copy waits for inputs and then completes before returning. + stream->resetToEmpty(); } else { // TODO - remove, slow path. hc::am_copy(dst, src, sizeBytes); } } else if ((kind == hipMemcpyDeviceToHost) && (dstNotTracked)) { if (useStagingBuffer) { - std::lock_guard l (device->_copy_lock[1]); - device->_staging_buffer[1]->CopyDeviceToHost(dst, src, sizeBytes); + std::lock_guard l (device->_copy_lock[HIP_DISABLE_BIDIR_MEMCPY ? 0:1]); + //printf ("staged-copy- read dep signals\n"); + hsa_signal_t depSignal; + int depSignalCnt = stream->copyCommand(NULL, &depSignal, ihipCommandCopyD2H); + device->_staging_buffer[1]->CopyDeviceToHost(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); } else { // TODO - remove, slow path. 
hc::am_copy(dst, src, sizeBytes); @@ -1988,7 +2100,7 @@ void ihipSyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t sizeB // Let HSA runtime handle it: // TODO - need buffer pool for the signals: - device->_copy_lock[1].lock(); + device->_copy_lock[HIP_DISABLE_BIDIR_MEMCPY? 0:1].lock(); hsa_signal_store_relaxed(device->_copy_signal, 1); #if USE_ROCR_V2 @@ -2001,7 +2113,7 @@ void ihipSyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t sizeB hsa_signal_wait_relaxed(device->_copy_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); } - device->_copy_lock[1].unlock(); + device->_copy_lock[HIP_DISABLE_BIDIR_MEMCPY ? 0:1].unlock(); } } @@ -2017,17 +2129,13 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind hc::completion_future marker; - hipError_t e = hipSuccess; #if USE_AM_TRACKER - if (ihipIsValidDevice(stream->_device_index)) { - - ihipDevice_t *device = &g_devices[stream->_device_index]; - - ihipSyncCopy(device, dst, src, sizeBytes, kind); - - } else { + try { + ihipSyncCopy(stream, dst, src, sizeBytes, kind); + } + catch (...) { e = hipErrorInvalidResourceHandle; } @@ -2046,6 +2154,9 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind * @warning on HCC hipMemcpyAsync uses a synchronous copy. 
*/ #endif +/** + * @result #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidMemcpyDirection, #hipErrorInvalidValue + */ //--- hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream) { @@ -2059,7 +2170,10 @@ hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcp if (stream) { ihipDevice_t *device = stream->getDevice(); - if (kind == hipMemcpyDefault) { + if (device == NULL) { + e = hipErrorInvalidDevice; + + } else if (kind == hipMemcpyDefault) { e = hipErrorInvalidMemcpyDirection; } else if (kind == hipMemcpyHostToHost) { @@ -2069,6 +2183,7 @@ hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcp } else { ihipSignal_t *ihip_signal = stream->getSignal(); + hsa_signal_store_relaxed(ihip_signal->_hsa_signal, 1); ihipCommand_t copyType; if ((kind == hipMemcpyHostToDevice) || (kind == hipMemcpyDeviceToDevice)) { @@ -2084,7 +2199,7 @@ hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcp hsa_signal_t depSignal; int depSignalCnt = stream->copyCommand(ihip_signal, &depSignal, copyType); - tprintf (TRACE_SYNC, " copy-async, waitFor=%d(%lu) completion=%lu\n", depSignalCnt, depSignal.handle, ihip_signal->_seq_id); + tprintf (TRACE_SYNC, " copy-async, waitFor=%lu completion=#%lu(%lu)\n", depSignalCnt? depSignal.handle:0x0, ihip_signal->_sig_id, ihip_signal->_hsa_signal.handle); hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, device->_hsa_agent, src, device->_hsa_agent, sizeBytes, depSignalCnt, depSignalCnt ? &depSignal:0x0, ihip_signal->_hsa_signal); #else @@ -2093,15 +2208,14 @@ hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcp if (hsa_status == HSA_STATUS_SUCCESS) { + // TODO-stream - fix release-signal calls here. 
if (HIP_LAUNCH_BLOCKING) { - hsa_signal_wait_relaxed(ihip_signal->_hsa_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - stream->releaseSignal(ihip_signal); - } else { - //stream->releaseSignal(ihip_signal); - } + tprintf(TRACE_SYNC, "LAUNCH_BLOCKING for completion of hipMemcpyAsync(%zu)\n", sizeBytes); + stream->wait(); + } } else { // This path can be hit if src or dst point to unpinned host memory. - // TODO - does async-copy fall back to sync if input pointers are not pinned? + // TODO-stream - does async-copy fall back to sync if input pointers are not pinned? e = hipErrorInvalidValue; } } diff --git a/projects/hip/tests/src/hipMemcpyAsync.cpp b/projects/hip/tests/src/hipMemcpyAsync.cpp index 8669b986d8..4b92e2fc1e 100644 --- a/projects/hip/tests/src/hipMemcpyAsync.cpp +++ b/projects/hip/tests/src/hipMemcpyAsync.cpp @@ -31,15 +31,129 @@ void simpleNegTest() //HIPASSERT (e==hipErrorInvalidValue); } +class Pinned; +class Unpinned; + +template struct HostTraits; + +template<> +struct HostTraits +{ + static const char *Name() { return "Pinned"; } ; + + static void *Alloc(size_t sizeBytes) { + void *p; + HIPCHECK(hipMallocHost(&p, sizeBytes)); + return p; + }; +}; + + +template +__global__ void +addK (hipLaunchParm lp, T *A, T K, size_t numElements) +{ + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; + + for (size_t i=offset; i +void test_pingpong(hipStream_t stream, size_t numElements, int numInflight, int numPongs, bool doHostSide) +{ + HIPASSERT(numElements % numInflight == 0); // Must be evenly divisible. 
+ size_t Nbytes = numElements*sizeof(T); + size_t eachCopyElements = numElements / numInflight; + size_t eachCopyBytes = eachCopyElements * sizeof(T); + + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); + + printf ("-----------------------------------------------------------------------------------------------\n"); + printf ("testing: %s<%s> Nbytes=%zu (%6.1f MB) numPongs=%d numInflight=%d eachCopyElements=%zu eachCopyBytes=%zu\n", + __func__, HostTraits::Name(), Nbytes, (double)(Nbytes)/1024.0/1024.0, numPongs, numInflight, eachCopyElements, eachCopyBytes); + + T *A_h; + T *A_d; + + A_h = (T*)(HostTraits::Alloc(Nbytes)); + HIPCHECK(hipMalloc(&A_d, Nbytes)); + + // Initialize the host array: + const T initValue = 13; + const T deviceConst = 2; + const T hostConst = 10000; + for (size_t i=0; i, dim3(blocks), dim3(threadsPerBlock), 0, stream, A_d, 2, numElements); + + for (int i=0; i (i); } - hipStream_t stream; - HIPCHECK (hipStreamCreate(&stream)); - //stream=0; // fixme TODO for (int i=0; i(1024, 16); - test_manyCopies(1024, 4); - test_manyCopies(1024*4, 64); + if (p_tests & 0x02) { + hipStream_t stream; + HIPCHECK (hipStreamCreate(&stream)); + + test_manyInflightCopies(stream, 1024, 16, true); + test_manyInflightCopies(stream, 1024, 4, true); // verify we re-use the same entries instead of growing pool. 
+ test_manyInflightCopies(stream, 1024*8, 64, false); + + HIPCHECK(hipStreamDestroy(stream)); } - if (p_tests & 0x4) { + if (p_tests & 0x04) { test_chunkedAsyncExample(p_streams, true, true, true); // Easy sync version test_chunkedAsyncExample(p_streams, false, true, true); // Easy sync version test_chunkedAsyncExample(p_streams, false, false, true); // Some async test_chunkedAsyncExample(p_streams, false, false, false); // All async } + if (p_tests & 0x08) { + hipStream_t stream; + HIPCHECK (hipStreamCreate(&stream)); + + test_pingpong(stream, 1024*1024*32, 1, 1, false); + test_pingpong(stream, 1024*1024*32, 1, 10, false); + + HIPCHECK(hipStreamDestroy(stream)); + } passed(); diff --git a/projects/hip/tests/src/test_common.h b/projects/hip/tests/src/test_common.h index f133696d78..1bf89f1604 100644 --- a/projects/hip/tests/src/test_common.h +++ b/projects/hip/tests/src/test_common.h @@ -88,7 +88,7 @@ vectorADD(hipLaunchParm lp, size_t stride = hipBlockDim_x * hipGridDim_x ; for (size_t i=offset; i Date: Tue, 23 Feb 2016 04:05:41 -0600 Subject: [PATCH 67/94] add hipLaunchParm [ROCm/hip commit: 9c259bb86c238d8ec79845908a9b474901cfd453] --- projects/hip/util/vim/hip.vim | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/hip/util/vim/hip.vim b/projects/hip/util/vim/hip.vim index e4ea0a4a9e..34b111a42f 100644 --- a/projects/hip/util/vim/hip.vim +++ b/projects/hip/util/vim/hip.vim @@ -116,8 +116,8 @@ syn keyword hipFunctionName hipGLRegisterBufferObject syn keyword hipFunctionName hipGLSetGLDevice syn keyword hipFunctionName hipGLUnmapBufferObject syn keyword hipFunctionName hipGLUnregisterBufferObject -syn keyword hipFunctionName hipLaunch syn keyword hipFunctionName hipLaunchKernel +syn keyword hipFunctionName hipLaunchParm syn keyword hipFunctionName hipMalloc syn keyword hipFunctionName hipMalloc3D syn keyword hipFunctionName hipMalloc3DArray From 1888acb5f3318edc93dc5036fbeff2c45f6160ad Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 23 
Feb 2016 04:07:05 -0600 Subject: [PATCH 68/94] Sync review. - add calls to ihipInit missing from some routines. - sync before draining a stream. [ROCm/hip commit: 3886d494f40578f2c3e9bc65eeb70b9718b31ee6] --- projects/hip/src/hip_hcc.cpp | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 3ba578d52c..f01579ae71 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -65,7 +65,6 @@ THE SOFTWARE. //static const int debug = 0; static const int release = 1; -#define ENABLE_CHECKS 1 int HIP_LAUNCH_BLOCKING = 0; @@ -1413,6 +1412,15 @@ hipError_t hipStreamDestroy(hipStream_t stream) hipError_t e = hipSuccess; + //--- Drain the stream: + if (stream == NULL) { + ihipDevice_t *device = ihipGetTlsDefaultDevice(); + ihipWaitNullStream(device); + } else { + stream->wait(); + e = hipSuccess; + } + ihipDevice_t *device = stream->getDevice(); if (device) { @@ -1862,6 +1870,8 @@ hipError_t hipMallocHost(void** ptr, size_t sizeBytes) //--- hipError_t hipMemcpyToSymbol(const char* symbolName, const void *src, size_t count, size_t offset, hipMemcpyKind kind) { + std::call_once(hip_initialized, ihipInit); + #ifdef USE_MEMCPYTOSYMBOL if(kind != hipMemcpyHostToDevice) { @@ -2288,6 +2298,8 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s hipError_t hipMemset(void* dst, int value, size_t sizeBytes ) { + std::call_once(hip_initialized, ihipInit); + // TODO - call an ihip memset so HIP_TRACE is correct. 
return hipMemsetAsync(dst, value, sizeBytes, hipStreamNull); } From 60552f5133244437f811749516c05876bd164ca5 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 23 Feb 2016 12:08:22 -0600 Subject: [PATCH 69/94] Add tests for multi-threaded streams [ROCm/hip commit: 7090f5c3f91642c0cda29ad60f37da843e1b84de] --- projects/hip/src/hip_hcc.cpp | 2 + .../hip/tests/src/hipMultiThreadStreams.cpp | 272 ++++++++++++++++++ 2 files changed, 274 insertions(+) create mode 100644 projects/hip/tests/src/hipMultiThreadStreams.cpp diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 8a0d0df1d4..2085ccbb19 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -1183,6 +1183,8 @@ hipError_t hipDeviceReset(void) } #endif + // TODO - reset all streams on the device. + return ihipLogStatus(hipSuccess); } diff --git a/projects/hip/tests/src/hipMultiThreadStreams.cpp b/projects/hip/tests/src/hipMultiThreadStreams.cpp new file mode 100644 index 0000000000..f9bde2df9f --- /dev/null +++ b/projects/hip/tests/src/hipMultiThreadStreams.cpp @@ -0,0 +1,272 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#include "hip_runtime.h" +#include "test_common.h" + + +void printSep() +{ + printf ("======================================================================================\n"); +} + +//--- +// Test simple H2D copies and back. +// Designed to stress a small number of simple smoke tests +void simpleTest1() +{ + printf ("test: %s\n", __func__); + size_t Nbytes = N*sizeof(int); + printf ("N=%zu Nbytes=%6.2fMB\n", N, Nbytes/1024.0/1024.0); + + int *A_d, *B_d, *C_d; + int *A_h, *B_h, *C_h; + + HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); + + printf ("A_d=%p B_d=%p C_d=%p A_h=%p B_h=%p C_h=%p\n", A_d, B_d, C_d, A_h, B_d, C_h); + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); + + HIPCHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); + + hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N); + + HIPCHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); + + HIPCHECK (hipDeviceSynchronize()); + + HipTest::checkVectorADD(A_h, B_h, C_h, N); + + HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, false); + HIPCHECK (hipDeviceReset()); + + printf (" %s success\n", __func__); +} + + +#ifdef __HIP_PLATFORM_HCC +#define TYPENAME(T) typeid(T).name() +#else +#define TYPENAME(T) "?" +#endif + + +//--- +// Test many different kinds of memory copies. +// THe subroutine allocates memory , copies to device, runs a vector add kernel, copies back, and checks the result. +// +// IN: numElements controls the number of elements used for allocations. 
+// IN: usePinnedHost : If true, allocate host with hipMallocHost and is pinned ; else allocate host memory with malloc. +// IN: useHostToHost : If true, add an extra host-to-host copy. +// IN: useDeviceToDevice : If true, add an extra deviceto-device copy after result is produced. +// IN: useMemkindDefault : If true, use memkinddefault (runtime figures out direction). if false, use explicit memcpy direction. +// +template +void memcpytest2(size_t numElements, bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault) +{ + size_t sizeElements = numElements * sizeof(T); + printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d\n", + __func__, + TYPENAME(T), + sizeElements, sizeElements/1024.0/1024.0, + usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); + + + T *A_d, *B_d, *C_d; + T *A_h, *B_h, *C_h; + + + HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, numElements, usePinnedHost); + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); + + T *A_hh = NULL; + T *B_hh = NULL; + T *C_dd = NULL; + + + + if (useHostToHost) { + if (usePinnedHost) { + HIPCHECK ( hipMallocHost(&A_hh, sizeElements) ); + HIPCHECK ( hipMallocHost(&B_hh, sizeElements) ); + } else { + A_hh = (T*)malloc(sizeElements); + B_hh = (T*)malloc(sizeElements); + } + + + // Do some extra host-to-host copies here to mix things up: + HIPCHECK ( hipMemcpy(A_hh, A_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); + HIPCHECK ( hipMemcpy(B_hh, B_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); + + + HIPCHECK ( hipMemcpy(A_d, A_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(B_d, B_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + } else { + HIPCHECK ( hipMemcpy(A_d, A_h, sizeElements, useMemkindDefault ? 
hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(B_d, B_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + } + + hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements); + + if (useDeviceToDevice) { + HIPCHECK ( hipMalloc(&C_dd, sizeElements) ); + + // Do an extra device-to-device copies here to mix things up: + HIPCHECK ( hipMemcpy(C_dd, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice)); + + //Destroy the original C_d: + HIPCHECK ( hipMemset(C_d, 0x5A, sizeElements)); + + HIPCHECK ( hipMemcpy(C_h, C_dd, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); + } else { + HIPCHECK ( hipMemcpy(C_h, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); + } + + HIPCHECK ( hipDeviceSynchronize() ); + HipTest::checkVectorADD(A_h, B_h, C_h, numElements); + + HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, usePinnedHost); + + printf (" %s success\n", __func__); +} + + +//--- +//Try all the 16 possible combinations to memcpytest2 - usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault +template +void memcpytest2_loop(size_t numElements) +{ + printSep(); + + for (int usePinnedHost =0; usePinnedHost<=1; usePinnedHost++) { + for (int useHostToHost =0; useHostToHost<=1; useHostToHost++) { // TODO + for (int useDeviceToDevice =0; useDeviceToDevice<=1; useDeviceToDevice++) { + for (int useMemkindDefault =0; useMemkindDefault<=1; useMemkindDefault++) { + memcpytest2(numElements, usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); + } + } + } + } +} + + +//--- +//Try many different sizes to memory copy. 
+template +void memcpytest2_sizes(size_t maxElem=0, size_t offset=0) +{ + printSep(); + printf ("test: %s<%s>\n", __func__, TYPENAME(T)); + + int deviceId; + HIPCHECK(hipGetDevice(&deviceId)); + + size_t free, total; + HIPCHECK(hipMemGetInfo(&free, &total)); + + if (maxElem == 0) { + maxElem = free/sizeof(T)/5; + } + + printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB offset=%lu\n", + deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0, offset); + + for (size_t elem=64; elem+offset<=maxElem; elem*=2) { + HIPCHECK ( hipDeviceReset() ); + memcpytest2(elem+offset, 0, 1, 1, 0); // unpinned host + HIPCHECK ( hipDeviceReset() ); + memcpytest2(elem+offset, 1, 1, 1, 0); // pinned host + } +} + + +//--- +//Create multiple threads to stress multi-thread locking behavior in the allocation/deallocation/tracking logic: +template +void multiThread_1(bool serialize, bool usePinnedHost) +{ + printSep(); + printf ("test: %s<%s> serialize=%d usePinnedHost=%d\n", __func__, TYPENAME(T), serialize, usePinnedHost); + std::thread t1 (memcpytest2,N, usePinnedHost,0,0,0); + if (serialize) { + t1.join(); + } + + + std::thread t2 (memcpytest2,N, usePinnedHost,0,0,0); + if (serialize) { + t2.join(); + } + + if (!serialize) { + t1.join(); + t2.join(); + } +} + + + +int main(int argc, char *argv[]) +{ + HipTest::parseStandardArguments(argc, argv, true); + + printf ("info: set device to %d\n", p_gpuDevice); + HIPCHECK(hipSetDevice(p_gpuDevice)); + + + if (p_tests & 0x1) { + HIPCHECK ( hipDeviceReset() ); + simpleTest1(); + } + + if (p_tests & 0x2) { + HIPCHECK ( hipDeviceReset() ); + memcpytest2_loop(N); + memcpytest2_loop(N); + memcpytest2_loop(N); + memcpytest2_loop(N); + } + + if (p_tests & 0x4) { + HIPCHECK ( hipDeviceReset() ); + printSep(); + memcpytest2_sizes(0,0); + printSep(); + memcpytest2_sizes(0,64); + printSep(); + memcpytest2_sizes(1024*1024, 13); + printSep(); + 
memcpytest2_sizes(1024*1024, 50); + } + + if (p_tests & 0x8) { + HIPCHECK ( hipDeviceReset() ); + printSep(); + multiThread_1(true, true); + multiThread_1(false, true); + multiThread_1(false, false); // TODO + } + + passed(); + +} From 55e5190b57225a03f5dd5091a140e17751455f0c Mon Sep 17 00:00:00 2001 From: Aditya Avinash Atluri Date: Tue, 23 Feb 2016 13:40:20 -0600 Subject: [PATCH 70/94] Update hip.vim [ROCm/hip commit: 4851cedf6c1e1556c08b4523181a45a7e3dbad38] --- projects/hip/util/vim/hip.vim | 3 +++ 1 file changed, 3 insertions(+) diff --git a/projects/hip/util/vim/hip.vim b/projects/hip/util/vim/hip.vim index 01f3b3f2ad..5264461ce9 100644 --- a/projects/hip/util/vim/hip.vim +++ b/projects/hip/util/vim/hip.vim @@ -23,6 +23,7 @@ syn keyword hipKeyword hipThreadIdx_x hipThreadIdx_y hipThreadIdx_z syn keyword hipKeyword hipBlockDim_x hipBlockDim_y hipBlockDim_z syn keyword hipKeyword hipBlockIdx_x hipBlockIdx_y hipBlockIdx_z syn keyword hipKeyword hipGridIdx_x hipGridIdx_y hipGridIdx_z +syn keyword hipKeyword hipGridDim_x hipGridDim_y hipGridDim_z syn keyword hipType uint uint1 uint2 uint3 uint4 syn keyword hipType int1 int2 int3 int4 @@ -31,6 +32,7 @@ syn keyword hipType char1 char2 char3 char4 syn keyword hipType uchar1 uchar2 uchar3 uchar4 syn keyword hipType short1 short2 short3 short4 syn keyword hipType dim1 dim2 dim3 dim4 +syn keyword hipType hipLaunchParm " Atomic functions syn keyword hipFunctionName atomicAdd atomicAnd atomicCAS atomicDec atomicExch @@ -124,6 +126,7 @@ syn keyword hipFunctionName hipMallocArray syn keyword hipFunctionName hipMallocHost syn keyword hipFunctionName hipMallocPitch syn keyword hipFunctionName hipMemcpy +syn keyword hipFunctionName hipMemcpyAsync syn keyword hipFunctionName hipMemcpy2D syn keyword hipFunctionName hipMemcpy2DArrayToArray syn keyword hipFunctionName hipMemcpy2DFromArray From 1707760bbf299c4afc0d7da2a84607d280ed596a Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 25 Feb 2016 04:21:24 -0600 Subject: [PATCH 
71/94] Add abstraction for pinned/unpinned, and sync/async mem copies selection in tests [ROCm/hip commit: ecec7e36d90fe281c9f023c2c3c3fb7e5dff4930] --- projects/hip/tests/src/test_common.h | 80 +++++++++++++++++++++++++++- 1 file changed, 78 insertions(+), 2 deletions(-) diff --git a/projects/hip/tests/src/test_common.h b/projects/hip/tests/src/test_common.h index 1bf89f1604..e37eec7e86 100644 --- a/projects/hip/tests/src/test_common.h +++ b/projects/hip/tests/src/test_common.h @@ -16,6 +16,16 @@ #define KCYN "\x1B[36m" #define KWHT "\x1B[37m" + + +#ifdef __HIP_PLATFORM_HCC +#define TYPENAME(T) typeid(T).name() +#else +#define TYPENAME(T) "?" +#endif + + + #define passed() \ printf ("%sPASSED!%s\n",KGRN, KNRM);\ exit(0); @@ -82,12 +92,12 @@ vectorADD(hipLaunchParm lp, const T *A_d, const T *B_d, T *C_d, - size_t N) + size_t NELEM) { size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); size_t stride = hipBlockDim_x * hipGridDim_x ; - for (size_t i=offset; i struct MemTraits; + + +template<> +struct MemTraits +{ + + static void Copy(void *dest, const void *src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream) + { + HIPCHECK(hipMemcpy(dest, src, sizeBytes, kind)); + } +}; + + +template<> +struct MemTraits +{ + + static void Copy(void *dest, const void *src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream) + { + HIPCHECK(hipMemcpyAsync(dest, src, sizeBytes, kind, stream)); + } +}; + }; // namespace HipTest From ecdb33dee15ceddd3683944a0301a78194ff5fd3 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 25 Feb 2016 04:22:34 -0600 Subject: [PATCH 72/94] Add test for thread-safety on streams [ROCm/hip commit: 7e45addbee2f88e7ef2013444651577a8af7046a] --- projects/hip/tests/src/CMakeLists.txt | 2 + projects/hip/tests/src/hipMemcpy.cpp | 5 - .../hip/tests/src/hipMultiThreadStreams.cpp | 240 ++++-------------- 3 files changed, 57 insertions(+), 190 deletions(-) diff --git a/projects/hip/tests/src/CMakeLists.txt 
b/projects/hip/tests/src/CMakeLists.txt index c1582ecf88..cc6af0b5d2 100644 --- a/projects/hip/tests/src/CMakeLists.txt +++ b/projects/hip/tests/src/CMakeLists.txt @@ -122,6 +122,7 @@ make_hip_executable (hipMathFunctionsHost hipMathFunctions.cpp hipSinglePrecisio make_hip_executable (hipMathFunctionsDevice hipMathFunctions.cpp hipSinglePrecisionMathDevice.cpp hipDoublePrecisionMathDevice.cpp) make_hip_executable (hipIntrinsics hipMathFunctions.cpp hipSinglePrecisionIntrinsics.cpp hipDoublePrecisionIntrinsics.cpp hipIntegerIntrinsics.cpp) make_hip_executable (hipPointerAttrib hipPointerAttrib.cpp) +make_hip_executable (hipMultiThreadStreams hipMultiThreadStreams.cpp) target_link_libraries(hipMathFunctionsHost m) make_test(hip_ballot " " ) @@ -137,6 +138,7 @@ make_test(hipMemset --N 10013 --memsetval 0x5a ) # oddball size. make_test(hipMemset --N 256M --memsetval 0xa6 ) # big copy make_test(hipGridLaunch " " ) make_test(hipPointerAttrib " " ) +make_test(hipMultiThreadStreams " " ) make_test(hipMemcpy " " ) make_test(hipMemcpyAsync " " ) diff --git a/projects/hip/tests/src/hipMemcpy.cpp b/projects/hip/tests/src/hipMemcpy.cpp index f9bde2df9f..1d4efcbc3f 100644 --- a/projects/hip/tests/src/hipMemcpy.cpp +++ b/projects/hip/tests/src/hipMemcpy.cpp @@ -63,11 +63,6 @@ void simpleTest1() } -#ifdef __HIP_PLATFORM_HCC -#define TYPENAME(T) typeid(T).name() -#else -#define TYPENAME(T) "?" -#endif //--- diff --git a/projects/hip/tests/src/hipMultiThreadStreams.cpp b/projects/hip/tests/src/hipMultiThreadStreams.cpp index f9bde2df9f..a3dd94e077 100644 --- a/projects/hip/tests/src/hipMultiThreadStreams.cpp +++ b/projects/hip/tests/src/hipMultiThreadStreams.cpp @@ -23,6 +23,7 @@ THE SOFTWARE. #include "test_common.h" + void printSep() { printf ("======================================================================================\n"); @@ -31,189 +32,63 @@ void printSep() //--- // Test simple H2D copies and back. 
// Designed to stress a small number of simple smoke tests -void simpleTest1() + +template< + typename T=float, + class P=HipTest::Unpinned, + class C=HipTest::Memcpy +> +void simpleVectorCopy(size_t numElements, int iters, hipStream_t stream) { - printf ("test: %s\n", __func__); - size_t Nbytes = N*sizeof(int); - printf ("N=%zu Nbytes=%6.2fMB\n", N, Nbytes/1024.0/1024.0); + using HipTest::MemTraits; - int *A_d, *B_d, *C_d; - int *A_h, *B_h, *C_h; - - HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); - - printf ("A_d=%p B_d=%p C_d=%p A_h=%p B_h=%p C_h=%p\n", A_d, B_d, C_d, A_h, B_d, C_h); - unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); - - HIPCHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); - HIPCHECK ( hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); - - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N); - - HIPCHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); - - HIPCHECK (hipDeviceSynchronize()); - - HipTest::checkVectorADD(A_h, B_h, C_h, N); - - HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, false); - HIPCHECK (hipDeviceReset()); - - printf (" %s success\n", __func__); -} - - -#ifdef __HIP_PLATFORM_HCC -#define TYPENAME(T) typeid(T).name() -#else -#define TYPENAME(T) "?" -#endif - - -//--- -// Test many different kinds of memory copies. -// THe subroutine allocates memory , copies to device, runs a vector add kernel, copies back, and checks the result. -// -// IN: numElements controls the number of elements used for allocations. -// IN: usePinnedHost : If true, allocate host with hipMallocHost and is pinned ; else allocate host memory with malloc. -// IN: useHostToHost : If true, add an extra host-to-host copy. -// IN: useDeviceToDevice : If true, add an extra deviceto-device copy after result is produced. -// IN: useMemkindDefault : If true, use memkinddefault (runtime figures out direction). if false, use explicit memcpy direction. 
-// -template -void memcpytest2(size_t numElements, bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault) -{ - size_t sizeElements = numElements * sizeof(T); - printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d\n", - __func__, - TYPENAME(T), - sizeElements, sizeElements/1024.0/1024.0, - usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); + std::thread::id pid = std::this_thread::get_id(); + printf ("test: %s <%s> %s %s\n", __func__, TYPENAME(T), P::str(), C::str()); + size_t Nbytes = numElements*sizeof(T); + printf ("numElements=%zu Nbytes=%6.2fMB\n", numElements, Nbytes/1024.0/1024.0); T *A_d, *B_d, *C_d; T *A_h, *B_h, *C_h; - - HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, numElements, usePinnedHost); - unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); - - T *A_hh = NULL; - T *B_hh = NULL; - T *C_dd = NULL; + HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, P::isPinned); + for (int i=0; i::Copy(A_d, A_h, Nbytes, hipMemcpyHostToDevice, stream); + MemTraits::Copy(B_d, B_h, Nbytes, hipMemcpyHostToDevice, stream); - // Do some extra host-to-host copies here to mix things up: - HIPCHECK ( hipMemcpy(A_hh, A_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); - HIPCHECK ( hipMemcpy(B_hh, B_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); + hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements); + MemTraits::Copy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, stream); - HIPCHECK ( hipMemcpy(A_d, A_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); - HIPCHECK ( hipMemcpy(B_d, B_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); - } else { - HIPCHECK ( hipMemcpy(A_d, A_h, sizeElements, useMemkindDefault ? 
hipMemcpyDefault : hipMemcpyHostToDevice)); - HIPCHECK ( hipMemcpy(B_d, B_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); - } + HIPCHECK (hipDeviceSynchronize()); - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements); + HipTest::checkVectorADD(A_h, B_h, C_h, numElements); + } - if (useDeviceToDevice) { - HIPCHECK ( hipMalloc(&C_dd, sizeElements) ); + HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, P::isPinned); + HIPCHECK (hipDeviceSynchronize()); - // Do an extra device-to-device copies here to mix things up: - HIPCHECK ( hipMemcpy(C_dd, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice)); - - //Destroy the original C_d: - HIPCHECK ( hipMemset(C_d, 0x5A, sizeElements)); - - HIPCHECK ( hipMemcpy(C_h, C_dd, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); - } else { - HIPCHECK ( hipMemcpy(C_h, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); - } - - HIPCHECK ( hipDeviceSynchronize() ); - HipTest::checkVectorADD(A_h, B_h, C_h, numElements); - - HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, usePinnedHost); - - printf (" %s success\n", __func__); + std::cout <<" pid" << pid << " success\n"; } - -//--- -//Try all the 16 possible combinations to memcpytest2 - usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault -template -void memcpytest2_loop(size_t numElements) +template +void test_multiThread_1(std::string testName, hipStream_t stream0, hipStream_t stream1, bool serialize) { - printSep(); + printSep(); + printf ("%s\n", __func__); + std::cout << testName << std::endl; - for (int usePinnedHost =0; usePinnedHost<=1; usePinnedHost++) { - for (int useHostToHost =0; useHostToHost<=1; useHostToHost++) { // TODO - for (int useDeviceToDevice =0; useDeviceToDevice<=1; useDeviceToDevice++) { - for (int useMemkindDefault =0; useMemkindDefault<=1; useMemkindDefault++) { - 
memcpytest2(numElements, usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); - } - } - } - } -} - - -//--- -//Try many different sizes to memory copy. -template -void memcpytest2_sizes(size_t maxElem=0, size_t offset=0) -{ - printSep(); - printf ("test: %s<%s>\n", __func__, TYPENAME(T)); - - int deviceId; - HIPCHECK(hipGetDevice(&deviceId)); - - size_t free, total; - HIPCHECK(hipMemGetInfo(&free, &total)); - - if (maxElem == 0) { - maxElem = free/sizeof(T)/5; - } - - printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB offset=%lu\n", - deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0, offset); - - for (size_t elem=64; elem+offset<=maxElem; elem*=2) { - HIPCHECK ( hipDeviceReset() ); - memcpytest2(elem+offset, 0, 1, 1, 0); // unpinned host - HIPCHECK ( hipDeviceReset() ); - memcpytest2(elem+offset, 1, 1, 1, 0); // pinned host - } -} - - -//--- -//Create multiple threads to stress multi-thread locking behavior in the allocation/deallocation/tracking logic: -template -void multiThread_1(bool serialize, bool usePinnedHost) -{ - printSep(); - printf ("test: %s<%s> serialize=%d usePinnedHost=%d\n", __func__, TYPENAME(T), serialize, usePinnedHost); - std::thread t1 (memcpytest2,N, usePinnedHost,0,0,0); + // Test 2 threads operating on same stream: + std::thread t1 (simpleVectorCopy, 2000000/*mb*/, 1000, stream0); if (serialize) { t1.join(); } - - - std::thread t2 (memcpytest2,N, usePinnedHost,0,0,0); + std::thread t2 (simpleVectorCopy, 2000000/*mb*/, 1000, stream1); if (serialize) { t2.join(); } @@ -222,8 +97,9 @@ void multiThread_1(bool serialize, bool usePinnedHost) t1.join(); t2.join(); } -} + HIPCHECK(hipDeviceSynchronize()); +}; int main(int argc, char *argv[]) @@ -236,36 +112,30 @@ int main(int argc, char *argv[]) if (p_tests & 0x1) { HIPCHECK ( hipDeviceReset() ); - simpleTest1(); + + hipStream_t stream; + HIPCHECK (hipStreamCreate(&stream)); + + 
simpleVectorCopy (2000000/*mb*/, 10/*iters*/, stream); + simpleVectorCopy (2000000/*mb*/, 10/*iters*/, stream); + + //HIPCHECK(hipStreamDestroy(stream)); } + if (p_tests & 0x2) { - HIPCHECK ( hipDeviceReset() ); - memcpytest2_loop(N); - memcpytest2_loop(N); - memcpytest2_loop(N); - memcpytest2_loop(N); - } + hipStream_t stream0, stream1; + HIPCHECK (hipStreamCreate(&stream0)); + HIPCHECK (hipStreamCreate(&stream1)); - if (p_tests & 0x4) { - HIPCHECK ( hipDeviceReset() ); - printSep(); - memcpytest2_sizes(0,0); - printSep(); - memcpytest2_sizes(0,64); - printSep(); - memcpytest2_sizes(1024*1024, 13); - printSep(); - memcpytest2_sizes(1024*1024, 50); - } + // Easy tests to verify the test works - these don't allow overlap between the threads: + test_multiThread_1 ("Multithread NULL with serialized", NULL, NULL, true); + test_multiThread_1 ("Multithread with serialized", stream0, stream1, true); - if (p_tests & 0x8) { - HIPCHECK ( hipDeviceReset() ); - printSep(); - multiThread_1(true, true); - multiThread_1(false, true); - multiThread_1(false, false); // TODO - } + test_multiThread_1 ("Multithread with NULL stream", NULL, NULL, false); + test_multiThread_1 ("Multithread with two streams", stream0, stream1, false); + test_multiThread_1 ("Multithread with one stream", stream0, stream0, false); + } passed(); From 82900a18885161f7eeeaf8b00c4f709049d06e51 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Thu, 25 Feb 2016 23:44:39 +0300 Subject: [PATCH 73/94] Attribute hipDeviceAttributeIsMultiGpuBoard for obtaining Device property isMultiGpuBoard is added. On HIP path property obtaining done through hsa_iterate_agents and counting the devices of HSA_DEVICE_TYPE_GPU type. P.S. On multi-boards systems it might be problems with detection what board a GPU plugged into (not tested). 
[ROCm/hip commit: 57e212606d104277c04b813c9501590b0ceef316] --- projects/hip/include/hip_runtime_api.h | 2 ++ .../hip/include/nvcc_detail/hip_runtime_api.h | 2 ++ .../hip/samples/1_Utils/hipInfo/hipInfo.cpp | 1 + projects/hip/src/hip_hcc.cpp | 25 +++++++++++++++++++ .../hip/tests/src/hipGetDeviceAttribute.cpp | 1 + 5 files changed, 31 insertions(+) diff --git a/projects/hip/include/hip_runtime_api.h b/projects/hip/include/hip_runtime_api.h index 2b5c98bc2a..1fc00299f4 100644 --- a/projects/hip/include/hip_runtime_api.h +++ b/projects/hip/include/hip_runtime_api.h @@ -98,6 +98,7 @@ typedef struct hipDeviceProp_t { int pciBusID; ///< PCI Bus ID. int pciDeviceID; ///< PCI Device ID. size_t maxSharedMemoryPerMultiProcessor; ///< Maximum Shared Memory Per Multiprocessor. + int isMultiGpuBoard; ///< 1 if device is on a multi-GPU board, 0 if not. } hipDeviceProp_t; @@ -161,6 +162,7 @@ typedef enum hipDeviceAttribute_t { hipDeviceAttributePciBusId, ///< PCI Bus ID. hipDeviceAttributePciDeviceId, ///< PCI Device ID. hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, ///< Maximum Shared Memory Per Multiprocessor. + hipDeviceAttributeIsMultiGpuBoard, ///< Multiple GPU devices. 
} hipDeviceAttribute_t; /** diff --git a/projects/hip/include/nvcc_detail/hip_runtime_api.h b/projects/hip/include/nvcc_detail/hip_runtime_api.h index fac9380bfd..b4d9b06ad6 100644 --- a/projects/hip/include/nvcc_detail/hip_runtime_api.h +++ b/projects/hip/include/nvcc_detail/hip_runtime_api.h @@ -266,6 +266,8 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att cdattr = cudaDevAttrPciDeviceId; break; case hipDeviceAttributeMaxSharedMemoryPerMultiprocessor: cdattr = cudaDevAttrMaxSharedMemoryPerMultiprocessor; break; + case hipDeviceAttributeIsMultiGpuBoard: + cdattr = cudaDevAttrIsMultiGpuBoard; break; default: cerror = cudaErrorInvalidValue; break; } diff --git a/projects/hip/samples/1_Utils/hipInfo/hipInfo.cpp b/projects/hip/samples/1_Utils/hipInfo/hipInfo.cpp index c7b298705b..19e8cfc210 100644 --- a/projects/hip/samples/1_Utils/hipInfo/hipInfo.cpp +++ b/projects/hip/samples/1_Utils/hipInfo/hipInfo.cpp @@ -80,6 +80,7 @@ void printDeviceProp (int deviceId) cout << setw(w1) << "pciDeviceID: " << props.pciDeviceID << endl; cout << setw(w1) << "multiProcessorCount: " << props.multiProcessorCount << endl; cout << setw(w1) << "maxThreadsPerMultiProcessor: " << props.maxThreadsPerMultiProcessor << endl; + cout << setw(w1) << "isMultiGpuBoard: " << props.isMultiGpuBoard << endl; cout << setw(w1) << "clockRate: " << (float)props.clockRate / 1000.0 << " Mhz" << endl; #ifdef USE_ROCR_20 cout << setw(w1) << "memoryClockRate: " << (float)props.memoryClockRate / 1000.0 << " Mhz" << endl; diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index ee208bcced..25e2ec30d7 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -227,6 +227,21 @@ hsa_status_t get_region_info(hsa_region_t region, void* data) return HSA_STATUS_SUCCESS; } +// Determines if the given agent is of type HSA_DEVICE_TYPE_GPU and counts it. 
+static hsa_status_t countGpuAgents(hsa_agent_t agent, void *data) { + if (data == NULL) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + hsa_device_type_t device_type; + hsa_status_t status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type); + if (status != HSA_STATUS_SUCCESS) { + return status; + } + if (device_type == HSA_DEVICE_TYPE_GPU) { + (*static_cast(data))++; + } + return HSA_STATUS_SUCCESS; +} // Internal version, hipError_t ihipDevice_t::getProperties(hipDeviceProp_t* prop) @@ -245,6 +260,14 @@ hipError_t ihipDevice_t::getProperties(hipDeviceProp_t* prop) return hipErrorInvalidDevice; } + // Iterates over the agents to determine Multiple GPU devices + // using the countGpuAgents callback. + int gpuAgentsCount = 0; + err = hsa_iterate_agents(countGpuAgents, &gpuAgentsCount); + if (err == HSA_STATUS_INFO_BREAK) { err = HSA_STATUS_SUCCESS; } + DeviceErrorCheck(err); + prop->isMultiGpuBoard = 0 ? gpuAgentsCount < 2 : 1; + // Get agent name err = hsa_agent_get_info(_hsa_agent, HSA_AGENT_INFO_NAME, &(prop->name)); DeviceErrorCheck(err); @@ -846,6 +869,8 @@ hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device) *pi = prop->pciDeviceID; break; case hipDeviceAttributeMaxSharedMemoryPerMultiprocessor: *pi = prop->maxSharedMemoryPerMultiProcessor; break; + case hipDeviceAttributeIsMultiGpuBoard: + *pi = prop->isMultiGpuBoard; break; default: e = hipErrorInvalidValue; break; } diff --git a/projects/hip/tests/src/hipGetDeviceAttribute.cpp b/projects/hip/tests/src/hipGetDeviceAttribute.cpp index dfd5c28f99..4471f532f5 100644 --- a/projects/hip/tests/src/hipGetDeviceAttribute.cpp +++ b/projects/hip/tests/src/hipGetDeviceAttribute.cpp @@ -72,6 +72,7 @@ int main(int argc, char *argv[]) CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMemoryBusWidth, props.memoryBusWidth)); #endif CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMultiprocessorCount, props.multiProcessorCount)); + 
CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeIsMultiGpuBoard, props.isMultiGpuBoard)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeComputeMode, props.computeMode)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeL2CacheSize, props.l2CacheSize)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMaxThreadsPerMultiProcessor, props.maxThreadsPerMultiProcessor)); From 6bb5485bebc8ffaf9793b3097b9002fc555d1fb0 Mon Sep 17 00:00:00 2001 From: Aditya Avinash Atluri Date: Thu, 25 Feb 2016 14:49:58 -0600 Subject: [PATCH 74/94] Update hip.vim Added d2d and h2h highlights [ROCm/hip commit: 29c385ed7213262b9d0b7e90bd4240673936b543] --- projects/hip/util/vim/hip.vim | 3 +++ 1 file changed, 3 insertions(+) diff --git a/projects/hip/util/vim/hip.vim b/projects/hip/util/vim/hip.vim index 5264461ce9..6c0b9a8d33 100644 --- a/projects/hip/util/vim/hip.vim +++ b/projects/hip/util/vim/hip.vim @@ -64,6 +64,7 @@ syn keyword hipFunctionName expf __expf exp logf __logf log " Runtime Data Types syn keyword hipType hipDeviceProp_t syn keyword hipType hipError_t +syn keyword hipType hipStream_t " Runtime functions syn keyword hipFunctionName hipBindTexture hipBindTextureToArray @@ -153,6 +154,8 @@ syn keyword hipFunctionName hipUnbindTexture " HIP Flags syn keyword hipFlags hipFilterModePoint syn keyword hipFlags hipMemcpyHostToDevice +syn keyword hipFlags hipMemcpyDeviceToDevice +syn keyword hipFlags hipMemcpyHostToHost syn keyword hipFlags hipMemcpyDeviceToHost syn keyword hipFlags hipReadModeElementType syn keyword hipFlags hipSuccess From ae365b4ae23198907d6d87d5032126193cf0c0a7 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 25 Feb 2016 19:17:28 -0600 Subject: [PATCH 75/94] Improve memory copy and commands switching - Add chicken bits to use host-side dependency management. - Add optional PinInPlace path for unpinned copies - Synchronize before pinned memcpy path. - Add mutex to protect two threads launching to same stream. 
[ROCm/hip commit: 8b64c0dc62bfab8b25d10d4bc5612d712db99fd0] --- projects/hip/src/hip_hcc.cpp | 182 +++++++++++++++++++++++---- projects/hip/tests/src/hipMemcpy.cpp | 21 +++- 2 files changed, 174 insertions(+), 29 deletions(-) diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index f01579ae71..55d7da99df 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -71,15 +71,18 @@ int HIP_LAUNCH_BLOCKING = 0; int HIP_PRINT_ENV = 0; int HIP_TRACE_API= 0; int HIP_STAGING_SIZE = 64; /* size of staging buffers, in KB */ -int HIP_STAGING_BUFFERS = 2; +int HIP_STAGING_BUFFERS = 2; // TODO - remove, two buffers should be enough. +int HIP_PININPLACE = 0; int HIP_STREAM_SIGNALS = 2; /* number of signals to allocate at stream creation */ //--- // Chicken bits for disabling functionality to work around potential issues: -int HIP_DISABLE_ENQ_BARRIER = 1; -int HIP_DISABLE_BIDIR_MEMCPY = 1; -int HIP_ONESHOT_COPY_DEP = 1; // this is a good thing +int HIP_DISABLE_HW_KERNEL_DEP = 1; +int HIP_DISABLE_HW_COPY_DEP = 1; + +int HIP_DISABLE_BIDIR_MEMCPY = 0; +int HIP_ONESHOT_COPY_DEP = 1; // TODO - setting this =1 is a good thing, reduces input deps on //--- @@ -133,6 +136,15 @@ struct ihipSignal_t { }; +// Used to remove lock, for performance or stimulating bugs. +class FakeMutex +{ + public: + void lock() { } + bool try_lock() {return true; } + void unlock() { } +}; + // Internal stream structure. @@ -157,6 +169,7 @@ public: inline void resetToEmpty(); inline SIGSEQNUM lastCopySeqId() { return _last_copy_signal ? _last_copy_signal->_sig_id : 0; }; + std::mutex & mutex() {return _mutex;}; //--- hc::accelerator_view _av; @@ -166,7 +179,11 @@ private: unsigned _device_index; ihipCommand_t _last_command_type; // type of the last command - ihipSignal_t *_last_copy_signal; // signal of last copy command sent to the stream. Copy can be either H2D or D2H. + + // signal of last copy command sent to the stream. 
+ // May be NULL, indicating the previous command has completley finished and future commands don't need to create a dependency. + // Copy can be either H2D or D2H. + ihipSignal_t *_last_copy_signal; hc::completion_future _last_kernel_future; // Completion future of last kernel command sent to GPU. int _signalCursor; @@ -174,6 +191,8 @@ private: SIGSEQNUM _stream_sig_id; // Monotonically increasing unique signal id. SIGSEQNUM _oldest_live_sig_id; // oldest live seq_id, anything < this can be allocated. std::deque _signalPool; // Pool of signals for use by this stream. + + std::mutex _mutex; }; @@ -210,8 +229,12 @@ struct StagingBuffer { StagingBuffer(ihipDevice_t *device, size_t bufferSize, int numBuffers) ; ~StagingBuffer(); - void CopyDeviceToHost(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); void CopyHostToDevice(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); + void CopyHostToDevicePinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); + + void CopyDeviceToHost (void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); + void CopyDeviceToHostPinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); + private: ihipDevice_t *_device; @@ -241,7 +264,7 @@ struct ihipDevice_t unsigned _compute_units; - hsa_signal_t _copy_signal; // signal to use for copies + hsa_signal_t _copy_signal; // signal to use for synchronous memcopies std::mutex _copy_lock[2]; // mutex for each direction. StagingBuffer *_staging_buffer[2]; // one buffer for each direction. @@ -455,6 +478,8 @@ void ihipStream_t::enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal) // inline bool ihipStream_t::preKernelCommand() { + _mutex.lock(); // will be unlocked in postKernelCommand + bool addedSync = false; // If switching command types, we need to add a barrier packet to synchronize things. 
if (_last_command_type != ihipCommandKernel) { @@ -462,7 +487,7 @@ inline bool ihipStream_t::preKernelCommand() addedSync = true; hsa_queue_t * q = (hsa_queue_t*)_av.get_hsa_queue(); - if (! HIP_DISABLE_ENQ_BARRIER) { + if (! HIP_DISABLE_HW_KERNEL_DEP) { this->enqueueBarrier(q, _last_copy_signal); tprintf (TRACE_SYNC, "stream %p switch %s to %s (barrier pkt inserted with wait on #%lu)\n", this, ihipCommandName[_last_command_type], ihipCommandName[ihipCommandKernel], _last_copy_signal->_sig_id) @@ -484,6 +509,8 @@ inline bool ihipStream_t::preKernelCommand() inline void ihipStream_t::postKernelCommand(hc::completion_future &kernelFuture) { _last_kernel_future = kernelFuture; + + _mutex.unlock(); }; @@ -515,6 +542,12 @@ inline int ihipStream_t::copyCommand(ihipSignal_t *lastCopy, hsa_signal_t *waitS *waitSignal = _last_copy_signal->_hsa_signal; } + if (HIP_DISABLE_HW_COPY_DEP && needSync) { + // do the wait here on the host, and disable the device-side command resolution. + hsa_signal_wait_acquire(*waitSignal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + needSync = 0; + } + _last_command_type = copyType; } @@ -862,10 +895,12 @@ void ihipInit() READ_ENV_I(release, HIP_LAUNCH_BLOCKING, CUDA_LAUNCH_BLOCKING, "Make HIP APIs 'host-synchronous', so they block until any kernel launches or data copy commands complete. Alias: CUDA_LAUNCH_BLOCKING." ); READ_ENV_I(release, HIP_TRACE_API, 0, "Trace each HIP API call. Print function name and return code to stderr as program executes."); READ_ENV_I(release, HIP_STAGING_SIZE, 0, "Size of each staging buffer (in KB)" ); - READ_ENV_I(release, HIP_STAGING_BUFFERS, 0, "Number of staging buffers to use in each direction"); + READ_ENV_I(release, HIP_STAGING_BUFFERS, 0, "Number of staging buffers to use in each direction. 
0=use hsa_memory_copy."); + READ_ENV_I(release, HIP_PININPLACE, 0, "For unpinned transfers, pin the memory in-place in chunks before doing the copy"); READ_ENV_I(release, HIP_STREAM_SIGNALS, 0, "Number of signals to allocate when new stream is created (signal pool will grow on demand)"); - READ_ENV_I(release, HIP_DISABLE_ENQ_BARRIER, 0, "Disable enqueue of barrier packet - instead wait for copy completion on host."); + READ_ENV_I(release, HIP_DISABLE_HW_KERNEL_DEP, 0, "Disable HW dependencies before kernel commands - instead wait for dependency on host."); + READ_ENV_I(release, HIP_DISABLE_HW_COPY_DEP, 0, "Disable HW dependencies before copy commands - instead wait for dependency on host."); READ_ENV_I(release, HIP_DISABLE_BIDIR_MEMCPY, 0, "Disable simultaneous H2D memcpy and D2H memcpy to same device"); READ_ENV_I(release, HIP_ONESHOT_COPY_DEP, 0, "If set, only set the copy input dependency for the first copy command in a staged copy. If clear, set the dep for each copy."); @@ -1157,20 +1192,31 @@ hipError_t hipDeviceSynchronize(void) //--- /** * @return @ref hipSuccess - * @bug On HCC, hipDeviceReset is a nop and does not reset the device state. */ hipError_t hipDeviceReset(void) { std::call_once(hip_initialized, ihipInit); - // TODO-HCC - // This function needs some support from HSART and KFD. - // It should destroy and clean up all resources allocated with the default device in the current process. - // and needs to destroy all queues as well. - // -#if USE_AM_TRACKER - // TODO - remove bug above. ihipDevice_t *device = ihipGetTlsDefaultDevice(); + + // TODO-HCC + // This function currently does a user-level cleanup of known resources. + // It could benefit from KFD support to perform a more "nuclear" clean that would include any associated kernel resources and page table entries. + + + //--- + //Wait for pending activity to complete? 
+ //TODO - check if this is required behavior: + for (auto streamI=device->_streams.begin(); streamI!=device->_streams.end(); streamI++) { + ihipStream_t *stream = *streamI; + stream->wait(); + } + + // Reset and remove streams: + device->_streams.clear(); + + +#if USE_AM_TRACKER if (device) { am_memtracker_reset(device->_acc); device->reset(); // re-allocate required resources. @@ -1921,6 +1967,72 @@ StagingBuffer::~StagingBuffer() } + +//Copies sizeBytes from src to dst, using either a copy to a staging buffer or a staged pin-in-place strategy +//IN: dst - dest pointer - must be accessible from host CPU. +//IN: src - src pointer for copy. Must be accessible from agent this buffer is associated with (via _device) +//IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency. +void StagingBuffer::CopyHostToDevicePinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) +{ + const char *srcp = static_cast (src); + char *dstp = static_cast (dst); + + for (int i=0; i<_numBuffers; i++) { + hsa_signal_store_relaxed(_completion_signal[i], 0); + } + + assert(sizeBytes < UINT64_MAX/2); // TODO + int bufferIndex = 0; + for (int64_t bytesRemaining=sizeBytes; bytesRemaining>0 ; bytesRemaining -= _bufferSize) { + + size_t theseBytes = (bytesRemaining > _bufferSize) ? _bufferSize : bytesRemaining; + + tprintf (TRACE_COPY2, "H2D: waiting... 
on completion signal handle=%lu\n", _completion_signal[bufferIndex].handle); + hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + + tprintf (TRACE_COPY2, "H2D: bytesRemaining=%zu: pin-in-place:%p+%zu bufferIndex[%d]\n", bytesRemaining, srcp, theseBytes, bufferIndex); + + + memcpy(_pinnedStagingBuffer[bufferIndex], srcp, theseBytes); + void *locked_srcp; + hsa_status_t hsa_status = hsa_amd_memory_lock(const_cast (srcp), theseBytes, &_device->_hsa_agent, 1, &locked_srcp); + + assert (hsa_status == HSA_STATUS_SUCCESS); + + hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); + +#if USE_ROCR_V2 + hsa_status = hsa_amd_memory_async_copy(dstp, _device->_hsa_agent, locked_srcp, _device->_hsa_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); +#else + assert(0); +#endif + tprintf (TRACE_COPY2, "H2D: bytesRemaining=%zu: async_copy %zu bytes %p to %p status=%x\n", bytesRemaining, theseBytes, _pinnedStagingBuffer[bufferIndex], dstp, hsa_status); + + assert(hsa_status == HSA_STATUS_SUCCESS); // TODO - throw + + srcp += theseBytes; + dstp += theseBytes; + if (++bufferIndex >= _numBuffers) { + bufferIndex = 0; + } + + if (HIP_ONESHOT_COPY_DEP) { + waitFor = NULL; // TODO - don't need dependency after first copy submitted? + } + } + + // TODO - + printf ("unpin the memory\n"); + + + for (int i=0; i<_numBuffers; i++) { + hsa_signal_wait_acquire(_completion_signal[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + } +} + + + + //--- //Copies sizeBytes from src to dst, using either a copy to a staging buffer or a staged pin-in-place strategy //IN: dst - dest pointer - must be accessible from host CPU. 
@@ -2059,7 +2171,6 @@ void ihipSyncCopy(ihipStream_t *stream, void* dst, const void* src, size_t sizeB bool dstNotTracked = (hc::am_memtracker_getinfo(&dstPtrInfo, dst) != AM_SUCCESS); bool srcNotTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) != AM_SUCCESS); - bool useStagingBuffer = true; // Resolve default to a specific Kind so we know which algorithm to use: if (kind == hipMemcpyDefault) { @@ -2078,13 +2189,18 @@ void ihipSyncCopy(ihipStream_t *stream, void* dst, const void* src, size_t sizeB if ((kind == hipMemcpyHostToDevice) && (srcNotTracked)) { - if (useStagingBuffer) { + if (HIP_STAGING_BUFFERS) { std::lock_guard l (device->_copy_lock[0]); //printf ("staged-copy- read dep signals\n"); hsa_signal_t depSignal; int depSignalCnt = stream->copyCommand(NULL, &depSignal, ihipCommandCopyH2D); - device->_staging_buffer[0]->CopyHostToDevice(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); + + if (HIP_PININPLACE) { + device->_staging_buffer[0]->CopyHostToDevicePinInPlace(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); + } else { + device->_staging_buffer[0]->CopyHostToDevice(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); + } // The copy waits for inputs and then completes before returning. stream->resetToEmpty(); @@ -2093,7 +2209,7 @@ void ihipSyncCopy(ihipStream_t *stream, void* dst, const void* src, size_t sizeB hc::am_copy(dst, src, sizeBytes); } } else if ((kind == hipMemcpyDeviceToHost) && (dstNotTracked)) { - if (useStagingBuffer) { + if (HIP_STAGING_BUFFERS) { std::lock_guard l (device->_copy_lock[HIP_DISABLE_BIDIR_MEMCPY ? 0:1]); //printf ("staged-copy- read dep signals\n"); hsa_signal_t depSignal; @@ -2103,18 +2219,30 @@ void ihipSyncCopy(ihipStream_t *stream, void* dst, const void* src, size_t sizeB // TODO - remove, slow path. hc::am_copy(dst, src, sizeBytes); } - } else if (kind == hipMemcpyHostToHost) { - memcpy(dst, src, sizeBytes); + } else if (kind == hipMemcpyHostToHost) { // TODO-refactor. 
+ memcpy(dst, src, sizeBytes); } else { - // Let HSA runtime handle it: - // TODO - need buffer pool for the signals: + ihipCommand_t copyType; + if ((kind == hipMemcpyHostToDevice) || (kind == hipMemcpyDeviceToDevice)) { + copyType = ihipCommandCopyH2D; + } else if (kind == hipMemcpyDeviceToHost) { + copyType = ihipCommandCopyD2H; + } else { + // TODO - return error condition: + //e = hipErrorInvalidMemcpyDirection; + copyType = ihipCommandCopyD2H; + } device->_copy_lock[HIP_DISABLE_BIDIR_MEMCPY? 0:1].lock(); hsa_signal_store_relaxed(device->_copy_signal, 1); + + hsa_signal_t depSignal; + int depSignalCnt = stream->copyCommand(NULL, &depSignal, copyType); + #if USE_ROCR_V2 - hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, device->_hsa_agent, src, device->_hsa_agent, sizeBytes, 0, NULL, device->_copy_signal); + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, device->_hsa_agent, src, device->_hsa_agent, sizeBytes, depSignalCnt, depSignalCnt ? &depSignal:0x0, device->_copy_signal); #else hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, src, sizeBytes, device->_hsa_agent, 0, NULL, device->_copy_signal); #endif diff --git a/projects/hip/tests/src/hipMemcpy.cpp b/projects/hip/tests/src/hipMemcpy.cpp index 8286454098..3fa499aa2a 100644 --- a/projects/hip/tests/src/hipMemcpy.cpp +++ b/projects/hip/tests/src/hipMemcpy.cpp @@ -63,9 +63,16 @@ void simpleTest1() } +class hipMemcpy; +class hipMemcpyAsync; + + + + + //--- // Test many different kinds of memory copies. -// THe subroutine allocates memory , copies to device, runs a vector add kernel, copies back, and checks the result. +// The subroutine allocates memory , copies to device, runs a vector add kernel, copies back, and checks the result. // // IN: numElements controls the number of elements used for allocations. // IN: usePinnedHost : If true, allocate host with hipMallocHost and is pinned ; else allocate host memory with malloc. 
@@ -255,8 +262,18 @@ int main(int argc, char *argv[]) if (p_tests & 0x8) { HIPCHECK ( hipDeviceReset() ); printSep(); - multiThread_1(true, true); + + // Simplest cases: serialize the threads, and also used pinned memory: + // This verifies that the sub-calls to memcpytest2 are correct. + multiThread_1(true, true); + + // Serialize, but use unpinned memory to stress the unpinned memory xfer path. + multiThread_1(true, false); + + // Remove serialization, so two threads are performing memory copies in parallel. multiThread_1(false, true); + + // Remove serialization, and use unpinned. multiThread_1(false, false); // TODO } From 22a3806c0c2e236991ba28c2944fd755c91290d2 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 18 Feb 2016 21:29:51 -0600 Subject: [PATCH 76/94] Tweak version numbers [ROCm/hip commit: b1da7e4a705a90254cf4e0f3f985980f402aa51a] --- projects/hip/RELEASE.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/projects/hip/RELEASE.md b/projects/hip/RELEASE.md index 9fea5d4c78..ae0a0d2b4e 100644 --- a/projects/hip/RELEASE.md +++ b/projects/hip/RELEASE.md @@ -17,17 +17,17 @@ Stay tuned - the work for many of these features is already in-flight. ## Revision History: =================================================================================================== -Release:0.80.01.00 +Release:0.80.01 Date: 2016.02.18 - Improve reporting and support for device-side math functions. - Update Runtime Documentation. - Improve implementations of cross-lane operations (_ballot, _any, _all). - Provide shuffle intrinsics (performance optimization in-progress). - Support hipDeviceAttribute for querying "one-shot" device attributes, as an alternative to hipDeviceGetProperties. -- + =================================================================================================== -Release:0.80.00.00 : +Release:0.80.00 : Date: 2016.01.25 Initial release with GPUOpen Launch. 
From 2a650be66183d4e1c494526c5c54ebd0d8c58917 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 25 Feb 2016 23:07:18 -0600 Subject: [PATCH 77/94] Add PATH and LD_LIBRARY_FLAGS [ROCm/hip commit: a30018e16668f8a1eea8b3f9aa3016da4ef24520] --- projects/hip/bin/hipconfig | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/projects/hip/bin/hipconfig b/projects/hip/bin/hipconfig index 1687983330..b369a83e52 100755 --- a/projects/hip/bin/hipconfig +++ b/projects/hip/bin/hipconfig @@ -14,7 +14,7 @@ GetOptions( ,"compiler|c" => \$p_compiler ,"platform|P" => \$p_platform ,"cpp_config|cxx_config|C" => \$p_cpp_config - ,"full|f" => \$p_full, + ,"full|f|info" => \$p_full, ,"newline|n" => \$p_newline ); @@ -94,7 +94,9 @@ if ($p_full) { print "\n" ; print "=== Environment Variables\n"; - system("env | egrep '^HIP|^HSA|^HCC|^CUDA'"); + system("echo PATH=\$PATH"); + system("env | egrep '^HIP|^HSA|^HCC|^CUDA|^LD_LIBRARY_PATH'"); + print "\n" ; print "== Linux Kernel\n"; From b46a90852ee04388e6661b7ca410df566ee8b325 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 26 Feb 2016 23:34:45 -0600 Subject: [PATCH 78/94] Disable ROCR_V2 [ROCm/hip commit: 6e0ccdfb95d503fd14623258633b21d42ed7425f] --- projects/hip/src/hip_hcc.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 912dec77de..b5ab9863ba 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -43,7 +43,7 @@ THE SOFTWARE. #define USE_AM_TRACKER 1 /* >0 = use new AM memory tracker features. 
*/ -#define USE_ROCR_V2 1 /* use the ROCR v2 async copy API with dst and src agents */ +#define USE_ROCR_V2 0 /* use the ROCR v2 async copy API with dst and src agents */ #if (USE_AM_TRACKER) and (__hcc_workweek__ < 16074) #error (USE_AM_TRACKER requries HCC version of 16074 or newer) @@ -81,7 +81,7 @@ int HIP_DISABLE_HW_KERNEL_DEP = 1; int HIP_DISABLE_HW_COPY_DEP = 1; int HIP_DISABLE_BIDIR_MEMCPY = 0; -int HIP_ONESHOT_COPY_DEP = 1; // TODO - setting this =1 is a good thing, reduces input deps on +int HIP_ONESHOT_COPY_DEP = 1; // TODO - setting this =1 is a good thing, reduces input deps //--- @@ -2253,10 +2253,10 @@ void ihipSyncCopy(ihipStream_t *stream, void* dst, const void* src, size_t sizeB hsa_signal_store_relaxed(device->_copy_signal, 1); - hsa_signal_t depSignal; - int depSignalCnt = stream->copyCommand(NULL, &depSignal, copyType); #if USE_ROCR_V2 + hsa_signal_t depSignal; + int depSignalCnt = stream->copyCommand(NULL, &depSignal, copyType); hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, device->_hsa_agent, src, device->_hsa_agent, sizeBytes, depSignalCnt, depSignalCnt ? 
&depSignal:0x0, device->_copy_signal); #else hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, src, sizeBytes, device->_hsa_agent, 0, NULL, device->_copy_signal); From 1ac07d2b8735234c243fa71d1d7db8cac52a259a Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 26 Feb 2016 05:25:30 -0600 Subject: [PATCH 79/94] fixes for titan platform [ROCm/hip commit: ff66ef0779f63d0c4037581fe307ba37917e9fc8] --- projects/hip/include/hip_runtime.h | 2 +- projects/hip/include/hip_runtime_api.h | 6 +----- projects/hip/samples/1_Utils/hipInfo/hipInfo.cpp | 2 -- projects/hip/src/hip_hcc.cpp | 4 ++-- projects/hip/tests/src/hipGetDeviceAttribute.cpp | 2 -- 5 files changed, 4 insertions(+), 12 deletions(-) diff --git a/projects/hip/include/hip_runtime.h b/projects/hip/include/hip_runtime.h index 0594726c90..de2a49d975 100644 --- a/projects/hip/include/hip_runtime.h +++ b/projects/hip/include/hip_runtime.h @@ -43,7 +43,7 @@ THE SOFTWARE. #include #include -#ifdef __cplusplus +#if __cplusplus > 199711L #include #endif diff --git a/projects/hip/include/hip_runtime_api.h b/projects/hip/include/hip_runtime_api.h index 2c827a8bef..61e2b17407 100644 --- a/projects/hip/include/hip_runtime_api.h +++ b/projects/hip/include/hip_runtime_api.h @@ -81,10 +81,8 @@ typedef struct hipDeviceProp_t { int maxThreadsDim[3]; ///< Max number of threads in each dimension (XYZ) of a block. int maxGridSize[3]; ///< Max grid dimensions (XYZ). int clockRate; ///< Max clock frequency of the multiProcessors in khz. -#ifdef USE_ROCR_20 int memoryClockRate; ///< Max global memory clock frequency in khz. int memoryBusWidth; ///< Global memory bus width in bits. -#endif size_t totalConstMem; ///< Size of shared memory region (in bytes). int major; ///< Major compute capability. On HCC, this is an approximation and features may differ from CUDA CC. See the arch feature flags for portable ways to query feature caps. int minor; ///< Minor compute capability. 
On HCC, this is an approximation and features may differ from CUDA CC. See the arch feature flags for portable ways to query feature caps. @@ -135,7 +133,7 @@ typedef struct hipPointerAttribute_t { /* - * @brief hipError_t + * @brief hipError_t * @enum * @ingroup Enumerations */ @@ -175,10 +173,8 @@ typedef enum hipDeviceAttribute_t { hipDeviceAttributeWarpSize, ///< Warp size in threads. hipDeviceAttributeMaxRegistersPerBlock, ///< Maximum number of 32-bit registers available to a thread block. This number is shared by all thread blocks simultaneously resident on a multiprocessor. hipDeviceAttributeClockRate, ///< Peak clock frequency in kilohertz. -#ifdef USE_ROCR_20 hipDeviceAttributeMemoryClockRate, ///< Peak memory clock frequency in kilohertz. hipDeviceAttributeMemoryBusWidth, ///< Global memory bus width in bits. -#endif hipDeviceAttributeMultiprocessorCount, ///< Number of multiprocessors on the device. hipDeviceAttributeComputeMode, ///< Compute mode that device is currently in. hipDeviceAttributeL2CacheSize, ///< Size of L2 cache in bytes. 0 if the device doesn't have L2 cache. 
diff --git a/projects/hip/samples/1_Utils/hipInfo/hipInfo.cpp b/projects/hip/samples/1_Utils/hipInfo/hipInfo.cpp index 19e8cfc210..824ab17d37 100644 --- a/projects/hip/samples/1_Utils/hipInfo/hipInfo.cpp +++ b/projects/hip/samples/1_Utils/hipInfo/hipInfo.cpp @@ -82,10 +82,8 @@ void printDeviceProp (int deviceId) cout << setw(w1) << "maxThreadsPerMultiProcessor: " << props.maxThreadsPerMultiProcessor << endl; cout << setw(w1) << "isMultiGpuBoard: " << props.isMultiGpuBoard << endl; cout << setw(w1) << "clockRate: " << (float)props.clockRate / 1000.0 << " Mhz" << endl; -#ifdef USE_ROCR_20 cout << setw(w1) << "memoryClockRate: " << (float)props.memoryClockRate / 1000.0 << " Mhz" << endl; cout << setw(w1) << "memoryBusWidth: " << props.memoryBusWidth << endl; -#endif cout << setw(w1) << "clockInstructionRate: " << (float)props.clockInstructionRate / 1000.0 << " Mhz" << endl; cout << setw(w1) << "totalGlobalMem: " << fixed << setprecision(2) << bytesToGB(props.totalGlobalMem) << " GB" << endl; cout << setw(w1) << "maxSharedMemoryPerMultiProcessor: " << fixed << setprecision(2) << bytesToGB(props.maxSharedMemoryPerMultiProcessor) << " GB" << endl; diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index b5ab9863ba..c2b4d5ffe7 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -792,7 +792,7 @@ hipError_t ihipDevice_t::getProperties(hipDeviceProp_t* prop) // Group memory will not be paged out, so, the physical memory size is the total shared memory size, and also equal to the group region size. 
prop->maxSharedMemoryPerMultiProcessor = prop->totalGlobalMem; -#ifdef USE_ROCR_20 +#ifdef USE_ROCR_V2 // Get Max memory clock frequency err = hsa_region_get_info(*am_region, (hsa_region_info_t)HSA_AMD_REGION_INFO_MAX_CLOCK_FREQUENCY, &prop->memoryClockRate); DeviceErrorCheck(err); @@ -1269,7 +1269,7 @@ hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device) *pi = prop->regsPerBlock; break; case hipDeviceAttributeClockRate: *pi = prop->clockRate; break; -#ifdef USE_ROCR_20 +#ifdef USE_ROCR_V2 case hipDeviceAttributeMemoryClockRate: *pi = prop->memoryClockRate; break; case hipDeviceAttributeMemoryBusWidth: diff --git a/projects/hip/tests/src/hipGetDeviceAttribute.cpp b/projects/hip/tests/src/hipGetDeviceAttribute.cpp index 4471f532f5..0073dfeed7 100644 --- a/projects/hip/tests/src/hipGetDeviceAttribute.cpp +++ b/projects/hip/tests/src/hipGetDeviceAttribute.cpp @@ -67,10 +67,8 @@ int main(int argc, char *argv[]) CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeWarpSize, props.warpSize)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMaxRegistersPerBlock, props.regsPerBlock)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeClockRate, props.clockRate)); -#ifdef USE_ROCR_20 CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMemoryClockRate, props.memoryClockRate)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMemoryBusWidth, props.memoryBusWidth)); -#endif CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMultiprocessorCount, props.multiProcessorCount)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeIsMultiGpuBoard, props.isMultiGpuBoard)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeComputeMode, props.computeMode)); From dc3710cee4d8a66d2a6644b45709b123a7a33ba9 Mon Sep 17 00:00:00 2001 From: pensun Date: Fri, 26 Feb 2016 09:50:00 -0600 Subject: [PATCH 80/94] fix compiling error [ROCm/hip commit: ee7ac16396268c65f6a3a99c5820c8148dc8ec3d] 
--- projects/hip/src/hip_hcc.cpp | 132 +++++++++++++++++------------------ 1 file changed, 66 insertions(+), 66 deletions(-) diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index c2b4d5ffe7..ed2dd79f2e 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -71,7 +71,7 @@ int HIP_PRINT_ENV = 0; int HIP_TRACE_API= 0; int HIP_STAGING_SIZE = 64; /* size of staging buffers, in KB */ int HIP_STAGING_BUFFERS = 2; // TODO - remove, two buffers should be enough. -int HIP_PININPLACE = 0; +int HIP_PININPLACE = 0; int HIP_STREAM_SIGNALS = 2; /* number of signals to allocate at stream creation */ @@ -121,14 +121,14 @@ typedef uint64_t SIGSEQNUM; //--- // Small wrapper around signals. // Designed to be used from stream. -// TODO-someday refactor this class so it can be stored in a vector<> +// TODO-someday refactor this class so it can be stored in a vector<> // we already store the index here so we can use for garbage collection. struct ihipSignal_t { hsa_signal_t _hsa_signal; // hsa signal handle int _index; // Index in pool, used for garbage collection. SIGSEQNUM _sig_id; // unique sequentially increasing ID. - ihipSignal_t(); + ihipSignal_t(); ~ihipSignal_t(); inline void release(); @@ -179,14 +179,14 @@ private: unsigned _device_index; ihipCommand_t _last_command_type; // type of the last command - // signal of last copy command sent to the stream. + // signal of last copy command sent to the stream. // May be NULL, indicating the previous command has completley finished and future commands don't need to create a dependency. // Copy can be either H2D or D2H. - ihipSignal_t *_last_copy_signal; + ihipSignal_t *_last_copy_signal; hc::completion_future _last_kernel_future; // Completion future of last kernel command sent to GPU. - + int _signalCursor; - + SIGSEQNUM _stream_sig_id; // Monotonically increasing unique signal id. SIGSEQNUM _oldest_live_sig_id; // oldest live seq_id, anything < this can be allocated. 
std::deque _signalPool; // Pool of signals for use by this stream. @@ -305,7 +305,7 @@ INLINE bool ihipIsValidDevice(unsigned deviceIndex); //================================================================================================= // //--- -ihipSignal_t::ihipSignal_t() : _sig_id(0) +ihipSignal_t::ihipSignal_t() : _sig_id(0) { if (hsa_signal_create(0/*value*/, 0, NULL, &_hsa_signal) != HSA_STATUS_SUCCESS) { throw; @@ -314,7 +314,7 @@ ihipSignal_t::ihipSignal_t() : _sig_id(0) } //--- -ihipSignal_t::~ihipSignal_t() +ihipSignal_t::~ihipSignal_t() { tprintf (TRACE_SIGNAL, " destroy hsa_signal #%lu (#%lu)\n", (_hsa_signal.handle), _sig_id); if (hsa_signal_destroy(_hsa_signal) != HSA_STATUS_SUCCESS) { @@ -329,9 +329,9 @@ ihipSignal_t::~ihipSignal_t() //================================================================================================= //--- ihipStream_t::ihipStream_t(unsigned device_index, hc::accelerator_view av, unsigned int flags) : - _av(av), - _flags(flags), - _device_index(device_index), + _av(av), + _flags(flags), + _device_index(device_index), _last_copy_signal(0), _signalCursor(0), _stream_sig_id(0), @@ -353,7 +353,7 @@ ihipStream_t::~ihipStream_t() //--- // Reset the stream to "empty" - next command will not set up an inpute dependency on any older signal. 
-void ihipStream_t::resetToEmpty() +void ihipStream_t::resetToEmpty() { _last_command_type = ihipCommandCopyH2D; _last_copy_signal = NULL; @@ -363,24 +363,24 @@ void ihipStream_t::resetToEmpty() void ihipStream_t::reclaimSignals(SIGSEQNUM sigNum) { tprintf(TRACE_SIGNAL, "reclaim signal #%lu\n", sigNum); - // Mark all signals older and including this one as available for + // Mark all signals older and including this one as available for _oldest_live_sig_id = sigNum+1; } //--- -void ihipStream_t::waitAndReclaimOlder(ihipSignal_t *signal) +void ihipStream_t::waitAndReclaimOlder(ihipSignal_t *signal) { hsa_signal_wait_acquire(_last_copy_signal->_hsa_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - reclaimSignals(_last_copy_signal->_sig_id); + reclaimSignals(_last_copy_signal->_sig_id); } //--- //Wait for all queues kernels in the associated accelerator_view to complete. -void ihipStream_t::wait() +void ihipStream_t::wait() { tprintf (TRACE_SYNC, "stream %p wait for queue-empty and lastCopy:#%lu...\n", this, _last_copy_signal ? _last_copy_signal->_sig_id: 0x0 ); _av.wait(); @@ -393,10 +393,10 @@ void ihipStream_t::wait() //--- -inline ihipDevice_t * ihipStream_t::getDevice() const -{ +inline ihipDevice_t * ihipStream_t::getDevice() const +{ if (ihipIsValidDevice(_device_index)) { - return &g_devices[_device_index]; + return &g_devices[_device_index]; } else { return NULL; } @@ -407,7 +407,7 @@ inline ihipDevice_t * ihipStream_t::getDevice() const // Allocate a new signal from the signal pool. // Returned signals have value of 0. // Signals are intended for use in this stream and are always reclaimed "in-order". 
-ihipSignal_t *ihipStream_t::getSignal() +ihipSignal_t *ihipStream_t::getSignal() { int numToScan = _signalPool.size(); do { @@ -432,7 +432,7 @@ ihipSignal_t *ihipStream_t::getSignal() _signalCursor = _signalPool.size(); // set to the beginning of the new entries: _signalPool.resize(_signalPool.size() * 2); tprintf (TRACE_SIGNAL, "grow signal pool to %zu entries, cursor=%d\n", _signalPool.size(), _signalCursor); - return getSignal(); // try again, + return getSignal(); // try again, // Should never reach here. assert(0); @@ -440,7 +440,7 @@ ihipSignal_t *ihipStream_t::getSignal() //--- -void ihipStream_t::enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal) +void ihipStream_t::enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal) { // Obtain the write index for the command queue @@ -488,11 +488,11 @@ inline bool ihipStream_t::preKernelCommand() hsa_queue_t * q = (hsa_queue_t*)_av.get_hsa_queue(); if (! HIP_DISABLE_HW_KERNEL_DEP) { this->enqueueBarrier(q, _last_copy_signal); - tprintf (TRACE_SYNC, "stream %p switch %s to %s (barrier pkt inserted with wait on #%lu)\n", + tprintf (TRACE_SYNC, "stream %p switch %s to %s (barrier pkt inserted with wait on #%lu)\n", this, ihipCommandName[_last_command_type], ihipCommandName[ihipCommandKernel], _last_copy_signal->_sig_id) } else { - tprintf (TRACE_SYNC, "stream %p switch %s to %s (wait for previous...)\n", + tprintf (TRACE_SYNC, "stream %p switch %s to %s (wait for previous...)\n", this, ihipCommandName[_last_command_type], ihipCommandName[ihipCommandKernel]); this->waitAndReclaimOlder(_last_copy_signal); } @@ -527,7 +527,7 @@ inline int ihipStream_t::copyCommand(ihipSignal_t *lastCopy, hsa_signal_t *waitS if (_last_command_type == ihipCommandKernel) { - tprintf (TRACE_SYNC, "stream %p switch %s to %s (async copy dep on prev kernel)\n", + tprintf (TRACE_SYNC, "stream %p switch %s to %s (async copy dep on prev kernel)\n", this, ihipCommandName[_last_command_type], ihipCommandName[copyType]); needSync = 1; 
hsa_signal_t *hsaSignal = (static_cast (_last_kernel_future.get_native_handle())); @@ -536,7 +536,7 @@ inline int ihipStream_t::copyCommand(ihipSignal_t *lastCopy, hsa_signal_t *waitS } } else if (_last_copy_signal) { needSync = 1; - tprintf (TRACE_SYNC, "stream %p switch %s to %s (async copy dep on other copy #%lu)\n", + tprintf (TRACE_SYNC, "stream %p switch %s to %s (async copy dep on other copy #%lu)\n", this, ihipCommandName[_last_command_type], ihipCommandName[copyType], _last_copy_signal->_sig_id); *waitSignal = _last_copy_signal->_hsa_signal; } @@ -558,9 +558,9 @@ inline int ihipStream_t::copyCommand(ihipSignal_t *lastCopy, hsa_signal_t *waitS //================================================================================================= // -//Reset the device - this is called from hipDeviceReset. +//Reset the device - this is called from hipDeviceReset. //Device may be reset multiple times, and may be reset after init. -void ihipDevice_t::reset() +void ihipDevice_t::reset() { _staging_buffer[0] = new StagingBuffer(this, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS); _staging_buffer[1] = new StagingBuffer(this, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS); @@ -794,12 +794,12 @@ hipError_t ihipDevice_t::getProperties(hipDeviceProp_t* prop) #ifdef USE_ROCR_V2 // Get Max memory clock frequency - err = hsa_region_get_info(*am_region, (hsa_region_info_t)HSA_AMD_REGION_INFO_MAX_CLOCK_FREQUENCY, &prop->memoryClockRate); + //err = hsa_region_get_info(*am_region, (hsa_region_info_t)HSA_AMD_REGION_INFO_MAX_CLOCK_FREQUENCY, &prop->memoryClockRate); DeviceErrorCheck(err); prop->memoryClockRate *= 1000.0; // convert Mhz to Khz. 
// Get global memory bus width in bits - err = hsa_region_get_info(*am_region, (hsa_region_info_t)HSA_AMD_REGION_INFO_BUS_WIDTH, &prop->memoryBusWidth); + //err = hsa_region_get_info(*am_region, (hsa_region_info_t)HSA_AMD_REGION_INFO_BUS_WIDTH, &prop->memoryBusWidth); DeviceErrorCheck(err); #endif @@ -893,7 +893,7 @@ void ihipInit() /* * Environment variables */ - READ_ENV_I(release, HIP_PRINT_ENV, 0, "Print HIP environment variables."); + READ_ENV_I(release, HIP_PRINT_ENV, 0, "Print HIP environment variables."); //-- READ HIP_PRINT_ENV env first, since it has impact on later env var reading READ_ENV_I(release, HIP_LAUNCH_BLOCKING, CUDA_LAUNCH_BLOCKING, "Make HIP APIs 'host-synchronous', so they block until any kernel launches or data copy commands complete. Alias: CUDA_LAUNCH_BLOCKING." ); @@ -1721,7 +1721,7 @@ hipError_t hipEventQuery(hipEvent_t event) /** * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDevice */ -hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr) +hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr) { std::call_once(hip_initialized, ihipInit); @@ -1767,7 +1767,7 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr) #if USE_AM_TRACKER // TODO - test this function: /** - * @returns #hipSuccess, + * @returns #hipSuccess, * @returns #hipErrorInvalidValue if flags are not 0 * @returns #hipErrorMemoryAllocation if hostPointer is not a tracked allocation. 
*/ @@ -1787,7 +1787,7 @@ hipError_t hipHostGetDevicePointer(void **devicePointer, void *hostPointer, unsi if (status == AM_SUCCESS) { *devicePointer = amPointerInfo._devicePointer; } else { - e = hipErrorMemoryAllocation; + e = hipErrorMemoryAllocation; *devicePointer = NULL; } } @@ -1874,7 +1874,7 @@ ihipMemsetKernel(hipStream_t stream, T * ptr, T val, size_t sizeBytes) //--- /** - * @returns #hipSuccess #hipErrorMemoryAllocation + * @returns #hipSuccess #hipErrorMemoryAllocation */ hipError_t hipMalloc(void** ptr, size_t sizeBytes) { @@ -1956,9 +1956,9 @@ StagingBuffer::StagingBuffer(ihipDevice_t *device, size_t bufferSize, int numBuf _bufferSize(bufferSize), _numBuffers(numBuffers > _max_buffers ? _max_buffers : numBuffers) { - - + + for (int i=0; i<_numBuffers; i++) { // TODO - experiment with alignment here. _pinnedStagingBuffer[i] = hc::am_alloc(_bufferSize, device->_acc, amHostPinned); @@ -1987,10 +1987,10 @@ StagingBuffer::~StagingBuffer() //IN: dst - dest pointer - must be accessible from host CPU. //IN: src - src pointer for copy. Must be accessible from agent this buffer is associated with (via _device) //IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency. -void StagingBuffer::CopyHostToDevicePinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) +void StagingBuffer::CopyHostToDevicePinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) { - const char *srcp = static_cast (src); - char *dstp = static_cast (dst); + const char *srcp = static_cast (src); + char *dstp = static_cast (dst); for (int i=0; i<_numBuffers; i++) { hsa_signal_store_relaxed(_completion_signal[i], 0); @@ -2003,7 +2003,7 @@ void StagingBuffer::CopyHostToDevicePinInPlace(void* dst, const void* src, size_ size_t theseBytes = (bytesRemaining > _bufferSize) ? _bufferSize : bytesRemaining; tprintf (TRACE_COPY2, "H2D: waiting... 
on completion signal handle=%lu\n", _completion_signal[bufferIndex].handle); - hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); tprintf (TRACE_COPY2, "H2D: bytesRemaining=%zu: pin-in-place:%p+%zu bufferIndex[%d]\n", bytesRemaining, srcp, theseBytes, bufferIndex); @@ -2023,7 +2023,7 @@ void StagingBuffer::CopyHostToDevicePinInPlace(void* dst, const void* src, size_ #endif tprintf (TRACE_COPY2, "H2D: bytesRemaining=%zu: async_copy %zu bytes %p to %p status=%x\n", bytesRemaining, theseBytes, _pinnedStagingBuffer[bufferIndex], dstp, hsa_status); - assert(hsa_status == HSA_STATUS_SUCCESS); // TODO - throw + assert(hsa_status == HSA_STATUS_SUCCESS); // TODO - throw srcp += theseBytes; dstp += theseBytes; @@ -2036,12 +2036,12 @@ void StagingBuffer::CopyHostToDevicePinInPlace(void* dst, const void* src, size_ } } - // TODO - + // TODO - printf ("unpin the memory\n"); for (int i=0; i<_numBuffers; i++) { - hsa_signal_wait_acquire(_completion_signal[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + hsa_signal_wait_acquire(_completion_signal[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); } } @@ -2053,10 +2053,10 @@ void StagingBuffer::CopyHostToDevicePinInPlace(void* dst, const void* src, size_ //IN: dst - dest pointer - must be accessible from host CPU. //IN: src - src pointer for copy. Must be accessible from agent this buffer is associated with (via _device) //IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency. 
-void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) +void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) { - const char *srcp = static_cast (src); - char *dstp = static_cast (dst); + const char *srcp = static_cast (src); + char *dstp = static_cast (dst); for (int i=0; i<_numBuffers; i++) { hsa_signal_store_relaxed(_completion_signal[i], 0); @@ -2069,7 +2069,7 @@ void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeByte size_t theseBytes = (bytesRemaining > _bufferSize) ? _bufferSize : bytesRemaining; tprintf (TRACE_COPY2, "H2D: waiting... on completion signal handle=%lu\n", _completion_signal[bufferIndex].handle); - hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); tprintf (TRACE_COPY2, "H2D: bytesRemaining=%zu: copy %zu bytes %p to stagingBuf[%d]:%p\n", bytesRemaining, theseBytes, srcp, bufferIndex, _pinnedStagingBuffer[bufferIndex]); // TODO - use uncached memcpy, someday. 
@@ -2085,7 +2085,7 @@ void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeByte #endif tprintf (TRACE_COPY2, "H2D: bytesRemaining=%zu: async_copy %zu bytes %p to %p status=%x\n", bytesRemaining, theseBytes, _pinnedStagingBuffer[bufferIndex], dstp, hsa_status); - assert(hsa_status == HSA_STATUS_SUCCESS); // TODO - throw + assert(hsa_status == HSA_STATUS_SUCCESS); // TODO - throw srcp += theseBytes; dstp += theseBytes; @@ -2100,7 +2100,7 @@ void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeByte for (int i=0; i<_numBuffers; i++) { - hsa_signal_wait_acquire(_completion_signal[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + hsa_signal_wait_acquire(_completion_signal[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); } } @@ -2109,10 +2109,10 @@ void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeByte //IN: dst - dest pointer - must be accessible from agent this buffer is assocaited with (via _device). //IN: src - src pointer for copy. Must be accessible from host CPU. //IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency. 
-void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) +void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) { - const char *srcp0 = static_cast (src); - char *dstp1 = static_cast (dst); + const char *srcp0 = static_cast (src); + char *dstp1 = static_cast (dst); for (int i=0; i<_numBuffers; i++) { hsa_signal_store_relaxed(_completion_signal[i], 0); @@ -2136,11 +2136,11 @@ void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeByte #else hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], srcp0, theseBytes, _device->_hsa_agent, 0, NULL, _completion_signal[bufferIndex]); #endif - assert(hsa_status == HSA_STATUS_SUCCESS); // TODO - throw + assert(hsa_status == HSA_STATUS_SUCCESS); // TODO - throw srcp0 += theseBytes; - + if (HIP_ONESHOT_COPY_DEP) { waitFor = NULL; // TODO - don't need dependency after first copy submitted? } @@ -2152,7 +2152,7 @@ void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeByte size_t theseBytes = (bytesRemaining1 > _bufferSize) ? 
_bufferSize : bytesRemaining1; tprintf (TRACE_COPY2, "D2H: wait_completion[%d] bytesRemaining=%zu\n", bufferIndex, bytesRemaining1); - hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); tprintf (TRACE_COPY2, "D2H: bytesRemaining1=%zu copy %zu bytes stagingBuf[%d]:%p to dst:%p\n", bytesRemaining1, theseBytes, bufferIndex, _pinnedStagingBuffer[bufferIndex], dstp1); memcpy(dstp1, _pinnedStagingBuffer[bufferIndex], theseBytes); @@ -2163,7 +2163,7 @@ void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeByte //for (int i=0; i<_numBuffers; i++) { - // hsa_signal_wait_acquire(_completion_signal[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + // hsa_signal_wait_acquire(_completion_signal[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); //} } @@ -2212,9 +2212,9 @@ void ihipSyncCopy(ihipStream_t *stream, void* dst, const void* src, size_t sizeB int depSignalCnt = stream->copyCommand(NULL, &depSignal, ihipCommandCopyH2D); if (HIP_PININPLACE) { - device->_staging_buffer[0]->CopyHostToDevicePinInPlace(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); + device->_staging_buffer[0]->CopyHostToDevicePinInPlace(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); } else { - device->_staging_buffer[0]->CopyHostToDevice(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); + device->_staging_buffer[0]->CopyHostToDevice(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); } // The copy waits for inputs and then completes before returning. 
@@ -2229,13 +2229,13 @@ void ihipSyncCopy(ihipStream_t *stream, void* dst, const void* src, size_t sizeB //printf ("staged-copy- read dep signals\n"); hsa_signal_t depSignal; int depSignalCnt = stream->copyCommand(NULL, &depSignal, ihipCommandCopyD2H); - device->_staging_buffer[1]->CopyDeviceToHost(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); + device->_staging_buffer[1]->CopyDeviceToHost(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); } else { // TODO - remove, slow path. hc::am_copy(dst, src, sizeBytes); } } else if (kind == hipMemcpyHostToHost) { // TODO-refactor. - memcpy(dst, src, sizeBytes); + memcpy(dst, src, sizeBytes); } else { ihipCommand_t copyType; @@ -2263,7 +2263,7 @@ void ihipSyncCopy(ihipStream_t *stream, void* dst, const void* src, size_t sizeB #endif if (hsa_status == HSA_STATUS_SUCCESS) { - hsa_signal_wait_relaxed(device->_copy_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + hsa_signal_wait_relaxed(device->_copy_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); } device->_copy_lock[HIP_DISABLE_BIDIR_MEMCPY ? 0:1].unlock(); @@ -2287,7 +2287,7 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind #if USE_AM_TRACKER try { ihipSyncCopy(stream, dst, src, sizeBytes, kind); - } + } catch (...) { e = hipErrorInvalidResourceHandle; } @@ -2365,7 +2365,7 @@ hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcp if (HIP_LAUNCH_BLOCKING) { tprintf(TRACE_SYNC, "LAUNCH_BLOCKING for completion of hipMemcpyAsync(%zu)\n", sizeBytes); stream->wait(); - } + } } else { // This path can be hit if src or dst point to unpinned host memory. // TODO-stream - does async-copy fall back to sync if input pointers are not pinned? 
@@ -2429,7 +2429,7 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s tprintf (TRACE_SYNC, "'%s' LAUNCH_BLOCKING wait for completion [stream:%p].\n", __func__, (void*)stream); cf.wait(); tprintf (TRACE_SYNC, "'%s' LAUNCH_BLOCKING completed [stream:%p].\n", __func__, (void*)stream); - } + } } else { e = hipErrorInvalidValue; } From ed96744f76d0a63360b1a518fc884a1eb8605755 Mon Sep 17 00:00:00 2001 From: Aditya Avinash Atluri Date: Fri, 26 Feb 2016 12:33:55 -0600 Subject: [PATCH 81/94] Added CUDA support for hipPointerGetAttributes [ROCm/hip commit: 6d66bd63de82231f54179a5a9568ae7b73a720f8] --- .../hip/include/nvcc_detail/hip_runtime_api.h | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/projects/hip/include/nvcc_detail/hip_runtime_api.h b/projects/hip/include/nvcc_detail/hip_runtime_api.h index 0ef4b38c67..29666cfb8f 100644 --- a/projects/hip/include/nvcc_detail/hip_runtime_api.h +++ b/projects/hip/include/nvcc_detail/hip_runtime_api.h @@ -271,6 +271,29 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att return hipCUDAErrorTohipError(cerror); } + +inline static hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr){ + cudaPointerAttributes cPA; + hipError_t err = hipCUDAErrorTohipError(cudaPointerGetAttributes(&cPA, ptr)); + if(err == hipSuccess){ + switch (cPA.memoryType){ + case cudaMemoryTypeDevice: + attributes->memoryType = hipMemoryTypeDevice; break; + case cudaMemoryTypeHost: + attributes->memoryType = hipMemoryTypeHost; break; + default: + return hipErrorUnknownSymbol; + } + attributes->device = cPA.device; + attributes->devicePointer = cPA.devicePointer; + attributes->hostPointer = cPA.hostPointer; + attributes->isManaged = 0; + attributes->allocationFlags = 0; + } + return err; +} + + inline static hipError_t hipMemGetInfo( size_t* free, size_t* total) { return hipCUDAErrorTohipError(cudaMemGetInfo(free,total)); From 
433cf742b9db748974f8c46eb64b0981304c26d5 Mon Sep 17 00:00:00 2001 From: Aditya Avinash Atluri Date: Fri, 26 Feb 2016 13:47:58 -0600 Subject: [PATCH 82/94] Null argument handling HIPCC This change makes hipcc handle no-arguments passed to it. It solves https://github.com/AMDComputeLibraries/HIP-privatestaging/issues/1 [ROCm/hip commit: 3238185276e7834e6ae70b9fa89e1bdf1034480c] --- projects/hip/bin/hipcc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/projects/hip/bin/hipcc b/projects/hip/bin/hipcc index 1ab4cf2759..ee58ba535a 100755 --- a/projects/hip/bin/hipcc +++ b/projects/hip/bin/hipcc @@ -19,6 +19,10 @@ use File::Basename; # HCC_HOME : Path to HCC SDK (default /opt/hcc). Used on AMD platforms only. # HSA_PATH : Path to HSA dir (default /opt/hsa). Used on AMD platforms only. +if(scalar @ARGV == 0){ +print "No Arguments passed, exiting ...\n"; +exit(-1); +} $verbose = $ENV{'HIPCC_VERBOSE'}; $verbose = 0 unless defined $verbose; From e7631f28e5f70ef230a241c4e087486e2becbab2 Mon Sep 17 00:00:00 2001 From: Aditya Avinash Atluri Date: Fri, 26 Feb 2016 18:50:40 -0600 Subject: [PATCH 83/94] Corrected hipPointerGetAttribute Made hipPointerGetAttribute work same as cudaPointerGetAttribute for HCC [ROCm/hip commit: 66aa7f2f8a76cdda09090427759d0a9383056770] --- projects/hip/src/hip_hcc.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index e3770b7001..aa2cae652e 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -1732,7 +1732,12 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr) attributes->hostPointer = amPointerInfo._hostPointer; attributes->devicePointer = amPointerInfo._devicePointer; attributes->isManaged = 0; - + if(attributes->memoryType == hipMemoryTypeHost){ + attributes->hostPointer = ptr; + } + if(attributes->memoryType == hipMemoryTypeDevice){ + attributes->devicePointer = ptr; + } attributes->allocationFlags 
= amPointerInfo._appAllocationFlags; attributes->device = amPointerInfo._appId; From 73fa2a98648c4face3837da9f76a1775fef9c438 Mon Sep 17 00:00:00 2001 From: Aditya Avinash Atluri Date: Fri, 26 Feb 2016 19:38:22 -0600 Subject: [PATCH 84/94] Update README.md [ROCm/hip commit: 0fda8711d5c2de5e68da513fd0d93984a3852b00] --- projects/hip/tests/README.md | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/projects/hip/tests/README.md b/projects/hip/tests/README.md index 30c7173b35..bc8a9772f3 100644 --- a/projects/hip/tests/README.md +++ b/projects/hip/tests/README.md @@ -1,34 +1,39 @@ Tests uses CMAKE as teh build infrastructure. Use : - -> mkdir build -> cd build -> cmake ../src -> make -> make test - +``` +$ mkdir build +$ cd build +$ cmake ../src +$ make +$ make test +``` #----- -# How to add a new test; +## How to add a new test # edit src/CMakeFiles to add the test: # add the executable and list of required CPP files, ie: -# make_test (EXE CPP_FILES) -> make_hip_executable (hipMemset hipMemset.cpp) +``` +make_test (EXE CPP_FILES) +make_hip_executable (hipMemset hipMemset.cpp) +``` # Add to automated Test framework: -# make_test (TESTNAME ARGS) -> make_test(hipMemset " ") - +``` +make_test (TESTNAME ARGS) +make_test(hipMemset " ") +``` # Running tests: +``` make test +``` # Run a specific test: +``` ./hipMemset - - +``` From 52f2fc5b957a1a65a17cdb6c9829065e6acff96d Mon Sep 17 00:00:00 2001 From: Aditya Avinash Atluri Date: Fri, 26 Feb 2016 19:39:14 -0600 Subject: [PATCH 85/94] Update README.md [ROCm/hip commit: 875403fec889b0ea9601b774d258c870ad6d6ccd] --- projects/hip/tests/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/projects/hip/tests/README.md b/projects/hip/tests/README.md index bc8a9772f3..96de22b558 100644 --- a/projects/hip/tests/README.md +++ b/projects/hip/tests/README.md @@ -10,24 +10,24 @@ $ make test ``` #----- -## How to add a new test +### How to add a new 
test -# edit src/CMakeFiles to add the test: +edit src/CMakeFiles to add the test: -# add the executable and list of required CPP files, ie: +### add the executable and list of required CPP files, ie: ``` make_test (EXE CPP_FILES) make_hip_executable (hipMemset hipMemset.cpp) ``` -# Add to automated Test framework: +### Add to automated Test framework: ``` make_test (TESTNAME ARGS) make_test(hipMemset " ") ``` -# Running tests: +### Running tests: ``` make test ``` From ec7375ea3cb3b3032ab8f58da08f0a056cdef98b Mon Sep 17 00:00:00 2001 From: Aditya Avinash Atluri Date: Fri, 26 Feb 2016 19:42:07 -0600 Subject: [PATCH 86/94] Updated hipPointerAttrib.cpp to pass HC backend The changes are according to the new API function change in HC backend for HIP. Especially hipPointerGetAttribute(...) [ROCm/hip commit: 4c354c988878fc8b31fb93e5bc12cbab367ef02e] --- projects/hip/tests/src/hipPointerAttrib.cpp | 42 ++++++++++----------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/projects/hip/tests/src/hipPointerAttrib.cpp b/projects/hip/tests/src/hipPointerAttrib.cpp index 6928ec9a64..8bed5af869 100644 --- a/projects/hip/tests/src/hipPointerAttrib.cpp +++ b/projects/hip/tests/src/hipPointerAttrib.cpp @@ -38,7 +38,7 @@ size_t Nbytes = 0; // Utility Functions: //================================================================================================= -bool operator==(const hipPointerAttribute_t &lhs, const hipPointerAttribute_t &rhs) +bool operator==(const hipPointerAttribute_t &lhs, const hipPointerAttribute_t &rhs) { return ((lhs.hostPointer == rhs.hostPointer) && (lhs.devicePointer == rhs.devicePointer) && @@ -50,7 +50,7 @@ bool operator==(const hipPointerAttribute_t &lhs, const hipPointerAttribute_t &r }; -bool operator!=(const hipPointerAttribute_t &lhs, const hipPointerAttribute_t &rhs) +bool operator!=(const hipPointerAttribute_t &lhs, const hipPointerAttribute_t &rhs) { return ! 
(lhs == rhs); } @@ -66,7 +66,7 @@ const char *memoryTypeToString(hipMemoryType memoryType) } -void resetAttribs(hipPointerAttribute_t *attribs) +void resetAttribs(hipPointerAttribute_t *attribs) { attribs->hostPointer = (void*) (-1); attribs->devicePointer = (void*) (-1); @@ -77,9 +77,9 @@ void resetAttribs(hipPointerAttribute_t *attribs) }; -void printAttribs(const hipPointerAttribute_t *attribs) +void printAttribs(const hipPointerAttribute_t *attribs) { - printf ("hostPointer:%p devicePointer:%p memoryType:%s deviceId:%d isManaged:%d allocationFlags:%u\n", + printf ("hostPointer:%p devicePointer:%p memoryType:%s deviceId:%d isManaged:%d allocationFlags:%u\n", attribs->hostPointer, attribs->devicePointer, memoryTypeToString(attribs->memoryType), @@ -90,7 +90,7 @@ void printAttribs(const hipPointerAttribute_t *attribs) }; -inline int zrand(int max) +inline int zrand(int max) { return rand() % max; } @@ -101,7 +101,7 @@ inline int zrand(int max) //================================================================================================= //-- //Run through a couple simple cases to test lookups and host pointer arithmetic: -void testSimple() +void testSimple() { printf ("\n"); printf ("===========================================================================\n"); @@ -135,22 +135,22 @@ void testSimple() resetAttribs(&attribs2); HIPCHECK( hipPointerGetAttributes(&attribs2, A_d+100)); printf("getAttr:%-20s", "A_d+100"); printAttribs(&attribs2); - HIPASSERT(attribs == attribs2); + HIPASSERT((char*)attribs.devicePointer+100 == (char*)attribs2.devicePointer); // Corner case at end of array: resetAttribs(&attribs2); HIPCHECK( hipPointerGetAttributes(&attribs2, A_d+Nbytes-1)); - printf("getAttr:%-20s", "A_d+NBytes-1"); printAttribs(&attribs2); - HIPASSERT(attribs == attribs2); + printf("getAttr:%-20s", "A_d+Nbytes-1"); printAttribs(&attribs2); + HIPASSERT((char*)attribs.devicePointer+Nbytes-1 == (char*)attribs2.devicePointer); // Pointer just beyond array - must be 
invalid or at least a different pointer resetAttribs(&attribs2); e = hipPointerGetAttributes(&attribs2, A_d+Nbytes+1); - printf("getAttr:%-20s err=%d (%s), neg-test expected\n", "A_d+NBytes", e, hipGetErrorString(e)); + printf("getAttr:%-20s err=%d (%s), neg-test expected\n", "A_d+NBytes", e, hipGetErrorString(e)); if (e != hipErrorInvalidValue) { // We might have strayed into another pointer area. printf("getAttr:%-20s", "A_d+NBytes"); printAttribs(&attribs2); - HIPASSERT(attribs.devicePointer != attribs2.devicePointer); + HIPASSERT((char*)attribs.devicePointer != (char*)attribs2.devicePointer); } @@ -174,26 +174,26 @@ void testSimple() resetAttribs(&attribs2); HIPCHECK( hipPointerGetAttributes(&attribs2, A_Pinned_h+Nbytes/2)); printf("getAttr:%-20s", "A_pinned_h+NBytes/2"); printAttribs(&attribs2); - HIPASSERT(attribs == attribs2); + HIPASSERT((char*)attribs.hostPointer+Nbytes/2 == (char*)attribs2.hostPointer); hipFreeHost(A_Pinned_h); e = hipPointerGetAttributes(&attribs, A_Pinned_h); HIPASSERT(e == hipErrorInvalidValue); // Just freed the pointer, this should return an error. - printf("getAttr:%-20s err=%d (%s), neg-test expected\n", "A_d+NBytes", e, hipGetErrorString(e)); + printf("getAttr:%-20s err=%d (%s), neg-test expected\n", "A_d+NBytes", e, hipGetErrorString(e)); // OS memory printf ("\nOS-allocated memory (malloc)\n"); e = hipPointerGetAttributes(&attribs, A_OSAlloc_h); - printf("getAttr:%-20s err=%d (%s), neg-test expected\n", "A_OSAlloc_h", e, hipGetErrorString(e)); + printf("getAttr:%-20s err=%d (%s), neg-test expected\n", "A_OSAlloc_h", e, hipGetErrorString(e)); HIPASSERT(e == hipErrorInvalidValue); // OS-allocated pointers should return hipErrorInvalidValue. } //--- //Reset the memory tracker (remove allocations from all known devices): //This frees any memory allocated through the runtime. 
-//The routine will not release any +//The routine will not release any void resetTracker () { if (p_verbose & 0x1) { @@ -232,8 +232,8 @@ void checkPointer(SuperPointerAttribute &ref, int major, int minor, void *pointe HIPCHECK(e); printf(" ref :: "); printAttribs(&ref._attrib); printf(" getattr:: "); printAttribs(&attribs); - - HIPASSERT(attribs == ref._attrib); + + HIPASSERT(attribs != ref._attrib); } else { if (p_verbose & 0x1) { printf("#%4d.%d GOOD:%p getattr :: ",major, minor, pointer); printAttribs(&attribs); @@ -303,7 +303,7 @@ void clusterAllocs(int numAllocs, size_t minSize, size_t maxSize) size_t free, total; HIPCHECK(hipSetDevice(i)); HIPCHECK(hipMemGetInfo(&free, &total)); - printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) clusterAllocTotalDevice=%lu (%4.2fMB) total=%zu (%4.2fMB)\n", + printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) clusterAllocTotalDevice=%lu (%4.2fMB) total=%zu (%4.2fMB)\n", i, free, (float)(free/1024.0/1024.0), totalDeviceAllocated[i], (float)(totalDeviceAllocated[i])/1024.0/1024.0, total, (float)(total/1024.0/1024.0)); HIPASSERT(free + totalDeviceAllocated[i] <= total); } @@ -432,9 +432,9 @@ void thread_noise_generator(int iters, size_t numBuffers, Dir addDir, Dir remove //--- //Multi-thread test that is effective at catching locking errors in the alloc/dealloc/tracker. //The query thread repeately requests information on the same block of memory. -//Meanwhile, the thread_noise_generator registers a large number of blocks, and +//Meanwhile, the thread_noise_generator registers a large number of blocks, and //then unregisters them. This causes a large amount of rebalancing in the tree -//structure and will generate errors unless the locks in the tracker are preventing reading +//structure and will generate errors unless the locks in the tracker are preventing reading //while writing. 
void testMultiThreaded_2() { From 66d6dbf6861e359ae81a5f4b688d6c9218d661e3 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Sat, 27 Feb 2016 03:31:30 -0600 Subject: [PATCH 87/94] disable rocrv2, properly [ROCm/hip commit: ea09557e1be911f276cc133ff9f34d67c09837fe] --- projects/hip/bin/hipcc | 2 +- projects/hip/docs/markdown/hip_porting_guide.md | 2 +- projects/hip/src/hip_hcc.cpp | 5 ++--- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/projects/hip/bin/hipcc b/projects/hip/bin/hipcc index ee58ba535a..29163d7c02 100755 --- a/projects/hip/bin/hipcc +++ b/projects/hip/bin/hipcc @@ -168,7 +168,7 @@ if ($needHipHcc) { if ((not -e $object) or ((stat($source))[9] > (stat($object))[9])) { my $CMD = "$HCC $HCCFLAGS -I$HSA_PATH/include -I$HIP_PATH/include -Wall -c $source -o $object"; if ($verbose & 0x10) { - $CMD .= " -g -O2" ; + $CMD .= " -g -O0" ; } else { $CMD .= " -O3" ; } diff --git a/projects/hip/docs/markdown/hip_porting_guide.md b/projects/hip/docs/markdown/hip_porting_guide.md index 9f1c7c67bd..cb599a5c4a 100644 --- a/projects/hip/docs/markdown/hip_porting_guide.md +++ b/projects/hip/docs/markdown/hip_porting_guide.md @@ -290,7 +290,7 @@ hipcc adds the necessary libraries for HIP as well as for the accelerator compil ### -lm Option -hcc does not add “-lm” by default. If you see errors about missing math functions at link time (e.g., "sqrt@@GLIBC_2.2.5"), ensure that “-lm” is in the link options. +hipcc adds -lm by default to the link command. ## Linking Code With Other Compilers diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index c2b4d5ffe7..5ea2c88d56 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -792,7 +792,7 @@ hipError_t ihipDevice_t::getProperties(hipDeviceProp_t* prop) // Group memory will not be paged out, so, the physical memory size is the total shared memory size, and also equal to the group region size. 
prop->maxSharedMemoryPerMultiProcessor = prop->totalGlobalMem; -#ifdef USE_ROCR_V2 +#if USE_ROCR_V2 // Get Max memory clock frequency err = hsa_region_get_info(*am_region, (hsa_region_info_t)HSA_AMD_REGION_INFO_MAX_CLOCK_FREQUENCY, &prop->memoryClockRate); DeviceErrorCheck(err); @@ -1269,12 +1269,10 @@ hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device) *pi = prop->regsPerBlock; break; case hipDeviceAttributeClockRate: *pi = prop->clockRate; break; -#ifdef USE_ROCR_V2 case hipDeviceAttributeMemoryClockRate: *pi = prop->memoryClockRate; break; case hipDeviceAttributeMemoryBusWidth: *pi = prop->memoryBusWidth; break; -#endif case hipDeviceAttributeMultiprocessorCount: *pi = prop->multiProcessorCount; break; case hipDeviceAttributeComputeMode: @@ -2309,6 +2307,7 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind #endif /** * @result #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidMemcpyDirection, #hipErrorInvalidValue + * @warning on HCC hipMemcpyAsync does not support overlapped H2D and D2H copies. 
*/ //--- hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream) From 0201608b4467ee38ee6cf54130a50e72e4210e8f Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Sat, 27 Feb 2016 05:44:57 -0600 Subject: [PATCH 88/94] Added test to check dispatches on single stream [ROCm/hip commit: f134c6ccb6865a51728c09ac42ed620f679038eb] --- projects/hip/tests/src/CMakeLists.txt | 7 +- projects/hip/tests/src/hipStream.h | 102 ++++ projects/hip/tests/src/hipStreamL5.cpp | 785 +++++++++++++++++++++++++ 3 files changed, 892 insertions(+), 2 deletions(-) create mode 100644 projects/hip/tests/src/hipStream.h create mode 100644 projects/hip/tests/src/hipStreamL5.cpp diff --git a/projects/hip/tests/src/CMakeLists.txt b/projects/hip/tests/src/CMakeLists.txt index cc6af0b5d2..3da90ae009 100644 --- a/projects/hip/tests/src/CMakeLists.txt +++ b/projects/hip/tests/src/CMakeLists.txt @@ -103,13 +103,13 @@ macro (make_test_matches exe match_string) ) endmacro() -make_hip_executable (hip_ballot hip_ballot.cpp) +#make_hip_executable (hip_ballot hip_ballot.cpp) make_hip_executable (hip_anyall hip_anyall.cpp) make_hip_executable (hip_popc hip_popc.cpp) make_hip_executable (hip_clz hip_clz.cpp) make_hip_executable (hip_brev hip_brev.cpp) make_hip_executable (hip_ffs hip_ffs.cpp) -make_hip_executable (hipGetDeviceAttribute hipGetDeviceAttribute.cpp) +#make_hip_executable (hipGetDeviceAttribute hipGetDeviceAttribute.cpp) make_hip_executable (hipMemcpy hipMemcpy.cpp) make_hip_executable (hipMemcpyAsync hipMemcpyAsync.cpp) make_hip_executable (hipMemset hipMemset.cpp) @@ -123,6 +123,7 @@ make_hip_executable (hipMathFunctionsDevice hipMathFunctions.cpp hipSinglePrecis make_hip_executable (hipIntrinsics hipMathFunctions.cpp hipSinglePrecisionIntrinsics.cpp hipDoublePrecisionIntrinsics.cpp hipIntegerIntrinsics.cpp) make_hip_executable (hipPointerAttrib hipPointerAttrib.cpp) make_hip_executable (hipMultiThreadStreams hipMultiThreadStreams.cpp) 
+make_hip_executable (hipStreamL5 hipStreamL5.cpp) target_link_libraries(hipMathFunctionsHost m) make_test(hip_ballot " " ) @@ -145,4 +146,6 @@ make_test(hipMemcpyAsync " " ) make_test(hipHcc " " ) +make_test(hipStreamL5 " ") + make_hipify_test(specialFunc.cu ) diff --git a/projects/hip/tests/src/hipStream.h b/projects/hip/tests/src/hipStream.h new file mode 100644 index 0000000000..f9ec3472d0 --- /dev/null +++ b/projects/hip/tests/src/hipStream.h @@ -0,0 +1,102 @@ +#ifndef HIPSTREAM_H +#define HIPSTREAM_H +#include + +#define NUM_STREAMS 4 + +/* +* H2H - 1 +* H2D - 2 +* KER - 3 +* D2D - 4 +* D2H - 5 +*/ + +template +void H2HAsync(T *Dst, T *Src, size_t size, hipStream_t stream){ + HIPCHECK(hipMemcpyAsync(Dst, Src, size, hipMemcpyHostToHost, stream)); +} + +template +void H2DAsync(T *Dst, T *Src, size_t size, hipStream_t stream){ + HIPCHECK(hipMemcpyAsync(Dst, Src, size, hipMemcpyHostToDevice, stream)); +} + +template +void D2DAsync(T *Dst, T *Src, size_t size, hipStream_t stream){ + HIPCHECK(hipMemcpyAsync(Dst, Src, size, hipMemcpyDeviceToDevice, stream)); +} + +template +void D2HAsync(T *Dst, T *Src, size_t size, hipStream_t stream){ + HIPCHECK(hipMemcpyAsync(Dst, Src, size, hipMemcpyDeviceToHost, stream)); +} + +template +void H2H(T *Dst, T *Src, size_t size){ + HIPCHECK(hipMemcpy(Dst, Src, size, hipMemcpyHostToHost)); +} + +template +void H2D(T *Dst, T *Src, size_t size){ + HIPCHECK(hipMemcpy(Dst, Src, size, hipMemcpyHostToDevice)); +} + +template +void D2D(T *Dst, T *Src, size_t size){ + HIPCHECK(hipMemcpy(Dst, Src, size, hipMemcpyDeviceToDevice)); +} + +template +void D2H(T *Dst, T *Src, size_t size){ + HIPCHECK(hipMemcpy(Dst, Src, size, hipMemcpyDeviceToHost)); +} + +template +__global__ void Inc(hipLaunchParm lp, T *In){ +int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; +In[tx] = In[tx] + 1; +} + +template +void initArrays(T **Ad, T **Ah, + size_t N, bool usePinnedHost=false){ + size_t NBytes = N * sizeof(T); + if(Ad){ + HIPCHECK( hipMalloc(Ad, 
NBytes)); + } + if(usePinnedHost){ + HIPCHECK( hipMallocHost(Ah, NBytes)); + } + else{ + *Ah = new T[N]; + HIPASSERT(*Ah != NULL); + } +} + +template +void initArrays(T **Ad, size_t N, + bool deviceMemory = false, + bool usePinnedHost = false){ + size_t NBytes = N * sizeof(T); + if(deviceMemory){ + HIPCHECK( hipMalloc(Ad, NBytes)); + }else{ + if(usePinnedHost){ + HIPCHECK(hipMallocHost(Ad, NBytes)); + }else{ + *Ad = new T[N]; + HIPASSERT(*Ad != NULL); + } + } +} + +template +void setArray(T* Array, int N, T val){ +for(int i=0;i +#include"hip_runtime.h" +#include"test_common.h" +#include"hipStream.h" + +template +void test12345(){ + hipStream_t stream; + HIPCHECK(hipStreamCreate(&stream)); + + const size_t N = 1000; + const size_t size = sizeof(T) * N; + + T *Ah, *Bh, *Ch; + T *Ad, *Bd; + initArrays(&Ad, &Ah, N, true); + initArrays(&Bd, &Bh, N, true); + initArrays(&Ch, N, false, true); + + setArray(Ah, N, T(1)); + + H2HAsync(Bh, Ah, size, stream); + H2DAsync(Ad, Bh, size, stream); + hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Ad); + D2DAsync(Bd, Ad, size, stream); + D2HAsync(Ch, Bd, size, stream); + HIPCHECK(hipDeviceSynchronize()); + + HIPASSERT(Ah[10] + T(1)== Ch[10]); + HIPCHECK(hipStreamDestroy(stream)); +} + +template +void test13452(){ + hipStream_t stream; + HIPCHECK(hipStreamCreate(&stream)); + + const size_t N = 1000; + const size_t size = sizeof(T) * N; + + T *Ah, *Bh, *Ch; + T *Dh, *Eh; + T *Ad, *Bd, *Cd; + + initArrays(&Ah, N, false, true); + initArrays(&Bh, N, false, true); + initArrays(&Ch, N, false, true); + initArrays(&Dh, N, false, false); + initArrays(&Eh, N, false, false); + initArrays(&Ad, N, true, false); + initArrays(&Bd, N, true, false); + initArrays(&Cd, N, true, false); + + setArray(Ah, N, T(1)); + setArray(Dh, N, T(2)); + + H2D(Ad, Dh, size); + + H2HAsync(Bh, Ah, size, stream); + hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Ad); + D2DAsync(Bd, Ad, size, stream); + D2HAsync(Ch, Bd, 
size, stream); + H2DAsync(Cd, Ch, size, stream); + HIPCHECK(hipDeviceSynchronize()); + + D2H(Eh,Cd,size); + + HIPASSERT(Ah[10] == Bh[10]); + HIPASSERT(Eh[10] == Dh[10] + T(1)); + +} + +template +void test14523(){ + hipStream_t stream; + HIPCHECK(hipStreamCreate(&stream)); + + const int N = 1000; + const size_t size = sizeof(T) * N; + + T *Ah, *Bh, *Ch; + T *Dh, *Eh; + T *Ad, *Bd, *Cd; + + initArrays(&Ah, N, false, true); + initArrays(&Bh, N, false, true); + initArrays(&Ch, N, false, true); + initArrays(&Dh, N, false, false); + initArrays(&Eh, N, false, false); + initArrays(&Ad, N, true, false); + initArrays(&Bd, N, true, false); + initArrays(&Cd, N, true, false); + + setArray(Ah, N, T(1)); + setArray(Dh, N, T(2)); + + H2D(Ad,Dh,size); + + H2HAsync(Bh, Ah, size, stream); + D2DAsync(Bd, Ad, size, stream); + D2HAsync(Ch, Bd, size, stream); + H2DAsync(Cd, Ch, size, stream); + hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Cd); + + HIPCHECK(hipDeviceSynchronize()); + + D2H(Eh, Cd, size); + + HIPASSERT(Ah[10] == Bh[10]); + HIPASSERT(Ch[10] + T(1) == Eh[10]); +} + +template +void test15234(){ + hipStream_t stream; + HIPCHECK(hipStreamCreate(&stream)); + + const size_t N = 1000; + const size_t size = sizeof(T) * N; + + T *Ah, *Bh, *Ch; + T *Dh, *Eh; + T *Ad, *Bd, *Cd; + + initArrays(&Ah, N, false, true); + initArrays(&Bh, N, false, true); + initArrays(&Ch, N, false, true); + initArrays(&Dh, N, false, false); + initArrays(&Eh, N, false, false); + initArrays(&Ad, N, true, false); + initArrays(&Bd, N, true, false); + initArrays(&Cd, N, true, false); + + setArray(Ah, N, T(1)); + setArray(Dh, N, T(2)); + + H2D(Ad, Dh, size); + + H2HAsync(Bh, Ah, size, stream); + D2HAsync(Ch, Ad, size, stream); + H2DAsync(Bd, Ch, size, stream); + hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Bd); + D2DAsync(Cd, Bd, size, stream); + + D2H(Eh, Cd, size); + + HIPASSERT(Ah[10] == Bh[10]); + HIPASSERT(Eh[10] == Dh[10] + T(1)); + +} + +template 
+void test23451(){ +hipStream_t stream; +HIPCHECK(hipStreamCreate(&stream)); +const size_t N = 1000; +const size_t size = sizeof(T) * N; + +T *Ah, *Bh, *Ch; +T *Ad, *Bd; + +initArrays(&Ah, N, false, true); +initArrays(&Bh, N, false, true); +initArrays(&Ch, N, false, true); +initArrays(&Ad, N, true, false); +initArrays(&Bd, N, true, false); + +setArray(Ah, N, T(1)); + +H2DAsync(Ad, Ah, size, stream); +hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Ad); +D2DAsync(Bd, Ad, size, stream); +D2HAsync(Bh, Bd, size, stream); +H2HAsync(Ch, Bh, size, stream); +HIPCHECK(hipDeviceSynchronize()); +//HIPASSERT(Ah[10] == Ch[10]); +} + +template +void test24513(){ +hipStream_t stream; +HIPCHECK(hipStreamCreate(&stream)); + +const size_t N = 1000; +const size_t size = sizeof(T) * N; + +T *Ah, *Bh, *Ch; +T *Dh, *Eh; +T *Ad, *Bd, *Cd; + +initArrays(&Ah, N, false, true); +initArrays(&Bh, N, false, true); +initArrays(&Ch, N, false, true); +initArrays(&Dh, N, false, false); +initArrays(&Eh, N, false, false); +initArrays(&Ad, N, true, false); +initArrays(&Bd, N, true, false); +initArrays(&Cd, N, true, false); + +setArray(Ah, N, T(1)); +setArray(Dh, N, T(2)); + +H2D(Cd, Dh, size); + +H2DAsync(Ad, Ah, size, stream); +D2DAsync(Bd, Ad, size, stream); +D2HAsync(Bh, Bd, size, stream); +H2HAsync(Ch, Bh, size, stream); +hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Cd); +HIPCHECK(hipDeviceSynchronize()); + +D2H(Eh, Cd, size); + +HIPASSERT(Eh[0] == Dh[0] + T(1)); +//HIPASSERT(Ah[0] == Ch[0]); +} + +template +void test25134(){ +hipStream_t stream; +HIPCHECK(hipStreamCreate(&stream)); + +const size_t N = 1000; +const size_t size = sizeof(T) * N; + +T *Ah, *Bh, *Ch; +T *Dh, *Eh; +T *Ad, *Bd, *Cd; + +initArrays(&Ah, N, false, true); +initArrays(&Bh, N, false, true); +initArrays(&Ch, N, false, true); +initArrays(&Dh, N, false, false); +initArrays(&Eh, N, false, false); +initArrays(&Ad, N, true, false); +initArrays(&Bd, N, true, false); 
+initArrays(&Cd, N, true, false); + +setArray(Ah, N, T(1)); +setArray(Dh, N, T(2)); + +H2D(Bd, Dh, size); + +H2DAsync(Ad, Ah, size, stream); +D2HAsync(Bh, Ad, size, stream); +H2HAsync(Ch, Bh, size, stream); +hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Bd); +D2DAsync(Cd, Bd, size, stream); + +D2H(Eh, Cd, size); + +HIPCHECK(hipDeviceSynchronize()); + +//HIPASSERT(Ah[10] == Ch[10]); +HIPASSERT(Dh[10] + T(1) == Eh[10]); + +} + +template +void test21345(){ +hipStream_t stream; +HIPCHECK(hipStreamCreate(&stream)); + +const size_t N = 1000; +const size_t size = N * sizeof(T); + +T *Ah, *Bh, *Ch, *Dh; +T *Ad, *Bd; + +initArrays(&Ah, N, false, true); +initArrays(&Bh, N, false, true); +initArrays(&Ch, N, false, true); +initArrays(&Dh, N, false, true); +initArrays(&Ad, N, true, false); +initArrays(&Bd, N, true, false); + +setArray(Ah, N, T(1)); +setArray(Bh, N, T(2)); + +H2DAsync(Ad, Ah, size, stream); +H2HAsync(Ch, Bh, size, stream); +hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Ad); +D2DAsync(Bd, Ad, size, stream); +D2HAsync(Dh, Bd, size, stream); + +HIPCHECK(hipDeviceSynchronize()); + +HIPASSERT( Bh[10] == Ch[10] ); +HIPASSERT( Ah[10] + T(1) == Dh[10]); +} + +template +void test34512(){ +hipStream_t stream; +HIPCHECK(hipStreamCreate(&stream)); + +const size_t N = 1000; +const size_t size = N * sizeof(T); + +T *Bh, *Ch, *Dh; +T *Ah, *Eh; +T *Ad, *Bd, *Cd; + +initArrays(&Bh, N, false, true); +initArrays(&Ch, N, false, true); +initArrays(&Dh, N, false, true); +initArrays(&Ah, N, false, false); +initArrays(&Eh, N, false, false); +initArrays(&Ad, N, true, false); +initArrays(&Bd, N, true, false); +initArrays(&Cd, N, true, false); + +setArray(Ah, N, T(1)); + +H2D(Ad, Ah, size); + +hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Ad); +D2DAsync(Bd, Ad, size, stream); +D2HAsync(Bh, Bd, size, stream); +H2HAsync(Ch, Bh, size, stream); +H2DAsync(Cd, Ch, size, stream); + +D2H(Dh, Cd, size); + 
+HIPCHECK(hipDeviceSynchronize()); +//HIPASSERT( Ah[10] + T(1) == Dh[10] ); +} + +template +void test35124(){ +hipStream_t stream; +HIPCHECK(hipStreamCreate(&stream)); + +const size_t N = 1000; +const size_t size = N * sizeof(T); + +T *Ah, *Bh; +T *Ch, *Dh; +T *Ad, *Bd, *Cd; + +initArrays(&Ah, N, false, true); +initArrays(&Bh, N, false, true); +initArrays(&Ch, N, false, false); +initArrays(&Dh, N, false, false); +initArrays(&Ad, N, true, false); +initArrays(&Bd, N, true, false); +initArrays(&Cd, N, true, false); + +setArray(Dh, N, T(1)); + +H2D(Ad, Dh, size); + +hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Ad); +D2HAsync(Ah, Ad, size, stream); +H2HAsync(Bh, Ah, size, stream); +H2DAsync(Bd, Bh, size, stream); +D2DAsync(Cd, Bd, size, stream); + +D2H(Ch, Cd, size); + +HIPCHECK(hipDeviceSynchronize()); + +HIPASSERT(Dh[10] + T(1) == Ch[10]); +} + +template +void test31245(){ +hipStream_t stream; +HIPCHECK(hipStreamCreate(&stream)); + +const size_t N = 1000; +const size_t size = N * sizeof(T); +T *Ah, *Bh, *Ch; +T *Dh, *Eh; +T *Ad, *Bd, *Cd; + +initArrays(&Ah, N, false, true); +initArrays(&Bh, N, false, true); +initArrays(&Ch, N, false, true); +initArrays(&Dh, N, false, false); +initArrays(&Eh, N, false, false); +initArrays(&Ad, N, true, false); +initArrays(&Bd, N, true, false); +initArrays(&Cd, N, true, false); + +setArray(Dh, N, T(1)); +setArray(Ah, N, T(2)); + +H2D(Ad, Dh, size); + +hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Ad); +H2HAsync(Bh, Ah, size, stream); +H2DAsync(Bd, Bh, size, stream); +D2DAsync(Cd, Bd, size, stream); +D2HAsync(Ch, Cd, size, stream); + +D2H(Eh, Ad, size); + +HIPCHECK(hipDeviceSynchronize()); + +HIPASSERT(Dh[10] + T(1) == Eh[10]); +HIPASSERT(Bh[10] == Ch[10]); +} + + +template +void test32451(){ +hipStream_t stream; +HIPCHECK(hipStreamCreate(&stream)); + +const size_t N = 1000; +const size_t size = N * sizeof(T); + +T *Ah, *Bh, *Ch; +T *Dh, *Eh; +T *Ad, *Bd, *Cd; + +initArrays(&Ah, N, 
false, true); +initArrays(&Bh, N, false, true); +initArrays(&Ch, N, false, true); +initArrays(&Dh, N, false, false); +initArrays(&Eh, N, false, false); +initArrays(&Ad, N, true, false); +initArrays(&Bd, N, true, false); +initArrays(&Cd, N, true, false); + +setArray(Ah, N, T(1)); +setArray(Eh, N, T(2)); + +H2D(Ad, Eh, size); +hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Ad); +H2DAsync(Bd, Ah, size, stream); +D2DAsync(Cd, Bd, size, stream); +D2HAsync(Bh, Cd, size, stream); +H2HAsync(Ch, Bh, size, stream); +HIPCHECK(hipDeviceSynchronize()); +D2H(Dh, Ad, size); + +//HIPASSERT(Ah[10] == Ch[10]); +HIPASSERT(Eh[10] + T(1) == Dh[10]); + +} + +template +void test45123(){ +hipStream_t stream; +HIPCHECK(hipStreamCreate(&stream)); +const size_t N = 1000; +const size_t size = N * sizeof(T); + +T *Ah, *Bh; +T *Ch, *Dh; +T *Ad, *Bd, *Cd; + +initArrays(&Ah, N, false, true); +initArrays(&Bh, N, false, true); +initArrays(&Ch, N, false, false); +initArrays(&Dh, N, false, false); +initArrays(&Ad, N, true, false); +initArrays(&Bd, N, true, false); +initArrays(&Cd, N, true, false); + +setArray(Dh, N, T(1)); + +H2D(Ad, Dh, size); + +D2DAsync(Bd, Ad, size, stream); +D2HAsync(Ah, Bd, size, stream); +H2HAsync(Bh, Ah, size, stream); +H2DAsync(Cd, Bh, size, stream); +hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Cd); +D2H(Ch, Cd, size); +HIPCHECK(hipDeviceSynchronize()); + +HIPASSERT(Dh[10] + T(1) == Ch[10]); + +} + + +template +void test41235(){ +hipStream_t stream; +HIPCHECK(hipStreamCreate(&stream)); +const size_t N = 1000; +const size_t size = N * sizeof(T); + +T *Ah, *Bh; +T *Ch; +T *Ad, *Bd, *Cd; + +initArrays(&Ah, N, false, true); +initArrays(&Bh, N, false, true); +initArrays(&Ch, N, false, false); +initArrays(&Ad, N, true, false); +initArrays(&Bd, N, true, false); +initArrays(&Cd, N, true, false); + +setArray(Ch, N, T(1)); + +H2D(Ad, Ch, size); + +D2DAsync(Bd, Ad, size, stream); +D2HAsync(Ah, Bd, size, stream); +H2DAsync(Cd, Ah, 
size, stream); +hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Cd); +D2HAsync(Bh, Cd, size, stream); + +HIPCHECK(hipDeviceSynchronize()); + +HIPASSERT(Ch[10] + T(1) == Bh[10]); +} + +template +void test42351(){ +hipStream_t stream; +HIPCHECK(hipStreamCreate(&stream)); + +const size_t N = 1000; +const size_t size = N * sizeof(T); + +T *Ah, *Bh, *Ch; +T *Dh, *Eh; +T *Ad, *Bd, *Cd; + +initArrays(&Ah, N, false, true); +initArrays(&Bh, N, false, true); +initArrays(&Ch, N, false, true); +initArrays(&Dh, N, false, false); +initArrays(&Eh, N, false, false); +initArrays(&Ad, N, true, false); +initArrays(&Bd, N, true, false); +initArrays(&Cd, N, true, false); + +setArray(Dh, N, T(2)); +setArray(Ah, N, T(1)); + +H2D(Ad, Dh, size); + +D2DAsync(Bd, Ad, size, stream); +H2DAsync(Cd, Ah, size, stream); +hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Cd); +D2HAsync(Bh, Cd, size, stream); +H2HAsync(Ch, Bh, size, stream); + +D2H(Eh, Bd, size); + +HIPCHECK(hipDeviceSynchronize()); +HIPASSERT(Dh[10] == Eh[10]); +//HIPASSERT(Ah[10] + T(1) == Ch[10]); + +} + +template +void test43512(){ +hipStream_t stream; +HIPCHECK(hipStreamCreate(&stream)); + +const size_t N = 1000; +const size_t size = N * sizeof(T); + +T *Ah, *Bh; +T *Ch, *Dh; +T *Ad, *Bd, *Cd; + +initArrays(&Ah, N, false, true); +initArrays(&Bh, N, false, true); +initArrays(&Ch, N, false, false); +initArrays(&Dh, N, false, false); +initArrays(&Ad, N, true, false); +initArrays(&Bd, N, true, false); +initArrays(&Cd, N, true, false); + +setArray(Dh, N, T(1)); + +H2D(Ad, Dh, size); + +D2DAsync(Bd, Ad, size, stream); +hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Bd); +D2HAsync(Ah, Bd, size, stream); +H2HAsync(Bh, Ah, size, stream); +H2DAsync(Cd, Bh, size, stream); + +D2H(Ch, Cd, size); +HIPCHECK(hipDeviceSynchronize()); +//HIPASSERT( Dh[10] + T(1) == Ch[10]); +} + +template +void test51234(){ +hipStream_t stream; +HIPCHECK(hipStreamCreate(&stream)); + +const 
size_t N = 1000; +const size_t size = N * sizeof(T); + +T *Ah, *Bh; +T *Ch, *Dh; +T *Ad, *Bd, *Cd; + +initArrays(&Ah, N, false, true); +initArrays(&Bh, N, false, true); +initArrays(&Ch, N, false, false); +initArrays(&Dh, N, false, false); +initArrays(&Ad, N, true, false); +initArrays(&Bd, N, true, false); +initArrays(&Cd, N, true, false); + +setArray(Dh, N, T(1)); + +H2D(Ad, Dh, size); + +D2HAsync(Ah, Ad, size, stream); +H2HAsync(Bh, Ah, size, stream); +H2DAsync(Bd, Bh, size, stream); +hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Bd); +D2DAsync(Cd, Bd, size, stream); + +D2H(Ch, Cd, size); + +HIPCHECK(hipDeviceSynchronize()); + +HIPASSERT(Ch[10] == Dh[10] + T(1)); +} + +template +void test52341(){ +hipStream_t stream; +HIPCHECK(hipStreamCreate(&stream)); +const size_t N = 1000; +const size_t size = N * sizeof(T); + +T *Ah, *Bh, *Ch; +T *Dh, *Eh; +T *Ad, *Bd, *Cd; + +initArrays(&Ah, N, false, true); +initArrays(&Bh, N, false, true); +initArrays(&Ch, N, false, true); +initArrays(&Dh, N, false, false); +initArrays(&Eh, N, false, false); +initArrays(&Ad, N, true, false); +initArrays(&Bd, N, true, false); +initArrays(&Cd, N, true, false); + +setArray(Eh, N, T(1)); +setArray(Bh, N, T(2)); + +H2D(Ad, Eh, size); + +D2HAsync(Ah, Ad, size, stream); +H2DAsync(Bd, Ah, size, stream); +hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Bd); +D2DAsync(Cd, Bd, size, stream); +H2HAsync(Ch, Bh, size, stream); + +D2H(Dh, Cd, size); + +HIPCHECK(hipDeviceSynchronize()); + +HIPASSERT(Eh[10] + T(1) == Dh[10]); +HIPASSERT(Ch[10] == Bh[10]); +} + +template +void test53412(){ +hipStream_t stream; +HIPCHECK(hipStreamCreate(&stream)); +const size_t N = 1000; +const size_t size = sizeof(T) * N; + +T *Ah, *Bh, *Ch, *Dh; +T *Eh, *Fh, *Gh; +T *Ad, *Bd, *Cd, *Dd; + +initArrays(&Ah, N, false, true); +initArrays(&Bh, N, false, true); +initArrays(&Ch, N, false, true); +initArrays(&Dh, N, false, true); +initArrays(&Eh, N, false, false); +initArrays(&Fh, 
N, false, false); +initArrays(&Gh, N, false, false); +initArrays(&Ad, N, true, false); +initArrays(&Bd, N, true, false); +initArrays(&Cd, N, true, false); +initArrays(&Dd, N, true, false); + +setArray(Dh, N, T(1)); +setArray(Eh, N, T(2)); +setArray(Bh, N, T(3)); + +H2D(Ad, Dh, size); +H2D(Bd, Eh, size); + +D2HAsync(Ah, Ad, size, stream); +hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Bd); +D2DAsync(Cd, Bd, size, stream); +H2HAsync(Ch, Bh, size, stream); +H2DAsync(Dd, Ch, size, stream); + +D2H(Fh, Cd, size); +D2H(Gh, Dd, size); + +HIPASSERT(Ah[10] == Dh[10]); +HIPASSERT(Eh[10] + T(1) == Fh[10]); +HIPASSERT(Bh[10] == Gh[10]); +} + +template +void test54123(){ +hipStream_t stream; +HIPCHECK(hipStreamCreate(&stream)); + +const size_t N = 1000; +const size_t size = N * sizeof(T); + +T *Ah, *Bh, *Ch; +T *Dh, *Eh, *Fh, *Gh; +T *Ad, *Bd, *Cd, *Dd; + +initArrays(&Ah, N, false, true); +initArrays(&Bh, N, false, true); +initArrays(&Ch, N, false, true); +initArrays(&Dh, N, false, false); +initArrays(&Eh, N, false, false); +initArrays(&Fh, N, false, false); +initArrays(&Gh, N, false, false); +initArrays(&Ad, N, true, false); +initArrays(&Bd, N, true, false); +initArrays(&Cd, N, true, false); +initArrays(&Dd, N, true, false); + +setArray(Dh, N, T(1)); +setArray(Eh, N, T(1)); +setArray(Bh, N, T(1)); + +H2D(Ad, Dh, size); +H2D(Bd, Eh, size); + +D2HAsync(Ah, Ad, size, stream); +D2DAsync(Cd, Bd, size, stream); +H2HAsync(Ch, Bh, size, stream); +H2DAsync(Dd, Ch, size, stream); +hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Dd); + +D2H(Fh, Cd, size); +D2H(Gh, Dd, size); + +HIPCHECK(hipDeviceSynchronize()); +HIPASSERT(Dh[10] == Ah[10]); +HIPASSERT(Eh[10] == Fh[10]); +HIPASSERT(Bh[10] + T(1) == Gh[10]); + +} + +int main(int argc, char *argv[]) +{ +HipTest::parseStandardArguments(argc, argv, true); + +test12345(); +test13452(); +test14523(); +test15234(); + +test23451(); +test24513(); +test25134(); +test21345(); + +test34512(); 
+test35124(); +test31245(); +test32451(); + +test45123(); +test41235(); +test42351(); +test43512(); + +test51234(); +test52341(); +test53412(); +test54123(); +passed(); +} + From d3c0e23113e928698d3b4c2dd0bd485980a5a38e Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Sat, 27 Feb 2016 05:48:41 -0600 Subject: [PATCH 89/94] Revert "Added test to check dispatches on single stream" This reverts commit 0201608b4467ee38ee6cf54130a50e72e4210e8f. [ROCm/hip commit: 3733599cb4c046d970a26da47161662e061ca054] --- projects/hip/tests/src/CMakeLists.txt | 7 +- projects/hip/tests/src/hipStream.h | 102 ---- projects/hip/tests/src/hipStreamL5.cpp | 785 ------------------------- 3 files changed, 2 insertions(+), 892 deletions(-) delete mode 100644 projects/hip/tests/src/hipStream.h delete mode 100644 projects/hip/tests/src/hipStreamL5.cpp diff --git a/projects/hip/tests/src/CMakeLists.txt b/projects/hip/tests/src/CMakeLists.txt index 3da90ae009..cc6af0b5d2 100644 --- a/projects/hip/tests/src/CMakeLists.txt +++ b/projects/hip/tests/src/CMakeLists.txt @@ -103,13 +103,13 @@ macro (make_test_matches exe match_string) ) endmacro() -#make_hip_executable (hip_ballot hip_ballot.cpp) +make_hip_executable (hip_ballot hip_ballot.cpp) make_hip_executable (hip_anyall hip_anyall.cpp) make_hip_executable (hip_popc hip_popc.cpp) make_hip_executable (hip_clz hip_clz.cpp) make_hip_executable (hip_brev hip_brev.cpp) make_hip_executable (hip_ffs hip_ffs.cpp) -#make_hip_executable (hipGetDeviceAttribute hipGetDeviceAttribute.cpp) +make_hip_executable (hipGetDeviceAttribute hipGetDeviceAttribute.cpp) make_hip_executable (hipMemcpy hipMemcpy.cpp) make_hip_executable (hipMemcpyAsync hipMemcpyAsync.cpp) make_hip_executable (hipMemset hipMemset.cpp) @@ -123,7 +123,6 @@ make_hip_executable (hipMathFunctionsDevice hipMathFunctions.cpp hipSinglePrecis make_hip_executable (hipIntrinsics hipMathFunctions.cpp hipSinglePrecisionIntrinsics.cpp hipDoublePrecisionIntrinsics.cpp hipIntegerIntrinsics.cpp) 
make_hip_executable (hipPointerAttrib hipPointerAttrib.cpp) make_hip_executable (hipMultiThreadStreams hipMultiThreadStreams.cpp) -make_hip_executable (hipStreamL5 hipStreamL5.cpp) target_link_libraries(hipMathFunctionsHost m) make_test(hip_ballot " " ) @@ -146,6 +145,4 @@ make_test(hipMemcpyAsync " " ) make_test(hipHcc " " ) -make_test(hipStreamL5 " ") - make_hipify_test(specialFunc.cu ) diff --git a/projects/hip/tests/src/hipStream.h b/projects/hip/tests/src/hipStream.h deleted file mode 100644 index f9ec3472d0..0000000000 --- a/projects/hip/tests/src/hipStream.h +++ /dev/null @@ -1,102 +0,0 @@ -#ifndef HIPSTREAM_H -#define HIPSTREAM_H -#include - -#define NUM_STREAMS 4 - -/* -* H2H - 1 -* H2D - 2 -* KER - 3 -* D2D - 4 -* D2H - 5 -*/ - -template -void H2HAsync(T *Dst, T *Src, size_t size, hipStream_t stream){ - HIPCHECK(hipMemcpyAsync(Dst, Src, size, hipMemcpyHostToHost, stream)); -} - -template -void H2DAsync(T *Dst, T *Src, size_t size, hipStream_t stream){ - HIPCHECK(hipMemcpyAsync(Dst, Src, size, hipMemcpyHostToDevice, stream)); -} - -template -void D2DAsync(T *Dst, T *Src, size_t size, hipStream_t stream){ - HIPCHECK(hipMemcpyAsync(Dst, Src, size, hipMemcpyDeviceToDevice, stream)); -} - -template -void D2HAsync(T *Dst, T *Src, size_t size, hipStream_t stream){ - HIPCHECK(hipMemcpyAsync(Dst, Src, size, hipMemcpyDeviceToHost, stream)); -} - -template -void H2H(T *Dst, T *Src, size_t size){ - HIPCHECK(hipMemcpy(Dst, Src, size, hipMemcpyHostToHost)); -} - -template -void H2D(T *Dst, T *Src, size_t size){ - HIPCHECK(hipMemcpy(Dst, Src, size, hipMemcpyHostToDevice)); -} - -template -void D2D(T *Dst, T *Src, size_t size){ - HIPCHECK(hipMemcpy(Dst, Src, size, hipMemcpyDeviceToDevice)); -} - -template -void D2H(T *Dst, T *Src, size_t size){ - HIPCHECK(hipMemcpy(Dst, Src, size, hipMemcpyDeviceToHost)); -} - -template -__global__ void Inc(hipLaunchParm lp, T *In){ -int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; -In[tx] = In[tx] + 1; -} - -template -void 
initArrays(T **Ad, T **Ah, - size_t N, bool usePinnedHost=false){ - size_t NBytes = N * sizeof(T); - if(Ad){ - HIPCHECK( hipMalloc(Ad, NBytes)); - } - if(usePinnedHost){ - HIPCHECK( hipMallocHost(Ah, NBytes)); - } - else{ - *Ah = new T[N]; - HIPASSERT(*Ah != NULL); - } -} - -template -void initArrays(T **Ad, size_t N, - bool deviceMemory = false, - bool usePinnedHost = false){ - size_t NBytes = N * sizeof(T); - if(deviceMemory){ - HIPCHECK( hipMalloc(Ad, NBytes)); - }else{ - if(usePinnedHost){ - HIPCHECK(hipMallocHost(Ad, NBytes)); - }else{ - *Ad = new T[N]; - HIPASSERT(*Ad != NULL); - } - } -} - -template -void setArray(T* Array, int N, T val){ -for(int i=0;i -#include"hip_runtime.h" -#include"test_common.h" -#include"hipStream.h" - -template -void test12345(){ - hipStream_t stream; - HIPCHECK(hipStreamCreate(&stream)); - - const size_t N = 1000; - const size_t size = sizeof(T) * N; - - T *Ah, *Bh, *Ch; - T *Ad, *Bd; - initArrays(&Ad, &Ah, N, true); - initArrays(&Bd, &Bh, N, true); - initArrays(&Ch, N, false, true); - - setArray(Ah, N, T(1)); - - H2HAsync(Bh, Ah, size, stream); - H2DAsync(Ad, Bh, size, stream); - hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Ad); - D2DAsync(Bd, Ad, size, stream); - D2HAsync(Ch, Bd, size, stream); - HIPCHECK(hipDeviceSynchronize()); - - HIPASSERT(Ah[10] + T(1)== Ch[10]); - HIPCHECK(hipStreamDestroy(stream)); -} - -template -void test13452(){ - hipStream_t stream; - HIPCHECK(hipStreamCreate(&stream)); - - const size_t N = 1000; - const size_t size = sizeof(T) * N; - - T *Ah, *Bh, *Ch; - T *Dh, *Eh; - T *Ad, *Bd, *Cd; - - initArrays(&Ah, N, false, true); - initArrays(&Bh, N, false, true); - initArrays(&Ch, N, false, true); - initArrays(&Dh, N, false, false); - initArrays(&Eh, N, false, false); - initArrays(&Ad, N, true, false); - initArrays(&Bd, N, true, false); - initArrays(&Cd, N, true, false); - - setArray(Ah, N, T(1)); - setArray(Dh, N, T(2)); - - H2D(Ad, Dh, size); - - H2HAsync(Bh, Ah, size, stream); - 
hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Ad); - D2DAsync(Bd, Ad, size, stream); - D2HAsync(Ch, Bd, size, stream); - H2DAsync(Cd, Ch, size, stream); - HIPCHECK(hipDeviceSynchronize()); - - D2H(Eh,Cd,size); - - HIPASSERT(Ah[10] == Bh[10]); - HIPASSERT(Eh[10] == Dh[10] + T(1)); - -} - -template -void test14523(){ - hipStream_t stream; - HIPCHECK(hipStreamCreate(&stream)); - - const int N = 1000; - const size_t size = sizeof(T) * N; - - T *Ah, *Bh, *Ch; - T *Dh, *Eh; - T *Ad, *Bd, *Cd; - - initArrays(&Ah, N, false, true); - initArrays(&Bh, N, false, true); - initArrays(&Ch, N, false, true); - initArrays(&Dh, N, false, false); - initArrays(&Eh, N, false, false); - initArrays(&Ad, N, true, false); - initArrays(&Bd, N, true, false); - initArrays(&Cd, N, true, false); - - setArray(Ah, N, T(1)); - setArray(Dh, N, T(2)); - - H2D(Ad,Dh,size); - - H2HAsync(Bh, Ah, size, stream); - D2DAsync(Bd, Ad, size, stream); - D2HAsync(Ch, Bd, size, stream); - H2DAsync(Cd, Ch, size, stream); - hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Cd); - - HIPCHECK(hipDeviceSynchronize()); - - D2H(Eh, Cd, size); - - HIPASSERT(Ah[10] == Bh[10]); - HIPASSERT(Ch[10] + T(1) == Eh[10]); -} - -template -void test15234(){ - hipStream_t stream; - HIPCHECK(hipStreamCreate(&stream)); - - const size_t N = 1000; - const size_t size = sizeof(T) * N; - - T *Ah, *Bh, *Ch; - T *Dh, *Eh; - T *Ad, *Bd, *Cd; - - initArrays(&Ah, N, false, true); - initArrays(&Bh, N, false, true); - initArrays(&Ch, N, false, true); - initArrays(&Dh, N, false, false); - initArrays(&Eh, N, false, false); - initArrays(&Ad, N, true, false); - initArrays(&Bd, N, true, false); - initArrays(&Cd, N, true, false); - - setArray(Ah, N, T(1)); - setArray(Dh, N, T(2)); - - H2D(Ad, Dh, size); - - H2HAsync(Bh, Ah, size, stream); - D2HAsync(Ch, Ad, size, stream); - H2DAsync(Bd, Ch, size, stream); - hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Bd); - D2DAsync(Cd, Bd, 
size, stream); - - D2H(Eh, Cd, size); - - HIPASSERT(Ah[10] == Bh[10]); - HIPASSERT(Eh[10] == Dh[10] + T(1)); - -} - -template -void test23451(){ -hipStream_t stream; -HIPCHECK(hipStreamCreate(&stream)); -const size_t N = 1000; -const size_t size = sizeof(T) * N; - -T *Ah, *Bh, *Ch; -T *Ad, *Bd; - -initArrays(&Ah, N, false, true); -initArrays(&Bh, N, false, true); -initArrays(&Ch, N, false, true); -initArrays(&Ad, N, true, false); -initArrays(&Bd, N, true, false); - -setArray(Ah, N, T(1)); - -H2DAsync(Ad, Ah, size, stream); -hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Ad); -D2DAsync(Bd, Ad, size, stream); -D2HAsync(Bh, Bd, size, stream); -H2HAsync(Ch, Bh, size, stream); -HIPCHECK(hipDeviceSynchronize()); -//HIPASSERT(Ah[10] == Ch[10]); -} - -template -void test24513(){ -hipStream_t stream; -HIPCHECK(hipStreamCreate(&stream)); - -const size_t N = 1000; -const size_t size = sizeof(T) * N; - -T *Ah, *Bh, *Ch; -T *Dh, *Eh; -T *Ad, *Bd, *Cd; - -initArrays(&Ah, N, false, true); -initArrays(&Bh, N, false, true); -initArrays(&Ch, N, false, true); -initArrays(&Dh, N, false, false); -initArrays(&Eh, N, false, false); -initArrays(&Ad, N, true, false); -initArrays(&Bd, N, true, false); -initArrays(&Cd, N, true, false); - -setArray(Ah, N, T(1)); -setArray(Dh, N, T(2)); - -H2D(Cd, Dh, size); - -H2DAsync(Ad, Ah, size, stream); -D2DAsync(Bd, Ad, size, stream); -D2HAsync(Bh, Bd, size, stream); -H2HAsync(Ch, Bh, size, stream); -hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Cd); -HIPCHECK(hipDeviceSynchronize()); - -D2H(Eh, Cd, size); - -HIPASSERT(Eh[0] == Dh[0] + T(1)); -//HIPASSERT(Ah[0] == Ch[0]); -} - -template -void test25134(){ -hipStream_t stream; -HIPCHECK(hipStreamCreate(&stream)); - -const size_t N = 1000; -const size_t size = sizeof(T) * N; - -T *Ah, *Bh, *Ch; -T *Dh, *Eh; -T *Ad, *Bd, *Cd; - -initArrays(&Ah, N, false, true); -initArrays(&Bh, N, false, true); -initArrays(&Ch, N, false, true); -initArrays(&Dh, N, false, 
false); -initArrays(&Eh, N, false, false); -initArrays(&Ad, N, true, false); -initArrays(&Bd, N, true, false); -initArrays(&Cd, N, true, false); - -setArray(Ah, N, T(1)); -setArray(Dh, N, T(2)); - -H2D(Bd, Dh, size); - -H2DAsync(Ad, Ah, size, stream); -D2HAsync(Bh, Ad, size, stream); -H2HAsync(Ch, Bh, size, stream); -hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Bd); -D2DAsync(Cd, Bd, size, stream); - -D2H(Eh, Cd, size); - -HIPCHECK(hipDeviceSynchronize()); - -//HIPASSERT(Ah[10] == Ch[10]); -HIPASSERT(Dh[10] + T(1) == Eh[10]); - -} - -template -void test21345(){ -hipStream_t stream; -HIPCHECK(hipStreamCreate(&stream)); - -const size_t N = 1000; -const size_t size = N * sizeof(T); - -T *Ah, *Bh, *Ch, *Dh; -T *Ad, *Bd; - -initArrays(&Ah, N, false, true); -initArrays(&Bh, N, false, true); -initArrays(&Ch, N, false, true); -initArrays(&Dh, N, false, true); -initArrays(&Ad, N, true, false); -initArrays(&Bd, N, true, false); - -setArray(Ah, N, T(1)); -setArray(Bh, N, T(2)); - -H2DAsync(Ad, Ah, size, stream); -H2HAsync(Ch, Bh, size, stream); -hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Ad); -D2DAsync(Bd, Ad, size, stream); -D2HAsync(Dh, Bd, size, stream); - -HIPCHECK(hipDeviceSynchronize()); - -HIPASSERT( Bh[10] == Ch[10] ); -HIPASSERT( Ah[10] + T(1) == Dh[10]); -} - -template -void test34512(){ -hipStream_t stream; -HIPCHECK(hipStreamCreate(&stream)); - -const size_t N = 1000; -const size_t size = N * sizeof(T); - -T *Bh, *Ch, *Dh; -T *Ah, *Eh; -T *Ad, *Bd, *Cd; - -initArrays(&Bh, N, false, true); -initArrays(&Ch, N, false, true); -initArrays(&Dh, N, false, true); -initArrays(&Ah, N, false, false); -initArrays(&Eh, N, false, false); -initArrays(&Ad, N, true, false); -initArrays(&Bd, N, true, false); -initArrays(&Cd, N, true, false); - -setArray(Ah, N, T(1)); - -H2D(Ad, Ah, size); - -hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Ad); -D2DAsync(Bd, Ad, size, stream); -D2HAsync(Bh, Bd, size, 
stream); -H2HAsync(Ch, Bh, size, stream); -H2DAsync(Cd, Ch, size, stream); - -D2H(Dh, Cd, size); - -HIPCHECK(hipDeviceSynchronize()); -//HIPASSERT( Ah[10] + T(1) == Dh[10] ); -} - -template -void test35124(){ -hipStream_t stream; -HIPCHECK(hipStreamCreate(&stream)); - -const size_t N = 1000; -const size_t size = N * sizeof(T); - -T *Ah, *Bh; -T *Ch, *Dh; -T *Ad, *Bd, *Cd; - -initArrays(&Ah, N, false, true); -initArrays(&Bh, N, false, true); -initArrays(&Ch, N, false, false); -initArrays(&Dh, N, false, false); -initArrays(&Ad, N, true, false); -initArrays(&Bd, N, true, false); -initArrays(&Cd, N, true, false); - -setArray(Dh, N, T(1)); - -H2D(Ad, Dh, size); - -hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Ad); -D2HAsync(Ah, Ad, size, stream); -H2HAsync(Bh, Ah, size, stream); -H2DAsync(Bd, Bh, size, stream); -D2DAsync(Cd, Bd, size, stream); - -D2H(Ch, Cd, size); - -HIPCHECK(hipDeviceSynchronize()); - -HIPASSERT(Dh[10] + T(1) == Ch[10]); -} - -template -void test31245(){ -hipStream_t stream; -HIPCHECK(hipStreamCreate(&stream)); - -const size_t N = 1000; -const size_t size = N * sizeof(T); -T *Ah, *Bh, *Ch; -T *Dh, *Eh; -T *Ad, *Bd, *Cd; - -initArrays(&Ah, N, false, true); -initArrays(&Bh, N, false, true); -initArrays(&Ch, N, false, true); -initArrays(&Dh, N, false, false); -initArrays(&Eh, N, false, false); -initArrays(&Ad, N, true, false); -initArrays(&Bd, N, true, false); -initArrays(&Cd, N, true, false); - -setArray(Dh, N, T(1)); -setArray(Ah, N, T(2)); - -H2D(Ad, Dh, size); - -hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Ad); -H2HAsync(Bh, Ah, size, stream); -H2DAsync(Bd, Bh, size, stream); -D2DAsync(Cd, Bd, size, stream); -D2HAsync(Ch, Cd, size, stream); - -D2H(Eh, Ad, size); - -HIPCHECK(hipDeviceSynchronize()); - -HIPASSERT(Dh[10] + T(1) == Eh[10]); -HIPASSERT(Bh[10] == Ch[10]); -} - - -template -void test32451(){ -hipStream_t stream; -HIPCHECK(hipStreamCreate(&stream)); - -const size_t N = 1000; -const 
size_t size = N * sizeof(T); - -T *Ah, *Bh, *Ch; -T *Dh, *Eh; -T *Ad, *Bd, *Cd; - -initArrays(&Ah, N, false, true); -initArrays(&Bh, N, false, true); -initArrays(&Ch, N, false, true); -initArrays(&Dh, N, false, false); -initArrays(&Eh, N, false, false); -initArrays(&Ad, N, true, false); -initArrays(&Bd, N, true, false); -initArrays(&Cd, N, true, false); - -setArray(Ah, N, T(1)); -setArray(Eh, N, T(2)); - -H2D(Ad, Eh, size); -hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Ad); -H2DAsync(Bd, Ah, size, stream); -D2DAsync(Cd, Bd, size, stream); -D2HAsync(Bh, Cd, size, stream); -H2HAsync(Ch, Bh, size, stream); -HIPCHECK(hipDeviceSynchronize()); -D2H(Dh, Ad, size); - -//HIPASSERT(Ah[10] == Ch[10]); -HIPASSERT(Eh[10] + T(1) == Dh[10]); - -} - -template -void test45123(){ -hipStream_t stream; -HIPCHECK(hipStreamCreate(&stream)); -const size_t N = 1000; -const size_t size = N * sizeof(T); - -T *Ah, *Bh; -T *Ch, *Dh; -T *Ad, *Bd, *Cd; - -initArrays(&Ah, N, false, true); -initArrays(&Bh, N, false, true); -initArrays(&Ch, N, false, false); -initArrays(&Dh, N, false, false); -initArrays(&Ad, N, true, false); -initArrays(&Bd, N, true, false); -initArrays(&Cd, N, true, false); - -setArray(Dh, N, T(1)); - -H2D(Ad, Dh, size); - -D2DAsync(Bd, Ad, size, stream); -D2HAsync(Ah, Bd, size, stream); -H2HAsync(Bh, Ah, size, stream); -H2DAsync(Cd, Bh, size, stream); -hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Cd); -D2H(Ch, Cd, size); -HIPCHECK(hipDeviceSynchronize()); - -HIPASSERT(Dh[10] + T(1) == Ch[10]); - -} - - -template -void test41235(){ -hipStream_t stream; -HIPCHECK(hipStreamCreate(&stream)); -const size_t N = 1000; -const size_t size = N * sizeof(T); - -T *Ah, *Bh; -T *Ch; -T *Ad, *Bd, *Cd; - -initArrays(&Ah, N, false, true); -initArrays(&Bh, N, false, true); -initArrays(&Ch, N, false, false); -initArrays(&Ad, N, true, false); -initArrays(&Bd, N, true, false); -initArrays(&Cd, N, true, false); - -setArray(Ch, N, T(1)); - 
-H2D(Ad, Ch, size); - -D2DAsync(Bd, Ad, size, stream); -D2HAsync(Ah, Bd, size, stream); -H2DAsync(Cd, Ah, size, stream); -hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Cd); -D2HAsync(Bh, Cd, size, stream); - -HIPCHECK(hipDeviceSynchronize()); - -HIPASSERT(Ch[10] + T(1) == Bh[10]); -} - -template -void test42351(){ -hipStream_t stream; -HIPCHECK(hipStreamCreate(&stream)); - -const size_t N = 1000; -const size_t size = N * sizeof(T); - -T *Ah, *Bh, *Ch; -T *Dh, *Eh; -T *Ad, *Bd, *Cd; - -initArrays(&Ah, N, false, true); -initArrays(&Bh, N, false, true); -initArrays(&Ch, N, false, true); -initArrays(&Dh, N, false, false); -initArrays(&Eh, N, false, false); -initArrays(&Ad, N, true, false); -initArrays(&Bd, N, true, false); -initArrays(&Cd, N, true, false); - -setArray(Dh, N, T(2)); -setArray(Ah, N, T(1)); - -H2D(Ad, Dh, size); - -D2DAsync(Bd, Ad, size, stream); -H2DAsync(Cd, Ah, size, stream); -hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Cd); -D2HAsync(Bh, Cd, size, stream); -H2HAsync(Ch, Bh, size, stream); - -D2H(Eh, Bd, size); - -HIPCHECK(hipDeviceSynchronize()); -HIPASSERT(Dh[10] == Eh[10]); -//HIPASSERT(Ah[10] + T(1) == Ch[10]); - -} - -template -void test43512(){ -hipStream_t stream; -HIPCHECK(hipStreamCreate(&stream)); - -const size_t N = 1000; -const size_t size = N * sizeof(T); - -T *Ah, *Bh; -T *Ch, *Dh; -T *Ad, *Bd, *Cd; - -initArrays(&Ah, N, false, true); -initArrays(&Bh, N, false, true); -initArrays(&Ch, N, false, false); -initArrays(&Dh, N, false, false); -initArrays(&Ad, N, true, false); -initArrays(&Bd, N, true, false); -initArrays(&Cd, N, true, false); - -setArray(Dh, N, T(1)); - -H2D(Ad, Dh, size); - -D2DAsync(Bd, Ad, size, stream); -hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Bd); -D2HAsync(Ah, Bd, size, stream); -H2HAsync(Bh, Ah, size, stream); -H2DAsync(Cd, Bh, size, stream); - -D2H(Ch, Cd, size); -HIPCHECK(hipDeviceSynchronize()); -//HIPASSERT( Dh[10] + T(1) == 
Ch[10]); -} - -template -void test51234(){ -hipStream_t stream; -HIPCHECK(hipStreamCreate(&stream)); - -const size_t N = 1000; -const size_t size = N * sizeof(T); - -T *Ah, *Bh; -T *Ch, *Dh; -T *Ad, *Bd, *Cd; - -initArrays(&Ah, N, false, true); -initArrays(&Bh, N, false, true); -initArrays(&Ch, N, false, false); -initArrays(&Dh, N, false, false); -initArrays(&Ad, N, true, false); -initArrays(&Bd, N, true, false); -initArrays(&Cd, N, true, false); - -setArray(Dh, N, T(1)); - -H2D(Ad, Dh, size); - -D2HAsync(Ah, Ad, size, stream); -H2HAsync(Bh, Ah, size, stream); -H2DAsync(Bd, Bh, size, stream); -hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Bd); -D2DAsync(Cd, Bd, size, stream); - -D2H(Ch, Cd, size); - -HIPCHECK(hipDeviceSynchronize()); - -HIPASSERT(Ch[10] == Dh[10] + T(1)); -} - -template -void test52341(){ -hipStream_t stream; -HIPCHECK(hipStreamCreate(&stream)); -const size_t N = 1000; -const size_t size = N * sizeof(T); - -T *Ah, *Bh, *Ch; -T *Dh, *Eh; -T *Ad, *Bd, *Cd; - -initArrays(&Ah, N, false, true); -initArrays(&Bh, N, false, true); -initArrays(&Ch, N, false, true); -initArrays(&Dh, N, false, false); -initArrays(&Eh, N, false, false); -initArrays(&Ad, N, true, false); -initArrays(&Bd, N, true, false); -initArrays(&Cd, N, true, false); - -setArray(Eh, N, T(1)); -setArray(Bh, N, T(2)); - -H2D(Ad, Eh, size); - -D2HAsync(Ah, Ad, size, stream); -H2DAsync(Bd, Ah, size, stream); -hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Bd); -D2DAsync(Cd, Bd, size, stream); -H2HAsync(Ch, Bh, size, stream); - -D2H(Dh, Cd, size); - -HIPCHECK(hipDeviceSynchronize()); - -HIPASSERT(Eh[10] + T(1) == Dh[10]); -HIPASSERT(Ch[10] == Bh[10]); -} - -template -void test53412(){ -hipStream_t stream; -HIPCHECK(hipStreamCreate(&stream)); -const size_t N = 1000; -const size_t size = sizeof(T) * N; - -T *Ah, *Bh, *Ch, *Dh; -T *Eh, *Fh, *Gh; -T *Ad, *Bd, *Cd, *Dd; - -initArrays(&Ah, N, false, true); -initArrays(&Bh, N, false, true); 
-initArrays(&Ch, N, false, true); -initArrays(&Dh, N, false, true); -initArrays(&Eh, N, false, false); -initArrays(&Fh, N, false, false); -initArrays(&Gh, N, false, false); -initArrays(&Ad, N, true, false); -initArrays(&Bd, N, true, false); -initArrays(&Cd, N, true, false); -initArrays(&Dd, N, true, false); - -setArray(Dh, N, T(1)); -setArray(Eh, N, T(2)); -setArray(Bh, N, T(3)); - -H2D(Ad, Dh, size); -H2D(Bd, Eh, size); - -D2HAsync(Ah, Ad, size, stream); -hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Bd); -D2DAsync(Cd, Bd, size, stream); -H2HAsync(Ch, Bh, size, stream); -H2DAsync(Dd, Ch, size, stream); - -D2H(Fh, Cd, size); -D2H(Gh, Dd, size); - -HIPASSERT(Ah[10] == Dh[10]); -HIPASSERT(Eh[10] + T(1) == Fh[10]); -HIPASSERT(Bh[10] == Gh[10]); -} - -template -void test54123(){ -hipStream_t stream; -HIPCHECK(hipStreamCreate(&stream)); - -const size_t N = 1000; -const size_t size = N * sizeof(T); - -T *Ah, *Bh, *Ch; -T *Dh, *Eh, *Fh, *Gh; -T *Ad, *Bd, *Cd, *Dd; - -initArrays(&Ah, N, false, true); -initArrays(&Bh, N, false, true); -initArrays(&Ch, N, false, true); -initArrays(&Dh, N, false, false); -initArrays(&Eh, N, false, false); -initArrays(&Fh, N, false, false); -initArrays(&Gh, N, false, false); -initArrays(&Ad, N, true, false); -initArrays(&Bd, N, true, false); -initArrays(&Cd, N, true, false); -initArrays(&Dd, N, true, false); - -setArray(Dh, N, T(1)); -setArray(Eh, N, T(1)); -setArray(Bh, N, T(1)); - -H2D(Ad, Dh, size); -H2D(Bd, Eh, size); - -D2HAsync(Ah, Ad, size, stream); -D2DAsync(Cd, Bd, size, stream); -H2HAsync(Ch, Bh, size, stream); -H2DAsync(Dd, Ch, size, stream); -hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Dd); - -D2H(Fh, Cd, size); -D2H(Gh, Dd, size); - -HIPCHECK(hipDeviceSynchronize()); -HIPASSERT(Dh[10] == Ah[10]); -HIPASSERT(Eh[10] == Fh[10]); -HIPASSERT(Bh[10] + T(1) == Gh[10]); - -} - -int main(int argc, char *argv[]) -{ -HipTest::parseStandardArguments(argc, argv, true); - -test12345(); 
-test13452(); -test14523(); -test15234(); - -test23451(); -test24513(); -test25134(); -test21345(); - -test34512(); -test35124(); -test31245(); -test32451(); - -test45123(); -test41235(); -test42351(); -test43512(); - -test51234(); -test52341(); -test53412(); -test54123(); -passed(); -} - From d0e96d1b27aeaa58ea0116fcf9c5777841c33ec8 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Sat, 27 Feb 2016 05:55:56 -0600 Subject: [PATCH 90/94] [v2]: Added test to check single stream dispatches [ROCm/hip commit: 14ec56acab95f9dd6eeb60698bd0d5006f5e2da8] --- projects/hip/tests/src/CMakeLists.txt | 3 + projects/hip/tests/src/hipStream.h | 102 ++++ projects/hip/tests/src/hipStreamL5.cpp | 787 +++++++++++++++++++++++++ 3 files changed, 892 insertions(+) create mode 100644 projects/hip/tests/src/hipStream.h create mode 100644 projects/hip/tests/src/hipStreamL5.cpp diff --git a/projects/hip/tests/src/CMakeLists.txt b/projects/hip/tests/src/CMakeLists.txt index cc6af0b5d2..1abd0198c5 100644 --- a/projects/hip/tests/src/CMakeLists.txt +++ b/projects/hip/tests/src/CMakeLists.txt @@ -123,6 +123,7 @@ make_hip_executable (hipMathFunctionsDevice hipMathFunctions.cpp hipSinglePrecis make_hip_executable (hipIntrinsics hipMathFunctions.cpp hipSinglePrecisionIntrinsics.cpp hipDoublePrecisionIntrinsics.cpp hipIntegerIntrinsics.cpp) make_hip_executable (hipPointerAttrib hipPointerAttrib.cpp) make_hip_executable (hipMultiThreadStreams hipMultiThreadStreams.cpp) +make_hip_executable (hipStreamL5 hipStreamL5.cpp) target_link_libraries(hipMathFunctionsHost m) make_test(hip_ballot " " ) @@ -145,4 +146,6 @@ make_test(hipMemcpyAsync " " ) make_test(hipHcc " " ) +make_test(hipStreamL5 " ") + make_hipify_test(specialFunc.cu ) diff --git a/projects/hip/tests/src/hipStream.h b/projects/hip/tests/src/hipStream.h new file mode 100644 index 0000000000..f9ec3472d0 --- /dev/null +++ b/projects/hip/tests/src/hipStream.h @@ -0,0 +1,102 @@ +#ifndef HIPSTREAM_H +#define HIPSTREAM_H +#include + +#define 
NUM_STREAMS 4 + +/* +* H2H - 1 +* H2D - 2 +* KER - 3 +* D2D - 4 +* D2H - 5 +*/ + +template +void H2HAsync(T *Dst, T *Src, size_t size, hipStream_t stream){ + HIPCHECK(hipMemcpyAsync(Dst, Src, size, hipMemcpyHostToHost, stream)); +} + +template +void H2DAsync(T *Dst, T *Src, size_t size, hipStream_t stream){ + HIPCHECK(hipMemcpyAsync(Dst, Src, size, hipMemcpyHostToDevice, stream)); +} + +template +void D2DAsync(T *Dst, T *Src, size_t size, hipStream_t stream){ + HIPCHECK(hipMemcpyAsync(Dst, Src, size, hipMemcpyDeviceToDevice, stream)); +} + +template +void D2HAsync(T *Dst, T *Src, size_t size, hipStream_t stream){ + HIPCHECK(hipMemcpyAsync(Dst, Src, size, hipMemcpyDeviceToHost, stream)); +} + +template +void H2H(T *Dst, T *Src, size_t size){ + HIPCHECK(hipMemcpy(Dst, Src, size, hipMemcpyHostToHost)); +} + +template +void H2D(T *Dst, T *Src, size_t size){ + HIPCHECK(hipMemcpy(Dst, Src, size, hipMemcpyHostToDevice)); +} + +template +void D2D(T *Dst, T *Src, size_t size){ + HIPCHECK(hipMemcpy(Dst, Src, size, hipMemcpyDeviceToDevice)); +} + +template +void D2H(T *Dst, T *Src, size_t size){ + HIPCHECK(hipMemcpy(Dst, Src, size, hipMemcpyDeviceToHost)); +} + +template +__global__ void Inc(hipLaunchParm lp, T *In){ +int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; +In[tx] = In[tx] + 1; +} + +template +void initArrays(T **Ad, T **Ah, + size_t N, bool usePinnedHost=false){ + size_t NBytes = N * sizeof(T); + if(Ad){ + HIPCHECK( hipMalloc(Ad, NBytes)); + } + if(usePinnedHost){ + HIPCHECK( hipMallocHost(Ah, NBytes)); + } + else{ + *Ah = new T[N]; + HIPASSERT(*Ah != NULL); + } +} + +template +void initArrays(T **Ad, size_t N, + bool deviceMemory = false, + bool usePinnedHost = false){ + size_t NBytes = N * sizeof(T); + if(deviceMemory){ + HIPCHECK( hipMalloc(Ad, NBytes)); + }else{ + if(usePinnedHost){ + HIPCHECK(hipMallocHost(Ad, NBytes)); + }else{ + *Ad = new T[N]; + HIPASSERT(*Ad != NULL); + } + } +} + +template +void setArray(T* Array, int N, T val){ +for(int i=0;i 
+#include"hip_runtime.h" +#include"test_common.h" +#include"hipStream.h" + +template +void test12345(){ + hipStream_t stream; + HIPCHECK(hipStreamCreate(&stream)); + + const size_t N = 1000; + const size_t size = sizeof(T) * N; + + T *Ah, *Bh, *Ch; + T *Ad, *Bd; + initArrays(&Ad, &Ah, N, true); + initArrays(&Bd, &Bh, N, true); + initArrays(&Ch, N, false, true); + + setArray(Ah, N, T(1)); + + H2HAsync(Bh, Ah, size, stream); + H2DAsync(Ad, Bh, size, stream); + hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Ad); + D2DAsync(Bd, Ad, size, stream); + D2HAsync(Ch, Bd, size, stream); + HIPCHECK(hipDeviceSynchronize()); + + HIPASSERT(Ah[10] + T(1)== Ch[10]); + HIPCHECK(hipStreamDestroy(stream)); +} + +template +void test13452(){ + hipStream_t stream; + HIPCHECK(hipStreamCreate(&stream)); + + const size_t N = 1000; + const size_t size = sizeof(T) * N; + + T *Ah, *Bh, *Ch; + T *Dh, *Eh; + T *Ad, *Bd, *Cd; + + initArrays(&Ah, N, false, true); + initArrays(&Bh, N, false, true); + initArrays(&Ch, N, false, true); + initArrays(&Dh, N, false, false); + initArrays(&Eh, N, false, false); + initArrays(&Ad, N, true, false); + initArrays(&Bd, N, true, false); + initArrays(&Cd, N, true, false); + + setArray(Ah, N, T(1)); + setArray(Dh, N, T(2)); + + H2D(Ad, Dh, size); + + H2HAsync(Bh, Ah, size, stream); + hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Ad); + D2DAsync(Bd, Ad, size, stream); + D2HAsync(Ch, Bd, size, stream); + H2DAsync(Cd, Ch, size, stream); + HIPCHECK(hipDeviceSynchronize()); + + D2H(Eh,Cd,size); + + HIPASSERT(Ah[10] == Bh[10]); + HIPASSERT(Eh[10] == Dh[10] + T(1)); + +} + +template +void test14523(){ + hipStream_t stream; + HIPCHECK(hipStreamCreate(&stream)); + + const int N = 1000; + const size_t size = sizeof(T) * N; + + T *Ah, *Bh, *Ch; + T *Dh, *Eh; + T *Ad, *Bd, *Cd; + + initArrays(&Ah, N, false, true); + initArrays(&Bh, N, false, true); + initArrays(&Ch, N, false, true); + initArrays(&Dh, N, false, false); + 
initArrays(&Eh, N, false, false); + initArrays(&Ad, N, true, false); + initArrays(&Bd, N, true, false); + initArrays(&Cd, N, true, false); + + setArray(Ah, N, T(1)); + setArray(Dh, N, T(2)); + + H2D(Ad,Dh,size); + + H2HAsync(Bh, Ah, size, stream); + D2DAsync(Bd, Ad, size, stream); + D2HAsync(Ch, Bd, size, stream); + H2DAsync(Cd, Ch, size, stream); + hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Cd); + + HIPCHECK(hipDeviceSynchronize()); + + D2H(Eh, Cd, size); + + HIPASSERT(Ah[10] == Bh[10]); + HIPASSERT(Ch[10] + T(1) == Eh[10]); +} + +template +void test15234(){ + hipStream_t stream; + HIPCHECK(hipStreamCreate(&stream)); + + const size_t N = 1000; + const size_t size = sizeof(T) * N; + + T *Ah, *Bh, *Ch; + T *Dh, *Eh; + T *Ad, *Bd, *Cd; + + initArrays(&Ah, N, false, true); + initArrays(&Bh, N, false, true); + initArrays(&Ch, N, false, true); + initArrays(&Dh, N, false, false); + initArrays(&Eh, N, false, false); + initArrays(&Ad, N, true, false); + initArrays(&Bd, N, true, false); + initArrays(&Cd, N, true, false); + + setArray(Ah, N, T(1)); + setArray(Dh, N, T(2)); + + H2D(Ad, Dh, size); + + H2HAsync(Bh, Ah, size, stream); + D2HAsync(Ch, Ad, size, stream); + H2DAsync(Bd, Ch, size, stream); + hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Bd); + D2DAsync(Cd, Bd, size, stream); + + D2H(Eh, Cd, size); + + HIPASSERT(Ah[10] == Bh[10]); + HIPASSERT(Eh[10] == Dh[10] + T(1)); + +} + +template +void test23451(){ +hipStream_t stream; +HIPCHECK(hipStreamCreate(&stream)); +const size_t N = 1000; +const size_t size = sizeof(T) * N; + +T *Ah, *Bh, *Ch; +T *Ad, *Bd; + +initArrays(&Ah, N, false, true); +initArrays(&Bh, N, false, true); +initArrays(&Ch, N, false, true); +initArrays(&Ad, N, true, false); +initArrays(&Bd, N, true, false); + +setArray(Ah, N, T(1)); + +H2DAsync(Ad, Ah, size, stream); +hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Ad); +D2DAsync(Bd, Ad, size, stream); +D2HAsync(Bh, Bd, 
size, stream); +H2HAsync(Ch, Bh, size, stream); +HIPCHECK(hipDeviceSynchronize()); +//HIPASSERT(Ah[10] == Ch[10]); +} + +template +void test24513(){ +hipStream_t stream; +HIPCHECK(hipStreamCreate(&stream)); + +const size_t N = 1000; +const size_t size = sizeof(T) * N; + +T *Ah, *Bh, *Ch; +T *Dh, *Eh; +T *Ad, *Bd, *Cd; + +initArrays(&Ah, N, false, true); +initArrays(&Bh, N, false, true); +initArrays(&Ch, N, false, true); +initArrays(&Dh, N, false, false); +initArrays(&Eh, N, false, false); +initArrays(&Ad, N, true, false); +initArrays(&Bd, N, true, false); +initArrays(&Cd, N, true, false); + +setArray(Ah, N, T(1)); +setArray(Dh, N, T(2)); + +H2D(Cd, Dh, size); + +H2DAsync(Ad, Ah, size, stream); +D2DAsync(Bd, Ad, size, stream); +D2HAsync(Bh, Bd, size, stream); +H2HAsync(Ch, Bh, size, stream); +hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Cd); +HIPCHECK(hipDeviceSynchronize()); + +D2H(Eh, Cd, size); + +HIPASSERT(Eh[0] == Dh[0] + T(1)); +//HIPASSERT(Ah[0] == Ch[0]); +} + +template +void test25134(){ +hipStream_t stream; +HIPCHECK(hipStreamCreate(&stream)); + +const size_t N = 1000; +const size_t size = sizeof(T) * N; + +T *Ah, *Bh, *Ch; +T *Dh, *Eh; +T *Ad, *Bd, *Cd; + +initArrays(&Ah, N, false, true); +initArrays(&Bh, N, false, true); +initArrays(&Ch, N, false, true); +initArrays(&Dh, N, false, false); +initArrays(&Eh, N, false, false); +initArrays(&Ad, N, true, false); +initArrays(&Bd, N, true, false); +initArrays(&Cd, N, true, false); + +setArray(Ah, N, T(1)); +setArray(Dh, N, T(2)); + +H2D(Bd, Dh, size); + +H2DAsync(Ad, Ah, size, stream); +D2HAsync(Bh, Ad, size, stream); +H2HAsync(Ch, Bh, size, stream); +hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Bd); +D2DAsync(Cd, Bd, size, stream); + +D2H(Eh, Cd, size); + +HIPCHECK(hipDeviceSynchronize()); + +//HIPASSERT(Ah[10] == Ch[10]); +HIPASSERT(Dh[10] + T(1) == Eh[10]); + +} + +template +void test21345(){ +hipStream_t stream; +HIPCHECK(hipStreamCreate(&stream)); + 
+const size_t N = 1000; +const size_t size = N * sizeof(T); + +T *Ah, *Bh, *Ch, *Dh; +T *Ad, *Bd; + +initArrays(&Ah, N, false, true); +initArrays(&Bh, N, false, true); +initArrays(&Ch, N, false, true); +initArrays(&Dh, N, false, true); +initArrays(&Ad, N, true, false); +initArrays(&Bd, N, true, false); + +setArray(Ah, N, T(1)); +setArray(Bh, N, T(2)); + +H2DAsync(Ad, Ah, size, stream); +H2HAsync(Ch, Bh, size, stream); +hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Ad); +D2DAsync(Bd, Ad, size, stream); +D2HAsync(Dh, Bd, size, stream); + +HIPCHECK(hipDeviceSynchronize()); + +HIPASSERT( Bh[10] == Ch[10] ); +HIPASSERT( Ah[10] + T(1) == Dh[10]); +} + +template +void test34512(){ +hipStream_t stream; +HIPCHECK(hipStreamCreate(&stream)); + +const size_t N = 1000; +const size_t size = N * sizeof(T); + +T *Bh, *Ch, *Dh; +T *Ah, *Eh; +T *Ad, *Bd, *Cd; + +initArrays(&Bh, N, false, true); +initArrays(&Ch, N, false, true); +initArrays(&Dh, N, false, true); +initArrays(&Ah, N, false, false); +initArrays(&Eh, N, false, false); +initArrays(&Ad, N, true, false); +initArrays(&Bd, N, true, false); +initArrays(&Cd, N, true, false); + +setArray(Ah, N, T(1)); + +H2D(Ad, Ah, size); + +hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Ad); +D2DAsync(Bd, Ad, size, stream); +D2HAsync(Bh, Bd, size, stream); +H2HAsync(Ch, Bh, size, stream); +H2DAsync(Cd, Ch, size, stream); + +D2H(Dh, Cd, size); + +HIPCHECK(hipDeviceSynchronize()); +//HIPASSERT( Ah[10] + T(1) == Dh[10] ); +} + +template +void test35124(){ +hipStream_t stream; +HIPCHECK(hipStreamCreate(&stream)); + +const size_t N = 1000; +const size_t size = N * sizeof(T); + +T *Ah, *Bh; +T *Ch, *Dh; +T *Ad, *Bd, *Cd; + +initArrays(&Ah, N, false, true); +initArrays(&Bh, N, false, true); +initArrays(&Ch, N, false, false); +initArrays(&Dh, N, false, false); +initArrays(&Ad, N, true, false); +initArrays(&Bd, N, true, false); +initArrays(&Cd, N, true, false); + +setArray(Dh, N, T(1)); + +H2D(Ad, 
Dh, size); + +hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Ad); +D2HAsync(Ah, Ad, size, stream); +H2HAsync(Bh, Ah, size, stream); +H2DAsync(Bd, Bh, size, stream); +D2DAsync(Cd, Bd, size, stream); + +D2H(Ch, Cd, size); + +HIPCHECK(hipDeviceSynchronize()); + +HIPASSERT(Dh[10] + T(1) == Ch[10]); +} + +template +void test31245(){ +hipStream_t stream; +HIPCHECK(hipStreamCreate(&stream)); + +const size_t N = 1000; +const size_t size = N * sizeof(T); +T *Ah, *Bh, *Ch; +T *Dh, *Eh; +T *Ad, *Bd, *Cd; + +initArrays(&Ah, N, false, true); +initArrays(&Bh, N, false, true); +initArrays(&Ch, N, false, true); +initArrays(&Dh, N, false, false); +initArrays(&Eh, N, false, false); +initArrays(&Ad, N, true, false); +initArrays(&Bd, N, true, false); +initArrays(&Cd, N, true, false); + +setArray(Dh, N, T(1)); +setArray(Ah, N, T(2)); + +H2D(Ad, Dh, size); + +hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Ad); +H2HAsync(Bh, Ah, size, stream); +H2DAsync(Bd, Bh, size, stream); +D2DAsync(Cd, Bd, size, stream); +D2HAsync(Ch, Cd, size, stream); + +D2H(Eh, Ad, size); + +HIPCHECK(hipDeviceSynchronize()); + +HIPASSERT(Dh[10] + T(1) == Eh[10]); +HIPASSERT(Bh[10] == Ch[10]); +} + + +template +void test32451(){ +hipStream_t stream; +HIPCHECK(hipStreamCreate(&stream)); + +const size_t N = 1000; +const size_t size = N * sizeof(T); + +T *Ah, *Bh, *Ch; +T *Dh, *Eh; +T *Ad, *Bd, *Cd; + +initArrays(&Ah, N, false, true); +initArrays(&Bh, N, false, true); +initArrays(&Ch, N, false, true); +initArrays(&Dh, N, false, false); +initArrays(&Eh, N, false, false); +initArrays(&Ad, N, true, false); +initArrays(&Bd, N, true, false); +initArrays(&Cd, N, true, false); + +setArray(Ah, N, T(1)); +setArray(Eh, N, T(2)); + +H2D(Ad, Eh, size); +hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Ad); +H2DAsync(Bd, Ah, size, stream); +D2DAsync(Cd, Bd, size, stream); +D2HAsync(Bh, Cd, size, stream); +H2HAsync(Ch, Bh, size, stream); 
+HIPCHECK(hipDeviceSynchronize()); +D2H(Dh, Ad, size); + +//HIPASSERT(Ah[10] == Ch[10]); +HIPASSERT(Eh[10] + T(1) == Dh[10]); + +} + +template +void test45123(){ +hipStream_t stream; +HIPCHECK(hipStreamCreate(&stream)); +const size_t N = 1000; +const size_t size = N * sizeof(T); + +T *Ah, *Bh; +T *Ch, *Dh; +T *Ad, *Bd, *Cd; + +initArrays(&Ah, N, false, true); +initArrays(&Bh, N, false, true); +initArrays(&Ch, N, false, false); +initArrays(&Dh, N, false, false); +initArrays(&Ad, N, true, false); +initArrays(&Bd, N, true, false); +initArrays(&Cd, N, true, false); + +setArray(Dh, N, T(1)); + +H2D(Ad, Dh, size); + +D2DAsync(Bd, Ad, size, stream); +D2HAsync(Ah, Bd, size, stream); +H2HAsync(Bh, Ah, size, stream); +H2DAsync(Cd, Bh, size, stream); +hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Cd); +D2H(Ch, Cd, size); +HIPCHECK(hipDeviceSynchronize()); + +HIPASSERT(Dh[10] + T(1) == Ch[10]); + +} + + +template +void test41235(){ +hipStream_t stream; +HIPCHECK(hipStreamCreate(&stream)); +const size_t N = 1000; +const size_t size = N * sizeof(T); + +T *Ah, *Bh; +T *Ch; +T *Ad, *Bd, *Cd; + +initArrays(&Ah, N, false, true); +initArrays(&Bh, N, false, true); +initArrays(&Ch, N, false, false); +initArrays(&Ad, N, true, false); +initArrays(&Bd, N, true, false); +initArrays(&Cd, N, true, false); + +setArray(Ch, N, T(1)); + +H2D(Ad, Ch, size); + +D2DAsync(Bd, Ad, size, stream); +D2HAsync(Ah, Bd, size, stream); +H2DAsync(Cd, Ah, size, stream); +hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Cd); +D2HAsync(Bh, Cd, size, stream); + +HIPCHECK(hipDeviceSynchronize()); + +HIPASSERT(Ch[10] + T(1) == Bh[10]); +} + +template +void test42351(){ +hipStream_t stream; +HIPCHECK(hipStreamCreate(&stream)); + +const size_t N = 1000; +const size_t size = N * sizeof(T); + +T *Ah, *Bh, *Ch; +T *Dh, *Eh; +T *Ad, *Bd, *Cd; + +initArrays(&Ah, N, false, true); +initArrays(&Bh, N, false, true); +initArrays(&Ch, N, false, true); +initArrays(&Dh, N, false, 
false); +initArrays(&Eh, N, false, false); +initArrays(&Ad, N, true, false); +initArrays(&Bd, N, true, false); +initArrays(&Cd, N, true, false); + +setArray(Dh, N, T(2)); +setArray(Ah, N, T(1)); + +H2D(Ad, Dh, size); + +D2DAsync(Bd, Ad, size, stream); +H2DAsync(Cd, Ah, size, stream); +hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Cd); +D2HAsync(Bh, Cd, size, stream); +H2HAsync(Ch, Bh, size, stream); + +D2H(Eh, Bd, size); + +HIPCHECK(hipDeviceSynchronize()); +HIPASSERT(Dh[10] == Eh[10]); +//HIPASSERT(Ah[10] + T(1) == Ch[10]); + +} + +template +void test43512(){ +hipStream_t stream; +HIPCHECK(hipStreamCreate(&stream)); + +const size_t N = 1000; +const size_t size = N * sizeof(T); + +T *Ah, *Bh; +T *Ch, *Dh; +T *Ad, *Bd, *Cd; + +initArrays(&Ah, N, false, true); +initArrays(&Bh, N, false, true); +initArrays(&Ch, N, false, false); +initArrays(&Dh, N, false, false); +initArrays(&Ad, N, true, false); +initArrays(&Bd, N, true, false); +initArrays(&Cd, N, true, false); + +setArray(Dh, N, T(1)); + +H2D(Ad, Dh, size); + +D2DAsync(Bd, Ad, size, stream); +hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Bd); +D2HAsync(Ah, Bd, size, stream); +H2HAsync(Bh, Ah, size, stream); +H2DAsync(Cd, Bh, size, stream); + +D2H(Ch, Cd, size); +HIPCHECK(hipDeviceSynchronize()); +//HIPASSERT( Dh[10] + T(1) == Ch[10]); +} + +template +void test51234(){ +hipStream_t stream; +HIPCHECK(hipStreamCreate(&stream)); + +const size_t N = 1000; +const size_t size = N * sizeof(T); + +T *Ah, *Bh; +T *Ch, *Dh; +T *Ad, *Bd, *Cd; + +initArrays(&Ah, N, false, true); +initArrays(&Bh, N, false, true); +initArrays(&Ch, N, false, false); +initArrays(&Dh, N, false, false); +initArrays(&Ad, N, true, false); +initArrays(&Bd, N, true, false); +initArrays(&Cd, N, true, false); + +setArray(Dh, N, T(1)); + +H2D(Ad, Dh, size); + +D2HAsync(Ah, Ad, size, stream); +H2HAsync(Bh, Ah, size, stream); +H2DAsync(Bd, Bh, size, stream); +hipLaunchKernel(HIP_KERNEL_NAME(Inc), 
dim3(N/500), dim3(500), 0, stream, Bd); +D2DAsync(Cd, Bd, size, stream); + +D2H(Ch, Cd, size); + +HIPCHECK(hipDeviceSynchronize()); + +HIPASSERT(Ch[10] == Dh[10] + T(1)); +} + +template +void test52341(){ +hipStream_t stream; +HIPCHECK(hipStreamCreate(&stream)); +const size_t N = 1000; +const size_t size = N * sizeof(T); + +T *Ah, *Bh, *Ch; +T *Dh, *Eh; +T *Ad, *Bd, *Cd; + +initArrays(&Ah, N, false, true); +initArrays(&Bh, N, false, true); +initArrays(&Ch, N, false, true); +initArrays(&Dh, N, false, false); +initArrays(&Eh, N, false, false); +initArrays(&Ad, N, true, false); +initArrays(&Bd, N, true, false); +initArrays(&Cd, N, true, false); + +setArray(Eh, N, T(1)); +setArray(Bh, N, T(2)); + +H2D(Ad, Eh, size); + +D2HAsync(Ah, Ad, size, stream); +H2DAsync(Bd, Ah, size, stream); +hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Bd); +D2DAsync(Cd, Bd, size, stream); +H2HAsync(Ch, Bh, size, stream); + +D2H(Dh, Cd, size); + +HIPCHECK(hipDeviceSynchronize()); + +HIPASSERT(Eh[10] + T(1) == Dh[10]); +HIPASSERT(Ch[10] == Bh[10]); +} + +template +void test53412(){ +hipStream_t stream; +HIPCHECK(hipStreamCreate(&stream)); +const size_t N = 1000; +const size_t size = sizeof(T) * N; + +T *Ah, *Bh, *Ch, *Dh; +T *Eh, *Fh, *Gh; +T *Ad, *Bd, *Cd, *Dd; + +initArrays(&Ah, N, false, true); +initArrays(&Bh, N, false, true); +initArrays(&Ch, N, false, true); +initArrays(&Dh, N, false, true); +initArrays(&Eh, N, false, false); +initArrays(&Fh, N, false, false); +initArrays(&Gh, N, false, false); +initArrays(&Ad, N, true, false); +initArrays(&Bd, N, true, false); +initArrays(&Cd, N, true, false); +initArrays(&Dd, N, true, false); + +setArray(Dh, N, T(1)); +setArray(Eh, N, T(2)); +setArray(Bh, N, T(3)); + +H2D(Ad, Dh, size); +H2D(Bd, Eh, size); + +D2HAsync(Ah, Ad, size, stream); +hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Bd); +D2DAsync(Cd, Bd, size, stream); +H2HAsync(Ch, Bh, size, stream); +H2DAsync(Dd, Ch, size, stream); + +D2H(Fh, 
Cd, size); +D2H(Gh, Dd, size); + +HIPASSERT(Ah[10] == Dh[10]); +HIPASSERT(Eh[10] + T(1) == Fh[10]); +HIPASSERT(Bh[10] == Gh[10]); +} + +template +void test54123(){ +hipStream_t stream; +HIPCHECK(hipStreamCreate(&stream)); + +const size_t N = 1000; +const size_t size = N * sizeof(T); + +T *Ah, *Bh, *Ch; +T *Dh, *Eh, *Fh, *Gh; +T *Ad, *Bd, *Cd, *Dd; + +initArrays(&Ah, N, false, true); +initArrays(&Bh, N, false, true); +initArrays(&Ch, N, false, true); +initArrays(&Dh, N, false, false); +initArrays(&Eh, N, false, false); +initArrays(&Fh, N, false, false); +initArrays(&Gh, N, false, false); +initArrays(&Ad, N, true, false); +initArrays(&Bd, N, true, false); +initArrays(&Cd, N, true, false); +initArrays(&Dd, N, true, false); + +setArray(Dh, N, T(1)); +setArray(Eh, N, T(1)); +setArray(Bh, N, T(1)); + +H2D(Ad, Dh, size); +H2D(Bd, Eh, size); + +D2HAsync(Ah, Ad, size, stream); +D2DAsync(Cd, Bd, size, stream); +H2HAsync(Ch, Bh, size, stream); +H2DAsync(Dd, Ch, size, stream); +hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Dd); + +D2H(Fh, Cd, size); +D2H(Gh, Dd, size); + +HIPCHECK(hipDeviceSynchronize()); +HIPASSERT(Dh[10] == Ah[10]); +HIPASSERT(Eh[10] == Fh[10]); +HIPASSERT(Bh[10] + T(1) == Gh[10]); + +} + +int main(int argc, char *argv[]) +{ +HipTest::parseStandardArguments(argc, argv, true); + +test12345(); +test13452(); +test14523(); +test15234(); + +test23451(); +test24513(); +test25134(); +test21345(); + +test34512(); +test35124(); +test31245(); +test32451(); + +test45123(); +test41235(); +test42351(); +test43512(); + +test51234(); +test52341(); +test53412(); +test54123(); + +passed(); + +} + From dcf5ac2c068c8a392547ea1b4c8ec11aa24c8286 Mon Sep 17 00:00:00 2001 From: pensun Date: Sat, 27 Feb 2016 09:43:38 -0600 Subject: [PATCH 91/94] add test case and its driver for HIP_VISIBLE_DEVICES/CUDA_VISIBLE_DEVICES [ROCm/hip commit: 43315ad62b4a13b7c2fc66da1c48f802b18f54ea] --- projects/hip/tests/src/hipEnvVar.cpp | 102 +++++++++++++++++++++ 
projects/hip/tests/src/hipEnvVarDriver.cpp | 38 ++++++++ 2 files changed, 140 insertions(+) create mode 100644 projects/hip/tests/src/hipEnvVar.cpp create mode 100644 projects/hip/tests/src/hipEnvVarDriver.cpp diff --git a/projects/hip/tests/src/hipEnvVar.cpp b/projects/hip/tests/src/hipEnvVar.cpp new file mode 100644 index 0000000000..e0d834d43f --- /dev/null +++ b/projects/hip/tests/src/hipEnvVar.cpp @@ -0,0 +1,102 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + + +#include +#include +#include +#include +#include +#include + +int debug = 0; + +void usage() { + printf("hipEnvVar [otpions]\n\ + -a,\t\ttotal number ofavailable GPUs and their pciBusID\n\ + -s,\t\tselect one GPU and return its pciBusID\n\ + -h,\t\tshow this help message\n\ + "); +} +int main(int argc, char **argv) +{ + extern char *optarg; + extern int optind; + int c, err = 0; + int retDevCnt=0, retDevInfo=0; + int device=0; + //std::cout << "reach here!!" << std::endl; + while ((c = getopt(argc, argv, "cd:h")) != -1) + switch (c) { + case 'c': + retDevCnt = true; + break; + case 'd': + retDevInfo = true; + device = atoi(optarg); + break; + case 'h': + usage(); + return 0; + break; + default : + //usage(); + return -1; + break; + case '?': + err = 1; + break; + } + // device init + int devCount=0; + hipGetDeviceCount(&devCount); + + //printf("\nTotal number of GPU devices in the system is %d\n",devCount); + + if (devCount == 0) { + printf("No HIP enabled device\n"); + return -1; + } + if (device < 0 || device > devCount -1) { + printf("Selected device %d is out of bound. Devices on your system are in range %d - %d\n", + device, 0, devCount -1); + return -1; + } + if (retDevCnt) { + std::cout << "Total number of devices visible in system is "<< devCount << std::endl; + } + if (retDevInfo) { + hipSetDevice(device); + hipDeviceProp_t devProp; + + hipDeviceGetProperties(&devProp, device); + if (devProp.major < 1) { + printf("Device %d does not support HIP\n", device); + return -1; + } + + std::cout << "The selected device pciBusID is " << devProp.pciBusID << std::endl; + } + + exit(0); +} + diff --git a/projects/hip/tests/src/hipEnvVarDriver.cpp b/projects/hip/tests/src/hipEnvVarDriver.cpp new file mode 100644 index 0000000000..df8d2c0f2c --- /dev/null +++ b/projects/hip/tests/src/hipEnvVarDriver.cpp @@ -0,0 +1,38 @@ +/* Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +associated documentation files (the "Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the +following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial +portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT +LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO +EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ + +#include +#include + +using namespace std; + +int main() { + FILE *in; + char buff[512]; + + if(!(in = popen("ls -sail", "r"))){ + return 1; + } + + while(fgets(buff, sizeof(buff), in)!=NULL){ + cout << buff; + } + pclose(in); + + return 0; + +} From ab2f5e015830131f5f438e13f363b51bd6786e5f Mon Sep 17 00:00:00 2001 From: Aditya Avinash Atluri Date: Sat, 27 Feb 2016 13:20:55 -0600 Subject: [PATCH 92/94] Update hipStreamL5.cpp - Added Copyright - Removed unnecessary headers - Added naming scheme - Added comments for failing cases - Reformatted source [ROCm/hip commit: cdccdb9faac9bbf75b10c5716a77b55cb5943b44] --- projects/hip/tests/src/hipStreamL5.cpp | 930 +++++++++++++------------ 1 file changed, 487 insertions(+), 443 deletions(-) diff --git a/projects/hip/tests/src/hipStreamL5.cpp b/projects/hip/tests/src/hipStreamL5.cpp index 66a95f3091..5a65dc50dc 100644 --- a/projects/hip/tests/src/hipStreamL5.cpp +++ b/projects/hip/tests/src/hipStreamL5.cpp @@ -1,7 +1,55 @@ -#include -#include"hip_runtime.h" -#include"test_common.h" -#include"hipStream.h" +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "test_common.h" +#include "hipStream.h" + +/* +The naming of tests is done by assigning a number to +type of dispatch possible on stream. +The following are possible stream dispatches: +1. H2H - hipMemcpyHostToHost : indexed as 1 +2. H2D - hipMemcpyHostToDevice : indexed as 2 +3. Ker - Kernel Dispatch : indexed as 3 +4. D2D - hipMemcpyDeviceToDevice : indexed as 4 +5. D2H - hipMemcpyDeviceToHost : indexed as 5 +For example, +a test for Ker, D2D, D2H, H2H, H2D is given as test34512(); +Note that all memory copies are Async. + +*WARNING: The commented out assertions are failing cases. +According to my observation, they are happening with tests +which end in HostToHost and take data from previous +dispatch in the stream. This also includes disjoint data passes. 
+The list of failing tests are: +test23451(); +test32451(); +test42351(); + +For disjoint data passed: +test24513 +test25134 +test34512 +*/ template void test12345(){ @@ -151,637 +199,633 @@ void test15234(){ template void test23451(){ -hipStream_t stream; -HIPCHECK(hipStreamCreate(&stream)); -const size_t N = 1000; -const size_t size = sizeof(T) * N; + hipStream_t stream; + HIPCHECK(hipStreamCreate(&stream)); + const size_t N = 1000; + const size_t size = sizeof(T) * N; -T *Ah, *Bh, *Ch; -T *Ad, *Bd; + T *Ah, *Bh, *Ch; + T *Ad, *Bd; -initArrays(&Ah, N, false, true); -initArrays(&Bh, N, false, true); -initArrays(&Ch, N, false, true); -initArrays(&Ad, N, true, false); -initArrays(&Bd, N, true, false); + initArrays(&Ah, N, false, true); + initArrays(&Bh, N, false, true); + initArrays(&Ch, N, false, true); + initArrays(&Ad, N, true, false); + initArrays(&Bd, N, true, false); -setArray(Ah, N, T(1)); + setArray(Ah, N, T(1)); -H2DAsync(Ad, Ah, size, stream); -hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Ad); -D2DAsync(Bd, Ad, size, stream); -D2HAsync(Bh, Bd, size, stream); -H2HAsync(Ch, Bh, size, stream); -HIPCHECK(hipDeviceSynchronize()); -//HIPASSERT(Ah[10] == Ch[10]); + H2DAsync(Ad, Ah, size, stream); + hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Ad); + D2DAsync(Bd, Ad, size, stream); + D2HAsync(Bh, Bd, size, stream); + H2HAsync(Ch, Bh, size, stream); + HIPCHECK(hipDeviceSynchronize()); + //HIPASSERT(Ah[10] == Ch[10]); } template void test24513(){ -hipStream_t stream; -HIPCHECK(hipStreamCreate(&stream)); + hipStream_t stream; + HIPCHECK(hipStreamCreate(&stream)); -const size_t N = 1000; -const size_t size = sizeof(T) * N; + const size_t N = 1000; + const size_t size = sizeof(T) * N; -T *Ah, *Bh, *Ch; -T *Dh, *Eh; -T *Ad, *Bd, *Cd; + T *Ah, *Bh, *Ch; + T *Dh, *Eh; + T *Ad, *Bd, *Cd; -initArrays(&Ah, N, false, true); -initArrays(&Bh, N, false, true); -initArrays(&Ch, N, false, true); -initArrays(&Dh, N, false, 
false); -initArrays(&Eh, N, false, false); -initArrays(&Ad, N, true, false); -initArrays(&Bd, N, true, false); -initArrays(&Cd, N, true, false); + initArrays(&Ah, N, false, true); + initArrays(&Bh, N, false, true); + initArrays(&Ch, N, false, true); + initArrays(&Dh, N, false, false); + initArrays(&Eh, N, false, false); + initArrays(&Ad, N, true, false); + initArrays(&Bd, N, true, false); + initArrays(&Cd, N, true, false); -setArray(Ah, N, T(1)); -setArray(Dh, N, T(2)); + setArray(Ah, N, T(1)); + setArray(Dh, N, T(2)); -H2D(Cd, Dh, size); + H2D(Cd, Dh, size); -H2DAsync(Ad, Ah, size, stream); -D2DAsync(Bd, Ad, size, stream); -D2HAsync(Bh, Bd, size, stream); -H2HAsync(Ch, Bh, size, stream); -hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Cd); -HIPCHECK(hipDeviceSynchronize()); + H2DAsync(Ad, Ah, size, stream); + D2DAsync(Bd, Ad, size, stream); + D2HAsync(Bh, Bd, size, stream); + H2HAsync(Ch, Bh, size, stream); + hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Cd); + HIPCHECK(hipDeviceSynchronize()); -D2H(Eh, Cd, size); + D2H(Eh, Cd, size); -HIPASSERT(Eh[0] == Dh[0] + T(1)); -//HIPASSERT(Ah[0] == Ch[0]); + HIPASSERT(Eh[0] == Dh[0] + T(1)); + //HIPASSERT(Ah[0] == Ch[0]); } template void test25134(){ -hipStream_t stream; -HIPCHECK(hipStreamCreate(&stream)); + hipStream_t stream; + HIPCHECK(hipStreamCreate(&stream)); -const size_t N = 1000; -const size_t size = sizeof(T) * N; + const size_t N = 1000; + const size_t size = sizeof(T) * N; -T *Ah, *Bh, *Ch; -T *Dh, *Eh; -T *Ad, *Bd, *Cd; + T *Ah, *Bh, *Ch; + T *Dh, *Eh; + T *Ad, *Bd, *Cd; -initArrays(&Ah, N, false, true); -initArrays(&Bh, N, false, true); -initArrays(&Ch, N, false, true); -initArrays(&Dh, N, false, false); -initArrays(&Eh, N, false, false); -initArrays(&Ad, N, true, false); -initArrays(&Bd, N, true, false); -initArrays(&Cd, N, true, false); + initArrays(&Ah, N, false, true); + initArrays(&Bh, N, false, true); + initArrays(&Ch, N, false, true); + 
initArrays(&Dh, N, false, false); + initArrays(&Eh, N, false, false); + initArrays(&Ad, N, true, false); + initArrays(&Bd, N, true, false); + initArrays(&Cd, N, true, false); -setArray(Ah, N, T(1)); -setArray(Dh, N, T(2)); + setArray(Ah, N, T(1)); + setArray(Dh, N, T(2)); -H2D(Bd, Dh, size); + H2D(Bd, Dh, size); -H2DAsync(Ad, Ah, size, stream); -D2HAsync(Bh, Ad, size, stream); -H2HAsync(Ch, Bh, size, stream); -hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Bd); -D2DAsync(Cd, Bd, size, stream); + H2DAsync(Ad, Ah, size, stream); + D2HAsync(Bh, Ad, size, stream); + H2HAsync(Ch, Bh, size, stream); + hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Bd); + D2DAsync(Cd, Bd, size, stream); -D2H(Eh, Cd, size); + D2H(Eh, Cd, size); -HIPCHECK(hipDeviceSynchronize()); - -//HIPASSERT(Ah[10] == Ch[10]); -HIPASSERT(Dh[10] + T(1) == Eh[10]); + HIPCHECK(hipDeviceSynchronize()); + //HIPASSERT(Ah[10] == Ch[10]); + HIPASSERT(Dh[10] + T(1) == Eh[10]); } template void test21345(){ -hipStream_t stream; -HIPCHECK(hipStreamCreate(&stream)); + hipStream_t stream; + HIPCHECK(hipStreamCreate(&stream)); -const size_t N = 1000; -const size_t size = N * sizeof(T); + const size_t N = 1000; + const size_t size = N * sizeof(T); -T *Ah, *Bh, *Ch, *Dh; -T *Ad, *Bd; + T *Ah, *Bh, *Ch, *Dh; + T *Ad, *Bd; -initArrays(&Ah, N, false, true); -initArrays(&Bh, N, false, true); -initArrays(&Ch, N, false, true); -initArrays(&Dh, N, false, true); -initArrays(&Ad, N, true, false); -initArrays(&Bd, N, true, false); + initArrays(&Ah, N, false, true); + initArrays(&Bh, N, false, true); + initArrays(&Ch, N, false, true); + initArrays(&Dh, N, false, true); + initArrays(&Ad, N, true, false); + initArrays(&Bd, N, true, false); -setArray(Ah, N, T(1)); -setArray(Bh, N, T(2)); + setArray(Ah, N, T(1)); + setArray(Bh, N, T(2)); -H2DAsync(Ad, Ah, size, stream); -H2HAsync(Ch, Bh, size, stream); -hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Ad); 
-D2DAsync(Bd, Ad, size, stream); -D2HAsync(Dh, Bd, size, stream); + H2DAsync(Ad, Ah, size, stream); + H2HAsync(Ch, Bh, size, stream); + hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Ad); + D2DAsync(Bd, Ad, size, stream); + D2HAsync(Dh, Bd, size, stream); -HIPCHECK(hipDeviceSynchronize()); + HIPCHECK(hipDeviceSynchronize()); -HIPASSERT( Bh[10] == Ch[10] ); -HIPASSERT( Ah[10] + T(1) == Dh[10]); + HIPASSERT( Bh[10] == Ch[10] ); + HIPASSERT( Ah[10] + T(1) == Dh[10]); } template void test34512(){ -hipStream_t stream; -HIPCHECK(hipStreamCreate(&stream)); + hipStream_t stream; + HIPCHECK(hipStreamCreate(&stream)); -const size_t N = 1000; -const size_t size = N * sizeof(T); + const size_t N = 1000; + const size_t size = N * sizeof(T); -T *Bh, *Ch, *Dh; -T *Ah, *Eh; -T *Ad, *Bd, *Cd; + T *Bh, *Ch, *Dh; + T *Ah, *Eh; + T *Ad, *Bd, *Cd; -initArrays(&Bh, N, false, true); -initArrays(&Ch, N, false, true); -initArrays(&Dh, N, false, true); -initArrays(&Ah, N, false, false); -initArrays(&Eh, N, false, false); -initArrays(&Ad, N, true, false); -initArrays(&Bd, N, true, false); -initArrays(&Cd, N, true, false); + initArrays(&Bh, N, false, true); + initArrays(&Ch, N, false, true); + initArrays(&Dh, N, false, true); + initArrays(&Ah, N, false, false); + initArrays(&Eh, N, false, false); + initArrays(&Ad, N, true, false); + initArrays(&Bd, N, true, false); + initArrays(&Cd, N, true, false); -setArray(Ah, N, T(1)); + setArray(Ah, N, T(1)); -H2D(Ad, Ah, size); + H2D(Ad, Ah, size); -hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Ad); -D2DAsync(Bd, Ad, size, stream); -D2HAsync(Bh, Bd, size, stream); -H2HAsync(Ch, Bh, size, stream); -H2DAsync(Cd, Ch, size, stream); + hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Ad); + D2DAsync(Bd, Ad, size, stream); + D2HAsync(Bh, Bd, size, stream); + H2HAsync(Ch, Bh, size, stream); + H2DAsync(Cd, Ch, size, stream); -D2H(Dh, Cd, size); + D2H(Dh, Cd, size); 
-HIPCHECK(hipDeviceSynchronize()); -//HIPASSERT( Ah[10] + T(1) == Dh[10] ); + HIPCHECK(hipDeviceSynchronize()); + //HIPASSERT( Ah[10] + T(1) == Dh[10] ); } template void test35124(){ -hipStream_t stream; -HIPCHECK(hipStreamCreate(&stream)); + hipStream_t stream; + HIPCHECK(hipStreamCreate(&stream)); -const size_t N = 1000; -const size_t size = N * sizeof(T); + const size_t N = 1000; + const size_t size = N * sizeof(T); -T *Ah, *Bh; -T *Ch, *Dh; -T *Ad, *Bd, *Cd; + T *Ah, *Bh; + T *Ch, *Dh; + T *Ad, *Bd, *Cd; -initArrays(&Ah, N, false, true); -initArrays(&Bh, N, false, true); -initArrays(&Ch, N, false, false); -initArrays(&Dh, N, false, false); -initArrays(&Ad, N, true, false); -initArrays(&Bd, N, true, false); -initArrays(&Cd, N, true, false); + initArrays(&Ah, N, false, true); + initArrays(&Bh, N, false, true); + initArrays(&Ch, N, false, false); + initArrays(&Dh, N, false, false); + initArrays(&Ad, N, true, false); + initArrays(&Bd, N, true, false); + initArrays(&Cd, N, true, false); -setArray(Dh, N, T(1)); + setArray(Dh, N, T(1)); -H2D(Ad, Dh, size); + H2D(Ad, Dh, size); -hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Ad); -D2HAsync(Ah, Ad, size, stream); -H2HAsync(Bh, Ah, size, stream); -H2DAsync(Bd, Bh, size, stream); -D2DAsync(Cd, Bd, size, stream); + hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Ad); + D2HAsync(Ah, Ad, size, stream); + H2HAsync(Bh, Ah, size, stream); + H2DAsync(Bd, Bh, size, stream); + D2DAsync(Cd, Bd, size, stream); -D2H(Ch, Cd, size); + D2H(Ch, Cd, size); -HIPCHECK(hipDeviceSynchronize()); + HIPCHECK(hipDeviceSynchronize()); -HIPASSERT(Dh[10] + T(1) == Ch[10]); + HIPASSERT(Dh[10] + T(1) == Ch[10]); } template void test31245(){ -hipStream_t stream; -HIPCHECK(hipStreamCreate(&stream)); + hipStream_t stream; + HIPCHECK(hipStreamCreate(&stream)); -const size_t N = 1000; -const size_t size = N * sizeof(T); -T *Ah, *Bh, *Ch; -T *Dh, *Eh; -T *Ad, *Bd, *Cd; + const size_t N = 1000; + const 
size_t size = N * sizeof(T); + T *Ah, *Bh, *Ch; + T *Dh, *Eh; + T *Ad, *Bd, *Cd; -initArrays(&Ah, N, false, true); -initArrays(&Bh, N, false, true); -initArrays(&Ch, N, false, true); -initArrays(&Dh, N, false, false); -initArrays(&Eh, N, false, false); -initArrays(&Ad, N, true, false); -initArrays(&Bd, N, true, false); -initArrays(&Cd, N, true, false); + initArrays(&Ah, N, false, true); + initArrays(&Bh, N, false, true); + initArrays(&Ch, N, false, true); + initArrays(&Dh, N, false, false); + initArrays(&Eh, N, false, false); + initArrays(&Ad, N, true, false); + initArrays(&Bd, N, true, false); + initArrays(&Cd, N, true, false); -setArray(Dh, N, T(1)); -setArray(Ah, N, T(2)); + setArray(Dh, N, T(1)); + setArray(Ah, N, T(2)); -H2D(Ad, Dh, size); + H2D(Ad, Dh, size); -hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Ad); -H2HAsync(Bh, Ah, size, stream); -H2DAsync(Bd, Bh, size, stream); -D2DAsync(Cd, Bd, size, stream); -D2HAsync(Ch, Cd, size, stream); + hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Ad); + H2HAsync(Bh, Ah, size, stream); + H2DAsync(Bd, Bh, size, stream); + D2DAsync(Cd, Bd, size, stream); + D2HAsync(Ch, Cd, size, stream); -D2H(Eh, Ad, size); + D2H(Eh, Ad, size); -HIPCHECK(hipDeviceSynchronize()); + HIPCHECK(hipDeviceSynchronize()); -HIPASSERT(Dh[10] + T(1) == Eh[10]); -HIPASSERT(Bh[10] == Ch[10]); + HIPASSERT(Dh[10] + T(1) == Eh[10]); + HIPASSERT(Bh[10] == Ch[10]); } template void test32451(){ -hipStream_t stream; -HIPCHECK(hipStreamCreate(&stream)); + hipStream_t stream; + HIPCHECK(hipStreamCreate(&stream)); -const size_t N = 1000; -const size_t size = N * sizeof(T); + const size_t N = 1000; + const size_t size = N * sizeof(T); -T *Ah, *Bh, *Ch; -T *Dh, *Eh; -T *Ad, *Bd, *Cd; + T *Ah, *Bh, *Ch; + T *Dh, *Eh; + T *Ad, *Bd, *Cd; -initArrays(&Ah, N, false, true); -initArrays(&Bh, N, false, true); -initArrays(&Ch, N, false, true); -initArrays(&Dh, N, false, false); -initArrays(&Eh, N, false, false); 
-initArrays(&Ad, N, true, false); -initArrays(&Bd, N, true, false); -initArrays(&Cd, N, true, false); + initArrays(&Ah, N, false, true); + initArrays(&Bh, N, false, true); + initArrays(&Ch, N, false, true); + initArrays(&Dh, N, false, false); + initArrays(&Eh, N, false, false); + initArrays(&Ad, N, true, false); + initArrays(&Bd, N, true, false); + initArrays(&Cd, N, true, false); -setArray(Ah, N, T(1)); -setArray(Eh, N, T(2)); + setArray(Ah, N, T(1)); + setArray(Eh, N, T(2)); -H2D(Ad, Eh, size); -hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Ad); -H2DAsync(Bd, Ah, size, stream); -D2DAsync(Cd, Bd, size, stream); -D2HAsync(Bh, Cd, size, stream); -H2HAsync(Ch, Bh, size, stream); -HIPCHECK(hipDeviceSynchronize()); -D2H(Dh, Ad, size); + H2D(Ad, Eh, size); + hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Ad); + H2DAsync(Bd, Ah, size, stream); + D2DAsync(Cd, Bd, size, stream); + D2HAsync(Bh, Cd, size, stream); + H2HAsync(Ch, Bh, size, stream); + HIPCHECK(hipDeviceSynchronize()); + D2H(Dh, Ad, size); -//HIPASSERT(Ah[10] == Ch[10]); -HIPASSERT(Eh[10] + T(1) == Dh[10]); + //HIPASSERT(Ah[10] == Ch[10]); + HIPASSERT(Eh[10] + T(1) == Dh[10]); } template void test45123(){ -hipStream_t stream; -HIPCHECK(hipStreamCreate(&stream)); -const size_t N = 1000; -const size_t size = N * sizeof(T); + hipStream_t stream; + HIPCHECK(hipStreamCreate(&stream)); + const size_t N = 1000; + const size_t size = N * sizeof(T); -T *Ah, *Bh; -T *Ch, *Dh; -T *Ad, *Bd, *Cd; + T *Ah, *Bh; + T *Ch, *Dh; + T *Ad, *Bd, *Cd; -initArrays(&Ah, N, false, true); -initArrays(&Bh, N, false, true); -initArrays(&Ch, N, false, false); -initArrays(&Dh, N, false, false); -initArrays(&Ad, N, true, false); -initArrays(&Bd, N, true, false); -initArrays(&Cd, N, true, false); + initArrays(&Ah, N, false, true); + initArrays(&Bh, N, false, true); + initArrays(&Ch, N, false, false); + initArrays(&Dh, N, false, false); + initArrays(&Ad, N, true, false); + initArrays(&Bd, N, 
true, false); + initArrays(&Cd, N, true, false); -setArray(Dh, N, T(1)); + setArray(Dh, N, T(1)); -H2D(Ad, Dh, size); + H2D(Ad, Dh, size); -D2DAsync(Bd, Ad, size, stream); -D2HAsync(Ah, Bd, size, stream); -H2HAsync(Bh, Ah, size, stream); -H2DAsync(Cd, Bh, size, stream); -hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Cd); -D2H(Ch, Cd, size); -HIPCHECK(hipDeviceSynchronize()); - -HIPASSERT(Dh[10] + T(1) == Ch[10]); + D2DAsync(Bd, Ad, size, stream); + D2HAsync(Ah, Bd, size, stream); + H2HAsync(Bh, Ah, size, stream); + H2DAsync(Cd, Bh, size, stream); + hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Cd); + D2H(Ch, Cd, size); + HIPCHECK(hipDeviceSynchronize()); + HIPASSERT(Dh[10] + T(1) == Ch[10]); } template void test41235(){ -hipStream_t stream; -HIPCHECK(hipStreamCreate(&stream)); -const size_t N = 1000; -const size_t size = N * sizeof(T); + hipStream_t stream; + HIPCHECK(hipStreamCreate(&stream)); + const size_t N = 1000; + const size_t size = N * sizeof(T); -T *Ah, *Bh; -T *Ch; -T *Ad, *Bd, *Cd; + T *Ah, *Bh; + T *Ch; + T *Ad, *Bd, *Cd; -initArrays(&Ah, N, false, true); -initArrays(&Bh, N, false, true); -initArrays(&Ch, N, false, false); -initArrays(&Ad, N, true, false); -initArrays(&Bd, N, true, false); -initArrays(&Cd, N, true, false); + initArrays(&Ah, N, false, true); + initArrays(&Bh, N, false, true); + initArrays(&Ch, N, false, false); + initArrays(&Ad, N, true, false); + initArrays(&Bd, N, true, false); + initArrays(&Cd, N, true, false); -setArray(Ch, N, T(1)); + setArray(Ch, N, T(1)); -H2D(Ad, Ch, size); + H2D(Ad, Ch, size); -D2DAsync(Bd, Ad, size, stream); -D2HAsync(Ah, Bd, size, stream); -H2DAsync(Cd, Ah, size, stream); -hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Cd); -D2HAsync(Bh, Cd, size, stream); + D2DAsync(Bd, Ad, size, stream); + D2HAsync(Ah, Bd, size, stream); + H2DAsync(Cd, Ah, size, stream); + hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, 
Cd); + D2HAsync(Bh, Cd, size, stream); -HIPCHECK(hipDeviceSynchronize()); + HIPCHECK(hipDeviceSynchronize()); -HIPASSERT(Ch[10] + T(1) == Bh[10]); + HIPASSERT(Ch[10] + T(1) == Bh[10]); } template void test42351(){ -hipStream_t stream; -HIPCHECK(hipStreamCreate(&stream)); + hipStream_t stream; + HIPCHECK(hipStreamCreate(&stream)); -const size_t N = 1000; -const size_t size = N * sizeof(T); + const size_t N = 1000; + const size_t size = N * sizeof(T); -T *Ah, *Bh, *Ch; -T *Dh, *Eh; -T *Ad, *Bd, *Cd; + T *Ah, *Bh, *Ch; + T *Dh, *Eh; + T *Ad, *Bd, *Cd; -initArrays(&Ah, N, false, true); -initArrays(&Bh, N, false, true); -initArrays(&Ch, N, false, true); -initArrays(&Dh, N, false, false); -initArrays(&Eh, N, false, false); -initArrays(&Ad, N, true, false); -initArrays(&Bd, N, true, false); -initArrays(&Cd, N, true, false); + initArrays(&Ah, N, false, true); + initArrays(&Bh, N, false, true); + initArrays(&Ch, N, false, true); + initArrays(&Dh, N, false, false); + initArrays(&Eh, N, false, false); + initArrays(&Ad, N, true, false); + initArrays(&Bd, N, true, false); + initArrays(&Cd, N, true, false); -setArray(Dh, N, T(2)); -setArray(Ah, N, T(1)); + setArray(Dh, N, T(2)); + setArray(Ah, N, T(1)); -H2D(Ad, Dh, size); + H2D(Ad, Dh, size); -D2DAsync(Bd, Ad, size, stream); -H2DAsync(Cd, Ah, size, stream); -hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Cd); -D2HAsync(Bh, Cd, size, stream); -H2HAsync(Ch, Bh, size, stream); + D2DAsync(Bd, Ad, size, stream); + H2DAsync(Cd, Ah, size, stream); + hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Cd); + D2HAsync(Bh, Cd, size, stream); + H2HAsync(Ch, Bh, size, stream); -D2H(Eh, Bd, size); - -HIPCHECK(hipDeviceSynchronize()); -HIPASSERT(Dh[10] == Eh[10]); -//HIPASSERT(Ah[10] + T(1) == Ch[10]); + D2H(Eh, Bd, size); + HIPCHECK(hipDeviceSynchronize()); + HIPASSERT(Dh[10] == Eh[10]); + //HIPASSERT(Ah[10] + T(1) == Ch[10]); } template void test43512(){ -hipStream_t stream; 
-HIPCHECK(hipStreamCreate(&stream)); + hipStream_t stream; + HIPCHECK(hipStreamCreate(&stream)); -const size_t N = 1000; -const size_t size = N * sizeof(T); + const size_t N = 1000; + const size_t size = N * sizeof(T); -T *Ah, *Bh; -T *Ch, *Dh; -T *Ad, *Bd, *Cd; + T *Ah, *Bh; + T *Ch, *Dh; + T *Ad, *Bd, *Cd; -initArrays(&Ah, N, false, true); -initArrays(&Bh, N, false, true); -initArrays(&Ch, N, false, false); -initArrays(&Dh, N, false, false); -initArrays(&Ad, N, true, false); -initArrays(&Bd, N, true, false); -initArrays(&Cd, N, true, false); + initArrays(&Ah, N, false, true); + initArrays(&Bh, N, false, true); + initArrays(&Ch, N, false, false); + initArrays(&Dh, N, false, false); + initArrays(&Ad, N, true, false); + initArrays(&Bd, N, true, false); + initArrays(&Cd, N, true, false); -setArray(Dh, N, T(1)); + setArray(Dh, N, T(1)); -H2D(Ad, Dh, size); + H2D(Ad, Dh, size); -D2DAsync(Bd, Ad, size, stream); -hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Bd); -D2HAsync(Ah, Bd, size, stream); -H2HAsync(Bh, Ah, size, stream); -H2DAsync(Cd, Bh, size, stream); + D2DAsync(Bd, Ad, size, stream); + hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Bd); + D2HAsync(Ah, Bd, size, stream); + H2HAsync(Bh, Ah, size, stream); + H2DAsync(Cd, Bh, size, stream); -D2H(Ch, Cd, size); -HIPCHECK(hipDeviceSynchronize()); -//HIPASSERT( Dh[10] + T(1) == Ch[10]); + D2H(Ch, Cd, size); + HIPCHECK(hipDeviceSynchronize()); + //HIPASSERT( Dh[10] + T(1) == Ch[10]); } template void test51234(){ -hipStream_t stream; -HIPCHECK(hipStreamCreate(&stream)); + hipStream_t stream; + HIPCHECK(hipStreamCreate(&stream)); -const size_t N = 1000; -const size_t size = N * sizeof(T); + const size_t N = 1000; + const size_t size = N * sizeof(T); -T *Ah, *Bh; -T *Ch, *Dh; -T *Ad, *Bd, *Cd; + T *Ah, *Bh; + T *Ch, *Dh; + T *Ad, *Bd, *Cd; -initArrays(&Ah, N, false, true); -initArrays(&Bh, N, false, true); -initArrays(&Ch, N, false, false); -initArrays(&Dh, N, false, 
false); -initArrays(&Ad, N, true, false); -initArrays(&Bd, N, true, false); -initArrays(&Cd, N, true, false); + initArrays(&Ah, N, false, true); + initArrays(&Bh, N, false, true); + initArrays(&Ch, N, false, false); + initArrays(&Dh, N, false, false); + initArrays(&Ad, N, true, false); + initArrays(&Bd, N, true, false); + initArrays(&Cd, N, true, false); -setArray(Dh, N, T(1)); + setArray(Dh, N, T(1)); -H2D(Ad, Dh, size); + H2D(Ad, Dh, size); -D2HAsync(Ah, Ad, size, stream); -H2HAsync(Bh, Ah, size, stream); -H2DAsync(Bd, Bh, size, stream); -hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Bd); -D2DAsync(Cd, Bd, size, stream); + D2HAsync(Ah, Ad, size, stream); + H2HAsync(Bh, Ah, size, stream); + H2DAsync(Bd, Bh, size, stream); + hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Bd); + D2DAsync(Cd, Bd, size, stream); -D2H(Ch, Cd, size); + D2H(Ch, Cd, size); -HIPCHECK(hipDeviceSynchronize()); + HIPCHECK(hipDeviceSynchronize()); -HIPASSERT(Ch[10] == Dh[10] + T(1)); + HIPASSERT(Ch[10] == Dh[10] + T(1)); } template void test52341(){ -hipStream_t stream; -HIPCHECK(hipStreamCreate(&stream)); -const size_t N = 1000; -const size_t size = N * sizeof(T); + hipStream_t stream; + HIPCHECK(hipStreamCreate(&stream)); + const size_t N = 1000; + const size_t size = N * sizeof(T); -T *Ah, *Bh, *Ch; -T *Dh, *Eh; -T *Ad, *Bd, *Cd; + T *Ah, *Bh, *Ch; + T *Dh, *Eh; + T *Ad, *Bd, *Cd; -initArrays(&Ah, N, false, true); -initArrays(&Bh, N, false, true); -initArrays(&Ch, N, false, true); -initArrays(&Dh, N, false, false); -initArrays(&Eh, N, false, false); -initArrays(&Ad, N, true, false); -initArrays(&Bd, N, true, false); -initArrays(&Cd, N, true, false); + initArrays(&Ah, N, false, true); + initArrays(&Bh, N, false, true); + initArrays(&Ch, N, false, true); + initArrays(&Dh, N, false, false); + initArrays(&Eh, N, false, false); + initArrays(&Ad, N, true, false); + initArrays(&Bd, N, true, false); + initArrays(&Cd, N, true, false); 
-setArray(Eh, N, T(1)); -setArray(Bh, N, T(2)); + setArray(Eh, N, T(1)); + setArray(Bh, N, T(2)); -H2D(Ad, Eh, size); + H2D(Ad, Eh, size); -D2HAsync(Ah, Ad, size, stream); -H2DAsync(Bd, Ah, size, stream); -hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Bd); -D2DAsync(Cd, Bd, size, stream); -H2HAsync(Ch, Bh, size, stream); + D2HAsync(Ah, Ad, size, stream); + H2DAsync(Bd, Ah, size, stream); + hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Bd); + D2DAsync(Cd, Bd, size, stream); + H2HAsync(Ch, Bh, size, stream); -D2H(Dh, Cd, size); + D2H(Dh, Cd, size); -HIPCHECK(hipDeviceSynchronize()); + HIPCHECK(hipDeviceSynchronize()); -HIPASSERT(Eh[10] + T(1) == Dh[10]); -HIPASSERT(Ch[10] == Bh[10]); + HIPASSERT(Eh[10] + T(1) == Dh[10]); + HIPASSERT(Ch[10] == Bh[10]); } template void test53412(){ -hipStream_t stream; -HIPCHECK(hipStreamCreate(&stream)); -const size_t N = 1000; -const size_t size = sizeof(T) * N; + hipStream_t stream; + HIPCHECK(hipStreamCreate(&stream)); + const size_t N = 1000; + const size_t size = sizeof(T) * N; -T *Ah, *Bh, *Ch, *Dh; -T *Eh, *Fh, *Gh; -T *Ad, *Bd, *Cd, *Dd; + T *Ah, *Bh, *Ch, *Dh; + T *Eh, *Fh, *Gh; + T *Ad, *Bd, *Cd, *Dd; -initArrays(&Ah, N, false, true); -initArrays(&Bh, N, false, true); -initArrays(&Ch, N, false, true); -initArrays(&Dh, N, false, true); -initArrays(&Eh, N, false, false); -initArrays(&Fh, N, false, false); -initArrays(&Gh, N, false, false); -initArrays(&Ad, N, true, false); -initArrays(&Bd, N, true, false); -initArrays(&Cd, N, true, false); -initArrays(&Dd, N, true, false); + initArrays(&Ah, N, false, true); + initArrays(&Bh, N, false, true); + initArrays(&Ch, N, false, true); + initArrays(&Dh, N, false, true); + initArrays(&Eh, N, false, false); + initArrays(&Fh, N, false, false); + initArrays(&Gh, N, false, false); + initArrays(&Ad, N, true, false); + initArrays(&Bd, N, true, false); + initArrays(&Cd, N, true, false); + initArrays(&Dd, N, true, false); -setArray(Dh, N, 
T(1)); -setArray(Eh, N, T(2)); -setArray(Bh, N, T(3)); + setArray(Dh, N, T(1)); + setArray(Eh, N, T(2)); + setArray(Bh, N, T(3)); -H2D(Ad, Dh, size); -H2D(Bd, Eh, size); + H2D(Ad, Dh, size); + H2D(Bd, Eh, size); -D2HAsync(Ah, Ad, size, stream); -hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Bd); -D2DAsync(Cd, Bd, size, stream); -H2HAsync(Ch, Bh, size, stream); -H2DAsync(Dd, Ch, size, stream); + D2HAsync(Ah, Ad, size, stream); + hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Bd); + D2DAsync(Cd, Bd, size, stream); + H2HAsync(Ch, Bh, size, stream); + H2DAsync(Dd, Ch, size, stream); -D2H(Fh, Cd, size); -D2H(Gh, Dd, size); + D2H(Fh, Cd, size); + D2H(Gh, Dd, size); -HIPASSERT(Ah[10] == Dh[10]); -HIPASSERT(Eh[10] + T(1) == Fh[10]); -HIPASSERT(Bh[10] == Gh[10]); + HIPASSERT(Ah[10] == Dh[10]); + HIPASSERT(Eh[10] + T(1) == Fh[10]); + HIPASSERT(Bh[10] == Gh[10]); } template void test54123(){ -hipStream_t stream; -HIPCHECK(hipStreamCreate(&stream)); + hipStream_t stream; + HIPCHECK(hipStreamCreate(&stream)); -const size_t N = 1000; -const size_t size = N * sizeof(T); + const size_t N = 1000; + const size_t size = N * sizeof(T); -T *Ah, *Bh, *Ch; -T *Dh, *Eh, *Fh, *Gh; -T *Ad, *Bd, *Cd, *Dd; + T *Ah, *Bh, *Ch; + T *Dh, *Eh, *Fh, *Gh; + T *Ad, *Bd, *Cd, *Dd; -initArrays(&Ah, N, false, true); -initArrays(&Bh, N, false, true); -initArrays(&Ch, N, false, true); -initArrays(&Dh, N, false, false); -initArrays(&Eh, N, false, false); -initArrays(&Fh, N, false, false); -initArrays(&Gh, N, false, false); -initArrays(&Ad, N, true, false); -initArrays(&Bd, N, true, false); -initArrays(&Cd, N, true, false); -initArrays(&Dd, N, true, false); + initArrays(&Ah, N, false, true); + initArrays(&Bh, N, false, true); + initArrays(&Ch, N, false, true); + initArrays(&Dh, N, false, false); + initArrays(&Eh, N, false, false); + initArrays(&Fh, N, false, false); + initArrays(&Gh, N, false, false); + initArrays(&Ad, N, true, false); + initArrays(&Bd, 
N, true, false); + initArrays(&Cd, N, true, false); + initArrays(&Dd, N, true, false); -setArray(Dh, N, T(1)); -setArray(Eh, N, T(1)); -setArray(Bh, N, T(1)); + setArray(Dh, N, T(1)); + setArray(Eh, N, T(1)); + setArray(Bh, N, T(1)); -H2D(Ad, Dh, size); -H2D(Bd, Eh, size); + H2D(Ad, Dh, size); + H2D(Bd, Eh, size); -D2HAsync(Ah, Ad, size, stream); -D2DAsync(Cd, Bd, size, stream); -H2HAsync(Ch, Bh, size, stream); -H2DAsync(Dd, Ch, size, stream); -hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Dd); + D2HAsync(Ah, Ad, size, stream); + D2DAsync(Cd, Bd, size, stream); + H2HAsync(Ch, Bh, size, stream); + H2DAsync(Dd, Ch, size, stream); + hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Dd); -D2H(Fh, Cd, size); -D2H(Gh, Dd, size); - -HIPCHECK(hipDeviceSynchronize()); -HIPASSERT(Dh[10] == Ah[10]); -HIPASSERT(Eh[10] == Fh[10]); -HIPASSERT(Bh[10] + T(1) == Gh[10]); + D2H(Fh, Cd, size); + D2H(Gh, Dd, size); + HIPCHECK(hipDeviceSynchronize()); + HIPASSERT(Dh[10] == Ah[10]); + HIPASSERT(Eh[10] == Fh[10]); + HIPASSERT(Bh[10] + T(1) == Gh[10]); } int main(int argc, char *argv[]) { -HipTest::parseStandardArguments(argc, argv, true); + HipTest::parseStandardArguments(argc, argv, true); -test12345(); -test13452(); -test14523(); -test15234(); + test12345(); + test13452(); + test14523(); + test15234(); -test23451(); -test24513(); -test25134(); -test21345(); + test23451(); + test24513(); + test25134(); + test21345(); -test34512(); -test35124(); -test31245(); -test32451(); + test34512(); + test35124(); + test31245(); + test32451(); -test45123(); -test41235(); -test42351(); -test43512(); + test45123(); + test41235(); + test42351(); + test43512(); -test51234(); -test52341(); -test53412(); -test54123(); + test51234(); + test52341(); + test53412(); + test54123(); -passed(); + passed(); } From 1154872b0d35a899da934e4d849311b9dcc4834c Mon Sep 17 00:00:00 2001 From: pensun Date: Sat, 27 Feb 2016 14:14:08 -0600 Subject: [PATCH 93/94] improve the 
HIP_VISIBLE_DEVICES implementation [ROCm/hip commit: 1f606261c1c724d2d009f8ca7d82ba5b8c945fff] --- projects/hip/src/hip_hcc.cpp | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 945248c11f..e9de7f53a6 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -291,6 +291,7 @@ thread_local int tls_defaultDevice = 0; // Global initialization. std::once_flag hip_initialized; ihipDevice_t *g_devices; // Array of all non-emulated (ie GPU) accelerators in the system. +bool g_visible_device = false; // Set the flag when HIP_VISIBLE_DEVICES is set unsigned g_deviceCnt; //================================================================================================= @@ -299,7 +300,7 @@ unsigned g_deviceCnt; //Forward Declarations: //================================================================================================= INLINE bool ihipIsValidDevice(unsigned deviceIndex); - +INLINE bool ihipIsVisibleDevice(unsigned deviceIndex); //================================================================================================= // Implementation: //================================================================================================= @@ -865,6 +866,7 @@ void ihipReadEnv_I(int *var_ptr, const char *var_name1, const char *var_name2, c std::string device_id; // Clean up the defult value g_hip_visible_devices.clear(); + g_visible_device = true; // Read the visible device numbers while (std::getline(ss, device_id, ',')) { if (atoi(device_id.c_str()) >= 0) { @@ -965,9 +967,10 @@ void ihipInit() for (int i=0; i"); - } INLINE bool ihipIsValidDevice(unsigned deviceIndex) From b9421830a4c18ab82b50559d4000a42b751ef817 Mon Sep 17 00:00:00 2001 From: pensun Date: Sat, 27 Feb 2016 14:48:00 -0600 Subject: [PATCH 94/94] Test cases for HIP_VISIBLE_DEVICES/CUDA_VISIBLE_DEVICES. 
hipEnvVar is the base test case, to be called by hipEnvVarDriver at the run time. Test case includes tests for normal use case of the environment variable, invalid value/sequence and use CUDA_VISIBLE_DEVICES as a alternative. [ROCm/hip commit: 39b44cb484e0d656ff06126981a9b75364bfce9a] --- projects/hip/src/hip_hcc.cpp | 24 +++--- projects/hip/tests/src/CMakeLists.txt | 3 + projects/hip/tests/src/hipEnvVar.cpp | 44 ++++++++--- projects/hip/tests/src/hipEnvVarDriver.cpp | 91 ++++++++++++++++++---- 4 files changed, 125 insertions(+), 37 deletions(-) diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index e9de7f53a6..a96227dc7f 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -300,7 +300,7 @@ unsigned g_deviceCnt; //Forward Declarations: //================================================================================================= INLINE bool ihipIsValidDevice(unsigned deviceIndex); -INLINE bool ihipIsVisibleDevice(unsigned deviceIndex); + //================================================================================================= // Implementation: //================================================================================================= @@ -967,25 +967,23 @@ void ihipInit() for (int i=0; i"); + } INLINE bool ihipIsValidDevice(unsigned deviceIndex) @@ -994,12 +992,12 @@ INLINE bool ihipIsValidDevice(unsigned deviceIndex) return (deviceIndex < g_deviceCnt); } -// check if the device ID is set as visible -INLINE bool ihipIsVisibleDevice(unsigned deviceIndex) -{ - return std::find(g_hip_visible_devices.begin(), g_hip_visible_devices.end(), - (int)deviceIndex) != g_hip_visible_devices.end(); -} +/*// check if the device ID is set as visible*/ +//INLINE bool ihipIsVisibleDevice(unsigned deviceIndex) +//{ + //return std::find(g_hip_visible_devices.begin(), g_hip_visible_devices.end(), + //(int)deviceIndex) != g_hip_visible_devices.end(); +/*}*/ //--- INLINE ihipDevice_t *ihipGetTlsDefaultDevice() diff 
--git a/projects/hip/tests/src/CMakeLists.txt b/projects/hip/tests/src/CMakeLists.txt index cc6af0b5d2..49ab081766 100644 --- a/projects/hip/tests/src/CMakeLists.txt +++ b/projects/hip/tests/src/CMakeLists.txt @@ -110,6 +110,8 @@ make_hip_executable (hip_clz hip_clz.cpp) make_hip_executable (hip_brev hip_brev.cpp) make_hip_executable (hip_ffs hip_ffs.cpp) make_hip_executable (hipGetDeviceAttribute hipGetDeviceAttribute.cpp) +make_hip_executable (hipEnvVar hipEnvVar.cpp) +make_hip_executable (hipEnvVarDriver hipEnvVarDriver.cpp) make_hip_executable (hipMemcpy hipMemcpy.cpp) make_hip_executable (hipMemcpyAsync hipMemcpyAsync.cpp) make_hip_executable (hipMemset hipMemset.cpp) @@ -137,6 +139,7 @@ make_test(hipMemset --N 10 --memsetval 0x42 ) # small copy, just 10 bytes. make_test(hipMemset --N 10013 --memsetval 0x5a ) # oddball size. make_test(hipMemset --N 256M --memsetval 0xa6 ) # big copy make_test(hipGridLaunch " " ) +make_test(hipEnvVarDriver " " ) make_test(hipPointerAttrib " " ) make_test(hipMultiThreadStreams " " ) diff --git a/projects/hip/tests/src/hipEnvVar.cpp b/projects/hip/tests/src/hipEnvVar.cpp index e0d834d43f..6f9047776c 100644 --- a/projects/hip/tests/src/hipEnvVar.cpp +++ b/projects/hip/tests/src/hipEnvVar.cpp @@ -28,24 +28,27 @@ THE SOFTWARE. 
#include #include -int debug = 0; +using namespace std; void usage() { printf("hipEnvVar [otpions]\n\ - -a,\t\ttotal number ofavailable GPUs and their pciBusID\n\ - -s,\t\tselect one GPU and return its pciBusID\n\ + -c,\t\ttotal number ofavailable GPUs and their pciBusID\n\ + -d,\t\tselect one GPU and return its pciBusID\n\ + -v,\t\tsend the list to HIP_VISIBLE_DEVICES env var\n\ -h,\t\tshow this help message\n\ "); } int main(int argc, char **argv) { + //string str = getenv("HIP_VISIBLE_DEVICES"); + //std::cout << "The current env HIP_VISIBLE_DEVICES is"< +#include #include - +#include +#include using namespace std; -int main() { - FILE *in; - char buff[512]; +//./hipEnvVar -c -d 0 -h + //putenv("SomeVariable=SomeValue"); + //putenv("export HIP_VISIBLE_DEVICES=0,1,2,3"); - if(!(in = popen("ls -sail", "r"))){ +int getDeviceNumber(){ + FILE *in; + char buff[512]; + string str; + if(!(in = popen("./hipEnvVar -c", "r"))){ return 1; } - - while(fgets(buff, sizeof(buff), in)!=NULL){ - cout << buff; - } + fgets(buff, sizeof(buff), in); pclose(in); - - return 0; - + return atoi(buff); +} + +int getDevicePCIBusNum(int deviceID){ + FILE *in; + char buff[512]; + string str = "./hipEnvVar -d "; + str += std::to_string(deviceID); + if(!(in = popen(str.c_str(), "r"))){ + return 1; + } + fgets(buff, sizeof(buff), in); + pclose(in); + return atoi(buff); +} + +int main() { + unsetenv("HIP_VISIBLE_DEVICES"); + //collect the device pci bus ID for all devices + int totalDeviceNum = getDeviceNumber(); + std::cout << "The total number of available devices is " << totalDeviceNum<< std::endl + <<"Valid index range is 0 - "< devPCINum; + for (int i = 0; i < totalDeviceNum ; i++) { + devPCINum.push_back(getDevicePCIBusNum(i)); + std::cout <<"The collected device PCI Bus ID of Device "< 2){ + setenv("HIP_VISIBLE_DEVICES","0,1,1000,2",1); + assert(getDeviceNumber() == 2); + + setenv("HIP_VISIBLE_DEVICES","0,1,2",1); + assert(getDeviceNumber() == 3); + // test if CUDA_VISIBLE_DEVICES will be 
accepted by the runtime + unsetenv("HIP_VISIBLE_DEVICES"); + setenv("CUDA_VISIBLE_DEVICES","0,1,2",1); + assert(getDeviceNumber() == 3); + } + + setenv("HIP_VISIBLE_DEVICES","-100,0,1",1); + assert(getDeviceNumber() == 0); + + std::cout << "Passed!" << std::endl; + return 0; }