From a2c405c2654d5dd33af2957c643b24f720b1644a Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Wed, 26 Apr 2017 18:56:57 -0500 Subject: [PATCH 01/77] added hipFuncSetCacheConfig API for nvcc path Change-Id: I87fae35bc0e10a0dca5ae1c5015fe5d9e52a1d0d [ROCm/hip commit: fc6248ce82cb62032ed7138e659d520ba59f95f6] --- .../include/hip/nvcc_detail/hip_runtime_api.h | 4 +++ .../module/hipFuncSetCacheConfig.cpp | 36 +++++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 projects/hip/tests/src/runtimeApi/module/hipFuncSetCacheConfig.cpp diff --git a/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h b/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h index 0cc40f32af..4feefcc342 100644 --- a/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h +++ b/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h @@ -897,6 +897,10 @@ inline static hipError_t hipModuleLaunchKernel(hipFunction_t f, } +inline static hipError_t hipFuncSetCacheConfig(const void* func, hipFuncCache_t cacheConfig) +{ + return hipCUDAErrorTohipError(cudaFuncSetCacheConfig(func, cacheConfig)); +} #ifdef __cplusplus } diff --git a/projects/hip/tests/src/runtimeApi/module/hipFuncSetCacheConfig.cpp b/projects/hip/tests/src/runtimeApi/module/hipFuncSetCacheConfig.cpp new file mode 100644 index 0000000000..e3c3efad3d --- /dev/null +++ b/projects/hip/tests/src/runtimeApi/module/hipFuncSetCacheConfig.cpp @@ -0,0 +1,36 @@ +/* +Copyright (c) 2015-Present Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * RUN: %t + * HIT_END + */ + +#include +#include +#include"test_common.h" + +int main(){ + hipFuncCache_t cacheConfig; + void *func; + hipFuncSetCacheConfig(func, cacheConfig); + passed(); +} + From a565e85c087a434cf20c2e27f205771f70e21925 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Wed, 26 Apr 2017 19:01:10 -0500 Subject: [PATCH 02/77] fixed fast math expf and exp10f Change-Id: I73963220f902efebb0a7404c5f8966dffb4c35ca [ROCm/hip commit: ab2eb420e2bfdfd1f5972a4341b5fb4ccf41c24a] --- projects/hip/src/device_util.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/projects/hip/src/device_util.cpp b/projects/hip/src/device_util.cpp index 8ce53765b5..b730412874 100644 --- a/projects/hip/src/device_util.cpp +++ b/projects/hip/src/device_util.cpp @@ -1163,18 +1163,18 @@ __device__ double __hip_precise_dsqrt_rz(double x) { return hc::precise_math::sqrt(x); } -#define LOG_BASE2_E_DIV_2 0.4426950408894701 -#define LOG_BASE2_5 2.321928094887362 +#define LOG_BASE2_E 1.4426950408889634 +#define LOG_BASE2_10 3.32192809488736 #define ONE_DIV_LOG_BASE2_E 0.69314718056 #define ONE_DIV_LOG_BASE2_10 0.30102999566 // Fast Math Intrinsics __device__ float __hip_fast_exp10f(float x) { - return __hip_fast_exp2f(x*LOG_BASE2_E_DIV_2); + return __hip_fast_exp2f(x*LOG_BASE2_E); } __device__ float __hip_fast_expf(float x) { - return __hip_fast_expf(x*LOG_BASE2_5); + return __hip_fast_exp2f(x*LOG_BASE2_10); } __device__ float __hip_fast_frsqrt_rn(float x) { From eb68f5d5af7b117fd1020223bca16aa8ba26548f Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Fri, 28 Apr 2017 11:53:11 -0500 Subject: [PATCH 03/77] fixed hipFuncSetCacheConfig on rocm path Change-Id: I937a3afbf115edc94a753a0beb2230ed60a6f021 [ROCm/hip commit: a5cb2d40ec2e8684ee57faa709b293d84eef4459] --- projects/hip/include/hip/hcc_detail/hip_runtime_api.h | 2 +- projects/hip/src/hip_device.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/hip/include/hip/hcc_detail/hip_runtime_api.h b/projects/hip/include/hip/hcc_detail/hip_runtime_api.h index 7a99ff0810..6917f04f96 100644 --- a/projects/hip/include/hip/hcc_detail/hip_runtime_api.h +++ b/projects/hip/include/hip/hcc_detail/hip_runtime_api.h @@ -385,7 +385,7 @@ hipError_t hipDeviceGetLimit(size_t *pValue, enum hipLimit_t limit); * Note: AMD devices and recent Nvidia GPUS do not support reconfigurable cache. This hint is ignored on those architectures. * */ -hipError_t hipFuncSetCacheConfig ( hipFuncCache_t config ); +hipError_t hipFuncSetCacheConfig (const void* func, hipFuncCache_t config ); /** * @brief Returns bank width of shared memory for current device diff --git a/projects/hip/src/hip_device.cpp b/projects/hip/src/hip_device.cpp index 88d94411e8..01a213190f 100644 --- a/projects/hip/src/hip_device.cpp +++ b/projects/hip/src/hip_device.cpp @@ -112,7 +112,7 @@ hipError_t hipDeviceGetLimit (size_t *pValue, hipLimit_t limit) } } -hipError_t hipFuncSetCacheConfig (hipFuncCache_t cacheConfig) +hipError_t hipFuncSetCacheConfig (const void* func, hipFuncCache_t cacheConfig) { HIP_INIT_API(cacheConfig); From 4fe32b302fe1992c5c9b4e4913a7693dcd68e08a Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Fri, 28 Apr 2017 21:03:03 +0300 Subject: [PATCH 04/77] [HIPIFY] [FIX] replacement error: cudaError_t -> hipError_t_t https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/issues/71 [Solution] getUnqualifiedType for enumConstantDecl's type is added, except ordinary enum declarations (w/o typedef). [ToDo] Find more appropriate way of distinguishing redefined enum declarations and ordinary ones. [ROCm/hip commit: 3d88932c8de55de0e44df5f324899f87645c876c] --- projects/hip/hipify-clang/src/Cuda2Hip.cpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/projects/hip/hipify-clang/src/Cuda2Hip.cpp b/projects/hip/hipify-clang/src/Cuda2Hip.cpp index 5a2940322e..f0fa8331dc 100644 --- a/projects/hip/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/hip/hipify-clang/src/Cuda2Hip.cpp @@ -191,7 +191,7 @@ struct cuda2hipMap { // Error codes and return types cuda2hipRename["CUresult"] = {"hipError_t", CONV_TYPE, API_DRIVER}; - cuda2hipRename["cudaError_enum"] = {"hipError_t", CONV_TYPE, API_DRIVER}; +// cuda2hipRename["cudaError_enum"] = {"hipError_t", CONV_TYPE, API_DRIVER}; cuda2hipRename["cudaError_t"] = {"hipError_t", CONV_TYPE, API_RUNTIME}; cuda2hipRename["cudaError"] = {"hipError_t", CONV_TYPE, API_RUNTIME}; @@ -2806,12 +2806,11 @@ private: bool cudaEnumConstantDecl(const MatchFinder::MatchResult &Result) { if (const VarDecl *enumConstantDecl = Result.Nodes.getNodeAs("cudaEnumConstantDecl")) { - StringRef name = - enumConstantDecl->getType()->getAsTagDecl()->getNameAsString(); - // anonymous typedef enum - if (name.empty()) { - QualType QT = enumConstantDecl->getType().getUnqualifiedType(); - name = QT.getAsString(); + StringRef name = enumConstantDecl->getType()->getAsTagDecl()->getNameAsString(); + QualType QT = enumConstantDecl->getType().getUnqualifiedType(); + StringRef name_unqualified = QT.getAsString(); + if ((name_unqualified.find(' ') == StringRef::npos && name.find(' ') == StringRef::npos) || name.empty()) { + name = name_unqualified; } SourceLocation sl = enumConstantDecl->getLocStart(); SourceManager *SM = Result.SourceManager; From a83172c49b49f3fee0c919232751cd624512fd3b Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Fri, 28 Apr 2017 21:45:36 +0300 Subject: [PATCH 05/77] * [HIPIFY] [FIX] Replacement error: enum cudaMemcpyKind kind -> hipMemcpyKindyKind kind https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/issues/72 [Solution] [Workaround] Offset calculation for enum VarDecl as param decl, declared with enum type specifier. [Result] enum cudaMemcpyKind kind -> enum hipMemcpyKind kind [ToDo] Test on terminal qualifiers (const, etc). [ROCm/hip commit: eddd02199620f12bea2bb3f3f94f896ad788ccf3] --- projects/hip/hipify-clang/src/Cuda2Hip.cpp | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/projects/hip/hipify-clang/src/Cuda2Hip.cpp b/projects/hip/hipify-clang/src/Cuda2Hip.cpp index f0fa8331dc..35c930f3af 100644 --- a/projects/hip/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/hip/hipify-clang/src/Cuda2Hip.cpp @@ -2812,8 +2812,19 @@ private: if ((name_unqualified.find(' ') == StringRef::npos && name.find(' ') == StringRef::npos) || name.empty()) { name = name_unqualified; } - SourceLocation sl = enumConstantDecl->getLocStart(); + // Workaround for enum VarDecl as param decl, declared with enum type specifier + // Example: void func(enum cudaMemcpyKind kind); + //------------------------------------------------- SourceManager *SM = Result.SourceManager; + SourceLocation sl(enumConstantDecl->getLocStart()); + SourceLocation end(enumConstantDecl->getLocEnd()); + size_t repLength = SM->getCharacterData(end) - SM->getCharacterData(sl); + StringRef sfull = StringRef(SM->getCharacterData(sl), repLength); + size_t offset = sfull.find(name); + if (offset > 0) { + sl = sl.getLocWithOffset(offset); + } + //------------------------------------------------- const auto found = N.cuda2hipRename.find(name); if (found != N.cuda2hipRename.end()) { updateCounters(found->second, name.str()); From d439bbd746f18203a4ad4d6e8b37f671a51fe3f8 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Fri, 28 Apr 2017 21:59:33 +0300 Subject: [PATCH 06/77] [HIPIFY] Rename enumConstantDecl -> enumDecl Reason: not to mix up with clang's enumConstantDecl, used for enum DeclRefExpr (enum constant). [ROCm/hip commit: c7958cbb8b3c3a59921ec5ac5a669bc4430b2ee5] --- projects/hip/hipify-clang/src/Cuda2Hip.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/projects/hip/hipify-clang/src/Cuda2Hip.cpp b/projects/hip/hipify-clang/src/Cuda2Hip.cpp index 35c930f3af..390b4ee88c 100644 --- a/projects/hip/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/hip/hipify-clang/src/Cuda2Hip.cpp @@ -2804,10 +2804,10 @@ private: return false; } - bool cudaEnumConstantDecl(const MatchFinder::MatchResult &Result) { - if (const VarDecl *enumConstantDecl = Result.Nodes.getNodeAs("cudaEnumConstantDecl")) { - StringRef name = enumConstantDecl->getType()->getAsTagDecl()->getNameAsString(); - QualType QT = enumConstantDecl->getType().getUnqualifiedType(); + bool cudaEnumDecl(const MatchFinder::MatchResult &Result) { + if (const VarDecl *enumDecl = Result.Nodes.getNodeAs("cudaEnumDecl")) { + StringRef name = enumDecl->getType()->getAsTagDecl()->getNameAsString(); + QualType QT = enumDecl->getType().getUnqualifiedType(); StringRef name_unqualified = QT.getAsString(); if ((name_unqualified.find(' ') == StringRef::npos && name.find(' ') == StringRef::npos) || name.empty()) { name = name_unqualified; @@ -2816,8 +2816,8 @@ private: // Example: void func(enum cudaMemcpyKind kind); //------------------------------------------------- SourceManager *SM = Result.SourceManager; - SourceLocation sl(enumConstantDecl->getLocStart()); - SourceLocation end(enumConstantDecl->getLocEnd()); + SourceLocation sl(enumDecl->getLocStart()); + SourceLocation end(enumDecl->getLocEnd()); size_t repLength = SM->getCharacterData(end) - SM->getCharacterData(sl); StringRef sfull = StringRef(SM->getCharacterData(sl), repLength); size_t offset = sfull.find(name); @@ -3123,7 +3123,7 @@ public: if (cudaCall(Result)) break; if (cudaBuiltin(Result)) break; if (cudaEnumConstantRef(Result)) break; - if (cudaEnumConstantDecl(Result)) break; + if (cudaEnumDecl(Result)) break; if (cudaTypedefVar(Result)) break; if (cudaTypedefVarPtr(Result)) break; if (cudaStructVar(Result)) break; @@ -3169,7 +3169,7 @@ void addAllMatchers(ast_matchers::MatchFinder &Finder, Cuda2HipCallback *Callbac Callback); Finder.addMatcher(varDecl(isExpansionInMainFile(), hasType(enumDecl())) - .bind("cudaEnumConstantDecl"), + .bind("cudaEnumDecl"), Callback); Finder.addMatcher(varDecl(isExpansionInMainFile(), hasType(typedefDecl(matchesName("cu.*|CU.*")))) From 340cd36e8d35ef86bcc62c000915e01ef591121b Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Wed, 3 May 2017 22:29:12 +0530 Subject: [PATCH 07/77] Added support for hipMemcpy2DAsync in HIP/HCC Change-Id: Ia4a8306f2dc1e33a81a7195ec29aef652fcccc4b [ROCm/hip commit: ec27c695c4ea83804d679dc7d07f947a07ece3c8] --- .../include/hip/hcc_detail/hip_runtime_api.h | 21 +++++++++++++++++++ projects/hip/src/hip_memory.cpp | 18 ++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/projects/hip/include/hip/hcc_detail/hip_runtime_api.h b/projects/hip/include/hip/hcc_detail/hip_runtime_api.h index 6917f04f96..9cfd21c1d2 100644 --- a/projects/hip/include/hip/hcc_detail/hip_runtime_api.h +++ b/projects/hip/include/hip/hcc_detail/hip_runtime_api.h @@ -1308,6 +1308,27 @@ hipError_t hipFreeArray(hipArray* array); */ hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind); +/** + * @brief Copies data between host and device. + * + * @param[in] dst Destination memory address + * @param[in] dpitch Pitch of destination memory + * @param[in] src Source memory address + * @param[in] spitch Pitch of source memory + * @param[in] width Width of matrix transfer (columns in bytes) + * @param[in] height Height of matrix transfer (rows) + * @param[in] kind Type of transfer + * @param[in] stream Stream to use + * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection + * + * @see hipMemcpy, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray, hipMemcpyToSymbol, hipMemcpyAsync + */ +#if __cplusplus +hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream = 0); +#else +hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream); +#endif + /** * @brief Copies data between host and device. * diff --git a/projects/hip/src/hip_memory.cpp b/projects/hip/src/hip_memory.cpp index 821f64bc76..c4bc7db096 100644 --- a/projects/hip/src/hip_memory.cpp +++ b/projects/hip/src/hip_memory.cpp @@ -793,6 +793,24 @@ hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, return ihipLogStatus(e); } +hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, + size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream) { + HIP_INIT_CMD_API(dst, dpitch, src, spitch, width, height, kind, stream); + if(width > dpitch || width > spitch) + return ihipLogStatus(hipErrorUnknown); + hipError_t e = hipSuccess; + try { + for(int i = 0; i < height; ++i) { + e = hip_internal::memcpyAsync((unsigned char*)dst + i*dpitch, (unsigned char*)src + i*spitch, width, kind,stream); + } + } + catch (ihipException ex) { + e = ex._code; + } + + return ihipLogStatus(e); +} + hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind) { From 115709ba28450a1427f9cc4fb24e48d3fef9f4a8 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Wed, 3 May 2017 21:59:48 +0300 Subject: [PATCH 08/77] [HIPIFY] HIPIFY and HIP sync with CUDA Driver API data types. + Update CUDA_Driver_API_functions_supported_by_HIP.md. + Final update of HIPIFY with CUDA driver data types. [TODO] Syncing HIPIFY and HIP by CUDA Driver API functions. [ROCm/hip commit: 70c94d7b835dcff1a7d43a065ccea1218bbc48f5] --- ...A_Driver_API_functions_supported_by_HIP.md | 62 ++++ projects/hip/hipify-clang/src/Cuda2Hip.cpp | 324 +++++++++++------- 2 files changed, 253 insertions(+), 133 deletions(-) diff --git a/projects/hip/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md b/projects/hip/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md index 3434d29a70..ad9d791a6d 100644 --- a/projects/hip/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md +++ b/projects/hip/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md @@ -323,6 +323,68 @@ | 500 |*`CUDA_ERROR_NOT_FOUND`* |*`hipErrorNotFound`* | This indicates that a named symbol was not found. Examples of symbols are global/constant variable names, texture names, and surface names. | | 600 |*`CUDA_ERROR_NOT_READY`* |*`hipErrorNotReady`* | This indicates that asynchronous operations issued previously have not completed yet. This result is not actually an error, but must be indicated differently than CUDA_SUCCESS (which indicates completion). Calls that may return this value include cuEventQuery() and cuStreamQuery(). | | 700 |*`CUDA_ERROR_ILLEGAL_ADDRESS`* |*`hipErrorIllegalAddress`* | While executing a kernel, the device encountered a load or store instruction on an invalid memory address. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. | +| 701 |*`CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES`* |*`hipErrorLaunchOutOfResources`* | This indicates that a launch did not occur because it did not have appropriate resources. This error usually indicates that the user has attempted to pass too many arguments to the device kernel, or the kernel launch specifies too many threads for the kernel's register count. Passing arguments of the wrong size (i.e. a 64-bit pointer when a 32-bit int is expected) is equivalent to passing too many arguments and can also result in this error. | +| 702 |*`CUDA_ERROR_LAUNCH_TIMEOUT`* |*`hipErrorLaunchTimeOut`* | This indicates that the device kernel took too long to execute. This can only occur if timeouts are enabled - see the device attribute CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. The context cannot be used (and must be destroyed similar to CUDA_ERROR_LAUNCH_FAILED). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. | +| 703 |*`CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING`* | | This error indicates a kernel launch that uses an incompatible texturing mode. | +| 704 |*`CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED`* |*`hipErrorPeerAccessAlreadyEnabled`* | This error indicates that a call to cuCtxEnablePeerAccess() is trying to re-enable peer access to a context which has already had peer access to it enabled. | +| 705 |*`CUDA_ERROR_PEER_ACCESS_NOT_ENABLED`* |*`hipErrorPeerAccessNotEnabled`* | This error indicates that cuCtxDisablePeerAccess() is trying to disable peer access which has not been enabled yet via cuCtxEnablePeerAccess(). | +| 708 |*`CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE`* | | This error indicates that cuCtxDisablePeerAccess() is trying to disable peer access which has not been enabled yet via cuCtxEnablePeerAccess(). | +| 709 |*`CUDA_ERROR_CONTEXT_IS_DESTROYED`* | | This error indicates that the context current to the calling thread has been destroyed using cuCtxDestroy, or is a primary context which has not yet been initialized. | +| 710 |*`CUDA_ERROR_ASSERT`* | | A device-side assert triggered during kernel execution. The context cannot be used anymore, and must be destroyed. All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. | +| 711 |*`CUDA_ERROR_TOO_MANY_PEERS`* | | This error indicates that the hardware resources required to enable peer access have been exhausted for one or more of the devices passed to cuCtxEnablePeerAccess(). | +| 712 |*`CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED`* |*`hipErrorHostMemoryAlreadyRegistered`* | This error indicates that the memory range passed to cuMemHostRegister() has already been registered. | +| 713 |*`CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED`* |*`hipErrorHostMemoryNotRegistered`* | This error indicates that the pointer passed to cuMemHostUnregister() does not correspond to any currently registered memory region. | +| 714 |*`CUDA_ERROR_HARDWARE_STACK_ERROR`* | | While executing a kernel, the device encountered a stack error. This can be due to stack corruption or exceeding the stack size limit. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. | +| 715 |*`CUDA_ERROR_ILLEGAL_INSTRUCTION`* | | While executing a kernel, the device encountered an illegal instruction. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. | +| 716 |*`CUDA_ERROR_MISALIGNED_ADDRESS`* | | While executing a kernel, the device encountered a load or store instruction on a memory address which is not aligned. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. | +| 717 |*`CUDA_ERROR_INVALID_ADDRESS_SPACE`* | | While executing a kernel, the device encountered an instruction which can only operate on memory locations in certain address spaces (global, shared, or local), but was supplied a memory address not belonging to an allowed address space. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. | +| 718 |*`CUDA_ERROR_INVALID_PC`* | | While executing a kernel, the device program counter wrapped its address space. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. | +| 719 |*`CUDA_ERROR_LAUNCH_FAILED`* | | An exception occurred on the device while executing a kernel. Common causes include dereferencing an invalid device pointer and accessing out of bounds shared memory. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. | +| 800 |*`CUDA_ERROR_NOT_PERMITTED`* | | This error indicates that the attempted operation is not permitted. | +| 801 |*`CUDA_ERROR_NOT_SUPPORTED`* | | This error indicates that the attempted operation is not supported on the current system or device. | +| 999 |*`CUDA_ERROR_UNKNOWN`* | | This indicates that an unknown internal error has occurred. | +| enum |***`CUstream_flags`*** |***`hipStreamFlags`*** | Stream creation flags | +| 0x0 |*`CU_STREAM_DEFAULT`* |*`hipStreamDefault`* | Default stream flag | +| 0x1 |*`CU_STREAM_NON_BLOCKING`* |*`hipStreamNonBlocking`* | Stream does not synchronize with stream 0 (the NULL stream) | +| typedef | `CUarray` | `hipArray *` | CUDA array | +| struct | `CUarray_st` | `hipArray` | CUDA array | +| typedef | `CUcontext` | `hipCtx_t` | CUDA context | +| typedef | `CUdevice` | `hipDevice_t` | CUDA device | +| typedef | `CUdeviceptr` | `hipDeviceptr_t` | CUDA device pointer CUdeviceptr is defined as an unsigned integer type whose size matches the size of a pointer on the target platform. | +| typedef | `CUevent` | `hipEvent_t` | CUDA event | +| typedef | `CUfunction` | `hipFunction_t` | CUDA function | +| typedef | `CUgraphicsResource` | | CUDA graphics interop resource | +| typedef | `CUmipmappedArray` | | CUDA mipmapped array | +| typedef | `CUmodule` | `hipModule_t` | CUDA module | +| typedef | `CUstream` | `hipStream_t` | CUDA module | +| typedef | `CUstreamCallback` | `hipStreamCallback_t` | CUDA stream callback | +| typedef | `CUsurfObject` | | An opaque value that represents a CUDA surface object | +| typedef | `CUsurfref` | | CUDA surface reference | +| typedef | `CUtexObject` | | An opaque value that represents a CUDA texture object | +| typedef | `CUtexref` | | CUDA texture reference | +| define |`CU_IPC_HANDLE_SIZE` | | CUDA IPC handle size. | +| define |`CU_LAUNCH_PARAM_BUFFER_POINTER` | `HIP_LAUNCH_PARAM_BUFFER_POINTER` | Indicator that the next value in the extra parameter to cuLaunchKernel will be a pointer to a buffer containing all kernel parameters used for launching kernel f. This buffer needs to honor all alignment/padding requirements of the individual parameters. If CU_LAUNCH_PARAM_BUFFER_SIZE is not also specified in the extra array, then CU_LAUNCH_PARAM_BUFFER_POINTER will have no effect. | +| define |`CU_LAUNCH_PARAM_BUFFER_SIZE` | `HIP_LAUNCH_PARAM_BUFFER_SIZE` | Indicator that the next value in the extra parameter to cuLaunchKernel will be a pointer to a size_t which contains the size of the buffer specified with CU_LAUNCH_PARAM_BUFFER_POINTER. It is required that CU_LAUNCH_PARAM_BUFFER_POINTER also be specified in the extra array if the value associated with CU_LAUNCH_PARAM_BUFFER_SIZE is not zero. | +| define |`CU_LAUNCH_PARAM_END` | `HIP_LAUNCH_PARAM_END` | End of array terminator for the extra parameter to cuLaunchKernel. | +| define |`CU_MEMHOSTALLOC_DEVICEMAP` | | If set, host memory is mapped into CUDA address space and cuMemHostGetDevicePointer() may be called on the host pointer. Flag for cuMemHostAlloc(). | +| define |`CU_MEMHOSTALLOC_PORTABLE` | | If set, host memory is portable between CUDA contexts. Flag for cuMemHostAlloc(). | +| define |`CU_MEMHOSTALLOC_WRITECOMBINED` | | If set, host memory is allocated as write-combined - fast to write, faster to DMA, slow to read except via SSE4 streaming load instruction (MOVNTDQA). Flag for cuMemHostAlloc(). | +| define |`CU_MEMHOSTREGISTER_DEVICEMAP` | | If set, host memory is mapped into CUDA address space and cuMemHostGetDevicePointer() may be called on the host pointer. Flag for cuMemHostRegister(). | +| define |`CU_MEMHOSTREGISTER_IOMEMORY` | | If set, the passed memory pointer is treated as pointing to some memory-mapped I/O space, e.g. belonging to a third-party PCIe device. On Windows the flag is a no-op. On Linux that memory is marked as non cache-coherent for the GPU and is expected to be physically contiguous. It may return CUDA_ERROR_NOT_PERMITTED if run as an unprivileged user, CUDA_ERROR_NOT_SUPPORTED on older Linux kernel versions. On all other platforms, it is not supported and CUDA_ERROR_NOT_SUPPORTED is returned. Flag for cuMemHostRegister(). | +| define |`CU_MEMHOSTREGISTER_PORTABLE` | | If set, host memory is portable between CUDA contexts. Flag for cuMemHostRegister(). | +| define |`CU_PARAM_TR_DEFAULT` | | For texture references loaded into the module, use default texunit from texture reference. | +| define |`CU_STREAM_LEGACY` | | Legacy stream handle. Stream handle that can be passed as a CUstream to use an implicit stream with legacy synchronization behavior. See details of the synchronization behavior. | +| define |`CU_STREAM_PER_THREAD` | | Per-thread stream handle. Stream handle that can be passed as a CUstream to use an implicit stream with perthread synchronization behavior. See details of the synchronization behavior. | +| define |`CU_TRSA_OVERRIDE_FORMAT` | | Override the texref format with a format inferred from the array. Flag for cuTexRefSetArray(). | +| define |`CU_TRSF_NORMALIZED_COORDINATES` | | Use normalized texture coordinates in the range [0,1) instead of [0,dim). Flag for cuTexRefSetFlags(). | +| define |`CU_TRSF_SRGB` | | Perform sRGB->linear conversion during texture read. Flag for cuTexRefSetFlags(). | +| define |`CUDA_ARRAY3D_2DARRAY` | | Deprecated, use CUDA_ARRAY3D_LAYERED. | +| define |`CUDA_ARRAY3D_CUBEMAP` | | If set, the CUDA array is a collection of six 2D arrays, representing faces of a cube. The width of such a CUDA array must be equal to its height, and Depth must be six. If CUDA_ARRAY3D_LAYERED flag is also set, then the CUDA array is a collection of cubemaps and Depth must be a multiple of six. | +| define |`CUDA_ARRAY3D_DEPTH_TEXTURE` | | This flag if set indicates that the CUDA array is a DEPTH_TEXTURE. | +| define |`CUDA_ARRAY3D_LAYERED` | | If set, the CUDA array is a collection of layers, where each layer is either a 1D or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number of layers, not the depth of a 3D array. | +| define |`CUDA_ARRAY3D_SURFACE_LDST` | | This flag must be set in order to bind a surface reference to the CUDA array. | +| define |`CUDA_ARRAY3D_TEXTURE_GATHER` | | This flag must be set in order to perform texture gather operations on a CUDA array. | +| define |`CUDA_VERSION` | | CUDA API version number. | ## **2. Error Handling** diff --git a/projects/hip/hipify-clang/src/Cuda2Hip.cpp b/projects/hip/hipify-clang/src/Cuda2Hip.cpp index 390b4ee88c..b3b2b993e3 100644 --- a/projects/hip/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/hip/hipify-clang/src/Cuda2Hip.cpp @@ -208,59 +208,55 @@ struct cuda2hipMap { cuda2hipRename["CUDA_ERROR_INVALID_SOURCE"] = {"hipErrorInvalidSource", CONV_ERR, API_DRIVER}; // 300 cuda2hipRename["CUDA_ERROR_FILE_NOT_FOUND"] = {"hipErrorFileNotFound", CONV_ERR, API_DRIVER}; // 301 cuda2hipRename["CUDA_ERROR_NOT_FOUND"] = {"hipErrorNotFound", CONV_ERR, API_DRIVER}; // 500 + cuda2hipRename["CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING"] = {"hipErrorLaunchIncompatibleTexturing", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 703 + cuda2hipRename["CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE"] = {"hipErrorPrimaryContextActive", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 708 + cuda2hipRename["CUDA_ERROR_CONTEXT_IS_DESTROYED"] = {"hipErrorContextIsDestroyed", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 709 + cuda2hipRename["CUDA_ERROR_NOT_PERMITTED"] = {"hipErrorNotPermitted", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 800 + cuda2hipRename["CUDA_ERROR_NOT_SUPPORTED"] = {"hipErrorNotSupported", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 801 // CUDA RT API error code only - cuda2hipRename["cudaErrorMissingConfiguration"] = {"hipErrorMissingConfiguration", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 1 - cuda2hipRename["cudaErrorPriorLaunchFailure"] = {"hipErrorPriorLaunchFailure", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 5 - cuda2hipRename["cudaErrorInvalidDeviceFunction"] = {"hipErrorInvalidDeviceFunction", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 8 - cuda2hipRename["cudaErrorInvalidConfiguration"] = {"hipErrorInvalidConfiguration", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 9 - cuda2hipRename["cudaErrorInvalidPitchValue"] = {"hipErrorInvalidPitchValue", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 12 - cuda2hipRename["cudaErrorInvalidSymbol"] = {"hipErrorInvalidSymbol", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 13 - cuda2hipRename["cudaErrorInvalidHostPointer"] = {"hipErrorInvalidHostPointer", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 16 - cuda2hipRename["cudaErrorInvalidDevicePointer"] = {"hipErrorInvalidDevicePointer", CONV_ERR, API_RUNTIME}; // 17 - cuda2hipRename["cudaErrorInvalidTexture"] = {"hipErrorInvalidTexture", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 18 - cuda2hipRename["cudaErrorInvalidTextureBinding"] = {"hipErrorInvalidTextureBinding", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 19 - cuda2hipRename["cudaErrorInvalidChannelDescriptor"] = {"hipErrorInvalidChannelDescriptor", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 20 - cuda2hipRename["cudaErrorInvalidMemcpyDirection"] = {"hipErrorInvalidMemcpyDirection", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 21 - cuda2hipRename["cudaErrorAddressOfConstant"] = {"hipErrorAddressOfConstant", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 22 - cuda2hipRename["cudaErrorTextureFetchFailed"] = {"hipErrorTextureFetchFailed", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 23 - cuda2hipRename["cudaErrorTextureNotBound"] = {"hipErrorTextureNotBound", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 24 - cuda2hipRename["cudaErrorSynchronizationError"] = {"hipErrorSynchronizationError", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 25 - cuda2hipRename["cudaErrorInvalidFilterSetting"] = {"hipErrorInvalidFilterSetting", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 26 - cuda2hipRename["cudaErrorInvalidNormSetting"] = {"hipErrorInvalidNormSetting", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 27 - cuda2hipRename["cudaErrorMixedDeviceExecution"] = {"hipErrorMixedDeviceExecution", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 28 + cuda2hipRename["cudaErrorMissingConfiguration"] = {"hipErrorMissingConfiguration", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 1 + cuda2hipRename["cudaErrorPriorLaunchFailure"] = {"hipErrorPriorLaunchFailure", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 5 + cuda2hipRename["cudaErrorInvalidDeviceFunction"] = {"hipErrorInvalidDeviceFunction", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 8 + cuda2hipRename["cudaErrorInvalidConfiguration"] = {"hipErrorInvalidConfiguration", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 9 + cuda2hipRename["cudaErrorInvalidPitchValue"] = {"hipErrorInvalidPitchValue", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 12 + cuda2hipRename["cudaErrorInvalidSymbol"] = {"hipErrorInvalidSymbol", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 13 + cuda2hipRename["cudaErrorInvalidHostPointer"] = {"hipErrorInvalidHostPointer", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 16 + cuda2hipRename["cudaErrorInvalidDevicePointer"] = {"hipErrorInvalidDevicePointer", CONV_ERR, API_RUNTIME}; // 17 + cuda2hipRename["cudaErrorInvalidTexture"] = {"hipErrorInvalidTexture", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 18 + cuda2hipRename["cudaErrorInvalidTextureBinding"] = {"hipErrorInvalidTextureBinding", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 19 + cuda2hipRename["cudaErrorInvalidChannelDescriptor"] = {"hipErrorInvalidChannelDescriptor", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 20 + cuda2hipRename["cudaErrorInvalidMemcpyDirection"] = {"hipErrorInvalidMemcpyDirection", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 21 + cuda2hipRename["cudaErrorAddressOfConstant"] = {"hipErrorAddressOfConstant", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 22 + cuda2hipRename["cudaErrorTextureFetchFailed"] = {"hipErrorTextureFetchFailed", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 23 + cuda2hipRename["cudaErrorTextureNotBound"] = {"hipErrorTextureNotBound", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 24 + cuda2hipRename["cudaErrorSynchronizationError"] = {"hipErrorSynchronizationError", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 25 + cuda2hipRename["cudaErrorInvalidFilterSetting"] = {"hipErrorInvalidFilterSetting", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 26 + cuda2hipRename["cudaErrorInvalidNormSetting"] = {"hipErrorInvalidNormSetting", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 27 + cuda2hipRename["cudaErrorMixedDeviceExecution"] = {"hipErrorMixedDeviceExecution", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 28 // Deprecated as of CUDA 4.1 - cuda2hipRename["cudaErrorNotYetImplemented"] = {"hipErrorNotYetImplemented", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 31 + cuda2hipRename["cudaErrorNotYetImplemented"] = {"hipErrorNotYetImplemented", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 31 // Deprecated as of CUDA 3.1 - cuda2hipRename["cudaErrorMemoryValueTooLarge"] = {"hipErrorMemoryValueTooLarge", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 32 - cuda2hipRename["cudaErrorInsufficientDriver"] = {"hipErrorInsufficientDriver", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 35 - cuda2hipRename["cudaErrorSetOnActiveProcess"] = {"hipErrorSetOnActiveProcess", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 36 - cuda2hipRename["cudaErrorInvalidSurface"] = {"hipErrorInvalidSurface", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 37 - cuda2hipRename["cudaErrorDuplicateVariableName"] = {"hipErrorDuplicateVariableName", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 43 - cuda2hipRename["cudaErrorDuplicateTextureName"] = {"hipErrorDuplicateTextureName", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 44 - cuda2hipRename["cudaErrorDuplicateSurfaceName"] = {"hipErrorDuplicateSurfaceName", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 45 - cuda2hipRename["cudaErrorDevicesUnavailable"] = {"hipErrorDevicesUnavailable", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 46 - cuda2hipRename["cudaErrorIncompatibleDriverContext"] = {"hipErrorIncompatibleDriverContext", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 49 - cuda2hipRename["cudaErrorDeviceAlreadyInUse"] = {"hipErrorDeviceAlreadyInUse", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 54 - cuda2hipRename["cudaErrorAssert"] = {"hipErrorAssert", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 59 - cuda2hipRename["cudaErrorTooManyPeers"] = {"hipErrorTooManyPeers", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 60 - cuda2hipRename["cudaErrorLaunchMaxDepthExceeded"] = {"hipErrorLaunchMaxDepthExceeded", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 65 - cuda2hipRename["cudaErrorLaunchFileScopedTex"] = {"hipErrorLaunchFileScopedTex", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 66 - cuda2hipRename["cudaErrorLaunchFileScopedSurf"] = {"hipErrorLaunchFileScopedSurf", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 67 - cuda2hipRename["cudaErrorSyncDepthExceeded"] = {"hipErrorSyncDepthExceeded", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 68 - cuda2hipRename["cudaErrorLaunchPendingCountExceeded"] = {"hipErrorLaunchPendingCountExceeded", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 69 - cuda2hipRename["cudaErrorNotPermitted"] = {"hipErrorNotPermitted", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 70 - cuda2hipRename["cudaErrorNotSupported"] = {"hipErrorNotSupported", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 71 - cuda2hipRename["cudaErrorHardwareStackError"] = {"hipErrorHardwareStackError", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 72 - cuda2hipRename["cudaErrorIllegalInstruction"] = {"hipErrorIllegalInstruction", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 73 - cuda2hipRename["cudaErrorMisalignedAddress"] = {"hipErrorMisalignedAddress", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 74 - cuda2hipRename["cudaErrorInvalidAddressSpace"] = {"hipErrorInvalidAddressSpace", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 75 - cuda2hipRename["cudaErrorInvalidPc"] = {"hipErrorInvalidPc", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 76 - cuda2hipRename["cudaErrorStartupFailure"] = {"hipErrorStartupFailure", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 0x7f + cuda2hipRename["cudaErrorMemoryValueTooLarge"] = {"hipErrorMemoryValueTooLarge", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 32 + cuda2hipRename["cudaErrorInsufficientDriver"] = {"hipErrorInsufficientDriver", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 35 + cuda2hipRename["cudaErrorSetOnActiveProcess"] = {"hipErrorSetOnActiveProcess", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 36 + cuda2hipRename["cudaErrorInvalidSurface"] = {"hipErrorInvalidSurface", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 37 + cuda2hipRename["cudaErrorDuplicateVariableName"] = {"hipErrorDuplicateVariableName", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 43 + cuda2hipRename["cudaErrorDuplicateTextureName"] = {"hipErrorDuplicateTextureName", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 44 + cuda2hipRename["cudaErrorDuplicateSurfaceName"] = {"hipErrorDuplicateSurfaceName", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 45 + cuda2hipRename["cudaErrorDevicesUnavailable"] = {"hipErrorDevicesUnavailable", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 46 + cuda2hipRename["cudaErrorIncompatibleDriverContext"] = {"hipErrorIncompatibleDriverContext", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 49 + cuda2hipRename["cudaErrorDeviceAlreadyInUse"] = {"hipErrorDeviceAlreadyInUse", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 54 + cuda2hipRename["cudaErrorLaunchMaxDepthExceeded"] = {"hipErrorLaunchMaxDepthExceeded", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 65 + cuda2hipRename["cudaErrorLaunchFileScopedTex"] = {"hipErrorLaunchFileScopedTex", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 66 + cuda2hipRename["cudaErrorLaunchFileScopedSurf"] = {"hipErrorLaunchFileScopedSurf", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 67 + cuda2hipRename["cudaErrorSyncDepthExceeded"] = {"hipErrorSyncDepthExceeded", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 68 + cuda2hipRename["cudaErrorLaunchPendingCountExceeded"] = {"hipErrorLaunchPendingCountExceeded", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 69 + cuda2hipRename["cudaErrorNotPermitted"] = {"hipErrorNotPermitted", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 70 + cuda2hipRename["cudaErrorNotSupported"] = {"hipErrorNotSupported", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 71 + cuda2hipRename["cudaErrorStartupFailure"] = {"hipErrorStartupFailure", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 0x7f // Deprecated as of CUDA 4.1 - cuda2hipRename["cudaErrorApiFailureBase"] = {"hipErrorApiFailureBase", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 10000 - - + cuda2hipRename["cudaErrorApiFailureBase"] = {"hipErrorApiFailureBase", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 10000 cuda2hipRename["CUDA_SUCCESS"] = {"hipSuccess", CONV_ERR, API_DRIVER}; // 0 cuda2hipRename["cudaSuccess"] = {"hipSuccess", CONV_ERR, API_RUNTIME}; // 0 @@ -346,34 +342,50 @@ struct cuda2hipMap { cuda2hipRename["CUDA_ERROR_ILLEGAL_ADDRESS"] = {"hipErrorIllegalAddress", CONV_ERR, API_DRIVER}; // 700 cuda2hipRename["cudaErrorIllegalAddress"] = {"hipErrorIllegalAddress", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 77 - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + cuda2hipRename["CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES"] = {"hipErrorLaunchOutOfResources", CONV_ERR, API_DRIVER}; // 701 + cuda2hipRename["cudaErrorLaunchOutOfResources"] = {"hipErrorLaunchOutOfResources", CONV_ERR, API_RUNTIME}; // 7 - cuda2hipRename["CUDA_ERROR_LAUNCH_FAILED"] = {"hipErrorLaunchFailure", CONV_ERR, API_DRIVER}; // 719 + cuda2hipRename["CUDA_ERROR_LAUNCH_TIMEOUT"] = {"hipErrorLaunchTimeOut", CONV_ERR, API_DRIVER}; // 702 + cuda2hipRename["cudaErrorLaunchTimeout"] = {"hipErrorLaunchTimeOut", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 6 + + cuda2hipRename["CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED"] = {"hipErrorPeerAccessAlreadyEnabled", CONV_ERR, API_DRIVER}; // 704 + cuda2hipRename["cudaErrorPeerAccessAlreadyEnabled"] = {"hipErrorPeerAccessAlreadyEnabled", CONV_ERR, API_RUNTIME}; // 50 + + cuda2hipRename["CUDA_ERROR_PEER_ACCESS_NOT_ENABLED"] = {"hipErrorPeerAccessNotEnabled", CONV_ERR, API_DRIVER}; // 705 + cuda2hipRename["cudaErrorPeerAccessNotEnabled"] = {"hipErrorPeerAccessNotEnabled", CONV_ERR, API_RUNTIME}; // 51 + + cuda2hipRename["CUDA_ERROR_ASSERT"] = {"hipErrorAssert", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 710 + cuda2hipRename["cudaErrorAssert"] = {"hipErrorAssert", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 59 + + cuda2hipRename["CUDA_ERROR_TOO_MANY_PEERS"] = {"hipErrorTooManyPeers", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 711 + cuda2hipRename["cudaErrorTooManyPeers"] = {"hipErrorTooManyPeers", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 60 + + cuda2hipRename["CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED"] = {"hipErrorHostMemoryAlreadyRegistered", CONV_ERR, API_DRIVER}; // 712 + cuda2hipRename["cudaErrorHostMemoryAlreadyRegistered"] = {"hipErrorHostMemoryAlreadyRegistered", CONV_ERR, API_RUNTIME}; // 61 + + cuda2hipRename["CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED"] = {"hipErrorHostMemoryNotRegistered", CONV_ERR, API_DRIVER}; // 713 + cuda2hipRename["cudaErrorHostMemoryNotRegistered"] = {"hipErrorHostMemoryNotRegistered", CONV_ERR, API_RUNTIME}; // 62 + + cuda2hipRename["CUDA_ERROR_HARDWARE_STACK_ERROR"] = {"hipErrorHardwareStackError", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 714 + cuda2hipRename["cudaErrorHardwareStackError"] = {"hipErrorHardwareStackError", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 72 + + cuda2hipRename["CUDA_ERROR_ILLEGAL_INSTRUCTION"] = {"hipErrorIllegalInstruction", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 715 + cuda2hipRename["cudaErrorIllegalInstruction"] = {"hipErrorIllegalInstruction", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 73 + + cuda2hipRename["CUDA_ERROR_MISALIGNED_ADDRESS"] = {"hipErrorMisalignedAddress", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 716 + cuda2hipRename["cudaErrorMisalignedAddress"] = {"hipErrorMisalignedAddress", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 74 + + cuda2hipRename["CUDA_ERROR_INVALID_ADDRESS_SPACE"] = {"hipErrorInvalidAddressSpace", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 717 + cuda2hipRename["cudaErrorInvalidAddressSpace"] = {"hipErrorInvalidAddressSpace", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 75 + + cuda2hipRename["CUDA_ERROR_INVALID_PC"] = {"hipErrorInvalidPc", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 718 + cuda2hipRename["cudaErrorInvalidPc"] = {"hipErrorInvalidPc", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 76 + + cuda2hipRename["CUDA_ERROR_LAUNCH_FAILED"] = {"hipErrorLaunchFailure", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 719 cuda2hipRename["cudaErrorLaunchFailure"] = {"hipErrorLaunchFailure", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 4 - cuda2hipRename["CUDA_ERROR_LAUNCH_TIMEOUT"] = {"hipErrorLaunchTimeOut", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorLaunchTimeout"] = {"hipErrorLaunchTimeOut", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 6 - - cuda2hipRename["CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES"] = {"hipErrorLaunchOutOfResources", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorLaunchOutOfResources"] = {"hipErrorLaunchOutOfResources", CONV_ERR, API_RUNTIME}; // 7 - - cuda2hipRename["CUDA_ERROR_UNKNOWN"] = {"hipErrorUnknown", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorUnknown"] = {"hipErrorUnknown", CONV_ERR, API_RUNTIME}; // 30 - -// cuda2hipRename["CUDA_ERROR_NOT_INITIALIZED"] = {"hipErrorInitializationError", CONV_ERR, API_DRIVER}; -// cuda2hipRename["cudaErrorInitializationError"] = {"hipErrorInitializationError", CONV_ERR, API_RUNTIME}; - - cuda2hipRename["CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED"] = {"hipErrorPeerAccessAlreadyEnabled", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorPeerAccessAlreadyEnabled"] = {"hipErrorPeerAccessAlreadyEnabled", CONV_ERR, API_RUNTIME}; // 50 - - cuda2hipRename["CUDA_ERROR_PEER_ACCESS_NOT_ENABLED"] = {"hipErrorPeerAccessNotEnabled", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorPeerAccessNotEnabled"] = {"hipErrorPeerAccessNotEnabled", CONV_ERR, API_RUNTIME}; // 51 - - cuda2hipRename["CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED"] = {"hipErrorHostMemoryAlreadyRegistered", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorHostMemoryAlreadyRegistered"] = {"hipErrorHostMemoryAlreadyRegistered", CONV_ERR, API_RUNTIME}; // 61 - - cuda2hipRename["CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED"] = {"hipErrorHostMemoryNotRegistered", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorHostMemoryNotRegistered"] = {"hipErrorHostMemoryNotRegistered", CONV_ERR, API_RUNTIME}; // 62 + cuda2hipRename["CUDA_ERROR_UNKNOWN"] = {"hipErrorUnknown", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 999 + cuda2hipRename["cudaErrorUnknown"] = {"hipErrorUnknown", CONV_ERR, API_RUNTIME}; // 30 ///////////////////////////// CUDA DRIVER API ///////////////////////////// // enums @@ -389,59 +401,86 @@ struct cuda2hipMap { cuda2hipRename["CUipcEventHandle"] = {"hipIpcEventHandle", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["CUipcMemHandle"] = {"hipIpcMemHandle", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; - - cuda2hipRename["CUaddress_mode"] = {"hipAddress_mode", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_TR_ADDRESS_MODE_WRAP"] = {"HIP_TR_ADDRESS_MODE_WRAP", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0 - cuda2hipRename["CU_TR_ADDRESS_MODE_CLAMP"] = {"HIP_TR_ADDRESS_MODE_CLAMP", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 1 - cuda2hipRename["CU_TR_ADDRESS_MODE_MIRROR"] = {"HIP_TR_ADDRESS_MODE_MIRROR", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 2 - cuda2hipRename["CU_TR_ADDRESS_MODE_BORDER"] = {"HIP_TR_ADDRESS_MODE_BORDER", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 3 + cuda2hipRename["CU_TR_ADDRESS_MODE_WRAP"] = {"HIP_TR_ADDRESS_MODE_WRAP", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0 + cuda2hipRename["CU_TR_ADDRESS_MODE_CLAMP"] = {"HIP_TR_ADDRESS_MODE_CLAMP", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 1 + cuda2hipRename["CU_TR_ADDRESS_MODE_MIRROR"] = {"HIP_TR_ADDRESS_MODE_MIRROR", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 2 + cuda2hipRename["CU_TR_ADDRESS_MODE_BORDER"] = {"HIP_TR_ADDRESS_MODE_BORDER", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 3 cuda2hipRename["CUarray_cubemap_face"] = {"hipArray_cubemap_face", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_CUBEMAP_FACE_POSITIVE_X"] = {"HIP_CUBEMAP_FACE_POSITIVE_X", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x00 - cuda2hipRename["CU_CUBEMAP_FACE_NEGATIVE_X"] = {"HIP_CUBEMAP_FACE_NEGATIVE_X", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 - cuda2hipRename["CU_CUBEMAP_FACE_POSITIVE_Y"] = {"HIP_CUBEMAP_FACE_POSITIVE_Y", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 - cuda2hipRename["CU_CUBEMAP_FACE_NEGATIVE_Y"] = {"HIP_CUBEMAP_FACE_NEGATIVE_Y", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x03 - cuda2hipRename["CU_CUBEMAP_FACE_POSITIVE_Z"] = {"HIP_CUBEMAP_FACE_POSITIVE_Z", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 - cuda2hipRename["CU_CUBEMAP_FACE_NEGATIVE_Z"] = {"HIP_CUBEMAP_FACE_NEGATIVE_Z", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x05 + cuda2hipRename["CU_CUBEMAP_FACE_POSITIVE_X"] = {"HIP_CUBEMAP_FACE_POSITIVE_X", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x00 + cuda2hipRename["CU_CUBEMAP_FACE_NEGATIVE_X"] = {"HIP_CUBEMAP_FACE_NEGATIVE_X", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CU_CUBEMAP_FACE_POSITIVE_Y"] = {"HIP_CUBEMAP_FACE_POSITIVE_Y", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 + cuda2hipRename["CU_CUBEMAP_FACE_NEGATIVE_Y"] = {"HIP_CUBEMAP_FACE_NEGATIVE_Y", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x03 + cuda2hipRename["CU_CUBEMAP_FACE_POSITIVE_Z"] = {"HIP_CUBEMAP_FACE_POSITIVE_Z", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 + cuda2hipRename["CU_CUBEMAP_FACE_NEGATIVE_Z"] = {"HIP_CUBEMAP_FACE_NEGATIVE_Z", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x05 cuda2hipRename["CUarray_format"] = {"hipArray_format", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_AD_FORMAT_UNSIGNED_INT8"] = {"HIP_AD_FORMAT_UNSIGNED_INT8", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 - cuda2hipRename["CU_AD_FORMAT_UNSIGNED_INT16"] = {"HIP_AD_FORMAT_UNSIGNED_INT16", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 - cuda2hipRename["CU_AD_FORMAT_UNSIGNED_INT32"] = {"HIP_AD_FORMAT_UNSIGNED_INT32", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x03 - cuda2hipRename["CU_AD_FORMAT_SIGNED_INT8"] = {"HIP_AD_FORMAT_SIGNED_INT8", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x08 - cuda2hipRename["CU_AD_FORMAT_SIGNED_INT16"] = {"HIP_AD_FORMAT_SIGNED_INT16", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x09 - cuda2hipRename["CU_AD_FORMAT_SIGNED_INT32"] = {"HIP_AD_FORMAT_SIGNED_INT32", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x0a - cuda2hipRename["CU_AD_FORMAT_HALF"] = {"HIP_AD_FORMAT_HALF", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x10 - cuda2hipRename["CU_AD_FORMAT_FLOAT"] = {"HIP_AD_FORMAT_FLOAT", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x20 + cuda2hipRename["CU_AD_FORMAT_UNSIGNED_INT8"] = {"HIP_AD_FORMAT_UNSIGNED_INT8", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CU_AD_FORMAT_UNSIGNED_INT16"] = {"HIP_AD_FORMAT_UNSIGNED_INT16", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 + cuda2hipRename["CU_AD_FORMAT_UNSIGNED_INT32"] = {"HIP_AD_FORMAT_UNSIGNED_INT32", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x03 + cuda2hipRename["CU_AD_FORMAT_SIGNED_INT8"] = {"HIP_AD_FORMAT_SIGNED_INT8", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x08 + cuda2hipRename["CU_AD_FORMAT_SIGNED_INT16"] = {"HIP_AD_FORMAT_SIGNED_INT16", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x09 + cuda2hipRename["CU_AD_FORMAT_SIGNED_INT32"] = {"HIP_AD_FORMAT_SIGNED_INT32", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x0a + cuda2hipRename["CU_AD_FORMAT_HALF"] = {"HIP_AD_FORMAT_HALF", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x10 + cuda2hipRename["CU_AD_FORMAT_FLOAT"] = {"HIP_AD_FORMAT_FLOAT", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x20 // Compute mode - cuda2hipRename["CUcomputemode"] = {"hipComputemode", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_RUNTIME ANALOGUE (cudaComputeMode) - cuda2hipRename["CU_COMPUTEMODE_DEFAULT"] = {"hipComputeModeDefault", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0 - cuda2hipRename["CU_COMPUTEMODE_EXCLUSIVE"] = {"hipComputeModeExclusive", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 1 - cuda2hipRename["CU_COMPUTEMODE_PROHIBITED"] = {"hipComputeModeProhibited", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 2 - cuda2hipRename["CU_COMPUTEMODE_EXCLUSIVE_PROCESS"] = {"hipComputeModeExclusiveProcess", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 3 + cuda2hipRename["CUcomputemode"] = {"hipComputemode", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_RUNTIME ANALOGUE (cudaComputeMode) + cuda2hipRename["CU_COMPUTEMODE_DEFAULT"] = {"hipComputeModeDefault", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0 // API_RUNTIME ANALOGUE (cudaComputeModeDefault = 0) + cuda2hipRename["CU_COMPUTEMODE_EXCLUSIVE"] = {"hipComputeModeExclusive", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 1 // API_RUNTIME ANALOGUE (cudaComputeModeExclusive = 1) + cuda2hipRename["CU_COMPUTEMODE_PROHIBITED"] = {"hipComputeModeProhibited", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 2 // API_RUNTIME ANALOGUE (cudaComputeModeProhibited = 2) + cuda2hipRename["CU_COMPUTEMODE_EXCLUSIVE_PROCESS"] = {"hipComputeModeExclusiveProcess", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 3 // API_RUNTIME ANALOGUE (cudaComputeModeExclusiveProcess = 3) // Context flags cuda2hipRename["CUctx_flags"] = {"hipCctx_flags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_CTX_SCHED_AUTO"] = {"HIP_CTX_SCHED_AUTO", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x00 - cuda2hipRename["CU_CTX_SCHED_SPIN"] = {"HIP_CTX_SCHED_SPIN", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 - cuda2hipRename["CU_CTX_SCHED_YIELD"] = {"HIP_CTX_SCHED_YIELD", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 - cuda2hipRename["CU_CTX_SCHED_BLOCKING_SYNC"] = {"HIP_CTX_SCHED_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 - cuda2hipRename["CU_CTX_BLOCKING_SYNC"] = {"HIP_CTX_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 - cuda2hipRename["CU_CTX_SCHED_MASK"] = {"HIP_CTX_SCHED_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x07 - cuda2hipRename["CU_CTX_MAP_HOST"] = {"HIP_CTX_MAP_HOST", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x08 - cuda2hipRename["CU_CTX_LMEM_RESIZE_TO_MAX"] = {"HIP_CTX_LMEM_RESIZE_TO_MAX", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x10 - cuda2hipRename["CU_CTX_FLAGS_MASK"] = {"HIP_CTX_FLAGS_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x1f + cuda2hipRename["CU_CTX_SCHED_AUTO"] = {"HIP_CTX_SCHED_AUTO", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x00 + cuda2hipRename["CU_CTX_SCHED_SPIN"] = {"HIP_CTX_SCHED_SPIN", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CU_CTX_SCHED_YIELD"] = {"HIP_CTX_SCHED_YIELD", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 + cuda2hipRename["CU_CTX_SCHED_BLOCKING_SYNC"] = {"HIP_CTX_SCHED_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 + cuda2hipRename["CU_CTX_BLOCKING_SYNC"] = {"HIP_CTX_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 + cuda2hipRename["CU_CTX_SCHED_MASK"] = {"HIP_CTX_SCHED_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x07 + cuda2hipRename["CU_CTX_MAP_HOST"] = {"HIP_CTX_MAP_HOST", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x08 + cuda2hipRename["CU_CTX_LMEM_RESIZE_TO_MAX"] = {"HIP_CTX_LMEM_RESIZE_TO_MAX", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x10 + cuda2hipRename["CU_CTX_FLAGS_MASK"] = {"HIP_CTX_FLAGS_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x1f // Defines - cuda2hipRename["CU_LAUNCH_PARAM_BUFFER_POINTER"] = {"HIP_LAUNCH_PARAM_BUFFER_POINTER", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_LAUNCH_PARAM_BUFFER_SIZE"] = {"HIP_LAUNCH_PARAM_BUFFER_SIZE", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_LAUNCH_PARAM_END"] = {"HIP_LAUNCH_PARAM_END", CONV_DEV, API_DRIVER}; + cuda2hipRename["CU_LAUNCH_PARAM_BUFFER_POINTER"] = {"HIP_LAUNCH_PARAM_BUFFER_POINTER", CONV_DEV, API_DRIVER}; // ((void*)0x01) + cuda2hipRename["CU_LAUNCH_PARAM_BUFFER_SIZE"] = {"HIP_LAUNCH_PARAM_BUFFER_SIZE", CONV_DEV, API_DRIVER}; // ((void*)0x02) + cuda2hipRename["CU_LAUNCH_PARAM_END"] = {"HIP_LAUNCH_PARAM_END", CONV_DEV, API_DRIVER}; // ((void*)0x00) + cuda2hipRename["CU_IPC_HANDLE_SIZE"] = {"HIP_LAUNCH_PARAM_END", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 64 + cuda2hipRename["CU_MEMHOSTALLOC_DEVICEMAP"] = {"HIP_MEMHOSTALLOC_DEVICEMAP", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 + cuda2hipRename["CU_MEMHOSTALLOC_PORTABLE"] = {"HIP_MEMHOSTALLOC_PORTABLE", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CU_MEMHOSTALLOC_WRITECOMBINED"] = {"HIP_MEMHOSTALLOC_WRITECOMBINED", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 + cuda2hipRename["CU_MEMHOSTREGISTER_DEVICEMAP"] = {"HIP_MEMHOSTREGISTER_DEVICEMAP", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 + cuda2hipRename["CU_MEMHOSTREGISTER_IOMEMORY"] = {"HIP_MEMHOSTREGISTER_IOMEMORY", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 + cuda2hipRename["CU_MEMHOSTREGISTER_PORTABLE"] = {"HIP_MEMHOSTREGISTER_PORTABLE", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CU_PARAM_TR_DEFAULT"] = {"HIP_PARAM_TR_DEFAULT", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // -1 + cuda2hipRename["CU_STREAM_LEGACY"] = {"HIP_STREAM_LEGACY", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // ((CUstream)0x1) + cuda2hipRename["CU_STREAM_PER_THREAD"] = {"HIP_STREAM_PER_THREAD", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // ((CUstream)0x2) + cuda2hipRename["CU_TRSA_OVERRIDE_FORMAT"] = {"HIP_TRSA_OVERRIDE_FORMAT", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CU_TRSF_NORMALIZED_COORDINATES"] = {"HIP_TRSF_NORMALIZED_COORDINATES", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};// 0x02 + cuda2hipRename["CU_TRSF_READ_AS_INTEGER"] = {"HIP_TRSF_READ_AS_INTEGER", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CU_TRSF_SRGB"] = {"HIP_TRSF_SRGB", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x10 + // Deprecated, use CUDA_ARRAY3D_LAYERED + cuda2hipRename["CUDA_ARRAY3D_2DARRAY"] = {"HIP_ARRAY3D_LAYERED", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CUDA_ARRAY3D_CUBEMAP"] = {"HIP_ARRAY3D_CUBEMAP", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 + cuda2hipRename["CUDA_ARRAY3D_DEPTH_TEXTURE"] = {"HIP_ARRAY3D_DEPTH_TEXTURE", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x10 + cuda2hipRename["CUDA_ARRAY3D_LAYERED"] = {"HIP_ARRAY3D_LAYERED", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CUDA_ARRAY3D_SURFACE_LDST"] = {"HIP_ARRAY3D_SURFACE_LDST", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 + cuda2hipRename["CUDA_ARRAY3D_TEXTURE_GATHER"] = {"HIP_ARRAY3D_TEXTURE_GATHER", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x08 + cuda2hipRename["CUDA_VERSION"] = {"HIP_VERSION", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 7050 // Types // NOTE: CUdevice might be changed to typedef int in the future. cuda2hipRename["CUdevice"] = {"hipDevice_t", CONV_TYPE, API_DRIVER}; - cuda2hipRename["CUdevice_attribute_enum"] = {"hipDeviceAttribute_t", CONV_TYPE, API_DRIVER}; // API_Runtime ANALOGUE (cudaDeviceAttr) - cuda2hipRename["CUdevice_attribute"] = {"hipDeviceAttribute_t", CONV_TYPE, API_DRIVER}; // API_Runtime ANALOGUE (cudaDeviceAttr) + cuda2hipRename["CUdevice_attribute_enum"] = {"hipDeviceAttribute_t", CONV_TYPE, API_DRIVER}; // API_Runtime ANALOGUE (cudaDeviceAttr) + cuda2hipRename["CUdevice_attribute"] = {"hipDeviceAttribute_t", CONV_TYPE, API_DRIVER}; // API_Runtime ANALOGUE (cudaDeviceAttr) + cuda2hipRename["CUdeviceptr"] = {"hipDeviceptr_t", CONV_TYPE, API_DRIVER}; + // CUDA: "The types::CUarray and struct ::cudaArray * represent the same data type and may be used interchangeably by casting the two types between each other." + // typedef struct cudaArray *cudaArray_t; + // typedef struct CUarray_st *CUarray; + cuda2hipRename["CUarray_st"] = {"hipArray", CONV_MEM, API_RUNTIME}; // API_Runtime ANALOGUE (cudaArray) + cuda2hipRename["CUarray"] = {"hipArray *", CONV_TYPE, API_DRIVER}; // API_Runtime ANALOGUE (cudaArray_t) + // unsupported yet by HIP cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK"] = {"hipDeviceAttributeMaxThreadsPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 1 // API_Runtime ANALOGUE (cudaDevAttrMaxThreadsPerBlock = 1) cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X"] = {"hipDeviceAttributeMaxBlockDimX", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 2 // API_Runtime ANALOGUE (cudaDevAttrMaxBlockDimX = 2) @@ -554,8 +593,8 @@ struct cuda2hipMap { // TODO: Analogues enum is needed in HIP. Couldn't map enum to struct hipPointerAttribute_t. // TODO: Do for Pointer Attributes the same as for Device Attributes. - // cuda2hipRename["CUpointer_attribute_enum"] = {"hipPointerAttribute", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) - // cuda2hipRename["CUpointer_attribute"] = {"hipPointerAttribute", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) + // cuda2hipRename["CUpointer_attribute_enum"] = {"hipPointerAttribute_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) + // cuda2hipRename["CUpointer_attribute"] = {"hipPointerAttribute_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) cuda2hipRename["CU_POINTER_ATTRIBUTE_CONTEXT"] = {"hipPointerAttributeContext", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 1 // API_Runtime ANALOGUE (no) cuda2hipRename["CU_POINTER_ATTRIBUTE_MEMORY_TYPE"] = {"hipPointerAttributeMemoryType", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 2 // API_Runtime ANALOGUE (no) cuda2hipRename["CU_POINTER_ATTRIBUTE_DEVICE_POINTER"] = {"hipPointerAttributeDevicePointer", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 3 // API_Runtime ANALOGUE (no) @@ -565,13 +604,17 @@ struct cuda2hipMap { cuda2hipRename["CU_POINTER_ATTRIBUTE_BUFFER_ID"] = {"hipPointerAttributeBufferId", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 7 // API_Runtime ANALOGUE (no) cuda2hipRename["CU_POINTER_ATTRIBUTE_IS_MANAGED"] = {"hipPointerAttributeIsManaged", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 8 // API_Runtime ANALOGUE (no) - // pointer to CUfunc_st cuda2hipRename["CUfunction"] = {"hipFunction_t", CONV_TYPE, API_DRIVER}; - // TODO: in HIP ihipModuleSymbol_t should be declared in hip_runtime_api.h, not in hcc_detail/hip_runtime_api.h, as it's analogue CUfunc_st is declared also in cuda.h - // ToDO: examples are needed with CUfunc_st + // TODO: move "typedef struct ihipModuleSymbol_t *hipFunction_t;" from hcc_details to HIP + // typedef struct CUfunc_st *CUfunction; // cuda2hipRename["CUfunc_st"] = {"ihipModuleSymbol_t", CONV_TYPE, API_DRIVER}; + // typedef struct CUgraphicsResource_st *CUgraphicsResource; + cuda2hipRename["CUgraphicsResource"] = {"hipGraphicsResource_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + // typedef struct CUmipmappedArray_st *CUmipmappedArray; + cuda2hipRename["CUmipmappedArray"] = {"hipMipmappedArray_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + // unsupported yet by HIP cuda2hipRename["CUfunction_attribute"] = {"hipFuncAttribute_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["CUfunction_attribute_enum"] = {"hipFuncAttribute_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; @@ -607,8 +650,6 @@ struct cuda2hipMap { cuda2hipRename["CU_OCCUPANCY_DEFAULT"] = {"hipOccupancyDefault", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x00 // API_Runtime ANALOGUE (cudaOccupancyDefault = 0x0) cuda2hipRename["CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE"] = {"hipOccupancyDisableCachingOverride", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 // API_Runtime ANALOGUE (cudaOccupancyDisableCachingOverride = 0x1) - - cuda2hipRename["CUfunc_cache_enum"] = {"hipFuncCache", CONV_TYPE, API_DRIVER}; // API_Runtime ANALOGUE (cudaFuncCache) cuda2hipRename["CUfunc_cache"] = {"hipFuncCache", CONV_TYPE, API_DRIVER}; // API_Runtime ANALOGUE (cudaFuncCache) cuda2hipRename["CU_FUNC_CACHE_PREFER_NONE"] = {"hipFuncCachePreferNone", CONV_CACHE, API_DRIVER}; // 0x00 // API_Runtime ANALOGUE (cudaFilterModePoint = 0) @@ -752,24 +793,41 @@ struct cuda2hipMap { cuda2hipRename["CU_RES_VIEW_FORMAT_SIGNED_BC6H"] = {"hipResViewFormatSignedBlockCompressed6H", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x21 // API_Runtime ANALOGUE (cudaResViewFormatSignedBlockCompressed6H = 0x21) cuda2hipRename["CU_RES_VIEW_FORMAT_UNSIGNED_BC7"] = {"hipResViewFormatUnsignedBlockCompressed7", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x22 // API_Runtime ANALOGUE (cudaResViewFormatUnsignedBlockCompressed7 = 0x22) - - - cuda2hipRename["CUsharedconfig_enum"] = {"hipSharedMemConfig", CONV_TYPE, API_DRIVER}; cuda2hipRename["CUsharedconfig"] = {"hipSharedMemConfig", CONV_TYPE, API_DRIVER}; + cuda2hipRename["CUsharedconfig_enum"] = {"hipSharedMemConfig", CONV_TYPE, API_DRIVER}; cuda2hipRename["CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE"] = {"hipSharedMemBankSizeDefault", CONV_DEV, API_DRIVER}; cuda2hipRename["CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE"] = {"hipSharedMemBankSizeFourByte", CONV_DEV, API_DRIVER}; cuda2hipRename["CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE"] = {"hipSharedMemBankSizeEightByte", CONV_DEV, API_DRIVER}; cuda2hipRename["CUcontext"] = {"hipCtx_t", CONV_TYPE, API_DRIVER}; - // TODO: - // cuda2hipRename["CUctx_st"] = {"XXXX", CONV_TYPE, API_DRIVER}; + // TODO: move "typedef struct ihipCtx_t *hipCtx_t;" from hcc_details to HIP + // typedef struct CUctx_st *CUcontext; + // cuda2hipRename["CUctx_st"] = {"ihipCtx_t", CONV_TYPE, API_DRIVER}; cuda2hipRename["CUmodule"] = {"hipModule_t", CONV_TYPE, API_DRIVER}; - // TODO: - // cuda2hipRename["CUmod_st"] = {"XXXX", CONV_TYPE, API_DRIVER}; + // TODO: move "typedef struct ihipModule_t *hipModule_t;" from hcc_details to HIP + // typedef struct CUmod_st *CUmodule; + // cuda2hipRename["CUmod_st"] = {"ihipModule_t", CONV_TYPE, API_DRIVER}; cuda2hipRename["CUstream"] = {"hipStream_t", CONV_TYPE, API_DRIVER}; - // TODO: - // cuda2hipRename["CUstream_st"] = {"XXXX", CONV_TYPE, API_DRIVER}; - // Stream Flags + // TODO: move "typedef struct ihipStream_t *hipStream_t;" from hcc_details to HIP + // typedef struct CUstream_st *CUstream; + // cuda2hipRename["CUstream_st"] = {"ihipStream_t", CONV_TYPE, API_DRIVER}; + + // typedef void (*hipStreamCallback_t) (hipStream_t stream, hipError_t status, void* userData); + // typedef void (CUDA_CB *CUstreamCallback) (CUstream hStream, CUresult status, void* userData) + cuda2hipRename["CUstreamCallback"] = {"hipStreamCallback_t", CONV_TYPE, API_DRIVER}; + + cuda2hipRename["CUsurfObject"] = {"hipSurfaceObject", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + // typedef struct CUsurfref_st *CUsurfref; + cuda2hipRename["CUsurfref"] = {"hipSurfaceReference_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + // cuda2hipRename["CUsurfref_st"] = {"ihipSurfaceReference_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CUtexObject"] = {"hipTextureObject", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + // typedef struct CUtexref_st *CUtexref; + cuda2hipRename["CUtexref"] = {"hipTextureReference_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + // cuda2hipRename["CUtexref_st"] = {"ihipTextureReference_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + + // Stream Flags enum + cuda2hipRename["CUstream_flags"] = {"hipStreamFlags", CONV_STREAM, API_DRIVER}; + // cuda2hipRename["CUstream_flags_enum"] = {"hipStreamFlags", CONV_STREAM, API_DRIVER}; cuda2hipRename["CU_STREAM_DEFAULT"] = {"hipStreamDefault", CONV_STREAM, API_DRIVER}; cuda2hipRename["CU_STREAM_NON_BLOCKING"] = {"hipStreamNonBlocking", CONV_STREAM, API_DRIVER}; @@ -1254,10 +1312,10 @@ struct cuda2hipMap { // Compute mode cuda2hipRename["cudaComputeMode"] = {"hipComputeMode", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // API_DRIVER ANALOGUE (CUcomputemode) - cuda2hipRename["cudaComputeModeDefault"] = {"hipComputeModeDefault", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 0 - cuda2hipRename["cudaComputeModeExclusive"] = {"hipComputeModeExclusive", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 1 - cuda2hipRename["cudaComputeModeProhibited"] = {"hipComputeModeProhibited", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 2 - cuda2hipRename["cudaComputeModeExclusiveProcess"] = {"hipComputeModeExclusiveProcess", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 3 + cuda2hipRename["cudaComputeModeDefault"] = {"hipComputeModeDefault", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 0 // API_DRIVER ANALOGUE (CU_COMPUTEMODE_DEFAULT = 0) + cuda2hipRename["cudaComputeModeExclusive"] = {"hipComputeModeExclusive", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 1 // API_DRIVER ANALOGUE (CU_COMPUTEMODE_EXCLUSIVE = 1) + cuda2hipRename["cudaComputeModeProhibited"] = {"hipComputeModeProhibited", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 2 // API_DRIVER ANALOGUE (CU_COMPUTEMODE_PROHIBITED = 2) + cuda2hipRename["cudaComputeModeExclusiveProcess"] = {"hipComputeModeExclusiveProcess", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 3 // API_DRIVER ANALOGUE (CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3) // Device Flags // unsupported yet by HIP From 4841eb2bffc4a0b5ec660ec8ce117041b8ee41d3 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Wed, 3 May 2017 22:45:56 +0300 Subject: [PATCH 09/77] [HIPIFY] Blas update: add a few functions, supported by HIP. cublasDaxpy -> hipblasDaxpy cublasDgemv -> hipblasDgemv cublasDger -> hipblasDger cublasDgemm -> hipblasDgemm cublasDgemmBatched -> hipblasDgemmBatched cublasGetStream -> hipblasGetStream cublasSetStream -> hipblasSetStream cublasDaxpy -> hipblasDaxpy [ROCm/hip commit: 126989760608eeac800a3942c672070dbf59fed9] --- projects/hip/hipify-clang/src/Cuda2Hip.cpp | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/projects/hip/hipify-clang/src/Cuda2Hip.cpp b/projects/hip/hipify-clang/src/Cuda2Hip.cpp index b3b2b993e3..3b34f0c1c1 100644 --- a/projects/hip/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/hip/hipify-clang/src/Cuda2Hip.cpp @@ -1601,7 +1601,7 @@ struct cuda2hipMap { // Blas types cuda2hipRename["cublasHandle_t"] = {"hipblasHandle_t", CONV_TYPE, API_BLAS}; // TODO: dereferencing: typedef struct cublasContext *cublasHandle_t; - cuda2hipRename["cublasContext"] = {"hipblasHandle_t", CONV_TYPE, API_BLAS}; + // cuda2hipRename["cublasContext"] = {"hipblasHandle_t", CONV_TYPE, API_BLAS}; // Blas management functions // unsupported yet by hipblas/hcblas cuda2hipRename["cublasInit"] = {"hipblasInit", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; @@ -1709,10 +1709,9 @@ struct cuda2hipMap { // AXPY cuda2hipRename["cublasSaxpy"] = {"hipblasSaxpy", CONV_MATH_FUNC, API_BLAS}; - // there is no such a function in CUDA cuda2hipRename["cublasSaxpyBatched"] = {"hipblasSaxpyBatched", CONV_MATH_FUNC, API_BLAS}; // unsupported yet by hipblas/hcblas - cuda2hipRename["cublasDaxpy"] = {"hipblasDaxpy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; + cuda2hipRename["cublasDaxpy"] = {"hipblasDaxpy", CONV_MATH_FUNC, API_BLAS}; cuda2hipRename["cublasCaxpy"] = {"hipblasCaxpy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; cuda2hipRename["cublasZaxpy"] = {"hipblasZaxpy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; @@ -1789,8 +1788,8 @@ struct cuda2hipMap { cuda2hipRename["cublasSgemv"] = {"hipblasSgemv", CONV_MATH_FUNC, API_BLAS}; // there is no such a function in CUDA cuda2hipRename["cublasSgemvBatched"] = {"hipblasSgemvBatched", CONV_MATH_FUNC, API_BLAS}; + cuda2hipRename["cublasDgemv"] = {"hipblasDgemv", CONV_MATH_FUNC, API_BLAS}; // unsupported yet by hipblas/hcblas - cuda2hipRename["cublasDgemv"] = {"hipblasDgemv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; cuda2hipRename["cublasCgemv"] = {"hipblasCgemv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; cuda2hipRename["cublasZgemv"] = {"hipblasZgemv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; @@ -1868,8 +1867,8 @@ struct cuda2hipMap { // GER cuda2hipRename["cublasSger"] = {"hipblasSger", CONV_MATH_FUNC, API_BLAS}; + cuda2hipRename["cublasDger"] = {"hipblasDger", CONV_MATH_FUNC, API_BLAS}; // unsupported yet by hipblas/hcblas - cuda2hipRename["cublasDger"] = {"hipblasDger", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; cuda2hipRename["cublasCgeru"] = {"hipblasCgeru", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; cuda2hipRename["cublasCgerc"] = {"hipblasCgerc", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; cuda2hipRename["cublasZgeru"] = {"hipblasZgeru", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; @@ -1906,8 +1905,7 @@ struct cuda2hipMap { // Blas3 (v1) Routines // GEMM cuda2hipRename["cublasSgemm"] = {"hipblasSgemm", CONV_MATH_FUNC, API_BLAS}; - // unsupported yet by hipblas/hcblas - cuda2hipRename["cublasDgemm"] = {"hipblasDgemm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; + cuda2hipRename["cublasDgemm"] = {"hipblasDgemm", CONV_MATH_FUNC, API_BLAS}; cuda2hipRename["cublasCgemm"] = {"hipblasCgemm", CONV_MATH_FUNC, API_BLAS}; // unsupported yet by hipblas/hcblas @@ -1915,8 +1913,7 @@ struct cuda2hipMap { // BATCH GEMM cuda2hipRename["cublasSgemmBatched"] = {"hipblasSgemmBatched", CONV_MATH_FUNC, API_BLAS}; - // unsupported yet by hipblas/hcblas - cuda2hipRename["cublasDgemmBatched"] = {"hipblasDgemmBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; + cuda2hipRename["cublasDgemmBatched"] = {"hipblasDgemmBatched", CONV_MATH_FUNC, API_BLAS}; cuda2hipRename["cublasCgemmBatched"] = {"hipblasCgemmBatched", CONV_MATH_FUNC, API_BLAS}; // unsupported yet by hipblas/hcblas @@ -2074,10 +2071,9 @@ struct cuda2hipMap { cuda2hipRename["cublasCreate_v2"] = {"hipblasCreate", CONV_MATH_FUNC, API_BLAS}; cuda2hipRename["cublasDestroy_v2"] = {"hipblasDestroy", CONV_MATH_FUNC, API_BLAS}; - // unsupported yet by hipblas/hcblas cuda2hipRename["cublasGetVersion_v2"] = {"hipblasGetVersion", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; - cuda2hipRename["cublasSetStream_v2"] = {"hipblasSetStream", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; - cuda2hipRename["cublasGetStream_v2"] = {"hipblasGetStream", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; + cuda2hipRename["cublasSetStream_v2"] = {"hipblasSetStream", CONV_MATH_FUNC, API_BLAS}; + cuda2hipRename["cublasGetStream_v2"] = {"hipblasGetStream", CONV_MATH_FUNC, API_BLAS}; cuda2hipRename["cublasGetPointerMode_v2"] = {"hipblasGetPointerMode", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; cuda2hipRename["cublasSetPointerMode_v2"] = {"hipblasSetPointerMode", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; @@ -2294,7 +2290,7 @@ struct cuda2hipMap { // AXPY cuda2hipRename["cublasSaxpy_v2"] = {"hipblasSaxpy", CONV_MATH_FUNC, API_BLAS}; // unsupported yet by hipblas/hcblas - cuda2hipRename["cublasDaxpy_v2"] = {"hipblasDaxpy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; + cuda2hipRename["cublasDaxpy_v2"] = {"hipblasDaxpy", CONV_MATH_FUNC, API_BLAS}; cuda2hipRename["cublasCaxpy_v2"] = {"hipblasCaxpy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; cuda2hipRename["cublasZaxpy_v2"] = {"hipblasZaxpy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; From 869dfcd01f17f3cbe7004a9b53bb27ac9836f3a3 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Wed, 3 May 2017 23:05:44 +0300 Subject: [PATCH 10/77] [HIPIFY] CUDA RT memcpy functions update. cudaMemcpyFromSymbol -> hipMemcpyFromSymbol cudaMemcpyFromSymbolAsync -> hipMemcpyFromSymbolAsync cudaMemcpy2DAsync -> hipMemcpy2DAsync [ROCm/hip commit: 9b65358c682b8b1026c70d1e40035b68d0039073] --- projects/hip/hipify-clang/src/Cuda2Hip.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/projects/hip/hipify-clang/src/Cuda2Hip.cpp b/projects/hip/hipify-clang/src/Cuda2Hip.cpp index 3b34f0c1c1..07fc817b53 100644 --- a/projects/hip/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/hip/hipify-clang/src/Cuda2Hip.cpp @@ -1030,10 +1030,10 @@ struct cuda2hipMap { cuda2hipRename["cudaMemcpyToSymbolAsync"] = {"hipMemcpyToSymbolAsync", CONV_MEM, API_RUNTIME}; cuda2hipRename["cudaMemcpyAsync"] = {"hipMemcpyAsync", CONV_MEM, API_RUNTIME}; cuda2hipRename["cudaMemcpy2D"] = {"hipMemcpy2D", CONV_MEM, API_RUNTIME}; + cuda2hipRename["cudaMemcpy2DAsync"] = {"hipMemcpy2DAsync", CONV_MEM, API_RUNTIME}; cuda2hipRename["cudaMemcpy2DToArray"] = {"hipMemcpy2DToArray", CONV_MEM, API_RUNTIME}; // unsupported yet by HIP cuda2hipRename["cudaMemcpy2DArrayToArray"] = {"hipMemcpy2DArrayToArray", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaMemcpy2DAsync"] = {"hipMemcpy2DAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaMemcpy2DFromArray"] = {"hipMemcpy2DFromArray", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaMemcpy2DFromArrayAsync"] = {"hipMemcpy2DFromArrayAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaMemcpy2DToArrayAsync"] = {"hipMemcpy2DToArrayAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; @@ -1043,7 +1043,8 @@ struct cuda2hipMap { cuda2hipRename["cudaMemcpy3DPeerAsync"] = {"hipMemcpy3DPeerAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaMemcpyArrayToArray"] = {"hipMemcpyArrayToArray", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaMemcpyFromArrayAsync"] = {"hipMemcpyFromArrayAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaMemcpyFromSymbolAsync"] = {"hipMemcpyFromSymbolAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; + cuda2hipRename["cudaMemcpyFromSymbol"] = {"hipMemcpyFromSymbol", CONV_MEM, API_RUNTIME}; + cuda2hipRename["cudaMemcpyFromSymbolAsync"] = {"hipMemcpyFromSymbolAsync", CONV_MEM, API_RUNTIME}; // memcpy kind cuda2hipRename["cudaMemcpyKind"] = {"hipMemcpyKind", CONV_MEM, API_RUNTIME}; From 0557f54200d6e5ed758f4bce3535bb3bb89eb1e0 Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Thu, 4 May 2017 06:47:55 +0530 Subject: [PATCH 11/77] hipMemcpy2DAsync for HIP/NVCC Change-Id: I46f0057fef49bdaaac41c1df80c3e27432b8f376 [ROCm/hip commit: 1cb51d614e730f56267e14e30951c5f7781b16e0] --- projects/hip/include/hip/nvcc_detail/hip_runtime_api.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h b/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h index 4feefcc342..aad3ffcc44 100644 --- a/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h +++ b/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h @@ -345,7 +345,11 @@ inline static hipError_t hipMemcpyFromSymbolAsync(void *dst, const void* symbolN } inline static hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind){ - return hipCUDAErrorTohipError(cudaMemcpy2D(dst, dpitch, src, spitch, width, height, hipMemcpyKindToCudaMemcpyKind(kind))); + return hipCUDAErrorTohipError(cudaMemcpy2D(dst, dpitch, src, spitch, width, height, hipMemcpyKindToCudaMemcpyKind(kind))); +} + +inline static hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream) { + return hipCUDAErrorTohipError(cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, hipMemcpyKindToCudaMemcpyKind(kind),stream)); } inline static hipError_t hipMemcpy2DToArray(hipArray *dst, size_t wOffset, size_t hOffset, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind){ From dfbc039ebba7df6fc6f16e35ddc079a0cdd7144c Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Fri, 5 May 2017 21:28:02 +0300 Subject: [PATCH 12/77] [HIPIFY] LLVM 3.9 support 3.8 and 3.9 are both supported. 3.8 is stable, 3.9 needs more testing. [ROCm/hip commit: 05be936fd6ee2fa45fbab68cc70765782e7af8fe] --- projects/hip/hipify-clang/CMakeLists.txt | 9 +++++++-- projects/hip/hipify-clang/src/Cuda2Hip.cpp | 4 ++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/projects/hip/hipify-clang/CMakeLists.txt b/projects/hip/hipify-clang/CMakeLists.txt index a02b91407f..872db3defe 100644 --- a/projects/hip/hipify-clang/CMakeLists.txt +++ b/projects/hip/hipify-clang/CMakeLists.txt @@ -6,8 +6,12 @@ set(BUILD_HIPIFY_CLANG 0 PARENT_SCOPE) # Find LLVM package find_package(LLVM 3.8 QUIET PATHS ${HIPIFY_CLANG_LLVM_DIR} NO_DEFAULT_PATH) if (NOT ${LLVM_FOUND}) - message(STATUS "hipify-clang will not be built. To build it please specify absolute path to LLVM (v3.8) package using -DHIPIFY_CLANG_LLVM_DIR") -else() + find_package(LLVM 3.9 QUIET PATHS ${HIPIFY_CLANG_LLVM_DIR} NO_DEFAULT_PATH) + if (NOT ${LLVM_FOUND}) + message(STATUS "hipify-clang will not be built. To build it please specify absolute path to LLVM 3.8 or LLVM 3.9 package using -DHIPIFY_CLANG_LLVM_DIR") + endif() +endif() +if (${LLVM_FOUND}) list(APPEND CMAKE_MODULE_PATH ${LLVM_CMAKE_DIR}) include(AddLLVM) @@ -31,6 +35,7 @@ else() clangSerialization clangSema clangEdit + clangFormat clangLex clangAnalysis clangDriver diff --git a/projects/hip/hipify-clang/src/Cuda2Hip.cpp b/projects/hip/hipify-clang/src/Cuda2Hip.cpp index 07fc817b53..1c6406b4ab 100644 --- a/projects/hip/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/hip/hipify-clang/src/Cuda2Hip.cpp @@ -3533,7 +3533,11 @@ void printAllStats(const std::string &csvFile, int64_t totalFiles, int64_t conve int main(int argc, const char **argv) { auto start = std::chrono::steady_clock::now(); auto begin = start; +#if (LLVM_VERSION_MAJOR >= 3) && (LLVM_VERSION_MINOR >= 9) + llvm::sys::PrintStackTraceOnErrorSignal(StringRef()); +#else llvm::sys::PrintStackTraceOnErrorSignal(); +#endif CommonOptionsParser OptionsParser(argc, argv, ToolTemplateCategory, llvm::cl::OneOrMore); std::vector fileSources = OptionsParser.getSourcePathList(); std::string dst = OutputFilename; From 3bf30acbd78ecad76543b42b8699a23aaffd128a Mon Sep 17 00:00:00 2001 From: wsttiger Date: Wed, 3 May 2017 14:21:08 +0000 Subject: [PATCH 13/77] Improve hipStreamWaitEvent test. - use addOne kernel, use local initializer rather than init_array. - use addOneReverse test to add from back of array. Test alternate fwd and backward to stress dependency logic. - check device-side dependencies. [ROCm/hip commit: 2a253680da5e342f484689b672ead3ba966a6272] --- .../runtimeApi/stream/hipStreamWaitEvent.cpp | 154 ++++++++++++++++-- 1 file changed, 137 insertions(+), 17 deletions(-) diff --git a/projects/hip/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp b/projects/hip/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp index 637275c381..63c42da557 100644 --- a/projects/hip/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp +++ b/projects/hip/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp @@ -28,7 +28,41 @@ THE SOFTWARE. #include "hip/hip_runtime.h" #include "test_common.h" #include +#include unsigned p_streams = 6; +unsigned p_db = 0; + + +template +__global__ void +addOne( const T *A_d, + T *C_d, + size_t NELEM) +{ + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; + + for (size_t i=offset; i +__global__ void +addOneReverse( const T *A_d, + T *C_d, + int64_t NELEM) +{ + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; + + for (int64_t i=NELEM-stride+offset; i>=0; i-=stride) { + C_d[i] = A_d[i] + (T)1; + //C_d[i] = (T)1; + } +} //------ @@ -36,49 +70,90 @@ unsigned p_streams = 6; template class Streamer { public: - Streamer(size_t numElements); + Streamer(T *input, size_t numElements, bool reverse); ~Streamer(); - void runAsync(); + void runAsyncAfter(Streamer *depStreamer); + void runAsyncWaitSameStream(); void queryUntilComplete(); + void syncAndCheck(int streamerNum, T initValue, T expectedOffset); + + hipEvent_t event() { return _event; }; + + T *C_d() { return _C_d; }; + private: - T *_A_h; - T *_B_h; T *_C_h; T *_A_d; - T *_B_d; T *_C_d; hipStream_t _stream; hipEvent_t _event; size_t _numElements; + bool _reverse; }; + template -Streamer::Streamer(size_t numElements) : - _numElements(numElements) +Streamer::Streamer(T * A_d, size_t numElements, bool reverse) : + _A_d(A_d), + _numElements(numElements), + _reverse(reverse) { - HipTest::initArrays (&_A_d, &_B_d, &_C_d, &_A_h, &_B_h, &_C_h, numElements, true); + size_t sizeElements = numElements * sizeof(int); + + HIPCHECK(hipMalloc(&_C_d, sizeElements)); + HIPCHECK(hipHostMalloc(&_C_h, sizeElements)); + + HIPCHECK(hipMemset(_C_d, -1, sizeElements)); + HIPCHECK(hipMemset(_C_h, -2, sizeElements)); HIPCHECK(hipStreamCreate(&_stream)); HIPCHECK(hipEventCreate(&_event)); }; + template -void Streamer::runAsync() +void Streamer::runAsyncAfter(Streamer *depStreamer) +{ + if (p_db) { + printf ("testing: %s numElements=%zu size=%6.2fMB\n", __func__, _numElements, _numElements * sizeof(T) / 1024.0/1024.0); + } + + if (depStreamer) { + HIPCHECK(hipStreamWaitEvent(_stream, depStreamer->event(), 0)); + } + + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements); + if (_reverse) { + hipLaunchKernelGGL(addOneReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements); + } else { + hipLaunchKernelGGL(addOne, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements); + } + HIPCHECK(hipEventRecord(_event, _stream)); +} + + +template +void Streamer::runAsyncWaitSameStream() { printf ("testing: %s numElements=%zu size=%6.2fMB\n", __func__, _numElements, _numElements * sizeof(T) / 1024.0/1024.0); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements); - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _B_d, _C_d, _numElements); + if (_reverse) { + hipLaunchKernelGGL(addOneReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements); + } else { + hipLaunchKernelGGL(addOne, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements); + } // Test case where hipStreamWaitEvent waits on same event we just placed into the queue. HIPCHECK(hipEventRecord(_event, _stream)); HIPCHECK(hipStreamWaitEvent(_stream, _event, 0)); } + template void Streamer::queryUntilComplete() { @@ -89,10 +164,26 @@ void Streamer::queryUntilComplete() e = hipStreamQuery(_stream); } while (e != hipSuccess) ; - printf ("completed after %d queries\n", numQueries); + printf ("info: hipStreamQuery completed after %d queries\n", numQueries); }; +template +void Streamer::syncAndCheck(int streamerNum, T initValue, T expectedOffset) +{ + HIPCHECK(hipMemcpyAsync(_C_h, _C_d, _numElements*sizeof(T), hipMemcpyDeviceToHost, _stream)); + HIPCHECK(hipStreamSynchronize(_stream)); + + T expected = initValue + expectedOffset; + + for (size_t i=0; i<_numElements; i++) { + if (_C_h[i] != expected) { + failed("for streamer:%d _C_h[%zu] (%d) != expected(%d)\n", streamerNum, i, _C_h[i], expected); + } + } +} + + //--- //Parse arguments specific to this test. @@ -122,39 +213,68 @@ int main(int argc, char *argv[]) HipTest::parseStandardArguments(argc, argv, false); parseMyArguments(argc, argv); - typedef Streamer FloatStreamer; + typedef Streamer IntStreamer; - std::vector streamers; + std::vector streamers; size_t numElements = N; + size_t sizeElements = numElements * sizeof(int); + + assert (sizeElements <= std::numeric_limits::max()); + + + int initValue = 1000; + + int * initArray_d, *initArray_h; + HIPCHECK(hipMalloc(&initArray_d, sizeElements)); + HIPCHECK(hipHostMalloc(&initArray_h, sizeElements)); + for (size_t i=0; iC_d() : initArray_d, numElements, i&1 /*reverse?*/); streamers.push_back(s); } if (p_tests & 0x1) { - printf ("==> Test 0x1 runAsnc\n"); + printf ("==> Test 0x1 runAsyncAfter\n"); for (int i=0; irunAsync(); + streamers[i]->runAsyncAfter(i ? streamers[i-1] : NULL); } HIPCHECK(hipDeviceSynchronize()); + + for (int i=0; isyncAndCheck(i+1, initValue, i+1); + } } if (p_tests & 0x2) { printf ("==> Test 0x2 queryUntilComplete\n"); for (int i=0; irunAsync(); + streamers[i]->runAsyncAfter(i ? streamers[i-1] : NULL); streamers[i]->queryUntilComplete(); } HIPCHECK(hipDeviceSynchronize()); } if (p_tests & 0x4) { + printf ("==> Test 0x4 try null stream"); hipStreamQuery(0/* try null stream*/); } + if (p_tests & 0x8) { + printf ("==> Test 0x8 runAsyncWaitSameStream\n"); + for (int i=0; irunAsyncWaitSameStream(); + } + HIPCHECK(hipDeviceSynchronize()); + } + passed(); } From 2e2c773643981c40ecbc49700c8eae6cd5bcbf69 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 5 May 2017 17:19:37 -0500 Subject: [PATCH 14/77] Update streamEventTEst. - add checks for events across devices. - refactor test to make sure it runs long enough to sensitive sync techniques. - add tests for DeviceSync, streamWaitEvent. [ROCm/hip commit: 2d0f509de52096e49afe476abadcfce2cded1244] --- .../runtimeApi/stream/hipStreamWaitEvent.cpp | 321 ++++++++++++++---- 1 file changed, 250 insertions(+), 71 deletions(-) diff --git a/projects/hip/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp b/projects/hip/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp index 63c42da557..1d9ec45685 100644 --- a/projects/hip/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp +++ b/projects/hip/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp @@ -29,39 +29,47 @@ THE SOFTWARE. #include "test_common.h" #include #include -unsigned p_streams = 6; +unsigned p_streams = 8; unsigned p_db = 0; +unsigned p_count = 100; + template __global__ void -addOne( const T *A_d, +addCount( const T *A_d, T *C_d, - size_t NELEM) + size_t NELEM, + int count) { size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); size_t stride = hipBlockDim_x * hipGridDim_x ; - for (size_t i=offset; i __global__ void -addOneReverse( const T *A_d, +addCountReverse( const T *A_d, T *C_d, - int64_t NELEM) + int64_t NELEM, + int count) { size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); size_t stride = hipBlockDim_x * hipGridDim_x ; - for (int64_t i=NELEM-stride+offset; i>=0; i-=stride) { - C_d[i] = A_d[i] + (T)1; - //C_d[i] = (T)1; - } + // Deliberately do this in an inefficient way to increase kernel runtime + for (int i=0; i=0; i-=stride) { + C_d[i] = A_d[i] + (T)count; + } + } } @@ -70,41 +78,65 @@ addOneReverse( const T *A_d, template class Streamer { public: - Streamer(T *input, size_t numElements, bool reverse); + Streamer(int deviceId, T *input, size_t numElements, bool reverse); ~Streamer(); - void runAsyncAfter(Streamer *depStreamer); + void runAsyncAfter(Streamer *depStreamer, bool waitSameStream=false); void runAsyncWaitSameStream(); void queryUntilComplete(); - void syncAndCheck(int streamerNum, T initValue, T expectedOffset); + size_t check(int streamerNum, T initValue, T expectedOffset, bool expectPass=true); + void copyToHost(hipStream_t copyStream); hipEvent_t event() { return _event; }; + int deviceId() const { return _deviceId; }; + size_t mismatchCount() const { return _mismatchCount; }; T *C_d() { return _C_d; }; private: + T *_C_h; + T *_preA_d; // if input is on another device, this is pointer to that memory. T *_A_d; T *_C_d; hipStream_t _stream; hipEvent_t _event; + int _deviceId; size_t _numElements; bool _reverse; + + size_t _mismatchCount; }; template -Streamer::Streamer(T * A_d, size_t numElements, bool reverse) : +Streamer::Streamer(int deviceId, T * A_d, size_t numElements, bool reverse) : + _preA_d(NULL), _A_d(A_d), + _deviceId(deviceId), _numElements(numElements), _reverse(reverse) { size_t sizeElements = numElements * sizeof(int); + HIPCHECK(hipSetDevice(_deviceId)); + + + hipPointerAttribute_t attr; + HIPCHECK(hipPointerGetAttributes(&attr, A_d)); + if (attr.device != deviceId) { + // source is on another device, we will need to copy later. + // So save original source pointer and allocate local space. + printf ("info: source for streamer on another device, will insert memcpy\n"); + _preA_d = A_d; + HIPCHECK(hipMalloc(&_A_d, sizeElements)); + HIPCHECK(hipMemset(_A_d, -3, sizeElements)); + } + HIPCHECK(hipMalloc(&_C_d, sizeElements)); HIPCHECK(hipHostMalloc(&_C_h, sizeElements)); @@ -113,12 +145,16 @@ Streamer::Streamer(T * A_d, size_t numElements, bool reverse) : HIPCHECK(hipStreamCreate(&_stream)); HIPCHECK(hipEventCreate(&_event)); + + + }; template -void Streamer::runAsyncAfter(Streamer *depStreamer) +void Streamer::runAsyncAfter(Streamer *depStreamer, bool waitSameStream) { + HIPCHECK(hipSetDevice(_deviceId)); if (p_db) { printf ("testing: %s numElements=%zu size=%6.2fMB\n", __func__, _numElements, _numElements * sizeof(T) / 1024.0/1024.0); } @@ -127,36 +163,31 @@ void Streamer::runAsyncAfter(Streamer *depStreamer) HIPCHECK(hipStreamWaitEvent(_stream, depStreamer->event(), 0)); } - unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements); - if (_reverse) { - hipLaunchKernelGGL(addOneReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements); - } else { - hipLaunchKernelGGL(addOne, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements); - } - HIPCHECK(hipEventRecord(_event, _stream)); -} - - -template -void Streamer::runAsyncWaitSameStream() -{ - printf ("testing: %s numElements=%zu size=%6.2fMB\n", __func__, _numElements, _numElements * sizeof(T) / 1024.0/1024.0); - unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements); - if (_reverse) { - hipLaunchKernelGGL(addOneReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements); - } else { - hipLaunchKernelGGL(addOne, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements); + if (_preA_d) { + // _preA_d is on another device, so copy to local device so kernel can access it: + HIPCHECK(hipMemcpyAsync(_A_d, _preA_d, _numElements * sizeof(T), hipMemcpyDeviceToDevice, _stream)); } - // Test case where hipStreamWaitEvent waits on same event we just placed into the queue. + + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements); + if (_reverse) { + hipLaunchKernelGGL(addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count); + } else { + hipLaunchKernelGGL(addCount, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count); + } HIPCHECK(hipEventRecord(_event, _stream)); - HIPCHECK(hipStreamWaitEvent(_stream, _event, 0)); + + if (waitSameStream) { + HIPCHECK(hipStreamWaitEvent(_stream, _event, 0)); // this is essentially a no-op, but make sure it doesn't crash + } } + template void Streamer::queryUntilComplete() { + HIPCHECK(hipSetDevice(_deviceId)); int numQueries = 0; hipError_t e = hipSuccess; do { @@ -168,19 +199,48 @@ void Streamer::queryUntilComplete() }; +// If copyStream is !nullptr it is used for the copy. template -void Streamer::syncAndCheck(int streamerNum, T initValue, T expectedOffset) +void Streamer::copyToHost(hipStream_t copyStream) { - HIPCHECK(hipMemcpyAsync(_C_h, _C_d, _numElements*sizeof(T), hipMemcpyDeviceToHost, _stream)); - HIPCHECK(hipStreamSynchronize(_stream)); + if (p_db) { + printf ("db: copy back to host\n"); + } + HIPCHECK(hipSetDevice(_deviceId)); + HIPCHECK(hipMemcpyAsync(_C_h, _C_d, _numElements*sizeof(T), hipMemcpyDeviceToHost, copyStream ? copyStream : _stream)); + HIPCHECK(hipStreamSynchronize(copyStream ? copyStream:_stream)); +} + + +template +size_t Streamer::check(int streamerNum, T initValue, T expectedOffset, bool expectPass) +{ T expected = initValue + expectedOffset; + if (p_db) { + printf ("db: check\n"); + } + _mismatchCount = 0; for (size_t i=0; i<_numElements; i++) { if (_C_h[i] != expected) { - failed("for streamer:%d _C_h[%zu] (%d) != expected(%d)\n", streamerNum, i, _C_h[i], expected); + _mismatchCount++; + if (expectPass) { + fprintf(stderr, "for streamer:%d _C_h[%zu] (%d) != expected(%d)\n", streamerNum, i, _C_h[i], expected); + if (_mismatchCount > 10) { + failed("for streamer:%d _C_h[%zu] (%d) != expected(%d)\n", streamerNum, i, _C_h[i], expected); + } + } } } + + if (!expectPass && (_mismatchCount ==0)) { + // the test should run kernels long enough that if we don't correctly wait for them to finish then an error is reported. + //failed("for streamer:%d we expected inavalid synchronization to lead to mismatch but none was detected. Increase --N to sensitize sync.\n", streamerNum); + + } + + return _mismatchCount; } @@ -189,6 +249,8 @@ void Streamer::syncAndCheck(int streamerNum, T initValue, T expectedOffset) //Parse arguments specific to this test. void parseMyArguments(int argc, char *argv[]) { + N = 64*1024*1024; + int more_argc = HipTest::parseStandardArguments(argc, argv, false); // parse args for this test: @@ -199,6 +261,14 @@ void parseMyArguments(int argc, char *argv[]) if (++i >= argc || !HipTest::parseUInt(argv[i], &p_streams)) { failed("Bad streams argument"); } + } else if (!strcmp(arg, "--count")) { + if (++i >= argc || !HipTest::parseUInt(argv[i], &p_count)) { + failed("Bad count argument"); + } + } else if (!strcmp(arg, "--db")) { + if (++i >= argc || !HipTest::parseUInt(argv[i], &p_db)) { + failed("Bad db argument"); + } } else { failed("Bad argument '%s'", arg); } @@ -206,6 +276,91 @@ void parseMyArguments(int argc, char *argv[]) }; +typedef Streamer IntStreamer; + + + + +void runStreamerLoop(std::vector &streamers) +{ + for (int i=0; irunAsyncAfter(i ? streamers[i-1] : NULL); + } +} + + +void checkAll(int initValue, std::vector &streamers, std::vector &sideStreams, bool expectPass=true) +{ + size_t mismatchCount=0; + + // Copy in reverse order to catch anything not yet finished... + for (int i=streamers.size()-1; i>=0; i--) { + streamers[i]->copyToHost(sideStreams.empty() ? NULL : sideStreams[streamers[i]->deviceId()]); + } + + + // Check in forward order so we can find first mismatch: + for (int i=0; icheck(i+1, initValue, (i+1)*p_count, expectPass); + + } + if (!expectPass && (mismatchCount==0)) { + // the test should run kernels long enough that if we don't correctly wait for them to finish then an error is reported. + failed("we expected inavalid synchronization to lead to mismatch but none was detected. Increase --count to sensitize sync.\n"); + } + +} + + + +#define RUN_SYNC_TEST(_enableBit, _streamers, _sync, _expectPass)\ + if (p_tests & (_enableBit)) {\ + printf ("==> Test %02x runAsyncAfter sync=%s\n", (_enableBit), #_sync);\ + runStreamerLoop(_streamers);\ + (_sync);\ + checkAll (initValue, _streamers, sideStreams, _expectPass);\ + } + + + + +//--- +// A family of sync functions which somehow wait for inflight activity to finish: + + +void sync_none(void) {}; + +void sync_allDevices(int numDevices) +{ + for (int d=0; d streamers) +{ + for (int i=0; iqueryUntilComplete(); + }; +} + + +void sync_streamWaitEvent(hipEvent_t lastEvent, int sideDeviceId, hipStream_t sideStream, bool waitHere) +{ + HIPCHECK(hipSetDevice(sideDeviceId)); + + // wait on the last event in the stream of chained streamers: + // This plants a marker which the subsquent copy for this device will wait on: + HIPCHECK(hipStreamWaitEvent(sideStream, lastEvent, 0)); + + if (waitHere) { + HIPCHECK(hipStreamSynchronize(sideStream)); + } +} + + //--- int main(int argc, char *argv[]) @@ -213,13 +368,17 @@ int main(int argc, char *argv[]) HipTest::parseStandardArguments(argc, argv, false); parseMyArguments(argc, argv); - typedef Streamer IntStreamer; + std::vector streamers; + std::vector streamersDev0; // streamers for first device. size_t numElements = N; size_t sizeElements = numElements * sizeof(int); + printf("info: sizeof arrays = %zu elements (%6.3f MB)\n", numElements, sizeElements / 1024.0/1024.0); + printf("info: streams=%d count=%d\n", p_streams, p_count); + assert (sizeElements <= std::numeric_limits::max()); @@ -234,45 +393,65 @@ int main(int argc, char *argv[]) HIPCHECK(hipMemcpy(initArray_d, initArray_h, sizeElements, hipMemcpyHostToDevice)); + int numDevices; + HIPCHECK(hipGetDeviceCount(&numDevices)); + numDevices =2; // TODO - remove me. - for (int i=0; iC_d() : initArray_d, numElements, i&1 /*reverse?*/); - streamers.push_back(s); - } - - if (p_tests & 0x1) { - printf ("==> Test 0x1 runAsyncAfter\n"); + for (int d=0; drunAsyncAfter(i ? streamers[i-1] : NULL); - } - HIPCHECK(hipDeviceSynchronize()); - - for (int i=0; isyncAndCheck(i+1, initValue, i+1); + IntStreamer * s = new IntStreamer(d, i ? streamers.back()->C_d() : initArray_d, numElements, i&1 /*reverse?*/); + streamers.push_back(s); + if (d==0) { + streamersDev0.push_back(s); + } } } - if (p_tests & 0x2) { - printf ("==> Test 0x2 queryUntilComplete\n"); - for (int i=0; irunAsyncAfter(i ? streamers[i-1] : NULL); - streamers[i]->queryUntilComplete(); - } - HIPCHECK(hipDeviceSynchronize()); + // A sideband stream channel that is independent from above. + // Used to check to ensure the WaitEvent or other synchronization is working correctly since by default sideStream is + // asynchronous wrt the other streams. + std::vector sideStreams; + for (int d=0; d Test 0x4 try null stream"); + + // Tests on first GPU: + RUN_SYNC_TEST(0x01, streamersDev0, sync_none(), false); + RUN_SYNC_TEST(0x02, streamersDev0, sync_allDevices(numDevices), true); + RUN_SYNC_TEST(0x04, streamersDev0, sync_queryAllUntilComplete(streamersDev0), true); + RUN_SYNC_TEST(0x08, streamersDev0, sync_streamWaitEvent(streamersDev0.back()->event(), 0, sideStreams[0], false), true); + + if (numDevices > 1) { + // Sync on second device for activity running on device 0: + RUN_SYNC_TEST(0x10, streamersDev0, sync_streamWaitEvent(streamersDev0.back()->event(), 1, sideStreams[1], true), true); + } + + + // Tests on all GPUs: + // RUN_SYNC_TEST(0x100, streamers, sync_streamWaitEvent(streamers.back()->event(), 0, sideStreams[0], false), true); + + + + + if (p_tests & 0x1000) { + printf ("==> Test 0x1000 try null stream\n"); hipStreamQuery(0/* try null stream*/); } - if (p_tests & 0x8) { - printf ("==> Test 0x8 runAsyncWaitSameStream\n"); - for (int i=0; irunAsyncWaitSameStream(); + + // Insert small wrinkle here, insert a wait on event just recorded, all in the same stream. + if (p_tests & 0x2000) { + printf ("==> Test 0x2000 runAsyncWaitSameStream\n"); + for (int i=0; irunAsyncAfter(i ? streamersDev0[i-1] : NULL, true/*waitSameStream*/); } - HIPCHECK(hipDeviceSynchronize()); + + sync_streamWaitEvent(streamersDev0.back()->event(), 0, sideStreams[0], false); + checkAll (initValue, streamersDev0, sideStreams); } From 639d152ff807f6ffc54191a5c7331dd81bb7cdfd Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 5 May 2017 17:20:56 -0500 Subject: [PATCH 15/77] Refactor hipHostRegister test. Run all tests in one command. Run 128 offsets. [ROCm/hip commit: 6437f5d2b20f8c6e204cc35921dc1329a3d9dfb2] --- .../hip/tests/src/runtimeApi/memory/hipHostRegister.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/projects/hip/tests/src/runtimeApi/memory/hipHostRegister.cpp b/projects/hip/tests/src/runtimeApi/memory/hipHostRegister.cpp index 8cf0979261..3376ee04f1 100644 --- a/projects/hip/tests/src/runtimeApi/memory/hipHostRegister.cpp +++ b/projects/hip/tests/src/runtimeApi/memory/hipHostRegister.cpp @@ -19,9 +19,7 @@ THE SOFTWARE. /* HIT_START * BUILD: %t %s ../../test_common.cpp - * RUN: %t --tests 0x1 - * RUN: %t --tests 0x2 - * RUN: %t --tests 0x4 + * RUN: %t * HIT_END */ @@ -131,7 +129,7 @@ int main(int argc, char *argv[]) HIPCHECK(hipMalloc(&Bd, size)); // TODO - set to 128 -#define OFFSETS_TO_TRY 1 +#define OFFSETS_TO_TRY 128 assert (N>OFFSETS_TO_TRY); if (p_tests & 0x2) { From ef98415d0e6c67e5d1a1a839a850a1350e236cdf Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 5 May 2017 17:28:11 -0500 Subject: [PATCH 16/77] Fix some typos, add additional guidance for -BSymbolic [ROCm/hip commit: 687809104b93c89904146cb174af971d9a301c7d] --- projects/hip/docs/markdown/hip_bugs.md | 2 ++ projects/hip/src/hip_memory.cpp | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/projects/hip/docs/markdown/hip_bugs.md b/projects/hip/docs/markdown/hip_bugs.md index abb31d80e8..91b2a5a019 100644 --- a/projects/hip/docs/markdown/hip_bugs.md +++ b/projects/hip/docs/markdown/hip_bugs.md @@ -45,6 +45,8 @@ To correct, add the following flag to hcc or hipcc: $ hipcc -Wl,-Bsymbolic ... ``` +Ensure there is no space in the "Wl,-Bsymbolic" option. + ### What is the current limitation of HIP Generic Grid Launch method? 1. __global__ functions cannot be marked as static or put in an unnamed namespace i.e. they cannot be given internal linkage (this would clash with __attribute__((weak))); diff --git a/projects/hip/src/hip_memory.cpp b/projects/hip/src/hip_memory.cpp index c4bc7db096..1ba698f461 100644 --- a/projects/hip/src/hip_memory.cpp +++ b/projects/hip/src/hip_memory.cpp @@ -104,8 +104,8 @@ void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, unsig auto device = ctx->getWriteableDevice(); ptr = hc::am_alloc(sizeBytes, device->_acc, amFlags); - tprintf(DB_MEM, " alloc %s ptr:%p size:%zu on dev:%d\n", - msg, ptr, sizeBytes, device->_deviceId); + tprintf(DB_MEM, " alloc %s ptr:%p-%p size:%zu on dev:%d\n", + msg, ptr, static_cast(ptr)+sizeBytes, sizeBytes, device->_deviceId); if (ptr != nullptr) { int r = sharePtr(ptr, ctx, hipFlags); From bdce32238f94552998fcefce673882231bb4b2e6 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Tue, 9 May 2017 10:14:16 -0500 Subject: [PATCH 17/77] added guard against hip_runtime.h so that non-hcc compilers can use it Change-Id: I3d68deda9ce8a5956e21e15a69e549d6c21e3e39 [ROCm/hip commit: a38e36ec2f3905a3701619b6e72c23a39c9709c1] --- projects/hip/include/hip/hcc_detail/hip_runtime.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/projects/hip/include/hip/hcc_detail/hip_runtime.h b/projects/hip/include/hip/hcc_detail/hip_runtime.h index 06ce65bc9a..4d8876d8f4 100644 --- a/projects/hip/include/hip/hcc_detail/hip_runtime.h +++ b/projects/hip/include/hip/hcc_detail/hip_runtime.h @@ -41,6 +41,8 @@ THE SOFTWARE. #include #endif//__cplusplus +#if __HCC__ + // Define NVCC_COMPAT for CUDA compatibility #define NVCC_COMPAT #define CUDA_SUCCESS hipSuccess @@ -481,6 +483,6 @@ do {\ */ - +#endif #endif//HIP_HCC_DETAIL_RUNTIME_H From 5d558861c205c0ca7bfb140196abf0be456ac2d3 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Wed, 10 May 2017 13:23:49 -0500 Subject: [PATCH 18/77] Fix hipStreamWaitEvent for single GPU. [ROCm/hip commit: e0c3ea15b294ba8d6f1404b32a443489db38d6ca] --- projects/hip/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/hip/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp b/projects/hip/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp index 1d9ec45685..adf0d4af0c 100644 --- a/projects/hip/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp +++ b/projects/hip/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp @@ -395,7 +395,7 @@ int main(int argc, char *argv[]) int numDevices; HIPCHECK(hipGetDeviceCount(&numDevices)); - numDevices =2; // TODO - remove me. + numDevices = min(2, numDevices); // multi-GPU to 2 device. for (int d=0; d Date: Wed, 10 May 2017 17:32:25 -0500 Subject: [PATCH 19/77] hipHostMalloc allocation are mapped to all devices by default. Support hipHostMallocPortable flag. Default flags are hipHostMallocPortable | hipHostMallocMapped. Also: -refactor tests to move addCount and addCountReverse into HipTest namespace. -test multi-GPU host memory. [ROCm/hip commit: c3ccaa01e548cc29dc0f6347584ae27385b4a981] --- projects/hip/src/hip_hcc.cpp | 9 ++ projects/hip/src/hip_hcc_internal.h | 1 + projects/hip/src/hip_memory.cpp | 58 ++++---- projects/hip/tests/src/hipPointerAttrib.cpp | 2 +- .../runtimeApi/memory/hipMemoryAllocate.cpp | 129 +++++++++++++----- .../runtimeApi/stream/hipStreamWaitEvent.cpp | 40 +----- projects/hip/tests/src/test_common.h | 38 ++++++ 7 files changed, 181 insertions(+), 96 deletions(-) diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 71d947488d..81a2079b5b 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -118,6 +118,7 @@ bool g_visible_device = false; unsigned g_deviceCnt; std::vector g_hip_visible_devices; hsa_agent_t g_cpu_agent; +hsa_agent_t *g_allAgents; // CPU agents + all the visible GPU agents. unsigned g_numLogicalThreads; std::atomic g_lastShortTid(1); @@ -1389,6 +1390,14 @@ void ihipInit() g_deviceCnt++; } } + + g_allAgents = static_cast (malloc((g_deviceCnt+1) * sizeof(hsa_agent_t))); + g_allAgents[0] = g_cpu_agent; + for (int i=0; i_hsaAgent; + } + + g_numLogicalThreads = std::thread::hardware_concurrency(); // If HIP_VISIBLE_DEVICES is not set, make sure all devices are initialized diff --git a/projects/hip/src/hip_hcc_internal.h b/projects/hip/src/hip_hcc_internal.h index 9c17c6e98c..132f099ce8 100644 --- a/projects/hip/src/hip_hcc_internal.h +++ b/projects/hip/src/hip_hcc_internal.h @@ -826,6 +826,7 @@ private: // Critical data, protected with locked access: extern std::once_flag hip_initialized; extern unsigned g_deviceCnt; extern hsa_agent_t g_cpu_agent ; // the CPU agent. +extern hsa_agent_t *g_allAgents; // CPU agents + all the visible GPU agents. //================================================================================================= // Extern functions: diff --git a/projects/hip/src/hip_memory.cpp b/projects/hip/src/hip_memory.cpp index 1ba698f461..c4f0f64e50 100644 --- a/projects/hip/src/hip_memory.cpp +++ b/projects/hip/src/hip_memory.cpp @@ -59,31 +59,40 @@ hipError_t memcpyAsync (void* dst, const void* src, size_t sizeBytes, hipMemcpyK } // return 0 on success or -1 on error: -int sharePtr(void *ptr, ihipCtx_t *ctx, unsigned hipFlags) +int sharePtr(void *ptr, ihipCtx_t *ctx, bool shareWithAll, unsigned hipFlags) { int ret = 0; auto device = ctx->getWriteableDevice(); hc::am_memtracker_update(ptr, device->_deviceId, hipFlags); - int peerCnt=0; - { - LockedAccessor_CtxCrit_t crit(ctx->criticalData()); - // the peerCnt always stores self so make sure the trace actually - peerCnt = crit->peerCnt(); - tprintf(DB_MEM, " allow access to %d other peer(s)\n", peerCnt-1); - if (peerCnt > 1) { - //printf ("peer self access\n"); + if (shareWithAll) { + hsa_status_t s = hsa_amd_agents_allow_access(g_deviceCnt+1, g_allAgents, NULL, ptr); + tprintf (DB_MEM, " allow access to CPU + all %d GPUs (shareWithAll)\n", g_deviceCnt); + if (s != HSA_STATUS_SUCCESS) { + ret = -1; + } + } else { + int peerCnt=0; + { + LockedAccessor_CtxCrit_t crit(ctx->criticalData()); + // the peerCnt always stores self so make sure the trace actually + peerCnt = crit->peerCnt(); + tprintf(DB_MEM, " allow access to %d other peer(s)\n", peerCnt-1); + if (peerCnt > 1) { - // TODOD - remove me: - for (auto iter = crit->_peers.begin(); iter!=crit->_peers.end(); iter++) { - tprintf (DB_MEM, " allow access to peer: %s%s\n", (*iter)->toString().c_str(), (iter == crit->_peers.begin()) ? " (self)":""); - }; + //printf ("peer self access\n"); - hsa_status_t s = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, ptr); - if (s != HSA_STATUS_SUCCESS) { - ret = -1; + // TODOD - remove me: + for (auto iter = crit->_peers.begin(); iter!=crit->_peers.end(); iter++) { + tprintf (DB_MEM, " allow access to peer: %s%s\n", (*iter)->toString().c_str(), (iter == crit->_peers.begin()) ? " (self)":""); + }; + + hsa_status_t s = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, ptr); + if (s != HSA_STATUS_SUCCESS) { + ret = -1; + } } } } @@ -96,7 +105,7 @@ int sharePtr(void *ptr, ihipCtx_t *ctx, unsigned hipFlags) // Allocate a new pointer with am_alloc and share with all valid peers. // Returns null-ptr if a memory error occurs (either allocation or sharing) -void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, unsigned amFlags, unsigned hipFlags) +void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, bool shareWithAll, unsigned amFlags, unsigned hipFlags) { void *ptr = nullptr; @@ -108,7 +117,7 @@ void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, unsig msg, ptr, static_cast(ptr)+sizeBytes, sizeBytes, device->_deviceId); if (ptr != nullptr) { - int r = sharePtr(ptr, ctx, hipFlags); + int r = sharePtr(ptr, ctx, shareWithAll, hipFlags); if (r != 0) { ptr = nullptr; } @@ -220,7 +229,7 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) } else { auto device = ctx->getWriteableDevice(); - *ptr = hip_internal::allocAndSharePtr("device_mem", sizeBytes, ctx, 0/*amFlags*/, 0/*hipFlags*/); + *ptr = hip_internal::allocAndSharePtr("device_mem", sizeBytes, ctx, false/*shareWithAll*/, 0/*amFlags*/, 0/*hipFlags*/); if(sizeBytes && (*ptr == NULL)){ hip_status = hipErrorMemoryAllocation; @@ -253,7 +262,8 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) } else { unsigned trueFlags = flags; if (flags == hipHostMallocDefault) { - trueFlags = hipHostMallocMapped | hipHostMallocWriteCombined; + // HCC/ROCM provide a modern system with unified memory and should set both of these flags by default: + trueFlags = hipHostMallocMapped | hipHostMallocPortable; } const unsigned supportedFlags = hipHostMallocPortable | hipHostMallocMapped | hipHostMallocWriteCombined; @@ -265,8 +275,10 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) auto device = ctx->getWriteableDevice(); unsigned amFlags = HIP_COHERENT_HOST_ALLOC ? amHostCoherent : amHostPinned; + *ptr = hip_internal::allocAndSharePtr(HIP_COHERENT_HOST_ALLOC ? "finegrained_host":"pinned_host", - sizeBytes, ctx, amFlags, flags); + sizeBytes, ctx, (trueFlags & hipHostMallocPortable) /*shareWithAll*/, amFlags, flags); + if(sizeBytes && (*ptr == NULL)){ hip_status = hipErrorMemoryAllocation; } @@ -314,7 +326,7 @@ hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height auto device = ctx->getWriteableDevice(); const unsigned am_flags = 0; - *ptr = hip_internal::allocAndSharePtr("device_pitch", sizeBytes, ctx, am_flags, 0); + *ptr = hip_internal::allocAndSharePtr("device_pitch", sizeBytes, ctx, false/*shareWithAll*/, am_flags, 0); if (sizeBytes && (*ptr == NULL)) { hip_status = hipErrorMemoryAllocation; @@ -373,7 +385,7 @@ hipError_t hipMallocArray(hipArray** array, const hipChannelFormatDesc* desc, hip_status = hipErrorUnknown; break; } - *ptr = hip_internal::allocAndSharePtr("device_array", allocSize, ctx, am_flags, 0); + *ptr = hip_internal::allocAndSharePtr("device_array", allocSize, ctx, false/*shareWithAll*/, am_flags, 0); if (size && (*ptr == NULL)) { hip_status = hipErrorMemoryAllocation; } diff --git a/projects/hip/tests/src/hipPointerAttrib.cpp b/projects/hip/tests/src/hipPointerAttrib.cpp index fb7832d9a6..7a2ab64bea 100644 --- a/projects/hip/tests/src/hipPointerAttrib.cpp +++ b/projects/hip/tests/src/hipPointerAttrib.cpp @@ -99,7 +99,7 @@ inline int zrand(int max) //================================================================================================= -// Functins to run tests +// Functions to run tests //================================================================================================= //-- //Run through a couple simple cases to test lookups and host pointer arithmetic: diff --git a/projects/hip/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp b/projects/hip/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp index 0a256d6362..1ee5cbc9bb 100644 --- a/projects/hip/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp +++ b/projects/hip/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp @@ -25,45 +25,106 @@ THE SOFTWARE. #include"test_common.h" -#define SIZE 1024*1024*256 +#define NUM_ELEMENTS 1024*1024*64 +#define SIZE NUM_ELEMENTS*sizeof(int) -int main(){ - float *Ad, *B, *Bd, *Bm, *C, *Cd, *ptr_0; - B = (float*)malloc(SIZE); - hipMalloc((void**)&Ad, SIZE); - hipHostMalloc((void**)&B, SIZE); - hipHostMalloc((void**)&Bd, SIZE, hipHostMallocDefault); - hipHostMalloc((void**)&Bm, SIZE, hipHostMallocMapped); - hipHostMalloc((void**)&C, SIZE, hipHostMallocMapped); - - hipHostGetDevicePointer((void**)&Cd, C, 0/*flags*/); - - HIPCHECK_API(hipMalloc((void**)&ptr_0,0), hipSuccess); - - HIPCHECK_API(hipFree(Ad) , hipSuccess); - HIPCHECK_API(hipHostFree(Ad) , hipErrorInvalidValue); - - HIPCHECK_API(hipFree(B) , hipErrorInvalidDevicePointer); // try to hipFree on malloced memory - HIPCHECK_API(hipFree(Bd) , hipErrorInvalidDevicePointer); - HIPCHECK_API(hipFree(Bm) , hipErrorInvalidDevicePointer); - HIPCHECK_API(hipFree(ptr_0) , hipSuccess); - HIPCHECK_API(hipHostFree(Bd) , hipSuccess); - HIPCHECK_API(hipHostFree(Bm) , hipSuccess); - - HIPCHECK_API(hipFree(C) , hipErrorInvalidDevicePointer); - HIPCHECK_API(hipHostFree(C) , hipSuccess); +int p_count = 4; - HIPCHECK_API(hipFree(NULL) , hipSuccess); - HIPCHECK_API(hipHostFree(NULL) , hipSuccess); +void multiGpuHostAlloc(int allocDevice) +{ + + int numDevices; + HIPCHECK(hipGetDeviceCount(&numDevices)); + assert(numDevices > 1); + + printf ("info: trying multiGpuHostAlloc with allocDevice=%d numDevices=%d\n", allocDevice, numDevices); + + + HIPCHECK(hipSetDevice(allocDevice)); + + int *Ah, *Ch; + hipHostMalloc((void**)&Ah, SIZE); + hipHostMalloc((void**)&Ch, SIZE); + + const int init = -1; + for (size_t i=0; i 1); + + multiGpuHostAlloc(0); + multiGpuHostAlloc(1); } passed(); diff --git a/projects/hip/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp b/projects/hip/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp index adf0d4af0c..80ff7ad98d 100644 --- a/projects/hip/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp +++ b/projects/hip/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp @@ -35,42 +35,6 @@ unsigned p_count = 100; -template -__global__ void -addCount( const T *A_d, - T *C_d, - size_t NELEM, - int count) -{ - size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); - size_t stride = hipBlockDim_x * hipGridDim_x ; - - // Deliberately do this in an inefficient way to increase kernel runtime - for (int i=0; i -__global__ void -addCountReverse( const T *A_d, - T *C_d, - int64_t NELEM, - int count) -{ - size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); - size_t stride = hipBlockDim_x * hipGridDim_x ; - - // Deliberately do this in an inefficient way to increase kernel runtime - for (int i=0; i=0; i-=stride) { - C_d[i] = A_d[i] + (T)count; - } - } -} //------ @@ -171,9 +135,9 @@ void Streamer::runAsyncAfter(Streamer *depStreamer, bool waitSameStream) unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements); if (_reverse) { - hipLaunchKernelGGL(addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count); + hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count); } else { - hipLaunchKernelGGL(addCount, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count); + hipLaunchKernelGGL(HipTest::addCount, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count); } HIPCHECK(hipEventRecord(_event, _stream)); diff --git a/projects/hip/tests/src/test_common.h b/projects/hip/tests/src/test_common.h index 1250de4801..633ee6f825 100644 --- a/projects/hip/tests/src/test_common.h +++ b/projects/hip/tests/src/test_common.h @@ -146,6 +146,44 @@ vectorADD(hipLaunchParm lp, } +template +__global__ void +addCount( const T *A_d, + T *C_d, + size_t NELEM, + int count) +{ + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; + + // Deliberately do this in an inefficient way to increase kernel runtime + for (int i=0; i +__global__ void +addCountReverse( const T *A_d, + T *C_d, + int64_t NELEM, + int count) +{ + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; + + // Deliberately do this in an inefficient way to increase kernel runtime + for (int i=0; i=0; i-=stride) { + C_d[i] = A_d[i] + (T)count; + } + } +} + + template void initArraysForHost(T **A_h, T **B_h, T **C_h, size_t N, bool usePinnedHost=false) From 3408458f9f2814771a4c8f1594334de17f2b23fd Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Thu, 11 May 2017 21:50:36 +0300 Subject: [PATCH 20/77] [HIPIFY] Fix string routines. Some Clang tooling functions return std::string, some return StringRef. Assigning of returning std::string to StringRef variables leads to garbage in it. DEBUG build is always affected. [ROCm/hip commit: 25d470c3801a82054461972d7b2f478115920647] --- projects/hip/hipify-clang/src/Cuda2Hip.cpp | 51 +++++++++++----------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/projects/hip/hipify-clang/src/Cuda2Hip.cpp b/projects/hip/hipify-clang/src/Cuda2Hip.cpp index 1c6406b4ab..47434babac 100644 --- a/projects/hip/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/hip/hipify-clang/src/Cuda2Hip.cpp @@ -2690,7 +2690,7 @@ private: bool cudaCall(const MatchFinder::MatchResult &Result) { if (const CallExpr *call = Result.Nodes.getNodeAs("cudaCall")) { const FunctionDecl *funcDcl = call->getDirectCallee(); - StringRef name = funcDcl->getDeclName().getAsString(); + std::string name = funcDcl->getDeclName().getAsString(); SourceManager *SM = Result.SourceManager; SourceLocation sl = call->getLocStart(); const auto found = N.cuda2hipRename.find(name); @@ -2714,16 +2714,16 @@ private: } } if (bReplace) { - updateCounters(found->second, name.str()); + updateCounters(found->second, name); Replacement Rep(*SM, sl, length, repName); FullSourceLoc fullSL(sl, *SM); insertReplacement(Rep, fullSL); } } else { - updateCounters(found->second, name.str()); + updateCounters(found->second, name); } } else { - std::string msg = "the following reference is not handled: '" + name.str() + "' [function call]."; + std::string msg = "the following reference is not handled: '" + name + "' [function call]."; printHipifyMessage(*SM, sl, msg); } return true; @@ -2838,7 +2838,7 @@ private: bool cudaEnumConstantRef(const MatchFinder::MatchResult &Result) { if (const DeclRefExpr *enumConstantRef = Result.Nodes.getNodeAs("cudaEnumConstantRef")) { - StringRef name = enumConstantRef->getDecl()->getNameAsString(); + StringRef name = enumConstantRef->getDecl()->getName(); SourceLocation sl = enumConstantRef->getLocStart(); SourceManager *SM = Result.SourceManager; const auto found = N.cuda2hipRename.find(name); @@ -2861,10 +2861,10 @@ private: bool cudaEnumDecl(const MatchFinder::MatchResult &Result) { if (const VarDecl *enumDecl = Result.Nodes.getNodeAs("cudaEnumDecl")) { - StringRef name = enumDecl->getType()->getAsTagDecl()->getNameAsString(); + std::string name = enumDecl->getType()->getAsTagDecl()->getNameAsString(); QualType QT = enumDecl->getType().getUnqualifiedType(); - StringRef name_unqualified = QT.getAsString(); - if ((name_unqualified.find(' ') == StringRef::npos && name.find(' ') == StringRef::npos) || name.empty()) { + std::string name_unqualified = QT.getAsString(); + if ((name_unqualified.find(' ') == std::string::npos && name.find(' ') == std::string::npos) || name.empty()) { name = name_unqualified; } // Workaround for enum VarDecl as param decl, declared with enum type specifier @@ -2882,7 +2882,7 @@ private: //------------------------------------------------- const auto found = N.cuda2hipRename.find(name); if (found != N.cuda2hipRename.end()) { - updateCounters(found->second, name.str()); + updateCounters(found->second, name); if (!found->second.unsupported) { StringRef repName = found->second.hipName; Replacement Rep(*SM, sl, name.size(), repName); @@ -2890,7 +2890,7 @@ private: insertReplacement(Rep, fullSL); } } else { - std::string msg = "the following reference is not handled: '" + name.str() + "' [enum constant decl]."; + std::string msg = "the following reference is not handled: '" + name + "' [enum constant decl]."; printHipifyMessage(*SM, sl, msg); } return true; @@ -2905,12 +2905,12 @@ private: QT = QT.getTypePtr()->getAsArrayTypeUnsafe()->getElementType(); } QT = QT.getUnqualifiedType(); - StringRef name = QT.getAsString(); + std::string name = QT.getAsString(); SourceLocation sl = typedefVar->getLocStart(); SourceManager *SM = Result.SourceManager; const auto found = N.cuda2hipRename.find(name); if (found != N.cuda2hipRename.end()) { - updateCounters(found->second, name.str()); + updateCounters(found->second, name); if (!found->second.unsupported) { StringRef repName = found->second.hipName; Replacement Rep(*SM, sl, name.size(), repName); @@ -2918,7 +2918,7 @@ private: insertReplacement(Rep, fullSL); } } else { - std::string msg = "the following reference is not handled: '" + name.str() + "' [typedef var]."; + std::string msg = "the following reference is not handled: '" + name + "' [typedef var]."; printHipifyMessage(*SM, sl, msg); } return true; @@ -2935,10 +2935,10 @@ private: SourceLocation sl = TL.getUnqualifiedLoc().getLocStart(); QualType QT = t->getPointeeType(); QT = QT.getUnqualifiedType(); - StringRef name = QT.getAsString(); + std::string name = QT.getAsString(); const auto found = N.cuda2hipRename.find(name); if (found != N.cuda2hipRename.end()) { - updateCounters(found->second, name.str()); + updateCounters(found->second, name); if (!found->second.unsupported) { StringRef repName = found->second.hipName; Replacement Rep(*SM, sl, name.size(), repName); @@ -2947,7 +2947,7 @@ private: } } else { - std::string msg = "the following reference is not handled: '" + name.str() + "' [typedef var ptr]."; + std::string msg = "the following reference is not handled: '" + name + "' [typedef var ptr]."; printHipifyMessage(*SM, sl, msg); } } @@ -2961,13 +2961,13 @@ private: QualType QT = structVar->getType(); // ToDo: find case-studies with types other than Struct. if (QT->isStructureType()) { - StringRef name = QT.getTypePtr()->getAsStructureType()->getDecl()->getNameAsString(); + std::string name = QT.getTypePtr()->getAsStructureType()->getDecl()->getNameAsString(); TypeLoc TL = structVar->getTypeSourceInfo()->getTypeLoc(); SourceLocation sl = TL.getUnqualifiedLoc().getLocStart(); SourceManager *SM = Result.SourceManager; const auto found = N.cuda2hipRename.find(name); if (found != N.cuda2hipRename.end()) { - updateCounters(found->second, name.str()); + updateCounters(found->second, name); if (!found->second.unsupported) { StringRef repName = found->second.hipName; Replacement Rep(*SM, sl, name.size(), repName); @@ -2976,7 +2976,7 @@ private: } } else { - std::string msg = "the following reference is not handled: '" + name.str() + "' [struct var]."; + std::string msg = "the following reference is not handled: '" + name + "' [struct var]."; printHipifyMessage(*SM, sl, msg); } } @@ -3049,7 +3049,7 @@ private: // Example: extern __shared__ uint sRadix1[]; if (sharedVar->hasExternalFormalLinkage()) { QualType QT = sharedVar->getType(); - StringRef typeName; + std::string typeName; if (QT->isIncompleteArrayType()) { const ArrayType *AT = QT.getTypePtr()->getAsArrayTypeUnsafe(); QT = AT->getElementType(); @@ -3071,9 +3071,8 @@ private: SourceLocation slEnd = sharedVar->getLocEnd(); SourceManager *SM = Result.SourceManager; size_t repLength = SM->getCharacterData(slEnd) - SM->getCharacterData(slStart) + 1; - SmallString<128> tmpData; - StringRef varName = sharedVar->getNameAsString(); - StringRef repName = Twine("HIP_DYNAMIC_SHARED(" + typeName + ", " + varName + ")").toStringRef(tmpData); + std::string varName = sharedVar->getNameAsString(); + std::string repName = "HIP_DYNAMIC_SHARED(" + typeName + ", " + varName + ")"; Replacement Rep(*SM, slStart, repLength, repName); FullSourceLoc fullSL(slStart, *SM); insertReplacement(Rep, fullSL); @@ -3089,7 +3088,7 @@ private: bool cudaParamDecl(const MatchFinder::MatchResult &Result) { if (const ParmVarDecl *paramDecl = Result.Nodes.getNodeAs("cudaParamDecl")) { QualType QT = paramDecl->getOriginalType().getUnqualifiedType(); - StringRef name = QT.getAsString(); + std::string name = QT.getAsString(); const Type *t = QT.getTypePtr(); if (t->isStructureOrClassType()) { name = t->getAsCXXRecordDecl()->getName(); @@ -3099,7 +3098,7 @@ private: SourceManager *SM = Result.SourceManager; const auto found = N.cuda2hipRename.find(name); if (found != N.cuda2hipRename.end()) { - updateCounters(found->second, name.str()); + updateCounters(found->second, name); if (!found->second.unsupported) { StringRef repName = found->second.hipName; Replacement Rep(*SM, sl, name.size(), repName); @@ -3107,7 +3106,7 @@ private: insertReplacement(Rep, fullSL); } } else { - std::string msg = "the following reference is not handled: '" + name.str() + "' [param decl]."; + std::string msg = "the following reference is not handled: '" + name + "' [param decl]."; printHipifyMessage(*SM, sl, msg); } return true; From b1b71c4c75a8b30f26a6c7ee23ed99375198091b Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 12 May 2017 15:57:32 -0500 Subject: [PATCH 21/77] Add hipEventDisableSystemRelease flag. [ROCm/hip commit: b306095ac28d7e399487b5d72df7ed7fcd340150] --- projects/hip/include/hip/hcc_detail/hip_runtime_api.h | 1 + projects/hip/include/hip/nvcc_detail/hip_runtime_api.h | 2 ++ projects/hip/src/hip_hcc.cpp | 10 +++++++++- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/projects/hip/include/hip/hcc_detail/hip_runtime_api.h b/projects/hip/include/hip/hcc_detail/hip_runtime_api.h index 9cfd21c1d2..175fd64d29 100644 --- a/projects/hip/include/hip/hcc_detail/hip_runtime_api.h +++ b/projects/hip/include/hip/hcc_detail/hip_runtime_api.h @@ -106,6 +106,7 @@ enum hipLimit_t #define hipEventBlockingSync 0x1 ///< Waiting will yield CPU. Power-friendly and usage-friendly but may increase latency. #define hipEventDisableTiming 0x2 ///< Disable event's capability to record timing information. May improve performance. #define hipEventInterprocess 0x4 ///< Event can support IPC. @warning - not supported in HIP. +#define hipEventDisableSystemRelease 0x80000000 /// < Disable the system-scope release that event normally performs when it records. This flag is useful to obtain more precise timings of commands between events. The flag is a no-op on CUDA platforms. //! Flags that can be used with hipHostMalloc diff --git a/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h b/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h index aad3ffcc44..e9f926b336 100644 --- a/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h +++ b/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h @@ -58,6 +58,8 @@ hipMemcpyHostToHost #define hipEventBlockingSync cudaEventBlockingSync #define hipEventDisableTiming cudaEventDisableTiming #define hipEventInterprocess cudaEventInterprocess +#define hipEventDisableSystemRelease cudaEventDefault /* no-op on CUDA platform */ + #define hipHostMallocDefault cudaHostAllocDefault #define hipHostMallocPortable cudaHostAllocPortable diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 81a2079b5b..e936cf3af3 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -56,6 +56,9 @@ THE SOFTWARE. #define USE_ROCR_1_4 1 #endif +// needs HCC change for hc::no_scope +#define USE_NO_SCOPE 0 + //================================================================================================= //Global variables: //================================================================================================= @@ -364,8 +367,13 @@ void ihipStream_t::locked_recordEvent(hipEvent_t event) LockedAccessor_StreamCrit_t crit(_criticalData); this->ensureHaveQueue(crit); +#if USE_NO_SCOPE + printf ("create_marker, flags = %x\n", event->_flags); + event->_marker = crit->_av.create_marker((event->_flags & hipEventDisableSystemRelease) ? hc::no_scope : hc::system_scope); +#else event->_marker = crit->_av.create_marker(); -} +#endif +}; //============================================================================= From be423aa0341e0843814b1b6e2858557f6178aad4 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 12 May 2017 16:05:28 -0500 Subject: [PATCH 22/77] Remove old USE_ switches no longer needed. [ROCm/hip commit: 0679831384072c4f9ddeff128f0c284c31887de8] --- projects/hip/src/hip_hcc.cpp | 32 -------------------------------- 1 file changed, 32 deletions(-) diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index e936cf3af3..12f1792c33 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -48,14 +48,6 @@ THE SOFTWARE. #include "env.h" -#ifndef USE_COPY_EXT_V2 -#define USE_COPY_EXT_V2 1 -#endif - -#ifndef USE_ROCR_1_4 -#define USE_ROCR_1_4 1 -#endif - // needs HCC change for hc::no_scope #define USE_NO_SCOPE 0 @@ -107,10 +99,6 @@ int HCC_OPT_FLUSH = 0; -#define HIP_USE_PRODUCT_NAME 1 -//#define DISABLE_COPY_EXT 1 - - std::once_flag hip_initialized; // Array of pointers to devices. @@ -857,11 +845,7 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop) // Get Max Threads Per Multiprocessor uint32_t max_waves_per_cu; -#if USE_ROCR_1_4 err = hsa_agent_get_info(_hsaAgent,(hsa_agent_info_t) HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, &max_waves_per_cu); -#else - max_waves_per_cu = 10; -#endif DeviceErrorCheck(err); prop-> maxThreadsPerMultiProcessor = prop->warpSize*max_waves_per_cu; @@ -1919,11 +1903,7 @@ void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, this->ensureHaveQueue(crit); -#if USE_COPY_EXT_V2 crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, copyDevice ? ©Device->getDevice()->_acc : nullptr, forceUnpinnedCopy); -#else - crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, forceUnpinnedCopy); -#endif } } @@ -2031,18 +2011,10 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes this->ensureHaveQueue(crit); if (HIP_FORCE_SYNC_COPY) { -#if USE_COPY_EXT_V2 crit->_av.copy_ext (src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, ©Device->getDevice()->_acc, forceUnpinnedCopy); -#else - crit->_av.copy_ext (src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, forceUnpinnedCopy); -#endif } else { -#if USE_COPY_EXT_V2 crit->_av.copy_async_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, ©Device->getDevice()->_acc); -#else - crit->_av.copy_async(src, dst, sizeBytes); -#endif } } catch (Kalmar::runtime_exception) { throw ihipException(hipErrorRuntimeOther); @@ -2075,11 +2047,7 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes this->ensureHaveQueue(crit); -#if USE_COPY_EXT_V2 crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, copyDevice ? ©Device->getDevice()->_acc : nullptr, forceUnpinnedCopy); -#else - crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, forceUnpinnedCopy); -#endif } } } From 62e5cda4afc5589f80a00443164068b3b924d709 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Fri, 12 May 2017 21:43:34 -0500 Subject: [PATCH 23/77] added gfx900 to hipDeviceProp_t Change-Id: I49e7a32f218926fd55f1c94c5dc2366d6c8ac4ca [ROCm/hip commit: a43149135e104cc03e582a68060f29374d080106] --- projects/hip/src/hip_hcc.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 12f1792c33..a655e35aa1 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -772,6 +772,9 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop) if(strcmp(archName,"gfx803")==0){ prop->gcnArch = 803; } + if(strcmp(archName,"gfx900")==0){ + prop->gcnArch = 900; + } DeviceErrorCheck(err); From 6b76c979cea235d84ffe948e9fdfbed0648033a7 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Mon, 15 May 2017 15:35:52 +0300 Subject: [PATCH 24/77] [HIPIFY] CUDA Driver API: Primary Context Management support. [ROCm/hip commit: a97cb6810cd6a1aa3af708f6726358ebecb2ed80] --- projects/hip/hipify-clang/src/Cuda2Hip.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/projects/hip/hipify-clang/src/Cuda2Hip.cpp b/projects/hip/hipify-clang/src/Cuda2Hip.cpp index 47434babac..72cbcaf52a 100644 --- a/projects/hip/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/hip/hipify-clang/src/Cuda2Hip.cpp @@ -858,6 +858,13 @@ struct cuda2hipMap { cuda2hipRename["cuCtxSetLimit"] = {"hipCtxSetLimit", CONV_CONTEXT, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuCtxGetLimit"] = {"hipCtxGetLimit", CONV_CONTEXT, API_DRIVER, HIP_UNSUPPORTED}; + // Primary Context Management + cuda2hipRename["cuDevicePrimaryCtxGetState"] = {"hipDevicePrimaryCtxGetState", CONV_CONTEXT, API_DRIVER}; + cuda2hipRename["cuDevicePrimaryCtxRelease"] = {"hipDevicePrimaryCtxRelease", CONV_CONTEXT, API_DRIVER}; + cuda2hipRename["cuDevicePrimaryCtxRetain"] = {"hipDevicePrimaryCtxRetain", CONV_CONTEXT, API_DRIVER}; + cuda2hipRename["cuDevicePrimaryCtxReset"] = {"hipDevicePrimaryCtxReset", CONV_CONTEXT, API_DRIVER}; + cuda2hipRename["cuDevicePrimaryCtxSetFlags"] = {"hipDevicePrimaryCtxSetFlags", CONV_CONTEXT, API_DRIVER}; + // Device cuda2hipRename["cuDeviceGet"] = {"hipGetDevice", CONV_DEV, API_DRIVER}; cuda2hipRename["cuDeviceGetName"] = {"hipDeviceGetName", CONV_DEV, API_DRIVER}; From c6a2d65d5e8f7c3139ea4dbc6c14f4ac4f68374c Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Tue, 16 May 2017 07:15:13 +0530 Subject: [PATCH 25/77] Added hipMallocPitch on HIP/NVCC path Change-Id: Ie3ba7d3f95acac23805efa919531043b350a3f21 [ROCm/hip commit: 1223612331678dae749aec6a125efdf5ccd9e007] --- projects/hip/include/hip/nvcc_detail/hip_runtime_api.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h b/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h index e9f926b336..69a9b46570 100644 --- a/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h +++ b/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h @@ -204,6 +204,10 @@ inline static hipError_t hipMalloc(void** ptr, size_t size) { return hipCUDAErrorTohipError(cudaMalloc(ptr, size)); } +inline static hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height) { + return hipCUDAErrorTohipError(cudaMallocPitch(ptr, pitch, width, height)); +} + inline static hipError_t hipFree(void* ptr) { return hipCUDAErrorTohipError(cudaFree(ptr)); } From 1a183a71a578641017c1ae2cd1b75706bf4090c4 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Tue, 16 May 2017 18:21:25 +0300 Subject: [PATCH 26/77] [HIPIFY] cudaMallocPitch -> hipMallocPitch [ROCm/hip commit: 12d8c53c900b7c8c3ed49d1cef80618c59f6860e] --- projects/hip/hipify-clang/src/Cuda2Hip.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/hip/hipify-clang/src/Cuda2Hip.cpp b/projects/hip/hipify-clang/src/Cuda2Hip.cpp index 72cbcaf52a..0c6b0f1efc 100644 --- a/projects/hip/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/hip/hipify-clang/src/Cuda2Hip.cpp @@ -1088,7 +1088,7 @@ struct cuda2hipMap { cuda2hipRename["cudaMalloc3DArray"] = {"hipMalloc3DArray", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaMallocManaged"] = {"hipMallocManaged", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaMallocMipmappedArray"] = {"hipMallocMipmappedArray", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaMallocPitch"] = {"hipMallocPitch", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; + cuda2hipRename["cudaMallocPitch"] = {"hipMallocPitch", CONV_MEM, API_RUNTIME}; cuda2hipRename["cudaFree"] = {"hipFree", CONV_MEM, API_RUNTIME}; cuda2hipRename["cudaFreeHost"] = {"hipHostFree", CONV_MEM, API_RUNTIME}; From dce8786e26f90a7a4247722edaeeb3e0be7c6164 Mon Sep 17 00:00:00 2001 From: emankov Date: Tue, 16 May 2017 19:52:39 +0300 Subject: [PATCH 27/77] [HIPIFY] *.inl extension support for batch processing [ROCm/hip commit: 30000ef130cdd0d086812f0693af36af6c44fe75] --- projects/hip/bin/findcode.sh | 2 +- projects/hip/bin/hipexamine.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/hip/bin/findcode.sh b/projects/hip/bin/findcode.sh index a2334b3e2d..d092d6bf8d 100755 --- a/projects/hip/bin/findcode.sh +++ b/projects/hip/bin/findcode.sh @@ -2,4 +2,4 @@ SEARCH_DIRS=$@ -find $SEARCH_DIRS -name '*.cpp' -o -name '*.h' -o -name '*.cu' -o -name '*.cuh' -o -name '*.c' -o -name '*.hpp' +find $SEARCH_DIRS -name '*.cpp' -o -name '*.h' -o -name '*.cu' -o -name '*.cuh' -o -name '*.c' -o -name '*.hpp' -o -name '*.inl' diff --git a/projects/hip/bin/hipexamine.sh b/projects/hip/bin/hipexamine.sh index 2a6fab7110..79e1b469f9 100755 --- a/projects/hip/bin/hipexamine.sh +++ b/projects/hip/bin/hipexamine.sh @@ -1,6 +1,6 @@ #!/bin/bash -#usage : hipexamine2.sh DIRNAME [hipify options] [--] [clang options] +#usage : hipexamine.sh DIRNAME [hipify options] [--] [clang options] # Generate CUDA->HIP conversion statistics for all the code files in the specified directory. From 2cf05ad54bd14800fa49e10d103ef4faa08c0b8f Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 27 Jan 2017 11:21:08 -0600 Subject: [PATCH 28/77] Add HIP_TRACE_API=4. Only display memory allocation/free apis. [ROCm/hip commit: 0edab14139c564781e85febb603957a09590f9b6] --- projects/hip/README.md | 1 + projects/hip/docs/markdown/hip_profiling.md | 5 +++ projects/hip/src/hip_hcc.cpp | 2 +- projects/hip/src/hip_hcc_internal.h | 16 +++---- projects/hip/src/hip_memory.cpp | 46 ++++++++++----------- 5 files changed, 39 insertions(+), 31 deletions(-) diff --git a/projects/hip/README.md b/projects/hip/README.md index f61c3b106a..d54032c6df 100644 --- a/projects/hip/README.md +++ b/projects/hip/README.md @@ -32,6 +32,7 @@ HIP releases are typically of two types. The tag naming convention is different - [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP) - [HIP Porting Guide](docs/markdown/hip_porting_guide.md) - [HIP Porting Driver Guide](docs/markdown/hip_porting_driver_api.md) +- [HIP Profiling and Debugging](docs/markdown/hip_profiling.md) - [HIP Terminology](docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL) - [hipify-clang](hipify-clang/README.md) - [Developer/CONTRIBUTING Info](CONTRIBUTING.md) diff --git a/projects/hip/docs/markdown/hip_profiling.md b/projects/hip/docs/markdown/hip_profiling.md index 6e5cde700d..b5c4672464 100644 --- a/projects/hip/docs/markdown/hip_profiling.md +++ b/projects/hip/docs/markdown/hip_profiling.md @@ -267,6 +267,11 @@ info: check result PASSED! ``` +HIP_TRACE_API supports multiple levels of debug information: + - 0x1 = print all HIP APIs + - 0x2 = print HIP APIs which initiate GPU kernels, copies, or memsets. Includes hipLaunchKernel, hipMemcpy*, hipMemset*. + - 0x4 = print HIP APIs which allocate or free memory. Includes hipMalloc, hipHostMalloc, hipFree, hipHostFree. + #### Color Note this trace mode uses colors. "less -r" can handle raw control characters and will display the debug output in proper colors. diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index a655e35aa1..07604fe85d 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -1435,7 +1435,7 @@ hipStream_t ihipSyncAndResolveStream(hipStream_t stream) void ihipPrintKernelLaunch(const char *kernelName, const grid_launch_parm *lp, const hipStream_t stream) { - if ((HIP_TRACE_API & (1< dpitch || width > spitch) return ihipLogStatus(hipErrorUnknown); @@ -826,7 +826,7 @@ hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind) { - HIP_INIT_CMD_API(dst, wOffset, hOffset, src, spitch, width, height, kind); + HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, wOffset, hOffset, src, spitch, width, height, kind); hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull); @@ -879,7 +879,7 @@ hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset, con hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, size_t count, hipMemcpyKind kind) { - HIP_INIT_CMD_API(dst, wOffset, hOffset, src, count, kind); + HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, wOffset, hOffset, src, count, kind); hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull); @@ -938,7 +938,7 @@ ihipMemsetKernel(hipStream_t stream, // TODO-sync: function is async unless target is pinned host memory - then these are fully sync. hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t stream ) { - HIP_INIT_CMD_API(dst, value, sizeBytes, stream); + HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, value, sizeBytes, stream); hipError_t e = hipSuccess; @@ -988,7 +988,7 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s hipError_t hipMemset(void* dst, int value, size_t sizeBytes ) { - HIP_INIT_CMD_API(dst, value, sizeBytes); + HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, value, sizeBytes); hipError_t e = hipSuccess; @@ -1148,7 +1148,7 @@ hipError_t hipMemPtrGetInfo(void *ptr, size_t *size) hipError_t hipFree(void* ptr) { - HIP_INIT_API(ptr); + HIP_INIT_SPECIAL_API((TRACE_MEM), ptr); hipError_t hipStatus = hipErrorInvalidDevicePointer; @@ -1176,7 +1176,7 @@ hipError_t hipFree(void* ptr) hipError_t hipHostFree(void* ptr) { - HIP_INIT_API(ptr); + HIP_INIT_SPECIAL_API((TRACE_MEM), ptr); // Synchronize to ensure all work has finished. ihipGetTlsDefaultCtx()->locked_waitAllStreams(); // ignores non-blocking streams, this waits for all activity to finish. @@ -1210,7 +1210,7 @@ hipError_t hipFreeHost(void* ptr) hipError_t hipFreeArray(hipArray* array) { - HIP_INIT_API(array); + HIP_INIT_SPECIAL_API((TRACE_MEM), array); hipError_t hipStatus = hipErrorInvalidDevicePointer; From 7d07d804d803e0610b2dd7b03fc8b1df7fe230c5 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 14 Feb 2017 21:50:16 -0600 Subject: [PATCH 29/77] split debugging into separate .md file [ROCm/hip commit: db097ab392a0aad6e3407b0d33d9d2d78638c28a] --- projects/hip/README.md | 3 +- .../src/runtimeApi/stream/hipNullStream.cpp | 200 ++++++++++++++++++ 2 files changed, 202 insertions(+), 1 deletion(-) create mode 100644 projects/hip/tests/src/runtimeApi/stream/hipNullStream.cpp diff --git a/projects/hip/README.md b/projects/hip/README.md index d54032c6df..d04d63714f 100644 --- a/projects/hip/README.md +++ b/projects/hip/README.md @@ -32,7 +32,8 @@ HIP releases are typically of two types. The tag naming convention is different - [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP) - [HIP Porting Guide](docs/markdown/hip_porting_guide.md) - [HIP Porting Driver Guide](docs/markdown/hip_porting_driver_api.md) -- [HIP Profiling and Debugging](docs/markdown/hip_profiling.md) +- [HIP Profiling ](docs/markdown/hip_profiling.md) +- [HIP Debugging](docs/markdown/hip_debugging.md) - [HIP Terminology](docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL) - [hipify-clang](hipify-clang/README.md) - [Developer/CONTRIBUTING Info](CONTRIBUTING.md) diff --git a/projects/hip/tests/src/runtimeApi/stream/hipNullStream.cpp b/projects/hip/tests/src/runtimeApi/stream/hipNullStream.cpp new file mode 100644 index 0000000000..f8d201cb51 --- /dev/null +++ b/projects/hip/tests/src/runtimeApi/stream/hipNullStream.cpp @@ -0,0 +1,200 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * RUN: %t + * HIT_END + */ + + +#include "hip/hip_runtime.h" +#include "test_common.h" +#include +unsigned p_streams = 6; +int p_repeat = 10; + + +template +__global__ void +vectorADDRepeat(hipLaunchParm lp, + const T *A_d, + const T *B_d, + T *C_d, + size_t NELEM, + int repeat) +{ + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; + + for (int j=1; j<=repeat;j++) { + for (size_t i=offset; i +class Streamer { +public: + Streamer(size_t numElements, bool useNullStream=false); + ~Streamer(); + void enqueAsync(); + void queryUntilComplete(); + + +public: + T *_A_h; + T *_B_h; + T *_C_h; + + T *_A_d; + T *_B_d; + T *_C_d; + + hipStream_t _stream; + hipEvent_t _event; + + size_t _numElements; +}; + +template +Streamer::Streamer(size_t numElements, bool useNullStream) : + _numElements(numElements) +{ + HipTest::initArrays (&_A_d, &_B_d, &_C_d, &_A_h, &_B_h, &_C_h, numElements, true); + + if (useNullStream) { + _stream = 0x0; + } else { + HIPCHECK(hipStreamCreate(&_stream)); + } + HIPCHECK(hipEventCreate(&_event)); +}; + +template +void Streamer::enqueAsync() +{ + printf ("testing: %s numElements=%zu size=%6.2fMB\n", __func__, _numElements, _numElements * sizeof(T) / 1024.0/1024.0); + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements); + hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _B_d, _C_d, _numElements, p_repeat); + +} + +template +void Streamer::queryUntilComplete() +{ + int numQueries = 0; + hipError_t e = hipSuccess; + do { + numQueries++; + e = hipStreamQuery(_stream); + } while (e != hipSuccess) ; + + printf ("completed after %d queries\n", numQueries); +}; + + + +//--- +//Parse arguments specific to this test. +void parseMyArguments(int argc, char *argv[]) +{ + int more_argc = HipTest::parseStandardArguments(argc, argv, false); + + // parse args for this test: + for (int i = 1; i < more_argc; i++) { + const char *arg = argv[i]; + + if (!strcmp(arg, "--streams")) { + if (++i >= argc || !HipTest::parseUInt(argv[i], &p_streams)) { + failed("Bad streams argument"); + } + } else { + failed("Bad argument '%s'", arg); + } + }; +}; + + + + + +//--- +int main(int argc, char *argv[]) +{ + HipTest::parseStandardArguments(argc, argv, false); + parseMyArguments(argc, argv); + + typedef Streamer FloatStreamer; + + std::vector streamers; + + size_t numElements = N; + + float *expected_H = (float*)malloc(numElements*sizeof(float)); + + + auto nullStreamer = new FloatStreamer(numElements, true); + for (size_t i=0; i_A_h[i]*p_repeat + nullStreamer->_B_h[i] * p_repeat; + } + + + for (int i=0; i Test 0x1 runAsnc\n"); + for (int i=0; ienqueAsync(); + } + + auto lastStreamer = streamers[p_streams - 1]; + + // Dispatch to NULL stream, should wait for prior async activity to complete. + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); + hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, 0/*nullstream*/, lastStreamer->_C_d, lastStreamer->_C_d, nullStreamer->_C_d, numElements, 1/*repeat*/); + HIPCHECK(hipMemcpy(nullStreamer->_C_h, nullStreamer->_C_d, numElements*sizeof(float), hipMemcpyDeviceToHost)); + HIPCHECK(hipStreamSynchronize(0)); + + + HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements); + } + + + if (p_tests & 0x2) { + printf ("==> Test 0x2 runAsnc-odd-only\n"); + for (int i=0; ienqueAsync(); + } + } + } + + + passed(); +} From 3107e70ea2a3a64856633cb357f6f1e7de354158 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 12 May 2017 17:01:35 -0500 Subject: [PATCH 30/77] Doc update - split hip_debugging.md into separate file. [ROCm/hip commit: 704ba30b32c55548797160ace632d70af7e631a9] --- projects/hip/docs/markdown/hip_debugging.md | 168 ++++++++++++++++++++ projects/hip/docs/markdown/hip_faq.md | 4 +- projects/hip/docs/markdown/hip_profiling.md | 160 +------------------ 3 files changed, 171 insertions(+), 161 deletions(-) create mode 100644 projects/hip/docs/markdown/hip_debugging.md diff --git a/projects/hip/docs/markdown/hip_debugging.md b/projects/hip/docs/markdown/hip_debugging.md new file mode 100644 index 0000000000..e7e058d17a --- /dev/null +++ b/projects/hip/docs/markdown/hip_debugging.md @@ -0,0 +1,168 @@ +Table of Contents +================= + + * [Profiling HIP Code](#profiling-hip-code" aria-hidden="true">" in the printed output. +This example shows a sample GDB session where we print the memory allocated by this process and mark a specified address by using the gdb "call" function.. +The gdb syntax also supports using the variable name (in this case 'dst'): +``` +(gdb) p dst +$33 = (void *) 0x5ec7e9000 +(gdb) call hc::am_memtracker_print(dst) +TargetAddress:0x5ec7e9000 + 0x504cfc000-0x504cfc00f:: allocSeqNum:1 hostPointer:0x504cfc000 devicePointer:0x504cfc000 sizeBytes:16 isInDeviceMem:0 isAmManaged:1 appId:0 appAllocFlags:0 appPtr:(nil) +... +-->0x5ec7e9000-0x5f7e28fff:: allocSeqNum:488 hostPointer:(nil) devicePointer:0x5ec7e9000 sizeBytes:191102976 isInDeviceMem:1 isAmManaged:1 appId:0 appAllocFlags:0 appPtr:(nil) + +``` + +To debug an explicit address, cast the address to (void*) : +``` +(gdb) call hc::am_memtracker_print((void*)0x508c7f000) +``` +- Debugging GPUVM fault. +For example: +``` +Memory access fault by GPU node-1 on address 0x5924000. Reason: Page not present or supervisor privilege. + +Program received signal SIGABRT, Aborted. +[Switching to Thread 0x7fffdffb5700 (LWP 14893)] +0x00007ffff2057c37 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56 +56 ../nptl/sysdeps/unix/sysv/linux/raise.c: No such file or directory. +(gdb) bt +#0 0x00007ffff2057c37 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56 +#1 0x00007ffff205b028 in __GI_abort () at abort.c:89 +#2 0x00007ffff6f960eb in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 +#3 0x00007ffff6f99ea5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 +#4 0x00007ffff6f78107 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 +#5 0x00007ffff744f184 in start_thread (arg=0x7fffdffb5700) at pthread_create.c:312 +#6 0x00007ffff211b37d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:111 +(gdb) info threads + Id Target Id Frame + 4 Thread 0x7fffdd521700 (LWP 14895) "caffe" pthread_cond_wait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185 + 3 Thread 0x7fffddd22700 (LWP 14894) "caffe" pthread_cond_wait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185 +* 2 Thread 0x7fffdffb5700 (LWP 14893) "caffe" 0x00007ffff2057c37 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56 + 1 Thread 0x7ffff7fa6ac0 (LWP 14892) "caffe" 0x00007ffff6f934d5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 +(gdb) thread 1 +[Switching to thread 1 (Thread 0x7ffff7fa6ac0 (LWP 14892))] +#0 0x00007ffff6f934d5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 +(gdb) bt +#0 0x00007ffff6f934d5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 +#1 0x00007ffff6f929ba in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 +#2 0x00007fffe080beca in HSADispatch::waitComplete() () from /opt/rocm/hcc/lib/libmcwamp_hsa.so +#3 0x00007fffe080415f in HSADispatch::dispatchKernelAsync(Kalmar::HSAQueue*, void const*, int, bool) () from /opt/rocm/hcc/lib/libmcwamp_hsa.so +#4 0x00007fffe080238e in Kalmar::HSAQueue::dispatch_hsa_kernel(hsa_kernel_dispatch_packet_s const*, void const*, unsigned long, hc::completion_future*) () from /opt/rocm/hcc/lib/libmcwamp_hsa.so +#5 0x00007ffff7bb7559 in hipModuleLaunchKernel () from /opt/rocm/hip/lib/libhip_hcc.so +#6 0x00007ffff2e6cd2c in mlopen::HIPOCKernel::run (this=0x7fffffffb5a8, args=0x7fffffffb2a8, size=80) at /root/MIOpen/src/hipoc/hipoc_kernel.cpp:15 +... +``` + +### General Debugging Tips +- The fault will be caught by the runtime but was actually generated by an asynchronous command running on the GPU. So, the GDB backtrace will show a path in the runtime, ie inside "GI_Raise" as shown in the example above. +- To determine the true location of the fault, force the kernels to execute synchronously by seeing the environment variables HCC_SERIALIZE_KERNEL=3 HCC_SERIALIZE_COPY=3. This will force HCC to wait for the kernel to finish executing before retuning. If the fault occurs during the execution of a kernel, you can see the code which launched the kernel inside the backtrace. A bit of guesswork is required to determine which thread is actually causing the issue - typically it will the thread which is waiting inside the libhsa-runtime64.so. +- VM faults inside kernels can be caused byi: + - incorrect code (ie a for loop which extends past array boundaries), i + - memory issues - kernel arguments which are invalid (null pointers, unregistered host pointers, bad pointers). + - synchronization issues + - compiler issues (incorrect code generation from the compiler) + - runtime issues + +-- General debug tips: +- 'gdb --args' can be used to conviently pass the executable and arguments to gdb. +- From inside GDB, you can set environment variables "set env". Note the command does not use an '=' sign: +``` +(gdb) set env HIP_DB 1 +``` + +#### Print env var state +Setting HIP_PRINT_ENV=1 and then running a HIP application will print the HIP environment variables, their current values, and usage info. +Setting HCC_PRINT_ENV=1 and then running a HCC application will print the HCC environment variables, their current values, and usage info. diff --git a/projects/hip/docs/markdown/hip_faq.md b/projects/hip/docs/markdown/hip_faq.md index e316d449ef..07ec5f1d8b 100644 --- a/projects/hip/docs/markdown/hip_faq.md +++ b/projects/hip/docs/markdown/hip_faq.md @@ -53,7 +53,7 @@ At a high-level, the following features are not supported: - Dynamic parallelism (CUDA 5.0) - Managed memory (CUDA 6.5) - Graphics interoperability with OpenGL or Direct3D -- CUDA Driver API (Under Development) +- CUDA Driver API - CUDA IPC Functions (Under Development) - CUDA array, mipmappedArray and pitched memory - MemcpyToSymbol functions @@ -102,7 +102,7 @@ However, we can provide a rough summary of the features included in each CUDA SD - Per-thread-streams (under development) - C++11 (HCC supports all of C++11, all of C++14 and some C++17 features) - CUDA 7.5 - - float16 (under development) + - float16 - CUDA 8.0 - TBD. diff --git a/projects/hip/docs/markdown/hip_profiling.md b/projects/hip/docs/markdown/hip_profiling.md index b5c4672464..ef349ef2a5 100644 --- a/projects/hip/docs/markdown/hip_profiling.md +++ b/projects/hip/docs/markdown/hip_profiling.md @@ -1,4 +1,4 @@ -# Profiling and Debugging HIP Code +# Profiling HIP Code This section describes the profiling and debugging capabilities that HIP provides. Profiling information can viewed in the CodeXL visualization tool or printed directly to stderr as the application runs. @@ -280,161 +280,3 @@ None will disable use of color control codes for both the opening and closing an -### Using HIP_DB - -This flag is primarily targeted to assist HIP development team in the development of the HIP runtime, but in some situations may be useful to HIP application developers as well. -The HIP debug information is designed to print important information during the execution of a HIP API. HIP provides -different color-coded levels of debug information: - - api : Print the beginning and end of each HIP API, including the arguments and return codes. This is equivalent to setting HIP_TRACE_API=1. - - sync : Print multi-thread and other synchronization debug information. - - copy : Print which engine is doing the copy, which copy flavor is selected, information on source and destination memory. - - mem : Print information about memory allocation - which pointers are allocated, where they are allocated, peer mappings, and more. - -DB_MEM format is flags separated by '+' sign, or a hex code for the bitmask. Generally the + format is preferred. -For example: -``` -$ HIP_DB=api+copy+mem my-application -$ HIP_DB=0xF my-application -``` - -### Using ltrace -ltrace is a standard linux tool which provides a message to stderr on every dynamic library call. Since ROCr and the ROCt (the ROC thunk, which is the thin user-space interface to the ROC kernel driver) are both dynamic libraries, this provides an easy way to trace the activity in these libraries. Tracing can be a powerful way to quickly observe the flow of the application before diving into the details with a command-line debugger. -The trace can also show performance issues related to accidental calls to expensive API calls on the critical path. - -ltrace can be easily combined with the HIP_DB switches to visualize the runtime behavior of the entire ROCm software stack. Here's a sample command-line and output: - -``` -$ HIP_DB=api ltrace -C -e 'hsa*' - -... - -<hsa_signal_store_relaxed(0x1804000, 0, 0, 0x400000) = 0 -libmcwamp_hsa.so->hsa_signal_store_relaxed(0x1816000, 0, 0x7f777f85f2a0, 0x400000) = 0 -libmcwamp_hsa.so->hsa_amd_memory_lock(0x7f7776d3e010, 0x400000, 0x1213b70, 1 -libhsa-runtime64.so.1->hsaKmtRegisterMemoryToNodes(0x7f7776d3e010, 0x400000, 1, 0x1220c10) = 0 -libhsa-runtime64.so.1->hsaKmtMapMemoryToGPUNodes(0x7f7776d3e010, 0x400000, 0x7ffc32865400, 64) = 0 -<... hsa_amd_memory_lock resumed> ) = 0 -libmcwamp_hsa.so->hsa_signal_store_relaxed(0x1804000, 1, 0x7f777e95a770, 0x12205b0) = 0 -libmcwamp_hsa.so->hsa_amd_memory_async_copy(0x50411d010, 0x11e70d0, 0x503d1d000, 0x11e70d0) = 0 -libmcwamp_hsa.so->hsa_signal_wait_acquire(0x1804000, 2, 1, -1) = 0 -libmcwamp_hsa.so->hsa_amd_memory_unlock(0x7f7776d3e010, 0x1213c6c, 0x12c3c600000000, 0x1804000 -libhsa-runtime64.so.1->hsaKmtUnmapMemoryToGPU(0x7f7776d3e010, 0x7f7776d3e010, 0x12c3c600000000, 0x1804000) = 0 -libhsa-runtime64.so.1->hsaKmtDeregisterMemory(0x7f7776d3e010, 0x7f7776d3e010, 0x7f777f60f9e8, 0x1220580) = 0 -<... hsa_amd_memory_unlock resumed> ) = 0 - hip-api tid:1.17 hipMemcpy ret= 0 (hipSuccess)>> -``` - -Some key information from the trace above. - - Thy trace snippet shows the execution of a hipMemcpy API, bracketed by the first and last message in the trace output. The messages show the thread id and API sequence number (`1.17`). ltrace output intermixes messages from all threads, so the HIP debug information can be useful to determine which threads are executing. - - The code flows through HIP APIs into ROCr (HSA) APIs (hsa*) and into the thunk (hsaKmt*) calls. - - The HCC runtime is "libmcwamp_hsa.so" and the HSA/ROCr runtime is "libhsa-runtime64.so". - - In this particular case, the memory copy is for unpinned memory, and the selected copy algorithm is to pin the host memory "in-place" before performing the copy. The signaling APIs and calls to pin ("lock", "register") the memory are readily apparent in the trace output. - - -### Chicken bits -Chicken bits are environment variables which cause the HIP, HCC, or HSA driver to disable some feature or optimization. -These are not intended for production but can be useful diagnose synchronization problems in the application (or driver). - -Some of the most useful chicken bits are described here. These bits are supported on the ROCm path: - -HIP provides 3 environment variables in the HIP_*_BLOCKING family. These introduce additional synchronization and can be useful to isolate synchronization problems. Specifically, if the code works with this flag set, then it indicates the kernels are executing correctly, and any failures likely are causes by improper or missing synchronization. These flags will have performance impact and are not intended for production use. - -- HIP_LAUNCH_BLOCKING=1 : Waits on the host after each kernel launch. Equivalent to setting CUDA_LAUNCH_BLOCKING. -- HIP_LAUNCH_BLOCKING_KERNELS: A comma-separated list of kernel names. The HIP runtime will wait on the host after one of the named kernels executes. This provides a more targeted version of HIP_LAUNCH_BLOCKING and may be useful to isolate exactly which kernel needs further analysis if HIP_LAUNCH_BLOCKING=1 improves functionality. There is no indication if kernel names are spelled incorrectly. One mechanism to verify that the blocking is working is to run with HIP_DB=api+sync and search for debug messages with "LAUNCH_BLOCKING". -- HIP_API_BLOCKING : Forces hipMemcpyAsync and hipMemsetAsync to be host-synchronous, meaning they will wait for the requested operation to complete before returning to the caller. - -These options cause HCC to serialize. Useful if you have libraries or code which is calling HCC kernels directly rather than using HIP. -- HCC_SERIALZIE_KERNELS : 0x1=pre-serialize before each kernel launch, 0x2=post-serialize after each kernel launch., 0x3= pre- and post- serialize. -- HCC_SERIALIZE_COPY : 0x1=pre-serialize before each async copy, 0x2=post-serialize after each async copy., 0x3= pre- and post- serialize. - -- HSA_ENABLE_SDMA=0 : Causes host-to-device and device-to-host copies to use compute shader blit kernels rather than the dedicated DMA copy engines. Compute shader copies have low latency (typically < 5us) and can achieve approximately 80% of the bandwidth of the DMA copy engine. This flag is useful to isolate issues with the hardware copy engines. -- HSA_ENABLE_INTERRUPT=0 : Causes completion signals to be detected with memory-based polling rather than interrupts. Can be useful to diagnose interrupt storm issues in the driver. -- HSA_DISABLE_CACHE=1 : Disables the GPU L2 data cache. - -### Debugging HIP Applications - -- The variable "tls_tidInfo" contains the API sequence number (_apiSeqNum)- a monotonically increasing count of the HIP APIs called from this thread. This can be useful for setting conditional breakpoints. Also, each new HIP thread is mapped to monotically increasing shortTid ID. Both of these fields are displayed in the HIP debug info. -``` -(gdb) p tls_tidInfo -$32 = {_shortTid = 1, _apiSeqNum = 803} -``` - -- HCC tracks all of the application memory allocations, including those from HIP and HC's "am_alloc". -If the HCC runtime is built with debug information (HCC_RUNTIME_DEBUG=ON when building HCC), then calling the function 'hc::am_memtracker_print()' will show all memory allocations. -An optional argument specifies a void * targetPointer - the print routine will mark the allocation which contains the specified pointer with "-->" in the printed output. -This example shows a sample GDB session where we print the memory allocated by this process and mark a specified address by using the gdb "call" function.. -The gdb syntax also supports using the variable name (in this case 'dst'): -``` -(gdb) p dst -$33 = (void *) 0x5ec7e9000 -(gdb) call hc::am_memtracker_print(dst) -TargetAddress:0x5ec7e9000 - 0x504cfc000-0x504cfc00f:: allocSeqNum:1 hostPointer:0x504cfc000 devicePointer:0x504cfc000 sizeBytes:16 isInDeviceMem:0 isAmManaged:1 appId:0 appAllocFlags:0 appPtr:(nil) -... --->0x5ec7e9000-0x5f7e28fff:: allocSeqNum:488 hostPointer:(nil) devicePointer:0x5ec7e9000 sizeBytes:191102976 isInDeviceMem:1 isAmManaged:1 appId:0 appAllocFlags:0 appPtr:(nil) - -``` - -To debug an explicit address, cast the address to (void*) : -``` -(gdb) call hc::am_memtracker_print((void*)0x508c7f000) -``` -- Debugging GPUVM fault. -For example: -``` -Memory access fault by GPU node-1 on address 0x5924000. Reason: Page not present or supervisor privilege. - -Program received signal SIGABRT, Aborted. -[Switching to Thread 0x7fffdffb5700 (LWP 14893)] -0x00007ffff2057c37 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56 -56 ../nptl/sysdeps/unix/sysv/linux/raise.c: No such file or directory. -(gdb) bt -#0 0x00007ffff2057c37 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56 -#1 0x00007ffff205b028 in __GI_abort () at abort.c:89 -#2 0x00007ffff6f960eb in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 -#3 0x00007ffff6f99ea5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 -#4 0x00007ffff6f78107 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 -#5 0x00007ffff744f184 in start_thread (arg=0x7fffdffb5700) at pthread_create.c:312 -#6 0x00007ffff211b37d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:111 -(gdb) info threads - Id Target Id Frame - 4 Thread 0x7fffdd521700 (LWP 14895) "caffe" pthread_cond_wait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185 - 3 Thread 0x7fffddd22700 (LWP 14894) "caffe" pthread_cond_wait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185 -* 2 Thread 0x7fffdffb5700 (LWP 14893) "caffe" 0x00007ffff2057c37 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56 - 1 Thread 0x7ffff7fa6ac0 (LWP 14892) "caffe" 0x00007ffff6f934d5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 -(gdb) thread 1 -[Switching to thread 1 (Thread 0x7ffff7fa6ac0 (LWP 14892))] -#0 0x00007ffff6f934d5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 -(gdb) bt -#0 0x00007ffff6f934d5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 -#1 0x00007ffff6f929ba in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 -#2 0x00007fffe080beca in HSADispatch::waitComplete() () from /opt/rocm/hcc/lib/libmcwamp_hsa.so -#3 0x00007fffe080415f in HSADispatch::dispatchKernelAsync(Kalmar::HSAQueue*, void const*, int, bool) () from /opt/rocm/hcc/lib/libmcwamp_hsa.so -#4 0x00007fffe080238e in Kalmar::HSAQueue::dispatch_hsa_kernel(hsa_kernel_dispatch_packet_s const*, void const*, unsigned long, hc::completion_future*) () from /opt/rocm/hcc/lib/libmcwamp_hsa.so -#5 0x00007ffff7bb7559 in hipModuleLaunchKernel () from /opt/rocm/hip/lib/libhip_hcc.so -#6 0x00007ffff2e6cd2c in mlopen::HIPOCKernel::run (this=0x7fffffffb5a8, args=0x7fffffffb2a8, size=80) at /root/MIOpen/src/hipoc/hipoc_kernel.cpp:15 -... -``` - -### General Debugging Tips -- The fault will be caught by the runtime but was actually generated by an asynchronous command running on the GPU. So, the GDB backtrace will show a path in the runtime, ie inside "GI_Raise" as shown in the example above. -- To determine the true location of the fault, force the kernels to execute synchronously by seeing the environment variables HCC_SERIALIZE_KERNEL=3 HCC_SERIALIZE_COPY=3. This will force HCC to wait for the kernel to finish executing before retuning. If the fault occurs during the execution of a kernel, you can see the code which launched the kernel inside the backtrace. A bit of guesswork is required to determine which thread is actually causing the issue - typically it will the thread which is waiting inside the libhsa-runtime64.so. -- VM faults inside kernels can be caused byi: - - incorrect code (ie a for loop which extends past array boundaries), i - - memory issues - kernel arguments which are invalid (null pointers, unregistered host pointers, bad pointers). - - synchronization issues - - compiler issues (incorrect code generation from the compiler) - - runtime issues - --- General debug tips: -- 'gdb --args' can be used to conviently pass the executable and arguments to gdb. -- From inside GDB, you can set environment variables "set env". Note the command does not use an '=' sign: -``` -(gdb) set env HIP_DB 1 -``` -Setting HIP_PRINT_ENV=1 and then running a HIP application will print the HIP environment variables, their current values, and usage info. -Setting HCC_PRINT_ENV=1 and then running a HCC application will print the HCC environment variables, their current values, and usage info. - - - From cfe81dfbf428db08df324349de0c72c58468abd6 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 12 May 2017 17:02:31 -0500 Subject: [PATCH 31/77] Update tests README [ROCm/hip commit: c67b828a5aae908ec3608f1be46919ebaab67fd7] --- projects/hip/tests/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/projects/hip/tests/README.md b/projects/hip/tests/README.md index 223bd149dc..cb41cc10cd 100644 --- a/projects/hip/tests/README.md +++ b/projects/hip/tests/README.md @@ -59,5 +59,9 @@ Find the test and commandline that fail: grep -IR hipMemcpy-modes -IR ../tests/ ../tests/src/runtimeApi/memory/hipMemcpy.cpp: * RUN_NAMED: %t hipMemcpy-modes --tests 0x1 +# Guidelines for adding new tests +- Prefer to enhance an existing test as opposed to writing a new one. Tests have overhead to start and many small tests spend precious test time on startup and initialization issues. +- Make the test run standalone without requirement for command-line arguments. THis makes it easier to debug since the name of the test is shown in the test report and if you know the name of the test you can the run the test. +- For long-running tests or tests with multiple phases, consider using the --tests option as an optional mechanism to allow debuggers to start with the failing subset of the test. From a55ce5bee421434bdfdaaa8d30e1778bca7525f1 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 12 May 2017 17:04:23 -0500 Subject: [PATCH 32/77] Add initial HIP_SYNC_NULL_STREAM=0 mode. This eliminates host-synchronization for null stream. Instead, the null-stream uses GPU-side events to wait for other streams. Default is OFF pending additional testing. Add enhanced null-stream test. Also refine HIP_TRACE_API. [ROCm/hip commit: 27877f8854c88c6b806b5528c81faf9009ccab78] --- .../include/hip/hcc_detail/hip_runtime_api.h | 7 +- projects/hip/src/grid_launch.cpp | 2 +- projects/hip/src/hip_device.cpp | 2 +- projects/hip/src/hip_event.cpp | 16 +- projects/hip/src/hip_hcc.cpp | 145 +++++++++++----- projects/hip/src/hip_hcc_internal.h | 12 +- projects/hip/src/hip_memory.cpp | 8 +- projects/hip/src/hip_module.cpp | 4 +- projects/hip/src/hip_stream.cpp | 4 +- .../src/runtimeApi/stream/hipNullStream.cpp | 156 ++++++++++++++---- projects/hip/tests/src/test_common.h | 60 ++++++- 11 files changed, 320 insertions(+), 96 deletions(-) diff --git a/projects/hip/include/hip/hcc_detail/hip_runtime_api.h b/projects/hip/include/hip/hcc_detail/hip_runtime_api.h index 175fd64d29..e1aecef1e8 100644 --- a/projects/hip/include/hip/hcc_detail/hip_runtime_api.h +++ b/projects/hip/include/hip/hcc_detail/hip_runtime_api.h @@ -602,9 +602,12 @@ hipError_t hipStreamQuery(hipStream_t stream); * * @return #hipSuccess, #hipErrorInvalidResourceHandle * - * If the null stream is specified, this command blocks until all + * This command is host-synchronous : the host will block until the specified stream is empty. + * + * This command follows standard null-stream semantics. Specifically, specifying the null stream will cause the + * command to wait for other streams on the same device to complete all pending operations. + * * This command honors the hipDeviceLaunchBlocking flag, which controls whether the wait is active or blocking. - * This command is host-synchronous : the host will block until the stream is empty. * * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamWaitEvent, hipStreamDestroy * diff --git a/projects/hip/src/grid_launch.cpp b/projects/hip/src/grid_launch.cpp index cac01df7dc..ffa50dec95 100644 --- a/projects/hip/src/grid_launch.cpp +++ b/projects/hip/src/grid_launch.cpp @@ -52,7 +52,7 @@ namespace hip_impl int group_mem_bytes, hipStream_t stream) { - if ((HIP_TRACE_API & (1 << TRACE_CMD)) || + if ((HIP_TRACE_API & (1 << TRACE_KCMD)) || HIP_PROFILE_API || (COMPILE_HIP_DB && HIP_TRACE_API)) { std::stringstream os; diff --git a/projects/hip/src/hip_device.cpp b/projects/hip/src/hip_device.cpp index 01a213190f..93c1c20484 100644 --- a/projects/hip/src/hip_device.cpp +++ b/projects/hip/src/hip_device.cpp @@ -298,7 +298,7 @@ hipError_t ihipGetDeviceProperties(hipDeviceProp_t* props, int device) hipError_t hipGetDeviceProperties(hipDeviceProp_t* props, int device) { HIP_INIT_API(props, device); - return ihipGetDeviceProperties(props, device); + return ihipLogStatus(ihipGetDeviceProperties(props, device)); } hipError_t hipSetDeviceFlags( unsigned int flags) diff --git a/projects/hip/src/hip_event.cpp b/projects/hip/src/hip_event.cpp index 61ac5cd3ab..fbaf5cc463 100644 --- a/projects/hip/src/hip_event.cpp +++ b/projects/hip/src/hip_event.cpp @@ -114,14 +114,17 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream) HIP_INIT_API(event, stream); if (event && event->_state != hipEventStatusUnitialized) { + stream = ihipSyncAndResolveStream(stream); + event->_stream = stream; - if (stream == NULL) { + if (HIP_SYNC_NULL_STREAM && stream == NULL) { + + // TODO-HIP_SYNC_NULL_STREAM : can remove this code when HIP_SYNC_NULL_STREAM = 0 + // If stream == NULL, wait on all queues. - // TODO-HCC fix this - is this conservative or still uses device timestamps? - // TODO-HCC can we use barrier or event marker to implement better solution? ihipCtx_t *ctx = ihipGetTlsDefaultCtx(); - ctx->locked_syncDefaultStream(true); + ctx->locked_syncDefaultStream(true, true); event->_timestamp = hc::get_system_ticks(); event->_state = hipEventStatusRecorded; @@ -164,9 +167,10 @@ hipError_t hipEventSynchronize(hipEvent_t event) } else if (event->_state == hipEventStatusCreated ) { // Created but not actually recorded on any device: return ihipLogStatus(hipSuccess); - } else if (event->_stream == NULL) { + } else if (HIP_SYNC_NULL_STREAM && (event->_stream == NULL)) { auto *ctx = ihipGetTlsDefaultCtx(); - ctx->locked_syncDefaultStream(true); + // TODO-HIP_SYNC_NULL_STREAM - can remove this code + ctx->locked_syncDefaultStream(true, true); return ihipLogStatus(hipSuccess); } else { event->_marker.wait((event->_flags & hipEventBlockingSync) ? hc::hcWaitModeBlocked : hc::hcWaitModeActive); diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 07604fe85d..979a2e5028 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -92,6 +92,9 @@ int HIP_COHERENT_HOST_ALLOC = 0; // USE_ HIP_SYNC_HOST_ALLOC int HIP_SYNC_HOST_ALLOC = 1; +// Sync on host between +int HIP_SYNC_NULL_STREAM = 1; + int HCC_OPT_FLUSH = 0; @@ -289,6 +292,32 @@ inline void ihipStream_t::ensureHaveQueue(LockedAccessor_StreamCrit_t &streamCri assert(streamCrit->_hasQueue); } +hc::hcWaitMode ihipStream_t::waitMode() const +{ + hc::hcWaitMode waitMode = hc::hcWaitModeActive; + + if (_scheduleMode == Auto) { + if (g_deviceCnt > g_numLogicalThreads) { + waitMode = hc::hcWaitModeActive; + } else { + waitMode = hc::hcWaitModeBlocked; + } + } else if (_scheduleMode == Spin) { + waitMode = hc::hcWaitModeActive; + } else if (_scheduleMode == Yield) { + waitMode = hc::hcWaitModeBlocked; + } else { + assert(0); // bad wait mode. + } + + if (HIP_WAIT_MODE == 1) { + waitMode = hc::hcWaitModeBlocked; + } else if (HIP_WAIT_MODE == 2) { + waitMode = hc::hcWaitModeActive; + } + + return waitMode; +} //Wait for all kernel and data copy commands in this stream to complete. //This signature should be used in routines that already have locked the stream mutex @@ -296,29 +325,8 @@ void ihipStream_t::wait(LockedAccessor_StreamCrit_t &crit) { if (crit->_hasQueue) { tprintf (DB_SYNC, "%s wait for queue-empty..\n", ToString(this).c_str()); - hc::hcWaitMode waitMode = hc::hcWaitModeActive; - if (_scheduleMode == Auto) { - if (g_deviceCnt > g_numLogicalThreads) { - waitMode = hc::hcWaitModeActive; - } else { - waitMode = hc::hcWaitModeBlocked; - } - } else if (_scheduleMode == Spin) { - waitMode = hc::hcWaitModeActive; - } else if (_scheduleMode == Yield) { - waitMode = hc::hcWaitModeBlocked; - } else { - assert(0); // bad wait mode. - } - - if (HIP_WAIT_MODE == 1) { - waitMode = hc::hcWaitModeBlocked; - } else if (HIP_WAIT_MODE == 2) { - waitMode = hc::hcWaitModeActive; - } - - crit->_av.wait(waitMode); + crit->_av.wait(waitMode()); } else { tprintf (DB_SYNC, "%s wait for queue empty (done since stream has no physical queue).\n", ToString(this).c_str()); } @@ -337,7 +345,7 @@ void ihipStream_t::locked_wait() }; // Causes current stream to wait for specified event to complete: -// Note this does not require any kind of host serialization. +// Note this does not provide any kind of host serialization. void ihipStream_t::locked_waitEvent(hipEvent_t event) { LockedAccessor_StreamCrit_t crit(_criticalData); @@ -1061,26 +1069,57 @@ ihipCtx_t::createOrStealQueue(LockedAccessor_CtxCrit_t &ctxCrit) // Implement "default" stream syncronization // This waits for all other streams to drain before continuing. // If waitOnSelf is set, this additionally waits for the default stream to empty. -void ihipCtx_t::locked_syncDefaultStream(bool waitOnSelf) +// In new HIP_SYNC_NULL_STREAM=0 mode, this enqueues a marker which causes the default stream to wait for other +// activity, but doesn't actually block the host. If host blocking is desired, the caller should set syncHost. +// Note HIP_SYNC_NULL_STREAM=1 path always sync to Host. +void ihipCtx_t::locked_syncDefaultStream(bool waitOnSelf, bool syncHost) { LockedAccessor_CtxCrit_t crit(_criticalData); - tprintf(DB_SYNC, "syncDefaultStream\n"); + tprintf(DB_SYNC, "syncDefaultStream \n"); + + // Vector of ops sent to each stream that will complete before ops sent to null stream: + std::vector depOps; for (auto streamI=crit->const_streams().begin(); streamI!=crit->const_streams().end(); streamI++) { ihipStream_t *stream = *streamI; - // Don't wait for streams that have "opted-out" of syncing with NULL stream. - // And - don't wait for the NULL stream - if (!(stream->_flags & hipStreamNonBlocking)) { + if (HIP_SYNC_NULL_STREAM) { - if (waitOnSelf || (stream != _defaultStream)) { - // TODO-hcc - use blocking or active wait here? - // TODO-sync - cudaDeviceBlockingSync - stream->locked_wait(); + // Don't wait for streams that have "opted-out" of syncing with NULL stream. + // And - don't wait for the NULL stream + if (!(stream->_flags & hipStreamNonBlocking)) { + + if (waitOnSelf || (stream != _defaultStream)) { + stream->locked_wait(); + } + } + } else { + if (!(stream->_flags & hipStreamNonBlocking) && (stream != _defaultStream)) { + LockedAccessor_StreamCrit_t streamCrit(stream->_criticalData); + + // The last marker will provide appropriate visibility: + if (!streamCrit->_av.get_is_empty()) { + depOps.push_back(streamCrit->_av.create_marker(hc::accelerator_scope)); + } } } } + + + + // Enqueue a barrier to wait on all the barriers we sent above: + if (!HIP_SYNC_NULL_STREAM && !depOps.empty()) { + LockedAccessor_StreamCrit_t defaultStreamCrit(_defaultStream->_criticalData); + tprintf(DB_SYNC, " null-stream wait on %zu non-empty streams\n", depOps.size()); + hc::completion_future defaultCf = defaultStreamCrit->_av.create_blocking_marker(depOps.begin(), depOps.end(), hc::accelerator_scope); + if (syncHost) { + defaultCf.wait(); // TODO - account for active or blocking here. + } + } + + tprintf(DB_SYNC, " syncDefaultStream depOps=%zu\n", depOps.size()); + } @@ -1267,6 +1306,7 @@ void HipReadEnv() READ_ENV_I(release, HIP_FAIL_SOC, 0, "Fault on Sub-Optimal-Copy, rather than use a slower but functional implementation. Bit 0x1=Fail on async copy with unpinned memory. Bit 0x2=Fail peer copy rather than use staging buffer copy"); READ_ENV_I(release, HIP_SYNC_HOST_ALLOC, 0, "Sync before and after all host memory allocations. May help stability"); + READ_ENV_I(release, HIP_SYNC_NULL_STREAM, 0, "Synchronize on host for null stream submissions"); // TODO - review, can we remove this? READ_ENV_I(release, HIP_NUM_KERNELS_INFLIGHT, 128, "Max number of inflight kernels per stream before active synchronization is forced."); @@ -1274,7 +1314,7 @@ void HipReadEnv() READ_ENV_I(release, HIP_COHERENT_HOST_ALLOC, 0, "If set, all host memory will be allocated as fine-grained system memory. This allows threadfence_system to work but prevents host memory from being cached on GPU which may have performance impact."); - READ_ENV_I(release, HCC_OPT_FLUSH, 0, "Note this flag also impact HCC. When set, use agent-scope flush rather than system-scope flush when possible."); + READ_ENV_I(release, HCC_OPT_FLUSH, 0, "Note this flag also impacts HCC. When set, use agent-scope flush rather than system-scope flush when possible."); // Some flags have both compile-time and runtime flags - generate a warning if user enables the runtime flag but the compile-time flag is disabled. if (HIP_DB && !COMPILE_HIP_DB) { @@ -1415,17 +1455,44 @@ void ihipInit() hipStream_t ihipSyncAndResolveStream(hipStream_t stream) { if (stream == hipStreamNull ) { - ihipCtx_t *device = ihipGetTlsDefaultCtx(); + ihipCtx_t *ctx = ihipGetTlsDefaultCtx(); + tprintf(DB_SYNC, "ihipSyncAndResolveStream %s wait on default stream\n", ToString(stream).c_str()); #ifndef HIP_API_PER_THREAD_DEFAULT_STREAM - device->locked_syncDefaultStream(false); + ctx->locked_syncDefaultStream(false, false); #endif - return device->_defaultStream; + return ctx->_defaultStream; } else { - // ALl streams have to wait for legacy default stream to be empty: + // All streams have to wait for legacy default stream to be empty: if (!(stream->_flags & hipStreamNonBlocking)) { - tprintf(DB_SYNC, "%s wait default stream\n", ToString(stream).c_str()); - stream->getCtx()->_defaultStream->locked_wait(); + if (HIP_SYNC_NULL_STREAM) { + tprintf(DB_SYNC, "ihipSyncAndResolveStream %s wait on default stream\n", ToString(stream).c_str()); + stream->getCtx()->_defaultStream->locked_wait(); + } else { + ihipStream_t *defaultStream = stream->getCtx()->_defaultStream; + + tprintf(DB_SYNC, "%s marker wait default stream\n", ToString(stream).c_str()); + + bool needMarker = false; + hc::completion_future dcf; + { + LockedAccessor_StreamCrit_t defaultStreamCrit(defaultStream->criticalData()); + // TODO - could call create_blocking_marker(queue) + if (!defaultStreamCrit->_av.get_is_empty()) { + needMarker = true; + + // TODO - add "none_scope". + dcf = defaultStreamCrit->_av.create_marker(hc::accelerator_scope); + } + } + + if (needMarker) { + // ensure any commands sent to this stream wait on the NULL stream before continuing + LockedAccessor_StreamCrit_t thisStreamCrit(stream->criticalData()); + // TODO - could be "noret" version of create_blocking_marker + thisStreamCrit->_av.create_blocking_marker(dcf); + } + } } return stream; diff --git a/projects/hip/src/hip_hcc_internal.h b/projects/hip/src/hip_hcc_internal.h index 7787242ca7..0d080f9225 100644 --- a/projects/hip/src/hip_hcc_internal.h +++ b/projects/hip/src/hip_hcc_internal.h @@ -66,6 +66,8 @@ extern int HIP_COHERENT_HOST_ALLOC; // Chicken bits for disabling functionality to work around potential issues: extern int HIP_SYNC_HOST_ALLOC; +extern int HIP_SYNC_NULL_STREAM; + // TODO - remove when this is standard behavior. extern int HCC_OPT_FLUSH; @@ -187,11 +189,11 @@ extern const char *API_COLOR_END; //--- -//HIP Trace modes +//HIP Trace modes - use with HIP_TRACE_API=... #define TRACE_ALL 0 // 0x1 #define TRACE_KCMD 1 // 0x2, kernel command #define TRACE_MCMD 2 // 0x4, memory command -#define TRACE_MEM 3 // 0x8 +#define TRACE_MEM 3 // 0x8, memory allocation or deallocation. //--- @@ -276,7 +278,7 @@ extern void recordApiTrace(std::string *fullStr, const std::string &apiStr); API_TRACE(0, __VA_ARGS__); -// Like above, but will trace with TRACE_CMD. +// Like above, but will trace with a specified "special" bit. // Replace HIP_INIT_API with this call inside HIP APIs that launch work on the GPU: // kernel launches, copy commands, memory sets, etc. #define HIP_INIT_SPECIAL_API(tbit, ...) \ @@ -521,8 +523,10 @@ public: void locked_waitEvent(hipEvent_t event); void locked_recordEvent(hipEvent_t event); + ihipStreamCritical_t &criticalData() { return _criticalData; }; //--- + hc::hcWaitMode waitMode() const; // Use this if we already have the stream critical data mutex: void wait(LockedAccessor_StreamCrit_t &crit); @@ -786,7 +790,7 @@ public: // Functions: void locked_removeStream(ihipStream_t *s); void locked_reset(); void locked_waitAllStreams(); - void locked_syncDefaultStream(bool waitOnSelf); + void locked_syncDefaultStream(bool waitOnSelf, bool syncHost); // Will allocate a queue and assign it to the needyStream: hc::accelerator_view stealActiveQueue(LockedAccessor_CtxCrit_t &ctxCrit, ihipStream_t *needyStream); diff --git a/projects/hip/src/hip_memory.cpp b/projects/hip/src/hip_memory.cpp index cef676b572..5501fec734 100644 --- a/projects/hip/src/hip_memory.cpp +++ b/projects/hip/src/hip_memory.cpp @@ -525,7 +525,7 @@ hipError_t hipMemcpyToSymbol(const void* symbolName, const void *src, size_t cou hipError_t hipMemcpyFromSymbol(void* dst, const void* symbolName, size_t count, size_t offset, hipMemcpyKind kind) { - HIP_INIT_CMD_API(symbolName, dst, count, offset, kind); + HIP_INIT_SPECIAL_API((TRACE_MCMD), symbolName, dst, count, offset, kind); if(symbolName == nullptr) { @@ -598,7 +598,7 @@ hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void *src, size_ hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolName, size_t count, size_t offset, hipMemcpyKind kind, hipStream_t stream) { - HIP_INIT_CMD_API(symbolName, dst, count, offset, kind, stream); + HIP_INIT_SPECIAL_API((TRACE_MCMD), symbolName, dst, count, offset, kind, stream); if(symbolName == nullptr) { @@ -807,7 +807,7 @@ hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream) { - HIP_INIT_CMD_API(dst, dpitch, src, spitch, width, height, kind, stream); + HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, dpitch, src, spitch, width, height, kind, stream); if(width > dpitch || width > spitch) return ihipLogStatus(hipErrorUnknown); hipError_t e = hipSuccess; @@ -1041,7 +1041,7 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes ) hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t sizeBytes ) { - HIP_INIT_CMD_API(dst, value, sizeBytes); + HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, value, sizeBytes); hipError_t e = hipSuccess; diff --git a/projects/hip/src/hip_module.cpp b/projects/hip/src/hip_module.cpp index b359e7a63c..da01f23769 100644 --- a/projects/hip/src/hip_module.cpp +++ b/projects/hip/src/hip_module.cpp @@ -352,14 +352,14 @@ hipError_t ihipModuleGetSymbol(hipFunction_t *func, hipModule_t hmod, const char *func = sym; hmod->funcTrack.push_back(*func); } - return ihipLogStatus(ret); + return ret; } hipError_t hipModuleGetFunction(hipFunction_t *hfunc, hipModule_t hmod, const char *name){ HIP_INIT_API(hfunc, hmod, name); - return ihipModuleGetSymbol(hfunc, hmod, name); + return ihipLogStatus(ihipModuleGetSymbol(hfunc, hmod, name)); } diff --git a/projects/hip/src/hip_stream.cpp b/projects/hip/src/hip_stream.cpp index d7f8717725..34b4bc8851 100644 --- a/projects/hip/src/hip_stream.cpp +++ b/projects/hip/src/hip_stream.cpp @@ -150,7 +150,7 @@ hipError_t hipStreamSynchronize(hipStream_t stream) if (stream == NULL) { ihipCtx_t *ctx = ihipGetTlsDefaultCtx(); - ctx->locked_syncDefaultStream(true/*waitOnSelf*/); + ctx->locked_syncDefaultStream(true/*waitOnSelf*/, true/*syncToHost*/); } else { stream->locked_wait(); e = hipSuccess; @@ -174,7 +174,7 @@ hipError_t hipStreamDestroy(hipStream_t stream) //--- Drain the stream: if (stream == NULL) { ihipCtx_t *ctx = ihipGetTlsDefaultCtx(); - ctx->locked_syncDefaultStream(true/*waitOnSelf*/); + ctx->locked_syncDefaultStream(true/*waitOnSelf*/, true /*syncToHost*/); } else { stream->locked_wait(); e = hipSuccess; diff --git a/projects/hip/tests/src/runtimeApi/stream/hipNullStream.cpp b/projects/hip/tests/src/runtimeApi/stream/hipNullStream.cpp index f8d201cb51..380979f6bc 100644 --- a/projects/hip/tests/src/runtimeApi/stream/hipNullStream.cpp +++ b/projects/hip/tests/src/runtimeApi/stream/hipNullStream.cpp @@ -27,8 +27,9 @@ THE SOFTWARE. #include "hip/hip_runtime.h" #include "test_common.h" #include -unsigned p_streams = 6; +unsigned p_streams =16; int p_repeat = 10; +int p_db = 0; template @@ -45,7 +46,7 @@ vectorADDRepeat(hipLaunchParm lp, for (int j=1; j<=repeat;j++) { for (size_t i=offset; i::Streamer(size_t numElements, bool useNullStream) : HIPCHECK(hipStreamCreate(&_stream)); } HIPCHECK(hipEventCreate(&_event)); + + H2D(); + }; +template +void Streamer::H2D() +{ + HIPCHECK(hipMemcpy(_A_d, _A_h, _numElements*sizeof(T), hipMemcpyHostToDevice)); + HIPCHECK(hipMemcpy(_B_d, _B_h, _numElements*sizeof(T), hipMemcpyHostToDevice)); +} + +template +void Streamer::D2H() +{ + HIPCHECK(hipMemcpy(_C_h, _C_d, _numElements*sizeof(T), hipMemcpyDeviceToHost)); +} + +template +void Streamer::reset() +{ + HipTest::setDefaultData(_numElements, _A_h, _B_h, _C_h); + H2D(); + +} + + template void Streamer::enqueAsync() { @@ -131,6 +161,10 @@ void parseMyArguments(int argc, char *argv[]) if (++i >= argc || !HipTest::parseUInt(argv[i], &p_streams)) { failed("Bad streams argument"); } + } else if (!strcmp(arg, "--repeat") || (!strcmp(arg, "-r"))) { + if (++i >= argc || !HipTest::parseInt(argv[i], &p_repeat)) { + failed("Bad repeat argument"); + } } else { failed("Bad argument '%s'", arg); } @@ -138,6 +172,15 @@ void parseMyArguments(int argc, char *argv[]) }; +void +printBuffer(std::string name, int *f, size_t numElements) +{ + std::cout << name << "\n"; + for (size_t i=0; i FloatStreamer; + typedef Streamer IntStreamer; - std::vector streamers; + std::vector streamers; size_t numElements = N; - float *expected_H = (float*)malloc(numElements*sizeof(float)); + int *expected_H = (int*)malloc(numElements*sizeof(int)); - auto nullStreamer = new FloatStreamer(numElements, true); + auto nullStreamer = new IntStreamer(numElements, true); + + // Expected resultr - last streamer runs vectorADDRepeat, then nullstreamer adds lastStreamer->_C_d + lastStreamer->_C_d for (size_t i=0; i_A_h[i]*p_repeat + nullStreamer->_B_h[i] * p_repeat; + expected_H[i] = ((nullStreamer->_A_h[i])*p_repeat + (nullStreamer->_B_h[i]) * p_repeat) *2; } for (int i=0; i Test 0x1 runAsnc\n"); - for (int i=0; ienqueAsync(); + for (int s=1; s Test %x runAsnc, #streams=%d\n", (1<reset(); + + for (int i=0; ienqueAsync(); + } + + auto lastStreamer = streamers[s - 1]; + + // Dispatch to NULL stream, should wait for prior async activity to complete before beginning: + hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, 0/*nullstream*/, lastStreamer->_C_d, lastStreamer->_C_d, nullStreamer->_C_d, numElements, 1/*repeat*/); + + + if (p_db) { + HIPCHECK(hipDeviceSynchronize()); + lastStreamer->D2H(); + printBuffer("lastStream _A_h", lastStreamer->_A_h, min(numElements, size_t(20))); + printBuffer("lastStream _B_h", lastStreamer->_B_h, min(numElements, size_t(20))); + printBuffer("lastStream _C_h", lastStreamer->_C_h, min(numElements, size_t(20))); + } + nullStreamer->D2H(); + HIPCHECK(hipDeviceSynchronize()); + + HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements); } - - auto lastStreamer = streamers[p_streams - 1]; - - // Dispatch to NULL stream, should wait for prior async activity to complete. - unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); - hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, 0/*nullstream*/, lastStreamer->_C_d, lastStreamer->_C_d, nullStreamer->_C_d, numElements, 1/*repeat*/); - HIPCHECK(hipMemcpy(nullStreamer->_C_h, nullStreamer->_C_d, numElements*sizeof(float), hipMemcpyDeviceToHost)); - HIPCHECK(hipStreamSynchronize(0)); - - - HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements); } - if (p_tests & 0x2) { - printf ("==> Test 0x2 runAsnc-odd-only\n"); - for (int i=0; ienqueAsync(); + for (int s=1; sreset(); + printf ("==> Test %x runAsnc-odd-only, #streams=%d\n", tmask, s); + for (int i=0; ienqueAsync(); + } } + auto lastStreamer = streamers[s - 1]; + + // Dispatch to NULL stream, should wait for prior async activity to complete before beginning: + hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, 0/*nullstream*/, lastStreamer->_C_d, lastStreamer->_C_d, nullStreamer->_C_d, numElements, 1/*repeat*/); + + nullStreamer->D2H(); + + HIPCHECK(hipDeviceSynchronize()); + + HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements); } } + // Expected resultr - last streamer runs vectorADDRepeat + for (size_t i=0; i_A_h[i])*p_repeat + (nullStreamer->_B_h[i]) * p_repeat); + } + + if (p_tests & 0x20000) { + + assert (p_streams >=2); // need a couple streams in order to run this test. + nullStreamer->reset(); + printf ("\n==> Test hipStreamSynchronize with defaultStream \n"); + + // Enqueue a long-running job to stream1 + streamers[0]->enqueAsync(); + + // Check to see if synchronizing on a null stream synchronizes all other streams or just the null stream. + // This function follows null stream semantics and will wait for all other blocking streams before returning. + // This will wait on the host + HIPCHECK(hipStreamSynchronize(0)); + + // Copy with stream1, this could go async if the streamSync doesn't synchronize ALL the streams. + HIPCHECK(hipMemcpyAsync(streamers[0]->_C_h, streamers[0]->_C_d, streamers[0]->_numElements*sizeof(int), hipMemcpyDeviceToHost, streamers[1]->_stream)); + + + HIPCHECK(hipDeviceSynchronize()); + + HipTest::checkTest(expected_H, streamers[0]->_C_h, numElements); + } + passed(); } diff --git a/projects/hip/tests/src/test_common.h b/projects/hip/tests/src/test_common.h index 633ee6f825..1a6e51e08e 100644 --- a/projects/hip/tests/src/test_common.h +++ b/projects/hip/tests/src/test_common.h @@ -184,6 +184,20 @@ addCountReverse( const T *A_d, } +void setDefaultData(size_t numElements, T *A_h, T* B_h, T *C_h) +{ + // Initialize the host data: + for (size_t i=0; i void initArraysForHost(T **A_h, T **B_h, T **C_h, size_t N, bool usePinnedHost=false) @@ -217,15 +231,10 @@ void initArraysForHost(T **A_h, T **B_h, T **C_h, } } - // Initialize the host data: - for (size_t i=0; i void initArrays(T **A_d, T **B_d, T **C_d, T **A_h, T **B_h, T **C_h, @@ -367,6 +376,43 @@ void checkVectorADD(T* A_h, T* B_h, T* result_H, size_t N, bool expectMatch=true } +// Assumes C_h contains vector add of A_h + B_h +// Calls the test "failed" macro if a mismatch is detected. +template +void checkTest(T* expected_H, T* result_H, size_t N, bool expectMatch=true) +{ + size_t mismatchCount = 0; + size_t firstMismatch = 0; + size_t mismatchesToPrint = 10; + for (size_t i=0; i Date: Sat, 13 May 2017 16:00:26 +0000 Subject: [PATCH 33/77] Fix HIP_TRACE_API so kernel launch only printed when requested. [ROCm/hip commit: 427f8472aa4fb51203f217f9f69a195579d4c7de] --- projects/hip/src/grid_launch.cpp | 2 +- projects/hip/tests/src/test_common.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/projects/hip/src/grid_launch.cpp b/projects/hip/src/grid_launch.cpp index ffa50dec95..f3b28c5f60 100644 --- a/projects/hip/src/grid_launch.cpp +++ b/projects/hip/src/grid_launch.cpp @@ -54,7 +54,7 @@ namespace hip_impl { if ((HIP_TRACE_API & (1 << TRACE_KCMD)) || HIP_PROFILE_API || - (COMPILE_HIP_DB && HIP_TRACE_API)) { + (COMPILE_HIP_DB && (HIP_TRACE_API & (1< void setDefaultData(size_t numElements, T *A_h, T* B_h, T *C_h) { // Initialize the host data: From bd7a374f2024975e6810265175b8ca743e00a7c5 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 16 May 2017 18:56:40 -0500 Subject: [PATCH 34/77] Make hipMultiThreadStreams1 test a little harsher. Fail faster if synchronization rules are violated. Run vectorAddRevers to read last elements of array first - if the vector add kernel starts before preceding copy finishes we will read stale data and flag the error. Increase default array sizes, so synchronization errors more easily exposed. [ROCm/hip commit: 2e1fec47ab44e3716c49e6f5f67a3512a45d3c46] --- .../multiThread/hipMultiThreadStreams1.cpp | 45 +++++++++++++++---- projects/hip/tests/src/test_common.h | 39 ++++++++++++---- 2 files changed, 66 insertions(+), 18 deletions(-) diff --git a/projects/hip/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp b/projects/hip/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp index 229ceea440..4f73b67ad7 100644 --- a/projects/hip/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp +++ b/projects/hip/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp @@ -29,6 +29,8 @@ THE SOFTWARE. #include "hip/hip_runtime.h" #include "test_common.h" +int p_iters=10; + void printSep() { printf ("======================================================================================\n"); @@ -43,7 +45,7 @@ template< class P=HipTest::Unpinned, class C=HipTest::Memcpy > -void simpleVectorCopy(size_t numElements, int iters, hipStream_t stream) +void simpleVectorAdd(size_t numElements, int iters, hipStream_t stream) { using HipTest::MemTraits; @@ -57,6 +59,24 @@ void simpleVectorCopy(size_t numElements, int iters, hipStream_t stream) T *A_h, *B_h, *C_h; HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, P::isPinned); + for (size_t i=0; i::Copy(B_d, B_h, Nbytes, hipMemcpyHostToDevice, stream); + MemTraits::Copy(A_d, A_h, Nbytes, hipMemcpyHostToDevice, stream); + MemTraits::Copy(C_d, C_h, Nbytes, hipMemcpyHostToDevice, stream); + HIPCHECK (hipDeviceSynchronize()); + + for (size_t i=0; i::Copy(A_d, A_h, Nbytes, hipMemcpyHostToDevice, stream); MemTraits::Copy(B_d, B_h, Nbytes, hipMemcpyHostToDevice, stream); - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements); + //HIPCHECK(hipStreamSynchronize(stream)); + + // This is the null stream? + //hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements); + hipLaunchKernel(HipTest::vectorADDReverse, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements); MemTraits::Copy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, stream); @@ -76,9 +100,9 @@ void simpleVectorCopy(size_t numElements, int iters, hipStream_t stream) } HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, P::isPinned); + std::cout <<" pid" << pid << " success\n"; HIPCHECK (hipDeviceSynchronize()); - std::cout <<" pid" << pid << " success\n"; } template @@ -88,12 +112,14 @@ void test_multiThread_1(std::string testName, hipStream_t stream0, hipStream_t s printf ("%s\n", __func__); std::cout << testName << std::endl; + size_t numElements = N; + // Test 2 threads operating on same stream: - std::thread t1 (simpleVectorCopy, 2000000/*mb*/, 100/*iters*/, stream0); + std::thread t1 (simpleVectorAdd, numElements, p_iters/*iters*/, stream0); if (serialize) { t1.join(); } - std::thread t2 (simpleVectorCopy, 2000000/*mb*/, 100/*iters*/, stream1); + std::thread t2 (simpleVectorAdd, numElements, p_iters/*iters*/, stream1); if (serialize) { t2.join(); } @@ -109,6 +135,7 @@ void test_multiThread_1(std::string testName, hipStream_t stream0, hipStream_t s int main(int argc, char *argv[]) { + N = 8000000; HipTest::parseStandardArguments(argc, argv, true); printf ("info: set device to %d\n", p_gpuDevice); @@ -121,8 +148,8 @@ int main(int argc, char *argv[]) hipStream_t stream; HIPCHECK (hipStreamCreate(&stream)); - simpleVectorCopy (2000000/*mb*/, 10/*iters*/, stream); - simpleVectorCopy (2000000/*mb*/, 10/*iters*/, stream); + simpleVectorAdd (N/*mb*/, 10/*iters*/, stream); + simpleVectorAdd (N/*mb*/, 10/*iters*/, stream); HIPCHECK(hipStreamDestroy(stream)); } @@ -139,8 +166,8 @@ int main(int argc, char *argv[]) } if (p_tests & 0x4) { - test_multiThread_1 ("Multithread with NULL stream", NULL, NULL, false); - test_multiThread_1 ("Multithread with two streams", stream0, stream1, false); + //test_multiThread_1 ("Multithread with NULL stream", NULL, NULL, false); + //test_multiThread_1 ("Multithread with two streams", stream0, stream1, false); test_multiThread_1 ("Multithread with one stream", stream0, stream0, false); } diff --git a/projects/hip/tests/src/test_common.h b/projects/hip/tests/src/test_common.h index 2c6905eea2..bb44c94745 100644 --- a/projects/hip/tests/src/test_common.h +++ b/projects/hip/tests/src/test_common.h @@ -146,6 +146,23 @@ vectorADD(hipLaunchParm lp, } +template +__global__ void +vectorADDReverse(hipLaunchParm lp, + const T *A_d, + const T *B_d, + T *C_d, + size_t NELEM) +{ + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; + + for (int64_t i=NELEM-stride+offset; i>=0; i-=stride) { + C_d[i] = A_d[i] + B_d[i]; + } +} + + template __global__ void addCount( const T *A_d, @@ -343,7 +360,7 @@ inline void initHIPArrays(hipArray **A_d, hipArray **B_d, hipArray **C_d, // Assumes C_h contains vector add of A_h + B_h // Calls the test "failed" macro if a mismatch is detected. template -void checkVectorADD(T* A_h, T* B_h, T* result_H, size_t N, bool expectMatch=true) +size_t checkVectorADD(T* A_h, T* B_h, T* result_H, size_t N, bool expectMatch=true, bool reportMismatch=true) { size_t mismatchCount = 0; size_t firstMismatch = 0; @@ -364,15 +381,19 @@ void checkVectorADD(T* A_h, T* B_h, T* result_H, size_t N, bool expectMatch=true } } - if (expectMatch) { - if (mismatchCount) { - failed("%zu mismatches ; first at index:%zu\n", mismatchCount, firstMismatch); + if (reportMismatch) { + if (expectMatch) { + if (mismatchCount) { + failed("%zu mismatches ; first at index:%zu\n", mismatchCount, firstMismatch); + } + } else { + if (mismatchCount == 0) { + failed("expected mismatches but did not detect any!"); + } } - } else { - if (mismatchCount == 0) { - failed("expected mismatches but did not detect any!"); - } - } + } + + return mismatchCount; } From 4dbebe0409e25f6381a997c6572dbe547930185d Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Tue, 16 May 2017 21:35:40 -0500 Subject: [PATCH 35/77] changed vector types to make sure it generate proper llvm vector types Change-Id: I6c4616dae137dc4eac35e5827dc5b7f3251e0247 [ROCm/hip commit: 9dceccf1365ad058a5ba383d5f83473278796d1b] --- .../hip/include/hip/hcc_detail/hip_fp16.h | 125 +- .../include/hip/hcc_detail/hip_vector_types.h | 4067 +---------------- projects/hip/src/hip_fp16.cpp | 442 +- projects/hip/src/hip_hc_gfx803.ll | 147 +- 4 files changed, 266 insertions(+), 4515 deletions(-) diff --git a/projects/hip/include/hip/hcc_detail/hip_fp16.h b/projects/hip/include/hip/hcc_detail/hip_fp16.h index 0a861b64af..f1f52e4122 100644 --- a/projects/hip/include/hip/hcc_detail/hip_fp16.h +++ b/projects/hip/include/hip/hcc_detail/hip_fp16.h @@ -25,17 +25,6 @@ THE SOFTWARE. #include "hip/hcc_detail/hip_vector_types.h" -#if __clang_major__ > 3 - -typedef __fp16 __half; - -typedef struct __attribute__((aligned(4))){ - union { - __half p[2]; - unsigned int q; - }; -} __half2; - typedef __half half; typedef __half2 half2; @@ -214,10 +203,10 @@ __device__ __half __ushort2half_ru(unsigned short int i); __device__ __half __ushort2half_rz(unsigned short int i); __device__ __half __ushort_as_half(const unsigned short int i); -extern "C" int __hip_hc_ir_hadd2_int(int, int); -extern "C" int __hip_hc_ir_hfma2_int(int, int, int); -extern "C" int __hip_hc_ir_hmul2_int(int, int); -extern "C" int __hip_hc_ir_hsub2_int(int, int); +extern "C" __half2 __hip_hc_ir_hadd2_int(__half2, __half2); +extern "C" __half2 __hip_hc_ir_hfma2_int(__half2, __half2, __half2); +extern "C" __half2 __hip_hc_ir_hmul2_int(__half2, __half2); +extern "C" __half2 __hip_hc_ir_hsub2_int(__half2, __half2); extern "C" __half __hip_hc_ir_hceil_half(__half) __asm("llvm.ceil.f16"); extern "C" __half __hip_hc_ir_hcos_half(__half) __asm("llvm.cos.f16"); @@ -231,16 +220,16 @@ extern "C" __half __hip_hc_ir_hsin_half(__half) __asm("llvm.sin.f16"); extern "C" __half __hip_hc_ir_hsqrt_half(__half) __asm("llvm.sqrt.f16"); extern "C" __half __hip_hc_ir_htrunc_half(__half) __asm("llvm.trunc.f16"); -extern "C" int __hip_hc_ir_h2ceil_int(int); -extern "C" int __hip_hc_ir_h2cos_int(int); -extern "C" int __hip_hc_ir_h2exp2_int(int); -extern "C" int __hip_hc_ir_h2floor_int(int); -extern "C" int __hip_hc_ir_h2log2_int(int); -extern "C" int __hip_hc_ir_h2rcp_int(int); -extern "C" int __hip_hc_ir_h2rsqrt_int(int); -extern "C" int __hip_hc_ir_h2sin_int(int); -extern "C" int __hip_hc_ir_h2sqrt_int(int); -extern "C" int __hip_hc_ir_h2trunc_int(int); +extern "C" __half2 __hip_hc_ir_h2ceil_int(__half2); +extern "C" __half2 __hip_hc_ir_h2cos_int(__half2); +extern "C" __half2 __hip_hc_ir_h2exp2_int(__half2); +extern "C" __half2 __hip_hc_ir_h2floor_int(__half2); +extern "C" __half2 __hip_hc_ir_h2log2_int(__half2); +extern "C" __half2 __hip_hc_ir_h2rcp_int(__half2); +extern "C" __half2 __hip_hc_ir_h2rsqrt_int(__half2); +extern "C" __half2 __hip_hc_ir_h2sin_int(__half2); +extern "C" __half2 __hip_hc_ir_h2sqrt_int(__half2); +extern "C" __half2 __hip_hc_ir_h2trunc_int(__half2); /* Half2 Arithmetic Functions @@ -248,63 +237,63 @@ extern "C" int __hip_hc_ir_h2trunc_int(int); __device__ static inline __half2 __hadd2(__half2 a, __half2 b) { __half2 c; - c.q = __hip_hc_ir_hadd2_int(a.q, b.q); + c.xy = __hip_hc_ir_hadd2_int(a.xy, b.xy); return c; } __device__ static inline __half2 __hadd2_sat(__half2 a, __half2 b) { __half2 c; - c.q = __hip_hc_ir_hadd2_int(a.q, b.q); + c.xy = __hip_hc_ir_hadd2_int(a.xy, b.xy); return c; } __device__ static inline __half2 __hfma2(__half2 a, __half2 b, __half2 c) { __half2 d; - d.q = __hip_hc_ir_hfma2_int(a.q, b.q, c.q); + d.xy = __hip_hc_ir_hfma2_int(a.xy, b.xy, c.xy); return d; } __device__ static inline __half2 __hfma2_sat(__half2 a, __half2 b, __half2 c) { __half2 d; - d.q = __hip_hc_ir_hfma2_int(a.q, b.q, c.q); + d.xy = __hip_hc_ir_hfma2_int(a.xy, b.xy, c.xy); return d; } __device__ static inline __half2 __hmul2(__half2 a, __half2 b) { __half2 c; - c.q = __hip_hc_ir_hmul2_int(a.q, b.q); + c.xy = __hip_hc_ir_hmul2_int(a.xy, b.xy); return c; } __device__ static inline __half2 __hmul2_sat(__half2 a, __half2 b) { __half2 c; - c.q = __hip_hc_ir_hmul2_int(a.q, b.q); + c.xy = __hip_hc_ir_hmul2_int(a.xy, b.xy); return c; } __device__ static inline __half2 __hsub2(__half2 a, __half2 b) { __half2 c; - c.q = __hip_hc_ir_hsub2_int(a.q, b.q); + c.xy = __hip_hc_ir_hsub2_int(a.xy, b.xy); return c; } __device__ static inline __half2 __hneg2(__half2 a) { __half2 c; - c.p[0] = - a.p[0]; - c.p[1] = - a.p[1]; + c.x = - a.x; + c.y = - a.y; return c; } __device__ static inline __half2 __hsub2_sat(__half2 a, __half2 b) { __half2 c; - c.q = __hip_hc_ir_hsub2_int(a.q, b.q); + c.xy = __hip_hc_ir_hsub2_int(a.xy, b.xy); return c; } __device__ static inline __half2 h2div(__half2 a, __half2 b) { __half2 c; - c.p[0] = a.p[0] / b.p[0]; - c.p[1] = a.p[1] / b.p[1]; + c.x = a.x / b.x; + c.y = a.y / b.y; return c; } @@ -375,112 +364,94 @@ Half2 Math Operations __device__ static inline __half2 h2ceil(const __half2 h) { __half2 a; - a.q = __hip_hc_ir_h2ceil_int(h.q); + a.xy = __hip_hc_ir_h2ceil_int(h.xy); return a; } __device__ static inline __half2 h2cos(const __half2 h) { __half2 a; - a.q = __hip_hc_ir_h2cos_int(h.q); + a.xy = __hip_hc_ir_h2cos_int(h.xy); return a; } __device__ static inline __half2 h2exp(const __half2 h) { __half2 factor; - factor.p[0] = 1.442694; - factor.p[1] = 1.442694; - factor.q = __hip_hc_ir_h2exp2_int(__hip_hc_ir_hmul2_int(h.q, factor.q)); + factor.x = 1.442694; + factor.y = 1.442694; + factor.xy = __hip_hc_ir_h2exp2_int(__hip_hc_ir_hmul2_int(h.xy, factor.xy)); return factor; } __device__ static inline __half2 h2exp10(const __half2 h) { __half2 factor; - factor.p[0] = 3.3219281; - factor.p[1] = 3.3219281; - factor.q = __hip_hc_ir_h2exp2_int(__hip_hc_ir_hmul2_int(h.q, factor.q)); + factor.x = 3.3219281; + factor.y = 3.3219281; + factor.xy = __hip_hc_ir_h2exp2_int(__hip_hc_ir_hmul2_int(h.xy, factor.xy)); return factor; } __device__ static inline __half2 h2exp2(const __half2 h) { __half2 a; - a.q = __hip_hc_ir_h2exp2_int(h.q); + a.xy = __hip_hc_ir_h2exp2_int(h.xy); return a; } __device__ static inline __half2 h2floor(const __half2 h) { __half2 a; - a.q = __hip_hc_ir_h2floor_int(h.q); + a.xy = __hip_hc_ir_h2floor_int(h.xy); return a; } __device__ static inline __half2 h2log(const __half2 h) { __half2 factor; - factor.p[0] = 0.693147; - factor.p[1] = 0.693147; - factor. q = __hip_hc_ir_hmul2_int(__hip_hc_ir_h2log2_int(h.q), factor.q); + factor.x = 0.693147; + factor.y = 0.693147; + factor.xy = __hip_hc_ir_hmul2_int(__hip_hc_ir_h2log2_int(h.xy), factor.xy); return factor; } __device__ static inline __half2 h2log10(const __half2 h) { __half2 factor; - factor.p[0] = 0.301029; - factor.p[1] = 0.301029; - factor.q = __hip_hc_ir_hmul2_int(__hip_hc_ir_h2log2_int(h.q), factor.q); + factor.x = 0.301029; + factor.y = 0.301029; + factor.xy = __hip_hc_ir_hmul2_int(__hip_hc_ir_h2log2_int(h.xy), factor.xy); return factor; } __device__ static inline __half2 h2log2(const __half2 h) { __half2 a; - a.q = __hip_hc_ir_h2log2_int(h.q); + a.xy = __hip_hc_ir_h2log2_int(h.xy); return a; } __device__ static inline __half2 h2rcp(const __half2 h) { __half2 a; - a.q = __hip_hc_ir_h2rcp_int(h.q); + a.xy = __hip_hc_ir_h2rcp_int(h.xy); return a; } __device__ static inline __half2 h2rsqrt(const __half2 h) { __half2 a; - a.q = __hip_hc_ir_h2rsqrt_int(h.q); + a.xy = __hip_hc_ir_h2rsqrt_int(h.xy); return a; } __device__ static inline __half2 h2sin(const __half2 h) { __half2 a; - a.q = __hip_hc_ir_h2sin_int(h.q); + a.xy = __hip_hc_ir_h2sin_int(h.xy); return a; } __device__ static inline __half2 h2sqrt(const __half2 h) { __half2 a; - a.q = __hip_hc_ir_h2sqrt_int(h.q); + a.xy = __hip_hc_ir_h2sqrt_int(h.xy); return a; } __device__ static inline __half2 h2trunc(const __half2 h) { __half2 a; - a.q = __hip_hc_ir_h2trunc_int(h.q); + a.xy = __hip_hc_ir_h2trunc_int(h.xy); return a; } -#endif - -#if __clang_major__ == 3 - -typedef struct { - unsigned x: 16; -} __half; - -typedef struct __attribute__((aligned(4))){ - union { - __half p[2]; - unsigned int q; - }; -} __half2; - - -#endif - #endif diff --git a/projects/hip/include/hip/hcc_detail/hip_vector_types.h b/projects/hip/include/hip/hcc_detail/hip_vector_types.h index 35c6c23548..251da504ab 100644 --- a/projects/hip/include/hip/hcc_detail/hip_vector_types.h +++ b/projects/hip/include/hip/hcc_detail/hip_vector_types.h @@ -34,1120 +34,93 @@ THE SOFTWARE. #include "hip/hcc_detail/host_defines.h" -#define MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(type) \ -__device__ __host__ type() {} \ -__device__ __host__ type(type& val) : x(val.x) { } \ -__device__ __host__ type(const type& val) : x(val.x) { } - -#define MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(type) \ -__device__ __host__ type() {} \ -__device__ __host__ type(type& val) : x(val.x), y(val.y) { } \ -__device__ __host__ type(const type& val) : x(val.x), y(val.y) { } - -#define MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(type) \ -__device__ __host__ type() {} \ -__device__ __host__ type(type& val) : x(val.x), y(val.y), z(val.z) { } \ -__device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z) { } - -#define MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(type) \ -__device__ __host__ type() {} \ -__device__ __host__ type(type& val) : x(val.x), y(val.y), z(val.z), w(val.w) { } \ -__device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z), w(val.w) { } - - -#define MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(type, type1) \ -__device__ __host__ type(type1 val) : x(val) {} \ - -#define MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(type, type1) \ -__device__ __host__ type(type1 val) : x(val), y(val) {} \ -__device__ __host__ type(type1 val1, type1 val2) : x(val1), y(val2) {} - -#define MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(type, type1) \ -__device__ __host__ type(type1 val) : x(val), y(val), z(val) {} \ -__device__ __host__ type(type1 val1, type1 val2, type1 val3) : x(val1), y(val2), z(val3) {} - -#define MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(type, type1) \ -__device__ __host__ type(type1 val) : x(val), y(val), z(val), w(val) {} \ -__device__ __host__ type(type1 val1, type1 val2, type1 val3, type1 val4) : x(val1), y(val2), z(val3), w(val4) {} - -struct uchar1 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(uchar1) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, signed char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, signed short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, signed int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, float) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, double) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, signed long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, signed long long) - - #endif - unsigned char x; - -} __attribute__((aligned(1))); - -struct uchar2 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(uchar2) - - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, signed char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, signed short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, signed int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, float) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, double) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, signed long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, signed long long) - #endif - union { - struct { - unsigned char x, y; - }; - unsigned short a; - }; -} __attribute__((aligned(2))); - -struct uchar3 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(uchar3) - - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, signed char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, signed short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, signed int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, float) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, double) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, signed long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, signed long long) - #endif - unsigned char x, y, z; -}; - -struct uchar4 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(uchar4) - - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, signed char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, signed short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, signed int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, float) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, double) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, signed long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, signed long long) - #endif - union { - struct { - unsigned char x, y, z, w; - }; - unsigned int a; - }; -} __attribute__((aligned(4))); - - -struct char1 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(char1) - - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, signed char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, signed short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, signed int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, float) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, double) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, signed long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, signed long long) - #endif - signed char x; -} __attribute__((aligned(1))); - -struct char2 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(char2) - - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, signed char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, signed short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, signed int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, float) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, double) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, signed long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, signed long long) - #endif - union { - struct { - signed char x, y; - }; - unsigned short a; - }; -} __attribute__((aligned(2))); - -struct char3 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(char3) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, signed char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, signed short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, signed int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, float) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, double) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, signed long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, signed long long) - #endif - signed char x, y, z; -}; - -struct char4 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(char4) - - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, signed char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, signed short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, signed int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, float) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, double) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, signed long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, signed long long) - #endif - union { - struct { - signed char x, y, z, w; - }; - unsigned int a; - }; -} __attribute__((aligned(4))); - - - -struct ushort1 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(ushort1) - - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, signed char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, signed short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, signed int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, float) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, double) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, signed long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, signed long long) - #endif - unsigned short x; -} __attribute__((aligned(2))); - -struct ushort2 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(ushort2) - - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, signed char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, signed short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, signed int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, float) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, double) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, signed long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, signed long long) - #endif - union { - struct { - unsigned short x, y; - }; - unsigned int a; - }; -} __attribute__((aligned(4))); - -struct ushort3 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(ushort3) - - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, signed char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, signed short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, signed int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, float) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, double) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, signed long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, signed long long) - #endif - unsigned short x, y, z; -}; - -struct ushort4 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(ushort4) - - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, signed char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, signed short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, signed int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, float) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, double) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, signed long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, signed long long) - #endif - union { - struct { - unsigned short x, y, z, w; - }; - unsigned int a, b; - }; -} __attribute__((aligned(8))); - -struct short1 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(short1) - - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, signed char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, signed short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, signed int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, float) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, double) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, signed long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, signed long long) - #endif - signed short x; -} __attribute__((aligned(2))); - -struct short2 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(short2) - - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, signed char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, signed short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, signed int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, float) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, double) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, signed long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, signed long long) - #endif - union { - struct { - signed short x, y; - }; - unsigned int a; - }; - -} __attribute__((aligned(4))); - -struct short3 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(short3) - - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, signed char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, signed short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, signed int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, float) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, double) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, signed long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, signed long long) - #endif - signed short x, y, z; -}; - -struct short4 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(short4) - - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, signed char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, signed short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, signed int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, float) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, double) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, signed long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, signed long long) - #endif - union { - struct { - signed short x, y, z, w; - }; - unsigned int a, b; - }; -} __attribute__((aligned(8))); - - -struct uint1 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(uint1) - - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, signed char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, signed short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, signed int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, float) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, double) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, signed long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, signed long long) - #endif - unsigned int x; -} __attribute__((aligned(4))); - -struct uint2 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(uint2) - - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, signed char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, signed short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, signed int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, float) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, double) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, signed long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, signed long long) - #endif - unsigned int x, y; -} __attribute__((aligned(8))); - -struct uint3 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(uint3) - - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, signed char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, signed short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, signed int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, float) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, double) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, signed long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, signed long long) - #endif - unsigned int x, y, z; -}; - -struct uint4 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(uint4) - - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, signed char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, signed short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, signed int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, float) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, double) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, signed long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, signed long long) - #endif - unsigned int x, y, z, w; -} __attribute__((aligned(16))); - -struct int1 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(int1) - - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, signed char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, signed short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, signed int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, float) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, double) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, signed long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, signed long long) - #endif - signed int x; -} __attribute__((aligned(4))); - -struct int2 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(int2) - - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, signed char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, signed short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, signed int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, float) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, double) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, signed long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, signed long long) - #endif - signed int x, y; -} __attribute__((aligned(8))); - -struct int3 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(int3) - - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, signed char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, signed short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, signed int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, float) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, double) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, signed long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, signed long long) - #endif - signed int x, y, z; -}; - -struct int4 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(int4) - - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, signed char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, signed short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, signed int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, float) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, double) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, signed long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, signed long long) - #endif - signed int x, y, z, w; -} __attribute__((aligned(16))); - - -struct float1 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(float1) - - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, signed char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, signed short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, signed int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, float) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, double) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, signed long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, signed long long) - #endif - float x; -} __attribute__((aligned(4))); - -struct float2 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(float2) - - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, signed char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, signed short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, signed int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, float) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, double) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, signed long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, signed long long) - #endif - float x, y; -} __attribute__((aligned(8))); - -struct float3 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(float3) - - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, signed char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, signed short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, signed int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, float) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, double) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, signed long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, signed long long) - #endif - float x, y, z; -}; - -struct float4 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(float4) - - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, signed char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, signed short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, signed int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, float) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, double) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, signed long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, signed long long) - #endif - float x, y, z, w; -} __attribute__((aligned(16))); - - - -struct double1 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(double1) - - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, signed char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, signed short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, signed int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, float) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, double) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, signed long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, signed long long) - #endif - double x; -} __attribute__((aligned(8))); - -struct double2 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(double2) - - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, signed char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, signed short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, signed int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, float) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, double) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, signed long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, signed long long) - #endif - double x, y; -} __attribute__((aligned(16))); - -struct double3 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(double3) - - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, signed char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, signed short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, signed int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, float) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, double) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, signed long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, signed long long) - #endif - double x, y, z; -}; - -struct double4 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(double4) - - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, signed char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, signed short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, signed int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, float) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, double) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, signed long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, signed long long) - #endif - double x, y, z, w; -} __attribute__((aligned(32))); - - -struct ulong1 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(ulong1) - - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, signed char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, signed short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, signed int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, float) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, double) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, signed long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, signed long long) - #endif - unsigned long x; -} __attribute__((aligned(8))); - -struct ulong2 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(ulong2) - - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, signed char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, signed short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, signed int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, float) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, double) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, signed long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, signed long long) - #endif - unsigned long x, y; -} __attribute__((aligned(16))); - -struct ulong3 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(ulong3) - - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, signed char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, signed short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, signed int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, float) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, double) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, signed long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, signed long long) - #endif - unsigned long x, y, z; -}; - -struct ulong4 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(ulong4) - - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, signed char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, signed short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, signed int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, float) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, double) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, signed long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, signed long long) - #endif - unsigned long x, y, z, w; -} __attribute__((aligned(32))); - - -struct long1 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(long1) - - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, signed char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, signed short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, signed int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, float) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, double) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, signed long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, signed long long) - #endif - signed long x; -} __attribute__((aligned(8))); - -struct long2 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(long2) - - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, signed char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, signed short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, signed int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, float) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, double) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, signed long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, signed long long) - #endif - signed long x, y; -} __attribute__((aligned(16))); - -struct long3 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(long3) - - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, signed char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, signed short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, signed int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, float) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, double) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, signed long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, signed long long) - #endif - signed long x, y, z; -}; - -struct long4 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(long4) - - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, signed char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, signed short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, signed int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, float) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, double) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, signed long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, signed long long) - #endif - signed long x, y, z, w; -} __attribute__((aligned(32))); - - -struct ulonglong1 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1) - - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, signed char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, signed short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, signed int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, float) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, double) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, signed long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, signed long long) - #endif - unsigned long long x; -} __attribute__((aligned(8))); - -struct ulonglong2 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2) - - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, signed char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, signed short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, signed int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, float) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, double) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, signed long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, signed long long) - #endif - unsigned long long x, y; -} __attribute__((aligned(16))); - -struct ulonglong3 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3) - - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, signed char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, signed short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, signed int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, float) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, double) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, signed long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, signed long long) - #endif - unsigned long long x, y, z; -}; - -struct ulonglong4 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4) - - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, signed char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, signed short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, signed int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, float) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, double) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, signed long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, signed long long) - #endif - unsigned long long x, y, z, w; -} __attribute__((aligned(32))); - - -struct longlong1 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(longlong1) - - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, signed char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, signed short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, signed int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, float) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, double) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, signed long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, signed long long) - #endif - signed long long x; -} __attribute__((aligned(8))); - -struct longlong2 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(longlong2) - - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, signed char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, signed short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, signed int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, float) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, double) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, signed long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, signed long long) - #endif - signed long long x, y; -} __attribute__((aligned(16))); - -struct longlong3 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(longlong3) - - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, signed char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, signed short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, signed int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, float) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, double) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, signed long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, signed long long) - #endif - signed long long x, y, z; -}; - -struct longlong4 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(longlong4) - - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, signed char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, signed short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, signed int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, float) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, double) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, signed long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, signed long long) - #endif - signed long x, y, z, w; -} __attribute__((aligned(32))); +#if __cplusplus + +typedef unsigned char uchar1 __attribute__((ext_vector_type(1))); +typedef unsigned char uchar2 __attribute__((ext_vector_type(2))); +typedef unsigned char uchar3 __attribute__((ext_vector_type(3))); +typedef unsigned char uchar4 __attribute__((ext_vector_type(4))); + +typedef signed char char1 __attribute__((ext_vector_type(1))); +typedef signed char char2 __attribute__((ext_vector_type(2))); +typedef signed char char3 __attribute__((ext_vector_type(3))); +typedef signed char char4 __attribute__((ext_vector_type(4))); + +typedef unsigned short ushort1 __attribute__((ext_vector_type(1))); +typedef unsigned short ushort2 __attribute__((ext_vector_type(2))); +typedef unsigned short ushort3 __attribute__((ext_vector_type(3))); +typedef unsigned short ushort4 __attribute__((ext_vector_type(4))); + +typedef signed short short1 __attribute__((ext_vector_type(1))); +typedef signed short short2 __attribute__((ext_vector_type(2))); +typedef signed short short3 __attribute__((ext_vector_type(3))); +typedef signed short short4 __attribute__((ext_vector_type(4))); + +typedef __fp16 __half; + +typedef __fp16 __half1 __attribute__((ext_vector_type(1))); +typedef __fp16 __half2 __attribute__((ext_vector_type(2))); +typedef __fp16 __half3 __attribute__((ext_vector_type(3))); +typedef __fp16 __half4 __attribute__((ext_vector_type(4))); + +typedef unsigned int uint1 __attribute__((ext_vector_type(1))); +typedef unsigned int uint2 __attribute__((ext_vector_type(2))); +typedef unsigned int uint3 __attribute__((ext_vector_type(3))); +typedef unsigned int uint4 __attribute__((ext_vector_type(4))); + +typedef signed int int1 __attribute__((ext_vector_type(1))); +typedef signed int int2 __attribute__((ext_vector_type(2))); +typedef signed int int3 __attribute__((ext_vector_type(3))); +typedef signed int int4 __attribute__((ext_vector_type(4))); + +typedef float float1 __attribute__((ext_vector_type(1))); +typedef float float2 __attribute__((ext_vector_type(2))); +typedef float float3 __attribute__((ext_vector_type(3))); +typedef float float4 __attribute__((ext_vector_type(4))); + +typedef unsigned long ulong1 __attribute__((ext_vector_type(1))); +typedef unsigned long ulong2 __attribute__((ext_vector_type(2))); +typedef unsigned long ulong3 __attribute__((ext_vector_type(3))); +typedef unsigned long ulong4 __attribute__((ext_vector_type(4))); + +typedef signed long long1 __attribute__((ext_vector_type(1))); +typedef signed long long2 __attribute__((ext_vector_type(2))); +typedef signed long long3 __attribute__((ext_vector_type(3))); +typedef signed long long4 __attribute__((ext_vector_type(4))); + +typedef double double1 __attribute__((ext_vector_type(1))); +typedef double double2 __attribute__((ext_vector_type(2))); +typedef double double3 __attribute__((ext_vector_type(3))); +typedef double double4 __attribute__((ext_vector_type(4))); + +typedef unsigned long long ulonglong1 __attribute__((ext_vector_type(1))); +typedef unsigned long long ulonglong2 __attribute__((ext_vector_type(2))); +typedef unsigned long long ulonglong3 __attribute__((ext_vector_type(3))); +typedef unsigned long long ulonglong4 __attribute__((ext_vector_type(4))); + +typedef signed long long longlong1 __attribute__((ext_vector_type(1))); +typedef signed long long longlong2 __attribute__((ext_vector_type(2))); +typedef signed long long longlong3 __attribute__((ext_vector_type(3))); +typedef signed long long longlong4 __attribute__((ext_vector_type(4))); #define DECLOP_MAKE_ONE_COMPONENT(comp, type) \ -__device__ __host__ static inline struct type make_##type(comp x) { \ - struct type ret; \ +__device__ __host__ static inline type make_##type(comp x) { \ + type ret; \ ret.x = x; \ return ret; \ } #define DECLOP_MAKE_TWO_COMPONENT(comp, type) \ -__device__ __host__ static inline struct type make_##type(comp x, comp y) { \ - struct type ret; \ +__device__ __host__ static inline type make_##type(comp x, comp y) { \ + type ret; \ ret.x = x; \ ret.y = y; \ return ret; \ } #define DECLOP_MAKE_THREE_COMPONENT(comp, type) \ -__device__ __host__ static inline struct type make_##type(comp x, comp y, comp z) { \ - struct type ret; \ +__device__ __host__ static inline type make_##type(comp x, comp y, comp z) { \ + type ret; \ ret.x = x; \ ret.y = y; \ ret.z = z; \ @@ -1155,8 +128,8 @@ __device__ __host__ static inline struct type make_##type(comp x, comp y, comp z } #define DECLOP_MAKE_FOUR_COMPONENT(comp, type) \ -__device__ __host__ static inline struct type make_##type(comp x, comp y, comp z, comp w) { \ - struct type ret; \ +__device__ __host__ static inline type make_##type(comp x, comp y, comp z, comp w) { \ + type ret; \ ret.x = x; \ ret.y = y; \ ret.z = z; \ @@ -1164,6 +137,7 @@ __device__ __host__ static inline struct type make_##type(comp x, comp y, comp z return ret; \ } + DECLOP_MAKE_ONE_COMPONENT(unsigned char, uchar1); DECLOP_MAKE_TWO_COMPONENT(unsigned char, uchar2); DECLOP_MAKE_THREE_COMPONENT(unsigned char, uchar3); @@ -1225,2894 +199,9 @@ DECLOP_MAKE_THREE_COMPONENT(signed long, longlong3); DECLOP_MAKE_FOUR_COMPONENT(signed long, longlong4); -#if __cplusplus - -#define DECLOP_1VAR_2IN_1OUT(type, op) \ -__device__ __host__ static inline type operator op (const type& lhs, const type& rhs) { \ - type ret; \ - ret.x = lhs.x op rhs.x; \ - return ret; \ -} - -#define DECLOP_1VAR_SCALE_PRODUCT(type, type1) \ -__device__ __host__ static inline type operator * (const type& lhs, type1 rhs) { \ - type ret; \ - ret.x = lhs.x * rhs; \ - return ret; \ -} \ -\ -__device__ __host__ static inline type operator * (type1 lhs, const type& rhs) { \ - type ret; \ - ret.x = lhs * rhs.x; \ - return ret; \ -} - -#define DECLOP_1VAR_ASSIGN(type, op) \ -__device__ __host__ static inline type& operator op ( type& lhs, const type& rhs) { \ - lhs.x op rhs.x; \ - return lhs; \ -} - -#define DECLOP_1VAR_PREOP(type, op) \ -__device__ __host__ static inline type& operator op (type& val) { \ - op val.x; \ - return val; \ -} - -#define DECLOP_1VAR_POSTOP(type, op) \ -__device__ __host__ static inline type operator op (type& val, int) { \ - type ret; \ - ret.x = val.x; \ - val.x op; \ - return ret; \ -} - -#define DECLOP_1VAR_COMP(type, op) \ -__device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \ - return lhs.x op rhs.x; \ -} \ -__device__ __host__ static inline bool operator op (const type& lhs, type& rhs) { \ - return lhs.x op rhs.x; \ -} \ -__device__ __host__ static inline bool operator op (type& lhs, const type& rhs) { \ - return lhs.x op rhs.x ; \ -} \ -__device__ __host__ static inline bool operator op (const type& lhs, const type& rhs) { \ - return lhs.x op rhs.x ; \ -} - -#define DECLOP_1VAR_1IN_1OUT(type, op) \ -__device__ __host__ static inline type operator op(type& rhs) { \ - type ret; \ - ret.x = op rhs.x; \ - return ret; \ -} - -#define DECLOP_1VAR_1IN_BOOLOUT(type, op) \ -__device__ __host__ static inline bool operator op (type& rhs) { \ - return op rhs.x; \ -} - -/* - Two Element Access -*/ - -#define DECLOP_2VAR_2IN_1OUT(type, op) \ -__device__ __host__ static inline type operator op (const type& lhs, const type& rhs) { \ - type ret; \ - ret.x = lhs.x op rhs.x; \ - ret.y = lhs.y op rhs.y; \ - return ret; \ -} - -#define DECLOP_2VAR_SCALE_PRODUCT(type, type1) \ -__device__ __host__ static inline type operator * (const type& lhs, type1 rhs) { \ - type ret; \ - ret.x = lhs.x * rhs; \ - ret.y = lhs.y * rhs; \ - return ret; \ -} \ -\ -__device__ __host__ static inline type operator * (type1 lhs, const type& rhs) { \ - type ret; \ - ret.x = lhs * rhs.x; \ - ret.y = lhs * rhs.y; \ - return ret; \ -} - -#define DECLOP_2VAR_ASSIGN(type, op) \ -__device__ __host__ static inline type& operator op ( type& lhs, const type& rhs) { \ - lhs.x op rhs.x; \ - lhs.y op rhs.y; \ - return lhs; \ -} - -#define DECLOP_2VAR_PREOP(type, op) \ -__device__ __host__ static inline type& operator op (type& val) { \ - op val.x; \ - op val.y; \ - return val; \ -} - -#define DECLOP_2VAR_POSTOP(type, op) \ -__device__ __host__ static inline type operator op (type& val, int) { \ - type ret; \ - ret.x = val.x; \ - ret.y = val.y; \ - val.x op; \ - val.y op; \ - return ret; \ -} - -#define DECLOP_2VAR_COMP(type, op) \ -__device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \ - return (lhs.x op rhs.x) && (lhs.y op rhs.y); \ -} \ -__device__ __host__ static inline bool operator op (const type& lhs, type& rhs) { \ - return (lhs.x op rhs.x) && (lhs.y op rhs.y); \ -} \ -__device__ __host__ static inline bool operator op (type& lhs, const type& rhs) { \ - return (lhs.x op rhs.x) && (lhs.y op rhs.y); \ -} \ -__device__ __host__ static inline bool operator op (const type& lhs, const type& rhs) { \ - return (lhs.x op rhs.x) && (lhs.y op rhs.y); \ -} - -#define DECLOP_2VAR_1IN_1OUT(type, op) \ -__device__ __host__ static inline type operator op(type &rhs) { \ - type ret; \ - ret.x = op rhs.x; \ - ret.y = op rhs.y; \ - return ret; \ -} - -#define DECLOP_2VAR_1IN_BOOLOUT(type, op) \ -__device__ __host__ static inline bool operator op (type &rhs) { \ - return (op rhs.x) && (op rhs.y); \ -} - - -/* - Three Element Access -*/ - -#define DECLOP_3VAR_2IN_1OUT(type, op) \ -__device__ __host__ static inline type operator op (const type& lhs, const type& rhs) { \ - type ret; \ - ret.x = lhs.x op rhs.x; \ - ret.y = lhs.y op rhs.y; \ - ret.z = lhs.z op rhs.z; \ - return ret; \ -} - -#define DECLOP_3VAR_SCALE_PRODUCT(type, type1) \ -__device__ __host__ static inline type operator * (const type& lhs, type1 rhs) { \ - type ret; \ - ret.x = lhs.x * rhs; \ - ret.y = lhs.y * rhs; \ - ret.z = lhs.z * rhs; \ - return ret; \ -} \ -\ -__device__ __host__ static inline type operator * (type1 lhs, const type& rhs) { \ - type ret; \ - ret.x = lhs * rhs.x; \ - ret.y = lhs * rhs.y; \ - ret.z = lhs * rhs.z; \ - return ret; \ -} - -#define DECLOP_3VAR_ASSIGN(type, op) \ -__device__ __host__ static inline type& operator op ( type& lhs, const type& rhs) { \ - lhs.x op rhs.x; \ - lhs.y op rhs.y; \ - lhs.z op rhs.z; \ - return lhs; \ -} - -#define DECLOP_3VAR_PREOP(type, op) \ -__device__ __host__ static inline type& operator op (type& val) { \ - op val.x; \ - op val.y; \ - op val.z; \ - return val; \ -} - -#define DECLOP_3VAR_POSTOP(type, op) \ -__device__ __host__ static inline type operator op (type& val, int) { \ - type ret; \ - ret.x = val.x; \ - ret.y = val.y; \ - ret.z = val.z; \ - val.x op; \ - val.y op; \ - val.z op; \ - return ret; \ -} - -#define DECLOP_3VAR_COMP(type, op) \ -__device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \ - return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z); \ -} \ -__device__ __host__ static inline bool operator op (const type& lhs, type& rhs) { \ - return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z); \ -} \ -__device__ __host__ static inline bool operator op (type& lhs, const type& rhs) { \ - return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z); \ -} \ -__device__ __host__ static inline bool operator op (const type& lhs, const type& rhs) { \ - return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z); \ -} \ - -#define DECLOP_3VAR_1IN_1OUT(type, op) \ -__device__ __host__ static inline type operator op(type &rhs) { \ - type ret; \ - ret.x = op rhs.x; \ - ret.y = op rhs.y; \ - ret.z = op rhs.z; \ - return ret; \ -} - -#define DECLOP_3VAR_1IN_BOOLOUT(type, op) \ -__device__ __host__ static inline bool operator op (type &rhs) { \ - return (op rhs.x) && (op rhs.y) && (op rhs.z); \ -} - - -/* - Four Element Access -*/ - -#define DECLOP_4VAR_2IN_1OUT(type, op) \ -__device__ __host__ static inline type operator op ( const type& lhs, const type& rhs) { \ - type ret; \ - ret.x = lhs.x op rhs.x; \ - ret.y = lhs.y op rhs.y; \ - ret.z = lhs.z op rhs.z; \ - ret.w = lhs.w op rhs.w; \ - return ret; \ -} - -#define DECLOP_4VAR_SCALE_PRODUCT(type, type1) \ -__device__ __host__ static inline type operator * (const type& lhs, type1 rhs) { \ - type ret; \ - ret.x = lhs.x * rhs; \ - ret.y = lhs.y * rhs; \ - ret.z = lhs.z * rhs; \ - ret.w = lhs.w * rhs; \ - return ret; \ -} \ -\ -__device__ __host__ static inline type operator * (type1 lhs, const type& rhs) { \ - type ret; \ - ret.x = lhs * rhs.x; \ - ret.y = lhs * rhs.y; \ - ret.z = lhs * rhs.z; \ - ret.w = lhs * rhs.w; \ - return ret; \ -} - -#define DECLOP_4VAR_ASSIGN(type, op) \ -__device__ __host__ static inline type& operator op ( type& lhs, const type& rhs) { \ - lhs.x op rhs.x; \ - lhs.y op rhs.y; \ - lhs.z op rhs.z; \ - lhs.w op rhs.w; \ - return lhs; \ -} - -#define DECLOP_4VAR_PREOP(type, op) \ -__device__ __host__ static inline type& operator op (type& val) { \ - op val.x; \ - op val.y; \ - op val.z; \ - op val.w; \ - return val; \ -} - -#define DECLOP_4VAR_POSTOP(type, op) \ -__device__ __host__ static inline type operator op (type& val, int) { \ - type ret; \ - ret.x = val.x; \ - ret.y = val.y; \ - ret.z = val.z; \ - ret.w = val.w; \ - val.x op; \ - val.y op; \ - val.z op; \ - val.w op; \ - return ret; \ -} - -#define DECLOP_4VAR_COMP(type, op) \ -__device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \ - return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z) && (lhs.w op rhs.w); \ -} \ -__device__ __host__ static inline bool operator op (const type& lhs, type& rhs) { \ - return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z) && (lhs.w op rhs.w); \ -} \ -__device__ __host__ static inline bool operator op (type& lhs, const type& rhs) { \ - return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z) && (lhs.w op rhs.w); \ -} \ -__device__ __host__ static inline bool operator op (const type& lhs, const type& rhs) { \ - return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z) && (lhs.w op rhs.w); \ -} - -#define DECLOP_4VAR_1IN_1OUT(type, op) \ -__device__ __host__ static inline type operator op(type &rhs) { \ - type ret; \ - ret.x = op rhs.x; \ - ret.y = op rhs.y; \ - ret.z = op rhs.z; \ - ret.w = op rhs.w; \ - return ret; \ -} - -#define DECLOP_4VAR_1IN_BOOLOUT(type, op) \ -__device__ __host__ static inline bool operator op (type &rhs) { \ - return (op rhs.x) && (op rhs.y) && (op rhs.z) && (op rhs.w); \ -} - - -/* -Overloading operators -*/ - -// UNSIGNED CHAR1 - -DECLOP_1VAR_2IN_1OUT(uchar1, +) -DECLOP_1VAR_2IN_1OUT(uchar1, -) -DECLOP_1VAR_2IN_1OUT(uchar1, *) -DECLOP_1VAR_2IN_1OUT(uchar1, /) -DECLOP_1VAR_2IN_1OUT(uchar1, %) -DECLOP_1VAR_2IN_1OUT(uchar1, &) -DECLOP_1VAR_2IN_1OUT(uchar1, |) -DECLOP_1VAR_2IN_1OUT(uchar1, ^) -DECLOP_1VAR_2IN_1OUT(uchar1, <<) -DECLOP_1VAR_2IN_1OUT(uchar1, >>) - - -DECLOP_1VAR_ASSIGN(uchar1, +=) -DECLOP_1VAR_ASSIGN(uchar1, -=) -DECLOP_1VAR_ASSIGN(uchar1, *=) -DECLOP_1VAR_ASSIGN(uchar1, /=) -DECLOP_1VAR_ASSIGN(uchar1, %=) -DECLOP_1VAR_ASSIGN(uchar1, &=) -DECLOP_1VAR_ASSIGN(uchar1, |=) -DECLOP_1VAR_ASSIGN(uchar1, ^=) -DECLOP_1VAR_ASSIGN(uchar1, <<=) -DECLOP_1VAR_ASSIGN(uchar1, >>=) - -DECLOP_1VAR_PREOP(uchar1, ++) -DECLOP_1VAR_PREOP(uchar1, --) - -DECLOP_1VAR_POSTOP(uchar1, ++) -DECLOP_1VAR_POSTOP(uchar1, --) - -DECLOP_1VAR_COMP(uchar1, ==) -DECLOP_1VAR_COMP(uchar1, !=) -DECLOP_1VAR_COMP(uchar1, <) -DECLOP_1VAR_COMP(uchar1, >) -DECLOP_1VAR_COMP(uchar1, <=) -DECLOP_1VAR_COMP(uchar1, >=) - -DECLOP_1VAR_COMP(uchar1, &&) -DECLOP_1VAR_COMP(uchar1, ||) - -DECLOP_1VAR_1IN_1OUT(uchar1, ~) -DECLOP_1VAR_1IN_BOOLOUT(uchar1, !) - -DECLOP_1VAR_SCALE_PRODUCT(uchar1, unsigned char) -DECLOP_1VAR_SCALE_PRODUCT(uchar1, signed char) -DECLOP_1VAR_SCALE_PRODUCT(uchar1, unsigned short) -DECLOP_1VAR_SCALE_PRODUCT(uchar1, signed short) -DECLOP_1VAR_SCALE_PRODUCT(uchar1, unsigned int) -DECLOP_1VAR_SCALE_PRODUCT(uchar1, signed int) -DECLOP_1VAR_SCALE_PRODUCT(uchar1, float) -DECLOP_1VAR_SCALE_PRODUCT(uchar1, unsigned long) -DECLOP_1VAR_SCALE_PRODUCT(uchar1, signed long) -DECLOP_1VAR_SCALE_PRODUCT(uchar1, double) -DECLOP_1VAR_SCALE_PRODUCT(uchar1, unsigned long long) -DECLOP_1VAR_SCALE_PRODUCT(uchar1, signed long long) - -// UNSIGNED CHAR2 - -DECLOP_2VAR_2IN_1OUT(uchar2, +) -DECLOP_2VAR_2IN_1OUT(uchar2, -) -DECLOP_2VAR_2IN_1OUT(uchar2, *) -DECLOP_2VAR_2IN_1OUT(uchar2, /) -DECLOP_2VAR_2IN_1OUT(uchar2, %) -DECLOP_2VAR_2IN_1OUT(uchar2, &) -DECLOP_2VAR_2IN_1OUT(uchar2, |) -DECLOP_2VAR_2IN_1OUT(uchar2, ^) -DECLOP_2VAR_2IN_1OUT(uchar2, <<) -DECLOP_2VAR_2IN_1OUT(uchar2, >>) - -DECLOP_2VAR_ASSIGN(uchar2, +=) -DECLOP_2VAR_ASSIGN(uchar2, -=) -DECLOP_2VAR_ASSIGN(uchar2, *=) -DECLOP_2VAR_ASSIGN(uchar2, /=) -DECLOP_2VAR_ASSIGN(uchar2, %=) -DECLOP_2VAR_ASSIGN(uchar2, &=) -DECLOP_2VAR_ASSIGN(uchar2, |=) -DECLOP_2VAR_ASSIGN(uchar2, ^=) -DECLOP_2VAR_ASSIGN(uchar2, <<=) -DECLOP_2VAR_ASSIGN(uchar2, >>=) - -DECLOP_2VAR_PREOP(uchar2, ++) -DECLOP_2VAR_PREOP(uchar2, --) - -DECLOP_2VAR_POSTOP(uchar2, ++) -DECLOP_2VAR_POSTOP(uchar2, --) - -DECLOP_2VAR_COMP(uchar2, ==) -DECLOP_2VAR_COMP(uchar2, !=) -DECLOP_2VAR_COMP(uchar2, <) -DECLOP_2VAR_COMP(uchar2, >) -DECLOP_2VAR_COMP(uchar2, <=) -DECLOP_2VAR_COMP(uchar2, >=) - -DECLOP_2VAR_COMP(uchar2, &&) -DECLOP_2VAR_COMP(uchar2, ||) - -DECLOP_2VAR_1IN_1OUT(uchar2, ~) -DECLOP_2VAR_1IN_BOOLOUT(uchar2, !) - -DECLOP_2VAR_SCALE_PRODUCT(uchar2, unsigned char) -DECLOP_2VAR_SCALE_PRODUCT(uchar2, signed char) -DECLOP_2VAR_SCALE_PRODUCT(uchar2, unsigned short) -DECLOP_2VAR_SCALE_PRODUCT(uchar2, signed short) -DECLOP_2VAR_SCALE_PRODUCT(uchar2, unsigned int) -DECLOP_2VAR_SCALE_PRODUCT(uchar2, signed int) -DECLOP_2VAR_SCALE_PRODUCT(uchar2, float) -DECLOP_2VAR_SCALE_PRODUCT(uchar2, unsigned long) -DECLOP_2VAR_SCALE_PRODUCT(uchar2, signed long) -DECLOP_2VAR_SCALE_PRODUCT(uchar2, double) -DECLOP_2VAR_SCALE_PRODUCT(uchar2, unsigned long long) -DECLOP_2VAR_SCALE_PRODUCT(uchar2, signed long long) - -// UNSIGNED CHAR3 - -DECLOP_3VAR_2IN_1OUT(uchar3, +) -DECLOP_3VAR_2IN_1OUT(uchar3, -) -DECLOP_3VAR_2IN_1OUT(uchar3, *) -DECLOP_3VAR_2IN_1OUT(uchar3, /) -DECLOP_3VAR_2IN_1OUT(uchar3, %) -DECLOP_3VAR_2IN_1OUT(uchar3, &) -DECLOP_3VAR_2IN_1OUT(uchar3, |) -DECLOP_3VAR_2IN_1OUT(uchar3, ^) -DECLOP_3VAR_2IN_1OUT(uchar3, <<) -DECLOP_3VAR_2IN_1OUT(uchar3, >>) - -DECLOP_3VAR_ASSIGN(uchar3, +=) -DECLOP_3VAR_ASSIGN(uchar3, -=) -DECLOP_3VAR_ASSIGN(uchar3, *=) -DECLOP_3VAR_ASSIGN(uchar3, /=) -DECLOP_3VAR_ASSIGN(uchar3, %=) -DECLOP_3VAR_ASSIGN(uchar3, &=) -DECLOP_3VAR_ASSIGN(uchar3, |=) -DECLOP_3VAR_ASSIGN(uchar3, ^=) -DECLOP_3VAR_ASSIGN(uchar3, <<=) -DECLOP_3VAR_ASSIGN(uchar3, >>=) - -DECLOP_3VAR_PREOP(uchar3, ++) -DECLOP_3VAR_PREOP(uchar3, --) - -DECLOP_3VAR_POSTOP(uchar3, ++) -DECLOP_3VAR_POSTOP(uchar3, --) - -DECLOP_3VAR_COMP(uchar3, ==) -DECLOP_3VAR_COMP(uchar3, !=) -DECLOP_3VAR_COMP(uchar3, <) -DECLOP_3VAR_COMP(uchar3, >) -DECLOP_3VAR_COMP(uchar3, <=) -DECLOP_3VAR_COMP(uchar3, >=) - -DECLOP_3VAR_COMP(uchar3, &&) -DECLOP_3VAR_COMP(uchar3, ||) - -DECLOP_3VAR_1IN_1OUT(uchar3, ~) -DECLOP_3VAR_1IN_BOOLOUT(uchar3, !) - -DECLOP_3VAR_SCALE_PRODUCT(uchar3, unsigned char) -DECLOP_3VAR_SCALE_PRODUCT(uchar3, signed char) -DECLOP_3VAR_SCALE_PRODUCT(uchar3, unsigned short) -DECLOP_3VAR_SCALE_PRODUCT(uchar3, signed short) -DECLOP_3VAR_SCALE_PRODUCT(uchar3, unsigned int) -DECLOP_3VAR_SCALE_PRODUCT(uchar3, signed int) -DECLOP_3VAR_SCALE_PRODUCT(uchar3, float) -DECLOP_3VAR_SCALE_PRODUCT(uchar3, unsigned long) -DECLOP_3VAR_SCALE_PRODUCT(uchar3, signed long) -DECLOP_3VAR_SCALE_PRODUCT(uchar3, double) -DECLOP_3VAR_SCALE_PRODUCT(uchar3, unsigned long long) -DECLOP_3VAR_SCALE_PRODUCT(uchar3, signed long long) - -// UNSIGNED CHAR4 - -DECLOP_4VAR_2IN_1OUT(uchar4, +) -DECLOP_4VAR_2IN_1OUT(uchar4, -) -DECLOP_4VAR_2IN_1OUT(uchar4, *) -DECLOP_4VAR_2IN_1OUT(uchar4, /) -DECLOP_4VAR_2IN_1OUT(uchar4, %) -DECLOP_4VAR_2IN_1OUT(uchar4, &) -DECLOP_4VAR_2IN_1OUT(uchar4, |) -DECLOP_4VAR_2IN_1OUT(uchar4, ^) -DECLOP_4VAR_2IN_1OUT(uchar4, <<) -DECLOP_4VAR_2IN_1OUT(uchar4, >>) - -DECLOP_4VAR_ASSIGN(uchar4, +=) -DECLOP_4VAR_ASSIGN(uchar4, -=) -DECLOP_4VAR_ASSIGN(uchar4, *=) -DECLOP_4VAR_ASSIGN(uchar4, /=) -DECLOP_4VAR_ASSIGN(uchar4, %=) -DECLOP_4VAR_ASSIGN(uchar4, &=) -DECLOP_4VAR_ASSIGN(uchar4, |=) -DECLOP_4VAR_ASSIGN(uchar4, ^=) -DECLOP_4VAR_ASSIGN(uchar4, <<=) -DECLOP_4VAR_ASSIGN(uchar4, >>=) - -DECLOP_4VAR_PREOP(uchar4, ++) -DECLOP_4VAR_PREOP(uchar4, --) - -DECLOP_4VAR_POSTOP(uchar4, ++) -DECLOP_4VAR_POSTOP(uchar4, --) - -DECLOP_4VAR_COMP(uchar4, ==) -DECLOP_4VAR_COMP(uchar4, !=) -DECLOP_4VAR_COMP(uchar4, <) -DECLOP_4VAR_COMP(uchar4, >) -DECLOP_4VAR_COMP(uchar4, <=) -DECLOP_4VAR_COMP(uchar4, >=) - -DECLOP_4VAR_COMP(uchar4, &&) -DECLOP_4VAR_COMP(uchar4, ||) - -DECLOP_4VAR_1IN_1OUT(uchar4, ~) -DECLOP_4VAR_1IN_BOOLOUT(uchar4, !) - -DECLOP_4VAR_SCALE_PRODUCT(uchar4, unsigned char) -DECLOP_4VAR_SCALE_PRODUCT(uchar4, signed char) -DECLOP_4VAR_SCALE_PRODUCT(uchar4, unsigned short) -DECLOP_4VAR_SCALE_PRODUCT(uchar4, signed short) -DECLOP_4VAR_SCALE_PRODUCT(uchar4, unsigned int) -DECLOP_4VAR_SCALE_PRODUCT(uchar4, signed int) -DECLOP_4VAR_SCALE_PRODUCT(uchar4, float) -DECLOP_4VAR_SCALE_PRODUCT(uchar4, unsigned long) -DECLOP_4VAR_SCALE_PRODUCT(uchar4, signed long) -DECLOP_4VAR_SCALE_PRODUCT(uchar4, double) -DECLOP_4VAR_SCALE_PRODUCT(uchar4, unsigned long long) -DECLOP_4VAR_SCALE_PRODUCT(uchar4, signed long long) - -// SIGNED CHAR1 - -DECLOP_1VAR_2IN_1OUT(char1, +) -DECLOP_1VAR_2IN_1OUT(char1, -) -DECLOP_1VAR_2IN_1OUT(char1, *) -DECLOP_1VAR_2IN_1OUT(char1, /) -DECLOP_1VAR_2IN_1OUT(char1, %) -DECLOP_1VAR_2IN_1OUT(char1, &) -DECLOP_1VAR_2IN_1OUT(char1, |) -DECLOP_1VAR_2IN_1OUT(char1, ^) -DECLOP_1VAR_2IN_1OUT(char1, <<) -DECLOP_1VAR_2IN_1OUT(char1, >>) - - -DECLOP_1VAR_ASSIGN(char1, +=) -DECLOP_1VAR_ASSIGN(char1, -=) -DECLOP_1VAR_ASSIGN(char1, *=) -DECLOP_1VAR_ASSIGN(char1, /=) -DECLOP_1VAR_ASSIGN(char1, %=) -DECLOP_1VAR_ASSIGN(char1, &=) -DECLOP_1VAR_ASSIGN(char1, |=) -DECLOP_1VAR_ASSIGN(char1, ^=) -DECLOP_1VAR_ASSIGN(char1, <<=) -DECLOP_1VAR_ASSIGN(char1, >>=) - -DECLOP_1VAR_PREOP(char1, ++) -DECLOP_1VAR_PREOP(char1, --) - -DECLOP_1VAR_POSTOP(char1, ++) -DECLOP_1VAR_POSTOP(char1, --) - -DECLOP_1VAR_COMP(char1, ==) -DECLOP_1VAR_COMP(char1, !=) -DECLOP_1VAR_COMP(char1, <) -DECLOP_1VAR_COMP(char1, >) -DECLOP_1VAR_COMP(char1, <=) -DECLOP_1VAR_COMP(char1, >=) - -DECLOP_1VAR_COMP(char1, &&) -DECLOP_1VAR_COMP(char1, ||) - -DECLOP_1VAR_1IN_1OUT(char1, ~) -DECLOP_1VAR_1IN_BOOLOUT(char1, !) - -DECLOP_1VAR_SCALE_PRODUCT(char1, unsigned char) -DECLOP_1VAR_SCALE_PRODUCT(char1, signed char) -DECLOP_1VAR_SCALE_PRODUCT(char1, unsigned short) -DECLOP_1VAR_SCALE_PRODUCT(char1, signed short) -DECLOP_1VAR_SCALE_PRODUCT(char1, unsigned int) -DECLOP_1VAR_SCALE_PRODUCT(char1, signed int) -DECLOP_1VAR_SCALE_PRODUCT(char1, float) -DECLOP_1VAR_SCALE_PRODUCT(char1, unsigned long) -DECLOP_1VAR_SCALE_PRODUCT(char1, signed long) -DECLOP_1VAR_SCALE_PRODUCT(char1, double) -DECLOP_1VAR_SCALE_PRODUCT(char1, unsigned long long) -DECLOP_1VAR_SCALE_PRODUCT(char1, signed long long) - -// SIGNED CHAR2 - -DECLOP_2VAR_2IN_1OUT(char2, +) -DECLOP_2VAR_2IN_1OUT(char2, -) -DECLOP_2VAR_2IN_1OUT(char2, *) -DECLOP_2VAR_2IN_1OUT(char2, /) -DECLOP_2VAR_2IN_1OUT(char2, %) -DECLOP_2VAR_2IN_1OUT(char2, &) -DECLOP_2VAR_2IN_1OUT(char2, |) -DECLOP_2VAR_2IN_1OUT(char2, ^) -DECLOP_2VAR_2IN_1OUT(char2, <<) -DECLOP_2VAR_2IN_1OUT(char2, >>) - -DECLOP_2VAR_ASSIGN(char2, +=) -DECLOP_2VAR_ASSIGN(char2, -=) -DECLOP_2VAR_ASSIGN(char2, *=) -DECLOP_2VAR_ASSIGN(char2, /=) -DECLOP_2VAR_ASSIGN(char2, %=) -DECLOP_2VAR_ASSIGN(char2, &=) -DECLOP_2VAR_ASSIGN(char2, |=) -DECLOP_2VAR_ASSIGN(char2, ^=) -DECLOP_2VAR_ASSIGN(char2, <<=) -DECLOP_2VAR_ASSIGN(char2, >>=) - -DECLOP_2VAR_PREOP(char2, ++) -DECLOP_2VAR_PREOP(char2, --) - -DECLOP_2VAR_POSTOP(char2, ++) -DECLOP_2VAR_POSTOP(char2, --) - -DECLOP_2VAR_COMP(char2, ==) -DECLOP_2VAR_COMP(char2, !=) -DECLOP_2VAR_COMP(char2, <) -DECLOP_2VAR_COMP(char2, >) -DECLOP_2VAR_COMP(char2, <=) -DECLOP_2VAR_COMP(char2, >=) - -DECLOP_2VAR_COMP(char2, &&) -DECLOP_2VAR_COMP(char2, ||) - -DECLOP_2VAR_1IN_1OUT(char2, ~) -DECLOP_2VAR_1IN_BOOLOUT(char2, !) - -DECLOP_2VAR_SCALE_PRODUCT(char2, unsigned char) -DECLOP_2VAR_SCALE_PRODUCT(char2, signed char) -DECLOP_2VAR_SCALE_PRODUCT(char2, unsigned short) -DECLOP_2VAR_SCALE_PRODUCT(char2, signed short) -DECLOP_2VAR_SCALE_PRODUCT(char2, unsigned int) -DECLOP_2VAR_SCALE_PRODUCT(char2, signed int) -DECLOP_2VAR_SCALE_PRODUCT(char2, float) -DECLOP_2VAR_SCALE_PRODUCT(char2, unsigned long) -DECLOP_2VAR_SCALE_PRODUCT(char2, signed long) -DECLOP_2VAR_SCALE_PRODUCT(char2, double) -DECLOP_2VAR_SCALE_PRODUCT(char2, unsigned long long) -DECLOP_2VAR_SCALE_PRODUCT(char2, signed long long) - -// SIGNED CHAR3 - -DECLOP_3VAR_2IN_1OUT(char3, +) -DECLOP_3VAR_2IN_1OUT(char3, -) -DECLOP_3VAR_2IN_1OUT(char3, *) -DECLOP_3VAR_2IN_1OUT(char3, /) -DECLOP_3VAR_2IN_1OUT(char3, %) -DECLOP_3VAR_2IN_1OUT(char3, &) -DECLOP_3VAR_2IN_1OUT(char3, |) -DECLOP_3VAR_2IN_1OUT(char3, ^) -DECLOP_3VAR_2IN_1OUT(char3, <<) -DECLOP_3VAR_2IN_1OUT(char3, >>) - -DECLOP_3VAR_ASSIGN(char3, +=) -DECLOP_3VAR_ASSIGN(char3, -=) -DECLOP_3VAR_ASSIGN(char3, *=) -DECLOP_3VAR_ASSIGN(char3, /=) -DECLOP_3VAR_ASSIGN(char3, %=) -DECLOP_3VAR_ASSIGN(char3, &=) -DECLOP_3VAR_ASSIGN(char3, |=) -DECLOP_3VAR_ASSIGN(char3, ^=) -DECLOP_3VAR_ASSIGN(char3, <<=) -DECLOP_3VAR_ASSIGN(char3, >>=) - -DECLOP_3VAR_PREOP(char3, ++) -DECLOP_3VAR_PREOP(char3, --) - -DECLOP_3VAR_POSTOP(char3, ++) -DECLOP_3VAR_POSTOP(char3, --) - -DECLOP_3VAR_COMP(char3, ==) -DECLOP_3VAR_COMP(char3, !=) -DECLOP_3VAR_COMP(char3, <) -DECLOP_3VAR_COMP(char3, >) -DECLOP_3VAR_COMP(char3, <=) -DECLOP_3VAR_COMP(char3, >=) - -DECLOP_3VAR_COMP(char3, &&) -DECLOP_3VAR_COMP(char3, ||) - -DECLOP_3VAR_1IN_1OUT(char3, ~) -DECLOP_3VAR_1IN_BOOLOUT(char3, !) - -DECLOP_3VAR_SCALE_PRODUCT(char3, unsigned char) -DECLOP_3VAR_SCALE_PRODUCT(char3, signed char) -DECLOP_3VAR_SCALE_PRODUCT(char3, unsigned short) -DECLOP_3VAR_SCALE_PRODUCT(char3, signed short) -DECLOP_3VAR_SCALE_PRODUCT(char3, unsigned int) -DECLOP_3VAR_SCALE_PRODUCT(char3, signed int) -DECLOP_3VAR_SCALE_PRODUCT(char3, float) -DECLOP_3VAR_SCALE_PRODUCT(char3, unsigned long) -DECLOP_3VAR_SCALE_PRODUCT(char3, signed long) -DECLOP_3VAR_SCALE_PRODUCT(char3, double) -DECLOP_3VAR_SCALE_PRODUCT(char3, unsigned long long) -DECLOP_3VAR_SCALE_PRODUCT(char3, signed long long) - -// SIGNED CHAR4 - -DECLOP_4VAR_2IN_1OUT(char4, +) -DECLOP_4VAR_2IN_1OUT(char4, -) -DECLOP_4VAR_2IN_1OUT(char4, *) -DECLOP_4VAR_2IN_1OUT(char4, /) -DECLOP_4VAR_2IN_1OUT(char4, %) -DECLOP_4VAR_2IN_1OUT(char4, &) -DECLOP_4VAR_2IN_1OUT(char4, |) -DECLOP_4VAR_2IN_1OUT(char4, ^) -DECLOP_4VAR_2IN_1OUT(char4, <<) -DECLOP_4VAR_2IN_1OUT(char4, >>) - -DECLOP_4VAR_ASSIGN(char4, +=) -DECLOP_4VAR_ASSIGN(char4, -=) -DECLOP_4VAR_ASSIGN(char4, *=) -DECLOP_4VAR_ASSIGN(char4, /=) -DECLOP_4VAR_ASSIGN(char4, %=) -DECLOP_4VAR_ASSIGN(char4, &=) -DECLOP_4VAR_ASSIGN(char4, |=) -DECLOP_4VAR_ASSIGN(char4, ^=) -DECLOP_4VAR_ASSIGN(char4, <<=) -DECLOP_4VAR_ASSIGN(char4, >>=) - -DECLOP_4VAR_PREOP(char4, ++) -DECLOP_4VAR_PREOP(char4, --) - -DECLOP_4VAR_POSTOP(char4, ++) -DECLOP_4VAR_POSTOP(char4, --) - -DECLOP_4VAR_COMP(char4, ==) -DECLOP_4VAR_COMP(char4, !=) -DECLOP_4VAR_COMP(char4, <) -DECLOP_4VAR_COMP(char4, >) -DECLOP_4VAR_COMP(char4, <=) -DECLOP_4VAR_COMP(char4, >=) - -DECLOP_4VAR_COMP(char4, &&) -DECLOP_4VAR_COMP(char4, ||) - -DECLOP_4VAR_1IN_1OUT(char4, ~) -DECLOP_4VAR_1IN_BOOLOUT(char4, !) - -DECLOP_4VAR_SCALE_PRODUCT(char4, unsigned char) -DECLOP_4VAR_SCALE_PRODUCT(char4, signed char) -DECLOP_4VAR_SCALE_PRODUCT(char4, unsigned short) -DECLOP_4VAR_SCALE_PRODUCT(char4, signed short) -DECLOP_4VAR_SCALE_PRODUCT(char4, unsigned int) -DECLOP_4VAR_SCALE_PRODUCT(char4, signed int) -DECLOP_4VAR_SCALE_PRODUCT(char4, float) -DECLOP_4VAR_SCALE_PRODUCT(char4, unsigned long) -DECLOP_4VAR_SCALE_PRODUCT(char4, signed long) -DECLOP_4VAR_SCALE_PRODUCT(char4, double) -DECLOP_4VAR_SCALE_PRODUCT(char4, unsigned long long) -DECLOP_4VAR_SCALE_PRODUCT(char4, signed long long) - -// UNSIGNED SHORT1 - -DECLOP_1VAR_2IN_1OUT(ushort1, +) -DECLOP_1VAR_2IN_1OUT(ushort1, -) -DECLOP_1VAR_2IN_1OUT(ushort1, *) -DECLOP_1VAR_2IN_1OUT(ushort1, /) -DECLOP_1VAR_2IN_1OUT(ushort1, %) -DECLOP_1VAR_2IN_1OUT(ushort1, &) -DECLOP_1VAR_2IN_1OUT(ushort1, |) -DECLOP_1VAR_2IN_1OUT(ushort1, ^) -DECLOP_1VAR_2IN_1OUT(ushort1, <<) -DECLOP_1VAR_2IN_1OUT(ushort1, >>) - - -DECLOP_1VAR_ASSIGN(ushort1, +=) -DECLOP_1VAR_ASSIGN(ushort1, -=) -DECLOP_1VAR_ASSIGN(ushort1, *=) -DECLOP_1VAR_ASSIGN(ushort1, /=) -DECLOP_1VAR_ASSIGN(ushort1, %=) -DECLOP_1VAR_ASSIGN(ushort1, &=) -DECLOP_1VAR_ASSIGN(ushort1, |=) -DECLOP_1VAR_ASSIGN(ushort1, ^=) -DECLOP_1VAR_ASSIGN(ushort1, <<=) -DECLOP_1VAR_ASSIGN(ushort1, >>=) - -DECLOP_1VAR_PREOP(ushort1, ++) -DECLOP_1VAR_PREOP(ushort1, --) - -DECLOP_1VAR_POSTOP(ushort1, ++) -DECLOP_1VAR_POSTOP(ushort1, --) - -DECLOP_1VAR_COMP(ushort1, ==) -DECLOP_1VAR_COMP(ushort1, !=) -DECLOP_1VAR_COMP(ushort1, <) -DECLOP_1VAR_COMP(ushort1, >) -DECLOP_1VAR_COMP(ushort1, <=) -DECLOP_1VAR_COMP(ushort1, >=) - -DECLOP_1VAR_COMP(ushort1, &&) -DECLOP_1VAR_COMP(ushort1, ||) - -DECLOP_1VAR_1IN_1OUT(ushort1, ~) -DECLOP_1VAR_1IN_BOOLOUT(ushort1, !) - -DECLOP_1VAR_SCALE_PRODUCT(ushort1, unsigned char) -DECLOP_1VAR_SCALE_PRODUCT(ushort1, signed char) -DECLOP_1VAR_SCALE_PRODUCT(ushort1, unsigned short) -DECLOP_1VAR_SCALE_PRODUCT(ushort1, signed short) -DECLOP_1VAR_SCALE_PRODUCT(ushort1, unsigned int) -DECLOP_1VAR_SCALE_PRODUCT(ushort1, signed int) -DECLOP_1VAR_SCALE_PRODUCT(ushort1, float) -DECLOP_1VAR_SCALE_PRODUCT(ushort1, unsigned long) -DECLOP_1VAR_SCALE_PRODUCT(ushort1, signed long) -DECLOP_1VAR_SCALE_PRODUCT(ushort1, double) -DECLOP_1VAR_SCALE_PRODUCT(ushort1, unsigned long long) -DECLOP_1VAR_SCALE_PRODUCT(ushort1, signed long long) - -// UNSIGNED SHORT2 - -DECLOP_2VAR_2IN_1OUT(ushort2, +) -DECLOP_2VAR_2IN_1OUT(ushort2, -) -DECLOP_2VAR_2IN_1OUT(ushort2, *) -DECLOP_2VAR_2IN_1OUT(ushort2, /) -DECLOP_2VAR_2IN_1OUT(ushort2, %) -DECLOP_2VAR_2IN_1OUT(ushort2, &) -DECLOP_2VAR_2IN_1OUT(ushort2, |) -DECLOP_2VAR_2IN_1OUT(ushort2, ^) -DECLOP_2VAR_2IN_1OUT(ushort2, <<) -DECLOP_2VAR_2IN_1OUT(ushort2, >>) - -DECLOP_2VAR_ASSIGN(ushort2, +=) -DECLOP_2VAR_ASSIGN(ushort2, -=) -DECLOP_2VAR_ASSIGN(ushort2, *=) -DECLOP_2VAR_ASSIGN(ushort2, /=) -DECLOP_2VAR_ASSIGN(ushort2, %=) -DECLOP_2VAR_ASSIGN(ushort2, &=) -DECLOP_2VAR_ASSIGN(ushort2, |=) -DECLOP_2VAR_ASSIGN(ushort2, ^=) -DECLOP_2VAR_ASSIGN(ushort2, <<=) -DECLOP_2VAR_ASSIGN(ushort2, >>=) - -DECLOP_2VAR_PREOP(ushort2, ++) -DECLOP_2VAR_PREOP(ushort2, --) - -DECLOP_2VAR_POSTOP(ushort2, ++) -DECLOP_2VAR_POSTOP(ushort2, --) - -DECLOP_2VAR_COMP(ushort2, ==) -DECLOP_2VAR_COMP(ushort2, !=) -DECLOP_2VAR_COMP(ushort2, <) -DECLOP_2VAR_COMP(ushort2, >) -DECLOP_2VAR_COMP(ushort2, <=) -DECLOP_2VAR_COMP(ushort2, >=) - -DECLOP_2VAR_COMP(ushort2, &&) -DECLOP_2VAR_COMP(ushort2, ||) - -DECLOP_2VAR_1IN_1OUT(ushort2, ~) -DECLOP_2VAR_1IN_BOOLOUT(ushort2, !) - -DECLOP_2VAR_SCALE_PRODUCT(ushort2, unsigned char) -DECLOP_2VAR_SCALE_PRODUCT(ushort2, signed char) -DECLOP_2VAR_SCALE_PRODUCT(ushort2, unsigned short) -DECLOP_2VAR_SCALE_PRODUCT(ushort2, signed short) -DECLOP_2VAR_SCALE_PRODUCT(ushort2, unsigned int) -DECLOP_2VAR_SCALE_PRODUCT(ushort2, signed int) -DECLOP_2VAR_SCALE_PRODUCT(ushort2, float) -DECLOP_2VAR_SCALE_PRODUCT(ushort2, unsigned long) -DECLOP_2VAR_SCALE_PRODUCT(ushort2, signed long) -DECLOP_2VAR_SCALE_PRODUCT(ushort2, double) -DECLOP_2VAR_SCALE_PRODUCT(ushort2, unsigned long long) -DECLOP_2VAR_SCALE_PRODUCT(ushort2, signed long long) - -// UNSIGNED SHORT3 - -DECLOP_3VAR_2IN_1OUT(ushort3, +) -DECLOP_3VAR_2IN_1OUT(ushort3, -) -DECLOP_3VAR_2IN_1OUT(ushort3, *) -DECLOP_3VAR_2IN_1OUT(ushort3, /) -DECLOP_3VAR_2IN_1OUT(ushort3, %) -DECLOP_3VAR_2IN_1OUT(ushort3, &) -DECLOP_3VAR_2IN_1OUT(ushort3, |) -DECLOP_3VAR_2IN_1OUT(ushort3, ^) -DECLOP_3VAR_2IN_1OUT(ushort3, <<) -DECLOP_3VAR_2IN_1OUT(ushort3, >>) - -DECLOP_3VAR_ASSIGN(ushort3, +=) -DECLOP_3VAR_ASSIGN(ushort3, -=) -DECLOP_3VAR_ASSIGN(ushort3, *=) -DECLOP_3VAR_ASSIGN(ushort3, /=) -DECLOP_3VAR_ASSIGN(ushort3, %=) -DECLOP_3VAR_ASSIGN(ushort3, &=) -DECLOP_3VAR_ASSIGN(ushort3, |=) -DECLOP_3VAR_ASSIGN(ushort3, ^=) -DECLOP_3VAR_ASSIGN(ushort3, <<=) -DECLOP_3VAR_ASSIGN(ushort3, >>=) - -DECLOP_3VAR_PREOP(ushort3, ++) -DECLOP_3VAR_PREOP(ushort3, --) - -DECLOP_3VAR_POSTOP(ushort3, ++) -DECLOP_3VAR_POSTOP(ushort3, --) - -DECLOP_3VAR_COMP(ushort3, ==) -DECLOP_3VAR_COMP(ushort3, !=) -DECLOP_3VAR_COMP(ushort3, <) -DECLOP_3VAR_COMP(ushort3, >) -DECLOP_3VAR_COMP(ushort3, <=) -DECLOP_3VAR_COMP(ushort3, >=) - -DECLOP_3VAR_COMP(ushort3, &&) -DECLOP_3VAR_COMP(ushort3, ||) - -DECLOP_3VAR_1IN_1OUT(ushort3, ~) -DECLOP_3VAR_1IN_BOOLOUT(ushort3, !) - -DECLOP_3VAR_SCALE_PRODUCT(ushort3, unsigned char) -DECLOP_3VAR_SCALE_PRODUCT(ushort3, signed char) -DECLOP_3VAR_SCALE_PRODUCT(ushort3, unsigned short) -DECLOP_3VAR_SCALE_PRODUCT(ushort3, signed short) -DECLOP_3VAR_SCALE_PRODUCT(ushort3, unsigned int) -DECLOP_3VAR_SCALE_PRODUCT(ushort3, signed int) -DECLOP_3VAR_SCALE_PRODUCT(ushort3, float) -DECLOP_3VAR_SCALE_PRODUCT(ushort3, unsigned long) -DECLOP_3VAR_SCALE_PRODUCT(ushort3, signed long) -DECLOP_3VAR_SCALE_PRODUCT(ushort3, double) -DECLOP_3VAR_SCALE_PRODUCT(ushort3, unsigned long long) -DECLOP_3VAR_SCALE_PRODUCT(ushort3, signed long long) - -// UNSIGNED SHORT4 - -DECLOP_4VAR_2IN_1OUT(ushort4, +) -DECLOP_4VAR_2IN_1OUT(ushort4, -) -DECLOP_4VAR_2IN_1OUT(ushort4, *) -DECLOP_4VAR_2IN_1OUT(ushort4, /) -DECLOP_4VAR_2IN_1OUT(ushort4, %) -DECLOP_4VAR_2IN_1OUT(ushort4, &) -DECLOP_4VAR_2IN_1OUT(ushort4, |) -DECLOP_4VAR_2IN_1OUT(ushort4, ^) -DECLOP_4VAR_2IN_1OUT(ushort4, <<) -DECLOP_4VAR_2IN_1OUT(ushort4, >>) - -DECLOP_4VAR_ASSIGN(ushort4, +=) -DECLOP_4VAR_ASSIGN(ushort4, -=) -DECLOP_4VAR_ASSIGN(ushort4, *=) -DECLOP_4VAR_ASSIGN(ushort4, /=) -DECLOP_4VAR_ASSIGN(ushort4, %=) -DECLOP_4VAR_ASSIGN(ushort4, &=) -DECLOP_4VAR_ASSIGN(ushort4, |=) -DECLOP_4VAR_ASSIGN(ushort4, ^=) -DECLOP_4VAR_ASSIGN(ushort4, <<=) -DECLOP_4VAR_ASSIGN(ushort4, >>=) - -DECLOP_4VAR_PREOP(ushort4, ++) -DECLOP_4VAR_PREOP(ushort4, --) - -DECLOP_4VAR_POSTOP(ushort4, ++) -DECLOP_4VAR_POSTOP(ushort4, --) - -DECLOP_4VAR_COMP(ushort4, ==) -DECLOP_4VAR_COMP(ushort4, !=) -DECLOP_4VAR_COMP(ushort4, <) -DECLOP_4VAR_COMP(ushort4, >) -DECLOP_4VAR_COMP(ushort4, <=) -DECLOP_4VAR_COMP(ushort4, >=) - -DECLOP_4VAR_COMP(ushort4, &&) -DECLOP_4VAR_COMP(ushort4, ||) - -DECLOP_4VAR_1IN_1OUT(ushort4, ~) -DECLOP_4VAR_1IN_BOOLOUT(ushort4, !) - -DECLOP_4VAR_SCALE_PRODUCT(ushort4, unsigned char) -DECLOP_4VAR_SCALE_PRODUCT(ushort4, signed char) -DECLOP_4VAR_SCALE_PRODUCT(ushort4, unsigned short) -DECLOP_4VAR_SCALE_PRODUCT(ushort4, signed short) -DECLOP_4VAR_SCALE_PRODUCT(ushort4, unsigned int) -DECLOP_4VAR_SCALE_PRODUCT(ushort4, signed int) -DECLOP_4VAR_SCALE_PRODUCT(ushort4, float) -DECLOP_4VAR_SCALE_PRODUCT(ushort4, unsigned long) -DECLOP_4VAR_SCALE_PRODUCT(ushort4, signed long) -DECLOP_4VAR_SCALE_PRODUCT(ushort4, double) -DECLOP_4VAR_SCALE_PRODUCT(ushort4, unsigned long long) -DECLOP_4VAR_SCALE_PRODUCT(ushort4, signed long long) - -// SIGNED SHORT1 - -DECLOP_1VAR_2IN_1OUT(short1, +) -DECLOP_1VAR_2IN_1OUT(short1, -) -DECLOP_1VAR_2IN_1OUT(short1, *) -DECLOP_1VAR_2IN_1OUT(short1, /) -DECLOP_1VAR_2IN_1OUT(short1, %) -DECLOP_1VAR_2IN_1OUT(short1, &) -DECLOP_1VAR_2IN_1OUT(short1, |) -DECLOP_1VAR_2IN_1OUT(short1, ^) -DECLOP_1VAR_2IN_1OUT(short1, <<) -DECLOP_1VAR_2IN_1OUT(short1, >>) - - -DECLOP_1VAR_ASSIGN(short1, +=) -DECLOP_1VAR_ASSIGN(short1, -=) -DECLOP_1VAR_ASSIGN(short1, *=) -DECLOP_1VAR_ASSIGN(short1, /=) -DECLOP_1VAR_ASSIGN(short1, %=) -DECLOP_1VAR_ASSIGN(short1, &=) -DECLOP_1VAR_ASSIGN(short1, |=) -DECLOP_1VAR_ASSIGN(short1, ^=) -DECLOP_1VAR_ASSIGN(short1, <<=) -DECLOP_1VAR_ASSIGN(short1, >>=) - -DECLOP_1VAR_PREOP(short1, ++) -DECLOP_1VAR_PREOP(short1, --) - -DECLOP_1VAR_POSTOP(short1, ++) -DECLOP_1VAR_POSTOP(short1, --) - -DECLOP_1VAR_COMP(short1, ==) -DECLOP_1VAR_COMP(short1, !=) -DECLOP_1VAR_COMP(short1, <) -DECLOP_1VAR_COMP(short1, >) -DECLOP_1VAR_COMP(short1, <=) -DECLOP_1VAR_COMP(short1, >=) - -DECLOP_1VAR_COMP(short1, &&) -DECLOP_1VAR_COMP(short1, ||) - -DECLOP_1VAR_1IN_1OUT(short1, ~) -DECLOP_1VAR_1IN_BOOLOUT(short1, !) - -DECLOP_1VAR_SCALE_PRODUCT(short1, unsigned char) -DECLOP_1VAR_SCALE_PRODUCT(short1, signed char) -DECLOP_1VAR_SCALE_PRODUCT(short1, unsigned short) -DECLOP_1VAR_SCALE_PRODUCT(short1, signed short) -DECLOP_1VAR_SCALE_PRODUCT(short1, unsigned int) -DECLOP_1VAR_SCALE_PRODUCT(short1, signed int) -DECLOP_1VAR_SCALE_PRODUCT(short1, float) -DECLOP_1VAR_SCALE_PRODUCT(short1, unsigned long) -DECLOP_1VAR_SCALE_PRODUCT(short1, signed long) -DECLOP_1VAR_SCALE_PRODUCT(short1, double) -DECLOP_1VAR_SCALE_PRODUCT(short1, unsigned long long) -DECLOP_1VAR_SCALE_PRODUCT(short1, signed long long) - -// SIGNED SHORT2 - -DECLOP_2VAR_2IN_1OUT(short2, +) -DECLOP_2VAR_2IN_1OUT(short2, -) -DECLOP_2VAR_2IN_1OUT(short2, *) -DECLOP_2VAR_2IN_1OUT(short2, /) -DECLOP_2VAR_2IN_1OUT(short2, %) -DECLOP_2VAR_2IN_1OUT(short2, &) -DECLOP_2VAR_2IN_1OUT(short2, |) -DECLOP_2VAR_2IN_1OUT(short2, ^) -DECLOP_2VAR_2IN_1OUT(short2, <<) -DECLOP_2VAR_2IN_1OUT(short2, >>) - -DECLOP_2VAR_ASSIGN(short2, +=) -DECLOP_2VAR_ASSIGN(short2, -=) -DECLOP_2VAR_ASSIGN(short2, *=) -DECLOP_2VAR_ASSIGN(short2, /=) -DECLOP_2VAR_ASSIGN(short2, %=) -DECLOP_2VAR_ASSIGN(short2, &=) -DECLOP_2VAR_ASSIGN(short2, |=) -DECLOP_2VAR_ASSIGN(short2, ^=) -DECLOP_2VAR_ASSIGN(short2, <<=) -DECLOP_2VAR_ASSIGN(short2, >>=) - -DECLOP_2VAR_PREOP(short2, ++) -DECLOP_2VAR_PREOP(short2, --) - -DECLOP_2VAR_POSTOP(short2, ++) -DECLOP_2VAR_POSTOP(short2, --) - -DECLOP_2VAR_COMP(short2, ==) -DECLOP_2VAR_COMP(short2, !=) -DECLOP_2VAR_COMP(short2, <) -DECLOP_2VAR_COMP(short2, >) -DECLOP_2VAR_COMP(short2, <=) -DECLOP_2VAR_COMP(short2, >=) - -DECLOP_2VAR_COMP(short2, &&) -DECLOP_2VAR_COMP(short2, ||) - -DECLOP_2VAR_1IN_1OUT(short2, ~) -DECLOP_2VAR_1IN_BOOLOUT(short2, !) - -DECLOP_2VAR_SCALE_PRODUCT(short2, unsigned char) -DECLOP_2VAR_SCALE_PRODUCT(short2, signed char) -DECLOP_2VAR_SCALE_PRODUCT(short2, unsigned short) -DECLOP_2VAR_SCALE_PRODUCT(short2, signed short) -DECLOP_2VAR_SCALE_PRODUCT(short2, unsigned int) -DECLOP_2VAR_SCALE_PRODUCT(short2, signed int) -DECLOP_2VAR_SCALE_PRODUCT(short2, float) -DECLOP_2VAR_SCALE_PRODUCT(short2, unsigned long) -DECLOP_2VAR_SCALE_PRODUCT(short2, signed long) -DECLOP_2VAR_SCALE_PRODUCT(short2, double) -DECLOP_2VAR_SCALE_PRODUCT(short2, unsigned long long) -DECLOP_2VAR_SCALE_PRODUCT(short2, signed long long) - -// SIGNED SHORT3 - -DECLOP_3VAR_2IN_1OUT(short3, +) -DECLOP_3VAR_2IN_1OUT(short3, -) -DECLOP_3VAR_2IN_1OUT(short3, *) -DECLOP_3VAR_2IN_1OUT(short3, /) -DECLOP_3VAR_2IN_1OUT(short3, %) -DECLOP_3VAR_2IN_1OUT(short3, &) -DECLOP_3VAR_2IN_1OUT(short3, |) -DECLOP_3VAR_2IN_1OUT(short3, ^) -DECLOP_3VAR_2IN_1OUT(short3, <<) -DECLOP_3VAR_2IN_1OUT(short3, >>) - -DECLOP_3VAR_ASSIGN(short3, +=) -DECLOP_3VAR_ASSIGN(short3, -=) -DECLOP_3VAR_ASSIGN(short3, *=) -DECLOP_3VAR_ASSIGN(short3, /=) -DECLOP_3VAR_ASSIGN(short3, %=) -DECLOP_3VAR_ASSIGN(short3, &=) -DECLOP_3VAR_ASSIGN(short3, |=) -DECLOP_3VAR_ASSIGN(short3, ^=) -DECLOP_3VAR_ASSIGN(short3, <<=) -DECLOP_3VAR_ASSIGN(short3, >>=) - -DECLOP_3VAR_PREOP(short3, ++) -DECLOP_3VAR_PREOP(short3, --) - -DECLOP_3VAR_POSTOP(short3, ++) -DECLOP_3VAR_POSTOP(short3, --) - -DECLOP_3VAR_COMP(short3, ==) -DECLOP_3VAR_COMP(short3, !=) -DECLOP_3VAR_COMP(short3, <) -DECLOP_3VAR_COMP(short3, >) -DECLOP_3VAR_COMP(short3, <=) -DECLOP_3VAR_COMP(short3, >=) - -DECLOP_3VAR_COMP(short3, &&) -DECLOP_3VAR_COMP(short3, ||) - -DECLOP_3VAR_1IN_1OUT(short3, ~) -DECLOP_3VAR_1IN_BOOLOUT(short3, !) - -DECLOP_3VAR_SCALE_PRODUCT(short3, unsigned char) -DECLOP_3VAR_SCALE_PRODUCT(short3, signed char) -DECLOP_3VAR_SCALE_PRODUCT(short3, unsigned short) -DECLOP_3VAR_SCALE_PRODUCT(short3, signed short) -DECLOP_3VAR_SCALE_PRODUCT(short3, unsigned int) -DECLOP_3VAR_SCALE_PRODUCT(short3, signed int) -DECLOP_3VAR_SCALE_PRODUCT(short3, float) -DECLOP_3VAR_SCALE_PRODUCT(short3, unsigned long) -DECLOP_3VAR_SCALE_PRODUCT(short3, signed long) -DECLOP_3VAR_SCALE_PRODUCT(short3, double) -DECLOP_3VAR_SCALE_PRODUCT(short3, unsigned long long) -DECLOP_3VAR_SCALE_PRODUCT(short3, signed long long) - -// SIGNED SHORT4 - -DECLOP_4VAR_2IN_1OUT(short4, +) -DECLOP_4VAR_2IN_1OUT(short4, -) -DECLOP_4VAR_2IN_1OUT(short4, *) -DECLOP_4VAR_2IN_1OUT(short4, /) -DECLOP_4VAR_2IN_1OUT(short4, %) -DECLOP_4VAR_2IN_1OUT(short4, &) -DECLOP_4VAR_2IN_1OUT(short4, |) -DECLOP_4VAR_2IN_1OUT(short4, ^) -DECLOP_4VAR_2IN_1OUT(short4, <<) -DECLOP_4VAR_2IN_1OUT(short4, >>) - -DECLOP_4VAR_ASSIGN(short4, +=) -DECLOP_4VAR_ASSIGN(short4, -=) -DECLOP_4VAR_ASSIGN(short4, *=) -DECLOP_4VAR_ASSIGN(short4, /=) -DECLOP_4VAR_ASSIGN(short4, %=) -DECLOP_4VAR_ASSIGN(short4, &=) -DECLOP_4VAR_ASSIGN(short4, |=) -DECLOP_4VAR_ASSIGN(short4, ^=) -DECLOP_4VAR_ASSIGN(short4, <<=) -DECLOP_4VAR_ASSIGN(short4, >>=) - -DECLOP_4VAR_PREOP(short4, ++) -DECLOP_4VAR_PREOP(short4, --) - -DECLOP_4VAR_POSTOP(short4, ++) -DECLOP_4VAR_POSTOP(short4, --) - -DECLOP_4VAR_COMP(short4, ==) -DECLOP_4VAR_COMP(short4, !=) -DECLOP_4VAR_COMP(short4, <) -DECLOP_4VAR_COMP(short4, >) -DECLOP_4VAR_COMP(short4, <=) -DECLOP_4VAR_COMP(short4, >=) - -DECLOP_4VAR_COMP(short4, &&) -DECLOP_4VAR_COMP(short4, ||) - -DECLOP_4VAR_1IN_1OUT(short4, ~) -DECLOP_4VAR_1IN_BOOLOUT(short4, !) - -DECLOP_4VAR_SCALE_PRODUCT(short4, unsigned char) -DECLOP_4VAR_SCALE_PRODUCT(short4, signed char) -DECLOP_4VAR_SCALE_PRODUCT(short4, unsigned short) -DECLOP_4VAR_SCALE_PRODUCT(short4, signed short) -DECLOP_4VAR_SCALE_PRODUCT(short4, unsigned int) -DECLOP_4VAR_SCALE_PRODUCT(short4, signed int) -DECLOP_4VAR_SCALE_PRODUCT(short4, float) -DECLOP_4VAR_SCALE_PRODUCT(short4, unsigned long) -DECLOP_4VAR_SCALE_PRODUCT(short4, signed long) -DECLOP_4VAR_SCALE_PRODUCT(short4, double) -DECLOP_4VAR_SCALE_PRODUCT(short4, unsigned long long) -DECLOP_4VAR_SCALE_PRODUCT(short4, signed long long) - -// UNSIGNED INT1 - -DECLOP_1VAR_2IN_1OUT(uint1, +) -DECLOP_1VAR_2IN_1OUT(uint1, -) -DECLOP_1VAR_2IN_1OUT(uint1, *) -DECLOP_1VAR_2IN_1OUT(uint1, /) -DECLOP_1VAR_2IN_1OUT(uint1, %) -DECLOP_1VAR_2IN_1OUT(uint1, &) -DECLOP_1VAR_2IN_1OUT(uint1, |) -DECLOP_1VAR_2IN_1OUT(uint1, ^) -DECLOP_1VAR_2IN_1OUT(uint1, <<) -DECLOP_1VAR_2IN_1OUT(uint1, >>) - - -DECLOP_1VAR_ASSIGN(uint1, +=) -DECLOP_1VAR_ASSIGN(uint1, -=) -DECLOP_1VAR_ASSIGN(uint1, *=) -DECLOP_1VAR_ASSIGN(uint1, /=) -DECLOP_1VAR_ASSIGN(uint1, %=) -DECLOP_1VAR_ASSIGN(uint1, &=) -DECLOP_1VAR_ASSIGN(uint1, |=) -DECLOP_1VAR_ASSIGN(uint1, ^=) -DECLOP_1VAR_ASSIGN(uint1, <<=) -DECLOP_1VAR_ASSIGN(uint1, >>=) - -DECLOP_1VAR_PREOP(uint1, ++) -DECLOP_1VAR_PREOP(uint1, --) - -DECLOP_1VAR_POSTOP(uint1, ++) -DECLOP_1VAR_POSTOP(uint1, --) - -DECLOP_1VAR_COMP(uint1, ==) -DECLOP_1VAR_COMP(uint1, !=) -DECLOP_1VAR_COMP(uint1, <) -DECLOP_1VAR_COMP(uint1, >) -DECLOP_1VAR_COMP(uint1, <=) -DECLOP_1VAR_COMP(uint1, >=) - -DECLOP_1VAR_COMP(uint1, &&) -DECLOP_1VAR_COMP(uint1, ||) - -DECLOP_1VAR_1IN_1OUT(uint1, ~) -DECLOP_1VAR_1IN_BOOLOUT(uint1, !) - -DECLOP_1VAR_SCALE_PRODUCT(uint1, unsigned char) -DECLOP_1VAR_SCALE_PRODUCT(uint1, signed char) -DECLOP_1VAR_SCALE_PRODUCT(uint1, unsigned short) -DECLOP_1VAR_SCALE_PRODUCT(uint1, signed short) -DECLOP_1VAR_SCALE_PRODUCT(uint1, unsigned int) -DECLOP_1VAR_SCALE_PRODUCT(uint1, signed int) -DECLOP_1VAR_SCALE_PRODUCT(uint1, float) -DECLOP_1VAR_SCALE_PRODUCT(uint1, unsigned long) -DECLOP_1VAR_SCALE_PRODUCT(uint1, signed long) -DECLOP_1VAR_SCALE_PRODUCT(uint1, double) -DECLOP_1VAR_SCALE_PRODUCT(uint1, unsigned long long) -DECLOP_1VAR_SCALE_PRODUCT(uint1, signed long long) - -// UNSIGNED INT2 - -DECLOP_2VAR_2IN_1OUT(uint2, +) -DECLOP_2VAR_2IN_1OUT(uint2, -) -DECLOP_2VAR_2IN_1OUT(uint2, *) -DECLOP_2VAR_2IN_1OUT(uint2, /) -DECLOP_2VAR_2IN_1OUT(uint2, %) -DECLOP_2VAR_2IN_1OUT(uint2, &) -DECLOP_2VAR_2IN_1OUT(uint2, |) -DECLOP_2VAR_2IN_1OUT(uint2, ^) -DECLOP_2VAR_2IN_1OUT(uint2, <<) -DECLOP_2VAR_2IN_1OUT(uint2, >>) - -DECLOP_2VAR_ASSIGN(uint2, +=) -DECLOP_2VAR_ASSIGN(uint2, -=) -DECLOP_2VAR_ASSIGN(uint2, *=) -DECLOP_2VAR_ASSIGN(uint2, /=) -DECLOP_2VAR_ASSIGN(uint2, %=) -DECLOP_2VAR_ASSIGN(uint2, &=) -DECLOP_2VAR_ASSIGN(uint2, |=) -DECLOP_2VAR_ASSIGN(uint2, ^=) -DECLOP_2VAR_ASSIGN(uint2, <<=) -DECLOP_2VAR_ASSIGN(uint2, >>=) - -DECLOP_2VAR_PREOP(uint2, ++) -DECLOP_2VAR_PREOP(uint2, --) - -DECLOP_2VAR_POSTOP(uint2, ++) -DECLOP_2VAR_POSTOP(uint2, --) - -DECLOP_2VAR_COMP(uint2, ==) -DECLOP_2VAR_COMP(uint2, !=) -DECLOP_2VAR_COMP(uint2, <) -DECLOP_2VAR_COMP(uint2, >) -DECLOP_2VAR_COMP(uint2, <=) -DECLOP_2VAR_COMP(uint2, >=) - -DECLOP_2VAR_COMP(uint2, &&) -DECLOP_2VAR_COMP(uint2, ||) - -DECLOP_2VAR_1IN_1OUT(uint2, ~) -DECLOP_2VAR_1IN_BOOLOUT(uint2, !) - -DECLOP_2VAR_SCALE_PRODUCT(uint2, unsigned char) -DECLOP_2VAR_SCALE_PRODUCT(uint2, signed char) -DECLOP_2VAR_SCALE_PRODUCT(uint2, unsigned short) -DECLOP_2VAR_SCALE_PRODUCT(uint2, signed short) -DECLOP_2VAR_SCALE_PRODUCT(uint2, unsigned int) -DECLOP_2VAR_SCALE_PRODUCT(uint2, signed int) -DECLOP_2VAR_SCALE_PRODUCT(uint2, float) -DECLOP_2VAR_SCALE_PRODUCT(uint2, unsigned long) -DECLOP_2VAR_SCALE_PRODUCT(uint2, signed long) -DECLOP_2VAR_SCALE_PRODUCT(uint2, double) -DECLOP_2VAR_SCALE_PRODUCT(uint2, unsigned long long) -DECLOP_2VAR_SCALE_PRODUCT(uint2, signed long long) - -// UNSIGNED INT3 - -DECLOP_3VAR_2IN_1OUT(uint3, +) -DECLOP_3VAR_2IN_1OUT(uint3, -) -DECLOP_3VAR_2IN_1OUT(uint3, *) -DECLOP_3VAR_2IN_1OUT(uint3, /) -DECLOP_3VAR_2IN_1OUT(uint3, %) -DECLOP_3VAR_2IN_1OUT(uint3, &) -DECLOP_3VAR_2IN_1OUT(uint3, |) -DECLOP_3VAR_2IN_1OUT(uint3, ^) -DECLOP_3VAR_2IN_1OUT(uint3, <<) -DECLOP_3VAR_2IN_1OUT(uint3, >>) - -DECLOP_3VAR_ASSIGN(uint3, +=) -DECLOP_3VAR_ASSIGN(uint3, -=) -DECLOP_3VAR_ASSIGN(uint3, *=) -DECLOP_3VAR_ASSIGN(uint3, /=) -DECLOP_3VAR_ASSIGN(uint3, %=) -DECLOP_3VAR_ASSIGN(uint3, &=) -DECLOP_3VAR_ASSIGN(uint3, |=) -DECLOP_3VAR_ASSIGN(uint3, ^=) -DECLOP_3VAR_ASSIGN(uint3, <<=) -DECLOP_3VAR_ASSIGN(uint3, >>=) - -DECLOP_3VAR_PREOP(uint3, ++) -DECLOP_3VAR_PREOP(uint3, --) - -DECLOP_3VAR_POSTOP(uint3, ++) -DECLOP_3VAR_POSTOP(uint3, --) - -DECLOP_3VAR_COMP(uint3, ==) -DECLOP_3VAR_COMP(uint3, !=) -DECLOP_3VAR_COMP(uint3, <) -DECLOP_3VAR_COMP(uint3, >) -DECLOP_3VAR_COMP(uint3, <=) -DECLOP_3VAR_COMP(uint3, >=) - -DECLOP_3VAR_COMP(uint3, &&) -DECLOP_3VAR_COMP(uint3, ||) - -DECLOP_3VAR_1IN_1OUT(uint3, ~) -DECLOP_3VAR_1IN_BOOLOUT(uint3, !) - -DECLOP_3VAR_SCALE_PRODUCT(uint3, unsigned char) -DECLOP_3VAR_SCALE_PRODUCT(uint3, signed char) -DECLOP_3VAR_SCALE_PRODUCT(uint3, unsigned short) -DECLOP_3VAR_SCALE_PRODUCT(uint3, signed short) -DECLOP_3VAR_SCALE_PRODUCT(uint3, unsigned int) -DECLOP_3VAR_SCALE_PRODUCT(uint3, signed int) -DECLOP_3VAR_SCALE_PRODUCT(uint3, float) -DECLOP_3VAR_SCALE_PRODUCT(uint3, unsigned long) -DECLOP_3VAR_SCALE_PRODUCT(uint3, signed long) -DECLOP_3VAR_SCALE_PRODUCT(uint3, double) -DECLOP_3VAR_SCALE_PRODUCT(uint3, unsigned long long) -DECLOP_3VAR_SCALE_PRODUCT(uint3, signed long long) - -// UNSIGNED INT4 - -DECLOP_4VAR_2IN_1OUT(uint4, +) -DECLOP_4VAR_2IN_1OUT(uint4, -) -DECLOP_4VAR_2IN_1OUT(uint4, *) -DECLOP_4VAR_2IN_1OUT(uint4, /) -DECLOP_4VAR_2IN_1OUT(uint4, %) -DECLOP_4VAR_2IN_1OUT(uint4, &) -DECLOP_4VAR_2IN_1OUT(uint4, |) -DECLOP_4VAR_2IN_1OUT(uint4, ^) -DECLOP_4VAR_2IN_1OUT(uint4, <<) -DECLOP_4VAR_2IN_1OUT(uint4, >>) - -DECLOP_4VAR_ASSIGN(uint4, +=) -DECLOP_4VAR_ASSIGN(uint4, -=) -DECLOP_4VAR_ASSIGN(uint4, *=) -DECLOP_4VAR_ASSIGN(uint4, /=) -DECLOP_4VAR_ASSIGN(uint4, %=) -DECLOP_4VAR_ASSIGN(uint4, &=) -DECLOP_4VAR_ASSIGN(uint4, |=) -DECLOP_4VAR_ASSIGN(uint4, ^=) -DECLOP_4VAR_ASSIGN(uint4, <<=) -DECLOP_4VAR_ASSIGN(uint4, >>=) - -DECLOP_4VAR_PREOP(uint4, ++) -DECLOP_4VAR_PREOP(uint4, --) - -DECLOP_4VAR_POSTOP(uint4, ++) -DECLOP_4VAR_POSTOP(uint4, --) - -DECLOP_4VAR_COMP(uint4, ==) -DECLOP_4VAR_COMP(uint4, !=) -DECLOP_4VAR_COMP(uint4, <) -DECLOP_4VAR_COMP(uint4, >) -DECLOP_4VAR_COMP(uint4, <=) -DECLOP_4VAR_COMP(uint4, >=) - -DECLOP_4VAR_COMP(uint4, &&) -DECLOP_4VAR_COMP(uint4, ||) - -DECLOP_4VAR_1IN_1OUT(uint4, ~) -DECLOP_4VAR_1IN_BOOLOUT(uint4, !) - -DECLOP_4VAR_SCALE_PRODUCT(uint4, unsigned char) -DECLOP_4VAR_SCALE_PRODUCT(uint4, signed char) -DECLOP_4VAR_SCALE_PRODUCT(uint4, unsigned short) -DECLOP_4VAR_SCALE_PRODUCT(uint4, signed short) -DECLOP_4VAR_SCALE_PRODUCT(uint4, unsigned int) -DECLOP_4VAR_SCALE_PRODUCT(uint4, signed int) -DECLOP_4VAR_SCALE_PRODUCT(uint4, float) -DECLOP_4VAR_SCALE_PRODUCT(uint4, unsigned long) -DECLOP_4VAR_SCALE_PRODUCT(uint4, signed long) -DECLOP_4VAR_SCALE_PRODUCT(uint4, double) -DECLOP_4VAR_SCALE_PRODUCT(uint4, unsigned long long) -DECLOP_4VAR_SCALE_PRODUCT(uint4, signed long long) - -// SIGNED INT1 - -DECLOP_1VAR_2IN_1OUT(int1, +) -DECLOP_1VAR_2IN_1OUT(int1, -) -DECLOP_1VAR_2IN_1OUT(int1, *) -DECLOP_1VAR_2IN_1OUT(int1, /) -DECLOP_1VAR_2IN_1OUT(int1, %) -DECLOP_1VAR_2IN_1OUT(int1, &) -DECLOP_1VAR_2IN_1OUT(int1, |) -DECLOP_1VAR_2IN_1OUT(int1, ^) -DECLOP_1VAR_2IN_1OUT(int1, <<) -DECLOP_1VAR_2IN_1OUT(int1, >>) - - -DECLOP_1VAR_ASSIGN(int1, +=) -DECLOP_1VAR_ASSIGN(int1, -=) -DECLOP_1VAR_ASSIGN(int1, *=) -DECLOP_1VAR_ASSIGN(int1, /=) -DECLOP_1VAR_ASSIGN(int1, %=) -DECLOP_1VAR_ASSIGN(int1, &=) -DECLOP_1VAR_ASSIGN(int1, |=) -DECLOP_1VAR_ASSIGN(int1, ^=) -DECLOP_1VAR_ASSIGN(int1, <<=) -DECLOP_1VAR_ASSIGN(int1, >>=) - -DECLOP_1VAR_PREOP(int1, ++) -DECLOP_1VAR_PREOP(int1, --) - -DECLOP_1VAR_POSTOP(int1, ++) -DECLOP_1VAR_POSTOP(int1, --) - -DECLOP_1VAR_COMP(int1, ==) -DECLOP_1VAR_COMP(int1, !=) -DECLOP_1VAR_COMP(int1, <) -DECLOP_1VAR_COMP(int1, >) -DECLOP_1VAR_COMP(int1, <=) -DECLOP_1VAR_COMP(int1, >=) - -DECLOP_1VAR_COMP(int1, &&) -DECLOP_1VAR_COMP(int1, ||) - -DECLOP_1VAR_1IN_1OUT(int1, ~) -DECLOP_1VAR_1IN_BOOLOUT(int1, !) - -DECLOP_1VAR_SCALE_PRODUCT(int1, unsigned char) -DECLOP_1VAR_SCALE_PRODUCT(int1, signed char) -DECLOP_1VAR_SCALE_PRODUCT(int1, unsigned short) -DECLOP_1VAR_SCALE_PRODUCT(int1, signed short) -DECLOP_1VAR_SCALE_PRODUCT(int1, unsigned int) -DECLOP_1VAR_SCALE_PRODUCT(int1, signed int) -DECLOP_1VAR_SCALE_PRODUCT(int1, float) -DECLOP_1VAR_SCALE_PRODUCT(int1, unsigned long) -DECLOP_1VAR_SCALE_PRODUCT(int1, signed long) -DECLOP_1VAR_SCALE_PRODUCT(int1, double) -DECLOP_1VAR_SCALE_PRODUCT(int1, unsigned long long) -DECLOP_1VAR_SCALE_PRODUCT(int1, signed long long) - -// SIGNED INT2 - -DECLOP_2VAR_2IN_1OUT(int2, +) -DECLOP_2VAR_2IN_1OUT(int2, -) -DECLOP_2VAR_2IN_1OUT(int2, *) -DECLOP_2VAR_2IN_1OUT(int2, /) -DECLOP_2VAR_2IN_1OUT(int2, %) -DECLOP_2VAR_2IN_1OUT(int2, &) -DECLOP_2VAR_2IN_1OUT(int2, |) -DECLOP_2VAR_2IN_1OUT(int2, ^) -DECLOP_2VAR_2IN_1OUT(int2, <<) -DECLOP_2VAR_2IN_1OUT(int2, >>) - -DECLOP_2VAR_ASSIGN(int2, +=) -DECLOP_2VAR_ASSIGN(int2, -=) -DECLOP_2VAR_ASSIGN(int2, *=) -DECLOP_2VAR_ASSIGN(int2, /=) -DECLOP_2VAR_ASSIGN(int2, %=) -DECLOP_2VAR_ASSIGN(int2, &=) -DECLOP_2VAR_ASSIGN(int2, |=) -DECLOP_2VAR_ASSIGN(int2, ^=) -DECLOP_2VAR_ASSIGN(int2, <<=) -DECLOP_2VAR_ASSIGN(int2, >>=) - -DECLOP_2VAR_PREOP(int2, ++) -DECLOP_2VAR_PREOP(int2, --) - -DECLOP_2VAR_POSTOP(int2, ++) -DECLOP_2VAR_POSTOP(int2, --) - -DECLOP_2VAR_COMP(int2, ==) -DECLOP_2VAR_COMP(int2, !=) -DECLOP_2VAR_COMP(int2, <) -DECLOP_2VAR_COMP(int2, >) -DECLOP_2VAR_COMP(int2, <=) -DECLOP_2VAR_COMP(int2, >=) - -DECLOP_2VAR_COMP(int2, &&) -DECLOP_2VAR_COMP(int2, ||) - -DECLOP_2VAR_1IN_1OUT(int2, ~) -DECLOP_2VAR_1IN_BOOLOUT(int2, !) - -DECLOP_2VAR_SCALE_PRODUCT(int2, unsigned char) -DECLOP_2VAR_SCALE_PRODUCT(int2, signed char) -DECLOP_2VAR_SCALE_PRODUCT(int2, unsigned short) -DECLOP_2VAR_SCALE_PRODUCT(int2, signed short) -DECLOP_2VAR_SCALE_PRODUCT(int2, unsigned int) -DECLOP_2VAR_SCALE_PRODUCT(int2, signed int) -DECLOP_2VAR_SCALE_PRODUCT(int2, float) -DECLOP_2VAR_SCALE_PRODUCT(int2, unsigned long) -DECLOP_2VAR_SCALE_PRODUCT(int2, signed long) -DECLOP_2VAR_SCALE_PRODUCT(int2, double) -DECLOP_2VAR_SCALE_PRODUCT(int2, unsigned long long) -DECLOP_2VAR_SCALE_PRODUCT(int2, signed long long) - -// SIGNED INT3 - -DECLOP_3VAR_2IN_1OUT(int3, +) -DECLOP_3VAR_2IN_1OUT(int3, -) -DECLOP_3VAR_2IN_1OUT(int3, *) -DECLOP_3VAR_2IN_1OUT(int3, /) -DECLOP_3VAR_2IN_1OUT(int3, %) -DECLOP_3VAR_2IN_1OUT(int3, &) -DECLOP_3VAR_2IN_1OUT(int3, |) -DECLOP_3VAR_2IN_1OUT(int3, ^) -DECLOP_3VAR_2IN_1OUT(int3, <<) -DECLOP_3VAR_2IN_1OUT(int3, >>) - -DECLOP_3VAR_ASSIGN(int3, +=) -DECLOP_3VAR_ASSIGN(int3, -=) -DECLOP_3VAR_ASSIGN(int3, *=) -DECLOP_3VAR_ASSIGN(int3, /=) -DECLOP_3VAR_ASSIGN(int3, %=) -DECLOP_3VAR_ASSIGN(int3, &=) -DECLOP_3VAR_ASSIGN(int3, |=) -DECLOP_3VAR_ASSIGN(int3, ^=) -DECLOP_3VAR_ASSIGN(int3, <<=) -DECLOP_3VAR_ASSIGN(int3, >>=) - -DECLOP_3VAR_PREOP(int3, ++) -DECLOP_3VAR_PREOP(int3, --) - -DECLOP_3VAR_POSTOP(int3, ++) -DECLOP_3VAR_POSTOP(int3, --) - -DECLOP_3VAR_COMP(int3, ==) -DECLOP_3VAR_COMP(int3, !=) -DECLOP_3VAR_COMP(int3, <) -DECLOP_3VAR_COMP(int3, >) -DECLOP_3VAR_COMP(int3, <=) -DECLOP_3VAR_COMP(int3, >=) - -DECLOP_3VAR_COMP(int3, &&) -DECLOP_3VAR_COMP(int3, ||) - -DECLOP_3VAR_1IN_1OUT(int3, ~) -DECLOP_3VAR_1IN_BOOLOUT(int3, !) - -DECLOP_3VAR_SCALE_PRODUCT(int3, unsigned char) -DECLOP_3VAR_SCALE_PRODUCT(int3, signed char) -DECLOP_3VAR_SCALE_PRODUCT(int3, unsigned short) -DECLOP_3VAR_SCALE_PRODUCT(int3, signed short) -DECLOP_3VAR_SCALE_PRODUCT(int3, unsigned int) -DECLOP_3VAR_SCALE_PRODUCT(int3, signed int) -DECLOP_3VAR_SCALE_PRODUCT(int3, float) -DECLOP_3VAR_SCALE_PRODUCT(int3, unsigned long) -DECLOP_3VAR_SCALE_PRODUCT(int3, signed long) -DECLOP_3VAR_SCALE_PRODUCT(int3, double) -DECLOP_3VAR_SCALE_PRODUCT(int3, unsigned long long) -DECLOP_3VAR_SCALE_PRODUCT(int3, signed long long) - -// SIGNED INT4 - -DECLOP_4VAR_2IN_1OUT(int4, +) -DECLOP_4VAR_2IN_1OUT(int4, -) -DECLOP_4VAR_2IN_1OUT(int4, *) -DECLOP_4VAR_2IN_1OUT(int4, /) -DECLOP_4VAR_2IN_1OUT(int4, %) -DECLOP_4VAR_2IN_1OUT(int4, &) -DECLOP_4VAR_2IN_1OUT(int4, |) -DECLOP_4VAR_2IN_1OUT(int4, ^) -DECLOP_4VAR_2IN_1OUT(int4, <<) -DECLOP_4VAR_2IN_1OUT(int4, >>) - -DECLOP_4VAR_ASSIGN(int4, +=) -DECLOP_4VAR_ASSIGN(int4, -=) -DECLOP_4VAR_ASSIGN(int4, *=) -DECLOP_4VAR_ASSIGN(int4, /=) -DECLOP_4VAR_ASSIGN(int4, %=) -DECLOP_4VAR_ASSIGN(int4, &=) -DECLOP_4VAR_ASSIGN(int4, |=) -DECLOP_4VAR_ASSIGN(int4, ^=) -DECLOP_4VAR_ASSIGN(int4, <<=) -DECLOP_4VAR_ASSIGN(int4, >>=) - -DECLOP_4VAR_PREOP(int4, ++) -DECLOP_4VAR_PREOP(int4, --) - -DECLOP_4VAR_POSTOP(int4, ++) -DECLOP_4VAR_POSTOP(int4, --) - -DECLOP_4VAR_COMP(int4, ==) -DECLOP_4VAR_COMP(int4, !=) -DECLOP_4VAR_COMP(int4, <) -DECLOP_4VAR_COMP(int4, >) -DECLOP_4VAR_COMP(int4, <=) -DECLOP_4VAR_COMP(int4, >=) - -DECLOP_4VAR_COMP(int4, &&) -DECLOP_4VAR_COMP(int4, ||) - -DECLOP_4VAR_1IN_1OUT(int4, ~) -DECLOP_4VAR_1IN_BOOLOUT(int4, !) - -DECLOP_4VAR_SCALE_PRODUCT(int4, unsigned char) -DECLOP_4VAR_SCALE_PRODUCT(int4, signed char) -DECLOP_4VAR_SCALE_PRODUCT(int4, unsigned short) -DECLOP_4VAR_SCALE_PRODUCT(int4, signed short) -DECLOP_4VAR_SCALE_PRODUCT(int4, unsigned int) -DECLOP_4VAR_SCALE_PRODUCT(int4, signed int) -DECLOP_4VAR_SCALE_PRODUCT(int4, float) -DECLOP_4VAR_SCALE_PRODUCT(int4, unsigned long) -DECLOP_4VAR_SCALE_PRODUCT(int4, signed long) -DECLOP_4VAR_SCALE_PRODUCT(int4, double) -DECLOP_4VAR_SCALE_PRODUCT(int4, unsigned long long) -DECLOP_4VAR_SCALE_PRODUCT(int4, signed long long) - -// FLOAT1 - -DECLOP_1VAR_2IN_1OUT(float1, +) -DECLOP_1VAR_2IN_1OUT(float1, -) -DECLOP_1VAR_2IN_1OUT(float1, *) -DECLOP_1VAR_2IN_1OUT(float1, /) - -DECLOP_1VAR_ASSIGN(float1, +=) -DECLOP_1VAR_ASSIGN(float1, -=) -DECLOP_1VAR_ASSIGN(float1, *=) -DECLOP_1VAR_ASSIGN(float1, /=) - -DECLOP_1VAR_PREOP(float1, ++) -DECLOP_1VAR_PREOP(float1, --) - -DECLOP_1VAR_POSTOP(float1, ++) -DECLOP_1VAR_POSTOP(float1, --) - -DECLOP_1VAR_COMP(float1, ==) -DECLOP_1VAR_COMP(float1, !=) -DECLOP_1VAR_COMP(float1, <) -DECLOP_1VAR_COMP(float1, >) -DECLOP_1VAR_COMP(float1, <=) -DECLOP_1VAR_COMP(float1, >=) - -DECLOP_1VAR_SCALE_PRODUCT(float1, unsigned char) -DECLOP_1VAR_SCALE_PRODUCT(float1, signed char) -DECLOP_1VAR_SCALE_PRODUCT(float1, unsigned short) -DECLOP_1VAR_SCALE_PRODUCT(float1, signed short) -DECLOP_1VAR_SCALE_PRODUCT(float1, unsigned int) -DECLOP_1VAR_SCALE_PRODUCT(float1, signed int) -DECLOP_1VAR_SCALE_PRODUCT(float1, float) -DECLOP_1VAR_SCALE_PRODUCT(float1, unsigned long) -DECLOP_1VAR_SCALE_PRODUCT(float1, signed long) -DECLOP_1VAR_SCALE_PRODUCT(float1, double) -DECLOP_1VAR_SCALE_PRODUCT(float1, unsigned long long) -DECLOP_1VAR_SCALE_PRODUCT(float1, signed long long) - -// FLOAT2 - -DECLOP_2VAR_2IN_1OUT(float2, +) -DECLOP_2VAR_2IN_1OUT(float2, -) -DECLOP_2VAR_2IN_1OUT(float2, *) -DECLOP_2VAR_2IN_1OUT(float2, /) - -DECLOP_2VAR_ASSIGN(float2, +=) -DECLOP_2VAR_ASSIGN(float2, -=) -DECLOP_2VAR_ASSIGN(float2, *=) -DECLOP_2VAR_ASSIGN(float2, /=) - -DECLOP_2VAR_PREOP(float2, ++) -DECLOP_2VAR_PREOP(float2, --) - -DECLOP_2VAR_POSTOP(float2, ++) -DECLOP_2VAR_POSTOP(float2, --) - -DECLOP_2VAR_COMP(float2, ==) -DECLOP_2VAR_COMP(float2, !=) -DECLOP_2VAR_COMP(float2, <) -DECLOP_2VAR_COMP(float2, >) -DECLOP_2VAR_COMP(float2, <=) -DECLOP_2VAR_COMP(float2, >=) - -DECLOP_2VAR_SCALE_PRODUCT(float2, unsigned char) -DECLOP_2VAR_SCALE_PRODUCT(float2, signed char) -DECLOP_2VAR_SCALE_PRODUCT(float2, unsigned short) -DECLOP_2VAR_SCALE_PRODUCT(float2, signed short) -DECLOP_2VAR_SCALE_PRODUCT(float2, unsigned int) -DECLOP_2VAR_SCALE_PRODUCT(float2, signed int) -DECLOP_2VAR_SCALE_PRODUCT(float2, float) -DECLOP_2VAR_SCALE_PRODUCT(float2, unsigned long) -DECLOP_2VAR_SCALE_PRODUCT(float2, signed long) -DECLOP_2VAR_SCALE_PRODUCT(float2, double) -DECLOP_2VAR_SCALE_PRODUCT(float2, unsigned long long) -DECLOP_2VAR_SCALE_PRODUCT(float2, signed long long) - -// FLOAT3 - -DECLOP_3VAR_2IN_1OUT(float3, +) -DECLOP_3VAR_2IN_1OUT(float3, -) -DECLOP_3VAR_2IN_1OUT(float3, *) -DECLOP_3VAR_2IN_1OUT(float3, /) - -DECLOP_3VAR_ASSIGN(float3, +=) -DECLOP_3VAR_ASSIGN(float3, -=) -DECLOP_3VAR_ASSIGN(float3, *=) -DECLOP_3VAR_ASSIGN(float3, /=) - -DECLOP_3VAR_PREOP(float3, ++) -DECLOP_3VAR_PREOP(float3, --) - -DECLOP_3VAR_POSTOP(float3, ++) -DECLOP_3VAR_POSTOP(float3, --) - -DECLOP_3VAR_COMP(float3, ==) -DECLOP_3VAR_COMP(float3, !=) -DECLOP_3VAR_COMP(float3, <) -DECLOP_3VAR_COMP(float3, >) -DECLOP_3VAR_COMP(float3, <=) -DECLOP_3VAR_COMP(float3, >=) - -DECLOP_3VAR_SCALE_PRODUCT(float3, unsigned char) -DECLOP_3VAR_SCALE_PRODUCT(float3, signed char) -DECLOP_3VAR_SCALE_PRODUCT(float3, unsigned short) -DECLOP_3VAR_SCALE_PRODUCT(float3, signed short) -DECLOP_3VAR_SCALE_PRODUCT(float3, unsigned int) -DECLOP_3VAR_SCALE_PRODUCT(float3, signed int) -DECLOP_3VAR_SCALE_PRODUCT(float3, float) -DECLOP_3VAR_SCALE_PRODUCT(float3, unsigned long) -DECLOP_3VAR_SCALE_PRODUCT(float3, signed long) -DECLOP_3VAR_SCALE_PRODUCT(float3, double) -DECLOP_3VAR_SCALE_PRODUCT(float3, unsigned long long) -DECLOP_3VAR_SCALE_PRODUCT(float3, signed long long) - -// FLOAT4 - -DECLOP_4VAR_2IN_1OUT(float4, +) -DECLOP_4VAR_2IN_1OUT(float4, -) -DECLOP_4VAR_2IN_1OUT(float4, *) -DECLOP_4VAR_2IN_1OUT(float4, /) - -DECLOP_4VAR_ASSIGN(float4, +=) -DECLOP_4VAR_ASSIGN(float4, -=) -DECLOP_4VAR_ASSIGN(float4, *=) -DECLOP_4VAR_ASSIGN(float4, /=) - -DECLOP_4VAR_PREOP(float4, ++) -DECLOP_4VAR_PREOP(float4, --) - -DECLOP_4VAR_POSTOP(float4, ++) -DECLOP_4VAR_POSTOP(float4, --) - -DECLOP_4VAR_COMP(float4, ==) -DECLOP_4VAR_COMP(float4, !=) -DECLOP_4VAR_COMP(float4, <) -DECLOP_4VAR_COMP(float4, >) -DECLOP_4VAR_COMP(float4, <=) -DECLOP_4VAR_COMP(float4, >=) - -DECLOP_4VAR_SCALE_PRODUCT(float4, unsigned char) -DECLOP_4VAR_SCALE_PRODUCT(float4, signed char) -DECLOP_4VAR_SCALE_PRODUCT(float4, unsigned short) -DECLOP_4VAR_SCALE_PRODUCT(float4, signed short) -DECLOP_4VAR_SCALE_PRODUCT(float4, unsigned int) -DECLOP_4VAR_SCALE_PRODUCT(float4, signed int) -DECLOP_4VAR_SCALE_PRODUCT(float4, float) -DECLOP_4VAR_SCALE_PRODUCT(float4, unsigned long) -DECLOP_4VAR_SCALE_PRODUCT(float4, signed long) -DECLOP_4VAR_SCALE_PRODUCT(float4, double) -DECLOP_4VAR_SCALE_PRODUCT(float4, unsigned long long) -DECLOP_4VAR_SCALE_PRODUCT(float4, signed long long) - -// DOUBLE1 - -DECLOP_1VAR_2IN_1OUT(double1, +) -DECLOP_1VAR_2IN_1OUT(double1, -) -DECLOP_1VAR_2IN_1OUT(double1, *) -DECLOP_1VAR_2IN_1OUT(double1, /) - -DECLOP_1VAR_ASSIGN(double1, +=) -DECLOP_1VAR_ASSIGN(double1, -=) -DECLOP_1VAR_ASSIGN(double1, *=) -DECLOP_1VAR_ASSIGN(double1, /=) - -DECLOP_1VAR_PREOP(double1, ++) -DECLOP_1VAR_PREOP(double1, --) - -DECLOP_1VAR_POSTOP(double1, ++) -DECLOP_1VAR_POSTOP(double1, --) - -DECLOP_1VAR_COMP(double1, ==) -DECLOP_1VAR_COMP(double1, !=) -DECLOP_1VAR_COMP(double1, <) -DECLOP_1VAR_COMP(double1, >) -DECLOP_1VAR_COMP(double1, <=) -DECLOP_1VAR_COMP(double1, >=) - -DECLOP_1VAR_SCALE_PRODUCT(double1, unsigned char) -DECLOP_1VAR_SCALE_PRODUCT(double1, signed char) -DECLOP_1VAR_SCALE_PRODUCT(double1, unsigned short) -DECLOP_1VAR_SCALE_PRODUCT(double1, signed short) -DECLOP_1VAR_SCALE_PRODUCT(double1, unsigned int) -DECLOP_1VAR_SCALE_PRODUCT(double1, signed int) -DECLOP_1VAR_SCALE_PRODUCT(double1, float) -DECLOP_1VAR_SCALE_PRODUCT(double1, unsigned long) -DECLOP_1VAR_SCALE_PRODUCT(double1, signed long) -DECLOP_1VAR_SCALE_PRODUCT(double1, double) -DECLOP_1VAR_SCALE_PRODUCT(double1, unsigned long long) -DECLOP_1VAR_SCALE_PRODUCT(double1, signed long long) - -// DOUBLE2 - -DECLOP_2VAR_2IN_1OUT(double2, +) -DECLOP_2VAR_2IN_1OUT(double2, -) -DECLOP_2VAR_2IN_1OUT(double2, *) -DECLOP_2VAR_2IN_1OUT(double2, /) - -DECLOP_2VAR_ASSIGN(double2, +=) -DECLOP_2VAR_ASSIGN(double2, -=) -DECLOP_2VAR_ASSIGN(double2, *=) -DECLOP_2VAR_ASSIGN(double2, /=) - -DECLOP_2VAR_PREOP(double2, ++) -DECLOP_2VAR_PREOP(double2, --) - -DECLOP_2VAR_POSTOP(double2, ++) -DECLOP_2VAR_POSTOP(double2, --) - -DECLOP_2VAR_COMP(double2, ==) -DECLOP_2VAR_COMP(double2, !=) -DECLOP_2VAR_COMP(double2, <) -DECLOP_2VAR_COMP(double2, >) -DECLOP_2VAR_COMP(double2, <=) -DECLOP_2VAR_COMP(double2, >=) - -DECLOP_2VAR_SCALE_PRODUCT(double2, unsigned char) -DECLOP_2VAR_SCALE_PRODUCT(double2, signed char) -DECLOP_2VAR_SCALE_PRODUCT(double2, unsigned short) -DECLOP_2VAR_SCALE_PRODUCT(double2, signed short) -DECLOP_2VAR_SCALE_PRODUCT(double2, unsigned int) -DECLOP_2VAR_SCALE_PRODUCT(double2, signed int) -DECLOP_2VAR_SCALE_PRODUCT(double2, float) -DECLOP_2VAR_SCALE_PRODUCT(double2, unsigned long) -DECLOP_2VAR_SCALE_PRODUCT(double2, signed long) -DECLOP_2VAR_SCALE_PRODUCT(double2, double) -DECLOP_2VAR_SCALE_PRODUCT(double2, unsigned long long) -DECLOP_2VAR_SCALE_PRODUCT(double2, signed long long) - -// DOUBLE3 - -DECLOP_3VAR_2IN_1OUT(double3, +) -DECLOP_3VAR_2IN_1OUT(double3, -) -DECLOP_3VAR_2IN_1OUT(double3, *) -DECLOP_3VAR_2IN_1OUT(double3, /) - -DECLOP_3VAR_ASSIGN(double3, +=) -DECLOP_3VAR_ASSIGN(double3, -=) -DECLOP_3VAR_ASSIGN(double3, *=) -DECLOP_3VAR_ASSIGN(double3, /=) - -DECLOP_3VAR_PREOP(double3, ++) -DECLOP_3VAR_PREOP(double3, --) - -DECLOP_3VAR_POSTOP(double3, ++) -DECLOP_3VAR_POSTOP(double3, --) - -DECLOP_3VAR_COMP(double3, ==) -DECLOP_3VAR_COMP(double3, !=) -DECLOP_3VAR_COMP(double3, <) -DECLOP_3VAR_COMP(double3, >) -DECLOP_3VAR_COMP(double3, <=) -DECLOP_3VAR_COMP(double3, >=) - -DECLOP_3VAR_SCALE_PRODUCT(double3, unsigned char) -DECLOP_3VAR_SCALE_PRODUCT(double3, signed char) -DECLOP_3VAR_SCALE_PRODUCT(double3, unsigned short) -DECLOP_3VAR_SCALE_PRODUCT(double3, signed short) -DECLOP_3VAR_SCALE_PRODUCT(double3, unsigned int) -DECLOP_3VAR_SCALE_PRODUCT(double3, signed int) -DECLOP_3VAR_SCALE_PRODUCT(double3, float) -DECLOP_3VAR_SCALE_PRODUCT(double3, unsigned long) -DECLOP_3VAR_SCALE_PRODUCT(double3, signed long) -DECLOP_3VAR_SCALE_PRODUCT(double3, double) -DECLOP_3VAR_SCALE_PRODUCT(double3, unsigned long long) -DECLOP_3VAR_SCALE_PRODUCT(double3, signed long long) - -// DOUBLE4 - -DECLOP_4VAR_2IN_1OUT(double4, +) -DECLOP_4VAR_2IN_1OUT(double4, -) -DECLOP_4VAR_2IN_1OUT(double4, *) -DECLOP_4VAR_2IN_1OUT(double4, /) - -DECLOP_4VAR_ASSIGN(double4, +=) -DECLOP_4VAR_ASSIGN(double4, -=) -DECLOP_4VAR_ASSIGN(double4, *=) -DECLOP_4VAR_ASSIGN(double4, /=) - -DECLOP_4VAR_PREOP(double4, ++) -DECLOP_4VAR_PREOP(double4, --) - -DECLOP_4VAR_POSTOP(double4, ++) -DECLOP_4VAR_POSTOP(double4, --) - -DECLOP_4VAR_COMP(double4, ==) -DECLOP_4VAR_COMP(double4, !=) -DECLOP_4VAR_COMP(double4, <) -DECLOP_4VAR_COMP(double4, >) -DECLOP_4VAR_COMP(double4, <=) -DECLOP_4VAR_COMP(double4, >=) - -DECLOP_4VAR_SCALE_PRODUCT(double4, unsigned char) -DECLOP_4VAR_SCALE_PRODUCT(double4, signed char) -DECLOP_4VAR_SCALE_PRODUCT(double4, unsigned short) -DECLOP_4VAR_SCALE_PRODUCT(double4, signed short) -DECLOP_4VAR_SCALE_PRODUCT(double4, unsigned int) -DECLOP_4VAR_SCALE_PRODUCT(double4, signed int) -DECLOP_4VAR_SCALE_PRODUCT(double4, float) -DECLOP_4VAR_SCALE_PRODUCT(double4, unsigned long) -DECLOP_4VAR_SCALE_PRODUCT(double4, signed long) -DECLOP_4VAR_SCALE_PRODUCT(double4, double) -DECLOP_4VAR_SCALE_PRODUCT(double4, unsigned long long) -DECLOP_4VAR_SCALE_PRODUCT(double4, signed long long) - -// UNSIGNED LONG1 - -DECLOP_1VAR_2IN_1OUT(ulong1, +) -DECLOP_1VAR_2IN_1OUT(ulong1, -) -DECLOP_1VAR_2IN_1OUT(ulong1, *) -DECLOP_1VAR_2IN_1OUT(ulong1, /) -DECLOP_1VAR_2IN_1OUT(ulong1, %) -DECLOP_1VAR_2IN_1OUT(ulong1, &) -DECLOP_1VAR_2IN_1OUT(ulong1, |) -DECLOP_1VAR_2IN_1OUT(ulong1, ^) -DECLOP_1VAR_2IN_1OUT(ulong1, <<) -DECLOP_1VAR_2IN_1OUT(ulong1, >>) - - -DECLOP_1VAR_ASSIGN(ulong1, +=) -DECLOP_1VAR_ASSIGN(ulong1, -=) -DECLOP_1VAR_ASSIGN(ulong1, *=) -DECLOP_1VAR_ASSIGN(ulong1, /=) -DECLOP_1VAR_ASSIGN(ulong1, %=) -DECLOP_1VAR_ASSIGN(ulong1, &=) -DECLOP_1VAR_ASSIGN(ulong1, |=) -DECLOP_1VAR_ASSIGN(ulong1, ^=) -DECLOP_1VAR_ASSIGN(ulong1, <<=) -DECLOP_1VAR_ASSIGN(ulong1, >>=) - -DECLOP_1VAR_PREOP(ulong1, ++) -DECLOP_1VAR_PREOP(ulong1, --) - -DECLOP_1VAR_POSTOP(ulong1, ++) -DECLOP_1VAR_POSTOP(ulong1, --) - -DECLOP_1VAR_COMP(ulong1, ==) -DECLOP_1VAR_COMP(ulong1, !=) -DECLOP_1VAR_COMP(ulong1, <) -DECLOP_1VAR_COMP(ulong1, >) -DECLOP_1VAR_COMP(ulong1, <=) -DECLOP_1VAR_COMP(ulong1, >=) - -DECLOP_1VAR_COMP(ulong1, &&) -DECLOP_1VAR_COMP(ulong1, ||) - -DECLOP_1VAR_1IN_1OUT(ulong1, ~) -DECLOP_1VAR_1IN_BOOLOUT(ulong1, !) - -DECLOP_1VAR_SCALE_PRODUCT(ulong1, unsigned char) -DECLOP_1VAR_SCALE_PRODUCT(ulong1, signed char) -DECLOP_1VAR_SCALE_PRODUCT(ulong1, unsigned short) -DECLOP_1VAR_SCALE_PRODUCT(ulong1, signed short) -DECLOP_1VAR_SCALE_PRODUCT(ulong1, unsigned int) -DECLOP_1VAR_SCALE_PRODUCT(ulong1, signed int) -DECLOP_1VAR_SCALE_PRODUCT(ulong1, float) -DECLOP_1VAR_SCALE_PRODUCT(ulong1, unsigned long) -DECLOP_1VAR_SCALE_PRODUCT(ulong1, signed long) -DECLOP_1VAR_SCALE_PRODUCT(ulong1, double) -DECLOP_1VAR_SCALE_PRODUCT(ulong1, unsigned long long) -DECLOP_1VAR_SCALE_PRODUCT(ulong1, signed long long) - -// UNSIGNED LONG2 - -DECLOP_2VAR_2IN_1OUT(ulong2, +) -DECLOP_2VAR_2IN_1OUT(ulong2, -) -DECLOP_2VAR_2IN_1OUT(ulong2, *) -DECLOP_2VAR_2IN_1OUT(ulong2, /) -DECLOP_2VAR_2IN_1OUT(ulong2, %) -DECLOP_2VAR_2IN_1OUT(ulong2, &) -DECLOP_2VAR_2IN_1OUT(ulong2, |) -DECLOP_2VAR_2IN_1OUT(ulong2, ^) -DECLOP_2VAR_2IN_1OUT(ulong2, <<) -DECLOP_2VAR_2IN_1OUT(ulong2, >>) - -DECLOP_2VAR_ASSIGN(ulong2, +=) -DECLOP_2VAR_ASSIGN(ulong2, -=) -DECLOP_2VAR_ASSIGN(ulong2, *=) -DECLOP_2VAR_ASSIGN(ulong2, /=) -DECLOP_2VAR_ASSIGN(ulong2, %=) -DECLOP_2VAR_ASSIGN(ulong2, &=) -DECLOP_2VAR_ASSIGN(ulong2, |=) -DECLOP_2VAR_ASSIGN(ulong2, ^=) -DECLOP_2VAR_ASSIGN(ulong2, <<=) -DECLOP_2VAR_ASSIGN(ulong2, >>=) - -DECLOP_2VAR_PREOP(ulong2, ++) -DECLOP_2VAR_PREOP(ulong2, --) - -DECLOP_2VAR_POSTOP(ulong2, ++) -DECLOP_2VAR_POSTOP(ulong2, --) - -DECLOP_2VAR_COMP(ulong2, ==) -DECLOP_2VAR_COMP(ulong2, !=) -DECLOP_2VAR_COMP(ulong2, <) -DECLOP_2VAR_COMP(ulong2, >) -DECLOP_2VAR_COMP(ulong2, <=) -DECLOP_2VAR_COMP(ulong2, >=) - -DECLOP_2VAR_COMP(ulong2, &&) -DECLOP_2VAR_COMP(ulong2, ||) - -DECLOP_2VAR_1IN_1OUT(ulong2, ~) -DECLOP_2VAR_1IN_BOOLOUT(ulong2, !) - -DECLOP_2VAR_SCALE_PRODUCT(ulong2, unsigned char) -DECLOP_2VAR_SCALE_PRODUCT(ulong2, signed char) -DECLOP_2VAR_SCALE_PRODUCT(ulong2, unsigned short) -DECLOP_2VAR_SCALE_PRODUCT(ulong2, signed short) -DECLOP_2VAR_SCALE_PRODUCT(ulong2, unsigned int) -DECLOP_2VAR_SCALE_PRODUCT(ulong2, signed int) -DECLOP_2VAR_SCALE_PRODUCT(ulong2, float) -DECLOP_2VAR_SCALE_PRODUCT(ulong2, unsigned long) -DECLOP_2VAR_SCALE_PRODUCT(ulong2, signed long) -DECLOP_2VAR_SCALE_PRODUCT(ulong2, double) -DECLOP_2VAR_SCALE_PRODUCT(ulong2, unsigned long long) -DECLOP_2VAR_SCALE_PRODUCT(ulong2, signed long long) - -// UNSIGNED LONG3 - -DECLOP_3VAR_2IN_1OUT(ulong3, +) -DECLOP_3VAR_2IN_1OUT(ulong3, -) -DECLOP_3VAR_2IN_1OUT(ulong3, *) -DECLOP_3VAR_2IN_1OUT(ulong3, /) -DECLOP_3VAR_2IN_1OUT(ulong3, %) -DECLOP_3VAR_2IN_1OUT(ulong3, &) -DECLOP_3VAR_2IN_1OUT(ulong3, |) -DECLOP_3VAR_2IN_1OUT(ulong3, ^) -DECLOP_3VAR_2IN_1OUT(ulong3, <<) -DECLOP_3VAR_2IN_1OUT(ulong3, >>) - -DECLOP_3VAR_ASSIGN(ulong3, +=) -DECLOP_3VAR_ASSIGN(ulong3, -=) -DECLOP_3VAR_ASSIGN(ulong3, *=) -DECLOP_3VAR_ASSIGN(ulong3, /=) -DECLOP_3VAR_ASSIGN(ulong3, %=) -DECLOP_3VAR_ASSIGN(ulong3, &=) -DECLOP_3VAR_ASSIGN(ulong3, |=) -DECLOP_3VAR_ASSIGN(ulong3, ^=) -DECLOP_3VAR_ASSIGN(ulong3, <<=) -DECLOP_3VAR_ASSIGN(ulong3, >>=) - -DECLOP_3VAR_PREOP(ulong3, ++) -DECLOP_3VAR_PREOP(ulong3, --) - -DECLOP_3VAR_POSTOP(ulong3, ++) -DECLOP_3VAR_POSTOP(ulong3, --) - -DECLOP_3VAR_COMP(ulong3, ==) -DECLOP_3VAR_COMP(ulong3, !=) -DECLOP_3VAR_COMP(ulong3, <) -DECLOP_3VAR_COMP(ulong3, >) -DECLOP_3VAR_COMP(ulong3, <=) -DECLOP_3VAR_COMP(ulong3, >=) - -DECLOP_3VAR_COMP(ulong3, &&) -DECLOP_3VAR_COMP(ulong3, ||) - -DECLOP_3VAR_1IN_1OUT(ulong3, ~) -DECLOP_3VAR_1IN_BOOLOUT(ulong3, !) - -DECLOP_3VAR_SCALE_PRODUCT(ulong3, unsigned char) -DECLOP_3VAR_SCALE_PRODUCT(ulong3, signed char) -DECLOP_3VAR_SCALE_PRODUCT(ulong3, unsigned short) -DECLOP_3VAR_SCALE_PRODUCT(ulong3, signed short) -DECLOP_3VAR_SCALE_PRODUCT(ulong3, unsigned int) -DECLOP_3VAR_SCALE_PRODUCT(ulong3, signed int) -DECLOP_3VAR_SCALE_PRODUCT(ulong3, float) -DECLOP_3VAR_SCALE_PRODUCT(ulong3, unsigned long) -DECLOP_3VAR_SCALE_PRODUCT(ulong3, signed long) -DECLOP_3VAR_SCALE_PRODUCT(ulong3, double) -DECLOP_3VAR_SCALE_PRODUCT(ulong3, unsigned long long) -DECLOP_3VAR_SCALE_PRODUCT(ulong3, signed long long) - -// UNSIGNED LONG4 - -DECLOP_4VAR_2IN_1OUT(ulong4, +) -DECLOP_4VAR_2IN_1OUT(ulong4, -) -DECLOP_4VAR_2IN_1OUT(ulong4, *) -DECLOP_4VAR_2IN_1OUT(ulong4, /) -DECLOP_4VAR_2IN_1OUT(ulong4, %) -DECLOP_4VAR_2IN_1OUT(ulong4, &) -DECLOP_4VAR_2IN_1OUT(ulong4, |) -DECLOP_4VAR_2IN_1OUT(ulong4, ^) -DECLOP_4VAR_2IN_1OUT(ulong4, <<) -DECLOP_4VAR_2IN_1OUT(ulong4, >>) - -DECLOP_4VAR_ASSIGN(ulong4, +=) -DECLOP_4VAR_ASSIGN(ulong4, -=) -DECLOP_4VAR_ASSIGN(ulong4, *=) -DECLOP_4VAR_ASSIGN(ulong4, /=) -DECLOP_4VAR_ASSIGN(ulong4, %=) -DECLOP_4VAR_ASSIGN(ulong4, &=) -DECLOP_4VAR_ASSIGN(ulong4, |=) -DECLOP_4VAR_ASSIGN(ulong4, ^=) -DECLOP_4VAR_ASSIGN(ulong4, <<=) -DECLOP_4VAR_ASSIGN(ulong4, >>=) - -DECLOP_4VAR_PREOP(ulong4, ++) -DECLOP_4VAR_PREOP(ulong4, --) - -DECLOP_4VAR_POSTOP(ulong4, ++) -DECLOP_4VAR_POSTOP(ulong4, --) - -DECLOP_4VAR_COMP(ulong4, ==) -DECLOP_4VAR_COMP(ulong4, !=) -DECLOP_4VAR_COMP(ulong4, <) -DECLOP_4VAR_COMP(ulong4, >) -DECLOP_4VAR_COMP(ulong4, <=) -DECLOP_4VAR_COMP(ulong4, >=) - -DECLOP_4VAR_COMP(ulong4, &&) -DECLOP_4VAR_COMP(ulong4, ||) - -DECLOP_4VAR_1IN_1OUT(ulong4, ~) -DECLOP_4VAR_1IN_BOOLOUT(ulong4, !) - -DECLOP_4VAR_SCALE_PRODUCT(ulong4, unsigned char) -DECLOP_4VAR_SCALE_PRODUCT(ulong4, signed char) -DECLOP_4VAR_SCALE_PRODUCT(ulong4, unsigned short) -DECLOP_4VAR_SCALE_PRODUCT(ulong4, signed short) -DECLOP_4VAR_SCALE_PRODUCT(ulong4, unsigned int) -DECLOP_4VAR_SCALE_PRODUCT(ulong4, signed int) -DECLOP_4VAR_SCALE_PRODUCT(ulong4, float) -DECLOP_4VAR_SCALE_PRODUCT(ulong4, unsigned long) -DECLOP_4VAR_SCALE_PRODUCT(ulong4, signed long) -DECLOP_4VAR_SCALE_PRODUCT(ulong4, double) -DECLOP_4VAR_SCALE_PRODUCT(ulong4, unsigned long long) -DECLOP_4VAR_SCALE_PRODUCT(ulong4, signed long long) - -// SIGNED LONG1 - -DECLOP_1VAR_2IN_1OUT(long1, +) -DECLOP_1VAR_2IN_1OUT(long1, -) -DECLOP_1VAR_2IN_1OUT(long1, *) -DECLOP_1VAR_2IN_1OUT(long1, /) -DECLOP_1VAR_2IN_1OUT(long1, %) -DECLOP_1VAR_2IN_1OUT(long1, &) -DECLOP_1VAR_2IN_1OUT(long1, |) -DECLOP_1VAR_2IN_1OUT(long1, ^) -DECLOP_1VAR_2IN_1OUT(long1, <<) -DECLOP_1VAR_2IN_1OUT(long1, >>) - - -DECLOP_1VAR_ASSIGN(long1, +=) -DECLOP_1VAR_ASSIGN(long1, -=) -DECLOP_1VAR_ASSIGN(long1, *=) -DECLOP_1VAR_ASSIGN(long1, /=) -DECLOP_1VAR_ASSIGN(long1, %=) -DECLOP_1VAR_ASSIGN(long1, &=) -DECLOP_1VAR_ASSIGN(long1, |=) -DECLOP_1VAR_ASSIGN(long1, ^=) -DECLOP_1VAR_ASSIGN(long1, <<=) -DECLOP_1VAR_ASSIGN(long1, >>=) - -DECLOP_1VAR_PREOP(long1, ++) -DECLOP_1VAR_PREOP(long1, --) - -DECLOP_1VAR_POSTOP(long1, ++) -DECLOP_1VAR_POSTOP(long1, --) - -DECLOP_1VAR_COMP(long1, ==) -DECLOP_1VAR_COMP(long1, !=) -DECLOP_1VAR_COMP(long1, <) -DECLOP_1VAR_COMP(long1, >) -DECLOP_1VAR_COMP(long1, <=) -DECLOP_1VAR_COMP(long1, >=) - -DECLOP_1VAR_COMP(long1, &&) -DECLOP_1VAR_COMP(long1, ||) - -DECLOP_1VAR_1IN_1OUT(long1, ~) -DECLOP_1VAR_1IN_BOOLOUT(long1, !) - -DECLOP_1VAR_SCALE_PRODUCT(long1, unsigned char) -DECLOP_1VAR_SCALE_PRODUCT(long1, signed char) -DECLOP_1VAR_SCALE_PRODUCT(long1, unsigned short) -DECLOP_1VAR_SCALE_PRODUCT(long1, signed short) -DECLOP_1VAR_SCALE_PRODUCT(long1, unsigned int) -DECLOP_1VAR_SCALE_PRODUCT(long1, signed int) -DECLOP_1VAR_SCALE_PRODUCT(long1, float) -DECLOP_1VAR_SCALE_PRODUCT(long1, unsigned long) -DECLOP_1VAR_SCALE_PRODUCT(long1, signed long) -DECLOP_1VAR_SCALE_PRODUCT(long1, double) -DECLOP_1VAR_SCALE_PRODUCT(long1, unsigned long long) -DECLOP_1VAR_SCALE_PRODUCT(long1, signed long long) - -// SIGNED LONG2 - -DECLOP_2VAR_2IN_1OUT(long2, +) -DECLOP_2VAR_2IN_1OUT(long2, -) -DECLOP_2VAR_2IN_1OUT(long2, *) -DECLOP_2VAR_2IN_1OUT(long2, /) -DECLOP_2VAR_2IN_1OUT(long2, %) -DECLOP_2VAR_2IN_1OUT(long2, &) -DECLOP_2VAR_2IN_1OUT(long2, |) -DECLOP_2VAR_2IN_1OUT(long2, ^) -DECLOP_2VAR_2IN_1OUT(long2, <<) -DECLOP_2VAR_2IN_1OUT(long2, >>) - -DECLOP_2VAR_ASSIGN(long2, +=) -DECLOP_2VAR_ASSIGN(long2, -=) -DECLOP_2VAR_ASSIGN(long2, *=) -DECLOP_2VAR_ASSIGN(long2, /=) -DECLOP_2VAR_ASSIGN(long2, %=) -DECLOP_2VAR_ASSIGN(long2, &=) -DECLOP_2VAR_ASSIGN(long2, |=) -DECLOP_2VAR_ASSIGN(long2, ^=) -DECLOP_2VAR_ASSIGN(long2, <<=) -DECLOP_2VAR_ASSIGN(long2, >>=) - -DECLOP_2VAR_PREOP(long2, ++) -DECLOP_2VAR_PREOP(long2, --) - -DECLOP_2VAR_POSTOP(long2, ++) -DECLOP_2VAR_POSTOP(long2, --) - -DECLOP_2VAR_COMP(long2, ==) -DECLOP_2VAR_COMP(long2, !=) -DECLOP_2VAR_COMP(long2, <) -DECLOP_2VAR_COMP(long2, >) -DECLOP_2VAR_COMP(long2, <=) -DECLOP_2VAR_COMP(long2, >=) - -DECLOP_2VAR_COMP(long2, &&) -DECLOP_2VAR_COMP(long2, ||) - -DECLOP_2VAR_1IN_1OUT(long2, ~) -DECLOP_2VAR_1IN_BOOLOUT(long2, !) - -DECLOP_2VAR_SCALE_PRODUCT(long2, unsigned char) -DECLOP_2VAR_SCALE_PRODUCT(long2, signed char) -DECLOP_2VAR_SCALE_PRODUCT(long2, unsigned short) -DECLOP_2VAR_SCALE_PRODUCT(long2, signed short) -DECLOP_2VAR_SCALE_PRODUCT(long2, unsigned int) -DECLOP_2VAR_SCALE_PRODUCT(long2, signed int) -DECLOP_2VAR_SCALE_PRODUCT(long2, float) -DECLOP_2VAR_SCALE_PRODUCT(long2, unsigned long) -DECLOP_2VAR_SCALE_PRODUCT(long2, signed long) -DECLOP_2VAR_SCALE_PRODUCT(long2, double) -DECLOP_2VAR_SCALE_PRODUCT(long2, unsigned long long) -DECLOP_2VAR_SCALE_PRODUCT(long2, signed long long) - -// SIGNED LONG3 - -DECLOP_3VAR_2IN_1OUT(long3, +) -DECLOP_3VAR_2IN_1OUT(long3, -) -DECLOP_3VAR_2IN_1OUT(long3, *) -DECLOP_3VAR_2IN_1OUT(long3, /) -DECLOP_3VAR_2IN_1OUT(long3, %) -DECLOP_3VAR_2IN_1OUT(long3, &) -DECLOP_3VAR_2IN_1OUT(long3, |) -DECLOP_3VAR_2IN_1OUT(long3, ^) -DECLOP_3VAR_2IN_1OUT(long3, <<) -DECLOP_3VAR_2IN_1OUT(long3, >>) - -DECLOP_3VAR_ASSIGN(long3, +=) -DECLOP_3VAR_ASSIGN(long3, -=) -DECLOP_3VAR_ASSIGN(long3, *=) -DECLOP_3VAR_ASSIGN(long3, /=) -DECLOP_3VAR_ASSIGN(long3, %=) -DECLOP_3VAR_ASSIGN(long3, &=) -DECLOP_3VAR_ASSIGN(long3, |=) -DECLOP_3VAR_ASSIGN(long3, ^=) -DECLOP_3VAR_ASSIGN(long3, <<=) -DECLOP_3VAR_ASSIGN(long3, >>=) - -DECLOP_3VAR_PREOP(long3, ++) -DECLOP_3VAR_PREOP(long3, --) - -DECLOP_3VAR_POSTOP(long3, ++) -DECLOP_3VAR_POSTOP(long3, --) - -DECLOP_3VAR_COMP(long3, ==) -DECLOP_3VAR_COMP(long3, !=) -DECLOP_3VAR_COMP(long3, <) -DECLOP_3VAR_COMP(long3, >) -DECLOP_3VAR_COMP(long3, <=) -DECLOP_3VAR_COMP(long3, >=) - -DECLOP_3VAR_COMP(long3, &&) -DECLOP_3VAR_COMP(long3, ||) - -DECLOP_3VAR_1IN_1OUT(long3, ~) -DECLOP_3VAR_1IN_BOOLOUT(long3, !) - -DECLOP_3VAR_SCALE_PRODUCT(long3, unsigned char) -DECLOP_3VAR_SCALE_PRODUCT(long3, signed char) -DECLOP_3VAR_SCALE_PRODUCT(long3, unsigned short) -DECLOP_3VAR_SCALE_PRODUCT(long3, signed short) -DECLOP_3VAR_SCALE_PRODUCT(long3, unsigned int) -DECLOP_3VAR_SCALE_PRODUCT(long3, signed int) -DECLOP_3VAR_SCALE_PRODUCT(long3, float) -DECLOP_3VAR_SCALE_PRODUCT(long3, unsigned long) -DECLOP_3VAR_SCALE_PRODUCT(long3, signed long) -DECLOP_3VAR_SCALE_PRODUCT(long3, double) -DECLOP_3VAR_SCALE_PRODUCT(long3, unsigned long long) -DECLOP_3VAR_SCALE_PRODUCT(long3, signed long long) - -// SIGNED LONG4 - -DECLOP_4VAR_2IN_1OUT(long4, +) -DECLOP_4VAR_2IN_1OUT(long4, -) -DECLOP_4VAR_2IN_1OUT(long4, *) -DECLOP_4VAR_2IN_1OUT(long4, /) -DECLOP_4VAR_2IN_1OUT(long4, %) -DECLOP_4VAR_2IN_1OUT(long4, &) -DECLOP_4VAR_2IN_1OUT(long4, |) -DECLOP_4VAR_2IN_1OUT(long4, ^) -DECLOP_4VAR_2IN_1OUT(long4, <<) -DECLOP_4VAR_2IN_1OUT(long4, >>) - -DECLOP_4VAR_ASSIGN(long4, +=) -DECLOP_4VAR_ASSIGN(long4, -=) -DECLOP_4VAR_ASSIGN(long4, *=) -DECLOP_4VAR_ASSIGN(long4, /=) -DECLOP_4VAR_ASSIGN(long4, %=) -DECLOP_4VAR_ASSIGN(long4, &=) -DECLOP_4VAR_ASSIGN(long4, |=) -DECLOP_4VAR_ASSIGN(long4, ^=) -DECLOP_4VAR_ASSIGN(long4, <<=) -DECLOP_4VAR_ASSIGN(long4, >>=) - -DECLOP_4VAR_PREOP(long4, ++) -DECLOP_4VAR_PREOP(long4, --) - -DECLOP_4VAR_POSTOP(long4, ++) -DECLOP_4VAR_POSTOP(long4, --) - -DECLOP_4VAR_COMP(long4, ==) -DECLOP_4VAR_COMP(long4, !=) -DECLOP_4VAR_COMP(long4, <) -DECLOP_4VAR_COMP(long4, >) -DECLOP_4VAR_COMP(long4, <=) -DECLOP_4VAR_COMP(long4, >=) - -DECLOP_4VAR_COMP(long4, &&) -DECLOP_4VAR_COMP(long4, ||) - -DECLOP_4VAR_1IN_1OUT(long4, ~) -DECLOP_4VAR_1IN_BOOLOUT(long4, !) - -DECLOP_4VAR_SCALE_PRODUCT(long4, unsigned char) -DECLOP_4VAR_SCALE_PRODUCT(long4, signed char) -DECLOP_4VAR_SCALE_PRODUCT(long4, unsigned short) -DECLOP_4VAR_SCALE_PRODUCT(long4, signed short) -DECLOP_4VAR_SCALE_PRODUCT(long4, unsigned int) -DECLOP_4VAR_SCALE_PRODUCT(long4, signed int) -DECLOP_4VAR_SCALE_PRODUCT(long4, float) -DECLOP_4VAR_SCALE_PRODUCT(long4, unsigned long) -DECLOP_4VAR_SCALE_PRODUCT(long4, signed long) -DECLOP_4VAR_SCALE_PRODUCT(long4, double) -DECLOP_4VAR_SCALE_PRODUCT(long4, unsigned long long) -DECLOP_4VAR_SCALE_PRODUCT(long4, signed long long) - -// UNSIGNED LONGLONG1 - -DECLOP_1VAR_2IN_1OUT(ulonglong1, +) -DECLOP_1VAR_2IN_1OUT(ulonglong1, -) -DECLOP_1VAR_2IN_1OUT(ulonglong1, *) -DECLOP_1VAR_2IN_1OUT(ulonglong1, /) -DECLOP_1VAR_2IN_1OUT(ulonglong1, %) -DECLOP_1VAR_2IN_1OUT(ulonglong1, &) -DECLOP_1VAR_2IN_1OUT(ulonglong1, |) -DECLOP_1VAR_2IN_1OUT(ulonglong1, ^) -DECLOP_1VAR_2IN_1OUT(ulonglong1, <<) -DECLOP_1VAR_2IN_1OUT(ulonglong1, >>) - - -DECLOP_1VAR_ASSIGN(ulonglong1, +=) -DECLOP_1VAR_ASSIGN(ulonglong1, -=) -DECLOP_1VAR_ASSIGN(ulonglong1, *=) -DECLOP_1VAR_ASSIGN(ulonglong1, /=) -DECLOP_1VAR_ASSIGN(ulonglong1, %=) -DECLOP_1VAR_ASSIGN(ulonglong1, &=) -DECLOP_1VAR_ASSIGN(ulonglong1, |=) -DECLOP_1VAR_ASSIGN(ulonglong1, ^=) -DECLOP_1VAR_ASSIGN(ulonglong1, <<=) -DECLOP_1VAR_ASSIGN(ulonglong1, >>=) - -DECLOP_1VAR_PREOP(ulonglong1, ++) -DECLOP_1VAR_PREOP(ulonglong1, --) - -DECLOP_1VAR_POSTOP(ulonglong1, ++) -DECLOP_1VAR_POSTOP(ulonglong1, --) - -DECLOP_1VAR_COMP(ulonglong1, ==) -DECLOP_1VAR_COMP(ulonglong1, !=) -DECLOP_1VAR_COMP(ulonglong1, <) -DECLOP_1VAR_COMP(ulonglong1, >) -DECLOP_1VAR_COMP(ulonglong1, <=) -DECLOP_1VAR_COMP(ulonglong1, >=) - -DECLOP_1VAR_COMP(ulonglong1, &&) -DECLOP_1VAR_COMP(ulonglong1, ||) - -DECLOP_1VAR_1IN_1OUT(ulonglong1, ~) -DECLOP_1VAR_1IN_BOOLOUT(ulonglong1, !) - -DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, unsigned char) -DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, signed char) -DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, unsigned short) -DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, signed short) -DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, unsigned int) -DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, signed int) -DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, float) -DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, unsigned long) -DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, signed long) -DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, double) -DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, unsigned long long) -DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, signed long long) - -// UNSIGNED LONGLONG2 - -DECLOP_2VAR_2IN_1OUT(ulonglong2, +) -DECLOP_2VAR_2IN_1OUT(ulonglong2, -) -DECLOP_2VAR_2IN_1OUT(ulonglong2, *) -DECLOP_2VAR_2IN_1OUT(ulonglong2, /) -DECLOP_2VAR_2IN_1OUT(ulonglong2, %) -DECLOP_2VAR_2IN_1OUT(ulonglong2, &) -DECLOP_2VAR_2IN_1OUT(ulonglong2, |) -DECLOP_2VAR_2IN_1OUT(ulonglong2, ^) -DECLOP_2VAR_2IN_1OUT(ulonglong2, <<) -DECLOP_2VAR_2IN_1OUT(ulonglong2, >>) - -DECLOP_2VAR_ASSIGN(ulonglong2, +=) -DECLOP_2VAR_ASSIGN(ulonglong2, -=) -DECLOP_2VAR_ASSIGN(ulonglong2, *=) -DECLOP_2VAR_ASSIGN(ulonglong2, /=) -DECLOP_2VAR_ASSIGN(ulonglong2, %=) -DECLOP_2VAR_ASSIGN(ulonglong2, &=) -DECLOP_2VAR_ASSIGN(ulonglong2, |=) -DECLOP_2VAR_ASSIGN(ulonglong2, ^=) -DECLOP_2VAR_ASSIGN(ulonglong2, <<=) -DECLOP_2VAR_ASSIGN(ulonglong2, >>=) - -DECLOP_2VAR_PREOP(ulonglong2, ++) -DECLOP_2VAR_PREOP(ulonglong2, --) - -DECLOP_2VAR_POSTOP(ulonglong2, ++) -DECLOP_2VAR_POSTOP(ulonglong2, --) - -DECLOP_2VAR_COMP(ulonglong2, ==) -DECLOP_2VAR_COMP(ulonglong2, !=) -DECLOP_2VAR_COMP(ulonglong2, <) -DECLOP_2VAR_COMP(ulonglong2, >) -DECLOP_2VAR_COMP(ulonglong2, <=) -DECLOP_2VAR_COMP(ulonglong2, >=) - -DECLOP_2VAR_COMP(ulonglong2, &&) -DECLOP_2VAR_COMP(ulonglong2, ||) - -DECLOP_2VAR_1IN_1OUT(ulonglong2, ~) -DECLOP_2VAR_1IN_BOOLOUT(ulonglong2, !) - -DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, unsigned char) -DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, signed char) -DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, unsigned short) -DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, signed short) -DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, unsigned int) -DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, signed int) -DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, float) -DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, unsigned long) -DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, signed long) -DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, double) -DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, unsigned long long) -DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, signed long long) - -// UNSIGNED LONGLONG3 - -DECLOP_3VAR_2IN_1OUT(ulonglong3, +) -DECLOP_3VAR_2IN_1OUT(ulonglong3, -) -DECLOP_3VAR_2IN_1OUT(ulonglong3, *) -DECLOP_3VAR_2IN_1OUT(ulonglong3, /) -DECLOP_3VAR_2IN_1OUT(ulonglong3, %) -DECLOP_3VAR_2IN_1OUT(ulonglong3, &) -DECLOP_3VAR_2IN_1OUT(ulonglong3, |) -DECLOP_3VAR_2IN_1OUT(ulonglong3, ^) -DECLOP_3VAR_2IN_1OUT(ulonglong3, <<) -DECLOP_3VAR_2IN_1OUT(ulonglong3, >>) - -DECLOP_3VAR_ASSIGN(ulonglong3, +=) -DECLOP_3VAR_ASSIGN(ulonglong3, -=) -DECLOP_3VAR_ASSIGN(ulonglong3, *=) -DECLOP_3VAR_ASSIGN(ulonglong3, /=) -DECLOP_3VAR_ASSIGN(ulonglong3, %=) -DECLOP_3VAR_ASSIGN(ulonglong3, &=) -DECLOP_3VAR_ASSIGN(ulonglong3, |=) -DECLOP_3VAR_ASSIGN(ulonglong3, ^=) -DECLOP_3VAR_ASSIGN(ulonglong3, <<=) -DECLOP_3VAR_ASSIGN(ulonglong3, >>=) - -DECLOP_3VAR_PREOP(ulonglong3, ++) -DECLOP_3VAR_PREOP(ulonglong3, --) - -DECLOP_3VAR_POSTOP(ulonglong3, ++) -DECLOP_3VAR_POSTOP(ulonglong3, --) - -DECLOP_3VAR_COMP(ulonglong3, ==) -DECLOP_3VAR_COMP(ulonglong3, !=) -DECLOP_3VAR_COMP(ulonglong3, <) -DECLOP_3VAR_COMP(ulonglong3, >) -DECLOP_3VAR_COMP(ulonglong3, <=) -DECLOP_3VAR_COMP(ulonglong3, >=) - -DECLOP_3VAR_COMP(ulonglong3, &&) -DECLOP_3VAR_COMP(ulonglong3, ||) - -DECLOP_3VAR_1IN_1OUT(ulonglong3, ~) -DECLOP_3VAR_1IN_BOOLOUT(ulonglong3, !) - -DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, unsigned char) -DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, signed char) -DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, unsigned short) -DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, signed short) -DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, unsigned int) -DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, signed int) -DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, float) -DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, unsigned long) -DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, signed long) -DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, double) -DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, unsigned long long) -DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, signed long long) - -// UNSIGNED LONGLONG4 - -DECLOP_4VAR_2IN_1OUT(ulonglong4, +) -DECLOP_4VAR_2IN_1OUT(ulonglong4, -) -DECLOP_4VAR_2IN_1OUT(ulonglong4, *) -DECLOP_4VAR_2IN_1OUT(ulonglong4, /) -DECLOP_4VAR_2IN_1OUT(ulonglong4, %) -DECLOP_4VAR_2IN_1OUT(ulonglong4, &) -DECLOP_4VAR_2IN_1OUT(ulonglong4, |) -DECLOP_4VAR_2IN_1OUT(ulonglong4, ^) -DECLOP_4VAR_2IN_1OUT(ulonglong4, <<) -DECLOP_4VAR_2IN_1OUT(ulonglong4, >>) - -DECLOP_4VAR_ASSIGN(ulonglong4, +=) -DECLOP_4VAR_ASSIGN(ulonglong4, -=) -DECLOP_4VAR_ASSIGN(ulonglong4, *=) -DECLOP_4VAR_ASSIGN(ulonglong4, /=) -DECLOP_4VAR_ASSIGN(ulonglong4, %=) -DECLOP_4VAR_ASSIGN(ulonglong4, &=) -DECLOP_4VAR_ASSIGN(ulonglong4, |=) -DECLOP_4VAR_ASSIGN(ulonglong4, ^=) -DECLOP_4VAR_ASSIGN(ulonglong4, <<=) -DECLOP_4VAR_ASSIGN(ulonglong4, >>=) - -DECLOP_4VAR_PREOP(ulonglong4, ++) -DECLOP_4VAR_PREOP(ulonglong4, --) - -DECLOP_4VAR_POSTOP(ulonglong4, ++) -DECLOP_4VAR_POSTOP(ulonglong4, --) - -DECLOP_4VAR_COMP(ulonglong4, ==) -DECLOP_4VAR_COMP(ulonglong4, !=) -DECLOP_4VAR_COMP(ulonglong4, <) -DECLOP_4VAR_COMP(ulonglong4, >) -DECLOP_4VAR_COMP(ulonglong4, <=) -DECLOP_4VAR_COMP(ulonglong4, >=) - -DECLOP_4VAR_COMP(ulonglong4, &&) -DECLOP_4VAR_COMP(ulonglong4, ||) - -DECLOP_4VAR_1IN_1OUT(ulonglong4, ~) -DECLOP_4VAR_1IN_BOOLOUT(ulonglong4, !) - -DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, unsigned char) -DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, signed char) -DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, unsigned short) -DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, signed short) -DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, unsigned int) -DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, signed int) -DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, float) -DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, unsigned long) -DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, signed long) -DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, double) -DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, unsigned long long) -DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, signed long long) - -// SIGNED LONGLONG1 - -DECLOP_1VAR_2IN_1OUT(longlong1, +) -DECLOP_1VAR_2IN_1OUT(longlong1, -) -DECLOP_1VAR_2IN_1OUT(longlong1, *) -DECLOP_1VAR_2IN_1OUT(longlong1, /) -DECLOP_1VAR_2IN_1OUT(longlong1, %) -DECLOP_1VAR_2IN_1OUT(longlong1, &) -DECLOP_1VAR_2IN_1OUT(longlong1, |) -DECLOP_1VAR_2IN_1OUT(longlong1, ^) -DECLOP_1VAR_2IN_1OUT(longlong1, <<) -DECLOP_1VAR_2IN_1OUT(longlong1, >>) - - -DECLOP_1VAR_ASSIGN(longlong1, +=) -DECLOP_1VAR_ASSIGN(longlong1, -=) -DECLOP_1VAR_ASSIGN(longlong1, *=) -DECLOP_1VAR_ASSIGN(longlong1, /=) -DECLOP_1VAR_ASSIGN(longlong1, %=) -DECLOP_1VAR_ASSIGN(longlong1, &=) -DECLOP_1VAR_ASSIGN(longlong1, |=) -DECLOP_1VAR_ASSIGN(longlong1, ^=) -DECLOP_1VAR_ASSIGN(longlong1, <<=) -DECLOP_1VAR_ASSIGN(longlong1, >>=) - -DECLOP_1VAR_PREOP(longlong1, ++) -DECLOP_1VAR_PREOP(longlong1, --) - -DECLOP_1VAR_POSTOP(longlong1, ++) -DECLOP_1VAR_POSTOP(longlong1, --) - -DECLOP_1VAR_COMP(longlong1, ==) -DECLOP_1VAR_COMP(longlong1, !=) -DECLOP_1VAR_COMP(longlong1, <) -DECLOP_1VAR_COMP(longlong1, >) -DECLOP_1VAR_COMP(longlong1, <=) -DECLOP_1VAR_COMP(longlong1, >=) - -DECLOP_1VAR_COMP(longlong1, &&) -DECLOP_1VAR_COMP(longlong1, ||) - -DECLOP_1VAR_1IN_1OUT(longlong1, ~) -DECLOP_1VAR_1IN_BOOLOUT(longlong1, !) - -DECLOP_1VAR_SCALE_PRODUCT(longlong1, unsigned char) -DECLOP_1VAR_SCALE_PRODUCT(longlong1, signed char) -DECLOP_1VAR_SCALE_PRODUCT(longlong1, unsigned short) -DECLOP_1VAR_SCALE_PRODUCT(longlong1, signed short) -DECLOP_1VAR_SCALE_PRODUCT(longlong1, unsigned int) -DECLOP_1VAR_SCALE_PRODUCT(longlong1, signed int) -DECLOP_1VAR_SCALE_PRODUCT(longlong1, float) -DECLOP_1VAR_SCALE_PRODUCT(longlong1, unsigned long) -DECLOP_1VAR_SCALE_PRODUCT(longlong1, signed long) -DECLOP_1VAR_SCALE_PRODUCT(longlong1, double) -DECLOP_1VAR_SCALE_PRODUCT(longlong1, unsigned long long) -DECLOP_1VAR_SCALE_PRODUCT(longlong1, signed long long) - -// SIGNED LONGLONG2 - -DECLOP_2VAR_2IN_1OUT(longlong2, +) -DECLOP_2VAR_2IN_1OUT(longlong2, -) -DECLOP_2VAR_2IN_1OUT(longlong2, *) -DECLOP_2VAR_2IN_1OUT(longlong2, /) -DECLOP_2VAR_2IN_1OUT(longlong2, %) -DECLOP_2VAR_2IN_1OUT(longlong2, &) -DECLOP_2VAR_2IN_1OUT(longlong2, |) -DECLOP_2VAR_2IN_1OUT(longlong2, ^) -DECLOP_2VAR_2IN_1OUT(longlong2, <<) -DECLOP_2VAR_2IN_1OUT(longlong2, >>) - -DECLOP_2VAR_ASSIGN(longlong2, +=) -DECLOP_2VAR_ASSIGN(longlong2, -=) -DECLOP_2VAR_ASSIGN(longlong2, *=) -DECLOP_2VAR_ASSIGN(longlong2, /=) -DECLOP_2VAR_ASSIGN(longlong2, %=) -DECLOP_2VAR_ASSIGN(longlong2, &=) -DECLOP_2VAR_ASSIGN(longlong2, |=) -DECLOP_2VAR_ASSIGN(longlong2, ^=) -DECLOP_2VAR_ASSIGN(longlong2, <<=) -DECLOP_2VAR_ASSIGN(longlong2, >>=) - -DECLOP_2VAR_PREOP(longlong2, ++) -DECLOP_2VAR_PREOP(longlong2, --) - -DECLOP_2VAR_POSTOP(longlong2, ++) -DECLOP_2VAR_POSTOP(longlong2, --) - -DECLOP_2VAR_COMP(longlong2, ==) -DECLOP_2VAR_COMP(longlong2, !=) -DECLOP_2VAR_COMP(longlong2, <) -DECLOP_2VAR_COMP(longlong2, >) -DECLOP_2VAR_COMP(longlong2, <=) -DECLOP_2VAR_COMP(longlong2, >=) - -DECLOP_2VAR_COMP(longlong2, &&) -DECLOP_2VAR_COMP(longlong2, ||) - -DECLOP_2VAR_1IN_1OUT(longlong2, ~) -DECLOP_2VAR_1IN_BOOLOUT(longlong2, !) - -DECLOP_2VAR_SCALE_PRODUCT(longlong2, unsigned char) -DECLOP_2VAR_SCALE_PRODUCT(longlong2, signed char) -DECLOP_2VAR_SCALE_PRODUCT(longlong2, unsigned short) -DECLOP_2VAR_SCALE_PRODUCT(longlong2, signed short) -DECLOP_2VAR_SCALE_PRODUCT(longlong2, unsigned int) -DECLOP_2VAR_SCALE_PRODUCT(longlong2, signed int) -DECLOP_2VAR_SCALE_PRODUCT(longlong2, float) -DECLOP_2VAR_SCALE_PRODUCT(longlong2, unsigned long) -DECLOP_2VAR_SCALE_PRODUCT(longlong2, signed long) -DECLOP_2VAR_SCALE_PRODUCT(longlong2, double) -DECLOP_2VAR_SCALE_PRODUCT(longlong2, unsigned long long) -DECLOP_2VAR_SCALE_PRODUCT(longlong2, signed long long) - -// SIGNED LONGLONG3 - -DECLOP_3VAR_2IN_1OUT(longlong3, +) -DECLOP_3VAR_2IN_1OUT(longlong3, -) -DECLOP_3VAR_2IN_1OUT(longlong3, *) -DECLOP_3VAR_2IN_1OUT(longlong3, /) -DECLOP_3VAR_2IN_1OUT(longlong3, %) -DECLOP_3VAR_2IN_1OUT(longlong3, &) -DECLOP_3VAR_2IN_1OUT(longlong3, |) -DECLOP_3VAR_2IN_1OUT(longlong3, ^) -DECLOP_3VAR_2IN_1OUT(longlong3, <<) -DECLOP_3VAR_2IN_1OUT(longlong3, >>) - -DECLOP_3VAR_ASSIGN(longlong3, +=) -DECLOP_3VAR_ASSIGN(longlong3, -=) -DECLOP_3VAR_ASSIGN(longlong3, *=) -DECLOP_3VAR_ASSIGN(longlong3, /=) -DECLOP_3VAR_ASSIGN(longlong3, %=) -DECLOP_3VAR_ASSIGN(longlong3, &=) -DECLOP_3VAR_ASSIGN(longlong3, |=) -DECLOP_3VAR_ASSIGN(longlong3, ^=) -DECLOP_3VAR_ASSIGN(longlong3, <<=) -DECLOP_3VAR_ASSIGN(longlong3, >>=) - -DECLOP_3VAR_PREOP(longlong3, ++) -DECLOP_3VAR_PREOP(longlong3, --) - -DECLOP_3VAR_POSTOP(longlong3, ++) -DECLOP_3VAR_POSTOP(longlong3, --) - -DECLOP_3VAR_COMP(longlong3, ==) -DECLOP_3VAR_COMP(longlong3, !=) -DECLOP_3VAR_COMP(longlong3, <) -DECLOP_3VAR_COMP(longlong3, >) -DECLOP_3VAR_COMP(longlong3, <=) -DECLOP_3VAR_COMP(longlong3, >=) - -DECLOP_3VAR_COMP(longlong3, &&) -DECLOP_3VAR_COMP(longlong3, ||) - -DECLOP_3VAR_1IN_1OUT(longlong3, ~) -DECLOP_3VAR_1IN_BOOLOUT(longlong3, !) - -DECLOP_3VAR_SCALE_PRODUCT(longlong3, unsigned char) -DECLOP_3VAR_SCALE_PRODUCT(longlong3, signed char) -DECLOP_3VAR_SCALE_PRODUCT(longlong3, unsigned short) -DECLOP_3VAR_SCALE_PRODUCT(longlong3, signed short) -DECLOP_3VAR_SCALE_PRODUCT(longlong3, unsigned int) -DECLOP_3VAR_SCALE_PRODUCT(longlong3, signed int) -DECLOP_3VAR_SCALE_PRODUCT(longlong3, float) -DECLOP_3VAR_SCALE_PRODUCT(longlong3, unsigned long) -DECLOP_3VAR_SCALE_PRODUCT(longlong3, signed long) -DECLOP_3VAR_SCALE_PRODUCT(longlong3, double) -DECLOP_3VAR_SCALE_PRODUCT(longlong3, unsigned long long) -DECLOP_3VAR_SCALE_PRODUCT(longlong3, signed long long) - -// SIGNED LONGLONG4 - -DECLOP_4VAR_2IN_1OUT(longlong4, +) -DECLOP_4VAR_2IN_1OUT(longlong4, -) -DECLOP_4VAR_2IN_1OUT(longlong4, *) -DECLOP_4VAR_2IN_1OUT(longlong4, /) -DECLOP_4VAR_2IN_1OUT(longlong4, %) -DECLOP_4VAR_2IN_1OUT(longlong4, &) -DECLOP_4VAR_2IN_1OUT(longlong4, |) -DECLOP_4VAR_2IN_1OUT(longlong4, ^) -DECLOP_4VAR_2IN_1OUT(longlong4, <<) -DECLOP_4VAR_2IN_1OUT(longlong4, >>) - -DECLOP_4VAR_ASSIGN(longlong4, +=) -DECLOP_4VAR_ASSIGN(longlong4, -=) -DECLOP_4VAR_ASSIGN(longlong4, *=) -DECLOP_4VAR_ASSIGN(longlong4, /=) -DECLOP_4VAR_ASSIGN(longlong4, %=) -DECLOP_4VAR_ASSIGN(longlong4, &=) -DECLOP_4VAR_ASSIGN(longlong4, |=) -DECLOP_4VAR_ASSIGN(longlong4, ^=) -DECLOP_4VAR_ASSIGN(longlong4, <<=) -DECLOP_4VAR_ASSIGN(longlong4, >>=) - -DECLOP_4VAR_PREOP(longlong4, ++) -DECLOP_4VAR_PREOP(longlong4, --) - -DECLOP_4VAR_POSTOP(longlong4, ++) -DECLOP_4VAR_POSTOP(longlong4, --) - -DECLOP_4VAR_COMP(longlong4, ==) -DECLOP_4VAR_COMP(longlong4, !=) -DECLOP_4VAR_COMP(longlong4, <) -DECLOP_4VAR_COMP(longlong4, >) -DECLOP_4VAR_COMP(longlong4, <=) -DECLOP_4VAR_COMP(longlong4, >=) - -DECLOP_4VAR_COMP(longlong4, &&) -DECLOP_4VAR_COMP(longlong4, ||) - -DECLOP_4VAR_1IN_1OUT(longlong4, ~) -DECLOP_4VAR_1IN_BOOLOUT(longlong4, !) - -DECLOP_4VAR_SCALE_PRODUCT(longlong4, unsigned char) -DECLOP_4VAR_SCALE_PRODUCT(longlong4, signed char) -DECLOP_4VAR_SCALE_PRODUCT(longlong4, unsigned short) -DECLOP_4VAR_SCALE_PRODUCT(longlong4, signed short) -DECLOP_4VAR_SCALE_PRODUCT(longlong4, unsigned int) -DECLOP_4VAR_SCALE_PRODUCT(longlong4, signed int) -DECLOP_4VAR_SCALE_PRODUCT(longlong4, float) -DECLOP_4VAR_SCALE_PRODUCT(longlong4, unsigned long) -DECLOP_4VAR_SCALE_PRODUCT(longlong4, signed long) -DECLOP_4VAR_SCALE_PRODUCT(longlong4, double) -DECLOP_4VAR_SCALE_PRODUCT(longlong4, unsigned long long) -DECLOP_4VAR_SCALE_PRODUCT(longlong4, signed long long) #endif + #endif diff --git a/projects/hip/src/hip_fp16.cpp b/projects/hip/src/hip_fp16.cpp index c2b7b47597..8e8f003f56 100644 --- a/projects/hip/src/hip_fp16.cpp +++ b/projects/hip/src/hip_fp16.cpp @@ -90,11 +90,11 @@ __device__ bool __hgt(__half a, __half b) { } __device__ bool __hisinf(__half a) { - return a == __hInfValue.h ? true : false; + return a == HINF ? true : false; } __device__ bool __hisnan(__half a) { - return a > __hInfValue.h ? true : false; + return a > HINF ? true : false; } __device__ bool __hle(__half a, __half b) { @@ -114,75 +114,75 @@ Half2 Comparision Functions */ __device__ bool __hbeq2(__half2 a, __half2 b) { - return (a.p[0] == b.p[0] ? true : false) && (a.p[1] == b.p[1] ? true : false); + return (a.x == b.x ? true : false) && (a.y == b.y ? true : false); } __device__ bool __hbge2(__half2 a, __half2 b) { - return (a.p[0] >= b.p[0] ? true : false) && (a.p[1] >= b.p[1] ? true : false); + return (a.x >= b.x ? true : false) && (a.y >= b.y ? true : false); } __device__ bool __hbgt2(__half2 a, __half2 b) { - return (a.p[0] > b.p[0] ? true : false) && (a.p[1] > b.p[1] ? true : false); + return (a.x > b.x ? true : false) && (a.y > b.y ? true : false); } __device__ bool __hble2(__half2 a, __half2 b) { - return (a.p[0] <= b.p[0] ? true : false) && (a.p[1] <= b.p[1] ? true : false); + return (a.x <= b.x ? true : false) && (a.y <= b.y ? true : false); } __device__ bool __hblt2(__half2 a, __half2 b) { - return (a.p[0] < b.p[0] ? true : false) && (a.p[1] < b.p[1] ? true : false); + return (a.x < b.x ? true : false) && (a.y < b.y ? true : false); } __device__ bool __hbne2(__half2 a, __half2 b) { - return (a.p[0] != b.p[0] ? true : false) && (a.p[1] != b.p[1] ? true : false); + return (a.x != b.x ? true : false) && (a.y != b.y ? true : false); } __device__ __half2 __heq2(__half2 a, __half2 b) { __half2 c; - c.p[0] = (a.p[0] == b.p[0]) ? (__half)1 : (__half)0; - c.p[1] = (a.p[1] == b.p[1]) ? (__half)1 : (__half)0; + c.x = (a.x == b.x) ? (__half)1 : (__half)0; + c.y = (a.y == b.y) ? (__half)1 : (__half)0; return c; } __device__ __half2 __hge2(__half2 a, __half2 b) { __half2 c; - c.p[0] = (a.p[0] >= b.p[0]) ? (__half)1 : (__half)0; - c.p[1] = (a.p[1] >= b.p[1]) ? (__half)1 : (__half)0; + c.x = (a.x >= b.x) ? (__half)1 : (__half)0; + c.y = (a.y >= b.y) ? (__half)1 : (__half)0; return c; } __device__ __half2 __hgt2(__half2 a, __half2 b) { __half2 c; - c.p[0] = (a.p[0] > b.p[0]) ? (__half)1 : (__half)0; - c.p[1] = (a.p[1] > b.p[1]) ? (__half)1 : (__half)0; + c.x = (a.x > b.x) ? (__half)1 : (__half)0; + c.y = (a.y > b.y) ? (__half)1 : (__half)0; return c; } __device__ __half2 __hisnan2(__half2 a) { __half2 c; - c.p[0] = (a.p[0] > __hInfValue.h) ? (__half)1 : (__half)0; - c.p[1] = (a.p[1] > __hInfValue.h) ? (__half)1 : (__half)0; + c.x = (a.x > HINF) ? (__half)1 : (__half)0; + c.y = (a.y > HINF) ? (__half)1 : (__half)0; return c; } __device__ __half2 __hle2(__half2 a, __half2 b) { __half2 c; - c.p[0] = (a.p[0] <= b.p[0]) ? (__half)1 : (__half)0; - c.p[1] = (a.p[1] <= b.p[1]) ? (__half)1 : (__half)0; + c.x = (a.x <= b.x) ? (__half)1 : (__half)0; + c.y = (a.y <= b.y) ? (__half)1 : (__half)0; return c; } __device__ __half2 __hlt2(__half2 a, __half2 b) { __half2 c; - c.p[0] = (a.p[0] < b.p[0]) ? (__half)1 : (__half)0; - c.p[1] = (a.p[1] < b.p[1]) ? (__half)1 : (__half)0; + c.x = (a.x < b.x) ? (__half)1 : (__half)0; + c.y = (a.y < b.y) ? (__half)1 : (__half)0; return c; } __device__ __half2 __hne2(__half2 a, __half2 b) { __half2 c; - c.p[0] = (a.p[0] != b.p[0]) ? (__half)1 : (__half)0; - c.p[1] = (a.p[1] != b.p[1]) ? (__half)1 : (__half)0; + c.x = (a.x != b.x) ? (__half)1 : (__half)0; + c.y = (a.y != b.y) ? (__half)1 : (__half)0; return c; } @@ -191,8 +191,8 @@ Conversion instructions */ __device__ __half2 __float22half2_rn(const float2 a) { __half2 b; - b.p[0] = (__half)a.x; - b.p[1] = (__half)a.y; + b.x = (__half)a.x; + b.y = (__half)a.y; return b; } @@ -202,8 +202,8 @@ __device__ __half __float2half(const float a) { __device__ __half2 __float2half2_rn(const float a) { __half2 b; - b.p[0] = (__half)a; - b.p[1] = (__half)a; + b.x = (__half)a; + b.y = (__half)a; return b; } @@ -225,15 +225,15 @@ __device__ __half __float2half_rz(const float a) { __device__ __half2 __floats2half2_rn(const float a, const float b) { __half2 c; - c.p[0] = (__half)a; - c.p[1] = (__half)b; + c.x = (__half)a; + c.y = (__half)b; return c; } __device__ float2 __half22float2(const __half2 a) { float2 b; - b.x = (float)a.p[0]; - b.y = (float)a.p[1]; + b.x = (float)a.x; + b.y = (float)a.y; return b; } @@ -243,8 +243,8 @@ __device__ float __half2float(const __half a) { __device__ __half2 half2half2(const __half a) { __half2 b; - b.p[0] = a; - b.p[1] = a; + b.x = a; + b.y = a; return b; } @@ -358,30 +358,30 @@ __device__ unsigned short int __half_as_ushort(const __half h) { __device__ __half2 __halves2half2(const __half a, const __half b) { __half2 c; - c.p[0] = a; - c.p[1] = b; + c.x = a; + c.y = b; return c; } __device__ float __high2float(const __half2 a) { - return (float)a.p[1]; + return (float)a.y; } __device__ __half __high2half(const __half2 a) { - return a.p[1]; + return a.y; } __device__ __half2 __high2half2(const __half2 a) { __half2 b; - b.p[0] = a.p[1]; - b.p[1] = a.p[1]; + b.x = a.y; + b.y = a.y; return b; } __device__ __half2 __highs2half2(const __half2 a, const __half2 b) { __half2 c; - c.p[0] = a.p[1]; - c.p[1] = b.p[1]; + c.x = a.y; + c.y = b.y; return c; } @@ -418,38 +418,38 @@ __device__ __half __ll2half_rz(long long int i){ } __device__ float __low2float(const __half2 a) { - return (float)a.p[0]; + return (float)a.x; } __device__ __half __low2half(const __half2 a) { - return a.p[0]; + return a.x; } __device__ __half2 __low2half2(const __half2 a, const __half2 b) { __half2 c; - c.p[0] = a.p[0]; - c.p[1] = b.p[0]; + c.x = a.x; + c.y = b.x; return c; } __device__ __half2 __low2half2(const __half2 a) { __half2 b; - b.p[0] = a.p[0]; - b.p[1] = a.p[0]; + b.x = a.x; + b.y = a.x; return b; } __device__ __half2 __lowhigh2highlow(const __half2 a) { __half2 b; - b.p[0] = a.p[1]; - b.p[1] = a.p[0]; + b.x = a.y; + b.y = a.x; return b; } __device__ __half2 __lows2half2(const __half2 a, const __half2 b) { __half2 c; - c.p[0] = a.p[0]; - c.p[1] = b.p[0]; + c.y = a.x; + c.y = b.x; return c; } @@ -542,346 +542,4 @@ typedef struct{ }; } struct_float; -#if __clang_major__ == 3 -static __device__ float cvt_half_to_float(__half a){ - struct_float ret = {0}; - if(a.x == 0){ - return 0.0f; - } - if(a.x == 0x8000){ - return -0.0f; - } - ret.u = ((a.x&0x8000)<<16) | (((a.x&0x7c00)+0x1C000)<<13) | ((a.x&0x03FF)<<13); - return ret.f; -} - -static __device__ __half cvt_float_to_half(float b){ - struct_float f = {0}; - __half ret = {0}; - f.f = b; - if(f.f == 0.0f){ - ret.x = 0; - return ret; - } - if(f.f == -0.0f){ - ret.x = 0x8000; - return ret; - } - ret.x = ((f.u>>16)&0x8000)|((((f.u&0x7f800000)-0x38000000)>>13)&0x7c00)|((f.u>>13)&0x03ff); - return ret; -} - - -__device__ __half __soft_hadd(const __half a, const __half b){ - return cvt_float_to_half(cvt_half_to_float(a)+cvt_half_to_float(b)); -} - -__device__ __half __soft_hadd_sat(const __half a, const __half b){ - float f = cvt_half_to_float(a) + cvt_half_to_float(b); - return (f < 0.0f ? __half_value_zero_float : (f > 1.0f ? __half_value_one_float: cvt_float_to_half(f))); -} - -__device__ __half __soft_hfma(const __half a, const __half b, const __half c){ - return cvt_float_to_half(fmaf(cvt_half_to_float(a), cvt_half_to_float(b), cvt_half_to_float(c))); -} - -__device__ __half __soft_hfma_sat(const __half a, const __half b, const __half c){ - float f = fmaf(cvt_half_to_float(a), cvt_half_to_float(b), cvt_half_to_float(c)); - return (f < 0.0f ? __half_value_zero_float : (f > 1.0f ? __half_value_one_float: cvt_float_to_half(f))); -} - -__device__ __half __soft_hmul(const __half a, const __half b){ - return cvt_float_to_half(cvt_half_to_float(a)*cvt_half_to_float(b)); -} - -__device__ __half __soft_hmul_sat(const __half a, const __half b){ - float f = cvt_half_to_float(a) * cvt_half_to_float(b); - return (f < 0.0f ? __half_value_zero_float : (f > 1.0f ? __half_value_one_float: cvt_float_to_half(f))); -} - -__device__ __half __soft_hneq(const __half a){ - __half ret = {a.x}; - ret.x ^= 1 << 15; - return ret; -} - -__device__ __half __soft_hsub(const __half a, const __half b){ - return cvt_float_to_half(cvt_half_to_float(a)-cvt_half_to_float(b)); -} - -__device__ __half __soft_hsub_sat(const __half a, const __half b){ - float f = cvt_half_to_float(a) - cvt_half_to_float(b); - return (f < 0.0f ? __half_value_zero_float : (f > 1.0f ? __half_value_one_float: cvt_float_to_half(f))); -} - - -/* -Half2 Arithmetic Instructions -*/ - -__device__ __half2 __soft_hadd2(const __half2 a, const __half2 b){ - __half2 ret; - ret.p[1] = __soft_hadd(a.p[1], b.p[1]); - ret.p[0] = __soft_hadd(a.p[0], b.p[0]); - return ret; -} - -__device__ __half2 __soft_hadd2_sat(const __half2 a, const __half2 b){ - __half2 ret; - ret.p[1] = __soft_hadd_sat(a.p[1], b.p[1]); - ret.p[0] = __soft_hadd_sat(a.p[0], b.p[0]); - return ret; -} - -__device__ __half2 __soft_hfma2(const __half2 a, const __half2 b, const __half2 c){ - __half2 ret; - ret.p[1] = __soft_hfma(a.p[1], b.p[1], c.p[1]); - ret.p[0] = __soft_hfma(a.p[0], b.p[0], c.p[0]); - return ret; -} - -__device__ __half2 __soft_hfma2_sat(const __half2 a, const __half2 b, const __half2 c){ - __half2 ret; - ret.p[1] = __soft_hfma_sat(a.p[1], b.p[1], c.p[1]); - ret.p[0] = __soft_hfma_sat(a.p[0], b.p[0], c.p[0]); - return ret; -} - -__device__ __half2 __soft_hmul2(const __half2 a, const __half2 b){ - __half2 ret; - ret.p[1] = __soft_hmul(a.p[1], b.p[1]); - ret.p[0] = __soft_hmul(a.p[0], b.p[0]); - return ret; -} - -__device__ __half2 __soft_hmul2_sat(const __half2 a, const __half2 b){ - __half2 ret; - ret.p[1] = __soft_hmul_sat(a.p[1], b.p[1]); - ret.p[0] = __soft_hmul_sat(a.p[0], b.p[0]); - return ret; -} - -__device__ __half2 __soft_hneq2(const __half2 a){ - __half2 ret; - ret.p[1] = __soft_hneq(a.p[1]); - ret.p[0] = __soft_hneq(a.p[0]); - return ret; -} - -__device__ __half2 __soft_hsub2(const __half2 a, const __half2 b){ - __half2 ret; - ret.p[1] = __soft_hsub(a.p[1], b.p[1]); - ret.p[0] = __soft_hsub(a.p[0], b.p[0]); - return ret; -} - -__device__ __half2 __soft_hsub2_sat(const __half2 a, const __half2 b){ - __half2 ret; - ret.p[1] = __soft_hsub_sat(a.p[1], b.p[1]); - ret.p[0] = __soft_hsub_sat(a.p[0], b.p[0]); - return ret; -} - -/* -Half Cmps -*/ - -__device__ bool __soft_heq(const __half a, const __half b){ - return (a.x == b.x ? true:false); -} - -__device__ bool __soft_hge(const __half a, const __half b){ - return (cvt_half_to_float(a) >= cvt_half_to_float(b)); -} - -__device__ bool __soft_hgt(const __half a, const __half b){ - return (cvt_half_to_float(a) > cvt_half_to_float(b)); -} - -__device__ bool __soft_hisinf(const __half a){ - return ((a.x == __half_neg_inf) ? -1 : (a.x == __half_pos_inf) ? 1 : 0); -} - -__device__ bool __soft_hisnan(const __half a){ - if(((a.x & __half_pos_inf) == a.x) || ((a.x & __half_neg_inf) == a.x)){ - return true; - }else{ - return false; - } -} - -__device__ bool __soft_hle(const __half a, const __half b){ - return (cvt_half_to_float(a) <= cvt_half_to_float(b)); -} - -__device__ bool __soft_hlt(const __half a, const __half b){ - return (cvt_half_to_float(a) < cvt_half_to_float(b)); -} - -__device__ bool __soft_hne(const __half a, const __half b){ - return a.x == b.x ? false : true; -} - -/* -Half2 Cmps -*/ - -__device__ bool __soft_hbeq2(const __half2 a, const __half2 b){ - return __soft_heq(a.p[1], b.p[1]) && __soft_heq(a.p[0], b.p[0]); -} - -__device__ bool __soft_hbge2(const __half2 a, const __half2 b){ - return __soft_hge(a.p[1], b.p[1]) && __soft_hge(a.p[0], b.p[0]); -} - -__device__ bool __soft_hbgt2(const __half2 a, const __half2 b){ - return __soft_hgt(a.p[1], b.p[1]) && __soft_hgt(a.p[0], b.p[0]); -} - -__device__ bool __soft_hble2(const __half2 a, const __half2 b){ - return __soft_hle(a.p[1], b.p[1]) && __soft_hle(a.p[0], b.p[0]); -} - -__device__ bool __soft_hblt2(const __half2 a, const __half2 b){ - return __soft_hlt(a.p[1], b.p[1]) && __soft_hlt(a.p[0], b.p[0]); -} - -__device__ bool __soft_hbne2(const __half2 a, const __half2 b){ - return __soft_hne(a.p[1], b.p[1]) && __soft_hne(a.p[0], b.p[0]); -} - - - -__device__ __half2 __soft_heq2(const __half2 a, const __half2 b){ - __half2 ret = {0}; - ret.p[1] = (__soft_heq(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float; - ret.p[0] = (__soft_heq(a.p[0], b.p[0])) ? __half_value_one_float : __half_value_zero_float; - return ret; -} - -__device__ __half2 __soft_hge2(const __half2 a, const __half2 b){ - __half2 ret = {0}; - ret.p[1] = (__soft_hge(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float; - ret.p[0] = (__soft_hge(a.p[0], b.p[0])) ? __half_value_one_float : __half_value_zero_float; - return ret; -} - -__device__ __half2 __soft_hgt2(const __half2 a, const __half2 b){ - __half2 ret = {0}; - ret.p[1] = (__soft_hgt(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float; - ret.p[0] = (__soft_hgt(a.p[0], b.p[0])) ? __half_value_one_float : __half_value_zero_float; - return ret; -} - -__device__ __half2 __soft_hisnan2(const __half2 a){ - __half2 ret = {0}; - ret.p[1] = __soft_hisnan(a.p[1]) ? __half_value_one_float : __half_value_zero_float; - ret.p[0] = __soft_hisnan(a.p[0]) ? __half_value_one_float : __half_value_zero_float; - return ret; -} - -__device__ __half2 __soft_hle2(const __half2 a, const __half2 b){ - __half2 ret = {0}; - ret.p[1] = (__soft_hle(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float; - ret.p[0] = (__soft_hle(a.p[0], b.p[0])) ? __half_value_one_float : __half_value_zero_float; - return ret; -} - -__device__ __half2 __soft_hlt2(const __half2 a, const __half2 b){ - __half2 ret = {0}; - ret.p[1] = (__soft_hlt(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float; - ret.p[0] = (__soft_hlt(a.p[0], b.p[0])) ? __half_value_one_float : __half_value_zero_float; - return ret; -} - -__device__ __half2 __soft_hne2(const __half2 a, const __half2 b){ - __half2 ret = {0}; - ret.p[1] = (__soft_hne(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float; - ret.p[0] = (__soft_hne(a.p[0], b.p[0])) ? __half_value_one_float : __half_value_zero_float; - return ret; -} - -/* -Half Cnvs and Data Mvmnt -*/ - -__device__ __half2 __soft_float22half2_rn(const float2 a){ - __half2 ret = {0}; - ret.p[1] = cvt_float_to_half(a.x); - ret.p[0] = cvt_float_to_half(a.y); - return ret; -} - -__device__ __half __soft_float2half(const float a){ - return cvt_float_to_half(a); -} - -__device__ __half2 __soft_float2half2_rn(const float a){ - __half ret = cvt_float_to_half(a); - return {ret, ret}; -} - -__device__ __half2 __soft_floats2half2_rn(const float a, const float b){ - return {cvt_float_to_half(a), cvt_float_to_half(b)}; -} - -__device__ float2 __soft_half22float2(const __half2 a){ - return {cvt_half_to_float(a.p[1]), cvt_half_to_float(a.p[0])}; -} - -__device__ float __soft_half2float(const __half a){ - return cvt_half_to_float(a); -} - -__device__ __half2 __soft_half2half2(const __half a){ - return {a,a}; -} - -__device__ __half2 __soft_halves2half2(const __half a, const __half b){ - return {a,b}; -} - -__device__ float __soft_high2float(const __half2 a){ - return cvt_half_to_float(a.p[1]); -} - -__device__ __half __soft_high2half(const __half2 a){ - return a.p[1]; -} - -__device__ __half2 __soft_high2half2(const __half2 a){ - return {a.p[1], a.p[1]}; -} - -__device__ __half2 __soft_highs2half2(const __half2 a, const __half2 b){ - return {a.p[1], b.p[1]}; -} - -__device__ float __soft_low2float(const __half2 a){ - return cvt_half_to_float(a.p[0]); -} - -__device__ __half __soft_low2half(const __half2 a){ - return a.p[0]; -} - -__device__ __half2 __soft_low2half2(const __half2 a){ - return {a.p[0], a.p[0]}; -} - -__device__ __half2 __soft_lows2half2(const __half2 a, const __half2 b){ - return {a.p[0], b.p[0]}; -} - -__device__ __half2 __soft_lowhigh2highlow(const __half2 a){ - return {a.p[0], a.p[1]}; -} - -__device__ __half2 __soft_low2half2(const __half2 a, const __half2 b){ - return {a.p[0], b.p[0]}; -} - - - -#endif diff --git a/projects/hip/src/hip_hc_gfx803.ll b/projects/hip/src/hip_hc_gfx803.ll index 0080fc7d81..7e3d0e37dd 100644 --- a/projects/hip/src/hip_hc_gfx803.ll +++ b/projects/hip/src/hip_hc_gfx803.ll @@ -2,89 +2,122 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64: target triple = "amdgcn--amdhsa" -define i32 @__hip_hc_ir_hadd2_int(i32 %a, i32 %b) #1 { - %1 = tail call i32 asm sideeffect "v_add_f16 $0, $1, $2","=v,v,v"(i32 %a, i32 %b) - tail call void asm sideeffect "v_add_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %1, i32 %a, i32 %b) - ret i32 %1 +define <2 x half> @__hip_hc_ir_hadd2_int(<2 x half> %a, <2 x half> %b) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = bitcast <2 x half> %b to i32 + %3 = tail call i32 asm sideeffect "v_add_f16 $0, $1, $2","=v,v,v"(i32 %1, i32 %2) + tail call void asm sideeffect "v_add_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %3, i32 %1, i32 %2) + %4 = bitcast i32 %3 to <2 x half> + ret <2 x half> %4 } -define i32 @__hip_hc_ir_hfma2_int(i32 %a, i32 %b, i32 %c) #1 { - %1 = tail call i32 asm sideeffect "v_mad_f16 $0, $1, $2, $3","=v,v,v,v"(i32 %a, i32 %b, i32 %c) - tail call void asm sideeffect "v_mul_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %1, i32 %a, i32 %b) - tail call void asm sideeffect "v_add_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %1, i32 %1, i32 %c) - ret i32 %1 +define <2 x half> @__hip_hc_ir_hfma2_int(<2 x half> %a, <2 x half> %b, <2 x half> %c) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = bitcast <2 x half> %b to i32 + %3 = bitcast <2 x half> %c to i32 + %4 = tail call i32 asm sideeffect "v_mad_f16 $0, $1, $2, $3","=v,v,v,v"(i32 %1, i32 %2, i32 %3) + tail call void asm sideeffect "v_mul_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %4, i32 %1, i32 %2) + tail call void asm sideeffect "v_add_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %4, i32 %4, i32 %3) + %5 = bitcast i32 %4 to <2 x half> + ret <2 x half> %5 } -define i32 @__hip_hc_ir_hmul2_int(i32 %a, i32 %b) #1 { - %1 = tail call i32 asm sideeffect "v_mul_f16 $0, $1, $2","=v,v,v"(i32 %a, i32 %b) - tail call void asm sideeffect "v_mul_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %1, i32 %a, i32 %b) - ret i32 %1 +define <2 x half> @__hip_hc_ir_hmul2_int(<2 x half> %a, <2 x half> %b) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = bitcast <2 x half> %b to i32 + %3 = tail call i32 asm sideeffect "v_mul_f16 $0, $1, $2","=v,v,v"(i32 %1, i32 %2) + tail call void asm sideeffect "v_mul_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %3, i32 %1, i32 %2) + %4 = bitcast i32 %3 to <2 x half> + ret <2 x half> %4 } -define i32 @__hip_hc_ir_hsub2_int(i32 %a, i32 %b) #1 { - %1 = tail call i32 asm sideeffect "v_sub_f16 $0, $1, $2","=v,v,v"(i32 %a, i32 %b) - tail call void asm sideeffect "v_sub_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %1, i32 %a, i32 %b) - ret i32 %1 +define <2 x half> @__hip_hc_ir_hsub2_int(<2 x half> %a, <2 x half> %b) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = bitcast <2 x half> %b to i32 + %3 = tail call i32 asm sideeffect "v_sub_f16 $0, $1, $2","=v,v,v"(i32 %1, i32 %2) + tail call void asm sideeffect "v_sub_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %3, i32 %1, i32 %2) + %4 = bitcast i32 %3 to <2 x half> + ret <2 x half> %4 } -define i32 @__hip_hc_ir_h2ceil_int(i32 %a) #1 { - %1 = tail call i32 asm sideeffect "v_ceil_f16 $0, $1","=v,v"(i32 %a) - tail call void asm sideeffect "v_ceil_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a) - ret i32 %1 +define <2 x half> @__hip_hc_ir_h2ceil_int(<2 x half> %a) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = tail call i32 asm sideeffect "v_ceil_f16 $0, $1","=v,v"(i32 %1) + tail call void asm sideeffect "v_ceil_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1) + %3 = bitcast i32 %2 to <2 x half> + ret <2 x half> %3 } -define i32 @__hip_hc_ir_h2cos_int(i32 %a) #1 { - %1 = tail call i32 asm sideeffect "v_cos_f16 $0, $1","=v,v"(i32 %a) - tail call void asm sideeffect "v_cos_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a) - ret i32 %1 +define <2 x half> @__hip_hc_ir_h2cos_int(<2 x half> %a) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = tail call i32 asm sideeffect "v_cos_f16 $0, $1","=v,v"(i32 %1) + tail call void asm sideeffect "v_cos_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1) + %3 = bitcast i32 %2 to <2 x half> + ret <2 x half> %3 } -define i32 @__hip_hc_ir_h2exp2_int(i32 %a) #1 { - %1 = tail call i32 asm sideeffect "v_exp_f16 $0, $1","=v,v"(i32 %a) - tail call void asm sideeffect "v_exp_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a) - ret i32 %1 +define <2 x half> @__hip_hc_ir_h2exp2_int(<2 x half> %a) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = tail call i32 asm sideeffect "v_exp_f16 $0, $1","=v,v"(i32 %1) + tail call void asm sideeffect "v_exp_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1) + %3 = bitcast i32 %2 to <2 x half> + ret <2 x half> %3 } -define i32 @__hip_hc_ir_h2floor_int(i32 %a) #1 { - %1 = tail call i32 asm sideeffect "v_floor_f16 $0, $1","=v,v"(i32 %a) - tail call void asm sideeffect "v_floor_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a) - ret i32 %1 +define <2 x half> @__hip_hc_ir_h2floor_int(<2 x half> %a) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = tail call i32 asm sideeffect "v_floor_f16 $0, $1","=v,v"(i32 %1) + tail call void asm sideeffect "v_floor_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1) + %3 = bitcast i32 %2 to <2 x half> + ret <2 x half> %3 } -define i32 @__hip_hc_ir_h2log2_int(i32 %a) #1 { - %1 = tail call i32 asm sideeffect "v_log_f16 $0, $1","=v,v"(i32 %a) - tail call void asm sideeffect "v_log_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a) - ret i32 %1 +define <2 x half> @__hip_hc_ir_h2log2_int(<2 x half> %a) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = tail call i32 asm sideeffect "v_log_f16 $0, $1","=v,v"(i32 %1) + tail call void asm sideeffect "v_log_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1) + %3 = bitcast i32 %2 to <2 x half> + ret <2 x half> %3 } -define i32 @__hip_hc_ir_h2rcp_int(i32 %a) #1 { - %1 = tail call i32 asm sideeffect "v_rcp_f16 $0, $1","=v,v"(i32 %a) - tail call void asm sideeffect "v_rcp_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a) - ret i32 %1 +define <2 x half> @__hip_hc_ir_h2rcp_int(<2 x half> %a) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = tail call i32 asm sideeffect "v_rcp_f16 $0, $1","=v,v"(i32 %1) + tail call void asm sideeffect "v_rcp_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1) + %3 = bitcast i32 %2 to <2 x half> + ret <2 x half> %3 } -define i32 @__hip_hc_ir_h2rsqrt_int(i32 %a) #1 { - %1 = tail call i32 asm sideeffect "v_rsq_f16 $0, $1","=v,v"(i32 %a) - tail call void asm sideeffect "v_rsq_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a) - ret i32 %1 +define <2 x half> @__hip_hc_ir_h2rsqrt_int(<2 x half> %a) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = tail call i32 asm sideeffect "v_rsq_f16 $0, $1","=v,v"(i32 %1) + tail call void asm sideeffect "v_rsq_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1) + %3 = bitcast i32 %2 to <2 x half> + ret <2 x half> %3 } -define i32 @__hip_hc_ir_h2sin_int(i32 %a) #1 { - %1 = tail call i32 asm sideeffect "v_sin_f16 $0, $1","=v,v"(i32 %a) - tail call void asm sideeffect "v_sin_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a) - ret i32 %1 +define <2 x half> @__hip_hc_ir_h2sin_int(<2 x half> %a) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = tail call i32 asm sideeffect "v_sin_f16 $0, $1","=v,v"(i32 %1) + tail call void asm sideeffect "v_sin_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1) + %3 = bitcast i32 %2 to <2 x half> + ret <2 x half> %3 } -define i32 @__hip_hc_ir_h2sqrt_int(i32 %a) #1 { - %1 = tail call i32 asm sideeffect "v_sqrt_f16 $0, $1","=v,v"(i32 %a) - tail call void asm sideeffect "v_sqrt_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a) - ret i32 %1 +define <2 x half> @__hip_hc_ir_h2sqrt_int(<2 x half> %a) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = tail call i32 asm sideeffect "v_sqrt_f16 $0, $1","=v,v"(i32 %1) + tail call void asm sideeffect "v_sqrt_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1) + %3 = bitcast i32 %2 to <2 x half> + ret <2 x half> %3 } -define i32 @__hip_hc_ir_h2trunc_int(i32 %a) #1 { - %1 = tail call i32 asm sideeffect "v_trunc_f16 $0, $1","=v,v"(i32 %a) - tail call void asm sideeffect "v_trunc_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a) - ret i32 %1 +define <2 x half> @__hip_hc_ir_h2trunc_int(<2 x half> %a) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = tail call i32 asm sideeffect "v_trunc_f16 $0, $1","=v,v"(i32 %1) + tail call void asm sideeffect "v_trunc_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1) + %3 = bitcast i32 %2 to <2 x half> + ret <2 x half> %3 } attributes #1 = { alwaysinline nounwind } From 324dfb870f1cef82f45dd8e83ac3f0831b0eb5c9 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 16 May 2017 21:59:14 -0500 Subject: [PATCH 36/77] Return precise address for hipHostGetDevicePointer. [ROCm/hip commit: 46030bb2d2a30641023dd9a6d53266e32e9a5f9f] --- projects/hip/src/hip_memory.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/projects/hip/src/hip_memory.cpp b/projects/hip/src/hip_memory.cpp index 5501fec734..fc2ada134e 100644 --- a/projects/hip/src/hip_memory.cpp +++ b/projects/hip/src/hip_memory.cpp @@ -202,7 +202,8 @@ hipError_t hipHostGetDevicePointer(void **devicePointer, void *hostPointer, unsi hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0); am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, hostPointer); if (status == AM_SUCCESS) { - *devicePointer = amPointerInfo._devicePointer; + *devicePointer = static_cast(amPointerInfo._devicePointer) + (static_cast(hostPointer) - static_cast(amPointerInfo._hostPointer)) ; + tprintf(DB_MEM, " host_ptr=%p returned device_pointer=%p\n", hostPointer, *devicePointer); } else { e = hipErrorMemoryAllocation; } From 721cb0f7db87222e3c7ebebb27894b9ae91c4939 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Thu, 18 May 2017 10:50:56 -0500 Subject: [PATCH 37/77] fixed vector type issues by reverting to old code, changed __half2 to map to vector types in llvm Change-Id: I7317408c25e8c1a0c02a346042c9137e160c8bbd [ROCm/hip commit: bdc08fcf10ebedff169ea3611c1b019f7070c829] --- .../hip/include/hip/hcc_detail/hip_fp16.h | 5 +- .../include/hip/hcc_detail/hip_vector_types.h | 4038 ++++++++++++++++- 2 files changed, 3978 insertions(+), 65 deletions(-) diff --git a/projects/hip/include/hip/hcc_detail/hip_fp16.h b/projects/hip/include/hip/hcc_detail/hip_fp16.h index f1f52e4122..a1abce2191 100644 --- a/projects/hip/include/hip/hcc_detail/hip_fp16.h +++ b/projects/hip/include/hip/hcc_detail/hip_fp16.h @@ -25,8 +25,9 @@ THE SOFTWARE. #include "hip/hcc_detail/hip_vector_types.h" -typedef __half half; -typedef __half2 half2; +typedef __fp16 __half; +typedef __fp16 __half1 __attribute__((ext_vector_type(1))); +typedef __fp16 __half2 __attribute__((ext_vector_type(2))); /* Half Arithmetic Functions diff --git a/projects/hip/include/hip/hcc_detail/hip_vector_types.h b/projects/hip/include/hip/hcc_detail/hip_vector_types.h index 251da504ab..3c3b26c12a 100644 --- a/projects/hip/include/hip/hcc_detail/hip_vector_types.h +++ b/projects/hip/include/hip/hcc_detail/hip_vector_types.h @@ -34,93 +34,1120 @@ THE SOFTWARE. #include "hip/hcc_detail/host_defines.h" -#if __cplusplus +#define MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(type) \ +__device__ __host__ type() {} \ +__device__ __host__ type(type& val) : x(val.x) { } \ +__device__ __host__ type(const type& val) : x(val.x) { } -typedef unsigned char uchar1 __attribute__((ext_vector_type(1))); -typedef unsigned char uchar2 __attribute__((ext_vector_type(2))); -typedef unsigned char uchar3 __attribute__((ext_vector_type(3))); -typedef unsigned char uchar4 __attribute__((ext_vector_type(4))); +#define MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(type) \ +__device__ __host__ type() {} \ +__device__ __host__ type(type& val) : x(val.x), y(val.y) { } \ +__device__ __host__ type(const type& val) : x(val.x), y(val.y) { } -typedef signed char char1 __attribute__((ext_vector_type(1))); -typedef signed char char2 __attribute__((ext_vector_type(2))); -typedef signed char char3 __attribute__((ext_vector_type(3))); -typedef signed char char4 __attribute__((ext_vector_type(4))); +#define MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(type) \ +__device__ __host__ type() {} \ +__device__ __host__ type(type& val) : x(val.x), y(val.y), z(val.z) { } \ +__device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z) { } -typedef unsigned short ushort1 __attribute__((ext_vector_type(1))); -typedef unsigned short ushort2 __attribute__((ext_vector_type(2))); -typedef unsigned short ushort3 __attribute__((ext_vector_type(3))); -typedef unsigned short ushort4 __attribute__((ext_vector_type(4))); +#define MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(type) \ +__device__ __host__ type() {} \ +__device__ __host__ type(type& val) : x(val.x), y(val.y), z(val.z), w(val.w) { } \ +__device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z), w(val.w) { } -typedef signed short short1 __attribute__((ext_vector_type(1))); -typedef signed short short2 __attribute__((ext_vector_type(2))); -typedef signed short short3 __attribute__((ext_vector_type(3))); -typedef signed short short4 __attribute__((ext_vector_type(4))); -typedef __fp16 __half; +#define MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(type, type1) \ +__device__ __host__ type(type1 val) : x(val) {} \ -typedef __fp16 __half1 __attribute__((ext_vector_type(1))); -typedef __fp16 __half2 __attribute__((ext_vector_type(2))); -typedef __fp16 __half3 __attribute__((ext_vector_type(3))); -typedef __fp16 __half4 __attribute__((ext_vector_type(4))); +#define MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(type, type1) \ +__device__ __host__ type(type1 val) : x(val), y(val) {} \ +__device__ __host__ type(type1 val1, type1 val2) : x(val1), y(val2) {} -typedef unsigned int uint1 __attribute__((ext_vector_type(1))); -typedef unsigned int uint2 __attribute__((ext_vector_type(2))); -typedef unsigned int uint3 __attribute__((ext_vector_type(3))); -typedef unsigned int uint4 __attribute__((ext_vector_type(4))); +#define MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(type, type1) \ +__device__ __host__ type(type1 val) : x(val), y(val), z(val) {} \ +__device__ __host__ type(type1 val1, type1 val2, type1 val3) : x(val1), y(val2), z(val3) {} -typedef signed int int1 __attribute__((ext_vector_type(1))); -typedef signed int int2 __attribute__((ext_vector_type(2))); -typedef signed int int3 __attribute__((ext_vector_type(3))); -typedef signed int int4 __attribute__((ext_vector_type(4))); +#define MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(type, type1) \ +__device__ __host__ type(type1 val) : x(val), y(val), z(val), w(val) {} \ +__device__ __host__ type(type1 val1, type1 val2, type1 val3, type1 val4) : x(val1), y(val2), z(val3), w(val4) {} -typedef float float1 __attribute__((ext_vector_type(1))); -typedef float float2 __attribute__((ext_vector_type(2))); -typedef float float3 __attribute__((ext_vector_type(3))); -typedef float float4 __attribute__((ext_vector_type(4))); +struct uchar1 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(uchar1) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, signed char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, signed short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, signed int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, float) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, double) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, signed long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, signed long long) -typedef unsigned long ulong1 __attribute__((ext_vector_type(1))); -typedef unsigned long ulong2 __attribute__((ext_vector_type(2))); -typedef unsigned long ulong3 __attribute__((ext_vector_type(3))); -typedef unsigned long ulong4 __attribute__((ext_vector_type(4))); + #endif + unsigned char x; -typedef signed long long1 __attribute__((ext_vector_type(1))); -typedef signed long long2 __attribute__((ext_vector_type(2))); -typedef signed long long3 __attribute__((ext_vector_type(3))); -typedef signed long long4 __attribute__((ext_vector_type(4))); +} __attribute__((aligned(1))); -typedef double double1 __attribute__((ext_vector_type(1))); -typedef double double2 __attribute__((ext_vector_type(2))); -typedef double double3 __attribute__((ext_vector_type(3))); -typedef double double4 __attribute__((ext_vector_type(4))); +struct uchar2 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(uchar2) -typedef unsigned long long ulonglong1 __attribute__((ext_vector_type(1))); -typedef unsigned long long ulonglong2 __attribute__((ext_vector_type(2))); -typedef unsigned long long ulonglong3 __attribute__((ext_vector_type(3))); -typedef unsigned long long ulonglong4 __attribute__((ext_vector_type(4))); + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, signed char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, signed short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, signed int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, float) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, double) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, signed long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, signed long long) + #endif + union { + struct { + unsigned char x, y; + }; + unsigned short a; + }; +} __attribute__((aligned(2))); -typedef signed long long longlong1 __attribute__((ext_vector_type(1))); -typedef signed long long longlong2 __attribute__((ext_vector_type(2))); -typedef signed long long longlong3 __attribute__((ext_vector_type(3))); -typedef signed long long longlong4 __attribute__((ext_vector_type(4))); +struct uchar3 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(uchar3) + + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, signed char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, signed short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, signed int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, float) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, double) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, signed long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, signed long long) + #endif + unsigned char x, y, z; +}; + +struct uchar4 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(uchar4) + + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, signed char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, signed short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, signed int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, float) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, double) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, signed long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, signed long long) + #endif + union { + struct { + unsigned char x, y, z, w; + }; + unsigned int a; + }; +} __attribute__((aligned(4))); + + +struct char1 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(char1) + + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, signed char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, signed short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, signed int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, float) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, double) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, signed long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, signed long long) + #endif + signed char x; +} __attribute__((aligned(1))); + +struct char2 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(char2) + + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, signed char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, signed short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, signed int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, float) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, double) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, signed long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, signed long long) + #endif + union { + struct { + signed char x, y; + }; + unsigned short a; + }; +} __attribute__((aligned(2))); + +struct char3 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(char3) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, signed char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, signed short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, signed int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, float) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, double) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, signed long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, signed long long) + #endif + signed char x, y, z; +}; + +struct char4 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(char4) + + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, signed char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, signed short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, signed int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, float) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, double) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, signed long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, signed long long) + #endif + union { + struct { + signed char x, y, z, w; + }; + unsigned int a; + }; +} __attribute__((aligned(4))); + + + +struct ushort1 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(ushort1) + + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, signed char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, signed short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, signed int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, float) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, double) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, signed long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, signed long long) + #endif + unsigned short x; +} __attribute__((aligned(2))); + +struct ushort2 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(ushort2) + + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, signed char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, signed short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, signed int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, float) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, double) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, signed long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, signed long long) + #endif + union { + struct { + unsigned short x, y; + }; + unsigned int a; + }; +} __attribute__((aligned(4))); + +struct ushort3 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(ushort3) + + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, signed char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, signed short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, signed int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, float) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, double) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, signed long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, signed long long) + #endif + unsigned short x, y, z; +}; + +struct ushort4 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(ushort4) + + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, signed char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, signed short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, signed int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, float) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, double) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, signed long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, signed long long) + #endif + union { + struct { + unsigned short x, y, z, w; + }; + unsigned int a, b; + }; +} __attribute__((aligned(8))); + +struct short1 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(short1) + + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, signed char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, signed short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, signed int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, float) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, double) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, signed long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, signed long long) + #endif + signed short x; +} __attribute__((aligned(2))); + +struct short2 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(short2) + + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, signed char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, signed short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, signed int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, float) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, double) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, signed long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, signed long long) + #endif + union { + struct { + signed short x, y; + }; + unsigned int a; + }; + +} __attribute__((aligned(4))); + +struct short3 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(short3) + + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, signed char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, signed short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, signed int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, float) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, double) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, signed long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, signed long long) + #endif + signed short x, y, z; +}; + +struct short4 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(short4) + + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, signed char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, signed short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, signed int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, float) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, double) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, signed long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, signed long long) + #endif + union { + struct { + signed short x, y, z, w; + }; + unsigned int a, b; + }; +} __attribute__((aligned(8))); + + +struct uint1 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(uint1) + + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, signed char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, signed short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, signed int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, float) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, double) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, signed long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, signed long long) + #endif + unsigned int x; +} __attribute__((aligned(4))); + +struct uint2 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(uint2) + + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, signed char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, signed short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, signed int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, float) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, double) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, signed long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, signed long long) + #endif + unsigned int x, y; +} __attribute__((aligned(8))); + +struct uint3 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(uint3) + + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, signed char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, signed short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, signed int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, float) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, double) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, signed long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, signed long long) + #endif + unsigned int x, y, z; +}; + +struct uint4 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(uint4) + + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, signed char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, signed short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, signed int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, float) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, double) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, signed long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, signed long long) + #endif + unsigned int x, y, z, w; +} __attribute__((aligned(16))); + +struct int1 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(int1) + + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, signed char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, signed short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, signed int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, float) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, double) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, signed long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, signed long long) + #endif + signed int x; +} __attribute__((aligned(4))); + +struct int2 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(int2) + + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, signed char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, signed short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, signed int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, float) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, double) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, signed long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, signed long long) + #endif + signed int x, y; +} __attribute__((aligned(8))); + +struct int3 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(int3) + + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, signed char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, signed short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, signed int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, float) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, double) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, signed long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, signed long long) + #endif + signed int x, y, z; +}; + +struct int4 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(int4) + + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, signed char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, signed short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, signed int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, float) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, double) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, signed long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, signed long long) + #endif + signed int x, y, z, w; +} __attribute__((aligned(16))); + + +struct float1 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(float1) + + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, signed char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, signed short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, signed int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, float) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, double) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, signed long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, signed long long) + #endif + float x; +} __attribute__((aligned(4))); + +struct float2 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(float2) + + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, signed char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, signed short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, signed int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, float) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, double) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, signed long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, signed long long) + #endif + float x, y; +} __attribute__((aligned(8))); + +struct float3 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(float3) + + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, signed char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, signed short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, signed int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, float) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, double) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, signed long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, signed long long) + #endif + float x, y, z; +}; + +struct float4 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(float4) + + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, signed char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, signed short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, signed int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, float) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, double) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, signed long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, signed long long) + #endif + float x, y, z, w; +} __attribute__((aligned(16))); + + + +struct double1 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(double1) + + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, signed char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, signed short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, signed int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, float) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, double) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, signed long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, signed long long) + #endif + double x; +} __attribute__((aligned(8))); + +struct double2 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(double2) + + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, signed char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, signed short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, signed int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, float) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, double) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, signed long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, signed long long) + #endif + double x, y; +} __attribute__((aligned(16))); + +struct double3 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(double3) + + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, signed char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, signed short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, signed int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, float) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, double) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, signed long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, signed long long) + #endif + double x, y, z; +}; + +struct double4 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(double4) + + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, signed char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, signed short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, signed int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, float) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, double) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, signed long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, signed long long) + #endif + double x, y, z, w; +} __attribute__((aligned(32))); + + +struct ulong1 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(ulong1) + + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, signed char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, signed short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, signed int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, float) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, double) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, signed long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, signed long long) + #endif + unsigned long x; +} __attribute__((aligned(8))); + +struct ulong2 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(ulong2) + + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, signed char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, signed short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, signed int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, float) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, double) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, signed long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, signed long long) + #endif + unsigned long x, y; +} __attribute__((aligned(16))); + +struct ulong3 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(ulong3) + + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, signed char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, signed short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, signed int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, float) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, double) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, signed long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, signed long long) + #endif + unsigned long x, y, z; +}; + +struct ulong4 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(ulong4) + + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, signed char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, signed short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, signed int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, float) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, double) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, signed long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, signed long long) + #endif + unsigned long x, y, z, w; +} __attribute__((aligned(32))); + + +struct long1 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(long1) + + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, signed char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, signed short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, signed int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, float) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, double) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, signed long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, signed long long) + #endif + signed long x; +} __attribute__((aligned(8))); + +struct long2 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(long2) + + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, signed char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, signed short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, signed int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, float) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, double) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, signed long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, signed long long) + #endif + signed long x, y; +} __attribute__((aligned(16))); + +struct long3 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(long3) + + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, signed char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, signed short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, signed int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, float) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, double) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, signed long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, signed long long) + #endif + signed long x, y, z; +}; + +struct long4 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(long4) + + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, signed char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, signed short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, signed int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, float) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, double) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, signed long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, signed long long) + #endif + signed long x, y, z, w; +} __attribute__((aligned(32))); + + +struct ulonglong1 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1) + + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, signed char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, signed short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, signed int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, float) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, double) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, signed long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, signed long long) + #endif + unsigned long long x; +} __attribute__((aligned(8))); + +struct ulonglong2 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2) + + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, signed char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, signed short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, signed int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, float) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, double) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, signed long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, signed long long) + #endif + unsigned long long x, y; +} __attribute__((aligned(16))); + +struct ulonglong3 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3) + + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, signed char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, signed short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, signed int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, float) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, double) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, signed long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, signed long long) + #endif + unsigned long long x, y, z; +}; + +struct ulonglong4 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4) + + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, signed char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, signed short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, signed int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, float) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, double) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, signed long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, signed long long) + #endif + unsigned long long x, y, z, w; +} __attribute__((aligned(32))); + + +struct longlong1 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(longlong1) + + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, signed char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, signed short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, signed int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, float) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, double) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, signed long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, signed long long) + #endif + signed long long x; +} __attribute__((aligned(8))); + +struct longlong2 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(longlong2) + + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, signed char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, signed short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, signed int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, float) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, double) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, signed long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, signed long long) + #endif + signed long long x, y; +} __attribute__((aligned(16))); + +struct longlong3 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(longlong3) + + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, signed char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, signed short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, signed int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, float) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, double) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, signed long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, signed long long) + #endif + signed long long x, y, z; +}; + +struct longlong4 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(longlong4) + + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, signed char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, signed short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, signed int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, float) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, double) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, signed long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, signed long long) + #endif + signed long x, y, z, w; +} __attribute__((aligned(32))); #define DECLOP_MAKE_ONE_COMPONENT(comp, type) \ -__device__ __host__ static inline type make_##type(comp x) { \ - type ret; \ +__device__ __host__ static inline struct type make_##type(comp x) { \ + struct type ret; \ ret.x = x; \ return ret; \ } #define DECLOP_MAKE_TWO_COMPONENT(comp, type) \ -__device__ __host__ static inline type make_##type(comp x, comp y) { \ - type ret; \ +__device__ __host__ static inline struct type make_##type(comp x, comp y) { \ + struct type ret; \ ret.x = x; \ ret.y = y; \ return ret; \ } #define DECLOP_MAKE_THREE_COMPONENT(comp, type) \ -__device__ __host__ static inline type make_##type(comp x, comp y, comp z) { \ - type ret; \ +__device__ __host__ static inline struct type make_##type(comp x, comp y, comp z) { \ + struct type ret; \ ret.x = x; \ ret.y = y; \ ret.z = z; \ @@ -128,8 +1155,8 @@ __device__ __host__ static inline type make_##type(comp x, comp y, comp z) { \ } #define DECLOP_MAKE_FOUR_COMPONENT(comp, type) \ -__device__ __host__ static inline type make_##type(comp x, comp y, comp z, comp w) { \ - type ret; \ +__device__ __host__ static inline struct type make_##type(comp x, comp y, comp z, comp w) { \ + struct type ret; \ ret.x = x; \ ret.y = y; \ ret.z = z; \ @@ -137,7 +1164,6 @@ __device__ __host__ static inline type make_##type(comp x, comp y, comp z, comp return ret; \ } - DECLOP_MAKE_ONE_COMPONENT(unsigned char, uchar1); DECLOP_MAKE_TWO_COMPONENT(unsigned char, uchar2); DECLOP_MAKE_THREE_COMPONENT(unsigned char, uchar3); @@ -199,6 +1225,2892 @@ DECLOP_MAKE_THREE_COMPONENT(signed long, longlong3); DECLOP_MAKE_FOUR_COMPONENT(signed long, longlong4); +#if __cplusplus + +#define DECLOP_1VAR_2IN_1OUT(type, op) \ +__device__ __host__ static inline type operator op (const type& lhs, const type& rhs) { \ + type ret; \ + ret.x = lhs.x op rhs.x; \ + return ret; \ +} + +#define DECLOP_1VAR_SCALE_PRODUCT(type, type1) \ +__device__ __host__ static inline type operator * (const type& lhs, type1 rhs) { \ + type ret; \ + ret.x = lhs.x * rhs; \ + return ret; \ +} \ +\ +__device__ __host__ static inline type operator * (type1 lhs, const type& rhs) { \ + type ret; \ + ret.x = lhs * rhs.x; \ + return ret; \ +} + +#define DECLOP_1VAR_ASSIGN(type, op) \ +__device__ __host__ static inline type& operator op ( type& lhs, const type& rhs) { \ + lhs.x op rhs.x; \ + return lhs; \ +} + +#define DECLOP_1VAR_PREOP(type, op) \ +__device__ __host__ static inline type& operator op (type& val) { \ + op val.x; \ + return val; \ +} + +#define DECLOP_1VAR_POSTOP(type, op) \ +__device__ __host__ static inline type operator op (type& val, int) { \ + type ret; \ + ret.x = val.x; \ + val.x op; \ + return ret; \ +} + +#define DECLOP_1VAR_COMP(type, op) \ +__device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \ + return lhs.x op rhs.x; \ +} \ +__device__ __host__ static inline bool operator op (const type& lhs, type& rhs) { \ + return lhs.x op rhs.x; \ +} \ +__device__ __host__ static inline bool operator op (type& lhs, const type& rhs) { \ + return lhs.x op rhs.x ; \ +} \ +__device__ __host__ static inline bool operator op (const type& lhs, const type& rhs) { \ + return lhs.x op rhs.x ; \ +} + +#define DECLOP_1VAR_1IN_1OUT(type, op) \ +__device__ __host__ static inline type operator op(type& rhs) { \ + type ret; \ + ret.x = op rhs.x; \ + return ret; \ +} + +#define DECLOP_1VAR_1IN_BOOLOUT(type, op) \ +__device__ __host__ static inline bool operator op (type& rhs) { \ + return op rhs.x; \ +} + +/* + Two Element Access +*/ + +#define DECLOP_2VAR_2IN_1OUT(type, op) \ +__device__ __host__ static inline type operator op (const type& lhs, const type& rhs) { \ + type ret; \ + ret.x = lhs.x op rhs.x; \ + ret.y = lhs.y op rhs.y; \ + return ret; \ +} + +#define DECLOP_2VAR_SCALE_PRODUCT(type, type1) \ +__device__ __host__ static inline type operator * (const type& lhs, type1 rhs) { \ + type ret; \ + ret.x = lhs.x * rhs; \ + ret.y = lhs.y * rhs; \ + return ret; \ +} \ +\ +__device__ __host__ static inline type operator * (type1 lhs, const type& rhs) { \ + type ret; \ + ret.x = lhs * rhs.x; \ + ret.y = lhs * rhs.y; \ + return ret; \ +} + +#define DECLOP_2VAR_ASSIGN(type, op) \ +__device__ __host__ static inline type& operator op ( type& lhs, const type& rhs) { \ + lhs.x op rhs.x; \ + lhs.y op rhs.y; \ + return lhs; \ +} + +#define DECLOP_2VAR_PREOP(type, op) \ +__device__ __host__ static inline type& operator op (type& val) { \ + op val.x; \ + op val.y; \ + return val; \ +} + +#define DECLOP_2VAR_POSTOP(type, op) \ +__device__ __host__ static inline type operator op (type& val, int) { \ + type ret; \ + ret.x = val.x; \ + ret.y = val.y; \ + val.x op; \ + val.y op; \ + return ret; \ +} + +#define DECLOP_2VAR_COMP(type, op) \ +__device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \ + return (lhs.x op rhs.x) && (lhs.y op rhs.y); \ +} \ +__device__ __host__ static inline bool operator op (const type& lhs, type& rhs) { \ + return (lhs.x op rhs.x) && (lhs.y op rhs.y); \ +} \ +__device__ __host__ static inline bool operator op (type& lhs, const type& rhs) { \ + return (lhs.x op rhs.x) && (lhs.y op rhs.y); \ +} \ +__device__ __host__ static inline bool operator op (const type& lhs, const type& rhs) { \ + return (lhs.x op rhs.x) && (lhs.y op rhs.y); \ +} + +#define DECLOP_2VAR_1IN_1OUT(type, op) \ +__device__ __host__ static inline type operator op(type &rhs) { \ + type ret; \ + ret.x = op rhs.x; \ + ret.y = op rhs.y; \ + return ret; \ +} + +#define DECLOP_2VAR_1IN_BOOLOUT(type, op) \ +__device__ __host__ static inline bool operator op (type &rhs) { \ + return (op rhs.x) && (op rhs.y); \ +} + + +/* + Three Element Access +*/ + +#define DECLOP_3VAR_2IN_1OUT(type, op) \ +__device__ __host__ static inline type operator op (const type& lhs, const type& rhs) { \ + type ret; \ + ret.x = lhs.x op rhs.x; \ + ret.y = lhs.y op rhs.y; \ + ret.z = lhs.z op rhs.z; \ + return ret; \ +} + +#define DECLOP_3VAR_SCALE_PRODUCT(type, type1) \ +__device__ __host__ static inline type operator * (const type& lhs, type1 rhs) { \ + type ret; \ + ret.x = lhs.x * rhs; \ + ret.y = lhs.y * rhs; \ + ret.z = lhs.z * rhs; \ + return ret; \ +} \ +\ +__device__ __host__ static inline type operator * (type1 lhs, const type& rhs) { \ + type ret; \ + ret.x = lhs * rhs.x; \ + ret.y = lhs * rhs.y; \ + ret.z = lhs * rhs.z; \ + return ret; \ +} + +#define DECLOP_3VAR_ASSIGN(type, op) \ +__device__ __host__ static inline type& operator op ( type& lhs, const type& rhs) { \ + lhs.x op rhs.x; \ + lhs.y op rhs.y; \ + lhs.z op rhs.z; \ + return lhs; \ +} + +#define DECLOP_3VAR_PREOP(type, op) \ +__device__ __host__ static inline type& operator op (type& val) { \ + op val.x; \ + op val.y; \ + op val.z; \ + return val; \ +} + +#define DECLOP_3VAR_POSTOP(type, op) \ +__device__ __host__ static inline type operator op (type& val, int) { \ + type ret; \ + ret.x = val.x; \ + ret.y = val.y; \ + ret.z = val.z; \ + val.x op; \ + val.y op; \ + val.z op; \ + return ret; \ +} + +#define DECLOP_3VAR_COMP(type, op) \ +__device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \ + return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z); \ +} \ +__device__ __host__ static inline bool operator op (const type& lhs, type& rhs) { \ + return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z); \ +} \ +__device__ __host__ static inline bool operator op (type& lhs, const type& rhs) { \ + return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z); \ +} \ +__device__ __host__ static inline bool operator op (const type& lhs, const type& rhs) { \ + return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z); \ +} \ + +#define DECLOP_3VAR_1IN_1OUT(type, op) \ +__device__ __host__ static inline type operator op(type &rhs) { \ + type ret; \ + ret.x = op rhs.x; \ + ret.y = op rhs.y; \ + ret.z = op rhs.z; \ + return ret; \ +} + +#define DECLOP_3VAR_1IN_BOOLOUT(type, op) \ +__device__ __host__ static inline bool operator op (type &rhs) { \ + return (op rhs.x) && (op rhs.y) && (op rhs.z); \ +} + + +/* + Four Element Access +*/ + +#define DECLOP_4VAR_2IN_1OUT(type, op) \ +__device__ __host__ static inline type operator op ( const type& lhs, const type& rhs) { \ + type ret; \ + ret.x = lhs.x op rhs.x; \ + ret.y = lhs.y op rhs.y; \ + ret.z = lhs.z op rhs.z; \ + ret.w = lhs.w op rhs.w; \ + return ret; \ +} + +#define DECLOP_4VAR_SCALE_PRODUCT(type, type1) \ +__device__ __host__ static inline type operator * (const type& lhs, type1 rhs) { \ + type ret; \ + ret.x = lhs.x * rhs; \ + ret.y = lhs.y * rhs; \ + ret.z = lhs.z * rhs; \ + ret.w = lhs.w * rhs; \ + return ret; \ +} \ +\ +__device__ __host__ static inline type operator * (type1 lhs, const type& rhs) { \ + type ret; \ + ret.x = lhs * rhs.x; \ + ret.y = lhs * rhs.y; \ + ret.z = lhs * rhs.z; \ + ret.w = lhs * rhs.w; \ + return ret; \ +} + +#define DECLOP_4VAR_ASSIGN(type, op) \ +__device__ __host__ static inline type& operator op ( type& lhs, const type& rhs) { \ + lhs.x op rhs.x; \ + lhs.y op rhs.y; \ + lhs.z op rhs.z; \ + lhs.w op rhs.w; \ + return lhs; \ +} + +#define DECLOP_4VAR_PREOP(type, op) \ +__device__ __host__ static inline type& operator op (type& val) { \ + op val.x; \ + op val.y; \ + op val.z; \ + op val.w; \ + return val; \ +} + +#define DECLOP_4VAR_POSTOP(type, op) \ +__device__ __host__ static inline type operator op (type& val, int) { \ + type ret; \ + ret.x = val.x; \ + ret.y = val.y; \ + ret.z = val.z; \ + ret.w = val.w; \ + val.x op; \ + val.y op; \ + val.z op; \ + val.w op; \ + return ret; \ +} + +#define DECLOP_4VAR_COMP(type, op) \ +__device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \ + return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z) && (lhs.w op rhs.w); \ +} \ +__device__ __host__ static inline bool operator op (const type& lhs, type& rhs) { \ + return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z) && (lhs.w op rhs.w); \ +} \ +__device__ __host__ static inline bool operator op (type& lhs, const type& rhs) { \ + return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z) && (lhs.w op rhs.w); \ +} \ +__device__ __host__ static inline bool operator op (const type& lhs, const type& rhs) { \ + return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z) && (lhs.w op rhs.w); \ +} + +#define DECLOP_4VAR_1IN_1OUT(type, op) \ +__device__ __host__ static inline type operator op(type &rhs) { \ + type ret; \ + ret.x = op rhs.x; \ + ret.y = op rhs.y; \ + ret.z = op rhs.z; \ + ret.w = op rhs.w; \ + return ret; \ +} + +#define DECLOP_4VAR_1IN_BOOLOUT(type, op) \ +__device__ __host__ static inline bool operator op (type &rhs) { \ + return (op rhs.x) && (op rhs.y) && (op rhs.z) && (op rhs.w); \ +} + + +/* +Overloading operators +*/ + +// UNSIGNED CHAR1 + +DECLOP_1VAR_2IN_1OUT(uchar1, +) +DECLOP_1VAR_2IN_1OUT(uchar1, -) +DECLOP_1VAR_2IN_1OUT(uchar1, *) +DECLOP_1VAR_2IN_1OUT(uchar1, /) +DECLOP_1VAR_2IN_1OUT(uchar1, %) +DECLOP_1VAR_2IN_1OUT(uchar1, &) +DECLOP_1VAR_2IN_1OUT(uchar1, |) +DECLOP_1VAR_2IN_1OUT(uchar1, ^) +DECLOP_1VAR_2IN_1OUT(uchar1, <<) +DECLOP_1VAR_2IN_1OUT(uchar1, >>) + + +DECLOP_1VAR_ASSIGN(uchar1, +=) +DECLOP_1VAR_ASSIGN(uchar1, -=) +DECLOP_1VAR_ASSIGN(uchar1, *=) +DECLOP_1VAR_ASSIGN(uchar1, /=) +DECLOP_1VAR_ASSIGN(uchar1, %=) +DECLOP_1VAR_ASSIGN(uchar1, &=) +DECLOP_1VAR_ASSIGN(uchar1, |=) +DECLOP_1VAR_ASSIGN(uchar1, ^=) +DECLOP_1VAR_ASSIGN(uchar1, <<=) +DECLOP_1VAR_ASSIGN(uchar1, >>=) + +DECLOP_1VAR_PREOP(uchar1, ++) +DECLOP_1VAR_PREOP(uchar1, --) + +DECLOP_1VAR_POSTOP(uchar1, ++) +DECLOP_1VAR_POSTOP(uchar1, --) + +DECLOP_1VAR_COMP(uchar1, ==) +DECLOP_1VAR_COMP(uchar1, !=) +DECLOP_1VAR_COMP(uchar1, <) +DECLOP_1VAR_COMP(uchar1, >) +DECLOP_1VAR_COMP(uchar1, <=) +DECLOP_1VAR_COMP(uchar1, >=) + +DECLOP_1VAR_COMP(uchar1, &&) +DECLOP_1VAR_COMP(uchar1, ||) + +DECLOP_1VAR_1IN_1OUT(uchar1, ~) +DECLOP_1VAR_1IN_BOOLOUT(uchar1, !) + +DECLOP_1VAR_SCALE_PRODUCT(uchar1, unsigned char) +DECLOP_1VAR_SCALE_PRODUCT(uchar1, signed char) +DECLOP_1VAR_SCALE_PRODUCT(uchar1, unsigned short) +DECLOP_1VAR_SCALE_PRODUCT(uchar1, signed short) +DECLOP_1VAR_SCALE_PRODUCT(uchar1, unsigned int) +DECLOP_1VAR_SCALE_PRODUCT(uchar1, signed int) +DECLOP_1VAR_SCALE_PRODUCT(uchar1, float) +DECLOP_1VAR_SCALE_PRODUCT(uchar1, unsigned long) +DECLOP_1VAR_SCALE_PRODUCT(uchar1, signed long) +DECLOP_1VAR_SCALE_PRODUCT(uchar1, double) +DECLOP_1VAR_SCALE_PRODUCT(uchar1, unsigned long long) +DECLOP_1VAR_SCALE_PRODUCT(uchar1, signed long long) + +// UNSIGNED CHAR2 + +DECLOP_2VAR_2IN_1OUT(uchar2, +) +DECLOP_2VAR_2IN_1OUT(uchar2, -) +DECLOP_2VAR_2IN_1OUT(uchar2, *) +DECLOP_2VAR_2IN_1OUT(uchar2, /) +DECLOP_2VAR_2IN_1OUT(uchar2, %) +DECLOP_2VAR_2IN_1OUT(uchar2, &) +DECLOP_2VAR_2IN_1OUT(uchar2, |) +DECLOP_2VAR_2IN_1OUT(uchar2, ^) +DECLOP_2VAR_2IN_1OUT(uchar2, <<) +DECLOP_2VAR_2IN_1OUT(uchar2, >>) + +DECLOP_2VAR_ASSIGN(uchar2, +=) +DECLOP_2VAR_ASSIGN(uchar2, -=) +DECLOP_2VAR_ASSIGN(uchar2, *=) +DECLOP_2VAR_ASSIGN(uchar2, /=) +DECLOP_2VAR_ASSIGN(uchar2, %=) +DECLOP_2VAR_ASSIGN(uchar2, &=) +DECLOP_2VAR_ASSIGN(uchar2, |=) +DECLOP_2VAR_ASSIGN(uchar2, ^=) +DECLOP_2VAR_ASSIGN(uchar2, <<=) +DECLOP_2VAR_ASSIGN(uchar2, >>=) + +DECLOP_2VAR_PREOP(uchar2, ++) +DECLOP_2VAR_PREOP(uchar2, --) + +DECLOP_2VAR_POSTOP(uchar2, ++) +DECLOP_2VAR_POSTOP(uchar2, --) + +DECLOP_2VAR_COMP(uchar2, ==) +DECLOP_2VAR_COMP(uchar2, !=) +DECLOP_2VAR_COMP(uchar2, <) +DECLOP_2VAR_COMP(uchar2, >) +DECLOP_2VAR_COMP(uchar2, <=) +DECLOP_2VAR_COMP(uchar2, >=) + +DECLOP_2VAR_COMP(uchar2, &&) +DECLOP_2VAR_COMP(uchar2, ||) + +DECLOP_2VAR_1IN_1OUT(uchar2, ~) +DECLOP_2VAR_1IN_BOOLOUT(uchar2, !) + +DECLOP_2VAR_SCALE_PRODUCT(uchar2, unsigned char) +DECLOP_2VAR_SCALE_PRODUCT(uchar2, signed char) +DECLOP_2VAR_SCALE_PRODUCT(uchar2, unsigned short) +DECLOP_2VAR_SCALE_PRODUCT(uchar2, signed short) +DECLOP_2VAR_SCALE_PRODUCT(uchar2, unsigned int) +DECLOP_2VAR_SCALE_PRODUCT(uchar2, signed int) +DECLOP_2VAR_SCALE_PRODUCT(uchar2, float) +DECLOP_2VAR_SCALE_PRODUCT(uchar2, unsigned long) +DECLOP_2VAR_SCALE_PRODUCT(uchar2, signed long) +DECLOP_2VAR_SCALE_PRODUCT(uchar2, double) +DECLOP_2VAR_SCALE_PRODUCT(uchar2, unsigned long long) +DECLOP_2VAR_SCALE_PRODUCT(uchar2, signed long long) + +// UNSIGNED CHAR3 + +DECLOP_3VAR_2IN_1OUT(uchar3, +) +DECLOP_3VAR_2IN_1OUT(uchar3, -) +DECLOP_3VAR_2IN_1OUT(uchar3, *) +DECLOP_3VAR_2IN_1OUT(uchar3, /) +DECLOP_3VAR_2IN_1OUT(uchar3, %) +DECLOP_3VAR_2IN_1OUT(uchar3, &) +DECLOP_3VAR_2IN_1OUT(uchar3, |) +DECLOP_3VAR_2IN_1OUT(uchar3, ^) +DECLOP_3VAR_2IN_1OUT(uchar3, <<) +DECLOP_3VAR_2IN_1OUT(uchar3, >>) + +DECLOP_3VAR_ASSIGN(uchar3, +=) +DECLOP_3VAR_ASSIGN(uchar3, -=) +DECLOP_3VAR_ASSIGN(uchar3, *=) +DECLOP_3VAR_ASSIGN(uchar3, /=) +DECLOP_3VAR_ASSIGN(uchar3, %=) +DECLOP_3VAR_ASSIGN(uchar3, &=) +DECLOP_3VAR_ASSIGN(uchar3, |=) +DECLOP_3VAR_ASSIGN(uchar3, ^=) +DECLOP_3VAR_ASSIGN(uchar3, <<=) +DECLOP_3VAR_ASSIGN(uchar3, >>=) + +DECLOP_3VAR_PREOP(uchar3, ++) +DECLOP_3VAR_PREOP(uchar3, --) + +DECLOP_3VAR_POSTOP(uchar3, ++) +DECLOP_3VAR_POSTOP(uchar3, --) + +DECLOP_3VAR_COMP(uchar3, ==) +DECLOP_3VAR_COMP(uchar3, !=) +DECLOP_3VAR_COMP(uchar3, <) +DECLOP_3VAR_COMP(uchar3, >) +DECLOP_3VAR_COMP(uchar3, <=) +DECLOP_3VAR_COMP(uchar3, >=) + +DECLOP_3VAR_COMP(uchar3, &&) +DECLOP_3VAR_COMP(uchar3, ||) + +DECLOP_3VAR_1IN_1OUT(uchar3, ~) +DECLOP_3VAR_1IN_BOOLOUT(uchar3, !) + +DECLOP_3VAR_SCALE_PRODUCT(uchar3, unsigned char) +DECLOP_3VAR_SCALE_PRODUCT(uchar3, signed char) +DECLOP_3VAR_SCALE_PRODUCT(uchar3, unsigned short) +DECLOP_3VAR_SCALE_PRODUCT(uchar3, signed short) +DECLOP_3VAR_SCALE_PRODUCT(uchar3, unsigned int) +DECLOP_3VAR_SCALE_PRODUCT(uchar3, signed int) +DECLOP_3VAR_SCALE_PRODUCT(uchar3, float) +DECLOP_3VAR_SCALE_PRODUCT(uchar3, unsigned long) +DECLOP_3VAR_SCALE_PRODUCT(uchar3, signed long) +DECLOP_3VAR_SCALE_PRODUCT(uchar3, double) +DECLOP_3VAR_SCALE_PRODUCT(uchar3, unsigned long long) +DECLOP_3VAR_SCALE_PRODUCT(uchar3, signed long long) + +// UNSIGNED CHAR4 + +DECLOP_4VAR_2IN_1OUT(uchar4, +) +DECLOP_4VAR_2IN_1OUT(uchar4, -) +DECLOP_4VAR_2IN_1OUT(uchar4, *) +DECLOP_4VAR_2IN_1OUT(uchar4, /) +DECLOP_4VAR_2IN_1OUT(uchar4, %) +DECLOP_4VAR_2IN_1OUT(uchar4, &) +DECLOP_4VAR_2IN_1OUT(uchar4, |) +DECLOP_4VAR_2IN_1OUT(uchar4, ^) +DECLOP_4VAR_2IN_1OUT(uchar4, <<) +DECLOP_4VAR_2IN_1OUT(uchar4, >>) + +DECLOP_4VAR_ASSIGN(uchar4, +=) +DECLOP_4VAR_ASSIGN(uchar4, -=) +DECLOP_4VAR_ASSIGN(uchar4, *=) +DECLOP_4VAR_ASSIGN(uchar4, /=) +DECLOP_4VAR_ASSIGN(uchar4, %=) +DECLOP_4VAR_ASSIGN(uchar4, &=) +DECLOP_4VAR_ASSIGN(uchar4, |=) +DECLOP_4VAR_ASSIGN(uchar4, ^=) +DECLOP_4VAR_ASSIGN(uchar4, <<=) +DECLOP_4VAR_ASSIGN(uchar4, >>=) + +DECLOP_4VAR_PREOP(uchar4, ++) +DECLOP_4VAR_PREOP(uchar4, --) + +DECLOP_4VAR_POSTOP(uchar4, ++) +DECLOP_4VAR_POSTOP(uchar4, --) + +DECLOP_4VAR_COMP(uchar4, ==) +DECLOP_4VAR_COMP(uchar4, !=) +DECLOP_4VAR_COMP(uchar4, <) +DECLOP_4VAR_COMP(uchar4, >) +DECLOP_4VAR_COMP(uchar4, <=) +DECLOP_4VAR_COMP(uchar4, >=) + +DECLOP_4VAR_COMP(uchar4, &&) +DECLOP_4VAR_COMP(uchar4, ||) + +DECLOP_4VAR_1IN_1OUT(uchar4, ~) +DECLOP_4VAR_1IN_BOOLOUT(uchar4, !) + +DECLOP_4VAR_SCALE_PRODUCT(uchar4, unsigned char) +DECLOP_4VAR_SCALE_PRODUCT(uchar4, signed char) +DECLOP_4VAR_SCALE_PRODUCT(uchar4, unsigned short) +DECLOP_4VAR_SCALE_PRODUCT(uchar4, signed short) +DECLOP_4VAR_SCALE_PRODUCT(uchar4, unsigned int) +DECLOP_4VAR_SCALE_PRODUCT(uchar4, signed int) +DECLOP_4VAR_SCALE_PRODUCT(uchar4, float) +DECLOP_4VAR_SCALE_PRODUCT(uchar4, unsigned long) +DECLOP_4VAR_SCALE_PRODUCT(uchar4, signed long) +DECLOP_4VAR_SCALE_PRODUCT(uchar4, double) +DECLOP_4VAR_SCALE_PRODUCT(uchar4, unsigned long long) +DECLOP_4VAR_SCALE_PRODUCT(uchar4, signed long long) + +// SIGNED CHAR1 + +DECLOP_1VAR_2IN_1OUT(char1, +) +DECLOP_1VAR_2IN_1OUT(char1, -) +DECLOP_1VAR_2IN_1OUT(char1, *) +DECLOP_1VAR_2IN_1OUT(char1, /) +DECLOP_1VAR_2IN_1OUT(char1, %) +DECLOP_1VAR_2IN_1OUT(char1, &) +DECLOP_1VAR_2IN_1OUT(char1, |) +DECLOP_1VAR_2IN_1OUT(char1, ^) +DECLOP_1VAR_2IN_1OUT(char1, <<) +DECLOP_1VAR_2IN_1OUT(char1, >>) + + +DECLOP_1VAR_ASSIGN(char1, +=) +DECLOP_1VAR_ASSIGN(char1, -=) +DECLOP_1VAR_ASSIGN(char1, *=) +DECLOP_1VAR_ASSIGN(char1, /=) +DECLOP_1VAR_ASSIGN(char1, %=) +DECLOP_1VAR_ASSIGN(char1, &=) +DECLOP_1VAR_ASSIGN(char1, |=) +DECLOP_1VAR_ASSIGN(char1, ^=) +DECLOP_1VAR_ASSIGN(char1, <<=) +DECLOP_1VAR_ASSIGN(char1, >>=) + +DECLOP_1VAR_PREOP(char1, ++) +DECLOP_1VAR_PREOP(char1, --) + +DECLOP_1VAR_POSTOP(char1, ++) +DECLOP_1VAR_POSTOP(char1, --) + +DECLOP_1VAR_COMP(char1, ==) +DECLOP_1VAR_COMP(char1, !=) +DECLOP_1VAR_COMP(char1, <) +DECLOP_1VAR_COMP(char1, >) +DECLOP_1VAR_COMP(char1, <=) +DECLOP_1VAR_COMP(char1, >=) + +DECLOP_1VAR_COMP(char1, &&) +DECLOP_1VAR_COMP(char1, ||) + +DECLOP_1VAR_1IN_1OUT(char1, ~) +DECLOP_1VAR_1IN_BOOLOUT(char1, !) + +DECLOP_1VAR_SCALE_PRODUCT(char1, unsigned char) +DECLOP_1VAR_SCALE_PRODUCT(char1, signed char) +DECLOP_1VAR_SCALE_PRODUCT(char1, unsigned short) +DECLOP_1VAR_SCALE_PRODUCT(char1, signed short) +DECLOP_1VAR_SCALE_PRODUCT(char1, unsigned int) +DECLOP_1VAR_SCALE_PRODUCT(char1, signed int) +DECLOP_1VAR_SCALE_PRODUCT(char1, float) +DECLOP_1VAR_SCALE_PRODUCT(char1, unsigned long) +DECLOP_1VAR_SCALE_PRODUCT(char1, signed long) +DECLOP_1VAR_SCALE_PRODUCT(char1, double) +DECLOP_1VAR_SCALE_PRODUCT(char1, unsigned long long) +DECLOP_1VAR_SCALE_PRODUCT(char1, signed long long) + +// SIGNED CHAR2 + +DECLOP_2VAR_2IN_1OUT(char2, +) +DECLOP_2VAR_2IN_1OUT(char2, -) +DECLOP_2VAR_2IN_1OUT(char2, *) +DECLOP_2VAR_2IN_1OUT(char2, /) +DECLOP_2VAR_2IN_1OUT(char2, %) +DECLOP_2VAR_2IN_1OUT(char2, &) +DECLOP_2VAR_2IN_1OUT(char2, |) +DECLOP_2VAR_2IN_1OUT(char2, ^) +DECLOP_2VAR_2IN_1OUT(char2, <<) +DECLOP_2VAR_2IN_1OUT(char2, >>) + +DECLOP_2VAR_ASSIGN(char2, +=) +DECLOP_2VAR_ASSIGN(char2, -=) +DECLOP_2VAR_ASSIGN(char2, *=) +DECLOP_2VAR_ASSIGN(char2, /=) +DECLOP_2VAR_ASSIGN(char2, %=) +DECLOP_2VAR_ASSIGN(char2, &=) +DECLOP_2VAR_ASSIGN(char2, |=) +DECLOP_2VAR_ASSIGN(char2, ^=) +DECLOP_2VAR_ASSIGN(char2, <<=) +DECLOP_2VAR_ASSIGN(char2, >>=) + +DECLOP_2VAR_PREOP(char2, ++) +DECLOP_2VAR_PREOP(char2, --) + +DECLOP_2VAR_POSTOP(char2, ++) +DECLOP_2VAR_POSTOP(char2, --) + +DECLOP_2VAR_COMP(char2, ==) +DECLOP_2VAR_COMP(char2, !=) +DECLOP_2VAR_COMP(char2, <) +DECLOP_2VAR_COMP(char2, >) +DECLOP_2VAR_COMP(char2, <=) +DECLOP_2VAR_COMP(char2, >=) + +DECLOP_2VAR_COMP(char2, &&) +DECLOP_2VAR_COMP(char2, ||) + +DECLOP_2VAR_1IN_1OUT(char2, ~) +DECLOP_2VAR_1IN_BOOLOUT(char2, !) + +DECLOP_2VAR_SCALE_PRODUCT(char2, unsigned char) +DECLOP_2VAR_SCALE_PRODUCT(char2, signed char) +DECLOP_2VAR_SCALE_PRODUCT(char2, unsigned short) +DECLOP_2VAR_SCALE_PRODUCT(char2, signed short) +DECLOP_2VAR_SCALE_PRODUCT(char2, unsigned int) +DECLOP_2VAR_SCALE_PRODUCT(char2, signed int) +DECLOP_2VAR_SCALE_PRODUCT(char2, float) +DECLOP_2VAR_SCALE_PRODUCT(char2, unsigned long) +DECLOP_2VAR_SCALE_PRODUCT(char2, signed long) +DECLOP_2VAR_SCALE_PRODUCT(char2, double) +DECLOP_2VAR_SCALE_PRODUCT(char2, unsigned long long) +DECLOP_2VAR_SCALE_PRODUCT(char2, signed long long) + +// SIGNED CHAR3 + +DECLOP_3VAR_2IN_1OUT(char3, +) +DECLOP_3VAR_2IN_1OUT(char3, -) +DECLOP_3VAR_2IN_1OUT(char3, *) +DECLOP_3VAR_2IN_1OUT(char3, /) +DECLOP_3VAR_2IN_1OUT(char3, %) +DECLOP_3VAR_2IN_1OUT(char3, &) +DECLOP_3VAR_2IN_1OUT(char3, |) +DECLOP_3VAR_2IN_1OUT(char3, ^) +DECLOP_3VAR_2IN_1OUT(char3, <<) +DECLOP_3VAR_2IN_1OUT(char3, >>) + +DECLOP_3VAR_ASSIGN(char3, +=) +DECLOP_3VAR_ASSIGN(char3, -=) +DECLOP_3VAR_ASSIGN(char3, *=) +DECLOP_3VAR_ASSIGN(char3, /=) +DECLOP_3VAR_ASSIGN(char3, %=) +DECLOP_3VAR_ASSIGN(char3, &=) +DECLOP_3VAR_ASSIGN(char3, |=) +DECLOP_3VAR_ASSIGN(char3, ^=) +DECLOP_3VAR_ASSIGN(char3, <<=) +DECLOP_3VAR_ASSIGN(char3, >>=) + +DECLOP_3VAR_PREOP(char3, ++) +DECLOP_3VAR_PREOP(char3, --) + +DECLOP_3VAR_POSTOP(char3, ++) +DECLOP_3VAR_POSTOP(char3, --) + +DECLOP_3VAR_COMP(char3, ==) +DECLOP_3VAR_COMP(char3, !=) +DECLOP_3VAR_COMP(char3, <) +DECLOP_3VAR_COMP(char3, >) +DECLOP_3VAR_COMP(char3, <=) +DECLOP_3VAR_COMP(char3, >=) + +DECLOP_3VAR_COMP(char3, &&) +DECLOP_3VAR_COMP(char3, ||) + +DECLOP_3VAR_1IN_1OUT(char3, ~) +DECLOP_3VAR_1IN_BOOLOUT(char3, !) + +DECLOP_3VAR_SCALE_PRODUCT(char3, unsigned char) +DECLOP_3VAR_SCALE_PRODUCT(char3, signed char) +DECLOP_3VAR_SCALE_PRODUCT(char3, unsigned short) +DECLOP_3VAR_SCALE_PRODUCT(char3, signed short) +DECLOP_3VAR_SCALE_PRODUCT(char3, unsigned int) +DECLOP_3VAR_SCALE_PRODUCT(char3, signed int) +DECLOP_3VAR_SCALE_PRODUCT(char3, float) +DECLOP_3VAR_SCALE_PRODUCT(char3, unsigned long) +DECLOP_3VAR_SCALE_PRODUCT(char3, signed long) +DECLOP_3VAR_SCALE_PRODUCT(char3, double) +DECLOP_3VAR_SCALE_PRODUCT(char3, unsigned long long) +DECLOP_3VAR_SCALE_PRODUCT(char3, signed long long) + +// SIGNED CHAR4 + +DECLOP_4VAR_2IN_1OUT(char4, +) +DECLOP_4VAR_2IN_1OUT(char4, -) +DECLOP_4VAR_2IN_1OUT(char4, *) +DECLOP_4VAR_2IN_1OUT(char4, /) +DECLOP_4VAR_2IN_1OUT(char4, %) +DECLOP_4VAR_2IN_1OUT(char4, &) +DECLOP_4VAR_2IN_1OUT(char4, |) +DECLOP_4VAR_2IN_1OUT(char4, ^) +DECLOP_4VAR_2IN_1OUT(char4, <<) +DECLOP_4VAR_2IN_1OUT(char4, >>) + +DECLOP_4VAR_ASSIGN(char4, +=) +DECLOP_4VAR_ASSIGN(char4, -=) +DECLOP_4VAR_ASSIGN(char4, *=) +DECLOP_4VAR_ASSIGN(char4, /=) +DECLOP_4VAR_ASSIGN(char4, %=) +DECLOP_4VAR_ASSIGN(char4, &=) +DECLOP_4VAR_ASSIGN(char4, |=) +DECLOP_4VAR_ASSIGN(char4, ^=) +DECLOP_4VAR_ASSIGN(char4, <<=) +DECLOP_4VAR_ASSIGN(char4, >>=) + +DECLOP_4VAR_PREOP(char4, ++) +DECLOP_4VAR_PREOP(char4, --) + +DECLOP_4VAR_POSTOP(char4, ++) +DECLOP_4VAR_POSTOP(char4, --) + +DECLOP_4VAR_COMP(char4, ==) +DECLOP_4VAR_COMP(char4, !=) +DECLOP_4VAR_COMP(char4, <) +DECLOP_4VAR_COMP(char4, >) +DECLOP_4VAR_COMP(char4, <=) +DECLOP_4VAR_COMP(char4, >=) + +DECLOP_4VAR_COMP(char4, &&) +DECLOP_4VAR_COMP(char4, ||) + +DECLOP_4VAR_1IN_1OUT(char4, ~) +DECLOP_4VAR_1IN_BOOLOUT(char4, !) + +DECLOP_4VAR_SCALE_PRODUCT(char4, unsigned char) +DECLOP_4VAR_SCALE_PRODUCT(char4, signed char) +DECLOP_4VAR_SCALE_PRODUCT(char4, unsigned short) +DECLOP_4VAR_SCALE_PRODUCT(char4, signed short) +DECLOP_4VAR_SCALE_PRODUCT(char4, unsigned int) +DECLOP_4VAR_SCALE_PRODUCT(char4, signed int) +DECLOP_4VAR_SCALE_PRODUCT(char4, float) +DECLOP_4VAR_SCALE_PRODUCT(char4, unsigned long) +DECLOP_4VAR_SCALE_PRODUCT(char4, signed long) +DECLOP_4VAR_SCALE_PRODUCT(char4, double) +DECLOP_4VAR_SCALE_PRODUCT(char4, unsigned long long) +DECLOP_4VAR_SCALE_PRODUCT(char4, signed long long) + +// UNSIGNED SHORT1 + +DECLOP_1VAR_2IN_1OUT(ushort1, +) +DECLOP_1VAR_2IN_1OUT(ushort1, -) +DECLOP_1VAR_2IN_1OUT(ushort1, *) +DECLOP_1VAR_2IN_1OUT(ushort1, /) +DECLOP_1VAR_2IN_1OUT(ushort1, %) +DECLOP_1VAR_2IN_1OUT(ushort1, &) +DECLOP_1VAR_2IN_1OUT(ushort1, |) +DECLOP_1VAR_2IN_1OUT(ushort1, ^) +DECLOP_1VAR_2IN_1OUT(ushort1, <<) +DECLOP_1VAR_2IN_1OUT(ushort1, >>) + + +DECLOP_1VAR_ASSIGN(ushort1, +=) +DECLOP_1VAR_ASSIGN(ushort1, -=) +DECLOP_1VAR_ASSIGN(ushort1, *=) +DECLOP_1VAR_ASSIGN(ushort1, /=) +DECLOP_1VAR_ASSIGN(ushort1, %=) +DECLOP_1VAR_ASSIGN(ushort1, &=) +DECLOP_1VAR_ASSIGN(ushort1, |=) +DECLOP_1VAR_ASSIGN(ushort1, ^=) +DECLOP_1VAR_ASSIGN(ushort1, <<=) +DECLOP_1VAR_ASSIGN(ushort1, >>=) + +DECLOP_1VAR_PREOP(ushort1, ++) +DECLOP_1VAR_PREOP(ushort1, --) + +DECLOP_1VAR_POSTOP(ushort1, ++) +DECLOP_1VAR_POSTOP(ushort1, --) + +DECLOP_1VAR_COMP(ushort1, ==) +DECLOP_1VAR_COMP(ushort1, !=) +DECLOP_1VAR_COMP(ushort1, <) +DECLOP_1VAR_COMP(ushort1, >) +DECLOP_1VAR_COMP(ushort1, <=) +DECLOP_1VAR_COMP(ushort1, >=) + +DECLOP_1VAR_COMP(ushort1, &&) +DECLOP_1VAR_COMP(ushort1, ||) + +DECLOP_1VAR_1IN_1OUT(ushort1, ~) +DECLOP_1VAR_1IN_BOOLOUT(ushort1, !) + +DECLOP_1VAR_SCALE_PRODUCT(ushort1, unsigned char) +DECLOP_1VAR_SCALE_PRODUCT(ushort1, signed char) +DECLOP_1VAR_SCALE_PRODUCT(ushort1, unsigned short) +DECLOP_1VAR_SCALE_PRODUCT(ushort1, signed short) +DECLOP_1VAR_SCALE_PRODUCT(ushort1, unsigned int) +DECLOP_1VAR_SCALE_PRODUCT(ushort1, signed int) +DECLOP_1VAR_SCALE_PRODUCT(ushort1, float) +DECLOP_1VAR_SCALE_PRODUCT(ushort1, unsigned long) +DECLOP_1VAR_SCALE_PRODUCT(ushort1, signed long) +DECLOP_1VAR_SCALE_PRODUCT(ushort1, double) +DECLOP_1VAR_SCALE_PRODUCT(ushort1, unsigned long long) +DECLOP_1VAR_SCALE_PRODUCT(ushort1, signed long long) + +// UNSIGNED SHORT2 + +DECLOP_2VAR_2IN_1OUT(ushort2, +) +DECLOP_2VAR_2IN_1OUT(ushort2, -) +DECLOP_2VAR_2IN_1OUT(ushort2, *) +DECLOP_2VAR_2IN_1OUT(ushort2, /) +DECLOP_2VAR_2IN_1OUT(ushort2, %) +DECLOP_2VAR_2IN_1OUT(ushort2, &) +DECLOP_2VAR_2IN_1OUT(ushort2, |) +DECLOP_2VAR_2IN_1OUT(ushort2, ^) +DECLOP_2VAR_2IN_1OUT(ushort2, <<) +DECLOP_2VAR_2IN_1OUT(ushort2, >>) + +DECLOP_2VAR_ASSIGN(ushort2, +=) +DECLOP_2VAR_ASSIGN(ushort2, -=) +DECLOP_2VAR_ASSIGN(ushort2, *=) +DECLOP_2VAR_ASSIGN(ushort2, /=) +DECLOP_2VAR_ASSIGN(ushort2, %=) +DECLOP_2VAR_ASSIGN(ushort2, &=) +DECLOP_2VAR_ASSIGN(ushort2, |=) +DECLOP_2VAR_ASSIGN(ushort2, ^=) +DECLOP_2VAR_ASSIGN(ushort2, <<=) +DECLOP_2VAR_ASSIGN(ushort2, >>=) + +DECLOP_2VAR_PREOP(ushort2, ++) +DECLOP_2VAR_PREOP(ushort2, --) + +DECLOP_2VAR_POSTOP(ushort2, ++) +DECLOP_2VAR_POSTOP(ushort2, --) + +DECLOP_2VAR_COMP(ushort2, ==) +DECLOP_2VAR_COMP(ushort2, !=) +DECLOP_2VAR_COMP(ushort2, <) +DECLOP_2VAR_COMP(ushort2, >) +DECLOP_2VAR_COMP(ushort2, <=) +DECLOP_2VAR_COMP(ushort2, >=) + +DECLOP_2VAR_COMP(ushort2, &&) +DECLOP_2VAR_COMP(ushort2, ||) + +DECLOP_2VAR_1IN_1OUT(ushort2, ~) +DECLOP_2VAR_1IN_BOOLOUT(ushort2, !) + +DECLOP_2VAR_SCALE_PRODUCT(ushort2, unsigned char) +DECLOP_2VAR_SCALE_PRODUCT(ushort2, signed char) +DECLOP_2VAR_SCALE_PRODUCT(ushort2, unsigned short) +DECLOP_2VAR_SCALE_PRODUCT(ushort2, signed short) +DECLOP_2VAR_SCALE_PRODUCT(ushort2, unsigned int) +DECLOP_2VAR_SCALE_PRODUCT(ushort2, signed int) +DECLOP_2VAR_SCALE_PRODUCT(ushort2, float) +DECLOP_2VAR_SCALE_PRODUCT(ushort2, unsigned long) +DECLOP_2VAR_SCALE_PRODUCT(ushort2, signed long) +DECLOP_2VAR_SCALE_PRODUCT(ushort2, double) +DECLOP_2VAR_SCALE_PRODUCT(ushort2, unsigned long long) +DECLOP_2VAR_SCALE_PRODUCT(ushort2, signed long long) + +// UNSIGNED SHORT3 + +DECLOP_3VAR_2IN_1OUT(ushort3, +) +DECLOP_3VAR_2IN_1OUT(ushort3, -) +DECLOP_3VAR_2IN_1OUT(ushort3, *) +DECLOP_3VAR_2IN_1OUT(ushort3, /) +DECLOP_3VAR_2IN_1OUT(ushort3, %) +DECLOP_3VAR_2IN_1OUT(ushort3, &) +DECLOP_3VAR_2IN_1OUT(ushort3, |) +DECLOP_3VAR_2IN_1OUT(ushort3, ^) +DECLOP_3VAR_2IN_1OUT(ushort3, <<) +DECLOP_3VAR_2IN_1OUT(ushort3, >>) + +DECLOP_3VAR_ASSIGN(ushort3, +=) +DECLOP_3VAR_ASSIGN(ushort3, -=) +DECLOP_3VAR_ASSIGN(ushort3, *=) +DECLOP_3VAR_ASSIGN(ushort3, /=) +DECLOP_3VAR_ASSIGN(ushort3, %=) +DECLOP_3VAR_ASSIGN(ushort3, &=) +DECLOP_3VAR_ASSIGN(ushort3, |=) +DECLOP_3VAR_ASSIGN(ushort3, ^=) +DECLOP_3VAR_ASSIGN(ushort3, <<=) +DECLOP_3VAR_ASSIGN(ushort3, >>=) + +DECLOP_3VAR_PREOP(ushort3, ++) +DECLOP_3VAR_PREOP(ushort3, --) + +DECLOP_3VAR_POSTOP(ushort3, ++) +DECLOP_3VAR_POSTOP(ushort3, --) + +DECLOP_3VAR_COMP(ushort3, ==) +DECLOP_3VAR_COMP(ushort3, !=) +DECLOP_3VAR_COMP(ushort3, <) +DECLOP_3VAR_COMP(ushort3, >) +DECLOP_3VAR_COMP(ushort3, <=) +DECLOP_3VAR_COMP(ushort3, >=) + +DECLOP_3VAR_COMP(ushort3, &&) +DECLOP_3VAR_COMP(ushort3, ||) + +DECLOP_3VAR_1IN_1OUT(ushort3, ~) +DECLOP_3VAR_1IN_BOOLOUT(ushort3, !) + +DECLOP_3VAR_SCALE_PRODUCT(ushort3, unsigned char) +DECLOP_3VAR_SCALE_PRODUCT(ushort3, signed char) +DECLOP_3VAR_SCALE_PRODUCT(ushort3, unsigned short) +DECLOP_3VAR_SCALE_PRODUCT(ushort3, signed short) +DECLOP_3VAR_SCALE_PRODUCT(ushort3, unsigned int) +DECLOP_3VAR_SCALE_PRODUCT(ushort3, signed int) +DECLOP_3VAR_SCALE_PRODUCT(ushort3, float) +DECLOP_3VAR_SCALE_PRODUCT(ushort3, unsigned long) +DECLOP_3VAR_SCALE_PRODUCT(ushort3, signed long) +DECLOP_3VAR_SCALE_PRODUCT(ushort3, double) +DECLOP_3VAR_SCALE_PRODUCT(ushort3, unsigned long long) +DECLOP_3VAR_SCALE_PRODUCT(ushort3, signed long long) + +// UNSIGNED SHORT4 + +DECLOP_4VAR_2IN_1OUT(ushort4, +) +DECLOP_4VAR_2IN_1OUT(ushort4, -) +DECLOP_4VAR_2IN_1OUT(ushort4, *) +DECLOP_4VAR_2IN_1OUT(ushort4, /) +DECLOP_4VAR_2IN_1OUT(ushort4, %) +DECLOP_4VAR_2IN_1OUT(ushort4, &) +DECLOP_4VAR_2IN_1OUT(ushort4, |) +DECLOP_4VAR_2IN_1OUT(ushort4, ^) +DECLOP_4VAR_2IN_1OUT(ushort4, <<) +DECLOP_4VAR_2IN_1OUT(ushort4, >>) + +DECLOP_4VAR_ASSIGN(ushort4, +=) +DECLOP_4VAR_ASSIGN(ushort4, -=) +DECLOP_4VAR_ASSIGN(ushort4, *=) +DECLOP_4VAR_ASSIGN(ushort4, /=) +DECLOP_4VAR_ASSIGN(ushort4, %=) +DECLOP_4VAR_ASSIGN(ushort4, &=) +DECLOP_4VAR_ASSIGN(ushort4, |=) +DECLOP_4VAR_ASSIGN(ushort4, ^=) +DECLOP_4VAR_ASSIGN(ushort4, <<=) +DECLOP_4VAR_ASSIGN(ushort4, >>=) + +DECLOP_4VAR_PREOP(ushort4, ++) +DECLOP_4VAR_PREOP(ushort4, --) + +DECLOP_4VAR_POSTOP(ushort4, ++) +DECLOP_4VAR_POSTOP(ushort4, --) + +DECLOP_4VAR_COMP(ushort4, ==) +DECLOP_4VAR_COMP(ushort4, !=) +DECLOP_4VAR_COMP(ushort4, <) +DECLOP_4VAR_COMP(ushort4, >) +DECLOP_4VAR_COMP(ushort4, <=) +DECLOP_4VAR_COMP(ushort4, >=) + +DECLOP_4VAR_COMP(ushort4, &&) +DECLOP_4VAR_COMP(ushort4, ||) + +DECLOP_4VAR_1IN_1OUT(ushort4, ~) +DECLOP_4VAR_1IN_BOOLOUT(ushort4, !) + +DECLOP_4VAR_SCALE_PRODUCT(ushort4, unsigned char) +DECLOP_4VAR_SCALE_PRODUCT(ushort4, signed char) +DECLOP_4VAR_SCALE_PRODUCT(ushort4, unsigned short) +DECLOP_4VAR_SCALE_PRODUCT(ushort4, signed short) +DECLOP_4VAR_SCALE_PRODUCT(ushort4, unsigned int) +DECLOP_4VAR_SCALE_PRODUCT(ushort4, signed int) +DECLOP_4VAR_SCALE_PRODUCT(ushort4, float) +DECLOP_4VAR_SCALE_PRODUCT(ushort4, unsigned long) +DECLOP_4VAR_SCALE_PRODUCT(ushort4, signed long) +DECLOP_4VAR_SCALE_PRODUCT(ushort4, double) +DECLOP_4VAR_SCALE_PRODUCT(ushort4, unsigned long long) +DECLOP_4VAR_SCALE_PRODUCT(ushort4, signed long long) + +// SIGNED SHORT1 + +DECLOP_1VAR_2IN_1OUT(short1, +) +DECLOP_1VAR_2IN_1OUT(short1, -) +DECLOP_1VAR_2IN_1OUT(short1, *) +DECLOP_1VAR_2IN_1OUT(short1, /) +DECLOP_1VAR_2IN_1OUT(short1, %) +DECLOP_1VAR_2IN_1OUT(short1, &) +DECLOP_1VAR_2IN_1OUT(short1, |) +DECLOP_1VAR_2IN_1OUT(short1, ^) +DECLOP_1VAR_2IN_1OUT(short1, <<) +DECLOP_1VAR_2IN_1OUT(short1, >>) + + +DECLOP_1VAR_ASSIGN(short1, +=) +DECLOP_1VAR_ASSIGN(short1, -=) +DECLOP_1VAR_ASSIGN(short1, *=) +DECLOP_1VAR_ASSIGN(short1, /=) +DECLOP_1VAR_ASSIGN(short1, %=) +DECLOP_1VAR_ASSIGN(short1, &=) +DECLOP_1VAR_ASSIGN(short1, |=) +DECLOP_1VAR_ASSIGN(short1, ^=) +DECLOP_1VAR_ASSIGN(short1, <<=) +DECLOP_1VAR_ASSIGN(short1, >>=) + +DECLOP_1VAR_PREOP(short1, ++) +DECLOP_1VAR_PREOP(short1, --) + +DECLOP_1VAR_POSTOP(short1, ++) +DECLOP_1VAR_POSTOP(short1, --) + +DECLOP_1VAR_COMP(short1, ==) +DECLOP_1VAR_COMP(short1, !=) +DECLOP_1VAR_COMP(short1, <) +DECLOP_1VAR_COMP(short1, >) +DECLOP_1VAR_COMP(short1, <=) +DECLOP_1VAR_COMP(short1, >=) + +DECLOP_1VAR_COMP(short1, &&) +DECLOP_1VAR_COMP(short1, ||) + +DECLOP_1VAR_1IN_1OUT(short1, ~) +DECLOP_1VAR_1IN_BOOLOUT(short1, !) + +DECLOP_1VAR_SCALE_PRODUCT(short1, unsigned char) +DECLOP_1VAR_SCALE_PRODUCT(short1, signed char) +DECLOP_1VAR_SCALE_PRODUCT(short1, unsigned short) +DECLOP_1VAR_SCALE_PRODUCT(short1, signed short) +DECLOP_1VAR_SCALE_PRODUCT(short1, unsigned int) +DECLOP_1VAR_SCALE_PRODUCT(short1, signed int) +DECLOP_1VAR_SCALE_PRODUCT(short1, float) +DECLOP_1VAR_SCALE_PRODUCT(short1, unsigned long) +DECLOP_1VAR_SCALE_PRODUCT(short1, signed long) +DECLOP_1VAR_SCALE_PRODUCT(short1, double) +DECLOP_1VAR_SCALE_PRODUCT(short1, unsigned long long) +DECLOP_1VAR_SCALE_PRODUCT(short1, signed long long) + +// SIGNED SHORT2 + +DECLOP_2VAR_2IN_1OUT(short2, +) +DECLOP_2VAR_2IN_1OUT(short2, -) +DECLOP_2VAR_2IN_1OUT(short2, *) +DECLOP_2VAR_2IN_1OUT(short2, /) +DECLOP_2VAR_2IN_1OUT(short2, %) +DECLOP_2VAR_2IN_1OUT(short2, &) +DECLOP_2VAR_2IN_1OUT(short2, |) +DECLOP_2VAR_2IN_1OUT(short2, ^) +DECLOP_2VAR_2IN_1OUT(short2, <<) +DECLOP_2VAR_2IN_1OUT(short2, >>) + +DECLOP_2VAR_ASSIGN(short2, +=) +DECLOP_2VAR_ASSIGN(short2, -=) +DECLOP_2VAR_ASSIGN(short2, *=) +DECLOP_2VAR_ASSIGN(short2, /=) +DECLOP_2VAR_ASSIGN(short2, %=) +DECLOP_2VAR_ASSIGN(short2, &=) +DECLOP_2VAR_ASSIGN(short2, |=) +DECLOP_2VAR_ASSIGN(short2, ^=) +DECLOP_2VAR_ASSIGN(short2, <<=) +DECLOP_2VAR_ASSIGN(short2, >>=) + +DECLOP_2VAR_PREOP(short2, ++) +DECLOP_2VAR_PREOP(short2, --) + +DECLOP_2VAR_POSTOP(short2, ++) +DECLOP_2VAR_POSTOP(short2, --) + +DECLOP_2VAR_COMP(short2, ==) +DECLOP_2VAR_COMP(short2, !=) +DECLOP_2VAR_COMP(short2, <) +DECLOP_2VAR_COMP(short2, >) +DECLOP_2VAR_COMP(short2, <=) +DECLOP_2VAR_COMP(short2, >=) + +DECLOP_2VAR_COMP(short2, &&) +DECLOP_2VAR_COMP(short2, ||) + +DECLOP_2VAR_1IN_1OUT(short2, ~) +DECLOP_2VAR_1IN_BOOLOUT(short2, !) + +DECLOP_2VAR_SCALE_PRODUCT(short2, unsigned char) +DECLOP_2VAR_SCALE_PRODUCT(short2, signed char) +DECLOP_2VAR_SCALE_PRODUCT(short2, unsigned short) +DECLOP_2VAR_SCALE_PRODUCT(short2, signed short) +DECLOP_2VAR_SCALE_PRODUCT(short2, unsigned int) +DECLOP_2VAR_SCALE_PRODUCT(short2, signed int) +DECLOP_2VAR_SCALE_PRODUCT(short2, float) +DECLOP_2VAR_SCALE_PRODUCT(short2, unsigned long) +DECLOP_2VAR_SCALE_PRODUCT(short2, signed long) +DECLOP_2VAR_SCALE_PRODUCT(short2, double) +DECLOP_2VAR_SCALE_PRODUCT(short2, unsigned long long) +DECLOP_2VAR_SCALE_PRODUCT(short2, signed long long) + +// SIGNED SHORT3 + +DECLOP_3VAR_2IN_1OUT(short3, +) +DECLOP_3VAR_2IN_1OUT(short3, -) +DECLOP_3VAR_2IN_1OUT(short3, *) +DECLOP_3VAR_2IN_1OUT(short3, /) +DECLOP_3VAR_2IN_1OUT(short3, %) +DECLOP_3VAR_2IN_1OUT(short3, &) +DECLOP_3VAR_2IN_1OUT(short3, |) +DECLOP_3VAR_2IN_1OUT(short3, ^) +DECLOP_3VAR_2IN_1OUT(short3, <<) +DECLOP_3VAR_2IN_1OUT(short3, >>) + +DECLOP_3VAR_ASSIGN(short3, +=) +DECLOP_3VAR_ASSIGN(short3, -=) +DECLOP_3VAR_ASSIGN(short3, *=) +DECLOP_3VAR_ASSIGN(short3, /=) +DECLOP_3VAR_ASSIGN(short3, %=) +DECLOP_3VAR_ASSIGN(short3, &=) +DECLOP_3VAR_ASSIGN(short3, |=) +DECLOP_3VAR_ASSIGN(short3, ^=) +DECLOP_3VAR_ASSIGN(short3, <<=) +DECLOP_3VAR_ASSIGN(short3, >>=) + +DECLOP_3VAR_PREOP(short3, ++) +DECLOP_3VAR_PREOP(short3, --) + +DECLOP_3VAR_POSTOP(short3, ++) +DECLOP_3VAR_POSTOP(short3, --) + +DECLOP_3VAR_COMP(short3, ==) +DECLOP_3VAR_COMP(short3, !=) +DECLOP_3VAR_COMP(short3, <) +DECLOP_3VAR_COMP(short3, >) +DECLOP_3VAR_COMP(short3, <=) +DECLOP_3VAR_COMP(short3, >=) + +DECLOP_3VAR_COMP(short3, &&) +DECLOP_3VAR_COMP(short3, ||) + +DECLOP_3VAR_1IN_1OUT(short3, ~) +DECLOP_3VAR_1IN_BOOLOUT(short3, !) + +DECLOP_3VAR_SCALE_PRODUCT(short3, unsigned char) +DECLOP_3VAR_SCALE_PRODUCT(short3, signed char) +DECLOP_3VAR_SCALE_PRODUCT(short3, unsigned short) +DECLOP_3VAR_SCALE_PRODUCT(short3, signed short) +DECLOP_3VAR_SCALE_PRODUCT(short3, unsigned int) +DECLOP_3VAR_SCALE_PRODUCT(short3, signed int) +DECLOP_3VAR_SCALE_PRODUCT(short3, float) +DECLOP_3VAR_SCALE_PRODUCT(short3, unsigned long) +DECLOP_3VAR_SCALE_PRODUCT(short3, signed long) +DECLOP_3VAR_SCALE_PRODUCT(short3, double) +DECLOP_3VAR_SCALE_PRODUCT(short3, unsigned long long) +DECLOP_3VAR_SCALE_PRODUCT(short3, signed long long) + +// SIGNED SHORT4 + +DECLOP_4VAR_2IN_1OUT(short4, +) +DECLOP_4VAR_2IN_1OUT(short4, -) +DECLOP_4VAR_2IN_1OUT(short4, *) +DECLOP_4VAR_2IN_1OUT(short4, /) +DECLOP_4VAR_2IN_1OUT(short4, %) +DECLOP_4VAR_2IN_1OUT(short4, &) +DECLOP_4VAR_2IN_1OUT(short4, |) +DECLOP_4VAR_2IN_1OUT(short4, ^) +DECLOP_4VAR_2IN_1OUT(short4, <<) +DECLOP_4VAR_2IN_1OUT(short4, >>) + +DECLOP_4VAR_ASSIGN(short4, +=) +DECLOP_4VAR_ASSIGN(short4, -=) +DECLOP_4VAR_ASSIGN(short4, *=) +DECLOP_4VAR_ASSIGN(short4, /=) +DECLOP_4VAR_ASSIGN(short4, %=) +DECLOP_4VAR_ASSIGN(short4, &=) +DECLOP_4VAR_ASSIGN(short4, |=) +DECLOP_4VAR_ASSIGN(short4, ^=) +DECLOP_4VAR_ASSIGN(short4, <<=) +DECLOP_4VAR_ASSIGN(short4, >>=) + +DECLOP_4VAR_PREOP(short4, ++) +DECLOP_4VAR_PREOP(short4, --) + +DECLOP_4VAR_POSTOP(short4, ++) +DECLOP_4VAR_POSTOP(short4, --) + +DECLOP_4VAR_COMP(short4, ==) +DECLOP_4VAR_COMP(short4, !=) +DECLOP_4VAR_COMP(short4, <) +DECLOP_4VAR_COMP(short4, >) +DECLOP_4VAR_COMP(short4, <=) +DECLOP_4VAR_COMP(short4, >=) + +DECLOP_4VAR_COMP(short4, &&) +DECLOP_4VAR_COMP(short4, ||) + +DECLOP_4VAR_1IN_1OUT(short4, ~) +DECLOP_4VAR_1IN_BOOLOUT(short4, !) + +DECLOP_4VAR_SCALE_PRODUCT(short4, unsigned char) +DECLOP_4VAR_SCALE_PRODUCT(short4, signed char) +DECLOP_4VAR_SCALE_PRODUCT(short4, unsigned short) +DECLOP_4VAR_SCALE_PRODUCT(short4, signed short) +DECLOP_4VAR_SCALE_PRODUCT(short4, unsigned int) +DECLOP_4VAR_SCALE_PRODUCT(short4, signed int) +DECLOP_4VAR_SCALE_PRODUCT(short4, float) +DECLOP_4VAR_SCALE_PRODUCT(short4, unsigned long) +DECLOP_4VAR_SCALE_PRODUCT(short4, signed long) +DECLOP_4VAR_SCALE_PRODUCT(short4, double) +DECLOP_4VAR_SCALE_PRODUCT(short4, unsigned long long) +DECLOP_4VAR_SCALE_PRODUCT(short4, signed long long) + +// UNSIGNED INT1 + +DECLOP_1VAR_2IN_1OUT(uint1, +) +DECLOP_1VAR_2IN_1OUT(uint1, -) +DECLOP_1VAR_2IN_1OUT(uint1, *) +DECLOP_1VAR_2IN_1OUT(uint1, /) +DECLOP_1VAR_2IN_1OUT(uint1, %) +DECLOP_1VAR_2IN_1OUT(uint1, &) +DECLOP_1VAR_2IN_1OUT(uint1, |) +DECLOP_1VAR_2IN_1OUT(uint1, ^) +DECLOP_1VAR_2IN_1OUT(uint1, <<) +DECLOP_1VAR_2IN_1OUT(uint1, >>) + + +DECLOP_1VAR_ASSIGN(uint1, +=) +DECLOP_1VAR_ASSIGN(uint1, -=) +DECLOP_1VAR_ASSIGN(uint1, *=) +DECLOP_1VAR_ASSIGN(uint1, /=) +DECLOP_1VAR_ASSIGN(uint1, %=) +DECLOP_1VAR_ASSIGN(uint1, &=) +DECLOP_1VAR_ASSIGN(uint1, |=) +DECLOP_1VAR_ASSIGN(uint1, ^=) +DECLOP_1VAR_ASSIGN(uint1, <<=) +DECLOP_1VAR_ASSIGN(uint1, >>=) + +DECLOP_1VAR_PREOP(uint1, ++) +DECLOP_1VAR_PREOP(uint1, --) + +DECLOP_1VAR_POSTOP(uint1, ++) +DECLOP_1VAR_POSTOP(uint1, --) + +DECLOP_1VAR_COMP(uint1, ==) +DECLOP_1VAR_COMP(uint1, !=) +DECLOP_1VAR_COMP(uint1, <) +DECLOP_1VAR_COMP(uint1, >) +DECLOP_1VAR_COMP(uint1, <=) +DECLOP_1VAR_COMP(uint1, >=) + +DECLOP_1VAR_COMP(uint1, &&) +DECLOP_1VAR_COMP(uint1, ||) + +DECLOP_1VAR_1IN_1OUT(uint1, ~) +DECLOP_1VAR_1IN_BOOLOUT(uint1, !) + +DECLOP_1VAR_SCALE_PRODUCT(uint1, unsigned char) +DECLOP_1VAR_SCALE_PRODUCT(uint1, signed char) +DECLOP_1VAR_SCALE_PRODUCT(uint1, unsigned short) +DECLOP_1VAR_SCALE_PRODUCT(uint1, signed short) +DECLOP_1VAR_SCALE_PRODUCT(uint1, unsigned int) +DECLOP_1VAR_SCALE_PRODUCT(uint1, signed int) +DECLOP_1VAR_SCALE_PRODUCT(uint1, float) +DECLOP_1VAR_SCALE_PRODUCT(uint1, unsigned long) +DECLOP_1VAR_SCALE_PRODUCT(uint1, signed long) +DECLOP_1VAR_SCALE_PRODUCT(uint1, double) +DECLOP_1VAR_SCALE_PRODUCT(uint1, unsigned long long) +DECLOP_1VAR_SCALE_PRODUCT(uint1, signed long long) + +// UNSIGNED INT2 + +DECLOP_2VAR_2IN_1OUT(uint2, +) +DECLOP_2VAR_2IN_1OUT(uint2, -) +DECLOP_2VAR_2IN_1OUT(uint2, *) +DECLOP_2VAR_2IN_1OUT(uint2, /) +DECLOP_2VAR_2IN_1OUT(uint2, %) +DECLOP_2VAR_2IN_1OUT(uint2, &) +DECLOP_2VAR_2IN_1OUT(uint2, |) +DECLOP_2VAR_2IN_1OUT(uint2, ^) +DECLOP_2VAR_2IN_1OUT(uint2, <<) +DECLOP_2VAR_2IN_1OUT(uint2, >>) + +DECLOP_2VAR_ASSIGN(uint2, +=) +DECLOP_2VAR_ASSIGN(uint2, -=) +DECLOP_2VAR_ASSIGN(uint2, *=) +DECLOP_2VAR_ASSIGN(uint2, /=) +DECLOP_2VAR_ASSIGN(uint2, %=) +DECLOP_2VAR_ASSIGN(uint2, &=) +DECLOP_2VAR_ASSIGN(uint2, |=) +DECLOP_2VAR_ASSIGN(uint2, ^=) +DECLOP_2VAR_ASSIGN(uint2, <<=) +DECLOP_2VAR_ASSIGN(uint2, >>=) + +DECLOP_2VAR_PREOP(uint2, ++) +DECLOP_2VAR_PREOP(uint2, --) + +DECLOP_2VAR_POSTOP(uint2, ++) +DECLOP_2VAR_POSTOP(uint2, --) + +DECLOP_2VAR_COMP(uint2, ==) +DECLOP_2VAR_COMP(uint2, !=) +DECLOP_2VAR_COMP(uint2, <) +DECLOP_2VAR_COMP(uint2, >) +DECLOP_2VAR_COMP(uint2, <=) +DECLOP_2VAR_COMP(uint2, >=) + +DECLOP_2VAR_COMP(uint2, &&) +DECLOP_2VAR_COMP(uint2, ||) + +DECLOP_2VAR_1IN_1OUT(uint2, ~) +DECLOP_2VAR_1IN_BOOLOUT(uint2, !) + +DECLOP_2VAR_SCALE_PRODUCT(uint2, unsigned char) +DECLOP_2VAR_SCALE_PRODUCT(uint2, signed char) +DECLOP_2VAR_SCALE_PRODUCT(uint2, unsigned short) +DECLOP_2VAR_SCALE_PRODUCT(uint2, signed short) +DECLOP_2VAR_SCALE_PRODUCT(uint2, unsigned int) +DECLOP_2VAR_SCALE_PRODUCT(uint2, signed int) +DECLOP_2VAR_SCALE_PRODUCT(uint2, float) +DECLOP_2VAR_SCALE_PRODUCT(uint2, unsigned long) +DECLOP_2VAR_SCALE_PRODUCT(uint2, signed long) +DECLOP_2VAR_SCALE_PRODUCT(uint2, double) +DECLOP_2VAR_SCALE_PRODUCT(uint2, unsigned long long) +DECLOP_2VAR_SCALE_PRODUCT(uint2, signed long long) + +// UNSIGNED INT3 + +DECLOP_3VAR_2IN_1OUT(uint3, +) +DECLOP_3VAR_2IN_1OUT(uint3, -) +DECLOP_3VAR_2IN_1OUT(uint3, *) +DECLOP_3VAR_2IN_1OUT(uint3, /) +DECLOP_3VAR_2IN_1OUT(uint3, %) +DECLOP_3VAR_2IN_1OUT(uint3, &) +DECLOP_3VAR_2IN_1OUT(uint3, |) +DECLOP_3VAR_2IN_1OUT(uint3, ^) +DECLOP_3VAR_2IN_1OUT(uint3, <<) +DECLOP_3VAR_2IN_1OUT(uint3, >>) + +DECLOP_3VAR_ASSIGN(uint3, +=) +DECLOP_3VAR_ASSIGN(uint3, -=) +DECLOP_3VAR_ASSIGN(uint3, *=) +DECLOP_3VAR_ASSIGN(uint3, /=) +DECLOP_3VAR_ASSIGN(uint3, %=) +DECLOP_3VAR_ASSIGN(uint3, &=) +DECLOP_3VAR_ASSIGN(uint3, |=) +DECLOP_3VAR_ASSIGN(uint3, ^=) +DECLOP_3VAR_ASSIGN(uint3, <<=) +DECLOP_3VAR_ASSIGN(uint3, >>=) + +DECLOP_3VAR_PREOP(uint3, ++) +DECLOP_3VAR_PREOP(uint3, --) + +DECLOP_3VAR_POSTOP(uint3, ++) +DECLOP_3VAR_POSTOP(uint3, --) + +DECLOP_3VAR_COMP(uint3, ==) +DECLOP_3VAR_COMP(uint3, !=) +DECLOP_3VAR_COMP(uint3, <) +DECLOP_3VAR_COMP(uint3, >) +DECLOP_3VAR_COMP(uint3, <=) +DECLOP_3VAR_COMP(uint3, >=) + +DECLOP_3VAR_COMP(uint3, &&) +DECLOP_3VAR_COMP(uint3, ||) + +DECLOP_3VAR_1IN_1OUT(uint3, ~) +DECLOP_3VAR_1IN_BOOLOUT(uint3, !) + +DECLOP_3VAR_SCALE_PRODUCT(uint3, unsigned char) +DECLOP_3VAR_SCALE_PRODUCT(uint3, signed char) +DECLOP_3VAR_SCALE_PRODUCT(uint3, unsigned short) +DECLOP_3VAR_SCALE_PRODUCT(uint3, signed short) +DECLOP_3VAR_SCALE_PRODUCT(uint3, unsigned int) +DECLOP_3VAR_SCALE_PRODUCT(uint3, signed int) +DECLOP_3VAR_SCALE_PRODUCT(uint3, float) +DECLOP_3VAR_SCALE_PRODUCT(uint3, unsigned long) +DECLOP_3VAR_SCALE_PRODUCT(uint3, signed long) +DECLOP_3VAR_SCALE_PRODUCT(uint3, double) +DECLOP_3VAR_SCALE_PRODUCT(uint3, unsigned long long) +DECLOP_3VAR_SCALE_PRODUCT(uint3, signed long long) + +// UNSIGNED INT4 + +DECLOP_4VAR_2IN_1OUT(uint4, +) +DECLOP_4VAR_2IN_1OUT(uint4, -) +DECLOP_4VAR_2IN_1OUT(uint4, *) +DECLOP_4VAR_2IN_1OUT(uint4, /) +DECLOP_4VAR_2IN_1OUT(uint4, %) +DECLOP_4VAR_2IN_1OUT(uint4, &) +DECLOP_4VAR_2IN_1OUT(uint4, |) +DECLOP_4VAR_2IN_1OUT(uint4, ^) +DECLOP_4VAR_2IN_1OUT(uint4, <<) +DECLOP_4VAR_2IN_1OUT(uint4, >>) + +DECLOP_4VAR_ASSIGN(uint4, +=) +DECLOP_4VAR_ASSIGN(uint4, -=) +DECLOP_4VAR_ASSIGN(uint4, *=) +DECLOP_4VAR_ASSIGN(uint4, /=) +DECLOP_4VAR_ASSIGN(uint4, %=) +DECLOP_4VAR_ASSIGN(uint4, &=) +DECLOP_4VAR_ASSIGN(uint4, |=) +DECLOP_4VAR_ASSIGN(uint4, ^=) +DECLOP_4VAR_ASSIGN(uint4, <<=) +DECLOP_4VAR_ASSIGN(uint4, >>=) + +DECLOP_4VAR_PREOP(uint4, ++) +DECLOP_4VAR_PREOP(uint4, --) + +DECLOP_4VAR_POSTOP(uint4, ++) +DECLOP_4VAR_POSTOP(uint4, --) + +DECLOP_4VAR_COMP(uint4, ==) +DECLOP_4VAR_COMP(uint4, !=) +DECLOP_4VAR_COMP(uint4, <) +DECLOP_4VAR_COMP(uint4, >) +DECLOP_4VAR_COMP(uint4, <=) +DECLOP_4VAR_COMP(uint4, >=) + +DECLOP_4VAR_COMP(uint4, &&) +DECLOP_4VAR_COMP(uint4, ||) + +DECLOP_4VAR_1IN_1OUT(uint4, ~) +DECLOP_4VAR_1IN_BOOLOUT(uint4, !) + +DECLOP_4VAR_SCALE_PRODUCT(uint4, unsigned char) +DECLOP_4VAR_SCALE_PRODUCT(uint4, signed char) +DECLOP_4VAR_SCALE_PRODUCT(uint4, unsigned short) +DECLOP_4VAR_SCALE_PRODUCT(uint4, signed short) +DECLOP_4VAR_SCALE_PRODUCT(uint4, unsigned int) +DECLOP_4VAR_SCALE_PRODUCT(uint4, signed int) +DECLOP_4VAR_SCALE_PRODUCT(uint4, float) +DECLOP_4VAR_SCALE_PRODUCT(uint4, unsigned long) +DECLOP_4VAR_SCALE_PRODUCT(uint4, signed long) +DECLOP_4VAR_SCALE_PRODUCT(uint4, double) +DECLOP_4VAR_SCALE_PRODUCT(uint4, unsigned long long) +DECLOP_4VAR_SCALE_PRODUCT(uint4, signed long long) + +// SIGNED INT1 + +DECLOP_1VAR_2IN_1OUT(int1, +) +DECLOP_1VAR_2IN_1OUT(int1, -) +DECLOP_1VAR_2IN_1OUT(int1, *) +DECLOP_1VAR_2IN_1OUT(int1, /) +DECLOP_1VAR_2IN_1OUT(int1, %) +DECLOP_1VAR_2IN_1OUT(int1, &) +DECLOP_1VAR_2IN_1OUT(int1, |) +DECLOP_1VAR_2IN_1OUT(int1, ^) +DECLOP_1VAR_2IN_1OUT(int1, <<) +DECLOP_1VAR_2IN_1OUT(int1, >>) + + +DECLOP_1VAR_ASSIGN(int1, +=) +DECLOP_1VAR_ASSIGN(int1, -=) +DECLOP_1VAR_ASSIGN(int1, *=) +DECLOP_1VAR_ASSIGN(int1, /=) +DECLOP_1VAR_ASSIGN(int1, %=) +DECLOP_1VAR_ASSIGN(int1, &=) +DECLOP_1VAR_ASSIGN(int1, |=) +DECLOP_1VAR_ASSIGN(int1, ^=) +DECLOP_1VAR_ASSIGN(int1, <<=) +DECLOP_1VAR_ASSIGN(int1, >>=) + +DECLOP_1VAR_PREOP(int1, ++) +DECLOP_1VAR_PREOP(int1, --) + +DECLOP_1VAR_POSTOP(int1, ++) +DECLOP_1VAR_POSTOP(int1, --) + +DECLOP_1VAR_COMP(int1, ==) +DECLOP_1VAR_COMP(int1, !=) +DECLOP_1VAR_COMP(int1, <) +DECLOP_1VAR_COMP(int1, >) +DECLOP_1VAR_COMP(int1, <=) +DECLOP_1VAR_COMP(int1, >=) + +DECLOP_1VAR_COMP(int1, &&) +DECLOP_1VAR_COMP(int1, ||) + +DECLOP_1VAR_1IN_1OUT(int1, ~) +DECLOP_1VAR_1IN_BOOLOUT(int1, !) + +DECLOP_1VAR_SCALE_PRODUCT(int1, unsigned char) +DECLOP_1VAR_SCALE_PRODUCT(int1, signed char) +DECLOP_1VAR_SCALE_PRODUCT(int1, unsigned short) +DECLOP_1VAR_SCALE_PRODUCT(int1, signed short) +DECLOP_1VAR_SCALE_PRODUCT(int1, unsigned int) +DECLOP_1VAR_SCALE_PRODUCT(int1, signed int) +DECLOP_1VAR_SCALE_PRODUCT(int1, float) +DECLOP_1VAR_SCALE_PRODUCT(int1, unsigned long) +DECLOP_1VAR_SCALE_PRODUCT(int1, signed long) +DECLOP_1VAR_SCALE_PRODUCT(int1, double) +DECLOP_1VAR_SCALE_PRODUCT(int1, unsigned long long) +DECLOP_1VAR_SCALE_PRODUCT(int1, signed long long) + +// SIGNED INT2 + +DECLOP_2VAR_2IN_1OUT(int2, +) +DECLOP_2VAR_2IN_1OUT(int2, -) +DECLOP_2VAR_2IN_1OUT(int2, *) +DECLOP_2VAR_2IN_1OUT(int2, /) +DECLOP_2VAR_2IN_1OUT(int2, %) +DECLOP_2VAR_2IN_1OUT(int2, &) +DECLOP_2VAR_2IN_1OUT(int2, |) +DECLOP_2VAR_2IN_1OUT(int2, ^) +DECLOP_2VAR_2IN_1OUT(int2, <<) +DECLOP_2VAR_2IN_1OUT(int2, >>) + +DECLOP_2VAR_ASSIGN(int2, +=) +DECLOP_2VAR_ASSIGN(int2, -=) +DECLOP_2VAR_ASSIGN(int2, *=) +DECLOP_2VAR_ASSIGN(int2, /=) +DECLOP_2VAR_ASSIGN(int2, %=) +DECLOP_2VAR_ASSIGN(int2, &=) +DECLOP_2VAR_ASSIGN(int2, |=) +DECLOP_2VAR_ASSIGN(int2, ^=) +DECLOP_2VAR_ASSIGN(int2, <<=) +DECLOP_2VAR_ASSIGN(int2, >>=) + +DECLOP_2VAR_PREOP(int2, ++) +DECLOP_2VAR_PREOP(int2, --) + +DECLOP_2VAR_POSTOP(int2, ++) +DECLOP_2VAR_POSTOP(int2, --) + +DECLOP_2VAR_COMP(int2, ==) +DECLOP_2VAR_COMP(int2, !=) +DECLOP_2VAR_COMP(int2, <) +DECLOP_2VAR_COMP(int2, >) +DECLOP_2VAR_COMP(int2, <=) +DECLOP_2VAR_COMP(int2, >=) + +DECLOP_2VAR_COMP(int2, &&) +DECLOP_2VAR_COMP(int2, ||) + +DECLOP_2VAR_1IN_1OUT(int2, ~) +DECLOP_2VAR_1IN_BOOLOUT(int2, !) + +DECLOP_2VAR_SCALE_PRODUCT(int2, unsigned char) +DECLOP_2VAR_SCALE_PRODUCT(int2, signed char) +DECLOP_2VAR_SCALE_PRODUCT(int2, unsigned short) +DECLOP_2VAR_SCALE_PRODUCT(int2, signed short) +DECLOP_2VAR_SCALE_PRODUCT(int2, unsigned int) +DECLOP_2VAR_SCALE_PRODUCT(int2, signed int) +DECLOP_2VAR_SCALE_PRODUCT(int2, float) +DECLOP_2VAR_SCALE_PRODUCT(int2, unsigned long) +DECLOP_2VAR_SCALE_PRODUCT(int2, signed long) +DECLOP_2VAR_SCALE_PRODUCT(int2, double) +DECLOP_2VAR_SCALE_PRODUCT(int2, unsigned long long) +DECLOP_2VAR_SCALE_PRODUCT(int2, signed long long) + +// SIGNED INT3 + +DECLOP_3VAR_2IN_1OUT(int3, +) +DECLOP_3VAR_2IN_1OUT(int3, -) +DECLOP_3VAR_2IN_1OUT(int3, *) +DECLOP_3VAR_2IN_1OUT(int3, /) +DECLOP_3VAR_2IN_1OUT(int3, %) +DECLOP_3VAR_2IN_1OUT(int3, &) +DECLOP_3VAR_2IN_1OUT(int3, |) +DECLOP_3VAR_2IN_1OUT(int3, ^) +DECLOP_3VAR_2IN_1OUT(int3, <<) +DECLOP_3VAR_2IN_1OUT(int3, >>) + +DECLOP_3VAR_ASSIGN(int3, +=) +DECLOP_3VAR_ASSIGN(int3, -=) +DECLOP_3VAR_ASSIGN(int3, *=) +DECLOP_3VAR_ASSIGN(int3, /=) +DECLOP_3VAR_ASSIGN(int3, %=) +DECLOP_3VAR_ASSIGN(int3, &=) +DECLOP_3VAR_ASSIGN(int3, |=) +DECLOP_3VAR_ASSIGN(int3, ^=) +DECLOP_3VAR_ASSIGN(int3, <<=) +DECLOP_3VAR_ASSIGN(int3, >>=) + +DECLOP_3VAR_PREOP(int3, ++) +DECLOP_3VAR_PREOP(int3, --) + +DECLOP_3VAR_POSTOP(int3, ++) +DECLOP_3VAR_POSTOP(int3, --) + +DECLOP_3VAR_COMP(int3, ==) +DECLOP_3VAR_COMP(int3, !=) +DECLOP_3VAR_COMP(int3, <) +DECLOP_3VAR_COMP(int3, >) +DECLOP_3VAR_COMP(int3, <=) +DECLOP_3VAR_COMP(int3, >=) + +DECLOP_3VAR_COMP(int3, &&) +DECLOP_3VAR_COMP(int3, ||) + +DECLOP_3VAR_1IN_1OUT(int3, ~) +DECLOP_3VAR_1IN_BOOLOUT(int3, !) + +DECLOP_3VAR_SCALE_PRODUCT(int3, unsigned char) +DECLOP_3VAR_SCALE_PRODUCT(int3, signed char) +DECLOP_3VAR_SCALE_PRODUCT(int3, unsigned short) +DECLOP_3VAR_SCALE_PRODUCT(int3, signed short) +DECLOP_3VAR_SCALE_PRODUCT(int3, unsigned int) +DECLOP_3VAR_SCALE_PRODUCT(int3, signed int) +DECLOP_3VAR_SCALE_PRODUCT(int3, float) +DECLOP_3VAR_SCALE_PRODUCT(int3, unsigned long) +DECLOP_3VAR_SCALE_PRODUCT(int3, signed long) +DECLOP_3VAR_SCALE_PRODUCT(int3, double) +DECLOP_3VAR_SCALE_PRODUCT(int3, unsigned long long) +DECLOP_3VAR_SCALE_PRODUCT(int3, signed long long) + +// SIGNED INT4 + +DECLOP_4VAR_2IN_1OUT(int4, +) +DECLOP_4VAR_2IN_1OUT(int4, -) +DECLOP_4VAR_2IN_1OUT(int4, *) +DECLOP_4VAR_2IN_1OUT(int4, /) +DECLOP_4VAR_2IN_1OUT(int4, %) +DECLOP_4VAR_2IN_1OUT(int4, &) +DECLOP_4VAR_2IN_1OUT(int4, |) +DECLOP_4VAR_2IN_1OUT(int4, ^) +DECLOP_4VAR_2IN_1OUT(int4, <<) +DECLOP_4VAR_2IN_1OUT(int4, >>) + +DECLOP_4VAR_ASSIGN(int4, +=) +DECLOP_4VAR_ASSIGN(int4, -=) +DECLOP_4VAR_ASSIGN(int4, *=) +DECLOP_4VAR_ASSIGN(int4, /=) +DECLOP_4VAR_ASSIGN(int4, %=) +DECLOP_4VAR_ASSIGN(int4, &=) +DECLOP_4VAR_ASSIGN(int4, |=) +DECLOP_4VAR_ASSIGN(int4, ^=) +DECLOP_4VAR_ASSIGN(int4, <<=) +DECLOP_4VAR_ASSIGN(int4, >>=) + +DECLOP_4VAR_PREOP(int4, ++) +DECLOP_4VAR_PREOP(int4, --) + +DECLOP_4VAR_POSTOP(int4, ++) +DECLOP_4VAR_POSTOP(int4, --) + +DECLOP_4VAR_COMP(int4, ==) +DECLOP_4VAR_COMP(int4, !=) +DECLOP_4VAR_COMP(int4, <) +DECLOP_4VAR_COMP(int4, >) +DECLOP_4VAR_COMP(int4, <=) +DECLOP_4VAR_COMP(int4, >=) + +DECLOP_4VAR_COMP(int4, &&) +DECLOP_4VAR_COMP(int4, ||) + +DECLOP_4VAR_1IN_1OUT(int4, ~) +DECLOP_4VAR_1IN_BOOLOUT(int4, !) + +DECLOP_4VAR_SCALE_PRODUCT(int4, unsigned char) +DECLOP_4VAR_SCALE_PRODUCT(int4, signed char) +DECLOP_4VAR_SCALE_PRODUCT(int4, unsigned short) +DECLOP_4VAR_SCALE_PRODUCT(int4, signed short) +DECLOP_4VAR_SCALE_PRODUCT(int4, unsigned int) +DECLOP_4VAR_SCALE_PRODUCT(int4, signed int) +DECLOP_4VAR_SCALE_PRODUCT(int4, float) +DECLOP_4VAR_SCALE_PRODUCT(int4, unsigned long) +DECLOP_4VAR_SCALE_PRODUCT(int4, signed long) +DECLOP_4VAR_SCALE_PRODUCT(int4, double) +DECLOP_4VAR_SCALE_PRODUCT(int4, unsigned long long) +DECLOP_4VAR_SCALE_PRODUCT(int4, signed long long) + +// FLOAT1 + +DECLOP_1VAR_2IN_1OUT(float1, +) +DECLOP_1VAR_2IN_1OUT(float1, -) +DECLOP_1VAR_2IN_1OUT(float1, *) +DECLOP_1VAR_2IN_1OUT(float1, /) + +DECLOP_1VAR_ASSIGN(float1, +=) +DECLOP_1VAR_ASSIGN(float1, -=) +DECLOP_1VAR_ASSIGN(float1, *=) +DECLOP_1VAR_ASSIGN(float1, /=) + +DECLOP_1VAR_PREOP(float1, ++) +DECLOP_1VAR_PREOP(float1, --) + +DECLOP_1VAR_POSTOP(float1, ++) +DECLOP_1VAR_POSTOP(float1, --) + +DECLOP_1VAR_COMP(float1, ==) +DECLOP_1VAR_COMP(float1, !=) +DECLOP_1VAR_COMP(float1, <) +DECLOP_1VAR_COMP(float1, >) +DECLOP_1VAR_COMP(float1, <=) +DECLOP_1VAR_COMP(float1, >=) + +DECLOP_1VAR_SCALE_PRODUCT(float1, unsigned char) +DECLOP_1VAR_SCALE_PRODUCT(float1, signed char) +DECLOP_1VAR_SCALE_PRODUCT(float1, unsigned short) +DECLOP_1VAR_SCALE_PRODUCT(float1, signed short) +DECLOP_1VAR_SCALE_PRODUCT(float1, unsigned int) +DECLOP_1VAR_SCALE_PRODUCT(float1, signed int) +DECLOP_1VAR_SCALE_PRODUCT(float1, float) +DECLOP_1VAR_SCALE_PRODUCT(float1, unsigned long) +DECLOP_1VAR_SCALE_PRODUCT(float1, signed long) +DECLOP_1VAR_SCALE_PRODUCT(float1, double) +DECLOP_1VAR_SCALE_PRODUCT(float1, unsigned long long) +DECLOP_1VAR_SCALE_PRODUCT(float1, signed long long) + +// FLOAT2 + +DECLOP_2VAR_2IN_1OUT(float2, +) +DECLOP_2VAR_2IN_1OUT(float2, -) +DECLOP_2VAR_2IN_1OUT(float2, *) +DECLOP_2VAR_2IN_1OUT(float2, /) + +DECLOP_2VAR_ASSIGN(float2, +=) +DECLOP_2VAR_ASSIGN(float2, -=) +DECLOP_2VAR_ASSIGN(float2, *=) +DECLOP_2VAR_ASSIGN(float2, /=) + +DECLOP_2VAR_PREOP(float2, ++) +DECLOP_2VAR_PREOP(float2, --) + +DECLOP_2VAR_POSTOP(float2, ++) +DECLOP_2VAR_POSTOP(float2, --) + +DECLOP_2VAR_COMP(float2, ==) +DECLOP_2VAR_COMP(float2, !=) +DECLOP_2VAR_COMP(float2, <) +DECLOP_2VAR_COMP(float2, >) +DECLOP_2VAR_COMP(float2, <=) +DECLOP_2VAR_COMP(float2, >=) + +DECLOP_2VAR_SCALE_PRODUCT(float2, unsigned char) +DECLOP_2VAR_SCALE_PRODUCT(float2, signed char) +DECLOP_2VAR_SCALE_PRODUCT(float2, unsigned short) +DECLOP_2VAR_SCALE_PRODUCT(float2, signed short) +DECLOP_2VAR_SCALE_PRODUCT(float2, unsigned int) +DECLOP_2VAR_SCALE_PRODUCT(float2, signed int) +DECLOP_2VAR_SCALE_PRODUCT(float2, float) +DECLOP_2VAR_SCALE_PRODUCT(float2, unsigned long) +DECLOP_2VAR_SCALE_PRODUCT(float2, signed long) +DECLOP_2VAR_SCALE_PRODUCT(float2, double) +DECLOP_2VAR_SCALE_PRODUCT(float2, unsigned long long) +DECLOP_2VAR_SCALE_PRODUCT(float2, signed long long) + +// FLOAT3 + +DECLOP_3VAR_2IN_1OUT(float3, +) +DECLOP_3VAR_2IN_1OUT(float3, -) +DECLOP_3VAR_2IN_1OUT(float3, *) +DECLOP_3VAR_2IN_1OUT(float3, /) + +DECLOP_3VAR_ASSIGN(float3, +=) +DECLOP_3VAR_ASSIGN(float3, -=) +DECLOP_3VAR_ASSIGN(float3, *=) +DECLOP_3VAR_ASSIGN(float3, /=) + +DECLOP_3VAR_PREOP(float3, ++) +DECLOP_3VAR_PREOP(float3, --) + +DECLOP_3VAR_POSTOP(float3, ++) +DECLOP_3VAR_POSTOP(float3, --) + +DECLOP_3VAR_COMP(float3, ==) +DECLOP_3VAR_COMP(float3, !=) +DECLOP_3VAR_COMP(float3, <) +DECLOP_3VAR_COMP(float3, >) +DECLOP_3VAR_COMP(float3, <=) +DECLOP_3VAR_COMP(float3, >=) + +DECLOP_3VAR_SCALE_PRODUCT(float3, unsigned char) +DECLOP_3VAR_SCALE_PRODUCT(float3, signed char) +DECLOP_3VAR_SCALE_PRODUCT(float3, unsigned short) +DECLOP_3VAR_SCALE_PRODUCT(float3, signed short) +DECLOP_3VAR_SCALE_PRODUCT(float3, unsigned int) +DECLOP_3VAR_SCALE_PRODUCT(float3, signed int) +DECLOP_3VAR_SCALE_PRODUCT(float3, float) +DECLOP_3VAR_SCALE_PRODUCT(float3, unsigned long) +DECLOP_3VAR_SCALE_PRODUCT(float3, signed long) +DECLOP_3VAR_SCALE_PRODUCT(float3, double) +DECLOP_3VAR_SCALE_PRODUCT(float3, unsigned long long) +DECLOP_3VAR_SCALE_PRODUCT(float3, signed long long) + +// FLOAT4 + +DECLOP_4VAR_2IN_1OUT(float4, +) +DECLOP_4VAR_2IN_1OUT(float4, -) +DECLOP_4VAR_2IN_1OUT(float4, *) +DECLOP_4VAR_2IN_1OUT(float4, /) + +DECLOP_4VAR_ASSIGN(float4, +=) +DECLOP_4VAR_ASSIGN(float4, -=) +DECLOP_4VAR_ASSIGN(float4, *=) +DECLOP_4VAR_ASSIGN(float4, /=) + +DECLOP_4VAR_PREOP(float4, ++) +DECLOP_4VAR_PREOP(float4, --) + +DECLOP_4VAR_POSTOP(float4, ++) +DECLOP_4VAR_POSTOP(float4, --) + +DECLOP_4VAR_COMP(float4, ==) +DECLOP_4VAR_COMP(float4, !=) +DECLOP_4VAR_COMP(float4, <) +DECLOP_4VAR_COMP(float4, >) +DECLOP_4VAR_COMP(float4, <=) +DECLOP_4VAR_COMP(float4, >=) + +DECLOP_4VAR_SCALE_PRODUCT(float4, unsigned char) +DECLOP_4VAR_SCALE_PRODUCT(float4, signed char) +DECLOP_4VAR_SCALE_PRODUCT(float4, unsigned short) +DECLOP_4VAR_SCALE_PRODUCT(float4, signed short) +DECLOP_4VAR_SCALE_PRODUCT(float4, unsigned int) +DECLOP_4VAR_SCALE_PRODUCT(float4, signed int) +DECLOP_4VAR_SCALE_PRODUCT(float4, float) +DECLOP_4VAR_SCALE_PRODUCT(float4, unsigned long) +DECLOP_4VAR_SCALE_PRODUCT(float4, signed long) +DECLOP_4VAR_SCALE_PRODUCT(float4, double) +DECLOP_4VAR_SCALE_PRODUCT(float4, unsigned long long) +DECLOP_4VAR_SCALE_PRODUCT(float4, signed long long) + +// DOUBLE1 + +DECLOP_1VAR_2IN_1OUT(double1, +) +DECLOP_1VAR_2IN_1OUT(double1, -) +DECLOP_1VAR_2IN_1OUT(double1, *) +DECLOP_1VAR_2IN_1OUT(double1, /) + +DECLOP_1VAR_ASSIGN(double1, +=) +DECLOP_1VAR_ASSIGN(double1, -=) +DECLOP_1VAR_ASSIGN(double1, *=) +DECLOP_1VAR_ASSIGN(double1, /=) + +DECLOP_1VAR_PREOP(double1, ++) +DECLOP_1VAR_PREOP(double1, --) + +DECLOP_1VAR_POSTOP(double1, ++) +DECLOP_1VAR_POSTOP(double1, --) + +DECLOP_1VAR_COMP(double1, ==) +DECLOP_1VAR_COMP(double1, !=) +DECLOP_1VAR_COMP(double1, <) +DECLOP_1VAR_COMP(double1, >) +DECLOP_1VAR_COMP(double1, <=) +DECLOP_1VAR_COMP(double1, >=) + +DECLOP_1VAR_SCALE_PRODUCT(double1, unsigned char) +DECLOP_1VAR_SCALE_PRODUCT(double1, signed char) +DECLOP_1VAR_SCALE_PRODUCT(double1, unsigned short) +DECLOP_1VAR_SCALE_PRODUCT(double1, signed short) +DECLOP_1VAR_SCALE_PRODUCT(double1, unsigned int) +DECLOP_1VAR_SCALE_PRODUCT(double1, signed int) +DECLOP_1VAR_SCALE_PRODUCT(double1, float) +DECLOP_1VAR_SCALE_PRODUCT(double1, unsigned long) +DECLOP_1VAR_SCALE_PRODUCT(double1, signed long) +DECLOP_1VAR_SCALE_PRODUCT(double1, double) +DECLOP_1VAR_SCALE_PRODUCT(double1, unsigned long long) +DECLOP_1VAR_SCALE_PRODUCT(double1, signed long long) + +// DOUBLE2 + +DECLOP_2VAR_2IN_1OUT(double2, +) +DECLOP_2VAR_2IN_1OUT(double2, -) +DECLOP_2VAR_2IN_1OUT(double2, *) +DECLOP_2VAR_2IN_1OUT(double2, /) + +DECLOP_2VAR_ASSIGN(double2, +=) +DECLOP_2VAR_ASSIGN(double2, -=) +DECLOP_2VAR_ASSIGN(double2, *=) +DECLOP_2VAR_ASSIGN(double2, /=) + +DECLOP_2VAR_PREOP(double2, ++) +DECLOP_2VAR_PREOP(double2, --) + +DECLOP_2VAR_POSTOP(double2, ++) +DECLOP_2VAR_POSTOP(double2, --) + +DECLOP_2VAR_COMP(double2, ==) +DECLOP_2VAR_COMP(double2, !=) +DECLOP_2VAR_COMP(double2, <) +DECLOP_2VAR_COMP(double2, >) +DECLOP_2VAR_COMP(double2, <=) +DECLOP_2VAR_COMP(double2, >=) + +DECLOP_2VAR_SCALE_PRODUCT(double2, unsigned char) +DECLOP_2VAR_SCALE_PRODUCT(double2, signed char) +DECLOP_2VAR_SCALE_PRODUCT(double2, unsigned short) +DECLOP_2VAR_SCALE_PRODUCT(double2, signed short) +DECLOP_2VAR_SCALE_PRODUCT(double2, unsigned int) +DECLOP_2VAR_SCALE_PRODUCT(double2, signed int) +DECLOP_2VAR_SCALE_PRODUCT(double2, float) +DECLOP_2VAR_SCALE_PRODUCT(double2, unsigned long) +DECLOP_2VAR_SCALE_PRODUCT(double2, signed long) +DECLOP_2VAR_SCALE_PRODUCT(double2, double) +DECLOP_2VAR_SCALE_PRODUCT(double2, unsigned long long) +DECLOP_2VAR_SCALE_PRODUCT(double2, signed long long) + +// DOUBLE3 + +DECLOP_3VAR_2IN_1OUT(double3, +) +DECLOP_3VAR_2IN_1OUT(double3, -) +DECLOP_3VAR_2IN_1OUT(double3, *) +DECLOP_3VAR_2IN_1OUT(double3, /) + +DECLOP_3VAR_ASSIGN(double3, +=) +DECLOP_3VAR_ASSIGN(double3, -=) +DECLOP_3VAR_ASSIGN(double3, *=) +DECLOP_3VAR_ASSIGN(double3, /=) + +DECLOP_3VAR_PREOP(double3, ++) +DECLOP_3VAR_PREOP(double3, --) + +DECLOP_3VAR_POSTOP(double3, ++) +DECLOP_3VAR_POSTOP(double3, --) + +DECLOP_3VAR_COMP(double3, ==) +DECLOP_3VAR_COMP(double3, !=) +DECLOP_3VAR_COMP(double3, <) +DECLOP_3VAR_COMP(double3, >) +DECLOP_3VAR_COMP(double3, <=) +DECLOP_3VAR_COMP(double3, >=) + +DECLOP_3VAR_SCALE_PRODUCT(double3, unsigned char) +DECLOP_3VAR_SCALE_PRODUCT(double3, signed char) +DECLOP_3VAR_SCALE_PRODUCT(double3, unsigned short) +DECLOP_3VAR_SCALE_PRODUCT(double3, signed short) +DECLOP_3VAR_SCALE_PRODUCT(double3, unsigned int) +DECLOP_3VAR_SCALE_PRODUCT(double3, signed int) +DECLOP_3VAR_SCALE_PRODUCT(double3, float) +DECLOP_3VAR_SCALE_PRODUCT(double3, unsigned long) +DECLOP_3VAR_SCALE_PRODUCT(double3, signed long) +DECLOP_3VAR_SCALE_PRODUCT(double3, double) +DECLOP_3VAR_SCALE_PRODUCT(double3, unsigned long long) +DECLOP_3VAR_SCALE_PRODUCT(double3, signed long long) + +// DOUBLE4 + +DECLOP_4VAR_2IN_1OUT(double4, +) +DECLOP_4VAR_2IN_1OUT(double4, -) +DECLOP_4VAR_2IN_1OUT(double4, *) +DECLOP_4VAR_2IN_1OUT(double4, /) + +DECLOP_4VAR_ASSIGN(double4, +=) +DECLOP_4VAR_ASSIGN(double4, -=) +DECLOP_4VAR_ASSIGN(double4, *=) +DECLOP_4VAR_ASSIGN(double4, /=) + +DECLOP_4VAR_PREOP(double4, ++) +DECLOP_4VAR_PREOP(double4, --) + +DECLOP_4VAR_POSTOP(double4, ++) +DECLOP_4VAR_POSTOP(double4, --) + +DECLOP_4VAR_COMP(double4, ==) +DECLOP_4VAR_COMP(double4, !=) +DECLOP_4VAR_COMP(double4, <) +DECLOP_4VAR_COMP(double4, >) +DECLOP_4VAR_COMP(double4, <=) +DECLOP_4VAR_COMP(double4, >=) + +DECLOP_4VAR_SCALE_PRODUCT(double4, unsigned char) +DECLOP_4VAR_SCALE_PRODUCT(double4, signed char) +DECLOP_4VAR_SCALE_PRODUCT(double4, unsigned short) +DECLOP_4VAR_SCALE_PRODUCT(double4, signed short) +DECLOP_4VAR_SCALE_PRODUCT(double4, unsigned int) +DECLOP_4VAR_SCALE_PRODUCT(double4, signed int) +DECLOP_4VAR_SCALE_PRODUCT(double4, float) +DECLOP_4VAR_SCALE_PRODUCT(double4, unsigned long) +DECLOP_4VAR_SCALE_PRODUCT(double4, signed long) +DECLOP_4VAR_SCALE_PRODUCT(double4, double) +DECLOP_4VAR_SCALE_PRODUCT(double4, unsigned long long) +DECLOP_4VAR_SCALE_PRODUCT(double4, signed long long) + +// UNSIGNED LONG1 + +DECLOP_1VAR_2IN_1OUT(ulong1, +) +DECLOP_1VAR_2IN_1OUT(ulong1, -) +DECLOP_1VAR_2IN_1OUT(ulong1, *) +DECLOP_1VAR_2IN_1OUT(ulong1, /) +DECLOP_1VAR_2IN_1OUT(ulong1, %) +DECLOP_1VAR_2IN_1OUT(ulong1, &) +DECLOP_1VAR_2IN_1OUT(ulong1, |) +DECLOP_1VAR_2IN_1OUT(ulong1, ^) +DECLOP_1VAR_2IN_1OUT(ulong1, <<) +DECLOP_1VAR_2IN_1OUT(ulong1, >>) + + +DECLOP_1VAR_ASSIGN(ulong1, +=) +DECLOP_1VAR_ASSIGN(ulong1, -=) +DECLOP_1VAR_ASSIGN(ulong1, *=) +DECLOP_1VAR_ASSIGN(ulong1, /=) +DECLOP_1VAR_ASSIGN(ulong1, %=) +DECLOP_1VAR_ASSIGN(ulong1, &=) +DECLOP_1VAR_ASSIGN(ulong1, |=) +DECLOP_1VAR_ASSIGN(ulong1, ^=) +DECLOP_1VAR_ASSIGN(ulong1, <<=) +DECLOP_1VAR_ASSIGN(ulong1, >>=) + +DECLOP_1VAR_PREOP(ulong1, ++) +DECLOP_1VAR_PREOP(ulong1, --) + +DECLOP_1VAR_POSTOP(ulong1, ++) +DECLOP_1VAR_POSTOP(ulong1, --) + +DECLOP_1VAR_COMP(ulong1, ==) +DECLOP_1VAR_COMP(ulong1, !=) +DECLOP_1VAR_COMP(ulong1, <) +DECLOP_1VAR_COMP(ulong1, >) +DECLOP_1VAR_COMP(ulong1, <=) +DECLOP_1VAR_COMP(ulong1, >=) + +DECLOP_1VAR_COMP(ulong1, &&) +DECLOP_1VAR_COMP(ulong1, ||) + +DECLOP_1VAR_1IN_1OUT(ulong1, ~) +DECLOP_1VAR_1IN_BOOLOUT(ulong1, !) + +DECLOP_1VAR_SCALE_PRODUCT(ulong1, unsigned char) +DECLOP_1VAR_SCALE_PRODUCT(ulong1, signed char) +DECLOP_1VAR_SCALE_PRODUCT(ulong1, unsigned short) +DECLOP_1VAR_SCALE_PRODUCT(ulong1, signed short) +DECLOP_1VAR_SCALE_PRODUCT(ulong1, unsigned int) +DECLOP_1VAR_SCALE_PRODUCT(ulong1, signed int) +DECLOP_1VAR_SCALE_PRODUCT(ulong1, float) +DECLOP_1VAR_SCALE_PRODUCT(ulong1, unsigned long) +DECLOP_1VAR_SCALE_PRODUCT(ulong1, signed long) +DECLOP_1VAR_SCALE_PRODUCT(ulong1, double) +DECLOP_1VAR_SCALE_PRODUCT(ulong1, unsigned long long) +DECLOP_1VAR_SCALE_PRODUCT(ulong1, signed long long) + +// UNSIGNED LONG2 + +DECLOP_2VAR_2IN_1OUT(ulong2, +) +DECLOP_2VAR_2IN_1OUT(ulong2, -) +DECLOP_2VAR_2IN_1OUT(ulong2, *) +DECLOP_2VAR_2IN_1OUT(ulong2, /) +DECLOP_2VAR_2IN_1OUT(ulong2, %) +DECLOP_2VAR_2IN_1OUT(ulong2, &) +DECLOP_2VAR_2IN_1OUT(ulong2, |) +DECLOP_2VAR_2IN_1OUT(ulong2, ^) +DECLOP_2VAR_2IN_1OUT(ulong2, <<) +DECLOP_2VAR_2IN_1OUT(ulong2, >>) + +DECLOP_2VAR_ASSIGN(ulong2, +=) +DECLOP_2VAR_ASSIGN(ulong2, -=) +DECLOP_2VAR_ASSIGN(ulong2, *=) +DECLOP_2VAR_ASSIGN(ulong2, /=) +DECLOP_2VAR_ASSIGN(ulong2, %=) +DECLOP_2VAR_ASSIGN(ulong2, &=) +DECLOP_2VAR_ASSIGN(ulong2, |=) +DECLOP_2VAR_ASSIGN(ulong2, ^=) +DECLOP_2VAR_ASSIGN(ulong2, <<=) +DECLOP_2VAR_ASSIGN(ulong2, >>=) + +DECLOP_2VAR_PREOP(ulong2, ++) +DECLOP_2VAR_PREOP(ulong2, --) + +DECLOP_2VAR_POSTOP(ulong2, ++) +DECLOP_2VAR_POSTOP(ulong2, --) + +DECLOP_2VAR_COMP(ulong2, ==) +DECLOP_2VAR_COMP(ulong2, !=) +DECLOP_2VAR_COMP(ulong2, <) +DECLOP_2VAR_COMP(ulong2, >) +DECLOP_2VAR_COMP(ulong2, <=) +DECLOP_2VAR_COMP(ulong2, >=) + +DECLOP_2VAR_COMP(ulong2, &&) +DECLOP_2VAR_COMP(ulong2, ||) + +DECLOP_2VAR_1IN_1OUT(ulong2, ~) +DECLOP_2VAR_1IN_BOOLOUT(ulong2, !) + +DECLOP_2VAR_SCALE_PRODUCT(ulong2, unsigned char) +DECLOP_2VAR_SCALE_PRODUCT(ulong2, signed char) +DECLOP_2VAR_SCALE_PRODUCT(ulong2, unsigned short) +DECLOP_2VAR_SCALE_PRODUCT(ulong2, signed short) +DECLOP_2VAR_SCALE_PRODUCT(ulong2, unsigned int) +DECLOP_2VAR_SCALE_PRODUCT(ulong2, signed int) +DECLOP_2VAR_SCALE_PRODUCT(ulong2, float) +DECLOP_2VAR_SCALE_PRODUCT(ulong2, unsigned long) +DECLOP_2VAR_SCALE_PRODUCT(ulong2, signed long) +DECLOP_2VAR_SCALE_PRODUCT(ulong2, double) +DECLOP_2VAR_SCALE_PRODUCT(ulong2, unsigned long long) +DECLOP_2VAR_SCALE_PRODUCT(ulong2, signed long long) + +// UNSIGNED LONG3 + +DECLOP_3VAR_2IN_1OUT(ulong3, +) +DECLOP_3VAR_2IN_1OUT(ulong3, -) +DECLOP_3VAR_2IN_1OUT(ulong3, *) +DECLOP_3VAR_2IN_1OUT(ulong3, /) +DECLOP_3VAR_2IN_1OUT(ulong3, %) +DECLOP_3VAR_2IN_1OUT(ulong3, &) +DECLOP_3VAR_2IN_1OUT(ulong3, |) +DECLOP_3VAR_2IN_1OUT(ulong3, ^) +DECLOP_3VAR_2IN_1OUT(ulong3, <<) +DECLOP_3VAR_2IN_1OUT(ulong3, >>) + +DECLOP_3VAR_ASSIGN(ulong3, +=) +DECLOP_3VAR_ASSIGN(ulong3, -=) +DECLOP_3VAR_ASSIGN(ulong3, *=) +DECLOP_3VAR_ASSIGN(ulong3, /=) +DECLOP_3VAR_ASSIGN(ulong3, %=) +DECLOP_3VAR_ASSIGN(ulong3, &=) +DECLOP_3VAR_ASSIGN(ulong3, |=) +DECLOP_3VAR_ASSIGN(ulong3, ^=) +DECLOP_3VAR_ASSIGN(ulong3, <<=) +DECLOP_3VAR_ASSIGN(ulong3, >>=) + +DECLOP_3VAR_PREOP(ulong3, ++) +DECLOP_3VAR_PREOP(ulong3, --) + +DECLOP_3VAR_POSTOP(ulong3, ++) +DECLOP_3VAR_POSTOP(ulong3, --) + +DECLOP_3VAR_COMP(ulong3, ==) +DECLOP_3VAR_COMP(ulong3, !=) +DECLOP_3VAR_COMP(ulong3, <) +DECLOP_3VAR_COMP(ulong3, >) +DECLOP_3VAR_COMP(ulong3, <=) +DECLOP_3VAR_COMP(ulong3, >=) + +DECLOP_3VAR_COMP(ulong3, &&) +DECLOP_3VAR_COMP(ulong3, ||) + +DECLOP_3VAR_1IN_1OUT(ulong3, ~) +DECLOP_3VAR_1IN_BOOLOUT(ulong3, !) + +DECLOP_3VAR_SCALE_PRODUCT(ulong3, unsigned char) +DECLOP_3VAR_SCALE_PRODUCT(ulong3, signed char) +DECLOP_3VAR_SCALE_PRODUCT(ulong3, unsigned short) +DECLOP_3VAR_SCALE_PRODUCT(ulong3, signed short) +DECLOP_3VAR_SCALE_PRODUCT(ulong3, unsigned int) +DECLOP_3VAR_SCALE_PRODUCT(ulong3, signed int) +DECLOP_3VAR_SCALE_PRODUCT(ulong3, float) +DECLOP_3VAR_SCALE_PRODUCT(ulong3, unsigned long) +DECLOP_3VAR_SCALE_PRODUCT(ulong3, signed long) +DECLOP_3VAR_SCALE_PRODUCT(ulong3, double) +DECLOP_3VAR_SCALE_PRODUCT(ulong3, unsigned long long) +DECLOP_3VAR_SCALE_PRODUCT(ulong3, signed long long) + +// UNSIGNED LONG4 + +DECLOP_4VAR_2IN_1OUT(ulong4, +) +DECLOP_4VAR_2IN_1OUT(ulong4, -) +DECLOP_4VAR_2IN_1OUT(ulong4, *) +DECLOP_4VAR_2IN_1OUT(ulong4, /) +DECLOP_4VAR_2IN_1OUT(ulong4, %) +DECLOP_4VAR_2IN_1OUT(ulong4, &) +DECLOP_4VAR_2IN_1OUT(ulong4, |) +DECLOP_4VAR_2IN_1OUT(ulong4, ^) +DECLOP_4VAR_2IN_1OUT(ulong4, <<) +DECLOP_4VAR_2IN_1OUT(ulong4, >>) + +DECLOP_4VAR_ASSIGN(ulong4, +=) +DECLOP_4VAR_ASSIGN(ulong4, -=) +DECLOP_4VAR_ASSIGN(ulong4, *=) +DECLOP_4VAR_ASSIGN(ulong4, /=) +DECLOP_4VAR_ASSIGN(ulong4, %=) +DECLOP_4VAR_ASSIGN(ulong4, &=) +DECLOP_4VAR_ASSIGN(ulong4, |=) +DECLOP_4VAR_ASSIGN(ulong4, ^=) +DECLOP_4VAR_ASSIGN(ulong4, <<=) +DECLOP_4VAR_ASSIGN(ulong4, >>=) + +DECLOP_4VAR_PREOP(ulong4, ++) +DECLOP_4VAR_PREOP(ulong4, --) + +DECLOP_4VAR_POSTOP(ulong4, ++) +DECLOP_4VAR_POSTOP(ulong4, --) + +DECLOP_4VAR_COMP(ulong4, ==) +DECLOP_4VAR_COMP(ulong4, !=) +DECLOP_4VAR_COMP(ulong4, <) +DECLOP_4VAR_COMP(ulong4, >) +DECLOP_4VAR_COMP(ulong4, <=) +DECLOP_4VAR_COMP(ulong4, >=) + +DECLOP_4VAR_COMP(ulong4, &&) +DECLOP_4VAR_COMP(ulong4, ||) + +DECLOP_4VAR_1IN_1OUT(ulong4, ~) +DECLOP_4VAR_1IN_BOOLOUT(ulong4, !) + +DECLOP_4VAR_SCALE_PRODUCT(ulong4, unsigned char) +DECLOP_4VAR_SCALE_PRODUCT(ulong4, signed char) +DECLOP_4VAR_SCALE_PRODUCT(ulong4, unsigned short) +DECLOP_4VAR_SCALE_PRODUCT(ulong4, signed short) +DECLOP_4VAR_SCALE_PRODUCT(ulong4, unsigned int) +DECLOP_4VAR_SCALE_PRODUCT(ulong4, signed int) +DECLOP_4VAR_SCALE_PRODUCT(ulong4, float) +DECLOP_4VAR_SCALE_PRODUCT(ulong4, unsigned long) +DECLOP_4VAR_SCALE_PRODUCT(ulong4, signed long) +DECLOP_4VAR_SCALE_PRODUCT(ulong4, double) +DECLOP_4VAR_SCALE_PRODUCT(ulong4, unsigned long long) +DECLOP_4VAR_SCALE_PRODUCT(ulong4, signed long long) + +// SIGNED LONG1 + +DECLOP_1VAR_2IN_1OUT(long1, +) +DECLOP_1VAR_2IN_1OUT(long1, -) +DECLOP_1VAR_2IN_1OUT(long1, *) +DECLOP_1VAR_2IN_1OUT(long1, /) +DECLOP_1VAR_2IN_1OUT(long1, %) +DECLOP_1VAR_2IN_1OUT(long1, &) +DECLOP_1VAR_2IN_1OUT(long1, |) +DECLOP_1VAR_2IN_1OUT(long1, ^) +DECLOP_1VAR_2IN_1OUT(long1, <<) +DECLOP_1VAR_2IN_1OUT(long1, >>) + + +DECLOP_1VAR_ASSIGN(long1, +=) +DECLOP_1VAR_ASSIGN(long1, -=) +DECLOP_1VAR_ASSIGN(long1, *=) +DECLOP_1VAR_ASSIGN(long1, /=) +DECLOP_1VAR_ASSIGN(long1, %=) +DECLOP_1VAR_ASSIGN(long1, &=) +DECLOP_1VAR_ASSIGN(long1, |=) +DECLOP_1VAR_ASSIGN(long1, ^=) +DECLOP_1VAR_ASSIGN(long1, <<=) +DECLOP_1VAR_ASSIGN(long1, >>=) + +DECLOP_1VAR_PREOP(long1, ++) +DECLOP_1VAR_PREOP(long1, --) + +DECLOP_1VAR_POSTOP(long1, ++) +DECLOP_1VAR_POSTOP(long1, --) + +DECLOP_1VAR_COMP(long1, ==) +DECLOP_1VAR_COMP(long1, !=) +DECLOP_1VAR_COMP(long1, <) +DECLOP_1VAR_COMP(long1, >) +DECLOP_1VAR_COMP(long1, <=) +DECLOP_1VAR_COMP(long1, >=) + +DECLOP_1VAR_COMP(long1, &&) +DECLOP_1VAR_COMP(long1, ||) + +DECLOP_1VAR_1IN_1OUT(long1, ~) +DECLOP_1VAR_1IN_BOOLOUT(long1, !) + +DECLOP_1VAR_SCALE_PRODUCT(long1, unsigned char) +DECLOP_1VAR_SCALE_PRODUCT(long1, signed char) +DECLOP_1VAR_SCALE_PRODUCT(long1, unsigned short) +DECLOP_1VAR_SCALE_PRODUCT(long1, signed short) +DECLOP_1VAR_SCALE_PRODUCT(long1, unsigned int) +DECLOP_1VAR_SCALE_PRODUCT(long1, signed int) +DECLOP_1VAR_SCALE_PRODUCT(long1, float) +DECLOP_1VAR_SCALE_PRODUCT(long1, unsigned long) +DECLOP_1VAR_SCALE_PRODUCT(long1, signed long) +DECLOP_1VAR_SCALE_PRODUCT(long1, double) +DECLOP_1VAR_SCALE_PRODUCT(long1, unsigned long long) +DECLOP_1VAR_SCALE_PRODUCT(long1, signed long long) + +// SIGNED LONG2 + +DECLOP_2VAR_2IN_1OUT(long2, +) +DECLOP_2VAR_2IN_1OUT(long2, -) +DECLOP_2VAR_2IN_1OUT(long2, *) +DECLOP_2VAR_2IN_1OUT(long2, /) +DECLOP_2VAR_2IN_1OUT(long2, %) +DECLOP_2VAR_2IN_1OUT(long2, &) +DECLOP_2VAR_2IN_1OUT(long2, |) +DECLOP_2VAR_2IN_1OUT(long2, ^) +DECLOP_2VAR_2IN_1OUT(long2, <<) +DECLOP_2VAR_2IN_1OUT(long2, >>) + +DECLOP_2VAR_ASSIGN(long2, +=) +DECLOP_2VAR_ASSIGN(long2, -=) +DECLOP_2VAR_ASSIGN(long2, *=) +DECLOP_2VAR_ASSIGN(long2, /=) +DECLOP_2VAR_ASSIGN(long2, %=) +DECLOP_2VAR_ASSIGN(long2, &=) +DECLOP_2VAR_ASSIGN(long2, |=) +DECLOP_2VAR_ASSIGN(long2, ^=) +DECLOP_2VAR_ASSIGN(long2, <<=) +DECLOP_2VAR_ASSIGN(long2, >>=) + +DECLOP_2VAR_PREOP(long2, ++) +DECLOP_2VAR_PREOP(long2, --) + +DECLOP_2VAR_POSTOP(long2, ++) +DECLOP_2VAR_POSTOP(long2, --) + +DECLOP_2VAR_COMP(long2, ==) +DECLOP_2VAR_COMP(long2, !=) +DECLOP_2VAR_COMP(long2, <) +DECLOP_2VAR_COMP(long2, >) +DECLOP_2VAR_COMP(long2, <=) +DECLOP_2VAR_COMP(long2, >=) + +DECLOP_2VAR_COMP(long2, &&) +DECLOP_2VAR_COMP(long2, ||) + +DECLOP_2VAR_1IN_1OUT(long2, ~) +DECLOP_2VAR_1IN_BOOLOUT(long2, !) + +DECLOP_2VAR_SCALE_PRODUCT(long2, unsigned char) +DECLOP_2VAR_SCALE_PRODUCT(long2, signed char) +DECLOP_2VAR_SCALE_PRODUCT(long2, unsigned short) +DECLOP_2VAR_SCALE_PRODUCT(long2, signed short) +DECLOP_2VAR_SCALE_PRODUCT(long2, unsigned int) +DECLOP_2VAR_SCALE_PRODUCT(long2, signed int) +DECLOP_2VAR_SCALE_PRODUCT(long2, float) +DECLOP_2VAR_SCALE_PRODUCT(long2, unsigned long) +DECLOP_2VAR_SCALE_PRODUCT(long2, signed long) +DECLOP_2VAR_SCALE_PRODUCT(long2, double) +DECLOP_2VAR_SCALE_PRODUCT(long2, unsigned long long) +DECLOP_2VAR_SCALE_PRODUCT(long2, signed long long) + +// SIGNED LONG3 + +DECLOP_3VAR_2IN_1OUT(long3, +) +DECLOP_3VAR_2IN_1OUT(long3, -) +DECLOP_3VAR_2IN_1OUT(long3, *) +DECLOP_3VAR_2IN_1OUT(long3, /) +DECLOP_3VAR_2IN_1OUT(long3, %) +DECLOP_3VAR_2IN_1OUT(long3, &) +DECLOP_3VAR_2IN_1OUT(long3, |) +DECLOP_3VAR_2IN_1OUT(long3, ^) +DECLOP_3VAR_2IN_1OUT(long3, <<) +DECLOP_3VAR_2IN_1OUT(long3, >>) + +DECLOP_3VAR_ASSIGN(long3, +=) +DECLOP_3VAR_ASSIGN(long3, -=) +DECLOP_3VAR_ASSIGN(long3, *=) +DECLOP_3VAR_ASSIGN(long3, /=) +DECLOP_3VAR_ASSIGN(long3, %=) +DECLOP_3VAR_ASSIGN(long3, &=) +DECLOP_3VAR_ASSIGN(long3, |=) +DECLOP_3VAR_ASSIGN(long3, ^=) +DECLOP_3VAR_ASSIGN(long3, <<=) +DECLOP_3VAR_ASSIGN(long3, >>=) + +DECLOP_3VAR_PREOP(long3, ++) +DECLOP_3VAR_PREOP(long3, --) + +DECLOP_3VAR_POSTOP(long3, ++) +DECLOP_3VAR_POSTOP(long3, --) + +DECLOP_3VAR_COMP(long3, ==) +DECLOP_3VAR_COMP(long3, !=) +DECLOP_3VAR_COMP(long3, <) +DECLOP_3VAR_COMP(long3, >) +DECLOP_3VAR_COMP(long3, <=) +DECLOP_3VAR_COMP(long3, >=) + +DECLOP_3VAR_COMP(long3, &&) +DECLOP_3VAR_COMP(long3, ||) + +DECLOP_3VAR_1IN_1OUT(long3, ~) +DECLOP_3VAR_1IN_BOOLOUT(long3, !) + +DECLOP_3VAR_SCALE_PRODUCT(long3, unsigned char) +DECLOP_3VAR_SCALE_PRODUCT(long3, signed char) +DECLOP_3VAR_SCALE_PRODUCT(long3, unsigned short) +DECLOP_3VAR_SCALE_PRODUCT(long3, signed short) +DECLOP_3VAR_SCALE_PRODUCT(long3, unsigned int) +DECLOP_3VAR_SCALE_PRODUCT(long3, signed int) +DECLOP_3VAR_SCALE_PRODUCT(long3, float) +DECLOP_3VAR_SCALE_PRODUCT(long3, unsigned long) +DECLOP_3VAR_SCALE_PRODUCT(long3, signed long) +DECLOP_3VAR_SCALE_PRODUCT(long3, double) +DECLOP_3VAR_SCALE_PRODUCT(long3, unsigned long long) +DECLOP_3VAR_SCALE_PRODUCT(long3, signed long long) + +// SIGNED LONG4 + +DECLOP_4VAR_2IN_1OUT(long4, +) +DECLOP_4VAR_2IN_1OUT(long4, -) +DECLOP_4VAR_2IN_1OUT(long4, *) +DECLOP_4VAR_2IN_1OUT(long4, /) +DECLOP_4VAR_2IN_1OUT(long4, %) +DECLOP_4VAR_2IN_1OUT(long4, &) +DECLOP_4VAR_2IN_1OUT(long4, |) +DECLOP_4VAR_2IN_1OUT(long4, ^) +DECLOP_4VAR_2IN_1OUT(long4, <<) +DECLOP_4VAR_2IN_1OUT(long4, >>) + +DECLOP_4VAR_ASSIGN(long4, +=) +DECLOP_4VAR_ASSIGN(long4, -=) +DECLOP_4VAR_ASSIGN(long4, *=) +DECLOP_4VAR_ASSIGN(long4, /=) +DECLOP_4VAR_ASSIGN(long4, %=) +DECLOP_4VAR_ASSIGN(long4, &=) +DECLOP_4VAR_ASSIGN(long4, |=) +DECLOP_4VAR_ASSIGN(long4, ^=) +DECLOP_4VAR_ASSIGN(long4, <<=) +DECLOP_4VAR_ASSIGN(long4, >>=) + +DECLOP_4VAR_PREOP(long4, ++) +DECLOP_4VAR_PREOP(long4, --) + +DECLOP_4VAR_POSTOP(long4, ++) +DECLOP_4VAR_POSTOP(long4, --) + +DECLOP_4VAR_COMP(long4, ==) +DECLOP_4VAR_COMP(long4, !=) +DECLOP_4VAR_COMP(long4, <) +DECLOP_4VAR_COMP(long4, >) +DECLOP_4VAR_COMP(long4, <=) +DECLOP_4VAR_COMP(long4, >=) + +DECLOP_4VAR_COMP(long4, &&) +DECLOP_4VAR_COMP(long4, ||) + +DECLOP_4VAR_1IN_1OUT(long4, ~) +DECLOP_4VAR_1IN_BOOLOUT(long4, !) + +DECLOP_4VAR_SCALE_PRODUCT(long4, unsigned char) +DECLOP_4VAR_SCALE_PRODUCT(long4, signed char) +DECLOP_4VAR_SCALE_PRODUCT(long4, unsigned short) +DECLOP_4VAR_SCALE_PRODUCT(long4, signed short) +DECLOP_4VAR_SCALE_PRODUCT(long4, unsigned int) +DECLOP_4VAR_SCALE_PRODUCT(long4, signed int) +DECLOP_4VAR_SCALE_PRODUCT(long4, float) +DECLOP_4VAR_SCALE_PRODUCT(long4, unsigned long) +DECLOP_4VAR_SCALE_PRODUCT(long4, signed long) +DECLOP_4VAR_SCALE_PRODUCT(long4, double) +DECLOP_4VAR_SCALE_PRODUCT(long4, unsigned long long) +DECLOP_4VAR_SCALE_PRODUCT(long4, signed long long) + +// UNSIGNED LONGLONG1 + +DECLOP_1VAR_2IN_1OUT(ulonglong1, +) +DECLOP_1VAR_2IN_1OUT(ulonglong1, -) +DECLOP_1VAR_2IN_1OUT(ulonglong1, *) +DECLOP_1VAR_2IN_1OUT(ulonglong1, /) +DECLOP_1VAR_2IN_1OUT(ulonglong1, %) +DECLOP_1VAR_2IN_1OUT(ulonglong1, &) +DECLOP_1VAR_2IN_1OUT(ulonglong1, |) +DECLOP_1VAR_2IN_1OUT(ulonglong1, ^) +DECLOP_1VAR_2IN_1OUT(ulonglong1, <<) +DECLOP_1VAR_2IN_1OUT(ulonglong1, >>) + + +DECLOP_1VAR_ASSIGN(ulonglong1, +=) +DECLOP_1VAR_ASSIGN(ulonglong1, -=) +DECLOP_1VAR_ASSIGN(ulonglong1, *=) +DECLOP_1VAR_ASSIGN(ulonglong1, /=) +DECLOP_1VAR_ASSIGN(ulonglong1, %=) +DECLOP_1VAR_ASSIGN(ulonglong1, &=) +DECLOP_1VAR_ASSIGN(ulonglong1, |=) +DECLOP_1VAR_ASSIGN(ulonglong1, ^=) +DECLOP_1VAR_ASSIGN(ulonglong1, <<=) +DECLOP_1VAR_ASSIGN(ulonglong1, >>=) + +DECLOP_1VAR_PREOP(ulonglong1, ++) +DECLOP_1VAR_PREOP(ulonglong1, --) + +DECLOP_1VAR_POSTOP(ulonglong1, ++) +DECLOP_1VAR_POSTOP(ulonglong1, --) + +DECLOP_1VAR_COMP(ulonglong1, ==) +DECLOP_1VAR_COMP(ulonglong1, !=) +DECLOP_1VAR_COMP(ulonglong1, <) +DECLOP_1VAR_COMP(ulonglong1, >) +DECLOP_1VAR_COMP(ulonglong1, <=) +DECLOP_1VAR_COMP(ulonglong1, >=) + +DECLOP_1VAR_COMP(ulonglong1, &&) +DECLOP_1VAR_COMP(ulonglong1, ||) + +DECLOP_1VAR_1IN_1OUT(ulonglong1, ~) +DECLOP_1VAR_1IN_BOOLOUT(ulonglong1, !) + +DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, unsigned char) +DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, signed char) +DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, unsigned short) +DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, signed short) +DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, unsigned int) +DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, signed int) +DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, float) +DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, unsigned long) +DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, signed long) +DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, double) +DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, unsigned long long) +DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, signed long long) + +// UNSIGNED LONGLONG2 + +DECLOP_2VAR_2IN_1OUT(ulonglong2, +) +DECLOP_2VAR_2IN_1OUT(ulonglong2, -) +DECLOP_2VAR_2IN_1OUT(ulonglong2, *) +DECLOP_2VAR_2IN_1OUT(ulonglong2, /) +DECLOP_2VAR_2IN_1OUT(ulonglong2, %) +DECLOP_2VAR_2IN_1OUT(ulonglong2, &) +DECLOP_2VAR_2IN_1OUT(ulonglong2, |) +DECLOP_2VAR_2IN_1OUT(ulonglong2, ^) +DECLOP_2VAR_2IN_1OUT(ulonglong2, <<) +DECLOP_2VAR_2IN_1OUT(ulonglong2, >>) + +DECLOP_2VAR_ASSIGN(ulonglong2, +=) +DECLOP_2VAR_ASSIGN(ulonglong2, -=) +DECLOP_2VAR_ASSIGN(ulonglong2, *=) +DECLOP_2VAR_ASSIGN(ulonglong2, /=) +DECLOP_2VAR_ASSIGN(ulonglong2, %=) +DECLOP_2VAR_ASSIGN(ulonglong2, &=) +DECLOP_2VAR_ASSIGN(ulonglong2, |=) +DECLOP_2VAR_ASSIGN(ulonglong2, ^=) +DECLOP_2VAR_ASSIGN(ulonglong2, <<=) +DECLOP_2VAR_ASSIGN(ulonglong2, >>=) + +DECLOP_2VAR_PREOP(ulonglong2, ++) +DECLOP_2VAR_PREOP(ulonglong2, --) + +DECLOP_2VAR_POSTOP(ulonglong2, ++) +DECLOP_2VAR_POSTOP(ulonglong2, --) + +DECLOP_2VAR_COMP(ulonglong2, ==) +DECLOP_2VAR_COMP(ulonglong2, !=) +DECLOP_2VAR_COMP(ulonglong2, <) +DECLOP_2VAR_COMP(ulonglong2, >) +DECLOP_2VAR_COMP(ulonglong2, <=) +DECLOP_2VAR_COMP(ulonglong2, >=) + +DECLOP_2VAR_COMP(ulonglong2, &&) +DECLOP_2VAR_COMP(ulonglong2, ||) + +DECLOP_2VAR_1IN_1OUT(ulonglong2, ~) +DECLOP_2VAR_1IN_BOOLOUT(ulonglong2, !) + +DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, unsigned char) +DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, signed char) +DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, unsigned short) +DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, signed short) +DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, unsigned int) +DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, signed int) +DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, float) +DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, unsigned long) +DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, signed long) +DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, double) +DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, unsigned long long) +DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, signed long long) + +// UNSIGNED LONGLONG3 + +DECLOP_3VAR_2IN_1OUT(ulonglong3, +) +DECLOP_3VAR_2IN_1OUT(ulonglong3, -) +DECLOP_3VAR_2IN_1OUT(ulonglong3, *) +DECLOP_3VAR_2IN_1OUT(ulonglong3, /) +DECLOP_3VAR_2IN_1OUT(ulonglong3, %) +DECLOP_3VAR_2IN_1OUT(ulonglong3, &) +DECLOP_3VAR_2IN_1OUT(ulonglong3, |) +DECLOP_3VAR_2IN_1OUT(ulonglong3, ^) +DECLOP_3VAR_2IN_1OUT(ulonglong3, <<) +DECLOP_3VAR_2IN_1OUT(ulonglong3, >>) + +DECLOP_3VAR_ASSIGN(ulonglong3, +=) +DECLOP_3VAR_ASSIGN(ulonglong3, -=) +DECLOP_3VAR_ASSIGN(ulonglong3, *=) +DECLOP_3VAR_ASSIGN(ulonglong3, /=) +DECLOP_3VAR_ASSIGN(ulonglong3, %=) +DECLOP_3VAR_ASSIGN(ulonglong3, &=) +DECLOP_3VAR_ASSIGN(ulonglong3, |=) +DECLOP_3VAR_ASSIGN(ulonglong3, ^=) +DECLOP_3VAR_ASSIGN(ulonglong3, <<=) +DECLOP_3VAR_ASSIGN(ulonglong3, >>=) + +DECLOP_3VAR_PREOP(ulonglong3, ++) +DECLOP_3VAR_PREOP(ulonglong3, --) + +DECLOP_3VAR_POSTOP(ulonglong3, ++) +DECLOP_3VAR_POSTOP(ulonglong3, --) + +DECLOP_3VAR_COMP(ulonglong3, ==) +DECLOP_3VAR_COMP(ulonglong3, !=) +DECLOP_3VAR_COMP(ulonglong3, <) +DECLOP_3VAR_COMP(ulonglong3, >) +DECLOP_3VAR_COMP(ulonglong3, <=) +DECLOP_3VAR_COMP(ulonglong3, >=) + +DECLOP_3VAR_COMP(ulonglong3, &&) +DECLOP_3VAR_COMP(ulonglong3, ||) + +DECLOP_3VAR_1IN_1OUT(ulonglong3, ~) +DECLOP_3VAR_1IN_BOOLOUT(ulonglong3, !) + +DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, unsigned char) +DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, signed char) +DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, unsigned short) +DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, signed short) +DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, unsigned int) +DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, signed int) +DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, float) +DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, unsigned long) +DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, signed long) +DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, double) +DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, unsigned long long) +DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, signed long long) + +// UNSIGNED LONGLONG4 + +DECLOP_4VAR_2IN_1OUT(ulonglong4, +) +DECLOP_4VAR_2IN_1OUT(ulonglong4, -) +DECLOP_4VAR_2IN_1OUT(ulonglong4, *) +DECLOP_4VAR_2IN_1OUT(ulonglong4, /) +DECLOP_4VAR_2IN_1OUT(ulonglong4, %) +DECLOP_4VAR_2IN_1OUT(ulonglong4, &) +DECLOP_4VAR_2IN_1OUT(ulonglong4, |) +DECLOP_4VAR_2IN_1OUT(ulonglong4, ^) +DECLOP_4VAR_2IN_1OUT(ulonglong4, <<) +DECLOP_4VAR_2IN_1OUT(ulonglong4, >>) + +DECLOP_4VAR_ASSIGN(ulonglong4, +=) +DECLOP_4VAR_ASSIGN(ulonglong4, -=) +DECLOP_4VAR_ASSIGN(ulonglong4, *=) +DECLOP_4VAR_ASSIGN(ulonglong4, /=) +DECLOP_4VAR_ASSIGN(ulonglong4, %=) +DECLOP_4VAR_ASSIGN(ulonglong4, &=) +DECLOP_4VAR_ASSIGN(ulonglong4, |=) +DECLOP_4VAR_ASSIGN(ulonglong4, ^=) +DECLOP_4VAR_ASSIGN(ulonglong4, <<=) +DECLOP_4VAR_ASSIGN(ulonglong4, >>=) + +DECLOP_4VAR_PREOP(ulonglong4, ++) +DECLOP_4VAR_PREOP(ulonglong4, --) + +DECLOP_4VAR_POSTOP(ulonglong4, ++) +DECLOP_4VAR_POSTOP(ulonglong4, --) + +DECLOP_4VAR_COMP(ulonglong4, ==) +DECLOP_4VAR_COMP(ulonglong4, !=) +DECLOP_4VAR_COMP(ulonglong4, <) +DECLOP_4VAR_COMP(ulonglong4, >) +DECLOP_4VAR_COMP(ulonglong4, <=) +DECLOP_4VAR_COMP(ulonglong4, >=) + +DECLOP_4VAR_COMP(ulonglong4, &&) +DECLOP_4VAR_COMP(ulonglong4, ||) + +DECLOP_4VAR_1IN_1OUT(ulonglong4, ~) +DECLOP_4VAR_1IN_BOOLOUT(ulonglong4, !) + +DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, unsigned char) +DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, signed char) +DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, unsigned short) +DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, signed short) +DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, unsigned int) +DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, signed int) +DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, float) +DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, unsigned long) +DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, signed long) +DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, double) +DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, unsigned long long) +DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, signed long long) + +// SIGNED LONGLONG1 + +DECLOP_1VAR_2IN_1OUT(longlong1, +) +DECLOP_1VAR_2IN_1OUT(longlong1, -) +DECLOP_1VAR_2IN_1OUT(longlong1, *) +DECLOP_1VAR_2IN_1OUT(longlong1, /) +DECLOP_1VAR_2IN_1OUT(longlong1, %) +DECLOP_1VAR_2IN_1OUT(longlong1, &) +DECLOP_1VAR_2IN_1OUT(longlong1, |) +DECLOP_1VAR_2IN_1OUT(longlong1, ^) +DECLOP_1VAR_2IN_1OUT(longlong1, <<) +DECLOP_1VAR_2IN_1OUT(longlong1, >>) + + +DECLOP_1VAR_ASSIGN(longlong1, +=) +DECLOP_1VAR_ASSIGN(longlong1, -=) +DECLOP_1VAR_ASSIGN(longlong1, *=) +DECLOP_1VAR_ASSIGN(longlong1, /=) +DECLOP_1VAR_ASSIGN(longlong1, %=) +DECLOP_1VAR_ASSIGN(longlong1, &=) +DECLOP_1VAR_ASSIGN(longlong1, |=) +DECLOP_1VAR_ASSIGN(longlong1, ^=) +DECLOP_1VAR_ASSIGN(longlong1, <<=) +DECLOP_1VAR_ASSIGN(longlong1, >>=) + +DECLOP_1VAR_PREOP(longlong1, ++) +DECLOP_1VAR_PREOP(longlong1, --) + +DECLOP_1VAR_POSTOP(longlong1, ++) +DECLOP_1VAR_POSTOP(longlong1, --) + +DECLOP_1VAR_COMP(longlong1, ==) +DECLOP_1VAR_COMP(longlong1, !=) +DECLOP_1VAR_COMP(longlong1, <) +DECLOP_1VAR_COMP(longlong1, >) +DECLOP_1VAR_COMP(longlong1, <=) +DECLOP_1VAR_COMP(longlong1, >=) + +DECLOP_1VAR_COMP(longlong1, &&) +DECLOP_1VAR_COMP(longlong1, ||) + +DECLOP_1VAR_1IN_1OUT(longlong1, ~) +DECLOP_1VAR_1IN_BOOLOUT(longlong1, !) + +DECLOP_1VAR_SCALE_PRODUCT(longlong1, unsigned char) +DECLOP_1VAR_SCALE_PRODUCT(longlong1, signed char) +DECLOP_1VAR_SCALE_PRODUCT(longlong1, unsigned short) +DECLOP_1VAR_SCALE_PRODUCT(longlong1, signed short) +DECLOP_1VAR_SCALE_PRODUCT(longlong1, unsigned int) +DECLOP_1VAR_SCALE_PRODUCT(longlong1, signed int) +DECLOP_1VAR_SCALE_PRODUCT(longlong1, float) +DECLOP_1VAR_SCALE_PRODUCT(longlong1, unsigned long) +DECLOP_1VAR_SCALE_PRODUCT(longlong1, signed long) +DECLOP_1VAR_SCALE_PRODUCT(longlong1, double) +DECLOP_1VAR_SCALE_PRODUCT(longlong1, unsigned long long) +DECLOP_1VAR_SCALE_PRODUCT(longlong1, signed long long) + +// SIGNED LONGLONG2 + +DECLOP_2VAR_2IN_1OUT(longlong2, +) +DECLOP_2VAR_2IN_1OUT(longlong2, -) +DECLOP_2VAR_2IN_1OUT(longlong2, *) +DECLOP_2VAR_2IN_1OUT(longlong2, /) +DECLOP_2VAR_2IN_1OUT(longlong2, %) +DECLOP_2VAR_2IN_1OUT(longlong2, &) +DECLOP_2VAR_2IN_1OUT(longlong2, |) +DECLOP_2VAR_2IN_1OUT(longlong2, ^) +DECLOP_2VAR_2IN_1OUT(longlong2, <<) +DECLOP_2VAR_2IN_1OUT(longlong2, >>) + +DECLOP_2VAR_ASSIGN(longlong2, +=) +DECLOP_2VAR_ASSIGN(longlong2, -=) +DECLOP_2VAR_ASSIGN(longlong2, *=) +DECLOP_2VAR_ASSIGN(longlong2, /=) +DECLOP_2VAR_ASSIGN(longlong2, %=) +DECLOP_2VAR_ASSIGN(longlong2, &=) +DECLOP_2VAR_ASSIGN(longlong2, |=) +DECLOP_2VAR_ASSIGN(longlong2, ^=) +DECLOP_2VAR_ASSIGN(longlong2, <<=) +DECLOP_2VAR_ASSIGN(longlong2, >>=) + +DECLOP_2VAR_PREOP(longlong2, ++) +DECLOP_2VAR_PREOP(longlong2, --) + +DECLOP_2VAR_POSTOP(longlong2, ++) +DECLOP_2VAR_POSTOP(longlong2, --) + +DECLOP_2VAR_COMP(longlong2, ==) +DECLOP_2VAR_COMP(longlong2, !=) +DECLOP_2VAR_COMP(longlong2, <) +DECLOP_2VAR_COMP(longlong2, >) +DECLOP_2VAR_COMP(longlong2, <=) +DECLOP_2VAR_COMP(longlong2, >=) + +DECLOP_2VAR_COMP(longlong2, &&) +DECLOP_2VAR_COMP(longlong2, ||) + +DECLOP_2VAR_1IN_1OUT(longlong2, ~) +DECLOP_2VAR_1IN_BOOLOUT(longlong2, !) + +DECLOP_2VAR_SCALE_PRODUCT(longlong2, unsigned char) +DECLOP_2VAR_SCALE_PRODUCT(longlong2, signed char) +DECLOP_2VAR_SCALE_PRODUCT(longlong2, unsigned short) +DECLOP_2VAR_SCALE_PRODUCT(longlong2, signed short) +DECLOP_2VAR_SCALE_PRODUCT(longlong2, unsigned int) +DECLOP_2VAR_SCALE_PRODUCT(longlong2, signed int) +DECLOP_2VAR_SCALE_PRODUCT(longlong2, float) +DECLOP_2VAR_SCALE_PRODUCT(longlong2, unsigned long) +DECLOP_2VAR_SCALE_PRODUCT(longlong2, signed long) +DECLOP_2VAR_SCALE_PRODUCT(longlong2, double) +DECLOP_2VAR_SCALE_PRODUCT(longlong2, unsigned long long) +DECLOP_2VAR_SCALE_PRODUCT(longlong2, signed long long) + +// SIGNED LONGLONG3 + +DECLOP_3VAR_2IN_1OUT(longlong3, +) +DECLOP_3VAR_2IN_1OUT(longlong3, -) +DECLOP_3VAR_2IN_1OUT(longlong3, *) +DECLOP_3VAR_2IN_1OUT(longlong3, /) +DECLOP_3VAR_2IN_1OUT(longlong3, %) +DECLOP_3VAR_2IN_1OUT(longlong3, &) +DECLOP_3VAR_2IN_1OUT(longlong3, |) +DECLOP_3VAR_2IN_1OUT(longlong3, ^) +DECLOP_3VAR_2IN_1OUT(longlong3, <<) +DECLOP_3VAR_2IN_1OUT(longlong3, >>) + +DECLOP_3VAR_ASSIGN(longlong3, +=) +DECLOP_3VAR_ASSIGN(longlong3, -=) +DECLOP_3VAR_ASSIGN(longlong3, *=) +DECLOP_3VAR_ASSIGN(longlong3, /=) +DECLOP_3VAR_ASSIGN(longlong3, %=) +DECLOP_3VAR_ASSIGN(longlong3, &=) +DECLOP_3VAR_ASSIGN(longlong3, |=) +DECLOP_3VAR_ASSIGN(longlong3, ^=) +DECLOP_3VAR_ASSIGN(longlong3, <<=) +DECLOP_3VAR_ASSIGN(longlong3, >>=) + +DECLOP_3VAR_PREOP(longlong3, ++) +DECLOP_3VAR_PREOP(longlong3, --) + +DECLOP_3VAR_POSTOP(longlong3, ++) +DECLOP_3VAR_POSTOP(longlong3, --) + +DECLOP_3VAR_COMP(longlong3, ==) +DECLOP_3VAR_COMP(longlong3, !=) +DECLOP_3VAR_COMP(longlong3, <) +DECLOP_3VAR_COMP(longlong3, >) +DECLOP_3VAR_COMP(longlong3, <=) +DECLOP_3VAR_COMP(longlong3, >=) + +DECLOP_3VAR_COMP(longlong3, &&) +DECLOP_3VAR_COMP(longlong3, ||) + +DECLOP_3VAR_1IN_1OUT(longlong3, ~) +DECLOP_3VAR_1IN_BOOLOUT(longlong3, !) + +DECLOP_3VAR_SCALE_PRODUCT(longlong3, unsigned char) +DECLOP_3VAR_SCALE_PRODUCT(longlong3, signed char) +DECLOP_3VAR_SCALE_PRODUCT(longlong3, unsigned short) +DECLOP_3VAR_SCALE_PRODUCT(longlong3, signed short) +DECLOP_3VAR_SCALE_PRODUCT(longlong3, unsigned int) +DECLOP_3VAR_SCALE_PRODUCT(longlong3, signed int) +DECLOP_3VAR_SCALE_PRODUCT(longlong3, float) +DECLOP_3VAR_SCALE_PRODUCT(longlong3, unsigned long) +DECLOP_3VAR_SCALE_PRODUCT(longlong3, signed long) +DECLOP_3VAR_SCALE_PRODUCT(longlong3, double) +DECLOP_3VAR_SCALE_PRODUCT(longlong3, unsigned long long) +DECLOP_3VAR_SCALE_PRODUCT(longlong3, signed long long) + +// SIGNED LONGLONG4 + +DECLOP_4VAR_2IN_1OUT(longlong4, +) +DECLOP_4VAR_2IN_1OUT(longlong4, -) +DECLOP_4VAR_2IN_1OUT(longlong4, *) +DECLOP_4VAR_2IN_1OUT(longlong4, /) +DECLOP_4VAR_2IN_1OUT(longlong4, %) +DECLOP_4VAR_2IN_1OUT(longlong4, &) +DECLOP_4VAR_2IN_1OUT(longlong4, |) +DECLOP_4VAR_2IN_1OUT(longlong4, ^) +DECLOP_4VAR_2IN_1OUT(longlong4, <<) +DECLOP_4VAR_2IN_1OUT(longlong4, >>) + +DECLOP_4VAR_ASSIGN(longlong4, +=) +DECLOP_4VAR_ASSIGN(longlong4, -=) +DECLOP_4VAR_ASSIGN(longlong4, *=) +DECLOP_4VAR_ASSIGN(longlong4, /=) +DECLOP_4VAR_ASSIGN(longlong4, %=) +DECLOP_4VAR_ASSIGN(longlong4, &=) +DECLOP_4VAR_ASSIGN(longlong4, |=) +DECLOP_4VAR_ASSIGN(longlong4, ^=) +DECLOP_4VAR_ASSIGN(longlong4, <<=) +DECLOP_4VAR_ASSIGN(longlong4, >>=) + +DECLOP_4VAR_PREOP(longlong4, ++) +DECLOP_4VAR_PREOP(longlong4, --) + +DECLOP_4VAR_POSTOP(longlong4, ++) +DECLOP_4VAR_POSTOP(longlong4, --) + +DECLOP_4VAR_COMP(longlong4, ==) +DECLOP_4VAR_COMP(longlong4, !=) +DECLOP_4VAR_COMP(longlong4, <) +DECLOP_4VAR_COMP(longlong4, >) +DECLOP_4VAR_COMP(longlong4, <=) +DECLOP_4VAR_COMP(longlong4, >=) + +DECLOP_4VAR_COMP(longlong4, &&) +DECLOP_4VAR_COMP(longlong4, ||) + +DECLOP_4VAR_1IN_1OUT(longlong4, ~) +DECLOP_4VAR_1IN_BOOLOUT(longlong4, !) + +DECLOP_4VAR_SCALE_PRODUCT(longlong4, unsigned char) +DECLOP_4VAR_SCALE_PRODUCT(longlong4, signed char) +DECLOP_4VAR_SCALE_PRODUCT(longlong4, unsigned short) +DECLOP_4VAR_SCALE_PRODUCT(longlong4, signed short) +DECLOP_4VAR_SCALE_PRODUCT(longlong4, unsigned int) +DECLOP_4VAR_SCALE_PRODUCT(longlong4, signed int) +DECLOP_4VAR_SCALE_PRODUCT(longlong4, float) +DECLOP_4VAR_SCALE_PRODUCT(longlong4, unsigned long) +DECLOP_4VAR_SCALE_PRODUCT(longlong4, signed long) +DECLOP_4VAR_SCALE_PRODUCT(longlong4, double) +DECLOP_4VAR_SCALE_PRODUCT(longlong4, unsigned long long) +DECLOP_4VAR_SCALE_PRODUCT(longlong4, signed long long) #endif From 042de3e1757b7e5162664642cd3b42a3105b5507 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Fri, 19 May 2017 17:22:14 +0300 Subject: [PATCH 38/77] [HIP] [HIPIFY] [FIX] cuModuleLoadDataEx -> hipModuleLoadDataEx https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/issues/81 1. Do not use JIT options on HCC path, call hipModuleLoadData instead. 2. NVCC path is unchanged, to call cuModuleLoadDataEx with all options. 3. Get rid of manual hipification, based on #ifdef #else for NVCC/HIP. 4. Update documentation accordingly. [ROCm/hip commit: 270f643c9c60b1f6b45f74ef094e83706769f2c7] --- .../docs/markdown/hip_porting_driver_api.md | 65 +++++++++++-------- projects/hip/hipify-clang/src/Cuda2Hip.cpp | 38 +++++------ .../include/hip/hcc_detail/hip_runtime_api.h | 19 ++++-- projects/hip/include/hip/hip_runtime_api.h | 24 +++++++ .../include/hip/nvcc_detail/hip_runtime_api.h | 44 ++++++++++--- projects/hip/src/hip_module.cpp | 6 +- 6 files changed, 134 insertions(+), 62 deletions(-) diff --git a/projects/hip/docs/markdown/hip_porting_driver_api.md b/projects/hip/docs/markdown/hip_porting_driver_api.md index dd3b9c3e86..0912e676cc 100644 --- a/projects/hip/docs/markdown/hip_porting_driver_api.md +++ b/projects/hip/docs/markdown/hip_porting_driver_api.md @@ -98,48 +98,57 @@ HIP/HCC will push primary context to context stack when it is empty. This can ha #### Interoperation between HIP and CUDA Driver CUDA applications may want to mix CUDA driver code with HIP code (see example below). This table shows the type equivalence to enable this interaction. -|**HIP Type** |**CU Driver Type**|**CUDA Runtime Type**| -| ---- | ---- | ---- | -| hipModule | CUmodule | | -| hipFunction | CUfunction | | -| hipCtx_t | CUcontext | | -| hipDevice_t | CUdevice | | -| hipStream_t | CUstream | cudaStream_t | -| hipEvent_t | CUevent | cudaEvent_t | -| hipArray | CUarray | cudaArray | - -#### Compilation Flags -The hipModule interface does not support the `cuModuleLoadDataEx` function, which is used to control PTX compilation options. -HCC does not use PTX and does not support the same compilation options. -In fact, HCC code objects always contain fully compiled ISA and do not require additional compilation as part of the load step. -Code which requires this functionally should use platform-specific coding, calling `cuModuleLoadDataEx` on the NVCC path and `hipModuleLoadData` on the hcc path. -For example: +|**HIP Type** |**CU Driver Type**|**CUDA Runtime Type**| +| ---- | ---- | ---- | +| hipModule_t | CUmodule | | +| hipFunction_t | CUfunction | | +| hipCtx_t | CUcontext | | +| hipDevice_t | CUdevice | | +| hipStream_t | CUstream | cudaStream_t | +| hipEvent_t | CUevent | cudaEvent_t | +| hipArray | CUarray | cudaArray | +#### Compilation Options +The `hipModule_t` interface does not support `cuModuleLoadDataEx` function, which is used to control PTX compilation options. +HCC does not use PTX and does not support these compilation options. +In fact, HCC code objects always contain fully compiled ISA and do not require additional compilation as a part of the load step. +The corresponding HIP function `hipModuleLoadDataEx` behaves as `hipModuleLoadData` on HCC path (compilation options are not used) and as `cuModuleLoadDataEx` on NVCC path. +For example (CUDA): ``` -hipModule module; -void *imagePtr = ... ; // Somehow populate data pointer with code object +CUmodule module; +void *imagePtr = ...; // Somehow populate data pointer with code object -#ifdef __HIP_PLATFORM_NVCC__ -// Use CUDA driver API but write to hipModule since they are same type: const int numOptions = 1; CUJit_option options[numOptions]; void * optionValues[numOptions]; options[0] = CU_JIT_MAX_REGISTERS; -unsigned maxRegs=15; -optionValues[0] = (void*) (&maxRegs); +unsigned maxRegs = 15; +optionValues[0] = (void*)(&maxRegs); cuModuleLoadDataEx(module, imagePtr, numOptions, options, optionValues); -#else // __HIP_PLATFORM_HCC__ +CUfunction k; +cuModuleGetFunction(&k, module, "myKernel"); +``` +HIP: +``` +hipModule_t module; +void *imagePtr = ...; // Somehow populate data pointer with code object -// HCC path does not support or require JIT options, so just load the module. -hipModuleLoadData(&module, imagePtr); +const int numOptions = 1; +hipJitOption options[numOptions]; +void * optionValues[numOptions]; -#endif +options[0] = hipJitOptionMaxRegisters; +unsigned maxRegs = 15; +optionValues[0] = (void*)(&maxRegs); -// Back to unified code - both paths above loaded the "module" variable. -hipFunction k; +// hipModuleLoadData(module, imagePtr) will be called on HCC path, JIT options will not be used, and +// cupModuleLoadDataEx(module, imagePtr, numOptions, options, optionValues) will be called on NVCC path +hipModuleLoadDataEx(module, imagePtr, numOptions, options, optionValues); + +hipFunction_t k; hipModuleGetFunction(&k, module, "myKernel"); ``` diff --git a/projects/hip/hipify-clang/src/Cuda2Hip.cpp b/projects/hip/hipify-clang/src/Cuda2Hip.cpp index 0c6b0f1efc..e07baab3fd 100644 --- a/projects/hip/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/hip/hipify-clang/src/Cuda2Hip.cpp @@ -678,24 +678,24 @@ struct cuda2hipMap { cuda2hipRename["CU_PREFER_PTX"] = {"hipJitFallbackPreferPtx", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["CU_PREFER_BINARY"] = {"hipJitFallbackPreferBinary", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; // enum CUjit_option/CUjit_option_enum - cuda2hipRename["CUjit_option"] = {"hipJitOption", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) - cuda2hipRename["CUjit_option_enum"] = {"hipJitOption", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_MAX_REGISTERS"] = {"hipJitOptionMaxRegisters", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_THREADS_PER_BLOCK"] = {"hipJitOptionThreadsPerBlock", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_WALL_TIME"] = {"hipJitOptionWallTime", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_INFO_LOG_BUFFER"] = {"hipJitOptionInfoLogBuffer", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES"] = {"hipJitOptionInfoLogBufferSizeBytes", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_ERROR_LOG_BUFFER"] = {"hipJitOptionErrorLogBuffer", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES"] = {"hipJitOptionErrorLogBufferSizeBytes", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_OPTIMIZATION_LEVEL"] = {"hipJitOptionOptimizationLevel", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_TARGET_FROM_CUCONTEXT"] = {"hipJitOptionTargetFromContext", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_TARGET"] = {"hipJitOptionTarget", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_FALLBACK_STRATEGY"] = {"hipJitOptionFallbackStrategy", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_GENERATE_DEBUG_INFO"] = {"hipJitOptionGenerateDebugInfo", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_LOG_VERBOSE"] = {"hipJitOptionLogVerbose", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_GENERATE_LINE_INFO"] = {"hipJitOptionLogVerbose", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_CACHE_MODE"] = {"hipJitOptionCacheMode", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_NUM_OPTIONS"] = {"hipJitOptionNumOptions", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CUjit_option"] = {"hipJitOption", CONV_JIT, API_DRIVER}; // API_Runtime ANALOGUE (no) + cuda2hipRename["CUjit_option_enum"] = {"hipJitOption", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_MAX_REGISTERS"] = {"hipJitOptionMaxRegisters", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_THREADS_PER_BLOCK"] = {"hipJitOptionThreadsPerBlock", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_WALL_TIME"] = {"hipJitOptionWallTime", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_INFO_LOG_BUFFER"] = {"hipJitOptionInfoLogBuffer", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES"] = {"hipJitOptionInfoLogBufferSizeBytes", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_ERROR_LOG_BUFFER"] = {"hipJitOptionErrorLogBuffer", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES"] = {"hipJitOptionErrorLogBufferSizeBytes", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_OPTIMIZATION_LEVEL"] = {"hipJitOptionOptimizationLevel", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_TARGET_FROM_CUCONTEXT"] = {"hipJitOptionTargetFromContext", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_TARGET"] = {"hipJitOptionTarget", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_FALLBACK_STRATEGY"] = {"hipJitOptionFallbackStrategy", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_GENERATE_DEBUG_INFO"] = {"hipJitOptionGenerateDebugInfo", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_LOG_VERBOSE"] = {"hipJitOptionLogVerbose", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_GENERATE_LINE_INFO"] = {"hipJitOptionGenerateLineInfo", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_CACHE_MODE"] = {"hipJitOptionCacheMode", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_NUM_OPTIONS"] = {"hipJitOptionNumOptions", CONV_JIT, API_DRIVER}; // enum CUjit_target/CUjit_target_enum cuda2hipRename["CUjit_target"] = {"hipJitTarget", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) cuda2hipRename["CUjit_target_enum"] = {"hipJitTarget", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; @@ -905,7 +905,7 @@ struct cuda2hipMap { cuda2hipRename["cuModuleLoad"] = {"hipModuleLoad", CONV_MODULE, API_DRIVER}; cuda2hipRename["cuModuleLoadData"] = {"hipModuleLoadData", CONV_MODULE, API_DRIVER}; // unsupported yet by HIP - cuda2hipRename["cuModuleLoadDataEx"] = {"hipModuleLoadDataEx", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuModuleLoadDataEx"] = {"hipModuleLoadDataEx", CONV_MODULE, API_DRIVER}; cuda2hipRename["cuModuleLoadFatBinary"] = {"hipModuleLoadFatBinary", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuModuleUnload"] = {"hipModuleUnload", CONV_MODULE, API_DRIVER}; diff --git a/projects/hip/include/hip/hcc_detail/hip_runtime_api.h b/projects/hip/include/hip/hcc_detail/hip_runtime_api.h index e1aecef1e8..34ed2ed5ce 100644 --- a/projects/hip/include/hip/hcc_detail/hip_runtime_api.h +++ b/projects/hip/include/hip/hcc_detail/hip_runtime_api.h @@ -1915,7 +1915,7 @@ hipError_t hipModuleGetFunction(hipFunction_t *function, hipModule_t module, con * @brief returns device memory pointer and size of the kernel present in the module with symbol @p name * * @param [out] dptr - * @param [out[ bytes + * @param [out] bytes * @param [in] hmod * @param [in] name * @@ -1923,7 +1923,6 @@ hipError_t hipModuleGetFunction(hipFunction_t *function, hipModule_t module, con */ hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, hipModule_t hmod, const char *name); - /** * @brief builds module from code object which resides in host memory. Image is pointer to that location. * @@ -1934,11 +1933,23 @@ hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, hipModule_t h */ hipError_t hipModuleLoadData(hipModule_t *module, const void *image); +/** +* @brief builds module from code object which resides in host memory. Image is pointer to that location. Options are not used. hipModuleLoadData is called. +* +* @param [in] image +* @param [out] module +* @param [in] number of options +* @param [in] options for JIT +* @param [in] option values for JIT +* +* @returns hipSuccess, hipErrorNotInitialized, hipErrorOutOfMemory, hipErrorNotInitialized +*/ +hipError_t hipModuleLoadDataEx(hipModule_t *module, const void *image, unsigned int numOptions, hipJitOption *options, void **optionValues); /** * @brief launches kernel f with launch parameters and shared memory on stream with arguments passed to kernelparams or extra * - * @param [in[ f Kernel to launch. + * @param [in] f Kernel to launch. * @param [in] gridDimX X grid dimension specified as multiple of blockDimX. * @param [in] gridDimY Y grid dimension specified as multiple of blockDimY. * @param [in] gridDimZ Z grid dimension specified as multiple of blockDimZ. @@ -1946,7 +1957,7 @@ hipError_t hipModuleLoadData(hipModule_t *module, const void *image); * @param [in] blockDimY Y grid dimension specified in work-items * @param [in] blockDimZ Z grid dimension specified in work-items * @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel. The kernel can access this with HIP_DYNAMIC_SHARED. - * @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case th default stream is used with associated synchronization rules. + * @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case th default stream is used with associated synchronization rules. * @param [in] kernelParams * @param [in] extra Pointer to kernel arguments. These are passed directly to the kernel and must be in the memory layout and alignment expected by the kernel. * diff --git a/projects/hip/include/hip/hip_runtime_api.h b/projects/hip/include/hip/hip_runtime_api.h index 8eae1d6a3a..fa54dda5dc 100644 --- a/projects/hip/include/hip/hip_runtime_api.h +++ b/projects/hip/include/hip/hip_runtime_api.h @@ -250,6 +250,30 @@ typedef enum hipDeviceAttribute_t { hipDeviceAttributeIsMultiGpuBoard, ///< Multiple GPU devices. } hipDeviceAttribute_t; +/* +* @brief hipJitOption +* @enum +* @ingroup Enumerations +*/ +typedef enum hipJitOption { + hipJitOptionMaxRegisters = 0, + hipJitOptionThreadsPerBlock, + hipJitOptionWallTime, + hipJitOptionInfoLogBuffer, + hipJitOptionInfoLogBufferSizeBytes, + hipJitOptionErrorLogBuffer, + hipJitOptionErrorLogBufferSizeBytes, + hipJitOptionOptimizationLevel, + hipJitOptionTargetFromContext, + hipJitOptionTarget, + hipJitOptionFallbackStrategy, + hipJitOptionGenerateDebugInfo, + hipJitOptionLogVerbose, + hipJitOptionGenerateLineInfo, + hipJitOptionCacheMode, + hipJitOptionNumOptions +} hipJitOption; + /** * @} */ diff --git a/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h b/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h index 69a9b46570..01a93f7ba4 100644 --- a/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h +++ b/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h @@ -54,26 +54,44 @@ hipMemcpyHostToHost #define hipFilterModePoint cudaFilterModePoint //! Flags that can be used with hipEventCreateWithFlags: -#define hipEventDefault cudaEventDefault -#define hipEventBlockingSync cudaEventBlockingSync -#define hipEventDisableTiming cudaEventDisableTiming -#define hipEventInterprocess cudaEventInterprocess +#define hipEventDefault cudaEventDefault +#define hipEventBlockingSync cudaEventBlockingSync +#define hipEventDisableTiming cudaEventDisableTiming +#define hipEventInterprocess cudaEventInterprocess #define hipEventDisableSystemRelease cudaEventDefault /* no-op on CUDA platform */ -#define hipHostMallocDefault cudaHostAllocDefault -#define hipHostMallocPortable cudaHostAllocPortable -#define hipHostMallocMapped cudaHostAllocMapped +#define hipHostMallocDefault cudaHostAllocDefault +#define hipHostMallocPortable cudaHostAllocPortable +#define hipHostMallocMapped cudaHostAllocMapped #define hipHostMallocWriteCombined cudaHostAllocWriteCombined #define hipHostRegisterPortable cudaHostRegisterPortable -#define hipHostRegisterMapped cudaHostRegisterMapped +#define hipHostRegisterMapped cudaHostRegisterMapped #define HIP_LAUNCH_PARAM_BUFFER_POINTER CU_LAUNCH_PARAM_BUFFER_POINTER -#define HIP_LAUNCH_PARAM_BUFFER_SIZE CU_LAUNCH_PARAM_BUFFER_SIZE +#define HIP_LAUNCH_PARAM_BUFFER_SIZE CU_LAUNCH_PARAM_BUFFER_SIZE #define HIP_LAUNCH_PARAM_END CU_LAUNCH_PARAM_END #define hipLimitMallocHeapSize cudaLimitMallocHeapSize -#define hipIpcMemLazyEnablePeerAccess cudaIpcMemLazyEnablePeerAccess +#define hipIpcMemLazyEnablePeerAccess cudaIpcMemLazyEnablePeerAccess + +// enum CUjit_option redefines +#define hipJitOptionMaxRegisters CU_JIT_MAX_REGISTERS +#define hipJitOptionThreadsPerBlock CU_JIT_THREADS_PER_BLOCK +#define hipJitOptionWallTime CU_JIT_WALL_TIME +#define hipJitOptionInfoLogBuffer CU_JIT_INFO_LOG_BUFFER +#define hipJitOptionInfoLogBufferSizeBytes CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES +#define hipJitOptionErrorLogBuffer CU_JIT_ERROR_LOG_BUFFER +#define hipJitOptionErrorLogBufferSizeBytes CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES +#define hipJitOptionOptimizationLevel CU_JIT_OPTIMIZATION_LEVEL +#define hipJitOptionTargetFromContext CU_JIT_TARGET_FROM_CUCONTEXT +#define hipJitOptionTarget CU_JIT_TARGET +#define hipJitOptionFallbackStrategy CU_JIT_FALLBACK_STRATEGY +#define hipJitOptionGenerateDebugInfo CU_JIT_GENERATE_DEBUG_INFO +#define hipJitOptionLogVerbose CU_JIT_LOG_VERBOSE +#define hipJitOptionGenerateLineInfo CU_JIT_GENERATE_LINE_INFO +#define hipJitOptionCacheMode CU_JIT_CACHE_MODE +#define hipJitOptionNumOptions CU_JIT_NUM_OPTIONS typedef cudaEvent_t hipEvent_t; typedef cudaStream_t hipStream_t; @@ -84,6 +102,7 @@ typedef cudaFuncCache hipFuncCache_t; typedef CUcontext hipCtx_t; typedef CUsharedconfig hipSharedMemConfig; typedef CUfunc_cache hipFuncCache; +typedef CUjit_option hipJitOption; typedef CUdevice hipDevice_t; typedef CUmodule hipModule_t; typedef CUfunction hipFunction_t; @@ -894,6 +913,11 @@ inline static hipError_t hipModuleLoadData(hipModule_t *module, const void *imag return hipCUResultTohipError(cuModuleLoadData(module, image)); } +inline static hipError_t hipModuleLoadDataEx(hipModule_t *module, const void *image, unsigned int numOptions, hipJitOption *options, void **optionValues) +{ + return hipCUResultTohipError(cuModuleLoadDataEx(module, image, numOptions, options, optionValues)); +} + inline static hipError_t hipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, diff --git a/projects/hip/src/hip_module.cpp b/projects/hip/src/hip_module.cpp index da01f23769..d364a6b519 100644 --- a/projects/hip/src/hip_module.cpp +++ b/projects/hip/src/hip_module.cpp @@ -525,7 +525,6 @@ hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, } } - hipError_t hipModuleLoadData(hipModule_t *module, const void *image) { HIP_INIT_API(module, image); @@ -575,3 +574,8 @@ hipError_t hipModuleLoadData(hipModule_t *module, const void *image) } return ihipLogStatus(ret); } + +hipError_t hipModuleLoadDataEx(hipModule_t *module, const void *image, unsigned int numOptions, hipJitOption *options, void **optionValues) +{ + return hipModuleLoadData(module, image); +} From cb7e073c42922bf9f2df233a3d9a10c5f2dfc613 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Fri, 19 May 2017 17:39:09 +0300 Subject: [PATCH 39/77] [HIPIFY] [FIX] [HIPIFY] Matcher for pointer to enum var declaration is missing. https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/issues/79 Example from CUDA 8.0.44 sample (CUDASamples\0_Simple\matrixMulDrv\matrixMulDrv.cpp): CUjit_option *jitOptions = new CUjit_option[jitNumOptions]; where CUjit_option is enum, should be: hipJitOption *jitOptions = new hipJitOption[jitNumOptions]; [TODO] 1. new CUjit_option -> new hipJitOption. Matcher for new operator is missing: https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/issues/79 2. Merge matchers cudaEnumDecl and cudaEnumVarPtr. [ROCm/hip commit: b5a1d47e68e54bfd82df38199f436ece116992bb] --- projects/hip/hipify-clang/src/Cuda2Hip.cpp | 51 ++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/projects/hip/hipify-clang/src/Cuda2Hip.cpp b/projects/hip/hipify-clang/src/Cuda2Hip.cpp index e07baab3fd..34d3d6e24f 100644 --- a/projects/hip/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/hip/hipify-clang/src/Cuda2Hip.cpp @@ -2905,6 +2905,51 @@ private: return false; } + bool cudaEnumVarPtr(const MatchFinder::MatchResult &Result) { + if (const VarDecl *enumVarPtr = Result.Nodes.getNodeAs("cudaEnumVarPtr")) { + const Type *t = enumVarPtr->getType().getTypePtrOrNull(); + if (t) { + QualType QT = t->getPointeeType(); + std::string name = QT.getAsString(); + QT = enumVarPtr->getType().getUnqualifiedType(); + std::string name_unqualified = QT.getAsString(); + if ((name_unqualified.find(' ') == std::string::npos && name.find(' ') == std::string::npos) || name.empty()) { + name = name_unqualified; + } + // Workaround for enum VarDecl as param decl, declared with enum type specifier + // Example: void func(enum cudaMemcpyKind kind); + //------------------------------------------------- + SourceManager *SM = Result.SourceManager; + TypeLoc TL = enumVarPtr->getTypeSourceInfo()->getTypeLoc(); + SourceLocation sl(TL.getUnqualifiedLoc().getLocStart()); + SourceLocation end(TL.getUnqualifiedLoc().getLocEnd()); + size_t repLength = SM->getCharacterData(end) - SM->getCharacterData(sl); + StringRef sfull = StringRef(SM->getCharacterData(sl), repLength); + size_t offset = sfull.find(name); + if (offset > 0) { + sl = sl.getLocWithOffset(offset); + } + //------------------------------------------------- + const auto found = N.cuda2hipRename.find(name); + if (found != N.cuda2hipRename.end()) { + updateCounters(found->second, name); + if (!found->second.unsupported) { + StringRef repName = found->second.hipName; + Replacement Rep(*SM, sl, name.size(), repName); + FullSourceLoc fullSL(sl, *SM); + insertReplacement(Rep, fullSL); + } + } + else { + std::string msg = "the following reference is not handled: '" + name + "' [enum var ptr]."; + printHipifyMessage(*SM, sl, msg); + } + } + return true; + } + return false; + } + bool cudaTypedefVar(const MatchFinder::MatchResult &Result) { if (const VarDecl *typedefVar = Result.Nodes.getNodeAs("cudaTypedefVar")) { QualType QT = typedefVar->getType(); @@ -3185,6 +3230,7 @@ public: if (cudaBuiltin(Result)) break; if (cudaEnumConstantRef(Result)) break; if (cudaEnumDecl(Result)) break; + if (cudaEnumVarPtr(Result)) break; if (cudaTypedefVar(Result)) break; if (cudaTypedefVarPtr(Result)) break; if (cudaStructVar(Result)) break; @@ -3232,6 +3278,11 @@ void addAllMatchers(ast_matchers::MatchFinder &Finder, Cuda2HipCallback *Callbac hasType(enumDecl())) .bind("cudaEnumDecl"), Callback); + Finder.addMatcher(varDecl(isExpansionInMainFile(), + hasType(pointsTo(enumDecl( + matchesName("cu.*|CU.*"))))) + .bind("cudaEnumVarPtr"), + Callback); Finder.addMatcher(varDecl(isExpansionInMainFile(), hasType(typedefDecl(matchesName("cu.*|CU.*")))) .bind("cudaTypedefVar"), From b19e862dc4754f811ff25e5b34ed19a0ab81c3f1 Mon Sep 17 00:00:00 2001 From: pensun Date: Mon, 22 May 2017 08:52:43 -0500 Subject: [PATCH 40/77] fix GGL helper header file, reorder for C++17 Change-Id: I3d9ddfe670bf7e3e8e7bd85e52cc61f48c19c213 [ROCm/hip commit: 6d0f58b93988a03eab45ec78bdf20a280b917288] --- projects/hip/include/hip/hcc_detail/helpers.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/projects/hip/include/hip/hcc_detail/helpers.hpp b/projects/hip/include/hip/hcc_detail/helpers.hpp index 611929766b..b5502c1efb 100644 --- a/projects/hip/include/hip/hcc_detail/helpers.hpp +++ b/projects/hip/include/hip/hcc_detail/helpers.hpp @@ -102,9 +102,6 @@ namespace hip_impl // Not callable. template struct is_callable_impl : std::false_type {}; - - template - struct is_callable : is_callable_impl {}; #else template struct is_callable_impl : std::false_type {}; @@ -114,6 +111,8 @@ namespace hip_impl F(Ts...), void_t_>> : std::true_type {}; #endif + template + struct is_callable : is_callable_impl {}; #define count_macro_args_impl_hip_(\ _0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15,\ From 6e117234bfc2b8248f50b1fe8d87cb0cf8f8ae7f Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Tue, 23 May 2017 09:32:19 -0500 Subject: [PATCH 41/77] fixed erfinv build error as it is implemented in hcc Change-Id: I27a512147c53f658a63fdf3e90f5e9cfac09ada8 [ROCm/hip commit: 0559fc69e951b8547f9817ba1b2a9053626dea95] --- projects/hip/src/math_functions.cpp | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/projects/hip/src/math_functions.cpp b/projects/hip/src/math_functions.cpp index 3472216309..f1e52c4036 100644 --- a/projects/hip/src/math_functions.cpp +++ b/projects/hip/src/math_functions.cpp @@ -830,16 +830,6 @@ __host__ double erfcinv(double y) return __hip_host_erfcinv(y); } -__host__ float erfinvf(float x) -{ - return __hip_host_erfinvf(x); -} - -__host__ double erfinv(double x) -{ - return __hip_host_erfinv(x); -} - __host__ double fdivide(double x, double y) { return x/y; @@ -949,7 +939,7 @@ __host__ void sincospi(double x, double *sptr, double *cptr) __host__ float normcdfinvf(float x) { - return std::sqrt(2) * erfinv(2*x-1); + return std::sqrt(2) * erfinvf(2*x-1); } __host__ double normcdfinv(double x) From 0a43773e0de00118851628cfabfaf32f4e88a6ae Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Tue, 23 May 2017 19:45:38 +0300 Subject: [PATCH 42/77] [FIX] [HIPIFY] Matcher for new operator is missing. https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/issues/80 Example from CUDA 8.0.44 sample (CUDASamples\0_Simple\matrixMulDrv\matrixMulDrv.cpp): CUjit_option *jitOptions = new CUjit_option[jitNumOptions]; where CUjit_option is enum, should be: hipJitOption *jitOptions = new hipJitOption[jitNumOptions]; [ROCm/hip commit: 3d973dc4da7aedc0becd13523b9ee2f4e3f3d989] --- projects/hip/hipify-clang/src/Cuda2Hip.cpp | 37 ++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/projects/hip/hipify-clang/src/Cuda2Hip.cpp b/projects/hip/hipify-clang/src/Cuda2Hip.cpp index 34d3d6e24f..dcb9c3d216 100644 --- a/projects/hip/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/hip/hipify-clang/src/Cuda2Hip.cpp @@ -3095,6 +3095,35 @@ private: return false; } + bool cudaNewOperatorDecl(const MatchFinder::MatchResult &Result) { + if (const auto *newOperator = Result.Nodes.getNodeAs("cudaNewOperatorDecl")) { + const Type *t = newOperator->getType().getTypePtrOrNull(); + if (t) { + SourceManager *SM = Result.SourceManager; + TypeLoc TL = newOperator->getAllocatedTypeSourceInfo()->getTypeLoc(); + SourceLocation sl = TL.getUnqualifiedLoc().getLocStart(); + QualType QT = t->getPointeeType(); + std::string name = QT.getAsString(); + const auto found = N.cuda2hipRename.find(name); + if (found != N.cuda2hipRename.end()) { + updateCounters(found->second, name); + if (!found->second.unsupported) { + StringRef repName = found->second.hipName; + Replacement Rep(*SM, sl, name.size(), repName); + FullSourceLoc fullSL(sl, *SM); + insertReplacement(Rep, fullSL); + } + } + else { + std::string msg = "the following reference is not handled: '" + name + "' [new operator]."; + printHipifyMessage(*SM, sl, msg); + } + } + } + return false; + } + + bool cudaSharedIncompleteArrayVar(const MatchFinder::MatchResult &Result) { StringRef refName = "cudaSharedIncompleteArrayVar"; if (const VarDecl *sharedVar = Result.Nodes.getNodeAs(refName)) { @@ -3239,6 +3268,7 @@ public: if (cudaParamDecl(Result)) break; if (cudaParamDeclPtr(Result)) break; if (cudaLaunchKernel(Result)) break; + if (cudaNewOperatorDecl(Result)) break; if (cudaSharedIncompleteArrayVar(Result)) break; if (stringLiteral(Result)) break; if (unresolvedTemplateName(Result)) break; @@ -3336,6 +3366,13 @@ void addAllMatchers(ast_matchers::MatchFinder &Finder, Cuda2HipCallback *Callbac hasType(incompleteArrayType()))) .bind("cudaSharedIncompleteArrayVar"), Callback); + // Example: + // CUjit_option *jitOptions = new CUjit_option[jitNumOptions]; + // hipJitOption *jitOptions = new hipJitOption[jitNumOptions]; + Finder.addMatcher(cxxNewExpr(isExpansionInMainFile(), + hasType(pointsTo(namedDecl(matchesName("cu.*|CU.*"))))) + .bind("cudaNewOperatorDecl"), + Callback); } int64_t printStats(const std::string &csvFile, const std::string &srcFile, From 1e0046f7371abfeeafd1ba55cc61d55442b383c0 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 23 May 2017 22:59:22 -0500 Subject: [PATCH 43/77] Fix trace category for hipHostMalloc [ROCm/hip commit: 7cfe07cff4ffbc816ee99ca07316bd46c211f9a0] --- projects/hip/src/hip_memory.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/hip/src/hip_memory.cpp b/projects/hip/src/hip_memory.cpp index fc2ada134e..3f95cd22b4 100644 --- a/projects/hip/src/hip_memory.cpp +++ b/projects/hip/src/hip_memory.cpp @@ -245,7 +245,7 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) { - HIP_INIT_SPECIAL_API((TRACE_MCMD), ptr, sizeBytes, flags); + HIP_INIT_SPECIAL_API((TRACE_MEM), ptr, sizeBytes, flags); HIP_SET_DEVICE(); hipError_t hip_status = hipSuccess; From 97bdbe659098538cf2994904f38c0066a895ffc0 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 23 May 2017 22:59:54 -0500 Subject: [PATCH 44/77] Use accelerator_scope for create_marker and create_blocking_marker. As optimization when system-scope is not needed. [ROCm/hip commit: 2b253a48b64f9c15bbef1ce37594bdba57e18307] --- projects/hip/src/hip_hcc.cpp | 4 ++-- projects/hip/src/math_functions.cpp | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 979a2e5028..efa05cbb93 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -352,7 +352,7 @@ void ihipStream_t::locked_waitEvent(hipEvent_t event) this->ensureHaveQueue(crit); - crit->_av.create_blocking_marker(event->_marker); + crit->_av.create_blocking_marker(event->_marker, hc::accelerator_scope); } // Create a marker in this stream. @@ -1490,7 +1490,7 @@ hipStream_t ihipSyncAndResolveStream(hipStream_t stream) // ensure any commands sent to this stream wait on the NULL stream before continuing LockedAccessor_StreamCrit_t thisStreamCrit(stream->criticalData()); // TODO - could be "noret" version of create_blocking_marker - thisStreamCrit->_av.create_blocking_marker(dcf); + thisStreamCrit->_av.create_blocking_marker(dcf, hc::accelerator_scope); } } } diff --git a/projects/hip/src/math_functions.cpp b/projects/hip/src/math_functions.cpp index f1e52c4036..151627fc73 100644 --- a/projects/hip/src/math_functions.cpp +++ b/projects/hip/src/math_functions.cpp @@ -942,10 +942,10 @@ __host__ float normcdfinvf(float x) return std::sqrt(2) * erfinvf(2*x-1); } -__host__ double normcdfinv(double x) -{ - return std::sqrt(2) * erfinv(2*x-1); -} +//__host__ double normcdfinv(double x) +//{ +// return std::sqrt(2) * erfinv(2*x-1); +//} __host__ float nextafterf(float x, float y) { From 247c34195fefc004969f25220902824dff15b098 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 23 May 2017 23:14:38 -0500 Subject: [PATCH 45/77] Expand test to cover copy followed by event sync [ROCm/hip commit: 236ce70e942945f55213c0000567b47ee4eb797d] --- .../runtimeApi/stream/hipStreamWaitEvent.cpp | 81 ++++++++++++++++--- 1 file changed, 70 insertions(+), 11 deletions(-) diff --git a/projects/hip/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp b/projects/hip/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp index 80ff7ad98d..d12b07289b 100644 --- a/projects/hip/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp +++ b/projects/hip/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp @@ -41,8 +41,14 @@ unsigned p_count = 100; // Structure for one stream; template class Streamer { + +#define COMMAND_ADD_FORWARD 0 +#define COMMAND_ADD_REVERSE 1 +#define COMMAND_COPY 2 + + public: - Streamer(int deviceId, T *input, size_t numElements, bool reverse); + Streamer(int deviceId, T *input, size_t numElements, int commandType); ~Streamer(); void runAsyncAfter(Streamer *depStreamer, bool waitSameStream=false); void runAsyncWaitSameStream(); @@ -57,7 +63,11 @@ public: size_t mismatchCount() const { return _mismatchCount; }; T *C_d() { return _C_d; }; + // How much does this streamer add to A[i] after running runAsyncAfter + int expectedAdd() const { return (_commandType == COMMAND_COPY) ? 0 : p_count; }; + + int _commandType; // 0=addReverse, 1=addFwd, 2=move private: T *_C_h; @@ -71,22 +81,23 @@ private: int _deviceId; size_t _numElements; - bool _reverse; size_t _mismatchCount; }; template -Streamer::Streamer(int deviceId, T * A_d, size_t numElements, bool reverse) : +Streamer::Streamer(int deviceId, T * A_d, size_t numElements, int commandType) : _preA_d(NULL), _A_d(A_d), _deviceId(deviceId), _numElements(numElements), - _reverse(reverse) + _commandType(commandType) { size_t sizeElements = numElements * sizeof(int); + //if (commandType == 0) _commandType = 1; // TODO - remove me + HIPCHECK(hipSetDevice(_deviceId)); @@ -115,6 +126,23 @@ Streamer::Streamer(int deviceId, T * A_d, size_t numElements, bool reverse) : }; +template +Streamer::~Streamer() +{ + HIPCHECK(hipSetDevice(_deviceId)); + + printf ("info: ~Streamer\n"); + if (_preA_d) { + HIPCHECK(hipFree(_preA_d)); + } + HIPCHECK(hipFree(_C_d)); + HIPCHECK(hipHostFree(_C_h)); + + HIPCHECK(hipStreamDestroy(_stream)); + HIPCHECK(hipEventDestroy(_event)); +} + + template void Streamer::runAsyncAfter(Streamer *depStreamer, bool waitSameStream) { @@ -134,10 +162,14 @@ void Streamer::runAsyncAfter(Streamer *depStreamer, bool waitSameStream) unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements); - if (_reverse) { + if (_commandType == COMMAND_ADD_REVERSE) { hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count); - } else { + } else if (_commandType == COMMAND_ADD_FORWARD) { hipLaunchKernelGGL(HipTest::addCount, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count); + } else if (_commandType == COMMAND_COPY) { + HIPCHECK(hipMemcpyAsync(_C_d, _A_d, _numElements * sizeof(T), hipMemcpyDeviceToDevice, _stream)); + } else { + assert(0); // bad command type } HIPCHECK(hipEventRecord(_event, _stream)); @@ -263,9 +295,13 @@ void checkAll(int initValue, std::vector &streamers, std::vector< } + int expected = 0; // Check in forward order so we can find first mismatch: for (int i=0; icheck(i+1, initValue, (i+1)*p_count, expectPass); + + expected += streamers[i]->expectedAdd(); + + mismatchCount += streamers[i]->check(i+1, initValue, expected, expectPass); } if (!expectPass && (mismatchCount==0)) { @@ -305,7 +341,7 @@ void sync_allDevices(int numDevices) void sync_queryAllUntilComplete(std::vector streamers) { - for (int i=0; i=0; i--) { streamers[i]->queryUntilComplete(); }; } @@ -334,8 +370,6 @@ int main(int argc, char *argv[]) - std::vector streamers; - std::vector streamersDev0; // streamers for first device. size_t numElements = N; size_t sizeElements = numElements * sizeof(int); @@ -361,9 +395,13 @@ int main(int argc, char *argv[]) HIPCHECK(hipGetDeviceCount(&numDevices)); numDevices = min(2, numDevices); // multi-GPU to 2 device. + std::vector streamers; + std::vector streamersDev0; // streamers for first device. + for (int d=0; dC_d() : initArray_d, numElements, i&1 /*reverse?*/); + int command = (i%2) ? COMMAND_ADD_FORWARD : COMMAND_ADD_REVERSE; + IntStreamer * s = new IntStreamer(d, i ? streamers.back()->C_d() : initArray_d, numElements, command); streamers.push_back(s); if (d==0) { streamersDev0.push_back(s); @@ -371,6 +409,10 @@ int main(int argc, char *argv[]) } } + + + + // A sideband stream channel that is independent from above. // Used to check to ensure the WaitEvent or other synchronization is working correctly since by default sideStream is // asynchronous wrt the other streams. @@ -383,7 +425,10 @@ int main(int argc, char *argv[]) // Tests on first GPU: + // + // This test has no synchronization - make sure it mismatches so we can ensure the other tests properyl prevent the mismatch: RUN_SYNC_TEST(0x01, streamersDev0, sync_none(), false); + RUN_SYNC_TEST(0x02, streamersDev0, sync_allDevices(numDevices), true); RUN_SYNC_TEST(0x04, streamersDev0, sync_queryAllUntilComplete(streamersDev0), true); RUN_SYNC_TEST(0x08, streamersDev0, sync_streamWaitEvent(streamersDev0.back()->event(), 0, sideStreams[0], false), true); @@ -419,5 +464,19 @@ int main(int argc, char *argv[]) } + // Change Adds to copies to stimulate different case with event followign copy: + for (auto &s : streamers) { + if (s->_commandType == COMMAND_ADD_FORWARD) + s->_commandType = COMMAND_COPY; + } + + + { + printf ("test: alternating memcpy/count-reverse followed by event\n"); + RUN_SYNC_TEST(0x4000, streamersDev0, sync_queryAllUntilComplete(streamersDev0), true); + RUN_SYNC_TEST(0x8000, streamersDev0, sync_streamWaitEvent(streamersDev0.back()->event(), 0, sideStreams[0], false), true); + } + + passed(); } From 07865c3a02f7dc9311c1b1df6fb9f7324d5fbbf3 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 23 May 2017 23:47:56 -0500 Subject: [PATCH 46/77] Remove HIP_MAX_QUEUES (replaced with HCC_MAX_QUEUES) [ROCm/hip commit: d0ef9d8462d91de9ff0d495b125f099178f5c444] --- projects/hip/src/hip_hcc.cpp | 91 +---------------------------- projects/hip/src/hip_hcc_internal.h | 12 ---- projects/hip/src/hip_memory.cpp | 3 - projects/hip/src/hip_stream.cpp | 6 +- 4 files changed, 4 insertions(+), 108 deletions(-) diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index efa05cbb93..e77c4186e8 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -64,7 +64,6 @@ std::string HIP_LAUNCH_BLOCKING_KERNELS; std::vector g_hipLaunchBlockingKernels; int HIP_API_BLOCKING = 0; -int HIP_MAX_QUEUES = 0; int HIP_PRINT_ENV = 0; int HIP_TRACE_API= 0; @@ -267,31 +266,6 @@ ihipStream_t::~ihipStream_t() } -inline void ihipStream_t::ensureHaveQueue(LockedAccessor_StreamCrit_t &streamCrit) -{ - if (HIP_MAX_QUEUES && !streamCrit->_hasQueue) { - - // To avoid deadlock, we have to release the stream lock before acquiring context lock. - // Else we can get hung if another thread has the context lock is trying to get lock for this stream. - // We lock it again below. - streamCrit->munlock(); - - // Obtain mutex access to the device critical data, release by destructor - LockedAccessor_CtxCrit_t ctxCrit(this->_ctx->criticalData()); - // TODO - auto needyCritPtr = this->_criticalData.mlock(); - - // Second test to ensure we still need to steal the queue - another thread may have - // snuck in here and already solved the issue. - if (!needyCritPtr->_hasQueue) { - needyCritPtr->_av = this->_ctx->stealActiveQueue(ctxCrit, this); - } - - streamCrit->_hasQueue = true; - } - assert(streamCrit->_hasQueue); -} - hc::hcWaitMode ihipStream_t::waitMode() const { hc::hcWaitMode waitMode = hc::hcWaitModeActive; @@ -323,13 +297,9 @@ hc::hcWaitMode ihipStream_t::waitMode() const //This signature should be used in routines that already have locked the stream mutex void ihipStream_t::wait(LockedAccessor_StreamCrit_t &crit) { - if (crit->_hasQueue) { - tprintf (DB_SYNC, "%s wait for queue-empty..\n", ToString(this).c_str()); + tprintf (DB_SYNC, "%s wait for queue-empty..\n", ToString(this).c_str()); - crit->_av.wait(waitMode()); - } else { - tprintf (DB_SYNC, "%s wait for queue empty (done since stream has no physical queue).\n", ToString(this).c_str()); - } + crit->_av.wait(waitMode()); crit->_kernelCnt = 0; } @@ -350,7 +320,6 @@ void ihipStream_t::locked_waitEvent(hipEvent_t event) { LockedAccessor_StreamCrit_t crit(_criticalData); - this->ensureHaveQueue(crit); crit->_av.create_blocking_marker(event->_marker, hc::accelerator_scope); } @@ -362,7 +331,6 @@ void ihipStream_t::locked_recordEvent(hipEvent_t event) // Lock the stream to prevent simultaneous access LockedAccessor_StreamCrit_t crit(_criticalData); - this->ensureHaveQueue(crit); #if USE_NO_SCOPE printf ("create_marker, flags = %x\n", event->_flags); event->_marker = crit->_av.create_marker((event->_flags & hipEventDisableSystemRelease) ? hc::no_scope : hc::system_scope); @@ -406,7 +374,6 @@ LockedAccessor_StreamCrit_t ihipStream_t::lockopen_preKernelCommand() crit->_kernelCnt = 0; } - this->ensureHaveQueue(crit); @@ -1001,55 +968,6 @@ std::string ihipCtx_t::toString() const }; -hc::accelerator_view -ihipCtx_t::stealActiveQueue(LockedAccessor_CtxCrit_t &ctxCrit, ihipStream_t *needyStream) -{ - - // TODO - review handling if queue can't be found. - while (1) { - - for (auto iter=ctxCrit->streams().begin(); iter != ctxCrit->streams().end(); iter++) { - if (*iter != needyStream) { - auto victimCritPtr = (*iter)->_criticalData.mtry_lock(); - if (victimCritPtr) { - // try-lock succeeded: - if (victimCritPtr->_hasQueue && (victimCritPtr->_kernelCnt == 0)) { - - victimCritPtr->_hasQueue = false; - - tprintf(DB_SYNC, " stealActiveQueue from victim:%s to needy:%s\n", - ToString(*iter).c_str(), ToString(needyStream).c_str()); - - hc::accelerator_view av = victimCritPtr->_av; - - // TODO - cleanup to remove forced setting to N - uint64_t *p = (uint64_t*)(&victimCritPtr->_av); - *p = 0; // damage the victim av so attempt to use it will fault. - - (*iter)->_criticalData.munlock(); - return av; - } - (*iter)->_criticalData.munlock(); - } - } - } - } -} - - -hc::accelerator_view -ihipCtx_t::createOrStealQueue(LockedAccessor_CtxCrit_t &ctxCrit) -{ - if (HIP_MAX_QUEUES && (ctxCrit->streams().size() >= HIP_MAX_QUEUES)) { - // Steal a queue from an existing stream: - hc::accelerator_view av = this->stealActiveQueue (ctxCrit, nullptr); - return av; - } else { - // Create a new view - return getWriteableDevice()->_acc.create_view(); - } -} - //---- @@ -1279,7 +1197,6 @@ void HipReadEnv() READ_ENV_I(release, HIP_API_BLOCKING, 0, "Make HIP APIs 'host-synchronous', so they block until completed. Impacts hipMemcpyAsync, hipMemsetAsync." ); - READ_ENV_I(release, HIP_MAX_QUEUES, 0, "Maximum number of queues that this app will use per-device. Additional streams will share the specified number of queues. 0=no limit."); READ_ENV_C(release, HIP_DB, 0, "Print debug info. Bitmask (HIP_DB=0xff) or flags separated by '+' (HIP_DB=api+sync+mem+copy)", HIP_DB_callback); if ((HIP_DB & (1<ensureHaveQueue(crit); crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, copyDevice ? ©Device->getDevice()->_acc : nullptr, forceUnpinnedCopy); } @@ -2078,7 +1994,6 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes // Perform fast asynchronous copy - we know copyDevice != NULL based on check above try { - this->ensureHaveQueue(crit); if (HIP_FORCE_SYNC_COPY) { crit->_av.copy_ext (src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, ©Device->getDevice()->_acc, forceUnpinnedCopy); @@ -2115,7 +2030,6 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes // Perform slow synchronous copy: LockedAccessor_StreamCrit_t crit(_criticalData); - this->ensureHaveQueue(crit); crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, copyDevice ? ©Device->getDevice()->_acc : nullptr, forceUnpinnedCopy); } @@ -2170,7 +2084,6 @@ hipError_t hipHccGetAccelerator(int deviceId, hc::accelerator *acc) //--- -// Warning - with HIP_MAX_QUEUES!=0 there is no mechanism to prevent accelerator_view from being re-assigned... hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **av) { HIP_INIT_API(stream, av); diff --git a/projects/hip/src/hip_hcc_internal.h b/projects/hip/src/hip_hcc_internal.h index 0d080f9225..278f52dc51 100644 --- a/projects/hip/src/hip_hcc_internal.h +++ b/projects/hip/src/hip_hcc_internal.h @@ -447,7 +447,6 @@ public: ihipStreamCriticalBase_t(ihipStream_t *parentStream, hc::accelerator_view av) : _kernelCnt(0), _av(av), - _hasQueue(true), _parent(parentStream) { }; @@ -473,11 +472,6 @@ public: uint32_t _kernelCnt; // Count of inflight kernels in this stream. Reset at ::wait(). hc::accelerator_view _av; - - // True if the stream has an allocated queue (accelerato_view) for its use: - // Always true at ihipStream creation but queue may later be stolen. - // This acts as a valid bit for the _av. - bool _hasQueue; private: }; @@ -544,8 +538,6 @@ public: const ihipDevice_t * getDevice() const; ihipCtx_t * getCtx() const; - void ensureHaveQueue(LockedAccessor_StreamCrit_t &streamCrit); - public: //--- //Public member vars - these are set at initialization and never change: @@ -792,10 +784,6 @@ public: // Functions: void locked_waitAllStreams(); void locked_syncDefaultStream(bool waitOnSelf, bool syncHost); - // Will allocate a queue and assign it to the needyStream: - hc::accelerator_view stealActiveQueue(LockedAccessor_CtxCrit_t &ctxCrit, ihipStream_t *needyStream); - hc::accelerator_view createOrStealQueue(LockedAccessor_CtxCrit_t &ctxCrit); - ihipCtxCritical_t &criticalData() { return _criticalData; }; const ihipDevice_t *getDevice() const { return _device; }; diff --git a/projects/hip/src/hip_memory.cpp b/projects/hip/src/hip_memory.cpp index 3f95cd22b4..a3d761752e 100644 --- a/projects/hip/src/hip_memory.cpp +++ b/projects/hip/src/hip_memory.cpp @@ -948,7 +948,6 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s if (stream) { auto crit = stream->lockopen_preKernelCommand(); - stream->ensureHaveQueue(crit); hc::completion_future cf ; @@ -1000,7 +999,6 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes ) if (stream) { auto crit = stream->lockopen_preKernelCommand(); - stream->ensureHaveQueue(crit); hc::completion_future cf ; if ((sizeBytes & 0x3) == 0) { @@ -1053,7 +1051,6 @@ hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t sizeByte if (stream) { auto crit = stream->lockopen_preKernelCommand(); - stream->ensureHaveQueue(crit); hc::completion_future cf ; if ((sizeBytes & 0x3) == 0) { diff --git a/projects/hip/src/hip_stream.cpp b/projects/hip/src/hip_stream.cpp index 34b4bc8851..b4a0740b96 100644 --- a/projects/hip/src/hip_stream.cpp +++ b/projects/hip/src/hip_stream.cpp @@ -49,7 +49,7 @@ hipError_t ihipStreamCreate(hipStream_t *stream, unsigned int flags) // Obtain mutex access to the device critical data, release by destructor LockedAccessor_CtxCrit_t ctxCrit(ctx->criticalData()); - auto istream = new ihipStream_t(ctx, ctx->createOrStealQueue(ctxCrit), flags); + auto istream = new ihipStream_t(ctx, acc.create_view(), flags); ctxCrit->addStream(istream); *stream = istream; @@ -129,9 +129,7 @@ hipError_t hipStreamQuery(hipStream_t stream) { LockedAccessor_StreamCrit_t crit(stream->_criticalData); - if (crit->_hasQueue) { - pendingOps = crit->_av.get_pending_async_ops(); - } + pendingOps = crit->_av.get_pending_async_ops(); } From 5b9ce4163375925b359952700f162a75e969b67c Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Wed, 24 May 2017 00:48:10 -0500 Subject: [PATCH 47/77] Add hipHostMallocCoherent, hipHostMallocNonCoherent Provide per-allocation control over coherent/non-coherent mem. These overrid the default HIP_COHERENT_HOST_ALLOC setting. [ROCm/hip commit: 75f691ec2fdfc424ab75cac71db453b01bd2cc73] --- .../include/hip/hcc_detail/hip_runtime_api.h | 10 ++- .../include/hip/nvcc_detail/hip_runtime_api.h | 2 + projects/hip/src/hip_hcc.cpp | 2 +- projects/hip/src/hip_memory.cpp | 31 +++++-- .../src/runtimeApi/memory/hipHostMalloc.cpp | 83 +++++++++++++++---- projects/hip/util/vim/hip.vim | 2 + 6 files changed, 104 insertions(+), 26 deletions(-) diff --git a/projects/hip/include/hip/hcc_detail/hip_runtime_api.h b/projects/hip/include/hip/hcc_detail/hip_runtime_api.h index 34ed2ed5ce..6fb7c0256e 100644 --- a/projects/hip/include/hip/hcc_detail/hip_runtime_api.h +++ b/projects/hip/include/hip/hcc_detail/hip_runtime_api.h @@ -111,17 +111,21 @@ enum hipLimit_t //! Flags that can be used with hipHostMalloc #define hipHostMallocDefault 0x0 -#define hipHostMallocPortable 0x1 -#define hipHostMallocMapped 0x2 +#define hipHostMallocPortable 0x1 ///< Memory is considered allocated by all contexts. +#define hipHostMallocMapped 0x2 ///< Map the allocation into the address space for the current device. The device pointer can be obtained with #hipHostGetDevicePointer. #define hipHostMallocWriteCombined 0x4 +#define hipHostMallocCoherent 0x40000000 ///< Allocate coherent memory. Overrides HIP_COHERENT_HOST_ALLOC for specific allocation. +#define hipHostMallocNonCoherent 0x80000000 ///< Allocate non-coherent memory. Overrides HIP_COHERENT_HOST_ALLOC for specific allocation. + //! Flags that can be used with hipHostRegister #define hipHostRegisterDefault 0x0 ///< Memory is Mapped and Portable -#define hipHostRegisterPortable 0x1 ///< Memory is considered registered by all contexts. HIP only supports one context so this is always assumed true. +#define hipHostRegisterPortable 0x1 ///< Memory is considered registered by all contexts. #define hipHostRegisterMapped 0x2 ///< Map the allocation into the address space for the current device. The device pointer can be obtained with #hipHostGetDevicePointer. #define hipHostRegisterIoMemory 0x4 ///< Not supported. + #define hipDeviceScheduleAuto 0x0 ///< Automatically select between Spin and Yield #define hipDeviceScheduleSpin 0x1 ///< Dedicate a CPU core to spin-wait. Provides lowest latency, but burns a CPU core and may consume more power. #define hipDeviceScheduleYield 0x2 ///< Yield the CPU to the operating system when waiting. May increase latency, but lowers power and is friendlier to other threads in the system. diff --git a/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h b/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h index 01a93f7ba4..cbc7ed9f9c 100644 --- a/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h +++ b/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h @@ -65,6 +65,8 @@ hipMemcpyHostToHost #define hipHostMallocPortable cudaHostAllocPortable #define hipHostMallocMapped cudaHostAllocMapped #define hipHostMallocWriteCombined cudaHostAllocWriteCombined +#define hipHostMallocCoherent 0x0 +#define hipHostMallocNonCoherent 0x0 #define hipHostRegisterPortable cudaHostRegisterPortable #define hipHostRegisterMapped cudaHostRegisterMapped diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index e77c4186e8..4588f67c2d 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -74,7 +74,7 @@ int HIP_PROFILE_API= 0; std::string HIP_DB_START_API; std::string HIP_DB_STOP_API; int HIP_DB= 0; -int HIP_VISIBLE_DEVICES = 0; /* Contains a comma-separated sequence of GPU identifiers */ +int HIP_VISIBLE_DEVICES = 0; int HIP_NUM_KERNELS_INFLIGHT = 128; int HIP_WAIT_MODE = 0; diff --git a/projects/hip/src/hip_memory.cpp b/projects/hip/src/hip_memory.cpp index a3d761752e..3ab7713afa 100644 --- a/projects/hip/src/hip_memory.cpp +++ b/projects/hip/src/hip_memory.cpp @@ -267,17 +267,36 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) trueFlags = hipHostMallocMapped | hipHostMallocPortable; } - const unsigned supportedFlags = hipHostMallocPortable | hipHostMallocMapped | hipHostMallocWriteCombined; - if (flags & ~supportedFlags) { + const unsigned supportedFlags = hipHostMallocPortable + | hipHostMallocMapped + | hipHostMallocWriteCombined + | hipHostMallocCoherent + | hipHostMallocNonCoherent; + + + const unsigned coherencyFlags = hipHostMallocCoherent | hipHostMallocNonCoherent; + + if ((flags & ~supportedFlags) || + ((flags & coherencyFlags) == coherencyFlags)) { + *ptr = nullptr; + // can't specify unsupported flags, can't specify both Coherent + NonCoherent hip_status = hipErrorInvalidValue; - } - else { + } else { auto device = ctx->getWriteableDevice(); - unsigned amFlags = HIP_COHERENT_HOST_ALLOC ? amHostCoherent : amHostPinned; + + unsigned amFlags = 0; + if (flags & hipHostMallocCoherent) { + amFlags = amHostCoherent; + } else if (flags & hipHostMallocNonCoherent) { + amFlags = amHostPinned; + } else { + // depends on env variables: + amFlags = HIP_COHERENT_HOST_ALLOC ? amHostCoherent : amHostPinned; + } - *ptr = hip_internal::allocAndSharePtr(HIP_COHERENT_HOST_ALLOC ? "finegrained_host":"pinned_host", + *ptr = hip_internal::allocAndSharePtr((amFlags & amHostCoherent) ? "finegrained_host":"pinned_host", sizeBytes, ctx, (trueFlags & hipHostMallocPortable) /*shareWithAll*/, amFlags, flags); if(sizeBytes && (*ptr == NULL)){ diff --git a/projects/hip/tests/src/runtimeApi/memory/hipHostMalloc.cpp b/projects/hip/tests/src/runtimeApi/memory/hipHostMalloc.cpp index d6b3b05a1d..31596b5ea5 100644 --- a/projects/hip/tests/src/runtimeApi/memory/hipHostMalloc.cpp +++ b/projects/hip/tests/src/runtimeApi/memory/hipHostMalloc.cpp @@ -31,14 +31,19 @@ #define LEN 1024*1024 #define SIZE LEN*sizeof(float) -__global__ void Add(hipLaunchParm lp, float *Ad, float *Bd, float *Cd){ +__global__ void Add(float *Ad, float *Bd, float *Cd){ int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; Cd[tx] = Ad[tx] + Bd[tx]; } + +__global__ void Set(int *Ad, int val){ + int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + Ad[tx] = val; +} + int main(){ - float *A, *B, *C; - float *Ad, *Bd, *Cd; + hipDeviceProp_t prop; int device; @@ -49,26 +54,72 @@ int main(){ failed("Does support HostPinned Memory"); } - HIPCHECK(hipHostMalloc((void**)&A, SIZE, hipHostMallocWriteCombined | hipHostMallocMapped)); - HIPCHECK(hipHostMalloc((void**)&B, SIZE, hipHostMallocDefault)); - HIPCHECK(hipHostMalloc((void**)&C, SIZE, hipHostMallocMapped)); - HIPCHECK(hipHostGetDevicePointer((void**)&Ad, A, 0)); - HIPCHECK(hipHostGetDevicePointer((void**)&Cd, C, 0)); + { + float *A, *B, *C; + float *Ad, *Bd, *Cd; + HIPCHECK(hipHostMalloc((void**)&A, SIZE, hipHostMallocWriteCombined | hipHostMallocMapped)); + HIPCHECK(hipHostMalloc((void**)&B, SIZE, hipHostMallocDefault)); + HIPCHECK(hipHostMalloc((void**)&C, SIZE, hipHostMallocMapped)); - for(int i=0;i Date: Wed, 24 May 2017 01:03:28 -0500 Subject: [PATCH 48/77] Remove HIP_NUM_KERNELS_INFLIGHT. (redundant with HCC controls) [ROCm/hip commit: 578d430bb3068e6835dd012fce69b3bfcaef3b84] --- projects/hip/docs/markdown/hip_porting_guide.md | 1 - projects/hip/src/hip_hcc.cpp | 10 ---------- 2 files changed, 11 deletions(-) diff --git a/projects/hip/docs/markdown/hip_porting_guide.md b/projects/hip/docs/markdown/hip_porting_guide.md index 72f6384f6d..84887fd512 100644 --- a/projects/hip/docs/markdown/hip_porting_guide.md +++ b/projects/hip/docs/markdown/hip_porting_guide.md @@ -569,7 +569,6 @@ HIP_TRACE_API = 0 : Trace each HIP API call. Print function n HIP_TRACE_API_COLOR = green : Color to use for HIP_API. None/Red/Green/Yellow/Blue/Magenta/Cyan/White HIP_PROFILE_API = 0 : Add HIP function begin/end to ATP file generated with CodeXL HIP_VISIBLE_DEVICES = 0 : Only devices whose index is present in the secquence are visible to HIP applications and they are enumerated in the order of secquence -HIP_NUM_KERNELS_INFLIGHT = 128 : Number of kernels per stream ``` diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 4588f67c2d..8e4a20ad74 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -75,7 +75,6 @@ std::string HIP_DB_START_API; std::string HIP_DB_STOP_API; int HIP_DB= 0; int HIP_VISIBLE_DEVICES = 0; -int HIP_NUM_KERNELS_INFLIGHT = 128; int HIP_WAIT_MODE = 0; int HIP_FORCE_P2P_HOST = 0; @@ -369,13 +368,6 @@ LockedAccessor_StreamCrit_t ihipStream_t::lockopen_preKernelCommand() LockedAccessor_StreamCrit_t crit(_criticalData, false/*no unlock at destruction*/); - if(crit->_kernelCnt > HIP_NUM_KERNELS_INFLIGHT){ - this->wait(crit); - crit->_kernelCnt = 0; - } - - - return crit; } @@ -1225,8 +1217,6 @@ void HipReadEnv() READ_ENV_I(release, HIP_SYNC_HOST_ALLOC, 0, "Sync before and after all host memory allocations. May help stability"); READ_ENV_I(release, HIP_SYNC_NULL_STREAM, 0, "Synchronize on host for null stream submissions"); - // TODO - review, can we remove this? - READ_ENV_I(release, HIP_NUM_KERNELS_INFLIGHT, 128, "Max number of inflight kernels per stream before active synchronization is forced."); READ_ENV_I(release, HIP_COHERENT_HOST_ALLOC, 0, "If set, all host memory will be allocated as fine-grained system memory. This allows threadfence_system to work but prevents host memory from being cached on GPU which may have performance impact."); From bc9b970f823a373bb7d49d97a36e2d06fce5e4aa Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Wed, 24 May 2017 18:25:40 +0300 Subject: [PATCH 49/77] [FIX] [HIPIFY] Add matchers for function return types. https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/issues/73 Examples (https://github.com/thrust/thrust/blob/master/thrust/system/cuda/detail/trivial_copy.inl): template cudaStream_t cuda_memcpy_stream(const thrust::cpp::execution_policy &, const thrust::cuda::execution_policy &exec) template cudaMemcpyKind cuda_memcpy_kind(const thrust::cuda::execution_policy &, const thrust::cpp::execution_policy &) [ROCm/hip commit: c8632156114577404bd1279aa3903c7a022f4409] --- projects/hip/hipify-clang/src/Cuda2Hip.cpp | 38 ++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/projects/hip/hipify-clang/src/Cuda2Hip.cpp b/projects/hip/hipify-clang/src/Cuda2Hip.cpp index dcb9c3d216..f17c3e2646 100644 --- a/projects/hip/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/hip/hipify-clang/src/Cuda2Hip.cpp @@ -3123,6 +3123,33 @@ private: return false; } + bool cudaFunctionReturn(const MatchFinder::MatchResult &Result) { + if (const auto *ret = Result.Nodes.getNodeAs("cudaFunctionReturn")) { + QualType QT = ret->getReturnType(); + SourceManager *SM = Result.SourceManager; + SourceRange sr = ret->getReturnTypeSourceRange(); + SourceLocation sl = sr.getBegin(); + std::string name = QT.getAsString(); + if (QT.getTypePtr()->isEnumeralType()) { + name = QT.getTypePtr()->getAs()->getDecl()->getNameAsString(); + } + const auto found = N.cuda2hipRename.find(name); + if (found != N.cuda2hipRename.end()) { + updateCounters(found->second, name); + if (!found->second.unsupported) { + StringRef repName = found->second.hipName; + Replacement Rep(*SM, sl, name.size(), repName); + FullSourceLoc fullSL(sl, *SM); + insertReplacement(Rep, fullSL); + } + } + else { + std::string msg = "the following reference is not handled: '" + name + "' [function return]."; + printHipifyMessage(*SM, sl, msg); + } + } + return false; + } bool cudaSharedIncompleteArrayVar(const MatchFinder::MatchResult &Result) { StringRef refName = "cudaSharedIncompleteArrayVar"; @@ -3269,6 +3296,7 @@ public: if (cudaParamDeclPtr(Result)) break; if (cudaLaunchKernel(Result)) break; if (cudaNewOperatorDecl(Result)) break; + if (cudaFunctionReturn(Result)) break; if (cudaSharedIncompleteArrayVar(Result)) break; if (stringLiteral(Result)) break; if (unresolvedTemplateName(Result)) break; @@ -3373,6 +3401,16 @@ void addAllMatchers(ast_matchers::MatchFinder &Finder, Cuda2HipCallback *Callbac hasType(pointsTo(namedDecl(matchesName("cu.*|CU.*"))))) .bind("cudaNewOperatorDecl"), Callback); + // Examples: + // 1. + // cudaStream_t cuda_memcpy_stream(...) + // 2. + // template cudaMemcpyKind cuda_memcpy_kind(...) + Finder.addMatcher(functionDecl(isExpansionInMainFile(), + returns(hasDeclaration(namedDecl(matchesName("cu.*|CU.*"))))) + .bind("cudaFunctionReturn"), + Callback); + } int64_t printStats(const std::string &csvFile, const std::string &srcFile, From ef1a8c2788d8f27a9ca543edf0515ad1842b6b6a Mon Sep 17 00:00:00 2001 From: Siu Chi Chan Date: Thu, 25 May 2017 23:15:30 -0400 Subject: [PATCH 50/77] fix hip_fast_dsqrt* to call a double fp sqrt function [ROCm/hip commit: 1dce01f9bb085e6288852af5657ededd8d384dcc] --- projects/hip/src/device_util.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/projects/hip/src/device_util.cpp b/projects/hip/src/device_util.cpp index b730412874..bea42aba46 100644 --- a/projects/hip/src/device_util.cpp +++ b/projects/hip/src/device_util.cpp @@ -1215,20 +1215,23 @@ __device__ float __hip_fast_tanf(float x) { } // Double Precision Math +// FIXME - HCC doesn't have a fast_math version double FP sqrt +// Another issue is that these intrinsics call for a specific rounding mode; +// however, their implementation all map to the same sqrt builtin __device__ double __hip_fast_dsqrt_rd(double x) { - return hc::fast_math::sqrt(x); + return hc::precise_math::sqrt(x); } __device__ double __hip_fast_dsqrt_rn(double x) { - return hc::fast_math::sqrt(x); + return hc::precise_math::sqrt(x); } __device__ double __hip_fast_dsqrt_ru(double x) { - return hc::fast_math::sqrt(x); + return hc::precise_math::sqrt(x); } __device__ double __hip_fast_dsqrt_rz(double x) { - return hc::fast_math::sqrt(x); + return hc::precise_math::sqrt(x); } __device__ void __threadfence_system(void){ From 715aeef97c4a2f62e8f5cbe40c073c2e9ee0701b Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 25 May 2017 10:37:03 -0500 Subject: [PATCH 51/77] Add isDefaultStream() accessor. Fix code that checked for stream==nullptr after stream had been resolved to a "true stream". [ROCm/hip commit: b251d72917f9832d2062cd8def60fa84b839376a] --- projects/hip/src/hip_event.cpp | 19 +++++++++++++++---- projects/hip/src/hip_hcc_internal.h | 5 ++++- projects/hip/src/hip_stream.cpp | 4 ++-- 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/projects/hip/src/hip_event.cpp b/projects/hip/src/hip_event.cpp index fbaf5cc463..c11a47b341 100644 --- a/projects/hip/src/hip_event.cpp +++ b/projects/hip/src/hip_event.cpp @@ -53,14 +53,19 @@ void ihipEvent_t::attachToCompletionFuture(const hc::completion_future *cf, ihip void ihipEvent_t::setTimestamp() { + bool isReady0 = _marker.is_ready(); + bool isReady1; + int val = 0; if (_state == hipEventStatusRecorded) { // already recorded, done: return; } else { // TODO - use completion-future functions to obtain ticks and timestamps: hsa_signal_t *sig = static_cast (_marker.get_native_handle()); + isReady1 = _marker.is_ready(); if (sig) { - if (hsa_signal_load_acquire(*sig) == 0) { + val = hsa_signal_load_acquire(*sig); + if (val == 0) { if ((_type == hipEventTypeIndependent) || (_type == hipEventTypeStopCommand)) { _timestamp = _marker.get_end_tick(); @@ -75,6 +80,10 @@ void ihipEvent_t::setTimestamp() } } } + + if (_state != hipEventStatusRecorded) { + printf (" not ready isReady0=%d val=%d isReady1=%d\n", isReady0, val, isReady1); + } } @@ -118,11 +127,11 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream) event->_stream = stream; - if (HIP_SYNC_NULL_STREAM && stream == NULL) { + if (HIP_SYNC_NULL_STREAM && stream->isDefaultStream()) { // TODO-HIP_SYNC_NULL_STREAM : can remove this code when HIP_SYNC_NULL_STREAM = 0 - // If stream == NULL, wait on all queues. + // If default stream , then wait on all queues. ihipCtx_t *ctx = ihipGetTlsDefaultCtx(); ctx->locked_syncDefaultStream(true, true); @@ -167,7 +176,7 @@ hipError_t hipEventSynchronize(hipEvent_t event) } else if (event->_state == hipEventStatusCreated ) { // Created but not actually recorded on any device: return ihipLogStatus(hipSuccess); - } else if (HIP_SYNC_NULL_STREAM && (event->_stream == NULL)) { + } else if (HIP_SYNC_NULL_STREAM && (event->_stream->isDefaultStream() )) { auto *ctx = ihipGetTlsDefaultCtx(); // TODO-HIP_SYNC_NULL_STREAM - can remove this code ctx->locked_syncDefaultStream(true, true); @@ -175,6 +184,8 @@ hipError_t hipEventSynchronize(hipEvent_t event) } else { event->_marker.wait((event->_flags & hipEventBlockingSync) ? hc::hcWaitModeBlocked : hc::hcWaitModeActive); + assert (event->_marker.is_ready()); + return ihipLogStatus(hipSuccess); } } else { diff --git a/projects/hip/src/hip_hcc_internal.h b/projects/hip/src/hip_hcc_internal.h index 278f52dc51..94ad4f9340 100644 --- a/projects/hip/src/hip_hcc_internal.h +++ b/projects/hip/src/hip_hcc_internal.h @@ -538,10 +538,12 @@ public: const ihipDevice_t * getDevice() const; ihipCtx_t * getCtx() const; + bool isDefaultStream() const { return _id == 0; }; + public: //--- //Public member vars - these are set at initialization and never change: - SeqNum_t _id; // monotonic sequence ID + SeqNum_t _id; // monotonic sequence ID. 0 is the default stream. unsigned _flags; @@ -560,6 +562,7 @@ private: void addSymbolPtrToTracker(hc::accelerator& acc, void* ptr, size_t sizeBytes); + public: // TODO - move private // Critical Data - MUST be accessed through LockedAccessor_StreamCrit_t ihipStreamCritical_t _criticalData; diff --git a/projects/hip/src/hip_stream.cpp b/projects/hip/src/hip_stream.cpp index b4a0740b96..9f1228d6f7 100644 --- a/projects/hip/src/hip_stream.cpp +++ b/projects/hip/src/hip_stream.cpp @@ -146,7 +146,7 @@ hipError_t hipStreamSynchronize(hipStream_t stream) hipError_t e = hipSuccess; - if (stream == NULL) { + if (stream == hipStreamNull) { ihipCtx_t *ctx = ihipGetTlsDefaultCtx(); ctx->locked_syncDefaultStream(true/*waitOnSelf*/, true/*syncToHost*/); } else { @@ -198,7 +198,7 @@ hipError_t hipStreamGetFlags(hipStream_t stream, unsigned int *flags) if (flags == NULL) { return ihipLogStatus(hipErrorInvalidValue); - } else if (stream == NULL) { + } else if (stream == hipStreamNull) { return ihipLogStatus(hipErrorInvalidResourceHandle); } else { *flags = stream->_flags; From c88190d6984bfa4365972d9a34e62f43e9bcdf48 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Sat, 27 May 2017 15:55:07 -0500 Subject: [PATCH 52/77] Updates so hip compiles on CUDA. [ROCm/hip commit: be8d0ba644f0897ed0111a5cabbecb72fc7145d4] --- .../include/hip/hcc_detail/hip_runtime_api.h | 25 +++++++++++++++++++ projects/hip/include/hip/hip_runtime_api.h | 23 ----------------- .../src/runtimeApi/stream/hipNullStream.cpp | 2 +- .../runtimeApi/stream/hipStreamWaitEvent.cpp | 2 +- projects/hip/tests/src/test_common.h | 2 +- 5 files changed, 28 insertions(+), 26 deletions(-) diff --git a/projects/hip/include/hip/hcc_detail/hip_runtime_api.h b/projects/hip/include/hip/hcc_detail/hip_runtime_api.h index 6fb7c0256e..a8db84c4f2 100644 --- a/projects/hip/include/hip/hcc_detail/hip_runtime_api.h +++ b/projects/hip/include/hip/hcc_detail/hip_runtime_api.h @@ -136,6 +136,31 @@ enum hipLimit_t #define hipDeviceLmemResizeToMax 0x16 +/* +* @brief hipJitOption +* @enum +* @ingroup Enumerations +*/ +typedef enum hipJitOption { + hipJitOptionMaxRegisters = 0, + hipJitOptionThreadsPerBlock, + hipJitOptionWallTime, + hipJitOptionInfoLogBuffer, + hipJitOptionInfoLogBufferSizeBytes, + hipJitOptionErrorLogBuffer, + hipJitOptionErrorLogBufferSizeBytes, + hipJitOptionOptimizationLevel, + hipJitOptionTargetFromContext, + hipJitOptionTarget, + hipJitOptionFallbackStrategy, + hipJitOptionGenerateDebugInfo, + hipJitOptionLogVerbose, + hipJitOptionGenerateLineInfo, + hipJitOptionCacheMode, + hipJitOptionNumOptions +} hipJitOption; + + /** * @warning On AMD devices and recent Nvidia devices, these hints and controls are ignored. */ diff --git a/projects/hip/include/hip/hip_runtime_api.h b/projects/hip/include/hip/hip_runtime_api.h index fa54dda5dc..dc163d5c25 100644 --- a/projects/hip/include/hip/hip_runtime_api.h +++ b/projects/hip/include/hip/hip_runtime_api.h @@ -250,29 +250,6 @@ typedef enum hipDeviceAttribute_t { hipDeviceAttributeIsMultiGpuBoard, ///< Multiple GPU devices. } hipDeviceAttribute_t; -/* -* @brief hipJitOption -* @enum -* @ingroup Enumerations -*/ -typedef enum hipJitOption { - hipJitOptionMaxRegisters = 0, - hipJitOptionThreadsPerBlock, - hipJitOptionWallTime, - hipJitOptionInfoLogBuffer, - hipJitOptionInfoLogBufferSizeBytes, - hipJitOptionErrorLogBuffer, - hipJitOptionErrorLogBufferSizeBytes, - hipJitOptionOptimizationLevel, - hipJitOptionTargetFromContext, - hipJitOptionTarget, - hipJitOptionFallbackStrategy, - hipJitOptionGenerateDebugInfo, - hipJitOptionLogVerbose, - hipJitOptionGenerateLineInfo, - hipJitOptionCacheMode, - hipJitOptionNumOptions -} hipJitOption; /** * @} diff --git a/projects/hip/tests/src/runtimeApi/stream/hipNullStream.cpp b/projects/hip/tests/src/runtimeApi/stream/hipNullStream.cpp index 380979f6bc..b610315608 100644 --- a/projects/hip/tests/src/runtimeApi/stream/hipNullStream.cpp +++ b/projects/hip/tests/src/runtimeApi/stream/hipNullStream.cpp @@ -18,7 +18,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../../test_common.cpp + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 * RUN: %t * HIT_END */ diff --git a/projects/hip/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp b/projects/hip/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp index d12b07289b..9bbd43828c 100644 --- a/projects/hip/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp +++ b/projects/hip/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp @@ -18,7 +18,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../../test_common.cpp + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 * RUN: %t * HIT_END */ diff --git a/projects/hip/tests/src/test_common.h b/projects/hip/tests/src/test_common.h index bb44c94745..81edca4e1e 100644 --- a/projects/hip/tests/src/test_common.h +++ b/projects/hip/tests/src/test_common.h @@ -249,7 +249,7 @@ void initArraysForHost(T **A_h, T **B_h, T **C_h, } } - setDefaultData(N, A_h ? *A_h : nullptr, B_h ? *B_h : nullptr, C_h ? *C_h : nullptr); + setDefaultData(N, A_h ? *A_h : NULL, B_h ? *B_h : NULL, C_h ? *C_h : NULL); } From e24627aa807ee6a52daa5fd63bda9cc0ca109eb6 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 26 May 2017 14:48:27 -0500 Subject: [PATCH 53/77] Cleanup hipEvent. (Intermediate checkpoint) Support hipEventDisableSystemRelease flag. Update test. Remove stray printf [ROCm/hip commit: 620eb3069121ffec946ec67e4ff4e8f351585648] --- projects/hip/src/hip_event.cpp | 28 ++--- projects/hip/src/hip_hcc.cpp | 6 +- .../src/runtimeApi/memory/hipHostMalloc.cpp | 102 ++++++++++++++---- 3 files changed, 100 insertions(+), 36 deletions(-) diff --git a/projects/hip/src/hip_event.cpp b/projects/hip/src/hip_event.cpp index c11a47b341..71f6d8ed5b 100644 --- a/projects/hip/src/hip_event.cpp +++ b/projects/hip/src/hip_event.cpp @@ -82,7 +82,7 @@ void ihipEvent_t::setTimestamp() } if (_state != hipEventStatusRecorded) { - printf (" not ready isReady0=%d val=%d isReady1=%d\n", isReady0, val, isReady1); + //printf (" not ready isReady0=%d val=%d isReady1=%d\n", isReady0, val, isReady1); } } @@ -92,7 +92,10 @@ hipError_t ihipEventCreate(hipEvent_t* event, unsigned flags) hipError_t e = hipSuccess; // TODO-IPC - support hipEventInterprocess. - unsigned supportedFlags = hipEventDefault | hipEventBlockingSync | hipEventDisableTiming; + unsigned supportedFlags = hipEventDefault + | hipEventBlockingSync + | hipEventDisableTiming + | hipEventDisableSystemRelease; if ((flags & ~supportedFlags) == 0) { ihipEvent_t *eh = new ihipEvent_t(flags); @@ -197,20 +200,18 @@ hipError_t hipEventElapsedTime(float *ms, hipEvent_t start, hipEvent_t stop) { HIP_INIT_API(ms, start, stop); - ihipEvent_t *start_eh = start; - ihipEvent_t *stop_eh = stop; - start->setTimestamp(); stop->setTimestamp(); hipError_t status = hipSuccess; *ms = 0.0f; - if (start_eh && stop_eh) { - if ((start_eh->_state == hipEventStatusRecorded) && (stop_eh->_state == hipEventStatusRecorded)) { + if (start && stop) { + // refresh status: + if ((start->_state == hipEventStatusRecorded) && (stop->_state == hipEventStatusRecorded)) { // Common case, we have good information for both events. - int64_t tickDiff = (stop_eh->timestamp() - start_eh->timestamp()); + int64_t tickDiff = (stop->timestamp() - start->timestamp()); uint64_t freqHz; hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &freqHz); @@ -223,13 +224,16 @@ hipError_t hipEventElapsedTime(float *ms, hipEvent_t start, hipEvent_t stop) } - } else if ((start_eh->_state == hipEventStatusRecording) || - (stop_eh->_state == hipEventStatusRecording)) { + } else if ((start->_state == hipEventStatusRecording) || + (stop->_state == hipEventStatusRecording)) { + status = hipErrorNotReady; - } else if ((start_eh->_state == hipEventStatusUnitialized) || - (stop_eh->_state == hipEventStatusUnitialized)) { + } else if ((start->_state == hipEventStatusUnitialized) || + (stop->_state == hipEventStatusUnitialized)) { status = hipErrorInvalidResourceHandle; } + } else { + status = hipErrorInvalidResourceHandle; } return ihipLogStatus(status); diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 8e4a20ad74..5e13904521 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -49,7 +49,7 @@ THE SOFTWARE. // needs HCC change for hc::no_scope -#define USE_NO_SCOPE 0 +#define USE_NO_SCOPE 1 //================================================================================================= //Global variables: @@ -331,10 +331,10 @@ void ihipStream_t::locked_recordEvent(hipEvent_t event) LockedAccessor_StreamCrit_t crit(_criticalData); #if USE_NO_SCOPE - printf ("create_marker, flags = %x\n", event->_flags); + //printf ("create_marker, flags = %x\n", event->_flags); event->_marker = crit->_av.create_marker((event->_flags & hipEventDisableSystemRelease) ? hc::no_scope : hc::system_scope); #else - event->_marker = crit->_av.create_marker(); + event->_marker = crit->_av.create_marker((event->_flags & hipEventDisableSystemRelease) ? hc::accelerator_scope : hc::system_scope); #endif }; diff --git a/projects/hip/tests/src/runtimeApi/memory/hipHostMalloc.cpp b/projects/hip/tests/src/runtimeApi/memory/hipHostMalloc.cpp index 31596b5ea5..0e88570e17 100644 --- a/projects/hip/tests/src/runtimeApi/memory/hipHostMalloc.cpp +++ b/projects/hip/tests/src/runtimeApi/memory/hipHostMalloc.cpp @@ -42,6 +42,63 @@ __global__ void Set(int *Ad, int val){ Ad[tx] = val; } + +#define SYNC_EVENT 0 +#define SYNC_STREAM 1 +#define SYNC_DEVICE 2 + +std::vector syncMsg = {"event", "stream", "device"}; + +void CheckHostPointer(int numElements, int *ptr, int syncMethod, std::string msg) +{ + std::cerr << "test: CheckHostPointer " << msg + << " ptr=" << ptr + << " syncMethod=" << syncMsg[syncMethod] << "\n"; + + hipStream_t s; + hipEvent_t e; + + // Init: + HIPCHECK(hipStreamCreate(&s)); + HIPCHECK(hipEventCreateWithFlags(&e, hipEventDisableSystemRelease)); + dim3 dimBlock(64,1,1); + dim3 dimGrid(numElements/dimBlock.x,1,1); + + const int expected = 13; + + // Init array to know state: + hipLaunchKernelGGL(Set, dimGrid, dimBlock, 0, 0x0, ptr, -42); + HIPCHECK(hipDeviceSynchronize()); + + hipLaunchKernelGGL(Set, dimGrid, dimBlock, 0, s, ptr, expected); + HIPCHECK(hipEventRecord(e, s)); + + // Host waits for event : + switch (syncMethod) { + case SYNC_EVENT: + HIPCHECK(hipEventSynchronize(e)); + break; + case SYNC_STREAM: + HIPCHECK(hipStreamSynchronize(s)); + break; + case SYNC_DEVICE: + HIPCHECK(hipDeviceSynchronize()); + break; + default: + assert(0); + }; + + for (int i=0; i Date: Sat, 27 May 2017 16:01:23 -0500 Subject: [PATCH 54/77] Add event controls for release fences. Env var : HIP_EVENT_SYS_RELEASE Event allocation flags : hipEventReleaseToDevice, hipEventReleaseToSystem (remove hipEventDisableSystemRelease) Update test for new functionality. [ROCm/hip commit: ec7102f4611852d4fbfe9191be4a6ed624c39588] --- .../include/hip/hcc_detail/hip_runtime_api.h | 3 ++- .../include/hip/nvcc_detail/hip_runtime_api.h | 3 ++- projects/hip/src/hip_event.cpp | 11 ++++++-- projects/hip/src/hip_hcc.cpp | 24 ++++++++++++----- .../src/runtimeApi/memory/hipHostMalloc.cpp | 27 ++++++++++++------- 5 files changed, 48 insertions(+), 20 deletions(-) diff --git a/projects/hip/include/hip/hcc_detail/hip_runtime_api.h b/projects/hip/include/hip/hcc_detail/hip_runtime_api.h index a8db84c4f2..6059e1e92d 100644 --- a/projects/hip/include/hip/hcc_detail/hip_runtime_api.h +++ b/projects/hip/include/hip/hcc_detail/hip_runtime_api.h @@ -106,7 +106,8 @@ enum hipLimit_t #define hipEventBlockingSync 0x1 ///< Waiting will yield CPU. Power-friendly and usage-friendly but may increase latency. #define hipEventDisableTiming 0x2 ///< Disable event's capability to record timing information. May improve performance. #define hipEventInterprocess 0x4 ///< Event can support IPC. @warning - not supported in HIP. -#define hipEventDisableSystemRelease 0x80000000 /// < Disable the system-scope release that event normally performs when it records. This flag is useful to obtain more precise timings of commands between events. The flag is a no-op on CUDA platforms. +#define hipEventReleaseToDevice 0x40000000 /// < Use a device-scope release when recording this event. This flag is useful to obtain more precise timings of commands between events. The flag is a no-op on CUDA platforms. +#define hipEventReleaseToSystem 0x80000000 /// < Use a system-scope release that when recording this event. This flag is useful to make non-coherent host memory visible to the host. The flag is a no-op on CUDA platforms. //! Flags that can be used with hipHostMalloc diff --git a/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h b/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h index cbc7ed9f9c..b09c9323c7 100644 --- a/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h +++ b/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h @@ -58,7 +58,8 @@ hipMemcpyHostToHost #define hipEventBlockingSync cudaEventBlockingSync #define hipEventDisableTiming cudaEventDisableTiming #define hipEventInterprocess cudaEventInterprocess -#define hipEventDisableSystemRelease cudaEventDefault /* no-op on CUDA platform */ +#define hipEventReleaseToDevice 0 /* no-op on CUDA platform */ +#define hipEventReleaseToSystem 0 /* no-op on CUDA platform */ #define hipHostMallocDefault cudaHostAllocDefault diff --git a/projects/hip/src/hip_event.cpp b/projects/hip/src/hip_event.cpp index 71f6d8ed5b..2c31769718 100644 --- a/projects/hip/src/hip_event.cpp +++ b/projects/hip/src/hip_event.cpp @@ -95,8 +95,15 @@ hipError_t ihipEventCreate(hipEvent_t* event, unsigned flags) unsigned supportedFlags = hipEventDefault | hipEventBlockingSync | hipEventDisableTiming - | hipEventDisableSystemRelease; - if ((flags & ~supportedFlags) == 0) { + | hipEventReleaseToDevice + | hipEventReleaseToSystem + ; + const unsigned releaseFlags = (hipEventReleaseToDevice | hipEventReleaseToSystem); + + const bool illegalFlags = (flags & ~supportedFlags) || // can't set any unsupported flags. + (flags & releaseFlags) == releaseFlags; // can't set both + + if (!illegalFlags) { ihipEvent_t *eh = new ihipEvent_t(flags); *event = eh; diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 5e13904521..4400e4596e 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -93,8 +93,11 @@ int HIP_SYNC_HOST_ALLOC = 1; // Sync on host between int HIP_SYNC_NULL_STREAM = 1; +// HIP needs to change some behavior based on HCC_OPT_FLUSH : int HCC_OPT_FLUSH = 0; +int HIP_EVENT_SYS_RELEASE=0; + @@ -330,12 +333,18 @@ void ihipStream_t::locked_recordEvent(hipEvent_t event) // Lock the stream to prevent simultaneous access LockedAccessor_StreamCrit_t crit(_criticalData); -#if USE_NO_SCOPE - //printf ("create_marker, flags = %x\n", event->_flags); - event->_marker = crit->_av.create_marker((event->_flags & hipEventDisableSystemRelease) ? hc::no_scope : hc::system_scope); -#else - event->_marker = crit->_av.create_marker((event->_flags & hipEventDisableSystemRelease) ? hc::accelerator_scope : hc::system_scope); -#endif + auto scopeFlag = hc::accelerator_scope; + // The env var HIP_EVENT_SYS_RELEASE sets the default, + // The explicit flags override the env var (if specified) + if (event->_flags & hipEventReleaseToSystem) { + scopeFlag = hc::system_scope; + } else if (event->_flags & hipEventReleaseToDevice) { + scopeFlag = hc::accelerator_scope; + } else { + scopeFlag = HIP_EVENT_SYS_RELEASE ? hc::system_scope : hc::accelerator_scope; + } + + event->_marker = crit->_av.create_marker(scopeFlag); }; //============================================================================= @@ -1221,7 +1230,8 @@ void HipReadEnv() READ_ENV_I(release, HIP_COHERENT_HOST_ALLOC, 0, "If set, all host memory will be allocated as fine-grained system memory. This allows threadfence_system to work but prevents host memory from being cached on GPU which may have performance impact."); - READ_ENV_I(release, HCC_OPT_FLUSH, 0, "Note this flag also impacts HCC. When set, use agent-scope flush rather than system-scope flush when possible."); + READ_ENV_I(release, HCC_OPT_FLUSH, 0, "When set, use agent-scope fence operations rather than system-scope fence operationsflush when possible. This flag controls both HIP and HCC behavior."); + READ_ENV_I(release, HIP_EVENT_SYS_RELEASE, 0, "If set, event are created with hipEventReleaseToSystem by default. If 0, events are created with hipEventReleaseToDevice by default. The defaults can be overridden by specifying hipEventReleaseToSystem or hipEventReleaseToDevice flag when creating the event."); // Some flags have both compile-time and runtime flags - generate a warning if user enables the runtime flag but the compile-time flag is disabled. if (HIP_DB && !COMPILE_HIP_DB) { diff --git a/projects/hip/tests/src/runtimeApi/memory/hipHostMalloc.cpp b/projects/hip/tests/src/runtimeApi/memory/hipHostMalloc.cpp index 0e88570e17..54073e4901 100644 --- a/projects/hip/tests/src/runtimeApi/memory/hipHostMalloc.cpp +++ b/projects/hip/tests/src/runtimeApi/memory/hipHostMalloc.cpp @@ -49,9 +49,12 @@ __global__ void Set(int *Ad, int val){ std::vector syncMsg = {"event", "stream", "device"}; -void CheckHostPointer(int numElements, int *ptr, int syncMethod, std::string msg) +void CheckHostPointer(int numElements, int *ptr, unsigned eventFlags, int syncMethod, std::string msg) { std::cerr << "test: CheckHostPointer " << msg + << " eventFlags = " << std::hex << eventFlags + << ((eventFlags & hipEventReleaseToDevice) ? " hipEventReleaseToDevice" : "") + << ((eventFlags & hipEventReleaseToSystem) ? " hipEventReleaseToSystem" : "") << " ptr=" << ptr << " syncMethod=" << syncMsg[syncMethod] << "\n"; @@ -60,7 +63,7 @@ void CheckHostPointer(int numElements, int *ptr, int syncMethod, std::string msg // Init: HIPCHECK(hipStreamCreate(&s)); - HIPCHECK(hipEventCreateWithFlags(&e, hipEventDisableSystemRelease)); + HIPCHECK(hipEventCreateWithFlags(&e, eventFlags)) dim3 dimBlock(64,1,1); dim3 dimGrid(numElements/dimBlock.x,1,1); @@ -161,18 +164,24 @@ int main(){ int *A = nullptr; HIPCHECK(hipHostMalloc((void**)&A, sizeBytes, hipHostMallocNonCoherent)); const char *ptrType = "non-coherent"; // TODO - //CheckHostPointer(numElements, A, SYNC_DEVICE, ptrType); - //CheckHostPointer(numElements, A, SYNC_STREAM, ptrType); - CheckHostPointer(numElements, A, SYNC_EVENT, ptrType); + CheckHostPointer(numElements, A, hipEventReleaseToSystem, SYNC_DEVICE, ptrType); + CheckHostPointer(numElements, A, hipEventReleaseToSystem, SYNC_STREAM, ptrType); + CheckHostPointer(numElements, A, hipEventReleaseToSystem, SYNC_EVENT, ptrType); + + // agent-scope releases don't provide host visibility, don't use them here: } - if (0) { // TODO, remove me + if (1) { int *A = nullptr; HIPCHECK(hipHostMalloc((void**)&A, sizeBytes, hipHostMallocCoherent)); const char *ptrType = "coherent"; - CheckHostPointer(numElements, A, SYNC_DEVICE, ptrType); - CheckHostPointer(numElements, A, SYNC_STREAM, ptrType); - CheckHostPointer(numElements, A, SYNC_EVENT, ptrType); + CheckHostPointer(numElements, A, hipEventReleaseToDevice, SYNC_DEVICE, ptrType); + CheckHostPointer(numElements, A, hipEventReleaseToDevice, SYNC_STREAM, ptrType); + CheckHostPointer(numElements, A, hipEventReleaseToDevice, SYNC_EVENT, ptrType); + + CheckHostPointer(numElements, A, hipEventReleaseToSystem, SYNC_DEVICE, ptrType); + CheckHostPointer(numElements, A, hipEventReleaseToSystem, SYNC_STREAM, ptrType); + CheckHostPointer(numElements, A, hipEventReleaseToSystem, SYNC_EVENT, ptrType); } From 56bf96c6df30dec42ef842bc60b252d882e3aa88 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Tue, 30 May 2017 15:45:22 +0530 Subject: [PATCH 55/77] Disable normcdfinvf on __host__ Change-Id: If7bfc9826a09eb9b7675ea2a417b9418759b7912 [ROCm/hip commit: 4ff01c971f695568ae9b71aa23f53a13e8d8e502] --- projects/hip/src/math_functions.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/projects/hip/src/math_functions.cpp b/projects/hip/src/math_functions.cpp index 151627fc73..f66f0a4312 100644 --- a/projects/hip/src/math_functions.cpp +++ b/projects/hip/src/math_functions.cpp @@ -937,10 +937,10 @@ __host__ void sincospi(double x, double *sptr, double *cptr) *cptr = std::cos(HIP_PI*x); } -__host__ float normcdfinvf(float x) -{ - return std::sqrt(2) * erfinvf(2*x-1); -} +//__host__ float normcdfinvf(float x) +//{ +// return std::sqrt(2) * erfinvf(2*x-1); +//} //__host__ double normcdfinv(double x) //{ From 8574437d3681573c4755ef7cb532843f25d5bb20 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Tue, 30 May 2017 17:58:13 +0300 Subject: [PATCH 56/77] [HIPIFY] Add more CUDA Driver API 8.0.44 Data structures. [ROCm/hip commit: 3e99bc23e72553377cae84c3270c5e66a0be3b33] --- projects/hip/hipify-clang/src/Cuda2Hip.cpp | 30 ++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/projects/hip/hipify-clang/src/Cuda2Hip.cpp b/projects/hip/hipify-clang/src/Cuda2Hip.cpp index f17c3e2646..b163c4d20a 100644 --- a/projects/hip/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/hip/hipify-clang/src/Cuda2Hip.cpp @@ -205,6 +205,7 @@ struct cuda2hipMap { cuda2hipRename["CUDA_ERROR_NOT_MAPPED_AS_ARRAY"] = {"hipErrorNotMappedAsArray", CONV_ERR, API_DRIVER}; // 212 cuda2hipRename["CUDA_ERROR_NOT_MAPPED_AS_POINTER"] = {"hipErrorNotMappedAsPointer", CONV_ERR, API_DRIVER}; // 213 cuda2hipRename["CUDA_ERROR_CONTEXT_ALREADY_IN_USE"] = {"hipErrorContextAlreadyInUse", CONV_ERR, API_DRIVER}; // 216 + cuda2hipRename["CUDA_ERROR_NVLINK_UNCORRECTABLE"] = {"hipErrorNvlinkUncorrectable", CONV_ERR, API_DRIVER}; // 220 [CUDA 8.0.44] cuda2hipRename["CUDA_ERROR_INVALID_SOURCE"] = {"hipErrorInvalidSource", CONV_ERR, API_DRIVER}; // 300 cuda2hipRename["CUDA_ERROR_FILE_NOT_FOUND"] = {"hipErrorFileNotFound", CONV_ERR, API_DRIVER}; // 301 cuda2hipRename["CUDA_ERROR_NOT_FOUND"] = {"hipErrorNotFound", CONV_ERR, API_DRIVER}; // 500 @@ -695,6 +696,10 @@ struct cuda2hipMap { cuda2hipRename["CU_JIT_LOG_VERBOSE"] = {"hipJitOptionLogVerbose", CONV_JIT, API_DRIVER}; cuda2hipRename["CU_JIT_GENERATE_LINE_INFO"] = {"hipJitOptionGenerateLineInfo", CONV_JIT, API_DRIVER}; cuda2hipRename["CU_JIT_CACHE_MODE"] = {"hipJitOptionCacheMode", CONV_JIT, API_DRIVER}; + // unsupported yet by HIP [CUDA 8.0.44] + cuda2hipRename["CU_JIT_NEW_SM3X_OPT"] = {"hipJitOptionSm3xOpt", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_FAST_COMPILE"] = {"hipJitOptionFastCompile", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_NUM_OPTIONS"] = {"hipJitOptionNumOptions", CONV_JIT, API_DRIVER}; // enum CUjit_target/CUjit_target_enum cuda2hipRename["CUjit_target"] = {"hipJitTarget", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) @@ -711,6 +716,11 @@ struct cuda2hipMap { cuda2hipRename["CU_TARGET_COMPUTE_37"] = {"hipJitTargetCompute37", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["CU_TARGET_COMPUTE_50"] = {"hipJitTargetCompute50", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["CU_TARGET_COMPUTE_52"] = {"hipJitTargetCompute52", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + // unsupported yet by HIP [CUDA 8.0.44] + cuda2hipRename["CU_TARGET_COMPUTE_53"] = {"hipJitTargetCompute53", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_TARGET_COMPUTE_60"] = {"hipJitTargetCompute60", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_TARGET_COMPUTE_61"] = {"hipJitTargetCompute61", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_TARGET_COMPUTE_62"] = {"hipJitTargetCompute62", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; // enum CUjitInputType/CUjitInputType_enum cuda2hipRename["CUjitInputType"] = {"hipJitInputType", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) cuda2hipRename["CUjitInputType_enum"] = {"hipJitInputType", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; @@ -831,6 +841,26 @@ struct cuda2hipMap { cuda2hipRename["CU_STREAM_DEFAULT"] = {"hipStreamDefault", CONV_STREAM, API_DRIVER}; cuda2hipRename["CU_STREAM_NON_BLOCKING"] = {"hipStreamNonBlocking", CONV_STREAM, API_DRIVER}; + // unsupported yet by HIP [CUDA 8.0.44] + // Flags for ::cuStreamWaitValue32 + cuda2hipRename["CUstreamWaitValue_flags"] = {"hipStreamWaitValueFlags", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + // cuda2hipRename["CUstreamWaitValue_flags_enum"] = {"hipStreamWaitValueFlags", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_STREAM_WAIT_VALUE_GEQ"] = {"hipStreamWaitValueGeq", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // 0x0 + cuda2hipRename["CU_STREAM_WAIT_VALUE_EQ"] = {"hipStreamWaitValueEq", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // 0x1 + cuda2hipRename["CU_STREAM_WAIT_VALUE_AND"] = {"hipStreamWaitValueAnd", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // 0x2 + cuda2hipRename["CU_STREAM_WAIT_VALUE_FLUSH"] = {"hipStreamWaitValueFlush", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // 1<<30 + // Flags for ::cuStreamWriteValue32 + cuda2hipRename["CUstreamWriteValue_flags"] = {"hipStreamWriteValueFlags", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + // cuda2hipRename["CUstreamWriteValue_flags"] = {"hipStreamWriteValueFlags", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_STREAM_WRITE_VALUE_DEFAULT"] = {"hipStreamWriteValueDefault", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // 0x0 + cuda2hipRename["CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER"] = {"hipStreamWriteValueNoMemoryBarrier", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // 0x1 + // Flags for ::cuStreamBatchMemOp + cuda2hipRename["CUstreamBatchMemOpType"] = {"hipStreamBatchMemOpType", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + // cuda2hipRename["CUstreamBatchMemOpType_enum"] = {"hipStreamBatchMemOpType", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_STREAM_MEM_OP_WAIT_VALUE_32"] = {"hipStreamBatchMemOpWaitValue32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // 1 + cuda2hipRename["CU_STREAM_MEM_OP_WRITE_VALUE_32"] = {"hipStreamBatchMemOpWriteValue32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // 2 + cuda2hipRename["CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES"] = {"hipStreamBatchMemOpFlushRemoteWrites", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // 3 + // Init cuda2hipRename["cuInit"] = {"hipInit", CONV_DRIVER, API_DRIVER}; From 8a05cf882fc0cbaa6c5212305f2b5e1d7243720b Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Tue, 30 May 2017 18:29:14 +0300 Subject: [PATCH 57/77] [HIPIFY] Add the rest CUDA Driver API 8.0.44 Data structures. + Memory advise values + Memory Range Attributes + P2P Attributes P.S. There is no any new changes in CUDA Driver API 8.0.61 Data structures since 8.0.44. [ROCm/hip commit: 1cc5f42e34013e69363de5d34bc13cb40c394239] --- projects/hip/hipify-clang/src/Cuda2Hip.cpp | 47 +++++++++++++++++----- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/projects/hip/hipify-clang/src/Cuda2Hip.cpp b/projects/hip/hipify-clang/src/Cuda2Hip.cpp index b163c4d20a..512144f3b9 100644 --- a/projects/hip/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/hip/hipify-clang/src/Cuda2Hip.cpp @@ -431,17 +431,36 @@ struct cuda2hipMap { cuda2hipRename["CU_COMPUTEMODE_EXCLUSIVE"] = {"hipComputeModeExclusive", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 1 // API_RUNTIME ANALOGUE (cudaComputeModeExclusive = 1) cuda2hipRename["CU_COMPUTEMODE_PROHIBITED"] = {"hipComputeModeProhibited", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 2 // API_RUNTIME ANALOGUE (cudaComputeModeProhibited = 2) cuda2hipRename["CU_COMPUTEMODE_EXCLUSIVE_PROCESS"] = {"hipComputeModeExclusiveProcess", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 3 // API_RUNTIME ANALOGUE (cudaComputeModeExclusiveProcess = 3) + + // unsupported yet by HIP [CUDA 8.0.44] + // Memory advise values + cuda2hipRename["CUmem_advise"] = {"hipMemAdvise", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + // cuda2hipRename["CUmem_advise_enum"] = {"hipMemAdvise", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_MEM_ADVISE_SET_READ_MOSTLY"] = {"hipMemAdviseSetReadMostly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 1 + cuda2hipRename["CU_MEM_ADVISE_UNSET_READ_MOSTLY"] = {"hipMemAdviseUnsetReadMostly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 2 + cuda2hipRename["CU_MEM_ADVISE_SET_PREFERRED_LOCATION"] = {"hipMemAdviseSetPreferredLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 3 + cuda2hipRename["CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION"] = {"hipMemAdviseUnsetPreferredLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 4 + cuda2hipRename["CU_MEM_ADVISE_SET_ACCESSED_BY"] = {"hipMemAdviseSetAccessedBy", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 5 + cuda2hipRename["CU_MEM_ADVISE_UNSET_ACCESSED_BY"] = {"hipMemAdviseUnsetAccessedBy", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 6 + // CUmem_range_attribute + cuda2hipRename["CUmem_range_attribute"] = {"hipMemRangeAttribute", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + // cuda2hipRename["CUmem_range_attribute_enum"] = {"hipMemRangeAttribute", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY"] = {"hipMemRangeAttributeReadMostly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 1 + cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION"] = {"hipMemRangeAttributePreferredLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 2 + cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY"] = {"hipMemRangeAttributeAccessedBy", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 3 + cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION"] = {"hipMemRangeAttributeLastPrefetchLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 4 + // Context flags - cuda2hipRename["CUctx_flags"] = {"hipCctx_flags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_CTX_SCHED_AUTO"] = {"HIP_CTX_SCHED_AUTO", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x00 - cuda2hipRename["CU_CTX_SCHED_SPIN"] = {"HIP_CTX_SCHED_SPIN", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 - cuda2hipRename["CU_CTX_SCHED_YIELD"] = {"HIP_CTX_SCHED_YIELD", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 - cuda2hipRename["CU_CTX_SCHED_BLOCKING_SYNC"] = {"HIP_CTX_SCHED_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 - cuda2hipRename["CU_CTX_BLOCKING_SYNC"] = {"HIP_CTX_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 - cuda2hipRename["CU_CTX_SCHED_MASK"] = {"HIP_CTX_SCHED_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x07 - cuda2hipRename["CU_CTX_MAP_HOST"] = {"HIP_CTX_MAP_HOST", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x08 - cuda2hipRename["CU_CTX_LMEM_RESIZE_TO_MAX"] = {"HIP_CTX_LMEM_RESIZE_TO_MAX", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x10 - cuda2hipRename["CU_CTX_FLAGS_MASK"] = {"HIP_CTX_FLAGS_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x1f + cuda2hipRename["CUctx_flags"] = {"hipCctx_flags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_CTX_SCHED_AUTO"] = {"HIP_CTX_SCHED_AUTO", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x00 + cuda2hipRename["CU_CTX_SCHED_SPIN"] = {"HIP_CTX_SCHED_SPIN", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CU_CTX_SCHED_YIELD"] = {"HIP_CTX_SCHED_YIELD", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 + cuda2hipRename["CU_CTX_SCHED_BLOCKING_SYNC"] = {"HIP_CTX_SCHED_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 + cuda2hipRename["CU_CTX_BLOCKING_SYNC"] = {"HIP_CTX_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 + cuda2hipRename["CU_CTX_SCHED_MASK"] = {"HIP_CTX_SCHED_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x07 + cuda2hipRename["CU_CTX_MAP_HOST"] = {"HIP_CTX_MAP_HOST", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x08 + cuda2hipRename["CU_CTX_LMEM_RESIZE_TO_MAX"] = {"HIP_CTX_LMEM_RESIZE_TO_MAX", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x10 + cuda2hipRename["CU_CTX_FLAGS_MASK"] = {"HIP_CTX_FLAGS_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x1f // Defines cuda2hipRename["CU_LAUNCH_PARAM_BUFFER_POINTER"] = {"HIP_LAUNCH_PARAM_BUFFER_POINTER", CONV_DEV, API_DRIVER}; // ((void*)0x01) @@ -908,6 +927,14 @@ struct cuda2hipMap { cuda2hipRename["cuDeviceComputeCapability"] = {"hipDeviceComputeCapability", CONV_DEV, API_DRIVER}; cuda2hipRename["cuDeviceCanAccessPeer"] = {"hipDeviceCanAccessPeer", CONV_DEV, API_DRIVER}; + // unsupported yet by HIP [CUDA 8.0.44] + // P2P Attributes + cuda2hipRename["CUdevice_P2PAttribute"] = {"hipDeviceP2PAttribute", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; + // cuda2hipRename["CUdevice_P2PAttribute_enum"] = {"hipDeviceP2PAttribute", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK"] = {"hipDeviceP2PAttributePerformanceRank", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED"] = {"hipDeviceP2PAttributeAccessSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 + cuda2hipRename["CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED"] = {"hipDeviceP2PAttributeNativeAtomicSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x03 + // Events // pointer to CUevent_st cuda2hipRename["CUevent"] = {"hipEvent_t", CONV_TYPE, API_DRIVER}; From f99ab93b6113046876a2596ec35e4175af1ae580 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Tue, 30 May 2017 19:45:59 +0300 Subject: [PATCH 58/77] [HIPIFY] Add the rest CUDA Runtime API 8.0.44 Data structures. + sync with corresponding CUDA Driver API Data structures. P.S. There is no any new changes in CUDA Runtime API 8.0.61 Data structures since 8.0.44. [ROCm/hip commit: 063539308ec68e7c45353147e696b028478bf99e] --- projects/hip/hipify-clang/src/Cuda2Hip.cpp | 269 +++++++++++---------- 1 file changed, 147 insertions(+), 122 deletions(-) diff --git a/projects/hip/hipify-clang/src/Cuda2Hip.cpp b/projects/hip/hipify-clang/src/Cuda2Hip.cpp index 512144f3b9..59d05e69f7 100644 --- a/projects/hip/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/hip/hipify-clang/src/Cuda2Hip.cpp @@ -205,7 +205,6 @@ struct cuda2hipMap { cuda2hipRename["CUDA_ERROR_NOT_MAPPED_AS_ARRAY"] = {"hipErrorNotMappedAsArray", CONV_ERR, API_DRIVER}; // 212 cuda2hipRename["CUDA_ERROR_NOT_MAPPED_AS_POINTER"] = {"hipErrorNotMappedAsPointer", CONV_ERR, API_DRIVER}; // 213 cuda2hipRename["CUDA_ERROR_CONTEXT_ALREADY_IN_USE"] = {"hipErrorContextAlreadyInUse", CONV_ERR, API_DRIVER}; // 216 - cuda2hipRename["CUDA_ERROR_NVLINK_UNCORRECTABLE"] = {"hipErrorNvlinkUncorrectable", CONV_ERR, API_DRIVER}; // 220 [CUDA 8.0.44] cuda2hipRename["CUDA_ERROR_INVALID_SOURCE"] = {"hipErrorInvalidSource", CONV_ERR, API_DRIVER}; // 300 cuda2hipRename["CUDA_ERROR_FILE_NOT_FOUND"] = {"hipErrorFileNotFound", CONV_ERR, API_DRIVER}; // 301 cuda2hipRename["CUDA_ERROR_NOT_FOUND"] = {"hipErrorNotFound", CONV_ERR, API_DRIVER}; // 500 @@ -325,6 +324,9 @@ struct cuda2hipMap { cuda2hipRename["CUDA_ERROR_INVALID_GRAPHICS_CONTEXT"] = {"hipErrorInvalidGraphicsContext", CONV_ERR, API_DRIVER}; // 219 cuda2hipRename["cudaErrorInvalidGraphicsContext"] = {"hipErrorInvalidGraphicsContext", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 79 + cuda2hipRename["CUDA_ERROR_NVLINK_UNCORRECTABLE"] = {"hipErrorNvlinkUncorrectable", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 220 [CUDA 8.0.44] + cuda2hipRename["cudaErrorNvlinkUncorrectable"] = {"hipErrorNvlinkUncorrectable", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 80 [CUDA 8.0.44] + cuda2hipRename["CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND"] = {"hipErrorSharedObjectSymbolNotFound", CONV_ERR, API_DRIVER}; // 302 cuda2hipRename["cudaErrorSharedObjectSymbolNotFound"] = {"hipErrorSharedObjectSymbolNotFound", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 40 @@ -434,21 +436,21 @@ struct cuda2hipMap { // unsupported yet by HIP [CUDA 8.0.44] // Memory advise values - cuda2hipRename["CUmem_advise"] = {"hipMemAdvise", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CUmem_advise"] = {"hipMemAdvise", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_RUNTIME ANALOGUE (cudaComputeMode) // cuda2hipRename["CUmem_advise_enum"] = {"hipMemAdvise", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_MEM_ADVISE_SET_READ_MOSTLY"] = {"hipMemAdviseSetReadMostly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 1 - cuda2hipRename["CU_MEM_ADVISE_UNSET_READ_MOSTLY"] = {"hipMemAdviseUnsetReadMostly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 2 - cuda2hipRename["CU_MEM_ADVISE_SET_PREFERRED_LOCATION"] = {"hipMemAdviseSetPreferredLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 3 - cuda2hipRename["CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION"] = {"hipMemAdviseUnsetPreferredLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 4 - cuda2hipRename["CU_MEM_ADVISE_SET_ACCESSED_BY"] = {"hipMemAdviseSetAccessedBy", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 5 - cuda2hipRename["CU_MEM_ADVISE_UNSET_ACCESSED_BY"] = {"hipMemAdviseUnsetAccessedBy", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 6 + cuda2hipRename["CU_MEM_ADVISE_SET_READ_MOSTLY"] = {"hipMemAdviseSetReadMostly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 1 // API_RUNTIME ANALOGUE (cudaMemAdviseSetReadMostly = 1) + cuda2hipRename["CU_MEM_ADVISE_UNSET_READ_MOSTLY"] = {"hipMemAdviseUnsetReadMostly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 2 // API_RUNTIME ANALOGUE (cudaMemAdviseUnsetReadMostly = 2) + cuda2hipRename["CU_MEM_ADVISE_SET_PREFERRED_LOCATION"] = {"hipMemAdviseSetPreferredLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 3 // API_RUNTIME ANALOGUE (cudaMemAdviseSetPreferredLocation = 3) + cuda2hipRename["CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION"] = {"hipMemAdviseUnsetPreferredLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 4 // API_RUNTIME ANALOGUE (cudaMemAdviseUnsetPreferredLocation = 4) + cuda2hipRename["CU_MEM_ADVISE_SET_ACCESSED_BY"] = {"hipMemAdviseSetAccessedBy", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 5 // API_RUNTIME ANALOGUE (cudaMemAdviseSetAccessedBy = 5) + cuda2hipRename["CU_MEM_ADVISE_UNSET_ACCESSED_BY"] = {"hipMemAdviseUnsetAccessedBy", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 6 // API_RUNTIME ANALOGUE (cudaMemAdviseUnsetAccessedBy = 6) // CUmem_range_attribute - cuda2hipRename["CUmem_range_attribute"] = {"hipMemRangeAttribute", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CUmem_range_attribute"] = {"hipMemRangeAttribute", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_RUNTIME ANALOGUE (cudaMemRangeAttribute) // cuda2hipRename["CUmem_range_attribute_enum"] = {"hipMemRangeAttribute", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY"] = {"hipMemRangeAttributeReadMostly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 1 - cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION"] = {"hipMemRangeAttributePreferredLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 2 - cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY"] = {"hipMemRangeAttributeAccessedBy", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 3 - cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION"] = {"hipMemRangeAttributeLastPrefetchLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 4 + cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY"] = {"hipMemRangeAttributeReadMostly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 1 // API_RUNTIME ANALOGUE (cudaMemRangeAttributeReadMostly = 1) + cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION"] = {"hipMemRangeAttributePreferredLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 2 // API_RUNTIME ANALOGUE (cudaMemRangeAttributePreferredLocation = 2) + cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY"] = {"hipMemRangeAttributeAccessedBy", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 3 // API_RUNTIME ANALOGUE (cudaMemRangeAttributeAccessedBy = 3) + cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION"] = {"hipMemRangeAttributeLastPrefetchLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 4 // API_RUNTIME ANALOGUE (cudaMemRangeAttributeLastPrefetchLocation = 4) // Context flags cuda2hipRename["CUctx_flags"] = {"hipCctx_flags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; @@ -502,111 +504,111 @@ struct cuda2hipMap { cuda2hipRename["CUarray"] = {"hipArray *", CONV_TYPE, API_DRIVER}; // API_Runtime ANALOGUE (cudaArray_t) // unsupported yet by HIP - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK"] = {"hipDeviceAttributeMaxThreadsPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 1 // API_Runtime ANALOGUE (cudaDevAttrMaxThreadsPerBlock = 1) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X"] = {"hipDeviceAttributeMaxBlockDimX", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 2 // API_Runtime ANALOGUE (cudaDevAttrMaxBlockDimX = 2) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y"] = {"hipDeviceAttributeMaxBlockDimY", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 3 // API_Runtime ANALOGUE (cudaDevAttrMaxBlockDimY = 3) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z"] = {"hipDeviceAttributeMaxBlockDimZ", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 4 // API_Runtime ANALOGUE (cudaDevAttrMaxBlockDimZ = 4) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X"] = {"hipDeviceAttributeMaxGridDimX", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 5 // API_Runtime ANALOGUE (cudaDevAttrMaxGridDimX =5) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y"] = {"hipDeviceAttributeMaxGridDimY", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 6 // API_Runtime ANALOGUE (cudaDevAttrMaxGridDimY = 6) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z"] = {"hipDeviceAttributeMaxGridDimZ", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 7 // API_Runtime ANALOGUE (cudaDevAttrMaxGridDimZ - 7) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK"] = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 8 // API_Runtime ANALOGUE (cudaDevAttrMaxSharedMemoryPerBlock = 8) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK"] = {"hipDeviceAttributeMaxThreadsPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 1 // API_Runtime ANALOGUE (cudaDevAttrMaxThreadsPerBlock = 1) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X"] = {"hipDeviceAttributeMaxBlockDimX", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 2 // API_Runtime ANALOGUE (cudaDevAttrMaxBlockDimX = 2) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y"] = {"hipDeviceAttributeMaxBlockDimY", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 3 // API_Runtime ANALOGUE (cudaDevAttrMaxBlockDimY = 3) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z"] = {"hipDeviceAttributeMaxBlockDimZ", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 4 // API_Runtime ANALOGUE (cudaDevAttrMaxBlockDimZ = 4) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X"] = {"hipDeviceAttributeMaxGridDimX", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 5 // API_Runtime ANALOGUE (cudaDevAttrMaxGridDimX =5) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y"] = {"hipDeviceAttributeMaxGridDimY", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 6 // API_Runtime ANALOGUE (cudaDevAttrMaxGridDimY = 6) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z"] = {"hipDeviceAttributeMaxGridDimZ", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 7 // API_Runtime ANALOGUE (cudaDevAttrMaxGridDimZ - 7) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK"] = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 8 // API_Runtime ANALOGUE (cudaDevAttrMaxSharedMemoryPerBlock = 8) // Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK - cuda2hipRename["CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK"] = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 8 - cuda2hipRename["CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY"] = {"hipDeviceAttributeTotalConstantMemory", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 9 // API_Runtime ANALOGUE (cudaDevAttrTotalConstantMemory = 9) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_WARP_SIZE"] = {"hipDeviceAttributeWarpSize", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 10 // API_Runtime ANALOGUE (cudaDevAttrWarpSize = 10) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_PITCH"] = {"hipDeviceAttributeMaxPitch", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 11 // API_Runtime ANALOGUE (cudaDevAttrMaxPitch = 11) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK"] = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 12 // API_Runtime ANALOGUE (cudaDevAttrMaxRegistersPerBlock = 12) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK"] = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 12 - cuda2hipRename["CU_DEVICE_ATTRIBUTE_CLOCK_RATE"] = {"hipDeviceAttributeClockRate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 13 // API_Runtime ANALOGUE (cudaDevAttrMaxRegistersPerBlock = 13) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT"] = {"hipDeviceAttributeTextureAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 14 // API_Runtime ANALOGUE (cudaDevAttrTextureAlignment = 14) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK"] = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 8 + cuda2hipRename["CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY"] = {"hipDeviceAttributeTotalConstantMemory", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 9 // API_Runtime ANALOGUE (cudaDevAttrTotalConstantMemory = 9) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_WARP_SIZE"] = {"hipDeviceAttributeWarpSize", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 10 // API_Runtime ANALOGUE (cudaDevAttrWarpSize = 10) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_PITCH"] = {"hipDeviceAttributeMaxPitch", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 11 // API_Runtime ANALOGUE (cudaDevAttrMaxPitch = 11) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK"] = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 12 // API_Runtime ANALOGUE (cudaDevAttrMaxRegistersPerBlock = 12) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK"] = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 12 + cuda2hipRename["CU_DEVICE_ATTRIBUTE_CLOCK_RATE"] = {"hipDeviceAttributeClockRate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 13 // API_Runtime ANALOGUE (cudaDevAttrMaxRegistersPerBlock = 13) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT"] = {"hipDeviceAttributeTextureAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 14 // API_Runtime ANALOGUE (cudaDevAttrTextureAlignment = 14) // Deprecated. Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT - cuda2hipRename["CU_DEVICE_ATTRIBUTE_GPU_OVERLAP"] = {"hipDeviceAttributeAsyncEngineCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 15 // API_Runtime ANALOGUE (cudaDevAttrGpuOverlap = 15) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT"] = {"hipDeviceAttributeMultiprocessorCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 16 // API_Runtime ANALOGUE (cudaDevAttrMultiProcessorCount = 16) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT"] = {"hipDeviceAttributeKernelExecTimeout", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 17 // API_Runtime ANALOGUE (cudaDevAttrKernelExecTimeout = 17) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_INTEGRATED"] = {"hipDeviceAttributeIntegrated", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 18 // API_Runtime ANALOGUE (cudaDevAttrIntegrated = 18) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY"] = {"hipDeviceAttributeCanMapHostMemory", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 19 // API_Runtime ANALOGUE (cudaDevAttrCanMapHostMemory = 19) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_MODE"] = {"hipDeviceAttributeComputeMode", CONV_DEV, API_DRIVER}; // 20 // API_Runtime ANALOGUE (cudaDevAttrComputeMode = 20) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH"] = {"hipDeviceAttributeMaxTexture1DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 21 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DWidth = 21) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH"] = {"hipDeviceAttributeMaxTexture2DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 22 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DWidth = 22) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 23 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DHeight = 23) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH"] = {"hipDeviceAttributeMaxTexture3DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 24 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DWidth = 24) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT"] = {"hipDeviceAttributeMaxTexture3DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 25 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DHeight = 25) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH"] = {"hipDeviceAttributeMaxTexture3DDepth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 26 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DDepth = 26) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxTexture2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 27 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredWidth = 27) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 28 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredHeight = 28) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxTexture2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 29 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredLayers = 29) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_GPU_OVERLAP"] = {"hipDeviceAttributeAsyncEngineCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 15 // API_Runtime ANALOGUE (cudaDevAttrGpuOverlap = 15) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT"] = {"hipDeviceAttributeMultiprocessorCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 16 // API_Runtime ANALOGUE (cudaDevAttrMultiProcessorCount = 16) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT"] = {"hipDeviceAttributeKernelExecTimeout", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 17 // API_Runtime ANALOGUE (cudaDevAttrKernelExecTimeout = 17) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_INTEGRATED"] = {"hipDeviceAttributeIntegrated", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 18 // API_Runtime ANALOGUE (cudaDevAttrIntegrated = 18) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY"] = {"hipDeviceAttributeCanMapHostMemory", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 19 // API_Runtime ANALOGUE (cudaDevAttrCanMapHostMemory = 19) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_MODE"] = {"hipDeviceAttributeComputeMode", CONV_DEV, API_DRIVER}; // 20 // API_Runtime ANALOGUE (cudaDevAttrComputeMode = 20) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH"] = {"hipDeviceAttributeMaxTexture1DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 21 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DWidth = 21) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH"] = {"hipDeviceAttributeMaxTexture2DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 22 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DWidth = 22) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 23 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DHeight = 23) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH"] = {"hipDeviceAttributeMaxTexture3DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 24 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DWidth = 24) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT"] = {"hipDeviceAttributeMaxTexture3DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 25 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DHeight = 25) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH"] = {"hipDeviceAttributeMaxTexture3DDepth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 26 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DDepth = 26) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxTexture2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 27 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredWidth = 27) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 28 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredHeight = 28) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxTexture2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 29 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredLayers = 29) // Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH"] = {"hipDeviceAttributeMaxTexture2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 27 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredWidth = 27) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH"] = {"hipDeviceAttributeMaxTexture2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 27 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredWidth = 27) // Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 28 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredHeight = 28) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 28 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredHeight = 28) // Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES"] = {"hipDeviceAttributeMaxTexture2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 29 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredLayers = 29) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT"] = {"hipDeviceAttributeSurfaceAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 30 // API_Runtime ANALOGUE (cudaDevAttrSurfaceAlignment = 30) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS"] = {"hipDeviceAttributeConcurrentKernels", CONV_DEV, API_DRIVER}; // 31 // API_Runtime ANALOGUE (cudaDevAttrConcurrentKernels = 31) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_ECC_ENABLED"] = {"hipDeviceAttributeEccEnabled", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 32 // API_Runtime ANALOGUE (cudaDevAttrEccEnabled = 32) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_BUS_ID"] = {"hipDeviceAttributePciBusId", CONV_DEV, API_DRIVER}; // 33 // API_Runtime ANALOGUE (cudaDevAttrPciBusId = 33) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID"] = {"hipDeviceAttributePciDeviceId", CONV_DEV, API_DRIVER}; // 34 // API_Runtime ANALOGUE (cudaDevAttrPciDeviceId = 34) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_TCC_DRIVER"] = {"hipDeviceAttributeTccDriver", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 35 // API_Runtime ANALOGUE (cudaDevAttrTccDriver = 35) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE"] = {"hipDeviceAttributeMemoryClockRate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 36 // API_Runtime ANALOGUE (cudaDevAttrMemoryClockRate = 36) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH"] = {"hipDeviceAttributeMemoryBusWidth", CONV_DEV, API_DRIVER}; // 37 // API_Runtime ANALOGUE (cudaDevAttrGlobalMemoryBusWidth = 37) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE"] = {"hipDeviceAttributeL2CacheSize", CONV_DEV, API_DRIVER}; // 38 // API_Runtime ANALOGUE (cudaDevAttrL2CacheSize = 38) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR"] = {"hipDeviceAttributeMaxThreadsPerMultiProcessor", CONV_DEV, API_DRIVER}; // 39 // API_Runtime ANALOGUE (cudaDevAttrMaxThreadsPerMultiProcessor = 39) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT"] = {"hipDeviceAttributeAsyncEngineCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 40 // API_Runtime ANALOGUE (cudaDevAttrAsyncEngineCount = 40) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING"] = {"hipDeviceAttributeUnifiedAddressing", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 41 // API_Runtime ANALOGUE (cudaDevAttrUnifiedAddressing = 41) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxTexture1DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 42 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DLayeredWidth = 42) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxTexture1DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 43 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DLayeredLayers = 43) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES"] = {"hipDeviceAttributeMaxTexture2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 29 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredLayers = 29) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT"] = {"hipDeviceAttributeSurfaceAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 30 // API_Runtime ANALOGUE (cudaDevAttrSurfaceAlignment = 30) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS"] = {"hipDeviceAttributeConcurrentKernels", CONV_DEV, API_DRIVER}; // 31 // API_Runtime ANALOGUE (cudaDevAttrConcurrentKernels = 31) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_ECC_ENABLED"] = {"hipDeviceAttributeEccEnabled", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 32 // API_Runtime ANALOGUE (cudaDevAttrEccEnabled = 32) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_BUS_ID"] = {"hipDeviceAttributePciBusId", CONV_DEV, API_DRIVER}; // 33 // API_Runtime ANALOGUE (cudaDevAttrPciBusId = 33) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID"] = {"hipDeviceAttributePciDeviceId", CONV_DEV, API_DRIVER}; // 34 // API_Runtime ANALOGUE (cudaDevAttrPciDeviceId = 34) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_TCC_DRIVER"] = {"hipDeviceAttributeTccDriver", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 35 // API_Runtime ANALOGUE (cudaDevAttrTccDriver = 35) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE"] = {"hipDeviceAttributeMemoryClockRate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 36 // API_Runtime ANALOGUE (cudaDevAttrMemoryClockRate = 36) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH"] = {"hipDeviceAttributeMemoryBusWidth", CONV_DEV, API_DRIVER}; // 37 // API_Runtime ANALOGUE (cudaDevAttrGlobalMemoryBusWidth = 37) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE"] = {"hipDeviceAttributeL2CacheSize", CONV_DEV, API_DRIVER}; // 38 // API_Runtime ANALOGUE (cudaDevAttrL2CacheSize = 38) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR"] = {"hipDeviceAttributeMaxThreadsPerMultiProcessor", CONV_DEV, API_DRIVER}; // 39 // API_Runtime ANALOGUE (cudaDevAttrMaxThreadsPerMultiProcessor = 39) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT"] = {"hipDeviceAttributeAsyncEngineCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 40 // API_Runtime ANALOGUE (cudaDevAttrAsyncEngineCount = 40) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING"] = {"hipDeviceAttributeUnifiedAddressing", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 41 // API_Runtime ANALOGUE (cudaDevAttrUnifiedAddressing = 41) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxTexture1DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 42 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DLayeredWidth = 42) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxTexture1DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 43 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DLayeredLayers = 43) // deprecated, do not use - cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER"] = {"hipDeviceAttributeCanTex2DGather", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 44 // API_Runtime ANALOGUE (no) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH"] = {"hipDeviceAttributeMaxTexture2DGatherWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 45 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DGatherWidth = 45) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DGatherHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 46 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DGatherHeight = 46) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE"] = {"hipDeviceAttributeMaxTexture3DWidthAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 47 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DWidthAlt = 47) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE"] = {"hipDeviceAttributeMaxTexture3DHeightAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 48 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DHeightAlt = 48) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE"] = {"hipDeviceAttributeMaxTexture3DDepthAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 49 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DDepthAlt = 49) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID"] = {"hipDeviceAttributePciDomainId", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 50 // API_Runtime ANALOGUE (cudaDevAttrPciDomainId = 50) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT"] = {"hipDeviceAttributeTexturePitchAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 51 // API_Runtime ANALOGUE (cudaDevAttrTexturePitchAlignment = 51) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH"] = {"hipDeviceAttributeMaxTextureCubemapWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 52 // API_Runtime ANALOGUE (cudaDevAttrMaxTextureCubemapWidth = 52) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxTextureCubemapLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 53 // API_Runtime ANALOGUE (cudaDevAttrMaxTextureCubemapLayeredWidth = 53) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxTextureCubemapLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 54 // API_Runtime ANALOGUE (cudaDevAttrMaxTextureCubemapLayeredLayers = 54) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH"] = {"hipDeviceAttributeMaxSurface1DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 55 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface1DWidth = 55) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH"] = {"hipDeviceAttributeMaxSurface2DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 56 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DWidth = 56) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT"] = {"hipDeviceAttributeMaxSurface2DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 57 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DHeight = 57) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH"] = {"hipDeviceAttributeMaxSurface3DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 58 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface3DWidth = 58) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT"] = {"hipDeviceAttributeMaxSurface3DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 59 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface3DHeight = 59) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH"] = {"hipDeviceAttributeMaxSurface3DDepth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 60 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface3DDepth = 60) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxSurface1DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 61 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface1DLayeredWidth = 61) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxSurface1DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 62 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface1DLayeredLayers = 62) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxSurface2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 63 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DLayeredWidth = 63) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT"] = {"hipDeviceAttributeMaxSurface2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 64 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DLayeredHeight = 64) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxSurface2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 65 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DLayeredLayers = 65) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH"] = {"hipDeviceAttributeMaxSurfaceCubemapWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 66 // API_Runtime ANALOGUE (cudaDevAttrMaxSurfaceCubemapWidth = 66) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxSurfaceCubemapLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 67 // API_Runtime ANALOGUE (cudaDevAttrMaxSurfaceCubemapLayeredWidth = 67) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxSurfaceCubemapLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 68 // API_Runtime ANALOGUE (cudaDevAttrMaxSurfaceCubemapLayeredLayers = 68) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH"] = {"hipDeviceAttributeMaxTexture1DLinearWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 69 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DLinearWidth = 69) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH"] = {"hipDeviceAttributeMaxTexture2DLinearWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 70 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLinearWidth = 70) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DLinearHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 71 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLinearHeight = 71) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH"] = {"hipDeviceAttributeMaxTexture2DLinearPitch", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 72 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLinearPitch = 72) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH"] = {"hipDeviceAttributeMaxTexture2DMipmappedWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 73 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DMipmappedWidth = 73) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DMipmappedHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 74 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DMipmappedHeight = 74) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR"] = {"hipDeviceAttributeComputeCapabilityMajor", CONV_DEV, API_DRIVER}; // 75 // API_Runtime ANALOGUE (cudaDevAttrComputeCapabilityMajor = 75) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR"] = {"hipDeviceAttributeComputeCapabilityMinor", CONV_DEV, API_DRIVER}; // 76 // API_Runtime ANALOGUE (cudaDevAttrComputeCapabilityMinor = 76) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH"] = {"hipDeviceAttributeMaxTexture1DMipmappedWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 77 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DMipmappedWidth = 77) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED"] = {"hipDeviceAttributeStreamPrioritiesSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 78 // API_Runtime ANALOGUE (cudaDevAttrStreamPrioritiesSupported = 78) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED"] = {"hipDeviceAttributeGlobalL1CacheSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 79 // API_Runtime ANALOGUE (cudaDevAttrGlobalL1CacheSupported = 79) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED"] = {"hipDeviceAttributeLocalL1CacheSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 80 // API_Runtime ANALOGUE (cudaDevAttrLocalL1CacheSupported = 80) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR"] = {"hipDeviceAttributeMaxSharedMemoryPerMultiprocessor", CONV_DEV, API_DRIVER}; // 81 // API_Runtime ANALOGUE (cudaDevAttrMaxSharedMemoryPerMultiprocessor = 81) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR"] = {"hipDeviceAttributeMaxRegistersPerMultiprocessor", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 82 // API_Runtime ANALOGUE (cudaDevAttrMaxRegistersPerMultiprocessor = 82) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY"] = {"hipDeviceAttributeManagedMemory", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 83 // API_Runtime ANALOGUE (cudaDevAttrManagedMemory = 83) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD"] = {"hipDeviceAttributeIsMultiGpuBoard", CONV_DEV, API_DRIVER}; // 84 // API_Runtime ANALOGUE (cudaDevAttrIsMultiGpuBoard = 84) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID"] = {"hipDeviceAttributeMultiGpuBoardGroupId", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 85 // API_Runtime ANALOGUE (cudaDevAttrMultiGpuBoardGroupID = 85) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX"] = {"hipDeviceAttributeMax", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 86 // API_Runtime ANALOGUE (no) - + cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER"] = {"hipDeviceAttributeCanTex2DGather", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 44 // API_Runtime ANALOGUE (no) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH"] = {"hipDeviceAttributeMaxTexture2DGatherWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 45 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DGatherWidth = 45) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DGatherHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 46 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DGatherHeight = 46) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE"] = {"hipDeviceAttributeMaxTexture3DWidthAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 47 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DWidthAlt = 47) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE"] = {"hipDeviceAttributeMaxTexture3DHeightAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 48 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DHeightAlt = 48) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE"] = {"hipDeviceAttributeMaxTexture3DDepthAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 49 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DDepthAlt = 49) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID"] = {"hipDeviceAttributePciDomainId", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 50 // API_Runtime ANALOGUE (cudaDevAttrPciDomainId = 50) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT"] = {"hipDeviceAttributeTexturePitchAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 51 // API_Runtime ANALOGUE (cudaDevAttrTexturePitchAlignment = 51) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH"] = {"hipDeviceAttributeMaxTextureCubemapWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 52 // API_Runtime ANALOGUE (cudaDevAttrMaxTextureCubemapWidth = 52) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxTextureCubemapLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 53 // API_Runtime ANALOGUE (cudaDevAttrMaxTextureCubemapLayeredWidth = 53) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxTextureCubemapLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 54 // API_Runtime ANALOGUE (cudaDevAttrMaxTextureCubemapLayeredLayers = 54) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH"] = {"hipDeviceAttributeMaxSurface1DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 55 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface1DWidth = 55) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH"] = {"hipDeviceAttributeMaxSurface2DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 56 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DWidth = 56) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT"] = {"hipDeviceAttributeMaxSurface2DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 57 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DHeight = 57) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH"] = {"hipDeviceAttributeMaxSurface3DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 58 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface3DWidth = 58) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT"] = {"hipDeviceAttributeMaxSurface3DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 59 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface3DHeight = 59) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH"] = {"hipDeviceAttributeMaxSurface3DDepth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 60 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface3DDepth = 60) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxSurface1DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 61 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface1DLayeredWidth = 61) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxSurface1DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 62 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface1DLayeredLayers = 62) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxSurface2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 63 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DLayeredWidth = 63) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT"] = {"hipDeviceAttributeMaxSurface2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 64 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DLayeredHeight = 64) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxSurface2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 65 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DLayeredLayers = 65) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH"] = {"hipDeviceAttributeMaxSurfaceCubemapWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 66 // API_Runtime ANALOGUE (cudaDevAttrMaxSurfaceCubemapWidth = 66) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxSurfaceCubemapLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 67 // API_Runtime ANALOGUE (cudaDevAttrMaxSurfaceCubemapLayeredWidth = 67) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxSurfaceCubemapLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 68 // API_Runtime ANALOGUE (cudaDevAttrMaxSurfaceCubemapLayeredLayers = 68) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH"] = {"hipDeviceAttributeMaxTexture1DLinearWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 69 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DLinearWidth = 69) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH"] = {"hipDeviceAttributeMaxTexture2DLinearWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 70 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLinearWidth = 70) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DLinearHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 71 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLinearHeight = 71) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH"] = {"hipDeviceAttributeMaxTexture2DLinearPitch", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 72 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLinearPitch = 72) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH"] = {"hipDeviceAttributeMaxTexture2DMipmappedWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 73 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DMipmappedWidth = 73) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DMipmappedHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 74 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DMipmappedHeight = 74) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR"] = {"hipDeviceAttributeComputeCapabilityMajor", CONV_DEV, API_DRIVER}; // 75 // API_Runtime ANALOGUE (cudaDevAttrComputeCapabilityMajor = 75) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR"] = {"hipDeviceAttributeComputeCapabilityMinor", CONV_DEV, API_DRIVER}; // 76 // API_Runtime ANALOGUE (cudaDevAttrComputeCapabilityMinor = 76) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH"] = {"hipDeviceAttributeMaxTexture1DMipmappedWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 77 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DMipmappedWidth = 77) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED"] = {"hipDeviceAttributeStreamPrioritiesSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 78 // API_Runtime ANALOGUE (cudaDevAttrStreamPrioritiesSupported = 78) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED"] = {"hipDeviceAttributeGlobalL1CacheSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 79 // API_Runtime ANALOGUE (cudaDevAttrGlobalL1CacheSupported = 79) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED"] = {"hipDeviceAttributeLocalL1CacheSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 80 // API_Runtime ANALOGUE (cudaDevAttrLocalL1CacheSupported = 80) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR"] = {"hipDeviceAttributeMaxSharedMemoryPerMultiprocessor", CONV_DEV, API_DRIVER}; // 81 // API_Runtime ANALOGUE (cudaDevAttrMaxSharedMemoryPerMultiprocessor = 81) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR"] = {"hipDeviceAttributeMaxRegistersPerMultiprocessor", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 82 // API_Runtime ANALOGUE (cudaDevAttrMaxRegistersPerMultiprocessor = 82) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY"] = {"hipDeviceAttributeManagedMemory", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 83 // API_Runtime ANALOGUE (cudaDevAttrManagedMemory = 83) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD"] = {"hipDeviceAttributeIsMultiGpuBoard", CONV_DEV, API_DRIVER}; // 84 // API_Runtime ANALOGUE (cudaDevAttrIsMultiGpuBoard = 84) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID"] = {"hipDeviceAttributeMultiGpuBoardGroupId", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 85 // API_Runtime ANALOGUE (cudaDevAttrMultiGpuBoardGroupID = 85) // unsupported yet by HIP [CUDA 8.0.44] - cuda2hipRename["CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED"] = {"hipDeviceAttributeHostNativeAtomicSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO"] = {"hipDeviceAttributeSingleToDoublePrecisionPerfRatio", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS"] = {"hipDeviceAttributePageableMemoryAccess", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS"] = {"hipDeviceAttributeConcurrentManagedAccess", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED"] = {"hipDeviceAttributeComputePreemptionSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM"] = {"hipDeviceAttributeCanUseHostPointerForRegisteredMem", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED"] = {"hipDeviceAttributeHostNativeAtomicSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 86 // API_Runtime ANALOGUE (cudaDevAttrHostNativeAtomicSupported = 86) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO"] = {"hipDeviceAttributeSingleToDoublePrecisionPerfRatio", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 87 // API_Runtime ANALOGUE (cudaDevAttrSingleToDoublePrecisionPerfRatio = 87) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS"] = {"hipDeviceAttributePageableMemoryAccess", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 88 // API_Runtime ANALOGUE (cudaDevAttrPageableMemoryAccess = 88) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS"] = {"hipDeviceAttributeConcurrentManagedAccess", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 89 // API_Runtime ANALOGUE (cudaDevAttrConcurrentManagedAccess = 89) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED"] = {"hipDeviceAttributeComputePreemptionSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 90 // API_Runtime ANALOGUE (cudaDevAttrComputePreemptionSupported = 90) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM"] = {"hipDeviceAttributeCanUseHostPointerForRegisteredMem", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 91 // API_Runtime ANALOGUE (cudaDevAttrCanUseHostPointerForRegisteredMem = 91) + + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX"] = {"hipDeviceAttributeMax", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 92 // API_Runtime ANALOGUE (no) cuda2hipRename["CUdevprop_st"] = {"hipDeviceProp_t", CONV_TYPE, API_DRIVER}; cuda2hipRename["CUdevprop"] = {"hipDeviceProp_t", CONV_TYPE, API_DRIVER}; @@ -929,11 +931,11 @@ struct cuda2hipMap { // unsupported yet by HIP [CUDA 8.0.44] // P2P Attributes - cuda2hipRename["CUdevice_P2PAttribute"] = {"hipDeviceP2PAttribute", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CUdevice_P2PAttribute"] = {"hipDeviceP2PAttribute", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (cudaDeviceP2PAttr) // cuda2hipRename["CUdevice_P2PAttribute_enum"] = {"hipDeviceP2PAttribute", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK"] = {"hipDeviceP2PAttributePerformanceRank", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 - cuda2hipRename["CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED"] = {"hipDeviceP2PAttributeAccessSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 - cuda2hipRename["CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED"] = {"hipDeviceP2PAttributeNativeAtomicSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x03 + cuda2hipRename["CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK"] = {"hipDeviceP2PAttributePerformanceRank", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 // API_Runtime ANALOGUE (cudaDevP2PAttrPerformanceRank = 0x01) + cuda2hipRename["CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED"] = {"hipDeviceP2PAttributeAccessSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 // API_Runtime ANALOGUE (cudaDevP2PAttrAccessSupported = 0x02) + cuda2hipRename["CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED"] = {"hipDeviceP2PAttributeNativeAtomicSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x03 // API_Runtime ANALOGUE (cudaDevP2PAttrNativeAtomicSupported = 0x03) // Events // pointer to CUevent_st @@ -1352,12 +1354,12 @@ struct cuda2hipMap { cuda2hipRename["cudaDevAttrMultiGpuBoardGroupID"] = {"hipDeviceAttributeMultiGpuBoardGroupID", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 85 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85) // unsupported yet by HIP [CUDA 8.0.44] - cuda2hipRename["cudaDevAttrHostNativeAtomicSupported"] = {"hipDeviceAttributeHostNativeAtomicSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrSingleToDoublePrecisionPerfRatio"] = {"hipDeviceAttributeSingleToDoublePrecisionPerfRatio", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrPageableMemoryAccess"] = {"hipDeviceAttributePageableMemoryAccess", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrConcurrentManagedAccess"] = {"hipDeviceAttributeConcurrentManagedAccess", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrComputePreemptionSupported"] = {"hipDeviceAttributeComputePreemptionSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrCanUseHostPointerForRegisteredMem"] = {"hipDeviceAttributeCanUseHostPointerForRegisteredMem", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; + cuda2hipRename["cudaDevAttrHostNativeAtomicSupported"] = {"hipDeviceAttributeHostNativeAtomicSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 86 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86) + cuda2hipRename["cudaDevAttrSingleToDoublePrecisionPerfRatio"] = {"hipDeviceAttributeSingleToDoublePrecisionPerfRatio", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 87 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87) + cuda2hipRename["cudaDevAttrPageableMemoryAccess"] = {"hipDeviceAttributePageableMemoryAccess", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 88 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88) + cuda2hipRename["cudaDevAttrConcurrentManagedAccess"] = {"hipDeviceAttributeConcurrentManagedAccess", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 89 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89) + cuda2hipRename["cudaDevAttrComputePreemptionSupported"] = {"hipDeviceAttributeComputePreemptionSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 90 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90) + cuda2hipRename["cudaDevAttrCanUseHostPointerForRegisteredMem"] = {"hipDeviceAttributeCanUseHostPointerForRegisteredMem", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 91 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91) // Pointer Attributes // struct cudaPointerAttributes @@ -1375,6 +1377,13 @@ struct cuda2hipMap { cuda2hipRename["cudaDeviceGetStreamPriorityRange"] = {"hipDeviceGetStreamPriorityRange", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaSetValidDevices"] = {"hipSetValidDevices", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; + // unsupported yet by HIP [CUDA 8.0.44] + // P2P Attributes + cuda2hipRename["cudaDeviceP2PAttr"] = {"hipDeviceP2PAttribute", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (CUdevice_P2PAttribute) + cuda2hipRename["cudaDevP2PAttrPerformanceRank"] = {"hipDeviceP2PAttributePerformanceRank", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 0x01 // API_Runtime ANALOGUE (CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK = 0x01) + cuda2hipRename["cudaDevP2PAttrAccessSupported"] = {"hipDeviceP2PAttributeAccessSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 0x02 // API_Runtime ANALOGUE (CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED = 0x02) + cuda2hipRename["cudaDevP2PAttrNativeAtomicSupported"] = {"hipDeviceP2PAttributeNativeAtomicSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 0x03 // API_Runtime ANALOGUE (CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED = 0x03) + // Compute mode cuda2hipRename["cudaComputeMode"] = {"hipComputeMode", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // API_DRIVER ANALOGUE (CUcomputemode) cuda2hipRename["cudaComputeModeDefault"] = {"hipComputeModeDefault", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 0 // API_DRIVER ANALOGUE (CU_COMPUTEMODE_DEFAULT = 0) @@ -1538,7 +1547,7 @@ struct cuda2hipMap { cuda2hipRename["cudaResourceTypeLinear"] = {"hipResourceTypeLinear", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x02 // API_Driver ANALOGUE (CU_RESOURCE_TYPE_LINEAR = 0x02) cuda2hipRename["cudaResourceTypePitch2D"] = {"hipResourceTypePitch2D", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x03 // API_Driver ANALOGUE (CU_RESOURCE_TYPE_PITCH2D = 0x03) - + // enum cudaResourceViewFormat cuda2hipRename["cudaResourceViewFormat"] = {"hipResourceViewFormat", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // API_Driver ANALOGUE (CUresourceViewFormat) cuda2hipRename["cudaResViewFormatNone"] = {"hipResViewFormatNone", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x00 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_NONE = 0x00) cuda2hipRename["cudaResViewFormatUnsignedChar1"] = {"hipResViewFormatUnsignedChar1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x01 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_UINT_1X8 = 0x01) @@ -1582,6 +1591,22 @@ struct cuda2hipMap { cuda2hipRename["cudaAddressModeMirror"] = {"hipAddressModeMirror", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaAddressModeBorder"] = {"hipAddressModeBorder", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; + // unsupported yet by HIP [CUDA 8.0.44] + // Memory advise values + cuda2hipRename["cudaMemoryAdvise"] = {"hipMemAdvise", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // API_Driver ANALOGUE (CUmem_advise) + cuda2hipRename["cudaMemAdviseSetReadMostly"] = {"hipMemAdviseSetReadMostly", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 1 // API_Driver ANALOGUE (CU_MEM_ADVISE_SET_READ_MOSTLY = 1) + cuda2hipRename["cudaMemAdviseUnsetReadMostly"] = {"hipMemAdviseUnsetReadMostly", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 2 // API_Driver ANALOGUE (CU_MEM_ADVISE_UNSET_READ_MOSTLY = 2) + cuda2hipRename["cudaMemAdviseSetPreferredLocation"] = {"hipMemAdviseSetPreferredLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 3 // API_Driver ANALOGUE (CU_MEM_ADVISE_SET_PREFERRED_LOCATION = 3) + cuda2hipRename["cudaMemAdviseUnsetPreferredLocation"] = {"hipMemAdviseUnsetPreferredLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 4 // API_Driver ANALOGUE (CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION = 4) + cuda2hipRename["cudaMemAdviseSetAccessedBy"] = {"hipMemAdviseSetAccessedBy", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 5 // API_Driver ANALOGUE (CU_MEM_ADVISE_SET_ACCESSED_BY = 5) + cuda2hipRename["cudaMemAdviseUnsetAccessedBy"] = {"hipMemAdviseUnsetAccessedBy", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 6 // API_Driver ANALOGUE (CU_MEM_ADVISE_UNSET_ACCESSED_BY = 6) + // CUmem_range_attribute + cuda2hipRename["cudaMemRangeAttribute"] = {"hipMemRangeAttribute", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // API_Driver ANALOGUE (CUmem_range_attribute) + cuda2hipRename["cudaMemRangeAttributeReadMostly"] = {"hipMemRangeAttributeReadMostly", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 1 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY = 1) + cuda2hipRename["cudaMemRangeAttributePreferredLocation"] = {"hipMemRangeAttributePreferredLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 2 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION = 2) + cuda2hipRename["cudaMemRangeAttributeAccessedBy"] = {"hipMemRangeAttributeAccessedBy", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 3 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY = 3) + cuda2hipRename["cudaMemRangeAttributeLastPrefetchLocation"] = {"hipMemRangeAttributeLastPrefetchLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 4 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION = 4) + // functions cuda2hipRename["cudaCreateTextureObject"] = {"hipCreateTextureObject", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaDestroyTextureObject"] = {"hipDestroyTextureObject", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; From 59833bea80125f48fed6c1286202bf4dcc079e8e Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 30 May 2017 21:54:33 -0500 Subject: [PATCH 59/77] Set event->_stream on hipHccModuleLaunchKernel path if start/stop used Ensure _stream is always non-null in recorded events. Fixes isDefaultStream fault. [ROCm/hip commit: cb60763737049bdd35de7f31078a2e0e503e2941] --- projects/hip/src/hip_event.cpp | 4 +++- projects/hip/src/hip_hcc_internal.h | 3 ++- projects/hip/src/hip_module.cpp | 4 ++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/projects/hip/src/hip_event.cpp b/projects/hip/src/hip_event.cpp index 2c31769718..8ef652489a 100644 --- a/projects/hip/src/hip_event.cpp +++ b/projects/hip/src/hip_event.cpp @@ -42,11 +42,13 @@ ihipEvent_t::ihipEvent_t(unsigned flags) // Attach to an existing completion future: -void ihipEvent_t::attachToCompletionFuture(const hc::completion_future *cf, ihipEventType_t eventType) +void ihipEvent_t::attachToCompletionFuture(const hc::completion_future *cf, + hipStream_t stream, ihipEventType_t eventType) { _state = hipEventStatusRecording; _marker = *cf; _type = eventType; + _stream = stream; } diff --git a/projects/hip/src/hip_hcc_internal.h b/projects/hip/src/hip_hcc_internal.h index 94ad4f9340..b15d5a73e4 100644 --- a/projects/hip/src/hip_hcc_internal.h +++ b/projects/hip/src/hip_hcc_internal.h @@ -538,6 +538,7 @@ public: const ihipDevice_t * getDevice() const; ihipCtx_t * getCtx() const; + // Before calling this function, stream must be resolved from "0" to the actual stream: bool isDefaultStream() const { return _id == 0; }; public: @@ -602,7 +603,7 @@ enum ihipEventType_t { class ihipEvent_t { public: ihipEvent_t(unsigned flags); - void attachToCompletionFuture(const hc::completion_future *cf, ihipEventType_t eventType); + void attachToCompletionFuture(const hc::completion_future *cf, hipStream_t stream, ihipEventType_t eventType); void setTimestamp(); uint64_t timestamp() const { return _timestamp; } ; ihipEventType_t type() const { return _type; }; diff --git a/projects/hip/src/hip_module.cpp b/projects/hip/src/hip_module.cpp index d364a6b519..2a3bfabc28 100644 --- a/projects/hip/src/hip_module.cpp +++ b/projects/hip/src/hip_module.cpp @@ -455,10 +455,10 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, if (startEvent) { - startEvent->attachToCompletionFuture(&cf, hipEventTypeStartCommand); + startEvent->attachToCompletionFuture(&cf, hStream, hipEventTypeStartCommand); } if (stopEvent) { - stopEvent->attachToCompletionFuture (&cf, hipEventTypeStopCommand); + stopEvent->attachToCompletionFuture (&cf, hStream, hipEventTypeStopCommand); } From a28399f456c9df8a6e84e246cff8302b5b142cb9 Mon Sep 17 00:00:00 2001 From: Sandeep Kumar Date: Thu, 4 May 2017 13:57:01 +0530 Subject: [PATCH 60/77] Print msg for single gpu Change-Id: I2d23c73542add8973990ba96592016726994422e [ROCm/hip commit: dfcba01db602b8481535b9baad7cd8caa803fd18] --- .../hip/samples/2_Cookbook/8_peer2peer/peer2peer.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/projects/hip/samples/2_Cookbook/8_peer2peer/peer2peer.cpp b/projects/hip/samples/2_Cookbook/8_peer2peer/peer2peer.cpp index 990599e1cb..0f532a2f0a 100644 --- a/projects/hip/samples/2_Cookbook/8_peer2peer/peer2peer.cpp +++ b/projects/hip/samples/2_Cookbook/8_peer2peer/peer2peer.cpp @@ -55,13 +55,9 @@ void checkPeer2PeerSupport() { int gpuCount; int canAccessPeer; - int p2pCapableDeviceCount=0; HIPCHECK(hipGetDeviceCount(&gpuCount)); - if (gpuCount < 2) - printf("Peer2Peer application requires atleast 2 gpu devices"); - for (int currentGpu=0; currentGpu Date: Thu, 11 May 2017 11:30:49 +0530 Subject: [PATCH 61/77] Add unroll and inline asm cookbook samples Change-Id: Ie5a0fbb01b7fca82959090d89299533d49e092f1 [ROCm/hip commit: e4f0b28bb913a9c9c9df7909e653f2456a228178] --- .../samples/2_Cookbook/10_inline_asm/Makefile | 35 ++++ .../2_Cookbook/10_inline_asm/inline_asm.cpp | 174 ++++++++++++++++++ .../hip/samples/2_Cookbook/9_unroll/Makefile | 39 ++++ .../samples/2_Cookbook/9_unroll/unroll.cpp | 141 ++++++++++++++ 4 files changed, 389 insertions(+) create mode 100644 projects/hip/samples/2_Cookbook/10_inline_asm/Makefile create mode 100644 projects/hip/samples/2_Cookbook/10_inline_asm/inline_asm.cpp create mode 100644 projects/hip/samples/2_Cookbook/9_unroll/Makefile create mode 100644 projects/hip/samples/2_Cookbook/9_unroll/unroll.cpp diff --git a/projects/hip/samples/2_Cookbook/10_inline_asm/Makefile b/projects/hip/samples/2_Cookbook/10_inline_asm/Makefile new file mode 100644 index 0000000000..77a7699635 --- /dev/null +++ b/projects/hip/samples/2_Cookbook/10_inline_asm/Makefile @@ -0,0 +1,35 @@ +HIP_PATH?= $(wildcard /opt/rocm/hip) +ifeq (,$(HIP_PATH)) + HIP_PATH=../../.. +endif + +HIPCC=$(HIP_PATH)/bin/hipcc + +TARGET=hcc + +SOURCES = inline_asm.cpp +OBJECTS = $(SOURCES:.cpp=.o) + +EXECUTABLE=./inline_asm + +.PHONY: test + + +all: $(EXECUTABLE) test + +CXXFLAGS =-g +CXX=$(HIPCC) + + +$(EXECUTABLE): $(OBJECTS) + $(HIPCC) $(OBJECTS) -o $@ + + +test: $(EXECUTABLE) + $(EXECUTABLE) + + +clean: + rm -f $(EXECUTABLE) + rm -f $(OBJECTS) + rm -f $(HIP_PATH)/src/*.o diff --git a/projects/hip/samples/2_Cookbook/10_inline_asm/inline_asm.cpp b/projects/hip/samples/2_Cookbook/10_inline_asm/inline_asm.cpp new file mode 100644 index 0000000000..2b4fc3de90 --- /dev/null +++ b/projects/hip/samples/2_Cookbook/10_inline_asm/inline_asm.cpp @@ -0,0 +1,174 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include + +// hip header file +#include "hip/hip_runtime.h" + +#define WIDTH 1024 + +#define NUM (WIDTH*WIDTH) + +#define THREADS_PER_BLOCK_X 4 +#define THREADS_PER_BLOCK_Y 4 +#define THREADS_PER_BLOCK_Z 1 + +// Device (Kernel) function, it must be void +// hipLaunchParm provides the execution configuration +__global__ void matrixTranspose(hipLaunchParm lp, + float *out, + float *in, + const int width) +{ + + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + + asm volatile ("v_mov_b32_e32 %0, %1" : "=v" (out[x*width + y]) : "v" (in[y*width + x])); +} + +// CPU implementation of matrix transpose +void matrixTransposeCPUReference( + float * output, + float * input, + const unsigned int width) +{ + for(unsigned int j=0; j < width; j++) + { + for(unsigned int i=0; i < width; i++) + { + output[i*width + j] = input[j*width + i]; + } + } +} + +int main() { + + float* Matrix; + float* TransposeMatrix; + float* cpuTransposeMatrix; + + float* gpuMatrix; + float* gpuTransposeMatrix; + + hipDeviceProp_t devProp; + hipGetDeviceProperties(&devProp, 0); + + std::cout << "Device name " << devProp.name << std::endl; + + hipEvent_t start, stop; + hipEventCreate(&start); + hipEventCreate(&stop); + float eventMs = 1.0f; + + int i; + int errors; + + Matrix = (float*)malloc(NUM * sizeof(float)); + TransposeMatrix = (float*)malloc(NUM * sizeof(float)); + cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float)); + + // initialize the input data + for (i = 0; i < NUM; i++) { + Matrix[i] = (float)i*10.0f; + } + + // allocate the memory on the device side + hipMalloc((void**)&gpuMatrix, NUM * sizeof(float)); + hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float)); + + // Record the start event + hipEventRecord(start, NULL); + + // Memory transfer from host to device + hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice); + + // Record the stop event + hipEventRecord(stop, NULL); + hipEventSynchronize(stop); + + hipEventElapsedTime(&eventMs, start, stop); + + printf ("hipMemcpyHostToDevice time taken = %6.3fms\n", eventMs); + + // Record the start event + hipEventRecord(start, NULL); + + // Lauching kernel from host + hipLaunchKernel(matrixTranspose, + dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y), + dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), + 0, 0, + gpuTransposeMatrix , gpuMatrix, WIDTH); + + // Record the stop event + hipEventRecord(stop, NULL); + hipEventSynchronize(stop); + + hipEventElapsedTime(&eventMs, start, stop); + + printf ("kernel Execution time = %6.3fms\n", eventMs); + + // Record the start event + hipEventRecord(start, NULL); + + // Memory transfer from device to host + hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost); + + // Record the stop event + hipEventRecord(stop, NULL); + hipEventSynchronize(stop); + + hipEventElapsedTime(&eventMs, start, stop); + + printf ("hipMemcpyDeviceToHost time taken = %6.3fms\n", eventMs); + + // CPU MatrixTranspose computation + matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH); + + // verify the results + errors = 0; + double eps = 1.0E-6; + for (i = 0; i < NUM; i++) { + if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) { + printf("gpu%f cpu %f \n",TransposeMatrix[i],cpuTransposeMatrix[i]); + errors++; + } + } + if (errors!=0) { + printf("FAILED: %d errors\n",errors); + } else { + printf ("PASSED!\n"); + } + + //free the resources on device side + hipFree(gpuMatrix); + hipFree(gpuTransposeMatrix); + + //free the resources on host side + free(Matrix); + free(TransposeMatrix); + free(cpuTransposeMatrix); + + return errors; +} diff --git a/projects/hip/samples/2_Cookbook/9_unroll/Makefile b/projects/hip/samples/2_Cookbook/9_unroll/Makefile new file mode 100644 index 0000000000..b71f3d8353 --- /dev/null +++ b/projects/hip/samples/2_Cookbook/9_unroll/Makefile @@ -0,0 +1,39 @@ +HIP_PATH?= $(wildcard /opt/rocm/hip) +ifeq (,$(HIP_PATH)) + HIP_PATH=../../.. +endif + +ifeq (gfx701, $(findstring gfx701,$(HCC_AMDGPU_TARGET))) + $(error gfx701 is not a supported device for this sample) +endif + +HIPCC=$(HIP_PATH)/bin/hipcc + +TARGET=hcc + +SOURCES = unroll.cpp +OBJECTS = $(SOURCES:.cpp=.o) + +EXECUTABLE=./unroll + +.PHONY: test + + +all: $(EXECUTABLE) test + +CXXFLAGS =-g +CXX=$(HIPCC) + + +$(EXECUTABLE): $(OBJECTS) + $(HIPCC) $(OBJECTS) -o $@ + + +test: $(EXECUTABLE) + $(EXECUTABLE) + + +clean: + rm -f $(EXECUTABLE) + rm -f $(OBJECTS) + rm -f $(HIP_PATH)/src/*.o diff --git a/projects/hip/samples/2_Cookbook/9_unroll/unroll.cpp b/projects/hip/samples/2_Cookbook/9_unroll/unroll.cpp new file mode 100644 index 0000000000..22f1c75e6e --- /dev/null +++ b/projects/hip/samples/2_Cookbook/9_unroll/unroll.cpp @@ -0,0 +1,141 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include + +// hip header file +#include "hip/hip_runtime.h" + + +#define WIDTH 4 + +#define NUM (WIDTH*WIDTH) + +#define THREADS_PER_BLOCK_X 4 +#define THREADS_PER_BLOCK_Y 4 +#define THREADS_PER_BLOCK_Z 1 + +// Device (Kernel) function, it must be void +// hipLaunchParm provides the execution configuration +__global__ void matrixTranspose(hipLaunchParm lp, + float *out, + float *in, + const int width) +{ + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + float val = in[x]; + +#pragma unroll + for(int i=0;i eps ) { + printf("%d cpu: %f gpu %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]); + errors++; + } + } + if (errors!=0) { + printf("FAILED: %d errors\n",errors); + } else { + printf ("PASSED!\n"); + } + + //free the resources on device side + hipFree(gpuMatrix); + hipFree(gpuTransposeMatrix); + + //free the resources on host side + free(Matrix); + free(TransposeMatrix); + free(cpuTransposeMatrix); + + return errors; +} From 96a7f1853e31b60f809e1d1ff034a2aadc8351d0 Mon Sep 17 00:00:00 2001 From: Sandeep Kumar Date: Thu, 11 May 2017 16:33:31 +0530 Subject: [PATCH 62/77] Add inline asm hip directed tests for v_add and v_mac Change-Id: Ie5ace2e42d5da89b16e040537df2bb13d3883c6d [ROCm/hip commit: 830f2b100d59d6cc58301db68d55f01981f8fbde] --- .../hip/tests/src/kernel/inline_asm_vadd.cpp | 126 ++++++++++++++++++ .../hip/tests/src/kernel/inline_asm_vmac.cpp | 125 +++++++++++++++++ 2 files changed, 251 insertions(+) create mode 100644 projects/hip/tests/src/kernel/inline_asm_vadd.cpp create mode 100644 projects/hip/tests/src/kernel/inline_asm_vmac.cpp diff --git a/projects/hip/tests/src/kernel/inline_asm_vadd.cpp b/projects/hip/tests/src/kernel/inline_asm_vadd.cpp new file mode 100644 index 0000000000..481b606e89 --- /dev/null +++ b/projects/hip/tests/src/kernel/inline_asm_vadd.cpp @@ -0,0 +1,126 @@ +/* Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +associated documentation files (the "Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the +following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial +portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT +LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO +EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +/* HIT_START + * BUILD: %t %s + * RUN: %t + * HIT_END + */ + + +#include + +// hip header file +#include "hip/hip_runtime.h" + +#define NUM 1024 + +#define THREADS_PER_BLOCK_X 4 + +// Device (Kernel) function, it must be void +// hipLaunchParm provides the execution configuration +__global__ void vadd_asm(hipLaunchParm lp, + float *out, + float *in) +{ + int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + + asm volatile ("v_add_f32_e32 %0, %1, %2" : "=v" (out[i]) : "v"(in[i]),"v" (out[i])); +} + +// CPU implementation of Vector Result +void addCPUReference( + float * output, + float * input) +{ + for(unsigned int j=0; j < NUM; j++) + { + + output[j]= input[j] + output[j]; + } +} + +int main(){ + + float* VectorA; + float* ResultVector; + float* VectorB; + + float* gpuVector; + float* gpuResultVector; + + int i; + int errors; + + VectorA = (float*)malloc(NUM * sizeof(float)); + ResultVector = (float*)malloc(NUM * sizeof(float)); + VectorB = (float*)malloc(NUM * sizeof(float)); + + // initialize the input data + for (i = 0; i < NUM; i++) { + VectorA[i] = (float)i*10.0f; + VectorB[i] = (float)i*30.0f; + } + + // allocate the memory on the device side + hipMalloc((void**)&gpuVector, NUM * sizeof(float)); + hipMalloc((void**)&gpuResultVector, NUM * sizeof(float)); + + // Memory transfer from host to device + hipMemcpy(gpuVector, VectorA, NUM*sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(gpuResultVector, VectorB, NUM*sizeof(float), hipMemcpyHostToDevice); + + // Lauching kernel from host + hipLaunchKernel(vadd_asm, + dim3(NUM/THREADS_PER_BLOCK_X), + dim3(THREADS_PER_BLOCK_X), + 0, 0, + gpuResultVector , gpuVector); + + // Memory transfer from device to host + hipMemcpy(ResultVector, gpuResultVector, NUM*sizeof(float), hipMemcpyDeviceToHost); + + // CPU Result computation + addCPUReference(VectorB, VectorA); + + // verify the results + errors = 0; + double eps = 1.0E-3; + for (i = 0; i < NUM; i++) { + if (std::abs(ResultVector[i] - VectorB[i]) > eps ) { + errors++; + } + } + if (errors!=0) { + printf("FAILED: %d errors\n",errors); + } else { + printf ("PASSED!\n"); + } + + //free the resources on device side + hipFree(gpuVector); + hipFree(gpuResultVector); + + hipDeviceReset(); + + //free the resources on host side + free(VectorA); + free(ResultVector); + free(VectorB); + + return errors; +} diff --git a/projects/hip/tests/src/kernel/inline_asm_vmac.cpp b/projects/hip/tests/src/kernel/inline_asm_vmac.cpp new file mode 100644 index 0000000000..1b6941c249 --- /dev/null +++ b/projects/hip/tests/src/kernel/inline_asm_vmac.cpp @@ -0,0 +1,125 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include + +// hip header file +#include "hip/hip_runtime.h" + +#define NUM 1024 + +#define THREADS_PER_BLOCK_X 4 + +// Device (Kernel) function, it must be void +// hipLaunchParm provides the execution configuration +__global__ void vmac_asm(hipLaunchParm lp, + float *out, + float *in) +{ + int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + + asm volatile ("v_mac_f32_e32 %0, %2, %3" : "=v" (out[i]) : "0"(out[i]), "v" (a), "v" (in[i])); +} + +// CPU implementation of saxpy +void CPUReference( + float * output, + float * input) +{ + for(unsigned int j=0; j < NUM; j++) + { + + output[j]= a*input[j] + output[j]; + } +} + +int main(){ + + float* VectorA; + float* ResultVector; + float* VectorB; + + float* gpuVector; + float* gpuResultVector; + + const float a = 10.0f + int i; + int errors; + + VectorA = (float*)malloc(NUM * sizeof(float)); + ResultVector = (float*)malloc(NUM * sizeof(float)); + VectorB = (float*)malloc(NUM * sizeof(float)); + + // initialize the input data + for (i = 0; i < NUM; i++) { + VectorA[i] = (float)i*10.0f; + VectorB[i] = (float)i*30.0f; + } + + // allocate the memory on the device side + hipMalloc((void**)&gpuVector, NUM * sizeof(float)); + hipMalloc((void**)&gpuResultVector, NUM * sizeof(float)); + + // Memory transfer from host to device + hipMemcpy(gpuVector, VectorA, NUM*sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(gpuResultVector, VectorB, NUM*sizeof(float), hipMemcpyHostToDevice); + + // Lauching kernel from host + hipLaunchKernel(vmac_asm, + dim3(NUM/THREADS_PER_BLOCK_X), + dim3(THREADS_PER_BLOCK_X), + 0, 0, + gpuResultVector , gpuVector); + + // Memory transfer from device to host + hipMemcpy(ResultVector, gpuResultVector, NUM*sizeof(float), hipMemcpyDeviceToHost); + + // CPU Result computation + addCPUReference(VectorB, VectorA); + + // verify the results + errors = 0; + double eps = 1.0E-3; + for (i = 0; i < NUM; i++) { + if (std::abs(ResultVector[i] - VectorB[i]) > eps ) { + errors++; + } + } + if (errors!=0) { + printf("FAILED: %d errors\n",errors); + } else { + printf ("PASSED!\n"); + } + + //free the resources on device side + hipFree(gpuVector); + hipFree(gpuResultVector); + + hipDeviceReset(); + + //free the resources on host side + free(VectorA); + free(ResultVector); + free(VectorB); + + return errors; +} From ee62cf4869d5343db7d5ee49e02ea6c83d5ea864 Mon Sep 17 00:00:00 2001 From: Sandeep Kumar Date: Thu, 11 May 2017 18:43:24 +0530 Subject: [PATCH 63/77] Add readme for inline asm and unroll cookbook samples Change-Id: I71b7a5652c3dad181c5df60ab0dd1b81d79f1bfb [ROCm/hip commit: 0154c97ddd61151b2b4978b83f9b9551a34b11cb] --- .../2_Cookbook/10_inline_asm/Readme.md | 47 ++++++++++++++++++ .../hip/samples/2_Cookbook/9_unroll/Readme.md | 48 +++++++++++++++++++ 2 files changed, 95 insertions(+) create mode 100644 projects/hip/samples/2_Cookbook/10_inline_asm/Readme.md create mode 100644 projects/hip/samples/2_Cookbook/9_unroll/Readme.md diff --git a/projects/hip/samples/2_Cookbook/10_inline_asm/Readme.md b/projects/hip/samples/2_Cookbook/10_inline_asm/Readme.md new file mode 100644 index 0000000000..8c98547220 --- /dev/null +++ b/projects/hip/samples/2_Cookbook/10_inline_asm/Readme.md @@ -0,0 +1,47 @@ +## inline asm ### + +This tutorial is about how to use inline GCN asm in kernel. In this tutorial, we'll explain how to by using the simple Matrix Transpose. + +## Introduction: + +If you want to take advantage of the extra performance benefits of writing in assembly as well as take advantage of special GPU hardware features that were only available through assemby, then this tutorial is for you. In this tutorial we'll be explaining how to start writing inline asm in kernel. + +For more insight Please read the following blogs by Ben Sander +[The Art of AMDGCN Assembly: How to Bend the Machine to Your Will](gpuopen.com/amdgcn-assembly) +[AMD GCN Assembly: Cross-Lane Operations](http://gpuopen.com/amd-gcn-assembly-cross-lane-operations/) + +For more information: +[AMD GCN3 ISA Architecture Manual](http://gpuopen.com/compute-product/amd-gcn3-isa-architecture-manual/) +[User Guide for AMDGPU Back-end](llvm.org/docs/AMDGPUUsage.html) + +## Requirement: +For hardware requirement and software installation [Installation](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/INSTALL.md) + +## prerequiste knowledge: + +Programmers familiar with CUDA, OpenCL will be able to quickly learn and start coding with the HIP API. In case you are not, don't worry. You choose to start with the best one. We'll be explaining everything assuming you are completely new to gpgpu programming. + +## Simple Matrix Transpose + +We will be using the Simple Matrix Transpose application from the our very first tutorial. + +## asm() Assembler statement + +We insert the GCN isa into the kernel using asm() Assembler statement. In the same sourcecode, we used for MatrixTranspose. We'll add the following: + +` asm volatile ("v_mov_b32_e32 %0, %1" : "=v" (out[x*width + y]) : "v" (in[y*width + x])); ` + +## How to build and run: +Use the make command and execute it using ./exe +Use hipcc to build the application, which is using hcc on AMD and nvcc on nvidia. + + +## More Info: +- [HIP FAQ](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_faq.md) +- [HIP Kernel Language](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_kernel_language.md) +- [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP) +- [HIP Porting Guide](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_porting_guide.md) +- [HIP Terminology](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL) +- [clang-hipify](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/clang-hipify/README.md) +- [Developer/CONTRIBUTING Info](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/CONTRIBUTING.md) +- [Release Notes](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/RELEASE.md) diff --git a/projects/hip/samples/2_Cookbook/9_unroll/Readme.md b/projects/hip/samples/2_Cookbook/9_unroll/Readme.md new file mode 100644 index 0000000000..3c2635c0eb --- /dev/null +++ b/projects/hip/samples/2_Cookbook/9_unroll/Readme.md @@ -0,0 +1,48 @@ +## Using Pragma unroll ### + +In this tutorial, we'll explain how to use #pragma unroll to improve the performance. + +## Introduction: + +Loop unrolling optimization hints can be specified with #pragma unroll and #pragma nounroll. The pragma is placed immediately before a for loop. +Specifying #pragma unroll without a parameter directs the loop unroller to attempt to fully unroll the loop if the trip count is known at compile time and attempt to partially unroll the loop if the trip count is not known at compile time. + +## Requirement: +For hardware requirement and software installation [Installation](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/INSTALL.md) + +## prerequiste knowledge: + +Programmers familiar with CUDA, OpenCL will be able to quickly learn and start coding with the HIP API. In case you are not, don't worry. You choose to start with the best one. We'll be explaining everything assuming you are completely new to gpgpu programming. + +## Simple Matrix Transpose + +For this tutorial we will be using MatrixTranspose with shfl operation i.e., our 4_shfl tutorial since it is the only examples where we used loops inside the kernel. + +In this tutorial, we'll use `#pragma unroll`. In the same sourcecode, we used for MatrixTranspose. We'll add it just before the for loop as following: + +`#pragma unroll ` +` for(int i=0;i Date: Wed, 31 May 2017 10:15:41 +0530 Subject: [PATCH 64/77] Disable rcbrt, scalbln and scalbn double precision device test Change-Id: I46bd895701c46d3592b553090cafba99e41a2e2d [ROCm/hip commit: 4919863d3e9ce60be9eb692efef568bc2490dcf1] --- .../tests/src/deviceLib/hipDoublePrecisionMathDevice.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/projects/hip/tests/src/deviceLib/hipDoublePrecisionMathDevice.cpp b/projects/hip/tests/src/deviceLib/hipDoublePrecisionMathDevice.cpp index df5dad3968..f4f7ab0479 100644 --- a/projects/hip/tests/src/deviceLib/hipDoublePrecisionMathDevice.cpp +++ b/projects/hip/tests/src/deviceLib/hipDoublePrecisionMathDevice.cpp @@ -99,7 +99,7 @@ __device__ void double_precision_math_functions() normcdf(0.0); normcdfinv(1.0); pow(1.0, 0.0); - rcbrt(1.0); + //rcbrt(1.0); remainder(2.0, 1.0); // remquo(1.0, 2.0, &iX); rhypot(0.0, 1.0); @@ -109,8 +109,8 @@ __device__ void double_precision_math_functions() rnorm4d(0.0, 0.0, 0.0, 1.0); round(0.0); rsqrt(1.0); - scalbln(0.0, 1); - scalbn(0.0, 1); + //scalbln(0.0, 1); + //scalbn(0.0, 1); signbit(1.0); sin(0.0); sincos(0.0, &fX, &fY); From be2c61b72a8695d38f7b7b2eb92b402dac297a0c Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Wed, 31 May 2017 10:16:19 +0530 Subject: [PATCH 65/77] Disable rcbrtf, scalblnf, scalbnf in single precision device test Change-Id: I8a250a64a0cb05132d022a11d9766ced9cdf11a7 [ROCm/hip commit: 5cdd1b2bf5d74dafc72a7b3f9bd8be573bef3b57] --- .../tests/src/deviceLib/hipSinglePrecisionMathDevice.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/projects/hip/tests/src/deviceLib/hipSinglePrecisionMathDevice.cpp b/projects/hip/tests/src/deviceLib/hipSinglePrecisionMathDevice.cpp index 53ccd2251f..de3dec35ef 100644 --- a/projects/hip/tests/src/deviceLib/hipSinglePrecisionMathDevice.cpp +++ b/projects/hip/tests/src/deviceLib/hipSinglePrecisionMathDevice.cpp @@ -100,7 +100,7 @@ __device__ void single_precision_math_functions() normcdfinvf(1.0f); fX = 1.0f; normf(1, &fX); powf(1.0f, 0.0f); - rcbrtf(1.0f); + //rcbrtf(1.0f); remainderf(2.0f, 1.0f); //remquof(1.0f, 2.0f, &iX); rhypotf(0.0f, 1.0f); @@ -110,8 +110,8 @@ __device__ void single_precision_math_functions() fX = 1.0f; rnormf(1, &fX); roundf(0.0f); rsqrtf(1.0f); - scalblnf(0.0f, 1); - scalbnf(0.0f, 1); + //scalblnf(0.0f, 1); + //scalbnf(0.0f, 1); signbit(1.0f); sincosf(0.0f, &fX, &fY); sincospif(0.0f, &fX, &fY); From 404c3f67519ca6ec8898456726c29d3d065c1b6c Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Wed, 31 May 2017 10:16:57 +0530 Subject: [PATCH 66/77] Fix hipMemoryAllocate test for single GPU Change-Id: If121c18ab490ba125dc689ffc08a8839fd280c38 [ROCm/hip commit: 502a74fcd678635e59f3d7f16d95c1f000e39343] --- .../hip/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/projects/hip/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp b/projects/hip/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp index 1ee5cbc9bb..34951f0a09 100644 --- a/projects/hip/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp +++ b/projects/hip/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp @@ -36,7 +36,6 @@ void multiGpuHostAlloc(int allocDevice) int numDevices; HIPCHECK(hipGetDeviceCount(&numDevices)); - assert(numDevices > 1); printf ("info: trying multiGpuHostAlloc with allocDevice=%d numDevices=%d\n", allocDevice, numDevices); @@ -121,10 +120,12 @@ int main(int argc, char *argv[]) { int numDevices; HIPCHECK(hipGetDeviceCount(&numDevices)); - assert(numDevices > 1); multiGpuHostAlloc(0); - multiGpuHostAlloc(1); + if (numDevices > 1) + { + multiGpuHostAlloc(1); + } } passed(); From 7a3befc55550bd3a37c6c7cf85c2a51f32ec1ea8 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Wed, 31 May 2017 18:55:29 +0300 Subject: [PATCH 67/77] [HIP] [HIPIFY] CUDA Driver API 8.0.44 JIT options support. [ROCm/hip commit: 6e99e388ea171837ae0a3c7be39b5bfd02d6f99d] --- projects/hip/hipify-clang/src/Cuda2Hip.cpp | 4 ++-- projects/hip/include/hip/hcc_detail/hip_runtime_api.h | 2 ++ projects/hip/include/hip/nvcc_detail/hip_runtime_api.h | 2 ++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/projects/hip/hipify-clang/src/Cuda2Hip.cpp b/projects/hip/hipify-clang/src/Cuda2Hip.cpp index 59d05e69f7..9c22fde573 100644 --- a/projects/hip/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/hip/hipify-clang/src/Cuda2Hip.cpp @@ -718,8 +718,8 @@ struct cuda2hipMap { cuda2hipRename["CU_JIT_GENERATE_LINE_INFO"] = {"hipJitOptionGenerateLineInfo", CONV_JIT, API_DRIVER}; cuda2hipRename["CU_JIT_CACHE_MODE"] = {"hipJitOptionCacheMode", CONV_JIT, API_DRIVER}; // unsupported yet by HIP [CUDA 8.0.44] - cuda2hipRename["CU_JIT_NEW_SM3X_OPT"] = {"hipJitOptionSm3xOpt", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_FAST_COMPILE"] = {"hipJitOptionFastCompile", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_NEW_SM3X_OPT"] = {"hipJitOptionSm3xOpt", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_FAST_COMPILE"] = {"hipJitOptionFastCompile", CONV_JIT, API_DRIVER}; cuda2hipRename["CU_JIT_NUM_OPTIONS"] = {"hipJitOptionNumOptions", CONV_JIT, API_DRIVER}; // enum CUjit_target/CUjit_target_enum diff --git a/projects/hip/include/hip/hcc_detail/hip_runtime_api.h b/projects/hip/include/hip/hcc_detail/hip_runtime_api.h index 6059e1e92d..25eac31ec6 100644 --- a/projects/hip/include/hip/hcc_detail/hip_runtime_api.h +++ b/projects/hip/include/hip/hcc_detail/hip_runtime_api.h @@ -158,6 +158,8 @@ typedef enum hipJitOption { hipJitOptionLogVerbose, hipJitOptionGenerateLineInfo, hipJitOptionCacheMode, + hipJitOptionSm3xOpt, + hipJitOptionFastCompile, hipJitOptionNumOptions } hipJitOption; diff --git a/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h b/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h index b09c9323c7..f92523a3e3 100644 --- a/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h +++ b/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h @@ -94,6 +94,8 @@ hipMemcpyHostToHost #define hipJitOptionLogVerbose CU_JIT_LOG_VERBOSE #define hipJitOptionGenerateLineInfo CU_JIT_GENERATE_LINE_INFO #define hipJitOptionCacheMode CU_JIT_CACHE_MODE +#define hipJitOptionSm3xOpt CU_JIT_NEW_SM3X_OPT +#define hipJitOptionFastCompile CU_JIT_FAST_COMPILE #define hipJitOptionNumOptions CU_JIT_NUM_OPTIONS typedef cudaEvent_t hipEvent_t; From cc54bc4d857d65a5b00b7b29cec8e201014b594d Mon Sep 17 00:00:00 2001 From: Siu Chi Chan Date: Wed, 31 May 2017 15:19:26 -0400 Subject: [PATCH 68/77] fix atomicCAS:remove load for the return value after CAS [ROCm/hip commit: 741eb844fe5ffb4bab7e73820ba710672763fe66] --- projects/hip/src/device_util.cpp | 34 ++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/projects/hip/src/device_util.cpp b/projects/hip/src/device_util.cpp index bea42aba46..e59a44e5ba 100644 --- a/projects/hip/src/device_util.cpp +++ b/projects/hip/src/device_util.cpp @@ -26,6 +26,7 @@ THE SOFTWARE. #include "device_util.h" #include "hip/hcc_detail/device_functions.h" #include "hip/hip_runtime.h" +#include //================================================================================================= /* @@ -923,24 +924,45 @@ __device__ unsigned long long int atomicMax(unsigned long long int* address, } //atomicCAS() +template +__device__ T atomicCAS_impl(T* address, T compare, T val) +{ + // the implementation assumes the atomic is lock-free and + // has the same size as the non-atmoic equivalent type + static_assert(sizeof(T) == sizeof(std::atomic) + , "size mismatch between atomic and non-atomic types"); + + union { + T* address; + std::atomic* atomic_address; + } u; + u.address = address; + + T expected = compare; + + // hcc should generate a system scope atomic CAS + std::atomic_compare_exchange_weak_explicit(u.atomic_address + , &expected, val + , std::memory_order_acq_rel + , std::memory_order_relaxed); + return expected; +} + __device__ int atomicCAS(int* address, int compare, int val) { - hc::atomic_compare_exchange(address,&compare,val); - return *address; + return atomicCAS_impl(address, compare, val); } __device__ unsigned int atomicCAS(unsigned int* address, unsigned int compare, unsigned int val) { - hc::atomic_compare_exchange(address,&compare,val); - return *address; + return atomicCAS_impl(address, compare, val); } __device__ unsigned long long int atomicCAS(unsigned long long int* address, unsigned long long int compare, unsigned long long int val) { - hc::atomic_compare_exchange((uint64_t*)address,(uint64_t*)&compare,(uint64_t)val); - return *address; + return atomicCAS_impl(address, compare, val); } //atomicAnd() From 0c12f09ba54d4c0c500ec0f6da469f6eaf3f1602 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Thu, 1 Jun 2017 21:08:33 +0300 Subject: [PATCH 69/77] [HIPIFY] All CUDA 8.0.44 API functions update (for both Driver and Runtime APIs) 1) P2P cuDeviceGetP2PAttribute cudaDeviceGetP2PAttribute 2) Memory Mngmnt cuMemPrefetchAsync cudaMemPrefetchAsync cuMemAdvise cudaMemAdvise cuMemRangeGetAttribute cudaMemRangeGetAttribute cuMemRangeGetAttributes cudaMemRangeGetAttributes 3) Streams (Driver API only, no analogues in Runtime API) cuStreamWaitValue32 cuStreamWaitValue32 cuStreamWriteValue32 4) Texture Reference Mngmnt (Driver API only, no analogues in Runtime API) cuTexRefSetBorderColor cuTexRefGetBorderColor [ROCm/hip commit: 4a5484c616cf8dde06fd7008c7582e04b039a92b] --- projects/hip/hipify-clang/src/Cuda2Hip.cpp | 59 ++++++++++++++-------- 1 file changed, 39 insertions(+), 20 deletions(-) diff --git a/projects/hip/hipify-clang/src/Cuda2Hip.cpp b/projects/hip/hipify-clang/src/Cuda2Hip.cpp index 9c22fde573..0825285b51 100644 --- a/projects/hip/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/hip/hipify-clang/src/Cuda2Hip.cpp @@ -937,6 +937,8 @@ struct cuda2hipMap { cuda2hipRename["CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED"] = {"hipDeviceP2PAttributeAccessSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 // API_Runtime ANALOGUE (cudaDevP2PAttrAccessSupported = 0x02) cuda2hipRename["CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED"] = {"hipDeviceP2PAttributeNativeAtomicSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x03 // API_Runtime ANALOGUE (cudaDevP2PAttrNativeAtomicSupported = 0x03) + cuda2hipRename["cuDeviceGetP2PAttribute"] = {"hipDeviceGetP2PAttribute", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (cudaDeviceGetP2PAttribute) + // Events // pointer to CUevent_st cuda2hipRename["CUevent"] = {"hipEvent_t", CONV_TYPE, API_DRIVER}; @@ -973,6 +975,9 @@ struct cuda2hipMap { // Streams // unsupported yet by HIP cuda2hipRename["cuStreamAddCallback"] = {"hipStreamAddCallback", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuStreamWaitValue32"] = {"hipStreamWaitValue32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE + cuda2hipRename["cuStreamWriteValue32"] = {"hipStreamWriteValue32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE + cuda2hipRename["cuStreamBatchMemOp"] = {"hipStreamBatchMemOp", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE cuda2hipRename["cuStreamCreate"] = {"hipStreamCreate", CONV_STREAM, API_DRIVER}; cuda2hipRename["cuStreamDestroy_v2"] = {"hipStreamDestroy", CONV_STREAM, API_DRIVER}; @@ -1014,6 +1019,11 @@ struct cuda2hipMap { cuda2hipRename["cuMemHostRegister_v2"] = {"hipHostRegister", CONV_MEM, API_DRIVER}; cuda2hipRename["cuMemHostUnregister"] = {"hipHostUnregister", CONV_MEM, API_DRIVER}; + cuda2hipRename["cuMemPrefetchAsync"] = {"hipMemPrefetchAsync___", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE (cudaMemPrefetchAsync has different signature) + cuda2hipRename["cuMemAdvise"] = {"hipMemAdvise", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // API_Runtime ANALOGUE (cudaMemAdvise) + cuda2hipRename["cuMemRangeGetAttribute"] = {"hipMemRangeGetAttribute", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // API_Runtime ANALOGUE (cudaMemRangeGetAttribute) + cuda2hipRename["cuMemRangeGetAttributes"] = {"hipMemRangeGetAttributes", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // API_Runtime ANALOGUE (cudaMemRangeGetAttributes) + // Texture Reference Mngmnt // Texture reference filtering modes cuda2hipRename["CUfilter_mode"] = {"hipTextureFilterMode", CONV_TEX, API_DRIVER}; // API_Runtime ANALOGUE (cudaTextureFilterMode) @@ -1022,6 +1032,9 @@ struct cuda2hipMap { cuda2hipRename["CU_TR_FILTER_MODE_POINT"] = {"hipFilterModePoint", CONV_TEX, API_DRIVER}; // 0 // API_Runtime ANALOGUE (cudaFilterModePoint = 0) cuda2hipRename["CU_TR_FILTER_MODE_LINEAR"] = {"hipFilterModeLinear", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 1 // API_Runtime ANALOGUE (cudaFilterModeLinear = 1) + cuda2hipRename["cuTexRefSetBorderColor"] = {"hipTexRefSetBorderColor", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE + cuda2hipRename["cuTexRefGetBorderColor"] = {"hipTexRefGetBorderColor", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE + // Profiler // unsupported yet by HIP cuda2hipRename["cuProfilerInitialize"] = {"hipProfilerInitialize", CONV_OTHER, API_DRIVER, HIP_UNSUPPORTED}; @@ -1111,6 +1124,25 @@ struct cuda2hipMap { cuda2hipRename["cudaMemcpyFromArrayAsync"] = {"hipMemcpyFromArrayAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaMemcpyFromSymbol"] = {"hipMemcpyFromSymbol", CONV_MEM, API_RUNTIME}; cuda2hipRename["cudaMemcpyFromSymbolAsync"] = {"hipMemcpyFromSymbolAsync", CONV_MEM, API_RUNTIME}; + cuda2hipRename["cudaMemAdvise"] = {"hipMemAdvise", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] + cuda2hipRename["cudaMemRangeGetAttribute"] = {"hipMemRangeGetAttribute", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] + cuda2hipRename["cudaMemRangeGetAttributes"] = {"hipMemRangeGetAttributes", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] + + // unsupported yet by HIP [CUDA 8.0.44] + // Memory advise values + cuda2hipRename["cudaMemoryAdvise"] = {"hipMemAdvise", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // API_Driver ANALOGUE (CUmem_advise) + cuda2hipRename["cudaMemAdviseSetReadMostly"] = {"hipMemAdviseSetReadMostly", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 1 // API_Driver ANALOGUE (CU_MEM_ADVISE_SET_READ_MOSTLY = 1) + cuda2hipRename["cudaMemAdviseUnsetReadMostly"] = {"hipMemAdviseUnsetReadMostly", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 2 // API_Driver ANALOGUE (CU_MEM_ADVISE_UNSET_READ_MOSTLY = 2) + cuda2hipRename["cudaMemAdviseSetPreferredLocation"] = {"hipMemAdviseSetPreferredLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 3 // API_Driver ANALOGUE (CU_MEM_ADVISE_SET_PREFERRED_LOCATION = 3) + cuda2hipRename["cudaMemAdviseUnsetPreferredLocation"] = {"hipMemAdviseUnsetPreferredLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 4 // API_Driver ANALOGUE (CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION = 4) + cuda2hipRename["cudaMemAdviseSetAccessedBy"] = {"hipMemAdviseSetAccessedBy", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 5 // API_Driver ANALOGUE (CU_MEM_ADVISE_SET_ACCESSED_BY = 5) + cuda2hipRename["cudaMemAdviseUnsetAccessedBy"] = {"hipMemAdviseUnsetAccessedBy", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 6 // API_Driver ANALOGUE (CU_MEM_ADVISE_UNSET_ACCESSED_BY = 6) + // CUmem_range_attribute + cuda2hipRename["cudaMemRangeAttribute"] = {"hipMemRangeAttribute", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // API_Driver ANALOGUE (CUmem_range_attribute) + cuda2hipRename["cudaMemRangeAttributeReadMostly"] = {"hipMemRangeAttributeReadMostly", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 1 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY = 1) + cuda2hipRename["cudaMemRangeAttributePreferredLocation"] = {"hipMemRangeAttributePreferredLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 2 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION = 2) + cuda2hipRename["cudaMemRangeAttributeAccessedBy"] = {"hipMemRangeAttributeAccessedBy", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 3 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY = 3) + cuda2hipRename["cudaMemRangeAttributeLastPrefetchLocation"] = {"hipMemRangeAttributeLastPrefetchLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 4 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION = 4) // memcpy kind cuda2hipRename["cudaMemcpyKind"] = {"hipMemcpyKind", CONV_MEM, API_RUNTIME}; @@ -1137,6 +1169,7 @@ struct cuda2hipMap { cuda2hipRename["cudaGetMipmappedArrayLevel"] = {"hipGetMipmappedArrayLevel", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaGetSymbolAddress"] = {"hipGetSymbolAddress", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaGetSymbolSize"] = {"hipGetSymbolSize", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; + cuda2hipRename["cudaMemPrefetchAsync"] = {"hipMemPrefetchAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // API_Driver ANALOGUE (cuMemPrefetchAsync) // malloc cuda2hipRename["cudaMalloc"] = {"hipMalloc", CONV_MEM, API_RUNTIME}; @@ -1379,10 +1412,12 @@ struct cuda2hipMap { // unsupported yet by HIP [CUDA 8.0.44] // P2P Attributes - cuda2hipRename["cudaDeviceP2PAttr"] = {"hipDeviceP2PAttribute", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (CUdevice_P2PAttribute) - cuda2hipRename["cudaDevP2PAttrPerformanceRank"] = {"hipDeviceP2PAttributePerformanceRank", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 0x01 // API_Runtime ANALOGUE (CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK = 0x01) - cuda2hipRename["cudaDevP2PAttrAccessSupported"] = {"hipDeviceP2PAttributeAccessSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 0x02 // API_Runtime ANALOGUE (CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED = 0x02) - cuda2hipRename["cudaDevP2PAttrNativeAtomicSupported"] = {"hipDeviceP2PAttributeNativeAtomicSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 0x03 // API_Runtime ANALOGUE (CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED = 0x03) + cuda2hipRename["cudaDeviceP2PAttr"] = {"hipDeviceP2PAttribute", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // API_DRIVER ANALOGUE (CUdevice_P2PAttribute) + cuda2hipRename["cudaDevP2PAttrPerformanceRank"] = {"hipDeviceP2PAttributePerformanceRank", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 0x01 // API_DRIVER ANALOGUE (CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK = 0x01) + cuda2hipRename["cudaDevP2PAttrAccessSupported"] = {"hipDeviceP2PAttributeAccessSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 0x02 // API_DRIVER ANALOGUE (CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED = 0x02) + cuda2hipRename["cudaDevP2PAttrNativeAtomicSupported"] = {"hipDeviceP2PAttributeNativeAtomicSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 0x03 // API_DRIVER ANALOGUE (CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED = 0x03) + // [CUDA 8.0.44] + cuda2hipRename["cudaDeviceGetP2PAttribute"] = {"hipDeviceGetP2PAttribute", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // API_DRIVER ANALOGUE (cuDeviceGetP2PAttribute) // Compute mode cuda2hipRename["cudaComputeMode"] = {"hipComputeMode", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // API_DRIVER ANALOGUE (CUcomputemode) @@ -1591,22 +1626,6 @@ struct cuda2hipMap { cuda2hipRename["cudaAddressModeMirror"] = {"hipAddressModeMirror", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaAddressModeBorder"] = {"hipAddressModeBorder", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - // unsupported yet by HIP [CUDA 8.0.44] - // Memory advise values - cuda2hipRename["cudaMemoryAdvise"] = {"hipMemAdvise", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // API_Driver ANALOGUE (CUmem_advise) - cuda2hipRename["cudaMemAdviseSetReadMostly"] = {"hipMemAdviseSetReadMostly", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 1 // API_Driver ANALOGUE (CU_MEM_ADVISE_SET_READ_MOSTLY = 1) - cuda2hipRename["cudaMemAdviseUnsetReadMostly"] = {"hipMemAdviseUnsetReadMostly", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 2 // API_Driver ANALOGUE (CU_MEM_ADVISE_UNSET_READ_MOSTLY = 2) - cuda2hipRename["cudaMemAdviseSetPreferredLocation"] = {"hipMemAdviseSetPreferredLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 3 // API_Driver ANALOGUE (CU_MEM_ADVISE_SET_PREFERRED_LOCATION = 3) - cuda2hipRename["cudaMemAdviseUnsetPreferredLocation"] = {"hipMemAdviseUnsetPreferredLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 4 // API_Driver ANALOGUE (CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION = 4) - cuda2hipRename["cudaMemAdviseSetAccessedBy"] = {"hipMemAdviseSetAccessedBy", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 5 // API_Driver ANALOGUE (CU_MEM_ADVISE_SET_ACCESSED_BY = 5) - cuda2hipRename["cudaMemAdviseUnsetAccessedBy"] = {"hipMemAdviseUnsetAccessedBy", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 6 // API_Driver ANALOGUE (CU_MEM_ADVISE_UNSET_ACCESSED_BY = 6) - // CUmem_range_attribute - cuda2hipRename["cudaMemRangeAttribute"] = {"hipMemRangeAttribute", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // API_Driver ANALOGUE (CUmem_range_attribute) - cuda2hipRename["cudaMemRangeAttributeReadMostly"] = {"hipMemRangeAttributeReadMostly", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 1 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY = 1) - cuda2hipRename["cudaMemRangeAttributePreferredLocation"] = {"hipMemRangeAttributePreferredLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 2 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION = 2) - cuda2hipRename["cudaMemRangeAttributeAccessedBy"] = {"hipMemRangeAttributeAccessedBy", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 3 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY = 3) - cuda2hipRename["cudaMemRangeAttributeLastPrefetchLocation"] = {"hipMemRangeAttributeLastPrefetchLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 4 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION = 4) - // functions cuda2hipRename["cudaCreateTextureObject"] = {"hipCreateTextureObject", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaDestroyTextureObject"] = {"hipDestroyTextureObject", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; From ca707fe6933d893904fdc69b26c997139c55856c Mon Sep 17 00:00:00 2001 From: emankov Date: Fri, 2 Jun 2017 16:30:43 +0300 Subject: [PATCH 70/77] [HIPIFY] rename legacy hipify perl script and its usage to hipify-perl [ROCm/hip commit: 4d035caedfdf02340a1e8140d0e16a033e7bb87a] --- projects/hip/bin/hipconvertinplace-perl.sh | 10 +++++----- projects/hip/bin/hipexamine-perl.sh | 6 +++--- projects/hip/bin/{hipify => hipify-perl} | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) rename projects/hip/bin/{hipify => hipify-perl} (99%) diff --git a/projects/hip/bin/hipconvertinplace-perl.sh b/projects/hip/bin/hipconvertinplace-perl.sh index a8c8d6d9e8..d500cc14c6 100755 --- a/projects/hip/bin/hipconvertinplace-perl.sh +++ b/projects/hip/bin/hipconvertinplace-perl.sh @@ -1,18 +1,18 @@ #!/bin/bash -#usage : hipconvertinplace.sh [DIRNAME] [HIPIFY_OPTIONS] +#usage : hipconvertinplace-perl.sh DIRNAME [hipify-perl options] -#hipify "inplace" all code files in specified directory. +#hipify "inplace" all code files in specified directory. # This can be quite handy when dealing with an existing CUDA code base since the script # preserves the existing directory structure. # For each code file, this script will: -# - If ".prehip file does not exist, copy the original code to a new file with extension ".prehip". Then Hipify the code file. +# - If ".prehip file does not exist, copy the original code to a new file with extension ".prehip". Then hipify the code file. # - If ".prehip" file exists, this is used as input to hipify. -# (this is useful for testing improvements to the hipify toolset). +# (this is useful for testing improvements to the hipify-perl toolset). SCRIPT_DIR=`dirname $0` SEARCH_DIR=$1 shift -$SCRIPT_DIR/hipify -inplace -print-stats "$@" `$SCRIPT_DIR/findcode.sh $SEARCH_DIR` +$SCRIPT_DIR/hipify-perl -inplace -print-stats "$@" `$SCRIPT_DIR/findcode.sh $SEARCH_DIR` diff --git a/projects/hip/bin/hipexamine-perl.sh b/projects/hip/bin/hipexamine-perl.sh index 40c1bf466d..9e0b01df44 100755 --- a/projects/hip/bin/hipexamine-perl.sh +++ b/projects/hip/bin/hipexamine-perl.sh @@ -1,12 +1,12 @@ #!/bin/bash -#usage : hipexamine.sh DIRNAME [hipify.pl options] +#usage : hipexamine.sh DIRNAME [hipify-perl options] -# Generate HIP stats (LOC, CUDA->API conversions, missing functionality) for all the code files +# Generate HIP stats (LOC, CUDA->API conversions, missing functionality) for all the code files # in the specified directory. SCRIPT_DIR=`dirname $0` SEARCH_DIR=$1 shift -$SCRIPT_DIR/hipify -no-output -print-stats "$@" `$SCRIPT_DIR/findcode.sh $SEARCH_DIR` +$SCRIPT_DIR/hipify-perl -no-output -print-stats "$@" `$SCRIPT_DIR/findcode.sh $SEARCH_DIR` diff --git a/projects/hip/bin/hipify b/projects/hip/bin/hipify-perl similarity index 99% rename from projects/hip/bin/hipify rename to projects/hip/bin/hipify-perl index 4d77fad3ed..27acc5bccc 100755 --- a/projects/hip/bin/hipify +++ b/projects/hip/bin/hipify-perl @@ -20,7 +20,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. ## -#usage hipify [OPTIONS] INPUT_FILE +#usage hipify-perl [OPTIONS] INPUT_FILE use Getopt::Long; my $warn_whitelist =""; @@ -201,7 +201,7 @@ while (@ARGV) { my %ft; clearStats(\%ft, \@statNames); my $countIncludes = 0; - my $countKeywords = 0; # keywords like __global__, __shared__ - not converted by hipify but counted here. + my $countKeywords = 0; # keywords like __global__, __shared__ - not converted by hipify-perl, but counted here. my $warnings = 0; my $warningsCublas = 0; my $warningsCurand = 0; From 737e0cc93e34d13537589d9a3c469410605a433c Mon Sep 17 00:00:00 2001 From: emankov Date: Fri, 2 Jun 2017 16:33:48 +0300 Subject: [PATCH 71/77] [HIPIFY] annotation [ROCm/hip commit: ccc4cd1a3ebc423e4485d85ffec2d7775406690f] --- projects/hip/bin/hipexamine-perl.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/hip/bin/hipexamine-perl.sh b/projects/hip/bin/hipexamine-perl.sh index 9e0b01df44..4e3a261aa4 100755 --- a/projects/hip/bin/hipexamine-perl.sh +++ b/projects/hip/bin/hipexamine-perl.sh @@ -1,6 +1,6 @@ #!/bin/bash -#usage : hipexamine.sh DIRNAME [hipify-perl options] +#usage : hipexamine-perl.sh DIRNAME [hipify-perl options] # Generate HIP stats (LOC, CUDA->API conversions, missing functionality) for all the code files # in the specified directory. From 47ec04014054d3a28125f63f14ec77ead0ae7551 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Fri, 2 Jun 2017 11:19:33 -0500 Subject: [PATCH 72/77] added half data type and vector destructors 1. Added half data types to hip_fp16.h 2. Added destructor to vector data types Change-Id: Id5ae76a663bb90a4bde2839ec79c58fbaee5072f [ROCm/hip commit: cb7c4c423c8ec1fd835f0ae4119adc3cfd61b036] --- .../hip/include/hip/hcc_detail/hip_fp16.h | 1 + .../include/hip/hcc_detail/hip_vector_types.h | 19 +++++++++++-------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/projects/hip/include/hip/hcc_detail/hip_fp16.h b/projects/hip/include/hip/hcc_detail/hip_fp16.h index a1abce2191..b1ecc61cb0 100644 --- a/projects/hip/include/hip/hcc_detail/hip_fp16.h +++ b/projects/hip/include/hip/hcc_detail/hip_fp16.h @@ -28,6 +28,7 @@ THE SOFTWARE. typedef __fp16 __half; typedef __fp16 __half1 __attribute__((ext_vector_type(1))); typedef __fp16 __half2 __attribute__((ext_vector_type(2))); +typedef __fp16 half; /* Half Arithmetic Functions diff --git a/projects/hip/include/hip/hcc_detail/hip_vector_types.h b/projects/hip/include/hip/hcc_detail/hip_vector_types.h index 3c3b26c12a..9da34d9f32 100644 --- a/projects/hip/include/hip/hcc_detail/hip_vector_types.h +++ b/projects/hip/include/hip/hcc_detail/hip_vector_types.h @@ -37,38 +37,41 @@ THE SOFTWARE. #define MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(type) \ __device__ __host__ type() {} \ __device__ __host__ type(type& val) : x(val.x) { } \ -__device__ __host__ type(const type& val) : x(val.x) { } +__device__ __host__ type(const type& val) : x(val.x) { } \ +__device__ __host__ ~type() {} #define MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(type) \ __device__ __host__ type() {} \ __device__ __host__ type(type& val) : x(val.x), y(val.y) { } \ -__device__ __host__ type(const type& val) : x(val.x), y(val.y) { } +__device__ __host__ type(const type& val) : x(val.x), y(val.y) { } \ +__device__ __host__ ~type() {} #define MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(type) \ __device__ __host__ type() {} \ __device__ __host__ type(type& val) : x(val.x), y(val.y), z(val.z) { } \ -__device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z) { } +__device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z) { } \ +__device__ __host__ ~type() {} #define MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(type) \ __device__ __host__ type() {} \ __device__ __host__ type(type& val) : x(val.x), y(val.y), z(val.z), w(val.w) { } \ -__device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z), w(val.w) { } - +__device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z), w(val.w) { } \ +__device__ __host__ ~type() {} #define MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(type, type1) \ __device__ __host__ type(type1 val) : x(val) {} \ #define MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(type, type1) \ __device__ __host__ type(type1 val) : x(val), y(val) {} \ -__device__ __host__ type(type1 val1, type1 val2) : x(val1), y(val2) {} +__device__ __host__ type(type1 val1, type1 val2) : x(val1), y(val2) {} \ #define MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(type, type1) \ __device__ __host__ type(type1 val) : x(val), y(val), z(val) {} \ -__device__ __host__ type(type1 val1, type1 val2, type1 val3) : x(val1), y(val2), z(val3) {} +__device__ __host__ type(type1 val1, type1 val2, type1 val3) : x(val1), y(val2), z(val3) {} \ #define MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(type, type1) \ __device__ __host__ type(type1 val) : x(val), y(val), z(val), w(val) {} \ -__device__ __host__ type(type1 val1, type1 val2, type1 val3, type1 val4) : x(val1), y(val2), z(val3), w(val4) {} +__device__ __host__ type(type1 val1, type1 val2, type1 val3, type1 val4) : x(val1), y(val2), z(val3), w(val4) {} \ struct uchar1 { #ifdef __cplusplus From a1bdd5f58507a09c05fb989134338eb51aa47d91 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Sat, 3 Jun 2017 17:09:19 -0500 Subject: [PATCH 73/77] Update tests, add p2p coherency test. [ROCm/hip commit: d5c161632476af9a18671393b948dfbc4635e3b6] --- projects/hip/src/hip_hcc.cpp | 4 +- .../src/runtimeApi/memory/hipHostMalloc.cpp | 17 ++ .../runtimeApi/memory/p2p_copy_coherency.cpp | 170 ++++++++++++++++++ 3 files changed, 190 insertions(+), 1 deletion(-) create mode 100644 projects/hip/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 4400e4596e..5d8846da1e 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -84,6 +84,8 @@ int HIP_DENY_PEER_ACCESS = 0; // Force async copies to actually use the synchronous copy interface. int HIP_FORCE_SYNC_COPY = 0; +// TODO - set these to 0 and 1 +int HIP_EVENT_SYS_RELEASE=1; int HIP_COHERENT_HOST_ALLOC = 0; // TODO - set to 0 once we resolve stability. @@ -94,9 +96,9 @@ int HIP_SYNC_HOST_ALLOC = 1; int HIP_SYNC_NULL_STREAM = 1; // HIP needs to change some behavior based on HCC_OPT_FLUSH : +// TODO - set this to 1 int HCC_OPT_FLUSH = 0; -int HIP_EVENT_SYS_RELEASE=0; diff --git a/projects/hip/tests/src/runtimeApi/memory/hipHostMalloc.cpp b/projects/hip/tests/src/runtimeApi/memory/hipHostMalloc.cpp index 54073e4901..47baf5c206 100644 --- a/projects/hip/tests/src/runtimeApi/memory/hipHostMalloc.cpp +++ b/projects/hip/tests/src/runtimeApi/memory/hipHostMalloc.cpp @@ -52,6 +52,8 @@ std::vector syncMsg = {"event", "stream", "device"}; void CheckHostPointer(int numElements, int *ptr, unsigned eventFlags, int syncMethod, std::string msg) { std::cerr << "test: CheckHostPointer " << msg + //<< " HIP_COHERENT_HOST_ALLOC=" << HIP_COHERENT_HOST_ALLOC + //<< " HIP_EVENT_SYS_RELEASE=" << HIP_EVENT_SYS_RELEASE << " eventFlags = " << std::hex << eventFlags << ((eventFlags & hipEventReleaseToDevice) ? " hipEventReleaseToDevice" : "") << ((eventFlags & hipEventReleaseToSystem) ? " hipEventReleaseToSystem" : "") @@ -185,6 +187,21 @@ int main(){ } + // Check defaults: + if (1) { + int *A = nullptr; + HIPCHECK(hipHostMalloc((void**)&A, sizeBytes)); + const char *ptrType = "default"; + CheckHostPointer(numElements, A, 0, SYNC_DEVICE, ptrType); + CheckHostPointer(numElements, A, 0, SYNC_STREAM, ptrType); + CheckHostPointer(numElements, A, 0, SYNC_EVENT, ptrType); + + CheckHostPointer(numElements, A, 0, SYNC_DEVICE, ptrType); + CheckHostPointer(numElements, A, 0, SYNC_STREAM, ptrType); + CheckHostPointer(numElements, A, 0, SYNC_EVENT, ptrType); + } + + } diff --git a/projects/hip/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp b/projects/hip/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp new file mode 100644 index 0000000000..459c0054c9 --- /dev/null +++ b/projects/hip/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp @@ -0,0 +1,170 @@ +/* +Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +// Simple test for memset. +// Also serves as a template for other tests. + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 + * RUN: %t + * HIT_END + */ + +#include "hip/hip_runtime.h" +#include "test_common.h" + +#ifdef __HIP_PLATFORM_HCC__ +#include +#endif + +#define USE_HSA_COPY 1 + +int enablePeers(int dev0, int dev1) +{ + int canAccessPeer01, canAccessPeer10; + HIPCHECK(hipDeviceCanAccessPeer(&canAccessPeer01, dev0, dev1)); + HIPCHECK(hipDeviceCanAccessPeer(&canAccessPeer10, dev1, dev0)); + if (!canAccessPeer01 || !canAccessPeer10) { + return -1; + } + + HIPCHECK(hipSetDevice(dev0)); + HIPCHECK(hipDeviceEnablePeerAccess(dev1, 0/*flags*/)); + HIPCHECK(hipSetDevice(dev1)); + HIPCHECK(hipDeviceEnablePeerAccess(dev0, 0/*flags*/)); + + return 0; +}; + + +__global__ void +memsetIntKernel(int * ptr, int val, size_t numElements) +{ + int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + if (gid < numElements) { + ptr[gid] = val; + } +}; + + +void checkReverse(const int *ptr, int numElements, int expected) { + for (int i=numElements-1; i>=0; i--) { + if (ptr[i] != expected) { + printf ("i=%d, ptr[](%d) != expected (%d)\n", i, ptr[i], expected); + assert (ptr[i] == expected); + } + } + + printf ("test: OK\n"); +} + + +void runTest(bool stepAIsCopy, hipStream_t gpu0Stream, hipStream_t gpu1Stream, int numElements, + int * dataGpu0, int *dataGpu1, int *dataHost, int expected) +{ + hipEvent_t e; + HIPCHECK(hipEventCreateWithFlags(&e,0)); + + printf ("test: runTest with %s\n", stepAIsCopy ? "copy" : "kernel"); + const size_t sizeElements = numElements * sizeof(int); + + hipStream_t stepAStream = gpu0Stream; + + if (stepAIsCopy) { +#ifdef USE_HSA_COPY + HIPCHECK(hipMemcpyAsync(dataGpu1, dataGpu0, sizeElements, hipMemcpyDeviceToDevice, stepAStream)); +#endif + } else { + assert(0); // not yet supported. + } + + HIPCHECK(hipEventRecord(e, stepAStream)); + HIPCHECK(hipStreamWaitEvent(gpu1Stream, e, 0)); + + HIPCHECK(hipMemcpyAsync(dataHost, dataGpu1, sizeElements, hipMemcpyDeviceToHost, gpu1Stream)); + + HIPCHECK(hipStreamSynchronize(gpu1Stream)); + + checkReverse(dataHost, numElements, expected); +} + + +void testMultiGpu0(int dev0, int dev1, int numElements) +{ + const size_t sizeElements = numElements * sizeof(int); + + int * dataGpu0, *dataGpu1, *dataHost; + hipStream_t gpu0Stream, gpu1Stream; + const int expected = 42; + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); + + HIPCHECK(hipSetDevice(dev0)); + + HIPCHECK(hipMalloc(&dataGpu0, sizeElements)); + HIPCHECK(hipStreamCreate(&gpu0Stream)); + hipLaunchKernelGGL(memsetIntKernel, dim3(blocks), dim3(threadsPerBlock), 0, gpu0Stream, + dataGpu0, expected, numElements); + HIPCHECK(hipDeviceSynchronize()); + + + HIPCHECK(hipSetDevice(dev1)); + HIPCHECK(hipMalloc(&dataGpu1, sizeElements)); + HIPCHECK(hipStreamCreate(&gpu1Stream)); + hipLaunchKernelGGL(memsetIntKernel, dim3(blocks), dim3(threadsPerBlock), 0, gpu0Stream, + dataGpu1, 0x34, numElements); + HIPCHECK(hipDeviceSynchronize()); + + HIPCHECK(hipHostMalloc(&dataHost, sizeElements)); + memset(dataHost, 13, sizeElements); + +#ifdef __HIP_PLATFORM_HCC__ + hc::am_memtracker_print(0x0); +#endif + + printf (" test: init complete\n"); + + runTest(true/*stepAIsCopy*/, gpu0Stream, gpu1Stream, numElements, dataGpu0, dataGpu1, dataHost, expected); + +}; + + + +int main(int argc, char *argv[]) +{ + HipTest::parseStandardArguments(argc, argv, true); + + int numElements = N; + + int dev0 = 0; + int dev1 = 1; + + // TODO - only works on multi-GPU system: + if (enablePeers(dev0,dev1) == -1) { + printf ("warning : could not find peer gpus\n"); + return -1; + }; + + //testMultiGpu0(dev0, dev1, numElements); + + + + passed(); +}; From f3950e074839565c93d2bfcfb21c70ac35765b40 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Sun, 4 Jun 2017 20:18:37 -0500 Subject: [PATCH 74/77] Update tests. Fix some NVCC issues. Add hipStreamSync2, record_event tests. [ROCm/hip commit: 8ce6d179830bad76e7c853d74efcfb13c53b8f61] --- .../hip/tests/src/kernel/inline_asm_vadd.cpp | 2 +- .../src/runtimeApi/event/record_event.cpp | 149 +++++++++++++++ .../src/runtimeApi/memory/hipHostMalloc.cpp | 3 +- .../runtimeApi/memory/hipMemoryAllocate.cpp | 132 -------------- .../src/runtimeApi/stream/hipStreamSync2.cpp | 169 ++++++++++++++++++ projects/hip/tests/src/test_common.h | 14 ++ 6 files changed, 335 insertions(+), 134 deletions(-) create mode 100644 projects/hip/tests/src/runtimeApi/event/record_event.cpp delete mode 100644 projects/hip/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp create mode 100644 projects/hip/tests/src/runtimeApi/stream/hipStreamSync2.cpp diff --git a/projects/hip/tests/src/kernel/inline_asm_vadd.cpp b/projects/hip/tests/src/kernel/inline_asm_vadd.cpp index 481b606e89..7a941d31af 100644 --- a/projects/hip/tests/src/kernel/inline_asm_vadd.cpp +++ b/projects/hip/tests/src/kernel/inline_asm_vadd.cpp @@ -16,7 +16,7 @@ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTI THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s + * BUILD: %t %s EXCLUDE_HIP_PLATFORM nvcc * RUN: %t * HIT_END */ diff --git a/projects/hip/tests/src/runtimeApi/event/record_event.cpp b/projects/hip/tests/src/runtimeApi/event/record_event.cpp new file mode 100644 index 0000000000..66027b1643 --- /dev/null +++ b/projects/hip/tests/src/runtimeApi/event/record_event.cpp @@ -0,0 +1,149 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * RUN: %t + * HIT_END + */ + + +#include "test_common.h" + +enum SyncMode { + syncNone, + syncNullStream, + syncOtherStream, +}; + + +const char *syncModeString(int syncMode) { + switch (syncMode) { + case syncNone: + return "syncNone"; + case syncNullStream: + return "syncNullStream"; + case syncOtherStream: + return "syncOtherStream"; + default: + return "unknown"; + }; +}; + + +void test(int *C_d, int *C_h, int64_t numElements, SyncMode syncMode) +{ + printf ("\ntest: syncMode=%s\n", syncModeString(syncMode)); + + size_t sizeBytes = numElements * sizeof(int); + + int count =100; + int init0 = 0; + HIPCHECK(hipMemset(C_d, init0, sizeBytes)); + for (int i=0; i0.0f); + printf ("time=%6.2f\n", t); + + HIPCHECK(hipEventElapsedTime(&t, stop, start)); + assert (t<0.0f); + printf ("negtime=%6.2f\n", t); + + HIPCHECK(hipEventElapsedTime(&t, start, start)); + assert (t==0.0f); + HIPCHECK(hipEventElapsedTime(&t, stop, stop)); + assert (t==0.0f); + + + if (stream) { + HIPCHECK(hipStreamDestroy(stream)); + } + HIPCHECK(hipEventDestroy(start)); + HIPCHECK(hipEventDestroy(stop)); + + printf ("test: OK \n"); +} + + + +void runTests(int64_t numElements) +{ + size_t sizeBytes = numElements * sizeof(int); + + printf ("test: starting sequence with sizeBytes=%zu bytes, %6.2f MB\n", sizeBytes, sizeBytes/1024.0/1024.0); + + + int *C_h, *C_d; + HIPCHECK(hipMalloc(&C_d, sizeBytes)); + HIPCHECK(hipHostMalloc(&C_h, sizeBytes)); + + + { + test (C_d, C_h, numElements, syncNone); + test (C_d, C_h, numElements, syncNullStream); + test (C_d, C_h, numElements, syncOtherStream); + //test (C_d, C_h, numElements, syncDevice); + } + + + HIPCHECK(hipFree(C_d)); + HIPCHECK(hipHostFree(C_h)); +} + + +int main(int argc, char *argv[]) +{ + HipTest::parseStandardArguments(argc, argv, true /*failOnUndefinedArg*/); + + runTests(4000000); + + passed(); +} diff --git a/projects/hip/tests/src/runtimeApi/memory/hipHostMalloc.cpp b/projects/hip/tests/src/runtimeApi/memory/hipHostMalloc.cpp index 47baf5c206..607e2a9f63 100644 --- a/projects/hip/tests/src/runtimeApi/memory/hipHostMalloc.cpp +++ b/projects/hip/tests/src/runtimeApi/memory/hipHostMalloc.cpp @@ -21,11 +21,12 @@ */ /* HIT_START - * BUILD: %t %s ../../test_common.cpp + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 * RUN: %t * HIT_END */ +#include #include"test_common.h" #define LEN 1024*1024 diff --git a/projects/hip/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp b/projects/hip/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp deleted file mode 100644 index 34951f0a09..0000000000 --- a/projects/hip/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp +++ /dev/null @@ -1,132 +0,0 @@ -/* -Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR -IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -/* HIT_START - * BUILD: %t %s ../../test_common.cpp - * RUN: %t - * HIT_END - */ - -#include"test_common.h" - -#define NUM_ELEMENTS 1024*1024*64 -#define SIZE NUM_ELEMENTS*sizeof(int) - -int p_count = 4; - - -void multiGpuHostAlloc(int allocDevice) -{ - - int numDevices; - HIPCHECK(hipGetDeviceCount(&numDevices)); - - printf ("info: trying multiGpuHostAlloc with allocDevice=%d numDevices=%d\n", allocDevice, numDevices); - - - HIPCHECK(hipSetDevice(allocDevice)); - - int *Ah, *Ch; - hipHostMalloc((void**)&Ah, SIZE); - hipHostMalloc((void**)&Ch, SIZE); - - const int init = -1; - for (size_t i=0; i 1) - { - multiGpuHostAlloc(1); - } - } - - passed(); -} diff --git a/projects/hip/tests/src/runtimeApi/stream/hipStreamSync2.cpp b/projects/hip/tests/src/runtimeApi/stream/hipStreamSync2.cpp new file mode 100644 index 0000000000..b57e120dcc --- /dev/null +++ b/projects/hip/tests/src/runtimeApi/stream/hipStreamSync2.cpp @@ -0,0 +1,169 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * RUN: %t + * HIT_END + */ + + +#include "test_common.h" + +enum SyncMode { + syncNone, + syncNullStream, + syncOtherStream, + syncMarkerThenOtherStream, + syncMarkerThenOtherNonBlockingStream, + syncDevice +}; + + +const char *syncModeString(int syncMode) { + switch (syncMode) { + case syncNone: + return "syncNone"; + case syncNullStream: + return "syncNullStream"; + case syncOtherStream: + return "syncOtherStream"; + case syncMarkerThenOtherStream: + return "syncMarkerThenOtherStream"; + case syncMarkerThenOtherNonBlockingStream: + return "syncMarkerThenOtherNonBlockingStream"; + case syncDevice: + return "syncDevice"; + default: + return "unknown"; + }; +}; + + +void test(int *C_d, int *C_h, int64_t numElements, SyncMode syncMode, bool expectMismatch) +{ + printf ("\ntest: syncMode=%s expectMismatch=%d\n", syncModeString(syncMode), expectMismatch); + + size_t sizeBytes = numElements * sizeof(int); + + int count =100; + int init0 = 0; + HIPCHECK(hipMemset(C_d, init0, sizeBytes)); + for (int i=0; i 0); + } + + + HIPCHECK(hipStreamDestroy(otherStream)); + HIPCHECK(hipEventDestroy(e)); + + printf ("test: OK - %d mismatches (%6.2f%%)\n", mismatches, ((double)(mismatches)*100.0)/numElements); +} + + +void testEventRecord() +{ +} + + +void runTests(int64_t numElements) +{ + size_t sizeBytes = numElements * sizeof(int); + + printf ("\n\ntest: starting sequence with sizeBytes=%zu bytes, %6.2f MB\n", sizeBytes, sizeBytes/1024.0/1024.0); + + + int *C_h, *C_d; + HIPCHECK(hipMalloc(&C_d, sizeBytes)); + HIPCHECK(hipHostMalloc(&C_h, sizeBytes)); + + + { + test (C_d, C_h, numElements, syncNone, true /*expectMismatch*/); + test (C_d, C_h, numElements, syncNullStream, false /*expectMismatch*/); + test (C_d, C_h, numElements, syncOtherStream, true /*expectMismatch*/); + test (C_d, C_h, numElements, syncDevice, false /*expectMismatch*/); + test (C_d, C_h, numElements, syncMarkerThenOtherStream, false /*expectMismatch*/); + test (C_d, C_h, numElements, syncMarkerThenOtherNonBlockingStream, true /*expectMismatch*/); + } + + + HIPCHECK(hipFree(C_d)); + HIPCHECK(hipHostFree(C_h)); +} + + +int main(int argc, char *argv[]) +{ + HipTest::parseStandardArguments(argc, argv, true /*failOnUndefinedArg*/); + + runTests(40000000); + + passed(); +} diff --git a/projects/hip/tests/src/test_common.h b/projects/hip/tests/src/test_common.h index 81edca4e1e..f585fb8bca 100644 --- a/projects/hip/tests/src/test_common.h +++ b/projects/hip/tests/src/test_common.h @@ -201,6 +201,20 @@ addCountReverse( const T *A_d, } +template +__global__ void +memsetReverse( T *C_d, T val, + int64_t NELEM) +{ + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; + + for (int64_t i=NELEM-stride+offset; i>=0; i-=stride) { + C_d[i] = val; + } +} + + template void setDefaultData(size_t numElements, T *A_h, T* B_h, T *C_h) { From 445042f9163258611881d33c0b8f6779e5a491d6 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 5 Jun 2017 00:41:18 -0500 Subject: [PATCH 75/77] Fix HIP_SYNC_NULL_STREAM=0 mode. - Fix null-stream sync - hipStreamDestroy of null stream returns hipErrorInvalidResourceHandle - Update documentation. - Add tests for null stream sync, hipEventElapsedTime. - Rename internal enum hipEventStatusRecorded to hipEventStatusComplete - refactor hipStreamWaitEvent to streamline control-flow [ROCm/hip commit: 39c18e5e5ff1685ee84e1e35e28fb0231834e7e5] --- .../include/hip/hcc_detail/hip_runtime_api.h | 10 +- projects/hip/src/hip_event.cpp | 74 ++++----- projects/hip/src/hip_hcc.cpp | 56 ++++--- projects/hip/src/hip_hcc_internal.h | 10 +- projects/hip/src/hip_stream.cpp | 30 ++-- .../src/runtimeApi/event/record_event.cpp | 141 ++++++++++++------ .../src/runtimeApi/stream/hipStreamSync2.cpp | 72 ++++++--- 7 files changed, 249 insertions(+), 144 deletions(-) diff --git a/projects/hip/include/hip/hcc_detail/hip_runtime_api.h b/projects/hip/include/hip/hcc_detail/hip_runtime_api.h index 25eac31ec6..fde38c8395 100644 --- a/projects/hip/include/hip/hcc_detail/hip_runtime_api.h +++ b/projects/hip/include/hip/hcc_detail/hip_runtime_api.h @@ -658,10 +658,12 @@ hipError_t hipStreamSynchronize(hipStream_t stream); * * This function inserts a wait operation into the specified stream. * All future work submitted to @p stream will wait until @p event reports completion before beginning execution. - * This function is host-asynchronous and the function may return before the wait has completed. + * + * This function only waits for commands in the current stream to complete. Notably,, this function does + * not impliciy wait for commands in the default stream to complete, even if the specified stream is + * created with hipStreamNonBlocking = 0. * * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamSynchronize, hipStreamDestroy - * */ hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int flags); @@ -766,10 +768,10 @@ hipError_t hipEventCreate(hipEvent_t* event); * the specified stream, after all previous * commands in that stream have completed executing. * - * If hipEventRecord() has been previously called aon event, then this call will overwrite any existing state in event. + * If hipEventRecord() has been previously called on this event, then this call will overwrite any existing state in event. * * If this function is called on a an event that is currently being recorded, results are undefined - either - * outstanding recording may save state into the event, and the order is not guaranteed. This shoul be avoided. + * outstanding recording may save state into the event, and the order is not guaranteed. * * @see hipEventCreate, hipEventCreateWithFlags, hipEventQuery, hipEventSynchronize, hipEventDestroy, hipEventElapsedTime * diff --git a/projects/hip/src/hip_event.cpp b/projects/hip/src/hip_event.cpp index 8ef652489a..ab1c43a00b 100644 --- a/projects/hip/src/hip_event.cpp +++ b/projects/hip/src/hip_event.cpp @@ -53,15 +53,12 @@ void ihipEvent_t::attachToCompletionFuture(const hc::completion_future *cf, -void ihipEvent_t::setTimestamp() +void ihipEvent_t::refereshEventStatus() { bool isReady0 = _marker.is_ready(); bool isReady1; int val = 0; - if (_state == hipEventStatusRecorded) { - // already recorded, done: - return; - } else { + if (_state == hipEventStatusRecording) { // TODO - use completion-future functions to obtain ticks and timestamps: hsa_signal_t *sig = static_cast (_marker.get_native_handle()); isReady1 = _marker.is_ready(); @@ -78,12 +75,12 @@ void ihipEvent_t::setTimestamp() _timestamp = 0; } - _state = hipEventStatusRecorded; + _state = hipEventStatusComplete; } } } - if (_state != hipEventStatusRecorded) { + if (_state != hipEventStatusComplete) { //printf (" not ready isReady0=%d val=%d isReady1=%d\n", isReady0, val, isReady1); } } @@ -103,12 +100,10 @@ hipError_t ihipEventCreate(hipEvent_t* event, unsigned flags) const unsigned releaseFlags = (hipEventReleaseToDevice | hipEventReleaseToSystem); const bool illegalFlags = (flags & ~supportedFlags) || // can't set any unsupported flags. - (flags & releaseFlags) == releaseFlags; // can't set both + (flags & releaseFlags) == releaseFlags; // can't set both release flags if (!illegalFlags) { - ihipEvent_t *eh = new ihipEvent_t(flags); - - *event = eh; + *event = new ihipEvent_t(flags); } else { e = hipErrorInvalidValue; } @@ -148,7 +143,7 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream) ctx->locked_syncDefaultStream(true, true); event->_timestamp = hc::get_system_ticks(); - event->_state = hipEventStatusRecorded; + event->_state = hipEventStatusComplete; return ihipLogStatus(hipSuccess); } else { event->_state = hipEventStatusRecording; @@ -209,41 +204,50 @@ hipError_t hipEventElapsedTime(float *ms, hipEvent_t start, hipEvent_t stop) { HIP_INIT_API(ms, start, stop); - start->setTimestamp(); - stop->setTimestamp(); - hipError_t status = hipSuccess; + *ms = 0.0f; - if (start && stop) { - // refresh status: - if ((start->_state == hipEventStatusRecorded) && (stop->_state == hipEventStatusRecorded)) { - // Common case, we have good information for both events. + if ((start == nullptr) || + (start->_flags & hipEventDisableTiming) || + (start->_state == hipEventStatusUnitialized) || (start->_state == hipEventStatusCreated) || + (stop == nullptr) || + (stop->_flags & hipEventDisableTiming) || + ( stop->_state == hipEventStatusUnitialized) || ( stop->_state == hipEventStatusCreated)) { - int64_t tickDiff = (stop->timestamp() - start->timestamp()); + // Both events must be at least recorded else return hipErrorInvalidResourceHandle - uint64_t freqHz; - hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &freqHz); - if (freqHz) { - *ms = ((double)(tickDiff) / (double)(freqHz)) * 1000.0f; - status = hipSuccess; - } else { - * ms = 0.0f; - status = hipErrorInvalidValue; - } + status = hipErrorInvalidResourceHandle; + + } else { + // Refresh status, if still recording... + start->refereshEventStatus(); + stop->refereshEventStatus(); + + if ((start->_state == hipEventStatusComplete) && (stop->_state == hipEventStatusComplete)) { + // Common case, we have good information for both events. + + int64_t tickDiff = (stop->timestamp() - start->timestamp()); + + uint64_t freqHz; + hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &freqHz); + if (freqHz) { + *ms = ((double)(tickDiff) / (double)(freqHz)) * 1000.0f; + status = hipSuccess; + } else { + * ms = 0.0f; + status = hipErrorInvalidValue; + } } else if ((start->_state == hipEventStatusRecording) || (stop->_state == hipEventStatusRecording)) { status = hipErrorNotReady; - } else if ((start->_state == hipEventStatusUnitialized) || - (stop->_state == hipEventStatusUnitialized)) { - status = hipErrorInvalidResourceHandle; + } else { + assert(0); } - } else { - status = hipErrorInvalidResourceHandle; - } + } return ihipLogStatus(status); } diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 5d8846da1e..0cdc57eaab 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -92,7 +92,8 @@ int HIP_COHERENT_HOST_ALLOC = 0; // USE_ HIP_SYNC_HOST_ALLOC int HIP_SYNC_HOST_ALLOC = 1; -// Sync on host between +// Chicken bit to sync on host to implement null stream. +// If 0, null stream synchronization is performed on the GPU int HIP_SYNC_NULL_STREAM = 1; // HIP needs to change some behavior based on HCC_OPT_FLUSH : @@ -987,11 +988,17 @@ std::string ihipCtx_t::toString() const -// Implement "default" stream syncronization -// This waits for all other streams to drain before continuing. +// This called for submissions that are sent to the null/default stream. This routine ensures +// that this new command waits for activity in the other streams to complete before proceeding. +// +// HIP_SYNC_NULL_STREAM=0 does all dependency resolutiokn on the GPU +// HIP_SYNC_NULL_STREAM=1 s legacy non-optimal mode which conservatively waits on host. +// // If waitOnSelf is set, this additionally waits for the default stream to empty. // In new HIP_SYNC_NULL_STREAM=0 mode, this enqueues a marker which causes the default stream to wait for other // activity, but doesn't actually block the host. If host blocking is desired, the caller should set syncHost. +// +// syncToHost causes host to wait for the stream to finish. // Note HIP_SYNC_NULL_STREAM=1 path always sync to Host. void ihipCtx_t::locked_syncDefaultStream(bool waitOnSelf, bool syncHost) { @@ -1005,34 +1012,36 @@ void ihipCtx_t::locked_syncDefaultStream(bool waitOnSelf, bool syncHost) for (auto streamI=crit->const_streams().begin(); streamI!=crit->const_streams().end(); streamI++) { ihipStream_t *stream = *streamI; + // Don't wait for streams that have "opted-out" of syncing with NULL stream. + // And - don't wait for the NULL stream, unless waitOnSelf specified. + bool waitThisStream = (!(stream->_flags & hipStreamNonBlocking)) && + (waitOnSelf || (stream != _defaultStream)); + if (HIP_SYNC_NULL_STREAM) { - // Don't wait for streams that have "opted-out" of syncing with NULL stream. - // And - don't wait for the NULL stream - if (!(stream->_flags & hipStreamNonBlocking)) { - - if (waitOnSelf || (stream != _defaultStream)) { - stream->locked_wait(); - } + if (waitThisStream) { + stream->locked_wait(); } } else { - if (!(stream->_flags & hipStreamNonBlocking) && (stream != _defaultStream)) { + if (waitThisStream) { LockedAccessor_StreamCrit_t streamCrit(stream->_criticalData); // The last marker will provide appropriate visibility: if (!streamCrit->_av.get_is_empty()) { depOps.push_back(streamCrit->_av.create_marker(hc::accelerator_scope)); + tprintf(DB_SYNC, " push marker to wait for stream=%s\n", ToString(stream).c_str()); + } else { + tprintf(DB_SYNC, " skipped stream=%s since it is empty\n", ToString(stream).c_str()); } } } } - // Enqueue a barrier to wait on all the barriers we sent above: if (!HIP_SYNC_NULL_STREAM && !depOps.empty()) { LockedAccessor_StreamCrit_t defaultStreamCrit(_defaultStream->_criticalData); - tprintf(DB_SYNC, " null-stream wait on %zu non-empty streams\n", depOps.size()); + tprintf(DB_SYNC, " null-stream wait on %zu non-empty streams. sync_host=%d\n", depOps.size(), syncHost); hc::completion_future defaultCf = defaultStreamCrit->_av.create_blocking_marker(depOps.begin(), depOps.end(), hc::accelerator_scope); if (syncHost) { defaultCf.wait(); // TODO - account for active or blocking here. @@ -1374,6 +1383,7 @@ void ihipInit() hipStream_t ihipSyncAndResolveStream(hipStream_t stream) { if (stream == hipStreamNull ) { + // Submitting to NULL stream, call locked_syncDefaultStream to wait for all other streams: ihipCtx_t *ctx = ihipGetTlsDefaultCtx(); tprintf(DB_SYNC, "ihipSyncAndResolveStream %s wait on default stream\n", ToString(stream).c_str()); @@ -1382,34 +1392,38 @@ hipStream_t ihipSyncAndResolveStream(hipStream_t stream) #endif return ctx->_defaultStream; } else { - // All streams have to wait for legacy default stream to be empty: + // Submitting to a "normal" stream, just wait for null stream: if (!(stream->_flags & hipStreamNonBlocking)) { if (HIP_SYNC_NULL_STREAM) { - tprintf(DB_SYNC, "ihipSyncAndResolveStream %s wait on default stream\n", ToString(stream).c_str()); + tprintf(DB_SYNC, "ihipSyncAndResolveStream %s host-wait on default stream\n", ToString(stream).c_str()); stream->getCtx()->_defaultStream->locked_wait(); } else { ihipStream_t *defaultStream = stream->getCtx()->_defaultStream; - tprintf(DB_SYNC, "%s marker wait default stream\n", ToString(stream).c_str()); - bool needMarker = false; + bool needGatherMarker = false; // used to gather together other markers. hc::completion_future dcf; { LockedAccessor_StreamCrit_t defaultStreamCrit(defaultStream->criticalData()); - // TODO - could call create_blocking_marker(queue) + // TODO - could call create_blocking_marker(queue) or uses existing marker. if (!defaultStreamCrit->_av.get_is_empty()) { - needMarker = true; + needGatherMarker = true; - // TODO - add "none_scope". + tprintf(DB_SYNC, " %s adding marker to default %s for dependency\n", + ToString(stream).c_str(), ToString(defaultStream).c_str()); dcf = defaultStreamCrit->_av.create_marker(hc::accelerator_scope); + } else { + tprintf(DB_SYNC, " %s skipping marker since default stream is empty\n", ToString(stream).c_str()); } } - if (needMarker) { + if (needGatherMarker) { // ensure any commands sent to this stream wait on the NULL stream before continuing LockedAccessor_StreamCrit_t thisStreamCrit(stream->criticalData()); // TODO - could be "noret" version of create_blocking_marker thisStreamCrit->_av.create_blocking_marker(dcf, hc::accelerator_scope); + tprintf(DB_SYNC, " %s adding marker to wait for freshly recorded default-stream marker \n", + ToString(stream).c_str()); } } } diff --git a/projects/hip/src/hip_hcc_internal.h b/projects/hip/src/hip_hcc_internal.h index b15d5a73e4..c3f8b72311 100644 --- a/projects/hip/src/hip_hcc_internal.h +++ b/projects/hip/src/hip_hcc_internal.h @@ -586,10 +586,10 @@ private: // Data //---- // Internal event structure: enum hipEventStatus_t { - hipEventStatusUnitialized = 0, // event is unutilized, must be "Created" before use. - hipEventStatusCreated = 1, - hipEventStatusRecording = 2, // event has been enqueued to record something. - hipEventStatusRecorded = 3, // event has been recorded - timestamps are valid. + hipEventStatusUnitialized = 0, // event is uninitialized, must be "Created" before use. + hipEventStatusCreated = 1, // event created, but not yet Recorded + hipEventStatusRecording = 2, // event has been recorded into a stream but not completed yet. + hipEventStatusComplete = 3, // event has been recorded - timestamps are valid. } ; // TODO - rename to ihip type of some kind @@ -604,7 +604,7 @@ class ihipEvent_t { public: ihipEvent_t(unsigned flags); void attachToCompletionFuture(const hc::completion_future *cf, hipStream_t stream, ihipEventType_t eventType); - void setTimestamp(); + void refereshEventStatus(); uint64_t timestamp() const { return _timestamp; } ; ihipEventType_t type() const { return _type; }; diff --git a/projects/hip/src/hip_stream.cpp b/projects/hip/src/hip_stream.cpp index 9f1228d6f7..40aade28b9 100644 --- a/projects/hip/src/hip_stream.cpp +++ b/projects/hip/src/hip_stream.cpp @@ -93,20 +93,17 @@ hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int } else if (event->_state != hipEventStatusUnitialized) { - bool fastWait = false; - if (stream != hipStreamNull) { + + // This will user create_blocking_marker to wait on the specified queue. stream->locked_waitEvent(event); - fastWait = true; // don't use the slow host-side synchronization. - } - - if (!fastWait) { + } else { // TODO-hcc Convert to use create_blocking_marker(...) functionality. // Currently we have a super-conservative version of this - block on host, and drain the queue. // This should create a barrier packet in the target queue. + // TODO-HIP_SYNC_NULL_STREAM stream->locked_wait(); - e = hipSuccess; } } // else event not recorded, return immediately and don't create marker. @@ -150,6 +147,7 @@ hipError_t hipStreamSynchronize(hipStream_t stream) ihipCtx_t *ctx = ihipGetTlsDefaultCtx(); ctx->locked_syncDefaultStream(true/*waitOnSelf*/, true/*syncToHost*/); } else { + // note this does not synchornize with the NULL stream: stream->locked_wait(); e = hipSuccess; } @@ -171,20 +169,18 @@ hipError_t hipStreamDestroy(hipStream_t stream) //--- Drain the stream: if (stream == NULL) { - ihipCtx_t *ctx = ihipGetTlsDefaultCtx(); - ctx->locked_syncDefaultStream(true/*waitOnSelf*/, true /*syncToHost*/); + e = hipErrorInvalidResourceHandle; // TODO - review - what happens if try to destroy null stream } else { stream->locked_wait(); - e = hipSuccess; - } - ihipCtx_t *ctx = stream->getCtx(); + ihipCtx_t *ctx = stream->getCtx(); - if (ctx) { - ctx->locked_removeStream(stream); - delete stream; - } else { - e = hipErrorInvalidResourceHandle; + if (ctx) { + ctx->locked_removeStream(stream); + delete stream; + } else { + e = hipErrorInvalidResourceHandle; + } } return ihipLogStatus(e); diff --git a/projects/hip/tests/src/runtimeApi/event/record_event.cpp b/projects/hip/tests/src/runtimeApi/event/record_event.cpp index 66027b1643..bd8a3ada8e 100644 --- a/projects/hip/tests/src/runtimeApi/event/record_event.cpp +++ b/projects/hip/tests/src/runtimeApi/event/record_event.cpp @@ -28,8 +28,8 @@ THE SOFTWARE. enum SyncMode { syncNone, - syncNullStream, - syncOtherStream, + syncStream, + syncStopEvent, }; @@ -37,19 +37,23 @@ const char *syncModeString(int syncMode) { switch (syncMode) { case syncNone: return "syncNone"; - case syncNullStream: - return "syncNullStream"; - case syncOtherStream: - return "syncOtherStream"; + case syncStream: + return "syncStream"; + case syncStopEvent: + return "syncStopEvent"; default: return "unknown"; }; }; -void test(int *C_d, int *C_h, int64_t numElements, SyncMode syncMode) +void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, hipStream_t stream, int waitStart, SyncMode syncMode) { - printf ("\ntest: syncMode=%s\n", syncModeString(syncMode)); + if (!(testMask & p_tests)) { + return; + } + printf ("\ntest 0x%3x: stream=%p waitStart=%d syncMode=%s\n", + testMask, stream, waitStart, syncModeString(syncMode)); size_t sizeBytes = numElements * sizeof(int); @@ -60,55 +64,95 @@ void test(int *C_d, int *C_h, int64_t numElements, SyncMode syncMode) C_h[i] = -1; // initialize } - hipStream_t stream = 0; + hipEvent_t neverCreated=0, neverRecorded, timingDisabled; + HIPCHECK(hipEventCreate(&neverRecorded)); + HIPCHECK(hipEventCreateWithFlags(&timingDisabled, hipEventDisableTiming)); - unsigned flags=0; - if (syncMode == syncOtherStream) { - HIPCHECK(hipStreamCreateWithFlags(&stream, flags)); - } - - hipEvent_t neverCreated=0; - hipEvent_t start, stop, neverRecorded; + hipEvent_t start, stop; HIPCHECK(hipEventCreate(&start)); HIPCHECK(hipEventCreate(&stop)); - HIPCHECK(hipEventCreate(&neverRecorded)); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); + HIPCHECK(hipEventRecord(timingDisabled, stream)); // sandwhich a kernel: HIPCHECK(hipEventRecord(start, stream)); hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, stream, C_d, C_h, numElements, count); HIPCHECK(hipEventRecord(stop, stream)); - HIPCHECK(hipStreamSynchronize(stream)); // wait for recording to finish... + + if (waitStart) { + HIPCHECK(hipEventSynchronize(start)); + } + + + hipError_t expectedStopError = hipSuccess; + + // How to wait for the events to finish: + switch (syncMode) { + case syncNone: + expectedStopError = hipErrorNotReady; + break; + case syncStream: + HIPCHECK(hipStreamSynchronize(stream)); // wait for recording to finish... + break; + case syncStopEvent: + HIPCHECK(hipEventSynchronize(stop)); + break; + default: + assert(0); + }; + float t; - HIPCHECK_API(hipEventElapsedTime(&t, neverCreated, stop), hipErrorInvalidResourceHandle); - HIPCHECK_API(hipEventElapsedTime(&t, start, neverCreated), hipErrorInvalidResourceHandle); - HIPCHECK_API(hipEventElapsedTime(&t, neverRecorded, stop), hipErrorInvalidResourceHandle); - HIPCHECK_API(hipEventElapsedTime(&t, start, neverRecorded), hipErrorInvalidResourceHandle); - - HIPCHECK(hipEventElapsedTime(&t, start, stop)); - assert (t>0.0f); - printf ("time=%6.2f\n", t); - - HIPCHECK(hipEventElapsedTime(&t, stop, start)); - assert (t<0.0f); - printf ("negtime=%6.2f\n", t); - - HIPCHECK(hipEventElapsedTime(&t, start, start)); - assert (t==0.0f); - HIPCHECK(hipEventElapsedTime(&t, stop, stop)); - assert (t==0.0f); - - - if (stream) { - HIPCHECK(hipStreamDestroy(stream)); + hipError_t e = hipEventElapsedTime(&t, start, start); + if ((e != hipSuccess) && (e != hipErrorNotReady)) { + failed ("start event not in expected state, was %d=%s\n", e, hipGetErrorName(e)); } + + if (e == hipSuccess) + assert (t==0.0f); + + + // stop usually ready unless we skipped the synchronization (syncNone) + HIPCHECK_API(hipEventElapsedTime(&t, stop, stop), expectedStopError); + if (e == hipSuccess) + assert (t==0.0f); + + + e = hipEventElapsedTime(&t, start, stop); + HIPCHECK_API(e, expectedStopError); + if (expectedStopError == hipSuccess) + assert (t>0.0f); + printf ("time=%6.2f error=%s\n", t, hipGetErrorName(e)); + + e = hipEventElapsedTime(&t, stop, start); + HIPCHECK_API(e, expectedStopError); + if (expectedStopError == hipSuccess) + assert (t<0.0f); + printf ("negtime=%6.2f error=%s\n", t, hipGetErrorName(e)); + + + + { + // Check some error conditions for incomplete events: + HIPCHECK_API(hipEventElapsedTime(&t, timingDisabled, stop), hipErrorInvalidResourceHandle); + HIPCHECK_API(hipEventElapsedTime(&t, start, timingDisabled), hipErrorInvalidResourceHandle); + + HIPCHECK_API(hipEventElapsedTime(&t, neverCreated, stop), hipErrorInvalidResourceHandle); + HIPCHECK_API(hipEventElapsedTime(&t, start, neverCreated), hipErrorInvalidResourceHandle); + + HIPCHECK_API(hipEventElapsedTime(&t, neverRecorded, stop), hipErrorInvalidResourceHandle); + HIPCHECK_API(hipEventElapsedTime(&t, start, neverRecorded), hipErrorInvalidResourceHandle); + } + HIPCHECK(hipEventDestroy(start)); HIPCHECK(hipEventDestroy(stop)); + // Clear out everything: + HIPCHECK(hipDeviceSynchronize()); + printf ("test: OK \n"); } @@ -125,15 +169,22 @@ void runTests(int64_t numElements) HIPCHECK(hipMalloc(&C_d, sizeBytes)); HIPCHECK(hipHostMalloc(&C_h, sizeBytes)); + hipStream_t stream; + HIPCHECK(hipStreamCreateWithFlags(&stream, 0x0)); - { - test (C_d, C_h, numElements, syncNone); - test (C_d, C_h, numElements, syncNullStream); - test (C_d, C_h, numElements, syncOtherStream); - //test (C_d, C_h, numElements, syncDevice); + //for (int waitStart=0; waitStart<2; waitStart++) { + for (int waitStart=1; waitStart>=0; waitStart--) { + unsigned W = waitStart ? 0x1000:0; + test (W | 0x01, C_d, C_h, numElements, 0 , waitStart, syncNone); + test (W | 0x02, C_d, C_h, numElements, stream, waitStart, syncNone); + test (W | 0x04, C_d, C_h, numElements, 0 , waitStart, syncStream); + test (W | 0x08, C_d, C_h, numElements, stream, waitStart, syncStream); + test (W | 0x10, C_d, C_h, numElements, 0, waitStart, syncStopEvent); + test (W | 0x20, C_d, C_h, numElements, stream, waitStart, syncStopEvent); } + HIPCHECK(hipStreamDestroy(stream)); HIPCHECK(hipFree(C_d)); HIPCHECK(hipHostFree(C_h)); } @@ -143,7 +194,7 @@ int main(int argc, char *argv[]) { HipTest::parseStandardArguments(argc, argv, true /*failOnUndefinedArg*/); - runTests(4000000); + runTests(80000000); passed(); } diff --git a/projects/hip/tests/src/runtimeApi/stream/hipStreamSync2.cpp b/projects/hip/tests/src/runtimeApi/stream/hipStreamSync2.cpp index b57e120dcc..c6a58ce7d4 100644 --- a/projects/hip/tests/src/runtimeApi/stream/hipStreamSync2.cpp +++ b/projects/hip/tests/src/runtimeApi/stream/hipStreamSync2.cpp @@ -56,9 +56,27 @@ const char *syncModeString(int syncMode) { }; -void test(int *C_d, int *C_h, int64_t numElements, SyncMode syncMode, bool expectMismatch) +void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, SyncMode syncMode, bool expectMismatch) { - printf ("\ntest: syncMode=%s expectMismatch=%d\n", syncModeString(syncMode), expectMismatch); + + // This test sends a long-running kernel to the null stream, then tests to see if the + // specified synchronization technique is effective. + // + // Some syncMode are not expected to correctly sync (for example "syncNone"). in these + // cases the test sets expectMismatch and the check logic below will attempt to ensure that + // the undesired synchronization did not occur - ie ensure the kernel is still running and did + // not yet update the stop event. This can be tricky since if the kernel runs fast enough it + // may complete before the check. To prevent this, the addCountReverse has a count parameter + // which causes it to loop repeatedly, and the results are checked in reverse order. + // + // Tests with expectMismatch=true should ensure the kernel finishes correctly. This results + // are checked and we test to make sure stop event has completed. + + if (!(testMask & p_tests)) { + return; + } + printf ("\ntest 0x%02x: syncMode=%s expectMismatch=%d\n", + testMask, syncModeString(syncMode), expectMismatch); size_t sizeBytes = numElements * sizeof(int); @@ -72,13 +90,15 @@ void test(int *C_d, int *C_h, int64_t numElements, SyncMode syncMode, bool expec hipStream_t otherStream = 0; unsigned flags = (syncMode == syncMarkerThenOtherNonBlockingStream) ? hipStreamNonBlocking : hipStreamDefault; HIPCHECK(hipStreamCreateWithFlags(&otherStream, flags)); - hipEvent_t e; - HIPCHECK(hipEventCreate(&e)); + hipEvent_t stop, otherStreamEvent; + HIPCHECK(hipEventCreate(&stop)); + HIPCHECK(hipEventCreate(&otherStreamEvent)); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); // Launch kernel into null stream, should result in C_h == count. hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, 0 /*stream*/, C_d, C_h, numElements, count); + HIPCHECK(hipEventRecord(stop, 0/*default*/)); switch (syncMode) { case syncNone: @@ -92,7 +112,10 @@ void test(int *C_d, int *C_h, int64_t numElements, SyncMode syncMode, bool expec break; case syncMarkerThenOtherStream: case syncMarkerThenOtherNonBlockingStream: - HIPCHECK(hipEventRecord(e, otherStream)); // this may wait for NULL stream depending hipStreamNonBlocking flag above + + // this may wait for NULL stream depending hipStreamNonBlocking flag above + HIPCHECK(hipEventRecord(otherStreamEvent, otherStream)); + HIPCHECK(hipStreamSynchronize(otherStream)); break; case syncDevice: @@ -102,6 +125,14 @@ void test(int *C_d, int *C_h, int64_t numElements, SyncMode syncMode, bool expec assert(0); }; + hipError_t done = hipEventQuery(stop); + + if (expectMismatch) { + assert (done == hipErrorNotReady); + } else { + assert (done == hipSuccess); + } + int mismatches = 0; int expected = init0 + count; for (int i=0; i Date: Mon, 5 Jun 2017 08:50:41 -0500 Subject: [PATCH 76/77] Enable HIP_SYNC_NULL_STREAM=0 optimization. [ROCm/hip commit: decf3eee1821e56d703e3210f963ac635430821c] --- projects/hip/src/hip_hcc.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 0cdc57eaab..08a2cdbfcf 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -94,7 +94,7 @@ int HIP_SYNC_HOST_ALLOC = 1; // Chicken bit to sync on host to implement null stream. // If 0, null stream synchronization is performed on the GPU -int HIP_SYNC_NULL_STREAM = 1; +int HIP_SYNC_NULL_STREAM = 0; // HIP needs to change some behavior based on HCC_OPT_FLUSH : // TODO - set this to 1 From 3f9b16c397e0e52bc05a06b4446dda3b72962df1 Mon Sep 17 00:00:00 2001 From: "Sun, Peng" Date: Mon, 5 Jun 2017 11:38:28 -0500 Subject: [PATCH 77/77] Improve HIP kernel names, attributes and codegen, contributed by Alex Voicu Change-Id: I2cafbdc5a98e26c7f4fad84739c915e7dc09993c [ROCm/hip commit: a9808961bd14dd3c8bf076bfe72b649f7ceaa02f] --- .../hip/hcc_detail/grid_launch_GGL.hpp | 1278 +++++++++-------- .../hip/include/hip/hcc_detail/hip_runtime.h | 11 +- .../hip/include/hip/hcc_detail/host_defines.h | 2 +- 3 files changed, 660 insertions(+), 631 deletions(-) diff --git a/projects/hip/include/hip/hcc_detail/grid_launch_GGL.hpp b/projects/hip/include/hip/hcc_detail/grid_launch_GGL.hpp index 2dd9a95bc6..8e3dab8482 100644 --- a/projects/hip/include/hip/hcc_detail/grid_launch_GGL.hpp +++ b/projects/hip/include/hip/hcc_detail/grid_launch_GGL.hpp @@ -89,8 +89,7 @@ namespace hip_impl dim3 dim_blocks, int group_mem_bytes, const hc::accelerator_view& acc_v, - K k, - Ts&&... args) + K k) { const auto d = hc::extent<3>{ num_blocks.z * dim_blocks.z, @@ -102,16 +101,11 @@ namespace hip_impl group_mem_bytes); try { - hc::parallel_for_each( - acc_v, - d, - [=](const hc::tiled_index<3>& idx) [[hc]] { - k(args...); - }); + hc::parallel_for_each(acc_v, d, k); } catch (std::exception& ex) { - std::cerr << "Failed in " << __FUNCTION__ << ", with exception: " - << ex.what() << std::endl; + std::cerr << "Failed in " << __func__ << ", with exception: " + << ex.what() << std::endl; throw; } } @@ -133,8 +127,7 @@ namespace hip_impl int group_mem_bytes, hipStream_t stream, const char* kernel_name, - K k, - Ts&&... args) + K k) { void* lck_stream = nullptr; auto acc_v = lock_stream_hip_(stream, lck_stream); @@ -156,12 +149,11 @@ namespace hip_impl std::move(dim_blocks), group_mem_bytes, acc_v, - std::move(k), - std::forward(args)...); + std::move(k)); } catch (std::exception& ex) { - std::cerr << "Failed in " << __FUNCTION__ << ", with exception: " - << ex.what() << std::endl; + std::cerr << "Failed in " << __func__ << ", with exception: " + << ex.what() << std::endl; throw; } } @@ -175,8 +167,7 @@ namespace hip_impl dim3 dim_blocks, int group_mem_bytes, hipStream_t stream, - K k, - Ts&&... args) + K k) { grid_launch_hip_impl_( New_grid_launch_tag{}, @@ -184,9 +175,7 @@ namespace hip_impl std::move(dim_blocks), group_mem_bytes, std::move(stream), - std::move(k), - hipLaunchParm{}, - std::forward(args)...); + std::move(k)); } template @@ -199,8 +188,7 @@ namespace hip_impl int group_mem_bytes, hipStream_t stream, const char* kernel_name, - K k, - Ts&&... args) + K k) { grid_launch_hip_impl_( New_grid_launch_tag{}, @@ -209,9 +197,7 @@ namespace hip_impl group_mem_bytes, std::move(stream), kernel_name, - std::move(k), - hipLaunchParm{}, - std::forward(args)...); + std::move(k)); } template @@ -223,8 +209,7 @@ namespace hip_impl int group_mem_bytes, hipStream_t stream, const char* kernel_name, - K k, - Ts&& ... args) + K k) { grid_launch_hip_impl_( is_new_grid_launch_t{}, @@ -233,8 +218,7 @@ namespace hip_impl group_mem_bytes, std::move(stream), kernel_name, - std::move(k), - std::forward(args)...); + std::move(k)); } template @@ -245,8 +229,7 @@ namespace hip_impl dim3 dim_blocks, int group_mem_bytes, hipStream_t stream, - K k, - Ts&& ... args) + K k) { grid_launch_hip_impl_( is_new_grid_launch_t{}, @@ -254,610 +237,649 @@ namespace hip_impl std::move(dim_blocks), group_mem_bytes, std::move(stream), - std::move(k), - std::forward(args)...); + std::move(k)); } - namespace - { - template - constexpr - inline - T&& forward_(std::remove_reference_t& x) [[hc]] - { - return static_cast(x); - } + // TODO: these are temporary and purposefully noisy and disruptive. + #define make_kernel_name_hip(k, n)\ + HIP_kernel_functor_name_begin ## _ ## k ## _ ## \ + HIP_kernel_functor_name_end ## _ ## n - template - struct Forwarder { - template - void operator()(Ts&&...args) const [[hc]] - { - k(forward_(args)...); - } - }; - } + #define make_kernel_functor_hip_27(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ + p24)\ + struct make_kernel_name_hip(function_name, 25) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + std::decay_t _p21_;\ + std::decay_t _p22_;\ + std::decay_t _p23_;\ + std::decay_t _p24_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_);\ + }\ + } + #define make_kernel_functor_hip_26(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23)\ + struct make_kernel_name_hip(function_name, 24) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + std::decay_t _p21_;\ + std::decay_t _p22_;\ + std::decay_t _p23_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_, _p21_, _p22_, _p23_);\ + }\ + } + #define make_kernel_functor_hip_25(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22)\ + struct make_kernel_name_hip(function_name, 23) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + std::decay_t _p21_;\ + std::decay_t _p22_;\ + __attribute__((used, flatten))\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_, _p21_, _p22_);\ + }\ + } + #define make_kernel_functor_hip_24(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21)\ + struct make_kernel_name_hip(function_name, 22) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + std::decay_t _p21_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_, _p21_);\ + }\ + } + #define make_kernel_functor_hip_23(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20)\ + struct make_kernel_name_hip(function_name, 21) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_);\ + }\ + } + #define make_kernel_functor_hip_22(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19)\ + struct make_kernel_name_hip(function_name, 20) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_);\ + }\ + } + #define make_kernel_functor_hip_21(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18)\ + struct make_kernel_name_hip(function_name, 19) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_);\ + }\ + } + #define make_kernel_functor_hip_20(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17)\ + struct make_kernel_name_hip(function_name, 18) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_);\ + }\ + } + #define make_kernel_functor_hip_19(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16)\ + struct make_kernel_name_hip(function_name, 17) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_);\ + }\ + } + #define make_kernel_functor_hip_18(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15)\ + struct make_kernel_name_hip(function_name, 16) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_);\ + }\ + } + #define make_kernel_functor_hip_17(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14)\ + struct make_kernel_name_hip(function_name, 15) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_);\ + }\ + } + #define make_kernel_functor_hip_16(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13)\ + struct make_kernel_name_hip(function_name, 14) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_);\ + }\ + } + #define make_kernel_functor_hip_15(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12)\ + struct make_kernel_name_hip(function_name, 13) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_);\ + }\ + } + #define make_kernel_functor_hip_14(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11)\ + struct make_kernel_name_hip(function_name, 12) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_);\ + }\ + } + #define make_kernel_functor_hip_13(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10)\ + struct make_kernel_name_hip(function_name, 11) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_);\ + }\ + } + #define make_kernel_functor_hip_12(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9)\ + struct make_kernel_name_hip(function_name, 10) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ + _p9_);\ + }\ + } + #define make_kernel_functor_hip_11(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8)\ + struct make_kernel_name_hip(function_name, 9) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_);\ + }\ + } + #define make_kernel_functor_hip_10(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7)\ + struct make_kernel_name_hip(function_name, 8) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_);\ + }\ + } + #define make_kernel_functor_hip_9(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6)\ + struct make_kernel_name_hip(function_name, 7) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_);\ + }\ + } + #define make_kernel_functor_hip_8(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5)\ + struct make_kernel_name_hip(function_name, 6) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_);\ + }\ + } + #define make_kernel_functor_hip_7(\ + function_name, kernel_name, p0, p1, p2, p3, p4)\ + struct make_kernel_name_hip(function_name, 5) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_);\ + }\ + } + #define make_kernel_functor_hip_6(function_name, kernel_name, p0, p1, p2, p3)\ + struct make_kernel_name_hip(function_name, 4) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_, _p2_, _p3_);\ + }\ + } + #define make_kernel_functor_hip_5(function_name, kernel_name, p0, p1, p2)\ + struct make_kernel_name_hip(function_name, 3) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_, _p2_);\ + }\ + } + #define make_kernel_functor_hip_4(function_name, kernel_name, p0, p1)\ + struct make_kernel_name_hip(function_name, 2) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_);\ + }\ + } + #define fofo(f, n) kernel_prefix_hip ## f ## kernel_suffix_hip ## n + #define make_kernel_functor_hip_3(function_name, kernel_name, p0)\ + struct make_kernel_name_hip(function_name, 1) {\ + std::decay_t _p0_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_);\ + }\ + } + #define make_kernel_functor_hip_2(function_name, kernel_name)\ + struct make_kernel_name_hip(function_name, 0) {\ + void operator()(const hc::tiled_index<3>&) [[hc]]\ + {\ + return kernel_name(hipLaunchParm{});\ + }\ + } + #define make_kernel_functor_hip_1(...) + #define make_kernel_functor_hip_0(...) + #define make_kernel_functor_hip_(...)\ + overload_macro_hip_(make_kernel_functor_hip_, __VA_ARGS__) - template - requires(Domain == {Ts...}) - inline - void grid_launch( - New_grid_launch_tag, - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - hipStream_t stream, - Ts&&... args) - { - grid_launch_hip_impl_( - New_grid_launch_tag{}, - std::move(num_blocks), - std::move(dim_blocks), - group_mem_bytes, - std::move(stream), - Forwarder{}, - std::forward(args)...); - } - template - requires(Domain == {Ts...}) - inline - void grid_launch( - Old_grid_launch_tag, - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - hipStream_t stream, - Ts&&... args) - { - grid_launch_hip_( - New_grid_launch_tag{}, - std::move(num_blocks), - std::move(dim_blocks), - group_mem_bytes, - std::move(stream), - hipLaunchParm{}, - std::forward(args)...); - } - - template - requires(Domain == {Ts...}) - inline - std::enable_if_t::value> grid_launch_hip_( - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - hipStream_t stream, - Ts&&... args) - { - grid_launch_hip_( - is_new_grid_launch_t{}, - std::move(num_blocks), - std::move(dim_blocks), - group_mem_bytes, - std::move(stream), - std::forward(args)...); - } - - // TODO: these are temporary, they need to be completely removed once we - // enable C++14 support and can have proper generic, variadic lambdas. - #define make_kernel_lambda_hip_26(\ - kernel_name,\ - p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15,\ - p16, p17, p18, p19, p20, p21, p22, p23, p24)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_,\ - const std::decay_t& _p12_,\ - const std::decay_t& _p13_,\ - const std::decay_t& _p14_,\ - const std::decay_t& _p15_,\ - const std::decay_t& _p16_,\ - const std::decay_t& _p17_,\ - const std::decay_t& _p18_,\ - const std::decay_t& _p19_,\ - const std::decay_t& _p20_,\ - const std::decay_t& _p21_,\ - const std::decay_t& _p22_,\ - const std::decay_t& _p23_,\ - const std::decay_t& _p24_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_);\ - } - #define make_kernel_lambda_hip_25(\ - kernel_name,\ - p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15,\ - p16, p17, p18, p19, p20, p21, p22, p23)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_,\ - const std::decay_t& _p12_,\ - const std::decay_t& _p13_,\ - const std::decay_t& _p14_,\ - const std::decay_t& _p15_,\ - const std::decay_t& _p16_,\ - const std::decay_t& _p17_,\ - const std::decay_t& _p18_,\ - const std::decay_t& _p19_,\ - const std::decay_t& _p20_,\ - const std::decay_t& _p21_,\ - const std::decay_t& _p22_,\ - const std::decay_t& _p23_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_, _p22_, _p23_);\ - } - #define make_kernel_lambda_hip_24(\ - kernel_name,\ - p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15,\ - p16, p17, p18, p19, p20, p21, p22)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_,\ - const std::decay_t& _p12_,\ - const std::decay_t& _p13_,\ - const std::decay_t& _p14_,\ - const std::decay_t& _p15_,\ - const std::decay_t& _p16_,\ - const std::decay_t& _p17_,\ - const std::decay_t& _p18_,\ - const std::decay_t& _p19_,\ - const std::decay_t& _p20_,\ - const std::decay_t& _p21_,\ - const std::decay_t& _p22_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_, _p22_);\ - } - #define make_kernel_lambda_hip_23(\ - kernel_name,\ - p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15,\ - p16, p17, p18, p19, p20, p21)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_,\ - const std::decay_t& _p12_,\ - const std::decay_t& _p13_,\ - const std::decay_t& _p14_,\ - const std::decay_t& _p15_,\ - const std::decay_t& _p16_,\ - const std::decay_t& _p17_,\ - const std::decay_t& _p18_,\ - const std::decay_t& _p19_,\ - const std::decay_t& _p20_,\ - const std::decay_t& _p21_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_);\ - } - #define make_kernel_lambda_hip_22(\ - kernel_name,\ - p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15,\ - p16, p17, p18, p19, p20)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_,\ - const std::decay_t& _p12_,\ - const std::decay_t& _p13_,\ - const std::decay_t& _p14_,\ - const std::decay_t& _p15_,\ - const std::decay_t& _p16_,\ - const std::decay_t& _p17_,\ - const std::decay_t& _p18_,\ - const std::decay_t& _p19_,\ - const std::decay_t& _p20_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_);\ - } - #define make_kernel_lambda_hip_21(\ - kernel_name,\ - p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15,\ - p16, p17, p18, p19)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_,\ - const std::decay_t& _p12_,\ - const std::decay_t& _p13_,\ - const std::decay_t& _p14_,\ - const std::decay_t& _p15_,\ - const std::decay_t& _p16_,\ - const std::decay_t& _p17_,\ - const std::decay_t& _p18_,\ - const std::decay_t& _p19_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_);\ - } - #define make_kernel_lambda_hip_20(\ - kernel_name,\ - p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15,\ - p16, p17, p18)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_,\ - const std::decay_t& _p12_,\ - const std::decay_t& _p13_,\ - const std::decay_t& _p14_,\ - const std::decay_t& _p15_,\ - const std::decay_t& _p16_,\ - const std::decay_t& _p17_,\ - const std::decay_t& _p18_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_);\ - } - #define make_kernel_lambda_hip_19(\ - kernel_name,\ - p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15,\ - p16, p17)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_,\ - const std::decay_t& _p12_,\ - const std::decay_t& _p13_,\ - const std::decay_t& _p14_,\ - const std::decay_t& _p15_,\ - const std::decay_t& _p16_,\ - const std::decay_t& _p17_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_);\ - } - #define make_kernel_lambda_hip_18(\ - kernel_name,\ - p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15,\ - p16)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_,\ - const std::decay_t& _p12_,\ - const std::decay_t& _p13_,\ - const std::decay_t& _p14_,\ - const std::decay_t& _p15_,\ - const std::decay_t& _p16_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_);\ - } - #define make_kernel_lambda_hip_17(\ - kernel_name,\ - p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_,\ - const std::decay_t& _p12_,\ - const std::decay_t& _p13_,\ - const std::decay_t& _p14_,\ - const std::decay_t& _p15_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_);\ - } - #define make_kernel_lambda_hip_16(\ - kernel_name,\ - p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_,\ - const std::decay_t& _p12_,\ - const std::decay_t& _p13_,\ - const std::decay_t& _p14_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_, _p12_, _p13_, _p14_);\ - } - #define make_kernel_lambda_hip_15(\ - kernel_name,\ - p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_,\ - const std::decay_t& _p12_,\ - const std::decay_t& _p13_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_, _p12_, _p13_);\ - } - #define make_kernel_lambda_hip_14(\ - kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_,\ - const std::decay_t& _p12_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_, _p12_);\ - } - #define make_kernel_lambda_hip_13(\ - kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_);\ - } - #define make_kernel_lambda_hip_12(\ - kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_);\ - } - #define make_kernel_lambda_hip_11(\ - kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_);\ - } - #define make_kernel_lambda_hip_10(\ - kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_) [[hc]] {\ - kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_);\ - } - #define make_kernel_lambda_hip_9(\ - kernel_name, p0, p1, p2, p3, p4, p5, p6, p7)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_) [[hc]] {\ - kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_);\ - } - #define make_kernel_lambda_hip_8(kernel_name, p0, p1, p2, p3, p4, p5, p6)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_) [[hc]] {\ - kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_);\ - } - #define make_kernel_lambda_hip_7(kernel_name, p0, p1, p2, p3, p4, p5)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_) [[hc]] {\ - kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_);\ - } - #define make_kernel_lambda_hip_6(kernel_name, p0, p1, p2, p3, p4)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_) [[hc]] {\ - kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_);\ - } - #define make_kernel_lambda_hip_5(kernel_name, p0, p1, p2, p3)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_) [[hc]] {\ - kernel_name(_p0_, _p1_, _p2_, _p3_);\ - } - #define make_kernel_lambda_hip_4(kernel_name, p0, p1, p2)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_) [[hc]] {\ - kernel_name(_p0_, _p1_, _p2_);\ - } - #define make_kernel_lambda_hip_3(kernel_name, p0, p1)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_) [[hc]] {\ - kernel_name(_p0_, _p1_);\ - } - #define make_kernel_lambda_hip_2(kernel_name, p0)\ - [](const std::decay_t& _p0_) [[hc]] {\ - kernel_name(_p0_);\ - } - #define make_kernel_lambda_hip_1(kernel_name)\ - []() [[hc]] { return kernel_name(hipLaunchParm{}); } - - #define make_kernel_lambda_hip_(...)\ - overload_macro_hip_(make_kernel_lambda_hip_, __VA_ARGS__) + #define hipLaunchNamedKernelGGL(\ + function_name,\ + kernel_name,\ + num_blocks,\ + dim_blocks,\ + group_mem_bytes,\ + stream,\ + ...)\ + do {\ + make_kernel_functor_hip_(function_name, kernel_name, __VA_ARGS__)\ + hip_kernel_functor_impl_{__VA_ARGS__};\ + hip_impl::grid_launch_hip_(\ + num_blocks,\ + dim_blocks,\ + group_mem_bytes,\ + stream,\ + #kernel_name,\ + hip_kernel_functor_impl_);\ + } while(0) #define hipLaunchKernelGGL(\ - kernel_name,\ - num_blocks,\ - dim_blocks,\ - group_mem_bytes,\ - stream,\ - ...)\ - do {\ - hip_impl::grid_launch_hip_(\ - num_blocks,\ - dim_blocks,\ - group_mem_bytes,\ - stream,\ - #kernel_name,\ - make_kernel_lambda_hip_(kernel_name, __VA_ARGS__),\ - ##__VA_ARGS__);\ - } while(0) + kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...)\ + do {\ + hipLaunchNamedKernelGGL(\ + unnamed,\ + kernel_name,\ + num_blocks,\ + dim_blocks,\ + group_mem_bytes,\ + stream,\ + ##__VA_ARGS__);\ + } while (0) #define hipLaunchKernel(\ - kernel_name,\ - num_blocks,\ - dim_blocks,\ - group_mem_bytes,\ - stream,\ - ...)\ - do {\ - hipLaunchKernelGGL(\ - kernel_name,\ - num_blocks,\ - dim_blocks,\ - group_mem_bytes,\ - stream,\ - hipLaunchParm{},\ - ##__VA_ARGS__);\ - } while(0) + kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...)\ + do {\ + hipLaunchKernelGGL(\ + kernel_name,\ + num_blocks,\ + dim_blocks,\ + group_mem_bytes,\ + stream,\ + hipLaunchParm{},\ + ##__VA_ARGS__);\ + } while(0) } #endif //GENERIC_GRID_LAUNCH diff --git a/projects/hip/include/hip/hcc_detail/hip_runtime.h b/projects/hip/include/hip/hcc_detail/hip_runtime.h index 4d8876d8f4..129020d9cd 100644 --- a/projects/hip/include/hip/hcc_detail/hip_runtime.h +++ b/projects/hip/include/hip/hcc_detail/hip_runtime.h @@ -149,8 +149,15 @@ extern int HIP_TRACE_API; #endif /* Device feature flags */ -//TODO-HCC this is currently ignored by HCC target of HIP -#define __launch_bounds__(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor) +#define launch_bounds_impl0(requiredMaxThreadsPerBlock)\ + __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock))) +#define launch_bounds_impl1(\ + requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor)\ + __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock),\ + amdgpu_waves_per_eu(minBlocksPerMultiprocessor))) +#define select_impl_(_1, _2, impl_, ...) impl_ +#define __launch_bounds__(...) select_impl_(\ + __VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0)(__VA_ARGS__) // Detect if we are compiling C++ mode or C mode #if defined(__cplusplus) diff --git a/projects/hip/include/hip/hcc_detail/host_defines.h b/projects/hip/include/hip/hcc_detail/host_defines.h index 5864cfa0e7..140cbb0678 100644 --- a/projects/hip/include/hip/hcc_detail/host_defines.h +++ b/projects/hip/include/hip/hcc_detail/host_defines.h @@ -48,7 +48,7 @@ THE SOFTWARE. #define __global__ __attribute__((hc_grid_launch)) __attribute__((used)) #else //#warning "GGL global define reached" -#define __global__ __attribute__((hc, weak)) +#define __global__ __attribute__((annotate("hip__global__"), hc, used)) #endif //GENERIC_GRID_LAUNCH #define __noinline__ __attribute__((noinline))